From 290e62f179a86328f90aaac3c87d4d640637dec6 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 18:01:39 -0800
Subject: [PATCH 01/11] Remove casts to integers for texture and EFB formats

The only remaining casts for these types that I know of are in TextureInfo and in TextureDecoder_Common's TexDecoder_DrawOverlay. In TextureInfo, format_name is set to the integer value of the format; since that affects filenames and changing it would probably break resource packs, I'm leaving it unchanged. TexDecoder_DrawOverlay will be handled separately.
---
 .../Core/VideoCommon/FramebufferShaderGen.cpp  |  4 ++--
 Source/Core/VideoCommon/TextureCacheBase.cpp   |  8 +++-----
 .../VideoCommon/TextureConversionShader.cpp    |  6 ++----
 .../VideoCommon/TextureConverterShaderGen.cpp  |  9 +++------
 .../Core/VideoCommon/TextureDecoder_Common.cpp | 18 ++++++------------
 Source/Core/VideoCommon/TextureDecoder_x64.cpp |  3 +--
 6 files changed, 17 insertions(+), 31 deletions(-)
diff --git a/Source/Core/VideoCommon/FramebufferShaderGen.cpp b/Source/Core/VideoCommon/FramebufferShaderGen.cpp
index 40949f34aa..345578d071 100644
--- a/Source/Core/VideoCommon/FramebufferShaderGen.cpp
+++ b/Source/Core/VideoCommon/FramebufferShaderGen.cpp
@@ -550,7 +550,7 @@ std::string GenerateTextureReinterpretShader(TextureFormat from_format, TextureF
   break;
 
   default:
-    WARN_LOG_FMT(VIDEO, "From format {} is not supported", static_cast<u32>(from_format));
+    WARN_LOG_FMT(VIDEO, "From format {} is not supported", from_format);
     return "{}\n";
   }
 
@@ -602,7 +602,7 @@ std::string GenerateTextureReinterpretShader(TextureFormat from_format, TextureF
   }
   break;
   default:
-    WARN_LOG_FMT(VIDEO, "To format {} is not supported", static_cast<u32>(to_format));
+    WARN_LOG_FMT(VIDEO, "To format {} is not supported", to_format);
     return "{}\n";
   }
 
diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index 9eb6e09792..a14c63af5b 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -276,8 +276,7 @@ TextureCacheBase::ApplyPaletteToEntry(TCacheEntry* entry, const u8* palette, TLU
   const AbstractPipeline* pipeline = g_shader_cache->GetPaletteConversionPipeline(tlutfmt);
   if (!pipeline)
   {
-    ERROR_LOG_FMT(VIDEO, "Failed to get conversion pipeline for format {:#04X}",
-                  static_cast<u32>(tlutfmt));
+    ERROR_LOG_FMT(VIDEO, "Failed to get conversion pipeline for format {}", tlutfmt);
     return nullptr;
   }
 
@@ -345,9 +344,8 @@ TextureCacheBase::TCacheEntry* TextureCacheBase::ReinterpretEntry(const TCacheEn
       g_shader_cache->GetTextureReinterpretPipeline(existing_entry->format.texfmt, new_format);
   if (!pipeline)
   {
-    ERROR_LOG_FMT(VIDEO,
-                  "Failed to obtain texture reinterpreting pipeline from format {:#04X} to {:#04X}",
-                  static_cast<u32>(existing_entry->format.texfmt), static_cast<u32>(new_format));
+    ERROR_LOG_FMT(VIDEO, "Failed to obtain texture reinterpreting pipeline from format {} to {}",
+                  existing_entry->format.texfmt, new_format);
     return nullptr;
   }
 
diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp
index ed97c48144..e7a2d4a392 100644
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -48,8 +48,7 @@ u16 GetEncodedSampleCount(EFBCopyFormat format)
   case EFBCopyFormat::XFB:
     return 2;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ({:#X})! (GetEncodedSampleCount)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid EFB Copy Format {}! (GetEncodedSampleCount)", format);
     return 1;
   }
 }
@@ -802,8 +801,7 @@ std::string GenerateEncodingShader(const EFBCopyParams& params, APIType api_type
     WriteXFBEncoder(code, api_type, params);
     break;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ({:#X})! (GenerateEncodingShader)",
-                  static_cast<int>(params.copy_format));
+    PanicAlertFmt("Invalid EFB Copy Format {}! (GenerateEncodingShader)", params.copy_format);
     break;
   }
 
diff --git a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
index ae09871b98..0667f8c621 100644
--- a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
+++ b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
@@ -173,8 +173,7 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
       break;
 
     default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy zbuf format: {:#X}",
-                    static_cast<int>(uid_data->dst_format));
+      ERROR_LOG_FMT(VIDEO, "Unknown copy zbuf format: {}", uid_data->dst_format);
       out.Write("  ocol0 = float4(texcol.bgr, 0.0);\n");
       break;
     }
@@ -207,8 +206,7 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
       break;
 
     default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy intensity format: {:#X}",
-                    static_cast<int>(uid_data->dst_format));
+      ERROR_LOG_FMT(VIDEO, "Unknown copy intensity format: {}", uid_data->dst_format);
       out.Write("  ocol0 = texcol;\n");
       break;
     }
@@ -283,8 +281,7 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
       break;
 
     default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy color format: {:#X}",
-                    static_cast<int>(uid_data->dst_format));
+      ERROR_LOG_FMT(VIDEO, "Unknown copy color format: {}", uid_data->dst_format);
       out.Write("  ocol0 = texcol;\n");
       break;
     }
diff --git a/Source/Core/VideoCommon/TextureDecoder_Common.cpp b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
index 798496ac6d..bc2de947ea 100644
--- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
@@ -50,8 +50,7 @@ int TexDecoder_GetTexelSizeInNibbles(TextureFormat format)
   case TextureFormat::XFB:
     return 4;
   default:
-    PanicAlertFmt("Invalid Texture Format ({:#X})! (GetTexelSizeInNibbles)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid Texture Format {}! (GetTexelSizeInNibbles)", format);
     return 1;
   }
 }
@@ -90,8 +89,7 @@ int TexDecoder_GetBlockWidthInTexels(TextureFormat format)
   case TextureFormat::XFB:
     return 16;
   default:
-    PanicAlertFmt("Invalid Texture Format ({:#X})! (GetBlockWidthInTexels)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid Texture Format {}! (GetBlockWidthInTexels)", format);
     return 8;
   }
 }
@@ -125,8 +123,7 @@ int TexDecoder_GetBlockHeightInTexels(TextureFormat format)
   case TextureFormat::XFB:
     return 1;
   default:
-    PanicAlertFmt("Invalid Texture Format ({:#X})! (GetBlockHeightInTexels)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid Texture Format {}! (GetBlockHeightInTexels)", format);
     return 4;
   }
 }
@@ -160,8 +157,7 @@ int TexDecoder_GetEFBCopyBlockWidthInTexels(EFBCopyFormat format)
   case EFBCopyFormat::XFB:
     return 16;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ({:#X})! (GetEFBCopyBlockWidthInTexels)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid EFB Copy Format {}! (GetEFBCopyBlockWidthInTexels)", format);
     return 8;
   }
 }
@@ -195,8 +191,7 @@ int TexDecoder_GetEFBCopyBlockHeightInTexels(EFBCopyFormat format)
   case EFBCopyFormat::XFB:
     return 1;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ({:#X})! (GetEFBCopyBlockHeightInTexels)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid EFB Copy Format {}! (GetEFBCopyBlockHeightInTexels)", format);
     return 4;
   }
 }
@@ -247,8 +242,7 @@ TextureFormat TexDecoder_GetEFBCopyBaseFormat(EFBCopyFormat format)
   case EFBCopyFormat::XFB:
     return TextureFormat::XFB;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ({:#X})! (GetEFBCopyBaseFormat)",
-                  static_cast<int>(format));
+    PanicAlertFmt("Invalid EFB Copy Format ()! (GetEFBCopyBaseFormat)", format);
     return static_cast<TextureFormat>(format);
   }
 }
diff --git a/Source/Core/VideoCommon/TextureDecoder_x64.cpp b/Source/Core/VideoCommon/TextureDecoder_x64.cpp
index 761fc0cd64..85a6e5e731 100644
--- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp
@@ -1495,8 +1495,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, Text
     break;
 
   default:
-    PanicAlertFmt("Invalid Texture Format ({:#X})! (_TexDecoder_DecodeImpl)",
-                  static_cast<int>(texformat));
+    PanicAlertFmt("Invalid Texture Format {}! (_TexDecoder_DecodeImpl)", texformat);
     break;
   }
 }

From 850e524514c364f7f619e891f4dc2d687fa019f9 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 18:22:47 -0800
Subject: [PATCH 02/11] Use fmt::to_string for texture format overlay

This required adding '(' and ')' glyphs to the font used by the overlay (sfont.inc).
---
 .../VideoCommon/TextureDecoder_Common.cpp     | 80 +------------------
 Source/Core/VideoCommon/sfont.inc             | 24 +++++-
 2 files changed, 27 insertions(+), 77 deletions(-)

diff --git a/Source/Core/VideoCommon/TextureDecoder_Common.cpp b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
index bc2de947ea..51ea0572f2 100644
--- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
@@ -242,7 +242,7 @@ TextureFormat TexDecoder_GetEFBCopyBaseFormat(EFBCopyFormat format)
   case EFBCopyFormat::XFB:
     return TextureFormat::XFB;
   default:
-    PanicAlertFmt("Invalid EFB Copy Format ()! (GetEFBCopyBaseFormat)", format);
+    PanicAlertFmt("Invalid EFB Copy Format {}! (GetEFBCopyBaseFormat)", format);
     return static_cast<TextureFormat>(format);
   }
 }
@@ -253,77 +253,6 @@ void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center)
   TexFmt_Overlay_Center = center;
 }
 
-static const char* texfmt[] = {
-    // pixel
-    "I4",
-    "I8",
-    "IA4",
-    "IA8",
-    "RGB565",
-    "RGB5A3",
-    "RGBA8",
-    "0x07",
-    "C4",
-    "C8",
-    "C14X2",
-    "0x0B",
-    "0x0C",
-    "0x0D",
-    "CMPR",
-    "0x0F",
-    // Z-buffer
-    "0x10",
-    "Z8",
-    "0x12",
-    "Z16",
-    "0x14",
-    "0x15",
-    "Z24X8",
-    "0x17",
-    "0x18",
-    "0x19",
-    "0x1A",
-    "0x1B",
-    "0x1C",
-    "0x1D",
-    "0x1E",
-    "0x1F",
-    // pixel + copy
-    "CR4",
-    "0x21",
-    "CRA4",
-    "CRA8",
-    "0x24",
-    "0x25",
-    "CYUVA8",
-    "CA8",
-    "CR8",
-    "CG8",
-    "CB8",
-    "CRG8",
-    "CGB8",
-    "0x2D",
-    "0x2E",
-    "XFB",
-    // Z + copy
-    "CZ4",
-    "0x31",
-    "0x32",
-    "0x33",
-    "0x34",
-    "0x35",
-    "0x36",
-    "0x37",
-    "0x38",
-    "CZ8M",
-    "CZ8L",
-    "0x3B",
-    "CZ16L",
-    "0x3D",
-    "0x3E",
-    "0x3F",
-};
-
 static void TexDecoder_DrawOverlay(u8* dst, int width, int height, TextureFormat texformat)
 {
   int w = std::min(width, 40);
@@ -338,11 +267,11 @@ static void TexDecoder_DrawOverlay(u8* dst, int width, int height, TextureFormat
     yoff = 0;
   }
 
-  const char* fmt = texfmt[static_cast<int>(texformat) & 15];
-  while (*fmt)
+  const auto fmt_str = fmt::to_string(texformat);
+  for (char ch : fmt_str)
   {
     int xcnt = 0;
-    int nchar = sfont_map[(int)*fmt];
+    int nchar = sfont_map[ch];
 
     const unsigned char* ptr = sfont_raw[nchar];  // each char is up to 9x10
 
@@ -363,7 +292,6 @@ static void TexDecoder_DrawOverlay(u8* dst, int width, int height, TextureFormat
       ptr += 9;
     }
     xoff += xcnt;
-    fmt++;
   }
 }
 
diff --git a/Source/Core/VideoCommon/sfont.inc b/Source/Core/VideoCommon/sfont.inc
index d5ddaa7d40..b9883ca058 100644
--- a/Source/Core/VideoCommon/sfont.inc
+++ b/Source/Core/VideoCommon/sfont.inc
@@ -4,7 +4,7 @@
 static const unsigned char sfont_map[] = {
 	10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,
 	10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,
-	10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,
+	10,10,10,10,10,10,10,10,63,64,10,10,10,10,10,10,
 	 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,10,10,10,10,10,
 	10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,
 	26,27,28,29,30,31,32,33,34,35,36,10,10,10,10,10,
@@ -713,5 +713,27 @@ static const unsigned char sfont_raw[][9*10] = {
 	0xff, 0x00, 0x00, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78,
+	},{
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	},{
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0x00, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0x00, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
+	0xff, 0xff, 0xff, 0xff, 0x78, 0x78, 0x78, 0x78, 0x78,
 	},
 };

From cd2cc5fa2278f451e84dd3d3595a7499ddd29678 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 16:00:32 -0800
Subject: [PATCH 03/11] Rename EFB copy bit 2 from yuv to unknown_bit

It was named yuv in 522746b2c223f37c45569ee7fd4a226b278cb6d9, but hardware testing indicates that the bit does nothing; YUV conversion is instead enabled by the intensity format bit.
---
 Source/Core/Core/FifoPlayer/FifoPlayer.cpp |  2 +-
 Source/Core/VideoCommon/BPMemory.h         | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
index 9117e4d0f4..a0c04765b6 100644
--- a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
+++ b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
@@ -591,7 +591,7 @@ void FifoPlayer::ClearEfb()
   UPE_Copy copy = bpmem.triggerEFBCopy;
   copy.clamp_top = false;
   copy.clamp_bottom = false;
-  copy.yuv = false;
+  copy.unknown_bit = false;
   copy.target_pixel_format = static_cast<u32>(EFBCopyFormat::RGBA8) << 1;
   copy.gamma = 0;
   copy.half_scale = false;
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index 28155a2cd6..2a837c862b 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -2039,9 +2039,9 @@ union UPE_Copy
 {
   u32 Hex;
 
-  BitField<0, 1, bool, u32> clamp_top;      // if set clamp top
-  BitField<1, 1, bool, u32> clamp_bottom;   // if set clamp bottom
-  BitField<2, 1, bool, u32> yuv;            // if set, color conversion from RGB to YUV
+  BitField<0, 1, bool, u32> clamp_top;     // if set clamp top
+  BitField<1, 1, bool, u32> clamp_bottom;  // if set clamp bottom
+  BitField<2, 1, u32> unknown_bit;
   BitField<3, 4, u32> target_pixel_format;  // realformat is (fmt/2)+((fmt&1)*8).... for some reason
                                             // the msb is the lsb (pattern: cycling right shift)
   // gamma correction.. 0 = 1.0 ; 1 = 1.7 ; 2 = 2.2 ; 3 is reserved
@@ -2100,7 +2100,7 @@ struct fmt::formatter<UPE_Copy>
 
     return fmt::format_to(ctx.out(),
                           "Clamping: {}\n"
-                          "Converting from RGB to YUV: {}\n"
+                          "Unknown bit: {}\n"
                           "Target pixel format: {}\n"
                           "Gamma correction: {}\n"
                           "Half scale: {}\n"
@@ -2110,7 +2110,7 @@ struct fmt::formatter<UPE_Copy>
                           "Copy to XFB: {}\n"
                           "Intensity format: {}\n"
                           "Automatic color conversion: {}",
-                          clamp, no_yes[copy.yuv], copy.tp_realFormat(), gamma,
+                          clamp, copy.unknown_bit, copy.tp_realFormat(), gamma,
                           no_yes[copy.half_scale], no_yes[copy.scale_invert], no_yes[copy.clear],
                           copy.frame_to_field, no_yes[copy.copy_to_xfb], no_yes[copy.intensity_fmt],
                           no_yes[copy.auto_conv]);

From dd41a72378ee45e2947fa65e6db03b5898272b45 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 15:49:14 -0800
Subject: [PATCH 04/11] Only use intensity formats if both intensity_fmt and
 auto_conv are set

auto_conv is normally always set for EFB copies, but hardware testing indicates that intensity_fmt does nothing if auto_conv is not set.
---
 Source/Core/VideoCommon/BPStructs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp
index c6746d444a..2e34ef6d9b 100644
--- a/Source/Core/VideoCommon/BPStructs.cpp
+++ b/Source/Core/VideoCommon/BPStructs.cpp
@@ -276,7 +276,7 @@ static void BPWritten(const BPCmd& bp, int cycles_into_future)
       bool is_depth_copy = bpmem.zcontrol.pixel_format == PixelFormat::Z24;
       g_texture_cache->CopyRenderTargetToTexture(
           destAddr, PE_copy.tp_realFormat(), copy_width, copy_height, destStride, is_depth_copy,
-          srcRect, PE_copy.intensity_fmt, PE_copy.half_scale, 1.0f, 1.0f,
+          srcRect, PE_copy.intensity_fmt && PE_copy.auto_conv, PE_copy.half_scale, 1.0f, 1.0f,
           bpmem.triggerEFBCopy.clamp_top, bpmem.triggerEFBCopy.clamp_bottom,
           bpmem.copyfilter.GetCoefficients());
     }

From bed278d3b7d200e25527b2deb1a1e5f70ed3ff05 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Mon, 7 Feb 2022 12:11:15 -0800
Subject: [PATCH 05/11] Create dedicated enum for EFB/XFB gamma correction

This also changes the behavior for the invalid gamma value, which was confirmed to behave the same as 2.2.

Note that currently, the gamma value is only used for XFB copies, even though hardware testing indicates it also works for EFB copies.  This will be changed in a later commit.
---
 Source/Core/Core/FifoPlayer/FifoPlayer.cpp |  2 +-
 Source/Core/VideoCommon/BPMemory.h         | 32 +++++++++++-----------
 Source/Core/VideoCommon/BPStructs.cpp      | 10 ++++---
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
index a0c04765b6..b93fcf980a 100644
--- a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
+++ b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
@@ -593,7 +593,7 @@ void FifoPlayer::ClearEfb()
   copy.clamp_bottom = false;
   copy.unknown_bit = false;
   copy.target_pixel_format = static_cast<u32>(EFBCopyFormat::RGBA8) << 1;
-  copy.gamma = 0;
+  copy.gamma = GammaCorrection::Gamma1_0;
   copy.half_scale = false;
   copy.scale_invert = false;
   copy.clear = true;
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index 2a837c862b..38281d9493 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -2035,6 +2035,20 @@ struct fmt::formatter<FrameToField> : EnumFormatter<FrameToField::InterlacedOdd>
   constexpr formatter() : EnumFormatter(names) {}
 };
 
+enum class GammaCorrection : u32
+{
+  Gamma1_0 = 0,
+  Gamma1_7 = 1,
+  Gamma2_2 = 2,
+  // Hardware testing indicates this behaves the same as Gamma2_2
+  Invalid2_2 = 3,
+};
+template <>
+struct fmt::formatter<GammaCorrection> : EnumFormatter<GammaCorrection::Invalid2_2>
+{
+  constexpr formatter() : EnumFormatter({"1.0", "1.7", "2.2", "Invalid 2.2"}) {}
+};
+
 union UPE_Copy
 {
   u32 Hex;
@@ -2044,8 +2058,7 @@ union UPE_Copy
   BitField<2, 1, u32> unknown_bit;
   BitField<3, 4, u32> target_pixel_format;  // realformat is (fmt/2)+((fmt&1)*8).... for some reason
                                             // the msb is the lsb (pattern: cycling right shift)
-  // gamma correction.. 0 = 1.0 ; 1 = 1.7 ; 2 = 2.2 ; 3 is reserved
-  BitField<7, 2, u32> gamma;
+  BitField<7, 2, GammaCorrection> gamma;
   // "mipmap" filter... false = no filter (scale 1:1) ; true = box filter (scale 2:1)
   BitField<9, 1, bool, u32> half_scale;
   BitField<10, 1, bool, u32> scale_invert;  // if set vertical scaling is on
@@ -2084,19 +2097,6 @@ struct fmt::formatter<UPE_Copy>
       else
         clamp = "None";
     }
-    std::string_view gamma = "Invalid";
-    switch (copy.gamma)
-    {
-    case 0:
-      gamma = "1.0";
-      break;
-    case 1:
-      gamma = "1.7";
-      break;
-    case 2:
-      gamma = "2.2";
-      break;
-    }
 
     return fmt::format_to(ctx.out(),
                           "Clamping: {}\n"
@@ -2110,7 +2110,7 @@ struct fmt::formatter<UPE_Copy>
                           "Copy to XFB: {}\n"
                           "Intensity format: {}\n"
                           "Automatic color conversion: {}",
-                          clamp, copy.unknown_bit, copy.tp_realFormat(), gamma,
+                          clamp, copy.unknown_bit, copy.tp_realFormat(), copy.gamma,
                           no_yes[copy.half_scale], no_yes[copy.scale_invert], no_yes[copy.clear],
                           copy.frame_to_field, no_yes[copy.copy_to_xfb], no_yes[copy.intensity_fmt],
                           no_yes[copy.auto_conv]);
diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp
index 2e34ef6d9b..010dbf6abe 100644
--- a/Source/Core/VideoCommon/BPStructs.cpp
+++ b/Source/Core/VideoCommon/BPStructs.cpp
@@ -11,6 +11,7 @@
 #include <fmt/format.h>
 
 #include "Common/CommonTypes.h"
+#include "Common/EnumMap.h"
 #include "Common/Logging/Log.h"
 
 #include "Core/ConfigManager.h"
@@ -42,7 +43,8 @@
 
 using namespace BPFunctions;
 
-static const float s_gammaLUT[] = {1.0f, 1.7f, 2.2f, 1.0f};
+static constexpr Common::EnumMap<float, GammaCorrection::Invalid2_2> s_gammaLUT = {1.0f, 1.7f, 2.2f,
+                                                                                   2.2f};
 
 void BPInit()
 {
@@ -276,9 +278,9 @@ static void BPWritten(const BPCmd& bp, int cycles_into_future)
       bool is_depth_copy = bpmem.zcontrol.pixel_format == PixelFormat::Z24;
       g_texture_cache->CopyRenderTargetToTexture(
           destAddr, PE_copy.tp_realFormat(), copy_width, copy_height, destStride, is_depth_copy,
-          srcRect, PE_copy.intensity_fmt && PE_copy.auto_conv, PE_copy.half_scale, 1.0f, 1.0f,
-          bpmem.triggerEFBCopy.clamp_top, bpmem.triggerEFBCopy.clamp_bottom,
-          bpmem.copyfilter.GetCoefficients());
+          srcRect, PE_copy.intensity_fmt && PE_copy.auto_conv, PE_copy.half_scale, 1.0f,
+          s_gammaLUT[PE_copy.gamma], bpmem.triggerEFBCopy.clamp_top,
+          bpmem.triggerEFBCopy.clamp_bottom, bpmem.copyfilter.GetCoefficients());
     }
     else
     {

From d20094efa22829640849c8c4d906c6b0e1a24cd6 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Wed, 23 Feb 2022 13:05:50 -0800
Subject: [PATCH 06/11] Add extra Low and High fields to CopyFilterCoefficients

This struct is the only one in BPMemory that uses u64 as its base.  These fields allow viewing it as two u32s instead.  They are not used by Dolphin right now, but they are used in the copy of BPMemory.h used by hwtests.
---
 Source/Core/VideoCommon/BPMemory.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index 38281d9493..8d57b76a38 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -2123,10 +2123,12 @@ union CopyFilterCoefficients
 
   u64 Hex;
 
+  BitField<0, 32, u32, u64> Low;
   BitField<0, 6, u64> w0;
   BitField<6, 6, u64> w1;
   BitField<12, 6, u64> w2;
   BitField<18, 6, u64> w3;
+  BitField<32, 32, u32, u64> High;
   BitField<32, 6, u64> w4;
   BitField<38, 6, u64> w5;
   BitField<44, 6, u64> w6;

From 791bd16b281d3b123a2b9cf2d5c215d4ab235c1d Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Mon, 7 Feb 2022 13:37:28 -0800
Subject: [PATCH 07/11] Restructure parameters to
 TextureConverterShaderGen/TextureConversionShader

This will be used for later refactoring for increased accuracy.
---
 Source/Core/VideoBackends/Null/TextureCache.h |  4 +-
 .../VideoBackends/Software/TextureCache.h     |  4 +-
 Source/Core/VideoCommon/TextureCacheBase.cpp  | 65 ++++++++++---------
 Source/Core/VideoCommon/TextureCacheBase.h    | 43 ++++++------
 .../VideoCommon/TextureConversionShader.cpp   | 12 ++--
 .../VideoCommon/TextureConverterShaderGen.cpp | 24 ++++---
 .../VideoCommon/TextureConverterShaderGen.h   | 13 ++--
 7 files changed, 89 insertions(+), 76 deletions(-)

diff --git a/Source/Core/VideoBackends/Null/TextureCache.h b/Source/Core/VideoBackends/Null/TextureCache.h
index 5b2b73c365..2b95586f44 100644
--- a/Source/Core/VideoBackends/Null/TextureCache.h
+++ b/Source/Core/VideoBackends/Null/TextureCache.h
@@ -14,7 +14,7 @@ protected:
                u32 bytes_per_row, u32 num_blocks_y, u32 memory_stride,
                const MathUtil::Rectangle<int>& src_rect, bool scale_by_half, bool linear_filter,
                float y_scale, float gamma, bool clamp_top, bool clamp_bottom,
-               const EFBCopyFilterCoefficients& filter_coefficients) override
+               const std::array<u32, 3>& filter_coefficients) override
   {
   }
 
@@ -22,7 +22,7 @@ protected:
                            const MathUtil::Rectangle<int>& src_rect, bool scale_by_half,
                            bool linear_filter, EFBCopyFormat dst_format, bool is_intensity,
                            float gamma, bool clamp_top, bool clamp_bottom,
-                           const EFBCopyFilterCoefficients& filter_coefficients) override
+                           const std::array<u32, 3>& filter_coefficients) override
   {
   }
 };
diff --git a/Source/Core/VideoBackends/Software/TextureCache.h b/Source/Core/VideoBackends/Software/TextureCache.h
index 9ffa8fa4f4..a7d241197f 100644
--- a/Source/Core/VideoBackends/Software/TextureCache.h
+++ b/Source/Core/VideoBackends/Software/TextureCache.h
@@ -14,7 +14,7 @@ protected:
                u32 bytes_per_row, u32 num_blocks_y, u32 memory_stride,
                const MathUtil::Rectangle<int>& src_rect, bool scale_by_half, bool linear_filter,
                float y_scale, float gamma, bool clamp_top, bool clamp_bottom,
-               const EFBCopyFilterCoefficients& filter_coefficients) override
+               const std::array<u32, 3>& filter_coefficients) override
   {
     TextureEncoder::Encode(dst, params, native_width, bytes_per_row, num_blocks_y, memory_stride,
                            src_rect, scale_by_half, y_scale, gamma);
@@ -23,7 +23,7 @@ protected:
                            const MathUtil::Rectangle<int>& src_rect, bool scale_by_half,
                            bool linear_filter, EFBCopyFormat dst_format, bool is_intensity,
                            float gamma, bool clamp_top, bool clamp_bottom,
-                           const EFBCopyFilterCoefficients& filter_coefficients) override
+                           const std::array<u32, 3>& filter_coefficients) override
   {
     // TODO: If we ever want to "fake" vram textures, we would need to implement this
   }
diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index a14c63af5b..2f896027ad 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -1978,44 +1978,49 @@ void TextureCacheBase::StitchXFBCopy(TCacheEntry* stitched_entry)
   }
 }
 
-EFBCopyFilterCoefficients
+std::array<u32, 3>
 TextureCacheBase::GetRAMCopyFilterCoefficients(const CopyFilterCoefficients::Values& coefficients)
 {
   // To simplify the backend, we precalculate the three coefficients in common. Coefficients 0, 1
   // are for the row above, 2, 3, 4 are for the current pixel, and 5, 6 are for the row below.
-  return EFBCopyFilterCoefficients{
-      static_cast<float>(static_cast<u32>(coefficients[0]) + static_cast<u32>(coefficients[1])) /
-          64.0f,
-      static_cast<float>(static_cast<u32>(coefficients[2]) + static_cast<u32>(coefficients[3]) +
-                         static_cast<u32>(coefficients[4])) /
-          64.0f,
-      static_cast<float>(static_cast<u32>(coefficients[5]) + static_cast<u32>(coefficients[6])) /
-          64.0f,
+  return {
+      static_cast<u32>(coefficients[0]) + static_cast<u32>(coefficients[1]),
+      static_cast<u32>(coefficients[2]) + static_cast<u32>(coefficients[3]) +
+          static_cast<u32>(coefficients[4]),
+      static_cast<u32>(coefficients[5]) + static_cast<u32>(coefficients[6]),
   };
 }
 
-EFBCopyFilterCoefficients
+std::array<u32, 3>
 TextureCacheBase::GetVRAMCopyFilterCoefficients(const CopyFilterCoefficients::Values& coefficients)
 {
   // If the user disables the copy filter, only apply it to the VRAM copy.
   // This way games which are sensitive to changes to the RAM copy of the XFB will be unaffected.
-  EFBCopyFilterCoefficients res = GetRAMCopyFilterCoefficients(coefficients);
+  std::array<u32, 3> res = GetRAMCopyFilterCoefficients(coefficients);
   if (!g_ActiveConfig.bDisableCopyFilter)
     return res;
 
   // Disabling the copy filter in options should not ignore the values the game sets completely,
   // as some games use the filter coefficients to control the brightness of the screen. Instead,
   // add all coefficients to the middle sample, so the deflicker/vertical filter has no effect.
-  res.middle = res.upper + res.middle + res.lower;
-  res.upper = 0.0f;
-  res.lower = 0.0f;
+  res[1] = res[0] + res[1] + res[2];
+  res[0] = 0;
+  res[2] = 0;
   return res;
 }
 
-bool TextureCacheBase::NeedsCopyFilterInShader(const EFBCopyFilterCoefficients& coefficients)
+bool TextureCacheBase::AllCopyFilterCoefsNeeded(const std::array<u32, 3>& coefficients)
 {
   // If the top/bottom coefficients are zero, no point sampling/blending from these rows.
-  return coefficients.upper != 0 || coefficients.lower != 0;
+  return coefficients[0] != 0 || coefficients[2] != 0;
+}
+
+bool TextureCacheBase::CopyFilterCanOverflow(const std::array<u32, 3>& coefficients)
+{
+  // Normally, the copy filter coefficients will sum to at most 64.  If the sum is higher than that,
+  // colors are clamped to the range [0, 255], but if the sum is higher than 128, that clamping
+  // breaks (as colors end up >= 512, which wraps back to 0).
+  return coefficients[0] + coefficients[1] + coefficients[2] >= 128;
 }
 
 void TextureCacheBase::CopyRenderTargetToTexture(
@@ -2255,10 +2260,11 @@ void TextureCacheBase::CopyRenderTargetToTexture(
 
   if (copy_to_ram)
   {
-    EFBCopyFilterCoefficients coefficients = GetRAMCopyFilterCoefficients(filter_coefficients);
+    const std::array<u32, 3> coefficients = GetRAMCopyFilterCoefficients(filter_coefficients);
     PixelFormat srcFormat = bpmem.zcontrol.pixel_format;
     EFBCopyParams format(srcFormat, dstFormat, is_depth_copy, isIntensity,
-                         NeedsCopyFilterInShader(coefficients));
+                         AllCopyFilterCoefsNeeded(coefficients),
+                         CopyFilterCanOverflow(coefficients), gamma != 1.0);
 
     std::unique_ptr<AbstractStagingTexture> staging_texture = GetEFBCopyStagingTexture();
     if (staging_texture)
@@ -2716,16 +2722,15 @@ void TextureCacheBase::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_cop
                                            bool scale_by_half, bool linear_filter,
                                            EFBCopyFormat dst_format, bool is_intensity, float gamma,
                                            bool clamp_top, bool clamp_bottom,
-                                           const EFBCopyFilterCoefficients& filter_coefficients)
+                                           const std::array<u32, 3>& filter_coefficients)
 {
   // Flush EFB pokes first, as they're expected to be included.
   g_framebuffer_manager->FlushEFBPokes();
 
   // Get the pipeline which we will be using. If the compilation failed, this will be null.
-  const AbstractPipeline* copy_pipeline =
-      g_shader_cache->GetEFBCopyToVRAMPipeline(TextureConversionShaderGen::GetShaderUid(
-          dst_format, is_depth_copy, is_intensity, scale_by_half,
-          NeedsCopyFilterInShader(filter_coefficients)));
+  const AbstractPipeline* copy_pipeline = g_shader_cache->GetEFBCopyToVRAMPipeline(
+      TextureConversionShaderGen::GetShaderUid(dst_format, is_depth_copy, is_intensity,
+                                               scale_by_half, 1.0f / gamma, filter_coefficients));
   if (!copy_pipeline)
   {
     WARN_LOG_FMT(VIDEO, "Skipping EFB copy to VRAM due to missing pipeline.");
@@ -2746,7 +2751,7 @@ void TextureCacheBase::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_cop
   struct Uniforms
   {
     float src_left, src_top, src_width, src_height;
-    float filter_coefficients[3];
+    std::array<u32, 3> filter_coefficients;
     float gamma_rcp;
     float clamp_top;
     float clamp_bottom;
@@ -2761,9 +2766,7 @@ void TextureCacheBase::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_cop
   uniforms.src_top = framebuffer_rect.top * rcp_efb_height;
   uniforms.src_width = framebuffer_rect.GetWidth() * rcp_efb_width;
   uniforms.src_height = framebuffer_rect.GetHeight() * rcp_efb_height;
-  uniforms.filter_coefficients[0] = filter_coefficients.upper;
-  uniforms.filter_coefficients[1] = filter_coefficients.middle;
-  uniforms.filter_coefficients[2] = filter_coefficients.lower;
+  uniforms.filter_coefficients = filter_coefficients;
   uniforms.gamma_rcp = 1.0f / gamma;
   //   NOTE: when the clamp bits aren't set, the hardware will happily read beyond the EFB,
   //         which returns random garbage from the empty bus (confirmed by hardware tests).
@@ -2795,7 +2798,7 @@ void TextureCacheBase::CopyEFB(AbstractStagingTexture* dst, const EFBCopyParams&
                                u32 memory_stride, const MathUtil::Rectangle<int>& src_rect,
                                bool scale_by_half, bool linear_filter, float y_scale, float gamma,
                                bool clamp_top, bool clamp_bottom,
-                               const EFBCopyFilterCoefficients& filter_coefficients)
+                               const std::array<u32, 3>& filter_coefficients)
 {
   // Flush EFB pokes first, as they're expected to be included.
   g_framebuffer_manager->FlushEFBPokes();
@@ -2826,7 +2829,7 @@ void TextureCacheBase::CopyEFB(AbstractStagingTexture* dst, const EFBCopyParams&
     float gamma_rcp;
     float clamp_top;
     float clamp_bottom;
-    float filter_coefficients[3];
+    std::array<u32, 3> filter_coefficients;
     u32 padding;
   };
   Uniforms encoder_params;
@@ -2847,9 +2850,7 @@ void TextureCacheBase::CopyEFB(AbstractStagingTexture* dst, const EFBCopyParams&
   encoder_params.clamp_top = (static_cast<float>(top_coord) + .5f) * rcp_efb_height;
   const u32 bottom_coord = (clamp_bottom ? framebuffer_rect.bottom : efb_height) - 1;
   encoder_params.clamp_bottom = (static_cast<float>(bottom_coord) + .5f) * rcp_efb_height;
-  encoder_params.filter_coefficients[0] = filter_coefficients.upper;
-  encoder_params.filter_coefficients[1] = filter_coefficients.middle;
-  encoder_params.filter_coefficients[2] = filter_coefficients.lower;
+  encoder_params.filter_coefficients = filter_coefficients;
   g_vertex_manager->UploadUtilityUniforms(&encoder_params, sizeof(encoder_params));
 
   // Because the shader uses gl_FragCoord and we read it back, we must render to the lower-left.
diff --git a/Source/Core/VideoCommon/TextureCacheBase.h b/Source/Core/VideoCommon/TextureCacheBase.h
index 287f3b840f..4fe11a64f4 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.h
+++ b/Source/Core/VideoCommon/TextureCacheBase.h
@@ -57,23 +57,30 @@ struct TextureAndTLUTFormat
 struct EFBCopyParams
 {
   EFBCopyParams(PixelFormat efb_format_, EFBCopyFormat copy_format_, bool depth_, bool yuv_,
-                bool copy_filter_)
+                bool all_copy_filter_coefs_needed_, bool copy_filter_can_overflow_,
+                bool apply_gamma_)
       : efb_format(efb_format_), copy_format(copy_format_), depth(depth_), yuv(yuv_),
-        copy_filter(copy_filter_)
+        all_copy_filter_coefs_needed(all_copy_filter_coefs_needed_),
+        copy_filter_can_overflow(copy_filter_can_overflow_), apply_gamma(apply_gamma_)
   {
   }
 
   bool operator<(const EFBCopyParams& rhs) const
   {
-    return std::tie(efb_format, copy_format, depth, yuv, copy_filter) <
-           std::tie(rhs.efb_format, rhs.copy_format, rhs.depth, rhs.yuv, rhs.copy_filter);
+    return std::tie(efb_format, copy_format, depth, yuv, all_copy_filter_coefs_needed,
+                    copy_filter_can_overflow,
+                    apply_gamma) < std::tie(rhs.efb_format, rhs.copy_format, rhs.depth, rhs.yuv,
+                                            rhs.all_copy_filter_coefs_needed,
+                                            rhs.copy_filter_can_overflow, rhs.apply_gamma);
   }
 
   PixelFormat efb_format;
   EFBCopyFormat copy_format;
   bool depth;
   bool yuv;
-  bool copy_filter;
+  bool all_copy_filter_coefs_needed;
+  bool copy_filter_can_overflow;
+  bool apply_gamma;
 };
 
 template <>
@@ -89,19 +96,13 @@ struct fmt::formatter<EFBCopyParams>
     else
       copy_format = fmt::to_string(uid.copy_format);
     return fmt::format_to(ctx.out(),
-                          "format: {}, copy format: {}, depth: {}, yuv: {}, copy filter: {}",
-                          uid.efb_format, copy_format, uid.depth, uid.yuv, uid.copy_filter);
+                          "format: {}, copy format: {}, depth: {}, yuv: {}, apply_gamma: {}, "
+                          "all_copy_filter_coefs_needed: {}, copy_filter_can_overflow: {}",
+                          uid.efb_format, copy_format, uid.depth, uid.yuv, uid.apply_gamma,
+                          uid.all_copy_filter_coefs_needed, uid.copy_filter_can_overflow);
   }
 };
 
-// Reduced version of the full coefficient array, with a single value for each row.
-struct EFBCopyFilterCoefficients
-{
-  float upper;
-  float middle;
-  float lower;
-};
-
 class TextureCacheBase
 {
 private:
@@ -267,8 +268,8 @@ public:
   // Save States
   void DoState(PointerWrap& p);
 
-  // Returns false if the top/bottom row coefficients are zero.
-  static bool NeedsCopyFilterInShader(const EFBCopyFilterCoefficients& coefficients);
+  static bool AllCopyFilterCoefsNeeded(const std::array<u32, 3>& coefficients);
+  static bool CopyFilterCanOverflow(const std::array<u32, 3>& coefficients);
 
 protected:
   // Decodes the specified data to the GPU texture specified by entry.
@@ -285,12 +286,12 @@ protected:
                        u32 bytes_per_row, u32 num_blocks_y, u32 memory_stride,
                        const MathUtil::Rectangle<int>& src_rect, bool scale_by_half,
                        bool linear_filter, float y_scale, float gamma, bool clamp_top,
-                       bool clamp_bottom, const EFBCopyFilterCoefficients& filter_coefficients);
+                       bool clamp_bottom, const std::array<u32, 3>& filter_coefficients);
   virtual void CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_copy,
                                    const MathUtil::Rectangle<int>& src_rect, bool scale_by_half,
                                    bool linear_filter, EFBCopyFormat dst_format, bool is_intensity,
                                    float gamma, bool clamp_top, bool clamp_bottom,
-                                   const EFBCopyFilterCoefficients& filter_coefficients);
+                                   const std::array<u32, 3>& filter_coefficients);
 
   alignas(16) u8* temp = nullptr;
   size_t temp_size = 0;
@@ -338,9 +339,9 @@ private:
   void UninitializeXFBMemory(u8* dst, u32 stride, u32 bytes_per_row, u32 num_blocks_y);
 
   // Precomputing the coefficients for the previous, current, and next lines for the copy filter.
-  static EFBCopyFilterCoefficients
+  static std::array<u32, 3>
   GetRAMCopyFilterCoefficients(const CopyFilterCoefficients::Values& coefficients);
-  static EFBCopyFilterCoefficients
+  static std::array<u32, 3>
   GetVRAMCopyFilterCoefficients(const CopyFilterCoefficients::Values& coefficients);
 
   // Flushes a pending EFB copy to RAM from the host to the guest RAM.
diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp
index e7a2d4a392..fc27c9cc99 100644
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -62,7 +62,7 @@ static void WriteHeader(ShaderCode& code, APIType api_type)
              "  float y_scale;\n"
              "  float gamma_rcp;\n"
              "  float2 clamp_tb;\n"
-             "  float3 filter_coefficients;\n"
+             "  uint3 filter_coefficients;\n"
              "}};\n");
   if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
   {
@@ -151,7 +151,7 @@ static void WriteSampleFunction(ShaderCode& code, const EFBCopyParams& params, A
   // The filter is only applied to the RGB channels, the alpha channel is left intact.
   code.Write("float4 SampleEFB(float2 uv, float2 pixel_size, int xoffset)\n"
              "{{\n");
-  if (params.copy_filter)
+  if (params.all_copy_filter_coefs_needed)
   {
     code.Write("  float4 prev_row = ");
     WriteSampleOp(-1);
@@ -162,9 +162,9 @@ static void WriteSampleFunction(ShaderCode& code, const EFBCopyParams& params, A
                "  float4 next_row = ");
     WriteSampleOp(1);
     code.Write(";\n"
-               "  return float4(min(prev_row.rgb * filter_coefficients[0] +\n"
-               "                      current_row.rgb * filter_coefficients[1] +\n"
-               "                      next_row.rgb * filter_coefficients[2], \n"
+               "  return float4(min(prev_row.rgb * filter_coefficients[0] / 64.0 +\n"
+               "                      current_row.rgb * filter_coefficients[1] / 64.0 +\n"
+               "                      next_row.rgb * filter_coefficients[2] / 64.0, \n"
                "                    float3(1, 1, 1)), current_row.a);\n");
   }
   else
@@ -172,7 +172,7 @@ static void WriteSampleFunction(ShaderCode& code, const EFBCopyParams& params, A
     code.Write("  float4 current_row = ");
     WriteSampleOp(0);
     code.Write(";\n"
-               "return float4(min(current_row.rgb * filter_coefficients[1], float3(1, 1, 1)),\n"
+               "return float4(min(current_row.rgb * filter_coefficients[1] / 64.0, float3(1, 1, 1)),\n"
                "              current_row.a);\n");
   }
   code.Write("}}\n");
diff --git a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
index 0667f8c621..6b10a34aed 100644
--- a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
+++ b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
@@ -6,13 +6,15 @@
 #include "Common/Assert.h"
 #include "Common/CommonTypes.h"
 #include "VideoCommon/BPMemory.h"
+#include "VideoCommon/TextureCacheBase.h"
 #include "VideoCommon/VideoCommon.h"
 #include "VideoCommon/VideoConfig.h"
 
 namespace TextureConversionShaderGen
 {
 TCShaderUid GetShaderUid(EFBCopyFormat dst_format, bool is_depth_copy, bool is_intensity,
-                         bool scale_by_half, bool copy_filter)
+                         bool scale_by_half, float gamma_rcp,
+                         const std::array<u32, 3>& filter_coefficients)
 {
   TCShaderUid out;
 
@@ -22,7 +24,11 @@ TCShaderUid GetShaderUid(EFBCopyFormat dst_format, bool is_depth_copy, bool is_i
   uid_data->is_depth_copy = is_depth_copy;
   uid_data->is_intensity = is_intensity;
   uid_data->scale_by_half = scale_by_half;
-  uid_data->copy_filter = copy_filter;
+  uid_data->all_copy_filter_coefs_needed =
+      TextureCacheBase::AllCopyFilterCoefsNeeded(filter_coefficients);
+  uid_data->copy_filter_can_overflow = TextureCacheBase::CopyFilterCanOverflow(filter_coefficients);
+  // Only flag gamma correction as needed when the reciprocal gamma differs from 1 (identity).
+  uid_data->apply_gamma = gamma_rcp != 1.0f;
 
   return out;
 }
@@ -31,7 +37,7 @@ static void WriteHeader(APIType api_type, ShaderCode& out)
 {
   out.Write("UBO_BINDING(std140, 1) uniform PSBlock {{\n"
             "  float2 src_offset, src_size;\n"
-            "  float3 filter_coefficients;\n"
+            "  uint3 filter_coefficients;\n"
             "  float gamma_rcp;\n"
             "  float2 clamp_tb;\n"
             "  float pixel_height;\n"
@@ -98,22 +104,22 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
 
   // The copy filter applies to both color and depth copies. This has been verified on hardware.
   // The filter is only applied to the RGB channels, the alpha channel is left intact.
-  if (uid_data->copy_filter)
+  if (uid_data->all_copy_filter_coefs_needed)
   {
     out.Write("  float4 prev_row = SampleEFB(v_tex0, -1.0f);\n"
               "  float4 current_row = SampleEFB(v_tex0, 0.0f);\n"
               "  float4 next_row = SampleEFB(v_tex0, 1.0f);\n"
-              "  float4 texcol = float4(min(prev_row.rgb * filter_coefficients[0] +\n"
-              "                               current_row.rgb * filter_coefficients[1] +\n"
-              "                               next_row.rgb * filter_coefficients[2], \n"
+              "  float4 texcol = float4(min(prev_row.rgb * filter_coefficients[0] / 64.0 +\n"
+              "                               current_row.rgb * filter_coefficients[1] / 64.0 +\n"
+              "                               next_row.rgb * filter_coefficients[2] / 64.0, \n"
               "                             float3(1, 1, 1)), current_row.a);\n");
   }
   else
   {
     out.Write(
         "  float4 current_row = SampleEFB(v_tex0, 0.0f);\n"
-        "  float4 texcol = float4(min(current_row.rgb * filter_coefficients[1], float3(1, 1, 1)),\n"
-        "                         current_row.a);\n");
+        "  float4 texcol = float4(min(current_row.rgb * filter_coefficients[1] / 64.0,\n"
+        "                         float3(1, 1, 1)), current_row.a);\n");
   }
 
   if (uid_data->is_depth_copy)
diff --git a/Source/Core/VideoCommon/TextureConverterShaderGen.h b/Source/Core/VideoCommon/TextureConverterShaderGen.h
index 54665104f6..10745cb3dc 100644
--- a/Source/Core/VideoCommon/TextureConverterShaderGen.h
+++ b/Source/Core/VideoCommon/TextureConverterShaderGen.h
@@ -25,7 +25,9 @@ struct UidData
   u32 is_depth_copy : 1;
   u32 is_intensity : 1;
   u32 scale_by_half : 1;
-  u32 copy_filter : 1;
+  u32 all_copy_filter_coefs_needed : 1;
+  u32 copy_filter_can_overflow : 1;
+  u32 apply_gamma : 1;
 };
 #pragma pack()
 
@@ -35,7 +37,8 @@ ShaderCode GenerateVertexShader(APIType api_type);
 ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data);
 
 TCShaderUid GetShaderUid(EFBCopyFormat dst_format, bool is_depth_copy, bool is_intensity,
-                         bool scale_by_half, bool copy_filter);
+                         bool scale_by_half, float gamma_rcp,
+                         const std::array<u32, 3>& filter_coefficients);
 
 }  // namespace TextureConversionShaderGen
 
@@ -53,8 +56,10 @@ struct fmt::formatter<TextureConversionShaderGen::UidData>
       dst_format = fmt::to_string(uid.dst_format);
     return fmt::format_to(ctx.out(),
                           "dst_format: {}, efb_has_alpha: {}, is_depth_copy: {}, is_intensity: {}, "
-                          "scale_by_half: {}, copy_filter: {}",
+                          "scale_by_half: {}, all_copy_filter_coefs_needed: {}, "
+                          "copy_filter_can_overflow: {}, apply_gamma: {}",
                           dst_format, uid.efb_has_alpha, uid.is_depth_copy, uid.is_intensity,
-                          uid.scale_by_half, uid.copy_filter);
+                          uid.scale_by_half, uid.all_copy_filter_coefs_needed,
+                          uid.copy_filter_can_overflow, uid.apply_gamma);
   }
 };

From b16ec5b6dccc347f9fd15a4fad80edb173f007eb Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 20:34:15 -0800
Subject: [PATCH 08/11] Rework TextureConverterShaderGen for hardware accuracy
 and simplicity

---
 .../VideoCommon/TextureConverterShaderGen.cpp | 277 +++++++-----------
 1 file changed, 109 insertions(+), 168 deletions(-)

diff --git a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
index 6b10a34aed..3bee37060a 100644
--- a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
+++ b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
@@ -84,11 +84,25 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
   WriteHeader(api_type, out);
 
   out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp0;\n");
-  out.Write("float4 SampleEFB(float3 uv, float y_offset) {{\n"
-            "  return texture(samp0, float3(uv.x, clamp(uv.y + (y_offset * pixel_height), "
-            "clamp_tb.x, clamp_tb.y), {}));\n"
-            "}}\n",
+  out.Write("uint4 SampleEFB(float3 uv, float y_offset) {{\n"
+            "  float4 tex_sample = texture(samp0, float3(uv.x, clamp(uv.y + (y_offset * "
+            "pixel_height), clamp_tb.x, clamp_tb.y), {}));\n",
             mono_depth ? "0.0" : "uv.z");
+  if (uid_data->is_depth_copy)
+  {
+    if (!g_ActiveConfig.backend_info.bSupportsReversedDepthRange)
+      out.Write("  tex_sample.x = 1.0 - tex_sample.x;\n");
+
+    out.Write("  uint depth = uint(tex_sample.x * 16777216.0);\n"
+              "  return uint4((depth >> 16) & 255u, (depth >> 8) & 255u, depth & 255u, 255u);\n"
+              "}}\n");
+  }
+  else
+  {
+    out.Write("  return uint4(tex_sample * 255.0);\n"
+              "}}\n");
+  }
+
   if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
   {
     out.Write("VARYING_LOCATION(0) in VertexData {{\n"
@@ -99,6 +113,7 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
   {
     out.Write("VARYING_LOCATION(0) in vec3 v_tex0;\n");
   }
+
   out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"
             "void main()\n{{\n");
 
@@ -106,191 +121,117 @@ ShaderCode GeneratePixelShader(APIType api_type, const UidData* uid_data)
   // The filter is only applied to the RGB channels, the alpha channel is left intact.
   if (uid_data->all_copy_filter_coefs_needed)
   {
-    out.Write("  float4 prev_row = SampleEFB(v_tex0, -1.0f);\n"
-              "  float4 current_row = SampleEFB(v_tex0, 0.0f);\n"
-              "  float4 next_row = SampleEFB(v_tex0, 1.0f);\n"
-              "  float4 texcol = float4(min(prev_row.rgb * filter_coefficients[0] / 64.0 +\n"
-              "                               current_row.rgb * filter_coefficients[1] / 64.0 +\n"
-              "                               next_row.rgb * filter_coefficients[2] / 64.0, \n"
-              "                             float3(1, 1, 1)), current_row.a);\n");
+    out.Write("  uint4 prev_row = SampleEFB(v_tex0, -1.0f);\n"
+              "  uint4 current_row = SampleEFB(v_tex0, 0.0f);\n"
+              "  uint4 next_row = SampleEFB(v_tex0, 1.0f);\n"
+              "  uint3 combined_rows = prev_row.rgb * filter_coefficients[0] +\n"
+              "                        current_row.rgb * filter_coefficients[1] +\n"
+              "                        next_row.rgb * filter_coefficients[2];\n");
   }
   else
   {
-    out.Write(
-        "  float4 current_row = SampleEFB(v_tex0, 0.0f);\n"
-        "  float4 texcol = float4(min(current_row.rgb * filter_coefficients[1] / 64.0,\n"
-        "                         float3(1, 1, 1)), current_row.a);\n");
+    out.Write("  uint4 current_row = SampleEFB(v_tex0, 0.0f);\n"
+              "  uint3 combined_rows = current_row.rgb * filter_coefficients[1];\n");
+  }
+  out.Write("  // Shift right by 6 to divide by 64, as filter coefficients\n"
+            "  // that sum to 64 result in no change in brightness\n"
+            "  uint4 texcol_raw = uint4(combined_rows.rgb >> 6, {});\n",
+            uid_data->efb_has_alpha ? "current_row.a" : "255");
+
+  if (uid_data->copy_filter_can_overflow)
+    out.Write("  texcol_raw &= 0x1ffu;\n");
+  // The overflow mask above is only emitted when the coefficients sum to >= 128, but values above
+  // 255 can already occur once the sum exceeds 64, so this clamp is always emitted.
+  out.Write("  texcol_raw = min(texcol_raw, uint4(255, 255, 255, 255));\n");
+
+  if (uid_data->apply_gamma)
+  {
+    out.Write("  texcol_raw = uint4(round(pow(abs(float4(texcol_raw) / 255.0),\n"
+              "                     float4(gamma_rcp, gamma_rcp, gamma_rcp, 1.0)) * 255.0));\n");
   }
 
-  if (uid_data->is_depth_copy)
+  if (uid_data->is_intensity)
   {
-    if (!g_ActiveConfig.backend_info.bSupportsReversedDepthRange)
-      out.Write("texcol.x = 1.0 - texcol.x;\n");
-
-    out.Write("  int depth = int(texcol.x * 16777216.0);\n"
-
-              // Convert to Z24 format
-              "  int4 workspace;\n"
-              "  workspace.r = (depth >> 16) & 255;\n"
-              "  workspace.g = (depth >> 8) & 255;\n"
-              "  workspace.b = depth & 255;\n"
-
-              // Convert to Z4 format
-              "  workspace.a = (depth >> 16) & 0xF0;\n"
-
-              // Normalize components to [0.0..1.0]
-              "  texcol = float4(workspace) / 255.0;\n");
-    switch (uid_data->dst_format)
-    {
-    case EFBCopyFormat::R4:  // Z4
-      out.Write("  ocol0 = texcol.aaaa;\n");
-      break;
-
-    case EFBCopyFormat::R8_0x1:  // Z8
-    case EFBCopyFormat::R8:      // Z8H
-      out.Write("  ocol0 = texcol.rrrr;\n");
-      break;
-
-    case EFBCopyFormat::RA8:  // Z16
-      out.Write("  ocol0 = texcol.gggr;\n");
-      break;
-
-    case EFBCopyFormat::RG8:  // Z16 (reverse order)
-      out.Write("  ocol0 = texcol.rrrg;\n");
-      break;
-
-    case EFBCopyFormat::RGBA8:  // Z24X8
-      out.Write("  ocol0 = float4(texcol.rgb, 1.0);\n");
-      break;
-
-    case EFBCopyFormat::G8:  // Z8M
-      out.Write("  ocol0 = texcol.gggg;\n");
-      break;
-
-    case EFBCopyFormat::B8:  // Z8L
-      out.Write("  ocol0 = texcol.bbbb;\n");
-      break;
-
-    case EFBCopyFormat::GB8:  // Z16L - copy lower 16 depth bits
-      // expected to be used as an IA8 texture (upper 8 bits stored as intensity, lower 8 bits
-      // stored as alpha)
-      // Used e.g. in Zelda: Skyward Sword
-      out.Write("  ocol0 = texcol.gggb;\n");
-      break;
-
-    default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy zbuf format: {}", uid_data->dst_format);
-      out.Write("  ocol0 = float4(texcol.bgr, 0.0);\n");
-      break;
-    }
+    out.Write("  // Intensity/YUV format conversion constants determined by hardware testing\n"
+              "  const float4 y_const = float4( 66, 129,  25,  16);\n"
+              "  const float4 u_const = float4(-38, -74, 112, 128);\n"
+              "  const float4 v_const = float4(112, -94, -18, 128);\n"
+              "  // Intensity/YUV format conversion\n"
+              "  texcol_raw.rgb = uint3(dot(y_const, float4(texcol_raw.rgb, 256)),\n"
+              "                         dot(u_const, float4(texcol_raw.rgb, 256)),\n"
+              "                         dot(v_const, float4(texcol_raw.rgb, 256)));\n"
+              "  // Divide by 256 and round .5 and higher up\n"
+              "  texcol_raw.rgb = (texcol_raw.rgb >> 8) + ((texcol_raw.rgb >> 7) & 1);\n");
   }
-  else if (uid_data->is_intensity)
+
+  switch (uid_data->dst_format)
   {
-    if (!uid_data->efb_has_alpha)
-      out.Write("  texcol.a = 1.0;\n");
+  case EFBCopyFormat::R4:  // R4
+    out.Write("  float red = float(texcol_raw.r & 0xF0u) / 240.0;\n"
+              "  ocol0 = float4(red, red, red, red);\n");
+    break;
 
-    bool has_four_bits =
-        (uid_data->dst_format == EFBCopyFormat::R4 || uid_data->dst_format == EFBCopyFormat::RA4);
-    bool has_alpha =
-        (uid_data->dst_format == EFBCopyFormat::RA4 || uid_data->dst_format == EFBCopyFormat::RA8);
+  case EFBCopyFormat::R8_0x1:  // R8
+  case EFBCopyFormat::R8:      // R8
+    out.Write("  ocol0 = float4(texcol_raw).rrrr / 255.0;\n");
+    break;
 
-    switch (uid_data->dst_format)
-    {
-    case EFBCopyFormat::R4:      // I4
-    case EFBCopyFormat::R8_0x1:  // I8
-    case EFBCopyFormat::R8:      // I8
-    case EFBCopyFormat::RA4:     // IA4
-    case EFBCopyFormat::RA8:     // IA8
-      if (has_four_bits)
-        out.Write("  texcol = float4(int4(texcol * 255.0) & 0xF0) * (1.0 / 240.0);\n");
+  case EFBCopyFormat::RA4:  // RA4
+    out.Write("  float2 red_alpha = float2(texcol_raw.ra & 0xF0u) / 240.0;\n"
+              "  ocol0 = red_alpha.rrrg;\n");
+    break;
 
-      // TODO - verify these coefficients
-      out.Write("  const float3 coefficients = float3(0.257, 0.504, 0.098);\n"
-                "  float intensity = dot(texcol.rgb, coefficients) + 16.0 / 255.0;\n"
-                "  ocol0 = float4(intensity, intensity, intensity, {});\n",
-                has_alpha ? "texcol.a" : "intensity");
-      break;
+  case EFBCopyFormat::RA8:  // RA8
+    out.Write("  ocol0 = float4(texcol_raw).rrra / 255.0;\n");
+    break;
 
-    default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy intensity format: {}", uid_data->dst_format);
-      out.Write("  ocol0 = texcol;\n");
-      break;
-    }
-  }
-  else
-  {
-    if (!uid_data->efb_has_alpha)
-      out.Write("  texcol.a = 1.0;\n");
+  case EFBCopyFormat::A8:  // A8
+    out.Write("  ocol0 = float4(texcol_raw).aaaa / 255.0;\n");
+    break;
 
-    switch (uid_data->dst_format)
-    {
-    case EFBCopyFormat::R4:  // R4
-      out.Write("  float red = float(int(texcol.r * 255.0) & 0xF0) * (1.0 / 240.0);\n"
-                "  ocol0 = float4(red, red, red, red);\n");
-      break;
+  case EFBCopyFormat::G8:  // G8
+    out.Write("  ocol0 = float4(texcol_raw).gggg / 255.0;\n");
+    break;
 
-    case EFBCopyFormat::R8_0x1:  // R8
-    case EFBCopyFormat::R8:      // R8
-      out.Write("  ocol0 = texcol.rrrr;\n");
-      break;
+  case EFBCopyFormat::B8:  // B8
+    out.Write("  ocol0 = float4(texcol_raw).bbbb / 255.0;\n");
+    break;
 
-    case EFBCopyFormat::RA4:  // RA4
-      out.Write("  float2 red_alpha = float2(int2(texcol.ra * 255.0) & 0xF0) * (1.0 / 240.0);\n"
-                "  ocol0 = red_alpha.rrrg;\n");
-      break;
+  case EFBCopyFormat::RG8:  // RG8
+    out.Write("  ocol0 = float4(texcol_raw).rrrg / 255.0;\n");
+    break;
 
-    case EFBCopyFormat::RA8:  // RA8
-      out.Write("  ocol0 = texcol.rrra;\n");
-      break;
+  case EFBCopyFormat::GB8:  // GB8
+    out.Write("  ocol0 = float4(texcol_raw).gggb / 255.0;\n");
+    break;
 
-    case EFBCopyFormat::A8:  // A8
-      out.Write("  ocol0 = texcol.aaaa;\n");
-      break;
+  case EFBCopyFormat::RGB565:  // RGB565
+    out.Write("  float2 red_blue = float2(texcol_raw.rb & 0xF8u) / 248.0;\n"
+              "  float green = float(texcol_raw.g & 0xFCu) / 252.0;\n"
+              "  ocol0 = float4(red_blue.r, green, red_blue.g, 1.0);\n");
+    break;
 
-    case EFBCopyFormat::G8:  // G8
-      out.Write("  ocol0 = texcol.gggg;\n");
-      break;
+  case EFBCopyFormat::RGB5A3:  // RGB5A3
+    // TODO: The MSB controls whether we have RGB5 or RGB4A3, this selection
+    // will need to be implemented once we move away from floats.
+    out.Write("  float3 color = float3(texcol_raw.rgb & 0xF8u) / 248.0;\n"
+              "  float alpha = float(texcol_raw.a & 0xE0u) / 224.0;\n"
+              "  ocol0 = float4(color, alpha);\n");
+    break;
 
-    case EFBCopyFormat::B8:  // B8
-      out.Write("  ocol0 = texcol.bbbb;\n");
-      break;
+  case EFBCopyFormat::RGBA8:  // RGBA8
+    out.Write("  ocol0 = float4(texcol_raw.rgba) / 255.0;\n");
+    break;
 
-    case EFBCopyFormat::RG8:  // RG8
-      out.Write("  ocol0 = texcol.rrrg;\n");
-      break;
+  case EFBCopyFormat::XFB:
+    out.Write("  ocol0 = float4(float3(texcol_raw.rgb) / 255.0, 1.0);\n");
+    break;
 
-    case EFBCopyFormat::GB8:  // GB8
-      out.Write("  ocol0 = texcol.gggb;\n");
-      break;
-
-    case EFBCopyFormat::RGB565:  // RGB565
-      out.Write("  float2 red_blue = float2(int2(texcol.rb * 255.0) & 0xF8) * (1.0 / 248.0);\n"
-                "  float green = float(int(texcol.g * 255.0) & 0xFC) * (1.0 / 252.0);\n"
-                "  ocol0 = float4(red_blue.r, green, red_blue.g, 1.0);\n");
-      break;
-
-    case EFBCopyFormat::RGB5A3:  // RGB5A3
-      // TODO: The MSB controls whether we have RGB5 or RGB4A3, this selection
-      // will need to be implemented once we move away from floats.
-      out.Write("  float3 color = float3(int3(texcol.rgb * 255.0) & 0xF8) * (1.0 / 248.0);\n"
-                "  float alpha = float(int(texcol.a * 255.0) & 0xE0) * (1.0 / 224.0);\n"
-                "  ocol0 = float4(color, alpha);\n");
-      break;
-
-    case EFBCopyFormat::RGBA8:  // RGBA8
-      out.Write("  ocol0 = texcol;\n");
-      break;
-
-    case EFBCopyFormat::XFB:
-      out.Write("  ocol0 = float4(pow(abs(texcol.rgb), float3(gamma_rcp, gamma_rcp, gamma_rcp)), "
-                "1.0f);\n");
-      break;
-
-    default:
-      ERROR_LOG_FMT(VIDEO, "Unknown copy color format: {}", uid_data->dst_format);
-      out.Write("  ocol0 = texcol;\n");
-      break;
-    }
+  default:
+    ERROR_LOG_FMT(VIDEO, "Unknown copy/intensity color format: {} {}", uid_data->dst_format,
+                  uid_data->is_intensity);
+    out.Write("  ocol0 = float4(texcol_raw.rgba) / 255.0;\n");
+    break;
   }
 
   out.Write("}}\n");

From e7339d63f15bdf2b41e3f6fab1c16b844514ae3a Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 20:36:11 -0800
Subject: [PATCH 09/11] Rework TextureConversionShader for hardware accuracy
 and simplicity

---
 .../VideoCommon/TextureConversionShader.cpp   | 535 ++++--------------
 1 file changed, 116 insertions(+), 419 deletions(-)

diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp
index fc27c9cc99..c1aac4482a 100644
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -18,8 +18,6 @@
 
 namespace TextureConversionShaderTiled
 {
-static bool IntensityConstantAdded = false;
-
 u16 GetEncodedSampleCount(EFBCopyFormat format)
 {
   switch (format)
@@ -85,115 +83,124 @@ static void WriteHeader(ShaderCode& code, APIType api_type)
 
              "float4 RGBA8ToRGBA6(float4 src)\n"
              "{{\n"
-             "  int4 val = int4(roundEven(src * 255.0)) >> 2;\n"
-             "  return float4(val) / 63.0;\n"
+             "  int4 val = int4(roundEven(src * 255.0));\n"
+             "  val = (val & 0xfc) | (val >> 6);\n"
+             "  return float4(val) / 255.0;\n"
              "}}\n"
 
              "float4 RGBA8ToRGB565(float4 src)\n"
              "{{\n"
              "  int4 val = int4(roundEven(src * 255.0));\n"
-             "  val = int4(val.r >> 3, val.g >> 2, val.b >> 3, 1);\n"
-             "  return float4(val) / float4(31.0, 63.0, 31.0, 1.0);\n"
+             "  val.r = (val.r & 0xf8) | (val.r >> 5);\n"
+             "  val.g = (val.g & 0xfc) | (val.g >> 6);\n"
+             "  val.b = (val.b & 0xf8) | (val.b >> 5);\n"
+             "  val.a = 255;\n"
+             "  return float4(val) / 255.0;\n"
              "}}\n");
 }
 
 static void WriteSampleFunction(ShaderCode& code, const EFBCopyParams& params, APIType api_type)
 {
-  const auto WriteSampleOp = [api_type, &code, &params](int yoffset) {
-    if (!params.depth)
-    {
-      switch (params.efb_format)
-      {
-      case PixelFormat::RGB8_Z24:
-        code.Write("RGBA8ToRGB8(");
-        break;
-      case PixelFormat::RGBA6_Z24:
-        code.Write("RGBA8ToRGBA6(");
-        break;
-      case PixelFormat::RGB565_Z16:
-        code.Write("RGBA8ToRGB565(");
-        break;
-      default:
-        code.Write("(");
-        break;
-      }
-    }
-    else
-    {
-      // Handle D3D depth inversion.
-      if (!g_ActiveConfig.backend_info.bSupportsReversedDepthRange)
-        code.Write("1.0 - (");
-      else
-        code.Write("(");
-    }
+  code.Write("uint4 SampleEFB0(float2 uv, float2 pixel_size, float x_offset, float y_offset) {{\n"
+             "  float4 tex_sample = texture(samp0, float3(uv.x + x_offset * pixel_size.x, ");
 
-    code.Write("texture(samp0, float3(");
+  // Reverse the direction for OpenGL, since positive numbers are distance from the bottom row.
+  // TODO: This isn't done on TextureConverterShaderGen - maybe it handles that via pixel_size?
+  if (api_type == APIType::OpenGL)
+    code.Write("clamp(uv.y - y_offset * pixel_size.y, clamp_tb.x, clamp_tb.y)");
+  else
+    code.Write("clamp(uv.y + y_offset * pixel_size.y, clamp_tb.x, clamp_tb.y)");
 
-    code.Write("uv.x + float(xoffset) * pixel_size.x, ");
+  code.Write(", 0.0));\n");
 
-    // Reverse the direction for OpenGL, since positive numbers are distance from the bottom row.
-    if (yoffset != 0)
-    {
-      if (api_type == APIType::OpenGL)
-        code.Write("clamp(uv.y - float({}) * pixel_size.y, clamp_tb.x, clamp_tb.y)", yoffset);
-      else
-        code.Write("clamp(uv.y + float({}) * pixel_size.y, clamp_tb.x, clamp_tb.y)", yoffset);
-    }
-    else
-    {
-      code.Write("uv.y");
-    }
+  // TODO: Is this really needed?  Doesn't the EFB only store appropriate values?  Or is this for
+  // EFB2Ram having consistent output with force 32-bit color?
+  if (params.efb_format == PixelFormat::RGB8_Z24)
+    code.Write("  tex_sample = RGBA8ToRGB8(tex_sample);\n");
+  else if (params.efb_format == PixelFormat::RGBA6_Z24)
+    code.Write("  tex_sample = RGBA8ToRGBA6(tex_sample);\n");
+  else if (params.efb_format == PixelFormat::RGB565_Z16)
+    code.Write("  tex_sample = RGBA8ToRGB565(tex_sample);\n");
 
-    code.Write(", 0.0)))");
-  };
-
-  // The copy filter applies to both color and depth copies. This has been verified on hardware.
-  // The filter is only applied to the RGB channels, the alpha channel is left intact.
-  code.Write("float4 SampleEFB(float2 uv, float2 pixel_size, int xoffset)\n"
-             "{{\n");
-  if (params.all_copy_filter_coefs_needed)
+  if (params.depth)
   {
-    code.Write("  float4 prev_row = ");
-    WriteSampleOp(-1);
-    code.Write(";\n"
-               "  float4 current_row = ");
-    WriteSampleOp(0);
-    code.Write(";\n"
-               "  float4 next_row = ");
-    WriteSampleOp(1);
-    code.Write(";\n"
-               "  return float4(min(prev_row.rgb * filter_coefficients[0] / 64.0 +\n"
-               "                      current_row.rgb * filter_coefficients[1] / 64.0 +\n"
-               "                      next_row.rgb * filter_coefficients[2] / 64.0, \n"
-               "                    float3(1, 1, 1)), current_row.a);\n");
+    if (!g_ActiveConfig.backend_info.bSupportsReversedDepthRange)
+      code.Write("  tex_sample.x = 1.0 - tex_sample.x;\n");
+
+    code.Write("  uint depth = uint(tex_sample.x * 16777216.0);\n"
+               "  return uint4((depth >> 16) & 255u, (depth >> 8) & 255u, depth & 255u, 255u);\n"
+               "}}\n");
   }
   else
   {
-    code.Write("  float4 current_row = ");
-    WriteSampleOp(0);
-    code.Write(";\n"
-               "return float4(min(current_row.rgb * filter_coefficients[1] / 64.0, float3(1, 1, 1)),\n"
-               "              current_row.a);\n");
+    code.Write("  return uint4(tex_sample * 255.0);\n"
+               "}}\n");
   }
+
+  // The copy filter applies to both color and depth copies. This has been verified on hardware.
+  // The filter is only applied to the RGB channels, the alpha channel is left intact.
+  code.Write("float4 SampleEFB(float2 uv, float2 pixel_size, int x_offset)\n"
+             "{{\n");
+  if (params.all_copy_filter_coefs_needed)
+  {
+    code.Write("  uint4 prev_row = SampleEFB0(uv, pixel_size, float(x_offset), -1.0f);\n"
+               "  uint4 current_row = SampleEFB0(uv, pixel_size, float(x_offset), 0.0f);\n"
+               "  uint4 next_row = SampleEFB0(uv, pixel_size, float(x_offset), 1.0f);\n"
+               "  uint3 combined_rows = prev_row.rgb * filter_coefficients[0] +\n"
+               "                        current_row.rgb * filter_coefficients[1] +\n"
+               "                        next_row.rgb * filter_coefficients[2];\n");
+  }
+  else
+  {
+    code.Write("  uint4 current_row = SampleEFB0(uv, pixel_size, float(x_offset), 0.0f);\n"
+               "  uint3 combined_rows = current_row.rgb * filter_coefficients[1];\n");
+  }
+  code.Write("  // Shift right by 6 to divide by 64, as filter coefficients\n"
+             "  // that sum to 64 result in no change in brightness\n"
+             "  uint4 texcol_raw = uint4(combined_rows.rgb >> 6, current_row.a);\n");
+
+  if (params.copy_filter_can_overflow)
+    code.Write("  texcol_raw &= 0x1ffu;\n");
+  // Note that overflow (masked above) only occurs when the coefficients sum to >= 128, but the
+  // result can already exceed 255 once they sum to more than 64, so we always include the clamp.
+  code.Write("  texcol_raw = min(texcol_raw, uint4(255, 255, 255, 255));\n");
+
+  if (params.apply_gamma)
+  {
+    code.Write("  texcol_raw = uint4(round(pow(float4(texcol_raw) / 255.0,\n"
+               "                     float4(gamma_rcp, gamma_rcp, gamma_rcp, 1.0)) * 255.0));\n");
+  }
+
+  if (params.yuv)
+  {
+    code.Write("  // Intensity/YUV format conversion constants determined by hardware testing\n"
+               "  const float4 y_const = float4( 66, 129,  25,  16);\n"
+               "  const float4 u_const = float4(-38, -74, 112, 128);\n"
+               "  const float4 v_const = float4(112, -94, -18, 128);\n"
+               "  // Intensity/YUV format conversion\n"
+               "  texcol_raw.rgb = uint3(dot(y_const, float4(texcol_raw.rgb, 256)),\n"
+               "                         dot(u_const, float4(texcol_raw.rgb, 256)),\n"
+               "                         dot(v_const, float4(texcol_raw.rgb, 256)));\n"
+               "  // Divide by 256 and round .5 and higher up\n"
+               "  texcol_raw.rgb = (texcol_raw.rgb >> 8) + ((texcol_raw.rgb >> 7) & 1);\n");
+  }
+
+  code.Write("  return float4(texcol_raw) / 255.0;\n");
   code.Write("}}\n");
 }
 
 // Block dimensions   : widthStride, heightStride
 // Texture dimensions : width, height, x offset, y offset
-static void WriteSwizzler(ShaderCode& code, const EFBCopyParams& params, EFBCopyFormat format,
-                          APIType api_type)
+static void WriteSwizzler(ShaderCode& code, const EFBCopyParams& params, APIType api_type)
 {
-  WriteHeader(code, api_type);
-  WriteSampleFunction(code, params, api_type);
-
   code.Write("void main()\n"
              "{{\n"
              "  int2 sampleUv;\n"
              "  int2 uv1 = int2(gl_FragCoord.xy);\n");
 
-  const int blkW = TexDecoder_GetEFBCopyBlockWidthInTexels(format);
-  const int blkH = TexDecoder_GetEFBCopyBlockHeightInTexels(format);
-  int samples = GetEncodedSampleCount(format);
+  const int blkW = TexDecoder_GetEFBCopyBlockWidthInTexels(params.copy_format);
+  const int blkH = TexDecoder_GetEFBCopyBlockHeightInTexels(params.copy_format);
+  int samples = GetEncodedSampleCount(params.copy_format);
 
   code.Write("  int x_block_position = (uv1.x >> {}) << {};\n", IntLog2(blkH * blkW / samples),
              IntLog2(blkW));
@@ -243,146 +250,13 @@ static void WriteSampleColor(ShaderCode& code, std::string_view color_comp, std:
   code.Write("  {} = SampleEFB(uv0, pixel_size, {}).{};\n", dest, x_offset, color_comp);
 }
 
-static void WriteColorToIntensity(ShaderCode& code, std::string_view src, std::string_view dest)
-{
-  if (!IntensityConstantAdded)
-  {
-    code.Write("  float4 IntensityConst = float4(0.257f,0.504f,0.098f,0.0625f);\n");
-    IntensityConstantAdded = true;
-  }
-  code.Write("  {} = dot(IntensityConst.rgb, {}.rgb);\n", dest, src);
-  // don't add IntensityConst.a yet, because doing it later is faster and uses less instructions,
-  // due to vectorization
-}
-
 static void WriteToBitDepth(ShaderCode& code, u8 depth, std::string_view src, std::string_view dest)
 {
   code.Write("  {} = floor({} * 255.0 / exp2(8.0 - {}.0));\n", dest, src, depth);
 }
 
-static void WriteEncoderEnd(ShaderCode& code)
-{
-  code.Write("}}\n");
-  IntensityConstantAdded = false;
-}
-
-static void WriteI8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::R8, api_type);
-  code.Write("  float3 texSample;\n");
-
-  WriteSampleColor(code, "rgb", "texSample", 0, api_type, params);
-  WriteColorToIntensity(code, "texSample", "ocol0.b");
-
-  WriteSampleColor(code, "rgb", "texSample", 1, api_type, params);
-  WriteColorToIntensity(code, "texSample", "ocol0.g");
-
-  WriteSampleColor(code, "rgb", "texSample", 2, api_type, params);
-  WriteColorToIntensity(code, "texSample", "ocol0.r");
-
-  WriteSampleColor(code, "rgb", "texSample", 3, api_type, params);
-  WriteColorToIntensity(code, "texSample", "ocol0.a");
-
-  // See WriteColorToIntensity
-  code.Write("  ocol0.rgba += IntensityConst.aaaa;\n");
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteI4Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::R4, api_type);
-  code.Write("  float3 texSample;\n"
-             "  float4 color0;\n"
-             "  float4 color1;\n");
-
-  WriteSampleColor(code, "rgb", "texSample", 0, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color0.b");
-
-  WriteSampleColor(code, "rgb", "texSample", 1, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color1.b");
-
-  WriteSampleColor(code, "rgb", "texSample", 2, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color0.g");
-
-  WriteSampleColor(code, "rgb", "texSample", 3, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color1.g");
-
-  WriteSampleColor(code, "rgb", "texSample", 4, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color0.r");
-
-  WriteSampleColor(code, "rgb", "texSample", 5, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color1.r");
-
-  WriteSampleColor(code, "rgb", "texSample", 6, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color0.a");
-
-  WriteSampleColor(code, "rgb", "texSample", 7, api_type, params);
-  WriteColorToIntensity(code, "texSample", "color1.a");
-
-  code.Write("  color0.rgba += IntensityConst.aaaa;\n"
-             "  color1.rgba += IntensityConst.aaaa;\n");
-
-  WriteToBitDepth(code, 4, "color0", "color0");
-  WriteToBitDepth(code, 4, "color1", "color1");
-
-  code.Write("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
-  WriteEncoderEnd(code);
-}
-
-static void WriteIA8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
-  code.Write("  float4 texSample;\n");
-
-  WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);
-  code.Write("  ocol0.b = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "ocol0.g");
-
-  WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);
-  code.Write("  ocol0.r = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "ocol0.a");
-
-  code.Write("  ocol0.ga += IntensityConst.aa;\n");
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteIA4Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::RA4, api_type);
-  code.Write("  float4 texSample;\n"
-             "  float4 color0;\n"
-             "  float4 color1;\n");
-
-  WriteSampleColor(code, "rgba", "texSample", 0, api_type, params);
-  code.Write("  color0.b = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "color1.b");
-
-  WriteSampleColor(code, "rgba", "texSample", 1, api_type, params);
-  code.Write("  color0.g = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "color1.g");
-
-  WriteSampleColor(code, "rgba", "texSample", 2, api_type, params);
-  code.Write("  color0.r = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "color1.r");
-
-  WriteSampleColor(code, "rgba", "texSample", 3, api_type, params);
-  code.Write("  color0.a = texSample.a;\n");
-  WriteColorToIntensity(code, "texSample", "color1.a");
-
-  code.Write("  color1.rgba += IntensityConst.aaaa;\n");
-
-  WriteToBitDepth(code, 4, "color0", "color0");
-  WriteToBitDepth(code, 4, "color1", "color1");
-
-  code.Write("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
-  WriteEncoderEnd(code);
-}
-
 static void WriteRGB565Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::RGB565, api_type);
   code.Write("  float3 texSample0;\n"
              "  float3 texSample1;\n");
 
@@ -402,13 +276,10 @@ static void WriteRGB565Encoder(ShaderCode& code, APIType api_type, const EFBCopy
   code.Write("  ocol0.ga = ocol0.ga + gLower * 32.0;\n");
 
   code.Write("  ocol0 = ocol0 / 255.0;\n");
-  WriteEncoderEnd(code);
 }
 
 static void WriteRGB5A3Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::RGB5A3, api_type);
-
   code.Write("  float4 texSample;\n"
              "  float color0;\n"
              "  float gUpper;\n"
@@ -466,13 +337,10 @@ static void WriteRGB5A3Encoder(ShaderCode& code, APIType api_type, const EFBCopy
   code.Write("}}\n");
 
   code.Write("  ocol0 = ocol0 / 255.0;\n");
-  WriteEncoderEnd(code);
 }
 
 static void WriteRGBA8Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::RGBA8, api_type);
-
   code.Write("  float4 texSample;\n"
              "  float4 color0;\n"
              "  float4 color1;\n");
@@ -490,14 +358,11 @@ static void WriteRGBA8Encoder(ShaderCode& code, APIType api_type, const EFBCopyP
              "  color1.a = texSample.b;\n");
 
   code.Write("  ocol0 = first ? color0 : color1;\n");
-
-  WriteEncoderEnd(code);
 }
 
 static void WriteC4Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
                            const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::R4, api_type);
   code.Write("  float4 color0;\n"
              "  float4 color1;\n");
 
@@ -514,26 +379,20 @@ static void WriteC4Encoder(ShaderCode& code, std::string_view comp, APIType api_
   WriteToBitDepth(code, 4, "color1", "color1");
 
   code.Write("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
-  WriteEncoderEnd(code);
 }
 
 static void WriteC8Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
                            const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::R8, api_type);
-
   WriteSampleColor(code, comp, "ocol0.b", 0, api_type, params);
   WriteSampleColor(code, comp, "ocol0.g", 1, api_type, params);
   WriteSampleColor(code, comp, "ocol0.r", 2, api_type, params);
   WriteSampleColor(code, comp, "ocol0.a", 3, api_type, params);
-
-  WriteEncoderEnd(code);
 }
 
 static void WriteCC4Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
                             const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::RA4, api_type);
   code.Write("  float2 texSample;\n"
              "  float4 color0;\n"
              "  float4 color1;\n");
@@ -558,198 +417,52 @@ static void WriteCC4Encoder(ShaderCode& code, std::string_view comp, APIType api
   WriteToBitDepth(code, 4, "color1", "color1");
 
   code.Write("  ocol0 = (color0 * 16.0 + color1) / 255.0;\n");
-  WriteEncoderEnd(code);
 }
 
 static void WriteCC8Encoder(ShaderCode& code, std::string_view comp, APIType api_type,
                             const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
-
   WriteSampleColor(code, comp, "ocol0.bg", 0, api_type, params);
   WriteSampleColor(code, comp, "ocol0.ra", 1, api_type, params);
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteZ8Encoder(ShaderCode& code, std::string_view multiplier, APIType api_type,
-                           const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::G8, api_type);
-
-  code.Write(" float depth;\n");
-
-  WriteSampleColor(code, "r", "depth", 0, api_type, params);
-  code.Write("ocol0.b = frac(depth * {});\n", multiplier);
-
-  WriteSampleColor(code, "r", "depth", 1, api_type, params);
-  code.Write("ocol0.g = frac(depth * {});\n", multiplier);
-
-  WriteSampleColor(code, "r", "depth", 2, api_type, params);
-  code.Write("ocol0.r = frac(depth * {});\n", multiplier);
-
-  WriteSampleColor(code, "r", "depth", 3, api_type, params);
-  code.Write("ocol0.a = frac(depth * {});\n", multiplier);
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteZ16Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::RA8, api_type);
-
-  code.Write("  float depth;\n"
-             "  float3 expanded;\n");
-
-  // Byte order is reversed
-
-  WriteSampleColor(code, "r", "depth", 0, api_type, params);
-
-  code.Write("  depth *= 16777216.0;\n"
-             "  expanded.r = floor(depth / (256.0 * 256.0));\n"
-             "  depth -= expanded.r * 256.0 * 256.0;\n"
-             "  expanded.g = floor(depth / 256.0);\n");
-
-  code.Write("  ocol0.b = expanded.g / 255.0;\n"
-             "  ocol0.g = expanded.r / 255.0;\n");
-
-  WriteSampleColor(code, "r", "depth", 1, api_type, params);
-
-  code.Write("  depth *= 16777216.0;\n"
-             "  expanded.r = floor(depth / (256.0 * 256.0));\n"
-             "  depth -= expanded.r * 256.0 * 256.0;\n"
-             "  expanded.g = floor(depth / 256.0);\n");
-
-  code.Write("  ocol0.r = expanded.g / 255.0;\n"
-             "  ocol0.a = expanded.r / 255.0;\n");
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteZ16LEncoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::GB8, api_type);
-
-  code.Write("  float depth;\n"
-             "  float3 expanded;\n");
-
-  // Byte order is reversed
-
-  WriteSampleColor(code, "r", "depth", 0, api_type, params);
-
-  code.Write("  depth *= 16777216.0;\n"
-             "  expanded.r = floor(depth / (256.0 * 256.0));\n"
-             "  depth -= expanded.r * 256.0 * 256.0;\n"
-             "  expanded.g = floor(depth / 256.0);\n"
-             "  depth -= expanded.g * 256.0;\n"
-             "  expanded.b = depth;\n");
-
-  code.Write("  ocol0.b = expanded.b / 255.0;\n"
-             "  ocol0.g = expanded.g / 255.0;\n");
-
-  WriteSampleColor(code, "r", "depth", 1, api_type, params);
-
-  code.Write("  depth *= 16777216.0;\n"
-             "  expanded.r = floor(depth / (256.0 * 256.0));\n"
-             "  depth -= expanded.r * 256.0 * 256.0;\n"
-             "  expanded.g = floor(depth / 256.0);\n"
-             "  depth -= expanded.g * 256.0;\n"
-             "  expanded.b = depth;\n");
-
-  code.Write("  ocol0.r = expanded.b / 255.0;\n"
-             "  ocol0.a = expanded.g / 255.0;\n");
-
-  WriteEncoderEnd(code);
-}
-
-static void WriteZ24Encoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
-{
-  WriteSwizzler(code, params, EFBCopyFormat::RGBA8, api_type);
-
-  code.Write("  float depth0;\n"
-             "  float depth1;\n"
-             "  float3 expanded0;\n"
-             "  float3 expanded1;\n");
-
-  WriteSampleColor(code, "r", "depth0", 0, api_type, params);
-  WriteSampleColor(code, "r", "depth1", 1, api_type, params);
-
-  for (int i = 0; i < 2; i++)
-  {
-    code.Write("  depth{} *= 16777216.0;\n", i);
-
-    code.Write("  expanded{}.r = floor(depth{} / (256.0 * 256.0));\n", i, i);
-    code.Write("  depth{} -= expanded{}.r * 256.0 * 256.0;\n", i, i);
-    code.Write("  expanded{}.g = floor(depth{} / 256.0);\n", i, i);
-    code.Write("  depth{} -= expanded{}.g * 256.0;\n", i, i);
-    code.Write("  expanded{}.b = depth{};\n", i, i);
-  }
-
-  code.Write("  if (!first) {{\n");
-  // Upper 16
-  code.Write("     ocol0.b = expanded0.g / 255.0;\n"
-             "     ocol0.g = expanded0.b / 255.0;\n"
-             "     ocol0.r = expanded1.g / 255.0;\n"
-             "     ocol0.a = expanded1.b / 255.0;\n"
-             "  }} else {{\n");
-  // Lower 8
-  code.Write("     ocol0.b = 1.0;\n"
-             "     ocol0.g = expanded0.r / 255.0;\n"
-             "     ocol0.r = 1.0;\n"
-             "     ocol0.a = expanded1.r / 255.0;\n"
-             "  }}\n");
-
-  WriteEncoderEnd(code);
 }
 
 static void WriteXFBEncoder(ShaderCode& code, APIType api_type, const EFBCopyParams& params)
 {
-  WriteSwizzler(code, params, EFBCopyFormat::XFB, api_type);
-
-  code.Write("float3 color0, color1;\n");
-  WriteSampleColor(code, "rgb", "color0", 0, api_type, params);
-  WriteSampleColor(code, "rgb", "color1", 1, api_type, params);
-
-  // Gamma is only applied to XFB copies.
-  code.Write("  color0 = pow(abs(color0), float3(gamma_rcp, gamma_rcp, gamma_rcp));\n"
-             "  color1 = pow(abs(color1), float3(gamma_rcp, gamma_rcp, gamma_rcp));\n");
+  code.Write("float4 color0 = float4(0, 0, 0, 1), color1 = float4(0, 0, 0, 1);\n");
+  WriteSampleColor(code, "rgb", "color0.rgb", 0, api_type, params);
+  WriteSampleColor(code, "rgb", "color1.rgb", 1, api_type, params);
 
   // Convert to YUV.
-  code.Write("  const float3 y_const = float3(0.257, 0.504, 0.098);\n"
-             "  const float3 u_const = float3(-0.148, -0.291, 0.439);\n"
-             "  const float3 v_const = float3(0.439, -0.368, -0.071);\n"
-             "  float3 average = (color0 + color1) * 0.5;\n"
-             "  ocol0.b = dot(color0,  y_const) + 0.0625;\n"
-             "  ocol0.g = dot(average, u_const) + 0.5;\n"
-             "  ocol0.r = dot(color1,  y_const) + 0.0625;\n"
-             "  ocol0.a = dot(average, v_const) + 0.5;\n");
-
-  WriteEncoderEnd(code);
+  code.Write("  // Intensity/YUV format conversion constants determined by hardware testing\n"
+             "  const float4 y_const = float4( 66, 129,  25,  16);\n"
+             "  const float4 u_const = float4(-38, -74, 112, 128);\n"
+             "  const float4 v_const = float4(112, -94, -18, 128);\n"
+             "  float4 average = (color0 + color1) * 0.5;\n"
+             "  // TODO: check rounding\n"
+             "  ocol0.b = round(dot(color0,  y_const)) / 256.0;\n"
+             "  ocol0.g = round(dot(average, u_const)) / 256.0;\n"
+             "  ocol0.r = round(dot(color1,  y_const)) / 256.0;\n"
+             "  ocol0.a = round(dot(average, v_const)) / 256.0;\n");
 }
 
 std::string GenerateEncodingShader(const EFBCopyParams& params, APIType api_type)
 {
   ShaderCode code;
 
+  WriteHeader(code, api_type);
+  WriteSampleFunction(code, params, api_type);
+  WriteSwizzler(code, params, api_type);
+
   switch (params.copy_format)
   {
   case EFBCopyFormat::R4:
-    if (params.yuv)
-      WriteI4Encoder(code, api_type, params);
-    else
-      WriteC4Encoder(code, "r", api_type, params);
+    WriteC4Encoder(code, "r", api_type, params);
     break;
   case EFBCopyFormat::RA4:
-    if (params.yuv)
-      WriteIA4Encoder(code, api_type, params);
-    else
-      WriteCC4Encoder(code, "ar", api_type, params);
+    WriteCC4Encoder(code, "ar", api_type, params);
     break;
   case EFBCopyFormat::RA8:
-    if (params.yuv)
-      WriteIA8Encoder(code, api_type, params);
-    else
-      WriteCC8Encoder(code, "ar", api_type, params);
+    WriteCC8Encoder(code, "ar", api_type, params);
     break;
   case EFBCopyFormat::RGB565:
     WriteRGB565Encoder(code, api_type, params);
@@ -758,44 +471,26 @@ std::string GenerateEncodingShader(const EFBCopyParams& params, APIType api_type
     WriteRGB5A3Encoder(code, api_type, params);
     break;
   case EFBCopyFormat::RGBA8:
-    if (params.depth)
-      WriteZ24Encoder(code, api_type, params);
-    else
-      WriteRGBA8Encoder(code, api_type, params);
+    WriteRGBA8Encoder(code, api_type, params);
     break;
   case EFBCopyFormat::A8:
     WriteC8Encoder(code, "a", api_type, params);
     break;
   case EFBCopyFormat::R8_0x1:
   case EFBCopyFormat::R8:
-    if (params.yuv)
-      WriteI8Encoder(code, api_type, params);
-    else
-      WriteC8Encoder(code, "r", api_type, params);
+    WriteC8Encoder(code, "r", api_type, params);
     break;
   case EFBCopyFormat::G8:
-    if (params.depth)
-      WriteZ8Encoder(code, "256.0", api_type, params);  // Z8M
-    else
-      WriteC8Encoder(code, "g", api_type, params);
+    WriteC8Encoder(code, "g", api_type, params);
     break;
   case EFBCopyFormat::B8:
-    if (params.depth)
-      WriteZ8Encoder(code, "65536.0", api_type, params);  // Z8L
-    else
-      WriteC8Encoder(code, "b", api_type, params);
+    WriteC8Encoder(code, "b", api_type, params);
     break;
   case EFBCopyFormat::RG8:
-    if (params.depth)
-      WriteZ16Encoder(code, api_type, params);  // Z16H
-    else
-      WriteCC8Encoder(code, "gr", api_type, params);
+    WriteCC8Encoder(code, "gr", api_type, params);
     break;
   case EFBCopyFormat::GB8:
-    if (params.depth)
-      WriteZ16LEncoder(code, api_type, params);  // Z16L
-    else
-      WriteCC8Encoder(code, "bg", api_type, params);
+    WriteCC8Encoder(code, "bg", api_type, params);
     break;
   case EFBCopyFormat::XFB:
     WriteXFBEncoder(code, api_type, params);
@@ -805,6 +500,8 @@ std::string GenerateEncodingShader(const EFBCopyParams& params, APIType api_type
     break;
   }
 
+  code.Write("}}\n");
+
   return code.GetBuffer();
 }
 

From 2f43889141179bd46f027e017fead35371fdd104 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 20:40:07 -0800
Subject: [PATCH 10/11] Software: Use hardware-verified numbers for RGB->YUV
 conversion

---
 Source/Core/VideoBackends/Software/EfbInterface.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Source/Core/VideoBackends/Software/EfbInterface.cpp b/Source/Core/VideoBackends/Software/EfbInterface.cpp
index 1e5498e28a..399e0bb57f 100644
--- a/Source/Core/VideoBackends/Software/EfbInterface.cpp
+++ b/Source/Core/VideoBackends/Software/EfbInterface.cpp
@@ -535,9 +535,14 @@ static yuv444 ConvertColorToYUV(u32 color)
 
   // GameCube/Wii uses the BT.601 standard algorithm for converting to YCbCr; see
   // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion
-  return {static_cast<u8>(0.257f * red + 0.504f * green + 0.098f * blue),
-          static_cast<s8>(-0.148f * red + -0.291f * green + 0.439f * blue),
-          static_cast<s8>(0.439f * red + -0.368f * green + -0.071f * blue)};
+  // These numbers were determined by hardware testing
+  const u16 y = +66 * red + 129 * green + +25 * blue;
+  const s16 u = -38 * red + -74 * green + 112 * blue;
+  const s16 v = 112 * red + -94 * green + -18 * blue;
+  const u8 y_round = static_cast<u8>((y >> 8) + ((y >> 7) & 1));
+  const s8 u_round = static_cast<s8>((u >> 8) + ((u >> 7) & 1));
+  const s8 v_round = static_cast<s8>((v >> 8) + ((v >> 7) & 1));
+  return {y_round, u_round, v_round};
 }
 
 u32 GetDepth(u16 x, u16 y)

From a6e06f38adb933187605ac07ef7a0cb1940afa80 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 22 Feb 2022 20:42:23 -0800
Subject: [PATCH 11/11] Add notes about precision of YUV->RGB conversion
 factors for XFB

---
 Source/Core/VideoCommon/TextureConversionShader.cpp | 2 ++
 Source/Core/VideoCommon/TextureDecoder_Common.cpp   | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp
index c1aac4482a..6cabb4ce88 100644
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -968,6 +968,8 @@ static const std::map<TextureFormat, DecodingShaderInfo> s_decoding_shader_info{
 
     // We do the inverse BT.601 conversion for YCbCr to RGB
     // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion
+    // TODO: Use more precise numbers for this conversion (although on real hardware, the XFB isn't
+    // in a real texture format, so does this conversion actually ever happen?)
     {TextureFormat::XFB,
      {TEXEL_BUFFER_FORMAT_RGBA8_UINT, 0, 8, 8, false,
       R"(
diff --git a/Source/Core/VideoCommon/TextureDecoder_Common.cpp b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
index 51ea0572f2..30121aeb7b 100644
--- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
@@ -629,6 +629,8 @@ void TexDecoder_DecodeTexel(u8* dst, const u8* src, int s, int t, int imageWidth
 
     // We do the inverse BT.601 conversion for YCbCr to RGB
     // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion
+    // TODO: Use more precise numbers for this conversion (although on real hardware, the XFB isn't
+    // in a real texture format, so does this conversion actually ever happen?)
     u8 R = std::clamp(int(1.164f * Y + 1.596f * V), 0, 255);
     u8 G = std::clamp(int(1.164f * Y - 0.392f * U - 0.813f * V), 0, 255);
     u8 B = std::clamp(int(1.164f * Y + 2.017f * U), 0, 255);
@@ -694,6 +696,8 @@ void TexDecoder_DecodeXFB(u8* dst, const u8* src, u32 width, u32 height, u32 str
 
       // We do the inverse BT.601 conversion for YCbCr to RGB
       // http://www.equasys.de/colorconversion.html#YCbCr-RGBColorFormatConversion
+      // TODO: Use more precise numbers for this conversion (although on real hardware, the XFB
+      // isn't in a real texture format, so does this conversion actually ever happen?)
       u8 R1 = static_cast<u8>(std::clamp(int(1.164f * Y1 + 1.596f * V), 0, 255));
       u8 G1 = static_cast<u8>(std::clamp(int(1.164f * Y1 - 0.392f * U - 0.813f * V), 0, 255));
       u8 B1 = static_cast<u8>(std::clamp(int(1.164f * Y1 + 2.017f * U), 0, 255));