From 0622979d3b1761cfec6108ddb4b77d67518986f3 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Sun, 3 Sep 2017 16:32:37 +1000
Subject: [PATCH] ShaderGen: Support writing integer colors when logic op is
 enabled

This is required for D3D to support logic op.
---
 Source/Core/VideoCommon/BPMemory.cpp        | 13 +++++++
 Source/Core/VideoCommon/BPMemory.h          |  2 +
 Source/Core/VideoCommon/PixelShaderGen.cpp  | 26 ++++++++++---
 Source/Core/VideoCommon/PixelShaderGen.h    |  3 +-
 Source/Core/VideoCommon/UberShaderPixel.cpp | 42 ++++++++++++++-------
 Source/Core/VideoCommon/UberShaderPixel.h   |  1 +
 6 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/Source/Core/VideoCommon/BPMemory.cpp b/Source/Core/VideoCommon/BPMemory.cpp
index 26891ba941..8c0a2ec43a 100644
--- a/Source/Core/VideoCommon/BPMemory.cpp
+++ b/Source/Core/VideoCommon/BPMemory.cpp
@@ -10,6 +10,19 @@
 // STATE_TO_SAVE
 BPMemory bpmem;
 
+bool BlendMode::UseLogicOp() const
+{
+  // The logic op bit has the lowest priority: subtract or blend enable overrides it.
+  if (subtract || blendenable || !logicopenable)
+    return false;
+
+  // Fast path: report NOOP as disabled. Kirby's Return to Dreamland uses it with dstAlpha.
+  if (logicmode == BlendMode::NOOP)
+    return false;
+
+  return true;
+}
+
 float FogParam0::GetA() const
 {
   // scale mantissa from 11 to 23 bits
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index e2730c1472..fce322c4fd 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -648,6 +648,8 @@ union BlendMode
   BitField<12, 4, LogicOp> logicmode;
 
   u32 hex;
+
+  bool UseLogicOp() const;
 };
 
 union FogParam0
diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index d503b21176..5e561456ab 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -175,6 +175,7 @@ PixelShaderUid GetPixelShaderUid()
   uid_data->rgba6_format =
       bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor;
   uid_data->dither = bpmem.blendmode.dither && uid_data->rgba6_format;
+  uid_data->uint_output = bpmem.blendmode.UseLogicOp();
 
   u32 numStages = uid_data->genMode_numtevstages + 1;
 
@@ -434,7 +435,7 @@ static void SampleTexture(ShaderCode& out, const char* texcoords, const char* te
 static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType ApiType,
                            bool per_pixel_depth, bool use_dual_source);
 static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
-static void WriteColor(ShaderCode& out, const pixel_shader_uid_data* uid_data,
+static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
                        bool use_dual_source);
 
 ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host_config,
@@ -568,8 +569,12 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host
   else  // D3D
   {
     out.Write("void main(\n");
-    out.Write("  out float4 ocol0 : SV_Target0,\n"
-              "  out float4 ocol1 : SV_Target1,\n%s"
+    if (uid_data->uint_output)
+      out.Write("  out uint4 ocol0 : SV_Target,\n");
+    else
+      out.Write("  out float4 ocol0 : SV_Target0,\n"
+                "  out float4 ocol1 : SV_Target1,\n");
+    out.Write("%s"
               "  in float4 rawpos : SV_Position,\n",
               uid_data->per_pixel_depth ? "  out float depth : SV_Depth,\n" : "");
 
@@ -778,7 +783,7 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host
   WriteFog(out, uid_data);
 
   // Write the color and alpha values to the framebuffer
-  WriteColor(out, uid_data, use_dual_source);
+  WriteColor(out, ApiType, uid_data, use_dual_source);
 
   if (uid_data->bounding_box)
   {
@@ -1302,8 +1307,19 @@ static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data)
   out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n");
 }
 
-static void WriteColor(ShaderCode& out, const pixel_shader_uid_data* uid_data, bool use_dual_source)
+static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
+                       bool use_dual_source)
 {
+  // D3D requires that the shader outputs be uint when writing to a uint render target for logic op.
+  if (api_type == APIType::D3D && uid_data->uint_output)
+  {
+    if (uid_data->rgba6_format)
+      out.Write("\tocol0 = uint4(prev & 0xFC);\n");
+    else
+      out.Write("\tocol0 = uint4(prev);\n");
+    return;
+  }
+
   if (uid_data->rgba6_format)
     out.Write("\tocol0.rgb = float3(prev.rgb >> 2) / 63.0;\n");
   else
diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h
index ee422bee8d..253dc59e22 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@@ -43,7 +43,8 @@ struct pixel_shader_uid_data
   u32 numColorChans : 2;
   u32 rgba6_format : 1;
   u32 dither : 1;
-  u32 pad : 16;
+  u32 uint_output : 1;
+  u32 pad : 15;
 
   u32 texMtxInfo_n_projection : 8;  // 8x1 bit
   u32 tevindref_bi0 : 3;
diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index 9b4d8d2251..8f6521e890 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -25,6 +25,7 @@ PixelShaderUid GetPixelShaderUid()
       (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) ||
       (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !uid_data->early_depth) ||
       (bpmem.zmode.testenable && bpmem.genMode.zfreeze);
+  uid_data->uint_output = bpmem.blendmode.UseLogicOp();
   return out;
 }
 
@@ -1164,18 +1165,29 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
             "  }\n"
             "\n");
 
-  // TODO: Do we still want to support two pass alpha blending?
-  out.Write("  if (bpmem_rgba6_format)\n"
-            "    ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;\n"
-            "  else\n"
-            "    ocol0.rgb = float3(TevResult.rgb) / 255.0;\n"
-            "\n"
-            "  if (bpmem_dstalpha != 0u)\n");
-  out.Write("    ocol0.a = float(%s >> 2) / 63.0;\n",
-            BitfieldExtract("bpmem_dstalpha", ConstantAlpha().alpha).c_str());
-  out.Write("  else\n"
-            "    ocol0.a = float(TevResult.a >> 2) / 63.0;\n"
-            "  \n");
+  // D3D requires that the shader outputs be uint when writing to a uint render target for logic op.
+  if (ApiType == APIType::D3D && uid_data->uint_output)
+  {
+    out.Write("  if (bpmem_rgba6_format)\n"
+              "    ocol0 = uint4(TevResult & 0xFC);\n"
+              "  else\n"
+              "    ocol0 = uint4(TevResult);\n"
+              "\n");
+  }
+  else
+  {
+    out.Write("  if (bpmem_rgba6_format)\n"
+              "    ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;\n"
+              "  else\n"
+              "    ocol0.rgb = float3(TevResult.rgb) / 255.0;\n"
+              "\n"
+              "  if (bpmem_dstalpha != 0u)\n");
+    out.Write("    ocol0.a = float(%s >> 2) / 63.0;\n",
+              BitfieldExtract("bpmem_dstalpha", ConstantAlpha().alpha).c_str());
+    out.Write("  else\n"
+              "    ocol0.a = float(TevResult.a >> 2) / 63.0;\n"
+              "  \n");
+  }
 
   if (use_dual_source)
   {
@@ -1260,7 +1272,11 @@ void EnumeratePixelShaderUids(const std::function<void(const PixelShaderUid&)>&
           continue;
 
         puid->per_pixel_depth = per_pixel_depth != 0;
-        callback(uid);
+        for (u32 uint_output = 0; uint_output < 2; uint_output++)
+        {
+          puid->uint_output = uint_output;
+          callback(uid);
+        }
       }
     }
   }
diff --git a/Source/Core/VideoCommon/UberShaderPixel.h b/Source/Core/VideoCommon/UberShaderPixel.h
index d7dc8109e8..3a5a8f8435 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.h
+++ b/Source/Core/VideoCommon/UberShaderPixel.h
@@ -15,6 +15,7 @@ struct pixel_ubershader_uid_data
   u32 num_texgens : 4;
   u32 early_depth : 1;
   u32 per_pixel_depth : 1;
+  u32 uint_output : 1;
 
   u32 NumValues() const { return sizeof(pixel_ubershader_uid_data); }
 };