From 236f75aac3e3fb09bbebdd1bc01c2191cdd7b029 Mon Sep 17 00:00:00 2001
From: pierre <pierre@pirsoft.de>
Date: Sun, 19 Sep 2010 20:01:17 +0000
Subject: [PATCH] Make shader for copying from depth buffer to texture more
 resilient against precision loss.

The theoretical result is slightly different to the original shader because the
final adjustment is to the range [0/255..255/255] instead of
[0/16777215..16777215/16777215].

The real result is vastly different on some GPUs that were giving incorrect
results: bits 23..16 (y-component) wrapped around while bits 31..24
(z-component) stayed the same, and bits 31..24 changed while bits 23..16 were
in the middle of their value range, for large depth values.

This should fix issue #3123.


git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6217 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 .../Plugin_VideoOGL/Src/PixelShaderCache.cpp  | 39 +++++++++++++++----
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp b/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp
index a2ddf6e4ce..ba97af6d66 100644
--- a/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp
@@ -107,17 +107,42 @@ void PixelShaderCache::Init()
 		s_ColorMatrixProgram = 0;
 	}
 
-    sprintf(pmatrixprog, "!!ARBfp1.0"
+    sprintf(pmatrixprog, "!!ARBfp1.0\n"
 						"TEMP R0;\n"
 						"TEMP R1;\n"
                         "TEMP R2;\n"
-                        "PARAM K0 = { 65535.0, 255.0,1.0,16777215.0};\n" 
-						"PARAM K1 = { 0.999999940395355224609375, 1.0000000596046483281045155587504,0.0,0.0};\n" 
+                        //16777215/16777216*256, 1/255, 256, 0
+                        "PARAM K0 = { 255.99998474121, 0.003921568627451, 256.0, 0.0};\n" 
+                        //sample the depth value
 						"TEX R2, fragment.texcoord[0], texture[0], RECT;\n"
-						"MUL R0, R2.x, K1.x;\n"
-                        "MUL R0, R0.x, K0;\n"						
-                        "FRC R0, R0;\n"
-						"MUL R0, R0, K1.y;\n"
+
+                        //scale from [0*16777216..1*16777216] to
+                        //[0*16777215..1*16777215], multiply by 256
+						"MUL R0, R2.x, K0.x;\n" // *16777215/16777216*256
+
+                        //It is easy to get bad results due to low precision
+                        //here, for example converting like this:
+                        //MUL R0,R0,{ 65536, 256, 1, 16777216 }
+                        //FRC R0,R0
+                        //gives {?, 128/255, 254/255, ?} for depth value 254/255
+                        //on some gpus
+
+                        "FLR R0.z,R0;\n"        //bits 31..24
+
+                        "SUB R0.xyw,R0,R0.z;\n" //subtract bits 31..24 from rest
+                        "MUL R0.xyw,R0,K0.z;\n" // *256
+                        "FLR R0.y,R0;\n"        //bits 23..16
+
+                        "SUB R0.xw,R0,R0.y;\n"  //subtract bits 23..16 from rest
+                        "MUL R0.xw,R0,K0.z;\n"  // *256
+                        "FLR R0.x,R0;\n"        //bits 15..8
+
+                        "SUB R0.w,R0,R0.x;\n"   //subtract bits 15..8 from rest
+                        "MUL R0.w,R0,K0.z;\n"   // *256
+                        "FLR R0.w,R0;\n"        //bits 7..0
+
+                        "MUL R0,R0,K0.y;\n"     // /255
+
 						"DP4 R1.x, R0, program.env[%d];\n"
 						"DP4 R1.y, R0, program.env[%d];\n"
                         "DP4 R1.z, R0, program.env[%d];\n"