Make shader for copying from depth buffer to texture more resilient against precision loss.

The theoretical result differs slightly from that of the original shader because the final adjustment is to the range [0/255..255/255] instead of [0/16777215..16777215/16777215]. The real result is vastly different on some GPUs that were giving incorrect results for large depth values: bits 23..16 (y-component) wrapped around while bits 31..24 (z-component) stayed the same, and bits 31..24 changed while bits 23..16 were in the middle of their value range. This should fix issue #3123. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6217 8ced0084-cf51-0410-be5f-012b33b47a6e
2010-09-19 20:01:17 +00:00 · 2010-09-19 20:01:17 +00:00 · 236f75aac3
parent eda652b7a0
commit 236f75aac3
1 changed files with 32 additions and 7 deletions
--- a/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoOGL/Src/PixelShaderCache.cpp
@@ -107,17 +107,42 @@ void PixelShaderCache::Init()
 		s_ColorMatrixProgram = 0;
 	}

-    sprintf(pmatrixprog, "!!ARBfp1.0"
+    sprintf(pmatrixprog, "!!ARBfp1.0\n"
 						"TEMP R0;\n"
 						"TEMP R1;\n"
                        "TEMP R2;\n"
-                        "PARAM K0 = { 65535.0, 255.0,1.0,16777215.0};\n" 
-						"PARAM K1 = { 0.999999940395355224609375, 1.0000000596046483281045155587504,0.0,0.0};\n" 
+                        //16777215/16777216*256, 1/255, 256, 0
+                        "PARAM K0 = { 255.99998474121, 0.003921568627451, 256.0, 0.0};\n" 
+                        //sample the depth value
 						"TEX R2, fragment.texcoord[0], texture[0], RECT;\n"
-						"MUL R0, R2.x, K1.x;\n"
-                        "MUL R0, R0.x, K0;\n"						
-                        "FRC R0, R0;\n"
-						"MUL R0, R0, K1.y;\n"
+
+                        //scale from [0*16777216..1*16777216] to
+                        //[0*16777215..1*16777215], multiply by 256
+						"MUL R0, R2.x, K0.x;\n" // *16777215/16777216*256
+
+                        //It is easy to get bad results due to low precision
+                        //here, for example converting like this:
+                        //MUL R0,R0,{ 65536, 256, 1, 16777216 }
+                        //FRC R0,R0
+                        //gives {?, 128/255, 254/255, ?} for depth value 254/255
+                        //on some gpus
+
+                        "FLR R0.z,R0;\n"        //bits 31..24
+
+                        "SUB R0.xyw,R0,R0.z;\n" //subtract bits 31..24 from rest
+                        "MUL R0.xyw,R0,K0.z;\n" // *256
+                        "FLR R0.y,R0;\n"        //bits 23..16
+
+                        "SUB R0.xw,R0,R0.y;\n"  //subtract bits 23..16 from rest
+                        "MUL R0.xw,R0,K0.z;\n"  // *256
+                        "FLR R0.x,R0;\n"        //bits 15..8
+
+                        "SUB R0.w,R0,R0.x;\n"   //subtract bits 15..8 from rest
+                        "MUL R0.w,R0,K0.z;\n"   // *256
+                        "FLR R0.w,R0;\n"        //bits 7..0
+
+                        "MUL R0,R0,K0.y;\n"     // /255
+
 						"DP4 R1.x, R0, program.env[%d];\n"
 						"DP4 R1.y, R0, program.env[%d];\n"
                        "DP4 R1.z, R0, program.env[%d];\n"