From 718c88ff6a4befba4b6d393670b5a0d071fea757 Mon Sep 17 00:00:00 2001
From: rice1964 <rice1964@gmail.com>
Date: Mon, 31 Aug 2009 04:23:30 +0000
Subject: [PATCH] Some optimization on the pixel and vertex shader generators.
 As tested on Zelda TP, these changes should be able to reduce the number of
 lines in the compiled pixel shader program by 20% to 30%, for example, from
 38 lines to 28 lines after the generated cg/HLSL pixel shader is compiled by the
 cg/HLSL compiler. This could mean slightly faster rasterization on slower
 video cards.

Also fixed shader compilation errors for the DX9 plugin by using correct pixel/vertex attribute names that are compatible with the DX9 HLSL shader compiler. Now the generated vertex or pixel shader programs will compile correctly in either OGL or DX9.

However, the DX9 plugin is still not fixed, even though the shader programs can be compiled now.


git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4113 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 .../Core/VideoCommon/Src/PixelShaderGen.cpp   | 134 ++++++++++++------
 .../Core/VideoCommon/Src/VertexShaderGen.cpp  |  15 +-
 .../Src/NativeVertexFormat.cpp                |   2 +-
 .../Plugin_VideoDX9/Src/PixelShaderCache.cpp  |   2 +-
 .../Plugin_VideoDX9/Src/VertexShaderCache.cpp |   4 +-
 5 files changed, 104 insertions(+), 53 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/PixelShaderGen.cpp b/Source/Core/VideoCommon/Src/PixelShaderGen.cpp
index 78a2939857..9833969476 100644
--- a/Source/Core/VideoCommon/Src/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/Src/PixelShaderGen.cpp
@@ -140,8 +140,8 @@ void GetPixelShaderId(PIXELSHADERUID &uid, u32 s_texturemask, u32 dstAlphaEnable
 //   output is given by .outreg
 //   tevtemp is set according to swapmodetables and 
 
-static void WriteStage(char *&p, int n, u32 texture_mask);
-static void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask);
+static void WriteStage(char *&p, int n, u32 texture_mask, bool HLSL);
+static void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask, bool HLSL);
 static void WriteAlphaCompare(char *&p, int num, int comp);
 static bool WriteAlphaTest(char *&p, bool HLSL);
 static void WriteFog(char *&p);
@@ -405,7 +405,10 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
     // Declare samplers
     if (texture_mask)
 	{
-        WRITE(p, "uniform samplerRECT ");
+		if (HLSL)
+			WRITE(p, "uniform sampler ");
+		else
+			WRITE(p, "uniform samplerRECT ");
         bool bfirst = true;
         for (int i = 0; i < 8; ++i)
             if (texture_mask & (1<<i))
@@ -460,15 +463,15 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
 			WRITE(p, "  in float%d uv%d : TEXCOORD%d, \n", i<4?4:3, i, i);
 	}
 
-	WRITE(p, "  in float4 colors_0 : COLOR0, in float4 colors_1 : COLOR1){\n");
+	WRITE(p, "  in float4 colors_0 : COLOR0,\n in float4 colors_1 : COLOR1){\n");
 
     char* pmainstart = p;
 
-    WRITE(p, "float4 c0="I_COLORS"[1],c1="I_COLORS"[2],c2="I_COLORS"[3],prev=float4(0.0f,0.0f,0.0f,0.0f),textemp,rastemp,konsttemp=float4(0.0f,0.0f,0.0f,0.0f);\n"
-            "float3 comp16 = float3(1,255,0), comp24 = float3(1,255,255*255);\n"
-            "float4 alphabump=0;\n"
-            "float3 tevcoord;\n"
-            "float2 wrappedcoord, tempcoord;\n\n");
+    WRITE(p, "  float4 c0="I_COLORS"[1],c1="I_COLORS"[2],c2="I_COLORS"[3],prev=float4(0.0f,0.0f,0.0f,0.0f),textemp,rastemp,konsttemp=float4(0.0f,0.0f,0.0f,0.0f);\n"
+            "  float3 comp16 = float3(1,255,0), comp24 = float3(1,255,255*255);\n"
+            "  float4 alphabump=0;\n"
+            "  float3 tevcoord;\n"
+            "  float2 wrappedcoord, tempcoord;\n\n");
 
     for (int i = 0; i < numTexgen; ++i) 
 	{
@@ -494,7 +497,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
 
             char buffer[32];
             sprintf(buffer, "float3 indtex%d", i);
-            SampleTexture(p, buffer, "tempcoord", "abg", bpmem.tevindref.getTexMap(i), texture_mask);
+            SampleTexture(p, buffer, "tempcoord", "abg", bpmem.tevindref.getTexMap(i), texture_mask,HLSL);
         }
     }
 
@@ -503,7 +506,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
         WRITE(p, "float3 uv0 = float3(0.0f,0.0f,0.0f);\n");
 
     for (int i = 0; i < numStages; i++)
-        WriteStage(p, i, texture_mask); //build the equation for this stage
+        WriteStage(p, i, texture_mask,HLSL); //build the equation for this stage
 
 	if (numTexgen >= 7)
 		WRITE(p, "float4 clipPos = float4(uv0.w, uv1.w, uv2.w, uv3.w);\n");
@@ -547,7 +550,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
     return text;
 }
 
-static void WriteStage(char *&p, int n, u32 texture_mask)
+static void WriteStage(char *&p, int n, u32 texture_mask, bool HLSL)
 {
     char *rasswap = swapModeTable[bpmem.combiners[n].alphaC.rswap];
     char *texswap = swapModeTable[bpmem.combiners[n].alphaC.tswap];
@@ -656,7 +659,7 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
                 WRITE(p, "tevcoord.xy = float2(0.0f,0.0f);\n");
         }
 
-        SampleTexture(p, "textemp", "tevcoord", texswap, texmap, texture_mask);
+        SampleTexture(p, "textemp", "tevcoord", texswap, texmap, texture_mask, HLSL);
     }
     else
         WRITE(p, "textemp=float4(1,1,1,1);\n");
@@ -672,16 +675,35 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
     if (bCKonst || bAKonst )
         WRITE(p, "konsttemp=float4(%s,%s);\n",tevKSelTableC[kc],tevKSelTableA[ka]);  
 
-    WRITE(p, "%s= ", tevCOutputTable[cc.dest]); 
+    if (cc.clamp)
+		WRITE(p, "%s= saturate(", tevCOutputTable[cc.dest]);
+	else
+		WRITE(p, "%s= (", tevCOutputTable[cc.dest]);
 
     // combine the color channel
     if (cc.bias != 3) // if not compare
 	{
         //normal color combiner goes here
-        WRITE(p, "   %s*(%s%s",tevScaleTable[cc.shift],tevCInputTable[cc.d],tevOpTable[cc.op]);
-        WRITE(p, "lerp(%s,%s,%s) %s);\n",
-              tevCInputTable[cc.a], tevCInputTable[cc.b],
-              tevCInputTable[cc.c], tevBiasTable[cc.bias]);
+		if (cc.shift>0)
+			WRITE(p, "   %s*(%s%s",tevScaleTable[cc.shift],tevCInputTable[cc.d],tevOpTable[cc.op]);
+		else
+			WRITE(p, "   (%s%s",tevCInputTable[cc.d],tevOpTable[cc.op]);
+
+		if (cc.a == 15 && cc.b == 15)
+			WRITE(p, "0");
+		else if (cc.a == 15 && cc.c == 15)
+			WRITE(p, "0");
+		else if (cc.b == 15 && cc.c == 15)
+			WRITE(p,"%s",tevCInputTable[cc.a]);
+		else if (cc.a == 15)
+			WRITE(p,"(%s)*(%s)",tevCInputTable[cc.b],tevCInputTable[cc.c]);
+		else if (cc.b == 15)
+			WRITE(p,"(%s)*(1-%s)",tevCInputTable[cc.a],tevCInputTable[cc.c]);
+		else if (cc.c == 15)
+			WRITE(p,"%s",tevCInputTable[cc.a]);
+		else
+			WRITE(p, "lerp(%s,%s,%s)",tevCInputTable[cc.a], tevCInputTable[cc.b],tevCInputTable[cc.c]);
+		WRITE(p, " %s)",tevBiasTable[cc.bias]);
     }
     else 
 	{
@@ -690,44 +712,62 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
 		{
         case TEVCMP_R8_GT:
         case TEVCMP_RGB8_GT: // per component compares
-            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : float3(0.0f,0.0f,0.0f))",
                 tevCInputTable[cc.d], tevCInputTable2[cc.a], cmp==TEVCMP_R8_GT?"r":"rgb", tevCInputTable2[cc.b], cmp==TEVCMP_R8_GT?"r":"rgb", tevCInputTable[cc.c]);
             break;
         case TEVCMP_R8_EQ:
         case TEVCMP_RGB8_EQ:
-            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : float3(0.0f,0.0f,0.0f))", // expression only: the closing ");" is emitted once after the if/else
                 tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], epsilon8bit, tevCInputTable[cc.c]);
             break;
         
         case TEVCMP_GR16_GT: // 16 bit compares: 255*g+r (probably used for ztextures, so make sure in ztextures, g is the most significant byte)
         case TEVCMP_BGR24_GT: // 24 bit compares: 255*255*b+255*g+r
-            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : float3(0.0f,0.0f,0.0f))",
                 tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], cmp==TEVCMP_GR16_GT?"16":"24", tevCInputTable[cc.c]);
             break;
         case TEVCMP_GR16_EQ:
         case TEVCMP_BGR24_EQ:
-            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : float3(0.0f,0.0f,0.0f))",
                 tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], cmp==TEVCMP_GR16_EQ?"16":"24", epsilon8bit, tevCInputTable[cc.c]);
             break;
         default:
-            WRITE(p, "float3(0.0f,0.0f,0.0f);\n");
+            WRITE(p, "float3(0.0f,0.0f,0.0f)");
             break;
         }
     }
-    
-    if (cc.clamp)
-        WRITE(p, "%s = clamp(%s,0.0f,1.0f);\n", tevCOutputTable[cc.dest],tevCOutputTable[cc.dest]);
 
+	WRITE(p,");\n");
+    
     // combine the alpha channel
-    WRITE(p, "%s= ", tevAOutputTable[ac.dest]);
+    if (ac.clamp)
+	    WRITE(p, "%s= saturate(", tevAOutputTable[ac.dest]);
+	else
+		WRITE(p, "%s= (", tevAOutputTable[ac.dest]);
 
     if (ac.bias != 3) // if not compare
 	{
         //normal alpha combiner goes here
-        WRITE(p, "   %s*(%s%s",tevScaleTable[ac.shift],tevAInputTable[ac.d],tevOpTable[ac.op]);
-        WRITE(p, "lerp(%s,%s,%s) %s)\n",
-            tevAInputTable[ac.a],tevAInputTable[ac.b],
-            tevAInputTable[ac.c],tevBiasTable[ac.bias]);
+		if (ac.shift>0)
+			WRITE(p, "   %s*(%s%s",tevScaleTable[ac.shift],tevAInputTable[ac.d],tevOpTable[ac.op]);
+		else
+			WRITE(p, "   (%s%s",tevAInputTable[ac.d],tevOpTable[ac.op]);
+
+		if (ac.a == 7 && ac.b == 7)
+			WRITE(p, "0");
+		else if (ac.a == 7 && ac.c == 7)
+			WRITE(p, "0");
+		else if (ac.b == 7 && ac.c == 7)
+			WRITE(p,"%s",tevAInputTable[ac.a]);
+		else if (ac.a == 7)
+			WRITE(p,"(%s)*(%s)",tevAInputTable[ac.b],tevAInputTable[ac.c]);
+		else if (ac.b == 7)
+			WRITE(p,"(%s)*(1-%s)",tevAInputTable[ac.a],tevAInputTable[ac.c]);
+		else if (ac.c == 7)
+			WRITE(p,"%s",tevAInputTable[ac.a]);
+		else
+	        WRITE(p, "lerp(%s,%s,%s)",tevAInputTable[ac.a],tevAInputTable[ac.b],tevAInputTable[ac.c]);
+		WRITE(p, " %s)",tevBiasTable[ac.bias]);
     }
     else 
 	{
@@ -737,40 +777,35 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
 		{
         case TEVCMP_R8_GT:
         case TEVCMP_A8_GT:
-            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : 0)\n",
+            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : 0)",
                 tevAInputTable[ac.d],tevAInputTable2[ac.a], cmp==TEVCMP_R8_GT?"r":"a", tevAInputTable2[ac.b], cmp==TEVCMP_R8_GT?"r":"a", tevAInputTable[ac.c]);
             break;
         case TEVCMP_R8_EQ:
         case TEVCMP_A8_EQ:
-            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : 0)\n",
+            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : 0)",
                 tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b],epsilon8bit,tevAInputTable[ac.c]);
             break;
         
         case TEVCMP_GR16_GT: // 16 bit compares: 255*g+r (probably used for ztextures, so make sure in ztextures, g is the most significant byte)
         case TEVCMP_BGR24_GT: // 24 bit compares: 255*255*b+255*g+r
-            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : 0)\n",
+            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : 0)",
                 tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b], cmp==TEVCMP_GR16_GT?"16":"24", tevAInputTable[ac.c]);
             break;
         case TEVCMP_GR16_EQ:
         case TEVCMP_BGR24_EQ:
-            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : 0)\n",
+            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : 0)",
                 tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b],cmp==TEVCMP_GR16_EQ?"16":"24",epsilon8bit,tevAInputTable[ac.c]);
             break;
         default:
-            WRITE(p, "0)\n");
+            WRITE(p, "0"); // no ')' here: the '(' opened by the alpha prefix is closed by the final WRITE(p, ");\n\n")
             break;
         }
     }
 
-    WRITE(p, ";\n");
-
-    if (ac.clamp)
-        WRITE(p, "%s = clamp(%s,0.0f,1.0f);\n", tevAOutputTable[ac.dest],tevAOutputTable[ac.dest]);
-
-    WRITE(p, "\n");
+    WRITE(p, ");\n\n");
 }
 
-void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask)
+void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask, bool HLSL)
 {
     if (texture_mask & (1<<texmap)) {
         // non pow 2
@@ -792,10 +827,16 @@ void SampleTexture(char *&p, const char *destination, const char *texcoords, con
                  WRITE(p, "tempcoord.y = %s.y;\n", texcoords);
              }
 
-             WRITE(p, "%s=texRECT(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
+			 if (HLSL)
+				 WRITE(p, "%s=tex2D(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
+			 else
+				WRITE(p, "%s=texRECT(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
          }
          else {
-             WRITE(p, "%s=texRECT(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
+			 if (HLSL)
+				 WRITE(p, "%s=tex2D(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
+			 else
+				WRITE(p, "%s=texRECT(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
          }
     }
     else {
@@ -910,7 +951,8 @@ static void WriteFog(char *&p)
             WRITE (p, "  float ze = "I_FOG"[1].x * depth;\n");
         }
 
-        WRITE (p, "  float fog = clamp(ze - "I_FOG"[1].z, 0.0f, 1.0f);\n");
+        //WRITE (p, "  float fog = clamp(ze - "I_FOG"[1].z, 0.0f, 1.0f);\n");
+		WRITE (p, "  float fog = saturate(ze - "I_FOG"[1].z);\n");
     }
 
     switch (bpmem.fog.c_proj_fsel.fsel) 
diff --git a/Source/Core/VideoCommon/Src/VertexShaderGen.cpp b/Source/Core/VideoCommon/Src/VertexShaderGen.cpp
index 86ecf89a7c..ceeb44f5fd 100644
--- a/Source/Core/VideoCommon/Src/VertexShaderGen.cpp
+++ b/Source/Core/VideoCommon/Src/VertexShaderGen.cpp
@@ -168,9 +168,15 @@ const char *GenerateVertexShader(u32 components, bool D3D)
     if (components & VB_HAS_NRM0)
         WRITE(p, "  float3 rawnorm0 : NORMAL,\n");
     if (components & VB_HAS_NRM1)
-        WRITE(p, "  float3 rawnorm1 : ATTR%d,\n", SHADER_NORM1_ATTRIB);
+		if (D3D)
+			WRITE(p, "  float3 rawnorm1 : PSIZE,\n");
+		else
+			WRITE(p, "  float3 rawnorm1 : ATTR%d,\n", SHADER_NORM1_ATTRIB);
     if (components & VB_HAS_NRM2)
-        WRITE(p, "  float3 rawnorm2 : ATTR%d,\n", SHADER_NORM2_ATTRIB);
+		if (D3D)
+			WRITE(p, "  float3 rawnorm2 : BLENDINDICES,\n");
+		else
+			WRITE(p, "  float3 rawnorm2 : ATTR%d,\n", SHADER_NORM2_ATTRIB);
     if (components & VB_HAS_COL0)
         WRITE(p, "  float4 color0 : COLOR0,\n");
     if (components & VB_HAS_COL1)
@@ -181,7 +187,10 @@ const char *GenerateVertexShader(u32 components, bool D3D)
             WRITE(p, "  float%d tex%d : TEXCOORD%d,\n", hastexmtx ? 3 : 2, i,i);
     }
     if (components & VB_HAS_POSMTXIDX)
-        WRITE(p, "  half posmtx : ATTR%d,\n", SHADER_POSMTX_ATTRIB);
+		if (D3D)
+			WRITE(p, "  half posmtx : BLENDWEIGHT,\n");
+		else
+			WRITE(p, "  half posmtx : ATTR%d,\n", SHADER_POSMTX_ATTRIB);
 
     WRITE(p, "  float4 rawpos : POSITION) {\n");
     WRITE(p, "VS_OUTPUT o;\n");
diff --git a/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp b/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp
index 4dfa798db3..15c1ae7a43 100644
--- a/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp
@@ -142,7 +142,7 @@ void D3DVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl)
 
 	if (_vtx_decl.posmtx_offset != -1)
 	{
-		PanicAlert("Posmtx stream not supported correctly. %i", _vtx_decl.posmtx_offset);
+		//PanicAlert("Posmtx stream not supported correctly. %i", _vtx_decl.posmtx_offset);
 		// glVertexAttribPointer(SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, vtx_decl.stride, (void *)vtx_decl.posmtx_offset);
 		elems[elem_idx].Offset = _vtx_decl.posmtx_offset;
 		elems[elem_idx].Usage = D3DDECLUSAGE_BLENDINDICES;
diff --git a/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp b/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp
index 7c27713d45..ebdff9b2d6 100644
--- a/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp
@@ -84,7 +84,7 @@ void PixelShaderCache::SetShader()
 		return;
 	}
 
-	bool HLSL = false;
+	bool HLSL = true;
 	const char *code = GeneratePixelShader(PixelShaderManager::GetTextureMask(), false, HLSL);
 	LPDIRECT3DPIXELSHADER9 shader = HLSL ? D3D::CompilePixelShader(code, (int)strlen(code), false) : CompileCgShader(code);
 	if (shader)
diff --git a/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp b/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp
index 42f5bac0dc..63fce637d3 100644
--- a/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp
@@ -85,8 +85,8 @@ void VertexShaderCache::SetShader(u32 components)
 		return;
 	}
 
-	bool HLSL = false;
-	const char *code = GenerateVertexShader(components, true);
+	bool HLSL = true;
+	const char *code = GenerateVertexShader(components, HLSL);
 	LPDIRECT3DVERTEXSHADER9 shader = HLSL ? D3D::CompileVertexShader(code, (int)strlen(code), false) : CompileCgShader(code);
 	if (shader)
 	{