Merge pull request #193 from neobrain/tev_combiner_fixes

PixelShaderGen: Cleanups and fixes for tev combiners.
2014-03-26 10:05:46 +01:00 · 2014-03-26 10:05:46 +01:00 · ea6b37cb75
parent c6070b94ce 1dead05cae
commit ea6b37cb75
3 changed files with 202 additions and 267 deletions
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@ -55,14 +55,14 @@ void Tev::Init()
 	m_ColorInputLUT[14][RED_INP] = &StageKonst[RED_C]; m_ColorInputLUT[14][GRN_INP] = &StageKonst[GRN_C]; m_ColorInputLUT[14][BLU_INP] = &StageKonst[BLU_C]; // konst
 	m_ColorInputLUT[15][RED_INP] = &FixedConstants[0]; m_ColorInputLUT[15][GRN_INP] = &FixedConstants[0]; m_ColorInputLUT[15][BLU_INP] = &FixedConstants[0]; // zero

-	m_AlphaInputLUT[0] = Reg[0]; // prev
-	m_AlphaInputLUT[1] = Reg[1]; // c0
-	m_AlphaInputLUT[2] = Reg[2]; // c1
-	m_AlphaInputLUT[3] = Reg[3]; // c2
-	m_AlphaInputLUT[4] = TexColor; // tex
-	m_AlphaInputLUT[5] = RasColor; // ras
-	m_AlphaInputLUT[6] = StageKonst; // konst
-	m_AlphaInputLUT[7] = Zero16; // zero
+	m_AlphaInputLUT[0] = &Reg[0][ALP_C]; // prev
+	m_AlphaInputLUT[1] = &Reg[1][ALP_C]; // c0
+	m_AlphaInputLUT[2] = &Reg[2][ALP_C]; // c1
+	m_AlphaInputLUT[3] = &Reg[3][ALP_C]; // c2
+	m_AlphaInputLUT[4] = &TexColor[ALP_C]; // tex
+	m_AlphaInputLUT[5] = &RasColor[ALP_C]; // ras
+	m_AlphaInputLUT[6] = &StageKonst[ALP_C]; // konst
+	m_AlphaInputLUT[7] = &Zero16[ALP_C]; // zero

 	for (int comp = 0; comp < 4; comp++)
 	{
@ -176,239 +176,150 @@ void Tev::SetRasColor(int colorChan, int swaptable)
 	}
 }

-void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc)
+void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc, const InputRegType inputs[4]) // regular (non-compare) color combine; a/b/c/d are read from the pre-fetched inputs[] instead of the LUT
 {
-	InputRegType InputReg;
-
 	for (int i = 0; i < 3; i++)
 	{
-		InputReg.a = *m_ColorInputLUT[cc.a][i];
-		InputReg.b = *m_ColorInputLUT[cc.b][i];
-		InputReg.c = *m_ColorInputLUT[cc.c][i];
-		InputReg.d = *m_ColorInputLUT[cc.d][i];
+		const InputRegType& InputReg = inputs[BLU_C + i]; // i = 0..2 walks the three color slots starting at BLU_C

 		u16 c = InputReg.c + (InputReg.c >> 7);

 		s32 temp = InputReg.a * (256 - c) + (InputReg.b * c);
-		temp = cc.op?(-temp >> 8):(temp >> 8);
+		temp <<= m_ScaleLShiftLUT[cc.shift]; // left-shift scale is applied to the blend term before the >> 8 below
+		temp += (cc.shift != 3) ? 0 : (cc.op == 1) ? 127 : 128; // rounding term, added only when shift == 3
+		temp = cc.op ? (-temp >> 8) : (temp >> 8); // a non-zero op negates the blend term before the shift

-		s32 result = InputReg.d + temp + m_BiasLUT[cc.bias];
-		result = result << m_ScaleLShiftLUT[cc.shift];
+		s32 result = ((InputReg.d + m_BiasLUT[cc.bias]) << m_ScaleLShiftLUT[cc.shift]) + temp; // d + bias gets the same left shift that temp already received
 		result = result >> m_ScaleRShiftLUT[cc.shift];

 		Reg[cc.dest][BLU_C + i] = result;
 	}
 }

-void Tev::DrawColorCompare(TevStageCombiner::ColorCombiner &cc)
+void Tev::DrawColorCompare(TevStageCombiner::ColorCombiner &cc, const InputRegType inputs[4]) // compare-mode color combine: each channel becomes d + (a <cmp> b ? c : 0), operands taken from the pre-fetched inputs[]
 {
-	int cmp = (cc.shift<<1)|cc.op|8; // comparemode stored here
+	for (int i = BLU_C; i <= RED_C; i++) // inclusive bound: all three color channels must be written (the removed code wrote BLU_C + 0..2)
+	{
+		switch ((cc.shift<<1)|cc.op|8)  // encoded compare mode
+		{
+		case TEVCMP_R8_GT:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[RED_C].a > inputs[RED_C].b) ? inputs[i].c : 0);
+			break;

-	u32 a;
-	u32 b;
+		case TEVCMP_R8_EQ:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[RED_C].a == inputs[RED_C].b) ? inputs[i].c : 0);
+			break;

-	InputRegType InputReg;
+		case TEVCMP_GR16_GT:
+			{
+				u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a; // 16-bit key: green in the high byte, red in the low byte
+				u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a > b) ? inputs[i].c : 0);
+			}
+			break;

-	switch (cmp) {
-	case TEVCMP_R8_GT:
-		{
-			a = *m_ColorInputLUT[cc.a][RED_INP] & 0xff;
-			b = *m_ColorInputLUT[cc.b][RED_INP] & 0xff;
-			for (int i = 0; i < 3; i++)
+		case TEVCMP_GR16_EQ:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
+				u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a == b) ? inputs[i].c : 0);
 			}
-		}
-		break;
+			break;

-	case TEVCMP_R8_EQ:
-		{
-			a = *m_ColorInputLUT[cc.a][RED_INP] & 0xff;
-			b = *m_ColorInputLUT[cc.b][RED_INP] & 0xff;
-			for (int i = 0; i < 3; i++)
+		case TEVCMP_BGR24_GT:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
+				u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a; // 24-bit key: blue, green, red from high to low byte
+				u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a > b) ? inputs[i].c : 0);
 			}
-		}
-		break;
-	case TEVCMP_GR16_GT:
-		{
-			a = ((*m_ColorInputLUT[cc.a][GRN_INP] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][GRN_INP] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
+			break;
+
+		case TEVCMP_BGR24_EQ:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
+				u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a == b) ? inputs[i].c : 0);
 			}
+			break;
+
+		case TEVCMP_RGB8_GT:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[i].a > inputs[i].b) ? inputs[i].c : 0); // per-channel compare: each channel uses its own a and b
+			break;
+
+		case TEVCMP_RGB8_EQ:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[i].a == inputs[i].b) ? inputs[i].c : 0);
+			break;
 		}
-		break;
-	case TEVCMP_GR16_EQ:
-		{
-			a = ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_BGR24_GT:
-		{
-			a = ((*m_ColorInputLUT[cc.a][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_BGR24_EQ:
-		{
-			a = ((*m_ColorInputLUT[cc.a][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_RGB8_GT:
-		for (int i = 0; i < 3; i++)
-		{
-			InputReg.a = *m_ColorInputLUT[cc.a][i];
-			InputReg.b = *m_ColorInputLUT[cc.b][i];
-			InputReg.c = *m_ColorInputLUT[cc.c][i];
-			InputReg.d = *m_ColorInputLUT[cc.d][i];
-			Reg[cc.dest][BLU_C + i] = InputReg.d + ((InputReg.a > InputReg.b) ? InputReg.c : 0);
-		}
-		break;
-	case TEVCMP_RGB8_EQ:
-		for (int i = 0; i < 3; i++)
-		{
-			InputReg.a = *m_ColorInputLUT[cc.a][i];
-			InputReg.b = *m_ColorInputLUT[cc.b][i];
-			InputReg.c = *m_ColorInputLUT[cc.c][i];
-			InputReg.d = *m_ColorInputLUT[cc.d][i];
-			Reg[cc.dest][BLU_C + i] = InputReg.d + ((InputReg.a == InputReg.b) ? InputReg.c : 0);
-		}
-		break;
 	}
 }

-void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac)
+void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac, const InputRegType inputs[4]) // regular (non-compare) alpha combine; a/b/c/d are read from the pre-fetched inputs[ALP_C]
 {
-	InputRegType InputReg;
-
-	InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-	InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-	InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-	InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
+	const InputRegType& InputReg = inputs[ALP_C]; // only the alpha slot of inputs[] is used here

 	u16 c = InputReg.c + (InputReg.c >> 7);

 	s32 temp = InputReg.a * (256 - c) + (InputReg.b * c);
-	temp = ac.op?(-temp >> 8):(temp >> 8);
+	temp <<= m_ScaleLShiftLUT[ac.shift]; // left-shift scale is applied to the blend term before the >> 8 below
+	temp += (ac.shift != 3) ? 0 : (ac.op == 1) ? 127 : 128; // rounding term, added only when shift == 3
+	temp = ac.op ? (-temp >> 8) : (temp >> 8); // a non-zero op negates the blend term before the shift

-	s32 result = InputReg.d + temp + m_BiasLUT[ac.bias];
-	result = result << m_ScaleLShiftLUT[ac.shift];
+	s32 result = ((InputReg.d + m_BiasLUT[ac.bias]) << m_ScaleLShiftLUT[ac.shift]) + temp; // d + bias gets the same left shift that temp already received
 	result = result >> m_ScaleRShiftLUT[ac.shift];

 	Reg[ac.dest][ALP_C] = result;
 }

-void Tev::DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac)
+void Tev::DrawAlphaCompare(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4]) // compare-mode alpha combine: alpha becomes d + (a <cmp> b ? c : 0), operands taken from the pre-fetched inputs[]
 {
-	int cmp = (ac.shift<<1)|ac.op|8; // comparemode stored here
-
-	u32 a;
-	u32 b;
-
-	InputRegType InputReg;
-
-	switch (cmp) {
+	switch ((ac.shift<<1)|ac.op|8)  // encoded compare mode
+	{
 	case TEVCMP_R8_GT:
-		{
-			a = m_AlphaInputLUT[ac.a][RED_C] & 0xff;
-			b = m_AlphaInputLUT[ac.b][RED_C] & 0xff;
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[RED_C].a > inputs[RED_C].b) ? inputs[ALP_C].c : 0); // compared operands come from the RED_C slot of inputs[]; c and d come from the ALP_C slot
 		break;

 	case TEVCMP_R8_EQ:
-		{
-			a = m_AlphaInputLUT[ac.a][RED_C] & 0xff;
-			b = m_AlphaInputLUT[ac.b][RED_C] & 0xff;
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[RED_C].a == inputs[RED_C].b) ? inputs[ALP_C].c : 0);
 		break;
+
 	case TEVCMP_GR16_GT:
 		{
-			a = ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
+			u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a; // 16-bit key: green in the high byte, red in the low byte
+			u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a > b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_GR16_EQ:
 		{
-			a = ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
+			u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a == b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_BGR24_GT:
 		{
-			a = ((m_AlphaInputLUT[ac.a][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
+			u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a; // 24-bit key: blue, green, red from high to low byte
+			u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a > b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_BGR24_EQ:
 		{
-			a = ((m_AlphaInputLUT[ac.a][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
+			u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a == b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_A8_GT:
-		{
-			InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-			InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((InputReg.a > InputReg.b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[ALP_C].a > inputs[ALP_C].b) ? inputs[ALP_C].c : 0); // alpha-only compare: every operand comes from the ALP_C slot
 		break;
+
 	case TEVCMP_A8_EQ:
-		{
-			InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-			InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((InputReg.a == InputReg.b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[ALP_C].a == inputs[ALP_C].b) ? inputs[ALP_C].c : 0);
 		break;
 	}
 }
@ -666,10 +577,23 @@ void Tev::Draw()
 		SetRasColor(order.getColorChan(stageOdd), ac.rswap * 2);

 		// combine inputs
+		InputRegType inputs[4];
+		for (int i = 0; i < 3; i++)
+		{
+			inputs[BLU_C + i].a = *m_ColorInputLUT[cc.a][i];
+			inputs[BLU_C + i].b = *m_ColorInputLUT[cc.b][i];
+			inputs[BLU_C + i].c = *m_ColorInputLUT[cc.c][i];
+			inputs[BLU_C + i].d = *m_ColorInputLUT[cc.d][i];
+		}
+		inputs[ALP_C].a = *m_AlphaInputLUT[ac.a];
+		inputs[ALP_C].b = *m_AlphaInputLUT[ac.b];
+		inputs[ALP_C].c = *m_AlphaInputLUT[ac.c];
+		inputs[ALP_C].d = *m_AlphaInputLUT[ac.d];
+
 		if (cc.bias != 3)
-			DrawColorRegular(cc);
+			DrawColorRegular(cc, inputs);
 		else
-			DrawColorCompare(cc);
+			DrawColorCompare(cc, inputs);

 		if (cc.clamp)
 		{
@ -685,9 +609,9 @@ void Tev::Draw()
 		}

 		if (ac.bias != 3)
-			DrawAlphaRegular(ac);
+			DrawAlphaRegular(ac, inputs);
 		else
-			DrawAlphaCompare(ac);
+			DrawAlphaCompare(ac, inputs);

 		if (ac.clamp)
 			Reg[ac.dest][ALP_C] = Clamp255(Reg[ac.dest][ALP_C]);
--- a/Source/Core/VideoBackends/Software/Tev.h
+++ b/Source/Core/VideoBackends/Software/Tev.h
@ -60,10 +60,10 @@ class Tev

 	void SetRasColor(int colorChan, int swaptable);

-	void DrawColorRegular(TevStageCombiner::ColorCombiner &cc);
-	void DrawColorCompare(TevStageCombiner::ColorCombiner &cc);
-	void DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac);
-	void DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac);
+	void DrawColorRegular(TevStageCombiner::ColorCombiner& cc, const InputRegType inputs[4]);
+	void DrawColorCompare(TevStageCombiner::ColorCombiner& cc, const InputRegType inputs[4]);
+	void DrawAlphaRegular(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4]);
+	void DrawAlphaCompare(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4]);

 	void Indirect(unsigned int stageNum, s32 s, s32 t);

--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@ -90,27 +90,6 @@ static const char *tevKSelTableA[] =
 	I_KCOLORS"[3].a", // K3_A = 0x1F
 };

-static const char *tevScaleTable[] =
-{
-	"",       // SCALE_1
-	" << 1",  // SCALE_2
-	" << 2",  // SCALE_4
-	" >> 1",  // DIVIDE_2
-};
-
-static const char *tevBiasTable[] =
-{
-	"",       // ZERO,
-	"+ 128",  // ADDHALF,
-	"- 128",  // SUBHALF,
-	"",
-};
-
-static const char *tevOpTable[] = {
-	"+",      // TEVOP_ADD = 0,
-	"-",      // TEVOP_SUB = 1,
-};
-
 static const char *tevCInputTable[] =
 {
 	"prev.rgb",          // CPREV,
@ -133,14 +112,14 @@ static const char *tevCInputTable[] =

 static const char *tevAInputTable[] =
 {
-	"prev",            // APREV,
-	"c0",              // A0,
-	"c1",              // A1,
-	"c2",              // A2,
-	"textemp",         // TEXA,
-	"rastemp",         // RASA,
-	"konsttemp",       // KONST,  (hw1 had quarter)
-	"int4(0,0,0,0)",   // ZERO
+	"prev.a",        // APREV,
+	"c0.a",          // A0,
+	"c1.a",          // A1,
+	"c2.a",          // A2,
+	"textemp.a",     // TEXA,
+	"rastemp.a",     // RASA,
+	"konsttemp.a",   // KONST,  (hw1 had quarter)
+	"0",             // ZERO (a scalar like the other entries, so it can be used as the last argument of int4(rgb, a))
 };

 static const char *tevRasTable[] =
@ -161,6 +140,7 @@ static const char *tevAOutputTable[]  = { "prev.a", "c0.a", "c1.a", "c2.a" };
 static char text[16384];

 template<class T> static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, API_TYPE ApiType, const char swapModeTable[4][5]);
+template<class T> static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift);
 template<class T> static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType);
 template<class T> static inline void WriteAlphaTest(T& out, pixel_shader_uid_data& uid_data, API_TYPE ApiType,DSTALPHA_MODE dstAlphaMode, bool per_pixel_depth);
 template<class T> static inline void WriteFog(T& out, pixel_shader_uid_data& uid_data);
@ -343,7 +323,8 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 	          "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
 	          "\tint alphabump=0;\n"
 	          "\tint3 tevcoord=int3(0, 0, 0);\n"
-	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n\n");
+	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
+	          "\tint4 tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,0);\n\n"); // tev combiner inputs

 	if (ApiType == API_OPENGL)
 	{
@ -778,44 +759,35 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	if (ac.dest >= GX_TEVREG0 && ac.dest <= GX_TEVREG2)
 		out.SetConstantsUsed(C_COLORS+ac.dest, C_COLORS+ac.dest);

+
+	out.Write("tevin_a = int4(%s, %s)&255;\n", tevCInputTable[cc.a], tevAInputTable[ac.a]);
+	out.Write("tevin_b = int4(%s, %s)&255;\n", tevCInputTable[cc.b], tevAInputTable[ac.b]);
+	out.Write("tevin_c = int4(%s, %s)&255;\n", tevCInputTable[cc.c], tevAInputTable[ac.c]);
+	out.Write("tevin_d = int4(%s, %s);\n", tevCInputTable[cc.d], tevAInputTable[ac.d]);
+
 	out.Write("\t// color combine\n");
 	out.Write("\t%s = clamp(", tevCOutputTable[cc.dest]);
-
-	// combine the color channel
-	if (cc.bias != TevBias_COMPARE) // if not compare
+	if (cc.bias != TevBias_COMPARE)
 	{
-		//normal color combiner goes here
-		if (cc.shift > TEVSCALE_1)
-			out.Write("(");
-
-		if (!(cc.d == TEVCOLORARG_ZERO && cc.op == TEVOP_ADD))
-			out.Write("%s %s ", tevCInputTable[cc.d], tevOpTable[cc.op]);
-
-		out.Write("((%s&255) * (int3(255,255,255) - (%s&255)) + (%s&255) * (%s&255)) / 255", tevCInputTable[cc.a], tevCInputTable[cc.c], tevCInputTable[cc.b], tevCInputTable[cc.c]);
-
-		out.Write(" %s", tevBiasTable[cc.bias]);
-
-		if (cc.shift > TEVSCALE_1)
-			out.Write(")%s", tevScaleTable[cc.shift]);
+		WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.shift);
 	}
 	else
 	{
 		const char *function_table[] =
 		{
-			"(((%s.r&255) > %s.r) ? (%s&255): int3(0,0,0))", // TEVCMP_R8_GT
-			"(((%s.r&255) == %s.r) ? (%s&255): int3(0,0,0))", // TEVCMP_R8_EQ
-			"((idot((%s.rgb&255), comp16) >  idot((%s.rgb&255), comp16)) ? (%s&255): int3(0,0,0))", // TEVCMP_GR16_GT
-			"((idot((%s.rgb&255), comp16) == idot((%s.rgb&255), comp16)) ? (%s&255): int3(0,0,0))", // TEVCMP_GR16_EQ
-			"((idot((%s.rgb&255), comp24) >  idot((%s.rgb&255), comp24)) ? (%s&255): int3(0,0,0))", // TEVCMP_BGR24_GT
-			"((idot((%s.rgb&255), comp24) == idot((%s.rgb&255), comp24)) ? (%s&255): int3(0,0,0))", // TEVCMP_BGR24_EQ
-			"int3(max(sign(int3((%s.rgb&255)) - int3((%s.rgb&255))), int3(0,0,0)) * (%s&255))", // TEVCMP_RGB8_GT
-			"int3((int3(255,255,255) - max(sign(abs(int3((%s.rgb&255)) - int3((%s.rgb&255)))), int3(0,0,0))) * (%s&255))" // TEVCMP_RGB8_EQ
+			"((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_EQ
+			"(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT: sign() clamped at 0 is 1 per channel only where a > b
+			"((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ: 1 - sign(|a-b|) is 1 per channel only where a == b (parentheses balanced; integer mask is 0/1, not 255)
 		};

 		int mode = (cc.shift<<1)|cc.op;
-		out.Write("   %s + ", tevCInputTable[cc.d]);
-		out.Write(function_table[mode], tevCInputTable[cc.a],
-		          tevCInputTable[cc.b], tevCInputTable[cc.c]);
+		out.Write("   tevin_d.rgb + ");
+		out.Write(function_table[mode]);
 	}
 	if (cc.clamp)
 		out.Write(", int3(0,0,0), int3(255,255,255))");
@ -825,41 +797,27 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP

 	out.Write("\t// alpha combine\n");
 	out.Write("\t%s = clamp(", tevAOutputTable[ac.dest]);
-
-	if (ac.bias != TevBias_COMPARE) // if not compare
+	if (ac.bias != TevBias_COMPARE)
 	{
-		//normal alpha combiner goes here
-		if (ac.shift > 0)
-			out.Write("(");
-
-		if (!(ac.d == TEVALPHAARG_ZERO && ac.op == TEVOP_ADD))
-			out.Write("%s.a %s ", tevAInputTable[ac.d], tevOpTable[ac.op]);
-
-		out.Write("((%s.a&255) * (255 - (%s.a&255)) + (%s.a&255) * (%s.a&255)) / 255", tevAInputTable[ac.a], tevAInputTable[ac.c], tevAInputTable[ac.b], tevAInputTable[ac.c]);
-
-		out.Write(" %s",tevBiasTable[ac.bias]);
-
-		if (ac.shift>0)
-			out.Write(")%s", tevScaleTable[ac.shift]);
+		WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.shift);
 	}
 	else
 	{
 		const char *function_table[] =
 		{
-			"(((%s.r&255) > (%s.r&255)) ? (%s.a&255) : 0)", // TEVCMP_R8_GT
-			"(((%s.r&255) == (%s.r&255)) ? (%s.a&255) : 0)", // TEVCMP_R8_EQ
-			"((idot((%s.rgb&255), comp16) >  idot((%s.rgb&255), comp16)) ? (%s.a&255) : 0)", // TEVCMP_GR16_GT
-			"((idot((%s.rgb&255), comp16) == idot((%s.rgb&255), comp16)) ? (%s.a&255) : 0)", // TEVCMP_GR16_EQ
-			"((idot((%s.rgb&255), comp24) >  idot((%s.rgb&255), comp24)) ? (%s.a&255) : 0)", // TEVCMP_BGR24_GT
-			"((idot((%s.rgb&255), comp24) == idot((%s.rgb&255), comp24)) ? (%s.a&255) : 0)", // TEVCMP_BGR24_EQ
-			"(((%s.a&255) >  (%s.a&255)) ? (%s.a&255) : 0)", // TEVCMP_A8_GT
-			"(((%s.a&255) == (%s.a&255)) ? (%s.a&255) : 0)" // TEVCMP_A8_EQ
+			"((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
+			"((tevin_a.a >  tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
+			"((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
 		};

 		int mode = (ac.shift<<1)|ac.op;
-		out.Write("   %s.a + ", tevAInputTable[ac.d]);
-		out.Write(function_table[mode], tevAInputTable[ac.a],
-		          tevAInputTable[ac.b], tevAInputTable[ac.c]);
+		out.Write("   tevin_d.a + ");
+		out.Write(function_table[mode]);
 	}
 	if (ac.clamp)
 		out.Write(", 0, 255)");
@ -869,6 +827,59 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	out.Write(";\n\n");
 }

+template<class T>
+static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift) // writes the expression text of a regular (non-compare) combiner stage to out; 'clamp' is not read in this body
+{
+	const char *tevScaleTableLeft[] = // indexed by shift: text appended where a left shift is needed
+	{
+		"",       // SCALE_1
+		" << 1",  // SCALE_2
+		" << 2",  // SCALE_4
+		"",       // DIVIDE_2
+	};
+
+	const char *tevScaleTableRight[] = // indexed by shift: text appended after the whole expression
+	{
+		"",       // SCALE_1
+		"",       // SCALE_2
+		"",       // SCALE_4
+		" >> 1",  // DIVIDE_2
+	};
+
+	const char *tevLerpBias[] = // indexed by 2*op+(shift==3)
+	{
+		"",        // op == 0, shift != 3
+		" + 128",  // op == 0, shift == 3
+		"",        // op == 1, shift != 3
+		" + 127",  // op == 1, shift == 3
+	};
+
+	const char *tevBiasTable[] = // indexed by bias
+	{
+		"",        // ZERO,
+		" + 128",  // ADDHALF,
+		" - 128",  // SUBHALF,
+		"",        // index 3: emits nothing
+	};
+
+	const char *tevOpTable[] = { // indexed by op
+		"+",      // TEVOP_ADD = 0,
+		"-",      // TEVOP_SUB = 1,
+	};
+
+	// Regular TEV stage, as emitted below: (((d + bias) << lshift) OP (((lerp << lshift) + rounding) >> 8)) >> rshift
+	// The GC/Wii GPU uses a very sophisticated algorithm for scale-lerping:
+	// - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
+	// - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
+	// - a rounding bias is added before dividing by 256 (text is only emitted when shift == 3, see tevLerpBias)
+	out.Write("(((tevin_d.%s%s)%s)", components, tevBiasTable[bias], tevScaleTableLeft[shift]);
+	out.Write(" %s ", tevOpTable[op]);
+	out.Write("((((tevin_a.%s*256 + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+(tevin_c.%s>>7)))%s)%s)>>8)",
+	          components, components, components, components, components,
+	          tevScaleTableLeft[shift], tevLerpBias[2*op+(shift==3)]);
+	out.Write(")%s", tevScaleTableRight[shift]);
+}
+
 template<class T>
 static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType)
 {