From cc7c6cd35f0cccb19a4e3681259b55cebeeb80ba Mon Sep 17 00:00:00 2001
From: donkopunchstania <donkopunchstania@gmail.com>
Date: Tue, 9 Mar 2010 04:38:07 +0000
Subject: [PATCH] Texture coordinates are stored in fixed point format in TEV
 which allows overflows to be emulated correctly. Added logic to calculate
 texture LOD and use the correct mip. Dumping textures will now dump all mip
 levels. Added line rendering. Changed data stored in vertex from float arrays
 to vectors for cleaner math.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5178 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 Source/Core/VideoCommon/Src/BPMemory.h        |   3 +-
 .../Plugin_VideoSoftware/Src/Clipper.cpp      | 183 +++++++++++++--
 .../Plugin_VideoSoftware/Src/Clipper.h        |   1 +
 .../Plugin_VideoSoftware/Src/DebugUtil.cpp    |  57 +++--
 .../Plugin_VideoSoftware/Src/DebugUtil.h      |   2 +-
 .../Plugin_VideoSoftware/Src/HwRasterizer.cpp |   2 +-
 .../Src/NativeVertexFormat.h                  |  41 ++--
 .../Plugin_VideoSoftware/Src/Rasterizer.cpp   | 211 ++++++++++++++----
 .../Plugin_VideoSoftware/Src/Rasterizer.h     |  15 ++
 .../Plugin_VideoSoftware/Src/SetupUnit.cpp    |  32 ++-
 .../Plugins/Plugin_VideoSoftware/Src/Tev.cpp  |  82 +++----
 Source/Plugins/Plugin_VideoSoftware/Src/Tev.h |  35 +--
 .../Src/TextureSampler.cpp                    | 166 +++++++++-----
 .../Plugin_VideoSoftware/Src/TextureSampler.h |   4 +-
 .../Src/TransformUnit.cpp                     | 157 ++++++-------
 .../Src/VertexFormatConverter.cpp             |  36 +--
 16 files changed, 704 insertions(+), 323 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/BPMemory.h b/Source/Core/VideoCommon/Src/BPMemory.h
index cb022893fd..0a62bea162 100644
--- a/Source/Core/VideoCommon/Src/BPMemory.h
+++ b/Source/Core/VideoCommon/Src/BPMemory.h
@@ -451,7 +451,8 @@ union TexMode0
         unsigned mag_filter : 1;
         unsigned min_filter : 3;
         unsigned diag_lod : 1;
-        signed lod_bias : 10;
+        signed lod_bias : 8;
+		unsigned pad0 : 2;
         unsigned max_aniso : 2;
         unsigned lod_clamp : 1;
     };
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
index 0507503267..b97ad3795d 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
@@ -90,13 +90,13 @@ namespace Clipper
     static inline int CalcClipMask(OutputVertexData *v)
     {
 	    int cmask = 0;
-        float* pos = v->projectedPosition;
-	    if (pos[3] - pos[0] < 0) cmask |= CLIP_POS_X_BIT;
-	    if (pos[0] + pos[3] < 0) cmask |= CLIP_NEG_X_BIT;
-	    if (pos[3] - pos[1] < 0) cmask |= CLIP_POS_Y_BIT;
-	    if (pos[1] + pos[3] < 0) cmask |= CLIP_NEG_Y_BIT;
-	    if (pos[3] * pos[2] > 0) cmask |= CLIP_POS_Z_BIT;
-	    if (pos[2] + pos[3] < 0) cmask |= CLIP_NEG_Z_BIT;
+        Vec4 pos = v->projectedPosition;
+	    if (pos.w - pos.x < 0) cmask |= CLIP_POS_X_BIT;
+	    if (pos.x + pos.w < 0) cmask |= CLIP_NEG_X_BIT;
+	    if (pos.w - pos.y < 0) cmask |= CLIP_POS_Y_BIT;
+	    if (pos.y + pos.w < 0) cmask |= CLIP_NEG_Y_BIT;
+	    if (pos.w * pos.z > 0) cmask |= CLIP_POS_Z_BIT;
+	    if (pos.z + pos.w < 0) cmask |= CLIP_NEG_Z_BIT;
 	    return cmask;
     }
 
@@ -109,7 +109,7 @@ namespace Clipper
     #define DIFFERENT_SIGNS(x,y) ((x <= 0 && y > 0) || (x > 0 && y <= 0))
 
     #define CLIP_DOTPROD(I, A, B, C, D) \
-	    (Vertices[I]->projectedPosition[0] * A + Vertices[I]->projectedPosition[1] * B + Vertices[I]->projectedPosition[2] * C + Vertices[I]->projectedPosition[3] * D)
+	    (Vertices[I]->projectedPosition.x * A + Vertices[I]->projectedPosition.y * B + Vertices[I]->projectedPosition.z * C + Vertices[I]->projectedPosition.w * D)
 
     #define POLY_CLIP( PLANE_BIT, A, B, C, D )                          \
     {                                                                   \
@@ -153,6 +153,27 @@ namespace Clipper
 	    }									                            \
     }
 
+	#define LINE_CLIP(PLANE_BIT, A, B, C, D )					\
+	{	/* Clips segment Vertices[0]-Vertices[1] against plane (A,B,C,D); uses mask, t0 and t1 of the expanding function */	\
+		if (mask & PLANE_BIT) {									\
+			const float dp0 = CLIP_DOTPROD( 0, A, B, C, D );	\
+			const float dp1 = CLIP_DOTPROD( 1, A, B, C, D );	\
+			const bool neg_dp0 = dp0 < 0;						\
+			const bool neg_dp1 = dp1 < 0;						\
+			/* both endpoints on the negative side: returns from the function that expands the macro */	\
+			if (neg_dp0 && neg_dp1)								\
+				return;											\
+			/* otherwise raise the parameter of whichever endpoint is on the negative side */	\
+			if (neg_dp1) {										\
+				float t = dp1 / (dp1 - dp0);					\
+				if (t > t1) t1 = t;								\
+			} else if (neg_dp0) {								\
+				float t = dp0 / (dp0 - dp1);					\
+				if (t > t0) t0 = t;								\
+			}													\
+		}														\
+	}
+
     void ClipTriangle(int *indices, int &numIndices)
     {
 	    int mask = 0;
@@ -202,6 +223,53 @@ namespace Clipper
 	    }
     }
 
+	void ClipLine(int *indices)
+	{	// Clips the line Vertices[0]-Vertices[1]; indices[0..1] receive the vertices to draw or SKIP_FLAG.
+		int mask = 0;
+		int clip_mask[2] = { 0, 0 };
+		// Collect the clip flags of both endpoints; mask is their union.
+		for (int i = 0; i < 2; ++i)
+		{
+			clip_mask[i] = CalcClipMask(Vertices[i]);
+			mask |= clip_mask[i];
+		}
+		// No flag set on either endpoint: indices are left exactly as the caller passed them.
+		if (mask == 0) 
+			return;
+		// Interpolation parameters for endpoint 0 and endpoint 1; LINE_CLIP only ever raises them.
+		float t0 = 0;
+		float t1 = 0;
+
+		// Mark unused in case of early termination
+		// of the macros below. (When fully clipped)
+		indices[0] = SKIP_FLAG;
+		indices[1] = SKIP_FLAG;
+
+		LINE_CLIP(CLIP_POS_X_BIT, -1,  0,  0, 1);
+		LINE_CLIP(CLIP_NEG_X_BIT,  1,  0,  0, 1);
+		LINE_CLIP(CLIP_POS_Y_BIT,  0, -1,  0, 1);
+		LINE_CLIP(CLIP_NEG_Y_BIT,  0,  1,  0, 1);
+		LINE_CLIP(CLIP_POS_Z_BIT,  0,  0, -1, 1);
+		LINE_CLIP(CLIP_NEG_Z_BIT,  0,  0,  1, 1);
+
+		// Restore the old values as this line
+		// was not fully clipped.
+		indices[0] = 0;
+		indices[1] = 1;
+
+		int numVertices = 2;
+		// NOTE(review): assumes AddInterpolatedVertex stores the new vertex at slot numVertices and advances it - confirm.
+		if (clip_mask[0]) {
+			indices[0] = numVertices;
+			AddInterpolatedVertex(t0, 0, 1, numVertices);
+		}
+
+		if (clip_mask[1]) {
+			indices[1] = numVertices;
+			AddInterpolatedVertex(t1, 1, 0, numVertices);
+		}
+	}
+
     void ProcessTriangle(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2)
     {
         if (stats.thisFrame.numDrawnObjects < g_Config.drawStart || stats.thisFrame.numDrawnObjects >= g_Config.drawEnd )
@@ -247,6 +315,75 @@ namespace Clipper
         }
     }
 
+	// Copies the rasterizer inputs of src into dst, moving the screen position by (dx, dy). sOffset is not used yet.
+	void CopyVertex(OutputVertexData *dst, OutputVertexData *src, float dx, float dy, unsigned int sOffset)
+	{
+		// DrawTriangleFrontFace divides by projectedPosition.w, so it must not be left uninitialized in dst.
+		dst->projectedPosition = src->projectedPosition;
+		dst->screenPosition.x = src->screenPosition.x + dx;
+		dst->screenPosition.y = src->screenPosition.y + dy;
+		dst->screenPosition.z = src->screenPosition.z;
+		for (int i = 0; i < 3; ++i)
+			dst->normal[i] = src->normal[i];
+		for (int i = 0; i < 4; ++i)
+			dst->color[0][i] = src->color[0][i];
+		// todo - s offset
+		for (int i = 0; i < 8; ++i)
+			dst->texCoords[i] = src->texCoords[i];
+	}
+
+	void ProcessLine(OutputVertexData *lineV0, OutputVertexData *lineV1)
+	{
+		// Clips the line and, unless it is fully clipped, draws it as two triangles
+		// offset by bpmem.lineptwidth.linesize / 12 to either side of the line.
+		int indices[4] = { 0, 1, SKIP_FLAG, SKIP_FLAG };
+
+		Vertices[0] = lineV0;
+		Vertices[1] = lineV1;
+
+		ClipLine(indices);
+
+		// ClipLine leaves SKIP_FLAG in indices[0] when the line is fully clipped
+		if(indices[0] != SKIP_FLAG)
+		{
+			OutputVertexData *v0 = Vertices[indices[0]];
+			OutputVertexData *v1 = Vertices[indices[1]];
+
+			PerspectiveDivide(v0);
+			PerspectiveDivide(v1);
+
+			float dx = v1->screenPosition.x - v0->screenPosition.x;
+			float dy = v1->screenPosition.y - v0->screenPosition.y;
+
+			// Take the magnitudes explicitly: depending on the headers in scope, abs()
+			// can resolve to the int overload and truncate sub-pixel deltas to 0.
+			float absDx = dx < 0 ? -dx : dx;
+			float absDy = dy < 0 ? -dy : dy;
+
+			float screenDx = 0;
+			float screenDy = 0;
+
+			// Offset along the minor axis; the sign follows the line direction,
+			// presumably so the triangles below keep their ccw winding.
+			if(absDx > absDy)
+				screenDy = bpmem.lineptwidth.linesize / (dx > 0 ? -12.0f : 12.0f);
+			else
+				screenDx = bpmem.lineptwidth.linesize / (dy > 0 ? 12.0f : -12.0f);
+
+			OutputVertexData triangle[3];
+
+			CopyVertex(&triangle[0], v0, screenDx, screenDy, 0);
+			CopyVertex(&triangle[1], v1, screenDx, screenDy, 0);
+			CopyVertex(&triangle[2], v1, -screenDx, -screenDy, bpmem.lineptwidth.lineoff);
+
+			// ccw winding
+			Rasterizer::DrawTriangleFrontFace(&triangle[2], &triangle[1], &triangle[0]);
+
+			CopyVertex(&triangle[1], v0, -screenDx, -screenDy, bpmem.lineptwidth.lineoff);
+
+			Rasterizer::DrawTriangleFrontFace(&triangle[0], &triangle[1], &triangle[2]);
+		}
+	}
         
     bool CullTest(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2, bool &backface)
     {
@@ -260,15 +397,15 @@ namespace Clipper
             return false;
         }
 
-        float x0 = v0->projectedPosition[0];
-        float x1 = v1->projectedPosition[0];
-        float x2 = v2->projectedPosition[0];
-        float y1 = v1->projectedPosition[1];
-        float y0 = v0->projectedPosition[1];
-        float y2 = v2->projectedPosition[1];
-        float w0 = v0->projectedPosition[3];
-        float w1 = v1->projectedPosition[3];
-        float w2 = v2->projectedPosition[3];
+        float x0 = v0->projectedPosition.x;
+        float x1 = v1->projectedPosition.x;
+        float x2 = v2->projectedPosition.x;
+        float y1 = v1->projectedPosition.y;
+        float y0 = v0->projectedPosition.y;
+        float y2 = v2->projectedPosition.y;
+        float w0 = v0->projectedPosition.w;
+        float w1 = v1->projectedPosition.w;
+        float w2 = v2->projectedPosition.w;
 
         float normalZDir = (x0*w2 - x2*w0)*y1 + (x2*y0 - x0*y2)*w1 + (y2*w0 - y0*w2)*x1; 
 
@@ -291,13 +428,13 @@ namespace Clipper
 
     void PerspectiveDivide(OutputVertexData *vertex)
     {
-        float *projected = vertex->projectedPosition;
-        float *screen = vertex->screenPosition;
+        Vec4 &projected = vertex->projectedPosition;
+        Vec3 &screen = vertex->screenPosition;
 
-        float wInverse = 1.0f/projected[3];
-        screen[0] = projected[0] * wInverse * xfregs.viewport.wd + m_ViewOffset[0];
-        screen[1] = projected[1] * wInverse * xfregs.viewport.ht + m_ViewOffset[1];
-        screen[2] = projected[2] * wInverse + m_ViewOffset[2];
+        float wInverse = 1.0f/projected.w;
+        screen.x = projected.x * wInverse * xfregs.viewport.wd + m_ViewOffset[0];
+        screen.y = projected.y * wInverse * xfregs.viewport.ht + m_ViewOffset[1];
+        screen.z = projected.z * wInverse + m_ViewOffset[2];
     }
     
 }
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
index 476b224783..ee9e1d8ebb 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
@@ -31,6 +31,7 @@ namespace Clipper
 
     void ProcessTriangle(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2);
 
+	void ProcessLine(OutputVertexData *v0, OutputVertexData *v1);
 
     bool CullTest(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2, bool &backface);
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
index 95a73d6721..05cb82ee06 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
@@ -49,36 +49,32 @@ void Init()
     }
 }
 
-bool SaveTexture(const char* filename, u32 texmap, int width, int height)
-{
-    u8 *data = new u8[width * height * 4];
-    
-    GetTextureBGRA(data, texmap, width, height);
-
-    bool result = SaveTGA(filename, width, height, data);
-
-    delete []data;
-
-    return result;
-}
-
-void SaveTexture(const char* filename, u32 texmap)
+void SaveTexture(const char* filename, u32 texmap, s32 mip)
 {
     FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
     u8 subTexmap = texmap & 3;
 
     TexImage0& ti0 = texUnit.texImage0[subTexmap];
 
-    SaveTexture(filename, texmap, ti0.width + 1, ti0.height + 1);
+	int width = ti0.width + 1;
+	int height = ti0.height + 1;
+
+	u8 *data = new u8[width * height * 4];
+    
+    GetTextureBGRA(data, texmap, mip, width, height);
+
+    bool result = SaveTGA(filename, width, height, data);
+
+    delete []data;
 }
 
-void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height)
+void GetTextureBGRA(u8 *dst, u32 texmap, s32 mip, int width, int height)
 {
     u8 sample[4];    
 
     for (int y = 0; y < height; y++)
         for (int x = 0; x < width; x++) {
-            TextureSampler::Sample((float)x, (float)y, 0, texmap, sample);
+            TextureSampler::SampleMip(x << 7, y << 7, mip, false, texmap, sample);
             // rgba to bgra
             *(dst++) = sample[2];
             *(dst++) = sample[1];
@@ -87,13 +83,32 @@ void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height)
         }
 }
 
+// Returns the max_lod setting of the given texture map rounded up to a whole mip level.
+s32 GetMaxTextureLod(u32 texmap)
+{
+	// bit 2 of texmap selects the FourTexUnits entry, the low two bits the slot inside it
+	const TexMode1& mode1 = bpmem.tex[(texmap >> 2) & 1].texMode1[texmap & 3];
+
+	// the low 4 bits of max_lod are treated as the fractional part
+	const u8 maxLod = mode1.max_lod;
+	const u8 wholeLevels = maxLod >> 4;
+	const bool hasFraction = (maxLod & 0xf) != 0;
+
+	const u8 mip = (u8)(wholeLevels + (hasFraction ? 1 : 0));
+	return (s32)mip;
+}
+
 void DumpActiveTextures()
 {
     for (unsigned int stageNum = 0; stageNum < bpmem.genMode.numindstages; stageNum++)
     {
         u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
-        SaveTexture(StringFromFormat("%star%i_ind%i_map%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap).c_str(), texmap);     
+		s32 maxLod = GetMaxTextureLod(texmap);
+		for (s32 mip = 0; mip <= maxLod; ++mip) // inclusive: with a max LOD of 0 there is still mip 0 to dump
+		{
+			SaveTexture(StringFromFormat("%star%i_ind%i_map%i_mip%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap, mip).c_str(), texmap, mip);
+		}
     }
 
     for (unsigned int stageNum = 0; stageNum <= bpmem.genMode.numtevstages; stageNum++)
@@ -104,7 +119,11 @@ void DumpActiveTextures()
 
         int texmap = order.getTexMap(stageOdd);
 
-        SaveTexture(StringFromFormat("%star%i_stage%i_map%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap).c_str(), texmap);           
+        s32 maxLod = GetMaxTextureLod(texmap);
+		for (s32 mip = 0; mip <= maxLod; ++mip) // inclusive: with a max LOD of 0 there is still mip 0 to dump
+		{
+			SaveTexture(StringFromFormat("%star%i_stage%i_map%i_mip%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap, mip).c_str(), texmap, mip);
+		}
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
index c03d291f66..d40a4dc3e7 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
@@ -22,7 +22,7 @@ namespace DebugUtil
 {
     void Init();
 
-    void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height);
+    void GetTextureBGRA(u8 *dst, u32 texmap, s32 mip, int width, int height);
 
     void DumpActiveTextures();
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
index ab70b0fa96..5975e0967b 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
@@ -155,7 +155,7 @@ namespace HwRasterizer
         int width = texImage0.width;
         int height = texImage0.height;
 
-        DebugUtil::GetTextureBGRA(temp, 0, width, height);
+        DebugUtil::GetTextureBGRA(temp, 0, 0, width, height);
 
         glGenTextures(1, (GLuint *)&texture);
 		glBindTexture(GL_TEXTURE_RECTANGLE_ARB, texture);
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h b/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
index befc048f37..a4e9af8dad 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
@@ -18,6 +18,8 @@
 #ifndef _NATIVEVERTEXFORMAT_H
 #define _NATIVEVERTEXFORMAT_H
 
+#include "../../Plugin_VideoDX9/Src/Vec3.h"
+
 #ifdef WIN32
 #define LOADERDECL __cdecl
 #else
@@ -26,25 +28,33 @@
 
 typedef void (LOADERDECL *TPipelineFunction)();
 
+struct Vec4
+{	// plain four-component float vector; the type of OutputVertexData::projectedPosition
+	float x;
+	float y;
+	float z;
+	float w;	// divisor used by Clipper::PerspectiveDivide
+};
+
 struct InputVertexData
 {
     u8 posMtx;
     u8 texMtx[8];
 
-    float position[4];    
-    float normal[3][3];
+    Vec3 position;    
+    Vec3 normal[3];
     u8 color[2][4];
     float texCoords[8][2];
 };
 
 struct OutputVertexData
 {
-    float mvPosition[3];
-    float projectedPosition[4];
-    float screenPosition[3];
-    float normal[3][3];
+    Vec3 mvPosition;
+    Vec4 projectedPosition;
+    Vec3 screenPosition;
+    Vec3 normal[3];
     u8 color[2][4];
-    float texCoords[8][3];
+    Vec3 texCoords[8];
 
     void Lerp(float t, OutputVertexData *a, OutputVertexData *b)
     {
@@ -52,17 +62,16 @@ struct OutputVertexData
 
         #define LINTERP_INT(T, OUT, IN) (OUT) + (((IN - OUT) * T) >> 8)
 
-        for (int i = 0; i < 3; ++i)
-            mvPosition[i] = LINTERP(t, a->mvPosition[i], b->mvPosition[i]);
+        mvPosition = LINTERP(t, a->mvPosition, b->mvPosition);
 
-        for (int i = 0; i < 4; ++i)
-            projectedPosition[i] = LINTERP(t, a->projectedPosition[i], b->projectedPosition[i]);
+        projectedPosition.x = LINTERP(t, a->projectedPosition.x, b->projectedPosition.x);
+		projectedPosition.y = LINTERP(t, a->projectedPosition.y, b->projectedPosition.y);
+		projectedPosition.z = LINTERP(t, a->projectedPosition.z, b->projectedPosition.z);
+		projectedPosition.w = LINTERP(t, a->projectedPosition.w, b->projectedPosition.w);
 
         for (int i = 0; i < 3; ++i)
         {
-            normal[i][0] = LINTERP(t, a->normal[i][0], b->normal[i][0]);
-            normal[i][1] = LINTERP(t, a->normal[i][1], b->normal[i][1]);
-            normal[i][2] = LINTERP(t, a->normal[i][2], b->normal[i][2]);
+            normal[i] = LINTERP(t, a->normal[i], b->normal[i]);
         }
 
         u16 t_int = (u16)(t * 256);
@@ -74,9 +83,7 @@ struct OutputVertexData
 
         for (int i = 0; i < 8; ++i)
         {
-            texCoords[i][0] = LINTERP(t, a->texCoords[i][0], b->texCoords[i][0]);
-            texCoords[i][1] = LINTERP(t, a->texCoords[i][1], b->texCoords[i][1]);
-            texCoords[i][2] = LINTERP(t, a->texCoords[i][2], b->texCoords[i][2]);
+            texCoords[i] = LINTERP(t, a->texCoords[i], b->texCoords[i]);
         }
 
         #undef LINTERP
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
index cf85d494ac..22ecb1c6d5 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
@@ -27,8 +27,20 @@
 #include "VideoConfig.h"
 
 
-#define BLOCK_SIZE 8
+#define BLOCK_SIZE 2
 
+#define CLAMP(x, a, b) (((x) > (b)) ? (b) : (((x) < (a)) ? (a) : (x)))	// upper bound b is tested before lower bound a
+
+// Returns an approximation of log2(f) in s28.4, built from the float's exponent and its
+// top four mantissa bits; close enough to use for LOD. The sign bit of f is ignored.
+static inline s32 FixedLog2(float f)
+{
+	union { float value; u32 bits; } pun; // read the bits through a union; casting &f to u32* breaks strict aliasing
+	pun.value = f;
+	s32 logInt = ((pun.bits & 0x7F800000) >> 19) - 2032; // integer part: biased exponent << 4, minus 127 << 4
+	s32 logFract = (pun.bits & 0x007fffff) >> 19; // approximate fractional part: top 4 mantissa bits
+	return logInt + logFract;
+}
 
 namespace Rasterizer
 {
@@ -43,6 +55,7 @@ s32 scissorRight = 0;
 s32 scissorBottom = 0;
 
 Tev tev;
+RasterBlock rasterBlock;
 
 void Init()
 {
@@ -91,53 +104,58 @@ void SetTevReg(int reg, int comp, bool konst, s16 color)
     tev.SetRegColor(reg, comp, konst, color);
 }
 
-inline void Draw(s32 x, s32 y)
+inline void Draw(s32 x, s32 y, s32 xi, s32 yi)
 {
     INCSTAT(stats.thisFrame.rasterizedPixels);
 
-    float zFloat = 1.0f + ZSlope.GetValue(x, y);
-    if(zFloat < 0|| zFloat > 1)
-        return;
+	float zFloat = 1.0f + ZSlope.GetValue(x, y);
+	if (zFloat < 0.0f || zFloat > 1.0f)
+		return;
 
-    u32 z = (u32)(zFloat * 0x00ffffff);
+	s32 z = (s32)(zFloat * 0x00ffffff);
 
-    if (bpmem.zcontrol.zcomploc && bpmem.zmode.testenable)
-    {
-        // early z
-        if (!EfbInterface::ZCompare(x, y, z))
-            return;
-    }
+	if (bpmem.zcontrol.zcomploc && bpmem.zmode.testenable)
+	{
+		// early z
+		if (!EfbInterface::ZCompare(x, y, z))
+			return;
+	}
 
-    float invW = 1.0f / WSlope.GetValue(x, y);
+	RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
 
-    tev.Position[0] = x;
-    tev.Position[1] = y;
-    tev.Position[2] = z;
+	float invW = pixel.InvW;
 
-    for(unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
-    {
-        for(int comp = 0; comp < 4; comp++)
-            tev.Color[i][comp] = (u8)ColorSlopes[i][comp].GetValue(x, y);
-    }
+	tev.Position[0] = x;
+	tev.Position[1] = y;
+	tev.Position[2] = z;
 
-    for(unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
-    {
-        if (xfregs.texMtxInfo[i].projection)
-        {
-            float q = TexSlopes[i][2].GetValue(x, y) * invW;
-            float invQ = invW / q;
-            tev.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * invQ * (bpmem.texcoords[i].s.scale_minus_1 + 1);
-            tev.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * invQ * (bpmem.texcoords[i].t.scale_minus_1 + 1);
-            tev.Lod[i] = 0;
-        }
-        else
-        {
-            tev.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * invW * (bpmem.texcoords[i].s.scale_minus_1 + 1);
-            tev.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * invW * (bpmem.texcoords[i].t.scale_minus_1 + 1);
-            tev.Lod[i] = 0;
-        }
-    }
+	//  colors
+	for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
+	{
+		for(int comp = 0; comp < 4; comp++)
+			tev.Color[i][comp] = (u8)ColorSlopes[i][comp].GetValue(x, y);
+	}
 
+	// tex coords
+	for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
+	{
+		// multiply by 128 because TEV stores UVs as s17.7
+		tev.Uv[i].s = (s32)(pixel.Uv[i][0] * 128);
+		tev.Uv[i].t = (s32)(pixel.Uv[i][1] * 128);
+	}
+
+	for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
+	{
+		tev.IndirectLod[i] = rasterBlock.IndirectLod[i];
+		tev.IndirectLinear[i] = rasterBlock.IndirectLinear[i];
+	}
+
+	for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
+	{
+		tev.TextureLod[i] = rasterBlock.TextureLod[i];
+		tev.TextureLinear[i] = rasterBlock.TextureLinear[i];
+	}
+   
     tev.Draw();
 }
 
@@ -155,6 +173,109 @@ void InitSlope(Slope *slope, float f1, float f2, float f3, float DX31, float DX1
     slope->y0 = Y1;
 }
 
+// Absolute difference of two floats. Spelled out so the result never depends on which
+// abs() overload is visible; the integer one would truncate sub-texel deltas.
+static inline float AbsDelta(float a, float b) { return a > b ? a - b : b - a; }
+
+// Derives the LOD (s28.4) and the linear-filter flag for texmap from the UV deltas of
+// texcoord across rasterBlock, and writes them to lod and linear.
+inline void CalculateLOD(s32 &lod, bool &linear, u32 texmap, u32 texcoord)
+{
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+	u8 subTexmap = texmap & 3;
+
+	// LOD calculation requires data from the texture mode for bias, etc.
+	// it does not seem to use the actual texture size
+	TexMode0& tm0 = texUnit.texMode0[subTexmap];
+	TexMode1& tm1 = texUnit.texMode1[subTexmap];
+
+	float *uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
+	float sDelta, tDelta;
+	if (tm0.diag_lod)
+	{
+		// compare against the diagonal neighbour only
+		float *uv1 = rasterBlock.Pixel[1][1].Uv[texcoord];
+		sDelta = AbsDelta(uv0[0], uv1[0]);
+		tDelta = AbsDelta(uv0[1], uv1[1]);
+	}
+	else
+	{
+		float *uv1 = rasterBlock.Pixel[1][0].Uv[texcoord];
+		float *uv2 = rasterBlock.Pixel[0][1].Uv[texcoord];
+		sDelta = max(AbsDelta(uv0[0], uv1[0]), AbsDelta(uv0[0], uv2[0]));
+		tDelta = max(AbsDelta(uv0[1], uv1[1]), AbsDelta(uv0[1], uv2[1]));
+	}
+
+	// get LOD in s28.4
+	lod = FixedLog2(max(sDelta, tDelta));
+
+	// bias is s2.5
+	lod += tm0.lod_bias >> 1;
+
+	// a non-negative LOD uses bit 2 of min_filter, a negative LOD uses mag_filter
+	linear = ((lod >= 0 && (tm0.min_filter & 4)) || (lod < 0 && tm0.mag_filter));
+
+	// order of checks matters: the max test comes first, then the min test
+	lod = CLAMP(lod, (s32)tm1.min_lod, (s32)tm1.max_lod);
+}
+
+void BuildBlock(s32 blockX, s32 blockY)
+{	// Fills rasterBlock for the BLOCK_SIZE x BLOCK_SIZE pixels whose first pixel is (blockX, blockY).
+	for (s32 yi = 0; yi < BLOCK_SIZE; yi++)
+	{
+		for (s32 xi = 0; xi < BLOCK_SIZE; xi++)
+		{
+			RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
+			// screen position of this pixel
+			s32 x = xi + blockX;
+			s32 y = yi + blockY;			
+			// WSlope is set up from 1/w per vertex in DrawTriangleFrontFace; store the reciprocal of its value here
+			float invW = 1.0f / WSlope.GetValue(x, y);
+			pixel.InvW = invW;
+
+			// tex coords
+			for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
+			{
+				float projection;
+				if (xfregs.texMtxInfo[i].projection)
+				{
+					float q = TexSlopes[i][2].GetValue(x, y) * invW;
+					projection = invW / q;
+				}
+				else
+					projection = invW;
+
+				pixel.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * projection;
+				pixel.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * projection;
+			}
+		}
+	}
+	// one LOD per indirect stage; each stage consumes 6 bits of tevindref (two shifts by 3)
+	u32 indref = bpmem.tevindref.hex;
+	for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
+	{
+		u32 texmap = indref & 3;	// NOTE(review): the field is shifted out as 3 bits but masked with 3, not 7 - confirm
+		indref >>= 3;
+		u32 texcoord = indref & 3;	// NOTE(review): same 2-bit mask on a 3-bit step - confirm
+		indref >>= 3;
+
+		CalculateLOD(rasterBlock.IndirectLod[i], rasterBlock.IndirectLinear[i], texmap, texcoord);
+	}
+	// one LOD per TEV stage whose order entry reports it as enabled
+	for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
+	{
+		int stageOdd = i&1;
+		TwoTevStageOrders &order = bpmem.tevorders[i >> 1];
+		if(order.getEnable(stageOdd))
+		{
+			u32 texmap = order.getTexMap(stageOdd);
+			u32 texcoord = order.getTexCoord(stageOdd);
+
+			CalculateLOD(rasterBlock.TextureLod[i], rasterBlock.TextureLinear[i], texmap, texcoord);
+		}
+	}
+}
+
 void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2)
 {
     INCSTAT(stats.thisFrame.numTrianglesDrawn);
@@ -217,7 +338,7 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
     float fltdy12 = flty1 - v1->screenPosition[1];
     float fltdy31 = v2->screenPosition[1] - flty1;
 
-    float w[3] = { 1.0f / v0->projectedPosition[3], 1.0f / v1->projectedPosition[3], 1.0f / v2->projectedPosition[3] };
+    float w[3] = { 1.0f / v0->projectedPosition.w, 1.0f / v1->projectedPosition.w, 1.0f / v2->projectedPosition.w };
     InitSlope(&WSlope, w[0], w[1], w[2], fltdx31, fltdx12, fltdy12, fltdy31, fltx1, flty1);
 
     InitSlope(&ZSlope, v0->screenPosition[2], v1->screenPosition[2], v2->screenPosition[2], fltdx31, fltdx12, fltdy12, fltdy31, fltx1, flty1);
@@ -281,14 +402,16 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
             // Skip block when outside an edge
             if(a == 0x0 || b == 0x0 || c == 0x0) continue;
 
+			BuildBlock(x, y);
+
             // Accept whole block when totally covered
             if(a == 0xF && b == 0xF && c == 0xF)
             {
                 for(s32 iy = 0; iy < BLOCK_SIZE; iy++)
                 {
-                    for(s32 ix = x; ix < x + BLOCK_SIZE; ix++)
+                    for(s32 ix = 0; ix < BLOCK_SIZE; ix++)
                     {                        
-                        Draw(ix, iy + y);
+                        Draw(x + ix, y + iy, ix, iy);
                     }
                 }
             }
@@ -298,17 +421,17 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
                 s32 CY2 = C2 + DX23 * y0 - DY23 * x0;
                 s32 CY3 = C3 + DX31 * y0 - DY31 * x0;
 
-                for(s32 iy = y; iy < y + BLOCK_SIZE; iy++)
+                for(s32 iy = 0; iy < BLOCK_SIZE; iy++)
                 {
                     s32 CX1 = CY1;
                     s32 CX2 = CY2;
                     s32 CX3 = CY3;
 
-                    for(s32 ix = x; ix < x + BLOCK_SIZE; ix++)
+                    for(s32 ix = 0; ix < BLOCK_SIZE; ix++)
                     {
                         if(CX1 > 0 && CX2 > 0 && CX3 > 0)
                         {
-                            Draw(ix, iy);
+                            Draw(x + ix, y + iy, ix, iy);
                         }
 
                         CX1 -= FDY12;
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
index 33c152703e..403b0459ba 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
@@ -39,6 +39,21 @@ namespace Rasterizer
         float y0;
         float GetValue(s32 x, s32 y) { return f0 + (dfdx * (x - x0)) + (dfdy * (y - y0)); }
     };
+
+	struct RasterBlockPixel
+	{	// per-pixel values that BuildBlock computes and Draw reads back
+		float InvW;	// reciprocal of the interpolated WSlope value at this pixel
+		float Uv[8][2];	// [texgen][0 = s, 1 = t] after the projection factor has been applied
+	};
+
+	struct RasterBlock
+	{	// values shared by all pixels of the block currently being rasterized
+		RasterBlockPixel Pixel[2][2];	// indexed [xi][yi]; NOTE(review): extent has to match BLOCK_SIZE in Rasterizer.cpp
+		s32 IndirectLod[4];	// LOD per indirect stage, written by CalculateLOD
+		bool IndirectLinear[4];	// linear-filter flag per indirect stage
+		s32 TextureLod[16];	// LOD per TEV stage, written by CalculateLOD
+		bool TextureLinear[16];	// linear-filter flag per TEV stage
+	};
     
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
index de28989972..6bc92dc071 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
@@ -134,10 +134,38 @@ void SetupUnit::SetupTriFan()
 }
 
 void SetupUnit::SetupLine()
-{}
+{
+	if (m_VertexCounter < 1)
+    {
+        m_VertexCounter++;
+        m_VertWritePointer = m_VertPointer[m_VertexCounter];
+        return;
+    }
+
+    Clipper::ProcessLine(m_VertPointer[0], m_VertPointer[1]);
+
+    m_VertexCounter = 0;
+    m_VertWritePointer = m_VertPointer[0];
+}
 
 void SetupUnit::SetupLineStrip()
-{}
+{
+	if (m_VertexCounter < 1)
+    {
+        m_VertexCounter++;
+		m_VertWritePointer = m_VertPointer[m_VertexCounter];
+        return;
+    }
+
+	m_VertexCounter++;
+
+    Clipper::ProcessLine(m_VertPointer[0], m_VertPointer[1]);
+
+	m_VertWritePointer = m_VertPointer[0];
+
+	m_VertPointer[0] = m_VertPointer[1];
+	m_VertPointer[1] = &m_Vertices[m_VertexCounter & 1];
+}
 
 void SetupUnit::SetupPoint()
 {}
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
index 83c095c972..680806e85a 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
@@ -439,34 +439,33 @@ static bool AlphaTest(int alpha)
     return true;
 }
 
-inline float WrapIndirectCoord(float coord, int wrapMode)
+inline s32 WrapIndirectCoord(s32 coord, int wrapMode)
 {
     switch (wrapMode) {
         case ITW_OFF:
             return coord;
         case ITW_256:
-            return fmod(coord, 256);
-         case ITW_128:
-            return fmod(coord, 128);
+            return (coord % (256 << 7));
+        case ITW_128:
+            return (coord % (128 << 7));
         case ITW_64:
-            return fmod(coord, 64);
+            return (coord % (64 << 7));
         case ITW_32:
-            return fmod(coord, 32);
+            return (coord % (32 << 7));
         case ITW_16:
-            return fmod(coord, 16);
+            return (coord % (16 << 7));
         case ITW_0:
             return 0;
     }
     return 0;
 }
 
-void Tev::Indirect(unsigned int stageNum, float s, float t)
+void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
 {
     TevStageIndirect &indirect = bpmem.tevind[stageNum];
     u8 *indmap = IndirectTex[indirect.bt];
-    
 
-    float indcoord[3];
+    s32 indcoord[3];
 
     // alpha bump select
     switch (indirect.bs) {
@@ -494,32 +493,32 @@ void Tev::Indirect(unsigned int stageNum, float s, float t)
     // format
     switch(indirect.fmt) {
         case ITF_8:
-            indcoord[0] = (float)indmap[ALP_C] + bias[0];
-            indcoord[1] = (float)indmap[BLU_C] + bias[1];
-            indcoord[2] = (float)indmap[GRN_C] + bias[2];
+            indcoord[0] = indmap[ALP_C] + bias[0];
+            indcoord[1] = indmap[BLU_C] + bias[1];
+            indcoord[2] = indmap[GRN_C] + bias[2];
             AlphaBump = AlphaBump & 0xf8;
             break;
         case ITF_5:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x1f) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x1f) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x1f) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x1f) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x1f) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x1f) + bias[2];
             AlphaBump = AlphaBump & 0xe0;
             break;
         case ITF_4:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x0f) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x0f) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x0f) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x0f) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x0f) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x0f) + bias[2];
             AlphaBump = AlphaBump & 0xf0;
             break;
         case ITF_3:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x07) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x07) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x07) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x07) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x07) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x07) + bias[2];
             AlphaBump = AlphaBump & 0xf8;
             break;
     }
 
-    float indtevtrans[2] = { 0,0 };
+    s64 indtevtrans[2] = { 0,0 };
 
     // matrix multiply
     int indmtxid = indirect.mid & 3;
@@ -529,39 +528,40 @@ void Tev::Indirect(unsigned int stageNum, float s, float t)
         int scale = ((u32)indmtx.col0.s0 << 0) |
 	                ((u32)indmtx.col1.s1 << 2) |
 	                ((u32)indmtx.col2.s2 << 4);
-        float fscale = 0.0f;
+
+		int shift;
 
         switch (indirect.mid & 12) {
-            case 0:
-                fscale = powf(2.0f, (float)(scale - 17)) / 1024.0f;
+            case 0:   
+				shift = 3 + (17 - scale);
                 indtevtrans[0] = indmtx.col0.ma * indcoord[0] + indmtx.col1.mc * indcoord[1] + indmtx.col2.me * indcoord[2];
                 indtevtrans[1] = indmtx.col0.mb * indcoord[0] + indmtx.col1.md * indcoord[1] + indmtx.col2.mf * indcoord[2];
                 break;
             case 4: // s matrix
-                fscale = powf(2.0f, (float)(scale - 17)) / 256;
+				shift = 8 + (17 - scale);
                 indtevtrans[0] = s * indcoord[0];
                 indtevtrans[1] = t * indcoord[0];
                 break;
             case 8: // t matrix
-                fscale = powf(2.0f, (float)(scale - 17)) / 256;
+				shift = 8 + (17 - scale);
                 indtevtrans[0] = s * indcoord[1];
                 indtevtrans[1] = t * indcoord[1];
                 break;
         }
 
-        indtevtrans[0] *= fscale;
-        indtevtrans[1] *= fscale;
+		indtevtrans[0] = shift >= 0 ? indtevtrans[0] >> shift : indtevtrans[0] << -shift;
+		indtevtrans[1] = shift >= 0 ? indtevtrans[1] >> shift : indtevtrans[1] << -shift;
     }
 
-    if (indirect.fb_addprev)
+	if (indirect.fb_addprev)
     {
-        TexCoord[0] += WrapIndirectCoord(s, indirect.sw) + indtevtrans[0];
-        TexCoord[1] += WrapIndirectCoord(t, indirect.tw) + indtevtrans[1];
+        TexCoord.s += (int)(WrapIndirectCoord(s, indirect.sw) + indtevtrans[0]);
+        TexCoord.t += (int)(WrapIndirectCoord(t, indirect.tw) + indtevtrans[1]);
     }
     else
     {
-        TexCoord[0] = WrapIndirectCoord(s, indirect.sw) + indtevtrans[0];
-        TexCoord[1] = WrapIndirectCoord(t, indirect.tw) + indtevtrans[1];
+        TexCoord.s = (int)(WrapIndirectCoord(s, indirect.sw) + indtevtrans[0]);
+        TexCoord.t = (int)(WrapIndirectCoord(t, indirect.tw) + indtevtrans[1]);
     }
 }
 
@@ -580,10 +580,12 @@ void Tev::Draw()
         u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
         u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
-        float scaleS = bpmem.texscale[stageNum2].getScaleS(stageOdd);
-        float scaleT = bpmem.texscale[stageNum2].getScaleT(stageOdd);
+		const TEXSCALE& texscale = bpmem.texscale[stageNum2];
+		s32 scaleS = stageOdd ? texscale.ss1:texscale.ss0;
+        s32 scaleT = stageOdd ? texscale.ts1:texscale.ts0;
 
-        TextureSampler::Sample(Uv[texcoordSel][0] * scaleS, Uv[texcoordSel][1] * scaleT, Lod[texcoordSel], texmap, IndirectTex[stageNum]);
+        TextureSampler::Sample(Uv[texcoordSel].s >> scaleS, Uv[texcoordSel].t >> scaleT,
+			IndirectLod[stageNum], IndirectLinear[stageNum], texmap, IndirectTex[stageNum]);
 
 #ifdef _DEBUG
         if (g_Config.bDumpTevStages)
@@ -608,14 +610,14 @@ void Tev::Draw()
         int texcoordSel = order.getTexCoord(stageOdd);
         int texmap = order.getTexMap(stageOdd);
 
-        Indirect(stageNum, Uv[texcoordSel][0], Uv[texcoordSel][1]);
+        Indirect(stageNum, Uv[texcoordSel].s, Uv[texcoordSel].t);
 
         // sample texture
         if (order.getEnable(stageOdd))
         {
             u8 texel[4];
     
-            TextureSampler::Sample(TexCoord[0], TexCoord[1], Lod[texcoordSel], texmap, texel);
+			TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum], TextureLinear[stageNum], texmap, texel);
 
             int swaptable = ac.tswap * 2;            
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
index 0419df8086..caaa88cfc9 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
@@ -21,7 +21,20 @@
 #include "BPMemLoader.h"
 
 class Tev
-{
+{ 
+	struct InputRegType { // four packed bit-field operands - presumably the TEV combiner inputs a, b, c, d; TODO confirm against the users of this type
+        unsigned a : 8;
+        unsigned b : 8;
+        unsigned c : 8;
+        signed   d : 11; // the only signed member, and wider (11 bits) than a/b/c
+    };
+
+	struct TextureCoordinateType // fixed-point texture coordinate pair; replaces the former float TexCoord[2] / Uv[8][2]
+	{
+		signed s : 24; // 24-bit signed field; storing a wider value narrows it (implementation-defined in C++), which is how overflow is emulated per the commit description
+		signed t : 24; // presumably 7 fractional bits, judging by the ">> 7" used for the integer part in TextureSampler - confirm
+	};
+
     // color order: RGBA
     s16 Reg[4][4];    
     s16 KonstantColors[4][4];
@@ -32,7 +45,7 @@ class Tev
     s16 Zero16[4];
     u8 AlphaBump;
     u8 IndirectTex[4][4];
-    float TexCoord[2];
+	TextureCoordinateType TexCoord;
 
     s16 *m_ColorInputLUT[16][3];
     s16 *m_AlphaInputLUT[8];        // values must point to RGBA color
@@ -49,20 +62,16 @@ class Tev
     void DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac);
     void DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac);
 
-    void Indirect(unsigned int stageNum, float s, float t);    
-
-    struct InputRegType {
-        unsigned a : 8;
-        unsigned b : 8;
-        unsigned c : 8;
-        signed   d : 11;
-    };
+    void Indirect(unsigned int stageNum, s32 s, s32 t);
 
 public:
-    s32 Position[3];
+	s32 Position[3];
     u8 Color[2][4];
-    float Uv[8][2];
-    float Lod[8];
+    TextureCoordinateType Uv[8];
+    s32 IndirectLod[4];
+	bool IndirectLinear[4];
+	s32 TextureLod[16];
+	bool TextureLinear[16];
 
     void Init();
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
index e7001a537b..44878e262b 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
@@ -23,29 +23,11 @@
 
 #include <cmath>
 
+#define ALLOW_MIPMAP 1
+
 namespace TextureSampler
 {
 
-inline int iround(float x)
-{
-    int t;
-
-#if defined(_WIN32) && !defined(_M_X64)
-    __asm
-    {
-        fld  x
-        fistp t
-    }
-#else
-	t = (int)x;
-	if((x - t) >= 0.5)
-		return t + 1;
-#endif
-
-    return t;
-}
-
-
 inline void WrapCoord(int &coord, int wrapMode, int imageSize)
 {
     switch (wrapMode)
@@ -85,9 +67,53 @@ inline void AddTexel(u8 *inTexel, u32 *outTexel, u32 fract)
     outTexel[3] += inTexel[3] * fract;
 }
 
-void Sample(float s, float t, float lod, u8 texmap, u8 *sample)
+void Sample(s32 s, s32 t, s32 lod, bool linear, u8 texmap, u8 *sample) // picks the mip level(s) from lod and writes the 4-byte result to sample via SampleMip
 {
-    FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+    int baseMip = 0;
+	bool mipLinear = false;
+
+#if (ALLOW_MIPMAP)
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+    TexMode0& tm0 = texUnit.texMode0[texmap & 3];
+
+	s32 lodFract = lod & 0xf; // low 4 bits of lod: fraction between two mip levels in 1/16 steps
+
+	if (lod > 0 && tm0.min_filter & 3) // parses as lod > 0 && (min_filter & 3): mips are used only when one of the two low min_filter bits is set
+	{
+		// use mipmap
+		baseMip = lod >> 4; // integer part of lod
+		mipLinear = (lodFract && tm0.min_filter & 2); // blend two mips only if bit 1 of min_filter is set and the fraction is non-zero
+
+		// if using nearest mip filter and lodFract >= 0.5 round up to next mip
+		baseMip += (lodFract >> 3) & (tm0.min_filter & 1); // (lodFract >> 3) is 1 when lodFract >= 8; gated by bit 0 of min_filter
+	}
+
+	if (mipLinear)
+	{
+		u8 sampledTex[4];
+        u32 texel[4];
+
+		SampleMip(s, t, baseMip, linear, texmap, sampledTex);
+		SetTexel(sampledTex, texel, (16 - lodFract)); // weights (16 - lodFract) and lodFract sum to 16
+
+		SampleMip(s, t, baseMip + 1, linear, texmap, sampledTex);
+		AddTexel(sampledTex, texel, lodFract);
+
+		sample[0] = (u8)(texel[0] >> 4); // divide by the weight total of 16
+        sample[1] = (u8)(texel[1] >> 4);
+        sample[2] = (u8)(texel[2] >> 4);
+        sample[3] = (u8)(texel[3] >> 4);
+	}
+	else
+#endif
+	{ // with ALLOW_MIPMAP disabled this is the only path and always samples mip 0
+		SampleMip(s, t, baseMip, linear, texmap, sample);
+	}	
+}
+
+void SampleMip(s32 s, s32 t, s32 mip, bool linear, u8 texmap, u8 *sample)
+{
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
     u8 subTexmap = texmap & 3;
 
     TexMode0& tm0 = texUnit.texMode0[subTexmap];
@@ -97,59 +123,85 @@ void Sample(float s, float t, float lod, u8 texmap, u8 *sample)
     u32 imageBase = texUnit.texImage3[subTexmap].image_base << 5;    
     u8 *imageSrc = g_VideoInitialize.pGetMemoryPointer(imageBase);
 
-    bool linear = false;
-    if ((lod > 0 && tm0.min_filter > 4) || (lod <= 0 && tm0.mag_filter))
-        linear = true;
+	int imageWidth = ti0.width;
+	int imageHeight = ti0.height;
+
+	int tlutAddress = texTlut.tmem_offset << 9;
+	
+	// reduce sample location and texture size to mip level
+	// move texture pointer to mip location
+	if (mip)
+	{
+		int mipWidth = imageWidth + 1;
+		int mipHeight = imageHeight + 1;
+
+		int fmtWidth = TexDecoder_GetBlockWidthInTexels(ti0.format);
+		int fmtHeight = TexDecoder_GetBlockHeightInTexels(ti0.format);
+		int fmtDepth = TexDecoder_GetTexelSizeInNibbles(ti0.format);
+
+		imageWidth >>= mip;
+		imageHeight >>= mip;
+		s >>= mip;
+		t >>= mip;
+
+		while (mip)
+		{
+			mipWidth = max(mipWidth, fmtWidth);
+			mipHeight = max(mipHeight, fmtHeight);
+			u32 size = (mipWidth * mipHeight * fmtDepth) >> 1;
+
+			imageSrc += size;
+			mipWidth >>= 1;
+			mipHeight >>= 1;
+			mip--;
+		}
+	}
+
+	// integer part of sample location
+	int imageS = s >> 7;
+	int imageT = t >> 7;
 
     if (linear)
     {
-        s32 s256 = s32((s - 0.5f) * 256);
-        s32 t256 = s32((t- 0.5f) * 256);
-
-        int imageS = s256 >> 8;
-        int imageSPlus1 = imageS + 1;
-        u32 fractS = s256 & 0xff;
-        fractS += fractS >> 7;
-
-        int imageT = t256 >> 8;
+        // linear sampling
+		int imageSPlus1 = imageS + 1;
+        int fractS = s & 0x7f;
+        
         int imageTPlus1 = imageT + 1;
-        u32 fractT = t256 & 0xff;
-        fractT += fractT >> 7;
+        int fractT = t & 0x7f;
 
         u8 sampledTex[4];
         u32 texel[4];
 
-        WrapCoord(imageS, tm0.wrap_s, ti0.width);
-        WrapCoord(imageT, tm0.wrap_t, ti0.height);
-        WrapCoord(imageSPlus1, tm0.wrap_s, ti0.width);
-        WrapCoord(imageTPlus1, tm0.wrap_t, ti0.height);
+        WrapCoord(imageS, tm0.wrap_s, imageWidth);
+        WrapCoord(imageT, tm0.wrap_t, imageHeight);
+        WrapCoord(imageSPlus1, tm0.wrap_s, imageWidth);
+        WrapCoord(imageTPlus1, tm0.wrap_t, imageHeight);
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        SetTexel(sampledTex, texel, (256 - fractS) * (256 - fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        SetTexel(sampledTex, texel, (128 - fractS) * (128 - fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        AddTexel(sampledTex, texel, (fractS) * (256 - fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        AddTexel(sampledTex, texel, (fractS) * (128 - fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageTPlus1, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        AddTexel(sampledTex, texel, (256 - fractS) * (fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageTPlus1, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        AddTexel(sampledTex, texel, (128 - fractS) * (fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageTPlus1, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageTPlus1, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
         AddTexel(sampledTex, texel, (fractS) * (fractT));
 
-        sample[0] = (u8)(texel[0] >> 16);
-        sample[1] = (u8)(texel[1] >> 16);
-        sample[2] = (u8)(texel[2] >> 16);
-        sample[3] = (u8)(texel[3] >> 16);
+        sample[0] = (u8)(texel[0] >> 14);
+        sample[1] = (u8)(texel[1] >> 14);
+        sample[2] = (u8)(texel[2] >> 14);
+        sample[3] = (u8)(texel[3] >> 14);
     }
     else
     {
-        int imageS = int(s);
-        int imageT = int(t);
+        // nearest neighbor sampling
+		WrapCoord(imageS, tm0.wrap_s, imageWidth);
+        WrapCoord(imageT, tm0.wrap_t, imageHeight);
 
-        WrapCoord(imageS, tm0.wrap_s, ti0.width);
-        WrapCoord(imageT, tm0.wrap_t, ti0.height);
-
-        TexDecoder_DecodeTexel(sample, imageSrc, imageS, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);   
+        TexDecoder_DecodeTexel(sample, imageSrc, imageS, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);   
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
index 27d786068d..b456769c92 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
@@ -23,7 +23,9 @@
 
 namespace TextureSampler
 {
-    void Sample(float s, float t, float lod, u8 texmap, u8 *sample);
+	void Sample(s32 s, s32 t, s32 lod, bool linear, u8 texmap, u8 *sample);
+
+	void SampleMip(s32 s, s32 t, s32 mip, bool linear, u8 texmap, u8 *sample);
 }
 
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
index b955d233e9..c614af8627 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
@@ -22,6 +22,7 @@
 #include "TransformUnit.h"
 #include "XFMemLoader.h"
 #include "CPMemLoader.h"
+#include "BPMemLoader.h"
 #include "NativeVertexFormat.h"
 
 #include "../../Plugin_VideoDX9/Src/Vec3.h"
@@ -30,48 +31,48 @@
 namespace TransformUnit
 {
 
-void MultiplyVec2Mat24(const float *vec, const float *mat, float *result)
+void MultiplyVec2Mat24(const Vec3 &vec, const float *mat, Vec3 &result) // first two rows of mat applied to (vec.x, vec.y, 1, 1); result.z is not written
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] + mat[7];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] + mat[3]; // columns 2 and 3 are added unscaled, i.e. the absent z and w inputs act as 1
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] + mat[7]; // reads vec.x after result.x was stored: vec and result must not alias
 }
 
-void MultiplyVec2Mat34(const float *vec, const float *mat, float *result)
+void MultiplyVec2Mat34(const Vec3 &vec, const float *mat, Vec3 &result) // three rows of mat applied to (vec.x, vec.y, 1, 1); vec.z is ignored
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] + mat[7];
-    result[2] = mat[8] * vec[0] + mat[9] * vec[1] + mat[10] + mat[11];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] + mat[3]; // columns 2 and 3 are added unscaled (z and w inputs act as 1)
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] + mat[7]; // vec is read again after result.x was stored: vec and result must not alias
+    result.z = mat[8] * vec.x + mat[9] * vec.y + mat[10] + mat[11];
 }
 
-void MultiplyVec3Mat33(const float *vec, const float *mat, float *result)
+void MultiplyVec3Mat33(const Vec3 &vec, const float *mat, Vec3 &result) // result = 3x3 mat * vec, no constant term; each row is 3 consecutive floats
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
-    result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
-    result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] * vec.z;
+    result.y = mat[3] * vec.x + mat[4] * vec.y + mat[5] * vec.z; // vec is read again after result.x was stored: vec and result must not alias
+    result.z = mat[6] * vec.x + mat[7] * vec.y + mat[8] * vec.z;
 }
 
-void MultiplyVec3Mat34(const float *vec, const float *mat, float *result)
+void MultiplyVec3Mat34(const Vec3 &vec, const float *mat, Vec3 &result) // result = 3x4 mat applied to (vec, 1); each row is 4 consecutive floats
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] * vec[2] + mat[7];
-    result[2] = mat[8] * vec[0] + mat[9] * vec[1] + mat[10] * vec[2] + mat[11];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] * vec.z + mat[3]; // fourth column (mat[3], mat[7], mat[11]) is the constant term
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] * vec.z + mat[7]; // vec is read again after result.x was stored: vec and result must not alias
+    result.z = mat[8] * vec.x + mat[9] * vec.y + mat[10] * vec.z + mat[11];
 }
 
-void MultipleVec3Perspective(const float *vec, const float *proj, float *result)
+void MultipleVec3Perspective(const Vec3 &vec, const float *proj, Vec4 &result) // applies the six packed projection values in proj; result.w carries -vec.z
 {
-    result[0] = proj[0] * vec[0] + proj[1] * vec[2];
-    result[1] = proj[2] * vec[1] + proj[3] * vec[2];
-    //result[2] = (proj[4] * vec[2] + proj[5]);
-    result[2] = (proj[4] * vec[2] + proj[5]) * (1.0f - (float)1e-7);
-    result[3] = -vec[2];
+    result.x = proj[0] * vec.x + proj[1] * vec.z;
+    result.y = proj[2] * vec.y + proj[3] * vec.z;
+    //result.z = (proj[4] * vec.z + proj[5]);
+    result.z = (proj[4] * vec.z + proj[5]) * (1.0f - (float)1e-7); // NOTE(review): z is scaled by (1 - 1e-7); the reason is not evident here - confirm
+    result.w = -vec.z;
 }
 
-void MultipleVec3Ortho(const float *vec, const float *proj, float *result)
+void MultipleVec3Ortho(const Vec3 &vec, const float *proj, Vec4 &result) // per-axis scale and offset taken from pairs of proj values; result.w is fixed at 1
 {
-    result[0] = proj[0] * vec[0] + proj[1];
-    result[1] = proj[2] * vec[1] + proj[3];
-    result[2] = proj[4] * vec[2] + proj[5];
-    result[3] = 1;
+    result.x = proj[0] * vec.x + proj[1];
+    result.y = proj[2] * vec.y + proj[3];
+    result.z = proj[4] * vec.z + proj[5];
+    result.w = 1; // no division by depth is set up in this path
 }
 
 void TransformPosition(const InputVertexData *src, OutputVertexData *dst)
@@ -98,55 +99,53 @@ void TransformNormal(const InputVertexData *src, bool nbt, OutputVertexData *dst
         MultiplyVec3Mat33(src->normal[0], mat, dst->normal[0]);
         MultiplyVec3Mat33(src->normal[1], mat, dst->normal[1]);
         MultiplyVec3Mat33(src->normal[2], mat, dst->normal[2]);
-        Vec3 *norm0 = (Vec3*)dst->normal[0];
-        norm0->normalize();
+        dst->normal[0].normalize();
     }
     else
     {
         MultiplyVec3Mat33(src->normal[0], mat, dst->normal[0]);
-        Vec3 *norm0 = (Vec3*)dst->normal[0];
-        norm0->normalize();
+        dst->normal[0].normalize();
     }    
 }
 
 inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bool specialCase, const InputVertexData *srcVertex, OutputVertexData *dstVertex)
 {
-    const float *src;
+    const Vec3 *src;
     switch (texinfo.sourcerow)
     {
         case XF_SRCGEOM_INROW:
-            src = srcVertex->position;
+            src = &srcVertex->position;
             break;
         case XF_SRCNORMAL_INROW:
-            src = srcVertex->normal[0];
+            src = &srcVertex->normal[0];
             break;
         case XF_SRCBINORMAL_T_INROW:
-            src = srcVertex->normal[1];
+            src = &srcVertex->normal[1];
             break;
         case XF_SRCBINORMAL_B_INROW:
-            src = srcVertex->normal[2];
+            src = &srcVertex->normal[2];
             break;
         default:
             _assert_(texinfo.sourcerow >= XF_SRCTEX0_INROW && texinfo.sourcerow <= XF_SRCTEX7_INROW);
-            src = srcVertex->texCoords[texinfo.sourcerow - XF_SRCTEX0_INROW];
+            src = (Vec3*)srcVertex->texCoords[texinfo.sourcerow - XF_SRCTEX0_INROW];
             break;
     }
 
     const float *mat = (const float*)&xfregs.posMatrices[srcVertex->texMtx[coordNum] * 4];
-    float *dst = dstVertex->texCoords[coordNum];
+    Vec3 *dst = &dstVertex->texCoords[coordNum];
 
     if (texinfo.inputform == XF_TEXINPUT_AB11)
     {
-        MultiplyVec2Mat34(src, mat, dst); 
+        MultiplyVec2Mat34(*src, mat, *dst); 
     }
     else
     {
-        MultiplyVec3Mat34(src, mat, dst); 
+        MultiplyVec3Mat34(*src, mat, *dst); 
     }
 
     if (xfregs.dualTexTrans)
     {
-        float tempCoord[3];
+        Vec3 tempCoord;
 
         // normalize
         const PostMtxInfo &postInfo = xfregs.postMtxInfo[coordNum];
@@ -157,12 +156,12 @@ inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bo
 			// no normalization
 			// q of input is 1
 			// q of output is unknown
-			tempCoord[0] = dst[0];
-			tempCoord[1] = dst[1];
+			tempCoord.x = dst->x;
+			tempCoord.y = dst->y;
 
-			dst[0] = postMat[0] * tempCoord[0] + postMat[1] * tempCoord[1] + postMat[2] + postMat[3];
-			dst[1] = postMat[4] * tempCoord[0] + postMat[5] * tempCoord[1] + postMat[6] + postMat[7];
-			dst[2] = 0.0f;
+			dst->x = postMat[0] * tempCoord.x + postMat[1] * tempCoord.y + postMat[2] + postMat[3];
+			dst->y = postMat[4] * tempCoord.x + postMat[5] * tempCoord.y + postMat[6] + postMat[7];
+			dst->z = 1.0f;
 		}
 		else
 		{		
@@ -170,18 +169,14 @@ inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bo
 			{
-				float length = sqrtf(dst[0] * dst[0] + dst[1] * dst[1] + dst[2] * dst[2]);
+				float length = sqrtf(dst->x * dst->x + dst->y * dst->y + dst->z * dst->z); // dst is a Vec3* now: dst[1] and dst[2] would be the neighbouring texcoords
 				float invL = 1.0f / length;
-				tempCoord[0] = invL * dst[0];
-				tempCoord[1] = invL * dst[1];
-				tempCoord[2] = invL * dst[2];
+				tempCoord = *dst * invL; // normalized copy of the texcoord
 			}
 			else
 			{
-				tempCoord[0] = dst[0];
-				tempCoord[1] = dst[1];
-				tempCoord[2] = dst[2];
+				tempCoord = *dst;
 			}
 
-			MultiplyVec3Mat34(tempCoord, postMat, dst);
+			MultiplyVec3Mat34(tempCoord, postMat, *dst); // the temporary keeps input and output from aliasing
 		}
     }
 }
@@ -220,13 +215,8 @@ inline float SafeDivide(float n, float d)
     return (d==0)?(n>0?1:0):n/d;
 }
 
-void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const LitChannel &chan, Vec3 &lightCol)
+void LightColor(const Vec3 &pos, const Vec3 &normal, u8 lightNum, const LitChannel &chan, Vec3 &lightCol)
 {
-    // must be the size of 3 32bit floats for the light pointer to be valid
-    _assert_(sizeof(Vec3) == 12);
-
-    const Vec3 *pos = (const Vec3*)vertexPos;
-    const Vec3 *norm0 = (const Vec3*)normal;
     const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*lightNum];
 
     if (!(chan.attnfunc & 1)) {
@@ -237,15 +227,15 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = ldir * (*norm0);
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = ldir * normal;
                     AddScaledIntegerColor(light->color, diffuse, lightCol);
                 }
                 break;
             case LIGHTDIF_CLAMP:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = max(0.0f, ldir * (*norm0));
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = max(0.0f, ldir * normal);
                     AddScaledIntegerColor(light->color, diffuse, lightCol);
                 }
                 break;
@@ -254,7 +244,7 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
     }
     else { // spec and spot
         // not sure about divide by zero checks
-        Vec3 ldir = light->pos - *pos;
+        Vec3 ldir = light->pos - pos;
         float attn;
 
         if (chan.attnfunc == 3) { // spot
@@ -269,7 +259,7 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
         }
         else if (chan.attnfunc == 1) { // specular
             // donko - what is going on here?  655.36 is a guess but seems about right.
-            attn = (light->pos * (*norm0)) > -655.36 ? max(0.0f, (light->dir * (*norm0))) : 0;
+            attn = (light->pos * normal) > -655.36 ? max(0.0f, (light->dir * normal)) : 0;
             ldir.set(1.0f, attn, attn * attn);
 
             float cosAtt = max(0.0f, light->cosatt * ldir);
@@ -283,14 +273,14 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    float difAttn = ldir * (*norm0);
+                    float difAttn = ldir * normal;
                     AddScaledIntegerColor(light->color, attn * difAttn, lightCol);
                 }
                 break;
 
             case LIGHTDIF_CLAMP:
                 {
-                    float difAttn = max(0.0f, ldir * (*norm0));
+                    float difAttn = max(0.0f, ldir * normal);
                     AddScaledIntegerColor(light->color, attn * difAttn, lightCol);
                 }
                 break;
@@ -299,13 +289,8 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
     }
 }
 
-void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const LitChannel &chan, float &lightCol)
+void LightAlpha(const Vec3 &pos, const Vec3 &normal, u8 lightNum, const LitChannel &chan, float &lightCol)
 {
-    // must be the size of 3 32bit floats for the light pointer to be valid
-    _assert_(sizeof(Vec3) == 12);
-
-    const Vec3 *pos = (const Vec3*)vertexPos;
-    const Vec3 *norm0 = (const Vec3*)normal;
     const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*lightNum];
 
     if (!(chan.attnfunc & 1)) {
@@ -316,15 +301,15 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();                    
-                    float diffuse = ldir * (*norm0);
+                    Vec3 ldir = (light->pos - pos).normalized();                    
+                    float diffuse = ldir * normal;
                     lightCol += light->color[0] * diffuse;
                 }
                 break;
             case LIGHTDIF_CLAMP:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = max(0.0f, ldir * (*norm0));
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = max(0.0f, ldir * normal);
                     lightCol += light->color[0] * diffuse;
                 }
                 break;
@@ -332,7 +317,7 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
         }
     }
     else { // spec and spot
-        Vec3 ldir = light->pos - *pos;
+        Vec3 ldir = light->pos - pos;
         float attn;
 
         if (chan.attnfunc == 3) { // spot
@@ -347,7 +332,7 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
         }
         else if (chan.attnfunc == 1) { // specular
             // donko - what is going on here?  655.36 is a guess but seems about right.
-            attn = (light->pos * (*norm0)) > -655.36 ? max(0.0f, (light->dir * (*norm0))) : 0;
+            attn = (light->pos * normal) > -655.36 ? max(0.0f, (light->dir * normal)) : 0;
             ldir.set(1.0f, attn, attn * attn);
 
             float cosAtt = light->cosatt * ldir;
@@ -361,14 +346,14 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    float difAttn = ldir * (*norm0);
+                    float difAttn = ldir * normal;
                     lightCol += light->color[0] * attn * difAttn;
                 }
                 break;
 
             case LIGHTDIF_CLAMP:
                 {
-                    float difAttn = max(0.0f, ldir * (*norm0));
+                    float difAttn = max(0.0f, ldir * normal);
                     lightCol += light->color[0] * attn * difAttn;
                 }
                 break;
@@ -472,14 +457,11 @@ void TransformTexCoord(const InputVertexData *src, OutputVertexData *dst, bool s
             break;
         case XF_TEXGEN_EMBOSS_MAP:
             {
-                const Vec3 *pos = (const Vec3*)dst->mvPosition;
-                const Vec3 *norm1 = (const Vec3*)dst->normal[1];
-                const Vec3 *norm2 = (const Vec3*)dst->normal[2];
                 const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*texinfo.embosslightshift];
 
-                Vec3 ldir = (light->pos - *pos).normalized();
-                float d1 = ldir * (*norm1);
-                float d2 = ldir * (*norm2);
+                Vec3 ldir = (light->pos - dst->mvPosition).normalized();
+                float d1 = ldir * dst->normal[1];
+                float d2 = ldir * dst->normal[2];
 
                 dst->texCoords[coordNum][0] = dst->texCoords[texinfo.embosssourceshift][0] + d1;
                 dst->texCoords[coordNum][1] = dst->texCoords[texinfo.embosssourceshift][1] + d2;
@@ -503,6 +485,9 @@ void TransformTexCoord(const InputVertexData *src, OutputVertexData *dst, bool s
         default:
             ERROR_LOG(VIDEO, "Bad tex gen type %i", texinfo.texgentype);            
         }
+
+		dst->texCoords[coordNum][0] *= (bpmem.texcoords[coordNum].s.scale_minus_1 + 1);
+		dst->texCoords[coordNum][1] *= (bpmem.texcoords[coordNum].t.scale_minus_1 + 1);
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
index d3d3f421c9..f50220aaad 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
@@ -24,32 +24,32 @@ namespace VertexFormatConverter
 {
     void LoadNormal1_Byte(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = (float)(s8)src[0] / 128;
-        dst->normal[0][1] = (float)(s8)src[1] / 128;
-        dst->normal[0][2] = (float)(s8)src[2] / 128;
+        dst->normal[0].x = (float)(s8)src[0] / 128;
+        dst->normal[0].y = (float)(s8)src[1] / 128;
+        dst->normal[0].z = (float)(s8)src[2] / 128;
     }
 
     void LoadNormal1_Short(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = (float)((s16*)src)[0] / 32768;
-        dst->normal[0][1] = (float)((s16*)src)[1] / 32768;
-        dst->normal[0][2] = (float)((s16*)src)[2] / 32768;
+        dst->normal[0].x = (float)((s16*)src)[0] / 32768;
+        dst->normal[0].y = (float)((s16*)src)[1] / 32768;
+        dst->normal[0].z = (float)((s16*)src)[2] / 32768;
     }
 
     void LoadNormal1_Float(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = ((float*)src)[0];
-        dst->normal[0][1] = ((float*)src)[1];
-        dst->normal[0][2] = ((float*)src)[2];
+        dst->normal[0].x = ((float*)src)[0];
+        dst->normal[0].y = ((float*)src)[1];
+        dst->normal[0].z = ((float*)src)[2];
     }
 
     void LoadNormal3_Byte(InputVertexData *dst, u8 *src)
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = (float)(s8)src[j + 0] / 128;
-            dst->normal[i][1] = (float)(s8)src[j + 1] / 128;
-            dst->normal[i][2] = (float)(s8)src[j + 2] / 128;
+            dst->normal[i].x = (float)(s8)src[j + 0] / 128;
+            dst->normal[i].y = (float)(s8)src[j + 1] / 128;
+            dst->normal[i].z = (float)(s8)src[j + 2] / 128;
         }
     }
 
@@ -57,9 +57,9 @@ namespace VertexFormatConverter
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = (float)((s16*)src)[j + 0] / 32768;
-            dst->normal[i][1] = (float)((s16*)src)[j + 1] / 32768;
-            dst->normal[i][2] = (float)((s16*)src)[j + 2] / 32768;
+            dst->normal[i].x = (float)((s16*)src)[j + 0] / 32768;
+            dst->normal[i].y = (float)((s16*)src)[j + 1] / 32768;
+            dst->normal[i].z = (float)((s16*)src)[j + 2] / 32768;
         }
     }
 
@@ -67,9 +67,9 @@ namespace VertexFormatConverter
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = ((float*)src)[j + 0];
-            dst->normal[i][1] = ((float*)src)[j + 1];
-            dst->normal[i][2] = ((float*)src)[j + 2];
+            dst->normal[i].x = ((float*)src)[j + 0];
+            dst->normal[i].y = ((float*)src)[j + 1];
+            dst->normal[i].z = ((float*)src)[j + 2];
         }
     }
 }