From b31634df8fa49f35646777973d2a73d64d0c9ec0 Mon Sep 17 00:00:00 2001
From: gabest11 <gabest11@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Sat, 12 Mar 2011 22:10:58 +0000
Subject: [PATCH] GSdx: using mipmap levels (only per batch, no tri-linear) and
 a couple of small changes, including the stdcall fix for linux.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4419 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp |   3 +-
 plugins/GSdx/GPURendererSW.cpp                |   6 +-
 plugins/GSdx/GS.h                             |   2 +-
 plugins/GSdx/GSDirtyRect.cpp                  |   4 +-
 plugins/GSdx/GSDrawScanline.cpp               |   2 +-
 plugins/GSdx/GSDrawScanlineCodeGenerator.cpp  |   3 -
 .../GSDrawScanlineCodeGenerator.x64.avx.cpp   |  11 +-
 .../GSDrawScanlineCodeGenerator.x86.avx.cpp   |  11 +-
 .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp  |   9 +-
 plugins/GSdx/GSLocalMemory.cpp                |   2 +-
 plugins/GSdx/GSLocalMemory.h                  |  18 +-
 plugins/GSdx/GSRasterizer.cpp                 |  83 +-
 plugins/GSdx/GSRasterizer.h                   |   2 +-
 plugins/GSdx/GSRenderer.cpp                   |  63 +-
 plugins/GSdx/GSRenderer.h                     |   4 +-
 plugins/GSdx/GSRendererDX.h                   |   2 +-
 plugins/GSdx/GSRendererHW.h                   |   2 +-
 plugins/GSdx/GSRendererSW.cpp                 | 164 +++-
 plugins/GSdx/GSState.cpp                      |  41 +
 plugins/GSdx/GSTextureCache.cpp               |   4 +-
 plugins/GSdx/GSTextureCacheSW.cpp             |  44 +-
 plugins/GSdx/GSTextureCacheSW.h               |   3 +-
 plugins/GSdx/GSVector.h                       | 824 +++++++++++++++---
 plugins/GSdx/GSVertexSW.h                     | 132 ++-
 plugins/GSdx/GSVertexTrace.cpp                |  53 ++
 plugins/GSdx/GSVertexTrace.h                  |  24 +-
 plugins/GSdx/stdafx.h                         |   2 +-
 27 files changed, 1178 insertions(+), 340 deletions(-)

diff --git a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp
index ed31b4c136..1e2dbfd629 100644
--- a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp
+++ b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp
@@ -128,9 +128,8 @@ void GPUDrawScanlineCodeGenerator::Init()
 	add(edi, edx);
 	lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);
 
-	// int steps = right - left - 8;
+	// int steps = pixels - 8;
 
-	sub(ecx, edx);
 	sub(ecx, 8);
 
 	if(m_sel.dtd)
diff --git a/plugins/GSdx/GPURendererSW.cpp b/plugins/GSdx/GPURendererSW.cpp
index f13d2e611d..97afc7de84 100644
--- a/plugins/GSdx/GPURendererSW.cpp
+++ b/plugins/GSdx/GPURendererSW.cpp
@@ -174,10 +174,10 @@ void GPURendererSW::VertexKick()
 	int x = (int)(m_v.XY.X + m_env.DROFF.X) << m_scale.x;
 	int y = (int)(m_v.XY.Y + m_env.DROFF.Y) << m_scale.y;
 
-	int s = m_v.UV.X;
-	int t = m_v.UV.Y;
+	int u = m_v.UV.X;
+	int v = m_v.UV.Y;
 
-	GSVector4 pt(x, y, s, t);
+	GSVector4 pt(x, y, u, v);
 
 	dst.p = pt.xyxy(GSVector4::zero());
 	dst.t = (pt.zwzw(GSVector4::zero()) + GSVector4(0.125f)) * 256.0f;
diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h
index 3169bf0880..ac32878bef 100644
--- a/plugins/GSdx/GS.h
+++ b/plugins/GSdx/GS.h
@@ -815,7 +815,7 @@ union
 	};
 };
 REG_END2
-	__forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6);}
+	__forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6) || (PSM == PSM_PSMT8 || PSM == PSM_PSMT4) && TBW == 1;}
 REG_END2
 
 REG64_(GIFReg, TEX1)
diff --git a/plugins/GSdx/GSDirtyRect.cpp b/plugins/GSdx/GSDirtyRect.cpp
index 0ce2017e17..bf4ede42f7 100644
--- a/plugins/GSdx/GSDirtyRect.cpp
+++ b/plugins/GSdx/GSDirtyRect.cpp
@@ -54,7 +54,7 @@ GSVector4i GSDirtyRect::GetDirtyRect(const GIFRegTEX0& TEX0)
 	}
 	else
 	{
-		r = GSVector4i(left, top, right, bottom).ralign<GSVector4i::Outside>(src);
+		r = GSVector4i(left, top, right, bottom).ralign<Align_Outside>(src);
 	}
 
 	return r;
@@ -77,7 +77,7 @@ GSVector4i GSDirtyRectList::GetDirtyRectAndClear(const GIFRegTEX0& TEX0, const G
 
 		GSVector2i bs = GSLocalMemory::m_psm[TEX0.PSM].bs;
 
-		return r.ralign<GSVector4i::Outside>(bs).rintersect(GSVector4i(0, 0, size.x, size.y));
+		return r.ralign<Align_Outside>(bs).rintersect(GSVector4i(0, 0, size.x, size.y));
 	}
 
 	return GSVector4i::zero();
diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp
index 04d8cb7ab9..ad995dedbf 100644
--- a/plugins/GSdx/GSDrawScanline.cpp
+++ b/plugins/GSdx/GSDrawScanline.cpp
@@ -189,7 +189,7 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col,
 
 	color = color.andnot(mask);
 
-	GSVector4i br = r.ralign<GSVector4i::Inside>(GSVector2i(8 * 4 / sizeof(T), 8));
+	GSVector4i br = r.ralign<Align_Inside>(GSVector2i(8 * 4 / sizeof(T), 8));
 
 	if(!br.rempty())
 	{
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
index dd3975bce7..8a1dbd85f2 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
@@ -19,9 +19,6 @@
  *
  */
 
-// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix)
-// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4)
-
 #include "stdafx.h"
 #include "GSDrawScanlineCodeGenerator.h"
 
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
index cd432f9e8d..bbaa210067 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp
@@ -224,10 +224,9 @@ void GSDrawScanlineCodeGenerator::Init()
 
 	sub(rbx, rdx);
 
-	// int steps = right - left - 4;
+	// int steps = pixels + skip - 4;
 
-	sub(rcx, rbx);
-	sub(rcx, 4);
+	lea(rcx, ptr[rcx + rdx - 4]);
 
 	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
 
@@ -478,6 +477,12 @@ void GSDrawScanlineCodeGenerator::Step()
 
 				vpaddw(xmm13, xmm1);
 				vpaddw(xmm14, xmm2);
+
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				vpxor(xmm0, xmm0);
+				vpmaxsw(xmm13, xmm0);
+				vpmaxsw(xmm14, xmm0);
 			}
 			else
 			{
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
index 1ff9c35bf1..ff75b94f9f 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
@@ -252,10 +252,9 @@ void GSDrawScanlineCodeGenerator::Init()
 
 	sub(ebx, edx);
 
-	// int steps = right - left - 4;
+	// int steps = pixels + skip - 4;
 
-	sub(ecx, ebx);
-	sub(ecx, 4);
+	lea(ecx, ptr[ecx + edx - 4]);
 
 	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
 
@@ -553,6 +552,12 @@ void GSDrawScanlineCodeGenerator::Step()
 				vpaddw(xmm5, ptr[&m_local.temp.rb]);
 				vpaddw(xmm6, ptr[&m_local.temp.ga]);
 
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				vpxor(xmm7, xmm7);
+				vpmaxsw(xmm5, xmm7);
+				vpmaxsw(xmm6, xmm7);
+
 				vmovdqa(ptr[&m_local.temp.rb], xmm5);
 				vmovdqa(ptr[&m_local.temp.ga], xmm6);
 			}
diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp
index 7586038513..ef3faa26d8 100644
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp
@@ -251,8 +251,7 @@ void GSDrawScanlineCodeGenerator::Init()
 
 	// int steps = right - left - 4;
 
-	sub(ecx, ebx);
-	sub(ecx, 4);
+	lea(ecx, ptr[ecx + edx - 4]);
 
 	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
 
@@ -555,6 +554,12 @@ void GSDrawScanlineCodeGenerator::Step()
 				paddw(xmm5, ptr[&m_local.temp.rb]);
 				paddw(xmm6, ptr[&m_local.temp.ga]);
 
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				pxor(xmm7, xmm7);
+				pmaxsw(xmm5, xmm7);
+				pmaxsw(xmm6, xmm7);
+
 				movdqa(ptr[&m_local.temp.rb], xmm5);
 				movdqa(ptr[&m_local.temp.ga], xmm6);
 			}
diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp
index a0fe87fe6d..8df9b14164 100644
--- a/plugins/GSdx/GSLocalMemory.cpp
+++ b/plugins/GSdx/GSLocalMemory.cpp
@@ -1708,7 +1708,7 @@ void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r,
 		TEX0.TBW = o->bw;
 		TEX0.PSM = o->psm;
 
-		GSVector4i cr = r.ralign<GSVector4i::Inside>(psm.bs);
+		GSVector4i cr = r.ralign<Align_Inside>(psm.bs);
 
 		bool aligned = ((size_t)(dst + (cr.left - r.left) * sizeof(uint32)) & 0xf) == 0;
 
diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h
index c6cf33dd9d..1bd4943e71 100644
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@@ -173,14 +173,14 @@ public:
 
 	static uint32 BlockNumber8(int x, int y, uint32 bp, uint32 bw)
 	{
-		ASSERT((bw & 1) == 0);
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
 
 		return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
 	}
 
 	static uint32 BlockNumber4(int x, int y, uint32 bp, uint32 bw)
 	{
-		ASSERT((bw & 1) == 0);
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
 
 		return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
 	}
@@ -291,6 +291,7 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6);
 		uint32 word = (page << 11) + pageOffset32[bp & 0x1f][y & 0x1f][x & 0x3f];
+
 		return word;
 	}
 
@@ -298,6 +299,7 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
 		uint32 word = (page << 12) + pageOffset16[bp & 0x1f][y & 0x3f][x & 0x3f];
+
 		return word;
 	}
 
@@ -305,22 +307,27 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
 		uint32 word = (page << 12) + pageOffset16S[bp & 0x1f][y & 0x3f][x & 0x3f];
+
 		return word;
 	}
 
 	static __forceinline uint32 PixelAddress8(int x, int y, uint32 bp, uint32 bw)
 	{
-		ASSERT((bw & 1) == 0);
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
 		uint32 page = (bp >> 5) + (y >> 6) * (bw >> 1) + (x >> 7);
 		uint32 word = (page << 13) + pageOffset8[bp & 0x1f][y & 0x3f][x & 0x7f];
+
 		return word;
 	}
 
 	static __forceinline uint32 PixelAddress4(int x, int y, uint32 bp, uint32 bw)
 	{
-		ASSERT((bw & 1) == 0);
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
 		uint32 page = (bp >> 5) + (y >> 7) * (bw >> 1) + (x >> 7);
 		uint32 word = (page << 14) + pageOffset4[bp & 0x1f][y & 0x7f][x & 0x7f];
+
 		return word;
 	}
 
@@ -328,6 +335,7 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6);
 		uint32 word = (page << 11) + pageOffset32Z[bp & 0x1f][y & 0x1f][x & 0x3f];
+
 		return word;
 	}
 
@@ -335,6 +343,7 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
 		uint32 word = (page << 12) + pageOffset16Z[bp & 0x1f][y & 0x3f][x & 0x3f];
+
 		return word;
 	}
 
@@ -342,6 +351,7 @@ public:
 	{
 		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
 		uint32 word = (page << 12) + pageOffset16SZ[bp & 0x1f][y & 0x3f][x & 0x3f];
+
 		return word;
 	}
 
diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp
index 89c5333377..a457e14843 100644
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@@ -129,7 +129,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v)
 
 			m_ds->SetupPrim(v, *v);
 
-			m_ds->DrawScanline(p.x + 1, p.x, p.y, *v);
+			m_ds->DrawScanline(1, p.x, p.y, *v);
 		}
 	}
 }
@@ -144,16 +144,10 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
 
 	if(m_ds->IsEdge())
 	{
-		GSVertexSW dscan;
-
-		dscan.p = GSVector4::zero();
-		dscan.t = GSVector4::zero();
-		dscan.c = GSVector4::zero();
-
 		DrawEdge(v[0], v[1], dv, i, 0);
 		DrawEdge(v[0], v[1], dv, i, 1);
 
-		Flush(v, dscan, true);
+		Flush(v, GSVertexSW::zero(), true);
 
 		return;
 	}
@@ -170,13 +164,13 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
 
 			GSVertexSW l, dl;
 
-			l.p = v[0].p.blend8(v[1].p, mask);
-			l.t = v[0].t.blend8(v[1].t, mask);
-			l.c = v[0].c.blend8(v[1].c, mask);
+			l.p = v[0].p.blend32(v[1].p, mask);
+			l.t = v[0].t.blend32(v[1].t, mask);
+			l.c = v[0].c.blend32(v[1].c, mask);
 
 			GSVector4 r;
 
-			r = v[1].p.blend8(v[0].p, mask);
+			r = v[1].p.blend32(v[0].p, mask);
 
 			GSVector4i p(l.p);
 
@@ -216,7 +210,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
 
 					e->p.i16[0] = (int16)p.x;
 					e->p.i16[1] = (int16)p.y;
-					e->p.i16[2] = (int16)(p.x + 1);
+					e->p.i16[2] = 1;
 
 					e++;
 				}
@@ -231,13 +225,7 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
 
 		m_stats.pixels += m_edge.count;
 
-		GSVertexSW dscan;
-
-		dscan.p = GSVector4::zero();
-		dscan.t = GSVector4::zero();
-		dscan.c = GSVector4::zero();
-
-		Flush(v, dscan);
+		Flush(v, GSVertexSW::zero());
 	}
 }
 
@@ -278,8 +266,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
 	int i = (aabb == bccb).mask() & 7;
 
 	GSVector4 tbf = aabb.xzxz(bccb).ceil();
-	GSVector4 tbmax = tbf.max(m_fscissor.yyyy());
-	GSVector4 tbmin = tbf.min(m_fscissor.wwww());
+	GSVector4 tbmax = tbf.max(m_fscissor.ywyw());
+	GSVector4 tbmin = tbf.min(m_fscissor.ywyw());
 	GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin));
 
 	dv[0] = v[1] - v[0];
@@ -335,13 +323,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
 		DrawEdge(v[0], v[2], dv[1], i & 2, j & 2);
 		DrawEdge(v[1], v[2], dv[2], i & 4, j & 4);
 
-		GSVertexSW dscan;
-
-		dscan.p = GSVector4::zero();
-		dscan.t = GSVector4::zero();
-		dscan.c = GSVector4::zero();
-
-		Flush(v, dscan, true);
+		Flush(v, GSVertexSW::zero(), true);
 	}
 
 	switch(i)
@@ -365,6 +347,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
 
 		if(tb.y < tb.w)
 		{
+			// TODO: j == 1 (x2 < x3 < x0 < x1)
+			// v[3] isn't accurate enough, it may leave gaps horizontally if it happens to be on the left side of the triangle
+			// example: previous triangle's scanline ends on 48.9999, this one's starts from 49.0001, the pixel at 49 isn't drawn
+
 			GSVertexSW l = v[1 + (1 << j)];
 			GSVertexSW dl = ddv[2 - j];
 
@@ -436,8 +422,8 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
 		if(IsOneOfMyScanlines(top))
 		{
 			GSVector4 lrf = l.p.ceil();
-			GSVector4 lrmax = lrf.max(m_fscissor.xxxx());
-			GSVector4 lrmin = lrf.min(m_fscissor.zzzz());
+			GSVector4 lrmax = lrf.max(m_fscissor.xzxz());
+			GSVector4 lrmin = lrf.min(m_fscissor.xzxz());
 			GSVector4i lr = GSVector4i(lrmax.xxyy(lrmin));
 
 			int left = lr.extract32<0>();
@@ -453,7 +439,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
 
 				e->p.i16[0] = (int16)left;
 				e->p.i16[1] = (int16)top;
-				e->p.i16[2] = (int16)right;
+				e->p.i16[2] = (int16)pixels;
 
 				e++;
 			}
@@ -473,12 +459,12 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
 
 	GSVector4 mask = (vertices[0].p < vertices[1].p).xyzw(GSVector4::zero());
 
-	v[0].p = vertices[1].p.blend8(vertices[0].p, mask);
-	v[0].t = vertices[1].t.blend8(vertices[0].t, mask);
+	v[0].p = vertices[1].p.blend32(vertices[0].p, mask);
+	v[0].t = vertices[1].t.blend32(vertices[0].t, mask);
 	v[0].c = vertices[1].c;
 
-	v[1].p = vertices[0].p.blend8(vertices[1].p, mask);
-	v[1].t = vertices[0].t.blend8(vertices[1].t, mask);
+	v[1].p = vertices[0].p.blend32(vertices[1].p, mask);
+	v[1].t = vertices[0].t.blend32(vertices[1].t, mask);
 
 	GSVector4i r(v[0].p.xyxy(v[1].p).ceil());
 
@@ -500,18 +486,13 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
 		return;
 	}
 
-	GSVector4 zero = GSVector4::zero();
-
-	GSVertexSW dedge, dscan;
-
-	dedge.p = zero;
-	dscan.p = zero;
-
-	dedge.c = zero;
-	dscan.c = zero;
+	GSVertexSW dedge = GSVertexSW::zero();
+	GSVertexSW dscan = GSVertexSW::zero();
 
 	GSVertexSW dv = v[1] - v[0];
 
+	GSVector4 zero = GSVector4::zero();
+
 	dedge.t = (dv.t / dv.p.yyyy()).xyxy(zero).wyww();
 	dscan.t = (dv.t / dv.p.xxxx()).xyxy(zero).xwww();
 
@@ -526,7 +507,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices)
 		{
 			m_stats.pixels += r.width();
 
-			m_ds->DrawScanline(r.right, r.left, r.top, scan);
+			m_ds->DrawScanline(r.width(), r.left, r.top, scan);
 		}
 
 		if(++r.top >= r.bottom) break;
@@ -555,7 +536,6 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 	{
 		GSVector4 tbmax = lrtb.max(m_fscissor.yyyy());
 		GSVector4 tbmin = lrtb.min(m_fscissor.wwww());
-
 		GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin));
 
 		int top, bottom;
@@ -609,7 +589,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 
 					e->p.i16[0] = (int16)xi;
 					e->p.i16[1] = (int16)top;
-					e->p.i16[2] = (int16)(xi + 1);
+					e->p.i16[2] = 1;
 
 					e++;
 				}
@@ -637,7 +617,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 
 					e->p.i16[0] = (int16)xi;
 					e->p.i16[1] = (int16)top;
-					e->p.i16[2] = (int16)(xi + 1);
+					e->p.i16[2] = 1;
 
 					e++;
 				}
@@ -653,7 +633,6 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 	{
 		GSVector4 lrmax = lrtb.max(m_fscissor.xxxx());
 		GSVector4 lrmin = lrtb.min(m_fscissor.zzzz());
-
 		GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin));
 
 		int left, right;
@@ -707,7 +686,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 
 					e->p.i16[0] = (int16)left;
 					e->p.i16[1] = (int16)yi;
-					e->p.i16[2] = (int16)(left + 1);
+					e->p.i16[2] = 1;
 
 					e++;
 				}
@@ -735,7 +714,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
 
 					e->p.i16[0] = (int16)left;
 					e->p.i16[1] = (int16)yi;
-					e->p.i16[2] = (int16)(left + 1);
+					e->p.i16[2] = 1;
 
 					e++;
 				}
diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h
index 308bda8b21..dfb7725621 100644
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@@ -42,7 +42,7 @@ class IDrawScanline : public GSAlignedClass<32>
 {
 public:
 	typedef void (__fastcall *SetupPrimPtr)(const GSVertexSW* vertices, const GSVertexSW& dscan);
-	typedef void (__fastcall *DrawScanlinePtr)(int right, int left, int top, const GSVertexSW& scan);
+	typedef void (__fastcall *DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan);
 	typedef void (IDrawScanline::*DrawRectPtr)(const GSVector4i& r, const GSVertexSW& v); // TODO: jit
 
 protected:
diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp
index 2aad4d892e..99ec01710a 100644
--- a/plugins/GSdx/GSRenderer.cpp
+++ b/plugins/GSdx/GSRenderer.cpp
@@ -35,6 +35,7 @@ GSRenderer::GSRenderer()
 	m_filter = theApp.GetConfig("filter", 1);
 	m_vsync = !!theApp.GetConfig("vsync", 0);
 	m_aa1 = !!theApp.GetConfig("aa1", 0);
+	m_mipmap = !!theApp.GetConfig("mipmap", 1);
 
 	s_n = 0;
 	s_dump = !!theApp.GetConfig("dump", 0);
@@ -513,6 +514,9 @@ void GSRenderer::KeyEvent(GSKeyEventData* e)
 		case VK_DELETE:
 			m_aa1 = !m_aa1;
 			return;
+		case VK_INSERT:
+			m_mipmap = !m_mipmap;
+			return;
 		}
 
 		#else
@@ -523,25 +527,23 @@ void GSRenderer::KeyEvent(GSKeyEventData* e)
 	}
 }
 
-void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear)
+void GSRenderer::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear)
 {
-	const GSDrawingContext* context = m_context;
-
-	int tw = context->TEX0.TW;
-	int th = context->TEX0.TH;
+	int tw = TEX0.TW;
+	int th = TEX0.TH;
 
 	int w = 1 << tw;
 	int h = 1 << th;
 
 	GSVector4i tr(0, 0, w, h);
 
-	int wms = context->CLAMP.WMS;
-	int wmt = context->CLAMP.WMT;
+	int wms = CLAMP.WMS;
+	int wmt = CLAMP.WMT;
 
-	int minu = (int)context->CLAMP.MINU;
-	int minv = (int)context->CLAMP.MINV;
-	int maxu = (int)context->CLAMP.MAXU;
-	int maxv = (int)context->CLAMP.MAXV;
+	int minu = (int)CLAMP.MINU;
+	int minv = (int)CLAMP.MINV;
+	int maxu = (int)CLAMP.MAXU;
+	int maxv = (int)CLAMP.MAXV;
 
 	GSVector4i vr = tr;
 
@@ -619,7 +621,7 @@ void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear)
 			if(vr.x < uv.x) vr.x = uv.x;
 			if(vr.z > uv.z + 1) vr.z = uv.z + 1;
 			break;
-		case CLAMP_REGION_REPEAT: // TODO
+		case CLAMP_REGION_REPEAT:
 			break;
 		default:
 			__assume(0);
@@ -635,9 +637,7 @@ void GSRenderer::GetTextureMinMax(GSVector4i& r, bool linear)
 			if(vr.y < uv.y) vr.y = uv.y;
 			if(vr.w > uv.w + 1) vr.w = uv.w + 1;
 			break;
-		case CLAMP_REGION_REPEAT: // TODO
-			//Xenosaga 2 and 3 use it
-			//printf("gsdx: CLAMP_REGION_REPEAT not implemented, please report\n");
+		case CLAMP_REGION_REPEAT:
 			break;
 		default:
 			__assume(0);
@@ -791,39 +791,6 @@ bool GSRenderer::TryAlphaTest(uint32& fm, uint32& zm)
 	return true;
 }
 
-bool GSRenderer::IsLinear()
-{
-	const GIFRegTEX1& TEX1 = m_context->TEX1;
-
-	bool mmin = TEX1.IsMinLinear();
-	bool mmag = TEX1.IsMagLinear();
-
-	if(mmag == mmin || TEX1.MXL == 0) // MXL == 0 => MMIN ignored, tested it on ps2
-	{
-		return mmag;
-	}
-
-	// if FST => assume Q = 1.0f (should not, but Q is very often bogus, 0 or DEN)
-	// Fixme : Why should Q be bogus? (it used to be - Gabest)
-
-	if(!TEX1.LCM && !PRIM->FST)
-	{
-		float K = (float)TEX1.K / 16;
-		float f = (float)(1 << TEX1.L) / log(2.0f);
-
-		// TODO: abs(Qmin) may not be <= abs(Qmax), check the sign
-
-		float LODmin = K + log(1.0f / fabs(m_vt.m_max.t.z)) * f;
-		float LODmax = K + log(1.0f / fabs(m_vt.m_min.t.z)) * f;
-
-		return LODmax <= 0 ? mmag : LODmin > 0 ? mmin : mmag || mmin;
-	}
-	else
-	{
-		return TEX1.K <= 0 ? mmag : TEX1.K > 0 ? mmin : mmag || mmin;
-	}
-}
-
 bool GSRenderer::IsOpaque()
 {
 	if(PRIM->AA1)
diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h
index d7659928c9..975938868b 100644
--- a/plugins/GSdx/GSRenderer.h
+++ b/plugins/GSdx/GSRenderer.h
@@ -43,6 +43,7 @@ protected:
 	int m_filter;
 	bool m_vsync;
 	bool m_aa1;
+	bool m_mipmap;
 	bool m_framelimit;
 
 	virtual GSTexture* GetOutput(int i) = 0;
@@ -51,10 +52,9 @@ protected:
 
 	// following functions need m_vt to be initialized
 
-	void GetTextureMinMax(GSVector4i& r, bool linear);
+	void GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear);
 	void GetAlphaMinMax();
 	bool TryAlphaTest(uint32& fm, uint32& zm);
-	bool IsLinear();
 	bool IsOpaque();
 
 public:
diff --git a/plugins/GSdx/GSRendererDX.h b/plugins/GSdx/GSRendererDX.h
index fc794cc3f4..9534570ae3 100644
--- a/plugins/GSdx/GSRendererDX.h
+++ b/plugins/GSdx/GSRendererDX.h
@@ -296,7 +296,7 @@ public:
 			ps_sel.aem = env.TEXA.AEM;
 			ps_sel.tfx = context->TEX0.TFX;
 			ps_sel.tcc = context->TEX0.TCC;
-			ps_sel.ltf = m_filter == 2 ? IsLinear() : m_filter;
+			ps_sel.ltf = m_filter == 2 ? m_vt.IsLinear() : m_filter;
 			ps_sel.rt = tex->m_target;
 
 			int w = tex->m_texture->GetWidth();
diff --git a/plugins/GSdx/GSRendererHW.h b/plugins/GSdx/GSRendererHW.h
index 561ac01734..4bb4a5766d 100644
--- a/plugins/GSdx/GSRendererHW.h
+++ b/plugins/GSdx/GSRendererHW.h
@@ -566,7 +566,7 @@ protected:
 
 			GSVector4i r;
 
-			GetTextureMinMax(r, IsLinear());
+			GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
 
 			tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
 
diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp
index 9b1c39a6d8..cb07b8600a 100644
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@@ -102,7 +102,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
 
 		const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
 
-		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<GSVector4i::Outside>(psm.bs), m_output, pitch, m_env.TEXA);
+		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
 
 		m_texture[i]->Update(r, m_output, pitch);
 
@@ -136,7 +136,7 @@ void GSRendererSW::Draw()
 		return;
 	}
 
-	if(s_dump)
+	if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
 	{
 		uint64 frame = m_perfmon.GetFrame();
 
@@ -204,7 +204,7 @@ void GSRendererSW::Draw()
 	m_perfmon.Put(GSPerfMon::Prim, stats.prims);
 	m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
 
-	if(s_dump)
+	if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
 	{
 		uint64 frame = m_perfmon.GetFrame();
 
@@ -324,7 +324,7 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 			gd.sel.tfx = context->TEX0.TFX;
 			gd.sel.tcc = context->TEX0.TCC;
 			gd.sel.fst = PRIM->FST;
-			gd.sel.ltf = IsLinear();
+			gd.sel.ltf = m_vt.IsLinear();
 			gd.sel.tlu = GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0;
 			gd.sel.wms = context->CLAMP.WMS;
 			gd.sel.wmt = context->CLAMP.WMT;
@@ -370,6 +370,134 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 				}
 			}
 
+			GIFRegTEX0 MIP_TEX0 = context->TEX0;
+			GIFRegCLAMP MIP_CLAMP = context->CLAMP;
+
+			if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
+			{
+				int level = (int)(m_vt.m_lod.x + 0.5f);
+
+				// FIXME: onimusa 3
+
+				level = std::min<int>(level, context->TEX1.MXL); 
+				level = std::min<int>(level, 6);
+
+				if(level > 0)
+				{
+					// printf("lvl %d\n", level);
+
+					switch(level)
+					{
+					case 1: 
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP1; 
+						MIP_TEX0.TBW = context->MIPTBP1.TBW1; 
+						break;
+					case 2: 
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP2; 
+						MIP_TEX0.TBW = context->MIPTBP1.TBW2; 
+						break;
+					case 3: 
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP3; 
+						MIP_TEX0.TBW = context->MIPTBP1.TBW3; 
+						break;
+					case 4: 
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP4; 
+						MIP_TEX0.TBW = context->MIPTBP2.TBW4; 
+						break;
+					case 5: 
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP5; 
+						MIP_TEX0.TBW = context->MIPTBP2.TBW5; 
+						break;
+					case 6: 
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP6; 
+						MIP_TEX0.TBW = context->MIPTBP2.TBW6; 
+						break;
+					default:
+						__assume(0);
+					}
+
+					ASSERT(MIP_TEX0.TBP0 != 0 && MIP_TEX0.TBW != 0);
+				
+					int tw = (int)MIP_TEX0.TW - level;
+					int th = (int)MIP_TEX0.TH - level;
+
+					switch(context->TEX1.MMIN)
+					{
+					case 2: case 3: // point (min size 1)
+						tw = std::max<int>(tw, 0);
+						th = std::max<int>(th, 0);
+						break; 
+					case 4: case 5: // linear (min size 8)
+						tw = std::max<int>(tw, 3);
+						th = std::max<int>(th, 3);
+						break; 
+					default:
+						__assume(0);
+					}
+
+					// scale down the texture coordinates, including vertex trace
+
+					GSVector4 scale = GSVector4(1.0f) / GSVector4(1 << ((int)MIP_TEX0.TW - tw), 1 << ((int)MIP_TEX0.TH - th), 1, 1);
+
+					GSVertexSW* v = m_vertices;
+
+					for(int i = 0, j = m_count; i < j; i++)
+					{
+						v[i].t *= scale;
+					}
+
+					m_vt.m_min.t *= scale;
+					m_vt.m_max.t *= scale;
+
+					MIP_TEX0.TW = (uint32)tw;
+					MIP_TEX0.TH = (uint32)th;
+
+					// this shift is done even for repeat modes
+
+					MIP_CLAMP.MINU >>= level;
+					MIP_CLAMP.MAXU >>= level;
+					MIP_CLAMP.MINV >>= level;
+					MIP_CLAMP.MAXV >>= level;
+/*
+					printf("%d%d%d%d%d L %d K %03x %.2f lod %.2f %.2f q %f %f\n", 
+						m_context->TEX1.MXL, 
+						m_context->TEX1.MMAG,
+						m_context->TEX1.MMIN,
+						PRIM->FST,
+						m_context->TEX1.LCM,
+						m_context->TEX1.L,
+						m_context->TEX1.K,
+						(float)m_context->TEX1.K / 16,
+						m_context->TEX1.MXL > 0 ? m_vt.m_lod.x : 0, 
+						m_context->TEX1.MXL > 0 ? m_vt.m_lod.y : 0,
+						1.0f / m_vt.m_min.t.z,
+						1.0f / m_vt.m_max.t.z);
+*/
+					if(s_dump)
+					{
+						uint64 frame = m_perfmon.GetFrame();
+
+						string s;
+
+						if(s_save && s_n >= s_saven)
+						{
+							s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d_(%d%d%d%d%d %.2f %.2f).bmp", 
+								s_n, frame, (int)MIP_TEX0.TBP0, (int)MIP_TEX0.PSM,
+								m_context->TEX1.MXL, 
+								m_context->TEX1.MMAG,
+								m_vt.m_filter.mmag, 
+								m_context->TEX1.MMIN,
+								m_vt.m_filter.mmin, 
+								m_context->TEX1.MXL > 0 ? m_vt.m_lod.x : 0, 
+								m_context->TEX1.MXL > 0 ? m_vt.m_lod.y : 0
+								);
+
+							m_mem.SaveBMP(s, MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM, 1 << MIP_TEX0.TW, 1 << MIP_TEX0.TH);
+						}
+					}
+				}
+			}
+
 			if(gd.sel.ltf)
 			{
 				if(gd.sel.fst)
@@ -389,9 +517,9 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 
 			GSVector4i r;
 
-			GetTextureMinMax(r, gd.sel.ltf);
+			GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf);
 
-			const GSTextureCacheSW::GSTexture* t = m_tc->Lookup(context->TEX0, env.TEXA, r);
+			const GSTextureCacheSW::GSTexture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, r);
 
 			if(!t) {ASSERT(0); return;}
 
@@ -400,10 +528,10 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 
 			gd.sel.tw = t->m_tw - 3;
 
-			uint16 tw = (uint16)(1 << context->TEX0.TW);
-			uint16 th = (uint16)(1 << context->TEX0.TH);
+			uint16 tw = (uint16)(1 << MIP_TEX0.TW);
+			uint16 th = (uint16)(1 << MIP_TEX0.TH);
 
-			switch(context->CLAMP.WMS)
+			switch(MIP_CLAMP.WMS)
 			{
 			case CLAMP_REPEAT:
 				gd.t.min.u16[0] = tw - 1;
@@ -416,20 +544,20 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 				gd.t.mask.u32[0] = 0;
 				break;
 			case CLAMP_REGION_CLAMP:
-				gd.t.min.u16[0] = std::min<int>(context->CLAMP.MINU, tw - 1);
-				gd.t.max.u16[0] = std::min<int>(context->CLAMP.MAXU, tw - 1);
+				gd.t.min.u16[0] = std::min<int>(MIP_CLAMP.MINU, tw - 1);
+				gd.t.max.u16[0] = std::min<int>(MIP_CLAMP.MAXU, tw - 1);
 				gd.t.mask.u32[0] = 0;
 				break;
 			case CLAMP_REGION_REPEAT:
-				gd.t.min.u16[0] = context->CLAMP.MINU;
-				gd.t.max.u16[0] = context->CLAMP.MAXU;
+				gd.t.min.u16[0] = MIP_CLAMP.MINU;
+				gd.t.max.u16[0] = MIP_CLAMP.MAXU;
 				gd.t.mask.u32[0] = 0xffffffff;
 				break;
 			default:
 				__assume(0);
 			}
 
-			switch(context->CLAMP.WMT)
+			switch(MIP_CLAMP.WMT)
 			{
 			case CLAMP_REPEAT:
 				gd.t.min.u16[4] = th - 1;
@@ -442,13 +570,13 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
 				gd.t.mask.u32[2] = 0;
 				break;
 			case CLAMP_REGION_CLAMP:
-				gd.t.min.u16[4] = std::min<int>(context->CLAMP.MINV, th - 1);
-				gd.t.max.u16[4] = std::min<int>(context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
+				gd.t.min.u16[4] = std::min<int>(MIP_CLAMP.MINV, th - 1);
+				gd.t.max.u16[4] = std::min<int>(MIP_CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
 				gd.t.mask.u32[2] = 0;
 				break;
 			case CLAMP_REGION_REPEAT:
-				gd.t.min.u16[4] = context->CLAMP.MINV;
-				gd.t.max.u16[4] = context->CLAMP.MAXV;
+				gd.t.min.u16[4] = MIP_CLAMP.MINV;
+				gd.t.max.u16[4] = MIP_CLAMP.MAXV;
 				gd.t.mask.u32[2] = 0xffffffff;
 				break;
 			default:
diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp
index ab8ac4d87e..019a228cd4 100644
--- a/plugins/GSdx/GSState.cpp
+++ b/plugins/GSdx/GSState.cpp
@@ -587,6 +587,47 @@ template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* r)
 	if(TEX0.TH > 10) TEX0.TH = 10;
 
 	ApplyTEX0(i, TEX0);
+
+	if(m_env.CTXT[i].TEX1.MTBA)
+	{
+		uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp;
+
+		uint32 tbp = TEX0.TBP0;
+		uint32 tbw = TEX0.TBW;
+		uint32 th = TEX0.TH;
+
+		if(th >= 3)
+		{
+			tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8;
+			tbw = std::max<uint32>(tbw >> 1, 1);
+			th--;
+
+			m_env.CTXT[i].MIPTBP1.TBP1 = tbp;
+			m_env.CTXT[i].MIPTBP1.TBW1 = tbw;
+
+			tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8;
+			tbw = std::max<uint32>(tbw >> 1, 1);
+			th--;
+
+			m_env.CTXT[i].MIPTBP1.TBP2 = tbp;
+			m_env.CTXT[i].MIPTBP1.TBW2 = tbw;
+
+			tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8;
+			tbw = std::max<uint32>(tbw >> 1, 1);
+			th--;
+
+			m_env.CTXT[i].MIPTBP1.TBP3 = tbp;
+			m_env.CTXT[i].MIPTBP1.TBW3 = tbw;
+
+			// NOTE: TEX1.MXL must not be automatically set to 3 here
+		}
+		else
+		{
+			ASSERT(0);
+		}
+
+		// printf("MTBA\n");
+	}
 }
 
 template<int i> void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp
index 84b1c43ec1..209de07871 100644
--- a/plugins/GSdx/GSTextureCache.cpp
+++ b/plugins/GSdx/GSTextureCache.cpp
@@ -292,7 +292,7 @@ void GSTextureCache::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rec
 
 	GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
 
-	GSVector4i r = rect.ralign<GSVector4i::Outside>(bs);
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
 
 	if(!target)
 	{
@@ -881,7 +881,7 @@ void GSTextureCache::Source::Update(const GIFRegTEX0& TEX0, const GIFRegTEXA& TE
 	int tw = std::max<int>(1 << m_TEX0.TW, bs.x);
 	int th = std::max<int>(1 << m_TEX0.TH, bs.y);
 
-	GSVector4i r = rect.ralign<GSVector4i::Outside>(bs);
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
 
 	if(r.eq(GSVector4i(0, 0, tw, th)))
 	{
diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp
index 52af9e756e..19cf4dbaf0 100644
--- a/plugins/GSdx/GSTextureCacheSW.cpp
+++ b/plugins/GSdx/GSTextureCacheSW.cpp
@@ -66,12 +66,12 @@ const GSTextureCacheSW::GSTexture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TE
 
 	if(t == NULL)
 	{
-		t = new GSTexture(m_state);
+		const GSOffset* o = m_state->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+		t = new GSTexture(m_state, o);
 
 		m_textures.insert(t);
 
-		const GSOffset* o = m_state->m_context->offset.tex;
-
 		GSVector2i bs = (TEX0.TBP0 & 31) == 0 ? psm.pgs : psm.bs;
 
 		int tw = 1 << TEX0.TW;
@@ -132,7 +132,7 @@ void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
 
 	GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
 
-	GSVector4i r = rect.ralign<GSVector4i::Outside>(bs);
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
 
 	for(int y = r.top; y < r.bottom; y += bs.y)
 	{
@@ -209,8 +209,9 @@ void GSTextureCacheSW::IncAge()
 
 //
 
-GSTextureCacheSW::GSTexture::GSTexture(GSState* state)
+GSTextureCacheSW::GSTexture::GSTexture(GSState* state, const GSOffset* offset)
 	: m_state(state)
+	, m_offset(offset)
 	, m_buff(NULL)
 	, m_tw(0)
 	, m_age(0)
@@ -241,10 +242,25 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
 
 	GSVector2i bs = psm.bs;
 
+	int shift = psm.pal == 0 ? 2 : 0;
+
 	int tw = std::max<int>(1 << TEX0.TW, bs.x);
 	int th = std::max<int>(1 << TEX0.TH, bs.y);
 
-	GSVector4i r = rect.ralign<GSVector4i::Outside>(bs);
+	GSVector4i r = rect;
+
+	bool repeating = m_TEX0.IsRepeating();
+
+	if(m_TEX0.TBW == 1) // repeating)
+	{
+		// FIXME: 
+		// - marking a block prevents fetching it again to a different part of the texture
+		// - only a real issue for TBW = 1 mipmap levels, where the repeating part is below and often exploited
+
+		r = GSVector4i(0, 0, tw, th);
+	}
+
+	r = r.ralign<Align_Outside>(bs);
 
 	if(r.eq(GSVector4i(0, 0, tw, th)))
 	{
@@ -260,20 +276,20 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
 			return false;
 		}
 
-		m_tw = std::max<int>(TEX0.TW, psm.pal > 0 ? 5 : 3); // makes one row 32 bytes at least, matches the smallest block size that is allocated above for m_buff
+#ifdef DEBUG
+		for(uint32 i = 0, j = tw * th * sizeof(uint8); i < j; i++) ((uint8*)m_buff)[i] = 0xff;
+#endif
+
+		m_tw = std::max<int>(TEX0.TW, 5 - shift); // makes one row 32 bytes at least, matches the smallest block size that is allocated above for m_buff
 	}
 
 	GSLocalMemory& mem = m_state->m_mem;
 
-	const GSOffset* o = m_state->m_context->offset.tex;
-
-	bool repeating = m_TEX0.IsRepeating();
+	const GSOffset* RESTRICT o = m_offset;
 
 	uint32 blocks = 0;
 
-	GSLocalMemory::readTextureBlock rtxb = psm.rtxbP;
-
-	int shift = psm.pal == 0 ? 2 : 0;
+	GSLocalMemory::readTextureBlock rtxbP = psm.rtxbP;
 
 	uint32 pitch = (1 << m_tw) << shift;
 
@@ -299,7 +315,7 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
 						m_valid[row] |= col;
 					}
 
-					(mem.*rtxb)(block, &dst[x << shift], pitch, TEXA);
+					(mem.*rtxbP)(block, &dst[x << shift], pitch, TEXA);
 
 					blocks++;
 				}
diff --git a/plugins/GSdx/GSTextureCacheSW.h b/plugins/GSdx/GSTextureCacheSW.h
index fd2a1c031f..3ab4bc9760 100644
--- a/plugins/GSdx/GSTextureCacheSW.h
+++ b/plugins/GSdx/GSTextureCacheSW.h
@@ -30,6 +30,7 @@ public:
 	{
 	public:
 		GSState* m_state;
+		const GSOffset* m_offset;
 		GIFRegTEX0 m_TEX0;
 		GIFRegTEXA m_TEXA;
 		void* m_buff;
@@ -38,7 +39,7 @@ public:
 		uint32 m_age;
 		bool m_complete;
 
-		explicit GSTexture(GSState* state);
+		explicit GSTexture(GSState* state, const GSOffset* offset);
 		virtual ~GSTexture();
 
 		bool Update(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r);
diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h
index 5d94535ccc..f5bc6af07b 100644
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
@@ -3,7 +3,21 @@
 
 #pragma once
 
-// NOTE: x64 version of the _mm_set_* functions are terrible, first they store components into memory then reload in one piece (VS2008 SP1)
+enum Align_Mode 
+{
+	Align_Outside, 
+	Align_Inside, 
+	Align_NegInf, 
+	Align_PosInf
+};
+
+enum Round_Mode 
+{
+	Round_NearestInt = 8, 
+	Round_NegInf = 9, 
+	Round_PosInf = 10, 
+	Round_Truncate = 11
+};
 
 #pragma pack(push, 1)
 
@@ -196,8 +210,6 @@ public:
 		return sat_i32(a);
 	}
 
-	enum RoundMode {Outside, Inside, NegInf, PosInf};
-
 	template<int mode> __forceinline GSVector4i ralign(const GSVector2i& a) const
 	{
 		// a must be 1 << n
@@ -208,10 +220,10 @@ public:
 
 		switch(mode)
 		{
-		case Inside: v = *this + mask; break;
-		case Outside: v = *this + mask.zwxy(); break;
-		case NegInf: v = *this; break;
-		case PosInf: v = *this + mask.zwzw(); break;
+		case Align_Inside: v = *this + mask; break;
+		case Align_Outside: v = *this + mask.zwxy(); break;
+		case Align_NegInf: v = *this; break;
+		case Align_PosInf: v = *this + mask.zwzw(); break;
 		default: ASSERT(0); break;
 		}
 
@@ -1029,15 +1041,19 @@ public:
 
 	__forceinline bool alltrue() const
 	{
-		return _mm_movemask_epi8(m) == 0xffff;
+		return mask() == 0xffff;
 	}
 
 	__forceinline bool allfalse() const
 	{
 		#if _M_SSE >= 0x401
+
 		return _mm_testz_si128(m, m) != 0;
+
 		#else
-		return _mm_movemask_epi8(m) == 0;
+
+		return mask() == 0;
+
 		#endif
 	}
 
@@ -1053,9 +1069,13 @@ public:
 	template<int i> __forceinline int extract8() const
 	{
 		#if _M_SSE >= 0x401
+
 		return _mm_extract_epi8(m, i);
+
 		#else
+
 		return (int)u8[i];
+
 		#endif
 	}
 
@@ -1081,10 +1101,15 @@ public:
 	template<int i> __forceinline int extract32() const
 	{
 		if(i == 0) return GSVector4i::store(*this);
+
 		#if _M_SSE >= 0x401
+
 		return _mm_extract_epi32(m, i);
+
 		#else
+
 		return i32[i];
+
 		#endif
 	}
 
@@ -1102,10 +1127,15 @@ public:
 	template<int i> __forceinline int64 extract64() const
 	{
 		if(i == 0) return GSVector4i::storeq(*this);
+
 		#if _M_SSE >= 0x401
+
 		return _mm_extract_epi64(m, i);
+
 		#else
+
 		return i64[i];
+
 		#endif
 	}
 
@@ -2388,8 +2418,6 @@ public:
 		return (v + v) - (v * v) * *this;
 	}
 
-	enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11};
-
 	template<int mode> __forceinline GSVector4 round() const
 	{
 		#if _M_SSE >= 0x401
@@ -2404,17 +2432,17 @@ public:
 
 		b = a + b - b;
 
-		if((mode & 7) == (NegInf & 7))
+		if((mode & 7) == (Round_NegInf & 7))
 		{
 			return b - ((a < b) & m_x3f800000);
 		}
 
-		if((mode & 7) == (PosInf & 7))
+		if((mode & 7) == (Round_PosInf & 7))
 		{
 			return b + ((a > b) & m_x3f800000);
 		}
 
-		ASSERT((mode & 7) == (NearestInt & 7)); // other modes aren't implemented
+		ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented
 
 		return b;
 
@@ -2423,12 +2451,62 @@ public:
 
 	__forceinline GSVector4 floor() const
 	{
-		return round<NegInf>();
+		return round<Round_NegInf>();
 	}
 
 	__forceinline GSVector4 ceil() const
 	{
-		return round<PosInf>();
+		return round<Round_PosInf>();
+	}
+
+	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+
+	#define LOG_POLY0(x, c0) GSVector4(c0)
+	#define LOG_POLY1(x, c0, c1) ((LOG_POLY0(x, c1) * x) + GSVector4(c0))
+	#define LOG_POLY2(x, c0, c1, c2) ((LOG_POLY1(x, c1, c2) * x) + GSVector4(c0))
+	#define LOG_POLY3(x, c0, c1, c2, c3) ((LOG_POLY2(x, c1, c2, c3) * x) + GSVector4(c0))
+	#define LOG_POLY4(x, c0, c1, c2, c3, c4) ((LOG_POLY3(x, c1, c2, c3, c4) * x) + GSVector4(c0))
+	#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) ((LOG_POLY4(x, c1, c2, c3, c4, c5) * x) + GSVector4(c0))
+
+	__forceinline GSVector4 log2(int precision = 5) const
+	{
+		// NOTE: sign bit ignored, safe to pass negative numbers
+
+		GSVector4i exp = GSVector4i::xff000000() >> 1;
+		GSVector4i mant = GSVector4i::x007fffff();
+		GSVector4 one(1.0f);
+
+		GSVector4i i = GSVector4i::cast(*this);
+
+		GSVector4 e = GSVector4(((i & exp) >> 23) - GSVector4i::x0000007f());
+		GSVector4 m = GSVector4::cast(i & mant) | one;
+
+		GSVector4 p;
+
+		// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+
+		switch(precision)
+		{
+		case 3:
+			p = LOG_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+			break;
+		case 4:
+			p = LOG_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+			break;
+		default:
+		case 5:
+			p = LOG_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+			break;
+		case 6:
+			p = LOG_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+			break;
+		}
+
+		// This effectively increases the polynomial degree by one, but ensures that log2(1) == 0
+		
+		p = p * (m - one);
+
+		return p + e;
 	}
 
 	__forceinline GSVector4 mod2x(const GSVector4& f, const int scale = 256) const
@@ -2528,7 +2606,7 @@ public:
 		return GSVector4(_mm_max_ps(m, a));
 	}
 
-	__forceinline GSVector4 blend8(const GSVector4& a, const GSVector4& mask)  const
+	__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask)  const
 	{
 		#if _M_SSE >= 0x401
 
@@ -2573,16 +2651,19 @@ public:
 
 	__forceinline bool alltrue() const
 	{
-		return _mm_movemask_ps(m) == 0xf;
+		return mask() == 0xf;
 	}
 
 	__forceinline bool allfalse() const
 	{
 		#if _M_SSE >= 0x401
-		__m128i a = _mm_castps_si128(m);
-		return _mm_testz_si128(a, a) != 0;
+
+		return _mm_testz_ps(m, m) != 0;
+
 		#else
-		return _mm_movemask_ps(m) == 0;
+
+		return mask() == 0;
+
 		#endif
 	}
 
@@ -2591,9 +2672,13 @@ public:
 	template<int i> __forceinline int extract() const
 	{
 		#if _M_SSE >= 0x401
+
 		return _mm_extract_ps(m, i);
+
 		#else
+
 		return i32[i];
+
 		#endif
 	}
 
@@ -2861,8 +2946,6 @@ public:
 		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
 
 	#define VECTOR4_SHUFFLE_1(xs, xn) \
-		__forceinline GSVector4 xs##4() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
-		__forceinline GSVector4 xs##4(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
 		VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
 		VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
 		VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
@@ -2894,8 +2977,6 @@ __forceinline GSVector4 GSVector4::cast(const GSVector4i& v)
 	return GSVector4(_mm_castsi128_ps(v.m));
 }
 
-#if _M_SSE >= 0x500
-
 class GSVector8;
 
 __aligned(class, 32) GSVector8i
@@ -2915,16 +2996,25 @@ public:
 		uint16 u16[16];
 		uint32 u32[8];
 		uint64 u64[4];
+		#if _M_SSE >= 0x500
 		__m256i m;
+		__m128i m0, m1;
+		#else
+		__m128i m[2];
+		#endif
 	};
 
-	__forceinline GSVector8i()
-	{
-	}
+	__forceinline GSVector8i() {}
+
+	__forceinline explicit GSVector8i(const GSVector8& v);
+
+	static GSVector8i cast(const GSVector8& v);
+
+	#if _M_SSE >= 0x500
 
 	__forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
 	{
-		m = _mm256_set_epi32(w0, z0, y0, x0, w0, z0, y0, x0);
+		m = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0);
 	}
 
 	__forceinline GSVector8i(__m128i m0, __m128i m1)
@@ -2952,8 +3042,6 @@ public:
 		this->m = m;
 	}
 
-	__forceinline explicit GSVector8i(const GSVector8& v);
-
 	__forceinline void operator = (const GSVector8i& v)
 	{
 		m = v.m;
@@ -2979,8 +3067,6 @@ public:
 		return m;
 	}
 
-	static GSVector8i cast(const GSVector8& v);
-
 	// TODO
 
 	template<int i> __forceinline GSVector4i extract() const
@@ -3002,7 +3088,7 @@ public:
 
 	template<bool aligned> __forceinline static GSVector8i load(const void* p)
 	{
-		return GSVector8i(aligned ? _mm_load256i_si256((__m256i*)p) : _mm256i_loadu_si256((__m128i*)p));
+		return GSVector8i(aligned ? _mm_load256i_si256((__m256i*)p) : _mm256i_loadu_si256((__m256i*)p));
 	}
 
 	template<bool aligned> __forceinline static void store(void* p, const GSVector4i& v)
@@ -3010,6 +3096,103 @@ public:
 		if(aligned) _mm256i_store_si256((__m256i*)p, v.m);
 		else _mm256i_storeu_si256((__m256i*)p, v.m);
 	}
+
+	#else
+
+	__forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		m[0] = _mm_set_epi32(w0, z0, y0, x0);
+		m[1] = _mm_set_epi32(w1, z1, y1, x1);		
+	}
+
+	__forceinline GSVector8i(__m128i m0, __m128i m1)
+	{
+		m[0] = m0;
+		m[1] = m1;
+	}
+
+	__forceinline GSVector8i(const GSVector8i& v)
+	{
+		m[0] = v.m[0];
+		m[1] = v.m[1];
+	}
+
+	__forceinline explicit GSVector8i(int i)
+	{
+		m[0] = m[1] = _mm_set1_epi32(i);
+	}
+
+	__forceinline explicit GSVector8i(__m128i m)
+	{
+		this->m[0] = this->m[1] = m;
+	}
+
+	__forceinline void operator = (const GSVector8i& v)
+	{
+		m[0] = v.m[0];
+		m[1] = v.m[1];
+	}
+
+	__forceinline void operator = (int i)
+	{
+		m[0] = m[1] = _mm_set1_epi32(i);
+	}
+
+	__forceinline void operator = (__m128i m)
+	{
+		this->m[0] = this->m[1] = m;
+	}
+
+	// TODO
+
+	template<int i> __forceinline GSVector4i extract() const
+	{
+		return GSVector4i(m[i]);
+	}
+
+	template<int i> __forceinline GSVector8i insert(__m128i m) const
+	{
+		GSVector8i v = *this;
+
+		v.m[i] = m;
+
+		return v;
+	}
+
+	__forceinline static GSVector8i zero()
+	{
+		GSVector8i v;
+
+		v.m[0] = v.m[1] = _mm_setzero_si128();
+		
+		return v;
+	}
+
+	// TODO
+
+	template<bool aligned> __forceinline static GSVector8i load(const void* p)
+	{
+		return GSVector8i(
+			aligned ? _mm_load_si128((__m128i*)p + 0) : _mm_loadu_si128((__m128i*)p + 0),
+			aligned ? _mm_load_si128((__m128i*)p + 1) : _mm_loadu_si128((__m128i*)p + 1),
+			);
+	}
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4i& v)
+	{
+		if(aligned)
+		{
+			_mm_store_si128((__m128i*)p + 0, v.m[0]);
+			_mm_store_si128((__m128i*)p + 1, v.m[1]);
+		}
+		else
+		{
+			_mm_storeu_si128((__m128i*)p + 0, v.m[0]);
+			_mm_storeu_si128((__m128i*)p + 1, v.m[1]);
+		}
+	}
+
+	#endif
 };
 
 __aligned(class, 32) GSVector8
@@ -3029,25 +3212,97 @@ public:
 		uint16 u16[16];
 		uint32 u32[8];
 		uint64 u64[4];
+		#if _M_SSE >= 0x500
 		__m256 m;
-
-		// TODO: _M_SSE < 0x500 => union {__m128 m0, m1;}; and replace each function with a pair of 128 bit intructions
+		__m128 m0, m1;
+		#else
+		__m128 m[2];
+		#endif
 	};
 
-	__forceinline GSVector8()
+	__forceinline GSVector8() {}
+
+	__forceinline explicit GSVector8(const GSVector8i& v);
+
+	__forceinline static GSVector8 cast(const GSVector8i& v);
+
+	__forceinline GSVector8 rcpnr() const
 	{
+		GSVector8 v = rcp();
+
+		return (v + v) - (v * v) * *this;
 	}
 
+	__forceinline GSVector8 floor() const
+	{
+		return round<Round_NegInf>();
+	}
+
+	__forceinline GSVector8 ceil() const
+	{
+		return round<Round_PosInf>();
+	}
+
+	__forceinline GSVector8 operator - () const
+	{
+		return neg();
+	}
+
+	__forceinline void operator += (float f)
+	{
+		*this += GSVector8(f);
+	}
+
+	__forceinline void operator -= (float f)
+	{
+		*this -= GSVector8(f);
+	}
+
+	__forceinline void operator *= (float f)
+	{
+		*this *= GSVector8(f);
+	}
+
+	__forceinline void operator /= (float f)
+	{
+		*this /= GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v, float f)
+	{
+		return v + GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v, float f)
+	{
+		return v - GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v, float f)
+	{
+		return v * GSVector8(f);
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v, float f)
+	{
+		return v / GSVector8(f);
+	}
+
+	__forceinline static GSVector8 xffffffff()
+	{
+		return zero() == zero();
+	}
+
+	#if _M_SSE >= 0x500
+
 	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
 	{
-		m = _mm256_set_ps(w0, z0, y0, x0, w0, z0, y0, x0);
+		m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
 	}
 
 	__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
 	{
-		GSVector8i v(x0, y0, z0, w0, x1, y1, z1, w1);
-
-		m = _mm256_cvtepi32_ps(v);
+		m = _mm256_cvtepi32_ps(_mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0));
 	}
 
 	__forceinline GSVector8(__m128 m0, __m128 m1)
@@ -3062,7 +3317,7 @@ public:
 
 	__forceinline explicit GSVector8(float f)
 	{
-		m = _mm256_set1_ps(f); // _mm256_broadcast_ss(&f); ?
+		m = _mm256_set1_ps(f);
 	}
 
 	__forceinline explicit GSVector8(__m128 m)
@@ -3076,8 +3331,6 @@ public:
 		this->m = m;
 	}
 
-	__forceinline explicit GSVector8(const GSVector8i& v);
-
 	__forceinline void operator = (const GSVector8& v)
 	{
 		m = v.m;
@@ -3104,8 +3357,6 @@ public:
 		return m;
 	}
 
-	__forceinline static GSVector8 cast(const GSVector8i& v);
-
 	__forceinline GSVector8 abs() const
 	{
 		return *this & cast(GSVector8i(GSVector4i::x7fffffff())); // TODO: add GSVector8 consts
@@ -3121,30 +3372,11 @@ public:
 		return GSVector8(_mm256_rcp_ps(m));
 	}
 
-	__forceinline GSVector8 rcpnr() const
-	{
-		GSVector8 v = rcp();
-
-		return (v + v) - (v * v) * *this;
-	}
-
-	enum RoundMode {NearestInt = 8, NegInf = 9, PosInf = 10, Truncate = 11};
-
 	template<int mode> __forceinline GSVector8 round() const
 	{
 		return GSVector8(_mm256_round_ps(m, mode));
 	}
 
-	__forceinline GSVector8 floor() const
-	{
-		return round<NegInf>();
-	}
-
-	__forceinline GSVector8 ceil() const
-	{
-		return round<PosInf>();
-	}
-
 	// TODO
 
 	__forceinline GSVector8 min(const GSVector8& a) const
@@ -3157,7 +3389,12 @@ public:
 		return GSVector8(_mm256_max_ps(m, a));
 	}
 
-	__forceinline GSVector8 blend8(const GSVector8& a, const GSVector8& mask)  const
+	template<int mask> __forceinline GSVector8 blend32(const GSVector8& a)  const
+	{
+		return GSVector8(_mm256_blend_ps(m, a, mask));
+	}
+
+	__forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask)  const
 	{
 		return GSVector8(_mm256_blendv_ps(m, a, mask));
 	}
@@ -3186,12 +3423,12 @@ public:
 
 	__forceinline GSVector8 l2h() const
 	{
-		return insert<1>(extract<0>());
+		return GSVector8(_mm256_shuffle_ps(m, m, 0x88));
 	}
 
 	__forceinline GSVector8 h2l() const
 	{
-		return insert<0>(extract<1>());
+		return GSVector8(_mm256_shuffle_ps(m, m, 0x22));
 	}
 
 	__forceinline GSVector8 andnot(const GSVector8& v) const
@@ -3206,12 +3443,12 @@ public:
 
 	__forceinline bool alltrue() const
 	{
-		return _mm256_movemask_ps(m) == 0xff;
+		return mask() == 0xff;
 	}
 
 	__forceinline bool allfalse() const
 	{
-		return _mm256_movemask_ps(m) == 0;
+		return _mm256_testz_ps(m, m) != 0;
 	}
 
 	template<int i> __forceinline GSVector4 extract() const
@@ -3239,11 +3476,6 @@ public:
 		_mm256_zeroall();
 	}
 
-	__forceinline static GSVector8 xffffffff()
-	{
-		return zero() == zero();
-	}
-
 	// TODO: load low, ss
 
 	template<bool aligned> __forceinline static GSVector8 load(const void* p)
@@ -3261,11 +3493,6 @@ public:
 
 	// TODO
 
-	__forceinline GSVector8 operator - () const
-	{
-		return neg();
-	}
-
 	__forceinline void operator += (const GSVector8& v)
 	{
 		m = _mm256_add_ps(m, v);
@@ -3286,26 +3513,6 @@ public:
 		m = _mm256_div_ps(m, v);
 	}
 
-	__forceinline void operator += (float f)
-	{
-		*this += GSVector8(f);
-	}
-
-	__forceinline void operator -= (float f)
-	{
-		*this -= GSVector8(f);
-	}
-
-	__forceinline void operator *= (float f)
-	{
-		*this *= GSVector8(f);
-	}
-
-	__forceinline void operator /= (float f)
-	{
-		*this /= GSVector8(f);
-	}
-
 	__forceinline void operator &= (const GSVector8& v)
 	{
 		m = _mm256_and_ps(m, v);
@@ -3341,26 +3548,6 @@ public:
 		return GSVector8(_mm256_div_ps(v1, v2));
 	}
 
-	__forceinline friend GSVector8 operator + (const GSVector8& v, float f)
-	{
-		return v + GSVector8(f);
-	}
-
-	__forceinline friend GSVector8 operator - (const GSVector8& v, float f)
-	{
-		return v - GSVector8(f);
-	}
-
-	__forceinline friend GSVector8 operator * (const GSVector8& v, float f)
-	{
-		return v * GSVector8(f);
-	}
-
-	__forceinline friend GSVector8 operator / (const GSVector8& v, float f)
-	{
-		return v / GSVector8(f);
-	}
-
 	__forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2)
 	{
 		return GSVector8(_mm256_and_ps(v1, v2));
@@ -3410,6 +3597,361 @@ public:
 		__forceinline GSVector8 xs##ys() const {return GSVector8(_mm256_permute2f128_ps(m, m, xn | (yn << 4)));} \
 		__forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, xn | (yn << 4)));} \
 
+	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	#else
+
+	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
+	{
+		m[0] = _mm_set_ps(w0, z0, y0, x0);
+		m[1] = _mm_set_ps(w1, z1, y1, x1);
+	}
+
+	__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		m[0] = _mm_cvtepi32_ps(_mm_set_epi32(w0, z0, y0, x0));
+		m[1] = _mm_cvtepi32_ps(_mm_set_epi32(w1, z1, y1, x1));
+	}
+
+	__forceinline GSVector8(__m128 m0, __m128 m1)
+	{
+		m[0] = m0;
+		m[1] = m1;
+	}
+
+	__forceinline GSVector8(const GSVector8& v)
+	{
+		m[0] = v.m[0];
+		m[1] = v.m[1];
+	}
+
+	__forceinline explicit GSVector8(float f)
+	{
+		m[0] = m[1] = _mm_set1_ps(f);
+	}
+
+	__forceinline explicit GSVector8(__m128 m)
+	{
+		this->m[0] = this->m[1] = m;
+	}
+
+	__forceinline void operator = (const GSVector8& v)
+	{
+		m[0] = v.m[0];
+		m[1] = v.m[1];
+	}
+
+	__forceinline void operator = (float f)
+	{
+		m[0] = m[1] = _mm_set1_ps(f);
+	}
+
+	__forceinline void operator = (__m128 m)
+	{
+		this->m[0] = this->m[1] = m;
+	}
+
+	__forceinline GSVector8 abs() const
+	{
+		GSVector4 mask = GSVector4::cast(GSVector4i::x7fffffff());
+
+		return GSVector8(_mm_and_ps(m[0], mask), _mm_and_ps(m[1], mask));
+	}
+
+	__forceinline GSVector8 neg() const
+	{
+		GSVector4 mask = GSVector4::cast(GSVector4i::x80000000());
+
+		return GSVector8(_mm_xor_ps(m[0], mask), _mm_xor_ps(m[1], mask));
+	}
+
+	__forceinline GSVector8 rcp() const
+	{
+		return GSVector8(_mm_rcp_ps(m[0]), _mm_rcp_ps(m[1]));
+	}
+
+	template<int mode> __forceinline GSVector8 round() const
+	{
+		return GSVector8(_mm_round_ps(m[0], mode), _mm_round_ps(m[1], mode));
+	}
+
+	// TODO
+
+	__forceinline GSVector8 min(const GSVector8& a) const
+	{
+		return GSVector8(_mm_min_ps(m[0], a.m[0]), _mm_min_ps(m[1], a.m[1]));
+	}
+
+	__forceinline GSVector8 max(const GSVector8& a) const
+	{
+		return GSVector8(_mm_max_ps(m[0], a.m[0]), _mm_max_ps(m[1], a.m[1]));
+	}
+
+	#if _M_SSE >= 0x401
+
+	template<int mask> __forceinline GSVector8 blend32(const GSVector8& a)  const
+	{
+		return GSVector8(_mm_blend_ps(m[0], a.m[0], mask & 0x0f), _mm_blend_ps(m[1], a.m[1], (mask >> 4) & 0x0f));
+	}
+
+	#endif
+
+	__forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask)  const
+	{
+		return GSVector8(_mm_blendv_ps(m[0], a.m[0], mask.m[0]), _mm_blendv_ps(m[1], a.m[1], mask.m[1]));
+	}
+
+	__forceinline GSVector8 upl(const GSVector8& a) const
+	{
+		return GSVector8(_mm_unpacklo_ps(m[0], a.m[0]), _mm_unpacklo_ps(m[1], a.m[1]));
+	}
+
+	__forceinline GSVector8 uph(const GSVector8& a) const
+	{
+		return GSVector8(_mm_unpackhi_ps(m[0], a.m[0]), _mm_unpackhi_ps(m[1], a.m[1]));
+	}
+
+	__forceinline GSVector8 upl64(const GSVector8& a) const
+	{
+		return GSVector8(
+			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))),
+			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1])))
+			);
+	}
+
+	__forceinline GSVector8 uph64(const GSVector8& a) const
+	{
+		return GSVector8(
+			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[0]), _mm_castps_pd(a.m[0]))),
+			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m[1]), _mm_castps_pd(a.m[1])))
+			);
+	}
+
+	// TODO
+
+	__forceinline GSVector8 l2h() const
+	{
+		return GSVector8(_mm_movelh_ps(m[0], m[0]), _mm_movelh_ps(m[1], m[1]));
+	}
+
+	__forceinline GSVector8 h2l() const
+	{
+		return GSVector8(_mm_movehl_ps(m[0], m[0]), _mm_movehl_ps(m[1], m[1]));
+	}
+
+	__forceinline GSVector8 andnot(const GSVector8& v) const
+	{
+		return GSVector8(_mm_andnot_ps(v.m[0], m[0]), _mm_andnot_ps(v.m[1], m[1]));
+	}
+
+	__forceinline int mask() const
+	{
+		return _mm_movemask_ps(m[0]) | (_mm_movemask_ps(m[1]) << 4);
+	}
+
+	__forceinline bool alltrue() const
+	{
+		return mask() == 0xff;
+	}
+
+	__forceinline bool allfalse() const
+	{
+		#if _M_SSE >= 0x401
+
+		return (_mm_testz_ps(m[0], m[0]) & _mm_testz_ps(m[1], m[1])) != 0;
+
+		#else
+
+		return mask() == 0;
+
+		#endif
+	}
+
+	template<int i> __forceinline GSVector4 extract() const
+	{
+		return GSVector4(m[i]);
+	}
+
+	template<int i> __forceinline GSVector8 insert(__m128 m) const
+	{
+		return GSVector8(i == 0 ? m : this->m[0], i == 1 ? m : this->m[1]);
+	}
+
+	__forceinline static GSVector8 zero()
+	{
+		return GSVector8(_mm_setzero_ps(), _mm_setzero_ps());
+	}
+
+	__forceinline static void zeroupper()
+	{
+		// N/A
+	}
+
+	__forceinline static void zeroall()
+	{
+		// N/A
+	}
+
+	// TODO: load low, ss
+
+	template<bool aligned> __forceinline static GSVector8 load(const void* p)
+	{
+		return GSVector8(
+			aligned ? _mm_load_ps((const float*)p + 0) : _mm_loadu_ps((const float*)p + 0),
+			aligned ? _mm_load_ps((const float*)p + 4) : _mm_loadu_ps((const float*)p + 4),
+			);
+	}
+
+	// TODO: store low, ss
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
+	{
+		if(aligned)
+		{
+			_mm_store_ps((float*)p + 0, v.m[0]);
+			_mm_store_ps((float*)p + 4, v.m[1]);
+		}
+		else
+		{
+			_mm_storeu_ps((float*)p + 0, v.m[0]);
+			_mm_storeu_ps((float*)p + 4, v.m[1]);
+		}
+	}
+
+	// TODO
+
+	__forceinline void operator += (const GSVector8& v)
+	{
+		m[0] = _mm_add_ps(m[0], v.m[0]);
+		m[1] = _mm_add_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator -= (const GSVector8& v)
+	{
+		m[0] = _mm_sub_ps(m[0], v.m[0]);
+		m[1] = _mm_sub_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator *= (const GSVector8& v)
+	{
+		m[0] = _mm_mul_ps(m[0], v.m[0]);
+		m[1] = _mm_mul_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator /= (const GSVector8& v)
+	{
+		m[0] = _mm_div_ps(m[0], v.m[0]);
+		m[1] = _mm_div_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator &= (const GSVector8& v)
+	{
+		m[0] = _mm_and_ps(m[0], v.m[0]);
+		m[1] = _mm_and_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator |= (const GSVector8& v)
+	{
+		m[0] = _mm_or_ps(m[0], v.m[0]);
+		m[1] = _mm_or_ps(m[1], v.m[1]);
+	}
+
+	__forceinline void operator ^= (const GSVector8& v)
+	{
+		m[0] = _mm_xor_ps(m[0], v.m[0]);
+		m[1] = _mm_xor_ps(m[1], v.m[1]);
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_add_ps(v1.m[0], v2.m[0]), _mm_add_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_sub_ps(v1.m[0], v2.m[0]), _mm_sub_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_mul_ps(v1.m[0], v2.m[0]), _mm_mul_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_div_ps(v1.m[0], v2.m[0]), _mm_div_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_and_ps(v1.m[0], v2.m[0]), _mm_and_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_or_ps(v1.m[0], v2.m[0]), _mm_or_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_xor_ps(v1.m[0], v2.m[0]), _mm_xor_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmpeq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmpneq_ps(v1.m[0], v2.m[0]), _mm_cmpeq_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmpgt_ps(v1.m[0], v2.m[0]), _mm_cmpgt_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmplt_ps(v1.m[0], v2.m[0]), _mm_cmplt_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmpge_ps(v1.m[0], v2.m[0]), _mm_cmpge_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline friend GSVector8 operator <= (const GSVector8& v1, const GSVector8& v2)
+	{
+		return GSVector8(_mm_cmple_ps(v1.m[0], v2.m[0]), _mm_cmple_ps(v1.m[1], v2.m[1]));
+	}
+
+	__forceinline static __m128 VECTOR8_SELECT(const GSVector8& v1, const GSVector8& v2, int n)
+	{
+		switch(n)
+		{
+		case 0: return v1.m[0];
+		case 1: return v1.m[1];
+		case 2: return v2.m[0];
+		case 3: return v2.m[1];
+		}
+		
+		return _mm_setzero_ps();
+	}
+
+	#define VECTOR8_PERMUTE_2(xs, xn, ys, yn) \
+		__forceinline GSVector8 xs##ys() const {return GSVector8(VECTOR8_SELECT(*this, *this, xn), VECTOR8_SELECT(*this, *this, yn));} \
+		__forceinline GSVector8 xs##ys(const GSVector8& v) const {return GSVector8(VECTOR8_SELECT(*this, v, xn), VECTOR8_SELECT(*this, v, yn));} \
+
+	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm_shuffle_ps(m[0], m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm_shuffle_ps(m[0], v.m[0], _MM_SHUFFLE(wn, zn, yn, xn)), _mm_shuffle_ps(m[1], v.m[1], _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	#endif
+
 	#define VECTOR8_PERMUTE_1(xs, xn) \
 		VECTOR8_PERMUTE_2(xs, xn, x, 0) \
 		VECTOR8_PERMUTE_2(xs, xn, y, 1) \
@@ -3423,10 +3965,6 @@ public:
 	VECTOR8_PERMUTE_1(w, 3)
 	VECTOR8_PERMUTE_1(_, 8)
 
-	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
-		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
-
 	#define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
 		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
 		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
@@ -3440,8 +3978,6 @@ public:
 		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
 
 	#define VECTOR8_SHUFFLE_1(xs, xn) \
-		__forceinline GSVector8 xs##4() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
-		__forceinline GSVector8 xs##4(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
 		VECTOR8_SHUFFLE_2(xs, xn, x, 0) \
 		VECTOR8_SHUFFLE_2(xs, xn, y, 1) \
 		VECTOR8_SHUFFLE_2(xs, xn, z, 2) \
@@ -3453,6 +3989,8 @@ public:
 	VECTOR8_SHUFFLE_1(w, 3)
 };
 
+#if _M_SSE >= 0x500
+
 __forceinline GSVector8i::GSVector8i(const GSVector8& v)
 {
 	m = _mm256_cvttps_epi32(v);
@@ -3473,6 +4011,40 @@ __forceinline GSVector8 GSVector8::cast(const GSVector8i& v)
 	return GSVector8(_mm256_castsi256_ps(v.m));
 }
 
+#else
+
+__forceinline GSVector8i::GSVector8i(const GSVector8& v)
+{
+	m[0] = _mm_cvttps_epi32(v.m[0]);
+	m[1] = _mm_cvttps_epi32(v.m[1]);
+}
+
+__forceinline GSVector8::GSVector8(const GSVector8i& v)
+{
+	m[0] = _mm_cvtepi32_ps(v.m[0]);
+	m[1] = _mm_cvtepi32_ps(v.m[1]);
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector8& v)
+{
+	GSVector8i v2;
+	
+	v2.m[0] = _mm_castps_si128(v.m[0]);
+	v2.m[1] = _mm_castps_si128(v.m[1]);
+
+	return v2;
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector8i& v)
+{
+	GSVector8 v2;
+	
+	v2.m[0] = _mm_castsi128_ps(v.m[0]);
+	v2.m[1] = _mm_castsi128_ps(v.m[1]);
+
+	return v2;
+}
+
 #endif
 
 #pragma pack(pop)
diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h
index e2d7151ec5..953cd7a672 100644
--- a/plugins/GSdx/GSVertexSW.h
+++ b/plugins/GSdx/GSVertexSW.h
@@ -30,13 +30,73 @@ __aligned(struct, 16) GSVertexSW
 	__forceinline GSVertexSW() {}
 	__forceinline GSVertexSW(const GSVertexSW& v) {*this = v;}
 
-	__forceinline void operator = (const GSVertexSW& v) {c = v.c; p = v.p; t = v.t;}
-	__forceinline void operator += (const GSVertexSW& v) {c += v.c; p += v.p; t += v.t;}
+	__forceinline static GSVertexSW zero()
+	{
+		GSVertexSW v;
 
-	friend GSVertexSW operator + (const GSVertexSW& v1, const GSVertexSW& v2);
-	friend GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2);
-	friend GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv);
-	friend GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv);
+		v.p = GSVector4::zero();
+		v.t = GSVector4::zero();
+		v.c = GSVector4::zero();
+
+		return v;
+	}
+	__forceinline void operator = (const GSVertexSW& v) 
+	{
+		p = v.p; 
+		t = v.t;
+		c = v.c; 
+	}
+	
+	__forceinline void operator += (const GSVertexSW& v) 
+	{
+		p += v.p; 
+		t += v.t;
+		c += v.c; 
+	}
+
+	__forceinline friend GSVertexSW operator + (const GSVertexSW& a, const GSVertexSW& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p + b.p;
+		v.t = a.t + b.t;
+		v.c = a.c + b.c;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator - (const GSVertexSW& a, const GSVertexSW& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p - b.p;
+		v.t = a.t - b.t;
+		v.c = a.c - b.c;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator * (const GSVertexSW& a, const GSVector4& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p * b;
+		v.t = a.t * b;
+		v.c = a.c * b;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator / (const GSVertexSW& a, const GSVector4& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p / b;
+		v.t = a.t / b;
+		v.c = a.c / b;
+
+		return v;
+	}
 
 	static bool IsQuad(const GSVertexSW* v, int& tl, int& br)
 	{
@@ -122,6 +182,25 @@ __aligned(struct, 16) GSVertexSW
 
 		br = i;
 
+		#if _M_SSE >= 0x500
+
+		{
+			// p.z, p.w, t.z, t.w, c.x, c.y, c.z, c.w
+
+			GSVector8 v0 = GSVector8(v[0].p.zwzw(v[0].t), v[0].c);
+			GSVector8 v1 = GSVector8(v[1].p.zwzw(v[1].t), v[1].c);
+			GSVector8 v2 = GSVector8(v[2].p.zwzw(v[2].t), v[2].c);
+			GSVector8 v3 = GSVector8(v[3].p.zwzw(v[3].t), v[3].c);
+			GSVector8 v4 = GSVector8(v[4].p.zwzw(v[4].t), v[4].c);
+			GSVector8 v5 = GSVector8(v[5].p.zwzw(v[5].t), v[5].c);
+
+			GSVector8 test = ((v0 == v1) & (v0 == v2)) & ((v0 == v3) & (v0 == v4)) & (v0 == v5);
+
+			return test.alltrue();
+		}
+		
+		#else
+
 		v0 = v[0].p.zwzw(v[0].t);
 		v1 = v[1].p.zwzw(v[1].t);
 		v2 = v[2].p.zwzw(v[2].t);
@@ -151,42 +230,7 @@ __aligned(struct, 16) GSVertexSW
 		}
 
 		return true;
+
+		#endif
 	}
-};
-
-__forceinline GSVertexSW operator + (const GSVertexSW& v1, const GSVertexSW& v2)
-{
-	GSVertexSW v0;
-	v0.c = v1.c + v2.c;
-	v0.p = v1.p + v2.p;
-	v0.t = v1.t + v2.t;
-	return v0;
-}
-
-__forceinline GSVertexSW operator - (const GSVertexSW& v1, const GSVertexSW& v2)
-{
-	GSVertexSW v0;
-	v0.c = v1.c - v2.c;
-	v0.p = v1.p - v2.p;
-	v0.t = v1.t - v2.t;
-	return v0;
-}
-
-__forceinline GSVertexSW operator * (const GSVertexSW& v, const GSVector4& vv)
-{
-	GSVertexSW v0;
-	v0.c = v.c * vv;
-	v0.p = v.p * vv;
-	v0.t = v.t * vv;
-	return v0;
-}
-
-__forceinline GSVertexSW operator / (const GSVertexSW& v, const GSVector4& vv)
-{
-	GSVertexSW v0;
-	v0.c = v.c / vv;
-	v0.p = v.p / vv;
-	v0.t = v.t / vv;
-	return v0;
-}
-
+};
\ No newline at end of file
diff --git a/plugins/GSdx/GSVertexTrace.cpp b/plugins/GSdx/GSVertexTrace.cpp
index 3c1a6d1ceb..2510c724f7 100644
--- a/plugins/GSdx/GSVertexTrace.cpp
+++ b/plugins/GSdx/GSVertexTrace.cpp
@@ -48,6 +48,52 @@ uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass)
 	return hash;
 }
 
+void GSVertexTrace::UpdateLOD()
+{
+	if(!m_state->PRIM->TME) return;
+
+	const GIFRegTEX1& TEX1 = m_state->m_context->TEX1;
+
+	m_filter.mmag = TEX1.IsMagLinear();
+	m_filter.mmin = TEX1.IsMinLinear();
+
+	if(TEX1.MXL == 0) // MXL == 0 => MMIN ignored, tested it on ps2
+	{
+		m_filter.linear = m_filter.mmag;
+
+		return;
+	}
+
+	float K = (float)TEX1.K / 16;
+
+	if(TEX1.LCM == 0) // && m_state->PRIM->FST == 0 // if FST => assume Q = 1.0f (should not, but Q is very often bogus, 0 or DEN)
+	{
+		// LOD = log2(1/|Q|) * (1 << L) + K
+
+		GSVector4::storel(&m_lod, m_max.t.uph(m_min.t).log2(2).neg() * (float)(1 << TEX1.L) + K);
+
+		if(m_lod.x > m_lod.y) {float tmp = m_lod.x; m_lod.x = m_lod.x; m_lod.y = tmp;}
+	}
+	else
+	{
+		m_lod.x = K;
+		m_lod.y = K;
+	}
+
+	if(m_lod.y <= 0)
+	{
+		m_filter.linear = m_filter.mmag;
+	}
+	else if(m_lod.x > 0)
+	{
+		m_filter.linear = m_filter.mmin;
+	}
+	else
+	{
+		m_filter.linear = m_filter.mmag | m_filter.mmin;
+	}
+}
+
 void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass)
 {
 	m_map_sw[Hash(primclass)](count, v, m_min, m_max);
@@ -55,6 +101,8 @@ void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primcla
 	m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
 
 	m_alpha.valid = false;
+
+	UpdateLOD();
 }
 
 void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass)
@@ -87,6 +135,8 @@ void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primcl
 	m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
 
 	m_alpha.valid = false;
+
+	UpdateLOD();
 }
 
 void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass)
@@ -119,4 +169,7 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
 	m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
 
 	m_alpha.valid = false;
+
+	UpdateLOD();
 }
+
diff --git a/plugins/GSdx/GSVertexTrace.h b/plugins/GSdx/GSVertexTrace.h
index f7eaf35361..d18c73c1cf 100644
--- a/plugins/GSdx/GSVertexTrace.h
+++ b/plugins/GSdx/GSVertexTrace.h
@@ -32,7 +32,7 @@ class GSState;
 __aligned(class, 32) GSVertexTrace
 {
 public:
-	struct Vertex {GSVector4i c; GSVector4 p, t;};
+	struct Vertex {GSVector4i c; GSVector4 p, t;}; // t.xy * 0x10000
 	struct VertexAlpha {int min, max; bool valid;};
 
 private:
@@ -60,16 +60,23 @@ private:
 	GSCodeGeneratorFunctionMap<CGHW9, uint32, VertexTracePtr> m_map_hw9;
 	GSCodeGeneratorFunctionMap<CGHW11, uint32, VertexTracePtr> m_map_hw11;
 
+	const GSState* m_state;
+
 	uint32 Hash(GS_PRIM_CLASS primclass);
 
-	const GSState* m_state;
+	void UpdateLOD();
 
 	static const GSVector4 s_minmax;
 
 public:
 	GS_PRIM_CLASS m_primclass;
-	Vertex m_min, m_max; // t.xy * 0x10000
-	VertexAlpha m_alpha; // source alpha range after tfx, GSRenderer::GetAlphaMinMax() updates it
+
+	Vertex m_min;
+	Vertex m_max;
+
+	// source alpha range after tfx, GSRenderer::GetAlphaMinMax() updates it
+
+	VertexAlpha m_alpha; 
 
 	union
 	{
@@ -78,10 +85,19 @@ public:
 		struct {uint32 rgba:16, xyzf:4, stq:4;};
 	} m_eq;
 
+	union 
+	{
+		struct {uint32 mmag:1, mmin:1, linear:1;};
+	} m_filter;
+
+	GSVector2 m_lod; // x = min, y = max
+
 	GSVertexTrace(const GSState* state);
 
 	void Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass);
 	void Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass);
 	void Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass);
 	void Update(const GSVertexNull* v, int count, GS_PRIM_CLASS primclass) {}
+
+	bool IsLinear() const {return m_filter.linear;}
 };
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index 9695f5c58c..a9a6a3c209 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -107,7 +107,7 @@ using namespace stdext;
     #define __aligned(t, n) t __attribute__((aligned(n)))
     #define __fastcall __attribute__((fastcall))
 
-    #define EXPORT_C_(type) extern "C" type
+    #define EXPORT_C_(type) extern "C" __attribute__((stdcall,externally_visible,visibility("default"))) type
     #define EXPORT_C EXPORT_C_(void)
 
     #ifdef __GNUC__