diff --git a/plugins/GSdx_legacy/CMakeLists.txt b/plugins/GSdx_legacy/CMakeLists.txt
new file mode 100644
index 0000000000..20cbd9e61b
--- /dev/null
+++ b/plugins/GSdx_legacy/CMakeLists.txt
@@ -0,0 +1,228 @@
+# Abort the configure step unless TOP_CMAKE_WAS_SOURCED is set, i.e. this file was not run through the top-level CMakeLists.txt.
+if(NOT TOP_CMAKE_WAS_SOURCED)
+    message(FATAL_ERROR "
+    You did not 'cmake' the good CMakeLists.txt file. Use the one in the top dir.
+    It is advice to delete all wrongly generated cmake stuff => CMakeFiles & CMakeCache.txt")
+endif()
+
+
+# Name of the target that is generated below
+set(Output GSdx-1.0.0)
+
+set(CommonFlags
+    -fno-operator-names # Xbyak declares functions named and()/xor()/or()/not()
+    -fno-strict-aliasing
+    -Wno-unknown-pragmas
+    -Wno-parentheses
+    -Wunused-variable # the __dummy variable still needs to be investigated
+    )
+
+set(GSdxFinalFlags ${CommonFlags})
+
+if(XDG_STD)
+    list(APPEND GSdxFinalFlags -DXDG_STD)
+endif()
+
+if(EGL_API AND EGL_FOUND)
+    list(APPEND GSdxFinalFlags -DEGL_SUPPORTED)
+endif()
+
+if(LIBLZMA_FOUND)
+    list(APPEND GSdxFinalFlags -DLZMA_SUPPORTED)
+endif()
+
+# -fabi-version is only passed when USE_CLANG is off: Clang lacks a few flags GCC accepts.
+if(NOT USE_CLANG)
+    list(APPEND GSdxFinalFlags -fabi-version=6)
+endif()
+
+set(GSdxSources # implementation files; merged with the headers into GSdxFinalSources
+    GLLoader.cpp
+    GLState.cpp
+    GPU.cpp
+    GPUDrawScanline.cpp
+    GPUDrawScanlineCodeGenerator.cpp
+    GPULocalMemory.cpp
+    GPURenderer.cpp
+    GPURendererSW.cpp
+    GPUSetupPrimCodeGenerator.cpp
+    GPUState.cpp
+    GS.cpp
+    GSAlignedClass.cpp
+    GSBlock.cpp
+    GSCapture.cpp
+    GSClut.cpp
+    GSCodeBuffer.cpp
+    GSCrc.cpp
+    GSDevice.cpp
+    GSDeviceOGL.cpp
+    GSDeviceSW.cpp
+    GSDeviceNull.cpp
+    GSDirtyRect.cpp
+    GSDrawingContext.cpp
+    GSDrawScanline.cpp
+    GSDrawScanlineCodeGenerator.cpp
+    GSDrawScanlineCodeGenerator.x86.avx.cpp
+    GSDrawScanlineCodeGenerator.x86.avx2.cpp
+    GSDrawScanlineCodeGenerator.x64.cpp
+    GSDrawScanlineCodeGenerator.x86.cpp
+    GSDrawScanlineCodeGenerator.x64.avx.cpp
+    GSDump.cpp
+    GSFunctionMap.cpp
+    GSLinuxDialog.cpp
+    GSLocalMemory.cpp
+    GSLzma.cpp
+    GSPerfMon.cpp
+    GSPng.cpp
+    GSRasterizer.cpp
+    GSRenderer.cpp
+    GSRendererCL.cpp
+    GSRendererHW.cpp
+    GSRendererNull.cpp
+    GSRendererOGL.cpp
+    GSRendererSW.cpp
+    GSSetting.cpp
+    GSSetupPrimCodeGenerator.cpp
+    GSSetupPrimCodeGenerator.x86.avx.cpp
+    GSSetupPrimCodeGenerator.x86.avx2.cpp
+    GSSetupPrimCodeGenerator.x64.avx.cpp
+    GSSetupPrimCodeGenerator.x86.cpp
+    GSSetupPrimCodeGenerator.x64.cpp
+    GSShaderOGL.cpp
+    GSState.cpp
+    GSTables.cpp
+    GSTexture.cpp
+    GSTextureCache.cpp
+    GSTextureCacheSW.cpp
+    GSTextureCacheOGL.cpp
+    GSTextureFXOGL.cpp
+    GSTextureOGL.cpp
+    GSTextureNull.cpp
+    GSTextureSW.cpp
+    GSThread.cpp
+    GSUtil.cpp
+    GSVector.cpp
+    GSVertexTrace.cpp
+    GSWnd.cpp
+    GSWndOGL.cpp
+    GSWndEGL.cpp
+    GSdx.cpp
+    stdafx.cpp
+    )
+
+set(GSdxHeaders # header files; appended to GSdxSources when GSdxFinalSources is built
+    GPU.h
+    GPUDrawScanline.h
+    GPUDrawScanlineCodeGenerator.h
+    GPUDrawingEnvironment.h
+    GPULocalMemory.h
+    GPURenderer.h
+    GPURendererSW.h
+    GPUScanlineEnvironment.h
+    GPUSetupPrimCodeGenerator.h
+    GPUState.h
+    GPUVertex.h
+    GS.h
+    GSAlignedClass.h
+    GSBlock.h
+    GSCapture.h
+    GSClut.h
+    GSCodeBuffer.h
+    GSCrc.h
+    GSDevice.h
+    GSDeviceOGL.h
+    GSDeviceNull.h
+    GSDirtyRect.h
+    GSDrawScanline.h
+    GSDrawScanlineCodeGenerator.h
+    GSDrawingContext.h
+    GSDrawingEnvironment.h
+    GSDump.h
+    GSFunctionMap.h
+    GSLinuxLogo.h
+    GSLocalMemory.h
+    GSPerfMon.h
+    GSRasterizer.h
+    GSRenderer.h
+    GSRendererNull.h
+    GSRendererSW.h
+    GSRendererHW.h
+    GSRendererOGL.h
+    GSScanlineEnvironment.h
+    GSSetting.h
+    GSSetupPrimCodeGenerator.h
+    GSState.h
+    GSTables.h
+    GSTexture.h
+    GSTextureCache.h
+    GSTextureCacheSW.h
+    GSTextureCacheOGL.h
+    GSTextureNull.h
+    GSThread.h
+    GSUtil.h
+    GSVector.h
+    GSVertex.h
+    GSVertexHW.h
+    GSVertexList.h
+    GSVertexSW.h
+    GSVertexTrace.h
+    GSWnd.h
+    GSWndOGL.h
+    GSWndEGL.h
+    GSdx.h
+    res/glsl_source.h
+    stdafx.h
+    xbyak/xbyak.h
+    xbyak/xbyak_bin2hex.h
+    xbyak/xbyak_mnemonic.h
+    xbyak/xbyak_util.h
+    )
+
+set(GSdxFinalSources # sources and headers are handed to the target as one list
+    ${GSdxSources}
+    ${GSdxHeaders}
+)
+
+set(GSdxFinalLibs # the link list starts with X11 ...
+    ${X11_LIBRARIES}
+)
+
+list(APPEND GSdxFinalLibs # ... followed by the other unconditional libraries
+    ${OPENGL_LIBRARIES}
+    ${GTK2_LIBRARIES}
+    ${LIBC_LIBRARIES}
+    ${PNG_LIBRARY}
+)
+
+if(EGL_API AND EGL_FOUND) # same condition that adds -DEGL_SUPPORTED to the flags
+    list(APPEND GSdxFinalLibs
+        ${EGL_LIBRARIES}
+        )
+endif()
+
+if(LIBLZMA_FOUND) # same condition that adds -DLZMA_SUPPORTED to the flags
+    list(APPEND GSdxFinalLibs
+        ${LIBLZMA_LIBRARIES}
+        )
+endif()
+
+# Generate Glsl header file. Protect with REBUILD_SHADER to avoid build-dependency on PERL (VERBATIM keeps argument escaping portable)
+if (REBUILD_SHADER)
+    add_custom_command(OUTPUT res/glsl_source.h COMMAND perl ${CMAKE_SOURCE_DIR}/linux_various/glsl2h.pl VERBATIM)
+endif()
+
+if(BUILTIN_GS) # BUILTIN_GS selects add_pcsx2_lib(); otherwise add_pcsx2_plugin() is used with the same arguments
+    add_pcsx2_lib(${Output} "${GSdxFinalSources}" "${GSdxFinalLibs}" "${GSdxFinalFlags}")
+else()
+    add_pcsx2_plugin(${Output} "${GSdxFinalSources}" "${GSdxFinalLibs}" "${GSdxFinalFlags}")
+endif()
+
+################################### Replay Loader (optional executable: the plugin sources plus linux_replay.cpp)
+if(BUILD_REPLAY_LOADERS)
+    set(Replay pcsx2_GSReplayLoader)
+    set(GSdxReplayLoaderFinalSources
+        ${GSdxFinalSources}
+        linux_replay.cpp
+    )
+    add_pcsx2_executable(${Replay} "${GSdxReplayLoaderFinalSources}" "${GSdxFinalLibs}" "${GSdxFinalFlags}")
+endif()
diff --git a/plugins/GSdx_legacy/GLLoader.cpp b/plugins/GSdx_legacy/GLLoader.cpp
new file mode 100644
index 0000000000..680f19fab1
--- /dev/null
+++ b/plugins/GSdx_legacy/GLLoader.cpp
@@ -0,0 +1,532 @@
+/* *	Copyright (C) 2011-2014 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GLLoader.h"
+#include "GSdx.h"
+
+PFNGLACTIVETEXTUREPROC                 gl_ActiveTexture                    = NULL;
+PFNGLBLENDCOLORPROC                    gl_BlendColor                       = NULL;
+
+PFNGLATTACHSHADERPROC                  glAttachShader                      = NULL;
+PFNGLBINDBUFFERPROC                    glBindBuffer                        = NULL;
+PFNGLBINDBUFFERBASEPROC                glBindBufferBase                    = NULL;
+PFNGLBINDBUFFERRANGEPROC               glBindBufferRange                   = NULL;
+PFNGLBINDFRAMEBUFFERPROC               glBindFramebuffer                   = NULL;
+PFNGLBINDSAMPLERPROC                   glBindSampler                       = NULL;
+PFNGLBINDVERTEXARRAYPROC               glBindVertexArray                   = NULL;
+PFNGLBLENDEQUATIONSEPARATEIARBPROC     glBlendEquationSeparateiARB         = NULL;
+PFNGLBLENDFUNCSEPARATEIARBPROC         glBlendFuncSeparateiARB             = NULL;
+PFNGLBLITFRAMEBUFFERPROC               glBlitFramebuffer                   = NULL;
+PFNGLBUFFERDATAPROC                    glBufferData                        = NULL;
+PFNGLCHECKFRAMEBUFFERSTATUSPROC        glCheckFramebufferStatus            = NULL;
+PFNGLCLEARBUFFERFVPROC                 glClearBufferfv                     = NULL;
+PFNGLCLEARBUFFERIVPROC                 glClearBufferiv                     = NULL;
+PFNGLCLEARBUFFERUIVPROC                glClearBufferuiv                    = NULL;
+PFNGLCOLORMASKIPROC                    glColorMaski                        = NULL;
+PFNGLCOMPILESHADERPROC                 glCompileShader                     = NULL;
+PFNGLCREATEPROGRAMPROC                 glCreateProgram                     = NULL;
+PFNGLCREATESHADERPROC                  glCreateShader                      = NULL;
+PFNGLCREATESHADERPROGRAMVPROC          glCreateShaderProgramv              = NULL;
+PFNGLDELETEBUFFERSPROC                 glDeleteBuffers                     = NULL;
+PFNGLDELETEFRAMEBUFFERSPROC            glDeleteFramebuffers                = NULL;
+PFNGLDELETEPROGRAMPROC                 glDeleteProgram                     = NULL;
+PFNGLDELETESAMPLERSPROC                glDeleteSamplers                    = NULL;
+PFNGLDELETESHADERPROC                  glDeleteShader                      = NULL;
+PFNGLDELETEVERTEXARRAYSPROC            glDeleteVertexArrays                = NULL;
+PFNGLDETACHSHADERPROC                  glDetachShader                      = NULL;
+PFNGLDRAWBUFFERSPROC                   glDrawBuffers                       = NULL;
+PFNGLDRAWELEMENTSBASEVERTEXPROC        glDrawElementsBaseVertex            = NULL;
+PFNGLENABLEVERTEXATTRIBARRAYPROC       glEnableVertexAttribArray           = NULL;
+PFNGLFRAMEBUFFERRENDERBUFFERPROC       glFramebufferRenderbuffer           = NULL;
+PFNGLFRAMEBUFFERTEXTURE2DPROC          glFramebufferTexture2D              = NULL;
+PFNGLGENBUFFERSPROC                    glGenBuffers                        = NULL;
+PFNGLGENFRAMEBUFFERSPROC               glGenFramebuffers                   = NULL;
+PFNGLGENSAMPLERSPROC                   glGenSamplers                       = NULL;
+PFNGLGENVERTEXARRAYSPROC               glGenVertexArrays                   = NULL;
+PFNGLGETBUFFERPARAMETERIVPROC          glGetBufferParameteriv              = NULL;
+PFNGLGETDEBUGMESSAGELOGARBPROC         glGetDebugMessageLogARB             = NULL;
+PFNGLDEBUGMESSAGECALLBACKPROC          glDebugMessageCallback              = NULL;
+PFNGLGETPROGRAMINFOLOGPROC             glGetProgramInfoLog                 = NULL;
+PFNGLGETPROGRAMIVPROC                  glGetProgramiv                      = NULL;
+PFNGLGETSHADERIVPROC                   glGetShaderiv                       = NULL;
+PFNGLGETSTRINGIPROC                    glGetStringi                        = NULL;
+PFNGLISFRAMEBUFFERPROC                 glIsFramebuffer                     = NULL;
+PFNGLLINKPROGRAMPROC                   glLinkProgram                       = NULL;
+PFNGLMAPBUFFERPROC                     glMapBuffer                         = NULL;
+PFNGLMAPBUFFERRANGEPROC                glMapBufferRange                    = NULL;
+PFNGLPROGRAMPARAMETERIPROC             glProgramParameteri                 = NULL;
+PFNGLSAMPLERPARAMETERFPROC             glSamplerParameterf                 = NULL;
+PFNGLSAMPLERPARAMETERIPROC             glSamplerParameteri                 = NULL;
+PFNGLSHADERSOURCEPROC                  glShaderSource                      = NULL;
+PFNGLUNIFORM1IPROC                     glUniform1i                         = NULL;
+PFNGLUNMAPBUFFERPROC                   glUnmapBuffer                       = NULL;
+PFNGLUSEPROGRAMSTAGESPROC              glUseProgramStages                  = NULL;
+PFNGLVERTEXATTRIBIPOINTERPROC          glVertexAttribIPointer              = NULL;
+PFNGLVERTEXATTRIBPOINTERPROC           glVertexAttribPointer               = NULL;
+PFNGLBUFFERSUBDATAPROC                 glBufferSubData                     = NULL;
+PFNGLFENCESYNCPROC                     glFenceSync                         = NULL;
+PFNGLDELETESYNCPROC                    glDeleteSync                        = NULL;
+PFNGLCLIENTWAITSYNCPROC                glClientWaitSync                    = NULL;
+PFNGLFLUSHMAPPEDBUFFERRANGEPROC        glFlushMappedBufferRange            = NULL;
+PFNGLBLENDEQUATIONSEPARATEPROC         glBlendEquationSeparate             = NULL;
+PFNGLBLENDFUNCSEPARATEPROC             glBlendFuncSeparate                 = NULL;
+// Query object
+PFNGLBEGINQUERYPROC                    glBeginQuery                        = NULL;
+PFNGLENDQUERYPROC                      glEndQuery                          = NULL;
+PFNGLGETQUERYIVPROC                    glGetQueryiv                        = NULL;
+PFNGLGETQUERYOBJECTIVPROC              glGetQueryObjectiv                  = NULL;
+PFNGLGETQUERYOBJECTUIVPROC             glGetQueryObjectuiv                 = NULL;
+PFNGLQUERYCOUNTERPROC                  glQueryCounter                      = NULL;
+PFNGLGETQUERYOBJECTI64VPROC            glGetQueryObjecti64v                = NULL;
+PFNGLGETQUERYOBJECTUI64VPROC           glGetQueryObjectui64v               = NULL;
+PFNGLGETINTEGER64VPROC                 glGetInteger64v                     = NULL;
+// GL4.0
+// GL4.1
+PFNGLBINDPROGRAMPIPELINEPROC           glBindProgramPipeline               = NULL;
+PFNGLGENPROGRAMPIPELINESPROC           glGenProgramPipelines               = NULL;
+PFNGLDELETEPROGRAMPIPELINESPROC        glDeleteProgramPipelines            = NULL;
+PFNGLGETPROGRAMPIPELINEIVPROC          glGetProgramPipelineiv              = NULL;
+PFNGLVALIDATEPROGRAMPIPELINEPROC       glValidateProgramPipeline           = NULL;
+PFNGLGETPROGRAMPIPELINEINFOLOGPROC     glGetProgramPipelineInfoLog         = NULL;
+PFNGLGETPROGRAMBINARYPROC              glGetProgramBinary                  = NULL;
+PFNGLVIEWPORTINDEXEDFPROC              glViewportIndexedf                  = NULL;
+PFNGLVIEWPORTINDEXEDFVPROC             glViewportIndexedfv                 = NULL;
+PFNGLSCISSORINDEXEDPROC                glScissorIndexed                    = NULL;
+PFNGLSCISSORINDEXEDVPROC               glScissorIndexedv                   = NULL;
+// NO GL4.1
+PFNGLUSEPROGRAMPROC                    glUseProgram                        = NULL;
+PFNGLGETSHADERINFOLOGPROC              glGetShaderInfoLog                  = NULL;
+PFNGLPROGRAMUNIFORM1IPROC              glProgramUniform1i                  = NULL;
+// GL4.3
+PFNGLCOPYIMAGESUBDATAPROC              glCopyImageSubData                  = NULL;
+PFNGLINVALIDATETEXIMAGEPROC            glInvalidateTexImage                = NULL;
+PFNGLPUSHDEBUGGROUPPROC                glPushDebugGroup                    = NULL;
+PFNGLPOPDEBUGGROUPPROC                 glPopDebugGroup                     = NULL;
+PFNGLDEBUGMESSAGEINSERTPROC            glDebugMessageInsert                = NULL;
+PFNGLDEBUGMESSAGECONTROLPROC           glDebugMessageControl               = NULL;
+// GL4.2
+PFNGLBINDIMAGETEXTUREPROC              glBindImageTexture                  = NULL;
+PFNGLMEMORYBARRIERPROC                 glMemoryBarrier                     = NULL;
+PFNGLTEXSTORAGE2DPROC                  glTexStorage2D                      = NULL;
+// GL4.4
+PFNGLCLEARTEXIMAGEPROC                 glClearTexImage                     = NULL;
+PFNGLBUFFERSTORAGEPROC                 glBufferStorage                     = NULL;
+
+// GL4.5
+PFNGLCREATETEXTURESPROC                glCreateTextures                    = NULL;
+PFNGLTEXTURESTORAGE2DPROC              glTextureStorage2D                  = NULL;
+PFNGLTEXTURESUBIMAGE2DPROC             glTextureSubImage2D                 = NULL;
+PFNGLCOPYTEXTURESUBIMAGE2DPROC         glCopyTextureSubImage2D             = NULL;
+PFNGLBINDTEXTUREUNITPROC               glBindTextureUnit                   = NULL;
+PFNGLGETTEXTUREIMAGEPROC               glGetTextureImage                   = NULL;
+PFNGLTEXTUREPARAMETERIPROC             glTextureParameteri                 = NULL;
+
+PFNGLCREATEFRAMEBUFFERSPROC            glCreateFramebuffers                = NULL;
+PFNGLCLEARNAMEDFRAMEBUFFERFVPROC       glClearNamedFramebufferfv           = NULL;
+PFNGLCLEARNAMEDFRAMEBUFFERIVPROC       glClearNamedFramebufferiv           = NULL;
+PFNGLCLEARNAMEDFRAMEBUFFERUIVPROC      glClearNamedFramebufferuiv          = NULL;
+PFNGLNAMEDFRAMEBUFFERTEXTUREPROC       glNamedFramebufferTexture           = NULL;
+PFNGLNAMEDFRAMEBUFFERDRAWBUFFERSPROC   glNamedFramebufferDrawBuffers       = NULL;
+PFNGLNAMEDFRAMEBUFFERREADBUFFERPROC    glNamedFramebufferReadBuffer        = NULL;
+PFNGLCHECKNAMEDFRAMEBUFFERSTATUSPROC   glCheckNamedFramebufferStatus       = NULL;
+
+PFNGLCREATEBUFFERSPROC                 glCreateBuffers                     = NULL;
+PFNGLNAMEDBUFFERSTORAGEPROC            glNamedBufferStorage                = NULL;
+PFNGLNAMEDBUFFERDATAPROC               glNamedBufferData                   = NULL;
+PFNGLNAMEDBUFFERSUBDATAPROC            glNamedBufferSubData                = NULL;
+PFNGLMAPNAMEDBUFFERPROC                glMapNamedBuffer                    = NULL;
+PFNGLMAPNAMEDBUFFERRANGEPROC           glMapNamedBufferRange               = NULL;
+PFNGLUNMAPNAMEDBUFFERPROC              glUnmapNamedBuffer                  = NULL;
+PFNGLFLUSHMAPPEDNAMEDBUFFERRANGEPROC   glFlushMappedNamedBufferRange       = NULL;
+
+PFNGLCREATESAMPLERSPROC                glCreateSamplers                    = NULL;
+PFNGLCREATEPROGRAMPIPELINESPROC        glCreateProgramPipelines            = NULL;
+
+PFNGLCLIPCONTROLPROC                   glClipControl                       = NULL;
+PFNGLTEXTUREBARRIERPROC                glTextureBarrier                    = NULL;
+
+namespace Emulate_DSA { // bind-then-call stand-ins for the GL 4.5 DSA entry points; Init() installs them
+	// Texture entry points: the storage/upload/copy/read/parameter helpers bind the texture on unit 7 first
+	void APIENTRY BindTextureUnit(GLuint unit, GLuint texture) {
+		gl_ActiveTexture(GL_TEXTURE0 + unit);
+		glBindTexture(GL_TEXTURE_2D, texture);
+	}
+
+	void APIENTRY CreateTexture(GLenum target, GLsizei n, GLuint *textures) {
+		glGenTextures(n, textures); // create all n requested names; target is unused as glGenTextures takes none
+	}
+
+	void APIENTRY TextureStorage(GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height) {
+		BindTextureUnit(7, texture);
+		glTexStorage2D(GL_TEXTURE_2D, levels, internalformat, width, height);
+	}
+
+	void APIENTRY TextureSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels) {
+		BindTextureUnit(7, texture);
+		glTexSubImage2D(GL_TEXTURE_2D, level, xoffset, yoffset, width, height, format, type, pixels);
+	}
+
+	void APIENTRY CopyTextureSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height) {
+		BindTextureUnit(7, texture);
+		glCopyTexSubImage2D(GL_TEXTURE_2D, level, xoffset, yoffset, x, y, width, height);
+	}
+
+	void APIENTRY GetTexureImage(GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels) {
+		BindTextureUnit(7, texture);
+		glGetTexImage(GL_TEXTURE_2D, level, format, type, pixels); // bufSize is not forwarded: glGetTexImage has no size argument
+	}
+
+	void APIENTRY TextureParameteri (GLuint texture, GLenum pname, GLint param) {
+		BindTextureUnit(7, texture);
+		glTexParameteri(GL_TEXTURE_2D, pname, param);
+	}
+
+	// Framebuffer entry points: they bind the framebuffer on the target stored by SetFramebufferTarget()
+	GLenum fb_target = 0;
+	void SetFramebufferTarget(GLenum target) {
+		fb_target = target;
+	}
+
+	void APIENTRY CreateFramebuffers(GLsizei n, GLuint *framebuffers) {
+		glGenFramebuffers(n, framebuffers);
+	}
+
+	void APIENTRY ClearNamedFramebufferfv(GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glClearBufferfv(buffer, drawbuffer, value);
+	}
+
+	void APIENTRY ClearNamedFramebufferiv(GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLint *value) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glClearBufferiv(buffer, drawbuffer, value);
+	}
+
+	void APIENTRY ClearNamedFramebufferuiv(GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLuint *value) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glClearBufferuiv(buffer, drawbuffer, value);
+	}
+
+	void APIENTRY NamedFramebufferTexture(GLuint framebuffer, GLenum attachment, GLuint texture, GLint level) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glFramebufferTexture2D(fb_target, attachment, GL_TEXTURE_2D, texture, level);
+	}
+
+	void APIENTRY NamedFramebufferDrawBuffers(GLuint framebuffer, GLsizei n, const GLenum *bufs) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glDrawBuffers(n, bufs);
+	}
+
+	void APIENTRY NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src) {
+		glBindFramebuffer(fb_target, framebuffer);
+		glReadBuffer(src);
+		glBindFramebuffer(fb_target, 0); // unlike the other helpers, this one rebinds framebuffer 0 afterwards
+	}
+
+	GLenum APIENTRY CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target) {
+		glBindFramebuffer(fb_target, framebuffer); // the target argument is ignored in favour of fb_target
+		return glCheckFramebufferStatus(fb_target);
+	}
+
+	// Buffer entry points: they bind the buffer on the target stored by SetBufferTarget()
+	GLenum buffer_target = 0;
+	void SetBufferTarget(GLenum target) {
+		buffer_target = target;
+	}
+
+	void APIENTRY CreateBuffers(GLsizei n, GLuint *buffers) {
+		glGenBuffers(n, buffers); // create all n requested names
+	}
+
+	void APIENTRY NamedBufferStorage(GLuint buffer, buffer_proc_t size, const void *data, GLbitfield flags) {
+		glBindBuffer(buffer_target, buffer);
+		glBufferStorage(buffer_target, size, data, flags);
+	}
+
+	void APIENTRY NamedBufferData(GLuint buffer, buffer_proc_t size, const void *data, GLenum usage) {
+		glBindBuffer(buffer_target, buffer);
+		glBufferData(buffer_target, size, data, usage);
+	}
+
+	void APIENTRY NamedBufferSubData(GLuint buffer, GLintptr offset, buffer_proc_t size, const void *data) {
+		glBindBuffer(buffer_target, buffer);
+		glBufferSubData(buffer_target, offset, size, data);
+	}
+
+	void *APIENTRY MapNamedBuffer(GLuint buffer, GLenum access) {
+		glBindBuffer(buffer_target, buffer);
+		return glMapBuffer(buffer_target, access);
+	}
+
+	void *APIENTRY MapNamedBufferRange(GLuint buffer, GLintptr offset, buffer_proc_t length, GLbitfield access) {
+		glBindBuffer(buffer_target, buffer);
+		return glMapBufferRange(buffer_target, offset, length, access);
+	}
+
+	GLboolean APIENTRY UnmapNamedBuffer(GLuint buffer) {
+		glBindBuffer(buffer_target, buffer);
+		return glUnmapBuffer(buffer_target);
+	}
+
+	void APIENTRY FlushMappedNamedBufferRange(GLuint buffer, GLintptr offset, buffer_proc_t length) {
+		glBindBuffer(buffer_target, buffer);
+		glFlushMappedBufferRange(buffer_target, offset, length);
+	}
+
+	// Misc entry points (their only purpose is to keep the API consistent; otherwise they are useless)
+	void APIENTRY CreateProgramPipelines(GLsizei n, GLuint *pipelines) {
+		glGenProgramPipelines(n, pipelines);
+	}
+
+	void APIENTRY CreateSamplers(GLsizei n, GLuint *samplers) {
+		glGenSamplers(n, samplers);
+	}
+
+	// Point every DSA function pointer at the emulation helper defined above
+	void Init() {
+		fprintf(stderr, "DSA is not supported. Replacing the GL function pointer to emulate it\n");
+		glBindTextureUnit             = BindTextureUnit;
+		glCreateTextures              = CreateTexture;
+		glTextureStorage2D            = TextureStorage;
+		glTextureSubImage2D           = TextureSubImage;
+		glCopyTextureSubImage2D       = CopyTextureSubImage;
+		glGetTextureImage             = GetTexureImage;
+		glTextureParameteri           = TextureParameteri;
+
+		glCreateFramebuffers          = CreateFramebuffers;
+		glClearNamedFramebufferfv     = ClearNamedFramebufferfv;
+		glClearNamedFramebufferiv     = ClearNamedFramebufferiv;
+		glClearNamedFramebufferuiv    = ClearNamedFramebufferuiv;
+		glNamedFramebufferTexture     = NamedFramebufferTexture;
+		glNamedFramebufferDrawBuffers = NamedFramebufferDrawBuffers;
+		glNamedFramebufferReadBuffer  = NamedFramebufferReadBuffer;
+		glCheckNamedFramebufferStatus = CheckNamedFramebufferStatus;
+
+		glCreateBuffers               = CreateBuffers;
+		glNamedBufferStorage          = NamedBufferStorage;
+		glNamedBufferData             = NamedBufferData;
+		glNamedBufferSubData          = NamedBufferSubData;
+		glMapNamedBuffer              = MapNamedBuffer;
+		glMapNamedBufferRange         = MapNamedBufferRange;
+		glUnmapNamedBuffer            = UnmapNamedBuffer;
+		glFlushMappedNamedBufferRange = FlushMappedNamedBufferRange;
+
+		glCreateProgramPipelines      = CreateProgramPipelines;
+		glCreateSamplers              = CreateSamplers;
+	}
+}
+
+namespace GLLoader {
+
+	bool fglrx_buggy_driver    = false; // set by check_gl_version() when GL_VENDOR contains "ATI" or "Advanced Micro Devices"
+	bool mesa_amd_buggy_driver = false; // set by check_gl_version() for "X.Org", "nouveau" or "VMware" vendors
+	bool nvidia_buggy_driver   = false; // set by check_gl_version() when GL_VENDOR contains "NVIDIA Corporation"
+	bool intel_buggy_driver    = false; // set by check_gl_version() for "Intel" or "VMware" vendors
+	bool in_replayer           = false; // never written in this file; presumably set by the replay loader - TODO confirm
+
+	// Extension availability flags, filled in by check_gl_supported_extension()
+	bool found_geometry_shader = true; // GL3.3 is required, so geometry shaders are assumed present until check_gl_version() finds a buggy driver
+	bool found_GL_EXT_texture_filter_anisotropic = false;
+	bool found_GL_ARB_clear_texture = false; // Miss AMD Mesa (otherwise seems SW)
+	// DX10 GPU limited driver (SW)
+	bool found_GL_ARB_copy_image = false;
+	bool found_GL_ARB_texture_barrier = false;
+	bool found_GL_ARB_clip_control = false;
+	bool found_GL_ARB_direct_state_access = false;
+	bool found_GL_ARB_separate_shader_objects = false; // Issue with Catalyst...
+	bool found_GL_ARB_buffer_storage = false;
+	// DX11 GPU
+	bool found_GL_ARB_draw_buffers_blend = false; // Not supported on AMD R600 (80 nm class chip, HD2900). Nvidia requires FERMI. Intel SB
+	bool found_GL_ARB_gpu_shader5 = false; // Require IvyBridge
+	bool found_GL_ARB_shader_image_load_store = false; // Intel IB. Nvidia/AMD miss Mesa implementation.
+	bool found_GL_ARB_viewport_array = false; // Intel IB. AMD/NVIDIA DX10
+
+	// Mandatory: check_gl_supported_extension() reports failure when one of these stays false
+	bool found_GL_ARB_texture_storage = false;
+	bool found_GL_ARB_shading_language_420pack = false;
+
+	static bool status_and_override(bool& found, const std::string& name, bool mandatory = false)
+	{
+		// Mandatory extensions take no override: complain when missing and return the detection result.
+		if (mandatory) {
+			if (!found)
+				fprintf(stderr, "ERROR: %s is NOT SUPPORTED\n", name.c_str());
+			return found;
+		}
+
+		// Optional extension: report the detected state on stdout.
+		if (found)
+			fprintf(stdout, "INFO: %s is available\n", name.c_str());
+		else
+			fprintf(stdout, "INFO: %s is NOT SUPPORTED\n", name.c_str());
+
+		// The config entry "override_<name>" replaces the detected value unless it reads back as -1.
+		const std::string opt = "override_" + name;
+		const int forced = theApp.GetConfig(opt.c_str(), -1);
+		if (forced != -1) {
+			found = !!forced;
+			fprintf(stderr, "Override %s detection (%s)\n", name.c_str(), found ? "Enabled" : "Disabled");
+		}
+
+		return true; // an optional extension never fails the caller's overall status
+	}
+
+    bool check_gl_version(int major, int minor) {
+		// Logs GPU/vendor/driver details, flags known-buggy drivers, then checks that the context offers at least GL major.minor.
+		const GLubyte* s = glGetString(GL_VERSION);
+		if (s == NULL) {
+			fprintf(stderr, "Error: GLLoader failed to get GL version\n");
+			return false;
+		}
+		const char* driver = strchr((const char*)s, ' '); // the part logged as "Driver" starts after the first space of GL_VERSION
+		driver = (driver != NULL) ? driver + 1 : "";       // no space (or empty string): log nothing instead of reading past the end
+
+		const char* vendor = (const char*)glGetString(GL_VENDOR);
+		if (vendor == NULL) vendor = ""; // keeps the strstr() checks below well-defined
+		fprintf(stdout, "OpenGL information. GPU: %s. Vendor: %s. Driver: %s\n", glGetString(GL_RENDERER), vendor, driver);
+
+		// Name changed but driver is still bad!
+		if (strstr(vendor, "ATI") || strstr(vendor, "Advanced Micro Devices"))
+			fglrx_buggy_driver = true;
+		if (strstr(vendor, "NVIDIA Corporation"))
+			nvidia_buggy_driver = true;
+		if (strstr(vendor, "Intel"))
+			intel_buggy_driver = true;
+		if (strstr(vendor, "X.Org") || strstr(vendor, "nouveau")) // Note: nouveau is lumped in on purpose, bugs are likely to be the same anyway
+			mesa_amd_buggy_driver = true;
+		if (strstr(vendor, "VMware")) // Assume worst case because I don't know the real status
+			mesa_amd_buggy_driver = intel_buggy_driver = true;
+
+		if (mesa_amd_buggy_driver) {
+			fprintf(stderr, "Buggy driver detected. Geometry shaders will be disabled\n");
+			found_geometry_shader = false;
+		}
+		if (theApp.GetConfig("override_geometry_shader", -1) != -1) {
+			found_geometry_shader = !!theApp.GetConfig("override_geometry_shader", -1);
+			fprintf(stderr, "Overriding geometry shaders detection\n");
+		}
+
+		GLint major_gl = 0, minor_gl = 0; // stay 0 if the queries below fail, which then reports the version as unsupported
+		glGetIntegerv(GL_MAJOR_VERSION, &major_gl);
+		glGetIntegerv(GL_MINOR_VERSION, &minor_gl);
+		if ( (major_gl < major) || ( major_gl == major && minor_gl < minor ) ) {
+			fprintf(stderr, "OpenGL %d.%d is not supported. Only OpenGL %d.%d was found\n", major, minor, major_gl, minor_gl);
+			return false;
+		}
+
+		return true;
+    }
+
+	bool check_gl_supported_extension() {
+		// Scans the driver's extension list, applies ini overrides, installs the DSA emulation if needed; false means a mandatory feature is missing.
+		int max_ext = 0;
+		glGetIntegerv(GL_NUM_EXTENSIONS, &max_ext);
+
+		if (glGetStringi && max_ext) {
+			for (GLint i = 0; i < max_ext; i++) {
+				const char* raw = (const char*)glGetStringi(GL_EXTENSIONS, i);
+				string ext(raw ? raw : ""); // a NULL result must not reach the std::string constructor; "" matches no extension
+				// Bonus
+				if (ext.compare("GL_EXT_texture_filter_anisotropic") == 0) found_GL_EXT_texture_filter_anisotropic = true;
+				// GL4.0
+				if (ext.compare("GL_ARB_gpu_shader5") == 0) found_GL_ARB_gpu_shader5 = true;
+				if (ext.compare("GL_ARB_draw_buffers_blend") == 0) found_GL_ARB_draw_buffers_blend = true;
+				// GL4.1
+				if (ext.compare("GL_ARB_viewport_array") == 0) found_GL_ARB_viewport_array = true;
+				if (ext.compare("GL_ARB_separate_shader_objects") == 0) {
+					if (!fglrx_buggy_driver && !mesa_amd_buggy_driver && !intel_buggy_driver) found_GL_ARB_separate_shader_objects = true;
+					else fprintf(stderr, "Buggy driver detected, GL_ARB_separate_shader_objects will be disabled\n"
+#ifdef __linux__
+							"Note the extension will be fixed on Mesa 11.2 or 11.1.2.\n"
+#endif
+							"AMD proprietary driver => https://community.amd.com/thread/194895\n"
+							"If you want to try it, you can set the variable override_GL_ARB_separate_shader_objects to 1 in the ini file\n");
+				}
+				// GL4.2
+				if (ext.compare("GL_ARB_shading_language_420pack") == 0) found_GL_ARB_shading_language_420pack = true;
+				if (ext.compare("GL_ARB_texture_storage") == 0) found_GL_ARB_texture_storage = true;
+				if (ext.compare("GL_ARB_shader_image_load_store") == 0) found_GL_ARB_shader_image_load_store = true;
+				// GL4.3
+				if (ext.compare("GL_ARB_copy_image") == 0) found_GL_ARB_copy_image = true;
+				// GL4.4
+				if (ext.compare("GL_ARB_buffer_storage") == 0) found_GL_ARB_buffer_storage = true;
+				if (ext.compare("GL_ARB_clear_texture") == 0) found_GL_ARB_clear_texture = true;
+				// GL4.5
+				if (ext.compare("GL_ARB_direct_state_access") == 0) found_GL_ARB_direct_state_access = true;
+				if (ext.compare("GL_ARB_clip_control") == 0) found_GL_ARB_clip_control = true;
+				if (ext.compare("GL_ARB_texture_barrier") == 0) found_GL_ARB_texture_barrier = true;
+			}
+		}
+
+		bool status = true;
+		// Report every extension and apply its ini override; only the two mandatory ones can clear status here.
+		// Bonus
+		status &= status_and_override(found_GL_EXT_texture_filter_anisotropic, "GL_EXT_texture_filter_anisotropic");
+		// GL4.0
+		status &= status_and_override(found_GL_ARB_gpu_shader5, "GL_ARB_gpu_shader5");
+		status &= status_and_override(found_GL_ARB_draw_buffers_blend, "GL_ARB_draw_buffers_blend");
+		// GL4.1
+		status &= status_and_override(found_GL_ARB_viewport_array, "GL_ARB_viewport_array");
+		status &= status_and_override(found_GL_ARB_separate_shader_objects, "GL_ARB_separate_shader_objects");
+		// GL4.2
+		status &= status_and_override(found_GL_ARB_shader_image_load_store, "GL_ARB_shader_image_load_store");
+		status &= status_and_override(found_GL_ARB_shading_language_420pack, "GL_ARB_shading_language_420pack", true);
+		status &= status_and_override(found_GL_ARB_texture_storage, "GL_ARB_texture_storage", true);
+		// GL4.3
+		status &= status_and_override(found_GL_ARB_copy_image, "GL_ARB_copy_image");
+		// GL4.4
+		status &= status_and_override(found_GL_ARB_buffer_storage,"GL_ARB_buffer_storage");
+		status &= status_and_override(found_GL_ARB_clear_texture,"GL_ARB_clear_texture");
+		// GL4.5
+		status &= status_and_override(found_GL_ARB_clip_control, "GL_ARB_clip_control");
+		status &= status_and_override(found_GL_ARB_direct_state_access, "GL_ARB_direct_state_access");
+		status &= status_and_override(found_GL_ARB_texture_barrier, "GL_ARB_texture_barrier");
+
+		if (!found_GL_ARB_direct_state_access) {
+			Emulate_DSA::Init();
+		}
+		if (glBindTextureUnit == NULL) {
+			fprintf(stderr, "FATAL ERROR !!!! Failed to setup DSA function pointer!!!\n");
+			status = false;
+		}
+
+		if (!found_GL_ARB_texture_barrier) {
+			fprintf(stderr, "Error GL_ARB_texture_barrier is not supported by your driver. You can't emulate correctly the GS blending unit! Sorry!\n");
+			theApp.SetConfig("accurate_blending_unit", 0);
+			theApp.SetConfig("accurate_date", 0);
+		}
+
+#ifdef _WIN32
+		if (status) {
+			if (intel_buggy_driver) {
+				fprintf(stderr, "OpenGL renderer isn't compatible with SandyBridge/IvyBridge GPU due to issues. Sorry.\n"
+						"Tip:Try it on Linux");
+			}
+			if (fglrx_buggy_driver) {
+				fprintf(stderr, "OpenGL renderer is slow on AMD GPU due to inefficient driver. Sorry.");
+			}
+		}
+#endif
+
+		fprintf(stdout, "\n");
+
+		return status;
+	}
+}
diff --git a/plugins/GSdx_legacy/GLLoader.h b/plugins/GSdx_legacy/GLLoader.h
new file mode 100644
index 0000000000..2d4da9a98c
--- /dev/null
+++ b/plugins/GSdx_legacy/GLLoader.h
@@ -0,0 +1,370 @@
+/*
+ *	Copyright (C) 2011-2014 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#define GL_TEX_LEVEL_0 (0) // named constant 0; presumably a texture mipmap level index -- TODO confirm at call sites
+#define GL_TEX_LEVEL_1 (1) // named constant 1; presumably a texture mipmap level index -- TODO confirm at call sites
+#define GL_FB_DEFAULT  (0) // named constant 0; presumably the default framebuffer name -- TODO confirm at call sites
+#define GL_BUFFER_0    (0) // named constant 0; usage is not visible in this header -- TODO confirm at call sites
+// Fallback value used only when the included GL headers do not define this flag bit.
+#ifndef GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR
+#define GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR  0x00000008
+#endif
+
+// FIX compilation issue with Mesa 10
+// Note it might be possible to do better with the right include
+// in the right order but I don't have time
+#ifndef APIENTRY
+#define APIENTRY // expands to nothing when the GL headers did not provide it
+#endif
+#ifndef APIENTRYP
+#define APIENTRYP APIENTRY * // pointer declarator used by the PFN...PROC typedefs in this header
+#endif
+
+// Mesa glext.h < 20150122 uses GLsizei for BUFFER*PROCs; buffer_proc_t names whichever size type the installed header uses
+#if GL_GLEXT_VERSION < 20150122 // an undefined GL_GLEXT_VERSION evaluates to 0 here and selects GLsizei
+typedef GLsizei buffer_proc_t;
+#else
+typedef GLsizeiptr buffer_proc_t;
+#endif
+
+// Allow compilation with older mesa: supply the debug-callback pointer type when the headers predate GL 4.3
+#ifndef GL_VERSION_4_3
+#define GL_VERSION_4_3 1 // defined so that later GL_VERSION_4_3 checks see the fallback as present
+typedef void (APIENTRYP PFNGLDEBUGMESSAGECALLBACKPROC) (GLDEBUGPROC callback, const void *userParam); // NOTE(review): GLDEBUGPROC is not declared here; it must come from the included GL headers
+#endif
+
+#ifndef GL_ARB_copy_image // fallback declarations for headers that lack ARB_copy_image
+#define GL_ARB_copy_image 1
+#ifdef GL_GLEXT_PROTOTYPES // the direct prototype is only emitted when prototypes were requested
+GLAPI void APIENTRY glCopyImageSubData (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
+#endif /* GL_GLEXT_PROTOTYPES */
+typedef void (APIENTRYP PFNGLCOPYIMAGESUBDATAPROC) (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
+#endif /* GL_ARB_copy_image */
+
+#ifndef GL_VERSION_4_4 // fallback GL 4.4 enums and function-pointer typedefs for older headers
+#define GL_VERSION_4_4 1
+#define GL_MAX_VERTEX_ATTRIB_STRIDE       0x82E5
+#define GL_MAP_PERSISTENT_BIT             0x0040
+#define GL_MAP_COHERENT_BIT               0x0080
+#define GL_DYNAMIC_STORAGE_BIT            0x0100
+#define GL_CLIENT_STORAGE_BIT             0x0200
+#define GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT 0x00004000
+#define GL_BUFFER_IMMUTABLE_STORAGE       0x821F
+#define GL_BUFFER_STORAGE_FLAGS           0x8220
+#define GL_CLEAR_TEXTURE                  0x9365
+#define GL_LOCATION_COMPONENT             0x934A
+#define GL_TRANSFORM_FEEDBACK_BUFFER_INDEX 0x934B
+#define GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE 0x934C
+#define GL_QUERY_BUFFER                   0x9192
+#define GL_QUERY_BUFFER_BARRIER_BIT       0x00008000
+#define GL_QUERY_BUFFER_BINDING           0x9193
+#define GL_QUERY_RESULT_NO_WAIT           0x9194
+#define GL_MIRROR_CLAMP_TO_EDGE           0x8743
+typedef void (APIENTRYP PFNGLBUFFERSTORAGEPROC) (GLenum target, GLsizeiptr size, const void *data, GLbitfield flags); // bound below as glBufferStorage
+typedef void (APIENTRYP PFNGLCLEARTEXIMAGEPROC) (GLuint texture, GLint level, GLenum format, GLenum type, const void *data); // bound below as glClearTexImage
+typedef void (APIENTRYP PFNGLCLEARTEXSUBIMAGEPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *data);
+typedef void (APIENTRYP PFNGLBINDBUFFERSBASEPROC) (GLenum target, GLuint first, GLsizei count, const GLuint *buffers);
+typedef void (APIENTRYP PFNGLBINDBUFFERSRANGEPROC) (GLenum target, GLuint first, GLsizei count, const GLuint *buffers, const GLintptr *offsets, const GLsizeiptr *sizes);
+typedef void (APIENTRYP PFNGLBINDTEXTURESPROC) (GLuint first, GLsizei count, const GLuint *textures);
+typedef void (APIENTRYP PFNGLBINDSAMPLERSPROC) (GLuint first, GLsizei count, const GLuint *samplers);
+typedef void (APIENTRYP PFNGLBINDIMAGETEXTURESPROC) (GLuint first, GLsizei count, const GLuint *textures);
+typedef void (APIENTRYP PFNGLBINDVERTEXBUFFERSPROC) (GLuint first, GLsizei count, const GLuint *buffers, const GLintptr *offsets, const GLsizei *strides);
+#endif /* GL_VERSION_4_4 */
+
+// Fallback GL 4.5 enums and function-pointer typedefs for older headers. Note: trim it (only some of these types are bound to pointers below)
+#ifndef GL_VERSION_4_5
+#define GL_VERSION_4_5 1
+#define GL_CONTEXT_LOST                   0x0507
+#define GL_NEGATIVE_ONE_TO_ONE            0x935E
+#define GL_ZERO_TO_ONE                    0x935F
+#define GL_CLIP_ORIGIN                    0x935C
+#define GL_CLIP_DEPTH_MODE                0x935D
+#define GL_QUERY_WAIT_INVERTED            0x8E17
+#define GL_QUERY_NO_WAIT_INVERTED         0x8E18
+#define GL_QUERY_BY_REGION_WAIT_INVERTED  0x8E19
+#define GL_QUERY_BY_REGION_NO_WAIT_INVERTED 0x8E1A
+#define GL_MAX_CULL_DISTANCES             0x82F9
+#define GL_MAX_COMBINED_CLIP_AND_CULL_DISTANCES 0x82FA
+#define GL_TEXTURE_TARGET                 0x1006
+#define GL_QUERY_TARGET                   0x82EA
+#define GL_TEXTURE_BINDING                0x82EB
+#define GL_GUILTY_CONTEXT_RESET           0x8253
+#define GL_INNOCENT_CONTEXT_RESET         0x8254
+#define GL_UNKNOWN_CONTEXT_RESET          0x8255
+#define GL_RESET_NOTIFICATION_STRATEGY    0x8256
+#define GL_LOSE_CONTEXT_ON_RESET          0x8252
+#define GL_NO_RESET_NOTIFICATION          0x8261
+#define GL_CONTEXT_FLAG_ROBUST_ACCESS_BIT 0x00000004
+#define GL_CONTEXT_RELEASE_BEHAVIOR       0x82FB
+#define GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH 0x82FC
+typedef void (APIENTRYP PFNGLCLIPCONTROLPROC) (GLenum origin, GLenum depth);
+typedef void (APIENTRYP PFNGLCREATEBUFFERSPROC) (GLsizei n, GLuint *buffers);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERSTORAGEPROC) (GLuint buffer, GLsizei size, const void *data, GLbitfield flags); // NOTE(review): current Khronos glext.h declares the size/length of the named-buffer calls as GLsizeiptr; GLsizei here matches buffer_proc_t for old headers -- confirm on 64-bit builds
+typedef void (APIENTRYP PFNGLNAMEDBUFFERDATAPROC) (GLuint buffer, GLsizei size, const void *data, GLenum usage);
+typedef void (APIENTRYP PFNGLNAMEDBUFFERSUBDATAPROC) (GLuint buffer, GLintptr offset, GLsizei size, const void *data);
+typedef void (APIENTRYP PFNGLCOPYNAMEDBUFFERSUBDATAPROC) (GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizei size);
+typedef void (APIENTRYP PFNGLCLEARNAMEDBUFFERDATAPROC) (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data);
+typedef void (APIENTRYP PFNGLCLEARNAMEDBUFFERSUBDATAPROC) (GLuint buffer, GLenum internalformat, GLintptr offset, GLsizei size, GLenum format, GLenum type, const void *data);
+typedef void *(APIENTRYP PFNGLMAPNAMEDBUFFERPROC) (GLuint buffer, GLenum access);
+typedef void *(APIENTRYP PFNGLMAPNAMEDBUFFERRANGEPROC) (GLuint buffer, GLintptr offset, GLsizei length, GLbitfield access);
+typedef GLboolean (APIENTRYP PFNGLUNMAPNAMEDBUFFERPROC) (GLuint buffer);
+typedef void (APIENTRYP PFNGLFLUSHMAPPEDNAMEDBUFFERRANGEPROC) (GLuint buffer, GLintptr offset, GLsizei length);
+typedef void (APIENTRYP PFNGLCREATEFRAMEBUFFERSPROC) (GLsizei n, GLuint *framebuffers);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERRENDERBUFFERPROC) (GLuint framebuffer, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERPARAMETERIPROC) (GLuint framebuffer, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTUREPROC) (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERTEXTURELAYERPROC) (GLuint framebuffer, GLenum attachment, GLuint texture, GLint level, GLint layer);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERDRAWBUFFERPROC) (GLuint framebuffer, GLenum buf);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERDRAWBUFFERSPROC) (GLuint framebuffer, GLsizei n, const GLenum *bufs);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERREADBUFFERPROC) (GLuint framebuffer, GLenum src);
+typedef void (APIENTRYP PFNGLINVALIDATENAMEDFRAMEBUFFERDATAPROC) (GLuint framebuffer, GLsizei numAttachments, const GLenum *attachments);
+typedef void (APIENTRYP PFNGLINVALIDATENAMEDFRAMEBUFFERSUBDATAPROC) (GLuint framebuffer, GLsizei numAttachments, const GLenum *attachments, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLCLEARNAMEDFRAMEBUFFERIVPROC) (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLint *value);
+typedef void (APIENTRYP PFNGLCLEARNAMEDFRAMEBUFFERUIVPROC) (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLuint *value);
+typedef void (APIENTRYP PFNGLCLEARNAMEDFRAMEBUFFERFVPROC) (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value);
+typedef void (APIENTRYP PFNGLCLEARNAMEDFRAMEBUFFERFIPROC) (GLuint framebuffer, GLenum buffer, const GLfloat depth, GLint stencil);
+typedef void (APIENTRYP PFNGLBLITNAMEDFRAMEBUFFERPROC) (GLuint readFramebuffer, GLuint drawFramebuffer, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
+typedef GLenum (APIENTRYP PFNGLCHECKNAMEDFRAMEBUFFERSTATUSPROC) (GLuint framebuffer, GLenum target);
+typedef void (APIENTRYP PFNGLCREATERENDERBUFFERSPROC) (GLsizei n, GLuint *renderbuffers);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEPROC) (GLuint renderbuffer, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLCREATETEXTURESPROC) (GLenum target, GLsizei n, GLuint *textures);
+typedef void (APIENTRYP PFNGLTEXTUREBUFFERPROC) (GLuint texture, GLenum internalformat, GLuint buffer);
+typedef void (APIENTRYP PFNGLTEXTUREBUFFERRANGEPROC) (GLuint texture, GLenum internalformat, GLuint buffer, GLintptr offset, GLsizei size);
+typedef void (APIENTRYP PFNGLTEXTURESTORAGE2DPROC) (GLuint texture, GLsizei levels, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLTEXTURESUBIMAGE2DPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels);
+typedef void (APIENTRYP PFNGLCOMPRESSEDTEXTURESUBIMAGE2DPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *data);
+typedef void (APIENTRYP PFNGLCOPYTEXTURESUBIMAGE2DPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERFPROC) (GLuint texture, GLenum pname, GLfloat param);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERFVPROC) (GLuint texture, GLenum pname, const GLfloat *param);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIPROC) (GLuint texture, GLenum pname, GLint param);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIIVPROC) (GLuint texture, GLenum pname, const GLint *params);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIUIVPROC) (GLuint texture, GLenum pname, const GLuint *params);
+typedef void (APIENTRYP PFNGLTEXTUREPARAMETERIVPROC) (GLuint texture, GLenum pname, const GLint *param);
+typedef void (APIENTRYP PFNGLGENERATETEXTUREMIPMAPPROC) (GLuint texture);
+typedef void (APIENTRYP PFNGLBINDTEXTUREUNITPROC) (GLuint unit, GLuint texture);
+typedef void (APIENTRYP PFNGLCREATEVERTEXARRAYSPROC) (GLsizei n, GLuint *arrays);
+typedef void (APIENTRYP PFNGLDISABLEVERTEXARRAYATTRIBPROC) (GLuint vaobj, GLuint index);
+typedef void (APIENTRYP PFNGLENABLEVERTEXARRAYATTRIBPROC) (GLuint vaobj, GLuint index);
+typedef void (APIENTRYP PFNGLVERTEXARRAYELEMENTBUFFERPROC) (GLuint vaobj, GLuint buffer);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXBUFFERPROC) (GLuint vaobj, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride);
+typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXBUFFERSPROC) (GLuint vaobj, GLuint first, GLsizei count, const GLuint *buffers, const GLintptr *offsets, const GLsizei *strides);
+typedef void (APIENTRYP PFNGLVERTEXARRAYATTRIBBINDINGPROC) (GLuint vaobj, GLuint attribindex, GLuint bindingindex);
+typedef void (APIENTRYP PFNGLVERTEXARRAYATTRIBFORMATPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYATTRIBIFORMATPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYATTRIBLFORMATPROC) (GLuint vaobj, GLuint attribindex, GLint size, GLenum type, GLuint relativeoffset);
+typedef void (APIENTRYP PFNGLVERTEXARRAYBINDINGDIVISORPROC) (GLuint vaobj, GLuint bindingindex, GLuint divisor);
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYIVPROC) (GLuint vaobj, GLenum pname, GLint *param);
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYINDEXEDIVPROC) (GLuint vaobj, GLuint index, GLenum pname, GLint *param);
+typedef void (APIENTRYP PFNGLGETVERTEXARRAYINDEXED64IVPROC) (GLuint vaobj, GLuint index, GLenum pname, GLint64 *param);
+typedef void (APIENTRYP PFNGLCREATESAMPLERSPROC) (GLsizei n, GLuint *samplers);
+typedef void (APIENTRYP PFNGLCREATEPROGRAMPIPELINESPROC) (GLsizei n, GLuint *pipelines);
+typedef void (APIENTRYP PFNGLCREATEQUERIESPROC) (GLenum target, GLsizei n, GLuint *ids);
+typedef void (APIENTRYP PFNGLMEMORYBARRIERBYREGIONPROC) (GLbitfield barriers);
+typedef void (APIENTRYP PFNGLGETTEXTURESUBIMAGEPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, GLsizei bufSize, void *pixels);
+typedef void (APIENTRYP PFNGLGETCOMPRESSEDTEXTURESUBIMAGEPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLsizei bufSize, void *pixels);
+typedef GLenum (APIENTRYP PFNGLGETGRAPHICSRESETSTATUSPROC) (void);
+typedef void (APIENTRYP PFNGLGETNCOMPRESSEDTEXIMAGEPROC) (GLenum target, GLint lod, GLsizei bufSize, void *pixels);
+typedef void (APIENTRYP PFNGLGETNTEXIMAGEPROC) (GLenum target, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels);
+typedef void (APIENTRYP PFNGLGETNUNIFORMDVPROC) (GLuint program, GLint location, GLsizei bufSize, GLdouble *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMFVPROC) (GLuint program, GLint location, GLsizei bufSize, GLfloat *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMIVPROC) (GLuint program, GLint location, GLsizei bufSize, GLint *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMUIVPROC) (GLuint program, GLint location, GLsizei bufSize, GLuint *params);
+typedef void (APIENTRYP PFNGLREADNPIXELSPROC) (GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, GLsizei bufSize, void *data);
+typedef void (APIENTRYP PFNGLTEXTUREBARRIERPROC) (void);
+typedef void (APIENTRYP PFNGLGETTEXTUREIMAGEPROC) (GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels);
+#endif /* GL_VERSION_4_5 */
+
+// Note: glActiveTexture & glBlendColor aren't included in the win GL ABI.
+// (maybe gl.h is outdated, or my setup is wrong)
+// Anyway, let's just keep the mangled function pointer for those 2 functions.
+extern   PFNGLACTIVETEXTUREPROC                 gl_ActiveTexture; // mangled name standing in for glActiveTexture
+extern   PFNGLBLENDCOLORPROC                    gl_BlendColor; // mangled name standing in for glBlendColor
+
+extern   PFNGLATTACHSHADERPROC                  glAttachShader; // this and the following pointers are only declared here; definitions are presumably in GLLoader.cpp -- TODO confirm
+extern   PFNGLBINDBUFFERPROC                    glBindBuffer;
+extern   PFNGLBINDBUFFERBASEPROC                glBindBufferBase;
+extern   PFNGLBINDBUFFERRANGEPROC               glBindBufferRange;
+extern   PFNGLBINDFRAMEBUFFERPROC               glBindFramebuffer;
+extern   PFNGLBINDSAMPLERPROC                   glBindSampler;
+extern   PFNGLBINDVERTEXARRAYPROC               glBindVertexArray;
+extern   PFNGLBLENDEQUATIONSEPARATEIARBPROC     glBlendEquationSeparateiARB;
+extern   PFNGLBLENDFUNCSEPARATEIARBPROC         glBlendFuncSeparateiARB;
+extern   PFNGLBLITFRAMEBUFFERPROC               glBlitFramebuffer;
+extern   PFNGLBUFFERDATAPROC                    glBufferData;
+extern   PFNGLCHECKFRAMEBUFFERSTATUSPROC        glCheckFramebufferStatus;
+extern   PFNGLCLEARBUFFERFVPROC                 glClearBufferfv;
+extern   PFNGLCLEARBUFFERIVPROC                 glClearBufferiv;
+extern   PFNGLCLEARBUFFERUIVPROC                glClearBufferuiv;
+extern   PFNGLCOMPILESHADERPROC                 glCompileShader;
+extern   PFNGLCOLORMASKIPROC                    glColorMaski;
+extern   PFNGLCREATEPROGRAMPROC                 glCreateProgram;
+extern   PFNGLCREATESHADERPROC                  glCreateShader;
+extern   PFNGLCREATESHADERPROGRAMVPROC          glCreateShaderProgramv;
+extern   PFNGLDELETEBUFFERSPROC                 glDeleteBuffers;
+extern   PFNGLDELETEFRAMEBUFFERSPROC            glDeleteFramebuffers;
+extern   PFNGLDELETEPROGRAMPROC                 glDeleteProgram;
+extern   PFNGLDELETESAMPLERSPROC                glDeleteSamplers;
+extern   PFNGLDELETESHADERPROC                  glDeleteShader;
+extern   PFNGLDELETEVERTEXARRAYSPROC            glDeleteVertexArrays;
+extern   PFNGLDETACHSHADERPROC                  glDetachShader;
+extern   PFNGLDRAWBUFFERSPROC                   glDrawBuffers;
+extern   PFNGLDRAWELEMENTSBASEVERTEXPROC        glDrawElementsBaseVertex;
+extern   PFNGLENABLEVERTEXATTRIBARRAYPROC       glEnableVertexAttribArray;
+extern   PFNGLFRAMEBUFFERRENDERBUFFERPROC       glFramebufferRenderbuffer;
+extern   PFNGLFRAMEBUFFERTEXTURE2DPROC          glFramebufferTexture2D;
+extern   PFNGLGENBUFFERSPROC                    glGenBuffers;
+extern   PFNGLGENFRAMEBUFFERSPROC               glGenFramebuffers;
+extern   PFNGLGENSAMPLERSPROC                   glGenSamplers;
+extern   PFNGLGENVERTEXARRAYSPROC               glGenVertexArrays;
+extern   PFNGLGETBUFFERPARAMETERIVPROC          glGetBufferParameteriv;
+extern   PFNGLGETDEBUGMESSAGELOGARBPROC         glGetDebugMessageLogARB;
+extern   PFNGLDEBUGMESSAGECALLBACKPROC          glDebugMessageCallback;
+extern   PFNGLGETPROGRAMINFOLOGPROC             glGetProgramInfoLog;
+extern   PFNGLGETPROGRAMIVPROC                  glGetProgramiv;
+extern   PFNGLGETSHADERIVPROC                   glGetShaderiv;
+extern   PFNGLGETSTRINGIPROC                    glGetStringi;
+extern   PFNGLISFRAMEBUFFERPROC                 glIsFramebuffer;
+extern   PFNGLLINKPROGRAMPROC                   glLinkProgram;
+extern   PFNGLMAPBUFFERPROC                     glMapBuffer;
+extern   PFNGLMAPBUFFERRANGEPROC                glMapBufferRange;
+extern   PFNGLPROGRAMPARAMETERIPROC             glProgramParameteri;
+extern   PFNGLSAMPLERPARAMETERFPROC             glSamplerParameterf;
+extern   PFNGLSAMPLERPARAMETERIPROC             glSamplerParameteri;
+extern   PFNGLSHADERSOURCEPROC                  glShaderSource;
+extern   PFNGLUNIFORM1IPROC                     glUniform1i;
+extern   PFNGLUNMAPBUFFERPROC                   glUnmapBuffer;
+extern   PFNGLUSEPROGRAMSTAGESPROC              glUseProgramStages;
+extern   PFNGLVERTEXATTRIBIPOINTERPROC          glVertexAttribIPointer;
+extern   PFNGLVERTEXATTRIBPOINTERPROC           glVertexAttribPointer;
+extern   PFNGLBUFFERSUBDATAPROC                 glBufferSubData;
+extern   PFNGLFENCESYNCPROC                     glFenceSync;
+extern   PFNGLDELETESYNCPROC                    glDeleteSync;
+extern   PFNGLCLIENTWAITSYNCPROC                glClientWaitSync;
+extern   PFNGLFLUSHMAPPEDBUFFERRANGEPROC        glFlushMappedBufferRange;
+extern   PFNGLBLENDEQUATIONSEPARATEPROC         glBlendEquationSeparate;
+extern   PFNGLBLENDFUNCSEPARATEPROC             glBlendFuncSeparate;
+// Query objects (begin/end, result retrieval, timestamps)
+extern   PFNGLBEGINQUERYPROC                    glBeginQuery;
+extern   PFNGLENDQUERYPROC                      glEndQuery;
+extern   PFNGLGETQUERYIVPROC                    glGetQueryiv;
+extern   PFNGLGETQUERYOBJECTIVPROC              glGetQueryObjectiv;
+extern   PFNGLGETQUERYOBJECTUIVPROC             glGetQueryObjectuiv;
+extern   PFNGLQUERYCOUNTERPROC                  glQueryCounter;
+extern   PFNGLGETQUERYOBJECTI64VPROC            glGetQueryObjecti64v;
+extern   PFNGLGETQUERYOBJECTUI64VPROC           glGetQueryObjectui64v;
+extern   PFNGLGETINTEGER64VPROC                 glGetInteger64v;
+// GL4.0 (no pointer is declared for this version)
+// GL4.1
+extern   PFNGLBINDPROGRAMPIPELINEPROC           glBindProgramPipeline;
+extern   PFNGLDELETEPROGRAMPIPELINESPROC        glDeleteProgramPipelines;
+extern   PFNGLGENPROGRAMPIPELINESPROC           glGenProgramPipelines;
+extern   PFNGLGETPROGRAMPIPELINEIVPROC          glGetProgramPipelineiv;
+extern   PFNGLVALIDATEPROGRAMPIPELINEPROC       glValidateProgramPipeline;
+extern   PFNGLGETPROGRAMPIPELINEINFOLOGPROC     glGetProgramPipelineInfoLog;
+extern   PFNGLGETPROGRAMBINARYPROC              glGetProgramBinary;
+extern   PFNGLVIEWPORTINDEXEDFPROC              glViewportIndexedf;
+extern   PFNGLVIEWPORTINDEXEDFVPROC             glViewportIndexedfv;
+extern   PFNGLSCISSORINDEXEDPROC                glScissorIndexed;
+extern   PFNGLSCISSORINDEXEDVPROC               glScissorIndexedv;
+// NO GL4.1 (presumably the path used when GL 4.1 separate shader objects are unavailable -- TODO confirm)
+extern   PFNGLUSEPROGRAMPROC                    glUseProgram;
+extern   PFNGLGETSHADERINFOLOGPROC              glGetShaderInfoLog;
+extern   PFNGLPROGRAMUNIFORM1IPROC              glProgramUniform1i;
+// GL4.2
+extern   PFNGLBINDIMAGETEXTUREPROC              glBindImageTexture;
+extern   PFNGLMEMORYBARRIERPROC                 glMemoryBarrier;
+extern   PFNGLTEXSTORAGE2DPROC                  glTexStorage2D;
+extern   PFNGLPOPDEBUGGROUPPROC                 glPopDebugGroup; // NOTE(review): part of KHR_debug / GL 4.3 like glPushDebugGroup below, despite sitting in the GL4.2 group
+// GL4.3
+extern   PFNGLCOPYIMAGESUBDATAPROC              glCopyImageSubData;
+extern   PFNGLINVALIDATETEXIMAGEPROC            glInvalidateTexImage;
+extern   PFNGLPUSHDEBUGGROUPPROC                glPushDebugGroup;
+extern   PFNGLDEBUGMESSAGEINSERTPROC            glDebugMessageInsert;
+extern   PFNGLDEBUGMESSAGECONTROLPROC           glDebugMessageControl;
+// GL4.4
+extern   PFNGLCLEARTEXIMAGEPROC                 glClearTexImage;
+extern   PFNGLBUFFERSTORAGEPROC                 glBufferStorage;
+
+// GL4.5 -- NOTE(review): these are presumably filled in by Emulate_DSA::Init() when GL_ARB_direct_state_access is missing; confirm in GLLoader.cpp
+extern PFNGLCREATETEXTURESPROC                  glCreateTextures;
+extern PFNGLTEXTURESTORAGE2DPROC                glTextureStorage2D;
+extern PFNGLTEXTURESUBIMAGE2DPROC               glTextureSubImage2D;
+extern PFNGLCOPYTEXTURESUBIMAGE2DPROC           glCopyTextureSubImage2D;
+extern PFNGLBINDTEXTUREUNITPROC                 glBindTextureUnit; // check_gl_supported_extension() fails when this pointer is still NULL
+extern PFNGLGETTEXTUREIMAGEPROC                 glGetTextureImage;
+extern PFNGLTEXTUREPARAMETERIPROC               glTextureParameteri;
+
+extern PFNGLCREATEFRAMEBUFFERSPROC              glCreateFramebuffers;
+extern PFNGLCLEARNAMEDFRAMEBUFFERFVPROC         glClearNamedFramebufferfv;
+extern PFNGLCLEARNAMEDFRAMEBUFFERIVPROC         glClearNamedFramebufferiv;
+extern PFNGLCLEARNAMEDFRAMEBUFFERUIVPROC        glClearNamedFramebufferuiv;
+extern PFNGLNAMEDFRAMEBUFFERTEXTUREPROC         glNamedFramebufferTexture;
+extern PFNGLNAMEDFRAMEBUFFERDRAWBUFFERSPROC     glNamedFramebufferDrawBuffers;
+extern PFNGLNAMEDFRAMEBUFFERREADBUFFERPROC      glNamedFramebufferReadBuffer;
+extern PFNGLCHECKNAMEDFRAMEBUFFERSTATUSPROC     glCheckNamedFramebufferStatus;
+
+extern PFNGLCREATEBUFFERSPROC                   glCreateBuffers;
+extern PFNGLNAMEDBUFFERSTORAGEPROC              glNamedBufferStorage;
+extern PFNGLNAMEDBUFFERDATAPROC                 glNamedBufferData;
+extern PFNGLNAMEDBUFFERSUBDATAPROC              glNamedBufferSubData;
+extern PFNGLMAPNAMEDBUFFERPROC                  glMapNamedBuffer;
+extern PFNGLMAPNAMEDBUFFERRANGEPROC             glMapNamedBufferRange;
+extern PFNGLUNMAPNAMEDBUFFERPROC                glUnmapNamedBuffer;
+extern PFNGLFLUSHMAPPEDNAMEDBUFFERRANGEPROC     glFlushMappedNamedBufferRange;
+
+extern PFNGLCREATESAMPLERSPROC                  glCreateSamplers;
+extern PFNGLCREATEPROGRAMPIPELINESPROC          glCreateProgramPipelines;
+
+extern PFNGLCLIPCONTROLPROC                     glClipControl;
+extern PFNGLTEXTUREBARRIERPROC                  glTextureBarrier;
+
+namespace Emulate_DSA { // entry points of the direct-state-access fallback
+	void SetFramebufferTarget(GLenum target);
+	void SetBufferTarget(GLenum target);
+	void Init(); // invoked by GLLoader::check_gl_supported_extension() when GL_ARB_direct_state_access was not found
+}
+
+namespace GLLoader {
+	bool check_gl_version(int major, int minor); // presumably true when the context is at least major.minor -- TODO confirm in GLLoader.cpp
+	void init_gl_function(); // presumably resolves the function pointers declared above -- TODO confirm in GLLoader.cpp
+	bool check_gl_supported_extension(); // returns the accumulated per-extension status; also false when glBindTextureUnit is still NULL
+
+	extern bool fglrx_buggy_driver; // when set, a slow-AMD-driver warning is printed on Windows
+	extern bool mesa_amd_buggy_driver;
+	extern bool nvidia_buggy_driver;
+	extern bool intel_buggy_driver; // when set, a SandyBridge/IvyBridge incompatibility warning is printed on Windows
+	extern bool in_replayer;
+
+	// GL extension flags, checked one by one in check_gl_supported_extension()
+	extern bool found_GL_ARB_separate_shader_objects;
+	extern bool found_GL_ARB_copy_image;
+	extern bool found_geometry_shader;
+	extern bool found_GL_ARB_gpu_shader5;
+	extern bool found_GL_ARB_shader_image_load_store;
+	extern bool found_GL_ARB_clear_texture;
+	extern bool found_GL_ARB_buffer_storage;
+	extern bool found_GL_ARB_clip_control;
+	extern bool found_GL_ARB_direct_state_access; // when false, Emulate_DSA::Init() is called instead
+	extern bool found_GL_ARB_texture_barrier; // when false, accurate_blending_unit and accurate_date are forced to 0
+	extern bool found_GL_EXT_texture_filter_anisotropic;
+}
diff --git a/plugins/GSdx_legacy/GLState.cpp b/plugins/GSdx_legacy/GLState.cpp
new file mode 100644
index 0000000000..1ecdb4a41f
--- /dev/null
+++ b/plugins/GSdx_legacy/GLState.cpp
@@ -0,0 +1,97 @@
+/*
+ *	Copyright (C) 2011-2013 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GLState.h"
+
+namespace GLState {
+	// Storage for the state cache declared in GLState.h.
+	// Framebuffer object, viewport and scissor.
+	GLuint fbo;
+	GSVector2i viewport;
+	GSVector4i scissor;
+
+	// Blending.
+	bool blend;
+	uint16 eq_RGB, f_sRGB, f_dRGB;
+	uint8 bf;
+	uint32 wrgba;
+
+	// Depth state.
+	bool depth;
+	GLenum depth_func;
+	bool depth_mask;
+
+	// Stencil state.
+	bool stencil;
+	GLenum stencil_func, stencil_pass;
+
+	// Uniform buffer object and sampler.
+	GLuint ubo;
+	GLuint ps_ss;
+
+	// Render target, depth-stencil and shader input textures.
+	GLuint rt, ds;
+	GLuint tex_unit[4];
+	GLuint64 tex_handle[4];
+
+	// Shader stages and monolith program.
+	GLuint ps, gs, vs, program;
+	bool dirty_prog;
+
+	void Clear() {
+		// Reset the whole cache to its initial values.
+		fbo = 0;
+		viewport = GSVector2i(0, 0);
+		scissor = GSVector4i(0, 0, 0, 0);
+
+		// Blending state; wrgba is the one field that restarts at 0xF.
+		blend = false;
+		eq_RGB = f_sRGB = f_dRGB = 0;
+		bf = 0;
+		wrgba = 0xF;
+
+		// Depth state.
+		depth = depth_mask = false;
+		depth_func = 0;
+
+		// Stencil state.
+		stencil = false;
+		stencil_func = stencil_pass = 0;
+
+		// Buffer, sampler and texture bindings.
+		ubo = 0;
+		ps_ss = 0;
+
+		rt = ds = 0;
+		for (size_t slot = 0; slot < countof(tex_unit); ++slot) {
+			tex_unit[slot] = 0;
+		}
+		for (size_t slot = 0; slot < countof(tex_handle); ++slot) {
+			tex_handle[slot] = 0;
+		}
+
+		// Shader objects; dirty_prog restarts as true.
+		ps = gs = vs = 0;
+		program = 0;
+		dirty_prog = true;
+	}
+}
diff --git a/plugins/GSdx_legacy/GLState.h b/plugins/GSdx_legacy/GLState.h
new file mode 100644
index 0000000000..18931d6cd2
--- /dev/null
+++ b/plugins/GSdx_legacy/GLState.h
@@ -0,0 +1,63 @@
+/*
+ *	Copyright (C) 2011-2013 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSdx.h"
+#include "GSVector.h"
+
+namespace GLState {
+	extern GLuint fbo; // frame buffer object
+	extern GSVector2i viewport; // reset to (0, 0) by Clear()
+	extern GSVector4i scissor; // reset to (0, 0, 0, 0) by Clear()
+
+	extern bool blend;
+	extern uint16 eq_RGB;
+	extern uint16 f_sRGB;
+	extern uint16 f_dRGB;
+	extern uint8 bf;
+	extern uint32 wrgba; // reset to 0xF by Clear(); presumably a per-channel RGBA write mask -- TODO confirm
+
+	extern bool depth;
+	extern GLenum depth_func;
+	extern bool depth_mask;
+
+	extern bool stencil;
+	extern GLenum stencil_func;
+	extern GLenum stencil_pass;
+
+	extern GLuint ubo; // uniform buffer object
+
+	extern GLuint ps_ss; // sampler
+
+	extern GLuint rt; // render target
+	extern GLuint ds; // Depth-Stencil
+	extern GLuint tex_unit[4]; // shader input texture
+	extern GLuint64 tex_handle[4]; // shader input texture
+
+	extern GLuint ps;
+	extern GLuint gs;
+	extern GLuint vs;
+	extern GLuint program; // monolith program (when sso isn't supported)
+	extern bool dirty_prog; // reset to true by Clear()
+
+	extern void Clear(); // resets every variable above; defined in GLState.cpp
+}
diff --git a/plugins/GSdx_legacy/GPU.cpp b/plugins/GSdx_legacy/GPU.cpp
new file mode 100644
index 0000000000..3cb212d336
--- /dev/null
+++ b/plugins/GSdx_legacy/GPU.cpp
@@ -0,0 +1,310 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSUtil.h"
+#include "GPURendererSW.h"
+#include "GSDeviceNull.h"
+
+#ifdef _WIN32
+
+#include "GPUSettingsDlg.h"
+#include "GSDevice9.h"
+#include "GSDevice11.h"
+
+// Result of the CoInitializeEx() call made in GPUopen(). GPUclose() calls
+// CoUninitialize() only while this holds a success code, then resets it to E_FAIL.
+static HRESULT s_hr = E_FAIL;
+
+#endif
+
+// Library type value reported by PSEgetLibType().
+#define PSE_LT_GPU 2
+
+// The active renderer: created by GPUopen(), destroyed by GPUclose(), NULL otherwise.
+static GPURenderer* s_gpu = NULL;
+
+// Exported plugin query: reports the library type, always PSE_LT_GPU.
+EXPORT_C_(uint32) PSEgetLibType()
+{
+	return PSE_LT_GPU;
+}
+
+// Exported plugin query: returns the library name string provided by GSUtil::GetLibName().
+EXPORT_C_(const char*) PSEgetLibName()
+{
+	return GSUtil::GetLibName();
+}
+
+// Exported plugin query: returns the packed library version word.
+// The major number occupies bits 16 and up, the minor number bits 8 and up,
+// and PLUGIN_VERSION is OR-ed into the low bits.
+EXPORT_C_(uint32) PSEgetLibVersion()
+{
+	const uint32 major = 1;
+	const uint32 minor = 1;
+
+	return (major << 16) | (minor << 8) | PLUGIN_VERSION;
+}
+
+// Exported plugin entry point. Currently does nothing and always returns 0;
+// the renderer is created in GPUopen().
+EXPORT_C_(int32) GPUinit()
+{
+	return 0;
+}
+
+// Exported plugin entry point. Currently does nothing and always returns 0;
+// the renderer is destroyed in GPUclose().
+EXPORT_C_(int32) GPUshutdown()
+{
+	return 0;
+}
+
+// Exported plugin entry point: destroys the active renderer (if any) and, on
+// Windows, undoes a successful CoInitializeEx() made by GPUopen().
+// Safe to call when nothing is open. Always returns 0.
+EXPORT_C_(int32) GPUclose()
+{
+	// delete on NULL is a no-op, so no guard is needed
+	delete s_gpu;
+	s_gpu = NULL;
+
+#ifdef _WIN32
+
+	const bool com_initialized = SUCCEEDED(s_hr);
+
+	if(com_initialized)
+	{
+		::CoUninitialize();
+
+		// mark COM as released so a second GPUclose() does not uninitialize again
+		s_hr = E_FAIL;
+	}
+
+#endif
+
+	return 0;
+}
+
+// Exported plugin entry point: (re)creates the renderer for the given window.
+//
+// Any previously opened renderer is closed first. Returns 0 on success and -1
+// when the SSE check, the DirectX check (Windows only) or renderer creation
+// fails. Every failure path after COM initialization goes through GPUclose()
+// so that the CoInitializeEx() call is balanced.
+//
+// hWnd is passed straight through to GPURenderer::Create().
+EXPORT_C_(int32) GPUopen(void* hWnd)
+{
+	GPUclose();
+
+	if(!GSUtil::CheckSSE())
+	{
+		return -1;
+	}
+
+#ifdef _WIN32
+
+	s_hr = ::CoInitializeEx(NULL, COINIT_MULTITHREADED);
+
+	if(!GSUtil::CheckDirectX())
+	{
+		// release COM again, same as the Create() failure path below
+		GPUclose();
+
+		return -1;
+	}
+
+#endif
+
+	int renderer = theApp.GetConfig("Renderer", 1);
+	int threads = theApp.GetConfig("extrathreads", DEFAULT_EXTRA_RENDERING_THREADS);
+
+	// "default" is listed first on purpose: an unknown renderer id falls through
+	// to the first case that is compiled in (case 0 on Windows, case 3 elsewhere).
+	switch(renderer)
+	{
+	default:
+	#ifdef _WIN32
+	case 0: s_gpu = new GPURendererSW(new GSDevice9(), threads); break;
+	case 1: s_gpu = new GPURendererSW(new GSDevice11(), threads); break;
+	#endif
+	case 3: s_gpu = new GPURendererSW(new GSDeviceNull(), threads); break;
+	//case 4: s_gpu = new GPURendererNull(new GSDeviceNull()); break;
+	}
+
+	if(!s_gpu->Create(hWnd))
+	{
+		GPUclose();
+
+		return -1;
+	}
+
+	return 0;
+}
+
+// Exported plugin entry point: shows the settings dialog (Windows only).
+// When the dialog is confirmed with OK the plugin is cycled through
+// GPUshutdown()/GPUinit(). On other platforms this is a no-op.
+// Always returns 0.
+EXPORT_C_(int32) GPUconfigure()
+{
+#ifdef _WIN32
+
+	GPUSettingsDlg dlg;
+
+	if(IDOK == dlg.DoModal())
+	{
+		GPUshutdown();
+		GPUinit();
+	}
+
+#else
+
+    // TODO: linux
+#endif
+
+	return 0;
+}
+
+// Exported plugin entry point. Performs no checks and always returns 0.
+EXPORT_C_(int32) GPUtest()
+{
+	return 0;
+}
+
+// Exported plugin entry point. Not implemented yet (empty body).
+EXPORT_C GPUabout()
+{
+	// TODO
+}
+
+// Exported plugin entry point: forwards a block of data to the renderer.
+// size is passed unchanged to GPURenderer::WriteData(); GPUwriteData() passes 1
+// for a single uint32, so the unit is presumably 32-bit words -- confirm.
+// Dereferences s_gpu without a check, so GPUopen() must have succeeded first.
+EXPORT_C GPUwriteDataMem(const uint8* mem, uint32 size)
+{
+	s_gpu->WriteData(mem, size);
+}
+
+// Exported plugin entry point: forwards a single 32-bit value to the renderer
+// by passing its address with a size of 1.
+// Dereferences s_gpu without a check, so GPUopen() must have succeeded first.
+EXPORT_C GPUwriteData(uint32 data)
+{
+	s_gpu->WriteData((uint8*)&data, 1);
+}
+
+// Exported plugin entry point: asks the renderer to fill mem via
+// GPURenderer::ReadData(); size is passed through unchanged.
+// Dereferences s_gpu without a check, so GPUopen() must have succeeded first.
+EXPORT_C GPUreadDataMem(uint8* mem, uint32 size)
+{
+	s_gpu->ReadData(mem, size);
+}
+
+// Exported plugin entry point: reads a single 32-bit value from the renderer
+// (ReadData() with a size of 1) and returns it. The local is pre-set to 0, so
+// 0 is returned if ReadData() leaves it untouched.
+// Dereferences s_gpu without a check, so GPUopen() must have succeeded first.
+EXPORT_C_(uint32) GPUreadData()
+{
+	uint32 data = 0;
+
+	s_gpu->ReadData((uint8*)&data, 1);
+
+	return data;
+}
+
+// Exported plugin entry point: forwards a status/control word to
+// GPURenderer::WriteStatus(). Requires an open renderer (s_gpu is not checked).
+EXPORT_C GPUwriteStatus(uint32 status)
+{
+	s_gpu->WriteStatus(status);
+}
+
+// Exported plugin entry point: returns GPURenderer::ReadStatus().
+// Requires an open renderer (s_gpu is not checked).
+EXPORT_C_(uint32) GPUreadStatus()
+{
+	return s_gpu->ReadStatus();
+}
+
+// Exported plugin entry point: walks a linked list of packets stored in mem,
+// starting at byte offset addr, and feeds each packet's payload to the renderer.
+//
+// Node layout as read here: the low 24 bits of the 32-bit word at mem[addr]
+// hold the offset of the next node (0xffffff terminates the list), the byte at
+// mem[addr + 3] is the payload size handed to WriteData(), and the payload
+// starts at mem[addr + 4].
+//
+// The walk also stops when it reaches an address recorded in one of the two
+// loop sentinels below, which guards against cyclic lists. Always returns 0.
+EXPORT_C_(uint32) GPUdmaChain(const uint8* mem, uint32 addr)
+{
+	// seen[0] is the previously visited node; seen[1] and seen[2] are the
+	// sentinels updated on backward and forward jumps respectively.
+	uint32 seen[3] = {0xffffffff, 0xffffffff, 0xffffffff};
+
+	for(;;)
+	{
+		const bool revisited = addr == seen[1] || addr == seen[2];
+
+		if(revisited)
+		{
+			break;
+		}
+
+		if(addr < seen[0])
+		{
+			seen[1] = addr;
+		}
+		else
+		{
+			seen[2] = addr;
+		}
+
+		seen[0] = addr;
+
+		const uint8 payload = mem[addr + 3];
+
+		if(payload > 0)
+		{
+			s_gpu->WriteData(&mem[addr + 4], payload);
+		}
+
+		const uint32 header = *(const uint32*)&mem[addr];
+
+		addr = header & 0xffffff;
+
+		if(addr == 0xffffff)
+		{
+			break;
+		}
+	}
+
+	return 0;
+}
+
+// Exported plugin entry point. Not implemented yet; always returns 0.
+EXPORT_C_(uint32) GPUgetMode()
+{
+	// TODO
+
+	return 0;
+}
+
+// Exported plugin entry point. Not implemented yet; mode is ignored.
+EXPORT_C GPUsetMode(uint32 mode)
+{
+	// TODO
+}
+
+// Exported plugin entry point: forwards to GPURenderer::VSync().
+// Requires an open renderer (s_gpu is not checked).
+EXPORT_C GPUupdateLace()
+{
+	s_gpu->VSync();
+}
+
+// Exported plugin entry point: asks the renderer to make a snapshot.
+// NOTE(review): the hard-coded "c:/" argument is a Windows-style path and is
+// also used on non-Windows builds -- see the TODO below.
+// Requires an open renderer (s_gpu is not checked).
+EXPORT_C GPUmakeSnapshot()
+{
+	s_gpu->MakeSnapshot("c:/"); // TODO
+}
+
+// Exported plugin entry point. Not implemented yet; text is ignored.
+EXPORT_C GPUdisplayText(char* text)
+{
+	// TODO
+}
+
+// Exported plugin entry point. Not implemented yet; flags is ignored.
+EXPORT_C GPUdisplayFlags(uint32 flags)
+{
+	// TODO
+}
+
+// Exported plugin entry point: save-state handling.
+//
+//   type 0: restore the renderer state from *data (Defrost)
+//   type 1: store the renderer state into *data (Freeze)
+//   type 2: slot selection; data is reinterpreted as a pointer to an int
+//           holding a zero-based slot index (valid slots are 1..9 after the +1)
+//
+// Returns 1 on success and 0 on failure (NULL data, unsupported version for
+// types 0/1, slot out of range for type 2, or unknown type).
+//
+// The slot request is handled before the version check because in that case
+// data does not point to a GPUFreezeData. Checking data->version first would
+// read the slot index as the version and reject every index except 1.
+EXPORT_C_(int32) GPUfreeze(uint32 type, GPUFreezeData* data)
+{
+	if(!data)
+	{
+		return 0;
+	}
+
+	if(type == 2)
+	{
+		int slot = *(int*)data + 1;
+
+		if(slot < 1 || slot > 9)
+		{
+			return 0;
+		}
+
+		// TODO
+
+		return 1;
+	}
+
+	// types 0 and 1 operate on a real GPUFreezeData, which must be version 1
+	if(data->version != 1)
+	{
+		return 0;
+	}
+
+	if(type == 0)
+	{
+		s_gpu->Defrost(data);
+
+		return 1;
+	}
+	else if(type == 1)
+	{
+		s_gpu->Freeze(data);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+// Exported plugin entry point. Not implemented yet; mem is left untouched.
+EXPORT_C GPUgetScreenPic(uint8* mem)
+{
+	// TODO
+}
+
+// Exported plugin entry point. Not implemented yet; mem is ignored.
+EXPORT_C GPUshowScreenPic(uint8* mem)
+{
+	// TODO
+}
+
+// Exported plugin entry point. Not implemented yet; all arguments are ignored.
+EXPORT_C GPUcursor(int player, int x, int y)
+{
+	// TODO
+}
diff --git a/plugins/GSdx_legacy/GPU.h b/plugins/GSdx_legacy/GPU.h
new file mode 100644
index 0000000000..18342ea3d6
--- /dev/null
+++ b/plugins/GSdx_legacy/GPU.h
@@ -0,0 +1,276 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#pragma pack(push, 1)
+
+#include "GS.h"
+
+// Primitive type codes.
+// NOTE(review): presumably the values carried by the 3-bit TYPE field of the
+// PACKET/PRIM/POLYGON/LINE/SPRITE layouts below -- confirm against the users.
+enum
+{
+	GPU_POLYGON = 1,
+	GPU_LINE = 2,
+	GPU_SPRITE = 3,
+};
+
+// STATUS layout of a GPU register word; the field widths add up to 32 bits.
+// The first 13 bits repeat the MODE (TX..DFE) and MASK (MD, ME) fields, and
+// WIDTH1/WIDTH0/HEIGHT/ISPAL/ISRGB24/ISINTER share names with DMODE.
+// The commented-out block is an alternative naming of the same 32 bits, kept
+// for reference.
+// NOTE(review): REG32_/REG_END are defined outside this file (GS.h is the only
+// include); this assumes they wrap the fields in a 32-bit bitfield struct.
+REG32_(GPUReg, STATUS)
+	uint32 TX:4;
+	uint32 TY:1;
+	uint32 ABR:2;
+	uint32 TP:2;
+	uint32 DTD:1;
+	uint32 DFE:1;
+	uint32 MD:1;
+	uint32 ME:1;
+	uint32 _PAD0:3;
+	uint32 WIDTH1:1;
+	uint32 WIDTH0:2;
+	uint32 HEIGHT:1;
+	uint32 ISPAL:1;
+	uint32 ISRGB24:1;
+	uint32 ISINTER:1;
+	uint32 DEN:1;
+	uint32 _PAD1:2;
+	uint32 IDLE:1;
+	uint32 IMG:1;
+	uint32 COM:1;
+	uint32 DMA:2;
+	uint32 LCF:1;
+	/*
+	uint32 TX:4;
+	uint32 TY:1;
+	uint32 ABR:2;
+	uint32 TP:2;
+	uint32 DTD:1;
+	uint32 DFE:1;
+	uint32 PBW:1;
+	uint32 PBC:1;
+	uint32 _PAD0:3;
+	uint32 HRES2:1;
+	uint32 HRES1:2;
+	uint32 VRES:1;
+	uint32 ISPAL:1;
+	uint32 ISRGB24:1;
+	uint32 ISINTER:1;
+	uint32 ISSTOP:1;
+	uint32 _PAD1:1;
+	uint32 DMARDY:1;
+	uint32 IDIDLE:1;
+	uint32 DATARDY:1;
+	uint32 ISEMPTY:1;
+	uint32 TMODE:2;
+	uint32 ODE:1;
+	*/
+REG_END
+
+// PACKET layout: generic view of a command word with a 24-bit payload, a 5-bit
+// OPTION and a 3-bit TYPE (24 + 5 + 3 = 32 bits).
+REG32_(GPUReg, PACKET)
+	uint32 _PAD:24;
+	uint32 OPTION:5;
+	uint32 TYPE:3;
+REG_END
+
+// PRIM layout: 24-bit VTX payload followed by single-bit flags and the 3-bit
+// TYPE; the widths add up to 32 bits. The flag bits line up with the OPTION
+// field of PACKET.
+REG32_(GPUReg, PRIM)
+	uint32 VTX:24;
+	uint32 TGE:1;
+	uint32 ABE:1;
+	uint32 TME:1;
+	uint32 _PAD2:1;
+	uint32 IIP:1;
+	uint32 TYPE:3;
+REG_END
+
+// POLYGON layout: same bit positions as PRIM, but the low 24 bits are unnamed
+// and the bit PRIM leaves as padding is named VTX here (total 32 bits).
+REG32_(GPUReg, POLYGON)
+	uint32 _PAD:24;
+	uint32 TGE:1;
+	uint32 ABE:1;
+	uint32 TME:1;
+	uint32 VTX:1;
+	uint32 IIP:1;
+	uint32 TYPE:3;
+REG_END
+
+// LINE layout: option bits of a command word as named for lines (total 32 bits).
+// NOTE(review): ZERO1/ZERO2 are presumably bits expected to be 0 -- confirm.
+REG32_(GPUReg, LINE)
+	uint32 _PAD:24;
+	uint32 ZERO1:1;
+	uint32 ABE:1;
+	uint32 ZERO2:1;
+	uint32 PLL:1;
+	uint32 IIP:1;
+	uint32 TYPE:3;
+REG_END
+
+// SPRITE layout: option bits of a command word as named for sprites; SIZE is a
+// 2-bit field (24 + 1 + 1 + 1 + 2 + 3 = 32 bits).
+REG32_(GPUReg, SPRITE)
+	uint32 _PAD:24;
+	uint32 ZERO:1;
+	uint32 ABE:1;
+	uint32 TME:1;
+	uint32 SIZE:2;
+	uint32 TYPE:3;
+REG_END
+
+// RESET layout: no named fields, all 32 bits are padding.
+REG32_(GPUReg, RESET)
+	uint32 _PAD:32;
+REG_END
+
+// DEN layout: a single 1-bit DEN flag in the first bit (1 + 31 = 32 bits).
+REG32_(GPUReg, DEN)
+	uint32 DEN:1;
+	uint32 _PAD:31;
+REG_END
+
+// DMA layout: a 2-bit DMA field in the first bits (2 + 30 = 32 bits).
+REG32_(GPUReg, DMA)
+	uint32 DMA:2;
+	uint32 _PAD:30;
+REG_END
+
+// DAREA layout: 10-bit X and 9-bit Y (10 + 9 + 13 = 32 bits).
+// NOTE(review): presumably the display area origin -- confirm.
+REG32_(GPUReg, DAREA)
+	uint32 X:10;
+	uint32 Y:9;
+	uint32 _PAD:13;
+REG_END
+
+// DHRANGE layout: two 12-bit values X1 and X2 (12 + 12 + 8 = 32 bits).
+// NOTE(review): presumably the horizontal display range -- confirm.
+REG32_(GPUReg, DHRANGE)
+	uint32 X1:12;
+	uint32 X2:12;
+	uint32 _PAD:8;
+REG_END
+
+// DVRANGE layout: 10-bit Y1 and 11-bit Y2 (10 + 11 + 11 = 32 bits).
+// NOTE(review): presumably the vertical display range; the asymmetric 10/11
+// widths differ from the symmetric X1/X2 of DHRANGE -- confirm this is intended.
+REG32_(GPUReg, DVRANGE)
+	uint32 Y1:10;
+	uint32 Y2:11;
+	uint32 _PAD:11;
+REG_END
+
+// DMODE layout: display mode flags in the first 8 bits (total 32 bits).
+// The fields WIDTH0, HEIGHT, ISPAL, ISRGB24, ISINTER and WIDTH1 share their
+// names with fields of STATUS, although WIDTH1 sits at a different position here.
+REG32_(GPUReg, DMODE)
+	uint32 WIDTH0:2;
+	uint32 HEIGHT:1;
+	uint32 ISPAL:1;
+	uint32 ISRGB24:1;
+	uint32 ISINTER:1;
+	uint32 WIDTH1:1;
+	uint32 REVERSE:1;
+	uint32 _PAD:24;
+REG_END
+
+// GPUINFO layout: a 24-bit PARAM value (24 + 8 = 32 bits).
+REG32_(GPUReg, GPUINFO)
+	uint32 PARAM:24;
+	uint32 _PAD:8;
+REG_END
+
+// MODE layout: TX, TY, ABR, TP, DTD and DFE in the first 11 bits, at the same
+// positions and widths as the first fields of STATUS (total 32 bits).
+REG32_(GPUReg, MODE)
+	uint32 TX:4;
+	uint32 TY:1;
+	uint32 ABR:2;
+	uint32 TP:2;
+	uint32 DTD:1;
+	uint32 DFE:1;
+	uint32 _PAD:21;
+REG_END
+
+// MASK layout: the two 1-bit flags MD and ME (1 + 1 + 30 = 32 bits); the same
+// names appear as single bits in STATUS.
+REG32_(GPUReg, MASK)
+	uint32 MD:1;
+	uint32 ME:1;
+	uint32 _PAD:30;
+REG_END
+
+// DRAREA layout: 10-bit X and 10-bit Y (10 + 10 + 12 = 32 bits).
+// NOTE(review): presumably a drawing-area corner -- confirm.
+REG32_(GPUReg, DRAREA)
+	uint32 X:10;
+	uint32 Y:10;
+	uint32 _PAD:12;
+REG_END
+
+// DROFF layout: signed 11-bit X and Y (int32 bitfields, 11 + 11 + 10 = 32 bits).
+// NOTE(review): presumably the drawing offset -- confirm.
+REG32_(GPUReg, DROFF)
+	int32 X:11;
+	int32 Y:11;
+	int32 _PAD:10;
+REG_END
+
+// RGB layout: three 8-bit channels R, G, B followed by 8 unused bits.
+REG32_(GPUReg, RGB)
+	uint32 R:8;
+	uint32 G:8;
+	uint32 B:8;
+	uint32 _PAD:8;
+REG_END
+
+// XY layout: signed 11-bit X and Y, each followed by 5 padding bits so that
+// each coordinate starts on a 16-bit boundary (11 + 5 + 11 + 5 = 32 bits).
+REG32_(GPUReg, XY)
+	int32 X:11;
+	int32 _PAD1:5;
+	int32 Y:11;
+	int32 _PAD2:5;
+REG_END
+
+// UV layout: 8-bit U and V in the first 16 bits (8 + 8 + 16 = 32 bits).
+REG32_(GPUReg, UV)
+	uint32 U:8;
+	uint32 V:8;
+	uint32 _PAD:16;
+REG_END
+
+// TWIN layout: four 5-bit fields TWW, TWH, TWX, TWY (4 * 5 + 12 = 32 bits).
+// NOTE(review): presumably the texture window mask (TWW/TWH) and offset
+// (TWX/TWY) -- confirm against the texture window setup code.
+REG32_(GPUReg, TWIN)
+	uint32 TWW:5;
+	uint32 TWH:5;
+	uint32 TWX:5;
+	uint32 TWY:5;
+	uint32 _PAD:12;
+REG_END
+
+// CLUT layout: 6-bit X and 9-bit Y stored after 16 padding bits
+// (16 + 6 + 9 + 1 = 32 bits).
+REG32_(GPUReg, CLUT)
+	uint32 _PAD1:16;
+	uint32 X:6;
+	uint32 Y:9;
+	uint32 _PAD2:1;
+REG_END
+
+// GPUReg: one member per register layout declared above.
+// NOTE(review): REG32_SET/REG_SET_END are defined outside this file; presumably
+// they wrap these members in a union over one 32-bit word so the same value
+// can be viewed through any layout -- confirm in GS.h.
+REG32_SET(GPUReg)
+	GPURegSTATUS STATUS;
+	GPURegPACKET PACKET;
+	GPURegPRIM PRIM;
+	GPURegPOLYGON POLYGON;
+	GPURegLINE LINE;
+	GPURegSPRITE SPRITE;
+	GPURegRESET RESET;
+	GPURegDEN DEN;
+	GPURegDMA DMA;
+	GPURegDAREA DAREA;
+	GPURegDHRANGE DHRANGE;
+	GPURegDVRANGE DVRANGE;
+	GPURegDMODE DMODE;
+	GPURegGPUINFO GPUINFO;
+	GPURegMODE MODE;
+	GPURegMASK MASK;
+	GPURegDRAREA DRAREA;
+	GPURegDROFF DROFF;
+	GPURegRGB RGB;
+	GPURegXY XY;
+	GPURegUV UV;
+	GPURegTWIN TWIN;
+	GPURegCLUT CLUT;
+REG_SET_END
+
+// Save-state blob exchanged through GPUfreeze(). Declared inside the
+// #pragma pack(push, 1) region of this header, so the members are laid out
+// without padding; version is the first member.
+struct GPUFreezeData
+{
+	uint32 version; // == 1
+	uint32 status;
+	uint32 control[256];
+	uint16 vram[1024 * 1024]; // 1024 * 1024 16-bit entries (2 MiB)
+};
+
+#pragma pack(pop)
+
diff --git a/plugins/GSdx_legacy/GPUDrawScanline.cpp b/plugins/GSdx_legacy/GPUDrawScanline.cpp
new file mode 100644
index 0000000000..4159fd9d93
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUDrawScanline.cpp
@@ -0,0 +1,495 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GPUDrawScanline.h"
+
+// Constructs the two function maps (setup-prim and draw-scanline), handing
+// each the address of m_local, then zero-fills m_local and points its gd
+// member at this object's m_global.
+GPUDrawScanline::GPUDrawScanline()
+	: m_sp_map("GPUSetupPrim", &m_local)
+	, m_ds_map("GPUDrawScanline", &m_local)
+{
+	memset(&m_local, 0, sizeof(m_local));
+
+	m_local.gd = &m_global;
+}
+
+// Nothing to release explicitly; members clean up through their own destructors.
+GPUDrawScanline::~GPUDrawScanline()
+{
+}
+
+// Prepares this object for a batch of primitives.
+//
+// data must point to a GPUDrawScanline::SharedData. Its global block is copied
+// into m_global, the texture window constants are derived when texturing with
+// a window is selected, and the draw-scanline / setup-prim functions matching
+// the selector are fetched from the function maps.
+void GPUDrawScanline::BeginDraw(const GSRasterizerData* data)
+{
+	const SharedData* shared = (const SharedData*)data;
+
+	memcpy(&m_global, &shared->global, sizeof(m_global));
+
+	const bool windowed = m_global.sel.tme && m_global.sel.twin;
+
+	if(windowed)
+	{
+		// twin[0]: AND masks, replicated into both 16-bit halves
+
+		uint32 mask_u = ~(m_global.twin.x << 3) & 0xff; // TWW
+		uint32 mask_v = ~(m_global.twin.y << 3) & 0xff; // TWH
+
+		m_local.twin[0].u = GSVector4i((mask_u << 16) | mask_u);
+		m_local.twin[0].v = GSVector4i((mask_v << 16) | mask_v);
+
+		// twin[1]: offsets, limited to the bits cleared in the masks above
+
+		uint32 offset_u = m_global.twin.z << 3; // TWX
+		uint32 offset_v = m_global.twin.w << 3; // TWY
+
+		m_local.twin[1].u = GSVector4i((offset_u << 16) | offset_u) & ~m_local.twin[0].u;
+		m_local.twin[1].v = GSVector4i((offset_v << 16) | offset_v) & ~m_local.twin[0].v;
+	}
+
+	// the scanline function is looked up with the complete selector
+
+	m_ds = m_ds_map[m_global.sel];
+
+	m_de = NULL;
+
+	m_dr = NULL; // TODO
+
+	// the setup function depends on four selector fields only, so everything
+	// else is cleared => less functions generated
+
+	GPUScanlineSelector setup_sel;
+
+	setup_sel.key = 0;
+
+	setup_sel.iip = m_global.sel.iip;
+	setup_sel.tfx = m_global.sel.tfx;
+	setup_sel.twin = m_global.sel.twin;
+	setup_sel.sprite = m_global.sel.sprite;
+
+	m_sp = m_sp_map[setup_sel];
+}
+
+// Ends a batch: forwards the timing and pixel counters unchanged to the
+// draw-scanline function map's statistics.
+void GPUDrawScanline::EndDraw(uint64 frame, uint64 ticks, int actual, int total)
+{
+	m_ds_map.UpdateStats(frame, ticks, actual, total);
+}
+
+#ifndef ENABLE_JIT_RASTERIZER
+
+// C++ fallback for the generated setup-prim code (compiled only when
+// ENABLE_JIT_RASTERIZER is not defined).
+//
+// Fills m_local with the per-primitive constants used by DrawScanline():
+//  - twin[2]: texture coordinate clamp limits (textured, no texture window)
+//  - d8.st / d.s / d.t: texture coordinate steps (textured)
+//  - d8.c / d.r / d.g / d.b: color steps (gouraud shaded, tfx != 3)
+//
+// vertex/index describe the primitive; only vertex[index[1]] is read, and only
+// for sprites. dscan holds the per-pixel deltas along a scanline.
+void GPUDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan)
+{
+	GPUScanlineSelector sel = m_global.sel;
+
+	const GSVector4* scale = GPUSetupPrimCodeGenerator::m_shift;
+
+	const bool textured = sel.tme;
+	const bool shaded = sel.iip && sel.tfx != 3;
+
+	if(textured && !sel.twin)
+	{
+		if(!sel.sprite)
+		{
+			// TODO: not really needed
+
+			m_local.twin[2].u = GSVector4i::x00ff();
+			m_local.twin[2].v = GSVector4i::x00ff();
+		}
+		else
+		{
+			// clamp limit derived from the sprite's second vertex
+
+			GSVector4i limit = (GSVector4i(vertex[index[1]].t) >> 8) - GSVector4i::x00000001();
+
+			limit = limit.ps32(limit);
+			limit = limit.upl16(limit);
+
+			m_local.twin[2].u = limit.xxxx();
+			m_local.twin[2].v = limit.yyyy();
+		}
+	}
+
+	if(!textured && !shaded)
+	{
+		return;
+	}
+
+	GSVector4 dt = dscan.t;
+	GSVector4 dc = dscan.c;
+
+	// texture steps end up in the low half, color steps in the high half
+
+	GSVector4i step8 = GSVector4i(dt * scale[0]).ps32(GSVector4i(dc * scale[0]));
+
+	if(textured)
+	{
+		m_local.d8.st = step8.upl16(step8);
+
+		GSVector4 ds = dt.xxxx();
+		GSVector4 dv = dt.yyyy();
+
+		m_local.d.s = GSVector4i(ds * scale[1]).ps32(GSVector4i(ds * scale[2]));
+		m_local.d.t = GSVector4i(dv * scale[1]).ps32(GSVector4i(dv * scale[2]));
+	}
+
+	if(shaded)
+	{
+		m_local.d8.c = step8.uph16(step8);
+
+		GSVector4 dr = dc.xxxx();
+		GSVector4 dg = dc.yyyy();
+		GSVector4 db = dc.zzzz();
+
+		m_local.d.r = GSVector4i(dr * scale[1]).ps32(GSVector4i(dr * scale[2]));
+		m_local.d.g = GSVector4i(dg * scale[1]).ps32(GSVector4i(dg * scale[2]));
+		m_local.d.b = GSVector4i(db * scale[1]).ps32(GSVector4i(db * scale[2]));
+	}
+}
+
+// C++ fallback for the generated draw-scanline code (compiled only when
+// ENABLE_JIT_RASTERIZER is not defined). The author's own note below marks it
+// as untested.
+//
+// Draws `pixels` pixels of one scanline starting at (left, top) into the
+// 16-bit frame buffer at m_global.vm, eight pixels per loop iteration. Each
+// iteration runs these stages, selected by m_global.sel: mask test, texture
+// sampling, color function, alpha blending, dithering and the frame write.
+// scan supplies the starting texture coordinates (scan.t) and color (scan.c).
+void GPUDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	// TODO: not tested yet, probably bogus
+
+	GPUScanlineSelector sel = m_global.sel;
+
+	GSVector4i s, t;
+	GSVector4i uf, vf;
+	GSVector4i rf, gf, bf;
+	GSVector4i dither;
+
+	// Init
+
+	// one frame buffer row is (1024 << scalex) 16-bit pixels wide
+	uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left;
+
+	// pixels remaining after the current group of 8; <= 0 on the last group
+	int steps = pixels - 8;
+
+	if(sel.dtd)
+	{
+		dither = GSVector4i::load<false>(&GPUDrawScanlineCodeGenerator::m_dither[top & 3][left & 3]);
+	}
+
+	if(sel.tme)
+	{
+		GSVector4i vt = GSVector4i(scan.t).xxzzl();
+
+		s = vt.xxxx().add16(m_local.d.s);
+		t = vt.yyyy();
+
+		if(!sel.sprite)
+		{
+			t = t.add16(m_local.d.t);
+		}
+		else
+		{
+			if(sel.ltf)
+			{
+				vf = t.sll16(1).srl16(1);
+			}
+		}
+	}
+
+	if(sel.tfx != 3)
+	{
+		GSVector4i vc = GSVector4i(scan.c).xxzzlh();
+
+		rf = vc.xxxx();
+		gf = vc.yyyy();
+		bf = vc.zzzz();
+
+		if(sel.iip)
+		{
+			rf = rf.add16(m_local.d.r);
+			gf = gf.add16(m_local.d.g);
+			bf = bf.add16(m_local.d.b);
+		}
+	}
+
+	while(1)
+	{
+		// do { ... } while(0) lets "continue" skip the rest of the pixel work
+		// and go straight to the stepping code below
+		do
+		{
+			// (steps & (steps >> 31)) is steps when it is negative and 0 otherwise,
+			// so m_test[7] is used for full groups and a lower entry for the last,
+			// partial group
+			GSVector4i test = GPUDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
+
+			GSVector4i fd = GSVector4i::load(fb, fb + 8);
+
+			GSVector4i r, g, b, a;
+
+			// TestMask
+
+			if(sel.me)
+			{
+				test |= fd.sra16(15);
+
+				if(test.alltrue()) continue;
+			}
+
+			// SampleTexture
+
+			if(sel.tme)
+			{
+				GSVector4i u0, v0, u1, v1;
+				GSVector4i addr00, addr01, addr10, addr11;
+				GSVector4i c00, c01, c10, c11;
+
+				if(sel.ltf)
+				{
+					u0 = s.sub16(GSVector4i(0x00200020)); // - 0.125f
+					v0 = t.sub16(GSVector4i(0x00200020)); // - 0.125f
+
+					uf = u0.sll16(8).srl16(1);
+					vf = v0.sll16(8).srl16(1);;
+				}
+				else
+				{
+					u0 = s;
+					v0 = t;
+				}
+				
+				u0 = u0.srl16(8);
+				v0 = v0.srl16(8);
+
+				if(sel.ltf)
+				{
+					// four texels are fetched and interpolated with uf/vf
+
+					u1 = u0.add16(GSVector4i::x0001());
+					v1 = v0.add16(GSVector4i::x0001());
+
+					if(sel.twin)
+					{
+						u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
+						v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
+						u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
+						v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);
+					}
+					else
+					{
+						u0 = u0.min_i16(m_local.twin[2].u);
+						v0 = v0.min_i16(m_local.twin[2].v);
+						u1 = u1.min_i16(m_local.twin[2].u);
+						v1 = v1.min_i16(m_local.twin[2].v);
+					}
+
+					addr00 = v0.sll16(8) | u0;
+					addr01 = v0.sll16(8) | u1;
+					addr10 = v1.sll16(8) | u0;
+					addr11 = v1.sll16(8) | u1;
+
+					// TODO
+
+					if(sel.tlu)
+					{
+						c00 = addr00.gather16_16((const uint16*)m_global.vm, m_global.clut);
+						c01 = addr01.gather16_16((const uint16*)m_global.vm, m_global.clut);
+						c10 = addr10.gather16_16((const uint16*)m_global.vm, m_global.clut);
+						c11 = addr11.gather16_16((const uint16*)m_global.vm, m_global.clut);
+					}
+					else
+					{
+						c00 = addr00.gather16_16((const uint16*)m_global.vm);
+						c01 = addr01.gather16_16((const uint16*)m_global.vm);
+						c10 = addr10.gather16_16((const uint16*)m_global.vm);
+						c11 = addr11.gather16_16((const uint16*)m_global.vm);
+					}
+
+					GSVector4i r00 = c00.sll16(11).srl16(8);
+					GSVector4i r01 = c01.sll16(11).srl16(8);
+					GSVector4i r10 = c10.sll16(11).srl16(8);
+					GSVector4i r11 = c11.sll16(11).srl16(8);
+
+					r00 = r00.lerp16<0>(r01, uf);
+					r10 = r10.lerp16<0>(r11, uf);
+
+					GSVector4i g00 = c00.sll16(6).srl16(11).sll16(3);
+					GSVector4i g01 = c01.sll16(6).srl16(11).sll16(3);
+					GSVector4i g10 = c10.sll16(6).srl16(11).sll16(3);
+					GSVector4i g11 = c11.sll16(6).srl16(11).sll16(3);
+
+					g00 = g00.lerp16<0>(g01, uf);
+					g10 = g10.lerp16<0>(g11, uf);
+
+					GSVector4i b00 = c00.sll16(1).srl16(11).sll16(3);
+					GSVector4i b01 = c01.sll16(1).srl16(11).sll16(3);
+					GSVector4i b10 = c10.sll16(1).srl16(11).sll16(3);
+					GSVector4i b11 = c11.sll16(1).srl16(11).sll16(3);
+
+					b00 = b00.lerp16<0>(b01, uf);
+					b10 = b10.lerp16<0>(b11, uf);
+
+					GSVector4i a00 = c00.sra16(15).sll16(8);
+					GSVector4i a01 = c01.sra16(15).sll16(8);
+					GSVector4i a10 = c10.sra16(15).sll16(8);
+					GSVector4i a11 = c11.sra16(15).sll16(8);
+
+					a00 = a00.lerp16<0>(a01, uf);
+					a10 = a10.lerp16<0>(a11, uf);
+
+					r = r00.lerp16<0>(r10, vf);
+					g = g00.lerp16<0>(g10, vf);
+					b = b00.lerp16<0>(b10, vf);
+					a = a00.lerp16<0>(a10, vf);
+
+					test |= (r | g | b | a).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)
+
+					a = a.gt16(GSVector4i::zero());
+				}
+				else
+				{
+					// a single texel is fetched per pixel
+
+					if(sel.twin)
+					{
+						u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
+						v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
+					}
+					else
+					{
+						u0 = u0.min_i16(m_local.twin[2].u);
+						v0 = v0.min_i16(m_local.twin[2].v);
+					}
+
+					addr00 = v0.sll16(8) | u0;
+
+					// TODO
+
+					if(sel.tlu)
+					{
+						c00 = addr00.gather16_16((const uint16*)m_global.vm, m_global.clut);
+					}
+					else
+					{
+						c00 = addr00.gather16_16((const uint16*)m_global.vm);
+					}
+
+					r = (c00 << 3) & 0x00f800f8;
+					g = (c00 >> 2) & 0x00f800f8;
+					b = (c00 >> 7) & 0x00f800f8;
+					a = c00.sra16(15);
+
+					test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels
+				}
+			}
+
+			// ColorTFX
+
+			switch(sel.tfx)
+			{
+			case 0: // none (tfx = 0)
+			case 1: // none (tfx = tge)
+				r = rf.srl16(7);
+				g = gf.srl16(7);
+				b = bf.srl16(7);
+				break;
+			case 2: // modulate (tfx = tme | tge)
+				r = r.modulate16<1>(rf).clamp8();
+				g = g.modulate16<1>(gf).clamp8();
+				b = b.modulate16<1>(bf).clamp8();
+				break;
+			case 3: // decal (tfx = tme)
+				break;
+			default:
+				__assume(0);
+			}
+
+			// AlphaBlend
+
+			if(sel.abe)
+			{
+				GSVector4i rs = r;
+				GSVector4i gs = g;
+				GSVector4i bs = b;
+				GSVector4i rd = (fd & 0x001f001f) << 3;
+				GSVector4i gd = (fd & 0x03e003e0) >> 2;
+				GSVector4i bd = (fd & 0x7c007c00) >> 7;
+
+				switch(sel.abr)
+				{
+				case 0:
+					r = rd.avg8(rs);
+					g = gd.avg8(gs);
+					b = bd.avg8(bs);
+					break;
+				case 1:
+					r = rd.addus8(rs);
+					g = gd.addus8(gs);
+					b = bd.addus8(bs);
+					break;
+				case 2:
+					r = rd.subus8(rs);
+					g = gd.subus8(gs);
+					b = bd.subus8(bs);
+					break;
+				case 3:
+					r = rd.addus8(rs.srl16(2));
+					g = gd.addus8(gs.srl16(2));
+					b = bd.addus8(bs.srl16(2));
+					break;
+				default:
+					__assume(0);
+				}
+
+				// NOTE(review): this overwrites the r/g/b computed by the switch above
+				// with a blend of the unblended source and the destination -- confirm
+				// whether the blended value was meant to be used here.
+				if(sel.tme)
+				{
+					r = rs.blend8(rd, a);
+					g = gs.blend8(gd, a);
+					b = bs.blend8(bd, a);
+				}
+			}
+
+			// Dither
+
+			if(sel.dtd)
+			{
+				r = r.addus8(dither);
+				g = g.addus8(dither);
+				b = b.addus8(dither);
+			}
+			
+			// WriteFrame
+
+			GSVector4i fs = r | g | b | (sel.md ? GSVector4i(0x80008000) : sel.tme ? a : GSVector4i::zero());
+
+			// lanes flagged in test keep the value already in the frame buffer
+			fs = fs.blend8(fd, test);
+
+			GSVector4i::store(fb, fb + 8, fs);
+		}
+		while(0);
+
+		if(steps <= 0) break;
+
+		steps -= 8;
+
+		fb += 8;
+
+		if(sel.tme)
+		{
+			GSVector4i st = m_local.d8.st;
+
+			s = s.add16(st.xxxx());
+			t = t.add16(st.yyyy());
+		}
+
+		if(sel.tfx != 3) // != decal
+		{
+			if(sel.iip)
+			{
+				GSVector4i c = m_local.d8.c;
+
+				rf = rf.add16(c.xxxx());
+				gf = gf.add16(c.yyyy());
+				bf = bf.add16(c.zzzz());
+			}
+		}
+	}
+}
+
+// Not supported by this rasterizer: BeginDraw() sets m_de to NULL and this
+// fallback asserts if it is ever reached.
+void GPUDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	ASSERT(0);
+}
+
+// Not implemented yet: draws nothing. BeginDraw() sets m_dr to NULL.
+void GPUDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
+{
+	// TODO
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GPUDrawScanline.h b/plugins/GSdx_legacy/GPUDrawScanline.h
new file mode 100644
index 0000000000..d7c7e26155
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUDrawScanline.h
@@ -0,0 +1,76 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPUState.h"
+#include "GSRasterizer.h"
+#include "GPUScanlineEnvironment.h"
+#include "GPUSetupPrimCodeGenerator.h"
+#include "GPUDrawScanlineCodeGenerator.h"
+
+// IDrawScanline implementation for the GPU software renderer. It owns the
+// global/local scanline data blocks and the two function maps that BeginDraw()
+// queries for the setup-prim and draw-scanline functions.
+class GPUDrawScanline : public IDrawScanline
+{
+public:
+	// Rasterizer work item carrying the global scanline data for one batch.
+	// global.clut starts out NULL and, when set, is released with
+	// _aligned_free() in the destructor, so it must come from a matching
+	// aligned allocation.
+	class SharedData : public GSRasterizerData
+	{
+	public:
+		GPUScanlineGlobalData global;
+
+	public:
+		SharedData()
+		{
+			global.clut = NULL;
+		}
+
+		virtual ~SharedData()
+		{
+			if(global.clut) _aligned_free(global.clut);
+		}
+	};
+
+protected:
+	GPUScanlineGlobalData m_global; // copy of SharedData::global, refreshed by BeginDraw()
+	GPUScanlineLocalData m_local; // per-primitive / per-scanline working data; its gd member points at m_global
+
+	// function maps; both index types are uint32 and BeginDraw() indexes them with a selector
+	GSCodeGeneratorFunctionMap<GPUSetupPrimCodeGenerator, uint32, SetupPrimPtr> m_sp_map;
+	GSCodeGeneratorFunctionMap<GPUDrawScanlineCodeGenerator, uint32, DrawScanlinePtr> m_ds_map;
+
+public:
+	GPUDrawScanline();
+	virtual ~GPUDrawScanline();
+
+	// IDrawScanline
+
+	void BeginDraw(const GSRasterizerData* data);
+	void EndDraw(uint64 frame, uint64 ticks, int actual, int total);
+
+#ifndef ENABLE_JIT_RASTERIZER
+
+	// C++ fallbacks, only declared when ENABLE_JIT_RASTERIZER is not defined
+
+	void SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan);
+	void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan);
+	void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan);
+	void DrawRect(const GSVector4i& r, const GSVertexSW& v);
+
+#endif
+
+	// Prints the statistics collected by the draw-scanline function map.
+	void PrintStats() {m_ds_map.PrintStats();}
+};
diff --git a/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.cpp b/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.cpp
new file mode 100644
index 0000000000..c92e28d7c3
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.cpp
@@ -0,0 +1,1031 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+// TODO: x64
+
+#include "stdafx.h"
+#include "GPUDrawScanlineCodeGenerator.h"
+#include "GSVertexSW.h"
+
+// Stack offsets (relative to esp) used by the generated code after Generate()
+// has pushed esi and edi (2 * 4 bytes = _args).
+// NOTE(review): the additional +4 presumably skips the return address, making
+// _top and _v the two stack-passed arguments released by ret(8) -- confirm.
+static const int _args = 8;
+static const int _top = _args + 4;
+static const int _v = _args + 8;
+
+// Generates the draw-scanline routine for one selector.
+//
+// param   - pointer to the GPUScanlineLocalData the generated code will address
+//           (kept as a reference in m_local)
+// key     - selector key; stored in m_sel.key and used to choose code paths
+// code / maxsize - output buffer handed to the GSCodeGenerator base class
+//
+// The code is emitted right away by calling Generate().
+GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize)
+	: GSCodeGenerator(code, maxsize)
+	, m_local(*(GPUScanlineLocalData*)param)
+{
+	m_sel.key = key;
+
+	Generate();
+}
+
+// Emits the complete scanline routine: prologue, Init(), the per-8-pixel loop
+// (TestMask, SampleTexture, ColorTFX, AlphaBlend, Dither, WriteFrame), the
+// Step() code and the epilogue.
+//
+// The generated routine saves and restores esi/edi and returns with ret(8),
+// which removes 8 bytes of stack arguments. Init() uses ecx as the pixel count
+// and edx as the left coordinate on entry.
+void GPUDrawScanlineCodeGenerator::Generate()
+{
+	push(esi);
+	push(edi);
+
+	Init();
+
+	align(16);
+
+L("loop");
+
+	// GSVector4i test = m_test[7 + (steps & (steps >> 31))];
+
+	// edx = (steps < 0 ? steps : 0) * 16, the byte offset relative to m_test[7]
+	mov(edx, ecx);
+	sar(edx, 31);
+	and(edx, ecx);
+	shl(edx, 4);
+
+	movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
+
+	// movdqu(xmm1, ptr[edi]);
+
+	// the 16-byte frame buffer read is done as two 8-byte halves
+	movq(xmm1, qword[edi]);
+	movhps(xmm1, qword[edi + 8]);
+
+	// ecx = steps
+	// esi = tex (tme)
+	// edi = fb
+	// xmm1 = fd
+	// xmm2 = s
+	// xmm3 = t
+	// xmm4 = r
+	// xmm5 = g
+	// xmm6 = b
+	// xmm7 = test
+
+	TestMask();
+
+	SampleTexture();
+
+	// xmm1 = fd
+	// xmm3 = a
+	// xmm4 = r
+	// xmm5 = g
+	// xmm6 = b
+	// xmm7 = test
+	// xmm0, xmm2 = free
+
+	ColorTFX();
+
+	AlphaBlend();
+
+	Dither();
+
+	WriteFrame();
+
+L("step");
+
+	// if(steps <= 0) break;
+
+	test(ecx, ecx);
+	jle("exit", T_NEAR);
+
+	Step();
+
+	jmp("loop", T_NEAR);
+
+L("exit");
+
+	pop(edi);
+	pop(esi);
+
+	ret(8);
+}
+
+// Emits the per-scanline setup code: computes the frame buffer pointer (edi)
+// and the step counter (ecx), loads the dither row when dithering is selected,
+// and initializes the texture coordinate (xmm2/xmm3) and color (xmm4..xmm6)
+// accumulators. The accumulators are also stored to m_local.temp so that
+// Step() can advance them.
+void GPUDrawScanlineCodeGenerator::Init()
+{
+	// eax = top (first stack argument)
+	mov(eax, dword[esp + _top]);
+
+	// uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left;
+
+	mov(edi, eax);
+	shl(edi, 10 + m_sel.scalex);
+	add(edi, edx);
+	lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);
+
+	// int steps = pixels - 8;
+
+	sub(ecx, 8);
+
+	if(m_sel.dtd)
+	{
+		// dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]);
+
+		// byte offset = (top & 3) * 32 + (left & 3) * 2
+		and(eax, 3);
+		shl(eax, 5);
+		and(edx, 3);
+		shl(edx, 1);
+		movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]);
+		movdqa(ptr[&m_local.temp.dither], xmm0);
+	}
+
+	// edx = pointer to the scanline vertex (second stack argument)
+	mov(edx, dword[esp + _v]);
+
+	if(m_sel.tme)
+	{
+		mov(esi, dword[&m_local.gd->tex]);
+
+		// GSVector4i vt = GSVector4i(v.t).xxzzl();
+
+		cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
+		pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+
+		// s = vt.xxxx().add16(m_local.d.s);
+		// t = vt.yyyy().add16(m_local.d.t);
+
+		pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+		pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+
+		paddw(xmm2, ptr[&m_local.d.s]);
+
+		if(!m_sel.sprite)
+		{
+			paddw(xmm3, ptr[&m_local.d.t]);
+		}
+		else
+		{
+			// for sprites t is left unstepped and, with ltf selected, vf is
+			// computed once here
+			if(m_sel.ltf)
+			{
+				movdqa(xmm0, xmm3);
+				psllw(xmm0, 8);
+				psrlw(xmm0, 1);
+				movdqa(ptr[&m_local.temp.vf], xmm0);
+			}
+		}
+
+		movdqa(ptr[&m_local.temp.s], xmm2);
+		movdqa(ptr[&m_local.temp.t], xmm3);
+	}
+
+	if(m_sel.tfx != 3) // != decal
+	{
+		// GSVector4i vc = GSVector4i(v.c).xxzzlh();
+
+		cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
+		pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+		pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+
+		// r = vc.xxxx();
+		// g = vc.yyyy();
+		// b = vc.zzzz();
+
+		pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+		pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
+		pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+		if(m_sel.iip)
+		{
+			// r = r.add16(m_local.d.r);
+			// g = g.add16(m_local.d.g);
+			// b = b.add16(m_local.d.b);
+
+			paddw(xmm4, ptr[&m_local.d.r]);
+			paddw(xmm5, ptr[&m_local.d.g]);
+			paddw(xmm6, ptr[&m_local.d.b]);
+		}
+
+		movdqa(ptr[&m_local.temp.r], xmm4);
+		movdqa(ptr[&m_local.temp.g], xmm5);
+		movdqa(ptr[&m_local.temp.b], xmm6);
+	}
+}
+
+// Emits the code that advances to the next group of 8 pixels: decrements the
+// step counter, moves the frame buffer pointer by 16 bytes and advances (or
+// reloads) the texture coordinate and color accumulators kept in m_local.temp.
+void GPUDrawScanlineCodeGenerator::Step()
+{
+	// steps -= 8;
+
+	sub(ecx, 8);
+
+	// fb += 8;
+
+	add(edi, 8 * sizeof(uint16));
+
+	if(m_sel.tme)
+	{
+		// GSVector4i st = m_local.d8.st;
+
+		movdqa(xmm4, ptr[&m_local.d8.st]);
+
+		// s = s.add16(st.xxxx());
+		// t = t.add16(st.yyyy());
+
+		pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+		paddw(xmm2, ptr[&m_local.temp.s]);
+		movdqa(ptr[&m_local.temp.s], xmm2);
+
+		// TODO: if(!sprite) ... else reload t
+
+		pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+		paddw(xmm3, ptr[&m_local.temp.t]);
+		movdqa(ptr[&m_local.temp.t], xmm3);
+	}
+
+	if(m_sel.tfx != 3) // != decal
+	{
+		if(m_sel.iip)
+		{
+			// GSVector4i c = m_local.d8.c;
+
+			// r = r.add16(c.xxxx());
+			// g = g.add16(c.yyyy());
+			// b = b.add16(c.zzzz());
+
+			movdqa(xmm6, ptr[&m_local.d8.c]);
+
+			pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+			pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
+			pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+			paddw(xmm4, ptr[&m_local.temp.r]);
+			paddw(xmm5, ptr[&m_local.temp.g]);
+			paddw(xmm6, ptr[&m_local.temp.b]);
+
+			movdqa(ptr[&m_local.temp.r], xmm4);
+			movdqa(ptr[&m_local.temp.g], xmm5);
+			movdqa(ptr[&m_local.temp.b], xmm6);
+		}
+		else
+		{
+			// flat color: reload the constant color saved by Init()
+			movdqa(xmm4, ptr[&m_local.temp.r]);
+			movdqa(xmm5, ptr[&m_local.temp.g]);
+			movdqa(xmm6, ptr[&m_local.temp.b]);
+		}
+	}
+}
+
+// Emits the mask test; nothing is emitted unless m_sel.me is set.
+// The sign bit of every 16-bit frame buffer pixel (xmm1 = fd) is spread across
+// its lane and OR-ed into the test mask (xmm7), then alltrue() is emitted.
+// NOTE(review): alltrue() is defined outside this chunk; in the C++ fallback
+// the matching step skips the pixel group when every lane is masked.
+void GPUDrawScanlineCodeGenerator::TestMask()
+{
+	if(!m_sel.me)
+	{
+		return;
+	}
+
+	// test |= fd.sra16(15);
+
+	movdqa(xmm0, xmm1);
+	psraw(xmm0, 15);
+	por(xmm7, xmm0);
+
+	alltrue();
+}
+
+// Emits the texture sampling code. Nothing is emitted when m_sel.tme is clear.
+//
+// Register contract as documented by the inline comments below:
+//   in:  xmm2 = s, xmm3 = t, xmm7 = test, xmm1 = fd (must be preserved)
+//   out: xmm4 = r, xmm5 = g, xmm6 = b, xmm3 = a, xmm7 = updated test
+//
+// m_sel.ltf selects the four-tap bilinear path, otherwise a single texel is
+// read. m_sel.twin selects the and/add coordinate transform using twin[0] and
+// twin[1]; otherwise coordinates are limited with pminsw against twin[2].
+// m_sel.tlu makes ReadTexel go through the CLUT whose pointer is put in edx.
+void GPUDrawScanlineCodeGenerator::SampleTexture()
+{
+	if(!m_sel.tme)
+	{
+		return;
+	}
+
+	if(m_sel.tlu)
+	{
+		// edx = CLUT base pointer, used by ReadTexel
+
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	// xmm2 = s
+	// xmm3 = t
+	// xmm7 = test
+	// xmm0, xmm4, xmm5, xmm6 = free
+	// xmm1 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
+		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f
+
+		// broadcast 0x0020 to all eight 16-bit lanes of xmm0
+
+		mov(eax, 0x00200020);
+		movd(xmm0, eax);
+		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		psubw(xmm2, xmm0);
+		psubw(xmm3, xmm0);
+
+		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
+		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;
+
+		// (x << 8) >> 1 keeps only the low byte of each lane, shifted left by 7
+
+		movdqa(xmm0, xmm2);
+		psllw(xmm0, 8);
+		psrlw(xmm0, 1);
+		movdqa(ptr[&m_local.temp.uf], xmm0);
+
+		// NOTE(review): temp.vf is not written here for sprites although it is
+		// read by the vertical lerps below; presumably it is prepared elsewhere
+		// for that case - confirm.
+
+		if(!m_sel.sprite)
+		{
+			movdqa(xmm0, xmm3);
+			psllw(xmm0, 8);
+			psrlw(xmm0, 1);
+			movdqa(ptr[&m_local.temp.vf], xmm0);
+		}
+	}
+
+	// GSVector4i u0 = s.srl16(8);
+	// GSVector4i v0 = t.srl16(8);
+
+	psrlw(xmm2, 8);
+	psrlw(xmm3, 8);
+
+	// xmm2 = u
+	// xmm3 = v
+	// xmm7 = test
+	// xmm0, xmm4, xmm5, xmm6 = free
+	// xmm1 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
+		// GSVector4i v1 = v0.add16(GSVector4i::x0001());
+
+		movdqa(xmm4, xmm2);
+		movdqa(xmm5, xmm3);
+
+		// all-ones shifted right by 15 gives 0x0001 in every 16-bit lane
+
+		pcmpeqd(xmm0, xmm0);
+		psrlw(xmm0, 15);
+		paddw(xmm4, xmm0);
+		paddw(xmm5, xmm0);
+
+		if(m_sel.twin)
+		{
+			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
+			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
+			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
+			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);
+
+			movdqa(xmm0, ptr[&m_local.twin[0].u]);
+			movdqa(xmm6, ptr[&m_local.twin[1].u]);
+
+			pand(xmm2, xmm0);
+			paddw(xmm2, xmm6);
+			pand(xmm4, xmm0);
+			paddw(xmm4, xmm6);
+
+			movdqa(xmm0, ptr[&m_local.twin[0].v]);
+			movdqa(xmm6, ptr[&m_local.twin[1].v]);
+
+			pand(xmm3, xmm0);
+			paddw(xmm3, xmm6);
+			pand(xmm5, xmm0);
+			paddw(xmm5, xmm6);
+		}
+		else
+		{
+			// u0 = u0.min_i16(m_local.twin[2].u);
+			// v0 = v0.min_i16(m_local.twin[2].v);
+			// u1 = u1.min_i16(m_local.twin[2].u);
+			// v1 = v1.min_i16(m_local.twin[2].v);
+
+			// TODO: if(!sprite) clamp16 else:
+
+			movdqa(xmm0, ptr[&m_local.twin[2].u]);
+			movdqa(xmm6, ptr[&m_local.twin[2].v]);
+
+			pminsw(xmm2, xmm0);
+			pminsw(xmm3, xmm6);
+			pminsw(xmm4, xmm0);
+			pminsw(xmm5, xmm6);
+		}
+
+		// xmm2 = u0
+		// xmm3 = v0
+		// xmm4 = u1
+		// xmm5 = v1
+		// xmm7 = test
+		// xmm0, xmm6 = free
+		// xmm1 = used
+
+		// GSVector4i addr00 = v0.sll16(8) | u0;
+		// GSVector4i addr01 = v0.sll16(8) | u1;
+		// GSVector4i addr10 = v1.sll16(8) | u0;
+		// GSVector4i addr11 = v1.sll16(8) | u1;
+
+		psllw(xmm3, 8);
+		movdqa(xmm0, xmm3);
+		por(xmm3, xmm2);
+		por(xmm0, xmm4);
+
+		psllw(xmm5, 8);
+		movdqa(xmm6, xmm5);
+		por(xmm5, xmm2);
+		por(xmm6, xmm4);
+
+		// xmm3 = addr00
+		// xmm0 = addr01
+		// xmm5 = addr10
+		// xmm6 = addr11
+		// xmm7 = test
+		// xmm2, xmm4 = free
+		// xmm1 = used
+
+		// each destination is either a free register or the register whose
+		// addresses were already consumed by an earlier ReadTexel
+
+		ReadTexel(xmm2, xmm3);
+		ReadTexel(xmm4, xmm0);
+		ReadTexel(xmm3, xmm5);
+		ReadTexel(xmm5, xmm6);
+
+		// xmm2 = c00
+		// xmm4 = c01
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm7 = test
+		// xmm0, xmm6 = free
+		// xmm1 = used
+
+		// spill (TODO)
+
+		movdqa(ptr[&m_local.temp.fd], xmm1);
+		movdqa(ptr[&m_local.temp.test], xmm7);
+
+		// xmm2 = c00
+		// xmm4 = c01
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm0, xmm1, xmm6, xmm7 = free
+
+		// top row, red: (c << 11) >> 8 moves bits 0-4 to bits 3-7
+
+		movdqa(xmm1, xmm2);
+		psllw(xmm1, 11);
+		psrlw(xmm1, 8);
+
+		movdqa(xmm0, xmm4);
+		psllw(xmm0, 11);
+		psrlw(xmm0, 8);
+
+		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);
+
+		// top row, green: bits 5-9 moved to bits 3-7
+
+		movdqa(xmm6, xmm2);
+		psllw(xmm6, 6);
+		psrlw(xmm6, 11);
+		psllw(xmm6, 3);
+
+		movdqa(xmm1, xmm4);
+		psllw(xmm1, 6);
+		psrlw(xmm1, 11);
+		psllw(xmm1, 3);
+
+		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);
+
+		// top row, blue: bits 10-14 moved to bits 3-7
+
+		movdqa(xmm7, xmm2);
+		psllw(xmm7, 1);
+		psrlw(xmm7, 11);
+		psllw(xmm7, 3);
+
+		movdqa(xmm6, xmm4);
+		psllw(xmm6, 1);
+		psrlw(xmm6, 11);
+		psllw(xmm6, 3);
+
+		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);
+
+		// top row, alpha: bit 15 replicated over the lane, then reduced to 0x00ff / 0
+
+		psraw(xmm2, 15);
+		psrlw(xmm2, 8);
+		psraw(xmm4, 15);
+		psrlw(xmm4, 8);
+
+		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);
+
+		// xmm0 = r00
+		// xmm1 = g00
+		// xmm6 = b00
+		// xmm4 = a00
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm2, xmm7 = free
+
+		// bottom row, red, followed by the vertical blend with the top row
+
+		movdqa(xmm7, xmm3);
+		psllw(xmm7, 11);
+		psrlw(xmm7, 8);
+
+		movdqa(xmm2, xmm5);
+		psllw(xmm2, 11);
+		psrlw(xmm2, 8);
+
+		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
+		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);
+
+		// xmm2 = r
+		// xmm1 = g00
+		// xmm6 = b00
+		// xmm4 = a00
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm0, xmm7 = free
+
+		// bottom row, green
+
+		movdqa(xmm7, xmm3);
+		psllw(xmm7, 6);
+		psrlw(xmm7, 11);
+		psllw(xmm7, 3);
+
+		movdqa(xmm0, xmm5);
+		psllw(xmm0, 6);
+		psrlw(xmm0, 11);
+		psllw(xmm0, 3);
+
+		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
+		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);
+
+		// xmm2 = r
+		// xmm0 = g
+		// xmm6 = b00
+		// xmm4 = a00
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm1, xmm7 = free
+
+		// bottom row, blue
+
+		movdqa(xmm7, xmm3);
+		psllw(xmm7, 1);
+		psrlw(xmm7, 11);
+		psllw(xmm7, 3);
+
+		movdqa(xmm1, xmm5);
+		psllw(xmm1, 1);
+		psrlw(xmm1, 11);
+		psllw(xmm1, 3);
+
+		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
+		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);
+
+		// xmm2 = r
+		// xmm0 = g
+		// xmm1 = b
+		// xmm4 = a00
+		// xmm3 = c10
+		// xmm5 = c11
+		// xmm6, xmm7 = free
+
+		// bottom row, alpha
+
+		psraw(xmm3, 15);
+		psrlw(xmm3, 8);
+		psraw(xmm5, 15);
+		psrlw(xmm5, 8);
+
+		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
+		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);
+
+		// xmm2 = r
+		// xmm0 = g
+		// xmm1 = b
+		// xmm5 = a
+		// xmm3, xmm4, xmm6, xmm7 = free
+
+		// TODO
+		// shuffle the results into the registers the following stages use
+		movdqa(xmm3, xmm5); // a
+		movdqa(xmm4, xmm2); // r
+		movdqa(xmm6, xmm1); // b
+		movdqa(xmm5, xmm0); // g
+
+		// reload test
+
+		movdqa(xmm7, ptr[&m_local.temp.test]);
+
+		// xmm4 = r
+		// xmm5 = g
+		// xmm6 = b
+		// xmm3 = a
+		// xmm7 = test
+		// xmm0, xmm1, xmm2 = free
+
+		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)
+
+		movdqa(xmm1, xmm3);
+		por(xmm1, xmm4);
+		movdqa(xmm2, xmm5);
+		por(xmm2, xmm6);
+		por(xmm1, xmm2);
+
+		pxor(xmm0, xmm0);
+		pcmpeqw(xmm1, xmm0);
+		por(xmm7, xmm1);
+
+		// a = a.gt16(GSVector4i::zero());
+
+		pcmpgtw(xmm3, xmm0);
+
+		// reload fd
+
+		movdqa(xmm1, ptr[&m_local.temp.fd]);
+	}
+	else
+	{
+		if(m_sel.twin)
+		{
+			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
+			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);
+
+			pand(xmm2, ptr[&m_local.twin[0].u]);
+			paddw(xmm2, ptr[&m_local.twin[1].u]);
+			pand(xmm3, ptr[&m_local.twin[0].v]);
+			paddw(xmm3, ptr[&m_local.twin[1].v]);
+		}
+		else
+		{
+			// u = u.min_i16(m_local.twin[2].u);
+			// v = v.min_i16(m_local.twin[2].v);
+
+			// TODO: if(!sprite) clamp16 else:
+
+			pminsw(xmm2, ptr[&m_local.twin[2].u]);
+			pminsw(xmm3, ptr[&m_local.twin[2].v]);
+		}
+
+		// xmm2 = u
+		// xmm3 = v
+		// xmm7 = test
+		// xmm0, xmm4, xmm5, xmm6 = free
+		// xmm1 = used
+
+		// GSVector4i addr = v.sll16(8) | u;
+
+		psllw(xmm3, 8);
+		por(xmm3, xmm2);
+
+		// xmm3 = addr
+		// xmm7 = test
+		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
+		// xmm1 = used
+
+		ReadTexel(xmm6, xmm3);
+
+		// xmm6 = c00
+		// xmm7 = test
+		// xmm0, xmm2, xmm3, xmm4, xmm5 = free
+		// xmm1 = used
+
+		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels
+
+		pxor(xmm0, xmm0);
+		pcmpeqw(xmm0, xmm6);
+		por(xmm7, xmm0);
+
+		// c[0] = (c00 << 3) & 0x00f800f8;
+		// c[1] = (c00 >> 2) & 0x00f800f8;
+		// c[2] = (c00 >> 7) & 0x00f800f8;
+		// c[3] = c00.sra16(15);
+
+		movdqa(xmm3, xmm6);
+		psraw(xmm3, 15); // a
+
+		pcmpeqd(xmm0, xmm0);
+		psrlw(xmm0, 11);
+		psllw(xmm0, 3); // 0x00f8
+
+		movdqa(xmm4, xmm6);
+		psllw(xmm4, 3);
+		pand(xmm4, xmm0); // r
+
+		movdqa(xmm5, xmm6);
+		psrlw(xmm5, 2);
+		pand(xmm5, xmm0); // g
+
+		psrlw(xmm6, 7);
+		pand(xmm6, xmm0); // b
+	}
+}
+
+void GPUDrawScanlineCodeGenerator::ColorTFX()
+{
+	// Operates on the colour channels in xmm4, xmm5 and xmm6; which code is
+	// emitted depends on m_sel.tfx.
+
+	if(m_sel.tfx == 0 || m_sel.tfx == 1)
+	{
+		// none (tfx = 0) / none (tfx = tge)
+		//
+		// c[0] = r.srl16(7);
+		// c[1] = g.srl16(7);
+		// c[2] = b.srl16(7);
+
+		psrlw(xmm4, 7);
+		psrlw(xmm5, 7);
+		psrlw(xmm6, 7);
+	}
+	else if(m_sel.tfx == 2)
+	{
+		// modulate (tfx = tme | tge)
+		//
+		// c[0] = c[0].modulate16<1>(r).clamp8();
+		// c[1] = c[1].modulate16<1>(g).clamp8();
+		// c[2] = c[2].modulate16<1>(b).clamp8();
+
+		// xmm0 = 0x00ff in every 16-bit lane, the upper bound for the clamp
+
+		pcmpeqd(xmm0, xmm0);
+		psrlw(xmm0, 8);
+
+		modulate16<1>(xmm4, ptr[&m_local.temp.r]);
+		pminsw(xmm4, xmm0);
+
+		modulate16<1>(xmm5, ptr[&m_local.temp.g]);
+		pminsw(xmm5, xmm0);
+
+		modulate16<1>(xmm6, ptr[&m_local.temp.b]);
+		pminsw(xmm6, xmm0);
+	}
+
+	// decal (tfx = tme): the colour is left untouched, nothing to emit
+}
+
+// Emits the blend of the source colour (xmm4/xmm5/xmm6) with the frame buffer
+// pixel in xmm1. Nothing is emitted when m_sel.abe is clear.
+//
+// The same three steps are emitted once per channel:
+//   1. extract the 5-bit channel from fd and move it to bits 3-7,
+//   2. combine it with the source channel according to m_sel.abr,
+//   3. with texturing (m_sel.tme) select between source and blended value
+//      using xmm3 (a) as the blend8 mask, otherwise take the blended value.
+void GPUDrawScanlineCodeGenerator::AlphaBlend()
+{
+	if(!m_sel.abe)
+	{
+		return;
+	}
+
+	// xmm1 = fd
+	// xmm3 = a
+	// xmm4 = r
+	// xmm5 = g
+	// xmm6 = b
+	// xmm7 = test
+	// xmm0, xmm2 = free
+
+	// GSVector4i r = (fd & 0x001f001f) << 3;
+
+	pcmpeqd(xmm0, xmm0);
+	psrlw(xmm0, 11); // 0x001f
+	movdqa(xmm2, xmm1);
+	pand(xmm2, xmm0);
+	psllw(xmm2, 3);
+
+	switch(m_sel.abr)
+	{
+	case 0:
+		// r = r.avg8(c[0]);
+		pavgb(xmm2, xmm4);
+		break;
+	case 1:
+		// r = r.addus8(c[0]);
+		paddusb(xmm2, xmm4);
+		break;
+	case 2:
+		// r = r.subus8(c[0]);
+		psubusb(xmm2, xmm4);
+		break;
+	case 3:
+		// r = r.addus8(c[0].srl16(2));
+		movdqa(xmm0, xmm4);
+		psrlw(xmm0, 2);
+		paddusb(xmm2, xmm0);
+		break;
+	}
+
+	if(m_sel.tme)
+	{
+		// blend8 takes its mask from xmm0
+		movdqa(xmm0, xmm3);
+		blend8(xmm4, xmm2);
+	}
+	else
+	{
+		movdqa(xmm4, xmm2);
+	}
+
+	// GSVector4i g = (fd & 0x03e003e0) >> 2;
+
+	pcmpeqd(xmm0, xmm0);
+	psrlw(xmm0, 11);
+	psllw(xmm0, 5); // 0x03e0
+	movdqa(xmm2, xmm1);
+	pand(xmm2, xmm0);
+	psrlw(xmm2, 2);
+
+	switch(m_sel.abr)
+	{
+	case 0:
+		// g = g.avg8(c[1]);
+		pavgb(xmm2, xmm5);
+		break;
+	case 1:
+		// g = g.addus8(c[1]);
+		paddusb(xmm2, xmm5);
+		break;
+	case 2:
+		// g = g.subus8(c[1]);
+		psubusb(xmm2, xmm5);
+		break;
+	case 3:
+		// g = g.addus8(c[1].srl16(2));
+		movdqa(xmm0, xmm5);
+		psrlw(xmm0, 2);
+		paddusb(xmm2, xmm0);
+		break;
+	}
+
+	if(m_sel.tme)
+	{
+		movdqa(xmm0, xmm3);
+		blend8(xmm5, xmm2);
+	}
+	else
+	{
+		movdqa(xmm5, xmm2);
+	}
+
+	// GSVector4i b = (fd & 0x7c007c00) >> 7;
+
+	pcmpeqd(xmm0, xmm0);
+	psrlw(xmm0, 11);
+	psllw(xmm0, 10); // 0x7c00
+	movdqa(xmm2, xmm1);
+	pand(xmm2, xmm0);
+	psrlw(xmm2, 7);
+
+	switch(m_sel.abr)
+	{
+	case 0:
+		// b = b.avg8(c[2]);
+		pavgb(xmm2, xmm6);
+		break;
+	case 1:
+		// b = b.addus8(c[2]);
+		paddusb(xmm2, xmm6);
+		break;
+	case 2:
+		// b = b.subus8(c[2]);
+		psubusb(xmm2, xmm6);
+		break;
+	case 3:
+		// b = b.addus8(c[2].srl16(2));
+		movdqa(xmm0, xmm6);
+		psrlw(xmm0, 2);
+		paddusb(xmm2, xmm0);
+		break;
+	}
+
+	if(m_sel.tme)
+	{
+		movdqa(xmm0, xmm3);
+		blend8(xmm6, xmm2);
+	}
+	else
+	{
+		movdqa(xmm6, xmm2);
+	}
+}
+
+void GPUDrawScanlineCodeGenerator::Dither()
+{
+	// Dithering code is emitted only when the selector's "dtd" bit is set.
+
+	if(m_sel.dtd)
+	{
+		// c[0] = c[0].addus8(dither);
+		// c[1] = c[1].addus8(dither);
+		// c[2] = c[2].addus8(dither);
+
+		// load the dither vector once and add it, with unsigned byte
+		// saturation, to all three colour channels
+
+		movdqa(xmm0, ptr[&m_local.temp.dither]);
+
+		paddusb(xmm4, xmm0);
+		paddusb(xmm5, xmm0);
+		paddusb(xmm6, xmm0);
+	}
+}
+
+// Emits the code that packs the colour channels (xmm4 = r, xmm5 = g, xmm6 = b,
+// each in bits 3-7 of its 16-bit lane) back into 16-bit pixels, sets bit 15
+// according to m_sel.md / m_sel.tme, replaces the lanes selected by the test
+// mask (xmm7) with the original frame buffer value (xmm1) and stores the eight
+// resulting pixels at [edi].
+void GPUDrawScanlineCodeGenerator::WriteFrame()
+{
+	// GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0);
+
+	pcmpeqd(xmm0, xmm0);
+
+	if(m_sel.md || m_sel.tme)
+	{
+		// all-ones shifted left by 15 leaves only bit 15 of each lane set
+
+		movdqa(xmm2, xmm0);
+		psllw(xmm2, 15);
+	}
+
+	psrlw(xmm0, 11);
+	psllw(xmm0, 3);
+
+	// xmm0 = 0x00f8
+	// xmm2 = 0x8000 (md)
+
+	// GSVector4i r = (c[0] & 0x00f800f8) >> 3;
+
+	pand(xmm4, xmm0);
+	psrlw(xmm4, 3);
+
+	// GSVector4i g = (c[1] & 0x00f800f8) << 2;
+
+	pand(xmm5, xmm0);
+	psllw(xmm5, 2);
+	por(xmm4, xmm5);
+
+	// GSVector4i b = (c[2] & 0x00f800f8) << 7;
+
+	pand(xmm6, xmm0);
+	psllw(xmm6, 7);
+	por(xmm4, xmm6);
+
+	if(m_sel.md)
+	{
+		// GSVector4i a = GSVector4i(0x80008000);
+
+		por(xmm4, xmm2);
+	}
+	else if(m_sel.tme)
+	{
+		// GSVector4i a = (c[3] << 8) & 0x80008000;
+
+		psllw(xmm3, 8);
+		pand(xmm3, xmm2);
+		por(xmm4, xmm3);
+	}
+
+	// fs = fs.blend8(fd, test);
+
+	// blend8 takes its mask from xmm0
+
+	movdqa(xmm0, xmm7);
+	blend8(xmm4, xmm1);
+
+	// GSVector4i::store<false>(fb, fs);
+
+	// movdqu(ptr[edi], xmm4);
+
+	// the 16 bytes are stored as two 64-bit halves instead of one movdqu
+
+	movq(qword[edi], xmm4);
+	movhps(qword[edi + 8], xmm4);
+}
+
+// Emits an eight-lane gather: for each 16-bit lane of addr the index is
+// extracted into eax and a 16-bit texel is loaded into the same lane of dst.
+// Without m_sel.tlu the texel comes from [esi + index * 2]; with m_sel.tlu the
+// byte at [esi + index] is looked up first and then used as an index into the
+// table at edx. Clobbers eax.
+void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr)
+{
+	for(int lane = 0; lane < 8; lane++)
+	{
+		const uint8 slot = (uint8)lane;
+
+		pextrw(eax, addr, slot);
+
+		if(m_sel.tlu)
+		{
+			movzx(eax, byte[esi + eax]);
+		}
+
+		const Address& texel = m_sel.tlu ? ptr[edx + eax * 2] : ptr[esi + eax * 2];
+
+		if(lane != 0)
+		{
+			pinsrw(dst, texel, slot);
+		}
+		else
+		{
+			// the first lane is loaded with movd, which also resets the rest of dst
+
+			movd(dst, texel);
+		}
+	}
+}
+
+// Emits a 16-bit fixed point multiply of a by f, in place.
+// With shift == 0 on an SSSE3 capable CPU a single pmulhrsw is used; otherwise
+// a is pre-shifted left by (shift + 1) and multiplied with pmulhw.
+template<int shift>
+void GPUDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f)
+{
+	const bool use_pmulhrsw = shift == 0 && m_cpu.has(util::Cpu::tSSSE3);
+
+	if(!use_pmulhrsw)
+	{
+		psllw(a, shift + 1);
+		pmulhw(a, f);
+
+		return;
+	}
+
+	pmulhrsw(a, f);
+}
+
+// Emits a = b + modulate16<shift>(a - b, f), i.e. a linear interpolation that
+// starts at b and moves towards the original a by the factor f.
+// a is overwritten with the result, b is left untouched.
+template<int shift>
+void GPUDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Operand& f)
+{
+	psubw(a, b);
+	modulate16<shift>(a, f);
+	paddw(a, b);
+}
+
+// Emits an early-out: the top bit of each of the 16 bytes of xmm7 (test) is
+// gathered into eax, and if all of them are set (0xffff) control jumps to the
+// "step" label. Clobbers eax.
+void GPUDrawScanlineCodeGenerator::alltrue()
+{
+	pmovmskb(eax, xmm7);
+	cmp(eax, 0xffff);
+	je("step", T_NEAR);
+}
+
+// Emits a byte-wise select between a and b with the mask held in xmm0; the
+// result ends up in a. SSE4.1 CPUs get a single pblendvb (which reads xmm0
+// implicitly), everything else falls back to the and/andn/or sequence of
+// blend(), which additionally overwrites b and xmm0.
+void GPUDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
+{
+	if(!m_cpu.has(util::Cpu::tSSE41))
+	{
+		blend(a, b, xmm0);
+
+		return;
+	}
+
+	pblendvb(a, b);
+}
+
+// Emits a = (b & mask) | (a & ~mask) using plain SSE2 logic instructions.
+// Both b and mask are used as scratch and do not keep their values.
+void GPUDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
+{
+	pand(b, mask); // b = b & mask
+	pandn(mask, a); // mask = ~mask & a
+	por(b, mask); // b = the combined result
+	movdqa(a, b);
+}
+
+// Eight 128-bit lane masks. From one entry to the next, one more 16-bit lane
+// is zero: entry 0 has a single cleared lane, entry 7 is all zero.
+// NOTE(review): assumes the first GSVector4i constructor argument is the
+// lowest 32-bit element, which would make the cleared lanes the leading
+// pixels - confirm. The table is not referenced in this part of the file;
+// presumably it seeds the "test" mask for partial pixel groups.
+const GSVector4i GPUDrawScanlineCodeGenerator::m_test[8] =
+{
+	GSVector4i(0xffff0000, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0xffff0000, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0xffff0000, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffff0000),
+	GSVector4i::zero(),
+};
+
+// Dither table: four rows of sixteen 16-bit values, each row being one
+// four-value pattern (values 0..7) repeated four times.
+// NOTE(review): presumably one row per scanline (y & 3) that gets copied to
+// m_local.temp.dither - the selection code is not in this part of the file.
+__aligned(const uint16, 32) GPUDrawScanlineCodeGenerator::m_dither[4][16] =
+{
+	{7, 0, 6, 1, 7, 0, 6, 1, 7, 0, 6, 1, 7, 0, 6, 1},
+	{2, 5, 3, 4, 2, 5, 3, 4, 2, 5, 3, 4, 2, 5, 3, 4},
+	{1, 6, 0, 7, 1, 6, 0, 7, 1, 6, 0, 7, 1, 6, 0, 7},
+	{4, 3, 5, 2, 4, 3, 5, 2, 4, 3, 5, 2, 4, 3, 5, 2},
+};
diff --git a/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.h b/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.h
new file mode 100644
index 0000000000..00eff14c6d
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUDrawScanlineCodeGenerator.h
@@ -0,0 +1,60 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPUScanlineEnvironment.h"
+#include "GSFunctionMap.h"
+
+using namespace Xbyak;
+
+// Run-time code generator (built on GSCodeGenerator / Xbyak) for the GPU
+// scanline drawing routine. The emitted code is specialised by the selector
+// bits in m_sel and addresses its working data through m_local.
+class GPUDrawScanlineCodeGenerator : public GSCodeGenerator
+{
+	// declared private; no definition is visible here, presumably to forbid assignment
+	void operator = (const GPUDrawScanlineCodeGenerator&);
+
+	GPUScanlineSelector m_sel; // feature bits that decide which code gets emitted
+	GPUScanlineLocalData& m_local; // data block whose addresses are embedded in the code
+
+	void Generate();
+
+	// pipeline stages; each one emits its instructions only when the matching
+	// selector bits ask for it
+	void Init();
+	void Step();
+	void TestMask();
+	void SampleTexture();
+	void ColorTFX();
+	void AlphaBlend();
+	void Dither();
+	void WriteFrame();
+
+	// emits an eight-lane texel gather from the addresses in addr into dst
+	void ReadTexel(const Xmm& dst, const Xmm& addr);
+
+	// small instruction-sequence helpers
+	template<int shift> void modulate16(const Xmm& a, const Operand& f);
+	template<int shift> void lerp16(const Xmm& a, const Xmm& b, const Operand& f);
+	void alltrue();
+	void blend8(const Xmm& a, const Xmm& b);
+	void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
+
+public:
+	GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize);
+
+	// constant tables defined in GPUDrawScanlineCodeGenerator.cpp
+	static const GSVector4i m_test[8];
+	static __aligned(const uint16, 32) m_dither[4][16];
+};
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GPUDrawingEnvironment.h b/plugins/GSdx_legacy/GPUDrawingEnvironment.h
new file mode 100644
index 0000000000..674bfb2830
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUDrawingEnvironment.h
@@ -0,0 +1,75 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPU.h"
+
+// Snapshot of the GPU registers that describe the current drawing and display
+// state. The whole object is plain data and is reset with memset.
+__aligned(class, 32) GPUDrawingEnvironment
+{
+public:
+	GPURegSTATUS STATUS;
+	GPURegPRIM PRIM;
+	GPURegDAREA DAREA;
+	GPURegDHRANGE DHRANGE;
+	GPURegDVRANGE DVRANGE;
+	GPURegDRAREA DRAREATL;
+	GPURegDRAREA DRAREABR;
+	GPURegDROFF DROFF;
+	GPURegTWIN TWIN;
+	GPURegCLUT CLUT;
+
+	GPUDrawingEnvironment()
+	{
+		Reset();
+	}
+
+	// Zeroes every register, then applies the non-zero defaults below.
+	void Reset()
+	{
+		memset(this, 0, sizeof(*this));
+
+		STATUS.IDLE = 1;
+		STATUS.COM = 1;
+		STATUS.WIDTH0 = 1;
+		DVRANGE.Y1 = 16;
+		DVRANGE.Y2 = 256;
+	}
+
+	// Returns the displayed rectangle, clipped to (0, 0)-(1024, 512).
+	// The width comes from a table indexed by the STATUS width bits; the
+	// height is the DVRANGE line count scaled by s_height[STATUS.HEIGHT] / 240.
+	GSVector4i GetDisplayRect()
+	{
+		static int s_width[] = {256, 320, 512, 640, 368, 384, 512, 640};
+		static int s_height[] = {240, 480};
+
+		GSVector4i r;
+
+		r.left = DAREA.X & ~7; // FIXME
+		r.top = DAREA.Y;
+		r.right = r.left + s_width[(STATUS.WIDTH1 << 2) | STATUS.WIDTH0];
+		r.bottom = r.top + (DVRANGE.Y2 - DVRANGE.Y1) * s_height[STATUS.HEIGHT] / 240;
+
+		return r.rintersect(GSVector4i(0, 0, 1024, 512));
+	}
+
+	// Returns 50 when STATUS.ISPAL is set, 59.94 otherwise.
+	float GetFPS()
+	{
+		return STATUS.ISPAL ? 50.0f : 59.94f;
+	}
+};
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GPULocalMemory.cpp b/plugins/GSdx_legacy/GPULocalMemory.cpp
new file mode 100644
index 0000000000..0218dcfc78
--- /dev/null
+++ b/plugins/GSdx_legacy/GPULocalMemory.cpp
@@ -0,0 +1,662 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GPULocalMemory.h"
+#include "GSdx.h"
+
+// Bit masks for the fields of a 16-bit pixel, used by Expand16:
+// bit 15, bits 10-14, bits 5-9 and bits 0-4 respectively.
+const GSVector4i GPULocalMemory::m_xxxa(0x00008000);
+const GSVector4i GPULocalMemory::m_xxbx(0x00007c00);
+const GSVector4i GPULocalMemory::m_xgxx(0x000003e0);
+const GSVector4i GPULocalMemory::m_rxxx(0x0000001f);
+
+// Bytes needed for the video memory at the largest scale (2^23 uint16 pixels).
+#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16))
+// Bytes actually allocated for m_vm: twice the real size.
+#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2)
+// Bytes for the texture page cache: 32 pages of 256x256 texels, split by the
+// constructor into one part each for buff[0] and buff[1] and four for buff[2].
+#define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32)
+
+// Reads the scale settings, allocates the video memory and the texture page
+// cache and sets up the per-page pointers.
+GPULocalMemory::GPULocalMemory()
+{
+	// scale exponents from the configuration, clamped to [0, 2]
+
+	m_scale.x = std::min<int>(std::max<int>(theApp.GetConfig("scale_x", 0), 0), 2);
+	m_scale.y = std::min<int>(std::max<int>(theApp.GetConfig("scale_y", 0), 0), 2);
+
+	//
+
+	int size = VM_REAL_SIZE;
+
+	m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false);
+
+	memset(m_vm, 0, size);
+
+	//
+
+	// The CLUT cache lives right behind the video memory, inside the spare
+	// half of the allocation. m_vm is a uint16 pointer, so the byte size has
+	// to be converted to an element count: adding the byte count directly
+	// would land exactly at the end of the VM_ALLOC_SIZE allocation and every
+	// CLUT access would be out of bounds.
+
+	m_clut.buff = m_vm + VM_REAL_SIZE / sizeof(uint16);
+	m_clut.dirty = true;
+
+	//
+
+	size = TEX_ALLOC_SIZE;
+
+	// one allocation, sliced into the 4 bpp, 8 bpp and 16 bpp page caches
+
+	m_texture.buff[0] = (uint8*)vmalloc(size, false);
+	m_texture.buff[1] = m_texture.buff[0] + 256 * 256 * 32;
+	m_texture.buff[2] = m_texture.buff[1] + 256 * 256 * 32;
+
+	memset(m_texture.buff[0], 0, size);
+
+	memset(m_texture.valid, 0, sizeof(m_texture.valid));
+
+	// 2 x 16 pages per format; the 4 bpp and 8 bpp caches use one byte per texel
+
+	for(int y = 0, offset = 0; y < 2; y++)
+	{
+		for(int x = 0; x < 16; x++, offset += 256 * 256)
+		{
+			m_texture.page[0][y][x] = &((uint8*)m_texture.buff[0])[offset];
+			m_texture.page[1][y][x] = &((uint8*)m_texture.buff[1])[offset];
+		}
+	}
+
+	// pages of the third cache are spaced four bytes per texel apart
+
+	for(int y = 0, offset = 0; y < 2; y++)
+	{
+		for(int x = 0; x < 16; x++, offset += 256 * 256)
+		{
+			m_texture.page[2][y][x] = &((uint32*)m_texture.buff[2])[offset];
+		}
+	}
+}
+
+// Releases the two allocations made by the constructor. m_clut.buff and
+// m_texture.buff[1..2] point into these blocks and are not freed separately.
+GPULocalMemory::~GPULocalMemory()
+{
+	vmfree(m_vm, VM_ALLOC_SIZE);
+
+	vmfree(m_texture.buff[0], TEX_ALLOC_SIZE);
+}
+
+// Returns the cached colour lookup table for (tp, cx, cy), re-reading it from
+// video memory when the cache is dirty or was built for other parameters.
+// With a horizontal scale only every (1 << m_scale.x)-th pixel is sampled.
+const uint16* GPULocalMemory::GetCLUT(int tp, int cx, int cy)
+{
+	const bool cached = !m_clut.dirty && m_clut.tp == tp && m_clut.cx == cx && m_clut.cy == cy;
+
+	if(cached)
+	{
+		return m_clut.buff;
+	}
+
+	uint16* from = GetPixelAddressScaled(cx << 4, cy);
+	uint16* to = m_clut.buff;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		// unscaled: the entries are contiguous
+		memcpy(to, from, (tp == 0 ? 16 : 256) * 2);
+		break;
+	case 1:
+	case 2:
+		{
+			// 16 entries for tp 0, 256 for tp 1, nothing for any other tp
+			const int entries = tp == 0 ? 16 : tp == 1 ? 256 : 0;
+
+			for(int n = 0; n < entries; n++)
+			{
+				to[n] = from[n << m_scale.x];
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	m_clut.tp = tp;
+	m_clut.cx = cx;
+	m_clut.cy = cy;
+	m_clut.dirty = false;
+
+	return m_clut.buff;
+}
+
+// Returns the cached, unpacked texture page (tx, ty) for texture format tp
+// (0 = 4 bpp, 1 = 8 bpp, 2 = 16 bpp), filling it from video memory first when
+// its valid bit is not set. tp == 3 is not supported and yields NULL.
+const void* GPULocalMemory::GetTexture(int tp, int tx, int ty)
+{
+	if(tp == 3)
+	{
+		ASSERT(0);
+
+		return NULL;
+	}
+
+	void* page = m_texture.page[tp][ty][tx];
+
+	const uint32 bit = 1 << tx;
+
+	if((m_texture.valid[tp][ty] & bit) != 0)
+	{
+		return page;
+	}
+
+	if(tp == 0)
+	{
+		ReadPage4(tx, ty, (uint8*)page);
+	}
+	else if(tp == 1)
+	{
+		ReadPage8(tx, ty, (uint8*)page);
+	}
+	else if(tp == 2 || tp == 3)
+	{
+		ReadPage16(tx, ty, (uint16*)page);
+	}
+
+	// TODO: m_state->m_perfmon.Put(GSPerfMon::Unswizzle, 256 * 256 * bpp >> 3);
+
+	m_texture.valid[tp][ty] |= bit;
+
+	return page;
+}
+
+// Marks everything cached from the rectangle r as stale: the CLUT when its
+// source pixels intersect r, and the valid bit of every texture page that
+// overlaps r. Pages start every 64 pixels horizontally and every 256 lines
+// vertically; a page spans 256, 128 or 64 pixels for cache 2, 1 and 0.
+void GPULocalMemory::Invalidate(const GSVector4i& r)
+{
+	if(!m_clut.dirty && r.top <= m_clut.cy && m_clut.cy < r.bottom)
+	{
+		const int clut_left = m_clut.cx << 4;
+		const int clut_right = clut_left + (m_clut.tp == 0 ? 16 : 256);
+
+		if(r.left < clut_right && r.right > clut_left)
+		{
+			m_clut.dirty = true;
+		}
+	}
+
+	const int bottom = min(r.bottom, 512);
+	const int right = min(r.right, 1024);
+
+	for(int y = 0, row = 0; y < bottom; y += 256, row++)
+	{
+		if(r.top >= y + 256)
+		{
+			continue;
+		}
+
+		for(int x = 0, col = 0; x < right; x += 64, col++)
+		{
+			uint32 bit = 1 << col;
+
+			// the widest page format is checked first; once it no longer
+			// reaches r, the narrower ones cannot either
+
+			if(r.left < x + 256)
+			{
+				m_texture.valid[2][row] &= ~bit;
+
+				if(r.left < x + 128)
+				{
+					m_texture.valid[1][row] &= ~bit;
+
+					if(r.left < x + 64)
+					{
+						m_texture.valid[0][row] &= ~bit;
+					}
+				}
+			}
+		}
+	}
+}
+
+// Fills the rectangle r (unscaled coordinates) with the pixel value c and
+// invalidates the caches covering it. Width and height are multiplied by the
+// scale factors so the whole scaled area is written.
+void GPULocalMemory::FillRect(const GSVector4i& r, uint16 c)
+{
+	Invalidate(r);
+
+	const int cols = r.width() << m_scale.x;
+	const int rows = r.height() << m_scale.y;
+	const int pitch = GetWidth();
+
+	uint16* RESTRICT line = GetPixelAddressScaled(r.left, r.top);
+
+	for(int y = 0; y < rows; y++)
+	{
+		for(int x = 0; x < cols; x++)
+		{
+			line[x] = c;
+		}
+
+		line += pitch;
+	}
+}
+
+// Copies r.width() x r.height() unscaled pixels from src into video memory at
+// r and invalidates the caches covering it. Every source pixel is replicated
+// (1 << m_scale.x) times horizontally and every source line is written
+// (1 << m_scale.y) times.
+void GPULocalMemory::WriteRect(const GSVector4i& r, const uint16* RESTRICT src)
+{
+	Invalidate(r);
+
+	uint16* RESTRICT line = GetPixelAddressScaled(r.left, r.top);
+
+	const int cols = r.width();
+	const int rows = r.height();
+	const int pitch = GetWidth();
+	const int vrepeat = 1 << m_scale.y;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		for(int y = 0; y < rows; y++, src += cols)
+		{
+			for(int n = 0; n < vrepeat; n++, line += pitch)
+			{
+				memcpy(line, src, cols * 2);
+			}
+		}
+		break;
+	case 1:
+	case 2:
+		{
+			const int hrepeat = 1 << m_scale.x;
+
+			for(int y = 0; y < rows; y++, src += cols)
+			{
+				for(int n = 0; n < vrepeat; n++, line += pitch)
+				{
+					for(int x = 0; x < cols; x++)
+					{
+						for(int k = 0; k < hrepeat; k++)
+						{
+							line[x * hrepeat + k] = src[x];
+						}
+					}
+				}
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Reads the rectangle r back from video memory into dst as r.width() x
+// r.height() unscaled pixels, taking the first pixel of every scaled group
+// and the first line of every scaled line group.
+void GPULocalMemory::ReadRect(const GSVector4i& r, uint16* RESTRICT dst)
+{
+	uint16* RESTRICT line = GetPixelAddressScaled(r.left, r.top);
+
+	const int cols = r.width();
+	const int rows = r.height();
+
+	// distance between two unscaled lines
+	const int pitch = GetWidth() << m_scale.y;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		for(int y = 0; y < rows; y++, line += pitch, dst += cols)
+		{
+			memcpy(dst, line, cols * 2);
+		}
+		break;
+	case 1:
+	case 2:
+		for(int y = 0; y < rows; y++, line += pitch, dst += cols)
+		{
+			for(int x = 0; x < cols; x++)
+			{
+				dst[x] = line[x << m_scale.x];
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Copies a w x h block (unscaled coordinates) from (sx, sy) to (dx, dy) inside
+// video memory and invalidates the caches covering the destination.
+//
+// Source and destination are in the same buffer and may overlap within a
+// line, so each line is copied with memmove; memcpy has undefined behaviour
+// for overlapping ranges. Lines are still processed top to bottom, so a block
+// moved downwards onto itself re-reads lines that were already overwritten.
+void GPULocalMemory::MoveRect(int sx, int sy, int dx, int dy, int w, int h)
+{
+	Invalidate(GSVector4i(dx, dy, dx + w, dy + h));
+
+	// nothing to copy; also keeps a negative width from turning into a huge
+	// byte count below
+
+	if(w <= 0 || h <= 0)
+	{
+		return;
+	}
+
+	uint16* s = GetPixelAddressScaled(sx, sy);
+	uint16* d = GetPixelAddressScaled(dx, dy);
+
+	w <<= m_scale.x;
+	h <<= m_scale.y;
+
+	int pitch = GetWidth();
+
+	for(int i = 0; i < h; i++, s += pitch, d += pitch)
+	{
+		memmove(d, s, w * sizeof(uint16));
+	}
+}
+
+// Unpacks the 4 bpp texture page (tx, ty) into dst as 256 x 256 bytes, one
+// index per byte. Every 16-bit word of video memory holds four indices, lowest
+// nibble first; with a horizontal scale only the first word of every scaled
+// group is read.
+void GPULocalMemory::ReadPage4(int tx, int ty, uint8* RESTRICT dst)
+{
+	uint16* line = GetPixelAddressScaled(tx << 6, ty << 8);
+
+	const int pitch = GetWidth() << m_scale.y;
+
+	switch(m_scale.x)
+	{
+	case 0:
+	case 1:
+	case 2:
+		for(int y = 0; y < 256; y++, line += pitch, dst += 256)
+		{
+			for(int x = 0; x < 64; x++)
+			{
+				const uint16 word = line[x << m_scale.x];
+
+				uint8* out = &dst[x * 4];
+
+				out[0] = (word >> 0) & 0xf;
+				out[1] = (word >> 4) & 0xf;
+				out[2] = (word >> 8) & 0xf;
+				out[3] = (word >> 12) & 0xf;
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Copies the 8 bpp texture page (tx, ty) into dst as 256 x 256 bytes. Each
+// line consists of 128 16-bit words holding two indices each; with a
+// horizontal scale only the first word of every scaled group is read.
+void GPULocalMemory::ReadPage8(int tx, int ty, uint8* RESTRICT dst)
+{
+	uint16* line = GetPixelAddressScaled(tx << 6, ty << 8);
+
+	const int pitch = GetWidth() << m_scale.y;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		for(int y = 0; y < 256; y++, line += pitch, dst += 256)
+		{
+			memcpy(dst, line, 256);
+		}
+		break;
+	case 1:
+	case 2:
+		for(int y = 0; y < 256; y++, line += pitch, dst += 256)
+		{
+			uint16* out = (uint16*)dst;
+
+			for(int x = 0; x < 128; x++)
+			{
+				out[x] = line[x << m_scale.x];
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Copies the 16 bpp texture page (tx, ty) into dst as 256 x 256 pixels; with
+// a horizontal scale only the first pixel of every scaled group is read.
+void GPULocalMemory::ReadPage16(int tx, int ty, uint16* RESTRICT dst)
+{
+	uint16* line = GetPixelAddressScaled(tx << 6, ty << 8);
+
+	const int pitch = GetWidth() << m_scale.y;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		for(int y = 0; y < 256; y++, line += pitch, dst += 256)
+		{
+			memcpy(dst, line, 512);
+		}
+		break;
+	case 1:
+	case 2:
+		for(int y = 0; y < 256; y++, line += pitch, dst += 256)
+		{
+			for(int x = 0; x < 256; x++)
+			{
+				dst[x] = line[x << m_scale.x];
+			}
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Converts the lines r.top .. r.bottom - 1 of video memory, starting at
+// column r.left, to 32-bit pixels in dst. r is given in scaled coordinates
+// and dst advances by the video memory width per line. rgb24 selects the
+// packed 24-bit interpretation (Expand24) instead of 16-bit (Expand16).
+void GPULocalMemory::ReadFrame32(const GSVector4i& r, uint32* RESTRICT dst, bool rgb24)
+{
+	uint16* line = GetPixelAddress(r.left, r.top);
+
+	const int pitch = GetWidth();
+
+	for(int y = r.top; y < r.bottom; y++, line += pitch, dst += pitch)
+	{
+		if(rgb24)
+		{
+			Expand24(line, dst, r.width());
+		}
+		else
+		{
+			Expand16(line, dst, r.width());
+		}
+	}
+}
+
+// Expands 16-bit pixels to 32 bits, eight at a time: the three 5-bit fields
+// are moved to the top five bits of bytes 0, 1 and 2, and bit 15 is moved to
+// bit 31. Only whole groups of eight pixels are converted; a remainder of
+// (pixels % 8) pixels is left untouched. src and dst are accessed as
+// GSVector4i.
+void GPULocalMemory::Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int pixels)
+{
+	GSVector4i mask_r = m_rxxx;
+	GSVector4i mask_g = m_xgxx;
+	GSVector4i mask_b = m_xxbx;
+	GSVector4i mask_a = m_xxxa;
+
+	GSVector4i* in = (GSVector4i*)src;
+	GSVector4i* out = (GSVector4i*)dst;
+
+	const int groups = pixels >> 3;
+
+	for(int g = 0; g < groups; g++, out += 2)
+	{
+		GSVector4i packed = in[g];
+
+		// lower and upper four pixels of the group
+		GSVector4i lo = packed.upl16();
+		GSVector4i hi = packed.uph16();
+
+		out[0] = ((lo & mask_r) << 3) | ((lo & mask_g) << 6) | ((lo & mask_b) << 9) | ((lo & mask_a) << 16);
+		out[1] = ((hi & mask_r) << 3) | ((hi & mask_g) << 6) | ((hi & mask_b) << 9) | ((hi & mask_a) << 16);
+	}
+}
+
+// Expands packed 24-bit pixels (three consecutive bytes each) to 32 bits.
+// Two source pixels are handled per step. With a horizontal scale every
+// 16-bit word of video memory is stored (1 << m_scale.x) times, so the bytes
+// are picked from the first copy of each word and every output pixel is
+// written the same number of times.
+void GPULocalMemory::Expand24(const uint16* RESTRICT src, uint32* RESTRICT dst, int pixels)
+{
+	uint8* bytes = (uint8*)src;
+
+	switch(m_scale.x)
+	{
+	case 0:
+		for(int n = 0; n < pixels; n += 2, bytes += 6)
+		{
+			dst[n + 0] = (bytes[2] << 16) | (bytes[1] << 8) | bytes[0];
+			dst[n + 1] = (bytes[5] << 16) | (bytes[4] << 8) | bytes[3];
+		}
+		break;
+	case 1:
+		for(int n = 0; n < pixels; n += 4, bytes += 12)
+		{
+			dst[n + 0] = dst[n + 1] = (bytes[4] << 16) | (bytes[1] << 8) | bytes[0];
+			dst[n + 2] = dst[n + 3] = (bytes[9] << 16) | (bytes[8] << 8) | bytes[5];
+		}
+		break;
+	case 2:
+		for(int n = 0; n < pixels; n += 8, bytes += 24)
+		{
+			dst[n + 0] = dst[n + 1] = dst[n + 2] = dst[n + 3] = (bytes[8] << 16) | (bytes[1] << 8) | bytes[0];
+			dst[n + 4] = dst[n + 5] = dst[n + 6] = dst[n + 7] = (bytes[17] << 16) | (bytes[16] << 8) | bytes[9];
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+#include "GSTextureSW.h"
+
+// Saves the rectangle r2 (unscaled coordinates) of video memory to the image
+// file fn. tp selects how the memory is interpreted (0 = 4 bpp and 1 = 8 bpp
+// through the CLUT at (cx, cy), 2 = 16 bpp); tp == 3 (24 bpp) is not
+// implemented and produces black lines. Nothing is written when the
+// temporary texture cannot be mapped or the scratch lines cannot be allocated.
+void GPULocalMemory::SaveBMP(const string& fn, const GSVector4i& r2, int tp, int cx, int cy)
+{
+	GSVector4i r;
+
+	r.left = r2.left << m_scale.x;
+	r.top = r2.top << m_scale.y;
+	r.right = r2.right << m_scale.x;
+	r.bottom = r2.bottom << m_scale.y;
+
+	r.left &= ~1;
+	r.right &= ~1;
+
+	GSTextureSW t(GSTexture::Offscreen, r.width(), r.height());
+
+	GSTexture::GSMap m;
+
+	if(t.Map(m, NULL))
+	{
+		int pitch = GetWidth();
+
+		const int width = r.width();
+
+		// Expand16 only converts whole groups of eight pixels, so the count is
+		// rounded up; the scratch lines hold "pitch" pixels (a multiple of
+		// eight), which leaves room for the padding as long as the rectangle
+		// is not wider than the video memory.
+
+		const int expand = (width + 7) & ~7;
+
+		const uint16* RESTRICT src = GetPixelAddress(r.left, r.top);
+		const uint16* RESTRICT clut = GetCLUT(tp, cx, cy);
+
+		uint8* RESTRICT dst = m.bits;
+
+		uint16* RESTRICT buff = (uint16*)_aligned_malloc(pitch * sizeof(uint16), 32);
+		uint32* RESTRICT buff32 = (uint32*)_aligned_malloc(pitch * sizeof(uint32), 32);
+
+		const bool allocated = buff != NULL && buff32 != NULL;
+
+		if(allocated)
+		{
+			// start from a defined state: the padding pixels and the whole
+			// line in the unimplemented 24 bpp case are never written below
+
+			memset(buff, 0, pitch * sizeof(uint16));
+
+			for(int j = r.top; j < r.bottom; j++, src += pitch, dst += m.pitch)
+			{
+				switch(tp)
+				{
+				case 0: // 4 bpp
+
+					for(int i = 0, k = width / 2; i < k; i++)
+					{
+						buff[i * 2 + 0] = clut[((uint8*)src)[i] & 0xf];
+						buff[i * 2 + 1] = clut[((uint8*)src)[i] >> 4];
+					}
+
+					break;
+
+				case 1: // 8 bpp
+
+					for(int i = 0, k = width; i < k; i++)
+					{
+						buff[i] = clut[((uint8*)src)[i]];
+					}
+
+					break;
+
+				case 2: // 16 bpp;
+
+					for(int i = 0, k = width; i < k; i++)
+					{
+						buff[i] = src[i];
+					}
+
+					break;
+
+				case 3: // 24 bpp
+
+					// TODO
+
+					break;
+				}
+
+				Expand16(buff, buff32, expand);
+
+				// swap the bytes 0 and 2 of every pixel (red and blue)
+
+				for(int i = 0, k = width; i < k; i++)
+				{
+					buff32[i] = (buff32[i] & 0xff00ff00) | ((buff32[i] & 0x00ff0000) >> 16) | ((buff32[i] & 0x000000ff) << 16);
+				}
+
+				memcpy(dst, buff32, width << 2);
+			}
+		}
+
+		if(buff != NULL)
+		{
+			_aligned_free(buff);
+		}
+
+		if(buff32 != NULL)
+		{
+			_aligned_free(buff32);
+		}
+
+		t.Unmap();
+
+		if(allocated)
+		{
+			t.Save(fn);
+		}
+	}
+}
diff --git a/plugins/GSdx_legacy/GPULocalMemory.h b/plugins/GSdx_legacy/GPULocalMemory.h
new file mode 100644
index 0000000000..4cd94d8a82
--- /dev/null
+++ b/plugins/GSdx_legacy/GPULocalMemory.h
@@ -0,0 +1,84 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPU.h"
+#include "GSVector.h"
+
+class GPULocalMemory // 16-bit video memory plus CLUT/texture caches and rectangle/page transfer helpers
+{
+	static const GSVector4i m_xxxa; // the four m_* constants are defined out of line;
+	static const GSVector4i m_xxbx; // NOTE(review): the names suggest per-channel masks - confirm in GPULocalMemory.cpp
+	static const GSVector4i m_xgxx;
+	static const GSVector4i m_rxxx;
+	// pixel storage, addressed as 16-bit words with a row stride of 1 << (10 + m_scale.x)
+	uint16* m_vm; 
+	// presumably the CLUT most recently produced by GetCLUT(tp, cx, cy) - confirm
+	struct 
+	{
+		uint16* buff;
+		int tp, cx, cy;
+		bool dirty;
+	} m_clut;
+	// presumably the page cache behind GetTexture(); the meaning of the [3][2][16] indices is not visible here
+	struct
+	{
+		uint8* buff[3];
+		void* page[3][2][16];
+		uint16 valid[3][2];
+	} m_texture;
+	// log2 scale factors: coordinates are shifted left by x / y
+	GSVector2i m_scale;
+
+public:
+	GPULocalMemory();
+	virtual ~GPULocalMemory();
+
+	GSVector2i GetScale() {return m_scale;}
+	// dimensions in pixels after scaling: 1024 << scale.x by 512 << scale.y
+	int GetWidth() {return 1 << (10 + m_scale.x);}
+	int GetHeight() {return 1 << (9 + m_scale.y);}
+	// GetPixelAddress takes already-scaled coordinates; GetPixelAddressScaled applies m_scale itself
+	uint16* GetPixelAddress(int x, int y) const {return &m_vm[(y << (10 + m_scale.x)) + x];}
+	uint16* GetPixelAddressScaled(int x, int y) const {return &m_vm[((y << m_scale.y) << (10 + m_scale.x)) + (x << m_scale.x)];}
+
+	const uint16* GetCLUT(int tp, int cx, int cy);
+	const void* GetTexture(int tp, int tx, int ty);
+
+	void Invalidate(const GSVector4i& r);
+
+	void FillRect(const GSVector4i& r, uint16 c);
+	void WriteRect(const GSVector4i& r, const uint16* RESTRICT src);
+	void ReadRect(const GSVector4i& r, uint16* RESTRICT dst);
+	void MoveRect(int sx, int sy, int dx, int dy, int w, int h);
+
+	void ReadPage4(int tx, int ty, uint8* RESTRICT dst);
+	void ReadPage8(int tx, int ty, uint8* RESTRICT dst);
+	void ReadPage16(int tx, int ty, uint16* RESTRICT dst);
+
+	void ReadFrame32(const GSVector4i& r, uint32* RESTRICT dst, bool rgb24);
+
+	void Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int pixels);
+	void Expand24(const uint16* RESTRICT src, uint32* RESTRICT dst, int pixels);
+
+	void SaveBMP(const string& fn, const GSVector4i& r, int tp, int cx, int cy);
+};
diff --git a/plugins/GSdx_legacy/GPURenderer.cpp b/plugins/GSdx_legacy/GPURenderer.cpp
new file mode 100644
index 0000000000..32338c5efb
--- /dev/null
+++ b/plugins/GSdx_legacy/GPURenderer.cpp
@@ -0,0 +1,270 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GPURenderer.h"
+#include "GSdx.h"
+
+#ifdef _WIN32
+
+map<HWND, GPURenderer*> GPURenderer::m_wnd2gpu; // storage for the static HWND -> renderer table consulted by WndProc
+
+#endif
+
+GPURenderer::GPURenderer(GSDevice* dev) // reads the renderer options from the configuration; the window itself is attached later in Create()
+	: m_dev(dev)
+{
+	m_filter = theApp.GetConfig("filter", 0);
+	m_dither = theApp.GetConfig("dithering", 1);
+	m_aspectratio = theApp.GetConfig("AspectRatio", 1);
+	m_vsync = !!theApp.GetConfig("vsync", 0);
+	m_fxaa = !!theApp.GetConfig("fxaa", 0);
+	m_shaderfx = !!theApp.GetConfig("shaderfx", 0);
+	m_scale = m_mem.GetScale();
+	m_shadeboost = !!theApp.GetConfig("ShadeBoost", 0);
+	m_wnd = NULL; // never leave the pointer indeterminate: only the _WIN32 branch below creates a window
+	#ifdef _WIN32
+
+	m_hWnd = NULL;
+	m_wndproc = NULL;
+
+	m_wnd = new GSWndDX();
+
+	#endif
+}
+
+GPURenderer::~GPURenderer() // NOTE(review): neither m_wnd nor m_dev is released here - confirm who owns them
+{
+    #ifdef _WIN32
+	// undo the subclassing done in Create(): m_wndproc is only non-NULL after Create() replaced the window procedure
+	if(m_wndproc)
+	{
+		SetWindowLongPtr(m_hWnd, GWLP_WNDPROC, (LONG_PTR)m_wndproc);
+
+		m_wnd2gpu.erase(m_hWnd);
+	}
+
+	#endif
+}
+
+bool GPURenderer::Create(void* hWnd) // attaches to the host window, creates the device on it and resets the GPU state; false on failure
+{
+    #ifdef _WIN32
+
+	// TODO: move subclassing inside GSWnd::Attach
+
+	m_hWnd = (HWND)hWnd;
+
+	m_wndproc = (WNDPROC)GetWindowLongPtr(m_hWnd, GWLP_WNDPROC);
+
+	// register before subclassing: WndProc asserts and swallows messages for windows missing from the map
+	m_wnd2gpu[m_hWnd] = this;
+	SetWindowLongPtr(m_hWnd, GWLP_WNDPROC, (LONG_PTR)WndProc);
+
+	if(!m_wnd->Attach(m_hWnd))
+	{
+		return false; // the destructor restores the window procedure and removes the map entry
+	}
+
+	SetWindowLong(m_hWnd, GWL_STYLE, GetWindowLong(m_hWnd, GWL_STYLE) | WS_OVERLAPPEDWINDOW);
+
+	#endif
+
+	m_wnd->Show();
+
+	if(!m_dev->Create(m_wnd))
+	{
+		return false;
+	}
+
+	m_dev->SetVSync(m_vsync);
+
+	Reset();
+
+	return true;
+}
+
+bool GPURenderer::Merge()
+{
+	// Fetches the current frame from the subclass, submits it to the device as
+	// the only merge layer and then runs the enabled post-processing passes.
+	// Returns false when GetOutput() yields no texture.
+
+	GSTexture* layers[2] = {GetOutput(), NULL};
+
+	GSTexture* frame = layers[0];
+
+	if(frame == NULL)
+	{
+		return false;
+	}
+
+	GSVector2i size = frame->GetSize();
+
+	GSVector4 srcRects[2];
+	GSVector4 dstRects[2];
+
+	// only slot 0 is filled in because the second layer is NULL
+	srcRects[0] = GSVector4(0, 0, 1, 1);
+	dstRects[0] = GSVector4(0, 0, size.x, size.y);
+
+	m_dev->Merge(layers, srcRects, dstRects, size, 1, 1, GSVector4(0, 0, 0, 1));
+
+	// optional passes, always applied in this order
+
+	if(m_shadeboost) m_dev->ShadeBoost();
+
+	if(m_shaderfx) m_dev->ExternalFX();
+
+	if(m_fxaa) m_dev->FXAA();
+
+	return true;
+}
+
+void GPURenderer::VSync() // per-frame entry point: flush pending work, merge the output, refresh the window text, present
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	m_perfmon.Put(GSPerfMon::Frame);
+
+	// m_env.STATUS.LCF = ~m_env.STATUS.LCF; // ?
+
+	#ifdef _WIN32
+	// nothing to do once the host window has been destroyed
+	if(!IsWindow(m_hWnd)) return;
+
+	#endif
+
+	Flush();
+	// a failed merge skips the frame entirely; a lost device is reset and the frame is still presented below
+	if(!m_dev->IsLost(true))
+	{
+		if(!Merge())
+		{
+			return;
+		}
+	}
+	else
+	{
+		ResetDevice();
+	}
+
+	// osd
+	// refreshed only when the low five bits of the frame counter are zero, i.e. every 32nd frame
+	if((m_perfmon.GetFrame() & 0x1f) == 0)
+	{
+		m_perfmon.Update();
+		// NOTE(review): not guarded against Get(Frame) returning 0 - confirm it cannot
+		double fps = 1000.0f / m_perfmon.Get(GSPerfMon::Frame);
+
+		GSVector4i r = m_env.GetDisplayRect();
+
+		int w = r.width() << m_scale.x;
+		int h = r.height() << m_scale.y;
+
+		string s = format(
+			"%lld | %d x %d | %.2f fps (%d%%) | %d/%d | %d%% CPU | %.2f | %.2f",
+			m_perfmon.GetFrame(), w, h, fps, (int)(100.0 * fps / m_env.GetFPS()),
+			(int)m_perfmon.Get(GSPerfMon::Prim),
+			(int)m_perfmon.Get(GSPerfMon::Draw),
+			m_perfmon.CPU(),
+			m_perfmon.Get(GSPerfMon::Swizzle) / 1024,
+			m_perfmon.Get(GSPerfMon::Unswizzle) / 1024
+		);
+
+		double fillrate = m_perfmon.Get(GSPerfMon::Fillrate);
+
+		if(fillrate > 0)
+		{
+			s = format("%s | %.2f mpps", s.c_str(), fps * fillrate / (1024 * 1024));
+		}
+
+        m_wnd->SetWindowText(s.c_str());
+	}
+
+	GSVector4i r = m_wnd->GetClientRect();
+	// fit the client rectangle to the selected aspect ratio mode before presenting
+	m_dev->Present(r.fit(m_aspectratio), 0);
+}
+
+bool GPURenderer::MakeSnapshot(const string& path) // saves the device's current texture as "<path>_<YYYYMMDDhhmmss>.bmp"; false on failure
+{
+	time_t now = time(NULL);
+	const struct tm* lt = localtime(&now); // may be NULL; strftime must not be given a null tm
+	char buff[16]; // the 14-character timestamp plus the terminator
+
+	if(!lt || !strftime(buff, sizeof(buff), "%Y%m%d%H%M%S", lt))
+	{
+		return false;
+	}
+
+	if(GSTexture* t = m_dev->GetCurrent())
+	{
+		return t->Save(format("%s_%s.bmp", path.c_str(), buff));
+	}
+
+	return false;
+}
+
+#ifdef _WIN32
+
+LRESULT CALLBACK GPURenderer::WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) // static trampoline: routes the message to the renderer registered for hWnd
+{
+	map<HWND, GPURenderer*>::iterator entry = m_wnd2gpu.find(hWnd);
+
+	if(entry == m_wnd2gpu.end())
+	{
+		ASSERT(0); // message for a window no renderer is registered for
+		return 0;
+	}
+
+	GPURenderer* owner = entry->second;
+	return owner->OnMessage(message, wParam, lParam);
+}
+
+LRESULT GPURenderer::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
+{
+	// Handles the renderer's hot keys on key release; every other message, and
+	// every other key, goes to the window procedure that Create() replaced.
+
+	bool handled = (message == WM_KEYUP);
+
+	if(handled)
+	{
+		if(wParam == VK_DELETE) // step m_filter through 0, 1, 2
+			m_filter = (m_filter + 1) % 3;
+		else if(wParam == VK_END) // toggle dithering
+			m_dither = m_dither ? 0 : 1;
+		else if(wParam == VK_NEXT) // step m_aspectratio through 0, 1, 2
+			m_aspectratio = (m_aspectratio + 1) % 3;
+		else if(wParam == VK_PRIOR) // toggle FXAA
+			m_fxaa = !m_fxaa;
+		else if(wParam == VK_HOME) // toggle the external shader pass
+			m_shaderfx = !m_shaderfx;
+		else
+			handled = false;
+	}
+	if(handled) return 0;
+
+	return CallWindowProc(m_wndproc, m_hWnd, message, wParam, lParam);
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GPURenderer.h b/plugins/GSdx_legacy/GPURenderer.h
new file mode 100644
index 0000000000..74fb9b2e7d
--- /dev/null
+++ b/plugins/GSdx_legacy/GPURenderer.h
@@ -0,0 +1,202 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPUState.h"
+#include "GSVertexList.h"
+#include "GSDevice.h"
+#ifdef _WIN32
+#include "GSWndDX.h"
+#endif
+
+class GPURenderer : public GPUState // window/device handling shared by the GPU renderers; subclasses supply the frame through GetOutput()
+{
+	bool Merge();
+
+protected:
+	GSDevice* m_dev;
+	int m_filter; // cycled through 0..2 by the Delete key (see OnMessage)
+	int m_dither; // 0 or 1
+	int m_aspectratio; // cycled through 0..2 by the Page Down key
+	bool m_vsync;
+	bool m_shaderfx;
+	bool m_fxaa;
+	bool m_shadeboost;
+	GSVector2i m_scale; // copied from m_mem.GetScale() in the constructor
+	// hooks: ResetDevice() is called from VSync() when the device is lost, GetOutput() from Merge()
+	virtual void ResetDevice() {}
+	virtual GSTexture* GetOutput() = 0;
+
+    #ifdef _WIN32
+	// window subclassing state: the host window, its original procedure and the HWND -> renderer table
+	HWND m_hWnd;
+	WNDPROC m_wndproc;
+	static map<HWND, GPURenderer*> m_wnd2gpu;
+
+	static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
+	LRESULT OnMessage(UINT message, WPARAM wParam, LPARAM lParam);
+
+	#endif
+	// NOTE(review): only assigned inside the _WIN32 branch of the constructor
+	GSWnd* m_wnd;
+
+public:
+	GPURenderer(GSDevice* dev);
+	virtual ~GPURenderer();
+
+	virtual bool Create(void* hWnd);
+	virtual void VSync();
+	virtual bool MakeSnapshot(const string& path);
+};
+
+template<class Vertex>
+class GPURendererT : public GPURenderer // adds vertex batching on top of GPURenderer: m_vl collects the current primitive, m_vertices the pending batch
+{
+protected:
+	Vertex* m_vertices; // batch buffer, allocated with 100 vertices more than m_maxcount
+	int m_count; // vertices currently batched
+	int m_maxcount; // growth threshold for m_count
+	GSVertexList<Vertex> m_vl;
+	// Drops the batch and the pending primitive, then resets the base state.
+	void Reset()
+	{
+		m_count = 0;
+		m_vl.RemoveAll();
+
+		GPURenderer::Reset();
+	}
+	// Discards the vertices collected for the current primitive.
+	void ResetPrim()
+	{
+		m_vl.RemoveAll();
+	}
+	// Draws and empties the batch if it holds any vertices.
+	void FlushPrim()
+	{
+		if(m_count > 0)
+		{
+			/*
+			Dump("db");
+
+			if(m_env.PRIM.TME)
+			{
+				GSVector4i r;
+
+				r.left = m_env.STATUS.TX << 6;
+				r.top = m_env.STATUS.TY << 8;
+				r.right = r.left + 256;
+				r.bottom = r.top + 256;
+
+				Dump(format("da_%d_%d_%d_%d_%d", m_env.STATUS.TP, r.left, r.top, r.right, r.bottom).c_str(), m_env.STATUS.TP, r, false);
+			}
+			*/
+
+			Draw();
+
+			m_count = 0;
+
+			//Dump("dc", false);
+		}
+	}
+	// Grows the batch buffer by half (at least 10000 vertices); throws GSDXError when out of memory.
+	void GrowVertexBuffer()
+	{
+		int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
+		Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);
+
+		if(vertices == NULL)
+		{
+			printf("GSdx: failed to allocate %d bytes for verticles.\n", (int)sizeof(Vertex) * maxcount);
+			throw GSDXError();
+		}
+
+		if(m_vertices != NULL)
+		{
+			memcpy(vertices, m_vertices, sizeof(Vertex) * m_count); // m_count may already be past m_maxcount (inside the 100-vertex slack); copy every batched vertex
+			_aligned_free(m_vertices);
+		}
+
+		m_vertices = vertices;
+		m_maxcount = maxcount - 100; // keep 100 vertices of slack beyond the threshold
+	}
+	// Moves a complete primitive from m_vl to the batch; returns NULL while it is incomplete or has an unknown type.
+	__forceinline Vertex* DrawingKick(int& count)
+	{
+		count = (int)m_env.PRIM.VTX;
+
+		if(m_vl.GetCount() < count)
+		{
+			return NULL;
+		}
+
+		if(m_count >= m_maxcount)
+		{
+			GrowVertexBuffer();
+		}
+		// the caller advances m_count by count when it accepts the returned vertices
+		Vertex* v = &m_vertices[m_count];
+
+		switch(m_env.PRIM.TYPE)
+		{
+		case GPU_POLYGON:
+			m_vl.GetAt(0, v[0]);
+			m_vl.GetAt(1, v[1]);
+			m_vl.GetAt(2, v[2]);
+			m_vl.RemoveAll();
+			break;
+		case GPU_LINE:
+			m_vl.GetAt(0, v[0]);
+			m_vl.GetAt(1, v[1]);
+			m_vl.RemoveAll();
+			break;
+		case GPU_SPRITE:
+			m_vl.GetAt(0, v[0]);
+			m_vl.GetAt(1, v[1]);
+			m_vl.RemoveAll();
+			break;
+		default:
+			ASSERT(0);
+			m_vl.RemoveAll();
+			return NULL;
+		}
+
+		return v;
+	}
+
+	virtual void VertexKick() = 0;
+
+	virtual void Draw() = 0;
+
+public:
+	GPURendererT(GSDevice* dev)
+		: GPURenderer(dev)
+		, m_vertices(NULL)
+		, m_count(0)
+		, m_maxcount(0)
+	{
+	}
+
+	virtual ~GPURendererT()
+	{
+		if(m_vertices) _aligned_free(m_vertices);
+	}
+};
diff --git a/plugins/GSdx_legacy/GPURendererSW.cpp b/plugins/GSdx_legacy/GPURendererSW.cpp
new file mode 100644
index 0000000000..addd2c4379
--- /dev/null
+++ b/plugins/GSdx_legacy/GPURendererSW.cpp
@@ -0,0 +1,205 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GPURendererSW.h"
+//#include "GSdx.h"
+
+GPURendererSW::GPURendererSW(GSDevice* dev, int threads) // software renderer: allocates the 32-bit output frame and the rasterizer list
+	: GPURendererT<GSVertexSW>(dev)
+	, m_texture(NULL)
+{
+	m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 32);
+	// NOTE(review): the allocation above is not checked for NULL
+	m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads, &m_perfmon);
+}
+
+GPURendererSW::~GPURendererSW() // releases the output texture, the rasterizer list and the output frame buffer
+{
+	delete m_texture;
+
+	delete m_rl;
+
+	_aligned_free(m_output);
+}
+
+void GPURendererSW::ResetDevice() // called from GPURenderer::VSync() when the device reports it is lost
+{
+	delete m_texture;
+	// cleared so that GetOutput() passes a NULL texture to ResizeTexture() next time
+	m_texture = NULL;
+}
+
+GSTexture* GPURendererSW::GetOutput() // converts the visible part of video memory to 32 bits and uploads it into m_texture
+{
+	GSVector4i r = m_env.GetDisplayRect();
+	// scale the display rectangle to the layout of the (possibly upscaled) video memory
+	r.left <<= m_scale.x;
+	r.top <<= m_scale.y;
+	r.right <<= m_scale.x;
+	r.bottom <<= m_scale.y;
+	// when ResizeTexture() fails, m_texture is returned without being updated
+	if(m_dev->ResizeTexture(&m_texture, r.width(), r.height()))
+	{
+		m_mem.ReadFrame32(r, m_output, !!m_env.STATUS.ISRGB24);
+		// rows of m_output are GetWidth() 32-bit pixels apart
+		m_texture->Update(r.rsize(), m_output, m_mem.GetWidth() * sizeof(uint32));
+	}
+
+	return m_texture;
+}
+
+void GPURendererSW::Draw() // packages the batched vertices and the current GPU state into one rasterizer job and runs it
+{
+	GPUDrawScanline::SharedData* sd = new GPUDrawScanline::SharedData();
+	// from here on the shared_ptr owns sd, so the early return below does not leak it
+	shared_ptr<GSRasterizerData> data(sd);
+
+	GPUScanlineGlobalData& gd = sd->global;
+
+	const GPUDrawingEnvironment& env = m_env;
+	// build the scanline selector key from the drawing environment
+	gd.sel.key = 0;
+	gd.sel.iip = env.PRIM.IIP;
+	gd.sel.me = env.STATUS.ME;
+
+	if(env.PRIM.ABE)
+	{
+		gd.sel.abe = env.PRIM.ABE;
+		gd.sel.abr = env.STATUS.ABR;
+	}
+
+	gd.sel.tge = env.PRIM.TGE;
+
+	if(env.PRIM.TME)
+	{
+		gd.sel.tme = env.PRIM.TME;
+		gd.sel.tlu = env.STATUS.TP < 2;
+		gd.sel.twin = (env.TWIN.u32 & 0xfffff) != 0;
+		gd.sel.ltf = m_filter == 1 && env.PRIM.TYPE == GPU_POLYGON || m_filter == 2 ? 1 : 0; // filter 1 applies to polygons only, filter 2 to every primitive
+
+		const void* t = m_mem.GetTexture(env.STATUS.TP, env.STATUS.TX, env.STATUS.TY);
+
+		if(!t) {ASSERT(0); return;}
+
+		gd.tex = t;
+		// NOTE(review): not freed in this function - presumably released together with the SharedData; confirm
+		gd.clut = (uint16*)_aligned_malloc(sizeof(uint16) * 256, 32);
+		// the buffer always has room for 256 entries; only 16 are copied when TP == 0
+		memcpy(gd.clut, m_mem.GetCLUT(env.STATUS.TP, env.CLUT.X, env.CLUT.Y), sizeof(uint16) * (env.STATUS.TP == 0 ? 16 : 256));
+
+		gd.twin = GSVector4i(env.TWIN.TWW, env.TWIN.TWH, env.TWIN.TWX, env.TWIN.TWY);
+	}
+
+	gd.sel.dtd = m_dither ? env.STATUS.DTD : 0;
+	gd.sel.md = env.STATUS.MD;
+	gd.sel.sprite = env.PRIM.TYPE == GPU_SPRITE;
+	gd.sel.scalex = m_mem.GetScale().x;
+
+	gd.vm = m_mem.GetPixelAddress(0, 0);
+	// scissor: the scaled drawing area, with right/bottom clamped to the video memory size
+	data->scissor.left = (int)m_env.DRAREATL.X << m_scale.x;
+	data->scissor.top = (int)m_env.DRAREATL.Y << m_scale.y;
+	data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
+	data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
+	// the job gets its own copy of the batch
+	data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 32);
+	data->vertex = (GSVertexSW*)data->buff;
+	data->vertex_count = m_count;
+
+	memcpy(data->vertex, m_vertices, sizeof(GSVertexSW) * m_count);
+	// NOTE(review): data->buff is not checked for NULL before the copy above
+	data->frame = m_perfmon.GetFrame();
+
+	int prims = 0;
+
+	switch(env.PRIM.TYPE)
+	{
+	case GPU_POLYGON: data->primclass = GS_TRIANGLE_CLASS; prims = data->vertex_count / 3; break;
+	case GPU_LINE: data->primclass = GS_LINE_CLASS; prims = data->vertex_count / 2; break;
+	case GPU_SPRITE: data->primclass = GS_SPRITE_CLASS; prims = data->vertex_count / 2; break;
+	default: __assume(0);
+	}
+
+	// TODO: VertexTrace
+	// component-wise min/max over all vertex positions
+	GSVector4 tl(+1e10f);
+	GSVector4 br(-1e10f);
+
+	GSVertexSW* v = data->vertex;
+
+	for(int i = 0, j = data->vertex_count; i < j; i++)
+	{
+		GSVector4 p = v[i].p;
+
+		tl = tl.min(p);
+		br = br.max(p);
+	}
+
+	data->bbox = GSVector4i(tl.xyxy(br));
+
+	GSVector4i r = data->bbox.rintersect(data->scissor);
+	// Invalidate() is given the touched rectangle shifted back down by the scale
+	r.left >>= m_scale.x;
+	r.top >>= m_scale.y;
+	r.right >>= m_scale.x;
+	r.bottom >>= m_scale.y;
+
+	Invalidate(r);
+
+	m_rl->Queue(data);
+	// NOTE(review): Sync() directly after Queue() presumably waits for the job to finish - confirm
+	m_rl->Sync();
+
+	m_perfmon.Put(GSPerfMon::Draw, 1);
+	m_perfmon.Put(GSPerfMon::Prim, prims);
+	m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels());
+}
+
+void GPURendererSW::VertexKick() // converts the current input vertex (m_v) to a GSVertexSW and batches the primitive once it is complete
+{
+	GSVertexSW& dst = m_vl.AddTail();
+
+	// TODO: x/y + off.x/y should wrap around at +/-1024
+	// position = (vertex + drawing offset), shifted by the scale
+	int x = (int)(m_v.XY.X + m_env.DROFF.X) << m_scale.x;
+	int y = (int)(m_v.XY.Y + m_env.DROFF.Y) << m_scale.y;
+
+	int u = m_v.UV.X;
+	int v = m_v.UV.Y;
+
+	GSVector4 pt(x, y, u, v);
+	// p takes (x, y), t takes (u, v) offset by 0.125 and multiplied by 256
+	dst.p = pt.xyxy(GSVector4::zero());
+	dst.t = (pt.zwzw(GSVector4::zero()) + GSVector4(0.125f)) * 256.0f;
+	// dst.c = GSVector4(m_v.RGB.u32) * 128.0f;
+	dst.c = GSVector4(GSVector4i::load((int)m_v.RGB.u32).u8to32() << 7); // bytes widened to 32 bits and shifted left by 7 (x128)
+
+	int count = 0;
+	// DrawingKick() returns NULL until the primitive has all its vertices
+	if(DrawingKick(count))
+	{
+		// TODO
+
+		m_count += count;
+	}
+}
+
diff --git a/plugins/GSdx_legacy/GPURendererSW.h b/plugins/GSdx_legacy/GPURendererSW.h
new file mode 100644
index 0000000000..e340191bef
--- /dev/null
+++ b/plugins/GSdx_legacy/GPURendererSW.h
@@ -0,0 +1,42 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPURenderer.h"
+#include "GPUDrawScanline.h"
+
+class GPURendererSW : public GPURendererT<GSVertexSW> // software-rasterizing renderer built on the GSVertexSW batcher
+{
+protected:
+	IRasterizer* m_rl; // created in the constructor, deleted in the destructor
+	GSTexture* m_texture; // output texture, (re)sized in GetOutput() and dropped in ResetDevice()
+	uint32* m_output; // GetWidth() x GetHeight() 32-bit frame that GetOutput() fills
+
+	void ResetDevice();
+	GSTexture* GetOutput();
+	void VertexKick();
+	void Draw();
+
+public:
+	GPURendererSW(GSDevice* dev, int threads);
+	virtual ~GPURendererSW();
+};
diff --git a/plugins/GSdx_legacy/GPUScanlineEnvironment.h b/plugins/GSdx_legacy/GPUScanlineEnvironment.h
new file mode 100644
index 0000000000..ad9d7fa1fd
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUScanlineEnvironment.h
@@ -0,0 +1,78 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSVector.h"
+#include "GPULocalMemory.h"
+
+union GPUScanlineSelector // drawing state packed into a 32-bit key; the trailing comments give each field's first bit
+{
+	struct
+	{
+		uint32 iip:1; // 0
+		uint32 me:1; // 1
+		uint32 abe:1; // 2
+		uint32 abr:2; // 3
+		uint32 tge:1; // 5
+		uint32 tme:1; // 6
+		uint32 twin:1; // 7
+		uint32 tlu:1; // 8
+		uint32 dtd:1; // 9
+		uint32 ltf:1; // 10
+		uint32 md:1; // 11
+		uint32 sprite:1; // 12
+		uint32 scalex:2; // 13
+	};
+	// second view of the same bits: per the bit positions, rfb spans me+abe and tfx spans tge+tme
+	struct
+	{
+		uint32 _pad1:1; // 0
+		uint32 rfb:2; // 1
+		uint32 _pad2:2; // 3
+		uint32 tfx:2; // 5
+	};
+
+	uint32 key;
+
+	operator uint32() const {return key;}
+};
+
+__aligned(struct, 32) GPUScanlineGlobalData // per-draw state filled in by GPURendererSW::Draw()
+{
+	GPUScanlineSelector sel;
+
+	void* vm; // set from GPULocalMemory::GetPixelAddress(0, 0)
+	const void* tex; // set from GPULocalMemory::GetTexture()
+	uint16* clut; // 256-entry copy allocated in Draw()
+	GSVector4i twin; // TWW, TWH, TWX, TWY
+};
+
+__aligned(struct, 32) GPUScanlineLocalData // scratch block that the generated setup code stores into
+{
+	const GPUScanlineGlobalData* gd;
+	// twin[2] is written by GPUSetupPrimCodeGenerator; the use of entries 0 and 1 is not visible here
+	struct {GSVector4i u, v;} twin[3];
+	struct {GSVector4i s, t, r, g, b, _pad[3];} d; // per-pixel deltas (delta multiplied by pixel index 0..7)
+	struct {GSVector4i st, c;} d8; // deltas multiplied by 8
+
+	struct {GSVector4i s, t, r, b, g, uf, vf, dither, fd, test;} temp;
+};
diff --git a/plugins/GSdx_legacy/GPUSettingsDlg.cpp b/plugins/GSdx_legacy/GPUSettingsDlg.cpp
new file mode 100644
index 0000000000..bdbcecb9b3
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUSettingsDlg.cpp
@@ -0,0 +1,151 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSUtil.h"
+#include "GPUSettingsDlg.h"
+#include "resource.h"
+
+GPUSettingsDlg::GPUSettingsDlg() // binds the dialog to the IDD_GPUCONFIG resource
+	: GSDialog(IDD_GPUCONFIG)
+{
+}
+
+void GPUSettingsDlg::OnInit() // fills every control from the stored configuration
+{
+	__super::OnInit();
+
+	m_modes.clear();
+	// resolution list: a zeroed placeholder entry first, then every X8R8G8B8 mode of the default adapter
+	{
+		D3DDISPLAYMODE mode;
+		memset(&mode, 0, sizeof(mode));
+		m_modes.push_back(mode);
+		// each combo item stores a pointer to its element of the m_modes list
+		ComboBoxAppend(IDC_RESOLUTION, "Please select...", (LPARAM)&m_modes.back(), true);
+
+		if(CComPtr<IDirect3D9> d3d = Direct3DCreate9(D3D_SDK_VERSION))
+		{
+			uint32 w = theApp.GetConfig("ModeWidth", 0);
+			uint32 h = theApp.GetConfig("ModeHeight", 0);
+			uint32 hz = theApp.GetConfig("ModeRefreshRate", 0);
+
+			uint32 n = d3d->GetAdapterModeCount(D3DADAPTER_DEFAULT, D3DFMT_X8R8G8B8);
+
+			for(uint32 i = 0; i < n; i++)
+			{
+				if(S_OK == d3d->EnumAdapterModes(D3DADAPTER_DEFAULT, D3DFMT_X8R8G8B8, i, &mode))
+				{
+					m_modes.push_back(mode);
+
+					string str = format("%dx%d %dHz", mode.Width, mode.Height, mode.RefreshRate);
+					// the last argument is true for the mode that matches the stored configuration
+					ComboBoxAppend(IDC_RESOLUTION, str.c_str(), (LPARAM)&m_modes.back(), w == mode.Width && h == mode.Height && hz == mode.RefreshRate);
+				}
+			}
+		}
+	}
+
+	ComboBoxInit(IDC_RENDERER, theApp.m_gpu_renderers, theApp.GetConfig("Renderer", 0));
+	ComboBoxInit(IDC_FILTER, theApp.m_gpu_filter, theApp.GetConfig("filter", 0));
+	ComboBoxInit(IDC_DITHERING, theApp.m_gpu_dithering, theApp.GetConfig("dithering", 1));
+	ComboBoxInit(IDC_ASPECTRATIO, theApp.m_gpu_aspectratio, theApp.GetConfig("AspectRatio", 1));
+	ComboBoxInit(IDC_SCALE, theApp.m_gpu_scale, theApp.GetConfig("scale_x", 0) | (theApp.GetConfig("scale_y", 0) << 2)); // scale_x in bits 0-1, scale_y from bit 2 up
+
+	CheckDlgButton(m_hWnd, IDC_WINDOWED, theApp.GetConfig("windowed", 1));
+	// spin control for the extra rendering threads, range 0..16
+	SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_SETRANGE, 0, MAKELPARAM(16, 0));
+	SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("extrathreads", DEFAULT_EXTRA_RENDERING_THREADS), 0));
+
+	UpdateControls();
+}
+
+bool GPUSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code) // refreshes dependent controls on renderer change; stores all settings on OK
+{
+	if(id == IDC_RENDERER && code == CBN_SELCHANGE)
+	{
+		UpdateControls();
+	}
+	else if(id == IDOK)
+	{
+		INT_PTR data;
+		// the resolution items carry a pointer into m_modes (see OnInit)
+		if(ComboBoxGetSelData(IDC_RESOLUTION, data))
+		{
+			const D3DDISPLAYMODE* mode = (D3DDISPLAYMODE*)data;
+
+			theApp.SetConfig("ModeWidth", (int)mode->Width);
+			theApp.SetConfig("ModeHeight", (int)mode->Height);
+			theApp.SetConfig("ModeRefreshRate", (int)mode->RefreshRate);
+		}
+
+		if(ComboBoxGetSelData(IDC_RENDERER, data))
+		{
+			theApp.SetConfig("Renderer", (int)data);
+		}
+
+		if(ComboBoxGetSelData(IDC_FILTER, data))
+		{
+			theApp.SetConfig("filter", (int)data);
+		}
+
+		if(ComboBoxGetSelData(IDC_DITHERING, data))
+		{
+			theApp.SetConfig("dithering", (int)data);
+		}
+
+		if(ComboBoxGetSelData(IDC_ASPECTRATIO, data))
+		{
+			theApp.SetConfig("AspectRatio", (int)data);
+		}
+		// the scale item packs scale_x in bits 0-1 and scale_y in bits 2-3
+		if(ComboBoxGetSelData(IDC_SCALE, data))
+		{
+			theApp.SetConfig("scale_x", (int)(data & 3)); // explicit narrowing, like every other SetConfig call here
+			theApp.SetConfig("scale_y", (int)((data >> 2) & 3));
+		}
+
+		theApp.SetConfig("extrathreads", (int)SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_GETPOS, 0, 0));
+		theApp.SetConfig("windowed", (int)IsDlgButtonChecked(m_hWnd, IDC_WINDOWED));
+	}
+
+	return __super::OnCommand(hWnd, id, code);
+}
+
+void GPUSettingsDlg::UpdateControls() // syncs the logos and the scale / thread controls with the selected renderer; no-op without a selection
+{
+	INT_PTR sel;
+
+	if(!ComboBoxGetSelData(IDC_RENDERER, sel))
+	{
+		return;
+	}
+
+	const int logo9 = (sel == 0) ? SW_SHOW : SW_HIDE;
+	const int logo11 = (sel == 1) ? SW_SHOW : SW_HIDE;
+	const bool enable = (0 <= sel && sel <= 2); // renderer ids 0, 1 and 2 all expose these options
+	ShowWindow(GetDlgItem(m_hWnd, IDC_LOGO9), logo9);
+	ShowWindow(GetDlgItem(m_hWnd, IDC_LOGO11), logo11);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SCALE), enable);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS_EDIT), enable);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS), enable);
+}
diff --git a/plugins/GSdx_legacy/GPUSettingsDlg.h b/plugins/GSdx_legacy/GPUSettingsDlg.h
new file mode 100644
index 0000000000..7af5202a69
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUSettingsDlg.h
@@ -0,0 +1,39 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDialog.h"
+#include "GSSetting.h"
+
+class GPUSettingsDlg : public GSDialog // configuration dialog for the GPU plugin settings
+{
+	list<D3DDISPLAYMODE> m_modes; // backing store for the resolution combo: its items hold pointers to these elements
+
+	void UpdateControls();
+
+protected:
+	void OnInit();
+	bool OnCommand(HWND hWnd, UINT id, UINT code);
+
+public:
+	GPUSettingsDlg();
+};
diff --git a/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.cpp b/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.cpp
new file mode 100644
index 0000000000..5367fc3a0b
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.cpp
@@ -0,0 +1,228 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+// TODO: x64
+
+#include "stdafx.h"
+#include "GPUSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+using namespace Xbyak;
+
+static const int _args = 0; // byte offsets added to esp by the generated code to reach its stack arguments
+static const int _vertex = _args + 4; // vertex array pointer
+static const int _index = _args + 8; // index array pointer
+static const int _dscan = _args + 12; // pointer to the per-scanline delta vertex
+
+GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize) // emits the setup routine for selector `key` during construction
+	: GSCodeGenerator(code, maxsize)
+	, m_local(*(GPUScanlineLocalData*)param)
+{
+	m_sel.key = key;
+	// param must point at the GPUScanlineLocalData whose address gets baked into the emitted code
+	Generate();
+}
+
+void GPUSetupPrimCodeGenerator::Generate() // emits 32-bit x86 SSE code that fills m_local.twin[2], m_local.d8 and m_local.d for the selector in m_sel
+{
+	if(m_sel.tme && !m_sel.twin)
+	{
+		pcmpeqd(xmm0, xmm0);
+
+		if(m_sel.sprite)
+		{
+			// t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001();
+
+			mov(ecx, ptr[esp + _index]);
+			mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
+			shl(ecx, 6); // * sizeof(GSVertexSW)
+			add(ecx, ptr[esp + _vertex]);
+
+			cvttps2dq(xmm1, ptr[ecx + offsetof(GSVertexSW, t)]);
+			psrld(xmm1, 8);
+			psrld(xmm0, 31);
+			psubd(xmm1, xmm0);
+
+			// t = t.ps32(t);
+			// t = t.upl16(t);
+
+			packssdw(xmm1, xmm1);
+			punpcklwd(xmm1, xmm1);
+
+			// m_local.twin[2].u = t.xxxx();
+			// m_local.twin[2].v = t.yyyy();
+
+			pshufd(xmm2, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
+			pshufd(xmm3, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
+
+			movdqa(ptr[&m_local.twin[2].u], xmm2);
+			movdqa(ptr[&m_local.twin[2].v], xmm3);
+		}
+		else
+		{
+			// TODO: not really needed
+
+			// m_local.twin[2].u = GSVector4i::x00ff();
+			// m_local.twin[2].v = GSVector4i::x00ff();
+
+			psrlw(xmm0, 8);
+
+			movdqa(ptr[&m_local.twin[2].u], xmm0);
+			movdqa(ptr[&m_local.twin[2].v], xmm0);
+		}
+	}
+	// deltas are only needed for texturing or for interpolated colour that is actually used
+	if(m_sel.tme || m_sel.iip && m_sel.tfx != 3)
+	{
+		mov(edx, dword[esp + _dscan]);
+		// xmm5 = 8.0, xmm6 = (0, 1, 2, 3), xmm7 = (4, 5, 6, 7) from m_shift
+		for(int i = 0; i < 3; i++)
+		{
+			movaps(Xmm(5 + i), ptr[&m_shift[i]]);
+		}
+
+		// GSVector4 dt = dscan.t;
+		// GSVector4 dc = dscan.c;
+
+		movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]);
+		movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]);
+
+		// GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f));
+
+		movaps(xmm1, xmm3);
+		mulps(xmm1, xmm5);
+		cvttps2dq(xmm1, xmm1);
+		movaps(xmm2, xmm4);
+		mulps(xmm2, xmm5);
+		cvttps2dq(xmm2, xmm2);
+		packssdw(xmm1, xmm2);
+
+		if(m_sel.tme)
+		{
+			// m_local.d8.st = dtc8.upl16(dtc8);
+
+			movdqa(xmm0, xmm1);
+			punpcklwd(xmm0, xmm0);
+			movdqa(ptr[&m_local.d8.st], xmm0);
+		}
+
+		if(m_sel.iip && m_sel.tfx != 3)
+		{
+			// m_local.d8.c = dtc8.uph16(dtc8);
+
+			punpckhwd(xmm1, xmm1);
+			movdqa(ptr[&m_local.d8.c], xmm1);
+		}
+
+		// xmm3 = dt
+		// xmm4 = dc
+		// xmm6 = ps0123
+		// xmm7 = ps4567
+		// xmm0, xmm1, xmm2, xmm5 = free
+
+		if(m_sel.tme)
+		{
+			// GSVector4 dtx = dt.xxxx();
+			// GSVector4 dty = dt.yyyy();
+
+			movaps(xmm0, xmm3);
+			shufps(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
+			shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+
+			// m_local.d.s = GSVector4i(dtx * ps0123).ps32(GSVector4i(dtx * ps4567));
+
+			movaps(xmm1, xmm3);
+			mulps(xmm3, xmm6);
+			mulps(xmm1, xmm7);
+			cvttps2dq(xmm3, xmm3);
+			cvttps2dq(xmm1, xmm1);
+			packssdw(xmm3, xmm1);
+			movdqa(ptr[&m_local.d.s], xmm3);
+
+			// m_local.d.t = GSVector4i(dty * ps0123).ps32(GSVector4i(dty * ps4567));
+
+			movaps(xmm1, xmm0);
+			mulps(xmm0, xmm6);
+			mulps(xmm1, xmm7);
+			cvttps2dq(xmm0, xmm0);
+			cvttps2dq(xmm1, xmm1);
+			packssdw(xmm0, xmm1);
+			movdqa(ptr[&m_local.d.t], xmm0);
+		}
+
+		// xmm4 = dc
+		// xmm6 = ps0123
+		// xmm7 = ps4567
+		// xmm0, xmm1, xmm2, xmm3, xmm5 = free
+
+		if(m_sel.iip && m_sel.tfx != 3)
+		{
+			// GSVector4 dcx = dc.xxxx();
+			// GSVector4 dcy = dc.yyyy();
+			// GSVector4 dcz = dc.zzzz();
+
+			movaps(xmm0, xmm4);
+			movaps(xmm1, xmm4);
+			shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+			shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+			shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+
+			// m_local.d.r = GSVector4i(dcx * ps0123).ps32(GSVector4i(dcx * ps4567));
+
+			movaps(xmm2, xmm4);
+			mulps(xmm4, xmm6);
+			mulps(xmm2, xmm7);
+			cvttps2dq(xmm4, xmm4);
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm4, xmm2);
+			movdqa(ptr[&m_local.d.r], xmm4);
+
+			// m_local.d.g = GSVector4i(dcy * ps0123).ps32(GSVector4i(dcy * ps4567));
+
+			movaps(xmm2, xmm0);
+			mulps(xmm0, xmm6);
+			mulps(xmm2, xmm7);
+			cvttps2dq(xmm0, xmm0);
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm0, xmm2);
+			movdqa(ptr[&m_local.d.g], xmm0);
+
+			// m_local.d.b = GSVector4i(dcz * ps0123).ps32(GSVector4i(dcz * ps4567));
+
+			movaps(xmm2, xmm1);
+			mulps(xmm1, xmm6);
+			mulps(xmm2, xmm7);
+			cvttps2dq(xmm1, xmm1);
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm1, xmm2);
+			movdqa(ptr[&m_local.d.b], xmm1);
+		}
+	}
+
+	ret();
+}
+
+const GSVector4 GPUSetupPrimCodeGenerator::m_shift[3] = // multipliers that Generate() loads into xmm5, xmm6 and xmm7
+{
+	GSVector4(8.0f, 8.0f, 8.0f, 8.0f),
+	GSVector4(0.0f, 1.0f, 2.0f, 3.0f),
+	GSVector4(4.0f, 5.0f, 6.0f, 7.0f),
+};
diff --git a/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.h b/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.h
new file mode 100644
index 0000000000..938c8b4736
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.h
@@ -0,0 +1,40 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPUScanlineEnvironment.h"
+#include "GSFunctionMap.h"
+
+// Code generator (derived from GSCodeGenerator) that emits the primitive setup routine
+// for the PSX GPU software rasterizer.
+class GPUSetupPrimCodeGenerator : public GSCodeGenerator
+{
+	// Declared private; no definition is visible here, so this presumably exists only to forbid assignment.
+	void operator = (const GPUSetupPrimCodeGenerator&);
+
+	// Selector describing which code variant to generate (Generate() tests m_sel.iip / m_sel.tfx).
+	GPUScanlineSelector m_sel;
+	// Scanline data block whose address is baked into the generated code (e.g. m_local.d.r/g/b).
+	GPUScanlineLocalData& m_local;
+
+	// Emits the machine code for the current selector.
+	void Generate();
+
+public:
+	// NOTE(review): presumably param -> m_local, key -> m_sel, code/maxsize -> output buffer; confirm in the .cpp.
+	GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize);
+
+	// Constant vectors defined in GPUSetupPrimCodeGenerator.cpp.
+	static const GSVector4 m_shift[3];
+};
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GPUState.cpp b/plugins/GSdx_legacy/GPUState.cpp
new file mode 100644
index 0000000000..90feb5adf8
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUState.cpp
@@ -0,0 +1,809 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GPUState.h"
+
+// Clears the status-command shadow array, fills both handler dispatch tables and
+// finishes with an initial Reset().
+GPUState::GPUState()
+	: s_n(0)
+{
+	memset(m_status, 0, sizeof(m_status));
+
+	// Every status command starts out mapped to the null handler ...
+	for(size_t cmd = 0; cmd < countof(m_fpGPUStatusCommandHandlers); cmd++)
+	{
+		m_fpGPUStatusCommandHandlers[cmd] = &GPUState::SCH_Null;
+	}
+
+	// ... and the implemented ones are then overridden from this table.
+	struct StatusEntry {uint8 cmd; GPUStatusCommandHandler handler;};
+
+	const StatusEntry status_handlers[] =
+	{
+		{0x00, &GPUState::SCH_ResetGPU},
+		{0x01, &GPUState::SCH_ResetCommandBuffer},
+		{0x02, &GPUState::SCH_ResetIRQ},
+		{0x03, &GPUState::SCH_DisplayEnable},
+		{0x04, &GPUState::SCH_DMASetup},
+		{0x05, &GPUState::SCH_StartOfDisplayArea},
+		{0x06, &GPUState::SCH_HorizontalDisplayRange},
+		{0x07, &GPUState::SCH_VerticalDisplayRange},
+		{0x08, &GPUState::SCH_DisplayMode},
+		{0x10, &GPUState::SCH_GPUInfo},
+	};
+
+	for(size_t n = 0; n < sizeof(status_handlers) / sizeof(status_handlers[0]); n++)
+	{
+		m_fpGPUStatusCommandHandlers[status_handlers[n].cmd] = status_handlers[n].handler;
+	}
+
+	// Data packets are dispatched on PACKET.TYPE; the table below is indexed by that value.
+	const GPUPacketHandler packet_handlers[] =
+	{
+		&GPUState::PH_Command,
+		&GPUState::PH_Polygon,
+		&GPUState::PH_Line,
+		&GPUState::PH_Sprite,
+		&GPUState::PH_Move,
+		&GPUState::PH_Write,
+		&GPUState::PH_Read,
+		&GPUState::PH_Environment,
+	};
+
+	for(size_t type = 0; type < sizeof(packet_handlers) / sizeof(packet_handlers[0]); type++)
+	{
+		m_fpGPUPacketHandler[type] = packet_handlers[type];
+	}
+
+	Reset();
+}
+
+// Nothing to release explicitly: the Buffer members free their storage in ~Buffer().
+GPUState::~GPUState()
+{
+}
+
+// Resets the drawing environment, invalidates the 1024x512 memory rectangle and
+// zeroes the vertex that is currently being assembled.
+void GPUState::Reset()
+{
+	const GSVector4i full_area(0, 0, 1024, 512);
+
+	m_env.Reset();
+	m_mem.Invalidate(full_area);
+	memset(&m_v, 0, sizeof(m_v));
+}
+
+// Default flush: forwards to FlushPrim(), which is pure virtual in GPUState.
+void GPUState::Flush()
+{
+	FlushPrim();
+}
+
+// Derives the canonical PRIM register for the packet header `r` and, if it differs
+// from the current one, flushes pending work and installs it.
+void GPUState::SetPrim(GPUReg* r)
+{
+	// Switching to a different primitive type restarts primitive assembly.
+	if(r->PRIM.TYPE != m_env.PRIM.TYPE)
+	{
+		ResetPrim();
+	}
+
+	GPURegPRIM prim = r->PRIM;
+
+	prim.VTX = 0;
+
+	const uint32 header = r->PRIM.u32;
+
+	// Keep only the bits that matter for each type; the low bits carry a per-type constant.
+	switch(r->PRIM.TYPE)
+	{
+	case GPU_POLYGON:
+		prim.u32 = (header & 0xF7000000) | 3; // TYPE IIP TME ABE TGE
+		break;
+	case GPU_LINE:
+		prim.u32 = (header & 0xF2000000) | 2; // TYPE IIP ABE
+		prim.TGE = 1; // ?
+		break;
+	case GPU_SPRITE:
+		prim.u32 = (header & 0xE7000000) | 2; // TYPE TME ABE TGE
+		break;
+	}
+
+	if(m_env.PRIM.u32 == prim.u32)
+	{
+		return;
+	}
+
+	Flush();
+
+	m_env.PRIM = prim;
+}
+
+// Replaces the upper 16 bits (X Y) of the CLUT register with those of `r`,
+// flushing first when the register value actually changes.
+void GPUState::SetCLUT(GPUReg* r)
+{
+	const uint32 xy_mask = 0xFFFF0000; // X Y
+
+	const uint32 updated = (r->u32 & xy_mask) | (m_env.CLUT.u32 & ~xy_mask);
+
+	if(updated == m_env.CLUT.u32)
+	{
+		return;
+	}
+
+	Flush();
+
+	m_env.CLUT.u32 = updated;
+}
+
+// Copies the texture page bits found in the upper half of `r` into the low nine
+// bits of STATUS (TP ABR TY TX), flushing first when the value changes.
+void GPUState::SetTPAGE(GPUReg* r)
+{
+	const uint32 tpage_mask = 0x000001FF; // TP ABR TY TX
+
+	const uint32 tpage = (r->u32 >> 16) & tpage_mask;
+
+	const uint32 updated = (m_env.STATUS.u32 & ~tpage_mask) | tpage;
+
+	if(updated == m_env.STATUS.u32)
+	{
+		return;
+	}
+
+	Flush();
+
+	m_env.STATUS.u32 = updated;
+}
+
+// Forwards the invalidated rectangle to the local memory object (virtual, so subclasses may extend it).
+void GPUState::Invalidate(const GSVector4i& r)
+{
+	m_mem.Invalidate(r);
+}
+
+// Appends `size` 32-bit words from `mem` to the pending command stream and executes
+// every complete packet at the front of it.
+//
+// Each packet handler is given the number of words still available and returns the
+// number of words it consumed, or 0 when the packet is not complete yet. Everything
+// that was executed is removed from the buffer before returning, including when the
+// stream ends in an incomplete packet; otherwise the packets already executed in
+// this call would be executed again on the next call.
+void GPUState::WriteData(const uint8* mem, uint32 size)
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	size <<= 2; // words -> bytes
+
+	m_write.Append(mem, size);
+
+	int i = 0;
+
+	while(i < m_write.bytes)
+	{
+		GPUReg* r = (GPUReg*)&m_write.buff[i];
+
+		int ret = (this->*m_fpGPUPacketHandler[r->PACKET.TYPE])(r, (m_write.bytes - i) >> 2);
+
+		if(ret == 0) break; // need more data, keep the partial packet buffered
+
+		i += ret << 2;
+	}
+
+	// Drop what has been executed; the unconsumed tail (if any) moves to the front.
+	if(i > 0)
+	{
+		m_write.Remove(i);
+	}
+}
+
+// Copies up to `size` 32-bit words from the read buffer into `mem`, advancing the
+// read cursor. A request larger than what is left is silently truncated (the rest
+// of `mem` is left untouched). STATUS.IMG is cleared once the buffer is drained.
+void GPUState::ReadData(uint8* mem, uint32 size)
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	const int available = m_read.bytes - m_read.cur;
+
+	const int requested = (int)size << 2;
+
+	// Clamp instead of asserting or zero-filling the remainder.
+	const int count = requested > available ? available : requested;
+
+	memcpy(mem, &m_read.buff[m_read.cur], count);
+
+	m_read.cur += count;
+
+	if(m_read.cur >= m_read.bytes)
+	{
+		m_env.STATUS.IMG = 0;
+	}
+}
+
+// Handles a status-port write: the top byte selects the command, the raw word is
+// remembered in m_status[] (used by Freeze/Defrost) and the matching handler runs.
+void GPUState::WriteStatus(uint32 status)
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	const uint32 cmd = status >> 24;
+
+	m_status[cmd] = status;
+
+	GPUStatusCommandHandler handler = m_fpGPUStatusCommandHandlers[cmd];
+
+	(this->*handler)((GPUReg*)&status);
+}
+
+// Returns the STATUS register; the LCF bit is inverted on every read before the
+// value is returned.
+uint32 GPUState::ReadStatus()
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	m_env.STATUS.LCF = ~m_env.STATUS.LCF; // ?
+
+	const uint32 status = m_env.STATUS.u32;
+
+	return status;
+}
+
+// Saves the state into `data`: the STATUS word, the 256 remembered status-command
+// words and the contents of the 1024x512 memory rectangle.
+void GPUState::Freeze(GPUFreezeData* data)
+{
+	const GSVector4i full_area(0, 0, 1024, 512);
+
+	data->status = m_env.STATUS.u32;
+
+	memcpy(data->control, m_status, sizeof(m_status));
+
+	m_mem.ReadRect(full_area, data->vram);
+}
+
+// Restores the state saved by Freeze() and then replays the remembered status
+// commands 0..8 through WriteStatus() so their handlers are applied again.
+void GPUState::Defrost(const GPUFreezeData* data)
+{
+	const GSVector4i full_area(0, 0, 1024, 512);
+
+	m_env.STATUS.u32 = data->status;
+
+	memcpy(m_status, data->control, sizeof(m_status));
+
+	m_mem.WriteRect(full_area, data->vram);
+
+	for(int cmd = 0; cmd <= 8; cmd++)
+	{
+		WriteStatus(m_status[cmd]);
+	}
+}
+
+// Fallback for status commands without a dedicated handler: only asserts.
+void GPUState::SCH_Null(GPUReg* r)
+{
+	ASSERT(0);
+}
+
+// Status command 0x00: performs a full Reset().
+void GPUState::SCH_ResetGPU(GPUReg* r)
+{
+	Reset();
+}
+
+// Status command 0x01: currently a no-op (not implemented).
+void GPUState::SCH_ResetCommandBuffer(GPUReg* r)
+{
+	// ?
+}
+
+// Status command 0x02: currently a no-op (not implemented).
+void GPUState::SCH_ResetIRQ(GPUReg* r)
+{
+	// ?
+}
+
+// Status command 0x03: copies the DEN field of the command into STATUS.
+void GPUState::SCH_DisplayEnable(GPUReg* r)
+{
+	m_env.STATUS.DEN = r->DEN.DEN;
+}
+
+// Status command 0x04: copies the DMA field of the command into STATUS.
+void GPUState::SCH_DMASetup(GPUReg* r)
+{
+	m_env.STATUS.DMA = r->DMA.DMA;
+}
+
+// Status command 0x05: stores the command's DAREA register in the environment.
+void GPUState::SCH_StartOfDisplayArea(GPUReg* r)
+{
+	m_env.DAREA = r->DAREA;
+}
+
+// Status command 0x06: stores the command's DHRANGE register in the environment.
+void GPUState::SCH_HorizontalDisplayRange(GPUReg* r)
+{
+	m_env.DHRANGE = r->DHRANGE;
+}
+
+// Status command 0x07: stores the command's DVRANGE register in the environment.
+void GPUState::SCH_VerticalDisplayRange(GPUReg* r)
+{
+	m_env.DVRANGE = r->DVRANGE;
+}
+
+// Status command 0x08: copies the display mode fields of the command into STATUS.
+void GPUState::SCH_DisplayMode(GPUReg* r)
+{
+	m_env.STATUS.WIDTH0 = r->DMODE.WIDTH0;
+	m_env.STATUS.HEIGHT = r->DMODE.HEIGHT;
+	m_env.STATUS.ISPAL = r->DMODE.ISPAL;
+	m_env.STATUS.ISRGB24 = r->DMODE.ISRGB24;
+	m_env.STATUS.ISINTER = r->DMODE.ISINTER;
+	m_env.STATUS.WIDTH1 = r->DMODE.WIDTH1;
+}
+
+// Status command 0x10: selects one 32-bit value according to GPUINFO.PARAM and
+// makes it the sole content of the read buffer, so the next ReadData() returns it.
+// Unknown parameters assert and answer with 0.
+void GPUState::SCH_GPUInfo(GPUReg* r)
+{
+	uint32 reply = 0;
+
+	switch(r->GPUINFO.PARAM)
+	{
+	case 0x0:
+	case 0x1:
+	case 0x3:
+		reply = m_env.DRAREATL.u32;
+		break;
+	case 0x2:
+		reply = m_env.TWIN.u32;
+		break;
+	case 0x4:
+		reply = m_env.DRAREABR.u32;
+		break;
+	case 0x5:
+	case 0x6:
+		reply = m_env.DROFF.u32;
+		break;
+	case 0x7:
+		reply = 2;
+		break;
+	case 0x8:
+	case 0xf:
+		reply = 0xBFC03720; // ?
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	// Replace whatever was queued for reading with this single word.
+	m_read.RemoveAll();
+	m_read.Append((uint8*)&reply, 4);
+	m_read.cur = 0;
+}
+
+// Packet type 0 handler.
+//
+// r    - first word of the packet inside the write buffer
+// size - number of 32-bit words available starting at r
+//
+// Returns the number of words consumed, or 0 if the packet is not complete yet.
+// Options 0 and 1 are accepted and ignored; option 2 fills a rectangle with the
+// packet's colour; anything else asserts and is skipped as a single word.
+int GPUState::PH_Command(GPUReg* r, int size)
+{
+	switch(r->PACKET.OPTION)
+	{
+	case 0: // ???
+
+		return 1;
+
+	case 1: // clear cache
+
+		return 1;
+
+	case 2: // fillrect
+	{
+		if(size < 3) return 0;
+
+		Flush();
+
+		GSVector4i r2;
+
+		// r[1] = top-left corner, r[2] = width/height
+		r2.left = r[1].XY.X;
+		r2.top = r[1].XY.Y;
+		r2.right = r2.left + r[2].XY.X;
+		r2.bottom = r2.top + r[2].XY.Y;
+
+		// Pack the 24-bit colour into 15 bits: blue in bits 10-14, green in bits 5-9,
+		// red in bits 0-4. All three channels must contribute; using red for every
+		// field turns any fill colour into a grey derived from red only.
+		// NOTE(review): relies on GPURegRGB exposing G and B next to R - confirm in GPU.h.
+		uint16 c = (uint16)(((r[0].RGB.B >> 3) << 10) | ((r[0].RGB.G >> 3) << 5) | (r[0].RGB.R >> 3));
+
+		m_mem.FillRect(r2, c);
+
+		Invalidate(r2);
+
+		Dump("f");
+
+		return 3;
+	}
+	}
+
+	ASSERT(0);
+
+	return 1;
+}
+
+// Packet type 1 handler: decodes a 3 or 4 vertex polygon and kicks it as one or
+// two triangles.
+//
+// Returns the number of words consumed, or 0 if fewer than the required number of
+// words are available. Triangles spanning 1024 or more horizontally, or 512 or
+// more vertically, are dropped.
+int GPUState::PH_Polygon(GPUReg* r, int size)
+{
+	const bool quad = r[0].POLYGON.VTX != 0;
+	const bool textured = r[0].POLYGON.TME != 0;
+	const bool gouraud = r[0].POLYGON.IIP != 0;
+
+	const int vertices = quad ? 4 : 3;
+
+	// header/colour word + one XY word per vertex,
+	// + one UV word per vertex when textured,
+	// + one extra colour word per additional vertex when gouraud shaded
+	int required = 1 + vertices;
+
+	if(textured) required += vertices;
+
+	if(gouraud) required += vertices - 1;
+
+	if(size < required) return 0;
+
+	//
+
+	SetPrim(r);
+
+	if(textured)
+	{
+		SetCLUT(&r[2]);
+
+		SetTPAGE(&r[gouraud ? 5 : 4]);
+	}
+
+	// Gather the vertices; `word` walks through the packet.
+
+	GPUVertex v[4];
+
+	int word = 0;
+
+	for(int n = 0; n < vertices; n++)
+	{
+		GPUVertex& dst = v[n];
+
+		// flat shading reuses the colour of the header word for every vertex
+		dst.RGB = r[gouraud ? word : 0].RGB;
+
+		if(n == 0 || gouraud) word++;
+
+		dst.XY = r[word++].XY;
+
+		if(textured)
+		{
+			dst.UV.X = r[word].UV.U;
+			dst.UV.Y = r[word].UV.V;
+
+			word++;
+		}
+	}
+
+	// A quad is emitted as the triangles (0,1,2) and (1,2,3).
+
+	for(int first = 0; first + 3 <= vertices; first++)
+	{
+		// TODO: sse
+
+		const GPUVertex* tri = &v[first];
+
+		const int y0 = tri[0].XY.Y;
+		const int y1 = tri[1].XY.Y;
+		const int y2 = tri[2].XY.Y;
+
+		const bool too_tall = std::abs(y0 - y1) >= 512 || std::abs(y0 - y2) >= 512 || std::abs(y1 - y2) >= 512;
+
+		if(too_tall) continue;
+
+		const int x0 = tri[0].XY.X;
+		const int x1 = tri[1].XY.X;
+		const int x2 = tri[2].XY.X;
+
+		const bool too_wide = std::abs(x0 - x1) >= 1024 || std::abs(x0 - x2) >= 1024 || std::abs(x1 - x2) >= 1024;
+
+		if(too_wide) continue;
+
+		//
+
+		for(int k = 0; k < 3; k++)
+		{
+			m_v = tri[k];
+
+			VertexKick();
+		}
+	}
+
+	//
+
+	return required;
+}
+
+// Packet type 2 handler: decodes a single line or a polyline and kicks its vertices.
+//
+// r    - first word of the packet inside the write buffer
+// size - number of 32-bit words available starting at r
+//
+// Returns the number of words consumed, or 0 if the packet is not complete yet
+// (for polylines: no terminator word seen so far).
+//
+// Word layout, as consumed by the vertex loop below:
+//   flat:    colour, xy0, xy1, ... [terminator]
+//   gouraud: colour0, xy0, colour1, xy1, ... [terminator]
+// A polyline is closed by a word matching 0x5xxx5xxx, which takes the place of
+// the next vertex (flat) or the next colour (gouraud).
+int GPUState::PH_Line(GPUReg* r, int size)
+{
+	const bool gouraud = r->LINE.IIP != 0;
+
+	int required = 1;
+
+	int vertices = 0;
+
+	if(r->LINE.PLL)
+	{
+		required++; // terminator word
+
+		// Only look where a terminator can legally appear (after at least two
+		// vertices), and stop at the first one so that words belonging to the
+		// packets queued behind this polyline are never mistaken for it.
+		const int first = gouraud ? 4 : 3;
+		const int step = gouraud ? 2 : 1;
+
+		for(int i = first; i < size; i += step)
+		{
+			if((r[i].u32 & 0xf000f000) == 0x50005000)
+			{
+				// gouraud polylines spend two words per vertex, flat ones a
+				// single word per vertex after the leading colour
+				vertices = gouraud ? i / 2 : i - 1;
+
+				break;
+			}
+		}
+
+		if(vertices < 2)
+		{
+			return 0;
+		}
+	}
+	else
+	{
+		vertices = 2;
+	}
+
+	required += vertices;
+
+	if(gouraud) required += vertices - 1;
+
+	// Same guard as the other packet handlers: wait until the whole packet is buffered.
+	if(size < required) return 0;
+
+	//
+
+	SetPrim(r);
+
+	//
+
+	for(int i = 0, j = 0; j < vertices; j++)
+	{
+		// from the third vertex on, repeat the previous end point so that
+		// every segment is kicked as a pair of vertices
+		if(j >= 2) VertexKick();
+
+		m_v.RGB = r[gouraud ? i : 0].RGB;
+
+		if(j == 0 || gouraud) i++;
+
+		m_v.XY = r[i++].XY;
+
+		VertexKick();
+	}
+
+	//
+
+	return required;
+}
+
+// Packet type 3 handler: decodes a sprite and kicks its two corner vertices.
+//
+// Returns the number of words consumed, or 0 if the packet is not complete yet.
+// SIZE selects a fixed 1x1, 8x8 or 16x16 sprite, or (SIZE == 0) an explicit
+// width/height word at the end of the packet.
+int GPUState::PH_Sprite(GPUReg* r, int size)
+{
+	const bool textured = r[0].SPRITE.TME != 0;
+	const bool explicit_size = r[0].SPRITE.SIZE == 0;
+
+	// colour + xy, optional uv, optional width/height
+	const int required = 2 + (textured ? 1 : 0) + (explicit_size ? 1 : 0);
+
+	if(size < required) return 0;
+
+	//
+
+	SetPrim(r);
+
+	if(textured)
+	{
+		SetCLUT(&r[2]);
+	}
+
+	// First corner.
+
+	m_v.RGB = r[0].RGB;
+
+	m_v.XY = r[1].XY;
+
+	if(textured)
+	{
+		m_v.UV.X = r[2].UV.U;
+		m_v.UV.Y = r[2].UV.V;
+	}
+
+	VertexKick();
+
+	// Opposite corner: the first one offset by the sprite extent.
+
+	const int size_word = textured ? 3 : 2;
+
+	int w = 0;
+	int h = 0;
+
+	switch(r[0].SPRITE.SIZE)
+	{
+	case 0:
+		w = r[size_word].XY.X;
+		h = r[size_word].XY.Y;
+		break;
+	case 1:
+		w = h = 1;
+		break;
+	case 2:
+		w = h = 8;
+		break;
+	case 3:
+		w = h = 16;
+		break;
+	default:
+		__assume(0);
+	}
+
+	m_v.XY.X += w;
+	m_v.XY.Y += h;
+
+	if(textured)
+	{
+		m_v.UV.X += w;
+		m_v.UV.Y += h;
+	}
+
+	VertexKick();
+
+	//
+
+	return required;
+}
+
+// Packet type 4 handler: copies a rectangle inside local memory.
+// Words: [1] source xy, [2] destination xy, [3] width/height.
+// Returns 4 (words consumed) or 0 if the packet is not complete yet.
+int GPUState::PH_Move(GPUReg* r, int size)
+{
+	const int words = 4;
+
+	if(size < words) return 0;
+
+	Flush();
+
+	const int src_x = r[1].XY.X;
+	const int src_y = r[1].XY.Y;
+
+	const int dst_x = r[2].XY.X;
+	const int dst_y = r[2].XY.Y;
+
+	const int width = r[3].XY.X;
+	const int height = r[3].XY.Y;
+
+	m_mem.MoveRect(src_x, src_y, dst_x, dst_y, width, height);
+
+	// only the destination area changed
+	Invalidate(GSVector4i(dst_x, dst_y, dst_x + width, dst_y + height));
+
+	return words;
+}
+
+// Packet type 5 handler: writes a rectangle of 16-bit pixels into local memory.
+// Words: [1] destination xy, [2] width/height, [3...] pixel data, two pixels per word.
+// Returns the number of words consumed, or 0 until header and payload are complete.
+int GPUState::PH_Write(GPUReg* r, int size)
+{
+	const int header_words = 3;
+
+	if(size < header_words) return 0;
+
+	const int w = r[2].XY.X;
+	const int h = r[2].XY.Y;
+
+	// payload is rounded up to a whole word
+	const int payload_words = (w * h + 1) >> 1;
+
+	const int required = header_words + payload_words;
+
+	if(size < required) return 0;
+
+	Flush();
+
+	GSVector4i dst;
+
+	dst.left = r[1].XY.X;
+	dst.top = r[1].XY.Y;
+	dst.right = dst.left + w;
+	dst.bottom = dst.top + h;
+
+	m_mem.WriteRect(dst, (const uint16*)&r[header_words]);
+
+	Invalidate(dst);
+
+	Dump("w");
+
+	m_perfmon.Put(GSPerfMon::Swizzle, w * h * 2);
+
+	return required;
+}
+
+// Packet type 6 handler: copies a rectangle of local memory into the read buffer
+// for later retrieval through ReadData().
+// Words: [1] source xy, [2] width/height. Returns 3, or 0 if incomplete.
+// STATUS.IMG is raised even when the rectangle is empty.
+int GPUState::PH_Read(GPUReg* r, int size)
+{
+	if(size < 3) return 0;
+
+	Flush();
+
+	const int w = r[2].XY.X;
+	const int h = r[2].XY.Y;
+
+	const bool has_pixels = w > 0 && h > 0;
+
+	if(has_pixels)
+	{
+		GSVector4i src;
+
+		src.left = r[1].XY.X;
+		src.top = r[1].XY.Y;
+		src.right = src.left + w;
+		src.bottom = src.top + h;
+
+		// pixel count rounded up to an even number, two bytes per pixel
+		m_read.bytes = ((w * h + 1) & ~1) * 2;
+		m_read.cur = 0;
+		m_read.Reserve(m_read.bytes);
+
+		m_mem.ReadRect(src, (uint16*)m_read.buff);
+
+		Dump("r");
+	}
+
+	m_env.STATUS.IMG = 1;
+
+	return 3;
+}
+
+// Packet type 7 handler: updates one drawing environment register, selected by
+// PACKET.OPTION. Pending primitives are flushed only when the relevant bits
+// actually change. Always consumes one word; unknown options assert.
+int GPUState::PH_Environment(GPUReg* r, int size)
+{
+	switch(r->PACKET.OPTION)
+	{
+	case 1: // draw mode setting
+
+		if(((r->MODE.u32 ^ m_env.STATUS.u32) & 0x7ff) != 0)
+		{
+			Flush();
+
+			m_env.STATUS.TX = r->MODE.TX;
+			m_env.STATUS.TY = r->MODE.TY;
+			m_env.STATUS.ABR = r->MODE.ABR;
+			m_env.STATUS.TP = r->MODE.TP;
+			m_env.STATUS.DTD = r->MODE.DTD;
+			m_env.STATUS.DFE = r->MODE.DFE;
+		}
+
+		break;
+
+	case 2: // texture window setting
+
+		if(((r->TWIN.u32 ^ m_env.TWIN.u32) & 0xfffff) != 0)
+		{
+			Flush();
+
+			m_env.TWIN = r->TWIN;
+		}
+
+		break;
+
+	case 3: // set drawing area top left
+
+		if(((r->DRAREA.u32 ^ m_env.DRAREATL.u32) & 0xfffff) != 0)
+		{
+			Flush();
+
+			m_env.DRAREATL = r->DRAREA;
+		}
+
+		break;
+
+	case 4: // set drawing area bottom right
+
+		if(((r->DRAREA.u32 ^ m_env.DRAREABR.u32) & 0xfffff) != 0)
+		{
+			Flush();
+
+			m_env.DRAREABR = r->DRAREA;
+		}
+
+		break;
+
+	case 5: // drawing offset
+
+		if(((r->DROFF.u32 ^ m_env.DROFF.u32) & 0x3fffff) != 0)
+		{
+			Flush();
+
+			m_env.DROFF = r->DROFF;
+		}
+
+		break;
+
+	case 6: // mask setting
+
+		if(m_env.STATUS.MD != r->MASK.MD || m_env.STATUS.ME != r->MASK.ME)
+		{
+			Flush();
+
+			m_env.STATUS.MD = r->MASK.MD;
+			m_env.STATUS.ME = r->MASK.ME;
+		}
+
+		break;
+
+	default: // unknown option, skipped as a single word
+
+		ASSERT(0);
+
+		break;
+	}
+
+	return 1;
+}
+
+//
+
+// Starts empty with a 4096 byte, 32-byte aligned allocation.
+// The initializers follow the member declaration order (bytes, maxbytes, buff, cur).
+GPUState::Buffer::Buffer()
+	: bytes(0)
+	, maxbytes(4096)
+	, buff((uint8*)_aligned_malloc(4096, 32))
+	, cur(0)
+{
+}
+
+// Releases the aligned allocation owned by this buffer.
+GPUState::Buffer::~Buffer()
+{
+	_aligned_free(buff);
+}
+
+// Makes sure at least `size` bytes of storage exist. When growing, the new
+// capacity is the old capacity plus `size`, rounded up to a multiple of 1024,
+// and the previous contents are carried over.
+void GPUState::Buffer::Reserve(int size)
+{
+	if(size <= maxbytes)
+	{
+		return;
+	}
+
+	const int grown = (maxbytes + size + 1023) & ~1023;
+
+	uint8* replacement = (uint8*)_aligned_malloc(grown, 32);
+
+	if(buff != NULL)
+	{
+		memcpy(replacement, buff, maxbytes);
+		_aligned_free(buff);
+	}
+
+	buff = replacement;
+	maxbytes = grown;
+}
+
+// Copies `size` bytes from `src` to the end of the buffer, growing it if needed.
+void GPUState::Buffer::Append(const uint8* src, int size)
+{
+	const int total = bytes + (int)size;
+
+	Reserve(total);
+
+	uint8* tail = &buff[bytes];
+
+	memcpy(tail, src, size);
+
+	bytes += size;
+}
+
+// Discards the first `size` bytes and moves the remainder to the front.
+// Removing everything (or more, which asserts) simply empties the buffer.
+// Debug builds fill the unused part of the storage with 0xff.
+void GPUState::Buffer::Remove(int size)
+{
+	ASSERT(size <= bytes);
+
+	const int kept = size < bytes ? bytes - size : 0;
+
+	if(kept > 0)
+	{
+		memmove(&buff[0], &buff[size], kept);
+	}
+
+	bytes = kept;
+
+	#ifdef DEBUG
+	memset(&buff[bytes], 0xff, maxbytes - bytes);
+	#endif
+}
+
+// Empties the buffer; the allocation (and `cur`) are left as they are.
+void GPUState::Buffer::RemoveAll()
+{
+	bytes = 0;
+}
diff --git a/plugins/GSdx_legacy/GPUState.h b/plugins/GSdx_legacy/GPUState.h
new file mode 100644
index 0000000000..c2aeb287e7
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUState.h
@@ -0,0 +1,143 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPU.h"
+#include "GPUDrawingEnvironment.h"
+#include "GPULocalMemory.h"
+#include "GPUVertex.h"
+#include "GSAlignedClass.h"
+#include "GSUtil.h"
+#include "GSPerfMon.h"
+
+// Command front-end of the PSX GPU emulation: buffers incoming data words, decodes
+// them into packets, keeps the drawing environment up to date and hands vertices to
+// the renderer subclass through the pure virtual FlushPrim/ResetPrim/VertexKick.
+class GPUState : public GSAlignedClass<32>
+{
+	// Handler for a status-port command; dispatched on the top byte of the written word.
+	typedef void (GPUState::*GPUStatusCommandHandler)(GPUReg* r);
+
+	GPUStatusCommandHandler m_fpGPUStatusCommandHandlers[256];
+
+	void SCH_Null(GPUReg* r);
+	void SCH_ResetGPU(GPUReg* r);
+	void SCH_ResetCommandBuffer(GPUReg* r);
+	void SCH_ResetIRQ(GPUReg* r);
+	void SCH_DisplayEnable(GPUReg* r);
+	void SCH_DMASetup(GPUReg* r);
+	void SCH_StartOfDisplayArea(GPUReg* r);
+	void SCH_HorizontalDisplayRange(GPUReg* r);
+	void SCH_VerticalDisplayRange(GPUReg* r);
+	void SCH_DisplayMode(GPUReg* r);
+	void SCH_GPUInfo(GPUReg* r);
+
+	// Handler for a data packet; dispatched on PACKET.TYPE. `size` is the number of
+	// words available; the return value is the number of words consumed (0 = need more).
+	typedef int (GPUState::*GPUPacketHandler)(GPUReg* r, int size);
+
+	GPUPacketHandler m_fpGPUPacketHandler[8];
+
+	int PH_Command(GPUReg* r, int size);
+	int PH_Polygon(GPUReg* r, int size);
+	int PH_Line(GPUReg* r, int size);
+	int PH_Sprite(GPUReg* r, int size);
+	int PH_Move(GPUReg* r, int size);
+	int PH_Write(GPUReg* r, int size);
+	int PH_Read(GPUReg* r, int size);
+	int PH_Environment(GPUReg* r, int size);
+
+	// Growable, 32-byte aligned byte buffer.
+	class Buffer
+	{
+	public:
+		int bytes;    // number of valid bytes
+		int maxbytes; // allocated capacity
+		uint8* buff;  // storage, owned by the buffer
+		int cur;      // read cursor (used by ReadData on m_read)
+
+	public:
+		Buffer();
+		~Buffer();
+		void Reserve(int size);
+		void Append(const uint8* src, int size);
+		void Remove(int size);
+		void RemoveAll();
+	};
+
+	Buffer m_write; // words received through WriteData, not yet executed
+	Buffer m_read;  // data waiting to be fetched through ReadData
+
+	void SetPrim(GPUReg* r);
+	void SetCLUT(GPUReg* r);
+	void SetTPAGE(GPUReg* r);
+
+protected:
+
+	// Running index used in the dump file names.
+	int s_n;
+
+	// Debug helper that saves the rectangle `r` of local memory as a BMP.
+	// NOTE: with all the guarding conditions commented out, the `return` below is
+	// unconditional, so the rest of the function is currently unreachable.
+	void Dump(const string& s, uint32 TP, const GSVector4i& r, int inc = true)
+	{
+		//if(m_perfmon.GetFrame() < 1000)
+		//if((m_env.TWIN.u32 & 0xfffff) == 0)
+		//if(!m_env.STATUS.ME && !m_env.STATUS.MD)
+			return;
+
+		if(inc) s_n++;
+
+		//if(s_n < 86) return;
+
+		int dir = 1;
+#ifdef DEBUG
+		dir = 2;
+#endif
+        string path = format("c:\\temp%d\\%04d_%s.bmp", dir, s_n, s.c_str());
+
+		m_mem.SaveBMP(path, r, TP, m_env.CLUT.X, m_env.CLUT.Y);
+	}
+
+	// Convenience overload: dumps the whole 1024x512 area with TP = 2.
+	void Dump(const string& s, int inc = true)
+	{
+		Dump(s, 2, GSVector4i(0, 0, 1024, 512), inc);
+	}
+
+public:
+	GPUDrawingEnvironment m_env; // current register state
+	GPULocalMemory m_mem;        // local (video) memory
+	GPUVertex m_v;               // vertex being assembled by the packet handlers
+	GSPerfMon m_perfmon;
+	uint32 m_status[256];        // last word written for each status command
+
+public:
+	GPUState();
+	virtual ~GPUState();
+
+	virtual void Reset();
+	virtual void Flush();
+	virtual void FlushPrim() = 0;
+	virtual void ResetPrim() = 0;
+	virtual void VertexKick() = 0;
+	virtual void Invalidate(const GSVector4i& r);
+
+	// `size` is expressed in 32-bit words for both calls.
+	void WriteData(const uint8* mem, uint32 size);
+	void ReadData(uint8* mem, uint32 size);
+
+	void WriteStatus(uint32 status);
+	uint32 ReadStatus();
+
+	// Save / restore the state.
+	void Freeze(GPUFreezeData* data);
+	void Defrost(const GPUFreezeData* data);
+};
+
diff --git a/plugins/GSdx_legacy/GPUVertex.h b/plugins/GSdx_legacy/GPUVertex.h
new file mode 100644
index 0000000000..05455a4c42
--- /dev/null
+++ b/plugins/GSdx_legacy/GPUVertex.h
@@ -0,0 +1,51 @@
+/* 
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA. 
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GPU.h"
+#include "GSVector.h"
+
+#pragma pack(push, 1)
+
+// Vertex as assembled by the GPU packet handlers. The register view (RGB, XY, UV)
+// shares storage with a single SSE register view (m128i / m128).
+// NOTE(review): __aligned(struct, 32) presumably requests 32-byte alignment - confirm the macro.
+__aligned(struct, 32) GPUVertex
+{
+	union
+	{
+		struct
+		{
+			GPURegRGB RGB;
+			GPURegXY XY;
+			GPURegXY UV;
+		};
+
+		struct {__m128i m128i;};
+		struct {__m128 m128;};
+	};
+
+	// Zero-fills the whole object.
+	GPUVertex() {memset(this, 0, sizeof(*this));}
+};
+
+// Empty vertex type (no members).
+struct GPUVertexNull 
+{
+};
+
+#pragma pack(pop)
diff --git a/plugins/GSdx_legacy/GS.cpp b/plugins/GSdx_legacy/GS.cpp
new file mode 100644
index 0000000000..0eecf951b8
--- /dev/null
+++ b/plugins/GSdx_legacy/GS.cpp
@@ -0,0 +1,1768 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSUtil.h"
+#include "GSRendererSW.h"
+#include "GSRendererNull.h"
+#include "GSDeviceNull.h"
+#include "GSDeviceOGL.h"
+#include "GSRendererOGL.h"
+#include "GSRendererCL.h"
+
+#ifdef _WIN32
+
+#include "GSRendererDX9.h"
+#include "GSRendererDX11.h"
+#include "GSDevice9.h"
+#include "GSDevice11.h"
+#include "GSWndDX.h"
+#include "GSWndWGL.h"
+#include "GSRendererCS.h"
+#include "GSSettingsDlg.h"
+
+static HRESULT s_hr = E_FAIL;
+
+#else
+
+#include "GSWndOGL.h"
+#include "GSWndEGL.h"
+
+#include <gtk/gtk.h>
+#include <gdk/gdkx.h>
+
+extern bool RunLinuxDialog();
+
+#endif
+
+#define PS2E_LT_GS 0x01
+#define PS2E_GS_VERSION 0x0006
+#define PS2E_X86 0x01   // 32 bit
+#define PS2E_X86_64 0x02   // 64 bit
+
+static GSRenderer* s_gs = NULL;
+static void (*s_irq)() = NULL;
+static uint8* s_basemem = NULL;
+static GSRendererType s_renderer = GSRendererType::Undefined;
+static bool s_framelimit = true;
+static bool s_vsync = false;
+static bool s_exclusive = true;
+static const char *s_renderer_name = "";
+static const char *s_renderer_type = "";
+bool gsopen_done = false; // crash guard for GSgetTitleInfo2 and GSKeyEvent (replace with lock?)
+
+// Plugin API: reports the library type, PS2E_LT_GS.
+EXPORT_C_(uint32) PS2EgetLibType()
+{
+	return PS2E_LT_GS;
+}
+
+// Plugin API: returns the library name provided by GSUtil::GetLibName().
+EXPORT_C_(const char*) PS2EgetLibName()
+{
+	return GSUtil::GetLibName();
+}
+
+// Plugin API: returns the packed version word -
+// build in bits 0-7, revision in bits 8-15, PS2E_GS_VERSION from bit 16 and
+// PLUGIN_VERSION from bit 24. The `type` argument is not used.
+EXPORT_C_(uint32) PS2EgetLibVersion2(uint32 type)
+{
+	const uint32 build = 0;
+	const uint32 revision = 1;
+
+	uint32 version = build << 0;
+
+	version |= revision << 8;
+	version |= PS2E_GS_VERSION << 16;
+	version |= PLUGIN_VERSION << 24;
+
+	return version;
+}
+
+// Plugin API: reports the CPU platform the plugin was compiled for -
+// PS2E_X86_64 when _M_AMD64 is defined, PS2E_X86 otherwise.
+EXPORT_C_(uint32) PS2EgetCpuPlatform()
+{
+#ifdef _M_AMD64
+
+	return PS2E_X86_64;
+
+#else
+
+	return PS2E_X86;
+
+#endif
+}
+
+// Plugin API: remembers the register memory base and, if a renderer already
+// exists, passes it on immediately (otherwise _GSopen applies it later).
+EXPORT_C GSsetBaseMem(uint8* mem)
+{
+	s_basemem = mem;
+
+	if(s_gs != NULL)
+	{
+		s_gs->SetRegsMem(mem);
+	}
+}
+
+// Plugin API: forwards the configuration directory to the application object.
+EXPORT_C GSsetSettingsDir(const char* dir)
+{
+	theApp.SetConfigDir(dir);
+}
+
+// Plugin API: one-time initialisation. Returns 0 on success, -1 when the
+// GSUtil::CheckSSE() check fails or (Windows) the D3D compiler cannot be loaded.
+EXPORT_C_(int) GSinit()
+{
+	if(!GSUtil::CheckSSE())
+	{
+		return -1;
+	}
+
+#ifdef _WIN32
+
+	// The result is kept in s_hr so that GSshutdown() only calls CoUninitialize()
+	// when this call succeeded.
+	s_hr = ::CoInitializeEx(NULL, COINIT_MULTITHREADED);
+
+	if (!GSDeviceDX::LoadD3DCompiler())
+	{
+		return -1;
+	}
+#endif
+
+	return 0;
+}
+
+// Plugin API: destroys the renderer and undoes what GSinit() set up.
+EXPORT_C GSshutdown()
+{
+	gsopen_done = false;
+
+	delete s_gs;
+
+	s_gs = NULL;
+
+	// Forget the renderer type so that the next GSopen2() re-reads it from the configuration.
+	s_renderer = GSRendererType::Undefined;
+
+#ifdef _WIN32
+
+	// Balance the CoInitializeEx() from GSinit(), but only if it succeeded.
+	if(SUCCEEDED(s_hr))
+	{
+		::CoUninitialize();
+
+		s_hr = E_FAIL;
+	}
+
+	GSDeviceDX::FreeD3DCompiler();
+
+#endif
+}
+
+// Plugin API: tears down the device and detaches the window while keeping the
+// renderer object (s_gs) alive for a later GSopen. Does nothing without a renderer.
+EXPORT_C GSclose()
+{
+	gsopen_done = false;
+
+	if(s_gs != NULL)
+	{
+		s_gs->ResetDevice();
+
+		// OpenGL requirement: the device must be destroyed before the
+		// context is detached from the window.
+		delete s_gs->m_dev;
+
+		s_gs->m_dev = NULL;
+
+		if(s_gs->m_wnd != NULL)
+		{
+			s_gs->m_wnd->Detach();
+		}
+	}
+}
+
+// Shared implementation behind GSopen() and GSopen2().
+//
+// dsp      - in/out display handle. If *dsp is NULL the plugin creates and shows its
+//            own window and stores its display handle in *dsp; otherwise the window
+//            is attached to the handle supplied by the emulator.
+// title    - title for a plugin-created window.
+// renderer - requested renderer; Undefined means "read it from the configuration".
+// threads  - extra software rendering threads; -1 means "read it from the configuration".
+//
+// Returns 0 on success and -1 on failure.
+//
+// Ownership: the device allocated here is handed over by CreateDevice(dev). On every
+// failure path before that call nobody else knows about it, so it is deleted here
+// instead of being leaked.
+static int _GSopen(void** dsp, const char* title, GSRendererType renderer, int threads = -1)
+{
+	GSDevice* dev = NULL;
+
+	if(renderer == GSRendererType::Undefined)
+	{
+		renderer = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(GSRendererType::Default)));
+	}
+
+	if(threads == -1)
+	{
+		threads = theApp.GetConfig("extrathreads", DEFAULT_EXTRA_RENDERING_THREADS);
+	}
+
+	// Candidate windows (Linux only): tried in order, the first one that works is kept.
+	GSWnd* wnd[2] = { NULL, NULL };
+
+	try
+	{
+		if (s_renderer != renderer)
+		{
+			// Emulator has made a render change request, which requires a completely
+			// new s_gs -- if the emu doesn't save/restore the GS state across this
+			// GSopen call then they'll get corrupted graphics, but that's not my problem.
+
+			delete s_gs;
+
+			s_gs = NULL;
+		}
+
+		const char* renderer_fullname = "";
+		const char* renderer_mode = "";
+
+		switch (renderer)
+		{
+		case GSRendererType::DX9_SW:
+		case GSRendererType::DX1011_SW:
+		case GSRendererType::Null_SW:
+		case GSRendererType::OGL_SW:
+			renderer_mode = "(Software mode)";
+			break;
+		case GSRendererType::DX9_Null:
+		case GSRendererType::DX1011_Null:
+		case GSRendererType::Null_Null:
+			renderer_mode = "(Null mode)";
+			break;
+		case GSRendererType::DX9_OpenCL:
+		case GSRendererType::DX1011_OpenCL:
+		case GSRendererType::Null_OpenCL:
+		case GSRendererType::OGL_OpenCL:
+			renderer_mode = "(OpenCL)";
+			break;
+		default:
+			renderer_mode = "(Hardware mode)";
+			break;
+		}
+
+		// Pick the device (API back-end). Unknown values fall through to the first
+		// case that is compiled in: D3D9 on Windows, the null device elsewhere.
+		switch (renderer)
+		{
+		default:
+#ifdef _WIN32
+		case GSRendererType::DX9_HW:
+		case GSRendererType::DX9_SW:
+		case GSRendererType::DX9_Null:
+		case GSRendererType::DX9_OpenCL:
+			dev = new GSDevice9();
+			s_renderer_name = " D3D9";
+			renderer_fullname = "Direct3D9";
+			break;
+		case GSRendererType::DX1011_HW:
+		case GSRendererType::DX1011_SW:
+		case GSRendererType::DX1011_Null:
+		case GSRendererType::DX1011_OpenCL:
+			dev = new GSDevice11();
+			s_renderer_name = " D3D11";
+			renderer_fullname = "Direct3D11";
+			break;
+#endif
+		case GSRendererType::Null_HW:
+		case GSRendererType::Null_SW:
+		case GSRendererType::Null_Null:
+		case GSRendererType::Null_OpenCL:
+			dev = new GSDeviceNull();
+			s_renderer_name = " Null";
+			renderer_fullname = "Null";
+			break;
+		case GSRendererType::OGL_HW:
+		case GSRendererType::OGL_SW:
+		case GSRendererType::OGL_OpenCL:
+			dev = new GSDeviceOGL();
+			s_renderer_name = " OGL";
+			renderer_fullname = "OpenGL";
+			break;
+		}
+
+		printf("Current Renderer: %s %s\n", renderer_fullname, renderer_mode);
+
+		if (dev == NULL)
+		{
+			return -1;
+		}
+
+		if (s_gs == NULL)
+		{
+			// Pick the renderer. Unknown values fall through to the first case
+			// that is compiled in: DX9 HW on Windows, OpenGL HW elsewhere.
+			switch (renderer)
+			{
+			default:
+#ifdef _WIN32
+			case GSRendererType::DX9_HW:
+				s_gs = (GSRenderer*)new GSRendererDX9();
+				s_renderer_type = " HW";
+				break;
+			case GSRendererType::DX1011_HW:
+				s_gs = (GSRenderer*)new GSRendererDX11();
+				s_renderer_type = " HW";
+				break;
+#endif
+			case GSRendererType::OGL_HW:
+				s_gs = (GSRenderer*)new GSRendererOGL();
+				s_renderer_type = " HW";
+				break;
+			case GSRendererType::DX9_SW:
+			case GSRendererType::DX1011_SW:
+			case GSRendererType::Null_SW:
+			case GSRendererType::OGL_SW:
+				s_gs = new GSRendererSW(threads);
+				s_renderer_type = " SW";
+				break;
+			case GSRendererType::DX9_Null:
+			case GSRendererType::DX1011_Null:
+			case GSRendererType::Null_Null:
+				s_gs = new GSRendererNull();
+				s_renderer_type = " Null";
+				break;
+			case GSRendererType::DX9_OpenCL:
+			case GSRendererType::DX1011_OpenCL:
+			case GSRendererType::Null_OpenCL:
+			case GSRendererType::OGL_OpenCL:
+#ifdef ENABLE_OPENCL
+				s_gs = new GSRendererCL();
+				s_renderer_type = " OCL";
+#else
+				printf("GSdx error: OpenCL is disabled\n");
+#endif
+				break;
+			}
+			if (s_gs == NULL)
+			{
+				// no renderer will ever take ownership of the device
+				delete dev;
+
+				return -1;
+			}
+
+			s_renderer = renderer;
+		}
+
+		if (s_gs->m_wnd == NULL)
+		{
+#ifdef _WIN32
+			switch (renderer)
+			{
+			case GSRendererType::OGL_HW:
+			case GSRendererType::OGL_SW:
+			case GSRendererType::OGL_OpenCL:
+				s_gs->m_wnd = new GSWndWGL();
+				break;
+			default:
+				s_gs->m_wnd = new GSWndDX();
+				break;
+			}
+#else
+#ifdef EGL_SUPPORTED
+			wnd[0] = new GSWndEGL();
+			wnd[1] = new GSWndOGL();
+#else
+			wnd[0] = new GSWndOGL();
+#endif
+#endif
+		}
+	}
+	catch (std::exception& ex)
+	{
+		// Allowing std exceptions to escape the scope of the plugin callstack could
+		// be problematic, because of differing typeids between DLL and EXE compilations.
+		// ('new' could throw std::bad_alloc)
+
+		printf("GSdx error: Exception caught in GSopen: %s", ex.what());
+
+		// dev is still NULL if the exception came from its own allocation
+		delete dev;
+
+		return -1;
+	}
+
+	s_gs->SetRegsMem(s_basemem);
+	s_gs->SetIrqCallback(s_irq);
+	s_gs->SetVSync(s_vsync);
+	s_gs->SetFrameLimit(s_framelimit);
+
+	if(*dsp == NULL)
+	{
+		// old-style API expects us to create and manage our own window:
+
+		int w = theApp.GetConfig("ModeWidth", 0);
+		int h = theApp.GetConfig("ModeHeight", 0);
+
+#ifdef __linux__
+		for(uint32 i = 0; i < 2; i++) {
+			try
+			{
+				if (wnd[i] == NULL) continue;
+
+				wnd[i]->Create(title, w, h);
+				s_gs->m_wnd = wnd[i];
+
+				// the fallback candidate is not needed anymore
+				if (i == 0) delete wnd[1];
+
+				break;
+			}
+			catch (GSDXRecoverableError)
+			{
+				wnd[i]->Detach();
+				delete wnd[i];
+			}
+		}
+		if (s_gs->m_wnd == NULL)
+		{
+			GSclose();
+
+			delete dev;
+
+			return -1;
+		}
+#endif
+#ifdef _WIN32
+		if(!s_gs->CreateWnd(title, w, h))
+		{
+			GSclose();
+
+			delete dev;
+
+			return -1;
+		}
+#endif
+
+		s_gs->m_wnd->Show();
+
+		*dsp = s_gs->m_wnd->GetDisplay();
+	}
+	else
+	{
+		s_gs->SetMultithreaded(true);
+
+#ifdef __linux__
+		if (s_gs->m_wnd) {
+			// A window was already attached to s_gs so we also
+			// need to restore the window state (Attach)
+			s_gs->m_wnd->Attach((void*)((uptr*)(dsp)+1), false);
+		} else {
+			// No window found: try the candidates in order. With EGL support
+			// compiled in that is the EGL window first and the GLX window as
+			// fallback; without it only the GLX window exists.
+			for(uint32 i = 0; i < 2; i++) {
+				try
+				{
+					if (wnd[i] == NULL) continue;
+
+					wnd[i]->Attach((void*)((uptr*)(dsp)+1), false);
+					s_gs->m_wnd = wnd[i];
+
+					if (i == 0) delete wnd[1];
+
+					break;
+				}
+				catch (GSDXRecoverableError)
+				{
+					wnd[i]->Detach();
+					delete wnd[i];
+				}
+			}
+		}
+#endif
+#ifdef _WIN32
+		try
+		{
+			s_gs->m_wnd->Attach(*dsp, false);
+		}
+		catch (GSDXRecoverableError)
+		{
+			s_gs->m_wnd->Detach();
+			delete s_gs->m_wnd;
+			s_gs->m_wnd = NULL;
+		}
+#endif
+		if (s_gs->m_wnd == NULL)
+		{
+			delete dev;
+
+			return -1;
+		}
+	}
+
+	// NOTE(review): from here on the device is assumed to belong to the renderer,
+	// even when CreateDevice() fails - confirm in GSRenderer::CreateDevice.
+	if(!s_gs->CreateDevice(dev))
+	{
+		// This probably means the user has DX11 configured with a video card that is only DX9
+		// compliant.  Could mean driver issues of some sort also, but to be sure, that's the most
+		// common cause of device creation errors. :)  --air
+
+		GSclose();
+
+		return -1;
+	}
+
+	if (renderer == GSRendererType::OGL_HW && theApp.GetConfig("debug_glsl_shader", 0) == 2) {
+		printf("GSdx: test OpenGL shader. Please wait...\n\n");
+		static_cast<GSDeviceOGL*>(s_gs->m_dev)->SelfShaderTest();
+		printf("\nGSdx: test OpenGL shader done. It will now exit\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+// Plugin API (new style): opens the GS on a window owned by the emulator.
+//
+// dsp   - display handle, forwarded to _GSopen().
+// flags - bit 2 (value 4) is a toggle; whenever its state differs from the one seen
+//         on the previous call the renderer is switched between its HW and SW flavour.
+//
+// Returns the result of _GSopen().
+EXPORT_C_(int) GSopen2(void** dsp, uint32 flags)
+{
+	// Toggle state seen on the previous call (kept across calls).
+	static bool stored_toggle_state = false;
+	bool toggle_state = !!(flags & 4);
+
+	GSRendererType renderer = s_renderer;
+	// Fresh start up or config file changed
+	if (renderer == GSRendererType::Undefined)
+	{
+#ifdef _WIN32
+		GSRendererType default_renderer = GSUtil::CheckDirect3D11Level() >= D3D_FEATURE_LEVEL_10_0 ? GSRendererType::DX1011_HW : GSRendererType::DX9_HW;
+#else
+		GSRendererType default_renderer = GSRendererType::Default;
+#endif
+		renderer = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(default_renderer)));
+	}
+	else if (stored_toggle_state != toggle_state)
+	{
+#ifdef _WIN32
+		GSRendererType best_sw_renderer = GSUtil::CheckDirect3D11Level() >= D3D_FEATURE_LEVEL_10_0 ? GSRendererType::DX1011_SW : GSRendererType::DX9_SW;
+
+
+		switch (renderer) {
+			// Use alternative renderer (SW if currently using HW renderer, and vice versa, keeping the same API and API version)
+		case GSRendererType::DX9_SW: renderer = GSRendererType::DX9_HW; break;
+		case GSRendererType::DX9_HW: renderer = GSRendererType::DX9_SW; break;
+		case GSRendererType::DX1011_SW: renderer = GSRendererType::DX1011_HW; break;
+		case GSRendererType::DX1011_HW: renderer = GSRendererType::DX1011_SW; break;
+		case GSRendererType::OGL_SW: renderer = GSRendererType::OGL_HW; break;
+		case GSRendererType::OGL_HW: renderer = GSRendererType::OGL_SW; break;
+		default: renderer = best_sw_renderer; break;// If wasn't using one of the above mentioned ones, use best SW renderer.
+
+		}
+
+#endif
+#ifdef __linux__
+		switch(renderer) {
+			// Use alternative renderer (SW if currently using HW renderer, and vice versa)
+		case GSRendererType::OGL_SW: renderer = GSRendererType::OGL_HW; break;
+		case GSRendererType::OGL_HW: renderer = GSRendererType::OGL_SW; break;
+		default: renderer = GSRendererType::OGL_SW; break; // fallback to OGL SW
+		}
+#endif
+	}
+	stored_toggle_state = toggle_state;
+
+	// The window title is unused here: the emulator owns the window.
+	int retval = _GSopen(dsp, "", renderer);
+
+	if (s_gs != NULL)
+		s_gs->SetAspectRatio(0);	 // PCSX2 manages the aspect ratios
+
+	// NOTE(review): the guard is raised even when _GSopen() failed - confirm that
+	// its users cope with a missing renderer/device in that case.
+	gsopen_done = true;
+
+	return retval;
+}
+
+// Legacy open entry point.
+//
+// dsp   - *dsp is reset to NULL, then dsp is handed to _GSopen().
+// title - forwarded to _GSopen().
+// mt    - 2 means "switch renderer request" (then treated as 1); after that
+//         any non-zero value enables multithreading on the created renderer.
+//
+// Returns the value returned by _GSopen().
+EXPORT_C_(int) GSopen(void** dsp, const char* title, int mt)
+{
+	// Legacy GUI expects to acquire vsync from the configuration files.
+	s_vsync = !!theApp.GetConfig("vsync", 0);
+
+	GSRendererType renderer = GSRendererType::Default;
+
+	if(mt != 2)
+	{
+		// normal init
+		renderer = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(GSRendererType::Default)));
+	}
+	else
+	{
+		// pcsx2 sent a switch renderer request
+#ifdef _WIN32
+		if(GSUtil::CheckDirect3D11Level() >= D3D_FEATURE_LEVEL_10_0)
+		{
+			renderer = GSRendererType::DX1011_SW;
+		}
+		else
+		{
+			renderer = GSRendererType::DX9_SW;
+		}
+#endif
+		mt = 1;
+	}
+
+	*dsp = NULL;
+
+	const int retval = _GSopen(dsp, title, renderer);
+
+	if(retval == 0 && s_gs)
+	{
+		s_gs->SetMultithreaded(mt != 0);
+	}
+
+	gsopen_done = true;
+
+	return retval;
+}
+
+// Forwards to s_gs->Reset(). A GSDXRecoverableError raised by the call is
+// swallowed on purpose so that it never crosses the plugin boundary.
+EXPORT_C GSreset()
+{
+	try
+	{
+		s_gs->Reset();
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards `mask` to s_gs->SoftReset(). A GSDXRecoverableError raised by the
+// call is swallowed on purpose.
+EXPORT_C GSgifSoftReset(uint32 mask)
+{
+	try
+	{
+		s_gs->SoftReset(mask);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards the 32-bit `csr` value to s_gs->WriteCSR(). A GSDXRecoverableError
+// raised by the call is swallowed on purpose.
+EXPORT_C GSwriteCSR(uint32 csr)
+{
+	try
+	{
+		s_gs->WriteCSR(csr);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Single-unit variant of GSinitReadFIFO2: calls s_gs->InitReadFIFO(mem, 1).
+// A GSDXRecoverableError raised by the call is swallowed on purpose.
+EXPORT_C GSinitReadFIFO(uint8* mem)
+{
+	GL_PERF("Init Read FIFO1");
+	try
+	{
+		s_gs->InitReadFIFO(mem, 1);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Single-unit variant of GSreadFIFO2: calls s_gs->ReadFIFO(mem, 1).
+// A GSDXRecoverableError raised by the call is swallowed on purpose.
+EXPORT_C GSreadFIFO(uint8* mem)
+{
+	try
+	{
+		s_gs->ReadFIFO(mem, 1);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards to s_gs->InitReadFIFO(mem, size). A GSDXRecoverableError raised by
+// the call is swallowed on purpose.
+EXPORT_C GSinitReadFIFO2(uint8* mem, uint32 size)
+{
+	GL_PERF("Init Read FIFO2");
+	try
+	{
+		s_gs->InitReadFIFO(mem, size);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards to s_gs->ReadFIFO(mem, size). A GSDXRecoverableError raised by the
+// call is swallowed on purpose.
+EXPORT_C GSreadFIFO2(uint8* mem, uint32 size)
+{
+	try
+	{
+		s_gs->ReadFIFO(mem, size);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards (mem, size) to s_gs->Transfer<3>(). A GSDXRecoverableError raised
+// by the call is swallowed on purpose.
+EXPORT_C GSgifTransfer(const uint8* mem, uint32 size)
+{
+	try
+	{
+		s_gs->Transfer<3>(mem, size);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards to s_gs->Transfer<0>() starting at mem + addr, with a size of
+// (0x4000 - addr) / 16, i.e. the data is expected to run up to the end of a
+// 0x4000-byte buffer. A GSDXRecoverableError is swallowed on purpose.
+// NOTE(review): addr > 0x4000 would wrap the unsigned size computation -
+// callers are presumably required to keep addr within the buffer.
+EXPORT_C GSgifTransfer1(uint8* mem, uint32 addr)
+{
+	try
+	{
+		// mem is already non-const, so no const_cast is needed
+		s_gs->Transfer<0>(mem + addr, (0x4000 - addr) / 16);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards (mem, size) to s_gs->Transfer<1>(). A GSDXRecoverableError raised
+// by the call is swallowed on purpose.
+EXPORT_C GSgifTransfer2(uint8* mem, uint32 size)
+{
+	try
+	{
+		// mem is already non-const, so no const_cast is needed
+		s_gs->Transfer<1>(mem, size);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Forwards (mem, size) to s_gs->Transfer<2>(). A GSDXRecoverableError raised
+// by the call is swallowed on purpose.
+EXPORT_C GSgifTransfer3(uint8* mem, uint32 size)
+{
+	try
+	{
+		// mem is already non-const, so no const_cast is needed
+		s_gs->Transfer<2>(mem, size);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Vsync entry point: forwards `field` to s_gs->VSync().
+//
+// On Windows, when the window reports IsManaged(), the thread's pending
+// window messages are dispatched first (until the queue is empty or WM_QUIT
+// is seen). A GSDXRecoverableError is swallowed on purpose.
+EXPORT_C GSvsync(int field)
+{
+	try
+	{
+#ifdef _WIN32
+
+		if(s_gs->m_wnd->IsManaged())
+		{
+			MSG msg;
+
+			memset(&msg, 0, sizeof(msg));
+
+			while(msg.message != WM_QUIT && PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))
+			{
+				TranslateMessage(&msg);
+				DispatchMessage(&msg);
+			}
+		}
+
+#endif
+
+		s_gs->VSync(field);
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Takes a snapshot through s_gs->MakeSnapshot().
+//
+// path - directory for the snapshot; a DIRECTORY_SEPARATOR is appended when
+//        the non-empty path does not already end with one, then "gsdx" is
+//        appended as the file name prefix.
+//
+// Returns the result of MakeSnapshot(), or 0 when a GSDXRecoverableError
+// was raised.
+EXPORT_C_(uint32) GSmakeSnapshot(char* path)
+{
+	try
+	{
+		string s(path);
+
+		if(!s.empty() && s[s.length() - 1] != DIRECTORY_SEPARATOR)
+		{
+			s += DIRECTORY_SEPARATOR;
+		}
+
+		return s_gs->MakeSnapshot(s + "gsdx");
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+		// The function returns uint32, so report failure as 0 rather than `false`
+		return 0;
+	}
+}
+
+// Forwards a key event to s_gs->KeyEvent(), but only once gsopen_done has
+// been set (GSopen/GSopen2 set it after _GSopen returns). Events arriving
+// earlier are dropped. A GSDXRecoverableError is swallowed on purpose.
+EXPORT_C GSkeyEvent(GSKeyEventData* e)
+{
+	try
+	{
+		if(gsopen_done)
+		{
+			s_gs->KeyEvent(e);
+		}
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Save-state entry point.
+//
+// mode - FREEZE_SAVE: s_gs->Freeze(data, false)
+//        FREEZE_SIZE: s_gs->Freeze(data, true)
+//        FREEZE_LOAD: s_gs->Defrost(data)
+// data - forwarded unchanged to the selected call.
+//
+// Returns the result of the selected call; 0 for any other mode or when a
+// GSDXRecoverableError was raised.
+EXPORT_C_(int) GSfreeze(int mode, GSFreezeData* data)
+{
+	try
+	{
+		if(mode == FREEZE_SAVE)
+		{
+			return s_gs->Freeze(data, false);
+		}
+		else if(mode == FREEZE_SIZE)
+		{
+			return s_gs->Freeze(data, true);
+		}
+		else if(mode == FREEZE_LOAD)
+		{
+			return s_gs->Defrost(data);
+		}
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+
+	return 0;
+}
+
+// Shows the configuration dialog (GSSettingsDlg on Windows, RunLinuxDialog()
+// elsewhere). Does nothing when GSUtil::CheckSSE() fails.
+//
+// When the dialog is accepted, s_renderer is reset to Undefined so that the
+// next GSopen2() re-reads the renderer from the configuration; the non-Windows
+// path additionally calls theApp.ReloadConfig(). A GSDXRecoverableError is
+// swallowed on purpose.
+EXPORT_C GSconfigure()
+{
+	try
+	{
+		if(!GSUtil::CheckSSE()) return;
+
+#ifdef _WIN32
+		GSDialog::InitCommonControls();
+		if(GSSettingsDlg().DoModal() == IDOK)
+		{
+			// Force a reload of the gs state
+			s_renderer = GSRendererType::Undefined;
+		}
+
+#else
+
+		if (RunLinuxDialog()) {
+			theApp.ReloadConfig();
+			// Force a reload of the gs state
+			s_renderer = GSRendererType::Undefined;
+		}
+
+#endif
+
+	}
+	catch (const GSDXRecoverableError&) // by reference: no copy of the exception object
+	{
+	}
+}
+
+// Capability probe: returns 0 when the plugin can run, -1 otherwise.
+//
+// GSUtil::CheckSSE() must pass. On Windows, COM is initialised for the
+// duration of a GSUtil::CheckDirectX() probe, released again when the
+// initialisation had succeeded, and s_hr is always left as E_FAIL.
+EXPORT_C_(int) GStest()
+{
+	if(!GSUtil::CheckSSE())
+	{
+		return -1;
+	}
+
+#ifdef _WIN32
+
+	s_hr = ::CoInitializeEx(NULL, COINIT_MULTITHREADED);
+
+	const bool has_directx = GSUtil::CheckDirectX();
+
+	// Same teardown for both outcomes of the probe
+	if(SUCCEEDED(s_hr))
+	{
+		::CoUninitialize();
+	}
+
+	s_hr = E_FAIL;
+
+	if(!has_directx)
+	{
+		return -1;
+	}
+
+#endif
+
+	return 0;
+}
+
+// Exported "about" entry point. Intentionally a no-op: the body is empty.
+EXPORT_C GSabout()
+{
+}
+
+// Remembers the IRQ callback in s_irq and, when a renderer already exists,
+// installs it on that renderer right away.
+EXPORT_C GSirqCallback(void (*irq)())
+{
+	s_irq = irq;
+
+	if(!s_gs)
+	{
+		// No renderer yet: only the stored pointer is updated
+		return;
+	}
+
+	s_gs->SetIrqCallback(s_irq);
+}
+
+// Prints the current local time as HH:MM:SS immediately followed by `str`
+// to stdout. When the local time cannot be obtained only `str` is printed.
+void pt(const char* str){
+	time_t now;
+
+	time(&now);
+
+	// localtime() may return NULL on failure; do not dereference it blindly
+	const struct tm *current = localtime(&now);
+
+	if(current == NULL)
+	{
+		printf("%s", str);
+		return;
+	}
+
+	printf("%02i:%02i:%02i%s", current->tm_hour, current->tm_min, current->tm_sec, str);
+}
+
+// Starts or stops video capture on the current renderer.
+//
+// start - bit 0 set: begin capture; bit 0 clear: end capture.
+// data  - unused.
+//
+// Returns 1 when the command was carried out, 0 when there is no renderer,
+// when (Linux only) the "capture_enabled" option is off, or when
+// BeginCapture() reported failure.
+EXPORT_C_(int) GSsetupRecording(int start, void* data)
+{
+	if (s_gs == NULL) {
+		printf("GSdx: no s_gs for recording\n");
+		return 0;
+	}
+#ifdef __linux__
+	if (!theApp.GetConfig("capture_enabled", 0)) {
+		printf("GSdx: Recording is disabled\n");
+		return 0;
+	}
+#endif
+
+	const bool begin = (start & 1) != 0;
+
+	if(!begin)
+	{
+		printf("GSdx: Recording end command\n");
+		s_gs->EndCapture();
+		pt(" - Capture ended\n");
+		return 1;
+	}
+
+	printf("GSdx: Recording start command\n");
+
+	if (!s_gs->BeginCapture()) {
+		pt(" - Capture cancelled\n");
+		return 0;
+	}
+
+	pt(" - Capture started\n");
+	return 1;
+}
+
+// Forwards the game CRC and option flags to s_gs->SetGameCRC().
+// s_gs is dereferenced without a NULL check, so a renderer must already exist.
+EXPORT_C GSsetGameCRC(uint32 crc, int options)
+{
+	s_gs->SetGameCRC(crc, options);
+}
+
+// Forwards `tag` to s_gs->GetLastTag().
+// s_gs is dereferenced without a NULL check, so a renderer must already exist.
+EXPORT_C GSgetLastTag(uint32* tag)
+{
+	s_gs->GetLastTag(tag);
+}
+
+// Writes a NUL-terminated title string into `dest`.
+//
+// dest   - output buffer.
+// length - capacity of `dest` in bytes, including the terminating NUL.
+//
+// The string is "GSdx" + s_renderer_name + s_renderer_type and, once the GS
+// is open and the renderer's title buffer is non-empty, " | " followed by
+// that buffer. The result is always truncated to fit `length`; nothing is
+// written when dest is NULL or length is 0.
+EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
+{
+	if(dest == NULL || length == 0)
+	{
+		// No room even for the terminator (and length - 1 would wrap around)
+		return;
+	}
+
+	string s = "GSdx";
+	s.append(s_renderer_name).append(s_renderer_type);
+
+	// TODO: this gets called from a different thread concurrently with GSOpen (on linux)
+	if (gsopen_done && s_gs != NULL && s_gs->m_GStitleInfoBuffer[0])
+	{
+		std::lock_guard<std::mutex> lock(s_gs->m_pGSsetTitle_Crit);
+
+		s.append(" | ").append(s_gs->m_GStitleInfoBuffer);
+	}
+
+	// Truncate in every case, not only when the title buffer was appended:
+	// the renderer name alone may already exceed the caller's buffer.
+	if(s.size() > length - 1)
+	{
+		s.resize(length - 1);
+	}
+
+	strcpy(dest, s.c_str());
+}
+
+// Forwards `frameskip` to s_gs->SetFrameSkip().
+// s_gs is dereferenced without a NULL check, so a renderer must already exist.
+EXPORT_C GSsetFrameSkip(int frameskip)
+{
+	s_gs->SetFrameSkip(frameskip);
+}
+
+// Stores the vsync flag (any non-zero `enabled` means on) in s_vsync and
+// applies it to the renderer when one exists.
+EXPORT_C GSsetVsync(int enabled)
+{
+	s_vsync = (enabled != 0);
+
+	if(!s_gs)
+	{
+		// Only remembered for now; nothing to apply it to
+		return;
+	}
+
+	s_gs->SetVSync(s_vsync);
+}
+
+// Stores the exclusive-mode flag (any non-zero `enabled` means on) in
+// s_exclusive and, when a renderer exists, re-applies the current vsync
+// setting to it.
+// NOTE(review): the renderer is only told about s_vsync here, not about
+// s_exclusive - presumably the exclusive flag is consumed elsewhere; confirm
+// this is not a copy/paste slip from GSsetVsync.
+EXPORT_C GSsetExclusive(int enabled)
+{
+	s_exclusive = !!enabled;
+
+	if(s_gs)
+	{
+		s_gs->SetVSync(s_vsync);
+	}
+}
+
+// Stores the frame limiter flag (any non-zero `limit` means on) in
+// s_framelimit and applies it to the renderer when one exists.
+EXPORT_C GSsetFrameLimit(int limit)
+{
+	s_framelimit = (limit != 0);
+
+	if(!s_gs)
+	{
+		// Only remembered for now; nothing to apply it to
+		return;
+	}
+
+	s_gs->SetFrameLimit(s_framelimit);
+}
+
+#ifdef _WIN32
+
+#include <io.h>
+#include <fcntl.h>
+
+// Helper that allocates a Win32 console for the process and redirects the
+// C runtime's stdout to it. The console is released again on destruction.
+class Console
+{
+	HANDLE m_console; // stdout handle of the allocated console, NULL while closed
+	string m_title;   // console window title
+
+public:
+	// title - window title used when the console is opened.
+	// open  - when true the console is opened immediately.
+	// (No `Console::` qualification here: qualified names are not valid for
+	// in-class member declarations in standard C++.)
+	Console(LPCSTR title, bool open)
+		: m_console(NULL)
+		, m_title(title)
+	{
+		if(open) Open();
+	}
+
+	~Console()
+	{
+		Close();
+	}
+
+	// Allocates the console, sizes its buffer/window and points stdout at it.
+	// Does nothing when the console is already open.
+	void Open()
+	{
+		if(m_console != NULL)
+		{
+			return;
+		}
+
+		AllocConsole();
+
+		SetConsoleTitle(m_title.c_str());
+
+		m_console = GetStdHandle(STD_OUTPUT_HANDLE);
+
+		// 100 columns x 300 lines of scroll-back
+		COORD size;
+
+		size.X = 100;
+		size.Y = 300;
+
+		SetConsoleScreenBufferSize(m_console, size);
+
+		CONSOLE_SCREEN_BUFFER_INFO csbiInfo;
+
+		// Only resize the window when the current geometry could be queried;
+		// otherwise csbiInfo would be read uninitialised.
+		if(GetConsoleScreenBufferInfo(m_console, &csbiInfo))
+		{
+			SMALL_RECT rect;
+
+			rect = csbiInfo.srWindow;
+			rect.Right = rect.Left + 99;
+			rect.Bottom = rect.Top + 64;
+
+			SetConsoleWindowInfo(m_console, TRUE, &rect);
+		}
+
+		// _open_osfhandle takes an intptr_t: casting the HANDLE to long would
+		// truncate it on 64-bit Windows.
+		int fd = _open_osfhandle((intptr_t)m_console, _O_TEXT);
+
+		FILE* fp = fd != -1 ? _fdopen(fd, "w") : NULL;
+
+		if(fp != NULL)
+		{
+			*stdout = *fp;
+		}
+
+		setvbuf(stdout, NULL, _IONBF, 0);
+	}
+
+	// Releases the console if it is open.
+	void Close()
+	{
+		if(m_console != NULL)
+		{
+			FreeConsole();
+
+			m_console = NULL;
+		}
+	}
+};
+
+// lpszCmdLine:
+//   First parameter is the renderer.
+//   Second parameter is the gs file to load and run.
+
+// Replays a GS dump file in a loop until the replay window is closed.
+//
+// lpszCmdLine - "<renderer number> <path of the .gs dump>"; the leading
+//               number is optional (renderer stays Undefined without it).
+// hwnd, hinst, nCmdShow - unused.
+//
+// Dump layout as read below: CRC (4 bytes), state size (4 bytes), state
+// data, 0x2000 bytes of registers, then a stream of packets identified by a
+// leading type byte:
+//   0: transfer - path byte, size (4 bytes), data
+//   1: vsync    - field byte
+//   2: FIFO read - size (4 bytes)
+//   3: register block - 0x2000 bytes
+// A truncated or malformed dump stops the loading at the first bad packet;
+// everything loaded up to that point is still replayed.
+EXPORT_C GSReplay(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
+{
+	GSRendererType renderer = GSRendererType::Undefined;
+
+	{
+		char* start = lpszCmdLine;
+		char* end = NULL;
+		long n = strtol(lpszCmdLine, &end, 10);
+		if(end > start) {renderer = static_cast<GSRendererType>(n); lpszCmdLine = end;}
+	}
+
+	while(*lpszCmdLine == ' ') lpszCmdLine++;
+
+	::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS);
+
+	if(FILE* fp = fopen(lpszCmdLine, "rb"))
+	{
+		Console console("GSdx", true);
+
+		GSinit();
+
+		uint8 regs[0x2000];
+		GSsetBaseMem(regs);
+
+		s_vsync = !!theApp.GetConfig("vsync", 0);
+
+		HWND hWnd = NULL;
+
+		// Without a renderer every call below would dereference a NULL s_gs
+		if(_GSopen((void**)&hWnd, "", renderer) != 0 || s_gs == NULL)
+		{
+			fprintf(stderr, "GSdx: replay aborted, failed to open the GS\n");
+			fclose(fp);
+			return;
+		}
+
+		// Header: CRC, saved state, register block
+		uint32 crc = 0;
+		GSFreezeData fd;
+		fd.size = 0;
+		fd.data = NULL;
+
+		bool ok = fread(&crc, 4, 1, fp) == 1 && fread(&fd.size, 4, 1, fp) == 1;
+
+		if(ok)
+		{
+			GSsetGameCRC(crc, 0);
+
+			fd.data = new uint8[fd.size];
+			ok = fd.size == 0 || fread(fd.data, fd.size, 1, fp) == 1;
+			if(ok) GSfreeze(FREEZE_LOAD, &fd);
+			delete [] fd.data;
+		}
+
+		ok = ok && fread(regs, 0x2000, 1, fp) == 1;
+
+		if(!ok)
+		{
+			fprintf(stderr, "GSdx: replay aborted, dump header is truncated\n");
+		}
+		else
+		{
+			GSvsync(1);
+
+			struct Packet {uint8 type, param; uint32 size, addr; vector<uint8> buff;};
+
+			list<Packet*> packets;
+			vector<uint8> buff;
+			int type;
+
+			while((type = fgetc(fp)) != EOF)
+			{
+				Packet* p = new Packet();
+				bool good = true;
+
+				p->type = (uint8)type;
+
+				switch(type)
+				{
+				case 0:
+					{
+						int param = fgetc(fp);
+
+						good = param != EOF && fread(&p->size, 4, 1, fp) == 1;
+
+						if(!good) break;
+
+						p->param = (uint8)param;
+
+						switch(p->param)
+						{
+						case 0:
+							// Path 0 data sits at the end of a 0x4000 byte buffer; a
+							// larger size would wrap addr and write out of bounds.
+							good = p->size <= 0x4000;
+							if(good)
+							{
+								p->buff.resize(0x4000);
+								p->addr = 0x4000 - p->size;
+								good = p->size == 0 || fread(p->buff.data() + p->addr, p->size, 1, fp) == 1;
+							}
+							break;
+						case 1:
+						case 2:
+						case 3:
+							p->buff.resize(p->size);
+							good = p->size == 0 || fread(p->buff.data(), p->size, 1, fp) == 1;
+							break;
+						}
+					}
+
+					break;
+
+				case 1:
+					{
+						int param = fgetc(fp);
+
+						good = param != EOF;
+						p->param = (uint8)param;
+					}
+
+					break;
+
+				case 2:
+
+					good = fread(&p->size, 4, 1, fp) == 1;
+
+					break;
+
+				case 3:
+
+					p->buff.resize(0x2000);
+
+					good = fread(p->buff.data(), 0x2000, 1, fp) == 1;
+
+					break;
+				}
+
+				if(!good)
+				{
+					fprintf(stderr, "GSdx: dump is truncated or malformed, remaining packets ignored\n");
+					delete p;
+					break;
+				}
+
+				packets.push_back(p);
+			}
+
+			Sleep(100);
+
+			// Loop over the recorded packets for as long as the window is shown
+			while(IsWindowVisible(hWnd))
+			{
+				for(list<Packet*>::iterator i = packets.begin(); i != packets.end(); i++)
+				{
+					Packet* p = *i;
+
+					switch(p->type)
+					{
+					case 0:
+
+						// data() instead of &buff[0]: valid on empty buffers too
+						switch(p->param)
+						{
+						case 0: GSgifTransfer1(p->buff.data(), p->addr); break;
+						case 1: GSgifTransfer2(p->buff.data(), p->size / 16); break;
+						case 2: GSgifTransfer3(p->buff.data(), p->size / 16); break;
+						case 3: GSgifTransfer(p->buff.data(), p->size / 16); break;
+						}
+
+						break;
+
+					case 1:
+
+						GSvsync(p->param);
+
+						break;
+
+					case 2:
+
+						if(buff.size() < p->size) buff.resize(p->size);
+
+						GSreadFIFO2(buff.data(), p->size / 16);
+
+						break;
+
+					case 3:
+
+						memcpy(regs, p->buff.data(), 0x2000);
+
+						break;
+					}
+				}
+			}
+
+			for(list<Packet*>::iterator i = packets.begin(); i != packets.end(); i++)
+			{
+				delete *i;
+			}
+
+			packets.clear();
+
+			Sleep(100);
+		}
+
+		GSclose();
+		GSshutdown();
+
+		fclose(fp);
+	}
+}
+
+// Converts `amount` units of work repeated `n` times within `elapsed` clock
+// ticks into the integer figure printed by GSBenchmark (amount * n / ticks /
+// 1000). A zero tick delta is treated as one tick to avoid dividing by zero.
+static int GSBenchmarkRate(float amount, int n, clock_t elapsed)
+{
+	if(elapsed <= 0) elapsed = 1;
+
+	return (int)(amount * n / elapsed / 1000);
+}
+
+// Times the GSLocalMemory image write, image read and texture read routines
+// for every pixel format in s_format, at square sizes from 32x32 (tbw = 5)
+// up to 1024x1024 (tbw = 10), and prints one line of figures per format to
+// the console. Posts a quit message when done. All parameters are unused.
+EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
+{
+	::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS);
+
+	Console console("GSdx", true);
+
+	GSLocalMemory* mem = new GSLocalMemory();
+
+	static struct {int psm; const char* name;} s_format[] =
+	{
+		{PSM_PSMCT32, "32"},
+		{PSM_PSMCT24, "24"},
+		{PSM_PSMCT16, "16"},
+		{PSM_PSMCT16S, "16S"},
+		{PSM_PSMT8, "8"},
+		{PSM_PSMT4, "4"},
+		{PSM_PSMT8H, "8H"},
+		{PSM_PSMT4HL, "4HL"},
+		{PSM_PSMT4HH, "4HH"},
+		{PSM_PSMZ32, "32Z"},
+		{PSM_PSMZ24, "24Z"},
+		{PSM_PSMZ16, "16Z"},
+		{PSM_PSMZ16S, "16ZS"},
+	};
+
+	// 4 MiB scratch buffer, 32-byte aligned, filled with a repeating byte ramp
+	uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
+
+	if(ptr == NULL)
+	{
+		printf("GSdx: benchmark buffer allocation failed\n");
+
+		delete mem;
+
+		PostQuitMessage(0);
+
+		return;
+	}
+
+	for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;
+
+	for(int tbw = 5; tbw <= 10; tbw++)
+	{
+		// Fewer repetitions for larger images: 256 at 1024x1024, x4 per step down
+		int n = 256 << ((10 - tbw) * 2);
+
+		int w = 1 << tbw;
+		int h = 1 << tbw;
+
+		printf("%d x %d\n\n", w, h);
+
+		for(size_t i = 0; i < countof(s_format); i++)
+		{
+			const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[s_format[i].psm];
+
+			GSLocalMemory::writeImage wi = psm.wi;
+			GSLocalMemory::readImage ri = psm.ri;
+			GSLocalMemory::readTexture rtx = psm.rtx;
+			GSLocalMemory::readTexture rtxP = psm.rtxP;
+
+			GIFRegBITBLTBUF BITBLTBUF;
+
+			BITBLTBUF.SBP = 0;
+			BITBLTBUF.SBW = w / 64;
+			BITBLTBUF.SPSM = s_format[i].psm;
+			BITBLTBUF.DBP = 0;
+			BITBLTBUF.DBW = w / 64;
+			BITBLTBUF.DPSM = s_format[i].psm;
+
+			GIFRegTRXPOS TRXPOS;
+
+			TRXPOS.SSAX = 0;
+			TRXPOS.SSAY = 0;
+			TRXPOS.DSAX = 0;
+			TRXPOS.DSAY = 0;
+
+			GIFRegTRXREG TRXREG;
+
+			TRXREG.RRW = w;
+			TRXREG.RRH = h;
+
+			GSVector4i r(0, 0, w, h);
+
+			GIFRegTEX0 TEX0;
+
+			TEX0.TBP0 = 0;
+			TEX0.TBW = w / 64;
+			// PSM is passed to GetOffset() below and must not be left uninitialised
+			TEX0.PSM = s_format[i].psm;
+
+			GIFRegTEXA TEXA;
+
+			TEXA.TA0 = 0;
+			TEXA.TA1 = 0x80;
+			TEXA.AEM = 0;
+
+			int trlen = w * h * psm.trbpp / 8;
+			int len = w * h * psm.bpp / 8;
+
+			clock_t start, end;
+
+			printf("[%4s] ", s_format[i].name);
+
+			// Image write
+			start = clock();
+
+			for(int j = 0; j < n; j++)
+			{
+				int x = 0;
+				int y = 0;
+
+				(mem->*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+			}
+
+			end = clock();
+
+			printf("%6d %6d | ", GSBenchmarkRate((float)trlen, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));
+
+			// Image read
+			start = clock();
+
+			for(int j = 0; j < n; j++)
+			{
+				int x = 0;
+				int y = 0;
+
+				(mem->*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
+			}
+
+			end = clock();
+
+			printf("%6d %6d | ", GSBenchmarkRate((float)trlen, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));
+
+			const GSOffset* off = mem->GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+			// Texture read (destination pitch of w * 4 bytes)
+			start = clock();
+
+			for(int j = 0; j < n; j++)
+			{
+				(mem->*rtx)(off, r, ptr, w * 4, TEXA);
+			}
+
+			end = clock();
+
+			printf("%6d %6d ", GSBenchmarkRate((float)len, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));
+
+			if(psm.pal > 0)
+			{
+				// Texture read through rtxP, only for formats with pal > 0
+				start = clock();
+
+				for(int j = 0; j < n; j++)
+				{
+					(mem->*rtxP)(off, r, ptr, w, TEXA);
+				}
+
+				end = clock();
+
+				printf("| %6d %6d ", GSBenchmarkRate((float)len, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));
+			}
+
+			printf("\n");
+		}
+
+		printf("\n");
+	}
+
+	_aligned_free(ptr);
+
+	delete mem;
+
+	PostQuitMessage(0);
+}
+
+#endif
+
+#ifdef __linux__
+
+#include <sys/time.h>
+#include <sys/timeb.h>	// ftime(), struct timeb
+#include "GSLzma.h"
+
+// Linux stand-in for the Win32 timeGetTime(): returns the current wall-clock
+// time in milliseconds. Only differences between two calls are meaningful;
+// the value wraps around when it exceeds the range of unsigned long.
+inline unsigned long timeGetTime()
+{
+	// gettimeofday() (from <sys/time.h>, included above) replaces the
+	// obsolete ftime(). The arithmetic is done in unsigned long so that the
+	// multiplication cannot overflow a signed 32-bit time_t.
+	timeval t;
+	gettimeofday(&t, NULL);
+
+	return (unsigned long)t.tv_sec * 1000ul + (unsigned long)(t.tv_usec / 1000);
+}
+
+// Replays a GS dump file and prints timing statistics to stderr.
+//
+// lpszCmdLine - path of the dump; a name ending in ".xz" is read through
+//               GSDumpLzma when LZMA_SUPPORTED is defined, everything else
+//               through GSDumpRaw.
+// renderer    - currently ignored: the renderer comes from the "Renderer"
+//               config entry and must be OGL_HW or OGL_SW.
+//
+// The "linux_replay" config entry gives the number of replay passes (a value
+// above 90 replays forever); enabling "dump" forces a single pass.
+// A packet with an invalid size stops the loading; the packets read up to
+// that point are still replayed.
+EXPORT_C GSReplay(char* lpszCmdLine, int renderer)
+{
+	GLLoader::in_replayer = true;
+
+	GSRendererType m_renderer;
+	// Allow to easily switch between SW/HW renderer -> this effectively removes the ability to select the renderer by function args
+	m_renderer = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(GSRendererType::Default)));
+	// alternatively:
+	// m_renderer = static_cast<GSRendererType>(renderer);
+
+	if (m_renderer != GSRendererType::OGL_HW && m_renderer != GSRendererType::OGL_SW)
+	{
+		fprintf(stderr, "wrong renderer selected %d\n", static_cast<int>(m_renderer));
+		return;
+	}
+
+	struct Packet {uint8 type, param; uint32 size, addr; vector<uint8> buff;};
+
+	list<Packet*> packets;
+	vector<uint8> buff;
+	vector<float> stats;
+	uint8 regs[0x2000];
+
+	GSinit();
+
+	GSsetBaseMem(regs);
+
+	s_vsync = !!theApp.GetConfig("vsync", 0);
+
+	void* hWnd = NULL;
+
+	int err = _GSopen((void**)&hWnd, "", m_renderer);
+	if (err != 0) {
+		fprintf(stderr, "Error failed to GSopen\n");
+		return;
+	}
+	if (s_gs->m_wnd == NULL) return;
+
+	{ // Read .gs content
+		std::string f(lpszCmdLine);
+#ifdef LZMA_SUPPORTED
+		GSDumpFile* file = (f.size() >= 4) && (f.compare(f.size()-3, 3, ".xz") == 0)
+			? (GSDumpFile*) new GSDumpLzma(lpszCmdLine)
+			: (GSDumpFile*) new GSDumpRaw(lpszCmdLine);
+#else
+		GSDumpFile* file = new GSDumpRaw(lpszCmdLine);
+#endif
+
+		uint32 crc;
+		file->Read(&crc, 4);
+		GSsetGameCRC(crc, 0);
+
+		GSFreezeData fd;
+		file->Read(&fd.size, 4);
+		fd.data = new uint8[fd.size];
+		file->Read(fd.data, fd.size);
+
+		GSfreeze(FREEZE_LOAD, &fd);
+		delete [] fd.data;
+
+		file->Read(regs, 0x2000);
+
+		GSvsync(1);
+
+
+		while(!file->IsEof())
+		{
+			// 0xff matches no packet type, so a failed read cannot be
+			// mistaken for a real packet
+			uint8 type = 0xff;
+			file->Read(&type, 1);
+
+			Packet* p = new Packet();
+			bool malformed = false;
+
+			p->type = type;
+
+			switch(type)
+			{
+			case 0:
+				file->Read(&p->param, 1);
+				file->Read(&p->size, 4);
+
+				switch(p->param)
+				{
+				case 0:
+					// Path 0 data sits at the end of a 0x4000 byte buffer; a
+					// larger size would wrap addr and write out of bounds.
+					if(p->size > 0x4000)
+					{
+						malformed = true;
+						break;
+					}
+					p->buff.resize(0x4000);
+					p->addr = 0x4000 - p->size;
+					if(p->size > 0) file->Read(p->buff.data() + p->addr, p->size);
+					break;
+				case 1:
+				case 2:
+				case 3:
+					p->buff.resize(p->size);
+					if(p->size > 0) file->Read(p->buff.data(), p->size);
+					break;
+				}
+
+				break;
+
+			case 1:
+				file->Read(&p->param, 1);
+
+				break;
+
+			case 2:
+				file->Read(&p->size, 4);
+
+				break;
+
+			case 3:
+				p->buff.resize(0x2000);
+
+				file->Read(p->buff.data(), 0x2000);
+
+				break;
+			}
+
+			if(malformed)
+			{
+				fprintf(stderr, "GSdx: invalid packet size %u in dump, remaining packets ignored\n", (unsigned int)p->size);
+				delete p;
+				break;
+			}
+
+			packets.push_back(p);
+		}
+
+		delete file;
+	}
+
+	sleep(1);
+
+	//while(IsWindowVisible(hWnd))
+	//FIXME map?
+	int finished = theApp.GetConfig("linux_replay", 1);
+	if (theApp.GetConfig("dump", 0)) {
+		fprintf(stderr, "Dump is enabled. Replay will be disabled\n");
+		finished = 1;
+	}
+	unsigned long frame_number = 0;
+	unsigned long total_frame_nb = 0;
+	while(finished > 0)
+	{
+		frame_number = 0;
+		unsigned long start = timeGetTime();
+		for(auto i = packets.begin(); i != packets.end(); i++)
+		{
+			Packet* p = *i;
+
+			switch(p->type)
+			{
+				case 0:
+
+					// data() instead of &buff[0]: valid on empty buffers too
+					switch(p->param)
+					{
+						case 0: GSgifTransfer1(p->buff.data(), p->addr); break;
+						case 1: GSgifTransfer2(p->buff.data(), p->size / 16); break;
+						case 2: GSgifTransfer3(p->buff.data(), p->size / 16); break;
+						case 3: GSgifTransfer(p->buff.data(), p->size / 16); break;
+					}
+
+					break;
+
+				case 1:
+
+					GSvsync(p->param);
+					frame_number++;
+
+					break;
+
+				case 2:
+
+					if(buff.size() < p->size) buff.resize(p->size);
+
+					GSreadFIFO2(buff.data(), p->size / 16);
+
+					break;
+
+				case 3:
+
+					memcpy(regs, p->buff.data(), 0x2000);
+
+					break;
+			}
+		}
+
+		// Ensure the rendering is complete to measure correctly the time.
+		glFinish();
+
+		if (finished > 90) {
+			// Endless replay mode: no statistics, the counter is never decremented
+			sleep(1);
+		} else {
+			unsigned long end = timeGetTime();
+			frame_number = std::max(1ul, frame_number); // avoid a potential division by 0
+
+			// %lu: both values are unsigned long
+			fprintf(stderr, "The %lu frames of the scene was render on %lums\n", frame_number, end - start);
+			fprintf(stderr, "A means of %fms by frame\n", (float)(end - start)/(float)frame_number);
+
+			stats.push_back((float)(end - start));
+
+			finished--;
+			total_frame_nb += frame_number;
+		}
+	}
+
+	if (theApp.GetConfig("linux_replay", 1) > 1) {
+		// Print some nice stats
+		// Skip first frame (shader compilation populate the result)
+		// it divides by 10 the standard deviation...
+		float n = (float)theApp.GetConfig("linux_replay", 1) - 1.0f;
+		float mean = 0;
+		float sd = 0;
+		for (auto i = stats.begin()+1; i != stats.end(); i++) {
+			mean += *i;
+		}
+		mean = mean/n;
+		for (auto i = stats.begin()+1; i != stats.end(); i++) {
+			sd += pow((*i)-mean, 2);
+		}
+		sd = sqrt(sd/n);
+
+		fprintf(stderr, "\n\nMean: %fms\n", mean);
+		fprintf(stderr, "Standard deviation: %fms\n", sd);
+		fprintf(stderr, "Mean by frame: %fms (%ffps)\n", mean/(float)frame_number, 1000.0f*frame_number/mean);
+		fprintf(stderr, "Standard deviatin by frame: %fms\n", sd/(float)frame_number);
+	}
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+	total_frame_nb *= 1024;
+	fprintf(stderr, "memory bandwith. T: %f KB/f. V: %f KB/f. U: %f KB/f\n",
+			(float)g_real_texture_upload_byte/(float)total_frame_nb,
+			(float)g_vertex_upload_byte/(float)total_frame_nb,
+			(float)g_uniform_upload_byte/(float)total_frame_nb
+		   );
+#endif
+
+	for(auto i = packets.begin(); i != packets.end(); i++)
+	{
+		delete *i;
+	}
+
+	packets.clear();
+
+	sleep(1);
+
+	GSclose();
+	GSshutdown();
+}
+#endif
+
diff --git a/plugins/GSdx_legacy/GS.h b/plugins/GSdx_legacy/GS.h
new file mode 100644
index 0000000000..b1c7fb5bfd
--- /dev/null
+++ b/plugins/GSdx_legacy/GS.h
@@ -0,0 +1,1300 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#define PLUGIN_VERSION 0
+
+// Memory geometry constants, all in bytes.
+// VM_SIZE is 4 MiB (4 * 1024 * 1024).
+#define VM_SIZE 4194304
+#define PAGE_SIZE 8192
+#define BLOCK_SIZE 256
+#define COLUMN_SIZE 64
+
+// Number of pages / blocks / columns that fit in VM_SIZE
+// (512, 16384 and 65536 respectively).
+#define MAX_PAGES (VM_SIZE / PAGE_SIZE)
+#define MAX_BLOCKS (VM_SIZE / BLOCK_SIZE)
+#define MAX_COLUMNS (VM_SIZE / COLUMN_SIZE)
+
+// If defined, much more info is sent in reply to the title info API query from PCSX2.
+// Should be left undefined by default.
+//#define GSTITLEINFO_API_FORCE_VERBOSE
+
+#include "GSVector.h"
+
+#pragma pack(push, 1)
+
+// Primitive type codes (0-6); the value 7 is marked as invalid.
+enum GS_PRIM
+{
+	GS_POINTLIST		= 0,
+	GS_LINELIST			= 1,
+	GS_LINESTRIP		= 2,
+	GS_TRIANGLELIST		= 3,
+	GS_TRIANGLESTRIP	= 4,
+	GS_TRIANGLEFAN		= 5,
+	GS_SPRITE			= 6,
+	GS_INVALID			= 7,
+};
+
+// Primitive classes. Note that the invalid class is 7, not the next free
+// value after GS_SPRITE_CLASS.
+enum GS_PRIM_CLASS
+{
+	GS_POINT_CLASS		= 0,
+	GS_LINE_CLASS		= 1,
+	GS_TRIANGLE_CLASS	= 2,
+	GS_SPRITE_CLASS		= 3,
+	GS_INVALID_CLASS	= 7,
+};
+
+// 4-bit GIF register codes (0x00-0x0f); 0x0b is marked as invalid.
+// NOTE(review): presumably the register descriptors used in GIF tags - confirm.
+enum GIF_REG
+{
+	GIF_REG_PRIM	= 0x00,
+	GIF_REG_RGBA	= 0x01,
+	GIF_REG_STQ		= 0x02,
+	GIF_REG_UV		= 0x03,
+	GIF_REG_XYZF2	= 0x04,
+	GIF_REG_XYZ2	= 0x05,
+	GIF_REG_TEX0_1	= 0x06,
+	GIF_REG_TEX0_2	= 0x07,
+	GIF_REG_CLAMP_1	= 0x08,
+	GIF_REG_CLAMP_2	= 0x09,
+	GIF_REG_FOG		= 0x0a,
+	GIF_REG_INVALID	= 0x0b,
+	GIF_REG_XYZF3	= 0x0c,
+	GIF_REG_XYZ3	= 0x0d,
+	GIF_REG_A_D		= 0x0e,
+	GIF_REG_NOP		= 0x0f,
+};
+
+// Codes for the two combined STQ + RGBA + XYZ(F)2 register sequences.
+enum GIF_REG_COMPLEX
+{
+	GIF_REG_STQRGBAXYZF2	= 0x00,
+	GIF_REG_STQRGBAXYZ2		= 0x01,
+};
+
+// Register addresses in the range 0x00-0x62. The numbering is sparse: only
+// the addresses listed here have a name.
+// NOTE(review): presumably the addresses used with GIF_REG_A_D - confirm.
+enum GIF_A_D_REG
+{
+	GIF_A_D_REG_PRIM		= 0x00,
+	GIF_A_D_REG_RGBAQ		= 0x01,
+	GIF_A_D_REG_ST			= 0x02,
+	GIF_A_D_REG_UV			= 0x03,
+	GIF_A_D_REG_XYZF2		= 0x04,
+	GIF_A_D_REG_XYZ2		= 0x05,
+	GIF_A_D_REG_TEX0_1		= 0x06,
+	GIF_A_D_REG_TEX0_2		= 0x07,
+	GIF_A_D_REG_CLAMP_1		= 0x08,
+	GIF_A_D_REG_CLAMP_2		= 0x09,
+	GIF_A_D_REG_FOG			= 0x0a,
+	GIF_A_D_REG_XYZF3		= 0x0c,
+	GIF_A_D_REG_XYZ3		= 0x0d,
+	GIF_A_D_REG_NOP			= 0x0f,
+	GIF_A_D_REG_TEX1_1		= 0x14,
+	GIF_A_D_REG_TEX1_2		= 0x15,
+	GIF_A_D_REG_TEX2_1		= 0x16,
+	GIF_A_D_REG_TEX2_2		= 0x17,
+	GIF_A_D_REG_XYOFFSET_1	= 0x18,
+	GIF_A_D_REG_XYOFFSET_2	= 0x19,
+	GIF_A_D_REG_PRMODECONT	= 0x1a,
+	GIF_A_D_REG_PRMODE		= 0x1b,
+	GIF_A_D_REG_TEXCLUT		= 0x1c,
+	GIF_A_D_REG_SCANMSK		= 0x22,
+	GIF_A_D_REG_MIPTBP1_1	= 0x34,
+	GIF_A_D_REG_MIPTBP1_2	= 0x35,
+	GIF_A_D_REG_MIPTBP2_1	= 0x36,
+	GIF_A_D_REG_MIPTBP2_2	= 0x37,
+	GIF_A_D_REG_TEXA		= 0x3b,
+	GIF_A_D_REG_FOGCOL		= 0x3d,
+	GIF_A_D_REG_TEXFLUSH	= 0x3f,
+	GIF_A_D_REG_SCISSOR_1	= 0x40,
+	GIF_A_D_REG_SCISSOR_2	= 0x41,
+	GIF_A_D_REG_ALPHA_1		= 0x42,
+	GIF_A_D_REG_ALPHA_2		= 0x43,
+	GIF_A_D_REG_DIMX		= 0x44,
+	GIF_A_D_REG_DTHE		= 0x45,
+	GIF_A_D_REG_COLCLAMP	= 0x46,
+	GIF_A_D_REG_TEST_1		= 0x47,
+	GIF_A_D_REG_TEST_2		= 0x48,
+	GIF_A_D_REG_PABE		= 0x49,
+	GIF_A_D_REG_FBA_1		= 0x4a,
+	GIF_A_D_REG_FBA_2		= 0x4b,
+	GIF_A_D_REG_FRAME_1		= 0x4c,
+	GIF_A_D_REG_FRAME_2		= 0x4d,
+	GIF_A_D_REG_ZBUF_1		= 0x4e,
+	GIF_A_D_REG_ZBUF_2		= 0x4f,
+	GIF_A_D_REG_BITBLTBUF	= 0x50,
+	GIF_A_D_REG_TRXPOS		= 0x51,
+	GIF_A_D_REG_TRXREG		= 0x52,
+	GIF_A_D_REG_TRXDIR		= 0x53,
+	GIF_A_D_REG_HWREG		= 0x54,
+	GIF_A_D_REG_SIGNAL		= 0x60,
+	GIF_A_D_REG_FINISH		= 0x61,
+	GIF_A_D_REG_LABEL		= 0x62,
+};
+
+// 2-bit GIF data format codes: packed, register list and two image modes.
+enum GIF_FLG
+{
+	GIF_FLG_PACKED	= 0,
+	GIF_FLG_REGLIST	= 1,
+	GIF_FLG_IMAGE	= 2,
+	GIF_FLG_IMAGE2	= 3
+};
+
+// Pixel storage format codes. The numbering is sparse; the trailing comment
+// on each line shows the value in binary.
+enum GS_PSM
+{
+	PSM_PSMCT32		= 0,  // 0000-0000
+	PSM_PSMCT24		= 1,  // 0000-0001
+	PSM_PSMCT16		= 2,  // 0000-0010
+	PSM_PSMCT16S	= 10, // 0000-1010
+	PSM_PSMT8		= 19, // 0001-0011
+	PSM_PSMT4		= 20, // 0001-0100
+	PSM_PSMT8H		= 27, // 0001-1011
+	PSM_PSMT4HL		= 36, // 0010-0100
+	PSM_PSMT4HH		= 44, // 0010-1100
+	PSM_PSMZ32		= 48, // 0011-0000
+	PSM_PSMZ24		= 49, // 0011-0001
+	PSM_PSMZ16		= 50, // 0011-0010
+	PSM_PSMZ16S		= 58, // 0011-1010
+};
+
+// Texture function codes (0-3), plus TFX_NONE = 4.
+// NOTE(review): TFX_NONE presumably is an internal value rather than a
+// hardware encoding - confirm.
+enum GS_TFX
+{
+	TFX_MODULATE	= 0,
+	TFX_DECAL		= 1,
+	TFX_HIGHLIGHT	= 2,
+	TFX_HIGHLIGHT2	= 3,
+	TFX_NONE		= 4,
+};
+
+// Texture wrap mode codes: repeat, clamp, and their region variants.
+enum GS_CLAMP
+{
+	CLAMP_REPEAT		= 0,
+	CLAMP_CLAMP			= 1,
+	CLAMP_REGION_CLAMP	= 2,
+	CLAMP_REGION_REPEAT	= 3,
+};
+
+// Depth test comparison codes.
+enum GS_ZTST
+{
+	ZTST_NEVER		= 0,
+	ZTST_ALWAYS		= 1,
+	ZTST_GEQUAL		= 2,
+	ZTST_GREATER	= 3,
+};
+
+// Alpha test comparison codes.
+enum GS_ATST
+{
+	ATST_NEVER		= 0,
+	ATST_ALWAYS		= 1,
+	ATST_LESS		= 2,
+	ATST_LEQUAL		= 3,
+	ATST_EQUAL		= 4,
+	ATST_GEQUAL		= 5,
+	ATST_GREATER	= 6,
+	ATST_NOTEQUAL	= 7,
+};
+
+// Codes selecting what happens when the alpha test fails.
+// NOTE(review): names suggest keep / frame buffer only / Z buffer only /
+// RGB only - confirm against the hardware documentation.
+enum GS_AFAIL
+{
+	AFAIL_KEEP		= 0,
+	AFAIL_FB_ONLY	= 1,
+	AFAIL_ZB_ONLY	= 2,
+	AFAIL_RGB_ONLY	= 3,
+};
+
+// Renderer selection, stored as an 8-bit signed value. The numeric values
+// are read from / written to the "Renderer" configuration entry, so they are
+// not contiguous per API (the OpenCL variants use 14-17).
+enum class GSRendererType : int8_t
+{
+	// No renderer chosen yet; forces the configuration to be re-read
+	Undefined = -1,
+
+	DX9_HW = 0,
+	DX9_SW = 1,
+	DX9_OpenCL = 14,
+	DX9_Null = 2,
+
+	DX1011_HW = 3,
+	DX1011_SW = 4,
+	DX1011_OpenCL = 15,
+	DX1011_Null = 5,
+
+	Null_HW = 9,
+	Null_SW = 10,
+	Null_OpenCL = 16,
+	Null_Null = 11,
+
+	OGL_HW = 12,
+	OGL_SW = 13,
+	OGL_OpenCL = 17,
+
+#ifdef _WIN32
+	Default = DX9_HW
+#else
+	// Use the OpenGL renderer as the default, otherwise it crashes at startup:
+	// GSRendererOGL only works with GSDeviceOGL (not GSDeviceNull)
+	Default = OGL_HW
+#endif
+
+};
+
+
+// Register declaration helpers.
+//
+// REG32 / REG64 / REG128 open a union called `name` holding the raw value
+// (u32, u64/u32[2] or u64[2]/u32[4]) and then open an anonymous struct in
+// which the caller lists the register's fields. The declaration is finished
+// with REG_END, which closes both the struct and the union.
+#define REG32(name) \
+union name			\
+{					\
+	uint32 u32;	\
+	struct {		\
+
+// REG64 additionally provides assignment from, conversion to and
+// (in)equality comparison through GSVector4i (low 64 bits).
+#define REG64(name) \
+union name			\
+{					\
+	uint64 u64;		\
+	uint32 u32[2];	\
+	void operator = (const GSVector4i& v) {GSVector4i::storel(this, v);} \
+	bool operator == (const union name& r) const {return ((GSVector4i)r).eq(*this);} \
+	bool operator != (const union name& r) const {return !((GSVector4i)r).eq(*this);} \
+	operator GSVector4i() const {return GSVector4i::loadl(this);} \
+	struct {		\
+
+#define REG128(name)\
+union name			\
+{					\
+	uint64 u64[2];	\
+	uint32 u32[4];	\
+	struct {		\
+
+// Same as above, but the union name is built by pasting prefix and name
+// together, e.g. REG64_(GSReg, CSR) declares `union GSRegCSR`.
+#define REG32_(prefix, name) REG32(prefix##name)
+#define REG64_(prefix, name) REG64(prefix##name)
+#define REG128_(prefix, name) REG128(prefix##name)
+
+// REG_END closes the anonymous struct and the union. REG_END2 closes only
+// one level; using it twice allows members (e.g. helper methods) to be
+// placed in the union after the field struct.
+#define REG_END }; };
+#define REG_END2 };
+
+// *_SET variants open the union with its raw value members only (no
+// anonymous struct); they are closed with REG_SET_END.
+#define REG32_SET(name) \
+union name			\
+{					\
+	uint32 u32;	\
+
+#define REG64_SET(name) \
+union name			\
+{					\
+	uint64 u64;		\
+	uint32 u32[2];	\
+
+#define REG128_SET(name)\
+union name			\
+{					\
+	__m128i m128;  \
+	uint64 u64[2];	\
+	uint32 u32[4];	\
+
+#define REG_SET_END };
+
+// GSRegBGCOLOR: three 8-bit colour channels followed by 5 bytes of padding
+// (8 bytes in total).
+REG64_(GSReg, BGCOLOR)
+	uint8 R;
+	uint8 G;
+	uint8 B;
+	uint8 _PAD1[5];
+REG_END
+
+// GSRegBUSDIR: a single 1-bit DIR field; the remaining 63 bits are padding.
+REG64_(GSReg, BUSDIR)
+	uint32 DIR:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+// GSRegCSR: the same 32-bit field layout is declared twice, once with an
+// `r` prefix (low 32 bits) and once with a `w` prefix (high 32 bits).
+// NOTE(review): presumably the read view and the write view of the
+// register - confirm.
+REG64_(GSReg, CSR)
+	uint32 rSIGNAL:1;
+	uint32 rFINISH:1;
+	uint32 rHSINT:1;
+	uint32 rVSINT:1;
+	uint32 rEDWINT:1;
+	uint32 rZERO1:1;
+	uint32 rZERO2:1;
+	uint32 r_PAD1:1;
+	uint32 rFLUSH:1;
+	uint32 rRESET:1;
+	uint32 r_PAD2:2;
+	uint32 rNFIELD:1;
+	uint32 rFIELD:1;
+	uint32 rFIFO:2;
+	uint32 rREV:8;
+	uint32 rID:8;
+	uint32 wSIGNAL:1;
+	uint32 wFINISH:1;
+	uint32 wHSINT:1;
+	uint32 wVSINT:1;
+	uint32 wEDWINT:1;
+	uint32 wZERO1:1;
+	uint32 wZERO2:1;
+	uint32 w_PAD1:1;
+	uint32 wFLUSH:1;
+	uint32 wRESET:1;
+	uint32 w_PAD2:2;
+	uint32 wNFIELD:1;
+	uint32 wFIELD:1;
+	uint32 wFIFO:2;
+	uint32 wREV:8;
+	uint32 wID:8;
+REG_END
+
+// GSRegDISPFB: FBP/FBW/PSM in the low 32 bits, DBX/DBY in the high 32 bits.
+// The first REG_END2 closes the field struct so that Block() can be
+// declared as a member of the union; the second one closes the union.
+REG64_(GSReg, DISPFB) // (-1/2)
+	uint32 FBP:9;
+	uint32 FBW:6;
+	uint32 PSM:5;
+	uint32 _PAD:12;
+	uint32 DBX:11;
+	uint32 DBY:11;
+	uint32 _PAD2:10;
+REG_END2
+	// FBP scaled by 32 (FBP << 5)
+	uint32 Block() const {return FBP << 5;}
+REG_END2
+
+// GSRegDISPLAY: DX/DY/MAGH/MAGV in the low 32 bits, DW/DH in the high
+// 32 bits.
+REG64_(GSReg, DISPLAY) // (-1/2)
+	uint32 DX:12;
+	uint32 DY:11;
+	uint32 MAGH:4;
+	uint32 MAGV:2;
+	uint32 _PAD:3;
+	uint32 DW:12;
+	uint32 DH:11;
+	uint32 _PAD2:9;
+REG_END
+
+// GSRegEXTBUF: EXBP/EXBW/FBIN/WFFMD/EMODA/EMODC in the low 32 bits,
+// WDX/WDY in the high 32 bits.
+REG64_(GSReg, EXTBUF)
+	uint32 EXBP:14;
+	uint32 EXBW:6;
+	uint32 FBIN:2;
+	uint32 WFFMD:1;
+	uint32 EMODA:2;
+	uint32 EMODC:2;
+	uint32 _PAD1:5;
+	uint32 WDX:11;
+	uint32 WDY:11;
+	uint32 _PAD2:10;
+REG_END
+
+// GSRegEXTDATA: SX/SY/SMPH/SMPV in the low 32 bits, WW/WH in the high
+// 32 bits (same bit widths as GSRegDISPLAY).
+REG64_(GSReg, EXTDATA)
+	uint32 SX:12;
+	uint32 SY:11;
+	uint32 SMPH:4;
+	uint32 SMPV:2;
+	uint32 _PAD1:3;
+	uint32 WW:12;
+	uint32 WH:11;
+	uint32 _PAD2:9;
+REG_END
+
+REG64_(GSReg, EXTWRITE)
+	uint32 WRITE:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GSReg, IMR)
+	uint32 _PAD1:8;
+	uint32 SIGMSK:1;
+	uint32 FINISHMSK:1;
+	uint32 HSMSK:1;
+	uint32 VSMSK:1;
+	uint32 EDWMSK:1;
+	uint32 _PAD2:19;
+	uint32 _PAD3:32;
+REG_END
+
+REG64_(GSReg, PMODE)
+union
+{
+	struct
+	{
+		uint32 EN1:1;
+		uint32 EN2:1;
+		uint32 CRTMD:3;
+		uint32 MMOD:1;
+		uint32 AMOD:1;
+		uint32 SLBG:1;
+		uint32 ALP:8;
+		uint32 _PAD:16;
+		uint32 _PAD1:32;
+	};
+
+	struct
+	{
+		uint32 EN:2;
+		uint32 _PAD2:30;
+		uint32 _PAD3:32;
+	};
+};
+REG_END
+
+REG64_(GSReg, SIGLBLID)
+	uint32 SIGID;
+	uint32 LBLID;
+REG_END
+
+REG64_(GSReg, SMODE1)
+	uint32 RC:3;
+	uint32 LC:7;
+	uint32 T1248:2;
+	uint32 SLCK:1;
+	uint32 CMOD:2;
+	uint32 EX:1;
+	uint32 PRST:1;
+	uint32 SINT:1;
+	uint32 XPCK:1;
+	uint32 PCK2:2;
+	uint32 SPML:4;
+	uint32 GCONT:1; // YCrCb
+	uint32 PHS:1;
+	uint32 PVS:1;
+	uint32 PEHS:1;
+	uint32 PEVS:1;
+	uint32 CLKSEL:2;
+	uint32 NVCK:1;
+	uint32 SLCK2:1;
+	uint32 VCKSEL:2;
+	uint32 VHP:1;
+	uint32 _PAD1:27;
+REG_END
+
+/*
+
+// pal
+
+CLKSEL=1 CMOD=3 EX=0 GCONT=0 LC=32 NVCK=1 PCK2=0 PEHS=0 PEVS=0 PHS=0 PRST=1 PVS=0 RC=4 SINT=0 SLCK=0 SLCK2=1 SPML=4 T1248=1 VCKSEL=1 VHP=0 XPCK=0
+
+// ntsc
+
+CLKSEL=1 CMOD=2 EX=0 GCONT=0 LC=32 NVCK=1 PCK2=0 PEHS=0 PEVS=0 PHS=0 PRST=1 PVS=0 RC=4 SINT=0 SLCK=0 SLCK2=1 SPML=4 T1248=1 VCKSEL=1 VHP=0 XPCK=0
+
+// ntsc progressive (SoTC)
+
+CLKSEL=1 CMOD=0 EX=0 GCONT=0 LC=32 NVCK=1 PCK2=0 PEHS=0 PEVS=0 PHS=0 PRST=1 PVS=0 RC=4 SINT=0 SLCK=0 SLCK2=1 SPML=2 T1248=1 VCKSEL=1 VHP=1 XPCK=0
+
+*/
+
+REG64_(GSReg, SMODE2)
+	uint32 INT:1;
+	uint32 FFMD:1;
+	uint32 DPMS:2;
+	uint32 _PAD2:28;
+	uint32 _PAD3:32;
+REG_END
+
+REG64_(GSReg, SRFSH)
+	uint32 _DUMMY;
+	// TODO
+REG_END
+
+REG64_(GSReg, SYNCH1)
+	uint32 _DUMMY;
+	// TODO
+REG_END
+
+REG64_(GSReg, SYNCH2)
+	uint32 _DUMMY;
+	// TODO
+REG_END
+
+REG64_(GSReg, SYNCV)
+	uint64 _DUMMY;
+	// TODO
+REG_END
+
+REG64_SET(GSReg)
+	GSRegBGCOLOR	BGCOLOR;
+	GSRegBUSDIR		BUSDIR;
+	GSRegCSR		CSR;
+	GSRegDISPFB		DISPFB;
+	GSRegDISPLAY	DISPLAY;
+	GSRegEXTBUF		EXTBUF;
+	GSRegEXTDATA	EXTDATA;
+	GSRegEXTWRITE	EXTWRITE;
+	GSRegIMR		IMR;
+	GSRegPMODE		PMODE;
+	GSRegSIGLBLID	SIGLBLID;
+	GSRegSMODE1		SMODE1;
+	GSRegSMODE2		SMODE2;
+REG_SET_END
+
+//
+// GIFTag
+
+REG128(GIFTag)
+	uint32 NLOOP:15;
+	uint32 EOP:1;
+	uint32 _PAD1:16;
+	uint32 _PAD2:14;
+	uint32 PRE:1;
+	uint32 PRIM:11;
+	uint32 FLG:2; // enum GIF_FLG
+	uint32 NREG:4;
+	uint64 REGS;
+REG_END
+
+// GIFReg
+
+REG64_(GIFReg, ALPHA) // selectors A, B, C, D (2 bits each) in the low word, FIX in the low byte of the high word
+	uint32 A:2;
+	uint32 B:2;
+	uint32 C:2;
+	uint32 D:2;
+	uint32 _PAD1:24;
+	uint8 FIX;
+	uint8 _PAD2[3];
+REG_END2
+	// opaque => output will be Cs/As; the second overload takes the alpha range [amin, amax] instead of reading C/FIX
+	__forceinline bool IsOpaque() const {return ((A == B || (C == 2 && FIX == 0)) && D == 0) || (A == 0 && B == D && C == 2 && FIX == 0x80);}
+	__forceinline bool IsOpaque(int amin, int amax) const {return ((A == B || amax == 0) && D == 0) || (A == 0 && B == D && amin == 0x80 && amax == 0x80);}
+	__forceinline bool IsCd() const { return (A == B) && (D == 1);} // A == B and D == 1; const like the other queries so it works on const registers
+REG_END2
+
+REG64_(GIFReg, BITBLTBUF)
+	uint32 SBP:14;
+	uint32 _PAD1:2;
+	uint32 SBW:6;
+	uint32 _PAD2:2;
+	uint32 SPSM:6;
+	uint32 _PAD3:2;
+	uint32 DBP:14;
+	uint32 _PAD4:2;
+	uint32 DBW:6;
+	uint32 _PAD5:2;
+	uint32 DPSM:6;
+	uint32 _PAD6:2;
+REG_END
+
+REG64_(GIFReg, CLAMP)
+union
+{
+	struct
+	{
+		uint32 WMS:2;
+		uint32 WMT:2;
+		uint32 MINU:10;
+		uint32 MAXU:10;
+		uint32 _PAD1:8;
+		uint32 _PAD2:2;
+		uint32 MAXV:10;
+		uint32 _PAD3:20;
+	};
+
+	struct
+	{
+		uint64 _PAD4:24;
+		uint64 MINV:10;
+		uint64 _PAD5:30;
+	};
+};
+REG_END
+
+REG64_(GIFReg, COLCLAMP)
+	uint32 CLAMP:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, DIMX)
+	int32 DM00:3;
+	int32 _PAD00:1;
+	int32 DM01:3;
+	int32 _PAD01:1;
+	int32 DM02:3;
+	int32 _PAD02:1;
+	int32 DM03:3;
+	int32 _PAD03:1;
+	int32 DM10:3;
+	int32 _PAD10:1;
+	int32 DM11:3;
+	int32 _PAD11:1;
+	int32 DM12:3;
+	int32 _PAD12:1;
+	int32 DM13:3;
+	int32 _PAD13:1;
+	int32 DM20:3;
+	int32 _PAD20:1;
+	int32 DM21:3;
+	int32 _PAD21:1;
+	int32 DM22:3;
+	int32 _PAD22:1;
+	int32 DM23:3;
+	int32 _PAD23:1;
+	int32 DM30:3;
+	int32 _PAD30:1;
+	int32 DM31:3;
+	int32 _PAD31:1;
+	int32 DM32:3;
+	int32 _PAD32:1;
+	int32 DM33:3;
+	int32 _PAD33:1;
+REG_END
+
+REG64_(GIFReg, DTHE)
+	uint32 DTHE:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, FBA)
+	uint32 FBA:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, FINISH)
+	uint32 _PAD1[2];
+REG_END
+
+REG64_(GIFReg, FOG)
+	uint8 _PAD1[7];
+	uint8 F;
+REG_END
+
+REG64_(GIFReg, FOGCOL)
+	uint8 FCR;
+	uint8 FCG;
+	uint8 FCB;
+	uint8 _PAD1[5];
+REG_END
+
+REG64_(GIFReg, FRAME) // low word: FBP / FBW / PSM with padding, high word: FBMSK
+	uint32 FBP:9;
+	uint32 _PAD1:7;
+	uint32 FBW:6;
+	uint32 _PAD2:2;
+	uint32 PSM:6;
+	uint32 _PAD3:2;
+	uint32 FBMSK; // occupies the whole second 32-bit word
+REG_END2
+	uint32 Block() const {return FBP << 5;} // FBP multiplied by 32; presumably the block index of the buffer start - confirm units
+REG_END2
+
+REG64_(GIFReg, HWREG)
+	uint32 DATA_LOWER;
+	uint32 DATA_UPPER;
+REG_END
+
+REG64_(GIFReg, LABEL)
+	uint32 ID;
+	uint32 IDMSK;
+REG_END
+
+REG64_(GIFReg, MIPTBP1)
+	uint64 TBP1:14;
+	uint64 TBW1:6;
+	uint64 TBP2:14;
+	uint64 TBW2:6;
+	uint64 TBP3:14;
+	uint64 TBW3:6;
+	uint64 _PAD:4;
+REG_END
+
+REG64_(GIFReg, MIPTBP2)
+	uint64 TBP4:14;
+	uint64 TBW4:6;
+	uint64 TBP5:14;
+	uint64 TBW5:6;
+	uint64 TBP6:14;
+	uint64 TBW6:6;
+	uint64 _PAD:4;
+REG_END
+
+REG64_(GIFReg, NOP)
+	uint32 _PAD[2];
+REG_END
+
+REG64_(GIFReg, PABE)
+	uint32 PABE:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, PRIM)
+	uint32 PRIM:3;
+	uint32 IIP:1;
+	uint32 TME:1;
+	uint32 FGE:1;
+	uint32 ABE:1;
+	uint32 AA1:1;
+	uint32 FST:1;
+	uint32 CTXT:1;
+	uint32 FIX:1;
+	uint32 _PAD1:21;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, PRMODE)
+	uint32 _PRIM:3;
+	uint32 IIP:1;
+	uint32 TME:1;
+	uint32 FGE:1;
+	uint32 ABE:1;
+	uint32 AA1:1;
+	uint32 FST:1;
+	uint32 CTXT:1;
+	uint32 FIX:1;
+	uint32 _PAD2:21;
+	uint32 _PAD3:32;
+REG_END
+
+REG64_(GIFReg, PRMODECONT)
+	uint32 AC:1;
+	uint32 _PAD1:31;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, RGBAQ)
+	uint8 R;
+	uint8 G;
+	uint8 B;
+	uint8 A;
+	float Q;
+REG_END
+
+REG64_(GIFReg, SCANMSK)
+	uint32 MSK:2;
+	uint32 _PAD1:30;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, SCISSOR)
+	uint32 SCAX0:11;
+	uint32 _PAD1:5;
+	uint32 SCAX1:11;
+	uint32 _PAD2:5;
+	uint32 SCAY0:11;
+	uint32 _PAD3:5;
+	uint32 SCAY1:11;
+	uint32 _PAD4:5;
+REG_END
+
+REG64_(GIFReg, SIGNAL)
+	uint32 ID;
+	uint32 IDMSK;
+REG_END
+
+REG64_(GIFReg, ST)
+	float S;
+	float T;
+REG_END
+
+REG64_(GIFReg, TEST) // all fields live in the low word (19 bits used, 13 padding); the high word is padding
+	uint32 ATE:1;
+	uint32 ATST:3;
+	uint32 AREF:8;
+	uint32 AFAIL:2;
+	uint32 DATE:1;
+	uint32 DATM:1;
+	uint32 ZTE:1;
+	uint32 ZTST:2;
+	uint32 _PAD1:13;
+	uint32 _PAD2:32;
+REG_END2
+	__forceinline bool DoFirstPass() const {return !ATE || ATST != ATST_NEVER;} // not all pixels fail automatically
+	__forceinline bool DoSecondPass() const {return ATE && ATST != ATST_ALWAYS && AFAIL != AFAIL_KEEP;} // pixels may fail, write fb/z
+	__forceinline bool NoSecondPass() const {return ATE && ATST != ATST_ALWAYS && AFAIL == AFAIL_KEEP;} // pixels may fail, no output
+REG_END2
+
+REG64_(GIFReg, TEX0) // two overlaid views of the same 64 bits, see the second struct for TH
+union
+{
+	struct
+	{
+		uint32 TBP0:14;
+		uint32 TBW:6;
+		uint32 PSM:6;
+		uint32 TW:4;
+		uint32 _PAD1:2; // bits 30-31, covered by TH in the 64-bit view below
+		uint32 _PAD2:2; // bits 32-33, covered by TH in the 64-bit view below
+		uint32 TCC:1;
+		uint32 TFX:2;
+		uint32 CBP:14;
+		uint32 CPSM:4;
+		uint32 CSM:1;
+		uint32 CSA:5;
+		uint32 CLD:3;
+	};
+	// TH sits at bits 30-33 and so crosses the 32-bit boundary; it is declared through uint64 bit-fields.
+	struct
+	{
+		uint64 _PAD3:30;
+		uint64 TH:4;
+		uint64 _PAD4:30;
+	};
+};
+REG_END2
+	__forceinline bool IsRepeating() const // true when TBW * 64 is smaller than 1 << TW, with special cases for narrow 8/4-bit formats
+	{
+		if(TBW < 2)
+		{
+			if(PSM == PSM_PSMT8) return TW > 7 || TH > 6;
+			if(PSM == PSM_PSMT4) return TW > 7 || TH > 7;
+		}
+
+		// The cast of TBW looks redundant, but it avoids a flood of warnings from GCC.
+		return ((uint32)TBW << 6u) < (1u << TW);
+	}
+REG_END2
+
+REG64_(GIFReg, TEX1) // low word: LCM .. L with padding, high word: signed K plus padding
+	uint32 LCM:1;
+	uint32 _PAD1:1;
+	uint32 MXL:3;
+	uint32 MMAG:1;
+	uint32 MMIN:3;
+	uint32 MTBA:1;
+	uint32 _PAD2:9;
+	uint32 L:2;
+	uint32 _PAD3:11;
+	int32  K:12; // 1:7:4
+	uint32 _PAD4:20;
+REG_END2
+	bool IsMinLinear() const {return (MMIN == 1) || (MMIN & 4);} // MMIN is 1, or any value with bit 2 set (4..7)
+	bool IsMagLinear() const {return MMAG;} // MMAG is a single bit, non-zero means linear
+REG_END2
+
+REG64_(GIFReg, TEX2)
+	uint32 _PAD1:20;
+	uint32 PSM:6;
+	uint32 _PAD2:6;
+	uint32 _PAD3:5;
+	uint32 CBP:14;
+	uint32 CPSM:4;
+	uint32 CSM:1;
+	uint32 CSA:5;
+	uint32 CLD:3;
+REG_END
+
+REG64_(GIFReg, TEXA)
+	uint8 TA0;
+	uint8 _PAD1:7;
+	uint8 AEM:1;
+	uint16 _PAD2;
+	uint8 TA1:8;
+	uint8 _PAD3[3];
+REG_END
+
+REG64_(GIFReg, TEXCLUT)
+	uint32 CBW:6;
+	uint32 COU:6;
+	uint32 COV:10;
+	uint32 _PAD1:10;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, TEXFLUSH)
+	uint32 _PAD1:32;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, TRXDIR)
+	uint32 XDIR:2;
+	uint32 _PAD1:30;
+	uint32 _PAD2:32;
+REG_END
+
+REG64_(GIFReg, TRXPOS)
+	uint32 SSAX:11;
+	uint32 _PAD1:5;
+	uint32 SSAY:11;
+	uint32 _PAD2:5;
+	uint32 DSAX:11;
+	uint32 _PAD3:5;
+	uint32 DSAY:11;
+	uint32 DIRY:1;
+	uint32 DIRX:1;
+	uint32 _PAD4:3;
+REG_END
+
+REG64_(GIFReg, TRXREG)
+	uint32 RRW:12;
+	uint32 _PAD1:20;
+	uint32 RRH:12;
+	uint32 _PAD2:20;
+REG_END
+
+// GSState::GIFPackedRegHandlerUV and GSState::GIFRegHandlerUV will make sure that the _PAD1/2 bits are set to zero
+
+REG64_(GIFReg, UV)
+	uint16 U;
+//	uint32 _PAD1:2;
+	uint16 V;
+//	uint32 _PAD2:2;
+	uint32 _PAD3;
+REG_END
+
+// GSState::GIFRegHandlerXYOFFSET will make sure that the _PAD1/2 bits are set to zero
+
+REG64_(GIFReg, XYOFFSET)
+	uint32 OFX; // :16; uint32 _PAD1:16;
+	uint32 OFY; // :16; uint32 _PAD2:16;
+REG_END
+
+REG64_(GIFReg, XYZ)
+	uint16 X;
+	uint16 Y;
+	uint32 Z;
+REG_END
+
+REG64_(GIFReg, XYZF)
+	uint16 X;
+	uint16 Y;
+	uint32 Z:24;
+	uint32 F:8;
+REG_END
+
+REG64_(GIFReg, ZBUF) // low word: ZBP and PSM with padding, high word: ZMSK in bit 0
+	uint32 ZBP:9;
+	uint32 _PAD1:15;
+	// uint32 PSM:4;
+	// uint32 _PAD2:4;
+	uint32 PSM:6; // declared 6 bits wide here; the 4-bit variant is kept above for reference
+	uint32 _PAD2:2;
+	uint32 ZMSK:1;
+	uint32 _PAD3:31;
+REG_END2
+	uint32 Block() const {return ZBP << 5;} // ZBP multiplied by 32; presumably the block index of the buffer start - confirm units
+REG_END2
+
+REG64_SET(GIFReg)
+	GIFRegALPHA			ALPHA;
+	GIFRegBITBLTBUF		BITBLTBUF;
+	GIFRegCLAMP			CLAMP;
+	GIFRegCOLCLAMP		COLCLAMP;
+	GIFRegDIMX			DIMX;
+	GIFRegDTHE			DTHE;
+	GIFRegFBA			FBA;
+	GIFRegFINISH		FINISH;
+	GIFRegFOG			FOG;
+	GIFRegFOGCOL		FOGCOL;
+	GIFRegFRAME			FRAME;
+	GIFRegHWREG			HWREG;
+	GIFRegLABEL			LABEL;
+	GIFRegMIPTBP1		MIPTBP1;
+	GIFRegMIPTBP2		MIPTBP2;
+	GIFRegNOP			NOP;
+	GIFRegPABE			PABE;
+	GIFRegPRIM			PRIM;
+	GIFRegPRMODE		PRMODE;
+	GIFRegPRMODECONT	PRMODECONT;
+	GIFRegRGBAQ			RGBAQ;
+	GIFRegSCANMSK		SCANMSK;
+	GIFRegSCISSOR		SCISSOR;
+	GIFRegSIGNAL		SIGNAL;
+	GIFRegST			ST;
+	GIFRegTEST			TEST;
+	GIFRegTEX0			TEX0;
+	GIFRegTEX1			TEX1;
+	GIFRegTEX2			TEX2;
+	GIFRegTEXA			TEXA;
+	GIFRegTEXCLUT		TEXCLUT;
+	GIFRegTEXFLUSH		TEXFLUSH;
+	GIFRegTRXDIR		TRXDIR;
+	GIFRegTRXPOS		TRXPOS;
+	GIFRegTRXREG		TRXREG;
+	GIFRegUV			UV;
+	GIFRegXYOFFSET		XYOFFSET;
+	GIFRegXYZ			XYZ;
+	GIFRegXYZF			XYZF;
+	GIFRegZBUF			ZBUF;
+REG_SET_END
+
+// GIFPacked
+
+REG128_(GIFPacked, PRIM)
+	uint32 PRIM:11;
+	uint32 _PAD1:21;
+	uint32 _PAD2[3];
+REG_END
+
+REG128_(GIFPacked, RGBA)
+	uint8 R;
+	uint8 _PAD1[3];
+	uint8 G;
+	uint8 _PAD2[3];
+	uint8 B;
+	uint8 _PAD3[3];
+	uint8 A;
+	uint8 _PAD4[3];
+REG_END
+
+REG128_(GIFPacked, STQ)
+	float S;
+	float T;
+	float Q;
+	uint32 _PAD1:32;
+REG_END
+
+REG128_(GIFPacked, UV)
+	uint32 U:14;
+	uint32 _PAD1:18;
+	uint32 V:14;
+	uint32 _PAD2:18;
+	uint32 _PAD3:32;
+	uint32 _PAD4:32;
+REG_END
+
+REG128_(GIFPacked, XYZF2) // word 0: X, word 1: Y, word 2: Z (24 bits at bit 4), word 3: F and ADC
+	uint16 X;
+	uint16 _PAD1;
+	uint16 Y;
+	uint16 _PAD2;
+	uint32 _PAD3:4;
+	uint32 Z:24;
+	uint32 _PAD4:4;
+	uint32 _PAD5:4;
+	uint32 F:8;
+	uint32 _PAD6:3;
+	uint32 ADC:1; // bit 15 of word 3 (4 + 8 + 3 bits precede it)
+	uint32 _PAD7:16;
+REG_END2
+	uint32 Skip() const {return u32[3] & 0x8000;} // non-zero when bit 15 of word 3 (the ADC bit) is set; returns the masked value, not 0/1
+REG_END2
+
+REG128_(GIFPacked, XYZ2) // word 0: X, word 1: Y, word 2: full 32-bit Z, word 3: ADC
+	uint16 X;
+	uint16 _PAD1;
+	uint16 Y;
+	uint16 _PAD2;
+	uint32 Z;
+	uint32 _PAD3:15;
+	uint32 ADC:1; // bit 15 of word 3
+	uint32 _PAD4:16;
+REG_END2
+	uint32 Skip() const {return u32[3] & 0x8000;} // non-zero when bit 15 of word 3 (the ADC bit) is set; returns the masked value, not 0/1
+REG_END2
+
+REG128_(GIFPacked, FOG)
+	uint32 _PAD1;
+	uint32 _PAD2;
+	uint32 _PAD3;
+	uint32 _PAD4:4;
+	uint32 F:8;
+	uint32 _PAD5:20;
+REG_END
+
+REG128_(GIFPacked, A_D)
+	uint64 DATA;
+	uint8 ADDR:8; // enum GIF_A_D_REG
+	uint8 _PAD1[3+4];
+REG_END
+
+REG128_(GIFPacked, NOP)
+	uint32 _PAD1;
+	uint32 _PAD2;
+	uint32 _PAD3;
+	uint32 _PAD4;
+REG_END
+
+REG128_SET(GIFPackedReg)
+	GIFReg			r;
+	GIFPackedPRIM	PRIM;
+	GIFPackedRGBA	RGBA;
+	GIFPackedSTQ	STQ;
+	GIFPackedUV		UV;
+	GIFPackedXYZF2	XYZF2;
+	GIFPackedXYZ2	XYZ2;
+	GIFPackedFOG	FOG;
+	GIFPackedA_D	A_D;
+	GIFPackedNOP	NOP;
+REG_SET_END
+
+__aligned(struct, 32) GIFPath // decoding state for one GIF tag: the tag itself plus loop / register cursors
+{
+	GIFTag tag;
+	uint32 nloop; // loops still to process; set by SetTag, counted down by StepReg
+	uint32 nreg; // register descriptors per loop, 1..16
+	uint32 reg; // index of the current descriptor inside the loop
+	uint32 type; // one of the TYPE_* values below
+	GSVector4i regs; // register descriptors, read one byte per descriptor by GetReg
+
+	enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2, TYPE_STQRGBAXYZ2};
+
+	__forceinline void SetTag(const void* mem) // loads a 128-bit tag from mem and classifies common packed layouts into type
+	{
+		const GIFTag* RESTRICT src = (const GIFTag*)mem;
+
+		// the compiler has a hard time not reloading every time a field of src is accessed
+
+		uint32 a = src->u32[0];
+		uint32 b = src->u32[1];
+
+		tag.u32[0] = a;
+		tag.u32[1] = b;
+
+		nloop = a & 0x7fff; // low 15 bits of word 0, i.e. GIFTag::NLOOP
+
+		if(nloop == 0) return; // nothing to loop over: nreg, regs, reg and type keep their previous values
+
+		GSVector4i v = GSVector4i::loadl(&src->REGS); // REGS not stored to tag.REGS, only into this->regs, restored before saving the state though
+
+		nreg = (b & 0xf0000000) ? (b >> 28) : 16; // src->NREG
+		regs = v.upl8(v >> 4) & GSVector4i::x0f(nreg); // presumably widens the 4-bit descriptors to one per byte and masks unused ones - confirm against GSVector4i
+		reg = 0;
+
+		type = TYPE_UNKNOWN;
+
+		if(tag.FLG == GIF_FLG_PACKED)
+		{
+			if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1) // all nreg descriptors equal 0x0e
+			{
+				type = TYPE_ADONLY;
+			}
+			else
+			{
+				switch(nreg) // every value 1..16 is listed so that the default branch can be declared unreachable
+				{
+				case 1: break;
+				case 2: break;
+				case 3:
+					if(regs.u32[0] == 0x00040102) type = TYPE_STQRGBAXYZF2; // many games, TODO: formats mixed with NOPs (xeno2: 040f010f02, 04010f020f, mgs3: 04010f0f02, 0401020f0f, 04010f020f)
+					if(regs.u32[0] == 0x00050102) type = TYPE_STQRGBAXYZ2; // GoW (has other crazy formats, like ...030503050103)
+					// TODO: common types with UV instead
+					break;
+				case 4: break;
+				case 5: break;
+				case 6: break;
+				case 7: break;
+				case 8: break;
+				case 9: // descriptors 02,01,04 repeated three times: handled as 3 registers with three times the loops
+					if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x00000004) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 3;} // ffx
+					break;
+				case 10: break;
+				case 11: break;
+				case 12: // descriptors 02,01,04 repeated four times: handled as 3 registers with four times the loops
+					if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x04010204) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 4;} // dq8 (not many, mostly 040102)
+					break;
+				case 13: break;
+				case 14: break;
+				case 15: break;
+				case 16: break;
+				default:
+					__assume(0);
+				}
+			}
+		}
+	}
+
+	__forceinline uint8 GetReg() const // descriptor at the current cursor position
+	{
+		return regs.u8[reg];
+	}
+
+	__forceinline uint8 GetReg(uint32 index) const // descriptor at an explicit position; index is not range-checked
+	{
+		return regs.u8[index];
+	}
+
+	__forceinline bool StepReg() // advances the cursor; returns false once the last register of the last loop was consumed
+	{
+		if(++reg == nreg)
+		{
+			reg = 0; // wrap to the first descriptor of the next loop
+
+			if(--nloop == 0)
+			{
+				return false;
+			}
+		}
+
+		return true;
+	}
+};
+
+struct GSPrivRegSet // two consecutive 0x1000-byte register pages; each register is followed by a uint64 of padding
+{
+	union // first page
+	{
+		struct
+		{
+			GSRegPMODE		PMODE;
+			uint64			_pad1;
+			GSRegSMODE1		SMODE1;
+			uint64			_pad2;
+			GSRegSMODE2		SMODE2;
+			uint64			_pad3;
+			GSRegSRFSH		SRFSH;
+			uint64			_pad4;
+			GSRegSYNCH1		SYNCH1;
+			uint64			_pad5;
+			GSRegSYNCH2		SYNCH2;
+			uint64			_pad6;
+			GSRegSYNCV		SYNCV;
+			uint64			_pad7;
+			struct { // DISPFB / DISPLAY pair, instantiated twice as DISP[0] and DISP[1]
+			GSRegDISPFB		DISPFB;
+			uint64			_pad1;
+			GSRegDISPLAY	DISPLAY;
+			uint64			_pad2;
+			} DISP[2];
+			GSRegEXTBUF		EXTBUF;
+			uint64			_pad8;
+			GSRegEXTDATA	EXTDATA;
+			uint64			_pad9;
+			GSRegEXTWRITE	EXTWRITE;
+			uint64			_pad10;
+			GSRegBGCOLOR	BGCOLOR;
+			uint64			_pad11;
+		};
+
+		uint8 _pad12[0x1000]; // forces the first page to span 0x1000 bytes
+	};
+
+	union // second page
+	{
+		struct
+		{
+			GSRegCSR		CSR;
+			uint64			_pad13;
+			GSRegIMR		IMR;
+			uint64			_pad14;
+			uint64			_unk1[4]; // four unnamed 64-bit slots between IMR and BUSDIR
+			GSRegBUSDIR		BUSDIR;
+			uint64			_pad15;
+			uint64			_unk2[6]; // six unnamed 64-bit slots between BUSDIR and SIGLBLID
+			GSRegSIGLBLID	SIGLBLID;
+			uint64			_pad16;
+		};
+
+		uint8 _pad17[0x1000]; // forces the second page to span 0x1000 bytes
+	};
+};
+
+#pragma pack(pop)
+
+enum {KEYPRESS=1, KEYRELEASE=2}; // presumably the values stored in GSKeyEventData::type - confirm against callers
+struct GSKeyEventData {uint32 key, type;};
+
+enum {FREEZE_LOAD=0, FREEZE_SAVE=1, FREEZE_SIZE=2};
+struct GSFreezeData {int size; uint8* data;};
+
+enum stateType {ST_WRITE, ST_TRANSFER, ST_VSYNC};
+
+// default gs config settings
+#define DEFAULT_EXTRA_RENDERING_THREADS 2
+
+// GS video mode predicates. Each one expands to an expression reading m_regs->SMODE1, so m_regs must be in scope at the point of use.
+#define Vmode_VESA_DTV			(m_regs->SMODE1.CMOD == 0) // CMOD 0; the VESA / DTV modes below additionally test LC
+#define Vmode_NTSC				(m_regs->SMODE1.CMOD == 2)
+#define Vmode_PAL				(m_regs->SMODE1.CMOD == 3)
+#define Vmode_VESA_1A			(m_regs->SMODE1.LC == 15 && Vmode_VESA_DTV)
+#define Vmode_VESA_1C			(m_regs->SMODE1.LC == 28 && Vmode_VESA_DTV)
+#define Vmode_VESA_2B			(m_regs->SMODE1.LC == 71 && Vmode_VESA_DTV)
+#define Vmode_VESA_2D			(m_regs->SMODE1.LC == 44 && Vmode_VESA_DTV)
+#define Vmode_VESA_3B			(m_regs->SMODE1.LC == 58 && Vmode_VESA_DTV)
+#define Vmode_VESA_3D			(m_regs->SMODE1.LC == 35 && Vmode_VESA_DTV)
+#define Vmode_VESA_4A			(m_regs->SMODE1.LC == 8  && Vmode_VESA_DTV)
+#define Vmode_VESA_4B			(m_regs->SMODE1.LC == 10 && Vmode_VESA_DTV)
+#define Vmode_DTV_480P			(m_regs->SMODE1.LC == 32 && Vmode_VESA_DTV)
+#define Vmode_DTV_720P_1080I	(m_regs->SMODE1.LC == 22 && Vmode_VESA_DTV)
diff --git a/plugins/GSdx_legacy/GSAlignedClass.cpp b/plugins/GSdx_legacy/GSAlignedClass.cpp
new file mode 100644
index 0000000000..6940b5ae8b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSAlignedClass.cpp
@@ -0,0 +1,23 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSAlignedClass.h"
diff --git a/plugins/GSdx_legacy/GSAlignedClass.h b/plugins/GSdx_legacy/GSAlignedClass.h
new file mode 100644
index 0000000000..81e83180b0
--- /dev/null
+++ b/plugins/GSdx_legacy/GSAlignedClass.h
@@ -0,0 +1,49 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include <new>
+
+// Base class for objects that need their heap storage aligned to i bytes.
+// operator new / new[] obtain memory from _aligned_malloc(size, i) and
+// operator delete / delete[] release it with _aligned_free.
+// Allocation failure is reported by throwing std::bad_alloc.
+template<int i> class GSAlignedClass
+{
+	// Shared by new and new[]. A throwing allocation function must never
+	// return a null pointer, so a failed _aligned_malloc becomes std::bad_alloc.
+	static void* AllocAligned(size_t size)
+	{
+		void* p = _aligned_malloc(size, i);
+		if(!p) throw std::bad_alloc();
+		return p;
+	}
+
+public:
+	GSAlignedClass() {}
+	virtual ~GSAlignedClass() {}
+
+	void* operator new (size_t size) {return AllocAligned(size);}
+	void operator delete (void* p) {_aligned_free(p);}
+	void* operator new [] (size_t size) {return AllocAligned(size);}
+	void operator delete [] (void* p) {_aligned_free(p);}
+};
diff --git a/plugins/GSdx_legacy/GSBlock.cpp b/plugins/GSdx_legacy/GSBlock.cpp
new file mode 100644
index 0000000000..e010f700c7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSBlock.cpp
@@ -0,0 +1,48 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSBlock.h"
+
+#if _M_SSE >= 0x501 // GSVector8i build: the 16-entry pattern below is listed twice
+const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
+#else
+const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
+#endif
+const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
+const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+// Single-field bit masks: bit 15, bits 10-14, bits 5-9 and bits 0-4 respectively.
+#if _M_SSE >= 0x501
+const GSVector8i GSBlock::m_xxxa(0x00008000);
+const GSVector8i GSBlock::m_xxbx(0x00007c00);
+const GSVector8i GSBlock::m_xgxx(0x000003e0);
+const GSVector8i GSBlock::m_rxxx(0x0000001f);
+#else
+const GSVector4i GSBlock::m_xxxa(0x00008000);
+const GSVector4i GSBlock::m_xxbx(0x00007c00);
+const GSVector4i GSBlock::m_xgxx(0x000003e0);
+const GSVector4i GSBlock::m_rxxx(0x0000001f);
+#endif
+// Four index tables; each index appears four times in a row, table k covers indices 2k, 2k+1, 2k+8, 2k+9.
+const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
+const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
+const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
+const GSVector4i GSBlock::m_uw8hmask3(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
diff --git a/plugins/GSdx_legacy/GSBlock.h b/plugins/GSdx_legacy/GSBlock.h
new file mode 100644
index 0000000000..a18f597299
--- /dev/null
+++ b/plugins/GSdx_legacy/GSBlock.h
@@ -0,0 +1,2195 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSTables.h"
+#include "GSVector.h"
+
+class GSBlock
+{
+	#if _M_SSE >= 0x501
+	static const GSVector8i m_r16mask;
+	#else
+	static const GSVector4i m_r16mask;
+	#endif
+	static const GSVector4i m_r8mask;
+	static const GSVector4i m_r4mask;
+
+	#if _M_SSE >= 0x501
+	static const GSVector8i m_xxxa;
+	static const GSVector8i m_xxbx;
+	static const GSVector8i m_xgxx;
+	static const GSVector8i m_rxxx;
+	#else
+	static const GSVector4i m_xxxa;
+	static const GSVector4i m_xxbx;
+	static const GSVector4i m_xgxx;
+	static const GSVector4i m_rxxx;
+	#endif
+
+	static const GSVector4i m_uw8hmask0;
+	static const GSVector4i m_uw8hmask1;
+	static const GSVector4i m_uw8hmask2;
+	static const GSVector4i m_uw8hmask3;
+
+public:
+	template<int i, int alignment, uint32 mask> __forceinline static void WriteColumn32(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const uint8* RESTRICT s0 = &src[srcpitch * 0];
+		const uint8* RESTRICT s1 = &src[srcpitch * 1];
+		// Reads 32 bytes from each of two consecutive source rows (s0, s1) and stores them as column i of dst; mask selects which bits of each 32-bit value are written.
+		#if _M_SSE >= 0x501 // GSVector8i path
+
+		GSVector8i v0, v1;
+
+		if(alignment == 32) // whole-row load through load<true>
+		{
+			v0 = GSVector8i::load<true>(s0).acbd();
+			v1 = GSVector8i::load<true>(s1).acbd();
+
+			GSVector8i::sw64(v0, v1);
+		}
+		else
+		{
+			if(alignment == 16) // each row is loaded as two 16-byte halves
+			{
+				v0 = GSVector8i::load(&s0[0], &s0[16]).acbd();
+				v1 = GSVector8i::load(&s1[0], &s1[16]).acbd();
+
+				GSVector8i::sw64(v0, v1);
+			}
+			else
+			{
+				//v0 = GSVector8i::load(&s0[0], &s0[16], &s0[8], &s0[24]);
+				//v1 = GSVector8i::load(&s1[0], &s1[16], &s1[8], &s1[24]);
+
+				GSVector4i v4 = GSVector4i::load(&s0[0], &s1[0]); // each vector pairs the same 8-byte offset of both rows
+				GSVector4i v5 = GSVector4i::load(&s0[8], &s1[8]);
+				GSVector4i v6 = GSVector4i::load(&s0[16], &s1[16]);
+				GSVector4i v7 = GSVector4i::load(&s0[24], &s1[24]);
+
+				if(mask == 0xffffffff)
+				{
+					// just write them out directly
+
+					((GSVector4i*)dst)[i * 4 + 0] = v4;
+					((GSVector4i*)dst)[i * 4 + 1] = v5;
+					((GSVector4i*)dst)[i * 4 + 2] = v6;
+					((GSVector4i*)dst)[i * 4 + 3] = v7;
+
+					return;
+				}
+
+				v0 = GSVector8i::cast(v4).insert<1>(v5); // combine the halves so the masked store below can be shared
+				v1 = GSVector8i::cast(v6).insert<1>(v7);
+			}
+		}
+
+		if(mask == 0xffffffff) // full write: overwrite the column
+		{
+			((GSVector8i*)dst)[i * 2 + 0] = v0;
+			((GSVector8i*)dst)[i * 2 + 1] = v1;
+		}
+		else // partial write: merge with what dst already holds
+		{
+			GSVector8i v2((int)mask);
+
+			if(mask == 0xff000000 || mask == 0x00ffffff) // masks made of whole bytes use blend8
+			{
+				((GSVector8i*)dst)[i * 2 + 0] = ((GSVector8i*)dst)[i * 2 + 0].blend8(v0, v2);
+				((GSVector8i*)dst)[i * 2 + 1] = ((GSVector8i*)dst)[i * 2 + 1].blend8(v1, v2);
+			}
+			else
+			{
+				((GSVector8i*)dst)[i * 2 + 0] = ((GSVector8i*)dst)[i * 2 + 0].blend(v0, v2);
+				((GSVector8i*)dst)[i * 2 + 1] = ((GSVector8i*)dst)[i * 2 + 1].blend(v1, v2);
+			}
+		}
+
+		#else // GSVector4i path
+
+		GSVector4i v0, v1, v2, v3;
+
+		if(alignment != 0) // row-wise loads through load<true>, then swapped into the interleaved layout
+		{
+			v0 = GSVector4i::load<true>(&s0[0]);
+			v1 = GSVector4i::load<true>(&s0[16]);
+			v2 = GSVector4i::load<true>(&s1[0]);
+			v3 = GSVector4i::load<true>(&s1[16]);
+
+			GSVector4i::sw64(v0, v2, v1, v3);
+		}
+		else // 8-byte loads that already pair both rows
+		{
+			v0 = GSVector4i::load(&s0[0], &s1[0]);
+			v1 = GSVector4i::load(&s0[8], &s1[8]);
+			v2 = GSVector4i::load(&s0[16], &s1[16]);
+			v3 = GSVector4i::load(&s0[24], &s1[24]);
+		}
+
+		if(mask == 0xffffffff) // full write: overwrite the column
+		{
+			((GSVector4i*)dst)[i * 4 + 0] = v0;
+			((GSVector4i*)dst)[i * 4 + 1] = v1;
+			((GSVector4i*)dst)[i * 4 + 2] = v2;
+			((GSVector4i*)dst)[i * 4 + 3] = v3;
+		}
+		else // partial write: merge with what dst already holds
+		{
+			GSVector4i v4((int)mask);
+
+			#if _M_SSE >= 0x401 // blend8 is only used from this level up
+
+			if(mask == 0xff000000 || mask == 0x00ffffff)
+			{
+				((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, v4);
+				((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, v4);
+				((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, v4);
+				((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, v4);
+			}
+			else
+			{
+
+			#endif
+			// Below the 0x401 level these four statements are the only masked store; above it they form the else branch.
+			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4);
+			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4);
+			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4);
+			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4);
+
+			#if _M_SSE >= 0x401
+
+			}
+
+			#endif
+		}
+
+		#endif
+	}
+
+	template<int i, int alignment> __forceinline static void WriteColumn16(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const uint8* RESTRICT s0 = &src[srcpitch * 0];
+		const uint8* RESTRICT s1 = &src[srcpitch * 1];
+		// Reads 32 bytes from each of two consecutive source rows and stores them, reordered, as column i of dst.
+		// for(int j = 0; j < 16; j++) {((uint16*)s0)[j] = columnTable16[0][j]; ((uint16*)s1)[j] = columnTable16[1][j];}
+
+		#if _M_SSE >= 0x501 // GSVector8i path
+
+		GSVector8i v0, v1;
+
+		if(alignment == 32) // whole-row load through load<true>
+		{
+			v0 = GSVector8i::load<true>(s0);
+			v1 = GSVector8i::load<true>(s1);
+
+			GSVector8i::sw128(v0, v1);
+			GSVector8i::sw16(v0, v1);
+		}
+		else
+		{
+			if(alignment == 16) // 16-byte pieces, first from s0 and second from s1
+			{
+				v0 = GSVector8i::load(&s0[0], &s1[0]);
+				v1 = GSVector8i::load(&s0[16], &s1[16]);
+			}
+			else // 8-byte pieces
+			{
+				v0 = GSVector8i::load(&s0[0], &s0[8], &s1[0], &s1[8]);
+				v1 = GSVector8i::load(&s0[16], &s0[24], &s1[16], &s1[24]);
+			}
+
+			GSVector8i::sw16(v0, v1);
+		}
+
+		v0 = v0.acbd();
+		v1 = v1.acbd();
+		
+		((GSVector8i*)dst)[i * 2 + 0] = v0;
+		((GSVector8i*)dst)[i * 2 + 1] = v1;
+
+		#else // GSVector4i path
+
+		GSVector4i v0, v1, v2, v3;
+
+		if(alignment != 0) // row-wise loads through load<true>
+		{
+			v0 = GSVector4i::load<true>(&s0[0]);
+			v1 = GSVector4i::load<true>(&s0[16]);
+			v2 = GSVector4i::load<true>(&s1[0]);
+			v3 = GSVector4i::load<true>(&s1[16]);
+
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw64(v0, v1, v2, v3);
+		}
+		else // 8-byte loads combined with upl16, which takes the place of the sw16 step above
+		{
+			v0 = GSVector4i::loadl(&s0[0]).upl16(GSVector4i::loadl(&s0[16]));
+			v2 = GSVector4i::loadl(&s0[8]).upl16(GSVector4i::loadl(&s0[24]));
+			v1 = GSVector4i::loadl(&s1[0]).upl16(GSVector4i::loadl(&s1[16]));
+			v3 = GSVector4i::loadl(&s1[8]).upl16(GSVector4i::loadl(&s1[24]));
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+		}
+		// Note the store order: v0, v2, v1, v3.
+		((GSVector4i*)dst)[i * 4 + 0] = v0;
+		((GSVector4i*)dst)[i * 4 + 1] = v2;
+		((GSVector4i*)dst)[i * 4 + 2] = v1;
+		((GSVector4i*)dst)[i * 4 + 3] = v3;
+
+		#endif
+	}
+
+	template<int i, int alignment> __forceinline static void WriteColumn8(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		// TODO: read unaligned as WriteColumn32 does and try saving a few shuffles
+		// Reads 16 bytes from each of four consecutive source rows and stores them, reordered, as column i of dst.
+		#if _M_SSE >= 0x501 // GSVector8i path
+
+		GSVector4i v4 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
+		GSVector4i v5 = GSVector4i::load<alignment != 0>(&src[srcpitch * 1]);
+		GSVector4i v6 = GSVector4i::load<alignment != 0>(&src[srcpitch * 2]);
+		GSVector4i v7 = GSVector4i::load<alignment != 0>(&src[srcpitch * 3]);
+
+		GSVector8i v0(v4, v5);
+		GSVector8i v1(v6, v7);
+
+		if((i & 1) == 0) // even columns reorder the last two rows, odd columns the first two
+		{
+			v1 = v1.yxwz();
+		}
+		else 
+		{
+			v0 = v0.yxwz();
+		}
+
+		GSVector8i::sw8(v0, v1);
+		GSVector8i::sw16(v0, v1);
+		
+		v0 = v0.acbd();
+		v1 = v1.acbd();
+
+		((GSVector8i*)dst)[i * 2 + 0] = v0;
+		((GSVector8i*)dst)[i * 2 + 1] = v1;
+
+		#else // GSVector4i path
+
+		GSVector4i v0 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
+		GSVector4i v1 = GSVector4i::load<alignment != 0>(&src[srcpitch * 1]);
+		GSVector4i v2 = GSVector4i::load<alignment != 0>(&src[srcpitch * 2]);
+		GSVector4i v3 = GSVector4i::load<alignment != 0>(&src[srcpitch * 3]);
+
+		if((i & 1) == 0) // even columns reorder the last two rows, odd columns the first two
+		{
+			v2 = v2.yxwz();
+			v3 = v3.yxwz();
+		}
+		else
+		{
+			v0 = v0.yxwz();
+			v1 = v1.yxwz();
+		}
+
+		GSVector4i::sw8(v0, v2, v1, v3);
+		GSVector4i::sw16(v0, v1, v2, v3);
+		GSVector4i::sw64(v0, v1, v2, v3);
+		// Note the store order: v0, v2, v1, v3.
+		((GSVector4i*)dst)[i * 4 + 0] = v0;
+		((GSVector4i*)dst)[i * 4 + 1] = v2;
+		((GSVector4i*)dst)[i * 4 + 2] = v1;
+		((GSVector4i*)dst)[i * 4 + 3] = v3;
+
+		#endif
+	}
+
+	template<int i, int alignment> __forceinline static void WriteColumn4(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		//printf("WriteColumn4\n");
+		// Reads 16 bytes from each of four consecutive source rows and stores them, reordered, as column i of dst.
+		// TODO: read unaligned as WriteColumn32 does and try saving a few shuffles
+
+		// TODO: pshufb
+
+		GSVector4i v0 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
+		GSVector4i v1 = GSVector4i::load<alignment != 0>(&src[srcpitch * 1]);
+		GSVector4i v2 = GSVector4i::load<alignment != 0>(&src[srcpitch * 2]);
+		GSVector4i v3 = GSVector4i::load<alignment != 0>(&src[srcpitch * 3]);
+
+		if((i & 1) == 0) // even columns reorder the last two rows, odd columns the first two
+		{
+			v2 = v2.yxwzlh();
+			v3 = v3.yxwzlh();
+		}
+		else
+		{
+			v0 = v0.yxwzlh();
+			v1 = v1.yxwzlh();
+		}
+		// The argument order differs between the swap steps below and must be kept exactly as written.
+		GSVector4i::sw4(v0, v2, v1, v3);
+		GSVector4i::sw8(v0, v1, v2, v3);
+		GSVector4i::sw8(v0, v2, v1, v3);
+		GSVector4i::sw64(v0, v2, v1, v3);
+
+		((GSVector4i*)dst)[i * 4 + 0] = v0;
+		((GSVector4i*)dst)[i * 4 + 1] = v1;
+		((GSVector4i*)dst)[i * 4 + 2] = v2;
+		((GSVector4i*)dst)[i * 4 + 3] = v3;
+	}
+
+	// Runtime entry point: derives the column index from y (two rows per column,
+	// four columns) and forwards to the matching WriteColumn32<i, alignment, mask>.
+	template<int alignment, uint32 mask> static void WriteColumn32(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const int column = (y >> 1) & 3;
+
+		if(column == 0) WriteColumn32<0, alignment, mask>(dst, src, srcpitch);
+		else if(column == 1) WriteColumn32<1, alignment, mask>(dst, src, srcpitch);
+		else if(column == 2) WriteColumn32<2, alignment, mask>(dst, src, srcpitch);
+		else WriteColumn32<3, alignment, mask>(dst, src, srcpitch);
+	}
+
+	// Runtime dispatcher for the 16-bit column writer: derives the column index
+	// (y >> 1) & 3 and forwards dst/src/srcpitch unchanged to WriteColumn16<index, alignment>.
+	template<int alignment> static void WriteColumn16(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const int column = (y >> 1) & 3;
+
+		switch(column)
+		{
+		case 0:
+			WriteColumn16<0, alignment>(dst, src, srcpitch);
+			break;
+		case 1:
+			WriteColumn16<1, alignment>(dst, src, srcpitch);
+			break;
+		case 2:
+			WriteColumn16<2, alignment>(dst, src, srcpitch);
+			break;
+		case 3:
+			WriteColumn16<3, alignment>(dst, src, srcpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Runtime dispatcher for the 8-bit column writer: derives the column index
+	// (y >> 2) & 3 and forwards dst/src/srcpitch unchanged to WriteColumn8<index, alignment>.
+	template<int alignment> static void WriteColumn8(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const int column = (y >> 2) & 3;
+
+		switch(column)
+		{
+		case 0:
+			WriteColumn8<0, alignment>(dst, src, srcpitch);
+			break;
+		case 1:
+			WriteColumn8<1, alignment>(dst, src, srcpitch);
+			break;
+		case 2:
+			WriteColumn8<2, alignment>(dst, src, srcpitch);
+			break;
+		case 3:
+			WriteColumn8<3, alignment>(dst, src, srcpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Runtime dispatcher for the 4-bit column writer: derives the column index
+	// (y >> 2) & 3 and forwards dst/src/srcpitch unchanged to WriteColumn4<index, alignment>.
+	template<int alignment> static void WriteColumn4(int y, uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		const int column = (y >> 2) & 3;
+
+		switch(column)
+		{
+		case 0:
+			WriteColumn4<0, alignment>(dst, src, srcpitch);
+			break;
+		case 1:
+			WriteColumn4<1, alignment>(dst, src, srcpitch);
+			break;
+		case 2:
+			WriteColumn4<2, alignment>(dst, src, srcpitch);
+			break;
+		case 3:
+			WriteColumn4<3, alignment>(dst, src, srcpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Writes a whole 32-bit block as four columns. Column k (0..3) starts k * 2 source rows
+	// below src; every column call receives the same dst base and the same srcpitch.
+	template<int alignment, uint32 mask> static void WriteBlock32(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		// Byte distance between the first source rows of two consecutive columns.
+		const int columnStep = srcpitch * 2;
+
+		const uint8* column1 = src + columnStep;
+		const uint8* column2 = column1 + columnStep;
+		const uint8* column3 = column2 + columnStep;
+
+		WriteColumn32<0, alignment, mask>(dst, src, srcpitch);
+		WriteColumn32<1, alignment, mask>(dst, column1, srcpitch);
+		WriteColumn32<2, alignment, mask>(dst, column2, srcpitch);
+		WriteColumn32<3, alignment, mask>(dst, column3, srcpitch);
+	}
+
+	// Writes a whole 16-bit block as four columns. Column k (0..3) starts k * 2 source rows
+	// below src; every column call receives the same dst base and the same srcpitch.
+	template<int alignment> static void WriteBlock16(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		// Byte distance between the first source rows of two consecutive columns.
+		const int columnStep = srcpitch * 2;
+
+		const uint8* column1 = src + columnStep;
+		const uint8* column2 = column1 + columnStep;
+		const uint8* column3 = column2 + columnStep;
+
+		WriteColumn16<0, alignment>(dst, src, srcpitch);
+		WriteColumn16<1, alignment>(dst, column1, srcpitch);
+		WriteColumn16<2, alignment>(dst, column2, srcpitch);
+		WriteColumn16<3, alignment>(dst, column3, srcpitch);
+	}
+
+	// Writes a whole 8-bit block as four columns. Column k (0..3) starts k * 4 source rows
+	// below src; every column call receives the same dst base and the same srcpitch.
+	template<int alignment> static void WriteBlock8(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		// Byte distance between the first source rows of two consecutive columns.
+		const int columnStep = srcpitch * 4;
+
+		const uint8* column1 = src + columnStep;
+		const uint8* column2 = column1 + columnStep;
+		const uint8* column3 = column2 + columnStep;
+
+		WriteColumn8<0, alignment>(dst, src, srcpitch);
+		WriteColumn8<1, alignment>(dst, column1, srcpitch);
+		WriteColumn8<2, alignment>(dst, column2, srcpitch);
+		WriteColumn8<3, alignment>(dst, column3, srcpitch);
+	}
+
+	// Writes a whole 4-bit block as four columns. Column k (0..3) starts k * 4 source rows
+	// below src; every column call receives the same dst base and the same srcpitch.
+	template<int alignment> static void WriteBlock4(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
+	{
+		// Byte distance between the first source rows of two consecutive columns.
+		const int columnStep = srcpitch * 4;
+
+		const uint8* column1 = src + columnStep;
+		const uint8* column2 = column1 + columnStep;
+		const uint8* column3 = column2 + columnStep;
+
+		WriteColumn4<0, alignment>(dst, src, srcpitch);
+		WriteColumn4<1, alignment>(dst, column1, srcpitch);
+		WriteColumn4<2, alignment>(dst, column2, srcpitch);
+		WriteColumn4<3, alignment>(dst, column3, srcpitch);
+	}
+
+	// Reads column i of a 32-bit block from src and writes it as two destination rows
+	// (dst and dst + dstpitch).
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x501: works on two GSVector8i values (slots i * 2 + 0..1 of src), one per row.
+	//  - otherwise: works on four GSVector4i values (slots i * 4 + 0..3 of src), two per row.
+	//
+	// NOTE(review): store<true> presumably selects aligned stores, which would require dst and
+	// dstpitch to keep every row suitably aligned - confirm in GSVector4i/GSVector8i.
+	template<int i> __forceinline static void ReadColumn32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0 = s[i * 2 + 0];
+		GSVector8i v1 = s[i * 2 + 1];
+
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw64(v0, v1);
+
+		GSVector8i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector8i::store<true>(&dst[dstpitch * 1], v1);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+		
+		GSVector4i v0 = s[i * 4 + 0];
+		GSVector4i v1 = s[i * 4 + 1];
+		GSVector4i v2 = s[i * 4 + 2];
+		GSVector4i v3 = s[i * 4 + 3];
+
+		GSVector4i::sw64(v0, v1, v2, v3);
+
+		// d0 / d1 address the first and second destination row as arrays of vectors.
+		GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+		GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+		GSVector4i::store<true>(&d0[0], v0);
+		GSVector4i::store<true>(&d0[1], v1);
+		GSVector4i::store<true>(&d1[0], v2);
+		GSVector4i::store<true>(&d1[1], v3);
+
+		#endif
+	}
+
+	// Reads column i of a 16-bit block from src and writes it as two destination rows
+	// (dst and dst + dstpitch).
+	//
+	// Three compile-time variants selected by _M_SSE:
+	//  - >= 0x501: GSVector8i path, pre-shuffling each input with shuffle8(m_r16mask).
+	//  - >= 0x301: GSVector4i path, also using shuffle8(m_r16mask).
+	//  - otherwise: GSVector4i path built only from sw16/sw32 interleaves.
+	//
+	// Note that the store order of v1/v2 differs between the second and third variant; each
+	// order matches the shuffle sequence of its own branch.
+	template<int i> __forceinline static void ReadColumn16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask);
+		GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask);
+
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw32(v0, v1);
+
+		v0 = v0.acbd();
+		v1 = v1.acbd();
+
+		GSVector8i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector8i::store<true>(&dst[dstpitch * 1], v1);
+
+		#elif _M_SSE >= 0x301
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0 = s[i * 4 + 0].shuffle8(m_r16mask);
+		GSVector4i v1 = s[i * 4 + 1].shuffle8(m_r16mask);
+		GSVector4i v2 = s[i * 4 + 2].shuffle8(m_r16mask);
+		GSVector4i v3 = s[i * 4 + 3].shuffle8(m_r16mask);
+
+		GSVector4i::sw32(v0, v1, v2, v3);
+		GSVector4i::sw64(v0, v1, v2, v3);
+
+		GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+		GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+		// Row 0 receives v0, v2 and row 1 receives v1, v3 in this variant.
+		GSVector4i::store<true>(&d0[0], v0);
+		GSVector4i::store<true>(&d0[1], v2);
+		GSVector4i::store<true>(&d1[0], v1);
+		GSVector4i::store<true>(&d1[1], v3);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0 = s[i * 4 + 0];
+		GSVector4i v1 = s[i * 4 + 1];
+		GSVector4i v2 = s[i * 4 + 2];
+		GSVector4i v3 = s[i * 4 + 3];
+
+		//for(int16 i = 0; i < 8; i++) {v0.i16[i] = i; v1.i16[i] = i + 8; v2.i16[i] = i + 16; v3.i16[i] = i + 24;}
+
+		GSVector4i::sw16(v0, v1, v2, v3);
+		GSVector4i::sw32(v0, v1, v2, v3);
+		GSVector4i::sw16(v0, v2, v1, v3);
+
+		GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+		GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+		// Row 0 receives v0, v1 and row 1 receives v2, v3 in this variant.
+		GSVector4i::store<true>(&d0[0], v0);
+		GSVector4i::store<true>(&d0[1], v1);
+		GSVector4i::store<true>(&d1[0], v2);
+		GSVector4i::store<true>(&d1[1], v3);
+
+		#endif
+	}
+
+	// Reads column i of an 8-bit block from src (vector slots i * 4 + 0..3) and writes it as
+	// four destination rows (dst + dstpitch * 0..3), one GSVector4i per row.
+	//
+	// Compile-time variants:
+	//  - the GSVector8i branch is disabled by "#if 0" and is never compiled;
+	//  - _M_SSE >= 0x301: swaps the two input pairs for odd columns, then uses
+	//    shuffle8(m_r8mask) plus sw16/sw32;
+	//  - otherwise: uses sw8/sw16/sw64 interleaves and applies yxwz() to one row pair
+	//    depending on the parity of i.
+	template<int i> __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		//for(int j = 0; j < 64; j++) ((uint8*)src)[j] = (uint8)j;
+
+		#if 0//_M_SSE >= 0x501
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0 = s[i * 2 + 0];
+		GSVector8i v1 = s[i * 2 + 1];
+
+		GSVector8i::sw8(v0, v1);
+		GSVector8i::sw16(v0, v1);
+		GSVector8i::sw8(v0, v1);
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw16(v0, v1);
+
+		v0 = v0.acbd();
+		v1 = v1.acbd();
+		v1 = v1.yxwz();
+
+		GSVector8i::storel(&dst[dstpitch * 0], v0);
+		GSVector8i::storeh(&dst[dstpitch * 1], v0);
+		GSVector8i::storel(&dst[dstpitch * 2], v1);
+		GSVector8i::storeh(&dst[dstpitch * 3], v1);
+
+		// TODO: not sure if this is worth it, not in this form, there should be a shorter path
+
+		#elif _M_SSE >= 0x301
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		// Odd columns load the upper input pair into v0/v1 and the lower pair into v2/v3.
+		if((i & 1) == 0)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+		}
+		else
+		{
+			v2 = s[i * 4 + 0];
+			v3 = s[i * 4 + 1];
+			v0 = s[i * 4 + 2];
+			v1 = s[i * 4 + 3];
+		}
+
+		v0 = v0.shuffle8(m_r8mask);
+		v1 = v1.shuffle8(m_r8mask);
+		v2 = v2.shuffle8(m_r8mask);
+		v3 = v3.shuffle8(m_r8mask);
+
+		GSVector4i::sw16(v0, v1, v2, v3);
+		GSVector4i::sw32(v0, v1, v3, v2);
+
+		// After the shuffles the rows are held in the order v0, v3, v1, v2.
+		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector4i::store<true>(&dst[dstpitch * 1], v3);
+		GSVector4i::store<true>(&dst[dstpitch * 2], v1);
+		GSVector4i::store<true>(&dst[dstpitch * 3], v2);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0 = s[i * 4 + 0];
+		GSVector4i v1 = s[i * 4 + 1];
+		GSVector4i v2 = s[i * 4 + 2];
+		GSVector4i v3 = s[i * 4 + 3];
+
+		GSVector4i::sw8(v0, v1, v2, v3);
+		GSVector4i::sw16(v0, v1, v2, v3);
+		GSVector4i::sw8(v0, v2, v1, v3);
+		GSVector4i::sw64(v0, v1, v2, v3);
+
+		// Even columns post-shuffle rows 2 and 3, odd columns rows 0 and 1.
+		if((i & 1) == 0)
+		{
+			v2 = v2.yxwz();
+			v3 = v3.yxwz();
+		}
+		else
+		{
+			v0 = v0.yxwz();
+			v1 = v1.yxwz();
+		}
+
+		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
+		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
+		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
+
+		#endif
+	}
+
+	// Reads column i of a 4-bit block from src (vector slots i * 4 + 0..3) and writes it as
+	// four destination rows (dst + dstpitch * 0..3), one GSVector4i per row.
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x301: uses shuffle8(m_r4mask) and finishes with sw16rh (even i) or sw16rl (odd i).
+	//  - otherwise: uses only sw*/xzyw() interleaves and applies yxwzlh() to one row pair
+	//    depending on the parity of i.
+	template<int i> __forceinline static void ReadColumn4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		//printf("ReadColumn4\n");
+
+		#if _M_SSE >= 0x301
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0 = s[i * 4 + 0].xzyw();
+		GSVector4i v1 = s[i * 4 + 1].xzyw();
+		GSVector4i v2 = s[i * 4 + 2].xzyw();
+		GSVector4i v3 = s[i * 4 + 3].xzyw();
+
+		GSVector4i::sw64(v0, v1, v2, v3);
+		GSVector4i::sw4(v0, v2, v1, v3);
+		GSVector4i::sw8(v0, v1, v2, v3);
+
+		v0 = v0.shuffle8(m_r4mask);
+		v1 = v1.shuffle8(m_r4mask);
+		v2 = v2.shuffle8(m_r4mask);
+		v3 = v3.shuffle8(m_r4mask);
+
+		// The final 16-bit interleave variant depends on the column parity.
+		if((i & 1) == 0)
+		{
+			GSVector4i::sw16rh(v0, v1, v2, v3);
+		}
+		else
+		{
+			GSVector4i::sw16rl(v0, v1, v2, v3);
+		}
+
+		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
+		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
+		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0 = s[i * 4 + 0];
+		GSVector4i v1 = s[i * 4 + 1];
+		GSVector4i v2 = s[i * 4 + 2];
+		GSVector4i v3 = s[i * 4 + 3];
+
+		// sw32 is intentionally applied twice in a row here.
+		GSVector4i::sw32(v0, v1, v2, v3);
+		GSVector4i::sw32(v0, v1, v2, v3);
+		GSVector4i::sw4(v0, v2, v1, v3);
+		GSVector4i::sw8(v0, v1, v2, v3);
+		GSVector4i::sw16(v0, v2, v1, v3);
+
+		v0 = v0.xzyw();
+		v1 = v1.xzyw();
+		v2 = v2.xzyw();
+		v3 = v3.xzyw();
+
+		GSVector4i::sw64(v0, v1, v2, v3);
+
+		// Even columns post-shuffle rows 2 and 3, odd columns rows 0 and 1.
+		if((i & 1) == 0)
+		{
+			v2 = v2.yxwzlh();
+			v3 = v3.yxwzlh();
+		}
+		else
+		{
+			v0 = v0.yxwzlh();
+			v1 = v1.yxwzlh();
+		}
+
+		GSVector4i::store<true>(&dst[dstpitch * 0], v0);
+		GSVector4i::store<true>(&dst[dstpitch * 1], v1);
+		GSVector4i::store<true>(&dst[dstpitch * 2], v2);
+		GSVector4i::store<true>(&dst[dstpitch * 3], v3);
+
+		#endif
+	}
+
+	// Runtime dispatcher for the 32-bit column reader: derives the column index
+	// (y >> 1) & 3 and forwards src/dst/dstpitch unchanged to ReadColumn32<index>.
+	static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		const int column = (y >> 1) & 3;
+
+		switch(column)
+		{
+		case 0:
+			ReadColumn32<0>(src, dst, dstpitch);
+			break;
+		case 1:
+			ReadColumn32<1>(src, dst, dstpitch);
+			break;
+		case 2:
+			ReadColumn32<2>(src, dst, dstpitch);
+			break;
+		case 3:
+			ReadColumn32<3>(src, dst, dstpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Runtime dispatcher for the 16-bit column reader: derives the column index
+	// (y >> 1) & 3 and forwards src/dst/dstpitch unchanged to ReadColumn16<index>.
+	static void ReadColumn16(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		const int column = (y >> 1) & 3;
+
+		switch(column)
+		{
+		case 0:
+			ReadColumn16<0>(src, dst, dstpitch);
+			break;
+		case 1:
+			ReadColumn16<1>(src, dst, dstpitch);
+			break;
+		case 2:
+			ReadColumn16<2>(src, dst, dstpitch);
+			break;
+		case 3:
+			ReadColumn16<3>(src, dst, dstpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Runtime dispatcher for the 8-bit column reader: derives the column index
+	// (y >> 2) & 3 and forwards src/dst/dstpitch unchanged to ReadColumn8<index>.
+	static void ReadColumn8(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		const int column = (y >> 2) & 3;
+
+		switch(column)
+		{
+		case 0:
+			ReadColumn8<0>(src, dst, dstpitch);
+			break;
+		case 1:
+			ReadColumn8<1>(src, dst, dstpitch);
+			break;
+		case 2:
+			ReadColumn8<2>(src, dst, dstpitch);
+			break;
+		case 3:
+			ReadColumn8<3>(src, dst, dstpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Runtime dispatcher for the 4-bit column reader: derives the column index
+	// (y >> 2) & 3 and forwards src/dst/dstpitch unchanged to ReadColumn4<index>.
+	static void ReadColumn4(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		const int column = (y >> 2) & 3;
+
+		switch(column)
+		{
+		case 0:
+			ReadColumn4<0>(src, dst, dstpitch);
+			break;
+		case 1:
+			ReadColumn4<1>(src, dst, dstpitch);
+			break;
+		case 2:
+			ReadColumn4<2>(src, dst, dstpitch);
+			break;
+		case 3:
+			ReadColumn4<3>(src, dst, dstpitch);
+			break;
+		default:
+			__assume(0); // column is masked to 0..3
+		}
+	}
+
+	// Reads a whole 32-bit block as four columns. Column k (0..3) is written k * 2 rows
+	// below dst; every column call receives the same src base and the same dstpitch.
+	static void ReadBlock32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		// Byte distance between the first destination rows of two consecutive columns.
+		const int columnStep = dstpitch * 2;
+
+		uint8* column1 = dst + columnStep;
+		uint8* column2 = column1 + columnStep;
+		uint8* column3 = column2 + columnStep;
+
+		ReadColumn32<0>(src, dst, dstpitch);
+		ReadColumn32<1>(src, column1, dstpitch);
+		ReadColumn32<2>(src, column2, dstpitch);
+		ReadColumn32<3>(src, column3, dstpitch);
+	}
+
+	// Reads a whole 16-bit block as four columns. Column k (0..3) is written k * 2 rows
+	// below dst; every column call receives the same src base and the same dstpitch.
+	static void ReadBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		// Byte distance between the first destination rows of two consecutive columns.
+		const int columnStep = dstpitch * 2;
+
+		uint8* column1 = dst + columnStep;
+		uint8* column2 = column1 + columnStep;
+		uint8* column3 = column2 + columnStep;
+
+		ReadColumn16<0>(src, dst, dstpitch);
+		ReadColumn16<1>(src, column1, dstpitch);
+		ReadColumn16<2>(src, column2, dstpitch);
+		ReadColumn16<3>(src, column3, dstpitch);
+	}
+
+	// Reads a whole 8-bit block as four columns. Column k (0..3) is written k * 4 rows
+	// below dst; every column call receives the same src base and the same dstpitch.
+	static void ReadBlock8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		// Byte distance between the first destination rows of two consecutive columns.
+		const int columnStep = dstpitch * 4;
+
+		uint8* column1 = dst + columnStep;
+		uint8* column2 = column1 + columnStep;
+		uint8* column3 = column2 + columnStep;
+
+		ReadColumn8<0>(src, dst, dstpitch);
+		ReadColumn8<1>(src, column1, dstpitch);
+		ReadColumn8<2>(src, column2, dstpitch);
+		ReadColumn8<3>(src, column3, dstpitch);
+	}
+
+	// Reads a whole 4-bit block as four columns. Column k (0..3) is written k * 4 rows
+	// below dst; every column call receives the same src base and the same dstpitch.
+	static void ReadBlock4(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		// Byte distance between the first destination rows of two consecutive columns.
+		const int columnStep = dstpitch * 4;
+
+		uint8* column1 = dst + columnStep;
+		uint8* column2 = column1 + columnStep;
+		uint8* column3 = column2 + columnStep;
+
+		ReadColumn4<0>(src, dst, dstpitch);
+		ReadColumn4<1>(src, column1, dstpitch);
+		ReadColumn4<2>(src, column2, dstpitch);
+		ReadColumn4<3>(src, column3, dstpitch);
+	}
+
+	// Reads a whole 4-bit block and writes its nibbles as separate bytes.
+	//
+	// The loop runs twice; each pass consumes eight source vectors (slots i * 8 + 0..7) and
+	// emits eight destination rows, so sixteen rows are written in total. Every row consists of
+	// two vectors stored at byte offsets 0 and 16.
+	//
+	// For each group of four vectors the low nibbles (v & mask) and the remaining bits shifted
+	// right by 4 (v.andnot(mask) >> 4) are stored as separate row pairs.
+	// NOTE(review): this reads as "low nibble rows, then high nibble rows", which assumes
+	// andnot(mask) computes v & ~mask - confirm the operand order in GSVector4i.
+	__forceinline static void ReadBlock4P(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		//printf("ReadBlock4P\n");
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		// Selects the low four bits of every byte.
+		GSVector4i mask(0x0f0f0f0f);
+
+		for(int i = 0; i < 2; i++)
+		{
+			// col 0, 2
+
+			v0 = s[i * 8 + 0];
+			v1 = s[i * 8 + 1];
+			v2 = s[i * 8 + 2];
+			v3 = s[i * 8 + 3];
+
+			GSVector4i::sw8(v0, v1, v2, v3);
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw8(v0, v2, v1, v3);
+
+			// Masked values are stored as is; the shifted values below get an extra yxwz().
+			GSVector4i::store<true>(&dst[dstpitch * 0 +  0], (v0 & mask));
+			GSVector4i::store<true>(&dst[dstpitch * 0 + 16], (v1 & mask));
+			GSVector4i::store<true>(&dst[dstpitch * 1 +  0], (v2 & mask));
+			GSVector4i::store<true>(&dst[dstpitch * 1 + 16], (v3 & mask));
+
+			dst += dstpitch * 2;
+
+			GSVector4i::store<true>(&dst[dstpitch * 0 +  0], (v0.andnot(mask)).yxwz() >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 0 + 16], (v1.andnot(mask)).yxwz() >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 1 +  0], (v2.andnot(mask)).yxwz() >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 1 + 16], (v3.andnot(mask)).yxwz() >> 4);
+
+			dst += dstpitch * 2;
+
+			// col 1, 3
+
+			v0 = s[i * 8 + 4];
+			v1 = s[i * 8 + 5];
+			v2 = s[i * 8 + 6];
+			v3 = s[i * 8 + 7];
+
+			GSVector4i::sw8(v0, v1, v2, v3);
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw8(v0, v2, v1, v3);
+
+			// Mirror image of the first half: here the masked values get the yxwz() shuffle
+			// and the shifted values are stored as is.
+			GSVector4i::store<true>(&dst[dstpitch * 0 +  0], (v0 & mask).yxwz());
+			GSVector4i::store<true>(&dst[dstpitch * 0 + 16], (v1 & mask).yxwz());
+			GSVector4i::store<true>(&dst[dstpitch * 1 +  0], (v2 & mask).yxwz());
+			GSVector4i::store<true>(&dst[dstpitch * 1 + 16], (v3 & mask).yxwz());
+
+			dst += dstpitch * 2;
+
+			GSVector4i::store<true>(&dst[dstpitch * 0 +  0], (v0.andnot(mask)) >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 0 + 16], (v1.andnot(mask)) >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 1 +  0], (v2.andnot(mask)) >> 4);
+			GSVector4i::store<true>(&dst[dstpitch * 1 + 16], (v3.andnot(mask)) >> 4);
+
+			dst += dstpitch * 2;
+		}
+	}
+
+	// Reads a whole block of 32-bit elements and keeps only what remains of each element after
+	// a right shift by 24, packing the results with ps32/pu16 and writing eight destination rows
+	// (dst + dstpitch * 0..7) through storel/storeh.
+	// NOTE(review): presumably this extracts the top byte of every 32-bit pixel and each row is
+	// 8 bytes wide - confirm against GSVector4i::operator>>, storel and storeh.
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x501: processes the block as two groups of four GSVector8i values.
+	//  - otherwise: four loop passes over four GSVector4i values, two rows per pass.
+	__forceinline static void ReadBlock8HP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		#if _M_SSE >= 0x501
+
+		// d0 addresses rows 0..3, d1 addresses rows 4..7.
+		uint8* RESTRICT d0 = &dst[dstpitch * 0];
+		uint8* RESTRICT d1 = &dst[dstpitch * 4];
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0, v1, v2, v3;
+		GSVector4i v4, v5;
+
+		v0 = s[0].acbd();
+		v1 = s[1].acbd();
+		v2 = s[2].acbd();
+		v3 = s[3].acbd();
+
+		v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24));
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		// Rows alternate between the two extracted halves: v4, v5, v4, v5.
+		GSVector4i::storel(&d0[dstpitch * 0], v4);
+		GSVector4i::storel(&d0[dstpitch * 1], v5);
+		GSVector4i::storeh(&d0[dstpitch * 2], v4);
+		GSVector4i::storeh(&d0[dstpitch * 3], v5);
+
+		v0 = s[4].acbd();
+		v1 = s[5].acbd();
+		v2 = s[6].acbd();
+		v3 = s[7].acbd();
+
+		v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24));
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		GSVector4i::storel(&d1[dstpitch * 0], v4);
+		GSVector4i::storel(&d1[dstpitch * 1], v5);
+		GSVector4i::storeh(&d1[dstpitch * 2], v4);
+		GSVector4i::storeh(&d1[dstpitch * 3], v5);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24));
+
+			// One packed vector feeds two consecutive rows (storel, then storeh).
+			GSVector4i::storel(dst, v0);
+
+			dst += dstpitch;
+
+			GSVector4i::storeh(dst, v0);
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Same data flow as a right shift by 24 followed by ps32/pu16 packing, with the packed
+	// result additionally AND-ed with 0x0f0f0f0f so only the low four bits of every packed
+	// byte survive. Writes eight destination rows (dst + dstpitch * 0..7) via storel/storeh.
+	// NOTE(review): presumably this extracts bits 24..27 of every 32-bit pixel - confirm
+	// against GSVector4i::operator>> and the pack helpers.
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x501: processes the block as two groups of four GSVector8i values.
+	//  - otherwise: four loop passes over four GSVector4i values, two rows per pass.
+	__forceinline static void ReadBlock4HLP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		#if _M_SSE >= 0x501
+
+		// d0 addresses rows 0..3, d1 addresses rows 4..7.
+		uint8* RESTRICT d0 = &dst[dstpitch * 0];
+		uint8* RESTRICT d1 = &dst[dstpitch * 4];
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0, v1, v2, v3;
+		GSVector4i v4, v5;
+		GSVector8i mask(0x0f0f0f0f);
+
+		v0 = s[0].acbd();
+		v1 = s[1].acbd();
+		v2 = s[2].acbd();
+		v3 = s[3].acbd();
+
+		v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		// Rows alternate between the two extracted halves: v4, v5, v4, v5.
+		GSVector4i::storel(&d0[dstpitch * 0], v4);
+		GSVector4i::storel(&d0[dstpitch * 1], v5);
+		GSVector4i::storeh(&d0[dstpitch * 2], v4);
+		GSVector4i::storeh(&d0[dstpitch * 3], v5);
+
+		v0 = s[4].acbd();
+		v1 = s[5].acbd();
+		v2 = s[6].acbd();
+		v3 = s[7].acbd();
+
+		v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		GSVector4i::storel(&d1[dstpitch * 0], v4);
+		GSVector4i::storel(&d1[dstpitch * 1], v5);
+		GSVector4i::storeh(&d1[dstpitch * 2], v4);
+		GSVector4i::storeh(&d1[dstpitch * 3], v5);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		GSVector4i mask(0x0f0f0f0f);
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
+
+			// One packed vector feeds two consecutive rows (storel, then storeh).
+			GSVector4i::storel(dst, v0);
+
+			dst += dstpitch;
+
+			GSVector4i::storeh(dst, v0);
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Reads a whole block of 32-bit elements, shifts every element right by 28, packs the
+	// results with ps32/pu16 and writes eight destination rows (dst + dstpitch * 0..7) via
+	// storel/storeh.
+	// NOTE(review): presumably this extracts bits 28..31 of every 32-bit pixel - confirm
+	// against GSVector4i::operator>> and the pack helpers.
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x501: processes the block as two groups of four GSVector8i values.
+	//  - otherwise: four loop passes over four GSVector4i values, two rows per pass.
+	__forceinline static void ReadBlock4HHP(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch)
+	{
+		#if _M_SSE >= 0x501
+
+		// d0 addresses rows 0..3, d1 addresses rows 4..7.
+		uint8* RESTRICT d0 = &dst[dstpitch * 0];
+		uint8* RESTRICT d1 = &dst[dstpitch * 4];
+
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i v0, v1, v2, v3;
+		GSVector4i v4, v5;
+
+		v0 = s[0].acbd();
+		v1 = s[1].acbd();
+		v2 = s[2].acbd();
+		v3 = s[3].acbd();
+
+		v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28));
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		// Rows alternate between the two extracted halves: v4, v5, v4, v5.
+		GSVector4i::storel(&d0[dstpitch * 0], v4);
+		GSVector4i::storel(&d0[dstpitch * 1], v5);
+		GSVector4i::storeh(&d0[dstpitch * 2], v4);
+		GSVector4i::storeh(&d0[dstpitch * 3], v5);
+
+		v0 = s[4].acbd();
+		v1 = s[5].acbd();
+		v2 = s[6].acbd();
+		v3 = s[7].acbd();
+
+		v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28));
+
+		v4 = v0.extract<0>();
+		v5 = v0.extract<1>();
+
+		GSVector4i::storel(&d1[dstpitch * 0], v4);
+		GSVector4i::storel(&d1[dstpitch * 1], v5);
+		GSVector4i::storeh(&d1[dstpitch * 2], v4);
+		GSVector4i::storeh(&d1[dstpitch * 3], v5);
+
+		#else
+
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			v0 = ((v0 >> 28).ps32(v1 >> 28)).pu16((v2 >> 28).ps32(v3 >> 28));
+
+			// One packed vector feeds two consecutive rows (storel, then storeh).
+			GSVector4i::storel(dst, v0);
+
+			dst += dstpitch;
+
+			GSVector4i::storeh(dst, v0);
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Combines a vector of colours with an alpha vector.
+	//
+	// c   - colour elements.
+	// TA0 - alpha bits to OR into the colours.
+	//
+	// Without AEM, TA0 is OR-ed into every element. With AEM, TA0 is first filtered through
+	// andnot(c == V::zero()), i.e. TA0 & (c != zero), so all-zero elements stay zero.
+	template<bool AEM, class V> __forceinline static V Expand24to32(const V& c, const V& TA0)
+	{
+		if(AEM)
+		{
+			// TA0 & (c != GSVector4i::zero())
+			return c | TA0.andnot(c == V::zero());
+		}
+
+		return c | TA0;
+	}
+
+	// Expands a vector of 16-bit colours to 32-bit colours.
+	//
+	// The three fields selected by m_rxxx, m_xgxx and m_xxbx are shifted left by 3, 6 and 9 and
+	// OR-ed together. The alpha part is chosen per element between TA0 and TA1 using
+	// c.sra16(15) as the selector. With AEM the chosen alpha is additionally filtered through
+	// andnot(c == V::zero()).
+	//
+	// The AEM branch selects with blend8 and the other branch with blend, as written here.
+	template<bool AEM, class V> __forceinline static V Expand16to32(const V& c, const V& TA0, const V& TA1)
+	{
+		if(AEM)
+		{
+			return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | TA0.blend8(TA1, c.sra16(15)).andnot(c == V::zero());
+		}
+
+		return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | TA0.blend(TA1, c.sra16(15));
+	}
+
+	// Expands a block of 24-bit colours to 32 bits.
+	//
+	// Every source vector is masked with x00ffffff and passed through Expand24to32<AEM>
+	// together with (TEXA.TA0 << 24). Four passes are made; each pass writes two destination
+	// rows (dst and dst + dstpitch) and then advances dst by two rows.
+	//
+	// _M_SSE >= 0x501 uses one GSVector8i per row, otherwise two GSVector4i per row.
+	template<bool AEM> static void ExpandBlock24(const uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8i* in = (const GSVector8i*)src;
+
+		GSVector8i alpha(TEXA.TA0 << 24);
+		GSVector8i rgbMask = GSVector8i::x00ffffff();
+
+		for(int pass = 0; pass < 4; pass++)
+		{
+			GSVector8i c0 = in[pass * 2 + 0] & rgbMask;
+			GSVector8i c1 = in[pass * 2 + 1] & rgbMask;
+
+			GSVector8i* row0 = (GSVector8i*)&dst[dstpitch * 0];
+			GSVector8i* row1 = (GSVector8i*)&dst[dstpitch * 1];
+
+			row0[0] = Expand24to32<AEM>(c0, alpha);
+			row1[0] = Expand24to32<AEM>(c1, alpha);
+
+			dst += dstpitch * 2;
+		}
+
+		#else
+
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		GSVector4i alpha(TEXA.TA0 << 24);
+		GSVector4i rgbMask = GSVector4i::x00ffffff();
+
+		for(int pass = 0; pass < 4; pass++)
+		{
+			GSVector4i c0 = in[pass * 4 + 0] & rgbMask;
+			GSVector4i c1 = in[pass * 4 + 1] & rgbMask;
+			GSVector4i c2 = in[pass * 4 + 2] & rgbMask;
+			GSVector4i c3 = in[pass * 4 + 3] & rgbMask;
+
+			GSVector4i* row0 = (GSVector4i*)&dst[dstpitch * 0];
+			GSVector4i* row1 = (GSVector4i*)&dst[dstpitch * 1];
+
+			row0[0] = Expand24to32<AEM>(c0, alpha);
+			row0[1] = Expand24to32<AEM>(c1, alpha);
+			row1[0] = Expand24to32<AEM>(c2, alpha);
+			row1[1] = Expand24to32<AEM>(c3, alpha);
+
+			dst += dstpitch * 2;
+		}
+
+		#endif
+	}
+
+	// Expands a block of 16-bit colours to 32 bits, one destination row per pass (eight rows).
+	//
+	// Every source vector is widened with upl16/uph16 against itself and each half is passed
+	// through Expand16to32<AEM> with (TEXA.TA0 << 24) and (TEXA.TA1 << 24).
+	//
+	// _M_SSE >= 0x501 writes two GSVector8i per row, otherwise four GSVector4i per row.
+	//
+	// Deliberately not marked __forceinline: do not inline, uses too many xmm regs.
+	template<bool AEM> static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8i* in = (const GSVector8i*)src;
+
+		GSVector8i alpha0(TEXA.TA0 << 24);
+		GSVector8i alpha1(TEXA.TA1 << 24);
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector8i* out = (GSVector8i*)dst;
+
+			GSVector8i packed = in[row].acbd();
+
+			out[0] = Expand16to32<AEM>(packed.upl16(packed), alpha0, alpha1);
+			out[1] = Expand16to32<AEM>(packed.uph16(packed), alpha0, alpha1);
+
+			dst += dstpitch;
+		}
+
+		#else
+
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		GSVector4i alpha0(TEXA.TA0 << 24);
+		GSVector4i alpha1(TEXA.TA1 << 24);
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i* out = (GSVector4i*)dst;
+
+			GSVector4i left = in[row * 2 + 0];
+
+			out[0] = Expand16to32<AEM>(left.upl16(left), alpha0, alpha1);
+			out[1] = Expand16to32<AEM>(left.uph16(left), alpha0, alpha1);
+
+			GSVector4i right = in[row * 2 + 1];
+
+			out[2] = Expand16to32<AEM>(right.upl16(right), alpha0, alpha1);
+			out[3] = Expand16to32<AEM>(right.uph16(right), alpha0, alpha1);
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Converts sixteen source vectors through the palette: vector k of src is expanded with
+	// gather32_8(pal, ...) into the destination row starting at dst + k * dstpitch.
+	__forceinline static void ExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		const GSVector4i* indices = (const GSVector4i*)src;
+
+		for(int row = 0; row < 16; row++)
+		{
+			indices[row].gather32_8(pal, (GSVector4i*)dst);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Converts sixteen source vectors through the palette: vector k of src is expanded with
+	// gather16_8(pal, ...) into the destination row starting at dst + k * dstpitch.
+	__forceinline static void ExpandBlock8_16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		const GSVector4i* indices = (const GSVector4i*)src;
+
+		for(int row = 0; row < 16; row++)
+		{
+			indices[row].gather16_8(pal, (GSVector4i*)dst);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Converts sixteen source vectors through the 64-bit palette: vector k of src is expanded
+	// with gather64_8(pal, ...) into the destination row starting at dst + k * dstpitch.
+	__forceinline static void ExpandBlock4_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal)
+	{
+		const GSVector4i* indices = (const GSVector4i*)src;
+
+		for(int row = 0; row < 16; row++)
+		{
+			indices[row].gather64_8(pal, (GSVector4i*)dst);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Converts sixteen source vectors through the 64-bit palette: vector k of src is expanded
+	// with gather32_8(pal, ...) into the destination row starting at dst + k * dstpitch.
+	__forceinline static void ExpandBlock4_16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal)
+	{
+		const GSVector4i* indices = (const GSVector4i*)src;
+
+		for(int row = 0; row < 16; row++)
+		{
+			indices[row].gather32_8(pal, (GSVector4i*)dst);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Palette lookup on the value left after shifting every source element right by 24.
+	// Eight rows are produced; row k reads source vectors k * 2 + 0..1 and writes two
+	// vectors at dst + k * dstpitch.
+	__forceinline static void ExpandBlock8H_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i* out = (GSVector4i*)dst;
+
+			out[0] = (in[row * 2 + 0] >> 24).gather32_32<>(pal);
+			out[1] = (in[row * 2 + 1] >> 24).gather32_32<>(pal);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Palette lookup on the value left after shifting every source element right by 24,
+	// producing 16-bit results. Eight rows are written, dst advancing by dstpitch per row.
+	//
+	// _M_SSE >= 0x401 gathers two vectors per row and packs them with pu32; the fallback
+	// looks up eight elements per row one by one and truncates each to uint16.
+	__forceinline static void ExpandBlock8H_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		#if _M_SSE >= 0x401
+
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i lo = (in[row * 2 + 0] >> 24).gather32_32<>(pal);
+			GSVector4i hi = (in[row * 2 + 1] >> 24).gather32_32<>(pal);
+
+			((GSVector4i*)dst)[0] = lo.pu32(hi);
+
+			dst += dstpitch;
+		}
+
+		#else
+
+		for(int row = 0; row < 8; row++)
+		{
+			uint16* out = (uint16*)dst;
+
+			for(int x = 0; x < 8; x++)
+			{
+				out[x] = (uint16)pal[src[row * 8 + x] >> 24];
+			}
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Palette lookup on ((element >> 24) & 0xf) of every source element.
+	// Eight rows are produced; row k reads source vectors k * 2 + 0..1 and writes two
+	// vectors at dst + k * dstpitch.
+	__forceinline static void ExpandBlock4HL_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i* out = (GSVector4i*)dst;
+
+			out[0] = ((in[row * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
+			out[1] = ((in[row * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Palette lookup on ((element >> 24) & 0xf) of every source element, producing 16-bit
+	// results. Eight rows are written, dst advancing by dstpitch per row.
+	//
+	// _M_SSE >= 0x401 gathers two vectors per row and packs them with pu32; the fallback
+	// looks up eight elements per row one by one and truncates each to uint16.
+	__forceinline static void ExpandBlock4HL_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		#if _M_SSE >= 0x401
+
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i lo = ((in[row * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
+			GSVector4i hi = ((in[row * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
+
+			((GSVector4i*)dst)[0] = lo.pu32(hi);
+
+			dst += dstpitch;
+		}
+
+		#else
+
+		for(int row = 0; row < 8; row++)
+		{
+			uint16* out = (uint16*)dst;
+
+			for(int x = 0; x < 8; x++)
+			{
+				out[x] = (uint16)pal[(src[row * 8 + x] >> 24) & 0xf];
+			}
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Palette lookup on the value left after shifting every source element right by 28.
+	// Eight rows are produced; row k reads source vectors k * 2 + 0..1 and writes two
+	// vectors at dst + k * dstpitch.
+	__forceinline static void ExpandBlock4HH_32(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i* out = (GSVector4i*)dst;
+
+			out[0] = (in[row * 2 + 0] >> 28).gather32_32<>(pal);
+			out[1] = (in[row * 2 + 1] >> 28).gather32_32<>(pal);
+
+			dst += dstpitch;
+		}
+	}
+
+	// Palette lookup on the value left after shifting every source element right by 28,
+	// producing 16-bit results. Eight rows are written, dst advancing by dstpitch per row.
+	//
+	// _M_SSE >= 0x401 gathers two vectors per row and packs them with pu32; the fallback
+	// looks up eight elements per row one by one and truncates each to uint16.
+	__forceinline static void ExpandBlock4HH_16(uint32* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		#if _M_SSE >= 0x401
+
+		const GSVector4i* in = (const GSVector4i*)src;
+
+		for(int row = 0; row < 8; row++)
+		{
+			GSVector4i lo = (in[row * 2 + 0] >> 28).gather32_32<>(pal);
+			GSVector4i hi = (in[row * 2 + 1] >> 28).gather32_32<>(pal);
+
+			((GSVector4i*)dst)[0] = lo.pu32(hi);
+
+			dst += dstpitch;
+		}
+
+		#else
+
+		for(int row = 0; row < 8; row++)
+		{
+			uint16* out = (uint16*)dst;
+
+			for(int x = 0; x < 8; x++)
+			{
+				out[x] = (uint16)pal[src[row * 8 + x] >> 28];
+			}
+
+			dst += dstpitch;
+		}
+
+		#endif
+	}
+
+	// Unpacks eight rows of tightly packed source data (rows srcpitch bytes apart) and merges
+	// the result into the block at dst.
+	//
+	// Each step spreads the packed data with srl<3>/<6>/<9> plus upl32/upl64 and then writes it
+	// with blend8 under the x00ffffff mask, so dst is read-modify-written rather than
+	// overwritten.
+	// NOTE(review): the 3-byte shift steps suggest 24-bit pixels being placed in the low three
+	// bytes of each 32-bit destination element while the top byte is preserved - confirm the
+	// semantics of srl<N> and blend8 in GSVector4i/GSVector8i.
+	//
+	// Two compile-time variants selected by _M_SSE:
+	//  - >= 0x501: two groups of four source rows, each producing four GSVector8i slots.
+	//  - otherwise: four passes over two source rows, each producing four GSVector4i slots.
+	__forceinline static void UnpackAndWriteBlock24(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst)
+	{
+		#if _M_SSE >= 0x501
+
+		const uint8* RESTRICT s0 = &src[srcpitch * 0];
+		const uint8* RESTRICT s1 = &src[srcpitch * 1];
+		const uint8* RESTRICT s2 = &src[srcpitch * 2];
+		const uint8* RESTRICT s3 = &src[srcpitch * 3];
+
+		GSVector8i v0, v1, v2, v3, v4, v5, v6;
+		GSVector8i mask = GSVector8i::x00ffffff();
+
+		// Rows 0/1 and rows 2/3 are gathered into the same vectors (s0/s1 pieces first,
+		// s2/s3 pieces second).
+		v4 = GSVector8i::load(s0, s0 + 8, s2, s2 + 8);
+		v5 = GSVector8i::load(s0 + 16, s1, s2 + 16, s3);
+		v6 = GSVector8i::load(s1 + 8, s1 + 16, s3 + 8, s3 + 16);
+		
+		v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v4.srl<12>(v5);
+		v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v5.srl<8>(v6);
+		v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v6.srl<4>();
+		v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+
+		GSVector8i::sw64(v0, v2, v1, v3);
+
+		// Slots 0..3 receive v0, v2, v1, v3 in this variant.
+		((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend8(v0, mask);
+		((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend8(v2, mask);
+		((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend8(v1, mask);
+		((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend8(v3, mask);
+
+		// Second half: the same sequence for source rows 4..7 and destination slots 4..7.
+		src += srcpitch * 4;
+
+		s0 = &src[srcpitch * 0];
+		s1 = &src[srcpitch * 1];
+		s2 = &src[srcpitch * 2];
+		s3 = &src[srcpitch * 3];
+
+		v4 = GSVector8i::load(s0, s0 + 8, s2, s2 + 8);
+		v5 = GSVector8i::load(s0 + 16, s1, s2 + 16, s3);
+		v6 = GSVector8i::load(s1 + 8, s1 + 16, s3 + 8, s3 + 16);
+		
+		v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v4.srl<12>(v5);
+		v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v5.srl<8>(v6);
+		v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+		v4 = v6.srl<4>();
+		v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>())).acbd();
+
+		GSVector8i::sw64(v0, v2, v1, v3);
+
+		((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend8(v0, mask);
+		((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend8(v2, mask);
+		((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend8(v1, mask);
+		((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend8(v3, mask);
+
+		#else
+
+		GSVector4i v0, v1, v2, v3, v4, v5, v6;
+		GSVector4i mask = GSVector4i::x00ffffff();
+
+		for(int i = 0; i < 4; i++, src += srcpitch * 2)
+		{
+			// v4: start of row 0, v5: rest of row 0 plus start of row 1, v6: rest of row 1.
+			v4 = GSVector4i::load<false>(src);
+			v5 = GSVector4i::load(src + 16, src + srcpitch);
+			v6 = GSVector4i::load<false>(src + srcpitch + 8);
+
+			v0 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>()));
+			v4 = v4.srl<12>(v5);
+			v1 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>()));
+			v4 = v5.srl<8>(v6);
+			v2 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>()));
+			v4 = v6.srl<4>();
+			v3 = v4.upl32(v4.srl<3>()).upl64(v4.srl<6>().upl32(v4.srl<9>()));
+
+			GSVector4i::sw64(v0, v2, v1, v3);
+
+			// Slots i * 4 + 0..3 receive v0, v1, v2, v3 in this variant.
+			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask);
+			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask);
+			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask);
+			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
+		}
+
+		#endif
+	}
+
+	// Unpacks eight rows of source data (rows srcpitch bytes apart) and merges the result into
+	// the block at dst with blend8 under the xff000000 mask, so dst is read-modify-written
+	// rather than overwritten.
+	// NOTE(review): presumably each source byte ends up in the top byte of one 32-bit
+	// destination element while the lower 24 bits are preserved - confirm the semantics of
+	// blend8, u8to32c and the m_uw8hmask* shuffle tables.
+	//
+	// Three compile-time variants selected by _M_SSE:
+	//  - >= 0x501: two groups of four rows, widened with u8to32c() and shifted left by 24.
+	//  - >= 0x301: four passes over two rows, placed with shuffle8(m_uw8hmask0..3).
+	//  - otherwise: four passes over two rows, widened with upl8/uph8/upl16/uph16.
+	__forceinline static void UnpackAndWriteBlock8H(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst)
+	{
+		// Scratch vectors shared by all variants; not every variant uses all four.
+		GSVector4i v4, v5, v6, v7;
+
+		#if _M_SSE >= 0x501
+
+		GSVector8i v0, v1, v2, v3;
+		GSVector8i mask = GSVector8i::xff000000();
+
+		v4 = GSVector4i::loadl(&src[srcpitch * 0]);
+		v5 = GSVector4i::loadl(&src[srcpitch * 1]);
+		v6 = GSVector4i::loadl(&src[srcpitch * 2]);
+		v7 = GSVector4i::loadl(&src[srcpitch * 3]);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v6.upl16(v7));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend8(v0, mask);
+		((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend8(v1, mask);
+		((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend8(v2, mask);
+		((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend8(v3, mask);
+
+		// Second half: the same sequence for source rows 4..7 and destination slots 4..7.
+		src += srcpitch * 4;
+
+		v4 = GSVector4i::loadl(&src[srcpitch * 0]);
+		v5 = GSVector4i::loadl(&src[srcpitch * 1]);
+		v6 = GSVector4i::loadl(&src[srcpitch * 2]);
+		v7 = GSVector4i::loadl(&src[srcpitch * 3]);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v6.upl16(v7));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend8(v0, mask);
+		((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend8(v1, mask);
+		((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend8(v2, mask);
+		((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend8(v3, mask);
+
+		#elif _M_SSE >= 0x301
+
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i::xff000000();
+		// Local copies of the four shuffle tables used inside the loop.
+		GSVector4i mask0 = m_uw8hmask0;
+		GSVector4i mask1 = m_uw8hmask1;
+		GSVector4i mask2 = m_uw8hmask2;
+		GSVector4i mask3 = m_uw8hmask3;
+
+		for(int i = 0; i < 4; i++, src += srcpitch * 2)
+		{
+			// Two consecutive source rows are combined into one vector.
+			v4 = GSVector4i::load(src, src + srcpitch);
+
+			v0 = v4.shuffle8(mask0);
+			v1 = v4.shuffle8(mask1);
+			v2 = v4.shuffle8(mask2);
+			v3 = v4.shuffle8(mask3);
+
+			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask);
+			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask);
+			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask);
+			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
+		}
+
+		#else
+
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i::xff000000();
+
+		for(int i = 0; i < 4; i++, src += srcpitch * 2)
+		{
+			v4 = GSVector4i::loadl(&src[srcpitch * 0]);
+			v5 = GSVector4i::loadl(&src[srcpitch * 1]);
+
+			v6 = v4.upl16(v5);
+
+			// Interleaving a vector with itself duplicates every element; two rounds
+			// (8-bit, then 16-bit) replicate each source byte four times.
+			v4 = v6.upl8(v6);
+			v5 = v6.uph8(v6);
+
+			v0 = v4.upl16(v4);
+			v1 = v4.uph16(v4);
+			v2 = v5.upl16(v5);
+			v3 = v5.uph16(v5);
+			
+			((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask);
+			((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask);
+			((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask);
+			((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
+		}
+
+		#endif
+	}
+
+	__forceinline static void UnpackAndWriteBlock4HL(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst)
+	{
+		//printf("4HL\n");
+		// Reads 4 bytes from each of eight src rows, interleaves every byte with itself shifted right by 4, and blends the result into dst under the constant 0x0f000000 (bits 24..27 of each 32-bit element, assuming blend() selects by mask bits - TODO confirm).
+		if(0) // never executed: leftover debug code that would overwrite src with values derived from columnTable32
+		{
+			uint8* s = (uint8*)src;
+			for(int j = 0; j < 8; j++, s += srcpitch)
+				for(int i = 0; i < 4; i++) s[i] = (columnTable32[j][i*2] & 0x0f) | (columnTable32[j][i*2+1] << 4);
+		}
+
+		GSVector4i v4, v5, v6, v7;
+
+		#if _M_SSE >= 0x501
+		// GSVector8i path: two passes of four source rows (loaded in row order 0,2,1,3), each pass writing four GSVector8i of dst.
+		GSVector8i v0, v1, v2, v3;
+		GSVector8i mask(0x0f000000);
+
+		v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+		v4 = v6.upl8(v6 >> 4);
+		v5 = v6.uph8(v6 >> 4);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v4.uph16(v5));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend(v0, mask);
+		((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend(v1, mask);
+		((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend(v2, mask);
+		((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend(v3, mask);
+
+		src += srcpitch * 4; // second pass: the next four source rows
+
+		v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+		v4 = v6.upl8(v6 >> 4);
+		v5 = v6.uph8(v6 >> 4);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v4.uph16(v5));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend(v0, mask);
+		((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend(v1, mask);
+		((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend(v2, mask);
+		((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend(v3, mask);
+
+		#elif _M_SSE >= 0x301
+		// shuffle8 path: four source rows per iteration, eight GSVector4i of dst written per iteration using the m_uw8hmask0..3 tables (defined outside this function).
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i(0x0f000000);
+		GSVector4i mask0 = m_uw8hmask0;
+		GSVector4i mask1 = m_uw8hmask1;
+		GSVector4i mask2 = m_uw8hmask2;
+		GSVector4i mask3 = m_uw8hmask3;
+
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 3]); // NOTE(review): rows are loaded in 0,1,2,3 order here but 0,2,1,3 in the other two paths; presumably the shuffle masks account for it - confirm
+
+			v4 = v.upl8(v >> 4);
+			v5 = v.uph8(v >> 4);
+
+			v0 = v4.shuffle8(mask0);
+			v1 = v4.shuffle8(mask1);
+			v2 = v4.shuffle8(mask2);
+			v3 = v4.shuffle8(mask3);
+
+			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
+
+			v0 = v5.shuffle8(mask0);
+			v1 = v5.shuffle8(mask1);
+			v2 = v5.shuffle8(mask2);
+			v3 = v5.shuffle8(mask3);
+
+			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
+		}
+
+		#else
+		// Fallback path built only from upl/uph unpack steps; four source rows per iteration, eight GSVector4i of dst written per iteration.
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i(0x0f000000);
+
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+			v4 = v.upl8(v >> 4);
+			v5 = v.uph8(v >> 4);
+
+			v6 = v4.upl16(v5);
+			v7 = v4.uph16(v5);
+
+			v4 = v6.upl8(v6);
+			v5 = v6.uph8(v6);
+			v6 = v7.upl8(v7);
+			v7 = v7.uph8(v7);
+
+			v0 = v4.upl16(v4);
+			v1 = v4.uph16(v4);
+			v2 = v5.upl16(v5);
+			v3 = v5.uph16(v5);
+
+			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
+
+			v0 = v6.upl16(v6);
+			v1 = v6.uph16(v6);
+			v2 = v7.upl16(v7);
+			v3 = v7.uph16(v7);
+
+			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
+		}
+
+		#endif
+	}
+
+	__forceinline static void UnpackAndWriteBlock4HH(const uint8* RESTRICT src, int srcpitch, uint8* RESTRICT dst)
+	{
+		GSVector4i v4, v5, v6, v7;
+		// Counterpart of UnpackAndWriteBlock4HL: interleaves (v << 4) with v and blends under the xf0000000 mask, presumably bits 28..31 of each 32-bit dst element (TODO confirm blend() semantics).
+		#if _M_SSE >= 0x501
+		// GSVector8i path: two passes of four source rows (loaded in row order 0,2,1,3), each pass writing four GSVector8i of dst.
+		GSVector8i v0, v1, v2, v3;
+		GSVector8i mask = GSVector8i::xf0000000();
+
+		v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+		v4 = (v6 << 4).upl8(v6);
+		v5 = (v6 << 4).uph8(v6);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v4.uph16(v5));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[0] = ((GSVector8i*)dst)[0].blend(v0, mask);
+		((GSVector8i*)dst)[1] = ((GSVector8i*)dst)[1].blend(v1, mask);
+		((GSVector8i*)dst)[2] = ((GSVector8i*)dst)[2].blend(v2, mask);
+		((GSVector8i*)dst)[3] = ((GSVector8i*)dst)[3].blend(v3, mask);
+
+		src += srcpitch * 4; // second pass: the next four source rows
+
+		v6 = GSVector4i(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+		v4 = (v6 << 4).upl8(v6);
+		v5 = (v6 << 4).uph8(v6);
+
+		v2 = GSVector8i::cast(v4.upl16(v5));
+		v3 = GSVector8i::cast(v4.uph16(v5));
+			
+		v0 = v2.u8to32c() << 24;
+		v1 = v2.bbbb().u8to32c() << 24;
+		v2 = v3.u8to32c() << 24;
+		v3 = v3.bbbb().u8to32c() << 24;
+
+		((GSVector8i*)dst)[4] = ((GSVector8i*)dst)[4].blend(v0, mask);
+		((GSVector8i*)dst)[5] = ((GSVector8i*)dst)[5].blend(v1, mask);
+		((GSVector8i*)dst)[6] = ((GSVector8i*)dst)[6].blend(v2, mask);
+		((GSVector8i*)dst)[7] = ((GSVector8i*)dst)[7].blend(v3, mask);
+
+		#elif _M_SSE >= 0x301
+		// shuffle8 path: four source rows per iteration, eight GSVector4i of dst written per iteration using the m_uw8hmask0..3 tables (defined outside this function).
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i::xf0000000();
+		GSVector4i mask0 = m_uw8hmask0;
+		GSVector4i mask1 = m_uw8hmask1;
+		GSVector4i mask2 = m_uw8hmask2;
+		GSVector4i mask3 = m_uw8hmask3;
+
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 3]); // NOTE(review): row order 0,1,2,3 here versus 0,2,1,3 in the other two paths; presumably intended for the shuffle masks - confirm
+
+			v4 = (v << 4).upl8(v);
+			v5 = (v << 4).uph8(v);
+
+			v0 = v4.shuffle8(mask0);
+			v1 = v4.shuffle8(mask1);
+			v2 = v4.shuffle8(mask2);
+			v3 = v4.shuffle8(mask3);
+
+			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
+
+			v0 = v5.shuffle8(mask0);
+			v1 = v5.shuffle8(mask1);
+			v2 = v5.shuffle8(mask2);
+			v3 = v5.shuffle8(mask3);
+
+			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
+		}
+
+		#else
+		// Fallback path built only from upl/uph unpack steps; four source rows per iteration, eight GSVector4i of dst written per iteration.
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = GSVector4i::xf0000000();
+
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]);
+
+			v4 = (v << 4).upl8(v);
+			v5 = (v << 4).uph8(v);
+
+			v6 = v4.upl16(v5);
+			v7 = v4.uph16(v5);
+
+			v4 = v6.upl8(v6);
+			v5 = v6.uph8(v6);
+			v6 = v7.upl8(v7);
+			v7 = v7.uph8(v7);
+
+			v0 = v4.upl16(v4);
+			v1 = v4.uph16(v4);
+			v2 = v5.upl16(v5);
+			v3 = v5.uph16(v5);
+
+			((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
+
+			v0 = v6.upl16(v6);
+			v1 = v6.uph16(v6);
+			v2 = v7.upl16(v7);
+			v3 = v7.uph16(v7);
+
+			((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
+			((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
+			((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
+			((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
+		}
+
+		#endif
+	}
+
+	template<bool AEM> __forceinline static void ReadAndExpandBlock24(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
+	{ // Reads one block from src, masks every value with x00ffffff, reorders it with the sw* swizzles and stores eight dst rows (dstpitch bytes apart) through Expand24to32<AEM> using TEXA.TA0 << 24.
+		#if _M_SSE >= 0x501
+		// GSVector8i path: eight source vectors, one GSVector8i stored per dst row.
+		const GSVector8i* s = (const GSVector8i*)src;
+		
+		GSVector8i TA0(TEXA.TA0 << 24);
+		GSVector8i mask = GSVector8i::x00ffffff();
+
+		GSVector8i v0, v1, v2, v3;
+
+		v0 = s[0] & mask;
+		v1 = s[1] & mask;
+		v2 = s[2] & mask;
+		v3 = s[3] & mask;
+
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw64(v0, v1);
+		GSVector8i::sw128(v2, v3);
+		GSVector8i::sw64(v2, v3);
+
+		*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
+		*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
+		*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
+		*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
+
+		v0 = s[4] & mask;
+		v1 = s[5] & mask;
+		v2 = s[6] & mask;
+		v3 = s[7] & mask;
+
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw64(v0, v1);
+		GSVector8i::sw128(v2, v3);
+		GSVector8i::sw64(v2, v3);
+
+		dst += dstpitch * 4; // rows 4..7
+
+		*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
+		*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
+		*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
+		*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
+
+		#else
+		// GSVector4i path: four source vectors and two dst rows (two GSVector4i each) per iteration.
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i TA0(TEXA.TA0 << 24);
+		GSVector4i mask = GSVector4i::x00ffffff();
+
+		for(int i = 0; i < 4; i++, dst += dstpitch * 2)
+		{
+			GSVector4i v0 = s[i * 4 + 0];
+			GSVector4i v1 = s[i * 4 + 1];
+			GSVector4i v2 = s[i * 4 + 2];
+			GSVector4i v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			v0 &= mask;
+			v1 &= mask;
+			v2 &= mask;
+			v3 &= mask;
+
+			GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+			GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+			d0[0] = Expand24to32<AEM>(v0, TA0);
+			d0[1] = Expand24to32<AEM>(v1, TA0);
+			d1[0] = Expand24to32<AEM>(v2, TA0);
+			d1[1] = Expand24to32<AEM>(v3, TA0);
+		}
+
+		#endif
+	}
+
+	template<bool AEM> __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
+	{ // Reads one block of 16-bit values from src and stores it to eight dst rows (dstpitch bytes apart) through Expand16to32<AEM> / ExpandBlock16<AEM>, with TEXA.TA0 and TEXA.TA1 shifted left by 24.
+		#if _M_SSE >= 0x501
+		// GSVector8i path: two source vectors and two dst rows (two GSVector8i each) per iteration; m_r16mask is defined outside this function.
+		const GSVector8i* s = (const GSVector8i*)src;
+
+		GSVector8i TA0(TEXA.TA0 << 24);
+		GSVector8i TA1(TEXA.TA1 << 24);
+
+		for(int i = 0; i < 4; i++, dst += dstpitch * 2)
+		{
+			GSVector8i v0 = s[i * 2 + 0].shuffle8(m_r16mask);
+			GSVector8i v1 = s[i * 2 + 1].shuffle8(m_r16mask);
+
+			GSVector8i::sw128(v0, v1);
+			GSVector8i::sw32(v0, v1);
+
+			GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
+			GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1];
+
+			d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
+			d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
+			d1[0] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
+			d1[1] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
+		}
+
+		#elif 0 // not faster
+		// This branch is never compiled (#elif 0); kept by the original author as an alternative GSVector4i implementation.
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i TA0(TEXA.TA0 << 24);
+		GSVector4i TA1(TEXA.TA1 << 24);
+
+		for(int i = 0; i < 4; i++, dst += dstpitch * 2)
+		{
+			GSVector4i v0 = s[i * 4 + 0];
+			GSVector4i v1 = s[i * 4 + 1];
+			GSVector4i v2 = s[i * 4 + 2];
+			GSVector4i v3 = s[i * 4 + 3];
+
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw32(v0, v1, v2, v3);
+			GSVector4i::sw16(v0, v2, v1, v3);
+
+			GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
+
+			d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
+			d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
+			d0[2] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
+			d0[3] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
+			
+			GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
+
+			d1[0] = Expand16to32<AEM>(v2.upl16(v2), TA0, TA1);
+			d1[1] = Expand16to32<AEM>(v2.uph16(v2), TA0, TA1);
+			d1[2] = Expand16to32<AEM>(v3.upl16(v3), TA0, TA1);
+			d1[3] = Expand16to32<AEM>(v3.uph16(v3), TA0, TA1);
+		}
+
+		#else
+		// Generic path: ReadBlock16 into a temporary of 16 * 8 uint16 (row pitch sizeof(block) / 8 = 32 bytes), then ExpandBlock16<AEM> into dst.
+		__aligned(uint16, 32) block[16 * 8];
+	
+		ReadBlock16(src, (uint8*)block, sizeof(block) / 8);
+
+		ExpandBlock16<AEM>(block, dst, dstpitch, TEXA);
+
+		#endif
+	}
+
+	__forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		//printf("ReadAndExpandBlock8_32\n");
+		// Reads one block of 8-bit values from src and writes sixteen dst rows (dstpitch bytes apart) of entries looked up in the uint32 table pal.
+		#if _M_SSE >= 0x401
+		// Vector path: each loop iteration consumes eight source vectors and emits eight rows through gather32_8; m_r8mask is defined outside this function.
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = m_r8mask;
+
+		for(int i = 0; i < 2; i++)
+		{
+			v0 = s[i * 8 + 0].shuffle8(mask);
+			v1 = s[i * 8 + 1].shuffle8(mask);
+			v2 = s[i * 8 + 2].shuffle8(mask);
+			v3 = s[i * 8 + 3].shuffle8(mask);
+
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw32(v0, v1, v3, v2);
+			// rows are emitted in the order v0, v3, v1, v2
+			v0.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v3.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v1.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v2.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			// second half: the four loads are assigned to v2, v3, v0, v1 (a different order than in the first half)
+			v2 = s[i * 8 + 4].shuffle8(mask);
+			v3 = s[i * 8 + 5].shuffle8(mask);
+			v0 = s[i * 8 + 6].shuffle8(mask);
+			v1 = s[i * 8 + 7].shuffle8(mask);
+
+			GSVector4i::sw16(v0, v1, v2, v3);
+			GSVector4i::sw32(v0, v1, v3, v2);
+
+			v0.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v3.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v1.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v2.gather32_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+		}
+
+		#else
+		// Generic path: ReadBlock8 into a 16 * 16 byte temporary (row pitch sizeof(block) / 16 = 16 bytes), then ExpandBlock8_32 into dst.
+		__aligned(uint8, 32) block[16 * 16];
+
+		ReadBlock8(src, (uint8*)block, sizeof(block) / 16);
+
+		ExpandBlock8_32(block, dst, dstpitch, pal);
+
+		#endif
+	}
+
+	// TODO: ReadAndExpandBlock8_16
+
+	__forceinline static void ReadAndExpandBlock4_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint64* RESTRICT pal)
+	{
+		//printf("ReadAndExpandBlock4_32\n");
+		// Reads one block of packed 4-bit values from src and writes sixteen dst rows (dstpitch bytes apart) looked up in pal; pal is a uint64 table consumed by gather64_8 (presumably one entry per pair of 4-bit indices - TODO confirm).
+		#if _M_SSE >= 0x401
+		// Vector path: each loop iteration consumes eight source vectors and emits eight rows; m_r4mask is defined outside this function.
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+		GSVector4i mask = m_r4mask;
+
+		for(int i = 0; i < 2; i++)
+		{
+			v0 = s[i * 8 + 0].xzyw();
+			v1 = s[i * 8 + 1].xzyw();
+			v2 = s[i * 8 + 2].xzyw();
+			v3 = s[i * 8 + 3].xzyw();
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+			GSVector4i::sw4(v0, v2, v1, v3);
+			GSVector4i::sw8(v0, v1, v2, v3);
+
+			v0 = v0.shuffle8(mask);
+			v1 = v1.shuffle8(mask);
+			v2 = v2.shuffle8(mask);
+			v3 = v3.shuffle8(mask);
+
+			GSVector4i::sw16rh(v0, v1, v2, v3); // first half uses sw16rh, second half below uses sw16rl
+
+			v0.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v1.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v2.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v3.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+
+			v0 = s[i * 8 + 4].xzyw();
+			v1 = s[i * 8 + 5].xzyw();
+			v2 = s[i * 8 + 6].xzyw();
+			v3 = s[i * 8 + 7].xzyw();
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+			GSVector4i::sw4(v0, v2, v1, v3);
+			GSVector4i::sw8(v0, v1, v2, v3);
+
+			v0 = v0.shuffle8(mask);
+			v1 = v1.shuffle8(mask);
+			v2 = v2.shuffle8(mask);
+			v3 = v3.shuffle8(mask);
+
+			GSVector4i::sw16rl(v0, v1, v2, v3);
+
+			v0.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v1.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v2.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+			v3.gather64_8<>(pal, (GSVector4i*)dst);
+			dst += dstpitch;
+		}
+
+		#else
+		// Generic path: ReadBlock4 into a (32 / 2) * 16 byte temporary (row pitch sizeof(block) / 16 = 16 bytes), then ExpandBlock4_32 into dst.
+		__aligned(uint8, 32) block[(32 / 2) * 16];
+
+		ReadBlock4(src, (uint8*)block, sizeof(block) / 16);
+
+		ExpandBlock4_32(block, dst, dstpitch, pal);
+
+		#endif
+	}
+
+	// TODO: ReadAndExpandBlock4_16
+
+	__forceinline static void ReadAndExpandBlock8H_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		//printf("ReadAndExpandBlock8H_32\n");
+		// Uses the top byte (value >> 24) of every 32-bit source value as an index into pal and writes eight dst rows (dstpitch bytes apart).
+		#if _M_SSE >= 0x401
+		// Vector path: four source vectors and two dst rows per iteration; each row is written as two 16-byte halves (dst[0] and dst[16]).
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			(v0 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			(v1 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+
+			(v2 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			(v3 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+		}
+
+		#else
+		// Generic path: ReadBlock32 into an 8 * 8 uint32 temporary (row pitch sizeof(block) / 8 = 32 bytes), then ExpandBlock8H_32 into dst.
+		__aligned(uint32, 32) block[8 * 8];
+
+		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
+
+		ExpandBlock8H_32(block, dst, dstpitch, pal);
+
+		#endif
+	}
+
+	// TODO: ReadAndExpandBlock8H_16
+
+	__forceinline static void ReadAndExpandBlock4HL_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		//printf("ReadAndExpandBlock4HL_32\n");
+		// Uses bits 24..27 ((value >> 24) & 0xf) of every 32-bit source value as an index into pal and writes eight dst rows (dstpitch bytes apart).
+		#if _M_SSE >= 0x401
+		// Vector path: four source vectors and two dst rows per iteration; each row is written as two 16-byte halves (dst[0] and dst[16]).
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			((v0 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			((v1 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+
+			((v2 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			((v3 >> 24) & 0xf).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+		}
+
+		#else
+		// Generic path: ReadBlock32 into an 8 * 8 uint32 temporary (row pitch sizeof(block) / 8 = 32 bytes), then ExpandBlock4HL_32 into dst.
+		__aligned(uint32, 32) block[8 * 8];
+
+		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
+
+		ExpandBlock4HL_32(block, dst, dstpitch, pal);
+
+		#endif
+	}
+
+	// TODO: ReadAndExpandBlock4HL_16
+
+	__forceinline static void ReadAndExpandBlock4HH_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
+	{
+		//printf("ReadAndExpandBlock4HH_32\n");
+		// Uses the top four bits (value >> 28) of every 32-bit source value as an index into pal and writes eight dst rows (dstpitch bytes apart).
+		#if _M_SSE >= 0x401
+		// Vector path: four source vectors and two dst rows per iteration; each row is written as two 16-byte halves (dst[0] and dst[16]).
+		const GSVector4i* s = (const GSVector4i*)src;
+
+		GSVector4i v0, v1, v2, v3;
+
+		for(int i = 0; i < 4; i++)
+		{
+			v0 = s[i * 4 + 0];
+			v1 = s[i * 4 + 1];
+			v2 = s[i * 4 + 2];
+			v3 = s[i * 4 + 3];
+
+			GSVector4i::sw64(v0, v1, v2, v3);
+
+			(v0 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			(v1 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+
+			(v2 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[0]);
+			(v3 >> 28).gather32_32<>(pal, (GSVector4i*)&dst[16]);
+
+			dst += dstpitch;
+		}
+
+		#else
+		// Generic path: ReadBlock32 into an 8 * 8 uint32 temporary (row pitch sizeof(block) / 8 = 32 bytes), then ExpandBlock4HH_32 into dst.
+		__aligned(uint32, 32) block[8 * 8];
+
+		ReadBlock32(src, (uint8*)block, sizeof(block) / 8);
+
+		ExpandBlock4HH_32(block, dst, dstpitch, pal);
+
+		#endif
+	}
+
+	// TODO: ReadAndExpandBlock4HH_16
+};
diff --git a/plugins/GSdx_legacy/GSCapture.cpp b/plugins/GSdx_legacy/GSCapture.cpp
new file mode 100644
index 0000000000..d31a0873d5
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCapture.cpp
@@ -0,0 +1,569 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSCapture.h"
+#include "GSPng.h"
+#include "GSUtil.h"
+
+#ifdef _WIN32
+
+//
+// GSSource
+//
+
+#ifdef __INTEL_COMPILER // the same uuid is attached with whichever attribute syntax the compiler accepts
+interface __declspec(uuid("59C193BB-C520-41F3-BC1D-E245B80A86FA"))
+#else
+[uuid("59C193BB-C520-41F3-BC1D-E245B80A86FA")] interface
+#endif
+IGSSource : public IUnknown // COM interface through which GSCapture pushes frames into the GSSource filter
+{
+	STDMETHOD(DeliverNewSegment)() PURE; // start of a new stream segment
+	STDMETHOD(DeliverFrame)(const void* bits, int pitch, bool rgba) PURE; // one frame: pixel buffer, row pitch in bytes, and whether the channel order is RGBA
+	STDMETHOD(DeliverEOS)() PURE; // end of stream
+};
+
+#ifdef __INTEL_COMPILER // the same uuid is attached with whichever attribute syntax the compiler accepts
+class __declspec(uuid("F8BB6F4F-0965-4ED4-BA74-C6A01E6E6C77"))
+#else
+[uuid("F8BB6F4F-0965-4ED4-BA74-C6A01E6E6C77")] class
+#endif
+GSSource : public CBaseFilter, private CCritSec, public IGSSource // source filter with a single output pin; frames are pushed in through IGSSource
+{
+	GSVector2i m_size; // frame width (x) and height (y) in pixels
+	REFERENCE_TIME m_atpf; // duration of one frame: 10000000 / fps
+	REFERENCE_TIME m_now; // start time of the next frame; advanced by m_atpf after each delivered frame
+	// Answers IGSSource itself and defers every other interface to the base class.
+	STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void** ppv)
+	{
+		return
+			riid == __uuidof(IGSSource) ? GetInterface((IGSSource*)this, ppv) :
+			__super::NonDelegatingQueryInterface(riid, ppv);
+	}
+	// Output pin offering a YUY2 and an RGB32 video type of the given size; the preferred one is stored first in m_mts.
+	class GSSourceOutputPin : public CBaseOutputPin
+	{
+		GSVector2i m_size;
+		vector<CMediaType> m_mts;
+
+	public:
+		GSSourceOutputPin(const GSVector2i& size, REFERENCE_TIME atpf, CBaseFilter* pFilter, CCritSec* pLock, HRESULT& hr, int colorspace)
+			: CBaseOutputPin("GSSourceOutputPin", pFilter, pLock, &hr, L"Output")
+			, m_size(size)
+		{
+			CMediaType mt;
+			mt.majortype = MEDIATYPE_Video;
+			mt.formattype = FORMAT_VideoInfo;
+
+			VIDEOINFOHEADER vih;
+			memset(&vih, 0, sizeof(vih));
+			vih.AvgTimePerFrame = atpf;
+			vih.bmiHeader.biSize = sizeof(vih.bmiHeader);
+			vih.bmiHeader.biWidth = m_size.x;
+			vih.bmiHeader.biHeight = m_size.y;
+
+			// YUY2
+
+			mt.subtype = MEDIASUBTYPE_YUY2;
+			mt.lSampleSize = m_size.x * m_size.y * 2;
+
+			vih.bmiHeader.biCompression = '2YUY';
+			vih.bmiHeader.biPlanes = 1;
+			vih.bmiHeader.biBitCount = 16;
+			vih.bmiHeader.biSizeImage = m_size.x * m_size.y * 2;
+			mt.SetFormat((uint8*)&vih, sizeof(vih));
+
+			m_mts.push_back(mt);
+
+			// RGB32
+
+			mt.subtype = MEDIASUBTYPE_RGB32;
+			mt.lSampleSize = m_size.x * m_size.y * 4;
+
+			vih.bmiHeader.biCompression = BI_RGB;
+			vih.bmiHeader.biPlanes = 1;
+			vih.bmiHeader.biBitCount = 32;
+			vih.bmiHeader.biSizeImage = m_size.x * m_size.y * 4;
+			mt.SetFormat((uint8*)&vih, sizeof(vih));
+			// colorspace == 1 makes RGB32 the first (preferred) type, otherwise YUY2 stays first
+			if(colorspace == 1) m_mts.insert(m_mts.begin(), mt);
+			else m_mts.push_back(mt);
+		}
+		// Requests one buffer of lSampleSize bytes for the connected type; fails if the allocator grants a smaller buffer.
+		HRESULT DecideBufferSize(IMemAllocator* pAlloc, ALLOCATOR_PROPERTIES* pProperties)
+		{
+			ASSERT(pAlloc && pProperties);
+
+			HRESULT hr;
+
+			pProperties->cBuffers = 1;
+			pProperties->cbBuffer = m_mt.lSampleSize;
+
+			ALLOCATOR_PROPERTIES Actual;
+
+			if(FAILED(hr = pAlloc->SetProperties(pProperties, &Actual)))
+			{
+				return hr;
+			}
+
+			if(Actual.cbBuffer < pProperties->cbBuffer)
+			{
+				return E_FAIL;
+			}
+
+			ASSERT(Actual.cBuffers == pProperties->cBuffers);
+
+			return S_OK;
+		}
+		// Accepts a type when its major type and subtype match one of the offered types.
+	    HRESULT CheckMediaType(const CMediaType* pmt)
+		{
+			for(vector<CMediaType>::iterator i = m_mts.begin(); i != m_mts.end(); i++)
+			{
+				if(i->majortype == pmt->majortype && i->subtype == pmt->subtype)
+				{
+					return S_OK;
+				}
+			}
+
+			return E_FAIL;
+		}
+		// Enumerates the offered types by index; the upper bound follows m_mts instead of a hard-coded count.
+	    HRESULT GetMediaType(int i, CMediaType* pmt)
+		{
+			CheckPointer(pmt, E_POINTER);
+
+			if(i < 0) return E_INVALIDARG;
+			if(i >= (int)m_mts.size()) return VFW_S_NO_MORE_ITEMS;
+
+			*pmt = m_mts[i];
+
+			return S_OK;
+		}
+		// Quality-control messages are not handled.
+		STDMETHODIMP Notify(IBaseFilter* pSender, Quality q)
+		{
+			return E_NOTIMPL;
+		}
+		// Exposes the base class's m_mt, the type DecideBufferSize sizes its buffer from.
+		const CMediaType& CurrentMediaType()
+		{
+			return m_mt;
+		}
+	};
+
+	GSSourceOutputPin* m_output; // owned: created in the constructor, deleted in the destructor
+
+public:
+	// w/h: frame size in pixels, fps: frame rate used to derive m_atpf, hr: receives the base-class/pin construction result, colorspace: 1 prefers RGB32.
+	GSSource(int w, int h, float fps, IUnknown* pUnk, HRESULT& hr, int colorspace)
+		: CBaseFilter(NAME("GSSource"), pUnk, this, __uuidof(this), &hr)
+		, m_size(w, h)
+		, m_atpf((REFERENCE_TIME)(10000000.0f / fps))
+		, m_now(0)
+		, m_output(NULL) // initializers are listed in member declaration order, which is the order they run in
+	{
+		m_output = new GSSourceOutputPin(m_size, m_atpf, this, this, hr, colorspace);
+	}
+
+	virtual ~GSSource()
+	{
+		delete m_output;
+	}
+
+	DECLARE_IUNKNOWN;
+
+	int GetPinCount()
+	{
+		return 1;
+	}
+
+	CBasePin* GetPin(int n)
+	{
+		return n == 0 ? m_output : NULL;
+	}
+
+	// IGSSource
+
+	STDMETHODIMP DeliverNewSegment()
+	{
+		m_now = 0;
+
+		return m_output->DeliverNewSegment(0, _I64_MAX, 1.0);
+	}
+	// Converts one frame into the connected media type (YUY2 or RGB32) and delivers it downstream, stamped [m_now, m_now + m_atpf).
+	STDMETHODIMP DeliverFrame(const void* bits, int pitch, bool rgba)
+	{
+		if(!m_output || !m_output->IsConnected())
+		{
+			return E_UNEXPECTED;
+		}
+
+		CComPtr<IMediaSample> sample;
+
+		if(FAILED(m_output->GetDeliveryBuffer(&sample, NULL, NULL, 0)))
+		{
+			return E_FAIL;
+		}
+
+		REFERENCE_TIME start = m_now;
+		REFERENCE_TIME stop = m_now + m_atpf;
+
+		sample->SetTime(&start, &stop);
+		sample->SetSyncPoint(TRUE);
+
+		const CMediaType& mt = m_output->CurrentMediaType();
+
+		uint8* src = (uint8*)bits;
+		uint8* dst = NULL;
+		// Without a valid buffer pointer the conversion loops below would write through NULL.
+		if(FAILED(sample->GetPointer(&dst)) || dst == NULL) return E_FAIL;
+
+		int w = m_size.x;
+		int h = m_size.y;
+		int srcpitch = pitch;
+
+		if(mt.subtype == MEDIASUBTYPE_YUY2)
+		{
+			int dstpitch = ((VIDEOINFOHEADER*)mt.Format())->bmiHeader.biWidth * 2;
+
+			GSVector4 ys(0.257f, 0.504f, 0.098f, 0.0f);
+			GSVector4 us(-0.148f / 2, -0.291f / 2, 0.439f / 2, 0.0f);
+			GSVector4 vs(0.439f / 2, -0.368f / 2, -0.071f / 2, 0.0f);
+
+			if(!rgba)
+			{
+				ys = ys.zyxw(); // swap the first and third coefficients for the other channel order
+				us = us.zyxw();
+				vs = vs.zyxw();
+			}
+
+			const GSVector4 offset(16, 128, 16, 128);
+
+			for(int j = 0; j < h; j++, dst += dstpitch, src += srcpitch)
+			{
+				uint32* s = (uint32*)src;
+				uint16* d = (uint16*)dst;
+				// two source pixels produce one 32-bit output word per iteration (reads s[i + 1], so w is expected to be even)
+				for(int i = 0; i < w; i += 2)
+				{
+					GSVector4 c0 = GSVector4::rgba32(s[i + 0]);
+					GSVector4 c1 = GSVector4::rgba32(s[i + 1]);
+					GSVector4 c2 = c0 + c1;
+
+					GSVector4 lo = (c0 * ys).hadd(c2 * us);
+					GSVector4 hi = (c1 * ys).hadd(c2 * vs);
+
+					GSVector4 c = lo.hadd(hi) + offset;
+
+					*((uint32*)&d[i]) = GSVector4i(c).rgba32();
+				}
+			}
+		}
+		else if(mt.subtype == MEDIASUBTYPE_RGB32)
+		{
+			int dstpitch = ((VIDEOINFOHEADER*)mt.Format())->bmiHeader.biWidth * 4;
+			// write rows bottom-up: start at the last destination row and step backwards
+			dst += dstpitch * (h - 1);
+			dstpitch = -dstpitch;
+
+			for(int j = 0; j < h; j++, dst += dstpitch, src += srcpitch)
+			{
+				if(rgba)
+				{
+					#if _M_SSE >= 0x301
+					// swap bytes 0 and 2 of every pixel, four pixels per vector (w >> 2 vectors per row)
+					GSVector4i* s = (GSVector4i*)src;
+					GSVector4i* d = (GSVector4i*)dst;
+
+					GSVector4i mask(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+
+					for(int i = 0, w4 = w >> 2; i < w4; i++)
+					{
+						d[i] = s[i].shuffle8(mask);
+					}
+
+					#else
+					// same byte swap with shifts and masks; the top byte of every pixel is written as zero
+					GSVector4i* s = (GSVector4i*)src;
+					GSVector4i* d = (GSVector4i*)dst;
+
+					for(int i = 0, w4 = w >> 2; i < w4; i++)
+					{
+						d[i] = ((s[i] & 0x00ff0000) >> 16) | ((s[i] & 0x000000ff) << 16) | (s[i] & 0x0000ff00);
+					}
+
+					#endif
+				}
+				else
+				{
+					memcpy(dst, src, w * 4);
+				}
+			}
+		}
+		else
+		{
+			return E_FAIL;
+		}
+
+		if(FAILED(m_output->Deliver(sample)))
+		{
+			return E_FAIL;
+		}
+
+		m_now = stop;
+
+		return S_OK;
+	}
+
+	STDMETHODIMP DeliverEOS()
+	{
+		return m_output->DeliverEndOfStream();
+	}
+};
+
+#define BeginEnumPins(pBaseFilter, pEnumPins, pPin) /* opens a loop over pBaseFilter's pins with pPin as a CComPtr<IPin>; must be closed with EndEnumPins */ \
+	{CComPtr<IEnumPins> pEnumPins; \
+	if(pBaseFilter && SUCCEEDED(pBaseFilter->EnumPins(&pEnumPins))) \
+	{ \
+		for(CComPtr<IPin> pPin; S_OK == pEnumPins->Next(1, &pPin, 0); pPin = NULL) \
+		{ \
+
+#define EndEnumPins }}} // closes the three braces opened by BeginEnumPins
+
+static IPin* GetFirstPin(IBaseFilter* pBF, PIN_DIRECTION dir)
+{
+	if(!pBF) return(NULL);
+	// Returns the first pin of pBF whose direction is dir, or NULL. The pin is Released before returning, so the caller receives a non-owning pointer (presumably kept alive by the filter - confirm).
+	BeginEnumPins(pBF, pEP, pPin)
+	{
+		PIN_DIRECTION dir2;
+		// Skip pins whose direction query fails instead of comparing an uninitialized dir2.
+		if(SUCCEEDED(pPin->QueryDirection(&dir2)) && dir == dir2)
+		{
+			IPin* pRet = pPin.Detach();
+			pRet->Release();
+			return(pRet);
+		}
+	}
+	EndEnumPins
+
+	return(NULL);
+}
+
+#endif
+
+//
+// GSCapture
+//
+
+GSCapture::GSCapture()
+	: m_capturing(false)
+	, m_frame(0)
+	, m_out_dir(theApp.GetConfig("capture_out_dir", "/tmp/GSdx_Capture")) // directory that receives the captured PNG frames
+	, m_threads(theApp.GetConfig("capture_threads", 4)) // number of GSPng workers created per capture
+{
+#ifdef __linux__
+	m_compression_level = theApp.GetConfig("png_compression_level", Z_BEST_SPEED);
+#endif
+}
+
+GSCapture::~GSCapture()
+{
+	EndCapture(); // stop any capture that is still running before the object goes away
+}
+
+bool GSCapture::BeginCapture(float fps, GSVector2i recomendedResolution, float aspect)
+{
+	printf("Recomended resolution: %d x %d, DAR for muxing: %.4f\n", recomendedResolution.x, recomendedResolution.y, aspect); // logged before m_lock is taken; these two arguments are only used for this message
+	std::lock_guard<std::recursive_mutex> lock(m_lock);
+	// Starts a capture and returns true on success. fps must be non-zero.
+	ASSERT(fps != 0);
+	// Stop any capture that is already running before starting a new one.
+	EndCapture();
+
+#ifdef _WIN32
+	// Windows: ask the user for size, file name and encoder, then build and run a DirectShow graph.
+	GSCaptureDlg dlg;
+
+	if(IDOK != dlg.DoModal()) return false;
+	// Round the requested size up to a multiple of 8.
+	m_size.x = (dlg.m_width + 7) & ~7;
+	m_size.y = (dlg.m_height + 7) & ~7;
+
+	wstring fn(dlg.m_filename.begin(), dlg.m_filename.end());
+
+	// Create the filter graph and an AVI mux that writes to fn.
+
+	HRESULT hr;
+
+	CComPtr<ICaptureGraphBuilder2> cgb;
+	CComPtr<IBaseFilter> mux;
+
+	if(FAILED(hr = m_graph.CoCreateInstance(CLSID_FilterGraph))
+	|| FAILED(hr = cgb.CoCreateInstance(CLSID_CaptureGraphBuilder2))
+	|| FAILED(hr = cgb->SetFiltergraph(m_graph))
+	|| FAILED(hr = cgb->SetOutputFileName(&MEDIASUBTYPE_Avi, fn.c_str(), &mux, NULL)))
+	{
+		return false;
+	}
+	// NOTE(review): the hr written by the GSSource constructor is not checked before it is overwritten below.
+	m_src = new GSSource(m_size.x, m_size.y, fps, NULL, hr, dlg.m_colorspace);
+	// Without an encoder the source connects straight to the mux; otherwise source -> encoder -> mux.
+	if (dlg.m_enc==0)
+	{
+		if (FAILED(hr = m_graph->AddFilter(m_src, L"Source")))
+			return false;
+		if (FAILED(hr = m_graph->ConnectDirect(GetFirstPin(m_src, PINDIR_OUTPUT), GetFirstPin(mux, PINDIR_INPUT), NULL)))
+			return false;
+	}
+	else
+	{
+		if(FAILED(hr = m_graph->AddFilter(m_src, L"Source"))
+		|| FAILED(hr = m_graph->AddFilter(dlg.m_enc, L"Encoder")))
+		{
+			return false;
+		}
+
+		if(FAILED(hr = m_graph->ConnectDirect(GetFirstPin(m_src, PINDIR_OUTPUT), GetFirstPin(dlg.m_enc, PINDIR_INPUT), NULL))
+		|| FAILED(hr = m_graph->ConnectDirect(GetFirstPin(dlg.m_enc, PINDIR_OUTPUT), GetFirstPin(mux, PINDIR_INPUT), NULL)))
+		{
+			return false;
+		}
+	}
+	// Print every filter in the graph together with its pins; the media type loop body is empty.
+	BeginEnumFilters(m_graph, pEF, pBF)
+	{
+		CFilterInfo fi;
+		pBF->QueryFilterInfo(&fi);
+		wstring s(fi.achName);
+		printf("Filter [%p]: %s\n", pBF.p, string(s.begin(), s.end()).c_str());
+
+		BeginEnumPins(pBF, pEP, pPin)
+		{
+			CComPtr<IPin> pPinTo;
+			pPin->ConnectedTo(&pPinTo);
+
+			CPinInfo pi;
+			pPin->QueryPinInfo(&pi);
+			wstring s(pi.achName);
+			printf("- Pin [%p - %p]: %s (%s)\n", pPin.p, pPinTo.p, string(s.begin(), s.end()).c_str(), pi.dir ? "out" : "in");
+
+			BeginEnumMediaTypes(pPin, pEMT, pmt)
+			{
+			}
+			EndEnumMediaTypes(pmt)
+		}
+		EndEnumPins
+	}
+	EndEnumFilters
+	// Run the graph, then announce a new segment to the source; the result of Run() is stored in hr but not checked.
+	hr = CComQIPtr<IMediaControl>(m_graph)->Run();
+
+	CComQIPtr<IGSSource>(m_src)->DeliverNewSegment();
+
+#elif __linux__
+	// Original author's note: GSmkdir probably cannot create several nested directory levels at once (unverified).
+	GSmkdir(m_out_dir.c_str());
+
+	// Really cheap recording
+	m_frame = 0;
+	// Add option !!!
+	m_size.x = theApp.GetConfig("capture_resx", 1280);
+	m_size.y = theApp.GetConfig("capture_resy", 1024);
+	// Each call appends m_threads new workers to m_workers.
+	for(int i = 0; i < m_threads; i++) {
+		m_workers.push_back(new GSPng::Worker());
+	}
+#endif
+
+	m_capturing = true;
+
+	return true;
+}
+
+bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
+{
+	std::lock_guard<std::recursive_mutex> lock(m_lock);
+	// Hands one frame (pixel buffer bits, row pitch in bytes) to the active capture backend; rgba is forwarded to the Windows source filter only.
+	if(bits == NULL || pitch == 0)
+	{
+		ASSERT(0);
+		// Returns true when the frame was handed over, false on bad arguments or when no backend is ready.
+		return false;
+	}
+
+#ifdef _WIN32
+
+	if(m_src)
+	{
+		CComQIPtr<IGSSource>(m_src)->DeliverFrame(bits, pitch, rgba);
+
+		return true;
+	}
+
+#elif __linux__
+	// No workers (capture not started, or capture_threads <= 0): skip the frame instead of indexing an empty vector or dividing by zero.
+	if(!m_workers.empty())
+	{
+		std::string out_file = m_out_dir + format("/frame.%010llu.png", (unsigned long long)m_frame); // m_frame is 64-bit, so the conversion must match its width
+		m_workers[m_frame % m_workers.size()]->Push(shared_ptr<GSPng::Transaction>(new GSPng::Transaction(GSPng::RGB_PNG, out_file, static_cast<const uint8*>(bits), m_size.x, m_size.y, pitch, m_compression_level)));
+		m_frame++;
+		return true; // the frame was queued; mirrors the Windows success path
+	}
+#endif
+	return false;
+}
+
+bool GSCapture::EndCapture()
+{
+	std::lock_guard<std::recursive_mutex> lock(m_lock);
+	// Stops the running capture, if any, and releases its backend resources. Safe to call when nothing is being captured; always returns true.
+#ifdef _WIN32
+
+	if(m_src)
+	{
+		CComQIPtr<IGSSource>(m_src)->DeliverEOS();
+
+		m_src = NULL;
+	}
+
+	if(m_graph)
+	{
+		CComQIPtr<IMediaControl>(m_graph)->Stop();
+
+		m_graph = NULL;
+	}
+
+#elif __linux__
+	for(size_t i = 0; i < m_workers.size(); i++) {
+		m_workers[i]->Wait(); // let the queued PNG writes finish first
+		delete m_workers[i]; // BeginCapture allocates a fresh set; without this every capture leaked the previous workers
+	}
+	m_workers.clear();
+	m_frame = 0;
+#endif
+
+	m_capturing = false;
+
+	return true;
+}
diff --git a/plugins/GSdx_legacy/GSCapture.h b/plugins/GSdx_legacy/GSCapture.h
new file mode 100644
index 0000000000..3fdb7afde1
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCapture.h
@@ -0,0 +1,62 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSVector.h"
+#include "GSPng.h"
+
+#ifdef _WIN32
+#include "GSCaptureDlg.h"
+#endif
+
+// Records rendered frames. The Windows members are a DirectShow graph and
+// its source filter; the Linux members are a set of GSPng workers that
+// write one PNG file per frame.
+class GSCapture
+{
+	std::recursive_mutex m_lock; // held by DeliverFrame and EndCapture
+	bool m_capturing; // value returned by IsCapturing()
+	GSVector2i m_size; // value returned by GetSize()
+	uint64 m_frame; // Linux: number of the next frame, used in the PNG file name
+	std::string m_out_dir; // Linux: directory the PNG frames are written to
+	int m_threads; // Linux: number of PNG workers BeginCapture creates
+
+	#ifdef _WIN32
+
+	CComPtr<IGraphBuilder> m_graph; // stopped and released by EndCapture
+	CComPtr<IBaseFilter> m_src; // queried for IGSSource to deliver frames and EOS
+
+	#elif __linux__
+
+	vector<GSPng::Worker*> m_workers; // raw pointers allocated with new in BeginCapture
+	int m_compression_level; // passed to every GSPng::Transaction
+
+	#endif
+
+public:
+	GSCapture();
+	virtual ~GSCapture();
+
+	bool BeginCapture(float fps, GSVector2i recomendedResolution, float aspect);
+	bool DeliverFrame(const void* bits, int pitch, bool rgba);
+	bool EndCapture();
+
+	bool IsCapturing() {return m_capturing;}
+	GSVector2i GetSize() {return m_size;}
+};
diff --git a/plugins/GSdx_legacy/GSCaptureDlg.cpp b/plugins/GSdx_legacy/GSCaptureDlg.cpp
new file mode 100644
index 0000000000..7c34743f6f
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCaptureDlg.cpp
@@ -0,0 +1,211 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSCaptureDlg.h"
+
+// Creates the capture dialog and loads the last used width, height and
+// output file name from the configuration.
+GSCaptureDlg::GSCaptureDlg()
+	: GSDialog(IDD_CAPTURE)
+	, m_colorspace(0) // 0 is the item data of the default "YUY2" entry; never leave it indeterminate
+{
+	m_width = theApp.GetConfig("CaptureWidth", 640);
+	m_height = theApp.GetConfig("CaptureHeight", 480);
+	m_filename = theApp.GetConfig("CaptureFileName", "");
+}
+
+// Resolves the entry currently selected in the codec combo box.
+//
+// Returns 0 when nothing usable is selected (no selection, or the codec
+// filter could not be bound), 1 when c holds a codec with a bound filter,
+// and 2 when the entry with item data 0 ("Uncompressed") is selected; c is
+// left untouched in the 0-without-selection and 2 cases.
+int GSCaptureDlg::GetSelCodec(Codec& c)
+{
+	INT_PTR itemData = 0;
+
+	if(!ComboBoxGetSelData(IDC_CODECS, itemData))
+	{
+		return 0;
+	}
+
+	if(itemData == 0)
+	{
+		return 2;
+	}
+
+	// The item data is the address of a Codec stored in m_codecs; work on a copy.
+	c = *(Codec*)itemData;
+
+	if(c.filter)
+	{
+		return 1;
+	}
+
+	// Bind lazily; success is judged by whether the filter pointer got set.
+	c.moniker->BindToObject(NULL, NULL, __uuidof(IBaseFilter), (void**)&c.filter);
+
+	return c.filter ? 1 : 0;
+}
+
+// Populates the dialog controls: width, height and file name from the
+// members, the colorspace combo with its two fixed entries, and the codec
+// combo with "Uncompressed" plus one entry per enumerated video compressor.
+void GSCaptureDlg::OnInit()
+{
+	__super::OnInit();
+
+	SetTextAsInt(IDC_WIDTH, m_width);
+	SetTextAsInt(IDC_HEIGHT, m_height);
+	SetText(IDC_FILENAME, m_filename.c_str());
+
+	m_codecs.clear();
+
+	// Display name of the codec stored in the configuration; compared below
+	// against each enumerated codec to decide which entry is preselected.
+	_bstr_t selected = theApp.GetConfig("CaptureVideoCodecDisplayName", "").c_str();
+
+	// Item data 0 marks the "Uncompressed" entry (GetSelCodec returns 2 for it).
+	ComboBoxAppend(IDC_CODECS, "Uncompressed", 0, true);
+
+	ComboBoxAppend(IDC_COLORSPACE, "YUY2", 0, true);
+	ComboBoxAppend(IDC_COLORSPACE, "RGB32", 1, false);
+
+	CoInitialize(0); // this is obviously wrong here, each thread should call this on start, and where is CoUninitalize?
+
+	BeginEnumSysDev(CLSID_VideoCompressorCategory, moniker)
+	{
+		Codec c;
+
+		c.moniker = moniker;
+
+		wstring prefix;
+
+		LPOLESTR str = NULL;
+
+		// Codecs whose display name or friendly name cannot be read are skipped.
+		if(FAILED(moniker->GetDisplayName(NULL, NULL, &str)))
+			continue;
+
+		// Tag the entry according to the device prefix found in the display name.
+		if(wcsstr(str, L"@device:dmo:")) prefix = L"(DMO) ";
+		else if(wcsstr(str, L"@device:sw:")) prefix = L"(DS) ";
+		else if(wcsstr(str, L"@device:cm:")) prefix = L"(VfW) ";
+
+		c.DisplayName = str;
+
+		// str was allocated by GetDisplayName; DisplayName holds its own copy now.
+		CoTaskMemFree(str);
+
+		CComPtr<IPropertyBag> pPB;
+
+		if(FAILED(moniker->BindToStorage(0, 0, IID_IPropertyBag, (void**)&pPB)))
+			continue;
+
+		_variant_t var;
+
+		if(FAILED(pPB->Read(_bstr_t(_T("FriendlyName")), &var, NULL)))
+			continue;
+
+		c.FriendlyName = prefix + var.bstrVal;
+
+		m_codecs.push_back(c);
+
+		// Element-wise narrowing of the wide name for the combo box text.
+		// NOTE(review): characters outside the char range are truncated here.
+		string s(c.FriendlyName.begin(), c.FriendlyName.end());
+
+		// The item data is the address of the element just stored in m_codecs.
+		ComboBoxAppend(IDC_CODECS, s.c_str(), (LPARAM)&m_codecs.back(), c.DisplayName == selected);
+	}
+	EndEnumSysDev
+}
+
+// Handles the dialog's button commands.
+//
+// IDC_BROWSE    - lets the user pick the output .avi file; returns true.
+// IDC_CONFIGURE - shows the configuration UI of the selected codec, either
+//                 its property pages or its VfW dialog; returns true.
+// IDOK          - reads the controls back into the members and stores them
+//                 in the configuration; returns false when no usable codec
+//                 is selected, otherwise falls through to the base class.
+// Anything else is forwarded to the base class.
+bool GSCaptureDlg::OnCommand(HWND hWnd, UINT id, UINT code)
+{
+	if(id == IDC_BROWSE && code == BN_CLICKED)
+	{
+		char buff[MAX_PATH] = {0};
+
+		OPENFILENAME ofn;
+
+		memset(&ofn, 0, sizeof(ofn));
+
+		ofn.lStructSize = sizeof(ofn);
+		ofn.hwndOwner = m_hWnd;
+		ofn.lpstrFile = buff;
+		ofn.nMaxFile = countof(buff);
+		ofn.lpstrFilter = "Avi files (*.avi)\0*.avi\0";
+		ofn.Flags = OFN_EXPLORER | OFN_ENABLESIZING | OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT | OFN_PATHMUSTEXIST;
+
+		// m_filename comes from the configuration or the edit box and may be
+		// longer than the buffer. buff is zero-filled, so copying at most
+		// countof(buff) - 1 characters always leaves it terminated.
+		strncpy(buff, m_filename.c_str(), countof(buff) - 1);
+
+		if(GetSaveFileName(&ofn))
+		{
+			m_filename = ofn.lpstrFile;
+
+			SetText(IDC_FILENAME, m_filename.c_str());
+		}
+
+		return true;
+	}
+	else if(id == IDC_CONFIGURE && code == BN_CLICKED)
+	{
+		Codec c;
+
+		if(GetSelCodec(c) == 1)
+		{
+			if(CComQIPtr<ISpecifyPropertyPages> pSPP = c.filter)
+			{
+				CAUUID caGUID;
+
+				memset(&caGUID, 0, sizeof(caGUID));
+
+				if(SUCCEEDED(pSPP->GetPages(&caGUID)))
+				{
+					IUnknown* lpUnk = NULL;
+
+					// Only show the frame (and release) when the query really
+					// produced an interface pointer.
+					if(SUCCEEDED(pSPP.QueryInterface(&lpUnk)) && lpUnk)
+					{
+						OleCreatePropertyFrame(m_hWnd, 0, 0, c.FriendlyName.c_str(), 1, (IUnknown**)&lpUnk, caGUID.cElems, caGUID.pElems, 0, 0, NULL);
+						lpUnk->Release();
+					}
+
+					// The page array is allocated by GetPages and owned by the caller.
+					if(caGUID.pElems) CoTaskMemFree(caGUID.pElems);
+				}
+			}
+			else if(CComQIPtr<IAMVfwCompressDialogs> pAMVfWCD = c.filter)
+			{
+				// Ask first whether the codec offers a config dialog at all.
+				if(pAMVfWCD->ShowDialog(VfwCompressDialog_QueryConfig, NULL) == S_OK)
+				{
+					pAMVfWCD->ShowDialog(VfwCompressDialog_Config, m_hWnd);
+				}
+			}
+		}
+
+		return true;
+	}
+	else if(id == IDOK)
+	{
+		m_width = GetTextAsInt(IDC_WIDTH);
+		m_height = GetTextAsInt(IDC_HEIGHT);
+		m_filename = GetText(IDC_FILENAME);
+
+		// Read the selection into a real INT_PTR variable. A cast expression
+		// such as (INT_PTR)m_colorspace is not an lvalue in standard C++, so
+		// the result could not be stored in the member through it.
+		INT_PTR colorspace = 0;
+
+		if(ComboBoxGetSelData(IDC_COLORSPACE, colorspace))
+		{
+			m_colorspace = (int)colorspace;
+		}
+
+		Codec c;
+
+		int ris = GetSelCodec(c);
+		if(ris == 0)
+		{
+			return false;
+		}
+
+		m_enc = c.filter;
+
+		theApp.SetConfig("CaptureWidth", m_width);
+		theApp.SetConfig("CaptureHeight", m_height);
+		theApp.SetConfig("CaptureFileName", m_filename.c_str());
+
+		// ris == 2 means "Uncompressed": store an empty codec name for it.
+		if (ris != 2)
+		{
+			theApp.SetConfig("CaptureVideoCodecDisplayName", c.DisplayName);
+		}
+		else
+		{
+			theApp.SetConfig("CaptureVideoCodecDisplayName", "");
+		}
+	}
+
+	return __super::OnCommand(hWnd, id, code);
+}
diff --git a/plugins/GSdx_legacy/GSCaptureDlg.h b/plugins/GSdx_legacy/GSCaptureDlg.h
new file mode 100644
index 0000000000..58773750d8
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCaptureDlg.h
@@ -0,0 +1,54 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDialog.h"
+#include "resource.h"
+#include "baseclasses/streams.h"
+
+// Dialog that collects the capture settings: output size, output file,
+// colorspace and video codec.
+class GSCaptureDlg : public GSDialog
+{
+	// One enumerated video compressor.
+	struct Codec
+	{
+		CComPtr<IMoniker> moniker; // moniker the filter is bound from
+		CComPtr<IBaseFilter> filter; // bound on demand by GetSelCodec
+		wstring FriendlyName; // text shown in the codec combo box
+		_bstr_t DisplayName; // value stored in the configuration to remember the selection
+	};
+
+	// A list keeps element addresses stable; the combo box stores them as item data.
+	list<Codec> m_codecs;
+
+	int GetSelCodec(Codec& c);
+
+protected:
+	void OnInit();
+	bool OnCommand(HWND hWnd, UINT id, UINT code);
+
+public:
+	GSCaptureDlg();
+
+	int m_width; // loaded from and saved to "CaptureWidth"
+	int m_height; // loaded from and saved to "CaptureHeight"
+	string m_filename; // loaded from and saved to "CaptureFileName"
+	int m_colorspace; // item data of the colorspace combo: 0 = "YUY2", 1 = "RGB32"
+	CComPtr<IBaseFilter> m_enc; // filter of the codec chosen on OK
+};
diff --git a/plugins/GSdx_legacy/GSClut.cpp b/plugins/GSdx_legacy/GSClut.cpp
new file mode 100644
index 0000000000..a7eab9a4b2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSClut.cpp
@@ -0,0 +1,744 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSClut.h"
+#include "GSLocalMemory.h"
+
+#define CLUT_ALLOC_SIZE (2 * 4096)
+
+// Allocates the CLUT storage and fills the writer dispatch table m_wc,
+// which is indexed by [CSM][CPSM][PSM].
+GSClut::GSClut(GSLocalMemory* mem)
+	: m_mem(mem)
+{
+	// A single allocation backs all three buffers; they are carved out of it
+	// by byte offset.
+	uint8* base = (uint8*)vmalloc(CLUT_ALLOC_SIZE, false);
+
+	m_clut = (uint16*)(base + 0); // 1k + 1k for mirrored area simulating wrapping memory
+	m_buff32 = (uint32*)(base + 2048); // 1k
+	m_buff64 = (uint64*)(base + 4096); // 2k
+
+	m_write.dirty = true;
+	m_read.dirty = true;
+
+	// Every combination starts out on the null writer; the supported ones are
+	// overridden below.
+	for(int csm = 0; csm < 2; csm++)
+	{
+		for(int cpsm = 0; cpsm < 16; cpsm++)
+		{
+			for(int psm = 0; psm < 64; psm++)
+			{
+				m_wc[csm][cpsm][psm] = &GSClut::WriteCLUT_NULL;
+			}
+		}
+	}
+
+	// Index formats that use the 8-bit (256 entry) writers.
+	const int idx8[] = {PSM_PSMT8, PSM_PSMT8H};
+	// Index formats that use the 4-bit (16 entry) writers.
+	const int idx4[] = {PSM_PSMT4, PSM_PSMT4HL, PSM_PSMT4HH};
+
+	for(size_t k = 0; k < sizeof(idx8) / sizeof(idx8[0]); k++)
+	{
+		const int psm = idx8[k];
+
+		// CSM1; the 24-bit color format shares the 32-bit writer.
+		m_wc[0][PSM_PSMCT32][psm] = &GSClut::WriteCLUT32_I8_CSM1;
+		m_wc[0][PSM_PSMCT24][psm] = &GSClut::WriteCLUT32_I8_CSM1;
+		m_wc[0][PSM_PSMCT16][psm] = &GSClut::WriteCLUT16_I8_CSM1;
+		m_wc[0][PSM_PSMCT16S][psm] = &GSClut::WriteCLUT16S_I8_CSM1;
+
+		// CSM2
+		m_wc[1][PSM_PSMCT32][psm] = &GSClut::WriteCLUT32_CSM2<256>;
+		m_wc[1][PSM_PSMCT24][psm] = &GSClut::WriteCLUT32_CSM2<256>;
+		m_wc[1][PSM_PSMCT16][psm] = &GSClut::WriteCLUT16_CSM2<256>;
+		m_wc[1][PSM_PSMCT16S][psm] = &GSClut::WriteCLUT16S_CSM2<256>;
+	}
+
+	for(size_t k = 0; k < sizeof(idx4) / sizeof(idx4[0]); k++)
+	{
+		const int psm = idx4[k];
+
+		// CSM1; the 24-bit color format shares the 32-bit writer.
+		m_wc[0][PSM_PSMCT32][psm] = &GSClut::WriteCLUT32_I4_CSM1;
+		m_wc[0][PSM_PSMCT24][psm] = &GSClut::WriteCLUT32_I4_CSM1;
+		m_wc[0][PSM_PSMCT16][psm] = &GSClut::WriteCLUT16_I4_CSM1;
+		m_wc[0][PSM_PSMCT16S][psm] = &GSClut::WriteCLUT16S_I4_CSM1;
+
+		// CSM2
+		m_wc[1][PSM_PSMCT32][psm] = &GSClut::WriteCLUT32_CSM2<16>;
+		m_wc[1][PSM_PSMCT24][psm] = &GSClut::WriteCLUT32_CSM2<16>;
+		m_wc[1][PSM_PSMCT16][psm] = &GSClut::WriteCLUT16_CSM2<16>;
+		m_wc[1][PSM_PSMCT16S][psm] = &GSClut::WriteCLUT16S_CSM2<16>;
+	}
+}
+
+// Releases the allocation made in the constructor; m_clut points at its start.
+GSClut::~GSClut()
+{
+	vmfree(m_clut, CLUT_ALLOC_SIZE);
+}
+
+// Marks the write state dirty so that WriteState::IsDirty reports true
+// regardless of the registers it is given.
+void GSClut::Invalidate()
+{
+	m_write.dirty = true;
+}
+
+// Decides from TEX0.CLD whether the CLUT has to be (re)written.
+//
+// CLD 0 never writes, CLD 1 only depends on the write state, CLD 2/3 record
+// CBP in m_CBP[0]/m_CBP[1] first, CLD 4/5 bail out when CBP equals the
+// recorded value and record it otherwise, CLD 6/7 assert and never write.
+// When not rejected, the result is m_write.IsDirty(TEX0, TEXCLUT).
+bool GSClut::WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	switch(TEX0.CLD)
+	{
+	case 0:
+		return false;
+
+	case 1:
+		break;
+
+	case 2:
+	case 3:
+		// 2 -> m_CBP[0], 3 -> m_CBP[1]
+		m_CBP[TEX0.CLD - 2] = TEX0.CBP;
+		break;
+
+	case 4:
+	case 5:
+		{
+			// 4 -> m_CBP[0], 5 -> m_CBP[1]
+			uint32& recorded = m_CBP[TEX0.CLD - 4];
+
+			if(recorded == TEX0.CBP)
+			{
+				return false;
+			}
+
+			recorded = TEX0.CBP;
+		}
+		break;
+
+	case 6: // ffx2 menu
+		ASSERT(0);
+		return false;
+
+	case 7: // ford mustang racing // Bouken Jidai Katsugeki Goemon
+		ASSERT(0);
+		return false;
+
+	default:
+		__assume(0);
+	}
+
+	return m_write.IsDirty(TEX0, TEXCLUT);
+}
+
+// Records TEX0/TEXCLUT as the current write state, runs the writer selected
+// by (CSM, CPSM, PSM) and then copies the freshly written entries between
+// the two halves of m_clut so that the buffer behaves like wrapping memory.
+void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	m_write.TEX0 = TEX0;
+	m_write.TEXCLUT = TEXCLUT;
+	m_write.dirty = false;
+	m_read.dirty = true;
+
+	const writeCLUT writer = m_wc[TEX0.CSM][TEX0.CPSM][TEX0.PSM];
+
+	(this->*writer)(TEX0, TEXCLUT);
+
+	// Mirror write to other half of buffer to simulate wrapping memory
+
+	// Color formats below PSM_PSMCT16 use a 4-bit CSA mask, the rest 5 bits.
+	const bool below16 = TEX0.CPSM < PSM_PSMCT16;
+	const int offset = (TEX0.CSA & (below16 ? 15 : 31)) * 16;
+
+	uint16* const mirror = m_clut + 512;
+
+	if(TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H)
+	{
+		const int size = below16 ? 512 : 256;
+
+		// Part that fits before the end of the first half ...
+		const int head = min(size, 512 - offset);
+		// ... and the part that ran past it and has to be copied back to the start.
+		const int tail = max(0, size + offset - 512);
+
+		memcpy(mirror + offset, m_clut + offset, sizeof(*m_clut) * head);
+		memcpy(m_clut, mirror, sizeof(*m_clut) * tail);
+
+		return;
+	}
+
+	const int size = 16;
+
+	memcpy(mirror + offset, m_clut + offset, sizeof(*m_clut) * size);
+
+	if(below16)
+	{
+		// These formats keep a second run of entries 256 elements further on.
+		memcpy(mirror + 256 + offset, m_clut + 256 + offset, sizeof(*m_clut) * size);
+	}
+}
+
+// CSM1 writer for 32-bit colors with an 8-bit index format: reads from the
+// block at TEX0.CBP and writes at entry (CSA & 15) * 16 of m_clut.
+void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	ALIGN_STACK(32);
+
+	const uint32* block = (uint32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + ((TEX0.CSA & 15) << 4);
+
+	WriteCLUT_T32_I8_CSM1(block, target);
+}
+
+// CSM1 writer for 32-bit colors with a 4-bit index format: reads from the
+// block at TEX0.CBP and writes at entry (CSA & 15) * 16 of m_clut.
+void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	ALIGN_STACK(32);
+
+	const uint32* block = (uint32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + ((TEX0.CSA & 15) << 4);
+
+	WriteCLUT_T32_I4_CSM1(block, target);
+}
+
+// CSM1 writer for PSMCT16 colors with an 8-bit index format: reads from the
+// block at TEX0.CBP and writes at entry CSA * 16 of m_clut.
+void GSClut::WriteCLUT16_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	const uint16* block = (uint16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + (TEX0.CSA << 4);
+
+	WriteCLUT_T16_I8_CSM1(block, target);
+}
+
+// CSM1 writer for PSMCT16 colors with a 4-bit index format: reads from the
+// block at TEX0.CBP and writes at entry CSA * 16 of m_clut.
+void GSClut::WriteCLUT16_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	const uint16* block = (uint16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + (TEX0.CSA << 4);
+
+	WriteCLUT_T16_I4_CSM1(block, target);
+}
+
+// CSM1 writer for PSMCT16S colors with an 8-bit index format: same as the
+// PSMCT16 variant except that the source block is located with BlockPtr16S.
+void GSClut::WriteCLUT16S_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	const uint16* block = (uint16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + (TEX0.CSA << 4);
+
+	WriteCLUT_T16_I8_CSM1(block, target);
+}
+
+// CSM1 writer for PSMCT16S colors with a 4-bit index format: same as the
+// PSMCT16 variant except that the source block is located with BlockPtr16S.
+void GSClut::WriteCLUT16S_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	const uint16* block = (uint16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1);
+	uint16* target = m_clut + (TEX0.CSA << 4);
+
+	WriteCLUT_T16_I4_CSM1(block, target);
+}
+
+// CSM2 writer for 32-bit colors: copies n texels from the row TEXCLUT.COV,
+// starting at column TEXCLUT.COU * 16, and stores the low 16 bits of each at
+// entry i and the high 16 bits at entry i + 256.
+template<int n> void GSClut::WriteCLUT32_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	GSOffset* layout = m_mem->GetOffset(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT32);
+
+	uint32* RESTRICT rowBase = &m_mem->m_vm32[layout->pixel.row[TEXCLUT.COV]];
+	int* RESTRICT colOffset = &layout->pixel.col[0][TEXCLUT.COU << 4];
+
+	uint16* RESTRICT entries = m_clut + ((TEX0.CSA & 15) << 4);
+
+	for(int idx = 0; idx < n; ++idx)
+	{
+		const uint32 texel = rowBase[colOffset[idx]];
+
+		entries[idx] = (uint16)(texel & 0xffff);
+		entries[idx + 256] = (uint16)(texel >> 16);
+	}
+}
+
+// CSM2 writer for PSMCT16 colors: copies n 16-bit texels from the row
+// TEXCLUT.COV, starting at column TEXCLUT.COU * 16, to entry CSA * 16 onwards.
+template<int n> void GSClut::WriteCLUT16_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	GSOffset* layout = m_mem->GetOffset(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT16);
+
+	uint16* RESTRICT rowBase = &m_mem->m_vm16[layout->pixel.row[TEXCLUT.COV]];
+	int* RESTRICT colOffset = &layout->pixel.col[0][TEXCLUT.COU << 4];
+
+	uint16* RESTRICT entries = m_clut + (TEX0.CSA << 4);
+
+	for(int idx = 0; idx < n; ++idx)
+	{
+		entries[idx] = rowBase[colOffset[idx]];
+	}
+}
+
+// CSM2 writer for PSMCT16S colors: copies n 16-bit texels from the row
+// TEXCLUT.COV, starting at column TEXCLUT.COU * 16, to entry CSA * 16 onwards.
+// Differs from the PSMCT16 variant only in the format passed to GetOffset.
+template<int n> void GSClut::WriteCLUT16S_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	GSOffset* layout = m_mem->GetOffset(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT16S);
+
+	uint16* RESTRICT rowBase = &m_mem->m_vm16[layout->pixel.row[TEXCLUT.COV]];
+	int* RESTRICT colOffset = &layout->pixel.col[0][TEXCLUT.COU << 4];
+
+	uint16* RESTRICT entries = m_clut + (TEX0.CSA << 4);
+
+	for(int idx = 0; idx < n; ++idx)
+	{
+		entries[idx] = rowBase[colOffset[idx]];
+	}
+}
+
+// Disabled: reader without TEXA handling. It relies on the three-buffer
+// ReadCLUT_* overloads, which are compiled out as well.
+#if 0
+void GSClut::Read(const GIFRegTEX0& TEX0)
+{
+	if(m_read.IsDirty(TEX0))
+	{
+		m_read.TEX0 = TEX0;
+		m_read.dirty = false;
+
+		uint16* clut = m_clut;
+
+		if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24)
+		{
+			switch(TEX0.PSM)
+			{
+			case PSM_PSMT8:
+			case PSM_PSMT8H:
+				clut += (TEX0.CSA & 15) << 4;
+				ReadCLUT_T32_I8(clut, m_buff32);
+				break;
+			case PSM_PSMT4:
+			case PSM_PSMT4HL:
+			case PSM_PSMT4HH:
+				clut += (TEX0.CSA & 15) << 4;
+				ReadCLUT_T32_I4(clut, m_buff32, m_buff64);
+				break;
+			}
+		}
+		else if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
+		{
+			switch(TEX0.PSM)
+			{
+			case PSM_PSMT8:
+			case PSM_PSMT8H:
+				clut += TEX0.CSA << 4;
+				ReadCLUT_T16_I8(clut, m_buff32);
+				break;
+			case PSM_PSMT4:
+			case PSM_PSMT4HL:
+			case PSM_PSMT4HH:
+				clut += TEX0.CSA << 4;
+				ReadCLUT_T16_I4(clut, m_buff32, m_buff64);
+				break;
+			}
+		}
+	}
+}
+#endif
+
+// Expands the CLUT selected by TEX0 into 32-bit entries in m_buff32 (and,
+// for 4-bit index formats, additionally into m_buff64). Does nothing when
+// the read state already matches TEX0/TEXA.
+void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
+{
+	if(!m_read.IsDirty(TEX0, TEXA))
+	{
+		return;
+	}
+
+	m_read.TEX0 = TEX0;
+	m_read.TEXA = TEXA;
+	m_read.dirty = false;
+	m_read.adirty = true; // the cached alpha range has to be recomputed
+
+	const bool index8 = TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H;
+	const bool index4 = TEX0.PSM == PSM_PSMT4 || TEX0.PSM == PSM_PSMT4HL || TEX0.PSM == PSM_PSMT4HH;
+
+	if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24)
+	{
+		// disney golf title screen (reason for masking CSA)
+		uint16* entries = m_clut + ((TEX0.CSA & 15) << 4);
+
+		if(index8)
+		{
+			ReadCLUT_T32_I8(entries, m_buff32);
+		}
+		else if(index4)
+		{
+			// TODO: merge these functions
+			ReadCLUT_T32_I4(entries, m_buff32);
+			ExpandCLUT64_T32_I8(m_buff32, (uint64*)m_buff64); // sw renderer does not need m_buff64 anymore
+		}
+	}
+	else if(TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S)
+	{
+		uint16* entries = m_clut + (TEX0.CSA << 4);
+
+		if(index8)
+		{
+			Expand16(entries, m_buff32, 256, TEXA);
+		}
+		else if(index4)
+		{
+			// TODO: merge these functions
+			Expand16(entries, m_buff32, 16, TEXA);
+			ExpandCLUT64_T32_I8(m_buff32, (uint64*)m_buff64); // sw renderer does not need m_buff64 anymore
+		}
+	}
+}
+
+// Reports the smallest and largest alpha value of the palette last expanded
+// by Read32. The result is cached in m_read.amin/amax and only recomputed
+// while m_read.adirty is set (Read32 sets it).
+void GSClut::GetAlphaMinMax32(int& amin, int& amax)
+{
+	// call only after Read32
+
+	ASSERT(!m_read.dirty);
+
+	if(m_read.adirty)
+	{
+		m_read.adirty = false;
+
+		if(GSLocalMemory::m_psm[m_read.TEX0.CPSM].trbpp == 24 && m_read.TEXA.AEM == 0)
+		{
+			// In this case both bounds are simply TEXA.TA0; the palette is not scanned.
+			m_read.amin = m_read.TEXA.TA0;
+			m_read.amax = m_read.TEXA.TA0;
+		}
+		else
+		{
+			const GSVector4i* p = (const GSVector4i*)m_buff32;
+
+			// Note: these locals shadow the reference parameters until the
+			// end of this else block.
+			GSVector4i amin, amax;
+
+			if(GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 256)
+			{
+				amin = GSVector4i::xffffffff();
+				amax = GSVector4i::zero();
+
+				// 16 iterations x 4 vectors x 4 entries = 256 palette entries.
+				for(int i = 0; i < 16; i++)
+				{
+					// >> 24 isolates the top byte of each 32-bit entry, which is
+					// then packed down so one vector holds 16 of them.
+					GSVector4i v0 = (p[i * 4 + 0] >> 24).ps32(p[i * 4 + 1] >> 24);
+					GSVector4i v1 = (p[i * 4 + 2] >> 24).ps32(p[i * 4 + 3] >> 24);
+					GSVector4i v2 = v0.pu16(v1);
+
+					amin = amin.min_u8(v2);
+					amax = amax.max_u8(v2);
+				}
+			}
+			else
+			{
+				ASSERT(GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 16);
+
+				// Only the first 16 entries are considered here.
+				GSVector4i v0 = (p[0] >> 24).ps32(p[1] >> 24);
+				GSVector4i v1 = (p[2] >> 24).ps32(p[3] >> 24);
+				GSVector4i v2 = v0.pu16(v1);
+
+				amin = v2;
+				amax = v2;
+			}
+
+			// Horizontal reduction: combine the vector with shuffled copies of itself.
+			amin = amin.min_u8(amin.zwxy());
+			amax = amax.max_u8(amax.zwxy());
+			amin = amin.min_u8(amin.zwxyl());
+			amax = amax.max_u8(amax.zwxyl());
+			amin = amin.min_u8(amin.yxwzl());
+			amax = amax.max_u8(amax.yxwzl());
+
+			// Interleave the min and max bytes and widen them to 16 bits for
+			// the final compare-and-extract step.
+			GSVector4i v0 = amin.upl8(amax).u8to16();
+			GSVector4i v1 = v0.yxwz();
+
+			m_read.amin = v0.min_i16(v1).extract16<0>();
+			m_read.amax = v0.max_i16(v1).extract16<1>();
+		}
+	}
+
+	amin = m_read.amin;
+	amax = m_read.amax;
+}
+
+//
+
+// Writes a 256 entry 32-bit palette by running the 16 entry writer on
+// sixteen source groups; the source/destination offsets below define how
+// the groups are rearranged.
+void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut)
+{
+	// 4 blocks
+
+	for(int step = 0; step < 4; step++)
+	{
+		// Each step advances 16 source texels and 32 destination entries.
+		const uint32* from = src + step * 16;
+		uint16* to = clut + step * 32;
+
+		WriteCLUT_T32_I4_CSM1(from, to);
+		WriteCLUT_T32_I4_CSM1(from + 64, to + 16);
+		WriteCLUT_T32_I4_CSM1(from + 128, to + 128);
+		WriteCLUT_T32_I4_CSM1(from + 192, to + 144);
+	}
+}
+
+// Writes 16 32-bit source texels into the CLUT as two runs of 16 uint16
+// values: one starting at clut[0] and one starting at clut[256].
+// NOTE(review): presumably the first run holds the low halves and the second
+// the high halves of each color, matching WriteCLUT32_CSM2 - confirm against
+// the sw16/sw32 shuffles.
+__forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut)
+{
+	// 1 block
+
+	#if _M_SSE >= 0x501
+
+	GSVector8i* s = (GSVector8i*)src;
+	GSVector8i* d = (GSVector8i*)clut;
+
+	GSVector8i v0 = s[0].acbd();
+	GSVector8i v1 = s[1].acbd();
+
+	GSVector8i::sw16(v0, v1);
+	GSVector8i::sw16(v0, v1);
+	GSVector8i::sw16(v0, v1);
+
+	d[0] = v0;
+	d[16] = v1; // 16 vectors of 32 bytes = 256 uint16 entries further on
+
+	#else
+
+	GSVector4i* s = (GSVector4i*)src;
+	GSVector4i* d = (GSVector4i*)clut;
+
+	GSVector4i v0 = s[0];
+	GSVector4i v1 = s[1];
+	GSVector4i v2 = s[2];
+	GSVector4i v3 = s[3];
+
+	GSVector4i::sw16(v0, v1, v2, v3);
+	GSVector4i::sw32(v0, v1, v2, v3);
+	GSVector4i::sw16(v0, v2, v1, v3);
+
+	d[0] = v0;
+	d[1] = v2;
+	d[32] = v1; // 32 vectors of 16 bytes = 256 uint16 entries further on
+	d[33] = v3;
+
+	#endif
+}
+
+// Writes a 256 entry 16-bit palette: processes 32 source vectors (8 uint16
+// each) in groups of four, reordering every group with the sw16/sw32
+// shuffles before storing it at the same vector index in clut.
+void GSClut::WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut)
+{
+	// 2 blocks
+
+	GSVector4i* s = (GSVector4i*)src;
+	GSVector4i* d = (GSVector4i*)clut;
+
+	for(int i = 0; i < 32; i += 4)
+	{
+		GSVector4i v0 = s[i + 0];
+		GSVector4i v1 = s[i + 1];
+		GSVector4i v2 = s[i + 2];
+		GSVector4i v3 = s[i + 3];
+
+		GSVector4i::sw16(v0, v1, v2, v3);
+		GSVector4i::sw32(v0, v1, v2, v3);
+		GSVector4i::sw16(v0, v2, v1, v3);
+
+		// Note the store order: v2 goes before v1.
+		d[i + 0] = v0;
+		d[i + 1] = v2;
+		d[i + 2] = v1;
+		d[i + 3] = v3;
+	}
+}
+
+// Writes a 16 entry 16-bit palette; entry k is taken from the source
+// position listed in clutTableT16I4[k].
+__forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut)
+{
+	// 1 block (half)
+
+	for(int entry = 0; entry != 16; ++entry)
+	{
+		clut[entry] = src[clutTableT16I4[entry]];
+	}
+}
+
+// Reads a 256 entry 32-bit palette as sixteen consecutive 16 entry groups.
+void GSClut::ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst)
+{
+	for(int group = 0; group < 16; group++)
+	{
+		const int first = group * 16;
+
+		ReadCLUT_T32_I4(clut + first, dst + first);
+	}
+}
+
+// Rebuilds 16 32-bit entries from two runs of 16 uint16 values, one at
+// clut[0] and one at clut[256], and stores them as four vectors in dst.
+__forceinline void GSClut::ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst)
+{
+	GSVector4i* s = (GSVector4i*)clut;
+	GSVector4i* d = (GSVector4i*)dst;
+
+	GSVector4i v0 = s[0];
+	GSVector4i v1 = s[1];
+	GSVector4i v2 = s[32]; // 32 vectors of 16 bytes = 256 uint16 entries further on
+	GSVector4i v3 = s[33];
+
+	// Interleaves the 16-bit values of the two runs in place.
+	GSVector4i::sw16(v0, v2, v1, v3);
+
+	d[0] = v0;
+	d[1] = v1;
+	d[2] = v2;
+	d[3] = v3;
+}
+
+// Disabled: variant that reads the 32-bit entries and expands them into the
+// 64-bit buffer in a single call.
+#if 0
+__forceinline void GSClut::ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst32, uint64* RESTRICT dst64)
+{
+	GSVector4i* s = (GSVector4i*)clut;
+	GSVector4i* d32 = (GSVector4i*)dst32;
+	GSVector4i* d64 = (GSVector4i*)dst64;
+
+	GSVector4i s0 = s[0];
+	GSVector4i s1 = s[1];
+	GSVector4i s2 = s[32];
+	GSVector4i s3 = s[33];
+
+	GSVector4i::sw16(s0, s2, s1, s3);
+
+	d32[0] = s0;
+	d32[1] = s1;
+	d32[2] = s2;
+	d32[3] = s3;
+
+	ExpandCLUT64_T32(s0, s0, s1, s2, s3, &d64[0]);
+	ExpandCLUT64_T32(s1, s0, s1, s2, s3, &d64[32]);
+	ExpandCLUT64_T32(s2, s0, s1, s2, s3, &d64[64]);
+	ExpandCLUT64_T32(s3, s0, s1, s2, s3, &d64[96]);
+}
+#endif
+
+// Disabled: 256 entry 16-bit reader built from sixteen 16 entry reads.
+#if 0
+void GSClut::ReadCLUT_T16_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst)
+{
+	for(int i = 0; i < 256; i += 16)
+	{
+		ReadCLUT_T16_I4(&clut[i], &dst[i]);
+	}
+}
+#endif
+
+// Disabled: 16 entry 16-bit reader that widens each value to 32 bits.
+#if 0
+__forceinline void GSClut::ReadCLUT_T16_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst)
+{
+	GSVector4i* s = (GSVector4i*)clut;
+	GSVector4i* d = (GSVector4i*)dst;
+
+	GSVector4i v0 = s[0];
+	GSVector4i v1 = s[1];
+
+	d[0] = v0.upl16();
+	d[1] = v0.uph16();
+	d[2] = v1.upl16();
+	d[3] = v1.uph16();
+}
+#endif
+
+// Disabled: 16 entry 16-bit reader that also fills the 64-bit buffer through
+// ExpandCLUT64_T16.
+#if 0
+__forceinline void GSClut::ReadCLUT_T16_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst32, uint64* RESTRICT dst64)
+{
+	GSVector4i* s = (GSVector4i*)clut;
+	GSVector4i* d32 = (GSVector4i*)dst32;
+	GSVector4i* d64 = (GSVector4i*)dst64;
+
+	GSVector4i v0 = s[0];
+	GSVector4i v1 = s[1];
+
+	GSVector4i s0 = v0.upl16();
+	GSVector4i s1 = v0.uph16();
+	GSVector4i s2 = v1.upl16();
+	GSVector4i s3 = v1.uph16();
+
+	d32[0] = s0;
+	d32[1] = s1;
+	d32[2] = s2;
+	d32[3] = s3;
+
+	ExpandCLUT64_T16(s0, s0, s1, s2, s3, &d64[0]);
+	ExpandCLUT64_T16(s1, s0, s1, s2, s3, &d64[32]);
+	ExpandCLUT64_T16(s2, s0, s1, s2, s3, &d64[64]);
+	ExpandCLUT64_T16(s3, s0, s1, s2, s3, &d64[96]);
+}
+#endif
+
+// Expands the first 16 32-bit entries of src into 128 output vectors in dst:
+// each of the four source vectors serves as the "hi" operand for one block
+// of 32 output vectors, combined with all four vectors as "lo" operands.
+void GSClut::ExpandCLUT64_T32_I8(const uint32* RESTRICT src, uint64* RESTRICT dst)
+{
+	GSVector4i* in = (GSVector4i*)src;
+	GSVector4i* out = (GSVector4i*)dst;
+
+	GSVector4i q0 = in[0];
+	GSVector4i q1 = in[1];
+	GSVector4i q2 = in[2];
+	GSVector4i q3 = in[3];
+
+	ExpandCLUT64_T32(q0, q0, q1, q2, q3, out + 0);
+	ExpandCLUT64_T32(q1, q0, q1, q2, q3, out + 32);
+	ExpandCLUT64_T32(q2, q0, q1, q2, q3, out + 64);
+	ExpandCLUT64_T32(q3, q0, q1, q2, q3, out + 96);
+}
+
+// Fills 32 output vectors: every 32-bit lane of hi is broadcast in turn
+// (x, y, z, w) and combined with lo0..lo3, two output vectors per pair.
+__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
+{
+	const GSVector4i hx = hi.xxxx();
+
+	ExpandCLUT64_T32(hx, lo0, dst + 0);
+	ExpandCLUT64_T32(hx, lo1, dst + 2);
+	ExpandCLUT64_T32(hx, lo2, dst + 4);
+	ExpandCLUT64_T32(hx, lo3, dst + 6);
+
+	const GSVector4i hy = hi.yyyy();
+
+	ExpandCLUT64_T32(hy, lo0, dst + 8);
+	ExpandCLUT64_T32(hy, lo1, dst + 10);
+	ExpandCLUT64_T32(hy, lo2, dst + 12);
+	ExpandCLUT64_T32(hy, lo3, dst + 14);
+
+	const GSVector4i hz = hi.zzzz();
+
+	ExpandCLUT64_T32(hz, lo0, dst + 16);
+	ExpandCLUT64_T32(hz, lo1, dst + 18);
+	ExpandCLUT64_T32(hz, lo2, dst + 20);
+	ExpandCLUT64_T32(hz, lo3, dst + 22);
+
+	const GSVector4i hw = hi.wwww();
+
+	ExpandCLUT64_T32(hw, lo0, dst + 24);
+	ExpandCLUT64_T32(hw, lo1, dst + 26);
+	ExpandCLUT64_T32(hw, lo2, dst + 28);
+	ExpandCLUT64_T32(hw, lo3, dst + 30);
+}
+
+// Interleaves the 32-bit lanes of lo and hi: the lower halves go to dst[0],
+// the upper halves to dst[1].
+__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
+{
+	dst[0] = lo.upl32(hi);
+	dst[1] = lo.uph32(hi);
+}
+
+// Disabled: 16-bit counterpart of ExpandCLUT64_T32_I8.
+#if 0
+void GSClut::ExpandCLUT64_T16_I8(const uint32* RESTRICT src, uint64* RESTRICT dst)
+{
+	GSVector4i* s = (GSVector4i*)src;
+	GSVector4i* d = (GSVector4i*)dst;
+
+	GSVector4i s0 = s[0];
+	GSVector4i s1 = s[1];
+	GSVector4i s2 = s[2];
+	GSVector4i s3 = s[3];
+
+	ExpandCLUT64_T16(s0, s0, s1, s2, s3, &d[0]);
+	ExpandCLUT64_T16(s1, s0, s1, s2, s3, &d[32]);
+	ExpandCLUT64_T16(s2, s0, s1, s2, s3, &d[64]);
+	ExpandCLUT64_T16(s3, s0, s1, s2, s3, &d[96]);
+}
+#endif
+
+// Fills 32 output vectors: every 32-bit lane of hi is broadcast in turn
+// (x, y, z, w) and combined with lo0..lo3, two output vectors per pair.
+__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
+{
+	const GSVector4i hx = hi.xxxx();
+
+	ExpandCLUT64_T16(hx, lo0, dst + 0);
+	ExpandCLUT64_T16(hx, lo1, dst + 2);
+	ExpandCLUT64_T16(hx, lo2, dst + 4);
+	ExpandCLUT64_T16(hx, lo3, dst + 6);
+
+	const GSVector4i hy = hi.yyyy();
+
+	ExpandCLUT64_T16(hy, lo0, dst + 8);
+	ExpandCLUT64_T16(hy, lo1, dst + 10);
+	ExpandCLUT64_T16(hy, lo2, dst + 12);
+	ExpandCLUT64_T16(hy, lo3, dst + 14);
+
+	const GSVector4i hz = hi.zzzz();
+
+	ExpandCLUT64_T16(hz, lo0, dst + 16);
+	ExpandCLUT64_T16(hz, lo1, dst + 18);
+	ExpandCLUT64_T16(hz, lo2, dst + 20);
+	ExpandCLUT64_T16(hz, lo3, dst + 22);
+
+	const GSVector4i hw = hi.wwww();
+
+	ExpandCLUT64_T16(hw, lo0, dst + 24);
+	ExpandCLUT64_T16(hw, lo1, dst + 26);
+	ExpandCLUT64_T16(hw, lo2, dst + 28);
+	ExpandCLUT64_T16(hw, lo3, dst + 30);
+}
+
+// Interleaves the 16-bit lanes of lo and hi: the lower halves go to dst[0],
+// the upper halves to dst[1].
+__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
+{
+	dst[0] = lo.upl16(hi);
+	dst[1] = lo.uph16(hi);
+}
+
+// TODO
+
+// Channel masks for a 16-bit color: bits 10-14, 5-9 and 0-4 respectively.
+static const GSVector4i s_bm(0x00007c00);
+static const GSVector4i s_gm(0x000003e0);
+static const GSVector4i s_rm(0x0000001f);
+
+// Expands w 16-bit colors from src into 32-bit colors in dst.
+//
+// w must be a multiple of 8 because one source vector (8 colors) is handled
+// per iteration. The three 5-bit channels are shifted up to bits 3-7, 11-15
+// and 19-23; the top byte is chosen between TEXA.TA0 and TEXA.TA1 with a
+// mask derived from bit 15 of the color. When TEXA.AEM is set, the chosen
+// alpha is additionally cleared for lanes that compare equal to zero.
+void GSClut::Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int w, const GIFRegTEXA& TEXA)
+{
+	ASSERT((w & 7) == 0);
+
+	const GSVector4i rm = s_rm;
+	const GSVector4i gm = s_gm;
+	const GSVector4i bm = s_bm;
+
+	// Alpha candidates, already positioned in the top byte.
+	GSVector4i TA0(TEXA.TA0 << 24);
+	GSVector4i TA1(TEXA.TA1 << 24);
+
+	GSVector4i c, cl, ch;
+
+	const GSVector4i* s = (const GSVector4i*)src;
+	GSVector4i* d = (GSVector4i*)dst;
+
+	if(!TEXA.AEM)
+	{
+		for(int i = 0, j = w >> 3; i < j; i++)
+		{
+			c = s[i];
+			// Duplicate every 16-bit color into both halves of a 32-bit lane.
+			cl = c.upl16(c);
+			ch = c.uph16(c);
+			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15));
+			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15));
+		}
+	}
+	else
+	{
+		for(int i = 0, j = w >> 3; i < j; i++)
+		{
+			c = s[i];
+			cl = c.upl16(c);
+			ch = c.uph16(c);
+			// andnot applies to the blended alpha only, not to the color channels.
+			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15)).andnot(cl == GSVector4i::zero());
+			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15)).andnot(ch == GSVector4i::zero());
+		}
+	}
+}
+
+//
+
+// True when the state is flagged dirty or when the vector loaded from the
+// start of this struct differs from TEX0 and TEXCLUT loaded together.
+// NOTE(review): this relies on TEX0 and TEXCLUT being the first members and
+// presumably filling exactly one vector - confirm against the register sizes.
+bool GSClut::WriteState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
+{
+	return dirty || !GSVector4i::load<true>(this).eq(GSVector4i::load(&TEX0, &TEXCLUT));
+}
+
+// True when the state is flagged dirty or when TEX0 differs from the stored
+// one; the stored TEXA is compared against itself, so it never contributes.
+bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0)
+{
+	return dirty || !GSVector4i::load<true>(this).eq(GSVector4i::load(&TEX0, &this->TEXA));
+}
+
+// True when the state is flagged dirty or when the vector loaded from the
+// start of this struct differs from TEX0 and TEXA loaded together.
+// NOTE(review): this relies on TEX0 and TEXA being the first members and
+// presumably filling exactly one vector - confirm against the register sizes.
+bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
+{
+	return dirty || !GSVector4i::load<true>(this).eq(GSVector4i::load(&TEX0, &TEXA));
+}
diff --git a/plugins/GSdx_legacy/GSClut.h b/plugins/GSdx_legacy/GSClut.h
new file mode 100644
index 0000000000..171e5e1452
--- /dev/null
+++ b/plugins/GSdx_legacy/GSClut.h
@@ -0,0 +1,110 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSVector.h"
+#include "GSTables.h"
+#include "GSAlignedClass.h"
+
+class GSLocalMemory;
+
+__aligned(class, 32) GSClut : public GSAlignedClass<32>
+{ // Colour look-up table helper: Write*() entry points, a Read32() entry point and the cached state used to skip redundant work.
+	GSLocalMemory* m_mem; // presumably the memory the constructor is given and CLUT data is fetched from - TODO confirm in GSClut.cpp
+
+	uint32 m_CBP[2];
+	uint16* m_clut; // 16-bit storage; presumably what the static WriteCLUT_*/ReadCLUT_* helpers receive as 'clut' - TODO confirm
+	uint32* m_buff32; // exposed read-only through operator[] and operator const uint32*
+	uint64* m_buff64; // exposed read-only through operator const uint64*
+
+	__aligned(struct, 32) WriteState
+	{
+		GIFRegTEX0 TEX0; // IsDirty loads a vector straight from 'this': TEX0 and TEXCLUT must stay the leading members
+		GIFRegTEXCLUT TEXCLUT;
+		bool dirty; // when set, IsDirty returns true without comparing registers
+		bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	} m_write;
+
+	__aligned(struct, 32) ReadState
+	{
+		GIFRegTEX0 TEX0; // IsDirty loads a vector straight from 'this': TEX0 and TEXA must stay the leading members
+		GIFRegTEXA TEXA;
+		bool dirty; // when set, IsDirty returns true without comparing registers
+		bool adirty; // presumably marks amin/amax as stale - TODO confirm against GetAlphaMinMax32
+		int amin, amax; // presumably the cached result of GetAlphaMinMax32 - TODO confirm
+		bool IsDirty(const GIFRegTEX0& TEX0); // compares TEX0 only (the stored TEXA is reused)
+		bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
+	} m_read;
+
+	typedef void (GSClut::*writeCLUT)(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+
+	writeCLUT m_wc[2][16][64]; // dispatch table of the WriteCLUT* member functions below; the index meaning is defined where it is filled
+
+	void WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void WriteCLUT16_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void WriteCLUT16_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void WriteCLUT16S_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void WriteCLUT16S_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+
+	template<int n> void WriteCLUT32_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	template<int n> void WriteCLUT16_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	template<int n> void WriteCLUT16S_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+
+	void WriteCLUT_NULL(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) {} // deliberate no-op (xenosaga 3, bios)
+
+	static void WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut);
+	static void WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut);
+	static void WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut);
+	static void WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut);
+	static void ReadCLUT_T32_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst);
+	static void ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst);
+	//static void ReadCLUT_T32_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst32, uint64* RESTRICT dst64);
+	//static void ReadCLUT_T16_I8(const uint16* RESTRICT clut, uint32* RESTRICT dst);
+	//static void ReadCLUT_T16_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst);
+	//static void ReadCLUT_T16_I4(const uint16* RESTRICT clut, uint32* RESTRICT dst32, uint64* RESTRICT dst64);
+	static void ExpandCLUT64_T32_I8(const uint32* RESTRICT src, uint64* RESTRICT dst);
+	static void ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst);
+	static void ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst);
+	//static void ExpandCLUT64_T16_I8(const uint32* RESTRICT src, uint64* RESTRICT dst);
+	static void ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst);
+	static void ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst);
+
+	static void Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int w, const GIFRegTEXA& TEXA); // 16-bit -> 32-bit pixels using TEXA.TA0/TA1/AEM; w must be a multiple of 8
+
+public:
+	GSClut(GSLocalMemory* mem);
+	virtual ~GSClut();
+
+	void Invalidate();
+	bool WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	void Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
+	//void Read(const GIFRegTEX0& TEX0);
+	void Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
+	void GetAlphaMinMax32(int& amin, int& amax);
+
+	uint32 operator [] (size_t i) const {return m_buff32[i];} // unchecked read of entry i of m_buff32
+
+	operator const uint32*() const  {return m_buff32;}
+	operator const uint64*() const {return m_buff64;}
+};
diff --git a/plugins/GSdx_legacy/GSCodeBuffer.cpp b/plugins/GSdx_legacy/GSCodeBuffer.cpp
new file mode 100644
index 0000000000..9de0e189b3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCodeBuffer.cpp
@@ -0,0 +1,73 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSCodeBuffer.h"
+
+GSCodeBuffer::GSCodeBuffer(size_t blocksize)
+	: m_blocksize(blocksize) // size passed to vmalloc/vmfree for every block
+{
+	m_pos = 0;
+	m_reserved = 0;
+	m_ptr = NULL; // no block exists yet; the first GetBuffer call allocates one
+}
+
+GSCodeBuffer::~GSCodeBuffer()
+{
+	// Hand every block recorded by GetBuffer back to vmfree; each one was allocated with m_blocksize bytes.
+	list<void*>::iterator it = m_buffers.begin();
+
+	for(; it != m_buffers.end(); ++it) vmfree(*it, m_blocksize);
+}
+
+void* GSCodeBuffer::GetBuffer(size_t size)
+{
+	// Reserves room for up to 'size' bytes and returns where to write them; m_pos only advances in ReleaseBuffer.
+	ASSERT(size < m_blocksize);
+	ASSERT(m_reserved == 0); // the previous reservation must have been released
+
+	const size_t rounded = (size + 15) & ~15; // request rounded up to a multiple of 16
+
+	const bool fits = m_ptr != NULL && m_pos + rounded <= m_blocksize;
+
+	if(!fits)
+	{
+		// No block yet, or the current one is too full: start a fresh block and record it for the destructor.
+		m_pos = 0;
+		m_ptr = (uint8*)vmalloc(m_blocksize, true);
+		m_buffers.push_back(m_ptr);
+	}
+
+	m_reserved = rounded;
+
+	return m_ptr + m_pos;
+}
+
+void GSCodeBuffer::ReleaseBuffer(size_t size)
+{
+	// Commits 'size' bytes of the region handed out by the last GetBuffer call and ends that reservation.
+	ASSERT(size <= m_reserved);
+	m_pos = ((m_pos + size) + 15) & ~15; // keep the write offset on a 16-byte boundary
+
+	ASSERT(m_pos <= m_blocksize); // '==' is legal: GetBuffer accepts a request that exactly fills the block
+
+	m_reserved = 0;
+}
diff --git a/plugins/GSdx_legacy/GSCodeBuffer.h b/plugins/GSdx_legacy/GSCodeBuffer.h
new file mode 100644
index 0000000000..3345a4c7d5
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCodeBuffer.h
@@ -0,0 +1,37 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+class GSCodeBuffer
+{
+	list<void*> m_buffers; // every block allocated so far; all of them are freed in the destructor
+	size_t m_blocksize; // size in bytes of each block
+	size_t m_pos, m_reserved; // write offset in the current block / bytes handed out by GetBuffer and not yet released
+	uint8* m_ptr; // current block, NULL until the first GetBuffer call
+
+public:
+	GSCodeBuffer(size_t blocksize = 4096 * 64); // 256k
+	virtual ~GSCodeBuffer();
+
+	void* GetBuffer(size_t size); // reserves up to 'size' bytes (rounded up to 16) and returns the write address
+	void ReleaseBuffer(size_t size); // commits 'size' bytes of the reservation; GetBuffer asserts that this ran before it is called again
+};
diff --git a/plugins/GSdx_legacy/GSCrc.cpp b/plugins/GSdx_legacy/GSCrc.cpp
new file mode 100644
index 0000000000..b4ab18d286
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCrc.cpp
@@ -0,0 +1,570 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSCrc.h"
+
+CRC::Game CRC::m_games[] = // one row per known disc: {crc, title, region, 0 or a flag such as PointListPalette / ZWriteMustNotClear}
+{ // Lookup() indexes these rows by crc (a later row with the same crc overrides an earlier one) and returns row 0 for unknown CRCs.
+	{0x00000000, NoTitle, NoRegion, 0},
+	{0x2113EA2E, MetalSlug6, JP, 0},
+	{0x42E05BAF, TomoyoAfter, JP, PointListPalette},
+	{0x7800DC84, Clannad, JP, PointListPalette},
+	{0xA6167B59, Lamune, JP, PointListPalette},
+	{0xDDB59F46, KyuuketsuKitanMoonties, JP, PointListPalette},
+	{0xC8EE2562, PiaCarroteYoukosoGPGakuenPrincess, JP, PointListPalette},
+	{0x6CF94A43, KazokuKeikakuKokoroNoKizuna, JP, PointListPalette},
+	{0xEDAF602D, DuelSaviorDestiny, JP, PointListPalette},
+	{0xA39517AB, FFX, EU, 0},
+	{0xA39517AE, FFX, FR, 0},
+	{0x941BB7D9, FFX, DE, 0},
+	{0xA39517A9, FFX, IT, 0},
+	{0x941BB7DE, FFX, ES, 0},
+	{0xA80F497C, FFX, ES, 0},
+	{0xB4414EA1, FFX, RU, 0},
+	{0xEE97DB5B, FFX, RU, 0},
+	{0xAEC495CC, FFX, RU, 0},
+	{0xBB3D833A, FFX, US, 0},
+	{0x6A4EFE60, FFX, JP, 0},
+	{0x3866CA7E, FFX, ASIA, 0}, // int.
+	{0x658597E2, FFX, JP, 0}, // int.
+	{0x9AAC5309, FFX2, EU, 0},
+	{0x9AAC530C, FFX2, FR, 0},
+	{0x9AAC530A, FFX2, ES, 0}, 
+	{0x9AAC530D, FFX2, DE, 0},
+	{0x9AAC530B, FFX2, IT, 0},
+	{0x48FE0C71, FFX2, US, 0},
+	{0x8A6D7F14, FFX2, JP, 0},
+	{0xE1FD9A2D, FFX2, JP, 0}, // int.
+	{0x11624CD6, FFX2, KO, 0},
+	{0x78DA0252, FFXII, EU, 0},
+	{0xC1274668, FFXII, EU, 0},
+	{0xDC2A467E, FFXII, EU, 0},
+	{0xCA284668, FFXII, EU, 0},
+	{0xC52B466E, FFXII, EU, 0}, //ES
+	{0xE5E71BF9, FFXII, FR, 0},
+	{0x280AD120, FFXII, JP, 0},
+	{0x08C1ED4D, HauntingGround, EU, 0},
+	{0x2CD5794C, HauntingGround, EU, 0},
+	// {0x7D4EA48F, HauntingGround, EU, 0}, // same CRC as {Genji, EU}
+	{0x867BB945, HauntingGround, JP, 0},
+	{0xE263BC4B, HauntingGround, JP, 0},
+	{0x901AAC09, HauntingGround, US, 0},
+	{0x21068223, Okami, US, 0},
+	{0x891F223F, Okami, EU, 0}, // PAL DE, ES & FR.
+	{0xC5DEFEA0, Okami, JP, 0},
+	{0x086273D2, MetalGearSolid3, EU, 0}, // - PAL UK & FR
+	{0x26A6E286, MetalGearSolid3, DE, 0},
+	{0x9F185CE1, MetalGearSolid3, EU, 0},
+	{0x98D4BC93, MetalGearSolid3, ES, 0},
+	{0x79ED26AD, MetalGearSolid3, EU, 0},
+	{0x5E31EA42, MetalGearSolid3, EU, 0},
+	{0xD7ED797D, MetalGearSolid3, DE, 0},
+	{0x053D2239, MetalGearSolid3, US, 0}, //Metal Gear Solid 3 Subsistence disc1
+	{0x01B2FA7F, MetalGearSolid3, US, 0}, //Metal Gear Solid 3 Subsistence disc2
+	{0xAA31B5BF, MetalGearSolid3, US, 0},
+	{0x86BC3040, MetalGearSolid3, US, 0}, //Metal Gear Solid 3 Subsistence disc1
+	{0x0481AD8A, MetalGearSolid3, JP, 0},
+	{0xC69ACB6F, MetalGearSolid3, KO, 0}, //Metal Gear Solid 3 Snake Eater
+	{0xB0D195EF, MetalGearSolid3, KO, 0}, //Metal Gear Solid 3 Subsistence disc1
+	{0x3EBABC9C, MetalGearSolid3, KO, 0}, //Metal Gear Solid 3 Subsistence disc2
+	{0x8A5C25A7, MetalGearSolid3, ES, 0}, //Metal Gear Solid 3 Subsistence Spanish version
+	{0x278722BF, DBZBT2, US, 0},
+	{0xFE961D28, DBZBT2, US, 0},
+	{0x0393B6BE, DBZBT2, EU, 0},
+	{0xE2F289ED, DBZBT2, JP, 0}, // Sparking Neo!
+	{0xE29C09A3, DBZBT2, KO, 0}, //DragonBall Z Sparking Neo
+	{0x0BAA4387, DBZBT2, JP, 0},
+	{0x35AA84D1, DBZBT2, NoRegion, 0},
+	{0xBE6A9CFB, DBZBT2, NoRegion, 0},
+	{0x428113C2, DBZBT3, US, 0},
+	{0xA422BB13, DBZBT3, EU, 0},
+	{0xCE93CB30, DBZBT3, JP, 0},
+	{0xF28D21F1, DBZBT3, JP, 0},
+	{0x983C53D2, DBZBT3, NoRegion, 0},
+	{0x983C53D3, DBZBT3, EU, 0},
+	{0x9B0E119F, DBZBT3, KO, 0}, //DragonBall Z Sparking Meteo
+	{0x72B3802A, SFEX3, US, 0},
+	{0x71521863, SFEX3, US, 0},
+	{0x28703748, Bully, US, 0},
+	{0x019CFA48, Bully, JP, 0},
+	{0xC78A495D, BullyCC, US, 0},
+	{0xC19A374E, SoTC, US, 0},
+	{0x7D8F539A, SoTC, EU, 0},
+	{0x0F0C4A9C, SoTC, EU, 0},
+	{0x877F3436, SoTC, JP, 0},
+	{0xA17D6AAA, SoTC, KO, 0},
+	{0x877B3D35, SoTC, CH, 0},
+	{0x3122B508, OnePieceGrandAdventure, US, 0},
+	{0x8DF14A24, OnePieceGrandAdventure, EU, 0},
+	{0xE446C9F9, OnePieceGrandAdventure, KO, 0},
+	{0xCA2073B3, OnePieceGrandBattle, KO, 0},
+	{0x66953267, OnePieceGrandAdventure, JP, 0},
+	{0xE1674F57, OnePieceGrandBattle, EU, 0},
+	{0x947B933B, OnePieceGrandAdventure, US, 0},
+	{0xB049DD5E, OnePieceGrandBattle, US, 0},
+	{0x5D02CC5B, OnePieceGrandBattle, NoRegion, 0},
+	{0x6F8545DB, ICO, US, 0},
+	{0xB01A4C95, ICO, JP, 0},
+	{0x2DF2C1EA, ICO, KO, 0},
+	{0x5C991F4E, ICO, EU, 0},
+	{0x7ACF7E03, ICO, NoRegion, 0}, // same CRC as {SpyroNewBeginning, NoRegion}
+	// and as "Twisted Metal - Black" (PAL).
+	{0x788D8B4F, ICO, EU, 0},
+	{0x29C28734, ICO, CH, 0},
+	{0xAEAD1CA3, GT4, JP, 0},
+	{0x30E41D93, GT4, KO, 0},
+	{0x44A61C8F, GT4, EU, 0},
+	{0x0086E35B, GT4, EU, 0},
+	{0x77E61C8A, GT4, US, 0},
+	{0x33C6E35E, GT4, US, 0},
+	{0x7ABDBB5E, GT3, CH, 0}, // cutie comment
+	{0x3E9D448A, GT3, CH, 0}, // cutie comment
+	{0xAD66643C, GT3, CH, 0}, // cutie comment
+	{0x6810C3BC, GT3, CH, 0}, //GRAN TURISMO Concept 2002 Tokyo-Geneva
+	{0x85AE91B3, GT3, US, 0},
+	{0xC220951A, GT3, JP, 0},
+	{0x9DE5CF65, GT3, JP, 0}, //Gran Turismo 3: A-spec
+	{0x60013EBD, GTConcept, EU, 0},
+	{0xB590CE04, GTConcept, EU, 0},
+	{0x0EEF32A3, GTConcept, KO, 0}, //Gran Turismo Concept 2002 Tokyo-Seoul
+	{0xC164550A, WildArms5, JPUNDUB, 0},
+	{0xC1640D2C, WildArms5, US, 0},
+	{0x0FCF8FE4, WildArms5, EU, 0},
+	{0x2294D322, WildArms5, JP, 0},
+	{0x565B6170, WildArms5, JP, 0},
+	{0xBBC3EFFA, WildArms4, US, 0},
+	{0xBBC396EC, WildArms4, US, 0}, //hmm such a small diff in the CRC..
+	{0x7B2DE9CC, WildArms4, EU, 0},
+	{0x8B029334, Manhunt2, EU, 0},
+	{0x3B0ADBEF, Manhunt2, US, 0},
+	{0x09F49E37, CrashBandicootWoC, NoRegion, 0},
+	{0x103B5706, CrashBandicootWoC, US, 0}, //American Greatest Hits release
+	{0x75182BE5, CrashBandicootWoC, US, 0},
+	{0x5188ABCA, CrashBandicootWoC, US, 0},
+	{0x3A03D62F, CrashBandicootWoC, EU, 0},
+	{0x013E349D, ResidentEvil4, US, 0},
+	{0xDBB7A559, ResidentEvil4, US, 0},
+	{0x6BA2F6B9, ResidentEvil4, EU, 0},
+	{0x60FA8C69, ResidentEvil4, JP, 0},
+	{0x5F254B7C, ResidentEvil4, KO, 0},
+	{0x72E1E60E, Spartan, EU, 0},
+	{0x26689C87, Spartan, JP, 0},
+	{0x08277A9E, Spartan, US, 0},
+	{0xA32F7CD0, AceCombat4, US, 0},
+	{0x5ED8FB53, AceCombat4, JP, 0},
+	{0x1B9B7563, AceCombat4, EU, 0},
+	{0xFC46EA61, Tekken5, JP, 0},
+	{0x1F88EE37, Tekken5, EU, 0},
+	{0x1F88BECD, Tekken5, EU, 0},	//language selector...
+	{0x652050D2, Tekken5, US, 0},
+	{0xEA64EF39, Tekken5, KO, 0},
+	{0x9E98B8AE, IkkiTousen, JP, 0},
+	{0xD6385328, GodOfWar, US, 0},
+	{0xF2A8D307, GodOfWar, US, 0},
+	{0xFB0E6D72, GodOfWar, EU, 0},
+	{0xEB001875, GodOfWar, EU, 0},
+	{0xCF148C74, GodOfWar, EU, 0},
+	{0xCA052D22, GodOfWar, JP, 0},
+	{0xBFCC1795, GodOfWar, KO, 0},
+	{0x9567B7D6, GodOfWar, KO, 0},
+	{0x9B5C97BA, GodOfWar, KO, 0},
+	{0xA61A4C6D, GodOfWar, US, 0},
+	{0xE23D532B, GodOfWar, NoRegion, 0},
+	{0xDF1AF973, GodOfWar, EU, 0},
+	{0x1A85E924, GodOfWar, NoRegion, 0}, // cutie comment
+	{0x608ACBD3, GodOfWar, CH, 0}, // cutie comment
+	{0x2F123FD8, GodOfWar2, US, 0}, // same CRC as RU
+	{0x44A8A22A, GodOfWar2, EU, 0},
+	{0x60BC362B, GodOfWar2, EU, 0},
+	{0x4340C7C6, GodOfWar2, KO, 0},
+	{0xE96E55BD, GodOfWar2, JP, 0},
+	{0xF8CD3DF6, GodOfWar2, NoRegion, 0},
+	{0x0B82BFF7, GodOfWar2, NoRegion, 0},
+	{0x5990866F, GodOfWar2, NoRegion, 0},
+	{0xC4C4FD5F, GodOfWar2, CH, 0},
+	{0xDCD9A9F7, GodOfWar2, EU, 0},
+	{0xFA0DF523, GodOfWar2, CH, 0}, // cutie comment
+	{0x9FEE3466, GodOfWar2, CH, 0}, // cutie comment
+	{0x5D482F18, JackieChanAdv, EU, 0},
+	{0xF0A6D880, HarvestMoon, US, 0},
+	{0x9536E111, NamcoXCapcom, JP, 0},
+	{0x75C01A04, NamcoXCapcom, US, 0}, // same CRC as another JP disc
+	{0x95CC86EF, GiTS, US, 0}, // same CRC also reported as EU
+	{0xA5768F53, GiTS, JP, 0},
+	{0xA3643EB1, GiTS, KO, 0},
+	{0xBF6F101F, GiTS, EU, 0}, // same CRC as another US disc
+	{0x6BF11378, Onimusha3, US, 0},
+	{0x71320CA8, Onimusha3, JP, 0},
+	{0xDAFFFB0D, Onimusha3, KO, 0},
+	{0xF442260C, MajokkoALaMode2, JP, 0},
+	{0x14FE77F7, TalesOfAbyss, US, 0},
+	{0x045D77E9, TalesOfAbyss, JPUNDUB, 0},
+	{0xAA5EC3A3, TalesOfAbyss, JP, 0},
+	{0xFB236A46, SonicUnleashed, US, 0},
+	{0x8C913264, SonicUnleashed, EU, 0},
+	{0x5C1EBD61, SimpsonsGame, EU, 0},
+	{0x5C1EBF61, SimpsonsGame, FR, 0},
+	{0x4C7BB3C8, SimpsonsGame, NoRegion, 0},
+	{0x4C94B32C, SimpsonsGame, NoRegion, 0},
+	{0x565B7E04, SimpsonsGame, IT, 0},
+	{0x206779D8, SimpsonsGame, EU, 0},
+	{0xBBE4D862, SimpsonsGame, US, 0},
+	{0xD71B57F4, Genji, US, 0},
+	{0xFADEBC45, Genji, EU, 0},
+	{0xB4776FC1, Genji, JP, 0},
+	{0x56242EC9, Genji, KO, 0},
+	{0xCDAF243D, Genji, CH, 0}, 
+	{0x2A5E0B61, Genji, CH, 0},
+	{0x7D4EA48F, Genji, EU, 0}, // same CRC as {HauntingGround, EU}
+	{0xE04EA200, StarOcean3, EU, 0},
+	{0x23A97857, StarOcean3, US, 0},
+	{0xBEC32D49, StarOcean3, JP, 0},
+	{0x8192A241, StarOcean3, JP, 0}, //NTSC JP special directors cut limited extra sugar on top edition (the special one :p)
+	// it's the US version with speech files from JP... {0x23A97857, StarOcean3, JPUNDUB, 0},
+	{0xCC96CE93, ValkyrieProfile2, US, 0},
+	{0x774DE8E2, ValkyrieProfile2, JP, 0},
+	{0x04CCB600, ValkyrieProfile2, EU, 0},
+	{0xB65E141B, ValkyrieProfile2, DE, 0}, // PAL German
+	{0xC70FC973, ValkyrieProfile2, IT, 0}, 
+	{0x47B9B2FD, RadiataStories, US, 0},
+	{0xAC73005E, RadiataStories, JP, 0},
+	{0xE8FCF8EC, SMTNocturne, US, ZWriteMustNotClear},	// saves/reloads z buffer around shadow drawing, same issue with all the SMT games following
+	{0xF0A31EE3, SMTNocturne, EU, ZWriteMustNotClear},	// SMTNocturne (Lucifers Call in EU)
+	{0xAE0DE7B7, SMTNocturne, EU, ZWriteMustNotClear},	// SMTNocturne (Lucifers Call in EU)
+	{0xD60DA6D4, SMTNocturne, JP, ZWriteMustNotClear},	// SMTNocturne
+	{0x0E762E8D, SMTNocturne, JP, ZWriteMustNotClear},	// SMTNocturne Maniacs
+	{0x47BA9034, SMTNocturne, JP, ZWriteMustNotClear},	// SMTNocturne Maniacs Chronicle
+	{0xD3FFC263, SMTNocturne, KO, ZWriteMustNotClear},
+	{0xD7273511, SMTDDS1, US, ZWriteMustNotClear},		// SMT Digital Devil Saga
+	{0x1683A6BE, SMTDDS1, EU, ZWriteMustNotClear},		// SMT Digital Devil Saga
+	{0x44865CE1, SMTDDS1, JP, ZWriteMustNotClear},		// SMT Digital Devil Saga
+	{0xF2E397C0, SMTDDS1, KO, ZWriteMustNotClear}, // SMT Digital Devil Saga
+	{0x43202D1A, SMTDDS2, KO, ZWriteMustNotClear}, // SMT Digital Devil Saga 2
+	{0xD382C164, SMTDDS2, US, ZWriteMustNotClear},		// SMT Digital Devil Saga 2
+	{0xD568B684, SMTDDS2, EU, ZWriteMustNotClear},		// SMT Digital Devil Saga 2
+	{0xE47C1A9C, SMTDDS2, JP, ZWriteMustNotClear},		// SMT Digital Devil Saga 2
+	{0x0B8AB37B, RozenMaidenGebetGarden, JP, 0},
+	{0x1CC39DBD, SuikodenTactics, US, 0},
+	{0x3E205556, SuikodenTactics, EU, 0},
+	{0xB808413B, SuikodenTactics, JP, 0},
+	{0x64C58FB4, TenchuFS, US, 0},
+	{0xE7CCCB1E, TenchuFS, EU, 0},
+	{0x1969B19A, TenchuFS, ES, 0},		//PAL Spanish
+	{0xBF0DC4CE, TenchuFS, DE, 0},
+	{0x696BBEC3, TenchuFS, KO, 0},
+	{0x525C1994, TenchuFS, ASIA, 0},
+	{0x0D73BBCD, TenchuFS, KO, 0},
+	{0xAFBFB287, TenchuWoH, KO, 0},
+	{0x767E383D, TenchuWoH, US, 0},
+	{0x83261085, TenchuWoH, DE, 0},		//PAL German
+	{0x7FA1510D, TenchuWoH, EU, 0},		//PAL ES, IT
+	{0xC8DADF58, TenchuWoH, EU, 0},
+	{0x13DD9957, TenchuWoH, JP, 0},
+	{0x8BC95883, Sly3, US, 0},
+	{0x8164C614, Sly3, EU, 0},
+	{0xA8CC1583, Sly3, KO, 0},
+	{0x518DD841, Sly2, KO, 0},
+	{0x07652DD9, Sly2, US, 0},
+	{0xFDA1CBF6, Sly2, EU, 0},
+	{0x15DD1F6F, Sly2, NoRegion, 0},
+	{0xA9C82AB9, DemonStone, US, 0},
+	{0x7C7578F3, DemonStone, EU, 0},
+	{0x22425C19, DemonStone, KO, 0},
+	{0x506644B3, BigMuthaTruckers, EU, 0},
+	{0x90F0D852, BigMuthaTruckers, US, 0},
+	{0x5CC9BF81, TimeSplitters2, EU, 0},
+	{0x12532F1C, TimeSplitters2, US, 0},
+	{0xC818BEC2, LordOfTheRingsTwoTowers, US, 0},
+	{0xDC43F2B8, LordOfTheRingsTwoTowers, EU, 0},
+	{0x9ABF90FB, LordOfTheRingsTwoTowers, ES, 0},
+	{0x5FF407EE, LordOfTheRingsTwoTowers, IT, 0},
+	{0xC0E909E9, LordOfTheRingsTwoTowers, JP, 0},
+	{0x6898435D, LordOfTheRingsTwoTowers, KO, 0},
+	{0xDC2F9B98, LordOfTheRingsTwoTowers, CH, 0}, // cutie comment
+	{0xEB198738, LordOfTheRingsThirdAge, US, 0},
+	{0x614F4CF4, LordOfTheRingsThirdAge, EU, 0},
+	{0x37CD4279, LordOfTheRingsThirdAge, KO, 0},
+	{0xE169BAF8, RedDeadRevolver, US, 0},
+	{0xE2E67E23, RedDeadRevolver, EU, 0},
+	{0xEDDD6573, SpidermanWoS, US, 0},	//Web of Shadows
+	{0xF14C1D82, SpidermanWoS, EU, 0},
+	{0xF56C7948, HeavyMetalThunder, JP, 0},
+	{0x2498951B, SilentHill3, US, 0},
+	{0x5088CCDB, SilentHill3, EU, 0},
+	{0x8CFE667F, SilentHill3, JP, 0},
+	{0xC6CBDE91, SilentHill3, KO, 0},
+	{0x6B149273, SilentHill2, EU, 0},
+	{0x6BBD4932, SilentHill2, EU, 0}, // Director's Cut
+	{0x8E8E384B, SilentHill2, US, 0},
+	{0xFE06A030, SilentHill2, US, 0},	//greatest hits
+	{0xE36E16C9, SilentHill2, JP, 0},
+	{0x380D6782, SilentHill2, JP, 0},	//Saigo no uta
+	{0x6DF62AEA, BleachBladeBattlers, JP, 0},
+	{0x6EB71AB0, BleachBladeBattlers, JP, 0},	//2nd
+	{0x3A446111, CastlevaniaCoD, US, 0},
+	{0xF321BC38, CastlevaniaCoD, EU, 0},
+	{0x950876FA, CastlevaniaCoD, KO, 0},
+	{0x237B84D3, CastlevaniaCoD, CH, 0},
+	{0x28270F7D, CastlevaniaLoI, US, 0},
+	{0x306CDADA, CastlevaniaLoI, EU, 0},
+	{0xA36CFF6C, CastlevaniaLoI, JP, 0},
+	{0x9A93FE5D, CastlevaniaLoI, KO, 0},
+	{0xA79B0491, NanoBreaker, JP, 0},
+	{0x7985D894, FinalFightStreetwise, US, 0}, 
+	{0xED4BF0D3, FinalFightStreetwise, US, 0}, // cutie comment
+	{0x73C560BA, FinalFightStreetwise, EU, 0},
+	{0xCBB87BF9, EvangelionJo, JP, 0}, // cutie comment
+	{0x278A91FD, CaptainTsubasa, JP, 0}, // cutie comment
+	{0xC5B75C7C, Oneechanbara2Special, JP, 0}, // cutie comment
+	{0xC0659AD1, NarutimateAccel, JP, 0}, // cutie comment
+	{0xF3D9DFBE, NarutimateAccel, JP, 0},
+	{0x59739DDE, Naruto, JP, 0}, // cutie comment
+	{0xF7786EE4, EternalPoison, JP, 0}, // cutie comment
+	{0x2BE55519, EternalPoison, US, 0},
+	{0xE01F57EC, LegoBatman, US, 0}, // cutie comment
+	{0xE01F57ED, LegoBatman, EU, 0},
+	{0xE0347841, XE3, JP, 0}, // cutie comment
+	{0xA4E88698, XE3, CH, 0},
+	{0x2088950A, XE3, US, 0},
+	// DMC(1)? {0x79B8A95F, DevilMayCry3, US, 0},
+	{0x7F3D692D, DevilMayCry3, CH, 0},
+	// {0x1A85E924, DevilMayCry3, CH, 0}, // same CRC as {GodOfWar, NoRegion}
+	{0xB1995E29, ShadowofRome, EU, 0}, // cutie comment
+	{0x958DCA28, ShadowofRome, EU, 0},
+	{0x57818AF6, ShadowofRome, US, 0}, 
+	{0xF21EE6E0, CrashNburn, US, 0},
+	{0x694A998E, TombRaiderUnderworld, JP, 0}, // cutie comment
+	{0x8E214549, TombRaiderUnderworld, EU, 0},
+	{0xB639EB17, TombRaiderAnniversary, US, 0},
+	{0xB05805B6, TombRaiderAnniversary, JP, 0}, // cutie comment
+	{0xA629A376, TombRaiderAnniversary, EU, 0},
+	{0xBC8B3F50, TombRaiderLegend, US, 0}, // cutie comment
+	{0x05177ECE, TombRaiderLegend, EU, 0},
+	{0x08FFF00D, SSX3, JP, 0}, // cutie comment
+	{0xCE942B2A, SSX3, EU, 0},
+	{0x5C891FF1, Black, US, 0},
+	{0xCAA04879, Black, EU, 0},
+	{0xADDFF505, Black, EU, 0},	//?
+	{0xB3A9F9ED, Black, JP, 0},
+	{0x7838882F, VF4, JP, 0},
+	{0xEA131B57, VF4, US, 0},
+	{0x4F755D39, TyTasmanianTiger, US, 0},
+	{0xD59D3252, TyTasmanianTiger, EU, 0},
+	{0x5A1BB2A1, TyTasmanianTiger2, US, 0},
+	{0x44A5FA15, FFVIIDoC, US, 0},
+	{0x33F7D21A, FFVIIDoC, EU, 0},
+	{0xAFAC88EF, FFVIIDoC, JP, 0},
+	{0x568A5C78, DigimonRumbleArena2, US, 0},
+	{0x785E22BB, DigimonRumbleArena2, EU, 0},
+	{0x4C5CE4C3, DigimonRumbleArena2, EU, 0},
+	{0x7F995E8D, DigimonRumbleArena2, JP, 0},
+	{0x115A184D, DigimonRumbleArena2, KO, 0},
+	{0x879CDA5E, StarWarsForceUnleashed, US, 0},
+	{0x137C792E, StarWarsForceUnleashed, US, 0},
+	{0x503BF9E1, StarWarsBattlefront, NoRegion, 0},  // EU and US versions have the same CRC
+	{0x02F4B541, StarWarsBattlefront2, NoRegion, 0}, // EU and US versions have the same CRC
+	{0xA8DB29DF, BlackHawkDown, EU, 0},
+	{0x25FC361B, DevilMayCry3, US, 0},	//SE
+	{0x2F7D8AD5, DevilMayCry3, US, 0},
+	{0x0BED0AF9, DevilMayCry3, US, 0},
+	{0x18C9343F, DevilMayCry3, EU, 0},	//SE
+	{0x7ADCB24A, DevilMayCry3, EU, 0},
+	{0x79C952B0, DevilMayCry3, JP, 0},	//SE
+	{0x7F3DDEAB, DevilMayCry3, JP, 0},
+	{0x05931990, DevilMayCry3, KO, 0},
+	{0x4AD36D59, DevilMayCry3, RU, 0},
+	{0xBEBF8793, BurnoutTakedown, US, 0},
+	{0x75BECC18, BurnoutTakedown, EU, 0},
+	{0xCE49B0DE, BurnoutTakedown, EU, 0},
+	{0xD224D348, BurnoutRevenge, US, 0},
+	{0x7E83CC5B, BurnoutRevenge, EU, 0},
+	{0xEEA60511, BurnoutRevenge, KO, 0},
+	{0x8C9576A1, BurnoutDominator, US, 0},
+	{0x8C9576B4, BurnoutDominator, EU, 0},
+	{0x4A0E5B3A, MidnightClub3, US, 0},	//dub
+	{0xEBE1972D, MidnightClub3, EU, 0},	//dub
+	{0x60A42FF5, MidnightClub3, US, 0},	//remix
+	{0x4B1A0FFA, XmenOriginsWolverine, US, 0},
+	{0xBFF3DBCB, CallofDutyFinalFronts, US, 0},
+	{0xB78A5F5A, CallofDutyFinalFronts, EU, 0},
+	{0xD03D4C77, SpyroNewBeginning, US, 0},
+	{0x0EE5646B, SpyroNewBeginning, EU, 0},
+	// {0x7ACF7E03, SpyroNewBeginning, NoRegion, 0}, // same CRC as {ICO, NoRegion}
+	//  and as "Twisted Metal - Black" (PAL).
+	{0xB80CE8EC, SpyroEternalNight, US, 0},
+	{0x8AE9536D, SpyroEternalNight, EU, 0},
+	{0xC95F0198, SpyroEternalNight, NoRegion, 0},
+	{0x43AB7214, TalesOfLegendia, US, 0},
+	{0x1F8640E0, TalesOfLegendia, JP, 0},
+	{0xE4F5DA2B, TalesOfLegendia, KO, 0},
+	{0x98C7B76D, NanoBreaker, US, 0},
+	{0x7098BE76, NanoBreaker, KO, 0},
+	{0x9B89F425, NanoBreaker, EU, 0},
+	{0x519E816B, Kunoichi, US, 0},	//Nightshade
+	{0x3FB419FD, Kunoichi, JP, 0},
+	{0x086D198E, Kunoichi, CH, 0},
+	{0x3B470BBD, Kunoichi, EU, 0},
+	{0x6BA65DD8, Kunoichi, KO, 0},
+	{0XD3F182A3, Yakuza, EU, 0},
+	{0x6F9F99F8, Yakuza, EU, 0},
+	{0x388F687B, Yakuza, US, 0},
+	{0xB7B3800A, Yakuza, JP, 0},
+	{0xA60C2E65, Yakuza2, EU, 0},
+	{0x800E3E5A, Yakuza2, EU, 0},
+	{0x97E9C87E, Yakuza2, US, 0},
+	{0xC6B95C48, Yakuza2, JP, 0},
+	{0x9000252A, SkyGunner, JP, 0},
+	{0x93092623, SkyGunner, JP, 0},
+	{0xA9461CB2, SkyGunner, US, 0},
+	{0xB799A60C, SkyGunner, NoRegion, 0},
+	{0x6848699B, JamesBondEverythingOrNothing, US, 0},
+	{0x5FFFDE40, JamesBondEverythingOrNothing, EU, 0},
+	{0xF7FB054C, Siren, CH, 0}, // cutie comment
+	{0x47C2C34A, Siren, KO, 0},
+	{0xB083CCC2, Siren, EU, 0}, // Spanish
+	{0x90F4B057, ZettaiZetsumeiToshi2, CH, 0},
+	{0xC988ECBB, ZettaiZetsumeiToshi2, JP, 0},
+	{0x2905C5C6, ZettaiZetsumeiToshi2, US, 0},  // Raw Danger!
+	{0x81CA29BE, VF4EVO, EU, 0},
+	{0xC9DEF513, VF4EVO, US, 0},
+	{0x7B402694, VF4EVO, KO, 0},
+	{0xAB01411F, VF4EVO, JP, 0},
+	{0xE11DFA28, Dororo, CH, 0},
+	{0x89954774, Dororo, US, 0},
+	{0xFDA2F2DF, Dororo, KO, 0},
+	{0xBD17248E, ShinOnimusha, JP, 0},
+	{0xBE17248E, ShinOnimusha, JP, 0},
+	{0xB817248E, ShinOnimusha, JP, 0},
+	{0x812C5A96, ShinOnimusha, EU, 0},
+	{0xFE44479E, ShinOnimusha, US, 0},
+	{0xFFDE85E9, ShinOnimusha, US, 0},
+	{0xE21404E2, GetaWay, US, 0},
+	{0xE78971DF, GetaWayBlackMonday, US, 0},
+	{0x1130BF23, SakuraTaisen, CH, 0}, // cutie comment
+	{0x4FAE8B83, SakuraTaisen, KO, 0},
+	{0xEF06DBD6, SakuraWarsSoLongMyLove, JP, 0}, // cutie comment
+	{0xDD41054D, SakuraWarsSoLongMyLove, US, 0}, // cutie comment
+	{0xC2E3A7A4, SakuraWarsSoLongMyLove, KO, 0},
+	{0x4A4B623A, FightingBeautyWulong, JP,0}, // cutie comment
+	{0x5AC7E79C, TouristTrophy, CH, 0}, // cutie comment
+	{0xFF9C0E93, TouristTrophy, US, 0},
+	{0xCA9AA903, TouristTrophy, EU, 0}, //crc hack not fully working on PAL, still needs brightness =0
+	{0xA1B3F232, GTASanAndreas, EU, 0}, // cutie comment
+	{0x399A49CA, GTASanAndreas, US, 0}, 
+	{0x60FE139C, GTASanAndreas, JP, 0}, 
+	{0x2615F542, FrontMission5, JP, 0}, 
+	{0xF60255AC, FrontMission5, JP, 0},
+	{0xCB783836, FrontMission5, JP, 0},
+	{0xAEDAEE99, GodHand, JP, 0}, 
+	{0x6FB69282, GodHand, US, 0},
+	{0x924C4AA6, GodHand, KO, 0},
+	{0x9637D496, KnightsOfTheTemple2, JP, 0}, // cutie comment
+	{0x4E811100, UltramanFightingEvolution, JP, 0}, // cutie comment
+	{0xF7F181C3, DeathByDegreesTekkenNinaWilliams, CH, 0}, // cutie comment
+	{0xF088FA5B, DeathByDegreesTekkenNinaWilliams, KO, 0},
+	{0x59683BB0, DeathByDegreesTekkenNinaWilliams, EU, 0},
+	{0x771C3B47, AlpineRacer3, JP, 0}, // cutie comment
+	{0x7367D841, AlpineRacer3, EU, 0},
+	{0x449E1F6B, HummerBadlands, US, 0}, 
+	{0xAEA1B3AD, SengokuBasara, JP, 0},
+	{0x5B659BED, Grandia3, JP, 0},
+	{0x5B657DAD, Grandia3, US, 0},
+	{0x830B6FB1, TalesofSymphonia, JP, 0},
+	{0x8409FD51, TalesofDestiny, JP, 0}, // cutie comment
+	{0xA90CD846, TalesofDestiny, JP, 0},
+	{0xC4D0FACC, SDGundamGGeneration, JP, 0}, // cutie comment
+	{0xBBDE6926, SDGundamGGeneration, JP, 0}, // cutie comment
+	{0x49D60A00, SDGundamGGeneration, JP, 0}, //NEO
+	{0x83AFB38A, SoulCalibur2, KO, 0},
+	{0xE1B01308, SoulCalibur2, US, 0},
+	{0xFB8554A0, SoulCalibur3, JP, 0},
+	{0x027C604C, SoulCalibur3, US, 0},
+	{0x24090A12, SoulCalibur3, EU, 0},
+	{0x37B99B14, SoulCalibur3, KO, 0},
+	{0xBC5480A3, SoulCalibur3, EU, 0},
+	{0xFC0F8A5B, Simple2000Vol114, JP, 0},
+	{0x0098F740, SeintoSeiya, NoRegion, 0}, // cutie comment
+	{0xBDD9BAAD, UrbanReign, US, 0}, // cutie comment
+	{0xAE4BEBD3, UrbanReign, EU, 0},
+	{0x48AC09BC, SteambotChronicles, EU, 0},
+	{0x9F391882, SteambotChronicles, US, 0},
+	{0xFEFCF9DE, SteambotChronicles, JP, 0}, // Ponkotsu Roman Daikatsugeki: Bumpy Trot 
+	{0XE1BF5DCA, SuperManReturns, US, 0},
+	{0x06A7506A, SacredBlaze, JP, 0},
+};
+
+hash_map<uint32, CRC::Game*> CRC::m_map; // crc -> row of m_games; filled by Lookup() the first time it finds the map empty
+
+string ToLower( string str ) // returns a copy of str with every character passed through tolower()
+{
+	for(size_t i = 0; i < str.size(); i++) str[i] = (char)::tolower((unsigned char)str[i]); // via unsigned char: tolower() of a negative char is undefined
+	return str;
+}
+
+// The exclusions list is a comma separated list of: the word "all" and/or CRCs in standard hex notation (0x and 8 digits with leading 0's if required).
+// The list is case insensitive and order insensitive.
+// E.g. Disable all CRC hacks:          CrcHacksExclusions=all
+// E.g. Disable hacks for these CRCs:   CrcHacksExclusions=0x0F0C4A9C, 0x0EE5646B, 0x7ACF7E03
+bool IsCrcExcluded(string exclusionList, uint32 crc)
+{
+	// Substring search in the lower-cased list for crc formatted as "0x%08x", or for the word "all".
+	const string haystack = ToLower( exclusionList );
+	return !( haystack.find( format( "0x%08x", crc ) ) == string::npos && haystack.find( "all" ) == string::npos );
+}
+
+CRC::Game CRC::Lookup(uint32 crc)
+{
+	// Returns the m_games row registered for crc, or m_games[0] when there is none. The map is built on first use.
+
+	if(m_map.empty())
+	{
+		string exclusions = theApp.GetConfig( "CrcHacksExclusions", "" );
+		if (exclusions.length() != 0)
+			printf( "GSdx: CrcHacksExclusions: %s\n", exclusions.c_str() );
+
+		int crcDups = 0;
+		for(size_t i = 0; i < countof(m_games); i++)
+		{
+			Game& game = m_games[i];
+			if( IsCrcExcluded( exclusions, game.crc ) ) continue; // the user disabled the hack for this CRC
+
+			Game*& slot = m_map[game.crc]; // operator[] creates a NULL slot the first time a CRC is seen
+			if(slot){
+				printf("[FIXME] GSdx: Duplicate CRC: 0x%x: (game-id/region-id) %d/%d overrides %d/%d\n"
+					, game.crc, game.title, game.region, slot->title, slot->region);
+				crcDups++;
+			}
+			slot = &game; // the later table row wins
+		}
+		if(crcDups)
+			printf("[FIXME] GSdx: Duplicate CRC: Overall: %d\n", crcDups);
+
+		// If every row was excluded the map would stay empty and this whole block would run again on each call.
+		// Registering m_games[0] changes no result: it is also what a failed lookup returns.
+		if(m_map.empty()) m_map[m_games[0].crc] = &m_games[0];
+	}
+
+	hash_map<uint32, Game*>::iterator i = m_map.find(crc);
+
+	return i != m_map.end() ? *i->second : m_games[0];
+}
diff --git a/plugins/GSdx_legacy/GSCrc.h b/plugins/GSdx_legacy/GSCrc.h
new file mode 100644
index 0000000000..c47f8ec6f2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSCrc.h
@@ -0,0 +1,217 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+class CRC // static table of per-game CRC entries plus the lookup over it
+{
+public:
+	enum Title // one identifier per known game; values are implicit and sequential
+	{
+		NoTitle, // first enumerator (value 0)
+		MetalSlug6,
+		TomoyoAfter,
+		Clannad,
+		Lamune,
+		KyuuketsuKitanMoonties,
+		PiaCarroteYoukosoGPGakuenPrincess,
+		KazokuKeikakuKokoroNoKizuna,
+		DuelSaviorDestiny,
+		FFX,
+		FFX2,
+		FFXII,
+		HauntingGround,
+		ShadowHearts,
+		Okami,
+		MetalGearSolid3,
+		DBZBT2,
+		DBZBT3,
+		SFEX3,
+		Bully,
+		BullyCC,
+		SoTC,
+		OnePieceGrandAdventure,
+		OnePieceGrandBattle,
+		ICO,
+		GT4,
+		GT3,
+		GTConcept,
+		WildArms5,
+		WildArms4,
+		Manhunt2,
+		CrashBandicootWoC,
+		ResidentEvil4,
+		Spartan,
+		AceCombat4,
+		Tekken5,
+		IkkiTousen,
+		GodOfWar,
+		GodOfWar2,
+		JackieChanAdv,
+		HarvestMoon,
+		NamcoXCapcom,
+		GiTS,
+		Onimusha3,
+		MajokkoALaMode2,
+		TalesOfAbyss,
+		SonicUnleashed,
+		SimpsonsGame,
+		Genji,
+		StarOcean3,
+		ValkyrieProfile2,
+		RadiataStories,
+		SMTNocturne,
+		SMTDDS1,
+		SMTDDS2,
+		RozenMaidenGebetGarden,
+		EvangelionJo,
+		SuikodenTactics,
+		CaptainTsubasa,
+		Oneechanbara2Special,
+		NarutimateAccel,
+		Naruto,
+		EternalPoison,
+		LegoBatman,
+		XE3,
+		TenchuWoH,
+		TenchuFS,
+		Sly3,
+		Sly2,
+		ShadowofRome,
+		DemonStone,
+		BigMuthaTruckers,
+		TimeSplitters2,
+		LordOfTheRingsTwoTowers,
+		LordOfTheRingsThirdAge,
+		RedDeadRevolver,
+		SpidermanWoS,
+		HeavyMetalThunder,
+		SilentHill3,
+		SilentHill2,
+		BleachBladeBattlers,
+		CastlevaniaCoD,
+		CastlevaniaLoI,
+		FinalFightStreetwise,
+		CrashNburn,
+		TombRaiderUnderworld,
+		TombRaiderAnniversary,
+		TombRaiderLegend,
+		SSX3,
+		Black,
+		VF4,
+		TyTasmanianTiger,
+		TyTasmanianTiger2,
+		FFVIIDoC,
+		DigimonRumbleArena2,
+		StarWarsForceUnleashed,
+		StarWarsBattlefront,
+		StarWarsBattlefront2,
+		BlackHawkDown,
+		DevilMayCry3,
+		BurnoutTakedown,
+		BurnoutRevenge,
+		BurnoutDominator,
+		MidnightClub3,
+		XmenOriginsWolverine,
+		CallofDutyFinalFronts,
+		SpyroNewBeginning,
+		SpyroEternalNight,
+		TalesOfLegendia,
+		NanoBreaker,
+		Kunoichi,
+		Yakuza,
+		Yakuza2,
+		SkyGunner,
+		JamesBondEverythingOrNothing,
+		Siren,
+		ZettaiZetsumeiToshi2,
+		VF4EVO,
+		Dororo,
+		ShinOnimusha,
+		GetaWay,
+		GetaWayBlackMonday,
+		SakuraTaisen,
+		SakuraWarsSoLongMyLove,
+		FightingBeautyWulong,
+		TouristTrophy,
+		GTASanAndreas,
+		FrontMission5,
+		GodHand,
+		KnightsOfTheTemple2,
+		UltramanFightingEvolution,
+		DeathByDegreesTekkenNinaWilliams,
+		AlpineRacer3,
+		HummerBadlands,
+		SengokuBasara,
+		Grandia3,
+		TalesofSymphonia,
+		TalesofDestiny,
+		SDGundamGGeneration,
+		SoulCalibur2,
+		SoulCalibur3,
+		Simple2000Vol114,
+		SeintoSeiya,
+		UrbanReign,
+		SteambotChronicles,
+		SacredBlaze,
+		SuperManReturns,
+		TitleCount, // number of titles above; must stay last
+	};
+
+	enum Region // release region of a table entry; values are implicit and sequential
+	{
+		NoRegion, // first enumerator (value 0)
+		US,
+		EU,
+		JP,
+		JPUNDUB,
+		RU,
+		FR,
+		DE,
+		IT,
+		ES,
+		CH,
+		ASIA,
+		KO,
+		RegionCount, // number of regions above; must stay last
+	};
+
+	enum Flags // distinct bit values; NOTE(review): presumably OR-ed into Game::flags - confirm against users
+	{
+		PointListPalette = 1,
+		ZWriteMustNotClear = 2,
+	};
+
+	struct Game // one row of the CRC table
+	{
+		uint32 crc; // CRC this row applies to (key of m_map)
+		Title title;
+		Region region;
+		uint32 flags;
+	};
+
+private:
+	static Game m_games[]; // the table itself; defined in GSCrc.cpp
+	static hash_map<uint32, Game*> m_map; // CRC -> row of m_games, built on first Lookup()
+
+public:
+	static Game Lookup(uint32 crc); // returns a copy of the row for crc, or of m_games[0] when none is registered
+};
diff --git a/plugins/GSdx_legacy/GSDevice.cpp b/plugins/GSdx_legacy/GSDevice.cpp
new file mode 100644
index 0000000000..23e01d0fc8
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice.cpp
@@ -0,0 +1,450 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSDevice.h"
+
+GSDevice::GSDevice() // starts with no window, no textures and frame counter 0
+	: m_wnd(NULL)
+	, m_vsync(false)
+	, m_rbswapped(false)
+	, m_backbuffer(NULL)
+	, m_merge(NULL)
+	, m_weavebob(NULL)
+	, m_blend(NULL)
+	, m_shaderfx(NULL)
+	, m_fxaa(NULL)
+	, m_shadeboost(NULL)
+	, m_1x1(NULL)
+	, m_current(NULL)
+	, m_frame(0)
+{
+	memset(&m_vertex, 0, sizeof(m_vertex)); // stride/start/count/limit all zero
+	memset(&m_index, 0, sizeof(m_index)); // start/count/limit all zero
+}
+
+GSDevice::~GSDevice()
+{
+	// Release everything still sitting in the recycle pool.
+	for_each(m_pool.begin(), m_pool.end(), delete_object());
+
+	// Release the fixed-purpose textures (m_current only points to one of the others, so it is not deleted).
+	GSTexture* const owned[] = {m_backbuffer, m_merge, m_weavebob, m_blend, m_shaderfx, m_fxaa, m_shadeboost, m_1x1};
+
+	for(size_t i = 0; i < countof(owned); i++)
+	{
+		delete owned[i];
+	}
+}
+
+bool GSDevice::Create(GSWnd* wnd) // base implementation: remembers the window and always returns true
+{
+	m_wnd = wnd;
+
+	return true;
+}
+
+bool GSDevice::Reset(int w, int h)
+{
+	// Drops every texture the device owns; the users of those textures recreate them when
+	// they find them NULL. The base implementation does not use w/h itself.
+	// Returns true when a window has been attached (m_wnd is set).
+
+	// 1. the recycle pool
+	for_each(m_pool.begin(), m_pool.end(), delete_object());
+	m_pool.clear();
+
+	// 2. the fixed-purpose textures: free all of them first, then clear the pointers
+	GSTexture** const owned[] = {&m_backbuffer, &m_merge, &m_weavebob, &m_blend, &m_shaderfx, &m_fxaa, &m_shadeboost, &m_1x1};
+
+	for(size_t i = 0; i < countof(owned); i++)
+	{
+		delete *owned[i];
+	}
+
+	for(size_t i = 0; i < countof(owned); i++)
+	{
+		*owned[i] = NULL;
+	}
+
+	// 3. current is special, points to other textures, no need to delete
+	m_current = NULL;
+
+	return m_wnd != NULL;
+}
+
+void GSDevice::Present(const GSVector4i& r, int shader)
+{
+	// Draws m_current into the back buffer (reset first when the client size changed) and flips.
+	GSVector4i cr = m_wnd->GetClientRect();
+
+	int w = std::max<int>(cr.width(), 1);
+	int h = std::max<int>(cr.height(), 1);
+
+	if(!m_backbuffer || m_backbuffer->GetWidth() != w || m_backbuffer->GetHeight() != h)
+	{
+		if(!Reset(w, h)) return;
+	}
+
+	GL_PUSH("Present");
+
+	ClearRenderTarget(m_backbuffer, 0);
+
+	if(m_current)
+	{
+		static const int s_shader[5] = {ShaderConvert_COPY, ShaderConvert_SCANLINE,
+			ShaderConvert_DIAGONAL_FILTER, ShaderConvert_TRIANGULAR_FILTER,
+			ShaderConvert_COMPLEX_FILTER}; // FIXME
+
+		// 'shader' is caller-supplied: fall back to a plain copy instead of indexing past the table
+		const int ps = (shader >= 0 && shader < (int)countof(s_shader)) ? s_shader[shader] : ShaderConvert_COPY;
+		Present(m_current, m_backbuffer, GSVector4(r), ps);
+	}
+
+	Flip();
+
+	GL_POP();
+}
+
+void GSDevice::Present(GSTexture* sTex, GSTexture* dTex, const GSVector4& dRect, int shader) // base: a plain StretchRect of sTex onto dRect of dTex
+{
+	StretchRect(sTex, dTex, dRect, shader);
+}
+
+GSTexture* GSDevice::FetchSurface(int type, int w, int h, bool msaa, int format)
+{
+	// Hands out the first pooled texture matching type, format, size and MSAA flag
+	// (removing it from the pool); creates a brand new surface when none matches.
+	GSVector2i wanted(w, h);
+
+	for(list<GSTexture*>::iterator it = m_pool.begin(); it != m_pool.end(); it++)
+	{
+		GSTexture* cand = *it;
+
+		if(cand->GetType() != type || cand->GetFormat() != format) continue;
+		if(!(cand->GetSize() == wanted) || cand->IsMSAA() != msaa) continue;
+		m_pool.erase(it);
+		return cand;
+	}
+
+	return CreateSurface(type, w, h, msaa, format);
+}
+
+void GSDevice::PrintMemoryUsage()
+{
+#ifdef ENABLE_OGL_DEBUG
+	// Sums GetMemUsage() over every texture parked in the pool (only when ENABLE_OGL_DEBUG is defined).
+	uint32 total = 0;
+	for(list<GSTexture*>::iterator it = m_pool.begin(); it != m_pool.end(); ++it)
+	{
+		if (GSTexture* tex = *it)
+			total += tex->GetMemUsage();
+	}
+	GL_PERF("MEM: Surface Pool %dMB", total >> 20u);
+#endif
+}
+
+void GSDevice::EndScene() // advances the vertex/index start offsets past what was just used
+{
+	m_vertex.start += m_vertex.count;
+	m_vertex.count = 0;
+	m_index.start += m_index.count;
+	m_index.count = 0;
+}
+
+void GSDevice::Recycle(GSTexture* t)
+{
+	// Parks t in the pool so that FetchSurface() can hand it out again; NULL is ignored.
+	// The pool owns t from here on and destroys it when it gets evicted.
+	if(!t) return;
+
+	// FIXME: WARNING: the broken texture cache reuses render targets without any
+	// cleaning (or uploading of the correct GS memory data). Of course that is wrong: if
+	// blending is enabled, rendering would be completely broken. However,
+	// due to wrong invalidation in the TC it is sometimes better to reuse
+	// (partially) wrong data...
+	//
+	// Invalidating the data might be even worse. I'm not sure invalidating data really
+	// helps perf, but people report better perf on BDG2 (memory intensive) on OpenGL.
+	// It could be the reason.
+	t->Invalidate();
+
+	t->last_frame_used = m_frame; // timestamp read by AgePool()
+
+	m_pool.push_front(t); // newest entries sit at the front
+
+	//printf("%d\n",m_pool.size());
+
+	// Hard cap: beyond 300 pooled textures, destroy from the back (least recently recycled).
+	for(; m_pool.size() > 300; m_pool.pop_back())
+	{
+		delete m_pool.back();
+	}
+}
+
+void GSDevice::AgePool()
+{
+	// Advances the frame counter, then trims the back of the pool: while more than 20
+	// textures are pooled, the last one is destroyed if it went unused for over 10 frames.
+	m_frame++;
+
+	for(; m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10; m_pool.pop_back())
+	{
+		delete m_pool.back();
+	}
+}
+
+GSTexture* GSDevice::CreateRenderTarget(int w, int h, bool msaa, int format) // pooled or new GSTexture::RenderTarget surface
+{
+	return FetchSurface(GSTexture::RenderTarget, w, h, msaa, format);
+}
+
+GSTexture* GSDevice::CreateDepthStencil(int w, int h, bool msaa, int format) // pooled or new GSTexture::DepthStencil surface
+{
+	return FetchSurface(GSTexture::DepthStencil, w, h, msaa, format);
+}
+
+GSTexture* GSDevice::CreateTexture(int w, int h, int format) // pooled or new GSTexture::Texture surface, never MSAA
+{
+	return FetchSurface(GSTexture::Texture, w, h, false, format);
+}
+
+GSTexture* GSDevice::CreateOffscreen(int w, int h, int format) // pooled or new GSTexture::Offscreen surface, never MSAA
+{
+	return FetchSurface(GSTexture::Offscreen, w, h, false, format);
+}
+
+void GSDevice::StretchRect(GSTexture* sTex, GSTexture* dTex, const GSVector4& dRect, int shader, bool linear) // convenience overload: source rect is (0, 0, 1, 1)
+{
+	StretchRect(sTex, GSVector4(0, 0, 1, 1), dTex, dRect, shader, linear);
+}
+
+GSTexture* GSDevice::GetCurrent() // texture Present() will show; may be NULL; not owned by the caller
+{
+	return m_current;
+}
+
+void GSDevice::Merge(GSTexture* sTex[2], GSVector4* sRect, GSVector4* dRect, const GSVector2i& fs, bool slbg, bool mmod, const GSVector4& c)
+{
+	// Composes the (up to two) source textures into m_merge, sized fs, and makes it current.
+	const bool stale = m_merge == NULL || m_merge->GetSize() != fs;
+
+	if(stale)
+	{
+		Recycle(m_merge);
+
+		m_merge = CreateRenderTarget(fs.x, fs.y, false);
+	}
+
+	// TODO: m_1x1
+
+	// KH:COM crashes at startup when booting *through the bios* due to m_merge being NULL.
+	// (texture appears to be non-null, and is being re-created at a size around like 1700x340,
+	// dunno if that's relevant) -- air
+
+	if(m_merge == NULL)
+	{
+		printf("GSdx: m_merge is NULL!\n");
+	}
+	else
+	{
+		// MSAA inputs go through Resolve() first; plain inputs are used as-is.
+		GSTexture* src[2] = {NULL, NULL};
+
+		for(size_t n = 0; n < countof(src); n++)
+		{
+			if(sTex[n] == NULL) continue;
+			src[n] = sTex[n]->IsMSAA() ? Resolve(sTex[n]) : sTex[n];
+		}
+
+		DoMerge(src, sRect, m_merge, dRect, slbg, mmod, c);
+
+		// Whatever Resolve() handed out (anything that is not the caller's texture) goes back to the pool.
+		for(size_t n = 0; n < countof(src); n++)
+		{
+			if(src[n] != sTex[n]) Recycle(src[n]);
+		}
+	}
+
+	m_current = m_merge;
+}
+
+void GSDevice::Interlace(const GSVector2i& ds, int field, int mode, float yoffset)
+{
+	// Deinterlaces m_merge according to mode (0 = weave, 1 = bob, 2 = blend, anything else =
+	// pass-through) and points m_current at the result.
+	// The weave/bob target is (re)allocated whenever it is missing or its size differs from ds.
+	if(m_weavebob == NULL || m_weavebob->GetSize() != ds)
+	{
+		delete m_weavebob;
+
+		m_weavebob = CreateRenderTarget(ds.x, ds.y, false);
+	}
+
+	switch(mode)
+	{
+	case 0: // weave
+		DoInterlace(m_merge, m_weavebob, field, false, 0);
+
+		m_current = m_weavebob;
+		break;
+
+	case 2: // blend: weave first, then blend the woven frame
+		DoInterlace(m_merge, m_weavebob, field, false, 0);
+
+		if(m_blend == NULL || m_blend->GetSize() != ds)
+		{
+			delete m_blend;
+
+			m_blend = CreateRenderTarget(ds.x, ds.y, false);
+		}
+
+		DoInterlace(m_weavebob, m_blend, 2, false, 0);
+
+		m_current = m_blend;
+		break;
+
+	case 1: // bob
+		DoInterlace(m_merge, m_weavebob, 3, true, yoffset * field);
+
+		m_current = m_weavebob;
+		break;
+
+	default: // no deinterlacing: show the merged frame as-is
+		m_current = m_merge;
+		break;
+	}
+}
+
+void GSDevice::ExternalFX()
+{
+	// Copies m_current into the m_shaderfx scratch target (convert shader 7, linear = false),
+	// then runs DoExternalFX from that copy back into m_current.
+	GSVector2i size = m_current->GetSize();
+
+	if (m_shaderfx == NULL || m_shaderfx->GetSize() != size)
+	{
+		delete m_shaderfx;
+		m_shaderfx = CreateRenderTarget(size.x, size.y, false);
+	}
+
+	if (m_shaderfx == NULL) return;
+
+	GSVector4 src(0, 0, 1, 1);
+	GSVector4 dst(0, 0, size.x, size.y);
+	StretchRect(m_current, src, m_shaderfx, dst, ShaderConvert_TRANSPARENCY_FILTER, false);
+	DoExternalFX(m_shaderfx, m_current);
+}
+
+void GSDevice::FXAA()
+{
+	// Copies m_current into the m_fxaa scratch target (convert shader 7, linear = false),
+	// then runs DoFXAA from that copy back into m_current.
+	GSVector2i size = m_current->GetSize();
+
+	if(m_fxaa == NULL || m_fxaa->GetSize() != size)
+	{
+		delete m_fxaa;
+		m_fxaa = CreateRenderTarget(size.x, size.y, false);
+	}
+
+	if(m_fxaa == NULL) return;
+
+	GSVector4 src(0, 0, 1, 1);
+	GSVector4 dst(0, 0, size.x, size.y);
+	StretchRect(m_current, src, m_fxaa, dst, ShaderConvert_TRANSPARENCY_FILTER, false);
+	DoFXAA(m_fxaa, m_current);
+}
+
+void GSDevice::ShadeBoost()
+{
+	// Copies m_current into the m_shadeboost scratch target (plain copy shader, linear = false),
+	// then runs DoShadeBoost from that copy back into m_current.
+	GSVector2i size = m_current->GetSize();
+
+	if(m_shadeboost == NULL || m_shadeboost->GetSize() != size)
+	{
+		delete m_shadeboost;
+		m_shadeboost = CreateRenderTarget(size.x, size.y, false);
+	}
+
+	if(m_shadeboost == NULL) return;
+
+	GSVector4 src(0, 0, 1, 1);
+	GSVector4 dst(0, 0, size.x, size.y);
+	StretchRect(m_current, src, m_shadeboost, dst, ShaderConvert_COPY, false);
+	DoShadeBoost(m_shadeboost, m_current);
+}
+
+bool GSDevice::ResizeTexture(GSTexture** t, int w, int h)
+{
+	// Ensures *t is a w x h texture, replacing it when it is missing or has another size.
+	// Returns false when t is NULL or the replacement could not be created.
+	if(t == NULL) {ASSERT(0); return false;}
+
+	GSTexture* cur = *t;
+
+	if(cur != NULL && cur->GetWidth() == w && cur->GetHeight() == h) return true;
+
+	// missing or wrong size: destroy it and ask for a new one
+	delete cur;
+
+	*t = CreateTexture(w, h);
+
+	return *t != NULL;
+}
+
+GSAdapter::operator std::string() const // formats the adapter id as "VVVV:DDDD:SSSSSSSS:RR" (upper-case hex)
+{
+	char buf[sizeof "12345678:12345678:12345678:12345678"]; // worst case: four 8-digit fields, three colons and the NUL
+	sprintf(buf, "%.4X:%.4X:%.8X:%.2X", vendor, device, subsys, rev); // precisions are minimum digit counts, not maximums
+	return buf;
+}
+
+bool GSAdapter::operator==(const GSAdapter &desc_dxgi) const
+{
+	// Two adapters are equal when all four identification fields match.
+	if(vendor != desc_dxgi.vendor || device != desc_dxgi.device) return false;
+	if(subsys != desc_dxgi.subsys) return false;
+	return rev == desc_dxgi.rev;
+}
+
+#ifdef _WIN32
+GSAdapter::GSAdapter(const DXGI_ADAPTER_DESC1 &desc_dxgi) // copies the four id fields out of a DXGI adapter description
+	: vendor(desc_dxgi.VendorId)
+	, device(desc_dxgi.DeviceId)
+	, subsys(desc_dxgi.SubSysId)
+	, rev(desc_dxgi.Revision)
+{
+}
+
+GSAdapter::GSAdapter(const D3DADAPTER_IDENTIFIER9 &desc_d3d9) // copies the four id fields out of a D3D9 adapter identifier
+	: vendor(desc_d3d9.VendorId)
+	, device(desc_d3d9.DeviceId)
+	, subsys(desc_d3d9.SubSysId)
+	, rev(desc_d3d9.Revision)
+{
+}
+#endif
+#ifdef __linux__
+// TODO
+#endif
diff --git a/plugins/GSdx_legacy/GSDevice.h b/plugins/GSdx_legacy/GSDevice.h
new file mode 100644
index 0000000000..186d88b751
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice.h
@@ -0,0 +1,226 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSWnd.h"
+#include "GSTexture.h"
+#include "GSVertex.h"
+#include "GSAlignedClass.h"
+
+enum ShaderConvert { // values passed as the 'shader' argument of Present()/StretchRect()
+	ShaderConvert_COPY = 0,
+	ShaderConvert_RGBA8_TO_16_BITS,
+	ShaderConvert_DATM_1,
+	ShaderConvert_DATM_0,
+	ShaderConvert_MOD_256,
+	ShaderConvert_SCANLINE = 5, // explicit values here and below equal the implicit numbering
+	ShaderConvert_DIAGONAL_FILTER,
+	ShaderConvert_TRANSPARENCY_FILTER, // 7
+	ShaderConvert_TRIANGULAR_FILTER,
+	ShaderConvert_COMPLEX_FILTER,
+	ShaderConvert_FLOAT32_TO_32_BITS = 10,
+	ShaderConvert_FLOAT32_TO_RGBA8,
+	ShaderConvert_FLOAT16_TO_RGB5A1,
+	ShaderConvert_RGBA8_TO_FLOAT32 = 13,
+	ShaderConvert_RGBA8_TO_FLOAT24,
+	ShaderConvert_RGBA8_TO_FLOAT16,
+	ShaderConvert_RGB5A1_TO_FLOAT16,
+	ShaderConvert_RGBA_TO_8I = 17 // NOTE(review): presumably each value selects the matching ps_mainN convert shader - confirm
+};
+
+#pragma pack(push, 1) // byte packing for the constant-buffer layouts declared until the matching pop
+
+class ConvertConstantBuffer // zero-filled on construction
+{
+public:
+	GSVector4i ScalingFactor;
+
+	ConvertConstantBuffer() {memset(this, 0, sizeof(*this));}
+};
+
+class MergeConstantBuffer // zero-filled on construction
+{
+public:
+	GSVector4 BGColor;
+
+	MergeConstantBuffer() {memset(this, 0, sizeof(*this));}
+};
+
+class InterlaceConstantBuffer // zero-filled on construction
+{
+public:
+	GSVector2 ZrH;
+	float hH;
+	float _pad[1]; // explicit padding member
+
+	InterlaceConstantBuffer() {memset(this, 0, sizeof(*this));}
+};
+
+class ExternalFXConstantBuffer // zero-filled on construction
+{
+public:
+	GSVector2 xyFrame;
+	GSVector4 rcpFrame;
+	GSVector4 rcpFrameOpt;
+
+	ExternalFXConstantBuffer() { memset(this, 0, sizeof(*this)); }
+};
+
+class FXAAConstantBuffer // zero-filled on construction
+{
+public:
+	GSVector4 rcpFrame;
+	GSVector4 rcpFrameOpt;
+
+	FXAAConstantBuffer() {memset(this, 0, sizeof(*this));}
+};
+
+class ShadeBoostConstantBuffer // zero-filled on construction; same fields as FXAAConstantBuffer
+{
+public:
+	GSVector4 rcpFrame;
+	GSVector4 rcpFrameOpt;
+
+	ShadeBoostConstantBuffer() {memset(this, 0, sizeof(*this));}
+};
+
+#pragma pack(pop)
+
+class GSDevice : public GSAlignedClass<32> // renderer-independent device base: texture pool, post-processing chain, present
+{
+	list<GSTexture*> m_pool; // recycled textures, most recently recycled first (see Recycle/FetchSurface/AgePool)
+
+protected:
+	GSWnd* m_wnd; // set by Create(); not deleted by this class
+	bool m_vsync;
+	bool m_rbswapped;
+	GSTexture* m_backbuffer; // the textures below are owned: deleted in Reset() and in the destructor
+	GSTexture* m_merge;
+	GSTexture* m_weavebob;
+	GSTexture* m_blend;
+	GSTexture* m_shaderfx;
+	GSTexture* m_fxaa;
+	GSTexture* m_shadeboost;
+	GSTexture* m_1x1;
+	GSTexture* m_current; // points to one of the textures above; never deleted through this pointer
+	struct {size_t stride, start, count, limit;} m_vertex;
+	struct {size_t start, count, limit;} m_index;
+	unsigned int m_frame; // for ageing the pool
+
+	virtual GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format) = 0; // always creates a new surface
+	virtual GSTexture* FetchSurface(int type, int w, int h, bool msaa, int format); // pool first, CreateSurface otherwise
+
+	virtual void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c) = 0;
+	virtual void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset) = 0;
+	virtual void DoFXAA(GSTexture* sTex, GSTexture* dTex) {} // optional effects: no-ops unless overridden
+	virtual void DoShadeBoost(GSTexture* sTex, GSTexture* dTex) {}
+	virtual void DoExternalFX(GSTexture* sTex, GSTexture* dTex) {}
+
+public:
+	GSDevice();
+	virtual ~GSDevice();
+
+	void Recycle(GSTexture* t); // gives t to the pool (which then owns it); NULL is ignored
+
+	enum {Windowed, Fullscreen, DontCare};
+
+	virtual bool Create(GSWnd* wnd);
+	virtual bool Reset(int w, int h); // base: deletes all owned textures; true when a window is attached
+	virtual bool IsLost(bool update = false) {return false;}
+	virtual void Present(const GSVector4i& r, int shader); // shader indexes the table inside Present()
+	virtual void Present(GSTexture* sTex, GSTexture* dTex, const GSVector4& dRect, int shader = 0);
+	virtual void Flip() {}
+
+	virtual void SetVSync(bool enable) {m_vsync = enable;}
+
+	virtual void BeginScene() {}
+	virtual void DrawPrimitive() {};
+	virtual void DrawIndexedPrimitive() {}
+	virtual void DrawIndexedPrimitive(int offset, int count) {}
+	virtual void EndScene();
+
+	virtual void ClearRenderTarget(GSTexture* t, const GSVector4& c) {}
+	virtual void ClearRenderTarget(GSTexture* t, uint32 c) {}
+	virtual void ClearDepth(GSTexture* t, float c) {}
+	virtual void ClearStencil(GSTexture* t, uint8 c) {}
+
+	virtual GSTexture* CreateRenderTarget(int w, int h, bool msaa, int format = 0); // the four Create* below all go through FetchSurface()
+	virtual GSTexture* CreateDepthStencil(int w, int h, bool msaa, int format = 0);
+	virtual GSTexture* CreateTexture(int w, int h, int format = 0);
+	virtual GSTexture* CreateOffscreen(int w, int h, int format = 0);
+
+	virtual GSTexture* Resolve(GSTexture* t) {return NULL;} // base returns NULL; Merge() calls it for MSAA sources
+
+	virtual GSTexture* CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format = 0, int ps_shader = 0) {return NULL;}
+
+	virtual void CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r) {}
+	virtual void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true) {}
+
+	void StretchRect(GSTexture* sTex, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true); // source rect (0, 0, 1, 1)
+
+	virtual void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1) {}
+	virtual void PSSetShaderResource(int i, GSTexture* sRect) {}
+	virtual void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL) {}
+
+	GSTexture* GetCurrent(); // returns m_current (may be NULL)
+
+	void Merge(GSTexture* sTex[2], GSVector4* sRect, GSVector4* dRect, const GSVector2i& fs, bool slbg, bool mmod, const GSVector4& c);
+	void Interlace(const GSVector2i& ds, int field, int mode, float yoffset); // mode: 0 weave, 1 bob, 2 blend, other = none
+	void FXAA(); // FXAA/ShadeBoost/ExternalFX dereference m_current without a NULL check
+	void ShadeBoost();
+	void ExternalFX();
+
+	bool ResizeTexture(GSTexture** t, int w, int h); // replaces *t when missing or of another size
+
+	bool IsRBSwapped() {return m_rbswapped;}
+
+	void AgePool(); // bumps m_frame and destroys long-unused pooled textures
+
+	virtual void PrintMemoryUsage();
+};
+
+struct GSAdapter // identification of a graphics adapter: four numeric ids
+{
+	uint32 vendor;
+	uint32 device;
+	uint32 subsys;
+	uint32 rev;
+
+	operator std::string() const; // "%.4X:%.4X:%.8X:%.2X" of vendor, device, subsys, rev
+	bool operator==(const GSAdapter&) const; // field-by-field comparison
+	bool operator==(const std::string &s) const // compares against the formatted id string
+	{
+		return (std::string)*this == s;
+	}
+	bool operator==(const char *s) const // same, for C strings
+	{
+		return (std::string)*this == s;
+	}
+
+#ifdef _WIN32
+	GSAdapter(const DXGI_ADAPTER_DESC1 &desc_dxgi);
+	GSAdapter(const D3DADAPTER_IDENTIFIER9 &desc_d3d9);
+#endif
+#ifdef __linux__
+	// TODO
+#endif
+};
diff --git a/plugins/GSdx_legacy/GSDevice11.cpp b/plugins/GSdx_legacy/GSDevice11.cpp
new file mode 100644
index 0000000000..1c4a2d8cec
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice11.cpp
@@ -0,0 +1,1482 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSDevice11.h"
+#include "GSUtil.h"
+#include "resource.h"
+#include <fstream>
+
+GSDevice11::GSDevice11() // zeroes the cached pipeline state and constant-buffer caches
+{
+	memset(&m_state, 0, sizeof(m_state));
+	memset(&m_vs_cb_cache, 0, sizeof(m_vs_cb_cache));
+	memset(&m_ps_cb_cache, 0, sizeof(m_ps_cb_cache));
+
+	FXAA_Compiled = false;
+	ExShader_Compiled = false;
+	// NOTE(review): the two values below look like "nothing selected yet" sentinels - confirm against the state setters
+	m_state.topology = D3D11_PRIMITIVE_TOPOLOGY_UNDEFINED;
+	m_state.bf = -1;
+}
+
+GSDevice11::~GSDevice11() // nothing is released explicitly here
+{
+}
+
+bool GSDevice11::Create(GSWnd* wnd) // creates the D3D11 device, swap chain and all fixed shaders/states; false on failure
+{
+	if(!__super::Create(wnd))
+	{
+		return false;
+	}
+
+	HRESULT hr = E_FAIL;
+
+	DXGI_SWAP_CHAIN_DESC scd;
+	D3D11_BUFFER_DESC bd;
+	D3D11_SAMPLER_DESC sd;
+	D3D11_DEPTH_STENCIL_DESC dsd;
+	D3D11_RASTERIZER_DESC rd;
+	D3D11_BLEND_DESC bsd;
+
+	CComPtr<IDXGIAdapter1> adapter;
+	D3D_DRIVER_TYPE driver_type = D3D_DRIVER_TYPE_HARDWARE;
+
+	std::string adapter_id = theApp.GetConfig("Adapter", "default");
+
+	if (adapter_id == "default")
+		; // keep the default hardware adapter
+	else if (adapter_id == "ref")
+	{
+		driver_type = D3D_DRIVER_TYPE_REFERENCE;
+	}
+	else
+	{
+		// Look for the adapter whose formatted id equals the configured string; fall back to the default when none matches.
+		CComPtr<IDXGIFactory1> dxgi_factory;
+		CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&dxgi_factory);
+		if (dxgi_factory)
+			for (int i = 0;; i++)
+			{
+				CComPtr<IDXGIAdapter1> enum_adapter;
+				if (S_OK != dxgi_factory->EnumAdapters1(i, &enum_adapter))
+					break;
+				DXGI_ADAPTER_DESC1 desc;
+				hr = enum_adapter->GetDesc1(&desc);
+				if (S_OK == hr && GSAdapter(desc) == adapter_id)
+				{
+					adapter = enum_adapter;
+					driver_type = D3D_DRIVER_TYPE_UNKNOWN; // an explicit adapter is passed together with the UNKNOWN driver type
+					break;
+				}
+			}
+	}
+
+	memset(&scd, 0, sizeof(scd));
+
+	scd.BufferCount = 2;
+	scd.BufferDesc.Width = 1;
+	scd.BufferDesc.Height = 1;
+	scd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
+	//scd.BufferDesc.RefreshRate.Numerator = 60;
+	//scd.BufferDesc.RefreshRate.Denominator = 1;
+	scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
+	scd.OutputWindow = (HWND)m_wnd->GetHandle();
+	scd.SampleDesc.Count = 1;
+	scd.SampleDesc.Quality = 0;
+
+	// Always start in Windowed mode.  According to MS, DXGI just "prefers" this, and it's more or less
+	// required if we want to add support for dual displays later on.  The fullscreen/exclusive flip
+	// will be issued after all other initializations are complete.
+
+	scd.Windowed = TRUE;
+
+	spritehack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_SpriteHack", 0) : 0;
+	// NOTE : D3D11_CREATE_DEVICE_SINGLETHREADED
+	//   This flag is safe as long as the DXGI's internal message pump is disabled or is on the
+	//   same thread as the GS window (which the emulator makes sure of, if it utilizes a
+	//   multithreaded GS).  Setting the flag is a nice and easy 5% speedup on GS-intensive scenes.
+
+	uint32 flags = D3D11_CREATE_DEVICE_SINGLETHREADED;
+
+#ifdef DEBUG
+	flags |= D3D11_CREATE_DEVICE_DEBUG;
+#endif
+
+	D3D_FEATURE_LEVEL level;
+
+	const D3D_FEATURE_LEVEL levels[] =
+	{
+		D3D_FEATURE_LEVEL_11_0,
+		D3D_FEATURE_LEVEL_10_1,
+		D3D_FEATURE_LEVEL_10_0,
+	};
+
+	hr = D3D11CreateDeviceAndSwapChain(adapter, driver_type, NULL, flags, levels, countof(levels), D3D11_SDK_VERSION, &scd, &m_swapchain, &m_dev, &level, &m_ctx);
+
+	if(FAILED(hr)) return false;
+
+	if(!SetFeatureLevel(level, true))
+	{
+		return false;
+	}
+
+	D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS options;
+	// the size argument must be that of the data struct, not of the D3D11_FEATURE enumerator
+	hr = m_dev->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &options, sizeof(options));
+
+	// msaa
+
+	for(uint32 i = 2; i <= D3D11_MAX_MULTISAMPLE_SAMPLE_COUNT; i++)
+	{
+		uint32 quality[2] = {0, 0};
+
+		if(SUCCEEDED(m_dev->CheckMultisampleQualityLevels(DXGI_FORMAT_R8G8B8A8_UNORM, i, &quality[0])) && quality[0] > 0
+		&& SUCCEEDED(m_dev->CheckMultisampleQualityLevels(DXGI_FORMAT_D32_FLOAT_S8X24_UINT, i, &quality[1])) && quality[1] > 0)
+		{
+			m_msaa_desc.Count = i;
+			m_msaa_desc.Quality = std::min<uint32>(quality[0] - 1, quality[1] - 1);
+
+			if(i >= m_msaa) break;
+		}
+	}
+
+	if(m_msaa_desc.Count == 1)
+	{
+		m_msaa = 0;
+	}
+
+	// convert
+
+	D3D11_INPUT_ELEMENT_DESC il_convert[] =
+	{
+		{"POSITION", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
+		{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
+	};
+
+	vector<unsigned char> shader;
+	theApp.LoadResource(IDR_CONVERT_FX, shader);
+	CompileShader((const char *)shader.data(), shader.size(), "convert.fx", nullptr, "vs_main", nullptr, &m_convert.vs, il_convert, countof(il_convert), &m_convert.il);
+
+	for(size_t i = 0; i < countof(m_convert.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "convert.fx", nullptr, format("ps_main%d", i).c_str(), nullptr, &m_convert.ps[i]);
+	}
+
+	memset(&dsd, 0, sizeof(dsd));
+
+	dsd.DepthEnable = false;
+	dsd.StencilEnable = false;
+
+	hr = m_dev->CreateDepthStencilState(&dsd, &m_convert.dss);
+
+	memset(&bsd, 0, sizeof(bsd));
+
+	bsd.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
+
+	hr = m_dev->CreateBlendState(&bsd, &m_convert.bs);
+
+	// merge
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(MergeConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = m_dev->CreateBuffer(&bd, NULL, &m_merge.cb);
+
+	theApp.LoadResource(IDR_MERGE_FX, shader);
+	for(size_t i = 0; i < countof(m_merge.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "merge.fx", nullptr, format("ps_main%d", i).c_str(), nullptr, &m_merge.ps[i]);
+	}
+
+	memset(&bsd, 0, sizeof(bsd));
+
+	bsd.RenderTarget[0].BlendEnable = true;
+	bsd.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD;
+	bsd.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA;
+	bsd.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA;
+	bsd.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD;
+	bsd.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ONE;
+	bsd.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO;
+	bsd.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL;
+
+	hr = m_dev->CreateBlendState(&bsd, &m_merge.bs);
+
+	// interlace
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(InterlaceConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = m_dev->CreateBuffer(&bd, NULL, &m_interlace.cb);
+
+	theApp.LoadResource(IDR_INTERLACE_FX, shader);
+	for(size_t i = 0; i < countof(m_interlace.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "interlace.fx", nullptr, format("ps_main%d", i).c_str(), nullptr, &m_interlace.ps[i]);
+	}
+
+	// Shade Boost
+
+	int ShadeBoost_Contrast = theApp.GetConfig("ShadeBoost_Contrast", 50);
+	int ShadeBoost_Brightness = theApp.GetConfig("ShadeBoost_Brightness", 50);
+	int ShadeBoost_Saturation = theApp.GetConfig("ShadeBoost_Saturation", 50);
+
+	string str[3]; // keeps the formatted values alive while 'macro' points into them
+
+	str[0] = format("%d", ShadeBoost_Saturation);
+	str[1] = format("%d", ShadeBoost_Brightness);
+	str[2] = format("%d", ShadeBoost_Contrast);
+
+	D3D_SHADER_MACRO macro[] =
+	{
+		{"SB_SATURATION", str[0].c_str()},
+		{"SB_BRIGHTNESS", str[1].c_str()},
+		{"SB_CONTRAST", str[2].c_str()},
+		{NULL, NULL},
+	};
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(ShadeBoostConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = m_dev->CreateBuffer(&bd, NULL, &m_shadeboost.cb);
+
+	theApp.LoadResource(IDR_SHADEBOOST_FX, shader);
+	CompileShader((const char *)shader.data(), shader.size(), "shadeboost.fx", nullptr, "ps_main", macro, &m_shadeboost.ps);
+
+	// External fx shader
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(ExternalFXConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = m_dev->CreateBuffer(&bd, NULL, &m_shaderfx.cb);
+
+	// Fxaa
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(FXAAConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = m_dev->CreateBuffer(&bd, NULL, &m_fxaa.cb);
+
+	// rasterizer state
+
+	memset(&rd, 0, sizeof(rd));
+
+	rd.FillMode = D3D11_FILL_SOLID;
+	rd.CullMode = D3D11_CULL_NONE;
+	rd.FrontCounterClockwise = false;
+	rd.DepthBias = 0; // integer bias value, not a flag
+	rd.DepthBiasClamp = 0;
+	rd.SlopeScaledDepthBias = 0;
+	rd.DepthClipEnable = false; // ???
+	rd.ScissorEnable = true;
+	rd.MultisampleEnable = true;
+	rd.AntialiasedLineEnable = false;
+
+	hr = m_dev->CreateRasterizerState(&rd, &m_rs);
+
+	m_ctx->RSSetState(m_rs);
+
+	// samplers: m_convert.ln (linear) and m_convert.pt (point); both anisotropic when MaxAnisotropy is set and paltex is off
+
+	memset(&sd, 0, sizeof(sd));
+
+	sd.Filter = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3D11_FILTER_ANISOTROPIC : D3D11_FILTER_MIN_MAG_MIP_LINEAR;
+	sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.MinLOD = -FLT_MAX;
+	sd.MaxLOD = FLT_MAX;
+	sd.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+	sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
+
+	hr = m_dev->CreateSamplerState(&sd, &m_convert.ln);
+
+	sd.Filter = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3D11_FILTER_ANISOTROPIC : D3D11_FILTER_MIN_MAG_MIP_POINT;
+
+	hr = m_dev->CreateSamplerState(&sd, &m_convert.pt);
+
+	// back buffer: wraps the 1x1 swap chain created above (the return value is not checked)
+
+	Reset(1, 1);
+
+	// texture fx
+
+	CreateTextureFX();
+
+	// m_date states: stencil-only (depth disabled), masks = 1, always passes and REPLACEs; blend state left all-zero
+
+	memset(&dsd, 0, sizeof(dsd));
+
+	dsd.DepthEnable = false;
+	dsd.StencilEnable = true;
+	dsd.StencilReadMask = 1;
+	dsd.StencilWriteMask = 1;
+	dsd.FrontFace.StencilFunc = D3D11_COMPARISON_ALWAYS;
+	dsd.FrontFace.StencilPassOp = D3D11_STENCIL_OP_REPLACE;
+	dsd.FrontFace.StencilFailOp = D3D11_STENCIL_OP_KEEP;
+	dsd.FrontFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP;
+	dsd.BackFace.StencilFunc = D3D11_COMPARISON_ALWAYS;
+	dsd.BackFace.StencilPassOp = D3D11_STENCIL_OP_REPLACE;
+	dsd.BackFace.StencilFailOp = D3D11_STENCIL_OP_KEEP;
+	dsd.BackFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP;
+
+	m_dev->CreateDepthStencilState(&dsd, &m_date.dss);
+
+	D3D11_BLEND_DESC blend;
+
+	memset(&blend, 0, sizeof(blend));
+
+	m_dev->CreateBlendState(&blend, &m_date.bs);
+
+	// Exclusive/Fullscreen flip, issued for legacy (managed) windows only.  GSopen2 style
+	// emulators will issue the flip themselves later on.
+
+	if(m_wnd->IsManaged())
+	{
+		SetExclusive(!theApp.GetConfig("windowed", 1));
+	}
+
+	return true;
+}
+
+bool GSDevice11::Reset(int w, int h)
+{ // Resets the base device, then resizes the swap chain (when one exists) to w x h and re-wraps its buffer 0 as m_backbuffer.
+	if(!__super::Reset(w, h))
+		return false;
+
+	if(m_swapchain)
+	{
+		DXGI_SWAP_CHAIN_DESC scd;
+
+		memset(&scd, 0, sizeof(scd));
+
+		m_swapchain->GetDesc(&scd);
+		m_swapchain->ResizeBuffers(scd.BufferCount, w, h, scd.BufferDesc.Format, 0); // same buffer count and format, new size; the HRESULT is not checked
+
+		CComPtr<ID3D11Texture2D> backbuffer;
+
+		if(FAILED(m_swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backbuffer)))
+		{
+			return false;
+		}
+
+		m_backbuffer = new GSTexture11(backbuffer); // NOTE(review): presumably the base Reset() released the previous m_backbuffer - confirm
+	}
+
+	return true; // also reached when there is no swap chain
+}
+
+void GSDevice11::SetExclusive(bool isExcl)
+{ // Asks the swap chain to enter (true) or leave (false) exclusive fullscreen; does nothing when there is no swap chain.
+	if(!m_swapchain) return;
+
+	// TODO : Support for alternative display modes, by finishing this code below:
+	//  Video mode info should be pulled from config/ini.
+
+	/*DXGI_MODE_DESC desc;
+	memset(&desc, 0, sizeof(desc));
+	desc.RefreshRate = 0;		// must be zero for best results.
+
+	m_swapchain->ResizeTarget(&desc);
+	*/
+
+	HRESULT hr = m_swapchain->SetFullscreenState(isExcl, NULL); // no explicit output target is passed
+	// Only the "not currently available" result is reported; the message is newline-terminated so it does not run into the next stderr line.
+	if(hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE)
+	{
+		fprintf(stderr, "(GSdx10) SetExclusive(%s) failed; request unavailable.\n", isExcl ? "true" : "false");
+	}
+}
+
+void GSDevice11::Flip()
+{ // Presents the swap chain, passing m_vsync as the first (sync interval) argument; m_swapchain is not checked for NULL here.
+	m_swapchain->Present(m_vsync, 0);
+}
+
+void GSDevice11::DrawPrimitive()
+{ // Non-indexed draw of m_vertex.count vertices starting at m_vertex.start.
+	m_ctx->Draw(m_vertex.count, m_vertex.start);
+}
+
+void GSDevice11::DrawIndexedPrimitive()
+{ // Indexed draw of the whole recorded index range (m_index.count from m_index.start), with m_vertex.start as the base vertex.
+	m_ctx->DrawIndexed(m_index.count, m_index.start, m_vertex.start);
+}
+
+void GSDevice11::DrawIndexedPrimitive(int offset, int count)
+{ // Indexed draw of `count` indices beginning `offset` indices past m_index.start.
+	ASSERT(offset + count <= m_index.count);
+	// The sub-range is relative to the recorded start; the base vertex is still m_vertex.start.
+	m_ctx->DrawIndexed(count, m_index.start + offset, m_vertex.start);
+}
+
+void GSDevice11::Dispatch(uint32 x, uint32 y, uint32 z)
+{ // Forwards the three dispatch dimensions unchanged to m_ctx->Dispatch.
+	m_ctx->Dispatch(x, y, z);
+}
+
+void GSDevice11::ClearRenderTarget(GSTexture* t, const GSVector4& c)
+{ // Clears t (treated as a GSTexture11) to the four floats in c; a NULL texture is ignored.
+	if (!t) return;
+	m_ctx->ClearRenderTargetView(*(GSTexture11*)t, c.v);
+}
+
+void GSDevice11::ClearRenderTarget(GSTexture* t, uint32 c)
+{ // Clears t to a packed 32-bit colour; a NULL texture is ignored.
+	if (!t) return;
+	GSVector4 color = GSVector4::rgba32(c) * (1.0f / 255); // unpack c with rgba32() and scale by 1/255
+	// The float colour is then used exactly like in the GSVector4 overload.
+	m_ctx->ClearRenderTargetView(*(GSTexture11*)t, color.v);
+}
+
+void GSDevice11::ClearDepth(GSTexture* t, float c)
+{ // Clears only the depth part of t (D3D11_CLEAR_DEPTH) to c; a NULL texture is ignored.
+	if (!t) return;
+	m_ctx->ClearDepthStencilView(*(GSTexture11*)t, D3D11_CLEAR_DEPTH, c, 0);
+}
+
+void GSDevice11::ClearStencil(GSTexture* t, uint8 c)
+{ // Clears only the stencil part of t (D3D11_CLEAR_STENCIL) to c; a NULL texture is ignored.
+	if (!t) return;
+	m_ctx->ClearDepthStencilView(*(GSTexture11*)t, D3D11_CLEAR_STENCIL, 0, c);
+}
+
+GSTexture* GSDevice11::CreateSurface(int type, int w, int h, bool msaa, int format)
+{ // Creates a w x h 2D texture for the given GSTexture type and wraps it in a GSTexture11; returns NULL if creation fails.
+	HRESULT hr;
+
+	D3D11_TEXTURE2D_DESC desc;
+
+	memset(&desc, 0, sizeof(desc));
+	// Defaults: one mip level, one array slice, single-sampled, default usage.
+	desc.Width = w;
+	desc.Height = h;
+	desc.Format = (DXGI_FORMAT)format;
+	desc.MipLevels = 1;
+	desc.ArraySize = 1;
+	desc.SampleDesc.Count = 1;
+	desc.SampleDesc.Quality = 0;
+	desc.Usage = D3D11_USAGE_DEFAULT;
+
+	if(msaa)
+	{
+		desc.SampleDesc = m_msaa_desc; // replaces the single-sample default set above
+	}
+
+	switch(type) // any type not listed keeps the zeroed bind flags and default usage
+	{
+	case GSTexture::RenderTarget:
+		desc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE;
+		break;
+	case GSTexture::DepthStencil:
+		desc.BindFlags = D3D11_BIND_DEPTH_STENCIL;
+		break;
+	case GSTexture::Texture:
+		desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+		break;
+	case GSTexture::Offscreen:
+		desc.Usage = D3D11_USAGE_STAGING; // no bind flags; CPU read and write access instead
+		desc.CPUAccessFlags |= D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
+		break;
+	}
+
+	GSTexture11* t = NULL;
+
+	CComPtr<ID3D11Texture2D> texture;
+
+	hr = m_dev->CreateTexture2D(&desc, NULL, &texture);
+
+	if(SUCCEEDED(hr))
+	{
+		t = new GSTexture11(texture);
+
+		switch(type) // new render targets and depth buffers start out cleared to 0
+		{
+		case GSTexture::RenderTarget:
+			ClearRenderTarget(t, 0);
+			break;
+		case GSTexture::DepthStencil:
+			ClearDepth(t, 0);
+			break;
+		}
+	}
+
+	return t; // still NULL when CreateTexture2D failed
+}
+
+GSTexture* GSDevice11::CreateRenderTarget(int w, int h, bool msaa, int format)
+{ // Defers to the base class, substituting DXGI_FORMAT_R8G8B8A8_UNORM when format is 0.
+	return __super::CreateRenderTarget(w, h, msaa, format ? format : DXGI_FORMAT_R8G8B8A8_UNORM);
+}
+
+GSTexture* GSDevice11::CreateDepthStencil(int w, int h, bool msaa, int format)
+{ // Defers to the base class, substituting DXGI_FORMAT_D32_FLOAT_S8X24_UINT when format is 0.
+	return __super::CreateDepthStencil(w, h, msaa, format ? format : DXGI_FORMAT_D32_FLOAT_S8X24_UINT); // DXGI_FORMAT_R32G8X24_TYPELESS
+}
+
+GSTexture* GSDevice11::CreateTexture(int w, int h, int format)
+{ // Defers to the base class, substituting DXGI_FORMAT_R8G8B8A8_UNORM when format is 0.
+	return __super::CreateTexture(w, h, format ? format : DXGI_FORMAT_R8G8B8A8_UNORM);
+}
+
+GSTexture* GSDevice11::CreateOffscreen(int w, int h, int format)
+{ // Defers to the base class, substituting DXGI_FORMAT_R8G8B8A8_UNORM when format is 0.
+	return __super::CreateOffscreen(w, h, format ? format : DXGI_FORMAT_R8G8B8A8_UNORM);
+}
+
+GSTexture* GSDevice11::Resolve(GSTexture* t)
+{ // Resolves multisampled texture t into a new non-MSAA render target of the same size, format and scale; NULL if that target cannot be created.
+	ASSERT(t != NULL && t->IsMSAA());
+
+	if(GSTexture* dst = CreateRenderTarget(t->GetWidth(), t->GetHeight(), false, t->GetFormat()))
+	{
+		dst->SetScale(t->GetScale());
+		// Subresource 0 of t is resolved into subresource 0 of dst using t's format.
+		m_ctx->ResolveSubresource(*(GSTexture11*)dst, 0, *(GSTexture11*)t, 0, (DXGI_FORMAT)t->GetFormat());
+
+		return dst;
+	}
+
+	return NULL;
+}
+
+GSTexture* GSDevice11::CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format, int ps_shader)
+{ // Draws sRect of src into a temporary w x h render target and copies that into a new offscreen texture; returns NULL on failure. ps_shader is not used.
+	GSTexture* dst = NULL;
+
+	if(format == 0)
+	{
+		format = DXGI_FORMAT_R8G8B8A8_UNORM;
+	}
+	// Only the two formats that have a matching convert shader below are supported.
+	if(format != DXGI_FORMAT_R8G8B8A8_UNORM && format != DXGI_FORMAT_R16_UINT)
+	{
+		ASSERT(0);
+
+		return NULL; // pointer return type: report failure as NULL rather than the bool literal false
+	}
+
+	if(GSTexture* rt = CreateRenderTarget(w, h, false, format))
+	{
+		GSVector4 dRect(0, 0, w, h);
+		// A multisampled source is resolved first; the temporary resolve target is recycled right after the draw.
+		if(GSTexture* src2 = src->IsMSAA() ? Resolve(src) : src)
+		{
+			StretchRect(src2, sRect, rt, dRect, m_convert.ps[format == DXGI_FORMAT_R16_UINT ? 1 : 0], NULL); // convert shader 1 for R16_UINT, 0 otherwise
+
+			if(src2 != src) Recycle(src2);
+		}
+
+		dst = CreateOffscreen(w, h, format);
+
+		if(dst)
+		{
+			m_ctx->CopyResource(*(GSTexture11*)dst, *(GSTexture11*)rt);
+		}
+
+		Recycle(rt); // the intermediate render target is handed back whether or not dst was created
+	}
+
+	return dst;
+}
+
+void GSDevice11::CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r)
+{ // Copies rectangle r of sTex to position (0, 0) of dTex; asserts and returns if either texture is NULL.
+	if(!sTex || !dTex)
+	{
+		ASSERT(0);
+		return;
+	}
+
+	D3D11_BOX box = {r.left, r.top, 0, r.right, r.bottom, 1}; // left, top, front, right, bottom, back
+	// Subresource 0 to subresource 0, destination offset (0, 0, 0).
+	m_ctx->CopySubresourceRegion(*(GSTexture11*)dTex, 0, 0, 0, 0, *(GSTexture11*)sTex, 0, &box);
+}
+
+void GSDevice11::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader, bool linear)
+{ // Convenience overload: `shader` indexes m_convert.ps and no pixel-shader constant buffer is used.
+	StretchRect(sTex, sRect, dTex, dRect, m_convert.ps[shader], NULL, linear);
+}
+
+void GSDevice11::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, bool linear)
+{ // Convenience overload: same as the full version with m_convert.bs as the blend state.
+	StretchRect(sTex, sRect, dTex, dRect, ps, ps_cb, m_convert.bs, linear);
+}
+
+void GSDevice11::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear)
+{ // Draws sTex (texture coordinates sRect) onto the dRect area of dTex as a 4-vertex triangle strip using pixel shader ps.
+	if(!sTex || !dTex)
+	{
+		ASSERT(0);
+		return;
+	}
+
+	BeginScene();
+
+	GSVector2i ds = dTex->GetSize();
+
+	// om
+
+	OMSetDepthStencilState(m_convert.dss, 0);
+	OMSetBlendState(bs, 0);
+	OMSetRenderTargets(dTex, NULL);
+
+	// ia
+	// dRect is given in dTex pixels; map it to the [-1, 1] range with y pointing up.
+	float left = dRect.x * 2 / ds.x - 1.0f;
+	float top = 1.0f - dRect.y * 2 / ds.y;
+	float right = dRect.z * 2 / ds.x - 1.0f;
+	float bottom = 1.0f - dRect.w * 2 / ds.y;
+
+	GSVertexPT1 vertices[] =
+	{
+		{GSVector4(left, top, 0.5f, 1.0f), GSVector2(sRect.x, sRect.y)},
+		{GSVector4(right, top, 0.5f, 1.0f), GSVector2(sRect.z, sRect.y)},
+		{GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sRect.x, sRect.w)},
+		{GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sRect.z, sRect.w)},
+	};
+
+
+
+	IASetVertexBuffer(vertices, sizeof(vertices[0]), countof(vertices));
+	IASetInputLayout(m_convert.il);
+	IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
+
+	// vs
+
+	VSSetShader(m_convert.vs, NULL);
+
+
+	// gs
+	/* NVIDIA HACK!!!!
+	Not sure why, but having the geometry shader disabled causes strange stretching with recent drivers. */
+
+	GSSelector sel; // only referenced by the disabled SetupGS() code below
+	//Don't use shading for stretching, we're just passing through - Note: with Win10 it seems to cause other bugs when shading is off if any of the coords is greater than 0
+	//I really don't know what's going on there, but this seems to resolve it mostly (if not all; not tested on a lot of games, only BIOS, FFXII and VP2)
+	//sel.iip = (sRect.y > 0.0f || sRect.w > 0.0f) ? 1 : 0; 
+	//sel.prim = 2; //Triangle Strip
+	//SetupGS(sel);
+
+	GSSetShader(NULL); // the hack is disabled: no geometry shader is bound
+
+	/*END OF HACK*/
+	
+	//
+
+	// ps
+
+	PSSetShaderResources(sTex, NULL);
+	PSSetSamplerState(linear ? m_convert.ln : m_convert.pt, NULL); // linear or point sampler
+	PSSetShader(ps, ps_cb);
+
+	//
+
+	DrawPrimitive();
+
+	//
+
+	EndScene();
+
+	PSSetShaderResources(NULL, NULL); // clear the cached source slots so sTex is not left recorded as an input
+}
+
+void GSDevice11::DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c)
+{ // Clears dTex to c, then draws sTex[1] (unless slbg is set) and sTex[0] on top of it.
+	ClearRenderTarget(dTex, c);
+	// Second input: plain draw with m_merge.ps[0], default blend state, skipped when slbg is set.
+	if(sTex[1] && !slbg)
+	{
+		StretchRect(sTex[1], sRect[1], dTex, dRect[1], m_merge.ps[0], NULL, true);
+	}
+	// First input: c is uploaded to m_merge.cb and mmod picks m_merge.ps[1] over m_merge.ps[0]; m_merge.bs is the blend state.
+	if(sTex[0])
+	{
+		m_ctx->UpdateSubresource(m_merge.cb, 0, NULL, &c, 0, 0);
+
+		StretchRect(sTex[0], sRect[0], dTex, dRect[0], m_merge.ps[mmod ? 1 : 0], m_merge.cb, m_merge.bs, true);
+	}
+}
+
+void GSDevice11::DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset)
+{ // Draws all of sTex onto dTex with m_interlace.ps[shader], shifting the destination rectangle down by yoffset.
+	GSVector4 s = GSVector4(dTex->GetSize());
+
+	GSVector4 sRect(0, 0, 1, 1);
+	GSVector4 dRect(0.0f, yoffset, s.x, s.y + yoffset);
+
+	InterlaceConstantBuffer cb;
+	// Shader constants derived from the destination height.
+	cb.ZrH = GSVector2(0, 1.0f / s.y);
+	cb.hH = s.y / 2;
+
+	m_ctx->UpdateSubresource(m_interlace.cb, 0, NULL, &cb, 0, 0);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_interlace.ps[shader], m_interlace.cb, linear);
+}
+
+//Included an init function for this also. Just to be safe.
+void GSDevice11::InitExternalFX()
+{ // One-time load and compile of the external post-processing pixel shader into m_shaderfx.ps.
+	if (!ExShader_Compiled)
+	{
+		try {
+			std::string config_name(theApp.GetConfig("shaderfx_conf", "shaders/GSdx_FX_Settings.ini"));
+			std::ifstream fconfig(config_name);
+			std::stringstream shader;
+			if (fconfig.good())
+				shader << fconfig.rdbuf() << "\n";
+			else
+				fprintf(stderr, "GSdx: External shader config '%s' not loaded.\n", config_name.c_str());
+			// The settings text (when it could be read) precedes the shader source in the compiled string.
+			std::string shader_name(theApp.GetConfig("shaderfx_glsl", "shaders/GSdx.fx"));
+			std::ifstream fshader(shader_name);
+			if (fshader.good())
+			{
+				shader << fshader.rdbuf();
+				CompileShader(shader.str().c_str(), shader.str().length(), shader_name.c_str(), nullptr, "ps_main", nullptr, &m_shaderfx.ps);
+			}
+			else
+			{
+				fprintf(stderr, "GSdx: External shader '%s' not loaded and will be disabled!\n", shader_name.c_str());
+			}
+		}
+		catch (const GSDXRecoverableError&) { // caught by const reference: no copy or slicing of the thrown object
+			printf("GSdx: failed to compile external post-processing shader. \n");
+		}
+		ExShader_Compiled = true; // set even after a failure, so loading is attempted only once
+	}
+}
+
+void GSDevice11::DoExternalFX(GSTexture* sTex, GSTexture* dTex)
+{ // Draws all of sTex over all of dTex through the external post-processing shader (m_shaderfx.ps).
+	GSVector2i s = dTex->GetSize();
+
+	GSVector4 sRect(0, 0, 1, 1);
+	GSVector4 dRect(0, 0, s.x, s.y);
+
+	ExternalFXConstantBuffer cb;
+
+	InitExternalFX(); // compiles the shader on first use; NOTE(review): m_shaderfx.ps presumably stays NULL if that failed - confirm how StretchRect copes
+	// Constants: destination size in pixels and its reciprocal.
+	cb.xyFrame = GSVector2(s.x, s.y);
+	cb.rcpFrame = GSVector4(1.0f / s.x, 1.0f / s.y, 0.0f, 0.0f);
+	cb.rcpFrameOpt = GSVector4::zero();
+
+	m_ctx->UpdateSubresource(m_shaderfx.cb, 0, NULL, &cb, 0, 0);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_shaderfx.ps, m_shaderfx.cb, true);
+}
+
+// This shouldn't be necessary, we have some bug corrupting memory
+// and for some reason isolating this code makes the plugin not crash
+void GSDevice11::InitFXAA()
+{ // One-time compile of the embedded FXAA shader resource (IDR_FXAA_FX) into m_fxaa.ps.
+	if (!FXAA_Compiled)
+	{
+		try {
+			vector<unsigned char> shader;
+			theApp.LoadResource(IDR_FXAA_FX, shader);
+			CompileShader((const char *)shader.data(), shader.size(), "fxaa.fx", nullptr, "ps_main", nullptr, &m_fxaa.ps);
+		}
+		catch (const GSDXRecoverableError&) { // caught by const reference: no copy or slicing of the thrown object
+			printf("GSdx: failed to compile fxaa shader.\n");
+		}
+		FXAA_Compiled = true; // set even after a failure, so compilation is attempted only once
+	}
+}
+
+void GSDevice11::DoFXAA(GSTexture* sTex, GSTexture* dTex)
+{ // Draws all of sTex over all of dTex through the FXAA pixel shader (m_fxaa.ps).
+	GSVector2i s = dTex->GetSize();
+
+	GSVector4 sRect(0, 0, 1, 1);
+	GSVector4 dRect(0, 0, s.x, s.y);
+
+	FXAAConstantBuffer cb;
+
+	InitFXAA(); // compiles the shader on first use
+	// Constants: reciprocal of the destination size.
+	cb.rcpFrame = GSVector4(1.0f / s.x, 1.0f / s.y, 0.0f, 0.0f);
+	cb.rcpFrameOpt = GSVector4::zero();
+
+	m_ctx->UpdateSubresource(m_fxaa.cb, 0, NULL, &cb, 0, 0);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_fxaa.ps, m_fxaa.cb, true);
+	// Disabled debugging aid: dump source and destination to disk.
+	//sTex->Save("c:\\temp1\\1.bmp");
+	//dTex->Save("c:\\temp1\\2.bmp");
+}
+
+void GSDevice11::DoShadeBoost(GSTexture* sTex, GSTexture* dTex)
+{ // Draws all of sTex over all of dTex through the shade-boost pixel shader (m_shadeboost.ps).
+	GSVector2i s = dTex->GetSize();
+
+	GSVector4 sRect(0, 0, 1, 1);
+	GSVector4 dRect(0, 0, s.x, s.y);
+
+	ShadeBoostConstantBuffer cb;
+	// Constants: reciprocal of the destination size.
+	cb.rcpFrame = GSVector4(1.0f / s.x, 1.0f / s.y, 0.0f, 0.0f);
+	cb.rcpFrameOpt = GSVector4::zero();
+
+	m_ctx->UpdateSubresource(m_shadeboost.cb, 0, NULL, &cb, 0, 0);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_shadeboost.ps, m_shadeboost.cb, true);
+}
+
+void GSDevice11::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm)
+{ // Clears the stencil of ds, then draws the 4 given vertices into it (no colour target) with rt as pixel-shader input.
+	// sfex3 (after the capcom logo), vf4 (first menu fading in), ffxii shadows, rumble roses shadows, persona4 shadows
+
+	BeginScene();
+
+	ClearStencil(ds, 0);
+
+	// om
+
+	OMSetDepthStencilState(m_date.dss, 1); // stencil reference value 1
+	OMSetBlendState(m_date.bs, 0);
+	OMSetRenderTargets(NULL, ds); // depth-stencil only
+
+	// ia
+
+	IASetVertexBuffer(vertices, sizeof(vertices[0]), 4);
+	IASetInputLayout(m_convert.il);
+	IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
+
+	// vs
+
+	VSSetShader(m_convert.vs, NULL);
+
+	// gs
+
+	GSSetShader(NULL);
+
+	// ps
+
+	GSTexture* rt2 = rt->IsMSAA() ? Resolve(rt) : rt; // a multisampled rt is resolved to a temporary first
+
+	PSSetShaderResources(rt2, NULL);
+	PSSetSamplerState(m_convert.pt, NULL);
+	PSSetShader(m_convert.ps[datm ? 2 : 3], NULL); // datm selects convert shader 2, otherwise 3
+
+	//
+
+	DrawPrimitive();
+
+	//
+
+	EndScene();
+
+	if(rt2 != rt) Recycle(rt2); // hand back the temporary resolve target
+}
+
+void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t count)
+{ // Copies count vertices of `stride` bytes each into the device vertex buffer and binds it; nothing is copied if mapping fails.
+	void* ptr = NULL;
+
+	if(IAMapVertexBuffer(&ptr, stride, count))
+	{
+		GSVector4i::storent(ptr, vertex, count * stride); // copy count * stride bytes into the mapped memory
+
+		IAUnmapVertexBuffer();
+	}
+}
+
+bool GSDevice11::IAMapVertexBuffer(void** vertex, size_t stride, size_t count)
+{ // Maps room for count vertices of `stride` bytes and returns the write pointer in *vertex; false if buffer creation or mapping fails.
+	ASSERT(m_vertex.count == 0);
+	// Request larger than the current capacity in bytes: park the old buffer in m_vb_old and size a new one.
+	if(count * stride > m_vertex.limit * m_vertex.stride)
+	{
+		m_vb_old = m_vb;
+		m_vb = NULL;
+
+		m_vertex.start = 0;
+		m_vertex.limit = std::max<int>(count * 3 / 2, 11000); // 1.5x the request, at least 11000 vertices
+	}
+
+	if(m_vb == NULL)
+	{
+		D3D11_BUFFER_DESC bd;
+
+		memset(&bd, 0, sizeof(bd));
+		// Dynamic, CPU-writable vertex buffer holding m_vertex.limit vertices of the requested stride.
+		bd.Usage = D3D11_USAGE_DYNAMIC;
+		bd.ByteWidth = m_vertex.limit * stride;
+		bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+		bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+
+		HRESULT hr;
+
+		hr = m_dev->CreateBuffer(&bd, NULL, &m_vb);
+
+		if(FAILED(hr)) return false;
+	}
+	// Default: write after the current start without overwriting existing contents.
+	D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
+	// If the data would run past the limit, or the stride changed, restart at 0 and discard the old contents.
+	if(m_vertex.start + count > m_vertex.limit || stride != m_vertex.stride)
+	{
+		m_vertex.start = 0;
+
+		type = D3D11_MAP_WRITE_DISCARD;
+	}
+
+	D3D11_MAPPED_SUBRESOURCE m;
+
+	if(FAILED(m_ctx->Map(m_vb, 0, type, 0, &m)))
+	{
+		return false;
+	}
+
+	*vertex = (uint8*)m.pData + m_vertex.start * stride; // byte offset of the first vertex to write
+
+	m_vertex.count = count;
+	m_vertex.stride = stride;
+
+	return true; // caller must follow up with IAUnmapVertexBuffer()
+}
+
+void GSDevice11::IAUnmapVertexBuffer()
+{ // Ends the mapping of m_vb and binds it with the stride recorded by IAMapVertexBuffer().
+	m_ctx->Unmap(m_vb, 0);
+
+	IASetVertexBuffer(m_vb, m_vertex.stride);
+}
+
+void GSDevice11::IASetVertexBuffer(ID3D11Buffer* vb, size_t stride)
+{
+	// Nothing to do when the cached state already records this buffer/stride pair.
+	if(m_state.vb == vb && m_state.vb_stride == stride) return;
+
+	m_state.vb = vb;
+	m_state.vb_stride = stride;
+
+	// Slot 0, one buffer, 32-bit stride and a zero byte offset.
+	uint32 byteStride = stride;
+	uint32 byteOffset = 0;
+	m_ctx->IASetVertexBuffers(0, 1, &vb, &byteStride, &byteOffset);
+}
+
+void GSDevice11::IASetIndexBuffer(const void* index, size_t count)
+{ // Copies count 32-bit indices into the device index buffer (growing it when needed) and binds it.
+	ASSERT(m_index.count == 0);
+	// More indices than the buffer can hold: park the old buffer in m_ib_old and size a new one.
+	if(count > m_index.limit)
+	{
+		m_ib_old = m_ib;
+		m_ib = NULL;
+
+		m_index.start = 0;
+		m_index.limit = std::max<int>(count * 3 / 2, 11000); // 1.5x the request, at least 11000 indices
+	}
+
+	if(m_ib == NULL)
+	{
+		D3D11_BUFFER_DESC bd;
+
+		memset(&bd, 0, sizeof(bd));
+		// Dynamic, CPU-writable index buffer holding m_index.limit 32-bit indices.
+		bd.Usage = D3D11_USAGE_DYNAMIC;
+		bd.ByteWidth = m_index.limit * sizeof(uint32);
+		bd.BindFlags = D3D11_BIND_INDEX_BUFFER;
+		bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+
+		HRESULT hr;
+
+		hr = m_dev->CreateBuffer(&bd, NULL, &m_ib);
+
+		if(FAILED(hr)) return;
+	}
+	// Default: write after the current start without overwriting existing contents.
+	D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
+	// If the data would run past the limit, restart at 0 and discard the old contents.
+	if(m_index.start + count > m_index.limit)
+	{
+		m_index.start = 0;
+
+		type = D3D11_MAP_WRITE_DISCARD;
+	}
+
+	D3D11_MAPPED_SUBRESOURCE m;
+
+	if(SUCCEEDED(m_ctx->Map(m_ib, 0, type, 0, &m)))
+	{
+		memcpy((uint8*)m.pData + m_index.start * sizeof(uint32), index, count * sizeof(uint32));
+
+		m_ctx->Unmap(m_ib, 0);
+	}
+	// Note: the count is recorded and the buffer bound even when Map() failed above.
+	m_index.count = count;
+
+	IASetIndexBuffer(m_ib);
+}
+
+void GSDevice11::IASetIndexBuffer(ID3D11Buffer* ib)
+{
+	// Nothing to do when the cached state already records this index buffer.
+	if(m_state.ib == ib) return;
+
+	m_state.ib = ib;
+
+	m_ctx->IASetIndexBuffer(ib, DXGI_FORMAT_R32_UINT, 0); // 32-bit indices, zero offset
+}
+
+void GSDevice11::IASetInputLayout(ID3D11InputLayout* layout)
+{
+	// Nothing to do when the cached state already records this input layout.
+	if(m_state.layout == layout) return;
+
+	m_state.layout = layout;
+
+	m_ctx->IASetInputLayout(layout);
+}
+
+void GSDevice11::IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY topology)
+{
+	// Nothing to do when the cached state already records this topology.
+	if(m_state.topology == topology) return;
+
+	m_state.topology = topology;
+
+	m_ctx->IASetPrimitiveTopology(topology);
+}
+
+void GSDevice11::VSSetShader(ID3D11VertexShader* vs, ID3D11Buffer* vs_cb)
+{ // Binds the vertex shader and its constant buffer (slot 0), each only when it differs from the cached state.
+	if(m_state.vs != vs)
+	{
+		m_state.vs = vs;
+
+		m_ctx->VSSetShader(vs, NULL, 0);
+	}
+	// The constant buffer is tracked independently of the shader.
+	if(m_state.vs_cb != vs_cb)
+	{
+		m_state.vs_cb = vs_cb;
+
+		m_ctx->VSSetConstantBuffers(0, 1, &vs_cb);
+	}
+}
+
+void GSDevice11::GSSetShader(ID3D11GeometryShader* gs)
+{
+	// Nothing to do when the cached state already records this geometry shader (NULL included).
+	if(m_state.gs == gs) return;
+
+	m_state.gs = gs;
+
+	m_ctx->GSSetShader(gs, NULL, 0);
+}
+
+void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
+{ // Records sr0 and sr1 for pixel-shader slots 0 and 1 and clears every remaining cached slot.
+	PSSetShaderResource(0, sr0);
+	PSSetShaderResource(1, sr1);
+	// Slots 2 and up are reset to NULL.
+	for(size_t i = 2; i < countof(m_state.ps_srv); i++)
+	{
+		PSSetShaderResource(i, NULL);
+	}
+}
+
+void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
+{ // Records the shader-resource view of sr (or NULL) for pixel-shader slot i.
+	ID3D11ShaderResourceView* srv = NULL;
+
+	if(sr) srv = *(GSTexture11*)sr; // sr is treated as a GSTexture11 and converted to its view
+
+	PSSetShaderResourceView(i, srv);
+}
+
+void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv)
+{
+	ASSERT(i < countof(m_state.ps_srv));
+
+	// Only the cache is updated here; an unchanged slot leaves the dirty flag alone.
+	if(m_state.ps_srv[i] == srv) return;
+
+	m_state.ps_srv[i] = srv;
+
+	m_srv_changed = true; // PSSetShader() pushes the whole array to the context when this is set
+}
+
+void GSDevice11::PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2)
+{
+	// Only the cache is updated here; if all three samplers match, the dirty flag is left alone.
+	if(m_state.ps_ss[0] == ss0 && m_state.ps_ss[1] == ss1 && m_state.ps_ss[2] == ss2) return;
+
+	m_state.ps_ss[0] = ss0;
+	m_state.ps_ss[1] = ss1;
+	m_state.ps_ss[2] = ss2;
+
+	m_ss_changed = true; // PSSetShader() pushes the sampler array to the context when this is set
+}
+
+void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
+{ // Binds the pixel shader and constant buffer (slot 0) when changed, and flushes pending resource-view and sampler changes.
+	if(m_state.ps != ps)
+	{
+		m_state.ps = ps;
+
+		m_ctx->PSSetShader(ps, NULL, 0);
+	}
+	// Resource views recorded by PSSetShaderResourceView() are sent as one array, starting at slot 0.
+	if(m_srv_changed)
+	{
+		m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv);
+
+		m_srv_changed = false;
+	}
+	// Likewise for the samplers recorded by PSSetSamplerState().
+	if(m_ss_changed)
+	{
+		m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss);
+
+		m_ss_changed = false;
+	}
+
+	if(m_state.ps_cb != ps_cb)
+	{
+		m_state.ps_cb = ps_cb;
+
+		m_ctx->PSSetConstantBuffers(0, 1, &ps_cb);
+	}
+}
+
+void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv)
+{
+	// Nothing to do when the cached state already records this view for compute slot i.
+	if(m_state.cs_srv[i] == srv) return;
+
+	m_state.cs_srv[i] = srv;
+
+	m_ctx->CSSetShaderResources(i, 1, &srv);
+}
+
+void GSDevice11::CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav)
+{ // Binds one unordered-access view at compute slot i with zeroed initial counters; this call is not state-cached.
+	uint32 counters[8];
+		
+	memset(counters, 0, sizeof(counters));
+
+	m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, counters);
+}
+
+void GSDevice11::CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb)
+{ // Binds the compute shader and its constant buffer (slot 0), each only when it differs from the cached state.
+	if(m_state.cs != cs)
+	{
+		m_state.cs = cs;
+
+		m_ctx->CSSetShader(cs, NULL, 0);
+	}
+	// The constant buffer is tracked independently of the shader.
+	if(m_state.cs_cb != cs_cb)
+	{
+		m_state.cs_cb = cs_cb;
+
+		m_ctx->CSSetConstantBuffers(0, 1, &cs_cb);
+	}
+}
+
+void GSDevice11::OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref)
+{
+	// Nothing to do when both the state object and the stencil reference match the cache.
+	if(m_state.dss == dss && m_state.sref == sref) return;
+
+	m_state.dss = dss;
+	m_state.sref = sref;
+
+	m_ctx->OMSetDepthStencilState(dss, sref);
+}
+
+void GSDevice11::OMSetBlendState(ID3D11BlendState* bs, float bf)
+{
+	// Nothing to do when both the blend state and the blend factor match the cache.
+	if(m_state.bs == bs && m_state.bf == bf) return;
+
+	m_state.bs = bs;
+	m_state.bf = bf;
+
+	const float factor[4] = {bf, bf, bf, 0}; // bf on the first three components, 0 on the fourth
+
+	m_ctx->OMSetBlendState(bs, factor, 0xffffffff);
+}
+
+void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor)
+{ // Binds colour target rt and/or depth-stencil ds and updates viewport and scissor; throws GSDXRecoverableError if both are NULL.
+	ID3D11RenderTargetView* rtv = NULL;
+	ID3D11DepthStencilView* dsv = NULL;
+
+	if (!rt && !ds)
+		throw GSDXRecoverableError();
+
+	if(rt) rtv = *(GSTexture11*)rt;
+	if(ds) dsv = *(GSTexture11*)ds;
+
+	if(m_state.rtv != rtv || m_state.dsv != dsv)
+	{
+		m_state.rtv = rtv;
+		m_state.dsv = dsv;
+
+		m_ctx->OMSetRenderTargets(1, &rtv, dsv);
+	}
+	// The viewport follows the size of rt, or of ds when there is no rt.
+	GSVector2i size = rt ? rt->GetSize() : ds->GetSize();
+	if(m_state.viewport != size)
+	{
+		bool isNative = theApp.GetConfig("upscale_multiplier", 1) == 1;
+		m_state.viewport = size;
+
+		D3D11_VIEWPORT vp;
+
+		memset(&vp, 0, sizeof(vp));
+		// Origin is shifted by -0.01 unless spritehack is positive or the upscale multiplier is 1.
+		vp.TopLeftX = (spritehack > 0 || isNative) ? 0.0f : -0.01f;
+		vp.TopLeftY = (spritehack > 0 || isNative) ? 0.0f : -0.01f;
+		vp.Width = (float)size.x;
+		vp.Height = (float)size.y;
+		vp.MinDepth = 0.0f;
+		vp.MaxDepth = 1.0f;
+
+		m_ctx->RSSetViewports(1, &vp);
+	}
+	// Without an explicit scissor, a rectangle derived from the target size is used.
+	GSVector4i r = scissor ? *scissor : GSVector4i(size).zwxy();
+
+	if(!m_state.scissor.eq(r))
+	{
+		m_state.scissor = r;
+
+		m_ctx->RSSetScissorRects(1, r);
+	}
+}
+
+void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor)
+{ // Binds `count` unordered-access views with no colour or depth target, then sets a viewport of rtsize and the scissor.
+	m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, counters);
+	// The cached target views are reset since no render target or depth buffer is bound any more.
+	m_state.rtv = NULL;
+	m_state.dsv = NULL;
+
+	if(m_state.viewport != rtsize)
+	{
+		m_state.viewport = rtsize;
+
+		D3D11_VIEWPORT vp;
+
+		memset(&vp, 0, sizeof(vp));
+		// Unlike the texture overload, the origin is always exactly (0, 0).
+		vp.TopLeftX = 0;
+		vp.TopLeftY = 0;
+		vp.Width = (float)rtsize.x;
+		vp.Height = (float)rtsize.y;
+		vp.MinDepth = 0.0f;
+		vp.MaxDepth = 1.0f;
+
+		m_ctx->RSSetViewports(1, &vp);
+	}
+	// Without an explicit scissor, a rectangle derived from rtsize is used.
+	GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy();
+
+	if(!m_state.scissor.eq(r))
+	{
+		m_state.scissor = r;
+
+		m_ctx->RSSetScissorRects(1, r);
+	}
+}
+
+void GSDevice11::CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
+{ // Compiles `entry` from source for target m_shader.vs, then creates *vs and an input layout *il from the bytecode; throws GSDXRecoverableError on any failure.
+	HRESULT hr;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro); // builds the macro list handed to the compiler
+
+	CComPtr<ID3DBlob> shader, error;
+	// The include handler is dropped when s_old_d3d_compiler_dll is set.
+	hr = s_pD3DCompile(source, size, fn, &m[0], s_old_d3d_compiler_dll? nullptr : include, entry, m_shader.vs.c_str(), 0, 0, &shader, &error);
+
+	if(error) // compiler messages are printed whether or not compilation failed
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+
+	hr = m_dev->CreateVertexShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(), NULL, vs);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+	// The input layout (count elements) is created against the same bytecode.
+	hr = m_dev->CreateInputLayout(layout, count, shader->GetBufferPointer(), shader->GetBufferSize(), il);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSDevice11::CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11GeometryShader** gs)
+{ // Compiles `entry` from source for target m_shader.gs and creates the geometry shader *gs; throws GSDXRecoverableError on any failure.
+	HRESULT hr;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro); // builds the macro list handed to the compiler
+
+	CComPtr<ID3DBlob> shader, error;
+	// The include handler is dropped when s_old_d3d_compiler_dll is set.
+	hr = s_pD3DCompile(source, size, fn, &m[0], s_old_d3d_compiler_dll ? nullptr : include, entry, m_shader.gs.c_str(), 0, 0, &shader, &error);
+
+	if(error) // compiler messages are printed whether or not compilation failed
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+
+	hr = m_dev->CreateGeometryShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(), NULL, gs);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSDevice11::CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count)
+{ // Same as the plain geometry-shader overload, but creates *gs with a stream-output declaration of `count` entries; throws GSDXRecoverableError on any failure.
+	HRESULT hr;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro); // builds the macro list handed to the compiler
+
+	CComPtr<ID3DBlob> shader, error;
+	// The include handler is dropped when s_old_d3d_compiler_dll is set.
+	hr = s_pD3DCompile(source, size, fn, &m[0], s_old_d3d_compiler_dll ? nullptr : include, entry, m_shader.gs.c_str(), 0, 0, &shader, &error);
+
+	if(error) // compiler messages are printed whether or not compilation failed
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+	// No buffer strides are supplied and D3D11_SO_NO_RASTERIZED_STREAM is requested.
+	hr = m_dev->CreateGeometryShaderWithStreamOutput((void*)shader->GetBufferPointer(), shader->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSDevice11::CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11PixelShader** ps)
+{ // Compiles `entry` from source for target m_shader.ps and creates the pixel shader *ps; throws GSDXRecoverableError on any failure.
+	HRESULT hr;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro); // builds the macro list handed to the compiler
+
+	CComPtr<ID3DBlob> shader, error;
+	// The include handler is dropped when s_old_d3d_compiler_dll is set.
+	hr = s_pD3DCompile(source, size, fn, &m[0], s_old_d3d_compiler_dll ? nullptr : include, entry, m_shader.ps.c_str(), 0, 0, &shader, &error);
+
+	if(error) // compiler messages are printed whether or not compilation failed
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+
+	hr = m_dev->CreatePixelShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(), NULL, ps);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSDevice11::CompileShader(const char* source, size_t size, const char *fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11ComputeShader** cs)
+{ // Compiles `entry` from source for target m_shader.cs and creates the compute shader *cs; throws GSDXRecoverableError on any failure.
+	HRESULT hr;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro); // builds the macro list handed to the compiler
+
+	CComPtr<ID3DBlob> shader, error;
+	// The include handler is dropped when s_old_d3d_compiler_dll is set.
+	hr = s_pD3DCompile(source, size, fn, &m[0], s_old_d3d_compiler_dll ? nullptr : include, entry, m_shader.cs.c_str(), 0, 0, &shader, &error);
+
+	if(error) // compiler messages are printed whether or not compilation failed
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+
+	hr = m_dev->CreateComputeShader((void*)shader->GetBufferPointer(), shader->GetBufferSize(), NULL, cs);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
diff --git a/plugins/GSdx_legacy/GSDevice11.h b/plugins/GSdx_legacy/GSDevice11.h
new file mode 100644
index 0000000000..06ef75f230
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice11.h
@@ -0,0 +1,236 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDeviceDX.h"
+#include "GSTexture11.h"
+
+struct GSVertexShader11
+{ // A vertex shader paired with an input layout; used as the value type of GSDevice11::m_vs.
+	CComPtr<ID3D11VertexShader> vs;
+	CComPtr<ID3D11InputLayout> il;
+};
+
+class GSDevice11 : public GSDeviceDX
+{ // Direct3D 11 implementation of GSDeviceDX: owns the device, context and swap chain and caches pipeline state.
+	GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format);
+	// Full-texture passes; each one ends in a StretchRect() draw.
+	void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c);
+	void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset = 0);
+	void DoFXAA(GSTexture* sTex, GSTexture* dTex);
+	void DoShadeBoost(GSTexture* sTex, GSTexture* dTex);
+	void DoExternalFX(GSTexture* sTex, GSTexture* dTex);
+	// One-time shader compilation helpers, called from the matching Do* pass.
+	void InitExternalFX();
+	void InitFXAA(); // Bug workaround! Stack corruption? Heap corruption? No idea
+	
+	// Core D3D11 objects plus the current and previous dynamic vertex/index buffers.
+
+	CComPtr<ID3D11Device> m_dev;
+	CComPtr<ID3D11DeviceContext> m_ctx;
+	CComPtr<IDXGISwapChain> m_swapchain;
+	CComPtr<ID3D11Buffer> m_vb;
+	CComPtr<ID3D11Buffer> m_vb_old;
+	CComPtr<ID3D11Buffer> m_ib;
+	CComPtr<ID3D11Buffer> m_ib_old;
+	// Dirty flags: set by PSSetShaderResourceView()/PSSetSamplerState(), flushed and cleared by PSSetShader().
+	bool m_srv_changed, m_ss_changed;
+	int spritehack; // when positive, OMSetRenderTargets() keeps the viewport origin at exactly (0, 0)
+	// Shadow copy of what was last sent to the context; the *Set* methods compare against it to skip redundant calls.
+	struct
+	{
+		ID3D11Buffer* vb;
+		size_t vb_stride;
+		ID3D11Buffer* ib;
+		ID3D11InputLayout* layout;
+		D3D11_PRIMITIVE_TOPOLOGY topology;
+		ID3D11VertexShader* vs;
+		ID3D11Buffer* vs_cb;
+		ID3D11GeometryShader* gs;
+		ID3D11ShaderResourceView* ps_srv[16];
+		ID3D11PixelShader* ps;
+		ID3D11Buffer* ps_cb;
+		ID3D11SamplerState* ps_ss[3];
+		ID3D11ShaderResourceView* cs_srv[16];
+		ID3D11ComputeShader* cs;
+		ID3D11Buffer* cs_cb;
+		GSVector2i viewport;
+		GSVector4i scissor;
+		ID3D11DepthStencilState* dss;
+		uint8 sref;
+		ID3D11BlendState* bs;
+		float bf;
+		ID3D11RenderTargetView* rtv;
+		ID3D11DepthStencilView* dsv;
+	} m_state;
+
+public: // TODO
+	CComPtr<ID3D11RasterizerState> m_rs;
+	// Set once InitFXAA()/InitExternalFX() has run, whether or not compilation succeeded.
+	bool FXAA_Compiled;
+	bool ExShader_Compiled;
+	// Resources for StretchRect()-style copies: convert shaders, linear/point samplers and fixed states.
+	struct
+	{
+		CComPtr<ID3D11InputLayout> il;
+		CComPtr<ID3D11VertexShader> vs;
+		CComPtr<ID3D11PixelShader> ps[10];
+		CComPtr<ID3D11SamplerState> ln;
+		CComPtr<ID3D11SamplerState> pt;
+		CComPtr<ID3D11DepthStencilState> dss;
+		CComPtr<ID3D11BlendState> bs;
+	} m_convert;
+	// Resources for DoMerge(): ps[1] is used when mmod is set, ps[0] otherwise.
+	struct
+	{
+		CComPtr<ID3D11PixelShader> ps[2];
+		CComPtr<ID3D11Buffer> cb;
+		CComPtr<ID3D11BlendState> bs;
+	} m_merge;
+	// Resources for DoInterlace(); the shader is chosen by index.
+	struct
+	{
+		CComPtr<ID3D11PixelShader> ps[4];
+		CComPtr<ID3D11Buffer> cb;
+	} m_interlace;
+	// Resources for DoExternalFX().
+	struct
+	{
+		CComPtr<ID3D11PixelShader> ps;
+		CComPtr<ID3D11Buffer> cb;
+	} m_shaderfx;
+	// Resources for DoFXAA().
+	struct 
+	{
+		CComPtr<ID3D11PixelShader> ps;
+		CComPtr<ID3D11Buffer> cb;
+	} m_fxaa;
+	// Resources for DoShadeBoost().
+	struct 
+	{
+		CComPtr<ID3D11PixelShader> ps;
+		CComPtr<ID3D11Buffer> cb;
+	} m_shadeboost;
+	// Depth-stencil and blend states used by SetupDATE().
+	struct
+	{
+		CComPtr<ID3D11DepthStencilState> dss;
+		CComPtr<ID3D11BlendState> bs;
+	} m_date;
+
+	void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm);
+
+	// Shaders and state objects, stored in hash maps keyed by a uint32 (presumably the packed selector value - confirm in SetupVS/GS/PS/OM).
+
+	hash_map<uint32, GSVertexShader11 > m_vs;
+	CComPtr<ID3D11Buffer> m_vs_cb;
+	hash_map<uint32, CComPtr<ID3D11GeometryShader> > m_gs;
+	hash_map<uint32, CComPtr<ID3D11PixelShader> > m_ps;
+	CComPtr<ID3D11Buffer> m_ps_cb;
+	hash_map<uint32, CComPtr<ID3D11SamplerState> > m_ps_ss;
+	CComPtr<ID3D11SamplerState> m_palette_ss;
+	CComPtr<ID3D11SamplerState> m_rt_ss;
+	hash_map<uint32, CComPtr<ID3D11DepthStencilState> > m_om_dss;
+	hash_map<uint32, CComPtr<ID3D11BlendState> > m_om_bs;
+
+	VSConstantBuffer m_vs_cb_cache;
+	PSConstantBuffer m_ps_cb_cache;
+
+	bool CreateTextureFX();
+
+public:
+	GSDevice11();
+	virtual ~GSDevice11();
+	// Device lifetime and presentation.
+	bool Create(GSWnd* wnd);
+	bool Reset(int w, int h);
+	void Flip();
+
+	void SetExclusive(bool isExcl);
+	// Draw and dispatch calls using the ranges recorded in m_vertex / m_index.
+	void DrawPrimitive();
+	void DrawIndexedPrimitive();
+	void DrawIndexedPrimitive(int offset, int count);
+	void Dispatch(uint32 x, uint32 y, uint32 z);
+	// Clears; a NULL texture is ignored.
+	void ClearRenderTarget(GSTexture* t, const GSVector4& c);
+	void ClearRenderTarget(GSTexture* t, uint32 c);
+	void ClearDepth(GSTexture* t, float c);
+	void ClearStencil(GSTexture* t, uint8 c);
+	// Texture creation; format 0 selects a per-type default DXGI format.
+	GSTexture* CreateRenderTarget(int w, int h, bool msaa, int format = 0);
+	GSTexture* CreateDepthStencil(int w, int h, bool msaa, int format = 0);
+	GSTexture* CreateTexture(int w, int h, int format = 0);
+	GSTexture* CreateOffscreen(int w, int h, int format = 0);
+
+	GSTexture* Resolve(GSTexture* t);
+
+	GSTexture* CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format = 0, int ps_shader = 0);
+
+	void CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r);
+	// The first two overloads forward to the third, filling in the convert shader / default blend state.
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, bool linear = true);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear = true);
+	// Pipeline-stage setters (IA/VS/GS/PS/CS/OM); most skip the context call when m_state already matches.
+	void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
+	bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count);
+	void IAUnmapVertexBuffer();
+	void IASetVertexBuffer(ID3D11Buffer* vb, size_t stride);
+	void IASetIndexBuffer(const void* index, size_t count);
+	void IASetIndexBuffer(ID3D11Buffer* ib);
+	void IASetInputLayout(ID3D11InputLayout* layout);
+	void IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY topology);
+	void VSSetShader(ID3D11VertexShader* vs, ID3D11Buffer* vs_cb);
+	void GSSetShader(ID3D11GeometryShader* gs);
+	void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
+	void PSSetShaderResource(int i, GSTexture* sr);
+	void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv);
+	void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb);
+	void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL);
+	void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv);
+	void CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav);
+	void CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb);
+	void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref);
+	void OMSetBlendState(ID3D11BlendState* bs, float bf);
+	void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
+	void OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor = NULL);
+	// Selector-driven setup of the emulation shaders and output-merger state.
+	void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
+	void SetupGS(GSSelector sel);
+	void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel);
+	void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix);
+
+	bool HasStencil() { return true; }
+	bool HasDepth32() { return true; }
+	// Convenience access to the underlying device and context.
+	ID3D11Device* operator->() {return m_dev;}
+	operator ID3D11Device*() {return m_dev;}
+	operator ID3D11DeviceContext*() {return m_ctx;}
+	// Compile `entry` from source and create the matching shader object; each overload throws GSDXRecoverableError on failure.
+	void CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
+	void CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
+	void CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count);
+	void CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11PixelShader** ps);
+	void CompileShader(const char* source, size_t size, const char* fn, ID3DInclude *include, const char* entry, D3D_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
+};
+
diff --git a/plugins/GSdx_legacy/GSDevice9.cpp b/plugins/GSdx_legacy/GSDevice9.cpp
new file mode 100644
index 0000000000..1414d07fd7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice9.cpp
@@ -0,0 +1,1523 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSDevice9.h"
+#include "resource.h"
+#include <fstream>
+
+// Constructs a device wrapper that owns no Direct3D objects yet: the cached
+// presentation parameters, device caps and pipeline state are zeroed and the
+// lazily compiled post-processing shaders are flagged as "not built".
+GSDevice9::GSDevice9()
+	: m_lost(false)
+{
+	// Start every plain-data cache from a known all-zero state.
+	memset(&m_state, 0, sizeof(m_state));
+	memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
+	memset(&m_pp, 0, sizeof(m_pp));
+
+	// The only cached state field that does not start at zero.
+	m_state.bf = 0xffffffff;
+
+	// InitFXAA() / InitExternalFX() compile on first use and flip these.
+	ExShader_Compiled = false;
+	FXAA_Compiled = false;
+
+	m_rbswapped = true;
+}
+
+// Releases everything the device allocated itself: the heap objects stored as
+// the mapped values of the state caches and the two aligned shadow copies of
+// the shader constant buffers.
+GSDevice9::~GSDevice9()
+{
+	// The maps hold owning raw pointers in their `second` member.
+	for_each(m_om_bs.begin(), m_om_bs.end(), delete_second());
+	for_each(m_om_dss.begin(), m_om_dss.end(), delete_second());
+	for_each(m_ps_ss.begin(), m_ps_ss.end(), delete_second());
+	for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
+
+	// Shadow constant buffers come from _aligned_malloc (see VSSetShader / PSSetShader).
+	if(m_state.vs_cb != NULL)
+	{
+		_aligned_free(m_state.vs_cb);
+	}
+
+	if(m_state.ps_cb != NULL)
+	{
+		_aligned_free(m_state.ps_cb);
+	}
+}
+
+// Translates an adapter identifier string into a D3D9 adapter ordinal and device type.
+//
+//   d3d9       - Direct3D 9 interface used to enumerate adapters.
+//   adapter    - [out] adapter ordinal; D3DADAPTER_DEFAULT unless a match is found.
+//   devtype    - [out] D3DDEVTYPE_HAL, or D3DDEVTYPE_REF when the id is "ref".
+//   adapter_id - id to look up; when empty, the "Adapter" config value is used
+//                (which itself defaults to "default").
+static void FindAdapter(IDirect3D9 *d3d9, UINT &adapter, D3DDEVTYPE &devtype, std::string adapter_id = "")
+{
+	adapter = D3DADAPTER_DEFAULT;
+	devtype = D3DDEVTYPE_HAL;
+
+	if (adapter_id.length() == 0)
+	{
+		adapter_id = theApp.GetConfig("Adapter", "default");
+	}
+
+	// "default" keeps the values assigned above.
+	if (adapter_id == "default")
+	{
+		return;
+	}
+
+	// "ref" selects the reference rasterizer on the default adapter.
+	if (adapter_id == "ref")
+	{
+		devtype = D3DDEVTYPE_REF;
+		return;
+	}
+
+	// Anything else is compared against every enumerated adapter.
+	const int total = d3d9->GetAdapterCount();
+
+	for (int index = 0; index < total; index++)
+	{
+		D3DADAPTER_IDENTIFIER9 ident;
+
+		// Stop searching as soon as an adapter cannot be queried.
+		if (D3D_OK != d3d9->GetAdapterIdentifier(index, 0, &ident))
+		{
+			return;
+		}
+
+		if (GSAdapter(ident) == adapter_id)
+		{
+			adapter = index;
+			devtype = D3DDEVTYPE_HAL;
+			return;
+		}
+	}
+}
+
+// Checks whether `msaaCount` samples can be used for both an A8R8G8B8 colour
+// surface and a depth surface of `depth_format` on the given adapter/device type.
+// If supported and msaa_desc is non-NULL, msaa_desc receives the requested Count
+// and the highest Quality level that both formats have in common.
+
+static bool IsMsaaSupported(IDirect3D9* d3d, UINT adapter, D3DDEVTYPE devtype, D3DFORMAT depth_format, uint32 msaaCount, DXGI_SAMPLE_DESC* msaa_desc = NULL)
+{
+	if(msaaCount > 16)
+	{
+		return false;
+	}
+
+	D3DCAPS9 caps;
+
+	memset(&caps, 0, sizeof(caps));
+
+	d3d->GetDeviceCaps(adapter, devtype, &caps);
+
+	// Number of quality levels reported for the colour and the depth format.
+	DWORD colorLevels = 0;
+	DWORD depthLevels = 0;
+
+	if(FAILED(d3d->CheckDeviceMultiSampleType(caps.AdapterOrdinal, caps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &colorLevels)) || colorLevels == 0)
+	{
+		return false;
+	}
+
+	if(FAILED(d3d->CheckDeviceMultiSampleType(caps.AdapterOrdinal, caps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &depthLevels)) || depthLevels == 0)
+	{
+		return false;
+	}
+
+	if(msaa_desc != NULL)
+	{
+		msaa_desc->Count = msaaCount;
+		// Quality is zero-based, hence "levels - 1"; take the lower of the two.
+		msaa_desc->Quality = std::min<DWORD>(colorLevels - 1, depthLevels - 1);
+	}
+
+	return true;
+}
+
+// Returns true when `format` can be created as a depth-stencil surface on an
+// X8R8G8B8 adapter format and is compatible with an X8R8G8B8 render target.
+static bool TestDepthFormat(IDirect3D9* d3d, UINT adapter, D3DDEVTYPE devtype, D3DFORMAT format)
+{
+	// The second query is only issued when the first one succeeds.
+	return SUCCEEDED(d3d->CheckDeviceFormat(adapter, devtype, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format))
+		&& SUCCEEDED(d3d->CheckDepthStencilMatch(adapter, devtype, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format));
+}
+
+// Picks the most preferred depth format that the adapter supports and, when
+// multisampling is requested, that also supports `msaaCount` samples.
+// An msaaCount of 1 is treated the same as 0 (no multisampling).
+// Returns D3DFMT_UNKNOWN when no candidate qualifies. msaa_desc is only
+// written by IsMsaaSupported, i.e. when multisampling was requested.
+static D3DFORMAT BestD3dFormat(IDirect3D9* d3d, UINT adapter, D3DDEVTYPE devtype, int msaaCount = 0, DXGI_SAMPLE_DESC* msaa_desc = NULL)
+{
+	// Candidates, most preferred first.
+	static D3DFORMAT candidates[] =
+	{
+		D3DFMT_D32,
+		D3DFMT_D32F_LOCKABLE,
+		D3DFMT_D24S8
+	};
+
+	const bool wantMsaa = (msaaCount != 0) && (msaaCount != 1);
+
+	for(size_t i = 0; i < countof(candidates); i++)
+	{
+		const D3DFORMAT candidate = candidates[i];
+
+		if(!TestDepthFormat(d3d, adapter, devtype, candidate))
+		{
+			continue;
+		}
+
+		if(wantMsaa && !IsMsaaSupported(d3d, adapter, devtype, candidate, msaaCount, msaa_desc))
+		{
+			continue;
+		}
+
+		return candidate;
+	}
+
+	return D3DFMT_UNKNOWN;
+}
+
+// Reports the best depth-buffer precision available for the given MSAA level.
+//
+//   msaa       - requested sample count; 1 is considered the same as 0.
+//   adapter_id - adapter to test; an empty string falls back to the configured adapter.
+//
+// return: 32, 24, or 0 if not supported (or if Direct3D 9 could not be initialised).
+
+uint32 GSDevice9::GetMaxDepth(uint32 msaa, std::string adapter_id)
+{
+	CComPtr<IDirect3D9> d3d;
+
+	d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
+
+	// Direct3DCreate9 returns NULL on failure; without this check the format
+	// queries below would dereference a null interface. Create() guards the
+	// same call the same way.
+	if(!d3d) return 0;
+
+	UINT adapter;
+	D3DDEVTYPE devtype;
+
+	FindAdapter(d3d, adapter, devtype, adapter_id);
+
+	switch(BestD3dFormat(d3d, adapter, devtype, msaa))
+	{
+		case D3DFMT_D32:
+		case D3DFMT_D32F_LOCKABLE:
+			return 32;
+		case D3DFMT_D24S8:
+			return 24;
+	}
+
+	// D3DFMT_UNKNOWN: no usable depth format for this configuration.
+	return 0;
+}
+
+// Resets the "UserHacks_MSAA" setting to 0 when GetMaxDepth() reports that the
+// configured sample count has no usable depth format.
+void GSDevice9::ForceValidMsaaConfig()
+{
+	const int requested = theApp.GetConfig("UserHacks_MSAA", 0);
+
+	if(GetMaxDepth(requested) != 0)
+	{
+		return;
+	}
+
+	theApp.SetConfig("UserHacks_MSAA", 0); // replace invalid msaa value in ini file with 0.
+}
+
+// Initialises the Direct3D 9 device for the given window.
+//
+// Steps, in order: base-class Create, Direct3D 9 object creation, adapter and
+// depth/MSAA format selection, shader-model validation, device creation via
+// Reset(1, 1), then compilation of the built-in shaders (convert, merge,
+// interlace, shade boost) and setup of the fixed sampler/blend/depth states.
+//
+// Returns false when the base class fails, Direct3D 9 is unavailable, no depth
+// format is usable, the shader model is too low, or Reset() fails.
+bool GSDevice9::Create(GSWnd* wnd)
+{
+	if(!__super::Create(wnd))
+	{
+		return false;
+	}
+
+	// d3d
+
+	m_d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
+
+	if(!m_d3d) return false;
+
+	UINT adapter;
+	D3DDEVTYPE devtype;
+
+	// No adapter id is passed, so the configured "Adapter" value is used.
+	FindAdapter(m_d3d, adapter, devtype);
+
+	D3DADAPTER_IDENTIFIER9 id;
+
+	// Log the adapter description and the four 16-bit parts of its driver version.
+	if(S_OK == m_d3d->GetAdapterIdentifier(adapter, 0, &id))
+	{
+		printf("%s (%d.%d.%d.%d)\n",
+			id.Description,
+			id.DriverVersion.HighPart >> 16,
+			id.DriverVersion.HighPart & 0xffff,
+			id.DriverVersion.LowPart >> 16,
+			id.DriverVersion.LowPart & 0xffff);
+	}
+
+	ForceValidMsaaConfig();
+
+	// Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
+	// the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
+
+	m_depth_format = BestD3dFormat(m_d3d, adapter, devtype, m_msaa, &m_msaa_desc);
+
+	if(D3DFMT_UNKNOWN == m_depth_format)
+	{
+		// can't find a format with requested msaa, try without.
+
+		m_depth_format = BestD3dFormat(m_d3d, adapter, devtype, 0);
+
+		if(D3DFMT_UNKNOWN == m_depth_format)
+		{
+			return false;
+		}
+
+		m_msaa = 0;
+	}
+
+	memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
+
+	// NOTE(review): caps are queried for D3DADAPTER_DEFAULT / D3DDEVTYPE_HAL rather
+	// than the adapter/devtype found above. Reset() also creates the device on
+	// D3DADAPTER_DEFAULT / D3DDEVTYPE_HAL, so the two agree with each other, but the
+	// configured adapter is only used for the format checks - confirm this is intended.
+	m_d3d->GetDeviceCaps(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, &m_d3dcaps);
+
+	//
+
+	// Pixel shader versions are encoded as 0xFFFFmmnn and vertex shader versions as
+	// 0xFFFEmmnn; clearing bit 16 of the PS version makes the two comparable.
+	if(m_d3dcaps.VertexShaderVersion < (m_d3dcaps.PixelShaderVersion & ~0x10000))
+	{
+		if(m_d3dcaps.VertexShaderVersion > D3DVS_VERSION(0, 0))
+		{
+			ASSERT(0);
+
+			return false;
+		}
+
+		// else vertex shader should be emulated in software (gma950)
+	}
+
+	// Treat the vertex shader model as equal to the pixel shader model from here on.
+	m_d3dcaps.VertexShaderVersion = m_d3dcaps.PixelShaderVersion & ~0x10000;
+
+	if(m_d3dcaps.PixelShaderVersion >= D3DPS_VERSION(3, 0))
+	{
+		SetFeatureLevel(D3D_FEATURE_LEVEL_9_3, false);
+	}
+	else if(m_d3dcaps.PixelShaderVersion >= D3DPS_VERSION(2, 0))
+	{
+		SetFeatureLevel(D3D_FEATURE_LEVEL_9_2, false);
+	}
+	else
+	{
+		string s = format(
+			"Supported pixel shader version is too low!\n\nSupported: %d.%d\nNeeded: 2.0 or higher",
+			D3DSHADER_VERSION_MAJOR(m_d3dcaps.PixelShaderVersion), D3DSHADER_VERSION_MINOR(m_d3dcaps.PixelShaderVersion));
+
+		MessageBox(NULL, s.c_str(), "GSdx", MB_OK);
+
+		return false;
+	}
+
+	// Creates the device (m_dev is still NULL at this point) with a 1x1 back buffer.
+	if(!Reset(1, 1))
+	{
+		return false;
+	}
+
+	m_dev->Clear(0, NULL, D3DCLEAR_TARGET, 0, 1.0f, 0);
+
+	// convert
+
+	// Vertex layout shared by the convert shaders: float4 position at offset 0,
+	// float2 texture coordinate at offset 16.
+	static const D3DVERTEXELEMENT9 il_convert[] =
+	{
+		{0, 0,  D3DDECLTYPE_FLOAT4, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0},
+		{0, 16, D3DDECLTYPE_FLOAT2, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_TEXCOORD, 0},
+		D3DDECL_END()
+	};
+
+	// `shader` is reused as the source buffer for every embedded .fx resource below.
+	vector<unsigned char> shader;
+	theApp.LoadResource(IDR_CONVERT_FX, shader);
+	CompileShader((const char *)shader.data(), shader.size(), "convert.fx", "vs_main", nullptr, &m_convert.vs, il_convert, countof(il_convert), &m_convert.il);
+
+	// NOTE(review): `i` is a size_t formatted with %d here and in the loops below;
+	// presumably format() is printf-like - confirm the specifier is safe on 64-bit builds.
+	for(size_t i = 0; i < countof(m_convert.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "convert.fx", format("ps_main%d", i), nullptr, &m_convert.ps[i]);
+	}
+
+	m_convert.dss.DepthEnable = false;
+	m_convert.dss.StencilEnable = false;
+
+	m_convert.bs.BlendEnable = false;
+	m_convert.bs.RenderTargetWriteMask = D3DCOLORWRITEENABLE_RGBA;
+	// Both samplers switch to anisotropic filtering when "MaxAnisotropy" is non-zero
+	// and "paltex" is off; otherwise they stay linear / point respectively.
+	D3DTEXTUREFILTERTYPE LinearToAnisotropic = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3DTEXF_ANISOTROPIC : D3DTEXF_LINEAR;
+	D3DTEXTUREFILTERTYPE PointToAnisotropic = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3DTEXF_ANISOTROPIC : D3DTEXF_POINT;
+
+	m_convert.ln.FilterMin[0] = LinearToAnisotropic;
+	m_convert.ln.FilterMag[0] = LinearToAnisotropic;
+	m_convert.ln.FilterMin[1] = LinearToAnisotropic;
+	m_convert.ln.FilterMag[1] = LinearToAnisotropic;
+	m_convert.ln.AddressU = D3DTADDRESS_CLAMP;
+	m_convert.ln.AddressV = D3DTADDRESS_CLAMP;
+	m_convert.ln.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+
+	m_convert.pt.FilterMin[0] = PointToAnisotropic;
+	m_convert.pt.FilterMag[0] = PointToAnisotropic;
+	m_convert.pt.FilterMin[1] = PointToAnisotropic;
+	m_convert.pt.FilterMag[1] = PointToAnisotropic;
+	m_convert.pt.AddressU = D3DTADDRESS_CLAMP;
+	m_convert.pt.AddressV = D3DTADDRESS_CLAMP;
+	m_convert.pt.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+
+	// merge
+
+	theApp.LoadResource(IDR_MERGE_FX, shader);
+	for(size_t i = 0; i < countof(m_merge.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "merge.fx", format("ps_main%d", i), nullptr, &m_merge.ps[i]);
+	}
+
+	// Standard source-alpha blending for colour; alpha channel takes the source value.
+	m_merge.bs.BlendEnable = true;
+	m_merge.bs.BlendOp = D3DBLENDOP_ADD;
+	m_merge.bs.SrcBlend = D3DBLEND_SRCALPHA;
+	m_merge.bs.DestBlend = D3DBLEND_INVSRCALPHA;
+	m_merge.bs.BlendOpAlpha = D3DBLENDOP_ADD;
+	m_merge.bs.SrcBlendAlpha = D3DBLEND_ONE;
+	m_merge.bs.DestBlendAlpha = D3DBLEND_ZERO;
+	m_merge.bs.RenderTargetWriteMask = D3DCOLORWRITEENABLE_RGBA;
+
+	// interlace
+
+	theApp.LoadResource(IDR_INTERLACE_FX, shader);
+	for(size_t i = 0; i < countof(m_interlace.ps); i++)
+	{
+		CompileShader((const char *)shader.data(), shader.size(), "interlace.fx", format("ps_main%d", i), nullptr, &m_interlace.ps[i]);
+	}
+
+	// Shade Boost
+
+	int ShadeBoost_Contrast = theApp.GetConfig("ShadeBoost_Contrast", 50);
+	int ShadeBoost_Brightness = theApp.GetConfig("ShadeBoost_Brightness", 50);
+	int ShadeBoost_Saturation = theApp.GetConfig("ShadeBoost_Saturation", 50);
+		
+	// The macro table below stores c_str() pointers, so the strings must outlive
+	// the CompileShader call.
+	string str[3];		
+		
+	str[0] = format("%d", ShadeBoost_Saturation);
+	str[1] = format("%d", ShadeBoost_Brightness);
+	str[2] = format("%d", ShadeBoost_Contrast);
+
+	// The shade boost settings are baked into the shader as preprocessor macros.
+	D3D_SHADER_MACRO macro[] =
+	{			
+		{"SB_SATURATION", str[0].c_str()},
+		{"SB_BRIGHTNESS", str[1].c_str()},
+		{"SB_CONTRAST", str[2].c_str()},
+		{NULL, NULL},
+	};
+
+	theApp.LoadResource(IDR_SHADEBOOST_FX, shader);
+	CompileShader((const char *)shader.data(), shader.size(), "shadeboost.fx", "ps_main", macro, &m_shadeboost.ps);
+
+	// create shader layout
+
+	// NOTE(review): sel and cb are passed to SetupVS without explicit initialisation
+	// here; presumably their constructors provide defaults - confirm.
+	VSSelector sel;
+	VSConstantBuffer cb;
+
+	SetupVS(sel, &cb);
+
+	//
+
+	// Depth-stencil state used by SetupDATE: always pass and replace stencil bit 0 with 1.
+	memset(&m_date.dss, 0, sizeof(m_date.dss));
+
+	m_date.dss.StencilEnable = true;
+	m_date.dss.StencilReadMask = 1;
+	m_date.dss.StencilWriteMask = 1;
+	m_date.dss.StencilFunc = D3DCMP_ALWAYS;
+	m_date.dss.StencilPassOp = D3DSTENCILOP_REPLACE;
+	m_date.dss.StencilRef = 1;
+
+	// All-zero blend state for the DATE pass.
+	memset(&m_date.bs, 0, sizeof(m_date.bs));
+
+	//
+
+	return true;
+}
+
+// (Re)creates the swap chain and, when necessary, the device itself.
+//
+//   w, h - requested back buffer size for the windowed fast path.
+//
+// Fast path: when the device is not lost and a windowed swap chain already
+// exists, only the additional swap chain is rebuilt at w x h.
+// Slow path: all cached state and dynamic buffers are dropped, presentation
+// parameters are rebuilt from the configuration, and the device is created
+// (first call) or reset (subsequent calls).
+//
+// Returns false when the base class, device creation/reset, swap chain
+// creation or back buffer retrieval fails.
+bool GSDevice9::Reset(int w, int h)
+{
+	if(!__super::Reset(w, h))
+		return false;
+
+	HRESULT hr;
+
+	int mode = (!m_wnd->IsManaged() || theApp.GetConfig("windowed", 1)) ? Windowed : Fullscreen;
+
+	if(mode == DontCare)
+	{
+		mode = m_pp.Windowed ? Windowed : Fullscreen;
+	}
+
+	if(!m_lost)
+	{
+		if(m_swapchain && mode != Fullscreen && m_pp.Windowed)
+		{
+			// Windowed resize: only the additional swap chain needs rebuilding.
+			m_swapchain = NULL;
+
+			m_pp.BackBufferWidth = w;
+			m_pp.BackBufferHeight = h;
+			m_pp.PresentationInterval = m_vsync ? D3DPRESENT_INTERVAL_ONE : D3DPRESENT_INTERVAL_IMMEDIATE;
+
+			hr = m_dev->CreateAdditionalSwapChain(&m_pp, &m_swapchain);
+
+			if(FAILED(hr)) return false;
+
+			CComPtr<IDirect3DSurface9> backbuffer;
+			hr = m_swapchain->GetBackBuffer(0, D3DBACKBUFFER_TYPE_MONO, &backbuffer);
+
+			// Do not wrap a NULL surface when the back buffer could not be obtained.
+			if(FAILED(hr)) return false;
+
+			m_backbuffer = new GSTexture9(backbuffer);
+
+			return true;
+		}
+	}
+
+	// Full reset: release everything that lives in D3DPOOL_DEFAULT or caches device state.
+
+	m_swapchain = NULL;
+
+	m_vb = NULL;
+	m_vb_old = NULL;
+
+	m_vertex.start = 0;
+	m_vertex.count = 0;
+	m_index.start = 0;
+	m_index.count = 0;
+
+	// Free the shadow constant buffers before the memset below wipes their pointers.
+	if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
+	if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
+
+	memset(&m_state, 0, sizeof(m_state));
+
+	m_state.bf = 0xffffffff;
+
+	memset(&m_pp, 0, sizeof(m_pp));
+
+	m_pp.Windowed = TRUE;
+	m_pp.hDeviceWindow = (HWND)m_wnd->GetHandle();
+	m_pp.SwapEffect = D3DSWAPEFFECT_FLIP;
+	m_pp.BackBufferFormat = D3DFMT_X8R8G8B8;
+	m_pp.BackBufferWidth = 1;
+	m_pp.BackBufferHeight = 1;
+	m_pp.PresentationInterval = m_vsync ? D3DPRESENT_INTERVAL_ONE : D3DPRESENT_INTERVAL_IMMEDIATE;
+
+	// m_pp.Flags |= D3DPRESENTFLAG_VIDEO; // enables tv-out (but I don't think anyone would still use a regular tv...)
+
+	int mw = theApp.GetConfig("ModeWidth", 0);
+	int mh = theApp.GetConfig("ModeHeight", 0);
+	int mrr = theApp.GetConfig("ModeRefreshRate", 0);
+
+	// Exclusive fullscreen is only used for managed windows with a configured mode.
+	if(m_wnd->IsManaged() && mode == Fullscreen && mw > 0 && mh > 0 && mrr >= 0)
+	{
+		m_pp.Windowed = FALSE;
+		m_pp.BackBufferWidth = mw;
+		m_pp.BackBufferHeight = mh;
+		// m_pp.FullScreen_RefreshRateInHz = mrr;
+
+		m_wnd->HideFrame();
+	}
+
+	if(!m_dev)
+	{
+		// First call: create the device. Hardware vertex processing (as a pure
+		// device) is used when the caps report any vertex processing capability.
+		uint32 flags = m_d3dcaps.VertexProcessingCaps ? D3DCREATE_HARDWARE_VERTEXPROCESSING : D3DCREATE_SOFTWARE_VERTEXPROCESSING;
+
+		if(flags & D3DCREATE_HARDWARE_VERTEXPROCESSING)
+		{
+			flags |= D3DCREATE_PUREDEVICE;
+		}
+
+		hr = m_d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, (HWND)m_wnd->GetHandle(), flags, &m_pp, &m_dev);
+
+		if(FAILED(hr)) return false;
+	}
+	else
+	{
+		hr = m_dev->Reset(&m_pp);
+
+		if(FAILED(hr))
+		{
+			// A lost device gets one delayed retry before giving up.
+			if(D3DERR_DEVICELOST == hr)
+			{
+				Sleep(1000);
+
+				hr = m_dev->Reset(&m_pp);
+			}
+
+			if(FAILED(hr)) return false;
+		}
+	}
+
+	if(m_pp.Windowed)
+	{
+		// Windowed mode presents through an additional 1x1 swap chain.
+		m_pp.BackBufferWidth = 1;
+		m_pp.BackBufferHeight = 1;
+
+		hr = m_dev->CreateAdditionalSwapChain(&m_pp, &m_swapchain);
+
+		if(FAILED(hr)) return false;
+	}
+
+	CComPtr<IDirect3DSurface9> backbuffer;
+
+	if(m_swapchain)
+	{
+		hr = m_swapchain->GetBackBuffer(0, D3DBACKBUFFER_TYPE_MONO, &backbuffer);
+	}
+	else
+	{
+		hr = m_dev->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &backbuffer);
+	}
+
+	// Previously this result was stored but never inspected; a failure would
+	// have wrapped a NULL surface in the back buffer texture.
+	if(FAILED(hr)) return false;
+
+	m_backbuffer = new GSTexture9(backbuffer);
+
+	// Fixed-function render states the renderer relies on.
+	m_dev->SetRenderState(D3DRS_CULLMODE, D3DCULL_NONE);
+	m_dev->SetRenderState(D3DRS_LIGHTING, FALSE);
+	m_dev->SetRenderState(D3DRS_ALPHATESTENABLE, FALSE);
+	m_dev->SetRenderState(D3DRS_SCISSORTESTENABLE, TRUE);
+
+	return true;
+}
+
+// Returns whether the device is currently considered lost.
+// The cooperative level is re-queried when `update` is set or when the device
+// was not flagged as lost yet; otherwise the cached flag is returned.
+bool GSDevice9::IsLost(bool update)
+{
+	const bool refresh = !m_lost || update;
+
+	if(refresh)
+	{
+		const HRESULT status = m_dev->TestCooperativeLevel();
+
+		m_lost = (status == D3DERR_DEVICELOST) || (status == D3DERR_DEVICENOTRESET);
+	}
+
+	return m_lost;
+}
+
+// Ends the current scene, presents it (through the additional swap chain when
+// one exists, otherwise through the device) and immediately opens the next
+// scene. A failed present marks the device as lost.
+void GSDevice9::Flip()
+{
+	m_dev->EndScene();
+
+	HRESULT presented;
+
+	if(!m_swapchain)
+	{
+		presented = m_dev->Present(NULL, NULL, NULL, NULL);
+	}
+	else
+	{
+		presented = m_swapchain->Present(NULL, NULL, NULL, NULL, 0);
+	}
+
+	m_dev->BeginScene();
+
+	if(FAILED(presented))
+	{
+		m_lost = true;
+	}
+}
+
+// Changes the vsync setting. Nothing happens when the value is unchanged.
+void GSDevice9::SetVSync(bool enable)
+{
+	if(m_vsync != enable)
+	{
+		__super::SetVSync(enable);
+
+		// Clever trick (per the original author): drop the backbuffer so that
+		// the next Present fails and triggers a Reset call, which re-creates
+		// the backbuffer using the current vsync setting.
+
+		delete m_backbuffer;
+
+		m_backbuffer = NULL;
+	}
+}
+
+// Intentionally empty: the device scene is opened by Flip(), which calls
+// m_dev->BeginScene() right after presenting, so per-draw callers need no
+// explicit BeginScene on the device.
+void GSDevice9::BeginScene()
+{
+	// m_dev->BeginScene();
+}
+
+// Issues a non-indexed draw for the vertices currently mapped in the vertex
+// ring buffer, converting the vertex count into a primitive count for the
+// active topology.
+void GSDevice9::DrawPrimitive()
+{
+	int primitiveCount = 0;
+
+	switch(m_state.topology)
+	{
+	case D3DPT_POINTLIST:
+		// one primitive per vertex
+		primitiveCount = m_vertex.count;
+		break;
+
+	case D3DPT_LINELIST:
+		// two vertices per line
+		primitiveCount = m_vertex.count / 2;
+		break;
+
+	case D3DPT_LINESTRIP:
+		// every vertex after the first adds a line
+		primitiveCount = m_vertex.count - 1;
+		break;
+
+	case D3DPT_TRIANGLELIST:
+		// three vertices per triangle
+		primitiveCount = m_vertex.count / 3;
+		break;
+
+	case D3DPT_TRIANGLESTRIP:
+	case D3DPT_TRIANGLEFAN:
+		// every vertex after the first two adds a triangle
+		primitiveCount = m_vertex.count - 2;
+		break;
+
+	default:
+		__assume(0);
+	}
+
+	m_dev->DrawPrimitive(m_state.topology, m_vertex.start, primitiveCount);
+}
+
+// Issues an indexed draw for the indices currently uploaded to the index
+// buffer, converting the index count into a primitive count for the active
+// topology. The vertex ring-buffer offset is passed as the base vertex index.
+void GSDevice9::DrawIndexedPrimitive()
+{
+	int prims = 0;
+
+	switch(m_state.topology)
+	{
+	case D3DPT_POINTLIST:
+		prims = m_index.count;
+		break;
+	case D3DPT_LINELIST:
+		prims = m_index.count / 2;
+		break;
+	case D3DPT_LINESTRIP:
+		// A strip of N indices describes N - 1 lines (previously computed as N / 2,
+		// which disagreed with DrawPrimitive and dropped half of the strip).
+		prims = m_index.count - 1;
+		break;
+	case D3DPT_TRIANGLELIST:
+		prims = m_index.count / 3;
+		break;
+	case D3DPT_TRIANGLESTRIP:
+	case D3DPT_TRIANGLEFAN:
+		// A strip/fan of N indices describes N - 2 triangles (previously N / 3).
+		prims = m_index.count - 2;
+		break;
+	default:
+		__assume(0);
+	}
+
+	// NOTE(review): the NumVertices argument is given the index count; the D3D9
+	// documentation describes it as the number of vertices used by the call -
+	// confirm whether m_vertex.count is the intended value.
+	m_dev->DrawIndexedPrimitive(m_state.topology, m_vertex.start, 0, m_index.count, m_index.start, prims);
+}
+
+// Only forwards to the base class; the device-level EndScene is issued by
+// Flip() rather than per draw, which is why the direct call stays disabled.
+void GSDevice9::EndScene()
+{
+	// m_dev->EndScene();
+
+	__super::EndScene();
+}
+
+// Clears a render target to a floating-point colour. The colour is scaled by
+// 255 with 0.5 added, its x and z components are swapped (zyxw) and the result
+// is packed to 32 bits before being handed to the packed-colour overload.
+// A NULL texture is ignored.
+void GSDevice9::ClearRenderTarget(GSTexture* t, const GSVector4& c)
+{
+	if (t == NULL)
+	{
+		return;
+	}
+
+	ClearRenderTarget(t, (c * 255 + 0.5f).zyxw().rgba32());
+}
+
+// Clears a render target to a packed 32-bit colour. The texture is bound as
+// render target 0 only for the duration of the clear; the previously bound
+// target is restored afterwards. A NULL texture is ignored.
+void GSDevice9::ClearRenderTarget(GSTexture* rt, uint32 c)
+{
+	if (rt == NULL)
+	{
+		return;
+	}
+
+	// Remember what was bound so the clear has no lasting effect on the binding.
+	CComPtr<IDirect3DSurface9> previousTarget;
+
+	m_dev->GetRenderTarget(0, &previousTarget);
+
+	m_dev->SetRenderTarget(0, *(GSTexture9*)rt);
+	m_dev->Clear(0, NULL, D3DCLEAR_TARGET, c, 0, 0);
+
+	m_dev->SetRenderTarget(0, previousTarget);
+}
+
+// Clears the depth values of a depth-stencil texture to `c`. The texture is
+// bound only for the clear and the previous depth-stencil surface is restored.
+// A NULL texture is ignored.
+void GSDevice9::ClearDepth(GSTexture* t, float c)
+{
+	if (t == NULL)
+	{
+		return;
+	}
+
+	CComPtr<IDirect3DSurface9> previousDepth;
+
+	m_dev->GetDepthStencilSurface(&previousDepth);
+
+	m_dev->SetDepthStencilSurface(*(GSTexture9*)t);
+	m_dev->Clear(0, NULL, D3DCLEAR_ZBUFFER, 0, c, 0);
+
+	m_dev->SetDepthStencilSurface(previousDepth);
+}
+
+// Clears the stencil values of a depth-stencil texture to `c`. The texture is
+// bound only for the clear and the previous depth-stencil surface is restored.
+// A NULL texture is ignored.
+void GSDevice9::ClearStencil(GSTexture* t, uint8 c)
+{
+	if (t == NULL)
+	{
+		return;
+	}
+
+	CComPtr<IDirect3DSurface9> previousDepth;
+
+	m_dev->GetDepthStencilSurface(&previousDepth);
+
+	m_dev->SetDepthStencilSurface(*(GSTexture9*)t);
+	m_dev->Clear(0, NULL, D3DCLEAR_STENCIL, 0, 0, c);
+
+	m_dev->SetDepthStencilSurface(previousDepth);
+}
+
+// Creates the Direct3D resource backing a GSTexture of the given kind.
+//
+//   type   - one of the GSTexture::RenderTarget / DepthStencil / Texture / Offscreen kinds.
+//   w, h   - dimensions in pixels.
+//   msaa   - for render targets and depth-stencils, use the sample count/quality in m_msaa_desc.
+//   format - D3DFORMAT value passed as int.
+//
+// Returns a new GSTexture9, or NULL when the resource could not be created
+// (or `type` is not one of the handled kinds). New render targets and
+// depth-stencil surfaces are cleared to 0 before being returned.
+GSTexture* GSDevice9::CreateSurface(int type, int w, int h, bool msaa, int format)
+{
+	HRESULT hr;
+
+	CComPtr<IDirect3DTexture9> texture;
+	CComPtr<IDirect3DSurface9> surface;
+
+	const D3DFORMAT d3dFormat = (D3DFORMAT)format;
+
+	switch(type)
+	{
+	case GSTexture::RenderTarget:
+		if(!msaa)
+		{
+			// Non-MSAA targets are textures so they can be sampled later.
+			hr = m_dev->CreateTexture(w, h, 1, D3DUSAGE_RENDERTARGET, d3dFormat, D3DPOOL_DEFAULT, &texture, NULL);
+		}
+		else
+		{
+			hr = m_dev->CreateRenderTarget(w, h, d3dFormat, (D3DMULTISAMPLE_TYPE)m_msaa_desc.Count, m_msaa_desc.Quality, FALSE, &surface, NULL);
+		}
+		break;
+
+	case GSTexture::DepthStencil:
+		if(!msaa)
+		{
+			hr = m_dev->CreateDepthStencilSurface(w, h, d3dFormat, D3DMULTISAMPLE_NONE, 0, FALSE, &surface, NULL);
+		}
+		else
+		{
+			hr = m_dev->CreateDepthStencilSurface(w, h, d3dFormat, (D3DMULTISAMPLE_TYPE)m_msaa_desc.Count, m_msaa_desc.Quality, FALSE, &surface, NULL);
+		}
+		break;
+
+	case GSTexture::Texture:
+		hr = m_dev->CreateTexture(w, h, 1, 0, d3dFormat, D3DPOOL_MANAGED, &texture, NULL);
+		break;
+
+	case GSTexture::Offscreen:
+		hr = m_dev->CreateOffscreenPlainSurface(w, h, d3dFormat, D3DPOOL_SYSTEMMEM, &surface, NULL);
+		break;
+	}
+
+	// Each branch above fills at most one of the two smart pointers.
+	GSTexture9* created = NULL;
+
+	if(texture)
+	{
+		created = new GSTexture9(texture);
+	}
+	else if(surface)
+	{
+		created = new GSTexture9(surface);
+	}
+
+	if(created == NULL)
+	{
+		return NULL;
+	}
+
+	// Give freshly created GPU targets defined contents.
+	if(type == GSTexture::RenderTarget)
+	{
+		ClearRenderTarget(created, 0);
+	}
+	else if(type == GSTexture::DepthStencil)
+	{
+		ClearDepth(created, 0);
+	}
+
+	return created;
+}
+
+// Forwards to the base-class CreateRenderTarget; a format of 0 selects D3DFMT_A8R8G8B8.
+GSTexture* GSDevice9::CreateRenderTarget(int w, int h, bool msaa, int format)
+{
+	if(format == 0)
+	{
+		format = D3DFMT_A8R8G8B8;
+	}
+
+	return __super::CreateRenderTarget(w, h, msaa, format);
+}
+
+// Forwards to the base-class CreateDepthStencil; a format of 0 selects the
+// depth format chosen in Create() (m_depth_format).
+GSTexture* GSDevice9::CreateDepthStencil(int w, int h, bool msaa, int format)
+{
+	if(format == 0)
+	{
+		format = m_depth_format;
+	}
+
+	return __super::CreateDepthStencil(w, h, msaa, format);
+}
+
+// Forwards to the base-class CreateTexture; a format of 0 selects D3DFMT_A8R8G8B8.
+GSTexture* GSDevice9::CreateTexture(int w, int h, int format)
+{
+	if(format == 0)
+	{
+		format = D3DFMT_A8R8G8B8;
+	}
+
+	return __super::CreateTexture(w, h, format);
+}
+
+// Forwards to the base-class CreateOffscreen; a format of 0 selects D3DFMT_A8R8G8B8.
+GSTexture* GSDevice9::CreateOffscreen(int w, int h, int format)
+{
+	if(format == 0)
+	{
+		format = D3DFMT_A8R8G8B8;
+	}
+
+	return __super::CreateOffscreen(w, h, format);
+}
+
+// Copies a multisampled texture into a new non-MSAA render target of the same
+// size, format and scale. Returns the new texture, or NULL when it could not
+// be created. The caller is expected to pass a non-NULL MSAA texture (asserted).
+GSTexture* GSDevice9::Resolve(GSTexture* t)
+{
+	ASSERT(t != NULL && t->IsMSAA());
+
+	GSTexture* resolved = CreateRenderTarget(t->GetWidth(), t->GetHeight(), false, t->GetFormat());
+
+	if(resolved == NULL)
+	{
+		return NULL;
+	}
+
+	resolved->SetScale(t->GetScale());
+
+	// Full-surface copy (NULL rectangles) with point filtering.
+	m_dev->StretchRect(*(GSTexture9*)t, NULL, *(GSTexture9*)resolved, NULL, D3DTEXF_POINT);
+
+	return resolved;
+}
+
+// Renders a region of `src` into a temporary render target and reads it back
+// into a system-memory offscreen surface.
+//
+//   src       - source texture; resolved first when it is multisampled.
+//   sRect     - source rectangle handed to StretchRect.
+//   w, h      - size of the resulting surface.
+//   format    - 0 or D3DFMT_A8R8G8B8; any other value asserts and yields NULL.
+//   ps_shader - not used by this implementation; m_convert.ps[1] is always used.
+//
+// Returns the offscreen texture, or NULL on failure.
+GSTexture* GSDevice9::CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format, int ps_shader)
+{
+	GSTexture* dst = NULL;
+
+	if(format == 0)
+	{
+		format = D3DFMT_A8R8G8B8;
+	}
+
+	if(format != D3DFMT_A8R8G8B8)
+	{
+		ASSERT(0);
+
+		// The function returns a pointer; the original `return false` only worked
+		// through the bool-literal-to-null-pointer conversion.
+		return NULL;
+	}
+
+	if(GSTexture* rt = CreateRenderTarget(w, h, false, format))
+	{
+		GSVector4 dRect(0, 0, w, h);
+
+		if(GSTexture* src2 = src->IsMSAA() ? Resolve(src) : src)
+		{
+			StretchRect(src2, sRect, rt, dRect, m_convert.ps[1], NULL, 0);
+
+			// Only the temporary created by Resolve() is ours to recycle.
+			if(src2 != src) Recycle(src2);
+		}
+
+		dst = CreateOffscreen(w, h, format);
+
+		if(dst)
+		{
+			// Read the render target contents back into the system-memory surface.
+			m_dev->GetRenderTargetData(*(GSTexture9*)rt, *(GSTexture9*)dst);
+		}
+
+		Recycle(rt);
+	}
+
+	return dst;
+}
+
+// Copies rectangle `r` from sTex to the same rectangle of dTex without
+// filtering. Asserts and does nothing when either texture is NULL.
+void GSDevice9::CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r)
+{
+	const bool haveBoth = (sTex != NULL) && (dTex != NULL);
+
+	if(!haveBoth)
+	{
+		ASSERT(0);
+
+		return;
+	}
+
+	m_dev->StretchRect(*(GSTexture9*)sTex, r, *(GSTexture9*)dTex, r, D3DTEXF_NONE);
+}
+
+// Convenience overload: selects the convert pixel shader by index
+// (m_convert.ps[shader]) and forwards with no pixel shader constants.
+void GSDevice9::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader, bool linear)
+{
+	StretchRect(sTex, sRect, dTex, dRect, m_convert.ps[shader], NULL, 0, linear);
+}
+
+// Convenience overload: forwards to the full StretchRect using the convert
+// blend state (m_convert.bs), which Create() configures with blending disabled.
+void GSDevice9::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, bool linear)
+{
+	StretchRect(sTex, sRect, dTex, dRect, ps, ps_cb, ps_cb_len, &m_convert.bs, linear);
+}
+
+// Draws sTex into dTex as a textured quad.
+//
+//   sRect     - source texture coordinates (x, y = top-left; z, w = bottom-right).
+//   dRect     - destination rectangle in dTex pixels (same component layout).
+//   ps        - pixel shader to run; ps_cb / ps_cb_len are its float4 constants.
+//   bs        - blend state for the draw.
+//   linear    - selects the linear (m_convert.ln) or point (m_convert.pt) sampler.
+//
+// Asserts and does nothing when either texture is NULL.
+void GSDevice9::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, Direct3DBlendState9* bs, bool linear)
+{
+	if(sTex == NULL || dTex == NULL)
+	{
+		ASSERT(0);
+
+		return;
+	}
+
+	BeginScene();
+
+	const GSVector2i targetSize = dTex->GetSize();
+
+	// output merger: no depth/stencil, caller-selected blending, dTex as the only target
+
+	OMSetDepthStencilState(&m_convert.dss);
+	OMSetBlendState(bs, 0);
+	OMSetRenderTargets(dTex, NULL);
+
+	// input assembler: map the destination rectangle from pixels to clip space
+	// (x in [-1, 1] left to right, y in [1, -1] top to bottom)
+
+	float left = dRect.x * 2 / targetSize.x - 1.0f;
+	float top = 1.0f - dRect.y * 2 / targetSize.y;
+	float right = dRect.z * 2 / targetSize.x - 1.0f;
+	float bottom = 1.0f - dRect.w * 2 / targetSize.y;
+
+	GSVertexPT1 quad[] =
+	{
+		{GSVector4(left, top, 0.5f, 1.0f), GSVector2(sRect.x, sRect.y)},
+		{GSVector4(right, top, 0.5f, 1.0f), GSVector2(sRect.z, sRect.y)},
+		{GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sRect.x, sRect.w)},
+		{GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sRect.z, sRect.w)},
+	};
+
+	// Shift every corner by one clip-space unit of 1/width, 1/height (half a
+	// destination pixel): left on x, up on y.
+	for(size_t corner = 0; corner < countof(quad); corner++)
+	{
+		quad[corner].p.x -= 1.0f / targetSize.x;
+		quad[corner].p.y += 1.0f / targetSize.y;
+	}
+
+	IASetVertexBuffer(quad, sizeof(quad[0]), countof(quad));
+	IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
+	IASetInputLayout(m_convert.il);
+
+	// vertex stage
+
+	VSSetShader(m_convert.vs, NULL, 0);
+
+	// pixel stage
+
+	if(linear)
+	{
+		PSSetSamplerState(&m_convert.ln);
+	}
+	else
+	{
+		PSSetSamplerState(&m_convert.pt);
+	}
+
+	PSSetShaderResources(sTex, NULL);
+	PSSetShader(ps, ps_cb, ps_cb_len);
+
+	// draw the quad
+
+	DrawPrimitive();
+
+	EndScene();
+}
+
+// Composes the two source layers into dTex.
+//
+// dTex is first cleared to colour `c`. Layer 1, if present and `slbg` is not
+// set, is drawn with merge shader 0. Layer 0, if present, is then drawn on top
+// with merge shader 1 (when `mmod` is set) or 0, using the merge blend state
+// and `c` as the shader's background colour constant.
+void GSDevice9::DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c)
+{
+	ClearRenderTarget(dTex, c);
+
+	const bool drawLayer1 = (sTex[1] != NULL) && !slbg;
+
+	if(drawLayer1)
+	{
+		// NOTE(review): with this argument count the trailing `true` binds to the
+		// ps_cb_len parameter, not to `linear`; it has no effect because ps_cb is
+		// NULL, and `linear` takes whatever default the header declares - confirm.
+		StretchRect(sTex[1], sRect[1], dTex, dRect[1], m_merge.ps[0], NULL, true);
+	}
+
+	if(sTex[0] != NULL)
+	{
+		MergeConstantBuffer constants;
+
+		constants.BGColor = c;
+
+		StretchRect(sTex[0], sRect[0], dTex, dRect[0], m_merge.ps[mmod ? 1 : 0], (const float*)&constants, 1, &m_merge.bs, true);
+	}
+}
+
+// Runs one of the interlace pixel shaders over sTex into dTex.
+//
+//   shader  - index into m_interlace.ps.
+//   linear  - sampler selection forwarded to StretchRect.
+//   yoffset - vertical shift applied to the destination rectangle.
+void GSDevice9::DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset)
+{
+	const GSVector4 size = GSVector4(dTex->GetSize());
+
+	// Shader constants: one texel step in y, and half the target height.
+	InterlaceConstantBuffer constants;
+
+	constants.ZrH = GSVector2(0, 1.0f / size.y);
+	constants.hH = (float)size.y / 2;
+
+	// Whole source texture onto the whole target, shifted down by yoffset.
+	GSVector4 source(0, 0, 1, 1);
+	GSVector4 target(0.0f, yoffset, size.x, size.y + yoffset);
+
+	StretchRect(sTex, source, dTex, target, m_interlace.ps[shader], (const float*)&constants, 1, linear);
+}
+
+// Compiles the external post-processing shader on first use.
+//
+// The shader source is the optional settings file ("shaderfx_conf") followed by
+// the shader file ("shaderfx_glsl"). A missing settings file is only reported;
+// a missing shader file skips compilation. After the first attempt completes,
+// including a recoverable compile error, later calls return immediately.
+void GSDevice9::InitExternalFX()
+{
+	if (ExShader_Compiled)
+	{
+		return;
+	}
+
+	try
+	{
+		// Optional settings file, prepended to the shader source.
+		std::string settingsPath(theApp.GetConfig("shaderfx_conf", "shaders/GSdx_FX_Settings.ini"));
+		std::ifstream settingsFile(settingsPath);
+		std::stringstream source;
+
+		if (!settingsFile.good())
+		{
+			fprintf(stderr, "GSdx: External shader config '%s' not loaded.\n", settingsPath.c_str());
+		}
+		else
+		{
+			source << settingsFile.rdbuf() << "\n";
+		}
+
+		// The shader itself.
+		std::string shaderPath(theApp.GetConfig("shaderfx_glsl", "shaders/GSdx.fx"));
+		std::ifstream shaderFile(shaderPath);
+
+		if (!shaderFile.good())
+		{
+			fprintf(stderr, "GSdx: External shader '%s' not loaded and will be disabled!\n", shaderPath.c_str());
+		}
+		else
+		{
+			source << shaderFile.rdbuf();
+
+			const std::string text = source.str();
+
+			CompileShader(text.c_str(), text.length(), shaderPath.c_str(), "ps_main", nullptr, &m_shaderfx.ps);
+		}
+	}
+	catch (GSDXRecoverableError)
+	{
+		printf("GSdx: failed to compile external post-processing shader. \n");
+	}
+
+	ExShader_Compiled = true;
+}
+
+// Applies the external post-processing shader to sTex, writing the result over
+// the whole of dTex. The shader is compiled on the first call.
+void GSDevice9::DoExternalFX(GSTexture* sTex, GSTexture* dTex)
+{
+	const GSVector2i size = dTex->GetSize();
+
+	InitExternalFX();
+
+	// Shader constants: frame size in pixels and its reciprocal.
+	ExternalFXConstantBuffer constants;
+
+	constants.xyFrame = GSVector2(size.x, size.y);
+	constants.rcpFrame = GSVector4(1.0f / size.x, 1.0f / size.y, 0.0f, 0.0f);
+	constants.rcpFrameOpt = GSVector4::zero();
+
+	GSVector4 source(0, 0, 1, 1);
+	GSVector4 target(0, 0, size.x, size.y);
+
+	StretchRect(sTex, source, dTex, target, m_shaderfx.ps, (const float*)&constants, 2, true);
+}
+
+// Compiles the embedded FXAA pixel shader on first use. After the first
+// attempt completes, including a recoverable compile error, later calls
+// return immediately.
+void GSDevice9::InitFXAA()
+{
+	if (FXAA_Compiled)
+	{
+		return;
+	}
+
+	try
+	{
+		vector<unsigned char> source;
+
+		theApp.LoadResource(IDR_FXAA_FX, source);
+
+		CompileShader((const char *)source.data(), source.size(), "fxaa.fx", "ps_main", nullptr, &m_fxaa.ps);
+	}
+	catch (GSDXRecoverableError)
+	{
+		printf("GSdx: Failed to compile fxaa shader.\n");
+	}
+
+	FXAA_Compiled = true;
+}
+
+// Applies the FXAA pixel shader to sTex, writing the result over the whole of
+// dTex. The shader is compiled on the first call.
+void GSDevice9::DoFXAA(GSTexture* sTex, GSTexture* dTex)
+{
+	const GSVector2i size = dTex->GetSize();
+
+	InitFXAA();
+
+	// Shader constants: reciprocal of the frame size.
+	FXAAConstantBuffer constants;
+
+	constants.rcpFrame = GSVector4(1.0f / size.x, 1.0f / size.y, 0.0f, 0.0f);
+	constants.rcpFrameOpt = GSVector4::zero();
+
+	GSVector4 source(0, 0, 1, 1);
+	GSVector4 target(0, 0, size.x, size.y);
+
+	StretchRect(sTex, source, dTex, target, m_fxaa.ps, (const float*)&constants, 2, true);
+}
+
+// Applies the shade boost pixel shader (compiled in Create()) to sTex, writing
+// the result over the whole of dTex.
+void GSDevice9::DoShadeBoost(GSTexture* sTex, GSTexture* dTex)
+{
+	const GSVector2i size = dTex->GetSize();
+
+	// Shader constants: reciprocal of the frame size.
+	ShadeBoostConstantBuffer constants;
+
+	constants.rcpFrame = GSVector4(1.0f / size.x, 1.0f / size.y, 0.0f, 0.0f);
+	constants.rcpFrameOpt = GSVector4::zero();
+
+	GSVector4 source(0, 0, 1, 1);
+	GSVector4 target(0, 0, size.x, size.y);
+
+	StretchRect(sTex, source, dTex, target, m_shadeboost.ps, (const float*)&constants, 1, true);
+}
+
+// Prepares the stencil buffer of `ds` for the DATE pass.
+//
+// The stencil is cleared to 0, then the four supplied vertices are drawn as a
+// triangle strip with the m_date stencil state into a scratch render target,
+// sampling `rt` (resolved first when multisampled) through convert shader 2
+// (datm set) or 3. Nothing happens when the scratch target cannot be created.
+void GSDevice9::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm)
+{
+	const GSVector2i& size = rt->GetSize();
+
+	GSTexture* scratch = CreateRenderTarget(size.x, size.y, rt->IsMSAA());
+
+	if(scratch == NULL)
+	{
+		return;
+	}
+
+	// sfex3 (after the capcom logo), vf4 (first menu fading in), ffxii shadows, rumble roses shadows, persona4 shadows
+
+	BeginScene();
+
+	ClearStencil(ds, 0);
+
+	// output merger
+
+	OMSetDepthStencilState(&m_date.dss);
+	OMSetBlendState(&m_date.bs, 0);
+	OMSetRenderTargets(scratch, ds);
+
+	// input assembler
+
+	IASetVertexBuffer(vertices, sizeof(vertices[0]), 4);
+	IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
+
+	// vertex stage
+
+	VSSetShader(m_convert.vs, NULL, 0);
+	IASetInputLayout(m_convert.il);
+
+	// pixel stage: an MSAA source has to be resolved before it can be sampled
+
+	GSTexture* sampled = rt;
+
+	if(rt->IsMSAA())
+	{
+		sampled = Resolve(rt);
+	}
+
+	const int shaderIndex = datm ? 2 : 3;
+
+	PSSetShaderResources(sampled, NULL);
+	PSSetShader(m_convert.ps[shaderIndex], NULL, 0);
+	PSSetSamplerState(&m_convert.pt);
+
+	// draw
+
+	DrawPrimitive();
+
+	EndScene();
+
+	// Only the stencil side effect is wanted; hand the temporaries back.
+	Recycle(scratch);
+
+	if(sampled != rt)
+	{
+		Recycle(sampled);
+	}
+}
+
+// Uploads `count` vertices of `stride` bytes each into the dynamic vertex
+// buffer and binds it. Nothing is copied when the buffer cannot be mapped.
+void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t count)
+{
+	void* mapped = NULL;
+
+	if(!IAMapVertexBuffer(&mapped, stride, count))
+	{
+		return;
+	}
+
+	GSVector4i::storent(mapped, vertex, count * stride);
+
+	IAUnmapVertexBuffer();
+}
+
+// Locks a region of the dynamic vertex buffer for writing `count` vertices of
+// `stride` bytes each, growing or (re)creating the buffer when required.
+//
+//   vertex - [out] receives the pointer to the locked region.
+//
+// Returns false when the buffer cannot be created or locked. On success the
+// caller must write the data and then call IAUnmapVertexBuffer().
+bool GSDevice9::IAMapVertexBuffer(void** vertex, size_t stride, size_t count)
+{
+	// The previous batch must have been consumed before mapping again.
+	ASSERT(m_vertex.count == 0);
+
+	// Request does not fit in the current buffer: retire it and size a new one
+	// with 50% headroom (at least 10000 vertices).
+	if(count * stride > m_vertex.limit * m_vertex.stride)
+	{
+		// Keep the old buffer referenced in m_vb_old instead of releasing it here.
+		m_vb_old = m_vb;
+		m_vb = NULL;
+
+		m_vertex.start = 0;
+		m_vertex.count = 0;
+		m_vertex.limit = std::max<int>(count * 3 / 2, 10000);
+	}
+
+	if(m_vb == NULL)
+	{
+		HRESULT hr;
+
+		hr = m_dev->CreateVertexBuffer(m_vertex.limit * stride, D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &m_vb, NULL);
+
+		if(FAILED(hr)) return false;
+	}
+
+	// Default: append after the data already in the buffer without overwriting it.
+	uint32 flags = D3DLOCK_NOOVERWRITE;
+
+	// Wrap to the start and discard the old contents when the batch would run
+	// past the end or the vertex size changed.
+	if(m_vertex.start + count > m_vertex.limit || stride != m_vertex.stride)
+	{
+		m_vertex.start = 0;
+
+		flags = D3DLOCK_DISCARD;
+	}
+
+	if(FAILED(m_vb->Lock(m_vertex.start * stride, count * stride, vertex, flags)))
+	{
+		return false;
+	}
+
+	m_vertex.count = count;
+	m_vertex.stride = stride;
+
+	return true;
+}
+
+// Unlocks the vertex buffer locked by IAMapVertexBuffer() and binds it as
+// stream source 0 with the stride recorded during mapping.
+void GSDevice9::IAUnmapVertexBuffer()
+{
+	m_vb->Unlock();
+
+	IASetVertexBuffer(m_vb, m_vertex.stride);
+}
+
+// Binds `vb` as stream source 0 with the given stride. The device call is
+// skipped when the cached buffer and stride already match.
+void GSDevice9::IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride)
+{
+	const bool alreadyBound = (m_state.vb == vb) && (m_state.vb_stride == stride);
+
+	if(alreadyBound)
+	{
+		return;
+	}
+
+	m_state.vb = vb;
+	m_state.vb_stride = stride;
+
+	m_dev->SetStreamSource(0, vb, 0, stride);
+}
+
+// Uploads `count` 32-bit indices into the dynamic index buffer and binds it,
+// growing or (re)creating the buffer when required. Returns without binding
+// when the buffer cannot be created.
+void GSDevice9::IASetIndexBuffer(const void* index, size_t count)
+{
+	// The previous batch must have been consumed before uploading again.
+	ASSERT(m_index.count == 0);
+
+	const bool tooSmall = count > m_index.limit;
+
+	if(tooSmall)
+	{
+		// Retire the current buffer (kept referenced in m_ib_old) and size the
+		// replacement with 50% headroom, at least 11000 indices.
+		m_ib_old = m_ib;
+		m_ib = NULL;
+
+		m_index.count = 0;
+		m_index.limit = std::max<int>(count * 3 / 2, 11000);
+	}
+
+	if(m_ib == NULL)
+	{
+		const HRESULT created = m_dev->CreateIndexBuffer(m_index.limit * sizeof(uint32), D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY, D3DFMT_INDEX32, D3DPOOL_DEFAULT, &m_ib, NULL);
+
+		if(FAILED(created))
+		{
+			return;
+		}
+	}
+
+	// Append by default; wrap to the start and discard when the batch would
+	// run past the end of the buffer.
+	uint32 lockFlags = D3DLOCK_NOOVERWRITE;
+
+	if(m_index.start + count > m_index.limit)
+	{
+		m_index.start = 0;
+
+		lockFlags = D3DLOCK_DISCARD;
+	}
+
+	void* mapped = NULL;
+
+	const size_t byteOffset = m_index.start * sizeof(uint32);
+	const size_t byteCount = count * sizeof(uint32);
+
+	if(SUCCEEDED(m_ib->Lock(byteOffset, byteCount, &mapped, lockFlags)))
+	{
+		memcpy(mapped, index, byteCount);
+
+		m_ib->Unlock();
+	}
+
+	// The count is recorded and the buffer bound even when the lock failed.
+	m_index.count = count;
+
+	IASetIndexBuffer(m_ib);
+}
+
+// Binds `ib` as the active index buffer; skipped when it is already the
+// cached binding.
+void GSDevice9::IASetIndexBuffer(IDirect3DIndexBuffer9* ib)
+{
+	if(m_state.ib == ib)
+	{
+		return;
+	}
+
+	m_state.ib = ib;
+
+	m_dev->SetIndices(ib);
+}
+
+// Sets the active vertex declaration; skipped when it is already the cached one.
+void GSDevice9::IASetInputLayout(IDirect3DVertexDeclaration9* layout)
+{
+	if(m_state.layout == layout)
+	{
+		return;
+	}
+
+	m_state.layout = layout;
+
+	m_dev->SetVertexDeclaration(layout);
+}
+
+// Records the primitive type for the next draw. No device call is made here;
+// DrawPrimitive() / DrawIndexedPrimitive() read m_state.topology when drawing.
+void GSDevice9::IASetPrimitiveTopology(D3DPRIMITIVETYPE topology)
+{
+	m_state.topology = topology;
+}
+
+// Binds a vertex shader and, optionally, its constants.
+//
+//   vs        - shader to bind; the device call is skipped when already bound.
+//   vs_cb     - float4 constants to upload starting at register 0, or NULL.
+//   vs_cb_len - number of float4 registers in vs_cb.
+//
+// A shadow copy of the last uploaded constants is kept so that identical
+// data is not sent to the device again.
+void GSDevice9::VSSetShader(IDirect3DVertexShader9* vs, const float* vs_cb, int vs_cb_len)
+{
+	if(m_state.vs != vs)
+	{
+		m_state.vs = vs;
+
+		m_dev->SetVertexShader(vs);
+	}
+
+	if(vs_cb == NULL || vs_cb_len <= 0)
+	{
+		return;
+	}
+
+	// Each register is four floats.
+	int size = vs_cb_len * sizeof(float) * 4;
+
+	const bool unchanged = m_state.vs_cb_len == vs_cb_len && m_state.vs_cb != NULL && memcmp(m_state.vs_cb, vs_cb, size) == 0;
+
+	if(unchanged)
+	{
+		return;
+	}
+
+	// (Re)allocate the shadow copy when there is none or the recorded length is smaller.
+	if(m_state.vs_cb == NULL || m_state.vs_cb_len < vs_cb_len)
+	{
+		if(m_state.vs_cb != NULL)
+		{
+			_aligned_free(m_state.vs_cb);
+		}
+
+		m_state.vs_cb = (float*)_aligned_malloc(size, 32);
+	}
+
+	m_state.vs_cb_len = vs_cb_len;
+
+	memcpy(m_state.vs_cb, vs_cb, size);
+
+	m_dev->SetVertexShaderConstantF(0, vs_cb, vs_cb_len);
+}
+
+// Binds sr0 and sr1 to pixel shader texture stages 0 and 1 and clears stage 2.
+void GSDevice9::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
+{
+	PSSetShaderResource(0, sr0);
+	PSSetShaderResource(1, sr1);
+	PSSetShaderResource(2, NULL);
+}
+
+// Binds a texture (or NULL to unbind) to pixel shader texture stage `i`.
+// The device call is skipped when the stage already holds that texture.
+void GSDevice9::PSSetShaderResource(int i, GSTexture* sr)
+{
+	IDirect3DTexture9* texture = NULL;
+
+	if(sr != NULL)
+	{
+		texture = *(GSTexture9*)sr;
+	}
+
+	if(m_state.ps_srvs[i] == texture)
+	{
+		return;
+	}
+
+	m_state.ps_srvs[i] = texture;
+
+	m_dev->SetTexture(i, texture);
+}
+
+// Binds a pixel shader and, optionally, its constants.
+//
+//   ps        - shader to bind; the device call is skipped when already bound.
+//   ps_cb     - float4 constants to upload starting at register 0, or NULL.
+//   ps_cb_len - number of float4 registers in ps_cb.
+//
+// A shadow copy of the last uploaded constants is kept so that identical
+// data is not sent to the device again.
+void GSDevice9::PSSetShader(IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len)
+{
+	if(m_state.ps != ps)
+	{
+		m_state.ps = ps;
+
+		m_dev->SetPixelShader(ps);
+	}
+
+	if(ps_cb == NULL || ps_cb_len <= 0)
+	{
+		return;
+	}
+
+	// Each register is four floats.
+	int size = ps_cb_len * sizeof(float) * 4;
+
+	const bool unchanged = m_state.ps_cb_len == ps_cb_len && m_state.ps_cb != NULL && memcmp(m_state.ps_cb, ps_cb, size) == 0;
+
+	if(unchanged)
+	{
+		return;
+	}
+
+	// (Re)allocate the shadow copy when there is none or the recorded length is smaller.
+	if(m_state.ps_cb == NULL || m_state.ps_cb_len < ps_cb_len)
+	{
+		if(m_state.ps_cb != NULL)
+		{
+			_aligned_free(m_state.ps_cb);
+		}
+
+		m_state.ps_cb = (float*)_aligned_malloc(size, 32);
+	}
+
+	m_state.ps_cb_len = ps_cb_len;
+
+	memcpy(m_state.ps_cb, ps_cb, size);
+
+	m_dev->SetPixelShaderConstantF(0, ps_cb, ps_cb_len);
+}
+
+// Applies |ss| to sampler stages 0..4. Nothing happens when |ss| is NULL or is
+// already the cached sampler state.
+//
+// Stage 0 takes its filters and address modes from |ss| directly. Stages 1..4
+// use ss->Anisotropic[1] for all three filters and fixed address modes:
+// clamp on U/V/W for stages 1 and 2, wrap on U/V and clamp on W for stages 3 and 4.
+// MaxAnisotropy and MaxLOD from |ss| are applied to every stage.
+void GSDevice9::PSSetSamplerState(Direct3DSamplerState9* ss)
+{
+	if(ss == NULL || m_state.ps_ss == ss)
+	{
+		return;
+	}
+
+	m_state.ps_ss = ss;
+
+	m_dev->SetSamplerState(0, D3DSAMP_MINFILTER, ss->FilterMin[0]);
+	m_dev->SetSamplerState(0, D3DSAMP_MAGFILTER, ss->FilterMag[0]);
+	m_dev->SetSamplerState(0, D3DSAMP_MIPFILTER, ss->FilterMip[0]);
+	m_dev->SetSamplerState(0, D3DSAMP_ADDRESSU, ss->AddressU);
+	m_dev->SetSamplerState(0, D3DSAMP_ADDRESSV, ss->AddressV);
+	m_dev->SetSamplerState(0, D3DSAMP_ADDRESSW, ss->AddressW);
+	m_dev->SetSamplerState(0, D3DSAMP_MAXANISOTROPY, ss->MaxAnisotropy);
+	m_dev->SetSamplerState(0, D3DSAMP_MAXMIPLEVEL, ss->MaxLOD);
+
+	for(DWORD stage = 1; stage <= 4; stage++)
+	{
+		// Stages 1-2 clamp on U/V, stages 3-4 wrap; W is always clamped.
+		const D3DTEXTUREADDRESS uv = (stage <= 2) ? D3DTADDRESS_CLAMP : D3DTADDRESS_WRAP;
+
+		m_dev->SetSamplerState(stage, D3DSAMP_MINFILTER, ss->Anisotropic[1]);
+		m_dev->SetSamplerState(stage, D3DSAMP_MAGFILTER, ss->Anisotropic[1]);
+		m_dev->SetSamplerState(stage, D3DSAMP_MIPFILTER, ss->Anisotropic[1]);
+		m_dev->SetSamplerState(stage, D3DSAMP_ADDRESSU, uv);
+		m_dev->SetSamplerState(stage, D3DSAMP_ADDRESSV, uv);
+		m_dev->SetSamplerState(stage, D3DSAMP_ADDRESSW, D3DTADDRESS_CLAMP);
+		m_dev->SetSamplerState(stage, D3DSAMP_MAXANISOTROPY, ss->MaxAnisotropy);
+		m_dev->SetSamplerState(stage, D3DSAMP_MAXMIPLEVEL, ss->MaxLOD);
+	}
+}
+
+// Pushes the depth/stencil description |dss| to the device render states.
+// Skipped entirely when |dss| is the cached state. The depth function is only
+// sent when depth testing is enabled, and the stencil parameters only when
+// stencil testing is enabled.
+void GSDevice9::OMSetDepthStencilState(Direct3DDepthStencilState9* dss)
+{
+	if(m_state.dss == dss)
+	{
+		return;
+	}
+
+	m_state.dss = dss;
+
+	m_dev->SetRenderState(D3DRS_ZENABLE, dss->DepthEnable);
+	m_dev->SetRenderState(D3DRS_ZWRITEENABLE, dss->DepthWriteMask);
+
+	if(dss->DepthEnable)
+	{
+		m_dev->SetRenderState(D3DRS_ZFUNC, dss->DepthFunc);
+	}
+
+	m_dev->SetRenderState(D3DRS_STENCILENABLE, dss->StencilEnable);
+
+	if(!dss->StencilEnable)
+	{
+		return;
+	}
+
+	m_dev->SetRenderState(D3DRS_STENCILMASK, dss->StencilReadMask);
+	m_dev->SetRenderState(D3DRS_STENCILWRITEMASK, dss->StencilWriteMask);
+	m_dev->SetRenderState(D3DRS_STENCILFUNC, dss->StencilFunc);
+	m_dev->SetRenderState(D3DRS_STENCILPASS, dss->StencilPassOp);
+	m_dev->SetRenderState(D3DRS_STENCILFAIL, dss->StencilFailOp);
+	m_dev->SetRenderState(D3DRS_STENCILZFAIL, dss->StencilDepthFailOp);
+	m_dev->SetRenderState(D3DRS_STENCILREF, dss->StencilRef);
+}
+
+// Pushes the blend description |bs| and blend factor |bf| to the device render
+// states. Skipped when both match the cached values. The blend equation,
+// factors and blend factor are only sent when blending is enabled; the colour
+// write mask is always sent.
+void GSDevice9::OMSetBlendState(Direct3DBlendState9* bs, uint32 bf)
+{
+	if(m_state.bs == bs && m_state.bf == bf)
+	{
+		return;
+	}
+
+	m_state.bs = bs;
+	m_state.bf = bf;
+
+	const BOOL blending = bs->BlendEnable;
+
+	m_dev->SetRenderState(D3DRS_ALPHABLENDENABLE, blending);
+
+	if(blending)
+	{
+		m_dev->SetRenderState(D3DRS_BLENDOP, bs->BlendOp);
+		m_dev->SetRenderState(D3DRS_SRCBLEND, bs->SrcBlend);
+		m_dev->SetRenderState(D3DRS_DESTBLEND, bs->DestBlend);
+		m_dev->SetRenderState(D3DRS_SEPARATEALPHABLENDENABLE, TRUE);
+		m_dev->SetRenderState(D3DRS_BLENDOPALPHA, bs->BlendOpAlpha);
+		m_dev->SetRenderState(D3DRS_SRCBLENDALPHA, bs->SrcBlendAlpha);
+		m_dev->SetRenderState(D3DRS_DESTBLENDALPHA, bs->DestBlendAlpha);
+		m_dev->SetRenderState(D3DRS_BLENDFACTOR, bf);
+	}
+
+	m_dev->SetRenderState(D3DRS_COLORWRITEENABLE, bs->RenderTargetWriteMask);
+}
+
+// Binds the render target and depth-stencil surfaces and updates the scissor rectangle.
+//
+// rt, ds:  textures whose surfaces are bound; either may be NULL, in which case
+//          a NULL surface is forwarded to the device for that slot.
+// scissor: optional scissor rectangle. When NULL, the rectangle is derived from
+//          the size of rt, or of ds when rt is NULL. When neither a rectangle
+//          nor a texture is available the current scissor is left unchanged.
+//
+// Each device call is skipped when the value matches the one cached in m_state.
+void GSDevice9::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor)
+{
+	IDirect3DSurface9* rtv = NULL;
+	IDirect3DSurface9* dsv = NULL;
+
+	if(rt) rtv = *(GSTexture9*)rt;
+	if(ds) dsv = *(GSTexture9*)ds;
+
+	if(m_state.rtv != rtv)
+	{
+		m_state.rtv = rtv;
+
+		m_dev->SetRenderTarget(0, rtv);
+	}
+
+	if(m_state.dsv != dsv)
+	{
+		m_state.dsv = dsv;
+
+		m_dev->SetDepthStencilSurface(dsv);
+	}
+
+	// rt is allowed to be NULL above, so it must not be dereferenced blindly
+	// when no explicit scissor is given; fall back to ds for the default size.
+	GSTexture* sized = rt ? rt : ds;
+
+	if(scissor == NULL && sized == NULL)
+	{
+		return;
+	}
+
+	GSVector4i r = scissor ? *scissor : GSVector4i(sized->GetSize()).zwxy();
+
+	if(!m_state.scissor.eq(r))
+	{
+		m_state.scissor = r;
+
+		m_dev->SetScissorRect(r);
+	}
+}
+
+// Compiles a vertex shader from source and creates the matching vertex declaration.
+//
+// source, size: shader source text and its length in bytes.
+// filename:     name reported in compiler diagnostics.
+// entry:        entry point function name.
+// macro:        extra preprocessor macros, merged by PrepareShaderMacro.
+// vs:           receives the created vertex shader.
+// layout:       vertex element array passed to CreateVertexDeclaration.
+// count:        not used by this implementation.
+// il:           receives the created vertex declaration.
+//
+// The shader is compiled for the profile stored in m_shader.vs. Compiler
+// errors are printed to stdout. Throws GSDXRecoverableError when compilation,
+// shader creation or declaration creation fails.
+void GSDevice9::CompileShader(const char *source, size_t size, const char *filename, const string& entry, const D3D_SHADER_MACRO* macro, IDirect3DVertexShader9** vs, const D3DVERTEXELEMENT9* layout, int count, IDirect3DVertexDeclaration9** il)
+{
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro);
+
+	HRESULT hr;
+
+	CComPtr<ID3DBlob> shader, error;
+
+	// Pass the file name as the source name (as the pixel shader overload does)
+	// so that compiler diagnostics identify the file they refer to.
+	hr = s_pD3DCompile(source, size, filename, &m[0], nullptr, entry.c_str(), m_shader.vs.c_str(), 0, 0, &shader, &error);
+
+	if(SUCCEEDED(hr))
+	{
+		hr = m_dev->CreateVertexShader((DWORD*)shader->GetBufferPointer(), vs);
+	}
+	else if(error)
+	{
+		printf("%s\n", (const char*)error->GetBufferPointer());
+	}
+
+	ASSERT(SUCCEEDED(hr));
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+
+	hr = m_dev->CreateVertexDeclaration(layout, il);
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+// Compiles a pixel shader from source for the profile stored in m_shader.ps.
+//
+// source, size: shader source text and its length in bytes.
+// filename:     name passed to the compiler as the source name.
+// entry:        entry point function name.
+// macro:        extra preprocessor macros, merged by PrepareShaderMacro.
+// ps:           receives the created pixel shader.
+//
+// Compiler errors are printed to stdout. Throws GSDXRecoverableError when
+// compilation or shader creation fails.
+void GSDevice9::CompileShader(const char *source, size_t size, const char *filename, const string& entry, const D3D_SHADER_MACRO* macro, IDirect3DPixelShader9** ps)
+{
+	// Feature level 9_3 and above avoids flow control; lower levels skip validation.
+	uint32 flags = 0;
+
+	flags |= (m_shader.level >= D3D_FEATURE_LEVEL_9_3) ? D3DCOMPILE_AVOID_FLOW_CONTROL : D3DCOMPILE_SKIP_VALIDATION;
+
+	vector<D3D_SHADER_MACRO> m;
+
+	PrepareShaderMacro(m, macro);
+
+	CComPtr<ID3DBlob> shader, error;
+
+	HRESULT hr = s_pD3DCompile(source, size, filename, &m[0], nullptr, entry.c_str(), m_shader.ps.c_str(), flags, 0, &shader, &error);
+
+	if(FAILED(hr))
+	{
+		if(error)
+		{
+			printf("%s\n", (const char*)error->GetBufferPointer());
+		}
+	}
+	else
+	{
+		hr = m_dev->CreatePixelShader((DWORD*)shader->GetBufferPointer(), ps);
+	}
+
+	ASSERT(SUCCEEDED(hr));
+
+	if(FAILED(hr))
+	{
+		throw GSDXRecoverableError();
+	}
+}
diff --git a/plugins/GSdx_legacy/GSDevice9.h b/plugins/GSdx_legacy/GSDevice9.h
new file mode 100644
index 0000000000..6bb1477e34
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDevice9.h
@@ -0,0 +1,255 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDeviceDX.h"
+#include "GSTexture9.h"
+
+// Sampler description applied by GSDevice9::PSSetSamplerState.
+// In that function, element [0] of FilterMin/FilterMag/FilterMip and the
+// Address* members configure sampler stage 0, Anisotropic[1] is used for all
+// three filters of stages 1..4, and MaxAnisotropy/MaxLOD are applied to every stage.
+// NOTE(review): element [1] of the Filter* arrays and Anisotropic[0] are not
+// read by PSSetSamplerState; confirm their use elsewhere.
+struct Direct3DSamplerState9
+{
+    D3DTEXTUREFILTERTYPE FilterMin[2];
+    D3DTEXTUREFILTERTYPE FilterMag[2];
+	D3DTEXTUREFILTERTYPE FilterMip[2];
+	D3DTEXTUREFILTERTYPE Anisotropic[2];
+    D3DTEXTUREADDRESS AddressU;
+    D3DTEXTUREADDRESS AddressV;
+	D3DTEXTUREADDRESS AddressW;
+	// Written to D3DSAMP_MAXANISOTROPY.
+	DWORD MaxAnisotropy;
+	// Written to D3DSAMP_MAXMIPLEVEL.
+	DWORD MaxLOD;
+};
+
+// Depth/stencil description applied by GSDevice9::OMSetDepthStencilState.
+// DepthFunc is only sent to the device when DepthEnable is set, and the
+// stencil members only when StencilEnable is set.
+struct Direct3DDepthStencilState9
+{
+    BOOL DepthEnable;
+    // Written to D3DRS_ZWRITEENABLE.
+    BOOL DepthWriteMask;
+    D3DCMPFUNC DepthFunc;
+    BOOL StencilEnable;
+    // Written to D3DRS_STENCILMASK.
+    UINT8 StencilReadMask;
+    UINT8 StencilWriteMask;
+    D3DSTENCILOP StencilFailOp;
+    D3DSTENCILOP StencilDepthFailOp;
+    D3DSTENCILOP StencilPassOp;
+    D3DCMPFUNC StencilFunc;
+	uint32 StencilRef;
+};
+
+// Blend description applied by GSDevice9::OMSetBlendState.
+// The blend operation and factor members are only sent to the device when
+// BlendEnable is set; RenderTargetWriteMask is always sent.
+struct Direct3DBlendState9
+{
+    BOOL BlendEnable;
+    D3DBLEND SrcBlend;
+    D3DBLEND DestBlend;
+    D3DBLENDOP BlendOp;
+    D3DBLEND SrcBlendAlpha;
+    D3DBLEND DestBlendAlpha;
+    D3DBLENDOP BlendOpAlpha;
+    // Written to D3DRS_COLORWRITEENABLE.
+    UINT8 RenderTargetWriteMask;
+};
+
+// A vertex shader paired with a vertex declaration (input layout).
+struct GSVertexShader9
+{
+	CComPtr<IDirect3DVertexShader9> vs;
+	CComPtr<IDirect3DVertexDeclaration9> il;
+};
+
+// Direct3D 9 implementation of the GSDeviceDX device interface.
+class GSDevice9 : public GSDeviceDX
+{
+	GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format);
+
+	void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c);
+	void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset = 0);
+	void DoFXAA(GSTexture* sTex, GSTexture* dTex);
+	void DoShadeBoost(GSTexture* sTex, GSTexture* dTex);
+	void DoExternalFX(GSTexture* sTex, GSTexture* dTex);
+
+	void InitExternalFX();
+	void InitFXAA();
+
+	//
+
+	D3DCAPS9 m_d3dcaps;
+	D3DPRESENT_PARAMETERS m_pp;
+	CComPtr<IDirect3D9> m_d3d;
+	CComPtr<IDirect3DDevice9> m_dev;
+	CComPtr<IDirect3DSwapChain9> m_swapchain;
+	CComPtr<IDirect3DVertexBuffer9> m_vb;
+	CComPtr<IDirect3DVertexBuffer9> m_vb_old;
+	CComPtr<IDirect3DIndexBuffer9> m_ib;
+	CComPtr<IDirect3DIndexBuffer9> m_ib_old;
+	bool m_lost;
+	// Compared against D3DFMT_D24S8 by HasStencil() and HasDepth32().
+	D3DFORMAT m_depth_format;
+
+	// Cache of the values last pushed to m_dev. The IASet*/VSSet*/PSSet*/OMSet*
+	// setters compare their arguments against these members and skip the device
+	// call when nothing changed.
+	struct
+	{
+		IDirect3DVertexBuffer9* vb;
+		size_t vb_stride;
+		IDirect3DIndexBuffer9* ib;
+		IDirect3DVertexDeclaration9* layout;
+		D3DPRIMITIVETYPE topology;
+		IDirect3DVertexShader9* vs;
+		// Private copy of the last uploaded vertex shader constants
+		// (allocated with _aligned_malloc in VSSetShader) and its length in float4 registers.
+		float* vs_cb;
+		int vs_cb_len;
+		// Textures bound to slots 0..2 via PSSetShaderResource.
+		IDirect3DTexture9* ps_srvs[3];
+		IDirect3DPixelShader9* ps;
+		// Private copy of the last uploaded pixel shader constants
+		// (allocated with _aligned_malloc in PSSetShader) and its length in float4 registers.
+		float* ps_cb;
+		int ps_cb_len;
+		Direct3DSamplerState9* ps_ss;
+		GSVector4i scissor;
+		Direct3DDepthStencilState9* dss;
+		Direct3DBlendState9* bs;
+		// Blend factor last written to D3DRS_BLENDFACTOR.
+		uint32 bf;
+		IDirect3DSurface9* rtv;
+		IDirect3DSurface9* dsv;
+	} m_state;
+
+public: // TODO
+
+	bool FXAA_Compiled;
+	bool ExShader_Compiled;
+
+	struct
+	{
+		CComPtr<IDirect3DVertexDeclaration9> il;
+		CComPtr<IDirect3DVertexShader9> vs;
+		CComPtr<IDirect3DPixelShader9> ps[10];
+		Direct3DSamplerState9 ln;
+		Direct3DSamplerState9 pt;
+		Direct3DDepthStencilState9 dss;
+		Direct3DBlendState9 bs;
+	} m_convert;
+
+	struct
+	{
+		CComPtr<IDirect3DPixelShader9> ps[2];
+		Direct3DBlendState9 bs;
+	} m_merge;
+
+	struct
+	{
+		CComPtr<IDirect3DPixelShader9> ps[4];
+	} m_interlace;
+
+	struct
+	{
+		CComPtr<IDirect3DPixelShader9> ps;
+	} m_shaderfx;
+
+	struct
+	{
+		CComPtr<IDirect3DPixelShader9> ps;
+	} m_fxaa;
+
+	struct
+	{
+		CComPtr<IDirect3DPixelShader9> ps;
+	} m_shadeboost;
+
+	struct
+	{
+		Direct3DDepthStencilState9 dss;
+		Direct3DBlendState9 bs;
+	} m_date;
+
+	void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm);
+
+	// Shaders...
+
+	// Lookup tables keyed by a uint32 value.
+	// NOTE(review): keys are presumably the selector values used by SetupVS/SetupPS/SetupOM; confirm in the implementation.
+	hash_map<uint32, GSVertexShader9 > m_vs;
+	hash_map<uint32, CComPtr<IDirect3DPixelShader9> > m_ps;
+	hash_map<uint32, Direct3DSamplerState9* > m_ps_ss;
+	hash_map<uint32, Direct3DDepthStencilState9* > m_om_dss;
+	hash_map<uint32, Direct3DBlendState9* > m_om_bs;
+	hash_map<uint32, GSTexture*> m_mskfix;
+
+	GSTexture* CreateMskFix(uint32 size, uint32 msk, uint32 fix);
+
+public:
+	GSDevice9();
+	virtual ~GSDevice9();
+
+	bool Create(GSWnd* wnd);
+	bool Reset(int w, int h);
+	bool IsLost(bool update);
+	void Flip();
+
+	void SetVSync(bool enable);
+
+	void BeginScene();
+	void DrawPrimitive();
+	void DrawIndexedPrimitive();
+	void EndScene();
+
+	void ClearRenderTarget(GSTexture* t, const GSVector4& c);
+	void ClearRenderTarget(GSTexture* t, uint32 c);
+	void ClearDepth(GSTexture* t, float c);
+	void ClearStencil(GSTexture* t, uint8 c);
+
+	GSTexture* CreateRenderTarget(int w, int h, bool msaa, int format = 0);
+	GSTexture* CreateDepthStencil(int w, int h, bool msaa, int format = 0);
+	GSTexture* CreateTexture(int w, int h, int format = 0);
+	GSTexture* CreateOffscreen(int w, int h, int format = 0);
+
+	GSTexture* Resolve(GSTexture* t);
+
+	GSTexture* CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format = 0, int ps_shader = 0);
+
+	void CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r);
+
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, bool linear = true);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, Direct3DBlendState9* bs, bool linear = true);
+
+	// Pipeline state setters. Most of them cache their argument in m_state and
+	// only call into the device when the value changed.
+	void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
+	bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count);
+	void IAUnmapVertexBuffer();
+	void IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride);
+	void IASetIndexBuffer(const void* index, size_t count);
+	void IASetIndexBuffer(IDirect3DIndexBuffer9* ib);
+	void IASetInputLayout(IDirect3DVertexDeclaration9* layout);
+	void IASetPrimitiveTopology(D3DPRIMITIVETYPE topology);
+	// vs_cb_len / ps_cb_len count float4 registers (the implementation copies len * 4 floats).
+	void VSSetShader(IDirect3DVertexShader9* vs, const float* vs_cb, int vs_cb_len);
+	// Binds sr0/sr1 to texture slots 0/1 and clears slot 2.
+	void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
+	void PSSetShaderResource(int i, GSTexture* sr);
+	void PSSetShader(IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len);
+	// A NULL ss is ignored.
+	void PSSetSamplerState(Direct3DSamplerState9* ss);
+	void OMSetDepthStencilState(Direct3DDepthStencilState9* dss);
+	void OMSetBlendState(Direct3DBlendState9* bs, uint32 bf);
+	void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
+
+	// Direct access to the underlying IDirect3DDevice9.
+	IDirect3DDevice9* operator->() {return m_dev;}
+	operator IDirect3DDevice9*() {return m_dev;}
+
+	// Compile a vertex shader (plus its vertex declaration) or a pixel shader
+	// from source; both throw GSDXRecoverableError on failure.
+	void CompileShader(const char *source, size_t size, const char *filename, const string& entry, const D3D_SHADER_MACRO* macro, IDirect3DVertexShader9** vs, const D3DVERTEXELEMENT9* layout, int count, IDirect3DVertexDeclaration9** il);
+	void CompileShader(const char *source, size_t size, const char *filename, const string& entry, const D3D_SHADER_MACRO* macro, IDirect3DPixelShader9** ps);
+
+	void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
+	// Empty implementation.
+	void SetupGS(GSSelector sel) {}
+	void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel);
+	void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix);
+
+	// True when the depth format is D3DFMT_D24S8.
+	bool HasStencil() { return m_depth_format == D3DFMT_D24S8; }
+	// True for any depth format other than D3DFMT_D24S8.
+	bool HasDepth32() { return m_depth_format != D3DFMT_D24S8; }
+
+	static uint32 GetMaxDepth(uint32 msaaCount = 0, std::string adapter_id = "");
+	static void ForceValidMsaaConfig();
+
+};
+
diff --git a/plugins/GSdx_legacy/GSDeviceDX.cpp b/plugins/GSdx_legacy/GSDeviceDX.cpp
new file mode 100644
index 0000000000..2844f37e40
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceDX.cpp
@@ -0,0 +1,241 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSDeviceDX.h"
+#include <VersionHelpers.h>
+
+// Handle of the loaded D3D compiler DLL and the D3DCompile entry point resolved
+// from it; both are set by LoadD3DCompiler() and cleared by FreeD3DCompiler().
+HMODULE GSDeviceDX::s_d3d_compiler_dll = nullptr;
+decltype(&D3DCompile) GSDeviceDX::s_pD3DCompile = nullptr;
+// Set by LoadD3DCompiler(): true when the D3DCompiler_43.dll fallback was
+// loaded, false when d3dcompiler_47.dll was loaded.
+bool GSDeviceDX::s_old_d3d_compiler_dll;
+
+// Reads the MSAA setting from the configuration and resets the MSAA descriptor.
+// "UserHacks_MSAA" is only honoured when "UserHacks" is enabled; otherwise MSAA is 0.
+GSDeviceDX::GSDeviceDX()
+{
+	const bool hacks_enabled = !!theApp.GetConfig("UserHacks", 0);
+
+	m_msaa = hacks_enabled ? theApp.GetConfig("UserHacks_MSAA", 0) : 0;
+
+	m_msaa_desc.Quality = 0;
+	m_msaa_desc.Count = 1;
+}
+
+// Nothing to release here; the body is intentionally empty.
+GSDeviceDX::~GSDeviceDX()
+{
+}
+
+// Loads the D3D shader compiler DLL and resolves D3DCompile from it.
+//
+// d3dcompiler_47.dll is tried first (application directory, then System32).
+// It ships with Windows 8.1 and later; on Windows 7 it may be present if it
+// was placed in the application directory. If it cannot be loaded and the OS
+// is older than Windows 8.1, D3DCompiler_43.dll is tried instead. On
+// Windows 8.1+ a missing _47 DLL means the system is likely broken, so no
+// fallback is attempted.
+//
+// Returns true when s_pD3DCompile has been resolved; false otherwise, in which
+// case no DLL is left loaded by this call.
+bool GSDeviceDX::LoadD3DCompiler()
+{
+	s_d3d_compiler_dll = LoadLibraryEx("d3dcompiler_47.dll", nullptr, LOAD_LIBRARY_SEARCH_APPLICATION_DIR | LOAD_LIBRARY_SEARCH_SYSTEM32);
+
+	const bool have_latest = (s_d3d_compiler_dll != nullptr);
+
+	if (!have_latest)
+	{
+		// Plain LoadLibrary is used for the fallback because LoadLibraryEx
+		// causes problems on some Windows 7 systems.
+		if (!IsWindows8Point1OrGreater())
+			s_d3d_compiler_dll = LoadLibrary("D3DCompiler_43.dll");
+
+		if (s_d3d_compiler_dll == nullptr)
+			return false;
+	}
+
+	s_old_d3d_compiler_dll = !have_latest;
+
+	s_pD3DCompile = reinterpret_cast<decltype(&D3DCompile)>(GetProcAddress(s_d3d_compiler_dll, "D3DCompile"));
+
+	if (s_pD3DCompile == nullptr)
+	{
+		// The DLL is useless without the entry point; unload it again.
+		FreeLibrary(s_d3d_compiler_dll);
+		s_d3d_compiler_dll = nullptr;
+		return false;
+	}
+
+	return true;
+}
+
+// Forgets the D3DCompile entry point and unloads the compiler DLL if one is loaded.
+void GSDeviceDX::FreeD3DCompiler()
+{
+	s_pD3DCompile = nullptr;
+
+	if (s_d3d_compiler_dll != nullptr)
+	{
+		FreeLibrary(s_d3d_compiler_dll);
+	}
+
+	s_d3d_compiler_dll = nullptr;
+}
+
+// Forwards to the base class FetchSurface, forcing |msaa| off whenever the
+// configured MSAA level (m_msaa) is below 2.
+GSTexture* GSDeviceDX::FetchSurface(int type, int w, int h, bool msaa, int format)
+{
+	const bool use_msaa = (m_msaa < 2) ? false : msaa;
+
+	return __super::FetchSurface(type, w, h, use_msaa, format);
+}
+
+// Stores the feature level and selects the matching shader model string and
+// shader profile names in m_shader.
+//
+// level:       Direct3D feature level to configure for.
+// compat_mode: for the 9_x levels, selects the "*_4_0_level_9_x" profiles
+//              instead of the native "*_2_0" / "*_3_0" ones; unused otherwise.
+//
+// Returns true for levels 9_1 through 11_0. Any other level asserts and
+// returns false; m_shader.level has already been overwritten at that point.
+// The 9_x levels set only the vs and ps profiles; gs and cs are left as they were.
+bool GSDeviceDX::SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode)
+{
+	m_shader.level = level;
+
+	if(level == D3D_FEATURE_LEVEL_9_1 || level == D3D_FEATURE_LEVEL_9_2)
+	{
+		m_shader.model = "0x200";
+		m_shader.vs = compat_mode ? "vs_4_0_level_9_1" : "vs_2_0";
+		m_shader.ps = compat_mode ? "ps_4_0_level_9_1" : "ps_2_0";
+	}
+	else if(level == D3D_FEATURE_LEVEL_9_3)
+	{
+		m_shader.model = "0x300";
+		m_shader.vs = compat_mode ? "vs_4_0_level_9_3" : "vs_3_0";
+		m_shader.ps = compat_mode ? "ps_4_0_level_9_3" : "ps_3_0";
+	}
+	else if(level == D3D_FEATURE_LEVEL_10_0)
+	{
+		m_shader.model = "0x400";
+		m_shader.vs = "vs_4_0";
+		m_shader.gs = "gs_4_0";
+		m_shader.ps = "ps_4_0";
+		m_shader.cs = "cs_4_0";
+	}
+	else if(level == D3D_FEATURE_LEVEL_10_1)
+	{
+		m_shader.model = "0x401";
+		m_shader.vs = "vs_4_1";
+		m_shader.gs = "gs_4_1";
+		m_shader.ps = "ps_4_1";
+		m_shader.cs = "cs_4_1";
+	}
+	else if(level == D3D_FEATURE_LEVEL_11_0)
+	{
+		m_shader.model = "0x500";
+		m_shader.vs = "vs_5_0";
+		m_shader.gs = "gs_5_0";
+		m_shader.ps = "ps_5_0";
+		m_shader.cs = "cs_5_0";
+	}
+	else
+	{
+		ASSERT(0);
+		return false;
+	}
+
+	return true;
+}
+
+// (A - B) * C + D
+// A: Cs/Cd/0
+// B: Cs/Cd/0
+// C: As/Ad/FIX
+// D: Cs/Cd/0
+
+// bogus: 0100, 0110, 0120, 0200, 0210, 0220, 1001, 1011, 1021
+// tricky: 1201, 1211, 1221
+
+// Source.rgb = float3(1, 1, 1);
+// 1201 Cd*(1 + As) => Source * Dest color + Dest * Source alpha
+// 1211 Cd*(1 + Ad) => Source * Dest color + Dest * Dest alpha
+// 1221 Cd*(1 + F) => Source * Dest color + Dest * Factor
+
+// Blend equation lookup table for (A - B) * C + D, with one entry per ABCD
+// combination (3 choices each, 81 entries). The four-digit number in each
+// trailing comment is the ABCD selector of that entry, followed by the formula
+// and what it reduces to.
+//
+// Each entry lists, in order: a flag (0 for a direct mapping, 1 for the entries
+// marked '*' whose D3D9 setup does not reproduce the formula exactly, 2 for the
+// entries marked '#' that rely on the source colour being forced to 1), the
+// blend operation, the source blend factor and the destination blend factor.
+// D3D9 evaluates ADD as Src*SrcBlend + Dst*DestBlend, SUBTRACT as
+// Src*SrcBlend - Dst*DestBlend and REVSUBTRACT as Dst*DestBlend - Src*SrcBlend.
+//
+// Entries 2112 and 2122 use D3DBLEND_ZERO as the source factor, like 2102, so
+// that they evaluate to 0 - Cd*X as documented rather than Cs - Cd*X.
+const GSDeviceDX::D3D9Blend GSDeviceDX::m_blendMapD3D9[3*3*3*3] =
+{
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 0000: (Cs - Cs)*As + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 0001: (Cs - Cs)*As + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 0002: (Cs - Cs)*As +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 0010: (Cs - Cs)*Ad + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 0011: (Cs - Cs)*Ad + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 0012: (Cs - Cs)*Ad +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 0020: (Cs - Cs)*F  + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 0021: (Cs - Cs)*F  + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 0022: (Cs - Cs)*F  +  0 ==> 0
+	{1, D3DBLENDOP_SUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_SRCALPHA},			//*0100: (Cs - Cd)*As + Cs ==> Cs*(As + 1) - Cd*As
+	{0, D3DBLENDOP_ADD, D3DBLEND_SRCALPHA, D3DBLEND_INVSRCALPHA},			// 0101: (Cs - Cd)*As + Cd ==> Cs*As + Cd*(1 - As)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_SRCALPHA},			// 0102: (Cs - Cd)*As +  0 ==> Cs*As - Cd*As
+	{1, D3DBLENDOP_SUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_DESTALPHA},		//*0110: (Cs - Cd)*Ad + Cs ==> Cs*(Ad + 1) - Cd*Ad
+	{0, D3DBLENDOP_ADD, D3DBLEND_DESTALPHA, D3DBLEND_INVDESTALPHA},			// 0111: (Cs - Cd)*Ad + Cd ==> Cs*Ad + Cd*(1 - Ad)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_DESTALPHA},		// 0112: (Cs - Cd)*Ad +  0 ==> Cs*Ad - Cd*Ad
+	{1, D3DBLENDOP_SUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_BLENDFACTOR},	//*0120: (Cs - Cd)*F  + Cs ==> Cs*(F + 1) - Cd*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_BLENDFACTOR, D3DBLEND_INVBLENDFACTOR},		// 0121: (Cs - Cd)*F  + Cd ==> Cs*F + Cd*(1 - F)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_BLENDFACTOR},	// 0122: (Cs - Cd)*F  +  0 ==> Cs*F - Cd*F
+	{1, D3DBLENDOP_ADD, D3DBLEND_SRCALPHA, D3DBLEND_ZERO},					//*0200: (Cs -  0)*As + Cs ==> Cs*(As + 1)
+	{0, D3DBLENDOP_ADD, D3DBLEND_SRCALPHA, D3DBLEND_ONE},					// 0201: (Cs -  0)*As + Cd ==> Cs*As + Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_SRCALPHA, D3DBLEND_ZERO},					// 0202: (Cs -  0)*As +  0 ==> Cs*As
+	{1, D3DBLENDOP_ADD, D3DBLEND_DESTALPHA, D3DBLEND_ZERO},					//*0210: (Cs -  0)*Ad + Cs ==> Cs*(Ad + 1)
+	{0, D3DBLENDOP_ADD, D3DBLEND_DESTALPHA, D3DBLEND_ONE},					// 0211: (Cs -  0)*Ad + Cd ==> Cs*Ad + Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_DESTALPHA, D3DBLEND_ZERO},					// 0212: (Cs -  0)*Ad +  0 ==> Cs*Ad
+	{1, D3DBLENDOP_ADD, D3DBLEND_BLENDFACTOR, D3DBLEND_ZERO},				//*0220: (Cs -  0)*F  + Cs ==> Cs*(F + 1)
+	{0, D3DBLENDOP_ADD, D3DBLEND_BLENDFACTOR, D3DBLEND_ONE},				// 0221: (Cs -  0)*F  + Cd ==> Cs*F + Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_BLENDFACTOR, D3DBLEND_ZERO},				// 0222: (Cs -  0)*F  +  0 ==> Cs*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVSRCALPHA, D3DBLEND_SRCALPHA},			// 1000: (Cd - Cs)*As + Cs ==> Cd*As + Cs*(1 - As)
+	{1, D3DBLENDOP_REVSUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_SRCALPHA},		//*1001: (Cd - Cs)*As + Cd ==> Cd*(As + 1) - Cs*As
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_SRCALPHA},		// 1002: (Cd - Cs)*As +  0 ==> Cd*As - Cs*As
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVDESTALPHA, D3DBLEND_DESTALPHA},			// 1010: (Cd - Cs)*Ad + Cs ==> Cd*Ad + Cs*(1 - Ad)
+	{1, D3DBLENDOP_REVSUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_DESTALPHA},	//*1011: (Cd - Cs)*Ad + Cd ==> Cd*(Ad + 1) - Cs*Ad
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_DESTALPHA},	// 1012: (Cd - Cs)*Ad +  0 ==> Cd*Ad - Cs*Ad
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVBLENDFACTOR, D3DBLEND_BLENDFACTOR},		// 1020: (Cd - Cs)*F  + Cs ==> Cd*F + Cs*(1 - F)
+	{1, D3DBLENDOP_REVSUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_BLENDFACTOR},//*1021: (Cd - Cs)*F  + Cd ==> Cd*(F + 1) - Cs*F
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_BLENDFACTOR},// 1022: (Cd - Cs)*F  +  0 ==> Cd*F - Cs*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 1100: (Cd - Cd)*As + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 1101: (Cd - Cd)*As + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 1102: (Cd - Cd)*As +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 1110: (Cd - Cd)*Ad + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 1111: (Cd - Cd)*Ad + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 1112: (Cd - Cd)*Ad +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 1120: (Cd - Cd)*F  + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 1121: (Cd - Cd)*F  + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 1122: (Cd - Cd)*F  +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_SRCALPHA},					// 1200: (Cd -  0)*As + Cs ==> Cs + Cd*As
+	{2, D3DBLENDOP_ADD, D3DBLEND_DESTCOLOR, D3DBLEND_SRCALPHA},				//#1201: (Cd -  0)*As + Cd ==> Cd*(1 + As)  // ffxii main menu background glow effect
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_SRCALPHA},					// 1202: (Cd -  0)*As +  0 ==> Cd*As
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_DESTALPHA},					// 1210: (Cd -  0)*Ad + Cs ==> Cs + Cd*Ad
+	{2, D3DBLENDOP_ADD, D3DBLEND_DESTCOLOR, D3DBLEND_DESTALPHA},			//#1211: (Cd -  0)*Ad + Cd ==> Cd*(1 + Ad)
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_DESTALPHA},					// 1212: (Cd -  0)*Ad +  0 ==> Cd*Ad
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_BLENDFACTOR},				// 1220: (Cd -  0)*F  + Cs ==> Cs + Cd*F
+	{2, D3DBLENDOP_ADD, D3DBLEND_DESTCOLOR, D3DBLEND_BLENDFACTOR},			//#1221: (Cd -  0)*F  + Cd ==> Cd*(1 + F)
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_BLENDFACTOR},				// 1222: (Cd -  0)*F  +  0 ==> Cd*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVSRCALPHA, D3DBLEND_ZERO},				// 2000: (0  - Cs)*As + Cs ==> Cs*(1 - As)
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_ONE},			// 2001: (0  - Cs)*As + Cd ==> Cd - Cs*As
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_SRCALPHA, D3DBLEND_ZERO},			// 2002: (0  - Cs)*As +  0 ==> 0 - Cs*As
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVDESTALPHA, D3DBLEND_ZERO},				// 2010: (0  - Cs)*Ad + Cs ==> Cs*(1 - Ad)
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_ONE},			// 2011: (0  - Cs)*Ad + Cd ==> Cd - Cs*Ad
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_DESTALPHA, D3DBLEND_ZERO},			// 2012: (0  - Cs)*Ad +  0 ==> 0 - Cs*Ad
+	{0, D3DBLENDOP_ADD, D3DBLEND_INVBLENDFACTOR, D3DBLEND_ZERO},			// 2020: (0  - Cs)*F  + Cs ==> Cs*(1 - F)
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_ONE},		// 2021: (0  - Cs)*F  + Cd ==> Cd - Cs*F
+	{0, D3DBLENDOP_REVSUBTRACT, D3DBLEND_BLENDFACTOR, D3DBLEND_ZERO},		// 2022: (0  - Cs)*F  +  0 ==> 0 - Cs*F
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ONE, D3DBLEND_SRCALPHA},				// 2100: (0  - Cd)*As + Cs ==> Cs - Cd*As
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_INVSRCALPHA},				// 2101: (0  - Cd)*As + Cd ==> Cd*(1 - As)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ZERO, D3DBLEND_SRCALPHA},				// 2102: (0  - Cd)*As +  0 ==> 0 - Cd*As
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ONE, D3DBLEND_DESTALPHA},				// 2110: (0  - Cd)*Ad + Cs ==> Cs - Cd*Ad
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_INVDESTALPHA},				// 2111: (0  - Cd)*Ad + Cd ==> Cd*(1 - Ad)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ZERO, D3DBLEND_DESTALPHA},			// 2112: (0  - Cd)*Ad +  0 ==> 0 - Cd*Ad
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ONE, D3DBLEND_BLENDFACTOR},			// 2120: (0  - Cd)*F  + Cs ==> Cs - Cd*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_INVBLENDFACTOR},			// 2121: (0  - Cd)*F  + Cd ==> Cd*(1 - F)
+	{0, D3DBLENDOP_SUBTRACT, D3DBLEND_ZERO, D3DBLEND_BLENDFACTOR},			// 2122: (0  - Cd)*F  +  0 ==> 0 - Cd*F
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 2200: (0  -  0)*As + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 2201: (0  -  0)*As + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 2202: (0  -  0)*As +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 2210: (0  -  0)*Ad + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 2211: (0  -  0)*Ad + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 2212: (0  -  0)*Ad +  0 ==> 0
+	{0, D3DBLENDOP_ADD, D3DBLEND_ONE, D3DBLEND_ZERO},						// 2220: (0  -  0)*F  + Cs ==> Cs
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ONE},						// 2221: (0  -  0)*F  + Cd ==> Cd
+	{0, D3DBLENDOP_ADD, D3DBLEND_ZERO, D3DBLEND_ZERO},						// 2222: (0  -  0)*F  +  0 ==> 0
+};
+
diff --git a/plugins/GSdx_legacy/GSDeviceDX.h b/plugins/GSdx_legacy/GSDeviceDX.h
new file mode 100644
index 0000000000..cdf03312ad
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceDX.h
@@ -0,0 +1,331 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSVector.h"
+#include "GSDevice.h"
+#include "GSAlignedClass.h"
+
+// Declarations shared by the Direct3D devices: shader constant-buffer layouts,
+// bit-packed pipeline selectors, the D3D9 blend lookup table and the pure
+// virtual Setup* entry points that concrete devices implement.
+class GSDeviceDX : public GSDevice
+{
+public:
+	#pragma pack(push, 1)
+
+	// Vertex shader constants.
+	// Update() reinterprets the object as an array of GSVector4i and touches
+	// elements 0..2, i.e. exactly the three members below, in this order.
+	__aligned(struct, 32) VSConstantBuffer
+	{
+		GSVector4 VertexScale;
+		GSVector4 VertexOffset;
+		GSVector4 TextureScale;
+
+		// Zero every member so that the first Update() compares against a known state.
+		VSConstantBuffer()
+		{
+			VertexScale = GSVector4::zero();
+			VertexOffset = GSVector4::zero();
+			TextureScale = GSVector4::zero();
+		}
+
+		// Copies *cb into this object when any of the three vectors differs.
+		// Returns true when a copy happened, false when the content was already equal.
+		__forceinline bool Update(const VSConstantBuffer* cb)
+		{
+			GSVector4i* a = (GSVector4i*)this;
+			GSVector4i* b = (GSVector4i*)cb;
+
+			GSVector4i b0 = b[0];
+			GSVector4i b1 = b[1];
+			GSVector4i b2 = b[2];
+
+			if(!((a[0] == b0) & (a[1] == b1) & (a[2] == b2)).alltrue())
+			{
+				a[0] = b0;
+				a[1] = b1;
+				a[2] = b2;
+
+				return true;
+			}
+
+			return false;
+		}
+	};
+
+	// Vertex shader selector: 6 declared bits packed into `key`.
+	struct VSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 bppz:2;
+				uint32 tme:1;
+				uint32 fst:1;
+				uint32 logz:1;
+				uint32 rtcopy:1;
+			};
+
+			uint32 key;
+		};
+
+		// The mask is wider than the 6 declared bits; the extra bits stay 0 after construction.
+		operator uint32() {return key & 0xff;}
+
+		VSSelector() : key(0) {}
+	};
+
+	// Pixel shader constants.
+	// Update() reinterprets the object as an array of GSVector4i; the seven
+	// members below are elements 0..6 in declaration order.
+	__aligned(struct, 32) PSConstantBuffer
+	{
+		GSVector4 FogColor_AREF;
+		GSVector4 HalfTexel;
+		GSVector4 WH;
+		GSVector4 MinMax;
+		GSVector4 MinF_TA;
+		GSVector4i MskFix;
+
+		GSVector4 TC_OffsetHack;
+
+		// Zero every member, including TC_OffsetHack, so that comparisons in
+		// Update() never read an indeterminate value.
+		PSConstantBuffer()
+		{
+			FogColor_AREF = GSVector4::zero();
+			HalfTexel = GSVector4::zero();
+			WH = GSVector4::zero();
+			MinMax = GSVector4::zero();
+			MinF_TA = GSVector4::zero();
+			MskFix = GSVector4i::zero();
+			TC_OffsetHack = GSVector4::zero();
+		}
+
+		// Copies *cb into this object when the content differs.
+		// Returns true when a copy happened, false when the content was already equal.
+		// HalfTexel (element 1) is deliberately left out of the comparison:
+		// if WH matches, HalfTexel does too. It is still copied on a mismatch.
+		__forceinline bool Update(const PSConstantBuffer* cb)
+		{
+			GSVector4i* a = (GSVector4i*)this;
+			GSVector4i* b = (GSVector4i*)cb;
+
+			GSVector4i b0 = b[0];
+			GSVector4i b1 = b[1];
+			GSVector4i b2 = b[2];
+			GSVector4i b3 = b[3];
+			GSVector4i b4 = b[4];
+			GSVector4i b5 = b[5];
+			GSVector4i b6 = b[6]; // TC_OffsetHack
+
+			if(!((a[0] == b0) & (a[2] == b2) & (a[3] == b3) & (a[4] == b4) & (a[5] == b5) & (a[6] == b6)).alltrue())
+			{
+				a[0] = b0;
+				a[1] = b1;
+				a[2] = b2;
+				a[3] = b3;
+				a[4] = b4;
+				a[5] = b5;
+				a[6] = b6;
+
+				return true;
+			}
+
+			return false;
+		}
+	};
+
+	// Geometry shader selector: 3 declared bits packed into `key`.
+	struct GSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 iip:1;
+				uint32 prim:2;
+			};
+
+			uint32 key;
+		};
+
+		operator uint32() {return key & 0x7;}
+
+		GSSelector() : key(0) {}
+	};
+
+	// Pixel shader selector: 31 declared bits packed into `key`.
+	struct PSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 fst:1;
+				uint32 wms:2;
+				uint32 wmt:2;
+				uint32 fmt:3;
+				uint32 aem:1;
+				uint32 tfx:3;
+				uint32 tcc:1;
+				uint32 atst:3;
+				uint32 fog:1;
+				uint32 clr1:1;
+				uint32 fba:1;
+				uint32 aout:1;
+				uint32 rt:1;
+				uint32 ltf:1;
+				uint32 colclip:2;
+				uint32 date:2;
+				uint32 spritehack:1;
+				uint32 tcoffsethack:1;
+				uint32 point_sampler:1;
+				uint32 shuffle:1;
+				uint32 read_ba:1;
+			};
+
+			uint32 key;
+		};
+
+		// The mask must cover all 31 declared bits. A narrower mask makes
+		// selectors that differ only in point_sampler / shuffle / read_ba
+		// convert to the same value.
+		operator uint32() {return key & 0x7fffffff;}
+
+		PSSelector() : key(0) {}
+	};
+
+	// Sampler selector: 3 declared bits packed into `key`.
+	struct PSSamplerSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 tau:1;
+				uint32 tav:1;
+				uint32 ltf:1;
+			};
+
+			uint32 key;
+		};
+
+		operator uint32() {return key & 0x7;}
+
+		PSSamplerSelector() : key(0) {}
+	};
+
+	// Depth/stencil selector: 6 declared bits packed into `key`.
+	struct OMDepthStencilSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 ztst:2;
+				uint32 zwe:1;
+				uint32 date:1;
+				uint32 fba:1;
+				uint32 alpha_stencil:1;
+			};
+
+			uint32 key;
+		};
+
+		operator uint32() {return key & 0x3f;}
+
+		OMDepthStencilSelector() : key(0) {}
+	};
+
+	// Blend selector: 14 declared bits packed into `key`.
+	// The second anonymous struct overlays the same bits: `abcd` spans the
+	// a/b/c/d fields and `wrgba` spans the four write-mask bits.
+	struct OMBlendSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 abe:1;
+				uint32 a:2;
+				uint32 b:2;
+				uint32 c:2;
+				uint32 d:2;
+				uint32 wr:1;
+				uint32 wg:1;
+				uint32 wb:1;
+				uint32 wa:1;
+				uint32 negative:1;
+			};
+
+			struct
+			{
+				uint32 _pad:1;
+				uint32 abcd:8;
+				uint32 wrgba:4;
+			};
+
+			uint32 key;
+		};
+
+		operator uint32() {return key & 0x3fff;}
+
+		OMBlendSelector() : key(0) {}
+
+		// True for the blend setup abe == 1, a == 1, b == 2, d == 1 (c is ignored by the mask).
+		bool IsCLR1() const
+		{
+			return (key & 0x19f) == 0x93; // abe == 1 && a == 1 && b == 2 && d == 1
+		}
+	};
+
+	// One entry of the D3D9 blend table: blend operation plus source/destination factors.
+	// NOTE(review): the meaning of `bogus` is not visible in this header.
+	struct D3D9Blend {int bogus, op, src, dst;};
+	// 3*3*3*3 = 81 entries; presumably one per (a, b, c, d) combination - confirm against the definition.
+	static const D3D9Blend m_blendMapD3D9[3*3*3*3];
+
+	#pragma pack(pop)
+
+protected:
+	// Shader compilation settings: feature level plus model/profile strings.
+	struct {D3D_FEATURE_LEVEL level; string model, vs, gs, ps, cs;} m_shader;
+	uint32 m_msaa;
+	DXGI_SAMPLE_DESC m_msaa_desc;
+
+	static HMODULE s_d3d_compiler_dll;
+	static decltype(&D3DCompile) s_pD3DCompile;
+	// Older version doesn't support D3D_COMPILE_STANDARD_FILE_INCLUDE, which
+	// could be useful for external shaders.
+	static bool s_old_d3d_compiler_dll;
+
+	GSTexture* FetchSurface(int type, int w, int h, bool msaa, int format);
+
+public:
+	GSDeviceDX();
+	virtual ~GSDeviceDX();
+
+	bool SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode);
+	// Writes the feature level stored in m_shader into `level`.
+	void GetFeatureLevel(D3D_FEATURE_LEVEL& level) const {level = m_shader.level;}
+
+	virtual void SetupVS(VSSelector sel, const VSConstantBuffer* cb) = 0;
+	virtual void SetupGS(GSSelector sel) = 0;
+	virtual void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel) = 0;
+	virtual void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix) = 0;
+
+	virtual void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm) = 0;
+
+	virtual bool HasStencil() = 0;
+	virtual bool HasDepth32() = 0;
+
+	static bool LoadD3DCompiler();
+	static void FreeD3DCompiler();
+
+	// Builds a shader macro list in `dst`:
+	//  1. copies entries from `src` until an entry with a NULL Name or Definition
+	//     is reached (a NULL `src` copies nothing),
+	//  2. appends SHADER_MODEL defined as m_shader.model,
+	//  3. appends a NULL/NULL terminator entry.
+	// The SHADER_MODEL definition points into m_shader.model, so `dst` must not
+	// outlive or straddle a change of that string.
+	template<class T> void PrepareShaderMacro(vector<T>& dst, const T* src)
+	{
+		dst.clear();
+
+		while(src && src->Definition && src->Name)
+		{
+			dst.push_back(*src++);
+		}
+
+		T m;
+
+		m.Name = "SHADER_MODEL";
+		m.Definition = m_shader.model.c_str();
+
+		dst.push_back(m);
+
+		m.Name = NULL;
+		m.Definition = NULL;
+
+		dst.push_back(m);
+	}
+};
+
diff --git a/plugins/GSdx_legacy/GSDeviceNull.cpp b/plugins/GSdx_legacy/GSDeviceNull.cpp
new file mode 100644
index 0000000000..d1b5d4cb41
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceNull.cpp
@@ -0,0 +1,47 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDeviceNull.h"
+
+// Runs the base-class creation and, when it succeeds, resets the device to 1x1.
+// Returns the result of GSDevice::Create(); the result of Reset() is not propagated.
+bool GSDeviceNull::Create(GSWnd* wnd)
+{
+	const bool created = GSDevice::Create(wnd);
+
+	if(created)
+	{
+		Reset(1, 1);
+	}
+
+	return created;
+}
+
+// The null device has no state of its own to rebuild: the outcome is whatever
+// the base-class Reset() reports.
+bool GSDeviceNull::Reset(int w, int h)
+{
+	const bool base_ok = GSDevice::Reset(w, h);
+
+	return base_ok;
+}
+
+// Allocates a GSTextureNull for the requested type, size and format.
+// The msaa argument is not used.
+GSTexture* GSDeviceNull::CreateSurface(int type, int w, int h, bool msaa, int format)
+{
+	GSTexture* surface = new GSTextureNull(type, w, h, format);
+
+	return surface;
+}
+
diff --git a/plugins/GSdx_legacy/GSDeviceNull.h b/plugins/GSdx_legacy/GSDeviceNull.h
new file mode 100644
index 0000000000..c61312ab1a
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceNull.h
@@ -0,0 +1,41 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDevice.h"
+#include "GSTextureNull.h"
+
+// Device that performs no rendering: surfaces are GSTextureNull objects and the
+// merge / interlace steps are empty.
+class GSDeviceNull : public GSDevice
+{
+private:
+	// Returns a new GSTextureNull (see GSDeviceNull.cpp).
+	GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format);
+
+	// Intentionally empty: nothing is drawn by this device.
+	void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c) {}
+	void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset = 0) {}
+
+public:
+	GSDeviceNull() {}
+
+	bool Create(GSWnd* wnd);
+	bool Reset(int w, int h);
+};
+
diff --git a/plugins/GSdx_legacy/GSDeviceOGL.cpp b/plugins/GSdx_legacy/GSDeviceOGL.cpp
new file mode 100644
index 0000000000..7af10383d9
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceOGL.cpp
@@ -0,0 +1,1697 @@
+/*
+ *	Copyright (C) 2011-2014 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDeviceOGL.h"
+#include "GLState.h"
+#include <fstream>
+
+#include "res/glsl_source.h"
+
+//#define ONLY_LINES
+
+// TODO: port these values into the PerfMon API
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+// Byte counters, only compiled in when ENABLE_OGL_DEBUG_MEM_BW is defined.
+uint64 g_real_texture_upload_byte = 0;
+uint64 g_vertex_upload_byte = 0;
+uint64 g_uniform_upload_byte = 0;
+#endif
+
+// Index passed as first argument to GSUniformBufferOGL when the matching
+// constant buffer is created in GSDeviceOGL::Create().
+// NOTE(review): presumably the uniform-block binding point - confirm in GSUniformBufferOGL.
+static const uint32 g_merge_cb_index      = 10;
+static const uint32 g_interlace_cb_index  = 11;
+static const uint32 g_shadeboost_cb_index = 12;
+static const uint32 g_fx_cb_index         = 14;
+static const uint32 g_convert_index       = 15;
+
+// Definitions of the static members of GSDeviceOGL.
+bool GSDeviceOGL::m_debug_gl_call = false;
+int  GSDeviceOGL::s_n = 0;
+FILE* GSDeviceOGL::m_debug_gl_file = NULL;
+
+// Puts every member into a known "nothing created yet" state. No GL object is
+// created here; m_shader stays NULL until Create() runs, which is what the
+// destructor tests to decide whether there is anything to release.
+GSDeviceOGL::GSDeviceOGL()
+	: m_msaa(0)
+	, m_window(NULL)
+	, m_fbo(0)
+	, m_fbo_read(0)
+	, m_va(NULL)
+	, m_apitrace(0)
+	, m_palette_ss(0)
+	, m_vs_cb(NULL)
+	, m_ps_cb(NULL)
+	, m_shader(NULL)
+{
+	// Zero-fill the aggregate members that have no constructor of their own.
+	memset(&m_merge_obj, 0, sizeof(m_merge_obj));
+	memset(&m_interlace, 0, sizeof(m_interlace));
+	memset(&m_convert, 0, sizeof(m_convert));
+	memset(&m_fxaa, 0, sizeof(m_fxaa));
+	memset(&m_shaderfx, 0, sizeof(m_shaderfx));
+	memset(&m_date, 0, sizeof(m_date));
+	memset(&m_shadeboost, 0, sizeof(m_shadeboost));
+	memset(&m_om_dss, 0, sizeof(m_om_dss));
+
+	GLState::Clear();
+
+#ifdef ENABLE_OGL_DEBUG
+	// Start from an empty debug log on every device construction.
+	m_debug_gl_file = fopen("GSdx_opengl_debug.txt","w");
+#endif
+
+	m_debug_gl_call = (theApp.GetConfig("debug_opengl", 0) != 0);
+}
+
+// Releases everything Create() allocated. The debug log is always closed;
+// the GL objects are only released when Create() actually ran (m_shader set).
+GSDeviceOGL::~GSDeviceOGL()
+{
+	if (m_debug_gl_file != NULL) {
+		fclose(m_debug_gl_file);
+		m_debug_gl_file = NULL;
+	}
+
+	// Create() was never called: nothing else was allocated.
+	if (!m_shader) {
+		return;
+	}
+
+	GL_PUSH("GSDeviceOGL destructor");
+
+	// Vertex buffer state
+	delete m_va;
+
+	// Merge
+	for (size_t n = 0; n < countof(m_merge_obj.ps); ++n) {
+		m_shader->Delete(m_merge_obj.ps[n]);
+	}
+	delete m_merge_obj.cb;
+
+	// Interlace
+	for (size_t n = 0; n < countof(m_interlace.ps); ++n) {
+		m_shader->Delete(m_interlace.ps[n]);
+	}
+	delete m_interlace.cb;
+
+	// Convert
+	m_shader->Delete(m_convert.vs);
+	for (size_t n = 0; n < countof(m_convert.ps); ++n) {
+		m_shader->Delete(m_convert.ps[n]);
+	}
+	delete m_convert.dss;
+	delete m_convert.dss_write;
+	delete m_convert.cb;
+
+	// FXAA
+	delete m_fxaa.cb;
+	m_shader->Delete(m_fxaa.ps);
+
+	// External shader FX
+	delete m_shaderfx.cb;
+	m_shader->Delete(m_shaderfx.ps);
+
+	// DATE
+	delete m_date.dss;
+
+	// Shade boost
+	delete m_shadeboost.cb;
+	m_shader->Delete(m_shadeboost.ps);
+
+	// Framebuffer objects
+	glDeleteFramebuffers(1, &m_fbo);
+	glDeleteFramebuffers(1, &m_fbo_read);
+
+	// HW renderer objects
+	delete m_vs_cb;
+	delete m_ps_cb;
+	glDeleteSamplers(1, &m_palette_ss);
+	m_shader->Delete(m_apitrace);
+
+	for (uint32 k = 0; k < countof(m_vs); ++k) {
+		m_shader->Delete(m_vs[k]);
+	}
+	for (uint32 k = 0; k < countof(m_gs); ++k) {
+		m_shader->Delete(m_gs[k]);
+	}
+	for (auto entry = m_ps.begin(); entry != m_ps.end(); ++entry) {
+		m_shader->Delete(entry->second);
+	}
+	m_ps.clear();
+
+	glDeleteSamplers(countof(m_ps_ss), m_ps_ss);
+
+	for (uint32 k = 0; k < countof(m_om_dss); ++k) {
+		delete m_om_dss[k];
+	}
+
+	PboPool::Destroy();
+
+	// The shader manager goes last: every shader/program above is deleted through it.
+	delete m_shader;
+	m_shader = NULL;
+
+	GL_POP();
+}
+
+// Allocates a GSTextureOGL and clears it when it is a render target (colour 0)
+// or a depth/stencil surface (depth 0). Other surface types are returned as
+// allocated. The msaa argument is not used.
+GSTexture* GSDeviceOGL::CreateSurface(int type, int w, int h, bool msaa, int fmt)
+{
+	GL_PUSH("Create surface");
+
+	GSTextureOGL* surface = new GSTextureOGL(type, w, h, fmt, m_fbo_read);
+
+	// NOTE: it is not certain that a render target always needs to be cleared,
+	// and the clear could be costly for big upscaling factors.
+	if (type == GSTexture::RenderTarget) {
+		ClearRenderTarget(surface, 0);
+	} else if (type == GSTexture::DepthStencil) {
+		// Only the depth part is cleared; the stencil is left alone for now.
+		ClearDepth(surface, 0);
+	}
+
+	GL_POP();
+	return surface;
+}
+
+// Forwards to GSDevice::FetchSurface(), always passing false for msaa:
+// the caller's msaa argument is ignored.
+GSTexture* GSDeviceOGL::FetchSurface(int type, int w, int h, bool msaa, int format)
+{
+	return GSDevice::FetchSurface(type, w, h, false, format);
+}
+
+// Builds every GL object the device needs, then finishes the window setup.
+//
+// Steps, in order: GL version/extension check (first call only), optional debug
+// output setup, FBOs, vertex array layout, sampler objects, the convert / merge /
+// interlace / shade-boost shaders and their uniform buffers, fixed rasterizer
+// state, the DATE depth-stencil state, clip control, the HW renderer shaders
+// and the PBO pool.
+//
+// Returns false when the GL version or a required extension is missing, or when
+// GSDevice::Create() fails; true otherwise. The result of Reset() is ignored.
+bool GSDeviceOGL::Create(GSWnd* wnd)
+{
+	// The capability checks only run while no window has been attached yet.
+	if (m_window == NULL) {
+		if (!GLLoader::check_gl_version(3, 3)) return false;
+
+		if (!GLLoader::check_gl_supported_extension()) return false;
+	}
+
+	m_window = wnd;
+
+	// ****************************************************************
+	// Debug helper
+	// ****************************************************************
+#ifdef ENABLE_OGL_DEBUG
+	if (theApp.GetConfig("debug_opengl", 0)) {
+		if (glDebugMessageCallback) {
+			glDebugMessageCallback((GLDEBUGPROC)DebugOutputToFile, NULL);
+			glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB);
+		}
+		if (glDebugMessageControl) {
+			glDebugMessageControl(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, NULL, true);
+			// Useless info message on Nvidia driver
+			GLuint ids[] = {0x20004};
+			glDebugMessageControl(GL_DEBUG_SOURCE_API_ARB, GL_DEBUG_TYPE_OTHER_ARB, GL_DONT_CARE, countof(ids), ids, false);
+		}
+	}
+#endif
+
+	// WARNING it must be done after the control setup (at least on MESA)
+	GL_PUSH("GSDeviceOGL::Create");
+
+	// ****************************************************************
+	// Various object
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Various");
+
+	m_shader = new GSShaderOGL(!!theApp.GetConfig("debug_glsl_shader", 0));
+
+	glGenFramebuffers(1, &m_fbo);
+	// Always write to the first buffer
+	OMSetFBO(m_fbo);
+	GLenum target[1] = {GL_COLOR_ATTACHMENT0};
+	glDrawBuffers(1, target);
+	OMSetFBO(0);
+
+	glGenFramebuffers(1, &m_fbo_read);
+	// Always read from the first buffer
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, m_fbo_read);
+	glReadBuffer(GL_COLOR_ATTACHMENT0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+
+	GL_POP();
+
+	// ****************************************************************
+	// Vertex buffer state
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Vertex Buffer");
+
+	// Both vertex formats share one layout table, so they must have the same size.
+	ASSERT(sizeof(GSVertexPT1) == sizeof(GSVertex));
+	// Columns: component count, component type, normalized flag, stride, byte offset.
+	GSInputLayoutOGL il_convert[] =
+	{
+		{2 , GL_FLOAT          , GL_FALSE , sizeof(GSVertexPT1) , (const GLvoid*)(0) }  ,
+		{2 , GL_FLOAT          , GL_FALSE , sizeof(GSVertexPT1) , (const GLvoid*)(16) } ,
+		{4 , GL_UNSIGNED_BYTE  , GL_FALSE , sizeof(GSVertex)    , (const GLvoid*)(8) }  ,
+		{1 , GL_FLOAT          , GL_FALSE , sizeof(GSVertex)    , (const GLvoid*)(12) } ,
+		{2 , GL_UNSIGNED_SHORT , GL_FALSE , sizeof(GSVertex)    , (const GLvoid*)(16) } ,
+		{1 , GL_UNSIGNED_INT   , GL_FALSE , sizeof(GSVertex)    , (const GLvoid*)(20) } ,
+		{2 , GL_UNSIGNED_SHORT , GL_FALSE , sizeof(GSVertex)    , (const GLvoid*)(24) } ,
+		{4 , GL_UNSIGNED_BYTE  , GL_TRUE  , sizeof(GSVertex)    , (const GLvoid*)(28) } , // Only 1 byte is useful but hardware unit only support 4B
+	};
+	m_va = new GSVertexBufferStateOGL(il_convert, countof(il_convert));
+
+	GL_POP();
+	// ****************************************************************
+	// Pre Generate the different sampler object
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Sampler");
+
+	// One sampler per possible selector key.
+	for (uint32 key = 0; key < countof(m_ps_ss); key++) {
+		m_ps_ss[key] = CreateSampler(PSSamplerSelector(key));
+	}
+
+	GL_POP();
+
+	// ****************************************************************
+	// convert
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Convert");
+
+	m_convert.cb = new GSUniformBufferOGL(g_convert_index, sizeof(ConvertConstantBuffer));
+	// Upload once and forget about it
+	ConvertConstantBuffer cb;
+	cb.ScalingFactor = GSVector4i(theApp.GetConfig("upscale_multiplier", 1));
+	m_convert.cb->upload(&cb);
+
+	m_convert.vs = m_shader->Compile("convert.glsl", "vs_main", GL_VERTEX_SHADER, convert_glsl);
+	// Fragment entry points are named ps_main0, ps_main1, ...
+	for(size_t i = 0; i < countof(m_convert.ps); i++)
+		m_convert.ps[i] = m_shader->Compile("convert.glsl", format("ps_main%d", i), GL_FRAGMENT_SHADER, convert_glsl);
+
+	// Default-constructed selector (key == 0): nearest filtering.
+	PSSamplerSelector point;
+	m_convert.pt = GetSamplerID(point);
+
+	PSSamplerSelector bilinear;
+	bilinear.ltf = true;
+	m_convert.ln = GetSamplerID(bilinear);
+
+	m_convert.dss = new GSDepthStencilOGL();
+	m_convert.dss_write = new GSDepthStencilOGL();
+	m_convert.dss_write->EnableDepth();
+	m_convert.dss_write->SetDepth(GL_ALWAYS, true);
+
+	GL_POP();
+
+	// ****************************************************************
+	// merge
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Merge");
+
+	m_merge_obj.cb = new GSUniformBufferOGL(g_merge_cb_index, sizeof(MergeConstantBuffer));
+
+	for(size_t i = 0; i < countof(m_merge_obj.ps); i++)
+		m_merge_obj.ps[i] = m_shader->Compile("merge.glsl", format("ps_main%d", i), GL_FRAGMENT_SHADER, merge_glsl);
+
+	GL_POP();
+
+	// ****************************************************************
+	// interlace
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Interlace");
+
+	m_interlace.cb = new GSUniformBufferOGL(g_interlace_cb_index, sizeof(InterlaceConstantBuffer));
+
+	for(size_t i = 0; i < countof(m_interlace.ps); i++)
+		m_interlace.ps[i] = m_shader->Compile("interlace.glsl", format("ps_main%d", i), GL_FRAGMENT_SHADER, interlace_glsl);
+
+	GL_POP();
+
+	// ****************************************************************
+	// Shade boost
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Shadeboost");
+
+	m_shadeboost.cb = new GSUniformBufferOGL(g_shadeboost_cb_index, sizeof(ShadeBoostConstantBuffer));
+
+	// The three settings are baked into the shader as #define constants.
+	int ShadeBoost_Contrast = theApp.GetConfig("ShadeBoost_Contrast", 50);
+	int ShadeBoost_Brightness = theApp.GetConfig("ShadeBoost_Brightness", 50);
+	int ShadeBoost_Saturation = theApp.GetConfig("ShadeBoost_Saturation", 50);
+	std::string shade_macro = format("#define SB_SATURATION %d.0\n", ShadeBoost_Saturation)
+		+ format("#define SB_BRIGHTNESS %d.0\n", ShadeBoost_Brightness)
+		+ format("#define SB_CONTRAST %d.0\n", ShadeBoost_Contrast);
+
+	m_shadeboost.ps = m_shader->Compile("shadeboost.glsl", "ps_main", GL_FRAGMENT_SHADER, shadeboost_glsl, shade_macro);
+
+	GL_POP();
+
+	// ****************************************************************
+	// rasterization configuration
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Rasterization");
+
+#ifdef ONLY_LINES
+	glLineWidth(5.0);
+	glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
+#else
+	glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+#endif
+	glDisable(GL_CULL_FACE);
+	glEnable(GL_SCISSOR_TEST);
+	glDisable(GL_MULTISAMPLE);
+	glDisable(GL_DITHER); // Honestly I don't know!
+
+	GL_POP();
+
+	// ****************************************************************
+	// DATE
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::Date");
+
+	m_date.dss = new GSDepthStencilOGL();
+	m_date.dss->EnableStencil();
+	m_date.dss->SetStencil(GL_ALWAYS, GL_REPLACE);
+
+	GL_POP();
+	// ****************************************************************
+	// Use DX coordinate convention
+	// ****************************************************************
+
+
+	// By default: VS gl_position.z => [-1, 1]
+	//             FS depth         => [0, 1]
+	// Because of the -1 we lose a lot of precision for small GS values.
+	// With GL_ARB_clip_control the clip-space depth range can be switched
+	// to [0, 1], so gl_position.z can range over [0, 1] as well.
+	if (GLLoader::found_GL_ARB_clip_control) {
+		// Change depth convention
+		glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
+	}
+
+	// ****************************************************************
+	// HW renderer shader
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::CreateTextureFX");
+
+	CreateTextureFX();
+
+	GL_POP();
+
+	// ****************************************************************
+	// Pbo Pool allocation
+	// ****************************************************************
+	GL_PUSH("GSDeviceOGL::PBO");
+
+	PboPool::Init();
+
+	GL_POP();
+
+	// Done !
+	GL_POP();
+
+	// ****************************************************************
+	// Finish window setup and backbuffer
+	// ****************************************************************
+	if(!GSDevice::Create(wnd))
+		return false;
+
+	// rect.z / rect.w are passed as the width / height of the backbuffer.
+	GSVector4i rect = wnd->GetClientRect();
+	Reset(rect.z, rect.w);
+
+	// Basic checks to ensure the selector structures are correctly packed
+	ASSERT(sizeof(VSSelector) == 4);
+	ASSERT(sizeof(PSSelector) == 8);
+	ASSERT(sizeof(PSSamplerSelector) == 4);
+	ASSERT(sizeof(OMDepthStencilSelector) == 4);
+	ASSERT(sizeof(OMColorMaskSelector) == 4);
+
+	return true;
+}
+
+// Resets the base device and, on success, creates the placeholder backbuffer texture.
+// Returns the result of GSDevice::Reset().
+bool GSDeviceOGL::Reset(int w, int h)
+{
+	const bool base_ok = GSDevice::Reset(w, h);
+
+	if(base_ok)
+	{
+		// OpenGL allocates the real backbuffer together with the window, and
+		// rendering goes there whenever no FBO is bound. This texture is only a
+		// dummy that makes "the target is the backbuffer" easy to detect.
+		m_backbuffer = new GSTextureOGL(GSTextureOGL::Backbuffer, w, h, 0, m_fbo_read);
+	}
+
+	return base_ok;
+}
+
+// Forwards the vsync setting to the window object.
+void GSDeviceOGL::SetVSync(bool enable)
+{
+	m_wnd->SetVSync(enable);
+}
+
+// Presents the frame through the window object. In ENABLE_OGL_DEBUG builds the
+// GL debug log is checked first.
+void GSDeviceOGL::Flip()
+{
+	#ifdef ENABLE_OGL_DEBUG
+	CheckDebugLog();
+	#endif
+
+	m_wnd->Flip();
+}
+
+// Hook executed before every draw call: asks the shader manager to make its program current.
+void GSDeviceOGL::BeforeDraw()
+{
+	m_shader->UseProgram();
+}
+
+// Hook executed after every draw call. Currently does nothing.
+void GSDeviceOGL::AfterDraw()
+{
+}
+
+// Non-indexed draw of the vertex buffer state, wrapped in the Before/AfterDraw hooks.
+void GSDeviceOGL::DrawPrimitive()
+{
+	BeforeDraw();
+	m_va->DrawPrimitive();
+	AfterDraw();
+}
+
+// Non-indexed draw of a sub-range, wrapped in the Before/AfterDraw hooks.
+// offset and count are passed through unchanged to the vertex buffer state.
+void GSDeviceOGL::DrawPrimitive(int offset, int count)
+{
+	BeforeDraw();
+	m_va->DrawPrimitive(offset, count);
+	AfterDraw();
+}
+
+// Indexed draw of the vertex buffer state, wrapped in the Before/AfterDraw hooks.
+void GSDeviceOGL::DrawIndexedPrimitive()
+{
+	BeforeDraw();
+	m_va->DrawIndexedPrimitive();
+	AfterDraw();
+}
+
+// Indexed draw of a sub-range, wrapped in the Before/AfterDraw hooks.
+// offset and count are passed through unchanged; no range check is performed here.
+void GSDeviceOGL::DrawIndexedPrimitive(int offset, int count)
+{
+	//ASSERT(offset + count <= (int)m_index.count);
+
+	BeforeDraw();
+	m_va->DrawIndexedPrimitive(offset, count);
+	AfterDraw();
+}
+
+// Fills a colour target with `c`.
+// A NULL texture is ignored. A non-backbuffer texture that reports
+// HasBeenCleaned() is skipped. The scissor test is switched off for the clear
+// and switched on again afterwards, and the colour write mask in effect on
+// entry is restored on exit.
+void GSDeviceOGL::ClearRenderTarget(GSTexture* t, const GSVector4& c)
+{
+	if (t == NULL) {
+		return;
+	}
+
+	GSTextureOGL* tex = static_cast<GSTextureOGL*>(t);
+	if (tex->HasBeenCleaned() && !tex->IsBackbuffer()) {
+		return;
+	}
+
+	GL_PUSH("Clear RT %d", tex->GetID());
+
+	// TODO: check size of scissor before toggling it
+	glDisable(GL_SCISSOR_TEST);
+
+	const uint32 saved_mask = GLState::wrgba;
+	OMSetColorMaskState();
+
+	if (tex->IsBackbuffer()) {
+		// With no FBO bound, draw buffer 0 is GL_BACK by default, so no
+		// glDrawBuffer(GL_BACK) call is needed.
+		OMSetFBO(0);
+	} else {
+		OMSetFBO(m_fbo);
+		OMAttachRt(tex);
+	}
+
+	glClearBufferfv(GL_COLOR, 0, c.v);
+
+	OMSetColorMaskState(OMColorMaskSelector(saved_mask));
+
+	glEnable(GL_SCISSOR_TEST);
+
+	tex->WasCleaned();
+
+	GL_POP();
+}
+
+// Packed-colour convenience overload: converts `c` with GSVector4::rgba32(),
+// scales it by 1/255 and forwards to the GSVector4 overload.
+// A NULL texture is ignored.
+void GSDeviceOGL::ClearRenderTarget(GSTexture* t, uint32 c)
+{
+	if (t == NULL) {
+		return;
+	}
+
+	ClearRenderTarget(t, GSVector4::rgba32(c) * (1.0f / 255));
+}
+
+// Fills an integer colour target with the value `c` in all four components.
+// A NULL texture is ignored. The scissor test is left as it is, and the colour
+// write mask in effect on entry is restored on exit.
+void GSDeviceOGL::ClearRenderTarget_i(GSTexture* t, int32 c)
+{
+	if (t == NULL) {
+		return;
+	}
+
+	GSTextureOGL* tex = static_cast<GSTextureOGL*>(t);
+
+	GL_PUSH("Clear RTi %d", tex->GetID());
+
+	const uint32 saved_mask = GLState::wrgba;
+	OMSetColorMaskState();
+
+	// SCISSOR_TEST stays enabled on purpose: it limits the area cleaned for
+	// DATE, which matters with big upscaling factors.
+	int32 clear_value[4] = {c, c, c, c};
+
+	OMSetFBO(m_fbo);
+	OMAttachRt(tex);
+
+	// Blending is not supported when rendering to an integer texture.
+	if (GLState::blend) {
+		glDisable(GL_BLEND);
+	}
+
+	glClearBufferiv(GL_COLOR, 0, clear_value);
+
+	OMSetColorMaskState(OMColorMaskSelector(saved_mask));
+
+	// Put blending back the way the cached state says it was.
+	if (GLState::blend) {
+		glEnable(GL_BLEND);
+	}
+
+	GL_POP();
+}
+
+// Fills the depth part of `t` with `c`.
+// A NULL texture is ignored. The scissor test is switched off for the clear and
+// switched on again afterwards. When the cached depth mask is off, depth writes
+// are enabled just for the clear and disabled again.
+void GSDeviceOGL::ClearDepth(GSTexture* t, float c)
+{
+	if (t == NULL) {
+		return;
+	}
+
+	GSTextureOGL* tex = static_cast<GSTextureOGL*>(t);
+
+	GL_PUSH("Clear Depth %d", tex->GetID());
+
+	OMSetFBO(m_fbo);
+	OMAttachDs(tex);
+
+	// TODO: check size of scissor before toggling it
+	glDisable(GL_SCISSOR_TEST);
+
+	const bool writes_were_off = !GLState::depth_mask;
+
+	if (writes_were_off) {
+		glDepthMask(true);
+	}
+
+	glClearBufferfv(GL_DEPTH, 0, &c);
+
+	if (writes_were_off) {
+		glDepthMask(false);
+	}
+
+	glEnable(GL_SCISSOR_TEST);
+
+	GL_POP();
+}
+
+// Fills the stencil part of `t` with `c`.
+// A NULL texture is ignored. The scissor test is left as it is.
+void GSDeviceOGL::ClearStencil(GSTexture* t, uint8 c)
+{
+	if (t == NULL) {
+		return;
+	}
+
+	GSTextureOGL* tex = static_cast<GSTextureOGL*>(t);
+
+	GL_PUSH("Clear Stencil %d", tex->GetID());
+
+	// SCISSOR_TEST stays enabled on purpose: it limits the area cleaned for
+	// DATE, which matters with big upscaling factors.
+	OMSetFBO(m_fbo);
+	OMAttachDs(tex);
+
+	// glClearBufferiv takes a pointer to GLint, so widen the 8-bit value first.
+	const GLint stencil_value = c;
+	glClearBufferiv(GL_STENCIL, 0, &stencil_value);
+
+	GL_POP();
+}
+
+// Unpacks the selector bits (ltf, tau, tav, aniso) and forwards them to the
+// bool overload. Returns the new sampler object name.
+GLuint GSDeviceOGL::CreateSampler(PSSamplerSelector sel)
+{
+	return CreateSampler(sel.ltf, sel.tau, sel.tav, sel.aniso);
+}
+
+// Creates and configures a sampler object.
+//   bilinear : GL_LINEAR min/mag filtering when true, GL_NEAREST otherwise
+//   tau/tav  : GL_REPEAT on the S/T axis when true, GL_CLAMP_TO_EDGE otherwise
+//   aniso    : apply the "MaxAnisotropy" setting when it is non-zero and the
+//              anisotropic filtering extension was found
+// The R axis is always clamped and the LOD range is fixed to [0, 6].
+// Returns the new sampler object name.
+GLuint GSDeviceOGL::CreateSampler(bool bilinear, bool tau, bool tav, bool aniso)
+{
+	GL_PUSH("Create Sampler");
+
+	GLuint sampler;
+	glGenSamplers(1, &sampler);
+
+	// Filtering: the same mode is used for minification and magnification.
+	const GLint filter = bilinear ? GL_LINEAR : GL_NEAREST;
+	glSamplerParameteri(sampler, GL_TEXTURE_MIN_FILTER, filter);
+	glSamplerParameteri(sampler, GL_TEXTURE_MAG_FILTER, filter);
+
+	// Addressing
+	glSamplerParameteri(sampler, GL_TEXTURE_WRAP_S, tau ? GL_REPEAT : GL_CLAMP_TO_EDGE);
+	glSamplerParameteri(sampler, GL_TEXTURE_WRAP_T, tav ? GL_REPEAT : GL_CLAMP_TO_EDGE);
+	glSamplerParameteri(sampler, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
+
+	// LOD range
+	glSamplerParameterf(sampler, GL_TEXTURE_MIN_LOD, 0);
+	glSamplerParameterf(sampler, GL_TEXTURE_MAX_LOD, 6);
+
+	// Anisotropic filtering
+	const int max_aniso = theApp.GetConfig("MaxAnisotropy", 0);
+	if (aniso && max_aniso != 0 && GLLoader::found_GL_EXT_texture_filter_anisotropic) {
+		glSamplerParameterf(sampler, GL_TEXTURE_MAX_ANISOTROPY_EXT, (float)max_aniso);
+	}
+
+	GL_POP();
+	return sampler;
+}
+
+// Prepares the GL_R32I texture kept in m_date.t: creates it with the size of
+// `rt` when it does not exist yet, fills it with 0x7FFFFFFF and binds it as
+// read/write image unit 2.
+void GSDeviceOGL::InitPrimDateTexture(GSTexture* rt)
+{
+	const GSVector2i& size = rt->GetSize();
+
+	// Create a plain texture (not a render target) to avoid the useless clear to 0.
+	if (!m_date.t) {
+		m_date.t = CreateTexture(size.x, size.y, GL_R32I);
+	}
+
+	// Start from the largest signed 32-bit value.
+	ClearRenderTarget_i(m_date.t, 0x7FFFFFFF);
+
+	GSTextureOGL* date_tex = static_cast<GSTextureOGL*>(m_date.t);
+	glBindImageTexture(2, date_tex->GetID(), 0, false, 0, GL_READ_WRITE, GL_R32I);
+#ifdef ENABLE_OGL_DEBUG
+	// Also bind it as a shader resource: helps to see the texture in apitrace.
+	PSSetShaderResource(2, m_date.t);
+#endif
+}
+
+// Hands the texture kept in m_date.t (if any) to Recycle() and clears the pointer.
+void GSDeviceOGL::RecycleDateTexture()
+{
+	if (m_date.t == NULL) {
+		return;
+	}
+
+	// Debug aid: dump the texture before it is recycled.
+	//static_cast<GSTextureOGL*>(m_date.t)->Save(format("/tmp/date_adv_%04ld.csv", s_n));
+
+	Recycle(m_date.t);
+	m_date.t = NULL;
+}
+
+// Thin wrapper: issues glMemoryBarrier() with the given barrier bits.
+void GSDeviceOGL::Barrier(GLbitfield b)
+{
+	glMemoryBarrier(b);
+}
+
+/* Note: must be here because tfx_glsl is static */
+// Compiles the "vs_main" entry point of tfx_vgs.glsl with VS_BPPZ, VS_LOGZ and
+// VS_WILDHACK defined from the selector and `logz`. Returns the GL object name.
+GLuint GSDeviceOGL::CompileVS(VSSelector sel, int logz)
+{
+	std::string defines;
+
+	defines += format("#define VS_BPPZ %d\n", sel.bppz);
+	defines += format("#define VS_LOGZ %d\n", logz);
+	defines += format("#define VS_WILDHACK %d\n", sel.wildhack);
+
+	return m_shader->Compile("tfx_vgs.glsl", "vs_main", GL_VERTEX_SHADER, tfx_vgs_glsl, defines);
+}
+
+/* Note: must be here because tfx_glsl is static */
+// Compiles the "gs_main" entry point of tfx_vgs.glsl with GS_POINT defined from
+// the selector. Returns the GL object name.
+GLuint GSDeviceOGL::CompileGS(GSSelector sel)
+{
+	std::string defines;
+
+	defines += format("#define GS_POINT %d\n", sel.point);
+
+	return m_shader->Compile("tfx_vgs.glsl", "gs_main", GL_GEOMETRY_SHADER, tfx_vgs_glsl, defines);
+}
+
+/* Note: must be here because tfx_glsl is static */
+// Compiles the "ps_main" entry point of tfx.glsl. Every selector field used by
+// the shader is exported as a "#define PS_xxx <value>" line, in the order below.
+// Returns the GL object name.
+GLuint GSDeviceOGL::CompilePS(PSSelector sel)
+{
+	std::string defines;
+
+	defines += format("#define PS_FST %d\n", sel.fst);
+	defines += format("#define PS_WMS %d\n", sel.wms);
+	defines += format("#define PS_WMT %d\n", sel.wmt);
+	defines += format("#define PS_TEX_FMT %d\n", sel.tex_fmt);
+	defines += format("#define PS_DFMT %d\n", sel.dfmt);
+	defines += format("#define PS_AEM %d\n", sel.aem);
+	defines += format("#define PS_TFX %d\n", sel.tfx);
+	defines += format("#define PS_TCC %d\n", sel.tcc);
+	defines += format("#define PS_ATST %d\n", sel.atst);
+	defines += format("#define PS_FOG %d\n", sel.fog);
+	defines += format("#define PS_CLR1 %d\n", sel.clr1);
+	defines += format("#define PS_FBA %d\n", sel.fba);
+	defines += format("#define PS_LTF %d\n", sel.ltf);
+	defines += format("#define PS_COLCLIP %d\n", sel.colclip);
+	defines += format("#define PS_DATE %d\n", sel.date);
+	defines += format("#define PS_TCOFFSETHACK %d\n", sel.tcoffsethack);
+	// PS_POINT_SAMPLER is currently not exported:
+	//defines += format("#define PS_POINT_SAMPLER %d\n", sel.point_sampler);
+	defines += format("#define PS_BLEND_A %d\n", sel.blend_a);
+	defines += format("#define PS_BLEND_B %d\n", sel.blend_b);
+	defines += format("#define PS_BLEND_C %d\n", sel.blend_c);
+	defines += format("#define PS_BLEND_D %d\n", sel.blend_d);
+	defines += format("#define PS_IIP %d\n", sel.iip);
+	defines += format("#define PS_SHUFFLE %d\n", sel.shuffle);
+	defines += format("#define PS_READ_BA %d\n", sel.read_ba);
+	defines += format("#define PS_WRITE_RG %d\n", sel.write_rg);
+	defines += format("#define PS_FBMASK %d\n", sel.fbmask);
+	defines += format("#define PS_HDR %d\n", sel.hdr);
+	defines += format("#define PS_PABE %d\n", sel.pabe);
+
+	return m_shader->Compile("tfx.glsl", "ps_main", GL_FRAGMENT_SHADER, tfx_fs_all_glsl, defines);
+}
+
+// Compile a series of pixel shader variants and dump each of them through
+// m_shader->DumpAsm. The value returned by DumpAsm is accumulated per test
+// group, and a summary line (total, number of shaders, mean) is printed on
+// stderr after each group, followed by the grand total.
+void GSDeviceOGL::SelfShaderTest()
+{
+	// Both helper macros rely on locals of this function (sel, file, perf,
+	// nb_shader, all). They deliberately do NOT end with a semicolon so that
+	// the do/while(0) wrapper behaves as a single statement at the call site.
+
+	// Compile the shader described by 'sel', dump it to 'file' and release it
+#define RUN_TEST \
+	do { \
+		GLuint p = CompilePS(sel); \
+		nb_shader++; \
+		perf += m_shader->DumpAsm(file, p); \
+		m_shader->Delete(p); \
+	} while(0)
+
+	// Print the statistics of the current group, then reset the group counters
+#define PRINT_TEST(s) \
+	do { \
+		fprintf(stderr, "%s %d instructions for %d shaders (mean of %4.2f)\n", \
+				s, perf, nb_shader, (float)perf/(float)nb_shader); \
+		all += perf; \
+		perf = 0; \
+		nb_shader = 0; \
+	} while(0)
+
+	int nb_shader = 0; // number of shaders compiled in the current group
+	int perf = 0;      // sum of the DumpAsm results of the current group
+	int all = 0;       // sum of the DumpAsm results of every group
+	// Test: SW blending
+	for (int colclip = 0; colclip < 2; colclip++) {
+		for (int fmt = 0; fmt < 3; fmt++) {
+			for (int i = 0; i < 3; i++) {
+				PSSelector sel;
+				sel.atst = 1;
+				sel.tfx = 4;
+
+				int ib = (i + 1) % 3;
+				sel.blend_a = i;
+				sel.blend_b = ib;
+				sel.blend_c = i;
+				sel.blend_d = i;
+				sel.colclip = colclip;
+				sel.dfmt    = fmt;
+
+				std::string file = format("Shader_Blend_%d_%d_%d_%d__Cclip_%d__Dfmt_%d.glsl.asm",
+						i, ib, i, i, colclip, fmt);
+				RUN_TEST;
+			}
+		}
+	}
+	PRINT_TEST("Blend");
+
+	// Test: alpha test
+	for (int atst = 0; atst < 8; atst++) {
+		PSSelector sel;
+		sel.tfx = 4;
+
+		sel.atst = atst;
+		std::string file = format("Shader_Atst_%d.glsl.asm", atst);
+		RUN_TEST;
+	}
+	PRINT_TEST("Alpha Tst");
+
+	// Test: fbmask/fog/shuffle/read_ba
+	for (int read_ba = 0; read_ba < 2; read_ba++) {
+		PSSelector sel;
+		sel.tfx = 4;
+		sel.atst = 1;
+
+		sel.fog = 1;
+		sel.fbmask = 1;
+		sel.shuffle = 1;
+		sel.read_ba = read_ba;
+
+		std::string file = format("Shader_Fog__Fbmask__Shuffle__Read_ba_%d.glsl.asm", read_ba);
+		RUN_TEST;
+	}
+	PRINT_TEST("Fbmask/fog/shuffle/read_ba");
+
+	// Test: Date
+	for (int date = 1; date < 7; date++) {
+		PSSelector sel;
+		sel.tfx = 4;
+		sel.atst = 1;
+
+		sel.date = date;
+		std::string file = format("Shader_Date_%d.glsl.asm", date);
+		RUN_TEST;
+	}
+	PRINT_TEST("Date");
+
+	// Test: FBA
+	for (int fmt = 0; fmt < 3; fmt++) {
+		PSSelector sel;
+		sel.tfx = 4;
+		sel.atst = 1;
+
+		sel.fba = 1;
+		sel.dfmt = fmt;
+		sel.clr1 = 1;
+		std::string file = format("Shader_Fba__Clr1__Dfmt_%d.glsl.asm", fmt);
+		RUN_TEST;
+	}
+	PRINT_TEST("Fba/Clr1/Dfmt");
+
+	// Test: Fst/Tc/IIP
+	{
+		PSSelector sel;
+		sel.tfx = 1;
+		sel.atst = 1;
+
+		sel.fst = 0;
+		sel.iip = 1;
+		sel.tcoffsethack = 1;
+
+		std::string file = format("Shader_Fst__TC__Iip.glsl.asm");
+		RUN_TEST;
+	}
+	PRINT_TEST("Fst/Tc/IIp");
+
+	// Test: tfx/tcc
+	for (int tfx = 0; tfx < 5; tfx++) {
+		for (int tcc = 0; tcc < 2; tcc++) {
+			PSSelector sel;
+			sel.atst = 1;
+			sel.fst = 1;
+
+			sel.tfx = tfx;
+			sel.tcc = tcc;
+			std::string file = format("Shader_Tfx_%d__Tcc_%d.glsl.asm", tfx, tcc);
+			RUN_TEST;
+		}
+	}
+	PRINT_TEST("Tfx/Tcc");
+
+	// Test: Texture Sampling
+	for (int fmt = 0; fmt < 16; fmt++) {
+		// Formats whose 2 lower bits are both set are not tested
+		if ((fmt & 3) == 3) continue;
+
+		for (int ltf = 0; ltf < 2; ltf++) {
+			for (int aem = 0; aem < 2; aem++) {
+				for (int wms = 1; wms < 4; wms++) {
+					for (int wmt = 1; wmt < 4; wmt++) {
+						PSSelector sel;
+						sel.atst = 1;
+						sel.tfx  = 1;
+						sel.tcc  = 1;
+						sel.fst = 1;
+
+						sel.ltf     = ltf;
+						sel.aem     = aem;
+						sel.tex_fmt = fmt;
+						sel.wms     = wms;
+						sel.wmt     = wmt;
+						std::string file = format("Shader_Ltf_%d__Aem_%d__TFmt_%d__Wms_%d__Wmt_%d.glsl.asm",
+								ltf, aem, fmt, wms, wmt);
+						RUN_TEST;
+					}
+				}
+			}
+		}
+	}
+	PRINT_TEST("Texture Sampling");
+
+	fprintf(stderr, "\nTotal %d\n", all);
+
+#undef RUN_TEST
+#undef PRINT_TEST
+}
+
+GSTexture* GSDeviceOGL::CreateRenderTarget(int w, int h, bool msaa, int format)
+{
+	// A format of 0 selects the default format (GL_RGBA8)
+	if (format == 0)
+		format = GL_RGBA8;
+
+	return GSDevice::CreateRenderTarget(w, h, msaa, format);
+}
+
+GSTexture* GSDeviceOGL::CreateDepthStencil(int w, int h, bool msaa, int format)
+{
+	// A format of 0 selects the default format (GL_DEPTH32F_STENCIL8)
+	if (format == 0)
+		format = GL_DEPTH32F_STENCIL8;
+
+	return GSDevice::CreateDepthStencil(w, h, msaa, format);
+}
+
+GSTexture* GSDeviceOGL::CreateTexture(int w, int h, int format)
+{
+	// A format of 0 selects the default format (GL_RGBA8)
+	if (format == 0)
+		format = GL_RGBA8;
+
+	return GSDevice::CreateTexture(w, h, format);
+}
+
+GSTexture* GSDeviceOGL::CreateOffscreen(int w, int h, int format)
+{
+	// A format of 0 selects the default format (GL_RGBA8)
+	if (format == 0)
+		format = GL_RGBA8;
+
+	return GSDevice::CreateOffscreen(w, h, format);
+}
+
+// blit a texture into an offscreen buffer
+GSTexture* GSDeviceOGL::CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format, int ps_shader)
+{
+	ASSERT(src);
+
+	// A format of 0 means "no preference": fall back to GL_RGBA8
+	const int fmt = format ? format : GL_RGBA8;
+	ASSERT(fmt == GL_RGBA8 || fmt == GL_R16UI || fmt == GL_R32UI);
+
+	// The returned texture is a newly created offscreen texture of w x h
+	GSTexture* offscreen = CreateOffscreen(w, h, fmt);
+
+	// Draw the sRect area of the source over the whole offscreen texture
+	// with the requested conversion shader
+	const GSVector4 dRect(0, 0, w, h);
+	StretchRect(src, sRect, offscreen, dRect, m_convert.ps[ps_shader]);
+
+	return offscreen;
+}
+
+// Copy a sub part of texture (same as below but force a conversion)
+void GSDeviceOGL::CopyRectConv(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r, bool at_origin)
+{
+	ASSERT(sTex && dTex);
+	if (!sTex || !dTex)
+		return;
+
+	const GLuint src_id = static_cast<GSTextureOGL*>(sTex)->GetID();
+	const GLuint dst_id = static_cast<GSTextureOGL*>(dTex)->GetID();
+
+	GL_PUSH(format("CopyRectConv from %d to %d", src_id, dst_id).c_str());
+
+	// Attach the source texture to the read framebuffer so it can be copied from
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, m_fbo_read);
+	glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src_id, 0);
+
+	// Destination offset: either the texture origin or the same position as in the source
+	const GLint dst_x = at_origin ? 0 : r.x;
+	const GLint dst_y = at_origin ? 0 : r.y;
+	glCopyTextureSubImage2D(dst_id, GL_TEX_LEVEL_0, dst_x, dst_y, r.x, r.y, r.width(), r.height());
+
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+
+	GL_POP();
+}
+
+// Copy a sub part of a texture into another
+void GSDeviceOGL::CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r)
+{
+	ASSERT(sTex && dTex);
+	if (!sTex || !dTex)
+		return;
+
+	const GLuint src_id = static_cast<GSTextureOGL*>(sTex)->GetID();
+	const GLuint dst_id = static_cast<GSTextureOGL*>(dTex)->GetID();
+
+	GL_PUSH("CopyRect from %d to %d", src_id, dst_id);
+
+	if (!GLLoader::found_GL_ARB_copy_image) {
+		// Fallback when GL_ARB_copy_image is missing: slower copy (conversion is done)
+		CopyRectConv(sTex, dTex, r, true);
+	} else {
+		// Copy the r area of the source (level 0) to the origin of the destination (level 0)
+		glCopyImageSubData( src_id, GL_TEXTURE_2D,
+				0, r.x, r.y, 0,
+				dst_id, GL_TEXTURE_2D,
+				0, 0, 0, 0,
+				r.width(), r.height(), 1);
+	}
+
+	GL_POP();
+}
+
+void GSDeviceOGL::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader, bool linear)
+{
+	// Resolve the index into the table of conversion shaders, then defer to the GLuint overload
+	const GLuint ps = m_convert.ps[shader];
+
+	StretchRect(sTex, sRect, dTex, dRect, ps, linear);
+}
+
+// Same as the full StretchRect below, with blending disabled (m_NO_BLEND)
+void GSDeviceOGL::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, GLuint ps, bool linear)
+{
+	StretchRect(sTex, sRect, dTex, dRect, ps, m_NO_BLEND, linear);
+}
+
+// Draw the sRect area of sTex into the dRect area of dTex with the pixel
+// shader ps and the blend state bs. 'linear' selects the linear sampler
+// instead of the point sampler.
+void GSDeviceOGL::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, GLuint ps, int bs, bool linear)
+{
+	if (sTex == NULL || dTex == NULL)
+	{
+		ASSERT(0);
+		return;
+	}
+
+	// The color => float conversion shaders render into dTex as a depth target
+	const bool draw_in_depth =
+		ps == m_convert.ps[ShaderConvert_RGBA8_TO_FLOAT32] ||
+		ps == m_convert.ps[ShaderConvert_RGBA8_TO_FLOAT24] ||
+		ps == m_convert.ps[ShaderConvert_RGBA8_TO_FLOAT16] ||
+		ps == m_convert.ps[ShaderConvert_RGB5A1_TO_FLOAT16];
+
+	// Performance optimization. It might be faster to use a framebuffer blit for standard case
+	// instead to emulate it with shader
+	// see https://www.opengl.org/wiki/Framebuffer#Blitting
+
+	GL_PUSH("StretchRect from %d to %d", sTex->GetID(), dTex->GetID());
+
+	// ---------------- Init ----------------
+
+	BeginScene();
+
+	const GSVector2i ds = dTex->GetSize();
+
+	m_shader->VS(m_convert.vs);
+	m_shader->GS(0);
+	m_shader->PS(ps);
+
+	// ---------------- Output merger ----------------
+
+	if (draw_in_depth) {
+		OMSetDepthStencilState(m_convert.dss_write);
+		OMSetRenderTargets(NULL, dTex);
+	} else {
+		OMSetDepthStencilState(m_convert.dss);
+		OMSetRenderTargets(dTex, NULL);
+	}
+
+	OMSetBlendState(bs);
+	OMSetColorMaskState();
+
+	// ---------------- Input assembler ----------------
+
+	// Map the destination rectangle (in pixels) to the [-1;1] range.
+	// Contrary to the original DX code, top/bottom are not mirrored:
+	// OpenGL gets some issues with the coordinates otherwise (it fixes
+	// the scaling of the internal resolution).
+	const float left   = dRect.x * 2 / ds.x - 1.0f;
+	const float right  = dRect.z * 2 / ds.x - 1.0f;
+	const float top    = -1.0f + dRect.y * 2 / ds.y;
+	const float bottom = -1.0f + dRect.w * 2 / ds.y;
+
+	// Flip y axis only when we render in the backbuffer
+	// By default everything is render in the wrong order (ie dx).
+	// 1/ consistency between several pass rendering (interlace)
+	// 2/ in case some GSdx code expect thing in dx order.
+	// Only flipping the backbuffer is transparent (I hope)...
+	GSVector4 uv = sRect;
+	if (static_cast<GSTextureOGL*>(dTex)->IsBackbuffer()) {
+		uv.y = sRect.w;
+		uv.w = sRect.y;
+	}
+
+	// One quad drawn as a 4 vertices triangle strip
+	GSVertexPT1 vertices[] =
+	{
+		{GSVector4(left  , top   , 0.0f, 0.0f) , GSVector2(uv.x , uv.y)} ,
+		{GSVector4(right , top   , 0.0f, 0.0f) , GSVector2(uv.z , uv.y)} ,
+		{GSVector4(left  , bottom, 0.0f, 0.0f) , GSVector2(uv.x , uv.w)} ,
+		{GSVector4(right , bottom, 0.0f, 0.0f) , GSVector2(uv.z , uv.w)} ,
+	};
+
+	IASetVertexBuffer(vertices, 4);
+	IASetPrimitiveTopology(GL_TRIANGLE_STRIP);
+
+	// ---------------- Texture ----------------
+
+	PSSetShaderResource(0, sTex);
+	PSSetSamplerState(linear ? m_convert.ln : m_convert.pt);
+
+	// ---------------- Draw ----------------
+
+	DrawPrimitive();
+
+	// ---------------- End ----------------
+
+	EndScene();
+
+	GL_POP();
+}
+
+void GSDeviceOGL::DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c)
+{
+	GL_PUSH("DoMerge");
+
+	OMSetColorMaskState();
+
+	// Start from a destination filled with the color c
+	ClearRenderTarget(dTex, c);
+
+	// Second input: drawn first and without blending, skipped when slbg is set
+	const bool draw_second_input = sTex[1] && !slbg;
+	if (draw_second_input)
+	{
+		StretchRect(sTex[1], sRect[1], dTex, dRect[1], m_merge_obj.ps[0]);
+	}
+
+	// First input: drawn on top with the merge blend state, mmod selects the shader
+	if (sTex[0])
+	{
+		const int ps_index = mmod ? 1 : 0;
+
+		m_merge_obj.cb->upload(&c.v);
+
+		StretchRect(sTex[0], sRect[0], dTex, dRect[0], m_merge_obj.ps[ps_index], m_MERGE_BLEND);
+	}
+
+	GL_POP();
+}
+
+void GSDeviceOGL::DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset)
+{
+	GL_PUSH("DoInterlace");
+
+	OMSetColorMaskState();
+
+	const GSVector4 size = GSVector4(dTex->GetSize());
+
+	// Constants of the interlace shaders, derived from the destination height
+	InterlaceConstantBuffer cb;
+	cb.ZrH = GSVector2(0, 1.0f / size.y);
+	cb.hH = size.y / 2;
+	m_interlace.cb->upload(&cb);
+
+	// Whole source drawn over the whole destination, shifted vertically by yoffset
+	const GSVector4 sRect(0, 0, 1, 1);
+	const GSVector4 dRect(0.0f, yoffset, size.x, size.y + yoffset);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_interlace.ps[shader], linear);
+
+	GL_POP();
+}
+
+void GSDeviceOGL::DoFXAA(GSTexture* sTex, GSTexture* dTex)
+{
+	// The FXAA shader is only compiled the first time it is needed
+	if (!m_fxaa.ps) {
+		// GL_ARB_gpu_shader5 (GL4.0 extension) is mandatory, nothing is drawn without it
+		if (!GLLoader::found_GL_ARB_gpu_shader5)
+			return;
+
+		const std::string fxaa_macro =
+			"#define FXAA_GLSL_130 1\n"
+			"#extension GL_ARB_gpu_shader5 : enable\n";
+		m_fxaa.ps = m_shader->Compile("fxaa.fx", "ps_main", GL_FRAGMENT_SHADER, fxaa_fx, fxaa_macro);
+	}
+
+	GL_PUSH("DoFxaa");
+
+	OMSetColorMaskState();
+
+	// Whole source drawn over the whole destination with linear filtering
+	const GSVector2i size = dTex->GetSize();
+	const GSVector4 sRect(0, 0, 1, 1);
+	const GSVector4 dRect(0, 0, size.x, size.y);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_fxaa.ps, true);
+
+	GL_POP();
+}
+
+void GSDeviceOGL::DoExternalFX(GSTexture* sTex, GSTexture* dTex)
+{
+	// The external shader is only loaded and compiled the first time it is needed
+	if (!m_shaderfx.ps) {
+		// GL_ARB_gpu_shader5 (GL4.0 extension) is mandatory, nothing is drawn without it
+		if (!GLLoader::found_GL_ARB_gpu_shader5)
+			return;
+
+		// Optional configuration file: its content is given to the compiler as the macro header
+		std::string   config_name(theApp.GetConfig("shaderfx_conf", "dummy.ini"));
+		std::ifstream fconfig(config_name);
+		std::stringstream config;
+		if (!fconfig.good())
+			fprintf(stderr, "Warning failed to load '%s'. External Shader might be wrongly configured\n", config_name.c_str());
+		else
+			config << fconfig.rdbuf();
+
+		// Mandatory shader file: give up when it can't be read
+		std::string   shader_name(theApp.GetConfig("shaderfx_glsl", "dummy.glsl"));
+		std::ifstream fshader(shader_name);
+		std::stringstream shader;
+		if (!fshader.good()) {
+			fprintf(stderr, "Error failed to load '%s'. External Shader will be disabled !\n", shader_name.c_str());
+			return;
+		}
+		shader << fshader.rdbuf();
+
+		m_shaderfx.cb = new GSUniformBufferOGL(g_fx_cb_index, sizeof(ExternalFXConstantBuffer));
+		m_shaderfx.ps = m_shader->Compile("Extra", "ps_main", GL_FRAGMENT_SHADER, shader.str().c_str(), config.str());
+	}
+
+	GL_PUSH("DoExternalFX");
+
+	OMSetColorMaskState();
+
+	const GSVector2i size = dTex->GetSize();
+
+	// Shader constants: frame size and its reciprocal
+	ExternalFXConstantBuffer cb;
+	cb.xyFrame = GSVector2(size.x, size.y);
+	cb.rcpFrame = GSVector4(1.0f / size.x, 1.0f / size.y, 0.0f, 0.0f);
+	cb.rcpFrameOpt = GSVector4::zero();
+	m_shaderfx.cb->upload(&cb);
+
+	// Whole source drawn over the whole destination with linear filtering
+	const GSVector4 sRect(0, 0, 1, 1);
+	const GSVector4 dRect(0, 0, size.x, size.y);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_shaderfx.ps, true);
+
+	GL_POP();
+}
+
+void GSDeviceOGL::DoShadeBoost(GSTexture* sTex, GSTexture* dTex)
+{
+	GL_PUSH("DoShadeBoost");
+
+	OMSetColorMaskState();
+
+	const GSVector2i size = dTex->GetSize();
+
+	// Shader constants: reciprocal of the frame size
+	ShadeBoostConstantBuffer cb;
+	cb.rcpFrame = GSVector4(1.0f / size.x, 1.0f / size.y, 0.0f, 0.0f);
+	cb.rcpFrameOpt = GSVector4::zero();
+	m_shadeboost.cb->upload(&cb);
+
+	// Whole source drawn over the whole destination with linear filtering
+	const GSVector4 sRect(0, 0, 1, 1);
+	const GSVector4 dRect(0, 0, size.x, size.y);
+
+	StretchRect(sTex, sRect, dTex, dRect, m_shadeboost.ps, true);
+
+	GL_POP();
+}
+
+// First pass of the DATE handling: clear the stencil of ds, then draw the
+// 4 given vertices (triangle strip) with one of the two DATM conversion
+// shaders (selected by datm), sampling rt and rendering into ds only.
+void GSDeviceOGL::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm)
+{
+	GL_PUSH("DATE First Pass");
+
+	// sfex3 (after the capcom logo), vf4 (first menu fading in), ffxii shadows, rumble roses shadows, persona4 shadows
+
+	BeginScene();
+
+	ClearStencil(ds, 0);
+
+	m_shader->VS(m_convert.vs);
+	m_shader->GS(0);
+	m_shader->PS(m_convert.ps[datm ? ShaderConvert_DATM_1 : ShaderConvert_DATM_0]);
+
+	// om
+
+	OMSetDepthStencilState(m_date.dss);
+	// Blending is turned off directly in GL, GLState::blend is left untouched
+	// so that the previous state can be restored after the draw
+	if (GLState::blend) {
+		glDisable(GL_BLEND);
+	}
+	// No color target; the current scissor is kept
+	OMSetRenderTargets(NULL, ds, &GLState::scissor);
+
+	// ia
+
+	IASetVertexBuffer(vertices, 4);
+	IASetPrimitiveTopology(GL_TRIANGLE_STRIP);
+
+
+	// Texture
+
+	PSSetShaderResource(0, rt);
+	PSSetSamplerState(m_convert.pt);
+
+	DrawPrimitive();
+
+	// Restore the blending that was disabled above
+	if (GLState::blend) {
+		glEnable(GL_BLEND);
+	}
+
+	EndScene();
+
+	GL_POP();
+}
+
+// Forward the end of the scene to the vertex array object
+void GSDeviceOGL::EndScene()
+{
+	m_va->EndScene();
+}
+
+// Upload 'count' vertices through the vertex array object
+void GSDeviceOGL::IASetVertexBuffer(const void* vertices, size_t count)
+{
+	m_va->UploadVB(vertices, count);
+}
+
+// Upload 'count' indices through the vertex array object
+void GSDeviceOGL::IASetIndexBuffer(const void* index, size_t count)
+{
+	m_va->UploadIB(index, count);
+}
+
+// Select the GL primitive type (e.g. GL_TRIANGLE_STRIP) of the vertex array object
+void GSDeviceOGL::IASetPrimitiveTopology(GLenum topology)
+{
+	m_va->SetTopology(topology);
+}
+
+void GSDeviceOGL::PSSetShaderResource(int i, GSTexture* sr)
+{
+	ASSERT(i < (int)countof(GLState::tex_unit));
+
+	// Note: Nvidia debugger doesn't support the id 0 (ie the NULL texture),
+	// so a NULL resource leaves the current binding untouched
+	if (!sr)
+		return;
+
+	const GLuint id = static_cast<GSTextureOGL*>(sr)->GetID();
+
+	// Nothing to do when the texture is already bound on this unit
+	if (GLState::tex_unit[i] == id)
+		return;
+
+	GLState::tex_unit[i] = id;
+	glBindTextureUnit(i, id);
+}
+
+// Bind sr0 on texture unit 0 and sr1 on texture unit 1 (NULL textures are ignored)
+void GSDeviceOGL::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
+{
+	PSSetShaderResource(0, sr0);
+	PSSetShaderResource(1, sr1);
+}
+
+void GSDeviceOGL::PSSetSamplerState(GLuint ss)
+{
+	// Skip the GL call when the sampler is already the current one
+	if (GLState::ps_ss == ss)
+		return;
+
+	GLState::ps_ss = ss;
+	glBindSampler(0, ss);
+}
+
+void GSDeviceOGL::OMAttachRt(GSTextureOGL* rt)
+{
+	// A NULL texture detaches the color attachment (id 0)
+	GLuint id = 0;
+	if (rt) {
+		rt->WasAttached();
+		id = rt->GetID();
+	}
+
+	// Skip the GL call when the texture is already attached
+	if (GLState::rt == id)
+		return;
+
+	GLState::rt = id;
+	glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, id, 0);
+}
+
+void GSDeviceOGL::OMAttachDs(GSTextureOGL* ds)
+{
+	// A NULL texture detaches the depth/stencil attachment (id 0)
+	GLuint id = 0;
+	if (ds) {
+		ds->WasAttached();
+		id = ds->GetID();
+	}
+
+	// Skip the GL call when the texture is already attached
+	if (GLState::ds == id)
+		return;
+
+	GLState::ds = id;
+	glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, id, 0);
+}
+
+void GSDeviceOGL::OMSetFBO(GLuint fbo)
+{
+	// Skip the GL call when the framebuffer is already the current one
+	if (GLState::fbo == fbo)
+		return;
+
+	GLState::fbo = fbo;
+	glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+}
+
+// Apply both the depth part and the stencil part of the given state object.
+// dss is dereferenced without check, it must not be NULL.
+void GSDeviceOGL::OMSetDepthStencilState(GSDepthStencilOGL* dss)
+{
+	dss->SetupDepth();
+	dss->SetupStencil();
+}
+
+void GSDeviceOGL::OMSetColorMaskState(OMColorMaskSelector sel)
+{
+	// Skip the GL call when the write mask is already the current one
+	if (sel.wrgba == GLState::wrgba)
+		return;
+
+	GLState::wrgba = sel.wrgba;
+
+	// Only the mask of the draw buffer 0 is updated
+	glColorMaski(0, sel.wr, sel.wg, sel.wb, sel.wa);
+}
+
+// Select the entry blend_index of m_blendMapOGL as blending configuration.
+// Index 0 disables the blending. blend_factor (128 <=> 1.0) is only used
+// as constant blend color when is_blend_constant is set.
+void GSDeviceOGL::OMSetBlendState(uint8 blend_index, uint8 blend_factor, bool is_blend_constant)
+{
+	if (!blend_index) {
+		// No blending requested
+		if (GLState::blend) {
+			GLState::blend = false;
+			glDisable(GL_BLEND);
+		}
+		return;
+	}
+
+	if (!GLState::blend) {
+		GLState::blend = true;
+		glEnable(GL_BLEND);
+	}
+
+	if (is_blend_constant && GLState::bf != blend_factor) {
+		GLState::bf = blend_factor;
+		const float factor = (float)blend_factor / 128.0f;
+		gl_BlendColor(factor, factor, factor, factor);
+	}
+
+	const OGLBlend& entry = m_blendMapOGL[blend_index];
+
+	// Equation: only the RGB part comes from the table, alpha is always an addition
+	if (GLState::eq_RGB != entry.op) {
+		GLState::eq_RGB = entry.op;
+		// Use the indexed (draw buffer 0) function when it is available
+		if (glBlendEquationSeparateiARB)
+			glBlendEquationSeparateiARB(0, entry.op, GL_FUNC_ADD);
+		else
+			glBlendEquationSeparate(entry.op, GL_FUNC_ADD);
+	}
+
+	// Factors: only the RGB part comes from the table, alpha is always (GL_ONE, GL_ZERO)
+	if (GLState::f_sRGB != entry.src || GLState::f_dRGB != entry.dst) {
+		GLState::f_sRGB = entry.src;
+		GLState::f_dRGB = entry.dst;
+		// Use the indexed (draw buffer 0) function when it is available
+		if (glBlendFuncSeparateiARB)
+			glBlendFuncSeparateiARB(0, entry.src, entry.dst, GL_ONE, GL_ZERO);
+		else
+			glBlendFuncSeparate(entry.src, entry.dst, GL_ONE, GL_ZERO);
+	}
+}
+
+// Select the color target rt and the depth/stencil target ds (both can be
+// NULL), then update the viewport and the scissor. Without an explicit
+// scissor, the scissor covers the whole target.
+void GSDeviceOGL::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor)
+{
+	GSTextureOGL* RT = static_cast<GSTextureOGL*>(rt);
+	GSTextureOGL* DS = static_cast<GSTextureOGL*>(ds);
+
+	if (rt != NULL && RT->IsBackbuffer()) {
+		// Render in the backbuffer
+		OMSetFBO(0);
+	} else {
+		OMSetFBO(m_fbo);
+
+		// Note: the attachments must be done after OMSetFBO
+		if (rt)
+			OMAttachRt(RT);
+		else
+			OMAttachRt();
+
+		if (ds)
+			OMAttachDs(DS);
+		else
+			OMAttachDs();
+	}
+
+	// The viewport follows the size of the color target, or the size of the
+	// depth target when there is no color target. It is left untouched
+	// without any target.
+	GSVector2i size;
+	if (rt)
+		size = rt->GetSize();
+	else if (ds)
+		size = ds->GetSize();
+	else
+		size = GLState::viewport;
+
+	if (GLState::viewport != size)
+	{
+		GLState::viewport = size;
+		// FIXME ViewportIndexedf or ViewportIndexedfv (GL4.1)
+		glViewport(0, 0, size.x, size.y);
+	}
+
+	const GSVector4i r = scissor ? *scissor : GSVector4i(size).zwxy();
+
+	if (!GLState::scissor.eq(r))
+	{
+		GLState::scissor = r;
+		// FIXME ScissorIndexedv (GL4.1)
+		glScissor( r.x, r.y, r.width(), r.height() );
+	}
+}
+
+// Fetch the pending messages of the GL debug log (when m_debug_gl_call is
+// set) and give them one by one to DebugOutputToFile
+void GSDeviceOGL::CheckDebugLog()
+{
+	if (!m_debug_gl_call)
+		return;
+
+	const unsigned int max_messages = 16; // max. num. of messages that will be read from the log
+	const int log_capacity = 2048;        // size of the buffer shared by all the message texts
+
+	unsigned int sources[16] = {};
+	unsigned int types[16] = {};
+	unsigned int ids[16]   = {};
+	unsigned int severities[16] = {};
+	int lengths[16] = {};
+	char* log_text = new char[log_capacity];
+
+	const unsigned int fetched = glGetDebugMessageLogARB(max_messages, log_capacity, sources, types, ids, severities, lengths, log_text);
+
+	// The texts are stored back to back in log_text, lengths[] gives the size of each of them
+	unsigned int offset = 0;
+	for (unsigned int i = 0; i < fetched; i++)
+	{
+		DebugOutputToFile(sources[i], types[i], ids[i], severities[i], lengths[i], &log_text[offset], NULL);
+		offset += lengths[i];
+	}
+
+	delete[] log_text;
+}
+
+// Note: used as a callback of DebugMessageCallback. Don't change the signature
+// Format a GL debug message as "Type / ID / Severity / Message" and write it
+// to m_debug_gl_file (when it is open) and, in _DEBUG builds, to stderr.
+// Push/pop group messages are silently dropped. In _DEBUG builds, the 5th
+// high severity message closes the file and triggers an assertion.
+// A negative gl_length means that gl_message is NUL terminated.
+void GSDeviceOGL::DebugOutputToFile(GLenum gl_source, GLenum gl_type, GLuint id, GLenum gl_severity, GLsizei gl_length, const GLchar *gl_message, const void* userParam)
+{
+	std::string message(gl_message, gl_length >= 0 ? gl_length : strlen(gl_message));
+	std::string type, severity, source;
+	// Number of high severity messages received so far (kept across calls)
+	static int sev_counter = 0;
+	switch(gl_type) {
+		case GL_DEBUG_TYPE_ERROR_ARB               : type = "Error"; break;
+		case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB : type = "Deprecated bhv"; break;
+		case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB  : type = "Undefined bhv"; break;
+		case GL_DEBUG_TYPE_PORTABILITY_ARB         : type = "Portability"; break;
+		case GL_DEBUG_TYPE_PERFORMANCE_ARB         : type = "Perf"; break;
+		case GL_DEBUG_TYPE_OTHER_ARB               : type = "Others"; break;
+		case GL_DEBUG_TYPE_PUSH_GROUP              : return; // Don't print message injected by myself
+		case GL_DEBUG_TYPE_POP_GROUP               : return; // Don't print message injected by myself
+		default                                    : type = "TTT"; break;
+	}
+	switch(gl_severity) {
+		case GL_DEBUG_SEVERITY_HIGH_ARB   : severity = "High"; sev_counter++; break;
+		case GL_DEBUG_SEVERITY_MEDIUM_ARB : severity = "Mid"; break;
+		case GL_DEBUG_SEVERITY_LOW_ARB    : severity = "Low"; break;
+		default                           : severity = "Info"; break;
+	}
+	// NOTE(review): 'source' is computed but never printed below
+	switch(gl_source) {
+		case GL_DEBUG_SOURCE_API_ARB             : source = "API"; break;
+		case GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB   : source = "WINDOW"; break;
+		case GL_DEBUG_SOURCE_SHADER_COMPILER_ARB : source = "COMPILER"; break;
+		case GL_DEBUG_SOURCE_THIRD_PARTY_ARB     : source = "3rdparty"; break;
+		case GL_DEBUG_SOURCE_APPLICATION_ARB     : source = "Application"; break;
+		case GL_DEBUG_SOURCE_OTHER_ARB           : source = "Others"; break;
+		default                                  : source = "???"; break;
+	}
+
+	// NOTE(review): the "ID" field shows s_n (declared outside of this
+	// function), not the 'id' parameter of the message, which is unused.
+	// It looks intentional - confirm.
+	#ifdef _DEBUG
+	// Don't spam noisy information on the terminal
+	if (gl_severity != GL_DEBUG_SEVERITY_NOTIFICATION) {
+		fprintf(stderr,"Type:%s\tID:%d\tSeverity:%s\tMessage:%s\n", type.c_str(), s_n, severity.c_str(), message.c_str());
+	}
+	#endif
+
+	if (m_debug_gl_file)
+		fprintf(m_debug_gl_file,"Type:%s\tID:%d\tSeverity:%s\tMessage:%s\n", type.c_str(), s_n, severity.c_str(), message.c_str());
+
+#ifdef _DEBUG
+	if (sev_counter >= 5) {
+		// Close the file to flush the content on disk before exiting.
+		if (m_debug_gl_file) {
+			fclose(m_debug_gl_file);
+			m_debug_gl_file = NULL;
+		}
+		ASSERT(0);
+	}
+#endif
+}
+
+// (A - B) * C + D
+// A: Cs/Cd/0
+// B: Cs/Cd/0
+// C: As/Ad/FIX
+// D: Cs/Cd/0
+
+// bogus: 0100, 0110, 0120, 0200, 0210, 0220, 1001, 1011, 1021
+// tricky: 1201, 1211, 1221
+
+// Source.rgb = float3(1, 1, 1);
+// 1201 Cd*(1 + As) => Source * Dest color + Dest * Source alpha
+// 1211 Cd*(1 + Ad) => Source * Dest color + Dest * Dest alpha
+// 1221 Cd*(1 + F) => Source * Dest color + Dest * Factor
+
+// Special blending method table:
+// # (tricky) => 1 * Cd + Cd * F => Use (Cd, F) as factor of color (1, Cd)
+// * (bogus) => C * (1 + F ) + ... => factor is always bigger than 1 (except above case)
+// ? => Cs * F + Cd => do the multiplication in shader and addition in blending unit. It is an optimization
+
+// Copy Dx blend table and convert it to ogl
+// Blend equations: D3D name => OpenGL enum
+#define D3DBLENDOP_ADD			GL_FUNC_ADD
+#define D3DBLENDOP_SUBTRACT		GL_FUNC_SUBTRACT
+#define D3DBLENDOP_REVSUBTRACT	GL_FUNC_REVERSE_SUBTRACT
+
+// Blend factors: D3D name => OpenGL enum
+#define D3DBLEND_ONE			GL_ONE
+#define D3DBLEND_ZERO			GL_ZERO
+#define D3DBLEND_INVDESTALPHA	GL_ONE_MINUS_DST_ALPHA
+#define D3DBLEND_DESTALPHA		GL_DST_ALPHA
+#define D3DBLEND_DESTCOLOR		GL_DST_COLOR
+#define D3DBLEND_BLENDFACTOR	GL_CONSTANT_COLOR
+#define D3DBLEND_INVBLENDFACTOR GL_ONE_MINUS_CONSTANT_COLOR
+
+// The source alpha factors use the SRC1 enums, i.e. the alpha of the second
+// color output of the fragment shader (dual source blending)
+#define D3DBLEND_SRCALPHA		GL_SRC1_ALPHA
+#define D3DBLEND_INVSRCALPHA	GL_ONE_MINUS_SRC1_ALPHA
+
+// Special indexes of m_blendMapOGL: entry 0 (OMSetBlendState disables the
+// blending for index 0) and the extra last entry used by the merge operation
+const int GSDeviceOGL::m_NO_BLEND = 0;
+const int GSDeviceOGL::m_MERGE_BLEND = 3*3*3*3;
+
+// Blending table. Each row is { BLEND_* flags, blend equation, source factor,
+// destination factor }. The rows are sorted by the ABCD value written in the
+// trailing comment (4 digits in base 3, so the index is A*27 + B*9 + C*3 + D)
+// for the formula (A - B) * C + D. The extra last row (index m_MERGE_BLEND)
+// is the standard alpha blending used by the merge operation.
+//
+// Reminder of the GL equations:
+//   GL_FUNC_SUBTRACT         => src * src_factor - dst * dst_factor
+//   GL_FUNC_REVERSE_SUBTRACT => dst * dst_factor - src * src_factor
+//
+// Rows 2112 and 2122 use a ZERO source factor (as row 2102 does) so that the
+// result is "0 - Cd*X" as stated by their formula, and not "Cs - Cd*X".
+const GSDeviceOGL::OGLBlend GSDeviceOGL::m_blendMapOGL[3*3*3*3 + 1] =
+{
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 0000: (Cs - Cs)*As + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 0001: (Cs - Cs)*As + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 0002: (Cs - Cs)*As +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 0010: (Cs - Cs)*Ad + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 0011: (Cs - Cs)*Ad + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 0012: (Cs - Cs)*Ad +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 0020: (Cs - Cs)*F  + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 0021: (Cs - Cs)*F  + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 0022: (Cs - Cs)*F  +  0 ==> 0
+	{ BLEND_A_MAX                , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_SRCALPHA}       , //*0100: (Cs - Cd)*As + Cs ==> Cs*(As + 1) - Cd*As
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_SRCALPHA       , D3DBLEND_INVSRCALPHA}    , // 0101: (Cs - Cd)*As + Cd ==> Cs*As + Cd*(1 - As)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_SRCALPHA       , D3DBLEND_SRCALPHA}       , // 0102: (Cs - Cd)*As +  0 ==> Cs*As - Cd*As
+	{ BLEND_A_MAX                , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_DESTALPHA}      , //*0110: (Cs - Cd)*Ad + Cs ==> Cs*(Ad + 1) - Cd*Ad
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_DESTALPHA      , D3DBLEND_INVDESTALPHA}   , // 0111: (Cs - Cd)*Ad + Cd ==> Cs*Ad + Cd*(1 - Ad)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_DESTALPHA      , D3DBLEND_DESTALPHA}      , // 0112: (Cs - Cd)*Ad +  0 ==> Cs*Ad - Cd*Ad
+	{ BLEND_A_MAX                , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_BLENDFACTOR}    , //*0120: (Cs - Cd)*F  + Cs ==> Cs*(F + 1) - Cd*F
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_BLENDFACTOR    , D3DBLEND_INVBLENDFACTOR} , // 0121: (Cs - Cd)*F  + Cd ==> Cs*F + Cd*(1 - F)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_BLENDFACTOR    , D3DBLEND_BLENDFACTOR}    , // 0122: (Cs - Cd)*F  +  0 ==> Cs*F - Cd*F
+	{ BLEND_NO_BAR | BLEND_A_MAX , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , //*0200: (Cs -  0)*As + Cs ==> Cs*(As + 1)
+	{ BLEND_ACCU                 , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ONE}            , //?0201: (Cs -  0)*As + Cd ==> Cs*As + Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_SRCALPHA       , D3DBLEND_ZERO}           , // 0202: (Cs -  0)*As +  0 ==> Cs*As
+	{ BLEND_A_MAX                , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , //*0210: (Cs -  0)*Ad + Cs ==> Cs*(Ad + 1)
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_DESTALPHA      , D3DBLEND_ONE}            , // 0211: (Cs -  0)*Ad + Cd ==> Cs*Ad + Cd
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_DESTALPHA      , D3DBLEND_ZERO}           , // 0212: (Cs -  0)*Ad +  0 ==> Cs*Ad
+	{ BLEND_NO_BAR | BLEND_A_MAX , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , //*0220: (Cs -  0)*F  + Cs ==> Cs*(F + 1)
+	{ BLEND_ACCU                 , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ONE}            , //?0221: (Cs -  0)*F  + Cd ==> Cs*F + Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_BLENDFACTOR    , D3DBLEND_ZERO}           , // 0222: (Cs -  0)*F  +  0 ==> Cs*F
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_INVSRCALPHA    , D3DBLEND_SRCALPHA}       , // 1000: (Cd - Cs)*As + Cs ==> Cd*As + Cs*(1 - As)
+	{ BLEND_A_MAX                , D3DBLENDOP_REVSUBTRACT , D3DBLEND_SRCALPHA       , D3DBLEND_ONE}            , //*1001: (Cd - Cs)*As + Cd ==> Cd*(As + 1) - Cs*As
+	{ 0                          , D3DBLENDOP_REVSUBTRACT , D3DBLEND_SRCALPHA       , D3DBLEND_SRCALPHA}       , // 1002: (Cd - Cs)*As +  0 ==> Cd*As - Cs*As
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_INVDESTALPHA   , D3DBLEND_DESTALPHA}      , // 1010: (Cd - Cs)*Ad + Cs ==> Cd*Ad + Cs*(1 - Ad)
+	{ BLEND_A_MAX                , D3DBLENDOP_REVSUBTRACT , D3DBLEND_DESTALPHA      , D3DBLEND_ONE}            , //*1011: (Cd - Cs)*Ad + Cd ==> Cd*(Ad + 1) - Cs*Ad
+	{ 0                          , D3DBLENDOP_REVSUBTRACT , D3DBLEND_DESTALPHA      , D3DBLEND_DESTALPHA}      , // 1012: (Cd - Cs)*Ad +  0 ==> Cd*Ad - Cs*Ad
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_INVBLENDFACTOR , D3DBLEND_BLENDFACTOR}    , // 1020: (Cd - Cs)*F  + Cs ==> Cd*F + Cs*(1 - F)
+	{ BLEND_A_MAX                , D3DBLENDOP_REVSUBTRACT , D3DBLEND_BLENDFACTOR    , D3DBLEND_ONE}            , //*1021: (Cd - Cs)*F  + Cd ==> Cd*(F + 1) - Cs*F
+	{ 0                          , D3DBLENDOP_REVSUBTRACT , D3DBLEND_BLENDFACTOR    , D3DBLEND_BLENDFACTOR}    , // 1022: (Cd - Cs)*F  +  0 ==> Cd*F - Cs*F
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 1100: (Cd - Cd)*As + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 1101: (Cd - Cd)*As + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 1102: (Cd - Cd)*As +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 1110: (Cd - Cd)*Ad + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 1111: (Cd - Cd)*Ad + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 1112: (Cd - Cd)*Ad +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 1120: (Cd - Cd)*F  + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 1121: (Cd - Cd)*F  + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 1122: (Cd - Cd)*F  +  0 ==> 0
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_SRCALPHA}       , // 1200: (Cd -  0)*As + Cs ==> Cs + Cd*As
+	{ BLEND_C_CLR                , D3DBLENDOP_ADD         , D3DBLEND_DESTCOLOR      , D3DBLEND_SRCALPHA}       , //#1201: (Cd -  0)*As + Cd ==> Cd*(1 + As) // ffxii main menu background
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_SRCALPHA}       , // 1202: (Cd -  0)*As +  0 ==> Cd*As
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_DESTALPHA}      , // 1210: (Cd -  0)*Ad + Cs ==> Cs + Cd*Ad
+	{ BLEND_C_CLR                , D3DBLENDOP_ADD         , D3DBLEND_DESTCOLOR      , D3DBLEND_DESTALPHA}      , //#1211: (Cd -  0)*Ad + Cd ==> Cd*(1 + Ad)
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_DESTALPHA}      , // 1212: (Cd -  0)*Ad +  0 ==> Cd*Ad
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_BLENDFACTOR}    , // 1220: (Cd -  0)*F  + Cs ==> Cs + Cd*F
+	{ BLEND_C_CLR                , D3DBLENDOP_ADD         , D3DBLEND_DESTCOLOR      , D3DBLEND_BLENDFACTOR}    , //#1221: (Cd -  0)*F  + Cd ==> Cd*(1 + F)
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_BLENDFACTOR}    , // 1222: (Cd -  0)*F  +  0 ==> Cd*F
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_INVSRCALPHA    , D3DBLEND_ZERO}           , // 2000: (0  - Cs)*As + Cs ==> Cs*(1 - As)
+	{ BLEND_ACCU                 , D3DBLENDOP_REVSUBTRACT , D3DBLEND_ONE            , D3DBLEND_ONE}            , // 2001: (0  - Cs)*As + Cd ==> Cd - Cs*As
+	{ BLEND_NO_BAR               , D3DBLENDOP_REVSUBTRACT , D3DBLEND_SRCALPHA       , D3DBLEND_ZERO}           , // 2002: (0  - Cs)*As +  0 ==> 0 - Cs*As
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_INVDESTALPHA   , D3DBLEND_ZERO}           , // 2010: (0  - Cs)*Ad + Cs ==> Cs*(1 - Ad)
+	{ 0                          , D3DBLENDOP_REVSUBTRACT , D3DBLEND_DESTALPHA      , D3DBLEND_ONE}            , // 2011: (0  - Cs)*Ad + Cd ==> Cd - Cs*Ad
+	{ 0                          , D3DBLENDOP_REVSUBTRACT , D3DBLEND_DESTALPHA      , D3DBLEND_ZERO}           , // 2012: (0  - Cs)*Ad +  0 ==> 0 - Cs*Ad
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_INVBLENDFACTOR , D3DBLEND_ZERO}           , // 2020: (0  - Cs)*F  + Cs ==> Cs*(1 - F)
+	{ BLEND_ACCU                 , D3DBLENDOP_REVSUBTRACT , D3DBLEND_ONE            , D3DBLEND_ONE}            , // 2021: (0  - Cs)*F  + Cd ==> Cd - Cs*F
+	{ BLEND_NO_BAR               , D3DBLENDOP_REVSUBTRACT , D3DBLEND_BLENDFACTOR    , D3DBLEND_ZERO}           , // 2022: (0  - Cs)*F  +  0 ==> 0 - Cs*F
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_SRCALPHA}       , // 2100: (0  - Cd)*As + Cs ==> Cs - Cd*As
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_INVSRCALPHA}    , // 2101: (0  - Cd)*As + Cd ==> Cd*(1 - As)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ZERO           , D3DBLEND_SRCALPHA}       , // 2102: (0  - Cd)*As +  0 ==> 0 - Cd*As
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_DESTALPHA}      , // 2110: (0  - Cd)*Ad + Cs ==> Cs - Cd*Ad
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_INVDESTALPHA}   , // 2111: (0  - Cd)*Ad + Cd ==> Cd*(1 - Ad)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ZERO           , D3DBLEND_DESTALPHA}      , // 2112: (0  - Cd)*Ad +  0 ==> 0 - Cd*Ad
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ONE            , D3DBLEND_BLENDFACTOR}    , // 2120: (0  - Cd)*F  + Cs ==> Cs - Cd*F
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_INVBLENDFACTOR} , // 2121: (0  - Cd)*F  + Cd ==> Cd*(1 - F)
+	{ 0                          , D3DBLENDOP_SUBTRACT    , D3DBLEND_ZERO           , D3DBLEND_BLENDFACTOR}    , // 2122: (0  - Cd)*F  +  0 ==> 0 - Cd*F
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 2200: (0  -  0)*As + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 2201: (0  -  0)*As + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 2202: (0  -  0)*As +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 2210: (0  -  0)*Ad + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 2211: (0  -  0)*Ad + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 2212: (0  -  0)*Ad +  0 ==> 0
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ONE            , D3DBLEND_ZERO}           , // 2220: (0  -  0)*F  + Cs ==> Cs
+	{ 0                          , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ONE}            , // 2221: (0  -  0)*F  + Cd ==> Cd
+	{ BLEND_NO_BAR               , D3DBLENDOP_ADD         , D3DBLEND_ZERO           , D3DBLEND_ZERO}           , // 2222: (0  -  0)*F  +  0 ==> 0
+	{ 0                          , D3DBLENDOP_ADD         , GL_SRC_ALPHA            , GL_ONE_MINUS_SRC_ALPHA}  , // extra for merge operation
+};
diff --git a/plugins/GSdx_legacy/GSDeviceOGL.h b/plugins/GSdx_legacy/GSDeviceOGL.h
new file mode 100644
index 0000000000..2199bd1d22
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceOGL.h
@@ -0,0 +1,551 @@
+/*
+ *	Copyright (C) 2011-2013 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDevice.h"
+#include "GSTextureOGL.h"
+#include "GSdx.h"
+#include "GSVertexArrayOGL.h"
+#include "GSUniformBufferOGL.h"
+#include "GSShaderOGL.h"
+#include "GLState.h"
+
+// A couple of flags that describe the blending behavior
+#define BLEND_A_MAX		(0x100) // Impossible blending uses coeff bigger than 1
+#define BLEND_C_CLR		(0x200) // Clear color blending (use directly the destination color as blending factor)
+#define BLEND_NO_BAR	(0x400) // don't require texture barrier for the blending (because the RT is not used)
+#define BLEND_ACCU		(0x800) // Allow to use a mix of SW and HW blending to keep the best of the 2 worlds
+
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+extern uint64 g_real_texture_upload_byte;
+extern uint64 g_vertex_upload_byte;
+#endif
+
+// Bundles one depth/stencil configuration and applies it to OpenGL,
+// skipping every GL call whose value already matches the GLState cache.
+class GSDepthStencilOGL {
+	bool m_depth_enable;
+	GLenum m_depth_func;
+	bool m_depth_mask;
+	// Front and back face stencil could be configured separately, but a
+	// single shared configuration is used for both.
+	bool m_stencil_enable;
+	GLenum m_stencil_func;
+	GLenum m_stencil_spass_dpass_op;
+
+public:
+
+	// Default state: depth and stencil tests disabled, depth writes off.
+	GSDepthStencilOGL()
+		: m_depth_enable(false)
+		, m_depth_func(GL_ALWAYS)
+		, m_depth_mask(false)
+		, m_stencil_enable(false)
+		, m_stencil_func(0)
+		, m_stencil_spass_dpass_op(GL_KEEP)
+	{
+	}
+
+	void EnableDepth() { m_depth_enable = true; }
+	void EnableStencil() { m_stencil_enable = true; }
+
+	// Record the depth compare function and whether depth writes are enabled.
+	void SetDepth(GLenum func, bool mask)
+	{
+		m_depth_func = func;
+		m_depth_mask = mask;
+	}
+
+	// Record the stencil compare function and the operation applied when
+	// both the stencil and the depth test pass.
+	void SetStencil(GLenum func, GLenum pass)
+	{
+		m_stencil_func = func;
+		m_stencil_spass_dpass_op = pass;
+	}
+
+	// Push the recorded depth configuration to GL (only the parts that differ
+	// from the cached GLState values).
+	void SetupDepth()
+	{
+		if (GLState::depth != m_depth_enable) {
+			GLState::depth = m_depth_enable;
+			if (!m_depth_enable)
+				glDisable(GL_DEPTH_TEST);
+			else
+				glEnable(GL_DEPTH_TEST);
+		}
+
+		// Function and mask are only relevant while the test is enabled.
+		if (!m_depth_enable)
+			return;
+
+		if (GLState::depth_func != m_depth_func) {
+			GLState::depth_func = m_depth_func;
+			glDepthFunc(m_depth_func);
+		}
+
+		if (GLState::depth_mask != m_depth_mask) {
+			GLState::depth_mask = m_depth_mask;
+			glDepthMask((GLboolean)m_depth_mask);
+		}
+	}
+
+	// Push the recorded stencil configuration to GL (only the parts that
+	// differ from the cached GLState values).
+	void SetupStencil()
+	{
+		if (GLState::stencil != m_stencil_enable) {
+			GLState::stencil = m_stencil_enable;
+			if (!m_stencil_enable)
+				glDisable(GL_STENCIL_TEST);
+			else
+				glEnable(GL_STENCIL_TEST);
+		}
+
+		if (!m_stencil_enable)
+			return;
+
+		// Note: the mask argument selects which bitplane the operation considers
+		// (reference value 1, mask 1).
+		if (GLState::stencil_func != m_stencil_func) {
+			GLState::stencil_func = m_stencil_func;
+			glStencilFunc(m_stencil_func, 1, 1);
+		}
+
+		if (GLState::stencil_pass != m_stencil_spass_dpass_op) {
+			GLState::stencil_pass = m_stencil_spass_dpass_op;
+			glStencilOp(GL_KEEP, GL_KEEP, m_stencil_spass_dpass_op);
+		}
+	}
+
+	// True when depth writes are enabled in this configuration.
+	bool IsMaskEnable() { return m_depth_mask; }
+};
+
+class GSDeviceOGL final : public GSDevice
+{
+	public:
+	// Vertex-shader constants. The struct is treated as two consecutive
+	// 128-bit vectors by Update().
+	__aligned(struct, 32) VSConstantBuffer
+	{
+		GSVector4 Vertex_Scale_Offset;
+		GSVector4 TextureScale;
+
+		VSConstantBuffer()
+			: Vertex_Scale_Offset(GSVector4::zero())
+			, TextureScale(GSVector4::zero())
+		{
+		}
+
+		// Copies *cb into this buffer when the contents differ.
+		// Returns true if anything changed, false if both were already equal.
+		__forceinline bool Update(const VSConstantBuffer* cb)
+		{
+			GSVector4i* cached = (GSVector4i*)this;
+			GSVector4i* fresh = (GSVector4i*)cb;
+
+			const bool identical = ((cached[0] == fresh[0]) & (cached[1] == fresh[1])).alltrue();
+
+			if(identical)
+			{
+				return false;
+			}
+
+			cached[0] = fresh[0];
+			cached[1] = fresh[1];
+
+			return true;
+		}
+	};
+
+	// Vertex-shader variant selector: the bitfields below are overlaid on a
+	// 32-bit key (3 bits used, which matches the m_vs[1<<3] table).
+	struct VSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 wildhack:1;
+				uint32 bppz:2;
+
+				// Unused padding up to 32 bits
+				uint32 _free:29;
+			};
+
+			uint32 key;
+		};
+
+		// Implicit conversion to the packed key
+		operator uint32() {return key;}
+
+		VSSelector() : key(0) {}
+		VSSelector(uint32 k) : key(k) {}
+	};
+
+	// Geometry-shader variant selector: the bitfields below are overlaid on a
+	// 32-bit key (2 bits used, which matches the m_gs[1<<2] table).
+	struct GSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 sprite:1;
+				uint32 point:1;
+
+				// Unused padding up to 32 bits
+				uint32 _free:30;
+			};
+
+			uint32 key;
+		};
+
+		// Implicit conversion to the packed key
+		operator uint32() {return key;}
+
+		GSSelector() : key(0) {}
+		GSSelector(uint32 k) : key(k) {}
+	};
+
+	// Pixel-shader constants. Update() treats the struct as eight consecutive
+	// 128-bit vectors, in declaration order:
+	//   [0] FogColor_AREF  [1] WH  [2] TA_Af  [3] MskFix  [4] FbMask
+	//   [5] HalfTexel      [6] MinMax         [7] TC_OH_TS
+	__aligned(struct, 32) PSConstantBuffer
+	{
+		GSVector4 FogColor_AREF;
+		GSVector4 WH;
+		GSVector4 TA_Af;
+		GSVector4i MskFix;
+		GSVector4i FbMask;
+
+		GSVector4 HalfTexel;
+		GSVector4 MinMax;
+		GSVector4 TC_OH_TS;
+
+		// Zero every member. TA_Af must be initialised as well: Update()
+		// compares it, so leaving it indeterminate made the very first
+		// comparison read garbage.
+		PSConstantBuffer()
+		{
+			FogColor_AREF = GSVector4::zero();
+			WH            = GSVector4::zero();
+			TA_Af         = GSVector4::zero();
+			MskFix        = GSVector4i::zero();
+			FbMask        = GSVector4i::zero();
+			HalfTexel     = GSVector4::zero();
+			MinMax        = GSVector4::zero();
+			TC_OH_TS      = GSVector4::zero();
+		}
+
+		// Copies *cb into this buffer when the compared vectors differ.
+		// Returns true if the buffer was refreshed, false otherwise.
+		__forceinline bool Update(const PSConstantBuffer* cb)
+		{
+			GSVector4i* a = (GSVector4i*)this;
+			GSVector4i* b = (GSVector4i*)cb;
+
+			// Only vectors 0-4 are compared:
+			// if WH matches both HalfTexel and TC_OH_TS do too
+			// MinMax depends on WH and MskFix so no need to check it too
+			if(!((a[0] == b[0]) & (a[1] == b[1]) & (a[2] == b[2]) & (a[3] == b[3]) & (a[4] == b[4])).alltrue())
+			{
+				// Note previous check uses SSE already, a plain copy will be faster than any memcpy.
+				// All eight vectors are copied so the cached buffer never holds a
+				// mix of old and new values (previously MinMax and TC_OH_TS were left stale).
+				a[0] = b[0];
+				a[1] = b[1];
+				a[2] = b[2];
+				a[3] = b[3];
+				a[4] = b[4];
+				a[5] = b[5];
+				a[6] = b[6];
+				a[7] = b[7];
+
+				return true;
+			}
+
+			return false;
+		}
+	};
+
+	// Pixel-shader variant selector: two 32-bit words of bitfields overlaid
+	// on a single 64-bit key.
+	struct PSSelector
+	{
+		// Performance note: there are too many shader combinations.
+		// Frequent toggling between them might hurt performance and, worse,
+		// it could consume a lot of memory.
+		union
+		{
+			struct
+			{
+				// *** Word 1 (30 bits used + 2 free)
+				// Format
+				uint32 tex_fmt:4;
+				uint32 dfmt:2;
+				// Alpha extension/Correction
+				uint32 aem:1;
+				uint32 fba:1;
+				// Fog
+				uint32 fog:1;
+				// Flat/Gouraud shading
+				uint32 iip:1;
+				// Pixel test
+				uint32 date:3;
+				uint32 atst:3;
+				// Color sampling
+				uint32 fst:1; // Investigate to do it on the VS
+				uint32 tfx:3;
+				uint32 tcc:1;
+				uint32 wms:2;
+				uint32 wmt:2;
+				uint32 ltf:1;
+				// Shuffle and fbmask effect
+				uint32 shuffle:1;
+				uint32 read_ba:1;
+				uint32 write_rg:1;
+				uint32 fbmask:1;
+
+				uint32 _free1:2;
+
+				// *** Word 2 (13 bits used + 19 free)
+				// Blend and Colclip
+				uint32 blend_a:2;
+				uint32 blend_b:2;
+				uint32 blend_c:2;
+				uint32 blend_d:2;
+				uint32 clr1:1; // useful?
+				uint32 pabe:1;
+				uint32 hdr:1;
+				uint32 colclip:1;
+
+				// Hack
+				uint32 tcoffsethack:1;
+
+				uint32 _free2:19;
+			};
+
+			uint64 key;
+		};
+
+		// Implicit conversion to the full 64-bit key (no masking is applied).
+		operator uint64() {return key;}
+
+		PSSelector() : key(0) {}
+	};
+
+	// Sampler-state selector: the bitfields below are overlaid on a 32-bit key
+	// (4 bits used, which matches the m_ps_ss[1<<4] table).
+	struct PSSamplerSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 tau:1;
+				uint32 tav:1;
+				uint32 ltf:1;
+				uint32 aniso:1;
+
+				// Unused padding up to 32 bits
+				uint32 _free:28;
+			};
+
+			uint32 key;
+		};
+
+		// Implicit conversion to the packed key
+		operator uint32() {return key;}
+
+		PSSamplerSelector() : key(0) {}
+		PSSamplerSelector(uint32 k) : key(k) {}
+	};
+
+	// Depth/stencil state selector: the bitfields below are overlaid on a
+	// 32-bit key (4 bits used, which matches the m_om_dss[1<<4] table).
+	struct OMDepthStencilSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 ztst:2;
+				uint32 zwe:1;
+				uint32 date:1;
+
+				// Unused padding up to 32 bits
+				uint32 _free:28;
+			};
+
+			uint32 key;
+		};
+
+		// Implicit conversion to the packed key (no masking is applied).
+		operator uint32() {return key;}
+
+		OMDepthStencilSelector() : key(0) {}
+		OMDepthStencilSelector(uint32 k) : key(k) {}
+	};
+
+	// Color write-mask selector: one enable bit per channel, also reachable
+	// as the 4-bit group wrgba or through the full 32-bit key.
+	struct OMColorMaskSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 wr:1;
+				uint32 wg:1;
+				uint32 wb:1;
+				uint32 wa:1;
+
+				// Unused padding up to 32 bits
+				uint32 _free:28;
+			};
+
+			struct
+			{
+				uint32 wrgba:4;
+			};
+
+			uint32 key;
+		};
+
+		// Only the four channel bits are exposed; the mask keeps the result
+		// in the 0..15 range whatever the upper bits of key contain.
+		operator uint32() {return key & 0xf;}
+
+		// Default: all four channels writable.
+		OMColorMaskSelector() : key(0xF) {}
+		// key is zeroed first so the 28 unused bits are never left
+		// indeterminate when only the 4-bit channel group is assigned.
+		OMColorMaskSelector(uint32 c) : key(0) { wrgba = c; }
+	};
+
+	struct OGLBlend {uint16 bogus, op, src, dst;};
+	static const OGLBlend m_blendMapOGL[3*3*3*3 + 1];
+	static const int m_NO_BLEND;
+	static const int m_MERGE_BLEND;
+
+	static int s_n;
+
+	private:
+	uint32 m_msaa;				// Level of Msaa
+
+	static bool m_debug_gl_call;
+	static FILE* m_debug_gl_file;
+
+	GSWnd* m_window;
+
+	GLuint m_fbo;				// frame buffer container
+	GLuint m_fbo_read;			// frame buffer container only for reading
+
+	GSVertexBufferStateOGL* m_va;// state of the vertex buffer/array
+
+	struct {
+		GLuint ps[2];				 // program object
+		GSUniformBufferOGL* cb;		 // uniform buffer object
+	} m_merge_obj;
+
+	struct {
+		GLuint ps[4];				// program object
+		GSUniformBufferOGL* cb;		// uniform buffer object
+	} m_interlace;
+
+	struct {
+		GLuint vs;		// program object
+		GLuint ps[18];	// program object
+		GLuint ln;		// sampler object
+		GLuint pt;		// sampler object
+		GSDepthStencilOGL* dss;
+		GSDepthStencilOGL* dss_write;
+		GSUniformBufferOGL* cb;
+	} m_convert;
+
+	struct {
+		GLuint ps;
+		GSUniformBufferOGL *cb;
+	} m_fxaa;
+
+	struct {
+		GLuint ps;
+		GSUniformBufferOGL* cb;
+	} m_shaderfx;
+
+	struct {
+		GSDepthStencilOGL* dss;
+		GSTexture* t;
+	} m_date;
+
+	struct {
+		GLuint ps;
+		GSUniformBufferOGL *cb;
+	} m_shadeboost;
+
+	GLuint m_vs[1<<3];
+	GLuint m_gs[1<<2];
+	GLuint m_ps_ss[1<<4];
+	GSDepthStencilOGL* m_om_dss[1<<4];
+	hash_map<uint64, GLuint > m_ps;
+	GLuint m_apitrace;
+
+	GLuint m_palette_ss;
+
+	GSUniformBufferOGL* m_vs_cb;
+	GSUniformBufferOGL* m_ps_cb;
+
+	VSConstantBuffer m_vs_cb_cache;
+	PSConstantBuffer m_ps_cb_cache;
+
+	GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format);
+	GSTexture* FetchSurface(int type, int w, int h, bool msaa, int format);
+
+	void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c) final;
+	void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset = 0) final;
+	void DoFXAA(GSTexture* sTex, GSTexture* dTex) final;
+	void DoShadeBoost(GSTexture* sTex, GSTexture* dTex) final;
+	void DoExternalFX(GSTexture* sTex, GSTexture* dTex) final;
+
+	void OMAttachRt(GSTextureOGL* rt = NULL);
+	void OMAttachDs(GSTextureOGL* ds = NULL);
+	void OMSetFBO(GLuint fbo);
+
+	public:
+	GSShaderOGL* m_shader;
+
+	GSDeviceOGL();
+	virtual ~GSDeviceOGL();
+
+	static void CheckDebugLog();
+	// Used by OpenGL, so the same calling convention is required.
+	static void APIENTRY DebugOutputToFile(GLenum gl_source, GLenum gl_type, GLuint id, GLenum gl_severity, GLsizei gl_length, const GLchar *gl_message, const void* userParam);
+
+	bool HasStencil() { return true; }
+	bool HasDepth32() { return true; }
+
+	bool Create(GSWnd* wnd);
+	bool Reset(int w, int h);
+	void Flip();
+	void SetVSync(bool enable);
+
+	void DrawPrimitive() final;
+	void DrawPrimitive(int offset, int count);
+	void DrawIndexedPrimitive() final;
+	void DrawIndexedPrimitive(int offset, int count) final;
+	inline void BeforeDraw();
+	inline void AfterDraw();
+
+	void ClearRenderTarget(GSTexture* t, const GSVector4& c) final;
+	void ClearRenderTarget(GSTexture* t, uint32 c) final;
+	void ClearRenderTarget_i(GSTexture* t, int32 c);
+	void ClearDepth(GSTexture* t, float c) final;
+	void ClearStencil(GSTexture* t, uint8 c) final;
+
+	GSTexture* CreateRenderTarget(int w, int h, bool msaa, int format = 0) final;
+	GSTexture* CreateDepthStencil(int w, int h, bool msaa, int format = 0) final;
+	GSTexture* CreateTexture(int w, int h, int format = 0) final;
+	GSTexture* CreateOffscreen(int w, int h, int format = 0) final;
+	void InitPrimDateTexture(GSTexture* rt);
+	void RecycleDateTexture();
+
+	GSTexture* CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format = 0, int ps_shader = 0) final;
+
+	void CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r) final;
+	void CopyRectConv(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r, bool at_origin);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true) final;
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, GLuint ps, bool linear = true);
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, GLuint ps, int bs, bool linear = true);
+
+	void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm);
+
+	void BeginScene() final {}
+	void EndScene() final;
+
+	void IASetPrimitiveTopology(GLenum topology);
+	void IASetVertexBuffer(const void* vertices, size_t count);
+	void IASetIndexBuffer(const void* index, size_t count);
+
+	void PSSetShaderResource(int i, GSTexture* sr) final;
+	void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1) final;
+	void PSSetSamplerState(GLuint ss);
+
+	void OMSetDepthStencilState(GSDepthStencilOGL* dss);
+	void OMSetBlendState(uint8 blend_index = 0, uint8 blend_factor = 0, bool is_blend_constant = false);
+	void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL) final;
+	void OMSetColorMaskState(OMColorMaskSelector sel = OMColorMaskSelector());
+
+
+	void CreateTextureFX();
+	GLuint CompileVS(VSSelector sel, int logz);
+	GLuint CompileGS(GSSelector sel);
+	GLuint CompilePS(PSSelector sel);
+	GLuint CreateSampler(bool bilinear, bool tau, bool tav, bool aniso = false);
+	GLuint CreateSampler(PSSamplerSelector sel);
+	GSDepthStencilOGL* CreateDepthStencil(OMDepthStencilSelector dssel);
+
+	void SelfShaderTest();
+
+
+	void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
+	void SetupVS(VSSelector sel);
+	void SetupGS(GSSelector sel);
+	void SetupPS(PSSelector sel);
+	void SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb);
+	void SetupSampler(PSSamplerSelector ssel);
+	void SetupOM(OMDepthStencilSelector dssel);
+	GLuint GetSamplerID(PSSamplerSelector ssel);
+	GLuint GetPaletteSamplerID();
+
+	void Barrier(GLbitfield b);
+};
diff --git a/plugins/GSdx_legacy/GSDeviceSW.cpp b/plugins/GSdx_legacy/GSDeviceSW.cpp
new file mode 100644
index 0000000000..5518e5eecd
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceSW.cpp
@@ -0,0 +1,436 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDeviceSW.h"
+
+// Nothing to initialise here; the backbuffer is allocated by Reset().
+GSDeviceSW::GSDeviceSW()
+{
+}
+
+// Runs the base-class creation and, on success, allocates a 1x1 placeholder
+// backbuffer. Returns false only when GSDevice::Create fails.
+bool GSDeviceSW::Create(GSWnd* wnd)
+{
+	const bool created = GSDevice::Create(wnd);
+
+	if(created)
+	{
+		// The result of Reset is intentionally not propagated.
+		Reset(1, 1);
+	}
+
+	return created;
+}
+
+// Resets the base device and installs a fresh w x h software render target
+// as the backbuffer. Returns false when the base-class reset fails.
+bool GSDeviceSW::Reset(int w, int h)
+{
+	const bool base_ok = GSDevice::Reset(w, h);
+
+	if(base_ok)
+	{
+		// TODO: m_backbuffer should be a window wrapper, or some native bitmap,
+		// software-only StretchRect to a full screen window may be too slow
+		m_backbuffer = new GSTextureSW(GSTexture::RenderTarget, w, h);
+	}
+
+	return base_ok;
+}
+
+// Allocates a software texture of the given type and size.
+// Only format 0 exists for this device; any other format yields NULL.
+// The msaa argument is not used.
+GSTexture* GSDeviceSW::CreateSurface(int type, int w, int h, bool msaa, int format)
+{
+	if(format == 0)
+	{
+		return new GSTextureSW(type, w, h);
+	}
+
+	return NULL;
+}
+
+// Intentionally empty for now.
+void GSDeviceSW::BeginScene()
+{
+	// TODO: not implemented
+}
+
+// Intentionally empty for now.
+void GSDeviceSW::DrawPrimitive()
+{
+	// TODO: not implemented
+}
+
+// Intentionally empty for now.
+void GSDeviceSW::EndScene()
+{
+	// TODO: not implemented
+}
+
+// Clears t with a floating-point color: each component is scaled by 255,
+// biased by 0.5 and packed through rgba32() before filling the texture.
+void GSDeviceSW::ClearRenderTarget(GSTexture* t, const GSVector4& c)
+{
+	GSVector4 scaled = c * 255 + 0.5f;
+
+	Clear(t, scaled.rgba32());
+}
+
+// Clears t with an already packed 32-bit color value.
+void GSDeviceSW::ClearRenderTarget(GSTexture* t, uint32 c)
+{
+	Clear(t, c);
+}
+
+// Fills t with the raw 32-bit pattern of the float c (no numeric conversion).
+void GSDeviceSW::ClearDepth(GSTexture* t, float c)
+{
+	// Reinterpret the float's bits as an integer; this type pun relies on the
+	// project being built with -fno-strict-aliasing.
+	uint32 bits = *(uint32*)&c;
+
+	Clear(t, bits);
+}
+
+// Fills t with the stencil value c; the 8-bit value is widened to the
+// 32-bit fill value expected by Clear().
+void GSDeviceSW::ClearStencil(GSTexture* t, uint8 c)
+{
+	Clear(t, c);
+}
+
+// Creates a w x h offscreen texture and copies the (0, 0, w, h) rectangle of
+// src into it. sRect and ps_shader are not used by this implementation.
+// Returns NULL when the offscreen texture cannot be created.
+GSTexture* GSDeviceSW::CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format, int ps_shader)
+{
+	GSTexture* copy = CreateOffscreen(w, h, format);
+
+	if(copy == NULL)
+	{
+		return NULL;
+	}
+
+	CopyRect(src, copy, GSVector4i(0, 0, w, h));
+
+	return copy;
+}
+
+// Copies rectangle r from sTex to the same rectangle of dTex by mapping the
+// source and handing the mapped bits to dTex->Update(). Does nothing when
+// the source cannot be mapped.
+void GSDeviceSW::CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r)
+{
+	GSTexture::GSMap mapped;
+
+	if(!sTex->Map(mapped, &r))
+	{
+		return;
+	}
+
+	dTex->Update(r, mapped.bits, mapped.pitch);
+
+	sTex->Unmap();
+}
+
+// Shared helpers for the software StretchRect pixel functors below.
+// All helpers work on 16-bit lanes and are built on GSVector4i::lerp16.
+class ShaderBase
+{
+protected:
+	// Filters the texels packed in c using the horizontal factor uf and the
+	// vertical factor vf. The callers pass (src0[u0], src0[u1], src1[u0],
+	// src1[u1]), i.e. a 2x2 neighbourhood.
+	GSVector4i Sample(const GSVector4i& c, const GSVector4i& uf, const GSVector4i& vf) const
+	{
+		// Widen the low and the high 8 bytes of c to 16-bit lanes
+		GSVector4i c0 = c.upl8();
+		GSVector4i c1 = c.uph8();
+
+		// First interpolate between the two halves with vf, then between the
+		// result and its upper 8 bytes (brought down by srl<8>) with uf
+		c0 = c0.lerp16<0>(c1, vf);
+		c0 = c0.lerp16<0>(c0.srl<8>(), uf);
+
+		return c0;
+	}
+
+	// Interpolates from c0 towards c1, the factor being c1.wwwwl() shifted
+	// left by 7 bits.
+	GSVector4i Blend(const GSVector4i& c0, const GSVector4i& c1) const
+	{
+		return c0.lerp16<0>(c1, c1.wwwwl().sll16(7));
+	}
+
+	// Same as Blend(c0, c1) but the factor is doubled and clamped first.
+	GSVector4i Blend2x(const GSVector4i& c0, const GSVector4i& c1) const
+	{
+		return c0.lerp16<0>(c1, c1.wwwwl().sll16(1).pu16().uph8().sll16(7)); // .sll16(1).pu16() => 2x, then clamp (...)
+	}
+
+	// Interpolates from c0 towards c1 with the caller-provided factor f.
+	GSVector4i Blend(const GSVector4i& c0, const GSVector4i& c1, const GSVector4i& f) const
+	{
+		return c0.lerp16<0>(c1, f);
+	}
+};
+
+// Pixel functor that overwrites the destination with the source texel.
+class ShaderCopy : public ShaderBase
+{
+public:
+	// Filtered path: sample the packed texels in c with uf/vf and store the
+	// packed result.
+	void operator() (uint32* RESTRICT dst, const GSVector4i& c, const GSVector4i& uf, const GSVector4i& vf) const
+	{
+		GSVector4i texel = Sample(c, uf, vf);
+
+		*dst = texel.pu16().extract32<0>();
+	}
+
+	// Point-sampled path: plain store of the source pixel.
+	void operator() (uint32* RESTRICT dst, uint32 c) const
+	{
+		dst[0] = c;
+	}
+};
+
+// Pixel functor that combines the source texel with the current destination
+// pixel through ShaderBase::Blend (two-argument form).
+class ShaderAlphaBlend : public ShaderBase
+{
+public:
+	// Filtered path
+	void operator() (uint32* RESTRICT dst, const GSVector4i& c, const GSVector4i& uf, const GSVector4i& vf) const
+	{
+		GSVector4i src = Sample(c, uf, vf);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend(src, cur).pu16().extract32<0>();
+	}
+
+	// Point-sampled path
+	void operator() (uint32* RESTRICT dst, uint32 c) const
+	{
+		GSVector4i src = GSVector4i(c);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend(src, cur).pu16().extract32<0>();
+	}
+};
+
+// Pixel functor that combines the source texel with the current destination
+// pixel through ShaderBase::Blend2x (doubled, clamped factor).
+class ShaderAlpha2xBlend : public ShaderBase
+{
+public:
+	// Filtered path
+	void operator() (uint32* RESTRICT dst, const GSVector4i& c, const GSVector4i& uf, const GSVector4i& vf) const
+	{
+		GSVector4i src = Sample(c, uf, vf);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend2x(src, cur).pu16().extract32<0>();
+	}
+
+	// Point-sampled path
+	void operator() (uint32* RESTRICT dst, uint32 c) const
+	{
+		GSVector4i src = GSVector4i(c);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend2x(src, cur).pu16().extract32<0>();
+	}
+};
+
+// Pixel functor that combines the source texel with the current destination
+// pixel using a constant factor fixed at construction time.
+__aligned(class, 16) ShaderFactorBlend : public ShaderBase
+{
+	// Factor replicated over all 16-bit lanes, pre-shifted right by one bit
+	GSVector4i m_f;
+
+public:
+	// f is duplicated into both 16-bit halves of a 32-bit word, broadcast to
+	// every lane and halved.
+	ShaderFactorBlend(uint32 f)
+	{
+		m_f = GSVector4i((f << 16) | f).xxxx().srl16(1);
+	}
+
+	// Filtered path
+	void operator() (uint32* RESTRICT dst, const GSVector4i& c, const GSVector4i& uf, const GSVector4i& vf) const
+	{
+		GSVector4i src = Sample(c, uf, vf);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend(src, cur, m_f).pu16().extract32<0>();
+	}
+
+	// Point-sampled path
+	void operator() (uint32* RESTRICT dst, uint32 c) const
+	{
+		GSVector4i src = GSVector4i(c);
+		GSVector4i cur = GSVector4i(*dst).uph8();
+
+		*dst = Blend(src, cur, m_f).pu16().extract32<0>();
+	}
+};
+
+// Software stretch blit: walks every destination pixel of dRect (clipped to
+// dTex), steps through sTex in 16.16 fixed-point texture coordinates and lets
+// `shader` write the pixel.
+//   sRect  - source rectangle; it is multiplied by the source size below, so
+//            it is presumably expressed in normalized [0,1] coordinates
+//   dRect  - destination rectangle in destination pixels
+//   shader - functor providing operator()(dst, c, uf, vf) for the filtered
+//            path and operator()(dst, c) for the point-sampled path
+//   linear - true selects the 2x2 filtered path
+// Returns silently when the clipped rectangle is empty or a texture cannot be mapped.
+template<class SHADER> static void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, const SHADER& shader, bool linear)
+{
+	GSVector4i r(dRect.ceil());
+
+	// Clip the destination rectangle against the destination texture
+	r = r.rintersect(GSVector4i(dTex->GetSize()).zwxy());
+
+	if(r.rempty()) return;
+
+	GSTexture::GSMap dm;
+
+	if(!dTex->Map(dm, &r)) return;
+
+	GSTexture::GSMap sm;
+
+	// Do not leave the destination mapped when the source cannot be mapped
+	if(!sTex->Map(sm, NULL)) {dTex->Unmap(); return;}
+
+	GSVector2i ssize = sTex->GetSize();
+
+	// p: destination rectangle, t: source rectangle in 16.16 fixed-point texels
+	GSVector4 p = dRect;
+	GSVector4 t = sRect * GSVector4(ssize).xyxy() * GSVector4((float)0x10000);
+
+	GSVector4 tl = p.xyxy(t);
+	GSVector4 br = p.zwzw(t);
+	GSVector4 tlbr = br - tl;
+
+	// After the division z/w hold the texel step per destination pixel
+	tlbr /= tlbr.xyxy();
+
+	// Skip the texels that belong to the clipped-away left/top part
+	if(tl.x < (float)r.left) tl.z += tlbr.z * ((float)r.left - tl.x);
+	if(tl.y < (float)r.top) tl.w += tlbr.w * ((float)r.top - tl.y);
+
+	// (u, v, du, dv) converted to integers
+	GSVector4i uvdudv(tl.zwzw(tlbr));
+
+	// Lanes: (u, u + 1 texel, v, v + 1 texel); du/dv are shifted so that each
+	// step only touches the u lanes or the v lanes respectively
+	// NOTE(review): assumes srl<8>/sll<8> are whole-register byte shifts - confirm in GSVector.h
+	GSVector4i uv = uvdudv.xxyy() + GSVector4i(0, 0x10000).xyxy();
+	GSVector4i du = uvdudv.zzzz().srl<8>();
+	GSVector4i dv = uvdudv.wwww().sll<8>();
+
+	// TODO: clipping may not be that necessary knowing we don't address outside (except the linear filter +1 pixel)
+
+	// Largest valid coordinate, used to clamp the sample positions
+	GSVector4i uvmax = GSVector4i((ssize.x - 1) << 16, (ssize.y - 1) << 16).xxyy();
+
+	GSVector4i v = uv;
+
+	if(linear)
+	{
+		// One iteration per destination row
+		for(int j = r.height(); j > 0; j--, v += dv, dm.bits += dm.pitch)
+		{
+			// vf: vertical blend factor, vi: clamped row coordinates
+			GSVector4i vf = v.zzwwh().zzww().srl16(1);
+			GSVector4i vi = v.max_i16(GSVector4i::zero()).min_i16(uvmax);
+
+			// 16-bit elements 5 and 7 are the upper halves of lanes 2 and 3,
+			// i.e. the integer row indices
+			int v0 = vi.extract16<5>();
+			int v1 = vi.extract16<7>();
+
+			uint32* RESTRICT src0 = (uint32*)&sm.bits[v0 * sm.pitch];
+			uint32* RESTRICT src1 = (uint32*)&sm.bits[v1 * sm.pitch];
+			uint32* RESTRICT dst = (uint32*)dm.bits;
+
+			GSVector4i u = v;
+
+			// One iteration per destination pixel of the row
+			for(int i = r.width(); i > 0; i--, dst++, u += du)
+			{
+				GSVector4i uf = u.xxyyh().xxyy().srl16(1);
+				GSVector4i ui = u.max_i16(GSVector4i::zero()).min_i16(uvmax);
+
+				// 16-bit elements 1 and 3 are the integer column indices
+				int u0 = ui.extract16<1>();
+				int u1 = ui.extract16<3>();
+
+				shader(dst, GSVector4i(src0[u0], src0[u1], src1[u0], src1[u1]), uf, vf);
+			}
+		}
+	}
+	else
+	{
+		// Point sampling: one source texel per destination pixel
+		for(int j = r.height(); j > 0; j--, v += dv, dm.bits += dm.pitch)
+		{
+			GSVector4i vi = v.max_i16(GSVector4i::zero()).min_i16(uvmax);
+
+			uint32* RESTRICT src = (uint32*)&sm.bits[vi.extract16<5>() * sm.pitch];
+			uint32* RESTRICT dst = (uint32*)dm.bits;
+
+			GSVector4i u = v;
+
+			for(int i = r.width(); i > 0; i--, dst++, u += du)
+			{
+				GSVector4i ui = u.max_i16(GSVector4i::zero()).min_i16(uvmax);
+
+				shader(dst, src[ui.extract16<1>()]);
+			}
+		}
+	}
+
+	sTex->Unmap();
+	dTex->Unmap();
+}
+
+// Stretches sRect of sTex onto dRect of dTex.
+//   shader 0 - plain copy (with a CopyRect shortcut for full-size 1:1 copies)
+//   shader 1 - alpha blend
+// Any other shader value triggers an assertion.
+void GSDeviceSW::StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader, bool linear)
+{
+	// TODO: if dTex == m_backbuffer && m_backbuffer is special
+
+	switch(shader)
+	{
+	case 0:
+		{
+			// Whole source onto whole destination, both of the same size:
+			// no scaling is involved, so a straight rectangle copy is enough.
+			if(((sRect == GSVector4(0, 0, 1, 1)) & (dRect == GSVector4(dTex->GetSize()).zwxy())).alltrue() && sTex->GetSize() == dTex->GetSize())
+			{
+				CopyRect(sTex, dTex, GSVector4i(dTex->GetSize()).zwxy());
+
+				return;
+			}
+
+			ShaderCopy copy;
+
+			::StretchRect(sTex, sRect, dTex, dRect, copy, linear);
+		}
+		break;
+
+	case 1:
+		{
+			ShaderAlphaBlend blend;
+
+			::StretchRect(sTex, sRect, dTex, dRect, blend, linear);
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Intentionally empty for now; both arguments are ignored.
+void GSDeviceSW::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
+{
+	// TODO: not implemented
+}
+
+// Intentionally empty for now; both arguments are ignored.
+void GSDeviceSW::PSSetShaderResource(int i, GSTexture* sRect)
+{
+	// TODO: not implemented
+}
+
+// Intentionally empty for now; all arguments are ignored.
+void GSDeviceSW::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor)
+{
+	// TODO: not implemented
+}
+
+//
+
+// Composes the two source layers into dTex:
+//   1. dTex is cleared with color c,
+//   2. layer 1 (if present and slbg is false) is stretched in with the default copy shader,
+//   3. layer 0 (if present) is blended on top, using either the 2x alpha
+//      blend (mmod false) or a constant factor taken from c.a (mmod true).
+void GSDeviceSW::DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c)
+{
+	ClearRenderTarget(dTex, c);
+
+	const bool draw_background = sTex[1] && !slbg;
+
+	if(draw_background)
+	{
+		StretchRect(sTex[1], sRect[1], dTex, dRect[1]);
+	}
+
+	if(!sTex[0])
+	{
+		return;
+	}
+
+	if(mmod)
+	{
+		// alpha = c.a
+
+		ShaderFactorBlend factor_blend((uint32)(int)(c.a * 255));
+
+		::StretchRect(sTex[0], sRect[0], dTex, dRect[0], factor_blend, true);
+	}
+	else
+	{
+		// alpha = min(sTex[0].a * 2, 1)
+
+		ShaderAlpha2xBlend alpha2x_blend;
+
+		::StretchRect(sTex[0], sRect[0], dTex, dRect[0], alpha2x_blend, true);
+	}
+}
+
+// Interlace pass. Only shader 3 is implemented: it stretches the whole
+// source onto the destination, shifted vertically by yoffset. Shaders 0-2
+// are accepted but do nothing yet; any other value triggers an assertion.
+void GSDeviceSW::DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset)
+{
+	GSVector4 size = GSVector4(dTex->GetSize());
+
+	GSVector4 sRect(0, 0, 1, 1);
+	GSVector4 dRect(0.0f, yoffset, size.x, size.y + yoffset);
+
+	switch(shader)
+	{
+	case 0:
+	case 1:
+		// TODO: 0/1 => update even/odd lines of dTex
+		break;
+
+	case 2:
+		// TODO: blend lines (1:2:1 filter)
+		break;
+
+	case 3:
+		StretchRect(sTex, sRect, dTex, dRect, 0, linear);
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+// Fills every 32-bit pixel of t with the value c.
+// Does nothing when the texture cannot be mapped.
+void GSDeviceSW::Clear(GSTexture* t, uint32 c)
+{
+	int w = t->GetWidth();
+	int h = t->GetHeight();
+
+	GSTexture::GSMap m;
+
+	if(t->Map(m, NULL))
+	{
+		GSVector4i v((int)c);
+
+		// Each row is split into whole 4-pixel vectors plus a scalar tail.
+		// The previous version stored vectors in pairs over (w >> 2), which
+		// skipped the last 1-3 pixels when (w % 8) was 1..3 (a 1x1 texture was
+		// never cleared) and wrote one vector past the row when (w >> 2) was odd.
+		const int vectors = w >> 2;
+		const int tail = w & 3;
+
+		for(int j = 0; j < h; j++, m.bits += m.pitch)
+		{
+			GSVector4i* RESTRICT dst = (GSVector4i*)m.bits;
+
+			for(int i = 0; i < vectors; i++)
+			{
+				dst[i] = v;
+			}
+
+			// Remaining pixels that do not fill a whole vector
+			uint32* rest = (uint32*)m.bits + (vectors << 2);
+
+			for(int i = 0; i < tail; i++)
+			{
+				rest[i] = c;
+			}
+		}
+
+		t->Unmap();
+	}
+}
+
diff --git a/plugins/GSdx_legacy/GSDeviceSW.h b/plugins/GSdx_legacy/GSDeviceSW.h
new file mode 100644
index 0000000000..2488ec974e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDeviceSW.h
@@ -0,0 +1,62 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDevice.h"
+#include "GSTextureSW.h"
+
+// Software (CPU-only) implementation of GSDevice. Textures are GSTextureSW
+// objects and copies/stretches are performed by CPU loops in GSDeviceSW.cpp;
+// several pipeline hooks are still empty stubs there.
+class GSDeviceSW : public GSDevice
+{
+	// Creates a GSTextureSW; only format 0 is supported.
+	GSTexture* CreateSurface(int type, int w, int h, bool msaa, int format);
+
+	// Composes the two source layers into dTex.
+	void DoMerge(GSTexture* sTex[2], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, bool slbg, bool mmod, const GSVector4& c);
+	// Interlace pass; only shader 3 is implemented.
+	void DoInterlace(GSTexture* sTex, GSTexture* dTex, int shader, bool linear, float yoffset = 0);
+
+	// Fills the texture with the 32-bit value c.
+	void Clear(GSTexture* t, uint32 c);
+
+public:
+	GSDeviceSW();
+
+	bool Create(GSWnd* wnd);
+	// Replaces the backbuffer with a new w x h render target.
+	bool Reset(int w, int h);
+
+	// drawing may be routed through here, the software renderers use the rasterizer directly now
+
+	void BeginScene();
+	void DrawPrimitive();
+	void EndScene();
+
+	// All Clear* variants end up in Clear() with a 32-bit fill value.
+	void ClearRenderTarget(GSTexture* t, const GSVector4& c);
+	void ClearRenderTarget(GSTexture* t, uint32 c);
+	void ClearDepth(GSTexture* t, float c);
+	void ClearStencil(GSTexture* t, uint8 c);
+
+	// Copies the (0, 0, w, h) area of src into a new offscreen texture.
+	GSTexture* CopyOffscreen(GSTexture* src, const GSVector4& sRect, int w, int h, int format = 0, int ps_shader = 0);
+
+	void CopyRect(GSTexture* sTex, GSTexture* dTex, const GSVector4i& r);
+	// shader: 0 = copy, 1 = alpha blend
+	void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, int shader = 0, bool linear = true);
+
+	// Not implemented for the software device (empty bodies).
+	void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
+	void PSSetShaderResource(int i, GSTexture* sRect);
+	void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
+};
+
diff --git a/plugins/GSdx_legacy/GSDialog.cpp b/plugins/GSdx_legacy/GSDialog.cpp
new file mode 100644
index 0000000000..7c55d67570
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDialog.cpp
@@ -0,0 +1,336 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "StdAfx.h"
+#include <Shlwapi.h>
+#include <CommCtrl.h>
+#include "GSdx.h"
+#include "GSDialog.h"
+#include "GSVector.h"
+
+// Stores the dialog resource id; the window handle is filled in later by
+// DialogProc when WM_INITDIALOG arrives.
+GSDialog::GSDialog(UINT id)
+	: m_id(id)
+	, m_hWnd(NULL)
+{
+}
+
+// Shows the dialog modally, owned by the currently active window, and
+// returns the result of DialogBoxParam. `this` is handed to DialogProc
+// through the init parameter.
+INT_PTR GSDialog::DoModal()
+{
+	HWND owner = GetActiveWindow();
+
+	return DialogBoxParam(theApp.GetModuleHandle(), MAKEINTRESOURCE(m_id), owner, DialogProc, (LPARAM)this);
+}
+
+// Static dialog procedure shared by all GSDialog instances.
+//   WM_INITDIALOG            - binds the GSDialog object (passed in lParam) to the
+//                              window, centers the window on its monitor's work
+//                              area and calls OnInit()
+//   WM_NOTIFY/TTN_GETDISPINFO - supplies the tooltip text for the control
+//   anything else            - forwarded to OnMessage() of the bound object
+INT_PTR CALLBACK GSDialog::DialogProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	if(message == WM_INITDIALOG)
+	{
+		GSDialog* self = (GSDialog*)lParam;
+		SetWindowLongPtr(hWnd, GWLP_USERDATA, (LONG_PTR)self);
+		self->m_hWnd = hWnd;
+
+		MONITORINFO mi;
+		mi.cbSize = sizeof(mi);
+		GetMonitorInfo(MonitorFromWindow(hWnd, MONITOR_DEFAULTTONEAREST), &mi);
+
+		GSVector4i r;
+		GetWindowRect(hWnd, r);
+
+		// Center the window inside the work area of the nearest monitor
+		const int x = (mi.rcWork.left + mi.rcWork.right - r.width()) / 2;
+		const int y = (mi.rcWork.top + mi.rcWork.bottom - r.height()) / 2;
+
+		SetWindowPos(hWnd, NULL, x, y, -1, -1, SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE);
+
+		self->OnInit();
+
+		return true;
+	}
+
+	if (message == WM_NOTIFY && ((LPNMHDR)lParam)->code == TTN_GETDISPINFO)
+	{
+		LPNMTTDISPINFO pInfo = (LPNMTTDISPINFO)lParam;
+		UINT id = GetWindowLongPtr((HWND)pInfo->hdr.idFrom, GWL_ID);
+
+		// lpszText is used only if hinst is NULL. Seems to be NULL already,
+		// but it can't hurt to explicitly set it.
+		pInfo->hinst = NULL;
+		pInfo->lpszText = (LPTSTR)dialog_message(id);
+		SendMessage(pInfo->hdr.hwndFrom, TTM_SETMAXTIPWIDTH, 0, 500);
+		return true;
+	}
+
+	// Messages that arrive before WM_INITDIALOG find no bound object yet
+	GSDialog* dlg = (GSDialog*)GetWindowLongPtr(hWnd, GWLP_USERDATA);
+
+	return dlg != NULL ? dlg->OnMessage(message, wParam, lParam) : FALSE;
+}
+
+// Tooltips only show up when TOOLINFO::cbSize does not exceed the size the
+// loaded ComCtl32.dll expects, while a smaller size may disable features.
+// Pick the V3 size for ComCtl32 6.0 and newer, the V2 size otherwise.
+UINT GSDialog::GetTooltipStructSize()
+{
+	// Should be fine for XP and onwards, comctl versions >= 4.7 should at least
+	// be this size.
+	UINT size = TTTOOLINFOA_V2_SIZE;
+
+	DLLGETVERSIONPROC dllGetVersion = (DLLGETVERSIONPROC)GetProcAddress(GetModuleHandle("ComCtl32.dll"), "DllGetVersion");
+
+	if (dllGetVersion) {
+		DLLVERSIONINFO2 info = { 0 };
+		info.info1.cbSize = sizeof(DLLVERSIONINFO2);
+
+		if (dllGetVersion((DLLVERSIONINFO*)&info) == S_OK) {
+			// MAKELONG takes the low word first: minor, then major version.
+			const DWORD loaded = MAKELONG(info.info1.dwMinorVersion, info.info1.dwMajorVersion);
+			const DWORD required = MAKELONG(0, 6);
+
+			if (loaded >= required)
+				size = TTTOOLINFOA_V3_SIZE;
+		}
+	}
+
+	return size;
+}
+
+// Default message handler: WM_COMMAND is unpacked and routed to OnCommand(),
+// every other message is reported as not handled.
+bool GSDialog::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
+{
+	if(message != WM_COMMAND)
+	{
+		return false;
+	}
+
+	return OnCommand((HWND)lParam, LOWORD(wParam), HIWORD(wParam));
+}
+
+// Default command handler: OK and Cancel close the dialog with the control
+// id as result. Returns true when the command was handled.
+bool GSDialog::OnCommand(HWND hWnd, UINT id, UINT code)
+{
+	const bool closes_dialog = (id == IDOK || id == IDCANCEL);
+
+	if(closes_dialog)
+	{
+		EndDialog(m_hWnd, id);
+	}
+
+	return closes_dialog;
+}
+
+// Returns the text of the dialog control `id`, or an empty string when the
+// control is empty or the text cannot be retrieved.
+//
+// The buffer starts at 256 bytes and doubles (below 64 KiB) for as long as
+// the result might have been truncated. Previously the buffer only grew when
+// GetDlgItemText returned 0, so long text was silently cut at 255 characters
+// while empty text caused a series of useless reallocations.
+string GSDialog::GetText(UINT id)
+{
+	string s;
+
+	for(int size = 256, limit = 65536; size < limit; size <<= 1)
+	{
+		vector<char> buff(size);
+
+		UINT copied = GetDlgItemText(m_hWnd, id, &buff[0], size);
+
+		if(copied == 0)
+		{
+			// Empty text or failure: a bigger buffer will not change that
+			break;
+		}
+
+		s = &buff[0];
+
+		if(copied < (UINT)(size - 1))
+		{
+			// The text fitted with room to spare, so it is complete
+			break;
+		}
+
+		// The buffer was filled completely: the text may have been truncated,
+		// keep what we have and retry with a larger buffer
+	}
+
+	return s;
+}
+
+// Returns the text of control `id` converted with atoi (0 when the text is
+// not a number).
+int GSDialog::GetTextAsInt(UINT id)
+{
+	const string text = GetText(id);
+
+	return atoi(text.c_str());
+}
+
+// Sets the text of the dialog control `id` to str.
+void GSDialog::SetText(UINT id, const char* str)
+{
+	SetDlgItemText(m_hWnd, id, str);
+}
+
+// Sets the text of the dialog control `id` to the decimal representation of i.
+void GSDialog::SetTextAsInt(UINT id, int i)
+{
+	char digits[32] = {0};
+
+	itoa(i, digits, 10);
+
+	SetText(id, digits);
+}
+
+// Rebuilds the combo box `id` from `settings`:
+//   - entries whose value exceeds maxValue are left out,
+//   - the label is "name" or "name (note)" when a note is present,
+//   - the entry whose value equals selectionValue becomes the selection.
+// The drop-down width is adjusted to the longest label afterwards.
+void GSDialog::ComboBoxInit(UINT id, const vector<GSSetting>& settings, int32_t selectionValue, int32_t maxValue)
+{
+	HWND hWnd = GetDlgItem(m_hWnd, id);
+
+	SendMessage(hWnd, CB_RESETCONTENT, 0, 0);
+
+	for(vector<GSSetting>::const_iterator it = settings.begin(); it != settings.end(); ++it)
+	{
+		const GSSetting& s = *it;
+
+		if(!(s.value <= maxValue))
+		{
+			continue;
+		}
+
+		string label(s.name);
+
+		if(!s.note.empty())
+		{
+			label = label + " (" + s.note + ")";
+		}
+
+		const bool selected = (s.value == selectionValue);
+
+		ComboBoxAppend(id, label.c_str(), (LPARAM)s.value, selected);
+	}
+
+	ComboBoxFixDroppedWidth(id);
+}
+
+// Appends `str` to the combo box `id`, attaches `data` to the new entry and
+// optionally selects it. Returns the value reported by CB_ADDSTRING (the
+// index of the new entry).
+int GSDialog::ComboBoxAppend(UINT id, const char* str, LPARAM data, bool select)
+{
+	HWND combo = GetDlgItem(m_hWnd, id);
+
+	const int index = (int)SendMessage(combo, CB_ADDSTRING, 0, (LPARAM)str);
+
+	SendMessage(combo, CB_SETITEMDATA, index, (LPARAM)data);
+
+	if(!select)
+	{
+		return index;
+	}
+
+	SendMessage(combo, CB_SETCURSEL, index, 0);
+
+	return index;
+}
+
+// Fetches the item data of the current selection of combo box `id`.
+// Returns false, leaving `data` untouched, when nothing is selected.
+bool GSDialog::ComboBoxGetSelData(UINT id, INT_PTR& data)
+{
+	HWND combo = GetDlgItem(m_hWnd, id);
+
+	int selected = SendMessage(combo, CB_GETCURSEL, 0, 0);
+
+	if(selected < 0)
+	{
+		return false;
+	}
+
+	data = SendMessage(combo, CB_GETITEMDATA, selected, 0);
+
+	return true;
+}
+
+// Widens the drop-down list of combo box `id` so that its longest entry
+// (measured with the control's own font, plus 10 pixels of margin) fits.
+// The list is never made narrower than it currently is.
+void GSDialog::ComboBoxFixDroppedWidth(UINT id)
+{
+	HWND hWnd = GetDlgItem(m_hWnd, id);
+
+	int count = (int)SendMessage(hWnd, CB_GETCOUNT, 0, 0);
+
+	if(count <= 0)
+	{
+		return;
+	}
+
+	HDC hDC = GetDC(hWnd);
+
+	if(hDC == NULL)
+	{
+		// Nothing can be measured without a device context
+		return;
+	}
+
+	// Remember the previously selected font so the DC can be handed back in
+	// its original state.
+	HGDIOBJ oldFont = SelectObject(hDC, (HFONT)SendMessage(hWnd, WM_GETFONT, 0, 0));
+
+	int width = (int)SendMessage(hWnd, CB_GETDROPPEDWIDTH, 0, 0);
+
+	for(int i = 0; i < count; i++)
+	{
+		int len = (int)SendMessage(hWnd, CB_GETLBTEXTLEN, i, 0);
+
+		if(len > 0)
+		{
+			// Zero-initialised, so the buffer stays terminated even if
+			// CB_GETLBTEXT fails
+			vector<char> buff(len + 1);
+
+			SendMessage(hWnd, CB_GETLBTEXT, i, (LPARAM)&buff[0]);
+
+			SIZE size;
+
+			if(GetTextExtentPoint32(hDC, &buff[0], (int)strlen(&buff[0]), &size))
+			{
+				size.cx += 10;
+
+				if(size.cx > width) width = size.cx;
+			}
+		}
+	}
+
+	if(oldFont != NULL && oldFont != HGDI_ERROR)
+	{
+		SelectObject(hDC, oldFont);
+	}
+
+	ReleaseDC(hWnd, hDC);
+
+	if(width > 0)
+	{
+		SendMessage(hWnd, CB_SETDROPPEDWIDTH, width, 0);
+	}
+}
+
+// Shows the standard "open file" dialog titled `title` and, when the user
+// picks a file, writes its path into the dialog control `id`.
+void GSDialog::OpenFileDialog(UINT id, const char *title)
+{
+	char filename[512];
+	filename[0] = 0;
+
+	OPENFILENAME ofn = { 0 };
+	ofn.lStructSize = sizeof(OPENFILENAME);
+	ofn.hwndOwner = m_hWnd;
+	ofn.Flags = OFN_EXPLORER | OFN_FILEMUSTEXIST;
+	ofn.lpstrFile = filename;
+	ofn.nMaxFile = sizeof(filename);
+	ofn.lpstrTitle = title;
+
+	// GetOpenFileName changes the current directory, so we need to save and
+	// restore the current directory or everything using relative paths will
+	// break.
+	char current_directory[512];
+	DWORD length = GetCurrentDirectory(sizeof(current_directory), current_directory);
+
+	// 0 means failure; a value >= the buffer size means the path did not fit.
+	// In both cases the buffer holds nothing usable and must not be passed
+	// to SetCurrentDirectory.
+	const bool can_restore = length > 0 && length < sizeof(current_directory);
+
+	if (GetOpenFileName(&ofn))
+		SendMessage(GetDlgItem(m_hWnd, id), WM_SETTEXT, 0, (LPARAM)filename);
+
+	if (can_restore)
+		SetCurrentDirectory(current_directory);
+}
+
+// Attaches a tooltip to the dialog control `id`, provided dialog_message()
+// reports that a tooltip text exists for it. The text itself is delivered
+// on demand (TTN_GETDISPINFO, handled in DialogProc).
+void GSDialog::AddTooltip(UINT id)
+{
+	// Computed once, on first use
+	static UINT tooltipStructSize = GetTooltipStructSize();
+
+	bool hasTooltip;
+	dialog_message(id, &hasTooltip);
+
+	if (!hasTooltip)
+		return;
+
+	HWND control = GetDlgItem(m_hWnd, id);
+
+	if (control == NULL)
+		return;
+
+	// TTS_NOPREFIX allows tabs and '&' to be used.
+	const DWORD style = TTS_ALWAYSTIP | TTS_NOPREFIX;
+
+	HWND tip = CreateWindowEx(WS_EX_TOPMOST, TOOLTIPS_CLASS, NULL,
+		style,
+		CW_USEDEFAULT, CW_USEDEFAULT, CW_USEDEFAULT, CW_USEDEFAULT,
+		m_hWnd, NULL, theApp.GetModuleHandle(), NULL);
+
+	if (tip == NULL)
+		return;
+
+	TOOLINFO info = { 0 };
+	info.cbSize = tooltipStructSize;
+	info.hwnd = m_hWnd;
+	info.uFlags = TTF_IDISHWND | TTF_SUBCLASS;
+	info.uId = (UINT_PTR)control;
+	// Can't directly add the tooltip string - it doesn't work for long messages
+	info.lpszText = LPSTR_TEXTCALLBACK;
+
+	SendMessage(tip, TTM_ADDTOOL, 0, (LPARAM)&info);
+	// 32.767s is the max show time.
+	SendMessage(tip, TTM_SETDELAYTIME, TTDT_AUTOPOP, 32767);
+}
+
+// Registers the ICC_TAB_CLASSES group of the common controls library.
+void GSDialog::InitCommonControls()
+{
+	INITCOMMONCONTROLSEX init;
+
+	init.dwSize = sizeof(INITCOMMONCONTROLSEX);
+	init.dwICC = ICC_TAB_CLASSES;
+
+	InitCommonControlsEx(&init);
+}
diff --git a/plugins/GSdx_legacy/GSDialog.h b/plugins/GSdx_legacy/GSDialog.h
new file mode 100644
index 0000000000..905db3e9df
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDialog.h
@@ -0,0 +1,64 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSSetting.h"
+
+class GSDialog // Base class wrapping a Win32 dialog window; derived dialogs customise it through the virtual On* hooks.
+{
+	int m_id; // returned by GetId(); NOTE(review): presumably the id given to the constructor - confirm in GSDialog.cpp
+
+	static INT_PTR CALLBACK DialogProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); // callback with the Win32 dialog-procedure signature
+	static UINT GetTooltipStructSize(); // value AddTooltip() stores in TOOLINFO::cbSize
+
+protected:
+	HWND m_hWnd; // window handle passed to GetDlgItem() and used as owner for the file dialog and tooltips
+
+	virtual void OnInit() {} // hook with an empty default implementation
+	virtual bool OnMessage(UINT message, WPARAM wParam, LPARAM lParam); // overridable message hook
+	virtual bool OnCommand(HWND hWnd, UINT id, UINT code); // overridable command hook
+
+public:
+	GSDialog(UINT id);
+	virtual ~GSDialog() {} // virtual: the class is meant to be derived from
+
+	int GetId() const {return m_id;}
+
+	INT_PTR DoModal();
+
+	string GetText(UINT id); // NOTE(review): id is presumably a control id inside this dialog - confirm in GSDialog.cpp
+	int GetTextAsInt(UINT id);
+
+	void SetText(UINT id, const char* str);
+	void SetTextAsInt(UINT id, int i);
+
+	void ComboBoxInit(UINT id, const vector<GSSetting>& settings, int32_t selectionValue, int32_t maxValue = INT32_MAX); // maxValue defaults to INT32_MAX
+	int ComboBoxAppend(UINT id, const char* str, LPARAM data = 0, bool select = false);
+	bool ComboBoxGetSelData(UINT id, INT_PTR& data); // result is delivered through the data reference
+	void ComboBoxFixDroppedWidth(UINT id);
+
+	void OpenFileDialog(UINT id, const char *title); // shows GetOpenFileName() with this title; on success sends the chosen path to control id via WM_SETTEXT
+
+	void AddTooltip(UINT id); // creates a tooltip for control id when dialog_message() reports one for it
+
+	static void InitCommonControls(); // calls InitCommonControlsEx() with ICC_TAB_CLASSES
+};
diff --git a/plugins/GSdx_legacy/GSDirtyRect.cpp b/plugins/GSdx_legacy/GSDirtyRect.cpp
new file mode 100644
index 0000000000..e9efc0d10b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDirtyRect.cpp
@@ -0,0 +1,84 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDirtyRect.h"
+
+GSDirtyRect::GSDirtyRect()
+	: left(0), top(0), right(0), bottom(0) // all four edges start at zero
+	, psm(PSM_PSMCT32)
+{
+}
+
+GSDirtyRect::GSDirtyRect(const GSVector4i& r, uint32 psm)
+	: left(r.left)
+	, top(r.top)
+	, right(r.right)
+	, bottom(r.bottom)
+	, psm(psm) // later used as the index into GSLocalMemory::m_psm[]
+{
+}
+
+GSVector4i GSDirtyRect::GetDirtyRect(const GIFRegTEX0& TEX0)
+{
+	// The bs entry of the format this rectangle was recorded with.
+	GSVector2i from = GSLocalMemory::m_psm[psm].bs;
+
+	if(psm == TEX0.PSM)
+	{
+		// Same format: only align the stored edges outwards (Align_Outside) to bs.
+		return GSVector4i(left, top, right, bottom).ralign<Align_Outside>(from);
+	}
+
+	// Different format: rescale every edge by the ratio of the two bs entries.
+	GSVector2i to = GSLocalMemory::m_psm[TEX0.PSM].bs;
+	GSVector4i scaled;
+
+	scaled.left = left * to.x / from.x;
+	scaled.top = top * to.y / from.y;
+	scaled.right = right * to.x / from.x;
+	scaled.bottom = bottom * to.y / from.y;
+
+	return scaled;
+}
+
+//
+
+GSVector4i GSDirtyRectList::GetDirtyRectAndClear(const GIFRegTEX0& TEX0, const GSVector2i& size)
+{
+	// Nothing recorded: report the zero rectangle and leave the list alone.
+	if(empty()) return GSVector4i::zero();
+
+	// Start from (INT_MAX, INT_MAX, 0, 0) and fold every entry in with runion().
+	GSVector4i merged(INT_MAX, INT_MAX, 0, 0);
+
+	for(list<GSDirtyRect>::iterator it = begin(); it != end(); ++it)
+	{
+		merged = merged.runion(it->GetDirtyRect(TEX0));
+	}
+
+	clear();
+
+	// Align outwards to the bs entry of TEX0.PSM, then clip to (0, 0, size.x, size.y).
+	GSVector4i bounds(0, 0, size.x, size.y);
+
+	return merged.ralign<Align_Outside>(GSLocalMemory::m_psm[TEX0.PSM].bs).rintersect(bounds);
+}
diff --git a/plugins/GSdx_legacy/GSDirtyRect.h b/plugins/GSdx_legacy/GSDirtyRect.h
new file mode 100644
index 0000000000..e2468b6f4e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDirtyRect.h
@@ -0,0 +1,46 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSLocalMemory.h"
+
+class GSDirtyRect // One rectangle together with the psm value it was recorded with.
+{
+	int left; // rectangle edges; the default constructor sets all four to 0
+	int top;
+	int right;
+	int bottom;
+
+	uint32 psm; // index into GSLocalMemory::m_psm[]; PSM_PSMCT32 by default
+
+public:
+	GSDirtyRect();
+	GSDirtyRect(const GSVector4i& r, uint32 psm); // copies the four edges of r and stores psm
+	GSVector4i GetDirtyRect(const GIFRegTEX0& TEX0); // rectangle for TEX0.PSM: rescaled by the bs ratio, or aligned outwards to bs when the psm matches
+};
+
+class GSDirtyRectList : public list<GSDirtyRect> // list of GSDirtyRect entries plus a merge-and-reset helper
+{
+public:
+	GSDirtyRectList() {}
+	GSVector4i GetDirtyRectAndClear(const GIFRegTEX0& TEX0, const GSVector2i& size); // union of all entries, aligned and clipped to (0, 0, size.x, size.y), then clear(); zero rectangle when the list is empty
+};
diff --git a/plugins/GSdx_legacy/GSDrawScanline.cpp b/plugins/GSdx_legacy/GSDrawScanline.cpp
new file mode 100644
index 0000000000..3cf739d071
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanline.cpp
@@ -0,0 +1,2965 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanline.h"
+#include "GSTextureCacheSW.h"
+
+GSDrawScanline::GSDrawScanline()
+	: m_sp_map("GSSetupPrim", &m_local), m_ds_map("GSDrawScanline", &m_local)
+{
+	// Both maps above are handed the address of m_local. Zero that structure
+	// here, then point its gd member at this object's m_global.
+	memset(&m_local, 0, sizeof m_local);
+	m_local.gd = &m_global;
+}
+
+GSDrawScanline::~GSDrawScanline()
+{ // Intentionally empty: no explicit cleanup is performed here.
+}
+
+void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
+{
+	// Take a private copy of the global block carried by the SharedData packet.
+	const SharedData* shared = (const SharedData*)data;
+
+	memcpy(&m_global, &shared->global, sizeof(m_global));
+
+	const bool fixed_lod = m_global.sel.mmin && m_global.sel.lcm;
+
+	if(fixed_lod)
+	{
+		// Shift t.minmax right by the first 32-bit lane of lod.i, then
+		// unpack the result into the two uv_minmax temporaries.
+		GSVector4i mm = m_global.t.minmax.srl16(m_global.lod.i.extract32<0>());
+
+		mm = mm.upl16(mm);
+
+		m_local.temp.uv_minmax[0] = mm.upl32(mm);
+		m_local.temp.uv_minmax[1] = mm.uph32(mm);
+	}
+
+	// Look up the draw-scanline entry for the complete selector.
+	m_ds = m_ds_map[m_global.sel];
+
+	// aa1 only: a second entry for the same key with zwrite cleared and edge set.
+	m_de = NULL;
+
+	if(m_global.sel.aa1)
+	{
+		GSScanlineSelector edge_sel;
+
+		edge_sel.key = m_global.sel.key;
+		edge_sel.zwrite = 0;
+		edge_sel.edge = 1;
+
+		m_de = m_ds_map[edge_sel];
+	}
+
+	// DrawRect is installed only when the selector reports a solid rectangle.
+	m_dr = NULL;
+	if(m_global.sel.IsSolidRect()) m_dr = (DrawRectPtr)&GSDrawScanline::DrawRect;
+
+	// doesn't need all bits => less functions generated
+	GSScanlineSelector sp_sel;
+
+	sp_sel.key = 0;
+	sp_sel.iip = m_global.sel.iip;
+	sp_sel.tfx = m_global.sel.tfx;
+	sp_sel.tcc = m_global.sel.tcc;
+	sp_sel.fst = m_global.sel.fst;
+	sp_sel.fge = m_global.sel.fge;
+	sp_sel.prim = m_global.sel.prim;
+	sp_sel.fb = m_global.sel.fb;
+	sp_sel.zb = m_global.sel.zb;
+	sp_sel.zoverflow = m_global.sel.zoverflow;
+	sp_sel.notest = m_global.sel.notest;
+
+	m_sp = m_sp_map[sp_sel];
+}
+
+void GSDrawScanline::EndDraw(uint64 frame, uint64 ticks, int actual, int total)
+{ // Passes the four arguments straight through to m_ds_map.UpdateStats(); no other work is done.
+	m_ds_map.UpdateStats(frame, ticks, actual, total);
+}
+
+#ifndef ENABLE_JIT_RASTERIZER
+
+void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan)
+{ // Fills m_local (d[], d4/d8, p, c) from vertex/index and dscan, as selected by m_global.sel.
+	GSScanlineSelector sel = m_global.sel; // local copy; the has_* flags below gate which outputs are computed
+
+	bool has_z = sel.zb != 0;
+	bool has_f = sel.fb && sel.fge;
+	bool has_t = sel.fb && sel.tfx != TFX_NONE;
+	bool has_c = sel.fb && !(sel.tfx == TFX_DECAL && sel.tcc); // colour is skipped only for TFX_DECAL with tcc set
+
+	#if _M_SSE >= 0x501 // 8-lane variant: GSVector8 math, fills m_local.d[0..7] and m_local.d8
+
+	const GSVector8* shift = GSSetupPrimCodeGenerator::m_shift; // shift[0] scales the d8 step, shift[1 + i] scales d[i]
+
+	if(has_z || has_f)
+	{
+		if(sel.prim != GS_SPRITE_CLASS) // non-sprites get per-lane steps; sprites store one value in m_local.p (else branch)
+		{
+			GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
+
+			if(has_f)
+			{
+				m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
+
+				GSVector8 df = GSVector8::broadcast32(&dscan.p.w); // f comes from the w component of p
+
+				for(int i = 0; i < 8; i++)
+				{
+					m_local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh();
+				}
+			}
+
+			if(has_z)
+			{
+				m_local.d8.p.z = dp8.extract32<2>();
+
+				GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
+
+				for(int i = 0; i < 8; i++)
+				{
+					m_local.d[i].z = dz * shift[1 + i];
+				}
+			}
+		}
+		else
+		{
+			if(has_f)
+			{
+				m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>(); // sprite: taken from the vertex at index[1]
+			}
+
+			if(has_z)
+			{
+				m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
+			}
+		}
+	}
+
+	if(has_t)
+	{
+		GSVector4 dt8 = dscan.t * GSVector4::broadcast32(&shift[0]);
+
+		if(sel.fst) // NOTE(review): GSVector4i(x) presumably converts to integers and cast() reinterprets the bits - confirm in GSVector.h
+		{
+			m_local.d8.stq = GSVector4::cast(GSVector4i(dt8));
+		}
+		else
+		{
+			m_local.d8.stq = dt8;
+		}
+
+		GSVector8 dt(dscan.t);
+
+		for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++) // j: 0 = s, 1 = t, 2 = q; q is not computed when sel.fst is set
+		{
+			GSVector8 dstq;
+
+			switch(j)
+			{
+			case 0: dstq = dt.xxxx(); break;
+			case 1: dstq = dt.yyyy(); break;
+			case 2: dstq = dt.zzzz(); break;
+			}
+
+			for(int i = 0; i < 8; i++)
+			{
+				GSVector8 v = dstq * shift[1 + i];
+
+				if(sel.fst)
+				{
+					switch(j)
+					{
+					case 0: m_local.d[i].s = GSVector8::cast(GSVector8i(v)); break;
+					case 1: m_local.d[i].t = GSVector8::cast(GSVector8i(v)); break;
+					}
+				}
+				else
+				{
+					switch(j)
+					{
+					case 0: m_local.d[i].s = v; break;
+					case 1: m_local.d[i].t = v; break;
+					case 2: m_local.d[i].q = v; break;
+					}
+				}
+			}
+		}
+	}
+
+	if(has_c)
+	{
+		if(sel.iip) // iip set: per-lane colour steps from dscan.c; otherwise one colour from a single vertex (else branch)
+		{
+			GSVector4 dc8 = dscan.c * GSVector4::broadcast32(&shift[0]);
+
+			GSVector4i::storel(&m_local.d8.c, GSVector4i(dc8).xzyw().ps32());
+
+			GSVector8 dc(dscan.c);
+
+			GSVector8 dr = dc.xxxx();
+			GSVector8 db = dc.zzzz();
+
+			for(int i = 0; i < 8; i++) // x and z components are paired into d[i].rb
+			{
+				GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
+				GSVector8i b = GSVector8i(db * shift[1 + i]).ps32();
+
+				m_local.d[i].rb = r.upl16(b);
+			}
+
+			GSVector8 dg = dc.yyyy();
+			GSVector8 da = dc.wwww();
+
+			for(int i = 0; i < 8; i++) // y and w components are paired into d[i].ga
+			{
+				GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
+				GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
+
+				m_local.d[i].ga = g.upl16(a);
+			}
+		}
+		else
+		{
+			int last = 0; // which entry of index[] supplies the colour, chosen per primitive class below
+			
+			switch(sel.prim)
+			{
+			case GS_POINT_CLASS: last = 0; break;
+			case GS_LINE_CLASS: last = 1; break;
+			case GS_TRIANGLE_CLASS: last = 2; break;
+			case GS_SPRITE_CLASS: last = 1; break;
+			}
+
+			GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
+
+			c = c.upl16(c.zwxy());
+
+			if(sel.tfx == TFX_NONE) c = c.srl16(7); // extra right shift by 7 only when no texture function is used
+
+			m_local.c.rb = c.xxxx();
+			m_local.c.ga = c.zzzz();
+		}
+	}
+
+	#else // 4-lane variant: GSVector4 math, fills m_local.d[0..3] and m_local.d4
+
+	const GSVector4* shift = GSSetupPrimCodeGenerator::m_shift; // shift[0] scales the d4 step, shift[1 + i] scales d[i]
+
+	if(has_z || has_f)
+	{
+		if(sel.prim != GS_SPRITE_CLASS) // non-sprites get per-lane steps; sprites store one value in m_local.p (else branch)
+		{
+			if(has_f)
+			{
+				GSVector4 df = dscan.p.wwww(); // f comes from the w component of p
+
+				m_local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
+
+				for(int i = 0; i < 4; i++)
+				{
+					m_local.d[i].f = GSVector4i(df * shift[1 + i]).xxzzlh();
+				}
+			}
+
+			if(has_z)
+			{
+				GSVector4 dz = dscan.p.zzzz();
+
+				m_local.d4.z = dz * shift[0];
+
+				for(int i = 0; i < 4; i++)
+				{
+					m_local.d[i].z = dz * shift[1 + i];
+				}
+			}
+		}
+		else
+		{
+			if(has_f)
+			{
+				m_local.p.f = GSVector4i(vertex[index[1]].p).zzzzh().zzzz(); // sprite: taken from the vertex at index[1]
+			}
+
+			if(has_z)
+			{
+				m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
+			}
+		}
+	}
+
+	if(has_t)
+	{
+		GSVector4 t = dscan.t;
+
+		if(sel.fst) // NOTE(review): GSVector4i(x) presumably converts to integers and cast() reinterprets the bits - confirm in GSVector.h
+		{
+			m_local.d4.stq = GSVector4::cast(GSVector4i(t * shift[0]));
+		}
+		else
+		{
+			m_local.d4.stq = t * shift[0];
+		}
+
+		for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++) // j: 0 = s, 1 = t, 2 = q; q is not computed when sel.fst is set
+		{
+			GSVector4 dstq;
+
+			switch(j)
+			{
+			case 0: dstq = t.xxxx(); break;
+			case 1: dstq = t.yyyy(); break;
+			case 2: dstq = t.zzzz(); break;
+			}
+
+			for(int i = 0; i < 4; i++)
+			{
+				GSVector4 v = dstq * shift[1 + i];
+
+				if(sel.fst)
+				{
+					switch(j)
+					{
+					case 0: m_local.d[i].s = GSVector4::cast(GSVector4i(v)); break;
+					case 1: m_local.d[i].t = GSVector4::cast(GSVector4i(v)); break;
+					}
+				}
+				else
+				{
+					switch(j)
+					{
+					case 0: m_local.d[i].s = v; break;
+					case 1: m_local.d[i].t = v; break;
+					case 2: m_local.d[i].q = v; break;
+					}
+				}
+			}
+		}
+	}
+
+	if(has_c)
+	{
+		if(sel.iip) // iip set: per-lane colour steps from dscan.c; otherwise one colour from a single vertex (else branch)
+		{
+			m_local.d4.c = GSVector4i(dscan.c * shift[0]).xzyw().ps32();
+
+			GSVector4 dr = dscan.c.xxxx();
+			GSVector4 db = dscan.c.zzzz();
+
+			for(int i = 0; i < 4; i++) // x and z components are paired into d[i].rb
+			{
+				GSVector4i r = GSVector4i(dr * shift[1 + i]).ps32();
+				GSVector4i b = GSVector4i(db * shift[1 + i]).ps32();
+
+				m_local.d[i].rb = r.upl16(b);
+			}
+
+			GSVector4 dg = dscan.c.yyyy();
+			GSVector4 da = dscan.c.wwww();
+
+			for(int i = 0; i < 4; i++) // y and w components are paired into d[i].ga
+			{
+				GSVector4i g = GSVector4i(dg * shift[1 + i]).ps32();
+				GSVector4i a = GSVector4i(da * shift[1 + i]).ps32();
+
+				m_local.d[i].ga = g.upl16(a);
+			}
+		}
+		else
+		{
+			int last = 0; // which entry of index[] supplies the colour, chosen per primitive class below
+			
+			switch(sel.prim)
+			{
+			case GS_POINT_CLASS: last = 0; break;
+			case GS_LINE_CLASS: last = 1; break;
+			case GS_TRIANGLE_CLASS: last = 2; break;
+			case GS_SPRITE_CLASS: last = 1; break;
+			}
+
+			GSVector4i c = GSVector4i(vertex[index[last]].c);
+
+			c = c.upl16(c.zwxy());
+
+			if(sel.tfx == TFX_NONE) c = c.srl16(7); // extra right shift by 7 only when no texture function is used
+
+			m_local.c.rb = c.xxxx();
+			m_local.c.ga = c.zzzz();
+		}
+	}
+
+	#endif // _M_SSE >= 0x501
+}
+
+void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	GSScanlineSelector sel = m_global.sel;
+
+	#if _M_SSE >= 0x501
+
+	GSVector8i test;
+	GSVector8 zo;
+	GSVector8i f;
+	GSVector8 s, t, q;
+	GSVector8i uf, vf;
+	GSVector8i rbf, gaf;
+	GSVector8i cov;
+
+	// Init
+
+	int skip, steps;
+
+	if(!sel.notest)
+	{
+		skip = left & 7;
+		steps = pixels + skip - 8;
+		left -= skip;
+		test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[skip]) | GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]);
+	}
+	else
+	{
+		skip = 0;
+		steps = pixels - 8;
+	}
+	
+	ASSERT((left & 7) == 0);
+
+	const GSVector2i* fza_base = &m_global.fzbr[top];
+	const GSVector2i* fza_offset = &m_global.fzbc[left >> 2];
+
+	if(sel.prim != GS_SPRITE_CLASS)
+	{
+		if(sel.fwrite && sel.fge)
+		{
+			f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(m_local.d[skip].f);
+		}
+
+		if(sel.zb)
+		{
+			zo = m_local.d[skip].z;
+		}
+	}
+
+	if(sel.fb)
+	{
+		if(sel.edge)
+		{
+			cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9);
+		}
+
+		if(sel.tfx != TFX_NONE)
+		{
+			if(sel.fst)
+			{
+				GSVector4i vt(scan.t);
+
+				GSVector8i u = GSVector8i::broadcast32(vt.xxxx()) + GSVector8i::cast(m_local.d[skip].s);
+				GSVector8i v = GSVector8i::broadcast32(vt.yyyy());
+
+				if(sel.prim != GS_SPRITE_CLASS || sel.mmin)
+				{
+					v += GSVector8i::cast(m_local.d[skip].t);
+				}
+				else if(sel.ltf)
+				{
+					vf = v.xxzzlh().srl16(12);
+				}
+
+				s = GSVector8::cast(u);
+				t = GSVector8::cast(v);
+			}
+			else
+			{
+				s = GSVector8::broadcast32(&scan.t.x) + m_local.d[skip].s;
+				t = GSVector8::broadcast32(&scan.t.y) + m_local.d[skip].t;
+				q = GSVector8::broadcast32(&scan.t.z) + m_local.d[skip].q;
+			}
+		}
+
+		if(!(sel.tfx == TFX_DECAL && sel.tcc))
+		{
+			if(sel.iip)
+			{
+				GSVector4i c(scan.c);
+
+				c = c.upl16(c.zwxy());
+
+				rbf = GSVector8i::broadcast32(&c.x).add16(m_local.d[skip].rb);
+				gaf = GSVector8i::broadcast32(&c.z).add16(m_local.d[skip].ga);
+			}
+			else
+			{
+				rbf = m_local.c.rb;
+				gaf = m_local.c.ga;
+			}
+		}
+	}
+
+	while(1)
+	{
+		do
+		{
+			int fa = 0, za = 0;
+			GSVector8i fd, zs, zd;
+			GSVector8i fm, zm;
+			GSVector8i rb, ga;
+
+			// TestZ
+
+			if(sel.zb)
+			{
+				za = fza_base->y + fza_offset->y;
+
+				if(sel.prim != GS_SPRITE_CLASS)
+				{
+					GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
+
+					if(sel.zoverflow)
+					{
+						zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001());
+					}
+					else
+					{
+						zs = GSVector8i(z);
+					}
+				}
+				else
+				{
+					zs = GSVector8i::broadcast32(&m_local.p.z);
+				}
+
+				if(sel.ztest)
+				{
+					zd = GSVector8i::load(
+						(uint8*)m_global.vm + za * 2, (uint8*)m_global.vm + za * 2 + 16,
+						(uint8*)m_global.vm + za * 2 + 32, (uint8*)m_global.vm + za * 2 + 48);
+
+					switch(sel.zpsm)
+					{
+					case 1: zd = zd.sll32(8).srl32(8); break;
+					case 2: zd = zd.sll32(16).srl32(16); break;
+					default: break;
+					}
+
+					GSVector8i zso = zs;
+					GSVector8i zdo = zd;
+
+					if(sel.zoverflow || sel.zpsm == 0)
+					{
+						zso -= GSVector8i::x80000000();
+						zdo -= GSVector8i::x80000000();
+					}
+
+					switch(sel.ztst)
+					{
+					case ZTST_GEQUAL: test |= zso < zdo; break;
+					case ZTST_GREATER: test |= zso <= zdo; break;
+					}
+
+					if(test.alltrue()) continue;
+				}
+			}
+
+			// SampleTexture
+
+			if(sel.fb && sel.tfx != TFX_NONE)
+			{
+				GSVector8i u, v, uv[2];
+				GSVector8i lodi, lodf;
+				GSVector8i minuv, maxuv;
+				GSVector8i addr00, addr01, addr10, addr11;
+				GSVector8i c00, c01, c10, c11;
+
+				if(sel.mmin)
+				{
+					if(!sel.fst)
+					{
+						GSVector8 qrcp = q.rcp();
+
+						u = GSVector8i(s * qrcp);
+						v = GSVector8i(t * qrcp);
+					}
+					else
+					{
+						u = GSVector8i::cast(s);
+						v = GSVector8i::cast(t);
+					}
+
+					if(!sel.lcm)
+					{
+						GSVector8 tmp = q.log2(3) * m_global.l + m_global.k; // (-log2(Q) * (1 << L) + K) * 0x10000
+					
+						GSVector8i lod = GSVector8i(tmp.sat(GSVector8::zero(), m_global.mxl), false);
+
+						if(sel.mmin == 1) // round-off mode
+						{
+							lod += 0x8000;
+						}
+
+						lodi = lod.srl32(16);
+
+						if(sel.mmin == 2) // trilinear mode
+						{
+							lodf = lod.xxzzlh();
+						}
+
+						// shift u/v by (int)lod
+
+						u = u.srav32(lodi);
+						v = v.srav32(lodi);
+
+						uv[0] = u.srav32(lodi);
+						uv[1] = v.srav32(lodi);
+
+						GSVector8i tmin = GSVector8i::broadcast128(m_global.t.min);
+						GSVector8i tminu = tmin.upl16().srlv32(lodi);
+						GSVector8i tminv = tmin.uph16().srlv32(lodi);
+
+						GSVector8i tmax = GSVector8i::broadcast128(m_global.t.max);
+						GSVector8i tmaxu = tmax.upl16().srlv32(lodi);
+						GSVector8i tmaxv = tmax.uph16().srlv32(lodi);
+
+						minuv = tminu.pu32(tminv);
+						maxuv = tmaxu.pu32(tmaxv);
+					}
+					else
+					{
+						lodi = m_global.lod.i;
+
+						u = u.srav32(lodi);
+						v = v.srav32(lodi);
+
+						uv[0] = u;
+						uv[1] = v;
+
+						minuv = m_local.temp.uv_minmax[0];
+						maxuv = m_local.temp.uv_minmax[1];
+					}
+
+					if(sel.ltf)
+					{
+						u -= 0x8000;
+						v -= 0x8000;
+
+						uf = u.xxzzlh().srl16(12);
+						vf = v.xxzzlh().srl16(12);
+					}
+
+					GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+					GSVector8i uv1 = uv0;
+
+					{
+						GSVector8i repeat = (uv0 & minuv) | maxuv;
+						GSVector8i clamp = uv0.sat_i16(minuv, maxuv);
+					
+						uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+					}
+
+					if(sel.ltf)
+					{
+						uv1 = uv1.add16(GSVector8i::x0001());
+
+						GSVector8i repeat = (uv1 & minuv) | maxuv;
+						GSVector8i clamp = uv1.sat_i16(minuv, maxuv);
+					
+						uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+					}
+
+					GSVector8i y0 = uv0.uph16() << (sel.tw + 3);
+					GSVector8i x0 = uv0.upl16();
+
+					if(sel.ltf)
+					{
+						GSVector8i y1 = uv1.uph16() << (sel.tw + 3);
+						GSVector8i x1 = uv1.upl16();
+
+						addr00 = y0 + x0;
+						addr01 = y0 + x1;
+						addr10 = y1 + x0;
+						addr11 = y1 + x1;
+
+						if(sel.tlu)
+						{
+							for(int i = 0; i < 8; i++)
+							{
+								const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]];
+
+								c00.u32[i] = m_global.clut[tex[addr00.u32[i]]];
+								c01.u32[i] = m_global.clut[tex[addr01.u32[i]]];
+								c10.u32[i] = m_global.clut[tex[addr10.u32[i]]];
+								c11.u32[i] = m_global.clut[tex[addr11.u32[i]]];
+							}
+						}
+						else
+						{
+							for(int i = 0; i < 8; i++)
+							{
+								const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]];
+
+								c00.u32[i] = tex[addr00.u32[i]];
+								c01.u32[i] = tex[addr01.u32[i]];
+								c10.u32[i] = tex[addr10.u32[i]];
+								c11.u32[i] = tex[addr11.u32[i]];
+							}
+						}
+					
+						GSVector8i rb00 = c00.sll16(8).srl16(8);
+						GSVector8i ga00 = c00.srl16(8);
+						GSVector8i rb01 = c01.sll16(8).srl16(8);
+						GSVector8i ga01 = c01.srl16(8);
+
+						rb00 = rb00.lerp16_4(rb01, uf);
+						ga00 = ga00.lerp16_4(ga01, uf);
+
+						GSVector8i rb10 = c10.sll16(8).srl16(8);
+						GSVector8i ga10 = c10.srl16(8);
+						GSVector8i rb11 = c11.sll16(8).srl16(8);
+						GSVector8i ga11 = c11.srl16(8);
+
+						rb10 = rb10.lerp16_4(rb11, uf);
+						ga10 = ga10.lerp16_4(ga11, uf);
+
+						rb = rb00.lerp16_4(rb10, vf);
+						ga = ga00.lerp16_4(ga10, vf);
+					}
+					else
+					{
+						addr00 = y0 + x0;
+
+						if(sel.tlu)
+						{
+							for(int i = 0; i < 8; i++)
+							{
+								c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]];
+							}
+						}
+						else
+						{
+							for(int i = 0; i < 8; i++)
+							{
+								c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]];
+							}
+						}
+
+						rb = c00.sll16(8).srl16(8);
+						ga = c00.srl16(8);
+					}
+
+					if(sel.mmin != 1) // !round-off mode
+					{
+						GSVector8i rb2, ga2;
+
+						lodi += GSVector8i::x00000001();
+
+						u = uv[0].sra32(1);
+						v = uv[1].sra32(1);
+
+						minuv = minuv.srl16(1);
+						maxuv = maxuv.srl16(1);
+
+						if(sel.ltf)
+						{
+							u -= 0x8000;
+							v -= 0x8000;
+
+							uf = u.xxzzlh().srl16(12);
+							vf = v.xxzzlh().srl16(12);
+						}
+
+						GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+						GSVector8i uv1 = uv0;
+
+						{
+							GSVector8i repeat = (uv0 & minuv) | maxuv;
+							GSVector8i clamp = uv0.sat_i16(minuv, maxuv);
+					
+							uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+						}
+
+						if(sel.ltf)
+						{
+							uv1 = uv1.add16(GSVector8i::x0001());
+
+							GSVector8i repeat = (uv1 & minuv) | maxuv;
+							GSVector8i clamp = uv1.sat_i16(minuv, maxuv);
+					
+							uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+						}
+
+						GSVector8i y0 = uv0.uph16() << (sel.tw + 3);
+						GSVector8i x0 = uv0.upl16();
+
+						if(sel.ltf)
+						{
+							GSVector8i y1 = uv1.uph16() << (sel.tw + 3);
+							GSVector8i x1 = uv1.upl16();
+
+							addr00 = y0 + x0;
+							addr01 = y0 + x1;
+							addr10 = y1 + x0;
+							addr11 = y1 + x1;
+
+							if(sel.tlu)
+							{
+								for(int i = 0; i < 8; i++)
+								{
+									const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]];
+
+									c00.u32[i] = m_global.clut[tex[addr00.u32[i]]];
+									c01.u32[i] = m_global.clut[tex[addr01.u32[i]]];
+									c10.u32[i] = m_global.clut[tex[addr10.u32[i]]];
+									c11.u32[i] = m_global.clut[tex[addr11.u32[i]]];
+								}
+							}
+							else
+							{
+								for(int i = 0; i < 8; i++)
+								{
+									const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]];
+
+									c00.u32[i] = tex[addr00.u32[i]];
+									c01.u32[i] = tex[addr01.u32[i]];
+									c10.u32[i] = tex[addr10.u32[i]];
+									c11.u32[i] = tex[addr11.u32[i]];
+								}
+							}
+					
+							GSVector8i rb00 = c00.sll16(8).srl16(8);
+							GSVector8i ga00 = c00.srl16(8);
+							GSVector8i rb01 = c01.sll16(8).srl16(8);
+							GSVector8i ga01 = c01.srl16(8);
+
+							rb00 = rb00.lerp16_4(rb01, uf);
+							ga00 = ga00.lerp16_4(ga01, uf);
+
+							GSVector8i rb10 = c10.sll16(8).srl16(8);
+							GSVector8i ga10 = c10.srl16(8);
+							GSVector8i rb11 = c11.sll16(8).srl16(8);
+							GSVector8i ga11 = c11.srl16(8);
+
+							rb10 = rb10.lerp16_4(rb11, uf);
+							ga10 = ga10.lerp16_4(ga11, uf);
+
+							rb2 = rb00.lerp16_4(rb10, vf);
+							ga2 = ga00.lerp16_4(ga10, vf);
+						}
+						else
+						{
+							addr00 = y0 + x0;
+
+							if(sel.tlu)
+							{
+								for(int i = 0; i < 8; i++)
+								{
+									c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]];
+								}
+							}
+							else
+							{
+								for(int i = 0; i < 8; i++)
+								{
+									c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]];
+								}
+							}
+
+							rb2 = c00.sll16(8).srl16(8);
+							ga2 = c00.srl16(8);
+						}
+
+						if(sel.lcm) lodf = m_global.lod.f;
+
+						lodf = lodf.srl16(1);
+
+						rb = rb.lerp16<0>(rb2, lodf);
+						ga = ga.lerp16<0>(ga2, lodf);
+					}
+				}
+				else
+				{
+					if(!sel.fst)
+					{
+						GSVector8 qrcp = q.rcp();
+
+						u = GSVector8i(s * qrcp);
+						v = GSVector8i(t * qrcp);
+					
+						if(sel.ltf)
+						{
+							u -= 0x8000;
+							v -= 0x8000;
+						}
+					}
+					else
+					{
+						u = GSVector8i::cast(s);
+						v = GSVector8i::cast(t);
+					}
+
+					if(sel.ltf)
+					{
+						uf = u.xxzzlh().srl16(12);
+					
+						if(sel.prim != GS_SPRITE_CLASS)
+						{
+							vf = v.xxzzlh().srl16(12);
+						}
+					}
+
+					GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+					GSVector8i uv1 = uv0;
+
+					GSVector8i tmin = GSVector8i::broadcast128(m_global.t.min);
+					GSVector8i tmax = GSVector8i::broadcast128(m_global.t.max);
+
+					{
+						GSVector8i repeat = (uv0 & tmin) | tmax;
+						GSVector8i clamp = uv0.sat_i16(tmin, tmax);
+					
+						uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+					}
+
+					if(sel.ltf)
+					{
+						uv1 = uv1.add16(GSVector8i::x0001());
+
+						GSVector8i repeat = (uv1 & tmin) | tmax;
+						GSVector8i clamp = uv1.sat_i16(tmin, tmax);
+					
+						uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask));
+					}
+
+					GSVector8i y0 = uv0.uph16() << (sel.tw + 3);
+					GSVector8i x0 = uv0.upl16();
+
+					if(sel.ltf)
+					{
+						GSVector8i y1 = uv1.uph16() << (sel.tw + 3);
+						GSVector8i x1 = uv1.upl16();
+
+						addr00 = y0 + x0;
+						addr01 = y0 + x1;
+						addr10 = y1 + x0;
+						addr11 = y1 + x1;
+
+						if(sel.tlu)
+						{
+							const uint8* tex = (const uint8*)m_global.tex[0];
+
+							c00 = addr00.gather32_32(tex, m_global.clut);
+							c01 = addr01.gather32_32(tex, m_global.clut);
+							c10 = addr10.gather32_32(tex, m_global.clut);
+							c11 = addr11.gather32_32(tex, m_global.clut);
+						}
+						else
+						{
+							const uint32* tex = (const uint32*)m_global.tex[0];
+
+							c00 = addr00.gather32_32(tex);
+							c01 = addr01.gather32_32(tex);
+							c10 = addr10.gather32_32(tex);
+							c11 = addr11.gather32_32(tex);
+						}
+					
+						GSVector8i rb00 = c00.sll16(8).srl16(8);
+						GSVector8i ga00 = c00.srl16(8);
+						GSVector8i rb01 = c01.sll16(8).srl16(8);
+						GSVector8i ga01 = c01.srl16(8);
+
+						rb00 = rb00.lerp16_4(rb01, uf);
+						ga00 = ga00.lerp16_4(ga01, uf);
+
+						GSVector8i rb10 = c10.sll16(8).srl16(8);
+						GSVector8i ga10 = c10.srl16(8);
+						GSVector8i rb11 = c11.sll16(8).srl16(8);
+						GSVector8i ga11 = c11.srl16(8);
+
+						rb10 = rb10.lerp16_4(rb11, uf);
+						ga10 = ga10.lerp16_4(ga11, uf);
+
+						rb = rb00.lerp16_4(rb10, vf);
+						ga = ga00.lerp16_4(ga10, vf);
+					}
+					else
+					{
+						addr00 = y0 + x0;
+
+						if(sel.tlu)
+						{
+							c00 = addr00.gather32_32((const uint8*)m_global.tex[0], m_global.clut);
+						}
+						else
+						{
+							c00 = addr00.gather32_32((const uint32*)m_global.tex[0]);
+						}
+
+						rb = c00.sll16(8).srl16(8);
+						ga = c00.srl16(8);
+					}
+				}
+			}
+
+			// AlphaTFX
+
+			if(sel.fb)
+			{
+				switch(sel.tfx)
+				{
+				case TFX_MODULATE:
+					ga = ga.modulate16<1>(gaf).clamp8();
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_DECAL:
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_HIGHLIGHT:
+					ga = ga.mix16(!sel.tcc ? gaf.srl16(7) : ga.addus8(gaf.srl16(7)));
+					break;
+				case TFX_HIGHLIGHT2:
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_NONE:
+					ga = sel.iip ? gaf.srl16(7) : gaf;
+					break;
+				}
+
+				if(sel.aa1)
+				{
+					GSVector8i x00800080(0x00800080);
+
+					GSVector8i a = sel.edge ? cov : x00800080;
+
+					if(!sel.abe)
+					{
+						ga = ga.mix16(a);
+					}
+					else
+					{
+						ga = ga.blend8(a, ga.eq16(x00800080).srl32(16).sll32(16));
+					}
+				}
+			}
+
+			// ReadMask
+
+			if(sel.fwrite)
+			{
+				fm = m_global.fm;
+			}
+
+			if(sel.zwrite)
+			{
+				zm = m_global.zm;
+			}
+
+			// TestAlpha
+
+			if(!TestAlpha(test, fm, zm, ga)) continue;
+
+			// ColorTFX
+
+			if(sel.fwrite)
+			{
+				GSVector8i af;
+
+				switch(sel.tfx)
+				{
+				case TFX_MODULATE:
+					rb = rb.modulate16<1>(rbf).clamp8();
+					break;
+				case TFX_DECAL:
+					break;
+				case TFX_HIGHLIGHT:
+				case TFX_HIGHLIGHT2:
+					af = gaf.yywwlh().srl16(7);
+					rb = rb.modulate16<1>(rbf).add16(af).clamp8();
+					ga = ga.modulate16<1>(gaf).add16(af).clamp8().mix16(ga);
+					break;
+				case TFX_NONE:
+					rb = sel.iip ? rbf.srl16(7) : rbf;
+					break;
+				}
+			}
+
+			// Fog
+
+			if(sel.fwrite && sel.fge)
+			{
+				GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(&m_local.p.f);
+
+				GSVector8i frb((int)m_global.frb);
+				GSVector8i fga((int)m_global.fga);
+
+				rb = frb.lerp16<0>(rb, fog);
+				ga = fga.lerp16<0>(ga, fog).mix16(ga);
+
+				/*
+				fog = fog.srl16(7);
+
+				GSVector8i ifog = GSVector4i::x00ff().sub16(fog);
+
+				rb = rb.mul16l(fog).add16(frb.mul16l(ifog)).srl16(8);
+				ga = ga.mul16l(fog).add16(fga.mul16l(ifog)).srl16(8).mix16(ga);
+				*/
+			}
+
+			// ReadFrame
+
+			if(sel.fb)
+			{
+				fa = fza_base->x + fza_offset->x;
+
+				if(sel.rfb)
+				{
+					fd = GSVector8i::load(
+						(uint8*)m_global.vm + fa * 2, (uint8*)m_global.vm + fa * 2 + 16,
+						(uint8*)m_global.vm + fa * 2 + 32, (uint8*)m_global.vm + fa * 2 + 48);
+				}
+			}
+
+			// TestDestAlpha
+
+			if(sel.date && (sel.fpsm == 0 || sel.fpsm == 2))
+			{
+				if(sel.datm)
+				{
+					if(sel.fpsm == 2)
+					{
+						// test |= fd.srl32(15) == GSVector8i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector8i::zero();
+					}
+					else
+					{
+						test |= (~fd).sra32(31);
+					}
+				}
+				else
+				{
+					if(sel.fpsm == 2)
+					{
+						test |= fd.sll32(16).sra32(31); // == GSVector8i::xffffffff();
+					}
+					else
+					{
+						test |= fd.sra32(31);
+					}
+				}
+
+				if(test.alltrue()) continue;
+			}
+
+			// WriteMask
+
+			int fzm = 0;
+
+			if(!sel.notest)
+			{
+				if(sel.fwrite)
+				{
+					fm |= test;
+				}
+
+				if(sel.zwrite)
+				{
+					zm |= test;
+				}
+
+				if(sel.fwrite && sel.zwrite)
+				{
+					fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask();
+				}
+				else if(sel.fwrite)
+				{
+					fzm = ~(fm == GSVector8i::xffffffff()).ps32().mask();
+				}
+				else if(sel.zwrite)
+				{
+					fzm = ~(zm == GSVector8i::xffffffff()).ps32().mask();
+				}
+			}
+
+			// WriteZBuf
+
+			if(sel.zwrite)
+			{
+				if(sel.ztest && sel.zpsm < 2)
+				{
+					zs = zs.blend8(zd, zm);
+				}
+
+				bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
+
+				if(sel.notest)
+				{
+					if(fast)
+					{
+						GSVector4i::storel((uint8*)m_global.vm + za * 2, zs.extract<0>());
+						GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs.extract<0>());
+						GSVector4i::storel((uint8*)m_global.vm + za * 2 + 32, zs.extract<1>());
+						GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 48, zs.extract<1>());
+					}
+					else
+					{
+						WritePixel(zs, za, 0, sel.zpsm);
+						WritePixel(zs, za, 1, sel.zpsm);
+						WritePixel(zs, za, 2, sel.zpsm);
+						WritePixel(zs, za, 3, sel.zpsm);
+						WritePixel(zs, za, 4, sel.zpsm);
+						WritePixel(zs, za, 5, sel.zpsm);
+						WritePixel(zs, za, 6, sel.zpsm);
+						WritePixel(zs, za, 7, sel.zpsm);
+					}
+				}
+				else
+				{
+					if(fast)
+					{
+						if(fzm & 0x00000f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs.extract<0>());
+						if(fzm & 0x0000f000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs.extract<0>());
+						if(fzm & 0x0f000000) GSVector4i::storel((uint8*)m_global.vm + za * 2 + 32, zs.extract<1>());
+						if(fzm & 0xf0000000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 48, zs.extract<1>());
+					}
+					else
+					{
+						if(fzm & 0x00000300) WritePixel(zs, za, 0, sel.zpsm);
+						if(fzm & 0x00000c00) WritePixel(zs, za, 1, sel.zpsm);
+						if(fzm & 0x00003000) WritePixel(zs, za, 2, sel.zpsm);
+						if(fzm & 0x0000c000) WritePixel(zs, za, 3, sel.zpsm);
+						if(fzm & 0x03000000) WritePixel(zs, za, 4, sel.zpsm);
+						if(fzm & 0x0c000000) WritePixel(zs, za, 5, sel.zpsm);
+						if(fzm & 0x30000000) WritePixel(zs, za, 6, sel.zpsm);
+						if(fzm & 0xc0000000) WritePixel(zs, za, 7, sel.zpsm);
+					}
+				}
+			}
+
+			// AlphaBlend
+
+			if(sel.fwrite && (sel.abe || sel.aa1))
+			{
+				GSVector8i rbs = rb, gas = ga, rbd, gad, a, mask;
+
+				if(sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1)
+				{
+					switch(sel.fpsm)
+					{
+					case 0:
+					case 1:
+						rbd = fd.sll16(8).srl16(8);
+						gad = fd.srl16(8);
+						break;
+					case 2:
+						rbd = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+						gad = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+						break;
+					}
+				}
+
+				if(sel.aba != sel.abb)
+				{
+					switch(sel.aba)
+					{
+					case 0: break;
+					case 1: rb = rbd; break;
+					case 2: rb = GSVector8i::zero(); break;
+					}
+
+					switch(sel.abb)
+					{
+					case 0: rb = rb.sub16(rbs); break;
+					case 1: rb = rb.sub16(rbd); break;
+					case 2: break;
+					}
+
+					if(!(sel.fpsm == 1 && sel.abc == 1))
+					{
+						switch(sel.abc)
+						{
+						case 0: a = gas.yywwlh().sll16(7); break;
+						case 1: a = gad.yywwlh().sll16(7); break;
+						case 2: a = m_global.afix; break;
+						}
+
+						rb = rb.modulate16<1>(a);
+					}
+
+					switch(sel.abd)
+					{
+					case 0: rb = rb.add16(rbs); break;
+					case 1: rb = rb.add16(rbd); break;
+					case 2: break;
+					}
+				}
+				else
+				{
+					switch(sel.abd)
+					{
+					case 0: break;
+					case 1: rb = rbd; break;
+					case 2: rb = GSVector8i::zero(); break;
+					}
+				}
+
+				if(sel.pabe)
+				{
+					mask = (gas << 8).sra32(31);
+					
+					rb = rbs.blend8(rb, mask);
+				}
+				
+				if(sel.aba != sel.abb)
+				{
+					switch(sel.aba)
+					{
+					case 0: break;
+					case 1: ga = gad; break;
+					case 2: ga = GSVector8i::zero(); break;
+					}
+
+					switch(sel.abb)
+					{
+					case 0: ga = ga.sub16(gas); break;
+					case 1: ga = ga.sub16(gad); break;
+					case 2: break;
+					}
+
+					if(!(sel.fpsm == 1 && sel.abc == 1))
+					{
+						ga = ga.modulate16<1>(a);
+					}
+
+					switch(sel.abd)
+					{
+					case 0: ga = ga.add16(gas); break;
+					case 1: ga = ga.add16(gad); break;
+					case 2: break;
+					}
+				}
+				else
+				{
+					switch(sel.abd)
+					{
+					case 0: break;
+					case 1: ga = gad; break;
+					case 2: ga = GSVector8i::zero(); break;
+					}
+				}
+
+				if(sel.pabe)
+				{
+					ga = gas.blend8(ga, mask >> 16);
+				}
+				else
+				{
+					if(sel.fpsm != 1)
+					{
+						ga = ga.mix16(gas);
+					}
+				}
+			}
+
+			// WriteFrame
+
+			if(sel.fwrite)
+			{
+				if(sel.fpsm == 2 && sel.dthe)
+				{
+					int y = (top & 3) << 1;
+
+					rb = rb.add16(GSVector8i::broadcast128(m_global.dimx[0 + y]));
+					ga = ga.add16(GSVector8i::broadcast128(m_global.dimx[1 + y]));
+				}
+
+				if(sel.colclamp == 0)
+				{
+					rb &= GSVector8i::x00ff();
+					ga &= GSVector8i::x00ff();
+				}
+
+				GSVector8i fs = rb.upl16(ga).pu16(rb.uph16(ga));
+
+				if(sel.fba && sel.fpsm != 1)
+				{
+					fs |= GSVector8i::x80000000();
+				}
+
+				if(sel.fpsm == 2)
+				{
+					GSVector8i rb = fs & 0x00f800f8;
+					GSVector8i ga = fs & 0x8000f800;
+
+					fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+				}
+
+				if(sel.rfb)
+				{
+					fs = fs.blend(fd, fm);
+				}
+
+				bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest;
+
+				if(sel.notest)
+				{
+					if(fast)
+					{
+						GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs.extract<0>());
+						GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs.extract<0>());
+						GSVector4i::storel((uint8*)m_global.vm + fa * 2 + 32, fs.extract<1>());
+						GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 48, fs.extract<1>());
+					}
+					else
+					{
+						WritePixel(fs, fa, 0, sel.fpsm);
+						WritePixel(fs, fa, 1, sel.fpsm);
+						WritePixel(fs, fa, 2, sel.fpsm);
+						WritePixel(fs, fa, 3, sel.fpsm);
+						WritePixel(fs, fa, 4, sel.fpsm);
+						WritePixel(fs, fa, 5, sel.fpsm);
+						WritePixel(fs, fa, 6, sel.fpsm);
+						WritePixel(fs, fa, 7, sel.fpsm);
+					}
+				}
+				else
+				{
+					if(fast)
+					{
+						if(fzm & 0x0000000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs.extract<0>());
+						if(fzm & 0x000000f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs.extract<0>());
+						if(fzm & 0x000f0000) GSVector4i::storel((uint8*)m_global.vm + fa * 2 + 32, fs.extract<1>());
+						if(fzm & 0x00f00000) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 48, fs.extract<1>());
+					}
+					else
+					{
+						if(fzm & 0x00000003) WritePixel(fs, fa, 0, sel.fpsm);
+						if(fzm & 0x0000000c) WritePixel(fs, fa, 1, sel.fpsm);
+						if(fzm & 0x00000030) WritePixel(fs, fa, 2, sel.fpsm);
+						if(fzm & 0x000000c0) WritePixel(fs, fa, 3, sel.fpsm);
+						if(fzm & 0x00030000) WritePixel(fs, fa, 4, sel.fpsm);
+						if(fzm & 0x000c0000) WritePixel(fs, fa, 5, sel.fpsm);
+						if(fzm & 0x00300000) WritePixel(fs, fa, 6, sel.fpsm);
+						if(fzm & 0x00c00000) WritePixel(fs, fa, 7, sel.fpsm);
+					}
+				}
+			}
+		}
+		while(0);
+
+		if(sel.edge) break;
+
+		if(steps <= 0) break;
+
+		// Step
+		
+		steps -= 8;
+
+		fza_offset += 2;
+
+		if(sel.prim != GS_SPRITE_CLASS)
+		{
+			if(sel.zb)
+			{
+				zo += GSVector8::broadcast32(&m_local.d8.p.z);
+			}
+
+			if(sel.fwrite && sel.fge)
+			{
+				f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
+			}
+		}
+
+		if(sel.fb)
+		{
+			if(sel.tfx != TFX_NONE)
+			{
+				if(sel.fst)
+				{
+					GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
+
+					s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
+					
+					if(sel.prim != GS_SPRITE_CLASS || sel.mmin)
+					{
+						t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy());
+					}
+				}
+				else
+				{
+					GSVector8 stq(m_local.d8.stq);
+
+					s += stq.xxxx();
+					t += stq.yyyy();
+					q += stq.zzzz();
+				}
+			}
+		}
+
+		if(!(sel.tfx == TFX_DECAL && sel.tcc))
+		{
+			if(sel.iip)
+			{
+				GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
+
+				rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero());
+				gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero());
+			}
+		}
+
+		if(!sel.notest)
+		{
+			test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]);
+		}
+	}
+
+	#else
+
+	GSVector4i test;
+	GSVector4 zo;
+	GSVector4i f;
+	GSVector4 s, t, q;
+	GSVector4i uf, vf;
+	GSVector4i rbf, gaf;
+	GSVector4i cov;
+
+	// Init
+
+	int skip, steps;
+
+	if(!sel.notest)
+	{
+		skip = left & 3;
+		steps = pixels + skip - 4;
+		left -= skip;
+		test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
+	}
+	else
+	{
+		skip = 0;
+		steps = pixels - 4;
+	}
+
+	ASSERT((left & 3) == 0);
+
+	const GSVector2i* fza_base = &m_global.fzbr[top];
+	const GSVector2i* fza_offset = &m_global.fzbc[left >> 2];
+
+	if(sel.prim != GS_SPRITE_CLASS)
+	{
+		if(sel.fwrite && sel.fge)
+		{
+			f = GSVector4i(scan.p).zzzzh().zzzz().add16(m_local.d[skip].f);
+		}
+
+		if(sel.zb)
+		{
+			zo = m_local.d[skip].z;
+		}
+	}
+
+	if(sel.fb)
+	{
+		if(sel.edge)
+		{
+			cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9);
+		}
+
+		if(sel.tfx != TFX_NONE)
+		{
+			if(sel.fst)
+			{
+				GSVector4i vt(scan.t);
+
+				GSVector4i u = vt.xxxx() + GSVector4i::cast(m_local.d[skip].s);
+				GSVector4i v = vt.yyyy(); 
+				
+				if(sel.prim != GS_SPRITE_CLASS || sel.mmin)
+				{
+					v += GSVector4i::cast(m_local.d[skip].t);
+				}
+				else if(sel.ltf)
+				{
+					vf = v.xxzzlh().srl16(12);
+				}
+
+				s = GSVector4::cast(u);
+				t = GSVector4::cast(v);
+			}
+			else
+			{
+				s = scan.t.xxxx() + m_local.d[skip].s;
+				t = scan.t.yyyy() + m_local.d[skip].t;
+				q = scan.t.zzzz() + m_local.d[skip].q;
+			}
+		}
+
+		if(!(sel.tfx == TFX_DECAL && sel.tcc))
+		{
+			if(sel.iip)
+			{
+				GSVector4i c(scan.c);
+
+				c = c.upl16(c.zwxy());
+
+				rbf = c.xxxx().add16(m_local.d[skip].rb);
+				gaf = c.zzzz().add16(m_local.d[skip].ga);
+			}
+			else
+			{
+				rbf = m_local.c.rb;
+				gaf = m_local.c.ga;
+			}
+		}
+	}
+
+	while(1)
+	{
+		do
+		{
+			int fa = 0, za = 0;
+			GSVector4i fd, zs, zd;
+			GSVector4i fm, zm;
+			GSVector4i rb, ga;
+
+			// TestZ
+
+			if(sel.zb)
+			{
+				za = fza_base->y + fza_offset->y;
+
+				if(sel.prim != GS_SPRITE_CLASS)
+				{
+					GSVector4 z = scan.p.zzzz() + zo;
+
+					if(sel.zoverflow)
+					{
+						zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+					}
+					else
+					{
+						zs = GSVector4i(z);
+					}
+				}
+				else
+				{
+					zs = m_local.p.z;
+				}
+
+				if(sel.ztest)
+				{
+					zd = GSVector4i::load((uint8*)m_global.vm + za * 2, (uint8*)m_global.vm + za * 2 + 16);
+
+					switch(sel.zpsm)
+					{
+					case 1: zd = zd.sll32(8).srl32(8); break;
+					case 2: zd = zd.sll32(16).srl32(16); break;
+					default: break;
+					}
+
+					GSVector4i zso = zs;
+					GSVector4i zdo = zd;
+
+					if(sel.zoverflow || sel.zpsm == 0)
+					{
+						zso -= GSVector4i::x80000000();
+						zdo -= GSVector4i::x80000000();
+					}
+
+					switch(sel.ztst)
+					{
+					case ZTST_GEQUAL: test |= zso < zdo; break;
+					case ZTST_GREATER: test |= zso <= zdo; break;
+					}
+
+					if(test.alltrue()) continue;
+				}
+			}
+
+			// SampleTexture
+
+			if(sel.fb && sel.tfx != TFX_NONE)
+			{
+				GSVector4i u, v, uv[2];
+				GSVector4i lodi, lodf;
+				GSVector4i minuv, maxuv;
+				GSVector4i addr00, addr01, addr10, addr11;
+				GSVector4i c00, c01, c10, c11;
+
+				if(sel.mmin)
+				{
+					if(!sel.fst)
+					{
+						GSVector4 qrcp = q.rcp();
+
+						u = GSVector4i(s * qrcp);
+						v = GSVector4i(t * qrcp);
+					}
+					else
+					{
+						u = GSVector4i::cast(s);
+						v = GSVector4i::cast(t);
+					}
+
+					if(!sel.lcm)
+					{
+						GSVector4 tmp = q.log2(3) * m_global.l + m_global.k; // (-log2(Q) * (1 << L) + K) * 0x10000
+					
+						GSVector4i lod = GSVector4i(tmp.sat(GSVector4::zero(), m_global.mxl), false);
+
+						if(sel.mmin == 1) // round-off mode
+						{
+							lod += 0x8000;
+						}
+
+						lodi = lod.srl32(16);
+
+						if(sel.mmin == 2) // trilinear mode
+						{
+							lodf = lod.xxzzlh();
+						}
+
+						// shift u/v by (int)lod
+
+						GSVector4i aabb = u.upl32(v);
+						GSVector4i ccdd = u.uph32(v);
+					
+						GSVector4i aaxx = aabb.sra32(lodi.x);
+						GSVector4i xxbb = aabb.sra32(lodi.y);
+						GSVector4i ccxx = ccdd.sra32(lodi.z);
+						GSVector4i xxdd = ccdd.sra32(lodi.w);
+
+						GSVector4i acac = aaxx.upl32(ccxx);
+						GSVector4i bdbd = xxbb.uph32(xxdd);
+
+						u = acac.upl32(bdbd);
+						v = acac.uph32(bdbd);
+					
+						uv[0] = u;
+						uv[1] = v;
+
+						GSVector4i minmax = m_global.t.minmax;
+
+						GSVector4i v0 = minmax.srl16(lodi.x);
+						GSVector4i v1 = minmax.srl16(lodi.y);
+						GSVector4i v2 = minmax.srl16(lodi.z);
+						GSVector4i v3 = minmax.srl16(lodi.w);
+
+						v0 = v0.upl16(v1);
+						v2 = v2.upl16(v3);
+
+						minuv = v0.upl32(v2);
+						maxuv = v0.uph32(v2);
+					}
+					else
+					{
+						lodi = m_global.lod.i;
+
+						u = u.sra32(lodi.x);
+						v = v.sra32(lodi.x);
+
+						uv[0] = u;
+						uv[1] = v;
+
+						minuv = m_local.temp.uv_minmax[0];
+						maxuv = m_local.temp.uv_minmax[1];
+					}
+
+					if(sel.ltf)
+					{
+						u -= 0x8000;
+						v -= 0x8000;
+
+						uf = u.xxzzlh().srl16(12);
+						vf = v.xxzzlh().srl16(12);
+					}
+
+					GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+					GSVector4i uv1 = uv0;
+
+					{
+						GSVector4i repeat = (uv0 & minuv) | maxuv;
+						GSVector4i clamp = uv0.sat_i16(minuv, maxuv);
+					
+						uv0 = clamp.blend8(repeat, m_global.t.mask);
+					}
+
+					if(sel.ltf)
+					{
+						uv1 = uv1.add16(GSVector4i::x0001());
+
+						GSVector4i repeat = (uv1 & minuv) | maxuv;
+						GSVector4i clamp = uv1.sat_i16(minuv, maxuv);
+					
+						uv1 = clamp.blend8(repeat, m_global.t.mask);
+					}
+
+					GSVector4i y0 = uv0.uph16() << (sel.tw + 3);
+					GSVector4i x0 = uv0.upl16();
+
+					if(sel.ltf)
+					{
+						GSVector4i y1 = uv1.uph16() << (sel.tw + 3);
+						GSVector4i x1 = uv1.upl16();
+
+						addr00 = y0 + x0;
+						addr01 = y0 + x1;
+						addr10 = y1 + x0;
+						addr11 = y1 + x1;
+
+						if(sel.tlu)
+						{
+							for(int i = 0; i < 4; i++)
+							{
+								const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]];
+
+								c00.u32[i] = m_global.clut[tex[addr00.u32[i]]];
+								c01.u32[i] = m_global.clut[tex[addr01.u32[i]]];
+								c10.u32[i] = m_global.clut[tex[addr10.u32[i]]];
+								c11.u32[i] = m_global.clut[tex[addr11.u32[i]]];
+							}
+						}
+						else
+						{
+							for(int i = 0; i < 4; i++)
+							{
+								const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]];
+
+								c00.u32[i] = tex[addr00.u32[i]];
+								c01.u32[i] = tex[addr01.u32[i]];
+								c10.u32[i] = tex[addr10.u32[i]];
+								c11.u32[i] = tex[addr11.u32[i]];
+							}
+						}
+					
+						GSVector4i rb00 = c00.sll16(8).srl16(8);
+						GSVector4i ga00 = c00.srl16(8);
+						GSVector4i rb01 = c01.sll16(8).srl16(8);
+						GSVector4i ga01 = c01.srl16(8);
+
+						rb00 = rb00.lerp16_4(rb01, uf);
+						ga00 = ga00.lerp16_4(ga01, uf);
+
+						GSVector4i rb10 = c10.sll16(8).srl16(8);
+						GSVector4i ga10 = c10.srl16(8);
+						GSVector4i rb11 = c11.sll16(8).srl16(8);
+						GSVector4i ga11 = c11.srl16(8);
+
+						rb10 = rb10.lerp16_4(rb11, uf);
+						ga10 = ga10.lerp16_4(ga11, uf);
+
+						rb = rb00.lerp16_4(rb10, vf);
+						ga = ga00.lerp16_4(ga10, vf);
+					}
+					else
+					{
+						addr00 = y0 + x0;
+
+						if(sel.tlu)
+						{
+							for(int i = 0; i < 4; i++)
+							{
+								c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]];
+							}
+						}
+						else
+						{
+							for(int i = 0; i < 4; i++)
+							{
+								c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]];
+							}
+						}
+
+						rb = c00.sll16(8).srl16(8);
+						ga = c00.srl16(8);
+					}
+
+					if(sel.mmin != 1) // !round-off mode
+					{
+						GSVector4i rb2, ga2;
+
+						lodi += GSVector4i::x00000001();
+
+						u = uv[0].sra32(1);
+						v = uv[1].sra32(1);
+
+						minuv = minuv.srl16(1);
+						maxuv = maxuv.srl16(1);
+
+						if(sel.ltf)
+						{
+							u -= 0x8000;
+							v -= 0x8000;
+
+							uf = u.xxzzlh().srl16(12);
+							vf = v.xxzzlh().srl16(12);
+						}
+
+						GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+						GSVector4i uv1 = uv0;
+
+						{
+							GSVector4i repeat = (uv0 & minuv) | maxuv;
+							GSVector4i clamp = uv0.sat_i16(minuv, maxuv);
+					
+							uv0 = clamp.blend8(repeat, m_global.t.mask);
+						}
+
+						if(sel.ltf)
+						{
+							uv1 = uv1.add16(GSVector4i::x0001());
+
+							GSVector4i repeat = (uv1 & minuv) | maxuv;
+							GSVector4i clamp = uv1.sat_i16(minuv, maxuv);
+					
+							uv1 = clamp.blend8(repeat, m_global.t.mask);
+						}
+
+						GSVector4i y0 = uv0.uph16() << (sel.tw + 3);
+						GSVector4i x0 = uv0.upl16();
+
+						if(sel.ltf)
+						{
+							GSVector4i y1 = uv1.uph16() << (sel.tw + 3);
+							GSVector4i x1 = uv1.upl16();
+
+							addr00 = y0 + x0;
+							addr01 = y0 + x1;
+							addr10 = y1 + x0;
+							addr11 = y1 + x1;
+
+							if(sel.tlu)
+							{
+								for(int i = 0; i < 4; i++)
+								{
+									const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]];
+
+									c00.u32[i] = m_global.clut[tex[addr00.u32[i]]];
+									c01.u32[i] = m_global.clut[tex[addr01.u32[i]]];
+									c10.u32[i] = m_global.clut[tex[addr10.u32[i]]];
+									c11.u32[i] = m_global.clut[tex[addr11.u32[i]]];
+								}
+							}
+							else
+							{
+								for(int i = 0; i < 4; i++)
+								{
+									const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]];
+
+									c00.u32[i] = tex[addr00.u32[i]];
+									c01.u32[i] = tex[addr01.u32[i]];
+									c10.u32[i] = tex[addr10.u32[i]];
+									c11.u32[i] = tex[addr11.u32[i]];
+								}
+							}
+					
+							GSVector4i rb00 = c00.sll16(8).srl16(8);
+							GSVector4i ga00 = c00.srl16(8);
+							GSVector4i rb01 = c01.sll16(8).srl16(8);
+							GSVector4i ga01 = c01.srl16(8);
+
+							rb00 = rb00.lerp16_4(rb01, uf);
+							ga00 = ga00.lerp16_4(ga01, uf);
+
+							GSVector4i rb10 = c10.sll16(8).srl16(8);
+							GSVector4i ga10 = c10.srl16(8);
+							GSVector4i rb11 = c11.sll16(8).srl16(8);
+							GSVector4i ga11 = c11.srl16(8);
+
+							rb10 = rb10.lerp16_4(rb11, uf);
+							ga10 = ga10.lerp16_4(ga11, uf);
+
+							rb2 = rb00.lerp16_4(rb10, vf);
+							ga2 = ga00.lerp16_4(ga10, vf);
+						}
+						else
+						{
+							addr00 = y0 + x0;
+
+							if(sel.tlu)
+							{
+								for(int i = 0; i < 4; i++)
+								{
+									c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]];
+								}
+							}
+							else
+							{
+								for(int i = 0; i < 4; i++)
+								{
+									c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]];
+								}
+							}
+
+							rb2 = c00.sll16(8).srl16(8);
+							ga2 = c00.srl16(8);
+						}
+
+						if(sel.lcm) lodf = m_global.lod.f;
+
+						lodf = lodf.srl16(1);
+
+						rb = rb.lerp16<0>(rb2, lodf);
+						ga = ga.lerp16<0>(ga2, lodf);
+					}
+				}
+				else
+				{
+					if(!sel.fst)
+					{
+						GSVector4 qrcp = q.rcp();
+
+						u = GSVector4i(s * qrcp);
+						v = GSVector4i(t * qrcp);
+					
+						if(sel.ltf)
+						{
+							u -= 0x8000;
+							v -= 0x8000;
+						}
+					}
+					else
+					{
+						u = GSVector4i::cast(s);
+						v = GSVector4i::cast(t);
+					}
+
+					if(sel.ltf)
+					{
+						uf = u.xxzzlh().srl16(12);
+					
+						if(sel.prim != GS_SPRITE_CLASS)
+						{
+							vf = v.xxzzlh().srl16(12);
+						}
+					}
+
+					GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+					GSVector4i uv1 = uv0;
+
+					{
+						GSVector4i repeat = (uv0 & m_global.t.min) | m_global.t.max;
+						GSVector4i clamp = uv0.sat_i16(m_global.t.min, m_global.t.max);
+					
+						uv0 = clamp.blend8(repeat, m_global.t.mask);
+					}
+
+					if(sel.ltf)
+					{
+						uv1 = uv1.add16(GSVector4i::x0001());
+
+						GSVector4i repeat = (uv1 & m_global.t.min) | m_global.t.max;
+						GSVector4i clamp = uv1.sat_i16(m_global.t.min, m_global.t.max);
+					
+						uv1 = clamp.blend8(repeat, m_global.t.mask);
+					}
+
+					GSVector4i y0 = uv0.uph16() << (sel.tw + 3);
+					GSVector4i x0 = uv0.upl16();
+
+					if(sel.ltf)
+					{
+						GSVector4i y1 = uv1.uph16() << (sel.tw + 3);
+						GSVector4i x1 = uv1.upl16();
+
+						addr00 = y0 + x0;
+						addr01 = y0 + x1;
+						addr10 = y1 + x0;
+						addr11 = y1 + x1;
+
+						if(sel.tlu)
+						{
+							const uint8* tex = (const uint8*)m_global.tex[0];
+
+							c00 = addr00.gather32_32(tex, m_global.clut);
+							c01 = addr01.gather32_32(tex, m_global.clut);
+							c10 = addr10.gather32_32(tex, m_global.clut);
+							c11 = addr11.gather32_32(tex, m_global.clut);
+						}
+						else
+						{
+							const uint32* tex = (const uint32*)m_global.tex[0];
+
+							c00 = addr00.gather32_32(tex);
+							c01 = addr01.gather32_32(tex);
+							c10 = addr10.gather32_32(tex);
+							c11 = addr11.gather32_32(tex);
+						}
+					
+						GSVector4i rb00 = c00.sll16(8).srl16(8);
+						GSVector4i ga00 = c00.srl16(8);
+						GSVector4i rb01 = c01.sll16(8).srl16(8);
+						GSVector4i ga01 = c01.srl16(8);
+
+						rb00 = rb00.lerp16_4(rb01, uf);
+						ga00 = ga00.lerp16_4(ga01, uf);
+
+						GSVector4i rb10 = c10.sll16(8).srl16(8);
+						GSVector4i ga10 = c10.srl16(8);
+						GSVector4i rb11 = c11.sll16(8).srl16(8);
+						GSVector4i ga11 = c11.srl16(8);
+
+						rb10 = rb10.lerp16_4(rb11, uf);
+						ga10 = ga10.lerp16_4(ga11, uf);
+
+						rb = rb00.lerp16_4(rb10, vf);
+						ga = ga00.lerp16_4(ga10, vf);
+					}
+					else
+					{
+						addr00 = y0 + x0;
+
+						if(sel.tlu)
+						{
+							c00 = addr00.gather32_32((const uint8*)m_global.tex[0], m_global.clut);
+						}
+						else
+						{
+							c00 = addr00.gather32_32((const uint32*)m_global.tex[0]);
+						}
+
+						rb = c00.sll16(8).srl16(8);
+						ga = c00.srl16(8);
+					}
+				}
+			}
+
+			// AlphaTFX
+
+			if(sel.fb)
+			{
+				switch(sel.tfx)
+				{
+				case TFX_MODULATE:
+					ga = ga.modulate16<1>(gaf).clamp8();
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_DECAL:
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_HIGHLIGHT:
+					ga = ga.mix16(!sel.tcc ? gaf.srl16(7) : ga.addus8(gaf.srl16(7)));
+					break;
+				case TFX_HIGHLIGHT2:
+					if(!sel.tcc) ga = ga.mix16(gaf.srl16(7));
+					break;
+				case TFX_NONE:
+					ga = sel.iip ? gaf.srl16(7) : gaf;
+					break;
+				}
+
+				if(sel.aa1)
+				{
+					GSVector4i x00800080(0x00800080);
+
+					GSVector4i a = sel.edge ? cov : x00800080;
+
+					if(!sel.abe)
+					{
+						ga = ga.mix16(a);
+					}
+					else
+					{
+						ga = ga.blend8(a, ga.eq16(x00800080).srl32(16).sll32(16));
+					}
+				}
+			}
+
+			// ReadMask
+
+			if(sel.fwrite)
+			{
+				fm = m_global.fm;
+			}
+
+			if(sel.zwrite)
+			{
+				zm = m_global.zm;
+			}
+
+			// TestAlpha
+
+			if(!TestAlpha(test, fm, zm, ga)) continue;
+
+			// ColorTFX
+
+			if(sel.fwrite)
+			{
+				GSVector4i af;
+
+				switch(sel.tfx)
+				{
+				case TFX_MODULATE:
+					rb = rb.modulate16<1>(rbf).clamp8();
+					break;
+				case TFX_DECAL:
+					break;
+				case TFX_HIGHLIGHT:
+				case TFX_HIGHLIGHT2:
+					af = gaf.yywwlh().srl16(7);
+					rb = rb.modulate16<1>(rbf).add16(af).clamp8();
+					ga = ga.modulate16<1>(gaf).add16(af).clamp8().mix16(ga);
+					break;
+				case TFX_NONE:
+					rb = sel.iip ? rbf.srl16(7) : rbf;
+					break;
+				}
+			}
+
+			// Fog
+
+			if(sel.fwrite && sel.fge)
+			{
+				GSVector4i fog = sel.prim != GS_SPRITE_CLASS ? f : m_local.p.f;
+
+				rb = m_global.frb.lerp16<0>(rb, fog);
+				ga = m_global.fga.lerp16<0>(ga, fog).mix16(ga);
+
+				/*
+				fog = fog.srl16(7);
+
+				GSVector4i ifog = GSVector4i::x00ff().sub16(fog);
+
+				rb = rb.mul16l(fog).add16(m_global.frb.mul16l(ifog)).srl16(8);
+				ga = ga.mul16l(fog).add16(m_global.fga.mul16l(ifog)).srl16(8).mix16(ga);
+				*/
+			}
+
+			// ReadFrame
+
+			if(sel.fb)
+			{
+				fa = fza_base->x + fza_offset->x;
+
+				if(sel.rfb)
+				{
+					fd = GSVector4i::load((uint8*)m_global.vm + fa * 2, (uint8*)m_global.vm + fa * 2 + 16);
+				}
+			}
+
+			// TestDestAlpha
+
+			if(sel.date && (sel.fpsm == 0 || sel.fpsm == 2))
+			{
+				if(sel.datm)
+				{
+					if(sel.fpsm == 2)
+					{
+						// test |= fd.srl32(15) == GSVector4i::zero();
+						test |= fd.sll32(16).sra32(31) == GSVector4i::zero();
+					}
+					else
+					{
+						test |= (~fd).sra32(31);
+					}
+				}
+				else
+				{
+					if(sel.fpsm == 2)
+					{
+						test |= fd.sll32(16).sra32(31); // == GSVector4i::xffffffff();
+					}
+					else
+					{
+						test |= fd.sra32(31);
+					}
+				}
+
+				if(test.alltrue()) continue;
+			}
+
+			// WriteMask
+
+			int fzm = 0;
+
+			if(!sel.notest)
+			{
+				if(sel.fwrite)
+				{
+					fm |= test;
+				}
+
+				if(sel.zwrite)
+				{
+					zm |= test;
+				}
+
+				if(sel.fwrite && sel.zwrite)
+				{
+					fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+				}
+				else if(sel.fwrite)
+				{
+					fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask();
+				}
+				else if(sel.zwrite)
+				{
+					fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask();
+				}
+			}
+
+			// WriteZBuf
+
+			if(sel.zwrite)
+			{
+				if(sel.ztest && sel.zpsm < 2)
+				{
+					zs = zs.blend8(zd, zm);
+				}
+
+				bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
+
+				if(sel.notest)
+				{
+					if(fast)
+					{
+						GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
+						GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
+					}
+					else
+					{
+						WritePixel(zs, za, 0, sel.zpsm);
+						WritePixel(zs, za, 1, sel.zpsm);
+						WritePixel(zs, za, 2, sel.zpsm);
+						WritePixel(zs, za, 3, sel.zpsm);
+					}
+				}
+				else
+				{
+					if(fast)
+					{
+						if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
+						if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
+					}
+					else
+					{
+						if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm);
+						if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm);
+						if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm);
+						if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm);
+					}
+				}
+			}
+
+			// AlphaBlend
+
+			if(sel.fwrite && (sel.abe || sel.aa1))
+			{
+				GSVector4i rbs = rb, gas = ga, rbd, gad, a, mask;
+
+				if(sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1)
+				{
+					switch(sel.fpsm)
+					{
+					case 0:
+					case 1:
+						rbd = fd.sll16(8).srl16(8);
+						gad = fd.srl16(8);
+						break;
+					case 2:
+						rbd = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+						gad = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+						break;
+					}
+				}
+
+				if(sel.aba != sel.abb)
+				{
+					switch(sel.aba)
+					{
+					case 0: break;
+					case 1: rb = rbd; break;
+					case 2: rb = GSVector4i::zero(); break;
+					}
+
+					switch(sel.abb)
+					{
+					case 0: rb = rb.sub16(rbs); break;
+					case 1: rb = rb.sub16(rbd); break;
+					case 2: break;
+					}
+
+					if(!(sel.fpsm == 1 && sel.abc == 1))
+					{
+						switch(sel.abc)
+						{
+						case 0: a = gas.yywwlh().sll16(7); break;
+						case 1: a = gad.yywwlh().sll16(7); break;
+						case 2: a = m_global.afix; break;
+						}
+
+						rb = rb.modulate16<1>(a);
+					}
+
+					switch(sel.abd)
+					{
+					case 0: rb = rb.add16(rbs); break;
+					case 1: rb = rb.add16(rbd); break;
+					case 2: break;
+					}
+				}
+				else
+				{
+					switch(sel.abd)
+					{
+					case 0: break;
+					case 1: rb = rbd; break;
+					case 2: rb = GSVector4i::zero(); break;
+					}
+				}
+
+				if(sel.pabe)
+				{
+					mask = (gas << 8).sra32(31);
+					
+					rb = rbs.blend8(rb, mask);
+				}
+				
+				if(sel.aba != sel.abb)
+				{
+					switch(sel.aba)
+					{
+					case 0: break;
+					case 1: ga = gad; break;
+					case 2: ga = GSVector4i::zero(); break;
+					}
+
+					switch(sel.abb)
+					{
+					case 0: ga = ga.sub16(gas); break;
+					case 1: ga = ga.sub16(gad); break;
+					case 2: break;
+					}
+
+					if(!(sel.fpsm == 1 && sel.abc == 1))
+					{
+						ga = ga.modulate16<1>(a);
+					}
+
+					switch(sel.abd)
+					{
+					case 0: ga = ga.add16(gas); break;
+					case 1: ga = ga.add16(gad); break;
+					case 2: break;
+					}
+				}
+				else
+				{
+					switch(sel.abd)
+					{
+					case 0: break;
+					case 1: ga = gad; break;
+					case 2: ga = GSVector4i::zero(); break;
+					}
+				}
+
+				if(sel.pabe)
+				{
+					ga = gas.blend8(ga, mask >> 16);
+				}
+				else
+				{
+					if(sel.fpsm != 1)
+					{
+						ga = ga.mix16(gas);
+					}
+				}
+			}
+
+			// WriteFrame
+
+			if(sel.fwrite)
+			{
+				if(sel.fpsm == 2 && sel.dthe)
+				{
+					int y = (top & 3) << 1;
+
+					rb = rb.add16(m_global.dimx[0 + y]);
+					ga = ga.add16(m_global.dimx[1 + y]);
+				}
+
+				if(sel.colclamp == 0)
+				{
+					rb &= GSVector4i::x00ff();
+					ga &= GSVector4i::x00ff();
+				}
+
+				GSVector4i fs = rb.upl16(ga).pu16(rb.uph16(ga));
+
+				if(sel.fba && sel.fpsm != 1)
+				{
+					fs |= GSVector4i::x80000000();
+				}
+
+				if(sel.fpsm == 2)
+				{
+					GSVector4i rb = fs & 0x00f800f8;
+					GSVector4i ga = fs & 0x8000f800;
+
+					fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+				}
+
+				if(sel.rfb)
+				{
+					fs = fs.blend(fd, fm);
+				}
+
+				bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest;
+
+				if(sel.notest)
+				{
+					if(fast)
+					{
+						GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
+						GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
+					}
+					else
+					{
+						WritePixel(fs, fa, 0, sel.fpsm);
+						WritePixel(fs, fa, 1, sel.fpsm);
+						WritePixel(fs, fa, 2, sel.fpsm);
+						WritePixel(fs, fa, 3, sel.fpsm);
+					}
+				}
+				else
+				{
+					if(fast)
+					{
+						if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
+						if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
+					}
+					else
+					{
+						if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm);
+						if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm);
+						if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm);
+						if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm);
+					}
+				}
+			}
+		}
+		while(0);
+
+		if(sel.edge) break;
+
+		if(steps <= 0) break;
+
+		// Step
+		
+		steps -= 4;
+
+		fza_offset++;
+
+		if(sel.prim != GS_SPRITE_CLASS)
+		{
+			if(sel.zb)
+			{
+				zo += m_local.d4.z;
+			}
+
+			if(sel.fwrite && sel.fge)
+			{
+				f = f.add16(m_local.d4.f);
+			}
+		}
+
+		if(sel.fb)
+		{
+			if(sel.tfx != TFX_NONE)
+			{
+				if(sel.fst)
+				{
+					GSVector4i stq = GSVector4i::cast(m_local.d4.stq);
+
+					s = GSVector4::cast(GSVector4i::cast(s) + stq.xxxx());
+					
+					if(sel.prim != GS_SPRITE_CLASS || sel.mmin)
+					{
+						t = GSVector4::cast(GSVector4i::cast(t) + stq.yyyy());
+					}
+				}
+				else
+				{
+					GSVector4 stq = m_local.d4.stq;
+
+					s += stq.xxxx();
+					t += stq.yyyy();
+					q += stq.zzzz();
+				}
+			}
+		}
+
+		if(!(sel.tfx == TFX_DECAL && sel.tcc))
+		{
+			if(sel.iip)
+			{
+				GSVector4i c = m_local.d4.c;
+
+				rbf = rbf.add16(c.xxxx()).max_i16(GSVector4i::zero());
+				gaf = gaf.add16(c.yyyy()).max_i16(GSVector4i::zero());
+			}
+		}
+
+		if(!sel.notest)
+		{
+			test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
+		}
+	}
+
+	#endif
+}
+
+void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	// Draw the span with edge = 1 and zwrite = 0 forced in the selector, then put both bits back.
+	const uint32 saved_zwrite = m_global.sel.zwrite;
+	const uint32 saved_edge = m_global.sel.edge;
+	m_global.sel.zwrite = 0;
+	m_global.sel.edge = 1;
+
+	DrawScanline(pixels, left, top, scan);
+
+	m_global.sel.zwrite = saved_zwrite;
+	m_global.sel.edge = saved_edge;
+}
+
+template<class T>
+bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga)
+{
+	GSScanlineSelector sel = m_global.sel;
+	// Alpha test: folds the failing lanes into test/fm/zm according to sel.afail; returns false only when AFAIL_KEEP rejects every lane.
+	switch(sel.afail) // early outs: the only thing a failure would mask is not being written anyway
+	{
+	case AFAIL_FB_ONLY:
+		if(!sel.zwrite) return true;
+		break;
+
+	case AFAIL_ZB_ONLY:
+		if(!sel.fwrite) return true;
+		break;
+
+	case AFAIL_RGB_ONLY:
+		if(!sel.zwrite && sel.fpsm == 1) return true;
+		break;
+	}
+	// t receives all ones in every lane that FAILS the comparison (hence the inverted operators below)
+	T t;
+
+	switch(sel.atst)
+	{
+	case ATST_NEVER:
+		t = GSVector4i::xffffffff();
+		break;
+
+	case ATST_ALWAYS:
+		return true;
+
+	case ATST_LESS:
+	case ATST_LEQUAL: // NOTE(review): shares the LESS compare; presumably aref is pre-adjusted by the caller - confirm
+		t = (ga >> 16) > T(m_global.aref); // ga >> 16: upper 16 bits of each 32-bit lane, presumably alpha
+		break;
+
+	case ATST_EQUAL:
+		t = (ga >> 16) != T(m_global.aref);
+		break;
+
+	case ATST_GEQUAL:
+	case ATST_GREATER: // NOTE(review): shares the GEQUAL compare; same assumption about aref as above
+		t = (ga >> 16) < T(m_global.aref);
+		break;
+
+	case ATST_NOTEQUAL:
+		t = (ga >> 16) == T(m_global.aref);
+		break;
+
+	default:
+		__assume(0);
+	}
+	// route the failure mask to whatever the afail mode protects
+	switch(sel.afail)
+	{
+	case AFAIL_KEEP:
+		test |= t; // failing lanes are excluded from the draw altogether
+		if(test.alltrue()) return false;
+		break;
+
+	case AFAIL_FB_ONLY:
+		zm |= t; // failing lanes: z mask set, frame buffer still written
+		break;
+
+	case AFAIL_ZB_ONLY:
+		fm |= t; // failing lanes: frame mask set, z buffer still written
+		break;
+
+	case AFAIL_RGB_ONLY:
+		zm |= t;
+		fm |= t & T::xff000000(); // fpsm 16 bit => & 0xffff8000?
+		break;
+
+	default:
+		__assume(0);
+	}
+
+	return true;
+}
+
+static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26}; // columnTable16[0]
+// Stores lane i of src at vm + addr * 2 + s_offsets[i] * 2 using the layout selected by psm (0, 1 or 2; anything else is a no-op).
+template<class T> void GSDrawScanline::WritePixel(const T& src, int addr, int i, uint32 psm)
+{
+	uint8* pixel = (uint8*)m_global.vm + addr * 2 + s_offsets[i] * 2;
+	if(psm == 0) // whole 32-bit value
+	{
+		*(uint32*)pixel = src.u32[i];
+	}
+	else if(psm == 1) // low 24 bits only, the top byte already in memory is kept
+	{
+		*(uint32*)pixel = (src.u32[i] & 0xffffff) | (*(uint32*)pixel & 0xff000000);
+	}
+	else if(psm == 2) // low 16 bits of the lane
+	{
+		*(uint16*)pixel = src.u16[i * 2];
+	}
+	// other psm values: nothing is written
+}
+
+#endif
+
+void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
+{
+	ASSERT(r.y >= 0);
+	ASSERT(r.w >= 0);
+	// Fills rectangle r with a constant z (v.t.u32[3]) and a constant color (from v.c); each pass is skipped when its mask is all ones.
+	// FIXME: sometimes the frame and z buffer may overlap, the outcome is undefined
+
+	uint32 m;
+
+	#if _M_SSE >= 0x501
+	m = m_global.zm;
+	#else
+	m = m_global.zm.u32[0];
+	#endif
+	// z buffer pass
+	if(m != 0xffffffff)
+	{
+		const int* zbr = m_global.zbr;
+		const int* zbc = m_global.zbc;
+
+		uint32 z = v.t.u32[3]; // (uint32)v.p.z;
+
+		if(m_global.sel.zpsm != 2)
+		{
+			if(m == 0)
+			{
+				DrawRectT<uint32, false>(zbr, zbc, r, z, m);
+			}
+			else
+			{
+				DrawRectT<uint32, true>(zbr, zbc, r, z, m);
+			}
+		}
+		else
+		{
+			if((m & 0xffff) == 0)
+			{
+				DrawRectT<uint16, false>(zbr, zbc, r, z, m);
+			}
+			else
+			{
+				DrawRectT<uint16, true>(zbr, zbc, r, z, m);
+			}
+		}
+	}
+	// frame buffer pass
+	#if _M_SSE >= 0x501
+	m = m_global.fm;
+	#else
+	m = m_global.fm.u32[0];
+	#endif
+
+	if(m != 0xffffffff)
+	{
+		const int* fbr = m_global.fbr;
+		const int* fbc = m_global.fbc;
+
+		uint32 c = (GSVector4i(v.c) >> 7).rgba32();
+		// fba forces bit 31 of the written color
+		if(m_global.sel.fba)
+		{
+			c |= 0x80000000;
+		}
+
+		if(m_global.sel.fpsm != 2)
+		{
+			if(m == 0)
+			{
+				DrawRectT<uint32, false>(fbr, fbc, r, c, m);
+			}
+			else
+			{
+				DrawRectT<uint32, true>(fbr, fbc, r, c, m);
+			}
+		}
+		else
+		{
+			c = ((c & 0xf8) >> 3) | ((c & 0xf800) >> 6) | ((c & 0xf80000) >> 9) | ((c & 0x80000000) >> 16);
+			// c is now a 5:5:5:1 value: the top 5 bits of each of the three low bytes, plus bit 31 moved to bit 15
+			if((m & 0xffff) == 0)
+			{
+				DrawRectT<uint16, false>(fbr, fbc, r, c, m);
+			}
+			else
+			{
+				DrawRectT<uint16, true>(fbr, fbc, r, c, m);
+			}
+		}
+	}
+}
+
+template<class T, bool masked>
+void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m)
+{
+	if(m == 0xffffffff) return;
+	// Fills r with value c while keeping the destination bits selected by m: aligned interior via FillBlock, the border via FillRect.
+	#if _M_SSE >= 0x501
+
+	GSVector8i color((int)c);
+	GSVector8i mask((int)m);
+
+	#else
+
+	GSVector4i color((int)c);
+	GSVector4i mask((int)m);
+
+	#endif
+	// 16-bit targets: the scalars get their low 16 bits replicated into both halves; xxzzlh() presumably does the same per vector lane
+	if(sizeof(T) == sizeof(uint16))
+	{
+		color = color.xxzzlh();
+		mask = mask.xxzzlh();
+		c = (c & 0xffff) | (c << 16);
+		m = (m & 0xffff) | (m << 16);
+	}
+	// drop the protected bits from the fill value so it can simply be OR-ed with the preserved destination bits
+	color = color.andnot(mask);
+	c = c & (~m);
+
+	if(masked) ASSERT(mask.u32[0] != 0);
+	// br: r aligned inward to whole blocks of 8 rows by 8 (32-bit T) or 16 (16-bit T) columns
+	GSVector4i br = r.ralign<Align_Inside>(GSVector2i(8 * 4 / sizeof(T), 8));
+
+	if(!br.rempty())
+	{
+		FillRect<T, masked>(row, col, GSVector4i(r.x, r.y, r.z, br.y), c, m); // rows above br, full width
+		FillRect<T, masked>(row, col, GSVector4i(r.x, br.w, r.z, r.w), c, m); // rows below br, full width
+
+		if(r.x < br.x || br.z < r.z)
+		{
+			FillRect<T, masked>(row, col, GSVector4i(r.x, br.y, br.x, br.w), c, m); // strip left of br
+			FillRect<T, masked>(row, col, GSVector4i(br.z, br.y, r.z, br.w), c, m); // strip right of br
+		}
+
+		FillBlock<T, masked>(row, col, br, color, mask);
+	}
+	else
+	{
+		FillRect<T, masked>(row, col, r, c, m);
+	}
+}
+
+template<class T, bool masked>
+void GSDrawScanline::FillRect(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m)
+{
+	if(r.x >= r.z) return; // nothing to do for a horizontally empty rectangle
+
+	T* base = (T*)m_global.vm;
+	// scalar fill of [r.x, r.z) x [r.y, r.w); row[] and col[] supply the element offsets
+	for(int py = r.y; py < r.w; py++)
+	{
+		T* RESTRICT line = &base[row[py]];
+		// masked: OR c into the destination bits selected by m; unmasked: overwrite with c
+		for(int px = r.x; px < r.z; px++)
+		{
+			line[col[px]] = masked ? (T)(c | (line[col[px]] & m)) : (T)c;
+		}
+	}
+}
+
+#if _M_SSE >= 0x501
+
+template<class T, bool masked>
+void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector8i& c, const GSVector8i& m)
+{
+	if(r.x >= r.z) return;
+	// Vector fill; DrawRectT passes a block-aligned r. Every (x, y) step below stores eight consecutive GSVector8i.
+	T* vm = (T*)m_global.vm;
+
+	for(int y = r.y; y < r.w; y += 8)
+	{
+		T* RESTRICT d = &vm[row[y]];
+
+		for(int x = r.x; x < r.z; x += 8 * 4 / sizeof(T))
+		{
+			GSVector8i* RESTRICT p = (GSVector8i*)&d[col[x]];
+			// masked: keep the destination bits selected by m and OR in c; unmasked: plain store of c
+			p[0] = !masked ? c : (c | (p[0] & m));
+			p[1] = !masked ? c : (c | (p[1] & m));
+			p[2] = !masked ? c : (c | (p[2] & m));
+			p[3] = !masked ? c : (c | (p[3] & m));
+			p[4] = !masked ? c : (c | (p[4] & m));
+			p[5] = !masked ? c : (c | (p[5] & m));
+			p[6] = !masked ? c : (c | (p[6] & m));
+			p[7] = !masked ? c : (c | (p[7] & m));
+		}
+	}
+}
+
+#else
+
+template<class T, bool masked>
+void GSDrawScanline::FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m)
+{
+	if(r.x >= r.z) return;
+	// Vector fill; DrawRectT passes a block-aligned r. Every (x, y) step below stores sixteen consecutive GSVector4i.
+	T* vm = (T*)m_global.vm;
+
+	for(int y = r.y; y < r.w; y += 8)
+	{
+		T* RESTRICT d = &vm[row[y]];
+
+		for(int x = r.x; x < r.z; x += 8 * 4 / sizeof(T))
+		{
+			GSVector4i* RESTRICT p = (GSVector4i*)&d[col[x]];
+			// masked: keep the destination bits selected by m and OR in c; unmasked: plain store of c
+			for(int i = 0; i < 16; i += 4)
+			{
+				p[i + 0] = !masked ? c : (c | (p[i + 0] & m));
+				p[i + 1] = !masked ? c : (c | (p[i + 1] & m));
+				p[i + 2] = !masked ? c : (c | (p[i + 2] & m));
+				p[i + 3] = !masked ? c : (c | (p[i + 3] & m));
+			}
+		}
+	}
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSDrawScanline.h b/plugins/GSdx_legacy/GSDrawScanline.h
new file mode 100644
index 0000000000..f1acc6a0b9
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanline.h
@@ -0,0 +1,90 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSState.h"
+#include "GSRasterizer.h"
+#include "GSScanlineEnvironment.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSDrawScanlineCodeGenerator.h"
+// IDrawScanline implementation: holds the global/local scanline data plus the maps of generated setup-prim and draw-scanline routines.
+class GSDrawScanline : public IDrawScanline
+{
+public:
+	class SharedData : public GSRasterizerData
+	{
+	public:
+		GSScanlineGlobalData global;
+	};
+
+protected:
+	GSScanlineGlobalData m_global;
+	GSScanlineLocalData m_local;
+	// function maps with uint64 keys; presumably they hand back the generated routine for a given selector - confirm in GSFunctionMap.h
+	GSCodeGeneratorFunctionMap<GSSetupPrimCodeGenerator, uint64, SetupPrimPtr> m_sp_map;
+	GSCodeGeneratorFunctionMap<GSDrawScanlineCodeGenerator, uint64, DrawScanlinePtr> m_ds_map;
+	// rectangle fill helpers used by DrawRect: DrawRectT splits the rectangle, FillRect is scalar, FillBlock is vectorized
+	template<class T, bool masked>
+	void DrawRectT(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m);
+
+	template<class T, bool masked>
+	__forceinline void FillRect(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m);
+	// FillBlock takes GSVector8i color/mask when _M_SSE >= 0x501 and GSVector4i otherwise
+	#if _M_SSE >= 0x501
+
+	template<class T, bool masked>
+	__forceinline void FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector8i& c, const GSVector8i& m);
+
+	#else
+
+	template<class T, bool masked>
+	__forceinline void FillBlock(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m);
+
+	#endif
+
+public:
+	GSDrawScanline();
+	virtual ~GSDrawScanline();
+
+	// IDrawScanline
+
+	void BeginDraw(const GSRasterizerData* data);
+	void EndDraw(uint64 frame, uint64 ticks, int actual, int total);
+
+	void DrawRect(const GSVector4i& r, const GSVertexSW& v);
+
+#ifndef ENABLE_JIT_RASTERIZER
+	// C++ implementations, declared only when ENABLE_JIT_RASTERIZER is not defined
+	void SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan);
+	void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan);
+	void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan);
+
+	bool IsEdge() const {return m_global.sel.aa1;}
+	bool IsRect() const {return m_global.sel.IsSolidRect();}
+
+	template<class T> bool TestAlpha(T& test, T& fm, T& zm, const T& ga);
+	template<class T> void WritePixel(const T& src, int addr, int i, uint32 psm);
+
+#endif
+
+	void PrintStats() {m_ds_map.PrintStats();}
+};
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.cpp
new file mode 100644
index 0000000000..3909a6e787
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.cpp
@@ -0,0 +1,357 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+
+#if _M_SSE >= 0x501
+// m_test, 8-byte rows: rows 0-7 have their first k bytes set to 0xff; rows 8-15 have their first k-7 bytes clear and the rest set (row 15 is all zero).
+__aligned(const uint8, 8) GSDrawScanlineCodeGenerator::m_test[16][8] =
+{
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+	{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+	{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+	{0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+	{0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+	{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+	{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+	{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+	{0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+	{0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
+	{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+};
+// four scalar constants, each passed to the GSVector8 constructor; presumably polynomial coefficients of a log2 approximation (users are not in this file) - confirm
+const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] = 
+{
+	GSVector8(0.204446009836232697516f),
+	GSVector8(-1.04913055217340124191f),
+	GSVector8(2.28330284476918490682f),
+	GSVector8(1.0f),
+};
+
+#else
+// m_test, 4-lane masks used as m_test[skip] | m_test[7 + (steps & (steps >> 31))]: entries 0-3 flag the first k lanes, entries 4-6 the trailing lanes (7 + a negative steps), entry 7 none.
+const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
+{
+	GSVector4i::zero(),
+	GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
+	GSVector4i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
+	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffffffff),
+	GSVector4i::zero(),
+};
+// four scalar constants, each passed to the GSVector4 constructor; presumably polynomial coefficients of a log2 approximation (users are not in this file) - confirm
+const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] = 
+{
+	GSVector4(0.204446009836232697516f),
+	GSVector4(-1.04913055217340124191f),
+	GSVector4(2.28330284476918490682f),
+	GSVector4(1.0f),
+};
+
+#endif
+
+GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
+	: GSCodeGenerator(code, maxsize)
+	, m_local(*(GSScanlineLocalData*)param) // param is reinterpreted as a GSScanlineLocalData and kept by reference
+{
+	m_sel.key = key; // the key is the selector the routine gets specialized for
+	// the code is emitted right here, during construction
+	Generate();
+}
+
+#if _M_SSE >= 0x501
+
+void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift)
+{
+	if(shift == 0)
+	{
+		vpmulhrsw(a, f); // a = rounded (a * f) >> 15, per signed 16-bit lane
+	}
+	else
+	{
+		vpsllw(a, (uint8)(shift + 1)); // pre-shift so the high-half multiply below nets (a * f) >> (15 - shift)
+		vpmulhw(a, f); // a = high 16 bits of the signed product
+	}
+}
+
+void GSDrawScanlineCodeGenerator::lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift)
+{
+	vpsubw(a, b); // a = a - b
+	modulate16(a, f, shift); // a = (a - b) scaled by f
+	vpaddw(a, b); // a = b + (a - b) * f; b and f are left untouched
+}
+
+void GSDrawScanlineCodeGenerator::lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f)
+{
+	vpsubw(a, b); // a = a - b
+	vpmullw(a, f); // low 16 bits of (a - b) * f
+	vpsraw(a, 4); // arithmetic shift right by 4: f is applied in units of 1/16
+	vpaddw(a, b); // a = b + ((a - b) * f >> 4)
+}
+
+void GSDrawScanlineCodeGenerator::mix16(const Ymm& a, const Ymm& b, const Ymm& temp)
+{
+	vpblendw(a, b, 0xaa); // odd 16-bit lanes taken from b, even lanes stay a; temp is not used on this path
+}
+
+void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp)
+{
+	vpackuswb(a, a); // saturate every signed word to 0..255; the pack works within each 128-bit half
+	vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks (moves quadword 2 next to quadword 0 so the low 128 bits hold all 16 bytes)
+	vpmovzxbw(a, a); // zero-extend those bytes back to words; temp is not used on this path
+}
+
+void GSDrawScanlineCodeGenerator::alltrue()
+{
+	vpmovmskb(eax, ymm7); // one bit per byte of ymm7 (presumably the test mask at this point - confirm)
+	cmp(eax, 0xffffffff); // all 32 byte sign bits set?
+	je("step", T_NEAR); // then jump straight to the "step" label
+}
+
+void GSDrawScanlineCodeGenerator::blend(const Ymm& a, const Ymm& b, const Ymm& mask)
+{
+	vpand(b, mask); // b = b & mask
+	vpandn(mask, a); // mask = ~mask & a
+	vpor(a, b, mask); // a = (b & mask) | (a & ~mask); b and mask have been overwritten
+}
+
+void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask)
+{
+	vpand(b, mask); // b = b & mask
+	vpandn(mask, a); // mask = ~mask & a
+	vpor(b, mask); // b = (b & mask) | (a & ~mask); mask has been overwritten, a is untouched
+}
+
+void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b)
+{
+	vpblendvb(a, a, b, xmm0); // a = per byte: mask byte's top bit set ? b : a. NOTE(review): mask is spelled xmm0 although a/b are Ymm - confirm register 0 as ymm is meant
+}
+
+void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
+{
+	vpblendvb(b, a, b, xmm0); // b = per byte: mask byte's top bit set ? b : a; a is untouched. NOTE(review): mask is spelled xmm0 with Ymm operands - confirm
+}
+
+#else
+
+void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
+{
+	#if _M_SSE >= 0x500
+	// VEX-encoded path
+	if(shift == 0)
+	{
+		vpmulhrsw(a, f); // a = rounded (a * f) >> 15, per signed 16-bit lane
+	}
+	else
+	{
+		vpsllw(a, shift + 1); // pre-shift so the high-half multiply below nets (a * f) >> (15 - shift)
+		vpmulhw(a, f); // a = high 16 bits of the signed product
+	}
+
+	#else
+	// legacy path: pmulhrsw is only emitted when the CPU reports SSSE3, otherwise shift + pmulhw is used even for shift == 0
+	if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
+	{
+		pmulhrsw(a, f);
+	}
+	else
+	{
+		psllw(a, shift + 1);
+		pmulhw(a, f);
+	}
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift)
+{
+	#if _M_SSE >= 0x500
+	// a = b + (a - b) * f, with the scaling done by modulate16
+	vpsubw(a, b);
+	modulate16(a, f, shift);
+	vpaddw(a, b);
+
+	#else
+	// same sequence with the legacy encodings
+	psubw(a, b);
+	modulate16(a, f, shift);
+	paddw(a, b);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
+{
+	#if _M_SSE >= 0x500
+	// a = b + (((a - b) * f) >> 4), i.e. f is applied in units of 1/16
+	vpsubw(a, b);
+	vpmullw(a, f);
+	vpsraw(a, 4);
+	vpaddw(a, b);
+
+	#else
+	// same sequence with the legacy encodings
+	psubw(a, b);
+	pmullw(a, f);
+	psraw(a, 4);
+	paddw(a, b);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
+{
+	#if _M_SSE >= 0x500
+
+	vpblendw(a, b, 0xaa); // odd 16-bit lanes taken from b, even lanes stay a
+	
+	#elif _M_SSE >= 0x401
+
+	pblendw(a, b, 0xaa); // same blend, legacy encoding
+
+	#else
+	// no pblendw on this path: build the 0x0000ffff lane mask in temp and merge by hand
+	pcmpeqd(temp, temp); // temp = all ones
+	psrld(temp, 16); // temp = 0x0000ffff in every 32-bit lane
+	pand(a, temp); // keep the even words of a
+	pandn(temp, b); // temp = odd words of b
+	por(a, temp);
+	
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
+{
+	#if _M_SSE >= 0x500
+	
+	vpackuswb(a, a); // saturate every signed word to 0..255
+	vpmovzxbw(a, a); // zero-extend the low 8 bytes back to words
+
+	#elif _M_SSE >= 0x401
+
+	packuswb(a, a); // same two steps, legacy encodings
+	pmovzxbw(a, a);
+
+	#else
+	// no pmovzxbw on this path: interleave with a zeroed temp instead
+	packuswb(a, a);
+	pxor(temp, temp);
+	punpcklbw(a, temp);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::alltrue()
+{
+	#if _M_SSE >= 0x500
+	
+	vpmovmskb(eax, xmm7); // one bit per byte of xmm7 (presumably the test mask at this point - confirm)
+	cmp(eax, 0xffff); // all 16 byte sign bits set?
+	je("step", T_NEAR); // then jump straight to the "step" label
+
+	#else
+	// same check with the legacy encoding
+	pmovmskb(eax, xmm7);
+	cmp(eax, 0xffff);
+	je("step", T_NEAR);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
+{
+	#if _M_SSE >= 0x500
+	// a = (b & mask) | (a & ~mask); b and mask are overwritten on both paths
+	vpand(b, mask);
+	vpandn(mask, a);
+	vpor(a, b, mask);
+
+	#else
+	// two-operand form: the result is built in b and then copied to a
+	pand(b, mask);
+	pandn(mask, a);
+	por(b, mask);
+	movdqa(a, b);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
+{
+	#if _M_SSE >= 0x500
+	// b = (b & mask) | (a & ~mask); mask is overwritten, a is untouched
+	vpand(b, mask);
+	vpandn(mask, a);
+	vpor(b, mask);
+
+	#else
+	// same sequence with the legacy encodings
+	pand(b, mask);
+	pandn(mask, a);
+	por(b, mask);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
+{
+	#if _M_SSE >= 0x500
+	
+	vpblendvb(a, a, b, xmm0); // a = per byte: top bit of the xmm0 mask byte set ? b : a
+
+	#elif _M_SSE >= 0x401
+	
+	pblendvb(a, b); // same select; xmm0 is the implicit mask operand
+
+	#else
+	// bitwise fallback: matches the byte select above only when every mask byte is 0x00 or 0xff; also overwrites b and xmm0
+	blend(a, b, xmm0);
+
+	#endif
+}
+
+void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
+{
+	#if _M_SSE >= 0x500
+	
+	vpblendvb(b, a, b, xmm0); // b = per byte: top bit of the xmm0 mask byte set ? b : a; a is untouched
+
+	#elif _M_SSE >= 0x401
+	// NOTE(review): unlike the other two paths, this one also leaves the blended result in a
+	pblendvb(a, b);
+	movdqa(b, a);
+
+	#else
+	// bitwise fallback: matches the byte select above only when every mask byte is 0x00 or 0xff; overwrites xmm0
+	blendr(b, a, xmm0);
+
+	#endif
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.h b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.h
new file mode 100644
index 0000000000..282285bcbd
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.h
@@ -0,0 +1,145 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSScanlineEnvironment.h"
+#include "GSFunctionMap.h"
+
+using namespace Xbyak;
+// Emits, through the Xbyak-style instruction methods, one scanline drawing routine specialized for the selector key given to the constructor.
+class GSDrawScanlineCodeGenerator : public GSCodeGenerator
+{
+	void operator = (const GSDrawScanlineCodeGenerator&);
+	// selector the routine is generated for, and the local data block whose address is baked into the code
+	GSScanlineSelector m_sel;
+	GSScanlineLocalData& m_local;
+
+	void Generate();
+	// pipeline stages called by Generate(); this first set takes Ymm operands (_M_SSE >= 0x501), the set after #else takes Xmm
+	#if _M_SSE >= 0x501
+
+	void Init();
+	void Step();
+	void TestZ(const Ymm& temp1, const Ymm& temp2);
+	void SampleTexture();
+	void Wrap(const Ymm& uv0);
+	void Wrap(const Ymm& uv0, const Ymm& uv1);
+	void SampleTextureLOD();
+	void WrapLOD(const Ymm& uv0);
+	void WrapLOD(const Ymm& uv0, const Ymm& uv1);
+	void AlphaTFX();
+	void ReadMask();
+	void TestAlpha();
+	void ColorTFX();
+	void Fog();
+	void ReadFrame();
+	void TestDestAlpha();
+	void WriteMask();
+	void WriteZBuf();
+	void AlphaBlend();
+	void WriteFrame();
+	// pixel access helpers: address registers are Reg64 on 64-bit builds, Reg32 otherwise
+	#if defined(_M_AMD64) || defined(_WIN64)
+	void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr);
+	void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
+	#else
+	void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
+	void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
+	#endif
+
+	void ReadTexel(int pixels, int mip_offset = 0);
+	void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
+	// small arithmetic/blend emit helpers
+	void modulate16(const Ymm& a, const Operand& f, int shift);
+	void lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift);
+	void lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f);
+	void mix16(const Ymm& a, const Ymm& b, const Ymm& temp);
+	void clamp16(const Ymm& a, const Ymm& temp);
+	void alltrue();
+	void blend(const Ymm& a, const Ymm& b, const Ymm& mask);
+	void blendr(const Ymm& b, const Ymm& a, const Ymm& mask);
+	void blend8(const Ymm& a, const Ymm& b);
+	void blend8r(const Ymm& b, const Ymm& a);
+
+	#else
+
+	void Init();
+	void Step();
+	void TestZ(const Xmm& temp1, const Xmm& temp2);
+	void SampleTexture();
+	void Wrap(const Xmm& uv0);
+	void Wrap(const Xmm& uv0, const Xmm& uv1);
+	void SampleTextureLOD();
+	void WrapLOD(const Xmm& uv0);
+	void WrapLOD(const Xmm& uv0, const Xmm& uv1);
+	void AlphaTFX();
+	void ReadMask();
+	void TestAlpha();
+	void ColorTFX();
+	void Fog();
+	void ReadFrame();
+	void TestDestAlpha();
+	void WriteMask();
+	void WriteZBuf();
+	void AlphaBlend();
+	void WriteFrame();
+	// pixel access helpers: address registers are Reg64 on 64-bit builds, Reg32 otherwise
+	#if defined(_M_AMD64) || defined(_WIN64)
+	void ReadPixel(const Xmm& dst, const Reg64& addr);
+	void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm);
+	#else
+	void ReadPixel(const Xmm& dst, const Reg32& addr);
+	void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm);
+	#endif
+
+	void ReadTexel(int pixels, int mip_offset = 0);
+	void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i);
+	// small arithmetic/blend emit helpers
+	void modulate16(const Xmm& a, const Operand& f, int shift);
+	void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);
+	void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f);
+	void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
+	void clamp16(const Xmm& a, const Xmm& temp);
+	void alltrue();
+	void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
+	void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
+	void blend8(const Xmm& a, const Xmm& b);
+	void blend8r(const Xmm& b, const Xmm& a);
+
+	#endif
+
+public:
+	GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
+	// constant tables; m_test holds the lane masks for the skipped leading and trailing pixels of a span
+	#if _M_SSE >= 0x501
+	static __aligned(const uint8, 8) m_test[16][8];
+	static const GSVector8 m_log2_coef[4];
+	#else
+	static const GSVector4i m_test[8];
+	static const GSVector4 m_log2_coef[4];
+	#endif
+
+};
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.avx.cpp
new file mode 100644
index 0000000000..a4f3a9bc4b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.avx.cpp
@@ -0,0 +1,1828 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64))
+
+#error TODO
+
+void GSDrawScanlineCodeGenerator::Generate()
+{
+	// TODO: on linux/mac rsi, rdi, xmm6-xmm15 are all caller saved
+	// prologue: preserve the general-purpose registers this routine overwrites
+	push(rbx);
+	push(rsi);
+	push(rdi);
+	push(rbp);
+	push(r12);
+	push(r13);
+
+	sub(rsp, 8 + 10 * 16);
+	// 10 * 16 bytes hold xmm6-xmm15; the extra 8 presumably re-aligns rsp to 16 for vmovdqa - confirm
+	for(int i = 6; i < 16; i++)
+	{
+		vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i));
+	}
+
+	mov(r10, (size_t)&m_test[0]);
+	mov(r11, (size_t)&m_local);
+	mov(r12, (size_t)m_local.gd);
+	mov(r13, (size_t)m_local.gd->vm);
+
+	Init();
+
+	// rcx = steps
+	// rsi = fza_base
+	// rdi = fza_offset
+	// r10 = &m_test[0]
+	// r11 = &m_local
+	// r12 = m_local->gd
+	// r13 = m_local->gd.vm
+	// xmm7 = vf (sprite && ltf)
+	// xmm8 = z
+	// xmm9 = f
+	// xmm10 = s
+	// xmm11 = t
+	// xmm12 = q
+	// xmm13 = rb
+	// xmm14 = ga 
+	// xmm15 = test
+
+	if(!m_sel.edge)
+	{
+		align(16);
+	}
+
+L("loop");
+
+	TestZ(xmm5, xmm6);
+
+	// ebp = za
+
+	if(m_sel.mmin)
+	{
+		SampleTextureLOD();
+	}
+	else
+	{
+		SampleTexture();
+	}
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+
+	AlphaTFX();
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+
+	ReadMask();
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+
+	TestAlpha();
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+
+	ColorTFX();
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+
+	Fog();
+
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+
+	ReadFrame();
+
+	// ebx = fa
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+	// xmm6 = fd
+
+	TestDestAlpha();
+
+	// ebx = fa
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+	// xmm6 = fd
+
+	WriteMask();
+
+	// ebx = fa
+	// edx = fzm
+	// ebp = za
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm5 = zm
+	// xmm6 = fd
+
+	WriteZBuf();
+
+	// ebx = fa
+	// edx = fzm
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm6 = fd
+
+	AlphaBlend();
+
+	// ebx = fa
+	// edx = fzm
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm4 = fm
+	// xmm6 = fd
+
+	WriteFrame();
+
+L("step");
+
+	// if(steps <= 0) break;
+	// (edge spans emit no loop at all: the body above runs once and falls through to "exit")
+	if(!m_sel.edge)
+	{
+		test(rcx, rcx);
+
+		jle("exit", T_NEAR);
+
+		Step();
+
+		jmp("loop", T_NEAR);
+	}
+
+L("exit");
+	// epilogue: restore xmm6-xmm15, release the stack area, pop the registers in reverse push order
+	for(int i = 6; i < 16; i++)
+	{
+		vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]);
+	}
+
+	add(rsp, 8 + 10 * 16);
+
+	pop(r13);
+	pop(r12);
+	pop(rbp);
+	pop(rdi);
+	pop(rsi);
+	pop(rbx);
+
+	ret();
+}
+
+void GSDrawScanlineCodeGenerator::Init()
+{
+	// int skip = left & 3;
+	// (entry registers as consumed below: rcx = pixels, rdx = left, r8 = top, r9 = address of the scan vertex)
+	mov(rbx, rdx);
+	and(rdx, 3);
+
+	// left -= skip;
+
+	sub(rbx, rdx);
+
+	// int steps = pixels + skip - 4;
+
+	lea(rcx, ptr[rcx + rdx - 4]);
+
+	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
+
+	shl(rdx, 4);
+
+	vmovdqa(xmm15, ptr[rdx + r10]);
+
+	mov(rax, rcx);
+	sar(rax, 63);
+	and(rax, rcx);
+	shl(rax, 4);
+
+	vpor(xmm15, ptr[rax + r10 + 7 * 16]);
+
+	// GSVector2i* fza_base = &m_local.gd->fzbr[top];
+
+	mov(rax, (size_t)m_local.gd->fzbr);
+	lea(rsi, ptr[rax + r8 * 8]);
+
+	// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
+
+	mov(rax, (size_t)m_local.gd->fzbc);
+	lea(rdi, ptr[rax + rbx * 2]);
+
+	if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
+	{
+		// edx = &m_local.d[skip]
+		// rdx still holds skip * 16 here, so rdx * 8 is skip * 128 (presumably sizeof(m_local.d[0]) - confirm)
+		lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]);
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
+		{
+			vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p
+
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+
+				vcvttps2dq(xmm9, xmm0);
+				vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2));
+				vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2));
+				vpaddw(xmm9, ptr[rdx + 16 * 6]); // NOTE(review): raw offset where the loads below use offsetof(GSScanlineLocalData::skip, ...); presumably .f - confirm
+			}
+
+			if(m_sel.zb)
+			{
+				// z = vp.zzzz() + m_local.d[skip].z;
+
+				vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+				vaddps(xmm8, ptr[rdx]); // raw offset 0, presumably .z - confirm
+			}
+		}
+	}
+	else
+	{
+		if(m_sel.ztest)
+		{
+			vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.edge || m_sel.tfx != TFX_NONE)
+		{
+			vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t
+		}
+
+		if(m_sel.edge)
+		{
+			vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+			vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+			vpsrlw(xmm1, 9);
+
+			vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1);
+		}
+
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i vti(vt);
+
+				vcvttps2dq(xmm0, xmm0);
+
+				// s = vti.xxxx() + m_local.d[skip].s;
+				// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
+
+				vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
+				}
+				else
+				{
+					if(m_sel.ltf)
+					{
+						vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0)); // NOTE(review): Generate() documents vf as xmm7, yet this writes xmm6, which Generate() also hands to TestZ as a temp - confirm
+						vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+						vpsrlw(xmm6, 1);
+					}
+				}
+			}
+			else
+			{
+				// s = vt.xxxx() + m_local.d[skip].s;
+				// t = vt.yyyy() + m_local.d[skip].t;
+				// q = vt.zzzz() + m_local.d[skip].q;
+
+				vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
+				vaddps(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
+				vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]);
+			}
+		}
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i vc = GSVector4i(v.c);
+
+				vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c
+
+				// vc = vc.upl16(vc.zwxy());
+
+				vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+				vpunpcklwd(xmm0, xmm1);
+
+				// rb = vc.xxxx().add16(m_local.d[skip].rb);
+				// ga = vc.zzzz().add16(m_local.d[skip].ga);
+
+				vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vpaddw(xmm13, ptr[rdx + offsetof(GSScanlineLocalData::skip, rb)]);
+				vpaddw(xmm14, ptr[rdx + offsetof(GSScanlineLocalData::skip, ga)]);
+			}
+			else
+			{
+				vmovdqa(xmm13, ptr[r11 + offsetof(GSScanlineLocalData, c.rb)]);
+				vmovdqa(xmm14, ptr[r11 + offsetof(GSScanlineLocalData, c.ga)]);
+			}
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Step()
+{
+	// steps -= 4;
+
+	sub(rcx, 4);
+
+	// fza_offset++;
+
+	add(rdi, 8); // one fza_offset entry is 8 bytes
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// z += m_local.d4.z;
+
+		if(m_sel.zb)
+		{
+			vaddps(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, d4.z)]);
+		}
+
+		// f = f.add16(m_local.d4.f);
+
+		if(m_sel.fwrite && m_sel.fge)
+		{
+			vpaddw(xmm9, ptr[r11 + offsetof(GSScanlineLocalData, d4.f)]);
+		}
+	}
+	else
+	{
+		if(m_sel.ztest) // empty branch: no per-step z update is emitted for sprites
+		{
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i st = m_local.d4.st;
+
+				// si += st.xxxx();
+				// if(!sprite) ti += st.yyyy();
+
+				vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]);
+
+				vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vpaddd(xmm10, xmm1);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+					vpaddd(xmm11, xmm1);
+				}
+			}
+			else
+			{
+				// GSVector4 stq = m_local.d4.stq;
+
+				// s += stq.xxxx();
+				// t += stq.yyyy();
+				// q += stq.zzzz();
+
+				vmovaps(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]);
+
+				vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(xmm10, xmm1);
+				vaddps(xmm11, xmm2);
+				vaddps(xmm12, xmm3);
+			}
+		}
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i c = m_local.d4.c;
+
+				// rb = rb.add16(c.xxxx());
+				// ga = ga.add16(c.yyyy());
+
+				vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.c)]);
+
+				vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddw(xmm13, xmm1);
+				vpaddw(xmm14, xmm2);
+
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				vpxor(xmm0, xmm0);
+				vpmaxsw(xmm13, xmm0); // clamp the stepped colors at zero (signed 16-bit max against 0)
+				vpmaxsw(xmm14, xmm0);
+			}
+			else
+			{
+				if(m_sel.tfx == TFX_NONE) // empty branch: nothing is emitted for flat color
+				{
+				}
+			}
+		}
+	}
+
+	// test = m_test[7 + (steps & (steps >> 31))];
+	// (emitted unconditionally here; the C++ DrawScanline only refreshes test when !sel.notest)
+	mov(rdx, rcx);
+	sar(rdx, 63);
+	and(rdx, rcx);
+	shl(rdx, 4);
+
+	vmovdqa(xmm15, ptr[rdx + r10 + 7 * 16]);
+}
+
+void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
+{ // Emits the depth test: z address -> rbp, integer zs -> xmm0, zd -> xmm1, failing lanes ORed into xmm15. temp1/temp2 are not used by this variant.
+	if(!m_sel.zb)
+	{
+		return;
+	}
+
+	// int za = fza_base.y + fza_offset->y;
+	// (both 32-bit halves are sign-extended before the 64-bit add; rsi -> fza_base, rdi -> fza_offset)
+	movsxd(rbp, dword[rsi + 4]);
+	movsxd(rax, dword[rdi + 4]);
+	add(rbp, rax);
+
+	// GSVector4i zs = zi;
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.zoverflow)
+		{
+			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+			
+			mov(rax, (size_t)&GSVector4::m_half);
+
+			vbroadcastss(xmm0, ptr[rax]);
+			vmulps(xmm0, xmm8);
+			vcvttps2dq(xmm0, xmm0);
+			vpslld(xmm0, 1);
+
+			vcvttps2dq(xmm1, xmm8);
+			vpcmpeqd(xmm2, xmm2);
+			vpsrld(xmm2, 31);
+			vpand(xmm1, xmm2);
+
+			vpor(xmm0, xmm1);
+		}
+		else
+		{
+			// zs = GSVector4i(z);
+
+			vcvttps2dq(xmm0, xmm8);
+		}
+
+		if(m_sel.zwrite)
+		{
+			vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0);
+		}
+	}
+
+	if(m_sel.ztest)
+	{
+		ReadPixel(xmm1, rbp);
+
+		if(m_sel.zwrite && m_sel.zpsm < 2)
+		{
+			vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1);
+		}
+
+		// zd &= 0xffffffff >> m_sel.zpsm * 8;
+		// (done as a left shift followed by a logical right shift by the same count)
+		if(m_sel.zpsm)
+		{
+			vpslld(xmm1, m_sel.zpsm * 8);
+			vpsrld(xmm1, m_sel.zpsm * 8);
+		}
+
+		if(m_sel.zoverflow || m_sel.zpsm == 0)
+		{
+			// GSVector4i o = GSVector4i::x80000000();
+			// (bias both operands so the signed vpcmpgtd below orders them as unsigned values)
+			vpcmpeqd(xmm2, xmm2);
+			vpslld(xmm2, 31);
+
+			// GSVector4i zso = zs - o;
+			// GSVector4i zdo = zd - o;
+
+			vpsubd(xmm0, xmm2);
+			vpsubd(xmm1, xmm2);
+		}
+
+		switch(m_sel.ztst) // only GEQUAL and GREATER add lanes to the mask; other values emit no compare
+		{
+		case ZTST_GEQUAL:
+			// test |= zso < zdo; // ~(zso >= zdo)
+			vpcmpgtd(xmm1, xmm0);
+			vpor(xmm15, xmm1);
+			break;
+
+		case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+			// test |= zso <= zdo; // ~(zso > zdo)
+			vpcmpgtd(xmm0, xmm1);
+			vpcmpeqd(xmm2, xmm2);
+			vpxor(xmm0, xmm2);
+			vpor(xmm15, xmm0);
+			break;
+		}
+
+		alltrue();
+	}
+}
+
+void GSDrawScanlineCodeGenerator::SampleTexture()
+{ // Emits the texel fetch (point or bilinear when ltf is set); on exit xmm2 = rb and xmm3 = ga of the sampled texel.
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]);
+
+	// ebx = tex
+
+	if(!m_sel.fst)
+	{
+		vrcpps(xmm0, xmm12);
+
+		vmulps(xmm4, xmm10, xmm0);
+		vmulps(xmm5, xmm11, xmm0);
+
+		vcvttps2dq(xmm4, xmm4);
+		vcvttps2dq(xmm5, xmm5);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			vmovd(xmm0, eax);
+			vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+			vpsubd(xmm4, xmm0);
+			vpsubd(xmm5, xmm0);
+		}
+	}
+	else
+	{
+		vmovdqa(xmm4, xmm10);
+		vmovdqa(xmm5, xmm11);
+	}
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uf = u.xxzzlh().srl16(1);
+
+		vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(xmm6, 1);
+
+		if(m_sel.prim != GS_SPRITE_CLASS)
+		{
+			// GSVector4i vf = v.xxzzlh().srl16(1);
+
+			vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(xmm7, 1);
+		}
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	vpsrad(xmm4, 16);
+	vpsrad(xmm5, 16);
+	vpackssdw(xmm4, xmm5);
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		vpcmpeqd(xmm0, xmm0);
+		vpsrlw(xmm0, 15);
+		vpaddw(xmm5, xmm4, xmm0);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		Wrap(xmm4, xmm5);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		Wrap(xmm4);
+	}
+
+	// xmm4 = uv0
+	// xmm5 = uv1 (ltf)
+	// xmm6 = uf
+	// xmm7 = vf
+
+	// GSVector4i x0 = uv0.upl16();
+	// GSVector4i y0 = uv0.uph16() << tw;
+
+	vpxor(xmm0, xmm0);
+
+	vpunpcklwd(xmm2, xmm4, xmm0);
+	vpunpckhwd(xmm3, xmm4, xmm0);
+	vpslld(xmm3, m_sel.tw + 3);
+
+	// xmm0 = 0
+	// xmm2 = x0
+	// xmm3 = y0
+	// xmm5 = uv1 (ltf)
+	// xmm6 = uf
+	// xmm7 = vf
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i x1 = uv1.upl16();
+		// GSVector4i y1 = uv1.uph16() << tw;
+
+		vpunpcklwd(xmm4, xmm5, xmm0);
+		vpunpckhwd(xmm5, xmm5, xmm0);
+		vpslld(xmm5, m_sel.tw + 3);
+
+		// xmm2 = x0
+		// xmm3 = y0
+		// xmm4 = x1
+		// xmm5 = y1
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// GSVector4i addr00 = y0 + x0;
+		// GSVector4i addr01 = y0 + x1;
+		// GSVector4i addr10 = y1 + x0;
+		// GSVector4i addr11 = y1 + x1;
+
+		vpaddd(xmm0, xmm3, xmm2);
+		vpaddd(xmm1, xmm3, xmm4);
+		vpaddd(xmm2, xmm5, xmm2);
+		vpaddd(xmm3, xmm5, xmm4);
+
+		// xmm0 = addr00
+		// xmm1 = addr01
+		// xmm2 = addr10
+		// xmm3 = addr11
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+		
+		ReadTexel(4, 0);
+
+		// xmm0 = c00
+		// xmm1 = c01
+		// xmm2 = c10
+		// xmm3 = c11
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// GSVector4i rb00 = c00 & mask;
+		// GSVector4i ga00 = (c00 >> 8) & mask;
+
+		vpsllw(xmm4, xmm0, 8);
+		vpsrlw(xmm4, 8);
+		vpsrlw(xmm5, xmm0, 8);
+
+		// GSVector4i rb01 = c01 & mask;
+		// GSVector4i ga01 = (c01 >> 8) & mask;
+
+		vpsllw(xmm0, xmm1, 8);
+		vpsrlw(xmm0, 8);
+		vpsrlw(xmm1, 8);
+
+		// xmm0 = rb01
+		// xmm1 = ga01
+		// xmm2 = c10
+		// xmm3 = c11
+		// xmm4 = rb00
+		// xmm5 = ga00
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// rb00 = rb00.lerp16<0>(rb01, uf);
+		// ga00 = ga00.lerp16<0>(ga01, uf);
+
+		lerp16(xmm0, xmm4, xmm6, 0);
+		lerp16(xmm1, xmm5, xmm6, 0);
+
+		// xmm0 = rb00
+		// xmm1 = ga00
+		// xmm2 = c10
+		// xmm3 = c11
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// GSVector4i rb10 = c10 & mask;
+		// GSVector4i ga10 = (c10 >> 8) & mask;
+
+		vpsrlw(xmm5, xmm2, 8);
+		vpsllw(xmm2, 8);
+		vpsrlw(xmm4, xmm2, 8);
+
+		// GSVector4i rb11 = c11 & mask;
+		// GSVector4i ga11 = (c11 >> 8) & mask;
+		// (low bytes -> xmm2 = rb11, high bytes -> xmm3 = ga11, so each lerp below pairs like channels)
+		vpsllw(xmm2, xmm3, 8);
+		vpsrlw(xmm2, 8);
+		vpsrlw(xmm3, 8);
+
+		// xmm0 = rb00
+		// xmm1 = ga00
+		// xmm2 = rb11
+		// xmm3 = ga11
+		// xmm4 = rb10
+		// xmm5 = ga10
+		// xmm6 = uf
+		// xmm7 = vf
+
+		// rb10 = rb10.lerp16<0>(rb11, uf);
+		// ga10 = ga10.lerp16<0>(ga11, uf);
+
+		lerp16(xmm2, xmm4, xmm6, 0);
+		lerp16(xmm3, xmm5, xmm6, 0);
+
+		// xmm0 = rb00
+		// xmm1 = ga00
+		// xmm2 = rb10
+		// xmm3 = ga10
+		// xmm7 = vf
+
+		// rb00 = rb00.lerp16<0>(rb10, vf);
+		// ga00 = ga00.lerp16<0>(ga10, vf);
+
+		lerp16(xmm2, xmm0, xmm7, 0);
+		lerp16(xmm3, xmm1, xmm7, 0);
+	}
+	else
+	{
+		// GSVector4i addr00 = y0 + x0;
+
+		vpaddd(xmm3, xmm2);
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+		// NOTE(review): the split below expects c00 in xmm2; ReadTexel(int, int) is still a TODO in this file - confirm once implemented
+		// GSVector4i mask = GSVector4i::x00ff();
+
+		// c[0] = c00 & mask;
+		// c[1] = (c00 >> 8) & mask;
+
+		vpsrlw(xmm3, xmm2, 8);
+		vpsllw(xmm2, 8);
+		vpsrlw(xmm2, 8);
+	}
+
+	// xmm2 = rb
+	// xmm3 = ga
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
+{ // Emits the in-place wrap/clamp of the packed 16-bit coordinates in uv, using t.min/t.max/t.mask from the global data (r12).
+	// xmm0, xmm1, xmm2, xmm3 = free
+
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; // 1 for mode values 1 and 2, 0 for 0 and 3
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; // 1 when bit 1 of either mode is set
+
+	if(wms_clamp == wmt_clamp)
+	{
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+			}
+			else
+			{
+				vpxor(xmm0, xmm0);
+				vpmaxsw(uv, xmm0);
+			}
+
+			vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+		}
+		else
+		{
+			vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+
+			if(region)
+			{
+				vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+			}
+		}
+	}
+	else
+	{
+		vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+		vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+		vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]);
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+		// (the two axes differ: compute both candidates, then pick per byte with t.mask)
+		vpand(xmm1, uv, xmm2);
+
+		if(region)
+		{
+			vpor(xmm1, xmm3);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv, xmm2);
+		vpminsw(uv, xmm3);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv, xmm1, xmm0);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
+{ // Two-register form of Wrap(): emits the same wrap/clamp for uv0 and uv1, loading each bound once.
+	// xmm0, xmm1, xmm2, xmm3 = free
+
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; // 1 for mode values 1 and 2, 0 for 0 and 3
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; // 1 when bit 1 of either mode is set
+
+	if(wms_clamp == wmt_clamp)
+	{
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+				vpmaxsw(uv0, xmm0);
+				vpmaxsw(uv1, xmm0);
+			}
+			else
+			{
+				vpxor(xmm0, xmm0);
+				vpmaxsw(uv0, xmm0);
+				vpmaxsw(uv1, xmm0);
+			}
+
+			vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+			vpminsw(uv0, xmm0);
+			vpminsw(uv1, xmm0);
+		}
+		else
+		{
+			vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+			vpand(uv0, xmm0);
+			vpand(uv1, xmm0);
+
+			if(region)
+			{
+				vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+				vpor(uv0, xmm0);
+				vpor(uv1, xmm0);
+			}
+		}
+	}
+	else
+	{
+		vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
+		vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
+		vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]);
+
+		// uv0
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		vpand(xmm1, uv0, xmm2);
+
+		if(region)
+		{
+			vpor(xmm1, xmm3);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv0, xmm2);
+		vpminsw(uv0, xmm3);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv0, xmm1, xmm0);
+
+		// uv1 (xmm0/xmm2/xmm3 still hold mask/min/max; only xmm1 is recomputed)
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		vpand(xmm1, uv1, xmm2);
+
+		if(region)
+		{
+			vpor(xmm1, xmm3);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv1, xmm2);
+		vpminsw(uv1, xmm3);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv1, xmm1, xmm0);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::SampleTextureLOD()
+{ // Empty in this variant: no code is emitted for the mip-mapped (mmin) sampling path.
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
+{ // Empty in this variant: emits nothing and leaves uv untouched.
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
+{ // Empty in this variant: emits nothing and leaves uv0/uv1 untouched.
+}
+
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{ // Emits the alpha part of the texture function: combines texel gat (xmm3) with interpolated ga (xmm14) per tfx/tcc; result in xmm3.
+	if(!m_sel.fb)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// gat = gat.modulate16<1>(ga).clamp8();
+
+		modulate16(xmm3, xmm14, 1);
+
+		clamp16(xmm3, xmm0);
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			vpsrlw(xmm1, xmm14, 7);
+
+			mix16(xmm3, xmm1, xmm0);
+		}
+
+		break;
+
+	case TFX_DECAL:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			vpsrlw(xmm1, xmm14, 7);
+
+			mix16(xmm3, xmm1, xmm0);
+		}
+
+		break;
+
+	case TFX_HIGHLIGHT:
+
+		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+		vpsrlw(xmm1, xmm14, 7);
+
+		if(m_sel.tcc) 
+		{
+			vpaddusb(xmm1, xmm3);
+		}
+
+		mix16(xmm3, xmm1, xmm0);
+
+		break;
+
+	case TFX_HIGHLIGHT2:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			vpsrlw(xmm1, xmm14, 7);
+
+			mix16(xmm3, xmm1, xmm0);
+		}
+
+		break;
+
+	case TFX_NONE:
+
+		// gat = iip ? ga.srl16(7) : ga;
+		// NOTE(review): only the iip case is emitted; for !iip xmm3 is not written here - confirm it is set up elsewhere
+		if(m_sel.iip)
+		{
+			vpsrlw(xmm3, xmm14, 7);
+		}
+
+		break;
+	}
+
+	// TODO: aa1
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{ // Emits the loads of the write masks from the global data: fm -> xmm4 (if fwrite), zm -> xmm5 (if zwrite).
+	if(m_sel.fwrite)
+	{
+		vmovdqa(xmm4, ptr[r12 + offsetof(GSScanlineGlobalData, fm)]);
+	}
+
+	if(m_sel.zwrite)
+	{
+		vmovdqa(xmm5, ptr[r12 + offsetof(GSScanlineGlobalData, zm)]);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{ // Emits the alpha test: builds the per-lane fail mask t in xmm1 from ga (xmm3) and aref, then applies it per afail.
+	switch(m_sel.afail) // emit nothing when the buffer the failure would mask is not written anyway
+	{
+	case AFAIL_FB_ONLY:
+		if(!m_sel.zwrite) return;
+		break;
+
+	case AFAIL_ZB_ONLY:
+		if(!m_sel.fwrite) return;
+		break;
+
+	case AFAIL_RGB_ONLY:
+		if(!m_sel.zwrite && m_sel.fpsm == 1) return;
+		break;
+	}
+
+	switch(m_sel.atst)
+	{
+	case ATST_NEVER:
+		// t = GSVector4i::xffffffff();
+		vpcmpeqd(xmm1, xmm1);
+		break;
+
+	case ATST_ALWAYS:
+		return;
+
+	case ATST_LESS:
+	case ATST_LEQUAL: // NOTE(review): shares the LESS compare; presumably aref is pre-adjusted by the caller - confirm
+		// t = (ga >> 16) > m_local.gd->aref;
+		vpsrld(xmm1, xmm3, 16);
+		vpcmpgtd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]);
+		break;
+
+	case ATST_EQUAL:
+		// t = (ga >> 16) != m_local.gd->aref;
+		vpsrld(xmm1, xmm3, 16);
+		vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]);
+		vpcmpeqd(xmm0, xmm0);
+		vpxor(xmm1, xmm0);
+		break;
+
+	case ATST_GEQUAL:
+	case ATST_GREATER: // NOTE(review): shares the GEQUAL compare; presumably aref is pre-adjusted by the caller - confirm
+		// t = (ga >> 16) < m_local.gd->aref;
+		vpsrld(xmm0, xmm3, 16);
+		vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]);
+		vpcmpgtd(xmm1, xmm0);
+		break;
+
+	case ATST_NOTEQUAL:
+		// t = (ga >> 16) == m_local.gd->aref;
+		vpsrld(xmm1, xmm3, 16);
+		vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]);
+		break;
+	}
+
+	switch(m_sel.afail)
+	{
+	case AFAIL_KEEP:
+		// test |= t;
+		vpor(xmm15, xmm1);
+		alltrue();
+		break;
+
+	case AFAIL_FB_ONLY:
+		// zm |= t;
+		vpor(xmm5, xmm1);
+		break;
+
+	case AFAIL_ZB_ONLY:
+		// fm |= t;
+		vpor(xmm4, xmm1);
+		break;
+
+	case AFAIL_RGB_ONLY:
+		// zm |= t;
+		vpor(xmm5, xmm1);
+		// fm |= t & GSVector4i::xff000000(); (shift right then left by 24 keeps only the top byte of t)
+		vpsrld(xmm1, 24);
+		vpslld(xmm1, 24);
+		vpor(xmm4, xmm1);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{ // Emits the colour part of the texture function on rbt (xmm2) with rb (xmm13); the HIGHLIGHT modes also rework gat (xmm3).
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// rbt = rbt.modulate16<1>(rb).clamp8();
+
+		modulate16(xmm2, xmm13, 1);
+
+		clamp16(xmm2, xmm0);
+
+		break;
+
+	case TFX_DECAL:
+
+		break;
+
+	case TFX_HIGHLIGHT:
+	case TFX_HIGHLIGHT2:
+
+		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+		// (xmm1 keeps the incoming gat for the final mix16; xmm6 receives af, built from ga words 1 and 3 of each half, >> 7)
+		vmovdqa(xmm1, xmm3);
+
+		modulate16(xmm3, xmm14, 1);
+
+		vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1));
+		vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1));
+		vpsrlw(xmm6, 7);
+
+		vpaddw(xmm3, xmm6);
+
+		clamp16(xmm3, xmm0);
+		
+		mix16(xmm3, xmm1, xmm0);
+
+		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+		modulate16(xmm2, xmm13, 1);
+
+		vpaddw(xmm2, xmm6);
+		
+		clamp16(xmm2, xmm0);
+
+		break;
+
+	case TFX_NONE:
+
+		// rbt = iip ? rb.srl16(7) : rb;
+		// NOTE(review): only the iip case is emitted; for !iip xmm2 is not written here - confirm it is set up elsewhere
+		if(m_sel.iip)
+		{
+			vpsrlw(xmm2, xmm13, 7);
+		}
+
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{ // Emits fogging: lerps rb (xmm2) and ga (xmm3) against the fog colour frb/fga by f (xmm9); see the formula below.
+	if(!m_sel.fwrite || !m_sel.fge)
+	{
+		return;
+	}
+
+	// rb = m_local.gd->frb.lerp16<0>(rb, f);
+	// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
+
+	vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]);
+	vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]);
+	// xmm6 keeps the unfogged ga for the final mix16
+	vmovdqa(xmm6, xmm3);
+
+	lerp16(xmm2, xmm0, xmm9, 0);
+	lerp16(xmm3, xmm1, xmm9, 0);
+	// NOTE(review): xmm9 (f) is passed as mix16's temp while Step() keeps accumulating f in xmm9 - confirm mix16 never writes its temp
+	mix16(xmm3, xmm6, xmm9);
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{ // Emits the frame-buffer address computation into rbx and, when rfb is set, the read of the destination pixels fd into xmm6.
+	if(!m_sel.fb)
+	{
+		return;
+	}
+
+	// int fa = fza_base.x + fza_offset->x;
+	// (32-bit add, then sign-extended to 64 bits; rsi -> fza_base, rdi -> fza_offset)
+	mov(ebx, dword[rsi]);
+	add(ebx, dword[rdi]);
+	movsxd(rbx, ebx);
+	// the address is always computed; the read itself is emitted only when rfb is set
+	if(!m_sel.rfb)
+	{
+		return;
+	}
+
+	ReadPixel(xmm6, rbx);
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{ // Emits the destination alpha test on fd (xmm6): failing lanes are ORed into the test mask (xmm15). Only for fpsm 0 and 2.
+	if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
+	{
+		return;
+	}
+
+	// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
+	// (for fpsm == 2 the tested bit is bit 15, moved to the sign position with << 16)
+	if(m_sel.datm)
+	{
+		if(m_sel.fpsm == 2)
+		{
+			vpxor(xmm0, xmm0);
+			//vpsrld(xmm1, xmm6, 15);
+			vpslld(xmm1, xmm6, 16);
+			vpsrad(xmm1, 31);
+			vpcmpeqd(xmm1, xmm0);
+		}
+		else
+		{
+			vpcmpeqd(xmm0, xmm0);
+			vpxor(xmm1, xmm6, xmm0);
+			vpsrad(xmm1, 31);
+		}
+	}
+	else
+	{
+		if(m_sel.fpsm == 2)
+		{
+			vpslld(xmm1, xmm6, 16);
+			vpsrad(xmm1, 31);
+		}
+		else
+		{
+			vpsrad(xmm1, xmm6, 31);
+		}
+	}
+
+	vpor(xmm15, xmm1);
+
+	alltrue();
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{ // Emits the merge of the test mask into fm/zm and condenses them into edx, with one bit set per 16-bit lane that may be written.
+	// fm |= test;
+	// zm |= test;
+
+	if(m_sel.fwrite)
+	{
+		vpor(xmm4, xmm15);
+	}
+
+	if(m_sel.zwrite)
+	{
+		vpor(xmm5, xmm15);
+	}
+
+	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+	// (with both writes enabled the fm lanes land in dl and the zm lanes in dh after the pack)
+	vpcmpeqd(xmm1, xmm1);
+
+	if(m_sel.fwrite && m_sel.zwrite)
+	{
+		vpcmpeqd(xmm0, xmm1, xmm5);
+		vpcmpeqd(xmm1, xmm4);
+		vpackssdw(xmm1, xmm0);
+	}
+	else if(m_sel.fwrite)
+	{
+		vpcmpeqd(xmm1, xmm4);
+		vpackssdw(xmm1, xmm1);
+	}
+	else if(m_sel.zwrite)
+	{
+		vpcmpeqd(xmm1, xmm5);
+		vpackssdw(xmm1, xmm1);
+	}
+
+	vpmovmskb(edx, xmm1);
+
+	not(edx);
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{ // Emits the depth write: reloads zs saved by TestZ, optionally merges the old zd under zm, then stores through WritePixel.
+	if(!m_sel.zwrite)
+	{
+		return;
+	}
+	// fast: zd was saved by TestZ (ztest && zpsm < 2), so masked lanes can be restored by a blend before the store
+	bool fast = m_sel.ztest && m_sel.zpsm < 2;
+
+	vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]);
+
+	if(fast)
+	{
+		// zs = zs.blend8(zd, zm);
+		// zm is kept in xmm5 (ReadMask/TestAlpha/WriteMask); xmm4 holds fm and must not be used as the selector here
+		vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm5);
+	}
+
+	WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
+}
+
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{ // Emits alpha blending, (c[aba] - c[abb]) * a + c[abd], on src rb/ga (xmm2/xmm3) against dst fd (xmm6); result in xmm2/xmm3.
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.abe == 0 && m_sel.aa1 == 0)
+	{
+		return;
+	}
+	// the destination colour is unpacked only if some selected term actually reads it
+	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
+	{
+		switch(m_sel.fpsm)
+		{
+		case 0:
+		case 1:
+
+			// c[2] = fd & mask;
+			// c[3] = (fd >> 8) & mask;
+
+			vpsllw(xmm0, xmm6, 8);
+			vpsrlw(xmm0, 8);
+			vpsrlw(xmm1, xmm6, 8);
+
+			break;
+
+		case 2:
+
+			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+			vpcmpeqd(xmm15, xmm15);
+
+			vpsrld(xmm15, 27); // 0x0000001f
+			vpand(xmm0, xmm6, xmm15);
+			vpslld(xmm0, 3);
+
+			vpslld(xmm15, 10); // 0x00007c00
+			vpand(xmm5, xmm6, xmm15);
+			vpslld(xmm5, 9);
+			// merge the (fd & 0x7c00) << 9 part held in the xmm5 temp; xmm1 (dst ga) is only built below
+			vpor(xmm0, xmm5);
+
+			vpsrld(xmm15, 5); // 0x000003e0
+			vpand(xmm1, xmm6, xmm15);
+			vpsrld(xmm1, 2);
+
+			vpsllw(xmm15, 10); // 0x00008000
+			vpand(xmm5, xmm6, xmm15);
+			vpslld(xmm5, 8);
+
+			vpor(xmm1, xmm5);
+
+			break;
+		}
+	}
+
+	// xmm2, xmm3 = src rb, ga
+	// xmm0, xmm1 = dst rb, ga
+	// xmm5, xmm15 = free
+
+	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
+	{
+		vmovdqa(xmm5, xmm2);
+	}
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// rb = c[aba * 2 + 0];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm2, xmm0); break;
+		case 2: vpxor(xmm2, xmm2); break;
+		}
+
+		// rb = rb.sub16(c[abb * 2 + 0]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(xmm2, xmm5); break;
+		case 1: vpsubw(xmm2, xmm0); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
+
+			switch(m_sel.abc)
+			{
+			case 0:
+			case 1:
+				vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1));
+				vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1));
+				vpsllw(xmm15, 7);
+				break;
+			case 2:
+				vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]);
+				break;
+			}
+
+			// rb = rb.modulate16<1>(a);
+
+			modulate16(xmm2, xmm15, 1);
+		}
+
+		// rb = rb.add16(c[abd * 2 + 0]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(xmm2, xmm5); break;
+		case 1: vpaddw(xmm2, xmm0); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// rb = c[abd * 2 + 0];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm2, xmm0); break;
+		case 2: vpxor(xmm2, xmm2); break;
+		}
+	}
+
+	if(m_sel.pabe)
+	{
+		// mask = (c[1] << 8).sra32(31);
+
+		vpslld(xmm0, xmm3, 8);
+		vpsrad(xmm0, 31);
+
+		// rb = c[0].blend8(rb, mask);
+
+		vpblendvb(xmm2, xmm5, xmm2, xmm0);
+	}
+
+	// xmm0 = pabe mask
+	// xmm3 = src ga
+	// xmm1 = dst ga
+	// xmm2 = rb
+	// xmm15 = a
+	// xmm5 = free
+
+	vmovdqa(xmm5, xmm3);
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// ga = c[aba * 2 + 1];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm3, xmm1); break;
+		case 2: vpxor(xmm3, xmm3); break;
+		}
+
+		// ga = ga.sub16(c[abb * 2 + 1]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(xmm3, xmm5); break;
+		case 1: vpsubw(xmm3, xmm1); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// ga = ga.modulate16<1>(a);
+
+			modulate16(xmm3, xmm15, 1);
+		}
+
+		// ga = ga.add16(c[abd * 2 + 1]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(xmm3, xmm5); break;
+		case 1: vpaddw(xmm3, xmm1); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// ga = c[abd * 2 + 1];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm3, xmm1); break;
+		case 2: vpxor(xmm3, xmm3); break;
+		}
+	}
+
+	// xmm0 = pabe mask
+	// xmm5 = src ga
+	// xmm2 = rb
+	// xmm3 = ga
+	// xmm1, xmm15 = free
+
+	if(m_sel.pabe)
+	{
+		vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+		vpblendvb(xmm3, xmm5, xmm3, xmm0);
+	}
+	else
+	{
+		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		{
+			mix16(xmm3, xmm5, xmm15);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{ // Emits the colour write: optional wrap/dither, repack of rb/ga (xmm2/xmm3) into pixels fs, fpsm conversion, fm blend, store.
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.colclamp == 0)
+	{
+		// c[0] &= 0x00ff00ff;
+		// c[1] &= 0x00ff00ff;
+
+		vpcmpeqd(xmm15, xmm15);
+		vpsrlw(xmm15, 8);
+		vpand(xmm2, xmm15);
+		vpand(xmm3, xmm15);
+	}
+
+	if(m_sel.fpsm == 2 && m_sel.dthe)
+	{ // rax = (r8 & 3) * 32 picks one of four dimx entries, each read as two GSVector4i; NOTE(review): r8 is presumably the scanline y - confirm
+		mov(rax, r8);
+		and(rax, 3);
+		shl(rax, 5);
+		vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 0]);
+		vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx) + sizeof(GSVector4i) * 1]);
+	}
+
+	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+
+	vpunpckhwd(xmm15, xmm2, xmm3);
+	vpunpcklwd(xmm2, xmm3);
+	vpackuswb(xmm2, xmm15);
+
+	if(m_sel.fba && m_sel.fpsm != 1)
+	{
+		// fs |= 0x80000000;
+
+		vpcmpeqd(xmm15, xmm15);
+		vpslld(xmm15, 31);
+		vpor(xmm2, xmm15);
+	}
+
+	// xmm2 = fs
+	// xmm4 = fm
+	// xmm6 = fd
+
+	if(m_sel.fpsm == 2)
+	{
+		// GSVector4i rb = fs & 0x00f800f8;
+		// GSVector4i ga = fs & 0x8000f800;
+
+		mov(eax, 0x00f800f8);
+		vmovd(xmm0, eax);
+		vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		mov(eax, 0x8000f800);
+		vmovd(xmm1, eax);
+		vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
+
+		vpand(xmm0, xmm2);
+		vpand(xmm1, xmm2);
+
+		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		vpsrld(xmm2, xmm0, 9);
+		vpsrld(xmm0, 3);
+		vpsrld(xmm3, xmm1, 16);
+		vpsrld(xmm1, 6);
+
+		vpor(xmm0, xmm1);
+		vpor(xmm2, xmm3);
+		vpor(xmm2, xmm0);
+	}
+
+	if(m_sel.rfb)
+	{
+		// fs = fs.blend(fd, fm);
+
+		blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm
+	}
+	// fast: fd was read and merged above, so the store can use the two 64-bit writes gated by dl
+	bool fast = m_sel.rfb && m_sel.fpsm < 2;
+
+	WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr)
+{ // Emits two 64-bit loads from the r13-based memory (16-bit units): vm16[addr] -> low half of dst, vm16[addr + 8] -> high half.
+	vmovq(dst, qword[r13 + addr * 2]);
+	vmovhps(dst, qword[r13 + addr * 2 + 8 * 2]);
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
+{ // Emits the conditional stores of src, gated by the bits of the 8-bit mask register; fz is not used in this variant.
+	if(fast)
+	{
+		// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
+		// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
+		// (je skips forward to the next anonymous "@@" label when none of the tested mask bits is set)
+		test(mask, 0x0f);
+		je("@f");
+		vmovq(qword[r13 + addr * 2], src);
+		L("@@");
+
+		test(mask, 0xf0);
+		je("@f");
+		vmovhps(qword[r13 + addr * 2 + 8 * 2], src);
+		L("@@");
+
+		// vmaskmovps?
+	}
+	else
+	{
+		// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
+		// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
+		// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
+		// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
+		// (one pixel per store; the format-specific store is emitted by the single-pixel overload)
+		test(mask, 0x03);
+		je("@f");
+		WritePixel(src, addr, 0, psm);
+		L("@@");
+
+		test(mask, 0x0c);
+		je("@f");
+		WritePixel(src, addr, 1, psm);
+		L("@@");
+
+		test(mask, 0x30);
+		je("@f");
+		WritePixel(src, addr, 2, psm);
+		L("@@");
+
+		test(mask, 0xc0);
+		je("@f");
+		WritePixel(src, addr, 3, psm);
+		L("@@");
+	}
+}
+
+static const int s_offsets[4] = {0, 2, 8, 10}; // offset of pixel i from addr, in 16-bit units (doubled to bytes below)
+// Emits the store of 32-bit lane i of src to the r13-based memory at addr + s_offsets[i], in the format selected by psm.
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm)
+{
+	Address dst = ptr[r13 + addr * 2 + s_offsets[i] * 2];
+
+	switch(psm)
+	{
+	case 0: // store the whole 32-bit lane
+		if(i == 0) vmovd(dst, src);
+		else vpextrd(dst, src, i);
+		break;
+	case 1: // xor/and/xor replaces the low 24 bits of dst with src and keeps dst's top byte
+		if(i == 0) vmovd(eax, src);
+		else vpextrd(eax, src, i);
+		xor(eax, dst);
+		and(eax, 0xffffff);
+		xor(dst, eax);
+		break;
+	case 2: // store the low 16 bits of lane i (word index i * 2)
+		vpextrw(eax, src, i * 2);
+		mov(dst, ax);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
+{
+	// TODO (emits nothing yet, so SampleTexture()'s ReadTexel(4, 0) / ReadTexel(1, 0) calls generate no texel loads)
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
+{ // Emits the fetch of one texel into 32-bit lane i of dst: index from lane i of addr, via the clut when tlu is set.
+	const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4];
+	// src only describes the operand; rax gets its value from the instructions emitted below, before src is used
+	if(i == 0) vmovd(eax, addr);
+	else vpextrd(eax, addr, i);
+	// tlu: first fetch the 8-bit index from the texture at rbx, then use it to index the clut
+	if(m_sel.tlu) movzx(rax, byte[rbx + rax]);
+
+	if(i == 0) vmovd(dst, src);
+	else vpinsrd(dst, src, i);
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.cpp
new file mode 100644
index 0000000000..40631c1c9e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x64.cpp
@@ -0,0 +1,121 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+
+#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
+
+void GSDrawScanlineCodeGenerator::Generate()
+{ // Stub: every member in this x64, _M_SSE < 0x500 translation unit has an empty body and emits no code.
+}
+
+void GSDrawScanlineCodeGenerator::Init()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::Step()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::SampleTexture()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
+{ // stub - emits nothing
+}
+
+static const int s_offsets[4] = {0, 2, 8, 10}; // not referenced by any of the empty bodies in this file
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
+{ // stub - emits nothing
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
+{ // stub - emits nothing
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx.cpp
new file mode 100644
index 0000000000..9f7b7ef74c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx.cpp
@@ -0,0 +1,2921 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+
+// Byte offsets relative to esp, valid once Generate() has pushed its four
+// 32-bit registers (ebx, esi, edi, ebp).
+// NOTE(review): the extra 4 bytes in _top presumably skip the return address,
+// which makes _top/_v the two stack arguments released by ret(8) - confirm.
+static const int _args = 16; // 4 pushed registers * 4 bytes
+static const int _top = _args + 4; // read by Init() as "top" (index into fzbr)
+static const int _v = _args + 8; // read by Init() as the pointer to the vertex "v"
+
+// Emits the complete scanline routine for the current selector (m_sel):
+//   prologue (save ebx/esi/edi/ebp) -> Init() -> "loop": the pipeline stages in
+//   the order written below -> "step": advance and jump back to "loop" ->
+//   "exit": restore the saved registers and ret(8).
+// The comment blocks between the stage calls record which registers are live at
+// that point. Each call appends code, so the call order here is the order of the
+// emitted instructions and must not be changed.
+void GSDrawScanlineCodeGenerator::Generate()
+{
+// NOTE(review): commented-out early return below looks like a debugging leftover.
+//ret(8);
+	// prologue: 4 pushes = 16 bytes, which is what _args accounts for
+	push(ebx);
+	push(esi);
+	push(edi);
+	push(ebp);
+
+	Init();
+
+	// the loop label is only jumped back to when !edge (see "step" below),
+	// so it is only aligned in that case
+	if(!m_sel.edge)
+	{
+		align(16);
+	}
+
+L("loop");
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// xmm0 = z/zi
+	// xmm2 = s/u (tme)
+	// xmm3 = t/v (tme)
+	// xmm4 = q (tme)
+	// xmm5 = rb (!tme)
+	// xmm6 = ga (!tme)
+	// xmm7 = test
+
+	bool tme = m_sel.tfx != TFX_NONE;
+
+	// hand TestZ two scratch registers that are not live for this selector:
+	// xmm5/xmm6 when texturing (rb/ga not loaded yet), xmm2/xmm3 otherwise
+	TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - xmm0
+	// xmm2 = s/u (tme)
+	// xmm3 = t/v (tme)
+	// xmm4 = q (tme)
+	// xmm5 = rb (!tme)
+	// xmm6 = ga (!tme)
+	// xmm7 = test
+
+	// mmin != 0 selects the LOD-aware sampler
+	if(m_sel.mmin)
+	{
+		SampleTextureLOD();
+	}
+	else
+	{
+		SampleTexture();
+	}
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - xmm2
+	// - xmm3
+	// - xmm4
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	AlphaTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ReadMask();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	TestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ColorTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	Fog();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ReadFrame();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	TestDestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	WriteMask();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+
+	WriteZBuf();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// - ebp
+	// xmm2 = fd
+	// xmm3 = fm
+	// - xmm4
+	// xmm5 = rb
+	// xmm6 = ga
+
+	AlphaBlend();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm5 = rb
+	// xmm6 = ga
+
+	WriteFrame();
+
+L("step");
+
+	// if(steps <= 0) break;
+
+	// with m_sel.edge set no loop-back is emitted: the body runs once and
+	// falls through to "exit"
+	if(!m_sel.edge)
+	{
+		test(ecx, ecx);
+
+		jle("exit", T_NEAR);
+
+		Step();
+
+		jmp("loop", T_NEAR);
+	}
+
+L("exit");
+
+	// vzeroupper();
+
+	// epilogue: pop in reverse order of the prologue pushes
+	pop(ebp);
+	pop(edi);
+	pop(esi);
+	pop(ebx);
+
+	// releases 8 bytes of stack arguments (the slots addressed via _top and _v)
+	ret(8);
+}
+
+// Emits the one-time setup that runs before the "loop" label in Generate().
+// Per the inline comments, on entry ecx holds the pixel count and edx holds
+// "left"; the stack slots at esp + _top / esp + _v hold "top" and the vertex
+// pointer. The emitted code:
+//   - derives skip (left & 3) and steps (pixels + skip - 4), and unless
+//     m_sel.notest builds the initial pixel test mask in xmm7,
+//   - points esi at fzbr[top] and edi at fzbc[left >> 2],
+//   - seeds the per-pixel interpolants (z, fog, s/t/q, rb/ga) from the vertex
+//     plus the m_local.d[skip] offsets, storing them in m_local.temp and
+//     leaving them in the registers that Generate() documents at "loop".
+// Which pieces are emitted depends on the selector bits tested below.
+void GSDrawScanlineCodeGenerator::Init()
+{
+	if(!m_sel.notest)
+	{
+		// int skip = left & 3;
+
+		mov(ebx, edx);
+		and(edx, 3);
+
+		// int steps = pixels + skip - 4;
+
+		lea(ecx, ptr[ecx + edx - 4]);
+
+		// left -= skip;
+
+		sub(ebx, edx);
+
+		// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
+
+		// skip * 16: byte offset of a 16-byte m_test entry
+		shl(edx, 4);
+
+		vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
+
+		// eax = (steps < 0) ? steps : 0, then scaled to a 16-byte entry offset
+		mov(eax, ecx);
+		sar(eax, 31);
+		and(eax, ecx);
+		shl(eax, 4);
+
+		vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
+	}
+	else
+	{
+		// no per-pixel test mask is built in this mode: skip is forced to 0
+		mov(ebx, edx); // left
+		xor(edx, edx); // skip
+		lea(ecx, ptr[ecx - 4]); // steps
+	}
+
+	// GSVector2i* fza_base = &m_local.gd->fzbr[top];
+
+	mov(esi, ptr[esp + _top]);
+	lea(esi, ptr[esi * 8]); // top * 8 bytes per entry
+	add(esi, ptr[&m_local.gd->fzbr]);
+
+	// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
+
+	lea(edi, ptr[ebx * 2]); // ebx = left with the low two bits cleared above (notest: raw left)
+	add(edi, ptr[&m_local.gd->fzbc]);
+
+	// edx/ebx are only repurposed when one of the sections below will read them
+	if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
+	{
+		// edx = &m_local.d[skip]
+
+		// edx is skip * 16 here (or 0 with notest), so * 8 gives skip * 128 bytes
+		lea(edx, ptr[edx * 8 + (size_t)m_local.d]);
+
+		// ebx = &v
+
+		mov(ebx, ptr[esp + _v]);
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
+		{
+			vmovaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
+
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+
+				vcvttps2dq(xmm1, xmm0);
+				vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vpaddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
+
+				vmovdqa(ptr[&m_local.temp.f], xmm1);
+			}
+
+			if(m_sel.zb)
+			{
+				// z = vp.zzzz() + m_local.d[skip].z;
+
+				// the base (temp.z) and the running offset (temp.zo) are stored
+				// separately; Step() advances temp.zo and re-adds temp.z
+				vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+				vmovaps(ptr[&m_local.temp.z], xmm0);
+				vmovaps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
+				vmovaps(ptr[&m_local.temp.zo], xmm2);
+				vaddps(xmm0, xmm2);
+			}
+		}
+	}
+	else
+	{
+		// sprites: z is read as-is from m_local.p.z, nothing is interpolated
+		if(m_sel.ztest)
+		{
+			vmovdqa(xmm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.edge || m_sel.tfx != TFX_NONE)
+		{
+			vmovaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
+		}
+
+		if(m_sel.edge)
+		{
+			// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
+
+			vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+			vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
+			vpsrlw(xmm3, 9);
+
+			vmovdqa(ptr[&m_local.temp.cov], xmm3);
+		}
+
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// integer texture coordinates (no q)
+
+				// GSVector4i vti(vt);
+
+				vcvttps2dq(xmm6, xmm4);
+
+				// s = vti.xxxx() + m_local.d[skip].s;
+				// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
+
+				vpshufd(xmm2, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					vpaddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				}
+				else
+				{
+					if(m_sel.ltf)
+					{
+						// t is not stepped in this case (see Step()), so the
+						// vertical filter weight vf is computed once here:
+						// vf = t.xxzzlh().srl16(12)
+						vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+						vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+						vpsrlw(xmm6, 12);
+						vmovdqa(ptr[&m_local.temp.vf], xmm6);
+					}
+				}
+
+				vmovdqa(ptr[&m_local.temp.s], xmm2);
+				vmovdqa(ptr[&m_local.temp.t], xmm3);
+			}
+			else
+			{
+				// floating-point s/t/q
+
+				// s = vt.xxxx() + m_local.d[skip].s;
+				// t = vt.yyyy() + m_local.d[skip].t;
+				// q = vt.zzzz() + m_local.d[skip].q;
+
+				vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+				vaddps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				vaddps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
+
+				vmovaps(ptr[&m_local.temp.s], xmm2);
+				vmovaps(ptr[&m_local.temp.t], xmm3);
+				vmovaps(ptr[&m_local.temp.q], xmm4);
+			}
+		}
+
+		// vertex colour is not set up at all for TFX_DECAL with tcc
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// interpolated colour
+
+				// GSVector4i vc = GSVector4i(v.c);
+
+				vcvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
+
+				// vc = vc.upl16(vc.zwxy());
+
+				vpshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
+				vpunpcklwd(xmm6, xmm5);
+
+				// rb = vc.xxxx().add16(m_local.d[skip].rb);
+				// ga = vc.zzzz().add16(m_local.d[skip].ga);
+
+				vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vpaddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
+				vpaddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
+
+				vmovdqa(ptr[&m_local.temp.rb], xmm5);
+				vmovdqa(ptr[&m_local.temp.ga], xmm6);
+			}
+			else
+			{
+				// constant colour: only preloaded here when there is no texture
+				// stage in between
+				if(m_sel.tfx == TFX_NONE)
+				{
+					vmovdqa(xmm5, ptr[&m_local.c.rb]);
+					vmovdqa(xmm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+}
+
+// Emits the code that advances the loop state by one iteration: steps -= 4,
+// the fzbc pointer moves by one 8-byte entry, and every interpolant that Init()
+// seeded is incremented by its m_local.d4 delta and written back to
+// m_local.temp. The registers are reloaded so that they match the layout
+// Generate() documents at the "loop" label. Unless m_sel.notest, xmm7 is
+// reloaded with the test mask for the new step count.
+void GSDrawScanlineCodeGenerator::Step()
+{
+	// steps -= 4;
+
+	sub(ecx, 4);
+
+	// fza_offset++;
+
+	add(edi, 8);
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// z += m_local.d4.z;
+
+		if(m_sel.zb)
+		{
+			// only the offset part (temp.zo) accumulates; the base temp.z is
+			// added on top each iteration to form xmm0
+			vmovaps(xmm0, ptr[&m_local.temp.zo]);
+			vaddps(xmm0, ptr[&m_local.d4.z]);
+			vmovaps(ptr[&m_local.temp.zo], xmm0);
+			vaddps(xmm0, ptr[&m_local.temp.z]);
+		}
+
+		// f = f.add16(m_local.d4.f);
+
+		if(m_sel.fwrite && m_sel.fge)
+		{
+			vmovdqa(xmm1, ptr[&m_local.temp.f]);
+			vpaddw(xmm1, ptr[&m_local.d4.f]);
+			vmovdqa(ptr[&m_local.temp.f], xmm1);
+		}
+	}
+	else
+	{
+		// sprites: z is not stepped, just reload it into xmm0
+		if(m_sel.ztest)
+		{
+			vmovdqa(xmm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i stq = m_local.d4.stq;
+
+				// s += stq.xxxx();
+				// if(!sprite) t += stq.yyyy();
+
+				vmovdqa(xmm4, ptr[&m_local.d4.stq]);
+
+				vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vpaddd(xmm2, ptr[&m_local.temp.s]);
+				vmovdqa(ptr[&m_local.temp.s], xmm2);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+					vpaddd(xmm3, ptr[&m_local.temp.t]);
+					vmovdqa(ptr[&m_local.temp.t], xmm3);
+				}
+				else
+				{
+					// t stays constant here; only reload it into xmm3
+					vmovdqa(xmm3, ptr[&m_local.temp.t]);
+				}
+			}
+			else
+			{
+				// GSVector4 stq = m_local.d4.stq;
+
+				// s += stq.xxxx();
+				// t += stq.yyyy();
+				// q += stq.zzzz();
+
+				vmovaps(xmm4, ptr[&m_local.d4.stq]);
+
+				vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(xmm2, ptr[&m_local.temp.s]);
+				vaddps(xmm3, ptr[&m_local.temp.t]);
+				vaddps(xmm4, ptr[&m_local.temp.q]);
+
+				vmovaps(ptr[&m_local.temp.s], xmm2);
+				vmovaps(ptr[&m_local.temp.t], xmm3);
+				vmovaps(ptr[&m_local.temp.q], xmm4);
+			}
+		}
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i c = m_local.d4.c;
+
+				// rb = rb.add16(c.xxxx());
+				// ga = ga.add16(c.yyyy());
+
+				// xmm7 is used as scratch here; it is reloaded with the test
+				// mask at the end of this function unless m_sel.notest
+				vmovdqa(xmm7, ptr[&m_local.d4.c]);
+
+				vpshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddw(xmm5, ptr[&m_local.temp.rb]);
+				vpaddw(xmm6, ptr[&m_local.temp.ga]);
+
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				// clamp negative 16-bit lanes to zero: max(x, 0)
+				vpxor(xmm7, xmm7);
+				vpmaxsw(xmm5, xmm7);
+				vpmaxsw(xmm6, xmm7);
+
+				vmovdqa(ptr[&m_local.temp.rb], xmm5);
+				vmovdqa(ptr[&m_local.temp.ga], xmm6);
+			}
+			else
+			{
+				if(m_sel.tfx == TFX_NONE)
+				{
+					vmovdqa(xmm5, ptr[&m_local.c.rb]);
+					vmovdqa(xmm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+
+	if(!m_sel.notest)
+	{
+		// test = m_test[7 + (steps & (steps >> 31))];
+
+		// edx = (steps < 0) ? steps : 0, scaled to a 16-byte entry offset
+		mov(edx, ecx);
+		sar(edx, 31);
+		and(edx, ecx);
+		shl(edx, 4);
+
+		vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
+	}
+}
+
+// Emits the depth stage. Nothing is emitted when m_sel.zb is clear.
+// Otherwise the emitted code:
+//   - computes the z-buffer address za into ebp,
+//   - for non-sprite primitives converts the float z in xmm0 to integer zs
+//     (saved to m_local.temp.zs when zwrite is set),
+//   - when ztest is set, reads the stored depth into xmm1, compares it with zs
+//     according to m_sel.ztst, ORs the failing lanes into the test mask xmm7
+//     and then calls alltrue().
+// temp1 / temp2 are scratch registers chosen by the caller; both are clobbered.
+void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
+{
+	if(!m_sel.zb)
+	{
+		return;
+	}
+
+	// int za = fza_base.y + fza_offset->y;
+
+	mov(ebp, ptr[esi + 4]);
+	add(ebp, ptr[edi + 4]);
+
+	// GSVector4i zs = zi;
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.zoverflow)
+		{
+			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+			vbroadcastss(temp1, ptr[&GSVector4::m_half]);
+			vmulps(temp1, xmm0);
+			vcvttps2dq(temp1, temp1);
+			vpslld(temp1, 1);
+
+			vcvttps2dq(xmm0, xmm0);
+			// temp2 = 0x00000001 in every lane (all ones >> 31)
+			vpcmpeqd(temp2, temp2);
+			vpsrld(temp2, 31);
+			vpand(xmm0, temp2);
+
+			vpor(xmm0, temp1);
+		}
+		else
+		{
+			// zs = GSVector4i(z);
+
+			vcvttps2dq(xmm0, xmm0);
+		}
+
+		if(m_sel.zwrite)
+		{
+			vmovdqa(ptr[&m_local.temp.zs], xmm0);
+		}
+	}
+
+	if(m_sel.ztest)
+	{
+		ReadPixel(xmm1, ebp);
+
+		// keep the unmasked stored value for the later z write
+		if(m_sel.zwrite && m_sel.zpsm < 2)
+		{
+			vmovdqa(ptr[&m_local.temp.zd], xmm1);
+		}
+
+		// zd &= 0xffffffff >> m_sel.zpsm * 8;
+
+		if(m_sel.zpsm)
+		{
+			// shift left then right by the same amount clears the top zpsm bytes
+			vpslld(xmm1, m_sel.zpsm * 8);
+			vpsrld(xmm1, m_sel.zpsm * 8);
+		}
+
+		if(m_sel.zoverflow || m_sel.zpsm == 0)
+		{
+			// the compares below are signed; subtracting 0x80000000 from both
+			// operands makes them order the values as unsigned 32-bit numbers
+
+			// GSVector4i o = GSVector4i::x80000000();
+
+			vpcmpeqd(temp1, temp1);
+			vpslld(temp1, 31);
+
+			// GSVector4i zso = zs - o;
+			// GSVector4i zdo = zd - o;
+
+			vpsubd(xmm0, temp1);
+			vpsubd(xmm1, temp1);
+		}
+
+		// other ztst values add nothing to the test mask
+		switch(m_sel.ztst)
+		{
+		case ZTST_GEQUAL:
+			// test |= zso < zdo; // ~(zso >= zdo)
+			vpcmpgtd(xmm1, xmm0);
+			vpor(xmm7, xmm1);
+			break;
+
+		case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+			// test |= zso <= zdo; // ~(zso > zdo)
+			vpcmpgtd(xmm0, xmm1);
+			vpcmpeqd(temp1, temp1);
+			vpxor(xmm0, temp1);
+			vpor(xmm7, xmm0);
+			break;
+		}
+
+		// NOTE(review): alltrue() is defined elsewhere; presumably it emits an
+		// early jump to "step" when every lane of the test mask is set - confirm.
+		alltrue();
+	}
+}
+
+// Emits the texture sampling stage for the non-LOD path (Generate() calls this
+// when m_sel.mmin is 0). Nothing is emitted when m_sel.fb is clear or
+// m_sel.tfx == TFX_NONE.
+// Inputs are the coordinates in xmm2/xmm3 (plus q in xmm4 when !fst). The
+// emitted code converts them to integer u/v, wraps them via Wrap(), builds
+// texel addresses, fetches the texels via ReadTexel(), and splits the colours
+// into 16-bit rb / ga lanes. With m_sel.ltf four texels are fetched and blended
+// with lerp16_4 using the weights uf / vf; otherwise a single texel is used.
+// NOTE(review): the result is presumably left in xmm5 (rb) and xmm6 (ga), which
+// is what the register comments in Generate() expect - lerp16_4's destination
+// operand is not visible here, confirm.
+void GSDrawScanlineCodeGenerator::SampleTexture()
+{
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	mov(ebx, ptr[&m_local.gd->tex[0]]);
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	// ebx = tex
+	// edx = clut
+
+	if(!m_sel.fst)
+	{
+		// perspective divide: u = s * (1/q), v = t * (1/q), using the
+		// approximate reciprocal (vrcpps)
+		vrcpps(xmm0, xmm4);
+
+		vmulps(xmm2, xmm0);
+		vmulps(xmm3, xmm0);
+
+		vcvttps2dq(xmm2, xmm2);
+		vcvttps2dq(xmm3, xmm3);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			vmovd(xmm4, eax);
+			vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+			vpsubd(xmm2, xmm4);
+			vpsubd(xmm3, xmm4);
+		}
+	}
+
+	// xmm2 = u
+	// xmm3 = v
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uf = u.xxzzlh().srl16(12);
+
+		vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(xmm0, 12);
+		vmovdqa(ptr[&m_local.temp.uf], xmm0);
+
+		// for sprites vf is not computed here; Init() stores it in
+		// m_local.temp.vf for the fst && !mmin sprite case
+		if(m_sel.prim != GS_SPRITE_CLASS)
+		{
+			// GSVector4i vf = v.xxzzlh().srl16(12);
+
+			vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(xmm0, 12);
+			vmovdqa(ptr[&m_local.temp.vf], xmm0);
+		}
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	// drop the 16 fractional bits and pack u (low half) and v (high half)
+	// into 16-bit lanes of xmm2
+	vpsrad(xmm2, 16);
+	vpsrad(xmm3, 16);
+	vpackssdw(xmm2, xmm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		// xmm1 = 0x0001 in every 16-bit lane (all ones >> 15)
+		vpcmpeqd(xmm1, xmm1);
+		vpsrlw(xmm1, 15);
+		vpaddw(xmm3, xmm2, xmm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		Wrap(xmm2, xmm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		Wrap(xmm2);
+	}
+
+	// xmm2 = uv0
+	// xmm3 = uv1 (ltf)
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+	// xmm7 = used
+
+	// GSVector4i y0 = uv0.uph16() << tw;
+	// GSVector4i x0 = uv0.upl16();
+
+	vpxor(xmm0, xmm0);
+
+	// zero-extend the 16-bit lanes to 32 bits; the emitted shift is tw + 3
+	vpunpcklwd(xmm4, xmm2, xmm0);
+	vpunpckhwd(xmm2, xmm2, xmm0);
+	vpslld(xmm2, m_sel.tw + 3);
+
+	// xmm0 = 0
+	// xmm2 = y0
+	// xmm3 = uv1 (ltf)
+	// xmm4 = x0
+	// xmm1, xmm5, xmm6 = free
+	// xmm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i y1 = uv1.uph16() << tw;
+		// GSVector4i x1 = uv1.upl16();
+
+		vpunpcklwd(xmm6, xmm3, xmm0);
+		vpunpckhwd(xmm3, xmm3, xmm0);
+		vpslld(xmm3, m_sel.tw + 3);
+
+		// xmm2 = y0
+		// xmm3 = y1
+		// xmm4 = x0
+		// xmm6 = x1
+		// xmm0, xmm1, xmm5 = free
+		// xmm7 = used
+
+		// GSVector4i addr00 = y0 + x0;
+		// GSVector4i addr01 = y0 + x1;
+		// GSVector4i addr10 = y1 + x0;
+		// GSVector4i addr11 = y1 + x1;
+
+		vpaddd(xmm5, xmm2, xmm4);
+		vpaddd(xmm2, xmm2, xmm6);
+		vpaddd(xmm0, xmm3, xmm4);
+		vpaddd(xmm3, xmm3, xmm6);
+
+		// xmm5 = addr00
+		// xmm2 = addr01
+		// xmm0 = addr10
+		// xmm3 = addr11
+		// xmm1, xmm4, xmm6 = free
+		// xmm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// xmm6 = c00
+		// xmm4 = c01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm0, xmm2, xmm3 = free
+		// xmm7 = used
+
+		vmovdqa(xmm0, ptr[&m_local.temp.uf]);
+
+		// the "& mask" (0x00ff per 16-bit lane) in the pseudo-code below is
+		// emitted as a shift-left-8 / shift-right-8 pair
+
+		// GSVector4i rb00 = c00 & mask;
+		// GSVector4i ga00 = (c00 >> 8) & mask;
+
+		vpsllw(xmm2, xmm6, 8);
+		vpsrlw(xmm2, 8);
+		vpsrlw(xmm6, 8);
+
+		// GSVector4i rb01 = c01 & mask;
+		// GSVector4i ga01 = (c01 >> 8) & mask;
+
+		vpsllw(xmm3, xmm4, 8);
+		vpsrlw(xmm3, 8);
+		vpsrlw(xmm4, 8);
+
+		// xmm0 = uf
+		// xmm2 = rb00
+		// xmm3 = rb01
+		// xmm6 = ga00
+		// xmm4 = ga01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm7 = used
+
+		// rb00 = rb00.lerp16_4(rb01, uf);
+		// ga00 = ga00.lerp16_4(ga01, uf);
+
+		lerp16_4(xmm3, xmm2, xmm0);
+		lerp16_4(xmm4, xmm6, xmm0);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm2, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i rb10 = c10 & mask;
+		// GSVector4i ga10 = (c10 >> 8) & mask;
+
+		vpsrlw(xmm2, xmm1, 8);
+		vpsllw(xmm1, 8);
+		vpsrlw(xmm1, 8);
+
+		// GSVector4i rb11 = c11 & mask;
+		// GSVector4i ga11 = (c11 >> 8) & mask;
+
+		vpsrlw(xmm6, xmm5, 8);
+		vpsllw(xmm5, 8);
+		vpsrlw(xmm5, 8);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = rb10
+		// xmm5 = rb11
+		// xmm2 = ga10
+		// xmm6 = ga11
+		// xmm7 = used
+
+		// rb10 = rb10.lerp16_4(rb11, uf);
+		// ga10 = ga10.lerp16_4(ga11, uf);
+
+		lerp16_4(xmm5, xmm1, xmm0);
+		lerp16_4(xmm6, xmm2, xmm0);
+
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm5 = rb10
+		// xmm6 = ga10
+		// xmm0, xmm1, xmm2 = free
+		// xmm7 = used
+
+		// rb00 = rb00.lerp16_4(rb10, vf);
+		// ga00 = ga00.lerp16_4(ga10, vf);
+
+		vmovdqa(xmm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(xmm5, xmm3, xmm0);
+		lerp16_4(xmm6, xmm4, xmm0);
+	}
+	else
+	{
+		// GSVector4i addr00 = y0 + x0;
+
+		vpaddd(xmm5, xmm2, xmm4);
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector4i mask = GSVector4i::x00ff();
+
+		// c[0] = c00 & mask;
+		// c[1] = (c00 >> 8) & mask;
+
+		// xmm5 = low byte of each 16-bit lane, xmm6 = high byte of each lane
+		vpsllw(xmm5, xmm6, 8);
+		vpsrlw(xmm5, 8);
+		vpsrlw(xmm6, 8);
+	}
+}
+
+// Emits the texture-coordinate wrapping for one packed coordinate register
+// (uv is modified in place). The variant is picked at generation time from
+// m_sel.wms / m_sel.wmt:
+//   - both axes clamp-type:  uv = min(max(uv, lower), t.max), where lower is
+//     t.min when "region" is set and 0 otherwise,
+//   - both axes repeat-type: uv = uv & t.min, then | t.max when "region" is set,
+//   - mixed: both results are computed and merged per byte with t.mask.
+// Clobbers xmm0, and in the mixed case also xmm1, xmm4 and xmm5.
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
+{
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+
+	// ((m + 1) >> 1) & 1 is 1 for m = 1 or 2 and 0 for m = 0 or 3
+	// (assuming the mode is a 2-bit value)
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	// bit 1 of either mode, i.e. set when wms or wmt is 2 or 3
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if(wms_clamp == wmt_clamp)
+	{
+		// same kind of wrapping on both axes: one operation covers all lanes
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				vpmaxsw(uv, ptr[&m_local.gd->t.min]);
+			}
+			else
+			{
+				vpxor(xmm0, xmm0);
+				vpmaxsw(uv, xmm0);
+			}
+
+			vpminsw(uv, ptr[&m_local.gd->t.max]);
+		}
+		else
+		{
+			vpand(uv, ptr[&m_local.gd->t.min]);
+
+			if(region)
+			{
+				vpor(uv, ptr[&m_local.gd->t.max]);
+			}
+		}
+	}
+	else
+	{
+		// the axes differ: compute both variants and select per byte
+		vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+		vmovdqa(xmm5, ptr[&m_local.gd->t.max]);
+		vmovdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		vpand(xmm1, uv, xmm4);
+
+		if(region)
+		{
+			vpor(xmm1, xmm5);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv, xmm4);
+		vpminsw(uv, xmm5);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv, xmm1, xmm0);
+	}
+}
+
+// Two-register form of Wrap(): emits the same wrapping for uv0 and uv1 (both
+// modified in place), loading t.min / t.max / t.mask into registers once and
+// reusing them for both coordinates. Variant selection from m_sel.wms /
+// m_sel.wmt is identical to the single-register overload:
+//   - both axes clamp-type:  min(max(uv, lower), t.max), lower = t.min when
+//     "region" is set and 0 otherwise,
+//   - both axes repeat-type: uv & t.min, then | t.max when "region" is set,
+//   - mixed: both results are computed and merged per byte with t.mask.
+// Clobbers a subset of xmm0, xmm1, xmm4 and xmm5 depending on the variant.
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
+{
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+
+	// ((m + 1) >> 1) & 1 is 1 for m = 1 or 2 and 0 for m = 0 or 3
+	// (assuming the mode is a 2-bit value)
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	// bit 1 of either mode, i.e. set when wms or wmt is 2 or 3
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if(wms_clamp == wmt_clamp)
+	{
+		// same kind of wrapping on both axes
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+				vpmaxsw(uv0, xmm4);
+				vpmaxsw(uv1, xmm4);
+			}
+			else
+			{
+				vpxor(xmm0, xmm0);
+				vpmaxsw(uv0, xmm0);
+				vpmaxsw(uv1, xmm0);
+			}
+
+			vmovdqa(xmm5, ptr[&m_local.gd->t.max]);
+			vpminsw(uv0, xmm5);
+			vpminsw(uv1, xmm5);
+		}
+		else
+		{
+			vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+			vpand(uv0, xmm4);
+			vpand(uv1, xmm4);
+
+			if(region)
+			{
+				vmovdqa(xmm5, ptr[&m_local.gd->t.max]);
+				vpor(uv0, xmm5);
+				vpor(uv1, xmm5);
+			}
+		}
+	}
+	else
+	{
+		// the axes differ: compute both variants per register and select per
+		// byte; xmm1 is reused as the "repeat" temporary for uv0 and then uv1
+		vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+		vmovdqa(xmm5, ptr[&m_local.gd->t.max]);
+		vmovdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		// uv0
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		vpand(xmm1, uv0, xmm4);
+
+		if(region)
+		{
+			vpor(xmm1, xmm5);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv0, xmm4);
+		vpminsw(uv0, xmm5);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv0, xmm1, xmm0);
+
+		// uv1
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		vpand(xmm1, uv1, xmm4);
+
+		if(region)
+		{
+			vpor(xmm1, xmm5);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		vpmaxsw(uv1, xmm4);
+		vpminsw(uv1, xmm5);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		vpblendvb(uv1, xmm1, xmm0);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::SampleTextureLOD()
+{
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	push(ebp);
+
+	mov(ebp, (size_t)m_local.gd->tex);
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	if(!m_sel.fst)
+	{
+		vrcpps(xmm0, xmm4);
+
+		vmulps(xmm2, xmm0);
+		vmulps(xmm3, xmm0);
+
+		vcvttps2dq(xmm2, xmm2);
+		vcvttps2dq(xmm3, xmm3);
+	}
+
+	// xmm2 = u
+	// xmm3 = v
+	// xmm4 = q
+	// xmm0 = xmm1 = xmm5 = xmm6 = free
+
+	// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?)
+
+	if(!m_sel.lcm)
+	{
+		// lod = -log2(Q) * (1 << L) + K
+
+		vpcmpeqd(xmm1, xmm1);
+		vpsrld(xmm1, xmm1, 25);
+		vpslld(xmm0, xmm4, 1);
+		vpsrld(xmm0, xmm0, 24);
+		vpsubd(xmm0, xmm1);
+		vcvtdq2ps(xmm0, xmm0); 
+
+		// xmm0 = (float)(exp(q) - 127)
+
+		vpslld(xmm4, xmm4, 9);
+		vpsrld(xmm4, xmm4, 9);
+		vorps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+			
+		// xmm4 = mant(q) | 1.0f
+
+		if(m_cpu.has(util::Cpu::tFMA))
+		{
+			vmovaps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0
+			vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * xmm4 + c1
+			vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * xmm4 + c1) * xmm4 + c2
+			vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // xmm4 - 1.0f
+			vfmadd213ps(xmm4, xmm5, xmm0); // ((c0 * xmm4 + c1) * xmm4 + c2) * (xmm4 - 1.0f) + xmm0
+		}
+		else
+		{
+			vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
+			vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
+			vmulps(xmm5, xmm4);
+			vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+			vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
+			vmulps(xmm4, xmm5);
+			vaddps(xmm4, xmm0);
+		}
+
+		// xmm4 = log2(Q) = ((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0
+
+		if(m_cpu.has(util::Cpu::tFMA))
+		{
+			vmovaps(xmm5, ptr[&m_local.gd->l]);
+			vfmadd213ps(xmm4, xmm5, ptr[&m_local.gd->k]); 
+		}
+		else
+		{
+			vmulps(xmm4, ptr[&m_local.gd->l]);
+			vaddps(xmm4, ptr[&m_local.gd->k]);
+		}
+
+		// xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000
+
+		vxorps(xmm0, xmm0);
+		vminps(xmm4, ptr[&m_local.gd->mxl]);
+		vmaxps(xmm4, xmm0);
+		vcvtps2dq(xmm4, xmm4);
+
+		if(m_sel.mmin == 1) // round-off mode
+		{
+			mov(eax, 0x8000);
+			vmovd(xmm0, eax);
+			vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			vpaddd(xmm4, xmm0);
+		}
+
+		vpsrld(xmm0, xmm4, 16);
+
+		vmovdqa(ptr[&m_local.temp.lod.i], xmm0);
+/*
+vpslld(xmm5, xmm0, 6);
+vpslld(xmm6, xmm4, 16);
+vpsrld(xmm6, xmm6, 24);
+return;
+*/
+		if(m_sel.mmin == 2) // trilinear mode
+		{
+			vpshuflw(xmm1, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
+			vmovdqa(ptr[&m_local.temp.lod.f], xmm1);
+		}
+
+		// shift u/v/minmax by (int)lod
+
+		if(m_cpu.has(util::Cpu::tAVX2))
+		{
+			vpsravd(xmm2, xmm2, xmm0);
+			vpsravd(xmm3, xmm3, xmm0);
+
+			vmovdqa(ptr[&m_local.temp.uv[0]], xmm2);
+			vmovdqa(ptr[&m_local.temp.uv[1]], xmm3);
+
+			// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
+
+			vpxor(xmm1, xmm1);
+
+			vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+			vpunpcklwd(xmm5, xmm4, xmm1); // minu
+			vpunpckhwd(xmm6, xmm4, xmm1); // minv
+			vpsrlvd(xmm5, xmm5, xmm0);
+			vpsrlvd(xmm6, xmm6, xmm0);
+			vpackusdw(xmm5, xmm6);
+
+			vmovdqa(xmm4, ptr[&m_local.gd->t.max]);
+			vpunpcklwd(xmm6, xmm4, xmm1); // maxu
+			vpunpckhwd(xmm4, xmm4, xmm1); // maxv
+			vpsrlvd(xmm6, xmm6, xmm0);
+			vpsrlvd(xmm4, xmm4, xmm0);
+			vpackusdw(xmm6, xmm4);
+
+			vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
+			vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
+		}
+		else
+		{
+			vmovq(xmm4, ptr[&m_local.gd->t.minmax]);
+
+			vpunpckldq(xmm5, xmm2, xmm3);
+			vpunpckhdq(xmm6, xmm2, xmm3);
+			vmovdqa(xmm2, xmm5);
+			vmovdqa(xmm3, xmm6);
+
+			vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); 
+			vpsrad(xmm2, xmm0);
+			vpsrlw(xmm1, xmm4, xmm0);
+			vmovq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1);
+
+			vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]);
+			vpsrad(xmm5, xmm0);
+			vpsrlw(xmm1, xmm4, xmm0);
+			vmovq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1);
+
+			vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]);
+			vpsrad(xmm3, xmm0);
+			vpsrlw(xmm1, xmm4, xmm0);
+			vmovq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1);
+
+			vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]);
+			vpsrad(xmm6, xmm0);
+			vpsrlw(xmm1, xmm4, xmm0);
+			vmovq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1);
+
+			vpunpckldq(xmm2, xmm3);
+			vpunpckhdq(xmm5, xmm6);
+			vpunpckhdq(xmm3, xmm2, xmm5);
+			vpunpckldq(xmm2, xmm5);
+
+			vmovdqa(ptr[&m_local.temp.uv[0]], xmm2);
+			vmovdqa(ptr[&m_local.temp.uv[1]], xmm3);
+
+			vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
+			vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+
+			vpunpcklwd(xmm0, xmm5, xmm6);
+			vpunpckhwd(xmm1, xmm5, xmm6);
+			vpunpckldq(xmm5, xmm0, xmm1);
+			vpunpckhdq(xmm6, xmm0, xmm1);
+
+			vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
+			vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
+		}
+	}
+	else
+	{
+		// lod = K
+
+		vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]);
+
+		vpsrad(xmm2, xmm0);
+		vpsrad(xmm3, xmm0);
+
+		vmovdqa(ptr[&m_local.temp.uv[0]], xmm2);
+		vmovdqa(ptr[&m_local.temp.uv[1]], xmm3);
+
+		vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
+		vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+	}
+
+	// xmm2 = m_local.temp.uv[0] = u (level m)
+	// xmm3 = m_local.temp.uv[1] = v (level m)
+	// xmm5 = minuv
+	// xmm6 = maxuv
+
+	if(m_sel.ltf)
+	{
+		// u -= 0x8000;
+		// v -= 0x8000;
+
+		mov(eax, 0x8000);
+		vmovd(xmm4, eax);
+		vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+		vpsubd(xmm2, xmm4);
+		vpsubd(xmm3, xmm4);
+
+		// GSVector4i uf = u.xxzzlh().srl16(1);
+	
+		vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(xmm0, 12);
+		vmovdqa(ptr[&m_local.temp.uf], xmm0);
+
+		// GSVector4i vf = v.xxzzlh().srl16(1);
+
+		vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(xmm0, 12);
+		vmovdqa(ptr[&m_local.temp.vf], xmm0);
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	vpsrad(xmm2, 16);
+	vpsrad(xmm3, 16);
+	vpackssdw(xmm2, xmm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		vpcmpeqd(xmm1, xmm1);
+		vpsrlw(xmm1, 15);
+		vpaddw(xmm3, xmm2, xmm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		WrapLOD(xmm2, xmm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		WrapLOD(xmm2);
+	}
+
+	// xmm2 = uv0
+	// xmm3 = uv1 (ltf)
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+	// xmm7 = used
+
+	// GSVector4i x0 = uv0.upl16();
+	// GSVector4i y0 = uv0.uph16() << tw;
+
+	vpxor(xmm0, xmm0);
+
+	vpunpcklwd(xmm4, xmm2, xmm0);
+	vpunpckhwd(xmm2, xmm2, xmm0);
+	vpslld(xmm2, m_sel.tw + 3);
+
+	// xmm0 = 0
+	// xmm2 = y0
+	// xmm3 = uv1 (ltf)
+	// xmm4 = x0
+	// xmm1, xmm5, xmm6 = free
+	// xmm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i x1 = uv1.upl16();
+		// GSVector4i y1 = uv1.uph16() << tw;
+
+		vpunpcklwd(xmm6, xmm3, xmm0);
+		vpunpckhwd(xmm3, xmm3, xmm0);
+		vpslld(xmm3, m_sel.tw + 3);
+
+		// xmm2 = y0
+		// xmm3 = y1
+		// xmm4 = x0
+		// xmm6 = x1
+		// xmm0, xmm5, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i addr00 = y0 + x0;
+		// GSVector4i addr01 = y0 + x1;
+		// GSVector4i addr10 = y1 + x0;
+		// GSVector4i addr11 = y1 + x1;
+
+		vpaddd(xmm5, xmm2, xmm4);
+		vpaddd(xmm2, xmm2, xmm6);
+		vpaddd(xmm0, xmm3, xmm4);
+		vpaddd(xmm3, xmm3, xmm6);
+
+		// xmm5 = addr00
+		// xmm2 = addr01
+		// xmm0 = addr10
+		// xmm3 = addr11
+		// xmm1, xmm4, xmm6 = free
+		// xmm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// xmm6 = c00
+		// xmm4 = c01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm0, xmm2, xmm3 = free
+		// xmm7 = used
+
+		vmovdqa(xmm0, ptr[&m_local.temp.uf]);
+
+		// GSVector4i rb00 = c00 & mask;
+		// GSVector4i ga00 = (c00 >> 8) & mask;
+
+		vpsllw(xmm2, xmm6, 8);
+		vpsrlw(xmm2, 8);
+		vpsrlw(xmm6, 8);
+
+		// GSVector4i rb01 = c01 & mask;
+		// GSVector4i ga01 = (c01 >> 8) & mask;
+
+		vpsllw(xmm3, xmm4, 8);
+		vpsrlw(xmm3, 8);
+		vpsrlw(xmm4, 8);
+
+		// xmm0 = uf
+		// xmm2 = rb00
+		// xmm3 = rb01
+		// xmm6 = ga00
+		// xmm4 = ga01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm7 = used
+
+		// rb00 = rb00.lerp16_4(rb01, uf);
+		// ga00 = ga00.lerp16_4(ga01, uf);
+
+		lerp16_4(xmm3, xmm2, xmm0);
+		lerp16_4(xmm4, xmm6, xmm0);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm2, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i rb10 = c10 & mask;
+		// GSVector4i ga10 = (c10 >> 8) & mask;
+
+		vpsrlw(xmm2, xmm1, 8);
+		vpsllw(xmm1, 8);
+		vpsrlw(xmm1, 8);
+
+		// GSVector4i rb11 = c11 & mask;
+		// GSVector4i ga11 = (c11 >> 8) & mask;
+
+		vpsrlw(xmm6, xmm5, 8);
+		vpsllw(xmm5, 8);
+		vpsrlw(xmm5, 8);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = rb10
+		// xmm5 = rb11
+		// xmm2 = ga10
+		// xmm6 = ga11
+		// xmm7 = used
+
+		// rb10 = rb10.lerp16_4(rb11, uf);
+		// ga10 = ga10.lerp16_4(ga11, uf);
+
+		lerp16_4(xmm5, xmm1, xmm0);
+		lerp16_4(xmm6, xmm2, xmm0);
+
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm5 = rb10
+		// xmm6 = ga10
+		// xmm0, xmm1, xmm2 = free
+		// xmm7 = used
+
+		// rb00 = rb00.lerp16_4(rb10, vf);
+		// ga00 = ga00.lerp16_4(ga10, vf);
+
+		vmovdqa(xmm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(xmm5, xmm3, xmm0);
+		lerp16_4(xmm6, xmm4, xmm0);
+	}
+	else
+	{
+		// GSVector4i addr00 = y0 + x0;
+
+		vpaddd(xmm5, xmm2, xmm4);
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector4i mask = GSVector4i::x00ff();
+
+		// c[0] = c00 & mask;
+		// c[1] = (c00 >> 8) & mask;
+
+		vpsllw(xmm5, xmm6, 8);
+		vpsrlw(xmm5, 8);
+		vpsrlw(xmm6, 8);
+	}
+
+	if(m_sel.mmin != 1) // !round-off mode
+	{
+		vmovdqa(ptr[&m_local.temp.trb], xmm5);
+		vmovdqa(ptr[&m_local.temp.tga], xmm6);
+
+		vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]);
+		vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]);
+
+		vpsrad(xmm2, 1);
+		vpsrad(xmm3, 1);
+
+		vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
+		vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+
+		vpsrlw(xmm5, 1);
+		vpsrlw(xmm6, 1);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			vmovd(xmm4, eax);
+			vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+			vpsubd(xmm2, xmm4);
+			vpsubd(xmm3, xmm4);
+
+			// GSVector4i uf = u.xxzzlh().srl16(1);
+	
+			vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(xmm0, 12);
+			vmovdqa(ptr[&m_local.temp.uf], xmm0);
+
+			// GSVector4i vf = v.xxzzlh().srl16(1);
+
+			vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(xmm0, 12);
+			vmovdqa(ptr[&m_local.temp.vf], xmm0);
+		}
+
+		// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+		vpsrad(xmm2, 16);
+		vpsrad(xmm3, 16);
+		vpackssdw(xmm2, xmm3);
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+			vpcmpeqd(xmm1, xmm1);
+			vpsrlw(xmm1, 15);
+			vpaddw(xmm3, xmm2, xmm1);
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			WrapLOD(xmm2, xmm3);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			WrapLOD(xmm2);
+		}
+
+		// xmm2 = uv0
+		// xmm3 = uv1 (ltf)
+		// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i x0 = uv0.upl16();
+		// GSVector4i y0 = uv0.uph16() << tw;
+
+		vpxor(xmm0, xmm0);
+
+		vpunpcklwd(xmm4, xmm2, xmm0);
+		vpunpckhwd(xmm2, xmm2, xmm0);
+		vpslld(xmm2, m_sel.tw + 3);
+
+		// xmm0 = 0
+		// xmm2 = y0
+		// xmm3 = uv1 (ltf)
+		// xmm4 = x0
+		// xmm1, xmm5, xmm6 = free
+		// xmm7 = used
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i x1 = uv1.upl16();
+			// GSVector4i y1 = uv1.uph16() << tw;
+
+			vpunpcklwd(xmm6, xmm3, xmm0);
+			vpunpckhwd(xmm3, xmm3, xmm0);
+			vpslld(xmm3, m_sel.tw + 3);
+
+			// xmm2 = y0
+			// xmm3 = y1
+			// xmm4 = x0
+			// xmm6 = x1
+			// xmm0, xmm5, xmm6 = free
+			// xmm7 = used
+
+			// GSVector4i addr00 = y0 + x0;
+			// GSVector4i addr01 = y0 + x1;
+			// GSVector4i addr10 = y1 + x0;
+			// GSVector4i addr11 = y1 + x1;
+
+			vpaddd(xmm5, xmm2, xmm4);
+			vpaddd(xmm2, xmm2, xmm6);
+			vpaddd(xmm0, xmm3, xmm4);
+			vpaddd(xmm3, xmm3, xmm6);
+
+			// xmm5 = addr00
+			// xmm2 = addr01
+			// xmm0 = addr10
+			// xmm3 = addr11
+			// xmm1, xmm4, xmm6 = free
+			// xmm7 = used
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(4, 1);
+
+			// xmm6 = c00
+			// xmm4 = c01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm0, xmm2, xmm3 = free
+			// xmm7 = used
+
+			vmovdqa(xmm0, ptr[&m_local.temp.uf]);
+
+			// GSVector4i rb00 = c00 & mask;
+			// GSVector4i ga00 = (c00 >> 8) & mask;
+
+			vpsllw(xmm2, xmm6, 8);
+			vpsrlw(xmm2, 8);
+			vpsrlw(xmm6, 8);
+
+			// GSVector4i rb01 = c01 & mask;
+			// GSVector4i ga01 = (c01 >> 8) & mask;
+
+			vpsllw(xmm3, xmm4, 8);
+			vpsrlw(xmm3, 8);
+			vpsrlw(xmm4, 8);
+
+			// xmm0 = uf
+			// xmm2 = rb00
+			// xmm3 = rb01
+			// xmm6 = ga00
+			// xmm4 = ga01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm7 = used
+
+			// rb00 = rb00.lerp16_4(rb01, uf);
+			// ga00 = ga00.lerp16_4(ga01, uf);
+
+			lerp16_4(xmm3, xmm2, xmm0);
+			lerp16_4(xmm4, xmm6, xmm0);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm2, xmm6 = free
+			// xmm7 = used
+
+			// GSVector4i rb10 = c10 & mask;
+			// GSVector4i ga10 = (c10 >> 8) & mask;
+
+			vpsrlw(xmm2, xmm1, 8);
+			vpsllw(xmm1, 8);
+			vpsrlw(xmm1, 8);
+
+			// GSVector4i rb11 = c11 & mask;
+			// GSVector4i ga11 = (c11 >> 8) & mask;
+
+			vpsrlw(xmm6, xmm5, 8);
+			vpsllw(xmm5, 8);
+			vpsrlw(xmm5, 8);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = rb10
+			// xmm5 = rb11
+			// xmm2 = ga10
+			// xmm6 = ga11
+			// xmm7 = used
+
+			// rb10 = rb10.lerp16_4(rb11, uf);
+			// ga10 = ga10.lerp16_4(ga11, uf);
+
+			lerp16_4(xmm5, xmm1, xmm0);
+			lerp16_4(xmm6, xmm2, xmm0);
+
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm5 = rb10
+			// xmm6 = ga10
+			// xmm0, xmm1, xmm2 = free
+			// xmm7 = used
+
+			// rb00 = rb00.lerp16_4(rb10, vf);
+			// ga00 = ga00.lerp16_4(ga10, vf);
+
+			vmovdqa(xmm0, ptr[&m_local.temp.vf]);
+
+			lerp16_4(xmm5, xmm3, xmm0);
+			lerp16_4(xmm6, xmm4, xmm0);
+		}
+		else
+		{
+			// GSVector4i addr00 = y0 + x0;
+
+			vpaddd(xmm5, xmm2, xmm4);
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(1, 1);
+
+			// GSVector4i mask = GSVector4i::x00ff();
+
+			// c[0] = c00 & mask;
+			// c[1] = (c00 >> 8) & mask;
+
+			vpsllw(xmm5, xmm6, 8);
+			vpsrlw(xmm5, 8);
+			vpsrlw(xmm6, 8);
+		}
+
+		vmovdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
+		vpsrlw(xmm0, xmm0, 1);
+
+		vmovdqa(xmm2, ptr[&m_local.temp.trb]);
+		vmovdqa(xmm3, ptr[&m_local.temp.tga]);
+
+		lerp16(xmm5, xmm2, xmm0, 0);
+		lerp16(xmm6, xmm3, xmm0, 0);
+	}
+
+	pop(ebp);
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
+{
+	// Emits the wrap-mode handling (selected by m_sel.wms / m_sel.wmt) for the
+	// packed 16-bit coordinates held in uv. The result is left in uv.
+	//
+	// in:
+	// xmm5 = minuv
+	// xmm6 = maxuv
+	// clobbered: xmm0, xmm1 (depending on the mode)
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// s and t use different modes: build both results and pick one per
+		// byte with m_local.gd->t.mask
+
+		vmovdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		// xmm1 = repeat = (t & min) [| max]
+
+		vpand(xmm1, uv, xmm5);
+
+		if(use_region) vpor(xmm1, xmm6);
+
+		// uv = clamp = t.sat_i16(min, max)
+
+		vpmaxsw(uv, xmm5);
+		vpminsw(uv, xmm6);
+
+		// uv = clamp.blend8(repeat, mask)
+
+		vpblendvb(uv, xmm1, xmm0);
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// both repeat: uv = (uv & min) [| max]
+
+		vpand(uv, xmm5);
+
+		if(use_region) vpor(uv, xmm6);
+
+		return;
+	}
+
+	// both clamp: the lower bound is minuv in region mode, zero otherwise
+
+	if(!use_region) vpxor(xmm0, xmm0);
+
+	vpmaxsw(uv, use_region ? xmm5 : xmm0);
+	vpminsw(uv, xmm6);
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
+{
+	// Two-register variant: emits the wrap-mode handling (selected by
+	// m_sel.wms / m_sel.wmt) for both uv0 and uv1, in place.
+	//
+	// in:
+	// xmm5 = minuv
+	// xmm6 = maxuv
+	// clobbered: xmm0, xmm1 (depending on the mode)
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// s and t use different modes: for each register build both results
+		// and pick one per byte with m_local.gd->t.mask (kept in xmm0)
+
+		vmovdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		const Xmm* const coords[2] = {&uv0, &uv1};
+
+		for(int k = 0; k < 2; k++)
+		{
+			const Xmm& uv = *coords[k];
+
+			// xmm1 = repeat = (t & min) [| max]
+
+			vpand(xmm1, uv, xmm5);
+
+			if(use_region) vpor(xmm1, xmm6);
+
+			// uv = clamp = t.sat_i16(min, max)
+
+			vpmaxsw(uv, xmm5);
+			vpminsw(uv, xmm6);
+
+			// uv = clamp.blend8(repeat, mask)
+
+			vpblendvb(uv, xmm1, xmm0);
+		}
+
+		return;
+	}
+
+	if(clamp_s)
+	{
+		// both clamp: the lower bound is minuv in region mode, zero otherwise
+
+		if(!use_region) vpxor(xmm0, xmm0);
+
+		const Xmm& lower = use_region ? xmm5 : xmm0;
+
+		vpmaxsw(uv0, lower);
+		vpmaxsw(uv1, lower);
+
+		vpminsw(uv0, xmm6);
+		vpminsw(uv1, xmm6);
+	}
+	else
+	{
+		// both repeat: uv = (uv & min) [| max]
+
+		vpand(uv0, xmm5);
+		vpand(uv1, xmm5);
+
+		if(use_region)
+		{
+			vpor(uv0, xmm6);
+			vpor(uv1, xmm6);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{
+	// Emits the green/alpha half of the texture function selected by
+	// m_sel.tfx, followed by the optional aa1 coverage handling. Nothing is
+	// emitted when m_sel.fb is clear.
+	//
+	// xmm6 = gat (updated in place)
+	// xmm4 = ga loaded from m_local.temp.ga (iip) or m_local.c.ga
+	// xmm3 = scratch register handed to clamp16 / mix16
+	// xmm2 = unshifted copy of ga, written for TFX_HIGHLIGHT and for
+	//        TFX_HIGHLIGHT2 without tcc; ColorTFX() reads xmm2 afterwards
+
+	if(!m_sel.fb)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+		vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+		// gat = gat.modulate16<1>(ga).clamp8();
+
+		modulate16(xmm6, xmm4, 1);
+
+		clamp16(xmm6, xmm3);
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			vpsrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_DECAL:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+			vpsrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_HIGHLIGHT:
+
+		// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+		vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		// keep the unshifted ga in xmm2 before xmm4 is shifted below
+		vmovdqa(xmm2, xmm4);
+
+		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+		vpsrlw(xmm4, 7);
+
+		if(m_sel.tcc)
+		{
+			vpaddusb(xmm4, xmm6);
+		}
+
+		mix16(xmm6, xmm4, xmm3);
+
+		break;
+
+	case TFX_HIGHLIGHT2:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+			// keep the unshifted ga in xmm2 (with tcc set, ColorTFX() loads it itself)
+			vmovdqa(xmm2, xmm4);
+
+			vpsrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_NONE:
+
+		// gat = iip ? ga.srl16(7) : ga;
+
+		if(m_sel.iip)
+		{
+			vpsrlw(xmm6, 7);
+		}
+
+		break;
+	}
+
+	if(m_sel.aa1)
+	{
+		// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+		// FIXME: bios config screen cubes
+
+		if(!m_sel.abe)
+		{
+			// a = cov
+
+			if(m_sel.edge)
+			{
+				vmovdqa(xmm0, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				// build 0x0080 in every 16-bit lane: all ones -> 0x8000 -> 0x0080
+				vpcmpeqd(xmm0, xmm0);
+				vpsllw(xmm0, 15);
+				vpsrlw(xmm0, 8);
+			}
+
+			mix16(xmm6, xmm0, xmm1);
+		}
+		else
+		{
+			// a = a == 0x80 ? cov : a
+
+			// xmm0 = 0x0080 in every 16-bit lane
+			vpcmpeqd(xmm0, xmm0);
+			vpsllw(xmm0, 15);
+			vpsrlw(xmm0, 8);
+
+			if(m_sel.edge)
+			{
+				vmovdqa(xmm1, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				vmovdqa(xmm1, xmm0);
+			}
+
+			// compare every word with 0x80, then clear the low word of each
+			// dword so only the high (alpha) word can be replaced by the blend
+			vpcmpeqw(xmm0, xmm6);
+			vpsrld(xmm0, 16);
+			vpslld(xmm0, 16);
+
+			vpblendvb(xmm6, xmm1, xmm0);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{
+	// Loads the write masks that later stages OR test results into.
+
+	// xmm3 = fm, only needed when the frame buffer is written
+
+	if(m_sel.fwrite) vmovdqa(xmm3, ptr[&m_local.gd->fm]);
+
+	// xmm4 = zm, only needed when the z buffer is written
+
+	if(m_sel.zwrite) vmovdqa(xmm4, ptr[&m_local.gd->zm]);
+}
+
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{
+	// Emits the alpha test selected by m_sel.atst and applies the failure
+	// action selected by m_sel.afail.
+	//
+	// in:  xmm6 = ga (alpha is compared as ga >> 16), xmm7 = test,
+	//      xmm3 = fm, xmm4 = zm
+	// tmp: xmm1 = t, the per-pixel mask produced by the comparison; xmm0 scratch
+	// out: t is OR-ed into test, zm and/or fm depending on afail
+
+	// Nothing is emitted when the buffer whose mask would receive t is not
+	// written at all.
+
+	switch(m_sel.afail)
+	{
+	case AFAIL_FB_ONLY:
+		if(!m_sel.zwrite) return;
+		break;
+
+	case AFAIL_ZB_ONLY:
+		if(!m_sel.fwrite) return;
+		break;
+
+	case AFAIL_RGB_ONLY:
+		if(!m_sel.zwrite && m_sel.fpsm == 1) return;
+		break;
+	}
+
+	switch(m_sel.atst)
+	{
+	case ATST_NEVER:
+		// t = GSVector4i::xffffffff();
+		vpcmpeqd(xmm1, xmm1);
+		break;
+
+	case ATST_ALWAYS:
+		// no mask is needed, so the afail handling below is skipped as well
+		return;
+
+	case ATST_LESS:
+	case ATST_LEQUAL:
+		// NOTE(review): both modes share one comparison; presumably aref is
+		// pre-adjusted by the caller for one of them - confirm
+		// t = (ga >> 16) > m_local.gd->aref;
+		vpsrld(xmm1, xmm6, 16);
+		vpcmpgtd(xmm1, ptr[&m_local.gd->aref]);
+		break;
+
+	case ATST_EQUAL:
+		// t = (ga >> 16) != m_local.gd->aref;
+		vpsrld(xmm1, xmm6, 16);
+		vpcmpeqd(xmm1, ptr[&m_local.gd->aref]);
+		// invert the equality mask by xor-ing with all ones
+		vpcmpeqd(xmm0, xmm0);
+		vpxor(xmm1, xmm0);
+		break;
+
+	case ATST_GEQUAL:
+	case ATST_GREATER:
+		// NOTE(review): both modes share one comparison; presumably aref is
+		// pre-adjusted by the caller for one of them - confirm
+		// t = (ga >> 16) < m_local.gd->aref;
+		vpsrld(xmm0, xmm6, 16);
+		vmovdqa(xmm1, ptr[&m_local.gd->aref]);
+		vpcmpgtd(xmm1, xmm0);
+		break;
+
+	case ATST_NOTEQUAL:
+		// t = (ga >> 16) == m_local.gd->aref;
+		vpsrld(xmm1, xmm6, 16);
+		vpcmpeqd(xmm1, ptr[&m_local.gd->aref]);
+		break;
+	}
+
+	switch(m_sel.afail)
+	{
+	case AFAIL_KEEP:
+		// test |= t;
+		vpor(xmm7, xmm1);
+		alltrue();
+		break;
+
+	case AFAIL_FB_ONLY:
+		// zm |= t;
+		vpor(xmm4, xmm1);
+		break;
+
+	case AFAIL_ZB_ONLY:
+		// fm |= t;
+		vpor(xmm3, xmm1);
+		break;
+
+	case AFAIL_RGB_ONLY:
+		// zm |= t;
+		vpor(xmm4, xmm1);
+		// fm |= t & GSVector4i::xff000000();
+		// (shifting right then left by 24 keeps only the top byte of each dword)
+		vpsrld(xmm1, 24);
+		vpslld(xmm1, 24);
+		vpor(xmm3, xmm1);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{
+	// Emits the red/blue half of the texture function selected by m_sel.tfx
+	// (and, for the highlight modes, the remaining green/alpha work).
+	// Nothing is emitted when the frame buffer is not written.
+	//
+	// xmm5 = rbt (updated in place)
+	// xmm6 = gat (updated in place by the highlight modes)
+	// xmm2 = ga for the highlight modes; it is loaded here only for
+	//        TFX_HIGHLIGHT2 with tcc, otherwise it is expected to have been
+	//        left in xmm2 by AlphaTFX()
+	// xmm0, xmm1 = scratch
+
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).clamp8();
+
+		modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		clamp16(xmm5, xmm1);
+
+		break;
+
+	case TFX_DECAL:
+
+		break;
+
+	case TFX_HIGHLIGHT:
+	case TFX_HIGHLIGHT2:
+
+		if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		}
+
+		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+
+		// xmm1 = gat before modulation, used by the mix16 below
+		vmovdqa(xmm1, xmm6);
+
+		modulate16(xmm6, xmm2, 1);
+
+		// xmm2 = af: replicate words 1 and 3 of each 64-bit half across their
+		// dword, then shift right by 7
+		vpshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+		vpshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+		vpsrlw(xmm2, 7);
+
+		vpaddw(xmm6, xmm2);
+
+		clamp16(xmm6, xmm0);
+
+		mix16(xmm6, xmm1, xmm0);
+
+		// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+		modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		vpaddw(xmm5, xmm2);
+
+		clamp16(xmm5, xmm0);
+
+		break;
+
+	case TFX_NONE:
+
+		// rbt = iip ? rb.srl16(7) : rb;
+
+		if(m_sel.iip)
+		{
+			vpsrlw(xmm5, 7);
+		}
+
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{
+	// Emits the fog blend; skipped unless the frame buffer is written and
+	// fogging (fge) is enabled.
+	//
+	// rb = m_local.gd->frb.lerp16<0>(rb, f);
+	// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
+
+	if(!(m_sel.fwrite && m_sel.fge))
+	{
+		return;
+	}
+
+	// xmm0 = f, taken from m_local.p.f for sprites and m_local.temp.f otherwise
+
+	if(m_sel.prim == GS_SPRITE_CLASS)
+	{
+		vmovdqa(xmm0, ptr[&m_local.p.f]);
+	}
+	else
+	{
+		vmovdqa(xmm0, ptr[&m_local.temp.f]);
+	}
+
+	// xmm1 = ga before fogging, consumed by the final mix16
+
+	vmovdqa(xmm1, xmm6);
+
+	// red/blue
+
+	vmovdqa(xmm2, ptr[&m_local.gd->frb]);
+	lerp16(xmm5, xmm2, xmm0, 0);
+
+	// green/alpha
+
+	vmovdqa(xmm2, ptr[&m_local.gd->fga]);
+	lerp16(xmm6, xmm2, xmm0, 0);
+	mix16(xmm6, xmm1, xmm0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{
+	// Computes the frame buffer address into ebx and, when rfb is set, reads
+	// the destination pixels into xmm2. Nothing is emitted without fb.
+
+	if(m_sel.fb)
+	{
+		// int fa = fza_base.x + fza_offset->x;
+
+		mov(ebx, ptr[esi]);
+		add(ebx, ptr[edi]);
+
+		// xmm2 = fd
+
+		if(m_sel.rfb) ReadPixel(xmm2, ebx);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{
+	// Emits the destination alpha test. Only emitted when date is set and
+	// fpsm is 0 or 2.
+	//
+	// in:  xmm2 = fd, xmm7 = test
+	// tmp: xmm0, xmm1
+	//
+	// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
+
+	if(!m_sel.date)
+	{
+		return;
+	}
+
+	if(m_sel.fpsm != 0 && m_sel.fpsm != 2)
+	{
+		return;
+	}
+
+	if(m_sel.fpsm == 2)
+	{
+		// fpsm 2: shift left by 16 so that bit 15 becomes the sign bit,
+		// then spread it over the whole dword
+
+		if(m_sel.datm) vpxor(xmm0, xmm0);
+
+		vpslld(xmm1, xmm2, 16);
+		vpsrad(xmm1, 31);
+
+		// datm: invert the mask by comparing it with zero
+
+		if(m_sel.datm) vpcmpeqd(xmm1, xmm0);
+	}
+	else if(m_sel.datm)
+	{
+		// fpsm 0, datm: invert fd first, then spread the sign bit
+
+		vpcmpeqd(xmm0, xmm0);
+		vpxor(xmm1, xmm2, xmm0);
+		vpsrad(xmm1, 31);
+	}
+	else
+	{
+		// fpsm 0, !datm: spread the sign bit of fd
+
+		vpsrad(xmm1, xmm2, 31);
+	}
+
+	vpor(xmm7, xmm1);
+
+	alltrue();
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{
+	// Folds the accumulated test mask (xmm7) into fm (xmm3) and zm (xmm4),
+	// then leaves in edx a bit mask (fzm) that is set where at least one bit
+	// may still be written. Nothing is emitted in notest mode.
+
+	if(m_sel.notest)
+	{
+		return;
+	}
+
+	const bool fw = m_sel.fwrite != 0;
+	const bool zw = m_sel.zwrite != 0;
+
+	// fm |= test;
+
+	if(fw) vpor(xmm3, xmm7);
+
+	// zm |= test;
+
+	if(zw) vpor(xmm4, xmm7);
+
+	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+
+	vpcmpeqd(xmm1, xmm1);
+
+	if(fw && zw)
+	{
+		// low half of the packed result comes from fm, high half from zm
+
+		vpcmpeqd(xmm0, xmm1, xmm4);
+		vpcmpeqd(xmm1, xmm3);
+		vpackssdw(xmm1, xmm0);
+	}
+	else if(fw || zw)
+	{
+		// only one buffer is written: pack its comparison with itself
+
+		vpcmpeqd(xmm1, fw ? xmm3 : xmm4);
+		vpackssdw(xmm1, xmm1);
+	}
+
+	vpmovmskb(edx, xmm1);
+
+	not(edx);
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{
+	// Emits the z buffer store; skipped when zwrite is clear.
+	//
+	// ebp = za, dh = z part of fzm, xmm4 = zm
+
+	if(!m_sel.zwrite)
+	{
+		return;
+	}
+
+	// xmm1 = zs, taken from m_local.p.z for sprites and m_local.temp.zs otherwise
+
+	if(m_sel.prim == GS_SPRITE_CLASS)
+	{
+		vmovdqa(xmm1, ptr[&m_local.p.z]);
+	}
+	else
+	{
+		vmovdqa(xmm1, ptr[&m_local.temp.zs]);
+	}
+
+	const bool merge_zd = m_sel.ztest && m_sel.zpsm < 2;
+
+	if(merge_zd)
+	{
+		// zs = zs.blend8(zd, zm);
+
+		vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
+	}
+
+	// whole 64-bit stores are possible when zd was merged above, or when the
+	// format is 0 and no per-pixel test applies
+
+	bool fast;
+
+	if(m_sel.ztest)
+	{
+		fast = m_sel.zpsm < 2;
+	}
+	else
+	{
+		fast = m_sel.zpsm == 0 && m_sel.notest;
+	}
+
+	WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
+}
+
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{
+	// Emits the blend stage. In the pseudo-code comments below, c[0]/c[1] are
+	// the source rb/ga, c[2]/c[3] the destination rb/ga unpacked from fd, and
+	// the selectors aba, abb, abc, abd pick the inputs of
+	//   out = (c[aba] - c[abb]) * a + c[abd]
+	// evaluated separately for the rb and ga halves.
+	//
+	// in:  xmm5, xmm6 = src rb, ga; xmm2 = fd
+	// out: xmm5, xmm6 = blended rb, ga
+	//
+	// Nothing is emitted when the frame buffer is not written or when both
+	// abe and aa1 are off.
+
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.abe == 0 && m_sel.aa1 == 0)
+	{
+		return;
+	}
+
+	// the destination colour is only unpacked when one of the selectors that
+	// are actually evaluated has the value 1
+
+	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
+	{
+		switch(m_sel.fpsm)
+		{
+		case 0:
+		case 1:
+
+			// c[2] = fd & mask;
+			// c[3] = (fd >> 8) & mask;
+
+			vpsllw(xmm0, xmm2, 8);
+			vpsrlw(xmm0, 8);
+			vpsrlw(xmm1, xmm2, 8);
+
+			break;
+
+		case 2:
+
+			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+			// xmm7 holds the field mask; each mask is derived from the
+			// previous one by shifting
+
+			vpcmpeqd(xmm7, xmm7);
+
+			vpsrld(xmm7, 27); // 0x0000001f
+			vpand(xmm0, xmm2, xmm7);
+			vpslld(xmm0, 3);
+
+			vpslld(xmm7, 10); // 0x00007c00
+			vpand(xmm4, xmm2, xmm7);
+			vpslld(xmm4, 9);
+
+			vpor(xmm0, xmm4);
+
+			vpsrld(xmm7, 5); // 0x000003e0
+			vpand(xmm1, xmm2, xmm7);
+			vpsrld(xmm1, 2);
+
+			vpsllw(xmm7, 10); // 0x00008000
+			vpand(xmm4, xmm2, xmm7);
+			vpslld(xmm4, 8);
+
+			vpor(xmm1, xmm4);
+
+			break;
+		}
+	}
+
+	// xmm5, xmm6 = src rb, ga
+	// xmm0, xmm1 = dst rb, ga
+	// xmm2, xmm3 = used
+	// xmm4, xmm7 = free
+
+	// keep a copy of the source rb in xmm4 when it is read again after xmm5
+	// has been overwritten (pabe blend, abb == 0 or abd == 0)
+
+	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
+	{
+		vmovdqa(xmm4, xmm5);
+	}
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// rb = c[aba * 2 + 0];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm5, xmm0); break;
+		case 2: vpxor(xmm5, xmm5); break;
+		}
+
+		// rb = rb.sub16(c[abb * 2 + 0]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(xmm5, xmm4); break;
+		case 1: vpsubw(xmm5, xmm0); break;
+		case 2: break;
+		}
+
+		// the multiplication by a is not emitted for fpsm == 1 with abc == 1
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
+
+			switch(m_sel.abc)
+			{
+			case 0:
+			case 1:
+				vpshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1));
+				vpshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
+				vpsllw(xmm7, 7);
+				break;
+			case 2:
+				vmovdqa(xmm7, ptr[&m_local.gd->afix]);
+				break;
+			}
+
+			// rb = rb.modulate16<1>(a);
+
+			modulate16(xmm5, xmm7, 1);
+		}
+
+		// rb = rb.add16(c[abd * 2 + 0]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(xmm5, xmm4); break;
+		case 1: vpaddw(xmm5, xmm0); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// rb = c[abd * 2 + 0];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm5, xmm0); break;
+		case 2: vpxor(xmm5, xmm5); break;
+		}
+	}
+
+	if(m_sel.pabe)
+	{
+		// mask = (c[1] << 8).sra32(31);
+
+		vpslld(xmm0, xmm6, 8);
+		vpsrad(xmm0, 31);
+
+		// rb = c[0].blend8(rb, mask);
+
+		vpblendvb(xmm5, xmm4, xmm5, xmm0);
+	}
+
+	// xmm6 = src ga
+	// xmm1 = dst ga
+	// xmm5 = rb
+	// xmm7 = a
+	// xmm2, xmm3 = used
+	// xmm0, xmm4 = free
+
+	// xmm4 now keeps the source ga (the source rb copy is no longer needed)
+
+	vmovdqa(xmm4, xmm6);
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// ga = c[aba * 2 + 1];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm6, xmm1); break;
+		case 2: vpxor(xmm6, xmm6); break;
+		}
+
+		// ga = ga.sub16(c[abb * 2 + 1]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(xmm6, xmm4); break;
+		case 1: vpsubw(xmm6, xmm1); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// ga = ga.modulate16<1>(a);
+
+			modulate16(xmm6, xmm7, 1);
+		}
+
+		// ga = ga.add16(c[abd * 2 + 1]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(xmm6, xmm4); break;
+		case 1: vpaddw(xmm6, xmm1); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// ga = c[abd * 2 + 1];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(xmm6, xmm1); break;
+		case 2: vpxor(xmm6, xmm6); break;
+		}
+	}
+
+	// xmm4 = src ga
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm2, xmm3 = used
+	// xmm0, xmm1, xmm7 = free
+
+	if(m_sel.pabe)
+	{
+		// xmm0 still holds the pabe mask computed for the rb half above
+
+		vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+		vpblendvb(xmm6, xmm4, xmm6, xmm0);
+	}
+	else
+	{
+		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		{
+			mix16(xmm6, xmm4, xmm7);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{
+	// Emits the frame buffer store: optional dithering, colour clamp/wrap,
+	// packing of rb/ga into pixels, optional conversion to the 16-bit layout
+	// (fpsm == 2), merging with fd under fm, and the final write.
+	//
+	// in:  xmm5, xmm6 = rb, ga; xmm2 = fd; xmm3 = fm; ebx = fa; dl = frame part of fzm
+	// tmp: eax, ebp, xmm4, xmm6, xmm7
+
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.fpsm == 2 && m_sel.dthe)
+	{
+		// eax = (top & 3) * 32: byte offset of the row inside the table at
+		// m_local.gd->dimx, from which one vector is added to rb and the
+		// next one to ga
+
+		mov(eax, ptr[esp + _top]);
+		and(eax, 3);
+		shl(eax, 5);
+		mov(ebp, ptr[&m_local.gd->dimx]);
+		vpaddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
+		vpaddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
+	}
+
+	if(m_sel.colclamp == 0)
+	{
+		// c[0] &= 0x00ff00ff;
+		// c[1] &= 0x00ff00ff;
+
+		vpcmpeqd(xmm7, xmm7);
+		vpsrlw(xmm7, 8);
+		vpand(xmm5, xmm7);
+		vpand(xmm6, xmm7);
+	}
+
+	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+
+	vpunpckhwd(xmm7, xmm5, xmm6);
+	vpunpcklwd(xmm5, xmm6);
+	vpackuswb(xmm5, xmm7);
+
+	if(m_sel.fba && m_sel.fpsm != 1)
+	{
+		// fs |= 0x80000000;
+
+		vpcmpeqd(xmm7, xmm7);
+		vpslld(xmm7, 31);
+		vpor(xmm5, xmm7);
+	}
+
+	if(m_sel.fpsm == 2)
+	{
+		// GSVector4i rb = fs & 0x00f800f8;
+		// GSVector4i ga = fs & 0x8000f800;
+
+		// broadcast each 32-bit constant to all four lanes
+
+		mov(eax, 0x00f800f8);
+		vmovd(xmm6, eax);
+		vpshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+
+		mov(eax, 0x8000f800);
+		vmovd(xmm7, eax);
+		vpshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+
+		// xmm4 = rb, xmm5 = ga
+
+		vpand(xmm4, xmm5, xmm6);
+		vpand(xmm5, xmm7);
+
+		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		vpsrld(xmm6, xmm4, 9);
+		vpsrld(xmm4, 3);
+		vpsrld(xmm7, xmm5, 16);
+		vpsrld(xmm5, 6);
+
+		vpor(xmm5, xmm4);
+		vpor(xmm7, xmm6);
+		vpor(xmm5, xmm7);
+	}
+
+	if(m_sel.rfb)
+	{
+		// fs = fs.blend(fd, fm);
+
+		blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
+	}
+
+	// whole 64-bit stores are possible when fd was merged above (fpsm < 2),
+	// or when the format is 0 and no per-pixel test applies
+
+	bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
+
+	WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
+{
+	// Loads two 64-bit groups from video memory into dst: the low half from
+	// vm + addr * 2, the high half from 8 * 2 bytes further on.
+
+	const size_t vm_base = (size_t)m_local.gd->vm;
+
+	vmovq(dst, qword[addr * 2 + vm_base]);
+	vmovhps(dst, qword[addr * 2 + vm_base + 8 * 2]);
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
+{
+	// Emits the stores for the four pixels held in src.
+	//
+	// fast:  two 64-bit stores (low half at addr, high half 8 * 2 bytes on)
+	// !fast: four per-pixel stores through WritePixel(src, addr, i, psm)
+	//
+	// Unless m_sel.notest is set, every store is guarded by a test of the
+	// matching bits in mask and skipped when they are all clear.
+
+	const size_t vm_base = (size_t)m_local.gd->vm;
+	const bool guarded = !m_sel.notest;
+
+	if(fast)
+	{
+		// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
+
+		if(guarded)
+		{
+			test(mask, 0x0f);
+			je("@f");
+		}
+
+		vmovq(qword[addr * 2 + vm_base], src);
+
+		// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
+
+		if(guarded)
+		{
+			L("@@");
+
+			test(mask, 0xf0);
+			je("@f");
+		}
+
+		vmovhps(qword[addr * 2 + vm_base + 8 * 2], src);
+
+		if(guarded)
+		{
+			L("@@");
+
+			// vmaskmovps?
+		}
+
+		return;
+	}
+
+	// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
+	// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
+	// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
+	// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
+
+	static const int pixel_bits[] = {0x03, 0x0c, 0x30, 0xc0};
+
+	for(uint8 i = 0; i < 4; i++)
+	{
+		if(guarded)
+		{
+			test(mask, pixel_bits[i]);
+			je("@f");
+		}
+
+		WritePixel(src, addr, i, psm);
+
+		if(guarded)
+		{
+			L("@@");
+		}
+	}
+}
+
+// Offsets (in 16-bit units) of the four pixels relative to addr; they are
+// doubled when the byte address is formed in WritePixel below.
+static const int s_offsets[] = {0, 2, 8, 10};
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
+{
+	// Emits the store of pixel i (0..3) of src to video memory.
+	//
+	// psm 0: the whole 32-bit lane is stored
+	// psm 1: only the low 24 bits are replaced, the top byte in memory is kept
+	// psm 2: the low 16 bits of the lane are stored
+	//
+	// eax is clobbered for psm 1 and 2.
+
+	// i indexes s_offsets, so reject out-of-range lanes the same way the
+	// per-lane ReadTexel does
+
+	ASSERT(i < 4);
+
+	Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
+
+	switch(psm)
+	{
+	case 0:
+		if(i == 0) vmovd(dst, src);
+		else vpextrd(dst, src, i);
+		break;
+	case 1:
+		if(i == 0) vmovd(eax, src);
+		else vpextrd(eax, src, i);
+		// dst ^= (src ^ dst) & 0xffffff: replaces only the low 24 bits
+		xor(eax, dst);
+		and(eax, 0xffffff);
+		xor(dst, eax);
+		break;
+	case 2:
+		// lane i starts at 16-bit word i * 2
+		if(i == 0) vmovd(eax, src);
+		else vpextrw(eax, src, i * 2);
+		mov(dst, ax);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
+{
+	// Emits the texel fetches for one (pixels == 1) or four (pixels == 4)
+	// address vectors. mip_offset is an index that is scaled by
+	// sizeof(void*) and added to the texture pointer lookup in the
+	// mip-mapped paths.
+
+	// in
+	// xmm5 = addr00
+	// xmm2 = addr01
+	// xmm0 = addr10
+	// xmm3 = addr11
+	// ebx = m_local.tex[0] (!m_sel.mmin)
+	// ebp = m_local.tex (m_sel.mmin)
+	// edx = m_local.clut (m_sel.tlu)
+
+	// out
+	// xmm6 = c00
+	// xmm4 = c01
+	// xmm1 = c10
+	// xmm5 = c11
+
+	ASSERT(pixels == 1 || pixels == 4);
+
+	mip_offset *= sizeof(void*);
+
+	const GSVector4i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i;
+
+	if(m_sel.mmin && !m_sel.lcm)
+	{
+		// (address, destination) register pairs; c11 goes to xmm7 here
+		// because xmm5 (addr00) is still read in every lane iteration
+
+		const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
+
+		if(pixels == 4)
+		{
+			// xmm7 is borrowed for c11, so save it first
+
+			vmovdqa(ptr[&m_local.temp.test], xmm7);
+		}
+
+		for(uint8 j = 0; j < 4; j++)
+		{
+			// lane j picks its own texture pointer through lod_i->u32[j]
+
+			mov(ebx, ptr[&lod_i->u32[j]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			for(int i = 0; i < pixels; i++)
+			{
+				ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
+			}
+		}
+
+		if(pixels == 4)
+		{
+			// move c11 to its documented output register and restore xmm7
+
+			vmovdqa(xmm5, xmm7);
+			vmovdqa(xmm7, ptr[&m_local.temp.test]);
+		}
+	}
+	else
+	{
+		if(m_sel.mmin && m_sel.lcm)
+		{
+			// one texture pointer for all lanes, selected by lod_i->u32[0]
+
+			mov(ebx, ptr[&lod_i->u32[0]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+		}
+
+		// (address, destination) register pairs; the last fetch reads addr11
+		// from xmm3 and writes c11 to xmm5
+
+		const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
+
+		for(int i = 0; i < pixels; i++)
+		{
+			for(uint8 j = 0; j < 4; j++)
+			{
+				ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
+			}
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
+{
+	// Emits the fetch of one texel: lane i of addr is the index, the result
+	// goes to lane i of dst. eax is clobbered.
+
+	ASSERT(i < 4);
+
+	// eax = addr[i]
+
+	if(i != 0) vpextrd(eax, addr, i);
+	else vmovd(eax, addr);
+
+	// with a lookup table, the byte at ebx + eax is itself the index into
+	// the table at edx
+
+	const Reg32& base = m_sel.tlu ? edx : ebx;
+
+	if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
+
+	// dst[i] = 32-bit value at base + eax * 4
+
+	if(i != 0) vpinsrd(dst, ptr[base + eax * 4], i);
+	else vmovd(dst, ptr[base + eax * 4]);
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx2.cpp
new file mode 100644
index 0000000000..7ee865c0ec
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.avx2.cpp
@@ -0,0 +1,2970 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
+
+// Byte offsets used with esp to reach the generated function's stack
+// arguments. _args matches the 16 bytes taken by the four registers pushed
+// at the start of Generate(); Init() reads 'top' at esp + _top and the
+// vertex pointer 'v' at esp + _v.
+// NOTE(review): the extra 4 bytes presumably skip the return address - confirm.
+static const int _args = 16;
+static const int _top = _args + 4;
+static const int _v = _args + 8;
+
+void GSDrawScanlineCodeGenerator::Generate()
+{
+	// Emits the complete scanline function: prologue, Init(), then a loop
+	// whose body runs the pipeline stages in the order they are called
+	// below, and finally the epilogue. Each stage decides from m_sel whether
+	// it emits anything. The register comments between the calls describe
+	// the state handed from one stage to the next.
+
+//ret(8);
+
+	// save the four registers used below; they are restored in reverse order
+	// at "exit"
+
+	push(ebx);
+	push(esi);
+	push(edi);
+	push(ebp);
+
+	//db(0xcc);
+
+	Init();
+
+	if(!m_sel.edge)
+	{
+		// align the loop entry (there is no loop in edge mode, see "step")
+
+		align(16);
+	}
+
+L("loop");
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ymm0 = z/zi
+	// ymm2 = s/u (tme)
+	// ymm3 = t/v (tme)
+	// ymm4 = q (tme)
+	// ymm5 = rb (!tme)
+	// ymm6 = ga (!tme)
+	// ymm7 = test
+
+	bool tme = m_sel.tfx != TFX_NONE;
+
+	// TestZ is handed the two registers that are not holding live data
+
+	TestZ(tme ? ymm5 : ymm2, tme ? ymm6 : ymm3);
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - ymm0
+	// ymm2 = s/u (tme)
+	// ymm3 = t/v (tme)
+	// ymm4 = q (tme)
+	// ymm5 = rb (!tme)
+	// ymm6 = ga (!tme)
+	// ymm7 = test
+
+	if(m_sel.mmin)
+	{
+		SampleTextureLOD();
+	}
+	else
+	{
+		SampleTexture();
+	}
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - ymm2
+	// - ymm3
+	// - ymm4
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	AlphaTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	ReadMask();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	TestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	ColorTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	Fog();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	ReadFrame();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = fd
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	TestDestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = fd
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm7 = test
+
+	WriteMask();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// ymm2 = fd
+	// ymm3 = fm
+	// ymm4 = zm
+	// ymm5 = rb
+	// ymm6 = ga
+
+	WriteZBuf();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// - ebp
+	// ymm2 = fd
+	// ymm3 = fm
+	// - ymm4
+	// ymm5 = rb
+	// ymm6 = ga
+
+	AlphaBlend();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// ymm2 = fd
+	// ymm3 = fm
+	// ymm5 = rb
+	// ymm6 = ga
+
+	WriteFrame();
+
+L("step");
+
+	// if(steps <= 0) break;
+
+	// in edge mode no loop is emitted: execution falls through to "exit"
+	// after a single pass
+
+	if(!m_sel.edge)
+	{
+		test(ecx, ecx);
+
+		jle("exit", T_NEAR);
+
+		Step();
+
+		jmp("loop", T_NEAR);
+	}
+
+L("exit");
+
+	pop(ebp);
+	pop(edi);
+	pop(esi);
+	pop(ebx);
+
+	// return and pop 8 bytes of stack arguments
+
+	ret(8);
+}
+
+// Emits the scanline setup code that runs once before the pixel loop.
+//
+// Going by the reference comments below, the generated code expects the pixel
+// count in ecx and the left x coordinate in edx. It derives skip/steps, builds
+// the initial lane test mask in ymm7 (unless m_sel.notest), points esi/edi at
+// the frame/z address tables, and seeds the interpolated z, fog, texture and
+// color values (registers and m_local.temp) for the first group of 8 pixels.
+// Which pieces are emitted depends entirely on the m_sel selector bits.
+void GSDrawScanlineCodeGenerator::Init()
+{
+	if(!m_sel.notest)
+	{
+		// int skip = left & 7;
+
+		mov(ebx, edx);
+		and(edx, 7);
+
+		// int steps = pixels + skip - 8;
+
+		lea(ecx, ptr[ecx + edx - 8]);
+
+		// left -= skip;
+
+		sub(ebx, edx);
+
+		// GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))];
+		
+		// eax = min(steps, 0): (steps >> 31) is all ones only when steps is negative
+
+		mov(eax, ecx);
+		sar(eax, 31);
+		and(eax, ecx);
+
+		// m_test entries are addressed with an 8 byte stride; each byte is
+		// sign-extended to a dword so it becomes a full per-lane mask
+
+		vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[0]]);
+		vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)&m_test[15]]);
+		vpor(ymm7, ymm0);
+
+		// edx = skip * 32; it is scaled by 8 again below when indexing m_local.d
+
+		shl(edx, 5);
+	}
+	else
+	{
+		mov(ebx, edx); // left
+		xor(edx, edx); // skip
+		lea(ecx, ptr[ecx - 8]); // steps
+	}
+
+	// GSVector2i* fza_base = &m_local.gd->fzbr[top];
+
+	mov(esi, ptr[esp + _top]);
+	lea(esi, ptr[esi * 8]);
+	add(esi, ptr[&m_local.gd->fzbr]);
+
+	// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
+
+	// left * 2 bytes == (left >> 2) * 8 bytes when left is a multiple of 4
+
+	lea(edi, ptr[ebx * 2]);
+	add(edi, ptr[&m_local.gd->fzbc]);
+
+	// edx/ebx are only repurposed when one of the blocks below actually reads
+	// m_local.d[skip] or the vertex
+
+	if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
+	{
+		// edx = &m_local.d[skip]
+
+		lea(edx, ptr[edx * 8 + (size_t)m_local.d]);
+
+		// ebx = &v
+
+		mov(ebx, ptr[esp + _v]);
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
+		{
+			vbroadcastf128(ymm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
+
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				// f = GSVector8i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+
+				vcvttps2dq(ymm1, ymm0);
+				vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vpshufd(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2));
+				vpaddw(ymm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
+
+				vmovdqa(ptr[&m_local.temp.f], ymm1);
+			}
+
+			if(m_sel.zb)
+			{
+				// z = vp.zzzz() + m_local.d[skip].z;
+
+				// the base z and the running offset are stored separately
+				// (temp.z / temp.zo); ymm0 ends up holding their sum
+
+				vshufps(ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+				vmovaps(ptr[&m_local.temp.z], ymm0);
+				vmovaps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
+				vmovaps(ptr[&m_local.temp.zo], ymm2);
+				vaddps(ymm0, ymm2);
+			}
+		}
+	}
+	else
+	{
+		// sprites: a single z value is broadcast to all lanes
+
+		if(m_sel.ztest)
+		{
+			vpbroadcastd(ymm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.edge || m_sel.tfx != TFX_NONE)
+		{
+			vbroadcastf128(ymm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
+		}
+
+		if(m_sel.edge)
+		{
+			// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
+
+			vpshufhw(ymm3, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
+			vpshufd(ymm3, ymm3, _MM_SHUFFLE(3, 3, 3, 3));
+			vpsrlw(ymm3, 9);
+
+			vmovdqa(ptr[&m_local.temp.cov], ymm3);
+		}
+
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i vti(vt);
+
+				vcvttps2dq(ymm6, ymm4);
+
+				// s = vti.xxxx() + m_local.d[skip].s;
+				// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
+
+				vpshufd(ymm2, ymm6, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(ymm3, ymm6, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddd(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					vpaddd(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				}
+				else
+				{
+					// sprite without mipmapping: t is not stepped, so the
+					// fractional v weight is computed and stored here once
+
+					if(m_sel.ltf)
+					{
+						vpshuflw(ymm6, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
+						vpshufhw(ymm6, ymm6, _MM_SHUFFLE(2, 2, 0, 0));
+						vpsrlw(ymm6, 12);
+						vmovdqa(ptr[&m_local.temp.vf], ymm6);
+					}
+				}
+
+				vmovdqa(ptr[&m_local.temp.s], ymm2);
+				vmovdqa(ptr[&m_local.temp.t], ymm3);
+			}
+			else
+			{
+				// s = vt.xxxx() + m_local.d[skip].s;
+				// t = vt.yyyy() + m_local.d[skip].t;
+				// q = vt.zzzz() + m_local.d[skip].q;
+
+				vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+				vaddps(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				vaddps(ymm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
+
+				vmovaps(ptr[&m_local.temp.s], ymm2);
+				vmovaps(ptr[&m_local.temp.t], ymm3);
+				vmovaps(ptr[&m_local.temp.q], ymm4);
+			}
+		}
+
+		// vertex color is not loaded for TFX_DECAL with tcc set
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i vc = GSVector4i(v.c);
+
+				vbroadcastf128(ymm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
+				vcvttps2dq(ymm6, ymm6);
+
+				// vc = vc.upl16(vc.zwxy());
+
+				vpshufd(ymm5, ymm6, _MM_SHUFFLE(1, 0, 3, 2));
+				vpunpcklwd(ymm6, ymm5);
+
+				// rb = vc.xxxx().add16(m_local.d[skip].rb);
+				// ga = vc.zzzz().add16(m_local.d[skip].ga);
+
+				vpshufd(ymm5, ymm6, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(ymm6, ymm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vpaddw(ymm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
+				vpaddw(ymm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
+
+				vmovdqa(ptr[&m_local.temp.rb], ymm5);
+				vmovdqa(ptr[&m_local.temp.ga], ymm6);
+			}
+			else
+			{
+				// flat shading without texturing: load the constant color directly
+
+				if(m_sel.tfx == TFX_NONE)
+				{
+					vmovdqa(ymm5, ptr[&m_local.c.rb]);
+					vmovdqa(ymm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+}
+
+// Emits the code that advances the scanline state to the next group of 8
+// pixels: decrements the remaining step count (ecx), moves fza_offset (edi)
+// forward, adds the per-group deltas from m_local.d8 to the interpolated
+// z/f/s/t/q/rb/ga values (writing them back to m_local.temp) and, unless
+// m_sel.notest, reloads the lane test mask into ymm7.
+void GSDrawScanlineCodeGenerator::Step()
+{
+	// steps -= 8;
+
+	sub(ecx, 8);
+
+	// fza_offset += 2;
+
+	// 16 bytes == 2 entries of 8 bytes each
+
+	add(edi, 16);
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// zo += GSVector8::broadcast32(&m_local.d8.p.z);
+
+		if(m_sel.zb)
+		{
+			// only the offset (temp.zo) accumulates; ymm0 = temp.z + temp.zo
+
+			vbroadcastss(ymm0, ptr[&m_local.d8.p.z]);
+			vaddps(ymm0, ptr[&m_local.temp.zo]);
+			vmovaps(ptr[&m_local.temp.zo], ymm0);
+			vaddps(ymm0, ptr[&m_local.temp.z]);
+		}
+
+		// f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
+
+		if(m_sel.fwrite && m_sel.fge)
+		{
+			vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]);
+			vpaddw(ymm1, ptr[&m_local.temp.f]);
+			vmovdqa(ptr[&m_local.temp.f], ymm1);
+		}
+	}
+	else
+	{
+		// sprites: z is constant, just reload it into ymm0
+
+		if(m_sel.ztest)
+		{
+			vpbroadcastd(ymm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
+
+				vbroadcasti128(ymm4, ptr[&m_local.d8.stq]);
+
+				// s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
+
+				vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vpaddd(ymm2, ptr[&m_local.temp.s]);
+				vmovdqa(ptr[&m_local.temp.s], ymm2);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					// t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy());
+
+					vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
+					vpaddd(ymm3, ptr[&m_local.temp.t]);
+					vmovdqa(ptr[&m_local.temp.t], ymm3);
+				}
+				else
+				{
+					// t is not stepped in this case, only reloaded
+
+					vmovdqa(ymm3, ptr[&m_local.temp.t]);
+				}
+			}
+			else
+			{
+				// GSVector8 stq(m_local.d8.stq);
+
+				// s += stq.xxxx();
+				// t += stq.yyyy();
+				// q += stq.zzzz();
+
+				vbroadcastf128(ymm4, ptr[&m_local.d8.stq]);
+
+				vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
+				vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
+				vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				vaddps(ymm2, ptr[&m_local.temp.s]);
+				vaddps(ymm3, ptr[&m_local.temp.t]);
+				vaddps(ymm4, ptr[&m_local.temp.q]);
+
+				vmovaps(ptr[&m_local.temp.s], ymm2);
+				vmovaps(ptr[&m_local.temp.t], ymm3);
+				vmovaps(ptr[&m_local.temp.q], ymm4);
+			}
+		}
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
+
+				vpbroadcastq(ymm7, ptr[&m_local.d8.c]);
+
+				// rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero());
+				// ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero());
+
+				vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0));
+				vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1));
+
+				vpaddw(ymm5, ptr[&m_local.temp.rb]);
+				vpaddw(ymm6, ptr[&m_local.temp.ga]);
+
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				// ymm7 is used as a zero scratch register here; the test mask
+				// is reloaded into it at the end of this function unless
+				// m_sel.notest is set
+
+				vpxor(ymm7, ymm7);
+				vpmaxsw(ymm5, ymm7);
+				vpmaxsw(ymm6, ymm7);
+
+				vmovdqa(ptr[&m_local.temp.rb], ymm5);
+				vmovdqa(ptr[&m_local.temp.ga], ymm6);
+			}
+			else
+			{
+				if(m_sel.tfx == TFX_NONE)
+				{
+					vmovdqa(ymm5, ptr[&m_local.c.rb]);
+					vmovdqa(ymm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+
+	if(!m_sel.notest)
+	{
+		// test = m_test[15 + (steps & (steps >> 31))];
+
+		// edx = min(steps, 0), same trick as in Init()
+
+		mov(edx, ecx);
+		sar(edx, 31);
+		and(edx, ecx);
+
+		vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[15]]);
+	}
+}
+
+// Emits the depth test for the current group of 8 pixels.
+//
+// Nothing is emitted when m_sel.zb is clear. Otherwise the z buffer address is
+// computed into ebp, the interpolated z in ymm0 is converted to integer (for
+// non-sprite primitives), the destination z is read into ymm1, and lanes that
+// fail the comparison selected by m_sel.ztst are OR-ed into the test mask in
+// ymm7.
+//
+// temp1, temp2: scratch ymm registers the generated code may clobber.
+void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2)
+{
+	if(!m_sel.zb)
+	{
+		return;
+	}
+
+	// int za = fza_base.y + fza_offset->y;
+
+	mov(ebp, ptr[esi + 4]);
+	add(ebp, ptr[edi + 4]);
+
+	// GSVector8i zs = zi;
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.zoverflow)
+		{
+			// zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001());
+
+			vbroadcastss(temp1, ptr[&GSVector8::m_half]);
+			vmulps(temp1, ymm0);
+			vcvttps2dq(temp1, temp1);
+			vpslld(temp1, 1);
+
+			// temp2 = 0x00000001 in every lane (all ones shifted right by 31)
+
+			vcvttps2dq(ymm0, ymm0);
+			vpcmpeqd(temp2, temp2);
+			vpsrld(temp2, 31);
+			vpand(ymm0, temp2);
+
+			vpor(ymm0, temp1);
+		}
+		else
+		{
+			// zs = GSVector8i(z);
+
+			vcvttps2dq(ymm0, ymm0);
+		}
+
+		if(m_sel.zwrite)
+		{
+			vmovdqa(ptr[&m_local.temp.zs], ymm0);
+		}
+	}
+
+	if(m_sel.ztest)
+	{
+		ReadPixel(ymm1, temp1, ebp);
+
+		// the unmasked destination value is kept for the later z write when
+		// zpsm < 2
+
+		if(m_sel.zwrite && m_sel.zpsm < 2)
+		{
+			vmovdqa(ptr[&m_local.temp.zd], ymm1);
+		}
+
+		// zd &= 0xffffffff >> m_sel.zpsm * 8;
+
+		if(m_sel.zpsm)
+		{
+			vpslld(ymm1, (uint8)(m_sel.zpsm * 8));
+			vpsrld(ymm1, (uint8)(m_sel.zpsm * 8));
+		}
+
+		// vpcmpgtd compares signed dwords, so when the values can use the top
+		// bit both operands are biased by 0x80000000 first
+
+		if(m_sel.zoverflow || m_sel.zpsm == 0)
+		{
+			// GSVector8i o = GSVector8i::x80000000();
+
+			vpcmpeqd(temp1, temp1);
+			vpslld(temp1, 31);
+
+			// GSVector8i zso = zs - o;
+			// GSVector8i zdo = zd - o;
+
+			vpsubd(ymm0, temp1);
+			vpsubd(ymm1, temp1);
+		}
+
+		// other ztst values emit no comparison here
+
+		switch(m_sel.ztst)
+		{
+		case ZTST_GEQUAL:
+			// test |= zso < zdo; // ~(zso >= zdo)
+			vpcmpgtd(ymm1, ymm0);
+			vpor(ymm7, ymm1);
+			break;
+
+		case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+			// test |= zso <= zdo; // ~(zso > zdo)
+			vpcmpgtd(ymm0, ymm1);
+			vpcmpeqd(temp1, temp1);
+			vpxor(ymm0, temp1);
+			vpor(ymm7, ymm0);
+			break;
+		}
+
+		// NOTE(review): alltrue() is defined elsewhere; presumably it emits the
+		// early-out taken when every lane of the test mask is set - confirm
+
+		alltrue();
+	}
+}
+
+// Emits the texture fetch for 8 pixels from m_local.gd->tex[0] (no LOD
+// selection).
+//
+// Nothing is emitted when m_sel.fb is clear or m_sel.tfx is TFX_NONE. The
+// generated code expects s/t in ymm2/ymm3 (and q in ymm4 when !m_sel.fst, in
+// which case s and t are first multiplied by the approximate reciprocal of q).
+// With m_sel.ltf four texels per pixel are read and blended using the
+// fractional weights stored in m_local.temp.uf / temp.vf; otherwise a single
+// texel is read. The non-filtered path leaves the low/high bytes of each 16 bit
+// half of the texel in ymm5/ymm6 (c[0]/c[1] in the reference comments).
+// NOTE(review): the filtered path appears to leave its result in the same two
+// registers - this depends on lerp16_4 writing its first argument; confirm.
+void GSDrawScanlineCodeGenerator::SampleTexture()
+{
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	mov(ebx, ptr[&m_local.gd->tex[0]]);
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	// ebx = tex
+	// edx = clut
+
+	if(!m_sel.fst)
+	{
+		// u = s / q, v = t / q (using the vrcpps approximation), then to integer
+
+		vrcpps(ymm0, ymm4);
+
+		vmulps(ymm2, ymm0);
+		vmulps(ymm3, ymm0);
+
+		vcvttps2dq(ymm2, ymm2);
+		vcvttps2dq(ymm3, ymm3);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			vmovd(xmm4, eax);
+			vpbroadcastd(ymm4, xmm4);
+
+			vpsubd(ymm2, ymm4);
+			vpsubd(ymm3, ymm4);
+		}
+	}
+
+	// ymm2 = u
+	// ymm3 = v
+
+	if(m_sel.ltf)
+	{
+		// GSVector8i uf = u.xxzzlh().srl16(12);
+
+		vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(ymm0, 12);
+		vmovdqa(ptr[&m_local.temp.uf], ymm0);
+
+		// for sprites temp.vf is not written here; Init() stores it in its
+		// fst/ltf sprite branch
+
+		if(m_sel.prim != GS_SPRITE_CLASS)
+		{
+			// GSVector8i vf = v.xxzzlh().srl16(12);
+
+			vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(ymm0, 12);
+			vmovdqa(ptr[&m_local.temp.vf], ymm0);
+		}
+	}
+
+	// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	vpsrad(ymm2, 16);
+	vpsrad(ymm3, 16);
+	vpackssdw(ymm2, ymm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
+
+		vpcmpeqd(ymm1, ymm1);
+		vpsrlw(ymm1, 15);
+		vpaddw(ymm3, ymm2, ymm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		Wrap(ymm2, ymm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		Wrap(ymm2);
+	}
+
+	// ymm2 = uv0
+	// ymm3 = uv1 (ltf)
+	// ymm0, ymm1, ymm4, ymm5, ymm6 = free
+	// ymm7 = used
+
+	// GSVector8i y0 = uv0.uph16() << tw;
+	// GSVector8i x0 = uv0.upl16();
+
+	vpxor(ymm0, ymm0);
+
+	vpunpcklwd(ymm4, ymm2, ymm0);
+	vpunpckhwd(ymm2, ymm2, ymm0);
+	vpslld(ymm2, (uint8)(m_sel.tw + 3));
+
+	// ymm0 = 0
+	// ymm2 = y0
+	// ymm3 = uv1 (ltf)
+	// ymm4 = x0
+	// ymm1, ymm5, ymm6 = free
+	// ymm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector8i y1 = uv1.uph16() << tw;
+		// GSVector8i x1 = uv1.upl16();
+
+		vpunpcklwd(ymm6, ymm3, ymm0);
+		vpunpckhwd(ymm3, ymm3, ymm0);
+		vpslld(ymm3, (uint8)(m_sel.tw + 3));
+
+		// ymm2 = y0
+		// ymm3 = y1
+		// ymm4 = x0
+		// ymm6 = x1
+		// ymm0, ymm1, ymm5 = free
+		// ymm7 = used
+
+		// GSVector8i addr00 = y0 + x0;
+		// GSVector8i addr01 = y0 + x1;
+		// GSVector8i addr10 = y1 + x0;
+		// GSVector8i addr11 = y1 + x1;
+
+		vpaddd(ymm5, ymm2, ymm4);
+		vpaddd(ymm2, ymm2, ymm6);
+		vpaddd(ymm0, ymm3, ymm4);
+		vpaddd(ymm3, ymm3, ymm6);
+
+		// ymm5 = addr00
+		// ymm2 = addr01
+		// ymm0 = addr10
+		// ymm3 = addr11
+		// ymm1, ymm4, ymm6 = free
+		// ymm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// ymm6 = c00
+		// ymm4 = c01
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm0, ymm2, ymm3 = free
+		// ymm7 = used
+
+		vmovdqa(ymm0, ptr[&m_local.temp.uf]);
+
+		// GSVector8i rb00 = c00 & mask;
+		// GSVector8i ga00 = (c00 >> 8) & mask;
+
+		vpsllw(ymm2, ymm6, 8);
+		vpsrlw(ymm2, 8);
+		vpsrlw(ymm6, 8);
+
+		// GSVector8i rb01 = c01 & mask;
+		// GSVector8i ga01 = (c01 >> 8) & mask;
+
+		vpsllw(ymm3, ymm4, 8);
+		vpsrlw(ymm3, 8);
+		vpsrlw(ymm4, 8);
+
+		// ymm0 = uf
+		// ymm2 = rb00
+		// ymm3 = rb01
+		// ymm6 = ga00
+		// ymm4 = ga01
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm7 = used
+
+		// rb00 = rb00.lerp16_4(rb01, uf);
+		// ga00 = ga00.lerp16_4(ga01, uf);
+
+		lerp16_4(ymm3, ymm2, ymm0);
+		lerp16_4(ymm4, ymm6, ymm0);
+
+		// ymm0 = uf
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm2, ymm6 = free
+		// ymm7 = used
+
+		// GSVector8i rb10 = c10 & mask;
+		// GSVector8i ga10 = (c10 >> 8) & mask;
+
+		vpsrlw(ymm2, ymm1, 8);
+		vpsllw(ymm1, 8);
+		vpsrlw(ymm1, 8);
+
+		// GSVector8i rb11 = c11 & mask;
+		// GSVector8i ga11 = (c11 >> 8) & mask;
+
+		vpsrlw(ymm6, ymm5, 8);
+		vpsllw(ymm5, 8);
+		vpsrlw(ymm5, 8);
+
+		// ymm0 = uf
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm1 = rb10
+		// ymm5 = rb11
+		// ymm2 = ga10
+		// ymm6 = ga11
+		// ymm7 = used
+
+		// rb10 = rb10.lerp16_4(rb11, uf);
+		// ga10 = ga10.lerp16_4(ga11, uf);
+
+		lerp16_4(ymm5, ymm1, ymm0);
+		lerp16_4(ymm6, ymm2, ymm0);
+
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm5 = rb10
+		// ymm6 = ga10
+		// ymm0, ymm1, ymm2 = free
+		// ymm7 = used
+
+		// rb00 = rb00.lerp16_4(rb10, vf);
+		// ga00 = ga00.lerp16_4(ga10, vf);
+
+		vmovdqa(ymm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(ymm5, ymm3, ymm0);
+		lerp16_4(ymm6, ymm4, ymm0);
+	}
+	else
+	{
+		// GSVector8i addr00 = y0 + x0;
+
+		vpaddd(ymm5, ymm2, ymm4);
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector8i mask = GSVector8i::x00ff();
+
+		// c[0] = c00 & mask;
+		// c[1] = (c00 >> 8) & mask;
+
+		vpsllw(ymm5, ymm6, 8);
+		vpsrlw(ymm5, 8);
+		vpsrlw(ymm6, 8);
+	}
+}
+
+// Emits the wrap/clamp of the packed 16 bit texel coordinates held in uv,
+// using the limits stored in m_local.gd->t. The emitted variant is chosen from
+// m_sel.wms / m_sel.wmt when the code is generated.
+void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv)
+{
+	// scratch registers at this point: ymm0, ymm1, ymm4, ymm5, ymm6
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// The two axes use different modes: compute both candidates and pick
+		// per byte with t.mask.
+
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
+		vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
+
+		// repeat = (uv & t.min) | t.max   (the OR only when region is set)
+
+		vpand(ymm1, uv, ymm4);
+
+		if(use_region)
+		{
+			vpor(ymm1, ymm5);
+		}
+
+		// clamp = uv.sat_i16(t.min, t.max)
+
+		vpmaxsw(uv, ymm4);
+		vpminsw(uv, ymm5);
+
+		// uv = clamp.blend8(repeat, t.mask)
+
+		vpblendvb(uv, ymm1, ymm0);
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat: mask with t.min, then OR in t.max for region mode.
+
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]);
+		vpand(uv, ymm0);
+
+		if(use_region)
+		{
+			vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]);
+			vpor(uv, ymm0);
+		}
+
+		return;
+	}
+
+	// Both axes clamp: the lower bound is t.min in region mode and zero
+	// otherwise, the upper bound is always t.max.
+
+	if(use_region)
+	{
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]);
+	}
+	else
+	{
+		vpxor(ymm0, ymm0);
+	}
+
+	vpmaxsw(uv, ymm0);
+
+	vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]);
+	vpminsw(uv, ymm0);
+}
+
+// Two-register form of Wrap(): emits the same wrap/clamp for uv0 and uv1 while
+// loading the limits from m_local.gd->t only once.
+void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv0, const Ymm& uv1)
+{
+	// scratch registers at this point: ymm0, ymm1, ymm4, ymm5, ymm6
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// The two axes use different modes: for each coordinate register
+		// compute both candidates and pick per byte with t.mask.
+
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
+		vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
+
+		const Ymm* const coords[2] = {&uv0, &uv1};
+
+		for(int i = 0; i < 2; i++)
+		{
+			const Ymm& t = *coords[i];
+
+			// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+			vpand(ymm1, t, ymm4);
+
+			if(use_region)
+			{
+				vpor(ymm1, ymm5);
+			}
+
+			// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+			vpmaxsw(t, ymm4);
+			vpminsw(t, ymm5);
+
+			// clamp.blend8(repeat, m_local.gd->t.mask);
+
+			vpblendvb(t, ymm1, ymm0);
+		}
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat: mask with t.min, then OR in t.max for region mode.
+
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
+		vpand(uv0, ymm4);
+		vpand(uv1, ymm4);
+
+		if(use_region)
+		{
+			vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
+			vpor(uv0, ymm5);
+			vpor(uv1, ymm5);
+		}
+
+		return;
+	}
+
+	// Both axes clamp: the lower bound is t.min (in ymm4) in region mode and
+	// zero (in ymm0) otherwise, the upper bound is always t.max.
+
+	const Ymm& lower = use_region ? ymm4 : ymm0;
+
+	if(use_region)
+	{
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
+	}
+	else
+	{
+		vpxor(ymm0, ymm0);
+	}
+
+	vpmaxsw(uv0, lower);
+	vpmaxsw(uv1, lower);
+
+	vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
+	vpminsw(uv0, ymm5);
+	vpminsw(uv1, ymm5);
+}
+
+// Emits the texture fetch with level-of-detail selection for 8 pixels.
+//
+// Nothing is emitted when m_sel.fb is clear or m_sel.tfx is TFX_NONE. The
+// generated code saves ebp and uses it as the base of m_local.gd->tex. When
+// !m_sel.lcm the lod is computed per pixel from q (ymm4) with a polynomial
+// log2 approximation; otherwise the fixed value in m_local.gd->lod is used.
+// u/v and the wrap limits are shifted by the integer lod, and the texel(s) are
+// read with ReadTexel(n, 0). Unless m_sel.mmin == 1 (round-off mode) a second
+// fetch is made with the coordinates halved and ReadTexel(n, 1), and the two
+// results are blended with the fractional lod via lerp16.
+void GSDrawScanlineCodeGenerator::SampleTextureLOD()
+{
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	// ebp is restored by the pop at the end of this function
+
+	push(ebp);
+
+	mov(ebp, (size_t)m_local.gd->tex);
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	if(!m_sel.fst)
+	{
+		// u = s / q, v = t / q (using the vrcpps approximation), then to integer
+
+		vrcpps(ymm0, ymm4);
+
+		vmulps(ymm2, ymm0);
+		vmulps(ymm3, ymm0);
+
+		vcvttps2dq(ymm2, ymm2);
+		vcvttps2dq(ymm3, ymm3);
+	}
+
+	// ymm2 = u
+	// ymm3 = v
+	// ymm4 = q
+	// ymm0 = ymm1 = ymm5 = ymm6 = free
+
+	// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?)
+
+	if(!m_sel.lcm)
+	{
+		// lod = -log2(Q) * (1 << L) + K
+
+		// ymm1 = 127 in every lane; ymm0 = biased exponent bits of q
+
+		vpcmpeqd(ymm1, ymm1);
+		vpsrld(ymm1, ymm1, 25);
+		vpslld(ymm0, ymm4, 1);
+		vpsrld(ymm0, ymm0, 24);
+		vpsubd(ymm0, ymm1);
+		vcvtdq2ps(ymm0, ymm0); 
+
+		// ymm0 = (float)(exp(q) - 127)
+
+		vpslld(ymm4, ymm4, 9);
+		vpsrld(ymm4, ymm4, 9);
+		vorps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+			
+		// ymm4 = mant(q) | 1.0f
+
+		// both branches evaluate the same polynomial; the first one uses fused
+		// multiply-add when the CPU reports FMA support
+
+		if(m_cpu.has(util::Cpu::tFMA))
+		{
+			vmovaps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0
+			vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * ymm4 + c1
+			vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
+			vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // ymm4 - 1.0f
+			vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0
+		}
+		else
+		{
+			vmulps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
+			vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
+			vmulps(ymm5, ymm4);
+			vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+			vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
+			vmulps(ymm4, ymm5);
+			vaddps(ymm4, ymm0);
+		}
+
+		// ymm4 = log2(Q) = ((((c0 * ymm4) + c1) * ymm4) + c2) * (ymm4 - 1.0f) + ymm0
+
+		if(m_cpu.has(util::Cpu::tFMA))
+		{
+			vmovaps(ymm5, ptr[&m_local.gd->l]);
+			vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]); 
+		}
+		else
+		{
+			vmulps(ymm4, ptr[&m_local.gd->l]);
+			vaddps(ymm4, ptr[&m_local.gd->k]);
+		}
+
+		// ymm4 = (-log2(Q) * (1 << L) + K) * 0x10000
+
+		// clamp to [0, mxl] and convert to 16.16 fixed point
+
+		vxorps(ymm0, ymm0);
+		vminps(ymm4, ptr[&m_local.gd->mxl]);
+		vmaxps(ymm4, ymm0);
+		vcvtps2dq(ymm4, ymm4);
+
+		if(m_sel.mmin == 1) // round-off mode
+		{
+			// add 0.5 (0x8000 in 16.16) before taking the integer part
+
+			mov(eax, 0x8000);
+			vmovd(xmm0, eax);
+			vpbroadcastd(ymm0, xmm0);
+			vpaddd(ymm4, ymm0);
+		}
+
+		// ymm0 = integer part of the lod
+
+		vpsrld(ymm0, ymm4, 16);
+
+		vmovdqa(ptr[&m_local.temp.lod.i], ymm0);
+/*
+vpslld(ymm5, ymm0, 6);
+vpslld(ymm6, ymm4, 16);
+vpsrld(ymm6, ymm6, 24);
+return;
+*/
+		if(m_sel.mmin == 2) // trilinear mode
+		{
+			// keep the fractional lod, duplicated into both 16 bit halves of each lane
+
+			vpshuflw(ymm1, ymm4, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 0, 0));
+			vmovdqa(ptr[&m_local.temp.lod.f], ymm1);
+		}
+
+		// shift u/v/minmax by (int)lod
+
+		vpsravd(ymm2, ymm2, ymm0);
+		vpsravd(ymm3, ymm3, ymm0);
+
+		vmovdqa(ptr[&m_local.temp.uv[0]], ymm2);
+		vmovdqa(ptr[&m_local.temp.uv[1]], ymm3);
+
+		// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
+
+		// the 16 bit limits are widened to 32 bits so they can be shifted per
+		// lane, then packed back
+
+		vpxor(ymm1, ymm1);
+
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
+		vpunpcklwd(ymm5, ymm4, ymm1); // minu
+		vpunpckhwd(ymm6, ymm4, ymm1); // minv
+		vpsrlvd(ymm5, ymm5, ymm0);
+		vpsrlvd(ymm6, ymm6, ymm0);
+		vpackusdw(ymm5, ymm6);
+
+		vbroadcasti128(ymm4, ptr[&m_local.gd->t.max]);
+		vpunpcklwd(ymm6, ymm4, ymm1); // maxu
+		vpunpckhwd(ymm4, ymm4, ymm1); // maxv
+		vpsrlvd(ymm6, ymm6, ymm0);
+		vpsrlvd(ymm4, ymm4, ymm0);
+		vpackusdw(ymm6, ymm4);
+
+		vmovdqa(ptr[&m_local.temp.uv_minmax[0]], ymm5);
+		vmovdqa(ptr[&m_local.temp.uv_minmax[1]], ymm6);
+	}
+	else
+	{
+		// lod = K
+
+		vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]);
+
+		vpsrad(ymm2, xmm0);
+		vpsrad(ymm3, xmm0);
+
+		vmovdqa(ptr[&m_local.temp.uv[0]], ymm2);
+		vmovdqa(ptr[&m_local.temp.uv[1]], ymm3);
+
+		// NOTE(review): temp.uv_minmax is only read here, so in this mode it
+		// is presumably filled in before the generated code runs - confirm
+
+		vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]);
+		vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]);
+	}
+
+	// ymm2 = m_local.temp.uv[0] = u (level m)
+	// ymm3 = m_local.temp.uv[1] = v (level m)
+	// ymm5 = minuv
+	// ymm6 = maxuv
+
+	if(m_sel.ltf)
+	{
+		// u -= 0x8000;
+		// v -= 0x8000;
+
+		mov(eax, 0x8000);
+		vmovd(xmm4, eax);
+		vpbroadcastd(ymm4, xmm4);
+
+		vpsubd(ymm2, ymm4);
+		vpsubd(ymm3, ymm4);
+
+		// GSVector8i uf = u.xxzzlh().srl16(12);
+	
+		vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(ymm0, 12);
+		vmovdqa(ptr[&m_local.temp.uf], ymm0);
+
+		// GSVector8i vf = v.xxzzlh().srl16(12);
+
+		vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
+		vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+		vpsrlw(ymm0, 12);
+		vmovdqa(ptr[&m_local.temp.vf], ymm0);
+	}
+
+	// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	vpsrad(ymm2, 16);
+	vpsrad(ymm3, 16);
+	vpackssdw(ymm2, ymm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
+
+		vpcmpeqd(ymm1, ymm1);
+		vpsrlw(ymm1, 15);
+		vpaddw(ymm3, ymm2, ymm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		WrapLOD(ymm2, ymm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		WrapLOD(ymm2);
+	}
+
+	// ymm2 = uv0
+	// ymm3 = uv1 (ltf)
+	// ymm0, ymm1, ymm4, ymm5, ymm6 = free
+	// ymm7 = used
+
+	// GSVector8i x0 = uv0.upl16();
+	// GSVector8i y0 = uv0.uph16() << tw;
+
+	vpxor(ymm0, ymm0);
+
+	vpunpcklwd(ymm4, ymm2, ymm0);
+	vpunpckhwd(ymm2, ymm2, ymm0);
+	vpslld(ymm2, (uint8)(m_sel.tw + 3));
+
+	// ymm0 = 0
+	// ymm2 = y0
+	// ymm3 = uv1 (ltf)
+	// ymm4 = x0
+	// ymm1, ymm5, ymm6 = free
+	// ymm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector8i x1 = uv1.upl16();
+		// GSVector8i y1 = uv1.uph16() << tw;
+
+		vpunpcklwd(ymm6, ymm3, ymm0);
+		vpunpckhwd(ymm3, ymm3, ymm0);
+		vpslld(ymm3, (uint8)(m_sel.tw + 3));
+
+		// ymm2 = y0
+		// ymm3 = y1
+		// ymm4 = x0
+		// ymm6 = x1
+		// ymm0, ymm1, ymm5 = free
+		// ymm7 = used
+
+		// GSVector8i addr00 = y0 + x0;
+		// GSVector8i addr01 = y0 + x1;
+		// GSVector8i addr10 = y1 + x0;
+		// GSVector8i addr11 = y1 + x1;
+
+		vpaddd(ymm5, ymm2, ymm4);
+		vpaddd(ymm2, ymm2, ymm6);
+		vpaddd(ymm0, ymm3, ymm4);
+		vpaddd(ymm3, ymm3, ymm6);
+
+		// ymm5 = addr00
+		// ymm2 = addr01
+		// ymm0 = addr10
+		// ymm3 = addr11
+		// ymm1, ymm4, ymm6 = free
+		// ymm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// ymm6 = c00
+		// ymm4 = c01
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm0, ymm2, ymm3 = free
+		// ymm7 = used
+
+		vmovdqa(ymm0, ptr[&m_local.temp.uf]);
+
+		// GSVector8i rb00 = c00 & mask;
+		// GSVector8i ga00 = (c00 >> 8) & mask;
+
+		vpsllw(ymm2, ymm6, 8);
+		vpsrlw(ymm2, 8);
+		vpsrlw(ymm6, 8);
+
+		// GSVector8i rb01 = c01 & mask;
+		// GSVector8i ga01 = (c01 >> 8) & mask;
+
+		vpsllw(ymm3, ymm4, 8);
+		vpsrlw(ymm3, 8);
+		vpsrlw(ymm4, 8);
+
+		// ymm0 = uf
+		// ymm2 = rb00
+		// ymm3 = rb01
+		// ymm6 = ga00
+		// ymm4 = ga01
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm7 = used
+
+		// rb00 = rb00.lerp16_4(rb01, uf);
+		// ga00 = ga00.lerp16_4(ga01, uf);
+
+		lerp16_4(ymm3, ymm2, ymm0);
+		lerp16_4(ymm4, ymm6, ymm0);
+
+		// ymm0 = uf
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm1 = c10
+		// ymm5 = c11
+		// ymm2, ymm6 = free
+		// ymm7 = used
+
+		// GSVector8i rb10 = c10 & mask;
+		// GSVector8i ga10 = (c10 >> 8) & mask;
+
+		vpsrlw(ymm2, ymm1, 8);
+		vpsllw(ymm1, 8);
+		vpsrlw(ymm1, 8);
+
+		// GSVector8i rb11 = c11 & mask;
+		// GSVector8i ga11 = (c11 >> 8) & mask;
+
+		vpsrlw(ymm6, ymm5, 8);
+		vpsllw(ymm5, 8);
+		vpsrlw(ymm5, 8);
+
+		// ymm0 = uf
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm1 = rb10
+		// ymm5 = rb11
+		// ymm2 = ga10
+		// ymm6 = ga11
+		// ymm7 = used
+
+		// rb10 = rb10.lerp16_4(rb11, uf);
+		// ga10 = ga10.lerp16_4(ga11, uf);
+
+		lerp16_4(ymm5, ymm1, ymm0);
+		lerp16_4(ymm6, ymm2, ymm0);
+
+		// ymm3 = rb00
+		// ymm4 = ga00
+		// ymm5 = rb10
+		// ymm6 = ga10
+		// ymm0, ymm1, ymm2 = free
+		// ymm7 = used
+
+		// rb00 = rb00.lerp16_4(rb10, vf);
+		// ga00 = ga00.lerp16_4(ga10, vf);
+
+		vmovdqa(ymm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(ymm5, ymm3, ymm0);
+		lerp16_4(ymm6, ymm4, ymm0);
+	}
+	else
+	{
+		// GSVector8i addr00 = y0 + x0;
+
+		vpaddd(ymm5, ymm2, ymm4);
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector8i mask = GSVector8i::x00ff();
+
+		// c[0] = c00 & mask;
+		// c[1] = (c00 >> 8) & mask;
+
+		vpsllw(ymm5, ymm6, 8);
+		vpsrlw(ymm5, 8);
+		vpsrlw(ymm6, 8);
+	}
+
+	if(m_sel.mmin != 1) // !round-off mode
+	{
+		// Second fetch: the first result is parked in temp.trb/tga, then the
+		// same sequence as above is repeated with u/v and the wrap limits
+		// shifted right by one more bit and ReadTexel's second argument set to 1.
+
+		vmovdqa(ptr[&m_local.temp.trb], ymm5);
+		vmovdqa(ptr[&m_local.temp.tga], ymm6);
+
+		vmovdqa(ymm2, ptr[&m_local.temp.uv[0]]);
+		vmovdqa(ymm3, ptr[&m_local.temp.uv[1]]);
+
+		vpsrad(ymm2, 1);
+		vpsrad(ymm3, 1);
+
+		vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]);
+		vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]);
+
+		vpsrlw(ymm5, 1);
+		vpsrlw(ymm6, 1);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			vmovd(xmm4, eax);
+			vpbroadcastd(ymm4, xmm4);
+
+			vpsubd(ymm2, ymm4);
+			vpsubd(ymm3, ymm4);
+
+			// GSVector8i uf = u.xxzzlh().srl16(12);
+	
+			vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(ymm0, 12);
+			vmovdqa(ptr[&m_local.temp.uf], ymm0);
+
+			// GSVector8i vf = v.xxzzlh().srl16(12);
+
+			vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpsrlw(ymm0, 12);
+			vmovdqa(ptr[&m_local.temp.vf], ymm0);
+		}
+
+		// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+		vpsrad(ymm2, 16);
+		vpsrad(ymm3, 16);
+		vpackssdw(ymm2, ymm3);
+
+		if(m_sel.ltf)
+		{
+			// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
+
+			vpcmpeqd(ymm1, ymm1);
+			vpsrlw(ymm1, 15);
+			vpaddw(ymm3, ymm2, ymm1);
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			WrapLOD(ymm2, ymm3);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			WrapLOD(ymm2);
+		}
+
+		// ymm2 = uv0
+		// ymm3 = uv1 (ltf)
+		// ymm0, ymm1, ymm4, ymm5, ymm6 = free
+		// ymm7 = used
+
+		// GSVector8i x0 = uv0.upl16();
+		// GSVector8i y0 = uv0.uph16() << tw;
+
+		vpxor(ymm0, ymm0);
+
+		vpunpcklwd(ymm4, ymm2, ymm0);
+		vpunpckhwd(ymm2, ymm2, ymm0);
+		vpslld(ymm2, (uint8)(m_sel.tw + 3));
+
+		// ymm0 = 0
+		// ymm2 = y0
+		// ymm3 = uv1 (ltf)
+		// ymm4 = x0
+		// ymm1, ymm5, ymm6 = free
+		// ymm7 = used
+
+		if(m_sel.ltf)
+		{
+			// GSVector8i x1 = uv1.upl16();
+			// GSVector8i y1 = uv1.uph16() << tw;
+
+			vpunpcklwd(ymm6, ymm3, ymm0);
+			vpunpckhwd(ymm3, ymm3, ymm0);
+			vpslld(ymm3, (uint8)(m_sel.tw + 3));
+
+			// ymm2 = y0
+			// ymm3 = y1
+			// ymm4 = x0
+			// ymm6 = x1
+			// ymm0, ymm1, ymm5 = free
+			// ymm7 = used
+
+			// GSVector8i addr00 = y0 + x0;
+			// GSVector8i addr01 = y0 + x1;
+			// GSVector8i addr10 = y1 + x0;
+			// GSVector8i addr11 = y1 + x1;
+
+			vpaddd(ymm5, ymm2, ymm4);
+			vpaddd(ymm2, ymm2, ymm6);
+			vpaddd(ymm0, ymm3, ymm4);
+			vpaddd(ymm3, ymm3, ymm6);
+
+			// ymm5 = addr00
+			// ymm2 = addr01
+			// ymm0 = addr10
+			// ymm3 = addr11
+			// ymm1, ymm4, ymm6 = free
+			// ymm7 = used
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(4, 1);
+
+			// ymm6 = c00
+			// ymm4 = c01
+			// ymm1 = c10
+			// ymm5 = c11
+			// ymm0, ymm2, ymm3 = free
+			// ymm7 = used
+
+			vmovdqa(ymm0, ptr[&m_local.temp.uf]);
+
+			// GSVector8i rb00 = c00 & mask;
+			// GSVector8i ga00 = (c00 >> 8) & mask;
+
+			vpsllw(ymm2, ymm6, 8);
+			vpsrlw(ymm2, 8);
+			vpsrlw(ymm6, 8);
+
+			// GSVector8i rb01 = c01 & mask;
+			// GSVector8i ga01 = (c01 >> 8) & mask;
+
+			vpsllw(ymm3, ymm4, 8);
+			vpsrlw(ymm3, 8);
+			vpsrlw(ymm4, 8);
+
+			// ymm0 = uf
+			// ymm2 = rb00
+			// ymm3 = rb01
+			// ymm6 = ga00
+			// ymm4 = ga01
+			// ymm1 = c10
+			// ymm5 = c11
+			// ymm7 = used
+
+			// rb00 = rb00.lerp16_4(rb01, uf);
+			// ga00 = ga00.lerp16_4(ga01, uf);
+
+			lerp16_4(ymm3, ymm2, ymm0);
+			lerp16_4(ymm4, ymm6, ymm0);
+
+			// ymm0 = uf
+			// ymm3 = rb00
+			// ymm4 = ga00
+			// ymm1 = c10
+			// ymm5 = c11
+			// ymm2, ymm6 = free
+			// ymm7 = used
+
+			// GSVector8i rb10 = c10 & mask;
+			// GSVector8i ga10 = (c10 >> 8) & mask;
+
+			vpsrlw(ymm2, ymm1, 8);
+			vpsllw(ymm1, 8);
+			vpsrlw(ymm1, 8);
+
+			// GSVector8i rb11 = c11 & mask;
+			// GSVector8i ga11 = (c11 >> 8) & mask;
+
+			vpsrlw(ymm6, ymm5, 8);
+			vpsllw(ymm5, 8);
+			vpsrlw(ymm5, 8);
+
+			// ymm0 = uf
+			// ymm3 = rb00
+			// ymm4 = ga00
+			// ymm1 = rb10
+			// ymm5 = rb11
+			// ymm2 = ga10
+			// ymm6 = ga11
+			// ymm7 = used
+
+			// rb10 = rb10.lerp16_4(rb11, uf);
+			// ga10 = ga10.lerp16_4(ga11, uf);
+
+			lerp16_4(ymm5, ymm1, ymm0);
+			lerp16_4(ymm6, ymm2, ymm0);
+
+			// ymm3 = rb00
+			// ymm4 = ga00
+			// ymm5 = rb10
+			// ymm6 = ga10
+			// ymm0, ymm1, ymm2 = free
+			// ymm7 = used
+
+			// rb00 = rb00.lerp16_4(rb10, vf);
+			// ga00 = ga00.lerp16_4(ga10, vf);
+
+			vmovdqa(ymm0, ptr[&m_local.temp.vf]);
+
+			lerp16_4(ymm5, ymm3, ymm0);
+			lerp16_4(ymm6, ymm4, ymm0);
+		}
+		else
+		{
+			// GSVector8i addr00 = y0 + x0;
+
+			vpaddd(ymm5, ymm2, ymm4);
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(1, 1);
+
+			// GSVector8i mask = GSVector8i::x00ff();
+
+			// c[0] = c00 & mask;
+			// c[1] = (c00 >> 8) & mask;
+
+			vpsllw(ymm5, ymm6, 8);
+			vpsrlw(ymm5, 8);
+			vpsrlw(ymm6, 8);
+		}
+
+		// blend the two fetches with the fractional lod (the fixed one from gd
+		// in lcm mode, the per-pixel one computed above otherwise)
+
+		vmovdqa(ymm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
+		vpsrlw(ymm0, ymm0, 1);
+
+		vmovdqa(ymm2, ptr[&m_local.temp.trb]);
+		vmovdqa(ymm3, ptr[&m_local.temp.tga]);
+
+		lerp16(ymm5, ymm2, ymm0, 0);
+		lerp16(ymm6, ymm3, ymm0, 0);
+	}
+
+	pop(ebp);
+}
+
+// Emits the wrap/clamp of the packed 16 bit texel coordinates held in uv,
+// using limits the caller has already placed in registers. The emitted
+// variant is chosen from m_sel.wms / m_sel.wmt when the code is generated.
+void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv)
+{
+	// inputs:  ymm5 = minuv, ymm6 = maxuv
+	// scratch: ymm0, ymm1, ymm4
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// The two axes use different modes: compute both candidates and pick
+		// per byte with t.mask.
+
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
+
+		// repeat = (uv & minuv) | maxuv   (the OR only when region is set)
+
+		vpand(ymm1, uv, ymm5);
+
+		if(use_region)
+		{
+			vpor(ymm1, ymm6);
+		}
+
+		// clamp = uv.sat_i16(minuv, maxuv)
+
+		vpmaxsw(uv, ymm5);
+		vpminsw(uv, ymm6);
+
+		// uv = clamp.blend8(repeat, t.mask)
+
+		vpblendvb(uv, ymm1, ymm0);
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat: mask with minuv, then OR in maxuv for region mode.
+
+		vpand(uv, ymm5);
+
+		if(use_region)
+		{
+			vpor(uv, ymm6);
+		}
+
+		return;
+	}
+
+	// Both axes clamp: the lower bound is minuv in region mode and zero
+	// (built in ymm0) otherwise, the upper bound is always maxuv.
+
+	const Ymm& lower = use_region ? ymm5 : ymm0;
+
+	if(!use_region)
+	{
+		vpxor(ymm0, ymm0);
+	}
+
+	vpmaxsw(uv, lower);
+	vpminsw(uv, ymm6);
+}
+
+// Two-register form of WrapLOD(): emits the same wrap/clamp for uv0 and uv1
+// using limits the caller has already placed in registers.
+void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv0, const Ymm& uv1)
+{
+	// inputs:  ymm5 = minuv, ymm6 = maxuv
+	// scratch: ymm0, ymm1, ymm4
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// The two axes use different modes: for each coordinate register
+		// compute both candidates and pick per byte with t.mask.
+
+		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
+
+		const Ymm* const coords[2] = {&uv0, &uv1};
+
+		for(int i = 0; i < 2; i++)
+		{
+			const Ymm& t = *coords[i];
+
+			// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+			vpand(ymm1, t, ymm5);
+
+			if(use_region)
+			{
+				vpor(ymm1, ymm6);
+			}
+
+			// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+			vpmaxsw(t, ymm5);
+			vpminsw(t, ymm6);
+
+			// clamp.blend8(repeat, m_local.gd->t.mask);
+
+			vpblendvb(t, ymm1, ymm0);
+		}
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat: mask with minuv, then OR in maxuv for region mode.
+
+		vpand(uv0, ymm5);
+		vpand(uv1, ymm5);
+
+		if(use_region)
+		{
+			vpor(uv0, ymm6);
+			vpor(uv1, ymm6);
+		}
+
+		return;
+	}
+
+	// Both axes clamp: the lower bound is minuv in region mode and zero
+	// (built in ymm0) otherwise, the upper bound is always maxuv.
+
+	const Ymm& lower = use_region ? ymm5 : ymm0;
+
+	if(!use_region)
+	{
+		vpxor(ymm0, ymm0);
+	}
+
+	vpmaxsw(uv0, lower);
+	vpmaxsw(uv1, lower);
+
+	vpminsw(uv0, ymm6);
+	vpminsw(uv1, ymm6);
+}
+
+// Emits the g/a half of the texture function selected by m_sel.tfx, followed by
+// the optional AA1 coverage-to-alpha step. Emits nothing when m_sel.fb is clear.
+//
+// Register usage visible in this function:
+//   ymm6 = gat, modified in place
+//   ymm4 = ga loaded from m_local (temp.ga when iip, c.ga otherwise)
+//   ymm2 = untouched copy of ga, left behind by TFX_HIGHLIGHT and by
+//          TFX_HIGHLIGHT2 when !tcc
+//   ymm3 = extra register operand handed to clamp16 / mix16
+//   ymm0, ymm1 = used by the AA1 step
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{
+	if(!m_sel.fb)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector8i ga = iip ? gaf : m_local.c.ga;
+
+		vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+		// gat = gat.modulate16<1>(ga).clamp8();
+
+		modulate16(ymm6, ymm4, 1);
+
+		clamp16(ymm6, ymm3);
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			vpsrlw(ymm4, 7);
+
+			mix16(ymm6, ymm4, ymm3);
+		}
+
+		break;
+
+	case TFX_DECAL:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			// GSVector8i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+			vpsrlw(ymm4, 7);
+
+			mix16(ymm6, ymm4, ymm3);
+		}
+
+		break;
+
+	case TFX_HIGHLIGHT:
+
+		// GSVector8i ga = iip ? gaf : m_local.c.ga;
+
+		vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		// keep an unshifted copy of ga in ymm2; ColorTFX reads ymm2 in the HIGHLIGHT modes
+		vmovdqa(ymm2, ymm4);
+
+		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+		vpsrlw(ymm4, 7);
+
+		if(m_sel.tcc)
+		{
+			vpaddusb(ymm4, ymm6);
+		}
+
+		mix16(ymm6, ymm4, ymm3);
+
+		break;
+
+	case TFX_HIGHLIGHT2:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			// GSVector8i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+			// keep an unshifted copy of ga in ymm2 (when tcc is set, ColorTFX loads it itself)
+			vmovdqa(ymm2, ymm4);
+
+			vpsrlw(ymm4, 7);
+
+			mix16(ymm6, ymm4, ymm3);
+		}
+
+		break;
+
+	case TFX_NONE:
+
+		// gat = iip ? ga.srl16(7) : ga;
+
+		if(m_sel.iip)
+		{
+			vpsrlw(ymm6, 7);
+		}
+
+		break;
+	}
+
+	if(m_sel.aa1)
+	{
+		// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+		// FIXME: bios config screen cubes
+
+		if(!m_sel.abe)
+		{
+			// a = cov
+
+			if(m_sel.edge)
+			{
+				vmovdqa(ymm0, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				// build the constant 0x0080 in every 16-bit lane: all ones, << 15, >> 8
+				vpcmpeqd(ymm0, ymm0);
+				vpsllw(ymm0, 15);
+				vpsrlw(ymm0, 8);
+			}
+
+			mix16(ymm6, ymm0, ymm1);
+		}
+		else
+		{
+			// a = a == 0x80 ? cov : a
+
+			// ymm0 = 0x0080 in every 16-bit lane
+			vpcmpeqd(ymm0, ymm0);
+			vpsllw(ymm0, 15);
+			vpsrlw(ymm0, 8);
+
+			if(m_sel.edge)
+			{
+				vmovdqa(ymm1, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				vmovdqa(ymm1, ymm0);
+			}
+
+			// compare every word with 0x80, then clear the low word of each dword
+			// so only the high (alpha) word can select the replacement
+			vpcmpeqw(ymm0, ymm6);
+			vpsrld(ymm0, 16);
+			vpslld(ymm0, 16);
+
+			vpblendvb(ymm6, ymm1, ymm0);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{
+	// Broadcasts the write masks that the selector actually needs:
+	//   ymm3 = fm (m_local.gd->fm), only when the frame buffer is written
+	//   ymm4 = zm (m_local.gd->zm), only when the z buffer is written
+
+	const bool need_fm = m_sel.fwrite;
+	const bool need_zm = m_sel.zwrite;
+
+	if(need_fm) vpbroadcastd(ymm3, ptr[&m_local.gd->fm]);
+
+	if(need_zm) vpbroadcastd(ymm4, ptr[&m_local.gd->zm]);
+}
+
+// Emits the alpha test. The code builds a per-pixel "failed" mask t in ymm1 from
+// the alpha held in the high word of each dword of ymm6 (ga >> 16) and
+// m_local.gd->aref, then applies it according to m_sel.afail:
+//   AFAIL_KEEP     -> test (ymm7) |= t, followed by alltrue()
+//   AFAIL_FB_ONLY  -> zm (ymm4) |= t
+//   AFAIL_ZB_ONLY  -> fm (ymm3) |= t
+//   AFAIL_RGB_ONLY -> zm |= t and fm |= t & 0xff000000
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{
+	// Skip the whole test when the mask it would modify is not in use
+	// (e.g. FB_ONLY only touches zm, so it is pointless without zwrite).
+	switch(m_sel.afail)
+	{
+	case AFAIL_FB_ONLY:
+		if(!m_sel.zwrite) return;
+		break;
+
+	case AFAIL_ZB_ONLY:
+		if(!m_sel.fwrite) return;
+		break;
+
+	case AFAIL_RGB_ONLY:
+		if(!m_sel.zwrite && m_sel.fpsm == 1) return;
+		break;
+	}
+
+	switch(m_sel.atst)
+	{
+	case ATST_NEVER:
+		// t = GSVector8i::xffffffff();
+		vpcmpeqd(ymm1, ymm1);
+		break;
+
+	case ATST_ALWAYS:
+		// every pixel passes: nothing to emit
+		return;
+
+	case ATST_LESS:
+	case ATST_LEQUAL:
+		// t = (ga >> 16) > m_local.gd->aref;
+		// NOTE(review): LESS and LEQUAL share one comparison; presumably aref is
+		// pre-adjusted by whoever fills m_local.gd - confirm against the caller.
+		vpsrld(ymm1, ymm6, 16);
+		vbroadcasti128(ymm0, ptr[&m_local.gd->aref]);
+		vpcmpgtd(ymm1, ymm0);
+		break;
+
+	case ATST_EQUAL:
+		// t = (ga >> 16) != m_local.gd->aref;
+		vpsrld(ymm1, ymm6, 16);
+		vbroadcasti128(ymm0, ptr[&m_local.gd->aref]);
+		vpcmpeqd(ymm1, ymm0);
+		// invert the equality mask: xor with all ones
+		vpcmpeqd(ymm0, ymm0);
+		vpxor(ymm1, ymm0);
+		break;
+
+	case ATST_GEQUAL:
+	case ATST_GREATER:
+		// t = (ga >> 16) < m_local.gd->aref;
+		// operands are swapped (aref > alpha) because only a greater-than compare is used
+		vpsrld(ymm0, ymm6, 16);
+		vbroadcasti128(ymm1, ptr[&m_local.gd->aref]);
+		vpcmpgtd(ymm1, ymm0);
+		break;
+
+	case ATST_NOTEQUAL:
+		// t = (ga >> 16) == m_local.gd->aref;
+		vpsrld(ymm1, ymm6, 16);
+		vbroadcasti128(ymm0, ptr[&m_local.gd->aref]);
+		vpcmpeqd(ymm1, ymm0);
+		break;
+	}
+
+	switch(m_sel.afail)
+	{
+	case AFAIL_KEEP:
+		// test |= t;
+		vpor(ymm7, ymm1);
+		alltrue();
+		break;
+
+	case AFAIL_FB_ONLY:
+		// zm |= t;
+		vpor(ymm4, ymm1);
+		break;
+
+	case AFAIL_ZB_ONLY:
+		// fm |= t;
+		vpor(ymm3, ymm1);
+		break;
+
+	case AFAIL_RGB_ONLY:
+		// zm |= t;
+		vpor(ymm4, ymm1);
+		// fm |= t & GSVector8i::xff000000();
+		// (shift right then left by 24 keeps only the top byte of each dword)
+		vpsrld(ymm1, 24);
+		vpslld(ymm1, 24);
+		vpor(ymm3, ymm1);
+		break;
+	}
+}
+
+// Emits the r/b half of the texture function selected by m_sel.tfx (and, for the
+// HIGHLIGHT modes, the remaining g/a work). Emits nothing when the frame buffer
+// is not written (m_sel.fwrite == 0).
+//
+// Register usage visible in this function:
+//   ymm5 = rbt, modified in place
+//   ymm6 = gat, modified in place by the HIGHLIGHT modes
+//   ymm2 = ga; expected to be already loaded for the HIGHLIGHT modes, except
+//          for TFX_HIGHLIGHT2 with tcc where it is loaded here
+//   ymm0, ymm1 = scratch
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector8i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).clamp8();
+
+		modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		clamp16(ymm5, ymm1);
+
+		break;
+
+	case TFX_DECAL:
+
+		// rbt is used as is
+
+		break;
+
+	case TFX_HIGHLIGHT:
+	case TFX_HIGHLIGHT2:
+
+		if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+		{
+			// GSVector8i ga = iip ? gaf : m_local.c.ga;
+
+			vmovdqa(ymm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		}
+
+		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+
+		// ymm1 = gat before modulation, mixed back in below
+		vmovdqa(ymm1, ymm6);
+
+		modulate16(ymm6, ymm2, 1);
+
+		// af: replicate the odd (alpha) word of every dword of ga into both words, then >> 7
+		vpshuflw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1));
+		vpshufhw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1));
+		vpsrlw(ymm2, 7);
+
+		vpaddw(ymm6, ymm2);
+
+		clamp16(ymm6, ymm0);
+
+		mix16(ymm6, ymm1, ymm0);
+
+		// GSVector8i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+		modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		vpaddw(ymm5, ymm2);
+
+		clamp16(ymm5, ymm0);
+
+		break;
+
+	case TFX_NONE:
+
+		// rbt = iip ? rb.srl16(7) : rb;
+
+		if(m_sel.iip)
+		{
+			vpsrlw(ymm5, 7);
+		}
+
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{
+	// Emits the fog blend; only needed when the frame buffer is written and
+	// fogging is enabled.
+	//
+	//   rb = m_local.gd->frb.lerp16<0>(rb, f);
+	//   ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
+
+	if(!(m_sel.fwrite && m_sel.fge))
+	{
+		return;
+	}
+
+	// ymm0 = f: a single broadcast value for sprites, per-pixel values otherwise
+
+	const bool sprite = m_sel.prim == GS_SPRITE_CLASS;
+
+	if(sprite)
+	{
+		vpbroadcastw(ymm0, ptr[&m_local.p.f]);
+	}
+	else
+	{
+		vmovdqa(ymm0, ptr[&m_local.temp.f]);
+	}
+
+	// ymm1 = ga before fogging, mixed back in at the end
+
+	vmovdqa(ymm1, ymm6);
+
+	// red / blue
+
+	vpbroadcastd(ymm2, ptr[&m_local.gd->frb]);
+	lerp16(ymm5, ymm2, ymm0, 0);
+
+	// green / alpha
+
+	vpbroadcastd(ymm2, ptr[&m_local.gd->fga]);
+	lerp16(ymm6, ymm2, ymm0, 0);
+	mix16(ymm6, ymm1, ymm0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{
+	// Computes the frame buffer address in ebx and, if the selector asks for it
+	// (m_sel.rfb), reads the destination pixels into ymm2 (ymm0 is the temporary).
+
+	if(m_sel.fb)
+	{
+		// int fa = fza_base.x + fza_offset->x;
+
+		mov(ebx, ptr[esi]);
+		add(ebx, ptr[edi]);
+
+		if(m_sel.rfb)
+		{
+			ReadPixel(ymm2, ymm0, ebx);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{
+	// Emits the destination alpha test:
+	//
+	//   test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
+	//
+	// Only emitted when m_sel.date is set and fpsm is 0 or 2.
+	// ymm2 = fd, ymm7 = test, ymm0 / ymm1 = scratch.
+
+	if(!m_sel.date)
+	{
+		return;
+	}
+
+	if(m_sel.fpsm != 0 && m_sel.fpsm != 2)
+	{
+		return;
+	}
+
+	const bool shifted = m_sel.fpsm == 2; // fpsm 2: the tested bit is bit 15, moved to bit 31 first
+
+	if(shifted)
+	{
+		if(m_sel.datm)
+		{
+			vpxor(ymm0, ymm0);
+			vpslld(ymm1, ymm2, 16);
+			vpsrad(ymm1, 31);
+			vpcmpeqd(ymm1, ymm0); // invert: lanes that were zero become all ones
+		}
+		else
+		{
+			vpslld(ymm1, ymm2, 16);
+			vpsrad(ymm1, 31);
+		}
+	}
+	else
+	{
+		if(m_sel.datm)
+		{
+			vpcmpeqd(ymm0, ymm0);
+			vpxor(ymm1, ymm2, ymm0); // ~fd
+			vpsrad(ymm1, 31);
+		}
+		else
+		{
+			vpsrad(ymm1, ymm2, 31);
+		}
+	}
+
+	vpor(ymm7, ymm1);
+
+	alltrue();
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{
+	// Folds the test mask (ymm7) into the write masks and produces the packed
+	// per-pixel write-enable bits in edx:
+	//
+	//   fm |= test;
+	//   zm |= test;
+	//   int fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask();
+	//
+	// Nothing is emitted when the selector needs no test at all.
+
+	if(m_sel.notest)
+	{
+		return;
+	}
+
+	const bool fw = m_sel.fwrite;
+	const bool zw = m_sel.zwrite;
+
+	if(fw) vpor(ymm3, ymm7);
+
+	if(zw) vpor(ymm4, ymm7);
+
+	// ymm1 = all ones, the value a fully masked pixel compares equal to
+
+	vpcmpeqd(ymm1, ymm1);
+
+	if(fw && zw)
+	{
+		vpcmpeqd(ymm0, ymm1, ymm4);
+		vpcmpeqd(ymm1, ymm3);
+		vpackssdw(ymm1, ymm0);
+	}
+	else if(fw || zw)
+	{
+		// only one buffer is written: compare its mask and pack it with itself
+
+		vpcmpeqd(ymm1, fw ? ymm3 : ymm4);
+		vpackssdw(ymm1, ymm1);
+	}
+
+	vpmovmskb(edx, ymm1);
+
+	not(edx);
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{
+	// Emits the z buffer store. ymm1 receives the source z, ebp is the z
+	// address, edx the write mask bits and ymm4 the z mask.
+
+	if(!m_sel.zwrite)
+	{
+		return;
+	}
+
+	// source z: one broadcast value for sprites, per-pixel values otherwise
+
+	if(m_sel.prim == GS_SPRITE_CLASS)
+	{
+		vpbroadcastd(ymm1, ptr[&m_local.p.z]);
+	}
+	else
+	{
+		vmovdqa(ymm1, ptr[&m_local.temp.zs]);
+	}
+
+	bool fast;
+
+	if(m_sel.ztest)
+	{
+		fast = m_sel.zpsm < 2;
+
+		if(fast)
+		{
+			// zs = zs.blend8(zd, zm);
+
+			vpblendvb(ymm1, ptr[&m_local.temp.zd], ymm4);
+		}
+	}
+	else
+	{
+		fast = m_sel.zpsm == 0 && m_sel.notest;
+	}
+
+	WritePixel(ymm1, ymm0, ebp, edx, fast, m_sel.zpsm, 1);
+}
+
+// Emits the alpha blending stage. Following the inline formulas, for each of the
+// rb and ga pairs the result is
+//
+//   (c[aba] - c[abb]).modulate16<1>(a) + c[abd]      when aba != abb
+//   c[abd]                                           when aba == abb
+//
+// where selector value 0 picks the source colour, 1 the destination colour
+// (unpacked from fd in ymm2) and 2 zero, and `a` comes from the source alpha,
+// the destination alpha or m_local.gd->afix depending on abc.
+// Emits nothing when the frame buffer is not written or when neither abe nor
+// aa1 is set.
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.abe == 0 && m_sel.aa1 == 0)
+	{
+		return;
+	}
+
+	// The destination colour only has to be unpacked when one of the selectors refers to it.
+	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
+	{
+		switch(m_sel.fpsm)
+		{
+		case 0:
+		case 1:
+
+			// c[2] = fd & mask;
+			// c[3] = (fd >> 8) & mask;
+
+			vpsllw(ymm0, ymm2, 8);
+			vpsrlw(ymm0, 8);
+			vpsrlw(ymm1, ymm2, 8);
+
+			break;
+
+		case 2:
+
+			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+			// ymm7 is shifted step by step to form each of the four field masks in turn
+			vpcmpeqd(ymm7, ymm7);
+
+			vpsrld(ymm7, 27); // 0x0000001f
+			vpand(ymm0, ymm2, ymm7);
+			vpslld(ymm0, 3);
+
+			vpslld(ymm7, 10); // 0x00007c00
+			vpand(ymm4, ymm2, ymm7);
+			vpslld(ymm4, 9);
+
+			vpor(ymm0, ymm4);
+
+			vpsrld(ymm7, 5); // 0x000003e0
+			vpand(ymm1, ymm2, ymm7);
+			vpsrld(ymm1, 2);
+
+			vpsllw(ymm7, 10); // 0x00008000
+			vpand(ymm4, ymm2, ymm7);
+			vpslld(ymm4, 8);
+
+			vpor(ymm1, ymm4);
+
+			break;
+		}
+	}
+
+	// ymm5, ymm6 = src rb, ga
+	// ymm0, ymm1 = dst rb, ga
+	// ymm2, ymm3 = used
+	// ymm4, ymm7 = free
+
+	// Save the source rb in ymm4 when it is needed after ymm5 has been overwritten.
+	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
+	{
+		vmovdqa(ymm4, ymm5);
+	}
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// rb = c[aba * 2 + 0];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(ymm5, ymm0); break;
+		case 2: vpxor(ymm5, ymm5); break;
+		}
+
+		// rb = rb.sub16(c[abb * 2 + 0]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(ymm5, ymm4); break;
+		case 1: vpsubw(ymm5, ymm0); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// GSVector8i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
+
+			switch(m_sel.abc)
+			{
+			case 0:
+			case 1:
+				vpshuflw(ymm7, m_sel.abc ? ymm1 : ymm6, _MM_SHUFFLE(3, 3, 1, 1));
+				vpshufhw(ymm7, ymm7, _MM_SHUFFLE(3, 3, 1, 1));
+				vpsllw(ymm7, 7);
+				break;
+			case 2:
+				vpbroadcastw(ymm7, ptr[&m_local.gd->afix]);
+				break;
+			}
+
+			// rb = rb.modulate16<1>(a);
+
+			modulate16(ymm5, ymm7, 1);
+		}
+
+		// rb = rb.add16(c[abd * 2 + 0]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(ymm5, ymm4); break;
+		case 1: vpaddw(ymm5, ymm0); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// rb = c[abd * 2 + 0];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(ymm5, ymm0); break;
+		case 2: vpxor(ymm5, ymm5); break;
+		}
+	}
+
+	if(m_sel.pabe)
+	{
+		// mask = (c[1] << 8).sra32(31);
+
+		// ymm0 (dst rb) is no longer needed and is reused for the mask
+		vpslld(ymm0, ymm6, 8);
+		vpsrad(ymm0, 31);
+
+		// rb = c[0].blend8(rb, mask);
+
+		vpblendvb(ymm5, ymm4, ymm5, ymm0);
+	}
+
+	// ymm6 = src ga
+	// ymm1 = dst ga
+	// ymm5 = rb
+	// ymm7 = a
+	// ymm2, ymm3 = used
+	// ymm0, ymm4 = free
+
+	// ymm4 = copy of the source ga, used below as c[1]
+	vmovdqa(ymm4, ymm6);
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// ga = c[aba * 2 + 1];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: vmovdqa(ymm6, ymm1); break;
+		case 2: vpxor(ymm6, ymm6); break;
+		}
+
+		// ga = ga.sub16(c[abb * 2 + 1]);
+
+		switch(m_sel.abb)
+		{
+		case 0: vpsubw(ymm6, ymm4); break;
+		case 1: vpsubw(ymm6, ymm1); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// ga = ga.modulate16<1>(a);
+
+			modulate16(ymm6, ymm7, 1);
+		}
+
+		// ga = ga.add16(c[abd * 2 + 1]);
+
+		switch(m_sel.abd)
+		{
+		case 0: vpaddw(ymm6, ymm4); break;
+		case 1: vpaddw(ymm6, ymm1); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// ga = c[abd * 2 + 1];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: vmovdqa(ymm6, ymm1); break;
+		case 2: vpxor(ymm6, ymm6); break;
+		}
+	}
+
+	// ymm4 = src ga
+	// ymm5 = rb
+	// ymm6 = ga
+	// ymm2, ymm3 = used
+	// ymm0, ymm1, ymm7 = free
+
+	if(m_sel.pabe)
+	{
+		vpsrld(ymm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+		vpblendvb(ymm6, ymm4, ymm6, ymm0);
+	}
+	else
+	{
+		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		{
+			mix16(ymm6, ymm4, ymm7);
+		}
+	}
+}
+
+// Emits the frame buffer store: optional dithering (fpsm 2 with dthe), optional
+// masking to 8 bits per channel (colclamp == 0), packing of rb (ymm5) and
+// ga (ymm6) into one 32-bit pixel per lane, optional forcing of bit 31 (fba),
+// conversion to the 16-bit layout for fpsm 2, blending with the destination
+// under fm when the destination was read (rfb), and finally WritePixel.
+// Emits nothing when the frame buffer is not written.
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.fpsm == 2 && m_sel.dthe)
+	{
+		// select the dither row: (top & 3) * 32 bytes into the table at m_local.gd->dimx,
+		// first 16 bytes are added to rb, the next 16 to ga
+		mov(eax, ptr[esp + _top]);
+		and(eax, 3);
+		shl(eax, 5);
+		mov(ebp, ptr[&m_local.gd->dimx]);
+		vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
+		vpaddw(ymm5, ymm7);
+		vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
+		vpaddw(ymm6, ymm7);
+	}
+
+	if(m_sel.colclamp == 0)
+	{
+		// c[0] &= 0x00ff00ff;
+		// c[1] &= 0x00ff00ff;
+
+		vpcmpeqd(ymm7, ymm7);
+		vpsrlw(ymm7, 8);
+		vpand(ymm5, ymm7);
+		vpand(ymm6, ymm7);
+	}
+
+	// GSVector8i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+
+	vpunpckhwd(ymm7, ymm5, ymm6);
+	vpunpcklwd(ymm5, ymm6);
+	vpackuswb(ymm5, ymm7);
+
+	if(m_sel.fba && m_sel.fpsm != 1)
+	{
+		// fs |= 0x80000000;
+
+		vpcmpeqd(ymm7, ymm7);
+		vpslld(ymm7, 31);
+		vpor(ymm5, ymm7);
+	}
+
+	if(m_sel.fpsm == 2)
+	{
+		// GSVector8i rb = fs & 0x00f800f8;
+		// GSVector8i ga = fs & 0x8000f800;
+
+		// the two constants are built in eax and broadcast to every dword
+		mov(eax, 0x00f800f8);
+		vmovd(xmm6, eax);
+		vpbroadcastd(ymm6, xmm6);
+
+		mov(eax, 0x8000f800);
+		vmovd(xmm7, eax);
+		vpbroadcastd(ymm7, xmm7);
+
+		vpand(ymm4, ymm5, ymm6);
+		vpand(ymm5, ymm7);
+
+		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		vpsrld(ymm6, ymm4, 9);
+		vpsrld(ymm4, 3);
+		vpsrld(ymm7, ymm5, 16);
+		vpsrld(ymm5, 6);
+
+		vpor(ymm5, ymm4);
+		vpor(ymm7, ymm6);
+		vpor(ymm5, ymm7);
+	}
+
+	if(m_sel.rfb)
+	{
+		// fs = fs.blend(fd, fm);
+
+		blend(ymm5, ymm2, ymm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
+	}
+
+	// same rule as in WriteZBuf: the "fast" store path is only chosen for formats
+	// below 2 after a blend with the destination, or for fpsm 0 with no tests
+	bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
+
+	WritePixel(ymm5, ymm0, ebx, edx, fast, m_sel.fpsm, 0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr)
+{
+	// Gathers four 8-byte pieces located at vm + addr * 2 + {0, 16, 32, 48}
+	// into dst. The first two go to the low 128 bits of dst, the last two are
+	// assembled in temp and inserted as the high 128 bits.
+
+	const Xmm lo = Xmm(dst.getIdx());
+	const Xmm hi = Xmm(temp.getIdx());
+
+	vmovq(lo, qword[addr * 2 + (size_t)m_local.gd->vm]);
+	vmovhps(lo, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
+
+	vmovq(hi, qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]);
+	vmovhps(hi, qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2]);
+
+	vinserti128(dst, dst, temp, 1);
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz)
+{
+	// Emits the store of eight pixels held in src. The high 128 bits of src
+	// are first extracted into temp.
+	//
+	// fast:  four 8-byte stores (two pixels each)
+	// !fast: eight single-pixel stores through the Xmm overload of WritePixel
+	//
+	// Unless the selector needs no test (m_sel.notest), every store is guarded
+	// by a test of the matching bits of `mask`, shifted left by fz * 8.
+
+	// mask bits guarding each 8-byte store / each single pixel (before the fz shift)
+	static const int pair_bits[4] = {0x0000000f, 0x000000f0, 0x000f0000, 0x00f00000};
+	static const int pixel_bits[8] = {0x00000003, 0x0000000c, 0x00000030, 0x000000c0, 0x00030000, 0x000c0000, 0x00300000, 0x00c00000};
+
+	Xmm lo = Xmm(src.getIdx());
+	Xmm hi = Xmm(temp.getIdx());
+
+	vextracti128(hi, src, 1);
+
+	const bool guarded = !m_sel.notest;
+
+	// cascade tests?
+
+	if(fast)
+	{
+		for(int q = 0; q < 4; q++)
+		{
+			const Xmm& half = q < 2 ? lo : hi;
+
+			if(guarded)
+			{
+				test(mask, pair_bits[q] << (fz * 8));
+				je("@f");
+			}
+
+			// even pairs come from the low qword of the half, odd pairs from the high qword
+
+			if(q & 1)
+			{
+				vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + q * 8 * 2], half);
+			}
+			else
+			{
+				vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + q * 8 * 2], half);
+			}
+
+			if(guarded)
+			{
+				L("@@");
+			}
+		}
+
+		// vmaskmovps?
+
+		return;
+	}
+
+	for(uint8 i = 0; i < 8; i++)
+	{
+		if(guarded)
+		{
+			test(mask, pixel_bits[i] << (fz * 8));
+			je("@f");
+		}
+
+		// pixels 0-3 live in lo, pixels 4-7 in hi; the lane index restarts at 0 for hi
+
+		WritePixel(i < 4 ? lo : hi, addr, i, (uint8)(i & 3), psm);
+
+		if(guarded)
+		{
+			L("@@");
+		}
+	}
+}
+
+// Offset of each of the eight pixels relative to the base address; doubled when
+// the store address is formed in WritePixel below.
+static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26};
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm)
+{
+	// Emits the store of one pixel: dword lane j of src goes to slot i of the
+	// pixel group at addr. eax is clobbered for psm 1 and 2.
+
+	Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
+
+	const bool first_lane = j == 0;
+
+	if(psm == 0)
+	{
+		// full 32-bit store straight from the vector lane
+
+		if(first_lane) vmovd(dst, src);
+		else vpextrd(dst, src, j);
+	}
+	else if(psm == 1)
+	{
+		// replace only the low 24 bits: dst ^= (src ^ dst) & 0xffffff
+
+		if(first_lane) vmovd(eax, src);
+		else vpextrd(eax, src, j);
+
+		xor(eax, dst);
+		and(eax, 0xffffff);
+		xor(dst, eax);
+	}
+	else if(psm == 2)
+	{
+		// 16-bit store of the low word of the lane
+
+		if(first_lane) vmovd(eax, src);
+		else vpextrw(eax, src, j * 2);
+
+		mov(dst, ax);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
+{
+	// Emits the texel fetch for 1 or 4 address registers.
+	//
+	// in
+	//   ymm5 = addr00
+	//   ymm2 = addr01
+	//   ymm0 = addr10
+	//   ymm3 = addr11
+	//   ebx = m_local.tex[0] (!m_sel.mmin)
+	//   ebp = m_local.tex (m_sel.mmin)
+	//   edx = m_local.clut (m_sel.tlu)
+	//
+	// out
+	//   ymm6 = c00
+	//   ymm4 = c01
+	//   ymm1 = c10
+	//   ymm5 = c11
+
+	ASSERT(pixels == 1 || pixels == 4);
+
+	// (address register, result register) and (temp1, temp2) for each of the four fetches
+	static const int regs[] = {5, 6, 2, 4, 0, 1, 3, 5};
+	static const int temps[] = {1, 4, 5, 1, 2, 5, 0, 2};
+
+	mip_offset *= sizeof(void*);
+
+	const GSVector8i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i;
+
+	// with mip-mapping and a non-constant lod, the texture pointer is reloaded for every lane
+	const bool per_lane_lod = m_sel.mmin && !m_sel.lcm;
+
+	if(m_sel.mmin && m_sel.lcm)
+	{
+		// constant lod: select the texture level once
+
+		mov(ebx, ptr[&lod_i->u32[0]]);
+		mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+	}
+
+	for(int i = 0; i < pixels; i++)
+	{
+		Ymm src = Ymm(regs[i * 2 + 0]);
+		Ymm dst = Ymm(regs[i * 2 + 1]);
+		Ymm t1 = Ymm(temps[i * 2 + 0]);
+		Ymm t2 = Ymm(temps[i * 2 + 1]);
+
+		if(!per_lane_lod && !m_sel.tlu)
+		{
+			// one texture, no palette: a single gather with an all-ones mask
+
+			vpcmpeqd(t1, t1);
+			vpgatherdd(dst, ptr[ebx + src * 4], t1);
+
+			continue;
+		}
+
+		// lane-by-lane path; the original author noted that a double-gather
+		// variant of the palette lookup turned out too slow
+
+		vextracti128(Xmm(t1.getIdx()), src, 1);
+
+		for(uint8 j = 0; j < 4; j++)
+		{
+			if(per_lane_lod)
+			{
+				mov(ebx, ptr[&lod_i->u32[j + 0]]);
+				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+			}
+
+			ReadTexel(dst, src, j);
+
+			if(per_lane_lod)
+			{
+				mov(ebx, ptr[&lod_i->u32[j + 4]]);
+				mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+			}
+
+			ReadTexel(t2, t1, j);
+		}
+
+		vinserti128(dst, dst, t2, 1);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i)
+{
+	// Emits the fetch of one texel: dword lane i of the low 128 bits of addr is
+	// the index, the result goes to dword lane i of the low 128 bits of dst.
+	// With a palette (m_sel.tlu) the index first selects a byte at ebx, which
+	// then indexes the table at edx; otherwise it indexes ebx directly.
+	// eax is clobbered.
+
+	ASSERT(i < 4);
+
+	const Xmm index = Xmm(addr.getIdx());
+	const Xmm result = Xmm(dst.getIdx());
+
+	if(i == 0)
+	{
+		vmovd(eax, index);
+	}
+	else
+	{
+		vpextrd(eax, index, i);
+	}
+
+	if(m_sel.tlu)
+	{
+		movzx(eax, byte[ebx + eax]);
+	}
+
+	const Address& texel = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
+
+	if(i == 0)
+	{
+		vmovd(result, texel);
+	}
+	else
+	{
+		vpinsrd(result, texel, i);
+	}
+}
+
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.cpp
new file mode 100644
index 0000000000..eb95e857ce
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawScanlineCodeGenerator.x86.cpp
@@ -0,0 +1,3175 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDrawScanlineCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+
+// Stack offsets relative to esp once Generate() has pushed ebx, esi, edi and ebp.
+// _args: size in bytes of those four pushes.
+static const int _args = 16;
+// _top: first stack argument, read in Init() as `top` (the extra 4 bytes presumably skip the return address).
+static const int _top = _args + 4;
+// _v: second stack argument, read in Init() as the pointer to the vertex `v`.
+static const int _v = _args + 8;
+
+// Emits the complete scanline function (SSE, 32-bit x86 variant): a prologue
+// saving ebx/esi/edi/ebp, Init(), then a loop that runs the pipeline stages in
+// order for four pixels at a time and advances with Step(), and an epilogue that
+// restores the registers and returns with ret(8) (8 bytes of stack arguments
+// are removed by the callee). The comments between the stages track which
+// registers are live at that point.
+void GSDrawScanlineCodeGenerator::Generate()
+{
+	push(ebx);
+	push(esi);
+	push(edi);
+	push(ebp);
+
+	Init();
+
+	if(!m_sel.edge)
+	{
+		// the loop head is only aligned when there actually is a loop (see "step" below)
+		align(16);
+	}
+
+L("loop");
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// xmm0 = z/zi
+	// xmm2 = s/u (tme)
+	// xmm3 = t/v (tme)
+	// xmm4 = q (tme)
+	// xmm5 = rb (!tme)
+	// xmm6 = ga (!tme)
+	// xmm7 = test
+
+	bool tme = m_sel.tfx != TFX_NONE;
+
+	// TestZ gets two temporaries that are not live yet: with texturing rb/ga are
+	// not loaded (xmm5/xmm6 free), without it s/t are not used (xmm2/xmm3 free)
+	TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - xmm0
+	// xmm2 = s/u (tme)
+	// xmm3 = t/v (tme)
+	// xmm4 = q (tme)
+	// xmm5 = rb (!tme)
+	// xmm6 = ga (!tme)
+	// xmm7 = test
+
+	if(m_sel.mmin)
+	{
+		SampleTextureLOD();
+	}
+	else
+	{
+		SampleTexture();
+	}
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// - xmm2
+	// - xmm3
+	// - xmm4
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	AlphaTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ReadMask();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	TestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ColorTFX();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	Fog();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	ReadFrame();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	TestDestAlpha();
+
+	// ecx = steps
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm7 = test
+
+	WriteMask();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// ebp = za
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm4 = zm
+	// xmm5 = rb
+	// xmm6 = ga
+
+	WriteZBuf();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// - ebp
+	// xmm2 = fd
+	// xmm3 = fm
+	// - xmm4
+	// xmm5 = rb
+	// xmm6 = ga
+
+	AlphaBlend();
+
+	// ebx = fa
+	// ecx = steps
+	// edx = fzm
+	// esi = fzbr
+	// edi = fzbc
+	// xmm2 = fd
+	// xmm3 = fm
+	// xmm5 = rb
+	// xmm6 = ga
+
+	WriteFrame();
+
+L("step");
+
+	// if(steps <= 0) break;
+
+	// with m_sel.edge set, no loop is emitted: the body above runs exactly once
+	if(!m_sel.edge)
+	{
+		test(ecx, ecx);
+
+		jle("exit", T_NEAR);
+
+		Step();
+
+		jmp("loop", T_NEAR);
+	}
+
+L("exit");
+
+	// vzeroupper();
+
+	pop(ebp);
+	pop(edi);
+	pop(esi);
+	pop(ebx);
+
+	ret(8);
+}
+
+// Emits the per-scanline setup that runs once before the pixel loop.
+// As used below, on entry ecx holds the pixel count and edx holds `left`;
+// `top` and the vertex pointer `v` are read from the stack.
+//
+// Results left for the loop (only those the selector needs):
+//   ecx = steps, esi = fza_base, edi = fza_offset
+//   xmm7 = initial test mask (unless m_sel.notest)
+//   xmm0 = z, xmm2/xmm3/xmm4 = s/t/q, xmm5/xmm6 = rb/ga
+//   m_local.temp.{f, z, zo, cov, s, t, q, vf, rb, ga} = running values for Step()
+void GSDrawScanlineCodeGenerator::Init()
+{
+	if(!m_sel.notest)
+	{
+		// int skip = left & 3;
+
+		mov(ebx, edx);
+		and(edx, 3);
+
+		// int steps = pixels + skip - 4;
+
+		lea(ecx, ptr[ecx + edx - 4]);
+
+		// left -= skip;
+
+		sub(ebx, edx);
+
+		// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
+
+		// edx = skip * 16, the byte offset of m_test[skip]
+		shl(edx, 4);
+
+		movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
+
+		// eax = (steps & (steps >> 31)) * 16: zero for steps >= 0, negative offset otherwise
+		mov(eax, ecx);
+		sar(eax, 31);
+		and(eax, ecx);
+		shl(eax, 4);
+
+		por(xmm7, ptr[eax + (size_t)&m_test[7]]);
+	}
+	else
+	{
+		mov(ebx, edx); // left
+		xor(edx, edx); // skip
+		lea(ecx, ptr[ecx - 4]); // steps
+	}
+
+	// GSVector2i* fza_base = &m_local.gd->fzbr[top];
+
+	mov(esi, ptr[esp + _top]);
+	lea(esi, ptr[esi * 8]);
+	add(esi, ptr[&m_local.gd->fzbr]);
+
+	// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
+
+	lea(edi, ptr[ebx * 2]);
+	add(edi, ptr[&m_local.gd->fzbc]);
+
+	// edx / ebx are only set up when one of the sections below reads m_local.d[skip] or v
+	if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
+	{
+		// edx = &m_local.d[skip]
+
+		// (edx is skip * 16 here, or 0 in the notest path, so * 8 gives a stride of 128 bytes per entry)
+		lea(edx, ptr[edx * 8 + (size_t)m_local.d]);
+
+		// ebx = &v
+
+		mov(ebx, ptr[esp + _v]);
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
+		{
+			movaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
+
+			if(m_sel.fwrite && m_sel.fge)
+			{
+				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
+
+				cvttps2dq(xmm1, xmm0);
+				pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+				paddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
+
+				movdqa(ptr[&m_local.temp.f], xmm1);
+			}
+
+			if(m_sel.zb)
+			{
+				// z = vp.zzzz() + m_local.d[skip].z;
+
+				// the base z and the running offset are stored separately (temp.z / temp.zo);
+				// Step() advances only the offset and re-adds the base
+				shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+				movaps(ptr[&m_local.temp.z], xmm0);
+				movaps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
+				movaps(ptr[&m_local.temp.zo], xmm2);
+				addps(xmm0, xmm2);
+			}
+		}
+	}
+	else
+	{
+		if(m_sel.ztest)
+		{
+			// sprites use the constant z from m_local.p
+			movdqa(xmm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.edge || m_sel.tfx != TFX_NONE)
+		{
+			movaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
+		}
+
+		if(m_sel.edge)
+		{
+			// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
+
+			pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+			pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
+			psrlw(xmm3, 9);
+
+			movdqa(ptr[&m_local.temp.cov], xmm3);
+		}
+
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i vti(vt);
+
+				cvttps2dq(xmm6, xmm4);
+
+				// s = vti.xxxx() + m_local.d[skip].s;
+				// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
+
+				pshufd(xmm2, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+				pshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
+
+				paddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					paddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				}
+				else
+				{
+					if(m_sel.ltf)
+					{
+						// t is constant along a sprite scanline, so its fraction is
+						// extracted once here and kept in temp.vf
+						pshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+						pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
+						psrlw(xmm6, 12);
+						movdqa(ptr[&m_local.temp.vf], xmm6);
+					}
+				}
+
+				movdqa(ptr[&m_local.temp.s], xmm2);
+				movdqa(ptr[&m_local.temp.t], xmm3);
+			}
+			else
+			{
+				// s = vt.xxxx() + m_local.d[skip].s;
+				// t = vt.yyyy() + m_local.d[skip].t;
+				// q = vt.zzzz() + m_local.d[skip].q;
+
+				movaps(xmm2, xmm4);
+				movaps(xmm3, xmm4);
+
+				shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
+				shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
+				shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				addps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
+				addps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
+				addps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
+
+				movaps(ptr[&m_local.temp.s], xmm2);
+				movaps(ptr[&m_local.temp.t], xmm3);
+				movaps(ptr[&m_local.temp.q], xmm4);
+			}
+		}
+
+		// the vertex colour is not set up for TFX_DECAL with tcc
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i vc = GSVector4i(v.c);
+
+				cvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
+
+				// vc = vc.upl16(vc.zwxy());
+
+				pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
+				punpcklwd(xmm6, xmm5);
+
+				// rb = vc.xxxx().add16(m_local.d[skip].rb);
+				// ga = vc.zzzz().add16(m_local.d[skip].ga);
+
+				pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+				pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
+
+				paddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
+				paddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
+
+				movdqa(ptr[&m_local.temp.rb], xmm5);
+				movdqa(ptr[&m_local.temp.ga], xmm6);
+			}
+			else
+			{
+				if(m_sel.tfx == TFX_NONE)
+				{
+					// flat colour, no texture: rb/ga come straight from the constants
+					movdqa(xmm5, ptr[&m_local.c.rb]);
+					movdqa(xmm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+}
+
+// Emits the code that advances the loop state by four pixels: the step counter,
+// the fza_offset pointer, the interpolated z / fog / texture coordinates /
+// colours kept in m_local.temp (adding the m_local.d4 deltas), and finally the
+// test mask for the next iteration. It leaves the same registers loaded as
+// Init() does (xmm0 = z, xmm2/xmm3/xmm4 = s/t/q, xmm5/xmm6 = rb/ga, xmm7 = test).
+void GSDrawScanlineCodeGenerator::Step()
+{
+	// steps -= 4;
+
+	sub(ecx, 4);
+
+	// fza_offset++;
+
+	add(edi, 8);
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// z += m_local.d4.z;
+
+		if(m_sel.zb)
+		{
+			// only the offset (temp.zo) accumulates; the base (temp.z) is re-added each time
+			movaps(xmm0, ptr[&m_local.temp.zo]);
+			addps(xmm0, ptr[&m_local.d4.z]);
+			movaps(ptr[&m_local.temp.zo], xmm0);
+			addps(xmm0, ptr[&m_local.temp.z]);
+		}
+
+		// f = f.add16(m_local.d4.f);
+
+		if(m_sel.fwrite && m_sel.fge)
+		{
+			movdqa(xmm1, ptr[&m_local.temp.f]);
+			paddw(xmm1, ptr[&m_local.d4.f]);
+			movdqa(ptr[&m_local.temp.f], xmm1);
+		}
+	}
+	else
+	{
+		if(m_sel.ztest)
+		{
+			// sprites: reload the constant z
+			movdqa(xmm0, ptr[&m_local.p.z]);
+		}
+	}
+
+	if(m_sel.fb)
+	{
+		if(m_sel.tfx != TFX_NONE)
+		{
+			if(m_sel.fst)
+			{
+				// GSVector4i stq = m_local.d4.stq;
+
+				// s += stq.xxxx();
+				// if(!sprite) t += stq.yyyy();
+
+				movdqa(xmm4, ptr[&m_local.d4.stq]);
+
+				pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+				paddd(xmm2, ptr[&m_local.temp.s]);
+				movdqa(ptr[&m_local.temp.s], xmm2);
+
+				if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
+				{
+					pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
+					paddd(xmm3, ptr[&m_local.temp.t]);
+					movdqa(ptr[&m_local.temp.t], xmm3);
+				}
+				else
+				{
+					// t does not change along a sprite scanline: just reload it
+					movdqa(xmm3, ptr[&m_local.temp.t]);
+				}
+			}
+			else
+			{
+				// GSVector4 stq = m_local.d4.stq;
+
+				// s += stq.xxxx();
+				// t += stq.yyyy();
+				// q += stq.zzzz();
+
+				movaps(xmm4, ptr[&m_local.d4.stq]);
+				movaps(xmm2, xmm4);
+				movaps(xmm3, xmm4);
+
+				shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
+				shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
+				shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
+
+				addps(xmm2, ptr[&m_local.temp.s]);
+				addps(xmm3, ptr[&m_local.temp.t]);
+				addps(xmm4, ptr[&m_local.temp.q]);
+
+				movaps(ptr[&m_local.temp.s], xmm2);
+				movaps(ptr[&m_local.temp.t], xmm3);
+				movaps(ptr[&m_local.temp.q], xmm4);
+			}
+		}
+
+		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
+		{
+			if(m_sel.iip)
+			{
+				// GSVector4i c = m_local.d4.c;
+
+				// rb = rb.add16(c.xxxx());
+				// ga = ga.add16(c.yyyy());
+
+				movdqa(xmm7, ptr[&m_local.d4.c]);
+
+				pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+				pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));
+
+				paddw(xmm5, ptr[&m_local.temp.rb]);
+				paddw(xmm6, ptr[&m_local.temp.ga]);
+
+				// FIXME: color may underflow and roll over at the end of the line, if decreasing
+
+				// clamp negative components to zero (xmm7 is free here; test is reloaded below)
+				pxor(xmm7, xmm7);
+				pmaxsw(xmm5, xmm7);
+				pmaxsw(xmm6, xmm7);
+
+				movdqa(ptr[&m_local.temp.rb], xmm5);
+				movdqa(ptr[&m_local.temp.ga], xmm6);
+			}
+			else
+			{
+				if(m_sel.tfx == TFX_NONE)
+				{
+					movdqa(xmm5, ptr[&m_local.c.rb]);
+					movdqa(xmm6, ptr[&m_local.c.ga]);
+				}
+			}
+		}
+	}
+
+	if(!m_sel.notest)
+	{
+		// test = m_test[7 + (steps & (steps >> 31))];
+
+		// edx = (steps & (steps >> 31)) * 16: zero for steps >= 0, negative offset otherwise
+		mov(edx, ecx);
+		sar(edx, 31);
+		and(edx, ecx);
+		shl(edx, 4);
+
+		movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
+{ // Emits the depth test: computes the z address in ebp, converts z (xmm0) to integer and ORs failing pixels into xmm7.
+	if(!m_sel.zb)
+	{
+		return;
+	}
+
+	// int za = fza_base.y + fza_offset->y; (esi = fza_base, edi = fza_offset, ebp = za)
+
+	mov(ebp, ptr[esi + 4]);
+	add(ebp, ptr[edi + 4]);
+
+	// GSVector4i zs = zi; (for sprites xmm0 is used as it is, no conversion is emitted)
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		if(m_sel.zoverflow)
+		{
+			// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+			movaps(temp1, ptr[&GSVector4::m_half]);
+			mulps(temp1, xmm0);
+			cvttps2dq(temp1, temp1);
+			pslld(temp1, 1);
+
+			cvttps2dq(xmm0, xmm0);
+			pcmpeqd(temp2, temp2);
+			psrld(temp2, 31); // temp2 = 0x00000001 in every lane
+			pand(xmm0, temp2);
+
+			por(xmm0, temp1);
+		}
+		else
+		{
+			// zs = GSVector4i(z);
+
+			cvttps2dq(xmm0, xmm0);
+		}
+
+		if(m_sel.zwrite)
+		{
+			movdqa(ptr[&m_local.temp.zs], xmm0); // keep the integer zs in temp.zs when z will be written
+		}
+	}
+
+	if(m_sel.ztest)
+	{
+		ReadPixel(xmm1, ebp); // xmm1 = zd, read through the address in ebp
+
+		if(m_sel.zwrite && m_sel.zpsm < 2)
+		{
+			movdqa(ptr[&m_local.temp.zd], xmm1); // keep the unmasked zd in temp.zd
+		}
+
+		// zd &= 0xffffffff >> m_sel.zpsm * 8;
+
+		if(m_sel.zpsm)
+		{
+			pslld(xmm1, m_sel.zpsm * 8);
+			psrld(xmm1, m_sel.zpsm * 8);
+		}
+
+		if(m_sel.zoverflow || m_sel.zpsm == 0)
+		{
+			// GSVector4i o = GSVector4i::x80000000(); (bias so that the signed pcmpgtd below orders the values as unsigned)
+
+			pcmpeqd(temp1, temp1);
+			pslld(temp1, 31);
+
+			// GSVector4i zso = zs - o;
+			// GSVector4i zdo = zd - o;
+
+			psubd(xmm0, temp1);
+			psubd(xmm1, temp1);
+		}
+
+		switch(m_sel.ztst) // only these two modes emit a compare here, any other value leaves xmm7 untouched
+		{
+		case ZTST_GEQUAL:
+			// test |= zso < zdo; // ~(zso >= zdo)
+			pcmpgtd(xmm1, xmm0);
+			por(xmm7, xmm1);
+			break;
+
+		case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
+			// test |= zso <= zdo; // ~(zso > zdo)
+			pcmpgtd(xmm0, xmm1);
+			pcmpeqd(temp1, temp1);
+			pxor(xmm0, temp1);
+			por(xmm7, xmm0);
+			break;
+		}
+
+		alltrue(); // NOTE(review): presumably emits the early exit taken when every pixel failed - confirm in alltrue()
+	}
+}
+
+void GSDrawScanlineCodeGenerator::SampleTexture()
+{ // Emits the texture fetch (point or bilinear) for u/v in xmm2/xmm3 (q in xmm4 when !fst), result: xmm5 = rb, xmm6 = ga.
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	mov(ebx, ptr[&m_local.gd->tex[0]]);
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	// ebx = tex
+	// edx = clut
+
+	if(!m_sel.fst)
+	{
+		rcpps(xmm4, xmm4); // xmm4 = 1 / q (rcpps is an approximation)
+
+		mulps(xmm2, xmm4);
+		mulps(xmm3, xmm4);
+
+		cvttps2dq(xmm2, xmm2);
+		cvttps2dq(xmm3, xmm3);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			movd(xmm4, eax);
+			pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+			psubd(xmm2, xmm4);
+			psubd(xmm3, xmm4);
+		}
+	}
+
+	// xmm2 = u
+	// xmm3 = v
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uf = u.xxzzlh().srl16(12); (the top 4 bits of the 16-bit fraction, duplicated into both words of a lane)
+
+		pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+		pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		psrlw(xmm0, 12);
+		movdqa(ptr[&m_local.temp.uf], xmm0);
+
+		if(m_sel.prim != GS_SPRITE_CLASS) // NOTE(review): for sprites temp.vf is read below but not written here, presumably it is set up elsewhere - confirm
+		{
+			// GSVector4i vf = v.xxzzlh().srl16(12);
+
+			pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			psrlw(xmm0, 12);
+			movdqa(ptr[&m_local.temp.vf], xmm0);
+		}
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); (low four words = u, high four words = v)
+
+	psrad(xmm2, 16);
+	psrad(xmm3, 16);
+	packssdw(xmm2, xmm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		movdqa(xmm3, xmm2);
+		pcmpeqd(xmm1, xmm1);
+		psrlw(xmm1, 15); // xmm1 = 0x0001 in every word
+		paddw(xmm3, xmm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		Wrap(xmm2, xmm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		Wrap(xmm2);
+	}
+
+	// xmm2 = uv0
+	// xmm3 = uv1 (ltf)
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+	// xmm7 = used
+
+	// GSVector4i y0 = uv0.uph16() << tw;
+	// GSVector4i x0 = uv0.upl16();
+
+	pxor(xmm0, xmm0);
+
+	movdqa(xmm4, xmm2);
+	punpckhwd(xmm2, xmm0);
+	punpcklwd(xmm4, xmm0);
+	pslld(xmm2, m_sel.tw + 3); // the emitted shift count is tw + 3
+
+	// xmm0 = 0
+	// xmm2 = y0
+	// xmm3 = uv1 (ltf)
+	// xmm4 = x0
+	// xmm1, xmm5, xmm6 = free
+	// xmm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i y1 = uv1.uph16() << tw;
+		// GSVector4i x1 = uv1.upl16();
+
+		movdqa(xmm6, xmm3);
+		punpckhwd(xmm3, xmm0);
+		punpcklwd(xmm6, xmm0);
+		pslld(xmm3, m_sel.tw + 3);
+
+		// xmm2 = y0
+		// xmm3 = y1
+		// xmm4 = x0
+		// xmm6 = x1
+		// xmm0, xmm1, xmm5 = free
+		// xmm7 = used
+
+		// GSVector4i addr00 = y0 + x0;
+		// GSVector4i addr01 = y0 + x1;
+		// GSVector4i addr10 = y1 + x0;
+		// GSVector4i addr11 = y1 + x1;
+
+		movdqa(xmm5, xmm2);
+		paddd(xmm5, xmm4);
+		paddd(xmm2, xmm6);
+
+		movdqa(xmm0, xmm3);
+		paddd(xmm0, xmm4);
+		paddd(xmm3, xmm6);
+
+		// xmm5 = addr00
+		// xmm2 = addr01
+		// xmm0 = addr10
+		// xmm3 = addr11
+		// xmm1, xmm4, xmm6 = free
+		// xmm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// xmm6 = c00
+		// xmm4 = c01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm0, xmm2, xmm3 = free
+		// xmm7 = used
+
+		movdqa(xmm0, ptr[&m_local.temp.uf]);
+
+		// GSVector4i rb00 = c00 & mask; (the shift pair below clears the high byte of every word)
+		// GSVector4i ga00 = (c00 >> 8) & mask;
+
+		movdqa(xmm2, xmm6);
+		psllw(xmm2, 8);
+		psrlw(xmm2, 8);
+		psrlw(xmm6, 8);
+
+		// GSVector4i rb01 = c01 & mask;
+		// GSVector4i ga01 = (c01 >> 8) & mask;
+
+		movdqa(xmm3, xmm4);
+		psllw(xmm3, 8);
+		psrlw(xmm3, 8);
+		psrlw(xmm4, 8);
+
+		// xmm0 = uf
+		// xmm2 = rb00
+		// xmm3 = rb01
+		// xmm6 = ga00
+		// xmm4 = ga01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm7 = used
+
+		// rb00 = rb00.lerp_4(rb01, uf); (the result is left in the first register passed to lerp16_4)
+		// ga00 = ga00.lerp_4(ga01, uf);
+
+		lerp16_4(xmm3, xmm2, xmm0);
+		lerp16_4(xmm4, xmm6, xmm0);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm2, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i rb10 = c10 & mask;
+		// GSVector4i ga10 = (c10 >> 8) & mask;
+
+		movdqa(xmm2, xmm1);
+		psllw(xmm1, 8);
+		psrlw(xmm1, 8);
+		psrlw(xmm2, 8);
+
+		// GSVector4i rb11 = c11 & mask;
+		// GSVector4i ga11 = (c11 >> 8) & mask;
+
+		movdqa(xmm6, xmm5);
+		psllw(xmm5, 8);
+		psrlw(xmm5, 8);
+		psrlw(xmm6, 8);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = rb10
+		// xmm5 = rb11
+		// xmm2 = ga10
+		// xmm6 = ga11
+		// xmm7 = used
+
+		// rb10 = rb10.lerp_4(rb11, uf);
+		// ga10 = ga10.lerp_4(ga11, uf);
+
+		lerp16_4(xmm5, xmm1, xmm0);
+		lerp16_4(xmm6, xmm2, xmm0);
+
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm5 = rb10
+		// xmm6 = ga10
+		// xmm0, xmm1, xmm2 = free
+		// xmm7 = used
+
+		// rb00 = rb00.lerp_4(rb10, vf); (final result: xmm5 = rb, xmm6 = ga)
+		// ga00 = ga00.lerp_4(ga10, vf);
+
+		movdqa(xmm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(xmm5, xmm3, xmm0);
+		lerp16_4(xmm6, xmm4, xmm0);
+	}
+	else
+	{
+		// GSVector4i addr00 = y0 + x0;
+
+		paddd(xmm2, xmm4);
+		movdqa(xmm5, xmm2); // the address is handed to ReadTexel in xmm5
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector4i mask = GSVector4i::x00ff();
+
+		// c[0] = c00 & mask; (c00 comes back in xmm6, result: xmm5 = rb, xmm6 = ga)
+		// c[1] = (c00 >> 8) & mask;
+
+		movdqa(xmm5, xmm6);
+		psllw(xmm5, 8);
+		psrlw(xmm5, 8);
+		psrlw(xmm6, 8);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
+{
+	// Emits the texture address wrapping code for a single packed u/v register.
+	// Scratch registers: xmm0, xmm1, xmm4, xmm5, xmm6.
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// One axis clamps and the other repeats: compute both results, then select with t.mask.
+		movdqa(xmm4, ptr[&m_local.gd->t.min]);
+		movdqa(xmm5, ptr[&m_local.gd->t.max]);
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		// xmm1 = repeat = (uv & t.min) | t.max (the OR only in region mode)
+		movdqa(xmm1, uv);
+		pand(xmm1, xmm4);
+
+		if(use_region)
+		{
+			por(xmm1, xmm5);
+		}
+
+		// uv = clamp = uv.sat_i16(t.min, t.max)
+		pmaxsw(uv, xmm4);
+		pminsw(uv, xmm5);
+
+		// uv = clamp.blend8(repeat, t.mask)
+		blend8(uv, xmm1);
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat.
+		pand(uv, ptr[&m_local.gd->t.min]);
+
+		if(use_region)
+		{
+			por(uv, ptr[&m_local.gd->t.max]);
+		}
+
+		return;
+	}
+
+	// Both axes clamp; without a region the lower bound is simply zero.
+
+	if(use_region)
+	{
+		pmaxsw(uv, ptr[&m_local.gd->t.min]);
+	}
+	else
+	{
+		pxor(xmm0, xmm0);
+		pmaxsw(uv, xmm0);
+	}
+
+	pminsw(uv, ptr[&m_local.gd->t.max]);
+}
+
+void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
+{ // Emits the texture address wrapping code for two packed u/v registers (both bilinear sample positions) at once.
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if(wms_clamp == wmt_clamp) // both axes use the same kind of wrapping, no per-axis blend is needed
+	{
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				movdqa(xmm4, ptr[&m_local.gd->t.min]);
+				pmaxsw(uv0, xmm4);
+				pmaxsw(uv1, xmm4);
+			}
+			else
+			{
+				pxor(xmm0, xmm0); // without a region the lower bound is zero
+				pmaxsw(uv0, xmm0);
+				pmaxsw(uv1, xmm0);
+			}
+
+			movdqa(xmm5, ptr[&m_local.gd->t.max]);
+			pminsw(uv0, xmm5);
+			pminsw(uv1, xmm5);
+		}
+		else
+		{
+			movdqa(xmm4, ptr[&m_local.gd->t.min]);
+			pand(uv0, xmm4);
+			pand(uv1, xmm4);
+
+			if(region)
+			{
+				movdqa(xmm5, ptr[&m_local.gd->t.max]);
+				por(uv0, xmm5);
+				por(uv1, xmm5);
+			}
+		}
+	}
+	else
+	{
+		movdqa(xmm4, ptr[&m_local.gd->t.min]);
+		movdqa(xmm5, ptr[&m_local.gd->t.max]);
+
+		#if _M_SSE >= 0x401
+		
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]); // pblendvb takes its mask implicitly from xmm0
+		
+		#else
+		
+		movdqa(xmm0, ptr[&m_local.gd->t.invmask]); // NOTE(review): a second copy is kept in xmm6, presumably because blendr clobbers its mask - confirm
+		movdqa(xmm6, xmm0);
+		
+		#endif
+
+		// uv0
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; (xmm1 = repeat, the OR only in region mode)
+
+		movdqa(xmm1, uv0);
+
+		pand(xmm1, xmm4);
+
+		if(region)
+		{
+			por(xmm1, xmm5);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		pmaxsw(uv0, xmm4);
+		pminsw(uv0, xmm5);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		#if _M_SSE >= 0x401
+		
+		pblendvb(uv0, xmm1);
+
+		#else
+
+		blendr(uv0, xmm1, xmm0);
+
+		#endif
+
+		// uv1
+
+		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
+
+		movdqa(xmm1, uv1);
+
+		pand(xmm1, xmm4);
+
+		if(region)
+		{
+			por(xmm1, xmm5);
+		}
+
+		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
+
+		pmaxsw(uv1, xmm4);
+		pminsw(uv1, xmm5);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		#if _M_SSE >= 0x401
+
+		pblendvb(uv1, xmm1);
+
+		#else
+		
+		blendr(uv1, xmm1, xmm6); // uses the copy of invmask saved in xmm6
+
+		#endif
+	}
+}
+
+void GSDrawScanlineCodeGenerator::SampleTextureLOD()
+{ // Emits the mip-mapped texture fetch: computes the lod, samples one level (two unless mmin == 1), result: xmm5 = rb, xmm6 = ga.
+	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
+	{
+		return;
+	}
+
+	push(ebp); // ebp is used as the texture base below, it is restored by the pop at the end
+
+	mov(ebp, (size_t)m_local.gd->tex); // ebp = address of the tex[] array, emitted as an immediate
+
+	if(m_sel.tlu)
+	{
+		mov(edx, ptr[&m_local.gd->clut]);
+	}
+
+	if(!m_sel.fst)
+	{
+		rcpps(xmm0, xmm4); // xmm0 = 1 / q (approximation), q itself stays in xmm4 for the lod computation
+
+		mulps(xmm2, xmm0);
+		mulps(xmm3, xmm0);
+
+		cvttps2dq(xmm2, xmm2);
+		cvttps2dq(xmm3, xmm3);
+	}
+
+	// xmm2 = u
+	// xmm3 = v
+	// xmm4 = q
+	// xmm0 = xmm1 = xmm5 = xmm6 = free
+
+	// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?)
+
+	if(!m_sel.lcm)
+	{
+		// store u/v (interleaved: uv[0] = u0 v0 u1 v1, uv[1] = u2 v2 u3 v3)
+
+		movdqa(xmm0, xmm2);
+		punpckldq(xmm2, xmm3);
+		movdqa(ptr[&m_local.temp.uv[0]], xmm2);
+		punpckhdq(xmm0, xmm3);
+		movdqa(ptr[&m_local.temp.uv[1]], xmm0);
+
+		// lod = -log2(Q) * (1 << L) + K
+
+		movdqa(xmm0, xmm4);
+		pcmpeqd(xmm1, xmm1);
+		psrld(xmm1, 25); // xmm1 = 127 in every lane
+		pslld(xmm0, 1); // drop the sign bit
+		psrld(xmm0, 24); // xmm0 = the 8 exponent bits of q
+		psubd(xmm0, xmm1);
+		cvtdq2ps(xmm0, xmm0); 
+
+		// xmm0 = (float)(exp(q) - 127)
+
+		pslld(xmm4, 9); // keep only the 23 mantissa bits of q
+		psrld(xmm4, 9);
+		orps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+			
+		// xmm4 = mant(q) | 1.0f
+
+		movdqa(xmm5, xmm4);
+		mulps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
+		addps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
+		mulps(xmm5, xmm4);
+		subps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); 
+		addps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
+		mulps(xmm4, xmm5);
+		addps(xmm4, xmm0);
+
+		// xmm4 = log2(Q) = ((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0
+
+		mulps(xmm4, ptr[&m_local.gd->l]);
+		addps(xmm4, ptr[&m_local.gd->k]);
+
+		// xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000
+
+		xorps(xmm0, xmm0);
+		minps(xmm4, ptr[&m_local.gd->mxl]); // clamp the lod to [0, mxl]
+		maxps(xmm4, xmm0);
+		cvtps2dq(xmm4, xmm4);
+
+		if(m_sel.mmin == 1) // round-off mode
+		{
+			mov(eax, 0x8000); // add one half (0x8000) before the >> 16 below
+			movd(xmm0, eax);
+			pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+			paddd(xmm4, xmm0);
+		}
+
+		movdqa(xmm0, xmm4);
+		psrld(xmm4, 16); // integer part of the lod
+		movdqa(ptr[&m_local.temp.lod.i], xmm4);
+
+		if(m_sel.mmin == 2) // trilinear mode
+		{
+			pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); // the low word of every lane (the lod fraction) duplicated into both words
+			pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			movdqa(ptr[&m_local.temp.lod.f], xmm0);
+		}
+
+		// shift u/v by (int)lod (each pixel has its own shift count, so it is done one pixel at a time)
+
+		movq(xmm4, ptr[&m_local.gd->t.minmax]);
+
+		movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
+		movdqa(xmm5, xmm2);
+		movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
+		movdqa(xmm6, xmm3);
+
+		movd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); 
+		psrad(xmm2, xmm0);
+		movdqa(xmm1, xmm4);
+		psrlw(xmm1, xmm0);
+		movq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1);
+
+		movd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]);
+		psrad(xmm5, xmm0);
+		movdqa(xmm1, xmm4);
+		psrlw(xmm1, xmm0);
+		movq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1);
+
+		movd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]);
+		psrad(xmm3, xmm0);
+		movdqa(xmm1, xmm4);
+		psrlw(xmm1, xmm0);
+		movq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1);
+
+		movd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]);
+		psrad(xmm6, xmm0);
+		movdqa(xmm1, xmm4);
+		psrlw(xmm1, xmm0);
+		movq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1);
+
+		punpckldq(xmm2, xmm3); // the unpacks below de-interleave the shifted values back into xmm2 = u, xmm3 = v
+		punpckhdq(xmm5, xmm6);
+		movdqa(xmm3, xmm2);
+		punpckldq(xmm2, xmm5);
+		punpckhdq(xmm3, xmm5);
+
+		movdqa(ptr[&m_local.temp.uv[0]], xmm2);
+		movdqa(ptr[&m_local.temp.uv[1]], xmm3);
+
+		movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
+		movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+
+		movdqa(xmm0, xmm5); // the per-pixel min/max values are rearranged the same way into xmm5 = minuv, xmm6 = maxuv
+		punpcklwd(xmm5, xmm6);
+		punpckhwd(xmm0, xmm6);
+		movdqa(xmm6, xmm5);
+		punpckldq(xmm5, xmm0);
+		punpckhdq(xmm6, xmm0);
+
+		movdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
+		movdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
+	}
+	else
+	{
+		// lod = K (the same shift count is applied to all four pixels)
+
+		movd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]);
+
+		psrad(xmm2, xmm0);
+		psrad(xmm3, xmm0);
+
+		movdqa(ptr[&m_local.temp.uv[0]], xmm2);
+		movdqa(ptr[&m_local.temp.uv[1]], xmm3);
+
+		movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); // NOTE(review): uv_minmax is only read in this branch, presumably it is filled in elsewhere - confirm
+		movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+	}
+
+	// xmm2 = m_local.temp.uv[0] = u (level m)
+	// xmm3 = m_local.temp.uv[1] = v (level m)
+	// xmm5 = minuv
+	// xmm6 = maxuv
+
+	if(m_sel.ltf)
+	{
+		// u -= 0x8000;
+		// v -= 0x8000;
+
+		mov(eax, 0x8000);
+		movd(xmm4, eax);
+		pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+		psubd(xmm2, xmm4);
+		psubd(xmm3, xmm4);
+
+		// GSVector4i uf = u.xxzzlh().srl16(12);
+	
+		pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+		pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		psrlw(xmm0, 12);
+		movdqa(ptr[&m_local.temp.uf], xmm0);
+
+		// GSVector4i vf = v.xxzzlh().srl16(12);
+
+		pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+		pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+		psrlw(xmm0, 12);
+		movdqa(ptr[&m_local.temp.vf], xmm0);
+	}
+
+	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+	psrad(xmm2, 16);
+	psrad(xmm3, 16);
+	packssdw(xmm2, xmm3);
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+		movdqa(xmm3, xmm2);
+		pcmpeqd(xmm1, xmm1);
+		psrlw(xmm1, 15); // xmm1 = 0x0001 in every word
+		paddw(xmm3, xmm1);
+
+		// uv0 = Wrap(uv0);
+		// uv1 = Wrap(uv1);
+
+		WrapLOD(xmm2, xmm3);
+	}
+	else
+	{
+		// uv0 = Wrap(uv0);
+
+		WrapLOD(xmm2);
+	}
+
+	// xmm2 = uv0
+	// xmm3 = uv1 (ltf)
+	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+	// xmm7 = used
+
+	// GSVector4i x0 = uv0.upl16();
+	// GSVector4i y0 = uv0.uph16() << tw;
+
+	pxor(xmm0, xmm0);
+
+	movdqa(xmm4, xmm2);
+	punpckhwd(xmm2, xmm0);
+	punpcklwd(xmm4, xmm0);
+	pslld(xmm2, m_sel.tw + 3); // the emitted shift count is tw + 3
+
+	// xmm0 = 0
+	// xmm2 = y0
+	// xmm3 = uv1 (ltf)
+	// xmm4 = x0
+	// xmm1, xmm5, xmm6 = free
+	// xmm7 = used
+
+	if(m_sel.ltf)
+	{
+		// GSVector4i x1 = uv1.upl16();
+		// GSVector4i y1 = uv1.uph16() << tw;
+
+		movdqa(xmm6, xmm3);
+		punpcklwd(xmm6, xmm0);
+		punpckhwd(xmm3, xmm0);
+		pslld(xmm3, m_sel.tw + 3);
+
+		// xmm2 = y0
+		// xmm3 = y1
+		// xmm4 = x0
+		// xmm6 = x1
+		// xmm0, xmm1, xmm5 = free
+		// xmm7 = used
+
+		// GSVector4i addr00 = y0 + x0;
+		// GSVector4i addr01 = y0 + x1;
+		// GSVector4i addr10 = y1 + x0;
+		// GSVector4i addr11 = y1 + x1;
+
+		movdqa(xmm5, xmm2);
+		paddd(xmm5, xmm4);
+		paddd(xmm2, xmm6);
+
+		movdqa(xmm0, xmm3);
+		paddd(xmm0, xmm4);
+		paddd(xmm3, xmm6);
+
+		// xmm5 = addr00
+		// xmm2 = addr01
+		// xmm0 = addr10
+		// xmm3 = addr11
+		// xmm1, xmm4, xmm6 = free
+		// xmm7 = used
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(4, 0);
+
+		// xmm6 = c00
+		// xmm4 = c01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm0, xmm2, xmm3 = free
+		// xmm7 = used
+
+		movdqa(xmm0, ptr[&m_local.temp.uf]);
+
+		// GSVector4i rb00 = c00 & mask;
+		// GSVector4i ga00 = (c00 >> 8) & mask;
+
+		movdqa(xmm2, xmm6);
+		psrlw(xmm6, 8);
+		psllw(xmm2, 8);
+		psrlw(xmm2, 8);
+
+		// GSVector4i rb01 = c01 & mask;
+		// GSVector4i ga01 = (c01 >> 8) & mask;
+
+		movdqa(xmm3, xmm4);
+		psrlw(xmm4, 8);
+		psllw(xmm3, 8);
+		psrlw(xmm3, 8);
+
+		// xmm0 = uf
+		// xmm2 = rb00
+		// xmm3 = rb01
+		// xmm6 = ga00
+		// xmm4 = ga01
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm7 = used
+
+		// rb00 = rb00.lerp_4(rb01, uf); (the result is left in the first register passed to lerp16_4)
+		// ga00 = ga00.lerp_4(ga01, uf);
+
+		lerp16_4(xmm3, xmm2, xmm0);
+		lerp16_4(xmm4, xmm6, xmm0);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = c10
+		// xmm5 = c11
+		// xmm2, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i rb10 = c10 & mask;
+		// GSVector4i ga10 = (c10 >> 8) & mask;
+
+		movdqa(xmm2, xmm1);
+		psllw(xmm1, 8);
+		psrlw(xmm1, 8);
+		psrlw(xmm2, 8);
+
+		// GSVector4i rb11 = c11 & mask;
+		// GSVector4i ga11 = (c11 >> 8) & mask;
+
+		movdqa(xmm6, xmm5);
+		psllw(xmm5, 8);
+		psrlw(xmm5, 8);
+		psrlw(xmm6, 8);
+
+		// xmm0 = uf
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm1 = rb10
+		// xmm5 = rb11
+		// xmm2 = ga10
+		// xmm6 = ga11
+		// xmm7 = used
+
+		// rb10 = rb10.lerp_4(rb11, uf);
+		// ga10 = ga10.lerp_4(ga11, uf);
+
+		lerp16_4(xmm5, xmm1, xmm0);
+		lerp16_4(xmm6, xmm2, xmm0);
+
+		// xmm3 = rb00
+		// xmm4 = ga00
+		// xmm5 = rb10
+		// xmm6 = ga10
+		// xmm0, xmm1, xmm2 = free
+		// xmm7 = used
+
+		// rb00 = rb00.lerp_4(rb10, vf); (result of this level: xmm5 = rb, xmm6 = ga)
+		// ga00 = ga00.lerp_4(ga10, vf);
+
+		movdqa(xmm0, ptr[&m_local.temp.vf]);
+
+		lerp16_4(xmm5, xmm3, xmm0);
+		lerp16_4(xmm6, xmm4, xmm0);
+	}
+	else
+	{
+		// GSVector4i addr00 = y0 + x0;
+
+		paddd(xmm2, xmm4);
+		movdqa(xmm5, xmm2); // the address is handed to ReadTexel in xmm5
+
+		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+		ReadTexel(1, 0);
+
+		// GSVector4i mask = GSVector4i::x00ff();
+
+		// c[0] = c00 & mask; (c00 comes back in xmm6, result: xmm5 = rb, xmm6 = ga)
+		// c[1] = (c00 >> 8) & mask;
+
+		movdqa(xmm5, xmm6);
+		psllw(xmm5, 8);
+		psrlw(xmm5, 8);
+		psrlw(xmm6, 8);
+	}
+
+	if(m_sel.mmin != 1) // !round-off mode
+	{
+		movdqa(ptr[&m_local.temp.trb], xmm5); // save the first sample, it is blended with the second one at the end
+		movdqa(ptr[&m_local.temp.tga], xmm6);
+
+		movdqa(xmm2, ptr[&m_local.temp.uv[0]]);
+		movdqa(xmm3, ptr[&m_local.temp.uv[1]]);
+
+		psrad(xmm2, 1); // u, v and the min/max bounds are halved for the second sample
+		psrad(xmm3, 1);
+
+		movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]);
+		movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]);
+
+		psrlw(xmm5, 1);
+		psrlw(xmm6, 1);
+
+		if(m_sel.ltf)
+		{
+			// u -= 0x8000;
+			// v -= 0x8000;
+
+			mov(eax, 0x8000);
+			movd(xmm4, eax);
+			pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
+
+			psubd(xmm2, xmm4);
+			psubd(xmm3, xmm4);
+
+			// GSVector4i uf = u.xxzzlh().srl16(12);
+	
+			pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			psrlw(xmm0, 12);
+			movdqa(ptr[&m_local.temp.uf], xmm0);
+
+			// GSVector4i vf = v.xxzzlh().srl16(12);
+
+			pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
+			psrlw(xmm0, 12);
+			movdqa(ptr[&m_local.temp.vf], xmm0);
+		}
+
+		// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
+
+		psrad(xmm2, 16);
+		psrad(xmm3, 16);
+		packssdw(xmm2, xmm3);
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
+
+			movdqa(xmm3, xmm2);
+			pcmpeqd(xmm1, xmm1);
+			psrlw(xmm1, 15);
+			paddw(xmm3, xmm1);
+
+			// uv0 = Wrap(uv0);
+			// uv1 = Wrap(uv1);
+
+			WrapLOD(xmm2, xmm3);
+		}
+		else
+		{
+			// uv0 = Wrap(uv0);
+
+			WrapLOD(xmm2);
+		}
+
+		// xmm2 = uv0
+		// xmm3 = uv1 (ltf)
+		// xmm0, xmm1, xmm4, xmm5, xmm6 = free
+		// xmm7 = used
+
+		// GSVector4i x0 = uv0.upl16();
+		// GSVector4i y0 = uv0.uph16() << tw;
+
+		pxor(xmm0, xmm0);
+
+		movdqa(xmm4, xmm2);
+		punpckhwd(xmm2, xmm0);
+		punpcklwd(xmm4, xmm0);
+		pslld(xmm2, m_sel.tw + 3);
+
+		// xmm0 = 0
+		// xmm2 = y0
+		// xmm3 = uv1 (ltf)
+		// xmm4 = x0
+		// xmm1, xmm5, xmm6 = free
+		// xmm7 = used
+
+		if(m_sel.ltf)
+		{
+			// GSVector4i x1 = uv1.upl16();
+			// GSVector4i y1 = uv1.uph16() << tw;
+
+			movdqa(xmm6, xmm3);
+			punpckhwd(xmm3, xmm0);
+			punpcklwd(xmm6, xmm0);
+			pslld(xmm3, m_sel.tw + 3);
+
+			// xmm2 = y0
+			// xmm3 = y1
+			// xmm4 = x0
+			// xmm6 = x1
+			// xmm0, xmm1, xmm5 = free
+			// xmm7 = used
+
+			// GSVector4i addr00 = y0 + x0;
+			// GSVector4i addr01 = y0 + x1;
+			// GSVector4i addr10 = y1 + x0;
+			// GSVector4i addr11 = y1 + x1;
+
+			movdqa(xmm5, xmm2);
+			paddd(xmm5, xmm4);
+			paddd(xmm2, xmm6);
+
+			movdqa(xmm0, xmm3);
+			paddd(xmm0, xmm4);
+			paddd(xmm3, xmm6);
+
+			// xmm5 = addr00
+			// xmm2 = addr01
+			// xmm0 = addr10
+			// xmm3 = addr11
+			// xmm1, xmm4, xmm6 = free
+			// xmm7 = used
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
+			// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(4, 1);
+
+			// xmm6 = c00
+			// xmm4 = c01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm0, xmm2, xmm3 = free
+			// xmm7 = used
+
+			movdqa(xmm0, ptr[&m_local.temp.uf]);
+
+			// GSVector4i rb00 = c00 & mask;
+			// GSVector4i ga00 = (c00 >> 8) & mask;
+
+			movdqa(xmm2, xmm6);
+			psllw(xmm2, 8);
+			psrlw(xmm2, 8);
+			psrlw(xmm6, 8);
+
+			// GSVector4i rb01 = c01 & mask;
+			// GSVector4i ga01 = (c01 >> 8) & mask;
+
+			movdqa(xmm3, xmm4);
+			psllw(xmm3, 8);
+			psrlw(xmm3, 8);
+			psrlw(xmm4, 8);
+
+			// xmm0 = uf
+			// xmm2 = rb00
+			// xmm3 = rb01
+			// xmm6 = ga00
+			// xmm4 = ga01
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm7 = used
+
+			// rb00 = rb00.lerp_4(rb01, uf);
+			// ga00 = ga00.lerp_4(ga01, uf);
+
+			lerp16_4(xmm3, xmm2, xmm0);
+			lerp16_4(xmm4, xmm6, xmm0);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = c10
+			// xmm5 = c11
+			// xmm2, xmm6 = free
+			// xmm7 = used
+
+			// GSVector4i rb10 = c10 & mask;
+			// GSVector4i ga10 = (c10 >> 8) & mask;
+
+			movdqa(xmm2, xmm1);
+			psllw(xmm1, 8);
+			psrlw(xmm1, 8);
+			psrlw(xmm2, 8);
+
+			// GSVector4i rb11 = c11 & mask;
+			// GSVector4i ga11 = (c11 >> 8) & mask;
+
+			movdqa(xmm6, xmm5);
+			psllw(xmm5, 8);
+			psrlw(xmm5, 8);
+			psrlw(xmm6, 8);
+
+			// xmm0 = uf
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm1 = rb10
+			// xmm5 = rb11
+			// xmm2 = ga10
+			// xmm6 = ga11
+			// xmm7 = used
+
+			// rb10 = rb10.lerp_4(rb11, uf);
+			// ga10 = ga10.lerp_4(ga11, uf);
+
+			lerp16_4(xmm5, xmm1, xmm0);
+			lerp16_4(xmm6, xmm2, xmm0);
+
+			// xmm3 = rb00
+			// xmm4 = ga00
+			// xmm5 = rb10
+			// xmm6 = ga10
+			// xmm0, xmm1, xmm2 = free
+			// xmm7 = used
+
+			// rb00 = rb00.lerp_4(rb10, vf);
+			// ga00 = ga00.lerp_4(ga10, vf);
+
+			movdqa(xmm0, ptr[&m_local.temp.vf]);
+
+			lerp16_4(xmm5, xmm3, xmm0);
+			lerp16_4(xmm6, xmm4, xmm0);
+		}
+		else
+		{
+			// GSVector4i addr00 = y0 + x0;
+
+			paddd(xmm2, xmm4);
+			movdqa(xmm5, xmm2);
+
+			// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
+
+			ReadTexel(1, 1);
+
+			// GSVector4i mask = GSVector4i::x00ff();
+
+			// c[0] = c00 & mask;
+			// c[1] = (c00 >> 8) & mask;
+
+			movdqa(xmm5, xmm6);
+			psllw(xmm5, 8);
+			psrlw(xmm5, 8);
+			psrlw(xmm6, 8);
+		}
+
+		movdqa(xmm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]); // xmm0 = lod fraction, constant (gd) or per pixel (temp)
+		psrlw(xmm0, 1);
+
+		movdqa(xmm2, ptr[&m_local.temp.trb]);
+		movdqa(xmm3, ptr[&m_local.temp.tga]);
+
+		lerp16(xmm5, xmm2, xmm0, 0); // blend the second sample (xmm5/xmm6) with the saved first one (xmm2/xmm3) by the lod fraction
+		lerp16(xmm6, xmm3, xmm0, 0);
+	}
+
+	pop(ebp);
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
+{
+	// Emits the texture address wrapping code for a single packed u/v register,
+	// using the bounds the caller left in xmm5 (minuv) and xmm6 (maxuv).
+	// Scratch registers: xmm0, xmm1, xmm4.
+
+	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
+	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
+	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;
+
+	if(clamp_s != clamp_t)
+	{
+		// One axis clamps and the other repeats: compute both results, then select with t.mask.
+
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]);
+
+		// xmm1 = repeat = (uv & minuv) | maxuv (the OR only in region mode)
+		movdqa(xmm1, uv);
+		pand(xmm1, xmm5);
+
+		if(use_region)
+		{
+			por(xmm1, xmm6);
+		}
+
+		// uv = clamp = uv.sat_i16(minuv, maxuv)
+		pmaxsw(uv, xmm5);
+		pminsw(uv, xmm6);
+
+		// uv = clamp.blend8(repeat, t.mask)
+		blend8(uv, xmm1);
+
+		return;
+	}
+
+	if(!clamp_s)
+	{
+		// Both axes repeat.
+		pand(uv, xmm5);
+
+		if(use_region)
+		{
+			por(uv, xmm6);
+		}
+
+		return;
+	}
+
+	// Both axes clamp; without a region the lower bound is simply zero.
+
+	if(use_region)
+	{
+		pmaxsw(uv, xmm5);
+	}
+	else
+	{
+		pxor(xmm0, xmm0);
+		pmaxsw(uv, xmm0);
+	}
+
+	pminsw(uv, xmm6);
+}
+
+void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
+{ // Emits the texture address wrapping code for two packed u/v registers, with the bounds taken from xmm5/xmm6.
+	// xmm5 = minuv
+	// xmm6 = maxuv
+	// xmm0, xmm1, xmm4 = free
+
+	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
+	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
+
+	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
+
+	if(wms_clamp == wmt_clamp) // both axes use the same kind of wrapping, no per-axis blend is needed
+	{
+		if(wms_clamp)
+		{
+			if(region)
+			{
+				pmaxsw(uv0, xmm5);
+				pmaxsw(uv1, xmm5);
+			}
+			else
+			{
+				pxor(xmm0, xmm0); // without a region the lower bound is zero
+				pmaxsw(uv0, xmm0);
+				pmaxsw(uv1, xmm0);
+			}
+
+			pminsw(uv0, xmm6);
+			pminsw(uv1, xmm6);
+		}
+		else
+		{
+			pand(uv0, xmm5);
+			pand(uv1, xmm5);
+
+			if(region)
+			{
+				por(uv0, xmm6);
+				por(uv1, xmm6);
+			}
+		}
+	}
+	else
+	{
+		#if _M_SSE >= 0x401
+		
+		movdqa(xmm0, ptr[&m_local.gd->t.mask]); // pblendvb takes its mask implicitly from xmm0
+
+		#else
+		
+		movdqa(xmm0, ptr[&m_local.gd->t.invmask]); // NOTE(review): a second copy is kept in xmm4, presumably because blendr clobbers its mask - confirm
+		movdqa(xmm4, xmm0);
+
+		#endif
+
+		// uv0
+
+		// GSVector4i repeat = (t & minuv) | maxuv; (xmm1 = repeat, the OR only in region mode)
+
+		movdqa(xmm1, uv0);
+
+		pand(xmm1, xmm5);
+
+		if(region)
+		{
+			por(xmm1, xmm6);
+		}
+
+		// GSVector4i clamp = t.sat_i16(minuv, maxuv);
+
+		pmaxsw(uv0, xmm5);
+		pminsw(uv0, xmm6);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		#if _M_SSE >= 0x401
+
+		pblendvb(uv0, xmm1);
+
+		#else
+		
+		blendr(uv0, xmm1, xmm0);
+
+		#endif
+
+		// uv1
+
+		// GSVector4i repeat = (t & minuv) | maxuv;
+
+		movdqa(xmm1, uv1);
+
+		pand(xmm1, xmm5);
+
+		if(region)
+		{
+			por(xmm1, xmm6);
+		}
+
+		// GSVector4i clamp = t.sat_i16(minuv, maxuv);
+
+		pmaxsw(uv1, xmm5);
+		pminsw(uv1, xmm6);
+
+		// clamp.blend8(repeat, m_local.gd->t.mask);
+
+		#if _M_SSE >= 0x401
+		
+		pblendvb(uv1, xmm1);
+
+		#else
+		
+		blendr(uv1, xmm1, xmm4); // uses the copy of invmask saved in xmm4
+
+		#endif
+	}
+}
+
+void GSDrawScanlineCodeGenerator::AlphaTFX()
+{ // Emits the alpha part of the texture function on xmm6 (gat), followed by the optional aa1 coverage handling.
+	if(!m_sel.fb)
+	{
+		return;
+	}
+
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+		movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+		// gat = gat.modulate16<1>(ga).clamp8();
+
+		modulate16(xmm6, xmm4, 1);
+
+		clamp16(xmm6, xmm3);
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			psrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_DECAL:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc)
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+
+			psrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_HIGHLIGHT:
+
+		// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+		movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		movdqa(xmm2, xmm4); // unshifted copy of ga, ColorTFX reads it from xmm2
+
+		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
+
+		psrlw(xmm4, 7);
+
+		if(m_sel.tcc)
+		{
+			paddusb(xmm4, xmm6);
+		}
+
+		mix16(xmm6, xmm4, xmm3);
+
+		break;
+
+	case TFX_HIGHLIGHT2:
+
+		// if(!tcc) gat = gat.mix16(ga.srl16(7));
+
+		if(!m_sel.tcc) // with tcc nothing is emitted here and ColorTFX loads ga into xmm2 itself
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+			movdqa(xmm2, xmm4); // unshifted copy of ga, ColorTFX reads it from xmm2
+
+			psrlw(xmm4, 7);
+
+			mix16(xmm6, xmm4, xmm3);
+		}
+
+		break;
+
+	case TFX_NONE:
+
+		// gat = iip ? ga.srl16(7) : ga;
+
+		if(m_sel.iip)
+		{
+			psrlw(xmm6, 7);
+		}
+
+		break;
+	}
+
+	if(m_sel.aa1)
+	{
+		// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
+
+		// FIXME: bios config screen cubes
+
+		if(!m_sel.abe)
+		{
+			// a = cov
+
+			if(m_sel.edge)
+			{
+				movdqa(xmm0, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				pcmpeqd(xmm0, xmm0);
+				psllw(xmm0, 15);
+				psrlw(xmm0, 8); // xmm0 = 0x0080 in every word
+			}
+
+			mix16(xmm6, xmm0, xmm1);
+		}
+		else
+		{
+			// a = a == 0x80 ? cov : a
+
+			pcmpeqd(xmm0, xmm0);
+			psllw(xmm0, 15);
+			psrlw(xmm0, 8); // xmm0 = 0x0080 in every word
+
+			if(m_sel.edge)
+			{
+				movdqa(xmm1, ptr[&m_local.temp.cov]);
+			}
+			else
+			{
+				movdqa(xmm1, xmm0);
+			}
+
+			pcmpeqw(xmm0, xmm6);
+			psrld(xmm0, 16); // keep the compare result of the high (alpha) word only
+			pslld(xmm0, 16);
+
+			blend8(xmm6, xmm1); // NOTE(review): the select mask is presumably taken from xmm0 - confirm in blend8()
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadMask()
+{
+	// xmm3 = fm, loaded only when the frame buffer is written
+
+	if(m_sel.fwrite) movdqa(xmm3, ptr[&m_local.gd->fm]);
+
+	// xmm4 = zm, loaded only when the z buffer is written
+
+	if(m_sel.zwrite) movdqa(xmm4, ptr[&m_local.gd->zm]);
+
+	// TestAlpha() ORs its failure mask into these two registers
+}
+
+void GSDrawScanlineCodeGenerator::TestAlpha()
+{ // Emits the alpha test: builds the failure mask t in xmm1 from the alpha in xmm6 and applies it according to afail.
+	switch(m_sel.afail) // nothing is emitted when the only mask that a failure would change is not going to be used
+	{
+	case AFAIL_FB_ONLY:
+		if(!m_sel.zwrite) return;
+		break;
+
+	case AFAIL_ZB_ONLY:
+		if(!m_sel.fwrite) return;
+		break;
+
+	case AFAIL_RGB_ONLY:
+		if(!m_sel.zwrite && m_sel.fpsm == 1) return;
+		break;
+	}
+
+	switch(m_sel.atst)
+	{
+	case ATST_NEVER:
+		// t = GSVector4i::xffffffff();
+		pcmpeqd(xmm1, xmm1);
+		break;
+
+	case ATST_ALWAYS:
+		return;
+
+	case ATST_LESS:
+	case ATST_LEQUAL: // NOTE(review): both cases emit the same compare, presumably aref is adjusted by the caller - confirm
+		// t = (ga >> 16) > m_local.gd->aref;
+		movdqa(xmm1, xmm6);
+		psrld(xmm1, 16);
+		pcmpgtd(xmm1, ptr[&m_local.gd->aref]);
+		break;
+
+	case ATST_EQUAL:
+		// t = (ga >> 16) != m_local.gd->aref;
+		movdqa(xmm1, xmm6);
+		psrld(xmm1, 16);
+		pcmpeqd(xmm1, ptr[&m_local.gd->aref]);
+		pcmpeqd(xmm0, xmm0);
+		pxor(xmm1, xmm0); // invert the equality mask
+		break;
+
+	case ATST_GEQUAL:
+	case ATST_GREATER: // NOTE(review): both cases emit the same compare, presumably aref is adjusted by the caller - confirm
+		// t = (ga >> 16) < m_local.gd->aref;
+		movdqa(xmm0, xmm6);
+		psrld(xmm0, 16);
+		movdqa(xmm1, ptr[&m_local.gd->aref]);
+		pcmpgtd(xmm1, xmm0);
+		break;
+
+	case ATST_NOTEQUAL:
+		// t = (ga >> 16) == m_local.gd->aref;
+		movdqa(xmm1, xmm6);
+		psrld(xmm1, 16);
+		pcmpeqd(xmm1, ptr[&m_local.gd->aref]);
+		break;
+	}
+
+	switch(m_sel.afail)
+	{
+	case AFAIL_KEEP:
+		// test |= t;
+		por(xmm7, xmm1);
+		alltrue(); // NOTE(review): presumably emits the early exit taken when every pixel failed - confirm in alltrue()
+		break;
+
+	case AFAIL_FB_ONLY:
+		// zm |= t;
+		por(xmm4, xmm1);
+		break;
+
+	case AFAIL_ZB_ONLY:
+		// fm |= t;
+		por(xmm3, xmm1);
+		break;
+
+	case AFAIL_RGB_ONLY:
+		// zm |= t;
+		por(xmm4, xmm1);
+		// fm |= t & GSVector4i::xff000000();
+		psrld(xmm1, 24);
+		pslld(xmm1, 24);
+		por(xmm3, xmm1);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ColorTFX()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+	// xmm5 = rbt and xmm6 = gat (names from the reference comments below); both are updated in place
+	switch(m_sel.tfx)
+	{
+	case TFX_MODULATE:
+
+		// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).clamp8();
+
+		modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		clamp16(xmm5, xmm1);
+
+		break;
+
+	case TFX_DECAL:
+
+		break;
+
+	case TFX_HIGHLIGHT:
+	case TFX_HIGHLIGHT2:
+
+		if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
+		{
+			// GSVector4i ga = iip ? gaf : m_local.c.ga;
+
+			movdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
+		}
+
+		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
+		// NOTE(review): when the load above is skipped, xmm2 is presumably still ga from an earlier stage - confirm
+		movdqa(xmm1, xmm6);
+
+		modulate16(xmm6, xmm2, 1);
+
+		pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+		pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
+		psrlw(xmm2, 7);
+
+		paddw(xmm6, xmm2);
+
+		clamp16(xmm6, xmm0);
+
+		mix16(xmm6, xmm1, xmm0);
+
+		// GSVector4i rb = iip ? rbf : m_local.c.rb;
+
+		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
+
+		modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);
+
+		paddw(xmm5, xmm2);
+
+		clamp16(xmm5, xmm0);
+
+		break;
+
+	case TFX_NONE:
+
+		// rbt = iip ? rb.srl16(7) : rb;
+
+		if(m_sel.iip)
+		{
+			psrlw(xmm5, 7);
+		}
+
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::Fog()
+{
+	// Nothing is emitted unless both fwrite and fge are selected.
+	if(!(m_sel.fwrite && m_sel.fge)) return;
+
+	// reference:
+	//   rb = m_local.gd->frb.lerp16<0>(rb, f);
+	//   ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
+
+	// xmm0 = f: sprites use m_local.p.f, everything else m_local.temp.f
+	if(m_sel.prim == GS_SPRITE_CLASS) movdqa(xmm0, ptr[&m_local.p.f]);
+	else movdqa(xmm0, ptr[&m_local.temp.f]);
+
+	movdqa(xmm1, xmm6); // keep the incoming ga for the final mix16
+	movdqa(xmm2, ptr[&m_local.gd->frb]);
+	lerp16(xmm5, xmm2, xmm0, 0);
+	movdqa(xmm2, ptr[&m_local.gd->fga]);
+	lerp16(xmm6, xmm2, xmm0, 0);
+	mix16(xmm6, xmm1, xmm0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadFrame()
+{
+	// Nothing is emitted unless fb is selected.
+	if(m_sel.fb)
+	{
+		// reference: int fa = fza_base.x + fza_offset->x;
+		// ebx = [esi] + [edi]
+		mov(ebx, ptr[esi]);
+		add(ebx, ptr[edi]);
+
+		// The address is always computed; the pixels themselves are
+		// only fetched (into xmm2) when rfb is selected.
+
+		if(m_sel.rfb)
+		{
+			ReadPixel(xmm2, ebx);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::TestDestAlpha()
+{
+	if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) // i.e. date must be set and fpsm must be 0 or 2 (&& binds tighter than ||)
+	{
+		return;
+	}
+
+	// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
+	// xmm1 = working copy of fd (xmm2), turned into a per-lane all-ones/all-zeros mask below; xmm0 is scratch
+	movdqa(xmm1, xmm2);
+
+	if(m_sel.datm)
+	{
+		if(m_sel.fpsm == 2)
+		{
+			pxor(xmm0, xmm0);
+			psrld(xmm1, 15);
+			pcmpeqd(xmm1, xmm0);
+		}
+		else
+		{
+			pcmpeqd(xmm0, xmm0);
+			pxor(xmm1, xmm0);
+			psrad(xmm1, 31);
+		}
+	}
+	else
+	{
+		if(m_sel.fpsm == 2)
+		{
+			pslld(xmm1, 16);
+		}
+
+		psrad(xmm1, 31);
+	}
+	// accumulate into the test mask (xmm7)
+	por(xmm7, xmm1);
+	// NOTE(review): alltrue() is defined elsewhere; presumably it skips the pixel block once every lane of the mask is set - confirm
+	alltrue();
+}
+
+void GSDrawScanlineCodeGenerator::WriteMask()
+{
+	if(m_sel.notest)
+	{
+		return;
+	}
+	// xmm7 = test; it is merged into fm (xmm3) and zm (xmm4) for whichever buffers are written
+	// fm |= test;
+	// zm |= test;
+
+	if(m_sel.fwrite)
+	{
+		por(xmm3, xmm7);
+	}
+
+	if(m_sel.zwrite)
+	{
+		por(xmm4, xmm7);
+	}
+
+	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
+	// after packssdw the low 8 bytes describe fm and the high 8 bytes zm (the same data twice if only one is written)
+	pcmpeqd(xmm1, xmm1);
+
+	if(m_sel.fwrite && m_sel.zwrite)
+	{
+		movdqa(xmm0, xmm1);
+		pcmpeqd(xmm1, xmm3);
+		pcmpeqd(xmm0, xmm4);
+		packssdw(xmm1, xmm0);
+	}
+	else if(m_sel.fwrite)
+	{
+		pcmpeqd(xmm1, xmm3);
+		packssdw(xmm1, xmm1);
+	}
+	else if(m_sel.zwrite)
+	{
+		pcmpeqd(xmm1, xmm4);
+		packssdw(xmm1, xmm1);
+	}
+	// edx = inverted byte mask: dl is later handed to WritePixel by WriteFrame, dh by WriteZBuf
+	pmovmskb(edx, xmm1);
+
+	not(edx);
+}
+
+void GSDrawScanlineCodeGenerator::WriteZBuf()
+{
+	// No z store is emitted unless zwrite is selected.
+	if(!m_sel.zwrite) return;
+
+	// xmm1 = zs: sprites take m_local.p.z, everything else m_local.temp.zs
+	if(m_sel.prim == GS_SPRITE_CLASS) movdqa(xmm1, ptr[&m_local.p.z]);
+	else movdqa(xmm1, ptr[&m_local.temp.zs]);
+
+	const bool merge_zd = m_sel.ztest && m_sel.zpsm < 2;
+	if(merge_zd)
+	{
+		// reference: zs = zs.blend8(zd, zm);
+		movdqa(xmm0, xmm4);
+		movdqa(xmm7, ptr[&m_local.temp.zd]);
+		blend8(xmm1, xmm7);
+	}
+
+	// same value as: ztest ? zpsm < 2 : zpsm == 0 && notest
+	const bool fast = merge_zd || (!m_sel.ztest && m_sel.zpsm == 0 && m_sel.notest);
+	WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
+}
+
+void GSDrawScanlineCodeGenerator::AlphaBlend()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+
+	if(m_sel.abe == 0 && m_sel.aa1 == 0)
+	{
+		return;
+	}
+	// unpack fd (xmm2) into dst rb (xmm0) / dst ga (xmm1), but only if a selected term (selector value 1) reads the destination
+	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
+	{
+		switch(m_sel.fpsm)
+		{
+		case 0:
+		case 1:
+
+			// c[2] = fd & mask;
+			// c[3] = (fd >> 8) & mask;
+
+			movdqa(xmm0, xmm2);
+			movdqa(xmm1, xmm2);
+
+			psllw(xmm0, 8);
+			psrlw(xmm0, 8);
+			psrlw(xmm1, 8);
+
+			break;
+
+		case 2:
+
+			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
+			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
+
+			movdqa(xmm0, xmm2);
+			movdqa(xmm1, xmm2);
+			movdqa(xmm4, xmm2);
+
+			pcmpeqd(xmm7, xmm7);
+			psrld(xmm7, 27); // 0x0000001f
+			pand(xmm0, xmm7);
+			pslld(xmm0, 3);
+
+			pslld(xmm7, 10); // 0x00007c00
+			pand(xmm4, xmm7);
+			pslld(xmm4, 9);
+
+			por(xmm0, xmm4);
+
+			movdqa(xmm4, xmm1);
+
+			psrld(xmm7, 5); // 0x000003e0
+			pand(xmm1, xmm7);
+			psrld(xmm1, 2);
+
+			psllw(xmm7, 10); // 0x00008000
+			pand(xmm4, xmm7);
+			pslld(xmm4, 8);
+
+			por(xmm1, xmm4);
+
+			break;
+		}
+	}
+
+	// xmm5, xmm6 = src rb, ga
+	// xmm0, xmm1 = dst rb, ga
+	// xmm2, xmm3 = used
+	// xmm4, xmm7 = free
+	// xmm4 = copy of src rb, taken when pabe or a b == 0 / d == 0 term still needs it after xmm5 is overwritten
+	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
+	{
+		movdqa(xmm4, xmm5);
+	}
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// rb = c[aba * 2 + 0];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: movdqa(xmm5, xmm0); break;
+		case 2: pxor(xmm5, xmm5); break;
+		}
+
+		// rb = rb.sub16(c[abb * 2 + 0]);
+
+		switch(m_sel.abb)
+		{
+		case 0: psubw(xmm5, xmm4); break;
+		case 1: psubw(xmm5, xmm0); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
+
+			switch(m_sel.abc)
+			{
+			case 0:
+			case 1:
+				pshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1));
+				pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
+				psllw(xmm7, 7);
+				break;
+			case 2:
+				movdqa(xmm7, ptr[&m_local.gd->afix]);
+				break;
+			}
+
+			// rb = rb.modulate16<1>(a);
+
+			modulate16(xmm5, xmm7, 1);
+		}
+
+		// rb = rb.add16(c[abd * 2 + 0]);
+
+		switch(m_sel.abd)
+		{
+		case 0: paddw(xmm5, xmm4); break;
+		case 1: paddw(xmm5, xmm0); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// rb = c[abd * 2 + 0];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: movdqa(xmm5, xmm0); break;
+		case 2: pxor(xmm5, xmm5); break;
+		}
+	}
+
+	if(m_sel.pabe)
+	{
+		// mask = (c[1] << 8).sra32(31);
+
+		movdqa(xmm0, xmm6);
+		pslld(xmm0, 8);
+		psrad(xmm0, 31);
+
+		// rb = c[0].blend8(rb, mask);
+
+		blend8r(xmm5, xmm4);
+	}
+
+	// xmm6 = src ga
+	// xmm1 = dst ga
+	// xmm5 = rb
+	// xmm7 = a
+	// xmm2, xmm3 = used
+	// xmm0, xmm4 = free
+
+	movdqa(xmm4, xmm6);
+
+	if(m_sel.aba != m_sel.abb)
+	{
+		// ga = c[aba * 2 + 1];
+
+		switch(m_sel.aba)
+		{
+		case 0: break;
+		case 1: movdqa(xmm6, xmm1); break;
+		case 2: pxor(xmm6, xmm6); break;
+		}
+
+		// ga = ga.sub16(c[abb * 2 + 1]);
+
+		switch(m_sel.abb)
+		{
+		case 0: psubw(xmm6, xmm4); break;
+		case 1: psubw(xmm6, xmm1); break;
+		case 2: break;
+		}
+
+		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
+		{
+			// ga = ga.modulate16<1>(a);
+
+			modulate16(xmm6, xmm7, 1);
+		}
+
+		// ga = ga.add16(c[abd * 2 + 1]);
+
+		switch(m_sel.abd)
+		{
+		case 0: paddw(xmm6, xmm4); break;
+		case 1: paddw(xmm6, xmm1); break;
+		case 2: break;
+		}
+	}
+	else
+	{
+		// ga = c[abd * 2 + 1];
+
+		switch(m_sel.abd)
+		{
+		case 0: break;
+		case 1: movdqa(xmm6, xmm1); break;
+		case 2: pxor(xmm6, xmm6); break;
+		}
+	}
+
+	// xmm4 = src ga
+	// xmm5 = rb
+	// xmm6 = ga
+	// xmm2, xmm3 = used
+	// xmm0, xmm1, xmm7 = free
+
+	if(m_sel.pabe)
+	{
+		#if _M_SSE < 0x401
+
+		// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
+
+		movdqa(xmm0, xmm4);
+		pslld(xmm0, 8);
+		psrad(xmm0, 31);
+
+		#endif
+
+		psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
+
+		// ga = c[1].blend8(ga, mask).mix16(c[1]);
+
+		blend8r(xmm6, xmm4);
+	}
+	else
+	{
+		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
+		{
+			mix16(xmm6, xmm4, xmm7);
+		}
+	}
+}
+
+void GSDrawScanlineCodeGenerator::WriteFrame()
+{
+	if(!m_sel.fwrite)
+	{
+		return;
+	}
+	// dithering (fpsm == 2 only): eax = (top & 3) * 32 selects a pair of vectors in gd->dimx, added to rb (xmm5) and ga (xmm6)
+	if(m_sel.fpsm == 2 && m_sel.dthe)
+	{
+		mov(eax, ptr[esp + _top]);
+		and(eax, 3);
+		shl(eax, 5);
+		mov(ebp, ptr[&m_local.gd->dimx]);
+		paddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
+		paddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
+	}
+
+	if(m_sel.colclamp == 0)
+	{
+		// c[0] &= 0x000000ff;
+		// c[1] &= 0x000000ff;
+
+		pcmpeqd(xmm7, xmm7);
+		psrlw(xmm7, 8);
+		pand(xmm5, xmm7);
+		pand(xmm6, xmm7);
+	}
+
+	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
+	// xmm5 = fs from here on; packuswb saturates each 16-bit channel to a byte
+	movdqa(xmm7, xmm5);
+	punpcklwd(xmm5, xmm6);
+	punpckhwd(xmm7, xmm6);
+	packuswb(xmm5, xmm7);
+
+	if(m_sel.fba && m_sel.fpsm != 1)
+	{
+		// fs |= 0x80000000;
+
+		pcmpeqd(xmm7, xmm7);
+		pslld(xmm7, 31);
+		por(xmm5, xmm7);
+	}
+
+	if(m_sel.fpsm == 2)
+	{
+		// GSVector4i rb = fs & 0x00f800f8;
+		// GSVector4i ga = fs & 0x8000f800;
+
+		mov(eax, 0x00f800f8);
+		movd(xmm6, eax);
+		pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
+
+		mov(eax, 0x8000f800);
+		movd(xmm7, eax);
+		pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
+
+		movdqa(xmm4, xmm5);
+		pand(xmm4, xmm6);
+		pand(xmm5, xmm7);
+
+		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		movdqa(xmm6, xmm4);
+		movdqa(xmm7, xmm5);
+
+		psrld(xmm4, 3);
+		psrld(xmm6, 9);
+		psrld(xmm5, 6);
+		psrld(xmm7, 16);
+
+		por(xmm5, xmm4);
+		por(xmm7, xmm6);
+		por(xmm5, xmm7);
+	}
+
+	if(m_sel.rfb)
+	{
+		// fs = fs.blend(fd, fm);
+
+		blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
+	}
+	// fast = whole-vector stores are allowed; otherwise WritePixel stores pixel by pixel
+	bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
+
+	WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
+}
+
+void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
+{
+	movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); // low 8 bytes of dst <- 8 bytes at vm + addr * 2
+	movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); // high 8 bytes of dst <- 8 bytes located 16 bytes further on
+}
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
+{
+	if(m_sel.notest) // no per-pixel mask to honour: store unconditionally
+	{
+		if(fast)
+		{
+			movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
+			movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
+		}
+		else
+		{
+			WritePixel(src, addr, 0, psm);
+			WritePixel(src, addr, 1, psm);
+			WritePixel(src, addr, 2, psm);
+			WritePixel(src, addr, 3, psm);
+		}
+	}
+	else
+	{
+		if(fast)
+		{
+			// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
+			// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
+			// "@f" jumps forward to the next anonymous "@@" label, skipping the store when no tested mask bit is set
+			test(mask, 0x0f);
+			je("@f");
+			movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
+			L("@@");
+
+			test(mask, 0xf0);
+			je("@f");
+			movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
+			L("@@");
+		}
+		else
+		{
+			// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
+			// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
+			// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
+			// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
+			// two mask bits per pixel; each pixel is stored on its own through the 4-argument overload
+			test(mask, 0x03);
+			je("@f");
+			WritePixel(src, addr, 0, psm);
+			L("@@");
+
+			test(mask, 0x0c);
+			je("@f");
+			WritePixel(src, addr, 1, psm);
+			L("@@");
+
+			test(mask, 0x30);
+			je("@f");
+			WritePixel(src, addr, 2, psm);
+			L("@@");
+
+			test(mask, 0xc0);
+			je("@f");
+			WritePixel(src, addr, 3, psm);
+			L("@@");
+		}
+	}
+}
+
+static const int s_offsets[4] = {0, 2, 8, 10};
+
+void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
+{
+	Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];
+	// dst = vm + addr * 2 + s_offsets[i] * 2; i selects the 32-bit lane of src, psm how much of it is stored
+	switch(psm)
+	{
+	case 0: // store the whole 32-bit lane
+		if(i == 0) movd(dst, src);
+		#if _M_SSE >= 0x401
+		else pextrd(dst, src, i);
+		#else
+		else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);}
+		#endif
+		break;
+	case 1: // replace only the low 24 bits of the destination dword
+		if(i == 0) movd(eax, src);
+		#if _M_SSE >= 0x401
+		else pextrd(eax, src, i);
+		#else
+		else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);}
+		#endif
+		xor(eax, dst); // eax = src ^ dst
+		and(eax, 0xffffff); // keep the difference in the low 24 bits only
+		xor(dst, eax); // dst ^= diff: low 24 bits become src, top byte is untouched
+		break;
+	case 2: // store the low 16 bits of the lane
+		if(i == 0) movd(eax, src);
+		else pextrw(eax, src, i * 2);
+		mov(dst, ax);
+		break;
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
+{
+	// in
+	// xmm5 = addr00
+	// xmm2 = addr01
+	// xmm0 = addr10
+	// xmm3 = addr11
+	// ebx = m_local.tex[0] (!m_sel.mmin)
+	// ebp = m_local.tex (m_sel.mmin)
+	// edx = m_local.clut (m_sel.tlu)
+
+	// out
+	// xmm6 = c00
+	// xmm4 = c01
+	// xmm1 = c10
+	// xmm5 = c11
+
+	ASSERT(pixels == 1 || pixels == 4);
+	// mip_offset arrives as a count of pointers; turn it into the byte displacement used in the loads below
+	mip_offset *= sizeof(void*);
+
+	const GSVector4i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i;
+	// mmin && !lcm: every lane j loads its own pointer into ebx, indexed by lod_i->u32[j], before its texel is read
+	if(m_sel.mmin && !m_sel.lcm)
+	{
+		#if _M_SSE >= 0x401
+
+		const int r[] = {5, 6, 2, 4, 0, 1, 3, 7};
+
+		if(pixels == 4)
+		{
+			movdqa(ptr[&m_local.temp.test], xmm7);
+		}
+
+		for(int j = 0; j < 4; j++)
+		{
+			mov(ebx, ptr[&lod_i->u32[j]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			for(int i = 0; i < pixels; i++)
+			{
+				ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
+			}
+		}
+
+		if(pixels == 4)
+		{
+			movdqa(xmm5, xmm7);
+			movdqa(xmm7, ptr[&m_local.temp.test]);
+		}
+
+		#else
+
+		if(pixels == 4)
+		{
+			movdqa(ptr[&m_local.temp.test], xmm7);
+
+			mov(ebx, ptr[&lod_i->u32[0]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm6, xmm5, 0);
+			psrldq(xmm5, 4);
+			ReadTexel(xmm4, xmm2, 0);
+			psrldq(xmm2, 4);
+
+			mov(ebx, ptr[&lod_i->u32[1]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm1, xmm5, 0);
+			psrldq(xmm5, 4);
+			ReadTexel(xmm7, xmm2, 0);
+			psrldq(xmm2, 4);
+
+			punpckldq(xmm6, xmm1);
+			punpckldq(xmm4, xmm7);
+
+			mov(ebx, ptr[&lod_i->u32[2]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm1, xmm5, 0);
+			psrldq(xmm5, 4);
+			ReadTexel(xmm7, xmm2, 0);
+			psrldq(xmm2, 4);
+
+			mov(ebx, ptr[&lod_i->u32[3]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm5, xmm5, 0);
+			ReadTexel(xmm2, xmm2, 0);
+
+			punpckldq(xmm1, xmm5);
+			punpckldq(xmm7, xmm2);
+
+			punpcklqdq(xmm6, xmm1);
+			punpcklqdq(xmm4, xmm7);
+
+			mov(ebx, ptr[&lod_i->u32[0]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm1, xmm0, 0);
+			psrldq(xmm0, 4);
+			ReadTexel(xmm5, xmm3, 0);
+			psrldq(xmm3, 4);
+
+			mov(ebx, ptr[&lod_i->u32[1]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm2, xmm0, 0);
+			psrldq(xmm0, 4);
+			ReadTexel(xmm7, xmm3, 0);
+			psrldq(xmm3, 4);
+
+			punpckldq(xmm1, xmm2);
+			punpckldq(xmm5, xmm7);
+
+			mov(ebx, ptr[&lod_i->u32[2]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm2, xmm0, 0);
+			psrldq(xmm0, 4);
+			ReadTexel(xmm7, xmm3, 0);
+			psrldq(xmm3, 4);
+
+			mov(ebx, ptr[&lod_i->u32[3]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm0, xmm0, 0);
+			ReadTexel(xmm3, xmm3, 0);
+
+			punpckldq(xmm2, xmm0);
+			punpckldq(xmm7, xmm3);
+
+			punpcklqdq(xmm1, xmm2);
+			punpcklqdq(xmm5, xmm7);
+
+			movdqa(xmm7, ptr[&m_local.temp.test]);
+		}
+		else
+		{
+			mov(ebx, ptr[&lod_i->u32[0]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm6, xmm5, 0);
+			psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation)
+
+			mov(ebx, ptr[&lod_i->u32[1]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm1, xmm5, 0);
+			psrldq(xmm5, 4);
+
+			punpckldq(xmm6, xmm1);
+
+			mov(ebx, ptr[&lod_i->u32[2]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm1, xmm5, 0);
+			psrldq(xmm5, 4);
+
+			mov(ebx, ptr[&lod_i->u32[3]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+
+			ReadTexel(xmm4, xmm5, 0);
+			// psrldq(xmm5, 4);
+
+			punpckldq(xmm1, xmm4);
+
+			punpcklqdq(xmm6, xmm1);
+		}
+
+		#endif
+	}
+	else
+	{
+		if(m_sel.mmin && m_sel.lcm)
+		{
+			mov(ebx, ptr[&lod_i->u32[0]]);
+			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
+		}
+		// r[] holds (address register, destination register) pairs, one pair per pixel
+		const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
+
+		#if _M_SSE >= 0x401
+
+		for(int i = 0; i < pixels; i++)
+		{
+			for(int j = 0; j < 4; j++)
+			{
+				ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
+			}
+		}
+		
+		#else
+		// t[] holds the two scratch registers used per pixel while the four lanes are gathered and interleaved
+		const int t[] = {1, 4, 1, 5, 2, 5, 2, 0};
+
+		for(int i = 0; i < pixels; i++)
+		{
+			const Xmm& addr = Xmm(r[i * 2 + 0]);
+			const Xmm& dst = Xmm(r[i * 2 + 1]);
+			const Xmm& temp1 = Xmm(t[i * 2 + 0]);
+			const Xmm& temp2 = Xmm(t[i * 2 + 1]);
+
+			ReadTexel(dst, addr, 0);
+			psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation)
+			ReadTexel(temp1, addr, 0);
+			psrldq(addr, 4);
+			punpckldq(dst, temp1);
+
+			ReadTexel(temp1, addr, 0);
+			psrldq(addr, 4);
+			ReadTexel(temp2, addr, 0);
+			// psrldq(addr, 4);
+			punpckldq(temp1, temp2);
+
+			punpcklqdq(dst, temp1);
+		}
+
+		#endif
+	}
+}
+
+void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
+{
+	const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
+	// Loads one 32-bit texel into lane i of dst, using lane i of addr as the index (eax is clobbered).
+	#if _M_SSE < 0x401
+	// pextrd / pinsrd below are SSE4.1 instructions, so older targets may only ask for lane 0
+	ASSERT(i == 0);
+
+	#endif
+
+	if(i == 0) movd(eax, addr);
+	else pextrd(eax, addr, i);
+	// with tlu the byte fetched from [ebx + eax] becomes the index into the table at edx
+	if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
+
+	if(i == 0) movd(dst, src);
+	else pinsrd(dst, src, i);
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSDrawingContext.cpp b/plugins/GSdx_legacy/GSDrawingContext.cpp
new file mode 100644
index 0000000000..d9cd94d396
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawingContext.cpp
@@ -0,0 +1,138 @@
+/*
+*	Copyright (C) 2007-2009 Gabest
+*	http://www.gabest.org
+*
+*  This Program is free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2, or (at your option)
+*  any later version.
+*
+*  This Program is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+*  GNU General Public License for more details.
+*
+*  You should have received a copy of the GNU General Public License
+*  along with GNU Make; see the file COPYING.  If not, write to
+*  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+*  http://www.gnu.org/copyleft/gpl.html
+*
+*/
+
+#include "stdafx.h"
+#include "GSDrawingContext.h"
+#include "GSdx.h"
+
+static int findmax(int tl, int br, int limit, int wm, int minuv, int maxuv)
+{
+	// Largest texel coordinate the range [tl, br] can address once the
+	// wrap mode wm has been applied; unknown modes return br untouched.
+
+	switch(wm)
+	{
+	case CLAMP_CLAMP:
+		// coordinates beyond limit are pinned to it
+		return br > limit ? limit : br;
+
+	case CLAMP_REPEAT:
+		// a negative start wraps around, so anything up to limit may be hit
+		return (tl < 0 || br > limit) ? limit : br;
+
+	case CLAMP_REGION_CLAMP:
+		// clamp into [minuv, maxuv]; the upper bound is applied last
+		if(br < minuv) br = minuv;
+		return br > maxuv ? maxuv : br;
+
+	case CLAMP_REGION_REPEAT:
+		// (any & mask) | fix with mask = minuv, fix = maxuv: on wrap-around take the full mask, else min(br, mask), because (any & mask) cannot exceed mask and a value between tl and br may be larger than br & mask
+		return (tl < 0 ? minuv : std::min(br, minuv)) | maxuv;
+
+	default:
+		return br;
+	}
+}
+
+static int reduce(int uv, int size)
+{
+	// Lower size (a log2) while 2^(size - 1) still covers texel uv; never goes below 3.
+	for(; size > 3; size--)
+	{
+		if((1 << (size - 1)) < uv + 1) break;
+	}
+	return size;
+}
+
+static int extend(int uv, int size)
+{
+	// Raise size (a log2) until 2^size covers texel uv; never goes above 10.
+	for(; size < 10; size++)
+	{
+		if((1 << size) >= uv + 1) break;
+	}
+	return size;
+}
+
+GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap)
+{
+	// Returns TEX0 with TW/TH re-derived from the coordinate range st (fed
+	// to findmax() as x/z and y/w) and the CLAMP settings. Mipmapped
+	// textures are returned unchanged.
+
+	if(mipmap)
+	{
+		return TEX0;
+	}
+
+	int tw = TEX0.TW;
+	int th = TEX0.TH;
+
+	const int wms = (int)CLAMP.WMS;
+	const int wmt = (int)CLAMP.WMT;
+	const int minu = (int)CLAMP.MINU;
+	const int minv = (int)CLAMP.MINV;
+	const int maxu = (int)CLAMP.MAXU;
+	const int maxv = (int)CLAMP.MAXV;
+
+	// linear filtering: offset the range by -0.5 / +0.5 (assuming xxyy() expands to x, x, y, y)
+	GSVector4 uvf = st;
+	if(linear) uvf += GSVector4(-0.5f, 0.5f).xxyy();
+
+	GSVector4i uv = GSVector4i(uvf.floor());
+
+	// uv.x / uv.y become the largest u / v that can be addressed
+	uv.x = findmax(uv.x, uv.z, (1 << tw) - 1, wms, minu, maxu);
+	uv.y = findmax(uv.y, uv.w, (1 << th) - 1, wmt, minv, maxv);
+
+	// smaller sizes aren't worth it, they just create multiple entries in
+	// the texture cache and the saved memory is less
+	if(tw + th >= 19)
+	{
+		tw = reduce(uv.x, tw);
+		th = reduce(uv.y, th);
+	}
+
+	// extend only for the region modes, where uv may be outside
+	const bool region_s = wms == CLAMP_REGION_CLAMP || wms == CLAMP_REGION_REPEAT;
+	const bool region_t = wmt == CLAMP_REGION_CLAMP || wmt == CLAMP_REGION_REPEAT;
+
+	if(region_s) tw = extend(uv.x, tw);
+	if(region_t) th = extend(uv.y, th);
+
+#ifdef _DEBUG
+	if(TEX0.TW != tw || TEX0.TH != th)
+	{
+		printf("FixedTEX0 %05x %d %d tw %d=>%d th %d=>%d st (%.0f,%.0f,%.0f,%.0f) uvmax %d,%d wm %d,%d (%d,%d,%d,%d)\n",
+			(int)TEX0.TBP0, (int)TEX0.TBW, (int)TEX0.PSM,
+			(int)TEX0.TW, tw, (int)TEX0.TH, th,
+			uvf.x, uvf.y, uvf.z, uvf.w,
+			uv.x, uv.y,
+			wms, wmt, minu, maxu, minv, maxv);
+	}
+#endif
+
+	GIFRegTEX0 res = TEX0;
+	res.TW = tw;
+	res.TH = th;
+
+	return res;
+}
diff --git a/plugins/GSdx_legacy/GSDrawingContext.h b/plugins/GSdx_legacy/GSDrawingContext.h
new file mode 100644
index 0000000000..9ac9f92c09
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawingContext.h
@@ -0,0 +1,233 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSLocalMemory.h"
+
+__aligned(class, 32) GSDrawingContext
+{
+public:
+	GIFRegXYOFFSET	XYOFFSET;
+	GIFRegTEX0		TEX0;
+	GIFRegTEX1		TEX1;
+	GIFRegTEX2		TEX2;
+	GIFRegCLAMP		CLAMP;
+	GIFRegMIPTBP1	MIPTBP1;
+	GIFRegMIPTBP2	MIPTBP2;
+	GIFRegSCISSOR	SCISSOR;
+	GIFRegALPHA		ALPHA;
+	GIFRegTEST		TEST;
+	GIFRegFBA		FBA;
+	GIFRegFRAME		FRAME;
+	GIFRegZBUF		ZBUF;
+	// values derived from SCISSOR and XYOFFSET by UpdateScissor()
+	struct
+	{
+		GSVector4 in;
+		GSVector4i ex;
+		GSVector4 ofex;
+		GSVector4i ofxy;
+	} scissor;
+	// offset pointers; this class only zero-initialises them in its constructor
+	struct
+	{
+		GSOffset* fb;
+		GSOffset* zb;
+		GSOffset* tex;
+		GSPixelOffset* fzb;
+		GSPixelOffset4* fzb4;
+	} offset;
+
+	GSDrawingContext()
+	{
+		memset(&offset, 0, sizeof(offset));
+
+		Reset();
+	}
+	// Zeroes every register copy of this context; scissor and offset are left as they are.
+	void Reset()
+	{
+		memset(&XYOFFSET, 0, sizeof(XYOFFSET));
+		memset(&TEX0, 0, sizeof(TEX0));
+		memset(&TEX1, 0, sizeof(TEX1));
+		memset(&TEX2, 0, sizeof(TEX2));
+		memset(&CLAMP, 0, sizeof(CLAMP));
+		memset(&MIPTBP1, 0, sizeof(MIPTBP1));
+		memset(&MIPTBP2, 0, sizeof(MIPTBP2));
+		memset(&SCISSOR, 0, sizeof(SCISSOR));
+		memset(&ALPHA, 0, sizeof(ALPHA));
+		memset(&TEST, 0, sizeof(TEST));
+		memset(&FBA, 0, sizeof(FBA));
+		memset(&FRAME, 0, sizeof(FRAME));
+		memset(&ZBUF, 0, sizeof(ZBUF));
+	}
+	// Recomputes the four members of scissor from the current SCISSOR and XYOFFSET values.
+	void UpdateScissor()
+	{
+		ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
+		// ex: scissor corners shifted left by 4 plus the offset, rebased by 0x8000 and truncated to 16 bits
+		scissor.ex.u16[0] = (uint16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
+		scissor.ex.u16[1] = (uint16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
+		scissor.ex.u16[2] = (uint16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
+		scissor.ex.u16[3] = (uint16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
+		// ofex: the same corners without the 0x8000 rebase, stored as floats
+		scissor.ofex = GSVector4(
+			(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
+			(int)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY),
+			(int)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX),
+			(int)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY));
+		// in: the raw scissor rectangle with an exclusive right/bottom edge
+		scissor.in = GSVector4(
+			(int)SCISSOR.SCAX0,
+			(int)SCISSOR.SCAY0,
+			(int)SCISSOR.SCAX1 + 1,
+			(int)SCISSOR.SCAY1 + 1);
+
+		scissor.ofxy = GSVector4i(
+			0x8000,
+			0x8000,
+			(int)XYOFFSET.OFX - 15,
+			(int)XYOFFSET.OFY - 15);
+	}
+	// True when the depth test is enabled (ZTE) with ZTST >= 2.
+	bool DepthRead() const
+	{
+		return TEST.ZTE && TEST.ZTST >= 2;
+	}
+	// True when this context can modify the z buffer at all.
+	bool DepthWrite() const
+	{
+		if(TEST.ATE && TEST.ATST == ATST_NEVER && TEST.AFAIL != AFAIL_ZB_ONLY) // alpha test, all pixels fail, z buffer is not updated
+		{
+			return false;
+		}
+
+		return ZBUF.ZMSK == 0 && TEST.ZTE != 0; // ZTE == 0 is bug on the real hardware, write is blocked then
+	}
+
+	GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap);
+	// Appends a human readable listing of the registers above to filename; does nothing if the file cannot be opened.
+	void Dump(const std::string& filename)
+	{
+		// Append on purpose so env + context are merged into a single file
+		FILE* fp = fopen(filename.c_str(), "at");
+		if (!fp) return;
+
+		fprintf(fp, "XYOFFSET\n"
+				"\tX:%d\n"
+				"\tY:%d\n\n"
+				, XYOFFSET.OFX, XYOFFSET.OFY);
+
+		fprintf(fp, "TEX0\n"
+				"\tTBP0:0x%x\n"
+				"\tTBW:%d\n"
+				"\tPSM:0x%x\n"
+				"\tTW:%d\n"
+				"\tTCC:%d\n"
+				"\tTFX:%d\n"
+				"\tCBP:0x%x\n"
+				"\tCPSM:0x%x\n"
+				"\tCSM:%d\n"
+				"\tCSA:%d\n"
+				"\tCLD:%d\n"
+				"\tTH:%lld\n\n"
+				, TEX0.TBP0, TEX0.TBW, TEX0.PSM, TEX0.TW, TEX0.TCC, TEX0.TFX, TEX0.CBP, TEX0.CPSM, TEX0.CSM, TEX0.CSA, TEX0.CLD, TEX0.TH);
+
+		fprintf(fp, "TEX1\n"
+				"\tLCM:%d\n"
+				"\tMXL:%d\n"
+				"\tMMAG:%d\n"
+				"\tMMIN:%d\n"
+				"\tMTBA:%d\n"
+				"\tL:%d\n"
+				"\tK:%d\n\n"
+				, TEX1.LCM, TEX1.MXL, TEX1.MMAG, TEX1.MMIN, TEX1.MTBA, TEX1.L, TEX1.K);
+
+		fprintf(fp, "TEX2\n"
+				"\tPSM:0x%x\n"
+				"\tCBP:0x%x\n"
+				"\tCPSM:0x%x\n"
+				"\tCSM:%d\n"
+				"\tCSA:%d\n"
+				"\tCLD:%d\n\n"
+				, TEX2.PSM, TEX2.CBP, TEX2.CPSM, TEX2.CSM, TEX2.CSA, TEX2.CLD);
+
+		fprintf(fp, "CLAMP\n"
+				"\tWMS:%d\n"
+				"\tWMT:%d\n"
+				"\tMINU:%d\n"
+				"\tMAXU:%d\n"
+				"\tMAXV:%d\n"
+				"\tMINV:%lld\n\n"
+				, CLAMP.WMS, CLAMP.WMT, CLAMP.MINU, CLAMP.MAXU, CLAMP.MAXV, CLAMP.MINV);
+
+		// TODO mipmap registers (MIPTBP1/MIPTBP2) are not dumped yet
+		fprintf(fp, "SCISSOR\n"
+				"\tX0:%d\n"
+				"\tX1:%d\n"
+				"\tY0:%d\n"
+				"\tY1:%d\n\n"
+				, SCISSOR.SCAX0, SCISSOR.SCAX1, SCISSOR.SCAY0, SCISSOR.SCAY1);
+
+		fprintf(fp, "ALPHA\n"
+				"\tA:%d\n"
+				"\tB:%d\n"
+				"\tC:%d\n"
+				"\tD:%d\n"
+				"\tFIX:%d\n"
+				, ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D, ALPHA.FIX);
+		const char *col[4] = {"Cs", "Cd", "0", "?"}; // "?" covers a selector of 3, which has no name here; indices are masked so the lookup can never leave the table
+		const char *alpha[4] = {"As", "Ad", "Af", "?"};
+		fprintf(fp, "\t=> (%s - %s) * %s + %s\n\n", col[ALPHA.A & 3], col[ALPHA.B & 3], alpha[ALPHA.C & 3], col[ALPHA.D & 3]);
+
+		fprintf(fp, "TEST\n"
+				"\tATE:%d\n"
+				"\tATST:%d\n"
+				"\tAREF:%d\n"
+				"\tAFAIL:%d\n"
+				"\tDATE:%d\n"
+				"\tDATM:%d\n"
+				"\tZTE:%d\n"
+				"\tZTST:%d\n\n"
+				, TEST.ATE, TEST.ATST, TEST.AREF, TEST.AFAIL, TEST.DATE, TEST.DATM, TEST.ZTE, TEST.ZTST);
+
+		fprintf(fp, "FBA\n"
+				"\tFBA:%d\n\n"
+				, FBA.FBA);
+
+		fprintf(fp, "FRAME\n"
+				"\tFBP (*32):0x%x\n"
+				"\tFBW:%d\n"
+				"\tPSM:0x%x\n"
+				"\tFBMSK:0x%x\n\n"
+				, FRAME.FBP*32, FRAME.FBW, FRAME.PSM, FRAME.FBMSK);
+
+		fprintf(fp, "ZBUF\n"
+				"\tZBP (*32):0x%x\n"
+				"\tPSM:0x%x\n"
+				"\tZMSK:%d\n\n"
+				, ZBUF.ZBP*32, ZBUF.PSM, ZBUF.ZMSK);
+
+		fclose(fp);
+	}
+};
diff --git a/plugins/GSdx_legacy/GSDrawingEnvironment.h b/plugins/GSdx_legacy/GSDrawingEnvironment.h
new file mode 100644
index 0000000000..07c6d4eab2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDrawingEnvironment.h
@@ -0,0 +1,205 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+
+__aligned(class, 32) GSDrawingEnvironment
+{
+public:
+	GIFRegPRIM			PRIM;
+	GIFRegPRMODE		PRMODE;
+	GIFRegPRMODECONT	PRMODECONT;
+	GIFRegTEXCLUT		TEXCLUT;
+	GIFRegSCANMSK		SCANMSK;
+	GIFRegTEXA			TEXA;
+	GIFRegFOGCOL		FOGCOL;
+	GIFRegDIMX			DIMX;
+	GIFRegDTHE			DTHE;
+	GIFRegCOLCLAMP		COLCLAMP;
+	GIFRegPABE			PABE;
+	GIFRegBITBLTBUF		BITBLTBUF;
+	GIFRegTRXDIR		TRXDIR;
+	GIFRegTRXPOS		TRXPOS;
+	GIFRegTRXREG		TRXREG;
+	GSDrawingContext	CTXT[2];
+
+	GSDrawingEnvironment()
+	{
+		Reset(); // start from all-zero registers and dimx, like GSDrawingContext does, instead of indeterminate memory
+	}
+	// Zeroes every register copy, both drawing contexts and the dimx table.
+	void Reset()
+	{
+		memset(&PRIM, 0, sizeof(PRIM));
+		memset(&PRMODE, 0, sizeof(PRMODE));
+		memset(&PRMODECONT, 0, sizeof(PRMODECONT));
+		memset(&TEXCLUT, 0, sizeof(TEXCLUT));
+		memset(&SCANMSK, 0, sizeof(SCANMSK));
+		memset(&TEXA, 0, sizeof(TEXA));
+		memset(&FOGCOL, 0, sizeof(FOGCOL));
+		memset(&DIMX, 0, sizeof(DIMX));
+		memset(&DTHE, 0, sizeof(DTHE));
+		memset(&COLCLAMP, 0, sizeof(COLCLAMP));
+		memset(&PABE, 0, sizeof(PABE));
+		memset(&BITBLTBUF, 0, sizeof(BITBLTBUF));
+		memset(&TRXDIR, 0, sizeof(TRXDIR));
+		memset(&TRXPOS, 0, sizeof(TRXPOS));
+		memset(&TRXREG, 0, sizeof(TRXREG));
+
+		CTXT[0].Reset();
+		CTXT[1].Reset();
+
+		memset(dimx, 0, sizeof(dimx));
+	}
+	// vector form of DIMX, rebuilt by UpdateDIMX(): two entries per matrix row
+	GSVector4i dimx[8];
+	// Rebuilds dimx from DIMX: the odd entry of each row holds DMr0..DMr3 interleaved with zeros, the even entry is its xxzzlh() shuffle.
+	void UpdateDIMX()
+	{
+		dimx[1] = GSVector4i(DIMX.DM00, 0, DIMX.DM01, 0, DIMX.DM02, 0, DIMX.DM03, 0);
+		dimx[0] = dimx[1].xxzzlh();
+		dimx[3] = GSVector4i(DIMX.DM10, 0, DIMX.DM11, 0, DIMX.DM12, 0, DIMX.DM13, 0);
+		dimx[2] = dimx[3].xxzzlh();
+		dimx[5] = GSVector4i(DIMX.DM20, 0, DIMX.DM21, 0, DIMX.DM22, 0, DIMX.DM23, 0);
+		dimx[4] = dimx[5].xxzzlh();
+		dimx[7] = GSVector4i(DIMX.DM30, 0, DIMX.DM31, 0, DIMX.DM32, 0, DIMX.DM33, 0);
+		dimx[6] = dimx[7].xxzzlh();
+	}
+	// Writes a human readable listing of the registers above to filename, replacing any existing file; does nothing if it cannot be opened.
+	void Dump(const std::string& filename)
+	{
+		FILE* fp = fopen(filename.c_str(), "wt");
+		if (!fp) return;
+
+		fprintf(fp, "PRIM\n"
+				"\tPRIM:%d\n"
+				"\tIIP:%d\n"
+				"\tTME:%d\n"
+				"\tFGE:%d\n"
+				"\tABE:%d\n"
+				"\tAA1:%d\n"
+				"\tFST:%d\n"
+				"\tCTXT:%d\n"
+				"\tFIX:%d\n\n"
+				, PRIM.PRIM, PRIM.IIP, PRIM.TME, PRIM.FGE, PRIM.ABE, PRIM.AA1, PRIM.FST, PRIM.CTXT, PRIM.FIX);
+
+		fprintf(fp, "PRMODE (when AC=0)\n"
+				"\t_PRIM:%d\n"
+				"\tIIP:%d\n"
+				"\tTME:%d\n"
+				"\tFGE:%d\n"
+				"\tABE:%d\n"
+				"\tAA1:%d\n"
+				"\tFST:%d\n"
+				"\tCTXT:%d\n"
+				"\tFIX:%d\n\n"
+				, PRMODE._PRIM, PRMODE.IIP, PRMODE.TME, PRMODE.FGE, PRMODE.ABE, PRMODE.AA1, PRMODE.FST, PRMODE.CTXT, PRMODE.FIX);
+
+		fprintf(fp, "PRMODECONT\n"
+				"\tAC:%d\n\n"
+				, PRMODECONT.AC);
+
+		fprintf(fp, "TEXCLUT\n"
+				"\tCOU:%d\n"
+				"\tCBW:%d\n"
+				"\tCOV:%d\n\n"
+				, TEXCLUT.COU, TEXCLUT.CBW, TEXCLUT.COV);
+
+		fprintf(fp, "SCANMSK\n"
+				"\tMSK:%d\n\n"
+				"\n"
+				, SCANMSK.MSK);
+
+		fprintf(fp, "TEXA\n"
+				"\tAEM:%d\n"
+				"\tTA0:%d\n"
+				"\tTA1:%d\n\n"
+				, TEXA.AEM, TEXA.TA0, TEXA.TA1);
+
+		fprintf(fp, "FOGCOL\n"
+				"\tFCG:%d\n"
+				"\tFCB:%d\n"
+				"\tFCR:%d\n\n"
+				, FOGCOL.FCG, FOGCOL.FCB, FOGCOL.FCR);
+
+		fprintf(fp, "DIMX\n"
+				"\tDM22:%d\n"
+				"\tDM23:%d\n"
+				"\tDM31:%d\n"
+				"\tDM02:%d\n"
+				"\tDM21:%d\n"
+				"\tDM12:%d\n"
+				"\tDM03:%d\n"
+				"\tDM01:%d\n"
+				"\tDM33:%d\n"
+				"\tDM30:%d\n"
+				"\tDM11:%d\n"
+				"\tDM10:%d\n"
+				"\tDM20:%d\n"
+				"\tDM32:%d\n"
+				"\tDM00:%d\n"
+				"\tDM13:%d\n\n"
+				, DIMX.DM22, DIMX.DM23, DIMX.DM31, DIMX.DM02, DIMX.DM21, DIMX.DM12, DIMX.DM03, DIMX.DM01, DIMX.DM33, DIMX.DM30, DIMX.DM11, DIMX.DM10, DIMX.DM20, DIMX.DM32, DIMX.DM00, DIMX.DM13);
+
+		fprintf(fp, "DTHE\n"
+				"\tDTHE:%d\n\n"
+				, DTHE.DTHE);
+
+		fprintf(fp, "COLCLAMP\n"
+				"\tCLAMP:%d\n\n"
+				, COLCLAMP.CLAMP);
+
+		fprintf(fp, "PABE\n"
+				"\tPABE:%d\n\n"
+				, PABE.PABE);
+
+		fprintf(fp, "BITBLTBUF\n"
+				"\tSBW:%d\n"
+				"\tSBP:0x%x\n"
+				"\tSPSM:%d\n"
+				"\tDBW:%d\n"
+				"\tDPSM:%d\n"
+				"\tDBP:0x%x\n\n"
+				, BITBLTBUF.SBW, BITBLTBUF.SBP, BITBLTBUF.SPSM, BITBLTBUF.DBW, BITBLTBUF.DPSM, BITBLTBUF.DBP);
+
+		fprintf(fp, "TRXDIR\n"
+				"\tXDIR:%d\n\n"
+				, TRXDIR.XDIR);
+
+		fprintf(fp, "TRXPOS\n"
+				"\tDIRY:%d\n"
+				"\tSSAY:%d\n"
+				"\tSSAX:%d\n"
+				"\tDIRX:%d\n"
+				"\tDSAX:%d\n"
+				"\tDSAY:%d\n\n"
+				, TRXPOS.DIRY, TRXPOS.SSAY, TRXPOS.SSAX, TRXPOS.DIRX, TRXPOS.DSAX, TRXPOS.DSAY);
+
+		fprintf(fp, "TRXREG\n"
+				"\tRRH:%d\n"
+				"\tRRW:%d\n\n"
+				, TRXREG.RRH, TRXREG.RRW);
+
+		fclose(fp);
+	}
+};
diff --git a/plugins/GSdx_legacy/GSDump.cpp b/plugins/GSdx_legacy/GSDump.cpp
new file mode 100644
index 0000000000..973fe6acb8
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDump.cpp
@@ -0,0 +1,95 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDump.h"
+
+// Builds an idle dump object: no output file is attached yet and both
+// frame counters start at zero.
+GSDump::GSDump()
+	: m_gs(NULL), m_frames(0), m_extra_frames(0)
+{
+}
+
+// Releases the dump file if one is still open (same effect as Close()).
+GSDump::~GSDump()
+{
+	if(m_gs != NULL)
+	{
+		fclose(m_gs);
+		m_gs = NULL;
+	}
+}
+
+// Starts a new dump in the file "<fn>.gs" and writes the header:
+// the crc, the size of the frozen state, the frozen state itself and the
+// register block pointed to by regs.
+//
+// If a dump is already open it is closed first, so that calling Open()
+// twice does not leak the previous FILE handle. When the file cannot be
+// created m_gs stays NULL and the object remains idle.
+void GSDump::Open(const string& fn, uint32 crc, const GSFreezeData& fd, const GSPrivRegSet* regs)
+{
+	// Drop any dump that is still in progress before replacing m_gs.
+	Close();
+
+	m_gs = fopen((fn + ".gs").c_str(), "wb");
+
+	m_frames = 0;
+	m_extra_frames = 2;
+
+	if(m_gs)
+	{
+		fwrite(&crc, 4, 1, m_gs);
+		fwrite(&fd.size, 4, 1, m_gs);
+		fwrite(fd.data, fd.size, 1, m_gs);
+		fwrite(regs, sizeof(*regs), 1, m_gs);
+	}
+}
+
+// Closes the dump file, if any, and marks the object as idle again.
+// Calling it when nothing is open is a no-op.
+void GSDump::Close()
+{
+	if(m_gs == NULL)
+	{
+		return;
+	}
+
+	fclose(m_gs);
+	m_gs = NULL;
+}
+
+// Appends a transfer packet to the dump: id byte 0, the path index, a
+// 4-byte payload size and then the payload itself. Nothing is written when
+// no dump is open or the payload is empty.
+void GSDump::Transfer(int index, const uint8* mem, size_t size)
+{
+	if(m_gs && size > 0)
+	{
+		// The file format stores the size in 4 bytes. Copy it into an
+		// explicit 32-bit value instead of writing the first 4 bytes of a
+		// size_t, which only yields the right value on little-endian hosts.
+		const uint32 size32 = (uint32)size;
+
+		fputc(0, m_gs);
+		fputc(index, m_gs);
+		fwrite(&size32, 4, 1, m_gs);
+		fwrite(mem, size, 1, m_gs);
+	}
+}
+
+// Appends a ReadFIFO packet to the dump: id byte 2 followed by the 4-byte
+// size. Ignored when no dump is open or size is zero.
+void GSDump::ReadFIFO(uint32 size)
+{
+	if(!m_gs || size == 0)
+	{
+		return;
+	}
+
+	fputc(2, m_gs);
+	fwrite(&size, 4, 1, m_gs);
+}
+
+// Records a vsync in the dump: a register packet (id byte 3 followed by
+// the register block) and then a vsync packet (id byte 1 followed by the
+// field).
+//
+// Every call counts one frame. Once the caller passes last == true, the
+// dump is closed on the first even-numbered frame for which no extra
+// frames remain; until then each such call consumes one extra frame.
+void GSDump::VSync(int field, bool last, const GSPrivRegSet* regs)
+{
+	if(!m_gs)
+	{
+		return;
+	}
+
+	fputc(3, m_gs);
+	fwrite(regs, sizeof(*regs), 1, m_gs);
+
+	fputc(1, m_gs);
+	fputc(field, m_gs);
+
+	// The frame counter advances whether or not this is the last frame.
+	const bool even_frame = (++m_frames & 1) == 0;
+
+	if(!last)
+	{
+		return;
+	}
+
+	if(even_frame && m_extra_frames <= 0)
+	{
+		Close();
+	}
+	else
+	{
+		m_extra_frames--;
+	}
+}
diff --git a/plugins/GSdx_legacy/GSDump.h b/plugins/GSdx_legacy/GSDump.h
new file mode 100644
index 0000000000..d8e31262b0
--- /dev/null
+++ b/plugins/GSdx_legacy/GSDump.h
@@ -0,0 +1,62 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSVertexSW.h"
+
+/*
+
+Dump file format:
+- [crc/4] [state size/4] [state data/size] [PMODE/0x2000] [id/1] [data/?] .. [id/1] [data/?]
+
+Transfer data (id == 0)
+- [0/1] [path index/1] [size/4] [data/size]
+
+VSync data (id == 1)
+- [1/1] [field/1]
+
+ReadFIFO2 data (id == 2)
+- [2/1] [size/4]
+
+Regs data (id == 3)
+- [3/1] [PMODE/0x2000]
+
+*/
+
+// Writes a GS dump file: a header produced by Open() followed by a stream
+// of packets appended by Transfer(), ReadFIFO() and VSync().
+class GSDump
+{
+	FILE* m_gs;          // output file, NULL while no dump is in progress
+	int m_frames;        // number of VSync() calls since Open()
+	int m_extra_frames;  // frames still to record after the "last" request
+
+public:
+	GSDump();
+	virtual ~GSDump();
+
+	// Lifetime of a dump.
+	void Open(const string& fn, uint32 crc, const GSFreezeData& fd, const GSPrivRegSet* regs);
+	void Close();
+
+	// Packet writers; each one is a no-op while no dump is open.
+	void Transfer(int index, const uint8* mem, size_t size);
+	void ReadFIFO(uint32 size);
+	void VSync(int field, bool last, const GSPrivRegSet* regs);
+
+	// True while a dump file is open.
+	operator bool()
+	{
+		return m_gs != NULL;
+	}
+};
diff --git a/plugins/GSdx_legacy/GSFunctionMap.cpp b/plugins/GSdx_legacy/GSFunctionMap.cpp
new file mode 100644
index 0000000000..1b1974c0ea
--- /dev/null
+++ b/plugins/GSdx_legacy/GSFunctionMap.cpp
@@ -0,0 +1,23 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSFunctionMap.h"
diff --git a/plugins/GSdx_legacy/GSFunctionMap.h b/plugins/GSdx_legacy/GSFunctionMap.h
new file mode 100644
index 0000000000..3876679cc3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSFunctionMap.h
@@ -0,0 +1,241 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSCodeBuffer.h"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+
+// Maps a KEY to a VALUE (typically a function pointer) and keeps usage
+// statistics for every key that has been looked up.
+//
+// m_map holds explicitly registered values; keys that are not present
+// there are resolved through GetDefaultFunction(), which derived classes
+// must implement. Every key that has been requested through operator[]
+// gets an ActivePtr record in m_map_active, owned by this object.
+template<class KEY, class VALUE> class GSFunctionMap
+{
+protected:
+	// Statistics and resolved value for one key.
+	struct ActivePtr
+	{
+		uint64 frame, frames;         // last frame seen / number of distinct frames
+		uint64 ticks, actual, total;  // accumulated by UpdateStats()
+		VALUE f;                      // value returned by operator[]
+	};
+
+	hash_map<KEY, VALUE> m_map;
+	hash_map<KEY, ActivePtr*> m_map_active;
+
+	// Record selected by the most recent operator[] call.
+	ActivePtr* m_active;
+
+	virtual VALUE GetDefaultFunction(KEY key) = 0;
+
+public:
+	GSFunctionMap() : m_active(NULL) {}
+
+	virtual ~GSFunctionMap()
+	{
+		// The ActivePtr records were allocated with new in operator[].
+		for_each(m_map_active.begin(), m_map_active.end(), delete_second());
+	}
+
+	// Returns the value for key and makes its record the active one, so
+	// that the next UpdateStats() call is accounted to it. A record is
+	// created on first use.
+	VALUE operator [] (KEY key)
+	{
+		typedef typename hash_map<KEY, ActivePtr*>::iterator ActiveIter;
+		typedef typename hash_map<KEY, VALUE>::iterator ValueIter;
+
+		m_active = NULL;
+
+		ActiveIter known = m_map_active.find(key);
+
+		if(known != m_map_active.end())
+		{
+			m_active = known->second;
+
+			return m_active->f;
+		}
+
+		ValueIter registered = m_map.find(key);
+
+		ActivePtr* record = new ActivePtr();
+
+		memset(record, 0, sizeof(*record));
+
+		// No frame has been seen yet for this key.
+		record->frame = (uint64)-1;
+
+		if(registered != m_map.end())
+		{
+			record->f = registered->second;
+		}
+		else
+		{
+			record->f = GetDefaultFunction(key);
+		}
+
+		m_map_active[key] = record;
+
+		m_active = record;
+
+		return m_active->f;
+	}
+
+	// Adds one measurement to the active record; does nothing when no
+	// record is active.
+	void UpdateStats(uint64 frame, uint64 ticks, int actual, int total)
+	{
+		if(!m_active)
+		{
+			return;
+		}
+
+		// Count each frame number only once.
+		if(m_active->frame != frame)
+		{
+			m_active->frame = frame;
+			m_active->frames++;
+		}
+
+		m_active->ticks += ticks;
+		m_active->actual += actual;
+		m_active->total += total;
+
+		ASSERT(m_active->total >= m_active->actual);
+	}
+
+	// Prints one line of statistics to stdout for every key that was
+	// used in at least one frame.
+	virtual void PrintStats()
+	{
+		typedef typename hash_map<KEY, ActivePtr*>::iterator ActiveIter;
+
+		// Sum of the per-key "ticks per frame" figures.
+		uint64 ttpf = 0;
+
+		for(ActiveIter it = m_map_active.begin(); it != m_map_active.end(); ++it)
+		{
+			ActivePtr* entry = it->second;
+
+			if(entry->frames)
+			{
+				ttpf += entry->ticks / entry->frames;
+			}
+		}
+
+		printf("GS stats\n");
+
+		for(ActiveIter it = m_map_active.begin(); it != m_map_active.end(); ++it)
+		{
+			KEY key = it->first;
+			ActivePtr* p = it->second;
+
+			if(!p->frames || !ttpf)
+			{
+				continue;
+			}
+
+			uint64 tpp = p->actual > 0 ? p->ticks / p->actual : 0;
+			uint64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
+			uint64 ppf = p->frames > 0 ? p->actual / p->frames : 0;
+
+			// '*' marks keys that were resolved through GetDefaultFunction().
+			const char marker = m_map.find(key) == m_map.end() ? '*' : ' ';
+
+			printf("[%014llx]%c %6.2f%% %5.2f%% f %4lld t %12lld p %12lld w %12lld tpp %4lld tpf %9lld ppf %9lld\n",
+				(uint64)key, marker,
+				(float)(tpf * 10000 / 34000000) / 100,
+				(float)(tpf * 10000 / ttpf) / 100,
+				p->frames, p->ticks, p->actual, p->total - p->actual,
+				tpp, tpf, ppf);
+		}
+	}
+};
+
+// Xbyak code generator that emits into a caller-provided buffer and
+// carries an Xbyak::util::Cpu member for use by derived generators.
+class GSCodeGenerator : public Xbyak::CodeGenerator
+{
+protected:
+	Xbyak::util::Cpu m_cpu;
+
+public:
+	// code:    buffer that receives the generated code
+	// maxsize: size passed on to Xbyak::CodeGenerator
+	GSCodeGenerator(void* code, size_t maxsize) : Xbyak::CodeGenerator(maxsize, code) {}
+};
+
+// Function map whose default functions are produced on demand by a code
+// generator of type CG. Generated code is placed in m_cb and the resulting
+// entry points are cached in m_cgmap, keyed by the key converted to uint64.
+template<class CG, class KEY, class VALUE>
+class GSCodeGeneratorFunctionMap : public GSFunctionMap<KEY, VALUE>
+{
+	string m_name;
+	void* m_param;
+	hash_map<uint64, VALUE> m_cgmap;
+	GSCodeBuffer m_cb;
+
+	// Buffer space requested from m_cb for each generated function.
+	enum {MAX_SIZE = 8192};
+
+public:
+	// name:  label used when registering generated code with VTune
+	// param: opaque pointer handed to every CG instance
+	GSCodeGeneratorFunctionMap(const char* name, void* param)
+		: m_name(name)
+		, m_param(param)
+	{
+	}
+
+	// Returns the generated function for key, generating and caching it
+	// on first request.
+	VALUE GetDefaultFunction(KEY key)
+	{
+		typename hash_map<uint64, VALUE>::iterator cached = m_cgmap.find(key);
+
+		if(cached != m_cgmap.end())
+		{
+			return cached->second;
+		}
+
+		// The generator emits its code into the buffer while it is constructed.
+		CG* cg = new CG(m_param, key, m_cb.GetBuffer(MAX_SIZE), MAX_SIZE);
+
+		ASSERT(cg->getSize() < MAX_SIZE);
+
+		// Tell the code buffer how much of the requested space was used.
+		m_cb.ReleaseBuffer(cg->getSize());
+
+		VALUE ret = (VALUE)cg->getCode();
+
+		m_cgmap[key] = ret;
+
+		#ifdef ENABLE_VTUNE
+
+		// vtune method registration
+
+		// if(iJIT_IsProfilingActive()) // always > 0
+		{
+			string name = format("%s<%016llx>()", m_name.c_str(), (uint64)key);
+
+			iJIT_Method_Load ml;
+
+			memset(&ml, 0, sizeof(ml));
+
+			ml.method_id = iJIT_GetNewMethodID();
+			ml.method_name = (char*)name.c_str();
+			ml.method_load_address = (void*)cg->getCode();
+			ml.method_size = (unsigned int)cg->getSize();
+
+			iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml);
+/*
+			name = format("c:/temp1/%s_%016llx.bin", m_name.c_str(), (uint64)key);
+
+			if(FILE* fp = fopen(name.c_str(), "wb"))
+			{
+				fputc(0x0F, fp); fputc(0x0B, fp);
+				fputc(0xBB, fp); fputc(0x6F, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp);
+				fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
+
+				fwrite(cg->getCode(), cg->getSize(), 1, fp);
+
+				fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp);
+				fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
+				fputc(0x0F, fp); fputc(0x0B, fp);
+
+				fclose(fp);
+			}
+*/
+		}
+
+		#endif
+
+		// Only the generator object is destroyed; ret points into the buffer obtained from m_cb.
+		delete cg;
+
+		return ret;
+	}
+};
diff --git a/plugins/GSdx_legacy/GSLinuxDialog.cpp b/plugins/GSdx_legacy/GSLinuxDialog.cpp
new file mode 100644
index 0000000000..369abe0c3c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLinuxDialog.cpp
@@ -0,0 +1,520 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include <gtk/gtk.h>
+#include "GS.h"
+#include "GSdx.h"
+#include "GSLinuxLogo.h"
+#include "GSSetting.h"
+
+// Attaches the help text registered for dialog item idc to widget w.
+void AddTooltip(GtkWidget* w, int idc)
+{
+	const char* help_text = dialog_message(idc);
+
+	gtk_widget_set_tooltip_text(w, help_text);
+}
+
+// Attaches the help text for dialog item idc to two widgets at once,
+// typically a label and the control it describes.
+void AddTooltip(GtkWidget* w1, GtkWidget* w2, int idc)
+{
+	GtkWidget* const targets[2] = { w1, w2 };
+
+	for(int k = 0; k < 2; k++)
+	{
+		AddTooltip(targets[k], idc);
+	}
+}
+
+// Creates a label showing lbl, aligned to the left edge of its cell.
+GtkWidget* left_label(const char* lbl)
+{
+	GtkWidget* label = gtk_label_new(lbl);
+
+#if GTK_MAJOR_VERSION < 3
+	gtk_misc_set_alignment(GTK_MISC(label),0.0,0.5);
+#else
+	gtk_widget_set_halign(label, GTK_ALIGN_START);
+#endif
+
+	return label;
+}
+
+// "changed" handler for combo boxes built by CreateComboBoxFromVector().
+//
+// user_data is the option name; the "Settings" object data is the vector
+// the combo box was filled from. The value of the selected entry is stored
+// under the option name. Nothing is stored when no entry is active or the
+// index does not refer to an entry of the vector.
+void CB_ChangedComboBox(GtkComboBox *combo, gpointer user_data)
+{
+	int p = gtk_combo_box_get_active(combo);
+	vector<GSSetting>* s = (vector<GSSetting>*)g_object_get_data(G_OBJECT(combo), "Settings");
+
+	// gtk_combo_box_get_active() reports -1 when nothing is selected. Check
+	// this (and a missing settings vector) explicitly instead of relying on
+	// an out_of_range exception from vector::at().
+	if(s == NULL || p < 0 || (size_t)p >= s->size())
+	{
+		return;
+	}
+
+	try {
+		theApp.SetConfig((char*)user_data, (*s)[p].value);
+	} catch (...) {
+		// Best effort: an exception must not unwind through GTK's C frames.
+	}
+}
+
+// Builds a text combo box with one row per entry of s and selects the row
+// whose value equals the configured value of opt_name (opt_default when the
+// option is not set). If several entries match, the last one is selected;
+// if none matches, the first row is selected.
+//
+// The combo box keeps a pointer to s and to opt_name, so both must outlive
+// the widget.
+GtkWidget* CreateComboBoxFromVector(const vector<GSSetting>& s, const char* opt_name, int32_t opt_default = 0)
+{
+	GtkWidget* combo_box = gtk_combo_box_text_new();
+	const int32_t configured = theApp.GetConfig(opt_name, opt_default);
+	int selected_row = 0;
+
+	for(size_t row = 0; row < s.size(); row++)
+	{
+		const GSSetting& item = s[row];
+
+		string label = item.name;
+
+		if(!item.note.empty())
+		{
+			label += format(" (%s)", item.note.c_str());
+		}
+
+		gtk_combo_box_text_append_text(GTK_COMBO_BOX_TEXT(combo_box), label.c_str());
+
+		if(item.value == configured)
+		{
+			selected_row = row;
+		}
+	}
+
+	// Select before connecting the handler so the initial selection is not
+	// reported as a change.
+	gtk_combo_box_set_active(GTK_COMBO_BOX(combo_box), selected_row);
+
+	g_signal_connect(combo_box, "changed", G_CALLBACK(CB_ChangedComboBox), const_cast<char*>(opt_name));
+	g_object_set_data(G_OBJECT(combo_box), "Settings", (void*)&s);
+
+	return combo_box;
+}
+
+// "preedit-changed" handler for text boxes built by CreateTextBox():
+// parses the pre-edit text as hexadecimal and stores it under the option
+// name passed in user_data. Text that does not parse is stored as 0.
+void CB_PreEntryActived(GtkEntry *entry, gchar* preedit, gpointer user_data)
+{
+	char* opt_name = (char*)user_data;
+	int parsed = 0;
+
+	sscanf(preedit,"%X",&parsed);
+
+	theApp.SetConfig(opt_name, parsed);
+}
+
+// "activate" handler for text boxes built by CreateTextBox(): parses the
+// entry text as hexadecimal and stores it under the option name passed in
+// user_data. Text that does not parse is stored as 0.
+void CB_EntryActived(GtkEntry *entry, gpointer user_data)
+{
+	char* opt_name = (char*)user_data;
+	const gchar* text = gtk_entry_get_text(entry);
+	int parsed = 0;
+
+	sscanf(text,"%X",&parsed);
+
+	theApp.SetConfig(opt_name, parsed);
+}
+
+// Creates a text entry that shows the configured value of opt_name in
+// hexadecimal (opt_default when the option is not set) and writes edits
+// back to the configuration. opt_name must outlive the widget.
+GtkWidget* CreateTextBox(const char* opt_name, int opt_default = 0) {
+	GtkWidget* entry = gtk_entry_new();
+
+	const int current = theApp.GetConfig(opt_name, opt_default);
+
+	// 40 characters is far more than the hexadecimal form of an int needs.
+	gchar text[40];
+	sprintf(text,"%X", current);
+	gtk_entry_set_text(GTK_ENTRY(entry),text);
+
+	gpointer name_arg = const_cast<char*>(opt_name);
+
+	g_signal_connect(entry, "activate", G_CALLBACK(CB_EntryActived), name_arg);
+	// Note: this second hook does not seem to work as expected.
+	g_signal_connect(entry, "preedit-changed", G_CALLBACK(CB_PreEntryActived), name_arg);
+
+	return entry;
+}
+
+// "toggled" handler for check boxes built by CreateCheckBox(): stores the
+// new state as an integer under the option name passed in user_data.
+void CB_ToggleCheckBox(GtkToggleButton *togglebutton, gpointer user_data)
+{
+	char* opt_name = (char*)user_data;
+	const int checked = (int)gtk_toggle_button_get_active(togglebutton);
+
+	theApp.SetConfig(opt_name, checked);
+}
+
+// Creates a labelled check box bound to option opt_name. Its initial state
+// is the configured value (opt_default when the option is not set).
+// opt_name must outlive the widget.
+GtkWidget* CreateCheckBox(const char* label, const char* opt_name, bool opt_default = false)
+{
+	GtkWidget* check = gtk_check_button_new_with_label(label);
+
+	const gboolean initially_on = theApp.GetConfig(opt_name, opt_default);
+	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(check), initially_on);
+
+	// Connected after the initial state is set, so that state is not written back.
+	g_signal_connect(check, "toggled", G_CALLBACK(CB_ToggleCheckBox), const_cast<char*>(opt_name));
+
+	return check;
+}
+
+// "value-changed" handler for spin buttons built by CreateSpinButton():
+// stores the value, truncated to int, under the option name in user_data.
+void CB_SpinButton(GtkSpinButton *spin, gpointer user_data)
+{
+	char* opt_name = (char*)user_data;
+	const int value = (int)gtk_spin_button_get_value(spin);
+
+	theApp.SetConfig(opt_name, value);
+}
+
+// Creates a spin button covering [min, max] in steps of 1, bound to option
+// opt_name and initialised from the configuration (opt_default when the
+// option is not set). opt_name must outlive the widget.
+GtkWidget* CreateSpinButton(double min, double max, const char* opt_name, int opt_default = 0)
+{
+	GtkWidget* spin = gtk_spin_button_new_with_range(min, max, 1);
+
+	const gdouble initial = theApp.GetConfig(opt_name, opt_default);
+	gtk_spin_button_set_value(GTK_SPIN_BUTTON(spin), initial);
+
+	// Connected after the initial value is set, so that value is not written back.
+	g_signal_connect(spin, "value-changed", G_CALLBACK(CB_SpinButton), const_cast<char*>(opt_name));
+
+	return spin;
+}
+
+// "value-changed" handler for scales built by CreateScale(): stores the
+// value, truncated to int, under the option name in user_data.
+void CB_RangeChanged(GtkRange* range, gpointer user_data)
+{
+	char* opt_name = (char*)user_data;
+	const int value = (int)gtk_range_get_value(range);
+
+	theApp.SetConfig(opt_name, value);
+}
+
+// Creates a horizontal slider covering 0..200 in steps of 10, with its
+// value displayed on the right, bound to option opt_name and initialised
+// from the configuration (opt_default when the option is not set).
+// opt_name must outlive the widget.
+GtkWidget* CreateScale(const char* opt_name, int opt_default = 0)
+{
+#if GTK_MAJOR_VERSION >= 3
+	GtkWidget* scale = gtk_scale_new_with_range(GTK_ORIENTATION_HORIZONTAL, 0, 200, 10);
+#else
+	GtkWidget* scale = gtk_hscale_new_with_range(0, 200, 10);
+#endif
+
+	gtk_scale_set_value_pos(GTK_SCALE(scale), GTK_POS_RIGHT);
+
+	const gdouble initial = theApp.GetConfig(opt_name, opt_default);
+	gtk_range_set_value(GTK_RANGE(scale), initial);
+
+	// Connected after the initial value is set, so that value is not written back.
+	g_signal_connect(scale, "value-changed", G_CALLBACK(CB_RangeChanged), const_cast<char*>(opt_name));
+
+	return scale;
+}
+
+// "file-set" handler for choosers built by CreateFileChooser(): stores the
+// selected path under the option name passed in user_data.
+//
+// gtk_file_chooser_get_filename() returns a newly allocated string that the
+// caller must release with g_free(), and NULL when nothing is selected or
+// the selection has no local path. A NULL result leaves the option
+// untouched.
+void CB_PickFile(GtkFileChooserButton *chooser, gpointer user_data)
+{
+	gchar* filename = gtk_file_chooser_get_filename(GTK_FILE_CHOOSER(chooser));
+
+	if(filename == NULL)
+	{
+		return;
+	}
+
+	// NOTE(review): assumes SetConfig copies the string rather than keeping
+	// the pointer - confirm against GSdxApp::SetConfig.
+	theApp.SetConfig((char*)user_data, filename);
+
+	g_free(filename);
+}
+
+// Creates a file-chooser button for the given action, titled label, bound
+// to option opt_name and pre-selecting the configured path (opt_default
+// when the option is not set). opt_name must outlive the widget.
+GtkWidget* CreateFileChooser(GtkFileChooserAction action, const char* label, const char* opt_name, const char* opt_default)
+{
+	GtkWidget* chooser = gtk_file_chooser_button_new(label, action);
+
+	const string current = theApp.GetConfig(opt_name, opt_default);
+	gtk_file_chooser_set_filename(GTK_FILE_CHOOSER(chooser), current.c_str());
+
+	g_signal_connect(chooser, "file-set", G_CALLBACK(CB_PickFile), const_cast<char*>(opt_name));
+
+	return chooser;
+}
+
+// Row that the next InsertWidgetInTable() call fills; the populate_*
+// functions reset it to 0 before filling a table.
+static int s_table_line = 0;
+
+// Places up to three widgets on the current row of table and advances to
+// the next row.
+//
+// - left only:      column 0
+// - right only:     column 1
+// - left == right:  the widget spans columns 0 and 1
+// - left and right: columns 0 and 1
+// - third, if given, always goes into column 2
+//
+// A left widget that is not a check button is padded by 22 pixels on each
+// side; everything else gets no padding.
+static void InsertWidgetInTable(GtkWidget* table, GtkWidget *left, GtkWidget *right = NULL, GtkWidget *third = NULL) {
+	const GtkAttachOptions opt = (GtkAttachOptions)(GTK_EXPAND | GTK_FILL); // default
+	const guint pad_left  = GTK_IS_CHECK_BUTTON(left) ? 0 : 22;
+	const guint pad_other = 0;
+	const guint pad_y     = 0;
+
+	GtkTable* grid = GTK_TABLE(table);
+	const guint top    = s_table_line;
+	const guint bottom = s_table_line+1;
+
+	if (left && right && right != left) {
+		gtk_table_attach(grid, left, 0, 1, top, bottom, opt, opt, pad_left, pad_y);
+		gtk_table_attach(grid, right, 1, 2, top, bottom, opt, opt, pad_other, pad_y);
+	} else if (!left) {
+		gtk_table_attach(grid, right, 1, 2, top, bottom, opt, opt, pad_other, pad_y);
+	} else if (!right) {
+		gtk_table_attach(grid, left, 0, 1, top, bottom, opt, opt, pad_left, pad_y);
+	} else {
+		// left == right: one widget across both columns.
+		gtk_table_attach(grid, left, 0, 2, top, bottom, opt, opt, pad_other, pad_y);
+	}
+
+	if (third) {
+		gtk_table_attach(grid, third, 2, 3, top, bottom, opt, opt, pad_other, pad_y);
+	}
+
+	s_table_line++;
+}
+
+// Creates a row x col table, wraps it in a titled frame (or in a plain
+// vertical box when frame_title is NULL), adds that wrapper to parent_box
+// and returns the table.
+GtkWidget* CreateTableInBox(GtkWidget* parent_box, const char* frame_title, int row, int col) {
+	GtkWidget* table = gtk_table_new(row, col, false);
+
+	GtkWidget* wrapper;
+	if (frame_title) {
+		wrapper = gtk_frame_new (frame_title);
+	} else {
+		wrapper = gtk_vbox_new(false, 5);
+	}
+
+	gtk_container_add(GTK_CONTAINER(wrapper), table);
+	gtk_container_add(GTK_CONTAINER(parent_box), wrapper);
+
+	return table;
+}
+
+// Fills the "Hardware Mode Settings" table, starting at row 0.
+void populate_hw_table(GtkWidget* hw_table)
+{
+	// Texture filtering
+	GtkWidget* filter_lbl = left_label("Texture Filtering:");
+	GtkWidget* filter_box = CreateComboBoxFromVector(theApp.m_gs_filter, "filter", 2);
+	AddTooltip(filter_lbl, filter_box, IDC_FILTER);
+
+	// Internal resolution
+	GtkWidget* upscale_lbl = left_label("Internal Resolution:");
+	GtkWidget* upscale_box = CreateComboBoxFromVector(theApp.m_gs_upscale_multiplier, "upscale_multiplier", 1);
+
+	// Anisotropic filtering
+	GtkWidget* aniso_lbl = left_label("Anisotropic Filtering:");
+	GtkWidget* aniso_box = CreateComboBoxFromVector(theApp.m_gs_max_anisotropy, "MaxAnisotropy", 0);
+	AddTooltip(aniso_lbl, aniso_box, IDC_AFCOMBO);
+
+	// CRC hack level
+	GtkWidget* crc_lbl = left_label("Automatic CRC level:");
+	GtkWidget* crc_box = CreateComboBoxFromVector(theApp.m_gs_crc_level, "crc_hack_level", 3);
+	AddTooltip(crc_lbl, crc_box, IDC_CRC_LEVEL);
+
+	// Check boxes
+	GtkWidget* paltex_chk = CreateCheckBox("Allow 8 bits textures", "paltex");
+	AddTooltip(paltex_chk, IDC_PALTEX);
+
+	GtkWidget* date_chk = CreateCheckBox("Accurate Date", "accurate_date", false);
+	AddTooltip(date_chk, IDC_ACCURATE_DATE);
+
+	GtkWidget* depth_chk = CreateCheckBox("Full Depth Emulation", "texture_cache_depth", true);
+	AddTooltip(depth_chk, IDC_TC_DEPTH);
+
+	// Blending accuracy
+	GtkWidget* blend_lbl = left_label("Blending Unit Accuracy:");
+	GtkWidget* blend_box = CreateComboBoxFromVector(theApp.m_gs_acc_blend_level, "accurate_blending_unit", 1);
+	AddTooltip(blend_lbl, blend_box, IDC_ACCURATE_BLEND_UNIT);
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(hw_table, paltex_chk, depth_chk);
+	InsertWidgetInTable(hw_table, date_chk);
+	InsertWidgetInTable(hw_table, upscale_lbl, upscale_box);
+	InsertWidgetInTable(hw_table, filter_lbl, filter_box);
+	InsertWidgetInTable(hw_table, aniso_lbl, aniso_box);
+	InsertWidgetInTable(hw_table, blend_lbl, blend_box);
+	InsertWidgetInTable(hw_table, crc_lbl, crc_box);
+}
+
+// Fills the "OpenGL Very Advanced Custom Settings" table, starting at row
+// 0. Every row is a label plus a combo box offering the choices in
+// theApp.m_gs_gl_ext, with -1 as the default value.
+void populate_gl_table(GtkWidget* gl_table)
+{
+	GtkWidget* storage_lbl = left_label("Buffer Storage:");
+	GtkWidget* storage_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_GL_ARB_buffer_storage", -1);
+
+	GtkWidget* separate_lbl = left_label("Separate Shader:");
+	GtkWidget* separate_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_GL_ARB_separate_shader_objects", -1);
+
+	GtkWidget* geometry_lbl = left_label("Geometry Shader:");
+	GtkWidget* geometry_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_geometry_shader", -1);
+
+	GtkWidget* image_lbl = left_label("Image Load Store:");
+	GtkWidget* image_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_GL_ARB_shader_image_load_store", -1);
+
+	GtkWidget* clip_lbl = left_label("Clip Control (depth accuracy):");
+	GtkWidget* clip_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_GL_ARB_clip_control", -1);
+
+	GtkWidget* barrier_lbl = left_label("Texture Barrier:");
+	GtkWidget* barrier_box = CreateComboBoxFromVector(theApp.m_gs_gl_ext, "override_GL_ARB_texture_barrier", -1);
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(gl_table, geometry_lbl, geometry_box);
+	InsertWidgetInTable(gl_table, storage_lbl, storage_box);
+	InsertWidgetInTable(gl_table, separate_lbl, separate_box);
+	InsertWidgetInTable(gl_table, image_lbl, image_box);
+	InsertWidgetInTable(gl_table, clip_lbl, clip_box);
+	InsertWidgetInTable(gl_table, barrier_lbl, barrier_box);
+}
+
+// Fills the "Software Mode Settings" table, starting at row 0.
+void populate_sw_table(GtkWidget* sw_table)
+{
+	GtkWidget* workers_lbl  = left_label("Extra rendering threads:");
+	GtkWidget* workers_spin = CreateSpinButton(0, 32, "extrathreads", DEFAULT_EXTRA_RENDERING_THREADS);
+	AddTooltip(workers_lbl, workers_spin, IDC_SWTHREADS);
+
+	GtkWidget* aa1_chk = CreateCheckBox("Edge anti-aliasing (AA1)", "aa1");
+	AddTooltip(aa1_chk, IDC_AA1);
+
+	GtkWidget* mipmap_chk = CreateCheckBox("Mipmap", "mipmap", true);
+	AddTooltip(mipmap_chk, IDC_MIPMAP);
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(sw_table, workers_lbl, workers_spin);
+	InsertWidgetInTable(sw_table, aa1_chk, mipmap_chk);
+}
+
+// Fills the "Custom Shader Settings" table, starting at row 0.
+void populate_shader_table(GtkWidget* shader_table)
+{
+	// External shader: source file and its configuration file.
+	GtkWidget* fx_file      = CreateFileChooser(GTK_FILE_CHOOSER_ACTION_OPEN, "Select an external shader", "shaderfx_glsl", "dummy.glsl");
+	GtkWidget* fx_conf      = CreateFileChooser(GTK_FILE_CHOOSER_ACTION_OPEN, "Then select a config", "shaderfx_conf", "dummy.ini");
+	GtkWidget* fx_file_lbl  = left_label("External shader glsl");
+	GtkWidget* fx_conf_lbl  = left_label("External shader conf");
+
+	// On/off switches.
+	GtkWidget* boost_chk = CreateCheckBox("Shade boost", "ShadeBoost");
+	AddTooltip(boost_chk, IDC_SHADEBOOST);
+
+	GtkWidget* fxaa_chk = CreateCheckBox("Fxaa shader", "fxaa");
+	AddTooltip(fxaa_chk, IDC_FXAA);
+
+	GtkWidget* fx_chk = CreateCheckBox("External shader", "shaderfx");
+	AddTooltip(fx_chk, IDC_SHADER_FX);
+
+	// TV shader selection.
+	GtkWidget* tv_lbl = left_label("TV shader:");
+	GtkWidget* tv_box = CreateComboBoxFromVector(theApp.m_gs_tv_shaders, "TVShader");
+
+	// Shade boost sliders.
+	GtkWidget* bright_scale = CreateScale("ShadeBoost_Brightness", 50);
+	GtkWidget* bright_lbl   = left_label("Shade Boost Brightness:");
+
+	GtkWidget* contrast_scale = CreateScale("ShadeBoost_Contrast", 50);
+	GtkWidget* contrast_lbl   = left_label("Shade Boost Contrast:");
+
+	GtkWidget* satur_scale = CreateScale("ShadeBoost_Saturation", 50);
+	GtkWidget* satur_lbl   = left_label("Shade Boost Saturation:");
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(shader_table, fxaa_chk);
+	InsertWidgetInTable(shader_table, boost_chk);
+	InsertWidgetInTable(shader_table, bright_lbl, bright_scale);
+	InsertWidgetInTable(shader_table, contrast_lbl, contrast_scale);
+	InsertWidgetInTable(shader_table, satur_lbl, satur_scale);
+	InsertWidgetInTable(shader_table, fx_chk);
+	InsertWidgetInTable(shader_table, fx_file_lbl, fx_file);
+	InsertWidgetInTable(shader_table, fx_conf_lbl, fx_conf);
+	InsertWidgetInTable(shader_table, tv_lbl, tv_box);
+}
+
+// Fills the "Hacks" table, starting at row 0. Tooltips reuse the help
+// strings of the Windows dialog.
+void populate_hack_table(GtkWidget* hack_table)
+{
+	GtkWidget* offset_chk = CreateCheckBox("Half-pixel Offset Hack", "UserHacks_HalfPixelOffset");
+	AddTooltip(offset_chk, IDC_OFFSETHACK);
+
+	GtkWidget* skipdraw_lbl  = left_label("Skipdraw:");
+	GtkWidget* skipdraw_spin = CreateSpinButton(0, 1000, "UserHacks_SkipDraw");
+	AddTooltip(skipdraw_lbl, IDC_SKIPDRAWHACK);
+	AddTooltip(skipdraw_spin, IDC_SKIPDRAWHACK);
+
+	GtkWidget* enable_chk = CreateCheckBox("Enable User Hacks", "UserHacks");
+	gtk_widget_set_tooltip_text(enable_chk, "Allows the use of the hack below");
+
+	GtkWidget* wild_chk = CreateCheckBox("Wild Arms Hack", "UserHacks_WildHack");
+	AddTooltip(wild_chk, IDC_WILDHACK);
+
+	GtkWidget* tco_lbl   = left_label("Texture Offset: 0x");
+	GtkWidget* tco_entry = CreateTextBox("UserHacks_TCOffset");
+	AddTooltip(tco_lbl, IDC_TCOFFSETX);
+	AddTooltip(tco_entry, IDC_TCOFFSETX);
+
+	GtkWidget* align_chk = CreateCheckBox("Align sprite hack", "UserHacks_align_sprite_X");
+	AddTooltip(align_chk, IDC_ALIGN_SPRITE);
+
+	GtkWidget* preload_chk = CreateCheckBox("Preload Frame", "preload_frame_with_gs_data");
+	AddTooltip(preload_chk, IDC_PRELOAD_GS);
+
+	GtkWidget* fbmask_chk = CreateCheckBox("Safe Accurate Blending", "UserHacks_safe_fbmask");
+	AddTooltip(fbmask_chk, IDC_SAFE_FBMASK);
+
+	GtkWidget* fast_inv_chk = CreateCheckBox("Fast Texture Invalidation", "UserHacks_DisablePartialInvalidation");
+	AddTooltip(fast_inv_chk, IDC_FAST_TC_INV);
+
+	GtkWidget* sprite_box = CreateComboBoxFromVector(theApp.m_gs_hack, "UserHacks_SpriteHack");
+	GtkWidget* sprite_lbl = left_label("Alpha-Sprite Hack:");
+	AddTooltip(sprite_lbl, sprite_box, IDC_SPRITEHACK);
+
+	GtkWidget* round_box = CreateComboBoxFromVector(theApp.m_gs_hack, "UserHacks_round_sprite_offset");
+	GtkWidget* round_lbl = left_label("Align Sprite Texture:");
+	AddTooltip(round_lbl, round_box, IDC_ROUND_SPRITE);
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(hack_table, enable_chk);
+	InsertWidgetInTable(hack_table, wild_chk, align_chk);
+	InsertWidgetInTable(hack_table, offset_chk, preload_chk);
+	InsertWidgetInTable(hack_table, fbmask_chk, fast_inv_chk);
+	InsertWidgetInTable(hack_table, sprite_lbl, sprite_box);
+	InsertWidgetInTable(hack_table, round_lbl, round_box);
+	InsertWidgetInTable(hack_table, skipdraw_lbl, skipdraw_spin);
+	InsertWidgetInTable(hack_table, tco_lbl, tco_entry);
+}
+
+// Fills the untitled table at the top of the dialog (renderer and
+// interlacing mode), starting at row 0.
+void populate_main_table(GtkWidget* main_table)
+{
+	const int default_renderer = static_cast<int>(GSRendererType::Default);
+
+	GtkWidget* renderer_lbl = left_label("Renderer:");
+	GtkWidget* renderer_box = CreateComboBoxFromVector(theApp.m_gs_renderers, "Renderer", default_renderer);
+
+	GtkWidget* interlace_lbl = left_label("Interlacing (F5):");
+	GtkWidget* interlace_box = CreateComboBoxFromVector(theApp.m_gs_interlace, "interlace", 7);
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(main_table, renderer_lbl, renderer_box);
+	InsertWidgetInTable(main_table, interlace_lbl, interlace_box);
+}
+
+// Fills the "OpenGL / GSdx Debug Settings" table, starting at row 0.
+void populate_debug_table(GtkWidget* debug_table)
+{
+	// Diagnostics switches.
+	GtkWidget* glsl_chk = CreateCheckBox("GLSL compilation", "debug_glsl_shader");
+	GtkWidget* gl_chk   = CreateCheckBox("Print GL error", "debug_opengl");
+
+	// What to dump.
+	GtkWidget* dump_chk  = CreateCheckBox("Dump GS data", "dump");
+	GtkWidget* rt_chk    = CreateCheckBox("Save RT", "save");
+	GtkWidget* frame_chk = CreateCheckBox("Save Frame", "savef");
+	GtkWidget* tex_chk   = CreateCheckBox("Save Texture", "savet");
+	GtkWidget* depth_chk = CreateCheckBox("Save Depth", "savez");
+
+	// Dump window.
+	GtkWidget* start_lbl   = left_label("Start of Dump");
+	GtkWidget* start_spin  = CreateSpinButton(0, pow(10, 9), "saven");
+	GtkWidget* length_lbl  = left_label("Length of Dump");
+	GtkWidget* length_spin = CreateSpinButton(0, pow(10, 5), "savel");
+
+	// Layout, top to bottom.
+	s_table_line = 0;
+	InsertWidgetInTable(debug_table, gl_chk, glsl_chk);
+	InsertWidgetInTable(debug_table, dump_chk);
+	InsertWidgetInTable(debug_table, rt_chk, frame_chk);
+	InsertWidgetInTable(debug_table, tex_chk, depth_chk);
+	InsertWidgetInTable(debug_table, start_lbl, start_spin);
+	InsertWidgetInTable(debug_table, length_lbl, length_spin);
+}
+
+// Fills the "Recording Settings" table, starting at row 0. The resolution
+// row uses all three columns: label, width and height.
+void populate_record_table(GtkWidget* record_table)
+{
+	GtkWidget* capture_check = CreateCheckBox("Enable Recording (with F12)", "capture_enabled");
+	GtkWidget* resxy_label   = left_label("Resolution:");
+	GtkWidget* resx_spin     = CreateSpinButton(256, 8192, "capture_resx", 1280);
+	GtkWidget* resy_spin     = CreateSpinButton(256, 8192, "capture_resy", 1024);
+	GtkWidget* threads_label = left_label("Saving Threads:");
+	GtkWidget* threads_spin  = CreateSpinButton(1, 32, "capture_threads", 4);
+	GtkWidget* out_dir_label = left_label("Output Directory:");
+	GtkWidget* out_dir       = CreateFileChooser(GTK_FILE_CHOOSER_ACTION_SELECT_FOLDER, "Select a directory", "capture_out_dir", "/tmp");
+	GtkWidget* png_label     = left_label("PNG Compression Level:");
+	GtkWidget* png_level     = CreateSpinButton(1, 9, "png_compression_level", 1);
+
+	// Restart the row counter like every other populate_* function does;
+	// otherwise the rows would be placed after the last row used by
+	// whichever table was populated before this one.
+	s_table_line = 0;
+	InsertWidgetInTable(record_table , capture_check);
+	InsertWidgetInTable(record_table , resxy_label   , resx_spin      , resy_spin);
+	InsertWidgetInTable(record_table , threads_label , threads_spin);
+	InsertWidgetInTable(record_table , png_label     , png_level);
+	InsertWidgetInTable(record_table , out_dir_label , out_dir);
+}
+
+// Builds and runs the modal "GSdx Config" dialog.
+//
+// The dialog shows the logo and the main table on top and a notebook with
+// three pages below: global, advanced and debug/recording settings. Each
+// widget writes its option to the configuration as soon as it changes, so
+// there is only an OK button. After the dialog is closed, a few options
+// that this dialog does not expose are written with fixed or current
+// values.
+//
+// Returns true when the dialog was closed with the OK button.
+bool RunLinuxDialog()
+{
+	GtkWidget *dialog;
+	int return_value;
+
+	/* Create the widgets */
+	dialog = gtk_dialog_new_with_buttons (
+		"GSdx Config",
+		NULL, /* parent window*/
+		(GtkDialogFlags)(GTK_DIALOG_MODAL | GTK_DIALOG_DESTROY_WITH_PARENT),
+		"OK", GTK_RESPONSE_ACCEPT,
+		// "Cancel", GTK_RESPONSE_REJECT, // Drop because it is too annoying to support call back this way
+		NULL);
+
+	// The main area for the whole dialog box.
+	GtkWidget* main_box     = gtk_vbox_new(false, 5);
+	GtkWidget* central_box  = gtk_vbox_new(false, 5);
+	GtkWidget* advanced_box = gtk_vbox_new(false, 5);
+	GtkWidget* debug_box    = gtk_vbox_new(false, 5);
+
+	// Grab a logo, to make things look nice.
+	GdkPixbuf* logo_pixmap = gdk_pixbuf_from_pixdata(&gsdx_ogl_logo, false, NULL);
+	GtkWidget* logo_image  = gtk_image_new_from_pixbuf(logo_pixmap);
+	gtk_box_pack_start(GTK_BOX(main_box), logo_image, true, true, 0);
+
+	// The image widget holds its own reference to the pixbuf, so release
+	// ours here; otherwise the pixbuf is leaked every time the dialog runs.
+	if (logo_pixmap) {
+		g_object_unref(logo_pixmap);
+	}
+
+	// Row/column counts match what the populate_* functions insert.
+	GtkWidget* main_table   = CreateTableInBox(main_box    , NULL                                   , 2  , 2);
+
+	GtkWidget* shader_table = CreateTableInBox(central_box , "Custom Shader Settings"               , 9  , 2);
+	GtkWidget* hw_table     = CreateTableInBox(central_box , "Hardware Mode Settings"               , 7  , 2);
+	GtkWidget* sw_table     = CreateTableInBox(central_box , "Software Mode Settings"               , 2  , 2);
+
+	GtkWidget* hack_table   = CreateTableInBox(advanced_box, "Hacks"                                , 8 , 2);
+	GtkWidget* gl_table     = CreateTableInBox(advanced_box, "OpenGL Very Advanced Custom Settings" , 6 , 2);
+
+	GtkWidget* record_table = CreateTableInBox(debug_box   , "Recording Settings"                   , 5  , 3);
+	GtkWidget* debug_table  = CreateTableInBox(debug_box   , "OpenGL / GSdx Debug Settings"         , 6  , 3);
+
+	// Populate all the tables
+	populate_main_table(main_table);
+
+	populate_shader_table(shader_table);
+	populate_hw_table(hw_table);
+	populate_sw_table(sw_table);
+
+	populate_hack_table(hack_table);
+	populate_gl_table(gl_table);
+
+	populate_debug_table(debug_table);
+	populate_record_table(record_table);
+
+	// Handle some nice tab
+	GtkWidget* notebook = gtk_notebook_new();
+	gtk_notebook_append_page(GTK_NOTEBOOK(notebook), central_box , gtk_label_new("Global Settings"));
+	gtk_notebook_append_page(GTK_NOTEBOOK(notebook), advanced_box, gtk_label_new("Advanced Settings"));
+	gtk_notebook_append_page(GTK_NOTEBOOK(notebook), debug_box   , gtk_label_new("Debug/Recording Settings"));
+
+	// Put everything in the big box.
+	gtk_container_add(GTK_CONTAINER(main_box), notebook);
+
+	// Put the box in the dialog and show it to the world.
+	gtk_container_add (GTK_CONTAINER(gtk_dialog_get_content_area(GTK_DIALOG(dialog))), main_box);
+	gtk_widget_show_all (dialog);
+	return_value = gtk_dialog_run (GTK_DIALOG (dialog));
+
+	// Compatibility & not supported option: the window size is written
+	// back unchanged (or with its default), msaa is forced off and
+	// windowed mode is forced on.
+	int mode_width = theApp.GetConfig("ModeWidth", 640);
+	int mode_height = theApp.GetConfig("ModeHeight", 480);
+	theApp.SetConfig("ModeHeight", mode_height);
+	theApp.SetConfig("ModeWidth", mode_width);
+	theApp.SetConfig("msaa", 0);
+	theApp.SetConfig("windowed", 1);
+
+	gtk_widget_destroy (dialog);
+
+	return (return_value == GTK_RESPONSE_ACCEPT);
+}
diff --git a/plugins/GSdx_legacy/GSLinuxLogo.h b/plugins/GSdx_legacy/GSLinuxLogo.h
new file mode 100644
index 0000000000..1080969147
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLinuxLogo.h
@@ -0,0 +1,1617 @@
+/*
+ *     Generated file
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+/* GdkPixbuf RGB C-Source image dump 1-byte-run-length-encoded */
+
+#include <gdk-pixbuf/gdk-pixdata.h>
+
+//gdk-pixbuf-csource res/logo-ogl.bmp --struct  >! GSLinuxLogo.h
+
+static const GdkPixdata gsdx_ogl_logo = {
+  0x47646b50, /* Pixbuf magic: 'GdkP' */
+  24 + 43884, /* header length + pixel_data length */
+  0x2010001, /* pixdata_type */
+  786, /* rowstride */
+  262, /* width */
+  71, /* height */
+  /* pixel_data: */
+  (unsigned char*)"\7\22\24\26\22\25\27\22\24\26\21\25\26\22\24\26\22\24\27\22\24\26\202"
+  "\21\24\26\10\22\25\26\22\24\26\22\25\27\22\24\26\22\25\27\21\24\27\22"
+  "\24\26\21\24\26\202\22\24\26\1\21\24\26\202\22\24\27\202\22\24\26\202"
+  "\21\24\26\3\22\24\27\22\24\26\22\25\26\202\21\24\27\4\22\24\27\21\25"
+  "\27\22\24\27\22\24\26\204\22\24\27\6\21\24\27\22\24\26\22\24\27\22\24"
+  "\26\21\24\27\22\24\27\202\21\24\26\1\22\24\27\202\22\24\26\36\21\25\26"
+  "\22\24\26\22\24\27\21\24\26\22\24\26\21\24\26\22\25\26\21\24\27\21\25"
+  "\26\21\24\26\21\24\27\22\24\26\22\25\27\22\24\27\21\25\26\22\24\27\22"
+  "\25\27\21\24\27\22\25\26\21\24\26\22\25\26\21\25\27\22\24\27\22\24\26"
+  "\22\24\27\22\24\26\22\24\27\21\25\26\21\24\26\21\25\27\202\22\24\26\11"
+  "\22\24\27\21\24\27\22\24\26\21\24\27\22\24\27\22\25\27\22\24\26\21\24"
+  "\26\21\24\27\202\21\24\26\5\22\24\27\22\24\26\21\24\27\22\24\27\22\25"
+  "\26\202\22\24\26\202\21\24\26\204\22\24\26\7\22\24\27\21\25\26\22\24"
+  "\27\21\24\26\22\24\26\21\24\27\22\24\26\202\22\24\27\4\22\24\26\22\24"
+  "\27\21\25\26\22\24\27\203\22\24\26\203\22\24\27\202\22\25\26\1\22\24"
+  "\26\202\21\24\26\11\22\24\27\22\24\26\21\25\26\22\24\26\22\24\27\22\24"
+  "\26\21\24\26\22\24\26\21\24\26\202\21\24\27\203\22\24\26\2\22\25\26\21"
+  "\24\26\202\22\24\27\11\22\24\26\22\25\26\21\24\27\21\25\26\22\24\26\22"
+  "\24\27\21\24\26\22\24\27\21\24\27\202\22\24\27\4\21\25\26\22\25\26\22"
+  "\24\26\22\25\26\202\22\24\27\202\22\24\26\202\22\24\27\2\21\24\27\22"
+  "\25\26\203\22\24\26\1\21\24\26\202\22\24\26\11\21\24\27\22\25\26\22\24"
+  "\26\21\24\26\22\24\26\21\24\26\22\25\27\22\25\26\22\24\27\202\22\25\26"
+  "\21\22\24\27\22\24\26\22\24\27\21\25\27\22\25\27\22\25\26\22\24\26\22"
+  "\24\27\22\25\27\22\24\26\22\24\27\21\24\27\22\24\27\22\25\27\22\24\27"
+  "\22\24\26\21\25\27\202\22\25\26\203\22\24\26\203\21\24\26\204\22\24\26"
+  "\4\22\24\27\22\25\26\21\25\27\22\24\26\203\22\24\27\4\22\25\26\22\24"
+  "\26\21\24\26\21\25\26\204\22\24\27\4\21\24\27\22\24\27\22\24\26\22\25"
+  "\27\202\21\24\27\2\22\24\26\21\25\26\202\22\24\26\5\21\25\26\21\24\27"
+  "\22\24\26\22\24\27\21\24\27\202\22\24\26\202\22\24\27\6\22\24\26\22\24"
+  "\27\21\24\26\21\25\26\22\24\26\22\24\27\203\21\24\27\4\22\25\26\22\24"
+  "\27\22\25\27\22\24\27\202\21\24\27\2\21\25\27\22\25\30\203\22\25\27\3"
+  "\22\24\27\22\25\27\21\24\27\202\22\25\27\1\22\24\30\203\22\25\27\3\22"
+  "\24\27\22\25\30\22\24\27\202\22\25\27\202\22\25\30\204\22\25\27\11\22"
+  "\24\27\22\25\30\22\25\27\22\24\27\22\25\27\22\25\30\22\25\27\21\25\27"
+  "\22\25\30\202\22\25\27\203\22\24\30\6\22\25\27\22\24\30\22\25\27\22\24"
+  "\30\22\25\27\21\25\27\203\22\25\27\4\22\24\27\22\25\30\21\25\27\22\24"
+  "\27\202\22\25\27\1\22\24\27\202\22\25\27\1\22\24\27\202\22\25\27\202"
+  "\22\25\30\202\22\25\27\1\22\25\30\202\22\25\27\4\22\24\27\22\25\30\22"
+  "\25\27\22\25\30\202\22\25\27\1\22\24\27\204\22\25\27\5\22\25\30\22\25"
+  "\27\21\24\27\22\25\27\22\24\27\202\22\25\27\202\22\24\30\5\22\25\27\22"
+  "\25\30\21\24\27\22\25\27\22\24\30\202\22\25\27\10\21\25\27\22\25\27\22"
+  "\25\30\22\25\27\22\24\30\22\24\27\21\25\27\22\25\27\202\21\24\27\1\22"
+  "\24\27\202\22\25\30\2\22\25\27\22\24\27\202\22\25\27\2\22\24\27\22\24"
+  "\30\202\22\25\27\2\22\24\27\21\25\27\203\22\25\27\202\22\25\30\205\22"
+  "\25\27\14\21\25\30\22\24\27\22\25\30\22\24\27\22\25\27\22\24\27\21\25"
+  "\27\22\25\27\22\25\30\22\24\30\22\25\27\22\24\27\202\22\25\30\13\22\25"
+  "\27\22\24\27\22\25\30\22\25\27\21\25\27\22\25\27\21\25\30\22\25\27\22"
+  "\25\30\21\25\30\22\24\30\202\22\24\27\203\22\25\27\13\22\25\30\21\25"
+  "\27\21\24\27\22\25\27\21\25\27\22\25\27\22\24\27\22\25\30\22\24\27\22"
+  "\25\27\22\24\30\202\22\25\27\2\21\25\30\21\25\27\204\22\25\27\202\22"
+  "\25\30\1\22\24\30\202\22\25\27\6\22\25\30\21\25\27\22\25\27\21\25\30"
+  "\21\25\27\22\24\27\202\22\25\27\10\21\25\27\22\25\27\21\25\27\22\25\27"
+  "\21\24\27\22\25\27\22\24\30\22\24\27\202\22\24\30\202\22\25\27\202\22"
+  "\25\30\202\22\25\27\1\21\25\27\202\22\25\27\1\21\25\27\202\22\25\30\202"
+  "\22\25\27\1\21\25\27\202\22\25\27\10\22\24\27\22\25\30\22\25\27\22\24"
+  "\27\22\25\27\22\24\27\22\25\27\21\25\30\205\22\25\27\5\21\24\27\22\25"
+  "\27\22\25\30\22\24\27\21\25\27\202\22\25\27\3\22\24\27\22\25\27\22\25"
+  "\30\202\22\25\27\5\22\25\30\21\24\27\22\24\27\22\25\30\22\25\27\202\22"
+  "\24\27\202\22\25\30\1\21\24\27\203\22\25\27\1\22\25\30\202\22\25\27\2"
+  "\21\25\27\22\25\30\202\22\26\30\202\22\25\30\1\22\26\30\202\22\25\30"
+  "\1\22\25\31\213\22\25\30\3\22\26\30\22\25\30\22\26\31\202\22\25\30\3"
+  "\22\26\30\22\25\30\22\25\31\204\22\25\30\7\22\26\30\23\25\31\22\25\30"
+  "\22\25\31\22\25\30\22\26\30\23\25\30\206\22\25\30\2\22\26\30\22\25\30"
+  "\202\22\26\30\203\22\25\30\2\22\26\30\23\25\30\202\22\25\30\2\22\26\30"
+  "\22\25\31\205\22\25\30\202\22\26\30\204\22\25\30\3\22\26\30\22\25\30"
+  "\22\25\31\202\22\25\30\1\22\25\31\203\22\25\30\1\22\25\31\203\22\25\30"
+  "\1\23\26\30\202\22\26\30\2\22\25\30\22\26\30\202\22\25\30\1\22\26\30"
+  "\204\22\25\30\2\22\26\30\22\25\31\205\22\25\30\202\22\26\30\202\22\25"
+  "\30\1\22\26\30\202\22\25\30\202\22\26\31\5\22\25\31\22\25\30\22\26\30"
+  "\22\25\30\22\26\30\207\22\25\30\1\22\26\30\202\22\25\30\2\22\26\30\22"
+  "\25\31\203\22\25\30\1\22\26\30\204\22\25\30\1\22\26\30\204\22\25\30\203"
+  "\22\26\30\202\22\25\30\1\22\26\30\203\22\25\30\1\22\25\31\210\22\25\30"
+  "\2\22\26\31\22\25\31\202\22\25\30\202\22\26\30\3\22\25\30\23\25\30\22"
+  "\25\31\210\22\25\30\202\22\26\30\202\22\25\31\3\22\26\30\22\25\30\22"
+  "\26\30\205\22\25\30\1\22\26\30\203\22\25\30\204\22\26\30\202\22\25\30"
+  "\1\22\26\30\203\22\25\30\1\22\25\31\202\22\25\30\1\22\25\31\202\22\25"
+  "\30\1\22\26\30\202\22\25\30\1\22\25\31\203\22\25\30\1\22\26\30\202\22"
+  "\25\30\1\22\25\31\203\22\25\30\3\23\25\30\22\25\30\22\26\31\202\22\25"
+  "\30\3\23\25\30\22\25\30\22\26\31\203\22\25\30\2\22\26\30\22\26\31\202"
+  "\22\25\30\3\22\25\31\22\26\30\23\25\30\205\22\25\30\5\22\26\31\22\25"
+  "\30\22\25\31\22\25\30\22\26\30\204\22\25\30\1\23\25\30\204\22\26\30\202"
+  "\22\25\30\3\23\25\30\22\26\30\22\26\31\202\23\26\31\1\22\26\31\202\22"
+  "\25\31\2\22\26\31\22\25\31\207\22\26\31\2\22\25\31\23\26\31\203\22\26"
+  "\31\1\23\26\31\204\22\26\31\1\22\25\31\210\22\26\31\3\23\25\31\22\26"
+  "\31\22\25\31\203\22\26\31\4\22\25\31\23\26\31\22\26\31\22\25\31\221\22"
+  "\26\31\4\23\26\31\23\25\31\22\25\31\22\26\31\202\23\26\31\207\22\26\31"
+  "\1\23\26\31\203\22\26\31\1\23\25\31\204\22\26\31\3\23\26\31\22\26\31"
+  "\22\25\31\205\22\26\31\202\22\25\31\204\22\26\31\202\23\26\31\203\22"
+  "\26\31\6\22\26\30\22\26\31\23\25\31\22\26\31\23\26\31\22\26\31\203\22"
+  "\25\31\202\22\26\31\4\23\26\31\22\26\31\23\25\31\23\26\31\203\22\26\31"
+  "\2\23\26\31\22\26\31\203\22\25\31\203\22\26\31\10\22\25\31\22\26\31\22"
+  "\25\31\23\26\31\22\26\31\23\26\31\22\26\31\23\25\31\202\22\26\31\202"
+  "\23\26\31\203\22\26\31\1\22\25\31\203\22\26\31\1\23\26\31\202\22\26\31"
+  "\1\23\26\31\205\22\26\31\2\23\26\31\22\25\31\203\22\26\31\1\22\25\31"
+  "\203\22\26\31\2\22\25\31\23\25\31\205\22\26\31\1\22\25\31\202\22\26\31"
+  "\1\23\26\31\202\22\25\31\2\22\26\31\23\26\31\203\22\26\31\1\22\25\31"
+  "\202\22\26\31\1\23\26\31\202\22\26\31\202\22\25\31\1\23\26\31\206\22"
+  "\26\31\2\23\25\31\22\25\31\203\22\26\31\3\23\26\31\22\25\31\22\26\31"
+  "\202\22\25\31\2\22\26\31\22\25\31\202\22\26\31\1\22\25\31\202\22\26\31"
+  "\1\23\26\31\205\22\26\31\3\22\25\31\23\26\31\22\25\31\204\22\26\31\202"
+  "\22\25\31\1\22\26\31\202\23\26\31\202\22\26\31\5\22\25\31\22\26\31\23"
+  "\25\31\22\26\31\22\26\30\202\22\26\31\202\23\26\31\1\22\26\31\202\22"
+  "\25\31\203\22\26\31\3\23\25\31\22\26\31\23\26\31\202\22\26\31\12\22\25"
+  "\31\22\26\31\23\25\31\22\26\31\22\25\31\22\26\31\22\25\31\22\26\32\23"
+  "\26\32\22\27\32\202\23\26\32\202\22\26\32\2\22\27\32\23\26\31\202\22"
+  "\26\32\12\22\27\31\22\27\32\22\26\31\22\26\32\22\27\32\22\26\31\23\26"
+  "\32\22\26\31\22\27\32\22\26\32\202\22\27\32\1\22\26\32\202\22\27\32\7"
+  "\23\26\32\23\27\32\22\27\32\23\26\32\22\26\31\22\27\32\23\26\31\202\23"
+  "\26\32\2\22\26\31\22\26\32\202\23\26\32\16\22\27\32\23\26\32\22\27\31"
+  "\22\27\32\22\26\32\22\27\32\22\26\32\22\27\32\22\26\32\22\27\32\22\26"
+  "\32\23\26\32\22\26\32\22\27\32\202\22\26\32\11\22\27\31\22\26\32\23\26"
+  "\32\22\26\32\23\26\32\22\26\31\22\27\32\23\27\32\22\27\32\202\22\26\32"
+  "\202\22\27\32\7\23\27\32\22\27\32\22\26\32\23\26\32\22\27\32\22\26\32"
+  "\22\27\32\202\23\27\32\202\23\26\32\5\22\26\32\23\26\32\22\27\31\22\27"
+  "\32\23\26\32\204\22\26\32\4\23\26\32\22\27\32\22\26\32\23\27\32\202\22"
+  "\27\32\1\23\26\32\202\22\26\32\30\22\26\31\22\27\31\22\27\32\23\27\32"
+  "\22\26\32\23\26\32\23\27\31\22\27\32\22\26\31\22\27\32\22\26\32\22\27"
+  "\31\22\27\32\22\26\32\23\27\31\22\26\32\22\27\32\23\27\32\22\26\32\23"
+  "\27\31\23\27\32\22\27\32\22\26\32\22\26\31\202\22\26\32\6\22\27\32\23"
+  "\26\32\22\27\31\23\27\32\22\26\32\23\26\32\202\22\26\32\1\23\27\32\202"
+  "\22\26\32\2\22\27\32\23\27\31\203\23\26\32\5\22\26\32\23\27\32\23\30"
+  "\32\22\26\32\23\26\32\202\22\27\32\2\22\26\32\22\27\32\202\22\26\32\203"
+  "\23\26\32\3\22\26\32\23\27\32\23\26\32\202\22\26\32\1\22\26\31\202\22"
+  "\26\32\10\23\26\32\23\27\32\22\26\31\23\27\32\22\27\31\22\26\32\23\26"
+  "\32\22\26\32\204\23\26\32\5\22\26\32\23\26\32\22\26\32\22\27\32\22\26"
+  "\32\203\23\26\32\1\22\26\31\202\22\26\32\5\22\27\32\23\26\32\23\27\32"
+  "\22\27\31\23\26\31\203\22\26\32\1\23\26\32\202\22\27\32\10\22\26\32\23"
+  "\26\32\22\26\32\23\26\32\22\26\32\22\27\32\22\26\32\23\26\31\204\22\26"
+  "\32\10\23\26\32\22\27\32\22\26\32\23\26\31\22\27\32\23\27\32\22\27\32"
+  "\23\27\32\207\22\26\32\1\23\26\32\203\22\26\32\3\22\27\31\23\26\32\22"
+  "\27\32\203\22\26\32\202\23\26\32\1\22\26\31\203\23\26\32\1\22\26\32\203"
+  "\23\26\32\1\22\26\32\202\22\27\32\2\23\26\32\22\26\31\202\22\26\32\2"
+  "\22\27\32\23\27\32\202\23\26\32\2\23\27\32\22\27\32\202\22\26\32\202"
+  "\23\26\32\202\22\26\32\3\22\27\32\23\27\33\22\27\32\203\22\27\33\203"
+  "\23\27\33\3\23\27\32\22\27\32\23\27\33\202\22\27\33\3\23\27\33\22\27"
+  "\33\23\27\33\202\22\27\33\1\23\27\33\202\22\27\33\2\22\27\32\22\27\33"
+  "\202\23\27\32\202\22\27\33\202\23\27\33\203\22\27\33\1\23\27\33\203\22"
+  "\27\33\202\23\27\33\1\22\27\32\202\22\27\33\1\22\27\32\202\23\27\33\1"
+  "\23\26\33\202\22\27\33\4\23\27\33\22\26\33\22\27\33\23\27\33\203\22\27"
+  "\33\7\23\27\32\23\27\33\22\27\33\23\27\33\22\27\33\22\27\32\23\27\33"
+  "\202\22\27\33\1\23\27\33\202\22\27\33\1\23\27\32\203\23\26\33\13\23\27"
+  "\33\22\27\33\22\26\33\23\26\33\22\26\32\23\27\32\22\27\33\23\27\32\22"
+  "\27\33\22\26\33\22\27\32\202\23\27\33\4\23\27\32\23\27\33\22\27\33\23"
+  "\27\33\206\22\27\33\10\23\27\32\22\27\33\23\27\33\22\27\33\23\27\33\23"
+  "\26\33\22\27\33\22\27\32\206\22\27\33\203\23\27\33\202\22\27\33\2\23"
+  "\27\33\22\27\33\202\23\27\33\4\22\27\32\23\26\33\22\27\33\23\27\33\203"
+  "\22\27\33\1\23\26\33\203\23\27\33\202\22\27\33\4\23\27\33\22\27\32\22"
+  "\27\33\23\26\33\203\23\27\33\1\22\27\33\204\23\27\33\7\24\27\34\24\31"
+  "\35\22\27\33\23\27\32\23\27\33\22\27\33\23\27\33\202\22\27\33\6\23\27"
+  "\33\22\27\32\22\27\33\23\27\33\22\26\33\23\27\33\202\22\27\33\10\22\27"
+  "\32\23\27\33\22\27\32\23\27\33\22\27\33\23\27\33\23\26\33\22\27\33\202"
+  "\23\27\33\1\23\27\32\202\23\27\33\5\23\26\32\23\27\32\23\27\33\22\27"
+  "\33\22\26\33\202\23\27\33\5\22\26\33\23\27\33\22\27\33\23\27\33\23\26"
+  "\33\202\23\27\33\4\23\26\33\23\27\32\22\27\33\22\27\32\203\22\27\33\1"
+  "\22\26\33\203\23\27\33\1\23\26\33\204\22\27\33\202\23\27\33\12\22\27"
+  "\32\23\26\33\22\27\33\22\27\32\22\27\33\23\27\33\23\27\32\22\27\32\23"
+  "\27\33\22\27\33\202\23\27\33\6\22\27\33\23\26\32\22\27\33\23\27\33\22"
+  "\27\33\23\27\33\204\22\27\33\2\23\27\33\22\27\33\202\23\27\33\202\22"
+  "\27\32\5\23\27\33\23\26\33\22\27\32\23\26\33\23\27\33\202\22\27\33\1"
+  "\23\27\33\204\22\27\33\3\23\27\33\22\27\32\22\27\33\202\23\27\33\205"
+  "\22\27\33\5\23\26\33\22\26\33\23\27\33\22\27\33\23\26\33\204\22\27\33"
+  "\2\22\30\34\23\27\34\202\23\27\33\2\22\27\34\23\27\34\202\23\30\34\1"
+  "\22\27\34\202\23\30\34\7\23\27\34\22\30\34\22\27\34\23\27\34\22\30\34"
+  "\23\30\33\23\27\33\202\22\27\34\7\23\27\33\22\30\34\23\27\34\23\30\34"
+  "\22\27\34\23\30\34\23\27\33\202\23\30\33\11\23\30\34\23\27\33\22\30\34"
+  "\23\27\34\23\30\34\22\30\34\23\30\33\23\30\34\23\27\33\203\23\27\34\7"
+  "\22\30\34\23\27\34\22\27\33\23\30\33\23\27\33\23\27\34\22\30\34\202\23"
+  "\30\34\2\22\27\34\22\27\33\202\23\30\34\202\22\27\34\1\23\27\34\202\23"
+  "\30\34\15\23\27\34\22\30\34\23\30\34\22\27\34\23\30\33\22\27\34\22\30"
+  "\34\22\30\33\22\27\34\23\30\34\23\27\34\23\30\34\23\27\33\202\23\27\34"
+  "\1\23\27\33\202\23\27\34\11\23\30\34\22\30\34\23\30\34\22\27\33\22\30"
+  "\34\22\27\34\23\30\34\23\27\34\22\27\34\203\23\30\34\202\23\27\34\202"
+  "\23\30\34\202\22\30\34\1\22\27\34\203\23\30\34\202\23\27\34\6\22\30\34"
+  "\22\27\34\23\30\34\22\30\34\23\27\34\23\27\33\202\23\30\34\7\23\27\33"
+  "\23\30\34\23\30\33\23\27\33\22\27\34\23\30\33\23\30\34\202\23\27\34\202"
+  "\22\27\34\14\23\27\34\23\27\33\23\27\34\22\30\34\23\30\33\23\27\34\22"
+  "\30\33\23\27\34\23\27\33\22\27\34\23\30\34\23\27\34\202\23\30\34\14\23"
+  "\30\33\23\30\34\23\27\33\22\27\34\23\27\34\24\30\34\22\30\34\26\33\37"
+  "\24\30\35\26\33\36\22\30\34\23\27\34\202\23\30\34\12\22\27\34\23\30\34"
+  "\22\30\34\23\30\34\23\27\34\23\30\34\23\27\33\23\30\34\23\27\34\23\27"
+  "\33\202\22\30\34\202\23\27\34\26\23\30\34\22\30\34\23\30\34\22\27\33"
+  "\23\27\33\23\30\33\23\27\34\22\27\33\23\27\34\23\27\33\23\30\34\22\27"
+  "\34\22\30\34\23\30\34\22\30\34\23\27\34\22\27\34\23\27\34\23\30\33\22"
+  "\30\34\22\27\34\23\27\34\202\22\30\34\3\22\30\33\23\27\34\23\30\34\202"
+  "\22\27\34\6\23\27\34\22\30\34\23\30\34\22\27\34\22\30\34\23\27\33\202"
+  "\23\30\34\202\23\27\34\11\23\30\34\23\30\33\22\27\34\23\27\34\22\27\34"
+  "\22\30\34\22\27\34\23\30\34\23\27\34\202\22\30\34\13\23\30\34\23\27\34"
+  "\22\30\33\23\27\34\22\30\34\23\30\34\23\27\33\22\27\34\22\30\33\23\30"
+  "\34\22\30\34\202\23\27\34\202\22\30\34\13\23\30\33\22\27\33\22\27\34"
+  "\23\27\34\22\27\33\23\27\33\22\27\33\23\30\34\22\30\34\23\30\34\23\30"
+  "\33\202\23\30\34\12\23\30\33\23\27\33\23\27\34\22\27\34\22\30\34\22\27"
+  "\34\23\30\34\23\27\34\23\30\34\22\27\34\204\23\30\34\10\22\30\34\23\27"
+  "\34\22\27\34\22\27\33\23\30\34\23\27\33\23\30\34\23\30\33\203\23\30\35"
+  "\3\22\31\34\23\30\35\23\30\34\203\23\30\35\2\23\30\34\23\31\35\206\23"
+  "\30\35\2\22\30\35\22\30\34\202\23\30\34\205\23\30\35\2\23\30\34\22\30"
+  "\35\202\23\30\35\6\23\31\34\22\30\35\23\30\34\22\30\35\23\31\34\22\30"
+  "\35\211\23\30\35\1\22\30\35\206\23\30\35\2\22\30\35\23\30\34\206\23\30"
+  "\35\3\22\30\34\23\30\35\23\31\34\203\23\30\34\4\23\30\35\22\30\35\23"
+  "\30\35\22\30\35\202\23\30\35\202\22\30\35\205\23\30\35\1\23\31\35\203"
+  "\23\30\35\1\23\30\34\202\23\30\35\3\23\30\34\23\30\35\23\30\34\206\23"
+  "\30\35\6\22\30\35\22\30\34\23\30\35\22\30\35\23\30\34\22\30\35\202\23"
+  "\30\35\1\23\30\34\205\23\30\35\1\23\30\34\203\23\30\35\2\23\30\34\23"
+  "\31\34\202\23\30\35\6\23\30\34\23\30\35\22\30\35\23\31\35\23\31\36\23"
+  "\30\35\202\22\30\35\203\23\30\35\7\23\31\34\23\30\35\22\30\35\23\31\35"
+  "\22\30\34\23\30\35\22\30\35\202\23\30\34\10\23\30\35\25\31\36#%\"\40"
+  "$'\36\"'\23\30\35\24\31\35\24\31\36\204\23\30\35\3\23\30\34\23\30\35"
+  "\22\31\35\202\22\30\35\2\23\30\34\23\31\35\202\23\30\35\1\23\31\34\204"
+  "\23\30\35\2\22\31\34\22\30\35\202\23\30\35\202\23\30\34\7\22\30\34\23"
+  "\30\35\22\30\35\23\30\35\22\30\35\23\30\35\22\30\34\202\23\30\35\2\23"
+  "\31\34\22\30\34\202\23\30\35\2\22\30\35\23\30\34\202\23\30\35\1\23\31"
+  "\35\203\23\30\35\3\22\30\35\23\31\35\22\30\35\202\23\30\35\1\23\31\35"
+  "\202\23\30\35\202\23\30\34\202\23\30\35\1\23\31\34\202\23\30\35\3\22"
+  "\30\35\23\30\35\23\31\35\202\23\30\35\1\22\31\35\204\23\30\35\3\23\30"
+  "\34\23\30\35\23\31\35\202\23\30\35\1\23\30\34\202\23\30\35\3\23\31\34"
+  "\23\30\35\23\30\34\204\23\30\35\3\22\30\35\23\31\35\23\30\34\202\23\30"
+  "\35\2\22\30\34\23\30\34\202\23\30\35\2\23\31\35\22\31\35\204\23\30\35"
+  "\1\22\30\35\204\23\30\35\2\22\30\35\22\30\34\202\23\30\35\2\22\31\35"
+  "\23\30\35\203\23\30\34\202\23\30\35\1\22\31\35\202\23\31\36\1\23\31\35"
+  "\203\23\31\36\2\23\31\35\23\30\36\203\23\31\36\2\23\30\36\23\31\36\203"
+  "\23\31\35\3\23\30\36\23\31\36\23\31\35\204\23\31\36\1\23\30\36\203\23"
+  "\31\36\4\23\30\36\24\31\36\23\31\36\23\31\35\203\23\30\36\207\23\31\36"
+  "\3\23\31\35\23\31\36\23\30\36\203\23\31\36\1\23\31\35\202\23\31\36\202"
+  "\23\30\36\2\23\31\35\23\30\36\204\23\31\36\202\23\31\35\205\23\31\36"
+  "\1\23\31\35\202\23\31\36\202\23\31\35\203\23\31\36\3\23\31\35\23\30\35"
+  "\23\31\35\203\23\31\36\1\23\31\35\202\23\31\36\1\24\31\36\203\23\31\36"
+  "\202\23\30\36\6\23\31\36\23\31\35\23\31\36\23\30\36\23\31\36\23\31\35"
+  "\202\23\31\36\202\23\30\36\205\23\31\36\1\23\31\35\210\23\31\36\203\23"
+  "\31\35\202\23\31\36\3\23\30\36\26\33\40\26\34!\202\24\32\37\24\23\30"
+  "\35\24\32\36\25\33\40\40%*049059/37-15*/2(-1&+.$(-\"&,\"',\40%*\37%)"
+  ":>@\202e\21iR\21%*-\202\23\31\36\1\24\31\37\203\23\31\36\202\23\30\36"
+  "\1\23\31\35\205\23\31\36\1\23\30\36\202\23\31\36\3\23\31\35\23\31\36"
+  "\23\31\35\202\23\31\36\202\23\31\35\202\23\31\36\2\23\31\35\23\30\36"
+  "\206\23\31\36\202\23\30\36\5\23\31\36\23\30\36\23\31\36\23\30\35\23\30"
+  "\36\204\23\31\36\202\23\31\35\1\23\30\36\205\23\31\36\1\23\30\36\202"
+  "\23\31\36\2\23\30\36\23\31\36\202\23\30\36\4\23\31\36\23\31\35\23\31"
+  "\36\24\31\36\204\23\31\36\3\23\31\35\23\31\36\23\31\35\203\23\31\36\2"
+  "\23\31\35\23\30\35\207\23\31\36\1\23\30\36\202\23\31\36\2\23\30\36\23"
+  "\31\35\203\23\31\36\202\23\30\35\1\23\31\35\203\23\31\36\202\23\31\35"
+  "\202\23\31\36\1\23\30\36\203\23\31\36\20\23\31\35\24\31\35\23\30\35\23"
+  "\31\35\23\30\36\23\31\36\23\31\35\23\31\36\23\31\35\23\31\36\23\30\36"
+  "\23\31\36\23\30\36\23\31\35\23\31\36\23\31\37\202\23\32\37\202\23\31"
+  "\37\6\23\32\37\24\31\36\23\31\37\24\32\37\23\31\37\23\32\37\202\23\31"
+  "\37\202\23\32\37\203\23\31\37\202\23\32\37\1\23\31\36\202\23\31\37\2"
+  "\24\31\37\23\31\36\202\23\31\37\6\23\32\37\23\31\36\23\32\36\23\31\37"
+  "\24\31\37\23\32\37\202\23\31\37\1\23\31\36\202\23\31\37\5\23\32\37\23"
+  "\31\37\23\31\36\23\31\37\23\32\37\203\23\31\37\4\24\31\36\23\32\37\23"
+  "\31\37\23\32\37\203\23\31\37\2\23\32\36\23\32\37\202\23\31\37\14\24\32"
+  "\37\23\32\37\23\31\37\23\32\37\23\32\36\23\31\36\23\31\37\23\32\37\23"
+  "\31\37\23\32\37\23\32\36\23\31\37\202\23\32\36\2\23\31\37\23\31\36\202"
+  "\24\31\37\1\23\32\36\202\23\31\37\1\23\32\37\202\23\31\37\14\24\31\37"
+  "\23\31\37\24\31\37\23\31\36\23\32\37\23\32\36\23\31\37\24\31\37\23\31"
+  "\37\24\31\37\23\31\36\24\31\36\204\23\32\37\6\23\31\37\24\32\36\23\31"
+  "\37\23\32\36\23\31\37\23\32\37\202\23\31\37\3\23\32\37\24\32\37\23\31"
+  "\37\202\24\31\36\1\23\32\37\202\23\31\37\36\23\32\36\23\31\37\23\32\37"
+  "\24\32\40\26\33!\24\32\37\23\31\37\25\33\37\27\35#\"'+/48:=\77JKKQQQ"
+  "VWW__`eeeccd`aa]^_TVWKLN>@A@AC@BCKA#\314\226\2\312\226\11(+)\23\32\36"
+  "\203\23\31\37\1\24\31\37\204\23\31\37\13\23\32\36\23\32\37\24\31\37\24"
+  "\32\37\24\31\37\23\31\37\23\32\37\23\32\36\23\32\37\24\31\37\23\31\36"
+  "\202\23\32\37\3\23\31\37\23\32\37\23\31\37\202\24\31\37\206\23\31\37"
+  "\1\23\32\37\202\23\31\37\14\24\32\37\23\31\36\23\31\37\23\32\37\24\32"
+  "\36\24\31\37\24\32\37\23\31\36\23\31\37\24\32\37\23\31\37\24\31\37\202"
+  "\23\31\37\2\24\31\37\23\32\37\202\23\31\37\6\23\32\37\24\31\37\23\31"
+  "\36\23\31\37\23\32\37\24\32\37\202\23\32\37\2\24\31\36\23\32\37\202\23"
+  "\31\37\2\23\32\37\23\31\37\202\23\31\36\2\23\31\37\24\32\37\202\23\31"
+  "\37\1\23\32\37\202\23\31\37\1\23\32\37\202\23\31\36\7\23\32\37\24\31"
+  "\37\23\31\37\23\32\37\23\31\36\23\32\37\24\32\37\207\23\31\37\2\23\32"
+  "\37\23\31\36\202\23\32\37\1\23\31\37\202\23\32\37\202\23\31\37\203\23"
+  "\31\36\11\23\31\37\24\32\37\23\31\37\23\32\37\23\31\37\24\32\37\23\32"
+  "\37\23\31\37\23\32\37\202\23\31\36\1\23\31\37\202\23\32\37\4\24\32\40"
+  "\23\32\40\23\32\37\24\32\40\202\23\32\40\1\23\32\37\202\24\32\40\203"
+  "\23\32\40\1\23\32\37\213\23\32\40\1\24\32\40\202\23\32\40\1\24\32\40"
+  "\202\23\32\40\202\24\32\40\202\23\32\40\4\24\32\40\24\32\37\23\32\40"
+  "\24\32\40\202\23\32\40\202\24\32\40\206\23\32\40\203\24\32\40\203\23"
+  "\32\40\1\24\32\40\202\23\32\40\5\24\32\40\23\32\40\24\32\40\23\32\40"
+  "\24\32\40\203\23\32\40\203\24\32\40\6\23\32\40\23\32\37\23\32\40\24\32"
+  "\40\23\32\40\23\32\37\204\23\32\40\1\24\32\40\207\23\32\40\6\24\32\40"
+  "\23\32\40\24\32\40\23\32\40\23\32\37\24\32\40\203\23\32\40\202\23\32"
+  "\37\203\24\32\40\204\23\32\40\1\24\33\40\204\23\32\40\1\23\32\37\205"
+  "\23\32\40\15\23\32\37\23\32\40\24\33!\26\34\"\34#)&+/&),78:abb\221\221"
+  "\221\275\275\275\344\344\344\357\357\357\205\361\361\361\16\357\357\357"
+  "\350\350\350\321\321\321\271\271\271SQI\201_\3\355\256\2\323\234\11/"
+  "+\36\40%*\25\34\"\24\32\40\23\32\40\23\32\37\202\23\32\40\203\24\32\40"
+  "\4\23\32\40\24\32\40\23\32\40\24\32\40\205\23\32\40\1\24\32\40\204\23"
+  "\32\40\1\24\32\40\202\23\32\40\1\24\32\37\202\23\32\40\202\23\32\37\205"
+  "\23\32\40\202\24\32\40\3\23\32\40\23\33\40\24\32\40\202\23\32\40\1\24"
+  "\32\40\202\23\32\40\10\24\32\40\23\32\40\24\32\40\23\32\37\24\32\37\23"
+  "\32\37\23\32\40\24\32\40\203\24\32\37\6\23\32\40\24\32\40\23\32\40\24"
+  "\32\40\23\32\37\23\32\40\204\24\32\40\1\23\32\40\202\24\32\40\202\23"
+  "\32\40\1\24\32\40\202\23\32\40\1\24\32\40\203\23\32\40\2\24\32\40\23"
+  "\32\37\203\23\32\40\1\24\32\40\204\23\32\40\2\24\32\37\23\32\40\202\24"
+  "\32\40\204\23\32\40\202\24\32\40\202\24\32\37\6\24\32\40\23\32\40\23"
+  "\32\37\24\32\40\23\32\37\24\32\40\207\23\32\40\2\23\32\37\24\32\40\203"
+  "\23\32\40\204\24\33!\2\24\33\40\23\33!\203\24\33!\3\24\32!\23\33!\24"
+  "\32!\202\24\33!\4\23\33!\24\32!\23\33!\23\32\40\202\24\33!\12\23\32!"
+  "\24\32!\23\33!\24\32!\24\33!\23\32!\23\33!\23\33\40\24\32!\24\33!\202"
+  "\23\33!\3\24\33!\23\33!\23\33\40\202\23\33!\2\23\32!\23\33\40\202\24"
+  "\33!\1\24\32\40\202\24\33!\2\23\33!\23\33\40\202\24\33!\1\24\33\40\204"
+  "\24\33!\3\23\32!\24\32!\23\32\40\202\23\33!\2\24\33!\24\33\40\203\23"
+  "\32!\1\24\33!\202\23\33!\3\24\33!\23\33\40\23\32!\204\23\33!\2\24\32"
+  "!\23\33!\203\24\33!\7\23\33!\23\33\40\24\32!\23\33!\24\32!\23\33!\24"
+  "\33!\202\24\32!\20\23\32!\24\33!\24\32!\23\33!\24\33!\24\32!\23\32!\24"
+  "\33!\24\33\40\24\33!\23\33\40\24\33!\23\33!\24\32!\24\33!\23\33!\203"
+  "\23\32!\3\23\33!\23\32!\24\33!\203\23\33!\20\24\32!\23\33!\23\32!\24"
+  "\32!\27\37$&,1,04035\\]]\225\225\225\312\312\312\357\357\357\362\362"
+  "\362\363\363\363\367\367\367\365\365\365\204\361\361\361\17\360\360\360"
+  "\356\356\356\346\346\346\335\335\335\276\276\276XJ$\273\211\0\361\261"
+  "\3\323\235\11""3+\26*,-(+0\35#)\27\36$\24\33\"\202\24\33!\5\23\32!\23"
+  "\33!\24\33!\23\33!\24\33!\202\23\33!\4\23\32!\24\33\40\24\33!\24\32!"
+  "\202\23\33!\1\23\33\40\202\24\33!\4\24\33\40\23\32!\24\33!\24\32!\202"
+  "\24\33!\6\23\33!\24\32!\24\33!\23\33!\24\32!\23\32!\202\23\33!\3\23\32"
+  "!\24\32!\24\33!\202\23\33!\1\24\33!\202\23\32!\2\24\32\40\24\33!\202"
+  "\24\32!\5\23\32!\24\32\40\24\33!\23\33!\24\33!\202\23\33!\2\24\33!\24"
+  "\33\40\203\24\33!\3\23\32!\24\33!\23\32!\202\24\32!\12\24\33!\23\33!"
+  "\23\32!\24\33!\23\33!\24\33!\23\33\40\24\33!\23\33!\24\33!\203\23\32"
+  "!\2\23\33!\24\33!\202\23\33!\1\24\33!\202\23\32!\3\24\33!\24\32!\23\33"
+  "!\203\24\33!\202\23\33!\4\24\33!\23\33!\23\32!\23\33!\203\24\33!\202"
+  "\24\32!\13\24\33!\24\33\40\24\33!\24\32!\24\33!\23\33!\24\32!\23\33!"
+  "\23\33\40\24\32!\23\33!\202\24\33!\4\23\33!\24\33!\23\33!\24\33\40\202"
+  "\24\33\"\16\24\34\"\23\33\"\24\34\"\23\34\"\24\33\"\23\33\"\23\34\"\24"
+  "\34\"\24\33#\24\33\"\24\34\"\23\33\"\23\34\"\24\33\"\203\23\33\"\2\24"
+  "\33\"\23\33\"\202\24\33\"\202\24\34\"\202\24\33\"\1\23\34\"\202\24\33"
+  "\"\202\24\34\"\2\24\33\"\23\33\"\204\24\33\"\1\24\34\"\203\24\33\"\5"
+  "\24\34\"\24\33\"\23\33\"\24\33\"\24\34\"\203\24\33\"\7\23\33\"\24\33"
+  "\"\23\33\"\24\33\"\23\34\"\24\33\"\24\34\"\204\24\33\"\205\24\34\"\205"
+  "\24\33\"\4\23\33\"\24\34\"\24\33\"\23\33\"\202\24\34\"\1\23\33#\202\24"
+  "\33\"\5\24\34\"\23\34\"\24\33\"\23\33\"\23\34\"\203\24\33\"\13\23\33"
+  "#\24\33\"\23\34\"\24\33\"\23\33\"\23\34\"\23\33\"\24\34\"\24\33\"\23"
+  "\34\"\24\33\"\202\24\34\"\2\23\33\"\24\34\"\207\24\33\"\1\24\34\"\202"
+  "\24\33\"\17\23\33\"\23\34\"\33!)-36678jjk\261\261\261\345\345\345\365"
+  "\365\365\367\367\367\370\370\370\372\372\372\370\370\370\372\372\372"
+  "\373\373\373\204\371\371\371\24\367\367\367\363\363\363\351\351\351\333"
+  "\333\333vtp\213h\7\327\236\0\367\266\4\325\236\11VM6\215\215\215fgg*"
+  "-0\37$*\27\36%\25\34#\23\33\"\24\33\"\23\34\"\23\33\"\202\24\33\"\1\24"
+  "\34\"\202\24\33\"\5\24\34\"\24\33\"\23\33\"\24\33\"\24\34\"\202\24\33"
+  "\"\1\23\33\"\203\24\33\"\1\23\33\"\203\24\33\"\1\24\34\"\202\24\33\""
+  "\1\24\34\"\203\24\33\"\1\24\34\"\206\24\33\"\1\24\34\"\206\24\33\"\6"
+  "\24\34\"\23\33\"\24\33\"\24\34\"\24\33\"\23\33\"\202\24\33\"\1\23\34"
+  "\"\202\24\33\"\1\23\34\"\202\24\33\"\11\23\34\"\23\33\"\24\34\"\23\34"
+  "\"\24\33\"\24\34\"\24\33\"\23\34\"\24\34\"\202\24\33\"\203\23\33\"\203"
+  "\24\33\"\1\23\33\"\204\24\33\"\202\24\34\"\2\23\34\"\24\34\"\202\23\33"
+  "\"\1\24\34\"\202\24\33\"\7\23\33\"\24\33\"\23\34\"\24\34\"\23\33\"\24"
+  "\34\"\23\33\"\202\24\33\"\4\24\34\"\24\33\"\23\33\"\24\33\"\202\24\34"
+  "\"\11\24\33\"\23\34\"\24\33\"\23\33\"\24\33\"\23\34\"\23\33\"\24\33\""
+  "\24\34\"\214\24\34#\3\24\35#\24\34#\24\34$\203\24\34#\1\24\34$\202\24"
+  "\34#\1\24\34$\204\24\34#\1\24\34$\216\24\34#\1\24\34$\203\24\34#\2\23"
+  "\34#\24\35#\202\24\34#\1\24\34$\204\24\34#\1\23\34$\203\24\34#\1\24\34"
+  "$\202\24\34#\1\23\34$\203\24\34#\2\24\34$\23\34#\203\24\34#\1\24\34$"
+  "\202\24\34#\1\24\34$\206\24\34#\1\24\34$\203\24\34#\3\24\35#\24\34#\24"
+  "\34$\207\24\34#\1\24\34$\204\24\34#\1\23\34#\202\24\34#\2\24\35$\24\34"
+  "$\211\24\34#\7\26\36%$*0$')NOO\262\262\262\355\355\355\374\374\374\202"
+  "\371\371\371\4\373\373\373\372\372\372\375\375\375\376\376\376\205\377"
+  "\377\377\22\376\376\376\373\373\373\366\366\366\355\355\355\303\303\303"
+  "aU5\260\201\1\345\247\0\372\270\4\331\242\11_S3\254\254\254\332\332\332"
+  "\250\250\251VWY\35\"&\31!'\25\35$\204\24\34#\1\24\35$\206\24\34#\1\23"
+  "\34#\211\24\34#\1\23\34#\204\24\34#\1\23\34#\206\24\34#\2\24\34$\24\34"
+  "#\202\23\34#\202\24\34#\1\24\34$\202\24\34#\2\24\34$\23\34#\203\24\34"
+  "#\1\24\34$\203\24\34#\1\24\34$\205\24\34#\4\23\34#\24\35#\24\34#\24\34"
+  "$\202\24\34#\3\24\35#\24\34#\24\34$\213\24\34#\3\24\34$\24\34#\24\34"
+  "$\206\24\34#\3\24\34$\23\34#\24\34$\203\24\34#\3\24\34$\24\34#\24\35"
+  "$\204\24\34#\202\24\34$\202\24\34#\1\24\34$\207\24\34#\7\23\34#\24\34"
+  "#\24\35$\24\34$\24\35$\24\34$\25\34%\202\24\35$\7\24\34$\24\35%\24\35"
+  "$\25\35$\24\35$\24\34$\25\35$\206\24\35$\3\24\35%\25\34$\24\35%\203\25"
+  "\35$\4\24\35$\24\34$\24\35$\25\35%\204\24\35$\1\24\34$\202\24\35$\4\25"
+  "\35%\25\35$\24\35$\25\35$\202\24\35$\6\24\34$\24\35$\25\35$\24\35$\25"
+  "\35$\24\34$\202\24\35%\202\24\35$\202\24\35%\207\24\35$\5\24\35%\24\34"
+  "$\24\35$\24\34$\24\34%\203\24\35$\6\24\35%\25\34%\24\35%\24\35$\24\34"
+  "$\25\34$\202\24\34$\3\24\35$\24\34$\25\35$\203\24\34$\1\24\35$\202\24"
+  "\35%\202\24\35$\203\24\34$\4\24\35%\24\35$\24\35%\24\34$\205\24\35$\5"
+  "\24\35%\24\35$\24\35%\24\34%\24\35$\202\24\34$\1\25\34%\202\24\34$\7"
+  "\24\35$\33#++05467\200\201\201\323\323\323\372\372\372\202\376\376\376"
+  "\202\374\374\374\202\376\376\376\207\377\377\377\33\375\375\375\371\371"
+  "\371\363\363\363\352\352\352\237\237\236jS\22\300\214\0\354\255\0\372"
+  "\270\4\340\247\12hW)\247\247\247\340\340\340\341\341\341\326\326\326"
+  "}~\177+/2\35$*\26\37&\24\34$\25\35$\24\35$\24\34$\24\34%\24\34$\24\35"
+  "$\24\35%\202\24\35$\1\24\34$\202\24\35$\11\24\34%\24\34$\25\34$\24\35"
+  "$\24\34%\24\34$\24\35$\25\35$\24\35$\202\24\34$\1\24\35%\202\24\35$\202"
+  "\24\34$\10\24\34%\24\35$\24\34$\25\35%\24\35%\24\34$\24\35$\24\34%\203"
+  "\24\35$\4\24\34$\24\34%\24\35$\25\34$\206\24\35$\1\24\34%\202\24\35$"
+  "\1\24\34%\202\24\35$\1\25\35$\202\24\35$\4\24\34$\24\35%\24\34%\24\34"
+  "$\202\24\35$\1\25\35$\202\24\35$\10\24\34%\24\35%\24\34%\24\35%\24\35"
+  "$\24\35%\24\34$\25\35%\202\24\35$\2\24\34$\24\35$\202\24\35%\203\24\35"
+  "$\1\24\34%\202\24\35$\3\24\35%\24\34$\24\34%\203\24\35$\4\25\35$\25\35"
+  "%\24\35$\25\35%\202\24\35$\1\24\34$\202\25\35$\2\24\35$\24\35%\202\24"
+  "\35$\2\25\35$\24\35%\202\24\34$\11\25\35$\24\35%\24\36%\24\35%\25\35"
+  "%\24\35%\24\35&\24\35%\25\35%\202\24\36%\202\25\35%\202\24\35%\202\24"
+  "\35&\5\24\36%\25\36&\24\36%\24\35%\24\35&\202\24\36&\25\24\36%\24\35"
+  "%\25\36&\24\36&\24\35%\24\35&\24\35%\24\36%\25\36%\24\35%\24\36%\24\35"
+  "%\24\36%\25\36%\24\35%\24\35&\24\36%\24\35%\24\36&\24\35%\24\36%\202"
+  "\24\36&\7\24\36%\24\36&\25\36%\25\35%\24\36&\24\35%\25\36%\202\24\35"
+  "%\11\25\36&\24\35&\25\36%\24\36%\24\35&\24\36%\24\35%\24\36%\24\35&\203"
+  "\24\36&\7\24\36%\25\35&\24\35%\25\36%\25\35&\24\35%\24\36%\202\24\35"
+  "%\7\25\36&\24\36%\24\35%\24\35&\24\36%\24\35%\24\35&\202\24\35%\5\24"
+  "\36&\25\36%\25\35%\24\35&\24\35%\202\24\36&\3\24\35%\24\36%\25\35%\202"
+  "\24\35%\7\25\36%\24\35%\25\36&\24\35%\24\36&\25\35&\25\35%\202\24\36"
+  "&\204\24\35%\10\26\36&\"*1&+/ABC\254\254\254\360\360\360\377\377\377"
+  "\376\376\376\214\377\377\377!\376\376\376\374\374\374\367\367\367\357"
+  "\357\357\337\337\337|xl}\\\3\315\226\0\362\261\0\372\270\4\341\247\11"
+  "jX)\253\253\253\350\350\350\353\353\353\347\347\347\332\332\332\242\243"
+  "\243CEG\34#)\26\37(\24\35%\24\36&\25\35%\24\35%\24\36&\24\36%\24\35&"
+  "\25\35&\25\36&\24\36&\24\35%\24\36&\202\24\35%\2\24\36%\24\36&\202\25"
+  "\35&\202\24\35%\2\25\35%\24\35%\202\24\36&\2\24\36%\24\35&\202\24\35"
+  "%\11\24\36%\24\35&\24\35%\24\36%\24\35%\24\36&\24\35%\24\36&\25\35%\202"
+  "\24\35%\1\24\36&\202\24\36%\16\24\35%\24\35&\24\36&\24\36%\25\35%\24"
+  "\35&\24\35%\25\35&\24\35%\24\35&\24\36%\25\35%\24\35%\25\35%\203\24\35"
+  "%\12\24\35&\25\36%\24\35%\24\36%\25\35%\25\35&\25\36%\24\35%\25\35%\24"
+  "\35%\202\24\35&\202\24\36&\3\25\36&\24\35&\25\35%\202\25\35&\2\25\36"
+  "&\25\35%\202\24\35%\1\24\35&\204\24\35%\2\25\35&\24\35&\202\25\35&\5"
+  "\24\35%\25\36%\25\35%\25\36&\24\36&\202\24\36%\203\25\35%\3\25\36%\24"
+  "\35%\25\36%\203\24\36&\202\24\35%\11\24\35&\24\36%\25\37&\24\36'\25\36"
+  "&\25\36'\24\36&\25\36&\25\36'\202\25\36&\202\24\36&\6\25\36'\24\36'\25"
+  "\37&\25\36&\25\36'\24\37'\203\25\36&\3\24\36'\25\36&\24\36&\202\24\36"
+  "'\1\25\36'\202\25\36&\2\24\36&\25\36&\203\25\36'\202\25\36&\5\24\36'"
+  "\24\36&\25\36&\25\36'\25\36&\202\24\36&\202\25\36'\5\25\36&\25\36'\25"
+  "\37&\25\36&\24\36'\202\25\36&\3\24\36'\25\37&\25\36&\202\25\36'\3\24"
+  "\36'\25\36'\24\36'\202\25\36&\16\24\36'\25\36&\24\36'\25\36&\25\36'\25"
+  "\36&\24\36&\25\36&\25\36'\24\36'\25\37'\24\37'\25\36&\25\36'\203\25\36"
+  "&\7\24\36&\25\36&\24\36&\24\36'\25\36&\24\36'\24\36&\202\24\36'\203\25"
+  "\36&\2\24\37&\25\36&\202\24\36&\2\25\36&\24\37'\202\25\36'\21\24\37&"
+  "\25\36&\24\36'\25\36&\25\36'\24\36'\24\36&\25\37'\24\36&\25\36'\25\36"
+  "&\26\37'-4;269QQR\277\277\277\366\366\366\217\377\377\377\31\376\376"
+  "\376\373\373\373\364\364\364\354\354\354\326\326\326d\\D\217i\2\325\233"
+  "\0\364\263\0\372\270\4\351\256\11s\\\36\244\244\244\360\360\360\366\366"
+  "\366\356\356\356\344\344\344\335\335\335\271\272\272OQR\40&+\30!)\24"
+  "\36'\24\36&\25\37'\202\25\36'\1\25\37'\202\25\36'\2\24\36'\25\36'\202"
+  "\24\36&\202\25\36'\6\25\36&\24\36'\24\36&\24\36'\25\36'\24\36'\202\25"
+  "\36'\4\24\36'\25\36&\24\36'\25\36'\202\25\36&\1\24\36&\203\25\36'\3\24"
+  "\37'\24\36&\25\36&\202\24\36&\1\24\36'\203\25\36'\203\25\36&\202\24\36"
+  "'\23\25\36&\25\36'\25\36&\24\37'\25\36'\25\36&\24\36'\25\36&\24\36'\25"
+  "\36&\25\36'\24\37&\25\36&\24\36&\25\36&\25\36'\25\37&\25\36'\24\36'\202"
+  "\25\36&\202\25\36'\13\24\36&\25\36'\24\36'\25\36&\24\37&\25\37'\25\36"
+  "&\24\36&\25\36&\24\36&\24\36'\202\25\36&\202\25\36'\3\25\36&\25\37&\24"
+  "\36'\202\24\36&\5\24\36'\24\36&\25\36'\24\36&\25\36'\205\24\36&\203\25"
+  "\36'\1\25\36&\202\25\36'\3\24\36'\24\37&\25\36'\202\24\36'\2\24\37(\25"
+  "\37'\204\25\37(\4\25\37'\24\37'\25\37(\24\37'\202\25\37(\14\24\37(\25"
+  "\36(\25\37(\24\37(\25\37(\24\37(\25\37(\24\37(\25\37(\24\37(\25\37'\25"
+  "\37(\202\25\36(\203\25\37(\3\24\37'\25\37(\25\37'\202\25\37(\1\24\36"
+  "(\202\25\37(\1\25\36(\202\25\37'\2\25\37(\25\36(\202\25\37'\204\25\37"
+  "(\3\25\37'\25\37(\25\37'\203\25\37(\2\24\37(\25\37'\203\25\37(\1\25\36"
+  "(\203\25\37(\202\24\37(\202\25\37(\4\24\37(\25\37(\25\37'\24\37'\202"
+  "\24\37(\14\25\36'\25\37(\25\36(\24\37(\25\36(\25\37(\25\36(\25\37(\25"
+  "\36'\25\37(\25\36'\24\36(\202\25\37(\3\25\36(\25\37(\25\36'\202\25\37"
+  "(\2\24\37'\24\37(\206\25\37(\3\24\37(\25\36'\25\37(\202\24\37(\2\25\37"
+  "(\25\37'\202\25\37(\5""2:A7:<\\\\\\\324\324\324\375\375\375\220\377\377"
+  "\377\30\376\376\376\372\372\372\362\362\362\351\351\351\322\322\322b"
+  "X\77\230p\2\335\242\0\367\265\0\372\270\3\355\261\11w^\30\235\235\235"
+  "\363\363\363\376\376\376\371\371\371\355\355\355\345\345\345\341\341"
+  "\341\303\303\303_`a\35$*\26\37)\24\37'\203\25\37(\2\24\37'\24\36(\202"
+  "\25\37(\202\25\37'\4\24\37(\25\37'\24\37(\25\37'\202\25\37(\3\24\37'"
+  "\25\37'\25\37(\203\25\37'\2\25\36(\24\37(\202\25\37'\2\24\36(\25\37'"
+  "\202\24\37(\2\25\37'\24\37(\210\25\37(\203\25\37'\203\24\37(\4\25\36"
+  "(\25\37(\24\37(\25\36'\202\25\37(\6\24\37(\25\36(\24\37(\25\37'\25\37"
+  "(\25\37'\202\25\37(\13\25\37'\25\37(\24\36(\25\37'\24\37'\25\37(\25\36"
+  "'\25\37(\25\37'\25\37(\25\37'\202\25\37(\3\24\36(\25\37(\24\37(\202\25"
+  "\37(\203\25\37'\2\25\37(\24\37(\203\25\37(\202\24\37(\6\24\36(\25\37"
+  "'\25\37(\24\37'\25\37(\24\36(\202\25\37(\202\25\36(\5\25\37'\25\37(\25"
+  "\37'\24\37'\25\36(\202\25\37(\5\24\37(\25\36(\25\37'\25\37(\25\40(\204"
+  "\25\37)\2\25\40)\25\37)\203\25\40)\1\25\37)\203\25\40)\14\25\37)\25\40"
+  ")\25\37)\25\40)\24\36(\24\36'\24\35&\23\35&\23\34%\23\35%\22\34$\23\34"
+  "$\202\23\35%\6\23\35&\24\36'\24\36(\25\37(\25\37)\25\40)\202\25\37)\5"
+  "\25\40)\24\37(\25\37)\25\40)\25\37)\204\25\40)\1\25\37)\202\25\40)\5"
+  "\24\37(\24\36'\23\36&\23\34%\23\35%\202\23\34%\3\23\34&\24\36'\24\37"
+  "(\202\25\40)\202\25\37)\1\25\40)\202\25\37)\2\25\40)\25\37)\203\25\40"
+  ")\202\25\37)\1\25\40)\202\25\37)\1\25\40(\203\25\37)\2\25\40)\25\40("
+  "\202\25\37)\3\24\37'\24\36'\23\35&\204\23\35%\6\23\36&\23\35'\25\36("
+  "\24\40)\25\40)\24\40)\203\25\40)\1\25\37)\205\25\40)\1\25\37)\202\25"
+  "\40)\5#.6>@BJJJ\300\300\300\376\376\376\221\377\377\377\30\375\375\375"
+  "\370\370\370\360\360\360\347\347\347\277\277\276SA\21\270\206\0\345\250"
+  "\0\363\262\0\370\266\1\357\261\10z_\25\235\235\235\363\363\363\377\377"
+  "\377\376\376\376\370\370\370\357\357\357\346\346\346\335\335\335\272"
+  "\272\272MOQ\34%,\26!*\202\25\37)\2\25\40)\25\40(\203\25\40)\1\25\37)"
+  "\203\25\40)\203\25\37)\1\25\37(\202\25\40)\203\25\37)\11\25\36'\24\37"
+  "'\23\34&\23\33$\22\33#\21\32\"\21\32!\21\31\40\20\30\40\202\20\31\40"
+  "\10\21\31\40\21\31!\21\32\"\21\33#\23\35%\24\36&\24\36(\25\40)\202\25"
+  "\37)\203\25\40)\2\25\37)\25\40)\204\25\37)\202\25\40)\2\25\37)\24\37"
+  ")\202\25\37)\15\25\40)\25\36(\24\36'\23\35&\23\33$\22\33#\21\32\"\21"
+  "\31!\21\30\40\20\31\40\20\31\37\20\30\40\21\31\40\202\21\31!\5\21\33"
+  "\"\22\34#\23\34%\24\35&\24\36'\202\25\37)\14\25\40)\25\37)\25\40)\25"
+  "\37)\25\40(\25\37)\25\40)\25\37)\24\37(\24\36&\23\35%\22\34$\202\22\33"
+  "#\6\22\33$\23\34%\24\36'\24\36(\25\40)\25\37)\205\25\40)\202\25\37)\203"
+  "\25\40)\10\25!*\25\40*\26\40*\25\40*\25!*\26\40*\25!*\25\40*\202\25!"
+  "*\202\25\40*\1\25\40+\202\26\40*\202\25\40*\20\24\37)\23\35'\22\34$\21"
+  "\32\"\20\30\37\17\26\35\17\26\34\16\25\34\16\26\34\17\26\35\17\27\36"
+  "\20\31\40\21\32#\23\34%\24\36'\25\40)\202\25\40*\1\25!*\203\25\40*\33"
+  "\25!*\25\40*\26\40*\26\40+\25!*\25\40*\25!*\25\40*\24\40*\24\35'\22\34"
+  "$\20\31!\17\27\36\17\26\35\16\26\35\17\27\36\21\31\40\21\33#\23\36&\25"
+  "\37(\25\40*\26\40*\25\40*\26!+\25\40*\26\40+\25!+\202\25\40*\1\25!*\202"
+  "\25\40*\3\26\40*\25\40*\25!*\202\25\40+\3\25\40*\25\40+\25\40*\202\25"
+  "!*\6\25\40*\25\37)\24\36'\21\33#\20\30\40\17\26\35\202\16\26\34\15\17"
+  "\27\36\21\31\40\22\33$\24\36'\25\37)\25\40*\25!*\25!+\25\40*\25\40+\25"
+  "\40*\26!*\25!*\202\25\40*\1\25\40+\202\25\40*\7\32$.FJN777\242\242\242"
+  "\360\360\360\375\375\375\376\376\376\220\377\377\377\16\374\374\374\366"
+  "\366\366\356\356\356\345\345\345\211\207\200xY\3\323\232\0\353\254\0"
+  "\316\227\1\314\226\1\351\254\4x_\32\244\244\244\364\364\364\202\377\377"
+  "\377\10\376\376\376\372\372\372\357\357\357\344\344\344\334\334\334\261"
+  "\261\261/36\32$-\202\25\40*\4\25!*\25\40*\26\40+\25\40+\202\25\40*\2"
+  "\26\40*\25!*\202\25\40*(\25!*\25\40*\25\40+\26\40*\25!+\25\40*\26!*\25"
+  "\40*\24\37(\22\34%\20\31!\16\25\33\13\21\27\12\17\24\10\15\21\10\14\20"
+  "\7\13\17\7\13\16\7\13\17\10\14\20\11\16\22\12\20\24\14\23\31\17\26\35"
+  "\21\32#\23\36&\25\40)\25!*\25\40*\25!*\25\40+\25!*\26!*\26\40*\26\40"
+  "+\25!*\25\40*\26\40+\25!*\25\40*\202\25\40+\27\25\40*\25!*\25\40)\24"
+  "\36(\22\34$\20\30\40\16\25\33\14\22\27\12\17\24\11\15\21\10\14\17\10"
+  "\13\16\7\13\16\7\13\17\10\14\17\11\15\20\11\16\22\12\20\25\14\23\31\16"
+  "\26\35\21\32\"\23\34%\24\37(\202\26\40*\1\25!+\202\25!*\203\25\40*\14"
+  "\25\37)\23\35&\21\32#\16\26\35\15\24\32\14\23\30\15\23\30\15\24\32\17"
+  "\26\35\21\32#\23\35'\25\37)\202\25\40*\1\26\40+\202\25\40*\35\25!*\25"
+  "\40+\25!*\25\40*\26!*\25\40*\25!,\26!,\25!,\25!+\26!,\25!,\26!+\25!,"
+  "\26!+\26!,\26!+\25!,\25!+\25\40*\25\40)\23\36(\22\34%\20\31!39>06:`c"
+  "f^ac\203\204\206\203\202\204\205\17z|}{|~VZ]Y\\`&,2'/5\21\32#\22\35&"
+  "\24\37)\24\40)\26!*\26!,\25!+\26!,\26!+\202\25!,\26\25!+\26!*\25\40*"
+  "\24\36'\22\33$.6<,27tvysuv\202\204\205\203\204\205vxzwz|8=A:@E\21\32"
+  "\"\22\35%\25\37)\25\40*\25\",\25!+\25!,\203\25!+\3\25!,\26!+\26!,\202"
+  "\25!+\16\26!+\25!+\26!,\25!,\25!+\25!,\25\",\26!,\26!+\25\37)\22\33$"
+  "fkndgi\203\205\206\202\202\204\205\10\203\205\207X\\_[_c\22\34$\24\37"
+  "*\26\",\25!+\26!+\204\25!+\15\26!+\25!,\26!+\25!,\25!+\27\"-/7<81\32"
+  "KJG\240\240\240\320\320\320\360\360\360\373\373\373\217\377\377\377\17"
+  "\376\376\376\373\373\373\365\365\365\354\354\354\336\336\336PI6\266\206"
+  "\1\356\256\1\256\201\6\\J\30sU\3\264\204\1r_,\270\270\270\366\366\366"
+  "\203\377\377\377\10\376\376\376\372\372\372\356\356\356\343\343\343\330"
+  "\330\330\217\220\221#*0\27#.\202\25!+\1\26!+\202\25!,\5\25!+\26!+\25"
+  "!,\25!+\26!,\202\25!,\2\26!,\25!+\202\25!,\7\25\40*\24\37(\22\35&\40"
+  "'/\34$({}\200yz{\202\310\310\310\202\362\362\362\204\377\377\377\202"
+  "\325\325\325\10\232\232\233\233\235\235+04.5:\21\33#\24\36(\24\40)\26"
+  "!*\202\26!+\202\25!+\20\26!+\26\"+\25!,\26!+\25\"+\25!+\26!+\25!,\25"
+  "\40+\25\40(\22\34%\20\31!INSGJM\247\250\251\246\246\247\202\331\331\331"
+  "\204\377\377\377\202\367\367\367\202\320\320\320\12\232\232\233\233\234"
+  "\2357;\77:\77E\20\32\"\22\35&\25\37*\25\40+\26!+\26!,\202\25!+\6\25\40"
+  "*\24\37(4<B16;\245\246\247\243\244\245\202\243\243\243\11\243\244\245"
+  "\245\246\247\"(.&/7\24\40*\26!*\26!+\25!,\26!,\203\25!+\202\26!,\14\25"
+  "!,\25!+\26\",\26!-\26\"-\25!,\25\",\26\"-\25\",\26!,\26\"-\25\"-\202"
+  "\26\"-\10\26!,\25\40+\23\35'\21\32\"\15\24\32\11\17\23*.0)*+\202YYY\204"
+  "\200\200\200\202www\26QQQRRR\36\37!\40#'\13\20\26\16\25\34\21\33#\24"
+  "\36(\25!,\26\",\26\"-\25\"-\26\"-\25\",\26\",\26\"-\24!+\23\37(\20\31"
+  "!\15\23\32(,.%&'\202ooo\202\200\200\200\202sss\7""112368\12\20\25\16"
+  "\26\35\22\34&\25\40*\26!,\202\26\"-\4\25\"-\26\",\26\"-\26\",\202\25"
+  "\"-\1\25\",\203\26\"-\1\26!-\202\26\"-\10\26!-\26\",\26\"-\26!,\23\37"
+  ")\17\27\40cfi_`a\204\200\200\200\14SUVX[^\20\31!\23\37)\26\"-\25!,\25"
+  "\"-\26\"-\26\",\25\",\26\"-\26!,\202\26!-\13\25\"-\25!-\36(4<:-\242w"
+  "\1\210g\14aO\37TM;\316\316\316\360\360\360\375\375\375\216\377\377\377"
+  "\17\375\375\375\371\371\371\363\363\363\352\352\352\261\261\261bN\30"
+  "\331\237\1\370\267\4RD\35\231\230\227XK)]E\2|c\37\272\272\272\370\370"
+  "\370\204\377\377\377\12\376\376\376\370\370\370\354\354\354\337\337\337"
+  "\306\306\306ace\33$-\27#-\26\"-\25!-\202\25\"-\202\26\",\203\25\"-\13"
+  "\25!,\26\"-\26\",\26\"-\25\"-\25!,\24\40)\21\33$\16\26\35\31\36#\25\30"
+  "\31\202www\202\310\310\310\202\362\362\362\204\377\377\377\202\325\325"
+  "\325\202\231\231\231\20$%%(+-\14\23\31\17\31\40\23\36(\25!+\26\"-\26"
+  "\",\26\"-\26!-\25\"-\26\"-\25\"-\26\"-\25!-\25\",\202\26!,\6\24\37*\22"
+  "\33$\16\25\34\11\16\23CDE@@@\202\246\246\246\202\331\331\331\204\377"
+  "\377\377\202\367\367\367\202\320\320\320\202\231\231\231\10""000368\13"
+  "\20\26\17\27\35\22\35%\25\37)\25\"+\26\",\202\25\",\4\25!+\23\36'3:@"
+  "-04\206\242\242\242\6\36#&$,3\25\40*\26!,\25!-\26\"-\202\25\"-\2\26\""
+  "-\25\"-\202\26\"-\202\25\"-\2\26\".\26#.\202\26\".\202\26#.\14\26\"."
+  "\26#.\26\".\25\".\25#-\25\"-\24\37)\22\34%AGM=AE\300\300\300\277\277"
+  "\277\216\377\377\377\7\256\256\256\257\260\261/482:@\21\34&\24\40*\25"
+  "\"-\202\26\".\202\26#.\5\26\"-\23\37)\20\31\"twzqrr\202\373\373\373\210"
+  "\377\377\377\6\246\246\246\250\251\252\30\37&\35&.\24\37)\25\",\202\26"
+  "#.\1\26\".\203\26#.\6\26\".\26#.\26#-\25\".\26\"-\25#-\202\26\".\1\25"
+  "\".\203\26#.\4\23\36(\15\24\33\274\274\275\273\273\273\204\377\377\377"
+  "\4\242\242\242\244\245\246\16\27\36\23\36)\203\26#.\3\26\".\26#.\25\""
+  ".\202\26\".\1\26#.\202\26\".\15\27$/)18bK\11\337\243\0\372\270\5\372"
+  "\272\12\237x\14eee\310\310\310\353\353\353\370\370\370\374\374\374\376"
+  "\376\376\213\377\377\377\17\374\374\374\367\367\367\360\360\360\340\340"
+  "\340nkc\240w\4\360\260\2\257\201\5olb\347\347\347\342\342\342\256\254"
+  "\250\211\205x\347\347\347\374\374\374\205\377\377\377\15\376\376\376"
+  "\365\365\365\345\345\345\332\332\332\257\257\26039=\33'1\27$/\26\"-\26"
+  "\".\25#.\26#-\26\".\202\26#.\12\25\".\26\".\25\".\26\".\26\"-\25\"-\23"
+  "\36)\20\32#Y]aTVY\202\352\352\352\216\377\377\377\202\373\373\373\12"
+  "\201\201\202\204\206\210\16\27\37\22\35&\25!,\26\"-\26#-\25\".\26#.\26"
+  "\".\202\25#.\10\26#-\25#.\24\",\23\37*\37)1\33\"'\264\265\266\263\263"
+  "\263\222\377\377\377\14\235\235\235\237\240\242\30\40&\35'/\24\40*\24"
+  "\"-\26#.\26\".\25!,\23\36(FKQ\77AB\206\377\377\377\5'+-/6=\24\37*\25"
+  "!,\25#.\203\26\".\204\26#.\202\26\".\1\26#/\202\26$/\2\26#/\26$/\202"
+  "\26#/\1\26$0\202\26#/\6\25\".\25!+\20\32#\14\23\32;>@777\202\277\277"
+  "\277\216\377\377\377\202\256\256\256\16'(),/3\16\25\35\21\34%\24!,\25"
+  "\".\26$/\26#/\26#.\25\",\21\33%\13\22\30pqrooo\202\373\373\373\210\377"
+  "\377\377\202\246\246\246\4\22\25\27\30\37%\22\34&\24!,\206\26#/\1\26"
+  "$/\203\26#/\3\26$/\26#0\26$/\202\26#/\1\26$/\202\26#/\2\23\36(\14\23"
+  "\31\202\273\273\273\204\377\377\377\7\242\242\242\243\243\244\15\25\34"
+  "\23\36*\26#/\26#0\26$/\202\26#/\203\26$/\202\26#0\20\26$/\35(3NC\31\274"
+  "\211\0\350\252\0\372\270\2\372\271\7\350\255\13_L\31][V\244\244\244\332"
+  "\332\332\352\352\352\366\366\366\374\374\374\376\376\376\210\377\377"
+  "\377\16\376\376\376\372\372\372\365\365\365\356\356\356\273\273\273g"
+  "W)\307\222\1\366\265\4oS\7\245\244\242\354\354\354\372\372\372\371\371"
+  "\371\363\363\363\202\376\376\376\206\377\377\377\10\373\373\373\354\354"
+  "\354\335\335\335\317\317\317{}\177\37(1\31&2\26$/\205\26#/\202\26$/\203"
+  "\26#/\6\25\".\24!+\20\32\"\13\22\30SUVQQQ\202\352\352\352\216\377\377"
+  "\377\202\373\373\373\202\200\200\200\4\10\15\22\16\27\36\23\36)\25!."
+  "\202\26#/\1\26$0\202\26#/\7\26$/\26#/\26#.\24!+\21\33$\32!&\25\27\30"
+  "\202\263\263\263\222\377\377\377\202\235\235\235\12\22\25\27\30\37%\22"
+  "\34&\24\40+\26$/\26#/\25\"-\23\37(EKO>\77@\206\377\377\377\10&),.6<\24"
+  "\40+\25#.\26#0\26#/\26$/\26#0\202\26$/\3\26#/\26$/\26#/\204\26$0\3\26"
+  "%1\26$0\26%0\202\27$0\6\26$0\25#0\23\40+\20\32#psvlmn\202\373\373\373"
+  "\222\377\377\377\202\367\367\367\4XZ\\]bf\22\36&\24\"-\202\27$0\4\25"
+  "\".\23\36*QW[KMO\216\377\377\377\7\304\304\304\305\306\307\22\33#\27"
+  "#-\27%0\27$0\26$0\202\27$1\1\26%0\202\26$0\7\27%0\27%1\27$0\26$0\26$"
+  "1\27%0\26$1\202\26%0\3\26$0\23\37)\13\22\31\202\273\273\273\204\377\377"
+  "\377\13\242\242\242\243\243\243\15\25\34\23\37)\26$0\26$1\26$0\26$1\26"
+  "%0\27$1\27$0\203\26$0\23\26%1%-3\200_\5\270\207\0\340\245\0\370\266\0"
+  "\372\270\3\371\270\6\353\256\11\300\220\14oW\26^WD\226\225\220\304\304"
+  "\304\342\342\342\360\360\360\371\371\371\375\375\375\376\376\376\205"
+  "\377\377\377\15\375\375\375\370\370\370\362\362\362\350\350\350\213\210"
+  "\200~^\6\346\251\1\323\234\6eW1\314\314\314\364\364\364\374\374\374\376"
+  "\376\376\211\377\377\377\11\376\376\376\366\366\366\344\344\344\330\330"
+  "\330\265\265\2658=A\35)4\26$1\26$0\202\27$1\5\26$0\26%1\26$0\27$1\27"
+  "$0\202\26%0\4\24\40,\21\33%pswklm\226\377\377\377\5\231\231\231\233\235"
+  "\237\16\30\37\23\37)\25#/\204\26$0\5\27$1\25#0\25!-!*2\32!&\202\331\331"
+  "\331\226\377\377\377\12\304\304\304\305\306\307\27\37'\33(2\26#/\26$"
+  "1\25#/\23\36)DKO>>\77\206\377\377\377\6%(*.6=\24!,\25#/\27$0\26%0\203"
+  "\26$0\6\26$1\26$0\26$1\26$0\26$1\26%2\205\27%2\10\26%2\27%2\26$1\25\""
+  ".\21\34&\13\23\31klmjjj\202\373\373\373\222\377\377\377\202\367\367\367"
+  "\12UUUX[]\16\30\40\23\40+\27$1\26%2\24\"-\21\34%NRVHHH\216\377\377\377"
+  "\202\304\304\304\4\20\30\37\26#-\27%1\26%2\202\27%2\2\26%2\27%2\202\26"
+  "%2\202\27%2\202\26%2\1\27%2\202\26%2\1\27%2\202\26%2\2\22\37*\13\23\32"
+  "\202\273\273\273\204\377\377\377\4\242\242\242\243\243\243\15\25\35\24"
+  "\40+\202\26&2\3\26%2\27%2\26%2\202\27%2\30\26%1\27%2\26%2\32)5<AFB5\23"
+  "\222k\1\310\223\0\360\260\0\371\266\0\372\267\2\372\270\4\372\271\7\367"
+  "\270\10\313\227\10\204d\14i[4vur\251\251\251\331\331\331\352\352\352"
+  "\364\364\364\373\373\373\376\376\376\202\377\377\377\15\376\376\376\373"
+  "\373\373\366\366\366\357\357\357\335\335\335[P2\257\200\1\363\262\3\240"
+  "w\11~|u\345\345\345\373\373\373\376\376\376\213\377\377\377\10\374\374"
+  "\374\355\355\355\334\334\334\313\313\313uvw#,4\27&3\26%2\202\26%1\13"
+  "\26%2\26%1\27%1\26%2\27&1\26%0\25$0\23\36(\15\25\34lmnjjj\226\377\377"
+  "\377\202\231\231\231\5\11\17\24\17\32#\24\"-\26$0\27%2\203\26%2\4\26"
+  "#/\23\36(\34#)\25\27\30\202\331\331\331\226\377\377\377\202\304\304\304"
+  "\10\25\33\"\32&1\27$1\27%2\25#/\23\37*EKP>>\77\206\377\377\377\5%(*."
+  "6>\24\"-\26$/\27%2\202\26%2\203\27%2\2\26%2\27%2\202\26%2\2\27&3\26&"
+  "3\205\27&3\5\26&3\25#/\22\36)\\afWYZ\210\377\377\377\202\341\341\341"
+  "\202\235\235\235\202\204\204\204\202\263\263\263\202\367\367\367\206"
+  "\377\377\377\202\362\362\362\10\34$)#.8\25%2\26&3\24!-\17\30!\253\254"
+  "\255\252\252\252\204\377\377\377\202\373\373\373\202\231\231\231\202"
+  "\346\346\346\204\377\377\377\10\256\256\256\257\257\257\26\34$\33(2\27"
+  "%3\26&3\27&3\27&4\202\27&3\10\26%3\27&3\27&4\27%2\26%1\26$0\24#/\24\""
+  "0\202\25#/\4\26$0\25%1\22\37*\13\23\32\202\273\273\273\204\377\377\377"
+  "\4\242\242\242\243\243\243\15\26\35\23!,\202\27&3\3\27%3\26&4\27&4\205"
+  "\27&3\26&3\77ACDKIDXC\10\250{\0\341\245\0\366\264\0\372\267\0\372\267"
+  "\1\372\270\2\372\270\4\372\271\6\365\266\10\330\241\10\243{\15jV\40U"
+  "QI\227\227\225\312\312\311\342\342\342\360\360\360\370\370\370\202\375"
+  "\375\375\13\372\372\372\363\363\363\354\354\354\274\273\272WB\12\324"
+  "\233\0\355\257\5{b\36\245\245\245\363\363\363\376\376\376\214\377\377"
+  "\377\7\376\376\376\365\365\365\342\342\342\324\324\324\236\237\23749"
+  "=\36,9\203\26&3\203\27&3\6\26%4\27&3\26$1\24!.:CI47:\202\373\373\373"
+  "\206\377\377\377\202\225\225\225\6""333579\11\15\20\11\15\21),.'()\202"
+  "\200\200\200\202\362\362\362\206\377\377\377\4kklpsw\22\35'\24#0\202"
+  "\27&3\6\27&4\27%2\24\40,\16\30\40\264\265\266\263\263\263\206\377\377"
+  "\377\202\341\341\341\202fff\6!\"#\"%(\12\16\21\11\15\20""579333\202\221"
+  "\221\221\202\373\373\373\204\377\377\377\12\256\256\256\256\257\257\25"
+  "\34$\32'2\27%2\26&4\26$0\23\40+EKP>>\77\206\377\377\377\4%(+.7\77\24"
+  "#/\25%1\202\26&3\202\27&3\1\27%3\202\26&3\10\27&3\27%3\26&3\27'4\27'"
+  "5\27'4\27&5\26'5\202\27'4\5\27&4\24!-\17\31\"X[]UUU\210\377\377\377\202"
+  "\341\341\341\6\235\235\235\236\237\237\205\207\210\205\206\207\263\263"
+  "\264\263\263\263\202\367\367\367\206\377\377\377\202\362\362\362\10\34"
+  "%+#/8\26&3\27&5\24!.\15\26\36\252\253\253\252\252\252\204\377\377\377"
+  "\202\373\373\373\202\231\231\231\202\346\346\346\204\377\377\377\11\256"
+  "\256\256\260\261\262\27\"*\34)4\26&4\27&4\27'4\27&4\27&5\202\27'4\15"
+  "\27'5\26&3\26%1\24\".\23\40+\22\36*\22\36(\22\36)\22\37)\23!,\25\".\22"
+  "\36)\13\23\32\202\273\273\273\204\377\377\377\11\242\242\242\243\243"
+  "\243\15\26\36\24!.\27'4\27&4\27'4\27'5\27'4\202\27&5\12\27'5\27&5\27"
+  "'47AJUTU\253\253\253d`U\204c\7\330\236\0\363\262\0\203\372\267\0\202"
+  "\372\267\1\26\372\270\3\372\271\5\372\271\7\372\272\11\332\242\11\201"
+  "`\6^N!xsg\241\241\241\314\314\314\347\347\347\357\357\357\362\362\362"
+  "\357\357\357\351\351\351mjd\225m\2\350\253\2\324\235\7YVM\327\327\327"
+  "\373\373\373\216\377\377\377\22\372\372\372\350\350\350\326\326\326\263"
+  "\263\263]^`%2<\27'4\27&5\27'5\27&4\27'5\27'4\27&4\27'4\25$1\22\36)6<"
+  "@///\202\373\373\373\206\377\377\377\12\225\225\225\227\230\231;AE=E"
+  "K\24\36'\24\37'2:B/6<\203\205\207\200\201\201\202\362\362\362\206\377"
+  "\377\377\12jjjlno\16\27\40\24!,\27%4\27&4\27&3\26$1\21\35'\12\21\26\202"
+  "\263\263\263\206\377\377\377\202\341\341\341\12hjkknr)18+4=\24\37(\24"
+  "\37'=EK;AE\223\224\226\221\221\221\202\373\373\373\204\377\377\377\12"
+  "\256\256\256\260\261\262\27\"+\34*4\27%4\27'5\25%1\23!,EKQ>>\77\206\377"
+  "\377\377\6%(+/8\77\25#/\26%2\27'5\27'4\203\27&4\1\27'5\202\27&5\3\27"
+  "'4\27&4\27(6\203\27'6\6\30(6\27'6\30&5\26&3!,5\31\40&\202\356\356\356"
+  "\206\377\377\377\202{{{\12\11\15\20\15\23\31\14\25\35\16\30\40\16\31"
+  "!\16\30!\14\26\35\12\22\30\37$'\33\34\35\202\252\252\252\202\377\377"
+  "\377\202\362\362\362\10<<<AEH\20\32%\24\".\27&5\27(6\23\"-\14\25\35\202"
+  "\273\273\273\204\377\377\377\202\263\263\263\4\2\4\5\7\14\20\37#'\32"
+  "\32\32\202\335\335\335\202\231\231\231\5\13\17\24\21\33#\23\40,\25$2"
+  "\27(5\202\27'6\12\27(6\27'6\27&5\26%3\25#0\27#.\25!*GNTEKPdim\202dgk"
+  "\5dim@GMBJQ\15\27\37\10\16\23\202\273\273\273\204\377\377\377\5\242\242"
+  "\242\243\243\243\15\27\37\25\".\27(6\202\27'6\1\27(6\202\27'6\13\27("
+  "5\27(6\27'6\30(7CJPoop\204\204\204TN;\225o\7\337\243\0\364\263\0\206"
+  "\372\267\0\25\372\267\1\372\270\2\372\270\4\372\271\6\372\271\10\341"
+  "\247\11\254\201\12w_\35TN>\205\205\205\300\300\300\330\330\330\343\343"
+  "\343\335\335\335OI8\277\214\0\363\263\3\230s\16\204\204\204\353\353\353"
+  "\376\376\376\216\377\377\377\22\373\373\373\355\355\355\330\330\330\305"
+  "\305\305\201\201\202#.7\27(5\27'6\27(5\27(6\30'6\30(6\27'6\30'5\24!-"
+  "\16\27\40\300\300\301\277\277\277\204\377\377\377\202\373\373\373\16"
+  "UUUVWX\12\21\27\16\31\"\22\40+\24\"/\26%2\27%2\25#1\24!-\21\34%\14\25"
+  "\34""369///\202\356\356\356\204\377\377\377\202\356\356\356\10\31\40"
+  "&!,6\26%2\27&5\26%3\24!.JQVCEG\206\377\377\377\202\331\331\331\6\33\34"
+  "\34!'*\16\30!\21\35(\24\".\26%1\202\26%3\6\24\"0\23\37*\17\31#\13\23"
+  "\32""368///\202\346\346\346\202\252\252\252\12\12\17\22\21\32#\23\40"
+  ",\25$2\27'6\30'6\26&3\23!.EKQ>>\77\206\377\377\377\5%(+/8\77\25#1\26"
+  "'4\27(6\202\27'6\5\27(6\27(5\27'6\27(6\27'5\202\27(5\12\27(7\30(7\27"
+  "(8\30(7\27(8\27(7\27'5\24$1\36'/\26\30\33\202\356\356\356\206\377\377"
+  "\377\16{{|\177\202\204\22\35%\25\",\23\".\24#/\25$0\24#0\24\".\22\40"
+  ",'1:%,2\253\254\255\252\252\252\202\377\377\377\202\362\362\362\10AD"
+  "GFNT\23\".\27&5\30(7\27(7\23\"/\15\26\36\202\273\273\273\204\377\377"
+  "\377\202\263\263\263\14\4\7\11\13\23\32&.5\"'-\335\336\336\335\335\335"
+  "\232\233\233\234\236\240\23\34%\26$/\26'4\26'7\203\27)7\11\30(7\30(6"
+  "\26%4\23\".\21\34'\20\31\40\14\22\27\77AC=>>\204^^^\4""99:;=@\5\11\14"
+  "\2\3\4\202\273\273\273\204\377\377\377\5\242\242\242\243\243\243\15\27"
+  "\40\24#0\30(7\204\27(7\202\30(7\13\27(7\30(8\33,9\77DH{{{A6\32\230o\2"
+  "\270\207\0\336\242\0\360\260\0\370\265\0\210\372\267\0\21\372\267\1\372"
+  "\270\2\372\270\4\371\270\7\362\263\10\340\246\11\226q\15XH\34ibR\250"
+  "\250\250\212\212\212y]\23\341\245\1\360\261\6UG\37\305\305\305\365\365"
+  "\365\217\377\377\377\13\375\375\375\361\361\361\334\334\334\316\316\316"
+  "\232\233\233\",4\31)8\30(8\27(7\27(8\27)7\202\27(7\3\26(5\22\40+\12\21"
+  "\30\202\277\277\277\204\377\377\377\202\373\373\373\6VXX\\ae\21\35(\24"
+  "\"/\25%3\26'5\202\27'6\6\27(5\26&4\24#1\23\40,:BH48;\202\356\356\356"
+  "\204\377\377\377\202\356\356\356\10\26\31\34\36(1\25$1\27'5\25&4\22\37"
+  "*GLP@@@\206\377\377\377\202\331\331\331\6\"(.'2:\23\"/\25$2\26&5\26'"
+  "6\202\30'6\6\26&5\25%3\25$0\22\40+:CI6:>\202\346\346\346\6\252\253\253"
+  "\254\256\257\21\33#\26$/\27&4\27'6\202\27(7\4\26&5\24\".ELQ>>\77\206"
+  "\377\377\377\5%)+/8@\26$2\26'5\27(8\202\30(7\1\27(8\202\27(7\5\30(7\27"
+  "(7\27)7\27(7\30)9\203\27)9\6\30)8\30)9\26&3\22\37+|\177\202www\206\377"
+  "\377\377\11wwwwxx\12\22\30\20\34&\24$3\27'6\30*9\27)8\30)8\202\30)9\23"
+  "\27)7\26&3\23!.\17\31#\11\17\25hjkggh9;<=BF\17\32$\24!.\26'5\30(7\30"
+  "*9\30)8\24#0\16\30\"\243\243\244\242\242\242\206\377\377\377\24sssw{"
+  "}\16\30\"\17\32#\16\30\40\15\26\36\15\30!\21\35(\24#0\27'6\30)8\27)9"
+  "\30)8\30)9\30*8\27(7\26&4\23\40,QY_MQT\202\331\331\331\210\377\377\377"
+  "\204\331\331\331\204\377\377\377\5\242\242\242\243\243\243\16\30\40\25"
+  "#0\30)8\202\30)9\3\30)8\30)9\27)9\202\30)9\13\30)8\36-;>CH\217\217\217"
+  "lh^\\E\6\221j\2\275\212\0\331\237\0\356\257\0\370\266\0\211\372\267\0"
+  "\20\372\267\1\372\270\2\372\270\3\372\271\5\372\271\7\356\261\11\266"
+  "\207\10u]\31XH\33\276\214\2\366\265\3\264\205\6e`S\325\325\325\363\363"
+  "\363\374\374\374\216\377\377\377\20\375\375\375\362\362\362\333\333\333"
+  "\316\316\316\241\241\241+4=\31*:\30)9\27)9\30)9\30)8\27)9\26(6\25'44"
+  "=E+/3\206\377\377\377\202\225\225\225\4\12\22\30\22\36*\26&4\26(7\202"
+  "\27)9\203\30)8\7\27)9\30)8\27(7\24\"/\16\30!\\^`YYY\206\377\377\377\10"
+  "bbchmq\23!-\25&4\25$2\17\32$\247\250\251\246\246\246\204\377\377\377"
+  "\202\367\367\367\27\36\36\36&,1\22\36*\24$2\27(8\27)8\27*8\30)9\27)9"
+  "\30)8\30*8\27)9\30)7\27'6\25$1\22\36*\16\30\40\14\25\35\15\27\37\20\34"
+  "&\24#/\26'6\30)8\202\27)9\5\30)8\26'6\23#/ELR>>\77\206\377\377\377\5"
+  "&),/9A\25%3\27)7\27)8\202\30)9\1\27)8\204\30)9\2\27)9\30)8\204\30*:\6"
+  "\30*9\27*:\25%3\17\33&z{}www\206\377\377\377\6www{~\202\20\35(\25$2\27"
+  "(8\30)9\206\30*:\13\27)8\26'6\24#0\21\35(nsymrvAHODMU\24\"0\26'5\30)"
+  "9\203\30*:\4\24%4\17\34'\244\246\247\242\242\242\206\377\377\377\12s"
+  "ssstt\11\20\27\16\32#\22\40,\23\"/\25$2\26&4\27(7\30)9\202\30*:\10\30"
+  "*9\30*:\27)9\26'6\23!-\16\30!LNQHHH\202\331\331\331\210\377\377\377\204"
+  "\331\331\331\204\377\377\377\4\242\242\242\243\243\243\16\30!\24$2\202"
+  "\30*:\1\27*:\202\30*:\1\30*9\203\30*:\13\37/=HLP\243\244\244\307\307"
+  "\307\214\212\204QJ7{\\\7\251|\1\325\234\0\361\261\0\371\266\0\202\372"
+  "\267\0\6\371\267\0\364\263\0\361\261\0\363\262\0\370\265\0\371\266\0"
+  "\203\372\267\0\20\372\267\1\372\270\3\372\270\5\372\271\6\356\260\5\340"
+  "\244\2\363\262\0\372\271\5\220k\6RM\77\240\237\236\317\317\317\350\350"
+  "\350\364\364\364\372\372\372\376\376\376\213\377\377\377\10\375\375\375"
+  "\363\363\363\334\334\334\314\314\314\245\246\2465>G\31+;\27*:\204\30"
+  "*:\4\27(7\25$21:A)*,\206\377\377\377\5\225\225\225\227\231\232\17\33"
+  "%\25$2\27)9\203\30*:\1\30*9\204\30*:\5\30*8\26&5\23!-`ejZZZ\206\377\377"
+  "\377\10bbbfjm\21\36)\26%5\24#1\15\27\40\246\246\247\246\246\246\204\377"
+  "\377\377\202\367\367\367\6#'*+5>\25%3\27(7\30*:\27*:\207\30*:\12\30*"
+  "9\27(7\26&5\25$2\24#0\24$1\25&3\27(7\30)9\27*:\203\30*:\4\27(7\23#1E"
+  "MS>>\77\206\377\377\377\5&),/9B\26&5\27)8\30*:\202\30*9\2\30*:\30):\205"
+  "\30*:\10\30+;\30+<\30+;\30*<\31*<\31+<\24$2\15\27\40\202\310\310\310"
+  "\204\377\377\377\202\320\320\320\10\6\13\20\17\34&\26'6\27)9\30+;\30"
+  "*<\31+;\30+;\203\30+<\13\30+;\30+<\30+;\27*:\26&6\24$1\23#/\24$1\25&"
+  "5\27)9\30*:\202\30+<\6\30*;\30+<\26'8\23\"/8>D011\202\373\373\373\206"
+  "\377\377\377\6\200\200\200\204\207\212\20\34'\24$1\27)8\30*:\206\30+"
+  "<\4\26(8\24#1V\\cPSU\202\373\373\373\222\377\377\377\5\242\242\242\243"
+  "\243\243\16\31\"\25%3\30+;\202\30+<\202\30*;\204\30+;\26\36/=RVZ\271"
+  "\271\271\340\340\340\334\334\334\276\276\276nkdgO\16\252}\0\333\240\0"
+  "\361\261\0\371\266\0\372\267\1\366\264\1\337\244\0\324\233\0\323\233"
+  "\0\342\245\0\355\256\0\364\263\0\370\266\0\371\267\0\202\372\267\0\202"
+  "\372\267\1\1\372\270\2\202\372\267\1\14\372\270\4\372\271\11\276\216"
+  "\11kR\16eZ;\206\205\203\265\265\265\332\332\332\355\355\355\367\367\367"
+  "\374\374\374\376\376\376\210\377\377\377\7\375\375\375\366\366\366\340"
+  "\340\340\314\314\314\251\251\252;EN\31,=\203\30+;\6\31+;\30*;\26(7\23"
+  "\"/djn^^^\206\377\377\377\7#$%,4:\24#1\27(8\30*;\30+;\30+<\202\30+;\2"
+  "\30*;\30+<\202\30+;\5\30+<\27*9\26'6\20\35(\7\14\20\202\341\341\341\204"
+  "\377\377\377\6\242\242\242\243\245\245\17\33&\25&4\24#1\13\24\33\202"
+  "\335\335\335\204\377\377\377\202\242\242\242\12\13\24\34\23!/\27*9\30"
+  "*<\30+<\30+;\30*<\30*;\30*:\27*:\204\27*9\4\27*:\27)9\30*:\27)9\204\27"
+  "*9\10\27)9\27):\30*;\31+;\30)9\24$3EMT>>\77\206\377\377\377\5&),09B\26"
+  "'7\27):\30*<\203\30+;\3\30*<\30+;\31+;\202\30+;\11\30*<\30,=\31+<\30"
+  ",=\31+<\31,<\30+=\24#1\14\25\34\202\310\310\310\204\377\377\377\202\320"
+  "\320\320\4\13\24\34\23\"/\27*;\30*;\202\30,=\3\30+<\31+=\30+<\202\30"
+  ",=\1\30,<\202\30,=\20\30+<\27+;\30)9\27)9\27*:\30*;\30+;\30,=\31+<\31"
+  "+=\31,=\30+<\27);\26'5<FO49=\202\373\373\373\206\377\377\377\202\200"
+  "\200\200\16\11\20\26\17\33&\25&4\30*:\30+=\30,<\31,=\31+<\30,<\31,=\25"
+  "&5\20\36)RUYMMM\202\373\373\373\222\377\377\377\13\242\242\242\243\243"
+  "\244\16\31#\25%4\31,=\30,=\30,<\31,=\30,=\30,<\31+=\202\30+=\27\36/>"
+  "Z^a\310\310\310\346\346\346\340\340\340\333\333\333\275\275\275UN<}\\"
+  "\2\267\206\0\333\240\0\364\262\0\372\270\2\335\243\3fO\15yY\2\237u\1"
+  "\270\207\0\312\224\0\333\240\0\351\253\0\362\261\0\367\265\0\206\372"
+  "\267\0\16\372\270\2\372\270\4\372\271\7\362\264\10\316\231\10\227s\20"
+  "aS*_]X\250\250\250\330\330\330\350\350\350\365\365\365\373\373\373\376"
+  "\376\376\206\377\377\377\20\375\375\375\370\370\370\346\346\346\317\317"
+  "\317\255\255\255<FP\32,<\30,<\31,=\31+<\30,=\30+<\26(7\22!-cgk^^^\206"
+  "\377\377\377\12&*,0:C\27(6\30+;\31+=\30,<\31+<\30+=\30,<\30+<\203\31"
+  ",=\5\30+<\30,=\27+;\23\".\12\22\30\202\341\341\341\204\377\377\377\6"
+  "\242\242\242\243\243\243\16\31#\25%4\24#0\12\22\31\202\335\335\335\204"
+  "\377\377\377\4\242\242\242\243\244\245\17\31$\25&4\202\30,=\11\31+<\30"
+  ",=\31+<\27+;\30)8\26)8\26'7\26(7\26'7\203\26(7\1\27(7\202\26(7\13\26"
+  "'7\26(7\27'7\26(8\26)9\27):\27+<\30(9\25$3ENT>>@\206\377\377\377\6&)"
+  ",/:D\26(7\30*;\30+<\30+=\202\30,=\10\31,<\31,=\31+=\31,<\30,=\30+=\30"
+  "->\31,>\202\31-\77\4\31->\30->\24#2\12\22\31\202\362\362\362\204\377"
+  "\377\377\16\200\200\200\201\203\204\17\33&\26'6\30->\31->\31-\77\31,"
+  ">\30,=\27*;\27(9\25'6\25'5\25&5\211\25'5\13\25&5\26&6\25(7\26):\30+<"
+  "\30,=\30*;\23#2\15\30\"uvwsss\210\377\377\377\14\242\242\242\244\246"
+  "\247\22\36'\30&4\27*:\30,<\30,>\31->\30,=\27+<\33+7\24\35%\202\346\346"
+  "\346\206\377\377\377\202\314\314\314\202\210\210\210\202\277\277\277"
+  "\210\377\377\377\6\242\242\242\243\243\244\16\31$\25'6\30->\30,>\202"
+  "\30->\3\30,>\31,>\30->\202\31->\32\35/\77adg\323\323\323\356\356\356"
+  "\350\350\350\341\341\341\333\333\333\253\253\253]T:jP\11\260\201\0\340"
+  "\244\0\372\270\4\323\234\7MH7\220\220\215YVN^M\40~^\6\231q\1\273\211"
+  "\0\315\226\0\335\242\0\354\254\0\363\262\0\367\265\0\204\372\267\0\20"
+  "\372\267\1\372\270\2\372\270\3\372\270\5\370\270\7\363\265\10\314\230"
+  "\12nT\15ZO1\203\200y\264\264\264\330\330\330\353\353\353\366\366\366"
+  "\373\373\373\376\376\376\203\377\377\377\20\375\375\375\371\371\371\352"
+  "\352\352\321\321\321\257\257\257;FQ\30->\31,>\30,>\31,>\30,>\30,\77\26"
+  "(8\21\37+\203\206\210\200\200\200\204\377\377\377\202\352\352\352\4\10"
+  "\16\24\23!.\30+<\31+=\202\31->\202\30,>\202\31->\2\30,>\31->\204\31,"
+  ">\2\25%4\15\26\37\202\256\256\256\204\377\377\377\202\273\273\273\4\15"
+  "\27!\25&4\24#2\11\20\27\206\377\377\377\20fffjmp\21\36*\26(7\31,>\31"
+  "->\30,>\31,>\27*=\26(62\77K0<F0:D/:D0:D/:D\2020:D\203/:D\13""0:D/:D0"
+  ":D1<F1\77I\34-:\37""1\77\27(:\25&4FNT>>@\206\377\377\377\4&),0:D\26)"
+  "8\30,<\202\31->\14\30->\30,>\31->\30-\77\30,>\31,>\30,>\31->\30-@\31"
+  "-@\31-\77\30.@\202\31-@\2\23$2\11\21\30\202\362\362\362\204\377\377\377"
+  "\5\200\200\200\203\205\207\21\37+\26(9\31.@\202\31-@\5\31.@\30,<\25'"
+  "6\21\40.\20\34(\202\16\32$\202\16\32%\3\16\32$\16\33%\16\32$\202\16\33"
+  "$\15\16\33%\16\32$\16\32%\17\34'\21\40,\24%3\26(9\30,=\30,>\26(9\22!"
+  "/x}\201tuu\210\377\377\377\202\242\242\242\12\12\17\24\22\35&\23$4\27"
+  "*;\31,\77\31.\77\30-=\27):\30&2\20\26\31\202\346\346\346\206\377\377"
+  "\377\202\314\314\314\202\211\211\212\202\277\277\277\210\377\377\377"
+  "\7\242\242\242\243\243\244\16\32$\25'7\31.\77\31-\77\31-@\202\31.@\40"
+  "\30-@\31-@\31-\77\31.@\34""0A_be\322\322\322\364\364\364\356\356\356"
+  "\347\347\347\340\340\340\334\334\334\270\270\270UN<}\\\3\305\220\0\372"
+  "\270\3\316\231\10SN\77\320\320\320\331\331\331\253\253\253}{udZ\77]F"
+  "\7\213f\2\260\201\0\305\220\0\327\236\0\350\252\0\365\264\0\371\267\0"
+  "\205\372\267\0\16\372\267\1\372\270\3\372\270\4\372\271\6\367\267\10"
+  "\325\236\10\225o\6m[)eaX\222\222\222\322\322\322\346\346\346\363\363"
+  "\363\373\373\373\202\376\376\376\20\375\375\375\371\371\371\351\351\351"
+  "\321\321\321\250\251\2527DO\31-@\31.@\31-\77\31.@\31-@\31-\77\26)9\20"
+  "\37+\203\205\207\200\200\200\204\377\377\377\202\352\352\352\16\11\21"
+  "\30\24#1\31,\77\31-\77\31.@\31-\77\30-@\31.@\30.@\31-\77\30-@\31-@\31"
+  ".\77\30-@\202\31-\77\2\24'5\15\31\"\202\256\256\256\204\377\377\377\202"
+  "\273\273\273\4\15\27!\24'6\24#1\11\20\26\206\377\377\377\14fffjnr\22"
+  "\40-\27)9\31-@\31-\77\31.@\31-\77\27*9\22!.,4;(-2\202&*,\204&),\1&*,"
+  "\202&),\203&*,\10',0*28\27$0\34,9\26(8\24%4FNU>>@\206\377\377\377\11"
+  "&),/:E\26*9\30,>\31.@\31-@\31.@\31-@\31.\77\202\31.@\3\31-@\31.@\31-"
+  "\77\203\31.A\203\31/A\2\24$3\11\21\27\206\377\377\377\6^^^cgk\22\"/\27"
+  "*;\31.A\31/A\202\31.A\4\26):\21\40-\246\251\253\243\245\246\214\243\243"
+  "\244\5\243\245\246\245\247\2512=E6EQ\27+<\202\30-\77\5\27);\22\40.\13"
+  "\25\35ttusss\210\377\377\377\12\277\277\277\300\301\302\31%.\37/<\27"
+  ",=\30-\77\30+=\24&5ahnZ[\\\206\377\377\377\202\221\221\221\2\6\13\17"
+  "\13\24\35\202\16\33%\2\14\26\37\7\16\23\202www\206\377\377\377\5\242"
+  "\242\242\243\243\244\16\33%\25(8\31/A\203\31.A%\31/A\31/@\32.A\31.A\31"
+  "/A\33/AUZ_\312\312\312\371\371\371\365\365\365\356\356\356\346\346\346"
+  "\340\340\340\334\334\334\247\247\245[L$\236t\1\351\253\2\261\203\5gc"
+  "X\321\321\321\342\342\342\340\340\340\332\332\332\313\313\313\246\245"
+  "\242faUPD!y\\\13\240u\1\311\223\0\355\255\0\366\264\0\363\262\0\362\261"
+  "\0\365\264\0\370\266\0\203\372\267\0\25\372\267\1\372\270\2\372\270\3"
+  "\372\271\5\370\270\7\346\253\10\306\224\12\202e\24QH0\206\203|\300\300"
+  "\277\336\336\336\360\360\360\367\367\367\372\372\372\367\367\367\345"
+  "\345\345\321\321\321\206\214\221&8G\31/A\204\31.A\5\32.A\26):\21\37,"
+  "\203\205\207\200\200\200\204\377\377\377\202\341\341\341\6\12\22\31\24"
+  "$3\31/@\31.A\31/A\32.A\212\31.A\4\25'7\16\32$\246\246\247\246\246\246"
+  "\204\377\377\377\202\273\273\273\4\15\30\"\25'7\24$3\11\20\26\206\377"
+  "\377\377\6fffknr\22!/\27*;\32/@\32.A\202\31/A\2\25'7\15\31\"\220\377"
+  "\377\377\6CHMHS]\25'7\24%5FNV>>@\206\377\377\377\6&*,0<F\27*;\30-\77"
+  "\31.A\31/@\202\31.A\3\32/A\31.A\32/A\202\31/A\11\31.A\31/C\31/B\31/C"
+  "\32""0B\32/B\31/B\24%4\11\21\27\206\377\377\377\4^^^cgk\22\"0\30+<\202"
+  "\31/C\202\31/B\3\25(9\16\32%\243\243\244\217\242\242\242\12.4:4AK\27"
+  "*;\31-A\31.B\30.@\26)9\22!/x}\201tuv\210\377\377\377\202\277\277\277"
+  "\10\23\27\34\32'1\25'7\30,>\27,=\23#1_dhYYY\206\377\377\377\4\221\221"
+  "\221\224\226\230\17\34'\24$3\202\25(9\4\24%4\20\36+{\177\203www\206\377"
+  "\377\377\6\242\242\242\243\243\244\17\33&\26(9\31/B\32""0B\202\31/B("
+  "\32""0B\31""0C\31/B\31/C\31/B\33""0CFOW\276\276\276\374\374\374\372\372"
+  "\372\365\365\365\355\355\355\346\346\346\341\341\341\327\327\327xwu^"
+  "H\16\244x\1\212f\1|xn\337\337\337\353\353\353\352\352\352\346\346\346"
+  "\342\342\342\337\337\337\334\334\334\304\304\304xxwH<\34\254~\1\354\255"
+  "\0\357\257\0\335\241\0\322\232\0\331\240\0\350\252\0\362\261\0\367\265"
+  "\0\371\267\0\203\372\267\0\13\371\266\1\370\266\2\372\267\3\372\271\6"
+  "\372\271\10\334\243\11yZ\5dU)zta\240\240\240\315\315\315\202\350\350"
+  "\350\16\331\331\331\312\312\312`kt\33""1D\33""0D\32/C\31""0B\31/C\31"
+  "/B\32/B\26+;\21\40.\203\206\210\200\200\200\204\377\377\377\202\341\341"
+  "\341\10\11\21\30\23$3\31.A\32/B\31/B\31""0C\31""0B\32""0B\203\31/B\3"
+  "\31""0B\32/C\31/B\202\32/B\2\25(8\16\31#\202\246\246\246\204\377\377"
+  "\377\202\273\273\273\4\16\31#\25(8\24&4\11\21\30\206\377\377\377\5ff"
+  "fjmp\21\40-\26*;\32/C\203\31/B\2\25&5\12\23\32\220\377\377\377\6\77B"
+  "DGQY\24&6\25&6FNV>\77@\206\377\377\377\4&*-0<F\27+<\31.A\203\31/B\202"
+  "\32/B\3\32""0C\31/B\32/C\202\31/C\203\32""0D\1\31""0C\202\32""0D\2\24"
+  "&6\12\22\31\202\362\362\362\204\377\377\377\5\200\200\200\203\205\210"
+  "\21\40.\27+=\32""0C\202\32""0D\3\31""0D\25'7\13\24\34\220\377\377\377"
+  "\4@DGGQ[\26*:\31.A\202\32""0D\6\31.B\27+=\23#1\14\27![]^YYY\202\367\367"
+  "\367\206\377\377\377\3\242\242\242\244\246\250\20\37-\202\26*;\3\21\40"
+  "-\224\226\227\221\221\221\204\377\377\377\202\356\356\356\12\15\20\23"
+  "\27#.\26);\30.A\32""1D\31""0D\31.B\27+<\21\37+\7\14\22\202\320\320\320"
+  "\204\377\377\377\7\242\242\242\243\243\244\17\34'\26):\32""0C\31""0D"
+  "\32""0C\202\32""0D\7\31""0D\32""0C\32""0D\31""0D\32""0D9DO\253\254\254"
+  "\202\375\375\375\3\371\371\371\363\363\363\355\355\355\202\347\347\347"
+  "1\331\331\331\240\236\230}vchZ3\242\237\231\356\356\356\364\364\364\362"
+  "\362\362\357\357\357\352\352\352\345\345\345\340\340\340\335\335\335"
+  "\246\246\244_K\24\304\217\1\367\265\1\244y\1_F\2\220j\2\255\177\0\301"
+  "\215\0\325\234\0\344\247\0\360\260\0\367\265\0\372\267\0\372\267\1\367"
+  "\265\0\355\256\0\350\252\0\354\255\1\370\266\2\366\266\5\345\252\7\275"
+  "\214\6\207f\13za\34TL7\205\205\205\277\277\277\304\304\304\276\276\276"
+  "dnv\34""1D\33""1C\31""0C\32""0D\31""0D\202\32""0C\4\30,=\23#2kpsfff\206"
+  "\377\377\377\10\33\40%%3\77\30-\77\31/B\32""0D\31""0D\32""0D\31""0D\204"
+  "\32""0D\6\31""0D\32""0D\32""0C\31/C\24&6\14\25\36\202\325\325\325\204"
+  "\377\377\377\6\252\252\252\252\253\253\17\33&\26):\24'6\12\24\34\202"
+  "\335\335\335\204\377\377\377\5\242\242\242\243\244\245\17\34(\25)9\31"
+  "0C\203\31""0D\2\24%5\11\21\27\220\377\377\377\6""9;<BLT\24'7\25&6FOW"
+  ">\77@\206\377\377\377\10&*-0<G\30,>\31.B\31""1D\32""0D\31""0C\32""0D"
+  "\202\31""0D\2\31""0C\31""0D\202\32""0D\1\32""1F\202\32""1E\5\31""1F\32"
+  "1F\32""0E\25'7\13\24\34\202\362\362\362\204\377\377\377\12\200\200\200"
+  "\201\203\204\20\36+\27+<\31""1E\32""1E\32""0E\32""1E\24'7\11\22\32\220"
+  "\377\377\377\14\77ADGQ[\27+<\31/B\31""1E\32""1E\32""0D\31/C\27+=\23$"
+  "3ahn\\^a\202\367\367\367\206\377\377\377\202\242\242\242\6\14\27\40\23"
+  "&5\25)9\20\36*\223\224\226\221\221\221\204\377\377\377\202\356\356\356"
+  "\4\22\30\37\33+8\31.B\31""0E\202\32""1E\4\32""0D\31/D\24&5\13\25\36\202"
+  "\320\320\320\204\377\377\377\6\242\242\242\243\243\244\17\34(\26*<\32"
+  "1E\32""1F\203\32""1E\17\32""1F\32""1E\32""1F\32""0E\32""1E.>K\225\225"
+  "\225\374\374\374\376\376\376\375\375\375\371\371\371\364\364\364\360"
+  "\360\360\357\357\357\361\361\361\202\366\366\366\2\216\214\207\337\337"
+  "\336\202\373\373\373'\372\372\372\367\367\367\363\363\363\354\354\354"
+  "\345\345\345\340\340\340b[H\230p\1\351\253\1\344\250\5lX!\235\235\235"
+  "lkiNG3lS\16\214g\2\256\200\0\307\221\0\332\240\0\354\255\0\354\255\1"
+  "\260\201\1\232q\0\251|\0\275\212\0\317\227\0\321\231\1\341\244\1\357"
+  "\257\2\363\262\4\310\224\5\247{\6|]\5VH\37oj]\226\226\226vy|\40""0>\33"
+  "2F\202\31""1F\7\32""1F\32""1E\32""1F\30-@\24&6mrwfff\206\377\377\377"
+  "\4\30\33\34\".8\27*=\31/B\204\32""1E\2\32""0E\31""1E\204\32""1E\4\31"
+  "0E\30.B\22#1\11\20\27\202\325\325\325\204\377\377\377\6\252\252\252\253"
+  "\254\255\20\36*\27*=\25(9\14\27\40\202\335\335\335\204\377\377\377\202"
+  "\242\242\242\4\14\27!\25'7\31""0C\32""1D\202\32""1E\2\24'7\11\22\31\220"
+  "\377\377\377\6:=@DNW\26):\25(8FOX>\77@\206\377\377\377\16&*-0=H\30-\77"
+  "\31.D\31""1E\32""1E\31""1F\31""1E\32""1E\32""0E\32""1F\32""1E\32""1F"
+  "\31""1E\204\32""2G\4\32""1G\32""2G\25):\14\27!\202\310\310\310\204\377"
+  "\377\377\202\320\320\320\4\14\27!\24&7\31""0E\32""1F\202\32""2G\2\25"
+  "'9\12\23\33\220\377\377\377\16)-04@K\27,\77\31""0D\32""2G\32""1F\32""0"
+  "D\31""0D\30/A\27,\77\24&5\17\34(ADH<<<\202\352\352\352\206\377\377\377"
+  "\6<@DDOZ\24%5\17\34'\243\244\245\242\242\242\204\377\377\377\202\273"
+  "\273\273\5\14\26\37\25(9\32""2E\32""1F\32""2G\202\32""2F\3\32""2G\26"
+  "*<\16\32%\202\242\242\242\204\377\377\377\6\242\242\242\243\243\244\17"
+  "\35)\26+=\33""2F\32""2G\202\32""2F\4\33""2G\32""1G\32""2G\32""1G\202"
+  "\32""2G\15(;J\202\202\203\372\372\372\377\377\377\376\376\376\374\374"
+  "\374\371\371\371\366\366\366\365\365\365\366\366\366\371\371\371\374"
+  "\374\374\372\372\372\203\376\376\376(\375\375\375\373\373\373\367\367"
+  "\367\357\357\357\350\350\350\266\266\266WC\15\310\223\0\364\264\3\302"
+  "\220\6g`L\311\311\311\333\333\333\301\301\301\220\220\216haMVD\24|[\3"
+  "\255~\0\305\220\0\345\250\0\345\250\1\255\200\3]D\2I8\11jO\5\210d\2\203"
+  "`\2rU\3\273\211\1\355\256\1\344\250\4\244y\4\243x\4]F\7_Q+OLC\40-8\34"
+  "4I\33""3H\202\32""2G\6\32""1F\32""2G\32/B\26*<7AK-/1\206\377\377\377"
+  "\13www{\200\203\22#2\27,\77\32""1F\32""2G\32""2F\32""1F\32""1G\32""2"
+  "G\32""1G\202\32""2G\5\32""2F\30/B\26);JS\\BDF\206\377\377\377\10ooos"
+  "vy\22\"0\27-\77\26+<\16\33&\246\246\247\246\246\246\204\377\377\377\202"
+  "\367\367\367\10$).-:E\27->\31""0D\32""2F\32""2G\25):\14\26!\210\335\335"
+  "\335\202\367\367\367\204\377\377\377\202\373\373\373\6\24\31\36\36-;"
+  "\27,\77\25):FPX>\77@\206\377\377\377\11&*-0=I\30-A\31""0E\32""1F\32""2"
+  "G\32""2F\32""1G\32""2G\202\32""2F\13\32""1F\32""2G\32""2F\33""2I\33""2"
+  "H\32""2H\33""3H\32""3H\33""3H\26+=\16\33'\202\310\310\310\204\377\377"
+  "\377\202\320\320\320\10\7\16\24\21!/\27.A\31""0G\32""3H\32""2H\25*;\15"
+  "\30\"\220\377\377\377\16+054CP\30.C\32""2G\32""1G\31""1F\30/C\27.@\30"
+  ",\77\30-\77\26+=\25(8GQ[AFK\202\352\352\352\206\377\377\377\6""99;AK"
+  "R\22#1\16\33&\243\244\245\242\242\242\204\377\377\377\202\273\273\273"
+  "\3\14\27\40\25):\32""3G\202\32""2H\1\33""2H\202\32""3H\2\26*=\16\33&"
+  "\202\242\242\242\204\377\377\377\10\242\242\242\243\243\244\17\35*\27"
+  ",\77\33""3H\32""3H\32""2I\33""2I\205\32""3H\10\33""3I\"8K\\_c\342\342"
+  "\342\376\376\376\377\377\377\376\376\376\374\374\374\203\373\373\373"
+  "\2\374\374\374\375\375\375\204\377\377\377(\376\376\376\374\374\374\367"
+  "\367\367\357\357\357\341\341\341^\\V\246z\1\344\247\0\367\267\5\224q"
+  "\20~~}\331\331\331\344\344\344\341\341\341\336\336\336\322\322\322\272"
+  "\272\271zwoPF+rW\20\237u\2\306\221\1\352\254\1\330\237\3iN\4J@%\\VDV"
+  "J)rW\15YC\4\177^\3\303\217\1\204a\2YA\2\205d\11YA\2I7\7$03\35""5I\33"
+  "4I\203\32""3H\5\32""2H\32""1F\30-B9FQ048\206\377\377\377\6wwwxyz\16\32"
+  "&\24(9\31""0E\32""2G\202\32""2H\1\33""3H\203\32""3H\6\33""3H\32""2F\27"
+  ",>\22#1FKP@@@\206\377\377\377\10ooouz\177\25'7\30/C\30-\77\21\40.\247"
+  "\251\252\246\246\246\204\377\377\377\202\367\367\367\11\36\36\36(18\24"
+  "'6\30.A\32""2G\32""3H\30,\77\20\37,\336\336\337\207\335\335\335\202\367"
+  "\367\367\204\377\377\377\202\373\373\373\6\26\35%\40""1@\31/B\26*<GP"
+  "X>\77@\206\377\377\377\10&+.1>I\31.B\32""1F\33""2H\32""3H\33""2H\32""2"
+  "H\202\32""3H\6\33""2H\33""3H\33""2H\32""3H\32""4I\32""3I\202\33""4J\6"
+  "\32""3J\33""4J\27-@\21\"0z|~www\206\377\377\377\12ooouz\177\23%4\27-"
+  "@\32""2H\33""3I\27.@\21!/cgk__`\204^^^\202bbb\202\362\362\362\204\377"
+  "\377\377\202\341\341\341\20\12\22\33\24'7\31""1E\32""1F\30-A\26);5ER"
+  "4ALHT^JWb\26)<\30-A\25);\20\40-AFJ<<<\206\377\377\377\6sssx}\202\21\40"
+  "-\16\33'\227\230\231\225\225\225\204\377\377\377\202\346\346\346\12\15"
+  "\27\37\27*:\32""1F\33""3I\33""4I\33""3J\33""3I\31""3H\25(:\14\27!\202"
+  "\314\314\314\204\377\377\377\7\242\242\242\243\243\244\17\35*\26-\77"
+  "\33""4I\32""4J\33""3J\203\32""4I\12\33""3I\33""3J\32""4I\33""3I\35""4"
+  "J8BL\261\262\262\375\375\375\376\376\376\377\377\377\205\376\376\376"
+  "\205\377\377\377'\376\376\376\373\373\373\365\365\365\355\355\355\250"
+  "\250\250gS\34\313\225\0\362\261\2\363\264\7[N*\264\264\264\351\351\351"
+  "\356\356\356\352\352\352\347\347\347\343\343\343\341\341\341\340\340"
+  "\340\314\314\314\222\222\222jdTdR!\200_\5\311\223\1\353\255\2\327\236"
+  "\3\221l\7lZ(96/\\P.qV\15\207c\2\240v\0kN\1F<\36YK'>3\17\"18\34""4J\202"
+  "\32""4I\7\32""3J\33""3I\32""3J\33""3I\31""1F\24'8\12\25\35\202\320\320"
+  "\320\204\377\377\377\202\367\367\367\10@DGGPY\25(9\27-A\32""1F\31""3"
+  "H\33""4I\32""4J\202\32""2H\4\31/D\27+>*8E#+2\202\335\335\335\204\377"
+  "\377\377\202\373\373\373\10\"$'*7B\27,@\32""2F\31""0D\24'7OV[HHH\206"
+  "\377\377\377\20\314\314\314\314\314\315\34&0#3B\30-@\31""0D\30/B\25)"
+  ":\20\40-\15\31#\13\25\36\13\24\35\11\21\30\6\14\22""000///\202\373\373"
+  "\373\204\377\377\377\202\267\267\267\6\15\31$\25*<\31""1E\27,>GQY>\77"
+  "A\206\377\377\377\11%),/=G\27-@\31""0D\32""1E\32""1F\32""1G\32""2G\32"
+  "1H\202\32""3I\4\33""3J\33""3I\33""4J\32""5K\204\33""5K\5\33""4K\31""1"
+  "F\24'8|\201\205www\206\377\377\377\16ooopqq\14\31#\23%6\30/D\32""3I\31"
+  "1F\25);hrzfmsdkpdjpdimbfi\202bbb\202\362\362\362\204\377\377\377\202"
+  "\341\341\341\20\15\31#\26*<\31""1F\30-B\23&5\16\34(.4;+/4ADHDKR\21!0"
+  "\26,>\30/B\26*<GQY>@A\206\377\377\377\6sssw{\177\20\37,\17\35*\227\232"
+  "\233\225\225\225\204\377\377\377\202\346\346\346\4\12\17\24\25$1\30."
+  "C\32""3H\202\33""4K\4\32""3I\31""1E\22$3\10\20\27\202\314\314\314\204"
+  "\377\377\377\4\242\242\242\243\243\244\20\37+\27-@\202\33""4K\3\33""5"
+  "K\33""4K\33""5K\202\33""4K\203\33""5K\5\33""4K*<M\177\202\205\371\371"
+  "\371\376\376\376\213\377\377\377\22\375\375\375\370\370\370\361\361\361"
+  "\334\334\334ql`\217j\5\337\243\0\371\270\4\324\236\10USP\333\333\333"
+  "\365\365\365\366\366\366\363\363\363\360\360\360\354\354\354\351\351"
+  "\351\346\346\346\202\344\344\344\24\327\327\327\276\276\276\216\213\203"
+  "XK(\201`\10\302\216\1\346\251\2\340\244\3\257\203\12UG\40KHAE8\25ZB\2"
+  "\205b\1{[\5.)\32\36(/\36""7L\34""6K\33""5K\203\33""4K\7\33""5K\33""4"
+  "K\33""4J\27,\77\17\35)\320\321\321\320\320\320\204\377\377\377\202\367"
+  "\367\367\6<<<@CF\15\32&\23$4\26.@\30""1F\202\32""4J\6\31""3H\30""0D\25"
+  "):\21\40/\"*0\33\33\33\202\335\335\335\204\377\377\377\202\373\373\373"
+  "\10%-3/>K\31""0E\32""4I\32""1G\27-@R\\eKMP\206\377\377\377\202\314\314"
+  "\314\16\23\26\30\33$,\22\"1\25)<\30/C\30""0C\27-A\26+=\25):\23&6\17\36"
+  "+\12\25\36""124///\202\373\373\373\204\377\377\377\10\267\267\267\270"
+  "\271\272\20\40-\27.A\32""1G\27,\77GQZ>\77A\206\377\377\377\4%&(.9C\24"
+  "(9\26*=\202\27,\77\14\27,@\30.B\30/D\32""2G\32""3I\32""4J\33""5K\32""4"
+  "K\33""5L\33""6M\33""6L\33""5L\202\33""5M\4\32""3I\27.B'5B\36\"&\202\356"
+  "\356\356\204\377\377\377\202\373\373\373\14pqquz\177\22#2\25*<\27,A\26"
+  ",>\24'9\23%4\21#2\21!/\22\37)\15\26\36\202\225\225\225\206\377\377\377"
+  "\10\200\200\200\202\204\205\20!.\26,\77\37""4G\32,:\211\215\221\205\206"
+  "\207\202\373\373\373\202\362\362\362\6-4:4AN\25*<\26*=\23#/\11\15\21"
+  "\202\352\352\352\204\377\377\377\6\200\200\200\203\207\212\20\40.\21"
+  "!0din^^^\206\377\377\377\4www|\200\204\22#2\26+>\202\30/D\4\26,\77\23"
+  "&7agmZZZ\206\377\377\377\11\242\242\242\243\243\244\20\37,\27.B\33""6"
+  "L\34""5L\33""6M\33""6L\33""6M\203\33""5M\10\33""6L\33""5M\33""5L\":N"
+  "HPV\310\310\311\371\371\371\376\376\376\211\377\377\377\25\376\376\376"
+  "\373\373\373\364\364\364\355\355\355\262\261\257fQ\27\273\212\0\362\262"
+  "\1\372\271\7z\\\11\235\235\235\356\356\356\375\375\375\374\374\374\372"
+  "\372\372\370\370\370\366\366\366\363\363\363\360\360\360\355\355\355"
+  "\353\353\353\202\351\351\351\17\350\350\350\336\336\336\231\231\231b"
+  "]Pt[\31\242v\2\336\243\2\342\247\4\203f\27meQUL3<1\24C2\1sX\12#29\202"
+  "\33""6M\3\33""5L\33""6L\34""5M\202\33""5L\6\33""6L\33""5M\31""1F\24'"
+  "8GMS@@@\206\377\377\377\202\373\373\373\12\200\201\201\204\210\213#0"
+  "<&6D\25):\25*;\33.>\32)7emradg\202\346\346\346\206\377\377\377\12\214"
+  "\214\214\214\215\216\16\34)\26+>\33""4K\33""6K\32""4K\31""3G\24'8\13"
+  "\25\36\202\273\273\273\206\377\377\377\202\335\335\335\12Y^a^el\31+8"
+  "\34/>\26)<\25*<\33.\77\32*7elradf\202\352\352\352\206\377\377\377\10"
+  "<<<DKR\24):\31""2G\32""2H\30.@GQ[>\77A\206\377\377\377\4UUUY^aENVFQY"
+  "\202GQZ\11GS\\IU`,=L/BS\31""1G\32""5J\33""5L\33""5M\33""7N\202\33""6"
+  "N\7\34""6N\33""7N\34""7N\33""5L\31""2H+<K#,4\202\356\356\356\204\377"
+  "\377\377\202\373\373\373\202ooo\12\7\17\25\15\30#\20\40.\21#2\22#3\22"
+  "#2\20\37-\16\33'\15\26\36\7\12\15\202\225\225\225\206\377\377\377\6\200"
+  "\200\200\205\211\216\24&7\27,@\33.=\24\37)\202\204\204\204\202\373\373"
+  "\373\202\362\362\362\6&&&,38\16\34(\17\36+\16\30!\5\5\6\202\352\352\352"
+  "\204\377\377\377\6\200\200\200\204\210\213\21#2\24&7fnu___\206\377\377"
+  "\377\202www\10\11\21\31\16\35)\21$2\22#3\17\36+\12\25\36Z\\\\YYY\206"
+  "\377\377\377\4\242\242\242\243\243\244\20\40-\27.C\202\33""7N\3\34""6"
+  "N\33""6N\33""7N\202\33""6N\10\34""6N\34""7N\33""6N\33""7N\34""7O.<H\217"
+  "\220\221\363\363\363\202\376\376\376\210\377\377\377\15\375\375\375\370"
+  "\370\370\361\361\361\351\351\351vqd\207d\2\324\233\0\372\270\3\353\256"
+  "\7QB\32\322\322\322\367\367\367\377\377\377\202\376\376\376\32\375\375"
+  "\375\373\373\373\371\371\371\367\367\367\364\364\364\362\362\362\357"
+  "\357\357\355\355\355\353\353\353\352\352\352\353\353\353\335\335\335"
+  "\254\254\252|vf]I\22\221k\1lV\30[TC\233\233\233\201~tSI--0\"\36""7I\33"
+  "7N\33""6N\34""6N\203\33""6N\7\34""6N\33""6N\33""7N\33""3J\27.BLVaDGK"
+  "\206\377\377\377\202\373\373\373\202\200\200\200\6\30\33\35\33\"'\11"
+  "\23\33\12\24\34\21\30\40\16\23\27\202^^^\202\346\346\346\206\377\377"
+  "\377\14\214\214\214\220\224\227\24&7\30""0E\33""6M\33""7N\33""6M\32""5"
+  "L\27.B\20\40/\274\275\276\273\273\273\206\377\377\377\202\335\335\335"
+  "\4UUUUVV\17\24\30\21\31\40\202\12\24\35\2\22\31\40\16\24\27\202^^^\202"
+  "\352\352\352\206\377\377\377\3ADIHT^\27.C\202\33""3J\3\30.BGQ[>\77A\206"
+  "\377\377\377\202UUU\2=>>>>@\202>\77A\14\77ADAGK&2=+<J\31""1E\32""4K\33"
+  "6N\33""7N\33""7O\34""8P\33""7P\34""7O\202\33""7P\6\33""7O\32""6M\30."
+  "C\20!0ilnfff\210\377\377\377\202\325\325\325\2\205\207\210\207\212\215"
+  "\202chm\2\207\212\215\205\206\210\202\331\331\331\206\377\377\377\202"
+  "\335\335\335\6\23\31\36\34,:\27.B\27,@ku~fil\206\377\377\377\202\352"
+  "\352\352\2xxxyz{\202\273\273\273\206\377\377\377\6UUU\\cj\24(:\26-A\""
+  "2@\30\35#\202\352\352\352\206\377\377\377\6\256\256\256\256\257\257m"
+  "psnps\243\244\245\242\242\242\202\373\373\373\206\377\377\377\6\242\242"
+  "\242\243\243\244\20\40.\27/D\34""7O\33""7P\202\34""7O\5\34""7P\34""7"
+  "O\33""7P\34""7P\34""7O\202\33""7P\12\35""7P&=QKPT\277\277\277\373\373"
+  "\373\375\375\375\374\374\374\366\366\366\370\370\370\374\374\374\203"
+  "\377\377\377\15\376\376\376\373\373\373\365\365\365\355\355\355\320\320"
+  "\320F8\22\270\207\0\351\252\0\372\270\5\245{\5wrd\350\350\350\374\374"
+  "\374\204\377\377\377\11\376\376\376\375\375\375\374\374\374\373\373\373"
+  "\371\371\371\366\366\366\363\363\363\361\361\361\356\356\356\203\355"
+  "\355\355\13\350\350\350\323\322\322\217\215\206`U8lV\34RF$kg];\77B\40"
+  "5G\35""7O\34""7P\202\33""7O\14\33""7P\34""7O\33""7P\34""8O\34""7O\33"
+  "7P\33""6N\32""4K\26*=\15\33'\204\205\206\204\204\204\212\377\377\377"
+  "\202\341\341\341\202\373\373\373\210\377\377\377\202\304\304\304\4\20"
+  "\27\35\31)7\30""0D\32""6M\202\33""7O\202\34""7O\4\31""3J\26,\77&1<\35"
+  "\37!\202\346\346\346\210\377\377\377\202\373\373\373\202\335\335\335"
+  "\202\373\373\373\210\377\377\377\12\204\204\204\204\205\206\16\34(\25"
+  "+>\32""4K\33""6N\32""5K\30.CGQ[>\77A\216\377\377\377\10aeggpy\26-@\32"
+  "4J\34""7O\34""8P\34""8Q\34""9Q\204\34""8Q\202\34""8P\4\32""3J\26+\77"
+  "nu|ghj\210\377\377\377\202\325\325\325\202\204\204\204\202^^^\202\204"
+  "\204\204\202\331\331\331\206\377\377\377\202\335\335\335\6\32'2!6H\31"
+  "2H\27/Dlu~ehj\206\377\377\377\202\352\352\352\202www\202\273\273\273"
+  "\206\377\377\377\6VXZ^hp\27.B\31""2H%8J\34'2\202\352\352\352\206\377"
+  "\377\377\202\256\256\256\202jjj\202\242\242\242\202\373\373\373\206\377"
+  "\377\377\14\242\242\242\243\243\244\20\40/\30""1F\34""8Q\34""9Q\34""8"
+  "Q\33""8Q\34""8Q\33""8Q\34""8Q\33""8Q\204\34""8Q\12\36""9R+:Gxz{\355\355"
+  "\355\374\374\374\313\313\313\212\210\204\315\315\315\357\357\357\374"
+  "\374\374\202\377\377\377\15\375\375\375\371\371\371\361\361\361\351\351"
+  "\351}}}\202a\5\321\231\0\364\263\1\360\262\7x]\22\247\246\245\363\363"
+  "\363\376\376\376\207\377\377\377\7\376\376\376\375\375\375\373\373\373"
+  "\371\371\371\367\367\367\364\364\364\362\362\362\202\360\360\360\12\357"
+  "\357\357\355\355\355\345\345\345\302\302\302\222\221\216cW9B4\15#*&\40"
+  ":P\35""9R\202\34""8Q\15\33""8Q\34""8Q\33""8Q\34""8Q\34""8P\34""9Q\34"
+  "8Q\34""8P\33""7O\31""1G\24(9\211\216\222\204\204\204\212\377\377\377"
+  "\202\341\341\341\202\373\373\373\210\377\377\377\7\304\304\304\305\306"
+  "\307\27&2\36""3G\32""5M\33""7O\33""8Q\203\34""8Q\4\32""6M\31""3J,=M%"
+  "/9\202\346\346\346\210\377\377\377\202\373\373\373\202\335\335\335\202"
+  "\373\373\373\210\377\377\377\12\204\204\204\211\216\222\24(9\31""1G\34"
+  "7P\34""8Q\32""5M\27/DGR\\>\77A\216\377\377\377\4^__fms\26+\77\32""4K"
+  "\202\34""8Q\203\34""9R\3\34""9S\34""9R\34""9S\202\34""9R\6\34""7O\31"
+  "4K\24);\15\31$\204\204\205\204\204\204\224\377\377\377\202\346\346\346"
+  "\10''(/9A\25*=\31""3J\33""7N\31""4J\27):\15\27\37\202\314\314\314\216"
+  "\377\377\377\202\341\341\341\10\21\31\40\33.>\32""4L\33""7O\31""2H\22"
+  "%6V[_QQQ\202\373\373\373\222\377\377\377\7\242\242\242\243\243\244\21"
+  "!/\30""1F\34""9R\34""9S\35""9R\203\34""9S\202\34""9R\34\34""9S\34""9"
+  "R\34""9S\34""9R\35""9R#=S8AI\255\255\256\264\264\263n^0\212h\13f`Q\305"
+  "\305\305\363\363\363\375\375\375\376\376\376\374\374\374\366\366\366"
+  "\356\356\356\277\277\277[P1\271\210\1\346\250\0\371\267\3\324\235\7l"
+  "_;\310\310\310\371\371\371\212\377\377\377\17\376\376\376\375\375\375"
+  "\374\374\374\372\372\372\367\367\367\365\365\365\362\362\362\356\356"
+  "\356\350\350\350\334\334\334\314\314\314\273\273\273\210\210\210.0.\40"
+  "6H\202\35""9R\2\34""9R\34""9S\205\34""9R\2\35""9S\34""9S\202\34""9R\6"
+  "\34""7O\31""3J\23'8\14\30\"xxywww\202\373\373\373\220\377\377\377\202"
+  "\256\256\256\20\23\30\35\32)5\26.B\32""5L\35""9Q\34""9R\34""9S\34""9"
+  "R\34""9S\34""9R\35""9Q\34""8P\32""2I\24)<(2;\40\"#\202\314\314\314\222"
+  "\377\377\377\202\204\204\204\12\13\27!\23&8\31""3J\33""7O\34""9S\35""9"
+  "S\34""6N\30""0FGS]>AB\216\377\377\377\4^^^elr\25+>\33""5L\202\34""9R"
+  "\2\34:T\34""9T\202\34:T\202\35:T\10\34""9T\35:T\34""9S\33""8Q\31""2H"
+  "\23(:\211\216\222\204\205\205\224\377\377\377\202\346\346\346\12/8@5"
+  "FU\30""3I\33""7P\34""8S\33""8Q\34""2G\24#1\315\315\316\314\314\314\216"
+  "\377\377\377\202\341\341\341\10\27$1\37""5H\34""8Q\34""9S\33""6N\27/"
+  "D[enTY[\202\373\373\373\222\377\377\377\7\242\242\242\243\243\244\21"
+  "!0\30""2H\34:T\35:T\34:T\202\35:T\4\34:T\35:T\34:S\35:T\204\34:T\27\35"
+  ":T&>RGNSNLI\237u\4\356\257\4\201`\6wtm\334\334\334\370\370\370\375\375"
+  "\375\371\371\371\362\362\362\342\342\342\177|t\205c\7\324\233\0\363\262"
+  "\1\372\271\5\253\201\13vur\346\346\346\375\375\375\214\377\377\377\17"
+  "\376\376\376\374\374\374\373\373\373\367\367\367\361\361\361\351\351"
+  "\351\337\337\337\320\320\320\303\303\303\246\246\246CFH!5G\35;U\34""9"
+  "S\36;T\206\34:T\1\35:T\203\34:T\7\35""9T\34""9S\33""8Q\31""2H\23(:}\203"
+  "\211yz|\202\373\373\373\220\377\377\377\11\256\256\256\260\262\264\33"
+  "+9!7J\32""5M\33""7Q\34""9T\35:T\34:S\203\35:T\10\34:T\34:R\34""7P\31"
+  "3J/AQ)5\77\315\315\316\314\314\314\222\377\377\377\6\204\205\206\211"
+  "\216\222\23'8\30""1G\34""7P\34""9S\202\34:T\4\34""7P\30""2HHT`\77CF\216"
+  "\377\377\377\7___fnu\26,A\32""6N\35:T\34:T\34;U\202\35;U\202\35;V\202"
+  "\35;U\11\35:U\35;V\34:U\33""9R\32""4L\24);\16\33(TVYQQQ\202\331\331\331"
+  "\216\377\377\377\202\273\273\273\14!\"%(2<\24)<\31""3J\34""9S\35:T\34"
+  ";U\35:V\33""6O\27.B'3>\36!$\202\310\310\310\212\377\377\377\202\341\341"
+  "\341\4""777\77GO\24*=\32""5L\202\34:U\6\34:T\33""7P\26-B\20\40/TY\\Q"
+  "QQ\202\352\352\352\210\377\377\377\202\356\356\356\202\346\346\346\204"
+  "\377\377\377'\242\242\242\243\245\247\22$4\30""3J\35;U\35:U\34;U\35:"
+  "U\34;U\34;V\35;V\34:U\34;V\35;V\35;U\34;U\35;V\35:U\36;V':L/36pU\13\342"
+  "\246\2\345\252\11OB\40\247\247\247\352\352\352\372\372\372\366\366\366"
+  "\356\356\356\311\311\311\\M%\257\200\1\344\247\0\371\267\3\372\271\7"
+  "r[\34\242\242\242\361\361\361\214\377\377\377\202\376\376\376\15\374"
+  "\374\374\372\372\372\365\365\365\354\354\354\341\341\341\322\322\322"
+  "\305\305\305\257\257\257abc\35,:\36<V\34;V\36<U\202\35;U\10\35;V\35:"
+  "U\35;V\35;U\35;V\34:U\35;V\34;U\202\35;U\10\35;V\34;U\34""9S\31""4L\25"
+  "+\77\20\40/28\77,-.\202\242\242\242\202\356\356\356\206\377\377\377\202"
+  "\367\367\367\202\273\273\273\11QQQSVX\13\30#\21%6\30""1G\33""7O\34:T"
+  "\35;U\35:U\202\34:U\15\35;U\35;V\35;U\35:U\35:V\34;T\34""9S\31""5M\26"
+  ".B\23#2\15\30!Z\\]YYY\202\277\277\277\202\367\367\367\206\377\377\377"
+  "\202\362\362\362\202\256\256\256\16<<<AFJ\17\35+\24*<\32""4K\34""9R\35"
+  ";U\34;U\35;V\35;U\33""9R\31""3K>M[6>D\216\273\273\273\17LQTS^h\27/F\33"
+  "7P\34;U\34;V\35<W\34;W\35;W\35<W\35<V\34<W\35;W\35<W\35;W\202\35;V\5"
+  "\34:T\31""4K\26-A[epV]b\202\331\331\331\216\377\377\377\20\273\273\273"
+  "\274\275\277*7B/BS\31""4L\32""9Q\35;V\35<W\34<W\35<W\33""9T\32""6N-A"
+  "R'3\77\311\312\312\310\310\310\212\377\377\377\202\341\341\341\14\77"
+  "GOETa\31""2J\34""9S\35;W\34<W\35<U\34:U\32""5M\26.B[fpV[`\202\352\352"
+  "\352\210\377\377\377\202\356\356\356\202\346\346\346\204\377\377\377"
+  "\5\242\242\242\245\251\254\24)<\32""5N\35;W\204\35<W\1\35;W\203\35<W"
+  "\32\35;W\35<W\34<W\35<W\35;W\35<W!=X(6BB6\22\303\216\0\367\267\6\276"
+  "\217\14ZWP\310\310\310\356\356\356\361\361\361\351\351\351yul}\\\3\320"
+  "\230\0\363\262\1\372\270\4\354\257\10HE=\327\327\327\371\371\371\212"
+  "\377\377\377\202\376\376\376\20\375\375\375\373\373\373\370\370\370\363"
+  "\363\363\353\353\353\336\336\336\322\322\322\305\305\304\252\252\252"
+  "ccd\34(3!>W\35<W\35;W\35<W\35;W\202\35<W\1\34<W\203\35<W\5\35;V\35;W"
+  "\34<W\35<V\35;W\202\35<W\10\35:V\34""9T\32""5N\27""0F;KZ6AL\244\246\251"
+  "\242\242\243\202\356\356\356\206\377\377\377\202\367\367\367\10\273\273"
+  "\273\274\274\275W^d[fp\25,>\31""3J\33""9R\34:T\202\35<W\22\35;W\35;V"
+  "\35;W\35<V\35;W\35<W\35<V\34;W\35<W\35;V\34:S\33""7P\34""4I\27,\77bl"
+  "t^di\300\300\301\277\277\277\202\367\367\367\206\377\377\377\202\362"
+  "\362\362\11\256\256\256\260\261\262DNVIVb\27/C\32""4L\34:T\35;U\34<V"
+  "\203\35<W\6\34:U\32""7O@Sc;HT\275\277\300\274\274\274\212\273\273\273"
+  "\14\273\274\274\274\276\277Q[cVdq\30""4L\34""8S\35<W\35;W\35=X\36=Y\36"
+  "=X\35=X\202\35<X\1\36<X\204\35=X\11\36<Y\34;V\33""9S\30""1H\24*=\17\37"
+  "-\12\25\37DGJAAA\202www\204\242\242\242\202\235\235\235\202sss\24""1"
+  "255:\77\15\32&\22$5\27""1F\32""7O\34:V\35=W\36<X\35<X\35=Y\36<X\36=X"
+  "\34<V\33""7P\27""0E\24%5\16\31#SVXQQQ\202\221\221\221\202\242\242\242"
+  "\202\231\231\231\11bbbccc\30\40&\36+8\24+>\31""4K\34""9U\35;W\35<X\202"
+  "\35=X\11\35<Y\35;V\33""9R\27""0G\24(:\31'2\25\34#RTVQQQ\204\200\200\200"
+  "\23^^^__`\26\32\37\30\36$4:>37;>AD>AB>AD@DI/9A4DQ\27""1G\33""9R\35<X"
+  "\35=X\36=X\35<Y\35<X\203\35<Y\1\35=X\202\35<X\203\35=X\25\35<Y\36>Y)"
+  "BX*,#\251|\0\351\254\3\356\261\12lW\35\215\215\215\333\333\333\354\354"
+  "\354\277\277\277^M\40\274\212\0\343\246\0\371\267\1\372\270\6\241x\5"
+  "}}}\352\352\352\375\375\375\210\377\377\377\202\376\376\376\16\375\375"
+  "\375\374\374\374\371\371\371\366\366\366\357\357\357\350\350\350\337"
+  "\337\337\321\321\321\305\305\305\261\261\261eef\",4\">V\35<Y\202\35="
+  "X\11\35<X\35=X\35<X\36<X\35<X\36=Y\35<X\36=Y\35=X\202\35<X\1\35=X\202"
+  "\35<X\40\35=X\35=Y\35=X\35;W\34:U\32""5N\27""1G\23(:\20!1\14\32&\12\25"
+  "\37!)0\40',',2',1'-2(/4\11\23\34\13\30\"\16\36,\21$4\25,@\30""2I\33""9"
+  "R\34<W\35=X\35<X\35=X\35<X\35=X\35<X\35<Y\202\35=X\4\35<Y\35<X\36<X\35"
+  "=Y\202\35<X\13\35=X\34:U\34""8Q\30""2G\25*>\21#3\16\35+\13\30#\12\23"
+  "\34(/5'-2\202',1\13#)/%,2\12\24\35\14\31%\20\37.\22&8\27.D\31""4M\34"
+  "9T\35<W\35=X\203\35=Y\10\36=X\35<Y\35;X\34;V\31""5N\26.D\22'9\20!1\204"
+  "\17\37-\1\17\37.\203\17\37-\12\17\37.\17\37-\20!0\22%6\26,A\30""3J\33"
+  "8S\34<V\35=Y\35<Y\202\35>Z\7\35=Y\35>Z\36=Z\35>Z\35=Z\35>Y\36=Z\203\35"
+  ">Z\13\36=Y\35<X\33""9T\32""7Q\30""2I\26-AMZgJU_|\202\206{~\202\243\245"
+  "\246\203\243\244\245(\236\240\241\237\241\242w{\200x\177\205;HT>O]\27"
+  "/E\31""5M\33:T\35;V\36=Y\35>Z\36=Z\35=Z\35>Z\35=Z\35>Z\36>Y\34<W\33""9"
+  "S\35""5L\31.B[gqYah\224\227\232\223\225\227\243\244\246\243\244\245\233"
+  "\234\236\233\236\240hnsks{\"5F&=Q\32""6O\33:U\36=X\36=Z\35>Z\36>Z\202"
+  "\35=Z\32\35>Y\34<X\34:T\32""5O\";N\40""3E[fpYai\204\211\215\203\207\213"
+  "\203\207\212\204\210\214elrgpy!3B#6F>N]=LZHVaGU`HUaIXd6IY:Ob\33""8Q\35"
+  ";W\202\35=Z\202\35=Y\206\35=Z\4\35>Y\36=Z\36>Y\35=Z\202\35>Z\23\40@["
+  "*6\77\216i\2\331\237\0\370\267\7\276\216\14YO4\254\254\254\325\325\325"
+  "iga\232r\5\327\236\0\361\261\0\372\270\3\357\261\7SA\16\313\313\313\364"
+  "\364\364\376\376\376\204\377\377\377\204\376\376\376\24\375\375\375\373"
+  "\373\373\371\371\371\366\366\366\361\361\361\353\353\353\342\342\342"
+  "\331\331\331\321\321\321\303\303\303\244\244\244Z\\\\\36*4#\77W\36\77"
+  "Y\35=Z\36>Z\35>Y\36=Z\36>Z\202\35=Z\13\36=Z\35=Z\36=Y\36=Z\35>Z\35=Z"
+  "\36>Z\36=Z\35=Y\35>Z\35>Y\202\36=Z\33\36>Y\35=Y\34;W\33""9T\32""7P\31"
+  "4K\27""1H\27/D-BS+\77P2DT2DS2DT3FW\26.C\30""0F\30""3K\31""5N\32""8R\33"
+  ":T\35<X\35>Y\36=Z\35>Z\36>Z\35>Z\36=Y\202\36=Z\23\35>Z\36>Y\35=Z\35>"
+  "Z\35=Z\35>Y\35=Y\35>Y\36=Z\35<Y\35<X\34""9S\32""8P\32""5M\30""2J\30""0"
+  "G\26.B3FW2ET\2022DS\15""0AR0DU\26.C\27""0G\30""4L\31""5O\33""8S\35;V"
+  "\35=X\35>Z\36=Z\35>Y\36>Z\203\35=Y\12\35>Z\35=Y\34;V\34""9S\31""6O\31"
+  "5M\30""4L\30""3L\30""3K\31""4L\202\30""4L\1\31""4K\202\30""4L\202\31"
+  "4L\11\31""6N\32""8Q\34:U\35=X\35>X\35>Z\35>Y\36\77\\\36\77[\202\35>["
+  "\203\36>[\32\36\77[\36>\\\35>[\36>[\36\77\\\35\77[\36\77[\35>[\36>[\35"
+  "=Y\33<V\32""6P\30""2I\26-B\23*=\22'8\22%6\21$5\21%5\21%7\23':\24*>\26"
+  ".D\31""4L\33""8R\34=W\202\36>[\1\36\77[\203\35>[\202\36>[\24\36\77\\"
+  "\36>[\35\77[\36>[\36\77[\34<Y\34""9T\31""5L\26""0E\24*=\22'9\22%6\21"
+  "%6\22&7\23(;\25-B\27""2J\32""7Q\34;V\35=[\202\36\77[\1\36>[\202\36\77"
+  "[\1\35\77[\202\35>[\202\36>[\33\35=[\35;W\32""9S\31""4M\27""0F\25,A\24"
+  "+>\24*>\25,@\26.E\30""2J\32""6P\33""8R\32""8S\32""6P\31""5N\31""5M\32"
+  "6M\32""6O\33""7Q\34;U\34=X\35=Z\36>\\\36\77[\36\77\\\35>[\202\36>[\2"
+  "\36\77[\36>[\202\35>[\30\36>\\\35\77[\36\77[\35\77[\36\77[\36>\\\35\77"
+  "[,CUiP\11\277\214\0\364\263\3\367\267\12\233t\10VSJ\211\210\207oW\27"
+  "\307\221\0\350\252\0\370\266\1\372\270\5\275\214\6f^G\343\343\343\372"
+  "\372\372\203\377\377\377\1\376\376\376\202\375\375\375\25\374\374\374"
+  "\373\373\373\371\371\371\367\367\367\364\364\364\360\360\360\352\352"
+  "\352\343\343\343\335\335\335\322\322\322\310\310\310\272\272\272\224"
+  "\224\224DFG\35*5%AZ\35>\\\37\77[\35\77[\35>[\36\77[\206\36>[\4\35>[\35"
+  "\77[\36>[\35>\\\203\36>[\3\35>\\\35\77[\36>[\202\35>[\3\36>[\36\77\\"
+  "\36\77[\202\36>[\22\36>Z\35=Y\35;X\34;U\33:T\33""9T\34:U\34:V\34=X\36"
+  "=Z\35>[\36\77[\35>[\36>\\\35\77\\\36\77[\35>[\36\77[\202\36>[\5\35>\\"
+  "\36\77\\\36>[\36>\\\36\77[\202\36>[\6\36>\\\36>[\36\77\\\36>\\\36\77"
+  "\\\36>[\202\35>[\21\35\77\\\36>\\\35>[\36=Z\35<X\34:V\33""9U\32""9S\33"
+  ":S\33:U\33;V\35<Y\36>[\35>\\\35\77[\36>[\35>\\\202\36>[\5\35>[\36\77"
+  "[\36>[\35>\\\36\77[\202\36>[\1\35>[\203\36>[\10\36\77\\\36>[\35\77\\"
+  "\36>[\35>[\36\77[\35\77[\36\77[\202\36>[\1\36\77\\\202\36>[\15\35\77"
+  "[\36\77[\35>\\\35\77[\36>\\\35>[\35\77]\36\77]\36@]\36\77]\36@]\35\77"
+  "\\\36@]\202\36\77]\37\35\77]\36\77]\36@]\36@\\\35\77]\36\77]\36@\\\36"
+  "\77[\35>[\35<Y\34;W\33:U\33""9S\32""8P\32""7O\32""6P\32""6O\31""7P\32"
+  "7Q\33""8S\34:T\34<W\34>Z\36>\\\36\77\\\36@]\36\77]\36@]\36\77]\36@]\36"
+  "\77]\202\36@]\1\36\77]\202\36@\\\20\36@]\36\77\\\35>[\33<X\34:U\33""9"
+  "S\32""7P\32""7Q\32""6P\32""7Q\32""8R\33""9T\34;V\35=Z\35>\\\36@]\202"
+  "\36\77]\2\35\77]\36\77\\\202\36\77]\203\36@]\21\36\77]\36\77\\\35\77"
+  "\\\35=Z\34<X\33:V\33:T\33""9R\33""8R\33""9T\34:T\34<W\35=Y\34=Z\35>Y"
+  "\35<X\35<Y\202\34<X\6\35<Y\35=Y\35>[\36\77\\\35\77]\36\77]\202\36@]\7"
+  "\36@\\\35@]\36@]\36\77]\36@]\36\77]\36@]\202\36\77]\1\36@\\\204\36\77"
+  "]\21$C^A9\31\253}\0\347\251\0\372\271\6\362\264\12w]\26\\Q5\252|\0\336"
+  "\243\0\363\262\0\372\270\2\370\267\6\212h\10\215\214\207\354\354\354"
+  "\373\373\373\202\375\375\375\202\374\374\374\25\372\372\372\371\371\371"
+  "\367\367\367\364\364\364\361\361\361\356\356\356\351\351\351\344\344"
+  "\344\335\335\335\325\325\325\316\316\316\304\304\304\262\262\261\203"
+  "\203\204246\40.;&D^\37\77]!B_\37@^\36@]\202\36\77]\4\36\77\\\36@]\36"
+  "\77]\36\77\\\202\36\77]\15\36\77\\\36\77]\35@\\\36\77\\\36\77]\36\77"
+  "\\\36@]\36\77\\\35\77\\\35\77]\36@]\36\77]\36\77\\\202\36\77]\1\36@]"
+  "\202\36\77]\4\36\77[\36>[\34>[\35=Y\202\35>Z\5\35>[\36>\\\36\77\\\36"
+  "\77]\36\77\\\206\36\77]\2\35\77\\\36\77]\202\36\77\\\4\36@]\35\77\\\36"
+  "\77]\36@\\\202\36\77]\202\36@]\13\36\77]\36\77\\\36@\\\35\77]\36\77]"
+  "\36@]\36\77]\35\77]\36@\\\36\77\\\35>[\202\35=Z\1\35=Y\202\35>[\6\35"
+  "\77[\35\77]\36@]\36\77\\\35\77\\\36\77]\203\36\77\\\5\36@]\36\77\\\35"
+  "@]\36\77]\35@\\\203\36\77]\203\36@]\1\36\77\\\202\36\77]\1\36@]\202\36"
+  "\77]\6\35\77\\\36\77]\36\77\\\36\77]\35@]\36\77\\\204\36\77]\10\36\77"
+  "\\\36\77]\37A^\36@^\36A^\37@^\36@^\37@^\205\36@^\11\36A^\36@^\37A^\36"
+  "@_\37@^\36@^\37@_\36A^\37@^\211\36@^\202\37@^\2\36@_\37@_\202\36@^\202"
+  "\36@_\1\36A^\202\36@^\2\37@^\36A^\202\36@^\203\36A^\1\36@_\203\36@^\2"
+  "\37A^\36@_\203\36@^\1\37@^\203\36@^\5\36@_\36@^\37@^\36@^\36A^\202\36"
+  "@_\1\36A^\203\37@_\2\36@^\37@^\207\36@^\203\37@^\4\36A^\36@_\36@^\36"
+  "@_\203\36@^\1\37@^\203\36@^\12\37A^\37@^\36@^\36A^\37@^\36@_\36A^\37"
+  "@_\36@^\36A^\206\36@^\21\36A^\37A^3=<\213g\3\321\231\0\370\266\1\372"
+  "\270\6\361\262\6\322\232\1\362\261\0\361\261\0\371\266\1\372\270\3\340"
+  "\246\7l[*\257\257\257\353\353\353\204\363\363\363\24\362\362\362\360"
+  "\360\360\355\355\355\352\352\352\346\346\346\342\342\342\336\336\336"
+  "\331\331\331\324\324\324\314\314\314\301\301\301\251\251\251yyz367\35"
+  "\"'0\77M)Gb\40A_\40C`$C`\202\36@^\2\37@_\36@^\202\36A^\2\36@^\36A^\202"
+  "\36@^\202\36@_\204\36@^\1\36@_\203\36@^\2\37@^\36A^\202\36@^\5\36@_\36"
+  "@^\36A_\36@^\36A^\203\36@^\5\37A^\36@^\36@_\36@^\36@_\204\36@^\4\37@"
+  "^\36@^\36A^\36@_\202\37@_\202\36@^\1\36A^\203\36@^\202\36A^\7\36@^\37"
+  "@^\36A^\36@^\37@_\36@^\37@_\202\36@^\11\37@^\36@^\36A^\36@^\36A^\36@"
+  "^\36@_\36@^\36A^\203\36@^\1\36@_\202\37@^\1\36A^\204\36@^\5\36A^\36@"
+  "^\36@_\36@^\36@_\202\36@^\1\36A^\204\36@^\6\36@_\36@^\37A^\36@^\37@^"
+  "\36A^\207\36@^\3\36A^\37@^\36A_\202\37A`\4\37B`\37A_\36A_\37A`\203\37"
+  "A_\202\36A_\4\36A`\37A_\36B`\37B_\202\36A_\3\37A`\36A`\36A_\202\37A_"
+  "\4\37A`\36A_\37A`\37B`\205\36A`\202\36A_\1\37A_\204\36A_\1\36A`\202\37"
+  "A_\1\36A`\202\37A_\1\37A`\202\36A`\1\37A_\202\36A_\202\36A`\24\37A`\36"
+  "A`\37A`\36A`\37A_\36B`\37A_\36A`\37A_\37A`\36A`\37A_\36A_\36A`\37A`\36"
+  "A_\37A`\36A_\36A`\36B`\202\37A`\2\36A_\37A`\202\37A_\3\37A`\36A`\37A"
+  "`\202\36A`\1\36A_\202\36A`\4\36A_\37A`\36B`\37A_\202\37A`\2\36A_\37A"
+  "_\203\37A`\6\36A_\37A_\36A_\37A_\37A`\36A_\202\36A`\12\36A_\37A`\37A"
+  "_\36A_\36A`\37A`(9CqU\5\302\216\0\357\257\0\202\372\267\2\11\372\267"
+  "\1\371\266\0\370\265\0\372\267\1\372\270\5\305\222\10_ZN\303\303\303"
+  "\344\344\344\204\347\347\347\24\346\346\346\344\344\344\341\341\341\336"
+  "\336\336\331\331\331\323\323\323\316\316\316\312\312\312\273\273\273"
+  "\242\242\242\201\201\202CEG&-1-<G5Mb%Eb\37A`\40A`\37A`\37A_\202\36A_"
+  "\1\37A`\203\36A`\2\37A`\37A_\203\37A`\2\36A_\37A_\202\36A_\14\36A`\37"
+  "A_\36A`\36A_\36A`\36A_\36A`\36A_\37A`\37A_\36A_\37A_\202\37A`\6\36A_"
+  "\37A_\36A`\37A_\36A`\36A_\202\36A`\202\36A_\5\36A`\36A_\36A`\37A_\37"
+  "A`\202\36A`\1\37A`\202\36A_\202\36A`\4\36A_\37A`\36B_\36A`\202\37A_\202"
+  "\36A`\1\37A_\204\36A`\13\36A_\36A`\36B_\36A_\37A`\36A`\37A_\37A`\37A"
+  "_\36A`\37A`\202\36A_\3\37A`\36A`\36A_\202\36A`\1\36A_\202\36A`\2\37A"
+  "`\36A_\202\36A`\202\37A`\202\36A_\10\37B`\36A_\36A`\36A_\36A`\36A_\36"
+  "A`\37A`\203\36A`\1\36B`\203\36A`\202\37A_\1\37A`\202\37Ba\5\36Ba\37B"
+  "a\36Ba\37Ba\36Ba\210\37Ba\6\36Bb\37Ba\36Ba\37Ba\36Ba\37Bb\203\37Ba\7"
+  "\36Ba\37Ba\37Bb\37Ba\36Ba\36Bb\36Ba\202\37Ba\3\36Ba\37Ba\36Ba\207\37"
+  "Ba\1\36Ba\202\37Ba\202\36Ba\1\37Ab\202\37Ba\5\36Ba\37Ba\36Ba\37Ba\36"
+  "Ba\204\37Ba\1\36Ba\203\37Ba\3\36Ba\37Ba\36Ba\203\37Ba\1\36Ba\205\37B"
+  "a\2\36Bb\36Ba\202\37Ba\1\36Ba\202\37Ba\7\36Ba\37Ba\36Ba\37Ba\36Ba\37"
+  "Ba\36Ba\202\37Ba\2\36Ba\37Ba\202\36Ba\1\37Ba\202\36Ba\202\37Ba\2\36B"
+  "a\37Ba\204\36Ba\205\37Ba\5'\77SR@\15\250|\1\340\244\0\366\264\0\204\372"
+  "\267\0\30\372\270\2\372\271\7\204e\16]]]\246\246\246\275\275\275\311"
+  "\311\311\322\322\322\325\325\325\324\324\324\323\323\323\322\322\322"
+  "\320\320\320\304\304\304\266\266\266\247\247\247\232\232\232\217\217"
+  "\220jjj37:\32'1\36""0A.G[$Eb\204\37Ba\2\36Ba\37Ba\202\36Ba\3\37Ba\37"
+  "Aa\37Ba\202\36Ba\2\37Aa\36Ba\202\37Ba\3\36Ba\37Aa\36Ba\203\37Ba\1\37"
+  "Aa\204\36Ba\2\36Aa\36Ba\205\37Ba\3\36Ba\37Ba\36Ba\205\37Ba\202\36Ba\1"
+  "\37Ba\202\36Ba\2\37Ba\36Ba\203\37Ba\202\36Ba\206\37Ba\1\36Ba\202\37B"
+  "a\202\36Ba\1\37Ba\202\36Ba\2\37Ba\36Ba\202\37Ba\202\36Ba\2\37Ba\36Ba"
+  "\202\37Ba\2\36Ba\37Ba\202\36Ba\202\37Ba\1\36Aa\202\36Ba\2\37Ba\36Ba\202"
+  "\37Ba\2\37Aa\36Ba\203\37Ba\3\36Aa\37Ba\36Ba\207\37Ba\3\36Ba\37Ba\36B"
+  "a\207\37Ba\1\37Bc\204\37Cc\202\37Cb\5\37Cc\36Bb\37Cc\37Cb\37Cc\202\37"
+  "Bb\5\36Cb\37Cc\37Cb\37Cc\37Cb\203\37Cc\1\37Cb\202\37Cc\202\37Cb\1\37"
+  "Cc\203\37Cb\5\36Cc\37Cc\37Bb\37Bc\37Cb\203\37Cc\202\37Cb\1\37Cc\202\37"
+  "Cb\13\37Cc\37Cb\37Bc\37Cc\36Cc\37Cb\37Cc\37Cb\37Bb\37Cb\37Cc\204\37C"
+  "b\4\37Cc\37Cb\37Bc\37Bb\202\37Cc\202\37Cb\204\37Cc\203\37Cb\1\37Bc\204"
+  "\37Cb\1\37Cc\204\37Cb\1\36Bc\202\37Cc\202\37Cb\13\37Bc\37Cc\36Cc\36C"
+  "b\37Cb\36Cb\37Bc\36Bc\37Cb\37Bb\37Bc\204\37Cb\2\37Cc\37Cb\202\37Bc\1"
+  "\37Cc\202\37Cb\7\37Cc!B`9</\207d\3\312\224\0\353\254\0\367\265\0\203"
+  "\372\267\0\10\372\270\4\365\265\7""2(\14-//>\77@TUUnoo\211\211\211\203"
+  "\221\221\221\14\220\220\220\215\215\215pppSTT;=@'/6$3\77)=M'AX!C`\37"
+  "Bc\37Cc\203\37Cb\1\37Bb\202\37Cb\4\37Bc\37Cb\37Cc\36Cb\203\37Cc\6\37"
+  "Cb\37Cc\37Cb\37Cc\37Cb\37Bc\202\37Cb\2\37Cc\37Cb\204\37Cc\202\37Cb\1"
+  "\37Cc\202\37Bb\202\37Cb\1\37Bc\202\37Cb\204\37Cc\3\37Bc\36Cc\37Cc\202"
+  "\37Cb\13\37Cc\37Bc\37Cb\36Cc\37Cb\37Cc\37Bb\37Cb\37Bc\37Cb\37Cc\207\37"
+  "Cb\203\37Cc\1\37Bc\203\37Cb\202\37Cc\2\37Bb\37Cc\202\37Cb\1\37Cc\203"
+  "\37Cb\1\37Cc\202\37Cb\1\37Bc\203\37Cb\16\37Cc\37Cb\36Cc\37Bc\37Cb\37"
+  "Cc\37Cb\37Bc\36Cb\37Cb\37Bc\37Cc\37Cb\37Bc\202\37Cb\3\37Bc\37Cb\37Bb"
+  "\202\37Cc\13\37Cb\37Cc\37Cb\37Cc\37Bc\37Cb\36Cc\36Cb\37Cb\37Cd\40Cd\202"
+  "\37Dd\3\40Cd\37Cd\40Cd\202\37Dd\202\37Cd\202\37Dd\2\37Cd\37Dd\202\37"
+  "Cd\207\37Dd\1\37Cd\202\37Dd\7\37Cd\37Dd\37Dc\40Dd\37Cd\40Cd\37Dd\202"
+  "\37Cd\203\37Dd\1\40Dd\205\37Dd\1\37Cd\204\37Dd\14\37Dc\40Cd\37Dd\37C"
+  "d\37Dd\37Cd\37Dd\37Cd\37Dd\40Dc\37Dd\37Cd\212\37Dd\1\40Dd\202\37Dd\4"
+  "\40Cd\37Cd\40Dd\40Cd\203\37Dd\202\37Cd\202\37Dd\1\37Cd\202\37Dd\202\37"
+  "Cd\2\40Dd\37Cd\202\37Dd\1\40Cd\202\37Dd\1\37Cd\202\37Dd\2\40Dc\37Cd\205"
+  "\37Dd#\37Cd\37Dd\37Cd\37Dd\40Dd+DW]K\17\251|\1\326\234\0\360\260\0\370"
+  "\266\0\372\267\0\372\267\1\372\270\5\303\220\10).11FY+BX';K#2\77$0;%"
+  "08%/8%08%/9$0;\"2@&:L'AW%D_\"Dd\40Ed\40Dd\37Dd\37Cd\203\37Dd\1\40Dd\202"
+  "\37Dd\5\37Cd\37Dd\40Cd\37Dd\37Cd\202\37Dd\2\37Cd\40Dd\202\37Dd\1\40D"
+  "d\203\37Dd\1\37Cd\203\37Dd\7\37Cc\37Dd\37Dc\37Dd\40Dc\37Cd\40Dd\211\37"
+  "Dd\15\40Cd\37Cd\37Dd\37Dc\37Cd\37Dd\37Cd\40Cd\37Dc\37Dd\40Dd\37Cd\37"
+  "Dd\202\37Cd\4\37Dd\40Cc\37Dd\37Cd\203\37Dd\2\37Cd\40Dd\202\37Cd\203\37"
+  "Dd\1\40Cd\204\37Dd\1\40Cd\204\37Dd\2\37Cd\37Dc\202\37Dd\20\37Cd\40Dd"
+  "\37Dd\37Cd\37Dd\37Cd\37Dd\37Cd\37Dd\37Cd\37Dd\40Cd\37Cd\37Dd\40Cd\40"
+  "Dd\205\37Dd\202\37Cd\3\37Dd\40Dd\37Dd\202\37Cd\203\37Dd\204\37Cd\202"
+  "\40Ef\14\37Ef\40Ee\37Ee\37De\40Ee\37De\37Ee\37De\40Df\37Ee\40De\40Ee"
+  "\202\40De\20\37Ef\40Ee\37Df\37Ee\40De\37Df\40De\40Ee\40Ef\37Ef\37De\40"
+  "Ee\40Df\37Ef\40Ee\37Df\202\40Df\5\37Ef\37Df\40Ee\40Ef\40Ee\202\37Ee\3"
+  "\40Ee\37Ee\37De\202\40De\202\40Ee\11\37De\40De\37Df\37De\40Ee\40De\40"
+  "Df\40Ee\37Ef\202\37Ee\2\37De\40Ee\202\40Df\1\37Ef\202\37Df\6\40De\37"
+  "Ee\37Df\40Dfdw\214\337\342\346\202\377\377\377\5\352\354\356u\206\227"
+  "\37EeQh\200\337\342\346\202\377\377\377\3\337\342\346Qh\200\40Ef\202"
+  "\37Ee\4\37De\40De\37De\37Ef\202\37Df\12\324\330\335\224\240\255\37Df"
+  "\37Ee\37De\40Ef\40De\37De\37Df\40Ee\202\37Ee\6\37Dev\206\227\352\354"
+  "\356\40De\37De\37Ef\202\37Ee\14\37De%Hf4@BiO\6\261\202\1\337\243\0\363"
+  "\262\0\371\266\0\372\270\3\370\267\7iR\15""7M_\202!Ff\20!Egey\214\377"
+  "\377\377\365\366\367%Hg%Gg%Hh#Ff!Ef\40Ef\37Ef\37Df\40Ee\37Df\37EeQh\200"
+  "\204\377\377\377\31\337\342\346;Ws\242\254\267\274\303\312\37De\40De"
+  "\37Ee\37De\40Ef\37Ee\37De\40Ee\37Ee\40Ef\37Ef\37De\37Ee\40Ef\37Ee\40"
+  "Efu\206\227\242\254\267\37Ee\37Df\37Ee\203\40De\7Qh\200\310\315\324\37"
+  "Ee\324\330\335\224\240\255\40Ef\40De\203\37Ee\13\40Ef\37Df\37De\37Ef"
+  "\40Df\37Ef\40Ef\40Ee\40DeQh\200\337\342\346\202\377\377\377\5\242\254"
+  "\267\40Df\37De\40Ee\37Ee\205\377\377\377\1\224\240\255\203\37Ee\2\40"
+  "Df\40De\203\40Ee\3\40Df\40De\40Ef\202\37Ef\23\40Ee\242\254\267\274\303"
+  "\312\37De\40Ee\40Ef\40De\37Ef\37Eev\206\227\242\254\267\37De\37Ee\40"
+  "Df\37Ef\37Ee\40De\37Ef\37Df\202\37Ee\3\40Ef\40De\40Ee\202\37Ef\4\40E"
+  "e\37Df\37Ef\40Ef\203\37Ee\27\40Df\40Ee\40Df\40De\37Ee\40Ef\37De\40De"
+  "\40Eg\40Fg\40Ff\40Fg\37Eg\37Ff\40Eg\37Eg\40Fg\37Ef\40Ef\40Fg\37Eg\37"
+  "Fg\37Eg\202\37Fg\202\37Eg\1\40Fg\202\37Eg\203\40Fg\202\40Eg\2\37Eg\37"
+  "Ef\202\40Fg\202\40Eg\3\40Ef\40Fg\40Eg\202\40Fg\204\40Eg\7\40Ef\37Eg\40"
+  "Eg\37Eg\37Fg\40Eg\40Ff\202\40Eg\3\40Fg\40Eg\40Ff\202\40Fg\7\37Eg\40F"
+  "g\37Fg\40Fg\37Fg\40Fg\37Eg\202\40Fg\1\37Fg\202\37Eg\22\40Eg<Yt\365\366"
+  "\367\257\270\302\40Eg\40Ff\242\254\267\365\366\367<Xt\257\270\302\310"
+  "\316\324\40Eg\37Fg\274\303\313\310\316\324\37Eg\40Fg\40Eg\202\40Fg\7"
+  "\40Eg\37Fg\37Eg\37Fg\324\330\335\224\240\255\37Eg\203\40Fg\202\40Eg\14"
+  "\37Eg\40Eg\37Eg\40Eg\40Fg\37Fg\40Fg\37Fg\40Fg\37Fg\40Ef\40Eg\202\37F"
+  "g\23.JbJ=\23\177^\3\315\226\0\353\254\0\370\265\0\372\270\3\362\264\10"
+  "\77<!0Ni\"Hh\"Gh\40Fg\242\254\270\274\303\313\40Fg\40Eg\40Fg\37Fg\202"
+  "\40Fg\14\37Eg\37Ff\40Eg\40Ff\37Ef\40FgQi\201\377\377\377\40Fg\40Eg\37"
+  "Eg\324\330\335\202\242\254\270\6\274\303\313\40Fg\37Fg\37Eg\40Ef\37E"
+  "f\202\40Eg\23\40Fg\40Eg\40Fg\40Eg\37Ef\40Eg\37Eg\40Eg\37Fg\242\254\270"
+  "\274\303\313\40Fg\37Fg\40Ff\40Eg\37Eg\40Fgu\206\230\352\354\356\202\40"
+  "Eg\10\37Fg\40Ef\37Fg\37Eg\37Fg\40Eg\40Fg\40Eg\202\40Fg\2\40Ef\40Fg\202"
+  "\40Eg\30\40Fg\257\270\302\274\303\313\40Egu\206\230\377\377\377<Yt\37"
+  "Fg\40Eg\37Fg\377\377\377Qi\201\37Fg\40Eg\37Fg\37Eg\40Ef\40Ff\37Eg\40"
+  "Ef\37Eg\37Ff\40Eg\37Fg\202\40Fg\11\40Eg\37Fg\40Fg\37Fg\242\254\270\274"
+  "\303\313\40Ff\40Eg\40Fg\202\37Fg\12\37Eg\242\254\270\274\303\313\40F"
+  "f\40Fg\37Eg\40Ff\37Ff\37Fg\40Fg\202\40Eg\202\37Fg\6\37Eg\40Fg\37Eg\40"
+  "Eg\37Ef\40Eg\203\40Fg\202\40Eg\202\40Fg\202\37Ff\6\40Ff\40Fg\40Eg\40"
+  "Fg\40Fi\40Fh\203\40Gh\203\40Fh\202\40Gh\2\40Fi\37Gi\202\40Gh\3\40Fh\40"
+  "Fi\40Gh\203\40Fh\204\40Gh\2\40Gi\40Fh\202\40Fi\14\40Fh\37Fh\40Gi\40F"
+  "i\37Fh\40Fh\37Fi\37Fh\40Fh\40Fi\40Gi\40Fh\203\40Gh\4\40Fi\40Gh\40Fh\40"
+  "Gh\202\40Fh\202\40Fi\1\37Fh\204\40Fh\2\40Gi\40Gh\202\40Fh\1\40Gh\202"
+  "\40Fi\1\37Fi\202\40Fh\5\40Gi\40Fi\40Fh\242\255\271\324\330\335\202\40"
+  "Fh\2\40Gh\40Fi\202\40Fh\3\242\254\271\337\342\346dx\216\202\40Fh\2\40"
+  "Gh\40Fh\202\40Gh\17v\207\231\365\366\367\352\354\357\377\377\377\324"
+  "\330\335\40Gh\324\330\336\224\240\257\242\254\271\274\303\313\40FhQi"
+  "\202\377\377\377\40Gh\242\254\270\202\377\377\377\30\337\342\346\242"
+  "\255\270v\206\231\352\354\357\40Gi\377\377\377\352\354\357\377\377\377"
+  "\352\354\357Qi\202\40Fi%Ji0;:pT\5\244x\2\344\247\0\366\265\1\372\270"
+  "\5\273\215\14""6CF(Lj\40Gi\40Fiv\207\231\202\377\377\377\2\274\303\313"
+  "\242\254\271\202\377\377\377\13\337\342\346Qi\202v\207\231\352\354\357"
+  "\377\377\377\324\330\335\40Gi\40FhQj\202\377\377\377\40Gi\202\40Fh\6"
+  "\337\342\346\242\254\270\242\255\270\274\303\313\40Gi\274\303\313\202"
+  "\377\377\377\5\324\330\336Qi\202\377\377\377dy\216\40Fh\202\274\303\313"
+  "\1\242\255\270\202\377\377\377\2\337\342\346\205\224\244\202\377\377"
+  "\377\2\242\254\271\224\240\256\202\377\377\377\2\337\342\346Qj\202\202"
+  "\377\377\377\5\324\330\335\324\330\336\224\240\256Qi\203\337\342\346"
+  "\202\377\377\377\4\242\255\270\40Gi\324\330\336\352\354\357\202\377\377"
+  "\377\1u\206\231\205\40Fh\12\40Fi<Yu\377\377\377<Yu\40Fh\40Gh\40Fh\377"
+  "\377\377Qj\202\40Gh\202\40Fh\3\37Gi\242\254\270\337\342\346\202\377\377"
+  "\377\6\242\254\270\365\366\367\377\377\377\242\254\271v\207\231\352\354"
+  "\357\202\40Fi\6\352\354\357v\207\231\242\254\270\274\303\313<Yu\337\342"
+  "\346\202\377\377\377\2\242\254\271v\206\231\202\377\377\377\202\242\255"
+  "\270\202\377\377\377\7\337\342\346Qi\202\242\255\270\352\354\357\377"
+  "\377\377\242\254\270\337\342\346\202\377\377\377\5\224\241\257\40Gh\40"
+  "Fi\40Fh\40Gh\203\40Fh\202\40Gh\5\40Fh\40Fi\40Gh\40Fh\37Fh\202\40Fh\1"
+  "\40Hi\202\40Gi\202\40Gj\6\40Gi!Gj\40Gj!Gi\40Gi!Gj\202\40Gj\6\40Hj\40"
+  "Gj!Gi\40Hj\40Gi\40Gj\202\40Gi\11\40Hj!Gj\40Hj!Gj\40Gj\40Gi\40Gj\40Gi"
+  "\40Hj\203\40Gj\2!Gi\40Gi\202\40Gj\16!Gj\40Gi\40Gj\40Gi\40Gj\40Hj\40G"
+  "j!Gi!Gj\40Hj\40Gi\40Gj\40Hj\40Gj\202\40Gi\4\40Hj\40Gi\40Gj\40Hi\202\40"
+  "Gj\16\40Hj!Gi\40Gi\40Gj\40Hj\40Hi\40Gj!Gj\40Gi\40Gj\242\255\271\274\303"
+  "\314\40Gj\40Gi\203\377\377\377\77v\207\232\40Gj\242\255\271\310\316\325"
+  "\377\377\377\337\342\346dy\216!Gj\40Gj\40Hiv\207\232\377\377\377v\207"
+  "\232<Yw\352\354\357\224\241\257\324\330\336\224\241\257\242\255\271\274"
+  "\303\314\40HiQj\203\377\377\377Qj\203\377\377\377dy\217<Yv\352\354\357"
+  "\242\255\271v\207\232\352\354\357\40Gj\377\377\377\242\255\271\40Gj\324"
+  "\330\336\242\255\271\40Gi!Gj+IdWJ\32\221j\2\332\240\0\365\264\1\365\266"
+  "\6vb\33""0Jb$Kk\40Gj\40Gi\40Gj\242\255\271\274\303\314Qj\203\377\377"
+  "\377v\207\232\40Gi\324\330\336\274\303\314v\207\232\377\377\377Qj\203"
+  "!Gi\202\40Gj\1Rj\203\204\377\377\377\6\337\342\346<Yw\242\255\271\274"
+  "\303\314Qj\203\242\255\271\202<Yv\11\377\377\377dy\216\310\316\325\257"
+  "\270\303<Yw\365\366\367v\207\232\352\354\357\274\303\314\203\40Gj\32"
+  "\242\255\271\274\303\314\40Gj\224\241\257dy\216\40Gi\337\342\346\242"
+  "\255\271v\207\232\352\354\357\40Gi\324\330\336\224\241\257\274\303\314"
+  "\324\330\336\40Giv\207\232\377\377\377Rj\203\324\330\336\310\316\325"
+  "\40Gj\242\255\271\324\330\336!Gj\40Gj\203\40Gi\7\40Gj\274\303\314\242"
+  "\255\271\40Gi\40Hj\40Gi\40Gj\205\377\377\377\13Qj\203\242\255\271\352"
+  "\354\357<Zv\242\255\271\365\366\367<Yv\242\255\271\337\342\346v\207\232"
+  "\352\354\357\202\40Gj\30\352\354\357v\207\232\242\255\271\274\303\314"
+  "v\207\232\205\224\245\40Gj\205\224\245\365\366\367!Hj\242\255\271\274"
+  "\303\314Qj\203\377\377\377v\207\232\40Gj\324\330\336\274\303\314\242"
+  "\255\271\352\354\357\40Hj\224\241\257\352\354\357dy\216\202\40Gj\1\40"
+  "Hj\202\40Gi\6\40Gj\40Gi!Gj\40Gi\40Hi\40Gi\202\40Gj\202\40Gi\3\40Gj\40"
+  "Gi\40Gj\202!Hk\202\40Hk\2!Hk!Hl\203\40Hk\1\40Ik\203\40Hk\1!Ik\202\40"
+  "Hk\1\40Hl\202!Hk\1\40Hk\204!Hk\202\40Hk\1!Hk\202\40Hk\3!Hk\40Hk\40Hl"
+  "\203\40Hk\4!Hk\40Hk!Ik!Hl\202\40Hk\202!Hk\205\40Hk\203!Hk\12\40Hk!Hk"
+  "\40Hk!Hk\40Ik\40Hk\40Ik!Ik\40Hk!Hk\203\40Hk\11!Hk\40Hk!Hk\40Hk!Hk\242"
+  "\255\271\324\330\336!Hk\40Hl\202!Hk\4\352\354\357v\207\233\40Hk!Ik\202"
+  "\40Hk\24\224\241\257\365\366\367!Hk\40Ik\40Hkv\207\233\352\354\357\40"
+  "Hl!Hk\274\303\314\242\255\272\324\330\336\224\241\257\242\255\271\274"
+  "\303\314!HkRj\204\377\377\377v\207\233\352\354\357\202\40Hk\34\274\303"
+  "\314\242\255\271v\207\233\352\354\357\40Hk\377\377\377Qj\204\40Hk\274"
+  "\304\314\242\255\271!Hl\40Hk$Ik/AI\200`\7\305\220\0\360\260\1\340\246"
+  "\6IL8+Mj#Jl\40Hk!Hk\40Hk\242\255\271\274\303\314v\210\233\352\354\357"
+  "\202\40Hk\4\224\241\257\324\330\336v\207\233\352\354\357\204\40Hk\2Q"
+  "j\204\377\377\377\202\40Hk\202!Hk\6\40Hk\242\255\271\274\303\314\40H"
+  "k\257\270\303\324\330\336\202\377\377\377\202v\207\233\16\365\366\367"
+  "\224\241\257\352\354\357!Hk\224\241\257\365\366\367\377\377\377\352\354"
+  "\357ey\220\242\255\271\274\303\314\40Hk\242\255\271\274\303\314\202\377"
+  "\377\377\21\242\255\271v\207\233\352\354\357\40Ik\324\330\336\224\241"
+  "\257\324\330\336\224\241\257\40Hk\40Hl\352\354\357v\210\233\324\330\336"
+  "\224\241\257\40Hk\224\241\257\324\330\336\205\40Hk\12\274\303\314\242"
+  "\255\271\40Ik\40Hk!Hk\40Ik!Hk\377\377\377Rk\204!Hk\203\40Hk\12\242\255"
+  "\272\274\303\314!Hk\224\241\257\324\330\336\40Hk\224\241\257\352\354"
+  "\357v\210\233\352\354\357\202\40Hk\7\352\354\357v\207\233\242\255\271"
+  "\274\303\314Rj\204\274\303\314\337\342\346\202\377\377\377\5\40Hk\242"
+  "\255\271\274\303\314v\207\233\352\354\357\202\40Hk\7\224\241\257\324"
+  "\330\336\242\255\271\274\303\314!Hk<Zx\324\330\336\202\377\377\377\1"
+  "\274\303\314\202\40Hk\5\40Ik!Hk\40Hk\40Ik\40Hk\202!Hk\1\40Hk\202!Hk\2"
+  "\40Hk!Ik\202\40Hk\12!Il\40Il!Il\40Il!Il\40Il!Im!Il\40Im!Il\202!Im\1!"
+  "Il\202\40Im\1!Il\202!Im\4!Hm!Il!Im!Il\203!Im\204\40Im\202!Im\2!Il\40"
+  "Il\202!Im\3\40Im!Il!Im\202\40Il\204!Il\11\40Il!Im\40Im!Il!Hl!Il\40Il"
+  "!Il\40Im\202\40Il\7!Im!Il!Im\40Il!Il\40Im\40Il\202!Il\23!Im\40Im!Il\40"
+  "Il\40Hl<[z\365\366\367\274\304\314\40Im\40Il\242\255\272\365\366\367"
+  "v\210\233\337\343\347\274\304\314!Im\40Il\205\225\246\352\354\357\202"
+  "!Im6\40Ilv\210\234\377\377\377ez\221<[y\352\354\357\205\225\246\324\331"
+  "\336\224\241\260\242\255\272\324\331\336\40Im\242\255\272\377\377\377"
+  "Rk\206\377\377\377ez\220<[z\352\354\357\242\255\272v\210\233\352\354"
+  "\357\40Im\377\377\377Rk\206\40Im\274\304\314\242\255\272!Im\40Im!Il*"
+  "LjA>&\251}\1\310\223\1}c\25""5JX$Km!Il\40Im\40Il!Im\242\255\272\274\304"
+  "\315Qk\205\377\377\377v\210\233!Il\324\331\336\310\316\326v\210\233\352"
+  "\354\357\40Im!Il\202\40Il\3Qk\206\377\377\377!Il\202!Im\202!Il\11\242"
+  "\255\272\274\304\315\242\255\272\352\354\357<[yez\220\377\377\377v\210"
+  "\233!Il\202\352\354\357\3\242\255\272!Im!Il\202\40Im\1\337\343\347\202"
+  "\242\255\272\30\274\304\314dz\220\377\377\377ez\220<[y\352\354\357\242"
+  "\255\272v\210\233\352\354\357!Im\324\331\336\224\241\260\274\304\314"
+  "\324\331\336\40Ilv\210\234\377\377\377dy\220\324\331\336\224\241\260"
+  "!Im\224\241\260\324\331\336\40Il\202!Il\6ez\221\352\354\357\224\241\260"
+  "!Im!Hm!Il\202!Im)!Il\377\377\377Rk\206!Il!Hl!Il\40Hl\242\255\272\274"
+  "\304\314\40Il\224\241\260\324\331\336!Hm\224\241\260\352\354\357v\210"
+  "\233\365\366\367<[zdz\220\377\377\377v\210\233\242\255\272\274\304\314"
+  "\310\316\326\310\316\325\40Im\242\255\272\377\377\377\40Hl\242\255\272"
+  "\274\304\314Rk\206\377\377\377v\210\233!Il\324\331\336\310\316\326\242"
+  "\255\272\274\304\314\40Il!Im\202\40Im\4\206\225\246\377\377\377!Il\40"
+  "Im\202\40Il\202!Im\1!Il\205!Im\5\40Im\40Hl!Il!Hl!Jm\203!Jn\1!Jm\207!"
+  "Jn\3\40Jn!Jm!Jn\202!In\202!Jn\5!Im!Jn!In\40Jn!In\205!Jn\1!Im\203!Jn\3"
+  "!In\40Jn!Jm\203!Jn\1!In\202!Jn\202\40Jn\2!Jn!In\205!Jn\3!Jm!Jn!Jm\202"
+  "!Jn\1!In\203!Jn\2!Im!Jm\202!Jn\1!Im\202!Jm\5!Jn\40Jn!Jne{\221\324\331"
+  "\336\202\377\377\377\5\352\354\357\224\242\261!Jne{\222\337\343\347\202"
+  "\377\377\377\2\352\354\357v\210\234\203!Jn\17v\210\234\365\366\367\352"
+  "\354\357\377\377\377\274\304\315!In\324\331\336\224\242\261Rl\206\352"
+  "\354\357\377\377\377\324\331\336\377\377\377!In\242\256\272\202\377\377"
+  "\377\23\337\343\347\242\256\272v\210\234\352\354\357!Jn\377\377\377R"
+  "k\206!Jn\274\304\315\242\256\272!Jm!Im!Jm!Jn(Ic.>@5DE0Kb&Ln\203!Jn\6"
+  "!Jm!Jn\242\256\272\274\304\315!Jn\242\255\272\202\377\377\377\4\337\343"
+  "\347Ql\206v\210\234\352\354\357\202!In\6!Jn!InRk\206\377\377\377!In!"
+  "Jn\202!In\17\40Jn\242\256\272\274\304\315e{\222\365\366\367\377\377\377"
+  "\337\343\347\352\354\357\206\225\246\40Jn\242\256\272\377\377\377Rl\206"
+  "!Jm\324\331\336\202\377\377\377\14\337\343\347Rl\206\224\242\261\377"
+  "\377\377\242\255\272\337\343\347\377\377\377\352\354\357\324\331\336"
+  "\257\271\304Ql\206\377\377\377\202\324\331\336\3\224\242\261Rl\206\337"
+  "\343\347\202\377\377\377\13\242\256\272!Jn\324\331\336\224\242\261!I"
+  "n\224\242\261\324\331\336!In!Im!In\337\343\347\204\377\377\377\1Rl\206"
+  "\203!Jn\205\377\377\377\202\242\256\272\20\274\304\315!Jn\224\242\261"
+  "\324\331\336!Jn\224\241\261\352\354\357!In\324\331\336\377\377\377\337"
+  "\343\347\365\366\367v\210\234\242\256\272\274\304\315\224\242\261\202"
+  "\377\377\377\5\310\316\326\377\377\377<\\{\224\242\260\377\377\377\202"
+  "\242\256\272\202\377\377\377\7\337\343\347Rl\206\242\255\272\274\304"
+  "\315!Jnv\210\234\365\366\367\202\377\377\377\5\242\255\272\40Jn!In\40"
+  "Jn!Im\202!Jn\1\40Jn\205!Jn\1!Jm\202!Jn\1!In\202!Ko\3!Jo!Kp!Jo\202!Kp"
+  "\202!Jo\3!Ko!Kp!Jo\202!Ko\4!Jp\"Jo!Ko!Jo\203!Ko\11\"Kp!Jp!Ko!Jo!Ko\""
+  "Jo!Kp!Jo\"Ko\202!Ko\1!Jo\202!Ko\3\"Jo!Ko!Jo\202!Ko\10!Kp\"Jp!Kp!Jo!J"
+  "p\"Jo!Ko!Kp\204!Ko\1!Kp\203!Ko\"\"Ko!Jo!Ko!Jp!Kp!Ko!Jo!Kp\"Jo!Jo!Kp!"
+  "Jp!Ko!Jo!Jp\"Ko!Kp!Jo!Ko!Jo\"Jo!Ko!Jo!Ko!Kp!Ko\"Jo!Kp!Jo!Kov\211\235"
+  "\352\354\357\"Kp\"Ko\203!Jo\2!Kp!Jo\202!Ko\10\"Jo!KpRl\210\377\377\377"
+  "e{\222=]{\352\354\357\242\256\273\202!Jo\4!Ko!Jo\"Ko!Jo\205!Ko\6!Kp!"
+  "Ko#Lo#Ko\"Jp\"Kp\202!Ko\3\"Ko!Jo\"Ko\205!Ko\3!Jo!Jp\"Jp\202!Jo\3!Kp!"
+  "Jo!Jp\202!Jo\202!Ko\5!Jo!Kp!Ko!Jo!Ko\202!Jo\202!Kp\202!Jo\4!Kp\"Jo\274"
+  "\304\315\324\331\337\205!Jo\2!Ko!Kp\202!Ko\1!Jo\202!Ko\2!Jo!Kp\203!K"
+  "o\202!Jo\6\"Ko!Jo\"Ko!Jp!Ko!Jo\202!Ko\13!Kp!Ko!Jo!Ko!Jp!Kp\"Ko!Ko\"K"
+  "o!Kp!Jo\205!Ko\2\"Jp!Ko\202!Jo\202!Ko\202!Jo\6!Ko!Jo\"Kp!Jp!Kp!Jp\202"
+  "!Ko\3!Jo!Jp!Jo\203!Ko\1!Jo\203!Ko\2!Jp!Jo\203!Ko\3!Kp!Ko!Jo\202!Ko\2"
+  "!Jo!Ko\202!Jo\202!Ko\2!Kp\"Ko\203!Ko\1!Jp\205!Ko\5!Kp!Jo!Jp!Ko!Jo\202"
+  "!Ko\20!Lq!Kp\"Kq!Kq!Kp\"Lq!Kq!Lq\"Lp!Kp\"Kq!Lq!Kq!Lq\"Lp\"Lq\202!Lq\10"
+  "\"Lq!Lp!Kp\"Lp!Lp!Kq\"Lq!Lq\202!Kq\11\"Kq!Kq!Lq\"Lp!Kq!Lp!Kq!Lq\"Lp\202"
+  "\"Kq\202!Lq\6\"Kp!Lq\"Lp!Kq\"Lp\"Kp\202!Kp\12!Lp\"Kp\"Kq\"Kp!Kq!Kp!K"
+  "q!Lq!Kq!Lq\202!Kq\15\"Lp\"Kp!Kq\"Lp!Kp!Lp!Kp\"Kp\"Lq!Kq!Lq!Kp!Lp\202"
+  "!Lq\2!Kp\"Lp\202!Kq\13!Kp!Lp\"Kp!Kp!Lp\"Lqv\211\236\352\354\357!Lp!K"
+  "q\"Lq\202!Lq\3!Lp\"Kq\"Lq\202!Lq\3!Kq!Lq\274\304\315\202\377\377\377"
+  "\6\324\331\337=]}\"Kq!Lq!Kq\"Kq\202!Lp\14\"Lp!Kq!Lq\"Lp!Lp!Lq!Kq!Lq!"
+  "Lp\"Lp!Kp\"Kq\202!Kp\202\"Kq\1!Lp\202!Lq\202!Lp\4\"Lq!Lp!Kp!Lp\202!K"
+  "q\11\"Lq\"Kq!Lq!Kp\"Kq!Kq\"Lp!Lq!Kq\203\"Lp\11\"Kq!Lq\"Kq\"Lq\"Lp!Kq"
+  "\324\331\337\377\377\377e|\224\203!Kq\20\"Lp!Kq!Lp\"Lp!Kq\"Kp!Kq!Lq\""
+  "Lp!Kp!Lp\"Kq\"Lq!Kp!Kq\"Kq\203\"Lp\23!Kp!Kq\"Lq!Kq!Lq!Kp\"Kp!Kq!Lp!K"
+  "q!Lq\"Lq\"Kp!Lp\"Kp!Lp\"Lq!Lq!Lp\203!Kq\6!Kp\"Lp\"Lq!Kq!Lp\"Lp\202!L"
+  "p\202\"Lp\10!Kp!Kq!Kp!Kq!Lp!Kq!Lp!Kp\202!Kq\13!Lp\"Kq!Kp!Lp\"Lq!Lp!K"
+  "q\"Kp!Lq!Kq!Kp\202!Lq\202!Kp\1\"Lp\202!Kq\6!Lq!Kq!Lq\"Lq!Kq!Lp\202!L"
+  "q\203!Kq\3!Kp!Lp\"Kq\202!Kp\1\"Lq\202!Kq\4!Lq\"Kq!Lr!Mr\202!Lr\2!Mr!"
+  "Lr\203\"Lr\203\"Mr\202!Lr\204\"Lr\2!Lr\"Lr\202!Lr\202\"Mr\2!Lr\"Lr\202"
+  "!Lr\202\"Lr\4!Lr\"Lr!Lr\"Lr\202!Lr\10\"Lr!Lr\"Mr\"Lr!Lr\"Mr\"Lr!Mr\202"
+  "\"Lr\4!Lr\"Mr!Lr\"Mr\203\"Lr\4!Mq\"Lr\"Mr!Lr\202\"Lr\1!Lr\204\"Lr\10"
+  "!Lr\"Mr!Mr\"Lr\"Mr\"Lr!Mr\"Mr\203!Lr\6\"Lr!Lr\"Lr\"Lq!Lr\"Mr\204!Lr\202"
+  "\"Lr\7!Mr!Lr\"Mr!Mr!Mq\"Mr\"Lr\203!Lr\6!Mr!Lr\"Mr\"Lr!Mr!Lr\202\"Lr\14"
+  "!Lr!Mr\"Lq!Lr\"Lr!Mr!Lr\"Mr\"Lr!Mr\"Lr!Lr\204\"Lr\2\"Mr!Mr\202!Lr\2!"
+  "Mr\"Mr\202\"Lr\202!Mr\1!Lr\203\"Lr\5!Lr\"Lr!Lr\"Lr!Lr\202\"Lr\202\"M"
+  "r\1\"Lr\202!Mr\202\"Mr\3\"Lr!Lr!Mr\202!Lr\2\"Lr!Lr\203\"Lr\11!Lr\"Mr"
+  "\"Lr\"Mr!Mr\"Lq!Lr\"Mr!Mr\202!Lr\3\"Lr!Lr\"Lr\202!Mr\202\"Lr\11!Lr\""
+  "Lr!Lr\"Lr!Lr\"Lr!Mr!Lr\"Mr\202!Lr\2\"Lr!Mr\204\"Lr\4!Lr\"Lr\"Mr\"Lr\202"
+  "\"Mr\7!Lr\"Lr!Lr!Mr!Lr\"Lr\"Mr\202\"Lr\202!Lr\5\"Lr!Mr!Lr\"Mr!Lr\203"
+  "\"Lr\6!Lr!Mr\"Mr!Mr!Lr\"Lr\203!Lr\4\"Mr\"Lr!Lr!Mr\204\"Lr\1\"Mr\203\""
+  "Lr\4!Mr!Lr\"Lr\"Mr\202\"Lr\5\"Mr\"Lr!Lr\"Lr\"Mr\202\"Lr\3\"Mr!Lr\"Mr"
+  "\203\"Lr\3!Lr\"Lr!Lr",
+};
+
+
diff --git a/plugins/GSdx_legacy/GSLocalMemory.cpp b/plugins/GSdx_legacy/GSLocalMemory.cpp
new file mode 100644
index 0000000000..537a3f088f
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLocalMemory.cpp
@@ -0,0 +1,2126 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ *	Special Notes:
+ *
+ *	Based on Page.c from GSSoft
+ *	Copyright (C) 2002-2004 GSsoft Team
+ *
+ */
+
+#include "stdafx.h"
+#include "GSLocalMemory.h"
+
+#define ASSERT_BLOCK(r, w, h) /* r must span at least w x h and have every edge clear of the (w - 1) / (h - 1) mask, i.e. block-aligned when w and h are powers of two */ \
+	ASSERT((r).width() >= (w) && (r).height() >= (h) && !((r).left & ((w) - 1)) && !((r).top & ((h) - 1)) && !((r).right & ((w) - 1)) && !((r).bottom & ((h) - 1))); \
+
+#define FOREACH_BLOCK_START(r, w, h, bpp) /* opens a y/x loop pair over r in steps of one w x h block; uses dst, dstpitch and off from the expansion site and must be closed with FOREACH_BLOCK_END */ \
+	ASSERT_BLOCK(r, w, h); \
+	GSVector4i _r = (r) >> 3; /* loop bounds are the rect coordinates shifted right by 3 */ \
+	uint8* _dst = dst - _r.left * (bpp); /* biased so that _dst[x * bpp] lands on dst when x == _r.left */ \
+	int _offset = dstpitch * (h); /* _dst advance applied after each outer (y) iteration */ \
+	for(int y = _r.top; y < _r.bottom; y += (h) >> 3, _dst += _offset) \
+	{ \
+		uint32 _base = off->block.row[y]; \
+		for(int x = _r.left; x < _r.right; x += (w) >> 3) \
+		{ \
+			const uint8* src = BlockPtr(_base + off->block.col[x]); \
+			uint8* dst = &_dst[x * (bpp)]; /* shadows the outer dst inside the loop body */ \
+
+#define FOREACH_BLOCK_END }} // closes the two loops opened by FOREACH_BLOCK_START
+
+// Static lookup tables, filled in by the GSLocalMemory constructor.
+
+uint32 GSLocalMemory::pageOffset32[32][32][64]; // each pageOffsetN[bp][y][x] = PixelAddressOrgN(x, y, bp, 0)
+uint32 GSLocalMemory::pageOffset32Z[32][32][64];
+uint32 GSLocalMemory::pageOffset16[32][64][64];
+uint32 GSLocalMemory::pageOffset16S[32][64][64];
+uint32 GSLocalMemory::pageOffset16Z[32][64][64];
+uint32 GSLocalMemory::pageOffset16SZ[32][64][64];
+uint32 GSLocalMemory::pageOffset8[32][64][128];
+uint32 GSLocalMemory::pageOffset4[32][128][128];
+
+int GSLocalMemory::rowOffset32[4096]; // each rowOffsetN[x] = PixelAddressN(x & 0x7ff, 0, 0, 32) - PixelAddressN(0, 0, 0, 32)
+int GSLocalMemory::rowOffset32Z[4096];
+int GSLocalMemory::rowOffset16[4096];
+int GSLocalMemory::rowOffset16S[4096];
+int GSLocalMemory::rowOffset16Z[4096];
+int GSLocalMemory::rowOffset16SZ[4096];
+int GSLocalMemory::rowOffset8[2][4096]; // [0] is computed at y = 0, [1] at y = 2
+int GSLocalMemory::rowOffset4[2][4096]; // [0] is computed at y = 0, [1] at y = 2
+
+short GSLocalMemory::blockOffset32[256]; // each blockOffsetN[x] = (short)(BlockNumberN(x << 3, 0, 0, 32) - BlockNumberN(0, 0, 0, 32))
+short GSLocalMemory::blockOffset32Z[256];
+short GSLocalMemory::blockOffset16[256];
+short GSLocalMemory::blockOffset16S[256];
+short GSLocalMemory::blockOffset16Z[256];
+short GSLocalMemory::blockOffset16SZ[256];
+short GSLocalMemory::blockOffset8[256];
+short GSLocalMemory::blockOffset4[256];
+
+// Per-format table; the constructor seeds all 64 entries with the *32 defaults, then overrides selected PSM_* slots.
+
+GSLocalMemory::psm_t GSLocalMemory::m_psm[64];
+
+//
+
+GSLocalMemory::GSLocalMemory()
+	: m_clut(this)
+{
+	m_vm8 = (uint8*)vmalloc(m_vmsize * 2, false);
+	m_vm16 = (uint16*)m_vm8;
+	m_vm32 = (uint32*)m_vm8;
+
+	memset(m_vm8, 0, m_vmsize);
+
+	for(int bp = 0; bp < 32; bp++)
+	{
+		for(int y = 0; y < 32; y++) for(int x = 0; x < 64; x++)
+		{
+			pageOffset32[bp][y][x] = PixelAddressOrg32(x, y, bp, 0);
+			pageOffset32Z[bp][y][x] = PixelAddressOrg32Z(x, y, bp, 0);
+		}
+
+		for(int y = 0; y < 64; y++) for(int x = 0; x < 64; x++)
+		{
+			pageOffset16[bp][y][x] = PixelAddressOrg16(x, y, bp, 0);
+			pageOffset16S[bp][y][x] = PixelAddressOrg16S(x, y, bp, 0);
+			pageOffset16Z[bp][y][x] = PixelAddressOrg16Z(x, y, bp, 0);
+			pageOffset16SZ[bp][y][x] = PixelAddressOrg16SZ(x, y, bp, 0);
+		}
+
+		for(int y = 0; y < 64; y++) for(int x = 0; x < 128; x++)
+		{
+			pageOffset8[bp][y][x] = PixelAddressOrg8(x, y, bp, 0);
+		}
+
+		for(int y = 0; y < 128; y++) for(int x = 0; x < 128; x++)
+		{
+			pageOffset4[bp][y][x] = PixelAddressOrg4(x, y, bp, 0);
+		}
+	}
+
+	for(size_t x = 0; x < countof(rowOffset32); x++)
+	{
+		rowOffset32[x] = (int)PixelAddress32(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset32Z); x++)
+	{
+		rowOffset32Z[x] = (int)PixelAddress32Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset16); x++)
+	{
+		rowOffset16[x] = (int)PixelAddress16(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset16S); x++)
+	{
+		rowOffset16S[x] = (int)PixelAddress16S(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset16Z); x++)
+	{
+		rowOffset16Z[x] = (int)PixelAddress16Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset16SZ); x++)
+	{
+		rowOffset16SZ[x] = (int)PixelAddress16SZ(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset8[0]); x++)
+	{
+		rowOffset8[0][x] = (int)PixelAddress8(x & 0x7ff, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
+		rowOffset8[1][x] = (int)PixelAddress8(x & 0x7ff, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(rowOffset4[0]); x++)
+	{
+		rowOffset4[0][x] = (int)PixelAddress4(x & 0x7ff, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
+		rowOffset4[1][x] = (int)PixelAddress4(x & 0x7ff, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
+	}
+
+	for(size_t x = 0; x < countof(blockOffset32); x++)
+	{
+		blockOffset32[x] = (short)((int)BlockNumber32(x << 3, 0, 0, 32) - (int)BlockNumber32(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset32Z); x++)
+	{
+		blockOffset32Z[x] = (short)((int)BlockNumber32Z(x << 3, 0, 0, 32) - (int)BlockNumber32Z(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset16); x++)
+	{
+		blockOffset16[x] = (short)((int)BlockNumber16(x << 3, 0, 0, 32) - (int)BlockNumber16(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset16S); x++)
+	{
+		blockOffset16S[x] = (short)((int)BlockNumber16S(x << 3, 0, 0, 32) - (int)BlockNumber16S(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset16Z); x++)
+	{
+		blockOffset16Z[x] = (short)((int)BlockNumber16Z(x << 3, 0, 0, 32) - (int)BlockNumber16Z(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset16SZ); x++)
+	{
+		blockOffset16SZ[x] = (short)((int)BlockNumber16SZ(x << 3, 0, 0, 32) - (int)BlockNumber16SZ(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset8); x++)
+	{
+		blockOffset8[x] = (short)((int)BlockNumber8(x << 3, 0, 0, 32) - (int)BlockNumber8(0, 0, 0, 32));
+	}
+
+	for(size_t x = 0; x < countof(blockOffset4); x++)
+	{
+		blockOffset4[x] = (short)((int)BlockNumber4(x << 3, 0, 0, 32) - (int)BlockNumber4(0, 0, 0, 32));
+	}
+
+	for(size_t i = 0; i < countof(m_psm); i++)
+	{
+		m_psm[i].pa = &GSLocalMemory::PixelAddress32;
+		m_psm[i].bn = &GSLocalMemory::BlockNumber32;
+		m_psm[i].rp = &GSLocalMemory::ReadPixel32;
+		m_psm[i].rpa = &GSLocalMemory::ReadPixel32;
+		m_psm[i].wp = &GSLocalMemory::WritePixel32;
+		m_psm[i].wpa = &GSLocalMemory::WritePixel32;
+		m_psm[i].rt = &GSLocalMemory::ReadTexel32;
+		m_psm[i].rta = &GSLocalMemory::ReadTexel32;
+		m_psm[i].wfa = &GSLocalMemory::WritePixel32;
+		m_psm[i].wi = &GSLocalMemory::WriteImage<PSM_PSMCT32, 8, 8, 32>;
+		m_psm[i].ri = &GSLocalMemory::ReadImageX; // TODO
+		m_psm[i].rtx = &GSLocalMemory::ReadTexture32;
+		m_psm[i].rtxP = &GSLocalMemory::ReadTexture32;
+		m_psm[i].rtxb = &GSLocalMemory::ReadTextureBlock32;
+		m_psm[i].rtxbP = &GSLocalMemory::ReadTextureBlock32;
+		m_psm[i].bpp = m_psm[i].trbpp = 32;
+		m_psm[i].pal = 0;
+		m_psm[i].bs = GSVector2i(8, 8);
+		m_psm[i].pgs = GSVector2i(64, 32);
+		for(int j = 0; j < 8; j++) m_psm[i].rowOffset[j] = rowOffset32;
+		m_psm[i].blockOffset = blockOffset32;
+		m_psm[i].msk = 0xff;
+	}
+
+	m_psm[PSM_PSMCT16].pa = &GSLocalMemory::PixelAddress16;
+	m_psm[PSM_PSMCT16S].pa = &GSLocalMemory::PixelAddress16S;
+	m_psm[PSM_PSMT8].pa = &GSLocalMemory::PixelAddress8;
+	m_psm[PSM_PSMT4].pa = &GSLocalMemory::PixelAddress4;
+	m_psm[PSM_PSMZ32].pa = &GSLocalMemory::PixelAddress32Z;
+	m_psm[PSM_PSMZ24].pa = &GSLocalMemory::PixelAddress32Z;
+	m_psm[PSM_PSMZ16].pa = &GSLocalMemory::PixelAddress16Z;
+	m_psm[PSM_PSMZ16S].pa = &GSLocalMemory::PixelAddress16SZ;
+
+	m_psm[PSM_PSMCT16].bn = &GSLocalMemory::BlockNumber16;
+	m_psm[PSM_PSMCT16S].bn = &GSLocalMemory::BlockNumber16S;
+	m_psm[PSM_PSMT8].bn = &GSLocalMemory::BlockNumber8;
+	m_psm[PSM_PSMT4].bn = &GSLocalMemory::BlockNumber4;
+	m_psm[PSM_PSMZ32].bn = &GSLocalMemory::BlockNumber32Z;
+	m_psm[PSM_PSMZ24].bn = &GSLocalMemory::BlockNumber32Z;
+	m_psm[PSM_PSMZ16].bn = &GSLocalMemory::BlockNumber16Z;
+	m_psm[PSM_PSMZ16S].bn = &GSLocalMemory::BlockNumber16SZ;
+
+	m_psm[PSM_PSMCT24].rp = &GSLocalMemory::ReadPixel24;
+	m_psm[PSM_PSMCT16].rp = &GSLocalMemory::ReadPixel16;
+	m_psm[PSM_PSMCT16S].rp = &GSLocalMemory::ReadPixel16S;
+	m_psm[PSM_PSMT8].rp = &GSLocalMemory::ReadPixel8;
+	m_psm[PSM_PSMT4].rp = &GSLocalMemory::ReadPixel4;
+	m_psm[PSM_PSMT8H].rp = &GSLocalMemory::ReadPixel8H;
+	m_psm[PSM_PSMT4HL].rp = &GSLocalMemory::ReadPixel4HL;
+	m_psm[PSM_PSMT4HH].rp = &GSLocalMemory::ReadPixel4HH;
+	m_psm[PSM_PSMZ32].rp = &GSLocalMemory::ReadPixel32Z;
+	m_psm[PSM_PSMZ24].rp = &GSLocalMemory::ReadPixel24Z;
+	m_psm[PSM_PSMZ16].rp = &GSLocalMemory::ReadPixel16Z;
+	m_psm[PSM_PSMZ16S].rp = &GSLocalMemory::ReadPixel16SZ;
+
+	m_psm[PSM_PSMCT24].rpa = &GSLocalMemory::ReadPixel24;
+	m_psm[PSM_PSMCT16].rpa = &GSLocalMemory::ReadPixel16;
+	m_psm[PSM_PSMCT16S].rpa = &GSLocalMemory::ReadPixel16;
+	m_psm[PSM_PSMT8].rpa = &GSLocalMemory::ReadPixel8;
+	m_psm[PSM_PSMT4].rpa = &GSLocalMemory::ReadPixel4;
+	m_psm[PSM_PSMT8H].rpa = &GSLocalMemory::ReadPixel8H;
+	m_psm[PSM_PSMT4HL].rpa = &GSLocalMemory::ReadPixel4HL;
+	m_psm[PSM_PSMT4HH].rpa = &GSLocalMemory::ReadPixel4HH;
+	m_psm[PSM_PSMZ32].rpa = &GSLocalMemory::ReadPixel32;
+	m_psm[PSM_PSMZ24].rpa = &GSLocalMemory::ReadPixel24;
+	m_psm[PSM_PSMZ16].rpa = &GSLocalMemory::ReadPixel16;
+	m_psm[PSM_PSMZ16S].rpa = &GSLocalMemory::ReadPixel16;
+
+	m_psm[PSM_PSMCT32].wp = &GSLocalMemory::WritePixel32;
+	m_psm[PSM_PSMCT24].wp = &GSLocalMemory::WritePixel24;
+	m_psm[PSM_PSMCT16].wp = &GSLocalMemory::WritePixel16;
+	m_psm[PSM_PSMCT16S].wp = &GSLocalMemory::WritePixel16S;
+	m_psm[PSM_PSMT8].wp = &GSLocalMemory::WritePixel8;
+	m_psm[PSM_PSMT4].wp = &GSLocalMemory::WritePixel4;
+	m_psm[PSM_PSMT8H].wp = &GSLocalMemory::WritePixel8H;
+	m_psm[PSM_PSMT4HL].wp = &GSLocalMemory::WritePixel4HL;
+	m_psm[PSM_PSMT4HH].wp = &GSLocalMemory::WritePixel4HH;
+	m_psm[PSM_PSMZ32].wp = &GSLocalMemory::WritePixel32Z;
+	m_psm[PSM_PSMZ24].wp = &GSLocalMemory::WritePixel24Z;
+	m_psm[PSM_PSMZ16].wp = &GSLocalMemory::WritePixel16Z;
+	m_psm[PSM_PSMZ16S].wp = &GSLocalMemory::WritePixel16SZ;
+
+	m_psm[PSM_PSMCT32].wpa = &GSLocalMemory::WritePixel32;
+	m_psm[PSM_PSMCT24].wpa = &GSLocalMemory::WritePixel24;
+	m_psm[PSM_PSMCT16].wpa = &GSLocalMemory::WritePixel16;
+	m_psm[PSM_PSMCT16S].wpa = &GSLocalMemory::WritePixel16;
+	m_psm[PSM_PSMT8].wpa = &GSLocalMemory::WritePixel8;
+	m_psm[PSM_PSMT4].wpa = &GSLocalMemory::WritePixel4;
+	m_psm[PSM_PSMT8H].wpa = &GSLocalMemory::WritePixel8H;
+	m_psm[PSM_PSMT4HL].wpa = &GSLocalMemory::WritePixel4HL;
+	m_psm[PSM_PSMT4HH].wpa = &GSLocalMemory::WritePixel4HH;
+	m_psm[PSM_PSMZ32].wpa = &GSLocalMemory::WritePixel32;
+	m_psm[PSM_PSMZ24].wpa = &GSLocalMemory::WritePixel24;
+	m_psm[PSM_PSMZ16].wpa = &GSLocalMemory::WritePixel16;
+	m_psm[PSM_PSMZ16S].wpa = &GSLocalMemory::WritePixel16;
+
+	m_psm[PSM_PSMCT24].rt = &GSLocalMemory::ReadTexel24;
+	m_psm[PSM_PSMCT16].rt = &GSLocalMemory::ReadTexel16;
+	m_psm[PSM_PSMCT16S].rt = &GSLocalMemory::ReadTexel16S;
+	m_psm[PSM_PSMT8].rt = &GSLocalMemory::ReadTexel8;
+	m_psm[PSM_PSMT4].rt = &GSLocalMemory::ReadTexel4;
+	m_psm[PSM_PSMT8H].rt = &GSLocalMemory::ReadTexel8H;
+	m_psm[PSM_PSMT4HL].rt = &GSLocalMemory::ReadTexel4HL;
+	m_psm[PSM_PSMT4HH].rt = &GSLocalMemory::ReadTexel4HH;
+	m_psm[PSM_PSMZ32].rt = &GSLocalMemory::ReadTexel32Z;
+	m_psm[PSM_PSMZ24].rt = &GSLocalMemory::ReadTexel24Z;
+	m_psm[PSM_PSMZ16].rt = &GSLocalMemory::ReadTexel16Z;
+	m_psm[PSM_PSMZ16S].rt = &GSLocalMemory::ReadTexel16SZ;
+
+	m_psm[PSM_PSMCT24].rta = &GSLocalMemory::ReadTexel24;
+	m_psm[PSM_PSMCT16].rta = &GSLocalMemory::ReadTexel16;
+	m_psm[PSM_PSMCT16S].rta = &GSLocalMemory::ReadTexel16;
+	m_psm[PSM_PSMT8].rta = &GSLocalMemory::ReadTexel8;
+	m_psm[PSM_PSMT4].rta = &GSLocalMemory::ReadTexel4;
+	m_psm[PSM_PSMT8H].rta = &GSLocalMemory::ReadTexel8H;
+	m_psm[PSM_PSMT4HL].rta = &GSLocalMemory::ReadTexel4HL;
+	m_psm[PSM_PSMT4HH].rta = &GSLocalMemory::ReadTexel4HH;
+	m_psm[PSM_PSMZ24].rta = &GSLocalMemory::ReadTexel24;
+	m_psm[PSM_PSMZ16].rta = &GSLocalMemory::ReadTexel16;
+	m_psm[PSM_PSMZ16S].rta = &GSLocalMemory::ReadTexel16;
+
+	m_psm[PSM_PSMCT24].wfa = &GSLocalMemory::WritePixel24;
+	m_psm[PSM_PSMCT16].wfa = &GSLocalMemory::WriteFrame16;
+	m_psm[PSM_PSMCT16S].wfa = &GSLocalMemory::WriteFrame16;
+	m_psm[PSM_PSMZ24].wfa = &GSLocalMemory::WritePixel24;
+	m_psm[PSM_PSMZ16].wfa = &GSLocalMemory::WriteFrame16;
+	m_psm[PSM_PSMZ16S].wfa = &GSLocalMemory::WriteFrame16;
+
+	m_psm[PSM_PSMCT24].wi = &GSLocalMemory::WriteImage24; // TODO
+	m_psm[PSM_PSMCT16].wi = &GSLocalMemory::WriteImage<PSM_PSMCT16, 16, 8, 16>;
+	m_psm[PSM_PSMCT16S].wi = &GSLocalMemory::WriteImage<PSM_PSMCT16S, 16, 8, 16>;
+	m_psm[PSM_PSMT8].wi = &GSLocalMemory::WriteImage<PSM_PSMT8, 16, 16, 8>;
+	m_psm[PSM_PSMT4].wi = &GSLocalMemory::WriteImage<PSM_PSMT4, 32, 16, 4>;
+	m_psm[PSM_PSMT8H].wi = &GSLocalMemory::WriteImage8H; // TODO
+	m_psm[PSM_PSMT4HL].wi = &GSLocalMemory::WriteImage4HL; // TODO
+	m_psm[PSM_PSMT4HH].wi = &GSLocalMemory::WriteImage4HH; // TODO
+	m_psm[PSM_PSMZ32].wi = &GSLocalMemory::WriteImage<PSM_PSMZ32, 8, 8, 32>;
+	m_psm[PSM_PSMZ24].wi = &GSLocalMemory::WriteImage24Z; // TODO
+	m_psm[PSM_PSMZ16].wi = &GSLocalMemory::WriteImage<PSM_PSMZ16, 16, 8, 16>;
+	m_psm[PSM_PSMZ16S].wi = &GSLocalMemory::WriteImage<PSM_PSMZ16S, 16, 8, 16>;
+
+	m_psm[PSM_PSMCT24].rtx = &GSLocalMemory::ReadTexture24;
+	m_psm[PSM_PSMCT16].rtx = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8;
+	m_psm[PSM_PSMT4].rtx = &GSLocalMemory::ReadTexture4;
+	m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8H;
+	m_psm[PSM_PSMT4HL].rtx = &GSLocalMemory::ReadTexture4HL;
+	m_psm[PSM_PSMT4HH].rtx = &GSLocalMemory::ReadTexture4HH;
+	m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32;
+	m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24;
+	m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16;
+
+	m_psm[PSM_PSMCT24].rtxP = &GSLocalMemory::ReadTexture24;
+	m_psm[PSM_PSMCT16].rtxP = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMT8].rtxP = &GSLocalMemory::ReadTexture8P;
+	m_psm[PSM_PSMT4].rtxP = &GSLocalMemory::ReadTexture4P;
+	m_psm[PSM_PSMT8H].rtxP = &GSLocalMemory::ReadTexture8HP;
+	m_psm[PSM_PSMT4HL].rtxP = &GSLocalMemory::ReadTexture4HLP;
+	m_psm[PSM_PSMT4HH].rtxP = &GSLocalMemory::ReadTexture4HHP;
+	m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32;
+	m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24;
+	m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16;
+	m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16;
+
+	m_psm[PSM_PSMCT24].rtxb = &GSLocalMemory::ReadTextureBlock24;
+	m_psm[PSM_PSMCT16].rtxb = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8;
+	m_psm[PSM_PSMT4].rtxb = &GSLocalMemory::ReadTextureBlock4;
+	m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8H;
+	m_psm[PSM_PSMT4HL].rtxb = &GSLocalMemory::ReadTextureBlock4HL;
+	m_psm[PSM_PSMT4HH].rtxb = &GSLocalMemory::ReadTextureBlock4HH;
+	m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32;
+	m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24;
+	m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
+
+	m_psm[PSM_PSMCT24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
+	m_psm[PSM_PSMCT16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMT8].rtxbP = &GSLocalMemory::ReadTextureBlock8P;
+	m_psm[PSM_PSMT4].rtxbP = &GSLocalMemory::ReadTextureBlock4P;
+	m_psm[PSM_PSMT8H].rtxbP = &GSLocalMemory::ReadTextureBlock8HP;
+	m_psm[PSM_PSMT4HL].rtxbP = &GSLocalMemory::ReadTextureBlock4HLP;
+	m_psm[PSM_PSMT4HH].rtxbP = &GSLocalMemory::ReadTextureBlock4HHP;
+	m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32;
+	m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
+	m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
+	m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
+
+	m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16;
+	m_psm[PSM_PSMT8].bpp = 8;
+	m_psm[PSM_PSMT4].bpp = 4;
+	m_psm[PSM_PSMZ16].bpp = m_psm[PSM_PSMZ16S].bpp = 16;
+
+	m_psm[PSM_PSMCT24].trbpp = 24;
+	m_psm[PSM_PSMCT16].trbpp = m_psm[PSM_PSMCT16S].trbpp = 16;
+	m_psm[PSM_PSMT8].trbpp = m_psm[PSM_PSMT8H].trbpp = 8;
+	m_psm[PSM_PSMT4].trbpp = m_psm[PSM_PSMT4HL].trbpp = m_psm[PSM_PSMT4HH].trbpp = 4;
+	m_psm[PSM_PSMZ24].trbpp = 24;
+	m_psm[PSM_PSMZ16].trbpp = m_psm[PSM_PSMZ16S].trbpp = 16;
+
+	m_psm[PSM_PSMT8].pal = m_psm[PSM_PSMT8H].pal = 256;
+	m_psm[PSM_PSMT4].pal = m_psm[PSM_PSMT4HL].pal = m_psm[PSM_PSMT4HH].pal = 16;
+
+	for(size_t i = 0; i < countof(m_psm); i++) m_psm[i].fmt = 3;
+	m_psm[PSM_PSMCT32].fmt = m_psm[PSM_PSMZ32].fmt = 0;
+	m_psm[PSM_PSMCT24].fmt = m_psm[PSM_PSMZ24].fmt = 1;
+	m_psm[PSM_PSMCT16].fmt = m_psm[PSM_PSMZ16].fmt = 2;
+	m_psm[PSM_PSMCT16S].fmt = m_psm[PSM_PSMZ16S].fmt = 2;
+
+	m_psm[PSM_PSMCT16].bs = m_psm[PSM_PSMCT16S].bs = GSVector2i(16, 8);
+	m_psm[PSM_PSMT8].bs = GSVector2i(16, 16);
+	m_psm[PSM_PSMT4].bs = GSVector2i(32, 16);
+	m_psm[PSM_PSMZ16].bs = m_psm[PSM_PSMZ16S].bs = GSVector2i(16, 8);
+
+	m_psm[PSM_PSMCT16].pgs = m_psm[PSM_PSMCT16S].pgs = GSVector2i(64, 64);
+	m_psm[PSM_PSMT8].pgs = GSVector2i(128, 64);
+	m_psm[PSM_PSMT4].pgs = GSVector2i(128, 128);
+	m_psm[PSM_PSMZ16].pgs = m_psm[PSM_PSMZ16S].pgs = GSVector2i(64, 64);
+
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMCT16].rowOffset[i] = rowOffset16;
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMCT16S].rowOffset[i] = rowOffset16S;
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMT8].rowOffset[i] = rowOffset8[((i + 2) >> 2) & 1];
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMT4].rowOffset[i] = rowOffset4[((i + 2) >> 2) & 1];
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMZ32].rowOffset[i] = rowOffset32Z;
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMZ24].rowOffset[i] = rowOffset32Z;
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMZ16].rowOffset[i] = rowOffset16Z;
+	for(int i = 0; i < 8; i++) m_psm[PSM_PSMZ16S].rowOffset[i] = rowOffset16SZ;
+
+	m_psm[PSM_PSMCT16].blockOffset = blockOffset16;
+	m_psm[PSM_PSMCT16S].blockOffset = blockOffset16S;
+	m_psm[PSM_PSMT8].blockOffset = blockOffset8;
+	m_psm[PSM_PSMT4].blockOffset = blockOffset4;
+	m_psm[PSM_PSMZ32].blockOffset = blockOffset32Z;
+	m_psm[PSM_PSMZ24].blockOffset = blockOffset32Z;
+	m_psm[PSM_PSMZ16].blockOffset = blockOffset16Z;
+	m_psm[PSM_PSMZ16S].blockOffset = blockOffset16SZ;
+
+	m_psm[PSM_PSMCT24].msk = 0x3f;
+	m_psm[PSM_PSMZ24].msk = 0x3f;
+	m_psm[PSM_PSMT8H].msk = 0xc0;
+	m_psm[PSM_PSMT4HL].msk = 0x40;
+	m_psm[PSM_PSMT4HH].msk = 0x80;
+}
+
+GSLocalMemory::~GSLocalMemory()
+{
+	vmfree(m_vm8, m_vmsize * 2);
+	// Free the tables held as mapped values by the three offset caches. NOTE(review): GetOffset fills m_omap with `new GSOffset`; confirm aligned_free_second is the matching release.
+	for_each(m_omap.begin(), m_omap.end(), aligned_free_second());
+	for_each(m_pomap.begin(), m_pomap.end(), aligned_free_second());
+	for_each(m_po4map.begin(), m_po4map.end(), aligned_free_second());
+	// Each page-to-tile entry is an array created with new[] in GetPage2TileMap, hence delete[].
+	for(hash_map<uint64, vector<GSVector2i>*>::iterator entry = m_p2tmap.begin(), last = m_p2tmap.end(); entry != last; ++entry)
+	{
+		delete [] entry->second;
+	}
+}
+
+// Returns the GSOffset for (bp, bw, psm), creating it and caching it in m_omap on first use.
+GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
+{
+	const uint32 key = bp | (bw << 14) | (psm << 20); // cache key: bp in the low bits, bw from bit 14, psm from bit 20
+
+	hash_map<uint32, GSOffset*>::iterator found = m_omap.find(key);
+
+	if(found == m_omap.end())
+	{
+		// Not cached yet: build the table once and keep it for later calls.
+		GSOffset* created = new GSOffset(bp, bw, psm);
+		m_omap[key] = created;
+		return created;
+	}
+
+	return found->second;
+}
+
+// Returns the table of per-row and per-column pixel offsets for a frame buffer / Z buffer pair, building it
+// and caching it in m_pomap on first use. FRAME.FBW is used as the buffer width for both buffers.
+GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
+{
+	const uint32 frameBlock = FRAME.Block();
+	const uint32 depthBlock = ZBUF.Block();
+	const uint32 framePsm = FRAME.PSM;
+	const uint32 depthPsm = ZBUF.PSM;
+	const uint32 width = FRAME.FBW;
+
+	ASSERT(m_psm[framePsm].trbpp > 8 || m_psm[depthPsm].trbpp > 8);
+
+	// "(psm & 0x0f) ^ ((psm & 0x30) >> 2)" folds a format code into 4 bits; per the original author
+	// the result is a unique identifier for render target formats (only).
+	const uint32 frameId = (framePsm & 0x0f) ^ ((framePsm & 0x30) >> 2);
+	const uint32 depthId = (depthPsm & 0x0f) ^ ((depthPsm & 0x30) >> 2);
+
+	const uint32 key = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (width << 18) | (frameId << 24) | (depthId << 28);
+
+	hash_map<uint32, GSPixelOffset*>::iterator cached = m_pomap.find(key);
+
+	if(cached != m_pomap.end()) return cached->second;
+
+	GSPixelOffset* table = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32);
+
+	table->hash = key;
+	table->fbp = frameBlock;
+	table->zbp = depthBlock;
+	table->fpsm = framePsm;
+	table->zpsm = depthPsm;
+	table->bw = width;
+
+	pixelAddress frameAddr = m_psm[framePsm].pa;
+	pixelAddress depthAddr = m_psm[depthPsm].pa;
+	// bpp >> 5 is 1 for 32 bpp formats and 0 for narrower ones; it is applied as a left shift below.
+	const int frameShift = m_psm[framePsm].bpp >> 5;
+	const int depthShift = m_psm[depthPsm].bpp >> 5;
+
+	// One entry per row: the pixel address of column 0 of that row in each buffer.
+	for(int y = 0; y < 2048; y++)
+	{
+		table->row[y].x = (int)frameAddr(0, y, frameBlock, width) << frameShift;
+		table->row[y].y = (int)depthAddr(0, y, depthBlock, width) << depthShift;
+	}
+
+	// One entry per column: the value stored for that column in the format's rowOffset[0] table.
+	for(int x = 0; x < 2048; x++)
+	{
+		table->col[x].x = m_psm[framePsm].rowOffset[0][x] << frameShift;
+		table->col[x].y = m_psm[depthPsm].rowOffset[0][x] << depthShift;
+	}
+
+	m_pomap[key] = table;
+	return table;
+}
+
+// Variant of GetPixelOffset whose column table has one entry per group of four columns (512 entries).
+// The table is built on first use and cached in m_po4map; FRAME.FBW is the width used for both buffers.
+GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
+{
+	const uint32 frameBlock = FRAME.Block();
+	const uint32 depthBlock = ZBUF.Block();
+	const uint32 framePsm = FRAME.PSM;
+	const uint32 depthPsm = ZBUF.PSM;
+	const uint32 width = FRAME.FBW;
+
+	ASSERT(m_psm[framePsm].trbpp > 8 || m_psm[depthPsm].trbpp > 8);
+
+	// "(psm & 0x0f) ^ ((psm & 0x30) >> 2)" folds a format code into 4 bits; per the original author
+	// the result is a unique identifier for render target formats (only).
+	const uint32 frameId = (framePsm & 0x0f) ^ ((framePsm & 0x30) >> 2);
+	const uint32 depthId = (depthPsm & 0x0f) ^ ((depthPsm & 0x30) >> 2);
+
+	const uint32 key = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (width << 18) | (frameId << 24) | (depthId << 28);
+
+	hash_map<uint32, GSPixelOffset4*>::iterator cached = m_po4map.find(key);
+
+	if(cached != m_po4map.end()) return cached->second;
+
+	GSPixelOffset4* table = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32);
+
+	table->hash = key;
+	table->fbp = frameBlock;
+	table->zbp = depthBlock;
+	table->fpsm = framePsm;
+	table->zpsm = depthPsm;
+	table->bw = width;
+
+	pixelAddress frameAddr = m_psm[framePsm].pa;
+	pixelAddress depthAddr = m_psm[depthPsm].pa;
+	// bpp >> 5 is 1 for 32 bpp formats and 0 for narrower ones; it is applied as a left shift below.
+	const int frameShift = m_psm[framePsm].bpp >> 5;
+	const int depthShift = m_psm[depthPsm].bpp >> 5;
+
+	// One entry per row: the pixel address of column 0 of that row in each buffer.
+	for(int y = 0; y < 2048; y++)
+	{
+		table->row[y].x = (int)frameAddr(0, y, frameBlock, width) << frameShift;
+		table->row[y].y = (int)depthAddr(0, y, depthBlock, width) << depthShift;
+	}
+
+	// One entry per group of four columns: the rowOffset[0] value of the first column of the group.
+	for(int g = 0; g < 512; g++)
+	{
+		table->col[g].x = m_psm[framePsm].rowOffset[0][g * 4] << frameShift;
+		table->col[g].y = m_psm[depthPsm].rowOffset[0][g * 4] << depthShift;
+	}
+
+	m_po4map[key] = table;
+	return table;
+}
+
+static bool cmp_vec2x(const GSVector2i& a, const GSVector2i& b) {return b.x > a.x;} // "a before b" when a.x is smaller; ordering predicate for std::sort
+
+// Returns the page -> tile lookup for the texture described by TEX0, building it and caching it in
+// m_p2tmap on first use. The result is an array of MAX_PAGES vectors that stays owned by this object
+// (the destructor frees it). Entry [page] holds pairs (x = tile index >> 5, y = inverted 32 bit mask of
+// the tile indices in that group which lie on the page), sorted by x.
+vector<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
+{
+	uint64 hash = TEX0.u64 & 0x3ffffffffull; // TBP0 TBW PSM TW TH
+
+	hash_map<uint64, vector<GSVector2i>*>::iterator i = m_p2tmap.find(hash);
+
+	if(i != m_p2tmap.end())
+	{
+		return i->second;
+	}
+
+	GSVector2i bs = m_psm[TEX0.PSM].bs;
+
+	// texture size in pixels (2^TW x 2^TH), but never smaller than one block
+	int tw = std::max<int>(1 << TEX0.TW, bs.x);
+	int th = std::max<int>(1 << TEX0.TH, bs.y);
+
+	const GSOffset* off = GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+	hash_map<uint32, hash_set<uint32> > tmp; // key = page, value = y:x, 7 bits each, max 128x128 tiles for the worst case (1024x1024 32bpp 8x8 blocks)
+
+	// visit the texture one block at a time and record which tiles fall on which page
+	for(int y = 0; y < th; y += bs.y)
+	{
+		uint32 base = off->block.row[y >> 3];
+
+		for(int x = 0, i = y << 7; x < tw; x += bs.x, i += bs.x)
+		{
+			uint32 page = (base + off->block.col[x >> 3]) >> 5;
+
+			// page numbers outside the MAX_PAGES sized result array are dropped
+			if(page < MAX_PAGES)
+			{
+				tmp[page].insert(i >> 3); // ((y << 7) | x) >> 3
+			}
+		}
+	}
+
+	// combine the lower 5 bits of the address into a 9:5 pointer:mask form, so the "valid bits" can be tested against an uint32 array
+
+	vector<GSVector2i>* p2t = new vector<GSVector2i>[MAX_PAGES];
+
+	for(hash_map<uint32, hash_set<uint32> >::iterator i = tmp.begin(); i != tmp.end(); i++)
+	{
+		uint32 page = i->first;
+
+		hash_set<uint32>& tiles = i->second;
+
+		hash_map<uint32, uint32> m;
+
+		for(hash_set<uint32>::iterator j = tiles.begin(); j != tiles.end(); j++)
+		{
+			uint32 addr = *j;
+
+			uint32 row = addr >> 5;
+			// 1u rather than 1: (addr & 31) can be 31, and shifting a signed 1 into the sign bit is not portable
+			uint32 col = 1u << (addr & 31);
+
+			// operator[] creates a missing entry value-initialized to 0, so |= serves both first and repeated rows
+			m[row] |= col;
+		}
+
+		// sort by x and flip the mask (it will be used to erase a lot of bits in a loop, [x] &= ~y)
+
+		for(hash_map<uint32, uint32>::iterator j = m.begin(); j != m.end(); j++)
+		{
+			p2t[page].push_back(GSVector2i(j->first, ~j->second));
+		}
+
+		std::sort(p2t[page].begin(), p2t[page].end(), cmp_vec2x);
+	}
+
+	m_p2tmap[hash] = p2t;
+
+	return p2t;
+}
+
+////////////////////
+
+// Copies h source rows into local memory in units of one column (csy = bsy / 4 rows), for x from l up
+// to r in steps of bsx. Rows left over after the last whole column are not written by this function.
+template<int psm, int bsx, int bsy, int alignment>
+void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
+{
+	const uint32 dbp = BITBLTBUF.DBP, dbw = BITBLTBUF.DBW;
+	const int csy = bsy / 4, step = srcpitch * csy; // rows per column, source bytes per column
+
+	for(; h >= csy; h -= csy, y += csy, src += step)
+	{
+		for(int bx = l; bx < r; bx += bsx)
+		{
+			switch(psm)
+			{
+			case PSM_PSMCT32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(bx, y, dbp, dbw), &src[bx * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16S(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteColumn8<alignment>(y, BlockPtr8(bx, y, dbp, dbw), &src[bx], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteColumn4<alignment>(y, BlockPtr4(bx, y, dbp, dbw), &src[bx >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(bx, y, dbp, dbw), &src[bx * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteColumn16<alignment>(y, BlockPtr16Z(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteColumn16<alignment>(y, BlockPtr16SZ(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			// TODO
+			default: __assume(0);
+			}
+		}
+	}
+}
+
+// Copies h source rows into local memory one whole block (bsy rows) at a time, for x from l up to r in steps of bsx.
+template<int psm, int bsx, int bsy, int alignment>
+void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
+{
+	const uint32 dbp = BITBLTBUF.DBP, dbw = BITBLTBUF.DBW;
+	const int step = srcpitch * bsy; // source bytes consumed per row of blocks
+	for(; h >= bsy; h -= bsy, y += bsy, src += step)
+	{
+		for(int bx = l; bx < r; bx += bsx)
+		{
+			switch(psm)
+			{
+			case PSM_PSMCT32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32(bx, y, dbp, dbw), &src[bx * 4], srcpitch); break;
+			case PSM_PSMCT16: GSBlock::WriteBlock16<alignment>(BlockPtr16(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMCT16S: GSBlock::WriteBlock16<alignment>(BlockPtr16S(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMT8: GSBlock::WriteBlock8<alignment>(BlockPtr8(bx, y, dbp, dbw), &src[bx], srcpitch); break;
+			case PSM_PSMT4: GSBlock::WriteBlock4<alignment>(BlockPtr4(bx, y, dbp, dbw), &src[bx >> 1], srcpitch); break;
+			case PSM_PSMZ32: GSBlock::WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(bx, y, dbp, dbw), &src[bx * 4], srcpitch); break;
+			case PSM_PSMZ16: GSBlock::WriteBlock16<alignment>(BlockPtr16Z(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			case PSM_PSMZ16S: GSBlock::WriteBlock16<alignment>(BlockPtr16SZ(bx, y, dbp, dbw), &src[bx * 2], srcpitch); break;
+			// TODO
+			default: __assume(0);
+			}
+		}
+	}
+}
+
+// Writes columns x in [l, r) of h consecutive rows starting at y, one pixel at a time through the
+// WritePixel* routine of the format. src points at the source row for y and advances by srcpitch per row.
+template<int psm, int bsx, int bsy>
+void GSLocalMemory::WriteImageLeftRight(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
+{
+	const uint32 dbp = BITBLTBUF.DBP, dbw = BITBLTBUF.DBW;
+	for(; h > 0; y++, h--, src += srcpitch)
+	{
+		for(int px = l; px < r; px++)
+		{
+			switch(psm)
+			{
+			case PSM_PSMCT32: WritePixel32(px, y, *(const uint32*)&src[px * 4], dbp, dbw); break;
+			case PSM_PSMCT16: WritePixel16(px, y, *(const uint16*)&src[px * 2], dbp, dbw); break;
+			case PSM_PSMCT16S: WritePixel16S(px, y, *(const uint16*)&src[px * 2], dbp, dbw); break;
+			case PSM_PSMT8: WritePixel8(px, y, src[px], dbp, dbw); break;
+			case PSM_PSMT4: WritePixel4(px, y, src[px >> 1] >> ((px & 1) << 2), dbp, dbw); break;
+			case PSM_PSMZ32: WritePixel32Z(px, y, *(const uint32*)&src[px * 4], dbp, dbw); break;
+			case PSM_PSMZ16: WritePixel16Z(px, y, *(const uint16*)&src[px * 2], dbp, dbw); break;
+			case PSM_PSMZ16S: WritePixel16SZ(px, y, *(const uint16*)&src[px * 2], dbp, dbw); break;
+			// TODO
+			default: __assume(0);
+			}
+		}
+	}
+}
+
+// Writes rows [y, y + h) of the blocks at x = l, l + bsx, ... < r when the row range need not start or end on a
+// column boundary (a column is csy = bsy / 4 rows). A partial leading and a partial trailing column are merged into
+// memory by read-modify-write through buff; the whole columns in between are handed to WriteImageColumn.
+template<int psm, int bsx, int bsy, int trbpp>
+void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
+{
+	__aligned(uint8, 32) buff[64]; // merge buffer for one column
+
+	uint32 bp = BITBLTBUF.DBP;
+	uint32 bw = BITBLTBUF.DBW;
+
+	const int csy = bsy / 4;
+
+	// merge incomplete column
+	int y2 = y & (csy - 1); // row of y inside its column; non-zero means the range starts mid-column
+
+	if(y2 > 0)
+	{
+		int h2 = min(h, csy - y2);
+
+		for(int x = l; x < r; x += bsx)
+		{
+			uint8* dst = NULL;
+			// select the block pointer routine that matches the pixel format
+			switch(psm)
+			{
+			case PSM_PSMCT32: dst = BlockPtr32(x, y, bp, bw); break;
+			case PSM_PSMCT16: dst = BlockPtr16(x, y, bp, bw); break;
+			case PSM_PSMCT16S: dst = BlockPtr16S(x, y, bp, bw); break;
+			case PSM_PSMT8: dst = BlockPtr8(x, y, bp, bw); break;
+			case PSM_PSMT4: dst = BlockPtr4(x, y, bp, bw); break;
+			case PSM_PSMZ32: dst = BlockPtr32Z(x, y, bp, bw); break;
+			case PSM_PSMZ16: dst = BlockPtr16Z(x, y, bp, bw); break;
+			case PSM_PSMZ16S: dst = BlockPtr16SZ(x, y, bp, bw); break;
+			// TODO
+			default: __assume(0);
+			}
+			// read the current column into buff, overlay the new source rows, then write the column back
+			switch(psm)
+			{
+			case PSM_PSMCT32:
+			case PSM_PSMZ32:
+				GSBlock::ReadColumn32(y, dst, buff, 32);
+				memcpy(&buff[32], &src[x * 4], 32); // the new row goes into the second 32 byte half of buff
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				break;
+			case PSM_PSMCT16:
+			case PSM_PSMCT16S:
+			case PSM_PSMZ16:
+			case PSM_PSMZ16S:
+				GSBlock::ReadColumn16(y, dst, buff, 32);
+				memcpy(&buff[32], &src[x * 2], 32); // the new row goes into the second 32 byte half of buff
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
+				break;
+			case PSM_PSMT8:
+				GSBlock::ReadColumn8(y, dst, buff, 16);
+				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16); // h2 rows of 16 bytes, placed from row y2 of buff
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
+				break;
+			case PSM_PSMT4:
+				GSBlock::ReadColumn4(y, dst, buff, 16);
+				for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16); // h2 rows of 16 bytes, placed from row y2 of buff
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
+				break;
+			// TODO
+			default:
+				__assume(0);
+			}
+		}
+
+		src += srcpitch * h2;
+		y += h2;
+		h -= h2;
+	}
+
+	// write whole columns
+	{
+		int h2 = h & ~(csy - 1); // h rounded down to a multiple of csy
+
+		if(h2 > 0)
+		{
+			size_t addr = (size_t)&src[l * trbpp >> 3];
+			// pick the largest alignment template argument (32, 16 or 0) that both the source address and the pitch satisfy
+			if((addr & 31) == 0 && (srcpitch & 31) == 0)
+			{
+				WriteImageColumn<psm, bsx, bsy, 32>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+			}
+			else if((addr & 15) == 0 && (srcpitch & 15) == 0)
+			{
+				WriteImageColumn<psm, bsx, bsy, 16>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+			}
+			else
+			{
+				WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
+			}
+
+			src += srcpitch * h2;
+			y += h2;
+			h -= h2;
+		}
+	}
+
+	// merge incomplete column
+	if(h >= 1)
+	{
+		for(int x = l; x < r; x += bsx)
+		{
+			uint8* dst = NULL;
+			// select the block pointer routine that matches the pixel format
+			switch(psm)
+			{
+			case PSM_PSMCT32: dst = BlockPtr32(x, y, bp, bw); break;
+			case PSM_PSMCT16: dst = BlockPtr16(x, y, bp, bw); break;
+			case PSM_PSMCT16S: dst = BlockPtr16S(x, y, bp, bw); break;
+			case PSM_PSMT8: dst = BlockPtr8(x, y, bp, bw); break;
+			case PSM_PSMT4: dst = BlockPtr4(x, y, bp, bw); break;
+			case PSM_PSMZ32: dst = BlockPtr32Z(x, y, bp, bw); break;
+			case PSM_PSMZ16: dst = BlockPtr16Z(x, y, bp, bw); break;
+			case PSM_PSMZ16S: dst = BlockPtr16SZ(x, y, bp, bw); break;
+			// TODO
+			default: __assume(0);
+			}
+			// read the current column into buff, overlay the remaining source rows at its start, then write it back
+			switch(psm)
+			{
+			case PSM_PSMCT32:
+			case PSM_PSMZ32:
+				GSBlock::ReadColumn32(y, dst, buff, 32);
+				memcpy(&buff[0], &src[x * 4], 32); // the new row goes into the first 32 byte half of buff
+				GSBlock::WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
+				break;
+			case PSM_PSMCT16:
+			case PSM_PSMCT16S:
+			case PSM_PSMZ16:
+			case PSM_PSMZ16S:
+				GSBlock::ReadColumn16(y, dst, buff, 32);
+				memcpy(&buff[0], &src[x * 2], 32); // the new row goes into the first 32 byte half of buff
+				GSBlock::WriteColumn16<32>(y, dst, buff, 32);
+				break;
+			case PSM_PSMT8:
+				GSBlock::ReadColumn8(y, dst, buff, 16);
+				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16); // h rows of 16 bytes, placed from row 0 of buff
+				GSBlock::WriteColumn8<32>(y, dst, buff, 16);
+				break;
+			case PSM_PSMT4:
+				GSBlock::ReadColumn4(y, dst, buff, 16);
+				for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16); // h rows of 16 bytes, placed from row 0 of buff
+				GSBlock::WriteColumn4<32>(y, dst, buff, 16);
+				break;
+			// TODO
+			default:
+				__assume(0);
+			}
+		}
+	}
+}
+
+// Writes len bytes of transfer data for the format given by the template arguments (bsx x bsy pixel blocks, trbpp bits
+// per transferred pixel) into the rectangle described by TRXPOS / TRXREG, advancing ty past the rows written here.
+// Whole rows are split into left / right edges (per pixel), top / bottom strips (per column) and the fully aligned
+// middle (per block); an unfinished first row and any trailing partial row are delegated to WriteImageX.
+template<int psm, int bsx, int bsy, int trbpp>
+void GSLocalMemory::WriteImage(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	int l = (int)TRXPOS.DSAX;
+	int r = l + (int)TRXREG.RRW;
+
+	// finish the incomplete row first
+	if(tx != l)
+	{
+		int n = min(len, (r - tx) * trbpp >> 3);
+		WriteImageX(tx, ty, src, n, BITBLTBUF, TRXPOS, TRXREG);
+		src += n;
+		len -= n;
+	}
+
+	int la = (l + (bsx - 1)) & ~(bsx - 1);
+	int ra = r & ~(bsx - 1);
+	int srcpitch = (r - l) * trbpp >> 3;
+	// A row narrower than one byte (4 bit data with RRW == 1) yields srcpitch == 0; dividing by it would fault, so
+	// treat that case as "no whole rows" and let WriteImageX below consume the data.
+	int h = srcpitch > 0 ? len / srcpitch : 0;
+
+	if(ra - la >= bsx && h > 0) // "transfer width" >= "block width" && there is at least one full row
+	{
+		const uint8* s = &src[-l * trbpp >> 3];
+
+		src += srcpitch * h;
+		len -= srcpitch * h;
+
+		// left part
+		if(l < la)
+		{
+			WriteImageLeftRight<psm, bsx, bsy>(l, la, ty, h, s, srcpitch, BITBLTBUF);
+		}
+
+		// right part
+		if(ra < r)
+		{
+			WriteImageLeftRight<psm, bsx, bsy>(ra, r, ty, h, s, srcpitch, BITBLTBUF);
+		}
+
+		// horizontally aligned part
+		if(la < ra)
+		{
+			// top part
+			{
+				int h2 = min(h, bsy - (ty & (bsy - 1)));
+
+				if(h2 < bsy)
+				{
+					WriteImageTopBottom<psm, bsx, bsy, trbpp>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+
+					s += srcpitch * h2;
+					ty += h2;
+					h -= h2;
+				}
+			}
+
+			// horizontally and vertically aligned part
+			{
+				int h2 = h & ~(bsy - 1);
+
+				if(h2 > 0)
+				{
+					size_t addr = (size_t)&s[la * trbpp >> 3];
+
+					if((addr & 31) == 0 && (srcpitch & 31) == 0)
+					{
+						WriteImageBlock<psm, bsx, bsy, 32>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+					}
+					else if((addr & 15) == 0 && (srcpitch & 15) == 0)
+					{
+						WriteImageBlock<psm, bsx, bsy, 16>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+					}
+					else
+					{
+						WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
+					}
+
+					s += srcpitch * h2;
+					ty += h2;
+					h -= h2;
+				}
+			}
+
+			// bottom part
+
+			if(h > 0)
+			{
+				WriteImageTopBottom<psm, bsx, bsy, trbpp>(la, ra, ty, h, s, srcpitch, BITBLTBUF);
+
+				// s += srcpitch * h;
+				ty += h;
+				// h -= h;
+			}
+		}
+	}
+
+	// the rest
+
+	if(len > 0)
+	{
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+	}
+}
+
+
+#define IsTopLeftAligned(dsax, tx, ty, bw, bh) \
+	((((int)dsax) & ((bw)-1)) == 0 && ((tx) & ((bw)-1)) == 0 && ((int)dsax) == (tx) && ((ty) & ((bh)-1)) == 0) // true when tx equals dsax, both have no bits of (bw - 1) set, and ty has no bits of (bh - 1) set; the mask test assumes bw and bh are powers of two
+
+// Transfers 24 bit pixels (3 source bytes each) through BlockPtr32 addressing. Transfers that start on an 8x8 block
+// boundary and consist of whole blocks and whole rows are unpacked block by block; all others go through WriteImageX.
+void GSLocalMemory::WriteImage24(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	const int right = TRXPOS.DSAX + TRXREG.RRW; // first column past the transfer rectangle
+	const int pitch = TRXREG.RRW * 3; // source bytes per row
+	const int rows = len / pitch;
+
+	// TODO: no fast path yet for unaligned or partial-block transfers
+	if(!IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8) || (right & 7) || (rows & 7) || (len % pitch))
+	{
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+		return;
+	}
+
+	const uint32 bp = BITBLTBUF.DBP;
+	const uint32 bw = BITBLTBUF.DBW;
+	const int bottom = ty + rows;
+
+	// each pass of the outer loop consumes eight source rows
+	for(int y = ty; y < bottom; y += 8, src += pitch * 8)
+	{
+		for(int x = tx; x < right; x += 8)
+		{
+			GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, pitch, BlockPtr32(x, y, bp, bw));
+		}
+	}
+
+	// hand the row following the last one written back to the caller
+	ty = bottom;
+}
+
+// Transfers 8 bit values (one source byte each) through BlockPtr32 addressing. Transfers that start on an 8x8 block
+// boundary and consist of whole blocks and whole rows are unpacked block by block; all others go through WriteImageX.
+void GSLocalMemory::WriteImage8H(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	const int right = TRXPOS.DSAX + TRXREG.RRW; // first column past the transfer rectangle
+	const int pitch = TRXREG.RRW; // source bytes per row
+	const int rows = len / pitch;
+
+	// TODO: no fast path yet for unaligned or partial-block transfers
+	if(!IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8) || (right & 7) || (rows & 7) || (len % pitch))
+	{
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+		return;
+	}
+
+	const uint32 bp = BITBLTBUF.DBP;
+	const uint32 bw = BITBLTBUF.DBW;
+	const int bottom = ty + rows;
+
+	// each pass of the outer loop consumes eight source rows
+	for(int y = ty; y < bottom; y += 8, src += pitch * 8)
+	{
+		for(int x = tx; x < right; x += 8)
+		{
+			GSBlock::UnpackAndWriteBlock8H(src + (x - tx), pitch, BlockPtr32(x, y, bp, bw));
+		}
+	}
+
+	// hand the row following the last one written back to the caller
+	ty = bottom;
+}
+
+// Transfers 4 bit values (two per source byte) through BlockPtr32 addressing. Transfers that start on an 8x8 block
+// boundary and consist of whole blocks and whole rows are unpacked block by block; all others go through WriteImageX.
+void GSLocalMemory::WriteImage4HL(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	uint32 bp = BITBLTBUF.DBP;
+	uint32 bw = BITBLTBUF.DBW;
+	int tw = TRXPOS.DSAX + TRXREG.RRW, srcpitch = TRXREG.RRW / 2;
+	// RRW == 1 gives srcpitch == 0: skip the divisions (they would fault) and use WriteImageX, which a one pixel wide transfer needs anyway because it cannot cover whole 8 pixel blocks
+	int th = srcpitch > 0 ? len / srcpitch : 0;
+	bool aligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);
+
+	if(srcpitch == 0 || !aligned || (tw & 7) || (th & 7) || (len % srcpitch))
+	{
+		// TODO
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+	}
+	else
+	{
+		th += ty;
+
+		for(int y = ty; y < th; y += 8, src += srcpitch * 8)
+		{
+			for(int x = tx; x < tw; x += 8)
+			{
+				GSBlock::UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+			}
+		}
+
+		ty = th;
+	}
+}
+
+// Transfers 4 bit values (two per source byte) through BlockPtr32 addressing. Transfers that start on an 8x8 block
+// boundary and consist of whole blocks and whole rows are unpacked block by block; all others go through WriteImageX.
+void GSLocalMemory::WriteImage4HH(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	uint32 bp = BITBLTBUF.DBP;
+	uint32 bw = BITBLTBUF.DBW;
+	int tw = TRXPOS.DSAX + TRXREG.RRW, srcpitch = TRXREG.RRW / 2;
+	// RRW == 1 gives srcpitch == 0: skip the divisions (they would fault) and use WriteImageX, which a one pixel wide transfer needs anyway because it cannot cover whole 8 pixel blocks
+	int th = srcpitch > 0 ? len / srcpitch : 0;
+	bool aligned = IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8);
+
+	if(srcpitch == 0 || !aligned || (tw & 7) || (th & 7) || (len % srcpitch))
+	{
+		// TODO
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+	}
+	else
+	{
+		th += ty;
+
+		for(int y = ty; y < th; y += 8, src += srcpitch * 8)
+		{
+			for(int x = tx; x < tw; x += 8)
+			{
+				GSBlock::UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, BlockPtr32(x, y, bp, bw));
+			}
+		}
+
+		ty = th;
+	}
+}
+
+// Transfers 24 bit values (3 source bytes each) through BlockPtr32Z addressing. Transfers that start on an 8x8 block
+// boundary and consist of whole blocks and whole rows are unpacked block by block; all others go through WriteImageX.
+void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(TRXREG.RRW == 0) return;
+
+	const int right = TRXPOS.DSAX + TRXREG.RRW; // first column past the transfer rectangle
+	const int pitch = TRXREG.RRW * 3; // source bytes per row
+	const int rows = len / pitch;
+
+	// TODO: no fast path yet for unaligned or partial-block transfers
+	if(!IsTopLeftAligned(TRXPOS.DSAX, tx, ty, 8, 8) || (right & 7) || (rows & 7) || (len % pitch))
+	{
+		WriteImageX(tx, ty, src, len, BITBLTBUF, TRXPOS, TRXREG);
+		return;
+	}
+
+	const uint32 bp = BITBLTBUF.DBP;
+	const uint32 bw = BITBLTBUF.DBW;
+	const int bottom = ty + rows;
+
+	// each pass of the outer loop consumes eight source rows
+	for(int y = ty; y < bottom; y += 8, src += pitch * 8)
+	{
+		for(int x = tx; x < right; x += 8)
+		{
+			GSBlock::UnpackAndWriteBlock24(src + (x - tx) * 3, pitch, BlockPtr32Z(x, y, bp, bw));
+		}
+	}
+
+	// hand the row following the last one written back to the caller
+	ty = bottom;
+}
+
+// Generic per-pixel upload path: writes packed source data into local memory one pixel at a
+// time for every destination format handled by the switch below.
+//
+// tx, ty - in: position of the first pixel to write; out: position following the last pixel written
+// src    - packed source pixels
+// len    - number of source bytes; converted to a pixel count for the 32/24/16-bit formats,
+//          kept as a byte count for the 8-bit formats (one pixel per byte) and the 4-bit formats
+//          (two pixels per byte, x advances by 2)
+//
+// Pixels are written left to right up to DSAX + RRW, then x wraps back to DSAX and y advances by
+// one row. Formats not listed in the switch write nothing and leave tx/ty unchanged.
+void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG)
+{
+	if(len <= 0) return;
+
+	// A zero-width transfer can never consume data; without this check the row loops below would never terminate.
+	if(TRXREG.RRW == 0) return;
+
+	const uint8* pb = src;
+	const uint16* pw = (const uint16*)src;
+	const uint32* pd = (const uint32*)src;
+
+	uint32 bp = BITBLTBUF.DBP;
+	uint32 bw = BITBLTBUF.DBW;
+	psm_t* psm = &m_psm[BITBLTBUF.DPSM];
+
+	int x = tx;
+	int y = ty;
+	int sx = (int)TRXPOS.DSAX;
+	int ex = sx + (int)TRXREG.RRW;
+
+	switch(BITBLTBUF.DPSM)
+	{
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+
+		len /= 4;
+
+		while(len > 0)
+		{
+			// address of the row start plus a per-column offset table selected by y & 7
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x++, pd++)
+			{
+				WritePixel32(addr + offset[x], *pd);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+
+		len /= 3;
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x++, pb += 3)
+			{
+				// Build the pixel from exactly three bytes: a 32-bit load would read one byte
+				// past the end of src on the last pixel.
+				uint32 c = (uint32)pb[0] | ((uint32)pb[1] << 8) | ((uint32)pb[2] << 16);
+
+				WritePixel24(addr + offset[x], c);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+
+		len /= 2;
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x++, pw++)
+			{
+				WritePixel16(addr + offset[x], *pw);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT8:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x++, pb++)
+			{
+				WritePixel8(addr + offset[x], *pb);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				// low nibble is the even pixel, high nibble the odd one
+				WritePixel4(addr + offset[x + 0], *pb & 0xf);
+				WritePixel4(addr + offset[x + 1], *pb >> 4);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT8H:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x++, pb++)
+			{
+				WritePixel8H(addr + offset[x], *pb);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4HL:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				WritePixel4HL(addr + offset[x + 0], *pb & 0xf);
+				WritePixel4HL(addr + offset[x + 1], *pb >> 4);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4HH:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				WritePixel4HH(addr + offset[x + 0], *pb & 0xf);
+				WritePixel4HH(addr + offset[x + 1], *pb >> 4);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+	}
+
+	// report where the next call has to continue
+	tx = x;
+	ty = y;
+}
+
+//
+
+// Generic per-pixel download path: reads pixels from local memory and packs them into dst for
+// every source format handled by the switch below.
+//
+// tx, ty - in: position of the first pixel to read; out: position following the last pixel read
+// dst    - output buffer receiving packed pixels
+// len    - size of the request in bytes; converted to a pixel count for the 32/24/16-bit formats,
+//          kept as a byte count for the 8-bit formats (one pixel per byte) and the 4-bit formats
+//          (two pixels per byte, x advances by 2)
+//
+// Pixels are read left to right up to SSAX + RRW, then x wraps back to SSAX and y advances by one
+// row. Formats not listed in the switch read nothing and leave tx/ty unchanged.
+void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) const
+{
+	if(len <= 0) return;
+
+	// A zero-width transfer can never produce data; without this check the row loops below would never terminate.
+	if(TRXREG.RRW == 0) return;
+
+	uint8* RESTRICT pb = (uint8*)dst;
+	uint16* RESTRICT pw = (uint16*)dst;
+	uint32* RESTRICT pd = (uint32*)dst;
+
+	uint32 bp = BITBLTBUF.SBP;
+	uint32 bw = BITBLTBUF.SBW;
+	psm_t* RESTRICT psm = &m_psm[BITBLTBUF.SPSM];
+
+	int x = tx;
+	int y = ty;
+	int sx = (int)TRXPOS.SSAX;
+	int ex = sx + (int)TRXREG.RRW;
+
+	// printf("spsm=%d x=%d ex=%d y=%d len=%d\n", BITBLTBUF.SPSM, x, ex, y, len);
+
+	// NOTE: every row wrap below tests x >= ex (as WriteImageX does) rather than x == ex, so a
+	// position that steps over the right edge (x += 2 formats with an odd width) still wraps.
+
+	switch(BITBLTBUF.SPSM)
+	{
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+
+		// MGS1 intro, fade effect between two scenes (airplane outside-inside transition)
+
+		len /= 4;
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
+
+			// leading pixels until x reaches a multiple of 8
+
+			for(; len > 0 && x < ex && (x & 7); len--, x++, pd++)
+			{
+				*pd = ps[offset[x]];
+			}
+
+			// aligned to a column
+
+			for(int ex8 = ex - 8; len >= 8 && x <= ex8; len -= 8, x += 8, pd += 8)
+			{
+				int off = offset[x];
+
+				GSVector4i::store<false>(&pd[0], GSVector4i::load(&ps[off + 0], &ps[off + 4]));
+				GSVector4i::store<false>(&pd[4], GSVector4i::load(&ps[off + 8], &ps[off + 12]));
+
+				for(int i = 0; i < 8; i++) ASSERT(pd[i] == ps[offset[x + i]]);
+			}
+
+			// trailing pixels of the row
+
+			for(; len > 0 && x < ex; len--, x++, pd++)
+			{
+				*pd = ps[offset[x]];
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+
+		len /= 3;
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
+
+			for(; len > 0 && x < ex; len--, x++, pb += 3)
+			{
+				uint32 c = ps[offset[x]];
+
+				// only the low three bytes of the stored word are emitted
+				pb[0] = (uint8)(c);
+				pb[1] = (uint8)(c >> 8);
+				pb[2] = (uint8)(c >> 16);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+
+		len /= 2;
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint16* RESTRICT ps = &m_vm16[psm->pa(0, y, bp, bw)];
+
+			// four pixels per iteration while a full group fits
+
+			for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pw += 4)
+			{
+				pw[0] = ps[offset[x + 0]];
+				pw[1] = ps[offset[x + 1]];
+				pw[2] = ps[offset[x + 2]];
+				pw[3] = ps[offset[x + 3]];
+			}
+
+			for(; len > 0 && x < ex; len--, x++, pw++)
+			{
+				*pw = ps[offset[x]];
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT8:
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint8* RESTRICT ps = &m_vm8[psm->pa(0, y, bp, bw)];
+
+			for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pb += 4)
+			{
+				pb[0] = ps[offset[x + 0]];
+				pb[1] = ps[offset[x + 1]];
+				pb[2] = ps[offset[x + 2]];
+				pb[3] = ps[offset[x + 3]];
+			}
+
+			for(; len > 0 && x < ex; len--, x++, pb++)
+			{
+				*pb = ps[offset[x]];
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4:
+
+		while(len > 0)
+		{
+			uint32 addr = psm->pa(0, y, bp, bw);
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				// even pixel in the low nibble, odd pixel in the high nibble
+				*pb = (uint8)(ReadPixel4(addr + offset[x + 0]) | (ReadPixel4(addr + offset[x + 1]) << 4));
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT8H:
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
+
+			for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pb += 4)
+			{
+				pb[0] = (uint8)(ps[offset[x + 0]] >> 24);
+				pb[1] = (uint8)(ps[offset[x + 1]] >> 24);
+				pb[2] = (uint8)(ps[offset[x + 2]] >> 24);
+				pb[3] = (uint8)(ps[offset[x + 3]] >> 24);
+			}
+
+			for(; len > 0 && x < ex; len--, x++, pb++)
+			{
+				*pb = (uint8)(ps[offset[x]] >> 24);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4HL:
+
+		while(len > 0)
+		{
+			int* offset = psm->rowOffset[y & 7];
+			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				// bits 24..27 of each word; the odd pixel is shifted straight into the high nibble
+				uint32 c0 = (ps[offset[x + 0]] >> 24) & 0x0f;
+				uint32 c1 = (ps[offset[x + 1]] >> 20) & 0xf0;
+
+				*pb = (uint8)(c0 | c1);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+
+	case PSM_PSMT4HH:
+
+		while(len > 0)
+		{
+			int* RESTRICT offset = psm->rowOffset[y & 7];
+			uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
+
+			for(; len > 0 && x < ex; len--, x += 2, pb++)
+			{
+				// bits 28..31 of each word; the odd pixel is shifted straight into the high nibble
+				uint32 c0 = (ps[offset[x + 0]] >> 28) & 0x0f;
+				uint32 c1 = (ps[offset[x + 1]] >> 24) & 0xf0;
+
+				*pb = (uint8)(c0 | c1);
+			}
+
+			if(x >= ex) {x = sx; y++;}
+		}
+
+		break;
+	}
+
+	// report where the next call has to continue
+	tx = x;
+	ty = y;
+}
+
+///////////////////
+
+// Reads rect r into dst by running GSBlock::ReadBlock32 once per iteration of the
+// FOREACH_BLOCK_START/END macro pair. src is not declared here, so it comes from the macro.
+// NOTE(review): the macro arguments (8, 8, 32) presumably are block width, block height and
+// destination bits per pixel - confirm against the macro definition. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture32(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 8, 8, 32)
+	{
+		GSBlock::ReadBlock32(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock24, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair. TEXA.AEM is tested once, outside the block loop, to pick
+// the <true> or <false> instantiation; TEXA itself is still passed to the block reader.
+void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	if(TEXA.AEM)
+	{
+		FOREACH_BLOCK_START(r, 8, 8, 32)
+		{
+			GSBlock::ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
+		}
+		FOREACH_BLOCK_END
+	}
+	else
+	{
+		FOREACH_BLOCK_START(r, 8, 8, 32)
+		{
+			GSBlock::ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
+		}
+		FOREACH_BLOCK_END
+	}
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock16, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 16, 8, 32). TEXA.AEM is tested once, outside
+// the block loop, to pick the <true> or <false> instantiation.
+void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	if(TEXA.AEM)
+	{
+		FOREACH_BLOCK_START(r, 16, 8, 32)
+		{
+			GSBlock::ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
+		}
+		FOREACH_BLOCK_END
+	}
+	else
+	{
+		FOREACH_BLOCK_START(r, 16, 8, 32)
+		{
+			GSBlock::ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
+		}
+		FOREACH_BLOCK_END
+	}
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock8_32, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 16, 16, 32). The palette passed to the block
+// reader is m_clut converted to a const uint32 pointer. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const uint32* pal = m_clut;
+
+	FOREACH_BLOCK_START(r, 16, 16, 32)
+	{
+		GSBlock::ReadAndExpandBlock8_32(src, dst, dstpitch, pal);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock4_32, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 32, 16, 32). Unlike the other palettized
+// readers in this file, the palette is taken from m_clut as a const uint64 pointer.
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const uint64* pal = m_clut;
+
+	FOREACH_BLOCK_START(r, 32, 16, 32)
+	{
+		GSBlock::ReadAndExpandBlock4_32(src, dst, dstpitch, pal);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock8H_32, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 32). The palette is m_clut converted
+// to a const uint32 pointer. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture8H(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const uint32* pal = m_clut;
+
+	FOREACH_BLOCK_START(r, 8, 8, 32)
+	{
+		GSBlock::ReadAndExpandBlock8H_32(src, dst, dstpitch, pal);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock4HL_32, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 32). The palette is m_clut converted
+// to a const uint32 pointer. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4HL(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const uint32* pal = m_clut;
+
+	FOREACH_BLOCK_START(r, 8, 8, 32)
+	{
+		GSBlock::ReadAndExpandBlock4HL_32(src, dst, dstpitch, pal);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadAndExpandBlock4HH_32, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 32). The palette is m_clut converted
+// to a const uint32 pointer. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const uint32* pal = m_clut;
+
+	FOREACH_BLOCK_START(r, 8, 8, 32)
+	{
+		GSBlock::ReadAndExpandBlock4HH_32(src, dst, dstpitch, pal);
+	}
+	FOREACH_BLOCK_END
+}
+
+///////////////////
+
+// Reads the single block addressed by block number bp (BlockPtr(bp)) into dst with
+// GSBlock::ReadBlock32. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadBlock32(BlockPtr(bp), dst, dstpitch);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock24; TEXA.AEM selects the <true> or <false> instantiation.
+void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	if(!TEXA.AEM)
+	{
+		GSBlock::ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+
+		return;
+	}
+
+	GSBlock::ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock16; TEXA.AEM selects the <true> or <false> instantiation.
+void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	if(!TEXA.AEM)
+	{
+		GSBlock::ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
+
+		return;
+	}
+
+	GSBlock::ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock8_32, using m_clut as the palette. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock4_32, using m_clut as the palette. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock8H_32, using m_clut as the palette. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock4HL_32, using m_clut as the palette. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+// Reads the single block addressed by block number bp into dst with
+// GSBlock::ReadAndExpandBlock4HH_32, using m_clut as the palette. TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
+}
+
+///////////////////
+
+// Reads rect r of the texture described by off into dst as 32-bit texels.
+//
+// off      - base pointer, buffer width and format of the texture
+// r        - rectangle to read, in texels
+// dst      - destination of the texel at (r.left, r.top); rows are dstpitch bytes apart
+// TEXA     - forwarded to the per-texel and per-block readers
+//
+// A rect that is fully aligned to the format's block size (psm.bs) goes straight to the block
+// reader (psm.rtx). Otherwise the largest block-aligned rect inside r (cr) is read with the block
+// reader and the surrounding border is read texel by texel (psm.rt). If cr is empty, or the
+// destination of its first column is not 16-byte aligned, the whole rect is read texel by texel.
+void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	const psm_t& psm = m_psm[off->psm];
+
+	readTexel rt = psm.rt;
+	readTexture rtx = psm.rtx;
+
+	if(r.width() < psm.bs.x || r.height() < psm.bs.y
+	|| (r.left & (psm.bs.x - 1)) || (r.top & (psm.bs.y - 1))
+	|| (r.right & (psm.bs.x - 1)) || (r.bottom & (psm.bs.y - 1)))
+	{
+		GIFRegTEX0 TEX0;
+
+		TEX0.TBP0 = off->bp;
+		TEX0.TBW = off->bw;
+		TEX0.PSM = off->psm;
+
+		GSVector4i cr = r.ralign<Align_Inside>(psm.bs);
+
+		bool aligned = ((size_t)(dst + (cr.left - r.left) * sizeof(uint32)) & 0xf) == 0;
+
+		if(cr.rempty() || !aligned)
+		{
+			// TODO: expand r to block size, read into temp buffer
+
+			if(!aligned) printf("unaligned memory pointer passed to ReadTexture\n");
+
+			for(int y = r.top; y < r.bottom; y++, dst += dstpitch)
+			{
+				for(int x = r.left, i = 0; x < r.right; x++, i++)
+				{
+					((uint32*)dst)[i] = (this->*rt)(x, y, TEX0, TEXA);
+				}
+			}
+		}
+		else
+		{
+			// Every destination row is addressed from the original dst, so row y always lands
+			// (y - r.top) rows into the buffer no matter which region it belongs to.
+
+			for(int y = r.top; y < r.bottom; y++)
+			{
+				uint32* row = (uint32*)(dst + (y - r.top) * dstpitch);
+
+				if(y < cr.top || y >= cr.bottom)
+				{
+					// row above or below the block-aligned core: read it completely
+
+					for(int x = r.left, i = 0; x < r.right; x++, i++)
+					{
+						row[i] = (this->*rt)(x, y, TEX0, TEXA);
+					}
+				}
+				else
+				{
+					// row crossing the core: only the parts left and right of it
+
+					for(int x = r.left, i = 0; x < cr.left; x++, i++)
+					{
+						row[i] = (this->*rt)(x, y, TEX0, TEXA);
+					}
+
+					for(int x = cr.right, i = x - r.left; x < r.right; x++, i++)
+					{
+						row[i] = (this->*rt)(x, y, TEX0, TEXA);
+					}
+				}
+			}
+
+			// cr is known to be non-empty in this branch; its top-left texel sits
+			// (cr.top - r.top) rows down and (cr.left - r.left) texels into the buffer.
+
+			(this->*rtx)(off, cr, dst + (cr.top - r.top) * dstpitch + (cr.left - r.left) * sizeof(uint32), dstpitch, TEXA);
+		}
+	}
+	else
+	{
+		(this->*rtx)(off, r, dst, dstpitch, TEXA);
+	}
+}
+
+// 32/8
+
+// Reads rect r into dst with GSBlock::ReadBlock8, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 16, 16, 8). No palette is applied here and
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture8P(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 16, 16, 8)
+	{
+		GSBlock::ReadBlock8(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadBlock4P, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 32, 16, 8). No palette is applied here and
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4P(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 32, 16, 8)
+	{
+		GSBlock::ReadBlock4P(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadBlock8HP, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 8). No palette is applied here and
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture8HP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 8, 8, 8)
+	{
+		GSBlock::ReadBlock8HP(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadBlock4HLP, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 8). No palette is applied here and
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4HLP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 8, 8, 8)
+	{
+		GSBlock::ReadBlock4HLP(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+// Reads rect r into dst with GSBlock::ReadBlock4HHP, once per iteration of the
+// FOREACH_BLOCK_START/END macro pair (invoked with 8, 8, 8). No palette is applied here and
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTexture4HHP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
+{
+	FOREACH_BLOCK_START(r, 8, 8, 8)
+	{
+		GSBlock::ReadBlock4HHP(src, dst, dstpitch);
+	}
+	FOREACH_BLOCK_END
+}
+
+//
+
+// Reads the single block addressed by block number bp into dst with GSBlock::ReadBlock8.
+// TEXA is not referenced in this body.
+// NOTE(review): unlike the other ReadTextureBlock* functions in this file, this one does not
+// start with ALIGN_STACK(32) - confirm that this is intentional.
+void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	GSBlock::ReadBlock8(BlockPtr(bp), dst, dstpitch);
+}
+
+// Reads the single block addressed by block number bp into dst with GSBlock::ReadBlock4P.
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadBlock4P(BlockPtr(bp), dst, dstpitch);
+}
+
+// Reads the single block addressed by block number bp into dst with GSBlock::ReadBlock8HP.
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
+}
+
+// Reads the single block addressed by block number bp into dst with GSBlock::ReadBlock4HLP.
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
+}
+
+// Reads the single block addressed by block number bp into dst with GSBlock::ReadBlock4HHP.
+// TEXA is not referenced in this body.
+void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
+{
+	ALIGN_STACK(32);
+
+	GSBlock::ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
+}
+
+//
+
+#include "GSTextureSW.h"
+
+// Dumps a w x h region of local memory to an image file.
+//
+// fn          - file name handed to GSTextureSW::Save
+// bp, bw, psm - base pointer, buffer width and format used to address local memory; pixels are
+//               fetched with the format's readPixel function (m_psm[psm].rp)
+// w, h        - size of the region in pixels, starting at (0, 0)
+//
+// Each pixel is stored as one 32-bit value in a temporary buffer that is uploaded to an offscreen
+// GSTextureSW and saved. Nothing is written if the temporary buffer cannot be allocated or the
+// texture update fails.
+void GSLocalMemory::SaveBMP(const string& fn, uint32 bp, uint32 bw, uint32 psm, int w, int h)
+{
+	int pitch = w * 4;
+	int size = pitch * h;
+	void* bits = _aligned_malloc(size, 32);
+
+	// The fill loop below writes through this pointer, so a failed allocation must stop here.
+	if(bits == NULL) return;
+
+	// bp and bw are passed through the TEX0 bitfields, which limits them to the register field widths.
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = bp;
+	TEX0.TBW = bw;
+	TEX0.PSM = psm;
+
+	readPixel rp = m_psm[psm].rp;
+
+	uint8* p = (uint8*)bits;
+
+	for(int j = 0; j < h; j++, p += pitch)
+	{
+		for(int i = 0; i < w; i++)
+		{
+			((uint32*)p)[i] = (this->*rp)(i, j, TEX0.TBP0, TEX0.TBW);
+		}
+	}
+
+	GSTextureSW t(GSTexture::Offscreen, w, h);
+
+	if(t.Update(GSVector4i(0, 0, w, h), bits, pitch))
+	{
+		t.Save(fn);
+	}
+
+	_aligned_free(bits);
+}
+
+// GSOffset
+
+// Builds the block and pixel address lookup tables for one combination of base pointer,
+// buffer width and pixel storage format.
+GSOffset::GSOffset(uint32 _bp, uint32 _bw, uint32 _psm)
+{
+	const GSLocalMemory::psm_t& fmt = GSLocalMemory::m_psm[_psm];
+
+	// The key packs the three parameters at bit 0, 14 and 20.
+	hash = (_psm << 20) | (_bw << 14) | _bp;
+
+	// block.row: block number at x = 0 for every 8-pixel step in y
+	GSLocalMemory::pixelAddress blockNumber = fmt.bn;
+
+	for(int row = 0; row < 256; row++)
+	{
+		block.row[row] = (short)blockNumber(0, row * 8, _bp, _bw);
+	}
+
+	block.col = fmt.blockOffset;
+
+	// pixel.row: pixel address at x = 0 for every y; y is reduced modulo 2048, so the
+	// upper half of the table repeats the lower half
+	GSLocalMemory::pixelAddress pixelAddr = fmt.pa;
+
+	for(int row = 0; row < 4096; row++)
+	{
+		pixel.row[row] = (int)pixelAddr(0, row % 2048, _bp, _bw);
+	}
+
+	// pixel.col: the format's eight per-row column offset tables
+	for(int k = 0; k < 8; k++)
+	{
+		pixel.col[k] = fmt.rowOffset[k];
+	}
+}
+
+// Nothing to release: the constructor allocates no memory, and the col pointers refer to tables
+// held by GSLocalMemory::m_psm.
+GSOffset::~GSOffset()
+{
+}
+
+// Builds the list of distinct page numbers touched by rect, terminated by EOP.
+//
+// rect  - rectangle in pixels; it is expanded outwards to page size when bp is page aligned
+//         ((bp & 31) == 0) and to block size otherwise
+// pages - output array, or NULL to have one allocated here with new[]
+//         NOTE(review): a caller-supplied array is assumed to hold MAX_PAGES + 1 entries - confirm;
+//         an array allocated here presumably has to be released by the caller with delete[]
+// bbox  - if not NULL, receives the expanded rectangle
+//
+// Returns the array that was filled. Page numbers >= MAX_PAGES are dropped. Each page is listed
+// once, in the order it is first encountered.
+uint32* GSOffset::GetPages(const GSVector4i& rect, uint32* pages, GSVector4i* bbox)
+{
+	GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
+
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
+
+	if(bbox != NULL) *bbox = r;
+
+	// worst case:
+	// bp page-aligned: (w * h) / (64 * 32)
+	// bp block-aligned: (w * h) / (8 * 8)
+
+	int size = r.width() * r.height();
+
+	int limit = MAX_PAGES + 1;
+
+	if(pages == NULL)
+	{
+		limit = std::min<int>((size >> ((bp & 31) != 0 ? 6 : 11)) + 2, MAX_PAGES) + 1;
+
+		pages = new uint32[limit];
+	}
+
+	// one bit per page number, used to emit each page only once
+
+	__aligned(uint32, 16) tmp[16];
+
+	((GSVector4i*)tmp)[0] = GSVector4i::zero();
+	((GSVector4i*)tmp)[1] = GSVector4i::zero();
+	((GSVector4i*)tmp)[2] = GSVector4i::zero();
+	((GSVector4i*)tmp)[3] = GSVector4i::zero();
+
+	// from here on r and bs are in units of 8 pixels, matching the block tables
+
+	r = r.sra32(3);
+
+	bs.x >>= 3;
+	bs.y >>= 3;
+
+	uint32* RESTRICT p = pages;
+
+	for(int y = r.top; y < r.bottom; y += bs.y)
+	{
+		uint32 base = block.row[y];
+
+		for(int x = r.left; x < r.right; x += bs.x)
+		{
+			// block number -> page number (32 blocks per page)
+			uint32 n = (base + block.col[x]) >> 5;
+
+			if(n < MAX_PAGES)
+			{
+				uint32& row = tmp[n >> 5];
+
+				// unsigned literal: shifting a signed 1 by 31 would overflow int
+				uint32 col = 1u << (n & 31);
+
+				if((row & col) == 0)
+				{
+					row |= col;
+
+					*p++ = n;
+				}
+			}
+		}
+	}
+
+	*p++ = (uint32)EOP;
+
+	ASSERT(p - pages <= limit);
+
+	return pages;
+}
+
+// Builds a bitmap with one bit set for every page touched by rect.
+//
+// rect  - rectangle in pixels; it is expanded outwards to page size when bp is page aligned
+//         ((bp & 31) == 0) and to block size otherwise
+// pages - output of four GSVector4i, or NULL to have it allocated here with _aligned_malloc
+//         (the declaration notes the result is to be freed with _aligned_free)
+// bbox  - if not NULL, receives the expanded rectangle
+//
+// Returns the bitmap that was filled; it is cleared first. Page numbers >= MAX_PAGES are dropped.
+GSVector4i* GSOffset::GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages, GSVector4i* bbox)
+{
+	if(pages == NULL)
+	{
+		pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
+	}
+
+	pages[0] = GSVector4i::zero();
+	pages[1] = GSVector4i::zero();
+	pages[2] = GSVector4i::zero();
+	pages[3] = GSVector4i::zero();
+
+	GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
+
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
+
+	if(bbox != NULL) *bbox = r;
+
+	// from here on r and bs are in units of 8 pixels, matching the block tables
+
+	r = r.sra32(3);
+
+	bs.x >>= 3;
+	bs.y >>= 3;
+
+	for(int y = r.top; y < r.bottom; y += bs.y)
+	{
+		uint32 base = block.row[y];
+
+		for(int x = r.left; x < r.right; x += bs.x)
+		{
+			// block number -> page number (32 blocks per page)
+			uint32 n = (base + block.col[x]) >> 5;
+
+			if(n < MAX_PAGES)
+			{
+				// unsigned literal: shifting a signed 1 by 31 would overflow int
+				((uint32*)pages)[n >> 5] |= 1u << (n & 31);
+			}
+		}
+	}
+
+	return pages;
+}
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSLocalMemory.h b/plugins/GSdx_legacy/GSLocalMemory.h
new file mode 100644
index 0000000000..3b24eb0b1e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLocalMemory.h
@@ -0,0 +1,918 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSTables.h"
+#include "GSVector.h"
+#include "GSBlock.h"
+#include "GSClut.h"
+
+// Address lookup tables for one (base pointer, buffer width, pixel storage format) combination.
+// The constructor fills both tables from the format's entry in GSLocalMemory::m_psm.
+class GSOffset : public GSAlignedClass<32>
+{
+public:
+	// Block-granular table: row[] is filled from the format's block number function at x = 0,
+	// col points at the format's blockOffset table.
+	__aligned(struct, 32) Block
+	{
+		short row[256]; // yn (n = 0 8 16 ...)
+		short* col; // blockOffset*
+	};
+	
+	// Pixel-granular table: row[] is filled from the format's pixel address function at x = 0,
+	// col[] holds the format's eight rowOffset tables.
+	__aligned(struct, 32) Pixel
+	{
+		int row[4096]; // yn (n = 0 1 2 ...) NOTE: this wraps around above 2048, only transfers should address the upper half (dark cloud 2 inventing)
+		int* col[8]; // rowOffset*
+	};
+
+	// hash aliases the three bitfields: bp in bits 0-13, bw in bits 14-19, psm in bits 20-25.
+	union {uint32 hash; struct {uint32 bp:14, bw:6, psm:6;};};
+
+	Block block;
+	Pixel pixel;
+
+	GSOffset(uint32 bp, uint32 bw, uint32 psm);
+	virtual ~GSOffset();
+
+	// End-of-list marker appended to the array produced by GetPages.
+	enum {EOP = 0xffffffff};
+
+	// Lists the distinct pages touched by rect, terminated by EOP. When pages is NULL the array
+	// is allocated with new[]. If bbox is not NULL it receives rect expanded to page/block size.
+	uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
+	// Same traversal, but the result is a bitmap of four GSVector4i with one bit per page.
+	GSVector4i* GetPagesAsBits(const GSVector4i& rect, GSVector4i* pages = NULL, GSVector4i* bbox = NULL); // free returned value with _aligned_free
+};
+
+// Combined per-row and per-column offsets for a frame buffer / z buffer pair, one entry per
+// pixel coordinate in both directions.
+// NOTE(review): presumably built and cached by GSLocalMemory::GetPixelOffset from the FRAME and
+// ZBUF registers, with hash as the cache key - confirm against that function.
+struct GSPixelOffset
+{
+	// 16 bit offsets (m_vm16[...])
+
+	GSVector2i row[2048]; // f yn | z yn
+	GSVector2i col[2048]; // f xn | z xn
+	uint32 hash;
+	uint32 fbp, zbp, fpsm, zpsm, bw;
+};
+
+// Variant of GSPixelOffset whose column table has one entry per group of four pixels
+// (512 entries instead of 2048); the row table still has one entry per line.
+// NOTE(review): presumably built and cached by GSLocalMemory::GetPixelOffset4 from the FRAME and
+// ZBUF registers, with hash as the cache key - confirm against that function.
+struct GSPixelOffset4
+{
+	// 16 bit offsets (m_vm16[...])
+
+	GSVector2i row[2048]; // f yn | z yn (n = 0 1 2 ...)
+	GSVector2i col[512]; // f xn | z xn (n = 0 4 8 ...)
+	uint32 hash;
+	uint32 fbp, zbp, fpsm, zpsm, bw;
+};
+
+class GSLocalMemory : public GSAlignedClass<32>
+{
+public:
+	typedef uint32 (*pixelAddress)(int x, int y, uint32 bp, uint32 bw);
+	typedef void (GSLocalMemory::*writePixel)(int x, int y, uint32 c, uint32 bp, uint32 bw);
+	typedef void (GSLocalMemory::*writeFrame)(int x, int y, uint32 c, uint32 bp, uint32 bw);
+	typedef uint32 (GSLocalMemory::*readPixel)(int x, int y, uint32 bp, uint32 bw) const;
+	typedef uint32 (GSLocalMemory::*readTexel)(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const;
+	typedef void (GSLocalMemory::*writePixelAddr)(uint32 addr, uint32 c);
+	typedef void (GSLocalMemory::*writeFrameAddr)(uint32 addr, uint32 c);
+	typedef uint32 (GSLocalMemory::*readPixelAddr)(uint32 addr) const;
+	typedef uint32 (GSLocalMemory::*readTexelAddr)(uint32 addr, const GIFRegTEXA& TEXA) const;
+	typedef void (GSLocalMemory::*writeImage)(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	typedef void (GSLocalMemory::*readImage)(int& tx, int& ty, uint8* dst, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) const;
+	typedef void (GSLocalMemory::*readTexture)(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	typedef void (GSLocalMemory::*readTextureBlock)(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+
+	__aligned(struct, 128) psm_t
+	{
+		pixelAddress pa, bn;
+		readPixel rp;
+		readPixelAddr rpa;
+		writePixel wp;
+		writePixelAddr wpa;
+		readTexel rt;
+		readTexelAddr rta;
+		writeFrameAddr wfa;
+		writeImage wi;
+		readImage ri;
+		readTexture rtx, rtxP;
+		readTextureBlock rtxb, rtxbP;
+		uint16 bpp, trbpp, pal, fmt;
+		GSVector2i bs, pgs;
+		int* rowOffset[8];
+		short* blockOffset;
+		uint8 msk;
+	};
+
+	static psm_t m_psm[64];
+
+	static const int m_vmsize = 1024 * 1024 * 4;
+
+	uint8* m_vm8; 
+	uint16* m_vm16; 
+	uint32* m_vm32;
+
+	GSClut m_clut;
+
+protected:
+	static uint32 pageOffset32[32][32][64];
+	static uint32 pageOffset32Z[32][32][64];
+	static uint32 pageOffset16[32][64][64];
+	static uint32 pageOffset16S[32][64][64];
+	static uint32 pageOffset16Z[32][64][64];
+	static uint32 pageOffset16SZ[32][64][64];
+	static uint32 pageOffset8[32][64][128];
+	static uint32 pageOffset4[32][128][128];
+
+	static int rowOffset32[4096];
+	static int rowOffset32Z[4096];
+	static int rowOffset16[4096];
+	static int rowOffset16S[4096];
+	static int rowOffset16Z[4096];
+	static int rowOffset16SZ[4096];
+	static int rowOffset8[2][4096];
+	static int rowOffset4[2][4096];
+
+	static short blockOffset32[256];
+	static short blockOffset32Z[256];
+	static short blockOffset16[256];
+	static short blockOffset16S[256];
+	static short blockOffset16Z[256];
+	static short blockOffset16SZ[256];
+	static short blockOffset8[256];
+	static short blockOffset4[256];
+
+	__forceinline static uint32 Expand24To32(uint32 c, const GIFRegTEXA& TEXA)
+	{
+		return (((!TEXA.AEM | (c & 0xffffff)) ? TEXA.TA0 : 0) << 24) | (c & 0xffffff);
+	}
+
+	__forceinline static uint32 Expand16To32(uint16 c, const GIFRegTEXA& TEXA)
+	{
+		return (((c & 0x8000) ? TEXA.TA1 : (!TEXA.AEM | c) ? TEXA.TA0 : 0) << 24) | ((c & 0x7c00) << 9) | ((c & 0x03e0) << 6) | ((c & 0x001f) << 3);
+	}
+
+	// TODO
+
+	friend class GSClut;
+
+	//
+
+	hash_map<uint32, GSOffset*> m_omap;
+	hash_map<uint32, GSPixelOffset*> m_pomap;
+	hash_map<uint32, GSPixelOffset4*> m_po4map;
+	hash_map<uint64, vector<GSVector2i>*> m_p2tmap;
+
+public:
+	GSLocalMemory();
+	virtual ~GSLocalMemory();
+
+	GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm);
+	GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
+	GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
+	vector<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);
+
+	// address
+
+	static uint32 BlockNumber32(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32[(y >> 3) & 3][(x >> 3) & 7];
+	}
+
+	static uint32 BlockNumber16(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16[(y >> 3) & 7][(x >> 4) & 3];
+	}
+
+	static uint32 BlockNumber16S(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16S[(y >> 3) & 7][(x >> 4) & 3];
+	}
+
+	static uint32 BlockNumber8(int x, int y, uint32 bp, uint32 bw)
+	{
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
+		return bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable8[(y >> 4) & 3][(x >> 4) & 7];
+	}
+
+	static uint32 BlockNumber4(int x, int y, uint32 bp, uint32 bw)
+	{
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
+		return bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f) + blockTable4[(y >> 4) & 7][(x >> 5) & 3];
+	}
+
+	static uint32 BlockNumber32Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + (y & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable32Z[(y >> 3) & 3][(x >> 3) & 7];
+	}
+
+	static uint32 BlockNumber16Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16Z[(y >> 3) & 7][(x >> 4) & 3];
+	}
+
+	static uint32 BlockNumber16SZ(int x, int y, uint32 bp, uint32 bw)
+	{
+		return bp + ((y >> 1) & ~0x1f) * bw + ((x >> 1) & ~0x1f) + blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3];
+	}
+
+	uint8* BlockPtr(uint32 bp) const
+	{
+		ASSERT(bp < 16384);
+
+		return &m_vm8[bp << 8];
+	}
+
+	uint8* BlockPtr32(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber32(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr16(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber16(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr16S(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber16S(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr8(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber8(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr4(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber4(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr32Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber32Z(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr16Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber16Z(x, y, bp, bw) << 8];
+	}
+
+	uint8* BlockPtr16SZ(int x, int y, uint32 bp, uint32 bw) const
+	{
+		return &m_vm8[BlockNumber16SZ(x, y, bp, bw) << 8];
+	}
+
+	static uint32 PixelAddressOrg32(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber32(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+	}
+
+	static uint32 PixelAddressOrg16(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber16(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+	}
+
+	static uint32 PixelAddressOrg16S(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber16S(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+	}
+
+	static uint32 PixelAddressOrg8(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber8(x, y, bp, bw) << 8) + columnTable8[y & 15][x & 15];
+	}
+
+	static uint32 PixelAddressOrg4(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber4(x, y, bp, bw) << 9) + columnTable4[y & 15][x & 31];
+	}
+
+	static uint32 PixelAddressOrg32Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber32Z(x, y, bp, bw) << 6) + columnTable32[y & 7][x & 7];
+	}
+
+	static uint32 PixelAddressOrg16Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber16Z(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+	}
+
+	static uint32 PixelAddressOrg16SZ(int x, int y, uint32 bp, uint32 bw)
+	{
+		return (BlockNumber16SZ(x, y, bp, bw) << 7) + columnTable16[y & 7][x & 15];
+	}
+
+	static __forceinline uint32 PixelAddress32(int x, int y, uint32 bp, uint32 bw)
+	{
+		uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6);
+		uint32 word = (page << 11) + pageOffset32[bp & 0x1f][y & 0x1f][x & 0x3f];
+
+		return word;
+	}
+
+	static __forceinline uint32 PixelAddress16(int x, int y, uint32 bp, uint32 bw)
+	{
+		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
+		uint32 word = (page << 12) + pageOffset16[bp & 0x1f][y & 0x3f][x & 0x3f];
+
+		return word;
+	}
+
+	static __forceinline uint32 PixelAddress16S(int x, int y, uint32 bp, uint32 bw)
+	{
+		uint32 page = (bp >> 5) + (y >> 6) * bw + (x >> 6);
+		uint32 word = (page << 12) + pageOffset16S[bp & 0x1f][y & 0x3f][x & 0x3f];
+
+		return word;
+	}
+
+	static __forceinline uint32 PixelAddress8(int x, int y, uint32 bp, uint32 bw)
+	{
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
+		uint32 page = (bp >> 5) + (y >> 6) * (bw >> 1) + (x >> 7);
+		uint32 word = (page << 13) + pageOffset8[bp & 0x1f][y & 0x3f][x & 0x7f];
+
+		return word;
+	}
+
+	static __forceinline uint32 PixelAddress4(int x, int y, uint32 bp, uint32 bw)
+	{
+		// ASSERT((bw & 1) == 0); // allowed for mipmap levels
+
+		uint32 page = (bp >> 5) + (y >> 7) * (bw >> 1) + (x >> 7);
+		uint32 word = (page << 14) + pageOffset4[bp & 0x1f][y & 0x7f][x & 0x7f];
+
+		return word;
+	}
+
+	static __forceinline uint32 PixelAddress32Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		uint32 page = (bp >> 5) + (y >> 5) * bw + (x >> 6);
+		uint32 word = (page << 11) + pageOffset32Z[bp & 0x1f][y & 0x1f][x & 0x3f];
+
+		return word;
+	}
+
+	// Address of pixel (x, y) for base bp and width bw: the page index shifted left by 12,
+	// plus the offset from pageOffset16Z (indexed by the low 5 bits of bp and the low 6 bits of y and x).
+	static __forceinline uint32 PixelAddress16Z(int x, int y, uint32 bp, uint32 bw)
+	{
+		const uint32 page_row = (uint32)(y >> 6) * bw;
+		const uint32 page = (bp >> 5) + page_row + (uint32)(x >> 6);
+
+		return (page << 12) + pageOffset16Z[bp & 0x1f][y & 0x3f][x & 0x3f];
+	}
+
+	// Address of pixel (x, y) for base bp and width bw: the page index shifted left by 12,
+	// plus the offset from pageOffset16SZ (indexed by the low 5 bits of bp and the low 6 bits of y and x).
+	static __forceinline uint32 PixelAddress16SZ(int x, int y, uint32 bp, uint32 bw)
+	{
+		const uint32 page_row = (uint32)(y >> 6) * bw;
+		const uint32 page = (bp >> 5) + page_row + (uint32)(x >> 6);
+
+		return (page << 12) + pageOffset16SZ[bp & 0x1f][y & 0x3f][x & 0x3f];
+	}
+
+	// pixel R/W
+
+	// Returns the 32-bit word stored at m_vm32[addr].
+	__forceinline uint32 ReadPixel32(uint32 addr) const
+	{
+		const uint32 word = m_vm32[addr];
+
+		return word;
+	}
+
+	// Returns the low 24 bits of the word at m_vm32[addr]; the top byte is cleared.
+	__forceinline uint32 ReadPixel24(uint32 addr) const
+	{
+		const uint32 word = m_vm32[addr];
+
+		return word & 0x00ffffff;
+	}
+
+	// Returns the 16-bit value at m_vm16[addr], widened to uint32.
+	__forceinline uint32 ReadPixel16(uint32 addr) const
+	{
+		const uint32 value = m_vm16[addr];
+
+		return value;
+	}
+
+	// Returns the byte at m_vm8[addr], widened to uint32.
+	__forceinline uint32 ReadPixel8(uint32 addr) const
+	{
+		const uint32 value = m_vm8[addr];
+
+		return value;
+	}
+
+	// Returns the 4-bit value addressed by addr: byte addr/2 of m_vm8,
+	// low nibble when addr is even, high nibble when addr is odd.
+	__forceinline uint32 ReadPixel4(uint32 addr) const
+	{
+		const uint32 shift = (addr & 1) << 2;
+
+		return (m_vm8[addr >> 1] >> shift) & 0x0f;
+	}
+
+	// Returns the top byte (bits 24-31) of the word at m_vm32[addr].
+	__forceinline uint32 ReadPixel8H(uint32 addr) const
+	{
+		const uint32 word = m_vm32[addr];
+
+		return word >> 24;
+	}
+
+	// Returns bits 24-27 of the word at m_vm32[addr].
+	__forceinline uint32 ReadPixel4HL(uint32 addr) const
+	{
+		const uint32 word = m_vm32[addr];
+
+		return (word >> 24) & 0x0f;
+	}
+
+	// Returns bits 28-31 of the word at m_vm32[addr].
+	__forceinline uint32 ReadPixel4HH(uint32 addr) const
+	{
+		const uint32 word = m_vm32[addr];
+
+		return (word >> 28) & 0x0f;
+	}
+
+	// Returns the low 24 bits of the word at m_vm32[addr] with bit 31 forced to 1.
+	__forceinline uint32 ReadFrame24(uint32 addr) const
+	{
+		const uint32 low24 = m_vm32[addr] & 0xffffff;
+
+		return low24 | 0x80000000;
+	}
+
+	// Expands the 16-bit value at m_vm16[addr] to 32 bits by repositioning its four bit fields:
+	// bit 15 -> bit 31, bits 10-14 -> bits 19-23, bits 5-9 -> bits 11-15, bits 0-4 -> bits 3-7.
+	__forceinline uint32 ReadFrame16(uint32 addr) const
+	{
+		const uint32 c = m_vm16[addr];
+
+		const uint32 top    = (c & 0x8000) << 16;
+		const uint32 upper  = (c & 0x7c00) << 9;
+		const uint32 middle = (c & 0x03e0) << 6;
+		const uint32 lower  = (c & 0x001f) << 3;
+
+		return top | upper | middle | lower;
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32 resolves the address, ReadPixel32(addr) fetches it.
+	__forceinline uint32 ReadPixel32(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadPixel32(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32 resolves the address, ReadPixel24(addr) fetches it.
+	__forceinline uint32 ReadPixel24(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadPixel24(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress16 resolves the address, ReadPixel16(addr) fetches it.
+	__forceinline uint32 ReadPixel16(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16(x, y, bp, bw);
+
+		return ReadPixel16(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress16S resolves the address, ReadPixel16(addr) fetches it.
+	__forceinline uint32 ReadPixel16S(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16S(x, y, bp, bw);
+
+		return ReadPixel16(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress8 resolves the address, ReadPixel8(addr) fetches it.
+	__forceinline uint32 ReadPixel8(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress8(x, y, bp, bw);
+
+		return ReadPixel8(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress4 resolves the address, ReadPixel4(addr) fetches it.
+	__forceinline uint32 ReadPixel4(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress4(x, y, bp, bw);
+
+		return ReadPixel4(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32 resolves the address, ReadPixel8H(addr) fetches it.
+	__forceinline uint32 ReadPixel8H(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadPixel8H(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32 resolves the address, ReadPixel4HL(addr) fetches it.
+	__forceinline uint32 ReadPixel4HL(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadPixel4HL(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32 resolves the address, ReadPixel4HH(addr) fetches it.
+	__forceinline uint32 ReadPixel4HH(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadPixel4HH(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32Z resolves the address, ReadPixel32(addr) fetches it.
+	__forceinline uint32 ReadPixel32Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32Z(x, y, bp, bw);
+
+		return ReadPixel32(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress32Z resolves the address, ReadPixel24(addr) fetches it.
+	__forceinline uint32 ReadPixel24Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32Z(x, y, bp, bw);
+
+		return ReadPixel24(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress16Z resolves the address, ReadPixel16(addr) fetches it.
+	__forceinline uint32 ReadPixel16Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16Z(x, y, bp, bw);
+
+		return ReadPixel16(addr);
+	}
+
+	// Reads the pixel at (x, y): PixelAddress16SZ resolves the address, ReadPixel16(addr) fetches it.
+	__forceinline uint32 ReadPixel16SZ(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16SZ(x, y, bp, bw);
+
+		return ReadPixel16(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress32 resolves the address, ReadFrame24(addr) fetches it.
+	__forceinline uint32 ReadFrame24(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		return ReadFrame24(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress16 resolves the address, ReadFrame16(addr) expands it.
+	__forceinline uint32 ReadFrame16(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16(x, y, bp, bw);
+
+		return ReadFrame16(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress16S resolves the address, ReadFrame16(addr) expands it.
+	__forceinline uint32 ReadFrame16S(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16S(x, y, bp, bw);
+
+		return ReadFrame16(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress32Z resolves the address, ReadFrame24(addr) fetches it.
+	__forceinline uint32 ReadFrame24Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress32Z(x, y, bp, bw);
+
+		return ReadFrame24(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress16Z resolves the address, ReadFrame16(addr) expands it.
+	__forceinline uint32 ReadFrame16Z(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16Z(x, y, bp, bw);
+
+		return ReadFrame16(addr);
+	}
+
+	// Reads the frame value at (x, y): PixelAddress16SZ resolves the address, ReadFrame16(addr) expands it.
+	__forceinline uint32 ReadFrame16SZ(int x, int y, uint32 bp, uint32 bw) const
+	{
+		const uint32 addr = PixelAddress16SZ(x, y, bp, bw);
+
+		return ReadFrame16(addr);
+	}
+
+	// Stores c as the 32-bit word at m_vm32[addr].
+	__forceinline void WritePixel32(uint32 addr, uint32 c)
+	{
+		uint32* const slot = &m_vm32[addr];
+
+		*slot = c;
+	}
+
+	// Replaces the low 24 bits of m_vm32[addr] with those of c; the stored top byte is kept.
+	__forceinline void WritePixel24(uint32 addr, uint32 c)
+	{
+		const uint32 kept = m_vm32[addr] & 0xff000000;
+
+		m_vm32[addr] = kept | (c & 0x00ffffff);
+	}
+
+	// Stores the low 16 bits of c at m_vm16[addr].
+	__forceinline void WritePixel16(uint32 addr, uint32 c)
+	{
+		const uint16 low = (uint16)c;
+
+		m_vm16[addr] = low;
+	}
+
+	// Stores the low 8 bits of c at m_vm8[addr].
+	__forceinline void WritePixel8(uint32 addr, uint32 c)
+	{
+		const uint8 low = (uint8)c;
+
+		m_vm8[addr] = low;
+	}
+
+	// Stores the low 4 bits of c into the nibble addressed by addr: byte addr/2 of m_vm8,
+	// low nibble for even addr and high nibble for odd addr. The other nibble is preserved.
+	__forceinline void WritePixel4(uint32 addr, uint32 c)
+	{
+		const int shift = (addr & 1) << 2;
+		const uint32 byte_index = addr >> 1;
+
+		// 0xf0 >> shift selects the nibble that must survive the write.
+		const uint32 kept = m_vm8[byte_index] & (0xf0 >> shift);
+		const uint32 nibble = (c & 0x0f) << shift;
+
+		m_vm8[byte_index] = (uint8)(kept | nibble);
+	}
+
+	// Replaces the top byte (bits 24-31) of m_vm32[addr] with c << 24; the low 24 bits are kept.
+	__forceinline void WritePixel8H(uint32 addr, uint32 c)
+	{
+		const uint32 kept = m_vm32[addr] & 0x00ffffff;
+
+		m_vm32[addr] = kept | (c << 24);
+	}
+
+	// Replaces bits 24-27 of m_vm32[addr] with the low 4 bits of c; all other bits are kept.
+	__forceinline void WritePixel4HL(uint32 addr, uint32 c)
+	{
+		const uint32 kept = m_vm32[addr] & 0xf0ffffff;
+
+		m_vm32[addr] = kept | ((c & 0x0f) << 24);
+	}
+
+	// Replaces bits 28-31 of m_vm32[addr] with the low 4 bits of c; all other bits are kept.
+	__forceinline void WritePixel4HH(uint32 addr, uint32 c)
+	{
+		const uint32 kept = m_vm32[addr] & 0x0fffffff;
+
+		m_vm32[addr] = kept | ((c & 0x0f) << 28);
+	}
+
+	// Packs the 32-bit value c into 16 bits and stores it via WritePixel16(addr, ...):
+	// bit 31 -> bit 15, bits 19-23 -> bits 10-14, bits 11-15 -> bits 5-9, bits 3-7 -> bits 0-4.
+	// Shifted bits that land above bit 15 are dropped by WritePixel16's 16-bit store.
+	__forceinline void WriteFrame16(uint32 addr, uint32 c)
+	{
+		const uint32 rb = c & 0x00f800f8;
+		const uint32 ga = c & 0x8000f800;
+
+		const uint32 packed = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
+
+		WritePixel16(addr, packed);
+	}
+
+	// Writes c at (x, y): PixelAddress32 resolves the address, WritePixel32(addr, c) stores it.
+	__forceinline void WritePixel32(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		WritePixel32(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress32 resolves the address, WritePixel24(addr, c) stores it.
+	__forceinline void WritePixel24(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		WritePixel24(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16 resolves the address, WritePixel16(addr, c) stores it.
+	__forceinline void WritePixel16(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16(x, y, bp, bw);
+
+		WritePixel16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16S resolves the address, WritePixel16(addr, c) stores it.
+	__forceinline void WritePixel16S(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16S(x, y, bp, bw);
+
+		WritePixel16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress8 resolves the address, WritePixel8(addr, c) stores it.
+	__forceinline void WritePixel8(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress8(x, y, bp, bw);
+
+		WritePixel8(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress4 resolves the address, WritePixel4(addr, c) stores it.
+	__forceinline void WritePixel4(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress4(x, y, bp, bw);
+
+		WritePixel4(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress32 resolves the address, WritePixel8H(addr, c) stores it.
+	__forceinline void WritePixel8H(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		WritePixel8H(addr, c);
+	}
+
+    // Writes c at (x, y): PixelAddress32 resolves the address, WritePixel4HL(addr, c) stores it.
+    __forceinline void WritePixel4HL(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		WritePixel4HL(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress32 resolves the address, WritePixel4HH(addr, c) stores it.
+	__forceinline void WritePixel4HH(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32(x, y, bp, bw);
+
+		WritePixel4HH(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress32Z resolves the address, WritePixel32(addr, c) stores it.
+	__forceinline void WritePixel32Z(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32Z(x, y, bp, bw);
+
+		WritePixel32(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress32Z resolves the address, WritePixel24(addr, c) stores it.
+	__forceinline void WritePixel24Z(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress32Z(x, y, bp, bw);
+
+		WritePixel24(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16Z resolves the address, WritePixel16(addr, c) stores it.
+	__forceinline void WritePixel16Z(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16Z(x, y, bp, bw);
+
+		WritePixel16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16SZ resolves the address, WritePixel16(addr, c) stores it.
+	__forceinline void WritePixel16SZ(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16SZ(x, y, bp, bw);
+
+		WritePixel16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16 resolves the address, WriteFrame16(addr, c) packs and stores it.
+	__forceinline void WriteFrame16(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16(x, y, bp, bw);
+
+		WriteFrame16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16S resolves the address, WriteFrame16(addr, c) packs and stores it.
+	__forceinline void WriteFrame16S(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16S(x, y, bp, bw);
+
+		WriteFrame16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16Z resolves the address, WriteFrame16(addr, c) packs and stores it.
+	__forceinline void WriteFrame16Z(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16Z(x, y, bp, bw);
+
+		WriteFrame16(addr, c);
+	}
+
+	// Writes c at (x, y): PixelAddress16SZ resolves the address, WriteFrame16(addr, c) packs and stores it.
+	__forceinline void WriteFrame16SZ(int x, int y, uint32 c, uint32 bp, uint32 bw)
+	{
+		const uint32 addr = PixelAddress16SZ(x, y, bp, bw);
+
+		WriteFrame16(addr, c);
+	}
+
+	// Copies the rectangle r from a linear source buffer into m_vm32.
+	// src points at the pixel for (r.left, r.top); consecutive source rows are pitch bytes apart.
+	// Destinations come from off: row[y] gives the row base, col[0][x] the per-column offset.
+	__forceinline void WritePixel32(uint8* RESTRICT src, uint32 pitch, GSOffset* off, const GSVector4i& r)
+	{
+		// Rebase src so a row can be indexed by absolute x instead of x - r.left.
+		src -= r.left * sizeof(uint32);
+
+		int y = r.top;
+
+		while(y < r.bottom)
+		{
+			uint32* RESTRICT in = (uint32*)src;
+			uint32* RESTRICT out = &m_vm32[off->pixel.row[y]];
+			int* RESTRICT column = off->pixel.col[0];
+
+			for(int x = r.left; x < r.right; x++)
+			{
+				out[column[x]] = in[x];
+			}
+
+			y++;
+			src += pitch;
+		}
+	}
+
+	// Copies the low 24 bits of each 32-bit source pixel in rectangle r into m_vm32,
+	// keeping the top byte already stored at each destination.
+	// src points at the pixel for (r.left, r.top); consecutive source rows are pitch bytes apart.
+	// Destinations come from off: row[y] gives the row base, col[0][x] the per-column offset.
+	__forceinline void WritePixel24(uint8* RESTRICT src, uint32 pitch, GSOffset* off, const GSVector4i& r)
+	{
+		// Rebase src so a row can be indexed by absolute x instead of x - r.left.
+		src -= r.left * sizeof(uint32);
+
+		int y = r.top;
+
+		while(y < r.bottom)
+		{
+			uint32* RESTRICT in = (uint32*)src;
+			uint32* RESTRICT out = &m_vm32[off->pixel.row[y]];
+			int* RESTRICT column = off->pixel.col[0];
+
+			for(int x = r.left; x < r.right; x++)
+			{
+				const int idx = column[x];
+
+				out[idx] = (out[idx] & 0xff000000) | (in[x] & 0x00ffffff);
+			}
+
+			y++;
+			src += pitch;
+		}
+	}
+
+	// Copies the rectangle r of 16-bit pixels from a linear source buffer into m_vm16.
+	// src points at the pixel for (r.left, r.top); consecutive source rows are pitch bytes apart.
+	// Destinations come from off: row[y] gives the row base, col[0][x] the per-column offset.
+	__forceinline void WritePixel16(uint8* RESTRICT src, uint32 pitch, GSOffset* off, const GSVector4i& r)
+	{
+		// Rebase src so a row can be indexed by absolute x instead of x - r.left.
+		src -= r.left * sizeof(uint16);
+
+		int y = r.top;
+
+		while(y < r.bottom)
+		{
+			uint16* RESTRICT in = (uint16*)src;
+			uint16* RESTRICT out = &m_vm16[off->pixel.row[y]];
+			int* RESTRICT column = off->pixel.col[0];
+
+			for(int x = r.left; x < r.right; x++)
+			{
+				out[column[x]] = in[x];
+			}
+
+			y++;
+			src += pitch;
+		}
+	}
+
+	// Packs each 32-bit source pixel in rectangle r down to 16 bits and stores it into m_vm16.
+	// The packing matches WriteFrame16(addr, c): bit 31 -> 15, bits 19-23 -> 10-14,
+	// bits 11-15 -> 5-9, bits 3-7 -> 0-4.
+	// src points at the pixel for (r.left, r.top); consecutive source rows are pitch bytes apart.
+	// Destinations come from off: row[y] gives the row base, col[0][x] the per-column offset.
+	__forceinline void WriteFrame16(uint8* RESTRICT src, uint32 pitch, GSOffset* off, const GSVector4i& r)
+	{
+		// Rebase src so a row can be indexed by absolute x instead of x - r.left.
+		src -= r.left * sizeof(uint32);
+
+		int y = r.top;
+
+		while(y < r.bottom)
+		{
+			uint32* RESTRICT in = (uint32*)src;
+			uint16* RESTRICT out = &m_vm16[off->pixel.row[y]];
+			int* RESTRICT column = off->pixel.col[0];
+
+			for(int x = r.left; x < r.right; x++)
+			{
+				const uint32 pixel = in[x];
+				const uint32 rb = pixel & 0x00f800f8;
+				const uint32 ga = pixel & 0x8000f800;
+
+				out[column[x]] = (uint16)((ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3));
+			}
+
+			y++;
+			src += pitch;
+		}
+	}
+
+	// Returns the 32-bit word at m_vm32[addr] unchanged; TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel32(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 texel = m_vm32[addr];
+
+		return texel;
+	}
+
+	// Returns the word at m_vm32[addr] after conversion by Expand24To32 with TEXA.
+	__forceinline uint32 ReadTexel24(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		uint32* const src = &m_vm32[addr];
+
+		return Expand24To32(*src, TEXA);
+	}
+
+	// Returns the 16-bit value at m_vm16[addr] after conversion by Expand16To32 with TEXA.
+	__forceinline uint32 ReadTexel16(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		uint16* const src = &m_vm16[addr];
+
+		return Expand16To32(*src, TEXA);
+	}
+
+	// Looks up m_clut with the index read by ReadPixel8(addr); TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel8(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 index = ReadPixel8(addr);
+
+		return m_clut[index];
+	}
+
+	// Looks up m_clut with the index read by ReadPixel4(addr); TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel4(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 index = ReadPixel4(addr);
+
+		return m_clut[index];
+	}
+
+	// Looks up m_clut with the index read by ReadPixel8H(addr); TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel8H(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 index = ReadPixel8H(addr);
+
+		return m_clut[index];
+	}
+
+	// Looks up m_clut with the index read by ReadPixel4HL(addr); TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel4HL(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 index = ReadPixel4HL(addr);
+
+		return m_clut[index];
+	}
+
+	// Looks up m_clut with the index read by ReadPixel4HH(addr); TEXA is accepted but not used.
+	__forceinline uint32 ReadTexel4HH(uint32 addr, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 index = ReadPixel4HH(addr);
+
+		return m_clut[index];
+	}
+
+	// Reads the texel at (x, y): PixelAddress32 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel32(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel32(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel32(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel24(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel24(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel24(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress16 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel16(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel16(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress16(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel16(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress16S on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel16(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel16S(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress16S(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel16(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress8 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel8(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel8(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress8(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel8(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress4 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel4(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel4(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress4(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel4(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel8H(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel8H(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel8H(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel4HL(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel4HL(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel4HL(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32 on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel4HH(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel4HH(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel4HH(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32Z on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel32(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel32Z(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32Z(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel32(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress32Z on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel24(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel24Z(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress32Z(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel24(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress16Z on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel16(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel16Z(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress16Z(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel16(addr, TEXA);
+	}
+
+	// Reads the texel at (x, y): PixelAddress16SZ on TEX0.TBP0/TEX0.TBW resolves the address,
+	// ReadTexel16(addr, TEXA) produces the value.
+	__forceinline uint32 ReadTexel16SZ(int x, int y, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) const
+	{
+		const uint32 addr = PixelAddress16SZ(x, y, TEX0.TBP0, TEX0.TBW);
+
+		return ReadTexel16(addr, TEXA);
+	}
+
+	//
+
+	template<int psm, int bsx, int bsy, int alignment>
+	void WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
+
+	template<int psm, int bsx, int bsy, int alignment>
+	void WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
+
+	template<int psm, int bsx, int bsy>
+	void WriteImageLeftRight(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
+
+	template<int psm, int bsx, int bsy, int trbpp>
+	void WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
+
+	template<int psm, int bsx, int bsy, int trbpp>
+	void WriteImage(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+
+	void WriteImage24(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	void WriteImage8H(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	void WriteImage4HL(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	void WriteImage4HH(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	void WriteImage24Z(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+	void WriteImageX(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG);
+
+	// TODO: ReadImage32/24/...
+
+	void ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) const;
+
+	// * => 32
+
+	void ReadTexture32(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture24(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture16(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture8(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture8H(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4HL(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4HH(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+
+	void ReadTexture(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+
+	void ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+
+	// pal ? 8 : 32
+
+	void ReadTexture8P(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4P(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture8HP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4HLP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+	void ReadTexture4HHP(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+
+	void ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+	void ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
+
+	//
+
+	template<typename T> void ReadTexture(const GSOffset* RESTRICT off, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
+
+	//
+
+	void SaveBMP(const string& fn, uint32 bp, uint32 bw, uint32 psm, int w, int h);
+};
+
diff --git a/plugins/GSdx_legacy/GSLzma.cpp b/plugins/GSdx_legacy/GSLzma.cpp
new file mode 100644
index 0000000000..77a76dc55e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLzma.cpp
@@ -0,0 +1,161 @@
+/*
+ *	Copyright (C) 2015-2015 Gregory hainaut
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSLzma.h"
+
+#ifdef __linux__
+
+// Opens filename for binary reading. On failure, reports the name on stderr and
+// throws the string literal "BAD".
+GSDumpFile::GSDumpFile(char* filename)
+	: m_fp(fopen(filename, "rb"))
+{
+	if (m_fp != NULL)
+		return;
+
+	fprintf(stderr, "failed to open %s\n", filename);
+	throw "BAD"; // Just exit the program
+}
+
+// Closes the dump file if one is open.
+GSDumpFile::~GSDumpFile() {
+	if (m_fp != NULL) {
+		fclose(m_fp);
+	}
+}
+
+/******************************************************************/
+#ifdef LZMA_SUPPORTED
+
+// Opens filename through GSDumpFile and prepares an LZMA stream decoder for it.
+// Throws "BAD" if the decoder cannot be initialized. Allocates a 1 MiB output area
+// and a BUFSIZ-byte input buffer, both 32-byte aligned.
+GSDumpLzma::GSDumpLzma(char* filename) : GSDumpFile(filename) {
+
+	memset(&m_strm, 0, sizeof(lzma_stream));
+
+	const lzma_ret status = lzma_stream_decoder(&m_strm, UINT32_MAX, 0);
+
+	if (status != LZMA_OK) {
+		fprintf(stderr, "Error initializing the decoder! (error code %u)\n", status);
+		throw "BAD"; // Just exit the program
+	}
+
+	// No decoded data is available yet.
+	m_avail = 0;
+	m_start = 0;
+
+	m_buff_size = 1024*1024;
+	m_area      = (uint8_t*)_aligned_malloc(m_buff_size, 32);
+	m_inbuf     = (uint8_t*)_aligned_malloc(BUFSIZ, 32);
+
+	// The decoder starts with an empty input buffer and the whole output area free.
+	m_strm.next_in   = m_inbuf;
+	m_strm.avail_in  = 0;
+
+	m_strm.next_out  = m_area;
+	m_strm.avail_out = m_buff_size;
+}
+
+// Runs one decoding step: refills the input buffer from the file when it is empty,
+// decodes into the start of m_area, and records how many bytes were produced in m_avail
+// (with m_start reset to 0). Throws "BAD" on a file read error or a decoder error.
+void GSDumpLzma::Decompress() {
+	// Each call decodes into the whole output area from its beginning.
+	m_strm.avail_out = m_buff_size;
+	m_strm.next_out  = m_area;
+
+	// Nothing left in the input buffer. Read data from the file
+	const bool need_input = (m_strm.avail_in == 0) && !feof(m_fp);
+	if (need_input) {
+		m_strm.next_in  = m_inbuf;
+		m_strm.avail_in = fread(m_inbuf, 1, BUFSIZ, m_fp);
+
+		if (ferror(m_fp)) {
+			fprintf(stderr, "Read error: %s\n", strerror(errno));
+			throw "BAD"; // Just exit the program
+		}
+	}
+
+	const lzma_ret ret = lzma_code(&m_strm, LZMA_RUN);
+
+	switch (ret) {
+		case LZMA_OK:
+			break;
+
+		case LZMA_STREAM_END:
+			fprintf(stderr, "LZMA decoder finished without error\n\n");
+			break;
+
+		default:
+			fprintf(stderr, "Decoder error: (error code %u)\n", ret);
+			throw "BAD"; // Just exit the program
+	}
+
+	m_start = 0;
+	m_avail = m_buff_size - m_strm.avail_out;
+}
+
+// Returns true once every byte of the dump has been handed out: the file is at EOF,
+// the decoded area is drained, and no compressed input is still waiting in the decoder's
+// input buffer. Without the avail_in check, EOF was reported as soon as the file had been
+// read and the current output area consumed, even though the remaining compressed bytes
+// could still expand into more data.
+bool GSDumpLzma::IsEof() {
+	return feof(m_fp) && (m_avail == 0) && (m_strm.avail_in == 0);
+}
+
+// Copies exactly size decoded bytes into ptr, running Decompress() whenever the decoded
+// area is empty. Throws "BAD" if the stream ends before size bytes were delivered
+// (Decompress() may also throw on read or decoder errors).
+void GSDumpLzma::Read(void* ptr, size_t size) {
+	size_t off = 0;
+	uint8_t* dst = (uint8_t*)ptr;
+	while (size) {
+		if (m_avail == 0) {
+			Decompress();
+
+			// The decoder produced nothing, has no pending input, and the file is exhausted:
+			// no further call can make progress. Fail instead of spinning forever.
+			if (m_avail == 0 && m_strm.avail_in == 0 && feof(m_fp)) {
+				fprintf(stderr, "GSDumpLzma:: Read error\n");
+				throw "BAD"; // Just exit the program
+			}
+		}
+
+		size_t l = min(size, m_avail);
+		memcpy(dst + off, m_area+m_start, l);
+		m_avail -= l;
+		size    -= l;
+		m_start += l;
+		off     += l;
+	}
+}
+
+// Releases the decoder state, then the input buffer and the output area if they were allocated.
+GSDumpLzma::~GSDumpLzma() {
+	lzma_end(&m_strm);
+
+	if (m_inbuf != NULL) {
+		_aligned_free(m_inbuf);
+	}
+
+	if (m_area != NULL) {
+		_aligned_free(m_area);
+	}
+}
+
+#endif
+
+/******************************************************************/
+
+// Opens filename through GSDumpFile. The buffer bookkeeping members are only zeroed here;
+// GSDumpRaw::Read works directly on the file.
+GSDumpRaw::GSDumpRaw(char* filename)
+	: GSDumpFile(filename)
+	, m_buff_size(0)
+	, m_area(NULL)
+	, m_inbuf(NULL)
+	, m_avail(0)
+	, m_start(0)
+{
+}
+
+// Nothing to release here: the constructor allocates no buffers, and the file handle
+// is closed by ~GSDumpFile.
+GSDumpRaw::~GSDumpRaw() {
+}
+
+// Returns true when the end-of-file indicator is set on the dump file.
+bool GSDumpRaw::IsEof() {
+	const int at_end = feof(m_fp);
+
+	return at_end != 0;
+}
+
+// Reads size bytes from the dump file into ptr.
+// A single-byte read goes through fgetc() and never throws: at end-of-file the low byte of
+// EOF is stored (typically 0xFF) and the stream's EOF indicator is set.
+// Any other size uses fread() and throws "BAD" on a short read.
+void GSDumpRaw::Read(void* ptr, size_t size) {
+	if (size == 1) {
+		// I don't know why but read of size 1 is not happy
+		int v = fgetc(m_fp);
+		// Store the low byte explicitly; copying the first byte of the int in memory
+		// only yields the low byte on little-endian hosts.
+		*(uint8_t*)ptr = (uint8_t)v;
+	} else {
+		size_t ret = fread(ptr, 1, size, m_fp);
+		if (ret != size) {
+			fprintf(stderr, "GSDumpRaw:: Read error\n");
+			throw "BAD"; // Just exit the program
+		}
+	}
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSLzma.h b/plugins/GSdx_legacy/GSLzma.h
new file mode 100644
index 0000000000..596d97e255
--- /dev/null
+++ b/plugins/GSdx_legacy/GSLzma.h
@@ -0,0 +1,84 @@
+/*
+ *	Copyright (C) 2015-2015 Gregory hainaut
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#ifdef __linux__
+
+#ifdef LZMA_SUPPORTED
+#include <lzma.h>
+#endif
+
+// Abstract base class for dump readers. It owns the FILE* handle: the constructor opens
+// filename for binary reading (throwing "BAD" on failure) and the destructor closes it.
+// Subclasses supply IsEof() and Read().
+class GSDumpFile {
+	protected:
+	FILE*		m_fp; // opened by the constructor, closed by the destructor
+
+
+	public:
+	// True when no more data can be read from the dump.
+	virtual bool IsEof() = 0;
+	// Reads size bytes into ptr.
+	virtual void Read(void* ptr, size_t size) = 0;
+
+	GSDumpFile(char* filename);
+	virtual ~GSDumpFile();
+};
+
+#ifdef LZMA_SUPPORTED
+// Dump reader that decodes its file through a liblzma stream decoder.
+// Decoded bytes are staged in m_area and handed out by Read().
+class GSDumpLzma : public GSDumpFile {
+
+	lzma_stream m_strm;			// liblzma decoder state
+
+	size_t		m_buff_size;	// size in bytes of m_area
+	uint8_t*	m_area;			// decoded output area
+	uint8_t*	m_inbuf;		// compressed input buffer (BUFSIZ bytes), filled from m_fp
+
+	size_t		m_avail;		// decoded bytes in m_area not yet returned by Read()
+	size_t		m_start;		// offset in m_area of the next byte to return
+
+	// Decodes the next chunk into m_area and resets m_start/m_avail accordingly.
+	void Decompress();
+
+	public:
+
+	GSDumpLzma(char* filename);
+	virtual ~GSDumpLzma();
+
+	bool IsEof();
+	void Read(void* ptr, size_t size);
+};
+#endif
+
+// Dump reader for uncompressed files: Read() and IsEof() operate directly on m_fp.
+// NOTE(review): the buffer members below are only zero-initialized by the constructor, and
+// Decompress() is declared here but has no definition in GSLzma.cpp. They appear to mirror
+// GSDumpLzma and to be unused; confirm before removing.
+class GSDumpRaw : public GSDumpFile {
+
+	size_t		m_buff_size;
+	uint8_t*	m_area;
+	uint8_t*	m_inbuf;
+
+	size_t		m_avail;
+	size_t		m_start;
+
+	void Decompress();
+
+	public:
+
+	GSDumpRaw(char* filename);
+	virtual ~GSDumpRaw();
+
+	bool IsEof();
+	void Read(void* ptr, size_t size);
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/GSPerfMon.cpp b/plugins/GSdx_legacy/GSPerfMon.cpp
new file mode 100644
index 0000000000..52a82006ef
--- /dev/null
+++ b/plugins/GSdx_legacy/GSPerfMon.cpp
@@ -0,0 +1,119 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSPerfMon.h"
+
+// Creates a performance monitor with every counter, statistic and timer cleared.
+GSPerfMon::GSPerfMon()
+	: m_frame(0)
+	, m_lastframe(0)
+	, m_count(0)
+{
+	memset(m_counters, 0, sizeof(m_counters));
+	memset(m_stats, 0, sizeof(m_stats));
+	memset(m_total, 0, sizeof(m_total));
+	memset(m_begin, 0, sizeof(m_begin));
+	// Stop() treats m_start[timer] > 0 as "timer running". Left uninitialized, a Stop()
+	// without a prior Start() would add a garbage interval to m_total.
+	memset(m_start, 0, sizeof(m_start));
+}
+
+// Accumulates a sample for counter c.
+// For every counter except Frame, val is added to the counter.
+// For Frame, val is ignored: the time elapsed since the previous Frame sample is added
+// (scaled by 1000 / CLOCKS_PER_SEC) and the frame and sample counts are incremented.
+// Does nothing when DISABLE_PERF_MON is defined.
+void GSPerfMon::Put(counter_t c, double val)
+{
+#ifndef DISABLE_PERF_MON
+	if(c != Frame)
+	{
+		m_counters[c] += val;
+
+		return;
+	}
+
+#ifdef __linux__
+	// clock() on linux reports CLOCK_PROCESS_CPUTIME_ID; CLOCK_THREAD_CPUTIME_ID
+	// is much more useful to measure the fps, so read it directly (in microseconds).
+	struct timespec ts;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+	const uint64 sec_us = (uint64) ts.tv_sec * (uint64) 1e6;
+	const uint64 nsec_us = (uint64) ts.tv_nsec / (uint64) 1e3;
+	uint64 now = sec_us + nsec_us;
+#else
+	clock_t now = clock();
+#endif
+
+	// The very first sample has no predecessor to measure against.
+	if(m_lastframe != 0)
+	{
+		m_counters[c] += (now - m_lastframe) * 1000 / CLOCKS_PER_SEC;
+	}
+
+	m_lastframe = now;
+	m_frame++;
+	m_count++;
+#endif
+}
+
+// Publishes per-sample averages: when at least one Frame sample was recorded, every
+// statistic becomes its counter divided by the sample count, and the count is reset.
+// The counters are cleared in all cases. Does nothing when DISABLE_PERF_MON is defined.
+void GSPerfMon::Update()
+{
+#ifndef DISABLE_PERF_MON
+	const int samples = m_count;
+
+	if(samples > 0)
+	{
+		size_t i = 0;
+
+		while(i < countof(m_counters))
+		{
+			m_stats[i] = m_counters[i] / samples;
+			i++;
+		}
+
+		m_count = 0;
+	}
+
+	memset(m_counters, 0, sizeof(m_counters));
+#endif
+}
+
+// Starts the given timer at the current TSC value. The first start since the last reset
+// also records that value as the beginning of the measurement window.
+// Does nothing when DISABLE_PERF_MON is defined.
+void GSPerfMon::Start(int timer)
+{
+#ifndef DISABLE_PERF_MON
+	const uint64 now = __rdtsc();
+
+	m_start[timer] = now;
+
+	if(m_begin[timer] == 0)
+	{
+		m_begin[timer] = now;
+	}
+#endif
+}
+
+// Stops the given timer if it is running (m_start[timer] is non-zero): the TSC ticks since
+// Start() are added to the timer's total and the timer is marked stopped.
+// Does nothing when DISABLE_PERF_MON is defined.
+void GSPerfMon::Stop(int timer)
+{
+#ifndef DISABLE_PERF_MON
+	if(m_start[timer] == 0)
+	{
+		return;
+	}
+
+	m_total[timer] += __rdtsc() - m_start[timer];
+	m_start[timer] = 0;
+#endif
+}
+
+// Returns, as an integer percentage, the accumulated ticks of the timer relative to the
+// ticks elapsed since its measurement window began. When reset is true, the timer's
+// window, start mark and total are cleared afterwards.
+int GSPerfMon::CPU(int timer, bool reset)
+{
+	const int percent = (int)(100 * m_total[timer] / (__rdtsc() - m_begin[timer]));
+
+	if(!reset)
+	{
+		return percent;
+	}
+
+	m_total[timer] = 0;
+	m_start[timer] = 0;
+	m_begin[timer] = 0;
+
+	return percent;
+}
diff --git a/plugins/GSdx_legacy/GSPerfMon.h b/plugins/GSdx_legacy/GSPerfMon.h
new file mode 100644
index 0000000000..45bedf9b88
--- /dev/null
+++ b/plugins/GSdx_legacy/GSPerfMon.h
@@ -0,0 +1,75 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+// Collects per-frame counters and TSC-based timers.
+// Counters are accumulated with Put(), averaged by Update() and read back with Get();
+// timers are driven by Start()/Stop() and reported by CPU().
+class GSPerfMon
+{
+public:
+	// Timer slots; TimerLast is the number of slots and sizes the timer arrays.
+	enum timer_t 
+	{
+		Main, 
+		Sync, 
+		WorkerDraw0, WorkerDraw1, WorkerDraw2, WorkerDraw3, WorkerDraw4, WorkerDraw5, WorkerDraw6, WorkerDraw7, 
+		WorkerDraw8, WorkerDraw9, WorkerDraw10, WorkerDraw11, WorkerDraw12, WorkerDraw13, WorkerDraw14, WorkerDraw15, 
+		TimerLast,
+	};
+	
+	// Counter slots; CounterLast is the number of slots and sizes the counter arrays.
+	enum counter_t 
+	{
+		Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint,
+		CounterLast,
+	};
+
+protected:
+	double m_counters[CounterLast];	// values accumulated by Put() since the last Update()
+	double m_stats[CounterLast];	// averages published by the last Update()
+	// Per-timer TSC values: window start, accumulated ticks, and start of the current run (0 = stopped).
+	uint64 m_begin[TimerLast], m_total[TimerLast], m_start[TimerLast];
+	uint64 m_frame;					// frame number, incremented by Put(Frame)
+	clock_t m_lastframe;			// timestamp of the previous Put(Frame); 0 means none yet
+	int m_count;					// Put(Frame) calls since the last Update()
+
+	friend class GSPerfMonAutoTimer;
+
+public:
+	GSPerfMon();
+
+	void SetFrame(uint64 frame) {m_frame = frame;}
+	uint64 GetFrame() {return m_frame;}
+
+	// Adds a sample to counter c; for Frame, val is ignored and elapsed time is recorded instead.
+	void Put(counter_t c, double val = 0);
+	// Returns the average published by the last Update() for counter c.
+	double Get(counter_t c) {return m_stats[c];}
+	void Update();
+
+	void Start(int timer = Main);
+	void Stop(int timer = Main);
+	// Percentage of the measurement window spent inside the timer; resets the timer by default.
+	int CPU(int timer = Main, bool reset = true);
+};
+
+// Scope guard for a GSPerfMon timer: starts the timer on construction and stops it on destruction.
+class GSPerfMonAutoTimer
+{
+	GSPerfMon* m_pm;
+	int m_timer;
+
+public:
+	GSPerfMonAutoTimer(GSPerfMon* pm, int timer = GSPerfMon::Main)
+		: m_pm(pm)
+		, m_timer(timer)
+	{
+		m_pm->Start(m_timer);
+	}
+
+	~GSPerfMonAutoTimer()
+	{
+		m_pm->Stop(m_timer);
+	}
+};
diff --git a/plugins/GSdx_legacy/GSPng.cpp b/plugins/GSdx_legacy/GSPng.cpp
new file mode 100644
index 0000000000..62b7e7d4b5
--- /dev/null
+++ b/plugins/GSdx_legacy/GSPng.cpp
@@ -0,0 +1,149 @@
+/*
+ *	Copyright (C) 2015-2015 Gregory hainaut
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSPng.h"
+#include <zlib.h>
+#include <png.h>
+
+// Per-format description table, indexed by GSPng::Format (one row per enum
+// value, in enum order). A format produces one PNG file, or two when
+// extension[1] is not null.
+struct {
+    // libpng color type of the first image (the second image is always written as gray).
+    int type;
+    // Bytes occupied by one pixel in the source buffer.
+    int bytes_per_pixel_in;
+    // Bytes of each source pixel that go into the first image; the second image
+    // receives the remaining (bytes_per_pixel_in - bytes_per_pixel_out) bytes.
+    int bytes_per_pixel_out;
+    // Bit depth per channel handed to png_set_IHDR.
+    int channel_bit_depth;
+    // File name suffixes of the first and (optional) second image.
+    const char *extension[2];
+} static const pixel[GSPng::Format::COUNT] = {
+    {PNG_COLOR_TYPE_RGBA, 4, 4, 8 , {"_full.png",     nullptr}},         // RGBA_PNG
+    {PNG_COLOR_TYPE_RGB , 4, 3, 8 , {".png",          nullptr}},         // RGB_PNG
+    {PNG_COLOR_TYPE_RGB , 4, 3, 8 , {".png",          "_alpha.png"}},    // RGB_A_PNG
+    {PNG_COLOR_TYPE_GRAY, 4, 1, 8 , {"_alpha.png",    nullptr}},         // ALPHA_PNG
+    {PNG_COLOR_TYPE_GRAY, 1, 1, 8 , {"_R8I.png",      nullptr}},         // R8I_PNG
+    {PNG_COLOR_TYPE_GRAY, 2, 2, 16, {"_R16I.png",     nullptr}},         // R16I_PNG
+    {PNG_COLOR_TYPE_GRAY, 4, 2, 16, {"_R32I_lsb.png", "_R32I_msb.png"}}, // R32I_PNG
+};
+
+namespace GSPng {
+
+    // Writes one PNG file from 'image'.
+    //
+    // file        - full output path
+    // fmt         - selects the row of the pixel[] table
+    // image       - first source row; consecutive rows are 'pitch' bytes apart
+    // row         - caller-provided scratch buffer that receives one packed
+    //               output row (bytes_per_pixel_out * width bytes are written)
+    // compression - zlib level; values outside [0, Z_BEST_COMPRESSION] fall
+    //               back to Z_BEST_SPEED
+    // rb_swapped  - request BGR channel order (ignored for gray output)
+    // first_image - true: write the format's primary image; false: write the
+    //               secondary gray image built from the remaining bytes of
+    //               every source pixel
+    //
+    // Returns true when the file was written completely. On failure a message
+    // is printed to stderr and false is returned (false is also returned,
+    // silently, when the file cannot be opened).
+    bool SaveFile(const string& file, Format fmt, uint8* image, uint8* row,
+        int width, int height, int pitch, int compression,
+        bool rb_swapped = false, bool first_image = false)
+    {
+        int channel_bit_depth = pixel[fmt].channel_bit_depth;
+        int bytes_per_pixel_in = pixel[fmt].bytes_per_pixel_in;
+
+        // The second image starts where the first one stopped inside each
+        // source pixel and takes whatever bytes are left.
+        int type = first_image ? pixel[fmt].type : PNG_COLOR_TYPE_GRAY;
+        int offset = first_image ? 0 : pixel[fmt].bytes_per_pixel_out;
+        int bytes_per_pixel_out = first_image ? pixel[fmt].bytes_per_pixel_out : bytes_per_pixel_in - offset;
+
+        FILE *fp = fopen(file.c_str(), "wb");
+        if (fp == nullptr)
+            return false;
+
+        png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
+        png_infop info_ptr = nullptr;
+
+        bool success = false;
+        try {
+            if (png_ptr == nullptr)
+                throw GSDXRecoverableError();
+
+            info_ptr = png_create_info_struct(png_ptr);
+            if (info_ptr == nullptr)
+                throw GSDXRecoverableError();
+
+            // libpng reports errors by longjmp()ing back to this point; the
+            // non-zero return is turned into the exception handled below.
+            if (setjmp(png_jmpbuf(png_ptr)))
+                throw GSDXRecoverableError();
+
+            if (compression < 0 || compression > Z_BEST_COMPRESSION)
+                compression = Z_BEST_SPEED;
+
+            png_init_io(png_ptr, fp);
+            png_set_compression_level(png_ptr, compression);
+            png_set_IHDR(png_ptr, info_ptr, width, height, channel_bit_depth, type,
+                PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
+            png_write_info(png_ptr, info_ptr);
+
+            // Write transformations: byte-swap 16-bit samples, and emit BGR
+            // order for color images when requested.
+            if (channel_bit_depth > 8)
+                png_set_swap(png_ptr);
+            if (rb_swapped && type != PNG_COLOR_TYPE_GRAY)
+                png_set_bgr(png_ptr);
+
+            // Pack the selected bytes of every source pixel into 'row', then
+            // hand the row to libpng.
+            for (int y = 0; y < height; ++y, image += pitch) {
+                for (int x = 0; x < width; ++x)
+                    for (int i = 0; i < bytes_per_pixel_out; ++i)
+                        row[bytes_per_pixel_out * x + i] = image[bytes_per_pixel_in * x + i + offset];
+                png_write_row(png_ptr, row);
+            }
+            png_write_end(png_ptr, nullptr);
+
+            success = true;
+        } catch (GSDXRecoverableError&) {
+            fprintf(stderr, "Failed to write image %s\n", file.c_str());
+        }
+
+        // Cleanup runs on both the success and the failure path.
+        if (png_ptr)
+            png_destroy_write_struct(&png_ptr, info_ptr ? &info_ptr : nullptr);
+        fclose(fp);
+
+        return success;
+    }
+
+    // Saves 'image' as one or two PNG files, depending on 'fmt'.
+    //
+    // fmt         - output format; selects the row of the pixel[] table
+    // file        - requested file name; its last four characters (the
+    //               extension) are replaced by the format's own suffix(es).
+    //               Names shorter than four characters are used unchanged as
+    //               the root.
+    // image       - source pixels, rows 'pitch' bytes apart
+    // w, h        - image size in pixels
+    // compression - zlib compression level (validated by SaveFile)
+    // rb_swapped  - request BGR channel order for the first image
+    //
+    // Returns true when every file of the format was written, false on the
+    // first failure or when there is nothing to write (null image or
+    // non-positive size).
+    bool Save(GSPng::Format fmt, const string& file, uint8* image, int w, int h, int pitch, int compression, bool rb_swapped)
+    {
+        ASSERT(fmt >= Format::START && fmt < Format::COUNT);
+
+        // A Transaction whose allocation failed carries a null image; refuse
+        // it (and empty sizes) here instead of dereferencing it in SaveFile.
+        if (image == nullptr || w <= 0 || h <= 0)
+            return false;
+
+        // Strip the extension. The length check keeps the subtraction from
+        // wrapping around, which previously made std::string throw
+        // std::out_of_range for very short names.
+        std::string root = file;
+        if (root.length() >= 4)
+            root.erase(root.length() - 4);
+
+        // Scratch row shared by both images; the first image is never
+        // narrower than the second one for the formats in pixel[].
+        std::unique_ptr<uint8[]> row(new uint8[static_cast<size_t>(pixel[fmt].bytes_per_pixel_out) * w]);
+
+        std::string filename = root + pixel[fmt].extension[0];
+        if (!SaveFile(filename, fmt, image, row.get(), w, h, pitch, compression, rb_swapped, true))
+            return false;
+
+        // Second image
+        if (pixel[fmt].extension[1] == nullptr)
+            return true;
+
+        filename = root + pixel[fmt].extension[1];
+        return SaveFile(filename, fmt, image, row.get(), w, h, pitch, compression);
+    }
+
+    // Captures everything needed to save the image later: the parameters are
+    // stored as-is and the pixels are copied into a private 32-byte aligned
+    // buffer of pitch*h bytes. If that allocation fails m_image stays null.
+    Transaction::Transaction(GSPng::Format fmt, const string& file, const uint8* image, int w, int h, int pitch, int compression)
+        : m_fmt(fmt)
+        , m_file(file)
+        , m_image((uint8*)_aligned_malloc(pitch*h, 32)) // Note: yes it would be better to use shared pointer
+        , m_w(w)
+        , m_h(h)
+        , m_pitch(pitch)
+        , m_compression(compression)
+    {
+        if (m_image != nullptr) {
+            memcpy(m_image, image, pitch*h);
+        }
+    }
+
+    // Releases the private pixel copy, if one was allocated.
+    Transaction::~Transaction()
+    {
+        if (m_image == nullptr)
+            return;
+
+        _aligned_free(m_image);
+    }
+
+    // Job queue callback: saves the queued image with the parameters captured
+    // in the transaction. The result of Save() is ignored and rb_swapped keeps
+    // its declared default.
+    void Worker::Process(shared_ptr<Transaction>& item)
+    {
+        Save(item->m_fmt, item->m_file, item->m_image, item->m_w, item->m_h, item->m_pitch, item->m_compression);
+    }
+
+}
diff --git a/plugins/GSdx_legacy/GSPng.h b/plugins/GSdx_legacy/GSPng.h
new file mode 100644
index 0000000000..6dd623fee3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSPng.h
@@ -0,0 +1,65 @@
+/*
+ *	Copyright (C) 2015-2015 Gregory hainaut
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSThread_CXX11.h"
+
+namespace GSPng {
+    // Output formats understood by Save(). The values index the description
+    // table in GSPng.cpp, so order and count must stay in sync with it.
+    enum Format {
+        START = 0,      // first valid value (alias of RGBA_PNG), used for range checks
+        RGBA_PNG = 0,
+        RGB_PNG,
+        RGB_A_PNG,
+        ALPHA_PNG,
+        R8I_PNG,
+        R16I_PNG,
+        R32I_PNG,
+        COUNT           // number of formats, not a format itself
+    };
+
+	// One queued "save this image" request. The constructor copies the pixels
+	// into m_image and the destructor frees that copy.
+	// NOTE(review): the class owns a raw buffer but declares no copy
+	// constructor/assignment, so copying an instance would free m_image twice;
+	// in this file it is only handled through shared_ptr.
+	class Transaction
+	{
+		public:
+			Format m_fmt;
+			const std::string m_file;
+			uint8* m_image; // private copy of the pixels, null if the allocation failed
+			int m_w;
+			int m_h;
+			int m_pitch;
+			int m_compression;
+
+			Transaction(GSPng::Format fmt, const string& file, const uint8* image, int w, int h, int pitch, int compression);
+			~Transaction();
+	};
+
+    // Writes 'image' (w x h pixels, rows 'pitch' bytes apart) as PNG in the
+    // given format; returns true on success. Some formats produce two files.
+    bool Save(GSPng::Format fmt, const string& file, uint8* image, int w, int h, int pitch, int compression, bool rb_swapped = false);
+
+	// Job queue (capacity template argument 16) whose items are image-save
+	// transactions; Process() performs the actual save.
+	class Worker : public GSJobQueue<shared_ptr<Transaction>, 16 >
+	{
+	public:
+		Worker() {}
+		virtual ~Worker() {}
+
+		void Process(shared_ptr<Transaction>& item);
+
+		// Always reports 0 pixels.
+		int GetPixels(bool reset) { return 0; }
+	};
+}
diff --git a/plugins/GSdx_legacy/GSRasterizer.cpp b/plugins/GSdx_legacy/GSRasterizer.cpp
new file mode 100644
index 0000000000..2075aac486
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRasterizer.cpp
@@ -0,0 +1,1234 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+// TODO: JIT Draw* (flags: depth, texture, color (+iip), scissor)
+
+#include "stdafx.h"
+#include "GSRasterizer.h"
+
+// THREAD_HEIGHT is the log2 of the height, in scanlines, of one horizontal
+// screen band: the code below converts with "y >> THREAD_HEIGHT" and
+// "band << THREAD_HEIGHT". Bands are dealt out to the threads round-robin.
+//
+// - for more threads screen segments should be smaller to better distribute the pixels
+// - but not too small to keep the threading overhead low
+// - ideal value between 3 and 5, or log2(64 / number of threads)
+
+#define THREAD_HEIGHT 4
+
+// Next value handed out to GSRasterizerData::counter by its constructor.
+int GSRasterizerData::s_counter = 0;
+
+// Creates the rasterizer of worker 'id' out of 'threads' workers.
+//
+// ds      - scanline drawer; deleted by the destructor
+// id      - index of this worker, in [0, threads)
+// threads - total number of workers sharing the screen, must be >= 1
+// perfmon - performance monitor used to time Draw()
+//
+// Builds m_scanline, a per-band ownership table: entry b is 1 when band b
+// (THREAD_HEIGHT-sized group of scanlines) belongs to this worker. Bands are
+// assigned round-robin, i.e. band b belongs to worker b % threads.
+GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
+	: m_perfmon(perfmon)
+	, m_ds(ds)
+	, m_id(id)
+	, m_threads(threads)
+{
+	ASSERT(threads > 0);
+
+	memset(&m_pixels, 0, sizeof(m_pixels));
+
+	m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
+	m_edge.count = 0;
+
+	// 2048 scanlines worth of bands plus 16 entries of padding.
+	const int rows = (2048 >> THREAD_HEIGHT) + 16;
+
+	m_scanline = (uint8*)_aligned_malloc(rows, 64);
+
+	// Fill the whole table, padding included. The previous nested loop had no
+	// bound on the inner index, so it could write past the band range, and it
+	// left the rest of the padding uninitialized although FindMyNextScanline()
+	// scans forward into it. For the band range the values are unchanged.
+	for(int row = 0; row < rows; row++)
+	{
+		m_scanline[row] = (row % threads) == id ? 1 : 0;
+	}
+}
+
+// Frees the band table and the scanline queue, then destroys the scanline
+// drawer that was passed to the constructor.
+GSRasterizer::~GSRasterizer()
+{
+	_aligned_free(m_scanline);
+
+	if(m_edge.buff != NULL)
+	{
+		// same size as requested from vmalloc in the constructor
+		vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
+	}
+
+	delete m_ds;
+}
+
+// Returns true when scanline 'top' lies in a band owned by this worker.
+bool GSRasterizer::IsOneOfMyScanlines(int top) const
+{
+	ASSERT(top >= 0 && top < 2048);
+
+	const int band = top >> THREAD_HEIGHT;
+
+	return m_scanline[band] != 0;
+}
+
+// Returns true when at least one band touched by the scanline range
+// [top, bottom) is owned by this worker.
+bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
+{
+	ASSERT(top >= 0 && top < 2048 && bottom >= 0 && bottom < 2048);
+
+	const int first = top >> THREAD_HEIGHT;
+	const int last = (bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT; // rounded up, exclusive
+
+	for(int band = first; band < last; band++)
+	{
+		if(m_scanline[band] != 0)
+		{
+			return true;
+		}
+	}
+
+	return false;
+}
+
+// Returns 'top' itself when it lies in one of this worker's bands, otherwise
+// the first scanline of the next band owned by this worker.
+int GSRasterizer::FindMyNextScanline(int top) const
+{
+	int band = top >> THREAD_HEIGHT;
+
+	if(m_scanline[band] != 0)
+	{
+		return top;
+	}
+
+	// walk forward until an owned band shows up (no upper bound is checked)
+	do
+	{
+		band++;
+	}
+	while(m_scanline[band] == 0);
+
+	return band << THREAD_HEIGHT;
+}
+
+// Draws the data immediately on the calling thread; nothing is queued.
+void GSRasterizer::Queue(const shared_ptr<GSRasterizerData>& data)
+{
+	Draw(data.get());
+}
+
+// Returns the number of pixels accumulated by Draw() so far and, when
+// 'reset' is set, restarts the accumulation at zero.
+int GSRasterizer::GetPixels(bool reset)
+{
+	const int total = m_pixels.sum;
+
+	if(reset) m_pixels.sum = 0;
+
+	return total;
+}
+
+// Rasterizes every primitive described by 'data' that falls on this worker's
+// scanlines. The call is timed under the WorkerDraw0 + m_id perfmon slot and
+// bracketed by BeginDraw/EndDraw on the scanline drawer. On return
+// data->start holds the starting timestamp and data->pixels the number of
+// pixels drawn by this call.
+void GSRasterizer::Draw(GSRasterizerData* data)
+{
+	GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerDraw0 + m_id);
+
+	// Nothing to do for an empty vertex or index list. (&& binds tighter than
+	// ||, so this reads "(vertex && !vertex_count) || (index && !index_count)".)
+	if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return;
+
+	m_pixels.actual = 0;
+	m_pixels.total = 0;
+
+	data->start = __rdtsc();
+
+	m_ds->BeginDraw(data);
+
+	const GSVertexSW* vertex = data->vertex;
+	const GSVertexSW* vertex_end = data->vertex + data->vertex_count;
+	
+	const uint32* index = data->index;
+	const uint32* index_end = data->index + data->index_count;
+
+	// Identity index list used when the primitives are not indexed; the
+	// vertex pointer is advanced instead.
+	uint32 tmp_index[] = {0, 1, 2};
+
+	// Points only need per-pixel scissoring when the bounding box is not
+	// already contained in the scissor rectangle.
+	bool scissor_test = !data->bbox.eq(data->bbox.rintersect(data->scissor));
+
+	// Float copies of the scissor bounds for the SIMD clamps.
+	// NOTE(review): assumes x/y/z/w map to left/top/right/bottom - confirm in GSVector.
+	m_scissor = data->scissor;
+	m_fscissor_x = GSVector4(data->scissor).xzxz();
+	m_fscissor_y = GSVector4(data->scissor).ywyw();
+
+	// The do/while loops below always process at least one primitive.
+	switch(data->primclass)
+	{
+	case GS_POINT_CLASS:
+
+		if(scissor_test)
+		{
+			DrawPoint<true>(vertex, data->vertex_count, index, data->index_count);
+		}
+		else 
+		{
+			DrawPoint<false>(vertex, data->vertex_count, index, data->index_count);
+		}
+
+		break;
+
+	case GS_LINE_CLASS:
+		
+		if(index != NULL)
+		{
+			do {DrawLine(vertex, index); index += 2;}
+			while(index < index_end);
+		}
+		else
+		{
+			do {DrawLine(vertex, tmp_index); vertex += 2;}
+			while(vertex < vertex_end);
+		}
+
+		break;
+
+	case GS_TRIANGLE_CLASS:
+		
+		if(index != NULL)
+		{
+			do {DrawTriangle(vertex, index); index += 3;}
+			while(index < index_end);
+		}
+		else
+		{
+			do {DrawTriangle(vertex, tmp_index); vertex += 3;}
+			while(vertex < vertex_end);
+		}
+
+		break;
+
+	case GS_SPRITE_CLASS:
+		
+		if(index != NULL)
+		{
+			do {DrawSprite(vertex, index); index += 2;}
+			while(index < index_end);
+		}
+		else
+		{
+			do {DrawSprite(vertex, tmp_index); vertex += 2;}
+			while(vertex < vertex_end);
+		}
+
+		break;
+
+	default:
+		// any other primitive class is declared unreachable to the compiler
+		__assume(0);
+	}
+
+	#if _M_SSE >= 0x501
+	_mm256_zeroupper();
+	#endif
+
+	data->pixels = m_pixels.actual;
+
+	uint64 ticks = __rdtsc() - data->start;
+
+	m_pixels.sum += m_pixels.actual;
+
+	m_ds->EndDraw(data->frame, ticks, m_pixels.actual, m_pixels.total);
+}
+
+// Draws a list of points, one pixel each.
+//
+// scissor_test - compile-time switch: when false the per-point scissor
+//                comparison is skipped
+// vertex/vertex_count - vertex array, used directly when 'index' is NULL
+// index/index_count   - optional index list into 'vertex'
+//
+// Points outside the scissor rectangle (when tested) or on scanlines owned by
+// other workers are skipped.
+template<bool scissor_test>
+void GSRasterizer::DrawPoint(const GSVertexSW* vertex, int vertex_count, const uint32* index, int index_count)
+{
+	if(index != NULL)
+	{
+		for(int i = 0; i < index_count; i++, index++)
+		{
+			const GSVertexSW& v = vertex[*index];
+
+			GSVector4i p(v.p);
+
+			if(!scissor_test || m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
+			{
+				if(IsOneOfMyScanlines(p.y))
+				{
+					m_ds->SetupPrim(vertex, index, GSVertexSW::zero());
+
+					DrawScanline(1, p.x, p.y, v);
+				}
+			}
+		}
+	}
+	else
+	{
+		// non-indexed: 'vertex' itself walks the array, so index 0 always
+		// refers to the current point
+		uint32 tmp_index[1] = {0};
+
+		for(int i = 0; i < vertex_count; i++, vertex++)
+		{
+			const GSVertexSW& v = vertex[0];
+
+			GSVector4i p(v.p);
+
+			if(!scissor_test || m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
+			{
+				if(IsOneOfMyScanlines(p.y))
+				{
+					m_ds->SetupPrim(vertex, tmp_index, GSVertexSW::zero());
+
+					DrawScanline(1, p.x, p.y, v);
+				}
+			}
+		}
+	}
+}
+
+// Draws the line between vertex[index[0]] and vertex[index[1]].
+//
+// Three paths:
+// - the drawer wants edges (HasEdge): both sides of the line are emitted
+//   through DrawEdge and flushed as edge pixels;
+// - the line is horizontal after integer conversion: drawn as one scanline;
+// - otherwise: stepped one pixel at a time along the major axis.
+void GSRasterizer::DrawLine(const GSVertexSW* vertex, const uint32* index)
+{
+	const GSVertexSW& v0 = vertex[index[0]];
+	const GSVertexSW& v1 = vertex[index[1]];
+	
+	GSVertexSW dv = v1 - v0;
+
+	GSVector4 dp = dv.p.abs();
+
+	// i is the major axis: 1 (y) when the line is steeper than 45 degrees, else 0 (x)
+	int i = (dp < dp.yxwz()).mask() & 1; // |dx| < |dy|
+
+	if(m_ds->HasEdge())
+	{
+		DrawEdge(v0, v1, dv, i, 0);
+		DrawEdge(v0, v1, dv, i, 1);
+
+		Flush(vertex, index, GSVertexSW::zero(), true);
+
+		return;
+	}
+
+	GSVector4i dpi(dp);
+
+	if(dpi.y == 0)
+	{
+		if(dpi.x > 0)
+		{
+			// shortcut for horizontal lines
+
+			// start from whichever end point has the smaller x
+			GSVector4 mask = (v0.p > v1.p).xxxx();
+
+			GSVertexSW scan;
+
+			scan.p = v0.p.blend32(v1.p, mask);
+			scan.t = v0.t.blend32(v1.t, mask);
+			scan.c = v0.c.blend32(v1.c, mask);
+
+			GSVector4i p(scan.p);
+
+			if(m_scissor.top <= p.y && p.y < m_scissor.bottom && IsOneOfMyScanlines(p.y))
+			{
+				// left/right x rounded up, then clamped to the scissor
+				GSVector4 lrf = scan.p.upl(v1.p.blend32(v0.p, mask)).ceil();
+				GSVector4 l = lrf.max(m_fscissor_x);
+				GSVector4 r = lrf.min(m_fscissor_x);
+				GSVector4i lr = GSVector4i(l.xxyy(r));
+
+				int left = lr.extract32<0>();
+				int right = lr.extract32<2>();
+
+				int pixels = right - left;
+
+				if(pixels > 0)
+				{
+					// per-pixel gradient along x
+					GSVertexSW dscan = dv / dv.p.xxxx();
+
+					// pre-step the attributes from the start point to the first drawn pixel
+					scan += dscan * (l - scan.p).xxxx();
+
+					m_ds->SetupPrim(vertex, index, dscan);
+
+					DrawScanline(pixels, left, p.y, scan);
+				}
+			}
+		}
+
+		return;
+	}
+
+	// number of single-pixel steps along the major axis
+	int steps = dpi.v[i];
+
+	if(steps > 0)
+	{
+		GSVertexSW edge = v0;
+		GSVertexSW dedge = dv / GSVector4(dp.v[i]);
+
+		GSVertexSW* RESTRICT e = m_edge.buff;
+
+		// Emits 'steps' points starting at v0; points outside the scissor or
+		// on other workers' scanlines are skipped.
+		while(1)
+		{
+			GSVector4i p(edge.p);
+
+			if(m_scissor.left <= p.x && p.x < m_scissor.right && m_scissor.top <= p.y && p.y < m_scissor.bottom)
+			{
+				if(IsOneOfMyScanlines(p.y))
+				{
+					AddScanline(e, 1, p.x, p.y, edge);
+
+					e++;
+				}
+			}
+
+			if(--steps == 0) break;
+
+			edge += dedge;
+		}
+
+		m_edge.count = e - m_edge.buff;
+
+		Flush(vertex, index, GSVertexSW::zero());
+	}
+}
+
+// Vertex order that sorts a triangle by ascending y. DrawTriangle indexes the
+// table with the 3-bit result of its pairwise y comparisons and reads the
+// first three columns as positions into the primitive's index list; the
+// fourth column is unused. The all-zero rows have no ordering comment and
+// correspond to comparison results that contradict each other.
+static const uint8 s_ysort[8][4] =
+{
+	{0, 1, 2, 0}, // y0 <= y1 <= y2
+	{1, 0, 2, 0}, // y1 < y0 <= y2
+	{0, 0, 0, 0},
+	{1, 2, 0, 0}, // y1 <= y2 < y0
+	{0, 2, 1, 0}, // y0 <= y2 < y1
+	{0, 0, 0, 0},
+	{2, 0, 1, 0}, // y2 < y0 <= y1
+	{2, 1, 0, 0}, // y2 < y1 < y0
+};
+
+#if _M_SSE >= 0x501
+
+// Draws the triangle vertex[index[0..2]] (variant built when _M_SSE >= 0x501).
+//
+// Steps: sort the vertices by y, reject zero-height and zero-area triangles,
+// derive the per-scanline (dedge) and per-pixel (dscan) gradients, queue the
+// spans of the upper and lower part with DrawTriangleSection, flush them, and
+// finally emit the three edges when the drawer asks for them (HasEdge).
+//
+// NOTE(review): vertices are accessed through GSVertexSW2, which exposes the
+// members p and tc here; presumably tc overlays t and c of GSVertexSW as one
+// 256-bit vector - confirm in GSVertexSW.h.
+void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
+{
+	GSVertexSW2 dv[3];
+	GSVertexSW2 edge;
+	GSVertexSW2 dedge;
+	GSVertexSW2 dscan;
+
+	GSVector4 y0011 = vertex[index[0]].p.yyyy(vertex[index[1]].p);
+	GSVector4 y1221 = vertex[index[1]].p.yyyy(vertex[index[2]].p).xzzx();
+
+	// pairwise "greater than" results of the three y values select the sort order
+	int m1 = (y0011 > y1221).mask() & 7;
+
+	int i[3];
+
+	i[0] = index[s_ysort[m1][0]];
+	i[1] = index[s_ysort[m1][1]];
+	i[2] = index[s_ysort[m1][2]];
+
+	const GSVertexSW2* _v = (const GSVertexSW2*)vertex;
+
+	// v0, v1, v2 are now ordered top to bottom
+	const GSVertexSW2& v0 = _v[i[0]];
+	const GSVertexSW2& v1 = _v[i[1]];
+	const GSVertexSW2& v2 = _v[i[2]];
+
+	y0011 = v0.p.yyyy(v1.p);
+	y1221 = v1.p.yyyy(v2.p).xzzx();
+
+	// m1 is reused: it now flags which of the sorted y values are equal
+	m1 = (y0011 == y1221).mask() & 7;
+
+	// if(m1 == 0) => y0 < y1 < y2
+	// if(m1 == 1) => y0 == y1 < y2
+	// if(m1 == 4) => y0 < y1 == y2
+
+	if(m1 == 7) return; // y0 == y1 == y2
+
+	GSVector4 tbf = y0011.xzxz(y1221).ceil();
+	GSVector4 tbmax = tbf.max(m_fscissor_y);
+	GSVector4 tbmin = tbf.min(m_fscissor_y);
+	GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(y0, t) max(y1, t) min(y1, b) min(y2, b)
+
+	dv[0] = v1 - v0;
+	dv[1] = v2 - v0;
+	dv[2] = v2 - v1;
+
+	GSVector4 cross = dv[0].p * dv[1].p.yxwz();
+
+	cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
+
+	// the longest horizontal span would be cross.x / dv[1].p.y, but we don't need its actual value
+
+	// NOTE(review): presumably bit 0 of m2 is the sign of cross and bit 1 the
+	// result of "cross == 0" (upl interleaving the low lanes) - confirm in GSVector4.
+	int m2 = cross.upl(cross == GSVector4::zero()).mask();
+
+	// zero cross product: the triangle has no area
+	if(m2 & 2) return;
+
+	m2 &= 1;
+
+	cross = cross.rcpnr();
+
+	GSVector4 dxy01 = dv[0].p.xyxy(dv[1].p);
+
+	GSVector4 dx = dxy01.xzxy(dv[2].p);
+	GSVector4 dy = dxy01.ywyx(dv[2].p);
+
+	// x slope (dx/dy) of the three edges, plus two reordered copies so that
+	// the right pair can be picked with m2 below
+	GSVector4 ddx[3];
+
+	ddx[0] = dx / dy;
+	ddx[1] = ddx[0].yxzw();
+	ddx[2] = ddx[0].xzyw();
+
+	GSVector8 _dxy01c(dxy01 * cross);
+
+	/*
+	dscan = dv[1] * dxy01c.yyyy() - dv[0] * dxy01c.wwww();
+	dedge = dv[0] * dxy01c.zzzz() - dv[1] * dxy01c.xxxx();
+	*/
+
+	dscan.p = dv[1].p * _dxy01c.yyyy().extract<0>() - dv[0].p * _dxy01c.wwww().extract<0>();
+	dscan.tc = dv[1].tc * _dxy01c.yyyy() - dv[0].tc * _dxy01c.wwww();
+
+	dedge.p = dv[0].p * _dxy01c.zzzz().extract<0>() - dv[1].p * _dxy01c.xxxx().extract<0>();
+	dedge.tc = dv[0].tc * _dxy01c.zzzz() - dv[1].tc * _dxy01c.xxxx();
+
+	if(m1 & 1)
+	{
+		// y0 == y1: flat top, a single section
+		if(tb.y < tb.w)
+		{
+			edge = _v[i[1 - m2]];
+
+			edge.p = edge.p.insert32<0, 1>(vertex[i[m2]].p);
+			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
+
+			DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, vertex[i[1 - m2]].p);
+		}
+	}
+	else
+	{
+		// upper section: scanlines from y0 to y1
+		if(tb.x < tb.z)
+		{
+			edge = v0;
+
+			edge.p = edge.p.xxzw();
+			dedge.p = ddx[m2].xyzw(dedge.p);
+
+			DrawTriangleSection(tb.x, tb.z, edge, dedge, dscan, v0.p);
+		}
+
+		// lower section: scanlines from y1 to y2
+		if(tb.y < tb.w)
+		{
+			edge = v1;
+
+			edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p);
+			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
+
+			DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p);
+		}
+	}
+
+	Flush(vertex, index, (GSVertexSW&)dscan);
+
+	if(m_ds->HasEdge())
+	{
+		GSVector4 a = dx.abs() < dy.abs(); // |dx| < |dy|
+		GSVector4 b = dx < GSVector4::zero(); // dx < 0
+		GSVector4 c = cross < GSVector4::zero(); // longest.p.x < 0
+
+		// one bit per edge: bit 0 = v0-v1, bit 1 = v0-v2, bit 2 = v1-v2
+		int orientation = a.mask();
+		int side = ((a | b) ^ c).mask() ^ 2; // evil
+
+		DrawEdge((GSVertexSW&)v0, (GSVertexSW&)v1, (GSVertexSW&)dv[0], orientation & 1, side & 1);
+		DrawEdge((GSVertexSW&)v0, (GSVertexSW&)v2, (GSVertexSW&)dv[1], orientation & 2, side & 2);
+		DrawEdge((GSVertexSW&)v1, (GSVertexSW&)v2, (GSVertexSW&)dv[2], orientation & 4, side & 4);
+
+		Flush(vertex, index, GSVertexSW::zero(), true);
+	}
+}
+
+// Queues the spans of one triangle section (variant built when _M_SSE >= 0x501).
+//
+// top, bottom - scanline range [top, bottom) of the section
+// edge        - attributes at the section's starting edge; edge.p.x / edge.p.y
+//               hold the left / right x of the span
+// dedge       - change of 'edge' per scanline
+// dscan       - change of the attributes per pixel along x
+// p0          - reference position that the y and x offsets are measured from
+//
+// Only scanlines owned by this worker are visited. Each non-empty span is
+// appended to m_edge.buff; the caller flushes the queue.
+void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, const GSVertexSW2& dedge, const GSVertexSW2& dscan, const GSVector4& p0)
+{
+	ASSERT(top < bottom);
+	ASSERT(edge.p.x <= edge.p.y);
+
+	GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
+
+	GSVector4 scissor = m_fscissor_x;
+
+	top = FindMyNextScanline(top);
+	
+	while(top < bottom)
+	{
+		// vertical distance from the reference point to this scanline
+		GSVector8 dy(GSVector4(top) - p0.yyyy());
+
+		GSVertexSW2 scan;
+
+		scan.p = edge.p + dedge.p * dy.extract<0>();
+
+		// span bounds rounded up and clamped to the scissor
+		GSVector4 lrf = scan.p.ceil();
+		GSVector4 l = lrf.max(scissor);
+		GSVector4 r = lrf.min(scissor);
+		GSVector4i lr = GSVector4i(l.xxyy(r));
+
+		int left = lr.extract32<0>();
+		int right = lr.extract32<2>();
+
+		int pixels = right - left;
+
+		if(pixels > 0)
+		{
+			scan.tc = edge.tc + dedge.tc * dy;
+
+			// horizontal distance from the reference point to the first pixel
+			GSVector8 prestep((l - p0).xxxx());
+
+			scan.p = scan.p + dscan.p * prestep.extract<0>();
+			scan.tc = scan.tc + dscan.tc * prestep;
+
+			AddScanline(e++, pixels, left, top, (GSVertexSW&)scan);
+		}
+
+		top++;
+
+		// left our band: jump over the bands of the other workers
+		if(!IsOneOfMyScanlines(top))
+		{
+			top += (m_threads - 1) << THREAD_HEIGHT;
+		}
+	}
+
+	m_edge.count += e - &m_edge.buff[m_edge.count];
+}
+
+#else
+
+// Draws the triangle vertex[index[0..2]] (variant built when _M_SSE < 0x501).
+//
+// Steps: sort the vertices by y, reject zero-height and zero-area triangles,
+// derive the per-scanline (dedge) and per-pixel (dscan) gradients, queue the
+// spans of the upper and lower part with DrawTriangleSection, flush them, and
+// finally emit the three edges when the drawer asks for them (HasEdge).
+void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
+{
+	GSVertexSW dv[3];
+	GSVertexSW edge;
+	GSVertexSW dedge;
+	GSVertexSW dscan;
+
+	GSVector4 y0011 = vertex[index[0]].p.yyyy(vertex[index[1]].p);
+	GSVector4 y1221 = vertex[index[1]].p.yyyy(vertex[index[2]].p).xzzx();
+
+	// pairwise "greater than" results of the three y values select the sort order
+	int m1 = (y0011 > y1221).mask() & 7;
+
+	int i[3];
+
+	i[0] = index[s_ysort[m1][0]];
+	i[1] = index[s_ysort[m1][1]];
+	i[2] = index[s_ysort[m1][2]];
+
+	// v0, v1, v2 are now ordered top to bottom
+	const GSVertexSW& v0 = vertex[i[0]];
+	const GSVertexSW& v1 = vertex[i[1]];
+	const GSVertexSW& v2 = vertex[i[2]];
+
+	y0011 = v0.p.yyyy(v1.p);
+	y1221 = v1.p.yyyy(v2.p).xzzx();
+
+	// m1 is reused: it now flags which of the sorted y values are equal
+	m1 = (y0011 == y1221).mask() & 7;
+
+	// if(m1 == 0) => y0 < y1 < y2
+	// if(m1 == 1) => y0 == y1 < y2
+	// if(m1 == 4) => y0 < y1 == y2
+
+	if(m1 == 7) return; // y0 == y1 == y2
+
+	GSVector4 tbf = y0011.xzxz(y1221).ceil();
+	GSVector4 tbmax = tbf.max(m_fscissor_y);
+	GSVector4 tbmin = tbf.min(m_fscissor_y);
+	GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(y0, t) max(y1, t) min(y1, b) min(y2, b)
+
+	dv[0] = v1 - v0;
+	dv[1] = v2 - v0;
+	dv[2] = v2 - v1;
+
+	GSVector4 cross = dv[0].p * dv[1].p.yxwz();
+
+	cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
+
+	// the longest horizontal span would be cross.x / dv[1].p.y, but we don't need its actual value
+
+	// NOTE(review): presumably bit 0 of m2 is the sign of cross and bit 1 the
+	// result of "cross == 0" (upl interleaving the low lanes) - confirm in GSVector4.
+	int m2 = cross.upl(cross == GSVector4::zero()).mask();
+
+	// zero cross product: the triangle has no area
+	if(m2 & 2) return;
+
+	m2 &= 1;
+
+	cross = cross.rcpnr();
+
+	GSVector4 dxy01 = dv[0].p.xyxy(dv[1].p);
+
+	GSVector4 dx = dxy01.xzxy(dv[2].p);
+	GSVector4 dy = dxy01.ywyx(dv[2].p);
+
+	// x slope (dx/dy) of the three edges, plus two reordered copies so that
+	// the right pair can be picked with m2 below
+	GSVector4 ddx[3];
+
+	ddx[0] = dx / dy;
+	ddx[1] = ddx[0].yxzw();
+	ddx[2] = ddx[0].xzyw();
+
+	GSVector4 dxy01c = dxy01 * cross;
+
+	/*
+	dscan = dv[1] * dxy01c.yyyy() - dv[0] * dxy01c.wwww();
+	dedge = dv[0] * dxy01c.zzzz() - dv[1] * dxy01c.xxxx();
+	*/
+
+	dscan.p = dv[1].p * dxy01c.yyyy() - dv[0].p * dxy01c.wwww();
+	dscan.t = dv[1].t * dxy01c.yyyy() - dv[0].t * dxy01c.wwww();
+	dscan.c = dv[1].c * dxy01c.yyyy() - dv[0].c * dxy01c.wwww();
+
+	dedge.p = dv[0].p * dxy01c.zzzz() - dv[1].p * dxy01c.xxxx();
+	dedge.t = dv[0].t * dxy01c.zzzz() - dv[1].t * dxy01c.xxxx();
+	dedge.c = dv[0].c * dxy01c.zzzz() - dv[1].c * dxy01c.xxxx();
+
+	if(m1 & 1)
+	{
+		// y0 == y1: flat top, a single section
+		if(tb.y < tb.w)
+		{
+			edge = vertex[i[1 - m2]];
+
+			edge.p = edge.p.insert32<0, 1>(vertex[i[m2]].p);
+			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
+
+			DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, vertex[i[1 - m2]].p);
+		}
+	}
+	else
+	{
+		// upper section: scanlines from y0 to y1
+		if(tb.x < tb.z)
+		{
+			edge = v0;
+
+			edge.p = edge.p.xxzw();
+			dedge.p = ddx[m2].xyzw(dedge.p);
+
+			DrawTriangleSection(tb.x, tb.z, edge, dedge, dscan, v0.p);
+		}
+
+		// lower section: scanlines from y1 to y2
+		if(tb.y < tb.w)
+		{
+			edge = v1;
+
+			edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p);
+			dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
+
+			DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p);
+		}
+	}
+
+	Flush(vertex, index, dscan);
+
+	if(m_ds->HasEdge())
+	{
+		GSVector4 a = dx.abs() < dy.abs(); // |dx| < |dy|
+		GSVector4 b = dx < GSVector4::zero(); // dx < 0
+		GSVector4 c = cross < GSVector4::zero(); // longest.p.x < 0
+
+		// one bit per edge: bit 0 = v0-v1, bit 1 = v0-v2, bit 2 = v1-v2
+		int orientation = a.mask();
+		int side = ((a | b) ^ c).mask() ^ 2; // evil
+
+		DrawEdge(v0, v1, dv[0], orientation & 1, side & 1);
+		DrawEdge(v0, v2, dv[1], orientation & 2, side & 2);
+		DrawEdge(v1, v2, dv[2], orientation & 4, side & 4);
+
+		Flush(vertex, index, GSVertexSW::zero(), true);
+	}
+}
+
+// Queues the spans of one triangle section (variant built when _M_SSE < 0x501).
+//
+// top, bottom - scanline range [top, bottom) of the section
+// edge        - attributes at the section's starting edge; edge.p.x / edge.p.y
+//               hold the left / right x of the span
+// dedge       - change of 'edge' per scanline
+// dscan       - change of the attributes per pixel along x
+// p0          - reference position that the y and x offsets are measured from
+//
+// Only scanlines owned by this worker are visited. Each non-empty span is
+// appended to m_edge.buff; the caller flushes the queue.
+void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, const GSVertexSW& dscan, const GSVector4& p0)
+{
+	ASSERT(top < bottom);
+	ASSERT(edge.p.x <= edge.p.y);
+
+	GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
+
+	GSVector4 scissor = m_fscissor_x;
+
+	top = FindMyNextScanline(top);
+	
+	while(top < bottom)
+	{
+		// vertical distance from the reference point to this scanline
+		GSVector4 dy = GSVector4(top) - p0.yyyy();
+
+		GSVertexSW scan;
+
+		scan.p = edge.p + dedge.p * dy;
+
+		// span bounds rounded up and clamped to the scissor
+		GSVector4 lrf = scan.p.ceil();
+		GSVector4 l = lrf.max(scissor);
+		GSVector4 r = lrf.min(scissor);
+		GSVector4i lr = GSVector4i(l.xxyy(r));
+
+		int left = lr.extract32<0>();
+		int right = lr.extract32<2>();
+
+		int pixels = right - left;
+
+		if(pixels > 0)
+		{
+			scan.t = edge.t + dedge.t * dy;
+			scan.c = edge.c + dedge.c * dy;
+
+			// horizontal distance from the reference point to the first pixel
+			GSVector4 prestep = (l - p0).xxxx();
+
+			scan.p = scan.p + dscan.p * prestep;
+			scan.t = scan.t + dscan.t * prestep;
+			scan.c = scan.c + dscan.c * prestep;
+
+			AddScanline(e++, pixels, left, top, scan);
+		}
+
+		top++;
+
+		// left our band: jump over the bands of the other workers
+		if(!IsOneOfMyScanlines(top))
+		{
+			top += (m_threads - 1) << THREAD_HEIGHT;
+		}
+	}
+
+	m_edge.count += e - &m_edge.buff[m_edge.count];
+}
+
+#endif
+
+// Draws the axis-aligned rectangle spanned by vertex[index[0]] and
+// vertex[index[1]]. The color is taken from the second vertex. Solid
+// rectangles go through the drawer's DrawRect fast path; everything else is
+// drawn scanline by scanline with interpolated texture coordinates.
+void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index)
+{
+	const GSVertexSW& v0 = vertex[index[0]];
+	const GSVertexSW& v1 = vertex[index[1]];
+
+	// per-component selection of the smaller x/y; the upper two lanes are forced to zero
+	GSVector4 mask = (v0.p < v1.p).xyzw(GSVector4::zero());
+
+	// v[0] is the top-left corner, v[1] the bottom-right one
+	GSVertexSW v[2];
+
+	v[0].p = v1.p.blend32(v0.p, mask);
+	v[0].t = v1.t.blend32(v0.t, mask);
+	v[0].c = v1.c;
+
+	v[1].p = v0.p.blend32(v1.p, mask);
+	v[1].t = v0.t.blend32(v1.t, mask);
+
+	GSVector4i r(v[0].p.xyxy(v[1].p).ceil());
+
+	r = r.rintersect(m_scissor);
+
+	if(r.rempty()) return;
+
+	GSVertexSW scan = v[0];
+
+	if(m_ds->IsSolidRect())
+	{
+		if(m_threads == 1)
+		{
+			m_ds->DrawRect(r, scan);
+
+			int pixels = r.width() * r.height();
+
+			m_pixels.actual += pixels;
+			m_pixels.total += pixels;
+		}
+		else
+		{
+			// several workers: fill only the slices that fall into our bands
+			int top = FindMyNextScanline(r.top);
+			int bottom = r.bottom;
+
+			while(top < bottom)
+			{
+				r.top = top;
+				// end of the current band, limited to the rectangle's bottom
+				r.bottom = std::min<int>((top + (1 << THREAD_HEIGHT)) & ~((1 << THREAD_HEIGHT) - 1), bottom);
+
+				m_ds->DrawRect(r, scan);
+			
+				int pixels = r.width() * r.height();
+
+				m_pixels.actual += pixels;
+				m_pixels.total += pixels;
+
+				// skip the bands of the other workers
+				top = r.bottom + ((m_threads - 1) << THREAD_HEIGHT);
+			}
+		}
+
+		return;
+	}
+
+	GSVertexSW dv = v[1] - v[0];
+
+	GSVector4 dt = dv.t / dv.p.xyxy();
+
+	GSVertexSW dedge;
+	GSVertexSW dscan;
+
+	// only the t members of dedge/dscan are set up: the change of the
+	// texture coordinates per scanline and per pixel
+	dedge.t = GSVector4::zero().insert32<1, 1>(dt);
+	dscan.t = GSVector4::zero().insert32<0, 0>(dt);
+
+	// distance from the sprite corner to the first pixel that is drawn
+	GSVector4 prestep = GSVector4(r.left, r.top) - scan.p;
+
+	int m = (prestep == GSVector4::zero()).mask();
+
+	if((m & 2) == 0) scan.t += dedge.t * prestep.yyyy();
+	if((m & 1) == 0) scan.t += dscan.t * prestep.xxxx();
+
+	m_ds->SetupPrim(vertex, index, dscan);
+
+	// every row is stepped, but only rows owned by this worker are drawn
+	while(1)
+	{
+		if(IsOneOfMyScanlines(r.top))
+		{
+			DrawScanline(r.width(), r.left, r.top, scan);
+		}
+
+		if(++r.top >= r.bottom) break;
+
+		scan.t += dedge.t;
+	}
+}
+
+// Queues the single-pixel steps of the edge v0 -> v1 into m_edge.buff; the
+// caller flushes them with Flush(..., true).
+//
+// dv          - v1 - v0, computed by the caller
+// orientation - non-zero: the edge is stepped along y (one entry per
+//               scanline); zero: stepped along x (one entry per column)
+// side        - selects which side of the exact edge position the pixel is
+//               taken from and how the fractional value stored with it is
+//               derived
+//
+// The minor coordinate is tracked in 16.16 fixed point. Its 16-bit fraction
+// (or the complement) is stored in t.u32[3] of each queued entry.
+// NOTE(review): presumably the scanline drawer uses that value as edge
+// coverage - confirm in the IDrawScanline implementation.
+void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side)
+{
+	// orientation:
+	// - true: |dv.p.y| > |dv.p.x|
+	// - false |dv.p.x| > |dv.p.y|
+	// side:
+	// - true: top/left edge
+	// - false: bottom/right edge
+
+	// TODO: bit slow and too much duplicated code
+	// TODO: inner pre-step is still missing (hardly noticeable)
+	// TODO: it does not always line up with the edge of the surrounded triangle
+
+	GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
+
+	if(orientation)
+	{
+		GSVector4 tbf = v0.p.yyyy(v1.p).ceil(); // t t b b
+		GSVector4 tbmax = tbf.max(m_fscissor_y); // max(t, st) max(t, sb) max(b, st) max(b, sb)
+		GSVector4 tbmin = tbf.min(m_fscissor_y); // min(t, st) min(t, sb) min(b, st) min(b, sb)
+		GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(t, st) max(b, sb) min(t, st) min(b, sb)
+
+		int top, bottom;
+
+		GSVertexSW edge, dedge;
+
+		// always walk downwards: start from whichever end point is higher
+		if((dv.p >= GSVector4::zero()).mask() & 2)
+		{
+			top = tb.extract32<0>(); // max(t, st)
+			bottom = tb.extract32<3>(); // min(b, sb)
+
+			if(top >= bottom) return;
+
+			edge = v0;
+			dedge = dv / dv.p.yyyy();
+
+			// pre-step to the first scanline inside the scissor
+			edge += dedge * (tbmax.xxxx() - edge.p.yyyy());
+		}
+		else
+		{
+			top = tb.extract32<1>(); // max(b, st)
+			bottom = tb.extract32<2>(); // min(t, sb)
+
+			if(top >= bottom) return;
+
+			edge = v1;
+			dedge = dv / dv.p.yyyy();
+
+			// pre-step to the first scanline inside the scissor
+			edge += dedge * (tbmax.zzzz() - edge.p.yyyy());
+		}
+
+		// convert position and step to 16.16 fixed point
+		GSVector4i p = GSVector4i(edge.p.upl(dedge.p) * 0x10000);
+
+		int x = p.extract32<0>();
+		int dx = p.extract32<1>();
+
+		if(side)
+		{
+			while(1)
+			{
+				int xi = x >> 16;
+				int xf = x & 0xffff;
+
+				if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(top))
+				{
+					AddScanline(e, 1, xi, top, edge);
+
+					e->t.u32[3] = (0x10000 - xf) & 0xffff;
+
+					e++;
+				}
+
+				if(++top >= bottom) break;
+
+				edge += dedge;
+				x += dx;
+			}
+		}
+		else
+		{
+			while(1)
+			{
+				// the pixel one to the right of the integer position
+				int xi = (x >> 16) + 1;
+				int xf = x & 0xffff;
+
+				if(m_scissor.left <= xi && xi < m_scissor.right && IsOneOfMyScanlines(top))
+				{
+					AddScanline(e, 1, xi, top, edge);
+
+					e->t.u32[3] = xf;
+
+					e++;
+				}
+
+				if(++top >= bottom) break;
+
+				edge += dedge;
+				x += dx;
+			}
+		}
+	}
+	else
+	{
+		GSVector4 lrf = v0.p.xxxx(v1.p).ceil(); // l l r r
+		GSVector4 lrmax = lrf.max(m_fscissor_x); // max(l, sl) max(l, sr) max(r, sl) max(r, sr)
+		GSVector4 lrmin = lrf.min(m_fscissor_x); // min(l, sl) min(l, sr) min(r, sl) min(r, sr)
+		GSVector4i lr = GSVector4i(lrmax.xzyw(lrmin)); // max(l, sl) max(r, sl) min(l, sr) min(r, sr)
+
+		int left, right;
+
+		GSVertexSW edge, dedge;
+
+		// always walk to the right: start from whichever end point is further left
+		if((dv.p >= GSVector4::zero()).mask() & 1)
+		{
+			left = lr.extract32<0>(); // max(l, sl)
+			right = lr.extract32<3>(); // min(r, sr)
+
+			if(left >= right) return;
+
+			edge = v0;
+			dedge = dv / dv.p.xxxx();
+
+			// pre-step to the first column inside the scissor
+			edge += dedge * (lrmax.xxxx() - edge.p.xxxx());
+		}
+		else
+		{
+			left = lr.extract32<1>(); // max(r, sl)
+			right = lr.extract32<2>(); // min(l, sr)
+
+			if(left >= right) return;
+
+			edge = v1;
+			dedge = dv / dv.p.xxxx();
+
+			// pre-step to the first column inside the scissor
+			edge += dedge * (lrmax.zzzz() - edge.p.xxxx());
+		}
+
+		// convert position and step to 16.16 fixed point
+		GSVector4i p = GSVector4i(edge.p.upl(dedge.p) * 0x10000);
+
+		int y = p.extract32<2>();
+		int dy = p.extract32<3>();
+
+		if(side)
+		{
+			while(1)
+			{
+				int yi = y >> 16;
+				int yf = y & 0xffff;
+
+				if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
+				{
+					AddScanline(e, 1, left, yi, edge);
+
+					e->t.u32[3] = (0x10000 - yf) & 0xffff;
+
+					e++;
+				}
+
+				if(++left >= right) break;
+
+				edge += dedge;
+				y += dy;
+			}
+		}
+		else
+		{
+			while(1)
+			{
+				// the pixel one below the integer position
+				int yi = (y >> 16) + 1;
+				int yf = y & 0xffff;
+
+				if(m_scissor.top <= yi && yi < m_scissor.bottom && IsOneOfMyScanlines(yi))
+				{
+					AddScanline(e, 1, left, yi, edge);
+
+					e->t.u32[3] = yf;
+
+					e++;
+				}
+
+				if(++left >= right) break;
+
+				edge += dedge;
+				y += dy;
+			}
+		}
+	}
+
+	m_edge.count += e - &m_edge.buff[m_edge.count];
+}
+
+// Stores one span in the queue slot 'e': the interpolated vertex is copied
+// and the span geometry (pixel count, left x, scanline) is packed into the
+// vertex's _pad member, where Flush() reads it back.
+void GSRasterizer::AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan)
+{
+	GSVertexSW& slot = *e;
+
+	slot = scan;
+
+	slot._pad.i32[0] = pixels;
+	slot._pad.i32[1] = left;
+	slot._pad.i32[2] = top;
+}
+
+// Draws and empties the span queue filled by AddScanline().
+//
+// vertex, index, dscan - forwarded to the drawer's SetupPrim before drawing
+// edge                 - true: entries are edge pixels (DrawEdge),
+//                        false: entries are regular spans (DrawScanline)
+//
+// Does nothing, not even SetupPrim, when the queue is empty.
+void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge)
+{
+	// TODO: on win64 this could be the place where xmm6-15 are preserved (not by each DrawScanline)
+
+	const int queued = m_edge.count;
+
+	if(queued <= 0)
+	{
+		return;
+	}
+
+	m_ds->SetupPrim(vertex, index, dscan);
+
+	const GSVertexSW* RESTRICT cur = m_edge.buff;
+	const GSVertexSW* RESTRICT end = cur + queued;
+
+	if(edge)
+	{
+		for(; cur < end; cur++)
+		{
+			// span geometry as packed by AddScanline
+			int pixels = cur->_pad.i32[0];
+			int left = cur->_pad.i32[1];
+			int top = cur->_pad.i32[2];
+
+			DrawEdge(pixels, left, top, *cur);
+		}
+	}
+	else
+	{
+		for(; cur < end; cur++)
+		{
+			// span geometry as packed by AddScanline
+			int pixels = cur->_pad.i32[0];
+			int left = cur->_pad.i32[1];
+			int top = cur->_pad.i32[2];
+
+			DrawScanline(pixels, left, top, *cur);
+		}
+	}
+
+	m_edge.count = 0;
+}
+
+// Group size used by the pixel statistics below: DrawScanline rounds span
+// bounds to multiples of this value when it updates m_pixels.total.
+// NOTE(review): presumably the number of pixels the scanline drawer handles
+// per loop iteration (8 for _M_SSE >= 0x501, otherwise 4) - confirm.
+#if _M_SSE >= 0x501
+#define PIXELS_PER_LOOP 8
+#else
+#define PIXELS_PER_LOOP 4
+#endif
+
+// Draws one span through the scanline drawer and updates the statistics.
+//
+// pixels - number of pixels in the span
+// left   - x of the first pixel
+// top    - scanline
+// scan   - interpolated attributes at the first pixel
+//
+// m_pixels.actual counts the pixels requested; m_pixels.total counts the
+// width of the span after widening it to PIXELS_PER_LOOP boundaries on both
+// sides.
+void GSRasterizer::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	const int align_mask = PIXELS_PER_LOOP - 1;
+
+	// span bounds rounded outwards to multiples of PIXELS_PER_LOOP
+	const int aligned_begin = left & ~align_mask;
+	const int aligned_end = (left + pixels + align_mask) & ~align_mask;
+
+	m_pixels.actual += pixels;
+
+	// The start must be aligned with "& ~mask". The previous "& mask" only
+	// subtracted the offset inside the group, which made the total grow with
+	// the x coordinate instead of with the span width.
+	m_pixels.total += aligned_end - aligned_begin;
+
+	ASSERT(m_pixels.actual <= m_pixels.total);
+
+	m_ds->DrawScanline(pixels, left, top, scan);
+}
+
+// Draws one queued edge pixel through the scanline drawer and updates the
+// statistics: one actual pixel, PIXELS_PER_LOOP - 1 added to the total.
+void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan)
+{
+	m_pixels.actual++;
+	m_pixels.total += (PIXELS_PER_LOOP - 1);
+
+	ASSERT(m_pixels.actual <= m_pixels.total);
+
+	m_ds->DrawEdge(pixels, left, top, scan);
+}
+
+//
+
+// Creates the dispatcher for 'threads' workers (threads must be >= 1).
+//
+// Builds m_scanline, a table that maps every band (THREAD_HEIGHT-sized group
+// of scanlines) to the index of the worker that owns it. Bands are assigned
+// round-robin, matching the ownership table each GSRasterizer builds for
+// itself. The workers themselves are not created here.
+GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
+	: m_perfmon(perfmon)
+{
+	ASSERT(threads > 0);
+
+	// 2048 scanlines worth of bands plus 16 entries of padding.
+	const int rows = (2048 >> THREAD_HEIGHT) + 16;
+
+	m_scanline = (uint8*)_aligned_malloc(rows, 64);
+
+	// Fill the whole table, padding included. The previous nested loop had no
+	// bound on the inner index and could write past the band range; for the
+	// band range itself the values are unchanged.
+	for(int row = 0; row < rows; row++)
+	{
+		m_scanline[row] = (uint8)(row % threads);
+	}
+}
+
+// Destroys every worker, then frees the band-to-worker table.
+GSRasterizerList::~GSRasterizerList()
+{
+	for(auto worker : m_workers)
+	{
+		delete worker;
+	}
+
+	_aligned_free(m_scanline);
+}
+
+// Hands 'data' to every worker that owns at least one band touched by the
+// scissored bounding box. At most m_workers.size() consecutive bands are
+// looked at, which is enough to reach each worker once.
+void GSRasterizerList::Queue(const shared_ptr<GSRasterizerData>& data)
+{
+	GSVector4i r = data->bbox.rintersect(data->scissor);
+
+	ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048);
+
+	const int first = r.top >> THREAD_HEIGHT;
+	const int covered = (r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT; // rounded up, exclusive
+	const int last = std::min<int>(covered, first + m_workers.size());
+
+	for(int band = first; band < last; band++)
+	{
+		m_workers[m_scanline[band]]->Push(data);
+	}
+}
+
+// Waits for every worker when at least one of them still has queued work,
+// and records one SyncPoint in the performance monitor in that case.
+void GSRasterizerList::Sync()
+{
+	if(IsSynced())
+	{
+		return;
+	}
+
+	for(size_t w = 0; w < m_workers.size(); w++)
+	{
+		m_workers[w]->Wait();
+	}
+
+	m_perfmon->Put(GSPerfMon::SyncPoint, 1);
+}
+
+// Returns true when no worker has anything left in its queue.
+bool GSRasterizerList::IsSynced() const
+{
+	size_t w = 0;
+
+	while(w < m_workers.size())
+	{
+		if(!m_workers[w]->IsEmpty()) return false;
+
+		w++;
+	}
+
+	return true;
+}
+
+// Returns the sum of the pixel counts of all workers; 'reset' is forwarded to
+// each worker's GetPixels.
+int GSRasterizerList::GetPixels(bool reset)
+{
+	int total = 0;
+
+	for(size_t w = 0; w < m_workers.size(); w++)
+	{
+		total += m_workers[w]->GetPixels(reset);
+	}
+
+	return total;
+}
+
+// GSRasterizerList::GSWorker
+
+// Wraps rasterizer 'r' in a job queue (capacity template argument 256).
+// The worker takes ownership of 'r': it is deleted in the destructor.
+GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r)
+	: GSJobQueue<shared_ptr<GSRasterizerData>, 256>()
+	, m_r(r)
+{
+}
+
+// Calls Wait() before destroying the rasterizer.
+// NOTE(review): presumably Wait() blocks until the queue is drained, so that
+// no job still uses m_r - confirm in GSJobQueue.
+GSRasterizerList::GSWorker::~GSWorker()
+{
+	Wait();
+
+	delete m_r;
+}
+
+// Forwards to the wrapped rasterizer's GetPixels.
+int GSRasterizerList::GSWorker::GetPixels(bool reset)
+{
+	return m_r->GetPixels(reset);
+}
+
+// Job queue callback: draws one queued item with the wrapped rasterizer.
+void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
+{
+	m_r->Draw(item.get());
+}
diff --git a/plugins/GSdx_legacy/GSRasterizer.h b/plugins/GSdx_legacy/GSRasterizer.h
new file mode 100644
index 0000000000..35d42c8ff4
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRasterizer.h
@@ -0,0 +1,235 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSVertexSW.h"
+#include "GSFunctionMap.h"
+#include "GSAlignedClass.h"
+#include "GSPerfMon.h"
+#include "GSThread_CXX11.h"
+
+__aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
+{
+	static int s_counter; // source of the per-object 'counter' value below
+	// One unit of rasterizer work; IRasterizer::Queue() receives it through a shared_ptr.
+public:
+	GSVector4i scissor; // GSRasterizerList::Queue() intersects bbox with this
+	GSVector4i bbox;
+	GS_PRIM_CLASS primclass;
+	uint8* buff; // owned: released with _aligned_free() in the destructor
+	GSVertexSW* vertex;
+	int vertex_count;
+	uint32* index;
+	int index_count;
+	uint64 frame;
+	uint64 start;
+	int pixels;
+	int counter; // value of s_counter when this object was constructed
+
+	GSRasterizerData() 
+		: scissor(GSVector4i::zero())
+		, bbox(GSVector4i::zero())
+		, primclass(GS_INVALID_CLASS)
+		, buff(NULL)
+		, vertex(NULL)
+		, vertex_count(0)
+		, index(NULL)
+		, index_count(0)
+		, frame(0)
+		, start(0)
+		, pixels(0)
+	{
+		counter = s_counter++; // NOTE(review): plain int, no synchronization - confirm objects are created on one thread
+	}
+	// NOTE(review): no copy constructor/assignment is declared, so a copy would free 'buff' twice.
+	virtual ~GSRasterizerData() 
+	{
+		if(buff != NULL) _aligned_free(buff);
+	}
+};
+
+class IDrawScanline : public GSAlignedClass<32>
+{
+public:
+	typedef void (*SetupPrimPtr)(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan);
+	typedef void (__fastcall *DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan);
+	typedef void (IDrawScanline::*DrawRectPtr)(const GSVector4i& r, const GSVertexSW& v); // TODO: jit
+	// The four entry points below start out NULL; nothing in this class assigns them.
+protected:
+	SetupPrimPtr m_sp;
+	DrawScanlinePtr m_ds;
+	DrawScanlinePtr m_de; // edge variant; HasEdge() tests it
+	DrawRectPtr m_dr; // rectangle variant; IsSolidRect() tests it
+
+public:
+	IDrawScanline() : m_sp(NULL), m_ds(NULL), m_de(NULL), m_dr(NULL) {}
+	virtual ~IDrawScanline() {}
+
+	virtual void BeginDraw(const GSRasterizerData* data) = 0;
+	virtual void EndDraw(uint64 frame, uint64 ticks, int actual, int total) = 0;
+	// With ENABLE_JIT_RASTERIZER the draw calls go straight through the stored pointers.
+#ifdef ENABLE_JIT_RASTERIZER
+
+	__forceinline void SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan) {m_sp(vertex, index, dscan);}
+	__forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) {m_ds(pixels, left, top, scan);}
+	__forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) {m_de(pixels, left, top, scan);}
+	__forceinline void DrawRect(const GSVector4i& r, const GSVertexSW& v) {(this->*m_dr)(r, v);}
+
+#else
+	// Without it the same four operations are pure virtuals for the subclass to implement.
+	virtual void SetupPrim(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan) = 0;
+	virtual void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) = 0;
+	virtual void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) = 0;
+	virtual void DrawRect(const GSVector4i& r, const GSVertexSW& v) = 0;
+	
+#endif
+
+	virtual void PrintStats() = 0;
+
+	__forceinline bool HasEdge() const {return m_de != NULL;} // true once an edge function is set
+	__forceinline bool IsSolidRect() const {return m_dr != NULL;} // true once a rect function is set
+};
+
+class IRasterizer : public GSAlignedClass<32>
+{
+public:
+	virtual ~IRasterizer() {}
+	// Common interface of GSRasterizer (direct) and GSRasterizerList (worker based).
+	virtual void Queue(const shared_ptr<GSRasterizerData>& data) = 0;
+	virtual void Sync() = 0;
+	virtual bool IsSynced() const = 0;
+	virtual int GetPixels(bool reset = true) = 0; // the default applies only to calls made through IRasterizer
+	virtual void PrintStats() = 0;
+};
+
+__aligned(class, 32) GSRasterizer : public IRasterizer
+{
+protected:
+	GSPerfMon* m_perfmon;
+	IDrawScanline* m_ds; // scanline backend; PrintStats() forwards to it
+	int m_id;
+	int m_threads;
+	uint8* m_scanline;
+	GSVector4i m_scissor;
+	GSVector4 m_fscissor_x;
+	GSVector4 m_fscissor_y;
+	struct {GSVertexSW* buff; int count;} m_edge;
+	struct {int sum, actual, total;} m_pixels;
+
+	typedef void (GSRasterizer::*DrawPrimPtr)(const GSVertexSW* v, int count);
+	// One drawing routine per primitive kind.
+	template<bool scissor_test> 
+	void DrawPoint(const GSVertexSW* vertex, int vertex_count, const uint32* index, int index_count);
+	void DrawLine(const GSVertexSW* vertex, const uint32* index);
+	void DrawTriangle(const GSVertexSW* vertex, const uint32* index);
+	void DrawSprite(const GSVertexSW* vertex, const uint32* index);
+	// The triangle section helper takes GSVertexSW2 when _M_SSE >= 0x501, GSVertexSW otherwise.
+	#if _M_SSE >= 0x501
+	__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, const GSVertexSW2& dedge, const GSVertexSW2& dscan, const GSVector4& p0);
+	#else
+	__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& edge, const GSVertexSW& dedge, const GSVertexSW& dscan, const GSVector4& p0);
+	#endif
+
+	void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side);
+
+	__forceinline void AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan);
+	__forceinline void Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge = false);
+
+	__forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan);
+	__forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan);
+
+public:
+	GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon);
+	virtual ~GSRasterizer();
+
+	__forceinline bool IsOneOfMyScanlines(int top) const;
+	__forceinline bool IsOneOfMyScanlines(int top, int bottom) const;
+	__forceinline int FindMyNextScanline(int top) const;
+	// Entry point used by GSRasterizerList::GSWorker::Process().
+	void Draw(GSRasterizerData* data);
+
+	// IRasterizer
+
+	void Queue(const shared_ptr<GSRasterizerData>& data);
+	void Sync() {} // nothing to wait for here
+	bool IsSynced() const {return true;} // always reports synced
+	int GetPixels(bool reset); // no default here: direct callers must pass 'reset'
+	void PrintStats() {m_ds->PrintStats();}
+};
+
+class GSRasterizerList : public IRasterizer
+{
+protected:
+	class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData>, 256 >
+	{
+		GSRasterizer* m_r; // owned; deleted by ~GSWorker()
+
+	public:
+		GSWorker(GSRasterizer* r);
+		virtual ~GSWorker();
+
+		int GetPixels(bool reset);
+
+		// GSJobQueue
+
+		void Process(shared_ptr<GSRasterizerData>& item);
+	};
+
+	GSPerfMon* m_perfmon;
+	vector<GSWorker*> m_workers; // owned; deleted in the destructor
+	uint8* m_scanline; // row group -> index into m_workers (see Queue())
+
+	GSRasterizerList(int threads, GSPerfMon* perfmon); // protected: instances come from Create()
+
+public:
+	virtual ~GSRasterizerList();
+	// Factory. DS is the IDrawScanline implementation given to each rasterizer.
+	template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon)
+	{
+		threads = std::max<int>(threads, 0); // negative counts are treated as 0
+
+		if(threads == 0)
+		{
+			return new GSRasterizer(new DS(), 0, 1, perfmon); // no workers: a single plain rasterizer
+		}
+		else
+		{
+			GSRasterizerList* rl = new GSRasterizerList(threads, perfmon);
+			// One worker per thread, each with its own rasterizer and DS instance.
+			for(int i = 0; i < threads; i++)
+			{
+				rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon)));
+			}
+
+			return rl;
+		}
+	}
+
+	// IRasterizer
+
+	void Queue(const shared_ptr<GSRasterizerData>& data);
+	void Sync();
+	bool IsSynced() const;
+	int GetPixels(bool reset);
+	void PrintStats() {} // intentionally empty for the list
+};
diff --git a/plugins/GSdx_legacy/GSRenderer.cpp b/plugins/GSdx_legacy/GSRenderer.cpp
new file mode 100644
index 0000000000..2f34956b75
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRenderer.cpp
@@ -0,0 +1,654 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRenderer.h"
+#ifdef __linux__
+#include <X11/keysym.h>
+#endif
+
+const unsigned int s_interlace_nb = 8;
+const unsigned int s_post_shader_nb = 5;
+const unsigned int s_aspect_ratio_nb = 3;
+
+GSRenderer::GSRenderer()
+	: m_shader(0)
+	, m_shift_key(false)
+	, m_control_key(false)
+	, m_framelimit(false)
+	, m_texture_shuffle(false)
+	, m_wnd(NULL)
+	, m_dev(NULL)
+{
+	m_GStitleInfoBuffer[0] = 0; // start with an empty title string
+	// Cyclic settings are reduced modulo their option count so they stay in range.
+	m_interlace = theApp.GetConfig("interlace", 7) % s_interlace_nb;
+	m_aspectratio = theApp.GetConfig("aspectratio", 1) % s_aspect_ratio_nb;
+	m_shader = theApp.GetConfig("TVShader", 0) % s_post_shader_nb;
+	m_filter = theApp.GetConfig("filter", 1);
+	m_vsync = !!theApp.GetConfig("vsync", 0); // !! turns the integer setting into a bool
+	m_aa1 = !!theApp.GetConfig("aa1", 0);
+	m_fxaa = !!theApp.GetConfig("fxaa", 0);
+	m_shaderfx = !!theApp.GetConfig("shaderfx", 0);
+	m_shadeboost = !!theApp.GetConfig("ShadeBoost", 0);
+}
+
+GSRenderer::~GSRenderer()
+{
+	/*if(m_dev)
+	{
+		m_dev->Reset(1, 1, GSDevice::Windowed);
+	}*/
+
+	// The device is destroyed before the window.
+	// delete on a null pointer is a no-op, so neither member needs a guard.
+	delete m_dev;
+
+	// The window goes last.
+	delete m_wnd;
+}
+
+bool GSRenderer::CreateWnd(const string& title, int w, int h)
+{
+	return m_wnd->Create(title.c_str(), w, h); // m_wnd is NULL after construction, so it must have been set before this call
+}
+
+bool GSRenderer::CreateDevice(GSDevice* dev)
+{
+	ASSERT(dev);
+	ASSERT(!m_dev);
+	// The device is adopted only after it has been created on our window.
+	const bool created = dev->Create(m_wnd);
+
+	if(created)
+	{
+		m_dev = dev;
+		m_dev->SetVSync(m_vsync && m_framelimit); // vsync only while frame limiting is on
+	}
+
+	return created;
+}
+
+void GSRenderer::ResetDevice()
+{
+    if(m_dev != NULL) m_dev->Reset(1, 1); // does nothing while no device has been adopted
+}
+
+bool GSRenderer::Merge(int field) // builds the merged frame on m_dev; false when neither output is enabled
+{
+	bool en[2]; // IsEnabled(i) for outputs 0 and 1
+
+	GSVector4i fr[2]; // GetFrameRect(i), only filled for enabled outputs
+	GSVector4i dr[2]; // GetDisplayRect(i), only filled for enabled outputs
+
+	int baseline = INT_MAX; // smallest dr[i].top among the enabled outputs
+
+	for(int i = 0; i < 2; i++)
+	{
+		en[i] = IsEnabled(i);
+
+		if(en[i])
+		{
+			fr[i] = GetFrameRect(i);
+			dr[i] = GetDisplayRect(i);
+
+			baseline = min(dr[i].top, baseline);
+
+			//printf("[%d]: %d %d %d %d, %d %d %d %d\n", i, fr[i].x,fr[i].y,fr[i].z,fr[i].w , dr[i].x,dr[i].y,dr[i].z,dr[i].w);
+		}
+	}
+
+	if(!en[0] && !en[1])
+	{
+		return false;
+	}
+
+	GL_PUSH("Renderer Merge %d", s_n);
+
+	// try to avoid fullscreen blur, could be nice on tv but on a monitor it's like double vision, hurts my eyes (persona 4, guitar hero)
+	//
+	// NOTE: probably the technique explained in graphtip.pdf (Antialiasing by Supersampling / 4. Reading Odd/Even Scan Lines Separately with the PCRTC then Blending)
+
+	bool samesrc =
+		en[0] && en[1] &&
+		m_regs->DISP[0].DISPFB.FBP == m_regs->DISP[1].DISPFB.FBP &&
+		m_regs->DISP[0].DISPFB.FBW == m_regs->DISP[1].DISPFB.FBW &&
+		m_regs->DISP[0].DISPFB.PSM == m_regs->DISP[1].DISPFB.PSM;
+
+	// bool blurdetected = false;
+
+	if(samesrc /*&& m_regs->PMODE.SLBG == 0 && m_regs->PMODE.MMOD == 1 && m_regs->PMODE.ALP == 0x80*/)
+	{
+		if(fr[0].eq(fr[1] + GSVector4i(0, -1, 0, 0)) && dr[0].eq(dr[1] + GSVector4i(0, 0, 0, 1))
+		|| fr[1].eq(fr[0] + GSVector4i(0, -1, 0, 0)) && dr[1].eq(dr[0] + GSVector4i(0, 0, 0, 1)))
+		{
+			// persona 4:
+			//
+			// fr[0] = 0 0 640 448
+			// fr[1] = 0 1 640 448
+			// dr[0] = 159 50 779 498
+			// dr[1] = 159 50 779 497
+			//
+			// second image shifted up by 1 pixel and blended over itself
+			//
+			// god of war:
+			//
+			// fr[0] = 0 1 512 448
+			// fr[1] = 0 0 512 448
+			// dr[0] = 127 50 639 497
+			// dr[1] = 127 50 639 498
+			//
+			// same just the first image shifted
+
+			int top = min(fr[0].top, fr[1].top);
+			int bottom = max(dr[0].bottom, dr[1].bottom);
+			// Give both outputs the same frame top and display bottom, which removes the one-line offset.
+			fr[0].top = top;
+			fr[1].top = top;
+			dr[0].bottom = bottom;
+			dr[1].bottom = bottom;
+
+			// blurdetected = true;
+		}
+		else if(dr[0].eq(dr[1]) && (fr[0].eq(fr[1] + GSVector4i(0, 1, 0, 1)) || fr[1].eq(fr[0] + GSVector4i(0, 1, 0, 1))))
+		{
+			// dq5:
+			//
+			// fr[0] = 0 1 512 445
+			// fr[1] = 0 0 512 444
+			// dr[0] = 127 50 639 494
+			// dr[1] = 127 50 639 494
+
+			int top = min(fr[0].top, fr[1].top);
+			int bottom = min(fr[0].bottom, fr[1].bottom);
+			// Here both frame rects collapse onto the smaller top/bottom pair.
+			fr[0].top = fr[1].top = top;
+			fr[0].bottom = fr[1].bottom = bottom;
+
+			// blurdetected = true;
+		}
+		//printf("samesrc = %d blurdetected = %d\n",samesrc,blurdetected);
+	}
+
+	GSVector2i fs(0, 0); // largest right/bottom extent of the dst rects computed below
+	GSVector2i ds(0, 0); // fs, with y doubled when SMODE2.INT and SMODE2.FFMD are both set
+
+	GSTexture* tex[2] = {NULL, NULL};
+
+	if(samesrc && fr[0].bottom == fr[1].bottom)
+	{
+		tex[0] = GetOutput(0);
+		tex[1] = tex[0]; // saves one texture fetch
+	}
+	else
+	{
+		if(en[0]) tex[0] = GetOutput(0);
+		if(en[1]) tex[1] = GetOutput(1);
+	}
+
+	GSVector4 src[2]; // left uninitialized for outputs that are skipped in the loop below
+	GSVector4 dst[2];
+
+	for(int i = 0; i < 2; i++)
+	{
+		if(!en[i] || !tex[i]) continue;
+
+		GSVector4i r = fr[i];
+
+		// overscan hack
+
+		if(dr[i].height() > 512) // hmm
+		{
+			int y = GetDeviceSize(i).y;
+			r.bottom = r.top + y;
+		}
+
+		GSVector4 scale = GSVector4(tex[i]->GetScale()).xyxy();
+		// Source rect: the frame rect scaled and divided by the texture size.
+		src[i] = GSVector4(r) * scale / GSVector4(tex[i]->GetSize()).xyxy();
+
+		GSVector2 off(0, 0);
+
+		if(dr[i].top - baseline >= 4) // 2?
+		{
+			off.y = tex[i]->GetScale().y * (dr[i].top - baseline);
+
+			if(m_regs->SMODE2.INT && m_regs->SMODE2.FFMD)
+			{
+				off.y /= 2;
+			}
+		}
+
+		dst[i] = GSVector4(off).xyxy() + scale * GSVector4(r.rsize());
+
+		fs.x = max(fs.x, (int)(dst[i].z + 0.5f)); // round to nearest
+		fs.y = max(fs.y, (int)(dst[i].w + 0.5f));
+	}
+
+	ds = fs;
+
+	if(m_regs->SMODE2.INT && m_regs->SMODE2.FFMD)
+	{
+		ds.y *= 2;
+	}
+
+	bool slbg = m_regs->PMODE.SLBG;
+	bool mmod = m_regs->PMODE.MMOD;
+
+	if(tex[0] || tex[1])
+	{
+		if(tex[0] == tex[1] && !slbg && (src[0] == src[1] & dst[0] == dst[1]).alltrue())
+		{
+			// the two outputs are identical, skip drawing one of them (the one that is alpha blended)
+
+			tex[0] = NULL;
+		}
+		// Background colour from BGCOLOR with PMODE.ALP as the fourth component, all scaled by 1/255.
+		GSVector4 c = GSVector4((int)m_regs->BGCOLOR.R, (int)m_regs->BGCOLOR.G, (int)m_regs->BGCOLOR.B, (int)m_regs->PMODE.ALP) / 255;
+
+		m_dev->Merge(tex, src, dst, fs, slbg, mmod, c);
+
+		if(m_regs->SMODE2.INT && m_interlace > 0)
+		{
+			if (m_interlace == 7 && m_regs->SMODE2.FFMD == 1) // Auto interlace enabled / Odd frame interlace setting
+			{
+				int field2 = 0;
+				int mode = 2;
+				m_dev->Interlace(ds, field ^ field2, mode, tex[1] ? tex[1]->GetScale().y : tex[0]->GetScale().y);
+			}
+			else
+			{
+				int field2 = 1 - ((m_interlace - 1) & 1); // low bit of (m_interlace - 1), inverted
+				int mode = (m_interlace - 1) >> 1; // remaining bits of (m_interlace - 1)
+				m_dev->Interlace(ds, field ^ field2, mode, tex[1] ? tex[1]->GetScale().y : tex[0]->GetScale().y);
+			}
+		}
+		// Optional post-processing passes, in this fixed order.
+		if(m_shadeboost)
+		{
+			m_dev->ShadeBoost();
+		}
+
+		if (m_shaderfx)
+		{
+			m_dev->ExternalFX();
+		}
+
+		if(m_fxaa)
+		{
+			m_dev->FXAA();
+		}
+	}
+
+	GL_POP();
+
+	return true;
+}
+
+void GSRenderer::SetFrameLimit(bool limit)
+{
+	// Remember the new state, then re-derive the device's vsync flag from both settings.
+	m_framelimit = limit;
+	if(!m_dev) return; // no device yet: CreateDevice() applies the same rule later
+	m_dev->SetVSync(m_vsync && m_framelimit);
+}
+
+void GSRenderer::SetVSync(bool enabled)
+{
+	// NOTE(review): unlike CreateDevice()/SetFrameLimit(), m_framelimit is not consulted here - confirm that is intended.
+	m_vsync = enabled;
+	if(!m_dev) return; // no device yet
+	m_dev->SetVSync(m_vsync);
+}
+
+void GSRenderer::VSync(int field) // per-vsync work: flush, merge, title text, present, snapshot/dump, capture
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	m_perfmon.Put(GSPerfMon::Frame);
+
+	Flush();
+
+	if(!m_dev->IsLost(true))
+	{
+		if(!Merge(field ? 1 : 0))
+		{
+			return; // Merge() returned false: the rest of the function is skipped
+		}
+	}
+	else
+	{
+		ResetDevice();
+	}
+
+	m_dev->AgePool();
+
+	// osd
+
+	if((m_perfmon.GetFrame() & 0x1f) == 0) // low five bits zero: every 32nd frame number
+	{
+		m_perfmon.Update();
+
+		double fps = 1000.0f / m_perfmon.Get(GSPerfMon::Frame);
+
+		GSVector4i r = GetDisplayRect(); // NOTE(review): r is not used below
+
+		string s;
+
+#ifdef GSTITLEINFO_API_FORCE_VERBOSE
+		if (1)//force verbose reply
+#else
+		if (m_wnd->IsManaged())
+#endif
+		{
+			//GSdx owns the window's title, be verbose.
+
+			string s2 = m_regs->SMODE2.INT ? (string("Interlaced ") + (m_regs->SMODE2.FFMD ? "(frame)" : "(field)")) : "Progressive";
+
+			s = format(
+				"%lld | %d x %d | %.2f fps (%d%%) | %s - %s | %s | %d S/%d P/%d D | %d%% CPU | %.2f | %.2f",
+				m_perfmon.GetFrame(), GetInternalResolution().x, GetInternalResolution().y, fps, (int)(100.0 * fps / GetTvRefreshRate()),
+				s2.c_str(),
+				theApp.m_gs_interlace[m_interlace].name.c_str(),
+				theApp.m_gs_aspectratio[m_aspectratio].name.c_str(),
+				(int)m_perfmon.Get(GSPerfMon::SyncPoint),
+				(int)m_perfmon.Get(GSPerfMon::Prim),
+				(int)m_perfmon.Get(GSPerfMon::Draw),
+				m_perfmon.CPU(),
+				m_perfmon.Get(GSPerfMon::Swizzle) / 1024,
+				m_perfmon.Get(GSPerfMon::Unswizzle) / 1024
+			);
+
+			double fillrate = m_perfmon.Get(GSPerfMon::Fillrate);
+
+			if(fillrate > 0)
+			{
+				s += format(" | %.2f mpps", fps * fillrate / (1024 * 1024));
+
+				int sum = 0;
+				// Adds up the CPU figures of the 16 counters starting at WorkerDraw0.
+				for(int i = 0; i < 16; i++)
+				{
+					sum += m_perfmon.CPU(GSPerfMon::WorkerDraw0 + i);
+				}
+
+				s += format(" | %d%% CPU", sum);
+			}
+		}
+		else
+		{
+			// Satisfy PCSX2's request for title info: minimal verbosity due to more external title text
+
+			s = format("%dx%d | %s", GetInternalResolution().x, GetInternalResolution().y, theApp.m_gs_interlace[m_interlace].name.c_str());
+		}
+
+		if(m_capture.IsCapturing())
+		{
+			s += " | Recording...";
+		}
+
+		if(m_wnd->IsManaged())
+		{
+			m_wnd->SetWindowText(s.c_str());
+		}
+		else
+		{
+			// note: do not use TryEnterCriticalSection.  It is unnecessary code complication in
+			// an area that absolutely does not matter (even if it were 100 times slower, it wouldn't
+			// be noticeable).  Besides, these locks are extremely short -- overhead of conditional
+			// is way more expensive than just waiting for the CriticalSection in 1 of 10,000,000 tries. --air
+
+			std::lock_guard<std::mutex> lock(m_pGSsetTitle_Crit);
+			// Copy at most size-1 characters; the last byte is forced to 0 right after.
+			strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
+
+			m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer) - 1] = 0; // make sure null terminated even if text overflows
+		}
+	}
+	else
+	{
+		// [TODO]
+		// We don't have window title rights, or the window has no title,
+		// so let's use actual OSD!
+	}
+
+	if(m_frameskip)
+	{
+		return; // skipped frames are not presented, snapshotted, dumped or captured
+	}
+
+	// present
+
+	m_dev->Present(m_wnd->GetClientRect().fit(m_aspectratio), m_shader);
+
+	// snapshot
+
+	if(!m_snapshot.empty()) // a name was armed by MakeSnapshot()
+	{
+		bool shift = false;
+
+		#ifdef _WIN32
+
+		shift = !!(::GetAsyncKeyState(VK_SHIFT) & 0x8000);
+
+		#else
+
+		shift = m_shift_key;
+
+		#endif
+
+		if(!m_dump && shift) // shift held and no dump running: also open a dump next to the snapshot
+		{
+			GSFreezeData fd;
+			fd.size = 0;
+			fd.data = NULL;
+			Freeze(&fd, true); // presumably a size query that fills in fd.size - confirm against Freeze()
+			fd.data = new uint8[fd.size];
+			Freeze(&fd, false);
+
+			m_dump.Open(m_snapshot, m_crc, fd, m_regs);
+
+			delete [] fd.data;
+		}
+
+		if(GSTexture* t = m_dev->GetCurrent())
+		{
+			t->Save(m_snapshot + ".bmp", true);
+		}
+
+		m_snapshot.clear(); // the request is consumed even when no current texture was available
+	}
+	else
+	{
+		if(m_dump)
+		{
+            bool control = false;
+
+            #ifdef _WIN32
+
+            control = !!(::GetAsyncKeyState(VK_CONTROL) & 0x8000);
+
+			#else
+
+			control = m_control_key;
+
+            #endif
+
+	    	m_dump.VSync(field, !control, m_regs);
+		}
+	}
+
+	// capture
+
+	if(m_capture.IsCapturing())
+	{
+		if(GSTexture* current = m_dev->GetCurrent())
+		{
+			GSVector2i size = m_capture.GetSize();
+
+			if(GSTexture* offscreen = m_dev->CopyOffscreen(current, GSVector4(0, 0, 1, 1), size.x, size.y))
+			{
+				GSTexture::GSMap m;
+
+				if(offscreen->Map(m))
+				{
+					m_capture.DeliverFrame(m.bits, m.pitch, !m_dev->IsRBSwapped());
+
+					offscreen->Unmap();
+				}
+
+				m_dev->Recycle(offscreen); // handed back to the device whether or not Map() succeeded
+			}
+		}
+	}
+}
+
+bool GSRenderer::MakeSnapshot(const string& path)
+{
+	// Arms a snapshot named "<path>_<YYYYmmddHHMMSS>"; VSync() saves it and clears the name.
+	if(m_snapshot.empty())
+	{
+		time_t t = time(NULL);
+		const struct tm* now = localtime(&t); // NULL when the time cannot be converted
+
+		char buff[16]; // 14 digits plus the terminator
+		if(now != NULL && strftime(buff, sizeof(buff), "%Y%m%d%H%M%S", now))
+		{
+			m_snapshot = format("%s_%s", path.c_str(), buff);
+		}
+	}
+	return true; // always true: a request that is already pending or could not be named is not an error
+}
+
+bool GSRenderer::BeginCapture()
+{
+	// Aspect ratio of the fitted output area; the height is clamped to 1 so the division is safe.
+	GSVector4i area = m_wnd->GetClientRect().fit(m_aspectratio);
+	const float ratio = (float)area.width() / max(1, area.height());
+	return m_capture.BeginCapture(GetTvRefreshRate(), GetInternalResolution(), ratio);
+}
+
+void GSRenderer::EndCapture()
+{
+	m_capture.EndCapture(); // plain forward to the capture object
+}
+
+void GSRenderer::KeyEvent(GSKeyEventData* e) // hotkeys; handled on _WIN32 and __linux__ only, a no-op elsewhere
+{
+#ifdef _WIN32
+	if(e->type == KEYPRESS)
+	{
+
+		int step = (::GetAsyncKeyState(VK_SHIFT) & 0x8000) ? -1 : 1; // shift held: cycle backwards
+
+		switch(e->key)
+		{
+		case VK_F5:
+			m_interlace = (m_interlace + s_interlace_nb + step) % s_interlace_nb; // adding the count first keeps a -1 step from going negative
+			printf("GSdx: Set deinterlace mode to %d (%s).\n", (int)m_interlace, theApp.m_gs_interlace.at(m_interlace).name.c_str());
+			return;
+		case VK_F6:
+			if( m_wnd->IsManaged() ) // the aspect ratio only changes for a managed window
+				m_aspectratio = (m_aspectratio + s_aspect_ratio_nb + step) % s_aspect_ratio_nb;
+			return;
+		case VK_F7:
+			m_shader = (m_shader + s_post_shader_nb + step) % s_post_shader_nb;
+			printf("GSdx: Set shader to: %d.\n", (int)m_shader);
+			return;
+		case VK_DELETE:
+			m_aa1 = !m_aa1;
+			printf("GSdx: (Software) Edge anti-aliasing is now %s.\n", m_aa1 ? "enabled" : "disabled");
+			return;
+		case VK_INSERT:
+			m_mipmap = !m_mipmap;
+			printf("GSdx: (Software) Mipmapping is now %s.\n", m_mipmap ? "enabled" : "disabled");
+			return;
+		case VK_PRIOR:
+			m_fxaa = !m_fxaa;
+			printf("GSdx: FXAA anti-aliasing is now %s.\n", m_fxaa ? "enabled" : "disabled");
+			return;
+		case VK_HOME:
+			m_shaderfx = !m_shaderfx;
+			printf("GSdx: External post-processing is now %s.\n", m_shaderfx ? "enabled" : "disabled");
+			return;
+		}
+
+	}
+#elif defined(__linux__)
+	if(e->type == KEYPRESS)
+	{
+		int step = m_shift_key ? -1 : 1; // shift state is tracked by the cases at the end of this switch
+
+		switch(e->key)
+		{
+		case XK_F5:
+			m_interlace = (m_interlace + s_interlace_nb + step) % s_interlace_nb;
+			printf("GSdx: Set deinterlace mode to %d (%s).\n", (int)m_interlace, theApp.m_gs_interlace.at(m_interlace).name.c_str());
+			return;
+		case XK_F6:
+			if( m_wnd->IsManaged() )
+				m_aspectratio = (m_aspectratio + s_aspect_ratio_nb + step) % s_aspect_ratio_nb;
+			return;
+		case XK_F7:
+			m_shader = (m_shader + s_post_shader_nb + step) % s_post_shader_nb;
+			printf("GSdx: Set shader %d.\n", (int)m_shader);
+			return;
+		case XK_Delete:
+			m_aa1 = !m_aa1;
+			printf("GSdx: (Software) Edge anti-aliasing is now %s.\n", m_aa1 ? "enabled" : "disabled");
+			return;
+		case XK_Insert:
+			m_mipmap = !m_mipmap;
+			printf("GSdx: (Software) Mipmapping is now %s.\n", m_mipmap ? "enabled" : "disabled");
+			return;
+		case XK_Prior:
+			m_fxaa = !m_fxaa;
+			printf("GSdx: FXAA anti-aliasing is now %s.\n", m_fxaa ? "enabled" : "disabled");
+			return;
+		case XK_Home:
+			m_shaderfx = !m_shaderfx;
+			printf("GSdx: External post-processing is now %s.\n", m_shaderfx ? "enabled" : "disabled");
+			return;
+		case XK_Shift_L:
+		case XK_Shift_R:
+			m_shift_key = true; // modifier pressed
+			return;
+		case XK_Control_L:
+		case XK_Control_R:
+			m_control_key = true;
+			return;
+		}
+
+	}
+	else if(e->type == KEYRELEASE) // releases only update the modifier flags
+	{
+		switch(e->key)
+		{
+			case XK_Shift_L:
+			case XK_Shift_R:
+				m_shift_key = false;
+				return;
+			case XK_Control_L:
+			case XK_Control_R:
+				m_control_key = false;
+				return;
+		}
+	}
+#endif
+}
diff --git a/plugins/GSdx_legacy/GSRenderer.h b/plugins/GSdx_legacy/GSRenderer.h
new file mode 100644
index 0000000000..cca70a3486
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRenderer.h
@@ -0,0 +1,86 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSdx.h"
+#include "GSWnd.h"
+#include "GSState.h"
+#include "GSCapture.h"
+
+class GSRenderer : public GSState
+{
+	GSCapture m_capture;
+	string m_snapshot; // pending snapshot name: set by MakeSnapshot(), consumed in VSync()
+	int m_shader; // passed to GSDevice::Present(); cycled with F7
+
+	bool Merge(int field);
+
+	// Only used on linux
+	bool m_shift_key;
+	bool m_control_key;
+
+protected:
+	int m_interlace; // read from the "interlace" setting, cycled with F5
+	int m_aspectratio; // read from the "aspectratio" setting, cycled with F6
+	int m_filter;
+	bool m_vsync;
+	bool m_aa1;
+	bool m_framelimit;
+	bool m_shaderfx;
+	bool m_fxaa;
+	bool m_shadeboost;
+	bool m_texture_shuffle;
+	// Texture for output i; Merge() calls this for each enabled output.
+	virtual GSTexture* GetOutput(int i) = 0;
+
+public:
+	GSWnd* m_wnd; // deleted in the destructor
+	GSDevice* m_dev; // NULL until CreateDevice() succeeds; deleted in the destructor
+
+public:
+	GSRenderer();
+	virtual ~GSRenderer();
+
+	virtual bool CreateWnd(const string& title, int w, int h);
+	virtual bool CreateDevice(GSDevice* dev);
+	virtual void ResetDevice();
+	virtual void VSync(int field);
+	virtual bool MakeSnapshot(const string& path);
+	virtual void KeyEvent(GSKeyEventData* e);
+	virtual bool CanUpscale() {return false;}
+	virtual int GetUpscaleMultiplier() {return 1;}
+	virtual GSVector2i GetInternalResolution() { // default: the size of the display rect
+		return GSVector2i(GetDisplayRect().width(), GetDisplayRect().height());
+	}
+	void SetAspectRatio(int aspect) {m_aspectratio = aspect;} // stored as given, no range check
+	void SetVSync(bool enabled);
+	void SetFrameLimit(bool limit);
+	virtual void SetExclusive(bool isExcl) {}
+
+	virtual bool BeginCapture();
+	virtual void EndCapture();
+
+public:
+	std::mutex m_pGSsetTitle_Crit; // held by VSync() while it writes m_GStitleInfoBuffer
+
+	char m_GStitleInfoBuffer[128]; // title text written by VSync() when the window is not managed
+};
diff --git a/plugins/GSdx_legacy/GSRendererCL.cpp b/plugins/GSdx_legacy/GSRendererCL.cpp
new file mode 100644
index 0000000000..1f90f71046
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererCL.cpp
@@ -0,0 +1,2248 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererCL.h"
+
+#ifdef ENABLE_OPENCL
+
+#define LOG 0
+
+static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; // NULL while LOG is 0
+
+#define MAX_FRAME_SIZE 2048
+#define MAX_PRIM_COUNT 4096u
+#define MAX_PRIM_PER_BATCH_BITS 5
+#define MAX_PRIM_PER_BATCH (1u << MAX_PRIM_PER_BATCH_BITS) // 32 with the value above
+#define BATCH_COUNT(prim_count) (((prim_count) + (MAX_PRIM_PER_BATCH - 1)) / MAX_PRIM_PER_BATCH) // rounds up
+#define MAX_BATCH_COUNT BATCH_COUNT(MAX_PRIM_COUNT) // 4096 / 32 = 128
+#define BIN_SIZE_BITS 4
+#define BIN_SIZE (1u << BIN_SIZE_BITS) // 16
+#define MAX_BIN_PER_BATCH ((MAX_FRAME_SIZE / BIN_SIZE) * (MAX_FRAME_SIZE / BIN_SIZE)) // 128 * 128 = 16384
+#define MAX_BIN_COUNT (MAX_BIN_PER_BATCH * MAX_BATCH_COUNT) // 16384 * 128 = 2097152
+#define TFX_PARAM_SIZE 2048
+#define TFX_MAX_PARAM_COUNT 256
+// BIN_TYPE is an integer with exactly MAX_PRIM_PER_BATCH bits (presumably one bit per primitive - confirm in the kernels).
+#if MAX_PRIM_PER_BATCH == 64u
+#define BIN_TYPE cl_ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+#define BIN_TYPE cl_uint
+#else
+#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+// The structs below are declared under 1-byte packing.
+#pragma pack(push, 1)
+
+typedef struct
+{
+	GSVertexCL v[4];
+} gs_prim;
+
+typedef struct
+{
+	cl_float4 dx, dy;
+	cl_float4 zero;
+	cl_float4 reject_corner;
+} gs_barycentric;
+// Fixed-size arrays dimensioned by the limits defined above.
+typedef struct
+{
+	struct { cl_uint first, last; } bounds[MAX_BIN_PER_BATCH];
+	BIN_TYPE bin[MAX_BIN_COUNT];
+	cl_uchar4 bbox[MAX_PRIM_COUNT];
+	gs_prim prim[MAX_PRIM_COUNT];
+	gs_barycentric barycentric[MAX_PRIM_COUNT];
+} gs_env;
+
+#pragma pack(pop)
+
+GSRendererCL::GSRendererCL()
+	: m_vb_count(0)
+	, m_synced(true)
+{
+	m_nativeres = true; // ignore ini, sw is always native
+
+	memset(m_texture, 0, sizeof(m_texture)); // all output texture slots start as NULL
+	// Staging buffer for GetOutput(): 1024 x 1024 32-bit pixels, released in the destructor.
+	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
+
+	for(int i = 0; i < 4; i++)
+	{
+		m_rw_pages[0][i] = GSVector4i::zero();
+		m_rw_pages[1][i] = GSVector4i::zero();
+		m_tc_pages[i] = GSVector4i::xffffffff();
+	}
+
+	memset(m_rw_pages_rendering, 0, sizeof(m_rw_pages_rendering));
+	// Fills m_cvb[P][tme][fst] with the four ConvertVertexBuffer instantiations for primitive class P.
+	#define InitCVB(P) \
+		m_cvb[P][0][0] = &GSRendererCL::ConvertVertexBuffer<P, 0, 0>; \
+		m_cvb[P][0][1] = &GSRendererCL::ConvertVertexBuffer<P, 0, 1>; \
+		m_cvb[P][1][0] = &GSRendererCL::ConvertVertexBuffer<P, 1, 0>; \
+		m_cvb[P][1][1] = &GSRendererCL::ConvertVertexBuffer<P, 1, 1>; \
+
+	InitCVB(GS_POINT_CLASS);
+	InitCVB(GS_LINE_CLASS);
+	InitCVB(GS_TRIANGLE_CLASS);
+	InitCVB(GS_SPRITE_CLASS);
+
+	// NOTE: m_cl.vm may be cached on the device according to the specs, there are a couple of places where we access m_mem.m_vm8 without 
+	// mapping the buffer (after the two invalidate* calls and in getoutput), it is currently not an issue, but on some devices it may be.
+
+	m_cl.vm = cl::Buffer(m_cl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (size_t)m_mem.m_vmsize, m_mem.m_vm8, NULL); // wraps the existing m_vm8 storage
+	m_cl.tex = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY, (size_t)m_mem.m_vmsize); // separate buffer of the same size
+}
+
+GSRendererCL::~GSRendererCL()
+{
+	// Destroy the cached output textures, then free the staging buffer from the constructor.
+	for(auto* tex : m_texture)
+	{
+		delete tex;
+	}
+	_aligned_free(m_output);
+}
+
+void GSRendererCL::Reset()
+{
+	Sync(-1);
+	// Sync() runs before the base-class reset is invoked.
+	GSRenderer::Reset();
+}
+
+static int pageuploads = 0;
+static int pageuploadcount = 0;
+static int tfxcount = 0;
+static int64 tfxpixels = 0;
+static int tfxselcount = 0;
+static int tfxdiffselcount = 0;
+
+void GSRendererCL::VSync(int field)
+{
+	// Run the common end-of-frame work, then zero this file's per-frame debug counters.
+	GSRenderer::VSync(field);
+
+	pageuploads = 0;
+	pageuploadcount = 0;
+	tfxcount = 0;
+	tfxpixels = 0;
+	tfxselcount = tfxdiffselcount = 0;
+}
+
+void GSRendererCL::ResetDevice()
+{
+	// Destroy every cached output texture and leave its slot NULL for later reuse.
+	for(auto*& tex : m_texture)
+	{
+		delete tex;
+		tex = NULL;
+	}
+}
+
+GSTexture* GSRendererCL::GetOutput(int i) // reads display buffer i into m_texture[i] and returns that slot
+{
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	int w = DISPFB.FBW * 64;
+	int h = GetFrameRect(i).bottom;
+
+	// TODO: round up bottom
+
+	if(m_dev->ResizeTexture(&m_texture[i], w, h))
+	{
+		static int pitch = 1024 * 4; // bytes per m_output row: 1024 pixels of 4 bytes
+		// NOTE(review): nothing here bounds w/h to the 1024x1024 m_output buffer - confirm the registers guarantee it.
+		GSVector4i r(0, 0, w, h);
+
+		const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
+
+		GIFRegBITBLTBUF BITBLTBUF;
+		// Only the three source fields are set; the temporary describes the display buffer for InvalidateLocalMem().
+		BITBLTBUF.SBP = DISPFB.Block();
+		BITBLTBUF.SBW = DISPFB.FBW;
+		BITBLTBUF.SPSM = DISPFB.PSM;
+
+		InvalidateLocalMem(BITBLTBUF, r);
+		// Read the block-aligned rectangle through the format's rtx function into m_output.
+		(m_mem.*psm.rtx)(m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM), r.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
+
+		m_texture[i]->Update(r, m_output, pitch);
+
+		if(s_dump)
+		{
+			if(s_save && s_n >= s_saven)
+			{
+				m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM));
+			}
+
+			s_n++;
+		}
+	}
+
+	return m_texture[i]; // returned whether or not ResizeTexture() succeeded
+}
+
+const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f);
+
+template<uint32 primclass, uint32 tme, uint32 fst>
+void GSRendererCL::ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
+{
+	GSVector4i o = (GSVector4i)m_context->XYOFFSET;
+	GSVector4 st_scale = GSVector4(16 << m_context->TEX0.TW, 16 << m_context->TEX0.TH, 1, 0);
+	// NOTE(review): 'count' is never read; the loop length comes from m_vertex.next - confirm callers always pass that value.
+	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
+	{
+		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
+
+		GSVector4i xyzuvf(src->m[1]);
+
+		dst->p = (GSVector4(xyzuvf.upl16() - o) * g_pos_scale).xyxy(GSVector4::cast(xyzuvf.ywyw())); // pass zf as uints
+
+		GSVector4 t = GSVector4::zero(); // stays zero when tme is 0
+
+		if(tme)
+		{
+			if(fst)
+			{
+				#if _M_SSE >= 0x401
+
+				t = GSVector4(xyzuvf.uph16());
+					
+				#else
+
+				t = GSVector4(GSVector4i::load(src->UV).upl16());
+
+				#endif
+			}
+			else
+			{
+				t = stcq.xyww() * st_scale; // fst == 0: texture coordinates come from the s/t/q vector
+			}
+		}
+
+		dst->t = t.insert32<2, 3>(stcq); // color as uchar4 in t.w
+	}
+}
+
+// Turns the primitives accumulated in m_vertex / m_index into a TFXJob and appends it to m_jobs.
+//
+// Steps:
+//  1. clip the bounding box of the batch against the scissor rectangle, nothing is queued for an empty result
+//  2. make room in the mapped vertex/index/parameter buffers (grow them, or flush and switch to the other buffer set)
+//  3. convert the vertices, copy/rebase the indices and fill in the parameter block through SetupParameter
+//  4. record which pages the job reads and writes (m_rw_pages and the job's own dst pages)
+//  5. call Enqueue() once enough parameter blocks or vertices are buffered
+//
+// OpenCL and standard exceptions are logged with printf and end the draw early.
+// When s_dump is set, textures and frame/z buffers are saved to disk before and after the draw.
+void GSRendererCL::Draw()
+{
+	const GSDrawingContext* context = m_context;
+
+	GSVector4i scissor = GSVector4i(context->scissor.in);
+	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (example: single line 0,0->256,0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
+	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
+
+	GSVector4i rect = bbox.rintersect(scissor);
+
+	if(rect.rempty())
+	{
+		return;
+	}
+
+	if(s_dump)
+	{
+		// finish all pending rendering first so the dumped memory is up to date
+
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+
+		std::string s;
+
+		if(s_save && s_n >= s_saven && PRIM->TME)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
+
+			m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
+		}
+
+		s_n++;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+
+	try
+	{
+		// bytes this batch needs in each of the three mapped buffers
+
+		size_t vb_size = m_vertex.next * sizeof(GSVertexCL);
+		size_t ib_size = m_index.tail * sizeof(uint32);
+		size_t pb_size = TFX_PARAM_SIZE;
+
+		ASSERT(sizeof(TFXParameter) <= TFX_PARAM_SIZE);
+
+		if(m_cl.vb.tail + vb_size > m_cl.vb.size || m_cl.ib.tail + ib_size > m_cl.ib.size || m_cl.pb.tail + pb_size > m_cl.pb.size)
+		{
+			if(vb_size > m_cl.vb.size || ib_size > m_cl.ib.size)
+			{
+				// buffer too small for even one batch, allow twice the size (at least 1 MB)
+
+				Sync(2); // must sync, reallocating the input buffers
+
+				m_cl.Unmap();
+
+				m_cl.vb.size = 0;
+				m_cl.ib.size = 0;
+
+				size_t size = std::max(vb_size * 2, (size_t)2 << 20);
+
+				// size_t must not be passed for %d, print it through a fixed width type instead
+
+				printf("growing vertex/index buffer %llu\n", (unsigned long long)size);
+
+				m_cl.vb.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.vb.size = size;
+
+				size = std::max(size / sizeof(GSVertex) * 3 * sizeof(uint32), (size_t)1 << 20); // worst case, three times the vertex count
+
+				ASSERT(size >= ib_size);
+
+				if(size < ib_size) size = ib_size; // should not happen
+
+				m_cl.ib.buff[0] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.buff[1] = cl::Buffer(m_cl.context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size);
+				m_cl.ib.size = size;
+			}
+			else
+			{
+				// the batch fits into an empty buffer, flush what is queued and start over in the other one
+
+				Enqueue();
+
+				m_cl.Unmap();
+
+				// make the write queue wait until the rendering queue is ready, it may still use the device buffers
+
+				std::vector<cl::Event> el(1);
+
+				m_cl.queue[2].enqueueMarker(&el[0]);
+				m_cl.wq->enqueueWaitForEvents(el);
+
+				// switch to the other queue/buffer (double buffering)
+
+				m_cl.wqidx = (m_cl.wqidx + 1) & 1;
+				m_cl.wq = &m_cl.queue[m_cl.wqidx];
+			}
+
+			m_cl.vb.head = m_cl.vb.tail = 0;
+			m_cl.ib.head = m_cl.ib.tail = 0;
+			m_cl.pb.head = m_cl.pb.tail = 0;
+
+			m_cl.Map();
+		}
+		else
+		{
+			// only allow batches of the same primclass in Enqueue
+
+			if(!m_jobs.empty() && m_jobs.front()->sel.prim != (uint32)m_vt.m_primclass)
+			{
+				Enqueue();
+			}
+		}
+
+		// write positions of this batch inside the mapped buffers
+
+		GSVertexCL* vb = (GSVertexCL*)(m_cl.vb.ptr + m_cl.vb.tail);
+		uint32* ib = (uint32*)(m_cl.ib.ptr + m_cl.ib.tail);
+		TFXParameter* pb = (TFXParameter*)(m_cl.pb.ptr + m_cl.pb.tail);
+
+		(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(vb, m_vertex.buff, m_vertex.next); // TODO: upload in GSVertex format and extract the fields in the kernel?
+
+		if(m_jobs.empty())
+		{
+			// first job since the last Enqueue, indices need no rebasing and the start offsets are recorded
+
+			memcpy(ib, m_index.buff, m_index.tail * sizeof(uint32));
+
+			m_vb_start = m_cl.vb.tail;
+			m_vb_count = 0;
+			m_pb_start = m_cl.pb.tail;
+			m_pb_count = 0;
+		}
+		else
+		{
+			// TODO: SIMD
+
+			ASSERT(m_pb_count < TFX_MAX_PARAM_COUNT);
+
+			// rebase the indices by the vertices already queued, the parameter block index goes into the top 8 bits
+
+			uint32 vb_count = m_vb_count | (m_pb_count << 24);
+
+			for(size_t i = 0; i < m_index.tail; i++)
+			{
+				ib[i] = m_index.buff[i] + vb_count;
+			}
+		}
+
+		shared_ptr<TFXJob> job(new TFXJob());
+
+		if(!SetupParameter(job.get(), pb, vb, m_vertex.next, m_index.buff, m_index.tail))
+		{
+			// nothing to draw, the buffer tails were not advanced so the data written above is simply overwritten later
+
+			return;
+		}
+
+		pb->scissor = scissor;
+
+		if(bbox.eq(bbox.rintersect(scissor)))
+		{
+			// the batch lies completely inside the scissor rectangle
+
+			pb->sel.noscissor = 1;
+		}
+
+		job->rect.x = rect.x;
+		job->rect.y = rect.y;
+		job->rect.z = rect.z;
+		job->rect.w = rect.w;
+		job->sel = pb->sel;
+		job->ib_start = m_cl.ib.tail;
+		job->prim_count = m_index.tail / GSUtil::GetClassVertexCount(m_vt.m_primclass);
+		job->fbp = pb->fbp;
+		job->zbp = pb->zbp;
+		job->bw = pb->bw;
+		job->fpsm = context->FRAME.PSM;
+		job->zpsm = context->ZBUF.PSM;
+		job->tpsm = context->TEX0.PSM;
+
+#ifdef DEBUG
+		job->pb = pb;
+#endif
+		m_jobs.push_back(job);
+
+		m_vb_count += m_vertex.next;
+		m_pb_count++;
+
+		m_cl.vb.tail += vb_size;
+		m_cl.ib.tail += ib_size;
+		m_cl.pb.tail += pb_size;
+
+		m_synced = false;
+
+		// mark pages used in rendering as source or target
+
+		if(job->sel.fwrite || job->sel.rfb)
+		{
+			m_context->offset.fb->GetPagesAsBits(rect, m_tmp_pages);
+
+			if(job->sel.rfb)
+			{
+				for(int i = 0; i < 4; i++)
+				{
+					m_rw_pages[0][i] |= m_tmp_pages[i];
+				}
+			}
+
+			if(job->sel.fwrite)
+			{
+				GSVector4i* dst_pages = job->GetDstPages();
+
+				for(int i = 0; i < 4; i++)
+				{
+					m_rw_pages[1][i] |= m_tmp_pages[i];
+
+					dst_pages[i] |= m_tmp_pages[i];
+				}
+			}
+		}
+
+		if(job->sel.zwrite || job->sel.rzb)
+		{
+			m_context->offset.zb->GetPagesAsBits(rect, m_tmp_pages);
+
+			if(job->sel.rzb)
+			{
+				for(int i = 0; i < 4; i++)
+				{
+					m_rw_pages[0][i] |= m_tmp_pages[i];
+				}
+			}
+
+			if(job->sel.zwrite)
+			{
+				GSVector4i* dst_pages = job->GetDstPages();
+
+				for(int i = 0; i < 4; i++)
+				{
+					m_rw_pages[1][i] |= m_tmp_pages[i];
+
+					dst_pages[i] |= m_tmp_pages[i];
+				}
+			}
+		}
+
+		if(job->src_pages != NULL)
+		{
+			// texture pages collected by SetupParameter count as read pages
+
+			for(int i = 0; i < 4; i++)
+			{
+				m_rw_pages[0][i] |= job->src_pages[i];
+
+				if(job->dst_pages != NULL && !(job->dst_pages[i] & job->src_pages[i]).eq(GSVector4i::zero()))
+				{
+					//printf("src and dst overlap!\n");
+				}
+			}
+		}
+
+		// don't buffer too much data, feed them to the device if there is enough
+
+		if(m_pb_count >= TFX_MAX_PARAM_COUNT || m_vb_count >= 4096)
+		{
+			Enqueue();
+		}
+	}
+	catch(const cl::Error& err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+
+		return;
+	}
+	catch(const std::exception& err)
+	{
+		// caught by reference, a by-value catch would slice the exception and lose the derived what() message
+
+		printf("%s\n", err.what());
+
+		return;
+	}
+
+	if(s_dump)
+	{
+		// render what was just queued so the dumps show the result of this draw
+
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+
+		std::string s;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+	}
+}
+
+// Flushes every queued job and blocks until the rendering queue has finished.
+// Afterwards the read/write page bitmaps are empty and m_synced is set.
+// reason only shows up in the log output.
+void GSRendererCL::Sync(int reason)
+{
+	if(LOG)
+	{
+		fprintf(s_fp, "Sync (%d)\n", reason);
+		fflush(s_fp);
+	}
+
+	GSPerfMonAutoTimer timer(&m_perfmon, GSPerfMon::Sync);
+
+	Enqueue();
+
+	m_cl.queue[2].finish();
+
+	// nothing is pending anymore, forget the pages recorded by Draw
+
+	for(int l = 0; l < 2; l++)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			m_rw_pages[l][i] = GSVector4i::zero();
+		}
+	}
+
+	// every per-page counter is expected to be back at zero once the queue is idle
+
+	for(int page = 0; page < MAX_PAGES; page++)
+	{
+		ASSERT(m_rw_pages_rendering[page] == 0);
+	}
+
+	m_synced = true;
+}
+
+// Called for a write into local memory described by BITBLTBUF (destination fields) and rectangle r.
+// If rendering is outstanding it first flushes queued jobs that touch the affected pages, then busy-waits
+// until none of those pages has a non-zero entry in m_rw_pages_rendering.
+// Finally the affected pages are flagged in m_tc_pages.
+void GSRendererCL::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+	
+	GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
+
+	// page bitmap of the written area
+	o->GetPagesAsBits(r, m_tmp_pages);
+
+	if(!m_synced)
+	{
+		// i is declared outside the loop so that a retry resumes at the word that overlapped
+		int i = 0;
+
+		bool wait;
+
+		// phase 1: pages referenced by jobs that are still only queued in m_jobs (read or write)
+		do
+		{
+			wait = false;
+
+			for(; i < 4; i++)
+			{
+				GSVector4i pages = m_rw_pages[0][i] | m_rw_pages[1][i];
+
+				if(!(pages & m_tmp_pages[i]).eq(GSVector4i::zero()))
+				{
+					// TODO: an awesome idea to avoid this Sync
+					// - call Enqueue() to flush m_jobs 
+					// - append rendering queue with a kernel that writes the incoming data to m_mem.vm and tell the parent class to not do it
+					// - the only problem, clut has to be read directly by the texture sampler, can't attach it to gs_param before being written
+
+					//Sync(3);
+
+					// Enqueue hands the pages over to m_rw_pages_rendering through UsePages, which also clears m_rw_pages
+					Enqueue();
+
+					wait = true;
+
+					break;
+				}
+			}
+
+			_mm_pause();
+		}
+		while(wait);
+
+		if(!m_synced)
+		{
+			// phase 2: pages still in use by jobs that were already sent to the device
+			o->GetPages(r, m_tmp_pages2); // TODO: don't ask twice
+			
+			const uint32* p = m_tmp_pages2;
+
+			do
+			{
+				wait = false;
+
+				// p is not reset between retries, pages that were already found idle are not checked again
+				for(; *p != GSOffset::EOP; p++)
+				{
+					// the counter is lowered by ReleasePages, which is invoked from the completion callback of Enqueue
+					if(m_rw_pages_rendering[*p])
+					{
+						// Sync(5);
+
+						wait = true;
+
+						break;
+					}
+				}
+				/*
+				if(!m_synced)
+				{
+					void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+					m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr);
+				}
+				*/
+
+				_mm_pause();
+			}
+			while(wait);
+		}
+	}
+
+	// the written pages are out of date in the texture cache from now on (see UpdateTextureCache)
+	for(int i = 0; i < 4; i++)
+	{
+		m_tc_pages[i] |= m_tmp_pages[i];
+	}
+}
+
+// Called before local memory described by BITBLTBUF (source fields) and rectangle r is read back.
+// Syncs with the device when a queued or in-flight job writes to any of the affected pages.
+// clut only changes the tag printed in the log.
+void GSRendererCL::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+
+	if(m_synced)
+	{
+		return;
+	}
+
+	GSOffset* off = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
+
+	off->GetPagesAsBits(r, m_tmp_pages);
+
+	// jobs waiting in m_jobs: look for the first word where written pages intersect the area to read
+
+	int word = 0;
+
+	while(word < 4 && (m_rw_pages[1][word] & m_tmp_pages[word]).eq(GSVector4i::zero()))
+	{
+		word++;
+	}
+
+	if(word < 4)
+	{
+		Sync(4);
+	}
+
+	if(m_synced)
+	{
+		return;
+	}
+
+	// jobs already on the device: the upper 16 bits of the per-page counter track writers
+
+	off->GetPages(r, m_tmp_pages2); // TODO: don't ask twice
+
+	const uint32* page = m_tmp_pages2;
+
+	while(*page != GSOffset::EOP)
+	{
+		if((m_rw_pages_rendering[*page] & 0xffff0000) != 0)
+		{
+			Sync(6);
+
+			break;
+		}
+
+		page++;
+	}
+	/*
+	if(!m_synced)
+	{
+		void* ptr = m_cl.wq->enqueueMapBuffer(m_cl.vm, CL_TRUE, CL_MAP_READ, 0, m_mem.m_vmsize);
+		m_cl.wq->enqueueUnmapMemObject(m_cl.vm, ptr);
+	}
+	*/
+}
+
+// Payload passed to ReleasePageEvent: the renderer that queued the work and the two EOP terminated page lists written by UsePages.
+typedef struct { GSRendererCL* r; uint32 pages[(MAX_PAGES + 1) * 2]; } cb_data;
+
+// Sends all jobs collected in m_jobs to the rendering queue (m_cl.queue[2]) and empties m_jobs.
+//
+// The jobs are processed in groups of at most MAX_PRIM_COUNT primitives. For each group the primitive
+// kernel, the tile (binning) kernels and finally the TFX kernels (through JoinTFX / EnqueueTFX) are enqueued.
+// A job that does not fit into the current group is split and its remainder starts the next group.
+//
+// The pages referenced by the jobs are handed to UsePages before anything is enqueued; they are released
+// again by ReleasePageEvent once a marker enqueued after the kernels completes.
+// OpenCL errors are logged with printf, the job list is cleared in any case.
+void GSRendererCL::Enqueue()
+{
+	if(m_jobs.empty()) return;
+
+	cb_data* data = new cb_data();
+
+	data->r = this;
+
+	UsePages(data->pages);
+
+	try
+	{
+		ASSERT(m_cl.vb.tail > m_cl.vb.head);
+		ASSERT(m_cl.ib.tail > m_cl.ib.head);
+		ASSERT(m_cl.pb.tail > m_cl.pb.head);
+
+		// Draw only queues jobs of the same primitive class, the first job represents all of them
+
+		int primclass = m_jobs.front()->sel.prim;
+
+		uint32 n = GSUtil::GetClassVertexCount(primclass);
+
+		PrimSelector psel;
+
+		psel.key = 0;
+		psel.prim = primclass;
+
+		cl::Kernel& pk = m_cl.GetPrimKernel(psel);
+
+		pk.setArg(1, m_cl.vb.buff[m_cl.wqidx]);
+		pk.setArg(2, m_cl.ib.buff[m_cl.wqidx]);
+		pk.setArg(3, m_cl.pb.buff[m_cl.wqidx]);
+		pk.setArg(4, (cl_uint)m_vb_start);
+		pk.setArg(6, (cl_uint)m_pb_start);
+
+		// tile kernel variants, mode 0/1/2 are used with 32/16/8 work items per bin below, mode 3 is the generic one
+
+		TileSelector tsel;
+
+		tsel.key = 0;
+		tsel.prim = primclass;
+
+		tsel.mode = 0;
+
+		cl::Kernel& tk_32 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 1;
+
+		cl::Kernel& tk_16 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 2;
+
+		cl::Kernel& tk_8 = m_cl.GetTileKernel(tsel);
+
+		tsel.mode = 3;
+
+		cl::Kernel& tk = m_cl.GetTileKernel(tsel);
+
+		tsel.key = 0;
+		tsel.clear = 1;
+
+		cl::Kernel& tk_clear = m_cl.GetTileKernel(tsel);
+
+		// make the rendering queue wait until the write queue has finished with the input buffers
+
+		m_cl.Unmap();
+
+		std::vector<cl::Event> el(1);
+
+		m_cl.wq->enqueueMarker(&el[0]);
+		m_cl.queue[2].enqueueWaitForEvents(el);
+
+		// head is the first job of the current group, next runs ahead until the group is full
+
+		auto head = m_jobs.begin();
+
+		while(head != m_jobs.end())
+		{
+			uint32 total_prim_count = 0;
+
+			auto next = head;
+
+			while(next != m_jobs.end())
+			{
+				auto job = next++;
+
+				uint32 cur_prim_count = (*job)->prim_count;
+
+				total_prim_count += cur_prim_count;
+
+				// disabled alternative conditions (next_prim_count being the prim_count of *next, 0 at the end):
+				// || next_prim_count >= MAX_PRIM_COUNT || next_prim_count < 16 && total_prim_count >= MAX_PRIM_COUNT / 2
+
+				if(total_prim_count >= MAX_PRIM_COUNT || next == m_jobs.end())
+				{
+					uint32 prim_count = std::min(total_prim_count, MAX_PRIM_COUNT);
+
+					pk.setArg(5, (cl_uint)(*head)->ib_start);
+
+					m_cl.queue[2].enqueueNDRangeKernel(pk, cl::NullRange, cl::NDRange(prim_count), cl::NullRange);
+
+					if(0)
+					{
+						gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+						m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+					}
+
+					// area covered by the group, converted to bin coordinates
+
+					GSVector4i rect = GSVector4i::zero();
+
+					for(auto i = head; i != next; i++)
+					{
+						rect = rect.runion(GSVector4i::load<false>(&(*i)->rect));
+					}
+
+					rect = rect.ralign<Align_Outside>(GSVector2i(BIN_SIZE, BIN_SIZE)) >> BIN_SIZE_BITS;
+
+					int bin_w = rect.width();
+					int bin_h = rect.height();
+
+					uint32 batch_count = BATCH_COUNT(prim_count);
+					uint32 bin_count = bin_w * bin_h;
+
+					cl_uchar4 bin_dim;
+
+					bin_dim.s[0] = (cl_uchar)rect.x;
+					bin_dim.s[1] = (cl_uchar)rect.y;
+					bin_dim.s[2] = (cl_uchar)bin_w;
+					bin_dim.s[3] = (cl_uchar)bin_h;
+
+					if(1)//bin_w > 1 || bin_h > 1) // && not just one sprite covering the whole area
+					{
+						m_cl.queue[2].enqueueNDRangeKernel(tk_clear, cl::NullRange, cl::NDRange(bin_count), cl::NullRange);
+
+						if(bin_count <= 32 && m_cl.WIs >= 256)
+						{
+							// few bins: 3D range over the bins with 32, 16 or 8 primitives per work group
+
+							uint32 item_count;
+							uint32 group_count;
+							cl::Kernel* k;
+
+							if(bin_count <= 8)
+							{
+								item_count = std::min(prim_count, 32u);
+								group_count = ((prim_count + 31) >> 5) * item_count;
+								k = &tk_32;
+							}
+							else if(bin_count <= 16)
+							{
+								item_count = std::min(prim_count, 16u);
+								group_count = ((prim_count + 15) >> 4) * item_count;
+								k = &tk_16;
+							}
+							else
+							{
+								item_count = std::min(prim_count, 8u);
+								group_count = ((prim_count + 7) >> 3) * item_count;
+								k = &tk_8;
+							}
+
+							k->setArg(1, (cl_uint)prim_count);
+							k->setArg(2, (cl_uint)bin_count);
+							k->setArg(3, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(*k, cl::NullRange, cl::NDRange(bin_w, bin_h, group_count), cl::NDRange(bin_w, bin_h, item_count));
+						}
+						else
+						{
+							// generic path: one work group per batch of primitives
+
+							uint32 item_count = std::min(bin_count, m_cl.WIs);
+							uint32 group_count = batch_count * item_count;
+
+							tk.setArg(1, (cl_uint)prim_count);
+							tk.setArg(2, (cl_uint)bin_count);
+							tk.setArg(3, bin_dim);
+
+							m_cl.queue[2].enqueueNDRangeKernel(tk, cl::NullRange, cl::NDRange(group_count), cl::NDRange(item_count));
+						}
+
+						if(0)
+						{
+							gs_env* ptr = (gs_env*)m_cl.queue[2].enqueueMapBuffer(m_cl.env, CL_TRUE, CL_MAP_READ, 0, sizeof(gs_env));
+							m_cl.queue[2].enqueueUnmapMemObject(m_cl.env, ptr);
+						}
+					}
+
+					// JoinTFX works on a copy of the list, m_jobs itself keeps every job until the end
+
+					std::list<shared_ptr<TFXJob>> jobs(head, next);
+
+					JoinTFX(jobs);
+
+					EnqueueTFX(jobs, bin_count, bin_dim);
+
+					if(total_prim_count > MAX_PRIM_COUNT)
+					{
+						// the last job only fit partially, skip the part that was drawn and process the rest again
+
+						prim_count = cur_prim_count - (total_prim_count - MAX_PRIM_COUNT);
+
+						(*job)->ib_start += prim_count * n * sizeof(uint32);
+						(*job)->prim_count -= prim_count;
+
+						next = job; // try again for the remainder
+
+						//printf("split %d\n", (*job)->prim_count);
+					}
+
+					break;
+				}
+			}
+
+			head = next;
+		}
+	}
+	catch(const cl::Error& err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+	}
+
+	try
+	{
+		// the marker completes after everything enqueued above, its callback releases the pages
+
+		cl::Event e;
+		m_cl.queue[2].enqueueMarker(&e);
+		e.setCallback(CL_COMPLETE, ReleasePageEvent, data);
+	}
+	catch(const cl::Error& err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+
+		// the callback was not registered, so nobody else will undo UsePages;
+		// without this the page counters stay raised and InvalidateVideoMem would wait on them forever
+
+		ReleasePages(data->pages);
+
+		delete data;
+	}
+
+	m_jobs.clear();
+
+	m_vb_count = 0;
+
+	m_cl.vb.head = m_cl.vb.tail;
+	m_cl.ib.head = m_cl.ib.tail;
+	m_cl.pb.head = m_cl.pb.tail;
+
+	m_cl.Map();
+}
+
+// Enqueues one TFX kernel launch per job on the rendering queue.
+// jobs      - the (already joined) jobs of one primitive group, in drawing order
+// bin_count - number of bins of the group, forwarded to the kernel
+// bin_dim   - bin rectangle of the group, forwarded to the kernel
+// After each launch the pages written by the job are flagged through InvalidateTextureCache.
+void GSRendererCL::EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim)
+{
+	cl_kernel last_kernel = NULL;
+
+	uint32 first_prim = 0;
+
+	for(auto it = jobs.begin(); it != jobs.end(); ++it)
+	{
+		TFXJob* job = it->get();
+
+		ASSERT(first_prim < MAX_PRIM_COUNT);
+
+		tfxcount++;
+
+		// never run past the primitives processed for this group
+
+		uint32 prims = std::min(job->prim_count, MAX_PRIM_COUNT - first_prim);
+
+		cl::Kernel& kernel = m_cl.GetTFXKernel(job->sel);
+
+		// sample from the texture cache copy when UpdateTextureCache allows it, from video memory otherwise
+
+		cl::Buffer* tex = &m_cl.vm;
+
+		if(UpdateTextureCache(job))
+		{
+			tex = &m_cl.tex;
+		}
+
+		kernel.setArg(2, sizeof(*tex), tex);
+
+		if(last_kernel != kernel())
+		{
+			// parameter buffer arguments are only set again when the kernel changes
+
+			kernel.setArg(3, sizeof(m_cl.pb.buff[m_cl.wqidx]), &m_cl.pb.buff[m_cl.wqidx]);
+			kernel.setArg(4, (cl_uint)m_pb_start);
+
+			last_kernel = kernel();
+		}
+
+		kernel.setArg(5, (cl_uint)first_prim);
+		kernel.setArg(6, (cl_uint)prims);
+		kernel.setArg(7, (cl_uint)bin_count);
+		kernel.setArg(8, bin_dim);
+		kernel.setArg(9, job->fbp);
+		kernel.setArg(10, job->zbp);
+		kernel.setArg(11, job->bw);
+
+		// one work item per pixel of the job rectangle, rounded out to 8x8 groups
+
+		GSVector4i area = GSVector4i::load<false>(&job->rect).ralign<Align_Outside>(GSVector2i(8, 8));
+
+		m_cl.queue[2].enqueueNDRangeKernel(kernel, cl::NDRange(area.left, area.top), cl::NDRange(area.width(), area.height()), cl::NDRange(8, 8));
+
+		tfxpixels += area.width() * area.height();
+
+		InvalidateTextureCache(job);
+
+		first_prim += prims;
+	}
+}
+
+// Merges neighbouring jobs so that fewer TFX kernel launches are needed.
+// Two adjacent jobs are merged when their selectors (ignoring ababcd, wms, wmt, merged and with noscissor combined)
+// and fbp/zbp/bw/fpsm/zpsm are equal, and the later job does not read pages the earlier one writes.
+// The earlier job is folded into the later one and removed from the list.
+void GSRendererCL::JoinTFX(std::list<shared_ptr<TFXJob>>& jobs)
+{
+	//printf("before\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
+
+	tfxselcount += jobs.size();
+
+	auto cur = jobs.begin();
+
+	while(cur != jobs.end())
+	{
+		auto prev = cur++;
+
+		if(cur == jobs.end())
+		{
+			break;
+		}
+
+		TFXJob& a = **prev;
+		TFXJob& b = **cur;
+
+		// compare the selectors with the fields that may differ masked out
+
+		TFXSelector sel_a = a.sel;
+		TFXSelector sel_b = b.sel;
+
+		sel_a.ababcd = sel_b.ababcd = 0;
+		sel_a.wms = sel_b.wms = 0;
+		sel_a.wmt = sel_b.wmt = 0;
+		sel_a.noscissor = sel_b.noscissor = sel_a.noscissor | sel_b.noscissor;
+		sel_a.merged = sel_b.merged = 0;
+
+		const bool differs =
+			sel_a != sel_b
+			|| a.fbp != b.fbp
+			|| a.zbp != b.zbp
+			|| a.bw != b.bw
+			|| a.fpsm != b.fpsm
+			|| a.zpsm != b.zpsm;
+
+		if(differs)
+		{
+			continue;
+		}
+
+		// the later job must not sample pages the earlier one renders to
+
+		if(a.dst_pages != NULL && b.src_pages != NULL)
+		{
+			int i = 0;
+
+			while(i < 4 && (a.dst_pages[i] & b.src_pages[i]).eq(GSVector4i::zero()))
+			{
+				i++;
+			}
+
+			if(i < 4)
+			{
+				continue;
+			}
+		}
+
+		// fold the earlier job into the later one: pages, rectangle, primitive range and selector
+
+		if(a.src_pages != NULL)
+		{
+			GSVector4i* src = b.GetSrcPages();
+
+			for(int i = 0; i < 4; i++)
+			{
+				src[i] |= a.src_pages[i];
+			}
+		}
+
+		if(a.dst_pages != NULL)
+		{
+			GSVector4i* dst = b.GetDstPages();
+
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] |= a.dst_pages[i];
+			}
+		}
+
+		GSVector4i rect_a = GSVector4i::load<false>(&a.rect);
+		GSVector4i rect_b = GSVector4i::load<false>(&b.rect);
+
+		GSVector4i::store<false>(&b.rect, rect_a.runion(rect_b));
+
+		b.prim_count += a.prim_count;
+		b.ib_start = a.ib_start;
+
+		b.sel = sel_b;
+		b.sel.merged = 1;
+
+		// a is not touched after this point
+
+		jobs.erase(prev);
+
+		//if((*prev)->sel != (*next)->sel) printf("%d %016llx %016llx\n", jobs.size(), (*prev)->sel.key, (*next)->sel.key);
+	}
+
+	tfxdiffselcount += jobs.size();
+
+	//printf("after\n"); for(auto i : jobs) printf("%016llx %05x %05x %d %d %d\n", i->sel.key, i->fbp, i->zbp, i->bw, i->prim_count, i->ib_start);
+}
+
+// Decides whether the job may sample its texture from the texture cache buffer (m_cl.tex)
+// instead of video memory (m_cl.vm), refreshing stale cache pages when that is required.
+//
+// Returns false when the job has no source pages, or when some of its source pages are flagged in
+// m_tc_pages and the job cannot overlap its own render target.
+// Returns true when no source page is flagged, or after the flagged pages were copied from m_cl.vm to
+// m_cl.tex (this happens when source and destination pages overlap and the formats share bits).
+//
+// NOTE(review): the m_tc_pages check only runs when job->dst_pages is set; with dst_pages == NULL the
+// function returns true without looking at m_tc_pages - confirm that such jobs cannot occur.
+bool GSRendererCL::UpdateTextureCache(TFXJob* job)
+{
+	if(job->src_pages == NULL) return false;
+
+	bool overlap = false;
+	bool invalid = false;
+
+	if(job->dst_pages != NULL)
+	{
+		bool can_overlap = (job->sel.fwrite && GSUtil::HasSharedBits(job->tpsm, job->fpsm)) || (job->sel.zwrite && GSUtil::HasSharedBits(job->tpsm, job->zpsm));
+
+		for(int i = 0; i < 4; i++)
+		{
+			if(!(job->src_pages[i] & job->dst_pages[i]).eq(GSVector4i::zero()))
+			{
+				overlap = can_overlap; // gow, re4
+			}
+
+			if(!(m_tc_pages[i] & job->src_pages[i]).eq(GSVector4i::zero()))
+			{
+				invalid = true;
+			}
+		}
+	}
+
+	if(!invalid)
+	{
+		return true; // all needed pages are valid in texture cache, use it
+	}
+
+	if(!overlap)
+	{
+		return false; // no overlap, but has invalid pages, don't use texture cache
+	}
+
+	// overlap && invalid, update and use texture cache
+
+	int count = 0;
+
+	for(int i = 0; i < 4; i++)
+	{
+		GSVector4i pages = m_tc_pages[i] & job->src_pages[i];
+
+		if(pages.eq(GSVector4i::zero())) continue;
+
+		m_tc_pages[i] &= ~job->src_pages[i];
+
+		// copy in the largest aligned runs possible: 32 pages per dword, 8 per byte, else page by page
+
+		for(int j = 0; j < 4; j++)
+		{
+			if(pages.u32[j] == 0) continue;
+
+			if(pages.u32[j] == 0xffffffff)
+			{
+				size_t offset = (i * sizeof(GSVector4i) + j * sizeof(uint32)) * 8 * PAGE_SIZE;
+
+				m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 32);
+
+				// offset is a size_t, it has to be converted before it is printed with %d
+
+				if(LOG) { fprintf(s_fp, "tc (%d x32)\n", (int)(offset >> 13)); fflush(s_fp); }
+
+				pageuploadcount++;
+				count += 32;
+
+				continue;
+			}
+
+			for(int k = 0; k < 4; k++)
+			{
+				uint8 b = pages.u8[j * 4 + k];
+
+				if(b == 0) continue;
+
+				if(b == 0xff)
+				{
+					size_t offset = (i * sizeof(GSVector4i) + (j * 4 + k)) * 8 * PAGE_SIZE;
+
+					m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE * 8);
+
+					if(LOG) { fprintf(s_fp, "tc (%d x8)\n", (int)(offset >> 13)); fflush(s_fp); }
+
+					pageuploadcount++;
+					count += 8;
+
+					continue;
+				}
+
+				for(int l = 0; l < 8; l++)
+				{
+					if(b & (1 << l))
+					{
+						size_t offset = ((i * sizeof(GSVector4i) + (j * 4 + k)) * 8 + l) * PAGE_SIZE;
+
+						m_cl.queue[2].enqueueCopyBuffer(m_cl.vm, m_cl.tex, offset, offset, PAGE_SIZE);
+
+						if(LOG) { fprintf(s_fp, "tc (%d x1)\n", (int)(offset >> 13)); fflush(s_fp); }
+
+						pageuploadcount++;
+						count++;
+					}
+				}
+			}
+		}
+	}
+
+	if(count > 0)
+	{
+		pageuploads += count;
+	}
+
+	return true;
+}
+
+// Flags every page the job writes to in m_tc_pages, jobs without destination pages change nothing.
+void GSRendererCL::InvalidateTextureCache(TFXJob* job)
+{
+	if(job->dst_pages != NULL)
+	{
+		int i = 0;
+
+		do
+		{
+			m_tc_pages[i] |= job->dst_pages[i];
+		}
+		while(++i < 4);
+	}
+}
+
+// Moves the pages recorded in m_rw_pages into the per-page counters of m_rw_pages_rendering.
+//
+// For l = 0 (m_rw_pages[0]) and l = 1 (m_rw_pages[1]) every set bit increments the l-th 16 bit half of the
+// counter of that page, and the page number is appended to p. Each of the two lists is terminated with
+// GSOffset::EOP, so p must have room for two complete lists (see cb_data). m_rw_pages is cleared on the way.
+// ReleasePages undoes the increments using the same lists.
+void GSRendererCL::UsePages(uint32* p)
+{
+	for(int l = 0; l < 2; l++)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			GSVector4i* v = &m_rw_pages[l][i];
+
+			if(v->eq(GSVector4i::zero())) continue;
+
+			for(int j = 0; j < 4; j++)
+			{
+				unsigned long index;
+				unsigned long mask = v->u32[j];
+
+				if(mask == 0) continue;
+
+				// page number of bit 0 of this dword: 128 pages per vector, 32 per dword
+
+				int o = (i << 7) | (j << 5);
+
+				if(mask == 0xffffffff)
+				{
+					// all 32 pages are used, no need to scan for bits
+
+					for(int bit = 0; bit < 32; bit++)
+					{
+						_InterlockedIncrement16((short*)&m_rw_pages_rendering[bit | o] + l);
+
+						*p++ = bit | o;
+					}
+				}
+				else
+				{
+					while(_BitScanForward(&index, mask))
+					{
+						// unsigned shift, 1 << 31 would overflow a signed int
+
+						mask &= ~(1ul << index);
+
+						_InterlockedIncrement16((short*)&m_rw_pages_rendering[index | o] + l);
+
+						*p++ = index | o;
+					}
+				}
+			}
+
+			*v = GSVector4i::zero();
+		}
+
+		*p++ = GSOffset::EOP;
+	}
+}
+
+// Counterpart of UsePages: walks the two EOP terminated lists stored in pages and decrements
+// the first 16 bit half of the page counter for the first list, the second half for the second list.
+void GSRendererCL::ReleasePages(uint32* pages)
+{
+	const uint32* cur = pages;
+
+	for(int half = 0; half < 2; half++)
+	{
+		while(*cur != GSOffset::EOP)
+		{
+			_InterlockedDecrement16((short*)&m_rw_pages_rendering[*cur] + half);
+
+			cur++;
+		}
+
+		cur++; // step over the terminator of this list
+	}
+}
+
+// Event callback registered by Enqueue for CL_COMPLETE on the marker that follows the enqueued kernels.
+// user_data is the cb_data allocated by Enqueue; its pages are released and the block is freed.
+//
+// A callback registered for CL_COMPLETE is also invoked with a negative error code when the command
+// terminated abnormally. The pages are released in that case too, otherwise the payload would leak and
+// the page counters would never return to zero.
+void CL_CALLBACK GSRendererCL::ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data)
+{
+	cb_data* data = (cb_data*)user_data;
+
+	if(data == NULL)
+	{
+		return;
+	}
+
+	// CL_COMPLETE is 0 and errors are negative, other execution states are positive
+
+	if(event_command_exec_status <= CL_COMPLETE)
+	{
+		if(event_command_exec_status != CL_COMPLETE)
+		{
+			printf("ReleasePageEvent: command failed (%d)\n", (int)event_command_exec_status);
+		}
+
+		data->r->ReleasePages(data->pages);
+
+		delete data;
+	}
+}
+
+// Maps a GS pixel storage mode to the compact index (0..12) stored in the selectors.
+// Unknown modes map to 0, the same index as PSM_PSMCT32.
+static int RemapPSM(int psm)
+{
+	switch(psm)
+	{
+	case PSM_PSMCT24: return 1;
+	case PSM_PSMCT16: return 2;
+	case PSM_PSMCT16S: return 3;
+	case PSM_PSMZ32: return 4;
+	case PSM_PSMZ24: return 5;
+	case PSM_PSMZ16: return 6;
+	case PSM_PSMZ16S: return 7;
+	case PSM_PSMT8: return 8;
+	case PSM_PSMT4: return 9;
+	case PSM_PSMT8H: return 10;
+	case PSM_PSMT4HL: return 11;
+	case PSM_PSMT4HH: return 12;
+	case PSM_PSMCT32:
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+bool GSRendererCL::SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count)
+{
+	const GSDrawingEnvironment& env = m_env;
+	const GSDrawingContext* context = m_context;
+	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
+
+	TFXSelector sel;
+
+	sel.key = 0;
+
+	sel.atst = ATST_ALWAYS;
+	sel.tfx = TFX_NONE;
+	sel.ababcd = 0xff;
+	sel.prim = primclass;
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
+
+	if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
+	{
+		fm = 0xffffffff;
+		zm = 0xffffffff;
+	}
+
+	if(PRIM->TME)
+	{
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+	}
+
+	if(context->TEST.ATE)
+	{
+		if(!TryAlphaTest(fm, zm))
+		{
+			sel.atst = context->TEST.ATST;
+			sel.afail = context->TEST.AFAIL;
+			pb->aref = context->TEST.AREF;
+
+			switch(sel.atst)
+			{
+			case ATST_LESS:
+				sel.atst = ATST_LEQUAL;
+				pb->aref--;
+				break;
+			case ATST_GREATER:
+				sel.atst = ATST_GEQUAL;
+				pb->aref++;
+				break;
+			}
+		}
+	}
+
+	bool fwrite;
+	bool zwrite = zm != 0xffffffff;
+	
+	switch(context->FRAME.PSM)
+	{
+	default:
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+		fwrite = fm != 0xffffffff;
+		break;
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		fwrite = (fm & 0x00ffffff) != 0x00ffffff;
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		fwrite = (fm & 0x80f8f8f8) != 0x80f8f8f8;
+		break;
+	}
+
+	if(!fwrite && !zwrite) return false;
+
+	bool ftest = sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
+
+	sel.fwrite = fwrite;
+	sel.ftest = ftest;
+	sel.zwrite = zwrite;
+	sel.ztest = ztest;
+
+	if(fwrite || ftest)
+	{
+		sel.fpsm = RemapPSM(context->FRAME.PSM);
+
+		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
+		{
+			sel.iip = PRIM->IIP;
+		}
+
+		if(PRIM->TME)
+		{
+			sel.tfx = context->TEX0.TFX;
+			sel.tcc = context->TEX0.TCC;
+			sel.fst = PRIM->FST;
+			sel.ltf = m_vt.IsLinear();
+			sel.tpsm = RemapPSM(context->TEX0.PSM);
+			sel.aem = m_env.TEXA.AEM;
+
+			pb->tbp[0] = context->TEX0.TBP0;
+			pb->tbw[0] = context->TEX0.TBW;
+			pb->ta0 = m_env.TEXA.TA0;
+			pb->ta1 = m_env.TEXA.TA1;
+
+			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+			{
+				sel.tlu = 1;
+
+				memcpy(pb->clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
+			}
+
+			sel.wms = ((uint32)context->CLAMP.WMS + 1) & 3;
+			sel.wmt = ((uint32)context->CLAMP.WMT + 1) & 3;
+
+			if(sel.tfx == TFX_MODULATE && sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			{
+				// modulate does not do anything when vertex color is 0x80
+
+				sel.tfx = TFX_DECAL;
+			}
+
+			bool mipmap = IsMipMapActive();
+
+			GIFRegTEX0 TEX0 = m_context->GetSizeFixedTEX0(m_vt.m_min.t.xyxy(m_vt.m_max.t), m_vt.IsLinear(), mipmap);
+
+			GSVector4i r;
+
+			GetTextureMinMax(r, TEX0, context->CLAMP, sel.ltf);
+
+			GSVector4i* src_pages = job->GetSrcPages();
+
+			GSOffset* o = m_mem.GetOffset(context->TEX0.TBP0, context->TEX0.TBW, context->TEX0.PSM);
+			
+			o->GetPagesAsBits(r, m_tmp_pages);
+
+			for(int i = 0; i < 4; i++)
+			{
+				src_pages[i] |= m_tmp_pages[i];
+			}
+
+			if(mipmap)
+			{
+				// TEX1.MMIN
+				// 000 p
+				// 001 l
+				// 010 p round
+				// 011 p tri
+				// 100 l round
+				// 101 l tri
+
+				if(m_vt.m_lod.x > 0)
+				{
+					sel.ltf = context->TEX1.MMIN >> 2;
+				}
+				else
+				{
+					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
+				}
+
+				sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				sel.lcm = context->TEX1.LCM;
+
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
+				int k = context->TEX1.K << 12;
+
+				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
+				{
+					k = (int)m_vt.m_lod.x << 16; // set lod to max level
+
+					sel.lcm = 1; // lod is constant
+					sel.mmin = 1; // tri-linear is meaningless
+				}
+
+				if(sel.mmin == 2)
+				{
+					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
+				}
+
+				if(sel.fst)
+				{
+					ASSERT(sel.lcm == 1);
+					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
+
+					sel.lcm = 1;
+				}
+
+				if(sel.lcm)
+				{
+					int lod = std::max<int>(std::min<int>(k, mxl), 0);
+
+					if(sel.mmin == 1)
+					{
+						lod = (lod + 0x8000) & 0xffff0000; // rounding
+					}
+
+					pb->lod = lod;
+
+					// TODO: lot to optimize when lod is constant
+				}
+				else
+				{
+					pb->mxl = mxl;
+					pb->l = (float)(-0x10000 << context->TEX1.L);
+					pb->k = (float)k;
+				}
+
+				GIFRegTEX0 MIP_TEX0 = TEX0;
+				GIFRegCLAMP MIP_CLAMP = context->CLAMP;
+
+				GSVector4 tmin = m_vt.m_min.t;
+				GSVector4 tmax = m_vt.m_max.t;
+
+				static int s_counter = 0;
+
+				for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++)
+				{
+					switch(i)
+					{
+					case 1:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP1;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW1;
+						break;
+					case 2:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP2;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW2;
+						break;
+					case 3:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP3;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW3;
+						break;
+					case 4:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP4;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW4;
+						break;
+					case 5:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP5;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW5;
+						break;
+					case 6:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP6;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW6;
+						break;
+					default:
+						__assume(0);
+					}
+
+					pb->tbp[i] = MIP_TEX0.TBP0;
+					pb->tbw[i] = MIP_TEX0.TBW;
+
+					if(MIP_TEX0.TW > 0) MIP_TEX0.TW--;
+					if(MIP_TEX0.TH > 0) MIP_TEX0.TH--;
+
+					MIP_CLAMP.MINU >>= 1;
+					MIP_CLAMP.MINV >>= 1;
+					MIP_CLAMP.MAXU >>= 1;
+					MIP_CLAMP.MAXV >>= 1;
+
+					m_vt.m_min.t *= 0.5f;
+					m_vt.m_max.t *= 0.5f;
+
+					GSVector4i r;
+
+					GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, sel.ltf);
+
+					GSOffset* o = m_mem.GetOffset(MIP_TEX0.TBP0, MIP_TEX0.TBW, MIP_TEX0.PSM);
+					
+					o->GetPagesAsBits(r, m_tmp_pages);
+
+					for(int i = 0; i < 4; i++)
+					{
+						src_pages[i] |= m_tmp_pages[i];
+					}
+				}
+
+				s_counter++;
+
+				m_vt.m_min.t = tmin;
+				m_vt.m_max.t = tmax;
+			}
+			else
+			{
+				if(sel.fst == 0)
+				{
+					// skip per pixel division if q is constant
+
+					GSVertexCL* RESTRICT v = vertex;
+
+					if(m_vt.m_eq.q)
+					{
+						sel.fst = 1;
+
+						const GSVector4& t = v[index[0]].t;
+
+						if(t.z != 1.0f)
+						{
+							GSVector4 w = t.zzzz().rcpnr();
+
+							for(int i = 0, j = vertex_count; i < j; i++)
+							{
+								GSVector4 t = v[i].t;
+
+								v[i].t = (t * w).xyzw(t);
+							}
+						}
+					}
+					else if(primclass == GS_SPRITE_CLASS)
+					{
+						sel.fst = 1;
+
+						for(int i = 0, j = vertex_count; i < j; i += 2)
+						{
+							GSVector4 t0 = v[i + 0].t;
+							GSVector4 t1 = v[i + 1].t;
+
+							GSVector4 w = t1.zzzz().rcpnr();
+
+							v[i + 0].t = (t0 * w).xyzw(t0);
+							v[i + 1].t = (t1 * w).xyzw(t1);
+						}
+					}
+				}
+			}
+
+			int tw = 1 << TEX0.TW;
+			int th = 1 << TEX0.TH;
+
+			switch(context->CLAMP.WMS)
+			{
+			case CLAMP_REPEAT:
+				pb->minu = tw - 1;
+				pb->maxu = 0;
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minu = 0;
+				pb->maxu = tw - 1;
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minu = std::min((int)context->CLAMP.MINU, tw - 1);
+				pb->maxu = std::min((int)context->CLAMP.MAXU, tw - 1);
+				//gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minu = (int)context->CLAMP.MINU & (tw - 1);
+				pb->maxu = (int)context->CLAMP.MAXU & (tw - 1);
+				//gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+
+			switch(context->CLAMP.WMT)
+			{
+			case CLAMP_REPEAT:
+				pb->minv = th - 1;
+				pb->maxv = 0;
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				pb->minv = 0;
+				pb->maxv = th - 1;
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				pb->minv = std::min((int)context->CLAMP.MINV, th - 1);
+				pb->maxv = std::min((int)context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
+				//gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				pb->minv = (int)context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127
+				pb->maxv = (int)context->CLAMP.MAXV & (th - 1);
+				//gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+		}
+
+		if(PRIM->FGE)
+		{
+			sel.fge = 1;
+			pb->fog = env.FOGCOL.u32[0];
+		}
+
+		if(context->FRAME.PSM != PSM_PSMCT24)
+		{
+			sel.date = context->TEST.DATE;
+			sel.datm = context->TEST.DATM;
+		}
+
+		if(!IsOpaque())
+		{
+			sel.abe = PRIM->ABE;
+			sel.ababcd = context->ALPHA.u32[0];
+
+			if(env.PABE.PABE)
+			{
+				sel.pabe = 1;
+			}
+
+			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
+			{
+				sel.aa1 = 1;
+			}
+
+			pb->afix = context->ALPHA.FIX;
+		}
+
+		if(sel.date || sel.aba == 1 || sel.abb == 1 || sel.abc == 1 && (sel.fpsm & 3) != 1 || sel.abd == 1)
+		{
+			sel.rfb = 1;
+		}
+		else
+		{
+			if(fwrite)
+			{
+				if(sel.atst != ATST_ALWAYS && sel.afail == AFAIL_RGB_ONLY
+				|| (sel.fpsm & 3) == 0 && fm != 0
+				|| (sel.fpsm & 3) == 1 // always read-merge-write 24bpp, regardless the mask
+				|| (sel.fpsm & 3) >= 2 && (fm & 0x80f8f8f8) != 0)
+				{
+					sel.rfb = 1;
+				}
+			}
+		}
+
+		sel.colclamp = env.COLCLAMP.CLAMP;
+		sel.fba = context->FBA.FBA;
+
+		if(env.DTHE.DTHE)
+		{
+			sel.dthe = 1;
+
+			GSVector4i dimx0 = env.dimx[1].sll32(16).sra32(16);
+			GSVector4i dimx1 = env.dimx[3].sll32(16).sra32(16);
+			GSVector4i dimx2 = env.dimx[5].sll32(16).sra32(16);
+			GSVector4i dimx3 = env.dimx[7].sll32(16).sra32(16);
+
+			pb->dimx = dimx0.ps32(dimx1).ps16(dimx2.ps32(dimx3));
+		}
+	}
+
+	if(zwrite || ztest)
+	{
+		sel.zpsm = RemapPSM(context->ZBUF.PSM);
+		sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+
+		if(ztest)
+		{
+			sel.rzb = 1;
+		}
+		else
+		{
+			if(zwrite)
+			{
+				if(sel.atst != ATST_ALWAYS && (sel.afail == AFAIL_FB_ONLY || sel.afail == AFAIL_RGB_ONLY)
+				|| (sel.zpsm & 3) == 1) // always read-merge-write 24bpp, regardless the mask
+				{
+					sel.rzb = 1;
+				}
+			}
+		}
+	}
+
+	pb->fm = fm;
+	pb->zm = zm;
+
+	if((sel.fpsm & 3) == 1)
+	{
+		pb->fm |= 0xff000000;
+	}
+	else if((sel.fpsm & 3) >= 2)
+	{
+		uint32 rb = pb->fm & 0x00f800f8;
+		uint32 ga = pb->fm & 0x8000f800;
+
+		pb->fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
+	}
+
+	if((sel.zpsm & 3) == 1)
+	{
+		pb->zm |= 0xff000000;
+	}
+	else if((sel.zpsm & 3) >= 2)
+	{
+		pb->zm |= 0xffff0000;
+	}
+
+	pb->fbp = context->FRAME.Block();
+	pb->zbp = context->ZBUF.Block();
+	pb->bw = context->FRAME.FBW;
+
+	pb->sel = sel;
+
+	return true;
+}
+
+//
+
+// A new job owns no page bitmaps; GetSrcPages()/GetDstPages() allocate them on first use.
+GSRendererCL::TFXJob::TFXJob()
+	: src_pages(NULL)
+	, dst_pages(NULL)
+{
+}
+
+// Frees whichever page bitmaps were allocated for this job.
+GSRendererCL::TFXJob::~TFXJob()
+{
+	if(src_pages != NULL)
+	{
+		_aligned_free(src_pages);
+	}
+
+	if(dst_pages != NULL)
+	{
+		_aligned_free(dst_pages);
+	}
+}
+
+// Returns the source-page bitmap (4 x GSVector4i, 16-byte aligned),
+// allocating and zero-filling it on the first call.
+GSVector4i* GSRendererCL::TFXJob::GetSrcPages()
+{
+	if(src_pages != NULL)
+	{
+		return src_pages;
+	}
+
+	src_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
+
+	for(int n = 0; n < 4; n++)
+	{
+		src_pages[n] = GSVector4i::zero();
+	}
+
+	return src_pages;
+}
+
+// Returns the destination-page bitmap (4 x GSVector4i, 16-byte aligned),
+// allocating and zero-filling it on the first call.
+GSVector4i* GSRendererCL::TFXJob::GetDstPages()
+{
+	if(dst_pages != NULL)
+	{
+		return dst_pages;
+	}
+
+	dst_pages = (GSVector4i*)_aligned_malloc(sizeof(GSVector4i) * 4, 16);
+
+	for(int n = 0; n < 4; n++)
+	{
+		dst_pages[n] = GSVector4i::zero();
+	}
+
+	return dst_pages;
+}
+
+//
+
+//#define IOCL_DEBUG
+
+// Selects the OpenCL device and creates the context, the three command
+// queues, the kernel source string and the fixed-size buffers.
+//
+// Device choice: the first enumerated device whose name equals the "ocldev"
+// config value, otherwise the first enumerated device. If no device exists
+// the constructor throws.
+// NOTE(review): the throw below raises a heap-allocated std::exception*
+// (not an exception object) - confirm that callers really catch a pointer.
+GSRendererCL::CL::CL()
+{
+	// start at the maximum so that std::min() narrows both to the chosen device
+	WIs = INT_MAX;
+	version = INT_MAX;
+
+	std::string ocldev = theApp.GetConfig("ocldev", "");
+
+#ifdef IOCL_DEBUG
+	ocldev = "Intel(R) Corporation Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz OpenCL C 1.2 CPU";
+#endif
+
+	list<OCLDeviceDesc> dl;
+
+	GSUtil::GetDeviceDescs(dl);
+
+	// first choice: the configured device
+
+	for(auto it = dl.begin(); it != dl.end(); ++it)
+	{
+		if(it->name == ocldev)
+		{
+			devs.push_back(*it);
+
+			WIs = std::min(WIs, (uint32)it->device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+			version = std::min(version, it->version);
+
+			break; // TODO: multiple devices?
+		}
+	}
+
+	// fallback: whatever was enumerated first
+
+	if(devs.empty() && !dl.empty())
+	{
+		OCLDeviceDesc& first = dl.front();
+
+		devs.push_back(first);
+
+		WIs = std::min(WIs, (uint32)first.device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+		version = std::min(version, first.version);
+	}
+
+	if(devs.empty())
+	{
+		throw new std::exception("OpenCL device not found");
+	}
+
+	vector<cl::Device> tmp;
+
+	for(size_t n = 0; n < devs.size(); n++)
+	{
+		tmp.push_back(devs[n].device);
+	}
+
+	context = cl::Context(tmp);
+
+	for(int n = 0; n < 3; n++)
+	{
+		queue[n] = cl::CommandQueue(context);
+	}
+
+	// kernel source comes from the embedded resource; kernel_str stays empty if loading fails
+
+	vector<unsigned char> buff;
+
+	if(theApp.LoadResource(IDR_TFX_CL, buff))
+	{
+		kernel_str = std::string((const char*)buff.data(), buff.size());
+	}
+
+	// vertex and index buffers start empty and unmapped; only the parameter buffer is sized here
+
+	vb.head = 0; vb.tail = 0; vb.size = 0;
+	ib.head = 0; ib.tail = 0; ib.size = 0;
+	pb.head = 0; pb.tail = 0; pb.size = 0;
+
+	vb.ptr = NULL; vb.mapped_ptr = NULL;
+	ib.ptr = NULL; ib.mapped_ptr = NULL;
+	pb.ptr = NULL; pb.mapped_ptr = NULL;
+
+	pb.size = TFX_PARAM_SIZE * 256;
+
+	for(int n = 0; n < 2; n++)
+	{
+		pb.buff[n] = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, pb.size);
+	}
+
+	env = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(gs_env));
+
+	wqidx = 0;
+	wq = &queue[0];
+}
+
+// Unmaps any buffer that is still mapped before the members are destroyed.
+GSRendererCL::CL::~CL()
+{
+	Unmap();
+}
+
+// Maps the unused tail [head, size) of the current vertex, index and
+// parameter buffers for writing, after dropping any previous mapping.
+//
+// Each ptr is biased by -head, so ptr + absolute_offset addresses the
+// buffer even though only the tail is mapped.
+void GSRendererCL::CL::Map()
+{
+	Unmap();
+
+	cl_map_flags flags = CL_MAP_WRITE;
+
+	if(version >= 120)
+	{
+		flags = CL_MAP_WRITE_INVALIDATE_REGION;
+	}
+
+	if(vb.head < vb.size)
+	{
+		const size_t remaining = vb.size - vb.head;
+
+		vb.mapped_ptr = wq->enqueueMapBuffer(vb.buff[wqidx], CL_TRUE, flags, vb.head, remaining);
+		vb.ptr = (unsigned char*)vb.mapped_ptr - vb.head;
+
+		ASSERT(((size_t)vb.ptr & 15) == 0);
+	}
+
+	if(ib.head < ib.size)
+	{
+		const size_t remaining = ib.size - ib.head;
+
+		ib.mapped_ptr = wq->enqueueMapBuffer(ib.buff[wqidx], CL_TRUE, flags, ib.head, remaining);
+		ib.ptr = (unsigned char*)ib.mapped_ptr - ib.head;
+	}
+
+	if(pb.head < pb.size)
+	{
+		const size_t remaining = pb.size - pb.head;
+
+		pb.mapped_ptr = wq->enqueueMapBuffer(pb.buff[wqidx], CL_TRUE, flags, pb.head, remaining);
+		pb.ptr = (unsigned char*)pb.mapped_ptr - pb.head;
+
+		ASSERT(((size_t)pb.ptr & 15) == 0);
+	}
+}
+
+// Enqueues an unmap on the current queue for every buffer that is mapped,
+// then clears all mapped and biased pointers.
+void GSRendererCL::CL::Unmap()
+{
+	if(vb.mapped_ptr != NULL)
+	{
+		wq->enqueueUnmapMemObject(vb.buff[wqidx], vb.mapped_ptr);
+	}
+
+	if(ib.mapped_ptr != NULL)
+	{
+		wq->enqueueUnmapMemObject(ib.buff[wqidx], ib.mapped_ptr);
+	}
+
+	if(pb.mapped_ptr != NULL)
+	{
+		wq->enqueueUnmapMemObject(pb.buff[wqidx], pb.mapped_ptr);
+	}
+
+	vb.ptr = NULL; vb.mapped_ptr = NULL;
+	ib.ptr = NULL; ib.mapped_ptr = NULL;
+	pb.ptr = NULL; pb.mapped_ptr = NULL;
+}
+
+// Returns the kernel named `entry`, built with the options in `opt` plus the
+// common definitions appended by AddDefs().
+//
+// When version >= 120 a per-device binary cache is tried first: the file
+// <tmppath>/<entry> is read for every device and, if all of them are present
+// and readable, the program is created from those binaries. Any cl::Error on
+// that path is logged and the function falls back to compiling kernel_str
+// from source; the resulting binaries are then written back to the cache
+// (best effort, errors are only logged).
+//
+// Throws cl::Error if the source build or the final kernel creation fails.
+// For CL_BUILD_PROGRAM_FAILURE the build log of every device is printed first.
+cl::Kernel GSRendererCL::CL::Build(const char* entry, ostringstream& opt)
+{
+	// append the common definitions exactly once, even if the cached-binary
+	// attempt fails and the source build has to reuse the same options
+
+	AddDefs(opt);
+
+	const string options = opt.str();
+
+	cl::Program program;
+
+	if(version >= 120)
+	{
+		// owns the bytes of the cached binaries and releases them on every exit
+		// path; reserved up front so the pointers stored in `binaries` stay valid
+
+		vector<vector<char>> blobs;
+
+		blobs.reserve(devs.size());
+
+		cl::Program::Binaries binaries;
+
+		try
+		{
+			for(const auto& d : devs)
+			{
+				string path = d.tmppath + "/" + entry;
+
+				FILE* f = fopen(path.c_str(), "rb");
+
+				if(f == NULL)
+				{
+					break;
+				}
+
+				fseek(f, 0, SEEK_END);
+				long size = ftell(f);
+				fseek(f, 0, SEEK_SET);
+
+				bool loaded = false;
+
+				if(size > 0)
+				{
+					blobs.push_back(vector<char>((size_t)size));
+
+					loaded = fread(blobs.back().data(), (size_t)size, 1, f) == 1;
+				}
+
+				fclose(f);
+
+				if(!loaded)
+				{
+					break; // empty or unreadable cache entry, treat it as a miss
+				}
+
+				binaries.push_back(cl::Program::Binaries::value_type(blobs.back().data(), blobs.back().size()));
+			}
+
+			if(binaries.size() == devs.size())
+			{
+				vector<cl::Device> tmp;
+
+				for(const auto& d : devs) tmp.push_back(d.device);
+
+				program = cl::Program(context, tmp, binaries);
+
+				program.build(options.c_str());
+
+				return cl::Kernel(program, entry);
+			}
+		}
+		catch(const cl::Error& err)
+		{
+			printf("%s (%d)\n", err.what(), err.err());
+		}
+	}
+
+	try
+	{
+		printf("building kernel (%s)\n", entry);
+
+		program = cl::Program(context, kernel_str);
+
+		program.build(options.c_str());
+	}
+	catch(const cl::Error& err)
+	{
+		if(err.err() == CL_BUILD_PROGRAM_FAILURE)
+		{
+			for(const auto& d : devs)
+			{
+				auto s = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(d.device);
+
+				printf("kernel (%s) build error: %s\n", entry, s.c_str());
+			}
+		}
+
+		throw; // rethrow the original exception object
+	}
+
+	if(version >= 120)
+	{
+		try
+		{
+			vector<size_t> sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
+			vector<char*> binaries = program.getInfo<CL_PROGRAM_BINARIES>();
+
+			for(size_t i = 0; i < binaries.size(); i++)
+			{
+				if(i < devs.size() && i < sizes.size())
+				{
+					string path = devs[i].tmppath + "/" + entry;
+
+					FILE* f = fopen(path.c_str(), "wb");
+
+					if(f != NULL)
+					{
+						fwrite(binaries[i], sizes[i], 1, f);
+						fclose(f);
+					}
+				}
+
+				// the binding hands ownership of each binary to the caller
+
+				delete [] binaries[i];
+			}
+		}
+		catch(const cl::Error& err)
+		{
+			printf("%s (%d)\n", err.what(), err.err());
+		}
+	}
+
+	return cl::Kernel(program, entry);
+}
+
+// Appends the build options shared by every kernel to `opt`: the OpenCL C
+// language version (CL1.1 only when version == 110, CL1.2 otherwise) and
+// the renderer's size constants as -D definitions.
+void GSRendererCL::CL::AddDefs(ostringstream& opt)
+{
+	const char* clstd = version == 110 ? "-cl-std=CL1.1 " : "-cl-std=CL1.2 ";
+
+	opt << clstd;
+
+	opt << "-D MAX_FRAME_SIZE=" << MAX_FRAME_SIZE << "u "
+		<< "-D MAX_PRIM_COUNT=" << MAX_PRIM_COUNT << "u "
+		<< "-D MAX_PRIM_PER_BATCH_BITS=" << MAX_PRIM_PER_BATCH_BITS << "u "
+		<< "-D MAX_PRIM_PER_BATCH=" << MAX_PRIM_PER_BATCH << "u "
+		<< "-D MAX_BATCH_COUNT=" << MAX_BATCH_COUNT << "u ";
+
+	opt << "-D BIN_SIZE_BITS=" << BIN_SIZE_BITS << " "
+		<< "-D BIN_SIZE=" << BIN_SIZE << "u "
+		<< "-D MAX_BIN_PER_BATCH=" << MAX_BIN_PER_BATCH << "u "
+		<< "-D MAX_BIN_COUNT=" << MAX_BIN_COUNT << "u ";
+
+	opt << "-D TFX_PARAM_SIZE=" << TFX_PARAM_SIZE << "u ";
+
+#ifdef IOCL_DEBUG
+	opt << "-g -s \"E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\tfx.cl\" ";
+#endif
+}
+
+// Returns the primitive-setup kernel for `sel`, building and caching it in
+// prim_map on first use. The kernel entry point is named "prim_<key in hex>"
+// and argument 0 is bound to the env buffer.
+cl::Kernel& GSRendererCL::CL::GetPrimKernel(const PrimSelector& sel)
+{
+	auto i = prim_map.find(sel);
+
+	if(i != prim_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	// format the integer key; a union object must not be passed through
+	// printf's variadic arguments for a %x conversion
+
+	sprintf(entry, "prim_%02x", (uint32)sel.key);
+
+	ostringstream opt;
+
+	opt << "-D KERNEL_PRIM=" << entry << " ";
+	opt << "-D PRIM=" << sel.prim << " ";
+
+	cl::Kernel k = Build(entry, opt);
+
+	// single lookup: store the kernel and keep the reference to return
+
+	cl::Kernel& cached = prim_map[sel];
+
+	cached = k;
+
+	k.setArg(0, env);
+
+	return cached;
+}
+
+// Returns the tile kernel for `sel`, building and caching it in tile_map on
+// first use. The kernel entry point is named "tile_<key in hex>" and
+// argument 0 is bound to the env buffer.
+cl::Kernel& GSRendererCL::CL::GetTileKernel(const TileSelector& sel)
+{
+	auto i = tile_map.find(sel);
+
+	if(i != tile_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	// format the integer key; a union object must not be passed through
+	// printf's variadic arguments for a %x conversion
+
+	sprintf(entry, "tile_%02x", (uint32)sel.key);
+
+	ostringstream opt;
+
+	opt << "-D KERNEL_TILE=" << entry << " ";
+	opt << "-D PRIM=" << sel.prim << " ";
+	opt << "-D MODE=" << sel.mode << " ";
+	opt << "-D CLEAR=" << sel.clear << " ";
+
+	cl::Kernel k = Build(entry, opt);
+
+	// single lookup: store the kernel and keep the reference to return
+
+	cl::Kernel& cached = tile_map[sel];
+
+	cached = k;
+
+	k.setArg(0, env);
+
+	return cached;
+}
+
+// Returns the texture/effects kernel for `sel`, building and caching it in
+// tfx_map on first use. Every selector field is passed to the kernel source
+// as a -D definition, the entry point is named "tfx_<64-bit key in hex>",
+// argument 0 is bound to the env buffer and argument 1 to the vm buffer.
+cl::Kernel& GSRendererCL::CL::GetTFXKernel(const TFXSelector& sel)
+{
+	auto i = tfx_map.find(sel);
+
+	if(i != tfx_map.end())
+	{
+		return i->second;
+	}
+
+	char entry[256];
+
+	// format the integer key with the exact type %llx expects; a union object
+	// must not be passed through printf's variadic arguments
+
+	sprintf(entry, "tfx_%016llx", (unsigned long long)sel.key);
+
+	ostringstream opt;
+
+	opt << "-D KERNEL_TFX=" << entry << " ";
+	opt << "-D FPSM=" << sel.fpsm << " ";
+	opt << "-D ZPSM=" << sel.zpsm << " ";
+	opt << "-D ZTST=" << sel.ztst << " ";
+	opt << "-D ATST=" << sel.atst << " ";
+	opt << "-D AFAIL=" << sel.afail << " ";
+	opt << "-D IIP=" << sel.iip << " ";
+	opt << "-D TFX=" << sel.tfx << " ";
+	opt << "-D TCC=" << sel.tcc << " ";
+	opt << "-D FST=" << sel.fst << " ";
+	opt << "-D LTF=" << sel.ltf << " ";
+	opt << "-D TLU=" << sel.tlu << " ";
+	opt << "-D FGE=" << sel.fge << " ";
+	opt << "-D DATE=" << sel.date << " ";
+	opt << "-D ABE=" << sel.abe << " ";
+	opt << "-D ABA=" << sel.aba << " ";
+	opt << "-D ABB=" << sel.abb << " ";
+	opt << "-D ABC=" << sel.abc << " ";
+	opt << "-D ABD=" << sel.abd << " ";
+	opt << "-D PABE=" << sel.pabe << " ";
+	opt << "-D AA1=" << sel.aa1 << " ";
+	opt << "-D FWRITE=" << sel.fwrite << " ";
+	opt << "-D FTEST=" << sel.ftest << " ";
+	opt << "-D RFB=" << sel.rfb << " ";
+	opt << "-D ZWRITE=" << sel.zwrite << " ";
+	opt << "-D ZTEST=" << sel.ztest << " ";
+	opt << "-D RZB=" << sel.rzb << " ";
+	opt << "-D WMS=" << sel.wms << " ";
+	opt << "-D WMT=" << sel.wmt << " ";
+	opt << "-D DATM=" << sel.datm << " ";
+	opt << "-D COLCLAMP=" << sel.colclamp << " ";
+	opt << "-D FBA=" << sel.fba << " ";
+	opt << "-D DTHE=" << sel.dthe << " ";
+	opt << "-D PRIM=" << sel.prim << " ";
+	opt << "-D LCM=" << sel.lcm << " ";
+	opt << "-D MMIN=" << sel.mmin << " ";
+	opt << "-D NOSCISSOR=" << sel.noscissor << " ";
+	opt << "-D TPSM=" << sel.tpsm << " ";
+	opt << "-D AEM=" << sel.aem << " ";
+	opt << "-D FB=" << sel.fb << " ";
+	opt << "-D ZB=" << sel.zb << " ";
+	opt << "-D MERGED=" << sel.merged << " ";
+
+	cl::Kernel k = Build(entry, opt);
+
+	// single lookup: store the kernel and keep the reference to return
+
+	cl::Kernel& cached = tfx_map[sel];
+
+	cached = k;
+
+	k.setArg(0, env);
+	k.setArg(1, vm);
+
+	return cached;
+}
+#endif
diff --git a/plugins/GSdx_legacy/GSRendererCL.h b/plugins/GSdx_legacy/GSRendererCL.h
new file mode 100644
index 0000000000..81ec47ba49
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererCL.h
@@ -0,0 +1,268 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+
+#ifdef ENABLE_OPENCL
+
+// Vertex record used by the OpenCL renderer, declared through the
+// __aligned(struct, 32) macro. Holds two GSVector4 values, p and t
+// (presumably position and texture coordinates - TODO confirm).
+__aligned(struct, 32) GSVertexCL
+{
+	GSVector4 p, t;
+};
+
+// GS renderer that executes draws through OpenCL kernels. Only compiled when
+// ENABLE_OPENCL is defined.
+class GSRendererCL : public GSRenderer
+{
+	typedef void (GSRendererCL::*ConvertVertexBufferPtr)(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	// table of ConvertVertexBuffer instantiations
+	// (presumably indexed [primclass][tme][fst], matching the template parameters - TODO confirm)
+	ConvertVertexBufferPtr m_cvb[4][2][2];
+
+	template<uint32 primclass, uint32 tme, uint32 fst>
+	void ConvertVertexBuffer(GSVertexCL* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	// Selector for the primitive-setup kernel; key is the kernel cache key (see CL::GetPrimKernel).
+	union PrimSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	// Selector for the tile kernel; key is the kernel cache key (see CL::GetTileKernel).
+	union TileSelector
+	{
+		struct
+		{
+			uint32 prim:2; // 0
+			uint32 mode:2; // 2
+			uint32 clear:1; // 4
+		};
+
+		uint32 key;
+
+		operator uint32() const { return key; }
+	};
+
+	// Selector for the texture/effects kernel. Each field becomes a -D definition
+	// when the kernel is built and the 64-bit key is the kernel cache key
+	// (see CL::GetTFXKernel). The trailing number on each field is its bit offset.
+	union TFXSelector
+	{
+		struct
+		{
+			uint32 fpsm:3; // 0
+			uint32 zpsm:3; // 3
+			uint32 ztst:2; // 6 (0: off, 1: write, 2: test (ge), 3: test (g))
+			uint32 atst:3; // 8
+			uint32 afail:2; // 11
+			uint32 iip:1; // 13
+			uint32 tfx:3; // 14
+			uint32 tcc:1; // 17
+			uint32 fst:1; // 18
+			uint32 ltf:1; // 19
+			uint32 tlu:1; // 20
+			uint32 fge:1; // 21
+			uint32 date:1; // 22
+			uint32 abe:1; // 23
+			uint32 aba:2; // 24
+			uint32 abb:2; // 26
+			uint32 abc:2; // 28
+			uint32 abd:2; // 30
+
+			uint32 pabe:1; // 32
+			uint32 aa1:1; // 33
+			uint32 fwrite:1; // 34
+			uint32 ftest:1; // 35
+			uint32 rfb:1; // 36
+			uint32 zwrite:1; // 37
+			uint32 ztest:1; // 38
+			uint32 rzb:1; // 39
+			uint32 wms:2; // 40
+			uint32 wmt:2; // 42
+			uint32 datm:1; // 44
+			uint32 colclamp:1; // 45
+			uint32 fba:1; // 46
+			uint32 dthe:1; // 47
+			uint32 prim:2; // 48
+			uint32 lcm:1; // 50
+			uint32 mmin:2; // 51
+			uint32 noscissor:1; // 53
+			uint32 tpsm:4; // 54
+			uint32 aem:1; // 58
+			uint32 merged:1; // 59
+			// TODO
+		};
+
+		// overlay of the struct above, going by the bit offsets noted there:
+		// ababcd covers aba..abd (24-31), fb covers fwrite+ftest (34-35),
+		// zb covers zwrite+ztest (37-38)
+		struct
+		{
+			uint32 _pad1:24;
+			uint32 ababcd:8;
+			uint32 _pad2:2;
+			uint32 fb:2;
+			uint32 _pad3:1;
+			uint32 zb:2;
+		};
+
+		struct
+		{
+			uint32 lo;
+			uint32 hi;
+		};
+
+		uint64 key;
+
+		operator uint64() const { return key; }
+
+		// true for an untextured, unblended, flat-shaded sprite with no
+		// destination alpha test or fog, and with ztst and atst both <= 1
+		bool IsSolidRect() const
+		{
+			return prim == GS_SPRITE_CLASS
+				&& iip == 0
+				&& tfx == TFX_NONE
+				&& abe == 0
+				&& ztst <= 1
+				&& atst <= 1
+				&& date == 0
+				&& fge == 0;
+		}
+	};
+
+	// Per-draw parameter block filled in by SetupParameter().
+	__aligned(struct, 32) TFXParameter
+	{
+		GSVector4i scissor;
+		GSVector4i dimx; // 4x4 signed char
+		TFXSelector sel;
+		uint32 fbp, zbp, bw;
+		uint32 fm, zm;
+		uint32 fog; // rgb
+		uint8 aref, afix;
+		uint8 ta0, ta1;
+		uint32 tbp[7], tbw[7];
+		int minu, maxu, minv, maxv; // umsk, ufix, vmsk, vfix
+		int lod; // lcm == 1
+		int mxl;
+		float l; // TEX1.L * -0x10000
+		float k; // TEX1.K * 0x10000
+		uint32 clut[256];
+	};
+
+	// One queued draw. The page bitmaps are allocated lazily by
+	// GetSrcPages()/GetDstPages() and freed by the destructor.
+	class TFXJob
+	{
+	public:
+		struct { int x, y, z, w; } rect;
+		TFXSelector sel;
+		uint32 ib_start;
+		uint32 prim_count;
+		GSVector4i* src_pages; // read by any texture level
+		GSVector4i* dst_pages; // f/z writes to it
+		uint32 fbp, zbp, bw;
+		uint32 fpsm, zpsm, tpsm;
+#ifdef DEBUG
+		TFXParameter* pb;
+#endif
+		TFXJob();
+		virtual ~TFXJob();
+
+		GSVector4i* GetSrcPages();
+		GSVector4i* GetDstPages();
+	};
+
+	// Owns the OpenCL context, command queues, buffers and the caches of
+	// compiled kernels keyed by selector.
+	class CL
+	{
+		std::string kernel_str;
+		std::map<uint32, cl::Kernel> prim_map;
+		std::map<uint32, cl::Kernel> tile_map;
+		std::map<uint64, cl::Kernel> tfx_map;
+
+		cl::Kernel Build(const char* entry, ostringstream& opt);
+		void AddDefs(ostringstream& opt);
+
+	public:
+		std::vector<OCLDeviceDesc> devs;
+		cl::Context context;
+		cl::CommandQueue queue[3];
+		cl::Buffer vm;
+		cl::Buffer tex;
+		// vertex, index and parameter buffers; ptr is mapped_ptr biased by -head (see Map())
+		struct { cl::Buffer buff[2]; size_t head, tail, size; unsigned char* ptr; void* mapped_ptr; } vb, ib, pb;
+		cl::Buffer env;
+		cl::CommandQueue* wq;
+		int wqidx;
+		uint32 WIs;
+		int version;
+
+	public:
+		CL();
+		virtual ~CL();
+
+		cl::Kernel& GetPrimKernel(const PrimSelector& sel);
+		cl::Kernel& GetTileKernel(const TileSelector& sel);
+		cl::Kernel& GetTFXKernel(const TFXSelector& sel);
+
+		void Map();
+		void Unmap();
+	};
+
+	CL m_cl;
+	std::list<shared_ptr<TFXJob>> m_jobs;
+	uint32 m_vb_start;
+	uint32 m_vb_count;
+	uint32 m_pb_start;
+	uint32 m_pb_count;
+	bool m_synced;
+
+	void Enqueue();
+	void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
+	void JoinTFX(std::list<shared_ptr<TFXJob>>& jobs);
+	bool UpdateTextureCache(TFXJob* job);
+	void InvalidateTextureCache(TFXJob* job);
+	void UsePages(uint32* pages);
+	void ReleasePages(uint32* pages);
+
+	static void CL_CALLBACK ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data);
+
+protected:
+	GSTexture* m_texture[2];
+	uint8* m_output;
+	
+	GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
+	GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) // TODO: this should be block level, too many overlaps inside pages with render targets
+	GSVector4i m_tmp_pages[4];
+	uint32 m_tmp_pages2[MAX_PAGES + 1];
+	uint32 m_rw_pages_rendering[512]; // pages that are currently in-use
+
+	void Reset();
+	void VSync(int field);
+	void ResetDevice();
+	GSTexture* GetOutput(int i);
+
+	void Draw();
+	void Sync(int reason);
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
+
+	// fills *pb and job from the current drawing state; returns true on success
+	bool SetupParameter(TFXJob* job, TFXParameter* pb, GSVertexCL* vertex, size_t vertex_count, const uint32* index, size_t index_count);
+
+public:
+	GSRendererCL();
+	virtual ~GSRendererCL();
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/GSRendererCS.cpp b/plugins/GSdx_legacy/GSRendererCS.cpp
new file mode 100644
index 0000000000..86b1e23b07
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererCS.cpp
@@ -0,0 +1,877 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererCS.h"
+
+#define PS_BATCH_SIZE 512
+
+// Sets up the CPU-side state: native resolution, m_vm_valid and m_texture
+// zeroed, and a 32-byte aligned scratch buffer of 1024 * 1024 uint32 values
+// (m_output).
+GSRendererCS::GSRendererCS()
+	: GSRenderer()
+{
+	m_nativeres = true;
+
+	memset(m_vm_valid, 0, sizeof(m_vm_valid));
+	memset(m_texture, 0, sizeof(m_texture));
+
+	const size_t output_bytes = 1024 * 1024 * sizeof(uint32);
+
+	m_output = (uint8*)_aligned_malloc(output_bytes, 32);
+}
+
+// Destroys the output textures and releases the scratch buffer.
+GSRendererCS::~GSRendererCS()
+{
+	for(GSTexture* tex : m_texture)
+	{
+		delete tex;
+	}
+
+	_aligned_free(m_output);
+}
+
+// Creates every D3D11 object this renderer needs: empty depth-stencil and
+// blend states, a point sampler, the link buffer, the start offset buffer,
+// the 4MB video memory buffer, a one-page staging buffer, the VS/PS constant
+// buffers and the "ps_main0" pixel shader.
+//
+// Returns false if the base class fails, the device is below feature level
+// 11_0, the shader resource cannot be loaded or compiled, or any object
+// creation fails.
+bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
+{
+	if(!__super::CreateDevice(dev_unk))
+		return false;
+
+	HRESULT hr;
+
+	D3D11_DEPTH_STENCIL_DESC dsd;
+	D3D11_BLEND_DESC bsd;
+	D3D11_SAMPLER_DESC sd;
+	D3D11_BUFFER_DESC bd;
+	D3D11_TEXTURE2D_DESC td;
+	D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
+	D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
+
+	D3D_FEATURE_LEVEL level;
+
+	((GSDeviceDX*)dev_unk)->GetFeatureLevel(level);
+
+	if(level < D3D_FEATURE_LEVEL_11_0)
+		return false;
+
+	GSDevice11* dev = (GSDevice11*)dev_unk;
+
+	ID3D11DeviceContext* ctx = *dev;
+
+	// empty depth stencil state
+
+	memset(&dsd, 0, sizeof(dsd));
+
+	dsd.StencilEnable = false;
+	dsd.DepthEnable = false;
+
+	hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss);
+
+	if(FAILED(hr)) return false;
+
+	// empty blend state
+
+	memset(&bsd, 0, sizeof(bsd));
+
+	bsd.RenderTarget[0].BlendEnable = false;
+
+	hr = (*dev)->CreateBlendState(&bsd, &m_bs);
+
+	if(FAILED(hr)) return false;
+
+	// point sampler
+
+	memset(&sd, 0, sizeof(sd));
+
+	sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
+	sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.MinLOD = -FLT_MAX;
+	sd.MaxLOD = FLT_MAX;
+	sd.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+	sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
+
+	hr = (*dev)->CreateSamplerState(&sd, &m_ss);
+
+	if(FAILED(hr)) return false;
+
+	// link buffer
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = 256 << 20; // 256 MB w00t
+	bd.StructureByteStride = sizeof(uint32) * 4; // c, z, id, next
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
+	bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_lb);
+
+	// bail out before touching m_lb: it is not a valid buffer if creation failed
+
+	if(FAILED(hr)) return false;
+
+	{
+		// initialize the first element of the link buffer
+
+		uint32 data[] = {0, 0, 0xffffffff, 0};
+
+		D3D11_BOX box;
+		memset(&box, 0, sizeof(box));
+		box.right = sizeof(data);
+		box.bottom = 1;
+		box.back = 1;
+
+		ctx->UpdateSubresource(m_lb, 0, &box, data, 0, 0);
+	}
+
+	memset(&uavd, 0, sizeof(uavd));
+
+	uavd.Format = DXGI_FORMAT_UNKNOWN;
+	uavd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride;
+	uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
+	uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+
+	hr = (*dev)->CreateUnorderedAccessView(m_lb, &uavd, &m_lb_uav);
+
+	if(FAILED(hr)) return false;
+
+	memset(&srvd, 0, sizeof(srvd));
+
+	srvd.Format = DXGI_FORMAT_UNKNOWN;
+	srvd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride;
+	srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
+
+	hr = (*dev)->CreateShaderResourceView(m_lb, &srvd, &m_lb_srv);
+
+	if(FAILED(hr)) return false;
+
+	// start offset buffer
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(uint32) * 2048 * 2048; // index
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
+	bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob);
+
+	if(FAILED(hr)) return false;
+
+	memset(&uavd, 0, sizeof(uavd));
+
+	uavd.Format = DXGI_FORMAT_R32_TYPELESS;
+	uavd.Buffer.NumElements = bd.ByteWidth / sizeof(uint32);
+	uavd.Buffer.Flags =  D3D11_BUFFER_UAV_FLAG_RAW;
+	uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+
+	hr = (*dev)->CreateUnorderedAccessView(m_sob, &uavd, &m_sob_uav);
+
+	if(FAILED(hr)) return false;
+
+	memset(&srvd, 0, sizeof(srvd));
+
+	srvd.Format = DXGI_FORMAT_R32_TYPELESS;
+	srvd.BufferEx.NumElements = bd.ByteWidth / sizeof(uint32);
+	srvd.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
+	srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX;
+
+	hr = (*dev)->CreateShaderResourceView(m_sob, &srvd, &m_sob_srv);
+
+	if(FAILED(hr)) return false;
+
+	const uint32 tmp = 0;
+
+	ctx->ClearUnorderedAccessViewUint(m_sob_uav, &tmp); // initial clear, next time Draw should restore it in Step 2
+
+	// video memory (4MB)
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = 4 * 1024 * 1024;
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
+	bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_vm);
+
+	if(FAILED(hr)) return false;
+
+	memset(&uavd, 0, sizeof(uavd));
+
+	uavd.Format = DXGI_FORMAT_R32_TYPELESS;
+	uavd.Buffer.FirstElement = 0;
+	uavd.Buffer.NumElements = 1024 * 1024;
+	uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
+	uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+
+	hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
+
+	if(FAILED(hr)) return false;
+/*
+	memset(&td, 0, sizeof(td));
+
+	td.Width = PAGE_SIZE;
+	td.Height = MAX_PAGES;
+	td.Format = DXGI_FORMAT_R8_UINT;
+	td.MipLevels = 1;
+	td.ArraySize = 1;
+	td.SampleDesc.Count = 1;
+	td.SampleDesc.Quality = 0;
+	td.Usage = D3D11_USAGE_DEFAULT;
+	td.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
+
+	hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm);
+
+	if(FAILED(hr)) return false;
+
+	memset(&uavd, 0, sizeof(uavd));
+
+	uavd.Format = DXGI_FORMAT_R8_UINT;
+	uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
+
+	hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
+
+	if(FAILED(hr)) return false;
+*/
+	// one page, for copying between cpu<->gpu
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = PAGE_SIZE;
+	bd.Usage = D3D11_USAGE_STAGING;
+	bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb);
+
+	if(FAILED(hr)) return false;
+/*
+	memset(&td, 0, sizeof(td));
+
+	td.Width = PAGE_SIZE;
+	td.Height = 1;
+	td.Format = DXGI_FORMAT_R8_UINT;
+	td.MipLevels = 1;
+	td.ArraySize = 1;
+	td.SampleDesc.Count = 1;
+	td.SampleDesc.Quality = 0;
+	td.Usage = D3D11_USAGE_STAGING;
+	td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
+
+	hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb);
+
+	if(FAILED(hr)) return false;
+*/
+	// VSConstantBuffer
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(VSConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb);
+
+	if(FAILED(hr)) return false;
+
+	// PS
+
+	D3D_SHADER_MACRO macro[] =
+	{
+		{NULL, NULL},
+	};
+
+	try
+	{
+		vector<unsigned char> shader;
+
+		// without the shader source there is nothing to compile
+
+		if(!theApp.LoadResource(IDR_CS_FX, shader))
+			return false;
+
+		dev->CompileShader((const char *)shader.data(), shader.size(), "cs.fx", nullptr, "ps_main0", macro, &m_ps0);
+	}
+	catch (GSDXRecoverableError)
+	{
+		return false;
+	}
+
+	// PSConstantBuffer
+
+	memset(&bd, 0, sizeof(bd));
+
+	bd.ByteWidth = sizeof(PSConstantBuffer);
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb);
+
+	if(FAILED(hr)) return false;
+
+	//
+
+	return true;
+}
+
+// Destroys the output textures and clears their slots; GetOutput() passes
+// the slot to ResizeTexture() again the next time it is called.
+void GSRendererCS::ResetDevice()
+{
+	for(GSTexture*& tex : m_texture)
+	{
+		delete tex;
+
+		tex = NULL;
+	}
+}
+
+// Forwards the vsync notification to the base renderer; nothing
+// renderer-specific is done here.
+void GSRendererCS::VSync(int field)
+{
+	__super::VSync(field);
+
+	//printf("%lld\n", m_perfmon.GetFrame());
+}
+
+// Returns the output texture of display circuit `i`.
+//
+// The texture is sized to FBW * 64 by the bottom of the frame rect. When
+// ResizeTexture() succeeds, the frame is read back through Read(), converted
+// into m_output by the format's rtx reader and uploaded into the texture;
+// when it fails, the current m_texture[i] is returned untouched.
+GSTexture* GSRendererCS::GetOutput(int i)
+{
+	// TODO: create a compute shader which unswizzles the frame from m_vm to the output texture
+
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	const int w = DISPFB.FBW * 64;
+	const int h = GetFrameRect(i).bottom;
+
+	// TODO: round up bottom
+
+	if(!m_dev->ResizeTexture(&m_texture[i], w, h))
+	{
+		return m_texture[i];
+	}
+
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
+
+	// r is the visible rect, r2 the same rect grown outwards to block boundaries
+
+	GSVector4i r(0, 0, w, h);
+	GSVector4i r2 = r.ralign<Align_Outside>(psm.bs);
+
+	GSOffset* off = m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM);
+
+	Read(off, r2, false);
+
+	// m_output is used with a pitch of 1024 * 4 bytes
+
+	(m_mem.*psm.rtx)(off, r2, m_output, 1024 * 4, m_env.TEXA);
+
+	m_texture[i]->Update(r, m_output, 1024 * 4);
+
+	if(s_dump)
+	{
+		const bool save = s_save && s_n >= s_saven;
+
+		if(save)
+		{
+			m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM));
+		}
+
+		s_n++;
+	}
+
+	return m_texture[i];
+}
+
+// Draws the currently queued vertices/indices through the D3D11 device.
+//
+// Steps, in order:
+//  1. Upload the frame/Z buffer pages touched by the draw to the GPU copy of video memory (Write),
+//     skipping a buffer whose write mask is all ones.
+//  2. Bind the m_vm/m_lb/m_sob UAVs as outputs and set up the input assembler, appending a
+//     two-vertex primitive spanning the bounding box (grown by one pixel and clipped to the
+//     scissor) right after the queued vertices/indices.
+//  3. Look up the vertex, geometry and pixel shaders in their caches, compiling them from the
+//     IDR_CS_FX resource on a miss.
+//  4. Issue the draws in batches: every batch first draws its primitives with gs[0]/ps[0] and
+//     then the bounding box primitive with gs[1]/ps[1].
+//
+// If the row/column offset buffers cannot be created the scene is closed and nothing is drawn.
+void GSRendererCS::Draw()
+{
+	GSDrawingEnvironment& env = m_env;
+	GSDrawingContext* context = m_context;
+
+	GSVector2i rtsize(2048, 2048);
+	GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
+	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+	GSVector4i r = bbox.rintersect(scissor);
+
+	// A mask of 0xffffffff means the buffer is not written by this draw
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = (context->ZBUF.ZMSK || context->TEST.ZTE == 0) ? 0xffffffff : 0;
+
+	if(fm != 0xffffffff)
+	{
+		Write(context->offset.fb, r);
+
+		// TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false);
+	}
+
+	if(zm != 0xffffffff)
+	{
+		Write(context->offset.zb, r);
+
+		// TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false);
+	}
+
+	// TODO: if(24-bit) fm/zm |= 0xff000000;
+
+	if(PRIM->TME)
+	{
+		m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+
+		// Texture rectangle; computed but not consumed yet (see the TODOs below)
+
+		GSVector4i r;
+
+		GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
+
+		// TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu
+
+		// TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also
+
+		// TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
+
+		// if(!tex) return;
+	}
+
+	//
+
+	GSDevice11* dev = (GSDevice11*)m_dev;
+
+	ID3D11DeviceContext* ctx = *dev;
+
+	//
+
+	dev->BeginScene();
+
+	// SetupOM
+
+	dev->OMSetDepthStencilState(m_dss, 0);
+	dev->OMSetBlendState(m_bs, 0);
+
+	ID3D11UnorderedAccessView* uavs[] = {m_vm_uav, m_lb_uav, m_sob_uav};
+	uint32 counters[] = {1, 0, 0};
+
+	dev->OMSetRenderTargets(rtsize, countof(uavs), uavs, counters, &scissor);
+
+	// SetupIA
+
+	D3D11_PRIMITIVE_TOPOLOGY topology;
+
+	switch(m_vt.m_primclass)
+	{
+	case GS_POINT_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
+		break;
+	case GS_LINE_CLASS:
+	case GS_SPRITE_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
+		break;
+	case GS_TRIANGLE_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
+		break;
+	default:
+		__assume(0);
+	}
+
+	// Append the two corners of the (grown, scissored) bounding box behind the queued vertices;
+	// m_vertex.next and m_index.tail themselves are left unchanged
+
+	GSVector4i r2 = bbox.add32(GSVector4i(-1, -1, 1, 1)).rintersect(scissor);
+
+	m_vertex.buff[m_vertex.next + 0].XYZ.X = (uint16)(context->XYOFFSET.OFX + (r2.left << 4));
+	m_vertex.buff[m_vertex.next + 0].XYZ.Y = (uint16)(context->XYOFFSET.OFY + (r2.top << 4));
+	m_vertex.buff[m_vertex.next + 1].XYZ.X = (uint16)(context->XYOFFSET.OFX + (r2.right << 4));
+	m_vertex.buff[m_vertex.next + 1].XYZ.Y = (uint16)(context->XYOFFSET.OFY + (r2.bottom << 4));
+
+	m_index.buff[m_index.tail + 0] = m_vertex.next + 0;
+	m_index.buff[m_index.tail + 1] = m_vertex.next + 1;
+
+	dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertex), m_vertex.next + 2);
+	dev->IASetIndexBuffer(m_index.buff, m_index.tail + 2);
+
+	// SetupVS
+
+	VSSelector vs_sel;
+
+	vs_sel.tme = PRIM->TME;
+	vs_sel.fst = PRIM->FST;
+
+	VSConstantBuffer vs_cb;
+
+	float sx = 2.0f / (rtsize.x << 4);
+	float sy = 2.0f / (rtsize.y << 4);
+	//float sx = 1.0f / 16;
+	//float sy = 1.0f / 16;
+	float ox = (float)(int)context->XYOFFSET.OFX;
+	float oy = (float)(int)context->XYOFFSET.OFY;
+
+	vs_cb.VertexScale  = GSVector4(sx, -sy, 0.0f, 0.0f);
+	vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f);
+	//vs_cb.VertexScale  = GSVector4(sx, sy, 0.0f, 0.0f);
+	//vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f);
+
+	{
+		GSVertexShader11 vs;
+
+		hash_map<uint32, GSVertexShader11>::const_iterator i = m_vs.find(vs_sel);
+
+		if(i != m_vs.end())
+		{
+			vs = i->second;
+		}
+		else
+		{
+			// Cache miss: compile vs_main with the selector bits passed as macros
+
+			string str[2];
+
+			str[0] = format("%d", vs_sel.tme);
+			str[1] = format("%d", vs_sel.fst);
+
+			D3D_SHADER_MACRO macro[] =
+			{
+				{"VS_TME", str[0].c_str()},
+				{"VS_FST", str[1].c_str()},
+				{NULL, NULL},
+			};
+
+			D3D11_INPUT_ELEMENT_DESC layout[] =
+			{
+				{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
+				{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			};
+
+			vector<unsigned char> shader;
+			theApp.LoadResource(IDR_CS_FX, shader);
+			dev->CompileShader((const char *)shader.data(), shader.size(), "cs.fx", nullptr, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il);
+
+			m_vs[vs_sel] = vs;
+		}
+
+		ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed
+
+		dev->VSSetShader(vs.vs, m_vs_cb);
+
+		dev->IASetInputLayout(vs.il);
+	}
+
+	// SetupGS
+
+	GSSelector gs_sel;
+
+	gs_sel.iip = PRIM->IIP;
+
+	// gs[0] handles the primitive class being drawn, gs[1] is always the sprite variant
+
+	CComPtr<ID3D11GeometryShader> gs[2];
+
+	for(int j = 0; j < 2; j++)
+	{
+		gs_sel.prim = j == 0 ? m_vt.m_primclass : GS_SPRITE_CLASS;
+
+		hash_map<uint32, CComPtr<ID3D11GeometryShader> >::const_iterator i = m_gs.find(gs_sel);
+
+		if(i != m_gs.end())
+		{
+			gs[j] = i->second;
+		}
+		else
+		{
+			string str[2];
+
+			str[0] = format("%d", gs_sel.iip);
+			str[1] = format("%d", j == 0 ? gs_sel.prim : GS_SPRITE_CLASS);
+
+			D3D_SHADER_MACRO macro[] =
+			{
+				{"GS_IIP", str[0].c_str()},
+				{"GS_PRIM", str[1].c_str()},
+				{NULL, NULL},
+			};
+
+			vector<unsigned char> shader;
+			theApp.LoadResource(IDR_CS_FX, shader);
+			dev->CompileShader((const char *)shader.data(), shader.size(), "cs.fx", nullptr, "gs_main", macro, &gs[j]);
+
+			m_gs[gs_sel] = gs[j];
+		}
+	}
+
+	// SetupPS
+
+	dev->PSSetSamplerState(m_ss, NULL, NULL);
+
+	PSSelector ps_sel;
+
+	ps_sel.fpsm = context->FRAME.PSM;
+	ps_sel.zpsm = context->ZBUF.PSM;
+
+	// ps[0] is the fixed first-pass shader, ps[1] depends on the frame/Z buffer formats
+
+	CComPtr<ID3D11PixelShader> ps[2] = {m_ps0, NULL};
+
+	hash_map<uint32, CComPtr<ID3D11PixelShader> >::const_iterator i = m_ps1.find(ps_sel);
+
+	if(i != m_ps1.end())
+	{
+		ps[1] = i->second;
+	}
+	else
+	{
+		string str[15];
+
+		str[0] = format("%d", PS_BATCH_SIZE);
+		str[1] = format("%d", context->FRAME.PSM);
+		str[2] = format("%d", context->ZBUF.PSM);
+
+		D3D_SHADER_MACRO macro[] =
+		{
+			{"PS_BATCH_SIZE", str[0].c_str()},
+			{"PS_FPSM", str[1].c_str()},
+			{"PS_ZPSM", str[2].c_str()},
+			{NULL, NULL},
+		};
+
+		vector<unsigned char> shader;
+		theApp.LoadResource(IDR_CS_FX, shader);
+		dev->CompileShader((const char *)shader.data(), shader.size(), "cs.fx", nullptr, "ps_main1", macro, &ps[1]);
+
+		m_ps1[ps_sel] = ps[1];
+	}
+
+	PSConstantBuffer ps_cb;
+
+	ps_cb.fm = fm;
+	ps_cb.zm = zm;
+
+	ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed
+
+	OffsetBuffer* fzbo = NULL;
+
+	if(!GetOffsetBuffer(&fzbo))
+	{
+		// GetOffsetBuffer leaves fzbo untouched (NULL) when a D3D11 resource could not be created;
+		// close the scene opened above and skip the draw instead of dereferencing NULL
+
+		dev->EndScene();
+
+		return;
+	}
+
+	dev->PSSetShaderResourceView(0, fzbo->row_srv);
+	dev->PSSetShaderResourceView(1, fzbo->col_srv);
+	// TODO: palette, texture
+
+	// Number of indices per batch: PS_BATCH_SIZE primitives of the current type
+
+	int step = PS_BATCH_SIZE * GSUtil::GetVertexCount(PRIM->PRIM);
+
+	for(uint32 i = 0; i < m_index.tail; i += step)
+	{
+		dev->IASetPrimitiveTopology(topology);
+		dev->GSSetShader(gs[0]);
+		dev->PSSetShader(ps[0], m_ps_cb);
+		dev->DrawIndexedPrimitive(i, std::min<int>(m_index.tail - i, step));
+
+		// The two indices stored at m_index.tail are the bounding box primitive appended above
+
+		dev->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST);
+		dev->GSSetShader(gs[1]);
+		dev->PSSetShader(ps[1], m_ps_cb);
+		dev->DrawIndexedPrimitive(m_index.tail, 2);
+
+		//printf("%d/%d, %d %d %d %d\n", i, m_index.tail, r2.x, r2.y, r2.z, r2.w);
+	}
+
+	dev->EndScene();
+
+	// Disabled debug dump of the frame and Z buffers
+
+	if(0)
+	{
+		std::string s;
+		/*
+		s = format("c:\\temp1\\_%05d_f%lld_fb0_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0);
+		m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
+		Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false);
+		*/
+		//
+		if(fm != 0xffffffff) Read(context->offset.fb, r, false);
+		//
+		if(zm != 0xffffffff) Read(context->offset.zb, r, false);
+
+		s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
+		m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+
+		s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+		m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+
+		/*
+		s = format("c:\\temp1\\_%05d_f%lld_fb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0);
+		m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
+		*/
+
+		s_n++;
+	}
+}
+
+// Called for a transfer whose destination is described by BITBLTBUF.DBP/DBW/DPSM: reads the
+// pages of r back from the GPU copy and drops their valid flags (Read with invalidate = true).
+void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	// TODO: fully overwritten pages are not needed to be read, only invalidated (important)
+
+	Read(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r, true);
+
+	// TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target
+	// TODO: invalidate texture cache
+}
+
+// Called for a transfer whose source is described by BITBLTBUF.SBP/SBW/SPSM: reads the pages
+// of r back from the GPU copy while keeping their valid flags. The clut argument is unused.
+void GSRendererCS::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	Read(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r, false);
+}
+
+// Uploads local memory pages to the GPU buffer m_vm.
+//
+// For every page returned by off->GetPages(r) whose bit is not yet set in m_vm_valid, the
+// page's PAGE_SIZE bytes are copied from m_mem.m_vm8 into m_vm at the same byte offset and
+// the bit is set. Pages that are already flagged are skipped.
+//
+// off: offset table used to enumerate the pages covered by r
+// r:   rectangle whose pages should be uploaded
+void GSRendererCS::Write(GSOffset* off, const GSVector4i& r)
+{
+	GSDevice11* dev = (GSDevice11*)m_dev;
+
+	ID3D11DeviceContext* ctx = *dev;
+
+	D3D11_BOX box;
+
+	memset(&box, 0, sizeof(box));
+
+	box.right = 1;
+	box.bottom = 1;
+	box.back = 1;
+
+	// The list is terminated by GSOffset::EOP and released with delete [] below
+
+	uint32* pages = off->GetPages(r);
+
+	for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
+	{
+		uint32 page = pages[i];
+
+		// m_vm_valid holds one bit per page: 32 pages per uint32 word.
+		// The mask is built from an unsigned 1 so that bit 31 does not shift into the sign bit of an int.
+
+		uint32 row = page >> 5;
+		uint32 col = 1u << (page & 31);
+
+		if((m_vm_valid[row] & col) == 0)
+		{
+			m_vm_valid[row] |= col;
+
+			// Byte range of the page inside the buffer
+
+			box.left = page * PAGE_SIZE;
+			box.right = (page + 1) * PAGE_SIZE;
+
+			ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
+/*
+			// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
+
+			box.left = 0;
+			box.right = PAGE_SIZE;
+			box.top = page;
+			box.bottom = box.top + 1;
+
+			ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
+*/
+			if(0)
+			printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), off->bp, off->bw, off->psm, page);
+		}
+	}
+
+	delete [] pages;
+}
+
+// Reads pages back from the GPU buffer m_vm into local memory.
+//
+// For every page returned by off->GetPages(r) whose bit is set in m_vm_valid, the page is
+// copied from m_vm into m_pb, m_pb is mapped for reading and PAGE_SIZE bytes are copied to
+// m_mem.m_vm8 at the page's byte offset. Pages whose bit is clear are skipped.
+//
+// off:        offset table used to enumerate the pages covered by r
+// r:          rectangle whose pages should be read back
+// invalidate: when true, the valid bit of each page read is cleared (before the copy is attempted)
+void GSRendererCS::Read(GSOffset* off, const GSVector4i& r, bool invalidate)
+{
+	GSDevice11* dev = (GSDevice11*)m_dev;
+
+	ID3D11DeviceContext* ctx = *dev;
+
+	D3D11_BOX box;
+
+	memset(&box, 0, sizeof(box));
+
+	box.right = 1;
+	box.bottom = 1;
+	box.back = 1;
+
+	// The list is terminated by GSOffset::EOP and released with delete [] below
+
+	uint32* pages = off->GetPages(r);
+
+	for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
+	{
+		uint32 page = pages[i];
+
+		// m_vm_valid holds one bit per page: 32 pages per uint32 word.
+		// The mask is built from an unsigned 1 so that bit 31 does not shift into the sign bit of an int.
+
+		uint32 row = page >> 5;
+		uint32 col = 1u << (page & 31);
+
+		if(m_vm_valid[row] & col)
+		{
+			if(invalidate)
+			{
+				// The bit is known to be set here, so the xor clears it
+
+				m_vm_valid[row] ^= col;
+			}
+
+			// Byte range of the page inside the buffer
+
+			box.left = page * PAGE_SIZE;
+			box.right = (page + 1) * PAGE_SIZE;
+
+			ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
+/*
+			// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
+
+			box.left = 0;
+			box.right = PAGE_SIZE;
+			box.top = page;
+			box.bottom = box.top + 1;
+
+			ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
+*/
+			D3D11_MAPPED_SUBRESOURCE map;
+
+			// A failed Map leaves local memory untouched for this page
+
+			if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map)))
+			{
+				memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE);
+
+				ctx->Unmap(m_pb, 0);
+
+				if(0)
+				printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), off->bp, off->bw, off->psm, page);
+			}
+		}
+	}
+
+	delete [] pages;
+}
+
+// Returns, through *fzbo, the cached row/column offset buffers (and their shader resource
+// views) for the current context's offset.fzb, keyed by its hash. On a cache miss the two
+// immutable buffers and their views are created and stored in m_offset first.
+//
+// Returns false, leaving *fzbo untouched, when any D3D11 object could not be created.
+bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo)
+{
+	const uint32 key = m_context->offset.fzb->hash;
+
+	hash_map<uint32, OffsetBuffer>::iterator cached = m_offset.find(key);
+
+	if(cached != m_offset.end())
+	{
+		*fzbo = &cached->second;
+
+		return true;
+	}
+
+	GSDevice11* dev = (GSDevice11*)m_dev;
+
+	OffsetBuffer fresh;
+
+	// Both buffers share one description: 2048 GSVector2i entries, immutable, readable by shaders
+
+	D3D11_BUFFER_DESC desc;
+
+	memset(&desc, 0, sizeof(desc));
+
+	desc.ByteWidth = sizeof(GSVector2i) * 2048;
+	desc.Usage = D3D11_USAGE_IMMUTABLE;
+	desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+
+	D3D11_SUBRESOURCE_DATA init;
+
+	memset(&init, 0, sizeof(init));
+
+	init.pSysMem = m_context->offset.fzb->row;
+
+	if(FAILED((*dev)->CreateBuffer(&desc, &init, &fresh.row)))
+	{
+		return false;
+	}
+
+	init.pSysMem = m_context->offset.fzb->col;
+
+	if(FAILED((*dev)->CreateBuffer(&desc, &init, &fresh.col)))
+	{
+		return false;
+	}
+
+	// Both views expose the whole buffer as 2048 two-component signed integers
+
+	D3D11_SHADER_RESOURCE_VIEW_DESC view;
+
+	memset(&view, 0, sizeof(view));
+
+	view.Format = DXGI_FORMAT_R32G32_SINT;
+	view.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
+	view.Buffer.FirstElement = 0;
+	view.Buffer.NumElements = 2048;
+
+	if(FAILED((*dev)->CreateShaderResourceView(fresh.row, &view, &fresh.row_srv)))
+	{
+		return false;
+	}
+
+	if(FAILED((*dev)->CreateShaderResourceView(fresh.col, &view, &fresh.col_srv)))
+	{
+		return false;
+	}
+
+	// Store the finished entry and hand out the address of the copy kept in the map
+
+	OffsetBuffer& slot = m_offset[key];
+
+	slot = fresh;
+
+	*fzbo = &slot;
+
+	return true;
+}
diff --git a/plugins/GSdx_legacy/GSRendererCS.h b/plugins/GSdx_legacy/GSRendererCS.h
new file mode 100644
index 0000000000..185356c658
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererCS.h
@@ -0,0 +1,145 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+#include "GSDevice11.h"
+
+// Renderer built on GSDevice11 that keeps a copy of GS video memory in a D3D11 buffer (m_vm)
+// and synchronises it page by page with local memory through Write()/Read().
+class GSRendererCS : public GSRenderer
+{
+	// Key of the vertex shader cache (m_vs); the fields become the VS_TME/VS_FST macros
+	struct VSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 tme:1;
+				uint32 fst:1;
+			};
+
+			uint32 key;
+		};
+
+		// 2 bits declared => mask 0x3
+		operator uint32() {return key & 0x3;}
+
+		VSSelector() : key(0) {}
+	};
+
+	// Constants uploaded to m_vs_cb
+	__aligned(struct, 32) VSConstantBuffer
+	{
+		GSVector4 VertexScale;
+		GSVector4 VertexOffset;
+	};
+
+	// Key of the geometry shader cache (m_gs); the fields become the GS_IIP/GS_PRIM macros
+	struct GSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 iip:1;
+				uint32 prim:2;
+			};
+
+			uint32 key;
+		};
+
+		// 3 bits declared => mask 0x7
+		operator uint32() {return key & 0x7;}
+
+		GSSelector() : key(0) {}
+	};
+
+	// Key of the second-pass pixel shader cache (m_ps1): frame and Z buffer pixel formats
+	struct PSSelector
+	{
+		union
+		{
+			struct
+			{
+				uint32 fpsm:6;
+				uint32 zpsm:6;
+			};
+
+			uint32 key;
+		};
+
+		// 12 bits declared => mask 0xfff; a 10-bit mask would drop the two top bits of zpsm,
+		// letting selectors that differ only in those bits share one cache entry
+		operator uint32() {return key & 0xfff;}
+
+		PSSelector() : key(0) {}
+	};
+
+	// Constants uploaded to m_ps_cb: frame buffer and Z buffer write masks
+	__aligned(struct, 32) PSConstantBuffer
+	{
+		uint32 fm;
+		uint32 zm;
+	};
+
+	CComPtr<ID3D11DepthStencilState> m_dss;
+	CComPtr<ID3D11BlendState> m_bs;
+	CComPtr<ID3D11SamplerState> m_ss;
+	CComPtr<ID3D11Buffer> m_lb;
+	CComPtr<ID3D11UnorderedAccessView> m_lb_uav;
+	CComPtr<ID3D11ShaderResourceView> m_lb_srv;
+	CComPtr<ID3D11Buffer> m_sob;
+	CComPtr<ID3D11UnorderedAccessView> m_sob_uav;
+	CComPtr<ID3D11ShaderResourceView> m_sob_srv;
+	// GPU copy of video memory, addressed in bytes (page * PAGE_SIZE)
+	CComPtr<ID3D11Buffer> m_vm;
+	//CComPtr<ID3D11Texture2D> m_vm;
+	CComPtr<ID3D11UnorderedAccessView> m_vm_uav;
+	// One bit per page (16 * 32 = 512 pages): set when the page has been uploaded to m_vm
+	uint32 m_vm_valid[16];
+	// Buffer a single page is copied into and mapped from when reading back
+	CComPtr<ID3D11Buffer> m_pb;
+	//CComPtr<ID3D11Texture2D> m_pb;
+	hash_map<uint32, GSVertexShader11 > m_vs;
+	CComPtr<ID3D11Buffer> m_vs_cb;
+	hash_map<uint32, CComPtr<ID3D11GeometryShader> > m_gs;
+	CComPtr<ID3D11PixelShader> m_ps0;
+	hash_map<uint32, CComPtr<ID3D11PixelShader> > m_ps1;
+	CComPtr<ID3D11Buffer> m_ps_cb;
+
+	// Upload (Write) or read back (Read) the pages of r, tracked through m_vm_valid
+	void Write(GSOffset* off, const GSVector4i& r);
+	void Read(GSOffset* off, const GSVector4i& r, bool invalidate);
+
+	// Row/column offset tables of one frame/Z buffer combination, with their shader views
+	struct OffsetBuffer
+	{
+		CComPtr<ID3D11Buffer> row, col;
+		CComPtr<ID3D11ShaderResourceView> row_srv, col_srv;
+	};
+
+	// Cache of offset buffers keyed by the hash of the context's offset.fzb
+	hash_map<uint32, OffsetBuffer> m_offset;
+
+	// Fetches (creating on demand) the entry for the current context; false on creation failure
+	bool GetOffsetBuffer(OffsetBuffer** fzbo);
+
+protected:
+	GSTexture* m_texture[2];
+	uint8* m_output;
+
+	bool CreateDevice(GSDevice* dev);
+	void ResetDevice();
+	void VSync(int field);
+	GSTexture* GetOutput(int i);
+	void Draw();
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut);
+
+public:
+	GSRendererCS();
+	virtual ~GSRendererCS();
+};
diff --git a/plugins/GSdx_legacy/GSRendererDX.cpp b/plugins/GSdx_legacy/GSRendererDX.cpp
new file mode 100644
index 0000000000..531d3e4ecd
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX.cpp
@@ -0,0 +1,530 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererDX.h"
+#include "GSDeviceDX.h"
+
+// Stores the pixel center and loads the renderer options from the application configuration.
+// The two alpha hacks and the texture coordinate offset only take effect when the
+// "UserHacks" option is enabled as well.
+GSRendererDX::GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter)
+	: GSRendererHW(tc)
+	, m_pixelcenter(pixelcenter)
+{
+	m_logz = theApp.GetConfig("logz", 0) != 0;
+	m_fba = theApp.GetConfig("fba", 1) != 0;
+
+	UserHacks_AlphaHack = theApp.GetConfig("UserHacks_AlphaHack", 0) != 0 && theApp.GetConfig("UserHacks", 0) != 0;
+	UserHacks_AlphaStencil = theApp.GetConfig("UserHacks_AlphaStencil", 0) != 0 && theApp.GetConfig("UserHacks", 0) != 0;
+
+	if(theApp.GetConfig("UserHacks", 0) != 0)
+	{
+		UserHacks_TCOffset = theApp.GetConfig("UserHacks_TCOffset", 0);
+	}
+	else
+	{
+		UserHacks_TCOffset = 0;
+	}
+
+	// The packed value carries x in its low 16 bits and y in its high 16 bits
+
+	const unsigned int tco_lo = UserHacks_TCOffset & 0xFFFF;
+	const unsigned int tco_hi = (UserHacks_TCOffset >> 16) & 0xFFFF;
+
+	UserHacks_TCO_x = tco_lo / -1000.0f;
+	UserHacks_TCO_y = tco_hi / -1000.0f;
+}
+
+// Empty destructor body: no explicit cleanup is performed here.
+GSRendererDX::~GSRendererDX()
+{
+}
+
+// Translates the current GS drawing state into GSDeviceDX selectors and constant buffers
+// (output merger, vertex, geometry and pixel shader stages) and draws the queued primitives.
+//
+// rt:  render target; also the source of size/scale when ds is NULL
+// ds:  depth/stencil target, may be NULL (size/scale are then taken from rt)
+// tex: texture cache source to sample from, or NULL for an untextured draw
+//
+// Up to two passes are drawn, controlled by TEST.DoFirstPass()/DoSecondPass(); each pass is
+// followed by an extra "negative" draw when COLCLAMP.CLAMP is 0, there is no texture and the
+// primitive is not a point list.
+void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
+{
+	GSDrawingEnvironment& env = m_env;
+	GSDrawingContext* context = m_context;
+
+	const GSVector2i& rtsize = ds ? ds->GetSize()  : rt->GetSize();
+	const GSVector2& rtscale = ds ? ds->GetScale() : rt->GetScale();
+
+	// Destination alpha test is skipped for the PSMCT24 frame format
+	bool DATE = m_context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24;
+
+	GSTexture* rtcopy = NULL;
+
+	ASSERT(m_dev != NULL);
+
+	GSDeviceDX* dev = (GSDeviceDX*)m_dev;
+
+	if(DATE)
+	{
+		if(dev->HasStencil())
+		{
+			// Stencil path: SetupDATE is given a quad built from the vertex trace bounds
+
+			GSVector4 s = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y);
+			GSVector4 off = GSVector4(-1.0f, 1.0f);
+
+			GSVector4 src = ((m_vt.m_min.p.xyxy(m_vt.m_max.p) + off.xxyy()) * s.xyxy()).sat(off.zzyy());
+			GSVector4 dst = src * 2.0f + off.xxxx();
+
+			GSVertexPT1 vertices[] =
+			{
+				{GSVector4(dst.x, -dst.y, 0.5f, 1.0f), GSVector2(src.x, src.y)},
+				{GSVector4(dst.z, -dst.y, 0.5f, 1.0f), GSVector2(src.z, src.y)},
+				{GSVector4(dst.x, -dst.w, 0.5f, 1.0f), GSVector2(src.x, src.w)},
+				{GSVector4(dst.z, -dst.w, 0.5f, 1.0f), GSVector2(src.z, src.w)},
+			};
+
+			dev->SetupDATE(rt, ds, vertices, m_context->TEST.DATM);
+		}
+		else
+		{
+			// No stencil: work on a full copy of rt, bound to pixel shader slot 2 further down
+
+			rtcopy = dev->CreateRenderTarget(rtsize.x, rtsize.y, false, rt->GetFormat());
+
+			// I'll use VertexTrace when I consider it more trustworthy
+
+			dev->CopyRect(rt, rtcopy, GSVector4i(rtsize).zwxy());
+		}
+	}
+
+	//
+
+	dev->BeginScene();
+
+	// om
+
+	GSDeviceDX::OMDepthStencilSelector om_dssel;
+
+	if(context->TEST.ZTE)
+	{
+		om_dssel.ztst = context->TEST.ZTST;
+		om_dssel.zwe = !context->ZBUF.ZMSK;
+	}
+	else
+	{
+		om_dssel.ztst = ZTST_ALWAYS;
+	}
+
+	if(m_fba)
+	{
+		om_dssel.fba = context->FBA.FBA;
+	}
+
+	GSDeviceDX::OMBlendSelector om_bsel;
+
+	if(!IsOpaque())
+	{
+		om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS;
+
+		om_bsel.a = context->ALPHA.A;
+		om_bsel.b = context->ALPHA.B;
+		om_bsel.c = context->ALPHA.C;
+		om_bsel.d = context->ALPHA.D;
+
+		if(env.PABE.PABE)
+		{
+			if(om_bsel.a == 0 && om_bsel.b == 1 && om_bsel.c == 0 && om_bsel.d == 1)
+			{
+				// this works because with PABE alpha blending is on when alpha >= 0x80, but since the pixel shader
+				// cannot output anything over 0x80 (== 1.0) blending with 0x80 or turning it off gives the same result
+
+				om_bsel.abe = 0;
+			}
+			else
+			{
+				//Breath of Fire Dragon Quarter triggers this in battles. Graphics are fine though.
+				//ASSERT(0);
+			}
+		}
+	}
+
+	// Provisional write mask derived from FBMSK; it is assigned again in the shuffle/non-shuffle branches below
+	om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
+
+	// vs
+
+	GSDeviceDX::VSSelector vs_sel;
+
+	vs_sel.tme = PRIM->TME;
+	vs_sel.fst = PRIM->FST;
+	vs_sel.logz = dev->HasDepth32() ? 0 : m_logz ? 1 : 0;
+	vs_sel.rtcopy = !!rtcopy;
+
+	// The real GS appears to do no masking based on the Z buffer format and writing larger Z values
+	// than the buffer supports seems to be an error condition on the real GS, causing it to crash.
+	// We are probably receiving bad coordinates from VU1 in these cases.
+
+	if(om_dssel.ztst >= ZTST_ALWAYS && om_dssel.zwe)
+	{
+		if(context->ZBUF.PSM == PSM_PSMZ24)
+		{
+			if(m_vt.m_max.p.z > 0xffffff)
+			{
+				ASSERT(m_vt.m_min.p.z > 0xffffff);
+				// Fixme :Following conditional fixes some dialog frame in Wild Arms 3, but may not be what was intended.
+				if (m_vt.m_min.p.z > 0xffffff)
+				{
+					vs_sel.bppz = 1;
+					om_dssel.ztst = ZTST_ALWAYS;
+				}
+			}
+		}
+		else if(context->ZBUF.PSM == PSM_PSMZ16 || context->ZBUF.PSM == PSM_PSMZ16S)
+		{
+			if(m_vt.m_max.p.z > 0xffff)
+			{
+				ASSERT(m_vt.m_min.p.z > 0xffff); // sfex capcom logo
+				// Fixme : Same as above, I guess.
+				if (m_vt.m_min.p.z > 0xffff)
+				{
+					vs_sel.bppz = 2;
+					om_dssel.ztst = ZTST_ALWAYS;
+				}
+			}
+		}
+	}
+
+	GSDeviceDX::VSConstantBuffer vs_cb;
+
+	float sx = 2.0f * rtscale.x / (rtsize.x << 4);
+	float sy = 2.0f * rtscale.y / (rtsize.y << 4);
+	float ox = (float)(int)context->XYOFFSET.OFX;
+	float oy = (float)(int)context->XYOFFSET.OFY;
+	float ox2 = 2.0f * m_pixelcenter.x / rtsize.x;
+	float oy2 = 2.0f * m_pixelcenter.y / rtsize.y;
+
+	//This hack subtracts around half a pixel from OFX and OFY. (Cannot do this directly,
+	//because DX10 and DX9 have a different pixel center.)
+	//
+	//The resulting shifted output aligns better with common blending / corona / blurring effects,
+	//but introduces a few bad pixels on the edges.
+
+	if(rt && rt->LikelyOffset)
+	{
+		// DX9 has pixelcenter set to 0.0, so give it some value here
+
+		if(m_pixelcenter.x == 0 && m_pixelcenter.y == 0) { ox2 = -0.0003f; oy2 = -0.0003f; }
+		
+		ox2 *= rt->OffsetHack_modx;
+		oy2 *= rt->OffsetHack_mody;
+	}
+
+	vs_cb.VertexScale  = GSVector4(sx, -sy, ldexpf(1, -32), 0.0f);
+	vs_cb.VertexOffset = GSVector4(ox * sx + ox2 + 1, -(oy * sy + oy2 + 1), 0.0f, -1.0f);
+
+	// gs
+
+	GSDeviceDX::GSSelector gs_sel;
+
+	gs_sel.iip = PRIM->IIP;
+	gs_sel.prim = m_vt.m_primclass;
+
+	// ps
+
+	GSDeviceDX::PSSelector ps_sel;
+	GSDeviceDX::PSSamplerSelector ps_ssel;
+	GSDeviceDX::PSConstantBuffer ps_cb;
+
+	// Gregory: code is not yet ready so let's only enable it when
+	// CRC is below the FULL level
+	if (m_texture_shuffle && (m_crc_hack_level < 3)) {
+		ps_sel.shuffle = 1;
+		ps_sel.fmt = 0;
+
+		const GIFRegXYOFFSET& o = m_context->XYOFFSET;
+		GSVertex* v = &m_vertex.buff[0];
+		size_t count = m_vertex.next;
+
+		// vertex position is 8 to 16 pixels, therefore it is the 16-31 bits of the colors
+		int  pos = (v[0].XYZ.X - o.OFX) & 0xFF;
+		bool write_ba = (pos > 112 && pos < 136);
+		// Read texture is 8 to 16 pixels (same as above)
+		int tex_pos = v[0].U & 0xFF;
+		ps_sel.read_ba = (tex_pos > 112 && tex_pos < 144);
+
+		GL_INS("Color shuffle %s => %s", ps_sel.read_ba ? "BA" : "RG", write_ba ? "BA" : "RG");
+
+		// Convert the vertex info to a 32 bits color format equivalent
+		// (vertices are consumed in pairs: v[i] and v[i + 1])
+		for (size_t i = 0; i < count; i += 2) {
+			if (write_ba)
+				v[i].XYZ.X -= 128u;
+			else
+				v[i + 1].XYZ.X += 128u;
+
+			if (ps_sel.read_ba)
+				v[i].U -= 128u;
+			else
+				v[i + 1].U += 128u;
+
+			// Height is too big (2x).
+			int tex_offset = v[i].V & 0xF;
+			GSVector4i offset(o.OFY, tex_offset, o.OFY, tex_offset);
+
+			GSVector4i tmp(v[i].XYZ.Y, v[i].V, v[i + 1].XYZ.Y, v[i + 1].V);
+			tmp = GSVector4i(tmp - offset).srl32(1) + offset;
+
+			v[i].XYZ.Y = tmp.x;
+			v[i].V = tmp.y;
+			v[i + 1].XYZ.Y = tmp.z;
+			v[i + 1].V = tmp.w;
+		}
+
+		// Please bang my head against the wall!
+		// 1/ Reduce the frame mask to a 16 bit format
+		const uint32& m = context->FRAME.FBMSK;
+		uint32 fbmask = ((m >> 3) & 0x1F) | ((m >> 6) & 0x3E0) | ((m >> 9) & 0x7C00) | ((m >> 31) & 0x8000);
+		om_bsel.wrgba = 0;
+
+		// 2 Select the new mask (Please someone put SSE here)
+		if ((fbmask & 0xFF) == 0) {
+			if (write_ba)
+				om_bsel.wb = 1;
+			else
+				om_bsel.wr = 1;
+		}
+		else if ((fbmask & 0xFF) != 0xFF) {
+#ifdef _DEBUG
+			fprintf(stderr, "Please fix me! wb %d wr %d\n", om_bsel.wb, om_bsel.wr);
+#endif
+			//ASSERT(0);
+		}
+
+		fbmask >>= 8;
+		if ((fbmask & 0xFF) == 0) {
+			if (write_ba)
+				om_bsel.wa = 1;
+			else
+				om_bsel.wg = 1;
+		}
+		else if ((fbmask & 0xFF) != 0xFF) {
+#ifdef _DEBUG
+			fprintf(stderr, "Please fix me! wa %d wg %d\n", om_bsel.wa, om_bsel.wg);
+#endif
+			//ASSERT(0);
+		}
+
+	}
+	else {
+		//ps_sel.fmt = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
+
+		om_bsel.wrgba = ~GSVector4i::load((int)context->FRAME.FBMSK).eq8(GSVector4i::xffffffff()).mask();
+	}
+
+	if(DATE)
+	{
+		if(dev->HasStencil())
+		{
+			om_dssel.date = 1;
+		}
+		else
+		{
+			ps_sel.date = 1 + context->TEST.DATM;
+		}
+	}
+
+	if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
+	{
+		ps_sel.colclip = 1;
+	}
+
+	ps_sel.clr1 = om_bsel.IsCLR1();
+	ps_sel.fba = context->FBA.FBA;
+	ps_sel.aout = context->FRAME.PSM == PSM_PSMCT16 || context->FRAME.PSM == PSM_PSMCT16S || (context->FRAME.FBMSK & 0xff000000) == 0x7f000000 ? 1 : 0;
+	ps_sel.aout &= !ps_sel.shuffle;
+	if(UserHacks_AlphaHack) ps_sel.aout = 1;
+
+	if(PRIM->FGE)
+	{
+		ps_sel.fog = 1;
+
+		ps_cb.FogColor_AREF = GSVector4::rgba32(env.FOGCOL.u32[0]) / 255;
+	}
+
+	if(context->TEST.ATE)
+		ps_sel.atst = context->TEST.ATST;
+	else
+		ps_sel.atst = ATST_ALWAYS;
+
+	// The alpha reference shares a constant with the fog color (its .a component)
+	if (context->TEST.ATE && context->TEST.ATST > 1)
+		ps_cb.FogColor_AREF.a = (float)context->TEST.AREF;
+
+	// Destination alpha pseudo stencil hack: use a stencil operation combined with an alpha test
+	// to only draw pixels which would cause the destination alpha test to fail in the future once.
+	// Unfortunately this also means only drawing those pixels at all, which is why this is a hack.
+	// The interaction with FBA in D3D9 is probably less than ideal.
+	if (UserHacks_AlphaStencil && DATE && dev->HasStencil() && om_bsel.wa && (!context->TEST.ATE || context->TEST.ATST == 1))
+	{
+		if (!context->FBA.FBA)
+		{
+			if (context->TEST.DATM == 0)
+				ps_sel.atst = 5; // >=
+			else
+				ps_sel.atst = 2; // <
+			ps_cb.FogColor_AREF.a = (float)0x80;
+		}
+		if (!(context->FBA.FBA && context->TEST.DATM == 1))
+			om_dssel.alpha_stencil = 1;
+	}
+
+	if(tex)
+	{
+		const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[context->TEX0.PSM];
+		const GSLocalMemory::psm_t &cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[context->TEX0.CPSM] : psm;
+		bool bilinear = m_filter == 2 ? m_vt.IsLinear() : m_filter != 0;
+		bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && context->CLAMP.WMS < 3 && context->CLAMP.WMT < 3;
+		// Don't force extra filtering on sprite (it creates various upscaling issue)
+		bilinear &= !((m_vt.m_primclass == GS_SPRITE_CLASS) && m_userhacks_round_sprite_offset && !m_vt.IsLinear());
+
+		ps_sel.wms = context->CLAMP.WMS;
+		ps_sel.wmt = context->CLAMP.WMT;
+		if (ps_sel.shuffle) {
+			ps_sel.fmt = 0;
+		} else {
+			ps_sel.fmt = tex->m_palette ? cpsm.fmt | 4 : cpsm.fmt;
+		}
+		ps_sel.aem = env.TEXA.AEM;
+		ps_sel.tfx = context->TEX0.TFX;
+		ps_sel.tcc = context->TEX0.TCC;
+		// Bilinear filtering is done either in the shader (ltf) or by the sampler (ps_ssel.ltf below), never both
+		ps_sel.ltf = bilinear && !simple_sample;
+		ps_sel.rt = tex->m_target;
+		ps_sel.spritehack = tex->m_spritehack_t;
+		ps_sel.point_sampler = !(bilinear && simple_sample);
+
+		int w = tex->m_texture->GetWidth();
+		int h = tex->m_texture->GetHeight();
+
+		int tw = (int)(1 << context->TEX0.TW);
+		int th = (int)(1 << context->TEX0.TH);
+
+		// xy: texture size from TEX0 (powers of two), zw: size of the texture object
+		GSVector4 WH(tw, th, w, h);
+
+		if(PRIM->FST)
+		{
+			vs_cb.TextureScale = GSVector4(1.0f / 16) / WH.xyxy();
+			//Maybe better?
+			//vs_cb.TextureScale = GSVector4(1.0f / 16) * GSVector4(tex->m_texture->GetScale()).xyxy() / WH.zwzw();
+			ps_sel.fst = 1;
+		}
+
+		ps_cb.WH = WH;
+		ps_cb.HalfTexel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw();
+		ps_cb.MskFix = GSVector4i(context->CLAMP.MINU, context->CLAMP.MINV, context->CLAMP.MAXU, context->CLAMP.MAXV);
+
+		// TC Offset Hack
+		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
+		ps_cb.TC_OffsetHack = GSVector4(UserHacks_TCO_x, UserHacks_TCO_y).xyxy() / WH.xyxy();
+
+		GSVector4 clamp(ps_cb.MskFix);
+		GSVector4 ta(env.TEXA & GSVector4i::x000000ff());
+
+		ps_cb.MinMax = clamp / WH.xyxy();
+		ps_cb.MinF_TA = (clamp + 0.5f).xyxy(ta) / WH.xyxy(GSVector4(255, 255));
+
+		ps_ssel.tau = (context->CLAMP.WMS + 3) >> 1;
+		ps_ssel.tav = (context->CLAMP.WMT + 3) >> 1;
+		ps_ssel.ltf = bilinear && simple_sample;
+	}
+	else
+	{
+		ps_sel.tfx = 4;
+	}
+
+	// rs
+
+	GSVector4i scissor = GSVector4i(GSVector4(rtscale).xyxy() * context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
+
+	dev->OMSetRenderTargets(rt, ds, &scissor);
+	dev->PSSetShaderResource(0, tex ? tex->m_texture : NULL);
+	dev->PSSetShaderResource(1, tex ? tex->m_palette : NULL);
+	dev->PSSetShaderResource(2, rtcopy);
+
+	uint8 afix = context->ALPHA.FIX;
+
+	SetupIA();
+
+	dev->SetupOM(om_dssel, om_bsel, afix);
+	dev->SetupVS(vs_sel, &vs_cb);
+	dev->SetupGS(gs_sel);
+	dev->SetupPS(ps_sel, &ps_cb, ps_ssel);
+
+	// draw
+
+	if(context->TEST.DoFirstPass())
+	{
+		dev->DrawIndexedPrimitive();
+
+		if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
+		{
+			// Extra draw with the "negative" blend selector and colclip = 2, then restore the blend state
+			GSDeviceDX::OMBlendSelector om_bselneg(om_bsel);
+			GSDeviceDX::PSSelector ps_selneg(ps_sel);
+
+			om_bselneg.negative = 1;
+			ps_selneg.colclip = 2;
+
+			dev->SetupOM(om_dssel, om_bselneg, afix);
+			dev->SetupPS(ps_selneg, &ps_cb, ps_ssel);
+
+			dev->DrawIndexedPrimitive();
+			dev->SetupOM(om_dssel, om_bsel, afix);
+		}
+	}
+
+	if(context->TEST.DoSecondPass())
+	{
+		ASSERT(!env.PABE.PABE);
+
+		// Remaps the alpha test selector for the second pass, indexed by the first-pass value
+		static const uint32 iatst[] = {1, 0, 5, 6, 7, 2, 3, 4};
+
+		ps_sel.atst = iatst[ps_sel.atst];
+
+		dev->SetupPS(ps_sel, &ps_cb, ps_ssel);
+
+		bool z = om_dssel.zwe;
+		bool r = om_bsel.wr;
+		bool g = om_bsel.wg;
+		bool b = om_bsel.wb;
+		bool a = om_bsel.wa;
+
+		// AFAIL selects which of the Z / color / alpha writes stay enabled in this pass
+		switch(context->TEST.AFAIL)
+		{
+		case 0: z = r = g = b = a = false; break; // none
+		case 1: z = false; break; // rgba
+		case 2: r = g = b = a = false; break; // z
+		case 3: z = a = false; break; // rgb
+		default: __assume(0);
+		}
+
+		if(z || r || g || b || a)
+		{
+			om_dssel.zwe = z;
+			om_bsel.wr = r;
+			om_bsel.wg = g;
+			om_bsel.wb = b;
+			om_bsel.wa = a;
+
+			dev->SetupOM(om_dssel, om_bsel, afix);
+
+			dev->DrawIndexedPrimitive();
+
+			if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
+			{
+				GSDeviceDX::OMBlendSelector om_bselneg(om_bsel);
+				GSDeviceDX::PSSelector ps_selneg(ps_sel);
+
+				om_bselneg.negative = 1;
+				ps_selneg.colclip = 2;
+
+				dev->SetupOM(om_dssel, om_bselneg, afix);
+				dev->SetupPS(ps_selneg, &ps_cb, ps_ssel);
+
+				dev->DrawIndexedPrimitive();
+			}
+		}
+	}
+
+	dev->EndScene();
+
+	// rtcopy is still NULL here unless the non-stencil DATE path created it
+	dev->Recycle(rtcopy);
+
+	if(om_dssel.fba) UpdateFBA(rt);
+}
diff --git a/plugins/GSdx_legacy/GSRendererDX.h b/plugins/GSdx_legacy/GSRendererDX.h
new file mode 100644
index 0000000000..a9071b8597
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX.h
@@ -0,0 +1,47 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRendererHW.h"
+
+// Intermediate base class for the Direct3D hardware renderers, derived from
+// GSRendererHW. SetupIA() is pure virtual, so only subclasses that provide the
+// input-assembler setup can be instantiated.
+class GSRendererDX : public GSRendererHW
+{
+	GSVector2 m_pixelcenter; // NOTE(review): presumably holds the constructor's pixelcenter argument — confirm in the .cpp
+	bool m_logz;
+	bool m_fba;
+
+	bool UserHacks_AlphaHack;
+	bool UserHacks_AlphaStencil;
+
+protected:
+	virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex);
+	// Must be implemented by each concrete back-end.
+	virtual void SetupIA() = 0;
+	// No-op by default; a back-end may override it.
+	virtual void UpdateFBA(GSTexture* rt) {}
+
+	unsigned int UserHacks_TCOffset;
+	float UserHacks_TCO_x, UserHacks_TCO_y;
+
+public:
+	// pixelcenter defaults to (0, 0) when the caller does not supply one.
+	GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter = GSVector2(0, 0));
+	virtual ~GSRendererDX();
+
+};
diff --git a/plugins/GSdx_legacy/GSRendererDX11.cpp b/plugins/GSdx_legacy/GSRendererDX11.cpp
new file mode 100644
index 0000000000..4eeb93734b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX11.cpp
@@ -0,0 +1,84 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererDX11.h"
+#include "GSCrc.h"
+#include "resource.h"
+
+// Constructs the D3D11 renderer: hands the GSRendererDX base a newly allocated
+// GSTextureCache11 bound to this renderer and a pixel center of (-0.5, -0.5).
+GSRendererDX11::GSRendererDX11()
+	: GSRendererDX(new GSTextureCache11(this), GSVector2(-0.5f, -0.5f))
+{
+}
+
+// Forwards device creation to the parent class; the D3D11 renderer adds no
+// extra state of its own. Returns true only when the parent call succeeds.
+bool GSRendererDX11::CreateDevice(GSDevice* dev)
+{
+	if(__super::CreateDevice(dev))
+	{
+		return true;
+	}
+
+	return false;
+}
+
+// Uploads the queued vertices and indices to the D3D11 device and selects the
+// primitive topology that corresponds to the current primitive class.
+void GSRendererDX11::SetupIA()
+{
+	GSDevice11* dev = (GSDevice11*)m_dev;
+
+	void* mapped = NULL;
+
+	if(dev->IAMapVertexBuffer(&mapped, sizeof(GSVertex), m_vertex.next))
+	{
+		// Bulk copy of the whole vertex array into the mapped buffer.
+		GSVector4i::storent(mapped, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
+
+		// WildHack: mask the UV of every uploaded vertex. The TME/FST test does not
+		// depend on the vertex, so it is evaluated once instead of per iteration.
+		if(UserHacks_WildHack && !isPackedUV_HackFlag && PRIM->TME && PRIM->FST)
+		{
+			GSVertex* RESTRICT dst = (GSVertex*)mapped;
+
+			for(unsigned int n = 0; n < m_vertex.next; n++)
+			{
+				dst[n].UV &= 0x3FEF3FEF;
+			}
+		}
+
+		dev->IAUnmapVertexBuffer();
+	}
+
+	dev->IASetIndexBuffer(m_index.buff, m_index.tail);
+
+	D3D11_PRIMITIVE_TOPOLOGY topology;
+
+	switch(m_vt.m_primclass)
+	{
+	case GS_POINT_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
+		break;
+
+	// Lines and sprites are both submitted as two-vertex primitives.
+	case GS_LINE_CLASS:
+	case GS_SPRITE_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
+		break;
+
+	case GS_TRIANGLE_CLASS:
+		topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
+		break;
+
+	default:
+		__assume(0);
+	}
+
+	dev->IASetPrimitiveTopology(topology);
+}
diff --git a/plugins/GSdx_legacy/GSRendererDX11.h b/plugins/GSdx_legacy/GSRendererDX11.h
new file mode 100644
index 0000000000..53d1021222
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX11.h
@@ -0,0 +1,38 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRendererDX.h"
+#include "GSVertexHW.h"
+#include "GSTextureCache11.h"
+
+// Direct3D 11 hardware renderer: supplies the SetupIA() implementation required
+// by GSRendererDX and its own CreateDevice().
+class GSRendererDX11 : public GSRendererDX
+{
+protected:
+	// Implements the pure virtual declared in GSRendererDX.
+	void SetupIA();
+
+public:
+	GSRendererDX11();
+	virtual ~GSRendererDX11() {}
+
+	// Returns false when device creation fails.
+	bool CreateDevice(GSDevice* dev);
+};
diff --git a/plugins/GSdx_legacy/GSRendererDX9.cpp b/plugins/GSdx_legacy/GSRendererDX9.cpp
new file mode 100644
index 0000000000..fa078e7646
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX9.cpp
@@ -0,0 +1,281 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererDX9.h"
+#include "GSCrc.h"
+#include "resource.h"
+
+// Constructs the D3D9 renderer with a newly allocated GSTextureCache9 bound to
+// this renderer; the base class pixel-center argument is left at its default.
+GSRendererDX9::GSRendererDX9()
+	: GSRendererDX(new GSTextureCache9(this))
+{
+}
+
+// Creates the device through the parent class and then fills in the fixed
+// depth-stencil and blend states stored in m_fba (used by UpdateFBA).
+// Returns false when the parent device creation fails.
+bool GSRendererDX9::CreateDevice(GSDevice* dev)
+{
+	if(!__super::CreateDevice(dev))
+	{
+		return false;
+	}
+
+	// Stencil state: compare against bit 1 (mask 2, ref 2) and zero it in every outcome.
+	Direct3DDepthStencilState9& dss = m_fba.dss;
+
+	memset(&dss, 0, sizeof(dss));
+
+	dss.StencilEnable = true;
+	dss.StencilFunc = D3DCMP_EQUAL;
+	dss.StencilRef = 2;
+	dss.StencilReadMask = 2;
+	dss.StencilWriteMask = 2;
+	dss.StencilPassOp = D3DSTENCILOP_ZERO;
+	dss.StencilFailOp = D3DSTENCILOP_ZERO;
+	dss.StencilDepthFailOp = D3DSTENCILOP_ZERO;
+
+	// Blend state: only the alpha channel of the render target is writable.
+	Direct3DBlendState9& bs = m_fba.bs;
+
+	memset(&bs, 0, sizeof(bs));
+
+	bs.RenderTargetWriteMask = D3DCOLORWRITEENABLE_ALPHA;
+
+	return true;
+}
+
+// Prepares the D3D9 input assembler for the current batch: picks the topology
+// for the primitive class (rewriting the index/vertex buffers where needed),
+// sets the shade mode, converts every GSVertex to GSVertexHW9 in the mapped
+// vertex buffer, and uploads the index buffer.
+void GSRendererDX9::SetupIA()
+{
+	D3DPRIMITIVETYPE topology;
+
+	switch(m_vt.m_primclass)
+	{
+	case GS_POINT_CLASS:
+
+		topology = D3DPT_POINTLIST;
+
+		break;
+
+	case GS_LINE_CLASS:
+
+		topology = D3DPT_LINELIST;
+
+		// With flat shading (IIP == 0) the two indices of every line are swapped.
+		// NOTE(review): presumably so the vertex D3D9 takes the flat color from matches the GS one — confirm.
+		if(PRIM->IIP == 0)
+		{
+			for(size_t i = 0, j = m_index.tail; i < j; i += 2) 
+			{
+				uint32 tmp = m_index.buff[i + 0]; 
+				m_index.buff[i + 0] = m_index.buff[i + 1];
+				m_index.buff[i + 1] = tmp;
+			}
+		}
+
+		break;
+
+	case GS_TRIANGLE_CLASS:
+
+		topology = D3DPT_TRIANGLELIST;
+
+		// With flat shading (IIP == 0) the first and last index of every triangle are swapped.
+		if(PRIM->IIP == 0)
+		{
+			for(size_t i = 0, j = m_index.tail; i < j; i += 3) 
+			{
+				uint32 tmp = m_index.buff[i + 0]; 
+				m_index.buff[i + 0] = m_index.buff[i + 2];
+				m_index.buff[i + 2] = tmp;
+			}
+		}
+
+		break;
+
+	case GS_SPRITE_CLASS:
+
+		topology = D3DPT_TRIANGLELIST;
+
+		// each sprite converted to quad needs twice the space
+
+		while(m_vertex.tail * 2 > m_vertex.maxcount)
+		{
+			GrowVertexBuffer();
+		}
+
+		// assume vertices are tightly packed and sequentially indexed (it should be the case)
+
+		if(m_vertex.next >= 2)
+		{
+			size_t count = m_vertex.next;
+
+			// Expand in place from the last sprite to the first: each 2-vertex sprite at s
+			// becomes a 4-vertex quad at q plus 6 indices. v0/v1 are copied by value
+			// before q[] is written.
+			int i = (int)count * 2 - 4;
+			GSVertex* s = &m_vertex.buff[count - 2];
+			GSVertex* q = &m_vertex.buff[count * 2 - 4];
+			uint32* RESTRICT index = &m_index.buff[count * 3 - 6];
+		
+			for(; i >= 0; i -= 4, s -= 2, q -= 4, index -= 6) 
+			{
+				GSVertex v0 = s[0];
+				GSVertex v1 = s[1];
+
+				// The first corner takes color, Z and fog from the second vertex.
+				v0.RGBAQ = v1.RGBAQ;
+				v0.XYZ.Z = v1.XYZ.Z;
+				v0.FOG = v1.FOG;
+
+				q[0] = v0;
+				q[3] = v1;
+
+				// swap x, s, u
+
+				uint16 x = v0.XYZ.X;
+				v0.XYZ.X = v1.XYZ.X;
+				v1.XYZ.X = x;
+
+				// Note: this local float `s` shadows the outer GSVertex* `s` for the rest of
+				// the loop body; the loop increment still refers to the outer pointer.
+				float s = v0.ST.S;
+				v0.ST.S = v1.ST.S;
+				v1.ST.S = s;
+
+				uint16 u = v0.U;
+				v0.U = v1.U;
+				v1.U = u;
+
+				q[1] = v0;
+				q[2] = v1;
+
+				// Two triangles per quad: (0, 1, 2) and (1, 2, 3), relative to i.
+				index[0] = i + 0;
+				index[1] = i + 1;
+				index[2] = i + 2;
+				index[3] = i + 1;
+				index[4] = i + 2;
+				index[5] = i + 3;
+			}
+
+			m_vertex.head = m_vertex.tail = m_vertex.next = count * 2;
+			m_index.tail = count * 3;
+		}
+
+		break;
+
+	default:
+		__assume(0);
+	}
+
+	GSDevice9* dev = (GSDevice9*)m_dev;
+
+	(*dev)->SetRenderState(D3DRS_SHADEMODE, PRIM->IIP ? D3DSHADE_GOURAUD : D3DSHADE_FLAT); // TODO
+
+	void* ptr = NULL;
+
+	if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertexHW9), m_vertex.next))
+	{
+		GSVertex* RESTRICT s = (GSVertex*)m_vertex.buff;
+		GSVertexHW9* RESTRICT d = (GSVertexHW9*)ptr;
+
+		// Convert each GSVertex into the two float4 fields (p, t) of GSVertexHW9.
+		for(uint32 i = 0; i < m_vertex.next; i++, s++, d++)
+		{
+			GSVector4 p = GSVector4(GSVector4i::load(s->XYZ.u32[0]).upl16());
+
+			// Q is only packed next to Z when texturing with ST coordinates (TME && !FST).
+			if(PRIM->TME && !PRIM->FST)
+			{
+				p = p.xyxy(GSVector4((float)s->XYZ.Z, s->RGBAQ.Q));
+			}
+			else
+			{
+				p = p.xyxy(GSVector4::load((float)s->XYZ.Z));
+			}
+
+			GSVector4 t = GSVector4::zero();
+
+			if(PRIM->TME)
+			{
+				if(PRIM->FST)
+				{
+					// WildHack masks the UV with 0x3FEF3FEF before conversion.
+					if(UserHacks_WildHack && !isPackedUV_HackFlag)
+					{
+						t = GSVector4(GSVector4i::load(s->UV & 0x3FEF3FEF).upl16());
+						//printf("GSDX: %08X | D3D9(%d) %s\n", s->UV & 0x3FEF3FEF, m_vertex.next, i == 0 ? "*" : "");
+					}
+					else
+					{
+						t = GSVector4(GSVector4i::load(s->UV).upl16());
+					}
+				}
+				else
+				{
+					t = GSVector4::loadl(&s->ST);
+				}
+			}
+
+			// The upper half of t carries the raw RGBA bits and the fog value.
+			t = t.xyxy(GSVector4::cast(GSVector4i(s->RGBAQ.u32[0], s->FOG)));
+
+			d->p = p;
+			d->t = t;
+		}
+
+		dev->IAUnmapVertexBuffer();
+	}
+
+	dev->IASetIndexBuffer(m_index.buff, m_index.tail);
+
+	dev->IASetPrimitiveTopology(topology);
+}
+
+// Draws a quad covering the bounding box of the last batch using the m_fba
+// depth-stencil and blend states (alpha-only writes, stencil bit test).
+// Does nothing when rt is NULL.
+void GSRendererDX9::UpdateFBA(GSTexture* rt)
+{
+	if(rt == NULL)
+	{
+		return;
+	}
+
+	GSDevice9* dev = (GSDevice9*)m_dev;
+
+	dev->BeginScene();
+
+	// output merger
+
+	dev->OMSetDepthStencilState(&m_fba.dss);
+	dev->OMSetBlendState(&m_fba.bs, 0);
+
+	// input assembler
+
+	GSVector4 scale = GSVector4(rt->GetScale().x / rt->GetWidth(), rt->GetScale().y / rt->GetHeight());
+	GSVector4 bias = GSVector4(-1.0f, 1.0f);
+
+	// Grow the batch's min/max box by one unit on each side, scale it by
+	// scale/size of the target, and saturate it.
+	GSVector4 bounds = m_vt.m_min.p.xyxy(m_vt.m_max.p) + bias.xxyy();
+	GSVector4 src = (bounds * scale.xyxy()).sat(bias.zzyy());
+
+	// Remap with x * 2 - 1; Y is negated when the vertices are built below.
+	GSVector4 dst = src * 2.0f + bias.xxxx();
+
+	GSVertexPT1 quad[] =
+	{
+		{GSVector4(dst.x, -dst.y, 0.5f, 1.0f), GSVector2(0, 0)},
+		{GSVector4(dst.z, -dst.y, 0.5f, 1.0f), GSVector2(0, 0)},
+		{GSVector4(dst.x, -dst.w, 0.5f, 1.0f), GSVector2(0, 0)},
+		{GSVector4(dst.z, -dst.w, 0.5f, 1.0f), GSVector2(0, 0)},
+	};
+
+	dev->IASetVertexBuffer(quad, sizeof(quad[0]), countof(quad));
+	dev->IASetInputLayout(dev->m_convert.il);
+	dev->IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
+
+	// vertex shader
+
+	dev->VSSetShader(dev->m_convert.vs, NULL, 0);
+
+	// pixel shader
+
+	dev->PSSetShader(dev->m_convert.ps[4], NULL, 0);
+
+	// submit
+
+	dev->DrawPrimitive();
+
+	dev->EndScene();
+}
diff --git a/plugins/GSdx_legacy/GSRendererDX9.h b/plugins/GSdx_legacy/GSRendererDX9.h
new file mode 100644
index 0000000000..c7e3b8c8b2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererDX9.h
@@ -0,0 +1,45 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRendererDX.h"
+#include "GSVertexHW.h"
+#include "GSTextureCache9.h"
+
+// Direct3D 9 hardware renderer: supplies SetupIA() and UpdateFBA() on top of
+// GSRendererDX and owns the fixed pipeline states used by UpdateFBA().
+class GSRendererDX9 : public GSRendererDX
+{
+protected:
+	// Depth-stencil and blend states filled in by CreateDevice() and bound by UpdateFBA().
+	struct
+	{
+		Direct3DDepthStencilState9 dss;
+		Direct3DBlendState9 bs;
+	} m_fba;
+
+	void SetupIA();
+	void UpdateFBA(GSTexture* rt);
+
+public:
+	GSRendererDX9();
+	virtual ~GSRendererDX9() {}
+
+	// Returns false when device creation fails.
+	bool CreateDevice(GSDevice* dev);
+};
diff --git a/plugins/GSdx_legacy/GSRendererHW.cpp b/plugins/GSdx_legacy/GSRendererHW.cpp
new file mode 100644
index 0000000000..d4733bea24
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererHW.cpp
@@ -0,0 +1,1393 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererHW.h"
+
+// Takes ownership of the texture cache (deleted in the destructor) and loads
+// the upscaling and user-hack settings from the application configuration.
+GSRendererHW::GSRendererHW(GSTextureCache* tc)
+	: m_width(1280)
+	, m_height(1024)
+	, m_skip(0)
+	, m_reset(false)
+	, m_upscale_multiplier(1)
+	, m_tc(tc)
+{
+	m_upscale_multiplier = theApp.GetConfig("upscale_multiplier", 1);
+
+	// Skip-draw is only read when the UserHacks master switch is enabled.
+	if(!!theApp.GetConfig("UserHacks", 0))
+	{
+		m_userhacks_skipdraw = theApp.GetConfig("UserHacks_SkipDraw", 0);
+	}
+	else
+	{
+		m_userhacks_skipdraw = 0;
+	}
+
+	m_userhacks_align_sprite_X = !!theApp.GetConfig("UserHacks_align_sprite_X", 0) && !!theApp.GetConfig("UserHacks", 0);
+
+	// Same gating for the sprite offset rounding level.
+	if(!!theApp.GetConfig("UserHacks", 0))
+	{
+		m_userhacks_round_sprite_offset = theApp.GetConfig("UserHacks_round_sprite_offset", 0);
+	}
+	else
+	{
+		m_userhacks_round_sprite_offset = 0;
+	}
+
+	m_userhacks_disable_gs_mem_clear = theApp.GetConfig("UserHacks_DisableGsMemClear", 0) && theApp.GetConfig("UserHacks", 0);
+
+	if(m_upscale_multiplier == 0)
+	{
+		// A multiplier of 0 selects the custom resolution from the configuration.
+		m_width = theApp.GetConfig("resx", m_width);
+		m_height = theApp.GetConfig("resy", m_height);
+	}
+	else if(m_upscale_multiplier == 1)
+	{
+		// The sprite hacks are only needed for upscaling issues, so they are off at native scale.
+		m_userhacks_round_sprite_offset = 0;
+		m_userhacks_align_sprite_X = 0;
+	}
+}
+
+// Grows the render-target size (m_width / m_height) when upscaling is active
+// and the current size cannot hold the upscaled frame buffer. When it resizes,
+// partial cache entries are dropped and the new size is printed.
+void GSRendererHW::SetScaling()
+{
+	GSVector2i crtc_size(GetDisplayRect().width(), GetDisplayRect().height());
+
+	// The frame buffer width is always a multiple of 64, so it cannot express every display width.
+	// 480P and 576P use a width of 720, which FBW * 64 cannot represent, so it yields 704 (the closest multiple of 64).
+	// In such cases the CRTC width is used instead; 512 is the lower bound.
+	const int fbw_pixels = (int)m_context->FRAME.FBW * 64;
+	int fb_width = max({ fbw_pixels, crtc_size.x , 512 });
+
+	// The GS has no dedicated register for the frame buffer height, so the height comes from
+	// the display rectangle in case the game uses a larger value.
+	int fb_height;
+
+	if(fb_width < 1024)
+	{
+		fb_height = max(512, crtc_size.y);
+	}
+	else
+	{
+		fb_height = 1024;
+	}
+
+	int upscaled_fb_w = fb_width * m_upscale_multiplier;
+	int upscaled_fb_h = fb_height * m_upscale_multiplier;
+
+	// Native and custom resolutions never resize: the default size is enough for native and
+	// the custom size is read from the configuration.
+	if(m_upscale_multiplier <= 1)
+	{
+		return;
+	}
+
+	// Don't resize while the current target is already large enough.
+	if(m_width >= upscaled_fb_w && m_height >= upscaled_fb_h)
+	{
+		return;
+	}
+
+	m_tc->RemovePartial();
+	m_width = upscaled_fb_w;
+	m_height = upscaled_fb_h;
+	printf("Frame buffer size set to  %dx%d (%dx%d)\n", fb_width, fb_height , m_width, m_height);
+}
+
+// Releases the texture cache that was handed to the constructor.
+GSRendererHW::~GSRendererHW()
+{
+	delete m_tc;
+}
+
+// Forwards the CRC to the base renderer, then lets the hack table select its
+// per-game hooks from the resulting m_game.
+void GSRendererHW::SetGameCRC(uint32 crc, int options)
+{
+	GSRenderer::SetGameCRC(crc, options);
+
+	m_hacks.SetGameCRC(m_game);
+}
+
+// Reports whether upscaling may be applied right now. A per-game hook
+// (m_hacks.m_cu) can veto it; otherwise it requires a multiplier other than 1
+// and an enabled PMODE.
+bool GSRendererHW::CanUpscale()
+{
+	if(m_hacks.m_cu)
+	{
+		if(!(this->*m_hacks.m_cu)())
+		{
+			return false;
+		}
+	}
+
+	if(m_upscale_multiplier == 1)
+	{
+		return false;
+	}
+
+	// upscale ratio depends on the display size, with no output it may not be set correctly (ps2 logo to game transition)
+	return m_regs->PMODE.EN != 0;
+}
+
+// Returns the configured upscale multiplier, mapping the custom-resolution
+// setting (currently 0) to 1.
+int GSRendererHW::GetUpscaleMultiplier()
+{
+	if(!m_upscale_multiplier)
+	{
+		return 1;
+	}
+
+	return m_upscale_multiplier;
+}
+
+// Returns the internal rendering resolution: the display rectangle scaled by
+// the upscale multiplier, or m_width x m_height when the multiplier is 0.
+GSVector2i GSRendererHW::GetInternalResolution() {
+	GSVector2i dr(GetDisplayRect().width(), GetDisplayRect().height());
+
+	if (!m_upscale_multiplier) {
+		return GSVector2i(m_width, m_height);
+	}
+
+	const int scaled_w = dr.x * m_upscale_multiplier;
+	const int scaled_h = dr.y * m_upscale_multiplier;
+
+	return GSVector2i(scaled_w, scaled_h);
+}
+
+// Flags a pending texture-cache flush and resets the base renderer. The flush
+// itself is deferred: m_reset is consumed in VSync().
+void GSRendererHW::Reset()
+{
+	// TODO: GSreset can come from the main thread too => crash
+	// m_tc->RemoveAll();
+
+	m_reset = true;
+
+	GSRenderer::Reset();
+}
+
+// Per-vsync housekeeping: re-evaluates the render-target size, performs a
+// texture-cache flush requested by Reset(), runs the base VSync, ages the
+// cache, prints memory usage and clears the skip counter.
+void GSRendererHW::VSync(int field)
+{
+	//Check if the frame buffer width or display width has changed
+	SetScaling();
+
+	// Deferred flush requested by Reset().
+	if(m_reset)
+	{
+		m_tc->RemoveAll();
+
+		m_reset = false;
+	}
+
+	GSRenderer::VSync(field);
+
+	m_tc->IncAge();
+
+	m_tc->PrintMemoryUsage();
+	m_dev->PrintMemoryUsage();
+
+	m_skip = 0;
+}
+
+// Empties the texture cache before delegating the device reset to the base renderer.
+void GSRendererHW::ResetDevice()
+{
+	m_tc->RemoveAll();
+
+	GSRenderer::ResetDevice();
+}
+
+// Looks up the cached target that backs display circuit i and returns its
+// texture, or NULL when the cache has no matching target. In non-NDEBUG builds
+// a successful lookup may dump the texture to disk and advances s_n.
+GSTexture* GSRendererHW::GetOutput(int i)
+{
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	// Describe the display frame buffer as a TEX0 so the cache can look it up.
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = DISPFB.Block();
+	TEX0.TBW = DISPFB.FBW;
+	TEX0.PSM = DISPFB.PSM;
+
+	// TRACE(_T("[%d] GetOutput %d %05x (%d)\n"), (int)m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM);
+
+	GSTextureCache::Target* rt = m_tc->LookupTarget(TEX0, m_width, m_height, GetFrameRect(i).bottom);
+
+	if(!rt)
+	{
+		return NULL;
+	}
+
+	GSTexture* t = rt->m_texture;
+
+#ifndef NDEBUG
+	if(s_dump && s_savef && s_n >= s_saven)
+	{
+		t->Save(root_hw + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM));
+	}
+
+	s_n++;
+#endif
+
+	return t;
+}
+
+// Forwards an invalidation of rectangle r to the texture cache, using the
+// offset built from the destination fields (DBP, DBW, DPSM) of BITBLTBUF.
+void GSRendererHW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	// printf("[%d] InvalidateVideoMem %d,%d - %d,%d %05x (%d)\n", (int)m_perfmon.GetFrame(), r.left, r.top, r.right, r.bottom, (int)BITBLTBUF.DBP, (int)BITBLTBUF.DPSM);
+
+	m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
+}
+
+// Forwards an invalidation of rectangle r to the texture cache, using the
+// offset built from the source fields (SBP, SBW, SPSM) of BITBLTBUF.
+// Requests flagged as clut are currently ignored.
+void GSRendererHW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	// printf("[%d] InvalidateLocalMem %d,%d - %d,%d %05x (%d)\n", (int)m_perfmon.GetFrame(), r.left, r.top, r.right, r.bottom, (int)BITBLTBUF.SBP, (int)BITBLTBUF.SPSM);
+
+	// FIXME: the clut case is not handled
+	if(!clut)
+	{
+		m_tc->InvalidateLocalMem(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r);
+	}
+}
+
+// Linearly blends t0 and t1 with weight alpha (alpha = 0 gives t0, alpha = 1
+// gives t1), truncates to int and clears the low 4 bits of the result.
+int GSRendererHW::Interpolate_UV(float alpha, int t0, int t1)
+{
+	const float blended = (1.0f - alpha) * t0 + alpha * t1;
+	const int truncated = (int)blended;
+
+	return truncated & ~0xF; // cheap rounding
+}
+
+// Returns the interpolation factor, relative to a span of length L starting at
+// X0, of X0 rounded up to the next multiple of 16.
+//
+// L  - span length used as the divisor
+// X0 - span start
+// X1 - span end (unused here; kept for a signature parallel to alpha1)
+//
+// A zero-length span returns 0 instead of dividing by zero: the resulting
+// inf/NaN would otherwise be converted to int by the caller, which is undefined.
+float GSRendererHW::alpha0(int L, int X0, int X1)
+{
+	if(L == 0)
+	{
+		return 0.0f;
+	}
+
+	float x = (X0 + 15) & ~0xF; // Round up
+	return (x - X0) / (float)L;
+}
+
+// Returns the interpolation factor, relative to a span of length L starting at
+// X0, of (X1 - 1) rounded down to a multiple of 16.
+//
+// L  - span length used as the divisor
+// X0 - span start
+// X1 - span end (exclusive)
+//
+// A zero-length span returns 0 instead of dividing by zero: the resulting
+// inf/NaN would otherwise be converted to int by the caller, which is undefined.
+float GSRendererHW::alpha1(int L, int X0, int X1)
+{
+	if(L == 0)
+	{
+		return 0.0f;
+	}
+
+	float x = (X1 - 1) & ~0xF; // Round down. Note -1 because right pixel isn't included in primitive so 0x100 must return 0.
+	return (x - X0) / (float)L;
+}
+
+// Adjusts the U/V coordinates of every sprite (vertex pair) in the vertex
+// buffer. With linear == false the coordinates are replaced by rounded values
+// computed for the first and last covered positions; with linear == true the
+// second vertex's U/V is reduced by 8 when the texture span is positive and at
+// most 32 units longer than the position span.
+template <bool linear>
+void GSRendererHW::RoundSpriteOffset()
+{
+//#define DEBUG_U
+//#define DEBUG_V
+#if defined(DEBUG_V) || defined(DEBUG_U)
+	bool debug = linear;
+#endif
+	size_t count = m_vertex.next;
+	GSVertex* v = &m_vertex.buff[0];
+
+	// Vertices are consumed in pairs: v[i] and v[i+1] form one sprite.
+	for(size_t i = 0; i < count; i += 2) {
+		// Performance note: if it had any impact on perf, someone would port it to SSE (AKA GSVector)
+
+		// Compute the coordinate of first and last texels (in native with a linear filtering)
+		int   ox  = m_context->XYOFFSET.OFX;
+		int   X0  = v[i].XYZ.X   - ox;
+		int   X1  = v[i+1].XYZ.X - ox;
+		int   Lx  = (v[i+1].XYZ.X - v[i].XYZ.X);
+		float ax0 = alpha0(Lx, X0, X1);
+		float ax1 = alpha1(Lx, X0, X1);
+		int   tx0 = Interpolate_UV(ax0, v[i].U, v[i+1].U);
+		int   tx1 = Interpolate_UV(ax1, v[i].U, v[i+1].U);
+#ifdef DEBUG_U
+		if (debug) {
+			fprintf(stderr, "u0:%d and u1:%d\n", v[i].U, v[i+1].U);
+			fprintf(stderr, "a0:%f and a1:%f\n", ax0, ax1);
+			fprintf(stderr, "t0:%d and t1:%d\n", tx0, tx1);
+		}
+#endif
+
+		// Same computation for the vertical axis.
+		int   oy  = m_context->XYOFFSET.OFY;
+		int   Y0  = v[i].XYZ.Y   - oy;
+		int   Y1  = v[i+1].XYZ.Y - oy;
+		int   Ly  = (v[i+1].XYZ.Y - v[i].XYZ.Y);
+		float ay0 = alpha0(Ly, Y0, Y1);
+		float ay1 = alpha1(Ly, Y0, Y1);
+		int   ty0 = Interpolate_UV(ay0, v[i].V, v[i+1].V);
+		int   ty1 = Interpolate_UV(ay1, v[i].V, v[i+1].V);
+#ifdef DEBUG_V
+		if (debug) {
+			fprintf(stderr, "v0:%d and v1:%d\n", v[i].V, v[i+1].V);
+			fprintf(stderr, "a0:%f and a1:%f\n", ay0, ay1);
+			fprintf(stderr, "t0:%d and t1:%d\n", ty0, ty1);
+		}
+#endif
+
+#ifdef DEBUG_U
+		if (debug)
+			fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].U, v[i+1].U);
+#endif
+#ifdef DEBUG_V
+		if (debug)
+			fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].V, v[i+1].V);
+#endif
+
+#if 1
+		// Use the rounded value of the newly computed texture coordinate. It ensures
+		// that sampling will remain inside the texture boundary.
+		//
+		// Note for bilinear: by definition it will never work correctly! A slight modification
+		// of the interpolation might trigger a discard (with alpha testing).
+		// Let's use something simple that corrects the really bad cases (for a couple of 2D games).
+		// I hope it won't create too many glitches.
+		if (linear) {
+			int Lu = v[i+1].U - v[i].U;
+			// Note 32 is based on taisho-mononoke
+			if ((Lu > 0) && (Lu <= (Lx+32))) {
+				v[i+1].U -= 8;
+			}
+		} else {
+			// Increasing U: extend the end by 16; decreasing U: extend the start by 15.
+			if (tx0 <= tx1) {
+				v[i].U   = tx0;
+				v[i+1].U = tx1 + 16;
+			} else {
+				v[i].U   = tx0 + 15;
+				v[i+1].U = tx1;
+			}
+		}
+#endif
+#if 1
+		// Same adjustment for V.
+		if (linear) {
+			int Lv = v[i+1].V - v[i].V;
+			if ((Lv > 0) && (Lv <= (Ly+32))) {
+				v[i+1].V -= 8;
+			}
+		} else {
+			if (ty0 <= ty1) {
+				v[i].V   = ty0;
+				v[i+1].V = ty1 + 16;
+			} else {
+				v[i].V   = ty0 + 15;
+				v[i+1].V = ty1;
+			}
+		}
+#endif
+
+#ifdef DEBUG_U
+		if (debug)
+			fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].U, v[i+1].U);
+#endif
+#ifdef DEBUG_V
+		if (debug)
+			fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].V, v[i+1].V);
+#endif
+
+	}
+}
+
+// Renders the currently queued primitives through the hardware path.
+// Steps, in order: bail out on a lost device or a frame flagged as bad; look up
+// the render target and depth target in the texture cache; look up the source
+// texture when texturing is enabled; run the per-game "oi" hook; try to drop
+// the alpha test; apply the sprite upscaling hacks; call DrawPrims(); restore
+// the registers changed for the draw; update the targets' valid rectangles and
+// invalidate overlapping cache entries; run the per-game "oo" hook.
+// Non-NDEBUG builds additionally dump textures/targets and advance s_n.
+void GSRendererHW::Draw()
+{
+	if(m_dev->IsLost() || GSRenderer::IsBadFrame(m_skip, m_userhacks_skipdraw)) {
+		GL_INS("Warning skipping a draw call (%d)", s_n);
+		s_n += 3; // Keep it sync with SW renderer
+		return;
+	}
+	GL_PUSH("HW Draw %d", s_n);
+
+	GSDrawingEnvironment& env = m_env;
+	GSDrawingContext* context = m_context;
+
+	// It is allowed to use the depth and rt at the same location. However at least 1 must
+	// be disabled.
+	// 1/ GoW uses a Cd blending on a 24 bits buffer (no alpha)
+	// 2/ SuperMan really draws (0,0,0,0) color and a (0) 32-bits depth
+	// 3/ 50cents really draws (0,0,0,128) color and a (0) 24 bits depth
+	// Note: FF DoC has both buffer at same location but disable the depth test (write?) with ZTE = 0
+	const bool no_rt = (context->ALPHA.IsCd() && PRIM->ABE && (context->FRAME.PSM == 1));
+	const bool no_ds = !no_rt && (
+			// Depth is always pass (no read) and write are discarded (tekken 5).  (Note: DATE is currently implemented with a stencil buffer)
+			(context->ZBUF.ZMSK && m_context->TEST.ZTST == ZTST_ALWAYS && !m_context->TEST.DATE) ||
+			// Depth will be written through the RT
+			(context->FRAME.FBP == context->ZBUF.ZBP && !PRIM->TME && !context->ZBUF.ZMSK && !context->FRAME.FBMSK && context->TEST.ZTE)
+			);
+
+	// TEX0 is reused as the lookup key: first for the frame buffer, then for the Z buffer.
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = context->FRAME.Block();
+	TEX0.TBW = context->FRAME.FBW;
+	TEX0.PSM = context->FRAME.PSM;
+
+	GSTextureCache::Target* rt = no_rt ? NULL : m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::RenderTarget, true);
+	GSTexture* rt_tex = rt ? rt->m_texture : NULL;
+
+	TEX0.TBP0 = context->ZBUF.Block();
+	TEX0.TBW = context->FRAME.FBW;
+	TEX0.PSM = context->ZBUF.PSM;
+
+	GSTextureCache::Target* ds = no_ds ? NULL : m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, context->DepthWrite());
+	GSTexture* ds_tex = ds ? ds->m_texture : NULL;
+
+	// A target that was requested but could not be obtained aborts the draw.
+	if(!(rt || no_rt) || !(ds || no_ds))
+	{
+		GL_POP();
+		ASSERT(0);
+		return;
+	}
+
+	GSTextureCache::Source* tex = NULL;
+	m_texture_shuffle = false;
+
+	if(PRIM->TME)
+	{
+		/*
+		
+		// m_tc->LookupSource will mess with the palette, should not, but we do this after, until it is sorted out
+
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+
+		*/
+
+		GSVector4i r;
+
+		GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
+
+		tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
+
+		// No source texture available: give up on this draw.
+		if(!tex) {
+			GL_POP();
+			return;
+		}
+
+		// FIXME: Could be removed on openGL
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+
+		// Hypothesis: texture shuffle is used as a postprocessing effect so texture will be an old target.
+		// Initially code also tested the RT but it gives too much false-positive
+		//
+		// Both input and output are 16 bits and texture was initially 32 bits!
+		m_texture_shuffle = (context->FRAME.PSM & 0x2) && ((context->TEX0.PSM & 3) == 2) && (m_vt.m_primclass == GS_SPRITE_CLASS) && tex->m_32_bits_fmt;
+
+		// Texture shuffle is not yet supported with strange clamp mode
+		ASSERT(!m_texture_shuffle || (context->CLAMP.WMS < 3 && context->CLAMP.WMT < 3));
+	}
+	if (rt) {
+		// Be sure texture shuffle detection is properly propagated
+		// Otherwise set or clear the flag (Code in texture cache only set the flag)
+		// Note: it is important to clear the flag when RT is used as a real 16 bits target.
+		rt->m_32_bits_fmt = m_texture_shuffle || !(context->FRAME.PSM & 0x2);
+	}
+
+#ifndef NDEBUG
+	// Debug dump of the register state, source texture/palette and the targets before the draw.
+	// s_n advances by 2 in this section whether or not dumping is enabled.
+	if(s_dump)
+	{
+		uint64 frame = m_perfmon.GetFrame();
+
+		string s;
+
+		if (s_n >= s_saven) {
+			// Dump Register state
+			s = format("%05d_context.txt", s_n);
+
+			m_env.Dump(root_hw+s);
+			m_context->Dump(root_hw+s);
+		}
+
+		if(s_savet && s_n >= s_saven && tex)
+		{
+			s = format("%05d_f%lld_tex_%05x_%d_%d%d_%02x_%02x_%02x_%02x.dds",
+				s_n, frame, (int)context->TEX0.TBP0, (int)context->TEX0.PSM,
+				(int)context->CLAMP.WMS, (int)context->CLAMP.WMT,
+				(int)context->CLAMP.MINU, (int)context->CLAMP.MAXU,
+				(int)context->CLAMP.MINV, (int)context->CLAMP.MAXV);
+
+			tex->m_texture->Save(root_hw+s, false, true);
+
+			if(tex->m_palette)
+			{
+				s = format("%05d_f%lld_tpx_%05x_%d.dds", s_n, frame, context->TEX0.CBP, context->TEX0.CPSM);
+
+				tex->m_palette->Save(root_hw+s, false, true);
+			}
+		}
+
+		s_n++;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM);
+
+			if (rt)
+				rt->m_texture->Save(root_hw+s);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM);
+
+			if (ds_tex)
+				ds_tex->Save(root_hw+s);
+		}
+
+		s_n++;
+
+	} else {
+		s_n += 2;
+	}
+#endif
+
+	// Per-game hook; a false return cancels the draw.
+	if(m_hacks.m_oi && !(this->*m_hacks.m_oi)(rt_tex, ds_tex, tex))
+	{
+		s_n += 1; // keep counter sync
+		GL_POP();
+		return;
+	}
+
+	if (!m_userhacks_disable_gs_mem_clear) {
+		OI_GsMemClear();
+	}
+
+	// skip alpha test if possible
+
+	// Saved copies: TEST, FRAME and ZBUF are modified below and restored after DrawPrims().
+	GIFRegTEST TEST = context->TEST;
+	GIFRegFRAME FRAME = context->FRAME;
+	GIFRegZBUF ZBUF = context->ZBUF;
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
+
+	if(context->TEST.ATE && context->TEST.ATST != ATST_ALWAYS)
+	{
+		if(GSRenderer::TryAlphaTest(fm, zm))
+		{
+			context->TEST.ATST = ATST_ALWAYS;
+		}
+	}
+
+	context->FRAME.FBMSK = fm;
+	context->ZBUF.ZMSK = zm != 0;
+
+	// A couple of hack to avoid upscaling issue. So far it seems to impacts mostly sprite
+	if ((m_upscale_multiplier > 1) && (m_vt.m_primclass == GS_SPRITE_CLASS)) {
+		size_t count = m_vertex.next;
+		GSVertex* v = &m_vertex.buff[0];
+
+		// Hack to avoid vertical black line in various games (ace combat/tekken)
+		if (m_userhacks_align_sprite_X) {
+			// Note for performance reason I do the check only once on the first
+			// primitive
+			int win_position = v[1].XYZ.X - context->XYOFFSET.OFX;
+			const bool unaligned_position = ((win_position & 0xF) == 8);
+			const bool unaligned_texture  = ((v[1].U & 0xF) == 0) && PRIM->FST; // I'm not sure this check is useful
+			const bool hole_in_vertex = (count < 4) || (v[1].XYZ.X != v[2].XYZ.X);
+			if (hole_in_vertex && unaligned_position && (unaligned_texture || !PRIM->FST)) {
+				// Normaly vertex are aligned on full pixels and texture in half
+				// pixels. Let's extend the coverage of an half-pixel to avoid
+				// hole after upscaling
+				for(size_t i = 0; i < count; i += 2) {
+					v[i+1].XYZ.X += 8;
+					// I really don't know if it is a good idea. Neither what to do for !PRIM->FST
+					if (unaligned_texture)
+						v[i+1].U += 8;
+				}
+			}
+		}
+
+		// Level 1 only rounds when sampling is not linear; levels above 1 always round.
+		if (PRIM->FST) {
+			if ((m_userhacks_round_sprite_offset > 1) || (m_userhacks_round_sprite_offset == 1 && !m_vt.IsLinear())) {
+				if (m_vt.IsLinear())
+					RoundSpriteOffset<true>();
+				else
+					RoundSpriteOffset<false>();
+			}
+		} else {
+			; // vertical line in Yakuza (note check m_userhacks_align_sprite_X behavior)
+		}
+	}
+
+	//
+
+	DrawPrims(rt_tex, ds_tex, tex);
+
+	//
+
+	// Restore the registers that were modified for this draw.
+	context->TEST = TEST;
+	context->FRAME = FRAME;
+	context->ZBUF = ZBUF;
+
+	//
+
+	// Drawn area: the vertex bounding box clipped by the scissor.
+	GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(context->scissor.in));
+
+	// Help to detect rendering outside of the framebuffer
+#if _DEBUG
+	if (m_upscale_multiplier * r.z > m_width) {
+		GL_INS("ERROR: RT width is too small only %d but require %d", m_width, m_upscale_multiplier * r.z);
+	}
+	if (m_upscale_multiplier * r.w > m_height) {
+		GL_INS("ERROR: RT height is too small only %d but require %d", m_height, m_upscale_multiplier * r.w);
+	}
+#endif
+
+	// Color was written (mask not fully set): extend the valid area and invalidate overlapping entries.
+	if(fm != 0xffffffff && rt)
+	{
+		rt->m_valid = rt->m_valid.runion(r);
+
+		m_tc->InvalidateVideoMem(context->offset.fb, r, false);
+
+		m_tc->InvalidateVideoMemType(GSTextureCache::DepthStencil, context->FRAME.Block());
+	}
+
+	// Same bookkeeping for the depth target.
+	if(zm != 0xffffffff && ds)
+	{
+		ds->m_valid = ds->m_valid.runion(r);
+
+		m_tc->InvalidateVideoMem(context->offset.zb, r, false);
+
+		m_tc->InvalidateVideoMemType(GSTextureCache::RenderTarget, context->ZBUF.Block());
+	}
+
+	//
+
+	// Per-game hook that runs after the draw.
+	if(m_hacks.m_oo)
+	{
+		(this->*m_hacks.m_oo)();
+	}
+
+#ifndef NDEBUG
+	// Debug dump of the targets after the draw; s_n advances by 1 either way.
+	if(s_dump)
+	{
+		uint64 frame = m_perfmon.GetFrame();
+
+		string s;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM);
+
+			if (rt)
+				rt->m_texture->Save(root_hw+s);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM);
+
+			if (ds_tex)
+				ds_tex->Save(root_hw+s);
+		}
+
+		s_n++;
+
+		// Stop dumping once more than s_savel draws past s_saven have been dumped.
+		if(s_savel > 0 && (s_n - s_saven) > s_savel)
+		{
+			s_dump = 0;
+		}
+	} else {
+		s_n += 1;
+	}
+#endif
+
+	#ifdef DISABLE_HW_TEXTURE_CACHE
+
+	if (rt)
+		m_tc->Read(rt, r);
+
+	#endif
+
+	GL_POP();
+}
+
+// hacks: the constructor registers the per-game handlers. Each HackEntry is built from a CRC
+// title, a CRC region (CRC::RegionCount presumably means "any region" - TODO confirm) and a handler.
+GSRendererHW::Hacks::Hacks()
+	: m_oi_map(m_oi_list)
+	, m_oo_map(m_oo_list)
+	, m_cu_map(m_cu_list)
+	, m_oi(NULL)
+	, m_oo(NULL)
+	, m_cu(NULL)
+{
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::FFXII, CRC::EU, &GSRendererHW::OI_FFXII));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::FFX, CRC::RegionCount, &GSRendererHW::OI_FFX));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::MetalSlug6, CRC::RegionCount, &GSRendererHW::OI_MetalSlug6));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::GodOfWar2, CRC::RegionCount, &GSRendererHW::OI_GodOfWar2));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SimpsonsGame, CRC::RegionCount, &GSRendererHW::OI_SimpsonsGame));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::RozenMaidenGebetGarden, CRC::RegionCount, &GSRendererHW::OI_RozenMaidenGebetGarden));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpidermanWoS, CRC::RegionCount, &GSRendererHW::OI_SpidermanWoS));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TyTasmanianTiger, CRC::RegionCount, &GSRendererHW::OI_TyTasmanianTiger));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TyTasmanianTiger2, CRC::RegionCount, &GSRendererHW::OI_TyTasmanianTiger));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::DigimonRumbleArena2, CRC::RegionCount, &GSRendererHW::OI_DigimonRumbleArena2));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::StarWarsForceUnleashed, CRC::RegionCount, &GSRendererHW::OI_StarWarsForceUnleashed));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::BlackHawkDown, CRC::RegionCount, &GSRendererHW::OI_BlackHawkDown));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::XmenOriginsWolverine, CRC::RegionCount, &GSRendererHW::OI_XmenOriginsWolverine));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::CallofDutyFinalFronts, CRC::RegionCount, &GSRendererHW::OI_CallofDutyFinalFronts));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpyroNewBeginning, CRC::RegionCount, &GSRendererHW::OI_SpyroNewBeginning));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpyroEternalNight, CRC::RegionCount, &GSRendererHW::OI_SpyroEternalNight));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TalesOfLegendia, CRC::RegionCount, &GSRendererHW::OI_TalesOfLegendia));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SMTNocturne, CRC::RegionCount, &GSRendererHW::OI_SMTNocturne));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SuperManReturns, CRC::RegionCount, &GSRendererHW::OI_SuperManReturns));
+	// OO handlers: called by Draw() once the primitives have been drawn
+	m_oo_list.push_back(HackEntry<OO_Ptr>(CRC::DBZBT2, CRC::RegionCount, &GSRendererHW::OO_DBZBT2));
+	m_oo_list.push_back(HackEntry<OO_Ptr>(CRC::MajokkoALaMode2, CRC::RegionCount, &GSRendererHW::OO_MajokkoALaMode2));
+	// CU handlers: "can upscale" checks
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::DBZBT2, CRC::RegionCount, &GSRendererHW::CU_DBZBT2));
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::MajokkoALaMode2, CRC::RegionCount, &GSRendererHW::CU_MajokkoALaMode2));
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::TalesOfAbyss, CRC::RegionCount, &GSRendererHW::CU_TalesOfAbyss));
+}
+
+void GSRendererHW::Hacks::SetGameCRC(const CRC::Game& game)
+{
+	const uint32 key = (uint32)((game.region << 24) | game.title);
+
+	m_oi = m_oi_map[key];
+	m_oo = m_oo_map[key];
+	m_cu = m_cu_map[key];
+
+	// Games flagged PointListPalette get the generic palette handler.
+	if (game.flags & CRC::PointListPalette) {
+		ASSERT(m_oi == NULL);
+		m_oi = &GSRendererHW::OI_PointListPalette;
+	}
+
+	// Optional user hack, only used when no OI handler was selected above.
+	// FIXME: Enable this code in the future. I think it could replace most of the "old"
+	// OI hack. So far code was tested on GoW2 & SimpsonsGame with success
+	const bool overlap_clear = theApp.GetConfig("UserHacks_ColorDepthClearOverlap", 0) && theApp.GetConfig("UserHacks", 0);
+	if (overlap_clear && !m_oi) {
+		m_oi = &GSRendererHW::OI_DoubleHalfClear;
+	}
+}
+// OI hack (user option): when a zero color/depth sprite clear targets frame and Z buffers that start close to each other, clear the other buffer directly. Always returns true.
+bool GSRendererHW::OI_DoubleHalfClear(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	if ((m_vt.m_primclass == GS_SPRITE_CLASS) && !PRIM->TME && !m_context->ZBUF.ZMSK && (m_context->FRAME.FBW >= 7) && rt) {
+		GSVertex* v = &m_vertex.buff[0];
+
+		//GL_INS("OI_DoubleHalfClear: psm:%x. Z:%d R:%d G:%d B:%d A:%d", m_context->FRAME.PSM,
+		//		v[1].XYZ.Z, v[1].RGBAQ.R, v[1].RGBAQ.G, v[1].RGBAQ.B, v[1].RGBAQ.A);
+
+		// Only handle a clear: depth and color of v[1] must all be zero (only the first primitive is checked)
+		if (v[1].XYZ.Z || v[1].RGBAQ.R || v[1].RGBAQ.G || v[1].RGBAQ.B || v[1].RGBAQ.A) {
+			return true;
+		}
+		// Only 32 bits format is supported otherwise it is complicated
+		if (m_context->FRAME.PSM & 2)
+			return true;
+
+		// FIXME might need some rounding
+		// In 32 bits, pages are 64x32 pixels. In theory, it must be something
+		// like FBW * 64 pixels * ratio / 32 pixels / 2 = FBW * ratio
+		// It is hard to predict the ratio, so I round it to 1. And I use
+		// <= comparison below.
+		uint32 h_pages  = m_context->FRAME.FBW;
+		// base is the lower of the two buffer pointers, half the higher one
+		uint32 base;
+		uint32 half;
+		if (m_context->FRAME.FBP > m_context->ZBUF.ZBP) {
+			base = m_context->ZBUF.ZBP;
+			half = m_context->FRAME.FBP;
+		} else {
+			base = m_context->FRAME.FBP;
+			half = m_context->ZBUF.ZBP;
+		}
+		// The higher buffer must start no further than h_pages * FBW after the lower one
+		if (half <= (base + h_pages * m_context->FRAME.FBW)) {
+			//GL_INS("OI_DoubleHalfClear: base %x half %x. h_pages %d fbw %d", base, half, h_pages, m_context->FRAME.FBW);
+			if (m_context->FRAME.FBP > m_context->ZBUF.ZBP) {
+				m_dev->ClearDepth(ds, 0);
+			} else {
+				m_dev->ClearRenderTarget(rt, 0);
+			}
+			// Don't return false, it will break the rendering. I guess that it misses texture
+			// invalidation
+			//return false;
+		}
+	}
+	return true;
+}
+
+// Mirrors an untextured, constant-zero sprite clear into GS local memory. Note: hack is safe, but it could impact the perf a little (normally games do only a couple of clear by frame)
+void GSRendererHW::OI_GsMemClear()
+{
+	// Rectangle draw without texture
+	if ((m_vt.m_primclass == GS_SPRITE_CLASS) && (m_vertex.next == 2) && !PRIM->TME && !PRIM->ABE // Direct write
+			&& !m_context->TEST.ATE // no alpha test
+			&& (!m_context->TEST.ZTE || m_context->TEST.ZTST == ZTST_ALWAYS) // no depth test
+			&& (m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(0))) // Constant 0 write
+			) {
+		GL_INS("OI_GsMemClear");
+		GSOffset* off = m_context->offset.fb;
+		GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(m_context->scissor.in));
+
+		int format = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt;
+		// Each branch mirrors the WritePixel variant named in its comment.
+		if (format == 0) {
+			// Based on WritePixel32
+			for(int y = r.top; y < r.bottom; y++)
+			{
+				uint32* RESTRICT d = &m_mem.m_vm32[off->pixel.row[y]];
+				int* RESTRICT col = off->pixel.col[0];
+
+				for(int x = r.left; x < r.right; x++)
+				{
+					d[col[x]] = 0; // Here the constant color
+				}
+			}
+		} else if (format == 1) {
+			// Based on WritePixel24
+			for(int y = r.top; y < r.bottom; y++)
+			{
+				uint32* RESTRICT d = &m_mem.m_vm32[off->pixel.row[y]];
+				int* RESTRICT col = off->pixel.col[0];
+
+				for(int x = r.left; x < r.right; x++)
+				{
+					d[col[x]] &= 0xff000000; // Clear the color, keep the top byte
+				}
+			}
+		} else if (format == 2) {
+			; // Hack is used for FMV which are likely 24/32 bits. Let's keep the code below for reference
+#if 0
+			// Based on WritePixel16 (16 bits pixels, hence the uint16 pointer)
+			for(int y = r.top; y < r.bottom; y++)
+			{
+				uint16* RESTRICT d = &m_mem.m_vm16[off->pixel.row[y]];
+				int* RESTRICT col = off->pixel.col[0];
+
+				for(int x = r.left; x < r.right; x++)
+				{
+					d[col[x]] = 0; // Here the constant color
+				}
+			}
+#endif
+		}
+	}
+}
+
+// OI (others input?/implementation?) hacks replace current draw call
+// OI_FFXII: the FMV arrives as point lists that are captured into a CPU buffer, then shown with two triangles.
+bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	static uint32* video = NULL; // allocated on the first capture and never released
+	static size_t lines = 0;
+	// lines == 0 means the frame height is not known yet; it is learnt from a 448 or 512 line draw.
+	if(lines == 0)
+	{
+		if(m_vt.m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2))
+		{
+			lines = m_vertex.next / 2;
+		}
+	}
+	else
+	{
+		if(m_vt.m_primclass == GS_POINT_CLASS)
+		{
+			if(m_vertex.next >= 16 * 512)
+			{
+				// incoming pixels are stored in columns, one column is 16x512, total res 448x512 or 448x454
+
+				if(!video) video = new uint32[512 * 512];
+
+				int ox = m_context->XYOFFSET.OFX - 8;
+				int oy = m_context->XYOFFSET.OFY - 8;
+
+				const GSVertex* RESTRICT v = m_vertex.buff;
+
+				for(int i = (int)m_vertex.next; i > 0; i--, v++)
+				{
+					int x = (v->XYZ.X - ox) >> 4;
+					int y = (v->XYZ.Y - oy) >> 4;
+
+					if (x < 0 || x >= 448 || y < 0 || y >= (int)lines) return false; // le sigh
+
+					video[(y << 8) + (y << 7) + (y << 6) + x] = v->RGBAQ.u32[0]; // index is y * 448 + x
+				}
+
+				return false;
+			}
+			else
+			{
+				lines = 0;
+			}
+		}
+		else if(m_vt.m_primclass == GS_LINE_CLASS)
+		{
+			if(video && t && m_vertex.next == lines * 2)
+			{
+				// normally, this step would copy the video onto screen with 512 texture mapped horizontal lines,
+				// but we use the stored video data to create a new texture, and replace the lines with two triangles
+				// (video and t are checked above: without captured pixels or a source texture there is nothing to build)
+				m_dev->Recycle(t->m_texture);
+
+				t->m_texture = m_dev->CreateTexture(512, 512);
+
+				t->m_texture->Update(GSVector4i(0, 0, 448, (int)lines), video, 448 * 4);
+
+				m_vertex.buff[2] = m_vertex.buff[m_vertex.next - 2];
+				m_vertex.buff[3] = m_vertex.buff[m_vertex.next - 1];
+
+				m_index.buff[0] = 0;
+				m_index.buff[1] = 1;
+				m_index.buff[2] = 2;
+				m_index.buff[3] = 1;
+				m_index.buff[4] = 2;
+				m_index.buff[5] = 3;
+
+				m_vertex.head = m_vertex.tail = m_vertex.next = 4;
+				m_index.tail = 6;
+
+				m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS);
+			}
+			else
+			{
+				lines = 0;
+			}
+		}
+	}
+
+	return true;
+}
+
+bool GSRendererHW::OI_FFX(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// random battle transition: the z buffer is written directly, so the depth
+	// buffer is cleared now. The function returns true in every case.
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 depth_block = m_context->ZBUF.Block();
+	const bool frame_match = (frame_block == 0x00d00 || frame_block == 0x00000);
+	const bool depth_match = frame_match && depth_block == 0x02100;
+	const bool tex_match = depth_match && PRIM->TME && m_context->TEX0.TBP0 == 0x01a00 && m_context->TEX0.PSM == PSM_PSMCT16S;
+	if(tex_match)
+	{
+		m_dev->ClearDepth(ds, 0);
+	}
+	return true;
+}
+
+bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// Missing red channel fix (looks alright in pcsx2 r5000+): a vertex whose red is zero
+	// while green and blue are not gets the rounded-up average of green and blue as red.
+
+	GSVertex* RESTRICT vtx = m_vertex.buff;
+	const int count = (int)m_vertex.next;
+
+	for(int n = 0; n < count; n++)
+	{
+		const uint32 color = vtx[n].RGBAQ.u32[0];
+
+		const uint32 red = color & 0xff;
+		const uint32 green = (color >> 8) & 0xff;
+		const uint32 blue = (color >> 16) & 0xff;
+
+		if(red != 0 || green == 0 || blue == 0) continue;
+		vtx[n].RGBAQ.u32[0] = (color & 0xffffff00) | ((green + blue + 1) >> 1);
+	}
+
+	// The colors may have changed, so refresh the vertex trace.
+	m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt.m_primclass);
+	return true;
+}
+
+bool GSRendererHW::OI_GodOfWar2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_width = m_context->FRAME.FBW;
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	// ntsc 0xf00, pal 0x100, ntsc "HD" 0x1280
+	const bool known_block = (frame_block == 0x00f00 || frame_block == 0x00100 || frame_block == 0x01280);
+
+	if(!known_block || frame_psm != PSM_PSMZ24)
+	{
+		return true;
+	}
+
+	// z buffer clear: the frame settings are looked up as a depth target,
+	// that target is cleared and false is returned.
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = frame_block;
+	TEX0.TBW = frame_width;
+	TEX0.PSM = frame_psm;
+
+	GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+	if(depth) m_dev->ClearDepth(depth->m_texture, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_SimpsonsGame(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	// 0x1800 pal, 0x1500 ntsc
+	if(frame_psm != PSM_PSMZ24 || (frame_block != 0x01500 && frame_block != 0x01800))
+	{
+		return true;
+	}
+
+	// Instead of simply drawing a full height 512x512 sprite to clear the z buffer,
+	// the game uses a 512x256 sprite only, yet it still fills the whole surface with zeros
+	// by using a render target that overlaps with the lower half of the z buffer.
+	// So the whole depth buffer is cleared here.
+	// TODO: tony hawk pro skater 4 same problem, the empty half is not visible though, painted over fully
+
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_RozenMaidenGebetGarden(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// Only untextured draws are handled.
+	if(PRIM->TME)
+	{
+		return true;
+	}
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 depth_block = m_context->ZBUF.Block();
+
+	if(frame_block == 0x008c0 && depth_block == 0x01a40)
+	{
+		//  frame buffer clear, atst = fail, afail = write z only, z buffer points to frame buffer
+
+		GIFRegTEX0 TEX0;
+
+		TEX0.TBP0 = depth_block;
+		TEX0.TBW = m_context->FRAME.FBW;
+		TEX0.PSM = m_context->FRAME.PSM;
+
+		GSTextureCache::Target* color = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::RenderTarget, true);
+		if(color) m_dev->ClearRenderTarget(color->m_texture, 0);
+
+		return false;
+	}
+
+	if(frame_block == 0x00000 && depth_block == 0x01180)
+	{
+		// z buffer clear, frame buffer now points to the z buffer (how can they be so clever?)
+
+		GIFRegTEX0 TEX0;
+
+		TEX0.TBP0 = frame_block;
+		TEX0.TBW = m_context->FRAME.FBW;
+		TEX0.PSM = m_context->ZBUF.PSM;
+
+		GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+		if(depth) m_dev->ClearDepth(depth->m_texture, 0);
+
+		return false;
+	}
+
+	return true;
+}
+
+bool GSRendererHW::OI_SpidermanWoS(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// Only the top half of the screen clears, so the whole depth buffer is cleared
+	// when one of the known frame buffers is targeted (0x2800 pal, 0x25a0 ntsc).
+	const uint32 frame_block = m_context->FRAME.Block();
+	const bool known_block = (frame_block == 0x025a0 || frame_block == 0x02800);
+
+	if(known_block && m_context->FRAME.PSM == PSM_PSMCT32)
+	{
+		m_dev->ClearDepth(ds, 0);
+	}
+	return true;
+}
+
+bool GSRendererHW::OI_TyTasmanianTiger(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	// 0x2800 pal, 0x2bc0 ntsc
+	if(frame_psm != PSM_PSMCT24 || (frame_block != 0x02800 && frame_block != 0x02BC0))
+	{
+		return true;
+	}
+
+	// half height buffer clear: the whole depth buffer is cleared instead
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_DigimonRumbleArena2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	// half height buffer clear done with an untextured draw into a known PSMCT32 frame:
+	// the whole depth buffer is cleared as well; true is returned in every case.
+	const bool untextured = !PRIM->TME;
+	const bool known_frame = (frame_block == 0x02300 || frame_block == 0x03fc0) && frame_psm == PSM_PSMCT32;
+
+	if(untextured && known_frame)
+	{
+		m_dev->ClearDepth(ds, 0);
+	}
+	return true;
+}
+
+bool GSRendererHW::OI_BlackHawkDown(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	if(frame_block != 0x02000 || frame_psm != PSM_PSMZ24)
+	{
+		return true;
+	}
+
+	// half height buffer clear: the whole depth buffer is cleared instead
+
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_StarWarsForceUnleashed(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	if(PRIM->TME)
+	{
+		// textured draw with a constant zero Z into a known PSMCT32 frame
+		const bool known_block = (frame_block == 0x0 || frame_block == 0x01180);
+		const bool zero_z = m_vt.m_eq.z && m_vt.m_max.p.z == 0;
+
+		if(known_block && frame_psm == PSM_PSMCT32 && zero_z)
+		{
+			m_dev->ClearDepth(ds, 0);
+		}
+
+		return true;
+	}
+
+	// untextured draw into the PSMCT24 frame at 0x2bc0
+	if(frame_psm != PSM_PSMCT24 || frame_block != 0x2bc0) return true;
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_XmenOriginsWolverine(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// half height buffer clear into a PSMCT16 frame at block 0: the whole depth
+	// buffer is cleared too; true is returned in every case.
+	const bool frame_at_zero = (m_context->FRAME.Block() == 0x0);
+	const bool frame_is_ct16 = (m_context->FRAME.PSM == PSM_PSMCT16);
+
+	if(frame_at_zero && frame_is_ct16)
+	{
+		m_dev->ClearDepth(ds, 0);
+	}
+	return true;
+}
+
+bool GSRendererHW::OI_CallofDutyFinalFronts(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	if(frame_block != 0x02300 || frame_psm != PSM_PSMZ24)
+	{
+		return true;
+	}
+
+	// half height buffer clear: the whole depth buffer is cleared instead
+
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_SpyroNewBeginning(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	if(PRIM->TME)
+	{
+		// textured draw with a constant zero Z into a known PSMCT32 frame
+		const bool known_block = (frame_block == 0x0 || frame_block == 0x01180);
+		const bool zero_z = m_vt.m_eq.z && m_vt.m_min.p.z == 0;
+
+		if(known_block && frame_psm == PSM_PSMCT32 && zero_z)
+		{
+			m_dev->ClearDepth(ds, 0);
+		}
+
+		return true;
+	}
+
+	// 0x2800 pal, 0x2bc0 ntsc
+	if(frame_psm != PSM_PSMCT24 || (frame_block != 0x02800 && frame_block != 0x02bc0)) return true;
+	// half height buffer clear: the whole depth buffer is cleared instead
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_SpyroEternalNight(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_psm = m_context->FRAME.PSM;
+
+	if(PRIM->TME)
+	{
+		// textured draw with a constant zero Z into a known PSMCT32 frame
+		const bool known_block = (frame_block == 0x0 || frame_block == 0x01180);
+		const bool zero_z = m_vt.m_eq.z && m_vt.m_min.p.z == 0;
+
+		if(known_block && frame_psm == PSM_PSMCT32 && zero_z)
+		{
+			m_dev->ClearDepth(ds, 0);
+		}
+
+		return true;
+	}
+
+	if(frame_psm != PSM_PSMCT24 || frame_block != 0x2bc0) return true;
+
+	// half height buffer clear: the whole depth buffer is cleared instead
+	m_dev->ClearDepth(ds, 0);
+	return false;
+}
+
+bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const bool known_frame = (m_context->FRAME.PSM == PSM_PSMCT32) && (m_context->FRAME.Block() == 0x01c00);
+	const bool plain_flat_z = !m_context->TEST.ATE && m_vt.m_eq.z;
+
+	if (known_frame && plain_flat_z)
+	{
+		// Force the depth test to always pass (an alternative would be m_dev->ClearDepth(ds, 0)).
+		m_context->TEST.ZTST = ZTST_ALWAYS;
+	}
+
+	return true;
+}
+
+bool GSRendererHW::OI_SMTNocturne(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// 16777215 == 0x00ffffff, i.e. the low 24 bits of FBMSK are set
+	const bool color_masked = (m_context->FRAME.FBMSK == 16777215);
+	const bool vertex_state = (m_vertex.head != 2 && m_vertex.tail != 4 && m_vertex.next != 4);
+
+	if(!color_masked || !vertex_state)
+	{
+		return true;
+	}
+
+	// Look the frame settings up as a depth target, clear it and return false.
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = m_context->FRAME.Block();
+	TEX0.TBW = m_context->FRAME.FBW;
+	TEX0.PSM = m_context->FRAME.PSM;
+
+	GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+
+	if(depth) m_dev->ClearDepth(depth->m_texture, 0);
+
+	return false;
+}
+
+bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// Palettes drawn as untextured point lists are written straight to local memory.
+	if(m_vt.m_primclass != GS_POINT_CLASS || PRIM->TME)
+	{
+		return true;
+	}
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 frame_width = m_context->FRAME.FBW;
+
+	if(frame_block < 0x03f40 || (frame_block & 0x1f) != 0)
+	{
+		return true;
+	}
+
+	// 16 points are laid out 8 per row, 256 points 16 per row.
+	int column_mask;
+	int row_shift;
+
+	if(m_vertex.next == 16)
+	{
+		column_mask = 7;
+		row_shift = 3;
+	}
+	else if(m_vertex.next == 256)
+	{
+		column_mask = 15;
+		row_shift = 4;
+	}
+	else
+	{
+		// Unexpected point count: nothing is written.
+		ASSERT(0);
+
+		return true;
+	}
+
+	GSVertex* RESTRICT vtx = m_vertex.buff;
+	const int count = (int)m_vertex.next;
+
+	for(int n = 0; n < count; n++)
+	{
+		const uint32 alpha = vtx[n].RGBAQ.u32[0] >> 24;
+		const uint32 rgb = vtx[n].RGBAQ.u32[0] & 0x00ffffff;
+
+		// Double the alpha, saturating to 0xff from 0x80 upwards.
+		const uint32 color = (alpha >= 0x80 ? 0xff000000 : (alpha << 25)) | rgb;
+
+		vtx[n].RGBAQ.u32[0] = color;
+
+		m_mem.WritePixel32(n & column_mask, n >> row_shift, color, frame_block, frame_width);
+	}
+
+	m_mem.m_clut.Invalidate();
+
+	return false;
+}
+
+bool GSRendererHW::OI_SuperManReturns(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	// Instead of using a fullscreen rectangle the game uses a 32 pixels, 4096 pixels one
+	// with a FBW of 1. Technically the FB wraps/overlaps on itself...
+	GSDrawingContext* ctx = m_context;
+	const GSVertex* first = &m_vertex.buff[0];
+
+	const bool shared_buffer = ctx->FRAME.FBP == ctx->ZBUF.ZBP && !PRIM->TME;
+	const bool direct_write = shared_buffer && !ctx->ZBUF.ZMSK && !ctx->FRAME.FBMSK && m_vt.m_eq.rgba == 0xFFFF;
+	if (!direct_write) return true;
+
+	// The draw is expected to be a single sprite whose color equals its depth.
+	ASSERT(m_vertex.next == 2);
+	ASSERT(m_vt.m_primclass == GS_SPRITE_CLASS);
+	ASSERT((first->RGBAQ.A << 24 | first->RGBAQ.B << 16 | first->RGBAQ.G << 8 | first->RGBAQ.R) == (int)first->XYZ.Z);
+
+	// Do a direct write: clear the target to the constant color.
+	m_dev->ClearRenderTarget(rt, GSVector4(m_vt.m_min.c));
+
+	m_tc->InvalidateVideoMemType(GSTextureCache::DepthStencil, ctx->FRAME.Block());
+	return false;
+}
+
+
+// OO (others output?) hacks: invalidate extra local memory after the draw call
+
+void GSRendererHW::OO_DBZBT2()
+{
+	// palette readback (cannot detect yet, when fetching the texture later)
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const uint32 tex_block = m_context->TEX0.TBP0;
+
+	const bool first_pair = (frame_block == 0x03c00 && tex_block == 0x03c80);
+	const bool second_pair = (frame_block == 0x03ac0 && tex_block == 0x03b40);
+
+	if(!PRIM->TME || !(first_pair || second_pair)) return;
+
+	GIFRegBITBLTBUF BITBLTBUF;
+	BITBLTBUF.SBP = frame_block;
+	BITBLTBUF.SBW = 1;
+	BITBLTBUF.SPSM = PSM_PSMCT32;
+	InvalidateLocalMem(BITBLTBUF, GSVector4i(0, 0, 64, 64));
+}
+
+void GSRendererHW::OO_MajokkoALaMode2()
+{
+	// palette readback
+
+	const uint32 frame_block = m_context->FRAME.Block();
+
+	if(PRIM->TME || frame_block != 0x03f40)
+	{
+		return;
+	}
+
+	GIFRegBITBLTBUF BITBLTBUF;
+	BITBLTBUF.SBP = frame_block;
+	BITBLTBUF.SBW = 1;
+	BITBLTBUF.SPSM = PSM_PSMCT32;
+	InvalidateLocalMem(BITBLTBUF, GSVector4i(0, 0, 16, 16));
+}
+
+// Can Upscale hacks: disable upscaling for some draw calls
+
+bool GSRendererHW::CU_DBZBT2()
+{
+	// palette should stay 64 x 64: returns false when the frame is one of the palette buffers
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const bool palette_target = (frame_block == 0x03c00 || frame_block == 0x03ac0);
+	return !palette_target;
+}
+
+bool GSRendererHW::CU_MajokkoALaMode2()
+{
+	// palette should stay 16 x 16: returns false when the frame is the palette buffer
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const bool palette_target = (frame_block == 0x03f40);
+	return !palette_target;
+}
+
+bool GSRendererHW::CU_TalesOfAbyss()
+{
+	// full image blur and brightening: returns false for the three frame buffers involved
+
+	const uint32 frame_block = m_context->FRAME.Block();
+	const bool effect_target = (frame_block == 0x036e0 || frame_block == 0x03560 || frame_block == 0x038e0);
+	return !effect_target;
+}
diff --git a/plugins/GSdx_legacy/GSRendererHW.cpp.orig b/plugins/GSdx_legacy/GSRendererHW.cpp.orig
new file mode 100644
index 0000000000..5eb7aa94a8
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererHW.cpp.orig
@@ -0,0 +1,1479 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererHW.h"
+// Stores the texture cache passed by the caller (deleted in the destructor) and reads the upscaling and user hack options from the configuration.
+GSRendererHW::GSRendererHW(GSTextureCache* tc)
+	: m_width(1280)
+	, m_height(1024)
+	, m_skip(0)
+	, m_reset(false)
+	, m_upscale_multiplier(1)
+	, m_tc(tc)
+{
+	m_upscale_multiplier = theApp.GetConfig("upscale_multiplier", 1);
+	m_userhacks_skipdraw = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_SkipDraw", 0) : 0; // every user hack below also requires the "UserHacks" option
+	m_userhacks_align_sprite_X = !!theApp.GetConfig("UserHacks_align_sprite_X", 0) && !!theApp.GetConfig("UserHacks", 0);
+	m_userhacks_round_sprite_offset = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_round_sprite_offset", 0) : 0;
+	m_userhacks_disable_gs_mem_clear = theApp.GetConfig("UserHacks_DisableGsMemClear", 0) && theApp.GetConfig("UserHacks", 0);
+	// A multiplier of 0 selects a custom resolution: the size then comes from "resx"/"resy".
+	if (!m_upscale_multiplier) { //Custom Resolution
+		m_width = theApp.GetConfig("resx", m_width);
+		m_height = theApp.GetConfig("resy", m_height);
+	}
+
+	if (m_upscale_multiplier == 1) { // hacks are only needed for upscaling issues.
+		m_userhacks_round_sprite_offset = 0;
+		m_userhacks_align_sprite_X = 0;
+	}
+
+}
+// Grows m_width/m_height to the upscaled frame buffer size when upscaling is active and the current size is too small.
+void GSRendererHW::SetScaling()
+{
+	GSVector2i crtc_size(GetDisplayRect().width(), GetDisplayRect().height());
+
+	// Framebuffer width is always a multiple of 64 so in certain cases it can't cover some weird width values.
+	// 480P, 576P use a width of 720 which cannot be expressed as FBW * 64, so it produces 704 (the closest multiple of 64).
+	// In such cases, let's just use the CRTC width.
+	int fb_width = max({ (int)m_context->FRAME.FBW * 64, crtc_size.x , 512 });
+	// GS doesn't have a specific register for the FrameBuffer height, so we get the height
+	// from physical units of the display rectangle in case the game uses a higher value of height.
+	int fb_height = (fb_width < 1024) ? max(512, crtc_size.y) : 1024;
+
+	int upscaled_fb_w = fb_width * m_upscale_multiplier;
+	int upscaled_fb_h = fb_height * m_upscale_multiplier;
+	bool good_rt_size = m_width >= upscaled_fb_w && m_height >= upscaled_fb_h;
+
+	// No need to resize for native/custom resolutions as default size will be enough for native and we manually get RT Buffer size for custom.
+	// don't resize until the display rectangle and register states are stabilized.
+	if ( m_upscale_multiplier <= 1 || good_rt_size)
+		return;
+
+	m_tc->RemovePartial();
+	m_width = upscaled_fb_w;
+	m_height = upscaled_fb_h;
+	printf("Frame buffer size set to  %dx%d (%dx%d)\n", fb_width, fb_height , m_width, m_height);
+}
+
+GSRendererHW::~GSRendererHW()
+{
+	delete m_tc; // the texture cache handed to the constructor is released here
+}
+
+void GSRendererHW::SetGameCRC(uint32 crc, int options)
+{
+	GSRenderer::SetGameCRC(crc, options);
+	// Select the per-game hack handlers (m_game is presumably updated by the base call - TODO confirm).
+	m_hacks.SetGameCRC(m_game);
+}
+
+bool GSRendererHW::CanUpscale()
+{
+	// A per-game CU handler, when present, may refuse upscaling.
+	const bool allowed = !m_hacks.m_cu || (this->*m_hacks.m_cu)();
+	if(!allowed) return false;
+
+	// upscale ratio depends on the display size, with no output it may not be set correctly (ps2 logo to game transition)
+	return m_upscale_multiplier != 1 && m_regs->PMODE.EN != 0;
+}
+
+int GSRendererHW::GetUpscaleMultiplier()
+{
+	// A multiplier of 0 stands for custom resolution, which is reported as 1.
+	return (m_upscale_multiplier != 0) ? m_upscale_multiplier : 1;
+}
+
+GSVector2i GSRendererHW::GetInternalResolution() {
+	// Display size scaled by the multiplier, or the stored size when the multiplier is 0.
+	const GSVector2i display(GetDisplayRect().width(), GetDisplayRect().height());
+	const int scale = m_upscale_multiplier;
+
+	if (scale == 0) return GSVector2i(m_width, m_height);
+	return GSVector2i(display.x * scale, display.y * scale);
+}
+
+void GSRendererHW::Reset()
+{
+	// TODO: GSreset can come from the main thread too => crash
+	// m_tc->RemoveAll();
+	// Instead only raise a flag here; VSync() calls m_tc->RemoveAll() when it sees it.
+	m_reset = true;
+
+	GSRenderer::Reset();
+}
+// Per-vsync work: scaling check, deferred cache flush after Reset(), base class vsync, cache ageing and memory usage reports.
+void GSRendererHW::VSync(int field)
+{
+	//Check if the frame buffer width or display width has changed
+	SetScaling();
+	// Reset() only raises m_reset; the texture cache is emptied here instead.
+	if(m_reset)
+	{
+		m_tc->RemoveAll();
+
+		m_reset = false;
+	}
+
+	GSRenderer::VSync(field);
+
+	m_tc->IncAge();
+
+	m_tc->PrintMemoryUsage();
+	m_dev->PrintMemoryUsage();
+
+	m_skip = 0;
+}
+
+void GSRendererHW::ResetDevice()
+{
+	m_tc->RemoveAll();
+	// The texture cache is emptied first, then the base class resets the device.
+	GSRenderer::ResetDevice();
+}
+// Returns the texture of the cached target matching display circuit i (NULL if none). y_offset is only written when the display buffer starts inside that target.
+GSTexture* GSRendererHW::GetOutput(int i, int& y_offset)
+{
+	const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
+
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = DISPFB.Block();
+	TEX0.TBW = DISPFB.FBW;
+	TEX0.PSM = DISPFB.PSM;
+
+	// TRACE(_T("[%d] GetOutput %d %05x (%d)\n"), (int)m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM);
+
+	GSTexture* t = NULL;
+
+	if(GSTextureCache::Target* rt = m_tc->LookupTarget(TEX0, m_width, m_height, GetFrameRect(i).bottom))
+	{
+		t = rt->m_texture;
+
+		// Merge conflict resolved: the frame lookup change (24c104e) is kept together with the
+		// NDEBUG guard of the dump code below. When the display buffer starts after the target,
+		// report the offset; FBW is checked so that the division cannot be by zero.
+		int delta = TEX0.TBP0 - rt->m_TEX0.TBP0;
+		if (delta > 0 && DISPFB.FBW != 0) {
+			ASSERT(DISPFB.PSM == PSM_PSMCT32 || DISPFB.PSM == PSM_PSMCT24);
+			y_offset = delta / DISPFB.FBW;
+			GL_CACHE("Frame y offset %d pixels, unit %d", y_offset, i);
+		}
+
+#ifndef NDEBUG
+		if(s_dump)
+		{
+			if(s_savef && s_n >= s_saven)
+			{
+				t->Save(root_hw + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM));
+			}
+		}
+
+		s_n++;
+#endif
+	}
+
+	return t;
+}
+
+void GSRendererHW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	// printf("[%d] InvalidateVideoMem %d,%d - %d,%d %05x (%d)\n", (int)m_perfmon.GetFrame(), r.left, r.top, r.right, r.bottom, (int)BITBLTBUF.DBP, (int)BITBLTBUF.DPSM);
+	// Forward rectangle r of the transfer destination (DBP/DBW/DPSM) to the texture cache.
+	m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
+}
+
+void GSRendererHW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	// printf("[%d] InvalidateLocalMem %d,%d - %d,%d %05x (%d)\n", (int)m_perfmon.GetFrame(), r.left, r.top, r.right, r.bottom, (int)BITBLTBUF.SBP, (int)BITBLTBUF.SPSM);
+	// Requests flagged as clut are ignored for now.
+	if(clut) return; // FIXME
+	// Forward rectangle r of the transfer source (SBP/SBW/SPSM) to the texture cache.
+	m_tc->InvalidateLocalMem(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r);
+}
+
+int GSRendererHW::Interpolate_UV(float alpha, int t0, int t1)
+{
+	const float blended = (1.0f - alpha) * t0 + alpha * t1;
+	return (int)blended & ~0xF; // cheap rounding: the 4 low bits are dropped
+}
+// Position of X0 rounded up to a multiple of 16, as a fraction of the length L measured from X0. X1 is unused. NOTE(review): divides by zero when L == 0.
+float GSRendererHW::alpha0(int L, int X0, int X1)
+{
+	float x = (X0 + 15) & ~0xF; // Round up
+	return (x - X0) / (float)L;
+}
+// Position of (X1 - 1) rounded down to a multiple of 16, as a fraction of the length L measured from X0. NOTE(review): divides by zero when L == 0.
+float GSRendererHW::alpha1(int L, int X0, int X1)
+{
+	float x = (X1 - 1) & ~0xF; // Round down. Note -1 because right pixel isn't included in primitive so 0x100 must return 0.
+	return (x - X0) / (float)L;
+}
+// Adjusts the U/V coordinates of every sprite (vertex pairs i, i+1) in the vertex buffer; the template flag selects the linear-filtering variant.
+template <bool linear>
+void GSRendererHW::RoundSpriteOffset()
+{
+//#define DEBUG_U
+//#define DEBUG_V
+#if defined(DEBUG_V) || defined(DEBUG_U)
+	bool debug = linear;
+#endif
+	size_t count = m_vertex.next;
+	GSVertex* v = &m_vertex.buff[0];
+
+	for(size_t i = 0; i < count; i += 2) {
+		// Performance note: if it had any impact on perf, someone would port it to SSE (AKA GSVector)
+
+		// Compute the coordinate of first and last texels (in native with a linear filtering)
+		int   ox  = m_context->XYOFFSET.OFX;
+		int   X0  = v[i].XYZ.X   - ox;
+		int   X1  = v[i+1].XYZ.X - ox;
+		int   Lx  = (v[i+1].XYZ.X - v[i].XYZ.X);
+		float ax0 = alpha0(Lx, X0, X1);
+		float ax1 = alpha1(Lx, X0, X1);
+		int   tx0 = Interpolate_UV(ax0, v[i].U, v[i+1].U);
+		int   tx1 = Interpolate_UV(ax1, v[i].U, v[i+1].U);
+#ifdef DEBUG_U
+		if (debug) {
+			fprintf(stderr, "u0:%d and u1:%d\n", v[i].U, v[i+1].U);
+			fprintf(stderr, "a0:%f and a1:%f\n", ax0, ax1);
+			fprintf(stderr, "t0:%d and t1:%d\n", tx0, tx1);
+		}
+#endif
+		// Same computation for the vertical axis
+		int   oy  = m_context->XYOFFSET.OFY;
+		int   Y0  = v[i].XYZ.Y   - oy;
+		int   Y1  = v[i+1].XYZ.Y - oy;
+		int   Ly  = (v[i+1].XYZ.Y - v[i].XYZ.Y);
+		float ay0 = alpha0(Ly, Y0, Y1);
+		float ay1 = alpha1(Ly, Y0, Y1);
+		int   ty0 = Interpolate_UV(ay0, v[i].V, v[i+1].V);
+		int   ty1 = Interpolate_UV(ay1, v[i].V, v[i+1].V);
+#ifdef DEBUG_V
+		if (debug) {
+			fprintf(stderr, "v0:%d and v1:%d\n", v[i].V, v[i+1].V);
+			fprintf(stderr, "a0:%f and a1:%f\n", ay0, ay1);
+			fprintf(stderr, "t0:%d and t1:%d\n", ty0, ty1);
+		}
+#endif
+
+#ifdef DEBUG_U
+		if (debug)
+			fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].U, v[i+1].U);
+#endif
+#ifdef DEBUG_V
+		if (debug)
+			fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].V, v[i+1].V);
+#endif
+
+#if 1
+		// Use the rounded value of the newly computed texture coordinate. It ensures
+		// that sampling will remain inside the texture boundary
+		//
+		// Note for bilinear: by definition it will never work correctly! A slight modification
+		// of interpolation might trigger a discard (with alpha testing)
+		// Let's use something simple that corrects the really bad cases (for a couple of 2D games).
+		// I hope it won't create too many glitches.
+		if (linear) {
+			int Lu = v[i+1].U - v[i].U;
+			// Note 32 is based on taisho-mononoke
+			if ((Lu > 0) && (Lu <= (Lx+32))) {
+				v[i+1].U -= 8;
+			}
+		} else {
+			if (tx0 <= tx1) {
+				v[i].U   = tx0;
+				v[i+1].U = tx1 + 16;
+			} else {
+				v[i].U   = tx0 + 15;
+				v[i+1].U = tx1;
+			}
+		}
+#endif
+#if 1
+		if (linear) {
+			int Lv = v[i+1].V - v[i].V;
+			if ((Lv > 0) && (Lv <= (Ly+32))) {
+				v[i+1].V -= 8;
+			}
+		} else {
+			if (ty0 <= ty1) {
+				v[i].V   = ty0;
+				v[i+1].V = ty1 + 16;
+			} else {
+				v[i].V   = ty0 + 15;
+				v[i+1].V = ty1;
+			}
+		}
+#endif
+
+#ifdef DEBUG_U
+		if (debug)
+			fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].U, v[i+1].U);
+#endif
+#ifdef DEBUG_V
+		if (debug)
+			fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].V, v[i+1].V);
+#endif
+
+	}
+}
+
+// Hardware renderer entry point for one draw call.
+//
+// Visible steps, in order:
+//  1. Bail out when the device is lost or the frame is flagged as bad.
+//  2. Look up the render target and depth target in the texture cache
+//     (either may be intentionally disabled through no_rt / no_ds).
+//  3. When texturing is enabled, look up the source texture and detect the
+//     "texture shuffle" case.
+//  4. Run the per-game OI hack, OI_BlitFMV and (unless disabled) OI_GsMemClear;
+//     the first two may cancel the draw.
+//  5. Try to elide the alpha test, apply the sprite upscaling hacks, call
+//     DrawPrims() and restore the TEST/FRAME/ZBUF registers.
+//  6. Update target validity, invalidate overlapping cache entries, run the
+//     per-game OO hack.
+//
+// s_n is the draw/dump counter; every exit path advances it by 3 in debug
+// builds (see the "keep counter sync" notes below).
+void GSRendererHW::Draw()
+{
+	if(m_dev->IsLost() || GSRenderer::IsBadFrame(m_skip, m_userhacks_skipdraw)) {
+		GL_INS("Warning skipping a draw call (%d)", s_n);
+		s_n += 3; // Keep it sync with SW renderer
+		return;
+	}
+	GL_PUSH("HW Draw %d", s_n);
+
+	GSDrawingEnvironment& env = m_env;
+	GSDrawingContext* context = m_context;
+
+	// It is allowed to use the depth and rt at the same location. However at least 1 must
+	// be disabled.
+	// 1/ GoW uses a Cd blending on a 24 bits buffer (no alpha)
+	// 2/ SuperMan really draws (0,0,0,0) color and a (0) 32-bits depth
+	// 3/ 50cents really draws (0,0,0,128) color and a (0) 24 bits depth
+	// Note: FF DoC has both buffer at same location but disable the depth test (write?) with ZTE = 0
+	const bool no_rt = (context->ALPHA.IsCd() && PRIM->ABE && (context->FRAME.PSM == 1));
+	const bool no_ds = !no_rt && (
+			// Depth is always pass (no read) and write are discarded (tekken 5).  (Note: DATE is currently implemented with a stencil buffer)
+			(context->ZBUF.ZMSK && m_context->TEST.ZTST == ZTST_ALWAYS && !m_context->TEST.DATE) ||
+			// Depth will be written through the RT
+			(context->FRAME.FBP == context->ZBUF.ZBP && !PRIM->TME && !context->ZBUF.ZMSK && !context->FRAME.FBMSK && context->TEST.ZTE)
+			);
+
+	// Temporary TEX0 used only as a lookup key for the texture cache: first
+	// filled from FRAME (render target), then from ZBUF (depth target).
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = context->FRAME.Block();
+	TEX0.TBW = context->FRAME.FBW;
+	TEX0.PSM = context->FRAME.PSM;
+
+	GSTextureCache::Target* rt = no_rt ? NULL : m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::RenderTarget, true);
+	GSTexture* rt_tex = rt ? rt->m_texture : NULL;
+
+	TEX0.TBP0 = context->ZBUF.Block();
+	TEX0.TBW = context->FRAME.FBW;
+	TEX0.PSM = context->ZBUF.PSM;
+
+	GSTextureCache::Target* ds = no_ds ? NULL : m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, context->DepthWrite());
+	GSTexture* ds_tex = ds ? ds->m_texture : NULL;
+
+	// A target that was requested (not disabled) but could not be obtained
+	// aborts the draw.
+	if(!(rt || no_rt) || !(ds || no_ds))
+	{
+		GL_POP();
+		ASSERT(0);
+		return;
+	}
+
+	GSTextureCache::Source* tex = NULL;
+	m_texture_shuffle = false;
+
+	if(PRIM->TME)
+	{
+		/*
+
+		// m_tc->LookupSource will mess with the palette, should not, but we do this after, until it is sorted out
+
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+
+		*/
+
+		GSVector4i r;
+
+		GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
+
+		tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
+
+		// No source texture available: give up on this draw.
+		if(!tex) {
+			GL_POP();
+			return;
+		}
+
+		// FIXME: Could be removed on openGL
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+
+		// Hypothesis: texture shuffle is used as a postprocessing effect so texture will be an old target.
+		// Initially code also tested the RT but it gives too much false-positive
+		//
+		// Both input and output are 16 bits and texture was initially 32 bits!
+		m_texture_shuffle = (context->FRAME.PSM & 0x2) && ((context->TEX0.PSM & 3) == 2) && (m_vt.m_primclass == GS_SPRITE_CLASS) && tex->m_32_bits_fmt;
+
+		// Texture shuffle is not yet supported with strange clamp mode
+		ASSERT(!m_texture_shuffle || (context->CLAMP.WMS < 3 && context->CLAMP.WMT < 3));
+	}
+	if (rt) {
+		// Be sure texture shuffle detection is properly propagated
+		// Otherwise set or clear the flag (Code in texture cache only set the flag)
+		// Note: it is important to clear the flag when RT is used as a real 16 bits target.
+		rt->m_32_bits_fmt = m_texture_shuffle || !(context->FRAME.PSM & 0x2);
+	}
+
+	// Debug-only dump of the inputs of the draw (context, texture, palette,
+	// RT and depth before drawing). s_n advances by 2 in this section whether
+	// or not dumping is enabled.
+#ifndef NDEBUG
+	if(s_dump)
+	{
+		uint64 frame = m_perfmon.GetFrame();
+
+		string s;
+
+		if (s_n >= s_saven) {
+			// Dump Register state
+			s = format("%05d_context.txt", s_n);
+
+			m_env.Dump(root_hw+s);
+			m_context->Dump(root_hw+s);
+		}
+
+		if(s_savet && s_n >= s_saven && tex)
+		{
+			s = format("%05d_f%lld_tex_%05x_%d_%d%d_%02x_%02x_%02x_%02x.dds",
+				s_n, frame, (int)context->TEX0.TBP0, (int)context->TEX0.PSM,
+				(int)context->CLAMP.WMS, (int)context->CLAMP.WMT,
+				(int)context->CLAMP.MINU, (int)context->CLAMP.MAXU,
+				(int)context->CLAMP.MINV, (int)context->CLAMP.MAXV);
+
+			tex->m_texture->Save(root_hw+s, false, true);
+
+			if(tex->m_palette)
+			{
+				s = format("%05d_f%lld_tpx_%05x_%d.dds", s_n, frame, context->TEX0.CBP, context->TEX0.CPSM);
+
+				tex->m_palette->Save(root_hw+s, false, true);
+			}
+		}
+
+		s_n++;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM);
+
+			if (rt)
+				rt->m_texture->Save(root_hw+s);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM);
+
+			if (ds_tex)
+				ds_tex->Save(root_hw+s);
+		}
+
+		s_n++;
+
+	} else {
+		s_n += 2;
+	}
+#endif
+
+	// The rectangle of the draw
+	GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(context->scissor.in));
+
+	// Per-game "OI" hack: a false return value cancels the draw.
+	if(m_hacks.m_oi && !(this->*m_hacks.m_oi)(rt_tex, ds_tex, tex))
+	{
+		s_n += 1; // keep counter sync
+		GL_INS("Warning skipping a draw call (%d)", s_n);
+		GL_POP();
+		return;
+	}
+
+	// Generic FMV blit hack: a false return value cancels the draw as well.
+	if (!OI_BlitFMV(rt, tex, r)) {
+		s_n += 1; // keep counter sync
+		GL_INS("Warning skipping a draw call (%d)", s_n);
+		GL_POP();
+		return;
+	}
+
+	if (!m_userhacks_disable_gs_mem_clear) {
+		OI_GsMemClear();
+	}
+
+	// skip alpha test if possible
+
+	// Copies of the registers that may be modified for this draw; they are
+	// written back to the context right after DrawPrims().
+	GIFRegTEST TEST = context->TEST;
+	GIFRegFRAME FRAME = context->FRAME;
+	GIFRegZBUF ZBUF = context->ZBUF;
+
+	// fm: frame buffer write mask. zm: all ones when depth writes are off
+	// (ZMSK set or depth test disabled), zero otherwise.
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
+
+	if(context->TEST.ATE && context->TEST.ATST != ATST_ALWAYS)
+	{
+		if(GSRenderer::TryAlphaTest(fm, zm))
+		{
+			context->TEST.ATST = ATST_ALWAYS;
+		}
+	}
+
+	// fm/zm are stored back into the context for the duration of the draw.
+	context->FRAME.FBMSK = fm;
+	context->ZBUF.ZMSK = zm != 0;
+
+	// A couple of hack to avoid upscaling issue. So far it seems to impacts mostly sprite
+	if ((m_upscale_multiplier > 1) && (m_vt.m_primclass == GS_SPRITE_CLASS)) {
+		size_t count = m_vertex.next;
+		GSVertex* v = &m_vertex.buff[0];
+
+		// Hack to avoid vertical black line in various games (ace combat/tekken)
+		if (m_userhacks_align_sprite_X) {
+			// Note for performance reason I do the check only once on the first
+			// primitive
+			int win_position = v[1].XYZ.X - context->XYOFFSET.OFX;
+			const bool unaligned_position = ((win_position & 0xF) == 8);
+			const bool unaligned_texture  = ((v[1].U & 0xF) == 0) && PRIM->FST; // I'm not sure this check is useful
+			const bool hole_in_vertex = (count < 4) || (v[1].XYZ.X != v[2].XYZ.X);
+			if (hole_in_vertex && unaligned_position && (unaligned_texture || !PRIM->FST)) {
+				// Normaly vertex are aligned on full pixels and texture in half
+				// pixels. Let's extend the coverage of an half-pixel to avoid
+				// hole after upscaling
+				// Vertices are processed in pairs; only the second vertex of
+				// each pair (v[i+1]) is shifted.
+				for(size_t i = 0; i < count; i += 2) {
+					v[i+1].XYZ.X += 8;
+					// I really don't know if it is a good idea. Neither what to do for !PRIM->FST
+					if (unaligned_texture)
+						v[i+1].U += 8;
+				}
+			}
+		}
+
+		if (PRIM->FST) {
+			// Setting 1 only rounds non-linear sprites; a larger setting
+			// rounds both linear and non-linear ones.
+			if ((m_userhacks_round_sprite_offset > 1) || (m_userhacks_round_sprite_offset == 1 && !m_vt.IsLinear())) {
+				if (m_vt.IsLinear())
+					RoundSpriteOffset<true>();
+				else
+					RoundSpriteOffset<false>();
+			}
+		} else {
+			; // vertical line in Yakuza (note check m_userhacks_align_sprite_X behavior)
+		}
+	}
+
+	//
+
+	DrawPrims(rt_tex, ds_tex, tex);
+
+	//
+
+	// Restore the registers saved before the alpha test optimisation.
+	context->TEST = TEST;
+	context->FRAME = FRAME;
+	context->ZBUF = ZBUF;
+
+	//
+
+	// Help to detect rendering outside of the framebuffer
+#if _DEBUG
+	if (m_upscale_multiplier * r.z > m_width) {
+		GL_INS("ERROR: RT width is too small only %d but require %d", m_width, m_upscale_multiplier * r.z);
+	}
+	if (m_upscale_multiplier * r.w > m_height) {
+		GL_INS("ERROR: RT height is too small only %d but require %d", m_height, m_upscale_multiplier * r.w);
+	}
+#endif
+
+	// Colour was (at least partially) written: extend the valid area of the
+	// RT and invalidate cache entries covering the same memory.
+	if(fm != 0xffffffff && rt)
+	{
+		//rt->m_valid = rt->m_valid.runion(r);
+		rt->UpdateValidity(r);
+
+		m_tc->InvalidateVideoMem(context->offset.fb, r, false);
+
+		m_tc->InvalidateVideoMemType(GSTextureCache::DepthStencil, context->FRAME.Block());
+	}
+
+	// Same bookkeeping for the depth target when depth was written.
+	if(zm != 0xffffffff && ds)
+	{
+		//ds->m_valid = ds->m_valid.runion(r);
+		ds->UpdateValidity(r);
+
+		m_tc->InvalidateVideoMem(context->offset.zb, r, false);
+
+		m_tc->InvalidateVideoMemType(GSTextureCache::RenderTarget, context->ZBUF.Block());
+	}
+
+	//
+
+	// Per-game "OO" hack, executed after the draw.
+	if(m_hacks.m_oo)
+	{
+		(this->*m_hacks.m_oo)();
+	}
+
+	// Debug-only dump of the outputs of the draw (RT and depth after drawing).
+	// s_n advances by 1 here, completing the +3 per draw.
+#ifndef NDEBUG
+	if(s_dump)
+	{
+		uint64 frame = m_perfmon.GetFrame();
+
+		string s;
+
+		if(s_save && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM);
+
+			if (rt)
+				rt->m_texture->Save(root_hw+s);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM);
+
+			if (ds_tex)
+				ds_tex->Save(root_hw+s);
+		}
+
+		s_n++;
+
+		// Stop dumping once more than s_savel draws past s_saven were dumped.
+		if(s_savel > 0 && (s_n - s_saven) > s_savel)
+		{
+			s_dump = 0;
+		}
+	} else {
+		s_n += 1;
+	}
+#endif
+
+	#ifdef DISABLE_HW_TEXTURE_CACHE
+
+	if (rt)
+		m_tc->Read(rt, r);
+
+	#endif
+
+	GL_POP();
+}
+
+// hacks
+
+// Builds the three hack tables (OI, OO, CU). Each entry associates a game
+// title and a region with a GSRendererHW member function; the maps are
+// constructed over the lists, and the currently selected hacks start out
+// empty (NULL) until SetGameCRC() picks them.
+GSRendererHW::Hacks::Hacks()
+	: m_oi_map(m_oi_list)
+	, m_oo_map(m_oo_list)
+	, m_cu_map(m_cu_list)
+	, m_oi(NULL)
+	, m_oo(NULL)
+	, m_cu(NULL)
+{
+	// OI hacks: called before the draw, may replace or cancel it.
+	// Only FFXII is registered for a single region (EU); every other entry
+	// uses CRC::RegionCount.
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::FFXII, CRC::EU, &GSRendererHW::OI_FFXII));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::FFX, CRC::RegionCount, &GSRendererHW::OI_FFX));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::MetalSlug6, CRC::RegionCount, &GSRendererHW::OI_MetalSlug6));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::GodOfWar2, CRC::RegionCount, &GSRendererHW::OI_GodOfWar2));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SimpsonsGame, CRC::RegionCount, &GSRendererHW::OI_SimpsonsGame));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::RozenMaidenGebetGarden, CRC::RegionCount, &GSRendererHW::OI_RozenMaidenGebetGarden));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpidermanWoS, CRC::RegionCount, &GSRendererHW::OI_SpidermanWoS));
+	// Both Ty the Tasmanian Tiger titles share the same hack.
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TyTasmanianTiger, CRC::RegionCount, &GSRendererHW::OI_TyTasmanianTiger));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TyTasmanianTiger2, CRC::RegionCount, &GSRendererHW::OI_TyTasmanianTiger));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::DigimonRumbleArena2, CRC::RegionCount, &GSRendererHW::OI_DigimonRumbleArena2));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::StarWarsForceUnleashed, CRC::RegionCount, &GSRendererHW::OI_StarWarsForceUnleashed));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::BlackHawkDown, CRC::RegionCount, &GSRendererHW::OI_BlackHawkDown));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::XmenOriginsWolverine, CRC::RegionCount, &GSRendererHW::OI_XmenOriginsWolverine));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::CallofDutyFinalFronts, CRC::RegionCount, &GSRendererHW::OI_CallofDutyFinalFronts));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpyroNewBeginning, CRC::RegionCount, &GSRendererHW::OI_SpyroNewBeginning));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SpyroEternalNight, CRC::RegionCount, &GSRendererHW::OI_SpyroEternalNight));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::TalesOfLegendia, CRC::RegionCount, &GSRendererHW::OI_TalesOfLegendia));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SMTNocturne, CRC::RegionCount, &GSRendererHW::OI_SMTNocturne));
+	m_oi_list.push_back(HackEntry<OI_Ptr>(CRC::SuperManReturns, CRC::RegionCount, &GSRendererHW::OI_SuperManReturns));
+
+	// OO hacks: called after the draw.
+	m_oo_list.push_back(HackEntry<OO_Ptr>(CRC::DBZBT2, CRC::RegionCount, &GSRendererHW::OO_DBZBT2));
+	m_oo_list.push_back(HackEntry<OO_Ptr>(CRC::MajokkoALaMode2, CRC::RegionCount, &GSRendererHW::OO_MajokkoALaMode2));
+
+	// CU ("can upscale") hacks.
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::DBZBT2, CRC::RegionCount, &GSRendererHW::CU_DBZBT2));
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::MajokkoALaMode2, CRC::RegionCount, &GSRendererHW::CU_MajokkoALaMode2));
+	m_cu_list.push_back(HackEntry<CU_Ptr>(CRC::TalesOfAbyss, CRC::RegionCount, &GSRendererHW::CU_TalesOfAbyss));
+}
+
+// Selects the OI/OO/CU hacks that apply to the given game.
+void GSRendererHW::Hacks::SetGameCRC(const CRC::Game& game)
+{
+	// Lookup key: the region shifted into the top byte, OR-ed with the title.
+	const uint32 key = (uint32)((game.region << 24) | game.title);
+
+	m_oi = m_oi_map[key];
+	m_oo = m_oo_map[key];
+	m_cu = m_cu_map[key];
+
+	if (game.flags & CRC::PointListPalette)
+	{
+		// The palette hack takes the OI slot, so no other OI hack is
+		// expected for such a game.
+		ASSERT(m_oi == NULL);
+
+		m_oi = &GSRendererHW::OI_PointListPalette;
+	}
+
+	// Both options are always queried, in this order, before m_oi is examined.
+	const bool overlap_clear_enabled = theApp.GetConfig("UserHacks_ColorDepthClearOverlap", 0) && theApp.GetConfig("UserHacks", 0);
+
+	if (!m_oi && overlap_clear_enabled)
+	{
+		// FIXME: Enable this code in the future. It could probably replace
+		// most of the "old" OI hacks. So far it was tested successfully on
+		// GoW2 & SimpsonsGame.
+		m_oi = &GSRendererHW::OI_DoubleHalfClear;
+	}
+}
+
+// Generic hack for games that clear colour and depth with a single
+// half-height untextured sprite, relying on the two buffers being adjacent in
+// GS memory. When the pattern is recognised the buffer located in the second
+// half is cleared explicitly. Always returns true: the draw itself still runs.
+bool GSRendererHW::OI_DoubleHalfClear(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const bool candidate = (m_vt.m_primclass == GS_SPRITE_CLASS) && !PRIM->TME && !m_context->ZBUF.ZMSK && (m_context->FRAME.FBW >= 7) && rt;
+	if (!candidate)
+		return true;
+
+	const GSVertex* vtx = &m_vertex.buff[0];
+
+	//GL_INS("OI_DoubleHalfClear: psm:%x. Z:%d R:%d G:%d B:%d A:%d", m_context->FRAME.PSM,
+	//		vtx[1].XYZ.Z, vtx[1].RGBAQ.R, vtx[1].RGBAQ.G, vtx[1].RGBAQ.B, vtx[1].RGBAQ.A);
+
+	// Only a write of zero colour and zero depth counts as a clear. The check
+	// is done on the first primitive only.
+	const bool writes_zero = !(vtx[1].XYZ.Z || vtx[1].RGBAQ.R || vtx[1].RGBAQ.G || vtx[1].RGBAQ.B || vtx[1].RGBAQ.A);
+	if (!writes_zero)
+		return true;
+
+	// Only 32 bits formats are supported, anything else gets complicated.
+	if (m_context->FRAME.PSM & 2)
+		return true;
+
+	const uint32 fbp = m_context->FRAME.FBP;
+	const uint32 zbp = m_context->ZBUF.ZBP;
+	const uint32 fbw = m_context->FRAME.FBW;
+
+	// FIXME might need some rounding
+	// In 32 bits, pages are 64x32 pixels. In theory the page height should be
+	// something like FBW * 64 pixels * ratio / 32 pixels / 2 = FBW * ratio.
+	// The ratio is hard to predict so it is rounded to 1, and a <= comparison
+	// is used below.
+	const uint32 h_pages = fbw;
+
+	// "base" is the buffer that comes first in memory, "half" the other one.
+	const bool depth_comes_first = fbp > zbp;
+	const uint32 base = depth_comes_first ? zbp : fbp;
+	const uint32 half = depth_comes_first ? fbp : zbp;
+
+	if (half <= (base + h_pages * fbw))
+	{
+		//GL_INS("OI_DoubleHalfClear: base %x half %x. h_pages %d fbw %d", base, half, h_pages, fbw);
+		if (depth_comes_first)
+			m_dev->ClearDepth(ds, 0);
+		else
+			m_dev->ClearRenderTarget(rt, 0);
+
+		// Do not return false here, it breaks the rendering (probably a
+		// missing texture invalidation).
+	}
+
+	return true;
+}
+
+// Mirrors a "clear to zero" sprite directly into the emulated GS memory.
+// Note: the hack is safe, but it could cost a little performance (normally
+// games only do a couple of clears per frame).
+void GSRendererHW::OI_GsMemClear()
+{
+	// Only a single untextured, unblended rectangle qualifies (direct write)
+	if (m_vt.m_primclass != GS_SPRITE_CLASS || m_vertex.next != 2 || PRIM->TME || PRIM->ABE)
+		return;
+
+	// ... without alpha test
+	if (m_context->TEST.ATE)
+		return;
+
+	// ... without depth test
+	if (m_context->TEST.ZTE && m_context->TEST.ZTST != ZTST_ALWAYS)
+		return;
+
+	// ... writing the constant colour 0
+	if (m_vt.m_eq.rgba != 0xFFFF || !m_vt.m_min.c.eq(GSVector4i(0)))
+		return;
+
+	GL_INS("OI_GsMemClear");
+
+	GSOffset* fb_off = m_context->offset.fb;
+	const GSVector4i area = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(m_context->scissor.in));
+
+	int* RESTRICT column = fb_off->pixel.col[0];
+
+	switch (GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt)
+	{
+		case 0:
+			// Based on WritePixel32
+			for (int row = area.top; row < area.bottom; row++)
+			{
+				uint32* RESTRICT line = &m_mem.m_vm32[fb_off->pixel.row[row]];
+
+				for (int px = area.left; px < area.right; px++)
+					line[column[px]] = 0; // Here the constant color
+			}
+			break;
+
+		case 1:
+			// Based on WritePixel24
+			for (int row = area.top; row < area.bottom; row++)
+			{
+				uint32* RESTRICT line = &m_mem.m_vm32[fb_off->pixel.row[row]];
+
+				for (int px = area.left; px < area.right; px++)
+					line[column[px]] &= 0xff000000; // Clear the color
+			}
+			break;
+
+		case 2:
+			// The hack is used for FMV, which are likely 24/32 bits. The
+			// 16 bits variant is only kept for reference.
+#if 0
+			// Based on WritePixel16
+			for(int y = r.top; y < r.bottom; y++)
+			{
+				uint32* RESTRICT d = &m_mem.m_vm16[off->pixel.row[y]];
+				int* RESTRICT col = off->pixel.col[0];
+
+				for(int x = r.left; x < r.right; x++)
+				{
+					d[col[x]] = 0; // Here the constant color
+				}
+			}
+#endif
+			break;
+
+		default:
+			break;
+	}
+}
+
+// FMV blit hack.
+//
+// Detects a single textured, unblended sprite drawn below line 1024, i.e. past
+// the render target at the location of the texture itself, and performs the
+// copy manually inside the texture instead of letting the regular draw run.
+//
+//  _rt    : render target of the current draw; its sub-targets are invalidated
+//           once the blit is done.
+//  tex    : source texture of the current draw.
+//  r_draw : rectangle covered by the draw.
+//
+// Returns false when the blit was done here and the current draw must be
+// skipped, true when the draw must proceed normally.
+bool GSRendererHW::OI_BlitFMV(GSTextureCache::Target* _rt, GSTextureCache::Source* tex, const GSVector4i& r_draw)
+{
+	const bool fmv_blit = r_draw.w > 1024 && (m_vt.m_primclass == GS_SPRITE_CLASS) && (m_vertex.next == 2) && PRIM->TME && !PRIM->ABE;
+
+	// Nothing to see keep going
+	if (!fmv_blit)
+		return true;
+
+	// TBW is the divisor of the offset computation below. A texture buffer
+	// width of 0 would be an integer division by zero, so leave such a draw
+	// to the regular path.
+	if (m_context->TEX0.TBW == 0)
+		return true;
+
+	GL_PUSH("OI_BlitFMV");
+
+	GL_INS("OI_BlitFMV");
+
+	// The draw is done past the RT at the location of the texture. To avoid various upscaling mess
+	// We will blit the data from the top to the bottom of the texture manually.
+
+	// Expected memory representation
+	// -----------------------------------------------------------------
+	// RT (2 half frame)
+	// -----------------------------------------------------------------
+	// Top of Texture (full height frame)
+	//
+	// Bottom of Texture (half height frame, will be the copy of Top texture after the draw)
+	// -----------------------------------------------------------------
+
+	// sRect is the top of texture, in normalized texture coordinates
+	int tw = (int)(1 << m_context->TEX0.TW);
+	int th = (int)(1 << m_context->TEX0.TH);
+	GSVector4 sRect;
+	sRect.x = m_vt.m_min.t.x / tw;
+	sRect.y = m_vt.m_min.t.y / th;
+	sRect.z = m_vt.m_max.t.x / tw;
+	sRect.w = m_vt.m_max.t.y / th;
+
+	// Compute the Bottom of texture rectangle: the draw rectangle is moved
+	// up by the distance between the frame buffer and the texture.
+	ASSERT(m_context->TEX0.TBP0 > m_context->FRAME.Block());
+	int offset = (m_context->TEX0.TBP0 - m_context->FRAME.Block()) / m_context->TEX0.TBW;
+	GSVector4i r_texture(r_draw);
+	r_texture.y -= offset;
+	r_texture.w -= offset;
+
+	GSVector4 dRect(r_texture);
+
+	// Do the blit. With a Copy mess to avoid issue with limited API (dx)
+	// m_dev->StretchRect(tex->m_texture, sRect, tex->m_texture, dRect);
+	GSVector4i r_full(0, 0, tw, th);
+	if (GSTexture* rt = m_dev->CreateRenderTarget(tw, th, false)) {
+		// Temporary copy of the texture, blit into it, then copy everything back
+		m_dev->CopyRect(tex->m_texture, rt, r_full);
+
+		m_dev->StretchRect(tex->m_texture, sRect, rt, dRect);
+
+		m_dev->CopyRect(rt, tex->m_texture, r_full);
+
+		m_dev->Recycle(rt);
+	}
+
+	// Copy back the texture into the GS mem. I don't know why but it will be
+	// reuploaded again later
+	m_tc->Read(tex, r_texture);
+
+	m_tc->InvalidateVideoMemSubTarget(_rt);
+
+	GL_POP();
+
+	return false; // skip current draw
+}
+
+// OI (others input?/implementation?) hacks replace current draw call
+
+// FFXII FMV hack.
+//
+// The function keeps two pieces of state across calls:
+//  - lines : 0 while idle, otherwise the number of scanlines (448 or 512) of
+//            the video, learnt from a line draw with 448*2 or 512*2 vertices.
+//  - video : lazily allocated 512x512 buffer receiving the pixels of the
+//            point draws (rows are 448 pixels apart).
+//
+// Point draws with at least 16*512 vertices are captured into `video` and
+// cancelled (return false). The following line draw with lines*2 vertices is
+// replaced by two triangles textured with the captured picture.
+//
+// Returns false to cancel the draw, true to let it proceed.
+bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	static uint32* video = NULL;
+	static size_t lines = 0;
+
+	if(lines == 0)
+	{
+		if(m_vt.m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2))
+		{
+			lines = m_vertex.next / 2;
+		}
+	}
+	else
+	{
+		if(m_vt.m_primclass == GS_POINT_CLASS)
+		{
+			if(m_vertex.next >= 16 * 512)
+			{
+				// incoming pixels are stored in columns, one column is 16x512, total res 448x512 or 448x454
+
+				if(!video) video = new uint32[512 * 512];
+
+				int ox = m_context->XYOFFSET.OFX - 8;
+				int oy = m_context->XYOFFSET.OFY - 8;
+
+				const GSVertex* RESTRICT v = m_vertex.buff;
+
+				for(int i = (int)m_vertex.next; i > 0; i--, v++)
+				{
+					int x = (v->XYZ.X - ox) >> 4;
+					int y = (v->XYZ.Y - oy) >> 4;
+
+					if (x < 0 || x >= 448 || y < 0 || y >= (int)lines) return false; // le sigh
+
+					// (y << 8) + (y << 7) + (y << 6) == y * 448
+					video[(y << 8) + (y << 7) + (y << 6) + x] = v->RGBAQ.u32[0];
+				}
+
+				return false;
+			}
+			else
+			{
+				lines = 0;
+			}
+		}
+		else if(m_vt.m_primclass == GS_LINE_CLASS)
+		{
+			if(m_vertex.next == lines * 2)
+			{
+				// normally, this step would copy the video onto screen with 512 texture mapped horizontal lines,
+				// but we use the stored video data to create a new texture, and replace the lines with two triangles
+
+				// The replacement needs both a source texture object to
+				// update and captured pixels. `video` is still NULL when no
+				// point draw was captured yet and `t` is NULL when the draw
+				// has no texture; in those cases the lines are drawn as is.
+				if(t && video)
+				{
+					m_dev->Recycle(t->m_texture);
+
+					t->m_texture = m_dev->CreateTexture(512, 512);
+
+					t->m_texture->Update(GSVector4i(0, 0, 448, (int)lines), video, 448 * 4);
+
+					// Keep the first and the last line: their 4 vertices
+					// are the corners of the picture.
+					m_vertex.buff[2] = m_vertex.buff[m_vertex.next - 2];
+					m_vertex.buff[3] = m_vertex.buff[m_vertex.next - 1];
+
+					m_index.buff[0] = 0;
+					m_index.buff[1] = 1;
+					m_index.buff[2] = 2;
+					m_index.buff[3] = 1;
+					m_index.buff[4] = 2;
+					m_index.buff[5] = 3;
+
+					m_vertex.head = m_vertex.tail = m_vertex.next = 4;
+					m_index.tail = 6;
+
+					m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS);
+				}
+			}
+			else
+			{
+				lines = 0;
+			}
+		}
+	}
+
+	return true;
+}
+
+// FFX: clears the depth buffer on the random battle transition, where the
+// z buffer is written directly. The draw always proceeds.
+bool GSRendererHW::OI_FFX(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 zbp = m_context->ZBUF.Block();
+	const uint32 tbp = m_context->TEX0.TBP0;
+
+	const bool target_matches = (fbp == 0x00d00 || fbp == 0x00000) && zbp == 0x02100;
+	const bool source_matches = PRIM->TME && tbp == 0x01a00 && m_context->TEX0.PSM == PSM_PSMCT16S;
+
+	if (target_matches && source_matches)
+		m_dev->ClearDepth(ds, 0);
+
+	return true;
+}
+
+// Metal Slug 6: missing red channel fix (looks alright in pcsx2 r5000+).
+// Every vertex whose red is 0 while green and blue are not gets a red value
+// equal to the rounded average of green and blue. The draw always proceeds.
+bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	GSVertex* RESTRICT vtx = m_vertex.buff;
+	const int count = (int)m_vertex.next;
+
+	for (int n = 0; n < count; n++)
+	{
+		const uint32 color = vtx[n].RGBAQ.u32[0];
+
+		const uint32 red   = color & 0xff;
+		const uint32 green = (color >> 8) & 0xff;
+		const uint32 blue  = (color >> 16) & 0xff;
+
+		if (red != 0 || green == 0 || blue == 0)
+			continue;
+
+		vtx[n].RGBAQ.u32[0] = (color & 0xffffff00) | ((green + blue + 1) >> 1);
+	}
+
+	// The colors changed, refresh the vertex trace
+	m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt.m_primclass);
+
+	return true;
+}
+
+// God of War 2: the z buffer is cleared through a frame buffer in Z24 format.
+// The matching depth target is cleared and the draw is cancelled.
+bool GSRendererHW::OI_GodOfWar2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fbw = m_context->FRAME.FBW;
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	// ntsc 0xf00, pal 0x100, ntsc "HD" 0x1280
+	const bool known_address = (fbp == 0x00f00 || fbp == 0x00100 || fbp == 0x01280);
+
+	if (!(known_address && fpsm == PSM_PSMZ24))
+		return true;
+
+	// z buffer clear
+
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = fbp;
+	TEX0.TBW = fbw;
+	TEX0.PSM = fpsm;
+
+	GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+
+	if (depth)
+		m_dev->ClearDepth(depth->m_texture, 0);
+
+	return false;
+}
+
+// The Simpsons Game: z buffer clear done with a half height sprite.
+bool GSRendererHW::OI_SimpsonsGame(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	// 0x1800 pal, 0x1500 ntsc
+	const bool is_z_clear = (fbp == 0x01500 || fbp == 0x01800) && fpsm == PSM_PSMZ24;
+
+	if (!is_z_clear)
+		return true;
+
+	// Instead of simply drawing a full height 512x512 sprite to clear the
+	// z buffer, the game only uses a 512x256 sprite, yet it still manages to
+	// fill the whole surface with zeros. How? By using a render target that
+	// overlaps with the lower half of the z buffer...
+
+	// TODO: tony hawk pro skater 4 has the same problem, the empty half is not visible though, painted over fully
+
+	m_dev->ClearDepth(ds, 0);
+
+	return false;
+}
+
+// Rozen Maiden Gebet Garden: the frame buffer and the z buffer are cleared
+// through each other. Both recognised untextured draws are replaced by a
+// clear of the relevant target and cancelled.
+bool GSRendererHW::OI_RozenMaidenGebetGarden(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	if (PRIM->TME)
+		return true;
+
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 zbp = m_context->ZBUF.Block();
+
+	if (fbp == 0x008c0 && zbp == 0x01a40)
+	{
+		// frame buffer clear, atst = fail, afail = write z only, z buffer points to frame buffer
+
+		GIFRegTEX0 TEX0;
+
+		TEX0.TBP0 = zbp;
+		TEX0.TBW = m_context->FRAME.FBW;
+		TEX0.PSM = m_context->FRAME.PSM;
+
+		GSTextureCache::Target* color = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::RenderTarget, true);
+
+		if (color)
+			m_dev->ClearRenderTarget(color->m_texture, 0);
+
+		return false;
+	}
+
+	if (fbp == 0x00000 && zbp == 0x01180)
+	{
+		// z buffer clear, frame buffer now points to the z buffer (how can they be so clever?)
+
+		GIFRegTEX0 TEX0;
+
+		TEX0.TBP0 = fbp;
+		TEX0.TBW = m_context->FRAME.FBW;
+		TEX0.PSM = m_context->ZBUF.PSM;
+
+		GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+
+		if (depth)
+			m_dev->ClearDepth(depth->m_texture, 0);
+
+		return false;
+	}
+
+	return true;
+}
+
+// Spiderman Web of Shadows: only the top half of the screen gets cleared, so
+// the depth buffer is cleared explicitly. The draw always proceeds.
+bool GSRendererHW::OI_SpidermanWoS(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	// 0x2800 pal, 0x25a0 ntsc
+	const bool known_address = (fbp == 0x025a0 || fbp == 0x02800);
+
+	if (known_address && fpsm == PSM_PSMCT32)
+		m_dev->ClearDepth(ds, 0);
+
+	return true;
+}
+
+// Ty the Tasmanian Tiger (1 & 2): half height buffer clear. The depth buffer
+// is cleared here and the draw is cancelled.
+bool GSRendererHW::OI_TyTasmanianTiger(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	// 0x2800 pal, 0x2bc0 ntsc
+	const bool known_address = (fbp == 0x02800 || fbp == 0x02BC0);
+
+	if (!(known_address && fpsm == PSM_PSMCT24))
+		return true;
+
+	m_dev->ClearDepth(ds, 0);
+
+	return false;
+}
+
+// Digimon Rumble Arena 2: half height buffer clear done with an untextured
+// draw. The depth buffer is cleared explicitly; the draw always proceeds.
+bool GSRendererHW::OI_DigimonRumbleArena2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	const bool known_target = (fbp == 0x02300 || fbp == 0x03fc0) && fpsm == PSM_PSMCT32;
+
+	if (!PRIM->TME && known_target)
+		m_dev->ClearDepth(ds, 0);
+
+	return true;
+}
+
+// Black Hawk Down: half height buffer clear. The depth buffer is cleared here
+// and the draw is cancelled.
+bool GSRendererHW::OI_BlackHawkDown(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	if (fbp != 0x02000 || fpsm != PSM_PSMZ24)
+		return true;
+
+	m_dev->ClearDepth(ds, 0);
+
+	return false;
+}
+
+// Star Wars: The Force Unleashed: depth clears.
+//  - untextured draw to a 24 bits frame at 0x2bc0: clear depth, cancel the draw
+//  - textured draw to a 32 bits frame at 0x0/0x1180 with a constant z of 0:
+//    clear depth, keep the draw
+bool GSRendererHW::OI_StarWarsForceUnleashed(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	if (PRIM->TME)
+	{
+		const bool known_target = (fbp == 0x0 || fbp == 0x01180) && fpsm == PSM_PSMCT32;
+
+		if (known_target && (m_vt.m_eq.z && m_vt.m_max.p.z == 0))
+			m_dev->ClearDepth(ds, 0);
+
+		return true;
+	}
+
+	if (fpsm == PSM_PSMCT24 && fbp == 0x2bc0)
+	{
+		m_dev->ClearDepth(ds, 0);
+
+		return false;
+	}
+
+	return true;
+}
+
+// X-Men Origins: Wolverine: half height buffer clear. The depth buffer is
+// cleared explicitly; the draw always proceeds.
+bool GSRendererHW::OI_XmenOriginsWolverine(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	const bool is_half_clear = (fbp == 0x0) && (fpsm == PSM_PSMCT16);
+
+	if (is_half_clear)
+		m_dev->ClearDepth(ds, 0);
+
+	return true;
+}
+
+// Call of Duty: Final Fronts: half height buffer clear. The depth buffer is
+// cleared here and the draw is cancelled.
+bool GSRendererHW::OI_CallofDutyFinalFronts(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	if (fbp != 0x02300 || fpsm != PSM_PSMZ24)
+		return true;
+
+	m_dev->ClearDepth(ds, 0);
+
+	return false;
+}
+
+// Spyro: A New Beginning: depth clears.
+//  - untextured draw to a 24 bits frame at 0x2800 (pal) / 0x2bc0 (ntsc):
+//    half height buffer clear, clear depth and cancel the draw
+//  - textured draw to a 32 bits frame at 0x0/0x1180 with a constant z of 0:
+//    clear depth, keep the draw
+bool GSRendererHW::OI_SpyroNewBeginning(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	if (PRIM->TME)
+	{
+		const bool known_target = (fbp == 0x0 || fbp == 0x01180) && fpsm == PSM_PSMCT32;
+
+		if (known_target && (m_vt.m_eq.z && m_vt.m_min.p.z == 0))
+			m_dev->ClearDepth(ds, 0);
+
+		return true;
+	}
+
+	if (fpsm == PSM_PSMCT24 && (fbp == 0x02800 || fbp == 0x02bc0))
+	{
+		m_dev->ClearDepth(ds, 0);
+
+		return false;
+	}
+
+	return true;
+}
+
+// Spyro: The Eternal Night: depth clears.
+//  - untextured draw to a 24 bits frame at 0x2bc0: half height buffer clear,
+//    clear depth and cancel the draw
+//  - textured draw to a 32 bits frame at 0x0/0x1180 with a constant z of 0:
+//    clear depth, keep the draw
+bool GSRendererHW::OI_SpyroEternalNight(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	if (PRIM->TME)
+	{
+		const bool known_target = (fbp == 0x0 || fbp == 0x01180) && fpsm == PSM_PSMCT32;
+
+		if (known_target && (m_vt.m_eq.z && m_vt.m_min.p.z == 0))
+			m_dev->ClearDepth(ds, 0);
+
+		return true;
+	}
+
+	if (fpsm == PSM_PSMCT24 && fbp == 0x2bc0)
+	{
+		m_dev->ClearDepth(ds, 0);
+
+		return false;
+	}
+
+	return true;
+}
+
+// Tales of Legendia: forces the depth test to "always pass" for constant-z
+// draws without alpha test to the 32 bits frame at 0x1c00. The draw always
+// proceeds.
+bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	const bool known_target = (fpsm == PSM_PSMCT32) && (fbp == 0x01c00);
+
+	if (known_target && !m_context->TEST.ATE && m_vt.m_eq.z)
+	{
+		// Alternative that was tried: m_dev->ClearDepth(ds, 0);
+		m_context->TEST.ZTST = ZTST_ALWAYS;
+	}
+
+	return true;
+}
+
+// SMT Nocturne: draws that mask the whole RGB part of the frame buffer
+// (FBMSK == 0x00ffffff) are replaced by a clear of the depth target found at
+// the frame buffer address, unless the vertex queue matches the excluded
+// head/tail/next values. Such draws are cancelled.
+bool GSRendererHW::OI_SMTNocturne(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	const uint32 fbmsk = m_context->FRAME.FBMSK;
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fbw = m_context->FRAME.FBW;
+	const uint32 fpsm = m_context->FRAME.PSM;
+
+	const bool rgb_masked = (fbmsk == 0x00ffffff);
+	const bool queue_matches = m_vertex.head != 2 && m_vertex.tail != 4 && m_vertex.next != 4;
+
+	if (!(rgb_masked && queue_matches))
+		return true;
+
+	GIFRegTEX0 TEX0;
+
+	TEX0.TBP0 = fbp;
+	TEX0.TBW = fbw;
+	TEX0.PSM = fpsm;
+
+	GSTextureCache::Target* depth = m_tc->LookupTarget(TEX0, m_width, m_height, GSTextureCache::DepthStencil, true);
+
+	if (depth)
+		m_dev->ClearDepth(depth->m_texture, 0);
+
+	return false;
+}
+
+// Palette upload done with a list of points.
+//
+// An untextured point draw of exactly 16 or 256 vertices to a frame buffer at
+// or above 0x3f40 (aligned on 0x20 blocks) is written straight into GS memory
+// as an 8x2 or a 16x16 block, with the alpha of every entry doubled and
+// saturated to 0xff. The CLUT is then invalidated and the draw is cancelled.
+bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	if (m_vt.m_primclass != GS_POINT_CLASS || PRIM->TME)
+		return true;
+
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 fbw = m_context->FRAME.FBW;
+
+	if (fbp < 0x03f40 || (fbp & 0x1f) != 0)
+		return true;
+
+	// log2 of the block width: 16 entries => 8 wide, 256 entries => 16 wide
+	int width_shift;
+
+	if (m_vertex.next == 16)
+	{
+		width_shift = 3;
+	}
+	else if (m_vertex.next == 256)
+	{
+		width_shift = 4;
+	}
+	else
+	{
+		ASSERT(0);
+
+		return true;
+	}
+
+	const int entries = (int)m_vertex.next;
+	const int column_mask = (1 << width_shift) - 1;
+
+	GSVertex* RESTRICT vtx = m_vertex.buff;
+
+	for (int n = 0; n < entries; n++)
+	{
+		const uint32 color = vtx[n].RGBAQ.u32[0];
+		const uint32 alpha = color >> 24;
+
+		// Alpha is doubled; 0x80 and above saturate to 0xff
+		const uint32 fixed = (alpha >= 0x80 ? 0xff000000 : (alpha << 25)) | (color & 0x00ffffff);
+
+		vtx[n].RGBAQ.u32[0] = fixed;
+
+		m_mem.WritePixel32(n & column_mask, n >> width_shift, fixed, fbp, fbw);
+	}
+
+	m_mem.m_clut.Invalidate();
+
+	return false;
+}
+
+// Superman Returns: instead of a fullscreen rectangle the game draws a
+// 32 pixels wide, 4096 pixels high one with a FBW of 1. Technically the FB
+// wraps/overlaps on itself...
+//
+// When colour and depth share the same address and both are written by an
+// untextured constant colour draw, the render target is cleared directly with
+// that colour, the depth targets at that address are invalidated and the draw
+// is cancelled.
+bool GSRendererHW::OI_SuperManReturns(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
+{
+	GSDrawingContext* context = m_context;
+	GSVertex* first = &m_vertex.buff[0];
+
+	if (context->FRAME.FBP != context->ZBUF.ZBP || PRIM->TME || context->ZBUF.ZMSK || context->FRAME.FBMSK || m_vt.m_eq.rgba != 0xFFFF)
+		return true;
+
+	// Please kill those crazy devs!
+	// Expected: a single sprite whose packed RGBA value equals its Z value.
+	ASSERT(m_vertex.next == 2);
+	ASSERT(m_vt.m_primclass == GS_SPRITE_CLASS);
+	ASSERT((first->RGBAQ.A << 24 | first->RGBAQ.B << 16 | first->RGBAQ.G << 8 | first->RGBAQ.R) == (int)first->XYZ.Z);
+
+	// Do a direct write
+	m_dev->ClearRenderTarget(rt, GSVector4(m_vt.m_min.c));
+
+	m_tc->InvalidateVideoMemType(GSTextureCache::DepthStencil, context->FRAME.Block());
+
+	return false;
+}
+
+
+// OO (others output?) hacks: invalidate extra local memory after the draw call
+
+// DBZ Budokai Tenkaichi 2: palette readback (cannot be detected yet when the
+// texture is fetched later). After a textured draw to one of the two known
+// frame/texture address pairs, a 64x64 32 bits area at the frame buffer
+// address is passed to InvalidateLocalMem.
+void GSRendererHW::OO_DBZBT2()
+{
+	const uint32 fbp = m_context->FRAME.Block();
+	const uint32 tbp0 = m_context->TEX0.TBP0;
+
+	const bool first_pair = (fbp == 0x03c00 && tbp0 == 0x03c80);
+	const bool second_pair = (fbp == 0x03ac0 && tbp0 == 0x03b40);
+
+	if (!PRIM->TME || !(first_pair || second_pair))
+		return;
+
+	GIFRegBITBLTBUF BITBLTBUF;
+
+	BITBLTBUF.SBP = fbp;
+	BITBLTBUF.SBW = 1;
+	BITBLTBUF.SPSM = PSM_PSMCT32;
+
+	InvalidateLocalMem(BITBLTBUF, GSVector4i(0, 0, 64, 64));
+}
+
+// Majokko a la Mode 2: palette readback. After an untextured draw to the
+// frame buffer at 0x3f40, a 16x16 32 bits area at that address is passed to
+// InvalidateLocalMem.
+void GSRendererHW::OO_MajokkoALaMode2()
+{
+	const uint32 fbp = m_context->FRAME.Block();
+
+	if (PRIM->TME || fbp != 0x03f40)
+		return;
+
+	GIFRegBITBLTBUF BITBLTBUF;
+
+	BITBLTBUF.SBP = fbp;
+	BITBLTBUF.SBW = 1;
+	BITBLTBUF.SPSM = PSM_PSMCT32;
+
+	InvalidateLocalMem(BITBLTBUF, GSVector4i(0, 0, 16, 16));
+}
+
+// Can Upscale hacks: disable upscaling for some draw calls
+
+// DBZ Budokai Tenkaichi 2: the palette should stay 64 x 64, so upscaling is
+// refused (false) for draws to the two palette frame buffers.
+bool GSRendererHW::CU_DBZBT2()
+{
+	const uint32 fbp = m_context->FRAME.Block();
+
+	const bool is_palette_target = (fbp == 0x03c00 || fbp == 0x03ac0);
+
+	return !is_palette_target;
+}
+
+// Majokko a la Mode 2: the palette should stay 16 x 16, so upscaling is
+// refused (false) for draws to the palette frame buffer.
+bool GSRendererHW::CU_MajokkoALaMode2()
+{
+	const uint32 fbp = m_context->FRAME.Block();
+
+	const bool is_palette_target = (fbp == 0x03f40);
+
+	return !is_palette_target;
+}
+
+// Tales of the Abyss: full image blur and brightening. Upscaling is refused
+// (false) for draws to the three frame buffers used by the effect.
+bool GSRendererHW::CU_TalesOfAbyss()
+{
+	const uint32 fbp = m_context->FRAME.Block();
+
+	const bool is_effect_target = (fbp == 0x036e0 || fbp == 0x03560 || fbp == 0x038e0);
+
+	return !is_effect_target;
+}
diff --git a/plugins/GSdx_legacy/GSRendererHW.h b/plugins/GSdx_legacy/GSRendererHW.h
new file mode 100644
index 0000000000..83272e7899
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererHW.h
@@ -0,0 +1,171 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+#include "GSTextureCache.h"
+#include "GSCrc.h"
+#include "GSFunctionMap.h"
+#include "GSState.h"
+
+// Base class of the hardware renderers. Derives from GSRenderer, holds a
+// pointer to a texture cache (m_tc) and a set of per-game hack hooks
+// (m_hacks). Concrete renderers must implement DrawPrims().
+class GSRendererHW : public GSRenderer
+{
+private:
+	int m_width;
+	int m_height;
+	int m_skip;
+	bool m_reset;
+	int m_upscale_multiplier;
+	int m_userhacks_skipdraw;
+
+	bool m_userhacks_align_sprite_X;
+	bool m_userhacks_disable_gs_mem_clear;
+
+	#pragma region hacks
+
+	// Pointer-to-member types for the three hack families:
+	//  - OI_*: take the render target, depth target and source texture and
+	//          return a bool (NOTE(review): meaning of the return value is
+	//          defined by the caller in the .cpp - confirm there).
+	//  - OO_*: take no argument and return nothing.
+	//  - CU_*: take no argument and return a bool.
+	typedef bool (GSRendererHW::*OI_Ptr)(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	typedef void (GSRendererHW::*OO_Ptr)();
+	typedef bool (GSRendererHW::*CU_Ptr)();
+
+	// Require special argument
+	void OI_GsMemClear(); // always on
+
+	bool OI_DoubleHalfClear(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_FFX(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_GodOfWar2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SimpsonsGame(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_RozenMaidenGebetGarden(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SpidermanWoS(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_TyTasmanianTiger(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_DigimonRumbleArena2(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_BlackHawkDown(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_StarWarsForceUnleashed(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_XmenOriginsWolverine(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_CallofDutyFinalFronts(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SpyroNewBeginning(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SpyroEternalNight(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SMTNocturne(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	bool OI_SuperManReturns(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t);
+	void OO_DBZBT2();
+	void OO_MajokkoALaMode2();
+
+	bool CU_DBZBT2();
+	bool CU_MajokkoALaMode2();
+	bool CU_TalesOfAbyss();
+
+	// Registry of the per-game hacks: one list and one lookup map per hack
+	// family, plus the currently selected function pointer of each family.
+	class Hacks
+	{
+		// One table row: the hack function `func` applies to `title` in `region`.
+		template<class T> class HackEntry
+		{
+		public:
+			CRC::Title title;
+			CRC::Region region;
+			T func;
+
+			HackEntry(CRC::Title t, CRC::Region r, T f)
+			{
+				title = t;
+				region = r;
+				func = f;
+			}
+		};
+
+		// GSFunctionMap specialisation keyed by a packed (region, title) value
+		// and backed by a list of HackEntry rows held by reference.
+		template<class T> class FunctionMap : public GSFunctionMap<uint32, T>
+		{
+			list<HackEntry<T> >& m_tbl;
+
+			// Decodes the key (low 24 bits: title, high 8 bits: region) and
+			// returns the function of the first row whose title matches and
+			// whose region is either equal or CRC::RegionCount (used as a
+			// "any region" wildcard). Returns NULL when no row matches.
+			T GetDefaultFunction(uint32 key)
+			{
+				CRC::Title title = (CRC::Title)(key & 0xffffff);
+				CRC::Region region = (CRC::Region)(key >> 24);
+
+				for(typename list<HackEntry<T> >::iterator i = m_tbl.begin(); i != m_tbl.end(); i++)
+				{
+					if(i->title == title && (i->region == CRC::RegionCount || i->region == region))
+					{
+						return i->func;
+					}
+				}
+
+				return NULL;
+			}
+
+		public:
+			FunctionMap(list<HackEntry<T> >& tbl) : m_tbl(tbl) {}
+		};
+
+		// The lists are declared before the maps; each FunctionMap stores a
+		// reference to a list (presumably the matching one below - the binding
+		// happens in the Hacks constructor, which is not in this header).
+		list<HackEntry<OI_Ptr> > m_oi_list;
+		list<HackEntry<OO_Ptr> > m_oo_list;
+		list<HackEntry<CU_Ptr> > m_cu_list;
+
+		FunctionMap<OI_Ptr> m_oi_map;
+		FunctionMap<OO_Ptr> m_oo_map;
+		FunctionMap<CU_Ptr> m_cu_map;
+
+	public:
+		// Currently selected hack of each family.
+		// NOTE(review): presumably updated by SetGameCRC() - confirm in the .cpp.
+		OI_Ptr m_oi;
+		OO_Ptr m_oo;
+		CU_Ptr m_cu;
+
+		Hacks();
+
+		void SetGameCRC(const CRC::Game& game);
+
+	} m_hacks;
+
+	#pragma endregion
+
+	int Interpolate_UV(float alpha, int t0, int t1);
+	float alpha0(int L, int X0, int X1);
+	float alpha1(int L, int X0, int X1);
+
+	template <bool linear> void RoundSpriteOffset();
+
+protected:
+	// Texture cache passed to the constructor.
+	// NOTE(review): ownership/lifetime is decided in the .cpp - confirm there.
+	GSTextureCache* m_tc;
+
+	// Backend-specific draw of the current primitives; implemented by derived renderers.
+	virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) = 0;
+
+	int m_userhacks_round_sprite_offset;
+
+public:
+	GSRendererHW(GSTextureCache* tc);
+	virtual ~GSRendererHW();
+
+	void SetGameCRC(uint32 crc, int options);
+	bool CanUpscale();
+	int GetUpscaleMultiplier();
+	virtual GSVector2i GetInternalResolution();
+	void SetScaling();
+
+	void Reset();
+	void VSync(int field);
+	void ResetDevice();
+	GSTexture* GetOutput(int i);
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
+	void Draw();
+};
diff --git a/plugins/GSdx_legacy/GSRendererNull.cpp b/plugins/GSdx_legacy/GSRendererNull.cpp
new file mode 100644
index 0000000000..19b8e88471
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererNull.cpp
@@ -0,0 +1,23 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererNull.h"
diff --git a/plugins/GSdx_legacy/GSRendererNull.h b/plugins/GSdx_legacy/GSRendererNull.h
new file mode 100644
index 0000000000..f7f26f9941
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererNull.h
@@ -0,0 +1,49 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+
+// Renderer that draws nothing and never produces an output texture.
+class GSRendererNull : public GSRenderer
+{
+	// Vertex trace type of the null renderer; it only forwards the state
+	// pointer to the GSVertexTrace constructor.
+	class GSVertexTraceNull : public GSVertexTrace
+	{
+	public:
+		GSVertexTraceNull(const GSState* state)
+			: GSVertexTrace(state)
+		{
+		}
+	};
+
+protected:
+	// Intentionally empty: nothing is rendered.
+	void Draw() {}
+
+	// There is no output, whatever the requested index.
+	GSTexture* GetOutput(int i) { return NULL; }
+
+public:
+	GSRendererNull() : GSRenderer() {}
+};
diff --git a/plugins/GSdx_legacy/GSRendererOGL.cpp b/plugins/GSdx_legacy/GSRendererOGL.cpp
new file mode 100644
index 0000000000..84afbc66eb
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererOGL.cpp
@@ -0,0 +1,1157 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererOGL.h"
+#include "GSRenderer.h"
+
+
+// Builds the OpenGL renderer on top of a new GSTextureCacheOGL and loads the
+// renderer options from the application configuration. The user-hack values
+// are reset to their neutral value when the global "UserHacks" switch is off.
+GSRendererOGL::GSRendererOGL()
+	: GSRendererHW(new GSTextureCacheOGL(this))
+{
+	// Per-draw state starts from a known value.
+	m_prim_overlap = PRIM_OVERLAP_UNKNOW;
+	m_unsafe_fbmask = false;
+
+	// Hope nothing requires too many draw calls.
+	m_drawlist.reserve(2048);
+
+	// Accuracy options
+	m_accurate_date = theApp.GetConfig("accurate_date", 0);
+	m_sw_blending   = theApp.GetConfig("accurate_blending_unit", 1);
+
+	// User hacks: the texture coordinate offset packs x in its low 16 bits and
+	// y in its high 16 bits; both are divided by -1000.
+	UserHacks_TCOffset = theApp.GetConfig("UserHacks_TCOffset", 0);
+	UserHacks_TCO_x = (UserHacks_TCOffset & 0xFFFF) / -1000.0f;
+	UserHacks_TCO_y = ((UserHacks_TCOffset >> 16) & 0xFFFF) / -1000.0f;
+	UserHacks_safe_fbmask = theApp.GetConfig("UserHacks_safe_fbmask", false);
+
+	if (theApp.GetConfig("UserHacks", 0))
+		return;
+
+	// Hacks are globally disabled: drop everything read above.
+	UserHacks_TCOffset = 0;
+	UserHacks_TCO_x = 0;
+	UserHacks_TCO_y = 0;
+	UserHacks_safe_fbmask = false;
+}
+
+// Creates the device through GSRenderer::CreateDevice and, on success, turns
+// off accurate DATE and software blending when the driver lacks
+// GL_ARB_texture_barrier. Returns whether the device was created.
+bool GSRendererOGL::CreateDevice(GSDevice* dev)
+{
+	const bool created = GSRenderer::CreateDevice(dev);
+
+	// No sw blending if not supported (Intel GPU)
+	if (created && !GLLoader::found_GL_ARB_texture_barrier) {
+		fprintf(stderr, "Error GL_ARB_texture_barrier is not supported by your driver. You can't emulate correctly the GS blending unit! Sorry!\n");
+		m_accurate_date = false;
+		m_sw_blending = 0;
+	}
+
+	return created;
+}
+
+// CPU replacement of the sprite expansion: every sprite (2 vertices) of the
+// vertex buffer is expanded in place into a quad (4 vertices, 6 indices).
+// Does nothing for the other primitive classes.
+void GSRendererOGL::EmulateGS()
+{
+	if (m_vt.m_primclass != GS_SPRITE_CLASS) return;
+
+	// each sprite converted to quad needs twice the space
+
+	while(m_vertex.tail * 2 > m_vertex.maxcount)
+	{
+		GrowVertexBuffer();
+	}
+
+	// assume vertices are tightly packed and sequentially indexed (it should be the case)
+
+	const size_t count = m_vertex.next;
+
+	if (count < 2)
+		return;
+
+	// Walk from the last sprite to the first one: the quads are written
+	// further in the same buffer than the sprites they are built from.
+	GSVertex* src = &m_vertex.buff[count - 2];
+	GSVertex* dst = &m_vertex.buff[count * 2 - 4];
+	uint32* RESTRICT idx = &m_index.buff[count * 3 - 6];
+
+	for(int base = (int)count * 2 - 4; base >= 0; base -= 4, src -= 2, dst -= 4, idx -= 6)
+	{
+		GSVertex first = src[0];
+		GSVertex second = src[1];
+
+		// The first vertex takes color, depth and fog from the second one.
+		first.RGBAQ = second.RGBAQ;
+		first.XYZ.Z = second.XYZ.Z;
+		first.FOG = second.FOG;
+
+		dst[0] = first;
+		dst[3] = second;
+
+		// swap x, s, u
+
+		uint16 tmp_x = first.XYZ.X;
+		first.XYZ.X = second.XYZ.X;
+		second.XYZ.X = tmp_x;
+
+		float tmp_s = first.ST.S;
+		first.ST.S = second.ST.S;
+		second.ST.S = tmp_s;
+
+		uint16 tmp_u = first.U;
+		first.U = second.U;
+		second.U = tmp_u;
+
+		dst[1] = first;
+		dst[2] = second;
+
+		// Two triangles per quad: (0, 1, 2) and (1, 2, 3)
+		idx[0] = base + 0;
+		idx[1] = base + 1;
+		idx[2] = base + 2;
+		idx[3] = base + 1;
+		idx[4] = base + 2;
+		idx[5] = base + 3;
+	}
+
+	m_vertex.head = m_vertex.tail = m_vertex.next = count * 2;
+	m_index.tail = count * 3;
+}
+
+// Input assembler setup: uploads the vertex and index buffers and selects the
+// GL primitive topology matching the current primitive class. Without geometry
+// shader support the sprites are first expanded on the CPU (EmulateGS) and
+// drawn as triangles; with it they are sent as lines.
+void GSRendererOGL::SetupIA()
+{
+	GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
+	const bool has_geometry_shader = GLLoader::found_geometry_shader;
+
+	if (!has_geometry_shader)
+		EmulateGS();
+
+	dev->IASetVertexBuffer(m_vertex.buff, m_vertex.next);
+	dev->IASetIndexBuffer(m_index.buff, m_index.tail);
+
+	GLenum topology = 0;
+
+	switch(m_vt.m_primclass)
+	{
+	case GS_POINT_CLASS:
+		topology = GL_POINTS;
+		break;
+	case GS_LINE_CLASS:
+		topology = GL_LINES;
+		break;
+	case GS_SPRITE_CLASS:
+		topology = has_geometry_shader ? GL_LINES : GL_TRIANGLES;
+		break;
+	case GS_TRIANGLE_CLASS:
+		topology = GL_TRIANGLES;
+		break;
+	default:
+		__assume(0);
+	}
+
+	dev->IASetPrimitiveTopology(topology);
+}
+
+// Configures the pixel shader selector (ps_sel) and the color write mask
+// selector (om_csel) for the current draw.
+//  - When m_texture_shuffle is set, the vertex buffer is rewritten in place
+//    (X positions / texture coordinates shifted, heights halved) and the
+//    frame buffer mask is reduced to a 16 bits equivalent.
+//  - Otherwise the destination format and the per-channel write mask are
+//    derived directly from FRAME.PSM and FRAME.FBMSK.
+// May also fill ps_cb.FbMask and set m_unsafe_fbmask.
+// Returns true when the selected fbmask emulation requires a texture barrier.
+bool GSRendererOGL::EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel)
+{
+	bool require_barrier = false;
+
+	if (m_texture_shuffle) {
+		ps_sel.shuffle = 1;
+		ps_sel.dfmt = 0;
+
+		const GIFRegXYOFFSET& o = m_context->XYOFFSET;
+		GSVertex* v = &m_vertex.buff[0];
+		size_t count = m_vertex.next;
+
+		// vertex position is 8 to 16 pixels, therefore it is the 16-31 bits of the colors
+		int  pos = (v[0].XYZ.X - o.OFX) & 0xFF;
+		bool write_ba = (pos > 112 && pos < 136);
+		// Read texture is 8 to 16 pixels (same as above)
+		float tw = (float)(1u << m_context->TEX0.TW);
+		int tex_pos = (PRIM->FST) ? v[0].U : tw * v[0].ST.S;
+		tex_pos &= 0xFF;
+		ps_sel.read_ba = (tex_pos > 112 && tex_pos < 144);
+
+		// Convert the vertex info to a 32 bits color format equivalent
+		if (PRIM->FST) {
+			GL_INS("First vertex is  P: %d => %d    T: %d => %d", v[0].XYZ.X, v[1].XYZ.X, v[0].U, v[1].U);
+
+			// Vertices are processed by pairs (i, i+1)
+			for(size_t i = 0; i < count; i += 2) {
+				if (write_ba)
+					v[i].XYZ.X   -= 128u;
+				else
+					v[i+1].XYZ.X += 128u;
+
+				if (ps_sel.read_ba)
+					v[i].U       -= 128u;
+				else
+					v[i+1].U     += 128u;
+
+				// Height is too big (2x).
+				int tex_offset = v[i].V & 0xF;
+				GSVector4i offset(o.OFY, tex_offset, o.OFY, tex_offset);
+
+				// Remove the offset, halve the value, then put the offset back
+				GSVector4i tmp(v[i].XYZ.Y, v[i].V, v[i+1].XYZ.Y, v[i+1].V);
+				tmp = GSVector4i(tmp - offset).srl32(1) + offset;
+
+				v[i].XYZ.Y   = tmp.x;
+				v[i].V       = tmp.y;
+				v[i+1].XYZ.Y = tmp.z;
+				v[i+1].V     = tmp.w;
+			}
+		} else {
+			// Same as above but with floating point texture coordinates
+			const float offset_8pix = 8.0f / tw;
+			GL_INS("First vertex is  P: %d => %d    T: %f => %f (offset %f)", v[0].XYZ.X, v[1].XYZ.X, v[0].ST.S, v[1].ST.S, offset_8pix);
+
+			for(size_t i = 0; i < count; i += 2) {
+				if (write_ba)
+					v[i].XYZ.X   -= 128u;
+				else
+					v[i+1].XYZ.X += 128u;
+
+				if (ps_sel.read_ba)
+					v[i].ST.S    -= offset_8pix;
+				else
+					v[i+1].ST.S  += offset_8pix;
+
+				// Height is too big (2x).
+				GSVector4i offset(o.OFY, o.OFY);
+
+				GSVector4i tmp(v[i].XYZ.Y, v[i+1].XYZ.Y);
+				tmp = GSVector4i(tmp - offset).srl32(1) + offset;
+
+				//fprintf(stderr, "Before %d, After %d\n", v[i+1].XYZ.Y, tmp.y);
+				v[i].XYZ.Y   = tmp.x;
+				v[i].ST.T   /= 2.0f;
+				v[i+1].XYZ.Y = tmp.y;
+				v[i+1].ST.T /= 2.0f;
+			}
+		}
+
+		// If date is enabled you need to test the green channel instead of the
+		// alpha channel. Only enable this code in DATE mode to reduce the number
+		// of shader.
+		ps_sel.write_rg = !write_ba && m_context->TEST.DATE;
+
+		// Please bang my head against the wall!
+		// 1/ Reduce the frame mask to a 16 bit format
+		// (bits 3-7, 11-15, 19-23 and 31 of the 32 bits mask are kept)
+		const uint32& m = m_context->FRAME.FBMSK;
+		uint32 fbmask = ((m >> 3) & 0x1F) | ((m >> 6) & 0x3E0) | ((m >> 9) & 0x7C00) | ((m >> 16) & 0x8000);
+		// FIXME GSVector will be nice here
+		uint8 rg_mask = fbmask & 0xFF;
+		uint8 ba_mask = (fbmask >> 8) & 0xFF;
+		om_csel.wrgba = 0;
+
+		// 2 Select the new mask (Please someone put SSE here)
+		// A channel is written unless its 8 bits mask is fully set; a partial
+		// mask (neither 0 nor 0xFF) requests the fbmask emulation.
+		if (rg_mask != 0xFF) {
+			if (write_ba) {
+				GL_INS("Color shuffle %s => B", ps_sel.read_ba ? "B" : "R");
+				om_csel.wb = 1;
+			} else {
+				GL_INS("Color shuffle %s => R", ps_sel.read_ba ? "B" : "R");
+				om_csel.wr = 1;
+			}
+			if (rg_mask)
+				ps_sel.fbmask = 1;
+		}
+
+		if (ba_mask != 0xFF) {
+			if (write_ba) {
+				GL_INS("Color shuffle %s => A", ps_sel.read_ba ? "A" : "G");
+				om_csel.wa = 1;
+			} else {
+				GL_INS("Color shuffle %s => G", ps_sel.read_ba ? "A" : "G");
+				om_csel.wg = 1;
+			}
+			if (ba_mask)
+				ps_sel.fbmask = 1;
+		}
+
+		// The fbmask emulation is only available with sw blending enabled
+		if (ps_sel.fbmask && m_sw_blending) {
+			GL_INS("FBMASK SW emulated fb_mask:%x on tex shuffle", fbmask);
+			ps_cb.FbMask.r = rg_mask;
+			ps_cb.FbMask.g = rg_mask;
+			ps_cb.FbMask.b = ba_mask;
+			ps_cb.FbMask.a = ba_mask;
+			require_barrier = true;
+		} else {
+			ps_sel.fbmask = 0;
+		}
+
+	} else {
+		ps_sel.dfmt = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt;
+
+		// Per byte comparison of the mask: one bit per channel in each result
+		GSVector4i fbmask_v = GSVector4i::load((int)m_context->FRAME.FBMSK);
+		int ff_fbmask = fbmask_v.eq8(GSVector4i::xffffffff()).mask();
+		int zero_fbmask = fbmask_v.eq8(GSVector4i::zero()).mask();
+
+		om_csel.wrgba = ~ff_fbmask; // Enable channel if at least 1 bit is 0
+
+		// SW emulation is only needed when a channel is partially masked
+		ps_sel.fbmask = m_sw_blending && (~ff_fbmask & ~zero_fbmask & 0xF);
+
+		if (ps_sel.fbmask) {
+			ps_cb.FbMask = fbmask_v.u8to32();
+			// Only alpha is special here, I think we can take a very unsafe shortcut
+			// Alpha isn't blended on the GS but directly copyied into the RT.
+			//
+			// Behavior is clearly undefined however there is a high probability that
+			// it will work. Masked bit will be constant and normally the same everywhere
+			// RT/FS output/Cached value.
+			//
+			// Just to be sure let's add a new safe hack for unsafe access :)
+			//
+			// Here the GL spec quote to emphasize the unexpected behavior.
+			/*
+			   - If a texel has been written, then in order to safely read the result
+			   a texel fetch must be in a subsequent Draw separated by the command
+
+			   void TextureBarrier(void);
+
+			   TextureBarrier() will guarantee that writes have completed and caches
+			   have been invalidated before subsequent Draws are executed.
+			 */
+			// 0x7 selects the 3 low channels: the shortcut is taken when none of
+			// them is partially masked and the safe hack is off
+			if (!(~ff_fbmask & ~zero_fbmask & 0x7) && !UserHacks_safe_fbmask) {
+				GL_INS("FBMASK Unsafe SW emulated fb_mask:%x on %d bits format", m_context->FRAME.FBMSK,
+						(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt == 2) ? 16 : 32);
+				m_unsafe_fbmask = true;
+				require_barrier = false;
+			} else {
+				// The safe and accurate path (but slow)
+				GL_INS("FBMASK SW emulated fb_mask:%x on %d bits format", m_context->FRAME.FBMSK,
+						(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt == 2) ? 16 : 32);
+				require_barrier = true;
+			}
+		}
+	}
+
+	return require_barrier;
+}
+
+// Selects how the blending of the current draw is done: either with the HW
+// blend unit (dev->OMSetBlendState) or in the pixel shader ("sw blending",
+// configured through ps_sel.blend_a/b/c/d), depending on the m_sw_blending
+// accuracy level, the ALPHA register and the primitive overlap information.
+// Also selects the color clip (COLCLAMP.CLAMP == 0) algorithm.
+// DATE_GL42: sw blending is disabled when it is set.
+// Returns true when the draw requires a texture barrier.
+bool GSRendererOGL::EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, bool DATE_GL42)
+{
+	GSDeviceOGL* dev         = (GSDeviceOGL*)m_dev;
+	const GIFRegALPHA& ALPHA = m_context->ALPHA;
+	bool require_barrier     = false;
+	bool sw_blending         = false;
+
+	// No blending so early exit
+	if (!(PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS)) {
+#ifdef ENABLE_OGL_DEBUG
+		if (m_env.PABE.PABE) {
+			GL_INS("!!! ENV PABE  without ABE !!!");
+		}
+#endif
+		dev->OMSetBlendState();
+		return false;
+	}
+
+	if (m_env.PABE.PABE)
+	{
+		GL_INS("!!! ENV PABE  not supported !!!");
+		if (m_sw_blending >= ACC_BLEND_CCLIP_DALPHA) {
+			ps_sel.pabe = 1;
+			require_barrier |= (ALPHA.C == 1);
+			sw_blending = true;
+		}
+		//Breath of Fire Dragon Quarter triggers this in battles. Graphics are fine though.
+		//ASSERT(0);
+	}
+
+	// Compute the blending equation to detect special case
+	// (A, B, C, D are combined as the 4 digits of a base 3 number)
+	uint8 blend_index  = ((ALPHA.A * 3 + ALPHA.B) * 3 + ALPHA.C) * 3 + ALPHA.D;
+	int blend_flag = GSDeviceOGL::m_blendMapOGL[blend_index].bogus;
+
+	// SW Blend is (nearly) free. Let's use it.
+	bool impossible_or_free_blend = (blend_flag & (BLEND_NO_BAR|BLEND_A_MAX|BLEND_ACCU))
+			|| (m_prim_overlap == PRIM_OVERLAP_NO);
+
+	// Do the multiplication in shader for blending accumulation: Cs*As + Cd or Cs*Af + Cd
+	bool accumulation_blend = (blend_flag & BLEND_ACCU);
+
+	// Warning no break on purpose
+	// Every level falls through to the cases below it, so a level also enables
+	// all the conditions listed after it.
+	switch (m_sw_blending) {
+		case ACC_BLEND_ULTRA:           sw_blending |= true;
+		case ACC_BLEND_FULL:            if (!m_vt.m_alpha.valid && (ALPHA.C == 0)) GetAlphaMinMax();
+										sw_blending |= (ALPHA.A != ALPHA.B) &&
+												((ALPHA.C == 0 && m_vt.m_alpha.max > 128) || (ALPHA.C == 2 && ALPHA.FIX > 128u));
+		case ACC_BLEND_CCLIP_DALPHA:    sw_blending |= (ALPHA.C == 1) || (m_env.COLCLAMP.CLAMP == 0);
+										// Initial idea was to enable accurate blending for sprite rendering to handle
+										// correctly post-processing effect. Some games (ZoE) use tons of sprites as particles.
+										// In order to keep it fast, let's limit it to smaller draw call.
+		case ACC_BLEND_SPRITE:          sw_blending |= m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 100;
+		case ACC_BLEND_FREE:            sw_blending |= (ps_sel.fbmask  && !m_unsafe_fbmask) || impossible_or_free_blend; // blending is only free when we use slow fbmask
+		default:                        sw_blending |= accumulation_blend;
+	}
+	// SW Blending
+	// GL42 interact very badly with sw blending. GL42 uses the primitiveID to find the primitive
+	// that write the bad alpha value. Sw blending will force the draw to run primitive by primitive
+	// (therefore primitiveID will be constant to 1)
+	sw_blending &= !DATE_GL42;
+
+	// Color clip
+	if (m_env.COLCLAMP.CLAMP == 0) {
+		if (m_prim_overlap == PRIM_OVERLAP_NO) {
+			// The fastest algo that requires a single pass
+			GL_INS("COLCLIP Free mode ENABLED");
+			ps_sel.colclip = 1;
+			ASSERT(sw_blending);
+			accumulation_blend = false; // disable the HDR algo
+		} else if (accumulation_blend) {
+			// A fast algo that requires 2 passes
+			GL_INS("COLCLIP Fast HDR mode ENABLED");
+			ps_sel.hdr = 1;
+		} else if (sw_blending) {
+			// A slow algo that could requires several passes (barely used)
+			GL_INS("COLCLIP SW ENABLED (blending is %d/%d/%d/%d)", ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D);
+			ps_sel.colclip = 1;
+		} else {
+			// Speed hack skip previous slow algo
+			GL_INS("Sorry colclip isn't supported");
+		}
+	}
+
+	// Seriously don't expect me to support this kind of crazyness.
+	// No mix of COLCLIP + accumulation_blend + DATE GL42
+	// Neither fbmask and GL42
+	ASSERT(!(ps_sel.hdr && DATE_GL42));
+	ASSERT(!(ps_sel.fbmask && DATE_GL42));
+
+	// For stat to optimize accurate option
+#if 0
+	GL_INS("BLEND_INFO: %d/%d/%d/%d. Clamp:%d. Prim:%d number %d (sw %d)",
+			ALPHA.A, ALPHA.B,  ALPHA.C, ALPHA.D, m_env.COLCLAMP.CLAMP, m_vt.m_primclass, m_vertex.next, sw_blending);
+#endif
+	if (sw_blending) {
+		// The shader receives the blending equation selectors
+		ps_sel.blend_a = ALPHA.A;
+		ps_sel.blend_b = ALPHA.B;
+		ps_sel.blend_c = ALPHA.C;
+		ps_sel.blend_d = ALPHA.D;
+
+		if (accumulation_blend) {
+			// Keep HW blending to do the addition/subtraction
+			dev->OMSetBlendState(blend_index);
+			if (ALPHA.A == 2) {
+				// The blend unit does a reverse subtraction so it means
+				// the shader must output a positive value.
+				// Replace 0 - Cs by Cs - 0
+				ps_sel.blend_a = ALPHA.B;
+				ps_sel.blend_b = 2;
+			}
+			// Remove the addition/substraction from the SW blending
+			ps_sel.blend_d = 2;
+		} else {
+			// Disable HW blending
+			dev->OMSetBlendState();
+		}
+
+		// Require the fix alpha vlaue
+		if (ALPHA.C == 2) {
+			ps_cb.TA_Af.a = (float)ALPHA.FIX / 128.0f;
+		}
+
+		// No need to flush for every primitive
+		require_barrier |= !(blend_flag & BLEND_NO_BAR) && !accumulation_blend;
+	} else {
+		// HW blending path
+		ps_sel.clr1 = !!(blend_flag & BLEND_C_CLR);
+		if (ps_sel.dfmt == 1 && ALPHA.C == 1) {
+			// 24 bits doesn't have an alpha channel so use 1.0f fix factor as equivalent
+			int hacked_blend_index  = blend_index + 3; // +3 <=> +1 on C
+			dev->OMSetBlendState(hacked_blend_index, 128, true);
+		} else {
+			dev->OMSetBlendState(blend_index, ALPHA.FIX, (ALPHA.C == 2));
+		}
+	}
+
+	return require_barrier;
+}
+
+// Checks whether the sprites of the current draw overlap each other.
+// Returns PRIM_OVERLAP_NO when there are fewer than 4 vertices or when no
+// sprite intersects the ones before it, PRIM_OVERLAP_UNKNOW for non-sprite
+// primitives and PRIM_OVERLAP_YES otherwise.
+// For sprites, m_drawlist is rebuilt: each entry is the number of consecutive
+// sprites that don't overlap and can therefore be drawn together.
+// Note: m_drawlist is left untouched by the two early returns.
+GSRendererOGL::PRIM_OVERLAP GSRendererOGL::PrimitiveOverlap()
+{
+	// Either 1 triangle or 1 line or 3 POINTs
+	// It is bad for the POINTs but low probability that they overlap
+	if (m_vertex.next < 4)
+		return PRIM_OVERLAP_NO;
+
+	if (m_vt.m_primclass != GS_SPRITE_CLASS)
+		return PRIM_OVERLAP_UNKNOW; // maybe, maybe not
+
+	// Check intersection of sprite primitive only
+	size_t count = m_vertex.next;
+	PRIM_OVERLAP overlap = PRIM_OVERLAP_NO;
+	GSVertex* v = m_vertex.buff;
+
+	m_drawlist.clear();
+	size_t i = 0;
+	// Each iteration of the outer loop builds one batch starting at sprite i
+	while (i < count) {
+		// In order to speed up comparison a bounding-box is accumulated. It removes a
+		// loop so code is much faster (check game virtua fighter). Besides it allow to check
+		// properly the Y order.
+
+		// .x = min(v[i].XYZ.X, v[i+1].XYZ.X)
+		// .y = min(v[i].XYZ.Y, v[i+1].XYZ.Y)
+		// .z = max(v[i].XYZ.X, v[i+1].XYZ.X)
+		// .w = max(v[i].XYZ.Y, v[i+1].XYZ.Y)
+		// NOTE(review): presumably m[1] is the part of the vertex holding XYZ - confirm in GSVertex
+		GSVector4i all = GSVector4i(v[i].m[1]).upl16(GSVector4i(v[i+1].m[1])).upl16().xzyw();
+		all = all.xyxy().blend(all.zwzw(), all > all.zwxy());
+
+		size_t j = i + 2;
+		while (j < count) {
+			// Same min/max ordering for the candidate sprite
+			GSVector4i sprite = GSVector4i(v[j].m[1]).upl16(GSVector4i(v[j+1].m[1])).upl16().xzyw();
+			sprite = sprite.xyxy().blend(sprite.zwzw(), sprite > sprite.zwxy());
+
+			// Be sure to get vertex in good order, otherwise .r* function doesn't
+			// work as expected.
+			ASSERT(sprite.x <= sprite.z);
+			ASSERT(sprite.y <= sprite.w);
+			ASSERT(all.x <= all.z);
+			ASSERT(all.y <= all.w);
+
+			if (all.rintersect(sprite).rempty()) {
+				// No intersection: the sprite joins the batch
+				all = all.runion_ordered(sprite);
+			} else {
+				// Intersection: the batch stops before sprite j
+				overlap = PRIM_OVERLAP_YES;
+				break;
+			}
+			j += 2;
+		}
+		m_drawlist.push_back((j - i) >> 1); // Sprite count
+		i = j;
+	}
+
+#if 0
+	// Old algo: less constraint but O(n^2) instead of O(n) as above
+
+	// You have no guarantee on the sprite order, first vertex can be either top-left or bottom-left
+	// There is a high probability that the draw call will uses same ordering for all vertices.
+	// In order to keep a small performance impact only the first sprite will be checked
+	//
+	// Some safe-guard will be added in the outer-loop to avoid corruption with a limited perf impact
+	if (v[1].XYZ.Y < v[0].XYZ.Y) {
+		// First vertex is Top-Left
+		for(size_t i = 0; i < count; i += 2) {
+			if (v[i+1].XYZ.Y > v[i].XYZ.Y) {
+				return PRIM_OVERLAP_UNKNOW;
+			}
+			GSVector4i vi(v[i].XYZ.X, v[i+1].XYZ.Y, v[i+1].XYZ.X, v[i].XYZ.Y);
+			for (size_t j = i+2; j < count; j += 2) {
+				GSVector4i vj(v[j].XYZ.X, v[j+1].XYZ.Y, v[j+1].XYZ.X, v[j].XYZ.Y);
+				GSVector4i inter = vi.rintersect(vj);
+				if (!inter.rempty()) {
+					return PRIM_OVERLAP_YES;
+				}
+			}
+		}
+	} else {
+		// First vertex is Bottom-Left
+		for(size_t i = 0; i < count; i += 2) {
+			if (v[i+1].XYZ.Y < v[i].XYZ.Y) {
+				return PRIM_OVERLAP_UNKNOW;
+			}
+			GSVector4i vi(v[i].XYZ.X, v[i].XYZ.Y, v[i+1].XYZ.X, v[i+1].XYZ.Y);
+			for (size_t j = i+2; j < count; j += 2) {
+				GSVector4i vj(v[j].XYZ.X, v[j].XYZ.Y, v[j+1].XYZ.X, v[j+1].XYZ.Y);
+				GSVector4i inter = vi.rintersect(vj);
+				if (!inter.rempty()) {
+					return PRIM_OVERLAP_YES;
+				}
+			}
+		}
+	}
+#endif
+
+	//fprintf(stderr, "%d: Yes, code can be optimized (draw of %d vertices)\n", s_n, count);
+	return overlap;
+}
+
+// Returns the bounding box of the current draw in render target coordinates:
+// the min/max vertex positions are enlarged by one unit on every side,
+// multiplied by the render target scale and clipped to (0, 0, rtsize).
+GSVector4i GSRendererOGL::ComputeBoundingBox(const GSVector2& rtscale, const GSVector2i& rtsize)
+{
+	GSVector4 rt_scale = GSVector4(rtscale.x, rtscale.y);
+	GSVector4 rounding = GSVector4(-1.0f, 1.0f); // -1 on the min corner, +1 on the max corner
+
+	GSVector4 draw_box = m_vt.m_min.p.xyxy(m_vt.m_max.p) + rounding.xxyy();
+	GSVector4 scaled_box = draw_box * rt_scale.xyxy();
+
+	GSVector4i rt_rect(0, 0, rtsize.x, rtsize.y);
+
+	return GSVector4i(scaled_box).rintersect(rt_rect);
+}
+
+// Issues the draw call(s) of the current primitives.
+//  - No barrier required: a single draw, preceded by one texture barrier when
+//    the unsafe fbmask shortcut is active.
+//  - Barrier required and primitives known not to overlap: one barrier
+//    followed by a single draw.
+//  - Barrier required, sprites: one barrier + draw per batch of m_drawlist
+//    (batches of sprites that don't overlap each other).
+//  - Barrier required, other primitives: one barrier + draw per primitive.
+void GSRendererOGL::SendDraw(bool require_barrier)
+{
+	GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
+
+	if (!require_barrier && m_unsafe_fbmask) {
+		// Not safe but still worth to take some precautions.
+		ASSERT(GLLoader::found_GL_ARB_texture_barrier);
+		glTextureBarrier();
+		dev->DrawIndexedPrimitive();
+	} else if (!require_barrier) {
+		dev->DrawIndexedPrimitive();
+	} else if (m_prim_overlap == PRIM_OVERLAP_NO) {
+		ASSERT(GLLoader::found_GL_ARB_texture_barrier);
+		glTextureBarrier();
+		dev->DrawIndexedPrimitive();
+	} else if (m_vt.m_primclass == GS_SPRITE_CLASS) {
+		// Number of indices per sprite: 2 when sprites are sent as lines,
+		// 6 when they were expanded into two triangles.
+		size_t nb_vertex = (GLLoader::found_geometry_shader) ? 2 : 6;
+
+		GL_PUSH("Split the draw (SPRITE)");
+
+#if defined(_DEBUG)
+		// Check how draw call is split.
+		map<size_t, size_t> frequency;
+		for (const auto& it: m_drawlist)
+			++frequency[it];
+
+		string message;
+		for (const auto& it: frequency)
+			message += " " + to_string(it.first) + "(" + to_string(it.second) + ")";
+
+		// The sprite count is a size_t, so it must be printed with %zu
+		GL_PERF("Split single draw (%zu sprites) into %zu draws: consecutive draws(frequency):%s",
+			m_index.tail / nb_vertex, m_drawlist.size(), message.c_str());
+#endif
+
+		// p is the first index of the batch, count its number of indices
+		for (size_t count, p = 0, n = 0; n < m_drawlist.size(); p += count, ++n) {
+			count = m_drawlist[n] * nb_vertex;
+			glTextureBarrier();
+			dev->DrawIndexedPrimitive(p, count);
+		}
+
+		GL_POP();
+	} else {
+		// FIXME: Investigate: a dynamic check to pack as many primitives as possibles
+		// I'm nearly sure GSdx already have this kind of code (maybe we can adapt GSDirtyRect)
+		size_t nb_vertex;
+		switch (m_vt.m_primclass) {
+			case GS_TRIANGLE_CLASS: nb_vertex = 3; break;
+			case GS_POINT_CLASS:	nb_vertex = 1; break;
+			default: nb_vertex = 2; break;
+		}
+
+		GL_PUSH("Split the draw");
+
+		// The draw count is a size_t, so it must be printed with %zu
+		GL_PERF("Split single draw in %zu draw", m_index.tail/nb_vertex);
+
+		for (size_t p = 0; p < m_index.tail; p += nb_vertex) {
+			glTextureBarrier();
+			dev->DrawIndexedPrimitive(p, nb_vertex);
+		}
+
+		GL_POP();
+	}
+}
+
+void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
+{
+	GSDeviceOGL::VSSelector vs_sel;
+	GSDeviceOGL::GSSelector gs_sel;
+
+	GSDeviceOGL::PSSelector ps_sel;
+	GSDeviceOGL::PSSamplerSelector ps_ssel;
+
+	GSDeviceOGL::OMColorMaskSelector om_csel;
+	GSDeviceOGL::OMDepthStencilSelector om_dssel;
+
+	GL_PUSH("GL Draw from %d in %d (Depth %d)",
+				tex && tex->m_texture ? tex->m_texture->GetID() : 0,
+				rt ? rt->GetID() : -1, ds ? ds->GetID() : -1);
+
+	GSTexture* hdr_rt = NULL;
+
+	const GSVector2i& rtsize = ds ? ds->GetSize()  : rt->GetSize();
+	const GSVector2& rtscale = ds ? ds->GetScale() : rt->GetScale();
+
+	bool DATE = m_context->TEST.DATE && m_context->FRAME.PSM != PSM_PSMCT24;
+	bool DATE_GL42 = false;
+	bool DATE_GL45 = false;
+
+	bool require_barrier = false; // For accurate option
+	m_unsafe_fbmask = false;
+
+	ASSERT(m_dev != NULL);
+
+	GSDeviceOGL* dev = (GSDeviceOGL*)m_dev;
+	dev->s_n = s_n;
+
+	if ((DATE || m_sw_blending) && GLLoader::found_GL_ARB_texture_barrier && (m_vt.m_primclass == GS_SPRITE_CLASS)) {
+		// Except 2D games, sprites are often use for special post-processing effect
+		m_prim_overlap = PrimitiveOverlap();
+	} else {
+		m_prim_overlap = PRIM_OVERLAP_UNKNOW;
+	}
+#ifdef ENABLE_OGL_DEBUG
+	if (m_sw_blending && (m_prim_overlap != PRIM_OVERLAP_NO) && (m_context->FRAME.Block() == m_context->TEX0.TBP0) && (m_vertex.next > 2)) {
+		GL_INS("ERROR: Source and Target are the same!");
+	}
+#endif
+
+	require_barrier |= EmulateTextureShuffleAndFbmask(ps_sel, om_csel);
+
+	// DATE: selection of the algorithm. Must be done before blending because GL42 is not compatible with blending
+
+	if (DATE && GLLoader::found_GL_ARB_texture_barrier) {
+		if (m_prim_overlap == PRIM_OVERLAP_NO || m_texture_shuffle) {
+			// It is way too complex to emulate texture shuffle with DATE. So just use
+			// the slow but accurate algo
+			require_barrier = true;
+			DATE_GL45 = true;
+			DATE = false;
+		} else if (m_accurate_date && om_csel.wa /* FIXME Check the msb bit of the mask instead + the dfmt*/
+				&& (!m_context->TEST.ATE || m_context->TEST.ATST == ATST_ALWAYS)) {
+			// texture barrier will split the draw call into n draw call. It is very efficient for
+			// few primitive draws. Otherwise it sucks.
+			if (m_index.tail < 100) {
+				require_barrier = true;
+				DATE_GL45 = true;
+				DATE = false;
+			} else {
+				DATE_GL42 = GLLoader::found_GL_ARB_shader_image_load_store;
+			}
+		}
+	}
+
+	// Blend
+
+	if (!IsOpaque() && rt) {
+		require_barrier |= EmulateBlending(ps_sel, DATE_GL42);
+	} else {
+		dev->OMSetBlendState(); // No blending please
+	}
+
+	if (ps_sel.dfmt == 1) {
+		// Disable writing of the alpha channel
+		om_csel.wa = 0;
+	}
+
+	// DATE (setup part)
+
+	if (DATE) {
+		GSVector4i dRect = ComputeBoundingBox(rtscale, rtsize);
+
+		// Reduce the quantity of clean function
+		glScissor( dRect.x, dRect.y, dRect.width(), dRect.height() );
+		GLState::scissor = dRect;
+
+		// Must be done here to avoid any GL state perturbation (clear function...)
+		// Create an r32ui image that will contain the primitive ID
+		if (DATE_GL42) {
+			dev->InitPrimDateTexture(rt);
+		} else {
+			GSVector4 src = GSVector4(dRect) / GSVector4(rtsize.x, rtsize.y).xyxy();
+			GSVector4 dst = src * 2.0f - 1.0f;
+
+			GSVertexPT1 vertices[] =
+			{
+				{GSVector4(dst.x, dst.y, 0.0f, 0.0f), GSVector2(src.x, src.y)},
+				{GSVector4(dst.z, dst.y, 0.0f, 0.0f), GSVector2(src.z, src.y)},
+				{GSVector4(dst.x, dst.w, 0.0f, 0.0f), GSVector2(src.x, src.w)},
+				{GSVector4(dst.z, dst.w, 0.0f, 0.0f), GSVector2(src.z, src.w)},
+			};
+
+			dev->SetupDATE(rt, ds, vertices, m_context->TEST.DATM);
+		}
+	}
+
+	//
+
+	dev->BeginScene();
+
+	// om
+
+	if (m_context->TEST.ZTE)
+	{
+		om_dssel.ztst = m_context->TEST.ZTST;
+		om_dssel.zwe = !m_context->ZBUF.ZMSK;
+	}
+	else
+	{
+		om_dssel.ztst = ZTST_ALWAYS;
+	}
+
+	// vs
+
+	vs_sel.wildhack = (UserHacks_WildHack && !isPackedUV_HackFlag) ? 1 : 0;
+
+	// The real GS appears to do no masking based on the Z buffer format and writing larger Z values
+	// than the buffer supports seems to be an error condition on the real GS, causing it to crash.
+	// We are probably receiving bad coordinates from VU1 in these cases.
+
+	if (om_dssel.ztst >= ZTST_ALWAYS && om_dssel.zwe)
+	{
+		if (m_context->ZBUF.PSM == PSM_PSMZ24)
+		{
+			if (m_vt.m_max.p.z > 0xffffff)
+			{
+				ASSERT(m_vt.m_min.p.z > 0xffffff);
+				// Fixme :Following conditional fixes some dialog frame in Wild Arms 3, but may not be what was intended.
+				if (m_vt.m_min.p.z > 0xffffff)
+				{
+					GL_INS("Bad Z size on 24 bits buffers")
+					vs_sel.bppz = 1;
+					om_dssel.ztst = ZTST_ALWAYS;
+				}
+			}
+		}
+		else if (m_context->ZBUF.PSM == PSM_PSMZ16 || m_context->ZBUF.PSM == PSM_PSMZ16S)
+		{
+			if (m_vt.m_max.p.z > 0xffff)
+			{
+				ASSERT(m_vt.m_min.p.z > 0xffff); // sfex capcom logo
+				// Fixme : Same as above, I guess.
+				if (m_vt.m_min.p.z > 0xffff)
+				{
+					GL_INS("Bad Z size on 16 bits buffers")
+					vs_sel.bppz = 2;
+					om_dssel.ztst = ZTST_ALWAYS;
+				}
+			}
+		}
+	}
+
+	// FIXME Opengl support half pixel center (as dx10). Code could be easier!!!
+	float sx = 2.0f * rtscale.x / (rtsize.x << 4);
+	float sy = 2.0f * rtscale.y / (rtsize.y << 4);
+	float ox = (float)(int)m_context->XYOFFSET.OFX;
+	float oy = (float)(int)m_context->XYOFFSET.OFY;
+	float ox2 = -1.0f / rtsize.x;
+	float oy2 = -1.0f / rtsize.y;
+
+	//This hack subtracts around half a pixel from OFX and OFY. (Cannot do this directly,
+	//because DX10 and DX9 have a different pixel center.)
+	//
+	//The resulting shifted output aligns better with common blending / corona / blurring effects,
+	//but introduces a few bad pixels on the edges.
+
+	if (rt && rt->LikelyOffset)
+	{
+		ox2 *= rt->OffsetHack_modx;
+		oy2 *= rt->OffsetHack_mody;
+	}
+
+	// Note: DX does y *= -1.0
+	vs_cb.Vertex_Scale_Offset = GSVector4(sx, sy, ox * sx + ox2 + 1, oy * sy + oy2 + 1);
+	// END of FIXME
+
+	// GS_SPRITE_CLASS are already flat (either by CPU or the GS)
+	ps_sel.iip = (m_vt.m_primclass == GS_SPRITE_CLASS) ? 1 : PRIM->IIP;
+
+	if (DATE_GL45) {
+		ps_sel.date = 5 + m_context->TEST.DATM;
+	} else if (DATE) {
+		if (DATE_GL42)
+			ps_sel.date = 1 + m_context->TEST.DATM;
+		else
+			om_dssel.date = 1;
+	}
+
+	ps_sel.fba = m_context->FBA.FBA;
+
+	if (PRIM->FGE)
+	{
+		ps_sel.fog = 1;
+
+		GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]);
+#if _M_SSE >= 0x401
+		// Blend AREF to avoid to load a random value for alpha (dirty cache)
+		ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF);
+#else
+		ps_cb.FogColor_AREF = fc;
+#endif
+	}
+
+	if (m_context->TEST.ATE)
+		ps_sel.atst = m_context->TEST.ATST;
+	else
+		ps_sel.atst = ATST_ALWAYS;
+
+	if (m_context->TEST.ATE && m_context->TEST.ATST > 1)
+		ps_cb.FogColor_AREF.a = (float)m_context->TEST.AREF;
+
+	// By default don't use texture
+	ps_sel.tfx = 4;
+	bool spritehack = false;
+	int  atst = ps_sel.atst;
+
+	if (tex)
+	{
+		const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[m_context->TEX0.PSM];
+		const GSLocalMemory::psm_t &cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[m_context->TEX0.CPSM] : psm;
+		bool bilinear = m_filter == 2 ? m_vt.IsLinear() : m_filter != 0;
+		bool simple_sample = !tex->m_palette && cpsm.fmt == 0 && m_context->CLAMP.WMS < 2 && m_context->CLAMP.WMT < 2;
+		// Don't force extra filtering on sprite (it creates various upscaling issue)
+		bilinear &= !((m_vt.m_primclass == GS_SPRITE_CLASS) && m_userhacks_round_sprite_offset && !m_vt.IsLinear());
+
+		ps_sel.wms = m_context->CLAMP.WMS;
+		ps_sel.wmt = m_context->CLAMP.WMT;
+
+		// Performance note:
+		// 1/ Don't set 0 as it is the default value
+		// 2/ Only keep aem when it is useful (avoid useless shader permutation)
+		if (ps_sel.shuffle) {
+			// Force a 32 bits access (normally shuffle is done on 16 bits)
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			ps_sel.aem     = m_env.TEXA.AEM;
+			ASSERT(tex->m_target);
+
+			// Shuffle is a 16 bits format, so aem is always required
+			GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+			ta /= 255.0f;
+			// FIXME rely on compiler for the optimization
+			ps_cb.TA_Af.x = ta.x;
+			ps_cb.TA_Af.y = ta.y;
+
+			// FIXME: it is likely a bad idea to do the bilinear interpolation here
+			// bilinear &= m_vt.IsLinear();
+
+		} else if (tex->m_target) {
+			// Use an old target. AEM and index aren't resolved it must be done
+			// on the GPU
+
+			// Select the 32/24/16 bits color (AEM)
+			ps_sel.tex_fmt = cpsm.fmt;
+			ps_sel.aem     = m_env.TEXA.AEM;
+
+			// Don't upload AEM if format is 32 bits
+			if (cpsm.fmt) {
+				GSVector4 ta(m_env.TEXA & GSVector4i::x000000ff());
+				ta /= 255.0f;
+				// FIXME rely on compiler for the optimization
+				ps_cb.TA_Af.x = ta.x;
+				ps_cb.TA_Af.y = ta.y;
+			}
+
+			// Select the index format
+			if (tex->m_palette) {
+				// FIXME Potentially improve fmt field in GSLocalMemory
+				if (m_context->TEX0.PSM == PSM_PSMT4HL)
+					ps_sel.tex_fmt |= 1 << 2;
+				else if (m_context->TEX0.PSM == PSM_PSMT4HH)
+					ps_sel.tex_fmt |= 2 << 2;
+				else
+					ps_sel.tex_fmt |= 3 << 2;
+
+				// Alpha channel of the RT is reinterpreted as an index. Star
+				// Ocean 3 uses it to emulate a stencil buffer.  It is a very
+				// bad idea to force bilinear filtering on it.
+				bilinear &= m_vt.IsLinear();
+			}
+
+		} else if (tex->m_palette) {
+			// Use a standard 8 bits texture. AEM is already done on the CLUT
+			// Therefore you only need to set the index
+			// ps_sel.aem     = 0; // removed as an optimization
+
+			// Note 4 bits indexes are converted to 8 bits
+			ps_sel.tex_fmt = 3 << 2;
+
+		} else {
+			// Standard texture. Both index and AEM expansion were already done by the CPU.
+			// ps_sel.tex_fmt = 0; // removed as an optimization
+			// ps_sel.aem     = 0; // removed as an optimization
+		}
+
+		if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) {
+			// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
+			ps_sel.tfx = TFX_DECAL;
+		} else {
+			ps_sel.tfx = m_context->TEX0.TFX;
+		}
+
+		ps_sel.tcc = m_context->TEX0.TCC;
+
+		ps_sel.ltf = bilinear && !simple_sample;
+		spritehack = tex->m_spritehack_t;
+
+		int w = tex->m_texture->GetWidth();
+		int h = tex->m_texture->GetHeight();
+
+		int tw = (int)(1 << m_context->TEX0.TW);
+		int th = (int)(1 << m_context->TEX0.TH);
+
+		GSVector4 WH(tw, th, w, h);
+
+		ps_sel.fst = !!PRIM->FST;
+
+		ps_cb.WH = WH;
+		ps_cb.HalfTexel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw();
+		if ((m_context->CLAMP.WMS | m_context->CLAMP.WMT) > 1) {
+			ps_cb.MskFix = GSVector4i(m_context->CLAMP.MINU, m_context->CLAMP.MINV, m_context->CLAMP.MAXU, m_context->CLAMP.MAXV);
+			ps_cb.MinMax = GSVector4(ps_cb.MskFix) / WH.xyxy();
+		}
+
+		// TC Offset Hack
+		ps_sel.tcoffsethack = !!UserHacks_TCOffset;
+		ps_cb.TC_OH_TS = GSVector4(1/16.0f, 1/16.0f, UserHacks_TCO_x, UserHacks_TCO_y) / WH.xyxy();
+
+
+		// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
+		ps_ssel.tau   = (m_context->CLAMP.WMS != CLAMP_CLAMP);
+		ps_ssel.tav   = (m_context->CLAMP.WMT != CLAMP_CLAMP);
+		ps_ssel.ltf   = bilinear && simple_sample;
+		ps_ssel.aniso = simple_sample;
+
+		// Set up texture resources
+		dev->SetupSampler(ps_ssel);
+		dev->PSSetShaderResources(tex->m_texture, tex->m_palette);
+
+		if (spritehack && (ps_sel.atst == 2)) {
+			ps_sel.atst = 1;
+		}
+	} else {
+#ifdef ENABLE_OGL_DEBUG
+		// Unattach texture to avoid noise in debugger
+		dev->PSSetShaderResources(NULL, NULL);
+#endif
+	}
+	// Always bind the RT. This way special effect can use it.
+	dev->PSSetShaderResource(3, rt);
+
+
+	// GS
+
+#if 0
+	if (m_vt.m_primclass == GS_POINT_CLASS) {
+		// Upscaling point will create aliasing because point has a size of 0 pixels.
+		// This code tries to replace point with sprite. So a point in 4x will be replaced by
+		// a 4x4 sprite.
+		gs_sel.point = 1;
+		// FIXME this formula is potentially wrong
+		GSVector4 point_size = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y) * 2.0f;
+		vs_cb.TextureScale = vs_cb.TextureScale.xyxy(point_size);
+	}
+#endif
+	gs_sel.sprite = m_vt.m_primclass == GS_SPRITE_CLASS;
+
+	dev->SetupVS(vs_sel);
+	dev->SetupGS(gs_sel);
+	dev->SetupPS(ps_sel);
+
+	// rs
+
+	GSVector4i scissor = GSVector4i(GSVector4(rtscale).xyxy() * m_context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
+
+	GL_PUSH("IA");
+	SetupIA();
+	GL_POP();
+
+	dev->OMSetColorMaskState(om_csel);
+	dev->SetupOM(om_dssel);
+
+	dev->SetupCB(&vs_cb, &ps_cb);
+
+	if (DATE_GL42) {
+		GL_PUSH("Date GL42");
+		// It could be good idea to use stencil in the same time.
+		// Early stencil test will reduce the number of atomic-load operation
+
+		// Create an r32i image that will contain primitive ID
+		// Note: do it at the beginning because the clean will dirty the FBO state
+		//dev->InitPrimDateTexture(rtsize.x, rtsize.y);
+
+		// I don't know how much is it legal to mount rt as Texture/RT. No write is done.
+		// In doubt let's detach RT.
+		dev->OMSetRenderTargets(NULL, ds, &scissor);
+
+		// Don't write anything on the color buffer
+		// Neither in the depth buffer
+		glDepthMask(false);
+		// Compute primitiveID max that pass the date test
+		SendDraw(false);
+
+		// Ask PS to discard shader above the primitiveID max
+		glDepthMask(GLState::depth_mask);
+
+		ps_sel.date = 3;
+		dev->SetupPS(ps_sel);
+
+		// Be sure that first pass is finished !
+		dev->Barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+
+		GL_POP();
+	}
+
+	if (ps_sel.hdr) {
+		hdr_rt = dev->CreateTexture(rtsize.x, rtsize.y, GL_RGBA32F);
+
+		dev->CopyRectConv(rt, hdr_rt, ComputeBoundingBox(rtscale, rtsize), false);
+
+		dev->OMSetRenderTargets(hdr_rt, ds, &scissor);
+	} else {
+		dev->OMSetRenderTargets(rt, ds, &scissor);
+	}
+
+	if (m_context->TEST.DoFirstPass())
+	{
+		SendDraw(require_barrier);
+	}
+
+	if (m_context->TEST.DoSecondPass())
+	{
+		ASSERT(!m_env.PABE.PABE);
+
+		static const uint32 iatst[] = {1, 0, 5, 6, 7, 2, 3, 4};
+
+		ps_sel.atst = iatst[atst];
+		if (spritehack && (ps_sel.atst == 2)) {
+			ps_sel.atst = 1;
+		}
+
+		dev->SetupPS(ps_sel);
+
+		bool z = om_dssel.zwe;
+		bool r = om_csel.wr;
+		bool g = om_csel.wg;
+		bool b = om_csel.wb;
+		bool a = om_csel.wa;
+
+		switch(m_context->TEST.AFAIL)
+		{
+			case AFAIL_KEEP: z = r = g = b = a = false; break; // none
+			case AFAIL_FB_ONLY: z = false; break; // rgba
+			case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z
+			case AFAIL_RGB_ONLY: z = a = false; break; // rgb
+			default: __assume(0);
+		}
+
+		if (z || r || g || b || a)
+		{
+			om_dssel.zwe = z;
+			om_csel.wr = r;
+			om_csel.wg = g;
+			om_csel.wb = b;
+			om_csel.wa = a;
+
+			dev->OMSetColorMaskState(om_csel);
+			dev->SetupOM(om_dssel);
+
+			SendDraw(require_barrier);
+		}
+	}
+
+	if (DATE_GL42) {
+		dev->RecycleDateTexture();
+	}
+
+	dev->EndScene();
+
+	// Warning: EndScene must be called before StretchRect otherwise
+	// vertices will be overwritten. Trust me you don't want to do that.
+	if (hdr_rt) {
+		GSVector4 dRect(ComputeBoundingBox(rtscale, rtsize));
+		GSVector4 sRect = dRect / GSVector4(rtsize.x, rtsize.y).xyxy();
+		dev->StretchRect(hdr_rt, sRect, rt, dRect, ShaderConvert_MOD_256, false);
+
+		dev->Recycle(hdr_rt);
+	}
+
+	GL_POP();
+}
diff --git a/plugins/GSdx_legacy/GSRendererOGL.h b/plugins/GSdx_legacy/GSRendererOGL.h
new file mode 100644
index 0000000000..dd6c7eab60
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererOGL.h
@@ -0,0 +1,80 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRendererHW.h"
+
+#include "GSRenderer.h"
+#include "GSTextureCacheOGL.h"
+#include "GSVertexHW.h"
+
+// OpenGL hardware renderer. Extends GSRendererHW and overrides DrawPrims() to
+// translate the current GS draw state into GSDeviceOGL selectors, constant
+// buffers and draw calls.
+class GSRendererOGL final : public GSRendererHW
+{
+	// Result of PrimitiveOverlap(). DrawPrims() only computes it for sprite
+	// draws that need DATE or software blending when texture barriers are
+	// available; otherwise it stores PRIM_OVERLAP_UNKNOW.
+	enum PRIM_OVERLAP {
+		PRIM_OVERLAP_UNKNOW,
+		PRIM_OVERLAP_YES,
+		PRIM_OVERLAP_NO
+	};
+
+	// NOTE(review): presumably the accuracy levels stored in m_sw_blending
+	// (higher value = more cases emulated in the shader) -- confirm against
+	// EmulateBlending().
+	enum ACC_BLEND {
+		ACC_BLEND_NONE = 0,
+		ACC_BLEND_FREE = 1,
+		ACC_BLEND_SPRITE = 2,
+		ACC_BLEND_CCLIP_DALPHA = 3,
+		ACC_BLEND_FULL = 4,
+		ACC_BLEND_ULTRA = 5
+	};
+
+	private:
+		// When set, DrawPrims() may pick the texture-barrier or image
+		// load/store based DATE algorithm.
+		bool m_accurate_date;
+		// Tested as a boolean in DrawPrims(); non-zero enables the
+		// software (shader) blending paths.
+		int m_sw_blending;
+		// Overlap state of the primitives of the current draw.
+		PRIM_OVERLAP m_prim_overlap;
+		// Reset to false by DrawPrims() before the draw state is emulated.
+		bool m_unsafe_fbmask;
+		vector<size_t> m_drawlist;
+
+		// Non-zero enables the texture-coordinate offset hack in the pixel
+		// shader; the x/y values are uploaded through ps_cb.TC_OH_TS.
+		unsigned int UserHacks_TCOffset;
+		float UserHacks_TCO_x, UserHacks_TCO_y;
+		bool UserHacks_safe_fbmask;
+
+		// CPU-side copies of the vertex/pixel shader constant buffers; filled
+		// by DrawPrims() and handed to the device with SetupCB().
+		GSDeviceOGL::VSConstantBuffer vs_cb;
+		GSDeviceOGL::PSConstantBuffer ps_cb;
+
+		GSVector4i ComputeBoundingBox(const GSVector2& rtscale, const GSVector2i& rtsize);
+
+	private:
+		void EmulateGS();
+		void SetupIA();
+		// Both Emulate* helpers return a bool that DrawPrims() ORs into its
+		// "require_barrier" flag.
+		bool EmulateTextureShuffleAndFbmask(GSDeviceOGL::PSSelector& ps_sel, GSDeviceOGL::OMColorMaskSelector& om_csel);
+		bool EmulateBlending(GSDeviceOGL::PSSelector& ps_sel, bool DATE_GL42);
+
+	public:
+		GSRendererOGL();
+		virtual ~GSRendererOGL() {};
+
+		bool CreateDevice(GSDevice* dev);
+
+		// Renders the current batch into rt/ds, optionally sampling from tex.
+		void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) final;
+
+		PRIM_OVERLAP PrimitiveOverlap();
+
+		// Issues the draw; require_barrier is the flag accumulated by DrawPrims().
+		void SendDraw(bool require_barrier);
+};
diff --git a/plugins/GSdx_legacy/GSRendererSW.cpp b/plugins/GSdx_legacy/GSRendererSW.cpp
new file mode 100644
index 0000000000..6e7f6bb9e3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererSW.cpp
@@ -0,0 +1,1675 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSRendererSW.h"
+
+// Compile-time switch for the debug trace written through s_fp. With LOG set
+// to 0 every "if(LOG)" block in this file is dead code and s_fp stays NULL.
+#define LOG 0
+
+// Trace file, only opened when LOG is non-zero. The fopen() result is not
+// checked, so enabling LOG without the hard-coded directory present leaves
+// s_fp NULL while the fprintf() calls still use it.
+static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
+
+// Per-lane scale multiplied into the converted vertex position in
+// ConvertVertexBuffer(): x and y by 1/16, z unchanged, last lane by 128.
+const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
+
+#if _M_SSE >= 0x501
+// Same scale duplicated into both 128-bit halves for the 8-wide code path.
+const GSVector8 g_pos_scale2(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
+#endif
+
+// Builds the software renderer: texture cache, rasterizer worker list, the
+// aligned scratch buffer used by GetOutput(), zeroed page reference counters
+// and the table of ConvertVertexBuffer specialisations.
+//
+// threads: forwarded to GSRasterizerList::Create().
+GSRendererSW::GSRendererSW(int threads)
+	: m_fzb(NULL)
+{
+	m_nativeres = true; // ignore ini, sw is always native
+
+	m_tc = new GSTextureCacheSW(this);
+
+	memset(m_texture, 0, sizeof(m_texture));
+
+	m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon);
+
+	// 1024 x 1024 32-bit pixels, 32-byte aligned; GetOutput() reads local
+	// memory into it with a pitch of 1024 * 4 bytes.
+	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
+
+	for (uint32 i = 0; i < countof(m_fzb_pages); i++) {
+		m_fzb_pages[i] = 0;
+	}
+	for (uint32 i = 0; i < countof(m_tex_pages); i++) {
+		m_tex_pages[i] = 0;
+	}
+
+	// Fill m_cvb[primclass][tme][fst] with the matching template instance.
+	// The macro is local to this constructor: its last line carries no
+	// line-continuation and it is undefined right after use so it cannot leak
+	// into the rest of the translation unit.
+	#define InitCVB(P) \
+		m_cvb[P][0][0] = &GSRendererSW::ConvertVertexBuffer<P, 0, 0>; \
+		m_cvb[P][0][1] = &GSRendererSW::ConvertVertexBuffer<P, 0, 1>; \
+		m_cvb[P][1][0] = &GSRendererSW::ConvertVertexBuffer<P, 1, 0>; \
+		m_cvb[P][1][1] = &GSRendererSW::ConvertVertexBuffer<P, 1, 1>;
+
+	InitCVB(GS_POINT_CLASS);
+	InitCVB(GS_LINE_CLASS);
+	InitCVB(GS_TRIANGLE_CLASS);
+	InitCVB(GS_SPRITE_CLASS);
+
+	#undef InitCVB
+}
+
+// Releases everything the constructor and GetOutput() allocated, in the same
+// order as before: texture cache, output textures, rasterizer list, then the
+// aligned scratch buffer.
+GSRendererSW::~GSRendererSW()
+{
+	delete m_tc;
+
+	for(auto* texture : m_texture)
+	{
+		delete texture;
+	}
+
+	delete m_rl;
+
+	_aligned_free(m_output);
+}
+
+// Resets the renderer: waits for the rasterizer list to finish (Sync), empties
+// the software texture cache, then runs the base-class reset.
+void GSRendererSW::Reset()
+{
+	// Sync first so that no queued work is still running when the cache is
+	// emptied below.
+	Sync(-1);
+
+	m_tc->RemoveAll();
+
+	GSRenderer::Reset();
+}
+
+// Per-vsync hook: waits for pending rasterizer work, forwards to
+// GSRenderer::VSync(field) and ages the texture cache. The large block in
+// between is a display-register trace that is compiled out ("if(0) if(LOG)").
+void GSRendererSW::VSync(int field)
+{
+	Sync(0); // IncAge might delete a cached texture in use
+
+	// Disabled debug trace: dumps the display rectangles and the privileged
+	// display registers to s_fp. Never executed because of the leading if(0).
+	if(0) if(LOG)
+	{
+		fprintf(s_fp, "%lld\n", m_perfmon.GetFrame());
+
+		GSVector4i dr = GetDisplayRect();
+		GSVector4i fr = GetFrameRect();
+		GSVector2i ds = GetDeviceSize();
+
+		fprintf(s_fp, "dr %d %d %d %d, fr %d %d %d %d, ds %d %d\n",
+			dr.x, dr.y, dr.z, dr.w,
+			fr.x, fr.y, fr.z, fr.w,
+			ds.x, ds.y);
+
+		// One DISPFB/DISPLAY pair per read circuit, skipped when the circuit
+		// is disabled in PMODE.
+		for(int i = 0; i < 2; i++)
+		{
+			if(i == 0 && !m_regs->PMODE.EN1) continue;
+			if(i == 1 && !m_regs->PMODE.EN2) continue;
+
+			fprintf(s_fp, "DISPFB[%d] BP=%05x BW=%d PSM=%d DBX=%d DBY=%d\n", 
+				i,
+				m_regs->DISP[i].DISPFB.Block(),
+				m_regs->DISP[i].DISPFB.FBW,
+				m_regs->DISP[i].DISPFB.PSM,
+				m_regs->DISP[i].DISPFB.DBX,
+				m_regs->DISP[i].DISPFB.DBY
+				);
+
+			fprintf(s_fp, "DISPLAY[%d] DX=%d DY=%d DW=%d DH=%d MAGH=%d MAGV=%d\n", 
+				i,
+				m_regs->DISP[i].DISPLAY.DX,
+				m_regs->DISP[i].DISPLAY.DY,
+				m_regs->DISP[i].DISPLAY.DW,
+				m_regs->DISP[i].DISPLAY.DH,
+				m_regs->DISP[i].DISPLAY.MAGH,
+				m_regs->DISP[i].DISPLAY.MAGV
+				);
+		}
+
+		fprintf(s_fp, "PMODE EN1=%d EN2=%d CRTMD=%d MMOD=%d AMOD=%d SLBG=%d ALP=%d\n", 
+			m_regs->PMODE.EN1,
+			m_regs->PMODE.EN2,
+			m_regs->PMODE.CRTMD,
+			m_regs->PMODE.MMOD,
+			m_regs->PMODE.AMOD,
+			m_regs->PMODE.SLBG,
+			m_regs->PMODE.ALP
+			);
+
+		fprintf(s_fp, "SMODE1 CLKSEL=%d CMOD=%d EX=%d GCONT=%d LC=%d NVCK=%d PCK2=%d PEHS=%d PEVS=%d PHS=%d PRST=%d PVS=%d RC=%d SINT=%d SLCK=%d SLCK2=%d SPML=%d T1248=%d VCKSEL=%d VHP=%d XPCK=%d\n",
+			m_regs->SMODE1.CLKSEL,
+			m_regs->SMODE1.CMOD,
+			m_regs->SMODE1.EX,
+			m_regs->SMODE1.GCONT,
+			m_regs->SMODE1.LC,
+			m_regs->SMODE1.NVCK,
+			m_regs->SMODE1.PCK2,
+			m_regs->SMODE1.PEHS,
+			m_regs->SMODE1.PEVS,
+			m_regs->SMODE1.PHS,
+			m_regs->SMODE1.PRST,
+			m_regs->SMODE1.PVS,
+			m_regs->SMODE1.RC,
+			m_regs->SMODE1.SINT,
+			m_regs->SMODE1.SLCK,
+			m_regs->SMODE1.SLCK2,
+			m_regs->SMODE1.SPML,
+			m_regs->SMODE1.T1248,
+			m_regs->SMODE1.VCKSEL,
+			m_regs->SMODE1.VHP,
+			m_regs->SMODE1.XPCK
+			);
+
+		fprintf(s_fp, "SMODE2 INT=%d FFMD=%d DPMS=%d\n", 
+			m_regs->SMODE2.INT,
+			m_regs->SMODE2.FFMD,
+			m_regs->SMODE2.DPMS
+			);
+
+		// The remaining registers are printed as two raw 32-bit halves.
+		fprintf(s_fp, "SRFSH %08x_%08x\n", 
+			m_regs->SRFSH.u32[0],
+			m_regs->SRFSH.u32[1]
+			);
+
+		fprintf(s_fp, "SYNCH1 %08x_%08x\n", 
+			m_regs->SYNCH1.u32[0],
+			m_regs->SYNCH1.u32[1]
+			);
+
+		fprintf(s_fp, "SYNCH2 %08x_%08x\n", 
+			m_regs->SYNCH2.u32[0],
+			m_regs->SYNCH2.u32[1]
+			);
+
+		fprintf(s_fp, "SYNCV %08x_%08x\n", 
+			m_regs->SYNCV.u32[0],
+			m_regs->SYNCV.u32[1]
+			);
+
+		fprintf(s_fp, "CSR %08x_%08x\n", 
+			m_regs->CSR.u32[0],
+			m_regs->CSR.u32[1]
+			);
+
+		fflush(s_fp);
+	}
+
+	/*
+	int draw[8], sum = 0;
+
+	for(size_t i = 0; i < countof(draw); i++)
+	{
+		draw[i] = m_perfmon.CPU(GSPerfMon::WorkerDraw0 + i);
+		sum += draw[i];
+	}
+
+	printf("CPU %d Sync %d W %d %d %d %d %d %d %d %d (%d)\n",
+		m_perfmon.CPU(GSPerfMon::Main),
+		m_perfmon.CPU(GSPerfMon::Sync),
+		draw[0], draw[1], draw[2], draw[3], draw[4], draw[5], draw[6], draw[7], sum);
+
+	//
+	*/
+
+	GSRenderer::VSync(field);
+
+	// Age the cache only after the Sync(0) above (see the comment there).
+	m_tc->IncAge();
+
+	// if((m_perfmon.GetFrame() & 255) == 0) m_rl->PrintStats();
+}
+
+// Destroys every cached output texture and leaves its slot set to NULL.
+void GSRendererSW::ResetDevice()
+{
+	for(auto*& texture : m_texture)
+	{
+		delete texture;
+		texture = NULL;
+	}
+}
+
+// Returns the output texture for display circuit i. After waiting for the
+// rasterizer, the texture is resized to the frame buffer dimensions; when that
+// succeeds the frame buffer is read from local memory into m_output and
+// uploaded. In dump mode the texture can additionally be saved to disk.
+GSTexture* GSRendererSW::GetOutput(int i)
+{
+	Sync(1);
+
+	const GSRegDISPFB& dispfb = m_regs->DISP[i].DISPFB;
+
+	const int width = dispfb.FBW * 64;
+	const int height = GetFrameRect(i).bottom;
+
+	// TODO: round up bottom
+
+	if(!m_dev->ResizeTexture(&m_texture[i], width, height))
+	{
+		// Nothing to upload; hand back whatever the slot currently holds.
+		return m_texture[i];
+	}
+
+	// Row stride of m_output in bytes (1024 pixels of 4 bytes each).
+	const int pitch = 1024 * 4;
+
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[dispfb.PSM];
+
+	GSVector4i rect(0, 0, width, height);
+
+	// Read the frame buffer through the format-specific reader, using the
+	// rectangle aligned outwards to the format's block size.
+	(m_mem.*psm.rtx)(m_mem.GetOffset(dispfb.Block(), dispfb.FBW, dispfb.PSM), rect.ralign<Align_Outside>(psm.bs), m_output, pitch, m_env.TEXA);
+
+	m_texture[i]->Update(rect, m_output, pitch);
+
+	if(s_dump)
+	{
+		const bool save_frame = s_savef && s_n >= s_saven;
+
+		if(save_frame)
+		{
+			m_texture[i]->Save(root_sw + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)dispfb.Block(), (int)dispfb.PSM));
+		}
+
+		s_n++;
+	}
+
+	return m_texture[i];
+}
+
+// Converts GS vertices from src into the software rasterizer's GSVertexSW
+// layout in dst.
+//
+// Template parameters:
+//   primclass - primitive class; GS_SPRITE_CLASS additionally stores a value
+//               in the last lane of the texture coordinate vector.
+//   tme       - non-zero when texture coordinates must be produced.
+//   fst       - non-zero to take coordinates from UV, zero to take them from
+//               STQ scaled by the texture size.
+//
+// Note: the loops run over m_vertex.next vertices; the count argument is not
+// read by this function.
+template<uint32 primclass, uint32 tme, uint32 fst>
+void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
+{
+	// The 8-wide (two vertices per iteration) variant below is disabled.
+	#if 0//_M_SSE >= 0x501
+
+	// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
+
+	GSVector8i o2((GSVector4i)m_context->XYOFFSET);
+	GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
+
+	for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
+	{
+		GSVector8i v0 = GSVector8i::load<true>(src[0].m);
+		GSVector8i v1 = GSVector8i::load<true>(src[1].m);
+
+		GSVector8 stcq = GSVector8::cast(v0.ac(v1));
+		GSVector8i xyzuvf = v0.bd(v1);
+
+		//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
+		//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
+
+		GSVector8i xy = xyzuvf.upl16() - o2;
+		GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
+
+		GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * g_pos_scale2;
+		GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
+
+		GSVector8 t = GSVector8::zero();
+
+		if(tme)
+		{
+			if(fst)
+			{
+				t = GSVector8(xyzuvf.uph16() << (16 - 4));
+			}
+			else
+			{
+				t = stcq.xyww() * tsize2;
+			}
+		}
+
+		if(primclass == GS_SPRITE_CLASS)
+		{
+			t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
+		}
+
+		GSVector8::storel(&dst[0].p, p);
+
+		if(tme || primclass == GS_SPRITE_CLASS)
+		{
+			GSVector8::store<true>(&dst[0].t, t.ac(c));
+		}
+		else
+		{
+			GSVector8::storel(&dst[0].c, c);
+		}
+
+		GSVector8::storeh(&dst[1].p, p);
+
+		if(tme || primclass == GS_SPRITE_CLASS)
+		{
+			GSVector8::store<true>(&dst[1].t, t.bd(c));
+		}
+		else
+		{
+			GSVector8::storeh(&dst[1].c, c);
+		}
+	}
+
+	#else
+	
+	// Active path: one vertex per iteration.
+	GSVector4i off = (GSVector4i)m_context->XYOFFSET;
+	// Scale applied to S/T when fst == 0: 0x10000 shifted by the TEX0 TW/TH fields.
+	GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
+
+	for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
+	{
+		GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
+
+		#if _M_SSE >= 0x401
+
+		GSVector4i xyzuvf(src->m[1]);
+
+		// XY minus the drawing offset; Z clamped to 0xffffff00.
+		GSVector4i xy = xyzuvf.upl16() - off;
+		GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
+
+		#else
+
+		uint32 z = src->XYZ.Z;
+
+		GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - off;
+		GSVector4i zf = GSVector4i((int)std::min<uint32>(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later
+
+		#endif
+
+		// zf is converted as signed; lanes with the top bit set get the
+		// m_x4f800000 constant added back (presumably 2^32 as a float, which
+		// makes this an unsigned-to-float conversion -- confirm). The result
+		// is then scaled by g_pos_scale.
+		dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
+		// Colour: the rgba lane of stcq, widened from bytes and shifted left by 7.
+		dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
+
+		GSVector4 t = GSVector4::zero();
+
+		if(tme)
+		{
+			if(fst)
+			{
+				#if _M_SSE >= 0x401
+
+				t = GSVector4(xyzuvf.uph16() << (16 - 4));
+					
+				#else
+
+				t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4));
+
+				#endif
+			}
+			else
+			{
+				t = stcq.xyww() * tsize;
+			}
+		}
+
+		if(primclass == GS_SPRITE_CLASS)
+		{
+			// NOTE(review): both branches appear to place the raw Z value in
+			// the last lane of t for sprites -- confirm against the consumer.
+			#if _M_SSE >= 0x401
+
+			t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
+
+			#else
+
+			t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
+
+			#endif
+		}
+
+		dst->t = t;
+
+		#if 0 //_M_SSE >= 0x501
+
+		dst->_pad = GSVector4::zero();
+
+		#endif
+	}
+
+	#endif
+}
+
+// Turns the current vertex/index batch into a SharedData work item and queues
+// it on the rasterizer list.
+//
+// Steps: allocate one buffer holding the converted vertices followed by the
+// indices, compute scissor and bounding box, build the scanline global data
+// (bailing out if that fails), work out which frame/Z pages are touched and
+// whether a sync point is needed, then Queue() the item. When s_dump is set,
+// textures and render targets are saved around the draw.
+void GSRendererSW::Draw()
+{
+	const GSDrawingContext* context = m_context;
+
+	SharedData* sd = new SharedData(this);
+
+	// data owns sd from here on, so the early return below does not leak it.
+	shared_ptr<GSRasterizerData> data(sd);
+
+	sd->primclass = m_vt.m_primclass;
+	// Single allocation: vertices (count rounded up to an even number) followed by the indices.
+	sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1) + sizeof(uint32) * m_index.tail, 64);
+	sd->vertex = (GSVertexSW*)sd->buff;
+	sd->vertex_count = m_vertex.next;
+	sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
+	sd->index_count = m_index.tail;
+
+	// Dispatch to the ConvertVertexBuffer instance selected by primitive class, TME and FST.
+	(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
+
+	memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail);
+
+	GSVector4i scissor = GSVector4i(context->scissor.in);
+	GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
+
+	// points and lines may have zero area bbox (single line: 0, 0 - 256, 0)
+
+	if(m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS)
+	{
+		if(bbox.x == bbox.z) bbox.z++;
+		if(bbox.y == bbox.w) bbox.w++;
+	}
+
+	// r is computed from the scissor before its right edge is clamped below.
+	GSVector4i r = bbox.rintersect(scissor);
+
+	scissor.z = std::min<int>(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
+	
+	sd->scissor = scissor;
+	sd->bbox = bbox;
+	sd->frame = m_perfmon.GetFrame();
+
+	if(!GetScanlineGlobalData(sd))
+	{
+		s_n += 3; // Keep it sync with HW renderer
+		return;
+	}
+
+	// Disabled debug trace of the per-vertex position and texture coordinates.
+	if(0) if(LOG)
+	{
+		int n = GSUtil::GetVertexCount(PRIM->PRIM);
+		
+		for(uint32 i = 0, j = 0; i < m_index.tail; i += n, j++)
+		{
+			for(int k = 0; k < n; k++)
+			{
+				GSVertex* v = &m_vertex.buff[m_index.buff[i + k]];
+				GSVertex* vn = &m_vertex.buff[m_index.buff[i + n - 1]];
+				
+				fprintf(s_fp, "%d:%d %f %f %f %f\n", 
+					j, k,
+					(float)(v->XYZ.X - context->XYOFFSET.OFX) / 16,
+					(float)(v->XYZ.Y - context->XYOFFSET.OFY) / 16,
+					PRIM->FST ? (float)(v->U) / 16 : v->ST.S / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q),
+					PRIM->FST ? (float)(v->V) / 16 : v->ST.T / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q)
+					);
+			}
+		}
+	}
+
+	//
+
+	// GSScanlineGlobalData& gd = sd->global;
+
+	// Page lists stay NULL when the corresponding buffer is not selected.
+	uint32* fb_pages = NULL;
+	uint32* zb_pages = NULL;
+
+	if(sd->global.sel.fb)
+	{
+		fb_pages = m_context->offset.fb->GetPages(r);
+	}
+
+	if(sd->global.sel.zb)
+	{
+		zb_pages = m_context->offset.zb->GetPages(r);
+	}
+
+	// check if there is an overlap between this and previous targets
+
+	if(CheckTargetPages(fb_pages, zb_pages, r))
+	{
+		sd->m_syncpoint = SharedData::SyncTarget;
+	}
+
+	// check if the texture is not part of a target currently in use
+
+	// Note: when both checks hit, SyncSource overwrites SyncTarget.
+	if(CheckSourcePages(sd))
+	{
+		sd->m_syncpoint = SharedData::SyncSource;
+	}
+
+	// addref source and target pages
+
+	sd->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm);
+
+	//
+
+	if(s_dump)
+	{
+		// Dump path: wait for queued work, save the inputs, run this draw
+		// synchronously, then save the outputs. s_n is incremented three times
+		// in total, matching the "s_n += 3" of the early return above.
+		Sync(2);
+
+		uint64 frame = m_perfmon.GetFrame();
+		// Dump the texture in 32 bits format. It helps to debug texture shuffle effect
+		// It will breaks the few games that really uses 16 bits RT
+		bool texture_shuffle = ((context->FRAME.PSM & 0x2) && ((context->TEX0.PSM & 3) == 2) && (m_vt.m_primclass == GS_SPRITE_CLASS));
+
+		string s;
+
+		if(s_n >= s_saven)
+		{
+			// Dump Register state
+			s = format("%05d_context.txt", s_n);
+
+			m_env.Dump(root_sw+s);
+			m_context->Dump(root_sw+s);
+		}
+
+		if(s_savet && s_n >= s_saven && PRIM->TME)
+		{
+			if (texture_shuffle) {
+				// Dump the RT in 32 bits format. It helps to debug texture shuffle effect
+				s = format("%05d_f%lld_tex_%05x_32bits.bmp", s_n, frame, (int)m_context->TEX0.TBP0);
+				m_mem.SaveBMP(root_sw+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, 0, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
+			}
+
+			s = format("%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
+			m_mem.SaveBMP(root_sw+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
+		}
+
+		s_n++;
+
+		// Render target and Z buffer before the draw ("rt0" / "rz0").
+		if(s_save && s_n >= s_saven)
+		{
+
+			if (texture_shuffle) {
+				// Dump the RT in 32 bits format. It helps to debug texture shuffle effect
+				s = format("%05d_f%lld_rt0_%05x_32bits.bmp", s_n, frame, m_context->FRAME.Block());
+				m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512);
+			}
+
+			s = format("%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+			m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+
+		Queue(data);
+
+		// Wait for the draw just queued so the "rt1" / "rz1" dumps show its result.
+		Sync(3);
+
+		if(s_save && s_n >= s_saven)
+		{
+			if (texture_shuffle) {
+				// Dump the RT in 32 bits format. It helps to debug texture shuffle effect
+				s = format("%05d_f%lld_rt1_%05x_32bits.bmp", s_n, frame, m_context->FRAME.Block());
+				m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512);
+			}
+
+			s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
+			m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez && s_n >= s_saven)
+		{
+			s = format("%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+
+		s_n++;
+
+		// Stop dumping once more than s_savel draws past s_saven have been recorded.
+		if(s_savel > 0 && (s_n - s_saven) > s_savel)
+		{
+			s_dump = 0;
+		}
+	}
+	else
+	{
+		Queue(data);
+	}
+
+	/*
+	if(0)//stats.ticks > 5000000)
+	{
+		printf("* [%lld | %012llx] ticks %lld prims %d (%d) pixels %d (%d)\n",
+			m_perfmon.GetFrame(), gd->sel.key,
+			stats.ticks,
+			stats.prims, stats.prims > 0 ? (int)(stats.ticks / stats.prims) : -1,
+			stats.pixels, stats.pixels > 0 ? (int)(stats.ticks / stats.pixels) : -1);
+	}
+	*/
+}
+
+// Hands one work item to the rasterizer list, honouring the sync point that
+// Draw() recorded on it, and afterwards invalidates the texture-cache pages
+// the draw writes to.
+void GSRendererSW::Queue(shared_ptr<GSRasterizerData>& item)
+{
+	SharedData* shared = static_cast<SharedData*>(item.get());
+
+	// Source conflict: wait before the source textures are refreshed.
+	if(shared->m_syncpoint == SharedData::SyncSource)
+	{
+		Sync(4);
+	}
+
+	// update previously invalidated parts
+	shared->UpdateSource();
+
+	// Target conflict: wait after the refresh, right before queueing.
+	if(shared->m_syncpoint == SharedData::SyncTarget)
+	{
+		Sync(5);
+	}
+
+	if(LOG)
+	{
+		GSScanlineGlobalData& gd = shared->global;
+
+		fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
+			shared->counter,
+			m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
+			m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
+			PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
+			PRIM->PRIM, shared->vertex_count, shared->index_count);
+
+		fflush(s_fp);
+	}
+
+	m_rl->Queue(item);
+
+	// invalidate new parts rendered onto
+	if(shared->global.sel.fwrite)
+	{
+		m_tc->InvalidatePages(shared->m_fb_pages, shared->m_fpsm);
+	}
+
+	if(shared->global.sel.zwrite)
+	{
+		m_tc->InvalidatePages(shared->m_zb_pages, shared->m_zpsm);
+	}
+}
+
+// Waits for the rasterizer list to finish (m_rl->Sync()), measures how long that
+// took with __rdtsc(), and reports the pixel count returned by m_rl->GetPixels()
+// to the performance monitor as GSPerfMon::Fillrate.
+// 'reason' is only used for logging; callers in this file pass a small integer
+// that identifies the call site.
+void GSRendererSW::Sync(int reason)
+{
+	//printf("sync %d\n", reason);
+
+	// scoped timer, accounts the time spent in this function to GSPerfMon::Sync
+	GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
+
+	uint64 t = __rdtsc();
+
+	m_rl->Sync();
+
+	// debug-only dump of the frame and z buffers after every sync; compiled in
+	// but permanently disabled by the leading if(0)
+	if(0) if(LOG)
+	{
+		s_n++;
+
+		std::string s;
+		
+		if(s_save)
+		{
+			s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
+
+			m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
+		}
+
+		if(s_savez)
+		{
+			s = format("%05d_f%lld_zb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM);
+
+			m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
+		}
+	}
+
+	// elapsed time stamp counter ticks spent waiting (includes the disabled block above)
+	t = __rdtsc() - t;
+
+	int pixels = m_rl->GetPixels();
+
+	// waits longer than 10000000 ticks are flagged with '*' in the log
+	if(LOG) {fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, pixels, t > 10000000 ? '*' : ' '); fflush(s_fp);}
+
+	m_perfmon.Put(GSPerfMon::Fillrate, pixels);
+}
+
+void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
+{
+	// Called for a write into rectangle r of the destination buffer described by
+	// BITBLTBUF: syncs if queued work still uses any touched page, then
+	// invalidates those pages in the texture cache.
+
+	if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+
+	GSOffset* dst = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
+
+	dst->GetPages(r, m_tmp_pages);
+
+	// check if the changing pages either used as a texture or a target;
+	// a single hit is enough, one Sync(6) covers every page
+
+	bool scan = !m_rl->IsSynced();
+
+	for(size_t n = 0; scan && m_tmp_pages[n] != GSOffset::EOP; n++)
+	{
+		const uint32 page = m_tmp_pages[n];
+
+		if(m_fzb_pages[page] | m_tex_pages[page])
+		{
+			Sync(6);
+
+			scan = false;
+		}
+	}
+
+	// if texture update runs on a thread and Sync(5) happens then this must come later
+
+	m_tc->InvalidatePages(m_tmp_pages, dst->psm);
+}
+
+void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
+{
+	// Called for a read of rectangle r from the source buffer described by
+	// BITBLTBUF: syncs if queued work still renders to any page being read.
+	// 'clut' only changes the tag written to the log.
+
+	if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
+
+	if(m_rl->IsSynced())
+	{
+		return; // nothing queued, memory is already up to date
+	}
+
+	GSOffset* src = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
+
+	src->GetPages(r, m_tmp_pages);
+
+	for(size_t n = 0; m_tmp_pages[n] != GSOffset::EOP; n++)
+	{
+		if(m_fzb_pages[m_tmp_pages[n]])
+		{
+			// a frame or z buffer page in the read area is still in use
+
+			Sync(7);
+
+			return;
+		}
+	}
+}
+
+void GSRendererSW::UsePages(const uint32* pages, const int type)
+{
+	// Bumps the use counter of every page in the EOP terminated list.
+	// type 0: low 16 bits of m_fzb_pages, type 1: high 16 bits of m_fzb_pages,
+	// type 2: m_tex_pages. Any other type leaves the counters untouched.
+
+	for(const uint32* it = pages; GSOffset::EOP != *it; ++it)
+	{
+		const uint32 page = *it;
+
+		if(type == 0)
+		{
+			ASSERT((m_fzb_pages[page] & 0xFFFF) < USHRT_MAX);
+			m_fzb_pages[page] += 1;
+		}
+		else if(type == 1)
+		{
+			ASSERT((m_fzb_pages[page] >> 16) < USHRT_MAX);
+			m_fzb_pages[page] += 0x10000;
+		}
+		else if(type == 2)
+		{
+			ASSERT(m_tex_pages[page] < USHRT_MAX);
+			m_tex_pages[page] += 1;
+		}
+	}
+}
+
+void GSRendererSW::ReleasePages(const uint32* pages, const int type)
+{
+	// Drops the use counter of every page in the EOP terminated list; the
+	// exact mirror of UsePages().
+	// type 0: low 16 bits of m_fzb_pages, type 1: high 16 bits of m_fzb_pages,
+	// type 2: m_tex_pages. Any other type leaves the counters untouched.
+
+	for(const uint32* it = pages; GSOffset::EOP != *it; ++it)
+	{
+		const uint32 page = *it;
+
+		if(type == 0)
+		{
+			ASSERT((m_fzb_pages[page] & 0xFFFF) > 0);
+			m_fzb_pages[page] -= 1;
+		}
+		else if(type == 1)
+		{
+			ASSERT((m_fzb_pages[page] >> 16) > 0);
+			m_fzb_pages[page] -= 0x10000;
+		}
+		else if(type == 2)
+		{
+			ASSERT(m_tex_pages[page] > 0);
+			m_tex_pages[page] -= 1;
+		}
+	}
+}
+
+// Decides whether the batch about to be queued needs a sync point because of
+// the frame/z buffer pages it renders to.
+//
+// fb_pages / zb_pages: EOP terminated page lists of the frame and z buffer, or
+//   NULL when the caller has none. Lists passed in stay owned by the caller;
+//   lists fetched here through GetPages(r) are deleted before returning.
+// r: rectangle used both to fetch missing page lists and to grow m_fzb_bbox.
+//
+// Returns true when a sync is required. Also updates m_fzb, m_fzb_bbox and the
+// m_fzb_cur_pages bitmap (one bit per page, 32 pages per row).
+bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r)
+{
+	bool synced = m_rl->IsSynced();
+
+	// remember which lists were supplied by the caller, only those are cross-checked
+	// below and only the other ones are freed at the end
+
+	bool fb = fb_pages != NULL;
+	bool zb = zb_pages != NULL;
+
+	bool res = false;
+
+	if(m_fzb != m_context->offset.fzb4)
+	{
+		// targets changed, check everything
+
+		m_fzb = m_context->offset.fzb4;
+		m_fzb_bbox = r;
+
+		if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
+		if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
+
+		memset(m_fzb_cur_pages, 0, sizeof(m_fzb_cur_pages));
+
+		uint32 used = 0;
+
+		for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
+		{
+			uint32 i = *p;
+
+			uint32 row = i >> 5;
+			uint32 col = 1u << (i & 31); // unsigned literal, bit 31 must not go through a signed shift
+
+			m_fzb_cur_pages[row] |= col;
+
+			used |= m_fzb_pages[i];
+		}
+
+		for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
+		{
+			uint32 i = *p;
+
+			uint32 row = i >> 5;
+			uint32 col = 1u << (i & 31);
+
+			m_fzb_cur_pages[row] |= col;
+
+			used |= m_fzb_pages[i];
+		}
+
+		if(!synced)
+		{
+			if(used)
+			{
+				if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);}
+
+				res = true;
+			}
+
+			//if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);}
+		}
+	}
+	else
+	{
+		// same target, only check new areas and cross-rendering between frame and z-buffer
+
+		GSVector4i bbox = m_fzb_bbox.runion(r);
+
+		bool check = !m_fzb_bbox.eq(bbox);
+
+		m_fzb_bbox = bbox;
+
+		if(check)
+		{
+			// drawing area is larger than previous time, check new parts only to avoid false positives (m_fzb_cur_pages guards)
+
+			if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r);
+			if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r);
+
+			uint32 used = 0;
+
+			for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
+			{
+				uint32 i = *p;
+
+				uint32 row = i >> 5;
+				uint32 col = 1u << (i & 31);
+
+				if((m_fzb_cur_pages[row] & col) == 0)
+				{
+					m_fzb_cur_pages[row] |= col;
+
+					used |= m_fzb_pages[i];
+				}
+			}
+
+			for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
+			{
+				uint32 i = *p;
+
+				uint32 row = i >> 5;
+				uint32 col = 1u << (i & 31);
+
+				if((m_fzb_cur_pages[row] & col) == 0)
+				{
+					m_fzb_cur_pages[row] |= col;
+
+					used |= m_fzb_pages[i];
+				}
+			}
+
+			if(!synced)
+			{
+				if(used)
+				{
+					if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);}
+
+					res = true;
+				}
+			}
+		}
+
+		if(!synced)
+		{
+			// cross-check frame and z-buffer pages, they cannot overlap with each other and with previous batches in queue,
+			// have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300)
+
+			if(fb && !res)
+			{
+				for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++)
+				{
+					// high 16 bits: page is still in use as a z buffer
+
+					if(m_fzb_pages[*p] & 0xffff0000)
+					{
+						if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);}
+
+						res = true;
+
+						break;
+					}
+				}
+			}
+
+			if(zb && !res)
+			{
+				for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++)
+				{
+					// low 16 bits: page is still in use as a frame buffer
+
+					if(m_fzb_pages[*p] & 0x0000ffff)
+					{
+						if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);}
+
+						res = true;
+
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	// free the lists that were fetched here, never the ones handed in by the caller
+
+	if(!fb && fb_pages != NULL) delete [] fb_pages;
+	if(!zb && zb_pages != NULL) delete [] zb_pages;
+
+	return res;
+}
+
+bool GSRendererSW::CheckSourcePages(SharedData* sd)
+{
+	// Returns true when any texture level of the batch reads a page that queued
+	// work is still drawing to, false otherwise (always false when the
+	// rasterizer list is already synced).
+
+	if(m_rl->IsSynced())
+	{
+		return false;
+	}
+
+	for(size_t level = 0; sd->m_tex[level].t != NULL; level++)
+	{
+		// pages covered by the used rectangle of this level (instead of sd->m_tex[level].t->m_pages.n)
+
+		sd->m_tex[level].t->m_offset->GetPages(sd->m_tex[level].r, m_tmp_pages);
+
+		// TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2)
+
+		for(const uint32* page = m_tmp_pages; GSOffset::EOP != *page; ++page)
+		{
+			if(m_fzb_pages[*page])
+			{
+				return true; // currently being drawn to? => sync
+			}
+		}
+	}
+
+	return false;
+}
+
+#include "GSTextureSW.h"
+
+// Fills data->global (the per batch constants of the scanline renderer) from
+// the current drawing environment, context, PRIM register and vertex trace.
+//
+// Side effects visible here: reads the clut when a paletted texture is used,
+// registers the texture (and mip levels) of the batch through data->SetSource(),
+// may rewrite the texture coordinates in data->vertex, and allocates private
+// copies for gd.clut / gd.dimx (released by ~SharedData).
+//
+// Returns false when the batch writes neither the frame nor the z buffer, or
+// when a texture lookup fails; true otherwise.
+bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
+{
+	GSScanlineGlobalData& gd = data->global;
+
+	const GSDrawingEnvironment& env = m_env;
+	const GSDrawingContext* context = m_context;
+	const GS_PRIM_CLASS primclass = m_vt.m_primclass;
+
+	gd.vm = m_mem.m_vm8;
+
+	gd.fbr = context->offset.fb->pixel.row;
+	gd.zbr = context->offset.zb->pixel.row;
+	gd.fbc = context->offset.fb->pixel.col[0];
+	gd.zbc = context->offset.zb->pixel.col[0];
+	gd.fzbr = context->offset.fzb4->row;
+	gd.fzbc = context->offset.fzb4->col;
+
+	// selector defaults, refined below
+
+	gd.sel.key = 0;
+
+	gd.sel.fpsm = 3;
+	gd.sel.zpsm = 3;
+	gd.sel.atst = ATST_ALWAYS;
+	gd.sel.tfx = TFX_NONE;
+	gd.sel.ababcd = 0xff;
+	gd.sel.prim = primclass;
+
+	// write masks, 0xffffffff means nothing is written
+
+	uint32 fm = context->FRAME.FBMSK;
+	uint32 zm = (context->ZBUF.ZMSK || context->TEST.ZTE == 0) ? 0xffffffff : 0;
+
+	if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
+	{
+		fm = 0xffffffff;
+		zm = 0xffffffff;
+	}
+
+	if(PRIM->TME)
+	{
+		if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+		{
+			m_mem.m_clut.Read32(context->TEX0, env.TEXA);
+		}
+	}
+
+	if(context->TEST.ATE)
+	{
+		if(!TryAlphaTest(fm, zm))
+		{
+			gd.sel.atst = context->TEST.ATST;
+			gd.sel.afail = context->TEST.AFAIL;
+
+			gd.aref = GSVector4i((int)context->TEST.AREF);
+
+			// fold the strict comparisons into their non-strict counterparts by adjusting the reference
+
+			switch(gd.sel.atst)
+			{
+			case ATST_LESS:
+				gd.sel.atst = ATST_LEQUAL;
+				gd.aref -= GSVector4i::x00000001();
+				break;
+			case ATST_GREATER:
+				gd.sel.atst = ATST_GEQUAL;
+				gd.aref += GSVector4i::x00000001();
+				break;
+			}
+		}
+	}
+
+	bool fwrite = fm != 0xffffffff;
+	bool ftest = gd.sel.atst != ATST_ALWAYS || (context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24);
+
+	bool zwrite = zm != 0xffffffff;
+	bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
+	/*
+	printf("%05x %d %05x %d %05x %d %dx%d\n", 
+		fwrite || ftest ? m_context->FRAME.Block() : 0xfffff, m_context->FRAME.PSM,
+		zwrite || ztest ? m_context->ZBUF.Block() : 0xfffff, m_context->ZBUF.PSM,
+		PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH);
+	*/
+	if(!fwrite && !zwrite) return false;
+
+	gd.sel.fwrite = fwrite;
+	gd.sel.ftest = ftest;
+
+	if(fwrite || ftest)
+	{
+		gd.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt;
+
+		if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff)
+		{
+			gd.sel.iip = PRIM->IIP;
+		}
+
+		if(PRIM->TME)
+		{
+			gd.sel.tfx = context->TEX0.TFX;
+			gd.sel.tcc = context->TEX0.TCC;
+			gd.sel.fst = PRIM->FST;
+			gd.sel.ltf = m_vt.IsLinear();
+
+			if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
+			{
+				gd.sel.tlu = 1;
+
+				gd.clut = (uint32*)_aligned_malloc(sizeof(uint32) * 256, 32); // FIXME: might address uninitialized data of the texture (0xCD) that is not in 0-15 range for 4-bpp formats
+
+				memcpy(gd.clut, (const uint32*)m_mem.m_clut, sizeof(uint32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal);
+			}
+
+			gd.sel.wms = context->CLAMP.WMS;
+			gd.sel.wmt = context->CLAMP.WMT;
+
+			if(gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128)))
+			{
+				// modulate does not do anything when vertex color is 0x80
+
+				gd.sel.tfx = TFX_DECAL;
+			}
+
+			bool mipmap = IsMipMapActive();
+
+			GIFRegTEX0 TEX0 = m_context->GetSizeFixedTEX0(m_vt.m_min.t.xyxy(m_vt.m_max.t), m_vt.IsLinear(), mipmap);
+
+			GSVector4i r;
+
+			GetTextureMinMax(r, TEX0, context->CLAMP, gd.sel.ltf);
+
+			GSTextureCacheSW::Texture* t = m_tc->Lookup(TEX0, env.TEXA);
+
+			if(t == NULL) {ASSERT(0); return false;}
+
+			data->SetSource(t, r, 0);
+
+			gd.sel.tw = t->m_tw - 3;
+
+			if(mipmap)
+			{
+				// TEX1.MMIN
+				// 000 p
+				// 001 l
+				// 010 p round
+				// 011 p tri
+				// 100 l round
+				// 101 l tri
+
+				if(m_vt.m_lod.x > 0)
+				{
+					gd.sel.ltf = context->TEX1.MMIN >> 2;
+				}
+				else
+				{
+					// TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0
+				}
+
+				gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
+				gd.sel.lcm = context->TEX1.LCM;
+
+				int mxl = std::min<int>((int)context->TEX1.MXL, 6) << 16;
+				int k = context->TEX1.K << 12;
+
+				if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
+				{
+					k = (int)m_vt.m_lod.x << 16; // set lod to max level
+
+					gd.sel.lcm = 1; // lod is constant
+					gd.sel.mmin = 1; // tri-linear is meaningless
+				}
+
+				if(gd.sel.mmin == 2)
+				{
+					mxl--; // don't sample beyond the last level (TODO: add a dummy level instead?)
+				}
+
+				if(gd.sel.fst)
+				{
+					ASSERT(gd.sel.lcm == 1);
+					ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
+
+					gd.sel.lcm = 1;
+				}
+
+				if(gd.sel.lcm)
+				{
+					int lod = std::max<int>(std::min<int>(k, mxl), 0);
+
+					if(gd.sel.mmin == 1)
+					{
+						lod = (lod + 0x8000) & 0xffff0000; // rounding
+					}
+
+					gd.lod.i = GSVector4i(lod >> 16);
+					gd.lod.f = GSVector4i(lod & 0xffff).xxxxl().xxzz();
+
+					// TODO: lot to optimize when lod is constant
+				}
+				else
+				{
+					gd.mxl = GSVector4((float)mxl);
+					// shift the positive constant and negate afterwards, left shifting a negative value is not portable
+					gd.l = GSVector4((float)(-(0x10000 << context->TEX1.L)));
+					gd.k = GSVector4((float)k);
+				}
+
+				GIFRegTEX0 MIP_TEX0 = TEX0;
+				GIFRegCLAMP MIP_CLAMP = context->CLAMP;
+
+				// the vertex trace is scaled down level by level inside the loop, keep the originals to restore them
+
+				GSVector4 tmin = m_vt.m_min.t;
+				GSVector4 tmax = m_vt.m_max.t;
+
+				for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++)
+				{
+					switch(i)
+					{
+					case 1:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP1;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW1;
+						break;
+					case 2:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP2;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW2;
+						break;
+					case 3:
+						MIP_TEX0.TBP0 = context->MIPTBP1.TBP3;
+						MIP_TEX0.TBW = context->MIPTBP1.TBW3;
+						break;
+					case 4:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP4;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW4;
+						break;
+					case 5:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP5;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW5;
+						break;
+					case 6:
+						MIP_TEX0.TBP0 = context->MIPTBP2.TBP6;
+						MIP_TEX0.TBW = context->MIPTBP2.TBW6;
+						break;
+					default:
+						__assume(0);
+					}
+
+					// every level is half the size of the previous one
+
+					if(MIP_TEX0.TW > 0) MIP_TEX0.TW--;
+					if(MIP_TEX0.TH > 0) MIP_TEX0.TH--;
+
+					MIP_CLAMP.MINU >>= 1;
+					MIP_CLAMP.MINV >>= 1;
+					MIP_CLAMP.MAXU >>= 1;
+					MIP_CLAMP.MAXV >>= 1;
+
+					m_vt.m_min.t *= 0.5f;
+					m_vt.m_max.t *= 0.5f;
+
+					GSTextureCacheSW::Texture* mip_t = m_tc->Lookup(MIP_TEX0, env.TEXA, gd.sel.tw + 3);
+
+					if(mip_t == NULL)
+					{
+						ASSERT(0);
+
+						// do not leave the vertex trace scaled down when bailing out
+
+						m_vt.m_min.t = tmin;
+						m_vt.m_max.t = tmax;
+
+						return false;
+					}
+
+					GSVector4i mip_r;
+
+					GetTextureMinMax(mip_r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf);
+
+					data->SetSource(mip_t, mip_r, i);
+				}
+
+				m_vt.m_min.t = tmin;
+				m_vt.m_max.t = tmax;
+			}
+			else
+			{
+				if(gd.sel.fst == 0)
+				{
+					// skip per pixel division if q is constant
+
+					GSVertexSW* RESTRICT v = data->vertex;
+
+					if(m_vt.m_eq.q)
+					{
+						gd.sel.fst = 1;
+
+						const GSVector4& t0 = v[data->index[0]].t;
+
+						if(t0.z != 1.0f)
+						{
+							GSVector4 w = t0.zzzz().rcpnr();
+
+							for(int i = 0, j = data->vertex_count; i < j; i++)
+							{
+								GSVector4 vt = v[i].t;
+
+								v[i].t = (vt * w).xyzw(vt);
+							}
+						}
+					}
+					else if(primclass == GS_SPRITE_CLASS)
+					{
+						gd.sel.fst = 1;
+
+						// both vertices of a sprite are divided by the q of the second one
+
+						for(int i = 0, j = data->vertex_count; i < j; i += 2)
+						{
+							GSVector4 t0 = v[i + 0].t;
+							GSVector4 t1 = v[i + 1].t;
+
+							GSVector4 w = t1.zzzz().rcpnr();
+
+							v[i + 0].t = (t0 * w).xyzw(t0);
+							v[i + 1].t = (t1 * w).xyzw(t1);
+						}
+					}
+				}
+
+				if(gd.sel.ltf && gd.sel.fst)
+				{
+					// if q is constant we can do the half pel shift for bilinear sampling on the vertices
+
+					// TODO: but not when mipmapping is used!!!
+
+					GSVector4 half(0x8000, 0x8000);
+
+					GSVertexSW* RESTRICT v = data->vertex;
+
+					for(int i = 0, j = data->vertex_count; i < j; i++)
+					{
+						GSVector4 vt = v[i].t;
+
+						v[i].t = (vt - half).xyzw(vt);
+					}
+				}
+			}
+
+			uint16 tw = 1u << TEX0.TW;
+			uint16 th = 1u << TEX0.TH;
+
+			// horizontal wrap mode
+
+			switch(context->CLAMP.WMS)
+			{
+			case CLAMP_REPEAT:
+				gd.t.min.u16[0] = gd.t.minmax.u16[0] = tw - 1;
+				gd.t.max.u16[0] = gd.t.minmax.u16[2] = 0;
+				gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				gd.t.min.u16[0] = gd.t.minmax.u16[0] = 0;
+				gd.t.max.u16[0] = gd.t.minmax.u16[2] = tw - 1;
+				gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				gd.t.min.u16[0] = gd.t.minmax.u16[0] = std::min<uint16>(context->CLAMP.MINU, tw - 1);
+				gd.t.max.u16[0] = gd.t.minmax.u16[2] = std::min<uint16>(context->CLAMP.MAXU, tw - 1);
+				gd.t.mask.u32[0] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				gd.t.min.u16[0] = gd.t.minmax.u16[0] = context->CLAMP.MINU & (tw - 1);
+				gd.t.max.u16[0] = gd.t.minmax.u16[2] = context->CLAMP.MAXU & (tw - 1);
+				gd.t.mask.u32[0] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+
+			// vertical wrap mode
+
+			switch(context->CLAMP.WMT)
+			{
+			case CLAMP_REPEAT:
+				gd.t.min.u16[4] = gd.t.minmax.u16[1] = th - 1;
+				gd.t.max.u16[4] = gd.t.minmax.u16[3] = 0;
+				gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			case CLAMP_CLAMP:
+				gd.t.min.u16[4] = gd.t.minmax.u16[1] = 0;
+				gd.t.max.u16[4] = gd.t.minmax.u16[3] = th - 1;
+				gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_CLAMP:
+				gd.t.min.u16[4] = gd.t.minmax.u16[1] = std::min<uint16>(context->CLAMP.MINV, th - 1);
+				gd.t.max.u16[4] = gd.t.minmax.u16[3] = std::min<uint16>(context->CLAMP.MAXV, th - 1); // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256)
+				gd.t.mask.u32[2] = 0;
+				break;
+			case CLAMP_REGION_REPEAT:
+				gd.t.min.u16[4] = gd.t.minmax.u16[1] = context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127
+				gd.t.max.u16[4] = gd.t.minmax.u16[3] = context->CLAMP.MAXV & (th - 1);
+				gd.t.mask.u32[2] = 0xffffffff;
+				break;
+			default:
+				__assume(0);
+			}
+
+			gd.t.min = gd.t.min.xxxxlh();
+			gd.t.max = gd.t.max.xxxxlh();
+			gd.t.mask = gd.t.mask.xxzz();
+			gd.t.invmask = ~gd.t.mask;
+		}
+
+		if(PRIM->FGE)
+		{
+			gd.sel.fge = 1;
+
+			gd.frb = env.FOGCOL.u32[0] & 0x00ff00ff;
+			gd.fga = (env.FOGCOL.u32[0] >> 8) & 0x00ff00ff;
+		}
+
+		if(context->FRAME.PSM != PSM_PSMCT24)
+		{
+			gd.sel.date = context->TEST.DATE;
+			gd.sel.datm = context->TEST.DATM;
+		}
+
+		if(!IsOpaque())
+		{
+			gd.sel.abe = PRIM->ABE;
+			gd.sel.ababcd = context->ALPHA.u32[0];
+
+			if(env.PABE.PABE)
+			{
+				gd.sel.pabe = 1;
+			}
+
+			if(m_aa1 && PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS))
+			{
+				gd.sel.aa1 = 1;
+			}
+
+			gd.afix = GSVector4i((int)context->ALPHA.FIX << 7).xxzzlh();
+		}
+
+		// the frame buffer has to be read back whenever the destination pixel takes part in the result
+
+		if(gd.sel.date
+		|| gd.sel.aba == 1 || gd.sel.abb == 1 || gd.sel.abc == 1 || gd.sel.abd == 1
+		|| (gd.sel.atst != ATST_ALWAYS && gd.sel.afail == AFAIL_RGB_ONLY)
+		|| (gd.sel.fpsm == 0 && fm != 0 && fm != 0xffffffff)
+		|| (gd.sel.fpsm == 1 && (fm & 0x00ffffff) != 0 && (fm & 0x00ffffff) != 0x00ffffff)
+		|| (gd.sel.fpsm == 2 && (fm & 0x80f8f8f8) != 0 && (fm & 0x80f8f8f8) != 0x80f8f8f8))
+		{
+			gd.sel.rfb = 1;
+		}
+
+		gd.sel.colclamp = env.COLCLAMP.CLAMP;
+		gd.sel.fba = context->FBA.FBA;
+
+		if(env.DTHE.DTHE)
+		{
+			gd.sel.dthe = 1;
+
+			gd.dimx = (GSVector4i*)_aligned_malloc(sizeof(env.dimx), 32);
+
+			memcpy(gd.dimx, env.dimx, sizeof(env.dimx));
+		}
+	}
+
+	gd.sel.zwrite = zwrite;
+	gd.sel.ztest = ztest;
+
+	if(zwrite || ztest)
+	{
+		gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
+		gd.sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS;
+		gd.sel.zoverflow = (uint32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
+	}
+
+	// convert the masks to the layout of the selected pixel formats
+
+	#if _M_SSE >= 0x501
+
+	gd.fm = fm;
+	gd.zm = zm;
+
+	if(gd.sel.fpsm == 1)
+	{
+		gd.fm |= 0xff000000;
+	}
+	else if(gd.sel.fpsm == 2)
+	{
+		uint32 rb = gd.fm & 0x00f800f8;
+		uint32 ga = gd.fm & 0x8000f800;
+
+		gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
+	}
+
+	if(gd.sel.zpsm == 1)
+	{
+		gd.zm |= 0xff000000;
+	}
+	else if(gd.sel.zpsm == 2)
+	{
+		gd.zm |= 0xffff0000;
+	}
+
+	#else
+
+	gd.fm = GSVector4i(fm);
+	gd.zm = GSVector4i(zm);
+
+	if(gd.sel.fpsm == 1)
+	{
+		gd.fm |= GSVector4i::xff000000();
+	}
+	else if(gd.sel.fpsm == 2)
+	{
+		GSVector4i rb = gd.fm & 0x00f800f8;
+		GSVector4i ga = gd.fm & 0x8000f800;
+
+		gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | GSVector4i::xffff0000();
+	}
+
+	if(gd.sel.zpsm == 1)
+	{
+		gd.zm |= GSVector4i::xff000000();
+	}
+	else if(gd.sel.zpsm == 2)
+	{
+		gd.zm |= GSVector4i::xffff0000();
+	}
+
+	#endif
+
+	if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only
+	{
+		// no per pixel test is needed, the fast path additionally requires every
+		// vertex x to sit on an 8 (AVX2 build) or 4 pixel boundary
+
+		gd.sel.notest = 1;
+
+		uint32 ofx = context->XYOFFSET.OFX;
+
+		for(int i = 0, j = m_vertex.tail; i < j; i++)
+		{
+			#if _M_SSE >= 0x501
+			if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8
+			#else
+			if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4
+			#endif
+			{
+				gd.sel.notest = 0;
+
+				break;
+			}
+		}
+	}
+
+	return true;
+}
+
+// Creates an empty batch bound to 'parent': no page lists, no textures, no
+// sync point, an all-zero scanline selector and no private clut/dither copies.
+GSRendererSW::SharedData::SharedData(GSRendererSW* parent)
+	: m_parent(parent),
+	  m_fb_pages(NULL),
+	  m_zb_pages(NULL),
+	  m_fpsm(0),
+	  m_zpsm(0),
+	  m_using_pages(false),
+	  m_syncpoint(SyncNone)
+{
+	// buffers that ~SharedData frees when they are set
+	global.clut = NULL;
+	global.dimx = NULL;
+
+	global.sel.key = 0;
+
+	// the texture list is NULL terminated, start with an empty one
+	m_tex[0].t = NULL;
+}
+
+// Gives the pages back to the parent renderer, frees the private clut/dither
+// copies and, when logging is enabled, writes a "done" line for the batch.
+GSRendererSW::SharedData::~SharedData()
+{
+	ReleasePages();
+
+	if(global.clut != NULL)
+	{
+		_aligned_free(global.clut);
+	}
+
+	if(global.dimx != NULL)
+	{
+		_aligned_free(global.dimx);
+	}
+
+	if(LOG)
+	{
+		fprintf(s_fp, "[%d] done t=%lld p=%d | %d %d %d | %08x_%08x\n",
+			counter, __rdtsc() - start, pixels, primclass, vertex_count, index_count, global.sel.hi, global.sel.lo);
+
+		fflush(s_fp);
+	}
+}
+
+//static TransactionScope::Lock s_lock;
+
+// Marks the frame buffer, z buffer and texture pages of this batch as in use
+// on the parent renderer and stores the page lists and pixel formats.
+// Does nothing when the pages are already marked.
+void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm)
+{
+	if(m_using_pages)
+	{
+		return;
+	}
+
+	//TransactionScope scope(s_lock);
+
+	const bool use_fb = global.sel.fb && fb_pages != NULL;
+	const bool use_zb = global.sel.zb && zb_pages != NULL;
+
+	if(use_fb) m_parent->UsePages(fb_pages, 0); // type 0: frame buffer
+	if(use_zb) m_parent->UsePages(zb_pages, 1); // type 1: z buffer
+
+	for(size_t level = 0; m_tex[level].t != NULL; level++)
+	{
+		m_parent->UsePages(m_tex[level].t->m_pages.n, 2); // type 2: texture
+	}
+
+	// keep the lists, ReleasePages() hands them back and deletes them
+
+	m_fpsm = fpsm;
+	m_zpsm = zpsm;
+	m_fb_pages = fb_pages;
+	m_zb_pages = zb_pages;
+
+	m_using_pages = true;
+}
+
+// Undoes UsePages(): decrements the page use counters on the parent renderer
+// and deletes the stored frame/z buffer page lists.
+// Does nothing when no pages are currently marked as used.
+void GSRendererSW::SharedData::ReleasePages()
+{
+	if(!m_using_pages) return;
+
+	{
+		//TransactionScope scope(s_lock);
+
+		// same conditions as in UsePages(): a list that was NULL there was never
+		// counted and must not be walked here either
+
+		if(global.sel.fb && m_fb_pages != NULL)
+		{
+			m_parent->ReleasePages(m_fb_pages, 0);
+		}
+
+		if(global.sel.zb && m_zb_pages != NULL)
+		{
+			m_parent->ReleasePages(m_zb_pages, 1);
+		}
+
+		for(size_t i = 0; m_tex[i].t != NULL; i++)
+		{
+			m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
+		}
+	}
+
+	// the lists were handed over in UsePages(), delete [] on NULL is a no-op
+
+	delete [] m_fb_pages;
+	delete [] m_zb_pages;
+
+	m_fb_pages = NULL;
+	m_zb_pages = NULL;
+
+	m_using_pages = false;
+}
+
+// Stores texture t and its used rectangle r as mip level 'level' of this batch
+// and moves the NULL terminator of the texture list to the following entry.
+void GSRendererSW::SharedData::SetSource(GSTextureCacheSW::Texture* t, const GSVector4i& r, int level)
+{
+	TextureLevel& slot = m_tex[level];
+
+	ASSERT(slot.t == NULL); // the level is expected to be the current terminator
+
+	slot.t = t;
+	slot.r = r;
+
+	m_tex[level + 1].t = NULL;
+}
+
+// Refreshes every texture level of the batch and publishes the texture buffers
+// in global.tex[]. When a texture update fails, texturing is switched off for
+// the batch (global.sel.tfx = TFX_NONE).
+// While dumping is enabled on the parent renderer (s_dump / s_savet) the
+// textures and the private clut copy are also saved as bitmaps.
+void GSRendererSW::SharedData::UpdateSource()
+{
+	for(size_t i = 0; m_tex[i].t != NULL; i++)
+	{
+		if(m_tex[i].t->Update(m_tex[i].r))
+		{
+			global.tex[i] = m_tex[i].t->m_buff;
+		}
+		else
+		{
+			printf("GSdx: out-of-memory, texturing temporarily disabled\n");
+
+			global.sel.tfx = TFX_NONE;
+		}
+	}
+
+	// TODO
+
+	if(m_parent->s_dump)
+	{
+		uint64 frame = m_parent->m_perfmon.GetFrame();
+
+		string s;
+
+		if(m_parent->s_savet && m_parent->s_n >= m_parent->s_saven)
+		{
+			for(size_t i = 0; m_tex[i].t != NULL; i++)
+			{
+				// the level index is printed with %d, pass it as an int (size_t is wider on 64-bit builds)
+
+				s = format("%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n - 2, frame, (int)i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM);
+
+				m_tex[i].t->Save(root_sw+s);
+			}
+
+			if(global.clut != NULL)
+			{
+				// dump the palette copy as a 256x1 texture
+
+				GSTextureSW* t = new GSTextureSW(0, 256, 1);
+
+				t->Update(GSVector4i(0, 0, 256, 1), global.clut, sizeof(uint32) * 256);
+
+				s = format("%05d_f%lld_texp_%05x_%d.bmp", m_parent->s_n - 2, frame, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM);
+
+				t->Save(root_sw+s);
+
+				delete t;
+			}
+		}
+	}
+}
diff --git a/plugins/GSdx_legacy/GSRendererSW.h b/plugins/GSdx_legacy/GSRendererSW.h
new file mode 100644
index 0000000000..b7b66c145c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSRendererSW.h
@@ -0,0 +1,100 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+#include "GSTextureCacheSW.h"
+#include "GSDrawScanline.h"
+
+class GSRendererSW : public GSRenderer // software renderer: builds batches and hands them to a rasterizer list (m_rl)
+{
+	class SharedData : public GSDrawScanline::SharedData // one queued batch plus the pages and textures it uses
+	{
+		__aligned(struct, 16) TextureLevel // one entry of the texture list, level 0 is the base texture
+		{
+			GSVector4i r; // rectangle of the texture that the batch uses
+			GSTextureCacheSW::Texture* t; // NULL marks the end of the list
+		};
+
+	public:
+		GSRendererSW* m_parent; // renderer whose page counters are updated
+		const uint32* m_fb_pages; // EOP terminated frame buffer page list, deleted in ReleasePages()
+		const uint32* m_zb_pages; // EOP terminated z buffer page list, deleted in ReleasePages()
+		int m_fpsm; // pixel format passed along with m_fb_pages when invalidating
+		int m_zpsm; // pixel format passed along with m_zb_pages when invalidating
+		bool m_using_pages; // true between UsePages() and ReleasePages()
+		TextureLevel m_tex[7 + 1]; // NULL terminated
+		enum {SyncNone, SyncSource, SyncTarget} m_syncpoint; // Queue() syncs before (source) or after (target) the texture update
+
+	public:
+		SharedData(GSRendererSW* parent);
+		virtual ~SharedData();
+
+		void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm);
+		void ReleasePages();
+
+		void SetSource(GSTextureCacheSW::Texture* t, const GSVector4i& r, int level);
+		void UpdateSource();
+	};
+
+	typedef void (GSRendererSW::*ConvertVertexBufferPtr)(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+	ConvertVertexBufferPtr m_cvb[4][2][2]; // presumably indexed by [primclass][tme][fst] like the template below - TODO confirm
+
+	template<uint32 primclass, uint32 tme, uint32 fst>
+	void ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
+
+protected:
+	IRasterizer* m_rl; // rasterizer list that batches are queued on and synced with
+	GSTextureCacheSW* m_tc; // texture cache used for lookups and page invalidation
+	GSTexture* m_texture[2];
+	uint8* m_output;
+	GSPixelOffset4* m_fzb; // frame/z offset of the targets seen by the last CheckTargetPages()
+	GSVector4i m_fzb_bbox; // union of the rectangles checked since the targets last changed
+	uint32 m_fzb_cur_pages[16]; // bitmap of the pages already checked for the current targets (16 x 32 bits)
+	std::atomic<uint32> m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved (frame count in the low half, zbuf count in the high half)
+	std::atomic<uint16> m_tex_pages[512]; // per page count of queued batches that use it as a texture
+	uint32 m_tmp_pages[512 + 1]; // scratch page list, terminated by GSOffset::EOP
+
+	void Reset();
+	void VSync(int field);
+	void ResetDevice();
+	GSTexture* GetOutput(int i);
+
+	void Draw();
+	void Queue(shared_ptr<GSRasterizerData>& item);
+	void Sync(int reason); // reason is only used for logging
+	void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
+	void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
+
+	void UsePages(const uint32* pages, const int type); // type 0: frame buffer, 1: z buffer, 2: texture
+	void ReleasePages(const uint32* pages, const int type); // same type values as UsePages
+
+	bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r); // true: sync needed
+	bool CheckSourcePages(SharedData* sd); // true: sync needed
+
+	bool GetScanlineGlobalData(SharedData* data);
+
+public:
+	GSRendererSW(int threads);
+	virtual ~GSRendererSW();
+};
diff --git a/plugins/GSdx_legacy/GSScanlineEnvironment.h b/plugins/GSdx_legacy/GSScanlineEnvironment.h
new file mode 100644
index 0000000000..cc71026b72
--- /dev/null
+++ b/plugins/GSdx_legacy/GSScanlineEnvironment.h
@@ -0,0 +1,216 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSLocalMemory.h"
+#include "GSVector.h"
+
+union GSScanlineSelector // 64-bit key describing one scanline renderer variant, the trailing numbers are the starting bit positions
+{
+	struct
+	{
+		uint32 fpsm:2; // 0
+		uint32 zpsm:2; // 2
+		uint32 ztst:2; // 4 (0: off, 1: write, 2: test (ge), 3: test (g))
+		uint32 atst:3; // 6
+		uint32 afail:2; // 9
+		uint32 iip:1; // 11
+		uint32 tfx:3; // 12
+		uint32 tcc:1; // 15
+		uint32 fst:1; // 16
+		uint32 ltf:1; // 17
+		uint32 tlu:1; // 18
+		uint32 fge:1; // 19
+		uint32 date:1; // 20
+		uint32 abe:1; // 21
+		uint32 aba:2; // 22
+		uint32 abb:2; // 24
+		uint32 abc:2; // 26
+		uint32 abd:2; // 28
+		uint32 pabe:1; // 30
+		uint32 aa1:1; // 31
+
+		uint32 fwrite:1; // 32
+		uint32 ftest:1; // 33
+		uint32 rfb:1; // 34
+		uint32 zwrite:1; // 35
+		uint32 ztest:1; // 36
+		uint32 zoverflow:1; // 37 (z max >= 0x80000000)
+		uint32 wms:2; // 38
+		uint32 wmt:2; // 40
+		uint32 datm:1; // 42
+		uint32 colclamp:1; // 43
+		uint32 fba:1; // 44
+		uint32 dthe:1; // 45
+		uint32 prim:2; // 46
+
+		uint32 edge:1; // 48
+		uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
+		uint32 lcm:1; // 52
+		uint32 mmin:2; // 53
+		uint32 notest:1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
+		// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
+	};
+
+	struct // grouped view of some of the fields above (given the bit positions noted there)
+	{
+		uint32 _pad1:22;
+		uint32 ababcd:8; // 22 (aba, abb, abc, abd together)
+		uint32 _pad2:2;
+		uint32 fb:2; // 32 (fwrite, ftest together)
+		uint32 _pad3:1;
+		uint32 zb:2; // 35 (zwrite, ztest together)
+	};
+
+	struct
+	{
+		uint32 lo;
+		uint32 hi;
+	};
+
+	uint64 key;
+
+	operator uint32() const {return lo;}
+	operator uint64() const {return key;}
+
+	bool IsSolidRect() const // true for an untextured, unblended, flat shaded sprite without alpha/destination-alpha test, z test or fog
+	{
+		return prim == GS_SPRITE_CLASS
+			&& iip == 0
+			&& tfx == TFX_NONE
+			&& abe == 0
+			&& ztst <= 1
+			&& atst <= 1
+			&& date == 0
+			&& fge == 0;
+	}
+};
+
+__aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like a pixel shader constant buffer
+{
+	GSScanlineSelector sel;
+
+	// - the data of vm, tex may change, multi-threaded drawing must be finished before that happens, clut and dimx are copies
+	// - tex is a cached texture, it may be recycled to free up memory, its absolute address cannot be compiled into code
+	// - row and column pointers are allocated once and never change or freed, their address can be used directly
+
+	void* vm;
+	const void* tex[7]; // one buffer per texture level
+	uint32* clut; // NULL when unused
+	GSVector4i* dimx; // NULL when unused
+
+	const int* fbr; // frame buffer row offsets
+	const int* zbr; // z buffer row offsets
+	const int* fbc; // frame buffer column offsets
+	const int* zbc; // z buffer column offsets
+	const GSVector2i* fzbr; // combined frame/z row offsets
+	const GSVector2i* fzbc; // combined frame/z column offsets
+
+	GSVector4i aref;
+	GSVector4i afix;
+	struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
+
+	#if _M_SSE >= 0x501
+
+	uint32 fm, zm; // frame and z write masks
+	uint32 frb, fga; // fog color, red/blue and green/alpha parts
+	GSVector8 mxl;
+	GSVector8 k; // TEX1.K * 0x10000
+	GSVector8 l; // TEX1.L * -0x10000
+	struct {GSVector8i i, f;} lod; // lcm == 1
+
+	#else
+
+	GSVector4i fm, zm; // frame and z write masks
+	GSVector4i frb, fga; // fog color, red/blue and green/alpha parts
+	GSVector4 mxl;
+	GSVector4 k; // TEX1.K * 0x10000
+	GSVector4 l; // TEX1.L * -0x10000
+	struct {GSVector4i i, f;} lod; // lcm == 1
+
+	#endif
+};
+
+__aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own
+{
+	#if _M_SSE >= 0x501
+
+	struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8]; // NOTE(review): one entry per pixel of an 8 wide step, presumably - confirm
+	struct step {GSVector4 stq; struct {uint32 rb, ga;} c; struct {uint32 z, f;} p;} d8;
+	struct {GSVector8i rb, ga;} c;
+	struct {uint32 z, f;} p;
+
+	// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
+
+	struct // scratch values of the scanline code, 8 wide variant
+	{
+		GSVector8 z, zo;
+		GSVector8i f;
+		GSVector8 s, t, q;
+		GSVector8i rb, ga;
+		GSVector8i zs, zd;
+		GSVector8i uf, vf;
+		GSVector8i cov;
+
+		// mipmapping
+
+		struct {GSVector8i i, f;} lod;
+		GSVector8i uv[2];
+		GSVector8i uv_minmax[2];
+		GSVector8i trb, tga;
+		GSVector8i test;
+	} temp; 
+
+	#else
+
+	struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4]; // NOTE(review): one entry per pixel of a 4 wide step, presumably - confirm
+	struct step {GSVector4 z, stq; GSVector4i c, f;} d4;
+	struct {GSVector4i rb, ga;} c;
+	struct {GSVector4i z, f;} p;
+
+	// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
+
+	struct // scratch values of the scanline code, 4 wide variant
+	{
+		GSVector4 z, zo;
+		GSVector4i f;
+		GSVector4 s, t, q;
+		GSVector4i rb, ga;
+		GSVector4i zs, zd;
+		GSVector4i uf, vf;
+		GSVector4i cov;
+
+		// mipmapping
+
+		struct {GSVector4i i, f;} lod;
+		GSVector4i uv[2];
+		GSVector4i uv_minmax[2];
+		GSVector4i trb, tga;
+		GSVector4i test;
+	} temp; 
+
+	#endif
+
+	//
+
+	const GSScanlineGlobalData* gd; // the per batch constants this prim is drawn with
+};
diff --git a/plugins/GSdx_legacy/GSSetting.cpp b/plugins/GSdx_legacy/GSSetting.cpp
new file mode 100644
index 0000000000..74c54eb411
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetting.cpp
@@ -0,0 +1,144 @@
+/*
+ *	Copyright (C) 2007-2015 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetting.h"
+#ifndef __linux__
+#include "resource.h"
+#endif
+
+// Looks up the help/tooltip text for the dialog control identified by ID.
+//
+// ID         - one of the IDC_* control identifiers.
+// updateText - optional; when non-NULL it receives true if a text exists for ID
+//              and false if ID is not known.
+//
+// Returns the text for ID, or an empty string when ID is not known. The returned
+// pointer refers to a string literal and must not be freed.
+const char* dialog_message(int ID, bool* updateText) {
+	bool known = true;
+	const char* message = "";
+
+	switch (ID)
+	{
+		case IDC_FILTER:
+			message = "Control the texture bilinear filtering of the emulation.\n\n"
+				"Nearest:\nAlways disable interpolation, rendering will be blocky.\n\n"
+				"PS2:\nUse same mode as the PS2. It is the more accurate option.\n\n"
+				"Forced:\nAlways enable interpolation. Rendering is smoother but it could generate some glitches.";
+			break;
+		case IDC_CRC_LEVEL:
+			message = "Control the number of Auto-CRC hacks applied to games.\n\n"
+				"None:\nRemove nearly all CRC hacks (debug only).\n\n"
+				"Minimum:\nEnable a couple of CRC hacks (23).\n\n"
+				"Partial:\nEnable most of the CRC hacks.\nRecommended OpenGL setting (Accurate/depth options may be required).\n\n"
+				"Full:\nEnable all CRC hacks.\nRecommended Direct3D setting.\n\n"
+				"Aggressive:\nUse more aggressive CRC hacks. Only affects a few games, removing some effects which might make the image sharper/clearer.\n"
+				"Affected games: FFX, FFX2, FFXII, GOW2, ICO, SoTC, SSX3, SMT3, SMTDDS1, SMTDDS2.\n"
+				"Works as a speedhack for: Steambot Chronicles.";
+			break;
+		case IDC_SKIPDRAWHACK:
+		case IDC_SKIPDRAWHACKEDIT:
+			message = "Skips drawing n surfaces completely. "
+				"Use it, for example, to try and get rid of bad post processing effects."
+				" Try values between 1 and 100.";
+			break;
+		case IDC_ALPHAHACK:
+			message = "Different alpha handling. Can work around some shadow problems.";
+			break;
+		case IDC_OFFSETHACK:
+			message = "Might fix some misaligned fog, bloom, or blend effect.";
+			break;
+		case IDC_SPRITEHACK:
+			message = "Helps getting rid of black inner lines in some filtered sprites."
+				" Half option is the preferred one. Use it for Mana Khemia or Ar tonelico for example."
+				" Full can be used for Tales of Destiny.";
+			break;
+		case IDC_WILDHACK:
+			message = "Lowers the GS precision to avoid gaps between pixels when upscaling. Fixes the text on Wild Arms games.";
+			break;
+		case IDC_MSAACB:
+			message = "Enables hardware Anti-Aliasing. Needs lots of memory."
+				" The Z-24 modes might need to have LogarithmicZ to compensate for the bits lost (only in DX9 mode).\n\n"
+				" MSAA is not implemented on the OpenGL renderer.";
+			break;
+		case IDC_ALPHASTENCIL:
+			message = "Extend stencil based emulation of destination alpha to perform stencil operations while drawing.\n\n"
+				"Improves many shadows which are normally overdrawn in parts, may affect other effects.\n"
+				"Will disable partial transparency in some games or even prevent drawing some elements altogether.";
+			break;
+		case IDC_CHECK_DISABLE_ALL_HACKS:
+			message = "FOR TESTING ONLY!!\n\n"
+				"Disable all CRC hacks - will break many games. Overrides CrcHacksExclusion at gsdx.ini\n"
+				"\n"
+				"It's possible to exclude CRC hacks also via the gsdx.ini. E.g.:\n"
+				"CrcHacksExclusions=all\n"
+				"CrcHacksExclusions=0x0F0C4A9C, 0x0EE5646B, 0x7ACF7E03";
+			break;
+		case IDC_ALIGN_SPRITE:
+			message = "Fixes issues with upscaling(vertical lines) in Namco games like Ace Combat, Tekken, Soul Calibur, etc.";
+			break;
+		case IDC_ROUND_SPRITE:
+			message = "Corrects the sampling of 2D sprite textures when upscaling.\n\n"
+				"Fixes lines in sprites of games like Ar tonelico when upscaling.\n\n"
+				"Half option is for flat sprites, Full is for all sprites.";
+			break;
+		case IDC_TCOFFSETX:
+		case IDC_TCOFFSETX2:
+		case IDC_TCOFFSETY:
+		case IDC_TCOFFSETY2:
+			message = "Offset for the ST/UV texture coordinates. Fixes some odd texture issues and might fix some post processing alignment too.\n\n"
+				"  0500 0500, fixes Persona 3 minimap, helps Haunting Ground.\n"
+				"  0000 1000, fixes Xenosaga hair edges (DX10+ Issue)";
+			break;
+		case IDC_PALTEX:
+			message = "When checked 4/8 bits texture will be send to the GPU with a palette. GPU will be in charge of the conversion.\n\n"
+				"When unchecked the CPU will convert directly the texture to 32 bits.\n\n"
+				"It is basically a trade-off between GPU/CPU.";
+			break;
+		case IDC_ACCURATE_DATE:
+			message = "Implement a more accurate algorithm to compute GS destination alpha testing.\n\n"
+				"It could be slower when the effects are used.\n\nNote: it requires the OpenGL 4.2 extension GL_ARB_shader_image_load_store.";
+			break;
+		case IDC_ACCURATE_BLEND_UNIT:
+			message = "Control the accuracy level of the GS blending unit emulation. Note: it requires OpenGL 4.5 driver support.\n\n"
+				"None:\nFast but introduce various rendering issues. It is intended for slow computer.\n\n"
+				"Basic:\nEmulate correctly most of the effects with a limited speed penalty. It is the recommended setting.\n\n"
+				"Medium:\nExtend it to all sprites. Performance impact remains reasonable in 3D game.\n\n"
+				"High:\nExtend it to destination alpha blending and color wrapping. (help shadow and fog effect). A good CPU is required.\n\n"
+				"Full:\nExcept few cases, the blending unit will be fully emulated by the shader. It is ultra slow! It is intended for debug.\n\n"
+				"Ultra:\nThe blending unit will be completely emulated by the shader. It is ultra slow! It is intended for debug.";
+			break;
+		case IDC_SAFE_FBMASK:
+			message = "By default, accurate blending relies on undefined hardware behavior to be fast.\nThis option enables a slower but safer behavior if anyone encounters an issue.\n";
+			break;
+		case IDC_TC_DEPTH:
+			message = "Allows the conversion of Depth buffer from/to Color buffer. It is used for blur & depth of field effects";
+			break;
+		case IDC_AFCOMBO:
+			message = "Reduces texture aliasing at extreme viewing angles. High performance impact.";
+			break;
+		case IDC_AA1:
+			message = "Internal GS feature. Reduces edge aliasing of lines and triangles when the game requests it.";
+			break;
+		case IDC_SWTHREADS:
+		case IDC_SWTHREADS_EDIT:
+			message = "Number of rendering threads: 0 for single thread, 2 or more for multithread (1 is for debugging)";
+			break;
+		case IDC_SHADEBOOST:
+			message = "Allows brightness, contrast and saturation to be manually adjusted.";
+			break;
+		case IDC_SHADER_FX:
+			message = "Enables external shader for additional post-processing effects.";
+			break;
+		case IDC_FXAA:
+			message = "Enables fast approximate anti-aliasing. Small performance impact.";
+			break;
+#ifdef _WIN32
+		// DX9 only
+		case IDC_FBA:
+			message = "Makes textures partially or fully transparent as required by emulation. May cause unusual slowdowns for some games.";
+			break;
+		case IDC_LOGZ:
+			message = "Treat depth as logarithmic instead of linear. Recommended setting is on unless it causes graphical glitches.";
+			break;
+#endif
+		// Exclusive for Hardware Renderer
+		case IDC_PRELOAD_GS:
+			message = "Uploads GS data when rendering a new frame to reproduce some effects accurately. Fixes black screen issues in games like Armored Core: Last Raven.";
+			break;
+		case IDC_MIPMAP:
+			message = "Enables mipmapping, which some games require to render correctly. Turn off only for debug purposes.";
+			break;
+#ifdef __linux__
+		case IDC_FAST_TC_INV:
+			message = "By default, the texture cache handles partial invalidations. Unfortunately it is very costly to compute CPU wise."
+				"\n\nThis hack replaces the partial invalidation with a complete deletion of the texture to reduce the CPU load.\n\nIt helps snowblind engine game.";
+			break;
+#endif
+		default:
+			// No text registered for this control.
+			known = false;
+			break;
+	}
+
+	if (updateText)
+		*updateText = known;
+
+	return message;
+}
diff --git a/plugins/GSdx_legacy/GSSetting.h b/plugins/GSdx_legacy/GSSetting.h
new file mode 100644
index 0000000000..ba47d92810
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetting.h
@@ -0,0 +1,78 @@
+/*
+ *	Copyright (C) 2007-2015 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "stdafx.h"
+
+// One selectable option: an integer value together with a name and a note.
+struct GSSetting
+{
+	int32_t value;
+	std::string name;
+	std::string note;
+
+	// value may be of any type that static_cast can convert to int32_t
+	// (plain integers, enum values, ...). name and note are copied.
+	template <typename T>
+	explicit GSSetting(T value, const char* name, const char* note)
+		: value(static_cast<int32_t>(value))
+		, name(name)
+		, note(note)
+	{
+	}
+};
+
+// Returns the help text for the dialog control ID, or "" when there is none.
+// When updateText is non-NULL it is set to whether a text was found.
+const char* dialog_message(int ID, bool* updateText = NULL);
+
+// Control identifiers for Linux builds. GSSetting.cpp only includes resource.h when
+// __linux__ is not defined, so the IDs used by dialog_message() are declared here.
+// The enumerators take consecutive implicit values starting at 0.
+#ifdef __linux__
+enum {
+	IDC_FILTER,
+	IDC_SKIPDRAWHACK,
+	IDC_SKIPDRAWHACKEDIT,
+	IDC_ALPHAHACK,
+	IDC_OFFSETHACK,
+	IDC_SPRITEHACK,
+	IDC_WILDHACK,
+	IDC_MSAACB,
+	IDC_ALPHASTENCIL,
+	IDC_CHECK_DISABLE_ALL_HACKS,
+	IDC_ALIGN_SPRITE,
+	IDC_ROUND_SPRITE,
+	IDC_TCOFFSETX,
+	IDC_TCOFFSETX2,
+	IDC_TCOFFSETY,
+	IDC_TCOFFSETY2,
+	IDC_PALTEX,
+	IDC_ACCURATE_BLEND_UNIT,
+	IDC_SAFE_FBMASK,
+	IDC_ACCURATE_DATE,
+	IDC_TC_DEPTH,
+	IDC_CRC_LEVEL,
+	IDC_AFCOMBO,
+	IDC_AA1,
+	IDC_SWTHREADS,
+	IDC_SWTHREADS_EDIT,
+	IDC_SHADEBOOST,
+	IDC_SHADER_FX,
+	IDC_FXAA,
+	IDC_MIPMAP,
+	IDC_PRELOAD_GS,
+	IDC_FAST_TC_INV,
+};
+#endif
diff --git a/plugins/GSdx_legacy/GSSettingsDlg.cpp b/plugins/GSdx_legacy/GSSettingsDlg.cpp
new file mode 100644
index 0000000000..4e0ac39229
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSettingsDlg.cpp
@@ -0,0 +1,752 @@
+/*
+ *	Copyright (C) 2007-2015 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSSettingsDlg.h"
+#include "GSUtil.h"
+#include "GSDevice9.h"
+#include "GSDevice11.h"
+#include "resource.h"
+#include "GSSetting.h"
+
+
+// Constructs the main settings dialog for the IDD_CONFIG template.
+// When ENABLE_OPENCL is defined, the OpenCL device descriptions reported by GSUtil
+// are stored in m_ocl_devs, each with a running index as its value.
+GSSettingsDlg::GSSettingsDlg()
+	: GSDialog(IDD_CONFIG)
+{
+#ifdef ENABLE_OPENCL
+	list<OCLDeviceDesc> devices;
+	GSUtil::GetDeviceDescs(devices);
+
+	int next_index = 0;
+
+	for(const auto& device : devices)
+	{
+		m_ocl_devs.push_back(GSSetting(next_index, device.name.c_str(), ""));
+		next_index++;
+	}
+#endif
+}
+
+// Initializes the main settings dialog.
+//
+// Steps, in order:
+//  1. Rebuild the adapter list: two fixed entries ("default" and "ref") followed by
+//     the adapters enumerated through DXGI, or through Direct3D9 when no DXGI
+//     factory could be created.
+//  2. Fill the adapter, OpenCL device and renderer combo boxes, restoring the
+//     selections stored in the configuration.
+//  3. Load the remaining combo boxes, check boxes and spin controls from the
+//     configuration.
+//  4. Register tooltips and refresh the enabled/visible state of the controls.
+void GSSettingsDlg::OnInit()
+{
+	__super::OnInit();
+
+	CComPtr<IDirect3D9> d3d9;
+
+	d3d9.Attach(Direct3DCreate9(D3D_SDK_VERSION));
+
+	CComPtr<IDXGIFactory1> dxgi_factory;
+
+	if(GSUtil::CheckDXGI())
+	{
+		CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&dxgi_factory);
+	}
+
+	adapters.clear();
+	adapters.push_back(Adapter("Default Hardware Device", "default", GSUtil::CheckDirect3D11Level(NULL, D3D_DRIVER_TYPE_HARDWARE)));
+	adapters.push_back(Adapter("Reference Device", "ref", GSUtil::CheckDirect3D11Level(NULL, D3D_DRIVER_TYPE_REFERENCE)));
+
+	if(dxgi_factory)
+	{
+		// Enumerate until EnumAdapters1 stops returning S_OK.
+		for(int i = 0;; i++)
+		{
+			CComPtr<IDXGIAdapter1> adapter;
+
+			if(S_OK != dxgi_factory->EnumAdapters1(i, &adapter))
+				break;
+
+			DXGI_ADAPTER_DESC1 desc;
+
+			HRESULT hr = adapter->GetDesc1(&desc);
+
+			if(S_OK == hr)
+			{
+				D3D_FEATURE_LEVEL level = GSUtil::CheckDirect3D11Level(adapter, D3D_DRIVER_TYPE_UNKNOWN);
+				// GSDX isn't unicode!?
+#if 1
+				// The source length passed to WideCharToMultiByte is counted in
+				// characters. sizeof(desc.Description) is a byte count and made the
+				// call read past the end of the WCHAR array. Passing -1 converts the
+				// NUL-terminated string only; the returned size then includes the
+				// terminator.
+				std::string name;
+
+				int size = WideCharToMultiByte(CP_ACP, 0, desc.Description, -1, NULL, 0, NULL, NULL);
+
+				if(size > 0)
+				{
+					name.resize(size);
+
+					if(0 == WideCharToMultiByte(CP_ACP, 0, desc.Description, -1, &name[0], size, NULL, NULL))
+					{
+						// Conversion failed: fall back to an empty name instead of
+						// handing over an unterminated buffer.
+						name.clear();
+					}
+				}
+
+				// c_str() ends the name at the terminator written by the conversion.
+				adapters.push_back(Adapter(name.c_str(), GSAdapter(desc), level));
+#else
+				adapters.push_back(Adapter(desc.Description, GSAdapter(desc), level));
+#endif
+			}
+		}
+	}
+	else if(d3d9)
+	{
+		int n = d3d9->GetAdapterCount();
+		for(int i = 0; i < n; i++)
+		{
+			D3DADAPTER_IDENTIFIER9 desc;
+
+			if(D3D_OK != d3d9->GetAdapterIdentifier(i, 0, &desc))
+				break;
+
+			// GSDX isn't unicode!?
+#if 0
+			wchar_t buf[sizeof desc.Description * sizeof(WCHAR)];
+			MultiByteToWideChar(CP_ACP /* I have no idea if this is right */, 0, desc.Description, sizeof(desc.Description), buf, sizeof buf / sizeof *buf);
+			adapters.push_back(Adapter(buf, GSAdapter(desc), (D3D_FEATURE_LEVEL)0));
+#else
+			adapters.push_back(Adapter(desc.Description, GSAdapter(desc), (D3D_FEATURE_LEVEL)0));
+#endif
+		}
+	}
+
+	// Build the combo entries and find the adapter stored in the configuration;
+	// index 0 ("default") is used when it is not present.
+	std::string adapter_setting = theApp.GetConfig("Adapter", "default");
+	vector<GSSetting> adapter_settings;
+	unsigned int adapter_sel = 0;
+
+	for(unsigned int i = 0; i < adapters.size(); i++)
+	{
+		if(adapters[i].id == adapter_setting)
+		{
+			adapter_sel = i;
+		}
+
+		adapter_settings.push_back(GSSetting(i, adapters[i].name.c_str(), ""));
+	}
+
+	// Same for the OpenCL device, matched by name.
+	std::string ocldev = theApp.GetConfig("ocldev", "");
+
+	unsigned int ocl_sel = 0;
+
+	for(unsigned int i = 0; i < m_ocl_devs.size(); i++)
+	{
+		if(ocldev == m_ocl_devs[i].name)
+		{
+			ocl_sel = i;
+
+			break;
+		}
+	}
+
+	ComboBoxInit(IDC_ADAPTER, adapter_settings, adapter_sel);
+	ComboBoxInit(IDC_OPENCL_DEVICE, m_ocl_devs, ocl_sel);
+	UpdateRenderers();
+
+	ComboBoxInit(IDC_INTERLACE, theApp.m_gs_interlace, theApp.GetConfig("Interlace", 7)); // 7 = "auto", detects interlace based on SMODE2 register
+	ComboBoxInit(IDC_UPSCALE_MULTIPLIER, theApp.m_gs_upscale_multiplier, theApp.GetConfig("upscale_multiplier", 1));
+	ComboBoxInit(IDC_AFCOMBO, theApp.m_gs_max_anisotropy, theApp.GetConfig("MaxAnisotropy", 0));
+	ComboBoxInit(IDC_FILTER, theApp.m_gs_filter, theApp.GetConfig("filter", 2));
+	ComboBoxInit(IDC_ACCURATE_BLEND_UNIT, theApp.m_gs_acc_blend_level, theApp.GetConfig("accurate_blending_unit", 1));
+	ComboBoxInit(IDC_CRC_LEVEL, theApp.m_gs_crc_level, theApp.GetConfig("crc_hack_level", 3));
+
+	CheckDlgButton(m_hWnd, IDC_PALTEX, theApp.GetConfig("paltex", 0));
+	CheckDlgButton(m_hWnd, IDC_LOGZ, theApp.GetConfig("logz", 1));
+	CheckDlgButton(m_hWnd, IDC_FBA, theApp.GetConfig("fba", 1));
+	CheckDlgButton(m_hWnd, IDC_AA1, theApp.GetConfig("aa1", 0));
+	CheckDlgButton(m_hWnd, IDC_MIPMAP, theApp.GetConfig("mipmap", 1));
+	CheckDlgButton(m_hWnd, IDC_ACCURATE_DATE, theApp.GetConfig("accurate_date", 0));
+	CheckDlgButton(m_hWnd, IDC_TC_DEPTH, theApp.GetConfig("texture_cache_depth", 0));
+
+	// Hacks
+	CheckDlgButton(m_hWnd, IDC_HACKS_ENABLED, theApp.GetConfig("UserHacks", 0));
+
+	// Spin controls: set the range first, then the stored position.
+	SendMessage(GetDlgItem(m_hWnd, IDC_RESX), UDM_SETRANGE, 0, MAKELPARAM(8192, 256));
+	SendMessage(GetDlgItem(m_hWnd, IDC_RESX), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resx", 1024), 0));
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETRANGE, 0, MAKELPARAM(8192, 256));
+	SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resy", 1024), 0));
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_SETRANGE, 0, MAKELPARAM(16, 0));
+	SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("extrathreads", DEFAULT_EXTRA_RENDERING_THREADS), 0));
+
+	AddTooltip(IDC_FILTER);
+	AddTooltip(IDC_CRC_LEVEL);
+	AddTooltip(IDC_PALTEX);
+	AddTooltip(IDC_ACCURATE_DATE);
+	AddTooltip(IDC_ACCURATE_BLEND_UNIT);
+	AddTooltip(IDC_TC_DEPTH);
+	AddTooltip(IDC_AFCOMBO);
+	AddTooltip(IDC_AA1);
+	AddTooltip(IDC_MIPMAP);
+	AddTooltip(IDC_SWTHREADS);
+	AddTooltip(IDC_SWTHREADS_EDIT);
+	AddTooltip(IDC_FBA);
+	AddTooltip(IDC_LOGZ);
+
+	UpdateControls();
+}
+
+// Handles command notifications of the main settings dialog.
+//
+// hWnd - window the notification came from (forwarded to the base class).
+// id   - identifier of the control or button.
+// code - notification code (CBN_SELCHANGE, BN_CLICKED, ...).
+//
+// Selection/check changes refresh the dependent controls, the two sub-dialog
+// buttons open their dialogs, and IDOK writes the current state of all controls
+// to the configuration. Every notification is finally forwarded to the base
+// class, whose result is returned.
+bool GSSettingsDlg::OnCommand(HWND hWnd, UINT id, UINT code)
+{
+	switch (id)
+	{
+		case IDC_ADAPTER:
+			if (code == CBN_SELCHANGE)
+			{
+				// The renderer list depends on the adapter, so rebuild it first.
+				UpdateRenderers();
+				UpdateControls();
+			}
+			break;
+		case IDC_RENDERER:
+		case IDC_UPSCALE_MULTIPLIER:
+		case IDC_FILTER:
+			if (code == CBN_SELCHANGE)
+				UpdateControls();
+			break;
+		case IDC_PALTEX:
+		case IDC_HACKS_ENABLED:
+			if (code == BN_CLICKED)
+				UpdateControls();
+			break;
+		case IDC_SHADEBUTTON:
+			if (code == BN_CLICKED)
+				ShaderDlg.DoModal();
+			break;
+		case IDC_HACKSBUTTON:
+			if (code == BN_CLICKED)
+				HacksDlg.DoModal();
+			break;
+		case IDOK:
+		{
+			INT_PTR data;
+
+			// The combo item data is used as an index; only accept it when it is
+			// inside the container, for the adapter list as well as the OpenCL list.
+			if(ComboBoxGetSelData(IDC_ADAPTER, data) && data >= 0 && (size_t)data < adapters.size())
+			{
+				theApp.SetConfig("Adapter", adapters[(size_t)data].id.c_str());
+			}
+
+			if(ComboBoxGetSelData(IDC_OPENCL_DEVICE, data) && data >= 0 && (size_t)data < m_ocl_devs.size())
+			{
+				theApp.SetConfig("ocldev", m_ocl_devs[(size_t)data].name.c_str());
+			}
+
+			if(ComboBoxGetSelData(IDC_RENDERER, data))
+			{
+				theApp.SetConfig("Renderer", (int)data);
+			}
+
+			if(ComboBoxGetSelData(IDC_INTERLACE, data))
+			{
+				theApp.SetConfig("Interlace", (int)data);
+			}
+
+			// Unlike the other combos, a missing selection resets the multiplier to 1.
+			if(ComboBoxGetSelData(IDC_UPSCALE_MULTIPLIER, data))
+			{
+				theApp.SetConfig("upscale_multiplier", (int)data);
+			}
+			else
+			{
+				theApp.SetConfig("upscale_multiplier", 1);
+			}
+
+			if (ComboBoxGetSelData(IDC_FILTER, data))
+			{
+				theApp.SetConfig("filter", (int)data);
+			}
+
+			if(ComboBoxGetSelData(IDC_ACCURATE_BLEND_UNIT, data))
+			{
+				theApp.SetConfig("accurate_blending_unit", (int)data);
+			}
+
+			if (ComboBoxGetSelData(IDC_CRC_LEVEL, data))
+			{
+				theApp.SetConfig("crc_hack_level", (int)data);
+			}
+
+			if(ComboBoxGetSelData(IDC_AFCOMBO, data))
+			{
+				theApp.SetConfig("MaxAnisotropy", (int)data);
+			}
+
+			// Check boxes and spin controls are stored unconditionally.
+			theApp.SetConfig("paltex", (int)IsDlgButtonChecked(m_hWnd, IDC_PALTEX));
+			theApp.SetConfig("logz", (int)IsDlgButtonChecked(m_hWnd, IDC_LOGZ));
+			theApp.SetConfig("fba", (int)IsDlgButtonChecked(m_hWnd, IDC_FBA));
+			theApp.SetConfig("aa1", (int)IsDlgButtonChecked(m_hWnd, IDC_AA1));
+			theApp.SetConfig("mipmap", (int)IsDlgButtonChecked(m_hWnd, IDC_MIPMAP));
+			theApp.SetConfig("resx", (int)SendMessage(GetDlgItem(m_hWnd, IDC_RESX), UDM_GETPOS, 0, 0));
+			theApp.SetConfig("resy", (int)SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_GETPOS, 0, 0));
+			theApp.SetConfig("extrathreads", (int)SendMessage(GetDlgItem(m_hWnd, IDC_SWTHREADS), UDM_GETPOS, 0, 0));
+			theApp.SetConfig("accurate_date", (int)IsDlgButtonChecked(m_hWnd, IDC_ACCURATE_DATE));
+			theApp.SetConfig("texture_cache_depth", (int)IsDlgButtonChecked(m_hWnd, IDC_TC_DEPTH));
+			theApp.SetConfig("UserHacks", (int)IsDlgButtonChecked(m_hWnd, IDC_HACKS_ENABLED));
+		}
+		break;
+	}
+
+	return __super::OnCommand(hWnd, id, code);
+}
+
+// Rebuilds the renderer combo box for the adapter selected in IDC_ADAPTER.
+//
+// Does nothing when no adapter is selected. The DX1011_* renderers are left out
+// when the adapter's feature level is below D3D_FEATURE_LEVEL_10_0. The renderer
+// that was selected before (or, without a selection, the configured one) stays
+// selected if it is still listed; otherwise GSRendererType::Default is used.
+void GSSettingsDlg::UpdateRenderers()
+{
+	INT_PTR sel;
+
+	if (!ComboBoxGetSelData(IDC_ADAPTER, sel))
+		return;
+
+	const auto& chosen = adapters[(int)sel];
+
+	// Ugggh
+	HacksDlg.SetAdapter(chosen.id);
+
+	const D3D_FEATURE_LEVEL level = chosen.level;
+
+	vector<GSSetting> listed;
+
+	// Prefer the live combo selection and fall back to the stored configuration.
+	GSRendererType wanted;
+
+	if (ComboBoxGetSelData(IDC_RENDERER, sel))
+	{
+		wanted = static_cast<GSRendererType>(sel);
+	}
+	else
+	{
+		wanted = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(GSRendererType::Default)));
+	}
+
+	GSRendererType selected = GSRendererType::Default;
+
+	for(size_t n = 0; n < theApp.m_gs_renderers.size(); n++)
+	{
+		const GSSetting& entry = theApp.m_gs_renderers[n];
+		const GSRendererType type = static_cast<GSRendererType>(entry.value);
+
+		const bool is_dx1011 =
+			type == GSRendererType::DX1011_HW ||
+			type == GSRendererType::DX1011_SW ||
+			type == GSRendererType::DX1011_Null ||
+			type == GSRendererType::DX1011_OpenCL;
+
+		// The entry name is intentionally not suffixed with the feature level
+		// ("10"/"11"); it is listed under its plain name.
+		if(is_dx1011 && level < D3D_FEATURE_LEVEL_10_0)
+			continue;
+
+		listed.push_back(entry);
+
+		if (type == wanted)
+			selected = wanted;
+	}
+
+	ComboBoxInit(IDC_RENDERER, listed, static_cast<int32_t>(selected));
+}
+
+// Shows/hides and enables/disables the controls of the main settings dialog
+// according to the selected renderer, the upscale multiplier, the texture filter
+// and the state of the paltex / hacks check boxes.
+// Nothing is changed when the renderer combo box has no selection.
+void GSSettingsDlg::UpdateControls()
+{
+	// Local shorthands for the two operations applied to a long list of controls.
+	auto show = [this](int ctrl, bool visible)
+	{
+		ShowWindow(GetDlgItem(m_hWnd, ctrl), visible ? SW_SHOW : SW_HIDE);
+	};
+	auto enable = [this](int ctrl, bool on)
+	{
+		EnableWindow(GetDlgItem(m_hWnd, ctrl), on);
+	};
+
+	INT_PTR data;
+
+	// in case reading the combo doesn't work, enable the custom res control anyway
+	int multiplier = 0;
+
+	if(ComboBoxGetSelData(IDC_UPSCALE_MULTIPLIER, data))
+		multiplier = (int)data;
+
+	if(!ComboBoxGetSelData(IDC_RENDERER, data))
+		return;
+
+	const GSRendererType renderer = static_cast<GSRendererType>(data);
+
+	// Which API the renderer belongs to
+	const bool dx9 =
+		renderer == GSRendererType::DX9_HW || renderer == GSRendererType::DX9_SW ||
+		renderer == GSRendererType::DX9_Null || renderer == GSRendererType::DX9_OpenCL;
+	const bool dx11 =
+		renderer == GSRendererType::DX1011_HW || renderer == GSRendererType::DX1011_SW ||
+		renderer == GSRendererType::DX1011_Null || renderer == GSRendererType::DX1011_OpenCL;
+	const bool ogl =
+		renderer == GSRendererType::OGL_HW || renderer == GSRendererType::OGL_SW ||
+		renderer == GSRendererType::OGL_OpenCL;
+
+	// Which kind of rasterizer it uses
+	const bool hw =
+		renderer == GSRendererType::DX9_HW || renderer == GSRendererType::DX1011_HW ||
+		renderer == GSRendererType::OGL_HW || renderer == GSRendererType::Null_HW;
+	const bool sw =
+		renderer == GSRendererType::DX9_SW || renderer == GSRendererType::DX1011_SW ||
+		renderer == GSRendererType::OGL_SW || renderer == GSRendererType::Null_SW;
+	const bool ocl =
+		renderer == GSRendererType::DX9_OpenCL || renderer == GSRendererType::DX1011_OpenCL ||
+		renderer == GSRendererType::Null_OpenCL || renderer == GSRendererType::OGL_OpenCL;
+
+	// The custom resolution is only editable for HW renderers without a multiplier.
+	const bool custom_res = hw && !multiplier;
+
+	show(IDC_LOGO9, dx9);
+	show(IDC_LOGO11, dx11);
+	show(IDC_LOGOGL, ogl);
+#ifndef ENABLE_OPENCL
+	show(IDC_OPENCL_DEVICE, false);
+	show(IDC_OPENCL_TEXT, false);
+#endif
+
+	show(IDC_LOGZ, dx9);
+	show(IDC_FBA, dx9);
+
+	show(IDC_ACCURATE_DATE, ogl);
+	show(IDC_ACCURATE_BLEND_UNIT, ogl);
+	show(IDC_ACCURATE_BLEND_UNIT_TEXT, ogl);
+	show(IDC_TC_DEPTH, ogl);
+
+	enable(IDC_CRC_LEVEL, hw);
+	enable(IDC_CRC_LEVEL_TEXT, hw);
+	enable(IDC_OPENCL_DEVICE, ocl);
+	enable(IDC_RESX, custom_res);
+	enable(IDC_RESX_EDIT, custom_res);
+	enable(IDC_RESY, custom_res);
+	enable(IDC_RESY_EDIT, custom_res);
+	enable(IDC_CUSTOM_TEXT, custom_res);
+	enable(IDC_UPSCALE_MULTIPLIER, hw);
+	enable(IDC_UPSCALE_MULTIPLIER_TEXT, hw);
+	enable(IDC_FILTER, hw);
+	enable(IDC_PALTEX, hw);
+	enable(IDC_LOGZ, dx9 && hw);
+	enable(IDC_FBA, dx9 && hw);
+
+	// The anisotropy combo needs a HW renderer, a non-zero filter value and
+	// paltex unchecked. It is left alone when the filter combo has no selection.
+	INT_PTR filter;
+	if (ComboBoxGetSelData(IDC_FILTER, filter))
+	{
+		enable(IDC_AFCOMBO, hw && filter && !IsDlgButtonChecked(m_hWnd, IDC_PALTEX));
+	}
+	enable(IDC_AFCOMBO_TEXT, hw);
+	enable(IDC_FILTER_TEXT, hw);
+	enable(IDC_ACCURATE_DATE, ogl && hw);
+	enable(IDC_ACCURATE_BLEND_UNIT, ogl && hw);
+	enable(IDC_ACCURATE_BLEND_UNIT_TEXT, ogl && hw);
+	enable(IDC_TC_DEPTH, ogl && hw);
+
+	// Software mode settings
+	enable(IDC_AA1, sw);
+	enable(IDC_MIPMAP, sw);
+	enable(IDC_SWTHREADS_TEXT, sw);
+	enable(IDC_SWTHREADS_EDIT, sw);
+	enable(IDC_SWTHREADS, sw);
+
+	// Hacks
+	enable(IDC_HACKS_ENABLED, hw);
+	enable(IDC_HACKSBUTTON, hw && IsDlgButtonChecked(m_hWnd, IDC_HACKS_ENABLED));
+}
+
+// Shader Configuration Dialog
+
+// Constructs the shader dialog; IDD_SHADER is passed on to the GSDialog base.
+GSShaderDlg::GSShaderDlg()
+	: GSDialog(IDD_SHADER)
+{
+}
+
+// Loads the shader dialog from the configuration: TV shader combo, shade boost
+// check box and slider values, external FX shader paths and the FXAA check box.
+// Afterwards registers the tooltips and refreshes the controls.
+void GSShaderDlg::OnInit()
+{
+	// TV shader
+	ComboBoxInit(IDC_TVSHADER, theApp.m_gs_tv_shaders, theApp.GetConfig("TVshader", 0));
+
+	// Shade boost; the three values are kept in members and pushed to the
+	// sliders by UpdateControls()
+	CheckDlgButton(m_hWnd, IDC_SHADEBOOST, theApp.GetConfig("ShadeBoost", 0));
+	contrast = theApp.GetConfig("ShadeBoost_Contrast", 50);
+	brightness = theApp.GetConfig("ShadeBoost_Brightness", 50);
+	saturation = theApp.GetConfig("ShadeBoost_Saturation", 50);
+
+	// External FX shader: enabled flag, shader file and its configuration file
+	CheckDlgButton(m_hWnd, IDC_SHADER_FX, theApp.GetConfig("shaderfx", 0));
+
+	const std::string fx_file = theApp.GetConfig("shaderfx_glsl", "shaders\\GSdx.fx");
+	SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_EDIT), WM_SETTEXT, 0, (LPARAM)fx_file.c_str());
+
+	const std::string fx_conf = theApp.GetConfig("shaderfx_conf", "shaders\\GSdx_FX_Settings.ini");
+	SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_EDIT), WM_SETTEXT, 0, (LPARAM)fx_conf.c_str());
+
+	// FXAA shader
+	CheckDlgButton(m_hWnd, IDC_FXAA, theApp.GetConfig("Fxaa", 0));
+
+	AddTooltip(IDC_SHADEBOOST);
+	AddTooltip(IDC_SHADER_FX);
+	AddTooltip(IDC_FXAA);
+
+	UpdateControls();
+}
+
+// Pushes the cached saturation/brightness/contrast values to their sliders and
+// text labels, then enables or disables the shade boost and external shader
+// controls according to their check boxes.
+void GSShaderDlg::UpdateControls()
+{
+	SendMessage(GetDlgItem(m_hWnd, IDC_SATURATION_SLIDER), TBM_SETRANGE, TRUE, MAKELONG(0, 100));
+	SendMessage(GetDlgItem(m_hWnd, IDC_BRIGHTNESS_SLIDER), TBM_SETRANGE, TRUE, MAKELONG(0, 100));
+	SendMessage(GetDlgItem(m_hWnd, IDC_CONTRAST_SLIDER), TBM_SETRANGE, TRUE, MAKELONG(0, 100));
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_SATURATION_SLIDER), TBM_SETPOS, TRUE, saturation);
+	SendMessage(GetDlgItem(m_hWnd, IDC_BRIGHTNESS_SLIDER), TBM_SETPOS, TRUE, brightness);
+	SendMessage(GetDlgItem(m_hWnd, IDC_CONTRAST_SLIDER), TBM_SETPOS, TRUE, contrast);
+
+	// The values come straight from the configuration and are not clamped, so
+	// formatting them with sprintf into an 8 byte buffer could overflow it.
+	// SetDlgItemInt writes the signed decimal text without a caller-side buffer.
+	SetDlgItemInt(m_hWnd, IDC_SATURATION_TEXT, (UINT)saturation, TRUE);
+	SetDlgItemInt(m_hWnd, IDC_BRIGHTNESS_TEXT, (UINT)brightness, TRUE);
+	SetDlgItemInt(m_hWnd, IDC_CONTRAST_TEXT, (UINT)contrast, TRUE);
+
+	// Shader Settings
+	bool external_shader_selected = IsDlgButtonChecked(m_hWnd, IDC_SHADER_FX) == BST_CHECKED;
+	bool shadeboost_selected = IsDlgButtonChecked(m_hWnd, IDC_SHADEBOOST) == BST_CHECKED;
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SATURATION_SLIDER), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_BRIGHTNESS_SLIDER), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_CONTRAST_SLIDER), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SATURATION_TEXT), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_BRIGHTNESS_TEXT), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_CONTRAST_TEXT), shadeboost_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_TEXT), external_shader_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_EDIT), external_shader_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_BUTTON), external_shader_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_TEXT), external_shader_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_EDIT), external_shader_selected);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_BUTTON), external_shader_selected);
+}
+
+// Message handler of the shader configuration dialog.
+//
+// WM_HSCROLL - a slider moved: cache its position and update the matching label.
+// WM_COMMAND - IDOK stores every setting and closes the dialog, IDCANCEL closes
+//              it without storing, the check boxes refresh the controls and the
+//              two browse buttons open a file dialog.
+// WM_CLOSE   - closes the dialog like IDCANCEL.
+//
+// Returns true for the messages listed above and false for everything else.
+bool GSShaderDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
+{
+	switch(message)
+	{
+	case WM_HSCROLL:
+	{
+		// lParam is compared against the slider handles to find the one that moved.
+		// SetDlgItemInt replaces sprintf into a fixed 8 byte buffer.
+		HWND slider = (HWND)lParam;
+
+		if(slider == GetDlgItem(m_hWnd, IDC_SATURATION_SLIDER))
+		{
+			saturation = (int)SendMessage(slider, TBM_GETPOS, 0, 0);
+
+			SetDlgItemInt(m_hWnd, IDC_SATURATION_TEXT, (UINT)saturation, TRUE);
+		}
+		else if(slider == GetDlgItem(m_hWnd, IDC_BRIGHTNESS_SLIDER))
+		{
+			brightness = (int)SendMessage(slider, TBM_GETPOS, 0, 0);
+
+			SetDlgItemInt(m_hWnd, IDC_BRIGHTNESS_TEXT, (UINT)brightness, TRUE);
+		}
+		else if(slider == GetDlgItem(m_hWnd, IDC_CONTRAST_SLIDER))
+		{
+			contrast = (int)SendMessage(slider, TBM_GETPOS, 0, 0);
+
+			SetDlgItemInt(m_hWnd, IDC_CONTRAST_TEXT, (UINT)contrast, TRUE);
+		}
+	} break;
+
+	case WM_COMMAND:
+	{
+		int id = LOWORD(wParam);
+
+		switch(id)
+		{
+		case IDOK:
+		{
+			INT_PTR data;
+			//TV Shader
+			if (ComboBoxGetSelData(IDC_TVSHADER, data))
+			{
+				theApp.SetConfig("TVshader", (int)data);
+			}
+			// Shade Boost
+			theApp.SetConfig("ShadeBoost", (int)IsDlgButtonChecked(m_hWnd, IDC_SHADEBOOST));
+			theApp.SetConfig("ShadeBoost_Contrast", contrast);
+			theApp.SetConfig("ShadeBoost_Brightness", brightness);
+			theApp.SetConfig("ShadeBoost_Saturation", saturation);
+
+			// FXAA shader
+			theApp.SetConfig("Fxaa", (int)IsDlgButtonChecked(m_hWnd, IDC_FXAA));
+
+			// External FX Shader
+			theApp.SetConfig("shaderfx", (int)IsDlgButtonChecked(m_hWnd, IDC_SHADER_FX));
+
+			// External FX Shader(OpenGL)
+			// One buffer, sized for the longer of the two texts plus the
+			// terminator, is reused for both edit controls. The vector releases
+			// it on every exit path.
+			int shader_fx_length = (int)SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_EDIT), WM_GETTEXTLENGTH, 0, 0);
+			int shader_fx_conf_length = (int)SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_EDIT), WM_GETTEXTLENGTH, 0, 0);
+			int length = std::max(shader_fx_length, shader_fx_conf_length) + 1;
+			std::vector<char> buffer(length, '\0');
+
+			SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_EDIT), WM_GETTEXT, (WPARAM)length, (LPARAM)buffer.data());
+			theApp.SetConfig("shaderfx_glsl", buffer.data()); // Not really glsl only ;)
+			SendMessage(GetDlgItem(m_hWnd, IDC_SHADER_FX_CONF_EDIT), WM_GETTEXT, (WPARAM)length, (LPARAM)buffer.data());
+			theApp.SetConfig("shaderfx_conf", buffer.data());
+
+			EndDialog(m_hWnd, id);
+		} break;
+
+		// Both check boxes only change which controls are enabled. They share
+		// one handler; the former implicit fall-through from IDC_SHADEBOOST
+		// refreshed the controls twice per click.
+		case IDC_SHADEBOOST:
+		case IDC_SHADER_FX:
+			if (HIWORD(wParam) == BN_CLICKED)
+				UpdateControls();
+			break;
+
+		case IDC_SHADER_FX_BUTTON:
+			if (HIWORD(wParam) == BN_CLICKED)
+				OpenFileDialog(IDC_SHADER_FX_EDIT, "Select External Shader");
+			break;
+
+		case IDC_SHADER_FX_CONF_BUTTON:
+			if (HIWORD(wParam) == BN_CLICKED)
+				OpenFileDialog(IDC_SHADER_FX_CONF_EDIT, "Select External Shader Config");
+			break;
+
+		case IDCANCEL:
+		{
+			EndDialog(m_hWnd, IDCANCEL);
+		} break;
+		}
+
+	} break;
+
+	case WM_CLOSE:EndDialog(m_hWnd, IDCANCEL); break;
+
+	default: return false;
+	}
+
+
+	return true;
+}
+
+// Hacks Dialog
+
+// Constructs the hacks dialog (IDD_HACKS is passed on to GSDialog) and zeroes
+// both MSAA lookup tables.
+GSHacksDlg::GSHacksDlg()
+	: GSDialog(IDD_HACKS)
+{
+	// msaa2cb and cb2msaa are filled in pairs by OnInit(); start from all zeros.
+	memset(msaa2cb, 0, sizeof(msaa2cb));
+	memset(cb2msaa, 0, sizeof(cb2msaa));
+}
+
+// Populates the hacks dialog from the stored configuration.
+//
+// Reads the renderer currently selected in the parent dialog's IDC_RENDERER combo box,
+// fills the MSAA combo box (and the msaa2cb / cb2msaa lookup tables) accordingly, then
+// initialises every check box, combo box and spin control from theApp's config values.
+void GSHacksDlg::OnInit()
+{
+	HWND hwnd_renderer = GetDlgItem(GetParent(m_hWnd), IDC_RENDERER);
+	GSRendererType renderer = static_cast<GSRendererType>(SendMessage(hwnd_renderer, CB_GETITEMDATA, SendMessage(hwnd_renderer, CB_GETCURSEL, 0, 0), 0));
+	// It can only be accessed with a HW renderer, so this is sufficient.
+	bool dx9 = renderer == GSRendererType::DX9_HW;
+	// bool dx11 = renderer == GSRendererType::DX1011_HW;
+	bool ogl = renderer == GSRendererType::OGL_HW;
+	unsigned short cb = 0;
+
+	if(dx9)
+	{
+		// DX9: only list the sample counts the adapter reports a depth format for.
+		for(unsigned short i = 0; i < 17; i++)
+		{
+			if(i == 1) continue;
+
+			int depth = GSDevice9::GetMaxDepth(i, adapter_id);
+
+			if(depth)
+			{
+				char text[32] = {0};
+				sprintf(text, depth == 32 ? "%dx Z-32" : "%dx Z-24", i);
+				SendMessage(GetDlgItem(m_hWnd, IDC_MSAACB), CB_ADDSTRING, 0, (LPARAM)text);
+
+				msaa2cb[i] = cb;
+				cb2msaa[cb] = i;
+				cb++;
+			}
+		}
+	}
+	else
+	{
+		// Other renderers: offer 0x, 2x, 4x, 8x and 16x unconditionally.
+		for(unsigned short j = 0; j < 5; j++) // TODO: Make the same kind of check for d3d11, eventually....
+		{
+			unsigned short i = j == 0 ? 0 : 1 << j;
+
+			msaa2cb[i] = j;
+			cb2msaa[j] = i;
+
+			char text[32] = {0};
+			sprintf(text, "%dx ", i);
+
+			SendMessage(GetDlgItem(m_hWnd, IDC_MSAACB), CB_ADDSTRING, 0, (LPARAM)text);
+		}
+	}
+
+	// Clamp the stored value to the table's index range [0, 16] before indexing with it;
+	// the previous code only bounded it from above, so a negative config value read
+	// outside msaa2cb.
+	int msaa = theApp.GetConfig("UserHacks_MSAA", 0);
+	if(msaa < 0) msaa = 0;
+	if(msaa > 16) msaa = 16;
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_MSAACB), CB_SETCURSEL, msaa2cb[msaa], 0);
+
+	CheckDlgButton(m_hWnd, IDC_ALPHAHACK, theApp.GetConfig("UserHacks_AlphaHack", 0));
+	CheckDlgButton(m_hWnd, IDC_OFFSETHACK, theApp.GetConfig("UserHacks_HalfPixelOffset", 0));
+	CheckDlgButton(m_hWnd, IDC_WILDHACK, theApp.GetConfig("UserHacks_WildHack", 0));
+	CheckDlgButton(m_hWnd, IDC_ALPHASTENCIL, theApp.GetConfig("UserHacks_AlphaStencil", 0));
+	CheckDlgButton(m_hWnd, IDC_PRELOAD_GS, theApp.GetConfig("preload_frame_with_gs_data", 0));
+	CheckDlgButton(m_hWnd, IDC_ALIGN_SPRITE, theApp.GetConfig("UserHacks_align_sprite_X", 0));
+	CheckDlgButton(m_hWnd, IDC_SAFE_FBMASK, theApp.GetConfig("UserHacks_safe_fbmask", 0));
+
+
+	ComboBoxInit(IDC_ROUND_SPRITE, theApp.m_gs_hack, theApp.GetConfig("UserHacks_round_sprite_offset", 0));
+	ComboBoxInit(IDC_SPRITEHACK, theApp.m_gs_hack, theApp.GetConfig("UserHacks_SpriteHack", 0));
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_SKIPDRAWHACK), UDM_SETRANGE, 0, MAKELPARAM(1000, 0));
+	SendMessage(GetDlgItem(m_hWnd, IDC_SKIPDRAWHACK), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("UserHacks_SkipDraw", 0), 0));
+
+	// UserHacks_TCOffset packs X in the low 16 bits and Y in the high 16 bits.
+	SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETX), UDM_SETRANGE, 0, MAKELPARAM(10000, 0));
+	SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETX), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("UserHacks_TCOffset", 0) & 0xFFFF, 0));
+
+	SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETY), UDM_SETRANGE, 0, MAKELPARAM(10000, 0));
+	SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETY), UDM_SETPOS, 0, MAKELPARAM((theApp.GetConfig("UserHacks_TCOffset", 0) >> 16) & 0xFFFF, 0));
+
+	// Controls shown/enabled depending on whether the OpenGL renderer is selected.
+	ShowWindow(GetDlgItem(m_hWnd, IDC_ALPHASTENCIL), ogl ? SW_HIDE : SW_SHOW);
+	ShowWindow(GetDlgItem(m_hWnd, IDC_ALPHAHACK), ogl ? SW_HIDE : SW_SHOW);
+	ShowWindow(GetDlgItem(m_hWnd, IDC_SAFE_FBMASK), ogl ? SW_SHOW : SW_HIDE);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_MSAACB), !ogl);
+	EnableWindow(GetDlgItem(m_hWnd, IDC_MSAA_TEXT), !ogl);
+
+	AddTooltip(IDC_SKIPDRAWHACKEDIT);
+	AddTooltip(IDC_SKIPDRAWHACK);
+	AddTooltip(IDC_ALPHAHACK);
+	AddTooltip(IDC_OFFSETHACK);
+	AddTooltip(IDC_SPRITEHACK);
+	AddTooltip(IDC_WILDHACK);
+	AddTooltip(IDC_MSAACB);
+	AddTooltip(IDC_ALPHASTENCIL);
+	AddTooltip(IDC_ALIGN_SPRITE);
+	AddTooltip(IDC_ROUND_SPRITE);
+	AddTooltip(IDC_TCOFFSETX);
+	AddTooltip(IDC_TCOFFSETX2);
+	AddTooltip(IDC_TCOFFSETY);
+	AddTooltip(IDC_TCOFFSETY2);
+	AddTooltip(IDC_PRELOAD_GS);
+	AddTooltip(IDC_SAFE_FBMASK);
+}
+
+// No-op: this dialog currently does nothing when asked to refresh its controls.
+void GSHacksDlg::UpdateControls()
+{}
+
+// Dialog procedure for the hacks dialog.
+//
+// On IDOK every control's state is written back to theApp's configuration and the
+// dialog is closed with IDOK; WM_CLOSE closes it with IDCANCEL.
+// Returns true when the message was handled here, false for any other message.
+bool GSHacksDlg::OnMessage(UINT message, WPARAM wParam, LPARAM lParam)
+{
+	switch(message)
+	{
+	case WM_COMMAND:
+	{
+		int id = LOWORD(wParam);
+
+		switch(id)
+		{
+		case IDOK:
+		{
+			INT_PTR data;
+			if (ComboBoxGetSelData(IDC_ROUND_SPRITE, data))
+			{
+				theApp.SetConfig("UserHacks_round_sprite_offset", (int)data);
+			}
+			if (ComboBoxGetSelData(IDC_SPRITEHACK, data))
+			{
+				theApp.SetConfig("UserHacks_SpriteHack", (int)data);
+			}
+
+			// CB_GETCURSEL returns CB_ERR (-1) when nothing is selected (e.g. an empty
+			// list); only index cb2msaa with a selection that lies inside the table.
+			const int msaa_sel = (int)SendMessage(GetDlgItem(m_hWnd, IDC_MSAACB), CB_GETCURSEL, 0, 0);
+			if (msaa_sel >= 0 && msaa_sel < (int)(sizeof(cb2msaa) / sizeof(cb2msaa[0])))
+			{
+				theApp.SetConfig("UserHacks_MSAA", cb2msaa[msaa_sel]);
+			}
+
+			theApp.SetConfig("UserHacks_AlphaHack", (int)IsDlgButtonChecked(m_hWnd, IDC_ALPHAHACK));
+			theApp.SetConfig("UserHacks_HalfPixelOffset", (int)IsDlgButtonChecked(m_hWnd, IDC_OFFSETHACK));
+			// UDM_GETPOS reports the position in the low word and an error flag in the
+			// high word; keep only the position (as is already done for the TC offsets).
+			theApp.SetConfig("UserHacks_SkipDraw", (int)LOWORD(SendMessage(GetDlgItem(m_hWnd, IDC_SKIPDRAWHACK), UDM_GETPOS, 0, 0)));
+			theApp.SetConfig("UserHacks_WildHack", (int)IsDlgButtonChecked(m_hWnd, IDC_WILDHACK));
+			theApp.SetConfig("UserHacks_AlphaStencil", (int)IsDlgButtonChecked(m_hWnd, IDC_ALPHASTENCIL));
+			theApp.SetConfig("preload_frame_with_gs_data", (int)IsDlgButtonChecked(m_hWnd, IDC_PRELOAD_GS));
+			// Key spelling must match the one OnInit() reads ("UserHacks_", capital H).
+			theApp.SetConfig("UserHacks_align_sprite_X", (int)IsDlgButtonChecked(m_hWnd, IDC_ALIGN_SPRITE));
+			theApp.SetConfig("UserHacks_safe_fbmask", (int)IsDlgButtonChecked(m_hWnd, IDC_SAFE_FBMASK));
+
+			// Pack the texture-coordinate offsets: X in bits 0-15, Y in bits 16-31.
+			unsigned int TCOFFSET = SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETX), UDM_GETPOS, 0, 0) & 0xFFFF;
+			TCOFFSET |= (SendMessage(GetDlgItem(m_hWnd, IDC_TCOFFSETY), UDM_GETPOS, 0, 0) & 0xFFFF) << 16;
+
+			theApp.SetConfig("UserHacks_TCOffset", TCOFFSET);
+
+			EndDialog(m_hWnd, id);
+		} break;
+		}
+
+	} break;
+
+	case WM_CLOSE:EndDialog(m_hWnd, IDCANCEL); break;
+
+	default: return false;
+	}
+
+	return true;
+}
diff --git a/plugins/GSdx_legacy/GSSettingsDlg.h b/plugins/GSdx_legacy/GSSettingsDlg.h
new file mode 100644
index 0000000000..8bfcf3f373
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSettingsDlg.h
@@ -0,0 +1,96 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDialog.h"
+#include "GSSetting.h"
+
+// Dialog for the shader-related settings (declared here, implemented in GSSettingsDlg.cpp).
+class GSShaderDlg : public GSDialog
+{
+	// Presumably the current shade-boost slider values -- TODO confirm against the implementation.
+	int saturation;
+	int brightness;
+	int contrast;
+
+	void UpdateControls();
+
+protected:
+	// GSDialog hooks: dialog initialisation and message handling.
+	void OnInit();
+	bool OnMessage(UINT message, WPARAM wParam, LPARAM lParam);	
+
+public:
+	GSShaderDlg();
+};
+
+// Dialog exposing the "user hacks" settings.
+class GSHacksDlg : public GSDialog
+{
+	// Lookup tables between MSAA sample counts and IDC_MSAACB combo-box indices:
+	// cb2msaa[combo index] -> sample count, msaa2cb[sample count] -> combo index.
+	unsigned short cb2msaa[17];
+	unsigned short msaa2cb[17];
+	// Adapter passed to GSDevice9::GetMaxDepth() when the DX9 renderer is selected.
+	std::string adapter_id;
+	
+	bool isdx9;
+
+	void UpdateControls();
+
+protected:
+	// GSDialog hooks: dialog initialisation and message handling.
+	void OnInit();
+	bool OnMessage(UINT message, WPARAM wParam, LPARAM lParam);
+
+public:
+	GSHacksDlg();
+
+	// Ugh
+	// Stores the adapter id used by OnInit(); taken by const reference to avoid an extra copy.
+	void SetAdapter(const std::string& adapter_id_)
+	{
+		adapter_id = adapter_id_;
+	}
+};
+
+// Main GSdx settings dialog; owns the shader and hacks sub-dialogs.
+class GSSettingsDlg : public GSDialog
+{
+
+	// One selectable display adapter: display name, identifier and D3D feature level.
+	struct Adapter
+	{
+		std::string name;
+		std::string id;
+		D3D_FEATURE_LEVEL level;
+		Adapter(const std::string &n, const std::string &i, const D3D_FEATURE_LEVEL &l) : name(n), id(i), level(l) {}
+	};
+	
+	std::vector<Adapter> adapters;
+
+	// Qualified with std:: like `adapters` above, so the header does not rely on a using-directive.
+	std::vector<GSSetting> m_ocl_devs;
+	uint32 m_lastValidMsaa; // used to revert to previous dialog value if the user changed to invalid one, or lesser one and canceled
+
+	void UpdateRenderers();
+	void UpdateControls();
+
+protected:
+	void OnInit();
+	bool OnCommand(HWND hWnd, UINT id, UINT code);
+
+	// Shade Boost
+	GSShaderDlg ShaderDlg;
+	GSHacksDlg HacksDlg;
+
+public:
+	GSSettingsDlg();
+};
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.cpp
new file mode 100644
index 0000000000..37e253ee9f
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.cpp
@@ -0,0 +1,65 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+
+// Per-lane multiplier tables used by the generated setup code.
+// Entry 0 is the vector width broadcast to every lane (8 or 4); entry k (k >= 1)
+// holds the lane indices shifted down by k - 1, i.e. lane j contains j - (k - 1).
+// The 8-wide GSVector8 table is used when _M_SSE >= 0x501, the 4-wide GSVector4 one otherwise.
+#if _M_SSE >= 0x501
+
+const GSVector8 GSSetupPrimCodeGenerator::m_shift[9] =
+{
+	GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f),
+	GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
+	GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f),
+	GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f),
+	GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f),
+	GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f),
+	GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f),
+	GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f),
+	GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f),
+};
+
+#else
+
+const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
+{
+	GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
+	GSVector4(0.0f, 1.0f, 2.0f, 3.0f),
+	GSVector4(-1.0f, 0.0f, 1.0f, 2.0f),
+	GSVector4(-2.0f, -1.0f, 0.0f, 1.0f),
+	GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
+};
+
+#endif
+
+// Decodes the selector key, derives which setup stages are enabled, and immediately
+// emits the setup routine into the supplied code buffer.
+//   param   - pointer to the GSScanlineLocalData the generated code writes to
+//   key     - packed GSScanlineSelector value
+//   code    - destination buffer handed to GSCodeGenerator
+//   maxsize - size of that buffer
+GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
+	: GSCodeGenerator(code, maxsize)
+	, m_local(*(GSScanlineLocalData*)param)
+{
+	m_sel.key = key;
+
+	// Depth only needs the z buffer; everything else also requires a frame buffer.
+	const bool depth = m_sel.zb;
+	const bool fog = m_sel.fb && m_sel.fge;
+	const bool texture = m_sel.fb && m_sel.tfx != TFX_NONE;
+	const bool color = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc);
+
+	m_en.z = depth ? 1 : 0;
+	m_en.f = fog ? 1 : 0;
+	m_en.t = texture ? 1 : 0;
+	m_en.c = color ? 1 : 0;
+
+	Generate();
+}
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.h b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.h
new file mode 100644
index 0000000000..746d7996aa
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.h
@@ -0,0 +1,50 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSScanlineEnvironment.h"
+#include "GSFunctionMap.h"
+
+// Code generator that emits the per-primitive setup routine for one scanline selector.
+// The constructor decodes the selector and calls Generate(), which in turn calls
+// Depth(), Texture() and Color(); their bodies live in the per-architecture .cpp files.
+class GSSetupPrimCodeGenerator : public GSCodeGenerator
+{
+	// Declared private and never defined in the visible sources: assignment is not supported.
+	void operator = (const GSSetupPrimCodeGenerator&);
+
+	GSScanlineSelector m_sel;
+	// Destination of the values computed by the generated code.
+	GSScanlineLocalData& m_local;
+
+	// Stage enables derived from m_sel in the constructor: z = depth, f = fog, t = texture, c = color.
+	struct {uint32 z:1, f:1, t:1, c:1;} m_en;
+
+	void Generate();
+
+	void Depth();
+	void Texture();
+	void Color();
+
+public:
+	GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
+
+	// Per-lane multiplier table; 8-wide when _M_SSE >= 0x501, 4-wide otherwise.
+	#if _M_SSE >= 0x501
+	static const GSVector8 m_shift[9];
+	#else
+	static const GSVector4 m_shift[5];
+	#endif
+};
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.avx.cpp
new file mode 100644
index 0000000000..5fe710dad3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.avx.cpp
@@ -0,0 +1,366 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE == 0x500 && (defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+// Emits the whole setup routine (x64, AVX): prologue saving xmm6/xmm7, constant
+// loads, the Depth/Texture/Color stages, then the matching epilogue and ret.
+void GSSetupPrimCodeGenerator::Generate()
+{
+	// Bytes reserved on the stack: room for xmm6 and xmm7 plus 8 extra bytes.
+	const int frame = 8 + 2 * 16;
+
+	sub(rsp, frame);
+
+	vmovdqa(ptr[rsp + 0], xmm6);
+	vmovdqa(ptr[rsp + 16], xmm7);
+
+	// r8 = &m_local for every stage below.
+	mov(r8, (size_t)&m_local);
+
+	// The m_shift constants are only needed by stages that compute per-pixel deltas.
+	const bool depthDeltas = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS;
+	const bool colorDeltas = m_en.c && m_sel.iip;
+
+	if(depthDeltas || m_en.t || colorDeltas)
+	{
+		mov(rax, (size_t)&m_shift[0]);
+
+		// xmm3..xmm7 = m_shift[0..4]
+		for(int n = 0; n < 5; n++)
+		{
+			vmovaps(Xmm(3 + n), ptr[rax + n * 16]);
+		}
+	}
+
+	Depth();
+	Texture();
+	Color();
+
+	vmovdqa(xmm6, ptr[rsp + 0]);
+	vmovdqa(xmm7, ptr[rsp + 16]);
+
+	add(rsp, frame);
+
+	ret();
+}
+
+// Emits the code that fills the depth (z) and fog (f) setup values of m_local
+// (x64, AVX). Emits nothing when both m_en.z and m_en.f are clear.
+//
+// Registers in the emitted code: r8 = &m_local and xmm3..xmm7 = m_shift[0..4]
+// (both loaded by Generate()); xmm0..xmm2 and r9 are scratch.
+// rdx is read as the dscan vertex and rcx as vertices[0], per the inline
+// pseudo-code -- NOTE(review): the caller-side convention is not visible here, confirm.
+// The emitted instruction order matters; the inline comments give the C++ equivalent.
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 p = dscan.p;
+
+		vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// GSVector4 df = p.wwww();
+
+			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+
+			vmulps(xmm2, xmm1, xmm3);
+			vcvttps2dq(xmm2, xmm2);
+			vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
+
+			for(int i = 0; i < 4; i++)
+			{
+				// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+
+				vmulps(xmm2, xmm1, Xmm(4 + i));
+				vcvttps2dq(xmm2, xmm2);
+				vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+
+				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
+				vmovdqa(ptr[r8 + variableOffset], xmm2);
+			}
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 dz = p.zzzz();
+
+			vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			// m_local.d4.z = dz * 4.0f;
+
+			vmulps(xmm1, xmm0, xmm3);
+			vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
+
+			for(int i = 0; i < 4; i++)
+			{
+				// m_local.d[i].z = dz * m_shift[i];
+
+				vmulps(xmm1, xmm0, Xmm(4 + i));
+
+				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
+				vmovdqa(ptr[r8 + variableOffset], xmm1);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertices[0].p;
+
+		vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
+
+			vcvttps2dq(xmm1, xmm0);
+			vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 z = p.zzzz();
+
+			vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			if(m_sel.zoverflow)
+			{
+				// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+				mov(r9, (size_t)&GSVector4::m_half);
+
+				vbroadcastss(xmm1, ptr[r9]);
+				vmulps(xmm1, xmm0);
+				vcvttps2dq(xmm1, xmm1);
+				vpslld(xmm1, 1);
+
+				vcvttps2dq(xmm0, xmm0);
+				vpcmpeqd(xmm2, xmm2);
+				vpsrld(xmm2, 31);
+				vpand(xmm0, xmm2);
+
+				vpor(xmm0, xmm1);
+			}
+			else
+			{
+				// m_local.p.z = GSVector4i(z);
+
+				vcvttps2dq(xmm0, xmm0);
+			}
+
+			vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
+		}
+	}
+}
+
+// Emits the code that fills the texture-coordinate setup values of m_local (x64, AVX).
+// With m_sel.fst set, s and t are stored as integers; otherwise s, t and q are stored
+// as floats. Emits nothing when m_en.t is clear.
+// Uses r8 (= &m_local) and xmm3..xmm7 (= m_shift[0..4]) as loaded by Generate().
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	const bool integerCoords = m_sel.fst != 0;
+
+	// Byte distance between consecutive m_local.d[] entries.
+	const size_t stride = sizeof(GSScanlineLocalData::d[0]);
+
+	// Offsets of d[0].s / d[0].t / d[0].q, indexed by component number.
+	const size_t fieldOffset[3] =
+	{
+		offsetof(GSScanlineLocalData, d[0].s),
+		offsetof(GSScanlineLocalData, d[0].t),
+		offsetof(GSScanlineLocalData, d[0].q),
+	};
+
+	// GSVector4 t = dscan.t;
+
+	vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
+
+	// xmm1 = t * 4.0f
+
+	vmulps(xmm1, xmm0, xmm3);
+
+	if(integerCoords)
+	{
+		// m_local.d4.stq = GSVector4i(t * 4.0f);
+
+		vcvttps2dq(xmm1, xmm1);
+
+		vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
+	}
+	else
+	{
+		// m_local.d4.stq = t * 4.0f;
+
+		vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
+	}
+
+	// Component 0 = s, 1 = t, 2 = q (q only for float coordinates).
+	const int components = integerCoords ? 2 : 3;
+
+	for(int comp = 0; comp < components; comp++)
+	{
+		// xmm1 = selected component of t broadcast to all lanes
+
+		vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));
+
+		for(int n = 0; n < 4; n++)
+		{
+			// GSVector4 v = ds/dt/dq * m_shift[n];
+
+			vmulps(xmm2, xmm1, Xmm(4 + n));
+
+			const size_t dst = fieldOffset[comp] + (n * stride);
+
+			if(integerCoords)
+			{
+				// m_local.d[n].s/t = GSVector4i(v);
+
+				vcvttps2dq(xmm2, xmm2);
+
+				vmovdqa(ptr[r8 + dst], xmm2);
+			}
+			else
+			{
+				// m_local.d[n].s/t/q = v;
+
+				vmovaps(ptr[r8 + dst], xmm2);
+			}
+		}
+	}
+}
+
+// Emits the code that fills the colour setup values of m_local (x64, AVX).
+// Emits nothing when m_en.c is clear.
+//
+// With m_sel.iip set, per-pixel colour deltas are computed from the dscan vertex
+// (read through rdx); otherwise the colour of vertices[0] (read through rcx) is
+// stored once. r8 = &m_local and xmm3..xmm7 = m_shift[0..4] as loaded by Generate();
+// xmm3 is reused as scratch after its last use as the 4.0f multiplier.
+// The emitted instruction order matters; the inline comments give the C++ equivalent.
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector4 c = dscan.c;
+
+		vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
+
+		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+		vmulps(xmm1, xmm0, xmm3);
+		vcvttps2dq(xmm1, xmm1);
+		vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
+		vpackssdw(xmm1, xmm1);
+		vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
+
+		// xmm3 is not needed anymore
+
+		// GSVector4 dr = c.xxxx();
+		// GSVector4 db = c.zzzz();
+
+		vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		for(int i = 0; i < 4; i++)
+		{
+			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+
+			vmulps(xmm0, xmm2, Xmm(4 + i));
+			vcvttps2dq(xmm0, xmm0);
+			vpackssdw(xmm0, xmm0);
+
+			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+			vmulps(xmm1, xmm3, Xmm(4 + i));
+			vcvttps2dq(xmm1, xmm1);
+			vpackssdw(xmm1, xmm1);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			vpunpcklwd(xmm0, xmm1);
+
+			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
+			vmovdqa(ptr[r8 + variableOffset], xmm0);
+		}
+
+		// GSVector4 c = dscan.c;
+
+		vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+
+		// GSVector4 dg = c.yyyy();
+		// GSVector4 da = c.wwww();
+
+		vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < 4; i++)
+		{
+			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+			vmulps(xmm0, xmm2, Xmm(4 + i));
+			vcvttps2dq(xmm0, xmm0);
+			vpackssdw(xmm0, xmm0);
+
+			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+			vmulps(xmm1, xmm3, Xmm(4 + i));
+			vcvttps2dq(xmm1, xmm1);
+			vpackssdw(xmm1, xmm1);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			vpunpcklwd(xmm0, xmm1);
+
+			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
+			vmovdqa(ptr[r8 + variableOffset], xmm0);
+		}
+	}
+	else
+	{
+		// GSVector4i c = GSVector4i(vertices[0].c);
+
+		vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
+
+		// c = c.upl16(c.zwxy());
+
+		vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+		vpunpcklwd(xmm0, xmm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			vpsrlw(xmm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
+		vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
+	}
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.cpp
new file mode 100644
index 0000000000..6456ead387
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x64.cpp
@@ -0,0 +1,380 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+// Emits the whole setup routine (x64, SSE build: _M_SSE < 0x500): prologue saving
+// xmm6/xmm7, constant loads, the Depth/Texture/Color stages, then the matching
+// epilogue and ret.
+//
+// Two defects of the previous version are fixed here:
+//  - xmm6/xmm7 were saved and restored with the AVX-encoded vmovdqa although this file
+//    is only built when AVX is not targeted; the SSE2 movdqa is used instead, matching
+//    every other instruction emitted by this file.
+//  - rax was never loaded before being used as the base address of the m_shift
+//    table, so the constants were read from an arbitrary address.
+void GSSetupPrimCodeGenerator::Generate()
+{
+	sub(rsp, 8 + 2 * 16);
+
+	movdqa(ptr[rsp + 0], xmm6);
+	movdqa(ptr[rsp + 16], xmm7);
+
+	// r8 = &m_local for every stage below.
+	mov(r8, (size_t)&m_local);
+
+	if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
+	{
+		// rax = &m_shift[0]; xmm3..xmm7 = m_shift[0..4]
+		mov(rax, (size_t)&m_shift[0]);
+
+		for(int i = 0; i < 5; i++)
+		{
+			movaps(Xmm(3 + i), ptr[rax + i * 16]);
+		}
+	}
+
+	Depth();
+
+	Texture();
+
+	Color();
+
+	movdqa(xmm6, ptr[rsp + 0]);
+	movdqa(xmm7, ptr[rsp + 16]);
+
+	add(rsp, 8 + 2 * 16);
+
+	ret();
+}
+
+// Emits the code that fills the depth (z) and fog (f) setup values of m_local
+// (x64, SSE). Emits nothing when both m_en.z and m_en.f are clear.
+//
+// Registers in the emitted code: r8 = &m_local (loaded by Generate()); xmm3 is
+// used as the 4.0f multiplier and xmm4..xmm7 as m_shift[1..4]; xmm0..xmm2 and r9
+// are scratch. rdx is read as the dscan vertex and rcx as vertices[0], per the
+// inline pseudo-code -- NOTE(review): the caller-side convention is not visible here, confirm.
+// The emitted instruction order matters; the inline comments give the C++ equivalent.
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 p = dscan.p;
+
+		movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// GSVector4 df = p.wwww();
+
+			movaps(xmm1, xmm0);
+			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+
+			movaps(xmm2, xmm1);
+			mulps(xmm2, xmm3);
+			cvttps2dq(xmm2, xmm2);
+			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
+
+			for(int i = 0; i < 4; i++)
+			{
+				// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+
+				movaps(xmm2, xmm1);
+				mulps(xmm2, Xmm(4 + i));
+				cvttps2dq(xmm2, xmm2);
+				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+
+				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
+				movdqa(ptr[r8 + variableOffset], xmm2);
+			}
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 dz = p.zzzz();
+
+			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			// m_local.d4.z = dz * 4.0f;
+
+			movaps(xmm1, xmm0);
+			mulps(xmm1, xmm3);
+			movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
+
+			for(int i = 0; i < 4; i++)
+			{
+				// m_local.d[i].z = dz * m_shift[i];
+
+				movaps(xmm1, xmm0);
+				mulps(xmm1, Xmm(4 + i));
+
+				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
+				movdqa(ptr[r8 + variableOffset], xmm1);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertices[0].p;
+
+		movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
+
+			cvttps2dq(xmm1, xmm0);
+			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 z = p.zzzz();
+
+			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			if(m_sel.zoverflow)
+			{
+				// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
+
+				mov(r9, (size_t)&GSVector4::m_half);
+
+				movss(xmm1, ptr[r9]);
+				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
+				mulps(xmm1, xmm0);
+				cvttps2dq(xmm1, xmm1);
+				pslld(xmm1, 1);
+
+				cvttps2dq(xmm0, xmm0);
+				pcmpeqd(xmm2, xmm2);
+				psrld(xmm2, 31);
+				pand(xmm0, xmm2);
+
+				por(xmm0, xmm1);
+			}
+			else
+			{
+				// m_local.p.z = GSVector4i(z);
+
+				cvttps2dq(xmm0, xmm0);
+			}
+
+			movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
+		}
+	}
+}
+
+// Emits the code that fills the texture-coordinate setup values of m_local (x64, SSE).
+// With m_sel.fst set, s and t are stored as integers; otherwise s, t and q are stored
+// as floats. Emits nothing when m_en.t is clear.
+// Uses r8 (= &m_local, loaded by Generate()), xmm3 as the 4.0f multiplier and
+// xmm4..xmm7 as m_shift[1..4].
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	const bool integerCoords = m_sel.fst != 0;
+
+	// Byte distance between consecutive m_local.d[] entries.
+	const size_t stride = sizeof(GSScanlineLocalData::d[0]);
+
+	// Offsets of d[0].s / d[0].t / d[0].q, indexed by component number.
+	const size_t fieldOffset[3] =
+	{
+		offsetof(GSScanlineLocalData, d[0].s),
+		offsetof(GSScanlineLocalData, d[0].t),
+		offsetof(GSScanlineLocalData, d[0].q),
+	};
+
+	// GSVector4 t = dscan.t;
+
+	movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
+
+	// xmm1 = t * 4.0f
+
+	movaps(xmm1, xmm0);
+	mulps(xmm1, xmm3);
+
+	if(integerCoords)
+	{
+		// m_local.d4.stq = GSVector4i(t * 4.0f);
+
+		cvttps2dq(xmm1, xmm1);
+
+		movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
+	}
+	else
+	{
+		// m_local.d4.stq = t * 4.0f;
+
+		movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
+	}
+
+	// Component 0 = s, 1 = t, 2 = q (q only for float coordinates).
+	const int components = integerCoords ? 2 : 3;
+
+	for(int comp = 0; comp < components; comp++)
+	{
+		// xmm1 = selected component of t broadcast to all lanes
+
+		movaps(xmm1, xmm0);
+		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));
+
+		for(int n = 0; n < 4; n++)
+		{
+			// GSVector4 v = ds/dt/dq * m_shift[n];
+
+			movaps(xmm2, xmm1);
+			mulps(xmm2, Xmm(4 + n));
+
+			const size_t dst = fieldOffset[comp] + (n * stride);
+
+			if(integerCoords)
+			{
+				// m_local.d[n].s/t = GSVector4i(v);
+
+				cvttps2dq(xmm2, xmm2);
+
+				movdqa(ptr[r8 + dst], xmm2);
+			}
+			else
+			{
+				// m_local.d[n].s/t/q = v;
+
+				movaps(ptr[r8 + dst], xmm2);
+			}
+		}
+	}
+}
+
+// Emits the code that fills the colour setup values of m_local (x64, SSE).
+// Emits nothing when m_en.c is clear.
+//
+// With m_sel.iip set, per-pixel colour deltas are computed from the dscan vertex
+// (read through rdx); otherwise the colour of vertices[0] (read through rcx) is
+// stored once. r8 = &m_local (loaded by Generate()); xmm3 is first used as the 4.0f
+// multiplier and then reused as scratch; xmm4..xmm7 are used as m_shift[1..4].
+// The emitted instruction order matters; the inline comments give the C++ equivalent.
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector4 c = dscan.c;
+
+		movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
+		movaps(xmm1, xmm0);
+
+		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+		movaps(xmm2, xmm0);
+		mulps(xmm2, xmm3);
+		cvttps2dq(xmm2, xmm2);
+		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
+		packssdw(xmm2, xmm2);
+		movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
+
+		// xmm3 is not needed anymore
+
+		// GSVector4 dr = c.xxxx();
+		// GSVector4 db = c.zzzz();
+
+		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+
+		for(int i = 0; i < 4; i++)
+		{
+			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+
+			movaps(xmm2, xmm0);
+			mulps(xmm2, Xmm(4 + i));
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm2, xmm2);
+
+			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+			movaps(xmm3, xmm1);
+			mulps(xmm3, Xmm(4 + i));
+			cvttps2dq(xmm3, xmm3);
+			packssdw(xmm3, xmm3);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			punpcklwd(xmm2, xmm3);
+
+			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
+			movdqa(ptr[r8 + variableOffset], xmm2);
+		}
+
+		// GSVector4 c = dscan.c;
+
+		movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+		movaps(xmm1, xmm0);
+
+		// GSVector4 dg = c.yyyy();
+		// GSVector4 da = c.wwww();
+
+		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < 4; i++)
+		{
+			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+			movaps(xmm2, xmm0);
+			mulps(xmm2, Xmm(4 + i));
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm2, xmm2);
+
+			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+			movaps(xmm3, xmm1);
+			mulps(xmm3, Xmm(4 + i));
+			cvttps2dq(xmm3, xmm3);
+			packssdw(xmm3, xmm3);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			punpcklwd(xmm2, xmm3);
+
+			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
+			movdqa(ptr[r8 + variableOffset], xmm2);
+		}
+	}
+	else
+	{
+		// GSVector4i c = GSVector4i(vertices[0].c);
+
+		cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
+
+		// c = c.upl16(c.zwxy());
+
+		pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+		punpcklwd(xmm0, xmm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			psrlw(xmm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
+		movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
+	}
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx.cpp
new file mode 100644
index 0000000000..21a7d47c97
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx.cpp
@@ -0,0 +1,342 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+static const int _args = 0;
+static const int _vertex = _args + 4;
+static const int _index = _args + 8;
+static const int _dscan = _args + 12;
+
+// Emits the whole setup-prim routine (x86 AVX variant): an optional prologue,
+// then the depth, texture and color sections, then the final ret.
+void GSSetupPrimCodeGenerator::Generate()
+{
+	// The prologue is only needed when one of the sections reads dscan through edx.
+	const bool depth_reads_dscan = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS;
+	const bool color_reads_dscan = m_en.c && m_sel.iip;
+
+	if(depth_reads_dscan || m_en.t || color_reads_dscan)
+	{
+		// edx = dscan argument taken from the stack
+		mov(edx, dword[esp + _dscan]);
+
+		// xmm3, xmm4, ... = m_shift[0], m_shift[1], ...; only two are loaded with notest
+		const int shift_count = m_sel.notest ? 2 : 5;
+
+		for(int n = 0; n < shift_count; n++)
+		{
+			vmovaps(Xmm(3 + n), ptr[&m_shift[n]]);
+		}
+	}
+
+	Depth();
+	Texture();
+	Color();
+
+	ret();
+}
+
+// Emits the depth (z) / fog (f) setup code of the x86 AVX variant.
+// Nothing is emitted when neither m_en.z nor m_en.f is set.
+// - Non-sprite primitives: reads dscan.p through edx and stores the per-step z/f
+//   values into m_local.d4 and m_local.d[i].
+// - Sprites: computes the address of vertex[index[1]] in ecx and stores its f/z
+//   values into m_local.p.
+// Register use: xmm3 holds m_shift[0] and Xmm(4 + i) holds m_shift[1 + i]; both are
+// loaded by Generate().
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 p = dscan.p;
+
+		vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// GSVector4 df = p.wwww();
+
+			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+			// (the multiplier comes from xmm3, i.e. m_shift[0])
+
+			vmulps(xmm2, xmm1, xmm3);
+			vcvttps2dq(xmm2, xmm2);
+			vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			vmovdqa(ptr[&m_local.d4.f], xmm2);
+
+			// only one step is generated when notest is set
+			for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+			{
+				// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+				// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+				vmulps(xmm2, xmm1, Xmm(4 + i));
+				vcvttps2dq(xmm2, xmm2);
+				vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				vmovdqa(ptr[&m_local.d[i].f], xmm2);
+			}
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 dz = p.zzzz();
+			// (xmm0 is overwritten here, which is why the fog part above runs first)
+
+			vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			// m_local.d4.z = dz * 4.0f;
+
+			vmulps(xmm1, xmm0, xmm3);
+			vmovdqa(ptr[&m_local.d4.z], xmm1);
+
+			for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+			{
+				// m_local.d[i].z = dz * m_shift[i];
+
+				vmulps(xmm1, xmm0, Xmm(4 + i));
+				vmovdqa(ptr[&m_local.d[i].z], xmm1);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertex[index[1]].p;
+		// ecx keeps the vertex address; Color() reuses it for flat-shaded sprites
+
+		mov(ecx, ptr[esp + _index]);
+		mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
+		shl(ecx, 6); // * sizeof(GSVertexSW)
+		add(ecx, ptr[esp + _vertex]);
+
+		vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
+
+			vcvttps2dq(xmm1, xmm0);
+			vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			vmovdqa(ptr[&m_local.p.f], xmm1);
+		}
+
+		if(m_en.z)
+		{
+			// uint32 z is bypassed in t.w
+
+			vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
+			vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+			vmovdqa(ptr[&m_local.p.z], xmm0);
+		}
+	}
+}
+
+// Emits the texture coordinate setup code of the x86 AVX variant.
+// Nothing is emitted when m_en.t is not set. Reads dscan.t through edx and stores the
+// per-step values into m_local.d4.stq and m_local.d[i].s/t(/q). With m_sel.fst the
+// values are converted to integers and only s and t are written; otherwise s, t and q
+// are written as floats.
+// Register use: xmm3 holds m_shift[0] and Xmm(4 + i) holds m_shift[1 + i]; both are
+// loaded by Generate().
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	// GSVector4 t = dscan.t;
+
+	vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
+
+	vmulps(xmm1, xmm0, xmm3);
+
+	if(m_sel.fst)
+	{
+		// m_local.d4.stq = GSVector4i(t * 4.0f);
+
+		vcvttps2dq(xmm1, xmm1);
+
+		vmovdqa(ptr[&m_local.d4.stq], xmm1);
+	}
+	else
+	{
+		// m_local.d4.stq = t * 4.0f;
+
+		vmovaps(ptr[&m_local.d4.stq], xmm1);
+	}
+
+	// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
+	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+	{
+		// GSVector4 ds = t.xxxx();
+		// GSVector4 dt = t.yyyy();
+		// GSVector4 dq = t.zzzz();
+
+		vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4 v = ds/dt * m_shift[i];
+			// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+			vmulps(xmm2, xmm1, Xmm(4 + i));
+
+			if(m_sel.fst)
+			{
+				// m_local.d[i].s/t = GSVector4i(v);
+
+				vcvttps2dq(xmm2, xmm2);
+
+				switch(j)
+				{
+				case 0: vmovdqa(ptr[&m_local.d[i].s], xmm2); break;
+				case 1: vmovdqa(ptr[&m_local.d[i].t], xmm2); break;
+				}
+			}
+			else
+			{
+				// m_local.d[i].s/t/q = v;
+
+				switch(j)
+				{
+				case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
+				case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
+				case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
+				}
+			}
+		}
+	}
+}
+
+// Emits the color setup code of the x86 AVX variant.
+// Nothing is emitted when m_en.c is not set.
+// - m_sel.iip set: reads dscan.c through edx and stores the packed per-step color
+//   values into m_local.d4.c and m_local.d[i].rb / m_local.d[i].ga.
+// - m_sel.iip clear: reads the color of the last vertex of the primitive and stores
+//   the constant color into m_local.c.rb / m_local.c.ga.
+// Register use: xmm3 holds m_shift[0] on entry and Xmm(4 + i) holds m_shift[1 + i];
+// both are loaded by Generate().
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector4 c = dscan.c;
+
+		vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
+
+		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+		vmulps(xmm1, xmm0, xmm3);
+		vcvttps2dq(xmm1, xmm1);
+		vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
+		vpackssdw(xmm1, xmm1);
+		vmovdqa(ptr[&m_local.d4.c], xmm1);
+
+		// xmm3 is not needed anymore
+		// (it is reused below for the blue / alpha deltas)
+
+		// GSVector4 dr = c.xxxx();
+		// GSVector4 db = c.zzzz();
+
+		vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+			// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+			vmulps(xmm0, xmm2, Xmm(4 + i));
+			vcvttps2dq(xmm0, xmm0);
+			vpackssdw(xmm0, xmm0);
+
+			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+			vmulps(xmm1, xmm3, Xmm(4 + i));
+			vcvttps2dq(xmm1, xmm1);
+			vpackssdw(xmm1, xmm1);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			vpunpcklwd(xmm0, xmm1);
+			vmovdqa(ptr[&m_local.d[i].rb], xmm0);
+		}
+
+		// GSVector4 c = dscan.c;
+
+		vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+
+		// GSVector4 dg = c.yyyy();
+		// GSVector4 da = c.wwww();
+
+		vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+			vmulps(xmm0, xmm2, Xmm(4 + i));
+			vcvttps2dq(xmm0, xmm0);
+			vpackssdw(xmm0, xmm0);
+
+			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+			vmulps(xmm1, xmm3, Xmm(4 + i));
+			vcvttps2dq(xmm1, xmm1);
+			vpackssdw(xmm1, xmm1);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			vpunpcklwd(xmm0, xmm1);
+			vmovdqa(ptr[&m_local.d[i].ga], xmm0);
+		}
+	}
+	else
+	{
+		// GSVector4i c = GSVector4i(vertex[index[last]].c);
+
+		// position of the last vertex inside the index list, per primitive class
+		int last = 0;
+
+		switch(m_sel.prim)
+		{
+		case GS_POINT_CLASS: last = 0; break;
+		case GS_LINE_CLASS: last = 1; break;
+		case GS_TRIANGLE_CLASS: last = 2; break;
+		case GS_SPRITE_CLASS: last = 1; break;
+		}
+
+		if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
+		{
+			mov(ecx, ptr[esp + _index]);
+			mov(ecx, ptr[ecx + sizeof(uint32) * last]);
+			shl(ecx, 6); // * sizeof(GSVertexSW)
+			add(ecx, ptr[esp + _vertex]);
+		}
+
+		vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
+
+		// c = c.upl16(c.zwxy());
+
+		vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+		vpunpcklwd(xmm0, xmm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			vpsrlw(xmm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		vmovdqa(ptr[&m_local.c.rb], xmm1);
+		vmovdqa(ptr[&m_local.c.ga], xmm2);
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx2.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx2.cpp
new file mode 100644
index 0000000000..172d053a5a
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.avx2.cpp
@@ -0,0 +1,353 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+static const int _args = 0;
+static const int _vertex = _args + 4;
+static const int _index = _args + 8;
+static const int _dscan = _args + 12;
+
+// Emits the whole setup-prim routine (x86 AVX2 variant): an optional prologue,
+// then the depth, texture and color sections, then the final ret.
+void GSSetupPrimCodeGenerator::Generate()
+{
+	// The prologue is only needed when one of the sections reads dscan through edx.
+	const bool depth_reads_dscan = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS;
+	const bool color_reads_dscan = m_en.c && m_sel.iip;
+
+	if(depth_reads_dscan || m_en.t || color_reads_dscan)
+	{
+		// edx = dscan argument taken from the stack
+		mov(edx, dword[esp + _dscan]);
+
+		// ymm3, ymm4, ... = m_shift[0], m_shift[1], ...; only two are loaded with notest
+		const int shift_count = m_sel.notest ? 2 : 5;
+
+		for(int n = 0; n < shift_count; n++)
+		{
+			vmovaps(Ymm(3 + n), ptr[&m_shift[n]]);
+		}
+	}
+
+	Depth();
+	Texture();
+	Color();
+
+	ret();
+}
+
+// Emits the depth (z) / fog (f) setup code of the x86 AVX2 variant.
+// Nothing is emitted when neither m_en.z nor m_en.f is set.
+// - Non-sprite primitives: reads dscan.p through edx and stores the scalar step
+//   values into m_local.d8.p.z / m_local.d8.p.f and the per-step vectors into
+//   m_local.d[i].z / m_local.d[i].f.
+// - Sprites: computes the address of vertex[index[1]] in ecx and stores its f/z
+//   values into m_local.p.
+// Register use: ymm3 holds m_shift[0] and Ymm(4 + i) holds m_shift[1 + i] for
+// i < 4 (loaded by Generate()); later entries are read from memory.
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
+
+		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
+
+		vmulps(ymm1, ymm0, ymm3);
+
+		if(m_en.z)
+		{
+			// m_local.d8.p.z = dp8.extract32<2>();
+
+			vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
+		}
+		
+		if(m_en.f)
+		{
+			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
+			// NOTE(review): this uses vcvtps2dq while every other float->int conversion
+			// in this file uses the truncating vcvttps2dq - confirm this is intended.
+
+			vcvtps2dq(ymm2, ymm1);
+			vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
+		}
+
+		// ymm1 / ymm2 are free again from here on and receive the broadcast deltas
+
+		if(m_en.z)
+		{
+			// GSVector8 dz = GSVector8(dscan.p).zzzz();
+
+			vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+		}
+
+		if(m_en.f)
+		{
+			// GSVector8 df = GSVector8(dscan.p).wwww();
+
+			vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
+		}
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			if(m_en.z)
+			{
+				// m_local.d[i].z = dz * shift[1 + i];
+				// (the first four shift vectors are in registers, the rest come from memory)
+
+				if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+				else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+				vmovaps(ptr[&m_local.d[i].z], ymm0);
+			}
+
+			if(m_en.f)
+			{
+				// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
+
+				if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i));
+				else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]);
+				vcvttps2dq(ymm0, ymm0);
+				vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+				vmovdqa(ptr[&m_local.d[i].f], ymm0);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertex[index[1]].p;
+		// ecx keeps the vertex address; Color() reuses it for flat-shaded sprites
+
+		mov(ecx, ptr[esp + _index]);
+		mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
+		shl(ecx, 6); // * sizeof(GSVertexSW)
+		add(ecx, ptr[esp + _vertex]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
+
+			vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
+			vcvttps2dq(xmm0, xmm0);
+			vpextrd(ptr[&m_local.p.f], xmm0, 3);
+		}
+
+		if(m_en.z)
+		{
+			// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
+
+			mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
+			mov(ptr[&m_local.p.z], eax);
+		}
+	}
+}
+
+// Emits the texture coordinate setup code of the x86 AVX2 variant.
+// Nothing is emitted when m_en.t is not set. Reads dscan.t through edx and stores the
+// step values into m_local.d8.stq and m_local.d[i].s/t(/q). With m_sel.fst the values
+// are converted to integers and only s and t are written; otherwise s, t and q are
+// written as floats.
+// Register use: ymm3 holds m_shift[0] and Ymm(4 + i) holds m_shift[1 + i] for
+// i < 4 (loaded by Generate()); later entries are read from memory.
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	// GSVector8 dt(dscan.t);
+
+	vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
+
+	// GSVector8 dt8 = dt * shift[0];
+
+	vmulps(ymm1, ymm0, ymm3);
+
+	if(m_sel.fst)
+	{
+		// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
+		// (only the low 128 bits, xmm1, are stored)
+
+		vcvttps2dq(ymm1, ymm1);
+
+		vmovdqa(ptr[&m_local.d8.stq], xmm1);
+	}
+	else
+	{
+		// m_local.d8.stq = dt8;
+		// (only the low 128 bits, xmm1, are stored)
+
+		vmovaps(ptr[&m_local.d8.stq], xmm1);
+	}
+
+	// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
+	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+	{
+		// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
+
+		vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8 v = dstq * shift[1 + i];
+			// (the first four shift vectors are in registers, the rest come from memory)
+
+			if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i));
+			else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]);
+
+			if(m_sel.fst)
+			{
+				// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
+
+				vcvttps2dq(ymm2, ymm2);
+
+				switch(j)
+				{
+				case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
+				case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
+				}
+			}
+			else
+			{
+				// m_local.d[i].s/t/q = v;
+
+				switch(j)
+				{
+				case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
+				case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
+				case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
+				}
+			}
+		}
+	}
+}
+
+// Emits the color setup code of the x86 AVX2 variant.
+// Nothing is emitted when m_en.c is not set.
+// - m_sel.iip set: reads dscan.c through edx and stores the packed step colors into
+//   m_local.d8.c and m_local.d[i].rb / m_local.d[i].ga.
+// - m_sel.iip clear: reads the color of the last vertex of the primitive and stores
+//   the constant color into m_local.c.rb / m_local.c.ga.
+// Register use: ymm3 holds m_shift[0] on entry and Ymm(4 + i) holds m_shift[1 + i]
+// for i < 4 (loaded by Generate()); later entries are read from memory.
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector8 dc(dscan.c);
+
+		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
+
+		// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
+		// (only the low 64 bits are stored)
+
+		vmulps(ymm1, ymm0, ymm3);
+		vcvttps2dq(ymm1, ymm1);
+		vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
+		vpackssdw(ymm1, ymm1);
+		vmovq(ptr[&m_local.d8.c], xmm1);
+
+		// ymm3 is not needed anymore
+		// (it is reused below for the blue / alpha deltas)
+
+		// GSVector8 dr = dc.xxxx();
+		// GSVector8 db = dc.zzzz();
+
+		vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
+			// (the first four shift vectors are in registers, the rest come from memory)
+
+			if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+			else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm0, ymm0);
+			vpackssdw(ymm0, ymm0);
+
+			// GSVector8i b = GSVector8i(db * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
+			else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm1, ymm1);
+			vpackssdw(ymm1, ymm1);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			vpunpcklwd(ymm0, ymm1);
+			vmovdqa(ptr[&m_local.d[i].rb], ymm0);
+		}
+
+		// GSVector8 dc(dscan.c);
+
+		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+
+		// GSVector8 dg = dc.yyyy();
+		// GSVector8 da = dc.wwww();
+
+		vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
+		vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+			else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm0, ymm0);
+			vpackssdw(ymm0, ymm0);
+
+			// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
+			else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm1, ymm1);
+			vpackssdw(ymm1, ymm1);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			vpunpcklwd(ymm0, ymm1);
+			vmovdqa(ptr[&m_local.d[i].ga], ymm0);
+		}
+	}
+	else
+	{
+		// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
+
+		// position of the last vertex inside the index list, per primitive class
+		int last = 0;
+
+		switch(m_sel.prim)
+		{
+		case GS_POINT_CLASS: last = 0; break;
+		case GS_LINE_CLASS: last = 1; break;
+		case GS_TRIANGLE_CLASS: last = 2; break;
+		case GS_SPRITE_CLASS: last = 1; break;
+		}
+
+		if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
+		{
+			mov(ecx, ptr[esp + _index]);
+			mov(ecx, ptr[ecx + sizeof(uint32) * last]);
+			shl(ecx, 6); // * sizeof(GSVertexSW)
+			add(ecx, ptr[esp + _vertex]);
+		}
+
+		vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
+		vcvttps2dq(ymm0, ymm0);
+
+		// c = c.upl16(c.zwxy());
+
+		vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
+		vpunpcklwd(ymm0, ymm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			vpsrlw(ymm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		vmovdqa(ptr[&m_local.c.rb], ymm1);
+		vmovdqa(ptr[&m_local.c.ga], ymm2);
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.cpp
new file mode 100644
index 0000000000..008a12a8f5
--- /dev/null
+++ b/plugins/GSdx_legacy/GSSetupPrimCodeGenerator.x86.cpp
@@ -0,0 +1,357 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+static const int _args = 0;
+static const int _vertex = _args + 4;
+static const int _index = _args + 8;
+static const int _dscan = _args + 12;
+
+// Emits the whole setup-prim routine (x86 SSE variant): an optional prologue,
+// then the depth, texture and color sections, then the final ret.
+void GSSetupPrimCodeGenerator::Generate()
+{
+	// The prologue is only needed when one of the sections reads dscan through edx.
+	const bool depth_reads_dscan = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS;
+	const bool color_reads_dscan = m_en.c && m_sel.iip;
+
+	if(depth_reads_dscan || m_en.t || color_reads_dscan)
+	{
+		// edx = dscan argument taken from the stack
+		mov(edx, dword[esp + _dscan]);
+
+		// xmm3, xmm4, ... = m_shift[0], m_shift[1], ...; only two are loaded with notest
+		const int shift_count = m_sel.notest ? 2 : 5;
+
+		for(int n = 0; n < shift_count; n++)
+		{
+			movaps(Xmm(3 + n), ptr[&m_shift[n]]);
+		}
+	}
+
+	Depth();
+	Texture();
+	Color();
+
+	ret();
+}
+
+// Emits the depth (z) / fog (f) setup code of the x86 SSE variant.
+// Nothing is emitted when neither m_en.z nor m_en.f is set.
+// - Non-sprite primitives: reads dscan.p through edx and stores the per-step z/f
+//   values into m_local.d4 and m_local.d[i].
+// - Sprites: computes the address of vertex[index[1]] in ecx and stores its f/z
+//   values into m_local.p.
+// Register use: xmm3 holds m_shift[0] and Xmm(4 + i) holds m_shift[1 + i]; both are
+// loaded by Generate(). The SSE forms overwrite their destination, hence the movaps
+// copies before each mulps / shufps.
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 p = dscan.p;
+
+		movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// GSVector4 df = p.wwww();
+
+			movaps(xmm1, xmm0);
+			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+			// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
+			// (the multiplier comes from xmm3, i.e. m_shift[0])
+
+			movaps(xmm2, xmm1);
+			mulps(xmm2, xmm3);
+			cvttps2dq(xmm2, xmm2);
+			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+			movdqa(ptr[&m_local.d4.f], xmm2);
+
+			// only one step is generated when notest is set
+			for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+			{
+				// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
+				// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+				movaps(xmm2, xmm1);
+				mulps(xmm2, Xmm(4 + i));
+				cvttps2dq(xmm2, xmm2);
+				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
+				movdqa(ptr[&m_local.d[i].f], xmm2);
+			}
+		}
+
+		if(m_en.z)
+		{
+			// GSVector4 dz = p.zzzz();
+			// (xmm0 is overwritten here, which is why the fog part above runs first)
+
+			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+			// m_local.d4.z = dz * 4.0f;
+
+			movaps(xmm1, xmm0);
+			mulps(xmm1, xmm3);
+			movdqa(ptr[&m_local.d4.z], xmm1);
+
+			for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+			{
+				// m_local.d[i].z = dz * m_shift[i];
+
+				movaps(xmm1, xmm0);
+				mulps(xmm1, Xmm(4 + i));
+				movdqa(ptr[&m_local.d[i].z], xmm1);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertex[index[1]].p;
+		// ecx keeps the vertex address; Color() reuses it for flat-shaded sprites
+
+		mov(ecx, ptr[esp + _index]);
+		mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
+		shl(ecx, 6); // * sizeof(GSVertexSW)
+		add(ecx, ptr[esp + _vertex]);
+
+		movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
+
+			cvttps2dq(xmm1, xmm0);
+			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+			movdqa(ptr[&m_local.p.f], xmm1);
+		}
+
+		if(m_en.z)
+		{
+			// uint32 z is bypassed in t.w
+
+			movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
+			pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+			movdqa(ptr[&m_local.p.z], xmm0);
+		}
+	}
+}
+
+// Emits the texture coordinate setup code of the x86 SSE variant.
+// Nothing is emitted when m_en.t is not set. Reads dscan.t through edx and stores the
+// per-step values into m_local.d4.stq and m_local.d[i].s/t(/q). With m_sel.fst the
+// values are converted to integers and only s and t are written; otherwise s, t and q
+// are written as floats.
+// Register use: xmm3 holds m_shift[0] and Xmm(4 + i) holds m_shift[1 + i]; both are
+// loaded by Generate().
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	// GSVector4 t = dscan.t;
+
+	movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
+
+	movaps(xmm1, xmm0);
+	mulps(xmm1, xmm3);
+
+	if(m_sel.fst)
+	{
+		// m_local.d4.stq = GSVector4i(t * 4.0f);
+
+		cvttps2dq(xmm1, xmm1);
+
+		movdqa(ptr[&m_local.d4.stq], xmm1);
+	}
+	else
+	{
+		// m_local.d4.stq = t * 4.0f;
+
+		movaps(ptr[&m_local.d4.stq], xmm1);
+	}
+
+	// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
+	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+	{
+		// GSVector4 ds = t.xxxx();
+		// GSVector4 dt = t.yyyy();
+		// GSVector4 dq = t.zzzz();
+
+		movaps(xmm1, xmm0);
+		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4 v = ds/dt * m_shift[i];
+			// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+			movaps(xmm2, xmm1);
+			mulps(xmm2, Xmm(4 + i));
+
+			if(m_sel.fst)
+			{
+				// m_local.d[i].s/t = GSVector4i(v);
+
+				cvttps2dq(xmm2, xmm2);
+
+				switch(j)
+				{
+				case 0: movdqa(ptr[&m_local.d[i].s], xmm2); break;
+				case 1: movdqa(ptr[&m_local.d[i].t], xmm2); break;
+				}
+			}
+			else
+			{
+				// m_local.d[i].s/t/q = v;
+
+				switch(j)
+				{
+				case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
+				case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
+				case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
+				}
+			}
+		}
+	}
+}
+
+// Emits the color setup code of the x86 SSE variant.
+// Nothing is emitted when m_en.c is not set.
+// - m_sel.iip set: reads dscan.c through edx and stores the packed per-step color
+//   values into m_local.d4.c and m_local.d[i].rb / m_local.d[i].ga.
+// - m_sel.iip clear: reads the color of the last vertex of the primitive and stores
+//   the constant color into m_local.c.rb / m_local.c.ga.
+// Register use: xmm3 holds m_shift[0] on entry and Xmm(4 + i) holds m_shift[1 + i];
+// both are loaded by Generate().
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector4 c = dscan.c;
+
+		movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
+		movaps(xmm1, xmm0);
+
+		// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
+
+		movaps(xmm2, xmm0);
+		mulps(xmm2, xmm3);
+		cvttps2dq(xmm2, xmm2);
+		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
+		packssdw(xmm2, xmm2);
+		movdqa(ptr[&m_local.d4.c], xmm2);
+
+		// xmm3 is not needed anymore
+		// (it is reused as a scratch register inside the loops below)
+
+		// GSVector4 dr = c.xxxx();
+		// GSVector4 db = c.zzzz();
+
+		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
+
+		// only one step is generated when notest is set
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
+			// (Xmm(4 + i) was loaded from m_shift[1 + i] in Generate())
+
+			movaps(xmm2, xmm0);
+			mulps(xmm2, Xmm(4 + i));
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm2, xmm2);
+
+			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
+
+			movaps(xmm3, xmm1);
+			mulps(xmm3, Xmm(4 + i));
+			cvttps2dq(xmm3, xmm3);
+			packssdw(xmm3, xmm3);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			punpcklwd(xmm2, xmm3);
+			movdqa(ptr[&m_local.d[i].rb], xmm2);
+		}
+
+		// GSVector4 c = dscan.c;
+
+		movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+		movaps(xmm1, xmm0);
+
+		// GSVector4 dg = c.yyyy();
+		// GSVector4 da = c.wwww();
+
+		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
+		{
+			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
+
+			movaps(xmm2, xmm0);
+			mulps(xmm2, Xmm(4 + i));
+			cvttps2dq(xmm2, xmm2);
+			packssdw(xmm2, xmm2);
+
+			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
+
+			movaps(xmm3, xmm1);
+			mulps(xmm3, Xmm(4 + i));
+			cvttps2dq(xmm3, xmm3);
+			packssdw(xmm3, xmm3);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			punpcklwd(xmm2, xmm3);
+			movdqa(ptr[&m_local.d[i].ga], xmm2);
+		}
+	}
+	else
+	{
+		// GSVector4i c = GSVector4i(vertex[index[last]].c);
+
+		// position of the last vertex inside the index list, per primitive class
+		int last = 0;
+
+		switch(m_sel.prim)
+		{
+		case GS_POINT_CLASS: last = 0; break;
+		case GS_LINE_CLASS: last = 1; break;
+		case GS_TRIANGLE_CLASS: last = 2; break;
+		case GS_SPRITE_CLASS: last = 1; break;
+		}
+
+		if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
+		{
+			mov(ecx, ptr[esp + _index]);
+			mov(ecx, ptr[ecx + sizeof(uint32) * last]);
+			shl(ecx, 6); // * sizeof(GSVertexSW)
+			add(ecx, ptr[esp + _vertex]);
+		}
+
+		cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
+
+		// c = c.upl16(c.zwxy());
+
+		pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
+		punpcklwd(xmm0, xmm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			psrlw(xmm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+		pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		movdqa(ptr[&m_local.c.rb], xmm1);
+		movdqa(ptr[&m_local.c.ga], xmm2);
+	}
+}
+
+#endif
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSShaderOGL.cpp b/plugins/GSdx_legacy/GSShaderOGL.cpp
new file mode 100644
index 0000000000..c337dfab44
--- /dev/null
+++ b/plugins/GSdx_legacy/GSShaderOGL.cpp
@@ -0,0 +1,358 @@
+/*
+ *	Copyright (C) 2011-2013 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSShaderOGL.h"
+#include "GLState.h"
+
+// Starts with an empty program cache. A program pipeline object is generated and bound
+// only when GL_ARB_separate_shader_objects is available.
+GSShaderOGL::GSShaderOGL(bool debug)
+	: m_pipeline(0)
+	, m_debug_shader(debug)
+{
+	m_single_prog.clear();
+
+	if (!GLLoader::found_GL_ARB_separate_shader_objects)
+		return;
+
+	glGenProgramPipelines(1, &m_pipeline);
+	glBindProgramPipeline(m_pipeline);
+}
+
+// Releases the pipeline object (when one was created) and every cached program.
+GSShaderOGL::~GSShaderOGL()
+{
+	if (GLLoader::found_GL_ARB_separate_shader_objects) {
+		glDeleteProgramPipelines(1, &m_pipeline);
+	}
+
+	auto entry = m_single_prog.begin();
+	while (entry != m_single_prog.end()) {
+		glDeleteProgram(entry->second);
+		++entry;
+	}
+
+	m_single_prog.clear();
+}
+
+// Selects s as the current vertex shader. Does nothing when s is already selected.
+void GSShaderOGL::VS(GLuint s)
+{
+	if (GLState::vs == s)
+		return;
+
+	GLState::vs = s;
+	GLState::dirty_prog = true;
+
+	if (GLLoader::found_GL_ARB_separate_shader_objects) {
+		glUseProgramStages(m_pipeline, GL_VERTEX_SHADER_BIT, s);
+	}
+}
+
+// Selects s as the current fragment shader. Release builds skip the call when s is
+// already selected.
+void GSShaderOGL::PS(GLuint s)
+{
+#ifndef _DEBUG
+	if (GLState::ps == s)
+		return;
+#endif
+
+	// In debug always sets the program. It allow to replace the program in apitrace easily.
+	GLState::ps = s;
+	GLState::dirty_prog = true;
+
+	if (!GLLoader::found_GL_ARB_separate_shader_objects)
+		return;
+
+	glUseProgramStages(m_pipeline, GL_FRAGMENT_SHADER_BIT, s);
+}
+
+// Selects s as the current geometry shader. Does nothing when s is already selected.
+void GSShaderOGL::GS(GLuint s)
+{
+	if (GLState::gs == s)
+		return;
+
+	GLState::gs = s;
+	GLState::dirty_prog = true;
+
+	if (GLLoader::found_GL_ARB_separate_shader_objects) {
+		glUseProgramStages(m_pipeline, GL_GEOMETRY_SHADER_BIT, s);
+	}
+}
+
+// Checks the compile status of shader s; the check is skipped (returns true) unless
+// shader debugging is enabled. On failure the info log is printed to stderr and
+// false is returned.
+bool GSShaderOGL::ValidateShader(GLuint s)
+{
+	if (m_debug_shader) {
+		GLint compiled = 0;
+		glGetShaderiv(s, GL_COMPILE_STATUS, &compiled);
+
+		if (!compiled) {
+			GLint length = 0;
+			glGetShaderiv(s, GL_INFO_LOG_LENGTH, &length);
+
+			if (length > 0) {
+				char* text = new char[length];
+				glGetShaderInfoLog(s, length, NULL, text);
+				fprintf(stderr, "%s", text);
+				delete[] text;
+			}
+
+			fprintf(stderr, "\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Checks the link status of program p; the check is skipped (returns true) unless
+// shader debugging is enabled. On failure the info log is printed to stderr and
+// false is returned.
+bool GSShaderOGL::ValidateProgram(GLuint p)
+{
+	if (m_debug_shader) {
+		GLint linked = 0;
+		glGetProgramiv(p, GL_LINK_STATUS, &linked);
+
+		if (!linked) {
+			GLint length = 0;
+			glGetProgramiv(p, GL_INFO_LOG_LENGTH, &length);
+
+			if (length > 0) {
+				char* text = new char[length];
+				glGetProgramInfoLog(p, length, NULL, text);
+				fprintf(stderr, "%s", text);
+				delete[] text;
+			}
+
+			fprintf(stderr, "\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Validates program pipeline p; the check is skipped (returns true) unless shader
+// debugging is enabled. On failure the info log is printed to stderr and false is
+// returned.
+bool GSShaderOGL::ValidatePipeline(GLuint p)
+{
+	if (m_debug_shader) {
+		// FIXME: might be mandatory to validate the pipeline
+		glValidateProgramPipeline(p);
+
+		GLint valid = 0;
+		glGetProgramPipelineiv(p, GL_VALIDATE_STATUS, &valid);
+
+		if (!valid) {
+			GLint length = 0;
+			glGetProgramPipelineiv(p, GL_INFO_LOG_LENGTH, &length);
+
+			if (length > 0) {
+				char* text = new char[length];
+				glGetProgramPipelineInfoLog(p, length, NULL, text);
+				fprintf(stderr, "%s", text);
+				delete[] text;
+			}
+
+			fprintf(stderr, "\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Creates a program from the currently selected vertex, fragment and geometry
+// shaders, links it, runs the debug validation on it and returns its id.
+GLuint GSShaderOGL::LinkNewProgram()
+{
+	const GLuint prog = glCreateProgram();
+
+	// A stage that is 0 is not attached.
+	if (GLState::vs) {
+		glAttachShader(prog, GLState::vs);
+	}
+	if (GLState::ps) {
+		glAttachShader(prog, GLState::ps);
+	}
+	if (GLState::gs) {
+		glAttachShader(prog, GLState::gs);
+	}
+
+	glLinkProgram(prog);
+	ValidateProgram(prog);
+
+	return prog;
+}
+
+// Makes the program matching the currently selected vs/gs/ps active.
+// Only does work when the selection is dirty and GL_ARB_separate_shader_objects is not
+// available: the monolithic program is looked up in m_single_prog and linked on a
+// cache miss. The dirty flag is always cleared on exit.
+void GSShaderOGL::UseProgram()
+{
+	if (GLState::dirty_prog && !GLLoader::found_GL_ARB_separate_shader_objects) {
+		// Note: shader are integer lookup pointer. They start from 1 and incr
+		// every time you create a new shader OR a new program.
+		// Note2: vs & gs are precompiled at startup. FGLRX and radeon got value < 128. GS has only 2 programs
+		// We might be able to pack the value in a 32bits int
+		// I would need to check the behavior on Nvidia (pause/resume).
+		uint64 sel = (uint64)GLState::vs << 40 | (uint64)GLState::gs << 20 | GLState::ps;
+
+		hash_map<uint64, GLuint >::iterator it = m_single_prog.find(sel);
+		if (it == m_single_prog.end()) {
+			// LinkNewProgram() already runs ValidateProgram() on the new program, so it is
+			// not validated a second time here (that printed the link log twice).
+			GLState::program = LinkNewProgram();
+			m_single_prog[sel] = GLState::program;
+
+			glUseProgram(GLState::program);
+		} else if (it->second != GLState::program) {
+			// Cached program that is not the active one yet
+			GLState::program = it->second;
+			glUseProgram(GLState::program);
+		}
+	}
+
+	GLState::dirty_prog = false;
+}
+
+// Builds the text placed in front of every GLSL source: the version line, the
+// extension requirements / feature defines selected from the GLLoader flags, the
+// stage define for type, the define that maps entry to main, and finally macro.
+std::string GSShaderOGL::GenGlslHeader(const std::string& entry, GLenum type, const std::string& macro)
+{
+	std::string header("#version 330 core\n");
+
+	// Need GL version 420
+	header.append("#extension GL_ARB_shading_language_420pack: require\n");
+
+	if (GLLoader::found_GL_ARB_separate_shader_objects) {
+		// Need GL version 410
+		header.append("#extension GL_ARB_separate_shader_objects: require\n");
+	}
+
+	// The image load/store extension needs GL version 420
+	header.append(GLLoader::found_GL_ARB_shader_image_load_store
+		? "#extension GL_ARB_shader_image_load_store: require\n"
+		: "#define DISABLE_GL42_image\n");
+
+	if (GLLoader::found_GL_ARB_clip_control) {
+		header.append("#define ZERO_TO_ONE_DEPTH\n");
+	}
+
+	// Stupid GL implementation (can't use GL_ES)
+	// AMD/nvidia define it to 0
+	// intel window don't define it
+	// intel linux refuse to define it
+	header.append("#define pGL_ES 0\n");
+
+	// Allow to puts several shader in 1 files
+	if (type == GL_VERTEX_SHADER) {
+		header.append("#define VERTEX_SHADER 1\n");
+	} else if (type == GL_GEOMETRY_SHADER) {
+		header.append("#define GEOMETRY_SHADER 1\n");
+	} else if (type == GL_FRAGMENT_SHADER) {
+		header.append("#define FRAGMENT_SHADER 1\n");
+	} else {
+		ASSERT(0);
+	}
+
+	// Select the entry point ie the main function
+	header.append(format("#define %s main\n", entry.c_str()));
+
+	header.append(macro);
+
+	return header;
+}
+
+// Compiles glsl_h_code for the given stage and returns the resulting object id.
+//   glsl_file   - name used only in the error report
+//   entry       - function name that the generated header maps to main
+//   type        - GL shader stage
+//   glsl_h_code - GLSL source text, must not be NULL
+//   macro_sel   - extra text appended to the generated header
+// Returns a separable program when GL_ARB_separate_shader_objects is available and a
+// shader object otherwise. Returns 0 for a geometry shader when geometry shaders are
+// not supported. Failures are only reported (to stderr); the id is returned anyway.
+GLuint GSShaderOGL::Compile(const std::string& glsl_file, const std::string& entry, GLenum type, const char* glsl_h_code, const std::string& macro_sel)
+{
+	ASSERT(glsl_h_code != NULL);
+
+	GLuint program = 0;
+
+	if (type == GL_GEOMETRY_SHADER && !GLLoader::found_geometry_shader) {
+		return program;
+	}
+
+	// Note it is better to separate header and source file to have the good line number
+	// in the glsl compiler report
+	std::string header = GenGlslHeader(entry, type, macro_sel);
+	const char* sources[2] = { header.c_str(), glsl_h_code };
+	const int shader_nb = 2;
+
+	if (GLLoader::found_GL_ARB_separate_shader_objects) {
+		program = glCreateShaderProgramv(type, shader_nb, sources);
+	} else {
+		program = glCreateShader(type);
+		glShaderSource(program, shader_nb, sources, NULL);
+		glCompileShader(program);
+	}
+
+	// A separable program is checked through its link status, a plain shader through
+	// its compile status.
+	bool status;
+	if (GLLoader::found_GL_ARB_separate_shader_objects)
+		status = ValidateProgram(program);
+	else
+		status = ValidateShader(program);
+
+	if (!status) {
+		// print extra info (program is a GLuint, hence %u)
+		fprintf(stderr, "%s (entry %s, prog %u) :", glsl_file.c_str(), entry.c_str(), program);
+		fprintf(stderr, "\n%s", macro_sel.c_str());
+		fprintf(stderr, "\n");
+	}
+	return program;
+}
+
+// Fetches the binary of program p and writes the assembly listing found in it to file.
+// A program binary would normally be used for a caching solution, but Nvidia also
+// incorporates the ASM dump in it. The ASM is nice because the instruction number
+// gives an overview of the program performance.
+// Note: initially the cg offline compiler was used, but it doesn't support the latest
+// GLSL improvements (unfortunately).
+//
+// Returns the number of instructions counted in the listing, 0 when the dump is
+// skipped (GLLoader::nvidia_buggy_driver not set), or -1 when no listing was found or
+// the output file could not be opened. When no listing is found the raw binary is
+// written to file instead.
+int GSShaderOGL::DumpAsm(const std::string& file, GLuint p)
+{
+	if (!GLLoader::nvidia_buggy_driver) return 0;
+
+	// Initialised so that a failing query cannot leave a garbage length behind
+	GLint   binaryLength = 0;
+	glGetProgramiv(p, GL_PROGRAM_BINARY_LENGTH, &binaryLength);
+	if (binaryLength < 0) binaryLength = 0;
+
+	// Value-initialised: the 4 padding bytes stay 0, so the "!!" search may read one
+	// byte past the data and strtok always finds a terminator.
+	char* binary = new char[binaryLength+4]();
+	GLenum binaryFormat = 0;
+	glGetProgramBinary(p, binaryLength, NULL, &binaryFormat, binary);
+
+	FILE* outfile = fopen(file.c_str(), "w");
+	ASSERT(outfile);
+	if (outfile == NULL) {
+		// ASSERT compiles out in release builds; do not write through a NULL stream
+		fprintf(stderr, "Error: failed to open %s\n", file.c_str());
+		delete[] binary;
+		return -1;
+	}
+
+	// Search the magic number "!!"
+	int asm_ = 0;
+	while (asm_ < binaryLength && (binary[asm_] != '!' || binary[asm_+1] != '!')) {
+		asm_ += 1;
+	}
+
+	int instructions = -1;
+	if (asm_ < binaryLength) {
+		// Now print asm as text
+		// The listing stops at the first line starting with "END" that is not "ENDIF".
+		char* asm_txt = strtok(&binary[asm_], "\n");
+		while (asm_txt != NULL && (strncmp(asm_txt, "END", 3) || !strncmp(asm_txt, "ENDIF", 5))) {
+			if (!strncmp(asm_txt, "OUT", 3) || !strncmp(asm_txt, "TEMP", 4) || !strncmp(asm_txt, "LONG", 4)) {
+				// Declaration line: (re)start the instruction count
+				instructions = 0;
+			} else if (instructions >= 0) {
+				if (instructions == 0)
+					fprintf(outfile, "\n");
+				instructions++;
+			}
+
+			fprintf(outfile, "%s\n", asm_txt);
+			asm_txt = strtok(NULL, "\n");
+		}
+		fprintf(outfile, "\nFound %d instructions\n", instructions);
+	}
+	fclose(outfile);
+
+	if (instructions < 0) {
+		// RAW dump in case of error
+		fprintf(stderr, "Error: failed to find the number of instructions!\n");
+		outfile = fopen(file.c_str(), "wb");
+		if (outfile != NULL) {
+			fwrite(binary, binaryLength, 1, outfile);
+			fclose(outfile);
+		}
+		ASSERT(0);
+	}
+
+	delete[] binary;
+
+	return instructions;
+}
+
+// Deletes an object returned by Compile(): a program when separate shader objects are
+// in use, a shader otherwise.
+void GSShaderOGL::Delete(GLuint s)
+{
+	if (!GLLoader::found_GL_ARB_separate_shader_objects) {
+		glDeleteShader(s);
+		return;
+	}
+
+	glDeleteProgram(s);
+}
diff --git a/plugins/GSdx_legacy/GSShaderOGL.h b/plugins/GSdx_legacy/GSShaderOGL.h
new file mode 100644
index 0000000000..ef00002208
--- /dev/null
+++ b/plugins/GSdx_legacy/GSShaderOGL.h
@@ -0,0 +1,51 @@
+/*
+ *	Copyright (C) 2011-2013 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+// Helper that compiles, validates, binds and deletes the OpenGL shader/program
+// objects used by the OGL device. Only the declarations are visible in this
+// header; the notes below are limited to what the signatures show.
+class GSShaderOGL {
+	// GL object name; presumably the program pipeline object -- TODO confirm.
+	GLuint m_pipeline;
+	// Maps a 64-bit key to a GL object name; presumably a cache of linked
+	// programs -- TODO confirm against the .cpp.
+	hash_map<uint64, GLuint > m_single_prog;
+	// Set once from the constructor argument (const member).
+	const bool m_debug_shader;
+
+	// Validation helpers; each takes a GL object name and reports success.
+	bool ValidateShader(GLuint p);
+	bool ValidateProgram(GLuint p);
+	bool ValidatePipeline(GLuint p);
+
+	std::string GenGlslHeader(const std::string& entry, GLenum type, const std::string& macro);
+	GLuint LinkNewProgram();
+
+	public:
+	GSShaderOGL(bool debug);
+	~GSShaderOGL();
+
+	// Per-stage setters (geometry / pixel / vertex, going by the names).
+	void GS(GLuint s);
+	void PS(GLuint s);
+	void VS(GLuint s);
+
+	void UseProgram();
+
+	// Compiles `entry` of type `type` from `glsl_h_code`; `macro_sel` defaults
+	// to an empty string. Returns a GL object name.
+	GLuint Compile(const std::string& glsl_file, const std::string& entry, GLenum type, const char* glsl_h_code, const std::string& macro_sel = "");
+
+	// Dumps the assembly text found in the binary of program `p` to `file` and
+	// returns the counted number of instructions (0 when the dump is skipped).
+	int DumpAsm(const std::string& file, GLuint p);
+
+	// Deletes `s` as a program when GL_ARB_separate_shader_objects is
+	// available, otherwise as a shader.
+	void Delete(GLuint s);
+};
diff --git a/plugins/GSdx_legacy/GSState.cpp b/plugins/GSdx_legacy/GSState.cpp
new file mode 100644
index 0000000000..99f6ca5c40
--- /dev/null
+++ b/plugins/GSdx_legacy/GSState.cpp
@@ -0,0 +1,5635 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSState.h"
+#include "GSdx.h"
+
+//#define Offset_ST  // Fixes Persona3 mini map alignment which is off even in software rendering
+
+// File-local copy of the CRC hack level. Starts at 3 and is overwritten by the
+// GSState constructor with the "crc_hack_level" configuration value.
+static int s_crc_hack_level = 3;
+
+// Builds the GS state object: reads the configuration, zeroes the vertex
+// state, allocates the vertex buffer, computes m_sssize and finally resets
+// the registers and installs the register handlers.
+GSState::GSState()
+	: m_version(6)
+	, m_mt(false)
+	, m_irq(NULL)
+	, m_path3hack(0)
+	, m_init_read_fifo_supported(false)
+	, m_q(1.0f)
+	, m_texflush(true)
+	, m_vt(this)
+	, m_regs(NULL)
+	, m_crc(0)
+	, m_options(0)
+	, m_frameskip(0)
+	, m_crcinited(false)
+{
+	// Native resolution is assumed when the upscale multiplier is exactly 1.
+	m_nativeres = theApp.GetConfig("upscale_multiplier",1) == 1;
+	m_mipmap = !!theApp.GetConfig("mipmap", 1);
+
+	// Debug dump/save switches, all read from the configuration.
+	s_n     = 0;
+	s_dump  = !!theApp.GetConfig("dump", 0);
+	s_save  = !!theApp.GetConfig("save", 0);
+	s_savet = !!theApp.GetConfig("savet", 0);
+	s_savez = !!theApp.GetConfig("savez", 0);
+	s_savef = !!theApp.GetConfig("savef", 0);
+	s_saven = theApp.GetConfig("saven", 0);
+	s_savel = theApp.GetConfig("savel", 5000);
+#ifdef __linux__
+	// On Linux the dump directories are created up front when dumping is on.
+	if (s_dump) {
+		GSmkdir("/tmp/GS_HW_dump");
+		GSmkdir("/tmp/GS_SW_dump");
+	}
+#endif
+
+	//s_dump = 1;
+	//s_save = 1;
+	//s_savez = 1;
+	//s_savet = 1;
+	//s_savef = 1;
+	//s_saven = 0;
+	//s_savel = 0;
+
+	// The wild hack is only honoured when the global "UserHacks" switch is on.
+	UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;
+	m_crc_hack_level = theApp.GetConfig("crc_hack_level", 3);
+	s_crc_hack_level = m_crc_hack_level;
+
+	memset(&m_v, 0, sizeof(m_v));
+	memset(&m_vertex, 0, sizeof(m_vertex));
+	memset(&m_index, 0, sizeof(m_index));
+
+	m_v.RGBAQ.Q = 1.0f;
+
+	GrowVertexBuffer();
+
+	// m_sssize is the sum of the sizes of the items listed below; presumably
+	// the byte size of a serialized state snapshot -- TODO confirm against the
+	// freeze/defrost code. Entries marked "obsolete" are still counted.
+	m_sssize = 0;
+
+	m_sssize += sizeof(m_version);
+	m_sssize += sizeof(m_env.PRIM);
+	m_sssize += sizeof(m_env.PRMODE);
+	m_sssize += sizeof(m_env.PRMODECONT);
+	m_sssize += sizeof(m_env.TEXCLUT);
+	m_sssize += sizeof(m_env.SCANMSK);
+	m_sssize += sizeof(m_env.TEXA);
+	m_sssize += sizeof(m_env.FOGCOL);
+	m_sssize += sizeof(m_env.DIMX);
+	m_sssize += sizeof(m_env.DTHE);
+	m_sssize += sizeof(m_env.COLCLAMP);
+	m_sssize += sizeof(m_env.PABE);
+	m_sssize += sizeof(m_env.BITBLTBUF);
+	m_sssize += sizeof(m_env.TRXDIR);
+	m_sssize += sizeof(m_env.TRXPOS);
+	m_sssize += sizeof(m_env.TRXREG);
+	m_sssize += sizeof(m_env.TRXREG); // obsolete
+
+	// Per-context registers, counted once for each of the two contexts.
+	for(int i = 0; i < 2; i++)
+	{
+		m_sssize += sizeof(m_env.CTXT[i].XYOFFSET);
+		m_sssize += sizeof(m_env.CTXT[i].TEX0);
+		m_sssize += sizeof(m_env.CTXT[i].TEX1);
+		m_sssize += sizeof(m_env.CTXT[i].TEX2);
+		m_sssize += sizeof(m_env.CTXT[i].CLAMP);
+		m_sssize += sizeof(m_env.CTXT[i].MIPTBP1);
+		m_sssize += sizeof(m_env.CTXT[i].MIPTBP2);
+		m_sssize += sizeof(m_env.CTXT[i].SCISSOR);
+		m_sssize += sizeof(m_env.CTXT[i].ALPHA);
+		m_sssize += sizeof(m_env.CTXT[i].TEST);
+		m_sssize += sizeof(m_env.CTXT[i].FBA);
+		m_sssize += sizeof(m_env.CTXT[i].FRAME);
+		m_sssize += sizeof(m_env.CTXT[i].ZBUF);
+	}
+
+	m_sssize += sizeof(m_v.RGBAQ);
+	m_sssize += sizeof(m_v.ST);
+	m_sssize += sizeof(m_v.UV);
+	m_sssize += sizeof(m_v.FOG);
+	m_sssize += sizeof(m_v.XYZ);
+	m_sssize += sizeof(GIFReg); // obsolete
+
+	m_sssize += sizeof(m_tr.x);
+	m_sssize += sizeof(m_tr.y);
+	m_sssize += m_mem.m_vmsize;
+	m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * countof(m_path);
+	m_sssize += sizeof(m_q);
+
+	PRIM = &m_env.PRIM;
+//	CSR->rREV = 0x20;
+	m_env.PRMODECONT.AC = 1;
+
+	// Reset() runs before ResetHandlers(); both are member functions of this class.
+	Reset();
+
+	ResetHandlers();
+}
+
+// Frees the vertex and index buffers when they have been allocated.
+GSState::~GSState()
+{
+	if(m_vertex.buff != NULL)
+	{
+		_aligned_free(m_vertex.buff);
+	}
+
+	if(m_index.buff != NULL)
+	{
+		_aligned_free(m_index.buff);
+	}
+}
+
+// Points m_regs at the caller-provided register block. `basemem` must not be
+// NULL (asserted).
+void GSState::SetRegsMem(uint8* basemem)
+{
+	ASSERT(basemem != NULL);
+
+	m_regs = reinterpret_cast<GSPrivRegSet*>(basemem);
+}
+
+// Stores the interrupt callback; no validation is done, NULL is accepted.
+void GSState::SetIrqCallback(void (*irq)())
+{
+	this->m_irq = irq;
+}
+
+// Records the threading mode and selects the SIGNAL/FINISH/LABEL handlers
+// that match it.
+void GSState::SetMultithreaded(bool mt)
+{
+	m_mt = mt;
+
+	if(!mt)
+	{
+		// Single-threaded: the real handlers are used.
+		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerSIGNAL;
+		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerFINISH;
+		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerLABEL;
+	}
+	else
+	{
+		// Multithreaded: PCSX2 handles all IRQs internally, so these three
+		// registers are ignored here. Some older PCSX2 versions neither reset
+		// the irq callback to NULL in this mode (possibly because ZeroGS would
+		// assert in such cases) nor bound it to a dummy callback.
+		m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull;
+		m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull;
+		m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerNull;
+	}
+}
+
+// Changes the frame-skip setting. A non-zero value replaces every vertex-kick
+// (XYZ*) handler with a NOP; zero restores them through UpdateVertexKick().
+void GSState::SetFrameSkip(int skip)
+{
+	if(skip == m_frameskip)
+	{
+		return; // nothing to do
+	}
+
+	m_frameskip = skip;
+
+	if(skip == 0)
+	{
+		UpdateVertexKick();
+		return;
+	}
+
+	// Packed-mode vertex kicks
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP;
+	m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
+
+	// A+D vertex kicks
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP;
+
+	// Combined STQ+RGBA+XYZ(F)2 fast paths
+	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = &GSState::GIFPackedRegHandlerNOP;
+	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = &GSState::GIFPackedRegHandlerNOP;
+}
+
+// Resets the GS state: clears the path and vertex state, resets the register
+// environment, recomputes the derived per-context data (scissor, memory
+// offsets) and empties the vertex/index queues. Local memory is not cleared
+// (see the FIXME below).
+void GSState::Reset()
+{
+	//printf("GSdx info: GS reset\n");
+
+	// FIXME: memset(m_mem.m_vm8, 0, m_mem.m_vmsize); // bios logo not shown cut in half after reset, missing graphics in GoW after first FMV
+	memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
+	memset(&m_v, 0, sizeof(m_v));
+
+//	PRIM = &m_env.PRIM;
+//	m_env.PRMODECONT.AC = 1;
+
+	m_env.Reset();
+
+	// PRIM points at PRMODE when PRMODECONT.AC is 0, at the PRIM register otherwise.
+	PRIM = !m_env.PRMODECONT.AC ? (GIFRegPRIM*)&m_env.PRMODE : &m_env.PRIM;
+
+	UpdateContext();
+
+	UpdateVertexKick();
+
+	m_env.UpdateDIMX();
+
+	// Rebuild the cached scissor and memory offset data of both contexts.
+	for(size_t i = 0; i < 2; i++)
+	{
+		m_env.CTXT[i].UpdateScissor();
+
+		m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
+		// The depth buffer offset uses the frame buffer width (FRAME.FBW).
+		m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
+		m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM);
+		m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
+		m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
+	}
+
+	UpdateScissor();
+
+	// Drop any queued vertices and indices.
+	m_vertex.head = 0;
+	m_vertex.tail = 0;
+	m_vertex.next = 0;
+	m_index.tail = 0;
+
+	m_texflush = true;
+}
+
+// (Re)builds every register dispatch table: the packed-register handlers, the
+// per-primitive XYZ handler tables and the A+D register handlers. Entries not
+// listed explicitly get the corresponding Null handler. Ends by re-applying
+// the threading mode, which overrides the SIGNAL/FINISH/LABEL entries.
+void GSState::ResetHandlers()
+{
+	for(size_t i = 0; i < countof(m_fpGIFPackedRegHandlers); i++)
+	{
+		m_fpGIFPackedRegHandlers[i] = &GSState::GIFPackedRegHandlerNull;
+	}
+
+	// Several packed entries reuse the A+D handler through a cast between the
+	// two member-function pointer types.
+	m_fpGIFPackedRegHandlers[GIF_REG_PRIM] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerPRIM;
+	m_fpGIFPackedRegHandlers[GIF_REG_RGBA] = &GSState::GIFPackedRegHandlerRGBA;
+	m_fpGIFPackedRegHandlers[GIF_REG_STQ] = &GSState::GIFPackedRegHandlerSTQ;
+	// The UV handler depends on the UserHacks_WildHack setting.
+	m_fpGIFPackedRegHandlers[GIF_REG_UV] = !UserHacks_WildHack ? &GSState::GIFPackedRegHandlerUV : &GSState::GIFPackedRegHandlerUV_Hack;
+	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerTEX0<0>;
+	m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerTEX0<1>;
+	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
+	m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
+	m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
+	m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D;
+	m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP;
+
+	// For primitive type P, fill the four packed and four A+D XYZ handler slots
+	// (XYZF2 and XYZ2, each instantiated with adc = 0 and adc = 1) plus the two
+	// combined STQ+RGBA+XYZ handlers. The macro's last line ends with a line
+	// continuation, so the blank line after it belongs to the macro.
+	#define SetHandlerXYZ(P) \
+		m_fpGIFPackedRegHandlerXYZ[P][0] = &GSState::GIFPackedRegHandlerXYZF2<P, 0>; \
+		m_fpGIFPackedRegHandlerXYZ[P][1] = &GSState::GIFPackedRegHandlerXYZF2<P, 1>; \
+		m_fpGIFPackedRegHandlerXYZ[P][2] = &GSState::GIFPackedRegHandlerXYZ2<P, 0>; \
+		m_fpGIFPackedRegHandlerXYZ[P][3] = &GSState::GIFPackedRegHandlerXYZ2<P, 1>; \
+		m_fpGIFRegHandlerXYZ[P][0] = &GSState::GIFRegHandlerXYZF2<P, 0>; \
+		m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2<P, 1>; \
+		m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2<P, 0>; \
+		m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2<P, 1>; \
+		m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2<P>; \
+		m_fpGIFPackedRegHandlerSTQRGBAXYZ2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZ2<P>; \
+
+	SetHandlerXYZ(GS_POINTLIST);
+	SetHandlerXYZ(GS_LINELIST);
+	SetHandlerXYZ(GS_LINESTRIP);
+	SetHandlerXYZ(GS_TRIANGLELIST);
+	SetHandlerXYZ(GS_TRIANGLESTRIP);
+	SetHandlerXYZ(GS_TRIANGLEFAN);
+	SetHandlerXYZ(GS_SPRITE);
+	SetHandlerXYZ(GS_INVALID);
+
+	// A+D register table: default everything to the Null handler first.
+	for(size_t i = 0; i < countof(m_fpGIFRegHandlers); i++)
+	{
+		m_fpGIFRegHandlers[i] = &GSState::GIFRegHandlerNull;
+	}
+
+	// Handlers templated on <0>/<1> serve the _1/_2 (per-context) registers.
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
+	m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
+	m_fpGIFRegHandlers[GIF_A_D_REG_UV] = !UserHacks_WildHack ? &GSState::GIFRegHandlerUV : &GSState::GIFRegHandlerUV_Hack;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1] = &GSState::GIFRegHandlerTEX0<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2] = &GSState::GIFRegHandlerTEX0<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1] = &GSState::GIFRegHandlerCLAMP<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2] = &GSState::GIFRegHandlerCLAMP<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FOG] = &GSState::GIFRegHandlerFOG;
+	m_fpGIFRegHandlers[GIF_A_D_REG_NOP] = &GSState::GIFRegHandlerNOP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1] = &GSState::GIFRegHandlerTEX1<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2] = &GSState::GIFRegHandlerTEX1<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1] = &GSState::GIFRegHandlerTEX2<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2] = &GSState::GIFRegHandlerTEX2<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1] = &GSState::GIFRegHandlerXYOFFSET<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2] = &GSState::GIFRegHandlerXYOFFSET<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT] = &GSState::GIFRegHandlerTEXCLUT;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK] = &GSState::GIFRegHandlerSCANMSK;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1] = &GSState::GIFRegHandlerMIPTBP1<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2] = &GSState::GIFRegHandlerMIPTBP1<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1] = &GSState::GIFRegHandlerMIPTBP2<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2] = &GSState::GIFRegHandlerMIPTBP2<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXA] = &GSState::GIFRegHandlerTEXA;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL] = &GSState::GIFRegHandlerFOGCOL;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH] = &GSState::GIFRegHandlerTEXFLUSH;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1] = &GSState::GIFRegHandlerSCISSOR<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2] = &GSState::GIFRegHandlerSCISSOR<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1] = &GSState::GIFRegHandlerALPHA<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2] = &GSState::GIFRegHandlerALPHA<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_DIMX] = &GSState::GIFRegHandlerDIMX;
+	m_fpGIFRegHandlers[GIF_A_D_REG_DTHE] = &GSState::GIFRegHandlerDTHE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP] = &GSState::GIFRegHandlerCOLCLAMP;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1] = &GSState::GIFRegHandlerTEST<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2] = &GSState::GIFRegHandlerTEST<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_PABE] = &GSState::GIFRegHandlerPABE;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1] = &GSState::GIFRegHandlerFBA<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2] = &GSState::GIFRegHandlerFBA<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1] = &GSState::GIFRegHandlerFRAME<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2] = &GSState::GIFRegHandlerFRAME<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1] = &GSState::GIFRegHandlerZBUF<0>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2] = &GSState::GIFRegHandlerZBUF<1>;
+	m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF] = &GSState::GIFRegHandlerBITBLTBUF;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS] = &GSState::GIFRegHandlerTRXPOS;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG] = &GSState::GIFRegHandlerTRXREG;
+	m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR;
+	m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG;
+
+	// Installs the SIGNAL/FINISH/LABEL handlers matching the current mode.
+	SetMultithreaded(m_mt);
+}
+
+// Returns the display rectangle of read circuit `i`, derived from its DISPLAY
+// register with the magnification factors divided out. A negative `i` selects
+// circuit 1 when it is enabled, circuit 0 otherwise.
+GSVector4i GSState::GetDisplayRect(int i)
+{
+	if(i < 0)
+	{
+		i = IsEnabled(1) ? 1 : 0;
+	}
+
+	const int magh = m_regs->DISP[i].DISPLAY.MAGH + 1;
+	const int magv = m_regs->DISP[i].DISPLAY.MAGV + 1;
+
+	const int width = (m_regs->DISP[i].DISPLAY.DW + 1) / magh;
+	int height = (m_regs->DISP[i].DISPLAY.DH + 1) / magv;
+
+	// Some games (such as Pool Paradise) use alternate line reading and
+	// provide a massive height which is really half.
+	if(height > 640 && !Vmode_VESA_DTV)
+	{
+		height /= 2;
+	}
+
+	GSVector4i r;
+
+	r.left = m_regs->DISP[i].DISPLAY.DX / magh;
+	r.top = m_regs->DISP[i].DISPLAY.DY / magv;
+	r.right = r.left + width;
+	r.bottom = r.top + height;
+
+	// Useful for debugging games:
+	//printf("DW: %d , DH: %d , left: %d , right: %d , top: %d , down: %d , MAGH: %d , MAGV: %d\n", m_regs->DISP[i].DISPLAY.DW, m_regs->DISP[i].DISPLAY.DH, r.left, r.right, r.top, r.bottom , m_regs->DISP[i].DISPLAY.MAGH,m_regs->DISP[i].DISPLAY.MAGV);
+
+	return r;
+}
+
+// Returns the rectangle read from the frame buffer for circuit `i`: its
+// origin comes from DISPFB.DBX/DBY and its size from the display rectangle
+// (with the height adjustments below). A negative `i` selects circuit 1 when
+// it is enabled, circuit 0 otherwise.
+GSVector4i GSState::GetFrameRect(int i)
+{
+	if(i < 0)
+	{
+		i = IsEnabled(1) ? 1 : 0;
+	}
+
+	GSVector4i r = GetDisplayRect(i);
+
+	int w = r.width();
+	int h = r.height();
+
+	// NTSC: saturate higher height values for games which have a CRTC width
+	// lower than 640. Some NTSC mode games request higher height values for
+	// accurate display size / position when the width is 640.
+	// Testcases: PS logo (640x512), Resident Evil:CVX (640x480), potentially more.
+	if(Vmode_NTSC && h > 448 && w < 640)
+	{
+		h = 448;
+	}
+
+	// Halve the height when both SMODE2.INT and SMODE2.FFMD are set.
+	if(m_regs->SMODE2.INT && m_regs->SMODE2.FFMD && h > 1)
+	{
+		h >>= 1;
+	}
+
+	r.left = m_regs->DISP[i].DISPFB.DBX;
+	r.top = m_regs->DISP[i].DISPFB.DBY;
+	r.right = r.left + w;
+	r.bottom = r.top + h;
+
+	return r;
+}
+
+// Returns the width/height of the display rectangle of circuit `i`, with the
+// height halved in the cases handled below. A negative `i` selects circuit 1
+// when it is enabled, circuit 0 otherwise.
+GSVector2i GSState::GetDeviceSize(int i)
+{
+	// TODO: return (m_regs->SMODE1.CMOD & 1) ? GSVector2i(640, 576) : GSVector2i(640, 480);
+	// TODO: other params of SMODE1 should affect the true device display size
+	// TODO2: pal games at 60Hz
+
+	if(i < 0)
+	{
+		i = IsEnabled(1) ? 1 : 0;
+	}
+
+	GSVector4i r = GetDisplayRect(i);
+
+	int w = r.width();
+	int h = r.height();
+
+	const bool field_mode = m_regs->SMODE2.INT && m_regs->SMODE2.FFMD && h > 1;
+
+	if(field_mode)
+	{
+		// Fixme: just slightly better than the old height-based hack
+		// (h == 2*416 / 2*448 / 2*512 -> h / 2, else 512 or 448 from SMODE1.CMOD).
+		if(IsEnabled(0) || IsEnabled(1))
+		{
+			h >>= 1;
+		}
+	}
+	else if(m_game.title == CRC::SilentHill2 || m_game.title == CRC::SilentHill3)
+	{
+		// Fixme: these games elude the check above, worked with the old hack
+		h /= 2;
+	}
+
+	return GSVector2i(w, h);
+}
+
+// Tells whether read circuit `i` (0 or 1) is enabled: its PMODE enable bit
+// must be set and its DISPLAY register must have a non-zero DW or DH.
+bool GSState::IsEnabled(int i)
+{
+	ASSERT(i >= 0 && i < 2);
+
+	switch(i)
+	{
+	case 0:
+		return m_regs->PMODE.EN1 && (m_regs->DISP[0].DISPLAY.DW || m_regs->DISP[0].DISPLAY.DH);
+
+	case 1:
+		return m_regs->PMODE.EN2 && (m_regs->DISP[1].DISPLAY.DW || m_regs->DISP[1].DISPLAY.DH);
+
+	default:
+		return false;
+	}
+}
+
+// Returns the vertical refresh rate in Hz selected by SMODE1.CMOD and, for
+// CMOD == 0, by the Vmode_* video mode checks. Returns 0 when nothing
+// matches (an unexpected CMOD value also asserts).
+float GSState::GetTvRefreshRate()
+{
+	switch(m_regs->SMODE1.CMOD)
+	{
+		case 2: return (60 / 1.001f); // NTSC
+		case 3: return 50;            // PAL
+		case 0: break;                // VESA / DTV, resolved below
+		default: ASSERT(0); return 0;
+	}
+
+	float hz = 0;
+
+	// Every check is evaluated in this order; the last match wins.
+	if(Vmode_VESA_1A) hz = 59.94f;
+	if(Vmode_VESA_1C) hz = 75;
+	if(Vmode_VESA_2B) hz = 60.317f;
+	if(Vmode_VESA_2D) hz = 75;
+	if(Vmode_VESA_3B) hz = 60.004f;
+	if(Vmode_VESA_3D) hz = 75.029f;
+	if(Vmode_VESA_4A) hz = 60.020f;
+	if(Vmode_VESA_4B) hz = 75.025f;
+	if(Vmode_DTV_480P) hz = 59.94f;
+	if(Vmode_DTV_720P_1080I) hz = 60;
+
+	return hz;
+}
+
+// GIFPackedRegHandler*
+
+// Default packed-register handler: does nothing with `r`. ResetHandlers()
+// installs it for every packed register without a dedicated handler.
+void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* RESTRICT r)
+{
+	// ASSERT(0);
+}
+
+// Packed RGBA: stores a 32-bit colour built from `r` into the first dword of
+// m_v.RGBAQ and copies the remembered m_q into RGBAQ.Q.
+void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* RESTRICT r)
+{
+	#if _M_SSE >= 0x301
+
+	// Byte-shuffle path: the mask picks source bytes 0, 4, 8 and 12.
+	GSVector4i mask = GSVector4i::load(0x0c080400);
+	GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask);
+
+	m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v);
+
+	#else
+
+	// Fallback path: mask each lane, then pack with rgba32(); presumably this
+	// keeps the low byte of every 32-bit lane -- TODO confirm in GSVector.h.
+	GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff();
+
+	m_v.RGBAQ.u32[0] = v.rgba32();
+
+	#endif
+
+	m_v.RGBAQ.Q = m_q;
+}
+
+// Packed STQ: copies the first 64 bits of `r` into m_v.ST and remembers Q
+// (from the second 64 bits) in m_q, substituting 1.0f when Q is zero.
+void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* RESTRICT r)
+{
+	GSVector4i st = GSVector4i::loadl(&r->u64[0]);
+	GSVector4i q = GSVector4i::loadl(&r->u64[1]);
+
+	GSVector4i::storel(&m_v.ST, st);
+
+	q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // character shadow in Vexx, q = 0 (st also 0 on the first 16 vertices), setting it to 1.0f to avoid div by zero later
+	
+	// The float member m_q is written through an int pointer (bit copy).
+	*(int*)&m_q = GSVector4i::store(q); 
+
+	ASSERT(!std::isnan(m_q)); // See GIFRegHandlerRGBAQ
+	ASSERT(!std::isnan(m_v.ST.S)); // See GIFRegHandlerRGBAQ
+	ASSERT(!std::isnan(m_v.ST.T)); // See GIFRegHandlerRGBAQ
+	
+#ifdef Offset_ST
+	// Optional texture coordinate offset, only compiled when Offset_ST is defined.
+	GIFRegTEX0 TEX0 = m_context->TEX0;
+	m_v.ST.S -= 0.02f * m_q / (1 << TEX0.TW);
+	m_v.ST.T -= 0.02f * m_q / (1 << TEX0.TH);
+#endif
+}
+
+// Packed UV: masks the low 64 bits of `r` with the 0x00003fff constant, packs
+// the result with ps32() and stores it as the 32-bit m_v.UV value.
+void GSState::GIFPackedRegHandlerUV(const GIFPackedReg* RESTRICT r)
+{
+	GSVector4i uv = GSVector4i::loadl(r);
+
+	uv = uv & GSVector4i::x00003fff();
+
+	GSVector4i packed = uv.ps32(uv);
+
+	m_v.UV = (uint32)GSVector4i::store(packed);
+}
+
+// Same as GIFPackedRegHandlerUV, but additionally raises isPackedUV_HackFlag
+// to record that UV came from a packed register write.
+void GSState::GIFPackedRegHandlerUV_Hack(const GIFPackedReg* RESTRICT r)
+{
+	GSVector4i uv = GSVector4i::loadl(r);
+
+	uv = uv & GSVector4i::x00003fff();
+
+	GSVector4i packed = uv.ps32(uv);
+
+	m_v.UV = (uint32)GSVector4i::store(packed);
+
+	isPackedUV_HackFlag = true;
+}
+
+// Packed XYZF2: fills the second 128-bit half of the vertex (m_v.m[1]) from
+// the packed X/Y/Z/F fields plus the current m_v.UV, then kicks the vertex for
+// primitive type `prim`. With adc != 0 the kick is always called with 1;
+// otherwise it gets the register's own Skip() value.
+template<uint32 prim, uint32 adc>
+void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
+{
+	// Scalar equivalent of the vector code below:
+	/*
+	m_v.XYZ.X = r->XYZF2.X;
+	m_v.XYZ.Y = r->XYZF2.Y;
+	m_v.XYZ.Z = r->XYZF2.Z;
+	m_v.FOG = r->XYZF2.F;
+	*/
+	GSVector4i xy = GSVector4i::loadl(&r->u64[0]);
+	GSVector4i zf = GSVector4i::loadl(&r->u64[1]);
+	xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
+	zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
+
+	m_v.m[1] = xy.upl32(zf);
+
+	VertexKick<prim>(adc ? 1 : r->XYZF2.Skip());
+}
+
+// Packed XYZ2: fills m_v.m[1] from the packed X/Y/Z fields plus the data at
+// m_v.UV, then kicks the vertex for primitive type `prim`. With adc != 0 the
+// kick is always called with 1; otherwise it gets the register's Skip() value.
+template<uint32 prim, uint32 adc>
+void GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* RESTRICT r)
+{
+// Scalar equivalent of the vector code below:
+/*
+	m_v.XYZ.X = r->XYZ2.X;
+	m_v.XYZ.Y = r->XYZ2.Y;
+	m_v.XYZ.Z = r->XYZ2.Z;
+*/
+	GSVector4i xy = GSVector4i::loadl(&r->u64[0]);
+	GSVector4i z = GSVector4i::loadl(&r->u64[1]);
+	GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z);
+
+	// loadl reads 64 bits starting at m_v.UV; presumably UV and the field
+	// that follows it in the vertex -- TODO confirm against the vertex layout.
+	m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV));
+
+	VertexKick<prim>(adc ? 1 : r->XYZ2.Skip());
+}
+
+// Packed FOG: copies the fog value F of the register into the current vertex.
+void GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* RESTRICT r)
+{
+	m_v.FOG = r->FOG.F;
+}
+
+// Packed A+D: looks up the register handler selected by the ADDR field and
+// invokes it on the embedded register data.
+void GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* RESTRICT r)
+{
+	GIFRegHandler handler = m_fpGIFRegHandlers[r->A_D.ADDR];
+
+	(this->*handler)(&r->r);
+}
+
+// Packed NOP: intentionally does nothing. SetFrameSkip() also installs it in
+// place of the XYZ* handlers while frame skipping is active.
+void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r)
+{
+}
+
+// Fast path for a run of packed (STQ, RGBA, XYZF2) register triples. `size`
+// is the number of packed registers and must be a positive multiple of 3
+// (asserted). Each triple fills both halves of the vertex and kicks it for
+// primitive type `prim`.
+// NOTE(review): with size == 0 and ASSERT inactive, the final r[-3] access
+// reads before the start of the input -- confirm callers never pass 0.
+template<uint32 prim>
+void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size)
+{
+	ASSERT(size > 0 && size % 3 == 0);
+
+	const GIFPackedReg* RESTRICT r_end = r + size;
+
+	while(r < r_end)
+	{
+		// r[0] = STQ, r[1] = RGBA, r[2] = XYZF2
+		GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
+		GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
+		GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
+		/*
+		GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
+		GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
+		GSVector4i rbga = rg.upl8(ba);
+		GSVector4i rgba = rbga.upl8(rbga.zzzz());
+		*/
+		q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
+
+		m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
+
+		GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
+		GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
+		xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
+		zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
+
+		m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
+
+		VertexKick<prim>(r[2].XYZF2.Skip());
+
+		r += 3;
+	}
+
+	// r now points one triple past the last one processed. Note that this is
+	// the raw Q of the register, without the zero -> 1.0f substitution above.
+	m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
+}
+
+// Fast path for a run of packed (STQ, RGBA, XYZ2) register triples. `size` is
+// the number of packed registers and must be a positive multiple of 3
+// (asserted). Each triple fills both halves of the vertex and kicks it for
+// primitive type `prim`.
+// NOTE(review): with size == 0 and ASSERT inactive, the final r[-3] access
+// reads before the start of the input -- confirm callers never pass 0.
+template<uint32 prim>
+void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size)
+{
+	ASSERT(size > 0 && size % 3 == 0);
+
+	const GIFPackedReg* RESTRICT r_end = r + size;
+
+	while(r < r_end)
+	{
+		// r[0] = STQ, r[1] = RGBA, r[2] = XYZ2
+		GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
+		GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
+		GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
+		/*
+		GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
+		GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
+		GSVector4i rbga = rg.upl8(ba);
+		GSVector4i rgba = rbga.upl8(rbga.zzzz());
+		*/
+		q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
+
+		m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
+
+		GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
+		GSVector4i z = GSVector4i::loadl(&r[2].u64[1]);
+		GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z);
+
+		m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV)); // TODO: only store the last one
+
+		VertexKick<prim>(r[2].XYZ2.Skip());
+
+		r += 3;
+	}
+
+	// r now points one triple past the last one processed. Note that this is
+	// the raw Q of the register, without the zero -> 1.0f substitution above.
+	m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
+}
+
+// Sized overload of the packed NOP handler: ignores the whole register run.
+// SetFrameSkip() installs it for the combined STQ+RGBA+XYZ(F)2 fast paths.
+void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size)
+{
+}
+
+// GIFRegHandler*
+
+// Default A+D register handler: does nothing with `r`. ResetHandlers()
+// installs it for every register address without a dedicated handler.
+void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
+{
+	// ASSERT(0);
+}
+
+// Applies a new PRIM register value. Pending geometry is flushed first unless
+// the primitive class stays the same and no field other than PRIM changed.
+// Afterwards the context and vertex-kick handlers are refreshed and unused
+// vertices are trimmed from the end of the vertex buffer.
+__forceinline void GSState::ApplyPRIM(uint32 prim)
+{
+	// ASSERT(r->PRIM.PRIM < 7);
+
+	// NOTE: assume strips/fans are converted to lists
+	const bool same_class = GSUtil::GetPrimClass(m_env.PRIM.PRIM) == GSUtil::GetPrimClass(prim & 7);
+
+	// 0x7f8 covers all fields except PRIM
+	if(!same_class || ((m_env.PRIM.u32[0] ^ prim) & 0x7f8))
+	{
+		Flush();
+	}
+
+	m_env.PRIM.u32[0] = prim;
+	m_env.PRMODE._PRIM = prim;
+
+	UpdateContext();
+
+	UpdateVertexKick();
+
+	ASSERT(m_index.tail == 0 || m_index.buff[m_index.tail - 1] + 1 == m_vertex.next);
+
+	if(m_index.tail == 0)
+	{
+		m_vertex.next = 0;
+	}
+
+	// remove unused vertices from the end of the vertex buffer
+	m_vertex.tail = m_vertex.next;
+	m_vertex.head = m_vertex.tail;
+}
+
+// A+D PRIM: forwards the low 32 bits of the register to ApplyPRIM().
+void GSState::GIFRegHandlerPRIM(const GIFReg* RESTRICT r)
+{
+	// Presumably realigns the stack to 32 bytes for the vector code reached
+	// from here -- TODO confirm against the macro definition.
+	ALIGN_STACK(32);
+
+	ApplyPRIM(r->PRIM.u32[0]);
+}
+
+// A+D RGBAQ: stores colour and Q into the current vertex. A zero Q becomes
+// 1.0f and a NaN Q is replaced by GSVector4::m_max.
+void GSState::GIFRegHandlerRGBAQ(const GIFReg* RESTRICT r)
+{
+	GSVector4i rgbaq = (GSVector4i)r->RGBAQ;
+
+	// Replace zero lanes with 1.0f, then broadcast lane 1 (yyyy).
+	GSVector4i q = rgbaq.blend8(GSVector4i::cast(GSVector4::m_one), rgbaq == GSVector4i::zero()).yyyy(); // see GIFPackedRegHandlerSTQ
+
+	// Silent Hill output a nan in Q to emulate the flash light. Unfortunately it
+	// breaks GSVertexTrace code that rely on min/max.
+
+	q = GSVector4i::cast(GSVector4::cast(q).replace_nan(GSVector4::m_max));
+
+	m_v.RGBAQ = rgbaq.upl32(q);
+
+	// Scalar form of the NaN replacement above, kept for reference:
+	/*
+	// Silent Hill output a nan in Q to emulate the flash light. Unfortunately it
+	// breaks GSVertexTrace code that rely on min/max.
+	if (std::isnan(m_v.RGBAQ.Q))
+	{
+		m_v.RGBAQ.Q = std::numeric_limits<float>::max();
+	}
+	*/
+}
+
+// A+D ST: copies the S/T texture coordinates into the current vertex. Both
+// values are asserted not to be NaN.
+void GSState::GIFRegHandlerST(const GIFReg* RESTRICT r)
+{
+	m_v.ST = (GSVector4i)r->ST;
+
+	ASSERT(!std::isnan(m_v.ST.S)); // See GIFRegHandlerRGBAQ
+	ASSERT(!std::isnan(m_v.ST.T)); // See GIFRegHandlerRGBAQ
+
+#ifdef Offset_ST
+	// Optional texture coordinate offset, only compiled when Offset_ST is defined.
+	GIFRegTEX0 TEX0 = m_context->TEX0;
+	m_v.ST.S -= 0.02f * m_q / (1 << TEX0.TW);
+	m_v.ST.T -= 0.02f * m_q / (1 << TEX0.TH);
+#endif
+}
+
+// A+D UV: keeps the low 14 bits of each 16-bit half of the register's first
+// dword and stores the result in the current vertex.
+void GSState::GIFRegHandlerUV(const GIFReg* RESTRICT r)
+{
+	const uint32 raw = r->UV.u32[0];
+
+	m_v.UV = raw & 0x3fff3fff;
+}
+
+// Same as GIFRegHandlerUV, but additionally clears isPackedUV_HackFlag to
+// record that UV did not come from a packed register write.
+void GSState::GIFRegHandlerUV_Hack(const GIFReg* RESTRICT r)
+{
+	const uint32 raw = r->UV.u32[0];
+
+	m_v.UV = raw & 0x3fff3fff;
+
+	isPackedUV_HackFlag = false;
+}
+
+// A+D XYZF2 (and XYZF3 when adc = 1): fills m_v.m[1] with X/Y, the 24-bit Z,
+// the current UV and the fog byte, then kicks the vertex for primitive type
+// `prim`, passing `adc` to VertexKick.
+template<uint32 prim, uint32 adc>
+void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)
+{
+// Scalar equivalents of the vector code below:
+/*
+	m_v.XYZ.X = r->XYZF.X;
+	m_v.XYZ.Y = r->XYZF.Y;
+	m_v.XYZ.Z = r->XYZF.Z;
+	m_v.FOG.F = r->XYZF.F;
+*/
+	
+/*
+	m_v.XYZ.u32[0] = r->XYZF.u32[0];
+	m_v.XYZ.u32[1] = r->XYZF.u32[1] & 0x00ffffff;
+	m_v.FOG = r->XYZF.u32[1] >> 24;
+*/
+
+	GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
+	// Keep the first dword whole and the low 24 bits of the second one.
+	GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
+	// Pair the current UV with the top byte of the second dword (the fog value).
+	GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
+	
+	m_v.m[1] = xyz.upl64(uvf);
+
+	VertexKick<prim>(adc);
+}
+
+// A+D XYZ2 (and XYZ3 when adc = 1): fills m_v.m[1] from the register's XYZ
+// data and the data at m_v.UV, then kicks the vertex for primitive type
+// `prim`, passing `adc` to VertexKick.
+template<uint32 prim, uint32 adc>
+void GSState::GIFRegHandlerXYZ2(const GIFReg* RESTRICT r)
+{
+	// m_v.XYZ = (GSVector4i)r->XYZ;
+
+	// Two-pointer load; presumably the low half comes from r->XYZ and the high
+	// half from &m_v.UV -- TODO confirm in GSVector.h.
+	m_v.m[1] = GSVector4i::load(&r->XYZ, &m_v.UV);
+
+	VertexKick<prim>(adc);
+}
+
+// Applies a new TEX0 value to context `i`. Flushes pending geometry when a
+// CLUT write is needed or when relevant TEX0 bits of the active context
+// change, refreshes the cached texture offset, stores the register and, if
+// the CLUT write test passed, invalidates the CLUT source area in local
+// memory and reloads the CLUT. `TEX0` is modified in place (CPSM is masked).
+template<int i> void GSState::ApplyTEX0(GIFRegTEX0& TEX0)
+{
+	// even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing
+
+	bool wt = m_mem.m_clut.WriteTest(TEX0, m_env.TEXCLUT);
+
+	// clut loading already covered with WriteTest, for drawing only have to check CPSM and CSA (MGS3 intro skybox would be drawn piece by piece without this)
+
+	uint64 mask = 0x1f78001c3fffffffull; // TBP0 TBW PSM TW TCC TFX CPSM CSA
+
+	// && binds tighter than ||: flush on wt, or when this is the active
+	// context and any masked bit differs.
+	if(wt || PRIM->CTXT == i && ((TEX0.u64 ^ m_env.CTXT[i].TEX0.u64) & mask))
+	{
+		Flush();
+	}
+
+	TEX0.CPSM &= 0xa; // 1010b
+
+	if((TEX0.u32[0] ^ m_env.CTXT[i].TEX0.u32[0]) & 0x3ffffff) // TBP0 TBW PSM
+	{
+		m_env.CTXT[i].offset.tex = m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+	}
+
+	m_env.CTXT[i].TEX0 = (GSVector4i)TEX0;
+
+	if(wt)
+	{
+		// NOTE(review): BITBLTBUF has no initializer and only SBP/SBW/SPSM are
+		// assigned below -- presumably InvalidateLocalMem reads just those
+		// fields; confirm.
+		GIFRegBITBLTBUF BITBLTBUF;
+		GSVector4i r;
+
+		if(TEX0.CSM == 0)
+		{
+			BITBLTBUF.SBP = TEX0.CBP;
+			BITBLTBUF.SBW = 1;
+			// NOTE(review): SPSM receives TEX0.CSM (0 in this branch) rather
+			// than TEX0.CPSM -- confirm this is intended.
+			BITBLTBUF.SPSM = TEX0.CSM;
+
+			// One block of the CLUT pixel format.
+			r.left = 0;
+			r.top = 0;
+			r.right = GSLocalMemory::m_psm[TEX0.CPSM].bs.x;
+			r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].bs.y;
+
+			// Up to 4 blocks, halved for a 16 bpp CLUT format and halved
+			// again for a 4 bpp texture format.
+			int blocks = 4;
+
+			if(GSLocalMemory::m_psm[TEX0.CPSM].bpp == 16)
+			{
+				blocks >>= 1;
+			}
+
+			if(GSLocalMemory::m_psm[TEX0.PSM].bpp == 4)
+			{
+				blocks >>= 1;
+			}
+		
+			for(int j = 0; j < blocks; j++, BITBLTBUF.SBP++)
+			{
+				InvalidateLocalMem(BITBLTBUF, r, true);
+			}
+		}
+		else
+		{
+			BITBLTBUF.SBP = TEX0.CBP;
+			BITBLTBUF.SBW = m_env.TEXCLUT.CBW;
+			BITBLTBUF.SPSM = TEX0.CSM;
+
+			// A single row starting at the TEXCLUT offset, `pal` entries wide.
+			r.left = m_env.TEXCLUT.COU;
+			r.top = m_env.TEXCLUT.COV;
+			r.right = r.left + GSLocalMemory::m_psm[TEX0.CPSM].pal;
+			r.bottom = r.top + 1;
+		
+			InvalidateLocalMem(BITBLTBUF, r, true);
+		}
+
+		m_mem.m_clut.Write(m_env.CTXT[i].TEX0, m_env.TEXCLUT);
+	}
+}
+
+// A+D TEX0 for context `i`: sanitises TW/TH/TBW of a copy of the register,
+// applies it through ApplyTEX0 and, when TEX1.MTBA is set for the context,
+// derives the base pointers and widths of mipmap levels 1-3 into MIPTBP1.
+template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r)
+{
+	GIFRegTEX0 TEX0 = r->TEX0;
+
+	int tw = (int)TEX0.TW;
+	int th = (int)TEX0.TH;
+
+	// Clamp the size exponents to 10.
+	if(tw > 10) tw = 10;
+	if(th > 10) th = 10;
+
+	if(PRIM->FST)
+	{
+		// Tokyo Xtreme Racer Drift 2, TW/TH == 0
+		// Just setting the max texture size to make the texture cache allocate some surface. 
+		// The vertex trace will narrow the updated area down to the minimum, upper-left 8x8 
+		// for a single letter, but it may address the whole thing if it wants to.
+
+		if(tw == 0) tw = 10;
+		if(th == 0) th = 10;
+	}
+	else
+	{
+		// Yakuza, TW/TH == 0
+		// The minimap is drawn using solid colors, the texture is really a 1x1 white texel, 
+		// modulated by the vertex color. Cannot change the dimension because S/T are normalized.
+	}
+
+	TEX0.TW = tw;
+	TEX0.TH = th;
+
+	// An odd TBW with the PSMT8/PSMT4 formats has its lowest bit cleared.
+	if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4))
+	{
+		ASSERT(TEX0.TBW == 1); // TODO // Bouken Jidai Katsugeki Goemon
+
+		TEX0.TBW &= ~1; // GS User 2.6
+	}
+
+	ApplyTEX0<i>(TEX0);
+
+	if(m_env.CTXT[i].TEX1.MTBA)
+	{
+		// NOTE 1: TEX1.MXL must not be automatically set to 3 here.
+		// NOTE 2: Mipmap levels are tightly packed, if (tbw << 6) > (1 << tw) then the left-over space to the right is used. (common for PSM_PSMT4)
+		// NOTE 3: Non-rectangular textures are treated as rectangular when calculating the occupied space (height is extended, not sure about width)
+
+		uint32 bp = TEX0.TBP0;
+		uint32 bw = TEX0.TBW;
+		uint32 w = 1u << TEX0.TW;
+		uint32 h = 1u << TEX0.TH;
+		uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp;
+
+		if(h < w) h = w;
+
+		// Each step advances bp by the byte size of the current level in
+		// 256-byte units (rounded up), then halves bw, w and h (minimum 1).
+
+		// Level 1
+		bp += ((w * h * bpp >> 3) + 255) >> 8;
+		bw = std::max<uint32>(bw >> 1, 1);
+		w = std::max<uint32>(w >> 1, 1);
+		h = std::max<uint32>(h >> 1, 1);
+
+		m_env.CTXT[i].MIPTBP1.TBP1 = bp;
+		m_env.CTXT[i].MIPTBP1.TBW1 = bw;
+
+		// Level 2
+		bp += ((w * h * bpp >> 3) + 255) >> 8;
+		bw = std::max<uint32>(bw >> 1, 1);
+		w = std::max<uint32>(w >> 1, 1);
+		h = std::max<uint32>(h >> 1, 1);
+
+		m_env.CTXT[i].MIPTBP1.TBP2 = bp;
+		m_env.CTXT[i].MIPTBP1.TBW2 = bw;
+
+		// Level 3
+		bp += ((w * h * bpp >> 3) + 255) >> 8;
+		bw = std::max<uint32>(bw >> 1, 1);
+		w = std::max<uint32>(w >> 1, 1);
+		h = std::max<uint32>(h >> 1, 1);
+
+		m_env.CTXT[i].MIPTBP1.TBP3 = bp;
+		m_env.CTXT[i].MIPTBP1.TBW3 = bw;
+
+		// printf("MTBA\n");
+	}
+}
+
+// CLAMP write for context i; Flush() precedes the store only if context i is the one PRIM selects and the value changes.
+template<int i> void GSState::GIFRegHandlerCLAMP(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->CLAMP != m_env.CTXT[i].CLAMP) Flush();
+
+	m_env.CTXT[i].CLAMP = (GSVector4i)r->CLAMP;
+}
+// FOG write: copies the F field of the incoming register into m_v.FOG. No flush is performed.
+void GSState::GIFRegHandlerFOG(const GIFReg* RESTRICT r)
+{
+	m_v.FOG = r->FOG.F;
+}
+// Handler with an empty body: the register value r is ignored and no state is changed.
+void GSState::GIFRegHandlerNOP(const GIFReg* RESTRICT r)
+{
+}
+
+// TEX1 write for context i; Flush() precedes the store only if context i is the one PRIM selects and the value changes.
+template<int i> void GSState::GIFRegHandlerTEX1(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->TEX1 != m_env.CTXT[i].TEX1) Flush();
+
+	m_env.CTXT[i].TEX1 = (GSVector4i)r->TEX1;
+}
+
+// TEX2 handler for context i: a masked write to TEX0, used for CLUT (palette) swaps.
+// Fields taken from the incoming register: CLD, CSA, CSM, CPSM, CBP, PSM.
+// Fields kept from the context's current TEX0: TFX, TCC, TH, TW, TBW, TBP0.
+// The merged value goes through ApplyTEX0<i>(); CTXT[i].TEX2 itself is not written.
+template<int i> void GSState::GIFRegHandlerTEX2(const GIFReg* RESTRICT r)
+{
+	const uint64 tex2_bits = 0xFFFFFFE003F00000ull; // bits a TEX2 write may change
+
+	// Bits outside the mask come from the stored TEX0 ...
+	const uint64 kept = m_env.CTXT[i].TEX0.u64 & ~tex2_bits;
+	// ... and bits inside it from the new register value.
+	const uint64 incoming = r->u64 & tex2_bits;
+
+	GIFRegTEX0 merged;
+	merged.u64 = kept | incoming;
+
+	ApplyTEX0<i>(merged);
+}
+
+// XYOFFSET write for context i. The register is masked with GSVector4i::x0000ffff() first;
+// Flush() runs when the masked value differs from the stored one (PRIM->CTXT is not
+// consulted here), then the context's and this object's UpdateScissor() are called.
+template<int i> void GSState::GIFRegHandlerXYOFFSET(const GIFReg* RESTRICT r)
+{
+	GSVector4i masked = (GSVector4i)r->XYOFFSET & GSVector4i::x0000ffff();
+	const bool unchanged = masked.eq(m_env.CTXT[i].XYOFFSET);
+
+	if(!unchanged) Flush();
+
+	m_env.CTXT[i].XYOFFSET = masked;
+
+	m_env.CTXT[i].UpdateScissor();
+	UpdateScissor();
+}
+
+// PRMODECONT write: stores the AC bit and re-points PRIM accordingly.
+// AC != 0 -> PRIM points at m_env.PRIM,
+// AC == 0 -> PRIM points at m_env.PRMODE (viewed as a GIFRegPRIM).
+void GSState::GIFRegHandlerPRMODECONT(const GIFReg* RESTRICT r)
+{
+	// Flush before PRIM can be redirected.
+	if(r->PRMODECONT != m_env.PRMODECONT) Flush();
+
+	m_env.PRMODECONT.AC = r->PRMODECONT.AC;
+
+	if(m_env.PRMODECONT.AC) PRIM = &m_env.PRIM;
+	else PRIM = (GIFRegPRIM*)&m_env.PRMODE;
+
+	// Re-run the context and vertex-kick updates now that PRIM may point elsewhere.
+	UpdateContext();
+	UpdateVertexKick();
+}
+
+// PRMODE write: replaces m_env.PRMODE with the incoming value while keeping its
+// current _PRIM field. Flush() runs first only when PRMODECONT.AC is 0.
+void GSState::GIFRegHandlerPRMODE(const GIFReg* RESTRICT r)
+{
+	if(m_env.PRMODECONT.AC == 0) Flush();
+
+	// Save _PRIM across the whole-register assignment and put it back afterwards.
+	const uint32 saved_prim = m_env.PRMODE._PRIM;
+
+	m_env.PRMODE = (GSVector4i)r->PRMODE;
+	m_env.PRMODE._PRIM = saved_prim;
+
+	UpdateContext();
+	UpdateVertexKick();
+}
+
+// TEXCLUT write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerTEXCLUT(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->TEXCLUT != m_env.TEXCLUT;
+
+	if(changed) Flush();
+
+	m_env.TEXCLUT = (GSVector4i)r->TEXCLUT;
+}
+
+// SCANMSK write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerSCANMSK(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->SCANMSK != m_env.SCANMSK;
+
+	if(changed) Flush();
+
+	m_env.SCANMSK = (GSVector4i)r->SCANMSK;
+}
+
+// MIPTBP1 write for context i; Flush() precedes the store only if context i is the one PRIM selects and the value changes.
+template<int i> void GSState::GIFRegHandlerMIPTBP1(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->MIPTBP1 != m_env.CTXT[i].MIPTBP1) Flush();
+
+	m_env.CTXT[i].MIPTBP1 = (GSVector4i)r->MIPTBP1;
+}
+
+// MIPTBP2 write for context i; Flush() precedes the store only if context i is the one PRIM selects and the value changes.
+template<int i> void GSState::GIFRegHandlerMIPTBP2(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->MIPTBP2 != m_env.CTXT[i].MIPTBP2) Flush();
+
+	m_env.CTXT[i].MIPTBP2 = (GSVector4i)r->MIPTBP2;
+}
+
+// TEXA write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerTEXA(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->TEXA != m_env.TEXA;
+
+	if(changed) Flush();
+
+	m_env.TEXA = (GSVector4i)r->TEXA;
+}
+
+// FOGCOL write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->FOGCOL != m_env.FOGCOL;
+
+	if(changed) Flush();
+
+	m_env.FOGCOL = (GSVector4i)r->FOGCOL;
+}
+// TEXFLUSH write: sets the m_texflush flag; the register contents in r are not read.
+void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r)
+{
+	m_texflush = true;
+}
+
+// SCISSOR write for context i. Flush() runs first when i is the context PRIM selects and
+// the value changes; afterwards the context's and this object's UpdateScissor() are called.
+template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->SCISSOR != m_env.CTXT[i].SCISSOR) Flush();
+
+	m_env.CTXT[i].SCISSOR = (GSVector4i)r->SCISSOR;
+
+	m_env.CTXT[i].UpdateScissor();
+	UpdateScissor();
+}
+
+// ALPHA write for context i. ASSERTs that none of A/B/C/D is 3, flushes on change for the
+// selected context, stores the register and then rewrites u32[0] so a selector of 3 becomes 2.
+template<int i> void GSState::GIFRegHandlerALPHA(const GIFReg* RESTRICT r)
+{
+	ASSERT(r->ALPHA.A != 3);
+	ASSERT(r->ALPHA.B != 3);
+	ASSERT(r->ALPHA.C != 3);
+	ASSERT(r->ALPHA.D != 3);
+
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->ALPHA != m_env.CTXT[i].ALPHA) Flush();
+
+	m_env.CTXT[i].ALPHA = (GSVector4i)r->ALPHA;
+
+	const uint32 bits = m_env.CTXT[i].ALPHA.u32[0];
+	// A/B/C/D == 3 => 2: within bits 0-7, the low bit of each bit pair is cleared when its high bit is set.
+	m_env.CTXT[i].ALPHA.u32[0] = ((~bits >> 1) | 0xAA) & bits;
+}
+
+// DIMX write. A value that differs from the stored one triggers Flush() before the
+// store and m_env.UpdateDIMX() after it; an unchanged value is just stored again.
+void GSState::GIFRegHandlerDIMX(const GIFReg* RESTRICT r)
+{
+	// Evaluated up front: the assignment below overwrites the value being compared.
+	const bool changed = r->DIMX != m_env.DIMX;
+
+	if(changed)
+	{
+		Flush();
+	}
+
+	m_env.DIMX = (GSVector4i)r->DIMX;
+
+	if(!changed) return;
+
+	m_env.UpdateDIMX();
+}
+
+// DTHE write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerDTHE(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->DTHE != m_env.DTHE;
+
+	if(changed) Flush();
+
+	m_env.DTHE = (GSVector4i)r->DTHE;
+}
+
+// COLCLAMP write: Flush() on change, then store. Builds that define DISABLE_COLCLAMP
+// force the stored CLAMP field to 1 regardless of the written value.
+void GSState::GIFRegHandlerCOLCLAMP(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->COLCLAMP != m_env.COLCLAMP;
+	if(changed) Flush();
+
+	m_env.COLCLAMP = (GSVector4i)r->COLCLAMP;
+#ifdef DISABLE_COLCLAMP
+	m_env.COLCLAMP.CLAMP = 1;
+#endif
+}
+
+// TEST write for context i: Flush() when context i is selected by PRIM and the value
+// changes, then store. With DISABLE_DATE defined the stored DATE field is forced to 0.
+template<int i> void GSState::GIFRegHandlerTEST(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+	if(selected && r->TEST != m_env.CTXT[i].TEST) Flush();
+
+	m_env.CTXT[i].TEST = (GSVector4i)r->TEST;
+#ifdef DISABLE_DATE
+	m_env.CTXT[i].TEST.DATE = 0;
+#endif
+}
+
+// PABE write: Flush() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerPABE(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->PABE != m_env.PABE;
+
+	if(changed) Flush();
+
+	m_env.PABE = (GSVector4i)r->PABE;
+}
+
+// FBA write for context i; Flush() precedes the store only if context i is the one PRIM selects and the value changes.
+template<int i> void GSState::GIFRegHandlerFBA(const GIFReg* RESTRICT r)
+{
+	const bool selected = PRIM->CTXT == i;
+
+	if(selected && r->FBA != m_env.CTXT[i].FBA) Flush();
+
+	m_env.CTXT[i].FBA = (GSVector4i)r->FBA;
+}
+// FRAME write for context i: flushes on change (selected context only), refreshes the context's cached offset entries when FBP/FBW/PSM differ, then stores the register.
+template<int i> void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r)
+{
+	if(PRIM->CTXT == i && r->FRAME != m_env.CTXT[i].FRAME)
+	{
+		Flush();
+	}
+	// Old and new values are XORed; only a difference inside the masked bits triggers the refresh, which pairs the new FRAME with the context's current ZBUF.
+	if((m_env.CTXT[i].FRAME.u32[0] ^ r->FRAME.u32[0]) & 0x3f3f01ff) // FBP FBW PSM
+	{
+		m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM);
+		m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
+		m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF);
+		m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
+	}
+	// Stored last: both comparisons above need the previous value.
+	m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME;
+	// NOTE(review): with DISABLE_BITMASKING, FBMSK is replaced by the result of eq8 against all-ones - presumably only fully set bytes stay masked; confirm against GSVector4i::eq8.
+#ifdef DISABLE_BITMASKING
+	m_env.CTXT[i].FRAME.FBMSK = GSVector4i::store(GSVector4i::load((int)m_env.CTXT[i].FRAME.FBMSK).eq8(GSVector4i::xffffffff()));
+#endif
+}
+// ZBUF write for context i: normalises PSM to one of the four Z formats, flushes on change (selected context only), refreshes the Z-related cached offsets when ZBP/PSM differ, then stores the normalised register.
+template<int i> void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r)
+{
+	GIFRegZBUF ZBUF = r->ZBUF;
+
+	if(ZBUF.u32[0] == 0)
+	{
+		// during startup all regs are cleared to 0 (by the bios or something), so we mask z until this register becomes valid
+		// edit: breaks Grandia Xtreme and sounds like a bad idea generally. What was the intend?
+		// edit2: should be set only before any serious drawing happens, grandia extreme nulls out this register throughout the whole game, 
+		//        I already forgot what it fixed, that game never masked the zbuffer, but assumed it was set by default
+		//ZBUF.ZMSK = 1;
+	}
+	// Force bits 4 and 5 of PSM on; any result that is still not one of the four Z formats falls back to PSM_PSMZ32.
+	ZBUF.PSM |= 0x30;
+
+	if(ZBUF.PSM != PSM_PSMZ32
+	&& ZBUF.PSM != PSM_PSMZ24
+	&& ZBUF.PSM != PSM_PSMZ16
+	&& ZBUF.PSM != PSM_PSMZ16S)
+	{
+		ZBUF.PSM = PSM_PSMZ32;
+	}
+	// The change test uses the normalised local copy, not the raw register value.
+	if(PRIM->CTXT == i && ZBUF != m_env.CTXT[i].ZBUF)
+	{
+		Flush();
+	}
+	// Only a difference inside the masked bits triggers the refresh, which pairs the new ZBUF with the context's current FRAME.
+	if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM
+	{
+		m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM);
+		m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF);
+		m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
+	}
+
+	m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF;
+}
+
+// BITBLTBUF write: FlushWrite() on change, store, then clear bit 0 of SBW / DBW when the
+// matching pixel format is PSM_PSMT8 or PSM_PSMT4.
+void GSState::GIFRegHandlerBITBLTBUF(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->BITBLTBUF != m_env.BITBLTBUF;
+
+	if(changed) FlushWrite();
+
+	m_env.BITBLTBUF = (GSVector4i)r->BITBLTBUF;
+
+	// The width fix-up below only applies to these two indexed formats.
+	const bool src_indexed = m_env.BITBLTBUF.SPSM == PSM_PSMT8 || m_env.BITBLTBUF.SPSM == PSM_PSMT4;
+	const bool dst_indexed = m_env.BITBLTBUF.DPSM == PSM_PSMT8 || m_env.BITBLTBUF.DPSM == PSM_PSMT4;
+
+	if(src_indexed && (m_env.BITBLTBUF.SBW & 1)) m_env.BITBLTBUF.SBW &= ~1;
+
+	// namcoXcapcom: 5, 11, referred to as 4, 10 in TEX0.TBW later
+	if(dst_indexed && (m_env.BITBLTBUF.DBW & 1)) m_env.BITBLTBUF.DBW &= ~1;
+}
+
+// TRXPOS write: FlushWrite() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerTRXPOS(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->TRXPOS != m_env.TRXPOS;
+
+	if(changed) FlushWrite();
+
+	m_env.TRXPOS = (GSVector4i)r->TRXPOS;
+}
+
+// TRXREG write: FlushWrite() is called when the incoming value differs from the stored one, then the register is stored.
+void GSState::GIFRegHandlerTRXREG(const GIFReg* RESTRICT r)
+{
+	const bool changed = r->TRXREG != m_env.TRXREG;
+
+	if(changed) FlushWrite();
+
+	m_env.TRXREG = (GSVector4i)r->TRXREG;
+}
+// TRXDIR write: always calls Flush(), stores the register and then starts the transfer selected by XDIR.
+void GSState::GIFRegHandlerTRXDIR(const GIFReg* RESTRICT r)
+{
+	Flush();
+	// There is no change test here: every write flushes and restarts a transfer.
+	m_env.TRXDIR = (GSVector4i)r->TRXDIR;
+	// XDIR 0 and 1 only initialise m_tr with the destination / source start coordinate; XDIR 2 performs the copy right away through Move().
+	switch(m_env.TRXDIR.XDIR)
+	{
+	case 0: // host -> local
+		m_tr.Init(m_env.TRXPOS.DSAX, m_env.TRXPOS.DSAY);
+		break;
+	case 1: // local -> host
+		m_tr.Init(m_env.TRXPOS.SSAX, m_env.TRXPOS.SSAY);
+		break;
+	case 2: // local -> local
+		Move();
+		break;
+	case 3:
+		ASSERT(0);
+		break;
+	default:
+		__assume(0);
+	}
+}
+// HWREG write: passes the 8 bytes of the register to Write(); the ASSERT expects a host => local transfer (XDIR == 0) to be selected.
+void GSState::GIFRegHandlerHWREG(const GIFReg* RESTRICT r)
+{
+	ASSERT(m_env.TRXDIR.XDIR == 0); // host => local
+	// r is reinterpreted as a raw byte pointer and exactly 8 bytes are handed on.
+	Write((uint8*)r, 8); // haunting ground
+}
+// SIGNAL write: replaces the SIGLBLID.SIGID bits selected by IDMSK with the matching bits of ID, then updates CSR and may call the m_irq callback.
+void GSState::GIFRegHandlerSIGNAL(const GIFReg* RESTRICT r)
+{
+	m_regs->SIGLBLID.SIGID = (m_regs->SIGLBLID.SIGID & ~r->SIGNAL.IDMSK) | (r->SIGNAL.ID & r->SIGNAL.IDMSK);
+	// rSIGNAL is set only when wSIGNAL is non-zero; m_irq is called only when it is non-null and IMR.SIGMSK is 0.
+	if(m_regs->CSR.wSIGNAL) m_regs->CSR.rSIGNAL = 1;
+	if(!m_regs->IMR.SIGMSK && m_irq) m_irq();
+}
+// FINISH write: sets CSR.rFINISH when CSR.wFINISH is non-zero, and calls m_irq when it is non-null and IMR.FINISHMSK is 0. The register value r is not read.
+void GSState::GIFRegHandlerFINISH(const GIFReg* RESTRICT r)
+{
+	if(m_regs->CSR.wFINISH) m_regs->CSR.rFINISH = 1;
+	if(!m_regs->IMR.FINISHMSK && m_irq) m_irq();
+}
+// LABEL write: replaces the SIGLBLID.LBLID bits selected by IDMSK with the matching bits of ID; all other LBLID bits are kept.
+void GSState::GIFRegHandlerLABEL(const GIFReg* RESTRICT r)
+{
+	m_regs->SIGLBLID.LBLID = (m_regs->SIGLBLID.LBLID & ~r->LABEL.IDMSK) | (r->LABEL.ID & r->LABEL.IDMSK);
+}
+
+//
+// Runs both flush stages in a fixed order: FlushWrite() first, then FlushPrim().
+void GSState::Flush()
+{
+	FlushWrite();
+
+	FlushPrim();
+}
+// Writes the bytes buffered in m_tr.buff between m_tr.start and m_tr.end to local memory using the destination format's image writer; returns immediately when that range is empty.
+void GSState::FlushWrite()
+{
+	int len = m_tr.end - m_tr.start;
+	// Nothing is pending when end has not moved past start.
+	if(len <= 0) return;
+
+	GSVector4i r;
+	// The invalidated rectangle is the whole transfer area (DSAX/DSAY origin, RRW x RRH size), not only the part covered by len.
+	r.left = m_env.TRXPOS.DSAX;
+	r.top = m_env.TRXPOS.DSAY;
+	r.right = r.left + m_env.TRXREG.RRW;
+	r.bottom = r.top + m_env.TRXREG.RRH;
+
+	InvalidateVideoMem(m_env.BITBLTBUF, r);
+	
+	//int y = m_tr.y;
+
+	GSLocalMemory::writeImage wi = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM].wi;
+	// wi is a pointer to a GSLocalMemory member function chosen by DPSM; it is invoked on m_mem with the pending bytes.
+	(m_mem.*wi)(m_tr.x, m_tr.y, &m_tr.buff[m_tr.start], len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
+	// Mark the bytes as written so a second call does not write them again.
+	m_tr.start += len;
+
+	m_perfmon.Put(GSPerfMon::Swizzle, len);
+
+	/*
+	GSVector4i r;
+
+	r.left = m_env.TRXPOS.DSAX;
+	r.top = y;
+	r.right = r.left + m_env.TRXREG.RRW;
+	r.bottom = std::min<int>(r.top + m_env.TRXREG.RRH, m_tr.x == r.left ? m_tr.y : m_tr.y + 1);
+
+	InvalidateVideoMem(m_env.BITBLTBUF, r);
+	*/
+/*
+	static int n = 0;
+	string s;
+	s = format("c:\\temp1\\[%04d]_%05x_%d_%d_%d_%d_%d_%d.bmp",
+		n++, (int)m_env.BITBLTBUF.DBP, (int)m_env.BITBLTBUF.DBW, (int)m_env.BITBLTBUF.DPSM,
+		r.left, r.top, r.right, r.bottom);
+	m_mem.SaveBMP(s, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, r.right, r.bottom);
+*/
+}
+// Draws the queued primitives (if m_index.tail > 0) and resets the index/vertex queues, carrying over the vertices between head and tail that the next primitive still needs.
+void GSState::FlushPrim()
+{
+	if(m_index.tail > 0)
+	{
+		GSVertex buff[2];
+		// Snapshot the queue positions; buff has room for two carried-over vertices and unused counts how many are saved.
+		size_t head = m_vertex.head;
+		size_t tail = m_vertex.tail;
+		size_t next = m_vertex.next;
+		size_t unused = 0;
+
+		if(tail > head)
+		{
+			switch(PRIM->PRIM)
+			{
+			case GS_POINTLIST:
+				ASSERT(0);
+				break;
+			case GS_LINELIST:
+			case GS_LINESTRIP:
+			case GS_SPRITE:
+			case GS_TRIANGLELIST:
+			case GS_TRIANGLESTRIP:
+				unused = tail - head;
+				memcpy(buff, &m_vertex.buff[head], sizeof(GSVertex) * unused);
+				break;
+			case GS_TRIANGLEFAN:
+				buff[0] = m_vertex.buff[head]; unused = 1;
+				if(tail - 1 > head) {buff[1] = m_vertex.buff[tail - 1]; unused = 2;}
+				break;
+			case GS_INVALID:
+				break;
+			default:
+				__assume(0);
+			}
+				
+			ASSERT((int)unused < GSUtil::GetVertexCount(PRIM->PRIM));
+		}
+		// The draw is skipped unless both the frame and the Z buffer format have fmt < 3 in the m_psm table.
+		if(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt < 3 && GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt < 3)
+		{
+			// FIXME: berserk fpsm = 27 (8H)
+
+			m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
+
+			Draw();
+
+			m_perfmon.Put(GSPerfMon::Draw, 1);
+			m_perfmon.Put(GSPerfMon::Prim, m_index.tail / GSUtil::GetVertexCount(PRIM->PRIM));
+		}
+		// The queues are emptied whether or not Draw() ran.
+		m_index.tail = 0;
+
+		m_vertex.head = 0;
+		// Saved vertices go back to the front of the buffer; next is rebased relative to the old head (0 if it was not past it).
+		if(unused > 0)
+		{
+			memcpy(m_vertex.buff, buff, sizeof(GSVertex) * unused);
+
+			m_vertex.tail = unused;
+			m_vertex.next = next > head ? next - head : 0;
+		}
+		else
+		{
+			m_vertex.tail = 0;
+			m_vertex.next = 0;
+		}
+	}
+}
+
+//
+// Accepts len bytes of host->local image data from mem: written straight to local memory when the whole transfer arrives in one call, otherwise appended to m_tr.buff and flushed once the buffer holds m_tr.total bytes.
+void GSState::Write(const uint8* mem, int len)
+{
+	int w = m_env.TRXREG.RRW;
+	int h = m_env.TRXREG.RRH;
+
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
+
+	// printf("Write len=%d DBP=%05x DBW=%d DPSM=%d DSAX=%d DSAY=%d RRW=%d RRH=%d\n", len, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, m_env.TRXPOS.DSAX, m_env.TRXPOS.DSAY, m_env.TRXREG.RRW, m_env.TRXREG.RRH);
+	// NOTE(review): m_tr.Update() is defined elsewhere; a false result makes this call a no-op (the CLUT is not invalidated either). Whether it adjusts len is not visible here - confirm.
+	if(!m_tr.Update(w, h, psm.trbpp, len))
+	{
+		return;
+	}
+
+	GL_CACHE("Write! ...  => 0x%x W:%d F:%d (DIR %d%d), dPos(%d %d) size(%d %d)",
+		m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM,
+		m_env.TRXPOS.DIRX, m_env.TRXPOS.DIRY,
+		m_env.TRXPOS.DSAX, m_env.TRXPOS.DSAY, w, h);
+	// Queued primitives are drawn first when texturing is enabled and the destination base equals the current context's texture or CLUT base pointer.
+	if(PRIM->TME && (m_env.BITBLTBUF.DBP == m_context->TEX0.TBP0 || m_env.BITBLTBUF.DBP == m_context->TEX0.CBP)) // TODO: hmmmm
+	{
+		FlushPrim();
+	}
+	// Fast path: nothing buffered yet and this chunk alone covers the whole transfer.
+	if(m_tr.end == 0 && len >= m_tr.total)
+	{
+		// received all data in one piece, no need to buffer it
+
+		// printf("%d >= %d\n", len, m_tr.total);
+
+		GSVector4i r;
+
+		r.left = m_env.TRXPOS.DSAX;
+		r.top = m_env.TRXPOS.DSAY;
+		r.right = r.left + m_env.TRXREG.RRW;
+		r.bottom = r.top + m_env.TRXREG.RRH;
+
+		InvalidateVideoMem(m_env.BITBLTBUF, r);
+		// Only m_tr.total bytes are written even if len is larger.
+		(m_mem.*psm.wi)(m_tr.x, m_tr.y, mem, m_tr.total, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
+		// start == end == total marks the transfer as complete with nothing left to flush.
+		m_tr.start = m_tr.end = m_tr.total;
+
+		m_perfmon.Put(GSPerfMon::Swizzle, len);
+
+		/*
+		static int n = 0;
+		string s;
+		s = format("c:\\temp1\\[%04d]_%05x_%d_%d_%d_%d_%d_%d.bmp",
+			n++, (int)m_env.BITBLTBUF.DBP, (int)m_env.BITBLTBUF.DBW, (int)m_env.BITBLTBUF.DPSM,
+			r.left, r.top, r.right, r.bottom);
+		m_mem.SaveBMP(s, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, r.right, r.bottom);
+		*/
+	}
+	else
+	{
+		// printf("%d += %d (%d)\n", m_tr.end, len, m_tr.total);
+		// Slow path: append the chunk to the staging buffer.
+		memcpy(&m_tr.buff[m_tr.end], mem, len);
+
+		m_tr.end += len;
+		// Flush as soon as the buffer holds the complete transfer.
+		if(m_tr.end >= m_tr.total)
+		{
+			FlushWrite();
+		}
+	}
+	// Reached on both paths: the CLUT is invalidated after every accepted chunk.
+	m_mem.m_clut.Invalidate();
+}
+
+// Prepares a local->host read of the rectangle described by TRXPOS/TRXREG. Calling it
+// also sets m_init_read_fifo_supported, after which Read() no longer performs its own
+// InvalidateLocalMem() call. mem is not used; nothing happens for len <= 0.
+void GSState::InitReadFIFO(uint8* mem, int len)
+{
+	if(len <= 0) return;
+
+	// Allow to keep compatibility with older PCSX2
+	m_init_read_fifo_supported = true;
+
+	int left = m_env.TRXPOS.SSAX;
+	int top = m_env.TRXPOS.SSAY;
+	int width = m_env.TRXREG.RRW;
+	int height = m_env.TRXREG.RRH;
+
+	const bool proceed = m_tr.Update(width, height, GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].trbpp, len);
+
+	// Invalidate only while the transfer position is still at the source origin.
+	if(proceed && m_tr.x == left && m_tr.y == top)
+	{
+		const GSVector4i rect(left, top, left + width, top + height);
+		InvalidateLocalMem(m_env.BITBLTBUF, rect);
+	}
+}
+
+// Fetches data of the current local->host transfer into mem through m_mem.ReadImageX().
+// Nothing happens for len <= 0 or when m_tr.Update() returns false. While
+// m_init_read_fifo_supported is false the source rectangle is invalidated here, but only
+// if the transfer position is still at the source origin (SSAX, SSAY).
+void GSState::Read(uint8* mem, int len)
+{
+	if(len <= 0) return;
+
+	int left = m_env.TRXPOS.SSAX;
+	int top = m_env.TRXPOS.SSAY;
+	int width = m_env.TRXREG.RRW;
+	int height = m_env.TRXREG.RRH;
+
+	if(!m_tr.Update(width, height, GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].trbpp, len)) return;
+
+	const bool at_origin = m_tr.x == left && m_tr.y == top;
+
+	if(!m_init_read_fifo_supported && at_origin)
+	{
+		const GSVector4i rect(left, top, left + width, top + height);
+
+		InvalidateLocalMem(m_env.BITBLTBUF, rect);
+	}
+
+	m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
+}
+// Local->local transfer: copies the RRW x RRH rectangle at (SSAX, SSAY) of the source buffer to (DSAX, DSAY) of the destination buffer, pixel by pixel, with a dedicated loop per format combination.
+void GSState::Move()
+{
+	// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
+	// guitar hero copies the far end of the board to do a similar blend too
+
+	int sx = m_env.TRXPOS.SSAX;
+	int sy = m_env.TRXPOS.SSAY;
+	int dx = m_env.TRXPOS.DSAX;
+	int dy = m_env.TRXPOS.DSAY;
+	int w = m_env.TRXREG.RRW;
+	int h = m_env.TRXREG.RRH;
+
+	GL_CACHE("Move! 0x%x W:%d F:%d => 0x%x W:%d F:%d (DIR %d%d), sPos(%d %d) dPos(%d %d) size(%d %d)",
+		m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM,
+		m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM,
+		m_env.TRXPOS.DIRX, m_env.TRXPOS.DIRY,
+		sx, sy, dx, dy, w, h);
+	// Both invalidations use the unmodified top-left coordinates, before any direction adjustment below.
+	InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
+	InvalidateVideoMem(m_env.BITBLTBUF, GSVector4i(dx, dy, dx + w, dy + h));
+
+	int xinc = 1;
+	int yinc = 1;
+	// A set DIRX / DIRY bit reverses the walk on that axis: start at the last column / row and step by -1.
+	if(m_env.TRXPOS.DIRX) {sx += w - 1; dx += w - 1; xinc = -1;}
+	if(m_env.TRXPOS.DIRY) {sy += h - 1; dy += h - 1; yinc = -1;}
+/*
+	printf("%05x %d %d => %05x %d %d (%d%d), %d %d %d %d %d %d\n",
+		m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM,
+		m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM,
+		m_env.TRXPOS.DIRX, m_env.TRXPOS.DIRY,
+		sx, sy, dx, dy, w, h);
+*/
+/*
+	GSLocalMemory::readPixel rp = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].rp;
+	GSLocalMemory::writePixel wp = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM].wp;
+
+	for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
+		for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
+			(m_mem.*wp)(dx, dy, (m_mem.*rp)(sx, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW), m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
+*/
+
+	const GSLocalMemory::psm_t& spsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM];
+	const GSLocalMemory::psm_t& dpsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
+
+	// TODO: unroll inner loops (width has special size requirement, must be multiples of 1 << n, depending on the format)
+	// Offset tables for both buffers: pixel.row[y] plus a pixel.col[...][x] entry gives the index used for pixel (x, y).
+	GSOffset* RESTRICT spo = m_mem.GetOffset(m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM);
+	GSOffset* RESTRICT dpo = m_mem.GetOffset(m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM);
+
+	if(spsm.trbpp == dpsm.trbpp && spsm.trbpp >= 16)
+	{
+		int* RESTRICT scol = &spo->pixel.col[0][sx];
+		int* RESTRICT dcol = &dpo->pixel.col[0][dx];
+		// This branch uses column table 0 for every row, so scol/dcol are set up once; x runs upwards from 0, or downwards to -(w - 1) when the X direction is reversed.
+		if(spsm.trbpp == 32)
+		{
+			if(xinc > 0)
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint32* RESTRICT s = &m_mem.m_vm32[spo->pixel.row[sy]];
+					uint32* RESTRICT d = &m_mem.m_vm32[dpo->pixel.row[dy]];
+
+					for(int x = 0; x < w; x++) d[dcol[x]] = s[scol[x]];
+				}
+			}
+			else
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint32* RESTRICT s = &m_mem.m_vm32[spo->pixel.row[sy]];
+					uint32* RESTRICT d = &m_mem.m_vm32[dpo->pixel.row[dy]];
+
+					for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
+				}
+			}
+		}
+		else if(spsm.trbpp == 24)
+		{
+			if(xinc > 0)
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint32* RESTRICT s = &m_mem.m_vm32[spo->pixel.row[sy]];
+					uint32* RESTRICT d = &m_mem.m_vm32[dpo->pixel.row[dy]];
+					// 24-bit copy: the destination keeps its top byte, the low 24 bits come from the source.
+					for(int x = 0; x < w; x++) d[dcol[x]] = (d[dcol[x]] & 0xff000000) | (s[scol[x]] & 0x00ffffff);
+				}
+			}
+			else
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint32* RESTRICT s = &m_mem.m_vm32[spo->pixel.row[sy]];
+					uint32* RESTRICT d = &m_mem.m_vm32[dpo->pixel.row[dy]];
+
+					for(int x = 0; x > -w; x--) d[dcol[x]] = (d[dcol[x]] & 0xff000000) | (s[scol[x]] & 0x00ffffff);
+				}
+			}
+		}
+		else // if(spsm.trbpp == 16)
+		{
+			if(xinc > 0)
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint16* RESTRICT s = &m_mem.m_vm16[spo->pixel.row[sy]];
+					uint16* RESTRICT d = &m_mem.m_vm16[dpo->pixel.row[dy]];
+
+					for(int x = 0; x < w; x++) d[dcol[x]] = s[scol[x]];
+				}
+			}
+			else
+			{
+				for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+				{
+					uint16* RESTRICT s = &m_mem.m_vm16[spo->pixel.row[sy]];
+					uint16* RESTRICT d = &m_mem.m_vm16[dpo->pixel.row[dy]];
+
+					for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
+				}
+			}
+		}
+	}
+	else if(m_env.BITBLTBUF.SPSM == PSM_PSMT8 && m_env.BITBLTBUF.DPSM == PSM_PSMT8)
+	{
+		if(xinc > 0)
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint8* RESTRICT s = &m_mem.m_vm8[spo->pixel.row[sy]];
+				uint8* RESTRICT d = &m_mem.m_vm8[dpo->pixel.row[dy]];
+				// From here on the column table is selected per row by the low three bits of y.
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+
+				for(int x = 0; x < w; x++) d[dcol[x]] = s[scol[x]];
+			}
+		}
+		else
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint8* RESTRICT s = &m_mem.m_vm8[spo->pixel.row[sy]];
+				uint8* RESTRICT d = &m_mem.m_vm8[dpo->pixel.row[dy]];
+
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+
+				for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
+			}
+		}
+	}
+	else if(m_env.BITBLTBUF.SPSM == PSM_PSMT4 && m_env.BITBLTBUF.DPSM == PSM_PSMT4)
+	{
+		if(xinc > 0)
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint32 sbase = spo->pixel.row[sy];
+				uint32 dbase = dpo->pixel.row[dy];
+
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+				// 4-bit pixels are moved through the ReadPixel4 / WritePixel4 helpers instead of direct array access.
+				for(int x = 0; x < w; x++) m_mem.WritePixel4(dbase + dcol[x], m_mem.ReadPixel4(sbase + scol[x]));
+			}
+		}
+		else
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint32 sbase = spo->pixel.row[sy];
+				uint32 dbase = dpo->pixel.row[dy];
+
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+
+				for(int x = 0; x > -w; x--) m_mem.WritePixel4(dbase + dcol[x], m_mem.ReadPixel4(sbase + scol[x]));
+			}
+		}
+	}
+	else
+	{
+		if(xinc > 0)
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint32 sbase = spo->pixel.row[sy];
+				uint32 dbase = dpo->pixel.row[dy];
+
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+				// Generic fallback for all remaining format pairs: the source format's rpa reader feeds the destination format's wpa writer.
+				for(int x = 0; x < w; x++) (m_mem.*dpsm.wpa)(dbase + dcol[x], (m_mem.*spsm.rpa)(sbase + scol[x]));
+			}
+		}
+		else
+		{
+			for(int y = 0; y < h; y++, sy += yinc, dy += yinc)
+			{
+				uint32 sbase = spo->pixel.row[sy];
+				uint32 dbase = dpo->pixel.row[dy];
+
+				int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
+				int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
+
+				for(int x = 0; x > -w; x--) (m_mem.*dpsm.wpa)(dbase + dcol[x], (m_mem.*spsm.rpa)(sbase + scol[x]));
+			}
+		}
+	}
+}
+
+// Zeroes the GIF paths selected by mask (bit 0: paths 0 and 3, bit 1: path 1, bit 2: path 2),
+// then always sets TRXDIR.XDIR to 3 (used as an invalid value) and m_q to 1.0f.
+void GSState::SoftReset(uint32 mask)
+{
+	const bool reset_path[4] = {(mask & 1) != 0, (mask & 2) != 0, (mask & 4) != 0, (mask & 1) != 0};
+
+	for(int p = 0; p < 4; p++)
+	{
+		if(reset_path[p]) memset(&m_path[p], 0, sizeof(GIFPath));
+	}
+
+	m_env.TRXDIR.XDIR = 3; //-1 ; set it to invalid value
+
+	m_q = 1.0f;
+}
+
+// Read-back entry point: flushes pending work, multiplies size by 16 and passes the
+// result to Read() and, when m_dump tests true, to m_dump.ReadFIFO().
+void GSState::ReadFIFO(uint8* mem, int size)
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	// Finish pending uploads and draws before anything is read.
+	Flush();
+
+	const int scaled = size * 16;
+
+	Read(mem, scaled);
+
+	if(m_dump) m_dump.ReadFIFO(scaled);
+}
+// Explicit instantiations of GSState::Transfer for index 0 to 3; the template itself is defined further down in this file.
+template void GSState::Transfer<0>(const uint8* mem, uint32 size);
+template void GSState::Transfer<1>(const uint8* mem, uint32 size);
+template void GSState::Transfer<2>(const uint8* mem, uint32 size);
+template void GSState::Transfer<3>(const uint8* mem, uint32 size);
+
+// Consumes up to `size` 16-byte units of GIF data from mem for path `index`, continuing
+// from the tag state stored in m_path[index]. PACKED and REGLIST data is dispatched through
+// the register handler tables, IMAGE data is routed according to TRXDIR.XDIR. For index 0
+// parsing stops once a packet whose tag has EOP set is complete. Everything consumed is
+// handed to m_dump when it tests true.
+template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
+{
+	GSPerfMonAutoTimer pmat(&m_perfmon);
+
+	const uint8* start = mem;
+
+	GIFPath& path = m_path[index];
+
+	while(size > 0)
+	{
+		if(path.nloop == 0)
+		{
+			path.SetTag(mem);
+
+			mem += sizeof(GIFTag);
+			size--;
+
+			if(path.nloop > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded."
+			{
+				m_q = 1.0f;
+
+				// ASSERT(!(path.tag.PRE && path.tag.FLG == GIF_FLG_REGLIST)); // kingdom hearts
+
+				if(path.tag.PRE && path.tag.FLG == GIF_FLG_PACKED)
+				{
+					ApplyPRIM(path.tag.PRIM);
+				}
+			}
+		}
+		else
+		{
+			uint32 total;
+
+			switch(path.tag.FLG)
+			{
+			case GIF_FLG_PACKED:
+
+				// get to the start of the loop
+
+				if(path.reg != 0)
+				{
+					do
+					{
+						(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
+
+						mem += sizeof(GIFPackedReg);
+						size--;
+					}
+					while(path.StepReg() && size > 0 && path.reg != 0);
+				}
+
+				// The catch-up above can complete the packet (nloop == 0) or use up the input (size == 0).
+				// Stop here in both cases: the code below always processes at least one more register.
+				if(path.nloop == 0 || size == 0) break;
+
+				// all data available? usually is
+
+				total = path.nloop * path.nreg;
+
+				if(size >= total)
+				{
+					size -= total;
+
+					switch(path.type)
+					{
+					case GIFPath::TYPE_UNKNOWN:
+
+						{
+							uint32 reg = 0;
+
+							do
+							{
+								(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
+
+								mem += sizeof(GIFPackedReg);
+
+								reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
+							}
+							while(--total > 0);
+						}
+
+						break;
+
+					case GIFPath::TYPE_ADONLY: // very common
+
+						do
+						{
+							(this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
+
+							mem += sizeof(GIFPackedReg);
+						}
+						while(--total > 0);
+
+						break;
+
+					case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this
+
+						(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total);
+
+						mem += total * sizeof(GIFPackedReg);
+
+						break;
+
+					case GIFPath::TYPE_STQRGBAXYZ2:
+
+						(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2])((GIFPackedReg*)mem, total);
+
+						mem += total * sizeof(GIFPackedReg);
+
+						break;
+
+					default:
+
+						__assume(0);
+					}
+
+					path.nloop = 0;
+				}
+				else
+				{
+					do
+					{
+						(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
+
+						mem += sizeof(GIFPackedReg);
+						size--;
+					}
+					while(path.StepReg() && size > 0);
+				}
+
+				break;
+
+			case GIF_FLG_REGLIST:
+
+				// TODO: do it similar to packed operation
+
+				size *= 2;
+
+				do
+				{
+					(this->*m_fpGIFRegHandlers[path.GetReg()])((GIFReg*)mem);
+
+					mem += sizeof(GIFReg);
+					size--;
+				}
+				while(path.StepReg() && size > 0);
+
+				if(size & 1) mem += sizeof(GIFReg);
+
+				size /= 2;
+
+				break;
+
+			case GIF_FLG_IMAGE2: // hmmm // Fall through here fixes a crash in Wallace and Gromit Project Zoo
+				// and according to Pseudonym we shouldn't even land in this code. So hmm indeed. (rama)
+			case GIF_FLG_IMAGE:
+
+				{
+					int len = (int)min(size, path.nloop);
+
+					switch(m_env.TRXDIR.XDIR)
+					{
+					case 0:
+						Write(mem, len * 16);
+						break;
+					case 1:
+						// This can't happen; downloads can not be started or performed as part of
+						// a GIFtag operation.  They're an entirely separate process that can only be
+						// done through the ReverseFIFO transfer (aka ReadFIFO). --air
+						ASSERT(0);
+						//Read(mem, len * 16);
+						break;
+					case 2:
+						Move();
+						break;
+					case 3:
+						ASSERT(0);
+						break;
+					default:
+						__assume(0);
+					}
+
+					mem += len * 16;
+					path.nloop -= len;
+					size -= len;
+				}
+
+				break;
+
+			default:
+				__assume(0);
+			}
+		}
+
+		if(index == 0)
+		{
+			if(path.tag.EOP && path.nloop == 0)
+			{
+				break;
+			}
+		}
+	}
+
+	if(m_dump && mem > start)
+	{
+		m_dump.Transfer(index, start, mem - start);
+	}
+
+	if(index == 0)
+	{
+		if(size == 0 && path.nloop > 0)
+		{
+			if(m_mt)
+			{
+				// Hackfix for BIOS, which sends an incomplete packet when it does an XGKICK without
+				// having an EOP specified anywhere in VU1 memory.  Needed until PCSX2 is fixed to
+				// handle it more properly (ie, without looping infinitely).
+
+				path.nloop = 0;
+			}
+			else
+			{
+				// Unused in 0.9.7 and above, but might as well keep this for now; allows GSdx
+				// to work with legacy editions of PCSX2.
+
+				Transfer<0>(mem - 0x4000, 0x4000 / 16);
+			}
+		}
+	}
+}
+// Serialisation helper: copies len bytes (sizeof(T) by default) from src to dst with memcpy and advances dst, which is passed by reference, by len.
+template<class T> static void WriteState(uint8*& dst, T* src, size_t len = sizeof(T))
+{
+	memcpy(dst, src, len);
+	dst += len;
+}
+// Deserialisation helper: copies len bytes (sizeof(T) by default) from src to dst with memcpy and advances src, which is passed by reference, by len.
+template<class T> static void ReadState(T* dst, uint8*& src, size_t len = sizeof(T))
+{
+	memcpy(dst, src, len);
+	src += len;
+}
+
+// Serialises the GS state into fd->data, or only reports the required size in fd->size
+// when sizeonly is true. Returns 0 on success and -1 when fd is null, fd->data is null
+// or fd->size is smaller than m_sssize. Pending work is flushed before anything is written.
+int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
+{
+	if(!fd) return -1; // reject a null descriptor, as Defrost() does
+	if(sizeonly)
+	{
+		fd->size = m_sssize;
+		return 0;
+	}
+
+	if(!fd->data || fd->size < m_sssize) return -1;
+
+	Flush();
+	uint8* data = fd->data;
+
+	WriteState(data, &m_version);
+	WriteState(data, &m_env.PRIM);
+	WriteState(data, &m_env.PRMODE);
+	WriteState(data, &m_env.PRMODECONT);
+	WriteState(data, &m_env.TEXCLUT);
+	WriteState(data, &m_env.SCANMSK);
+	WriteState(data, &m_env.TEXA);
+	WriteState(data, &m_env.FOGCOL);
+	WriteState(data, &m_env.DIMX);
+	WriteState(data, &m_env.DTHE);
+	WriteState(data, &m_env.COLCLAMP);
+	WriteState(data, &m_env.PABE);
+	WriteState(data, &m_env.BITBLTBUF);
+	WriteState(data, &m_env.TRXDIR);
+	WriteState(data, &m_env.TRXPOS);
+	WriteState(data, &m_env.TRXREG);
+	WriteState(data, &m_env.TRXREG); // obsolete
+	// Per-context registers, context 0 first, then context 1.
+	for(int i = 0; i < 2; i++)
+	{
+		WriteState(data, &m_env.CTXT[i].XYOFFSET);
+		WriteState(data, &m_env.CTXT[i].TEX0);
+		WriteState(data, &m_env.CTXT[i].TEX1);
+		WriteState(data, &m_env.CTXT[i].TEX2);
+		WriteState(data, &m_env.CTXT[i].CLAMP);
+		WriteState(data, &m_env.CTXT[i].MIPTBP1);
+		WriteState(data, &m_env.CTXT[i].MIPTBP2);
+		WriteState(data, &m_env.CTXT[i].SCISSOR);
+		WriteState(data, &m_env.CTXT[i].ALPHA);
+		WriteState(data, &m_env.CTXT[i].TEST);
+		WriteState(data, &m_env.CTXT[i].FBA);
+		WriteState(data, &m_env.CTXT[i].FRAME);
+		WriteState(data, &m_env.CTXT[i].ZBUF);
+	}
+
+	WriteState(data, &m_v.RGBAQ);
+	WriteState(data, &m_v.ST);
+	WriteState(data, &m_v.UV);
+	WriteState(data, &m_v.FOG);
+	WriteState(data, &m_v.XYZ);
+	data += sizeof(GIFReg); // obsolete slot: skipped without writing
+	WriteState(data, &m_tr.x);
+	WriteState(data, &m_tr.y);
+	WriteState(data, m_mem.m_vm8, m_mem.m_vmsize);
+	// Fold each path's live nreg/nloop/regs back into its tag before the tag is written.
+	for(size_t i = 0; i < countof(m_path); i++)
+	{
+		m_path[i].tag.NREG = m_path[i].nreg;
+		m_path[i].tag.NLOOP = m_path[i].nloop;
+		m_path[i].tag.REGS = 0;
+		// Entry j of regs.u8 is ORed in at bit offset 4 * (j & 7) of tag word 2 + j / 8.
+		for(size_t j = 0; j < countof(m_path[i].regs.u8); j++)
+		{
+			m_path[i].tag.u32[2 + (j >> 3)] |= m_path[i].regs.u8[j] << ((j & 7) << 2);
+		}
+
+		WriteState(data, &m_path[i].tag);
+		WriteState(data, &m_path[i].reg);
+	}
+
+	WriteState(data, &m_q);
+
+	return 0;
+}
+// Restores the GS state from fd->data. Returns -1 when fd or its data is missing, the size is 0 or smaller than m_sssize, or the stored version is newer than m_version; returns 0 after a successful load.
+int GSState::Defrost(const GSFreezeData* fd)
+{
+	if(!fd || !fd->data || fd->size == 0)
+	{
+		return -1;
+	}
+
+	if(fd->size < m_sssize)
+	{
+		return -1;
+	}
+
+	uint8* data = fd->data;
+
+	int version;
+
+	ReadState(&version, data);
+	// Only states from a newer version are rejected; older ones are loaded (see the version <= 4 skip below).
+	if(version > m_version)
+	{
+		printf("GSdx: Savestate version is incompatible.  Load aborted.\n" );
+
+		return -1;
+	}
+
+	Flush();
+
+	Reset();
+	// From here on the fields are read in the same order in which Freeze() writes them.
+	ReadState(&m_env.PRIM, data);
+	ReadState(&m_env.PRMODE, data);
+	ReadState(&m_env.PRMODECONT, data);
+	ReadState(&m_env.TEXCLUT, data);
+	ReadState(&m_env.SCANMSK, data);
+	ReadState(&m_env.TEXA, data);
+	ReadState(&m_env.FOGCOL, data);
+	ReadState(&m_env.DIMX, data);
+	ReadState(&m_env.DTHE, data);
+	ReadState(&m_env.COLCLAMP, data);
+	ReadState(&m_env.PABE, data);
+	ReadState(&m_env.BITBLTBUF, data);
+	ReadState(&m_env.TRXDIR, data);
+	ReadState(&m_env.TRXPOS, data);
+	ReadState(&m_env.TRXREG, data);
+	ReadState(&m_env.TRXREG, data); // obsolete
+
+	for(int i = 0; i < 2; i++)
+	{
+		ReadState(&m_env.CTXT[i].XYOFFSET, data);
+		ReadState(&m_env.CTXT[i].TEX0, data);
+		ReadState(&m_env.CTXT[i].TEX1, data);
+		ReadState(&m_env.CTXT[i].TEX2, data);
+		ReadState(&m_env.CTXT[i].CLAMP, data);
+		ReadState(&m_env.CTXT[i].MIPTBP1, data);
+		ReadState(&m_env.CTXT[i].MIPTBP2, data);
+		ReadState(&m_env.CTXT[i].SCISSOR, data);
+		ReadState(&m_env.CTXT[i].ALPHA, data);
+		ReadState(&m_env.CTXT[i].TEST, data);
+		ReadState(&m_env.CTXT[i].FBA, data);
+		ReadState(&m_env.CTXT[i].FRAME, data);
+		ReadState(&m_env.CTXT[i].ZBUF, data);
+
+		m_env.CTXT[i].XYOFFSET.OFX &= 0xffff;
+		m_env.CTXT[i].XYOFFSET.OFY &= 0xffff;
+		// States of version 4 or older carry seven extra 32-bit words per context, which are skipped.
+		if(version <= 4)
+		{
+			data += sizeof(uint32) * 7; // skip
+		}
+	}
+
+	ReadState(&m_v.RGBAQ, data);
+	ReadState(&m_v.ST, data);
+	ReadState(&m_v.UV, data);
+	ReadState(&m_v.FOG, data);
+	ReadState(&m_v.XYZ, data);
+	data += sizeof(GIFReg); // obsolite
+	ReadState(&m_tr.x, data);
+	ReadState(&m_tr.y, data);
+	ReadState(m_mem.m_vm8, data, m_mem.m_vmsize);
+
+	m_tr.total = 0; // TODO: restore transfer state
+
+	for(size_t i = 0; i < countof(m_path); i++)
+	{
+		ReadState(&m_path[i].tag, data);
+		ReadState(&m_path[i].reg, data);
+
+		m_path[i].SetTag(&m_path[i].tag); // expand regs
+	}
+
+	ReadState(&m_q, data);
+	// Everything below re-derives cached state from the registers just loaded: the PRIM pointer, context, vertex kick, DIMX, scissors and offset entries.
+	PRIM = !m_env.PRMODECONT.AC ? (GIFRegPRIM*)&m_env.PRMODE : &m_env.PRIM;
+
+	UpdateContext();
+
+	UpdateVertexKick();
+
+	m_env.UpdateDIMX();
+
+	for(size_t i = 0; i < 2; i++)
+	{
+		m_env.CTXT[i].UpdateScissor();
+
+		m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
+		m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
+		m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM);
+		m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
+		m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
+	}
+
+	UpdateScissor();
+
+m_perfmon.SetFrame(5000);
+
+	return 0;
+}
+
+// Records the game CRC and option flags, then resolves the matching game entry.
+// When m_crc_hack_level is zero the lookup is performed with CRC 0 instead of
+// the real CRC.
+void GSState::SetGameCRC(uint32 crc, int options)
+{
+	m_options = options;
+	m_crc = crc;
+
+	uint32 lookup_crc = 0;
+
+	if(m_crc_hack_level)
+	{
+		lookup_crc = crc;
+	}
+
+	m_game = CRC::Lookup(lookup_crc);
+}
+
+//
+
+// Points m_context at the drawing context selected by PRIM->CTXT and refreshes
+// the cached scissor values that are derived from it.
+void GSState::UpdateContext()
+{
+	// PRIM->CTXT indexes the context array
+	m_context = &(m_env.CTXT[PRIM->CTXT]);
+
+	// m_scissor / m_ofxy mirror the active context, so they must follow the switch
+	UpdateScissor();
+}
+
+// Copies the active context's precomputed scissor rectangle and xy offset into
+// the members read by the vertex kick path.
+void GSState::UpdateScissor()
+{
+	m_ofxy = m_context->scissor.ofxy;
+	m_scissor = m_context->scissor.ex;
+}
+
+// Re-binds the XYZ-family GIF register handlers to the variants selected by the
+// current primitive type (PRIM->PRIM). Does nothing while m_frameskip is set.
+void GSState::UpdateVertexKick()
+{
+	if(m_frameskip)
+	{
+		return;
+	}
+
+	const uint32 type = PRIM->PRIM;
+
+	// Column i of the per-primitive handler tables serves these registers, in this order.
+	static const int packed_slot[4] = {GIF_REG_XYZF2, GIF_REG_XYZF3, GIF_REG_XYZ2, GIF_REG_XYZ3};
+	static const int a_d_slot[4] = {GIF_A_D_REG_XYZF2, GIF_A_D_REG_XYZF3, GIF_A_D_REG_XYZ2, GIF_A_D_REG_XYZ3};
+
+	for(int i = 0; i < 4; i++)
+	{
+		m_fpGIFPackedRegHandlers[packed_slot[i]] = m_fpGIFPackedRegHandlerXYZ[type][i];
+		m_fpGIFRegHandlers[a_d_slot[i]] = m_fpGIFRegHandlerXYZ[type][i];
+	}
+
+	// combined STQ+RGBA+XYZ(F)2 handlers have a single variant per primitive type
+	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[type];
+	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[type];
+}
+
+// Reallocates the vertex and index buffers to 1.5x the current vertex capacity
+// (at least 10000 vertices), copying the live contents (m_vertex.tail vertices,
+// m_index.tail indices) into the new buffers and freeing the old ones.
+//
+// Throws GSDXError when either allocation fails; in that case the existing
+// buffers are left untouched and nothing is leaked.
+void GSState::GrowVertexBuffer()
+{
+	int maxcount = std::max<int>(m_vertex.maxcount * 3 / 2, 10000);
+
+	GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
+	uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3
+
+	if(vertex == NULL || index == NULL)
+	{
+		printf("GSdx: failed to allocate %d bytes for verticles and %d for indices.\n", (int)sizeof(GSVertex) * maxcount, (int)sizeof(uint32) * maxcount * 3);
+
+		// Only one of the two allocations may have failed: release the one that
+		// succeeded, otherwise it would be leaked by the throw below.
+		if(vertex != NULL)
+		{
+			_aligned_free(vertex);
+		}
+
+		if(index != NULL)
+		{
+			_aligned_free(index);
+		}
+
+		throw GSDXError();
+	}
+
+	if(m_vertex.buff != NULL)
+	{
+		memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail);
+
+		_aligned_free(m_vertex.buff);
+	}
+
+	if(m_index.buff != NULL)
+	{
+		memcpy(index, m_index.buff, sizeof(uint32) * m_index.tail);
+
+		_aligned_free(m_index.buff);
+	}
+
+	m_vertex.buff = vertex;
+	m_vertex.maxcount = maxcount - 3; // -3 to have some space at the end of the buffer before DrawingKick can grow it
+	m_index.buff = index;
+}
+
+// Appends the vertex currently held in m_v to the vertex buffer and, once enough
+// vertices are queued for primitive type `prim`, either emits the primitive's
+// indices into the index buffer or drops it.
+//
+// prim: primitive type (compile-time), one of the GS_* primitive constants.
+// skip: non-zero forces the primitive to be dropped; when zero, the primitive may
+//       still be dropped by the scissor / degenerate tests performed below.
+template<uint32 prim> 
+__forceinline void GSState::VertexKick(uint32 skip)
+{
+	ASSERT(m_vertex.tail < m_vertex.maxcount + 3);
+
+	size_t head = m_vertex.head;
+	size_t tail = m_vertex.tail;
+	size_t next = m_vertex.next;
+	size_t xy_tail = m_vertex.xy_tail;
+
+	// callers should write XYZUVF to m_v.m[1] in one piece to have this load store-forwarded, either by the cpu or the compiler when this function is inlined
+
+	GSVector4i v0(m_v.m[0]);
+	GSVector4i v1(m_v.m[1]); 
+
+	// store the new vertex (two 128-bit halves) at the tail of the vertex buffer
+	GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[tail];
+
+	tailptr[0] = v0;
+	tailptr[1] = v1;
+
+	// position relative to the context offset (m_ofxy)
+	GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy);
+
+	// m_vertex.xy is a 4-entry ring (indexed with & 3) holding, per vertex, the packed
+	// 16-bit offset-relative position together with a copy shifted right by 4 bits
+	#if _M_SSE >= 0x401
+	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
+	#else
+	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32());
+	#endif
+
+	m_vertex.tail = ++tail;
+	m_vertex.xy_tail = ++xy_tail;
+
+	// n = number of vertices needed to form one primitive of this type
+	size_t n = 0;
+
+	switch(prim)
+	{
+	case GS_POINTLIST: n = 1; break;
+	case GS_LINELIST: n = 2; break;
+	case GS_LINESTRIP: n = 2; break;
+	case GS_TRIANGLELIST: n = 3; break;
+	case GS_TRIANGLESTRIP: n = 3; break;
+	case GS_TRIANGLEFAN: n = 3; break;
+	case GS_SPRITE: n = 2; break;
+	case GS_INVALID: n = 1; break;
+	}
+
+	// m = vertices queued since the head of the current primitive
+	size_t m = tail - head;
+
+	if(m < n)
+	{
+		// not enough vertices yet
+		return;
+	}
+
+	if(skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
+	{
+		GSVector4i v0, v1, v2, v3, pmin, pmax;
+
+		// xy_tail was already incremented, so (xy_tail + 3) & 3 is the vertex just stored
+		v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
+		v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
+		v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
+		v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
+
+		GSVector4 cross;
+
+		// bounding box of the vertices that make up this primitive
+		switch(prim)
+		{
+		case GS_POINTLIST:
+			pmin = v2;
+			pmax = v2;
+			break;
+		case GS_LINELIST:
+		case GS_LINESTRIP:
+		case GS_SPRITE:
+			pmin = v2.min_i16(v1);
+			pmax = v2.max_i16(v1);
+			break;
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+			pmin = v2.min_i16(v1.min_i16(v0));
+			pmax = v2.max_i16(v1.max_i16(v0));
+			break;
+		case GS_TRIANGLEFAN:
+			// fans use the head vertex (v3) instead of T-3
+			pmin = v2.min_i16(v1.min_i16(v3));
+			pmax = v2.max_i16(v1.max_i16(v3));
+			break;
+		default:
+			break;
+		}
+
+		// set when the bounding box lies entirely on one side of the scissor rectangle
+		GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl()); 
+		
+		// primitives with area: also flag a bounding box whose min equals its max
+		// (m_nativeres selects which half of the comparison result is used)
+		switch(prim)
+		{
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+		case GS_TRIANGLEFAN:
+		case GS_SPRITE:
+			test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
+			break;
+		default:
+			break;
+		}
+
+		// triangles: flag primitives in which two of the three vertices coincide
+		switch(prim)
+		{
+		case GS_TRIANGLELIST:
+		case GS_TRIANGLESTRIP:
+			// TODO: any way to do a 16-bit integer cross product?
+			// cross product is zero most of the time because either of the vertices are the same
+			/*
+			cross = GSVector4(v2.xyxyl().i16to32().sub32(v0.upl32(v1).i16to32())); // x20, y20, x21, y21
+			cross = cross * cross.wzwz(); // x20 * y21, y20 * x21
+			test |= GSVector4i::cast(cross == cross.yxwz());
+			*/
+			test = (test | v0 == v1) | (v1 == v2 | v0 == v2); 
+			break;
+		case GS_TRIANGLEFAN:
+			/*
+			cross = GSVector4(v2.xyxyl().i16to32().sub32(v3.upl32(v1).i16to32())); // x23, y23, x21, y21
+			cross = cross * cross.wzwz(); // x23 * y21, y23 * x21
+			test |= GSVector4i::cast(cross == cross.yxwz());
+			*/
+			test = (test | v3 == v1) | (v1 == v2 | v3 == v2); 
+			break;
+		default:
+			break;
+		}
+		
+		// any set bit in the low 4 mask bits drops the primitive
+		skip |= test.mask() & 15;
+	}
+
+	if(skip != 0)
+	{
+		// dropped: no indices are emitted, only the vertex bookkeeping is adjusted
+		switch(prim)
+		{
+		case GS_POINTLIST:
+		case GS_LINELIST:
+		case GS_TRIANGLELIST:
+		case GS_SPRITE:
+		case GS_INVALID: 
+			m_vertex.tail = head; // no need to check or grow the buffer length
+			break;
+		case GS_LINESTRIP:
+		case GS_TRIANGLESTRIP:
+			// strips keep the queued vertices and only advance the head by one
+			m_vertex.head = head + 1;
+			// fall through
+		case GS_TRIANGLEFAN:
+			if(tail >= m_vertex.maxcount) GrowVertexBuffer(); // in case too many vertices were skipped
+			break;
+		default: 
+			__assume(0);
+		}
+
+		return;
+	}
+
+	if(tail >= m_vertex.maxcount) GrowVertexBuffer();
+
+	// kept: append this primitive's indices and advance head/next according to its type
+	uint32* RESTRICT buff = &m_index.buff[m_index.tail];
+
+	switch(prim)
+	{
+	case GS_POINTLIST:
+		buff[0] = head + 0;
+		m_vertex.head = head + 1;
+		m_vertex.next = head + 1;
+		m_index.tail += 1;
+		break;
+	case GS_LINELIST:
+		buff[0] = head + 0;
+		buff[1] = head + 1;
+		m_vertex.head = head + 2;
+		m_vertex.next = head + 2;
+		m_index.tail += 2;
+		break;
+	case GS_LINESTRIP:
+		// next < head: earlier strip vertices were skipped, move the live ones down to close the gap
+		if(next < head) 
+		{
+			m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
+			m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
+			head = next; 
+			m_vertex.tail = next + 2;
+		}
+		buff[0] = head + 0;
+		buff[1] = head + 1;
+		m_vertex.head = head + 1;
+		m_vertex.next = head + 2;
+		m_index.tail += 2;
+		break;
+	case GS_TRIANGLELIST:
+		buff[0] = head + 0;
+		buff[1] = head + 1;
+		buff[2] = head + 2;
+		m_vertex.head = head + 3;
+		m_vertex.next = head + 3;
+		m_index.tail += 3;
+		break;
+	case GS_TRIANGLESTRIP:
+		// same gap compaction as for line strips, with three live vertices
+		if(next < head) 
+		{
+			m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
+			m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
+			m_vertex.buff[next + 2] = m_vertex.buff[head + 2];
+			head = next; 
+			m_vertex.tail = next + 3;
+		}
+		buff[0] = head + 0;
+		buff[1] = head + 1;
+		buff[2] = head + 2;
+		m_vertex.head = head + 1;
+		m_vertex.next = head + 3;
+		m_index.tail += 3;
+		break;
+	case GS_TRIANGLEFAN:
+		// TODO: remove gaps, next == head && head < tail - 3 || next > head && next < tail - 2 (very rare)
+		// the fan keeps its head vertex; m_vertex.head is intentionally not advanced
+		buff[0] = head + 0;
+		buff[1] = tail - 2;
+		buff[2] = tail - 1;
+		m_vertex.next = tail;
+		m_index.tail += 3;
+		break;
+	case GS_SPRITE:	
+		buff[0] = head + 0;
+		buff[1] = head + 1;
+		m_vertex.head = head + 2;
+		m_vertex.next = head + 2;
+		m_index.tail += 2;
+		break;
+	case GS_INVALID:
+		// invalid primitives never produce indices; discard the queued vertex
+		m_vertex.tail = head;
+		break;
+	default:
+		__assume(0);
+	}
+}
+
+// Computes the rectangle of texels that the current draw may sample.
+//
+// r:      output rectangle (x, y = top-left inclusive; z, w = bottom-right), always
+//         confined to the (1 << TEX0.TW) x (1 << TEX0.TH) texture.
+// TEX0:   supplies the log2 texture dimensions (TW, TH).
+// CLAMP:  supplies the wrap modes (WMS, WMT) and the region bounds (MINU/MAXU/MINV/MAXV).
+// linear: when true the texture coordinate range is widened by half a texel on each side.
+//
+// The texture coordinate range comes from the vertex trace (m_vt.m_min.t / m_vt.m_max.t).
+void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear)
+{
+	// TODO: some of the +1s can be removed if linear == false
+
+	int tw = TEX0.TW;
+	int th = TEX0.TH;
+
+	int w = 1 << tw;
+	int h = 1 << th;
+
+	// tr = full texture rectangle
+	GSVector4i tr(0, 0, w, h);
+
+	int wms = CLAMP.WMS;
+	int wmt = CLAMP.WMT;
+
+	int minu = (int)CLAMP.MINU;
+	int minv = (int)CLAMP.MINV;
+	int maxu = (int)CLAMP.MAXU;
+	int maxv = (int)CLAMP.MAXV;
+
+	// vr = result rectangle, starts as the full texture and is narrowed below
+	GSVector4i vr = tr;
+
+	// step 1: restrict by the clamp registers alone (horizontal)
+	switch(wms)
+	{
+	case CLAMP_REPEAT:
+		break;
+	case CLAMP_CLAMP:
+		break;
+	case CLAMP_REGION_CLAMP:
+		if(vr.x < minu) vr.x = minu;
+		if(vr.z > maxu + 1) vr.z = maxu + 1;
+		break;
+	case CLAMP_REGION_REPEAT:
+		// here the rectangle starts at MAXU and spans MINU + 1 texels
+		vr.x = maxu;
+		vr.z = vr.x + (minu + 1);
+		break;
+	default:
+		__assume(0);
+	}
+
+	// same for the vertical axis
+	switch(wmt)
+	{
+	case CLAMP_REPEAT:
+		break;
+	case CLAMP_CLAMP:
+		break;
+	case CLAMP_REGION_CLAMP:
+		if(vr.y < minv) vr.y = minv;
+		if(vr.w > maxv + 1) vr.w = maxv + 1;
+		break;
+	case CLAMP_REGION_REPEAT:
+		vr.y = maxv;
+		vr.w = vr.y + (minv + 1);
+		break;
+	default:
+		__assume(0);
+	}
+
+	// step 2: unless both axes are region-repeat, narrow further using the actual
+	// texture coordinate range of the vertices
+	if(wms != CLAMP_REGION_REPEAT || wmt != CLAMP_REGION_REPEAT)
+	{
+		// st = (min.x, min.y, max.x, max.y)
+		GSVector4 st = m_vt.m_min.t.xyxy(m_vt.m_max.t);
+
+		if(linear)
+		{
+			// widen by half a texel: -0.5 on the minimum, +0.5 on the maximum
+			st += GSVector4(-0.5f, 0.5f).xxyy();
+		}
+
+		GSVector4i uv = GSVector4i(st.floor());
+
+		GSVector4i u, v;
+
+		int mask = 0;
+
+		// See commented code below for the meaning of mask
+
+		if(wms == CLAMP_REPEAT || wmt == CLAMP_REPEAT)
+		{
+			// u / v = coordinates wrapped into the texture (low tw / th bits)
+			u = uv & GSVector4i::xffffffff().srl32(32 - tw);
+			v = uv & GSVector4i::xffffffff().srl32(32 - th);
+
+			// uu / vv = index of the texture repetition each coordinate falls into
+			GSVector4i uu = uv.sra32(tw);
+			GSVector4i vv = uv.sra32(th);
+
+			// mask bits are set where min and max fall into the same repetition
+			mask = (uu.upl32(vv) == uu.uph32(vv)).mask();
+		}
+
+		uv = uv.rintersect(tr);
+
+		switch(wms)
+		{
+		case CLAMP_REPEAT:
+			// This commented code cannot be used directly because it needs uv before the intersection
+			/*if (uv_.x >> tw == uv_.z >> tw)
+			{
+				vr.x = max(vr.x, (uv_.x & ((1 << tw) - 1)));
+				vr.z = min(vr.z, (uv_.z & ((1 << tw) - 1)) + 1);
+			}*/
+			if(mask & 0x000f) {if(vr.x < u.x) vr.x = u.x; if(vr.z > u.z + 1) vr.z = u.z + 1;}
+			break;
+		case CLAMP_CLAMP:
+		case CLAMP_REGION_CLAMP:
+			// coordinate range entirely to one side of vr: keep a single column at that edge
+			if(vr.x > uv.z) vr.z = vr.x + 1;
+			else if(vr.z < uv.x) vr.x = vr.z - 1;
+			else
+			{
+				if(vr.x < uv.x) vr.x = uv.x;
+				if(vr.z > uv.z + 1) vr.z = uv.z + 1;
+			}
+			break;
+		case CLAMP_REGION_REPEAT:
+			break;
+		default:
+			__assume(0);
+		}
+
+		switch(wmt)
+		{
+		case CLAMP_REPEAT:
+			/*if (uv_.y >> th == uv_.w >> th)
+			{
+				vr.y = max(vr.y, (uv_.y & ((1 << th) - 1)));
+				vr.w = min(vr.w, (uv_.w & ((1 << th) - 1)) + 1);
+			}*/
+			if(mask & 0xf000) {if(vr.y < v.y) vr.y = v.y; if(vr.w > v.w + 1) vr.w = v.w + 1;}
+			break;
+		case CLAMP_CLAMP:
+		case CLAMP_REGION_CLAMP:
+			// coordinate range entirely above/below vr: keep a single row at that edge
+			if(vr.y > uv.w) vr.w = vr.y + 1;
+			else if(vr.w < uv.y) vr.y = vr.w - 1;
+			else
+			{
+				if(vr.y < uv.y) vr.y = uv.y;
+				if(vr.w > uv.w + 1) vr.w = uv.w + 1;
+			}
+			break;
+		case CLAMP_REGION_REPEAT:
+			break;
+		default:
+			__assume(0);
+		}
+	}
+
+	// step 3: never report texels outside the texture
+	vr = vr.rintersect(tr);
+
+	// This really shouldn't happen now except with the clamping region set entirely outside the texture,
+	// special handling should be written for that case.
+
+	if(vr.rempty())
+	{
+		// NOTE: this can happen when texcoords are all outside the texture or clamping area is zero, but we can't
+		// let the texture cache update nothing, the sampler will still need a single texel from the border somewhere
+		// examples:
+		// - THPS (no visible problems)
+		// - NFSMW (strange rectangles on screen, might be unrelated)
+		// - Lupin 3rd (huge problems, textures sizes seem to be randomly specified)
+
+		// grow the empty rectangle by one texel in every direction and clip again
+		vr = (vr + GSVector4i(-1, +1).xxyy()).rintersect(tr);
+	}
+
+	r = vr;
+}
+
+// Computes the minimum / maximum alpha the current draw can produce and caches
+// them in m_vt.m_alpha (min, max, valid). A cached result is reused as is.
+//
+// Working vector layout: x = lower bound, z = upper bound (seeded from the vertex
+// colour range); y / w = lower / upper bound of the texture alpha.
+void GSState::GetAlphaMinMax()
+{
+	if(!m_vt.m_alpha.valid)
+	{
+		const GSDrawingEnvironment& env = m_env;
+		const GSDrawingContext* ctx = m_context;
+
+		GSVector4i range = m_vt.m_min.c.uph32(m_vt.m_max.c).zzww();
+
+		// Texture alpha only contributes when texturing is on and TCC is set.
+		if(PRIM->TME && ctx->TEX0.TCC)
+		{
+			// Bounds of the alpha the texture format can deliver.
+			switch(GSLocalMemory::m_psm[ctx->TEX0.PSM].fmt)
+			{
+			case 0:
+				range.y = 0;
+				range.w = 0xff;
+				break;
+			case 1:
+				range.y = env.TEXA.AEM ? 0 : env.TEXA.TA0;
+				range.w = env.TEXA.TA0;
+				break;
+			case 2:
+				range.y = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
+				range.w = max(env.TEXA.TA0, env.TEXA.TA1);
+				break;
+			case 3:
+				m_mem.m_clut.GetAlphaMinMax32(range.y, range.w);
+				break;
+			default:
+				__assume(0);
+			}
+
+			// Combine vertex and texture alpha according to the texture function.
+			switch(ctx->TEX0.TFX)
+			{
+			case TFX_MODULATE:
+				range.x = (range.x * range.y) >> 7;
+				range.z = (range.z * range.w) >> 7;
+				if(range.x > 0xff) range.x = 0xff;
+				if(range.z > 0xff) range.z = 0xff;
+				break;
+			case TFX_HIGHLIGHT:
+				range.x = range.x + range.y;
+				range.z = range.z + range.w;
+				if(range.x > 0xff) range.x = 0xff;
+				if(range.z > 0xff) range.z = 0xff;
+				break;
+			case TFX_DECAL:
+			case TFX_HIGHLIGHT2:
+				// both take the texture alpha unchanged
+				range.x = range.y;
+				range.z = range.w;
+				break;
+			default:
+				__assume(0);
+			}
+		}
+
+		m_vt.m_alpha.min = range.x;
+		m_vt.m_alpha.max = range.z;
+		m_vt.m_alpha.valid = true;
+	}
+}
+
+// Tries to resolve the alpha test for the whole draw from the alpha range alone.
+//
+// Returns false when the outcome depends on the individual pixel (the alpha range
+// straddles AREF); fm / zm are left untouched in that case.
+// Returns true when every pixel passes or every pixel fails. If all pixels fail,
+// fm (frame mask) and zm (z mask) are updated according to TEST.AFAIL.
+bool GSState::TryAlphaTest(uint32& fm, uint32& zm)
+{
+	const GSDrawingContext* ctx = m_context;
+
+	const int atst = ctx->TEST.ATST;
+
+	bool pass = true;
+
+	if(atst == ATST_NEVER)
+	{
+		pass = false;
+	}
+	else if(atst != ATST_ALWAYS)
+	{
+		// The remaining modes compare against AREF, so the alpha range is needed.
+		GetAlphaMinMax();
+
+		const int lo = m_vt.m_alpha.min;
+		const int hi = m_vt.m_alpha.max;
+		const int ref = ctx->TEST.AREF;
+
+		switch(atst)
+		{
+		case ATST_LESS:
+			if(hi < ref) {pass = true;}
+			else if(lo >= ref) {pass = false;}
+			else {return false;}
+			break;
+		case ATST_LEQUAL:
+			if(hi <= ref) {pass = true;}
+			else if(lo > ref) {pass = false;}
+			else {return false;}
+			break;
+		case ATST_EQUAL:
+			if(lo == ref && hi == ref) {pass = true;}
+			else if(lo > ref || hi < ref) {pass = false;}
+			else {return false;}
+			break;
+		case ATST_GEQUAL:
+			if(lo >= ref) {pass = true;}
+			else if(hi < ref) {pass = false;}
+			else {return false;}
+			break;
+		case ATST_GREATER:
+			if(lo > ref) {pass = true;}
+			else if(hi <= ref) {pass = false;}
+			else {return false;}
+			break;
+		case ATST_NOTEQUAL:
+			if(lo == ref && hi == ref) {pass = false;}
+			else if(lo > ref || hi < ref) {pass = true;}
+			else {return false;}
+			break;
+		default:
+			// NEVER / ALWAYS were handled above
+			__assume(0);
+		}
+	}
+
+	if(pass)
+	{
+		return true;
+	}
+
+	// Every pixel fails: AFAIL decides which buffers are still written.
+	switch(ctx->TEST.AFAIL)
+	{
+	case AFAIL_KEEP:
+		fm = zm = 0xffffffff;
+		break;
+	case AFAIL_FB_ONLY:
+		zm = 0xffffffff;
+		break;
+	case AFAIL_ZB_ONLY:
+		fm = 0xffffffff;
+		break;
+	case AFAIL_RGB_ONLY:
+		fm |= 0xff000000;
+		zm = 0xffffffff;
+		break;
+	default:
+		__assume(0);
+	}
+
+	return true;
+}
+
+// Tells whether the current draw is opaque.
+//
+// AA1 always counts as non-opaque and a draw without alpha blending (ABE clear)
+// always counts as opaque. Otherwise the decision is delegated to
+// ALPHA.IsOpaque() with the range of the alpha selected by ALPHA.C.
+bool GSState::IsOpaque()
+{
+	if(PRIM->AA1)
+	{
+		return false;
+	}
+
+	if(!PRIM->ABE)
+	{
+		return true;
+	}
+
+	const GSDrawingContext* ctx = m_context;
+
+	// Full range unless one of the cases below narrows it.
+	int lo = 0;
+	int hi = 0xff;
+
+	if(ctx->ALPHA.A != ctx->ALPHA.B)
+	{
+		switch(ctx->ALPHA.C)
+		{
+		case 0:
+			// use the computed alpha range of the draw
+			GetAlphaMinMax();
+
+			lo = m_vt.m_alpha.min;
+			hi = m_vt.m_alpha.max;
+			break;
+		case 1:
+			// 24-bit frame formats are treated as a constant 0x80
+			if(ctx->FRAME.PSM == PSM_PSMCT24 || ctx->FRAME.PSM == PSM_PSMZ24)
+			{
+				lo = hi = 0x80;
+			}
+			break;
+		case 2:
+			// fixed alpha value
+			lo = hi = ctx->ALPHA.FIX;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return ctx->ALPHA.IsOpaque(lo, hi);
+}
+
+// Mipmapping is active when it is enabled (m_mipmap), the context allows more
+// than one level (TEX1.MXL > 0), the minification filter TEX1.MMIN is in 2..5,
+// and m_vt.m_lod.y is positive.
+bool GSState::IsMipMapActive()
+{
+	if(!m_mipmap)
+	{
+		return false;
+	}
+
+	if(!(m_context->TEX1.MXL > 0))
+	{
+		return false;
+	}
+
+	if(!(m_context->TEX1.MMIN >= 2) || !(m_context->TEX1.MMIN <= 5))
+	{
+		return false;
+	}
+
+	return m_vt.m_lod.y > 0;
+}
+
+// GSTransferBuffer
+
+// Zero-initialises the transfer bookkeeping and allocates the 4 MiB, 32-byte
+// aligned staging buffer.
+GSState::GSTransferBuffer::GSTransferBuffer()
+{
+	buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
+
+	x = 0;
+	y = 0;
+
+	start = 0;
+	end = 0;
+	total = 0;
+
+	overflow = false;
+}
+
+// Releases the staging buffer allocated by the constructor.
+GSState::GSTransferBuffer::~GSTransferBuffer()
+{
+	_aligned_free(buff);
+}
+
+// Starts a new transfer at position (tx, ty). Clearing `total` makes the next
+// Update() call recompute the transfer size.
+void GSState::GSTransferBuffer::Init(int tx, int ty)
+{
+	total = 0;
+
+	x = tx;
+	y = ty;
+}
+
+// Accounts for `len` incoming bytes of a tw x th transfer at `bpp` bits per pixel.
+//
+// On the first call after Init() (total == 0) the expected transfer size is
+// computed, capped at the 4 MiB buffer size, and the cursors are reset.
+// `len` is clipped to the bytes still expected (total - end); the overflow flag
+// records that clipping happened. Returns true when bytes remain to be processed
+// after clipping.
+bool GSState::GSTransferBuffer::Update(int tw, int th, int bpp, int& len)
+{
+	if(total == 0)
+	{
+		start = 0;
+		end = 0;
+		overflow = false;
+		total = std::min<int>((tw * bpp >> 3) * th, 1024 * 1024 * 4);
+	}
+
+	const int room = total - end;
+
+	if(len > room)
+	{
+		// more data than the transfer can take: remember it and drop the excess
+		// printf("GS transfer overflow\n");
+		overflow = true;
+
+		len = room;
+	}
+
+	return len > 0;
+}
+
+// hacks
+// Gates for the CRC hacks below. Both expand to a test of s_crc_hack_level, which
+// must be visible wherever the macros are used: Dx_only holds for levels above 2,
+// Aggresive for levels above 3.
+#define Aggresive (s_crc_hack_level > 3)
+#define Dx_only   (s_crc_hack_level > 2)
+
+// Snapshot of the draw state that the GSC_* skip callbacks inspect.
+// NOTE(review): the field meanings below are inferred from how the callbacks
+// compare them (addresses, PSM_* formats); confirm against the code that fills
+// the structure.
+struct GSFrameInfo
+{
+	uint32 FBP;   // presumably the frame buffer base pointer
+	uint32 FPSM;  // frame buffer pixel format, compared against PSM_* constants
+	uint32 FBMSK; // presumably the frame buffer write mask
+	uint32 TBP0;  // presumably the texture base pointer
+	uint32 TPSM;  // texture pixel format, compared against PSM_* constants
+	uint32 TZTST; // presumably the depth test mode
+	bool TME;     // presumably "texture mapping enabled"
+};
+
+// Signature shared by the GSC_* callbacks: they inspect a draw (fi) and may
+// update the skip counter (skip) in place.
+typedef bool (*GetSkipCount)(const GSFrameInfo& fi, int& skip);
+// Region consulted by the region-dependent callbacks; starts as CRC::NoRegion.
+CRC::Region g_crc_region = CRC::NoRegion;
+
+// Okami: starts a 1000-draw skip on a textured CT32 draw to 0x00e00 sourced from
+// 0x00000, and ends a running skip when the same target is drawn with the T4
+// texture at 0x03800. Always returns true.
+bool GSC_Okami(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	// Both triggers are textured draws to the same CT32 target.
+	const bool target = fi.TME && fi.FBP == 0x00e00 && fi.FPSM == PSM_PSMCT32;
+
+	if(skip == 0)
+	{
+		if(target && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT32)
+		{
+			skip = 1000;
+		}
+	}
+	else if(target && fi.TBP0 == 0x03800 && fi.TPSM == PSM_PSMT4)
+	{
+		skip = 0;
+	}
+
+	return true;
+}
+
+// Metal Gear Solid 3: skips effects that would need sub render target support
+// (texture cache limitation). Always returns true.
+bool GSC_MetalGearSolid3(const GSFrameInfo& fi, int& skip)
+{
+	if(skip == 0)
+	{
+		// Both start triggers are textured draws sourced from 0x00000 or 0x01000.
+		if(fi.TME && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x01000))
+		{
+			if(fi.FBP == 0x02000 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMCT24)
+			{
+				skip = 1000; // 76, 79
+			}
+			else if(fi.FBP == 0x02800 && fi.FPSM == PSM_PSMCT24 && fi.TPSM == PSM_PSMCT32)
+			{
+				skip = 1000; // 69
+			}
+		}
+
+		return true;
+	}
+
+	// While skipping, only untextured draws can change the counter.
+	if(fi.TME)
+	{
+		return true;
+	}
+
+	if((fi.FBP == 0x00000 || fi.FBP == 0x01000) && fi.FPSM == PSM_PSMCT32)
+	{
+		skip = 0;
+	}
+	else if(fi.FBP == fi.TBP0 && fi.TBP0 == 0x2000 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMCT24)
+	{
+		const bool ntsc = g_crc_region == CRC::US || g_crc_region == CRC::JP || g_crc_region == CRC::KO;
+
+		skip = ntsc ? 119 : 136; // ntsc : pal
+	}
+
+	return true;
+}
+
+// Dragon Ball Z Budokai Tenkaichi 2: starts a skip on two kinds of draws; does
+// nothing while a skip is already running. Always returns true.
+bool GSC_DBZBT2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	if(fi.TME && /*fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT16 &&*/ (fi.TBP0 == 0x01c00 || fi.TBP0 == 0x02000) && fi.TPSM == PSM_PSMZ16)
+	{
+		// Feel like texture shuffle but not sure
+		if(Dx_only)
+		{
+			skip = 26; //27
+		}
+	}
+	else if(!fi.TME && (fi.FBP == 0x02a00 || fi.FBP == 0x03000) && fi.FPSM == PSM_PSMCT16)
+	{
+		skip = 10;
+	}
+
+	return true;
+}
+
+// Dragon Ball Z Budokai Tenkaichi 3: starts a skip on several textured draws;
+// does nothing while a skip is already running. Always returns true.
+bool GSC_DBZBT3(const GSFrameInfo& fi, int& skip)
+{
+	// Every trigger below is a textured draw.
+	if(skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	const bool fb_low = fi.FBP == 0x00000 || fi.FBP == 0x00e00 || fi.FBP == 0x01000;
+
+	if(fi.FBP == 0x01c00 && fi.FPSM == PSM_PSMCT32 && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x00e00 || fi.TBP0 == 0x01000) && fi.TPSM == PSM_PSMT8H)
+	{
+		//not needed anymore?
+		//skip = 24; // blur
+	}
+	else if(fb_low && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8H)
+	{
+		// Ought to be fine with blending accuracy (fbmask?)
+		if(Dx_only)
+		{
+			switch(fi.FBMSK)
+			{
+			case 0x00000:
+				skip = 28; // outline
+				break;
+			case 0x00FFFFFF:
+				skip = 1;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	else if(fb_low && fi.FPSM == PSM_PSMCT16 && fi.TPSM == PSM_PSMZ16)
+	{
+		// Texture shuffling must work on openGL
+		if(Dx_only)
+		{
+			skip = 5;
+		}
+	}
+	else if(fi.FPSM == fi.TPSM && fi.TBP0 == 0x03f00 && fi.TPSM == PSM_PSMCT32)
+	{
+		switch(fi.FBP)
+		{
+		case 0x03400:
+			skip = 1;	//PAL
+			break;
+		case 0x02e00:
+			skip = 3;	//NTSC
+			break;
+		default:
+			break;
+		}
+	}
+
+	return true;
+}
+
+// SFEX3: skips the two draws of the blur pass (CT16 target 0x00500 textured from
+// 0x00f00). Always returns true.
+bool GSC_SFEX3(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool blur = fi.TME && fi.FBP == 0x00500 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00f00 && fi.TPSM == PSM_PSMCT16;
+
+	if(blur)
+	{
+		skip = 2; // blur
+	}
+
+	return true;
+}
+
+// Bully. Returns false for a textured CT32 draw that reads and writes the same
+// buffer at 0x00000 or 0x01180 (the "allowed" case); returns true otherwise,
+// starting or ending a skip on the draws matched below.
+bool GSC_Bully(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool fb_main = fi.FBP == 0x00000 || fi.FBP == 0x01180;
+
+	if(skip != 0)
+	{
+		// an untextured CT32 draw to a main buffer ends the skip
+		if(!fi.TME && fb_main && fi.FPSM == PSM_PSMCT32)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	if(fi.TME && fb_main)
+	{
+		// texture and target are the same CT32 main buffer
+		if(fi.TBP0 == fi.FBP && fi.FPSM == PSM_PSMCT32 && fi.FPSM == fi.TPSM)
+		{
+			return false; // allowed
+		}
+
+		if(fi.FPSM == PSM_PSMCT16S && fi.TBP0 == 0x02300 && fi.TPSM == PSM_PSMZ16S)
+		{
+			skip = 6;
+		}
+	}
+
+	return true;
+}
+
+// Bully (CC variant). Returns false for a textured CT32 draw that reads and writes
+// the same buffer at 0x00000 or 0x01180 (the "allowed" case); otherwise returns
+// true and starts a 9-draw skip on an untextured CT24 draw to 0x02800.
+bool GSC_BullyCC(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	if(fi.TME)
+	{
+		const bool fb_main = fi.FBP == 0x00000 || fi.FBP == 0x01180;
+
+		// texture and target are the same CT32 main buffer
+		if(fb_main && fi.TBP0 == fi.FBP && fi.FPSM == PSM_PSMCT32 && fi.FPSM == fi.TPSM)
+		{
+			return false; // allowed
+		}
+	}
+	else if(fi.FBP == 0x02800 && fi.FPSM == PSM_PSMCT24)
+	{
+		skip = 9;
+	}
+
+	return true;
+}
+
+// Shadow of the Colossus: with aggressive hacks enabled, skips 48 draws to remove
+// the sky bloom. Always returns true.
+bool GSC_SoTC(const GSFrameInfo& fi, int& skip)
+{
+	// Not needed anymore? What did it fix anyway? (rama)
+	if(skip == 0 && Aggresive && fi.TME /*&& fi.FBP == 0x03d80*/ && fi.FPSM == 0 && fi.TBP0 == 0x03fc0 && fi.TPSM == 1)
+	{
+		skip = 48;	//removes sky bloom
+	}
+
+	// Disabled triggers, kept for reference:
+	/*
+	if(fi.TME && fi.FBP == 0x02b80 && fi.FPSM == PSM_PSMCT24 && fi.TBP0 == 0x01e80 && fi.TPSM == PSM_PSMCT24)
+	{
+		skip = 9;
+	}
+	else if(fi.TME && fi.FBP == 0x01c00 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x03800 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 8;
+	}
+	else if(fi.TME && fi.FBP == 0x01e80 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x03880 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 8;
+	}*/
+
+	return true;
+}
+
+// One Piece Grand Adventure: skips 4 draws starting at a textured CT16 draw to
+// 0x02d00 sourced from 0x00000, 0x00e00 or 0x00f00. Always returns true.
+bool GSC_OnePieceGrandAdventure(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool source = fi.TBP0 == 0x00000 || fi.TBP0 == 0x00e00 || fi.TBP0 == 0x00f00;
+
+	if(fi.TME && fi.FBP == 0x02d00 && fi.FPSM == PSM_PSMCT16 && source && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 4;
+	}
+
+	return true;
+}
+
+// One Piece Grand Battle: skips 4 draws starting at a textured CT16 draw to
+// 0x02d00 sourced from 0x00000 or 0x00f00. Always returns true.
+bool GSC_OnePieceGrandBattle(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool source = fi.TBP0 == 0x00000 || fi.TBP0 == 0x00f00;
+
+	if(fi.TME && fi.FBP == 0x02d00 && fi.FPSM == PSM_PSMCT16 && source && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 4;
+	}
+
+	return true;
+}
+
+// ICO: starts a skip on textured draws to the buffer at 0x00800 and ends a
+// running skip once that buffer is used as a CT32 texture. Always returns true.
+bool GSC_ICO(const GSFrameInfo& fi, int& skip)
+{
+	// Every trigger, start or stop, is a textured draw.
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	if(skip != 0)
+	{
+		if(fi.TBP0 == 0x00800 && fi.TPSM == PSM_PSMCT32)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	if(fi.FBP == 0x00800 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x03d00 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 3;
+	}
+	else if(fi.FBP == 0x00800 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x02800 && fi.TPSM == PSM_PSMT8H)
+	{
+		skip = 1;
+	}
+	else if(Aggresive && fi.FBP == 0x0800 && (fi.TBP0 == 0x2800 || fi.TBP0 == 0x2c00) && fi.TPSM == 0 && fi.FBMSK == 0)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Gran Turismo 4: the game needs the source extracted from a render target at a
+// block boundary (texture cache limitation), so the affected draws are skipped.
+// Always returns true.
+bool GSC_GT4(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// Both triggers are textured CT32 draws that read a T8 texture.
+	const bool t8_to_ct32 = fi.TME && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8;
+
+	if(t8_to_ct32 && fi.FBP >= 0x02f00 && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x01180 /*|| fi.TBP0 == 0x01a40*/)) //TBP0 0x1a40 progressive
+	{
+		skip = 770;	//ntsc, progressive 1540
+	}
+
+	// Checked independently of the case above, so the EU value wins when both match.
+	if(t8_to_ct32 && g_crc_region == CRC::EU && fi.FBP >= 0x03400 && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x01400))
+	{
+		skip = 880;	//pal
+	}
+
+	// Disabled trigger (TME, FBP 0x00000/0x01400, CT24, TBP0 >= 0x03420, T8):
+	// TODO: removes gfx from where it is not supposed to (garage)
+	// skip = 58;
+
+	return true;
+}
+
+// Gran Turismo 3: same issue as GSC_GT4 ??? Skips 770 draws starting at a
+// textured CT32 draw at or above 0x02de0 that reads a T8 texture from 0x00000 or
+// 0x01180. Always returns true.
+bool GSC_GT3(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool source = fi.TBP0 == 0x00000 || fi.TBP0 == 0x01180;
+
+	if(fi.TME && fi.FBP >= 0x02de0 && fi.FPSM == PSM_PSMCT32 && source && fi.TPSM == PSM_PSMT8)
+	{
+		skip = 770;
+	}
+
+	return true;
+}
+
+// Gran Turismo Concept: same issue as GSC_GT4 ??? Skips 880 draws starting at a
+// textured CT32 draw at or above 0x03420 that reads a T8 texture from 0x00000 or
+// 0x01400. Always returns true.
+bool GSC_GTConcept(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool source = fi.TBP0 == 0x00000 || fi.TBP0 == 0x01400;
+
+	if(fi.TME && fi.FBP >= 0x03420 && fi.FPSM == PSM_PSMCT32 && source && fi.TPSM == PSM_PSMT8)
+	{
+		skip = 880;
+	}
+
+	return true;
+}
+
+// Wild Arms 4: starts a 100-draw skip on a textured Z32 draw (0x03100 from
+// 0x01c00) and cuts a running skip down to 1 on the CT32 draw to 0x00e00 sourced
+// from 0x02a00. Always returns true.
+bool GSC_WildArms4(const GSFrameInfo& fi, int& skip)
+{
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	const bool depth_pass = fi.FBP == 0x03100 && fi.FPSM == PSM_PSMZ32 && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMZ32;
+	const bool final_pass = fi.FBP == 0x00e00 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x02a00 && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0)
+	{
+		if(depth_pass)
+		{
+			skip = 100;
+		}
+	}
+	else if(final_pass)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Wild Arms 5: starts a 100-draw skip on a textured Z32 draw (0x03100 from
+// 0x01c00) and cuts a running skip down to 1 on the CT32 draw to 0x00e00 sourced
+// from 0x02a00. Always returns true.
+bool GSC_WildArms5(const GSFrameInfo& fi, int& skip)
+{
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	const bool depth_pass = fi.FBP == 0x03100 && fi.FPSM == PSM_PSMZ32 && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMZ32;
+	const bool final_pass = fi.FBP == 0x00e00 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x02a00 && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0)
+	{
+		if(depth_pass)
+		{
+			skip = 100;
+		}
+	}
+	else if(final_pass)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Manhunt 2: skips 640 draws starting at a textured CT32 draw to 0x03c20 that
+// reads a T8 texture from 0x01400. Always returns true.
+bool GSC_Manhunt2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool trigger = fi.TME && fi.FBP == 0x03c20 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x01400 && fi.TPSM == PSM_PSMT8;
+
+	if(trigger)
+	{
+		skip = 640;
+	}
+
+	return true;
+}
+
+// Crash Bandicoot: The Wrath of Cortex. Returns false for a textured CT32 draw
+// that reads and writes the same main buffer (the "allowed" case); returns true
+// otherwise, starting a 42-draw skip on the Z24 pass and ending a running skip
+// on the draws matched below.
+bool GSC_CrashBandicootWoC(const GSFrameInfo& fi, int& skip)
+{
+	const bool fb_main = fi.FBP == 0x00000 || fi.FBP == 0x008c0 || fi.FBP == 0x00a00;
+
+	if(skip != 0)
+	{
+		if(fb_main)
+		{
+			// untextured draws always end the skip; textured ones only with this exact source
+			if(!fi.TME || (fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x03c00 && fi.TPSM == PSM_PSMCT32))
+			{
+				skip = 0;
+			}
+		}
+
+		return true;
+	}
+
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	// texture and target are the same CT32 main buffer
+	if(fb_main && fi.TBP0 == fi.FBP && fi.FPSM == PSM_PSMCT32 && fi.FPSM == fi.TPSM)
+	{
+		return false; // allowed
+	}
+
+	if((fi.FBP == 0x01e40 || fi.FBP == 0x02200) && fi.FPSM == PSM_PSMZ24 && (fi.TBP0 == 0x01180 || fi.TBP0 == 0x01400) && fi.TPSM == PSM_PSMZ24)
+	{
+		skip = 42;
+	}
+
+	return true;
+}
+
+// Resident Evil 4: starts a skip on two kinds of textured draws to 0x03100.
+// Always returns true.
+bool GSC_ResidentEvil4(const GSFrameInfo& fi, int& skip)
+{
+	// Both triggers are textured draws to the same target.
+	if(skip != 0 || !fi.TME || fi.FBP != 0x03100)
+	{
+		return true;
+	}
+
+	if(fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMZ24)
+	{
+		skip = 176;
+	}
+	else if((fi.TBP0 == 0x2a00 || fi.TBP0 == 0x3480) && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Sacred Blaze: fixes rendering glitches by skipping one unmasked, textured CT32
+// draw to 0x0000 / 0x0e00 sourced from 0x2880 / 0x2a80. Always returns true.
+bool GSC_SacredBlaze(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool target = fi.FBP == 0x0000 || fi.FBP == 0x0e00;
+	const bool source = fi.TBP0 == 0x2880 || fi.TBP0 == 0x2a80;
+
+	if(fi.TME && target && source && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0x0)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Stops the motion blur on the main character and the smudge filter from being
+// drawn on USA versions of Nocturne, Digital Devil Saga 1 and Digital Devil Saga 2.
+//
+// state_addr: address of the game's state variable, read directly from memory.
+// Only active with aggressive hacks, region US, no skip running, and a textured
+// draw sourced from 0xE00. Always returns true.
+template<uptr state_addr>
+bool GSC_SMTNocturneDDS(const GSFrameInfo& fi, int& skip)
+{
+	if(!(Aggresive && g_crc_region == CRC::US && skip == 0 && fi.TBP0 == 0xE00 && fi.TME))
+	{
+		return true;
+	}
+
+	// Note: it will crash if the core doesn't allocate the EE mem in 0x2000_0000 (unlikely but possible)
+	// Aggresive hacks are evil anyway
+
+	// Nocturne:
+	// -0x5900($gp), ref at 0x100740
+	const int state = *(int*)(state_addr);
+
+	switch(state)
+	{
+	case 23:
+	case 24:
+	case 25:
+		skip = 1;
+		break;
+	default:
+		break;
+	}
+
+	return true;
+}
+
+// Spartan: region-specific skips plus a generic one-draw skip for depth textures
+// and draws whose texture shares bits with the target. Always returns true.
+bool GSC_Spartan(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	if(g_crc_region == CRC::EU && fi.TME && fi.FBP == 0x02000 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 107;
+	}
+
+	// Evaluated regardless of the EU case above.
+	if(g_crc_region == CRC::JP && fi.TME && fi.FBP == 0x02180 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x2180 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 3;
+	}
+	else if(fi.TME)
+	{
+		// depth textures (bully, mgs3s1 intro, Front Mission 5)
+		const bool depth_texture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+		// General, often problematic post processing
+		if(depth_texture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+		{
+			skip = 1;
+		}
+	}
+
+	return true;
+}
+
+// Ace Combat 4: skips the cloud and blur passes. Always returns true.
+bool GSC_AceCombat4(const GSFrameInfo& fi, int& skip)
+{
+	// Both triggers are textured draws.
+	if(skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	if(fi.FBP == 0x02a00 && fi.FPSM == PSM_PSMZ24 && fi.TBP0 == 0x01600 && fi.TPSM == PSM_PSMZ24)
+	{
+		skip = 71; // clouds (z, 16-bit)
+	}
+	else if(fi.FBP == 0x02900 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT24)
+	{
+		skip = 28; // blur
+	}
+
+	return true;
+}
+
+// Tekken 5: skips specific CT32 passes sourced from 0x00000, and otherwise 24
+// draws for depth textures or textures sharing bits with the target.
+// Always returns true.
+bool GSC_Tekken5(const GSFrameInfo& fi, int& skip)
+{
+	// Every trigger is a textured draw.
+	if(skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// CT32 texture at 0x00000 drawn to a target of the same format
+	const bool ct32_from_zero = fi.FPSM == fi.TPSM && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT32;
+
+	if((fi.FBP == 0x02d60 || fi.FBP == 0x02d80 || fi.FBP == 0x02ea0 || fi.FBP == 0x03620) && ct32_from_zero)
+	{
+		skip = 95;
+	}
+	else if((fi.FBP == 0x02bc0 || fi.FBP == 0x02be0 || fi.FBP == 0x02d00) && ct32_from_zero)
+	{
+		skip = 2;
+	}
+	else
+	{
+		const bool depth_texture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+		if(depth_texture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+		{
+			skip = 24;
+		}
+	}
+
+	return true;
+}
+
+// Ikki Tousen: skips the shadow and blur passes, and shortens a running skip to 7
+// once the last steps of the shadow drawing are reached. Always returns true.
+bool GSC_IkkiTousen(const GSFrameInfo& fi, int& skip)
+{
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	if(skip == 0)
+	{
+		// both start triggers read the Z24 texture at 0x01180 into a Z24 target
+		if(fi.FPSM == PSM_PSMZ24 && fi.TBP0 == 0x01180 && fi.TPSM == PSM_PSMZ24)
+		{
+			if(fi.FBP == 0x00a80)
+			{
+				skip = 1000; // shadow (result is broken without depth copy, also includes 16 bit)
+			}
+			else if(fi.FBP == 0x00700)
+			{
+				skip = 11; // blur
+			}
+		}
+	}
+	else if(skip > 7)
+	{
+		if(fi.FBP == 0x00700 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00700 && fi.TPSM == PSM_PSMCT16)
+		{
+			skip = 7; // the last steps of shadow drawing
+		}
+	}
+
+	return true;
+}
+
+// God of War: with no skip pending, chooses a `skip` value for four draw
+// configurations; with a skip pending, rewrites it to 3 for textured
+// PSM_PSMCT16 draws at FBP 0. Always returns true.
+bool GSC_GodOfWar(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT16)
+		{
+			skip = 3;
+		}
+
+		return true;
+	}
+
+	const bool ct16Masked = fi.TME && fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF;
+	const bool blur = fi.TME && fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0xff000000;
+
+	// TZTST / FBMSK pairs accepted by the "wall of fog" rule.
+	const bool fogTestAndMask = ((fi.TZTST == 2 || fi.TZTST == 1) && fi.FBMSK == 0x00FFFFFF) || (fi.TZTST == 3 && fi.FBMSK == 0xFF000000);
+	const bool wallOfFog = fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8 && fogTestAndMask;
+
+	const bool depthTexture = fi.TME && (fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S);
+
+	if(ct16Masked)
+	{
+		skip = 1000;
+	}
+	else if(blur)
+	{
+		skip = 1; // blur
+	}
+	else if(wallOfFog)
+	{
+		skip = 1; // wall of fog
+	}
+	else if(depthTexture)
+	{
+		// Equivalent to the UserHacks_AutoSkipDrawDepth hack but enabled by default
+		// http://forums.pcsx2.net/Thread-God-of-War-Red-line-rendering-explained
+		skip = 1;
+	}
+
+	return true;
+}
+
+// God of War 2: with no skip pending and texturing enabled, chooses a `skip`
+// value for shadows, fog, haze, water and depth-texture draws; with a skip
+// pending, rewrites it to 3 for textured PSM_PSMCT16 draws at FBP 0x100/0x2100.
+// Always returns true.
+bool GSC_GodOfWar2(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && (fi.FBP == 0x00100 || fi.FBP == 0x02100) && fi.FPSM == PSM_PSMCT16)
+		{
+			skip = 3;
+		}
+
+		return true;
+	}
+
+	if(!fi.TME)
+	{
+		return true;
+	}
+
+	const bool shadowsNtsc = fi.FBP == 0x00100 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00100 && fi.TPSM == PSM_PSMCT16; // ntsc
+	const bool shadowsPal = fi.FBP == 0x02100 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x02100 && fi.TPSM == PSM_PSMCT16; // pal
+
+	if(shadowsNtsc || shadowsPal)
+	{
+		skip = 1000; // shadows
+	}
+
+	// The chain below is evaluated independently of the shadow test above
+	// (plain `if`, not `else if`), exactly as in the original code.
+	const bool fogTestAndMask = ((fi.TZTST == 2 || fi.TZTST == 1) && fi.FBMSK == 0x00FFFFFF) || (fi.TZTST == 3 && fi.FBMSK == 0xFF000000);
+	const bool wallOfFog = (fi.FBP == 0x00100 || fi.FBP == 0x02100) && fi.FPSM == PSM_PSMCT32 && (fi.TBP0 & 0x03000) == 0x03000
+		&& (fi.TPSM == PSM_PSMT8 || fi.TPSM == PSM_PSMT4)
+		&& fogTestAndMask;
+
+	if(wallOfFog)
+	{
+		skip = 1; // wall of fog
+	}
+	else if(Aggresive && fi.TPSM == PSM_PSMCT24 && fi.FBP == 0x1300 && (fi.TBP0 == 0x0F00 || fi.TBP0 == 0x1300 || fi.TBP0 == 0x2b00)) // || fi.FBP == 0x0100
+	{
+		skip = 1; // global haze/halo
+	}
+	else if(Aggresive && fi.TPSM == PSM_PSMCT24 && fi.FBP == 0x0100 && (fi.TBP0 == 0x2b00 || fi.TBP0 == 0x2e80)) //480P 2e80
+	{
+		skip = 1; // water effect and water vertical lines
+	}
+	else if(fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S)
+	{
+		// Equivalent to the UserHacks_AutoSkipDrawDepth hack but enabled by default
+		// http://forums.pcsx2.net/Thread-God-of-War-Red-line-rendering-explained
+		skip = 1;
+	}
+
+	return true;
+}
+
+// GiTS: when no skip is pending, sets `skip` to 1315 for one specific textured
+// PSM_PSMCT16 draw. A pending skip is left untouched. Always returns true.
+bool GSC_GiTS(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool trigger = fi.TME && fi.FBP == 0x01400 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x02e40 && fi.TPSM == PSM_PSMCT16;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1315;
+	}
+
+	return true;
+}
+
+// Onimusha 3: sets `skip` to 1 for textured draws sampling one of four TBP0
+// values in PSM_PSMCT32/PSM_PSMCT24. Unlike most hacks here it does not test
+// the incoming `skip` first. Always returns true.
+bool GSC_Onimusha3(const GSFrameInfo& fi, int& skip)
+{
+	// A former restriction on FBP (0x00000 or 0x00700) is disabled in the original code.
+	const bool knownTexture = fi.TBP0 == 0x01180 || fi.TBP0 == 0x00e00 || fi.TBP0 == 0x01000 || fi.TBP0 == 0x01200;
+	const bool colourTexture = fi.TPSM == PSM_PSMCT32 || fi.TPSM == PSM_PSMCT24;
+
+	if(fi.TME && knownTexture && colourTexture)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Tales of the Abyss: starts a skip (1000 or 1) for two textured draw patterns
+// when none is pending, and cancels a pending skip as soon as a textured draw
+// with TPSM != PSM_PSMT8 arrives. Always returns true.
+bool GSC_TalesOfAbyss(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		if(fi.TME && fi.TPSM != PSM_PSMT8)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	const bool knownTarget = fi.TME && (fi.FBP == 0x00000 || fi.FBP == 0x00e00);
+
+	if(knownTarget && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMT8)
+	{
+		// copies the z buffer to the alpha channel of the fb
+		skip = 1000;
+	}
+	else if(knownTarget && (fi.TBP0 == 0x03560 || fi.TBP0 == 0x038e0) && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Sonic Unleashed: starts a skip of 1000 for the shadow draw when none is
+// pending; while a skip is pending, rewrites it to 2 on a specific textured
+// PSM_PSMCT16 draw at FBP 0. Always returns true.
+bool GSC_SonicUnleashed(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && fi.FBP == 0x00000 && fi.FPSM == PSM_PSMCT16 && fi.TPSM == PSM_PSMCT16S)
+		{
+			skip = 2;
+		}
+
+		return true;
+	}
+
+	if(fi.TME && fi.FPSM == PSM_PSMCT16S && fi.TBP0 == 0x00000 && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 1000; // shadow
+	}
+
+	return true;
+}
+
+// The Simpsons Game: starts a skip of 100 on a self-referencing PSM_PSMCT32
+// draw at 0x03000 when none is pending; while a skip is pending, rewrites it
+// to 2 on a textured PSM_PSMT8H draw to FBP 0x03000. Always returns true.
+bool GSC_SimpsonsGame(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && fi.FBP == 0x03000 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8H)
+		{
+			skip = 2;
+		}
+
+		return true;
+	}
+
+	// Frame buffer and texture share the same address (0x03000) and format (PSM_PSMCT32).
+	const bool selfSample = fi.FBP == fi.TBP0 && fi.FPSM == fi.TPSM && fi.TBP0 == 0x03000 && fi.TPSM == PSM_PSMCT32;
+
+	if(fi.TME && selfSample)
+	{
+		skip = 100;
+	}
+
+	return true;
+}
+
+// Genji: acts only when no skip is pending. Sets `skip` to 1 or 6 for several
+// draw patterns. Returns false for the PSM_PSMZ16 -> PSM_PSMCT16 pattern when
+// Dx_only is not set; returns true in every other case.
+bool GSC_Genji(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// First rule: any non-zero TPSM sampled from TBP0 0x1500 into FBP 0x700 or 0.
+	if(fi.TME && (fi.FBP == 0x700 || fi.FBP == 0x0) && fi.TBP0 == 0x1500 && fi.TPSM != 0)
+	{
+		skip = 1;
+		return true;
+	}
+
+	const bool depthToCt16 = fi.TME && fi.FBP == 0x01500 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x00e00 && fi.TPSM == PSM_PSMZ16;
+
+	if(depthToCt16)
+	{
+		// likely fixed in openGL (texture shuffle)
+		if(!Dx_only)
+		{
+			return false;
+		}
+
+		skip = 6;
+	}
+	else if(fi.TPSM == PSM_PSMCT24 && fi.TME == 0x0001 && fi.TBP0 == fi.FBP)
+	{
+		skip = 1;
+	}
+	else if(fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Star Ocean 3: keeps `skip` at 1000 for as long as consecutive draws match the
+// pseudo-stencil pattern, and resets it to 0 on the first draw that does not.
+// Always returns true.
+bool GSC_StarOcean3(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	// The game emulate a stencil buffer with the alpha channel of the RT
+	// The operation of the stencil is selected with the palette
+	// For example -1 wrap will be [240, 16, 32, 48 ....]
+	// i.e. p[A>>4] = (A - 16) % 256
+	//
+	// The fastest and accurate solution will be to replace this pseudo stencil
+	// by a dedicated GPU draw call
+	// 1/ Use future GPU capabilities to do a "kind" of SW blending
+	// 2/ Use a real stencil/atomic image, and then compute the RT alpha value
+	//
+	// Both of those solutions will increase code complexity (and only avoid upscaling
+	// glitches)
+
+	const bool stencilDraw = fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT4HH;
+
+	if(!stencilDraw)
+	{
+		// Either nothing was pending (skip stays 0) or the run just ended.
+		skip = 0;
+	}
+	else if(skip == 0)
+	{
+		skip = 1000; //
+	}
+
+	return true;
+}
+
+// Valkyrie Profile 2: keeps `skip` at 1000 while consecutive draws match the
+// PSM_PSMT4HH self-sampling pattern and resets it to 0 on the first draw that
+// does not. Always returns true.
+bool GSC_ValkyrieProfile2(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	// Two earlier rules are kept here, disabled, for reference:
+	/*if(fi.TME && (fi.FBP == 0x018c0 || fi.FBP == 0x02180) && fi.FPSM == fi.TPSM && fi.TBP0 >= 0x03200 && fi.TPSM == PSM_PSMCT32)	//NTSC only, !(fi.TBP0 == 0x03580 || fi.TBP0 == 0x03960)
+	{
+		skip = 1;	//red garbage in lost forest, removes other effects...
+	}
+	if(fi.TME && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF)
+	{
+		skip = 1; // //garbage in cutscenes, doesn't remove completely, better use "Alpha Hack"
+	}*/
+
+	// GH: Hack is quite similar to GSC_StarOcean3. It is potentially the same issue.
+	const bool stencilDraw = fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT4HH;
+
+	if(!stencilDraw)
+	{
+		// Either nothing was pending (skip stays 0) or the run just ended.
+		skip = 0;
+	}
+	else if(skip == 0)
+	{
+		skip = 1000; //
+	}
+
+	return true;
+}
+
+// Radiata Stories: with no skip pending, sets `skip` to 1 for masked
+// PSM_PSMCT16 draws or to 1000 for the PSM_PSMT4HH self-sampling pattern; a
+// pending skip is reset to 0 by the first draw outside that pattern.
+// Always returns true.
+bool GSC_RadiataStories(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool stencilDraw = fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT4HH;
+
+	if(skip != 0)
+	{
+		if(!stencilDraw)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	const bool maskedCt16 = fi.TME && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF;
+
+	if(maskedCt16)
+	{
+		skip = 1;
+	}
+	else if(stencilDraw)
+	{
+		// GH: Hack is quite similar to GSC_StarOcean3. It is potentially the same issue.
+		// Fixed on openGL
+		skip = 1000;
+	}
+
+	return true;
+}
+
+// Haunting Ground: acts only when no skip is pending and sets `skip` to 1 for
+// several draw patterns. Returns false for the masked PSM_PSMCT16S pattern when
+// Dx_only is not set; returns true in every other case.
+bool GSC_HauntingGround(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool maskedCt16s = fi.TME && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16S && fi.FBMSK == 0x03FFF;
+	const bool bloom = fi.TME && fi.FBP == 0x3000 && fi.TBP0 == 0x3380;
+	const bool ct32Copy = fi.TME && fi.FBP == 0x2200 && fi.TBP0 == 0x3a80 && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+	const bool t8hUnmasked = fi.FBP == 0x2200 && fi.TBP0 == 0x3000 && fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0;
+
+	if(maskedCt16s)
+	{
+		if(!Dx_only)
+		{
+			return false;
+		}
+
+		skip = 1;
+	}
+	else if(bloom)
+	{
+		skip = 1; // bloom
+	}
+	else if(ct32Copy || t8hUnmasked)
+	{
+		skip = 1;
+	}
+	else if(fi.TME)
+	{
+		// depth textures (bully, mgs3s1 intro, Front Mission 5)
+		const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+		// General, often problematic post processing
+		if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+		{
+			skip = 1;
+		}
+	}
+
+	return true;
+}
+
+// Evangelion Jo: when no skip is pending, sets `skip` to 1 if either of two
+// independent conditions holds. Always returns true.
+bool GSC_EvangelionJo(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// The original expression mixes && and || without parentheses; the two
+	// terms below spell out how C++ precedence groups it.
+	const bool texturedFrom2BC0 = fi.TME && fi.TBP0 == 0x2BC0;
+	const bool zeroFormatsOnKnownFb = (fi.FBP == 0 || fi.FBP == 0x1180) && (fi.FPSM | fi.TPSM) == 0;
+
+	if(texturedFrom2BC0 || zeroFormatsOnKnownFb)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Suikoden Tactics: when no skip is pending, sets `skip` to 4 for untextured
+// draws with a PSM_PSMT8H texture format, FBMSK 0xFF000000, TBP0 0 and shared
+// bits between frame buffer and texture. Always returns true.
+bool GSC_SuikodenTactics(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool candidate = !fi.TME && fi.TPSM == PSM_PSMT8H && fi.FPSM == 0 && fi.FBMSK == 0x0FF000000 && fi.TBP0 == 0;
+
+	// HasSharedBits() is only consulted once the cheap field checks pass.
+	if(candidate && GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 4;
+	}
+
+	return true;
+}
+
+// Captain Tsubasa: when no skip is pending, sets `skip` to 1 for textured draws
+// to FBP 0x1C00 with a zero FBMSK. Always returns true.
+bool GSC_CaptainTsubasa(const GSFrameInfo& fi, int& skip)
+{
+	const bool trigger = fi.TME && fi.FBP == 0x1C00 && fi.FBMSK == 0;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Oneechanbara 2 Special: when no skip is pending, sets `skip` to 1 for
+// textured PSM_PSMCT24 draws to FBP 0x01180. Always returns true.
+bool GSC_Oneechanbara2Special(const GSFrameInfo& fi, int& skip)
+{
+	const bool trigger = fi.TPSM == PSM_PSMCT24 && fi.TME && fi.FBP == 0x01180;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Narutimate Accel: with no skip pending, sets `skip` to 105 or 1 for two draw
+// patterns on FBP 0x3800; while a skip is pending, rewrites it to 1 when a
+// textured draw samples TBP0 0x3800 into FBP 0. Always returns true.
+bool GSC_NarutimateAccel(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		if(fi.FBP == 0 && fi.TBP0 == 0x3800 && fi.TME && (fi.FPSM | fi.TPSM) == 0)
+		{
+			skip = 1;
+		}
+
+		return true;
+	}
+
+	const bool texturedStart = fi.TME && fi.FBP == 0x3800 && fi.TBP0 == 0 && (fi.FPSM | fi.TPSM) == 0;
+	const bool untexturedAlpha = !fi.TME && fi.FBP == 0x3800 && fi.TBP0 == 0x1E00 && fi.FPSM == 0 && fi.TPSM == 49 && fi.FBMSK == 0xFF000000;
+
+	if(texturedStart)
+	{
+		skip = 105;
+	}
+	else if(untexturedAlpha)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Naruto: with no skip pending, sets `skip` to 105 for a textured draw on FBP
+// 0x3800; while a skip is pending, rewrites it to 1 when a textured draw
+// samples TBP0 0x3800 into FBP 0. Always returns true.
+bool GSC_Naruto(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		if(fi.FBP == 0 && fi.TBP0 == 0x3800 && fi.TME && (fi.FPSM | fi.TPSM) == 0)
+		{
+			skip = 1;
+		}
+
+		return true;
+	}
+
+	const bool texturedStart = fi.TME && fi.FBP == 0x3800 && fi.TBP0 == 0 && (fi.FPSM | fi.TPSM) == 0;
+	const bool untexturedAlpha = !fi.TME && fi.FBP == 0x3800 && fi.TBP0 == 0x1E00 && fi.FPSM == 0 && fi.TPSM == 49 && fi.FBMSK == 0xFF000000;
+
+	if(texturedStart)
+	{
+		skip = 105;
+	}
+	else if(untexturedAlpha)
+	{
+		// Assigns 0 while skip is already 0, so this branch changes nothing.
+		// (GSC_NarutimateAccel uses 1 for the same pattern.)
+		skip = 0;
+	}
+
+	return true;
+}
+
+// Eternal Poison: when no skip is pending, sets `skip` to 1 for draws whose
+// texture is PSM_PSMCT16S at TBP0 0x3200. Always returns true.
+bool GSC_EternalPoison(const GSFrameInfo& fi, int& skip)
+{
+	// Texture shuffle ???
+	const bool trigger = fi.TPSM == PSM_PSMCT16S && fi.TBP0 == 0x3200;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Lego Batman: only when Aggresive is set and no skip is pending, sets `skip`
+// to 3 for textured PSM_PSMZ16 -> PSM_PSMCT16 draws with a zero FBMSK.
+// Always returns true.
+bool GSC_LegoBatman(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(!Aggresive || skip != 0)
+	{
+		return true;
+	}
+
+	if(fi.TME && fi.TPSM == PSM_PSMZ16 && fi.FPSM == PSM_PSMCT16 && fi.FBMSK == 0x00000)
+	{
+		skip = 3;
+	}
+
+	return true;
+}
+
+// Sakura Taisen: acts only when no skip is pending (skip == 0) and may set
+// `skip` to 1 for three draw patterns. Always returns true.
+bool GSC_SakuraTaisen(const GSFrameInfo& fi, int& skip)
+{
+	if(skip == 0)
+	{
+		// NOTE(review): this branch assigns 0 while skip is already 0, so it has no effect.
+		if(!fi.TME && (fi.FBP == 0x0 || fi.FBP == 0x1180) && (fi.TBP0!=0x3fc0 && fi.TBP0!=0x3c9a && fi.TBP0 !=0x3dec /*fi.TBP0 ==0x38d0 || fi.TBP0==0x3912 ||fi.TBP0==0x3bdc ||fi.TBP0==0x3ab3 ||fi.TBP0<=0x3a92*/) && fi.FPSM == PSM_PSMCT32 && (fi.TPSM == PSM_PSMT8 || fi.TPSM == PSM_PSMT4) && (fi.FBMSK == 0x00FFFFFF || !fi.FBMSK))
+		{
+			skip = 0; //3dec 3fc0 3c9a
+		}
+		// NOTE(review): same here - assigning 0 to a skip that is already 0 is a no-op.
+		if(!fi.TME && (fi.FBP | fi.TBP0) !=0 && (fi.FBP | fi.TBP0) !=0x1180 && (fi.FBP | fi.TBP0) !=0x3be0 && (fi.FBP | fi.TBP0) !=0x3c80 && fi.TBP0!=0x3c9a  && (fi.FBP | fi.TBP0) !=0x3d80 && fi.TBP0 !=0x3dec&& fi.FPSM == PSM_PSMCT32 && (fi.FBMSK==0))
+		{
+			skip =0; //3dec 3fc0 3c9a
+		}
+		// Untextured PSM_PSMCT32 draw with FBMSK 0 whose FBP|TBP0 and TBP0 avoid the listed values.
+		// NOTE(review): this tests TBP0 != 0x3de, while the branch above and the trailing
+		// comment use 0x3dec - possibly a typo; confirm before changing.
+		if(!fi.TME && (fi.FBP | fi.TBP0) !=0 && (fi.FBP | fi.TBP0) !=0x1180 && (fi.FBP | fi.TBP0) !=0x3be0 && (fi.FBP | fi.TBP0) !=0x3c80 && (fi.FBP | fi.TBP0) !=0x3d80 && fi.TBP0!=0x3c9a && fi.TBP0 !=0x3de && fi.FPSM == PSM_PSMCT32 && (fi.FBMSK==0))
+		{
+			skip =1; //3dec 3fc0 3c9a
+		}
+		// Textured PSM_PSMT4 draw from TBP0 0x35B8 into FBP 0 or 0x1180.
+		else if(fi.TME && (fi.FBP == 0 || fi.FBP == 0x1180) && fi.TBP0 == 0x35B8 && fi.TPSM == PSM_PSMT4)
+		{
+			skip = 1;
+		}
+		else
+		{
+			// Untextured PSM_PSMCT32 draw where FBP|TBP0 equals 0x38d0.
+			if(!fi.TME && (fi.FBP | fi.TBP0) ==0x38d0 && fi.FPSM == PSM_PSMCT32 )
+			{
+				skip = 1; //3dec 3fc0 3c9a
+			}
+		}
+	}
+
+	return true;
+}
+
+// Tenchu: when no skip is pending, sets `skip` to 3 for textured
+// PSM_PSMZ16 -> PSM_PSMCT16 draws with FBMSK 0x03FFF. Always returns true.
+bool GSC_Tenchu(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool trigger = fi.TME && fi.TPSM == PSM_PSMZ16 && fi.FPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 3;
+	}
+
+	return true;
+}
+
+// Sly 3: starts a skip of 1000 on textured PSM_PSMCT16 draws between a known
+// set of FBP/TBP0 values; while a skip is pending, rewrites it to 3 on a masked
+// PSM_PSMCT16 draw. Always returns true.
+bool GSC_Sly3(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF)
+		{
+			skip = 3;
+		}
+
+		return true;
+	}
+
+	const bool knownTarget = fi.FBP == 0x00000 || fi.FBP == 0x00700 || fi.FBP == 0x00a80 || fi.FBP == 0x00e00;
+	const bool knownTexture = fi.TBP0 == 0x00000 || fi.TBP0 == 0x00700 || fi.TBP0 == 0x00a80 || fi.TBP0 == 0x00e00;
+
+	if(fi.TME && knownTarget && fi.FPSM == fi.TPSM && knownTexture && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 1000;
+	}
+
+	return true;
+}
+
+// Sly 2: on a textured, masked (FBMSK 0x03FFF) PSM_PSMCT16 draw, starts a skip
+// of 1000 when none is pending and FBP is one of three known values, or
+// rewrites a pending skip to 3. Always returns true.
+bool GSC_Sly2(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	// Both branches share this part of the test.
+	const bool maskedCt16 = fi.TME && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF;
+
+	if(!maskedCt16)
+	{
+		return true;
+	}
+
+	if(skip != 0)
+	{
+		skip = 3;
+	}
+	else if(fi.FBP == 0x00000 || fi.FBP == 0x00700 || fi.FBP == 0x00800)
+	{
+		skip = 1000;
+	}
+
+	return true;
+}
+
+// Shadow of Rome: when no skip is pending, sets `skip` to 1 if any of four
+// draw patterns matches. Always returns true.
+bool GSC_ShadowofRome(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool t8hMasked = fi.FBP != 0 && fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0x00FFFFFF;
+	const bool knownTexHighMask = fi.TME == 0x0001 && (fi.TBP0 == 0x1300 || fi.TBP0 == 0x0f00) && fi.FBMSK >= 0xFFFFFF;
+	// The two explicit TBP0 values are also covered by the <= 0x0800 test; kept as in the original.
+	const bool t8LowTexture = fi.TME && fi.FPSM == PSM_PSMCT32 && (fi.TBP0 == 0x0160 || fi.TBP0 == 0x01e0 || fi.TBP0 <= 0x0800) && fi.TPSM == PSM_PSMT8;
+	const bool colourFrom0700 = fi.TME && fi.TBP0 == 0x0700 && (fi.TPSM == PSM_PSMCT32 || fi.TPSM == PSM_PSMCT24);
+
+	if(t8hMasked || knownTexHighMask || t8LowTexture || colourFrom0700)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// FFXII: only when Aggresive is set and no skip is pending, sets `skip` to 1
+// for textured draws that sample a Z-format texture or for which
+// HasSharedBits() reports true. Always returns true.
+bool GSC_FFXII(const GSFrameInfo& fi, int& skip)
+{
+	if(!Aggresive || skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// FFX-2: only when Aggresive is set and no skip is pending, sets `skip` to 1
+// for textured draws that sample a Z-format texture or for which
+// HasSharedBits() reports true. Always returns true.
+bool GSC_FFX2(const GSFrameInfo& fi, int& skip)
+{
+	if(!Aggresive || skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// FFX: only when Aggresive is set and no skip is pending, sets `skip` to 1 for
+// textured draws that sample a Z-format texture or for which HasSharedBits()
+// reports true. Always returns true.
+bool GSC_FFX(const GSFrameInfo& fi, int& skip)
+{
+	if(!Aggresive || skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Demon Stone: starts a skip of 1000 on a textured PSM_PSMCT16 draw to FBP
+// 0x01400; while a skip is pending, rewrites it to 2 on a textured PSM_PSMCT32
+// draw to FBP 0 or 0x01000. Always returns true.
+bool GSC_DemonStone(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && (fi.FBP == 0x00000 || fi.FBP == 0x01000) && fi.FPSM == PSM_PSMCT32)
+		{
+			skip = 2;
+		}
+
+		return true;
+	}
+
+	const bool knownTexture = fi.TBP0 == 0x00000 || fi.TBP0 == 0x01000;
+
+	if(fi.TME && fi.FBP == 0x01400 && fi.FPSM == fi.TPSM && knownTexture && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 1000;
+	}
+
+	return true;
+}
+
+// Big Mutha Truckers: when no skip is pending, sets `skip` to 3 for textured
+// PSM_PSMCT16 draws (same frame and texture format) to FBP 0 or 0x00a00.
+// Always returns true.
+bool GSC_BigMuthaTruckers(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool knownTarget = fi.FBP == 0x00000 || fi.FBP == 0x00a00;
+	const bool trigger = fi.TME && knownTarget && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 3;
+	}
+
+	return true;
+}
+
+// TimeSplitters 2: when no skip is pending, sets `skip` to 1 for textured
+// PSM_PSMCT32 draws with FBMSK 0xFF000000 between a known set of FBP/TBP0
+// values. Always returns true.
+bool GSC_TimeSplitters2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool knownTarget = fi.FBP == 0x00000 || fi.FBP == 0x00e00 || fi.FBP == 0x01000;
+	const bool knownTexture = fi.TBP0 == 0x00000 || fi.TBP0 == 0x00e00 || fi.TBP0 == 0x01000;
+
+	if(fi.TME && knownTarget && fi.FPSM == fi.TPSM && knownTexture && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0x0FF000000)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// LOTR The Two Towers: with no skip pending, sets `skip` to 1000 for the shadow
+// pass or 3 for the fog pass; while a skip is pending, rewrites it to 2 on a
+// textured PSM_PSMCT32 draw between known FBP/TBP0 values. Always returns true.
+bool GSC_LordOfTheRingsTwoTowers(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		if(fi.TME && (fi.FBP == 0x00000 || fi.FBP == 0x01000) && (fi.TBP0 == 0x01180 || fi.TBP0 == 0x01400) && fi.FPSM == PSM_PSMCT32)
+		{
+			skip = 2;
+		}
+
+		return true;
+	}
+
+	const bool shadows = fi.TME && (fi.FBP == 0x01180 || fi.FBP == 0x01400) && fi.FPSM == fi.TPSM && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x01000) && fi.TPSM == PSM_PSMCT16;
+	const bool wallOfFog = fi.TME && fi.TPSM == PSM_PSMZ16 && fi.TBP0 == 0x01400 && fi.FPSM == PSM_PSMCT16 && fi.FBMSK == 0x03FFF;
+
+	if(shadows)
+	{
+		skip = 1000;//shadows
+	}
+	else if(wallOfFog)
+	{
+		skip = 3;	//wall of fog
+	}
+
+	return true;
+}
+
+// LOTR The Third Age: starts a skip of 1000 on an untextured shadow draw to FBP
+// 0x03000; while a skip is pending, rewrites it to 1 on a textured draw that
+// samples TBP0 0x03000 as PSM_PSMCT24. Always returns true.
+bool GSC_LordOfTheRingsThirdAge(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		const bool knownTarget = fi.FBP == 0x0 || fi.FBP == 0x00e00 || fi.FBP == 0x01000;
+
+		if(fi.TME && knownTarget && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x03000 && fi.TPSM == PSM_PSMCT24)
+		{
+			skip = 1;
+		}
+
+		return true;
+	}
+
+	if(!fi.TME && fi.FBP == 0x03000 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT4 && fi.FBMSK == 0xFF000000)
+	{
+		skip = 1000;	//shadows
+	}
+
+	return true;
+}
+
+// Red Dead Revolver: with no skip pending, sets `skip` to 1200 or 2 for three
+// draw patterns; while a skip is pending, rewrites it to 1 on a textured
+// PSM_PSMCT32 draw to FBP 0x00800 or 0x009c0. Always returns true.
+bool GSC_RedDeadRevolver(const GSFrameInfo& fi, int& skip)
+{
+	const bool filterTarget = fi.FBP == 0x00800 || fi.FBP == 0x009c0;
+
+	if(skip != 0)
+	{
+		if(fi.TME && filterTarget && fi.FPSM == PSM_PSMCT32)
+		{
+			skip = 1;
+		}
+
+		return true;
+	}
+
+	const bool untexturedCt24 = !fi.TME && (fi.FBP == 0x02420 || fi.FBP == 0x025e0) && fi.FPSM == PSM_PSMCT24;
+	const bool filter = fi.TME && filterTarget && fi.FPSM == fi.TPSM && (fi.TBP0 == 0x01600 || fi.TBP0 == 0x017c0) && fi.TPSM == PSM_PSMCT32;
+	const bool blur = fi.FBP == 0x03700 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMCT24;
+
+	if(untexturedCt24)
+	{
+		skip = 1200;
+	}
+	else if(filter)
+	{
+		skip = 2;	//filter
+	}
+	else if(blur)
+	{
+		skip = 2;	//blur
+	}
+
+	return true;
+}
+
+// Heavy Metal Thunder: starts a skip of 100 on a textured PSM_PSMZ32 draw
+// (FBP 0x03100, TBP0 0x01c00); while a skip is pending, rewrites it to 1 on a
+// textured PSM_PSMCT32 draw (FBP 0x00e00, TBP0 0x02a00). Always returns true.
+bool GSC_HeavyMetalThunder(const GSFrameInfo& fi, int& skip)
+{
+	// Both rules need texturing and identical frame/texture formats.
+	if(!fi.TME || fi.FPSM != fi.TPSM)
+	{
+		return true;
+	}
+
+	if(skip == 0)
+	{
+		if(fi.FBP == 0x03100 && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMZ32)
+		{
+			skip = 100;
+		}
+	}
+	else if(fi.FBP == 0x00e00 && fi.TBP0 == 0x02a00 && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Bleach Blade Battlers: when no skip is pending, sets `skip` to 1 for a
+// textured PSM_PSMCT32 draw from TBP0 0x03fc0 to FBP 0x01180.
+// Always returns true.
+bool GSC_BleachBladeBattlers(const GSFrameInfo& fi, int& skip)
+{
+	const bool trigger = fi.TME && fi.FBP == 0x01180 && fi.FPSM == fi.TPSM && fi.TBP0 == 0x03fc0 && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Castlevania: when no skip is pending, sets `skip` to 2 for textured draws to
+// FBP 0 with a non-zero TBP0, TPSM 10 and FBMSK 0xFFFFFF. Always returns true.
+bool GSC_Castlevania(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	// This hack removes the shadows and globally darker image
+	// I think there are 2 issues on GSdx
+	//
+	// 1/ potential not correctly supported colclip.
+	//
+	// 2/ use of a 32 bits format to emulate a 16 bit formats
+	// For example, if you blend 64 time the value 4 on a dark destination pixels
+	//
+	// FMT32: 4*64 = 256 <= white pixels
+	//
+	// FMT16: output of blending will always be 0 because the 3 lsb of color is dropped.
+	//		  Therefore the pixel remains dark !!!
+	const bool trigger = fi.TME && fi.FBP == 0 && fi.TBP0 != 0 && fi.TPSM == 10 && fi.FBMSK == 0xFFFFFF;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Black: with no skip pending, sets `skip` to 5 for a PSM_PSMZ16 ->
+// PSM_PSMCT16 draw from known TBP0 values or whenever HasSharedBits() reports
+// true; a pending skip is cancelled (set to 0) by two specific PSM_PSMCT32
+// draw patterns. Always returns true.
+bool GSC_Black(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		const bool texturedT4 = fi.TME && (fi.FBP == 0x00000 || fi.FBP == 0x008c0 || fi.FBP == 0x0a00 ) && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT4;
+		const bool untexturedT8h = !fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8H;
+
+		if(texturedT4 || untexturedT8h)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	// Note: the first part of the hack must be fixed in openGL (texture shuffle). Remains the 2nd part (HasSharedBits)
+	// A former restriction on FBP (0x00000 or 0x008c0) is disabled in the original code.
+	const bool depthToCt16 = fi.TME && fi.FPSM == PSM_PSMCT16 && (fi.TBP0 == 0x01a40 || fi.TBP0 == 0x01b80 || fi.TBP0 == 0x030c0) && fi.TPSM == PSM_PSMZ16;
+
+	// The HasSharedBits() term applies regardless of fi.TME, as in the original precedence.
+	if(depthToCt16 || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 5;
+	}
+
+	return true;
+}
+
+// Crash 'n' Burn: when no skip is pending, sets `skip` to 1 for textured draws
+// that sample a Z-format texture or for which HasSharedBits() reports true.
+// Always returns true.
+bool GSC_CrashNburn(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Tomb Raider: when no skip is pending, sets `skip` to 1 for textured
+// PSM_PSMCT32 draws (same frame and texture format) to FBP 0x01000.
+// Always returns true.
+bool GSC_TombRaider(const GSFrameInfo& fi, int& skip)
+{
+	const bool trigger = fi.TME && fi.FBP == 0x01000 && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Tomb Raider Legend: when no skip is pending, sets `skip` to 1 for textured
+// PSM_PSMCT32 draws to FBP 0x01000 from a list of TBP0 values, or to 2 for a
+// second PSM_PSMCT32 pattern. Always returns true.
+bool GSC_TombRaiderLegend(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// TBP0 0x2F00 was also considered at some point (left disabled in the original).
+	const bool knownTexture = fi.TBP0 == 0x2b60 || fi.TBP0 == 0x2b80 || fi.TBP0 == 0x2E60 || fi.TBP0 == 0x3020 || fi.TBP0 == 0x3200 || fi.TBP0 == 0x3320;
+	const bool ct32Copy = fi.TME && fi.FBP == 0x01000 && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32 && knownTexture;
+	const bool unmaskedFrom2bc0 = fi.TPSM == PSM_PSMCT32 && (fi.TPSM | fi.FBP) == 0x2fa0 && fi.TBP0 == 0x2bc0 && fi.FBMSK == 0;
+
+	if(ct32Copy)
+	{
+		skip = 1;
+	}
+	else if(unmaskedFrom2bc0)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Tomb Raider Underworld: when no skip is pending, sets `skip` to 1 for
+// textured PSM_PSMCT32 draws to FBP 0x01000 from a range of TBP0 values, or to
+// 2 for a second PSM_PSMCT32 pattern. Always returns true.
+bool GSC_TombRaiderUnderWorld(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// Accepted TBP0: exactly 0x2B60, or anything from 0x2C01 upward except 0x3029 and 0x302d.
+	// (Disabled alternatives in the original: fi.TBP0 == 0x2EFF || fi.TBP0 == 0x2F00 || fi.TBP0 == 0x3020)
+	const bool knownTexture = fi.TBP0 == 0x2B60 || (fi.TBP0 >= 0x2C01 && fi.TBP0 != 0x3029 && fi.TBP0 != 0x302d);
+	const bool ct32Copy = fi.TME && fi.FBP == 0x01000 && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32 && knownTexture;
+	const bool unmaskedFrom0ee0 = fi.TPSM == PSM_PSMCT32 && (fi.TPSM | fi.FBP) == 0x2c00 && fi.TBP0 == 0x0ee0 && fi.FBMSK == 0;
+
+	if(ct32Copy)
+	{
+		skip = 1;
+	}
+	else if(unmaskedFrom0ee0)
+	{
+		skip = 2;
+	}
+	/*else if(fi.TPSM == PSM_PSMCT16 && (fi.TPSM | fi.FBP)>=0x0 && (fi.TBP0 >=0x0) && fi.FBMSK ==0)
+	{
+		skip = 600;
+	}*/
+
+	return true;
+}
+
+// SSX 3: only when Aggresive is set and no skip is pending, sets `skip` to 1
+// for textured draws that sample a Z-format texture or for which
+// HasSharedBits() reports true. Always returns true.
+bool GSC_SSX3(const GSFrameInfo& fi, int& skip)
+{
+	if(!Aggresive || skip != 0 || !fi.TME)
+	{
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// FFVII Dirge of Cerberus: when no skip is pending, sets `skip` to 1 for a
+// textured PSM_PSMCT24 draw from TBP0 0x02c00 into a PSM_PSMCT32 frame buffer
+// at FBP 0x01c00. Always returns true.
+bool GSC_FFVIIDoC(const GSFrameInfo& fi, int& skip)
+{
+	const bool trigger = fi.TME && fi.FBP == 0x01c00 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x02c00 && fi.TPSM == PSM_PSMCT24;
+
+	if(skip == 0 && trigger)
+	{
+		skip = 1;
+	}
+
+	// A second rule is disabled in the original (its body is commented out):
+	//   !fi.TME && fi.FBP == 0x01c00 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x01c00 && fi.TPSM == PSM_PSMCT24
+	//   => skip = 1;
+
+	return true;
+}
+
+// Devil May Cry 3: when no skip is pending, tests three draw patterns on FBP
+// 0x01800 in order and sets `skip` to 32, 16 or 24; a later match overwrites an
+// earlier one. Always returns true.
+bool GSC_DevilMayCry3(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool toTarget1800 = fi.TME && fi.FBP == 0x01800;
+
+	const bool depthToCt16 = Dx_only && toTarget1800 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x01000 && fi.TPSM == PSM_PSMZ16;
+	const bool t8hToZ32 = toTarget1800 && fi.FPSM == PSM_PSMZ32 && fi.TBP0 == 0x0800 && fi.TPSM == PSM_PSMT8H;
+	const bool t8hToCt32 = toTarget1800 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x0 && fi.TPSM == PSM_PSMT8H;
+
+	// Independent tests (not an else-if chain), same order as the original.
+	if(depthToCt16)
+	{
+		skip = 32;
+	}
+	if(t8hToZ32)
+	{
+		skip = 16;
+	}
+	if(t8hToCt32)
+	{
+		skip = 24;
+	}
+
+	return true;
+}
+
+// Star Wars: The Force Unleashed: starts a skip of 1000 on the PSM_PSMZ24
+// shadow draw; while a skip is pending, rewrites it to 2 on a self-sampling
+// PSM_PSMCT16 draw at 0x034a0 or 0x36e0. Always returns true.
+bool GSC_StarWarsForceUnleashed(const GSFrameInfo& fi, int& skip)
+{
+	// Both rules need texturing and identical frame/texture formats.
+	if(!fi.TME || fi.FPSM != fi.TPSM)
+	{
+		return true;
+	}
+
+	if(skip == 0)
+	{
+		if((fi.FBP == 0x038a0 || fi.FBP == 0x03ae0) && fi.TBP0 == 0x02300 && fi.TPSM == PSM_PSMZ24)
+		{
+			skip = 1000;	//9, shadows
+		}
+	}
+	else if(fi.FBP == fi.TBP0 && (fi.TBP0 == 0x034a0 || fi.TBP0 == 0x36e0) && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Star Wars Battlefront: when no skip is pending, sets `skip` to 1 for textured
+// PSM_PSMT8 draws into a PSM_PSMCT32 frame buffer with FBP in (0, 0x01000) and
+// TBP0 in (0x02000, 0x03000). Always returns true.
+bool GSC_StarWarsBattlefront(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// Both ranges are exclusive at each end.
+	const bool targetInRange = fi.FBP > 0x0 && fi.FBP < 0x01000;
+	const bool textureInRange = fi.TBP0 > 0x02000 && fi.TBP0 < 0x03000;
+
+	if(fi.TME && targetInRange && fi.FPSM == PSM_PSMCT32 && textureInRange && fi.TPSM == PSM_PSMT8)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Star Wars Battlefront 2: when no skip is pending, sets `skip` to 1 for
+// textured PSM_PSMT8 draws with FBP in (0x01000, 0x02000), TBP0 in (0, 0x01000)
+// and a PSM_PSMCT32 or PSM_PSMZ32 frame format. Always returns true.
+bool GSC_StarWarsBattlefront2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// Both ranges are exclusive at each end.
+	const bool targetInRange = fi.FBP > 0x01000 && fi.FBP < 0x02000;
+	const bool textureInRange = fi.TBP0 > 0x0 && fi.TBP0 < 0x01000;
+	const bool t8Draw = fi.TME && targetInRange && textureInRange && fi.TPSM == PSM_PSMT8;
+
+	// The original has one test per frame format; both assign the same value.
+	if(t8Draw && (fi.FPSM == PSM_PSMCT32 || fi.FPSM == PSM_PSMZ32))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Black Hawk Down: when no skip is pending, sets `skip` to 2 for the fog draw
+// (only when Dx_only is set) and to 5 for the night-filter draw; the second
+// test runs after the first and overwrites it on a match. Always returns true.
+bool GSC_BlackHawkDown(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool wallOfFog = Dx_only && fi.TME && fi.FBP == 0x00800 && fi.FPSM == PSM_PSMCT16 && fi.TBP0 == 0x01800 && fi.TPSM == PSM_PSMZ16;
+	const bool nightFilter = fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8;
+
+	// Independent tests (not an else-if chain), same order as the original.
+	if(wallOfFog)
+	{
+		skip = 2;	//wall of fog
+	}
+	if(nightFilter)
+	{
+		skip = 5;	//night filter
+	}
+
+	return true;
+}
+
+// Burnout: acts only when no skip is pending. Sets `skip` to 4, 3 or 2 for
+// several draw patterns. Returns false for the PSM_PSMZ16 -> PSM_PSMCT16 fog
+// pattern when Dx_only is not set; returns true in every other case.
+bool GSC_Burnout(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool ct32SelfCopy = fi.TME && (fi.FBP == 0x01dc0 || fi.FBP == 0x02200) && fi.FPSM == fi.TPSM && (fi.TBP0 == 0x01dc0 || fi.TBP0 == 0x02200) && fi.TPSM == PSM_PSMCT32;
+	const bool fog = fi.TME && fi.FPSM == PSM_PSMCT16 && fi.TPSM == PSM_PSMZ16;
+	const bool impactScreen = fi.TME && (fi.FBP == 0x02d60 || fi.FBP == 0x033a0) && fi.FPSM == fi.TPSM && (fi.TBP0 == 0x02d60 || fi.TBP0 == 0x033a0) && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0x0;
+
+	if(ct32SelfCopy)
+	{
+		skip = 4;
+	}
+	else if(fog)	//fog
+	{
+		if(!Dx_only)
+		{
+			return false;
+		}
+
+		// The two layouts use different FBP values, so at most one matches.
+		if(fi.FBP == 0x00a00 && fi.TBP0 == 0x01e00)
+		{
+			skip = 4; //pal
+		}
+		else if(fi.FBP == 0x008c0 && fi.TBP0 == 0x01a40)
+		{
+			skip = 3; //ntsc
+		}
+	}
+	else if(impactScreen)
+	{
+		skip = 2; //impact screen
+	}
+
+	return true;
+}
+
+// Midnight Club 3: when no skip is pending, sets `skip` to 1 for textured
+// PSM_PSMT8H draws into a PSM_PSMCT32 frame buffer whose FBP passes two range
+// tests. Always returns true.
+bool GSC_MidnightClub3(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool targetInRange = fi.FBP > 0x01d00 && fi.FBP <= 0x02a00;
+	// NOTE(review): this second range is tested on FBP again and is already implied
+	// by the first one; it may have been meant for TBP0 - confirm before changing.
+	const bool secondRange = fi.FBP >= 0x01600 && fi.FBP < 0x03260;
+
+	if(fi.TME && targetInRange && fi.FPSM == PSM_PSMCT32 && secondRange && fi.TPSM == PSM_PSMT8H)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Spyro: A New Beginning: when no skip is pending, sets `skip` to 2 for a
+// textured PSM_PSMCT16 draw whose frame buffer and texture are both at 0x034a0.
+// Always returns true.
+bool GSC_SpyroNewBeginning(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool selfSample = fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == fi.TPSM && fi.TBP0 == 0x034a0 && fi.TPSM == PSM_PSMCT16;
+
+	if(skip == 0 && selfSample)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Spyro: The Eternal Night: when no skip is pending, sets `skip` to 2 for a
+// textured PSM_PSMCT16 draw whose frame buffer and texture share one of three
+// addresses. Always returns true.
+bool GSC_SpyroEternalNight(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool knownAddress = fi.TBP0 == 0x034a0 || fi.TBP0 == 0x035a0 || fi.TBP0 == 0x036e0;
+
+	if(fi.TME && fi.FBP == fi.TBP0 && fi.FPSM == fi.TPSM && knownAddress && fi.TPSM == PSM_PSMCT16)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Tales of Legendia: when no skip is pending, tests five draw patterns in order
+// and sets `skip` to 3, 2 or 1; a later match overwrites an earlier one.
+// Always returns true.
+bool GSC_TalesOfLegendia(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool t8ToHighTarget = fi.TME && (fi.FBP == 0x3f80 || fi.FBP == 0x03fa0) && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMT8;
+	const bool z32To3800 = fi.TME && fi.FBP == 0x3800 && fi.FPSM == PSM_PSMCT32 && fi.TPSM == PSM_PSMZ32;
+	const bool from3d80 = fi.TME && fi.FBP != 0 && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x3d80;
+	const bool alphaMaskedTo1c00 = fi.TME && fi.FBP == 0x1c00 && (fi.TBP0 == 0x2e80 || fi.TBP0 == 0x2d80) && fi.TPSM == 0 && fi.FBMSK == 0xff000000;
+	const bool untexturedTo2a00 = !fi.TME && fi.FBP == 0x2a00 && fi.TBP0 == 0x1C00 && fi.TPSM == 0 && fi.FBMSK == 0x00FFFFFF;
+
+	// Independent tests in the original order; the last three all assign 1.
+	if(t8ToHighTarget)
+	{
+		skip = 3; //3, 9
+	}
+	if(z32To3800)
+	{
+		skip = 2;
+	}
+	if(from3d80 || alphaMaskedTo1c00 || untexturedTo2a00)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Nano Breaker: when no skip is pending, sets `skip` to 2 for textured
+// PSM_PSMCT16S draws from TBP0 0x03800 or 0x03900 into a PSM_PSMCT32 frame
+// buffer at FBP 0. Always returns true.
+bool GSC_NanoBreaker(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool knownTexture = fi.TBP0 == 0x03800 || fi.TBP0 == 0x03900;
+
+	if(fi.TME && fi.FBP == 0x0 && fi.FPSM == PSM_PSMCT32 && knownTexture && fi.TPSM == PSM_PSMCT16S)
+	{
+		skip = 2;
+	}
+
+	return true;
+}
+
+// Kunoichi: with no skip pending, sets `skip` to 3 for one untextured pattern
+// or to 1 for several textured patterns; a pending skip is cancelled (set to 0)
+// by a textured PSM_PSMCT32 draw to FBP 0x0e00 with FBMSK 0xFF000000.
+// Always returns true.
+bool GSC_Kunoichi(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		if(fi.TME && (fi.FBP == 0x0e00) && fi.FPSM == PSM_PSMCT32 && fi.FBMSK == 0xFF000000)
+		{
+			skip = 0;
+		}
+
+		return true;
+	}
+
+	if(!fi.TME)
+	{
+		// Only the untextured rule can match here; every other rule needs fi.TME.
+		if((fi.FBP == 0x0 || fi.FBP == 0x00700 || fi.FBP == 0x00800) && fi.FPSM == PSM_PSMCT32 && fi.FBMSK == 0x00FFFFFF)
+		{
+			skip = 3;
+		}
+
+		return true;
+	}
+
+	const bool unmaskedFrom0e00 = (fi.FBP == 0x0700 || fi.FBP == 0) && fi.TBP0 == 0x0e00 && fi.TPSM == 0 && fi.FBMSK == 0;
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing (HasSharedBits)
+	if(unmaskedFrom0e00 || depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// Yakuza: when no skip is pending, sets `skip` to 3 for untextured draws with
+// a PSM_PSMZ24 texture format and PSM_PSMCT32 frame format at one of three
+// known FBP/TBP0 pairs. Always returns true.
+bool GSC_Yakuza(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || fi.TME)
+	{
+		return true;
+	}
+
+	const bool ntsc = fi.FBP == 0x1c20 && fi.TBP0 == 0xe00;		//ntsc (EU and US DVDs)
+	const bool pal1 = fi.FBP == 0x1e20 && fi.TBP0 == 0x1000;	//pal1
+	const bool pal2 = fi.FBP == 0x1620 && fi.TBP0 == 0x800;		//pal2
+
+	// Further conditions left disabled in the original:
+	//   fi.FBMSK == 0xffffff, fi.TZTST, !GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM)
+	if((ntsc || pal1 || pal2) && fi.TPSM == PSM_PSMZ24 && fi.FPSM == PSM_PSMCT32)
+	{
+		skip=3;
+	}
+
+	return true;
+}
+
+// Yakuza 2: when no skip is pending, sets `skip` to 17 for untextured draws
+// with a PSM_PSMZ24 texture format and PSM_PSMCT32 frame format at one of
+// three known FBP/TBP0 pairs. Always returns true.
+bool GSC_Yakuza2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || fi.TME)
+	{
+		return true;
+	}
+
+	const bool ntsc = fi.FBP == 0x1c20 && fi.TBP0 == 0xe00;		//ntsc (EU DVD)
+	const bool pal1 = fi.FBP == 0x1e20 && fi.TBP0 == 0x1000;	//pal1
+	const bool pal2 = fi.FBP == 0x1620 && fi.TBP0 == 0x800;		//pal2
+
+	// Further conditions left disabled in the original:
+	//   fi.FBMSK == 0xffffff, fi.TZTST, !GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM)
+	if((ntsc || pal1 || pal2) && fi.TPSM == PSM_PSMZ24 && fi.FPSM == PSM_PSMCT32)
+	{
+		skip=17;
+	}
+
+	return true;
+}
+
+// Sky Gunner: when no skip is pending, sets `skip` to 1 for untextured
+// PSM_PSMCT32 draws with TBP0 0 or 0x01800 whose FBP is NOT one of four
+// excluded values. Always returns true.
+bool GSC_SkyGunner(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool excludedTarget = fi.FBP == 0x0 || fi.FBP == 0x00800 || fi.FBP == 0x008c0 || fi.FBP == 0x03e00;
+	const bool knownTexture = fi.TBP0 == 0x0 || fi.TBP0 == 0x01800;
+
+	if(!fi.TME && !excludedTarget && fi.FPSM == PSM_PSMCT32 && knownTexture && fi.TPSM == PSM_PSMCT32)
+	{
+		skip = 1; //Huge Vram usage
+	}
+
+	return true;
+}
+
+// James Bond: Everything or Nothing: when no skip is pending, sets `skip` to 1
+// for textured PSM_PSMT8 draws into a PSM_PSMCT32 frame buffer with FBP below
+// 0x02000 (excluding 0 and 0x00e00) and TBP0 in (0x01c00, 0x03000).
+// Always returns true.
+bool GSC_JamesBondEverythingOrNothing(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	const bool targetAccepted = fi.FBP < 0x02000 && fi.FBP != 0x0 && fi.FBP != 0x00e00;
+	const bool textureInRange = fi.TBP0 > 0x01c00 && fi.TBP0 < 0x03000;
+
+	if(fi.TME && targetAccepted && fi.FPSM == PSM_PSMCT32 && textureInRange && fi.TPSM == PSM_PSMT8)
+	{
+		skip = 1; //Huge Vram usage
+	}
+
+	return true;
+}
+
+// Zettai Zetsumei Toshi 2: with no skip pending, sets `skip` to 1000 or 2 for
+// three draw patterns; while a skip is pending, five patterns cancel it (set
+// it to 0) and a sixth, tested last, rewrites it to 2. Always returns true.
+bool GSC_ZettaiZetsumeiToshi2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip == 0)
+	{
+		const bool ct16sTexture = fi.TME && fi.TPSM == PSM_PSMCT16S && (fi.FBMSK >= 0x6FFFFFFF || fi.FBMSK == 0);
+		const bool ct32AlphaMasked = fi.TME && fi.TPSM == PSM_PSMCT32 && fi.FBMSK == 0xFF000000;
+		const bool ct16Masked = (fi.FBP | fi.TBP0) != 0 && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT16 && fi.FBMSK == 0x3FFF;
+
+		if(ct16sTexture)
+		{
+			skip = 1000;
+		}
+		else if(ct32AlphaMasked)
+		{
+			skip = 2;
+		}
+		else if(ct16Masked)
+		{
+			// Note start of the effect (texture shuffle) is fixed in openGL but maybe not the extra draw
+			// call....
+			skip = 1000;
+		}
+
+		return true;
+	}
+
+	// A skip is pending: any of these five patterns cancels it.
+	const bool stopCt32 = !fi.TME && fi.TPSM == PSM_PSMCT32 && fi.FBP == 0x1180 && fi.TBP0 == 0x1180 && fi.FBMSK == 0;
+	const bool stopT4 = fi.TME && fi.TPSM == PSM_PSMT4 && fi.FBP != 0 && fi.TBP0 != 0x3753;
+	const bool stopT8hTextured = fi.TME && fi.TPSM == PSM_PSMT8H && fi.FBP == 0x22e0 && fi.TBP0 == 0x36e0;
+	const bool stopT8hUntextured = !fi.TME && fi.TPSM == PSM_PSMT8H && fi.FBP == 0x22e0;
+	const bool stopT8 = fi.TME && fi.TPSM == PSM_PSMT8 && (fi.FBP == 0x1180 || fi.FBP == 0) && fi.TBP0 != 0x3764 && fi.TBP0 != 0x370f;
+
+	if(stopCt32 || stopT4 || stopT8hTextured || stopT8hUntextured || stopT8)
+	{
+		skip = 0; //
+	}
+
+	// Tested last, so it wins over the resets above.
+	if(fi.TME && fi.TPSM == PSM_PSMCT16S && fi.FBP == 0x1180)
+	{
+		skip = 2; //
+	}
+
+	return true;
+}
+
+// Shin Onimusha: when no skip is pending, walks a priority list of draw
+// patterns. Two of them deliberately leave `skip` at 0 (and stop later rules
+// from matching); the others set it to 28 or 1. Always returns true.
+bool GSC_ShinOnimusha(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	// Rule 1: matched draws are kept (the original assigns 0 to an already-zero skip).
+	if(fi.TME && fi.FBP == 0x001000 && (fi.TBP0 == 0 || fi.TBP0 == 0x0800) && fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0x00FFFFFF)
+	{
+		return true;
+	}
+
+	// Rule 2. A former alternative FBP == 0x00000 is disabled in the original.
+	if(fi.TPSM == PSM_PSMCT24 && fi.TME && fi.FBP == 0x01000)
+	{
+		skip = 28; //28 30 56 64
+		return true;
+	}
+
+	// Rule 3: also kept (skip stays 0). Values noted in the original: 24 33 40 9
+	if(fi.FBP != 0 && fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0xFFFFFF)
+	{
+		return true;
+	}
+
+	// Rules 4 and 5 both assign 1.
+	const bool t8hAlphaMasked = fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0xFF000000;
+	const bool colourFromKnownTex = fi.TME && (fi.TBP0 == 0x1400 || fi.TBP0 == 0x1000 || fi.TBP0 == 0x1200) && (fi.TPSM == PSM_PSMCT32 || fi.TPSM == PSM_PSMCT24);
+
+	if(t8hAlphaMasked || colourFromKnownTex)
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// CRC hack for XE3 (DX only). Acts only while no countdown is running (skip == 0).
+// Rules, first match wins:
+//   - PSMT8H texture with FBMSK >= 0xEFFFFFFF: skip 73 draws;
+//   - textured draw to FBP 0x03800 from a non-zero TBP0 with TPSM 0 and FBMSK 0: skip 1;
+//   - any other textured draw that samples a depth format, or for which
+//     GSUtil::HasSharedBits returns true: skip 1.
+// Always returns true.
+bool GSC_XE3(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	if(skip != 0)
+	{
+		return true;
+	}
+
+	if(fi.TPSM == PSM_PSMT8H && fi.FBMSK >= 0xEFFFFFFF)
+	{
+		skip = 73;
+		return true;
+	}
+
+	if(fi.TME && fi.FBP == 0x03800 && fi.TBP0 && fi.TPSM == 0 && fi.FBMSK == 0)
+	{
+		skip = 1;
+		return true;
+	}
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+	// General, often problematic post processing
+	if(fi.TME && (depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM)))
+	{
+		skip = 1;
+	}
+
+	return true;
+}
+
+// CRC hack for the GetaWay titles: while idle (skip == 0), drops one draw that uses
+// a PSMT8H texture with FBMSK 0 on frame buffer 0 or 0x1180.
+// Always returns true.
+bool GSC_GetaWay(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownTarget = fi.FBP == 0 || fi.FBP == 0x1180;
+
+	if(skip == 0 && knownTarget && fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Sakura Wars: So Long, My Love; acts only while idle (skip == 0).
+// Untextured draws with FBMSK 0x00FFFFFF skip 3 when TBP0 is non-zero and differs from
+// FBP, or equals FBP at 0/0x1180/0x1200; otherwise one textured PSMT8 draw skips 1.
+// Always returns true.
+bool GSC_SakuraWarsSoLongMyLove(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+		return true;
+
+	const bool untexturedMasked = fi.TME == 0 && fi.FBMSK == 0x00FFFFFF;
+	const bool foreignSource = fi.FBP != fi.TBP0 && fi.TBP0;
+	const bool selfSource = fi.FBP == fi.TBP0 && (fi.TBP0 == 0x1200 || fi.TBP0 == 0x1180 || fi.TBP0 == 0);
+
+	if(untexturedMasked && (foreignSource || selfSource))
+		skip = 3;
+	else if(fi.TME && (fi.FBP == 0 || fi.FBP == 0x1180) && fi.FPSM == PSM_PSMCT32 && fi.TBP0 == 0x3F3F && fi.TPSM == PSM_PSMT8)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Fighting Beauty Wulong: while idle (skip == 0), drops one textured
+// draw that reads a PSMCT32/PSMCT24 texture at TBP0 0x0700 or 0x0a80.
+// Always returns true.
+bool GSC_FightingBeautyWulong(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownSource = fi.TBP0 == 0x0700 || fi.TBP0 == 0x0a80;
+	const bool ct32OrCt24 = fi.TPSM == PSM_PSMCT32 || fi.TPSM == PSM_PSMCT24;
+
+	if(skip == 0 && fi.TME && knownSource && ct32OrCt24)
+		skip = 1;
+	return true;
+}
+
+// CRC hack for Tourist Trophy: while idle (skip == 0), a textured PSMT8 draw into a
+// PSMCT32 frame buffer starts a 770-draw skip when it matches one of two layouts.
+// Always returns true.
+bool GSC_TouristTrophy(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME || fi.FPSM != PSM_PSMCT32 || fi.TPSM != PSM_PSMT8)
+		return true;
+
+	const bool layoutDefault = fi.FBP >= 0x02f00 && (fi.TBP0 == 0x00000 || fi.TBP0 == 0x01180);
+	const bool layout480p = fi.FBP >= 0x02de0 && (fi.TBP0 == 0 || fi.TBP0 == 0x1a40 || fi.TBP0 == 0x2300); //480P
+
+	if(layoutDefault || layout480p)
+		skip = 770;
+
+	return true;
+}
+
+// CRC hack for GTA San Andreas: while idle (skip == 0), drops one textured draw whose
+// frame buffer and texture are both PSMCT32 and sit at the listed base pointers.
+// Always returns true.
+bool GSC_GTASanAndreas(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownTarget = fi.FBP == 0x0a00 || fi.FBP == 0x08c0;
+	const bool knownSource = fi.TBP0 == 0x1b80 || fi.TBP0 == 0x1a40;
+	const bool bothCT32 = fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+	if(skip == 0 && fi.TME && knownTarget && knownSource && bothCT32)
+		skip = 1;
+	return true;
+}
+
+// CRC hack for Front Mission 5: while idle (skip == 0), drops one draw that either
+// uses a PSMT8H texture with FBMSK 0, or is a textured PSMCT32 draw into frame
+// buffer 0x1000 reading TBP0 0x2e00 or 0x3200. Always returns true.
+bool GSC_FrontMission5(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+		return true;
+
+	const bool t8hUnmasked = fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0;
+	const bool ct32Draw = fi.TME && fi.FBP == 0x1000 && (fi.TBP0 == 0x2e00 || fi.TBP0 == 0x3200) && fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32; //fi.TBP0 ==0x1f00
+
+	if(t8hUnmasked || ct32Draw)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for God Hand: while idle (skip == 0), drops one textured draw into frame
+// buffer 0 that reads TBP0 0x2800, with frame buffer and texture both PSMCT32.
+// Always returns true.
+bool GSC_GodHand(const GSFrameInfo& fi, int& skip)
+{
+	const bool bothCT32 = fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && fi.TME && fi.FBP == 0x0 && fi.TBP0 == 0x2800 && bothCT32)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Knights of the Temple 2; acts only while idle (skip == 0).
+// Drops one draw that uses a PSMT8H texture with FBMSK 0, or one textured draw with
+// TPSM 0 into frame buffer 0x3400 or 0x3a00. Always returns true.
+bool GSC_KnightsOfTheTemple2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip == 0)
+	{
+		// The second rule used to carry a bare "&& PSM_PSMCT24" operand that compared nothing.
+		// It was dropped; assumes the constant is non-zero (term always true) -- TODO confirm.
+		const bool t8hUnmasked = fi.TPSM == PSM_PSMT8H && fi.FBMSK == 0;
+		const bool tpsm0Draw = fi.TPSM == 0x00000 && fi.TME && (fi.FBP == 0x3400 || fi.FBP == 0x3a00);
+		if(t8hUnmasked || tpsm0Draw)
+			skip = 1;
+	}
+	return true;
+}
+
+// CRC hack for Ultraman Fighting Evolution: while idle (skip == 0), a textured draw
+// from TBP0 0x1c00 into FBP 0x2a00, both PSMZ24, starts a 5-draw skip.
+// Always returns true.
+bool GSC_UltramanFightingEvolution(const GSFrameInfo& fi, int& skip)
+{
+	const bool bothZ24 = fi.FPSM == PSM_PSMZ24 && fi.TPSM == PSM_PSMZ24;
+
+	if(skip == 0 && fi.TME && fi.FBP == 0x2a00 && fi.TBP0 == 0x1c00 && bothZ24)
+		skip = 5; // blur
+
+	return true;
+}
+
+// CRC hack for Death by Degrees (Tekken: Nina Williams). Always returns true.
+// While idle (skip == 0) one of two rules may start a skip of 1 or 4 draws.
+// Regardless of the current countdown, any textured draw with FBMSK 0x00FFFFFF and at
+// least one non-zero value among FBP/TBP0/FPSM/TPSM then forces skip to 1.
+bool GSC_DeathByDegreesTekkenNinaWilliams(const GSFrameInfo& fi, int& skip)
+{
+	if(skip == 0)
+	{
+		if(fi.TME && fi.FBP == 0 && fi.TBP0 == 0x34a0 && fi.TPSM == PSM_PSMCT32)
+			skip = 1;
+		else if(fi.FBP == 0x3500 && fi.TPSM == PSM_PSMT8 && fi.FBMSK == 0xFFFF00FF)
+			skip = 4;
+	}
+
+	// This rule sits outside the skip == 0 guard, so it also rewrites a running countdown.
+	const bool anyFieldSet = (fi.FBP | fi.TBP0 | fi.FPSM | fi.TPSM) != 0;
+
+	if(fi.TME && anyFieldSet && fi.FBMSK == 0x00FFFFFF)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Alpine Racer 3: while idle (skip == 0), an untextured draw into frame
+// buffer 0 with FBMSK 0x0001 or 0x00FFFFFF starts a 2-draw skip.
+// Always returns true.
+bool GSC_AlpineRacer3(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownMask = fi.FBMSK == 0x0001 || fi.FBMSK == 0x00FFFFFF;
+
+	if(skip == 0 && !fi.TME && fi.FBP == 0 && knownMask)
+		skip = 2;
+
+	return true;
+}
+
+// CRC hack for Hummer Badlands: while idle (skip == 0), drops one textured draw into
+// frame buffer 0x0a00 that reads TBP0 0x03200 or 0x3700, both sides PSMCT32.
+// Always returns true.
+bool GSC_HummerBadlands(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownSource = fi.TBP0 == 0x03200 || fi.TBP0 == 0x3700;
+	const bool bothCT32 = fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && fi.TME && fi.FBP == 0x0a00 && knownSource && bothCT32)
+		skip = 1;
+	return true;
+}
+
+// CRC hack for Sengoku Basara: while idle (skip == 0), drops one textured draw that
+// reads TBP0 0x1800 with FBMSK 0xFF000000.
+// Always returns true.
+bool GSC_SengokuBasara(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0)
+		return true;
+
+	if(fi.TME && fi.TBP0 == 0x1800 && fi.FBMSK == 0xFF000000)
+		skip = 1;
+	return true;
+}
+
+// CRC hack for Grandia 3 (DX only): while idle (skip == 0), drops one textured draw
+// between the listed frame/texture base pointers, both sides PSMCT32.
+// Always returns true.
+bool GSC_Grandia3(const GSFrameInfo& fi, int& skip) // DX ONLY
+{
+	const bool knownTarget = fi.FBP == 0x0 || fi.FBP == 0x0e00;
+	const bool knownSource = fi.TBP0 == 0x2a00 || fi.TBP0 == 0x0e00 || fi.TBP0 == 0;
+	const bool bothCT32 = fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && fi.TME && knownTarget && knownSource && bothCT32)
+		skip = 1;
+	return true;
+}
+
+// CRC hack for Final Fight Streetwise: while idle (skip == 0), an untextured draw into
+// a PSMCT32 buffer at 0 or 0x08c0 with TPSM PSMT8/PSMT4 and FBMSK 0x00FFFFFF starts a
+// 3-draw skip. Always returns true.
+bool GSC_FinalFightStreetwise(const GSFrameInfo& fi, int& skip)
+{
+	const bool knownTarget = fi.FBP == 0 || fi.FBP == 0x08c0;
+	const bool t8OrT4 = fi.TPSM == PSM_PSMT8 || fi.TPSM == PSM_PSMT4;
+
+	if(skip == 0 && !fi.TME && knownTarget && fi.FPSM == PSM_PSMCT32 && t8OrT4 && fi.FBMSK == 0x00FFFFFF)
+		skip = 3;
+	return true;
+}
+
+// CRC hack for Tales of Symphonia: while idle (skip == 0), drops one textured draw
+// that matches either of the two TBP0/FBMSK rules below.
+// Always returns true.
+bool GSC_TalesofSymphonia(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME)
+		return true;
+
+	const bool ct32Rule = fi.FPSM == PSM_PSMCT32 && (fi.TBP0 == 0x2bc0 || fi.TBP0 <= 0x0200) && (fi.FBMSK == 0xFF000000 || fi.FBMSK == 0x00FFFFFF); //fi.FBMSK==0
+	const bool highMaskRule = (fi.TBP0 == 0x1180 || fi.TBP0 == 0x1a40 || fi.TBP0 == 0x2300) && fi.FBMSK >= 0xFF000000;
+
+	if(ct32Rule || highMaskRule)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Soul Calibur 2: while idle (skip == 0), a textured draw starts a
+// 2-draw skip when it samples a depth format, or when GSUtil::HasSharedBits returns
+// true for the frame buffer / texture pair.
+// Always returns true.
+bool GSC_SoulCalibur2(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME)
+		return true;
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+		skip = 2;
+
+	return true;
+}
+
+// CRC hack for Soul Calibur 3: while idle (skip == 0), a textured draw starts a
+// 2-draw skip when it samples a depth format, or when GSUtil::HasSharedBits returns
+// true for the frame buffer / texture pair.
+// Always returns true.
+bool GSC_SoulCalibur3(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME)
+		return true;
+
+	// depth textures (bully, mgs3s1 intro, Front Mission 5)
+	const bool depthTexture = fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S;
+
+	// General, often problematic post processing
+	if(depthTexture || GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM))
+		skip = 2;
+
+	return true;
+}
+
+// CRC hack for Simple 2000 Vol. 114: while idle (skip == 0) and with FBMSK 0, drops
+// one draw matching the untextured or the textured rule below. Always returns true.
+bool GSC_Simple2000Vol114(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || fi.FBMSK != 0x0000)
+		return true;
+
+	const bool untexturedRule = fi.TME == 0 && fi.FBP == 0x1500 && (fi.TBP0 == 0x2c97 || fi.TBP0 == 0x2ace || fi.TBP0 == 0x03d0 || fi.TBP0 == 0x2448);
+	const bool texturedRule = fi.TME && fi.FBP == 0x0e00 && fi.TBP0 == 0x1000;
+
+	if(untexturedRule || texturedRule)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Urban Reign: while idle (skip == 0), drops one textured draw from TBP0
+// 0x3980 into frame buffer 0, both PSMCT32, with FBMSK 0. Always returns true.
+bool GSC_UrbanReign(const GSFrameInfo& fi, int& skip)
+{
+	const bool bothCT32 = fi.FPSM == fi.TPSM && fi.TPSM == PSM_PSMCT32;
+
+	if(skip == 0 && fi.TME && fi.FBP == 0x0000 && fi.TBP0 == 0x3980 && bothCT32 && fi.FBMSK == 0x0)
+		skip = 1;
+
+	return true;
+}
+
+// CRC hack for Steambot Chronicles; acts only while idle (skip == 0) and only on
+// textured draws that sample a PSMCT16S texture. Always returns true.
+// Author: miseru99 on forums.pcsx2.net
+bool GSC_SteambotChronicles(const GSFrameInfo& fi, int& skip)
+{
+	if(skip != 0 || !fi.TME || fi.TPSM != PSM_PSMCT16S)
+		return true;
+
+	if(fi.FBP == 0x1180)
+	{
+		skip = 1; //1 deletes some of the glitched effects
+	}
+	else if(fi.FBP == 0)
+	{
+		skip = 100; //deletes most others(too high deletes the buggy sea completely;c, too low causes glitches to be visible)
+	}
+	else if(Aggresive) //Agressive CRC; fi.FBP is already known to be non-zero on this branch
+	{
+		skip = 19; //"speedhack", makes the game very light, vaporized water can disappear when not looked at directly, possibly some interface still, other value to try: 6 breaks menu background, possibly nothing(?) during gameplay, but it's slower, hence not much of a speedhack anymore
+	}
+
+	return true;
+}
+
+#undef Aggresive // spelled like the identifier the hacks above actually use; the former "Agressive" matched nothing
+
+#ifdef ENABLE_DYNAMIC_CRC_HACK
+
+#include <sys/stat.h>
+/***************************************************************************
+	AutoReloadLibrary : Automatically reloads a dll if the file was modified.
+		Uses a temporary copy of the watched dll such that the original
+		can be modified while the copy is loaded and used.
+
+	NOTE: The API is not platform specific, but current implementation is Win32.
+***************************************************************************/
+class AutoReloadLibrary
+{
+private:
+	string	m_dllPath, m_loadedDllPath;	// watched dll, and the temporary copy that gets loaded
+	DWORD	m_minMsBetweenProbes;
+	time_t	m_lastFileModification;
+	DWORD	m_lastProbe;			// tick of the last probe; 0 means "never probed"
+	HMODULE	m_library;			// NULL while no copy is loaded
+
+	string	GetTempName()
+	{
+		string result = m_loadedDllPath + ".tmp"; //default name
+		TCHAR tmpPath[MAX_PATH], tmpName[MAX_PATH];
+		DWORD ret = GetTempPath(MAX_PATH, tmpPath);
+		if(ret && ret <= MAX_PATH && GetTempFileName(tmpPath, TEXT("GSdx"), 0, tmpName))
+			result = tmpName;
+
+		return result;
+	}
+
+	void	UnloadLib()
+	{
+		if( !m_library )
+			return;
+
+		FreeLibrary( m_library );
+		m_library = NULL;
+		// If can't delete (might happen when GSdx closes), schedule delete on reboot
+		if(!DeleteFile( m_loadedDllPath.c_str() ) )
+			MoveFileEx( m_loadedDllPath.c_str(), NULL, MOVEFILE_DELAY_UNTIL_REBOOT );
+	}
+
+public:
+	AutoReloadLibrary( const string& dllPath, const int minMsBetweenProbes=100 )
+		: m_dllPath( dllPath )	// initializers are listed in member declaration order
+		, m_minMsBetweenProbes( minMsBetweenProbes )
+		, m_lastFileModification( 0 )
+		, m_lastProbe( 0 )
+		, m_library( 0 )
+	{}
+
+	~AutoReloadLibrary(){ UnloadLib(); }
+
+	// If timeout has elapsed, probe the dll for change, and reload if it was changed.
+	// If it returns true, then the dll was freed/reloaded, and any symbol address previously obtained is now invalid and needs to be re-obtained.
+	// Overhead is very low when probe timeout has not elapsed, and especially if current timestamp is supplied as argument.
+	// Note: there's no relation between the file modification date and currentMs value, so it needn't necessarily be an actual timestamp.
+	// Note: isChanged is guaranteed to return true at least once
+	//       (even if the file doesn't exist, at which case the following GetSymbolAddress will return NULL)
+	bool isChanged( const DWORD currentMs=0 )
+	{
+		DWORD current = currentMs? currentMs : GetTickCount();
+		if( current >= m_lastProbe && ( current - m_lastProbe ) < m_minMsBetweenProbes )
+			return false;
+
+		bool firstTime = !m_lastProbe;
+		m_lastProbe = current;
+
+		struct stat s;
+		if( stat( m_dllPath.c_str(), &s ) )
+		{
+			// File doesn't exist or other error, unload dll
+			bool wasLoaded = m_library?true:false;
+			UnloadLib();
+			return firstTime || wasLoaded;	// Changed if previously loaded or the first time accessing this method (and file doesn't exist)
+		}
+
+		if( m_lastFileModification == s.st_mtime )
+			return false;
+		m_lastFileModification = s.st_mtime;
+
+		UnloadLib();	// File modified, reload
+		m_loadedDllPath = GetTempName();
+		// A failed copy or load must not leave the temporary file behind: UnloadLib only
+		// cleans up after a successful load, and GetTempFileName may already have created the file.
+		if( CopyFile( m_dllPath.c_str(), m_loadedDllPath.c_str(), false ) )
+			m_library = LoadLibrary( m_loadedDllPath.c_str() );
+		if( !m_library )
+			DeleteFile( m_loadedDllPath.c_str() );
+		return true;
+	}
+
+	// Return value is NULL if the dll isn't loaded (failure or doesn't exist) or if the symbol isn't found.
+	void* GetSymbolAddress( const char* name ){ return m_library? GetProcAddress( m_library, name ) : NULL; }
+};
+
+
+// Use DynamicCrcHack function from a dll which can be modified while GSdx/PCSX2 is running.
+// return value is true if the call succeeded or false otherwise (If the hack could not be invoked: no dll/function/etc).
+// result contains the result of the hack call.
+
+typedef uint32 (__cdecl* DynaHackType)(uint32, uint32, uint32, uint32, uint32, uint32, uint32, int32*, uint32, int32);
+typedef uint32 (__cdecl* DynaHackType2)(uint32, uint32, uint32, uint32, uint32, uint32, uint32, int32*, uint32, int32, uint32); // Also accept CRC
+
+bool IsInvokedDynamicCrcHack( GSFrameInfo &fi, int& skip, int region, bool &result, uint32 crc )
+{
+	static AutoReloadLibrary dll( DYNA_DLL_PATH );
+	static DynaHackType dllFunc = NULL;
+	static DynaHackType2 dllFunc2 = NULL;
+	// Returns false when no dynamic hack could be invoked; otherwise stores the hack's verdict in result, writes skip back and returns true.
+	if( dll.isChanged() )
+	{
+		dllFunc  = (DynaHackType)dll.GetSymbolAddress( "DynamicCrcHack" );
+		dllFunc2 = (DynaHackType2)dll.GetSymbolAddress( "DynamicCrcHack2" );
+		// "Loaded OK" is reported when either entry point exists, because either one is enough for the call below.
+		printf( "GSdx: Dynamic CRC-hacks%s: %s\n", 
+			((dllFunc && !dllFunc2)?" [Old dynaDLL - No CRC support]":""),
+			(dllFunc || dllFunc2)? "Loaded OK        (-> overriding internal hacks)" :
+					 "Not available    (-> using internal hacks)");
+	}
+	// No usable entry point: report failure so the caller can use its own hacks.
+	if( !dllFunc2 && !dllFunc )
+		return false;
+	
+	int32	skip32 = skip;
+	bool	hasSharedBits = GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM);
+	if(dllFunc2)
+		result	= dllFunc2( fi.FBP, fi.FPSM, fi.FBMSK, fi.TBP0, fi.TPSM, fi.TZTST, (uint32)fi.TME, &skip32, (uint32)region, (uint32)(hasSharedBits?1:0), crc )?true:false;
+	else
+		result	= dllFunc( fi.FBP, fi.FPSM, fi.FBMSK, fi.TBP0, fi.TPSM, fi.TZTST, (uint32)fi.TME, &skip32, (uint32)region, (uint32)(hasSharedBits?1:0) )?true:false;
+	skip	= skip32;
+	return true;
+}
+
+#endif
+
+bool GSState::IsBadFrame(int& skip, int UserHacks_SkipDraw) // returns true when the current draw should be skipped (and decrements skip)
+{
+	GSFrameInfo fi;
+	// Snapshot of the frame/texture register fields that the per-game hacks inspect.
+	fi.FBP = m_context->FRAME.Block();
+	fi.FPSM = m_context->FRAME.PSM;
+	fi.FBMSK = m_context->FRAME.FBMSK;
+	fi.TME = PRIM->TME;
+	fi.TBP0 = m_context->TEX0.TBP0;
+	fi.TPSM = m_context->TEX0.PSM;
+	fi.TZTST = m_context->TEST.ZTST;
+
+	static GetSkipCount map[CRC::TitleCount]; // function-local static: one table shared by every GSState instance
+	// The table is cleared and refilled whenever an instance with m_crcinited == false gets here.
+	if (!m_crcinited)
+	{
+		m_crcinited = true;
+
+		memset(map, 0, sizeof(map));
+
+		if (s_crc_hack_level > 1) {
+			map[CRC::AceCombat4] = GSC_AceCombat4;
+			map[CRC::AlpineRacer3] = GSC_AlpineRacer3;
+			map[CRC::BlackHawkDown] = GSC_BlackHawkDown;
+			map[CRC::BleachBladeBattlers] = GSC_BleachBladeBattlers;
+			map[CRC::BullyCC] = GSC_BullyCC; // Bully is fixed, maybe this one too?
+			map[CRC::BurnoutDominator] = GSC_Burnout;
+			map[CRC::BurnoutRevenge] = GSC_Burnout;
+			map[CRC::BurnoutTakedown] = GSC_Burnout;
+			map[CRC::CaptainTsubasa] = GSC_CaptainTsubasa;
+			map[CRC::CrashBandicootWoC] = GSC_CrashBandicootWoC;
+			map[CRC::CrashNburn] = GSC_CrashNburn;
+			map[CRC::DBZBT2] = GSC_DBZBT2;
+			map[CRC::DBZBT3] = GSC_DBZBT3;
+			map[CRC::DeathByDegreesTekkenNinaWilliams] = GSC_DeathByDegreesTekkenNinaWilliams;
+			map[CRC::DevilMayCry3] = GSC_DevilMayCry3;
+			map[CRC::EternalPoison] = GSC_EternalPoison;
+			map[CRC::EvangelionJo] = GSC_EvangelionJo;
+			map[CRC::FFVIIDoC] = GSC_FFVIIDoC;
+			map[CRC::FightingBeautyWulong] = GSC_FightingBeautyWulong;
+			map[CRC::FinalFightStreetwise] = GSC_FinalFightStreetwise;
+			map[CRC::FrontMission5] = GSC_FrontMission5;
+			map[CRC::Genji] = GSC_Genji;
+			map[CRC::GetaWayBlackMonday] = GSC_GetaWay;
+			map[CRC::GetaWay] = GSC_GetaWay;
+			map[CRC::GodHand] = GSC_GodHand;
+			map[CRC::GT3] = GSC_GT3;
+			map[CRC::GT4] = GSC_GT4;
+			map[CRC::GTASanAndreas] = GSC_GTASanAndreas;
+			map[CRC::GTConcept] = GSC_GTConcept;
+			map[CRC::HauntingGround] = GSC_HauntingGround;
+			map[CRC::HeavyMetalThunder] = GSC_HeavyMetalThunder;
+			map[CRC::HummerBadlands] = GSC_HummerBadlands;
+			map[CRC::ICO] = GSC_ICO;
+			map[CRC::IkkiTousen] = GSC_IkkiTousen;
+			map[CRC::JamesBondEverythingOrNothing] = GSC_JamesBondEverythingOrNothing;
+			map[CRC::KnightsOfTheTemple2] = GSC_KnightsOfTheTemple2;
+			map[CRC::Kunoichi] = GSC_Kunoichi;
+			map[CRC::LordOfTheRingsThirdAge] = GSC_LordOfTheRingsThirdAge;
+			map[CRC::Manhunt2] = GSC_Manhunt2;
+			map[CRC::MetalGearSolid3] = GSC_MetalGearSolid3;
+			map[CRC::MidnightClub3] = GSC_MidnightClub3;
+			map[CRC::NanoBreaker] = GSC_NanoBreaker;
+			map[CRC::NarutimateAccel] = GSC_NarutimateAccel;
+			map[CRC::Naruto] = GSC_Naruto;
+			map[CRC::Oneechanbara2Special] = GSC_Oneechanbara2Special;
+			map[CRC::Onimusha3] = GSC_Onimusha3;
+			map[CRC::RedDeadRevolver] = GSC_RedDeadRevolver;
+			map[CRC::ResidentEvil4] = GSC_ResidentEvil4;
+			map[CRC::SacredBlaze] = GSC_SacredBlaze;
+			map[CRC::SakuraTaisen] = GSC_SakuraTaisen;
+			map[CRC::SakuraWarsSoLongMyLove] = GSC_SakuraWarsSoLongMyLove;
+			map[CRC::SengokuBasara] = GSC_SengokuBasara;
+			map[CRC::ShadowofRome] = GSC_ShadowofRome;
+			map[CRC::ShinOnimusha] = GSC_ShinOnimusha;
+			map[CRC::Simple2000Vol114] = GSC_Simple2000Vol114;
+			map[CRC::SkyGunner] = GSC_SkyGunner;
+			map[CRC::SoulCalibur2] = GSC_SoulCalibur2;
+			map[CRC::SoulCalibur3] = GSC_SoulCalibur3;
+			map[CRC::Spartan] = GSC_Spartan;
+			map[CRC::StarWarsBattlefront2] = GSC_StarWarsBattlefront2;
+			map[CRC::StarWarsBattlefront] = GSC_StarWarsBattlefront;
+			map[CRC::StarWarsForceUnleashed] = GSC_StarWarsForceUnleashed;
+			map[CRC::SteambotChronicles] = GSC_SteambotChronicles;
+			map[CRC::TalesOfAbyss] = GSC_TalesOfAbyss;
+			map[CRC::TalesOfLegendia] = GSC_TalesOfLegendia;
+			map[CRC::TalesofSymphonia] = GSC_TalesofSymphonia;
+			map[CRC::Tekken5] = GSC_Tekken5;
+			map[CRC::TimeSplitters2] = GSC_TimeSplitters2;
+			map[CRC::TombRaiderAnniversary] = GSC_TombRaider;
+			map[CRC::TombRaiderLegend] = GSC_TombRaiderLegend;
+			map[CRC::TombRaiderUnderworld] = GSC_TombRaiderUnderWorld;
+			map[CRC::TouristTrophy] = GSC_TouristTrophy;
+			map[CRC::UltramanFightingEvolution] = GSC_UltramanFightingEvolution;
+			map[CRC::UrbanReign] = GSC_UrbanReign;
+			map[CRC::WildArms4] = GSC_WildArms4;
+			map[CRC::WildArms5] = GSC_WildArms5;
+			map[CRC::Yakuza2] = GSC_Yakuza2;
+			map[CRC::Yakuza] = GSC_Yakuza;
+			map[CRC::ZettaiZetsumeiToshi2] = GSC_ZettaiZetsumeiToshi2;
+			// Only Aggresive
+			map[CRC::FFX2] = GSC_FFX2;
+			map[CRC::FFX] = GSC_FFX;
+			map[CRC::FFXII] = GSC_FFXII;
+			map[CRC::SMTDDS1] = GSC_SMTNocturneDDS<0x203BA820>;
+			map[CRC::SMTDDS2] = GSC_SMTNocturneDDS<0x20435BF0>;
+			map[CRC::SMTNocturne] = GSC_SMTNocturneDDS<0x2054E870>;
+			map[CRC::SoTC] = GSC_SoTC;
+			map[CRC::SSX3] = GSC_SSX3;
+		}
+
+		// Hack that were fixed on openGL
+		if (Dx_only) {
+			map[CRC::Bully] = GSC_Bully;
+			map[CRC::GodOfWar2] = GSC_GodOfWar2;
+			map[CRC::LordOfTheRingsTwoTowers] = GSC_LordOfTheRingsTwoTowers;
+			map[CRC::Okami] = GSC_Okami;
+			map[CRC::SimpsonsGame] = GSC_SimpsonsGame;
+			map[CRC::SuikodenTactics] = GSC_SuikodenTactics;
+			map[CRC::XE3] = GSC_XE3;
+
+			// Not tested but must be fixed with texture shuffle
+			map[CRC::BigMuthaTruckers] = GSC_BigMuthaTruckers;
+			map[CRC::DemonStone] = GSC_DemonStone;
+			map[CRC::GiTS] = GSC_GiTS;
+			map[CRC::LegoBatman] = GSC_LegoBatman;
+			map[CRC::OnePieceGrandAdventure] = GSC_OnePieceGrandAdventure;
+			map[CRC::OnePieceGrandBattle] = GSC_OnePieceGrandBattle;
+			map[CRC::SFEX3] = GSC_SFEX3;
+			map[CRC::SpyroEternalNight] = GSC_SpyroEternalNight;
+			map[CRC::SpyroNewBeginning] = GSC_SpyroNewBeginning;
+			map[CRC::SonicUnleashed] = GSC_SonicUnleashed;
+			map[CRC::TenchuFS] = GSC_Tenchu;
+			map[CRC::TenchuWoH] = GSC_Tenchu;
+
+			// Those games might requires accurate fbmask
+			map[CRC::Sly2] = GSC_Sly2;
+			map[CRC::Sly3] = GSC_Sly3;
+
+			// Those games require accurate_colclip (perf)
+			map[CRC::CastlevaniaCoD] = GSC_Castlevania;
+			map[CRC::CastlevaniaLoI] = GSC_Castlevania;
+			map[CRC::GodOfWar] = GSC_GodOfWar;
+
+			// Those games emulate a stencil buffer with the alpha channel of the RT (Slow)
+			map[CRC::RadiataStories] = GSC_RadiataStories;
+			map[CRC::StarOcean3] = GSC_StarOcean3;
+			map[CRC::ValkyrieProfile2] = GSC_ValkyrieProfile2;
+
+			// Deprecated hack could be removed (Cutie)
+			map[CRC::Grandia3] = GSC_Grandia3;
+
+			// At least a part of the CRC is fixed with texture shuffle.
+			// The status of post-processing effect is unknown
+			map[CRC::Black] = GSC_Black;
+		}
+	}
+
+	// TODO: just set gsc in SetGameCRC once
+
+	GetSkipCount gsc = map[m_game.title];
+	g_crc_region = m_game.region;
+	// With ENABLE_DYNAMIC_CRC_HACK the external dll is tried first; its trailing "else" binds to the if(gsc ...) below.
+#ifdef ENABLE_DYNAMIC_CRC_HACK
+	bool res=false; if(IsInvokedDynamicCrcHack(fi, skip, g_crc_region, res, m_crc)){ if( !res ) return false;	} else
+#endif
+	if(gsc && !gsc(fi, skip))
+	{
+		return false;
+	}
+	// Nothing armed a skip: fall back to the user-configured skip count for depth / shared-bits textures.
+	if(skip == 0 && (UserHacks_SkipDraw > 0) )
+	{
+		if(fi.TME)
+		{
+			// depth textures (bully, mgs3s1 intro, Front Mission 5)
+			if( (fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S) ||
+				// General, often problematic post processing
+				(GSUtil::HasSharedBits(fi.FBP, fi.FPSM, fi.TBP0, fi.TPSM)) )
+			{
+				skip = UserHacks_SkipDraw;
+			}
+		}
+	}
+#ifdef ENABLE_OGL_DEBUG
+	else if (fi.TME) {
+			if(fi.TPSM == PSM_PSMZ32 || fi.TPSM == PSM_PSMZ24 || fi.TPSM == PSM_PSMZ16 || fi.TPSM == PSM_PSMZ16S)
+				GL_INS("!!! Depth Texture 0x%x!!!", fi.TPSM);
+	}
+#endif
+	// A positive countdown consumes one unit and reports this draw as bad.
+	if(skip > 0)
+	{
+		skip--;
+
+		return true;
+	}
+
+	return false;
+}
diff --git a/plugins/GSdx_legacy/GSState.h b/plugins/GSdx_legacy/GSState.h
new file mode 100644
index 0000000000..1005987c34
--- /dev/null
+++ b/plugins/GSdx_legacy/GSState.h
@@ -0,0 +1,256 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSLocalMemory.h"
+#include "GSDrawingContext.h"
+#include "GSDrawingEnvironment.h"
+#include "GSVertex.h"
+#include "GSVertexTrace.h"
+#include "GSUtil.h"
+#include "GSPerfMon.h"
+#include "GSVector.h"
+#include "GSDevice.h"
+#include "GSCrc.h"
+#include "GSAlignedClass.h"
+#include "GSDump.h"
+
+class GSState : public GSAlignedClass<32> // abstract base: Draw() is pure virtual
+{
+	// RESTRICT prevents multiple loads of the same part of the register when accessing its bitfields (the compiler is happy to know that memory writes in-between will not go there)
+
+	typedef void (GSState::*GIFPackedRegHandler)(const GIFPackedReg* RESTRICT r);
+	// Tables of member-function pointers for the packed-register handlers declared below.
+	GIFPackedRegHandler m_fpGIFPackedRegHandlers[16];
+	GIFPackedRegHandler m_fpGIFPackedRegHandlerXYZ[8][4];
+
+	void GIFPackedRegHandlerNull(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerRGBA(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerSTQ(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerUV(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerUV_Hack(const GIFPackedReg* RESTRICT r);
+	template<uint32 prim, uint32 adc> void GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r);
+	template<uint32 prim, uint32 adc> void GIFPackedRegHandlerXYZ2(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerFOG(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerA_D(const GIFPackedReg* RESTRICT r);
+	void GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r);
+
+	typedef void (GSState::*GIFRegHandler)(const GIFReg* RESTRICT r);
+	// Tables of member-function pointers for the GIFReg handlers declared further down.
+	GIFRegHandler m_fpGIFRegHandlers[256];
+	GIFRegHandler m_fpGIFRegHandlerXYZ[8][4];
+
+	typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size);
+	// Handlers of this type additionally receive a size argument.
+	GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[2];
+	GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8];
+	GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZ2[8];
+
+	template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size);
+	template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size);
+	void GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size);
+
+	template<int i> void ApplyTEX0(GIFRegTEX0& TEX0);
+	void ApplyPRIM(uint32 prim);
+
+	void GIFRegHandlerNull(const GIFReg* RESTRICT r);
+	void GIFRegHandlerPRIM(const GIFReg* RESTRICT r);
+	void GIFRegHandlerRGBAQ(const GIFReg* RESTRICT r);
+	void GIFRegHandlerST(const GIFReg* RESTRICT r);
+	void GIFRegHandlerUV(const GIFReg* RESTRICT r);
+	void GIFRegHandlerUV_Hack(const GIFReg* RESTRICT r);
+	template<uint32 prim, uint32 adc> void GIFRegHandlerXYZF2(const GIFReg* RESTRICT r);
+	template<uint32 prim, uint32 adc> void GIFRegHandlerXYZ2(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerTEX0(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerCLAMP(const GIFReg* RESTRICT r);
+	void GIFRegHandlerFOG(const GIFReg* RESTRICT r);
+	void GIFRegHandlerNOP(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerTEX1(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerTEX2(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerXYOFFSET(const GIFReg* RESTRICT r);
+	void GIFRegHandlerPRMODECONT(const GIFReg* RESTRICT r);
+	void GIFRegHandlerPRMODE(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTEXCLUT(const GIFReg* RESTRICT r);
+	void GIFRegHandlerSCANMSK(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerMIPTBP1(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerMIPTBP2(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTEXA(const GIFReg* RESTRICT r);
+	void GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerALPHA(const GIFReg* RESTRICT r);
+	void GIFRegHandlerDIMX(const GIFReg* RESTRICT r);
+	void GIFRegHandlerDTHE(const GIFReg* RESTRICT r);
+	void GIFRegHandlerCOLCLAMP(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerTEST(const GIFReg* RESTRICT r);
+	void GIFRegHandlerPABE(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerFBA(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerFRAME(const GIFReg* RESTRICT r);
+	template<int i> void GIFRegHandlerZBUF(const GIFReg* RESTRICT r);
+	void GIFRegHandlerBITBLTBUF(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTRXPOS(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTRXREG(const GIFReg* RESTRICT r);
+	void GIFRegHandlerTRXDIR(const GIFReg* RESTRICT r);
+	void GIFRegHandlerHWREG(const GIFReg* RESTRICT r);
+	void GIFRegHandlerSIGNAL(const GIFReg* RESTRICT r);
+	void GIFRegHandlerFINISH(const GIFReg* RESTRICT r);
+	void GIFRegHandlerLABEL(const GIFReg* RESTRICT r);
+
+	int m_version;
+	int m_sssize;
+
+	bool m_mt; // presumably set through SetMultithreaded() -- confirm in the implementation file
+	void (*m_irq)(); // presumably set through SetIrqCallback() -- confirm in the implementation file
+	bool m_path3hack; // read and then cleared by GetLastTag()
+	bool m_init_read_fifo_supported;
+	// Transfer buffer state; the constructor, destructor, Init and Update are defined out of line.
+	struct GSTransferBuffer
+	{
+		int x, y;
+		int start, end, total;
+		bool overflow;
+		uint8* buff;
+
+		GSTransferBuffer();
+		virtual ~GSTransferBuffer();
+
+		void Init(int tx, int ty);
+		bool Update(int tw, int th, int bpp, int& len);
+
+	} m_tr;
+
+protected:
+	bool IsBadFrame(int& skip, int UserHacks_SkipDraw); // true when the current draw should be skipped
+
+	int UserHacks_WildHack;
+	bool isPackedUV_HackFlag;
+	int m_crc_hack_level;
+
+	GSVertex m_v;
+	float m_q;
+	GSVector4i m_scissor;
+	GSVector4i m_ofxy;
+	bool m_texflush;
+	
+	struct 
+	{
+		GSVertex* buff; 
+		size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
+		size_t xy_tail;
+		uint64 xy[4];
+	} m_vertex; 
+
+	struct 
+	{
+		uint32* buff; 
+		size_t tail;
+	} m_index;
+
+	void UpdateContext();
+	void UpdateScissor();
+
+	virtual void UpdateVertexKick();
+
+	void GrowVertexBuffer();
+
+	template<uint32 prim> 
+	void VertexKick(uint32 skip);
+
+	// following functions need m_vt to be initialized
+
+	GSVertexTrace m_vt;
+
+	void GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear);
+	void GetAlphaMinMax();
+	bool TryAlphaTest(uint32& fm, uint32& zm);
+	bool IsOpaque();
+	bool IsMipMapActive();
+
+public:
+	GIFPath m_path[4];
+	GIFRegPRIM* PRIM;
+	GSPrivRegSet* m_regs;
+	GSLocalMemory m_mem;
+	GSDrawingEnvironment m_env;
+	GSDrawingContext* m_context;
+	GSPerfMon m_perfmon;
+	uint32 m_crc;
+	int m_options;
+	int m_frameskip;
+	bool m_crcinited; // false until IsBadFrame() has filled its CRC hack table for this instance
+	bool m_framelimit;
+	CRC::Game m_game; // IsBadFrame() reads its title and region fields
+	GSDump m_dump;
+	bool m_nativeres;
+	bool m_mipmap;
+
+	int s_n;
+	bool s_dump;
+	bool s_save;
+	bool s_savet;
+	bool s_savez;
+	bool s_savef;
+	int s_saven;
+	int s_savel;
+
+public:
+	GSState();
+	virtual ~GSState();
+
+	void ResetHandlers();
+
+	GSVector4i GetDisplayRect(int i = -1);
+	GSVector4i GetFrameRect(int i = -1);
+	GSVector2i GetDeviceSize(int i = -1);
+
+	bool IsEnabled(int i);
+
+	float GetTvRefreshRate();
+
+	virtual void Reset();
+	virtual void Flush();
+	virtual void FlushPrim();
+	virtual void FlushWrite();
+	virtual void Draw() = 0; // must be provided by subclasses
+	virtual void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) {} // default: no-op
+	virtual void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false) {} // default: no-op
+
+	void Move();
+	void Write(const uint8* mem, int len);
+	void Read(uint8* mem, int len);
+	void InitReadFIFO(uint8* mem, int len);
+
+	void SoftReset(uint32 mask);
+	void WriteCSR(uint32 csr) {m_regs->CSR.u32[1] = csr;} // stores csr in CSR.u32[1]
+	void ReadFIFO(uint8* mem, int size);
+	template<int index> void Transfer(const uint8* mem, uint32 size);
+	int Freeze(GSFreezeData* fd, bool sizeonly);
+	int Defrost(const GSFreezeData* fd);
+	void GetLastTag(uint32* tag) {*tag = m_path3hack; m_path3hack = 0;} // reports m_path3hack through *tag, then clears it
+	virtual void SetGameCRC(uint32 crc, int options);
+	void SetFrameSkip(int skip);
+	void SetRegsMem(uint8* basemem);
+	void SetIrqCallback(void (*irq)());
+	void SetMultithreaded(bool mt = true);
+};
+
diff --git a/plugins/GSdx_legacy/GSTables.cpp b/plugins/GSdx_legacy/GSTables.cpp
new file mode 100644
index 0000000000..aad7360eae
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTables.cpp
@@ -0,0 +1,275 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTables.h"
+
+const uint8 blockTable32[4][8] = // indexed [row][column]; the 32 entries are a permutation of 0..31
+{ // NOTE(review): presumably the block order inside a page for 32-bit pixel formats - confirm against the users of this table
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+const uint8 blockTable32Z[4][8] = // every entry equals the matching blockTable32 entry XOR 24
+{ // NOTE(review): the Z suffix presumably marks the depth-buffer variant - confirm
+	{ 24, 25, 28, 29,  8,  9, 12, 13},
+	{ 26, 27, 30, 31, 10, 11, 14, 15},
+	{ 16, 17, 20, 21,  0,  1,  4,  5},
+	{ 18, 19, 22, 23,  2,  3,  6,  7}
+};
+
+const uint8 blockTable16[8][4] = // indexed [row][column]; the 32 entries are a permutation of 0..31
+{ // NOTE(review): presumably the block order inside a page for 16-bit pixel formats - confirm
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+const uint8 blockTable16S[8][4] = // a different permutation of 0..31 than blockTable16, same [8][4] shape
+{ // NOTE(review): the S suffix presumably refers to the "S" 16-bit format family - confirm
+	{  0,  2, 16, 18 },
+	{  1,  3, 17, 19 },
+	{  8, 10, 24, 26 },
+	{  9, 11, 25, 27 },
+	{  4,  6, 20, 22 },
+	{  5,  7, 21, 23 },
+	{ 12, 14, 28, 30 },
+	{ 13, 15, 29, 31 }
+};
+
+const uint8 blockTable16Z[8][4] = // every entry equals the matching blockTable16 entry XOR 24
+{
+	{ 24, 26, 16, 18 },
+	{ 25, 27, 17, 19 },
+	{ 28, 30, 20, 22 },
+	{ 29, 31, 21, 23 },
+	{  8, 10,  0,  2 },
+	{  9, 11,  1,  3 },
+	{ 12, 14,  4,  6 },
+	{ 13, 15,  5,  7 }
+};
+
+const uint8 blockTable16SZ[8][4] = // every entry equals the matching blockTable16S entry XOR 24
+{
+	{ 24, 26,  8, 10 },
+	{ 25, 27,  9, 11 },
+	{ 16, 18,  0,  2 },
+	{ 17, 19,  1,  3 },
+	{ 28, 30, 12, 14 },
+	{ 29, 31, 13, 15 },
+	{ 20, 22,  4,  6 },
+	{ 21, 23,  5,  7 }
+};
+
+const uint8 blockTable8[4][8] = // contents are identical to blockTable32
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+const uint8 blockTable4[8][4] = // contents are identical to blockTable16
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+const uint8 columnTable32[8][8] = // indexed [row][column]; the 64 entries are a permutation of 0..63
+{ // NOTE(review): presumably the pixel order inside one block for 32-bit formats - confirm
+	{  0,  1,  4,  5,  8,  9, 12, 13 },
+	{  2,  3,  6,  7, 10, 11, 14, 15 },
+	{ 16, 17, 20, 21, 24, 25, 28, 29 },
+	{ 18, 19, 22, 23, 26, 27, 30, 31 },
+	{ 32, 33, 36, 37, 40, 41, 44, 45 },
+	{ 34, 35, 38, 39, 42, 43, 46, 47 },
+	{ 48, 49, 52, 53, 56, 57, 60, 61 },
+	{ 50, 51, 54, 55, 58, 59, 62, 63 },
+};
+
+const uint8 columnTable16[8][16] = // the 128 entries are a permutation of 0..127; each pair of rows covers 32 consecutive values
+{ // NOTE(review): presumably the pixel order inside one block for 16-bit formats - confirm
+	{   0,   2,   8,  10,  16,  18,  24,  26,
+	    1,   3,   9,  11,  17,  19,  25,  27 },
+	{   4,   6,  12,  14,  20,  22,  28,  30,
+	    5,   7,  13,  15,  21,  23,  29,  31 },
+	{  32,  34,  40,  42,  48,  50,  56,  58,
+	   33,  35,  41,  43,  49,  51,  57,  59 },
+	{  36,  38,  44,  46,  52,  54,  60,  62,
+	   37,  39,  45,  47,  53,  55,  61,  63 },
+	{  64,  66,  72,  74,  80,  82,  88,  90,
+	   65,  67,  73,  75,  81,  83,  89,  91 },
+	{  68,  70,  76,  78,  84,  86,  92,  94,
+	   69,  71,  77,  79,  85,  87,  93,  95 },
+	{  96,  98, 104, 106, 112, 114, 120, 122,
+	   97,  99, 105, 107, 113, 115, 121, 123 },
+	{ 100, 102, 108, 110, 116, 118, 124, 126,
+	  101, 103, 109, 111, 117, 119, 125, 127 },
+};
+
+const uint8 columnTable8[16][16] = // 256 entries, largest is 255; each "column N" marker below heads a group of four rows
+{ // NOTE(review): presumably the pixel order inside one block for 8-bit formats - confirm
+	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
+	    2,   6,  18,  22,  34,  38,  50,  54 },
+	{   8,  12,  24,  28,  40,  44,  56,  60,
+	   10,  14,  26,  30,  42,  46,  58,  62 },
+	{  33,  37,  49,  53,   1,   5,  17,  21,
+	   35,  39,  51,  55,   3,   7,  19,  23 },
+	{  41,  45,  57,  61,   9,  13,  25,  29,
+	   43,  47,  59,  63,  11,  15,  27,  31 },
+	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
+	   98, 102, 114, 118,  66,  70,  82,  86 },
+	{ 104, 108, 120, 124,  72,  76,  88,  92,
+	  106, 110, 122, 126,  74,  78,  90,  94 },
+	{  65,  69,  81,  85,  97, 101, 113, 117,
+	   67,  71,  83,  87,  99, 103, 115, 119 },
+	{  73,  77,  89,  93, 105, 109, 121, 125,
+	   75,  79,  91,  95, 107, 111, 123, 127 },
+	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
+	  130, 134, 146, 150, 162, 166, 178, 182 },
+	{ 136, 140, 152, 156, 168, 172, 184, 188,
+	  138, 142, 154, 158, 170, 174, 186, 190 },
+	{ 161, 165, 177, 181, 129, 133, 145, 149,
+	  163, 167, 179, 183, 131, 135, 147, 151 },
+	{ 169, 173, 185, 189, 137, 141, 153, 157,
+	  171, 175, 187, 191, 139, 143, 155, 159 },
+	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
+	  226, 230, 242, 246, 194, 198, 210, 214 },
+	{ 232, 236, 248, 252, 200, 204, 216, 220,
+	  234, 238, 250, 254, 202, 206, 218, 222 },
+	{ 193, 197, 209, 213, 225, 229, 241, 245,
+	  195, 199, 211, 215, 227, 231, 243, 247 },
+	{ 201, 205, 217, 221, 233, 237, 249, 253,
+	  203, 207, 219, 223, 235, 239, 251, 255 },
+};
+
+const uint16 columnTable4[16][32] = // 512 entries; uint16 because the values go up to 511; "column N" heads a group of four rows
+{ // NOTE(review): presumably the pixel order inside one block for 4-bit formats - confirm
+	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
+	    2,  10,  34,  42,  66,  74,  98, 106,
+	    4,  12,  36,  44,  68,  76, 100, 108,
+	    6,  14,  38,  46,  70,  78, 102, 110 },
+	{  16,  24,  48,  56,  80,  88, 112, 120,
+	   18,  26,  50,  58,  82,  90, 114, 122,
+	   20,  28,  52,  60,  84,  92, 116, 124,
+	   22,  30,  54,  62,  86,  94, 118, 126 },
+	{  65,  73,  97, 105,   1,   9,  33,  41,
+	   67,  75,  99, 107,   3,  11,  35,  43,
+	   69,  77, 101, 109,   5,  13,  37,  45,
+	   71,  79, 103, 111,   7,  15,  39,  47 },
+	{  81,  89, 113, 121,  17,  25,  49,  57,
+	   83,  91, 115, 123,  19,  27,  51,  59,
+	   85,  93, 117, 125,  21,  29,  53,  61,
+	   87,  95, 119, 127,  23,  31,  55,  63 },
+	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
+	  194, 202, 226, 234, 130, 138, 162, 170,
+	  196, 204, 228, 236, 132, 140, 164, 172,
+	  198, 206, 230, 238, 134, 142, 166, 174 },
+	{ 208, 216, 240, 248, 144, 152, 176, 184,
+	  210, 218, 242, 250, 146, 154, 178, 186,
+	  212, 220, 244, 252, 148, 156, 180, 188,
+	  214, 222, 246, 254, 150, 158, 182, 190 },
+	{ 129, 137, 161, 169, 193, 201, 225, 233,
+	  131, 139, 163, 171, 195, 203, 227, 235,
+	  133, 141, 165, 173, 197, 205, 229, 237,
+	  135, 143, 167, 175, 199, 207, 231, 239 },
+	{ 145, 153, 177, 185, 209, 217, 241, 249,
+	  147, 155, 179, 187, 211, 219, 243, 251,
+	  149, 157, 181, 189, 213, 221, 245, 253,
+	  151, 159, 183, 191, 215, 223, 247, 255 },
+	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
+	  258, 266, 290, 298, 322, 330, 354, 362,
+	  260, 268, 292, 300, 324, 332, 356, 364,
+	  262, 270, 294, 302, 326, 334, 358, 366 },
+	{ 272, 280, 304, 312, 336, 344, 368, 376,
+	  274, 282, 306, 314, 338, 346, 370, 378,
+	  276, 284, 308, 316, 340, 348, 372, 380,
+	  278, 286, 310, 318, 342, 350, 374, 382 },
+	{ 321, 329, 353, 361, 257, 265, 289, 297,
+	  323, 331, 355, 363, 259, 267, 291, 299,
+	  325, 333, 357, 365, 261, 269, 293, 301,
+	  327, 335, 359, 367, 263, 271, 295, 303 },
+	{ 337, 345, 369, 377, 273, 281, 305, 313,
+	  339, 347, 371, 379, 275, 283, 307, 315,
+	  341, 349, 373, 381, 277, 285, 309, 317,
+	  343, 351, 375, 383, 279, 287, 311, 319 },
+	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
+	  450, 458, 482, 490, 386, 394, 418, 426,
+	  452, 460, 484, 492, 388, 396, 420, 428,
+	  454, 462, 486, 494, 390, 398, 422, 430 },
+	{ 464, 472, 496, 504, 400, 408, 432, 440,
+	  466, 474, 498, 506, 402, 410, 434, 442,
+	  468, 476, 500, 508, 404, 412, 436, 444,
+	  470, 478, 502, 510, 406, 414, 438, 446 },
+	{ 385, 393, 417, 425, 449, 457, 481, 489,
+	  387, 395, 419, 427, 451, 459, 483, 491,
+	  389, 397, 421, 429, 453, 461, 485, 493,
+	  391, 399, 423, 431, 455, 463, 487, 495 },
+	{ 401, 409, 433, 441, 465, 473, 497, 505,
+	  403, 411, 435, 443, 467, 475, 499, 507,
+	  405, 413, 437, 445, 469, 477, 501, 509,
+	  407, 415, 439, 447, 471, 479, 503, 511 },
+};
+
+const uint8 clutTableT32I8[128] = // the 128 entries are a permutation of 0..127; each source line below holds 16 consecutive values
+{ // NOTE(review): presumably a CLUT index remap for 32-bit palettes with 8-bit indices - confirm against GSClut
+	0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+	64, 65, 68, 69, 72, 73, 76, 77, 66, 67, 70, 71, 74, 75, 78, 79,
+	16, 17, 20, 21, 24, 25, 28, 29, 18, 19, 22, 23, 26, 27, 30, 31,
+	80, 81, 84, 85, 88, 89, 92, 93, 82, 83, 86, 87, 90, 91, 94, 95,
+	32, 33, 36, 37, 40, 41, 44, 45, 34, 35, 38, 39, 42, 43, 46, 47,
+	96, 97, 100, 101, 104, 105, 108, 109, 98, 99, 102, 103, 106, 107, 110, 111,
+	48, 49, 52, 53, 56, 57, 60, 61, 50, 51, 54, 55, 58, 59, 62, 63,
+	112, 113, 116, 117, 120, 121, 124, 125, 114, 115, 118, 119, 122, 123, 126, 127
+};
+
+const uint8 clutTableT32I4[16] = // permutation of 0..15; same values as the first 16 entries of clutTableT32I8
+{
+	0, 1, 4, 5, 8, 9, 12, 13,
+	2, 3, 6, 7, 10, 11, 14, 15
+};
+
+const uint8 clutTableT16I8[32] = // the 32 entries are a permutation of 0..31: 16 even values first, then 16 odd values
+{
+	0, 2, 8, 10, 16, 18, 24, 26,
+	4, 6, 12, 14, 20, 22, 28, 30,
+	1, 3, 9, 11, 17, 19, 25, 27,
+	5, 7, 13, 15, 21, 23, 29, 31
+};
+
+const uint8 clutTableT16I4[16] = // same values as the first 16 entries of clutTableT16I8 (even numbers 0..30 only)
+{
+	0, 2, 8, 10, 16, 18, 24, 26,
+	4, 6, 12, 14, 20, 22, 28, 30
+};
diff --git a/plugins/GSdx_legacy/GSTables.h b/plugins/GSdx_legacy/GSTables.h
new file mode 100644
index 0000000000..cd05929557
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTables.h
@@ -0,0 +1,39 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+extern const uint8 blockTable32[4][8]; // every table declared in this header is defined in GSTables.cpp
+extern const uint8 blockTable32Z[4][8]; // blockTable32 with every entry XOR 24
+extern const uint8 blockTable16[8][4];
+extern const uint8 blockTable16S[8][4];
+extern const uint8 blockTable16Z[8][4]; // blockTable16 with every entry XOR 24
+extern const uint8 blockTable16SZ[8][4]; // blockTable16S with every entry XOR 24
+extern const uint8 blockTable8[4][8]; // same contents as blockTable32
+extern const uint8 blockTable4[8][4]; // same contents as blockTable16
+extern const uint8 columnTable32[8][8];
+extern const uint8 columnTable16[8][16];
+extern const uint8 columnTable8[16][16];
+extern const uint16 columnTable4[16][32]; // uint16: the entries go up to 511
+extern const uint8 clutTableT32I8[128];
+extern const uint8 clutTableT32I4[16];
+extern const uint8 clutTableT16I8[32];
+extern const uint8 clutTableT16I4[16];
diff --git a/plugins/GSdx_legacy/GSTexture.cpp b/plugins/GSdx_legacy/GSTexture.cpp
new file mode 100644
index 0000000000..9460deb5bf
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture.cpp
@@ -0,0 +1,36 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTexture.h"
+
+GSTexture::GSTexture() // neutral defaults; the D3D9/D3D11 constructors overwrite size, type, format and msaa afterwards
+	: m_scale(1, 1) // unit scale until SetScale() is called
+	, m_size(0, 0)
+	, m_type(0) // 0 is not one of the type enum values (RenderTarget starts at 1)
+	, m_format(0)
+	, m_msaa(false)
+	, last_frame_used(0)
+	, LikelyOffset(false)
+	, OffsetHack_modx(0.0f)
+	, OffsetHack_mody(0.0f)
+{
+}
diff --git a/plugins/GSdx_legacy/GSTexture.h b/plugins/GSdx_legacy/GSTexture.h
new file mode 100644
index 0000000000..5418cfebbb
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture.h
@@ -0,0 +1,75 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSVector.h"
+
+class GSTexture // abstract base of the backend textures; GSTexture9 and GSTexture11 derive from it
+{
+protected:
+	GSVector2 m_scale; // starts at (1, 1); read/written through GetScale()/SetScale()
+	GSVector2i m_size; // starts at (0, 0); the derived constructors store width/height here
+	int m_type; // one of the enum values below; 0 until a derived constructor classifies the resource
+	int m_format; // backend format code (the D3D backends store their native format enum value)
+	bool m_msaa; // set by the derived constructors when the resource is multisampled
+
+public:
+	struct GSMap {uint8* bits; int pitch;}; // filled by Map(): start of the mapped data and its row pitch
+
+	enum {RenderTarget = 1, DepthStencil, Texture, Offscreen, Backbuffer}; // values stored in m_type
+
+public:
+	GSTexture();
+	virtual ~GSTexture() {}
+
+	virtual operator bool() {ASSERT(0); return false;} // the base version asserts and yields false
+
+	virtual bool Update(const GSVector4i& r, const void* data, int pitch) = 0; // write data into rectangle r
+	virtual bool Map(GSMap& m, const GSVector4i* r = NULL) = 0; // expose the texture memory through m
+	virtual void Unmap() = 0; // release what Map() acquired
+	virtual bool Save(const string& fn, bool user_image = false, bool dds = false) = 0; // dump to an image file
+	virtual void Invalidate() {} // no-op unless overridden
+	virtual uint32 GetID() { return 0; } // 0 unless overridden
+
+	GSVector2 GetScale() const {return m_scale;}
+	void SetScale(const GSVector2& scale) {m_scale = scale;}
+
+	int GetWidth() const {return m_size.x;}
+	int GetHeight() const {return m_size.y;}
+	GSVector2i GetSize() const {return m_size;}
+
+	int GetType() const {return m_type;}
+	int GetFormat() const {return m_format;}
+
+	bool IsMSAA() const {return m_msaa;}
+
+	// frame number (arbitrary base) the texture was recycled on
+	// different purpose than texture cache ages, do not attempt to merge
+	unsigned last_frame_used;
+
+	bool LikelyOffset; // NOTE(review): the three members below look like state for an offset hack - confirm with their users
+	float OffsetHack_modx;
+	float OffsetHack_mody;
+
+	// Typical size of a RGBA texture
+	virtual uint32 GetMemUsage() { return m_size.x * m_size.y * 4; } // width * height * 4 bytes per pixel
+};
diff --git a/plugins/GSdx_legacy/GSTexture11.cpp b/plugins/GSdx_legacy/GSTexture11.cpp
new file mode 100644
index 0000000000..2e4cbf557b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture11.cpp
@@ -0,0 +1,239 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTexture11.h"
+#include "GSPng.h"
+
+GSTexture11::GSTexture11(ID3D11Texture2D* texture)
+	: m_texture(texture)
+{
+	// Wraps an existing D3D11 texture and caches its description, device and immediate context.
+	ASSERT(m_texture);
+
+	m_texture->GetDesc(&m_desc);
+	m_texture->GetDevice(&m_dev);
+	m_dev->GetImmediateContext(&m_ctx);
+
+	m_size.x = static_cast<int>(m_desc.Width);
+	m_size.y = static_cast<int>(m_desc.Height);
+	m_format = static_cast<int>(m_desc.Format);
+	m_msaa = m_desc.SampleDesc.Count > 1;
+
+	// The first matching category wins; m_type keeps the base-class value when none applies.
+	const UINT bind = m_desc.BindFlags;
+	if(bind & D3D11_BIND_RENDER_TARGET) m_type = RenderTarget;
+	else if(bind & D3D11_BIND_DEPTH_STENCIL) m_type = DepthStencil;
+	else if(bind & D3D11_BIND_SHADER_RESOURCE) m_type = Texture;
+	else if(m_desc.Usage == D3D11_USAGE_STAGING) m_type = Offscreen;
+}
+
+bool GSTexture11::Update(const GSVector4i& r, const void* data, int pitch)
+{
+	// Uploads data (row stride pitch) into rectangle r of subresource 0; false when no device/texture is held.
+	if(!m_dev || !m_texture)
+	{
+		return false;
+	}
+
+	D3D11_BOX region = {r.left, r.top, 0, r.right, r.bottom, 1}; // left, top, front, right, bottom, back
+	m_ctx->UpdateSubresource(m_texture, 0, &region, data, pitch, 0);
+
+	return true;
+}
+
+bool GSTexture11::Map(GSMap& m, const GSVector4i* r)
+{
+	// Maps subresource 0 of a staging texture for CPU read/write and stores pointer and row pitch in m.
+	// Returns false for partial maps (not implemented), non-staging textures and failed Map() calls.
+	if(r != NULL)
+	{
+		return false;
+	}
+	if(!m_texture || m_desc.Usage != D3D11_USAGE_STAGING)
+	{
+		return false;
+	}
+
+	D3D11_MAPPED_SUBRESOURCE mapped;
+
+	if(FAILED(m_ctx->Map(m_texture, 0, D3D11_MAP_READ_WRITE, 0, &mapped)))
+	{
+		return false;
+	}
+
+	m.bits = (uint8*)mapped.pData;
+	m.pitch = (int)mapped.RowPitch;
+	return true;
+}
+
+void GSTexture11::Unmap()
+{
+	// Releases the mapping of subresource 0; does nothing when no texture is held.
+	if(!m_texture) return;
+
+	m_ctx->Unmap(m_texture, 0);
+}
+
+bool GSTexture11::Save(const string& fn, bool user_image, bool dds)
+{
+	// Writes the texture to fn as a PNG through GSPng::Save(); returns false on any failure.
+	// user_image selects Z_BEST_COMPRESSION, otherwise the "png_compression_level" setting is used.
+	if(!m_dev || !m_ctx || !m_texture) return false; // same guard as Update()/Map()
+
+	CComPtr<ID3D11Texture2D> res;
+	D3D11_TEXTURE2D_DESC desc;
+
+	// Make a CPU-readable staging copy of the texture.
+	m_texture->GetDesc(&desc);
+	desc.Usage = D3D11_USAGE_STAGING;
+	desc.BindFlags = 0;
+	desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+
+	HRESULT hr = m_dev->CreateTexture2D(&desc, nullptr, &res);
+	if (FAILED(hr)) return false;
+
+	m_ctx->CopyResource(res, m_texture);
+
+	if (m_desc.BindFlags & D3D11_BIND_DEPTH_STENCIL)
+	{
+		// Depth is converted into a second staging texture holding one 32-bit value per pixel.
+		CComPtr<ID3D11Texture2D> dst;
+
+		desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
+		desc.CPUAccessFlags |= D3D11_CPU_ACCESS_WRITE;
+
+		hr = m_dev->CreateTexture2D(&desc, nullptr, &dst);
+		if (FAILED(hr)) return false;
+
+		D3D11_MAPPED_SUBRESOURCE sm, dm;
+
+		hr = m_ctx->Map(res, 0, D3D11_MAP_READ, 0, &sm);
+		if (FAILED(hr)) return false;
+		hr = m_ctx->Map(dst, 0, D3D11_MAP_WRITE, 0, &dm);
+		if (FAILED(hr))
+		{
+			m_ctx->Unmap(res, 0);
+			return false;
+		}
+
+		uint8* s = static_cast<uint8*>(sm.pData);
+		uint8* d = static_cast<uint8*>(dm.pData);
+
+		for (uint32 y = 0; y < desc.Height; y++, s += sm.RowPitch, d += dm.RowPitch)
+		{
+			// [x * 2] reads every other 32-bit word of the source row, i.e. texels are taken to be
+			// 8 bytes wide and to start with a float depth. NOTE(review): confirm the depth format.
+			const float* depth = reinterpret_cast<const float*>(s);
+			uint32* out = reinterpret_cast<uint32*>(d);
+
+			for (uint32 x = 0; x < desc.Width; x++)
+			{
+				// Scale by 2^32 and saturate: a depth of 1.0f gives exactly 2^32, which does not fit
+				// in uint32 (the unclamped conversion is undefined behaviour); NaN/negatives become 0.
+				const float z = ldexpf(depth[x * 2], 32);
+				out[x] = z >= 4294967296.0f ? 0xffffffffu : (z > 0.0f ? static_cast<uint32>(z) : 0u);
+			}
+		}
+
+		m_ctx->Unmap(res, 0);
+		m_ctx->Unmap(dst, 0);
+
+		res = dst;
+	}
+
+	res->GetDesc(&desc);
+
+	GSPng::Format format;
+	switch (desc.Format)
+	{
+	case DXGI_FORMAT_A8_UNORM:
+		format = GSPng::R8I_PNG;
+		break;
+	case DXGI_FORMAT_R8G8B8A8_UNORM:
+		format = dds ? GSPng::RGBA_PNG : (m_desc.BindFlags & D3D11_BIND_DEPTH_STENCIL ? GSPng::RGB_A_PNG : GSPng::RGB_PNG);
+		break;
+	default:
+		fprintf(stderr, "DXGI_FORMAT %d not saved to image\n", desc.Format);
+		return false;
+	}
+
+	D3D11_MAPPED_SUBRESOURCE sm;
+	hr = m_ctx->Map(res, 0, D3D11_MAP_READ, 0, &sm);
+	if (FAILED(hr)) return false;
+
+	int compression = user_image ? Z_BEST_COMPRESSION : theApp.GetConfig("png_compression_level", Z_BEST_SPEED);
+	bool success = GSPng::Save(format, fn, static_cast<uint8*>(sm.pData), desc.Width, desc.Height, sm.RowPitch, compression);
+	m_ctx->Unmap(res, 0);
+
+	return success;
+}
+
+GSTexture11::operator ID3D11Texture2D*() // hands out the wrapped texture pointer held in m_texture
+{
+	return m_texture;
+}
+
+GSTexture11::operator ID3D11ShaderResourceView*()
+{
+	// Creates the shader resource view on first use and caches it; the result stays null when
+	// there is no device/texture or when view creation fails.
+	const bool creatable = m_dev && m_texture;
+	if(creatable && !m_srv)
+	{
+		ASSERT(!m_msaa);
+		m_dev->CreateShaderResourceView(m_texture, NULL, &m_srv);
+	}
+	return m_srv;
+}
+
+GSTexture11::operator ID3D11UnorderedAccessView*()
+{
+	// Returns the cached unordered access view, creating it on first use when a device and texture exist.
+	if(m_uav || !m_dev || !m_texture)
+	{
+		return m_uav;
+	}
+	ASSERT(!m_msaa);
+	m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav);
+	return m_uav;
+}
+
+GSTexture11::operator ID3D11RenderTargetView*()
+{
+	// Returns the cached render target view, creating it on first use when a device and texture exist.
+	ASSERT(m_dev);
+	if(m_rtv || !m_dev || !m_texture)
+	{
+		return m_rtv;
+	}
+	m_dev->CreateRenderTargetView(m_texture, NULL, &m_rtv);
+	return m_rtv;
+}
+
+GSTexture11::operator ID3D11DepthStencilView*()
+{
+	// Returns the cached depth stencil view, creating it on first use when a device and texture exist.
+	if(m_dev && m_texture && !m_dsv)
+	{
+		m_dev->CreateDepthStencilView(m_texture, NULL, &m_dsv);
+	}
+	return m_dsv;
+}
diff --git a/plugins/GSdx_legacy/GSTexture11.h b/plugins/GSdx_legacy/GSTexture11.h
new file mode 100644
index 0000000000..8b9640e069
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture11.h
@@ -0,0 +1,50 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTexture.h"
+
+class GSTexture11 : public GSTexture // GSTexture implementation that wraps an ID3D11Texture2D
+{
+	CComPtr<ID3D11Device> m_dev; // device of m_texture, queried in the constructor
+	CComPtr<ID3D11DeviceContext> m_ctx; // immediate context of m_dev
+	CComPtr<ID3D11Texture2D> m_texture; // the wrapped texture
+	D3D11_TEXTURE2D_DESC m_desc; // description of m_texture captured in the constructor
+	CComPtr<ID3D11ShaderResourceView> m_srv; // the four views are created on first use by the conversion operators below
+	CComPtr<ID3D11UnorderedAccessView> m_uav;
+	CComPtr<ID3D11RenderTargetView> m_rtv;
+	CComPtr<ID3D11DepthStencilView> m_dsv;
+
+public:
+	explicit GSTexture11(ID3D11Texture2D* texture);
+
+	bool Update(const GSVector4i& r, const void* data, int pitch); // uploads through UpdateSubresource
+	bool Map(GSMap& m, const GSVector4i* r); // staging textures only; a non-NULL r is rejected
+	void Unmap();
+	bool Save(const string& fn, bool user_image = false, bool dds = false); // PNG dump through GSPng
+
+	operator ID3D11Texture2D*();
+	operator ID3D11ShaderResourceView*();
+	operator ID3D11UnorderedAccessView*();
+	operator ID3D11RenderTargetView*();
+	operator ID3D11DepthStencilView*();
+};
diff --git a/plugins/GSdx_legacy/GSTexture9.cpp b/plugins/GSdx_legacy/GSTexture9.cpp
new file mode 100644
index 0000000000..e322c2ee37
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture9.cpp
@@ -0,0 +1,219 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTexture9.h"
+#include "GSPng.h"
+
+GSTexture9::GSTexture9(IDirect3DSurface9* surface)
+{
+	// Wraps an existing D3D9 surface and caches its device and description.
+	m_surface = surface;
+
+	surface->GetDevice(&m_dev);
+	surface->GetDesc(&m_desc);
+
+	if(m_desc.Type != D3DRTYPE_SURFACE)
+	{
+		// Not a plain surface: fetch the IDirect3DTexture9 that contains it as well.
+		surface->GetContainer(__uuidof(IDirect3DTexture9), (void**)&m_texture);
+		ASSERT(m_texture != NULL);
+	}
+
+	m_size.x = static_cast<int>(m_desc.Width);
+	m_size.y = static_cast<int>(m_desc.Height);
+	m_format = static_cast<int>(m_desc.Format);
+	m_msaa = m_desc.MultiSampleType != D3DMULTISAMPLE_NONE;
+
+	// The first matching category wins; m_type keeps the base-class value when none applies.
+	if(m_desc.Usage & D3DUSAGE_RENDERTARGET) m_type = RenderTarget;
+	else if(m_desc.Usage & D3DUSAGE_DEPTHSTENCIL) m_type = DepthStencil;
+	else if(m_desc.Pool == D3DPOOL_MANAGED) m_type = Texture;
+	else if(m_desc.Pool == D3DPOOL_SYSTEMMEM) m_type = Offscreen;
+}
+
+GSTexture9::GSTexture9(IDirect3DTexture9* texture)
+{
+	// Wraps an existing D3D9 texture together with its level-0 surface and caches the level-0 description.
+	m_texture = texture;
+	texture->GetDevice(&m_dev);
+	texture->GetLevelDesc(0, &m_desc);
+	texture->GetSurfaceLevel(0, &m_surface);
+
+	ASSERT(m_surface != NULL);
+
+	m_size.x = (int)m_desc.Width;
+	m_size.y = (int)m_desc.Height;
+
+	// The first matching category wins; m_type keeps the base-class value when none applies.
+	if(m_desc.Usage & D3DUSAGE_RENDERTARGET) m_type = RenderTarget;
+	else if(m_desc.Usage & D3DUSAGE_DEPTHSTENCIL) m_type = DepthStencil;
+	else if(m_desc.Pool == D3DPOOL_MANAGED) m_type = Texture;
+	else if(m_desc.Pool == D3DPOOL_SYSTEMMEM) m_type = Offscreen;
+
+	m_format = (int)m_desc.Format;
+	m_msaa = m_desc.MultiSampleType != D3DMULTISAMPLE_NONE; // same test as the surface constructor; "> 1" left out D3DMULTISAMPLE_NONMASKABLE
+}
+
+GSTexture9::~GSTexture9() // empty on purpose: the COM members are CComPtr smart pointers, nothing is released by hand
+{
+}
+
+bool GSTexture9::Update(const GSVector4i& r, const void* data, int pitch)
+{
+	// Copies r.height() rows from data (source stride pitch) into rectangle r of the surface.
+	// Returns false when there is no surface or the rectangle cannot be locked.
+	D3DLOCKED_RECT lr;
+
+	if(!m_surface || FAILED(m_surface->LockRect(&lr, r, 0)))
+	{
+		return false;
+	}
+
+	// Bytes per row: 4 per pixel by default, 1 for A8, 2 for A1R5G5B5.
+	int row_bytes = r.width() * sizeof(uint32);
+
+	switch(m_desc.Format)
+	{
+	case D3DFMT_A8: row_bytes >>= 2; break;
+	case D3DFMT_A1R5G5B5: row_bytes >>= 1; break;
+	default: ASSERT(m_desc.Format == D3DFMT_A8R8G8B8); break;
+	}
+
+	// Never copy more than one source row or one destination row holds.
+	row_bytes = min(row_bytes, pitch);
+	row_bytes = min(row_bytes, lr.Pitch);
+
+	const uint8* from = (const uint8*)data;
+	uint8* to = (uint8*)lr.pBits;
+
+	for(int rows = r.height(); rows > 0; rows--, from += pitch, to += lr.Pitch)
+	{
+		memcpy(to, from, row_bytes);
+	}
+
+	m_surface->UnlockRect();
+	return true;
+}
+
+bool GSTexture9::Map(GSMap& m, const GSVector4i* r)
+{
+	// Locks the surface and stores the locked pointer and pitch in m; r is handed to LockRect
+	// unchanged, so NULL is passed through as well.
+	// Returns false when there is no surface or the lock fails. Unmap() releases the lock.
+	if(!m_surface) return false;
+
+	D3DLOCKED_RECT locked;
+
+	if(FAILED(m_surface->LockRect(&locked, (LPRECT)r, 0)))
+	{
+		return false;
+	}
+
+	m.bits = (uint8*)locked.pBits;
+	m.pitch = (int)locked.Pitch;
+
+	return true;
+}
+
+void GSTexture9::Unmap()
+{
+	// Unlocks the surface; does nothing when no surface is held.
+	if(!m_surface) return;
+
+	m_surface->UnlockRect();
+}
+
+bool GSTexture9::Save(const string& fn, bool user_image, bool dds)
+{ // Saves the surface to fn as a PNG through GSPng::Save(); returns false on any failure.
+	bool rb_swapped = true; // forwarded to GSPng::Save(); cleared below when the surface is locked in place
+	CComPtr<IDirect3DSurface9> surface; // the surface that is finally locked and written out
+
+	D3DSURFACE_DESC desc;
+	m_surface->GetDesc(&desc);
+	// Depth surfaces are only handled when they use the lockable 32-bit float format.
+	if (m_desc.Usage & D3DUSAGE_DEPTHSTENCIL && desc.Format != D3DFMT_D32F_LOCKABLE)
+	{
+		return false;
+	}
+	// A8, managed-pool and depth surfaces are locked in place; anything else is first copied to system memory.
+	if (desc.Format == D3DFMT_A8 || desc.Pool == D3DPOOL_MANAGED || desc.Usage == D3DUSAGE_DEPTHSTENCIL)
+	{
+		surface = m_surface;
+		rb_swapped = false;
+	}
+	else
+	{
+		HRESULT hr;
+
+		hr = m_dev->CreateOffscreenPlainSurface(desc.Width, desc.Height, desc.Format, D3DPOOL_SYSTEMMEM, &surface, nullptr);
+		if (FAILED(hr))
+		{
+			return false;
+		}
+
+		hr = m_dev->GetRenderTargetData(m_surface, surface); // read the render target back into the system-memory surface
+		if (FAILED(hr))
+		{
+			return false;
+		}
+	}
+	// Pick the PNG layout from the surface format; any other format is reported on stderr and not saved.
+	GSPng::Format format;
+	switch (desc.Format)
+	{
+	case D3DFMT_A8:
+		format = GSPng::R8I_PNG;
+		break;
+	case D3DFMT_A8R8G8B8:
+		format = dds? GSPng::RGBA_PNG : GSPng::RGB_PNG;
+		break;
+	case D3DFMT_D32F_LOCKABLE:
+		format = GSPng::RGB_A_PNG;
+		break;
+	default:
+		fprintf(stderr, "D3DFMT %d not saved to image\n", desc.Format);
+		return false;
+	}
+
+	D3DLOCKED_RECT slr;
+	HRESULT hr = surface->LockRect(&slr, nullptr, 0); // nullptr rectangle: lock the whole surface
+	if (FAILED(hr))
+	{
+		return false;
+	}
+	// user_image forces Z_BEST_COMPRESSION; otherwise the "png_compression_level" setting applies (default Z_BEST_SPEED).
+	int compression = user_image ? Z_BEST_COMPRESSION : theApp.GetConfig("png_compression_level", Z_BEST_SPEED);
+	bool success = GSPng::Save(format, fn, static_cast<uint8*>(slr.pBits), desc.Width, desc.Height, slr.Pitch, compression, rb_swapped);
+
+	surface->UnlockRect();
+	return success;
+}
+
+GSTexture9::operator IDirect3DSurface9*()
+{
+	return m_surface;
+}
+
+GSTexture9::operator IDirect3DTexture9*()
+{
+	return m_texture;
+}
diff --git a/plugins/GSdx_legacy/GSTexture9.h b/plugins/GSdx_legacy/GSTexture9.h
new file mode 100644
index 0000000000..347734fd50
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTexture9.h
@@ -0,0 +1,45 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTexture.h"
+
+class GSTexture9 : public GSTexture // GSTexture implementation that wraps a D3D9 surface and/or texture
+{
+	CComPtr<IDirect3DDevice9> m_dev; // device of the wrapped resource, queried in the constructors
+	CComPtr<IDirect3DSurface9> m_surface; // the surface itself, or level 0 of m_texture
+	CComPtr<IDirect3DTexture9> m_texture; // stays empty when constructed from a plain surface
+	D3DSURFACE_DESC m_desc; // surface / level-0 description captured in the constructors
+
+public:
+	explicit GSTexture9(IDirect3DSurface9* surface);
+	explicit GSTexture9(IDirect3DTexture9* texture);
+	virtual ~GSTexture9();
+
+	bool Update(const GSVector4i& r, const void* data, int pitch); // row-by-row copy into the locked rectangle
+	bool Map(GSMap& m, const GSVector4i* r); // LockRect on m_surface
+	void Unmap(); // UnlockRect on m_surface
+	bool Save(const string& fn, bool user_image = false, bool dds = false); // PNG dump through GSPng
+
+	operator IDirect3DSurface9*();
+	operator IDirect3DTexture9*();
+};
diff --git a/plugins/GSdx_legacy/GSTextureCache.cpp b/plugins/GSdx_legacy/GSTextureCache.cpp
new file mode 100644
index 0000000000..4fbf330d23
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache.cpp
@@ -0,0 +1,1722 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureCache.h"
+
+bool s_IS_OPENGL = false; // set by the GSTextureCache constructor: true when the "Renderer" setting is OGL_HW
+bool GSTextureCache::m_disable_partial_invalidation = false; // static member; loaded from "UserHacks_DisablePartialInvalidation" by the constructor
+
+GSTextureCache::GSTextureCache(GSRenderer* r)
+	: m_renderer(r)
+{
+	// "UserHacks" gates every hack option read below; the renderer choice gates depth conversion.
+	const bool hacks_on = theApp.GetConfig("UserHacks", 0) != 0;
+	const GSRendererType renderer = static_cast<GSRendererType>(theApp.GetConfig("Renderer", static_cast<int>(GSRendererType::Default)));
+	s_IS_OPENGL = (renderer == GSRendererType::OGL_HW);
+
+	m_spritehack = 0;
+	if(hacks_on) m_spritehack = theApp.GetConfig("UserHacks_SpriteHack", 0);
+	UserHacks_HalfPixelOffset = hacks_on && theApp.GetConfig("UserHacks_HalfPixelOffset", 0);
+	m_paltex = theApp.GetConfig("paltex", 0) != 0;
+	m_preload_frame = hacks_on && theApp.GetConfig("preload_frame_with_gs_data", 0);
+	m_can_convert_depth = s_IS_OPENGL && theApp.GetConfig("texture_cache_depth", 1);
+	m_crc_hack_level = theApp.GetConfig("crc_hack_level", 3);
+	m_disable_partial_invalidation = hacks_on && theApp.GetConfig("UserHacks_DisablePartialInvalidation", 0);
+
+	// 32-byte aligned scratch buffer. In theory 4MB is enough, but 8MB isn't enough in custom resolution (test: onimusha 3 PAL 60Hz), so take 9MB.
+	m_temp = (uint8*)_aligned_malloc(9 * 1024 * 1024, 32);
+}
+
+GSTextureCache::~GSTextureCache()
+{
+	RemoveAll(); // drops the cached sources (m_src) and deletes every cached target (m_dst)
+
+	_aligned_free(m_temp); // pairs with the _aligned_malloc done in the constructor
+}
+
+void GSTextureCache::RemovePartial()
+{
+	// Deletes every cached target of both target types and empties the lists.
+	// Unlike RemoveAll(), the source cache (m_src) is not touched.
+	for(int kind = 0; kind != 2; ++kind)
+	{
+		list<Target*>& targets = m_dst[kind];
+		for_each(targets.begin(), targets.end(), delete_object());
+		targets.clear();
+	}
+}
+
+void GSTextureCache::RemoveAll()
+{
+	// Full flush of the texture cache.
+	// 1/ every cached source
+	m_src.RemoveAll();
+
+	// 2/ every cached target: RemovePartial() deletes each Target held in
+	// m_dst[0] and m_dst[1] and then clears both lists, which is exactly
+	// the per-type teardown this function needs.
+	RemovePartial();
+}
+
+GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r)
+{
+	// Returns the cached Source matching TEX0 (plus CLUT/TEXA for CPU-converted textures), creating one via
+	// CreateSource when nothing fits. The palette and rect r are updated before returning; NULL = creation failed.
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
+	//const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[TEX0.CPSM] : psm;
+
+	// Until DX is fixed: OpenGL reads the CLUT with the caller's TEXA (paletted formats only), DX always with a fixed TEXA
+	if (s_IS_OPENGL) {
+		if(psm.pal > 0)
+			m_renderer->m_mem.m_clut.Read32(TEX0, TEXA);
+	} else {
+		GIFRegTEXA plainTEXA; // NOTE(review): only AEM/TA0/TA1 are assigned below; remaining bits look uninitialized unless the type zeroes itself -- confirm
+
+		plainTEXA.AEM = 1;
+		plainTEXA.TA0 = 0;
+		plainTEXA.TA1 = 0x80;
+		m_renderer->m_mem.m_clut.Read32(TEX0, plainTEXA);
+	}
+
+	const uint32* clut = m_renderer->m_mem.m_clut;
+
+	Source* src = NULL;
+
+	list<Source*>& m = m_src.m_map[TEX0.TBP0 >> 5];
+
+	for(list<Source*>::iterator i = m.begin(); i != m.end(); i++)
+	{
+		Source* s = *i;
+
+		if (((TEX0.u32[0] ^ s->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ s->m_TEX0.u32[1]) & 3)) != 0) // TBP0 TBW PSM TW TH
+			continue;
+
+		// Targets are converted (AEM & palette) on the fly by the GPU. They don't need extra checks
+		if (!s->m_target) {
+			// We request a palette texture (psm.pal). If the texture was
+			// converted by the CPU (s->m_palette == NULL), we need to ensure
+			// palette content is the same.
+			// Note: content of the palette will be uploaded at the end of the function
+			if (psm.pal > 0 && s->m_palette == NULL && !GSVector4i::compare64(clut, s->m_clut, psm.pal * sizeof(clut[0])))
+				continue;
+
+			// We request a 24/16 bit RGBA texture. Alpha expansion was done by
+			// the CPU.  We need to check that TEXA is identical
+			if (psm.pal == 0 && psm.fmt > 0 && s->m_TEXA.u64 != TEXA.u64)
+				continue;
+		}
+
+		m.splice(m.begin(), m, i); // move the hit to the front of the page list
+
+		src = s;
+
+		break;
+	}
+
+	Target* dst = NULL;
+	bool half_right = false;
+
+#ifdef DISABLE_HW_TEXTURE_CACHE
+	if( 0 )
+#else
+	if(src == NULL)
+#endif
+	{
+		uint32 bp = TEX0.TBP0;
+		uint32 psm = TEX0.PSM;
+
+		// Arc the Lad finds the wrong surface here when looking for a depth stencil.
+		// Since we're currently not caching depth stencils (check ToDo in CreateSource) we should not look for it here.
+
+		// (Simply not doing this code at all makes a lot of previously missing stuff show, but breaks pretty much
+		// everything else.)
+
+		for(list<Target*>::iterator i = m_dst[RenderTarget].begin(); i != m_dst[RenderTarget].end(); i++)
+		{
+			Target* t = *i;
+
+			if(t->m_used && t->m_dirty.empty()) {
+				// Typical bug (MGS3 blue cloud):
+				// 1/ RT used as 32 bits => alpha channel written
+				// 2/ RT used as 24 bits => no update of alpha channel
+				// 3/ Lookup of texture that used alpha channel as index, HasSharedBits will return false
+				//    because of the previous draw call format
+				//
+				// Solution: consider the RT as 32 bits if the alpha was used in the past
+				uint32 t_psm = (t->m_dirty_alpha) ? t->m_TEX0.PSM & ~0x1 : t->m_TEX0.PSM;
+
+				if (GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t_psm)) {
+					if (!s_IS_OPENGL && (psm == PSM_PSMT8)) {
+						// OpenGL can convert the texture directly in the GPU. Not sure we want to keep this
+						// code for DX. It fixes effect but it is slow (MGS3)
+
+						// It is complex to convert the code into a shader. As a reference, let's do it on the CPU, it will
+						// be slow but
+						// 1/ it just works :)
+						// 2/ even with upscaling
+						// 3/ for both DX and OpenGL
+
+						// Gregory: to avoid a massive slow down for nothing, let's only enable
+						// this code when CRC is below the FULL level
+						if (m_crc_hack_level < 3)
+							Read(t, t->m_valid);
+						else
+							dst = t;
+					} else {
+						dst = t;
+					}
+
+					break;
+				} else if ((t->m_TEX0.TBW >= 16) && GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0 + t->m_TEX0.TBW * 0x10, t->m_TEX0.PSM)) {
+					// Detect half of the render target (fixes Snowblind engine games)
+					// Target Page (8KB) have always a width of 64 pixels
+					// Half of the Target is TBW/2 pages * 8KB / (1 block * 256B) = 0x10
+					half_right = true;
+					dst = t;
+
+					break;
+				}
+
+			}
+		}
+
+		if (dst == NULL && CanConvertDepth()) {
+			// Let's try a trick to avoid wrongly using a depth buffer
+			// Unfortunately, I don't have any Arc the Lad testcase
+			//
+			// 1/ Check only current frame, I guess it is only used as a postprocessing effect
+			for(list<Target*>::iterator i = m_dst[DepthStencil].begin(); i != m_dst[DepthStencil].end(); i++) {
+				Target* t = *i;
+
+				if(!t->m_age && t->m_used && t->m_dirty.empty() && GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t->m_TEX0.PSM))
+				{
+					dst = t;
+					break;
+				}
+			}
+		}
+	}
+
+	if(src == NULL)
+	{
+#ifdef ENABLE_OGL_DEBUG
+		if (dst) {
+			GL_CACHE("TC: dst %s hit (%s): %d (0x%x, F:0x%x)", to_string(dst->m_type), half_right ? "half" : "full",
+						dst->m_texture ? dst->m_texture->GetID() : 0,
+						TEX0.TBP0, TEX0.PSM);
+		} else {
+			GL_CACHE("TC: src miss (0x%x, F:0x%x)", TEX0.TBP0, TEX0.PSM);
+		}
+#endif
+		src = CreateSource(TEX0, TEXA, dst, half_right); // dst may be NULL: no target overlaps the texture
+
+		if(src == NULL)
+		{
+			return NULL;
+		}
+
+	} else {
+		GL_CACHE("TC: src hit: %d (0x%x, F:0x%x)",
+					src->m_texture ? src->m_texture->GetID() : 0,
+					TEX0.TBP0, TEX0.PSM);
+	}
+
+	if (src->m_palette)
+	{
+		int size = psm.pal * sizeof(clut[0]);
+
+		// Upload the palette on first use (m_initpalette) or when GSVector4i::update returns false for the CLUT copy
+		if(src->m_initpalette || !GSVector4i::update(src->m_clut, clut, size))
+		{
+			src->m_palette->Update(GSVector4i(0, 0, psm.pal, 1), src->m_clut, size);
+			src->m_initpalette = false;
+		}
+	}
+
+	src->Update(r);
+
+	m_src.m_used = true; // read by IncAge() to pick the source expiry age
+
+	return src;
+}
+
+GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int w, int h, int type, bool used)
+{
+	// Returns the cached target of the requested type (RenderTarget or DepthStencil) whose base pointer
+	// is TEX0.TBP0, creating it (w x h) on a miss. Returns NULL when CreateTarget fails. `used` sets m_used.
+	uint32 bp = TEX0.TBP0;
+	Target* dst = NULL;
+
+	for(list<Target*>::iterator i = m_dst[type].begin(); i != m_dst[type].end(); i++)
+	{
+		Target* t = *i;
+
+		if(bp == t->m_TEX0.TBP0)
+		{
+			m_dst[type].splice(m_dst[type].begin(), m_dst[type], i);
+
+			dst = t;
+
+			dst->m_32_bits_fmt |= !(TEX0.PSM & 2);
+			dst->m_TEX0 = TEX0;
+
+			break;
+		}
+	}
+
+	if (dst) {
+		GL_CACHE("TC: Lookup Target(%s) %dx%d, hit: %d (0x%x, F:0x%x)", to_string(type), w, h, dst->m_texture->GetID(), bp, TEX0.PSM);
+		dst->Update();
+
+		dst->m_dirty_alpha |= (TEX0.PSM != PSM_PSMCT24) && (TEX0.PSM != PSM_PSMZ24);
+
+	} else if (CanConvertDepth()) {
+		int rev_type = (type == DepthStencil) ? RenderTarget : DepthStencil;
+		GSVector4 sRect(0, 0, 1.0, 1.0);
+		GSVector4 dRect(0, 0, w, h);
+
+		// Depth stencil/RT can be an older RT/DS but only check recent RT/DS to avoid picking
+		// some bad data.
+
+		for(list<Target*>::iterator i = m_dst[rev_type].begin(); i != m_dst[rev_type].end(); i++)
+		{
+			Target* t = *i;
+
+			if(!t->m_age && bp == t->m_TEX0.TBP0)
+			{
+				dst = CreateTarget(TEX0, w, h, type);
+				if(dst == NULL) return NULL; // creation failed: report it like the plain miss path below, do not dereference
+				dst->m_32_bits_fmt = t->m_32_bits_fmt;
+
+				if (type == DepthStencil) {
+					GL_CACHE("TC: Lookup Target(Depth) %dx%d, hit Color (0x%x, F:0x%x)", w, h, bp, TEX0.PSM);
+					int shader = ShaderConvert_RGBA8_TO_FLOAT32 + GSLocalMemory::m_psm[TEX0.PSM].fmt;
+					m_renderer->m_dev->StretchRect(t->m_texture, sRect, dst->m_texture, dRect, shader, false);
+				} else {
+					GL_CACHE("TC: Lookup Target(Color) %dx%d, hit Depth (0x%x, F:0x%x)", w, h, bp, TEX0.PSM);
+					m_renderer->m_dev->StretchRect(t->m_texture, sRect, dst->m_texture, dRect, ShaderConvert_FLOAT32_TO_RGBA8, false);
+				}
+
+				break;
+			}
+		}
+	}
+
+	if(dst == NULL)
+	{
+		GL_CACHE("TC: Lookup Target(%s) %dx%d, miss (0x%x, F:0x%x)", to_string(type), w, h, bp, TEX0.PSM);
+
+		dst = CreateTarget(TEX0, w, h, type);
+
+		if(dst == NULL)
+			return NULL;
+
+		// In theory new textures contain invalidated data. Still in theory a new target
+		// must contain the content of the GS memory.
+		// In practice, TC will wrongly invalidate some RT. For example due to a write on the alpha
+		// channel while the colors are still valid. Unfortunately TC doesn't support the upload of data
+		// in target.
+		//
+		// Cleaning the code here will likely break several games. However it might reduce
+		// the noise in draw call debugging. It is the main reason to enable it on debug build.
+		//
+		// From a performance point of view, it might cost a little on big upscaling
+		// but normally few RT are missed so it must remain reasonable.
+		if (s_IS_OPENGL) {
+			if (m_preload_frame) {
+				GL_INS("Preloading the RT DATA");
+				// RT doesn't have height but if we use a too big value, we will read outside of the GS memory.
+				int page0 = TEX0.TBP0 >> 5;
+				int max_page = (MAX_PAGES - page0);
+				int max_h = 32 * max_page / std::max<int>(1, TEX0.TBW); // guard: a TBW of 0 would be an integer division by zero
+				// h is likely smaller than w (true most of the time). Reduce the upload size (speed)
+				max_h = std::min<int>(max_h, TEX0.TBW * 64);
+
+				dst->m_dirty.push_back(GSDirtyRect(GSVector4i(0, 0, TEX0.TBW * 64, max_h), TEX0.PSM));
+				dst->Update();
+			} else {
+#ifdef ENABLE_OGL_DEBUG
+				switch (type) {
+					case RenderTarget: m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); break;
+					case DepthStencil: m_renderer->m_dev->ClearDepth(dst->m_texture, 0); break;
+					default:break;
+				}
+#endif
+			}
+		}
+	}
+
+	if(m_renderer->CanUpscale())
+	{
+		int multiplier = m_renderer->GetUpscaleMultiplier();
+
+		if(multiplier > 1) // it's limited to a maximum of 4 on reading the config
+		{
+			dst->m_texture->SetScale(GSVector2((float)multiplier, (float)multiplier));
+		}
+		else
+		{
+			GSVector4i fr = m_renderer->GetFrameRect();
+
+			int ww = (int)(fr.left + m_renderer->GetDisplayRect().width());
+			int hh = (int)(fr.top + m_renderer->GetDisplayRect().height());
+
+			if(hh <= m_renderer->GetDeviceSize().y / 2)
+			{
+				hh *= 2;
+			}
+
+			// Gregory: I'm sure this silliness is related to the usage of a 32bits
+			// buffer as a 16 bits format. In this case the height of the buffer is
+			// multiplied by 2 (Hence a scissor bigger than the RT)
+
+			// This vp2 fix doesn't work most of the time
+
+			if(hh < 512 && m_renderer->m_context->SCISSOR.SCAY1 == 511) // vp2
+			{
+				hh = 512;
+			}
+
+			if(ww > 0 && hh > 0)
+			{
+				dst->m_texture->SetScale(GSVector2((float)w / ww, (float)h / hh));
+			}
+		}
+	}
+
+	if(used)
+	{
+		dst->m_used = true;
+	}
+
+	return dst;
+}
+
+GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int w, int h, int real_h)
+{
+	// Frame lookup among the render targets: an exact base pointer match wins, otherwise a target
+	// starting close below bp is reused (HACK). On a miss a new, cleared render target is created.
+	const uint32 bp = TEX0.TBP0;
+	list<Target*>& rts = m_dst[RenderTarget];
+	Target* dst = NULL;
+
+	for(list<Target*>::iterator it = rts.begin(); it != rts.end(); ++it)
+	{
+		Target* t = *it;
+
+		if(bp == t->m_TEX0.TBP0)
+		{
+			// Perfect hit: stop searching.
+			dst = t;
+			GL_CACHE("TC: Lookup Frame %dx%d, perfect hit: %d (0x%x)", w, h, dst->m_texture->GetID(), bp);
+			break;
+		}
+
+		// HACK: try to find something close to the base pointer. A candidate must start at or
+		// before bp, less than 0xe00 away from it, and not below the candidate already chosen.
+		const bool in_window = t->m_TEX0.TBP0 <= bp && bp < t->m_TEX0.TBP0 + 0xe00UL;
+		const bool not_worse = !dst || t->m_TEX0.TBP0 >= dst->m_TEX0.TBP0;
+
+		if(in_window && not_worse)
+		{
+			GL_CACHE("TC: Lookup Frame %dx%d, close hit: %d (0x%x, took 0x%x)", w, h, t->m_texture->GetID(), bp, t->m_TEX0.TBP0);
+			dst = t;
+		}
+	}
+
+	if(dst != NULL)
+	{
+		// Hit (perfect or close): refresh the cached target and hand it back.
+		dst->Update();
+		dst->m_used = true;
+
+		return dst;
+	}
+
+	GL_CACHE("TC: Lookup Frame %dx%d, miss (0x%x)", w, h, bp);
+
+	dst = CreateTarget(TEX0, w, h, RenderTarget);
+	if(dst == NULL)
+	{
+		return NULL;
+	}
+
+	m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage
+
+	if(m_preload_frame)
+	{
+		// Load GS data into the frame: a game can directly upload a background or the full image in the
+		// "CTRC" buffer, and it avoids various black screen issues in gs dumps. More or less equivalent
+		// to the SW renderer. The option is hidden and off by default to avoid any regression.
+		dst->m_dirty.push_back(GSDirtyRect(GSVector4i(0, 0, TEX0.TBW * 64, real_h), TEX0.PSM));
+		dst->Update();
+	}
+
+	dst->m_used = true;
+	return dst;
+}
+
+// Goal: a depth buffer and a render target cannot live at the same address. On the GS
+// they share the same memory, but on DX/GL they do not. Therefore a write to the
+// depth (resp. target) must invalidate the target (resp. depth).
+void GSTextureCache::InvalidateVideoMemType(int type, uint32 bp)
+{
+	// Nothing to do unless depth <-> color conversion is enabled.
+	if(!CanConvertDepth())
+		return;
+
+	list<Target*>& targets = m_dst[type];
+	list<Target*>::iterator it = targets.begin();
+
+	// Locate the first target of this type whose base pointer is bp; at most one is removed.
+	while(it != targets.end() && bp != (*it)->m_TEX0.TBP0)
+		++it;
+
+	if(it == targets.end())
+		return; // no target of this type is based at bp
+
+	Target* victim = *it;
+	GL_CACHE("TC: InvalidateVideoMemType: Remove Target(%s) %d (0x%x)", to_string(type),
+			victim->m_texture ? victim->m_texture->GetID() : 0,
+			victim->m_TEX0.TBP0);
+	targets.erase(it);
+	delete victim;
+}
+
+// Goal: invalidate data sent to the GPU when the source (GS memory) is modified
+// Called each time you want to write to the GS memory
+void GSTextureCache::InvalidateVideoMem(GSOffset* off, const GSVector4i& rect, bool target)
+{
+	if(!off) return; // Fixme. Crashes Dual Hearts, maybe others as well. Was fine before r1549.
+
+	uint32 bp = off->bp;
+	uint32 bw = off->bw;
+	uint32 psm = off->psm;
+
+	if(!target)
+	{
+		// Remove Source that have same BP as the render target (color&dss)
+		// rendering will dirty the copy
+		const list<Source*>& m = m_src.m_map[bp >> 5];
+
+		for(list<Source*>::const_iterator i = m.begin(); i != m.end(); )
+		{
+			list<Source*>::const_iterator j = i++; // advance before a possible RemoveAt of *j
+
+			Source* s = *j;
+
+			if(GSUtil::HasSharedBits(bp, psm, s->m_TEX0.TBP0, s->m_TEX0.PSM))
+			{
+				m_src.RemoveAt(s);
+			}
+		}
+
+		uint32 bbp = bp + bw * 0x10;
+		if (bw >= 16 && bbp < 16384) {
+			// Detect half of the render target (fixes Snowblind engine games)
+			// Target Page (8KB) have always a width of 64 pixels
+			// Half of the Target is TBW/2 pages * 8KB / (1 block * 256B) = 0x10
+
+			const list<Source*>& m = m_src.m_map[bbp >> 5];
+
+			for(list<Source*>::const_iterator i = m.begin(); i != m.end(); )
+			{
+				list<Source*>::const_iterator j = i++;
+
+				Source* s = *j;
+
+				if(GSUtil::HasSharedBits(bbp, psm, s->m_TEX0.TBP0, s->m_TEX0.PSM))
+				{
+					m_src.RemoveAt(s);
+				}
+			}
+		}
+	}
+
+	GSVector4i r;
+
+	uint32* pages = (uint32*)m_temp; // the scratch buffer receives the page list, terminated by GSOffset::EOP
+
+	off->GetPages(rect, pages, &r);
+
+	bool found = false; // set when a non-target Source based exactly at bp gets invalidated below
+
+	for(const uint32* p = pages; *p != GSOffset::EOP; p++)
+	{
+		uint32 page = *p;
+
+		const list<Source*>& m = m_src.m_map[page];
+
+		for(list<Source*>::const_iterator i = m.begin(); i != m.end(); )
+		{
+			list<Source*>::const_iterator j = i++;
+
+			Source* s = *j;
+
+			if(GSUtil::HasSharedBits(psm, s->m_TEX0.PSM))
+			{
+				uint32* RESTRICT valid = s->m_valid;
+
+				bool b = bp == s->m_TEX0.TBP0;
+
+				if(!s->m_target)
+				{
+					if (m_disable_partial_invalidation && s->m_repeating) {
+						m_src.RemoveAt(s);
+					} else {
+						// Invalidate data of input texture
+						if(s->m_repeating)
+						{
+							// Note: very hot path on Snowblind engine games
+							vector<GSVector2i>& l = s->m_p2t[page];
+
+							for(vector<GSVector2i>::iterator k = l.begin(); k != l.end(); k++)
+							{
+								valid[k->x] &= k->y;
+							}
+						}
+						else
+						{
+							valid[page] = 0;
+						}
+
+						s->m_complete = false;
+
+						found |= b;
+					}
+				}
+				else
+				{
+					// render target used as input texture
+					// TODO
+
+					if(b)
+					{
+						m_src.RemoveAt(s);
+					}
+				}
+			}
+		}
+	}
+
+	if(!target) return; // the target lists are only processed when target is true
+
+	for(int type = 0; type < 2; type++)
+	{
+		for(list<Target*>::iterator i = m_dst[type].begin(); i != m_dst[type].end(); )
+		{
+			list<Target*>::iterator j = i++; // advance before a possible erase(j)
+
+			Target* t = *j;
+
+			// GH: (I think) this code is completely broken. Typical issue:
+			// EE writes an alpha channel into a 32 bits texture
+			// Results: the target is deleted (because HasCompatibleBits is false)
+			//
+			// Major issues are expected if the game tries to reuse the target
+			// If we dirty the RT, it will likely upload partially invalid data.
+			// (The color on the previous example)
+			if(GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t->m_TEX0.PSM))
+			{
+				if(!found && GSUtil::HasCompatibleBits(psm, t->m_TEX0.PSM))
+				{
+					GL_CACHE("TC: Dirty Target(%s) %d (0x%x)", to_string(type),
+								t->m_texture ? t->m_texture->GetID() : 0,
+								t->m_TEX0.TBP0);
+					t->m_dirty.push_back(GSDirtyRect(r, psm));
+					t->m_TEX0.TBW = bw;
+				}
+				else
+				{
+					m_dst[type].erase(j);
+					GL_CACHE("TC: Remove Target(%s) %d (0x%x)", to_string(type),
+								t->m_texture ? t->m_texture->GetID() : 0,
+								t->m_TEX0.TBP0);
+					delete t;
+					continue;
+				}
+			} else if (bp == t->m_TEX0.TBP0) {
+				// EE writes the ALPHA channel. Mark it as invalid for
+				// the texture cache. Otherwise it will generate a wrong
+				// hit on the texture cache.
+				// Game: Conflict - Desert Storm (flickering)
+				t->m_dirty_alpha = false;
+			}
+
+			// GH: Try to detect texture write that will overlap with a target buffer
+			if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM) && bp < t->m_TEX0.TBP0)
+			{
+				uint32 rowsize = bw * 8192;
+				uint32 offset = (uint32)((t->m_TEX0.TBP0 - bp) * 256);
+
+				if(rowsize > 0 && offset % rowsize == 0)
+				{
+					int y = GSLocalMemory::m_psm[psm].pgs.y * offset / rowsize;
+
+					if(r.bottom > y)
+					{
+						GL_CACHE("TC: Dirty After Target(%s) %d (0x%x)", to_string(type),
+								t->m_texture ? t->m_texture->GetID() : 0,
+								t->m_TEX0.TBP0);
+						// TODO: do not add this rect above too
+						t->m_dirty.push_back(GSDirtyRect(GSVector4i(r.left, r.top - y, r.right, r.bottom - y), psm));
+						t->m_TEX0.TBW = bw;
+						continue;
+					}
+				}
+			}
+
+			// FIXME: this code "fixes" black FMV issue with rule of rose.
+			// Code is completely hardcoded so maybe not the best solution. Besides I don't
+			// know the full impact of it.
+			// Let's keep this code for the future
+#if 0
+			if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM) && (t->m_TEX0.TBP0 + 0x200 == bp))
+			{
+				GL_CACHE("TC: Dirty in the middle of Target(%s) %d (0x%x)", to_string(type),
+						t->m_texture ? t->m_texture->GetID() : 0,
+						t->m_TEX0.TBP0);
+
+				uint32 rowsize = bw * 8192u;
+				uint32 offset = 0x200 * 256u;
+				int y = GSLocalMemory::m_psm[psm].pgs.y * offset / rowsize;
+
+				t->m_dirty.push_back(GSDirtyRect(GSVector4i(r.left, r.top + y, r.right, r.bottom + y), psm));
+				t->m_TEX0.TBW = bw;
+				continue;
+			}
+#endif
+		}
+	}
+}
+
+// Goal: retrieve the data from the GPU to the GS memory.
+// Called each time you want to read from the GS memory
+void GSTextureCache::InvalidateLocalMem(GSOffset* off, const GSVector4i& r)
+{
+	uint32 bp = off->bp; // NOTE(review): off is dereferenced unchecked here, whereas InvalidateVideoMem returns early on a NULL off -- confirm callers
+	uint32 psm = off->psm;
+	//uint32 bw = off->bw;
+
+	// No depth handling please.
+	if (psm == PSM_PSMZ32 || psm == PSM_PSMZ24 || psm == PSM_PSMZ16 || psm == PSM_PSMZ16S) {
+		GL_INS("ERROR: InvalidateLocalMem depth format isn't supported");
+		if (m_can_convert_depth) {
+			for(auto t : m_dst[DepthStencil]) {
+				if(GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t->m_TEX0.PSM)) {
+					// Read the full depth buffer for easy testing
+					Read(t, t->m_valid);
+				}
+			}
+		}
+		return;
+	}
+
+	// This is a shorter but potentially slower version of the below, commented out code.
+	// It works for all the games mentioned below and fixes a couple of other ones as well
+	// (Busen0: Wizardry and Chaos Legion).
+	// Also in a few games the below code ran the Grandia3 case when it shouldn't :p
+	for(list<Target*>::iterator i = m_dst[RenderTarget].begin(); i != m_dst[RenderTarget].end(); )
+	{
+		list<Target*>::iterator j = i++;
+
+		Target* t = *j;
+
+		if (t->m_TEX0.PSM != PSM_PSMZ32 && t->m_TEX0.PSM != PSM_PSMZ24 && t->m_TEX0.PSM != PSM_PSMZ16 && t->m_TEX0.PSM != PSM_PSMZ16S)
+		{
+			if(GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t->m_TEX0.PSM))
+			{
+				// GH Note: Read will do a StretchRect and then will swizzle data to the GS memory
+				// t->m_valid will do the full target texture whereas r.rintersect(t->m_valid) will be limited
+				// to the useful part for the transfer.
+				// 1/ Logically the intersection must be enough, except if we miss some call to InvalidateLocalMem
+				// or it needs the depth part too
+				// 2/ Read function is slow but I suspect the swizzle part to be costly. Maybe a compute shader
+				// that does the swizzle at the same time as the stretching could save CPU computation.
+
+				// note: r.rintersect breaks Wizardry and Chaos Legion
+				// Read(t, t->m_valid) works in all tested games but is very slow in GUST titles ><
+				if (GSTextureCache::m_disable_partial_invalidation) {
+					Read(t, r.rintersect(t->m_valid));
+				} else {
+					if (r.x == 0 && r.y == 0) // Full screen read?
+						Read(t, t->m_valid);
+					else // Block level read?
+						Read(t, r.rintersect(t->m_valid));
+				}
+			}
+		} else {
+			GL_INS("ERROR: InvalidateLocalMem target is a depth format");
+		}
+	}
+
+	//GSTextureCache::Target* rt2 = NULL;
+	//int ymin = INT_MAX;
+	//for(list<Target*>::iterator i = m_dst[RenderTarget].begin(); i != m_dst[RenderTarget].end(); )
+	//{
+	//	list<Target*>::iterator j = i++;
+
+	//	Target* t = *j;
+
+	//	if (t->m_TEX0.PSM != PSM_PSMZ32 && t->m_TEX0.PSM != PSM_PSMZ24 && t->m_TEX0.PSM != PSM_PSMZ16 && t->m_TEX0.PSM != PSM_PSMZ16S)
+	//	{
+	//		if(GSUtil::HasSharedBits(bp, psm, t->m_TEX0.TBP0, t->m_TEX0.PSM))
+	//		{
+	//			if(GSUtil::HasCompatibleBits(psm, t->m_TEX0.PSM))
+	//			{
+	//				Read(t, r.rintersect(t->m_valid));
+	//				return;
+	//			}
+	//			else if(psm == PSM_PSMCT32 && (t->m_TEX0.PSM == PSM_PSMCT16 || t->m_TEX0.PSM == PSM_PSMCT16S))
+	//			{
+	//				// ffx-2 riku changing to her default (shoots some reflecting glass at the end), 16-bit rt read as 32-bit
+	//				Read(t, GSVector4i(r.left, r.top, r.right, r.top + (r.bottom - r.top) * 2).rintersect(t->m_valid));
+	//				return;
+	//			}
+	//			else
+	//			{
+	//				if (psm == PSM_PSMT4HH && t->m_TEX0.PSM == PSM_PSMCT32)
+	//				{
+	//					// Silent Hill Origins shadows: Read 8 bit using only the HIGH bits (4 bit) texture as 32 bit.
+	//					Read(t, r.rintersect(t->m_valid));
+	//					return;
+	//				}
+	//				else
+	//				{
+	//					//printf("Trashing render target. We have a %d type texture and we are trying to write into a %d type texture\n", t->m_TEX0.PSM, psm);
+	//					m_dst[RenderTarget].erase(j);
+	//					delete t;
+	//				}
+	//			}
+	//		}
+
+	//		// Grandia3, FFX, FFX-2 pause menus. t->m_TEX0.TBP0 magic number checks because otherwise kills xs2 videos
+	//		if( (GSUtil::HasSharedBits(psm, t->m_TEX0.PSM) && (bp > t->m_TEX0.TBP0) )
+	//			&& ((t->m_TEX0.TBP0 == 0) || (t->m_TEX0.TBP0==3328) || (t->m_TEX0.TBP0==3584) ))
+	//		{
+	//			//printf("first : %d-%d child : %d-%d\n", psm, bp, t->m_TEX0.PSM, t->m_TEX0.TBP0);
+	//			uint32 rowsize = bw * 8192;
+	//			uint32 offset = (uint32)((bp - t->m_TEX0.TBP0) * 256);
+
+	//			if(rowsize > 0 && offset % rowsize == 0)
+	//			{
+	//				int y = GSLocalMemory::m_psm[psm].pgs.y * offset / rowsize;
+
+	//				if(y < ymin && y < 512)
+	//				{
+	//					rt2 = t;
+	//					ymin = y;
+	//				}
+	//			}
+	//		}
+	//	}
+	//}
+	//if(rt2)
+	//{
+	//	Read(rt2, GSVector4i(r.left, r.top + ymin, r.right, r.bottom + ymin));
+	//}
+
+
+	// TODO: ds
+}
+
+void GSTextureCache::IncAge()
+{
+	// Ages every cached surface by one step and evicts the ones that became too old.
+	// Sources expire after 3 steps when the source cache was used since the last call, 30 otherwise.
+	const int src_maxage = m_src.m_used ? 3 : 30;
+
+	// Walk m_surfaces: m_map[page] can't be used because Source* are duplicated on several pages.
+	hash_set<Source*>::iterator si = m_src.m_surfaces.begin();
+	while(si != m_src.m_surfaces.end())
+	{
+		Source* s = *si++; // step forward first: RemoveAt(s) below may invalidate the iterator to s
+		s->m_age++;
+		if(s->m_age > src_maxage) m_src.RemoveAt(s);
+	}
+
+	m_src.m_used = false;
+
+	// Clearing of render targets causes flickering in many scene transitions. Sigh, this seems to be
+	// used to invalidate surfaces. So use a huge max age to avoid flicker, but still invalidate
+	// surfaces (Disgaea 2 fmv when booting the game through the BIOS). The original max age was 4
+	// here; Xenosaga 2 needs at least 240, else it flickers on scene transitions.
+	const int dst_maxage = 400; // ffx intro scene changes leave the old image untouched for a couple of frames and only then start using it
+
+	for(int type = 0; type < 2; type++)
+	{
+		list<Target*>& targets = m_dst[type];
+		list<Target*>::iterator ti = targets.begin();
+
+		while(ti != targets.end())
+		{
+			list<Target*>::iterator cur = ti++;
+			Target* t = *cur;
+
+			// m_32_bits_fmt is used to detect the texture shuffle effect. There is a high
+			// probability that the game will do it on the current RT, so the flag is dropped
+			// for targets that already have a non-zero age, to avoid issues with games that
+			// use a 16 bits render target.
+			if(t->m_age > 0)
+			{
+				// GoW2 uses the effect at the start of the frame
+				t->m_32_bits_fmt = false;
+			}
+
+			t->m_age++;
+			if(t->m_age <= dst_maxage) continue;
+
+			targets.erase(cur);
+			GL_CACHE("TC: Remove Target(%s): %d (0x%x) due to age", to_string(type),
+						t->m_texture ? t->m_texture->GetID() : 0,
+						t->m_TEX0.TBP0);
+			delete t;
+		}
+	}
+}
+
+//Fixme: Several issues in here. Not handling depth stencil, pitch conversion doesnt work.
+GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* dst, bool half_right)
+{
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
+	Source* src = new Source(m_renderer, TEX0, TEXA, m_temp);
+
+	int tw = 1 << TEX0.TW;
+	int th = 1 << TEX0.TH;
+	//int tp = TEX0.TBW << 6;
+
+	bool hack = false;
+
+	if(m_spritehack && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT8H))
+	{
+		src->m_spritehack_t = true;
+
+		if(m_spritehack == 2 && TEX0.CPSM != PSM_PSMCT16)
+			src->m_spritehack_t = false;
+	}
+	else
+		src->m_spritehack_t = false;
+
+	if (dst)
+	{
+		// TODO: clean up this mess
+
+		int shader = dst->m_type != RenderTarget ? ShaderConvert_FLOAT32_TO_RGBA8 : ShaderConvert_COPY;
+		bool is_8bits = TEX0.PSM == PSM_PSMT8 && s_IS_OPENGL;
+
+		if (is_8bits) {
+			GL_INS("Reading RT as a packed-indexed 8 bits format");
+			shader = ShaderConvert_RGBA_TO_8I;
+		}
+
+#ifdef ENABLE_OGL_DEBUG
+		if (TEX0.PSM == PSM_PSMT4) {
+			GL_INS("ERROR: Reading RT as a packed-indexed 4 bits format is not supported");
+		}
+#endif
+
+		if (TEX0.PSM < PSM_PSMT8 || TEX0.PSM > PSM_PSMT4HH) {
+			src->m_32_bits_fmt = dst->m_32_bits_fmt;
+		}
+		src->m_target = true;
+
+		dst->Update();
+
+		GSTexture* tmp = NULL;
+
+		if (dst->m_texture->IsMSAA())
+		{
+			tmp = dst->m_texture;
+
+			dst->m_texture = m_renderer->m_dev->Resolve(dst->m_texture);
+		}
+
+
+		// do not round here!!! if edge becomes a black pixel and addressing mode is clamp => everything outside the clamped area turns into black (kh2 shadows)
+
+		int w = (int)(dst->m_texture->GetScale().x * tw);
+		int h = (int)(dst->m_texture->GetScale().y * th);
+		if (is_8bits) {
+			// Unscale 8 bits textures, quality won't be nice but format is really awful
+			w = tw;
+			h = th;
+		}
+
+		GSVector2i dstsize = dst->m_texture->GetSize();
+
+		// pitch conversion
+
+		if(dst->m_TEX0.TBW != TEX0.TBW) // && dst->m_TEX0.PSM == TEX0.PSM
+		{
+			// This is so broken :p
+			////Better not do the code below, "fixes" like every game that ever gets here..
+			////Edit: Ratchet and Clank needs this to show most of it's graphics at all.
+			////Someone else fix this please, I can't :p
+			////delete src; return NULL;
+
+			//// sfex3 uses this trick (bw: 10 -> 5, wraps the right side below the left)
+
+			//ASSERT(dst->m_TEX0.TBW > TEX0.TBW); // otherwise scale.x need to be reduced to make the larger texture fit (TODO)
+
+			//src->m_texture = m_renderer->m_dev->CreateRenderTarget(dstsize.x, dstsize.y, false);
+
+			//GSVector4 size = GSVector4(dstsize).xyxy();
+			//GSVector4 scale = GSVector4(dst->m_texture->GetScale()).xyxy();
+
+			//int blockWidth  = 64;
+			//int blockHeight = TEX0.PSM == PSM_PSMCT32 || TEX0.PSM == PSM_PSMCT24 ? 32 : 64;
+
+			//GSVector4i br(0, 0, blockWidth, blockHeight);
+
+			//int sw = (int)dst->m_TEX0.TBW << 6;
+
+			//int dw = (int)TEX0.TBW << 6;
+			//int dh = 1 << TEX0.TH;
+
+			//if(sw != 0)
+			//for(int dy = 0; dy < dh; dy += blockHeight)
+			//{
+			//	for(int dx = 0; dx < dw; dx += blockWidth)
+			//	{
+			//		int off = dy * dw / blockHeight + dx;
+
+			//		int sx = off % sw;
+			//		int sy = off / sw;
+
+			//		GSVector4 sRect = GSVector4(GSVector4i(sx, sy).xyxy() + br) * scale / size;
+			//		GSVector4 dRect = GSVector4(GSVector4i(dx, dy).xyxy() + br) * scale;
+
+			//		m_renderer->m_dev->StretchRect(dst->m_texture, sRect, src->m_texture, dRect);
+
+			//		// TODO: this is quite a lot of StretchRect, do it with one Draw
+			//	}
+			//}
+		}
+		else if(tw < 1024)
+		{
+			// FIXME: timesplitters blurs the render target by blending itself over a couple of times
+			hack = true;
+			//if(tw == 256 && th == 128 && (TEX0.TBP0 == 0 || TEX0.TBP0 == 0x00e00))
+			//{
+			//	delete src;
+			//	return NULL;
+			//}
+		}
+		// width/height conversion
+
+		GSVector2 scale = dst->m_texture->GetScale();
+
+		GSVector4 dRect(0, 0, w, h);
+
+		// Lengthy explanation of the rescaling code.
+		// Here an example in 2x:
+		// RT is 1280x1024 but only contains 512x448 valid data (so 256x224 pixels without upscaling)
+		//
+		// PS2 want to read it back as a 1024x1024 pixels (they don't care about the extra pixels)
+		// So in theory we need to shrink a 2048x2048 RT into a 1024x1024 texture. Obviously the RT is
+		// too small.
+		//
+		// So we will only limit the resize to the available data in RT.
+		// Therefore we will resize the RT from 1280x1024 to 1280x1024/2048x2048 % of the new texture
+		// size (which is 1280x1024) (i.e. 800x512)
+		// From the rendering point of view. UV coordinate will be normalized on the real GS texture size
+		// This way it can be used on an upscaled texture without extra scaling factor (only requirement is
+		// to have same proportion)
+		//
+		// FIXME: The scaling will create a bad offset. For example if texture coordinate start at 0.5 (pixel 0)
+		// At 2x it will become 0.5/128 * 256 = 1 (pixel 1)
+		// I think it is the purpose of the UserHacks_HalfPixelOffset below. However implementation is less
+		// than ideal.
+		// 1/ It suppose games have an half pixel offset on texture coordinate which could be wrong
+		// 2/ It doesn't support rescaling of the RT (tw = 1024)
+		// Maybe it will be more easy to just round the UV value in the Vertex Shader
+
+		if (!is_8bits) {
+			// 8 bits handling is special due to unscaling. It is better to not execute this code
+			if (w > dstsize.x)
+			{
+				scale.x = (float)dstsize.x / tw;
+				dRect.z = (float)dstsize.x * scale.x / dst->m_texture->GetScale().x;
+				w = dstsize.x;
+			}
+
+			if (h > dstsize.y)
+			{
+				scale.y = (float)dstsize.y / th;
+				dRect.w = (float)dstsize.y * scale.y / dst->m_texture->GetScale().y;
+				h = dstsize.y;
+			}
+		}
+
+		GSVector4 sRect(0, 0, w, h);
+
+		GSTexture* sTex = src->m_texture ? src->m_texture : dst->m_texture;
+		GSTexture* dTex = m_renderer->m_dev->CreateRenderTarget(w, h, false);
+
+		// GH: by default (m_paltex == 0) GSdx converts texture to the 32 bit format
+		// However it is different here. We want to reuse a Render Target as a texture.
+		// Because the texture is already on the GPU, CPU can't convert it.
+		if (psm.pal > 0) {
+			src->m_palette = m_renderer->m_dev->CreateTexture(256, 1);
+		}
+		// Disable linear filtering for various GS post-processing effect
+		// 1/ Palette is used to interpret the alpha channel of the RT as an index.
+		// Star Ocean 3 uses it to emulate a stencil buffer.
+		// 2/ Z formats are a bad idea to interpolate (discontinuties).
+		// 3/ 16 bits buffer is used to move data from a channel to another.
+		//
+		// I keep linear filtering for standard color even if I'm not sure that it is
+		// working correctly.
+		// Indeed, texture is reduced so you need to read all covered pixels (9 in 3x)
+		// to correctly interpolate the value. Linear interpolation is likely acceptable
+		// only in 2x scaling
+		//
+		// Src texture will still be bilinear interpolated so I'm really not sure
+		// that we need to do it here too.
+		//
+		// Future note: instead to do
+		// RT 2048x2048 -> T 1024x1024 -> RT 2048x2048
+		// We can maybe sample directly a bigger texture
+		// RT 2048x2048 -> T 2048x2048 -> RT 2048x2048
+		// Pro: better quality. Copy instead of StretchRect (must be faster)
+		// Cons: consume more memory
+		//
+		// In distant future: investigate to reuse the RT directly without any
+		// copy. Likely a speed boost and memory usage reduction.
+		bool linear = (TEX0.PSM == PSM_PSMCT32 || TEX0.PSM == PSM_PSMCT24);
+
+		if(!src->m_texture)
+		{
+			src->m_texture = dTex;
+		}
+
+		if ((sRect == dRect).alltrue() && !shader)
+		{
+			if (half_right) {
+				// You typically hit this code in snow engine game. Dstsize is the size of of Dx/GL RT
+				// which is arbitrary set to 1280 (biggest RT used by GS). h/w are based on the input texture
+				// so the only reliable way to find the real size of the target is to use the TBW value.
+				float real_width = dst->m_TEX0.TBW * 64u * dst->m_texture->GetScale().x;
+				m_renderer->m_dev->CopyRect(sTex, dTex, GSVector4i(real_width/2.0f, 0, real_width, h));
+			} else {
+				m_renderer->m_dev->CopyRect(sTex, dTex, GSVector4i(0, 0, w, h)); // <= likely wrong dstsize.x could be bigger than w
+			}
+		}
+		else
+		{
+			// Different size or not the same format
+			sRect.z /= sTex->GetWidth();
+			sRect.w /= sTex->GetHeight();
+
+			if (half_right) {
+				sRect.x = sRect.z/2.0f;
+			}
+
+			m_renderer->m_dev->StretchRect(sTex, sRect, dTex, dRect, shader, linear);
+		}
+
+		if(dTex != src->m_texture)
+		{
+			m_renderer->m_dev->Recycle(src->m_texture);
+
+			src->m_texture = dTex;
+		}
+
+		if( src->m_texture )
+			src->m_texture->SetScale(scale);
+		else
+			ASSERT(0);
+
+		if(tmp != NULL)
+		{
+			// tmp is the texture before a MultiSample resolve
+			m_renderer->m_dev->Recycle(dst->m_texture);
+
+			dst->m_texture = tmp;
+		}
+
+		// Offset hack. Can be enabled via GSdx options.
+		// The offset will be used in Draw().
+
+		float modx = 0.0f;
+		float mody = 0.0f;
+
+		if(UserHacks_HalfPixelOffset && hack)
+		{
+			switch(m_renderer->GetUpscaleMultiplier())
+			{
+			case 2:  modx = 2.2f; mody = 2.2f; dst->m_texture->LikelyOffset = true;  break;
+			case 3:  modx = 3.1f; mody = 3.1f; dst->m_texture->LikelyOffset = true;  break;
+			case 4:  modx = 4.2f; mody = 4.2f; dst->m_texture->LikelyOffset = true;  break;
+			case 5:  modx = 5.3f; mody = 5.3f; dst->m_texture->LikelyOffset = true;  break;
+			case 6:  modx = 6.2f; mody = 6.2f; dst->m_texture->LikelyOffset = true;  break;
+			case 8:  modx = 8.2f; mody = 8.2f; dst->m_texture->LikelyOffset = true;  break;
+			default: modx = 0.0f; mody = 0.0f; dst->m_texture->LikelyOffset = false; break;
+			}
+		}
+
+		dst->m_texture->OffsetHack_modx = modx;
+		dst->m_texture->OffsetHack_mody = mody;
+	}
+	else
+	{
+		if (m_paltex && psm.pal > 0)
+		{
+			src->m_texture = m_renderer->m_dev->CreateTexture(tw, th, Get8bitFormat());
+			src->m_palette = m_renderer->m_dev->CreateTexture(256, 1);
+		}
+		else
+			src->m_texture = m_renderer->m_dev->CreateTexture(tw, th);
+	}
+
+	if(src->m_texture == NULL)
+	{
+		ASSERT(0);
+		delete src;
+		return NULL;
+	}
+
+	if(psm.pal > 0)
+	{
+		memcpy(src->m_clut, (const uint32*)m_renderer->m_mem.m_clut, psm.pal * sizeof(uint32));
+	}
+
+	m_src.Add(src, TEX0, m_renderer->m_context->offset.tex);
+
+	return src;
+}
+
+// Creates a new Target of the given type (RenderTarget or DepthStencil) with
+// a w x h device texture and registers it at the front of m_dst[type].
+// Returns NULL (after asserting) when no texture could be created, which also
+// covers a type that is neither RenderTarget nor DepthStencil.
+GSTextureCache::Target* GSTextureCache::CreateTarget(const GIFRegTEX0& TEX0, int w, int h, int type)
+{
+	Target* target = new Target(m_renderer, TEX0, m_temp, CanConvertDepth());
+
+	// FIXME: initial data should be unswizzled from local mem in Update() if dirty
+
+	target->m_type = type;
+
+	switch(type)
+	{
+	case RenderTarget:
+		target->m_texture = m_renderer->m_dev->CreateRenderTarget(w, h, true);
+		target->m_used = true; // FIXME
+		break;
+
+	case DepthStencil:
+		target->m_texture = m_renderer->m_dev->CreateDepthStencil(w, h, true);
+		break;
+
+	default:
+		// No texture is created; the NULL check below rejects the target.
+		break;
+	}
+
+	if(!target->m_texture)
+	{
+		ASSERT(0);
+		delete target;
+		return NULL;
+	}
+
+	m_dst[type].push_front(target);
+
+	return target;
+}
+
+// Debug helper: sums GetMemUsage() over every cached source and target and
+// reports the totals (in MB) through GL_PERF.
+// The body is compiled only when ENABLE_OGL_DEBUG is defined.
+void GSTextureCache::PrintMemoryUsage()
+{
+#ifdef ENABLE_OGL_DEBUG
+	uint32 tex    = 0; // sources with m_target == false
+	uint32 tex_rt = 0; // sources with m_target == true
+	uint32 rt     = 0; // targets stored in m_dst[RenderTarget]
+	uint32 dss    = 0; // targets stored in m_dst[DepthStencil]
+
+	for(hash_set<Source*>::iterator it = m_src.m_surfaces.begin(); it != m_src.m_surfaces.end(); ++it) {
+		Source* src = *it;
+		if (!src)
+			continue;
+
+		uint32& total = src->m_target ? tex_rt : tex;
+		total += src->m_texture->GetMemUsage();
+	}
+
+	for(list<Target*>::iterator it = m_dst[RenderTarget].begin(); it != m_dst[RenderTarget].end(); ++it) {
+		Target* color = *it;
+		if (!color)
+			continue;
+
+		rt += color->m_texture->GetMemUsage();
+	}
+
+	for(list<Target*>::iterator it = m_dst[DepthStencil].begin(); it != m_dst[DepthStencil].end(); ++it) {
+		Target* depth = *it;
+		if (!depth)
+			continue;
+
+		dss += depth->m_texture->GetMemUsage();
+	}
+
+	GL_PERF("MEM: RO Tex %dMB. RW Tex %dMB. Target %dMB. Depth %dMB", tex >> 20u, tex_rt >> 20u, rt >> 20u, dss >> 20u);
+#endif
+}
+
+// GSTextureCache::Surface
+
+// Base constructor shared by Source and Target: stores the renderer and the
+// scratch buffer, starts with no texture and an age of zero.
+GSTextureCache::Surface::Surface(GSRenderer* r, uint8* temp)
+	: m_renderer(r), m_texture(NULL), m_age(0), m_temp(temp), m_32_bits_fmt(false)
+{
+	// NOTE(review): 0x3fff looks like a placeholder base pointer until the
+	// derived constructor assigns the real TEX0 - confirm.
+	m_TEX0.TBP0 = 0x3fff;
+}
+
+// Hands the surface's texture back to the device through Recycle().
+GSTextureCache::Surface::~Surface()
+{
+	GSTexture* owned = m_texture;
+
+	m_renderer->m_dev->Recycle(owned);
+}
+
+// Marks the surface as freshly used by resetting its age counter.
+void GSTextureCache::Surface::Update()
+{
+	// Age restarts from zero every time the surface is updated.
+	m_age = 0;
+}
+
+// GSTextureCache::Source
+
+// Builds an empty source for the given TEX0/TEXA registers.
+// Allocates the 256-entry CLUT copy (zero filled) and the 3-slot pending
+// write-rectangle buffer; both are released in the destructor.
+GSTextureCache::Source::Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint8* temp)
+	: Surface(r, temp)
+	, m_palette(NULL)
+	, m_initpalette(true)
+	, m_target(false)
+	, m_complete(false)
+	, m_spritehack_t(false)
+	, m_p2t(NULL)
+{
+	m_TEX0 = TEX0;
+	m_TEXA = TEXA;
+	m_repeating = m_TEX0.IsRepeating();
+
+	// No block has been uploaded yet.
+	memset(m_valid, 0, sizeof(m_valid));
+
+	const size_t clut_bytes = 256 * sizeof(uint32);
+
+	m_clut = (uint32*)_aligned_malloc(clut_bytes, 32);
+
+	memset(m_clut, 0, clut_bytes);
+
+	m_write.count = 0;
+	m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 32);
+
+	// The page-to-tile map is only fetched for repeating textures.
+	if(m_repeating)
+	{
+		m_p2t = r->m_mem.GetPage2TileMap(m_TEX0);
+	}
+}
+
+// Releases everything the constructor (or CreateSource) attached to the
+// source: the two aligned buffers and the palette texture.
+GSTextureCache::Source::~Source()
+{
+	_aligned_free(m_write.rect);
+
+	_aligned_free(m_clut);
+
+	m_renderer->m_dev->Recycle(m_palette);
+}
+
+// Uploads the blocks of `rect` that have not been uploaded yet.
+//
+// `rect` is first expanded to block boundaries. Each block inside it is
+// looked up in the m_valid bitmap; blocks not yet marked are marked, queued
+// through Write() and finally flushed to the texture. Nothing is done for
+// sources that are already complete or that are flagged m_target.
+void GSTextureCache::Source::Update(const GSVector4i& rect)
+{
+	Surface::Update();
+
+	if(m_complete || m_target)
+	{
+		return;
+	}
+
+	GSVector2i bs = GSLocalMemory::m_psm[m_TEX0.PSM].bs;
+
+	int tw = std::max<int>(1 << m_TEX0.TW, bs.x);
+	int th = std::max<int>(1 << m_TEX0.TH, bs.y);
+
+	GSVector4i r = rect.ralign<Align_Outside>(bs);
+
+	if(r.eq(GSVector4i(0, 0, tw, th)))
+	{
+		m_complete = true; // lame, but better than nothing
+	}
+
+	const GSOffset* off = m_renderer->m_context->offset.tex;
+
+	uint32 blocks = 0;
+
+	for(int y = r.top; y < r.bottom; y += bs.y)
+	{
+		uint32 base = off->block.row[y >> 3];
+
+		for(int x = r.left; x < r.right; x += bs.x)
+		{
+			uint32 block = base + off->block.col[x >> 3];
+
+			if(block >= MAX_BLOCKS)
+			{
+				continue;
+			}
+
+			// Repeating textures index the bitmap by texel position
+			// ((y * 128 + x) / 8); all others index it by block number.
+			uint32 bit = m_repeating ? (uint32)(((y << 7) + x) >> 3) : block;
+
+			uint32 row = bit >> 5;
+			uint32 col = 1 << (bit & 31);
+
+			if((m_valid[row] & col) != 0)
+			{
+				continue; // already uploaded
+			}
+
+			m_valid[row] |= col;
+
+			Write(GSVector4i(x, y, x + bs.x, y + bs.y));
+
+			blocks++;
+		}
+	}
+
+	if(blocks > 0)
+	{
+		m_renderer->m_perfmon.Put(GSPerfMon::Unswizzle, bs.x * bs.y * blocks << (m_palette ? 2 : 0));
+
+		Flush(m_write.count);
+	}
+}
+
+// Queues rectangle `r` for upload.
+//
+// The new rectangle is appended to the pending list and then repeatedly
+// merged with its predecessor while the two line up horizontally or
+// vertically. When more than two rectangles remain pending, the oldest one
+// is flushed so the 3-slot buffer never overflows.
+void GSTextureCache::Source::Write(const GSVector4i& r)
+{
+	m_write.rect[m_write.count] = r;
+	m_write.count++;
+
+	while(m_write.count >= 2)
+	{
+		GSVector4i& prev = m_write.rect[m_write.count - 2];
+		GSVector4i& last = m_write.rect[m_write.count - 1];
+
+		if((prev == last.zyxw()).mask() == 0xfff0)
+		{
+			prev.right = last.right; // extend right
+		}
+		else if((prev == last.xwzy()).mask() == 0xff0f)
+		{
+			prev.bottom = last.bottom; // extend down
+		}
+		else
+		{
+			break; // nothing left to merge
+		}
+
+		// The last rectangle has been folded into its predecessor.
+		m_write.count--;
+	}
+
+	if(m_write.count > 2)
+	{
+		Flush(1);
+	}
+}
+
+// Uploads the first `count` pending write rectangles from GS local memory
+// into m_texture and removes them from the pending list.
+//
+// A rectangle that extends past the texture size is decoded into the scratch
+// buffer and only its intersection with the texture is uploaded. Other
+// rectangles are decoded straight into the mapped texture when Map()
+// succeeds, or go through the scratch buffer otherwise.
+void GSTextureCache::Source::Flush(uint32 count)
+{
+	// This function as written will not work for paletted formats copied from framebuffers
+	// because they are 8 or 4 bit formats on the GS and the GS local memory module reads
+	// these into an 8 bit format while the D3D surfaces are 32 bit.
+	// However the function is never called for these cases.  This is just for information
+	// should someone wish to use this function for these cases later.
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM];
+
+	int tw = 1 << m_TEX0.TW;
+	int th = 1 << m_TEX0.TH;
+
+	GSVector4i tr(0, 0, tw, th);
+
+	int pitch = max(tw, psm.bs.x) * sizeof(uint32);
+
+	GSLocalMemory& mem = m_renderer->m_mem;
+
+	const GSOffset* off = m_renderer->m_context->offset.tex;
+
+	GSLocalMemory::readTexture rtx = psm.rtx;
+
+	GIFRegTEXA plainTEXA;
+
+	// Until DX is fixed
+	if (s_IS_OPENGL) {
+		plainTEXA = m_TEXA;
+	} else {
+		plainTEXA.AEM = 1;
+		plainTEXA.TA0 = 0;
+		plainTEXA.TA1 = 0x80;
+	}
+
+	if(m_palette)
+	{
+		// Paletted data is read with the rtxP reader and a pitch 4x smaller.
+		pitch >>= 2;
+		rtx = psm.rtxP;
+	}
+
+	uint8* buff = m_temp;
+
+	for(uint32 i = 0; i < count; i++)
+	{
+		GSVector4i r = m_write.rect[i];
+
+		if((r > tr).mask() & 0xff00)
+		{
+			// The rectangle is larger than the texture: decode all of it into
+			// the scratch buffer, upload only the part inside the texture.
+			// Decode with plainTEXA like the two branches below, so every
+			// rectangle of a texture gets the same alpha expansion (the
+			// original used m_TEXA here, which differs on the non-OpenGL path).
+			(mem.*rtx)(off, r, buff, pitch, plainTEXA);
+
+			m_texture->Update(r.rintersect(tr), buff, pitch);
+		}
+		else
+		{
+			GSTexture::GSMap m;
+
+			if(m_texture->Map(m, &r))
+			{
+				(mem.*rtx)(off, r, m.bits, m.pitch, plainTEXA);
+
+				m_texture->Unmap();
+			}
+			else
+			{
+				(mem.*rtx)(off, r, buff, pitch, plainTEXA);
+
+				m_texture->Update(r, buff, pitch);
+			}
+		}
+	}
+
+	if(count < m_write.count)
+	{
+		// Warning src and destination overlap. Memmove must be used instead of memcpy
+		memmove(&m_write.rect[0], &m_write.rect[count], (m_write.count - count) * sizeof(m_write.rect[0]));
+	}
+
+	m_write.count -= count;
+}
+
+// GSTextureCache::Target
+
+// Builds a target for the given TEX0 register. The type stays -1 until the
+// creator assigns it, and the valid area starts empty.
+GSTextureCache::Target::Target(GSRenderer* r, const GIFRegTEX0& TEX0, uint8* temp, bool depth_supported)
+	: Surface(r, temp)
+	, m_type(-1)
+	, m_used(false)
+	, m_depth_supported(depth_supported)
+{
+	m_TEX0 = TEX0;
+
+	m_valid = GSVector4i::zero();
+
+	// Formats whose PSM has bit 1 clear are flagged as 32 bits formats.
+	if((TEX0.PSM & 2) == 0)
+	{
+		m_32_bits_fmt = true;
+	}
+
+	// Every format except the two 24 bits ones starts with dirty alpha.
+	const bool is_24_bits = (TEX0.PSM == PSM_PSMCT24) || (TEX0.PSM == PSM_PSMZ24);
+
+	m_dirty_alpha = !is_24_bits;
+}
+
+// Refreshes the target from GS local memory.
+//
+// The accumulated dirty rectangles are collapsed into one rectangle (and
+// cleared). That area is read from local memory into a temporary texture,
+// which is then stretched into the target texture at the target's scale.
+// A depth target without depth support is cleared instead of being reloaded.
+void GSTextureCache::Target::Update()
+{
+	Surface::Update();
+
+	// FIXME: the union of the rects may also update wrong parts of the render target (but a lot faster :)
+	// GH: it must be doable
+	// 1/ rescale the new t to the good size
+	// 2/ copy each rectangle (rescale the rectangle) (use CopyRect or multiple vertex)
+	// Alternate
+	// 1/ uses multiple vertex rectangle
+
+	GSVector4i dirty = m_dirty.GetDirtyRectAndClear(m_TEX0, m_texture->GetSize());
+
+	if (dirty.rempty())
+	{
+		return;
+	}
+
+	const bool is_depth = (m_type == DepthStencil);
+
+	// No handling please
+	if (is_depth && !m_depth_supported) {
+		// do the most likely thing a direct write would do, clear it
+		GL_INS("ERROR: Update DepthStencil dummy");
+
+		if(!(m_renderer->m_game.flags & CRC::ZWriteMustNotClear))
+		{
+			m_renderer->m_dev->ClearDepth(m_texture, 0);
+		}
+
+		return;
+	}
+
+	int w = dirty.width();
+	int h = dirty.height();
+
+	GSTexture* staging = m_renderer->m_dev->CreateTexture(w, h);
+
+	if (!staging)
+	{
+		return;
+	}
+
+	GIFRegTEXA TEXA;
+
+	TEXA.AEM = 1;
+	TEXA.TA0 = 0;
+	TEXA.TA1 = 0x80;
+
+	const GSOffset* off = m_renderer->m_mem.GetOffset(m_TEX0.TBP0, m_TEX0.TBW, m_TEX0.PSM);
+
+	GSTexture::GSMap m;
+
+	if(staging->Map(m))
+	{
+		m_renderer->m_mem.ReadTexture(off, dirty, m.bits,  m.pitch, TEXA);
+
+		staging->Unmap();
+	}
+	else
+	{
+		// Mapping failed: go through the scratch buffer, rows padded to 4 pixels.
+		int pitch = ((w + 3) & ~3) * 4;
+
+		m_renderer->m_mem.ReadTexture(off, dirty, m_temp, pitch, TEXA);
+
+		staging->Update(dirty.rsize(), m_temp, pitch);
+	}
+
+	// m_renderer->m_perfmon.Put(GSPerfMon::Unswizzle, w * h * 4);
+
+	// Copy the new GS memory content into the destination texture.
+	if(m_type == RenderTarget)
+	{
+		GL_INS("ERROR: Update RenderTarget");
+
+		m_renderer->m_dev->StretchRect(staging, m_texture, GSVector4(dirty) * GSVector4(m_texture->GetScale()).xyxy());
+	}
+	else if(is_depth)
+	{
+		GL_INS("ERROR: Update DepthStencil");
+
+		// FIXME linear or not?
+		m_renderer->m_dev->StretchRect(staging, m_texture, GSVector4(dirty) * GSVector4(m_texture->GetScale()).xyxy(), ShaderConvert_RGBA8_TO_FLOAT32);
+	}
+
+	m_renderer->m_dev->Recycle(staging);
+}
+
+// GSTextureCache::SourceMap
+
+// Registers source `s` in the set of surfaces and in the per-page lookup map.
+//
+// A source flagged m_target is stored only under the page of its base
+// pointer. Any other source is stored under every page its texture touches;
+// those pages are collected in the m_pages scratch bitmap, which is left
+// cleared on return.
+void GSTextureCache::SourceMap::Add(Source* s, const GIFRegTEX0& TEX0, const GSOffset* off)
+{
+	m_surfaces.insert(s);
+
+	if(s->m_target)
+	{
+		// TODO
+
+		// GH: I don't know why but it seems we only consider the first page for a render target
+
+		m_map[TEX0.TBP0 >> 5].push_front(s);
+
+		return;
+	}
+
+	// Remaining code will compute a list of pages that are dirty (in a similar fashion as GSOffset::GetPages)
+	// (Maybe GetPages could be used instead, perf opt?)
+	// The source pointer will be stored/duplicated in all m_map[array of pages]
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
+
+	// Step by whole pages when the base pointer is page aligned, by blocks otherwise.
+	GSVector2i step = (TEX0.TBP0 & 31) != 0 ? psm.bs : psm.pgs;
+
+	int tw = 1 << TEX0.TW;
+	int th = 1 << TEX0.TH;
+
+	// Pass 1: mark every touched page in the scratch bitmap.
+	for(int y = 0; y < th; y += step.y)
+	{
+		uint32 base = off->block.row[y >> 3];
+
+		for(int x = 0; x < tw; x += step.x)
+		{
+			uint32 page = (base + off->block.col[x >> 3]) >> 5;
+
+			if(page >= MAX_PAGES)
+			{
+				continue;
+			}
+
+			m_pages[page >> 5] |= 1 << (page & 31);
+		}
+	}
+
+	// Pass 2: add the source to the list of each marked page and reset the bitmap.
+	for(size_t i = 0; i < countof(m_pages); i++)
+	{
+		uint32 bits = m_pages[i];
+
+		if(bits == 0)
+		{
+			continue;
+		}
+
+		m_pages[i] = 0;
+
+		list<Source*>* bucket = &m_map[i << 5];
+
+		unsigned long j;
+
+		while(_BitScanForward(&j, bits))
+		{
+			bits ^= 1 << j;
+
+			bucket[j].push_front(s);
+		}
+	}
+}
+
+// Destroys every registered source and empties both the surface set and all
+// per-page lists.
+void GSTextureCache::SourceMap::RemoveAll()
+{
+	for_each(m_surfaces.begin(), m_surfaces.end(), delete_object());
+
+	m_surfaces.clear();
+
+	// The per-page lists now only hold stale pointers: drop them all.
+	list<Source*>* const last = m_map + countof(m_map);
+
+	for(list<Source*>* bucket = m_map; bucket != last; ++bucket)
+	{
+		bucket->clear();
+	}
+}
+
+// Unregisters source `s` from the surface set and from the per-page lists,
+// then deletes it.
+//
+// A source flagged m_target is searched only in the page of its base pointer;
+// any other source is searched in every page from its base pointer's page up
+// to the last one. At most one entry is removed per page list.
+void GSTextureCache::SourceMap::RemoveAt(Source* s)
+{
+	m_surfaces.erase(s);
+
+	GL_CACHE("TC: Remove Src Texture: %d (0x%x)",
+				s->m_texture ? s->m_texture->GetID() : 0,
+				s->m_TEX0.TBP0);
+
+	// Source (except render target) is duplicated for each page they use.
+	const size_t first = s->m_TEX0.TBP0 >> 5;
+	const size_t last = s->m_target ? first : countof(m_map) - 1;
+
+	for(size_t page = first; page <= last; page++)
+	{
+		list<Source*>& bucket = m_map[page];
+
+		for(list<Source*>::iterator it = bucket.begin(); it != bucket.end(); ++it)
+		{
+			if(*it == s)
+			{
+				// The iterator is not used after the erase.
+				bucket.erase(it);
+
+				break;
+			}
+		}
+	}
+
+	delete s;
+}
diff --git a/plugins/GSdx_legacy/GSTextureCache.h b/plugins/GSdx_legacy/GSTextureCache.h
new file mode 100644
index 0000000000..dbd9a430eb
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache.h
@@ -0,0 +1,159 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+#include "GSDirtyRect.h"
+
+// Cache of device textures used by the hardware renderers.
+// It tracks two kinds of surfaces: Sources (textures that are sampled) and
+// Targets (render targets and depth/stencil buffers). The device specific
+// subclasses supply Get8bitFormat() and Read().
+class GSTextureCache
+{
+public:
+	// Target type; also used as the index into m_dst[].
+	enum {RenderTarget, DepthStencil};
+
+	// State shared by Source and Target.
+	class Surface : public GSAlignedClass<32>
+	{
+	protected:
+		GSRenderer* m_renderer;
+
+	public:
+		GSTexture* m_texture; // recycled through the device in the destructor
+		GIFRegTEX0 m_TEX0;
+		GIFRegTEXA m_TEXA;
+		int m_age; // reset to 0 by Update()
+		uint8* m_temp; // scratch buffer passed in by the creator (not freed by the surface code shown here)
+		bool m_32_bits_fmt; // Allow to detect the casting of 32 bits as 16 bits texture
+
+	public:
+		Surface(GSRenderer* r, uint8* temp);
+		virtual ~Surface();
+
+		virtual void Update();
+	};
+
+	// A texture that is sampled by the GS.
+	class Source : public Surface
+	{
+		// Pending upload rectangles; the constructor allocates room for 3.
+		struct {GSVector4i* rect; uint32 count;} m_write;
+
+		// Queues a rectangle, merging it with the previous one when adjacent.
+		void Write(const GSVector4i& r);
+		// Uploads the first `count` pending rectangles to m_texture.
+		void Flush(uint32 count);
+
+	public:
+		GSTexture* m_palette; // recycled through the device in the destructor
+		bool m_initpalette;
+		uint32 m_valid[MAX_PAGES]; // each uint32 bits map to the 32 blocks of that page
+		uint32* m_clut; // copy of the CLUT, 256 entries allocated in the constructor
+		bool m_target; // NOTE(review): presumably true when the source was created from a Target - confirm
+		bool m_complete; // set once an update covered the whole texture
+		bool m_repeating; // value of m_TEX0.IsRepeating() at construction
+		bool m_spritehack_t;
+		vector<GSVector2i>* m_p2t; // page to tile map, only set for repeating textures
+
+	public:
+		Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint8* temp);
+		virtual ~Source();
+
+		// Uploads the not yet valid blocks covered by `rect`.
+		virtual void Update(const GSVector4i& rect);
+	};
+
+	// A render target or depth/stencil buffer.
+	class Target : public Surface
+	{
+	public:
+		int m_type; // RenderTarget or DepthStencil; -1 until assigned
+		bool m_used;
+		GSDirtyRectList m_dirty; // areas to reload from local memory in Update()
+		GSVector4i m_valid;
+		bool m_depth_supported;
+		bool m_dirty_alpha;
+
+	public:
+		Target(GSRenderer* r, const GIFRegTEX0& TEX0, uint8* temp, bool depth_supported);
+
+		// Reloads the dirty area of the target from GS local memory.
+		virtual void Update();
+	};
+
+	// Set of all sources plus a per-page index of them.
+	class SourceMap
+	{
+	public:
+		hash_set<Source*> m_surfaces; // every registered source
+		list<Source*> m_map[MAX_PAGES]; // sources registered under each page
+		uint32 m_pages[16]; // bitmap of all pages
+		bool m_used;
+
+		SourceMap() : m_used(false) {memset(m_pages, 0, sizeof(m_pages));}
+
+		void Add(Source* s, const GIFRegTEX0& TEX0, const GSOffset* off);
+		void RemoveAll();
+		void RemovePartial();
+		// Unregisters and deletes `s`.
+		void RemoveAt(Source* s);
+	};
+
+protected:
+	GSRenderer* m_renderer;
+	SourceMap m_src;
+	list<Target*> m_dst[2]; // indexed by target type (RenderTarget, DepthStencil)
+	bool m_paltex;
+	int m_spritehack;
+	bool m_preload_frame;
+	uint8* m_temp; // scratch buffer handed to every Source and Target
+	bool m_can_convert_depth;
+	int m_crc_hack_level;
+	static bool m_disable_partial_invalidation;
+
+	virtual Source* CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* t = NULL, bool half_right = false);
+	virtual Target* CreateTarget(const GIFRegTEX0& TEX0, int w, int h, int type);
+
+	// Device specific texture format used for 8 bits paletted textures.
+	virtual int Get8bitFormat() = 0;
+
+	// TODO: virtual void Write(Source* s, const GSVector4i& r) = 0;
+	// TODO: virtual void Write(Target* t, const GSVector4i& r) = 0;
+	// Read() is protected here, or public (below) when DISABLE_HW_TEXTURE_CACHE is defined.
+#ifndef DISABLE_HW_TEXTURE_CACHE
+	virtual void Read(Target* t, const GSVector4i& r) = 0;
+#endif
+
+	virtual bool CanConvertDepth() { return m_can_convert_depth; }
+
+public:
+	GSTextureCache(GSRenderer* r);
+	virtual ~GSTextureCache();
+#ifdef DISABLE_HW_TEXTURE_CACHE
+	virtual void Read(Target* t, const GSVector4i& r) = 0;
+#endif
+	void RemoveAll();
+	void RemovePartial();
+
+	Source* LookupSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r);
+	Target* LookupTarget(const GIFRegTEX0& TEX0, int w, int h, int type, bool used);
+	Target* LookupTarget(const GIFRegTEX0& TEX0, int w, int h, int real_h);
+
+	void InvalidateVideoMemType(int type, uint32 bp);
+	void InvalidateVideoMem(GSOffset* off, const GSVector4i& r, bool target = true);
+	void InvalidateLocalMem(GSOffset* off, const GSVector4i& r);
+
+	void IncAge();
+	bool UserHacks_HalfPixelOffset;
+
+	// Human readable name of a target type, for logging.
+	const char* to_string(int type) {
+		return (type == DepthStencil) ? "Depth" : "Color";
+	}
+
+	void PrintMemoryUsage();
+};
diff --git a/plugins/GSdx_legacy/GSTextureCache11.cpp b/plugins/GSdx_legacy/GSTextureCache11.cpp
new file mode 100644
index 0000000000..72c4251935
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache11.cpp
@@ -0,0 +1,99 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "StdAfx.h"
+#include "GSTextureCache11.h"
+
+// GSTextureCache11
+
+// Nothing to set up beyond the GSTextureCache base.
+GSTextureCache11::GSTextureCache11(GSRenderer* r) : GSTextureCache(r)
+{
+}
+
+// Writes area `r` of render target `t` back into GS local memory.
+//
+// Only render targets in the CT32/CT24/CT16/CT16S formats are handled.
+// Nothing is done while the target still has dirty rectangles or when `r` is
+// empty. The area is copied to an offscreen texture (R16_UINT for the 16 bits
+// formats, R8G8B8A8_UNORM otherwise), mapped, and written with the pixel
+// writer matching the format.
+void GSTextureCache11::Read(Target* t, const GSVector4i& r)
+{
+	// TODO: other target types are not read back
+	if(t->m_type != RenderTarget)
+	{
+		return;
+	}
+
+	const GIFRegTEX0& TEX0 = t->m_TEX0;
+
+	const bool is_16_bits = TEX0.PSM == PSM_PSMCT16 || TEX0.PSM == PSM_PSMCT16S;
+	const bool is_24_32_bits = TEX0.PSM == PSM_PSMCT32 || TEX0.PSM == PSM_PSMCT24;
+
+	if(!is_24_32_bits && !is_16_bits)
+	{
+		//ASSERT(0);
+
+		return;
+	}
+
+	if (!t->m_dirty.empty())
+	{
+		return;
+	}
+
+	if (r.width() == 0 && r.height() == 0)
+	{
+		return;
+	}
+
+	// printf("GSRenderTarget::Read %d,%d - %d,%d (%08x)\n", r.left, r.top, r.right, r.bottom, TEX0.TBP0);
+
+	int w = r.width();
+	int h = r.height();
+
+	// Source rectangle, scaled and normalised by the texture size.
+	GSVector4 src = GSVector4(r) * GSVector4(t->m_texture->GetScale()).xyxy() / GSVector4(t->m_texture->GetSize()).xyxy();
+
+	DXGI_FORMAT format = is_16_bits ? DXGI_FORMAT_R16_UINT : DXGI_FORMAT_R8G8B8A8_UNORM;
+
+	GSTexture* offscreen = m_renderer->m_dev->CopyOffscreen(t->m_texture, src, w, h, format);
+
+	if(!offscreen)
+	{
+		return;
+	}
+
+	GSTexture::GSMap m;
+
+	if(offscreen->Map(m))
+	{
+		// TODO: block level write
+
+		GSOffset* off = m_renderer->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+		switch(TEX0.PSM)
+		{
+		case PSM_PSMCT32:
+			m_renderer->m_mem.WritePixel32(m.bits, m.pitch, off, r);
+			break;
+		case PSM_PSMCT24:
+			m_renderer->m_mem.WritePixel24(m.bits, m.pitch, off, r);
+			break;
+		case PSM_PSMCT16:
+		case PSM_PSMCT16S:
+			m_renderer->m_mem.WritePixel16(m.bits, m.pitch, off, r);
+			break;
+		default:
+			ASSERT(0);
+		}
+
+		offscreen->Unmap();
+	}
+
+	m_renderer->m_dev->Recycle(offscreen);
+}
+
diff --git a/plugins/GSdx_legacy/GSTextureCache11.h b/plugins/GSdx_legacy/GSTextureCache11.h
new file mode 100644
index 0000000000..d110dbe156
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache11.h
@@ -0,0 +1,38 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTextureCache.h"
+#include "GSDevice11.h"
+
+// GSTextureCache specialisation that uses DXGI formats (see GSDevice11.h).
+class GSTextureCache11 : public GSTextureCache
+{
+protected:
+	// 8 bits paletted textures are stored as A8_UNORM.
+	int Get8bitFormat() {return DXGI_FORMAT_A8_UNORM;}
+
+	// Writes an area of a render target back into GS local memory.
+	void Read(Target* t, const GSVector4i& r);
+
+	// Depth conversion is reported as unsupported by this cache.
+	virtual bool CanConvertDepth() { return false; }
+
+public:
+	GSTextureCache11(GSRenderer* r);
+};
diff --git a/plugins/GSdx_legacy/GSTextureCache9.cpp b/plugins/GSdx_legacy/GSTextureCache9.cpp
new file mode 100644
index 0000000000..1820efc2ad
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache9.cpp
@@ -0,0 +1,97 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "StdAfx.h"
+#include "GSTextureCache9.h"
+
+// GSTextureCache9
+
+// Nothing to set up beyond the GSTextureCache base.
+GSTextureCache9::GSTextureCache9(GSRenderer* r) : GSTextureCache(r)
+{
+}
+
+// Writes area `r` of render target `t` back into GS local memory.
+//
+// Only render targets in the CT32/CT24/CT16/CT16S formats are handled.
+// Nothing is done while the target still has dirty rectangles or when `r` is
+// empty. The area is copied to an offscreen texture, mapped, and written with
+// the writer matching the format (WriteFrame16 for the 16 bits formats).
+void GSTextureCache9::Read(Target* t, const GSVector4i& r)
+{
+	// TODO: other target types are not read back
+	if(t->m_type != RenderTarget)
+	{
+		return;
+	}
+
+	const GIFRegTEX0& TEX0 = t->m_TEX0;
+
+	const bool is_16_bits = TEX0.PSM == PSM_PSMCT16 || TEX0.PSM == PSM_PSMCT16S;
+	const bool is_24_32_bits = TEX0.PSM == PSM_PSMCT32 || TEX0.PSM == PSM_PSMCT24;
+
+	if(!is_24_32_bits && !is_16_bits)
+	{
+		//ASSERT(0);
+
+		return;
+	}
+
+	if (!t->m_dirty.empty())
+	{
+		return;
+	}
+
+	if (r.width() == 0 && r.height() == 0)
+	{
+		return;
+	}
+
+	// printf("GSRenderTarget::Read %d,%d - %d,%d (%08x)\n", r.left, r.top, r.right, r.bottom, TEX0.TBP0);
+
+	int w = r.width();
+	int h = r.height();
+
+	// Source rectangle, scaled and normalised by the texture size.
+	GSVector4 src = GSVector4(r) * GSVector4(t->m_texture->GetScale()).xyxy() / GSVector4(t->m_texture->GetSize()).xyxy();
+
+	GSTexture* offscreen = m_renderer->m_dev->CopyOffscreen(t->m_texture, src, w, h);
+
+	if(!offscreen)
+	{
+		return;
+	}
+
+	GSTexture::GSMap m;
+
+	if(offscreen->Map(m))
+	{
+		// TODO: block level write
+
+		GSOffset* off = m_renderer->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+		switch(TEX0.PSM)
+		{
+		case PSM_PSMCT32:
+			m_renderer->m_mem.WritePixel32(m.bits, m.pitch, off, r);
+			break;
+		case PSM_PSMCT24:
+			m_renderer->m_mem.WritePixel24(m.bits, m.pitch, off, r);
+			break;
+		case PSM_PSMCT16:
+		case PSM_PSMCT16S:
+			m_renderer->m_mem.WriteFrame16(m.bits, m.pitch, off, r);
+			break;
+		default:
+			ASSERT(0);
+		}
+
+		offscreen->Unmap();
+	}
+
+	m_renderer->m_dev->Recycle(offscreen);
+}
+
diff --git a/plugins/GSdx_legacy/GSTextureCache9.h b/plugins/GSdx_legacy/GSTextureCache9.h
new file mode 100644
index 0000000000..1fbf701860
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCache9.h
@@ -0,0 +1,38 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTextureCache.h"
+#include "GSDevice9.h"
+
+// GSTextureCache specialisation that uses D3DFMT formats (see GSDevice9.h).
+class GSTextureCache9 : public GSTextureCache
+{
+protected:
+	// 8 bits paletted textures are stored as D3DFMT_A8.
+	int Get8bitFormat() {return D3DFMT_A8;}
+
+	// Writes an area of a render target back into GS local memory.
+	void Read(Target* t, const GSVector4i& r);
+
+	// Depth conversion is reported as unsupported by this cache.
+	virtual bool CanConvertDepth() { return false; }
+
+public:
+	GSTextureCache9(GSRenderer* r);
+};
diff --git a/plugins/GSdx_legacy/GSTextureCacheOGL.cpp b/plugins/GSdx_legacy/GSTextureCacheOGL.cpp
new file mode 100644
index 0000000000..ee89c6966d
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCacheOGL.cpp
@@ -0,0 +1,131 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureCacheOGL.h"
+
+// Hands the owning renderer to the GSTextureCache base; no state of its own is set up.
+GSTextureCacheOGL::GSTextureCacheOGL(GSRenderer* r)
+	: GSTextureCache(r)
+{}
+
+// Reads rectangle r of target t back through the device's CopyOffscreen path and
+// writes the pixels into GS local memory (m_renderer->m_mem). Does nothing if t has
+// dirty regions, if r has no area, or if TEX0.PSM is not handled by the switch below.
+void GSTextureCacheOGL::Read(Target* t, const GSVector4i& r)
+{
+	// Zero width OR zero height is already an empty area: nothing to read back.
+	if (!t->m_dirty.empty() || r.width() == 0 || r.height() == 0)
+		return;
+
+	const GIFRegTEX0& TEX0 = t->m_TEX0;
+
+	GLuint fmt; // texture format requested from CopyOffscreen
+	int ps_shader; // shader index handed to CopyOffscreen
+	switch (TEX0.PSM)
+	{
+		case PSM_PSMCT32:
+		case PSM_PSMCT24:
+			fmt = GL_RGBA8;
+			ps_shader = 0;
+			break;
+
+		case PSM_PSMCT16:
+		case PSM_PSMCT16S:
+			fmt = GL_R16UI;
+			ps_shader = 1;
+			break;
+
+		case PSM_PSMZ32:
+		case PSM_PSMZ24:
+			fmt = GL_R32UI;
+			ps_shader = 10;
+			break;
+
+		case PSM_PSMZ16:
+		case PSM_PSMZ16S:
+			fmt = GL_R16UI;
+			ps_shader = 10;
+			break;
+
+		default:
+			return;
+	}
+
+	// Yes lots of logging, but I'm not confident with this code
+	GL_PUSH("Texture Cache Read. Format(0x%x)", TEX0.PSM);
+
+	GL_PERF("TC: Read Back Target: %d (0x%x)[fmt: 0x%x]. Size %dx%d",
+			t->m_texture->GetID(), TEX0.TBP0, TEX0.PSM, r.width(), r.height());
+
+	// r multiplied by the texture scale and divided by the texture size: the source rectangle for CopyOffscreen
+	GSVector4 src = GSVector4(r) * GSVector4(t->m_texture->GetScale()).xyxy() / GSVector4(t->m_texture->GetSize()).xyxy();
+
+	if(GSTexture* offscreen = m_renderer->m_dev->CopyOffscreen(t->m_texture, src, r.width(), r.height(), fmt, ps_shader))
+	{
+		GSTexture::GSMap m;
+		GSVector4i r_offscreen(0, 0, r.width(), r.height());
+
+		if(offscreen->Map(m, &r_offscreen))
+		{
+			// TODO: block level write
+
+			GSOffset* off = m_renderer->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+
+			switch(TEX0.PSM)
+			{
+				case PSM_PSMCT32:
+					m_renderer->m_mem.WritePixel32(m.bits, m.pitch, off, r);
+					break;
+				case PSM_PSMCT24:
+					m_renderer->m_mem.WritePixel24(m.bits, m.pitch, off, r);
+					break;
+				case PSM_PSMCT16:
+				case PSM_PSMCT16S:
+					m_renderer->m_mem.WritePixel16(m.bits, m.pitch, off, r);
+					break;
+
+				case PSM_PSMZ32:
+					m_renderer->m_mem.WritePixel32(m.bits, m.pitch, off, r);
+					break;
+				case PSM_PSMZ24:
+					m_renderer->m_mem.WritePixel24(m.bits, m.pitch, off, r);
+					break;
+				case PSM_PSMZ16:
+				case PSM_PSMZ16S:
+					m_renderer->m_mem.WritePixel16(m.bits, m.pitch, off, r);
+					break;
+
+				default:
+					ASSERT(0);
+			}
+
+			offscreen->Unmap();
+		}
+
+		// FIXME invalidate data
+		m_renderer->m_dev->Recycle(offscreen);
+	}
+
+	GL_POP();
+}
+
diff --git a/plugins/GSdx_legacy/GSTextureCacheOGL.h b/plugins/GSdx_legacy/GSTextureCacheOGL.h
new file mode 100644
index 0000000000..840826a39b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCacheOGL.h
@@ -0,0 +1,37 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTextureCache.h"
+#include "GSDeviceOGL.h"
+
+class GSTextureCacheOGL final : public GSTextureCache // OpenGL specialisation of GSTextureCache
+{
+protected:
+	int Get8bitFormat() { return GL_R8;} // format constant reported for 8-bit textures
+	// Copies rectangle r of target t back into GS local memory (defined in GSTextureCacheOGL.cpp).
+	void Read(Target* t, const GSVector4i& r);
+
+public:
+	GSTextureCacheOGL(GSRenderer* r);
+};
diff --git a/plugins/GSdx_legacy/GSTextureCacheSW.cpp b/plugins/GSdx_legacy/GSTextureCacheSW.cpp
new file mode 100644
index 0000000000..8d1a2ba6f3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCacheSW.cpp
@@ -0,0 +1,377 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureCacheSW.h"
+
+// Keeps the GSState pointer that is later handed to every Texture created by Lookup().
+GSTextureCacheSW::GSTextureCacheSW(GSState* state)
+	: m_state(state)
+{}
+
+GSTextureCacheSW::~GSTextureCacheSW()
+{
+	RemoveAll(); // deletes every cached Texture and clears the per-page lists
+}
+
+// Returns the cached texture for TEX0, creating and registering a new one on a miss.
+// Entries are matched on TBP0/TBW/PSM/TW/TH, on TEXA where it matters for the format,
+// and on tw0 when it is non-zero.
+// A hit is moved to the front of its page list and has its age reset to zero.
+GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0)
+{
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM];
+
+	// TEXA is only compared when psm.trbpp is 16 or 24 and TEX0.TCC is set
+	const bool texa_matters = (psm.trbpp == 16 || psm.trbpp == 24) && TEX0.TCC;
+
+	// candidates are kept in the list of the page that TBP0 falls into
+	list<Texture*>& bucket = m_map[TEX0.TBP0 >> 5];
+
+	for(list<Texture*>::iterator it = bucket.begin(); it != bucket.end(); ++it)
+	{
+		Texture* cand = *it;
+
+		// TBP0 TBW PSM TW TH
+		const uint32 diff = (TEX0.u32[0] ^ cand->m_TEX0.u32[0]) | ((TEX0.u32[1] ^ cand->m_TEX0.u32[1]) & 3);
+
+		if(diff != 0) continue;
+
+		if(texa_matters && TEXA != cand->m_TEXA)
+		{
+			continue;
+		}
+
+		// tw0 == 0 means the caller accepts whatever buffer width the entry has
+		if(tw0 != 0 && cand->m_tw != tw0) continue;
+
+		bucket.splice(bucket.begin(), bucket, it);
+
+		cand->m_age = 0;
+
+		return cand;
+	}
+
+	// miss: build a new entry and link it into every page it covers
+	Texture* created = new Texture(m_state, tw0, TEX0, TEXA);
+
+	m_textures.insert(created);
+
+	for(const uint32* p = created->m_pages.n; *p != GSOffset::EOP; p++)
+	{
+		m_map[*p].push_front(created);
+	}
+
+	return created;
+}
+
+// Walks the EOP-terminated page list and, for every cached texture on those pages
+// whose format shares bits with psm, clears the affected m_valid bits and marks
+// the texture as no longer complete.
+void GSTextureCacheSW::InvalidatePages(const uint32* pages, uint32 psm)
+{
+	for(; *pages != GSOffset::EOP; pages++)
+	{
+		const uint32 page = *pages;
+		const list<Texture*>& bucket = m_map[page];
+
+		for(list<Texture*>::const_iterator it = bucket.begin(); it != bucket.end(); ++it)
+		{
+			Texture* tex = *it;
+
+			if(!GSUtil::HasSharedBits(psm, tex->m_sharedbits)) continue;
+
+			uint32* RESTRICT valid = tex->m_valid;
+
+			if(!tex->m_repeating)
+			{
+				valid[page] = 0;
+			}
+			else
+			{
+				// each entry of m_p2t[page] is an (m_valid index, mask to keep) pair
+				vector<GSVector2i>& tiles = tex->m_p2t[page];
+
+				for(vector<GSVector2i>::iterator k = tiles.begin(); k != tiles.end(); ++k)
+					valid[k->x] &= k->y;
+			}
+
+			tex->m_complete = false;
+		}
+	}
+}
+
+// Deletes every cached texture, then empties the texture set and all per-page lists.
+void GSTextureCacheSW::RemoveAll()
+{
+	for_each(m_textures.begin(), m_textures.end(), delete_object());
+	m_textures.clear();
+
+	list<Texture*>* const end = m_map + MAX_PAGES;
+
+	for(list<Texture*>* bucket = m_map; bucket != end; ++bucket)
+		bucket->clear();
+}
+
+// Ages every cached texture by one step. A texture whose age goes above 10 is removed
+// from the texture set and from the list of each page it covers, then deleted.
+void GSTextureCacheSW::IncAge()
+{
+	hash_set<Texture*>::iterator next = m_textures.begin();
+
+	while(next != m_textures.end())
+	{
+		// advance before a possible erase so the loop never touches an erased iterator
+		hash_set<Texture*>::iterator cur = next++;
+
+		Texture* tex = *cur;
+
+		if(++tex->m_age <= 10) continue;
+
+		m_textures.erase(cur);
+
+		for(const uint32* p = tex->m_pages.n; *p != GSOffset::EOP; p++)
+		{
+			list<Texture*>& bucket = m_map[*p];
+			list<Texture*>::iterator pos = std::find(bucket.begin(), bucket.end(), tex);
+
+			if(pos != bucket.end()) bucket.erase(pos); // first occurrence only
+		}
+
+		delete tex;
+	}
+}
+
+//
+
+// Builds a cache entry for TEX0/TEXA: stores the registers, looks up the GSOffset and
+// the EOP-terminated page list covering the (1 << TW) x (1 << TH) texture, and mirrors
+// that list into the m_pages.bm bitmap. No pixels are read and m_buff stays NULL here.
+GSTextureCacheSW::Texture::Texture(GSState* state, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
+	: m_state(state)
+	, m_buff(NULL)
+	, m_tw(tw0)
+	, m_age(0)
+	, m_complete(false)
+	, m_p2t(NULL)
+{
+	m_TEX0 = TEX0;
+	m_TEXA = TEXA;
+
+	if(m_tw == 0)
+	{
+		m_tw = std::max<int>(m_TEX0.TW, GSLocalMemory::m_psm[m_TEX0.PSM].pal == 0 ? 3 : 5); // makes one row 32 bytes at least, matches the smallest block size that is allocated for m_buff
+	}
+
+	memset(m_valid, 0, sizeof(m_valid));
+	memset(m_pages.bm, 0, sizeof(m_pages.bm));
+
+	m_sharedbits = GSUtil::HasSharedBitsPtr(m_TEX0.PSM);
+	m_offset = m_state->m_mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
+	m_pages.n = m_offset->GetPages(GSVector4i(0, 0, 1 << TEX0.TW, 1 << TEX0.TH));
+
+	for(const uint32* p = m_pages.n; *p != GSOffset::EOP; p++)
+	{
+		// unsigned literal: bit 31 must not be produced by shifting a signed int
+		m_pages.bm[*p >> 5] |= 1u << (*p & 31);
+	}
+
+	m_repeating = m_TEX0.IsRepeating(); // repeating mode always works, it is just slightly slower
+
+	if(m_repeating)
+	{
+		m_p2t = m_state->m_mem.GetPage2TileMap(m_TEX0);
+	}
+}
+
+// Frees the aligned pixel buffer, if Update() ever allocated one, and the page
+// list obtained from GetPages() in the constructor.
+GSTextureCacheSW::Texture::~Texture()
+{
+	if(m_buff != NULL)
+		_aligned_free(m_buff);
+
+	delete [] m_pages.n;
+}
+
+// Makes sure the block-aligned area around rect is present in m_buff: every block
+// of that area not yet flagged in m_valid is read from local memory through
+// psm.rtxbP. Returns true on success or when the texture is already complete,
+// false when the pixel buffer cannot be allocated.
+bool GSTextureCacheSW::Texture::Update(const GSVector4i& rect)
+{
+	if(m_complete)
+	{
+		return true;
+	}
+
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM];
+	GSVector2i bs = psm.bs;
+	int shift = psm.pal == 0 ? 2 : 0;
+
+	int tw = std::max<int>(1 << m_TEX0.TW, bs.x);
+	int th = std::max<int>(1 << m_TEX0.TH, bs.y);
+
+	// row pitch of m_buff in bytes (computed before shift is adjusted below)
+	const uint32 pitch = (1 << m_tw) << shift;
+
+	if(m_buff == NULL)
+	{
+		m_buff = _aligned_malloc(pitch * th * 4, 32);
+
+		if(m_buff == NULL)
+		{
+			return false; // m_complete is still false, so the next call tries again
+		}
+	}
+
+	GSVector4i r = rect;
+
+	r = r.ralign<Align_Outside>(bs);
+
+	// Flag completeness only once m_buff exists, so a failed allocation cannot leave a "complete" texture without pixels.
+	if(r.eq(GSVector4i(0, 0, tw, th)))
+	{
+		m_complete = true; // lame, but better than nothing
+	}
+
+	GSLocalMemory& mem = m_state->m_mem;
+	const GSOffset* RESTRICT off = m_offset;
+	uint32 blocks = 0;
+	GSLocalMemory::readTextureBlock rtxbP = psm.rtxbP;
+
+	uint8* dst = (uint8*)m_buff + pitch * r.top;
+
+	int block_pitch = pitch * bs.y;
+
+	// from here on r and bs are expressed in units of 8 pixels
+	r = r.srl32(3);
+
+	bs.x >>= 3;
+	bs.y >>= 3;
+
+	shift += 3;
+
+	if(m_repeating)
+	{
+		for(int y = r.top; y < r.bottom; y += bs.y, dst += block_pitch)
+		{
+			uint32 base = off->block.row[y];
+
+			for(int x = r.left, i = (y << 7) + x; x < r.right; x += bs.x, i += bs.x)
+			{
+				uint32 block = base + off->block.col[x];
+
+				if(block < MAX_BLOCKS)
+				{
+					uint32 row = i >> 5;
+					uint32 col = 1u << (i & 31); // unsigned literal so that bit 31 is well defined
+
+					if((m_valid[row] & col) == 0)
+					{
+						m_valid[row] |= col;
+
+						(mem.*rtxbP)(block, &dst[x << shift], pitch, m_TEXA);
+
+						blocks++;
+					}
+				}
+			}
+		}
+	}
+	else
+	{
+		for(int y = r.top; y < r.bottom; y += bs.y, dst += block_pitch)
+		{
+			uint32 base = off->block.row[y];
+
+			for(int x = r.left; x < r.right; x += bs.x)
+			{
+				uint32 block = base + off->block.col[x];
+
+				if(block < MAX_BLOCKS)
+				{
+					uint32 row = block >> 5;
+					uint32 col = 1u << (block & 31); // unsigned literal so that bit 31 is well defined
+
+					if((m_valid[row] & col) == 0)
+					{
+						m_valid[row] |= col;
+
+						(mem.*rtxbP)(block, &dst[x << shift], pitch, m_TEXA);
+
+						blocks++;
+					}
+				}
+			}
+		}
+	}
+
+	if(blocks > 0)
+	{
+		m_state->m_perfmon.Put(GSPerfMon::Unswizzle, bs.x * bs.y * blocks << shift);
+	}
+
+	return true;
+}
+
+#include "GSTextureSW.h"
+
+// Writes the cached pixels to file fn through a temporary GSTextureSW; paletted
+// formats are expanded through the current CLUT. The dds argument is not used here.
+// Returns false when no pixels were ever cached or the temporary cannot be mapped; otherwise the result of GSTextureSW::Save.
+bool GSTextureCacheSW::Texture::Save(const string& fn, bool dds) const
+{
+	// m_buff stays NULL until Update() allocates it; there is nothing to dump before that.
+	if(m_buff == NULL) return false;
+
+	const uint32* RESTRICT clut = m_state->m_mem.m_clut;
+
+	int w = 1 << m_TEX0.TW;
+	int h = 1 << m_TEX0.TH;
+
+	GSTextureSW t(0, w, h);
+	GSTexture::GSMap m;
+
+	if(!t.Map(m, NULL)) return false;
+
+	const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_TEX0.PSM];
+	const uint8* RESTRICT src = (uint8*)m_buff;
+	int pitch = 1 << (m_tw + (psm.pal == 0 ? 2 : 0));
+
+	for(int j = 0; j < h; j++, src += pitch, m.bits += m.pitch)
+	{
+		if(psm.pal == 0)
+		{
+			memcpy(m.bits, src, sizeof(uint32) * w);
+		}
+		else
+		{
+			for(int i = 0; i < w; i++)
+			{
+				((uint32*)m.bits)[i] = clut[src[i]];
+			}
+		}
+	}
+
+	t.Unmap();
+	return t.Save(fn.c_str());
+}
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSTextureCacheSW.h b/plugins/GSdx_legacy/GSTextureCacheSW.h
new file mode 100644
index 0000000000..ed59acb35d
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureCacheSW.h
@@ -0,0 +1,72 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSRenderer.h"
+
+class GSTextureCacheSW
+{
+public:
+	class Texture
+	{
+	public:
+		GSState* m_state;
+		GSOffset* m_offset; // offset table obtained from GetOffset() in the constructor
+		GIFRegTEX0 m_TEX0;
+		GIFRegTEXA m_TEXA;
+		void* m_buff; // aligned pixel buffer, allocated lazily by Update(); NULL until then
+		uint32 m_tw; // log2 of the buffer row width in texels (tw0, or derived from TEX0.TW when tw0 == 0)
+		uint32 m_age; // incremented by IncAge(), reset to 0 on a Lookup() hit
+		bool m_complete; // set by Update() when the requested area covers the whole texture
+		bool m_repeating; // result of m_TEX0.IsRepeating(); selects how m_valid is indexed
+		vector<GSVector2i>* m_p2t; // page-to-tile map from GetPage2TileMap(); only set when m_repeating
+		uint32 m_valid[MAX_PAGES];
+		struct {uint32 bm[16]; const uint32* n;} m_pages; // n: EOP-terminated page list, bm: the same pages as a bitmap
+		const uint32* RESTRICT m_sharedbits; // from GSUtil::HasSharedBitsPtr(PSM); used by InvalidatePages()
+
+		// m_valid
+		// fast mode: each uint32 bits map to the 32 blocks of that page
+		// repeating mode: 1 bpp image of the texture tiles (8x8), also having 512 elements is just a coincidence (worst case: (1024*1024)/(8*8)/(sizeof(uint32)*8))
+
+		Texture(GSState* state, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
+		virtual ~Texture();
+
+		bool Update(const GSVector4i& r); // reads the missing blocks of r into m_buff; false if the buffer cannot be allocated
+		bool Save(const string& fn, bool dds = false) const; // dumps m_buff to file fn
+	};
+
+protected:
+	GSState* m_state;
+	hash_set<Texture*> m_textures; // every live cache entry; owns the Texture objects
+	list<Texture*> m_map[MAX_PAGES]; // per page: the entries covering that page, most recently used first
+
+public:
+	GSTextureCacheSW(GSState* state);
+	virtual ~GSTextureCacheSW();
+
+	Texture* Lookup(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint32 tw0 = 0); // finds or creates the entry for TEX0
+
+	void InvalidatePages(const uint32* pages, uint32 psm); // pages is an EOP-terminated list
+
+	void RemoveAll();
+	void IncAge(); // ages all entries and drops those older than 10 steps
+};
diff --git a/plugins/GSdx_legacy/GSTextureFX.cpp b/plugins/GSdx_legacy/GSTextureFX.cpp
new file mode 100644
index 0000000000..4deab34da2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureFX.cpp
@@ -0,0 +1,36 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureFX.h"
+
+// Starts without a device; m_dev stays NULL until Create() is called.
+GSTextureFX::GSTextureFX()
+	: m_dev(NULL)
+{}
+
+// Stores dev in m_dev for later use. Always returns true.
+bool GSTextureFX::Create(GSDevice* dev)
+{
+	m_dev = dev;
+	return true;
+}
+
diff --git a/plugins/GSdx_legacy/GSTextureFX11.cpp b/plugins/GSdx_legacy/GSTextureFX11.cpp
new file mode 100644
index 0000000000..77cf05d2fb
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureFX11.cpp
@@ -0,0 +1,427 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDevice11.h"
+#include "resource.h"
+#include "GSTables.h"
+
+// Creates the objects every TFX draw shares: the VS and PS constant buffers and the
+// palette / render-target sampler states. Finishes by calling SetupVS() with a
+// default-constructed selector so the vertex shader and input layout get built.
+// Returns false as soon as one of the D3D11 creation calls fails.
+bool GSDevice11::CreateTextureFX()
+{
+	D3D11_BUFFER_DESC bd;
+
+	memset(&bd, 0, sizeof(bd));
+
+	// both constant buffers use the same description, only the size differs
+	bd.Usage = D3D11_USAGE_DEFAULT;
+	bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+
+	bd.ByteWidth = sizeof(VSConstantBuffer);
+
+	if(FAILED(m_dev->CreateBuffer(&bd, NULL, &m_vs_cb)))
+	{
+		return false;
+	}
+
+	bd.ByteWidth = sizeof(PSConstantBuffer);
+
+	if(FAILED(m_dev->CreateBuffer(&bd, NULL, &m_ps_cb)))
+	{
+		return false;
+	}
+
+	D3D11_SAMPLER_DESC sd;
+
+	memset(&sd, 0, sizeof(sd));
+
+	// anisotropic filtering only when "MaxAnisotropy" is non-zero and "paltex" is zero
+	const bool aniso = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0);
+
+	sd.Filter = aniso ? D3D11_FILTER_ANISOTROPIC : D3D11_FILTER_MIN_MAG_MIP_POINT;
+	sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+	sd.MinLOD = -FLT_MAX;
+	sd.MaxLOD = FLT_MAX;
+	sd.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+	sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
+
+	// the palette and render-target samplers are created from the same description
+	if(FAILED(m_dev->CreateSamplerState(&sd, &m_palette_ss))) return false;
+	if(FAILED(m_dev->CreateSamplerState(&sd, &m_rt_ss))) return false;
+
+	// create layout
+
+	VSSelector sel;
+	VSConstantBuffer cb;
+
+	SetupVS(sel, &cb);
+
+	return true;
+}
+
+// Binds the vertex shader and input layout for selector sel, compiling and caching them on
+// first use, and uploads cb to m_vs_cb when m_vs_cb_cache.Update(cb) returns true.
+void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
+{
+	hash_map<uint32, GSVertexShader11 >::const_iterator i = m_vs.find(sel);
+
+	if(i == m_vs.end())
+	{
+		string str[4];
+		str[0] = format("%d", sel.bppz);
+		str[1] = format("%d", sel.tme);
+		str[2] = format("%d", sel.fst);
+		str[3] = format("%d", sel.rtcopy);
+		// the selector fields are handed to the shader compiler as preprocessor macros
+		D3D_SHADER_MACRO macro[] =
+		{
+			{"VS_BPPZ", str[0].c_str()},
+			{"VS_TME", str[1].c_str()},
+			{"VS_FST", str[2].c_str()},
+			{"VS_RTCOPY", str[3].c_str()},
+			{NULL, NULL},
+		};
+		// per-vertex input elements at byte offsets 0..28 — NOTE(review): must match the vertex struct, which is not visible here
+		D3D11_INPUT_ELEMENT_DESC layout[] =
+		{
+			{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
+			{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
+		};
+
+		GSVertexShader11 vs;
+
+		vector<unsigned char> shader;
+		theApp.LoadResource(IDR_TFX_FX, shader);
+		CompileShader((const char *)shader.data(), shader.size(), "tfx.fx", nullptr, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il);
+		// NOTE(review): the outcome of CompileShader is not checked; whatever vs holds is cached
+		m_vs[sel] = vs;
+
+		i = m_vs.find(sel);
+	}
+	// NOTE(review): Update() presumably returns true only when cb differs from the cached copy — confirm
+	if(m_vs_cb_cache.Update(cb))
+	{
+		ID3D11DeviceContext* ctx = m_ctx;
+		ctx->UpdateSubresource(m_vs_cb, 0, NULL, cb, 0, 0);
+	}
+
+	VSSetShader(i->second.vs, m_vs_cb);
+
+	IASetInputLayout(i->second.il);
+}
+
+// Binds the geometry shader matching sel, compiling and caching it on first use.
+// When the selector does not call for one, a null shader is bound instead.
+void GSDevice11::SetupGS(GSSelector sel)
+{
+	CComPtr<ID3D11GeometryShader> gs;
+
+	// geometry shader works in every case, but not needed
+	const bool wanted = sel.prim > 0 && (sel.iip == 0 || sel.prim == 3);
+	if(wanted)
+	{
+		hash_map<uint32, CComPtr<ID3D11GeometryShader> >::const_iterator cached = m_gs.find(sel);
+		if(cached == m_gs.end())
+		{
+			const string iip = format("%d", sel.iip);
+			const string prim = format("%d", sel.prim);
+			D3D_SHADER_MACRO macro[] =
+			{
+				{"GS_IIP", iip.c_str()},
+				{"GS_PRIM", prim.c_str()},
+				{NULL, NULL},
+			};
+
+			vector<unsigned char> shader;
+			theApp.LoadResource(IDR_TFX_FX, shader);
+			CompileShader((const char *)shader.data(), shader.size(), "tfx.fx", nullptr, "gs_main", macro, &gs);
+
+			m_gs[sel] = gs;
+		}
+		else
+		{
+			gs = cached->second;
+		}
+	}
+
+	GSSetShader(gs);
+}
+
+// Binds the pixel shader for selector sel (compiled and cached on first use), uploads cb to
+// m_ps_cb when m_ps_cb_cache.Update(cb) returns true, and binds the sampler states: when
+// sel.tfx != 4 one built from ssel plus the palette sampler for sel.fmt >= 3, and m_rt_ss when sel.date is set.
+void GSDevice11::SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel)
+{
+	hash_map<uint32, CComPtr<ID3D11PixelShader> >::const_iterator i = m_ps.find(sel);
+
+	if(i == m_ps.end())
+	{
+		string str[20];
+		str[0] = format("%d", sel.fst);
+		str[1] = format("%d", sel.wms);
+		str[2] = format("%d", sel.wmt);
+		str[3] = format("%d", sel.fmt);
+		str[4] = format("%d", sel.aem);
+		str[5] = format("%d", sel.tfx);
+		str[6] = format("%d", sel.tcc);
+		str[7] = format("%d", sel.atst);
+		str[8] = format("%d", sel.fog);
+		str[9] = format("%d", sel.clr1);
+		str[10] = format("%d", sel.fba);
+		str[11] = format("%d", sel.aout);
+		str[12] = format("%d", sel.ltf);
+		str[13] = format("%d", sel.colclip);
+		str[14] = format("%d", sel.date);
+		str[15] = format("%d", sel.spritehack);
+		str[16] = format("%d", sel.tcoffsethack);
+		str[17] = format("%d", sel.point_sampler);
+		str[18] = format("%d", sel.shuffle);
+		str[19] = format("%d", sel.read_ba);
+		// the selector fields are handed to the shader compiler as preprocessor macros
+		D3D_SHADER_MACRO macro[] =
+		{
+			{"PS_FST", str[0].c_str()},
+			{"PS_WMS", str[1].c_str()},
+			{"PS_WMT", str[2].c_str()},
+			{"PS_FMT", str[3].c_str()},
+			{"PS_AEM", str[4].c_str()},
+			{"PS_TFX", str[5].c_str()},
+			{"PS_TCC", str[6].c_str()},
+			{"PS_ATST", str[7].c_str()},
+			{"PS_FOG", str[8].c_str()},
+			{"PS_CLR1", str[9].c_str()},
+			{"PS_FBA", str[10].c_str()},
+			{"PS_AOUT", str[11].c_str()},
+			{"PS_LTF", str[12].c_str()},
+			{"PS_COLCLIP", str[13].c_str()},
+			{"PS_DATE", str[14].c_str()},
+			{"PS_SPRITEHACK", str[15].c_str()},
+			{"PS_TCOFFSETHACK", str[16].c_str()},
+			{"PS_POINT_SAMPLER", str[17].c_str()},
+			{"PS_SHUFFLE", str[18].c_str() },
+			{"PS_READ_BA", str[19].c_str() },
+			{NULL, NULL},
+		};
+
+		CComPtr<ID3D11PixelShader> ps;
+
+		vector<unsigned char> shader;
+		theApp.LoadResource(IDR_TFX_FX, shader);
+		CompileShader((const char *)shader.data(), shader.size(), "tfx.fx", nullptr, "ps_main", macro, &ps);
+		// NOTE(review): the outcome of CompileShader is not checked; whatever ps holds is cached
+		m_ps[sel] = ps;
+
+		i = m_ps.find(sel);
+	}
+
+	if(m_ps_cb_cache.Update(cb))
+	{
+		ID3D11DeviceContext* ctx = m_ctx;
+		ctx->UpdateSubresource(m_ps_cb, 0, NULL, cb, 0, 0);
+	}
+
+	CComPtr<ID3D11SamplerState> ss0, ss1;
+
+	if(sel.tfx != 4)
+	{
+		if(!(sel.fmt < 3 && sel.wms < 3 && sel.wmt < 3))
+		{
+			ssel.ltf = 0;
+		}
+		// this i shadows the shader iterator above, but only inside this block
+		hash_map<uint32, CComPtr<ID3D11SamplerState> >::const_iterator i = m_ps_ss.find(ssel);
+
+		if(i != m_ps_ss.end())
+		{
+			ss0 = i->second;
+		}
+		else
+		{
+			D3D11_SAMPLER_DESC sd, af;
+			memset(&sd, 0, sizeof(sd));
+			// only af.Filter is ever written or read; the rest of af stays uninitialised
+			af.Filter = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3D11_FILTER_ANISOTROPIC : D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT;
+			sd.Filter = ssel.ltf ? af.Filter : D3D11_FILTER_MIN_MAG_MIP_POINT;
+
+			sd.AddressU = ssel.tau ? D3D11_TEXTURE_ADDRESS_WRAP : D3D11_TEXTURE_ADDRESS_CLAMP;
+			sd.AddressV = ssel.tav ? D3D11_TEXTURE_ADDRESS_WRAP : D3D11_TEXTURE_ADDRESS_CLAMP;
+			sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+			sd.MinLOD = -FLT_MAX;
+			sd.MaxLOD = FLT_MAX;
+			sd.MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+			sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
+			// NOTE(review): the HRESULT is ignored; on failure ss0 is cached as it is
+			m_dev->CreateSamplerState(&sd, &ss0);
+
+			m_ps_ss[ssel] = ss0;
+		}
+
+		if(sel.fmt >= 3)
+		{
+			ss1 = m_palette_ss;
+		}
+	}
+
+	PSSetSamplerState(ss0, ss1, sel.date ? m_rt_ss : NULL);
+
+	PSSetShader(i->second, m_ps_cb);
+}
+
+// Binds the depth-stencil state for dssel and the blend state for bsel, creating and
+// caching each D3D11 state object the first time its selector is seen.
+// afix / 0x80 is handed to OMSetBlendState together with the blend state.
+void GSDevice11::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix)
+{
+	hash_map<uint32, CComPtr<ID3D11DepthStencilState> >::const_iterator i = m_om_dss.find(dssel);
+
+	if(i == m_om_dss.end())
+	{
+		D3D11_DEPTH_STENCIL_DESC dsd;
+		memset(&dsd, 0, sizeof(dsd));
+
+		if(dssel.date)
+		{
+			dsd.StencilEnable = true;
+			dsd.StencilReadMask = 1;
+			dsd.StencilWriteMask = 1;
+			dsd.FrontFace.StencilFunc = D3D11_COMPARISON_EQUAL;
+			dsd.FrontFace.StencilPassOp = dssel.alpha_stencil ? D3D11_STENCIL_OP_ZERO : D3D11_STENCIL_OP_KEEP;
+			dsd.FrontFace.StencilFailOp = D3D11_STENCIL_OP_KEEP;
+			dsd.FrontFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP;
+			dsd.BackFace.StencilFunc = D3D11_COMPARISON_EQUAL;
+			dsd.BackFace.StencilPassOp = dssel.alpha_stencil ? D3D11_STENCIL_OP_ZERO : D3D11_STENCIL_OP_KEEP;
+			dsd.BackFace.StencilFailOp = D3D11_STENCIL_OP_KEEP;
+			dsd.BackFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP;
+		}
+		// depth state is only filled in when the test is not ZTST_ALWAYS or depth writes are enabled
+		if(dssel.ztst != ZTST_ALWAYS || dssel.zwe)
+		{
+			static const D3D11_COMPARISON_FUNC ztst[] =
+			{
+				D3D11_COMPARISON_NEVER,
+				D3D11_COMPARISON_ALWAYS,
+				D3D11_COMPARISON_GREATER_EQUAL,
+				D3D11_COMPARISON_GREATER
+			};
+
+			dsd.DepthEnable = true;
+			dsd.DepthWriteMask = dssel.zwe ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO;
+			dsd.DepthFunc = ztst[dssel.ztst];
+		}
+
+		CComPtr<ID3D11DepthStencilState> dss;
+
+		m_dev->CreateDepthStencilState(&dsd, &dss);
+
+		m_om_dss[dssel] = dss;
+
+		i = m_om_dss.find(dssel);
+	}
+
+	OMSetDepthStencilState(i->second, 1);
+
+	hash_map<uint32, CComPtr<ID3D11BlendState> >::const_iterator j = m_om_bs.find(bsel);
+
+	if(j == m_om_bs.end())
+	{
+		D3D11_BLEND_DESC bd;
+		memset(&bd, 0, sizeof(bd));
+
+		bd.RenderTarget[0].BlendEnable = bsel.abe;
+
+		if(bsel.abe)
+		{
+			int i = ((bsel.a * 3 + bsel.b) * 3 + bsel.c) * 3 + bsel.d; // row in m_blendMapD3D9; shadows the iterator i above
+
+			bd.RenderTarget[0].BlendOp = (D3D11_BLEND_OP)m_blendMapD3D9[i].op;
+			bd.RenderTarget[0].SrcBlend = (D3D11_BLEND)m_blendMapD3D9[i].src;
+			bd.RenderTarget[0].DestBlend = (D3D11_BLEND)m_blendMapD3D9[i].dst;
+			bd.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD;
+			bd.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ONE;
+			bd.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO;
+
+			// SRC* -> SRC1*
+			// Yes, this casting mess really is needed.  I want to go back to C
+			// D3D11_BLEND values 3..6 are SRC_COLOR..INV_SRC_ALPHA; adding 13 gives the SRC1_* counterparts (16..19)
+			if(bd.RenderTarget[0].SrcBlend >= 3 && bd.RenderTarget[0].SrcBlend <= 6)
+			{
+				bd.RenderTarget[0].SrcBlend = (D3D11_BLEND)((int)bd.RenderTarget[0].SrcBlend + 13);
+			}
+
+			if(bd.RenderTarget[0].DestBlend >= 3 && bd.RenderTarget[0].DestBlend <= 6)
+			{
+				bd.RenderTarget[0].DestBlend = (D3D11_BLEND)((int)bd.RenderTarget[0].DestBlend + 13);
+			}
+
+			// Not very good but I don't wanna write another 81 row table
+
+			if(bsel.negative)
+			{
+				if(bd.RenderTarget[0].BlendOp == D3D11_BLEND_OP_ADD)
+				{
+					bd.RenderTarget[0].BlendOp = D3D11_BLEND_OP_REV_SUBTRACT;
+				}
+				else if(bd.RenderTarget[0].BlendOp == D3D11_BLEND_OP_REV_SUBTRACT)
+				{
+					bd.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD;
+				}
+				else
+					; // god knows, best just not to mess with it for now
+			}
+
+			if(m_blendMapD3D9[i].bogus == 1)
+			{
+				(bsel.a == 0 ? bd.RenderTarget[0].SrcBlend : bd.RenderTarget[0].DestBlend) = D3D11_BLEND_ONE;
+
+				const string afixstr = format("%d >> 7", afix);
+				const char *col[3] = {"Cs", "Cd", "0"};
+				const char *alpha[3] = {"As", "Ad", afixstr.c_str()};
+				printf("Impossible blend for D3D: (%s - %s) * %s + %s\n", col[bsel.a], col[bsel.b], alpha[bsel.c], col[bsel.d]);
+			}
+		}
+
+		if(bsel.wr) bd.RenderTarget[0].RenderTargetWriteMask |= D3D11_COLOR_WRITE_ENABLE_RED;
+		if(bsel.wg) bd.RenderTarget[0].RenderTargetWriteMask |= D3D11_COLOR_WRITE_ENABLE_GREEN;
+		if(bsel.wb) bd.RenderTarget[0].RenderTargetWriteMask |= D3D11_COLOR_WRITE_ENABLE_BLUE;
+		if(bsel.wa) bd.RenderTarget[0].RenderTargetWriteMask |= D3D11_COLOR_WRITE_ENABLE_ALPHA;
+
+		CComPtr<ID3D11BlendState> bs;
+
+		m_dev->CreateBlendState(&bd, &bs);
+
+		m_om_bs[bsel] = bs;
+
+		j = m_om_bs.find(bsel);
+	}
+
+	OMSetBlendState(j->second, (float)(int)afix / 0x80);
+}
diff --git a/plugins/GSdx_legacy/GSTextureFX9.cpp b/plugins/GSdx_legacy/GSTextureFX9.cpp
new file mode 100644
index 0000000000..66f735c17c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureFX9.cpp
@@ -0,0 +1,345 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDevice9.h"
+#include "resource.h"
+#include "GSTables.h"
+
+GSTexture* GSDevice9::CreateMskFix(uint32 size, uint32 msk, uint32 fix)
+{
+	// Returns the 'size' x 1 R32F texture whose texel x holds ((x & msk) | fix) / size.
+	// Textures are cached in m_mskfix; NULL is returned when creation fails.
+	const uint32 key = (size << 20) | (msk << 10) | fix;
+
+	hash_map<uint32, GSTexture*>::iterator cached = m_mskfix.find(key);
+
+	if(cached != m_mskfix.end())
+	{
+		return cached->second;
+	}
+
+	GSTexture* tex = CreateTexture(size, 1, D3DFMT_R32F);
+
+	if(tex == NULL)
+	{
+		return NULL;
+	}
+
+	GSTexture::GSMap mapped;
+
+	if(tex->Map(mapped))
+	{
+		float* texels = (float*)mapped.bits;
+		for(uint32 x = 0; x < size; x++)
+		{
+			texels[x] = (float)((x & msk) | fix) / size;
+		}
+		tex->Unmap();
+	}
+
+	m_mskfix[key] = tex; // cached whether or not Map() succeeded
+
+	return tex;
+}
+
+void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb) // sets the vertex shader variant for 'sel' together with the constants in 'cb'
+{
+	hash_map<uint32, GSVertexShader9>::const_iterator i = m_vs.find(sel);
+
+	if(i == m_vs.end()) // cache miss: compile this variant once and keep it in m_vs
+	{
+		string str[5]; // selector fields as text; 'macro' below only stores c_str() pointers into these
+
+		str[0] = format("%d", sel.bppz);
+		str[1] = format("%d", sel.tme);
+		str[2] = format("%d", sel.fst);
+		str[3] = format("%d", sel.logz);
+		str[4] = format("%d", sel.rtcopy);
+
+		D3D_SHADER_MACRO macro[] = // preprocessor defines handed to CompileShader
+		{
+			{"VS_BPPZ", str[0].c_str()},
+			{"VS_TME", str[1].c_str()},
+			{"VS_FST", str[2].c_str()},
+			{"VS_LOGZ", str[3].c_str()},
+			{"VS_RTCOPY", str[4].c_str()},
+			{NULL, NULL}, // terminator entry
+		};
+
+		static const D3DVERTEXELEMENT9 layout[] = // vertex declaration: elements at byte offsets 0, 8, 12 and 16 of stream 0
+		{
+			{0, 0, D3DDECLTYPE_FLOAT2, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_TEXCOORD, 0},
+			{0, 8, D3DDECLTYPE_D3DCOLOR, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_COLOR, 0},
+			{0, 12, D3DDECLTYPE_D3DCOLOR, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_COLOR, 1},
+			{0, 16,  D3DDECLTYPE_FLOAT4, D3DDECLMETHOD_DEFAULT, D3DDECLUSAGE_POSITION, 0},
+			D3DDECL_END()
+		};
+
+		GSVertexShader9 vs; // receives both the shader (vs.vs) and its input layout (vs.il)
+
+		vector<unsigned char> shader;
+		theApp.LoadResource(IDR_TFX_FX, shader); // shader source comes from the IDR_TFX_FX resource
+		CompileShader((const char *)shader.data(), shader.size(), "tfx.fx", "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il); // NOTE(review): no failure handling here; 'vs' is cached regardless
+
+		m_vs[sel] = vs;
+
+		i = m_vs.find(sel); // re-lookup so 'i' refers to the entry stored in the map
+	}
+
+	VSSetShader(i->second.vs, (const float*)cb, sizeof(*cb) / sizeof(GSVector4)); // constant count is expressed in GSVector4 units
+
+	IASetInputLayout(i->second.il);
+}
+
+void GSDevice9::SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel) // sets the pixel shader variant for 'sel' with constants 'cb', then the sampler state for 'ssel'
+{
+	if(cb->WH.z > 0 && cb->WH.w > 0 && (sel.wms == 3 || sel.wmt == 3)) // wrap mode 3 uses a mask/fix lookup texture (see CreateMskFix)
+	{
+		GSVector4i size(cb->WH);
+
+		if(sel.wms == 3)
+		{
+			if(GSTexture* t = CreateMskFix(size.z, cb->MskFix.x, cb->MskFix.z))
+			{
+				m_dev->SetTexture(3, *(GSTexture9*)t); // the wms lookup is set on texture stage 3
+			}
+		}
+
+		if(sel.wmt == 3)
+		{
+			if(GSTexture* t = CreateMskFix(size.w, cb->MskFix.y, cb->MskFix.w))
+			{
+				m_dev->SetTexture(4, *(GSTexture9*)t); // the wmt lookup is set on texture stage 4
+			}
+		}
+	}
+
+	hash_map<uint32, CComPtr<IDirect3DPixelShader9> >::const_iterator i = m_ps.find(sel);
+
+	if(i == m_ps.end()) // cache miss: compile this variant once and keep it in m_ps
+	{
+		string str[17]; // selector fields as text; 'macro' below only stores c_str() pointers into these
+
+		str[0] = format("%d", sel.fst);
+		str[1] = format("%d", sel.wms);
+		str[2] = format("%d", sel.wmt);
+		str[3] = format("%d", sel.fmt);
+		str[4] = format("%d", sel.aem);
+		str[5] = format("%d", sel.tfx);
+		str[6] = format("%d", sel.tcc);
+		str[7] = format("%d", sel.atst);
+		str[8] = format("%d", sel.fog);
+		str[9] = format("%d", sel.clr1);
+		str[10] = format("%d", sel.rt);
+		str[11] = format("%d", sel.ltf);
+		str[12] = format("%d", sel.colclip);
+		str[13] = format("%d", sel.date);
+		str[14] = format("%d", sel.spritehack);
+		str[15] = format("%d", sel.tcoffsethack);
+		str[16] = format("%d", sel.point_sampler);
+
+		D3D_SHADER_MACRO macro[] = // preprocessor defines handed to CompileShader
+		{
+			{"PS_FST", str[0].c_str()},
+			{"PS_WMS", str[1].c_str()},
+			{"PS_WMT", str[2].c_str()},
+			{"PS_FMT", str[3].c_str()},
+			{"PS_AEM", str[4].c_str()},
+			{"PS_TFX", str[5].c_str()},
+			{"PS_TCC", str[6].c_str()},
+			{"PS_ATST", str[7].c_str()},
+			{"PS_FOG", str[8].c_str()},
+			{"PS_CLR1", str[9].c_str()},
+			{"PS_RT", str[10].c_str()},
+			{"PS_LTF", str[11].c_str()},
+			{"PS_COLCLIP", str[12].c_str()},
+			{"PS_DATE", str[13].c_str()},
+			{"PS_SPRITEHACK", str[14].c_str()},
+			{"PS_TCOFFSETHACK", str[15].c_str()},
+			{"PS_POINT_SAMPLER", str[16].c_str()},
+			{NULL, NULL}, // terminator entry
+		};
+
+		CComPtr<IDirect3DPixelShader9> ps;
+
+		vector<unsigned char> shader;
+		theApp.LoadResource(IDR_TFX_FX, shader); // shader source comes from the IDR_TFX_FX resource
+		CompileShader((const char *)shader.data(), shader.size(), "tfx.fx", "ps_main", macro, &ps); // NOTE(review): no failure handling here; 'ps' is cached regardless
+
+		m_ps[sel] = ps;
+
+		i = m_ps.find(sel); // re-lookup so 'i' refers to the entry stored in the map
+	}
+
+	PSSetShader(i->second, (const float*)cb, sizeof(*cb) / sizeof(GSVector4)); // constant count is expressed in GSVector4 units
+
+	Direct3DSamplerState9* ss = NULL; // stays NULL when sel.tfx == 4
+
+	if(sel.tfx != 4)
+	{
+		if(!(sel.fmt < 3 && sel.wms < 3 && sel.wmt < 3))
+		{
+			ssel.ltf = 0; // linear filtering is only kept when fmt and both wrap modes are below 3
+		}
+
+		hash_map<uint32, Direct3DSamplerState9* >::const_iterator i = m_ps_ss.find(ssel); // note: shadows the shader iterator 'i' above
+
+		if(i != m_ps_ss.end())
+		{
+			ss = i->second;
+		}
+		else
+		{
+			ss = new Direct3DSamplerState9(); // stored in m_ps_ss below; not freed in this function
+
+			memset(ss, 0, sizeof(*ss));
+
+			ss->Anisotropic[0] = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3DTEXF_ANISOTROPIC : D3DTEXF_LINEAR; // anisotropic only when configured and "paltex" is off
+			ss->Anisotropic[1] = theApp.GetConfig("MaxAnisotropy", 0) && !theApp.GetConfig("paltex", 0) ? D3DTEXF_ANISOTROPIC : D3DTEXF_POINT;
+			ss->FilterMin[0] = ssel.ltf ? ss->Anisotropic[0] : D3DTEXF_POINT; // index 0 follows ssel.ltf ...
+			ss->FilterMag[0] = ssel.ltf ? ss->Anisotropic[0] : D3DTEXF_POINT;
+			ss->FilterMip[0] = ssel.ltf ? ss->Anisotropic[0] : D3DTEXF_POINT;
+			ss->FilterMin[1] = ss->Anisotropic[1]; // ... index 1 ignores ssel.ltf
+			ss->FilterMag[1] = ss->Anisotropic[1];
+			ss->FilterMip[1] = ss->Anisotropic[1];
+			ss->AddressU = ssel.tau ? D3DTADDRESS_WRAP : D3DTADDRESS_CLAMP;
+			ss->AddressV = ssel.tav ? D3DTADDRESS_WRAP : D3DTADDRESS_CLAMP;
+			ss->MaxAnisotropy = theApp.GetConfig("MaxAnisotropy", 0);
+			ss->MaxLOD = ULONG_MAX;
+
+
+			m_ps_ss[ssel] = ss;
+		}
+	}
+
+	PSSetSamplerState(ss);
+}
+
+void GSDevice9::SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, uint8 afix) // sets the depth/stencil state for 'dssel' and the blend state for 'bsel'; both are built on first use and cached
+{
+	Direct3DDepthStencilState9* dss = NULL;
+
+	hash_map<uint32, Direct3DDepthStencilState9*>::const_iterator i = m_om_dss.find(dssel);
+
+	if(i == m_om_dss.end()) // cache miss: build the state object and store it in m_om_dss
+	{
+		dss = new Direct3DDepthStencilState9(); // stored in m_om_dss below; not freed in this function
+
+		memset(dss, 0, sizeof(*dss));
+
+		if(dssel.date || dssel.fba) // stencil is only enabled for the date / fba selector bits
+		{
+			dss->StencilEnable = true;
+			dss->StencilReadMask = 1;
+			dss->StencilWriteMask = dssel.alpha_stencil ? 3 : 2;
+			dss->StencilFunc = dssel.date ? D3DCMP_EQUAL : D3DCMP_ALWAYS;
+			dss->StencilPassOp = dssel.alpha_stencil ? D3DSTENCILOP_ZERO : dssel.fba ? D3DSTENCILOP_REPLACE : D3DSTENCILOP_KEEP;
+			dss->StencilFailOp = dssel.fba && !dssel.alpha_stencil ? D3DSTENCILOP_ZERO : D3DSTENCILOP_KEEP;
+			dss->StencilDepthFailOp = D3DSTENCILOP_KEEP;
+			dss->StencilRef = 3;
+		}
+
+		if(dssel.ztst != ZTST_ALWAYS || dssel.zwe) // depth stays disabled when the test always passes and z writes are off
+		{
+			static const D3DCMPFUNC ztst[] = // indexed by dssel.ztst
+			{
+				D3DCMP_NEVER,
+				D3DCMP_ALWAYS,
+				D3DCMP_GREATEREQUAL,
+				D3DCMP_GREATER
+			};
+
+			dss->DepthEnable = true;
+			dss->DepthWriteMask = dssel.zwe;
+			dss->DepthFunc = ztst[dssel.ztst];
+		}
+
+		m_om_dss[dssel] = dss;
+
+		i = m_om_dss.find(dssel); // re-lookup so 'i' refers to the entry stored in the map
+	}
+
+	OMSetDepthStencilState(i->second);
+
+	hash_map<uint32, Direct3DBlendState9*>::const_iterator j = m_om_bs.find(bsel);
+
+	if(j == m_om_bs.end()) // cache miss: build the blend state and store it in m_om_bs
+	{
+		Direct3DBlendState9* bs = new Direct3DBlendState9(); // stored in m_om_bs below; not freed in this function
+
+		memset(bs, 0, sizeof(*bs));
+
+		bs->BlendEnable = bsel.abe;
+
+		if(bsel.abe)
+		{
+			int i = ((bsel.a * 3 + bsel.b) * 3 + bsel.c) * 3 + bsel.d; // base-3 index built from a, b, c, d into the blend table; shadows the iterator 'i' above
+
+			bs->BlendOp = (D3DBLENDOP)m_blendMapD3D9[i].op;
+			bs->SrcBlend = (D3DBLEND)m_blendMapD3D9[i].src;
+			bs->DestBlend = (D3DBLEND)m_blendMapD3D9[i].dst;
+			bs->BlendOpAlpha = D3DBLENDOP_ADD;
+			bs->SrcBlendAlpha = D3DBLEND_ONE;
+			bs->DestBlendAlpha = D3DBLEND_ZERO;
+
+			// Not very good but I don't wanna write another 81 row table
+
+			if(bsel.negative) // swaps ADD and REVSUBTRACT; any other op is left untouched
+			{
+				if(bs->BlendOp == D3DBLENDOP_ADD)
+				{
+					bs->BlendOp = D3DBLENDOP_REVSUBTRACT;
+				}
+				else if(bs->BlendOp == D3DBLENDOP_REVSUBTRACT)
+				{
+					bs->BlendOp = D3DBLENDOP_ADD;
+				}
+				else
+					; // god knows, best just not to mess with it for now
+			}
+
+			if(m_blendMapD3D9[i].bogus == 1) // table entry flagged as not expressible: force one factor to ONE and log the equation
+			{
+				(bsel.a == 0 ? bs->SrcBlend : bs->DestBlend) = D3DBLEND_ONE;
+
+				const string afixstr = format("%d >> 7", afix);
+				const char *col[3] = {"Cs", "Cd", "0"};
+				const char *alpha[3] = {"As", "Ad", afixstr.c_str()};
+
+				printf("Impossible blend for D3D: (%s - %s) * %s + %s\n", col[bsel.a], col[bsel.b], alpha[bsel.c], col[bsel.d]);
+			}
+		}
+
+		// this is not a typo; dx9 uses BGRA rather than the gs native RGBA, unlike dx10
+
+		if(bsel.wr) bs->RenderTargetWriteMask |= D3DCOLORWRITEENABLE_BLUE;
+		if(bsel.wg) bs->RenderTargetWriteMask |= D3DCOLORWRITEENABLE_GREEN;
+		if(bsel.wb) bs->RenderTargetWriteMask |= D3DCOLORWRITEENABLE_RED;
+		if(bsel.wa) bs->RenderTargetWriteMask |= D3DCOLORWRITEENABLE_ALPHA;
+
+		m_om_bs[bsel] = bs;
+
+		j = m_om_bs.find(bsel); // re-lookup so 'j' refers to the entry stored in the map
+	}
+
+	OMSetBlendState(j->second, afix >= 0x80 ? 0xffffff : 0x020202 * afix); // blend factor: each byte is 2 * afix, pinned to 0xff once afix reaches 0x80
+}
diff --git a/plugins/GSdx_legacy/GSTextureFXOGL.cpp b/plugins/GSdx_legacy/GSTextureFXOGL.cpp
new file mode 100644
index 0000000000..4c89ae9647
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureFXOGL.cpp
@@ -0,0 +1,162 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSDeviceOGL.h"
+#include "GSTables.h"
+
+static const uint32 g_vs_cb_index = 20; // index given to the vertex shader GSUniformBufferOGL in CreateTextureFX
+static const uint32 g_ps_cb_index = 21; // index given to the pixel shader GSUniformBufferOGL in CreateTextureFX
+static const uint32 g_gs_cb_index = 22; // not referenced in this file
+
+void GSDeviceOGL::CreateTextureFX()
+{
+	m_vs_cb = new GSUniformBufferOGL(g_vs_cb_index, sizeof(VSConstantBuffer));
+	m_ps_cb = new GSUniformBufferOGL(g_ps_cb_index, sizeof(PSConstantBuffer));
+
+	// Warning: one sampler per image unit, so m_ps_ss cannot be reused here.
+	m_palette_ss = CreateSampler(false, false, false);
+	glBindSampler(1, m_palette_ss);
+
+	// Precompile every geometry and vertex shader. It may cost a second at
+	// startup, but it reduces benchmark pollution.
+	GL_PUSH("Compile GS");
+
+	for (uint32 k = 0; k < countof(m_gs); k++) {
+		GSSelector gs_sel(k);
+		// The slot stays 0 (nothing compiled) when the point and sprite bits are equal.
+		m_gs[k] = (gs_sel.point == gs_sel.sprite) ? 0 : CompileGS(gs_sel);
+	}
+
+	GL_POP();
+
+	GL_PUSH("Compile VS");
+
+	for (uint32 k = 0; k < countof(m_vs); k++) {
+		VSSelector vs_sel(k);
+		// The second argument is true when GL_ARB_clip_control was not found.
+		m_vs[k] = CompileVS(vs_sel, !GLLoader::found_GL_ARB_clip_control);
+	}
+
+	GL_POP();
+
+	// Enable all bits for stencil operations. Technically 1 bit is
+	// enough but the buffer is polluted with noise. Clear will be limited
+	// to the mask.
+	glStencilMask(0xFF);
+
+	for (uint32 k = 0; k < countof(m_om_dss); k++) {
+		m_om_dss[k] = CreateDepthStencil(OMDepthStencilSelector(k));
+	}
+
+	// Helps to debug fragment shaders in apitrace.
+	m_apitrace = CompilePS(PSSelector());
+}
+
+GSDepthStencilOGL* GSDeviceOGL::CreateDepthStencil(OMDepthStencilSelector dssel)
+{
+	// GL comparison function for each dssel.ztst value (table index).
+	static const GLenum depth_func[] = { GL_NEVER, GL_ALWAYS, GL_GEQUAL, GL_GREATER };
+
+	GSDepthStencilOGL* state = new GSDepthStencilOGL();
+
+	// The stencil test is only enabled for the 'date' selector bit.
+	if (dssel.date)
+	{
+		state->EnableStencil();
+		state->SetStencil(GL_EQUAL, GL_KEEP);
+	}
+
+	// Depth is left disabled when the test always passes and z writes are off.
+	const bool depth_unused = (dssel.ztst == ZTST_ALWAYS) && !dssel.zwe;
+
+	if (!depth_unused)
+	{
+		state->EnableDepth();
+		state->SetDepth(depth_func[dssel.ztst], dssel.zwe);
+	}
+
+	return state;
+}
+
+void GSDeviceOGL::SetupCB(const VSConstantBuffer* vs_cb, const PSConstantBuffer* ps_cb) // uploads the vertex / pixel shader constants, each only when its cache reports a change
+{
+	GL_PUSH("UBO");
+	if(m_vs_cb_cache.Update(vs_cb)) { // upload only when Update() returns true
+		m_vs_cb->upload(vs_cb);
+	}
+
+	if(m_ps_cb_cache.Update(ps_cb)) { // same rule for the pixel shader constants
+		m_ps_cb->upload(ps_cb);
+	}
+	GL_POP();
+}
+
+void GSDeviceOGL::SetupVS(VSSelector sel) // hands the vertex shader precompiled in CreateTextureFX to m_shader
+{
+	m_shader->VS(m_vs[sel]); // 'sel' is used directly as the index into m_vs
+}
+
+void GSDeviceOGL::SetupGS(GSSelector sel) // hands the geometry shader precompiled in CreateTextureFX to m_shader
+{
+	m_shader->GS(m_gs[sel]); // the entry is 0 for selectors whose point and sprite bits are equal
+}
+
+void GSDeviceOGL::SetupPS(PSSelector sel)
+{
+	// Hands the pixel shader for 'sel' to m_shader. Unlike the vertex and
+	// geometry shaders it is compiled on first use and remembered in m_ps.
+
+	// ---- Static part: resolve the shader id ----
+	auto cached = m_ps.find(sel);
+	GLuint shader_id;
+
+	if (cached != m_ps.end()) {
+		shader_id = cached->second;
+	} else {
+		shader_id = CompilePS(sel);
+		m_ps[sel] = shader_id;
+	}
+
+	// ---- Dynamic part: select it ----
+
+	m_shader->PS(shader_id);
+}
+
+void GSDeviceOGL::SetupSampler(PSSamplerSelector ssel) // applies the sampler stored in m_ps_ss for 'ssel'
+{
+	PSSetSamplerState(m_ps_ss[ssel]);
+}
+
+GLuint GSDeviceOGL::GetSamplerID(PSSamplerSelector ssel) // returns the sampler stored in m_ps_ss for 'ssel' without applying it
+{
+	return m_ps_ss[ssel];
+}
+
+GLuint GSDeviceOGL::GetPaletteSamplerID() // returns the palette sampler created in CreateTextureFX
+{
+	return m_palette_ss;
+}
+
+void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel) // applies the depth/stencil state prebuilt in CreateTextureFX
+{
+	OMSetDepthStencilState(m_om_dss[dssel]); // 'dssel' is used directly as the index into m_om_dss
+}
diff --git a/plugins/GSdx_legacy/GSTextureNull.cpp b/plugins/GSdx_legacy/GSTextureNull.cpp
new file mode 100644
index 0000000000..8a372573d8
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureNull.cpp
@@ -0,0 +1,36 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureNull.h"
+
+GSTextureNull::GSTextureNull() // default construction: every m_desc field is zero
+{
+	memset(&m_desc, 0, sizeof(m_desc));
+}
+
+GSTextureNull::GSTextureNull(int type, int w, int h, int format) // records the description as given; nothing else is allocated here
+{
+	m_desc.type = type;
+	m_desc.w = w;
+	m_desc.h = h;
+	m_desc.format = format;
+}
diff --git a/plugins/GSdx_legacy/GSTextureNull.h b/plugins/GSdx_legacy/GSTextureNull.h
new file mode 100644
index 0000000000..5d443b24e7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureNull.h
@@ -0,0 +1,41 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTexture.h"
+
+class GSTextureNull : public GSTexture // GSTexture implementation that only records its description and holds no pixel data
+{
+	struct {int type, w, h, format;} m_desc; // values given at construction
+
+public:
+	GSTextureNull();
+	GSTextureNull(int type, int w, int h, int format);
+
+	int GetType() const {return m_desc.type;}
+	int GetFormat() const {return m_desc.format;}
+
+	bool Update(const GSVector4i& r, const void* data, int pitch) {return true;} // ignores the data and reports success
+	bool Map(GSMap& m, const GSVector4i* r) {return false;} // mapping always fails
+	void Unmap() {} // no-op
+	bool Save(const string& fn, bool user_image = false, bool dds = false) { return false; } // nothing is written
+};
diff --git a/plugins/GSdx_legacy/GSTextureOGL.cpp b/plugins/GSdx_legacy/GSTextureOGL.cpp
new file mode 100644
index 0000000000..15ca86b3e7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureOGL.cpp
@@ -0,0 +1,551 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include <limits.h>
+#include "GSTextureOGL.h"
+#include "GLState.h"
+#include "GSPng.h"
+
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+extern uint64 g_real_texture_upload_byte;
+#endif
+
+// FIXME find the optimal number of PBO
+#define PBO_POOL_SIZE 8
+
+// FIXME OGL4: investigate, only 1 unpack buffer always bound
+namespace PboPool {
+
+	GLuint m_pool[PBO_POOL_SIZE]; // buffer object names
+	uptr m_offset[PBO_POOL_SIZE]; // next free byte inside each buffer
+	char*  m_map[PBO_POOL_SIZE]; // persistent mapping of each buffer (NULL without buffer storage)
+	uint32 m_current_pbo = 0; // pool slot used by the next Map()
+	uint32 m_size; // byte size of the transfer started by the last Map()
+	bool   m_texture_storage; // true when GL_ARB_buffer_storage was found in Init()
+	GLsync m_fence[PBO_POOL_SIZE]; // fence placed when a slot was left by NextPboWithSync(), 0 if none
+	const uint32 m_pbo_size = 8*1024*1024; // byte size of each buffer
+
+	// Options for buffer storage
+	// XXX: are coherent mapping and a barrier really needed here?
+	// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
+	// the value visible to the server
+	const GLbitfield common_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+	const GLbitfield map_flags = common_flags | GL_MAP_FLUSH_EXPLICIT_BIT;
+	const GLbitfield create_flags = common_flags | GL_CLIENT_STORAGE_BIT;
+
+	// Perf impact (test was only done on a gs dump):
+	// Normal (fast): Message:Buffer detailed info: Buffer object 9 (bound to
+	//	GL_PIXEL_UNPACK_BUFFER_ARB, usage hint is GL_STREAM_COPY) will use VIDEO
+	//	memory as the source for buffer object operations.
+	//
+	// Persistent (slower): Message:Buffer detailed info: Buffer object 8
+	//	(bound to GL_PIXEL_UNPACK_BUFFER_ARB, usage hint is GL_DYNAMIC_DRAW)
+	//	will use DMA CACHED memory as the source for buffer object operations
+	void Init() {
+		glGenBuffers(countof(m_pool), m_pool);
+		m_texture_storage  = GLLoader::found_GL_ARB_buffer_storage;
+
+		for (size_t i = 0; i < countof(m_pool); i++) { // 'i' only counts; NextPbo() below walks every slot once
+			BindPbo();
+
+			if (m_texture_storage) {
+				glBufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, create_flags);
+				m_map[m_current_pbo] = (char*)glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, map_flags);
+				m_fence[m_current_pbo] = 0;
+			} else {
+				glBufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_COPY);
+				m_map[m_current_pbo] = NULL;
+			}
+
+			NextPbo(); // also resets the write offset of the slot it moves to
+		}
+		UnbindPbo();
+	}
+
+	char* Map(uint32 size) { // returns a write pointer for 'size' bytes and leaves the chosen buffer bound
+		char* map;
+		m_size = size;
+
+		if (m_size > m_pbo_size) { // only reported; the request is still processed below
+			fprintf(stderr, "BUG: PBO too small %u but need %u\n", m_pbo_size, m_size); // %u: both values are unsigned
+		}
+
+		if (m_texture_storage) {
+			if (m_offset[m_current_pbo] + m_size >= m_pbo_size) { // current buffer cannot hold the transfer: move on
+				//NextPbo(); // For test purpose
+				NextPboWithSync();
+			}
+
+			// Note: texsubimage will access currently bound buffer
+			// Pbo ready let's get a pointer
+			BindPbo();
+
+			map = m_map[m_current_pbo] + m_offset[m_current_pbo];
+
+		} else {
+			GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
+
+			if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
+				NextPbo();
+
+				flags &= ~GL_MAP_INVALIDATE_RANGE_BIT; // a fresh buffer is invalidated as a whole instead of per range
+				flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
+			}
+
+			// Pbo ready let's get a pointer
+			BindPbo();
+
+			// Be sure the map is aligned
+			map = (char*)glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
+		}
+
+		return map;
+	}
+
+	void Unmap() { // ends the write started by Map(); the buffer stays bound
+		if (m_texture_storage) {
+			glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size); // persistent mapping: flush only
+		} else {
+			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+		}
+	}
+
+	uptr Offset() { // byte offset of the current transfer inside the bound buffer
+		return m_offset[m_current_pbo];
+	}
+
+	void Destroy() {
+		if (m_texture_storage) {
+			for (size_t i = 0; i < countof(m_pool); i++) {
+				m_map[i] = NULL;
+				m_offset[i] = 0;
+				glDeleteSync(m_fence[i]);
+
+				// Don't know if we must do it
+				glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_pool[i]);
+				glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+			}
+			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+		}
+		glDeleteBuffers(countof(m_pool), m_pool);
+	}
+
+	void BindPbo() {
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, m_pool[m_current_pbo]);
+	}
+
+	void NextPbo() {
+		m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1); // wrap by mask: PBO_POOL_SIZE must be a power of two
+		// Mark new PBO as free
+		m_offset[m_current_pbo] = 0;
+	}
+
+	void NextPboWithSync() {
+		m_fence[m_current_pbo] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); // fence the slot being left
+		NextPbo();
+		if (m_fence[m_current_pbo]) { // the new slot was used before: wait for the GPU to finish with it
+#ifdef ENABLE_OGL_DEBUG_FENCE
+			GLenum status = glClientWaitSync(m_fence[m_current_pbo], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+#else
+			glClientWaitSync(m_fence[m_current_pbo], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+#endif
+			glDeleteSync(m_fence[m_current_pbo]);
+			m_fence[m_current_pbo] = 0;
+
+#ifdef ENABLE_OGL_DEBUG_FENCE
+			if (status != GL_ALREADY_SIGNALED) {
+				fprintf(stderr, "GL_PIXEL_UNPACK_BUFFER: Sync Sync! Buffer too small\n");
+			}
+#endif
+		}
+	}
+
+	void UnbindPbo() {
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+	}
+
+	void EndTransfer() {
+		// Note: keep offset aligned for SSE/AVX
+		m_offset[m_current_pbo] = (m_offset[m_current_pbo] + m_size + 63) & ~0x3F; // round up to a multiple of 64 bytes
+	}
+}
+
+// FIXME: check if it possible to always use those setup by default
+// glPixelStorei(GL_PACK_ALIGNMENT, 1);
+// glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read) // records the GL transfer parameters for 'format' and allocates the texture storage
+	: m_pbo_size(0), m_dirty(false), m_clean(false), m_local_buffer(NULL), m_r_x(0), m_r_y(0), m_r_w(0), m_r_h(0)
+{
+	// OpenGL doesn't like dimensions of size 0
+	m_size.x = max(1,w);
+	m_size.y = max(1,h);
+	m_format = format;
+	m_type   = type;
+	m_fbo_read = fbo_read;
+	m_texture_id = 0;
+
+	// Constant transfer parameters per format: m_int_alignment is the pixel size in bytes, m_int_shift its log2
+	switch (m_format) {
+			// 1 Channel integer
+		case GL_R32UI:
+		case GL_R32I:
+			m_int_format    = GL_RED_INTEGER;
+			m_int_type      = (m_format == GL_R32UI) ? GL_UNSIGNED_INT : GL_INT;
+			m_int_alignment = 4;
+			m_int_shift     = 2;
+			break;
+		case GL_R16UI:
+			m_int_format    = GL_RED_INTEGER;
+			m_int_type      = GL_UNSIGNED_SHORT;
+			m_int_alignment = 2;
+			m_int_shift     = 1;
+			break;
+
+			// 1 Channel normalized
+		case GL_R8:
+			m_int_format    = GL_RED;
+			m_int_type      = GL_UNSIGNED_BYTE;
+			m_int_alignment = 1;
+			m_int_shift     = 0;
+			break;
+
+			// 4 channel normalized
+		case GL_RGBA16:
+			m_int_format    = GL_RGBA;
+			m_int_type      = GL_UNSIGNED_SHORT;
+			m_int_alignment = 8;
+			m_int_shift     = 3;
+			break;
+		case GL_RGBA8:
+			m_int_format    = GL_RGBA;
+			m_int_type      = GL_UNSIGNED_BYTE;
+			m_int_alignment = 4;
+			m_int_shift     = 2;
+			break;
+
+			// 4 channel integer
+		case GL_RGBA16I:
+		case GL_RGBA16UI:
+			m_int_format    = GL_RGBA_INTEGER;
+			m_int_type      = (m_format == GL_RGBA16UI) ? GL_UNSIGNED_SHORT : GL_SHORT; // must test the 4 channel format: GL_R16UI never reaches this case
+			m_int_alignment = 8;
+			m_int_shift     = 3;
+			break;
+
+			// 4 channel float
+		case GL_RGBA32F:
+			m_int_format    = GL_RGBA;
+			m_int_type      = GL_FLOAT;
+			m_int_alignment = 16; // NOTE(review): glPixelStorei alignment only accepts 1, 2, 4 or 8 - confirm this format is never uploaded through Update()
+			m_int_shift     = 4;
+			break;
+		case GL_RGBA16F:
+			m_int_format    = GL_RGBA;
+			m_int_type      = GL_HALF_FLOAT;
+			m_int_alignment = 8;
+			m_int_shift     = 3;
+			break;
+
+			// Special
+		case 0:
+		case GL_DEPTH32F_STENCIL8:
+			// Backbuffer & dss aren't important
+			m_int_format    = 0;
+			m_int_type      = 0;
+			m_int_alignment = 0;
+			m_int_shift     = 0;
+			break;
+
+		default: // unknown format: zero the parameters and assert
+			m_int_format    = 0;
+			m_int_type      = 0;
+			m_int_alignment = 0;
+			m_int_shift     = 0;
+			ASSERT(0);
+	}
+
+	// Generate & Allocate the buffer
+	switch (m_type) {
+		case GSTexture::Offscreen:
+			// 8B is the worst case for depth/stencil
+			// FIXME I think it is only used for color. So you can save half of the size
+			m_local_buffer = (uint8*)_aligned_malloc(m_size.x * m_size.y * 4, 32); // intentional fall through: Offscreen also gets a GL texture
+		case GSTexture::Texture:
+		case GSTexture::RenderTarget:
+		case GSTexture::DepthStencil:
+			glCreateTextures(GL_TEXTURE_2D, 1, &m_texture_id);
+			glTextureStorage2D(m_texture_id, 1+GL_TEX_LEVEL_0, m_format, m_size.x, m_size.y);
+			if (m_format == GL_R8) {
+				// Emulate DX behavior; it also avoids special code in the shader to differentiate
+				// a palette texture from a GL_RGBA target or a GL_R texture.
+				glTextureParameteri(m_texture_id, GL_TEXTURE_SWIZZLE_A, GL_RED);
+			}
+			break;
+		case GSTexture::Backbuffer:
+		default: // no GL texture is created; m_texture_id stays 0
+			break;
+	}
+}
+
+GSTextureOGL::~GSTextureOGL()
+{
+	// Clear every slot of the locally tracked GL state (GLState::rt, GLState::ds
+	// and the tex_unit entries) that still holds this texture id.
+	if (GLState::rt == m_texture_id)
+		GLState::rt = 0;
+	if (GLState::ds == m_texture_id)
+		GLState::ds = 0;
+
+	for (auto& bound : GLState::tex_unit) {
+		if (bound == m_texture_id)
+			bound = 0;
+	}
+
+	glDeleteTextures(1, &m_texture_id);
+
+	if (m_local_buffer != NULL) _aligned_free(m_local_buffer);
+}
+
+void GSTextureOGL::Invalidate()
+{
+	if (!m_dirty || !glInvalidateTexImage) // texture not dirty, or the GL entry point is missing
+		return;
+	glInvalidateTexImage(m_texture_id, GL_TEX_LEVEL_0);
+	m_dirty = false;
+}
+
+bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch) // copies rectangle 'r' from 'data' (rows 'pitch' bytes apart) into the texture; always returns true
+{
+	ASSERT(m_type != GSTexture::DepthStencil && m_type != GSTexture::Offscreen);
+
+	// The default upload path for textures is Map/Unmap.
+	// This path is mostly used for palettes, but also for textures that could
+	// overflow the pbo buffer.
+	// Uploads here are rather small, typically 64B or 1024B. The direct (non PBO)
+	// variants are kept below under '#if 0'; the code that is compiled goes through PboPool.
+
+	m_dirty = true;
+	m_clean = false;
+
+	uint32 row_byte = r.width() << m_int_shift; // bytes per row of the rectangle
+	uint32 map_size = r.height() * row_byte; // tightly packed size of the whole rectangle
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+	g_real_texture_upload_byte += map_size;
+#endif
+
+	glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
+
+#if 0
+	if (r.height() == 1) {
+		// Palette data. Transfer is small either 64B or 1024B.
+		// Sometimes it is faster, sometimes slower.
+		glTextureSubImage2D(m_texture_id, GL_TEX_LEVEL_0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, data);
+		return true;
+	}
+#endif
+
+	GL_PUSH("Upload Texture %d", m_texture_id);
+
+	// The easy solution without PBO
+#if 0
+	// Likely a bad texture
+	glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift);
+
+	glTextureSubImage2D(m_texture_id, GL_TEX_LEVEL_0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, data);
+
+	glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); // Restore default behavior
+#endif
+
+	// The complex solution with PBO
+#if 1
+	char* src = (char*)data;
+	char* map = PboPool::Map(map_size); // NOTE(review): the pointer is not checked before the copy below
+
+	// PERF: slow path of the texture upload. Dunno if we could do better maybe check if TC can keep row_byte == pitch
+	// Note: row_byte != pitch, so rows are repacked one by one
+	for (int h = 0; h < r.height(); h++) {
+		memcpy(map, src, row_byte);
+		map += row_byte;
+		src += pitch;
+	}
+
+	PboPool::Unmap();
+
+	glTextureSubImage2D(m_texture_id, GL_TEX_LEVEL_0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset()); // last argument is an offset into the bound PBO, not a pointer
+
+	// FIXME OGL4: investigate, only 1 unpack buffer always bound
+	PboPool::UnbindPbo();
+
+	PboPool::EndTransfer(); // advances the pool offset past this upload
+#endif
+
+	GL_POP();
+
+	return true;
+}
+
+bool GSTextureOGL::Map(GSMap& m, const GSVector4i* _r) // Offscreen: reads the texture back into m_local_buffer; Texture/RenderTarget: hands out PBO memory that Unmap() uploads; other types: returns false
+{
+	GSVector4i r = _r ? *_r : GSVector4i(0, 0, m_size.x, m_size.y); // no rectangle means the whole texture
+
+	// LOTS OF CRAP CODE!!!! PLEASE FIX ME !!!
+	if (m_type == GSTexture::Offscreen) {
+		// The fastest way would be to use a PBO to read the data asynchronously. Unfortunately the GSdx
+		// architecture waits for the data right now.
+
+#if 0
+		// Maybe it is as good as the code below. I don't know
+		// With openGL 4.5 you can use glGetTextureSubImage
+
+		glGetTextureImage(m_texture_id, GL_TEX_LEVEL_0, m_int_format, m_int_type, 1024*1024*16, m_local_buffer);
+
+#else
+
+		// Bind the texture to the read framebuffer to avoid any disturbance
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, m_fbo_read);
+		glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, m_texture_id, 0);
+
+		glPixelStorei(GL_PACK_ALIGNMENT, m_int_alignment);
+		glReadPixels(r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, m_local_buffer);
+
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+
+#endif
+
+		m.bits = m_local_buffer;
+		m.pitch = m_size.x << m_int_shift; // NOTE(review): pitch is the full texture width while glReadPixels packs rows of r.width() - confirm callers only map the whole texture
+
+		return true;
+	} else if (m_type == GSTexture::Texture || m_type == GSTexture::RenderTarget) {
+		GL_PUSH("Upload Texture %d", m_texture_id); // POP is in Unmap
+
+		m_dirty = true;
+		m_clean = false;
+
+		uint32 row_byte = r.width() << m_int_shift; // bytes per row of the rectangle
+		uint32 map_size = r.height() * row_byte;
+
+		m.bits = (uint8*)PboPool::Map(map_size);
+		m.pitch = row_byte; // the PBO area is tightly packed
+
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+	g_real_texture_upload_byte += map_size;
+#endif
+
+		// Save the area for the unmap
+		m_r_x = r.x;
+		m_r_y = r.y;
+		m_r_w = r.width();
+		m_r_h = r.height();
+
+		return true;
+	}
+
+	return false;
+}
+
+void GSTextureOGL::Unmap() // completes a Map() on a Texture/RenderTarget by uploading the saved rectangle; does nothing for other types
+{
+	if (m_type == GSTexture::Texture || m_type == GSTexture::RenderTarget) {
+
+		PboPool::Unmap();
+
+		glTextureSubImage2D(m_texture_id, GL_TEX_LEVEL_0, m_r_x, m_r_y, m_r_w, m_r_h, m_int_format, m_int_type, (const void*)PboPool::Offset()); // rectangle saved by Map(); last argument is an offset into the bound PBO
+
+		// FIXME OGL4: investigate, only 1 unpack buffer always bound
+		PboPool::UnbindPbo();
+
+		PboPool::EndTransfer(); // advances the pool offset past this upload
+
+		GL_POP(); // PUSH is in Map
+	}
+}
+
+bool GSTextureOGL::Save(const string& fn, bool user_image, bool dds) // reads the texture back and writes it to 'fn' as PNG; 'dds' is not used here
+{
+	// Collect the texture data
+	uint32 pitch = 4 * m_size.x;
+	uint32 buf_size = pitch * m_size.y * 2;// Note *2 for security (depth/stencil)
+	std::unique_ptr<uint8[]> image(new uint8[buf_size]()); // zero-filled: formats with no read path below are saved as zeros, not as uninitialised memory
+#ifdef ENABLE_OGL_DEBUG
+	GSPng::Format fmt = GSPng::RGB_A_PNG;
+#else
+	GSPng::Format fmt = GSPng::RGB_PNG;
+#endif
+
+	if (IsBackbuffer()) {
+		glReadPixels(0, 0, m_size.x, m_size.y, GL_RGBA, GL_UNSIGNED_BYTE, image.get());
+	} else if(IsDss()) {
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, m_fbo_read);
+
+		glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, m_texture_id, 0);
+		glReadPixels(0, 0, m_size.x, m_size.y, GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, image.get()); // depth is read as one 32 bit value per pixel
+
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+
+		fmt = GSPng::RGB_A_PNG;
+	} else if(m_format == GL_R32I) {
+		glGetTextureImage(m_texture_id, 0, GL_RED_INTEGER, GL_INT, buf_size, image.get());
+
+		fmt = GSPng::R32I_PNG;
+	} else {
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, m_fbo_read);
+
+		glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, m_texture_id, 0);
+
+		if (m_format == GL_RGBA8) { // any colour format not listed in this chain is not read back at all
+			glReadPixels(0, 0, m_size.x, m_size.y, GL_RGBA, GL_UNSIGNED_BYTE, image.get());
+		}
+		else if (m_format == GL_R16UI)
+		{
+			glReadPixels(0, 0, m_size.x, m_size.y, GL_RED_INTEGER, GL_UNSIGNED_SHORT, image.get());
+			fmt = GSPng::R16I_PNG;
+		}
+		else if (m_format == GL_R8)
+		{
+			fmt = GSPng::R8I_PNG;
+			glReadPixels(0, 0, m_size.x, m_size.y, GL_RED, GL_UNSIGNED_BYTE, image.get());
+		}
+
+		glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+	}
+
+	int compression = user_image ? Z_BEST_COMPRESSION : theApp.GetConfig("png_compression_level", Z_BEST_SPEED); // user images always use the best compression
+	return GSPng::Save(fmt, fn, image.get(), m_size.x, m_size.y, pitch, compression);
+}
+
+uint32 GSTextureOGL::GetMemUsage()
+{
+	// Estimated footprint in bytes. m_int_alignment is used as the per-pixel
+	// size; the backbuffer and unknown types report 0.
+	if (m_type == GSTexture::Offscreen)
+		return m_size.x * m_size.y * (4 + m_int_alignment);
+
+	if (m_type == GSTexture::Texture || m_type == GSTexture::RenderTarget)
+		return m_size.x * m_size.y * m_int_alignment;
+
+	if (m_type == GSTexture::DepthStencil)
+		return m_size.x * m_size.y * 8;
+
+	return 0;
+}
diff --git a/plugins/GSdx_legacy/GSTextureOGL.h b/plugins/GSdx_legacy/GSTextureOGL.h
new file mode 100644
index 0000000000..def7ce16d7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureOGL.h
@@ -0,0 +1,84 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTexture.h"
+
+namespace PboPool { // free functions for a pool of pixel buffer objects (PBOs); only declared here, the definitions are outside this header
+	void BindPbo();
+	void UnbindPbo();
+	void NextPbo();
+	void NextPboWithSync();
+	// NOTE(review): Map/Unmap/Offset/EndTransfer presumably bracket a single upload through the current PBO -- confirm against the definitions
+	char* Map(uint32 size);
+	void Unmap();
+	uptr Offset();
+	void EndTransfer();
+	// NOTE(review): presumably one-time creation and teardown of the pool -- confirm against the definitions
+	void Init();
+	void Destroy();
+}
+
+class GSTextureOGL final : public GSTexture // OpenGL implementation of the GSTexture interface
+{
+	private:
+		GLuint m_texture_id;	 // the texture id
+		int m_pbo_size;
+		GLuint m_fbo_read;	 // framebuffer object that Save() binds as GL_READ_FRAMEBUFFER to read pixels back
+		bool m_dirty;	 // set to true by WasAttached()
+		bool m_clean;	 // true after WasCleaned(), false again after WasAttached()
+
+		uint8* m_local_buffer;
+		// Avoid alignment constraint
+		//GSVector4i m_r;
+		int m_r_x;
+		int m_r_y;
+		int m_r_w;
+		int m_r_h;
+		// (the four ints above stand in for the commented-out GSVector4i m_r)
+
+		// internal opengl format/type/alignment
+		GLenum m_int_format;
+		GLenum m_int_type;
+		uint32 m_int_alignment;	 // used by GetMemUsage() as the bytes-per-pixel factor of colour textures
+		uint32 m_int_shift;
+
+	public:
+		explicit GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read);
+		virtual ~GSTextureOGL();
+
+		void Invalidate() final;
+		bool Update(const GSVector4i& r, const void* data, int pitch) final;
+		bool Map(GSMap& m, const GSVector4i* r = NULL) final;
+		void Unmap() final;
+		bool Save(const string& fn, bool user_image = false, bool dds = false) final;
+		// Type queries on m_type.
+		bool IsBackbuffer() { return (m_type == GSTexture::Backbuffer); }
+		bool IsDss() { return (m_type == GSTexture::DepthStencil); }
+		// Identifier accessor and clean/dirty bookkeeping.
+		uint32 GetID() final { return m_texture_id; }
+		bool HasBeenCleaned() { return m_clean; }
+		void WasAttached() { m_clean = false; m_dirty = true; }
+		void WasCleaned() { m_clean = true; }
+		// Estimated memory footprint in bytes, derived from the texture size and type (0 for the backbuffer).
+		uint32 GetMemUsage();
+};
diff --git a/plugins/GSdx_legacy/GSTextureSW.cpp b/plugins/GSdx_legacy/GSTextureSW.cpp
new file mode 100644
index 0000000000..0baba7282b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureSW.cpp
@@ -0,0 +1,99 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSTextureSW.h"
+#include "GSPng.h"
+
+GSTextureSW::GSTextureSW(int type, int width, int height)
+	: m_pitch(((width << 2) + 31) & ~31) // 4 bytes per pixel, each row padded up to a multiple of 32 bytes
+	, m_data(_aligned_malloc(m_pitch * height, 32)) // m_pitch is declared before m_data, so it is already set here
+{
+	m_size = GSVector2i(width, height);
+	m_type = type;
+	m_format = 0;
+	m_mapped.clear(); // the texture starts out unmapped
+}
+
+GSTextureSW::~GSTextureSW()
+{
+	_aligned_free(m_data); // releases the pixel buffer obtained from _aligned_malloc in the constructor
+}
+
+bool GSTextureSW::Update(const GSVector4i& r, const void* data, int pitch)
+{
+	// Copies the rectangle r from data (source rows are pitch bytes apart) into the texture.
+	// Returns false when there is no pixel buffer or the rectangle cannot be mapped.
+	GSMap mapping;
+	if(m_data == NULL || !Map(mapping, &r))
+	{
+		return false;
+	}
+
+	const int bytes_per_row = r.width() << 2; // 4 bytes per pixel
+	uint8* RESTRICT from = (uint8*)data;
+	uint8* RESTRICT to = mapping.bits;
+
+	for(int rows_left = r.height(); rows_left > 0; rows_left--)
+	{
+		memcpy(to, from, bytes_per_row);
+		from += pitch;
+		to += mapping.pitch;
+	}
+	Unmap();
+	return true;
+}
+
+bool GSTextureSW::Map(GSMap& m, const GSVector4i* r)
+{
+	// Maps rectangle r (the whole texture when r is NULL): on success m.bits points at its top-left pixel and m.pitch is the row stride in bytes.
+	// Returns false when there is no pixel buffer, the rectangle is not inside the texture, or the texture is already mapped.
+	GSVector4i r2 = r != NULL ? *r : GSVector4i(0, 0, m_size.x, m_size.y);
+	if(m_data != NULL && r2.left >= 0 && r2.right <= m_size.x && r2.top >= 0 && r2.bottom <= m_size.y)
+	{
+		if (!m_mapped.test_and_set()) // only one mapping at a time; Unmap() clears the flag
+		{
+			// m_pitch is already a byte count, so only the x offset is scaled by the 4 bytes per pixel.
+			m.bits = (uint8*)m_data + m_pitch * r2.top + (r2.left << 2);
+			m.pitch = m_pitch;
+			return true;
+		}
+	}
+	return false;
+}
+
+void GSTextureSW::Unmap()
+{
+	m_mapped.clear(); // releases the flag taken by a successful Map() so the texture can be mapped again
+}
+
+bool GSTextureSW::Save(const string& fn, bool user_image, bool dds) // writes the pixel buffer to fn as a PNG through GSPng::Save and returns its result
+{
+	if(dds) return false; // not implemented
+	// Builds with ENABLE_OGL_DEBUG select the RGB_A_PNG format; all other builds select RGB_PNG.
+#ifdef ENABLE_OGL_DEBUG
+	GSPng::Format fmt = GSPng::RGB_A_PNG;
+#else
+	GSPng::Format fmt = GSPng::RGB_PNG;
+#endif
+	int compression = user_image ? Z_BEST_COMPRESSION : theApp.GetConfig("png_compression_level", Z_BEST_SPEED); // user-requested images always use the best compression
+	return GSPng::Save(fmt, fn, static_cast<uint8*>(m_data), m_size.x, m_size.y, m_pitch, compression);
+}
diff --git a/plugins/GSdx_legacy/GSTextureSW.h b/plugins/GSdx_legacy/GSTextureSW.h
new file mode 100644
index 0000000000..52154ee297
--- /dev/null
+++ b/plugins/GSdx_legacy/GSTextureSW.h
@@ -0,0 +1,42 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSTexture.h"
+
+class GSTextureSW : public GSTexture
+{
+	// mem texture, always 32-bit rgba (might add 8-bit for palette if needed)
+
+	int m_pitch; // row stride in bytes: width * 4 rounded up to a multiple of 32 (see the constructor)
+	void* m_data; // pixel buffer of m_pitch * height bytes, allocated with _aligned_malloc(..., 32)
+	std::atomic_flag m_mapped; // set while the texture is mapped; Map() fails until Unmap() clears it
+
+public:
+	GSTextureSW(int type, int width, int height);
+	virtual ~GSTextureSW();
+	// Update() copies a rectangle in through Map()/Unmap(); Save() writes the buffer out as a PNG (dds is not implemented).
+	bool Update(const GSVector4i& r, const void* data, int pitch);
+	bool Map(GSMap& m, const GSVector4i* r);
+	void Unmap();
+	bool Save(const string& fn, bool user_image = false, bool dds = false);
+};
diff --git a/plugins/GSdx_legacy/GSThread.cpp b/plugins/GSdx_legacy/GSThread.cpp
new file mode 100644
index 0000000000..860b7384f2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSThread.cpp
@@ -0,0 +1,130 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSThread_CXX11.h"
+
+GSThread::GSThread()
+{
+    #ifdef _WIN32
+	// No thread exists yet; CloseThread() treats a NULL handle as nothing to close.
+	m_ThreadId = 0;
+	m_hThread = NULL;
+
+    #else
+    // Nothing is initialised here; the non-Windows members are only assigned in CreateThread().
+    #endif
+}
+
+GSThread::~GSThread()
+{
+	CloseThread(); // waits for the worker thread to end before the object goes away
+}
+
+#ifdef _WIN32
+
+DWORD WINAPI GSThread::StaticThreadProc(void* lpParam)
+{
+	((GSThread*)lpParam)->ThreadProc(); // lpParam is the GSThread* that CreateThread() passed as 'this'
+	// The thread's exit code is always 0.
+	return 0;
+}
+
+#else
+
+void* GSThread::StaticThreadProc(void* param)
+{
+	((GSThread*)param)->ThreadProc(); // param is the GSThread* that CreateThread() passed as 'this'
+#ifndef _STD_THREAD_ // exit is done implicitly by std::thread
+	pthread_exit(NULL); // ends the thread here, so the return below is not reached in pthread builds
+#endif
+	return NULL;
+}
+
+#endif
+
+void GSThread::CreateThread()
+{
+    #ifdef _WIN32
+	// Starts the worker immediately (creation flags 0); it runs ThreadProc() of this object through StaticThreadProc.
+	m_hThread = ::CreateThread(NULL, 0, StaticThreadProc, (void*)this, 0, &m_ThreadId);
+
+	#else
+    // Same entry point on the other platforms, started through std::thread or pthread depending on _STD_THREAD_.
+    #ifdef _STD_THREAD_
+    t = new thread(StaticThreadProc,(void*)this);
+    #else
+    pthread_attr_init(&m_thread_attr);
+    pthread_create(&m_thread, &m_thread_attr, StaticThreadProc, (void*)this); // NOTE(review): the return value is not checked
+    #endif
+
+	#endif
+}
+
+void GSThread::CloseThread()
+{
+    #ifdef _WIN32
+	// Give the worker 5 seconds to finish on its own; past that it is assumed hung and is terminated.
+	if(m_hThread != NULL)
+	{
+		if(WaitForSingleObject(m_hThread, 5000) != WAIT_OBJECT_0)
+		{
+			printf("GSdx: WARNING: GSThread Thread did not close itself in time. Assuming hung. Terminating.\n");
+			TerminateThread(m_hThread, 1);
+		}
+
+		CloseHandle(m_hThread);
+		// Reset the members so that a second CloseThread() call (e.g. from the destructor) does nothing.
+		m_hThread = NULL;
+		m_ThreadId = 0;
+	}
+
+    #else
+    // Should be tested on windows too one day, native handle should be disabled there though, or adapted to windows thread
+    #ifdef _STD_THREAD_
+    // _NATIVE_HANDLE_ is defined unconditionally right below, so the first branch is the one that gets compiled.
+    #define _NATIVE_HANDLE_ // Using std::thread native handle, allows to just use posix stuff.
+    #ifdef _NATIVE_HANDLE_ // std::thread join seemed to be bugged in the system library; it did work at some point (gcc 5), so retest every now and then
+    pthread_t m_thread = t->native_handle();
+    void *ret = NULL;
+    pthread_join(m_thread, &ret);
+    /* We are sure the thread is dead at this point, but the std::thread object is never deleted
+     * on this path, so it is leaked (a moderate leak; the join()/delete branch below avoids it).
+     * NOTE(review): presumably deleting is skipped because a std::thread joined only through its
+     * native handle still counts as joinable, and destroying it would call std::terminate -- confirm.
+     * 3kinox
+     */
+    #else
+    if(t->joinable())
+    {
+        t->join();
+    }
+    delete(t);
+    #endif
+    #else
+    void* ret = NULL;
+    // NOTE(review): nothing records that the join happened; ~GSThread() calls CloseThread() again after an explicit call (GSJobQueue does this), joining the same pthread_t twice -- confirm and guard
+    pthread_join(m_thread, &ret);
+    pthread_attr_destroy(&m_thread_attr);
+    #endif
+    #endif
+}
+
diff --git a/plugins/GSdx_legacy/GSThread.h b/plugins/GSdx_legacy/GSThread.h
new file mode 100644
index 0000000000..9594a793f3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSThread.h
@@ -0,0 +1,126 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSdx.h"
+
+// http://software.intel.com/en-us/blogs/2012/11/06/exploring-intel-transactional-synchronization-extensions-with-intel-software
+#if 0
+// Scope guard that runs the enclosed code as a hardware (RTM) transaction when possible and under a spin lock otherwise.
+class TransactionScope
+{
+public:
+	// Minimal spin lock used as the fallback when no transaction can be used.
+	class Lock
+	{
+		std::atomic<bool> state; // true while the lock is held
+
+	public:
+		Lock()
+			: state(false)
+		{
+		}
+		// Spins until this thread is the one that switched the flag from false to true.
+		void lock()
+		{
+			bool expected_value = false;
+			while(!state.compare_exchange_strong(expected_value, true))
+			{
+				do {_mm_pause();} while(state);
+				expected_value = false; // a failed exchange stored the observed value (true) here
+			}
+		}
+
+		void unlock()
+		{
+			state = false;
+		}
+
+		bool isLocked() const
+		{
+			return state.load();
+		}
+	};
+
+private:
+	Lock& fallBackLock;
+
+	TransactionScope(); // private and without a definition here: a scope always needs a fallback lock
+public:
+	TransactionScope(Lock& fallBackLock_, int max_retries = 3)
+		: fallBackLock(fallBackLock_)
+	{
+		// The TSX (RTM/HLE) instructions on Intel AVX2 CPUs may either be
+		// absent or disabled (see errata HSD136 and specification change at
+		// http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-desktop-specification-update.pdf)
+		// This can cause builds for AVX2 CPUs to fail with GCC/Clang on Linux,
+		// so check that the RTM instructions are actually available.
+		#if (_M_SSE >= 0x501 && !defined(__GNUC__)) || defined(__RTM__)
+		int nretries = 0;
+
+		while(1)
+		{
+			++nretries;
+			unsigned status = _xbegin();
+			// _xbegin() returns _XBEGIN_STARTED when the transaction started; anything else is an abort status.
+			if(status == _XBEGIN_STARTED)
+			{
+				if(!fallBackLock.isLocked()) return;
+				// Someone holds the fallback lock: abort with code 0xff, which is recognised just below.
+				_xabort(0xff);
+			}
+
+			if((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff && !(status & _XABORT_NESTED))
+			{
+				while(fallBackLock.isLocked()) _mm_pause();
+			}
+			else if(!(status & _XABORT_RETRY))
+			{
+				break;
+			}
+
+			if(nretries >= max_retries)
+			{
+				break;
+			}
+		}
+
+		#endif
+		// No transaction is running (RTM not compiled in, or the retries ran out): take the lock instead.
+		fallBackLock.lock();
+	}
+
+	~TransactionScope()
+	{
+		if(fallBackLock.isLocked())
+		{
+			fallBackLock.unlock();
+		}
+		#if (_M_SSE >= 0x501 && !defined(__GNUC__)) || defined(__RTM__)
+		else
+		{
+			_xend();
+		}
+		#endif
+	}
+};
+#endif
diff --git a/plugins/GSdx_legacy/GSThread_CXX11.h b/plugins/GSdx_legacy/GSThread_CXX11.h
new file mode 100644
index 0000000000..21711814ba
--- /dev/null
+++ b/plugins/GSdx_legacy/GSThread_CXX11.h
@@ -0,0 +1,185 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSdx.h"
+#include "boost_spsc_queue.hpp"
+
+class IGSThread // interface for objects whose ThreadProc() is run on a worker thread by GSThread
+{
+protected:
+	virtual void ThreadProc() = 0; // body of the worker thread; called through GSThread::StaticThreadProc
+};
+
+// Define _STD_THREAD_ to run the worker through std::thread; leave it commented out to use pthread.
+// std::thread currently has some bugs/limitations (see the comments in GSThread::CloseThread).
+// For the moment pthread is kept for the thread itself, together with the new std objects (mutex, condition_variable).
+//#define _STD_THREAD_
+
+#ifdef _WIN32
+
+class GSThread : public IGSThread // Windows flavour: owns one native thread that runs ThreadProc()
+{
+    DWORD m_ThreadId;
+    HANDLE m_hThread;
+	// Native entry point; forwards to ThreadProc() of the GSThread passed as lpParam.
+	static DWORD WINAPI StaticThreadProc(void* lpParam);
+
+protected:
+	void CreateThread();
+	void CloseThread();
+
+public:
+	GSThread();
+	virtual ~GSThread();
+};
+
+#else
+
+#ifdef _STD_THREAD_
+#include <thread>
+#else
+#include <pthread.h>
+#endif
+
+class GSThread : public IGSThread // non-Windows flavour: owns one thread (std::thread or pthread) that runs ThreadProc()
+{
+    #ifdef _STD_THREAD_
+    std::thread *t; // allocated with new in CreateThread()
+    #else
+    pthread_attr_t m_thread_attr;
+    pthread_t m_thread;
+    #endif
+    static void* StaticThreadProc(void* param); // thread entry point; forwards to ThreadProc() of the GSThread passed as param
+
+protected:
+	void CreateThread();
+	void CloseThread();
+
+public:
+	GSThread();
+	virtual ~GSThread();
+};
+
+#endif
+
+template<class T> class IGSJobQueue : public GSThread // interface of a job queue that owns its worker thread
+{
+public:
+	IGSJobQueue() {}
+	virtual ~IGSJobQueue() {}
+	// Producer-side operations (implemented by GSJobQueue).
+	virtual bool IsEmpty() const = 0;
+	virtual void Push(const T& item) = 0;
+	virtual void Wait() = 0;
+	// Process() is what GSJobQueue::operator() forwards each item to; NOTE(review): GetPixels is only declared here, its meaning is not visible -- confirm
+	virtual void Process(T& item) = 0;
+	virtual int GetPixels(bool reset) = 0;
+};
+
+template<class T, int CAPACITY> class GSJobQueue : public IGSJobQueue<T>
+{
+protected:
+	std::atomic<int16_t> m_count; // items pushed but not yet accounted for as processed by the worker
+	std::atomic<bool> m_exit; // set by the destructor to ask the worker to leave ThreadProc()
+	ringbuffer_base<T, CAPACITY> m_queue;
+
+	std::mutex m_lock; // held around the m_count/m_exit checks that are paired with the condition variables
+	std::condition_variable m_empty; // signalled by the worker when m_count drops to zero (see Wait())
+	std::condition_variable m_notempty; // signalled by Push() and by the destructor
+	// Worker loop: sleeps until items are pending, drains the queue, then reports back through m_empty.
+	void ThreadProc() {
+		std::unique_lock<std::mutex> l(m_lock);
+		while (true) {
+			while (m_count == 0) {
+				if (m_exit.load(memory_order_acquire)) return;
+				m_notempty.wait(l);
+			}
+			// Process the items without holding m_lock so that Push() can take it meanwhile.
+			l.unlock();
+
+			int16_t consumed = 0;
+			for (int16_t nb = m_count; nb >= 0; nb--) {
+				if (m_queue.consume_one(*this))
+					consumed++;
+			}
+
+			l.lock();
+
+			m_count -= consumed;
+
+			if (m_count <= 0)
+				m_empty.notify_one();
+		}
+	}
+
+public:
+	GSJobQueue() :
+		m_count(0),
+		m_exit(false)
+	{
+		this->CreateThread();
+	}
+	// Asks the worker to exit and waits for it.
+	virtual ~GSJobQueue() {
+		// Publish the request under m_lock: the worker cannot then miss it between testing m_exit and blocking in wait().
+		std::unique_lock<std::mutex> l(m_lock);
+		m_exit.store(true, memory_order_release);
+		l.unlock();
+		m_notempty.notify_one();
+		this->CloseThread();
+	}
+
+	bool IsEmpty() const {
+		ASSERT(m_count >= 0);
+
+		return m_count == 0;
+	}
+	// Queues one item and wakes the worker; yields while push() fails (presumably because the ring buffer is full).
+	void Push(const T& item) {
+		while(!m_queue.push(item))
+			std::this_thread::yield();
+
+		std::unique_lock<std::mutex> l(m_lock);
+
+		m_count++;
+
+		l.unlock();
+
+		m_notempty.notify_one();
+	}
+	// Blocks the caller until the worker has accounted for every pushed item.
+	void Wait() {
+		if (m_count > 0) {
+			std::unique_lock<std::mutex> l(m_lock);
+			while (m_count > 0) {
+				m_empty.wait(l);
+			}
+		}
+
+		ASSERT(m_count == 0);
+	}
+	// Functor interface handed to m_queue.consume_one(): forwards the item to Process().
+	void operator() (T& item) {
+		this->Process(item);
+	}
+};
diff --git a/plugins/GSdx_legacy/GSUniformBufferOGL.h b/plugins/GSdx_legacy/GSUniformBufferOGL.h
new file mode 100644
index 0000000000..7237f96d5e
--- /dev/null
+++ b/plugins/GSdx_legacy/GSUniformBufferOGL.h
@@ -0,0 +1,155 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GLState.h"
+
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+extern uint64 g_uniform_upload_byte;
+#endif
+
+
+class GSUniformBufferOGL { // uniform buffer object of a fixed size, attached to one GLSL binding slot
+	GLuint buffer;		// data object
+	GLuint index;		// GLSL slot
+	uint32 size;	    // size of the data
+
+public:
+	GSUniformBufferOGL(GLuint index, uint32 size) : index(index)
+												  , size(size)
+	{
+		glGenBuffers(1, &buffer);
+		bind();
+		allocate();
+		attach();
+	}
+	// Binds the buffer to GL_UNIFORM_BUFFER unless GLState::ubo says it is already bound.
+	void bind()
+	{
+		if (GLState::ubo != buffer) {
+			GLState::ubo = buffer;
+			glBindBuffer(GL_UNIFORM_BUFFER, buffer);
+		}
+	}
+	// Allocates 'size' bytes of storage with no initial data; the buffer must be bound already.
+	void allocate()
+	{
+		glBufferData(GL_UNIFORM_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
+	}
+	// Attaches the buffer to the GLSL slot 'index'.
+	void attach()
+	{
+		// From the opengl manpage:
+		// glBindBufferBase also binds buffer to the generic buffer binding point specified by target
+		GLState::ubo = buffer;
+		glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer);
+	}
+	// Replaces the whole buffer content with the 'size' bytes found at src.
+	void upload(const void* src)
+	{
+		bind();
+		// glMapBufferRange allow to set various parameter but the call is
+		// synchronous whereas glBufferSubData could be asynchronous.
+		// TODO: investigate the extension ARB_invalidate_subdata
+		glBufferSubData(GL_UNIFORM_BUFFER, 0, size, src);
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+		g_uniform_upload_byte += size;
+#endif
+	}
+
+	~GSUniformBufferOGL() {
+		glDeleteBuffers(1, &buffer);
+	}
+};
+
+#define UBO_BUFFER_SIZE (4*1024*1024)
+
+class GSUniformBufferStorageOGL { // uniform data streamed through a persistently mapped buffer of UBO_BUFFER_SIZE bytes
+	GLuint buffer;		// data object
+	GLuint index;		// GLSL slot
+	uint32 size;	    // size of the data
+	uint8* m_buffer_ptr;	// start of the persistent mapping created in allocate()
+	uint32 m_offset;	// byte offset inside the mapping where the next upload is written
+
+public:
+	GSUniformBufferStorageOGL(GLuint index, uint32 size) : index(index)
+												  , size(size), m_offset(0)
+	{
+		glGenBuffers(1, &buffer);
+		bind();
+		allocate();
+		attach();
+	}
+	// Binds the buffer to GL_UNIFORM_BUFFER unless GLState::ubo says it is already bound.
+	void bind()
+	{
+		if (GLState::ubo != buffer) {
+			GLState::ubo = buffer;
+			glBindBuffer(GL_UNIFORM_BUFFER, buffer);
+		}
+	}
+	// Creates the immutable storage and maps all of it for writing with explicit flushes.
+	void allocate()
+	{
+		const GLbitfield common_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT /*| GL_MAP_COHERENT_BIT */;
+		const GLbitfield map_flags = common_flags | GL_MAP_FLUSH_EXPLICIT_BIT;
+		const GLbitfield create_flags = common_flags /*| GL_CLIENT_STORAGE_BIT */;
+
+		GLsizei buffer_size = UBO_BUFFER_SIZE;
+		glBufferStorage(GL_UNIFORM_BUFFER, buffer_size, NULL, create_flags);
+		m_buffer_ptr = (uint8*) glMapBufferRange(GL_UNIFORM_BUFFER, 0, buffer_size, map_flags);
+		ASSERT(m_buffer_ptr);
+	}
+	// Attaches the 'size' bytes found at m_offset to the GLSL slot 'index'.
+	void attach()
+	{
+		// From the opengl manpage:
+		// glBindBufferBase also binds buffer to the generic buffer binding point specified by target
+		GLState::ubo = buffer;
+		// A range is bound (rather than the whole buffer) because every upload lives at a different offset.
+		glBindBufferRange(GL_UNIFORM_BUFFER, index, buffer, m_offset, size);
+	}
+	// Copies the 'size' bytes found at src into the next slot of the mapping and makes the GLSL slot point at them.
+	void upload(const void* src)
+	{
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+		g_uniform_upload_byte += size;
+#endif
+		// Write the new block at the current offset of the persistent mapping.
+		memcpy(m_buffer_ptr + m_offset, src, size);
+		// Bind the range just written, then flush it explicitly (the mapping uses GL_MAP_FLUSH_EXPLICIT_BIT).
+		attach();
+		glFlushMappedBufferRange(GL_UNIFORM_BUFFER, m_offset, size);
+		// Advance to the next 256-byte boundary; restart at 0 once another block of 'size' bytes would not fit in the mapping.
+		m_offset = (m_offset + size + 255u) & ~0xFF;
+		if (m_offset + size > UBO_BUFFER_SIZE)
+			m_offset = 0;
+	}
+
+	~GSUniformBufferStorageOGL() {
+		bind();
+		glUnmapBuffer(GL_UNIFORM_BUFFER);
+		glDeleteBuffers(1, &buffer);
+	}
+};
+
+#undef UBO_BUFFER_SIZE
diff --git a/plugins/GSdx_legacy/GSUtil.cpp b/plugins/GSdx_legacy/GSUtil.cpp
new file mode 100644
index 0000000000..2afded494d
--- /dev/null
+++ b/plugins/GSdx_legacy/GSUtil.cpp
@@ -0,0 +1,407 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GS.h"
+#include "GSUtil.h"
+#include "xbyak/xbyak_util.h"
+
+#ifdef _WIN32
+#include "GSDeviceDX.h"
+#include <VersionHelpers.h>
+#include "svnrev.h"
+#else
+#define SVN_REV 0
+#define SVN_MODS 0
+#endif
+
+const char* GSUtil::GetLibName()
+{
+	// TODO: critsec (the lazy initialisation of the static string below is not synchronised)
+	// Returns "GSdx" followed by build details; the text is built on the first call and reused afterwards.
+	static string str;
+
+	if(str.empty())
+	{
+		str = "GSdx";
+		// Windows builds append the revision number, plus "m" when SVN_MODS is non-zero.
+		#ifdef _WIN32
+		str += format(" %lld", SVN_REV);
+		if(SVN_MODS) str += "m";
+		#endif
+
+		#ifdef _M_AMD64
+		str += " 64-bit";
+		#endif
+		// sl collects the compiler name/version and the instruction set selected at compile time.
+		list<string> sl;
+
+		#ifdef __INTEL_COMPILER
+		sl.push_back(format("Intel C++ %d.%02d", __INTEL_COMPILER / 100, __INTEL_COMPILER % 100));
+		#elif _MSC_VER
+		sl.push_back(format("MSVC %d.%02d", _MSC_VER / 100, _MSC_VER % 100));
+		#elif __GNUC__
+		sl.push_back(format("GCC %d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__));
+		#endif
+
+		#if _M_SSE >= 0x501
+		sl.push_back("AVX2");
+		#elif _M_SSE >= 0x500
+		sl.push_back("AVX");
+		#elif _M_SSE >= 0x402
+		sl.push_back("SSE42");
+		#elif _M_SSE >= 0x401
+		sl.push_back("SSE41");
+		#elif _M_SSE >= 0x301
+		sl.push_back("SSSE3");
+		#elif _M_SSE >= 0x200
+		sl.push_back("SSE2");
+		#elif _M_SSE >= 0x100
+		sl.push_back("SSE");
+		#endif
+		// Append the collected entries as " (a, b)"; the iterator is advanced inside the loop body.
+		for(list<string>::iterator i = sl.begin(); i != sl.end(); )
+		{
+			if(i == sl.begin()) str += " (";
+			str += *i;
+			str += ++i != sl.end() ? ", " : ")";
+		}
+	}
+
+	return str.c_str();
+}
+
+static class GSUtilMaps // lookup tables behind the GSUtil accessors, filled once by the constructor of the static s_maps
+{
+public:
+	uint8 PrimClassField[8]; // primitive type -> primitive class
+	uint8 VertexCountField[8]; // primitive type -> vertex count
+	uint8 ClassVertexCountField[4]; // primitive class -> vertex count
+	uint32 CompatibleBitsField[64][2]; // one 64-bit bitmap per format: word psm >> 5, bit psm & 0x1f
+	uint32 SharedBitsField[64][2]; // same layout; a set bit makes GSUtil::HasSharedBits return false for that pair
+
+	GSUtilMaps()
+	{
+		PrimClassField[GS_POINTLIST] = GS_POINT_CLASS;
+		PrimClassField[GS_LINELIST] = GS_LINE_CLASS;
+		PrimClassField[GS_LINESTRIP] = GS_LINE_CLASS;
+		PrimClassField[GS_TRIANGLELIST] = GS_TRIANGLE_CLASS;
+		PrimClassField[GS_TRIANGLESTRIP] = GS_TRIANGLE_CLASS;
+		PrimClassField[GS_TRIANGLEFAN] = GS_TRIANGLE_CLASS;
+		PrimClassField[GS_SPRITE] = GS_SPRITE_CLASS;
+		PrimClassField[GS_INVALID] = GS_INVALID_CLASS;
+
+		VertexCountField[GS_POINTLIST] = 1;
+		VertexCountField[GS_LINELIST] = 2;
+		VertexCountField[GS_LINESTRIP] = 2;
+		VertexCountField[GS_TRIANGLELIST] = 3;
+		VertexCountField[GS_TRIANGLESTRIP] = 3;
+		VertexCountField[GS_TRIANGLEFAN] = 3;
+		VertexCountField[GS_SPRITE] = 2;
+		VertexCountField[GS_INVALID] = 1;
+
+		ClassVertexCountField[GS_POINT_CLASS] = 1;
+		ClassVertexCountField[GS_LINE_CLASS] = 2;
+		ClassVertexCountField[GS_TRIANGLE_CLASS] = 3;
+		ClassVertexCountField[GS_SPRITE_CLASS] = 2;
+
+		memset(CompatibleBitsField, 0, sizeof(CompatibleBitsField));
+		// Every format is compatible with itself...
+		for(int i = 0; i < 64; i++)
+		{
+			CompatibleBitsField[i][i >> 5] |= 1 << (i & 0x1f);
+		}
+		// ...and the pairs below are marked compatible in both directions.
+		CompatibleBitsField[PSM_PSMCT32][PSM_PSMCT24 >> 5] |= 1 << (PSM_PSMCT24 & 0x1f);
+		CompatibleBitsField[PSM_PSMCT24][PSM_PSMCT32 >> 5] |= 1 << (PSM_PSMCT32 & 0x1f);
+		CompatibleBitsField[PSM_PSMCT16][PSM_PSMCT16S >> 5] |= 1 << (PSM_PSMCT16S & 0x1f);
+		CompatibleBitsField[PSM_PSMCT16S][PSM_PSMCT16 >> 5] |= 1 << (PSM_PSMCT16 & 0x1f);
+		CompatibleBitsField[PSM_PSMZ32][PSM_PSMZ24 >> 5] |= 1 << (PSM_PSMZ24 & 0x1f);
+		CompatibleBitsField[PSM_PSMZ24][PSM_PSMZ32 >> 5] |= 1 << (PSM_PSMZ32 & 0x1f);
+		CompatibleBitsField[PSM_PSMZ16][PSM_PSMZ16S >> 5] |= 1 << (PSM_PSMZ16S & 0x1f);
+		CompatibleBitsField[PSM_PSMZ16S][PSM_PSMZ16 >> 5] |= 1 << (PSM_PSMZ16 & 0x1f);
+
+		memset(SharedBitsField, 0, sizeof(SharedBitsField));
+		// Inverted sense: the pairs marked here are the ones HasSharedBits reports as NOT sharing bits.
+		SharedBitsField[PSM_PSMCT24][PSM_PSMT8H >> 5] |= 1 << (PSM_PSMT8H & 0x1f);
+		SharedBitsField[PSM_PSMCT24][PSM_PSMT4HL >> 5] |= 1 << (PSM_PSMT4HL & 0x1f);
+		SharedBitsField[PSM_PSMCT24][PSM_PSMT4HH >> 5] |= 1 << (PSM_PSMT4HH & 0x1f);
+		SharedBitsField[PSM_PSMZ24][PSM_PSMT8H >> 5] |= 1 << (PSM_PSMT8H & 0x1f);
+		SharedBitsField[PSM_PSMZ24][PSM_PSMT4HL >> 5] |= 1 << (PSM_PSMT4HL & 0x1f);
+		SharedBitsField[PSM_PSMZ24][PSM_PSMT4HH >> 5] |= 1 << (PSM_PSMT4HH & 0x1f);
+		SharedBitsField[PSM_PSMT8H][PSM_PSMCT24 >> 5] |= 1 << (PSM_PSMCT24 & 0x1f);
+		SharedBitsField[PSM_PSMT8H][PSM_PSMZ24 >> 5] |= 1 << (PSM_PSMZ24 & 0x1f);
+		SharedBitsField[PSM_PSMT4HL][PSM_PSMCT24 >> 5] |= 1 << (PSM_PSMCT24 & 0x1f);
+		SharedBitsField[PSM_PSMT4HL][PSM_PSMZ24 >> 5] |= 1 << (PSM_PSMZ24 & 0x1f);
+		SharedBitsField[PSM_PSMT4HL][PSM_PSMT4HH >> 5] |= 1 << (PSM_PSMT4HH & 0x1f);
+		SharedBitsField[PSM_PSMT4HH][PSM_PSMCT24 >> 5] |= 1 << (PSM_PSMCT24 & 0x1f);
+		SharedBitsField[PSM_PSMT4HH][PSM_PSMZ24 >> 5] |= 1 << (PSM_PSMZ24 & 0x1f);
+		SharedBitsField[PSM_PSMT4HH][PSM_PSMT4HL >> 5] |= 1 << (PSM_PSMT4HL & 0x1f);
+	}
+
+} s_maps;
+
+GS_PRIM_CLASS GSUtil::GetPrimClass(uint32 prim)
+{
+	return (GS_PRIM_CLASS)s_maps.PrimClassField[prim]; // table lookup; prim indexes an 8-entry array without a range check
+}
+
+int GSUtil::GetVertexCount(uint32 prim)
+{
+	return s_maps.VertexCountField[prim]; // table lookup; prim indexes an 8-entry array without a range check
+}
+
+int GSUtil::GetClassVertexCount(uint32 primclass)
+{
+	return s_maps.ClassVertexCountField[primclass]; // table lookup; primclass indexes a 4-entry array without a range check
+}
+
+const uint32* GSUtil::HasSharedBitsPtr(uint32 dpsm)
+{
+	return s_maps.SharedBitsField[dpsm]; // the two-word bitmap row of dpsm, usable with HasSharedBits(spsm, ptr)
+}
+
+bool GSUtil::HasSharedBits(uint32 spsm, const uint32* RESTRICT ptr)
+{
+	return (ptr[spsm >> 5] & (1 << (spsm & 0x1f))) == 0; // true when the bit of spsm is clear in the given bitmap row
+}
+
+bool GSUtil::HasSharedBits(uint32 spsm, uint32 dpsm)
+{
+	return (s_maps.SharedBitsField[dpsm][spsm >> 5] & (1 << (spsm & 0x1f))) == 0; // true when the (dpsm, spsm) pair is not marked in SharedBitsField
+}
+
+bool GSUtil::HasSharedBits(uint32 sbp, uint32 spsm, uint32 dbp, uint32 dpsm)
+{
+	return ((sbp ^ dbp) | (s_maps.SharedBitsField[dpsm][spsm >> 5] & (1 << (spsm & 0x1f)))) == 0; // true only when sbp == dbp and the (dpsm, spsm) pair is not marked
+}
+
+bool GSUtil::HasCompatibleBits(uint32 spsm, uint32 dpsm)
+{
+	return (s_maps.CompatibleBitsField[spsm][dpsm >> 5] & (1 << (dpsm & 0x1f))) != 0; // true when the bit of dpsm is set in the row of spsm
+}
+
+bool GSUtil::CheckSSE()
+{
+	Xbyak::util::Cpu cpu;
+	Xbyak::util::Cpu::Type type;
+	// Select the CPU feature matching the instruction set this binary was compiled for (_M_SSE).
+	#if _M_SSE >= 0x500
+	type = Xbyak::util::Cpu::tAVX;
+	#elif _M_SSE >= 0x402
+	type = Xbyak::util::Cpu::tSSE42;
+	#elif _M_SSE >= 0x401
+	type = Xbyak::util::Cpu::tSSE41;
+	#elif _M_SSE >= 0x301
+	type = Xbyak::util::Cpu::tSSSE3;
+	#elif _M_SSE >= 0x200
+	type = Xbyak::util::Cpu::tSSE2;
+	#endif
+	// NOTE(review): no branch assigns 'type' when _M_SSE is below 0x200, and AVX2 builds (0x501) only test for AVX -- confirm this is intended
+	if(!cpu.has(type))
+	{
+		fprintf(stderr, "This CPU does not support SSE %d.%02d", _M_SSE >> 8, _M_SSE & 0xff);
+
+		return false;
+	}
+
+	return true;
+}
+
+#define OCL_PROGRAM_VERSION 3
+
+#ifdef ENABLE_OPENCL
+void GSUtil::GetDeviceDescs(list<OCLDeviceDesc>& dl)
+{
+	dl.clear();
+	// Refills dl with one descriptor per usable OpenCL CPU/GPU device; a cl::Error is printed and ends the enumeration.
+	try
+	{
+		std::vector<cl::Platform> platforms;
+
+		cl::Platform::get(&platforms);
+
+		for(auto& p : platforms)
+		{
+			std::string platform_vendor = p.getInfo<CL_PLATFORM_VENDOR>(); // not used below
+
+			std::vector<cl::Device> ds;
+
+			p.getDevices(CL_DEVICE_TYPE_ALL, &ds);
+
+			for(auto& device : ds)
+			{
+				string type;
+
+				switch(device.getInfo<CL_DEVICE_TYPE>())
+				{
+				case CL_DEVICE_TYPE_GPU: type = "GPU"; break;
+				case CL_DEVICE_TYPE_CPU: type = "CPU"; break;
+				}
+				// Device types other than GPU and CPU are skipped.
+				if(type.empty()) continue;
+
+				std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
+
+				int major = 0;
+				int minor = 0;
+				// NOTE(review): && binds tighter than ||, so this reads (type set && two fields parsed && major == 1 && minor >= 1) || major > 1 -- confirm intended
+				if(!type.empty() && sscanf(version.c_str(), "OpenCL C %d.%d", &major, &minor) == 2 && major == 1 && minor >= 1 || major > 1)
+				{
+					OCLDeviceDesc desc;
+
+					desc.device = device;
+					desc.name = GetDeviceUniqueName(device);
+					desc.version = major * 100 + minor * 10;
+
+					// TODO: linux
+					// tmppath becomes <temp dir>/<device name>/<OCL_PROGRAM_VERSION>; both directory levels are created when missing.
+					char* buff = new char[MAX_PATH + 1];
+					GetTempPath(MAX_PATH, buff);
+					desc.tmppath = string(buff) + "/" + desc.name;
+
+					WIN32_FIND_DATA FindFileData;
+					HANDLE hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData);
+					if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind);
+					else CreateDirectory(desc.tmppath.c_str(), NULL);
+
+					sprintf(buff, "/%d", OCL_PROGRAM_VERSION);
+					desc.tmppath += buff;
+					delete[] buff;
+
+					hFind = FindFirstFile(desc.tmppath.c_str(), &FindFileData);
+					if(hFind != INVALID_HANDLE_VALUE) FindClose(hFind);
+					else CreateDirectory(desc.tmppath.c_str(), NULL);
+
+					dl.push_back(desc);
+				}
+			}
+		}
+	}
+	catch(cl::Error err)
+	{
+		printf("%s (%d)\n", err.what(), err.err());
+	}
+}
+
+string GSUtil::GetDeviceUniqueName(cl::Device& device)
+{
+	std::string vendor = device.getInfo<CL_DEVICE_VENDOR>();
+	std::string name = device.getInfo<CL_DEVICE_NAME>();
+	std::string version = device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
+	// Builds "<vendor> <name> <OpenCL C version> <type>"; type stays empty for devices that are neither GPU nor CPU.
+	string type;
+
+	switch(device.getInfo<CL_DEVICE_TYPE>())
+	{
+	case CL_DEVICE_TYPE_GPU: type = "GPU"; break;
+	case CL_DEVICE_TYPE_CPU: type = "CPU"; break;
+	}
+	// Drop trailing spaces from the version string.
+	version.erase(version.find_last_not_of(' ') + 1);
+
+	return vendor + " " + name + " " + version + " " + type;
+}
+#endif
+
+#ifdef _WIN32
+
+bool GSUtil::CheckDirectX() // returns true when the D3D compiler library can be loaded
+{
+	if (GSDeviceDX::LoadD3DCompiler())
+	{
+		GSDeviceDX::FreeD3DCompiler(); // this was only a probe, so release the library again
+		return true;
+	}
+	// Loading failed: below Windows 8.1 the user is offered the DirectX download page.
+	// User's system is likely broken if it fails and is Windows 8.1 or greater.
+	if (!IsWindows8Point1OrGreater())
+	{
+		printf("Cannot find d3dcompiler_43.dll\n");
+		if (MessageBox(nullptr, TEXT("You need to update some DirectX libraries, would you like to do it now?"), TEXT("GSdx"), MB_YESNO) == IDYES)
+		{
+			ShellExecute(nullptr, TEXT("open"), TEXT("https://www.microsoft.com/en-us/download/details.aspx?id=8109"), nullptr, nullptr, SW_SHOWNORMAL);
+		}
+	}
+	return false;
+}
+
+// ---------------------------------------------------------------------------------
+//  DX11 Detection (includes DXGI detection and dynamic library method bindings)
+// ---------------------------------------------------------------------------------
+//  Code 'Borrowed' from Microsoft's DXGI sources -- Modified to suit our needs. --air
+//  Stripped down because of unnecessary complexity and false positives
+//  e.g. (d3d11_beta.dll would fail at device creation time) --pseudonym
+
+static int s_DXGI; // cached probe result used by CheckDXGI(): 0 = not probed yet, 1 = dxgi.dll loads, -1 = it does not
+static int s_D3D11; // cached probe result used by CheckD3D11(): 0 = not probed yet, 1 = d3d11.dll loads, -1 = it does not
+
+bool GSUtil::CheckDXGI()
+{
+	// s_DXGI caches the probe: 0 = not probed yet, 1 = dxgi.dll loads, -1 = it does not.
+	if (s_DXGI == 0)
+	{
+		HMODULE dxgi = LoadLibrary("dxgi.dll");
+		const bool loaded = (dxgi != NULL);
+		if (loaded) FreeLibrary(dxgi);
+		s_DXGI = loaded ? 1 : -1;
+	}
+	return s_DXGI > 0;
+}
+
+bool GSUtil::CheckD3D11()
+{
+	// DXGI must load first; after that s_D3D11 caches the d3d11.dll probe (0 = not probed, 1 = loads, -1 = does not).
+	if (!CheckDXGI())
+		return false;
+
+	if (s_D3D11 == 0)
+	{
+		HMODULE d3d11 = LoadLibrary("d3d11.dll");
+		const bool loaded = (d3d11 != NULL);
+		if (loaded) FreeLibrary(d3d11);
+		s_D3D11 = loaded ? 1 : -1;
+	}
+	return s_D3D11 > 0;
+}
+
+D3D_FEATURE_LEVEL GSUtil::CheckDirect3D11Level(IDXGIAdapter *adapter, D3D_DRIVER_TYPE type)
+{
+	HRESULT hr;
+	D3D_FEATURE_LEVEL level;
+	// Returns the feature level reported for the adapter/driver type, or 0 when d3d11.dll is missing or the probe fails.
+	if(!CheckD3D11())
+		return (D3D_FEATURE_LEVEL)0;
+	// Only the feature level is requested: the device and context output pointers are NULL.
+	hr = D3D11CreateDevice(adapter, type, NULL, 0, NULL, 0, D3D11_SDK_VERSION, NULL, &level, NULL);
+
+	return SUCCEEDED(hr) ? level : (D3D_FEATURE_LEVEL)0;
+}
+
+#else
+
+void GSmkdir(const char* dir)
+{
+	const int status = mkdir(dir, 0777); // non-zero means the directory could not be created
+	if (status != 0) fprintf(stderr, "Failed to create directory: %s\n", dir);
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSUtil.h b/plugins/GSdx_legacy/GSUtil.h
new file mode 100644
index 0000000000..f1372775e3
--- /dev/null
+++ b/plugins/GSdx_legacy/GSUtil.h
@@ -0,0 +1,70 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+
+// Plain record describing one OpenCL device (see GSUtil::GetDeviceDescs).
+struct OCLDeviceDesc
+{
+#ifdef ENABLE_OPENCL
+	// Only present when the plugin is built with ENABLE_OPENCL.
+	cl::Device device;
+#endif
+	string name;
+	int version;
+	string tmppath;
+};
+
+// Collection of static helper functions; the class has no data members and is
+// used purely as a namespace.
+class GSUtil
+{
+public:
+	static const char* GetLibName();
+
+	// Primitive helpers
+	static GS_PRIM_CLASS GetPrimClass(uint32 prim);
+	static int GetVertexCount(uint32 prim);
+	static int GetClassVertexCount(uint32 primclass);
+
+	// Pixel storage mode (psm) compatibility queries
+	static const uint32* HasSharedBitsPtr(uint32 dpsm);
+	static bool HasSharedBits(uint32 spsm, const uint32* ptr);
+	static bool HasSharedBits(uint32 spsm, uint32 dpsm);
+	static bool HasSharedBits(uint32 sbp, uint32 spsm, uint32 dbp, uint32 dpsm);
+	static bool HasCompatibleBits(uint32 spsm, uint32 dpsm);
+
+	static bool CheckSSE();
+
+#ifdef ENABLE_OPENCL
+	// OpenCL helpers, only available when built with ENABLE_OPENCL
+	static void GetDeviceDescs(list<OCLDeviceDesc>& dl);
+	static string GetDeviceUniqueName(cl::Device& device);
+#endif
+
+#ifdef _WIN32
+
+	static bool CheckDirectX();
+	// True when dxgi.dll can be loaded; the result is cached after the first call.
+	static bool CheckDXGI();
+	// True when both dxgi.dll and d3d11.dll can be loaded; the result is cached.
+	static bool CheckD3D11();
+	// Feature level reported by D3D11CreateDevice for the adapter/driver type,
+	// or (D3D_FEATURE_LEVEL)0 when D3D11 is unavailable or the call fails.
+	static D3D_FEATURE_LEVEL CheckDirect3D11Level(IDXGIAdapter *adapter = NULL, D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_HARDWARE);
+
+#endif
+};
+
+// GSmkdir() is defined in the non-Windows (#else) branch of GSUtil.cpp, so it is
+// declared for every non-Windows build rather than for Linux only.
+// NOTE(review): assumes that #else pairs with an #ifdef _WIN32 - confirm.
+#ifndef _WIN32
+void GSmkdir(const char* dir);
+#endif
diff --git a/plugins/GSdx_legacy/GSVector.cpp b/plugins/GSdx_legacy/GSVector.cpp
new file mode 100644
index 0000000000..d5d074ac3a
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVector.cpp
@@ -0,0 +1,219 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSVector.h"
+
+// Byte masks: m_xff[n] has its n lowest bytes set to 0xff and every other byte
+// cleared (n = 0..16).
+const GSVector4i GSVector4i::m_xff[17] = 
+{
+	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x000000ff, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector4i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000), 
+	GSVector4i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000), 
+	GSVector4i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000), 
+	GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
+	GSVector4i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000), 
+	GSVector4i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000), 
+	GSVector4i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000), 
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff), 
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff), 
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff), 
+	GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+};
+
+// Nibble masks: m_x0f[n] has its n lowest bytes set to 0x0f and every other byte
+// cleared (n = 0..16).
+const GSVector4i GSVector4i::m_x0f[17] =
+{
+	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x0000000f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
+	GSVector4i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), 
+	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
+};
+
+// Shared floating-point constants of GSVector4.
+const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
+const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
+const GSVector4 GSVector4::m_half(0.5f);
+const GSVector4 GSVector4::m_one(1.0f);
+const GSVector4 GSVector4::m_two(2.0f);
+const GSVector4 GSVector4::m_four(4.0f);
+// 0x4b000000 is the IEEE-754 single-precision bit pattern of 2^23 (8388608.0f).
+const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
+// 0x4f800000 is the IEEE-754 single-precision bit pattern of 2^32 (4294967296.0f).
+const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));
+const GSVector4 GSVector4::m_max(FLT_MAX);
+// NOTE(review): FLT_MIN is the smallest positive normalized float, not the most
+// negative value (-FLT_MAX) - confirm that users of m_min expect this.
+const GSVector4 GSVector4::m_min(FLT_MIN);
+
+#if _M_SSE >= 0x500
+
+// Shared floating-point constants of GSVector8 (compiled only when _M_SSE >= 0x500).
+const GSVector8 GSVector8::m_half(0.5f);
+const GSVector8 GSVector8::m_one(1.0f);
+// 0x7fffffff: every bit except the sign bit of each 32-bit lane.
+const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
+// 0x80000000: only the sign bit of each 32-bit lane.
+const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+// 0x4b000000 is the IEEE-754 single-precision bit pattern of 2^23 (8388608.0f).
+const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
+// 0x4f800000 is the IEEE-754 single-precision bit pattern of 2^32 (4294967296.0f).
+const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));
+const GSVector8 GSVector8::m_max(FLT_MAX);
+// NOTE(review): FLT_MIN is the smallest positive normalized float, not the most
+// negative value (-FLT_MAX) - confirm that users of m_min expect this.
+const GSVector8 GSVector8::m_min(FLT_MIN);
+
+#endif
+
+#if _M_SSE >= 0x501
+
+// Byte masks: m_xff[n] has its n lowest bytes set to 0xff and every other byte
+// cleared (n = 0..32).
+const GSVector8i GSVector8i::m_xff[33] = 
+{
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+};
+
+// Nibble masks: m_x0f[n] has its n lowest bytes set to 0x0f and every other byte
+// cleared (n = 0..32).
+const GSVector8i GSVector8i::m_x0f[33] =
+{
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), 
+	GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
+};
+
+#endif
+
+// Returns the largest centred sub-rectangle of *this with aspect ratio arx:ary.
+// The moved edge (left or top) is bumped to the next even coordinate and the
+// result is clipped against *this. A non-positive arx or ary returns *this
+// unchanged.
+GSVector4i GSVector4i::fit(int arx, int ary) const
+{
+	if(arx <= 0 || ary <= 0)
+	{
+		return *this;
+	}
+
+	GSVector4i r = *this;
+
+	const int w = width();
+	const int h = height();
+
+	if(w * ary > h * arx)
+	{
+		// wider than the target ratio: keep the height, shrink the width
+		const int fitted = h * arx / ary;
+
+		int l = (r.left + r.right - fitted) >> 1;
+		l += l & 1; // odd start coordinates move one pixel to the right
+
+		r.left = l;
+		r.right = l + fitted;
+	}
+	else
+	{
+		// taller than (or equal to) the target ratio: keep the width, shrink the height
+		const int fitted = w * ary / arx;
+
+		int t = (r.top + r.bottom - fitted) >> 1;
+		t += t & 1; // odd start coordinates move one pixel down
+
+		r.top = t;
+		r.bottom = t + fitted;
+	}
+
+	return r.rintersect(*this);
+}
+
+// Aspect ratio presets {arx, ary} indexed by the preset number of fit(int):
+// index 0 is a placeholder (fit(int) only uses indices > 0), then 4:3 and 16:9.
+static const int s_ar[][2] = {{0, 0}, {4, 3}, {16, 9}};
+
+// Applies one of the aspect ratio presets from s_ar. Presets outside
+// 1 .. countof(s_ar) - 1 leave the rectangle unchanged.
+GSVector4i GSVector4i::fit(int preset) const
+{
+	const bool known = preset > 0 && preset < (int)countof(s_ar);
+
+	return known ? fit(s_ar[preset][0], s_ar[preset][1]) : *this;
+}
diff --git a/plugins/GSdx_legacy/GSVector.h b/plugins/GSdx_legacy/GSVector.h
new file mode 100644
index 0000000000..c07b951317
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVector.h
@@ -0,0 +1,6048 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+
+#pragma once
+
+// Selects how GSVector4i::ralign<mode>() snaps the edges of a rectangle to the
+// alignment grid.
+enum Align_Mode
+{
+	Align_Outside, // left/top snap down, right/bottom snap up
+	Align_Inside,  // left/top snap up, right/bottom snap down
+	Align_NegInf,  // every edge snaps down
+	Align_PosInf   // by name: every edge snaps up
+};
+
+// Rounding mode selector.
+// NOTE(review): the values 8..11 look like the SSE4.1 rounding immediates
+// (_MM_FROUND_TO_* combined with _MM_FROUND_NO_EXC) - confirm against the users.
+enum Round_Mode
+{
+	Round_NearestInt = 8,
+	Round_NegInf = 9,
+	Round_PosInf = 10,
+	Round_Truncate = 11
+};
+
+#pragma pack(push, 1)
+
+// Minimal two-component vector. The same two values can be addressed as x/y,
+// as r/g or as the array v[0..1].
+template<class T> class GSVector2T
+{
+public:
+	union
+	{
+		struct {T x, y;};
+		struct {T r, g;};
+		struct {T v[2];};
+	};
+
+	// Leaves both components uninitialized.
+	GSVector2T()
+	{
+	}
+
+	GSVector2T(T x, T y)
+	{
+		this->x = x;
+		this->y = y;
+	}
+
+	// True when both components compare equal.
+	bool operator == (const GSVector2T& v) const
+	{
+		if(!(x == v.x)) return false;
+
+		return y == v.y;
+	}
+
+	// True when at least one component differs.
+	bool operator != (const GSVector2T& v) const
+	{
+		if(x != v.x) return true;
+
+		return y != v.y;
+	}
+};
+
+typedef GSVector2T<float> GSVector2;
+typedef GSVector2T<int> GSVector2i;
+
+class GSVector4;
+class GSVector4i;
+
+#if _M_SSE >= 0x500
+
+class GSVector8;
+
+#endif
+
+#if _M_SSE >= 0x501
+
+class GSVector8i;
+
+#endif
+
+__aligned(class, 16) GSVector4i
+{
+	static const GSVector4i m_xff[17];
+	static const GSVector4i m_x0f[17];
+
+public:
+	union
+	{
+		struct {int x, y, z, w;};
+		struct {int r, g, b, a;};
+		struct {int left, top, right, bottom;};
+		int v[4];
+		float f32[4];
+		int8 i8[16];
+		int16 i16[8];
+		int32 i32[4];
+		int64  i64[2];
+		uint8 u8[16];
+		uint16 u16[8];
+		uint32 u32[4];
+		uint64 u64[2];
+		__m128i m;
+	};
+
+	__forceinline GSVector4i()
+	{
+	}
+
+	// Builds the vector (x, y, z, w) from four scalars. Equivalent to
+	// m = _mm_set_epi32(w, z, y, x), but assembled from single-value loads and
+	// 32-bit interleaves.
+	__forceinline GSVector4i(int x, int y, int z, int w)
+	{
+		const GSVector4i even = load(x).upl32(load(z)); // lanes 0, 1 = x, z
+		const GSVector4i odd = load(y).upl32(load(w));  // lanes 0, 1 = y, w
+
+		// interleaving the two pairs yields x, y, z, w
+		*this = even.upl32(odd);
+	}
+
+	__forceinline GSVector4i(int x, int y)
+	{
+		*this = load(x).upl32(load(y));
+	}
+
+	__forceinline GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7)
+	{
+		m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+	}
+
+	__forceinline GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+	{
+		m = _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0);
+	}
+
+	__forceinline GSVector4i(const GSVector4i& v)
+	{
+		m = v.m;
+	}
+
+	// Loads the two ints of v into the low 64 bits; _mm_loadl_epi64 zeroes the
+	// upper 64 bits.
+	__forceinline explicit GSVector4i(const GSVector2i& v)
+	{
+		const __m128i* src = (const __m128i*)&v;
+
+		m = _mm_loadl_epi64(src);
+	}
+
+	__forceinline explicit GSVector4i(int i)
+	{
+		*this = i;
+	}
+
+	__forceinline explicit GSVector4i(__m128i m)
+	{
+		this->m = m;
+	}
+
+	__forceinline explicit GSVector4i(const GSVector4& v, bool truncate = true);
+
+	__forceinline static GSVector4i cast(const GSVector4& v);
+
+	#if _M_SSE >= 0x500
+
+	__forceinline static GSVector4i cast(const GSVector8& v);
+
+	#endif
+
+	#if _M_SSE >= 0x501
+
+	__forceinline static GSVector4i cast(const GSVector8i& v);
+
+	#endif
+
+	__forceinline void operator = (const GSVector4i& v)
+	{
+		m = v.m;
+	}
+
+	// Broadcasts i into all four 32-bit lanes.
+	__forceinline void operator = (int i)
+	{
+		#if _M_SSE < 0x501
+
+		m = _mm_set1_epi32(i);
+
+		#else
+
+		// _M_SSE >= 0x501: explicit broadcast from the low lane
+		m = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		#endif
+	}
+
+	__forceinline void operator = (__m128i m)
+	{
+		this->m = m;
+	}
+
+	__forceinline operator __m128i() const
+	{
+		return m;
+	}
+
+	// rect
+
+	__forceinline int width() const
+	{
+		return right - left;
+	}
+
+	__forceinline int height() const
+	{
+		return bottom - top;
+	}
+
+	// Translates the rectangle so that its top-left corner is the origin by
+	// subtracting (left, top, left, top) from all four edges.
+	__forceinline GSVector4i rsize() const
+	{
+		return *this - xyxy(); // same as GSVector4i(0, 0, width(), height());
+	}
+
+	// True unless left < right and top < bottom.
+	// The compare against zwzw() tests (left < right, top < bottom) in the two low
+	// lanes; the two high lanes compare a value with itself and are always false.
+	// Presumably mask() yields one bit per byte, so 0x00ff means both low lanes
+	// passed - confirm against mask().
+	__forceinline bool rempty() const
+	{
+		return (*this < zwzw()).mask() != 0x00ff;
+	}
+
+	// Bounding rectangle of *this and a, ignoring rectangles that are empty:
+	// both valid -> runion_ordered(a), only one valid -> that one, none -> zero().
+	__forceinline GSVector4i runion(const GSVector4i& a) const
+	{
+		// low 8 mask bits: left < right and top < bottom for *this,
+		// high 8 mask bits: the same test for a
+		const int valid = (upl64(a) < uph64(a)).mask();
+
+		if(valid == 0xffff) return runion_ordered(a);
+
+		const bool this_ok = (valid & 0x00ff) == 0x00ff;
+		const bool a_ok = (valid & 0xff00) == 0xff00;
+
+		return this_ok ? *this : a_ok ? a : GSVector4i::zero();
+	}
+
+	// Bounding rectangle of *this and a: (min left, min top, max right, max bottom).
+	// Does not check either rectangle for emptiness.
+	__forceinline GSVector4i runion_ordered(const GSVector4i& a) const
+	{
+		#if _M_SSE < 0x401
+
+		return GSVector4i(min(x, a.x), min(y, a.y), max(z, a.z), max(w, a.w));
+
+		#else
+
+		const GSVector4i lo = min_i32(a);          // low half holds min left, min top
+		const GSVector4i hi = max_i32(a).srl<8>(); // max right, max bottom moved into the low half
+
+		return lo.upl64(hi);
+
+		#endif
+	}
+
+	// Clips this rectangle against a: left/right are clamped to [a.left, a.right]
+	// and top/bottom to [a.top, a.bottom] (see sat_i32(a)).
+	__forceinline GSVector4i rintersect(const GSVector4i& a) const
+	{
+		return sat_i32(a);
+	}
+
+	// Snaps the edges of the rectangle to multiples of a (a.x horizontally,
+	// a.y vertically). mode is one of Align_Mode:
+	//   Align_Inside  : left/top snap up, right/bottom snap down
+	//   Align_Outside : left/top snap down, right/bottom snap up
+	//   Align_NegInf  : every edge snaps down
+	//   Align_PosInf  : every edge snaps up
+	template<int mode> __forceinline GSVector4i ralign(const GSVector2i& a) const
+	{
+		// a must be 1 << n
+
+		// mask = (a.x - 1, a.y - 1, 0, 0): only the two low lanes are populated
+		GSVector4i mask = GSVector4i(a) - GSVector4i(1, 1);
+
+		GSVector4i v;
+
+		switch(mode)
+		{
+		case Align_Inside: v = *this + mask; break;
+		case Align_Outside: v = *this + mask.zwxy(); break;
+		case Align_NegInf: v = *this; break;
+		// The bias has to be added to all four edges. mask.zwzw() is always zero
+		// and made this case behave exactly like Align_NegInf.
+		case Align_PosInf: v = *this + mask.xyxy(); break;
+		default: ASSERT(0); break;
+		}
+
+		// clear the low bits of every edge
+		return v.andnot(mask.xyxy());
+	}
+
+	GSVector4i fit(int arx, int ary) const;
+
+	GSVector4i fit(int preset) const;
+
+	#ifdef _WIN32
+
+	__forceinline operator LPCRECT() const
+	{
+		return (LPCRECT)this;
+	}
+
+	__forceinline operator LPRECT()
+	{
+		return (LPRECT)this;
+	}
+
+	#endif
+
+	//
+
+	// Packs the four 32-bit lanes down to bytes: signed saturation to 16 bit
+	// first, then unsigned saturation to 8 bit. Returns store() of the packed
+	// vector as a uint32.
+	__forceinline uint32 rgba32() const
+	{
+		const GSVector4i packed16 = ps32(*this);
+		const GSVector4i packed8 = packed16.pu16(packed16);
+
+		return (uint32)store(packed8);
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_i8(a).min_i8(b);
+	}
+
+	__forceinline GSVector4i sat_i8(const GSVector4i& a) const
+	{
+		return max_i8(a.xyxy()).min_i8(a.zwzw());
+	}
+
+	#endif
+
+	__forceinline GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_i16(a).min_i16(b);
+	}
+
+	__forceinline GSVector4i sat_i16(const GSVector4i& a) const
+	{
+		return max_i16(a.xyxy()).min_i16(a.zwzw());
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_i32(a).min_i32(b);
+	}
+
+	__forceinline GSVector4i sat_i32(const GSVector4i& a) const
+	{
+		return max_i32(a.xyxy()).min_i32(a.zwzw());
+	}
+
+	#else
+
+	// Scalar fallback (_M_SSE < 0x401): clamps every 32-bit lane to [a, b];
+	// the lower bound is applied first, then the upper bound.
+	__forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const
+	{
+		GSVector4i r;
+
+		for(int i = 0; i < 4; i++)
+		{
+			r.v[i] = min(max(v[i], a.v[i]), b.v[i]);
+		}
+
+		return r;
+	}
+
+	// Scalar fallback (_M_SSE < 0x401): clamps x and z to [a.x, a.z] and
+	// y and w to [a.y, a.w], i.e. both corners of this rect into the rect a.
+	__forceinline GSVector4i sat_i32(const GSVector4i& a) const
+	{
+		GSVector4i r;
+
+		for(int i = 0; i < 4; i++)
+		{
+			const int axis = i & 1; // 0 = horizontal (x/z), 1 = vertical (y/w)
+
+			r.v[i] = min(max(v[i], a.v[axis]), a.v[2 + axis]);
+		}
+
+		return r;
+	}
+
+	#endif
+
+	__forceinline GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_u8(a).min_u8(b);
+	}
+
+	__forceinline GSVector4i sat_u8(const GSVector4i& a) const
+	{
+		return max_u8(a.xyxy()).min_u8(a.zwzw());
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_u16(a).min_u16(b);
+	}
+
+	__forceinline GSVector4i sat_u16(const GSVector4i& a) const
+	{
+		return max_u16(a.xyxy()).min_u16(a.zwzw());
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const
+	{
+		return max_u32(a).min_u32(b);
+	}
+
+	__forceinline GSVector4i sat_u32(const GSVector4i& a) const
+	{
+		return max_u32(a.xyxy()).min_u32(a.zwzw());
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i min_i8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epi8(m, a));
+	}
+
+	__forceinline GSVector4i max_i8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epi8(m, a));
+	}
+
+	#endif
+
+	__forceinline GSVector4i min_i16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epi16(m, a));
+	}
+
+	__forceinline GSVector4i max_i16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epi16(m, a));
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i min_i32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epi32(m, a));
+	}
+
+	__forceinline GSVector4i max_i32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epi32(m, a));
+	}
+
+	#endif
+
+	__forceinline GSVector4i min_u8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epu8(m, a));
+	}
+
+	__forceinline GSVector4i max_u8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epu8(m, a));
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i min_u16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epu16(m, a));
+	}
+
+	__forceinline GSVector4i max_u16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epu16(m, a));
+	}
+
+	__forceinline GSVector4i min_u32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_min_epu32(m, a));
+	}
+
+	__forceinline GSVector4i max_u32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_max_epu32(m, a));
+	}
+
+	#endif
+
+	__forceinline static int min_i16(int a, int b)
+	{
+		 return store(load(a).min_i16(load(b)));
+	}
+
+	// Clamps each of the eight signed 16-bit lanes to 0..255: pu16() packs them
+	// to bytes with unsigned saturation, upl8() widens the low eight bytes back
+	// to 16 bit by interleaving with zero.
+	__forceinline GSVector4i clamp8() const
+	{
+		return pu16().upl8();
+	}
+
+	// Selects bytes from a where mask is set and from *this elsewhere.
+	// The two code paths only agree when every mask byte is 0x00 or 0xff: the
+	// SSE4.1 instruction looks at the top bit of each mask byte, the fallback is
+	// a bitwise select.
+	__forceinline GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
+	{
+		#if _M_SSE < 0x401
+
+		const __m128i keep = _mm_andnot_si128(mask, m); // bits of *this outside the mask
+		const __m128i take = _mm_and_si128(mask, a);    // bits of a inside the mask
+
+		return GSVector4i(_mm_or_si128(keep, take));
+
+		#else
+
+		return GSVector4i(_mm_blendv_epi8(m, a, mask));
+
+		#endif
+	}
+
+	#if _M_SSE >= 0x401
+
+	template<int mask> __forceinline GSVector4i blend16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_blend_epi16(m, a, mask));
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x501
+
+	template<int mask> __forceinline GSVector4i blend32(const GSVector4i& v) const
+	{
+		return GSVector4i(_mm_blend_epi32(m, v.m, mask));
+	}
+
+	#endif
+
+	// Bitwise select: (*this & ~mask) | (a & mask).
+	__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
+	{
+		return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
+	}
+
+	// Takes the odd 16-bit lanes (1, 3, 5, 7) from a and the even ones from
+	// *this; 0xaa is the bit pattern 10101010.
+	__forceinline GSVector4i mix16(const GSVector4i& a) const
+	{
+		#if _M_SSE < 0x401
+
+		// presumably xffff0000() has the upper 16 bits of every dword set - confirm
+		return blend8(a, GSVector4i::xffff0000());
+
+		#else
+
+		return blend16<0xaa>(a);
+
+		#endif
+	}
+
+	#if _M_SSE >= 0x301
+
+	__forceinline GSVector4i shuffle8(const GSVector4i& mask) const
+	{
+		return GSVector4i(_mm_shuffle_epi8(m, mask));
+	}
+
+	#endif
+
+	__forceinline GSVector4i ps16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_packs_epi16(m, a));
+	}
+
+	__forceinline GSVector4i ps16() const
+	{
+		return GSVector4i(_mm_packs_epi16(m, m));
+	}
+
+	__forceinline GSVector4i pu16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_packus_epi16(m, a));
+	}
+
+	__forceinline GSVector4i pu16() const
+	{
+		return GSVector4i(_mm_packus_epi16(m, m));
+	}
+
+	__forceinline GSVector4i ps32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_packs_epi32(m, a));
+	}
+
+	__forceinline GSVector4i ps32() const
+	{
+		return GSVector4i(_mm_packs_epi32(m, m));
+	}
+
+	#if _M_SSE >= 0x401
+
+	__forceinline GSVector4i pu32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_packus_epi32(m, a));
+	}
+
+	__forceinline GSVector4i pu32() const
+	{
+		return GSVector4i(_mm_packus_epi32(m, m));
+	}
+
+	#endif
+
+	__forceinline GSVector4i upl8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpacklo_epi8(m, a));
+	}
+
+	__forceinline GSVector4i uph8(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpackhi_epi8(m, a));
+	}
+
+	__forceinline GSVector4i upl16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpacklo_epi16(m, a));
+	}
+
+	__forceinline GSVector4i uph16(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpackhi_epi16(m, a));
+	}
+
+	__forceinline GSVector4i upl32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpacklo_epi32(m, a));
+	}
+
+	__forceinline GSVector4i uph32(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpackhi_epi32(m, a));
+	}
+
+	__forceinline GSVector4i upl64(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpacklo_epi64(m, a));
+	}
+
+	__forceinline GSVector4i uph64(const GSVector4i& a) const
+	{
+		return GSVector4i(_mm_unpackhi_epi64(m, a));
+	}
+
+	// Zero-extends the low eight bytes to 16-bit lanes by interleaving them with zero.
+	__forceinline GSVector4i upl8() const
+	{
+		// The _mm_cvtepu8_epi16 variant is deliberately disabled ("#if 0"), see the TODO.
+		#if 0 // _M_SSE >= 0x401 // TODO: compiler bug
+
+		return GSVector4i(_mm_cvtepu8_epi16(m));
+
+		#else
+
+		return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128()));
+
+		#endif
+	}
+
+	__forceinline GSVector4i uph8() const
+	{
+		return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128()));
+	}
+
+	__forceinline GSVector4i upl16() const
+	{
+		#if 0 //_M_SSE >= 0x401 // TODO: compiler bug
+
+		return GSVector4i(_mm_cvtepu16_epi32(m));
+
+		#else
+
+		return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128()));
+
+		#endif
+	}
+
+	__forceinline GSVector4i uph16() const
+	{
+		return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128()));
+	}
+
+	__forceinline GSVector4i upl32() const
+	{
+		#if 0 //_M_SSE >= 0x401 // TODO: compiler bug
+
+		return GSVector4i(_mm_cvtepu32_epi64(m));
+
+		#else
+
+		return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128()));
+
+		#endif
+	}
+
+	__forceinline GSVector4i uph32() const
+	{
+		return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128()));
+	}
+
+	__forceinline GSVector4i upl64() const
+	{
+		return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128()));
+	}
+
+	__forceinline GSVector4i uph64() const
+	{
+		return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128()));
+	}
+
+	#if _M_SSE >= 0x401
+
+	// WARNING!!!
+	//
+	// MSVC (2008, 2010 ctp) believes that there is a "mem, reg" form of the pmovz/sx* instructions,
+	// turning these intrinsics into a minefield, don't spill regs when using them...
+
+	__forceinline GSVector4i i8to16() const
+	{
+		return GSVector4i(_mm_cvtepi8_epi16(m));
+	}
+
+	__forceinline GSVector4i u8to16() const
+	{
+		return GSVector4i(_mm_cvtepu8_epi16(m));
+	}
+
+	__forceinline GSVector4i i8to32() const
+	{
+		return GSVector4i(_mm_cvtepi8_epi32(m));
+	}
+
+	__forceinline GSVector4i u8to32() const
+	{
+		return GSVector4i(_mm_cvtepu8_epi32(m));
+	}
+
+	__forceinline GSVector4i i8to64() const
+	{
+		return GSVector4i(_mm_cvtepi8_epi64(m));
+	}
+
+	// Zero-extends the two lowest unsigned 8-bit elements to 64-bit lanes.
+	__forceinline GSVector4i u8to64() const
+	{
+		// Must be the epu8 form: _mm_cvtepu16_epi64 widens 16-bit elements and
+		// would make this function a duplicate of u16to64().
+		return GSVector4i(_mm_cvtepu8_epi64(m));
+	}
+
+	__forceinline GSVector4i i16to32() const
+	{
+		return GSVector4i(_mm_cvtepi16_epi32(m));
+	}
+
+	__forceinline GSVector4i u16to32() const
+	{
+		return GSVector4i(_mm_cvtepu16_epi32(m));
+	}
+
+	__forceinline GSVector4i i16to64() const
+	{
+		return GSVector4i(_mm_cvtepi16_epi64(m));
+	}
+
+	__forceinline GSVector4i u16to64() const
+	{
+		return GSVector4i(_mm_cvtepu16_epi64(m));
+	}
+
+	__forceinline GSVector4i i32to64() const
+	{
+		return GSVector4i(_mm_cvtepi32_epi64(m));
+	}
+
+	__forceinline GSVector4i u32to64() const
+	{
+		return GSVector4i(_mm_cvtepu32_epi64(m));
+	}
+
+	#else
+
+	__forceinline GSVector4i u8to16() const
+	{
+		return upl8();
+	}
+
+	__forceinline GSVector4i u8to32() const
+	{
+		return upl8().upl16();
+	}
+
+	__forceinline GSVector4i u8to64() const
+	{
+		return upl8().upl16().upl32();
+	}
+
+	__forceinline GSVector4i u16to32() const
+	{
+		return upl16();
+	}
+
+	__forceinline GSVector4i u16to64() const
+	{
+		return upl16().upl32();
+	}
+
+	__forceinline GSVector4i u32to64() const
+	{
+		return upl32();
+	}
+
+	// Sign-extends the low eight bytes to 16-bit lanes without SSE4.1: each byte
+	// is placed in the high half of a 16-bit lane (low half zero) and then
+	// shifted right arithmetically by 8.
+	__forceinline GSVector4i i8to16() const
+	{
+		return zero().upl8(*this).sra16(8);
+	}
+
+	// Sign-extends the low four 16-bit lanes to 32 bit without SSE4.1: each
+	// value is placed in the high half of a 32-bit lane (low half zero) and then
+	// shifted right arithmetically by 16.
+	__forceinline GSVector4i i16to32() const
+	{
+		return zero().upl16(*this).sra32(16);
+	}
+
+	#endif
+
+	template<int i> __forceinline GSVector4i srl() const
+	{
+		return GSVector4i(_mm_srli_si128(m, i));
+	}
+
+	// Treats (v : *this) as one 256-bit value with v in the upper half, shifts it
+	// right by i bytes and returns the low 128 bits.
+	// The function does not modify *this, so it is const like its siblings and
+	// can be called on const vectors.
+	template<int i> __forceinline GSVector4i srl(const GSVector4i& v) const
+	{
+		#if _M_SSE >= 0x301
+
+		return GSVector4i(_mm_alignr_epi8(v.m, m, i));
+
+		#else
+
+		// emulation of palignr with two byte shifts
+		if(i == 0) return *this;
+		else if(i < 16) return srl<i>() | v.sll<16 - i>();
+		else if(i == 16) return v;
+		else if(i < 32) return v.srl<i - 16>();
+		else return zero();
+
+		#endif
+	}
+
+	template<int i> __forceinline GSVector4i sll() const
+	{
+		return GSVector4i(_mm_slli_si128(m, i));
+	}
+
+	// Arithmetic right shift of every 16-bit lane by i bits.
+	__forceinline GSVector4i sra16(int i) const
+	{
+		const __m128i shifted = _mm_srai_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Arithmetic right shift of every 16-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i sra16(__m128i i) const
+	{
+		const __m128i shifted = _mm_sra_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Arithmetic right shift of every 32-bit lane by i bits.
+	__forceinline GSVector4i sra32(int i) const
+	{
+		const __m128i shifted = _mm_srai_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Arithmetic right shift of every 32-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i sra32(__m128i i) const
+	{
+		const __m128i shifted = _mm_sra_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 16-bit lane by i bits.
+	__forceinline GSVector4i sll16(int i) const
+	{
+		const __m128i shifted = _mm_slli_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 16-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i sll16(__m128i i) const
+	{
+		const __m128i shifted = _mm_sll_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 32-bit lane by i bits.
+	__forceinline GSVector4i sll32(int i) const
+	{
+		const __m128i shifted = _mm_slli_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 32-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i sll32(__m128i i) const
+	{
+		const __m128i shifted = _mm_sll_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 64-bit lane by i bits.
+	__forceinline GSVector4i sll64(int i) const
+	{
+		const __m128i shifted = _mm_slli_epi64(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical left shift of every 64-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i sll64(__m128i i) const
+	{
+		const __m128i shifted = _mm_sll_epi64(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 16-bit lane by i bits.
+	__forceinline GSVector4i srl16(int i) const
+	{
+		const __m128i shifted = _mm_srli_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 16-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i srl16(__m128i i) const
+	{
+		const __m128i shifted = _mm_srl_epi16(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 32-bit lane by i bits.
+	__forceinline GSVector4i srl32(int i) const
+	{
+		const __m128i shifted = _mm_srli_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 32-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i srl32(__m128i i) const
+	{
+		const __m128i shifted = _mm_srl_epi32(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 64-bit lane by i bits.
+	__forceinline GSVector4i srl64(int i) const
+	{
+		const __m128i shifted = _mm_srli_epi64(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical right shift of every 64-bit lane; count taken from the low 64 bits of i.
+	__forceinline GSVector4i srl64(__m128i i) const
+	{
+		const __m128i shifted = _mm_srl_epi64(m, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Lane-wise wrapping addition, 8-bit lanes.
+	__forceinline GSVector4i add8(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_add_epi8(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise wrapping addition, 16-bit lanes.
+	__forceinline GSVector4i add16(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_add_epi16(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise wrapping addition, 32-bit lanes.
+	__forceinline GSVector4i add32(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_add_epi32(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise signed saturating addition, 8-bit lanes.
+	__forceinline GSVector4i adds8(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_adds_epi8(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise signed saturating addition, 16-bit lanes.
+	__forceinline GSVector4i adds16(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_adds_epi16(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise unsigned saturating addition, 8-bit lanes.
+	__forceinline GSVector4i addus8(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_adds_epu8(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise unsigned saturating addition, 16-bit lanes.
+	__forceinline GSVector4i addus16(const GSVector4i& v) const
+	{
+		const __m128i sum = _mm_adds_epu16(m, v.m);
+
+		return GSVector4i(sum);
+	}
+
+	// Lane-wise wrapping subtraction (this - v), 8-bit lanes.
+	__forceinline GSVector4i sub8(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_sub_epi8(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise wrapping subtraction (this - v), 16-bit lanes.
+	__forceinline GSVector4i sub16(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_sub_epi16(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise wrapping subtraction (this - v), 32-bit lanes.
+	__forceinline GSVector4i sub32(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_sub_epi32(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise signed saturating subtraction, 8-bit lanes.
+	__forceinline GSVector4i subs8(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_subs_epi8(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise signed saturating subtraction, 16-bit lanes.
+	__forceinline GSVector4i subs16(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_subs_epi16(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise unsigned saturating subtraction, 8-bit lanes.
+	__forceinline GSVector4i subus8(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_subs_epu8(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Lane-wise unsigned saturating subtraction, 16-bit lanes.
+	__forceinline GSVector4i subus16(const GSVector4i& v) const
+	{
+		const __m128i diff = _mm_subs_epu16(m, v.m);
+
+		return GSVector4i(diff);
+	}
+
+	// Rounded average of unsigned 8-bit lanes.
+	__forceinline GSVector4i avg8(const GSVector4i& v) const
+	{
+		const __m128i mean = _mm_avg_epu8(m, v.m);
+
+		return GSVector4i(mean);
+	}
+
+	// Rounded average of unsigned 16-bit lanes.
+	__forceinline GSVector4i avg16(const GSVector4i& v) const
+	{
+		const __m128i mean = _mm_avg_epu16(m, v.m);
+
+		return GSVector4i(mean);
+	}
+
+	// High 16 bits of the signed 16x16-bit product of each lane.
+	__forceinline GSVector4i mul16hs(const GSVector4i& v) const
+	{
+		const __m128i product = _mm_mulhi_epi16(m, v.m);
+
+		return GSVector4i(product);
+	}
+
+	// High 16 bits of the unsigned 16x16-bit product of each lane.
+	__forceinline GSVector4i mul16hu(const GSVector4i& v) const
+	{
+		const __m128i product = _mm_mulhi_epu16(m, v.m);
+
+		return GSVector4i(product);
+	}
+
+	// Low 16 bits of the 16x16-bit product of each lane.
+	__forceinline GSVector4i mul16l(const GSVector4i& v) const
+	{
+		const __m128i product = _mm_mullo_epi16(m, v.m);
+
+		return GSVector4i(product);
+	}
+
+	#if _M_SSE >= 0x301
+
+	// Signed 16-bit multiply with rounding and scaling (pmulhrsw).
+	__forceinline GSVector4i mul16hrs(const GSVector4i& v) const
+	{
+		const __m128i product = _mm_mulhrs_epi16(m, v.m);
+
+		return GSVector4i(product);
+	}
+
+	#endif
+
+	// Multiplies signed 16-bit lanes pairwise and adds adjacent products into
+	// 32-bit lanes (pmaddwd).
+	//
+	// Marked __forceinline for consistency with every other arithmetic wrapper
+	// in this class.
+	__forceinline GSVector4i madd(const GSVector4i& v) const
+	{
+		return GSVector4i(_mm_madd_epi16(m, v.m));
+	}
+
+	// Returns (a - this) * f << shift + this, computed on 16-bit lanes.
+	template<int shift> __forceinline GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
+	{
+		const GSVector4i delta = a.sub16(*this);
+		const GSVector4i scaled = delta.modulate16<shift>(f);
+
+		return add16(scaled);
+	}
+
+	// Returns (a - b) * c << shift, computed on 16-bit lanes.
+	template<int shift> __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
+	{
+		const GSVector4i delta = a.sub16(b);
+
+		return delta.modulate16<shift>(c);
+	}
+
+	// Returns (a - b) * c << shift + d, computed on 16-bit lanes.
+	template<int shift> __forceinline static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, const GSVector4i& d)
+	{
+		const GSVector4i delta = a.sub16(b);
+		const GSVector4i scaled = delta.modulate16<shift>(c);
+
+		return d.add16(scaled);
+	}
+
+	// Returns (a - this) * f >> 4 + this on 16-bit lanes
+	// (a, this: 8-bit values, f: 4-bit factor).
+	__forceinline GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const
+	{
+		const GSVector4i delta = a.sub16(*this);
+		const GSVector4i scaled = delta.mul16l(f).sra16(4);
+
+		return add16(scaled);
+	}
+
+	// Returns this * f << shift on 16-bit lanes.
+	template<int shift> __forceinline GSVector4i modulate16(const GSVector4i& f) const
+	{
+		#if _M_SSE >= 0x301
+
+		// With no extra shift the SSSE3 rounding multiply is used directly.
+		if(shift == 0)
+		{
+			return mul16hrs(f);
+		}
+
+		#endif
+
+		// Pre-shift so the high half of the signed product lands at the wanted scale.
+		const GSVector4i prescaled = sll16(shift + 1);
+
+		return prescaled.mul16hs(f);
+	}
+
+	// Returns true when all 128 bits of this vector equal those of v.
+	__forceinline bool eq(const GSVector4i& v) const
+	{
+		#if _M_SSE >= 0x401
+
+		// pxor, ptest, je: the XOR is all zero exactly when the inputs match.
+
+		GSVector4i diff = *this ^ v;
+
+		return _mm_testz_si128(diff, diff) != 0;
+
+		#else
+
+		// pcmpeqd, pmovmskb, cmp, je
+
+		const GSVector4i same = eq32(v);
+
+		return same.alltrue();
+
+		#endif
+	}
+
+	// Per-lane equality mask (all ones where equal), 8-bit lanes.
+	__forceinline GSVector4i eq8(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpeq_epi8(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Per-lane equality mask, 16-bit lanes.
+	__forceinline GSVector4i eq16(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpeq_epi16(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Per-lane equality mask, 32-bit lanes.
+	__forceinline GSVector4i eq32(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpeq_epi32(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Per-lane inequality mask, 8-bit lanes (complement of eq8).
+	__forceinline GSVector4i neq8(const GSVector4i& v) const
+	{
+		const GSVector4i same = eq8(v);
+
+		return ~same;
+	}
+
+	// Per-lane inequality mask, 16-bit lanes (complement of eq16).
+	__forceinline GSVector4i neq16(const GSVector4i& v) const
+	{
+		const GSVector4i same = eq16(v);
+
+		return ~same;
+	}
+
+	// Per-lane inequality mask, 32-bit lanes (complement of eq32).
+	__forceinline GSVector4i neq32(const GSVector4i& v) const
+	{
+		const GSVector4i same = eq32(v);
+
+		return ~same;
+	}
+
+	// Signed per-lane "this > v" mask, 8-bit lanes.
+	__forceinline GSVector4i gt8(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpgt_epi8(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane "this > v" mask, 16-bit lanes.
+	__forceinline GSVector4i gt16(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpgt_epi16(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane "this > v" mask, 32-bit lanes.
+	__forceinline GSVector4i gt32(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmpgt_epi32(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane "this < v" mask, 8-bit lanes.
+	__forceinline GSVector4i lt8(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmplt_epi8(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane "this < v" mask, 16-bit lanes.
+	__forceinline GSVector4i lt16(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmplt_epi16(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane "this < v" mask, 32-bit lanes.
+	__forceinline GSVector4i lt32(const GSVector4i& v) const
+	{
+		const __m128i lanes = _mm_cmplt_epi32(m, v.m);
+
+		return GSVector4i(lanes);
+	}
+
+	// Returns this & ~v. Note the intrinsic negates its FIRST operand, so v
+	// is passed first.
+	__forceinline GSVector4i andnot(const GSVector4i& v) const
+	{
+		const __m128i cleared = _mm_andnot_si128(v.m, m);
+
+		return GSVector4i(cleared);
+	}
+
+	// Collects the top bit of each of the 16 bytes into a 16-bit integer mask.
+	__forceinline int mask() const
+	{
+		const int bits = _mm_movemask_epi8(m);
+
+		return bits;
+	}
+
+	// True when the top bit of every byte is set.
+	__forceinline bool alltrue() const
+	{
+		const int bits = mask();
+
+		return bits == 0xffff;
+	}
+
+	// True when the vector is "all false": every bit is zero on the SSE4.1
+	// path, every byte's top bit is zero on the fallback path.
+	__forceinline bool allfalse() const
+	{
+		#if _M_SSE >= 0x401
+
+		const int is_zero = _mm_testz_si128(m, m);
+
+		return is_zero != 0;
+
+		#else
+
+		const int bits = mask();
+
+		return bits == 0;
+
+		#endif
+	}
+
+	#if _M_SSE >= 0x401
+
+	// Returns a copy of this vector with byte lane i replaced by the low 8 bits of a.
+	template<int i> __forceinline GSVector4i insert8(int a) const
+	{
+		const __m128i patched = _mm_insert_epi8(m, a, i);
+
+		return GSVector4i(patched);
+	}
+
+	#endif
+
+	// Returns byte lane i as an int.
+	template<int i> __forceinline int extract8() const
+	{
+		#if _M_SSE >= 0x401
+
+		const int lane = _mm_extract_epi8(m, i);
+
+		return lane;
+
+		#else
+
+		// No pextrb available: read the byte through the u8 view.
+		const int lane = (int)u8[i];
+
+		return lane;
+
+		#endif
+	}
+
+	// Returns a copy of this vector with 16-bit lane i replaced by the low 16 bits of a.
+	template<int i> __forceinline GSVector4i insert16(int a) const
+	{
+		const __m128i patched = _mm_insert_epi16(m, a, i);
+
+		return GSVector4i(patched);
+	}
+
+	// Returns 16-bit lane i as an int.
+	template<int i> __forceinline int extract16() const
+	{
+		const int lane = _mm_extract_epi16(m, i);
+
+		return lane;
+	}
+
+	#if _M_SSE >= 0x401
+
+	// Returns a copy of this vector with 32-bit lane i replaced by a.
+	template<int i> __forceinline GSVector4i insert32(int a) const
+	{
+		const __m128i patched = _mm_insert_epi32(m, a, i);
+
+		return GSVector4i(patched);
+	}
+
+	#endif
+
+	// Returns 32-bit lane i as an int.
+	template<int i> __forceinline int extract32() const
+	{
+		// Lane 0 goes through the scalar move (movd) path.
+		if(i == 0)
+		{
+			return GSVector4i::store(*this);
+		}
+
+		#if _M_SSE >= 0x401
+
+		const int lane = _mm_extract_epi32(m, i);
+
+		return lane;
+
+		#else
+
+		// No pextrd available: read the lane through the i32 view.
+		return i32[i];
+
+		#endif
+	}
+
+	#ifdef _M_AMD64
+
+	#if _M_SSE >= 0x401
+
+	// Returns a copy of this vector with 64-bit lane i replaced by a.
+	template<int i> __forceinline GSVector4i insert64(int64 a) const
+	{
+		const __m128i patched = _mm_insert_epi64(m, a, i);
+
+		return GSVector4i(patched);
+	}
+
+	#endif
+
+	// Returns 64-bit lane i.
+	template<int i> __forceinline int64 extract64() const
+	{
+		// Lane 0 goes through the scalar move (movq) path.
+		if(i == 0)
+		{
+			return GSVector4i::storeq(*this);
+		}
+
+		#if _M_SSE >= 0x401
+
+		const int64 lane = _mm_extract_epi64(m, i);
+
+		return lane;
+
+		#else
+
+		// No pextrq available: read the lane through the i64 view.
+		return i64[i];
+
+		#endif
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x401
+
+	// gatherD_S helpers: the lanes of *this are used as indices into ptr and
+	// the looked-up values are packed into D-bit lanes of the result. S is
+	// the width of each index (4 = nibble, 8/16/32 = lane width).
+
+	// Uses the 8 bytes starting at byte lane src as 16 nibble indices (low
+	// nibble first, then high nibble) and packs the 16 looked-up values into
+	// the 16 byte lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		// load() fills lane 0 with a full int; bytes 1..3 are overwritten below.
+		v = load((int)ptr[extract8<src + 0>() & 0xf]);
+		v = v.insert8<1>((int)ptr[extract8<src + 0>() >> 4]);
+		v = v.insert8<2>((int)ptr[extract8<src + 1>() & 0xf]);
+		v = v.insert8<3>((int)ptr[extract8<src + 1>() >> 4]);
+		v = v.insert8<4>((int)ptr[extract8<src + 2>() & 0xf]);
+		v = v.insert8<5>((int)ptr[extract8<src + 2>() >> 4]);
+		v = v.insert8<6>((int)ptr[extract8<src + 3>() & 0xf]);
+		v = v.insert8<7>((int)ptr[extract8<src + 3>() >> 4]);
+		v = v.insert8<8>((int)ptr[extract8<src + 4>() & 0xf]);
+		v = v.insert8<9>((int)ptr[extract8<src + 4>() >> 4]);
+		v = v.insert8<10>((int)ptr[extract8<src + 5>() & 0xf]);
+		v = v.insert8<11>((int)ptr[extract8<src + 5>() >> 4]);
+		v = v.insert8<12>((int)ptr[extract8<src + 6>() & 0xf]);
+		v = v.insert8<13>((int)ptr[extract8<src + 6>() >> 4]);
+		v = v.insert8<14>((int)ptr[extract8<src + 7>() & 0xf]);
+		v = v.insert8<15>((int)ptr[extract8<src + 7>() >> 4]);
+
+		return v;
+	}
+
+	// Uses all 16 byte lanes as indices and packs the 16 looked-up values
+	// into the byte lanes of the result.
+	template<class T> __forceinline GSVector4i gather8_8(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<0>()]);
+		v = v.insert8<1>((int)ptr[extract8<1>()]);
+		v = v.insert8<2>((int)ptr[extract8<2>()]);
+		v = v.insert8<3>((int)ptr[extract8<3>()]);
+		v = v.insert8<4>((int)ptr[extract8<4>()]);
+		v = v.insert8<5>((int)ptr[extract8<5>()]);
+		v = v.insert8<6>((int)ptr[extract8<6>()]);
+		v = v.insert8<7>((int)ptr[extract8<7>()]);
+		v = v.insert8<8>((int)ptr[extract8<8>()]);
+		v = v.insert8<9>((int)ptr[extract8<9>()]);
+		v = v.insert8<10>((int)ptr[extract8<10>()]);
+		v = v.insert8<11>((int)ptr[extract8<11>()]);
+		v = v.insert8<12>((int)ptr[extract8<12>()]);
+		v = v.insert8<13>((int)ptr[extract8<13>()]);
+		v = v.insert8<14>((int)ptr[extract8<14>()]);
+		v = v.insert8<15>((int)ptr[extract8<15>()]);
+
+		return v;
+	}
+
+	// Uses the eight 16-bit lanes as indices and writes the looked-up values
+	// into byte lanes dst..dst+7 of a copy of a; the other bytes of a are kept.
+	template<int dst, class T> __forceinline GSVector4i gather8_16(const T* ptr, const GSVector4i& a) const
+	{
+		GSVector4i v = a;
+
+		v = v.insert8<dst + 0>((int)ptr[extract16<0>()]);
+		v = v.insert8<dst + 1>((int)ptr[extract16<1>()]);
+		v = v.insert8<dst + 2>((int)ptr[extract16<2>()]);
+		v = v.insert8<dst + 3>((int)ptr[extract16<3>()]);
+		v = v.insert8<dst + 4>((int)ptr[extract16<4>()]);
+		v = v.insert8<dst + 5>((int)ptr[extract16<5>()]);
+		v = v.insert8<dst + 6>((int)ptr[extract16<6>()]);
+		v = v.insert8<dst + 7>((int)ptr[extract16<7>()]);
+
+		return v;
+	}
+
+	// Uses the four 32-bit lanes as indices and writes the looked-up values
+	// into byte lanes dst..dst+3 of a copy of a; the other bytes of a are kept.
+	template<int dst, class T> __forceinline GSVector4i gather8_32(const T* ptr, const GSVector4i& a) const
+	{
+		GSVector4i v = a;
+
+		v = v.insert8<dst + 0>((int)ptr[extract32<0>()]);
+		v = v.insert8<dst + 1>((int)ptr[extract32<1>()]);
+		v = v.insert8<dst + 2>((int)ptr[extract32<2>()]);
+		v = v.insert8<dst + 3>((int)ptr[extract32<3>()]);
+
+		return v;
+	}
+
+	#endif
+
+	// Uses the 4 bytes starting at byte lane src as 8 nibble indices (low
+	// nibble first, then high nibble) and packs the looked-up values into
+	// the eight 16-bit lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather16_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<src + 0>() & 0xf]);
+		v = v.insert16<1>((int)ptr[extract8<src + 0>() >> 4]);
+		v = v.insert16<2>((int)ptr[extract8<src + 1>() & 0xf]);
+		v = v.insert16<3>((int)ptr[extract8<src + 1>() >> 4]);
+		v = v.insert16<4>((int)ptr[extract8<src + 2>() & 0xf]);
+		v = v.insert16<5>((int)ptr[extract8<src + 2>() >> 4]);
+		v = v.insert16<6>((int)ptr[extract8<src + 3>() & 0xf]);
+		v = v.insert16<7>((int)ptr[extract8<src + 3>() >> 4]);
+
+		return v;
+	}
+
+	// Uses the 8 bytes starting at byte lane src as indices and packs the
+	// looked-up values into the eight 16-bit lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather16_8(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<src + 0>()]);
+		v = v.insert16<1>((int)ptr[extract8<src + 1>()]);
+		v = v.insert16<2>((int)ptr[extract8<src + 2>()]);
+		v = v.insert16<3>((int)ptr[extract8<src + 3>()]);
+		v = v.insert16<4>((int)ptr[extract8<src + 4>()]);
+		v = v.insert16<5>((int)ptr[extract8<src + 5>()]);
+		v = v.insert16<6>((int)ptr[extract8<src + 6>()]);
+		v = v.insert16<7>((int)ptr[extract8<src + 7>()]);
+
+		return v;
+	}
+
+	// Uses the eight 16-bit lanes as indices and packs the looked-up values
+	// into the eight 16-bit lanes of the result.
+	template<class T>__forceinline GSVector4i gather16_16(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract16<0>()]);
+		v = v.insert16<1>((int)ptr[extract16<1>()]);
+		v = v.insert16<2>((int)ptr[extract16<2>()]);
+		v = v.insert16<3>((int)ptr[extract16<3>()]);
+		v = v.insert16<4>((int)ptr[extract16<4>()]);
+		v = v.insert16<5>((int)ptr[extract16<5>()]);
+		v = v.insert16<6>((int)ptr[extract16<6>()]);
+		v = v.insert16<7>((int)ptr[extract16<7>()]);
+
+		return v;
+	}
+
+	// Two-level lookup: each 16-bit lane indexes ptr1, whose value in turn
+	// indexes ptr2; the final values fill the eight 16-bit lanes.
+	template<class T1, class T2>__forceinline GSVector4i gather16_16(const T1* ptr1, const T2* ptr2) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr2[ptr1[extract16<0>()]]);
+		v = v.insert16<1>((int)ptr2[ptr1[extract16<1>()]]);
+		v = v.insert16<2>((int)ptr2[ptr1[extract16<2>()]]);
+		v = v.insert16<3>((int)ptr2[ptr1[extract16<3>()]]);
+		v = v.insert16<4>((int)ptr2[ptr1[extract16<4>()]]);
+		v = v.insert16<5>((int)ptr2[ptr1[extract16<5>()]]);
+		v = v.insert16<6>((int)ptr2[ptr1[extract16<6>()]]);
+		v = v.insert16<7>((int)ptr2[ptr1[extract16<7>()]]);
+
+		return v;
+	}
+
+	// Uses the four 32-bit lanes as indices and writes the looked-up values
+	// into 16-bit lanes dst..dst+3 of a copy of a; the other lanes of a are kept.
+	template<int dst, class T> __forceinline GSVector4i gather16_32(const T* ptr, const GSVector4i& a) const
+	{
+		GSVector4i v = a;
+
+		v = v.insert16<dst + 0>((int)ptr[extract32<0>()]);
+		v = v.insert16<dst + 1>((int)ptr[extract32<1>()]);
+		v = v.insert16<dst + 2>((int)ptr[extract32<2>()]);
+		v = v.insert16<dst + 3>((int)ptr[extract32<3>()]);
+
+		return v;
+	}
+
+	#if _M_SSE >= 0x401
+
+	// SSE4.1 variants: the result is assembled lane by lane with insert32.
+
+	// Uses the 2 bytes starting at byte lane src as 4 nibble indices (low
+	// nibble first, then high nibble) and packs the looked-up values into
+	// the four 32-bit lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<src + 0>() & 0xf]);
+		v = v.insert32<1>((int)ptr[extract8<src + 0>() >> 4]);
+		v = v.insert32<2>((int)ptr[extract8<src + 1>() & 0xf]);
+		v = v.insert32<3>((int)ptr[extract8<src + 1>() >> 4]);
+		return v;
+	}
+
+	// Uses the 4 bytes starting at byte lane src as indices and packs the
+	// looked-up values into the four 32-bit lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather32_8(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract8<src + 0>()]);
+		v = v.insert32<1>((int)ptr[extract8<src + 1>()]);
+		v = v.insert32<2>((int)ptr[extract8<src + 2>()]);
+		v = v.insert32<3>((int)ptr[extract8<src + 3>()]);
+
+		return v;
+	}
+
+	// Uses the four 16-bit lanes starting at lane src as indices and packs
+	// the looked-up values into the four 32-bit lanes of the result.
+	template<int src, class T> __forceinline GSVector4i gather32_16(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract16<src + 0>()]);
+		v = v.insert32<1>((int)ptr[extract16<src + 1>()]);
+		v = v.insert32<2>((int)ptr[extract16<src + 2>()]);
+		v = v.insert32<3>((int)ptr[extract16<src + 3>()]);
+
+		return v;
+	}
+
+	// Uses the four 32-bit lanes as indices and packs the looked-up values
+	// into the four 32-bit lanes of the result.
+	template<class T> __forceinline GSVector4i gather32_32(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr[extract32<0>()]);
+		v = v.insert32<1>((int)ptr[extract32<1>()]);
+		v = v.insert32<2>((int)ptr[extract32<2>()]);
+		v = v.insert32<3>((int)ptr[extract32<3>()]);
+
+		return v;
+	}
+
+	// Two-level lookup: each 32-bit lane indexes ptr1, whose value in turn
+	// indexes ptr2; the final values fill the four 32-bit lanes.
+	template<class T1, class T2> __forceinline GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const
+	{
+		GSVector4i v;
+
+		v = load((int)ptr2[ptr1[extract32<0>()]]);
+		v = v.insert32<1>((int)ptr2[ptr1[extract32<1>()]]);
+		v = v.insert32<2>((int)ptr2[ptr1[extract32<2>()]]);
+		v = v.insert32<3>((int)ptr2[ptr1[extract32<3>()]]);
+
+		return v;
+	}
+
+	#else
+
+	// Fallback variants without insert32: the four looked-up values are
+	// passed to the four-int constructor instead.
+	// NOTE(review): that constructor is outside this view; presumably it
+	// takes the lanes in x, y, z, w order - confirm.
+
+	// Uses the 2 bytes starting at byte lane src as 4 nibble indices (low
+	// nibble first, then high nibble).
+	template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
+	{
+		return GSVector4i(
+			(int)ptr[extract8<src + 0>() & 0xf],
+			(int)ptr[extract8<src + 0>() >> 4],
+			(int)ptr[extract8<src + 1>() & 0xf],
+			(int)ptr[extract8<src + 1>() >> 4]);
+	}
+
+	// Uses the 4 bytes starting at byte lane src as indices.
+	template<int src, class T> __forceinline GSVector4i gather32_8(const T* ptr) const
+	{
+		return GSVector4i(
+			(int)ptr[extract8<src + 0>()],
+			(int)ptr[extract8<src + 1>()],
+			(int)ptr[extract8<src + 2>()],
+			(int)ptr[extract8<src + 3>()]);
+	}
+
+	// Uses the four 16-bit lanes starting at lane src as indices.
+	template<int src, class T> __forceinline GSVector4i gather32_16(const T* ptr) const
+	{
+		return GSVector4i(
+			(int)ptr[extract16<src + 0>()],
+			(int)ptr[extract16<src + 1>()],
+			(int)ptr[extract16<src + 2>()],
+			(int)ptr[extract16<src + 3>()]);
+	}
+
+	// Uses the four 32-bit lanes as indices.
+	template<class T> __forceinline GSVector4i gather32_32(const T* ptr) const
+	{
+		return GSVector4i(
+			(int)ptr[extract32<0>()],
+			(int)ptr[extract32<1>()],
+			(int)ptr[extract32<2>()],
+			(int)ptr[extract32<3>()]);
+	}
+
+	// Two-level lookup: each 32-bit lane indexes ptr1, whose value in turn
+	// indexes ptr2.
+	template<class T1, class T2> __forceinline GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const
+	{
+		return GSVector4i(
+			(int)ptr2[ptr1[extract32<0>()]],
+			(int)ptr2[ptr1[extract32<1>()]],
+			(int)ptr2[ptr1[extract32<2>()]],
+			(int)ptr2[ptr1[extract32<3>()]]);
+	}
+
+	#endif
+
+	#if defined(_M_AMD64) && _M_SSE >= 0x401
+
+	// x64 + SSE4.1 variants: each looked-up element is converted to int64,
+	// moved into lane 0 with loadq and into lane 1 with insert64.
+
+	// Uses the low and high nibble of byte lane src as the two indices.
+	template<int src, class T> __forceinline GSVector4i gather64_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = loadq((int64)ptr[extract8<src + 0>() & 0xf]);
+		v = v.insert64<1>((int64)ptr[extract8<src + 0>() >> 4]);
+
+		return v;
+	}
+
+	// Uses byte lanes src and src+1 as the two indices.
+	template<int src, class T> __forceinline GSVector4i gather64_8(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = loadq((int64)ptr[extract8<src + 0>()]);
+		v = v.insert64<1>((int64)ptr[extract8<src + 1>()]);
+
+		return v;
+	}
+
+	// Uses 16-bit lanes src and src+1 as the two indices.
+	template<int src, class T> __forceinline GSVector4i gather64_16(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = loadq((int64)ptr[extract16<src + 0>()]);
+		v = v.insert64<1>((int64)ptr[extract16<src + 1>()]);
+
+		return v;
+	}
+
+	// Uses 32-bit lanes src and src+1 as the two indices.
+	template<int src, class T> __forceinline GSVector4i gather64_32(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = loadq((int64)ptr[extract32<src + 0>()]);
+		v = v.insert64<1>((int64)ptr[extract32<src + 1>()]);
+
+		return v;
+	}
+
+	// Uses the two 64-bit lanes as the two indices.
+	template<class T> __forceinline GSVector4i gather64_64(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = loadq((int64)ptr[extract64<0>()]);
+		v = v.insert64<1>((int64)ptr[extract64<1>()]);
+
+		return v;
+	}
+
+	#else
+
+	// Fallback gather64_4 (no 64-bit insert available): uses the low and high
+	// nibble of byte lane src as two indices into ptr and loads the 64 bits
+	// found at each element's address into the low and high half of the result.
+	//
+	// The two-pointer load(pl, ph) is used, matching the gather64_8/16/32
+	// fallbacks. The previous call to a two-pointer loadu() named a function
+	// this class does not provide, which would fail to compile as soon as the
+	// template was instantiated.
+	template<int src, class T> __forceinline GSVector4i gather64_4(const T* ptr) const
+	{
+		GSVector4i v;
+
+		v = load(&ptr[extract8<src + 0>() & 0xf], &ptr[extract8<src + 0>() >> 4]);
+
+		return v;
+	}
+
+	// Fallback: byte lanes src and src+1 select two elements of ptr; the 64
+	// bits at each element's address fill the low and high half of the result.
+	template<int src, class T> __forceinline GSVector4i gather64_8(const T* ptr) const
+	{
+		const T* lo = &ptr[extract8<src + 0>()];
+		const T* hi = &ptr[extract8<src + 1>()];
+
+		return load(lo, hi);
+	}
+
+	// Fallback: 16-bit lanes src and src+1 select the two elements.
+	template<int src, class T> __forceinline GSVector4i gather64_16(const T* ptr) const
+	{
+		const T* lo = &ptr[extract16<src + 0>()];
+		const T* hi = &ptr[extract16<src + 1>()];
+
+		return load(lo, hi);
+	}
+
+	// Fallback: 32-bit lanes src and src+1 select the two elements.
+	template<int src, class T> __forceinline GSVector4i gather64_32(const T* ptr) const
+	{
+		const T* lo = &ptr[extract32<src + 0>()];
+		const T* hi = &ptr[extract32<src + 1>()];
+
+		return load(lo, hi);
+	}
+
+	#endif
+
+	#if _M_SSE >= 0x401
+
+	// Array-filling wrappers: expand every index held in *this and store the
+	// resulting vectors to dst[0..].
+
+	// 32 nibble indices -> 32 bytes, written to dst[0..1].
+	template<class T> __forceinline void gather8_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather8_4<0>(ptr);
+		dst[1] = gather8_4<8>(ptr);
+	}
+
+	// 16 byte indices -> 16 bytes, written to dst[0].
+	__forceinline void gather8_8(const uint8* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather8_8<>(ptr);
+	}
+
+	#endif
+
+	// 32 nibble indices -> 32 16-bit values, written to dst[0..3]
+	// (each call consumes 4 source bytes).
+	template<class T> __forceinline void gather16_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather16_4<0>(ptr);
+		dst[1] = gather16_4<4>(ptr);
+		dst[2] = gather16_4<8>(ptr);
+		dst[3] = gather16_4<12>(ptr);
+	}
+
+	// 16 byte indices -> 16 16-bit values, written to dst[0..1].
+	template<class T> __forceinline void gather16_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather16_8<0>(ptr);
+		dst[1] = gather16_8<8>(ptr);
+	}
+
+	// 8 16-bit indices -> 8 16-bit values, written to dst[0].
+	template<class T> __forceinline void gather16_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather16_16<>(ptr);
+	}
+
+	// 32 nibble indices -> 32 32-bit values, written to dst[0..7]
+	// (each call consumes 2 source bytes).
+	template<class T> __forceinline void gather32_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather32_4<0>(ptr);
+		dst[1] = gather32_4<2>(ptr);
+		dst[2] = gather32_4<4>(ptr);
+		dst[3] = gather32_4<6>(ptr);
+		dst[4] = gather32_4<8>(ptr);
+		dst[5] = gather32_4<10>(ptr);
+		dst[6] = gather32_4<12>(ptr);
+		dst[7] = gather32_4<14>(ptr);
+	}
+
+	// 16 byte indices -> 16 32-bit values, written to dst[0..3].
+	template<class T> __forceinline void gather32_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather32_8<0>(ptr);
+		dst[1] = gather32_8<4>(ptr);
+		dst[2] = gather32_8<8>(ptr);
+		dst[3] = gather32_8<12>(ptr);
+	}
+
+	// 8 16-bit indices -> 8 32-bit values, written to dst[0..1].
+	template<class T> __forceinline void gather32_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather32_16<0>(ptr);
+		dst[1] = gather32_16<4>(ptr);
+	}
+
+	// 4 32-bit indices -> 4 32-bit values, written to dst[0].
+	template<class T> __forceinline void gather32_32(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather32_32<>(ptr);
+	}
+
+	// 32 nibble indices -> 32 64-bit values, written to dst[0..15]
+	// (each call consumes one source byte = two nibbles).
+	template<class T> __forceinline void gather64_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_4<0>(ptr);
+		dst[1] = gather64_4<1>(ptr);
+		dst[2] = gather64_4<2>(ptr);
+		dst[3] = gather64_4<3>(ptr);
+		dst[4] = gather64_4<4>(ptr);
+		dst[5] = gather64_4<5>(ptr);
+		dst[6] = gather64_4<6>(ptr);
+		dst[7] = gather64_4<7>(ptr);
+		dst[8] = gather64_4<8>(ptr);
+		dst[9] = gather64_4<9>(ptr);
+		dst[10] = gather64_4<10>(ptr);
+		dst[11] = gather64_4<11>(ptr);
+		dst[12] = gather64_4<12>(ptr);
+		dst[13] = gather64_4<13>(ptr);
+		dst[14] = gather64_4<14>(ptr);
+		dst[15] = gather64_4<15>(ptr);
+	}
+
+	// 16 byte indices -> 16 64-bit values, written to dst[0..7]
+	// (each call consumes two source bytes).
+	template<class T> __forceinline void gather64_8(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_8<0>(ptr);
+		dst[1] = gather64_8<2>(ptr);
+		dst[2] = gather64_8<4>(ptr);
+		dst[3] = gather64_8<6>(ptr);
+		dst[4] = gather64_8<8>(ptr);
+		dst[5] = gather64_8<10>(ptr);
+		dst[6] = gather64_8<12>(ptr);
+		dst[7] = gather64_8<14>(ptr);
+	}
+
+	// 8 16-bit indices -> 8 64-bit values, written to dst[0..3].
+	//
+	// Each gather64_16<src> call consumes 16-bit lanes src and src+1, so the
+	// four calls must start at lanes 0, 2, 4 and 6. (The last call previously
+	// used <8>, which addresses lanes 8 and 9 of an 8-lane vector and never
+	// gathered lanes 6 and 7.)
+	template<class T> __forceinline void gather64_16(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_16<0>(ptr);
+		dst[1] = gather64_16<2>(ptr);
+		dst[2] = gather64_16<4>(ptr);
+		dst[3] = gather64_16<6>(ptr);
+	}
+
+	// 4 32-bit indices -> 4 64-bit values, written to dst[0..1]
+	// (each call consumes two 32-bit lanes).
+	template<class T> __forceinline void gather64_32(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_32<0>(ptr);
+		dst[1] = gather64_32<2>(ptr);
+	}
+
+	#ifdef _M_AMD64
+
+	// 2 64-bit indices -> 2 64-bit values, written to dst[0].
+	// NOTE(review): the single-argument gather64_64 this forwards to is only
+	// defined when both _M_AMD64 and _M_SSE >= 0x401 hold, while this wrapper
+	// is guarded by _M_AMD64 alone - confirm it is never instantiated on x64
+	// builds without SSE4.1.
+	template<class T> __forceinline void gather64_64(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
+	{
+		dst[0] = gather64_64<>(ptr);
+	}
+
+	#endif
+
+	// Loads 16 bytes from p, which must be 16-byte aligned. With SSE4.1 the
+	// non-temporal (streaming) load is used, otherwise a regular aligned load.
+	__forceinline static GSVector4i loadnt(const void* p)
+	{
+		__m128i* src = (__m128i*)p;
+
+		#if _M_SSE >= 0x401
+
+		return GSVector4i(_mm_stream_load_si128(src));
+
+		#else
+
+		return GSVector4i(_mm_load_si128(src));
+
+		#endif
+	}
+
+	// Loads 8 bytes from p into the low half; the high half is zeroed.
+	__forceinline static GSVector4i loadl(const void* p)
+	{
+		const __m128i low = _mm_loadl_epi64((__m128i*)p);
+
+		return GSVector4i(low);
+	}
+
+	// Loads 8 bytes from p into the high half; the low half is zero.
+	__forceinline static GSVector4i loadh(const void* p)
+	{
+		const __m128 merged = _mm_loadh_pi(_mm_setzero_ps(), (__m64*)p);
+
+		return GSVector4i(_mm_castps_si128(merged));
+	}
+
+	// Loads 8 bytes from p into the high half; the low half is taken from v.
+	__forceinline static GSVector4i loadh(const void* p, const GSVector4i& v)
+	{
+		const __m128 base = _mm_castsi128_ps(v.m);
+		const __m128 merged = _mm_loadh_pi(base, (__m64*)p);
+
+		return GSVector4i(_mm_castps_si128(merged));
+	}
+
+	// Loads 8 bytes from pl into the low half and 8 bytes from ph into the high half.
+	__forceinline static GSVector4i load(const void* pl, const void* ph)
+	{
+		const GSVector4i low = loadl(pl);
+
+		return loadh(ph, low);
+	}
+/*
+	__forceinline static GSVector4i load(const void* pl, const void* ph)
+	{
+		__m128i lo = _mm_loadl_epi64((__m128i*)pl);
+		__m128i hi = _mm_loadl_epi64((__m128i*)ph);
+
+		return GSVector4i(_mm_unpacklo_epi64(lo, hi));
+	}
+*/
+	// Loads 16 bytes from p, using the aligned or the unaligned load
+	// depending on the template argument.
+	template<bool aligned> __forceinline static GSVector4i load(const void* p)
+	{
+		__m128i* src = (__m128i*)p;
+
+		if(aligned)
+		{
+			return GSVector4i(_mm_load_si128(src));
+		}
+
+		return GSVector4i(_mm_loadu_si128(src));
+	}
+
+	// Places i in the lowest 32-bit lane and zeroes the rest (movd).
+	__forceinline static GSVector4i load(int i)
+	{
+		const __m128i scalar = _mm_cvtsi32_si128(i);
+
+		return GSVector4i(scalar);
+	}
+
+	#ifdef _M_AMD64
+
+	// Places i in the low 64-bit lane and zeroes the high lane (movq).
+	__forceinline static GSVector4i loadq(int64 i)
+	{
+		const __m128i scalar = _mm_cvtsi64_si128(i);
+
+		return GSVector4i(scalar);
+	}
+
+	#endif
+
+	// Non-temporal (streaming) store of 16 bytes to p; p must be 16-byte aligned.
+	__forceinline static void storent(void* p, const GSVector4i& v)
+	{
+		__m128i* out = (__m128i*)p;
+
+		_mm_stream_si128(out, v.m);
+	}
+
+	// Stores the low 8 bytes of v to p.
+	__forceinline static void storel(void* p, const GSVector4i& v)
+	{
+		__m128i* out = (__m128i*)p;
+
+		_mm_storel_epi64(out, v.m);
+	}
+
+	// Stores the high 8 bytes of v to p.
+	__forceinline static void storeh(void* p, const GSVector4i& v)
+	{
+		__m64* out = (__m64*)p;
+
+		_mm_storeh_pi(out, _mm_castsi128_ps(v.m));
+	}
+
+	// Stores the low 8 bytes of v to pl and the high 8 bytes to ph.
+	__forceinline static void store(void* pl, void* ph, const GSVector4i& v)
+	{
+		storel(pl, v);
+		storeh(ph, v);
+	}
+
+	// Stores 16 bytes to p, using the aligned or the unaligned store
+	// depending on the template argument.
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4i& v)
+	{
+		__m128i* out = (__m128i*)p;
+
+		if(aligned)
+		{
+			_mm_store_si128(out, v.m);
+		}
+		else
+		{
+			_mm_storeu_si128(out, v.m);
+		}
+	}
+
+	// Returns the lowest 32-bit lane of v as an int (movd).
+	__forceinline static int store(const GSVector4i& v)
+	{
+		const int low = _mm_cvtsi128_si32(v.m);
+
+		return low;
+	}
+
+	#ifdef _M_AMD64
+
+	// Returns the low 64-bit lane of v (movq).
+	__forceinline static int64 storeq(const GSVector4i& v)
+	{
+		const int64 low = _mm_cvtsi128_si64(v.m);
+
+		return low;
+	}
+
+	#endif
+
+	// Copies size bytes from src to dst. Whole 64-byte blocks are written with
+	// four streaming stores each; the remaining (size % 64) bytes are copied
+	// with memcpy. The vector loads/stores require 16-byte aligned pointers.
+	__forceinline static void storent(void* RESTRICT dst, const void* RESTRICT src, size_t size)
+	{
+		const GSVector4i* from = (const GSVector4i*)src;
+		GSVector4i* to = (GSVector4i*)dst;
+
+		if(size == 0) return;
+
+		// One iteration per 64-byte block (four vectors).
+		for(size_t blocks = size >> 6; blocks != 0; blocks--, from += 4, to += 4)
+		{
+			storent(&to[0], from[0]);
+			storent(&to[1], from[1]);
+			storent(&to[2], from[2]);
+			storent(&to[3], from[3]);
+		}
+
+		const size_t tail = size & 63;
+
+		if(tail != 0)
+		{
+			memcpy(to, from, tail);
+		}
+	}
+
+	// Transposes, in place, the 4x4 matrix of 32-bit lanes whose rows are a, b, c and d.
+	__forceinline static void transpose(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		// The macro rewrites all four registers through their raw __m128i members.
+		_MM_TRANSPOSE4_SI128(a.m, b.m, c.m, d.m);
+	}
+
+	// In-place nibble swizzle of four vectors: the low/high nibbles of a/b
+	// and of c/d are recombined through a 0x0f byte mask, then the results
+	// are byte-interleaved.
+	__forceinline static void sw4(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		GSVector4i nibble_mask(_mm_set1_epi32(0x0f0f0f0f));
+
+		GSVector4i ab_lo = (b << 4).blend(a, nibble_mask);
+		GSVector4i ab_hi = b.blend(a >> 4, nibble_mask);
+		GSVector4i cd_lo = (d << 4).blend(c, nibble_mask);
+		GSVector4i cd_hi = d.blend(c >> 4, nibble_mask);
+
+		a = ab_lo.upl8(ab_hi);
+		c = ab_lo.uph8(ab_hi);
+		b = cd_lo.upl8(cd_hi);
+		d = cd_lo.uph8(cd_hi);
+	}
+
+	// In-place swizzles of four vectors. Each saves a and c, then rewrites
+	// a/c from the (a, b) pair and b/d from the (c, d) pair using the low
+	// (upl) and high (uph) interleaves of the given lane width. The order of
+	// the assignments matters because b and d are read before being replaced.
+
+	// 8-bit interleave.
+	__forceinline static void sw8(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = a0.upl8(b);
+		c = a0.uph8(b);
+		b = c0.upl8(d);
+		d = c0.uph8(d);
+	}
+
+	// 16-bit interleave.
+	__forceinline static void sw16(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = a0.upl16(b);
+		c = a0.uph16(b);
+		b = c0.upl16(d);
+		d = c0.uph16(d);
+	}
+
+	// 16-bit interleave with the operands of the LOW interleaves reversed.
+	__forceinline static void sw16rl(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = b.upl16(a0);
+		c = a0.uph16(b);
+		b = d.upl16(c0);
+		d = c0.uph16(d);
+	}
+
+	// 16-bit interleave with the operands of the HIGH interleaves reversed.
+	__forceinline static void sw16rh(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = a0.upl16(b);
+		c = b.uph16(a0);
+		b = c0.upl16(d);
+		d = d.uph16(c0);
+	}
+
+	// 32-bit interleave.
+	__forceinline static void sw32(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = a0.upl32(b);
+		c = a0.uph32(b);
+		b = c0.upl32(d);
+		d = c0.uph32(d);
+	}
+
+	// 64-bit interleave.
+	__forceinline static void sw64(GSVector4i& a, GSVector4i& b, GSVector4i& c, GSVector4i& d)
+	{
+		const GSVector4i a0 = a;
+		const GSVector4i c0 = c;
+
+		a = a0.upl64(b);
+		c = a0.uph64(b);
+		b = c0.upl64(d);
+		d = c0.uph64(d);
+	}
+
+	// Compares size bytes of dst and src, 16 bytes at a time, and returns
+	// true when they are identical. size must be a multiple of 16 (asserted);
+	// stops at the first differing vector.
+	__forceinline static bool compare16(const void* dst, const void* src, size_t size)
+	{
+		ASSERT((size & 15) == 0);
+
+		const size_t count = size >> 4;
+
+		GSVector4i* lhs = (GSVector4i*)dst;
+		GSVector4i* rhs = (GSVector4i*)src;
+
+		size_t n = 0;
+
+		while(n < count)
+		{
+			if(!lhs[n].eq(rhs[n])) return false;
+
+			n++;
+		}
+
+		return true;
+	}
+
+	// Compares size bytes of dst and src, 64 bytes (four vectors) at a time,
+	// and returns true when they are identical. size must be a multiple of
+	// 64 (asserted); stops at the first differing block.
+	__forceinline static bool compare64(const void* dst, const void* src, size_t size)
+	{
+		ASSERT((size & 63) == 0);
+
+		// From here on size counts 64-byte blocks.
+		size >>= 6;
+
+		GSVector4i* s = (GSVector4i*)src;
+		GSVector4i* d = (GSVector4i*)dst;
+
+		// i is a block index and i * 4 the index of the block's first vector,
+		// so it must advance by one per iteration. Stepping by 4 while also
+		// multiplying by 4 compared only every fourth block and could report
+		// differing buffers as equal.
+		for(size_t i = 0; i < size; i++)
+		{
+			GSVector4i v0 = (d[i * 4 + 0] == s[i * 4 + 0]);
+			GSVector4i v1 = (d[i * 4 + 1] == s[i * 4 + 1]);
+			GSVector4i v2 = (d[i * 4 + 2] == s[i * 4 + 2]);
+			GSVector4i v3 = (d[i * 4 + 3] == s[i * 4 + 3]);
+
+			// Combine the four per-lane equality masks before testing once.
+			v0 = v0 & v1;
+			v2 = v2 & v3;
+
+			if(!(v0 & v2).alltrue())
+			{
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	// Copies size bytes from src to dst, 16 bytes at a time, and returns
+	// true when dst already held the same contents. size must be a multiple
+	// of 16 (asserted).
+	// NOTE: dst is declared const void* but is written through a cast.
+	__forceinline static bool update(const void* dst, const void* src, size_t size)
+	{
+		ASSERT((size & 15) == 0);
+
+		const size_t count = size >> 4;
+
+		GSVector4i* from = (GSVector4i*)src;
+		GSVector4i* to = (GSVector4i*)dst;
+
+		// Accumulates the per-lane equality masks; starts as all ones.
+		GSVector4i unchanged = GSVector4i::xffffffff();
+
+		for(size_t n = 0; n != count; ++n)
+		{
+			unchanged &= to[n] == from[n];
+
+			to[n] = from[n];
+		}
+
+		return unchanged.alltrue();
+	}
+
+	// Compound assignment operators. Arithmetic and shifts act on 32-bit
+	// lanes; the bitwise ones act on all 128 bits.
+
+	// 32-bit lane-wise addition.
+	__forceinline void operator += (const GSVector4i& v)
+	{
+		const __m128i sum = _mm_add_epi32(m, v);
+
+		m = sum;
+	}
+
+	// 32-bit lane-wise subtraction.
+	__forceinline void operator -= (const GSVector4i& v)
+	{
+		const __m128i diff = _mm_sub_epi32(m, v);
+
+		m = diff;
+	}
+
+	// Adds GSVector4i(i) to this vector.
+	__forceinline void operator += (int i)
+	{
+		const GSVector4i rhs(i);
+
+		*this += rhs;
+	}
+
+	// Subtracts GSVector4i(i) from this vector.
+	__forceinline void operator -= (int i)
+	{
+		const GSVector4i rhs(i);
+
+		*this -= rhs;
+	}
+
+	// Logical left shift of every 32-bit lane.
+	__forceinline void operator <<= (const int i)
+	{
+		const __m128i shifted = _mm_slli_epi32(m, i);
+
+		m = shifted;
+	}
+
+	// Logical (zero-filling) right shift of every 32-bit lane.
+	__forceinline void operator >>= (const int i)
+	{
+		const __m128i shifted = _mm_srli_epi32(m, i);
+
+		m = shifted;
+	}
+
+	// Bitwise AND.
+	__forceinline void operator &= (const GSVector4i& v)
+	{
+		const __m128i bits = _mm_and_si128(m, v);
+
+		m = bits;
+	}
+
+	// Bitwise OR.
+	__forceinline void operator |= (const GSVector4i& v)
+	{
+		const __m128i bits = _mm_or_si128(m, v);
+
+		m = bits;
+	}
+
+	// Bitwise XOR.
+	__forceinline void operator ^= (const GSVector4i& v)
+	{
+		const __m128i bits = _mm_xor_si128(m, v);
+
+		m = bits;
+	}
+
+	// Binary operators. Arithmetic, shifts and comparisons act on 32-bit
+	// lanes; comparisons return per-lane masks (all ones where true).
+
+	// 32-bit lane-wise addition.
+	__forceinline friend GSVector4i operator + (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i sum = _mm_add_epi32(v1, v2);
+
+		return GSVector4i(sum);
+	}
+
+	// 32-bit lane-wise subtraction.
+	__forceinline friend GSVector4i operator - (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i diff = _mm_sub_epi32(v1, v2);
+
+		return GSVector4i(diff);
+	}
+
+	// v + GSVector4i(i).
+	__forceinline friend GSVector4i operator + (const GSVector4i& v, int i)
+	{
+		const GSVector4i rhs(i);
+
+		return v + rhs;
+	}
+
+	// v - GSVector4i(i).
+	__forceinline friend GSVector4i operator - (const GSVector4i& v, int i)
+	{
+		const GSVector4i rhs(i);
+
+		return v - rhs;
+	}
+
+	// Logical left shift of every 32-bit lane.
+	__forceinline friend GSVector4i operator << (const GSVector4i& v, const int i)
+	{
+		const __m128i shifted = _mm_slli_epi32(v, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Logical (zero-filling) right shift of every 32-bit lane.
+	__forceinline friend GSVector4i operator >> (const GSVector4i& v, const int i)
+	{
+		const __m128i shifted = _mm_srli_epi32(v, i);
+
+		return GSVector4i(shifted);
+	}
+
+	// Bitwise AND.
+	__forceinline friend GSVector4i operator & (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i bits = _mm_and_si128(v1, v2);
+
+		return GSVector4i(bits);
+	}
+
+	// Bitwise OR.
+	__forceinline friend GSVector4i operator | (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i bits = _mm_or_si128(v1, v2);
+
+		return GSVector4i(bits);
+	}
+
+	// Bitwise XOR.
+	__forceinline friend GSVector4i operator ^ (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i bits = _mm_xor_si128(v1, v2);
+
+		return GSVector4i(bits);
+	}
+
+	// v & GSVector4i(i).
+	__forceinline friend GSVector4i operator & (const GSVector4i& v, int i)
+	{
+		const GSVector4i rhs(i);
+
+		return v & rhs;
+	}
+
+	// v | GSVector4i(i).
+	__forceinline friend GSVector4i operator | (const GSVector4i& v, int i)
+	{
+		const GSVector4i rhs(i);
+
+		return v | rhs;
+	}
+
+	// v ^ GSVector4i(i).
+	__forceinline friend GSVector4i operator ^ (const GSVector4i& v, int i)
+	{
+		const GSVector4i rhs(i);
+
+		return v ^ rhs;
+	}
+
+	// Bitwise NOT: XOR with an all-ones vector obtained from (v == v).
+	__forceinline friend GSVector4i operator ~ (const GSVector4i& v)
+	{
+		const GSVector4i all_ones = (v == v);
+
+		return v ^ all_ones;
+	}
+
+	// Per-lane equality mask.
+	__forceinline friend GSVector4i operator == (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i lanes = _mm_cmpeq_epi32(v1, v2);
+
+		return GSVector4i(lanes);
+	}
+
+	// Per-lane inequality mask.
+	__forceinline friend GSVector4i operator != (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const GSVector4i same = (v1 == v2);
+
+		return ~same;
+	}
+
+	// Signed per-lane greater-than mask.
+	__forceinline friend GSVector4i operator > (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i lanes = _mm_cmpgt_epi32(v1, v2);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane less-than mask.
+	__forceinline friend GSVector4i operator < (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const __m128i lanes = _mm_cmplt_epi32(v1, v2);
+
+		return GSVector4i(lanes);
+	}
+
+	// Signed per-lane greater-or-equal mask.
+	__forceinline friend GSVector4i operator >= (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const GSVector4i greater = (v1 > v2);
+		const GSVector4i same = (v1 == v2);
+
+		return greater | same;
+	}
+
+	// Signed per-lane less-or-equal mask.
+	__forceinline friend GSVector4i operator <= (const GSVector4i& v1, const GSVector4i& v2)
+	{
+		const GSVector4i less = (v1 < v2);
+		const GSVector4i same = (v1 == v2);
+
+		return less | same;
+	}
+
+	// Swizzle generators. VECTOR4i_SHUFFLE_4 takes four (name, index) pairs
+	// and defines four members named after the concatenated lane letters,
+	// e.g. xyzw(), xyzwl(), xyzwh() and xyzwlh():
+	//   <name>()   - shuffles the four 32-bit lanes
+	//   <name>l()  - shuffles the four 16-bit lanes of the low half
+	//   <name>h()  - shuffles the four 16-bit lanes of the high half
+	//   <name>lh() - applies the same 16-bit shuffle to both halves
+	#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	// The _3/_2/_1 macros each fix the leading selectors and expand the next
+	// one over x, y, z, w, so one VECTOR4i_SHUFFLE_1 yields 64 combinations.
+	#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	#define VECTOR4i_SHUFFLE_1(xs, xn) \
+		VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
+
+	// Instantiate all 4^4 = 256 swizzles (xxxx() .. wwww()), four variants each.
+	VECTOR4i_SHUFFLE_1(x, 0)
+	VECTOR4i_SHUFFLE_1(y, 1)
+	VECTOR4i_SHUFFLE_1(z, 2)
+	VECTOR4i_SHUFFLE_1(w, 3)
+
+	// Constant generators. Each one builds its value in registers instead of loading it
+	// from memory: zero() clears a register, xffffffff() compares it with itself to get
+	// all ones, and every other mask is a shift of that all-ones vector. The function
+	// name spells the value each element is meant to hold.
+
+	__forceinline static GSVector4i zero() {return GSVector4i(_mm_setzero_si128());}
+
+	__forceinline static GSVector4i xffffffff() {return zero() == zero();}
+
+	// All ones shifted right with srl32: low-bit masks.
+	__forceinline static GSVector4i x00000001() {return xffffffff().srl32(31);}
+	__forceinline static GSVector4i x00000003() {return xffffffff().srl32(30);}
+	__forceinline static GSVector4i x00000007() {return xffffffff().srl32(29);}
+	__forceinline static GSVector4i x0000000f() {return xffffffff().srl32(28);}
+	__forceinline static GSVector4i x0000001f() {return xffffffff().srl32(27);}
+	__forceinline static GSVector4i x0000003f() {return xffffffff().srl32(26);}
+	__forceinline static GSVector4i x0000007f() {return xffffffff().srl32(25);}
+	__forceinline static GSVector4i x000000ff() {return xffffffff().srl32(24);}
+	__forceinline static GSVector4i x000001ff() {return xffffffff().srl32(23);}
+	__forceinline static GSVector4i x000003ff() {return xffffffff().srl32(22);}
+	__forceinline static GSVector4i x000007ff() {return xffffffff().srl32(21);}
+	__forceinline static GSVector4i x00000fff() {return xffffffff().srl32(20);}
+	__forceinline static GSVector4i x00001fff() {return xffffffff().srl32(19);}
+	__forceinline static GSVector4i x00003fff() {return xffffffff().srl32(18);}
+	__forceinline static GSVector4i x00007fff() {return xffffffff().srl32(17);}
+	__forceinline static GSVector4i x0000ffff() {return xffffffff().srl32(16);}
+	__forceinline static GSVector4i x0001ffff() {return xffffffff().srl32(15);}
+	__forceinline static GSVector4i x0003ffff() {return xffffffff().srl32(14);}
+	__forceinline static GSVector4i x0007ffff() {return xffffffff().srl32(13);}
+	__forceinline static GSVector4i x000fffff() {return xffffffff().srl32(12);}
+	__forceinline static GSVector4i x001fffff() {return xffffffff().srl32(11);}
+	__forceinline static GSVector4i x003fffff() {return xffffffff().srl32(10);}
+	__forceinline static GSVector4i x007fffff() {return xffffffff().srl32( 9);}
+	__forceinline static GSVector4i x00ffffff() {return xffffffff().srl32( 8);}
+	__forceinline static GSVector4i x01ffffff() {return xffffffff().srl32( 7);}
+	__forceinline static GSVector4i x03ffffff() {return xffffffff().srl32( 6);}
+	__forceinline static GSVector4i x07ffffff() {return xffffffff().srl32( 5);}
+	__forceinline static GSVector4i x0fffffff() {return xffffffff().srl32( 4);}
+	__forceinline static GSVector4i x1fffffff() {return xffffffff().srl32( 3);}
+	__forceinline static GSVector4i x3fffffff() {return xffffffff().srl32( 2);}
+	__forceinline static GSVector4i x7fffffff() {return xffffffff().srl32( 1);}
+
+	// All ones shifted left with sll32: high-bit masks.
+	__forceinline static GSVector4i x80000000() {return xffffffff().sll32(31);}
+	__forceinline static GSVector4i xc0000000() {return xffffffff().sll32(30);}
+	__forceinline static GSVector4i xe0000000() {return xffffffff().sll32(29);}
+	__forceinline static GSVector4i xf0000000() {return xffffffff().sll32(28);}
+	__forceinline static GSVector4i xf8000000() {return xffffffff().sll32(27);}
+	__forceinline static GSVector4i xfc000000() {return xffffffff().sll32(26);}
+	__forceinline static GSVector4i xfe000000() {return xffffffff().sll32(25);}
+	__forceinline static GSVector4i xff000000() {return xffffffff().sll32(24);}
+	__forceinline static GSVector4i xff800000() {return xffffffff().sll32(23);}
+	__forceinline static GSVector4i xffc00000() {return xffffffff().sll32(22);}
+	__forceinline static GSVector4i xffe00000() {return xffffffff().sll32(21);}
+	__forceinline static GSVector4i xfff00000() {return xffffffff().sll32(20);}
+	__forceinline static GSVector4i xfff80000() {return xffffffff().sll32(19);}
+	__forceinline static GSVector4i xfffc0000() {return xffffffff().sll32(18);}
+	__forceinline static GSVector4i xfffe0000() {return xffffffff().sll32(17);}
+	__forceinline static GSVector4i xffff0000() {return xffffffff().sll32(16);}
+	__forceinline static GSVector4i xffff8000() {return xffffffff().sll32(15);}
+	__forceinline static GSVector4i xffffc000() {return xffffffff().sll32(14);}
+	__forceinline static GSVector4i xffffe000() {return xffffffff().sll32(13);}
+	__forceinline static GSVector4i xfffff000() {return xffffffff().sll32(12);}
+	__forceinline static GSVector4i xfffff800() {return xffffffff().sll32(11);}
+	__forceinline static GSVector4i xfffffc00() {return xffffffff().sll32(10);}
+	__forceinline static GSVector4i xfffffe00() {return xffffffff().sll32( 9);}
+	__forceinline static GSVector4i xffffff00() {return xffffffff().sll32( 8);}
+	__forceinline static GSVector4i xffffff80() {return xffffffff().sll32( 7);}
+	__forceinline static GSVector4i xffffffc0() {return xffffffff().sll32( 6);}
+	__forceinline static GSVector4i xffffffe0() {return xffffffff().sll32( 5);}
+	__forceinline static GSVector4i xfffffff0() {return xffffffff().sll32( 4);}
+	__forceinline static GSVector4i xfffffff8() {return xffffffff().sll32( 3);}
+	__forceinline static GSVector4i xfffffffc() {return xffffffff().sll32( 2);}
+	__forceinline static GSVector4i xfffffffe() {return xffffffff().sll32( 1);}
+
+	// Same low-bit masks built with srl16 (four-hex-digit names).
+	__forceinline static GSVector4i x0001() {return xffffffff().srl16(15);}
+	__forceinline static GSVector4i x0003() {return xffffffff().srl16(14);}
+	__forceinline static GSVector4i x0007() {return xffffffff().srl16(13);}
+	__forceinline static GSVector4i x000f() {return xffffffff().srl16(12);}
+	__forceinline static GSVector4i x001f() {return xffffffff().srl16(11);}
+	__forceinline static GSVector4i x003f() {return xffffffff().srl16(10);}
+	__forceinline static GSVector4i x007f() {return xffffffff().srl16( 9);}
+	__forceinline static GSVector4i x00ff() {return xffffffff().srl16( 8);}
+	__forceinline static GSVector4i x01ff() {return xffffffff().srl16( 7);}
+	__forceinline static GSVector4i x03ff() {return xffffffff().srl16( 6);}
+	__forceinline static GSVector4i x07ff() {return xffffffff().srl16( 5);}
+	__forceinline static GSVector4i x0fff() {return xffffffff().srl16( 4);}
+	__forceinline static GSVector4i x1fff() {return xffffffff().srl16( 3);}
+	__forceinline static GSVector4i x3fff() {return xffffffff().srl16( 2);}
+	__forceinline static GSVector4i x7fff() {return xffffffff().srl16( 1);}
+
+	// Same high-bit masks built with sll16.
+	__forceinline static GSVector4i x8000() {return xffffffff().sll16(15);}
+	__forceinline static GSVector4i xc000() {return xffffffff().sll16(14);}
+	__forceinline static GSVector4i xe000() {return xffffffff().sll16(13);}
+	__forceinline static GSVector4i xf000() {return xffffffff().sll16(12);}
+	__forceinline static GSVector4i xf800() {return xffffffff().sll16(11);}
+	__forceinline static GSVector4i xfc00() {return xffffffff().sll16(10);}
+	__forceinline static GSVector4i xfe00() {return xffffffff().sll16( 9);}
+	__forceinline static GSVector4i xff00() {return xffffffff().sll16( 8);}
+	__forceinline static GSVector4i xff80() {return xffffffff().sll16( 7);}
+	__forceinline static GSVector4i xffc0() {return xffffffff().sll16( 6);}
+	__forceinline static GSVector4i xffe0() {return xffffffff().sll16( 5);}
+	__forceinline static GSVector4i xfff0() {return xffffffff().sll16( 4);}
+	__forceinline static GSVector4i xfff8() {return xffffffff().sll16( 3);}
+	__forceinline static GSVector4i xfffc() {return xffffffff().sll16( 2);}
+	__forceinline static GSVector4i xfffe() {return xffffffff().sll16( 1);}
+
+	// Seeded variants of the constant generators: the all-ones vector is produced by
+	// comparing the caller-supplied v with itself (v == v) rather than from zero(), so
+	// the value of v never affects the result. The shifts mirror the no-argument forms.
+
+	__forceinline static GSVector4i xffffffff(const GSVector4i& v) {return v == v;}
+
+	// Low-bit masks via srl32.
+	__forceinline static GSVector4i x00000001(const GSVector4i& v) {return xffffffff(v).srl32(31);}
+	__forceinline static GSVector4i x00000003(const GSVector4i& v) {return xffffffff(v).srl32(30);}
+	__forceinline static GSVector4i x00000007(const GSVector4i& v) {return xffffffff(v).srl32(29);}
+	__forceinline static GSVector4i x0000000f(const GSVector4i& v) {return xffffffff(v).srl32(28);}
+	__forceinline static GSVector4i x0000001f(const GSVector4i& v) {return xffffffff(v).srl32(27);}
+	__forceinline static GSVector4i x0000003f(const GSVector4i& v) {return xffffffff(v).srl32(26);}
+	__forceinline static GSVector4i x0000007f(const GSVector4i& v) {return xffffffff(v).srl32(25);}
+	__forceinline static GSVector4i x000000ff(const GSVector4i& v) {return xffffffff(v).srl32(24);}
+	__forceinline static GSVector4i x000001ff(const GSVector4i& v) {return xffffffff(v).srl32(23);}
+	__forceinline static GSVector4i x000003ff(const GSVector4i& v) {return xffffffff(v).srl32(22);}
+	__forceinline static GSVector4i x000007ff(const GSVector4i& v) {return xffffffff(v).srl32(21);}
+	__forceinline static GSVector4i x00000fff(const GSVector4i& v) {return xffffffff(v).srl32(20);}
+	__forceinline static GSVector4i x00001fff(const GSVector4i& v) {return xffffffff(v).srl32(19);}
+	__forceinline static GSVector4i x00003fff(const GSVector4i& v) {return xffffffff(v).srl32(18);}
+	__forceinline static GSVector4i x00007fff(const GSVector4i& v) {return xffffffff(v).srl32(17);}
+	__forceinline static GSVector4i x0000ffff(const GSVector4i& v) {return xffffffff(v).srl32(16);}
+	__forceinline static GSVector4i x0001ffff(const GSVector4i& v) {return xffffffff(v).srl32(15);}
+	__forceinline static GSVector4i x0003ffff(const GSVector4i& v) {return xffffffff(v).srl32(14);}
+	__forceinline static GSVector4i x0007ffff(const GSVector4i& v) {return xffffffff(v).srl32(13);}
+	__forceinline static GSVector4i x000fffff(const GSVector4i& v) {return xffffffff(v).srl32(12);}
+	__forceinline static GSVector4i x001fffff(const GSVector4i& v) {return xffffffff(v).srl32(11);}
+	__forceinline static GSVector4i x003fffff(const GSVector4i& v) {return xffffffff(v).srl32(10);}
+	__forceinline static GSVector4i x007fffff(const GSVector4i& v) {return xffffffff(v).srl32( 9);}
+	__forceinline static GSVector4i x00ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 8);}
+	__forceinline static GSVector4i x01ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 7);}
+	__forceinline static GSVector4i x03ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 6);}
+	__forceinline static GSVector4i x07ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 5);}
+	__forceinline static GSVector4i x0fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 4);}
+	__forceinline static GSVector4i x1fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 3);}
+	__forceinline static GSVector4i x3fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 2);}
+	__forceinline static GSVector4i x7fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 1);}
+
+	// High-bit masks via sll32.
+	__forceinline static GSVector4i x80000000(const GSVector4i& v) {return xffffffff(v).sll32(31);}
+	__forceinline static GSVector4i xc0000000(const GSVector4i& v) {return xffffffff(v).sll32(30);}
+	__forceinline static GSVector4i xe0000000(const GSVector4i& v) {return xffffffff(v).sll32(29);}
+	__forceinline static GSVector4i xf0000000(const GSVector4i& v) {return xffffffff(v).sll32(28);}
+	__forceinline static GSVector4i xf8000000(const GSVector4i& v) {return xffffffff(v).sll32(27);}
+	__forceinline static GSVector4i xfc000000(const GSVector4i& v) {return xffffffff(v).sll32(26);}
+	__forceinline static GSVector4i xfe000000(const GSVector4i& v) {return xffffffff(v).sll32(25);}
+	__forceinline static GSVector4i xff000000(const GSVector4i& v) {return xffffffff(v).sll32(24);}
+	__forceinline static GSVector4i xff800000(const GSVector4i& v) {return xffffffff(v).sll32(23);}
+	__forceinline static GSVector4i xffc00000(const GSVector4i& v) {return xffffffff(v).sll32(22);}
+	__forceinline static GSVector4i xffe00000(const GSVector4i& v) {return xffffffff(v).sll32(21);}
+	__forceinline static GSVector4i xfff00000(const GSVector4i& v) {return xffffffff(v).sll32(20);}
+	__forceinline static GSVector4i xfff80000(const GSVector4i& v) {return xffffffff(v).sll32(19);}
+	__forceinline static GSVector4i xfffc0000(const GSVector4i& v) {return xffffffff(v).sll32(18);}
+	__forceinline static GSVector4i xfffe0000(const GSVector4i& v) {return xffffffff(v).sll32(17);}
+	__forceinline static GSVector4i xffff0000(const GSVector4i& v) {return xffffffff(v).sll32(16);}
+	__forceinline static GSVector4i xffff8000(const GSVector4i& v) {return xffffffff(v).sll32(15);}
+	__forceinline static GSVector4i xffffc000(const GSVector4i& v) {return xffffffff(v).sll32(14);}
+	__forceinline static GSVector4i xffffe000(const GSVector4i& v) {return xffffffff(v).sll32(13);}
+	__forceinline static GSVector4i xfffff000(const GSVector4i& v) {return xffffffff(v).sll32(12);}
+	__forceinline static GSVector4i xfffff800(const GSVector4i& v) {return xffffffff(v).sll32(11);}
+	__forceinline static GSVector4i xfffffc00(const GSVector4i& v) {return xffffffff(v).sll32(10);}
+	__forceinline static GSVector4i xfffffe00(const GSVector4i& v) {return xffffffff(v).sll32( 9);}
+	__forceinline static GSVector4i xffffff00(const GSVector4i& v) {return xffffffff(v).sll32( 8);}
+	__forceinline static GSVector4i xffffff80(const GSVector4i& v) {return xffffffff(v).sll32( 7);}
+	__forceinline static GSVector4i xffffffc0(const GSVector4i& v) {return xffffffff(v).sll32( 6);}
+	__forceinline static GSVector4i xffffffe0(const GSVector4i& v) {return xffffffff(v).sll32( 5);}
+	__forceinline static GSVector4i xfffffff0(const GSVector4i& v) {return xffffffff(v).sll32( 4);}
+	__forceinline static GSVector4i xfffffff8(const GSVector4i& v) {return xffffffff(v).sll32( 3);}
+	__forceinline static GSVector4i xfffffffc(const GSVector4i& v) {return xffffffff(v).sll32( 2);}
+	__forceinline static GSVector4i xfffffffe(const GSVector4i& v) {return xffffffff(v).sll32( 1);}
+
+	// Low-bit masks via srl16.
+	__forceinline static GSVector4i x0001(const GSVector4i& v) {return xffffffff(v).srl16(15);}
+	__forceinline static GSVector4i x0003(const GSVector4i& v) {return xffffffff(v).srl16(14);}
+	__forceinline static GSVector4i x0007(const GSVector4i& v) {return xffffffff(v).srl16(13);}
+	__forceinline static GSVector4i x000f(const GSVector4i& v) {return xffffffff(v).srl16(12);}
+	__forceinline static GSVector4i x001f(const GSVector4i& v) {return xffffffff(v).srl16(11);}
+	__forceinline static GSVector4i x003f(const GSVector4i& v) {return xffffffff(v).srl16(10);}
+	__forceinline static GSVector4i x007f(const GSVector4i& v) {return xffffffff(v).srl16( 9);}
+	__forceinline static GSVector4i x00ff(const GSVector4i& v) {return xffffffff(v).srl16( 8);}
+	__forceinline static GSVector4i x01ff(const GSVector4i& v) {return xffffffff(v).srl16( 7);}
+	__forceinline static GSVector4i x03ff(const GSVector4i& v) {return xffffffff(v).srl16( 6);}
+	__forceinline static GSVector4i x07ff(const GSVector4i& v) {return xffffffff(v).srl16( 5);}
+	__forceinline static GSVector4i x0fff(const GSVector4i& v) {return xffffffff(v).srl16( 4);}
+	__forceinline static GSVector4i x1fff(const GSVector4i& v) {return xffffffff(v).srl16( 3);}
+	__forceinline static GSVector4i x3fff(const GSVector4i& v) {return xffffffff(v).srl16( 2);}
+	__forceinline static GSVector4i x7fff(const GSVector4i& v) {return xffffffff(v).srl16( 1);}
+
+	// High-bit masks via sll16.
+	__forceinline static GSVector4i x8000(const GSVector4i& v) {return xffffffff(v).sll16(15);}
+	__forceinline static GSVector4i xc000(const GSVector4i& v) {return xffffffff(v).sll16(14);}
+	__forceinline static GSVector4i xe000(const GSVector4i& v) {return xffffffff(v).sll16(13);}
+	__forceinline static GSVector4i xf000(const GSVector4i& v) {return xffffffff(v).sll16(12);}
+	__forceinline static GSVector4i xf800(const GSVector4i& v) {return xffffffff(v).sll16(11);}
+	__forceinline static GSVector4i xfc00(const GSVector4i& v) {return xffffffff(v).sll16(10);}
+	__forceinline static GSVector4i xfe00(const GSVector4i& v) {return xffffffff(v).sll16( 9);}
+	__forceinline static GSVector4i xff00(const GSVector4i& v) {return xffffffff(v).sll16( 8);}
+	__forceinline static GSVector4i xff80(const GSVector4i& v) {return xffffffff(v).sll16( 7);}
+	__forceinline static GSVector4i xffc0(const GSVector4i& v) {return xffffffff(v).sll16( 6);}
+	__forceinline static GSVector4i xffe0(const GSVector4i& v) {return xffffffff(v).sll16( 5);}
+	__forceinline static GSVector4i xfff0(const GSVector4i& v) {return xffffffff(v).sll16( 4);}
+	__forceinline static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);}
+	__forceinline static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);}
+	__forceinline static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);}
+
+	// Returns entry n of the static m_xff table; n is not range-checked.
+	__forceinline static GSVector4i xff(int n)
+	{
+		return m_xff[n];
+	}
+
+	// Returns entry n of the static m_x0f table; n is not range-checked.
+	__forceinline static GSVector4i x0f(int n)
+	{
+		return m_x0f[n];
+	}
+};
+
+__aligned(class, 16) GSVector4
+{
+public:
+	// Storage for the vector. Every member aliases the same 16 bytes: three sets of
+	// named float components, raw float and integer arrays of several element widths,
+	// and the SSE register value itself.
+	union
+	{
+		struct {float x, y, z, w;}; // generic component names
+		struct {float r, g, b, a;}; // colour-channel names for the same four floats
+		struct {float left, top, right, bottom;}; // rectangle-edge names for the same four floats
+		float v[4];
+		float f32[4];
+		int8 i8[16];
+		int16 i16[8];
+		int32 i32[4]; // read by the non-SSE4.1 path of extract32()
+		int64 i64[2];
+		uint8 u8[16];
+		uint16 u16[8];
+		uint32 u32[4];
+		uint64 u64[2];
+		__m128 m; // the packed register the intrinsics in this class operate on
+	};
+
+	// Shared constants. They are only declared here; their values are defined elsewhere
+	// and are not visible in this part of the file.
+	static const GSVector4 m_ps0123; // returned by ps0123()
+	static const GSVector4 m_ps4567; // returned by ps4567()
+	static const GSVector4 m_half;
+	static const GSVector4 m_one; // the "one" operand of round() and log2()
+	static const GSVector4 m_two;
+	static const GSVector4 m_four;
+	static const GSVector4 m_x4b000000; // rounding constant of the pre-SSE4.1 round() path; presumably the float with bit pattern 0x4b000000 - TODO confirm
+	static const GSVector4 m_x4f800000; // correction term of the uint32 conversions; presumably the float with bit pattern 0x4f800000 - TODO confirm
+	static const GSVector4 m_max;
+	static const GSVector4 m_min;
+
+	// Default construction performs no initialisation of the register.
+	__forceinline GSVector4()
+	{
+	}
+
+	// Builds (x, y, z, w) from four floats, with x in the lowest lane.
+	__forceinline GSVector4(float x, float y, float z, float w)
+	{
+		m = _mm_setr_ps(x, y, z, w);
+	}
+
+	// Builds (x, y, 0, 0) from two floats.
+	__forceinline GSVector4(float x, float y)
+	{
+		const __m128 lane_x = _mm_load_ss(&x);
+		const __m128 lane_y = _mm_load_ss(&y);
+
+		m = _mm_unpacklo_ps(lane_x, lane_y);
+	}
+
+	// Builds the vector from four integers, converting each lane to float.
+	__forceinline GSVector4(int x, int y, int z, int w)
+	{
+		m = _mm_cvtepi32_ps(GSVector4i(x, y, z, w).m);
+	}
+
+	// Builds (x, y, 0, 0) from two integers, converting each lane to float.
+	__forceinline GSVector4(int x, int y)
+	{
+		const __m128i lane_x = _mm_cvtsi32_si128(x);
+		const __m128i lane_y = _mm_cvtsi32_si128(y);
+
+		m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(lane_x, lane_y));
+	}
+
+	//Not currently used, just causes a compiler warning
+	/*__forceinline GSVector4(const GSVector4& v)
+	{
+		m = v.m;
+	}*/
+
+	// Loads the 64 bits of a GSVector2 into the low half; the high half is zeroed.
+	__forceinline explicit GSVector4(const GSVector2& v)
+	{
+		const __m128i low = _mm_loadl_epi64((const __m128i*)&v);
+
+		m = _mm_castsi128_ps(low);
+	}
+
+	// Loads the 64 bits of a GSVector2i into the low half and converts the lanes to float.
+	__forceinline explicit GSVector4(const GSVector2i& v)
+	{
+		const __m128i low = _mm_loadl_epi64((const __m128i*)&v);
+
+		m = _mm_cvtepi32_ps(low);
+	}
+
+	// Wraps an existing SSE register value.
+	__forceinline explicit GSVector4(__m128 m)
+	{
+		this->m = m;
+	}
+
+	// Broadcasts f to every lane through the float assignment operator.
+	__forceinline explicit GSVector4(float f)
+	{
+		this->operator = (f);
+	}
+
+	// Builds a float vector from a single integer. The AVX2 path broadcasts i to all
+	// lanes and converts; the fallback goes through GSVector4i(int) and the
+	// GSVector4(const GSVector4i&) conversion.
+	__forceinline explicit GSVector4(int i)
+	{
+		#if _M_SSE >= 0x501
+
+		const __m128i splat = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		m = _mm_cvtepi32_ps(splat);
+
+		#else
+
+		GSVector4i splat(i);
+
+		*this = GSVector4(splat);
+
+		#endif
+	}
+	
+	// Builds a float vector from an unsigned integer. The value is first converted as a
+	// signed int; lanes that came out negative (sign mask from sra32(31)) then get
+	// m_x4f800000 added back (presumably 2^32 - TODO confirm the constant's value).
+	__forceinline explicit GSVector4(uint32 u)
+	{
+		GSVector4i bits((int)u);
+
+		GSVector4 correction = m_x4f800000 & GSVector4::cast(bits.sra32(31));
+
+		*this = GSVector4(bits) + correction;
+	}
+
+	__forceinline explicit GSVector4(const GSVector4i& v);
+
+	__forceinline static GSVector4 cast(const GSVector4i& v);
+
+	#if _M_SSE >= 0x500
+
+	__forceinline static GSVector4 cast(const GSVector8& v);
+
+	#endif
+
+	#if _M_SSE >= 0x501
+
+	__forceinline static GSVector4 cast(const GSVector8i& v);
+
+	#endif
+
+	// Copies the register of another vector.
+	__forceinline void operator = (const GSVector4& v)
+	{
+		this->m = v.m;
+	}
+
+	// Broadcasts f to all four lanes.
+	__forceinline void operator = (float f)
+	{
+		#if _M_SSE >= 0x501
+
+		const __m128 scalar = _mm_load_ss(&f);
+
+		m = _mm_broadcastss_ps(scalar);
+
+		#else
+
+		m = _mm_set1_ps(f);
+
+		#endif
+	}
+
+	// Adopts a raw SSE register value.
+	__forceinline void operator = (__m128 m)
+	{
+		this->m = m;
+	}
+
+	// Implicit conversion so the vector can be passed straight to intrinsics.
+	__forceinline operator __m128() const
+	{
+		return this->m;
+	}
+
+	// Converts to GSVector4i and returns that vector's rgba32() value.
+	__forceinline uint32 rgba32() const
+	{
+		GSVector4i as_int(*this);
+
+		return as_int.rgba32();
+	}
+
+	// Expands a 32-bit value into four float lanes via GSVector4i::load() and u8to32().
+	__forceinline static GSVector4 rgba32(uint32 rgba)
+	{
+		GSVector4i channels = GSVector4i::load((int)rgba).u8to32();
+
+		return GSVector4(channels);
+	}
+
+	// Same as rgba32(rgba), with every integer lane shifted left by shift before conversion.
+	__forceinline static GSVector4 rgba32(uint32 rgba, int shift)
+	{
+		GSVector4i channels = GSVector4i::load((int)rgba).u8to32();
+
+		return GSVector4(channels << shift);
+	}
+
+	// Absolute value: ANDs every lane with 0x7fffffff, clearing the sign bit.
+	__forceinline GSVector4 abs() const
+	{
+		const GSVector4 keep = cast(GSVector4i::x7fffffff());
+
+		return *this & keep;
+	}
+
+	// Negation: XORs every lane with 0x80000000, flipping the sign bit.
+	__forceinline GSVector4 neg() const
+	{
+		const GSVector4 sign = cast(GSVector4i::x80000000());
+
+		return *this ^ sign;
+	}
+
+	// Approximate per-lane reciprocal (rcpps).
+	__forceinline GSVector4 rcp() const
+	{
+		const __m128 approx = _mm_rcp_ps(m);
+
+		return GSVector4(approx);
+	}
+
+	// Reciprocal refined with one Newton-Raphson step: 2r - r*r*x, where r = rcp().
+	__forceinline GSVector4 rcpnr() const
+	{
+		const GSVector4 r = rcp();
+		const GSVector4 doubled = r + r;
+		const GSVector4 squared = r * r;
+
+		return doubled - squared * *this;
+	}
+
+	// Rounds every lane according to the compile-time mode. With SSE4.1 the native
+	// instruction is used; the fallback emulates only the nearest, -inf and +inf modes
+	// and asserts on anything else.
+	template<int mode> __forceinline GSVector4 round() const
+	{
+		#if _M_SSE >= 0x401
+
+		return GSVector4(_mm_round_ps(m, mode));
+
+		#else
+
+		const GSVector4& src = *this;
+
+		// m_x4b000000 carrying the sign of src; adding and then subtracting it rounds
+		// src to an integer (presumably the 2^23 trick - TODO confirm the constant).
+		const GSVector4 magic = (src & cast(GSVector4i::x80000000())) | m_x4b000000;
+
+		const GSVector4 nearest = src + magic - magic;
+
+		const int rounding = mode & 7;
+
+		if(rounding == (Round_NegInf & 7))
+		{
+			// step down by one wherever the rounded value ended up above src
+			return nearest - ((src < nearest) & m_one);
+		}
+		else if(rounding == (Round_PosInf & 7))
+		{
+			// step up by one wherever the rounded value ended up below src
+			return nearest + ((src > nearest) & m_one);
+		}
+
+		ASSERT(rounding == (Round_NearestInt & 7)); // other modes aren't implemented
+
+		return nearest;
+
+		#endif
+	}
+
+	// Rounds every lane towards negative infinity.
+	__forceinline GSVector4 floor() const
+	{
+		return this->round<Round_NegInf>();
+	}
+
+	// Rounds every lane towards positive infinity.
+	__forceinline GSVector4 ceil() const
+	{
+		return this->round<Round_PosInf>();
+	}
+
+	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+
+	// Polynomial helpers for log2(). LOG_POLYn(x, c0, ..., cn) expands to a Horner
+	// evaluation built from madd(): c0 + x * (c1 + x * (... + x * cn)). Each coefficient
+	// is broadcast with GSVector4(float); LOG_POLY0 ignores x.
+	#define LOG_POLY0(x, c0) GSVector4(c0)
+	#define LOG_POLY1(x, c0, c1) (LOG_POLY0(x, c1).madd(x, GSVector4(c0)))
+	#define LOG_POLY2(x, c0, c1, c2) (LOG_POLY1(x, c1, c2).madd(x, GSVector4(c0)))
+	#define LOG_POLY3(x, c0, c1, c2, c3) (LOG_POLY2(x, c1, c2, c3).madd(x, GSVector4(c0)))
+	#define LOG_POLY4(x, c0, c1, c2, c3, c4) (LOG_POLY3(x, c1, c2, c3, c4).madd(x, GSVector4(c0)))
+	#define LOG_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector4(c0)))
+
+	// Approximate per-lane base-2 logarithm.
+	//
+	// The float is split as log2(m * 2^e) = log2(m) + e, and log2 of the mantissa
+	// (in [1, 2[ when normalized) is approximated by a minimax polynomial.
+	// precision picks the polynomial (3, 4, 5 or 6 coefficients); any other value
+	// behaves like 5. The sign bit is ignored, so negative inputs are safe.
+	__forceinline GSVector4 log2(int precision = 5) const
+	{
+		GSVector4i bits = GSVector4i::cast(*this);
+
+		// Drop the sign bit, bring the exponent field down and remove the 0x7f bias.
+		GSVector4 exponent = GSVector4(((bits << 1) >> 24) - GSVector4i::x0000007f());
+
+		// Keep only the mantissa bits and OR in m_one to form the value the polynomial sees.
+		GSVector4 mantissa = GSVector4::cast((bits << 9) >> 9) | m_one;
+
+		GSVector4 poly;
+
+		// Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+
+		switch(precision)
+		{
+		case 3:
+			poly = LOG_POLY2(mantissa, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+			break;
+		case 4:
+			poly = LOG_POLY3(mantissa, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+			break;
+		case 6:
+			poly = LOG_POLY5(mantissa, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+			break;
+		case 5:
+		default:
+			poly = LOG_POLY4(mantissa, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+			break;
+		}
+
+		// Multiplying by (mantissa - 1) raises the polynomial degree by one and
+		// guarantees log2(1) == 0.
+		poly = poly * (mantissa - m_one);
+
+		return poly + exponent;
+	}
+
+	// Returns (*this * a) + b. The FMA intrinsic path is compiled out.
+	__forceinline GSVector4 madd(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fmadd_ps(m, a, b));
+
+		#else
+
+		const GSVector4 product = *this * a;
+
+		return product + b;
+
+		#endif
+	}
+
+	// Returns (*this * a) - b. The FMA intrinsic path is compiled out.
+	__forceinline GSVector4 msub(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fmsub_ps(m, a, b));
+
+		#else
+
+		const GSVector4 product = *this * a;
+
+		return product - b;
+
+		#endif
+	}
+
+	// Returns b - (*this * a). The FMA intrinsic path is compiled out.
+	__forceinline GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fnmadd_ps(m, a, b));
+
+		#else
+
+		const GSVector4 product = *this * a;
+
+		return b - product;
+
+		#endif
+	}
+
+	// Returns -b - (*this * a). The FMA intrinsic path is compiled out.
+	__forceinline GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector4(_mm_fnmsub_ps(m, a, b));
+
+		#else
+
+		const GSVector4 product = *this * a;
+
+		return -b - product;
+
+		#endif
+	}
+
+	// Returns *this + a * b (this vector is the addend).
+	__forceinline GSVector4 addm(const GSVector4& a, const GSVector4& b) const
+	{
+		const GSVector4& addend = *this;
+
+		return a.madd(b, addend);
+	}
+
+	// Returns *this - a * b (this vector is the minuend).
+	__forceinline GSVector4 subm(const GSVector4& a, const GSVector4& b) const
+	{
+		const GSVector4& minuend = *this;
+
+		return a.nmadd(b, minuend);
+	}
+
+	// Horizontal add of this vector with itself: (x+y, z+w, x+y, z+w).
+	__forceinline GSVector4 hadd() const
+	{
+		#if _M_SSE >= 0x300
+
+		return GSVector4(_mm_hadd_ps(m, m));
+
+		#else
+
+		const GSVector4 even = xzxz();
+		const GSVector4 odd = ywyw();
+
+		return even + odd;
+
+		#endif
+	}
+
+	// Horizontal add of this vector and v: (x+y, z+w, v.x+v.y, v.z+v.w).
+	__forceinline GSVector4 hadd(const GSVector4& v) const
+	{
+		#if _M_SSE >= 0x300
+
+		return GSVector4(_mm_hadd_ps(m, v.m));
+
+		#else
+
+		const GSVector4 even = xzxz(v);
+		const GSVector4 odd = ywyw(v);
+
+		return even + odd;
+
+		#endif
+	}
+
+	// Horizontal subtract of this vector with itself: (x-y, z-w, x-y, z-w).
+	__forceinline GSVector4 hsub() const
+	{
+		#if _M_SSE >= 0x300
+
+		return GSVector4(_mm_hsub_ps(m, m));
+
+		#else
+
+		const GSVector4 even = xzxz();
+		const GSVector4 odd = ywyw();
+
+		return even - odd;
+
+		#endif
+	}
+
+	// Horizontal subtract of this vector and v: (x-y, z-w, v.x-v.y, v.z-v.w).
+	__forceinline GSVector4 hsub(const GSVector4& v) const
+	{
+		#if _M_SSE >= 0x300
+
+		return GSVector4(_mm_hsub_ps(m, v.m));
+
+		#else
+
+		const GSVector4 even = xzxz(v);
+		const GSVector4 odd = ywyw(v);
+
+		return even - odd;
+
+		#endif
+	}
+
+	#if _M_SSE >= 0x401
+
+	// Dot product with v (dpps); the template argument i is passed through as the
+	// instruction's immediate mask. Only available from SSE4.1 on.
+	template<int i> __forceinline GSVector4 dp(const GSVector4& v) const
+	{
+		const __m128 dot = _mm_dp_ps(m, v.m, i);
+
+		return GSVector4(dot);
+	}
+
+	#endif
+
+	// Saturates every lane into [a, b]: min(max(*this, a), b).
+	__forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const
+	{
+		const __m128 raised = _mm_max_ps(m, a);
+
+		return GSVector4(_mm_min_ps(raised, b));
+	}
+
+	// Saturates with bounds packed in one vector: a.xyxy() is the lower bound and
+	// a.zwzw() the upper bound.
+	__forceinline GSVector4 sat(const GSVector4& a) const
+	{
+		return sat(a.xyxy(), a.zwzw());
+	}
+
+	// Saturates every lane into [0, scale].
+	__forceinline GSVector4 sat(const float scale = 255) const
+	{
+		const GSVector4 upper(scale);
+
+		return sat(zero(), upper);
+	}
+
+	// Limits every lane to at most scale; there is no lower bound.
+	__forceinline GSVector4 clamp(const float scale = 255) const
+	{
+		const GSVector4 upper(scale);
+
+		return min(upper);
+	}
+
+	// Per-lane minimum of this vector and a.
+	__forceinline GSVector4 min(const GSVector4& a) const
+	{
+		const __m128 lowest = _mm_min_ps(m, a);
+
+		return GSVector4(lowest);
+	}
+
+	// Per-lane maximum of this vector and a.
+	__forceinline GSVector4 max(const GSVector4& a) const
+	{
+		const __m128 highest = _mm_max_ps(m, a);
+
+		return GSVector4(highest);
+	}
+
+	#if _M_SSE >= 0x401
+
+	// Compile-time blend: lanes whose bit is set in mask come from a, the rest from
+	// this vector. Only available from SSE4.1 on.
+	template<int mask> __forceinline GSVector4 blend32(const GSVector4& a)  const
+	{
+		const __m128 mixed = _mm_blend_ps(m, a, mask);
+
+		return GSVector4(mixed);
+	}
+
+	#endif
+
+	// Run-time blend: takes a where mask is set and this vector elsewhere. The SSE4.1
+	// path uses blendvps; the fallback is a bitwise (~mask & *this) | (mask & a).
+	__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask)  const
+	{
+		#if _M_SSE >= 0x401
+
+		return GSVector4(_mm_blendv_ps(m, a, mask));
+
+		#else
+
+		const __m128 kept = _mm_andnot_ps(mask, m);
+		const __m128 taken = _mm_and_ps(mask, a);
+
+		return GSVector4(_mm_or_ps(kept, taken));
+
+		#endif
+	}
+
+	// Interleaves the low halves: (x, a.x, y, a.y).
+	__forceinline GSVector4 upl(const GSVector4& a) const
+	{
+		const __m128 r = _mm_unpacklo_ps(m, a);
+
+		return GSVector4(r);
+	}
+
+	// Interleaves the high halves: (z, a.z, w, a.w).
+	__forceinline GSVector4 uph(const GSVector4& a) const
+	{
+		const __m128 r = _mm_unpackhi_ps(m, a);
+
+		return GSVector4(r);
+	}
+
+	// Low half of this vector followed by the low half of a: (x, y, a.x, a.y).
+	__forceinline GSVector4 l2h(const GSVector4& a) const
+	{
+		const __m128 r = _mm_movelh_ps(m, a);
+
+		return GSVector4(r);
+	}
+
+	// High half of a followed by the high half of this vector: (a.z, a.w, z, w).
+	__forceinline GSVector4 h2l(const GSVector4& a) const
+	{
+		const __m128 r = _mm_movehl_ps(m, a);
+
+		return GSVector4(r);
+	}
+
+	// Returns *this & ~v (note the operand order of the intrinsic is swapped).
+	__forceinline GSVector4 andnot(const GSVector4& v) const
+	{
+		const __m128 r = _mm_andnot_ps(v.m, m);
+
+		return GSVector4(r);
+	}
+
+	// Collects the sign bit of each lane into the low four bits of an int.
+	__forceinline int mask() const
+	{
+		const int sign_bits = _mm_movemask_ps(m);
+
+		return sign_bits;
+	}
+
+	// True when all four lanes have their sign bit set.
+	__forceinline bool alltrue() const
+	{
+		const int sign_bits = mask();
+
+		return sign_bits == 0xf;
+	}
+
+	// Returns true when no lane has its sign (most significant) bit set, i.e. when every
+	// lane of a comparison mask is "false". This is the counterpart of alltrue(), which
+	// tests the same four sign bits through mask().
+	__forceinline bool allfalse() const
+	{
+		#if _M_SSE >= 0x500
+
+		// vtestps only inspects the sign bit of each 32-bit lane.
+		return _mm_testz_ps(m, m) != 0;
+
+		#else
+
+		// The SSE4.1 _mm_testz_si128(a, a) form is deliberately not used: it tests all
+		// 128 bits, so a vector with clear sign bits but other bits set would give a
+		// different answer than the AVX and SSE2 builds.
+		return mask() == 0;
+
+		#endif
+	}
+
+	// Returns this vector with every NaN lane replaced by the matching lane of v.
+	// A lane is NaN exactly when it does not compare equal to itself.
+	__forceinline GSVector4 replace_nan(const GSVector4& v) const
+	{
+		const GSVector4 not_nan = (*this == *this);
+
+		return v.blend32(*this, not_nan);
+	}
+
+	// Returns a copy of this vector in which lane dst is replaced by lane src of v;
+	// the other three lanes are unchanged. src and dst are compile-time lane indices
+	// in 0..3; any other value reaches __assume(0).
+	//
+	// Each case uses the two-operand swizzles (x/y taken from the object, z/w from the
+	// argument): the first shuffle brings the wanted lane of v next to the lane(s) of
+	// this vector that the second shuffle needs, and the second shuffle assembles the
+	// final order.
+	template<int src, int dst> __forceinline GSVector4 insert32(const GSVector4& v) const
+	{
+		// TODO: use blendps when src == dst
+
+		#if 0 // _M_SSE >= 0x401
+
+		// NOTE: it's faster with shuffles...
+
+		return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
+
+		#else
+
+		switch(dst)
+		{
+		case 0:
+			// temp = (y, y, v[src], v[src]); result = (temp.z, temp.x, z, w)
+			switch(src)
+			{
+			case 0: return yyxx(v).zxzw(*this);
+			case 1: return yyyy(v).zxzw(*this);
+			case 2: return yyzz(v).zxzw(*this);
+			case 3: return yyww(v).zxzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 1:
+			// temp = (x, x, v[src], v[src]); result = (temp.x, temp.z, z, w)
+			switch(src)
+			{
+			case 0: return xxxx(v).xzzw(*this);
+			case 1: return xxyy(v).xzzw(*this);
+			case 2: return xxzz(v).xzzw(*this);
+			case 3: return xxww(v).xzzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 2:
+			// temp = (w, w, v[src], v[src]); result = (x, y, temp.z, temp.x)
+			switch(src)
+			{
+			case 0: return xyzx(wwxx(v));
+			case 1: return xyzx(wwyy(v));
+			case 2: return xyzx(wwzz(v));
+			case 3: return xyzx(wwww(v));
+			default: __assume(0);
+			}
+			break;
+		case 3:
+			// temp = (z, z, v[src], v[src]); result = (x, y, temp.x, temp.z)
+			switch(src)
+			{
+			case 0: return xyxz(zzxx(v));
+			case 1: return xyxz(zzyy(v));
+			case 2: return xyxz(zzzz(v));
+			case 3: return xyxz(zzww(v));
+			default: __assume(0);
+			}
+			break;
+		default:
+			__assume(0);
+		}
+
+		#endif
+
+	}
+
+#ifdef __linux__
+	// extract32<N>() returns the raw 32-bit contents of lane N as an int (the bit
+	// pattern, not a float-to-int conversion). The two definitions below are identical
+	// apart from the template parameter name, which is "index" on Linux for the reason
+	// recorded in the disabled block that follows.
+#if 0
+	// Debug build error, _mm_extract_ps is actually a macro that use an anonymous union
+	// that contains i. I decide to rename the template on linux but it makes windows unhappy
+	// Hence the nice ifdef
+	//
+	// Code extract:
+	// union { int i; float f; } __tmp;
+
+GSVector.h:2977:40: error: declaration of 'int GSVector4::extract32() const::<anonymous union>::i'
+   return _mm_extract_ps(m, i);
+GSVector.h:2973:15: error:  shadows template parm 'int i'
+  template<int i> __forceinline int extract32() const
+#endif
+
+	template<int index> __forceinline int extract32() const
+	{
+		#if _M_SSE >= 0x401
+
+		return _mm_extract_ps(m, index);
+
+		#else
+
+		// no extractps before SSE4.1: read the lane through the union's int view
+		return i32[index];
+
+		#endif
+	}
+#else
+	template<int i> __forceinline int extract32() const
+	{
+		#if _M_SSE >= 0x401
+
+		return _mm_extract_ps(m, i);
+
+		#else
+
+		// no extractps before SSE4.1: read the lane through the union's int view
+		return i32[i];
+
+		#endif
+	}
+#endif
+
+	// All lanes 0.0f.
+	__forceinline static GSVector4 zero()
+	{
+		const __m128 cleared = _mm_setzero_ps();
+
+		return GSVector4(cleared);
+	}
+
+	// All bits set, produced by comparing a zero vector with itself.
+	__forceinline static GSVector4 xffffffff()
+	{
+		const GSVector4 z = zero();
+
+		return z == z;
+	}
+
+	// Copy of the shared constant m_ps0123.
+	__forceinline static GSVector4 ps0123()
+	{
+		return m_ps0123;
+	}
+
+	// Copy of the shared constant m_ps4567.
+	__forceinline static GSVector4 ps4567()
+	{
+		return m_ps4567;
+	}
+
+	// Loads 64 bits from p into the low half; the high half is zeroed.
+	__forceinline static GSVector4 loadl(const void* p)
+	{
+		const __m128d low = _mm_load_sd((double*)p);
+
+		return GSVector4(_mm_castpd_ps(low));
+	}
+
+	// Places f in the lowest lane and zeroes the other three.
+	__forceinline static GSVector4 load(float f)
+	{
+		const __m128 scalar = _mm_load_ss(&f);
+
+		return GSVector4(scalar);
+	}
+
+	// Loads an unsigned integer through GSVector4i::load() and converts it to float.
+	// The signed conversion is corrected by adding m_x4f800000 to lanes that came out
+	// negative (presumably 2^32 - TODO confirm the constant's value).
+	__forceinline static GSVector4 load(uint32 u)
+	{
+		GSVector4i bits = GSVector4i::load((int)u);
+
+		GSVector4 correction = m_x4f800000 & GSVector4::cast(bits.sra32(31));
+
+		return GSVector4(bits) + correction;
+	}
+
+	// Loads four floats from p; aligned selects the 16-byte-aligned or unaligned load.
+	template<bool aligned> __forceinline static GSVector4 load(const void* p)
+	{
+		const float* src = (const float*)p;
+
+		if(aligned)
+		{
+			return GSVector4(_mm_load_ps(src));
+		}
+
+		return GSVector4(_mm_loadu_ps(src));
+	}
+
+	// Non-temporal (streaming) store of all four lanes to p.
+	__forceinline static void storent(void* p, const GSVector4& v)
+	{
+		float* dst = (float*)p;
+
+		_mm_stream_ps(dst, v.m);
+	}
+
+	// Stores the low 64 bits (lanes x and y) to p.
+	__forceinline static void storel(void* p, const GSVector4& v)
+	{
+		double* dst = (double*)p;
+
+		_mm_store_sd(dst, _mm_castps_pd(v.m));
+	}
+
+	// Stores all four lanes to p; aligned selects the 16-byte-aligned or unaligned store.
+	template<bool aligned> __forceinline static void store(void* p, const GSVector4& v)
+	{
+		float* dst = (float*)p;
+
+		if(aligned)
+		{
+			_mm_store_ps(dst, v.m);
+		}
+		else
+		{
+			_mm_storeu_ps(dst, v.m);
+		}
+	}
+
+	// Splits every 32-bit element of v into its four bytes and converts them to float:
+	// a receives bits 0-7, b bits 8-15, c bits 16-23 and d the remaining top bits
+	// (v >> 24, not masked).
+	__forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
+	{
+		const GSVector4i low_byte = GSVector4i::x000000ff();
+
+		a = GSVector4(v & low_byte);
+		b = GSVector4((v >> 8) & low_byte);
+		c = GSVector4((v >> 16) & low_byte);
+		d = GSVector4(v >> 24);
+	}
+
+	// In-place transpose of the 4x4 matrix whose rows are a, b, c and d.
+	__forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
+	{
+		// Gather the low (x, y) and high (z, w) halves of each row pair first, so every
+		// input is read before any output is written.
+		const GSVector4 ab_lo = a.xyxy(b);
+		const GSVector4 cd_lo = c.xyxy(d);
+		const GSVector4 ab_hi = a.zwzw(b);
+		const GSVector4 cd_hi = c.zwzw(d);
+
+		// Even lanes of a half form one output row, odd lanes the next one.
+		a = ab_lo.xzxz(cd_lo);
+		b = ab_lo.ywyw(cd_lo);
+		c = ab_hi.xzxz(cd_hi);
+		d = ab_hi.ywyw(cd_hi);
+	}
+
+	// Unary minus: forwards to neg().
+	__forceinline GSVector4 operator - () const {return this->neg();}
+
+	// Lane-wise compound arithmetic with another vector.
+
+	__forceinline void operator += (const GSVector4& v) {m = _mm_add_ps(m, v.m);}
+
+	__forceinline void operator -= (const GSVector4& v) {m = _mm_sub_ps(m, v.m);}
+
+	__forceinline void operator *= (const GSVector4& v) {m = _mm_mul_ps(m, v.m);}
+
+	__forceinline void operator /= (const GSVector4& v) {m = _mm_div_ps(m, v.m);}
+
+	// Scalar forms: build a GSVector4 from f and reuse the vector operator.
+
+	__forceinline void operator += (float f) {operator += (GSVector4(f));}
+
+	__forceinline void operator -= (float f) {operator -= (GSVector4(f));}
+
+	__forceinline void operator *= (float f) {operator *= (GSVector4(f));}
+
+	__forceinline void operator /= (float f) {operator /= (GSVector4(f));}
+
+	// Bitwise compound operators on the raw 128 bits.
+
+	__forceinline void operator &= (const GSVector4& v) {m = _mm_and_ps(m, v.m);}
+
+	__forceinline void operator |= (const GSVector4& v) {m = _mm_or_ps(m, v.m);}
+
+	__forceinline void operator ^= (const GSVector4& v) {m = _mm_xor_ps(m, v.m);}
+
+	// Lane-wise arithmetic on two vectors.
+
+	__forceinline friend GSVector4 operator + (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_add_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator - (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_sub_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator * (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_mul_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator / (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_div_ps(v1.m, v2.m));}
+
+	// Vector-with-scalar arithmetic: f is turned into a GSVector4 first and
+	// the vector operator does the work.
+
+	__forceinline friend GSVector4 operator + (const GSVector4& v, float f)
+	{
+		const GSVector4 rhs(f);
+
+		return v + rhs;
+	}
+
+	__forceinline friend GSVector4 operator - (const GSVector4& v, float f)
+	{
+		const GSVector4 rhs(f);
+
+		return v - rhs;
+	}
+
+	__forceinline friend GSVector4 operator * (const GSVector4& v, float f)
+	{
+		const GSVector4 rhs(f);
+
+		return v * rhs;
+	}
+
+	__forceinline friend GSVector4 operator / (const GSVector4& v, float f)
+	{
+		const GSVector4 rhs(f);
+
+		return v / rhs;
+	}
+
+	// Bitwise operators on the raw 128 bits.
+
+	__forceinline friend GSVector4 operator & (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_and_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator | (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_or_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator ^ (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_xor_ps(v1.m, v2.m));}
+
+	// Lane-wise comparisons. Each returns a mask vector (all bits set in a
+	// lane where the comparison holds, zero otherwise), not a bool.
+
+	__forceinline friend GSVector4 operator == (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmpeq_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator != (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmpneq_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator > (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmpgt_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator < (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmplt_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator >= (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmpge_ps(v1.m, v2.m));}
+
+	__forceinline friend GSVector4 operator <= (const GSVector4& v1, const GSVector4& v2) {return GSVector4(_mm_cmple_ps(v1.m, v2.m));}
+
+	// Swizzle generator.
+	//
+	// VECTOR4_SHUFFLE_4 defines two member functions whose name is the four
+	// component letters pasted together (e.g. xyzw, wzyx, xxxx):
+	//   - name()         shuffles m with itself;
+	//   - name(const v&) takes the first two result lanes from this vector and
+	//                    the last two from v (the _mm_shuffle_ps convention).
+	// The *n arguments are the lane indices (0..3) matching the *s letters.
+	#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	// Fixes the first three components and expands over every choice of the fourth.
+	#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	// Fixes the first two components and expands over every choice of the third.
+	#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	// Fixes the first component and expands over every choice of the second.
+	#define VECTOR4_SHUFFLE_1(xs, xn) \
+		VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+	// Instantiate all 4*4*4*4 = 256 swizzle names (two overloads each).
+	VECTOR4_SHUFFLE_1(x, 0)
+	VECTOR4_SHUFFLE_1(y, 1)
+	VECTOR4_SHUFFLE_1(z, 2)
+	VECTOR4_SHUFFLE_1(w, 3)
+
+	#if _M_SSE >= 0x501
+
+	// Replicates lane 0 of this vector into all four lanes.
+	__forceinline GSVector4 broadcast32() const
+	{
+		return broadcast32(*this);
+	}
+
+	// Replicates lane 0 of v into all four lanes.
+	__forceinline static GSVector4 broadcast32(const GSVector4& v)
+	{
+		const __m128 splat = _mm_broadcastss_ps(v.m);
+
+		return GSVector4(splat);
+	}
+
+	// Reads one float from f and replicates it into all four lanes.
+	__forceinline static GSVector4 broadcast32(const void* f)
+	{
+		const __m128 scalar = _mm_load_ss((const float*)f);
+
+		return GSVector4(_mm_broadcastss_ps(scalar));
+	}
+
+	#endif
+};
+
+#if _M_SSE >= 0x501
+
+__aligned(class, 32) GSVector8i
+{
+	static const GSVector8i m_xff[33];
+	static const GSVector8i m_x0f[33];
+
+public:
+	// Overlapping views of the same 256 bits of storage.
+	union
+	{
+		// eight 32-bit ints named as two xyzw groups (suffix 0 first, 1 second)
+		struct {int x0, y0, z0, w0, x1, y1, z1, w1;};
+		// the same eight ints named as two rgba groups
+		struct {int r0, g0, b0, a0, r1, g1, b1, a1;};
+		int v[8];
+		float f32[8];
+		int8 i8[32];
+		int16 i16[16];
+		int32 i32[8];
+		int64 i64[4];
+		uint8 u8[32];
+		uint16 u16[16];
+		uint32 u32[8];
+		uint64 u64[4];
+		// the full AVX2 register; this is the member the methods operate on
+		__m256i m;
+		// NOTE(review): m0 and m1 are two separate union members, so BOTH start
+		// at offset 0 and alias the low 128 bits; m1 is not the high lane.
+		// If high-lane access was intended, they would need to be wrapped in an
+		// anonymous struct like the int groups above - confirm against users.
+		__m128i m0, m1;
+	};
+
+	// Default constructor: empty body, the contents are left uninitialized.
+	__forceinline GSVector8i() {}
+
+	// Construction from a GSVector8; declared here only, the definition is
+	// not in this part of the file (meaning of 'truncate' - TODO confirm there).
+	__forceinline explicit GSVector8i(const GSVector8& v, bool truncate = true);
+
+	// cast() overloads from the other vector types; declared here only.
+	__forceinline static GSVector8i cast(const GSVector8& v);
+	__forceinline static GSVector8i cast(const GSVector4& v);
+	__forceinline static GSVector8i cast(const GSVector4i& v);
+
+	// Builds the vector from eight 32-bit ints; x0 ends up in the lowest
+	// element and w1 in the highest.
+	__forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		// setr takes its arguments in memory order (lowest element first)
+		m = _mm256_setr_epi32(x0, y0, z0, w0, x1, y1, z1, w1);
+	}
+
+	// Builds the vector from sixteen 16-bit values; s0 is the lowest element.
+	__forceinline GSVector8i(
+		short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7,
+		short s8, short s9, short s10, short s11, short s12, short s13, short s14, short s15)
+	{
+		// setr takes its arguments in memory order (lowest element first)
+		m = _mm256_setr_epi16(
+			s0, s1, s2, s3, s4, s5, s6, s7,
+			s8, s9, s10, s11, s12, s13, s14, s15);
+	}
+
+	// Builds the vector from thirty-two 8-bit values; b0 is the lowest byte.
+	__forceinline GSVector8i(
+		char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7,
+		char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15,
+		char b16, char b17, char b18, char b19, char b20, char b21, char b22, char b23,
+		char b24, char b25, char b26, char b27, char b28, char b29, char b30, char b31
+		)
+	{
+		// setr takes its arguments in memory order (lowest byte first)
+		m = _mm256_setr_epi8(
+			b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15,
+			b16, b17, b18, b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31);
+	}
+
+	// Builds a vector whose low 128-bit lane is m0 and whose high lane is m1.
+	__forceinline GSVector8i(__m128i m0, __m128i m1)
+	{
+		#if 0 // _MSC_VER >= 1700
+
+		// Disabled alternative. The selector must be 0x20 (low lane = low half
+		// of the first operand, high lane = low half of the second operand);
+		// a selector of 0 would copy m0 into both lanes and drop m1.
+		this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m0), _mm256_castsi128_si256(m1), 0x20);
+
+		#else
+
+		// start from zero() and insert each half into its lane
+		*this = zero().insert<0>(m0).insert<1>(m1);
+
+		#endif
+	}
+
+	// Copy constructor: duplicates all 256 bits of v.
+	__forceinline GSVector8i(const GSVector8i& v)
+	{
+		this->m = v.m;
+	}
+
+	// Broadcast constructor; forwards to operator = (int).
+	__forceinline explicit GSVector8i(int i)
+	{
+		operator = (i);
+	}
+
+	// Forwards to operator = (__m128i), which places m in both lanes.
+	__forceinline explicit GSVector8i(__m128i m)
+	{
+		operator = (m);
+	}
+
+	// Wraps a raw 256-bit register.
+	__forceinline explicit GSVector8i(__m256i m)
+	{
+		(*this).m = m;
+	}
+
+	// Copy assignment.
+	__forceinline void operator = (const GSVector8i& v)
+	{
+		this->m = v.m;
+	}
+
+	// Broadcasts i into all eight 32-bit elements (same result as _mm256_set1_epi32(i)).
+	__forceinline void operator = (int i)
+	{
+		const __m128i scalar = _mm_cvtsi32_si128(i);
+
+		m = _mm256_broadcastd_epi32(scalar);
+	}
+
+	// Places the 128-bit value m in both the low and the high lane.
+	__forceinline void operator = (__m128i m)
+	{
+		const __m256i low = _mm256_castsi128_si256(m);
+
+		this->m = _mm256_inserti128_si256(low, m, 1);
+	}
+
+	// Takes over a raw 256-bit register.
+	__forceinline void operator = (__m256i m)
+	{
+		(*this).m = m;
+	}
+
+	// Implicit conversion to the raw register so the object can be passed
+	// straight to intrinsics.
+	__forceinline operator __m256i() const
+	{
+		return this->m;
+	}
+
+	//
+
+	// Range clamps. The two-argument forms compute min(max(this, a), b) per
+	// element. The one-argument forms take the lower bound from a.xyxy() and
+	// the upper bound from a.zwzw().
+
+	__forceinline GSVector8i sat_i8(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_i8(a);
+
+		return raised.min_i8(b);
+	}
+
+	__forceinline GSVector8i sat_i8(const GSVector8i& a) const
+	{
+		return sat_i8(a.xyxy(), a.zwzw());
+	}
+
+	__forceinline GSVector8i sat_i16(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_i16(a);
+
+		return raised.min_i16(b);
+	}
+
+	__forceinline GSVector8i sat_i16(const GSVector8i& a) const
+	{
+		return sat_i16(a.xyxy(), a.zwzw());
+	}
+
+	__forceinline GSVector8i sat_i32(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_i32(a);
+
+		return raised.min_i32(b);
+	}
+
+	__forceinline GSVector8i sat_i32(const GSVector8i& a) const
+	{
+		return sat_i32(a.xyxy(), a.zwzw());
+	}
+
+	__forceinline GSVector8i sat_u8(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_u8(a);
+
+		return raised.min_u8(b);
+	}
+
+	__forceinline GSVector8i sat_u8(const GSVector8i& a) const
+	{
+		return sat_u8(a.xyxy(), a.zwzw());
+	}
+
+	__forceinline GSVector8i sat_u16(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_u16(a);
+
+		return raised.min_u16(b);
+	}
+
+	__forceinline GSVector8i sat_u16(const GSVector8i& a) const
+	{
+		return sat_u16(a.xyxy(), a.zwzw());
+	}
+
+	__forceinline GSVector8i sat_u32(const GSVector8i& a, const GSVector8i& b) const
+	{
+		const GSVector8i raised = max_u32(a);
+
+		return raised.min_u32(b);
+	}
+
+	__forceinline GSVector8i sat_u32(const GSVector8i& a) const
+	{
+		return sat_u32(a.xyxy(), a.zwzw());
+	}
+
+	// Element-wise minimum/maximum. The suffix gives signedness (i/u) and
+	// element width in bits.
+
+	__forceinline GSVector8i min_i8(const GSVector8i& a) const {return GSVector8i(_mm256_min_epi8(m, a.m));}
+
+	__forceinline GSVector8i max_i8(const GSVector8i& a) const {return GSVector8i(_mm256_max_epi8(m, a.m));}
+
+	__forceinline GSVector8i min_i16(const GSVector8i& a) const {return GSVector8i(_mm256_min_epi16(m, a.m));}
+
+	__forceinline GSVector8i max_i16(const GSVector8i& a) const {return GSVector8i(_mm256_max_epi16(m, a.m));}
+
+	__forceinline GSVector8i min_i32(const GSVector8i& a) const {return GSVector8i(_mm256_min_epi32(m, a.m));}
+
+	__forceinline GSVector8i max_i32(const GSVector8i& a) const {return GSVector8i(_mm256_max_epi32(m, a.m));}
+
+	__forceinline GSVector8i min_u8(const GSVector8i& a) const {return GSVector8i(_mm256_min_epu8(m, a.m));}
+
+	__forceinline GSVector8i max_u8(const GSVector8i& a) const {return GSVector8i(_mm256_max_epu8(m, a.m));}
+
+	__forceinline GSVector8i min_u16(const GSVector8i& a) const {return GSVector8i(_mm256_min_epu16(m, a.m));}
+
+	__forceinline GSVector8i max_u16(const GSVector8i& a) const {return GSVector8i(_mm256_max_epu16(m, a.m));}
+
+	__forceinline GSVector8i min_u32(const GSVector8i& a) const {return GSVector8i(_mm256_min_epu32(m, a.m));}
+
+	__forceinline GSVector8i max_u32(const GSVector8i& a) const {return GSVector8i(_mm256_max_epu32(m, a.m));}
+
+	// Packs the 16-bit elements to unsigned 8-bit with saturation, then
+	// widens the low bytes of each lane back to 16 bits.
+	__forceinline GSVector8i clamp8() const
+	{
+		const GSVector8i packed = pu16();
+
+		return packed.upl8();
+	}
+
+	// Per-byte select: takes the byte from a where the top bit of the
+	// corresponding mask byte is set, otherwise from this vector.
+	__forceinline GSVector8i blend8(const GSVector8i& a, const GSVector8i& mask) const
+	{
+		return GSVector8i(_mm256_blendv_epi8(m, a.m, mask.m));
+	}
+
+	// Per-16-bit select with a compile-time mask (applied to both lanes).
+	template<int mask> __forceinline GSVector8i blend16(const GSVector8i& a) const
+	{
+		return GSVector8i(_mm256_blend_epi16(m, a.m, mask));
+	}
+
+	// Bitwise select: (this & ~mask) | (a & mask).
+	__forceinline GSVector8i blend(const GSVector8i& a, const GSVector8i& mask) const
+	{
+		const __m256i kept = _mm256_andnot_si256(mask, m);
+		const __m256i taken = _mm256_and_si256(mask, a);
+
+		return GSVector8i(_mm256_or_si256(kept, taken));
+	}
+
+	// Takes the odd 16-bit elements from a and the even ones from this vector.
+	__forceinline GSVector8i mix16(const GSVector8i& a) const
+	{
+		return this->blend16<0xaa>(a);
+	}
+
+	// Byte shuffle within each 128-bit lane, controlled by mask.
+	__forceinline GSVector8i shuffle8(const GSVector8i& mask) const
+	{
+		return GSVector8i(_mm256_shuffle_epi8(m, mask.m));
+	}
+
+	// Saturating packs (per 128-bit lane). ps* = signed saturation, pu* =
+	// unsigned saturation; the number is the SOURCE element width. The
+	// argument-less forms pack this vector with itself.
+
+	__forceinline GSVector8i ps16(const GSVector8i& a) const {return GSVector8i(_mm256_packs_epi16(m, a.m));}
+
+	__forceinline GSVector8i ps16() const {return ps16(*this);}
+
+	__forceinline GSVector8i pu16(const GSVector8i& a) const {return GSVector8i(_mm256_packus_epi16(m, a.m));}
+
+	__forceinline GSVector8i pu16() const {return pu16(*this);}
+
+	__forceinline GSVector8i ps32(const GSVector8i& a) const {return GSVector8i(_mm256_packs_epi32(m, a.m));}
+
+	__forceinline GSVector8i ps32() const {return ps32(*this);}
+
+	__forceinline GSVector8i pu32(const GSVector8i& a) const {return GSVector8i(_mm256_packus_epi32(m, a.m));}
+
+	__forceinline GSVector8i pu32() const {return pu32(*this);}
+
+	// Interleaves (per 128-bit lane). upl* interleaves the low halves of this
+	// vector and a, uph* the high halves; the number is the element width.
+
+	__forceinline GSVector8i upl8(const GSVector8i& a) const {return GSVector8i(_mm256_unpacklo_epi8(m, a.m));}
+
+	__forceinline GSVector8i uph8(const GSVector8i& a) const {return GSVector8i(_mm256_unpackhi_epi8(m, a.m));}
+
+	__forceinline GSVector8i upl16(const GSVector8i& a) const {return GSVector8i(_mm256_unpacklo_epi16(m, a.m));}
+
+	__forceinline GSVector8i uph16(const GSVector8i& a) const {return GSVector8i(_mm256_unpackhi_epi16(m, a.m));}
+
+	__forceinline GSVector8i upl32(const GSVector8i& a) const {return GSVector8i(_mm256_unpacklo_epi32(m, a.m));}
+
+	__forceinline GSVector8i uph32(const GSVector8i& a) const {return GSVector8i(_mm256_unpackhi_epi32(m, a.m));}
+
+	__forceinline GSVector8i upl64(const GSVector8i& a) const {return GSVector8i(_mm256_unpacklo_epi64(m, a.m));}
+
+	__forceinline GSVector8i uph64(const GSVector8i& a) const {return GSVector8i(_mm256_unpackhi_epi64(m, a.m));}
+
+	// Argument-less forms interleave with an all-zero vector, i.e. they
+	// zero-extend the selected half of each lane to the next element width.
+
+	__forceinline GSVector8i upl8() const {return upl8(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i uph8() const {return uph8(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i upl16() const {return upl16(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i uph16() const {return uph16(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i upl32() const {return upl32(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i uph32() const {return uph32(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i upl64() const {return upl64(GSVector8i(_mm256_setzero_si256()));}
+
+	__forceinline GSVector8i uph64() const {return uph64(GSVector8i(_mm256_setzero_si256()));}
+
+	// cross lane! from 128-bit to full 256-bit range
+
+	// Widening conversions that read from the LOW 128 bits of this vector and
+	// spread the result over the full 256 bits. 'i' = sign-extend, 'u' =
+	// zero-extend.
+
+	__forceinline GSVector8i i8to16c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi8_epi16(low));
+	}
+
+	__forceinline GSVector8i u8to16c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepu8_epi16(low));
+	}
+
+	__forceinline GSVector8i i8to32c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi8_epi32(low));
+	}
+
+	__forceinline GSVector8i u8to32c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepu8_epi32(low));
+	}
+
+	__forceinline GSVector8i i8to64c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi8_epi64(low));
+	}
+
+	// Zero-extends the four lowest unsigned 8-bit elements of this vector to
+	// four 64-bit elements (cross lane: the result fills all 256 bits).
+	__forceinline GSVector8i u8to64c() const
+	{
+		// must be the epu8 variant; the epu16 one would read 16-bit elements
+		return GSVector8i(_mm256_cvtepu8_epi64(_mm256_castsi256_si128(m)));
+	}
+
+	// Widening conversions from 16- and 32-bit elements in the low 128 bits
+	// to the full 256 bits. 'i' = sign-extend, 'u' = zero-extend.
+
+	__forceinline GSVector8i i16to32c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi16_epi32(low));
+	}
+
+	__forceinline GSVector8i u16to32c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepu16_epi32(low));
+	}
+
+	__forceinline GSVector8i i16to64c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi16_epi64(low));
+	}
+
+	__forceinline GSVector8i u16to64c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepu16_epi64(low));
+	}
+
+	__forceinline GSVector8i i32to64c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepi32_epi64(low));
+	}
+
+	__forceinline GSVector8i u32to64c() const
+	{
+		const __m128i low = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_cvtepu32_epi64(low));
+	}
+
+	//
+
+	// Widening loads straight from memory (or from an int for the 8->64
+	// forms). The _mm_load_si128 variants read 16 bytes and require p to be
+	// 16-byte aligned; the _mm_loadl_epi64 variants read 8 bytes.
+
+	static __forceinline GSVector8i i8to16c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepi8_epi16(src));
+	}
+
+	static __forceinline GSVector8i u8to16c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepu8_epi16(src));
+	}
+
+	static __forceinline GSVector8i i8to32c(const void* p)
+	{
+		const __m128i src = _mm_loadl_epi64((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepi8_epi32(src));
+	}
+
+	static __forceinline GSVector8i u8to32c(const void* p)
+	{
+		const __m128i src = _mm_loadl_epi64((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepu8_epi32(src));
+	}
+
+	// the four source bytes are the four bytes of i
+	static __forceinline GSVector8i i8to64c(int i)
+	{
+		const __m128i src = _mm_cvtsi32_si128(i);
+
+		return GSVector8i(_mm256_cvtepi8_epi64(src));
+	}
+
+	static __forceinline GSVector8i u8to64c(int i)
+	{
+		const __m128i src = _mm_cvtsi32_si128(i);
+
+		return GSVector8i(_mm256_cvtepu8_epi64(src));
+	}
+
+	static __forceinline GSVector8i i16to32c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepi16_epi32(src));
+	}
+
+	static __forceinline GSVector8i u16to32c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepu16_epi32(src));
+	}
+
+	static __forceinline GSVector8i i16to64c(const void* p)
+	{
+		const __m128i src = _mm_loadl_epi64((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepi16_epi64(src));
+	}
+
+	static __forceinline GSVector8i u16to64c(const void* p)
+	{
+		const __m128i src = _mm_loadl_epi64((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepu16_epi64(src));
+	}
+
+	static __forceinline GSVector8i i32to64c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepi32_epi64(src));
+	}
+
+	static __forceinline GSVector8i u32to64c(const void* p)
+	{
+		const __m128i src = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_cvtepu32_epi64(src));
+	}
+
+	//
+
+	// Shifts each 128-bit lane right by i BYTES, filling with zeros.
+	template<int i> __forceinline GSVector8i srl() const
+	{
+		const __m256i shifted = _mm256_srli_si256(m, i);
+
+		return GSVector8i(shifted);
+	}
+
+	// Per 128-bit lane: concatenates v (high) with this vector (low), shifts
+	// the pair right by i bytes and keeps the low 16 bytes.
+	// Declared const like every other shift so it also works on const vectors.
+	template<int i> __forceinline GSVector8i srl(const GSVector8i& v) const
+	{
+		return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
+	}
+
+	// Shifts each 128-bit lane left by i BYTES, filling with zeros.
+	template<int i> __forceinline GSVector8i sll() const
+	{
+		const __m256i shifted = _mm256_slli_si256(m, i);
+
+		return GSVector8i(shifted);
+	}
+
+	// Arithmetic (sign-filling) right shifts. The int and __m128i forms shift
+	// every element by the same count; the __m256i forms use only the low 128
+	// bits of the argument as the count. srav32 shifts each element by its own
+	// count.
+
+	__forceinline GSVector8i sra16(int i) const {return GSVector8i(_mm256_srai_epi16(m, i));}
+
+	__forceinline GSVector8i sra16(__m128i i) const {return GSVector8i(_mm256_sra_epi16(m, i));}
+
+	__forceinline GSVector8i sra16(__m256i i) const {return sra16(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i sra32(int i) const {return GSVector8i(_mm256_srai_epi32(m, i));}
+
+	__forceinline GSVector8i sra32(__m128i i) const {return GSVector8i(_mm256_sra_epi32(m, i));}
+
+	__forceinline GSVector8i sra32(__m256i i) const {return sra32(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i srav32(__m256i i) const {return GSVector8i(_mm256_srav_epi32(m, i));}
+
+	// Logical left shifts. The int and __m128i forms shift every element by
+	// the same count; the __m256i forms use only the low 128 bits of the
+	// argument as the count. The sllv* forms shift each element by its own count.
+
+	__forceinline GSVector8i sll16(int i) const {return GSVector8i(_mm256_slli_epi16(m, i));}
+
+	__forceinline GSVector8i sll16(__m128i i) const {return GSVector8i(_mm256_sll_epi16(m, i));}
+
+	__forceinline GSVector8i sll16(__m256i i) const {return sll16(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i sll32(int i) const {return GSVector8i(_mm256_slli_epi32(m, i));}
+
+	__forceinline GSVector8i sll32(__m128i i) const {return GSVector8i(_mm256_sll_epi32(m, i));}
+
+	__forceinline GSVector8i sll32(__m256i i) const {return sll32(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i sllv32(__m256i i) const {return GSVector8i(_mm256_sllv_epi32(m, i));}
+
+	__forceinline GSVector8i sll64(int i) const {return GSVector8i(_mm256_slli_epi64(m, i));}
+
+	__forceinline GSVector8i sll64(__m128i i) const {return GSVector8i(_mm256_sll_epi64(m, i));}
+
+	__forceinline GSVector8i sll64(__m256i i) const {return sll64(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i sllv64(__m256i i) const {return GSVector8i(_mm256_sllv_epi64(m, i));}
+
+	// Logical (zero-filling) right shifts. The int and __m128i forms shift
+	// every element by the same count; the __m256i forms use only the low 128
+	// bits of the argument as the count. The srlv* forms shift each element by
+	// its own count.
+
+	__forceinline GSVector8i srl16(int i) const {return GSVector8i(_mm256_srli_epi16(m, i));}
+
+	__forceinline GSVector8i srl16(__m128i i) const {return GSVector8i(_mm256_srl_epi16(m, i));}
+
+	__forceinline GSVector8i srl16(__m256i i) const {return srl16(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i srl32(int i) const {return GSVector8i(_mm256_srli_epi32(m, i));}
+
+	__forceinline GSVector8i srl32(__m128i i) const {return GSVector8i(_mm256_srl_epi32(m, i));}
+
+	__forceinline GSVector8i srl32(__m256i i) const {return srl32(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i srlv32(__m256i i) const {return GSVector8i(_mm256_srlv_epi32(m, i));}
+
+	__forceinline GSVector8i srl64(int i) const {return GSVector8i(_mm256_srli_epi64(m, i));}
+
+	__forceinline GSVector8i srl64(__m128i i) const {return GSVector8i(_mm256_srl_epi64(m, i));}
+
+	__forceinline GSVector8i srl64(__m256i i) const {return srl64(_mm256_castsi256_si128(i));}
+
+	__forceinline GSVector8i srlv64(__m256i i) const {return GSVector8i(_mm256_srlv_epi64(m, i));}
+
+	// Element-wise integer arithmetic; the number is the element width.
+
+	// wrapping add
+	__forceinline GSVector8i add8(const GSVector8i& v) const {return GSVector8i(_mm256_add_epi8(m, v.m));}
+
+	__forceinline GSVector8i add16(const GSVector8i& v) const {return GSVector8i(_mm256_add_epi16(m, v.m));}
+
+	__forceinline GSVector8i add32(const GSVector8i& v) const {return GSVector8i(_mm256_add_epi32(m, v.m));}
+
+	// signed saturating add
+	__forceinline GSVector8i adds8(const GSVector8i& v) const {return GSVector8i(_mm256_adds_epi8(m, v.m));}
+
+	__forceinline GSVector8i adds16(const GSVector8i& v) const {return GSVector8i(_mm256_adds_epi16(m, v.m));}
+
+	// unsigned saturating add
+	__forceinline GSVector8i addus8(const GSVector8i& v) const {return GSVector8i(_mm256_adds_epu8(m, v.m));}
+
+	__forceinline GSVector8i addus16(const GSVector8i& v) const {return GSVector8i(_mm256_adds_epu16(m, v.m));}
+
+	// wrapping subtract
+	__forceinline GSVector8i sub8(const GSVector8i& v) const {return GSVector8i(_mm256_sub_epi8(m, v.m));}
+
+	__forceinline GSVector8i sub16(const GSVector8i& v) const {return GSVector8i(_mm256_sub_epi16(m, v.m));}
+
+	__forceinline GSVector8i sub32(const GSVector8i& v) const {return GSVector8i(_mm256_sub_epi32(m, v.m));}
+
+	// signed saturating subtract
+	__forceinline GSVector8i subs8(const GSVector8i& v) const {return GSVector8i(_mm256_subs_epi8(m, v.m));}
+
+	__forceinline GSVector8i subs16(const GSVector8i& v) const {return GSVector8i(_mm256_subs_epi16(m, v.m));}
+
+	// unsigned saturating subtract
+	__forceinline GSVector8i subus8(const GSVector8i& v) const {return GSVector8i(_mm256_subs_epu8(m, v.m));}
+
+	__forceinline GSVector8i subus16(const GSVector8i& v) const {return GSVector8i(_mm256_subs_epu16(m, v.m));}
+
+	// unsigned rounding average
+	__forceinline GSVector8i avg8(const GSVector8i& v) const {return GSVector8i(_mm256_avg_epu8(m, v.m));}
+
+	__forceinline GSVector8i avg16(const GSVector8i& v) const {return GSVector8i(_mm256_avg_epu16(m, v.m));}
+
+	// 16-bit multiplies: high half signed / high half unsigned / low half /
+	// signed high half with rounding and scaling (pmulhrsw)
+	__forceinline GSVector8i mul16hs(const GSVector8i& v) const {return GSVector8i(_mm256_mulhi_epi16(m, v.m));}
+
+	__forceinline GSVector8i mul16hu(const GSVector8i& v) const {return GSVector8i(_mm256_mulhi_epu16(m, v.m));}
+
+	__forceinline GSVector8i mul16l(const GSVector8i& v) const {return GSVector8i(_mm256_mullo_epi16(m, v.m));}
+
+	__forceinline GSVector8i mul16hrs(const GSVector8i& v) const {return GSVector8i(_mm256_mulhrs_epi16(m, v.m));}
+
+	// Multiplies the signed 16-bit elements of this vector and v and adds each
+	// adjacent pair of 32-bit products, giving eight 32-bit sums.
+	// Marked __forceinline like every other wrapper in the class.
+	__forceinline GSVector8i madd(const GSVector8i& v) const
+	{
+		return GSVector8i(_mm256_madd_epi16(m, v.m));
+	}
+
+	// (a - this) * f << shift + this
+	template<int shift> __forceinline GSVector8i lerp16(const GSVector8i& a, const GSVector8i& f) const
+	{
+		const GSVector8i delta = a.sub16(*this);
+
+		return add16(delta.modulate16<shift>(f));
+	}
+
+	// (a - b) * c << shift
+	template<int shift> __forceinline static GSVector8i lerp16(const GSVector8i& a, const GSVector8i& b, const GSVector8i& c)
+	{
+		const GSVector8i delta = a.sub16(b);
+
+		return delta.modulate16<shift>(c);
+	}
+
+	// (a - b) * c << shift + d
+	template<int shift> __forceinline static GSVector8i lerp16(const GSVector8i& a, const GSVector8i& b, const GSVector8i& c, const GSVector8i& d)
+	{
+		const GSVector8i delta = a.sub16(b);
+		const GSVector8i scaled = delta.modulate16<shift>(c);
+
+		return d.add16(scaled);
+	}
+
+	// (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
+	__forceinline GSVector8i lerp16_4(const GSVector8i& a, const GSVector8i& f) const
+	{
+		const GSVector8i delta = a.sub16(*this);
+		const GSVector8i scaled = delta.mul16l(f).sra16(4);
+
+		return add16(scaled);
+	}
+
+	// a * f << shift
+	//
+	// shift == 0 uses the rounding high multiply directly; any other shift
+	// pre-shifts this vector left by (shift + 1) and takes the signed high
+	// half of the product.
+	template<int shift> __forceinline GSVector8i modulate16(const GSVector8i& f) const
+	{
+		return shift == 0 ? mul16hrs(f) : sll16(shift + 1).mul16hs(f);
+	}
+
+	// True when all 256 bits of this vector and v are identical.
+	__forceinline bool eq(const GSVector8i& v) const
+	{
+		// XOR leaves a set bit wherever the two operands differ
+		const GSVector8i diff = *this ^ v;
+
+		return _mm256_testz_si256(diff.m, diff.m) != 0;
+	}
+
+	// Element-wise comparisons returning masks (all bits set where the
+	// relation holds). gt*/lt* compare as SIGNED integers; lt* is gt* with the
+	// operands swapped; neq* is the bitwise complement of eq*.
+
+	__forceinline GSVector8i eq8(const GSVector8i& v) const {return GSVector8i(_mm256_cmpeq_epi8(m, v.m));}
+
+	__forceinline GSVector8i eq16(const GSVector8i& v) const {return GSVector8i(_mm256_cmpeq_epi16(m, v.m));}
+
+	__forceinline GSVector8i eq32(const GSVector8i& v) const {return GSVector8i(_mm256_cmpeq_epi32(m, v.m));}
+
+	__forceinline GSVector8i neq8(const GSVector8i& v) const {return ~eq8(v);}
+
+	__forceinline GSVector8i neq16(const GSVector8i& v) const {return ~eq16(v);}
+
+	__forceinline GSVector8i neq32(const GSVector8i& v) const {return ~eq32(v);}
+
+	__forceinline GSVector8i gt8(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi8(m, v.m));}
+
+	__forceinline GSVector8i gt16(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi16(m, v.m));}
+
+	__forceinline GSVector8i gt32(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi32(m, v.m));}
+
+	__forceinline GSVector8i lt8(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi8(v.m, m));}
+
+	__forceinline GSVector8i lt16(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi16(v.m, m));}
+
+	__forceinline GSVector8i lt32(const GSVector8i& v) const {return GSVector8i(_mm256_cmpgt_epi32(v.m, m));}
+
+	// this & ~v (note the operand order of the intrinsic is reversed)
+	__forceinline GSVector8i andnot(const GSVector8i& v) const {return GSVector8i(_mm256_andnot_si256(v.m, m));}
+
+	// Collects the top bit of each of the 32 bytes into a 32-bit integer.
+	__forceinline int mask() const
+	{
+		const int bits = _mm256_movemask_epi8(m);
+
+		return bits;
+	}
+
+	// True when the top bit of every byte is set.
+	__forceinline bool alltrue() const
+	{
+		const int every_byte = (int)0xffffffff;
+
+		return mask() == every_byte;
+	}
+
+	// True when all 256 bits are zero.
+	__forceinline bool allfalse() const
+	{
+		const int is_zero = _mm256_testz_si256(m, m);
+
+		return is_zero != 0;
+	}
+
+	// TODO: extract/insert
+
+	// Returns byte i (0..31): picks the 128-bit lane holding it (i / 16) and
+	// then the byte inside that lane (i & 15).
+	template<int i> __forceinline int extract8() const
+	{
+		ASSERT(i < 32);
+
+		GSVector4i lane = extract<i / 16>();
+
+		const int value = lane.extract8<i & 15>();
+
+		return value;
+	}
+
+	// Returns 16-bit element i (0..15): picks the 128-bit lane holding it
+	// (i / 8) and then the element inside that lane (i & 7).
+	template<int i> __forceinline int extract16() const
+	{
+		ASSERT(i < 16);
+
+		GSVector4i v = extract<i / 8>();
+
+		// a lane holds eight 16-bit elements, so the in-lane index is the low
+		// three bits of i (mask 7, not 8)
+		return v.extract16<i & 7>();
+	}
+
+	// Returns 32-bit element i (0..7): picks the 128-bit lane holding it
+	// (i / 4) and then the element inside that lane (i & 3). Element 0 of a
+	// lane is read with GSVector4i::store instead of an extract.
+	template<int i> __forceinline int extract32() const
+	{
+		ASSERT(i < 8);
+
+		GSVector4i lane = extract<i / 4>();
+
+		return (i & 3) == 0 ? GSVector4i::store(lane) : lane.extract32<i & 3>();
+	}
+
+	// Returns 128-bit lane i (0 = low, 1 = high) as a GSVector4i.
+	template<int i> __forceinline GSVector4i extract() const
+	{
+		ASSERT(i < 2);
+
+		if(i != 0)
+		{
+			return GSVector4i(_mm256_extracti128_si256(m, i));
+		}
+
+		// the low lane only needs a register cast, not an extract
+		return GSVector4i(_mm256_castsi256_si128(m));
+	}
+
+	// Returns a copy of this vector with 128-bit lane i (0 = low, 1 = high)
+	// replaced by m. This vector itself is not modified.
+	template<int i> __forceinline GSVector8i insert(__m128i m) const
+	{
+		ASSERT(i < 2);
+
+		const __m256i merged = _mm256_inserti128_si256(this->m, m, i);
+
+		return GSVector8i(merged);
+	}
+
+	// TODO: gather
+
+	// Generic (scalar) gather: treats the eight 32-bit elements of this vector
+	// as indices into ptr and returns the eight looked-up values, each cast
+	// to int.
+	template<class T> __forceinline GSVector8i gather32_32(const T* ptr) const
+	{
+		GSVector4i idx_lo = extract<0>();
+		GSVector4i idx_hi = extract<1>();
+
+		GSVector4i lo;
+		GSVector4i hi;
+
+		// low lane: elements 0..3
+		lo = GSVector4i::load((int)ptr[idx_lo.extract32<0>()]);
+		lo = lo.insert32<1>((int)ptr[idx_lo.extract32<1>()]);
+		lo = lo.insert32<2>((int)ptr[idx_lo.extract32<2>()]);
+		lo = lo.insert32<3>((int)ptr[idx_lo.extract32<3>()]);
+
+		// high lane: elements 4..7
+		hi = GSVector4i::load((int)ptr[idx_hi.extract32<0>()]);
+		hi = hi.insert32<1>((int)ptr[idx_hi.extract32<1>()]);
+		hi = hi.insert32<2>((int)ptr[idx_hi.extract32<2>()]);
+		hi = hi.insert32<3>((int)ptr[idx_hi.extract32<3>()]);
+
+		return cast(lo).insert<1>(hi);
+	}
+
+	// Hardware gathers. Each fetches a full 32-bit word at ptr + index * scale
+	// and, for the 8- and 16-bit tables, masks off the bytes that belong to
+	// neighbouring entries.
+	// NOTE(review): the 8/16-bit forms therefore read up to 3 (resp. 2) bytes
+	// beyond the addressed entry - the table presumably needs padding; confirm.
+
+	__forceinline GSVector8i gather32_32(const uint8* ptr) const
+	{
+		const GSVector8i words(_mm256_i32gather_epi32((const int*)ptr, m, 1));
+
+		return words & GSVector8i::x000000ff();
+	}
+
+	__forceinline GSVector8i gather32_32(const uint16* ptr) const
+	{
+		const GSVector8i words(_mm256_i32gather_epi32((const int*)ptr, m, 2));
+
+		return words & GSVector8i::x0000ffff();
+	}
+
+	__forceinline GSVector8i gather32_32(const uint32* ptr) const
+	{
+		const int* table = (const int*)ptr;
+
+		return GSVector8i(_mm256_i32gather_epi32(table, m, 4));
+	}
+
+	// Generic two-level gather: each 32-bit element of this vector indexes
+	// ptr1, and the value found there indexes ptr2. Returns the eight ptr2
+	// values, each cast to int.
+	template<class T1, class T2> __forceinline GSVector8i gather32_32(const T1* ptr1, const T2* ptr2) const
+	{
+		GSVector4i idx_lo = extract<0>();
+		GSVector4i idx_hi = extract<1>();
+
+		GSVector4i lo;
+		GSVector4i hi;
+
+		// low lane: elements 0..3
+		lo = GSVector4i::load((int)ptr2[ptr1[idx_lo.extract32<0>()]]);
+		lo = lo.insert32<1>((int)ptr2[ptr1[idx_lo.extract32<1>()]]);
+		lo = lo.insert32<2>((int)ptr2[ptr1[idx_lo.extract32<2>()]]);
+		lo = lo.insert32<3>((int)ptr2[ptr1[idx_lo.extract32<3>()]]);
+
+		// high lane: elements 4..7
+		hi = GSVector4i::load((int)ptr2[ptr1[idx_hi.extract32<0>()]]);
+		hi = hi.insert32<1>((int)ptr2[ptr1[idx_hi.extract32<1>()]]);
+		hi = hi.insert32<2>((int)ptr2[ptr1[idx_hi.extract32<2>()]]);
+		hi = hi.insert32<3>((int)ptr2[ptr1[idx_hi.extract32<3>()]]);
+
+		return cast(lo).insert<1>(hi);
+	}
+
+	// Two-level gathers done as two chained single-table gathers. The explicit
+	// template arguments select the generic template overload for each step.
+
+	__forceinline GSVector8i gather32_32(const uint8* ptr1, const uint32* ptr2) const
+	{
+		const GSVector8i first = gather32_32<uint8>(ptr1);
+
+		return first.gather32_32<uint32>(ptr2);
+	}
+
+	__forceinline GSVector8i gather32_32(const uint32* ptr1, const uint32* ptr2) const
+	{
+		const GSVector8i first = gather32_32<uint32>(ptr1);
+
+		return first.gather32_32<uint32>(ptr2);
+	}
+
+	// Gathers through ptr and writes the single resulting vector to dst[0].
+	template<class T> __forceinline void gather32_32(const T* RESTRICT ptr, GSVector8i* RESTRICT dst) const
+	{
+		*dst = gather32_32<>(ptr);
+	}
+
+	//
+
+	// Non-temporal 256-bit load from p.
+	__forceinline static GSVector8i loadnt(const void* p)
+	{
+		const __m256i value = _mm256_stream_load_si256((__m256i*)p);
+
+		return GSVector8i(value);
+	}
+
+	// Loads 128 bits from p (aligned load) into the low lane. The high lane
+	// comes from a plain register cast and is therefore unspecified.
+	__forceinline static GSVector8i loadl(const void* p)
+	{
+		const __m128i low = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_castsi128_si256(low));
+	}
+
+	// Loads 128 bits from p (aligned load) into the high lane; the low lane
+	// is zero.
+	// TODO: a permute2x128-based version may be faster.
+	__forceinline static GSVector8i loadh(const void* p)
+	{
+		const __m128i high = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_inserti128_si256(_mm256_setzero_si256(), high, 1));
+	}
+
+	// Loads 128 bits from p (aligned load) into the high lane and keeps the
+	// low lane of v.
+	__forceinline static GSVector8i loadh(const void* p, const GSVector8i& v)
+	{
+		const __m128i high = _mm_load_si128((__m128i*)p);
+
+		return GSVector8i(_mm256_inserti128_si256(v, high, 1));
+	}
+
+	// Builds a vector from two separate 128-bit blocks: pl fills the low
+	// lane, ph the high lane.
+	// TODO: a permute2x128-based merge of the two loads may be faster.
+	__forceinline static GSVector8i load(const void* pl, const void* ph)
+	{
+		const GSVector8i low = loadl(pl);
+
+		return loadh(ph, low);
+	}
+
+	// Builds a vector from four separate blocks: (pll, plh) are combined by
+	// GSVector4i::load into the low lane, (phl, phh) into the high lane.
+	__forceinline static GSVector8i load(const void* pll, const void* plh, const void* phl, const void* phh)
+	{
+		GSVector4i low = GSVector4i::load(pll, plh);
+		GSVector4i high = GSVector4i::load(phl, phh);
+
+		// alternative: GSVector8i(low).insert<1>(high)
+		GSVector8i low256 = cast(low);
+		GSVector8i high256 = cast(high);
+
+		return low256.ac(high256);
+	}
+
+	// Loads 256 bits from p. aligned == true uses _mm256_load_si256 (p must be
+	// 32-byte aligned); otherwise the unaligned load is used.
+	template<bool aligned> __forceinline static GSVector8i load(const void* p)
+	{
+		if(aligned)
+		{
+			return GSVector8i(_mm256_load_si256((__m256i*)p));
+		}
+
+		return GSVector8i(_mm256_loadu_si256((__m256i*)p));
+	}
+
+	// Loads i via GSVector4i::load and converts the result with cast().
+	__forceinline static GSVector8i load(int i)
+	{
+		GSVector4i scalar = GSVector4i::load(i);
+
+		return cast(scalar);
+	}
+
+	#ifdef _M_AMD64
+
+	// 64-bit counterpart of load(int): loads i via GSVector4i::loadq and
+	// converts the result with cast().
+	__forceinline static GSVector8i loadq(int64 i)
+	{
+		GSVector4i scalar = GSVector4i::loadq(i);
+
+		return cast(scalar);
+	}
+
+	#endif
+
+	// Writes all 32 bytes of v to p with a streaming (non-temporal) store.
+	__forceinline static void storent(void* p, const GSVector8i& v)
+	{
+		__m256i* dst = (__m256i*)p;
+
+		_mm256_stream_si256(dst, v.m);
+	}
+
+	// Writes the lower 128-bit lane of v to p (aligned 16-byte store).
+	__forceinline static void storel(void* p, const GSVector8i& v)
+	{
+		const __m128i lo = _mm256_extracti128_si256(v.m, 0);
+
+		_mm_store_si128((__m128i*)p, lo);
+	}
+
+	// Writes the upper 128-bit lane of v to p (aligned 16-byte store).
+	__forceinline static void storeh(void* p, const GSVector8i& v)
+	{
+		const __m128i hi = _mm256_extracti128_si256(v.m, 1);
+
+		_mm_store_si128((__m128i*)p, hi);
+	}
+
+	// Splits v over two destinations: lower lane to pl, then upper lane to ph.
+	__forceinline static void store(void* pl, void* ph, const GSVector8i& v)
+	{
+		storel(pl, v);
+		storeh(ph, v);
+	}
+
+	// Stores all 32 bytes of v to p; the template flag picks the aligned or the unaligned store.
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8i& v)
+	{
+		__m256i* dst = (__m256i*)p;
+
+		if(!aligned)
+		{
+			_mm256_storeu_si256(dst, v.m);
+
+			return;
+		}
+
+		_mm256_store_si256(dst, v.m);
+	}
+
+	// Narrows v with GSVector4i::cast() and returns what GSVector4i::store() yields for it.
+	__forceinline static int store(const GSVector8i& v)
+	{
+		const GSVector4i lo = GSVector4i::cast(v);
+
+		return GSVector4i::store(lo);
+	}
+
+	#ifdef _M_AMD64
+
+	// 64-bit counterpart of store(v): narrows with GSVector4i::cast() and forwards to GSVector4i::storeq().
+	__forceinline static int64 storeq(const GSVector8i& v)
+	{
+		const GSVector4i lo = GSVector4i::cast(v);
+
+		return GSVector4i::storeq(lo);
+	}
+
+	#endif
+
+	// Copies size bytes from src to dst. Every full 128-byte chunk is written with four
+	// streaming 32-byte stores; the remaining (size & 127) bytes, if any, go through memcpy.
+	// NOTE(review): src is read as GSVector8i and dst is written with streaming stores,
+	// so both presumably have to be 32-byte aligned - confirm with the callers.
+	__forceinline static void storent(void* RESTRICT dst, const void* RESTRICT src, size_t size)
+	{
+		if(size == 0) return;
+
+		const GSVector8i* from = (const GSVector8i*)src;
+		GSVector8i* to = (GSVector8i*)dst;
+
+		// four vectors = 128 bytes per iteration
+
+		for(size_t chunks = size >> 7; chunks != 0; chunks--)
+		{
+			storent(&to[0], from[0]);
+			storent(&to[1], from[1]);
+			storent(&to[2], from[2]);
+			storent(&to[3], from[3]);
+
+			from += 4;
+			to += 4;
+		}
+
+		const size_t tail = size & 127;
+
+		if(tail != 0)
+		{
+			memcpy(to, from, tail);
+		}
+	}
+
+	// TODO: swizzling
+
+	// Two-register swizzles. Each one replaces a with the "low" combination and b with
+	// the "high" combination of the values a and b held on entry (upl*/uph* for the
+	// 8..64-bit variants, ac()/bd() lane selection for the 128-bit one).
+
+	__forceinline static void sw8(GSVector8i& a, GSVector8i& b)
+	{
+		const GSVector8i lo = a.upl8(b);
+		const GSVector8i hi = a.uph8(b);
+
+		a = lo;
+		b = hi;
+	}
+
+	__forceinline static void sw16(GSVector8i& a, GSVector8i& b)
+	{
+		const GSVector8i lo = a.upl16(b);
+		const GSVector8i hi = a.uph16(b);
+
+		a = lo;
+		b = hi;
+	}
+
+	__forceinline static void sw32(GSVector8i& a, GSVector8i& b)
+	{
+		const GSVector8i lo = a.upl32(b);
+		const GSVector8i hi = a.uph32(b);
+
+		a = lo;
+		b = hi;
+	}
+
+	__forceinline static void sw64(GSVector8i& a, GSVector8i& b)
+	{
+		const GSVector8i lo = a.upl64(b);
+		const GSVector8i hi = a.uph64(b);
+
+		a = lo;
+		b = hi;
+	}
+
+	__forceinline static void sw128(GSVector8i& a, GSVector8i& b)
+	{
+		// ac(): lower lanes of a and b, bd(): upper lanes of a and b
+
+		const GSVector8i lo = a.ac(b);
+		const GSVector8i hi = a.bd(b);
+
+		a = lo;
+		b = hi;
+	}
+
+	// Nibble-level swizzle of four registers. The pairs (a, b) and (c, d) are first
+	// merged nibble-wise using a 0x0f byte mask, then the merged values are byte-interleaved.
+	// NOTE(review): blend() is defined elsewhere; presumably it takes the bits of its
+	// first argument where the mask is set - confirm.
+	__forceinline static void sw4(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i nibbles(_mm256_set1_epi32(0x0f0f0f0f));
+
+		// all four intermediates are built from the incoming values before anything is overwritten
+
+		GSVector8i ab0 = (b << 4).blend(a, nibbles);
+		GSVector8i ab1 = b.blend(a >> 4, nibbles);
+		GSVector8i cd0 = (d << 4).blend(c, nibbles);
+		GSVector8i cd1 = d.blend(c >> 4, nibbles);
+
+		a = ab0.upl8(ab1);
+		c = ab0.uph8(ab1);
+		b = cd0.upl8(cd1);
+		d = cd0.uph8(cd1);
+	}
+
+	// Four-register swizzles. On return:
+	//   a = low(a, b), c = high(a, b), b = low(c, d), d = high(c, d)
+	// where low/high are upl*/uph* (or ac()/bd() for the 128-bit variant) applied to the
+	// values held on entry. Only a and c are copied up front: b and d are each read
+	// before the statement that overwrites them, so the statement order must be kept.
+
+	__forceinline static void sw8(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl8(b);
+		c = e.uph8(b);
+		b = f.upl8(d);
+		d = f.uph8(d);
+	}
+
+	// 16-bit element variant of the four-register swizzle.
+	__forceinline static void sw16(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl16(b);
+		c = e.uph16(b);
+		b = f.upl16(d);
+		d = f.uph16(d);
+	}
+
+	// 32-bit element variant of the four-register swizzle.
+	__forceinline static void sw32(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl32(b);
+		c = e.uph32(b);
+		b = f.upl32(d);
+		d = f.uph32(d);
+	}
+
+	// 64-bit element variant of the four-register swizzle.
+	__forceinline static void sw64(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.upl64(b);
+		c = e.uph64(b);
+		b = f.upl64(d);
+		d = f.uph64(d);
+	}
+
+	// 128-bit lane variant: ac() picks the lower lanes, bd() the upper lanes of the pair.
+	__forceinline static void sw128(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
+	{
+		GSVector8i e = a;
+		GSVector8i f = c;
+
+		a = e.ac(b);
+		c = e.bd(b);
+		b = f.ac(d);
+		d = f.bd(d);
+	}
+
+	// In-place operators. Arithmetic and shifts act on the eight 32-bit lanes,
+	// the logical operators on all 256 bits.
+
+	__forceinline void operator += (const GSVector8i& v)
+	{
+		this->m = _mm256_add_epi32(this->m, v.m);
+	}
+
+	__forceinline void operator -= (const GSVector8i& v)
+	{
+		this->m = _mm256_sub_epi32(this->m, v.m);
+	}
+
+	// Adds the same scalar to every lane.
+	__forceinline void operator += (int i)
+	{
+		const GSVector8i splat(i);
+
+		*this += splat;
+	}
+
+	// Subtracts the same scalar from every lane.
+	__forceinline void operator -= (int i)
+	{
+		const GSVector8i splat(i);
+
+		*this -= splat;
+	}
+
+	__forceinline void operator <<= (const int i)
+	{
+		this->m = _mm256_slli_epi32(this->m, i);
+	}
+
+	// Logical (zero-filling) right shift.
+	__forceinline void operator >>= (const int i)
+	{
+		this->m = _mm256_srli_epi32(this->m, i);
+	}
+
+	__forceinline void operator &= (const GSVector8i& v)
+	{
+		this->m = _mm256_and_si256(this->m, v.m);
+	}
+
+	__forceinline void operator |= (const GSVector8i& v)
+	{
+		this->m = _mm256_or_si256(this->m, v.m);
+	}
+
+	__forceinline void operator ^= (const GSVector8i& v)
+	{
+		this->m = _mm256_xor_si256(this->m, v.m);
+	}
+
+	// Value-returning operators. Arithmetic and shifts act on the eight 32-bit lanes,
+	// the logical operators on all 256 bits; the int overloads splat the scalar first.
+
+	__forceinline friend GSVector8i operator + (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i sum = _mm256_add_epi32(v1.m, v2.m);
+
+		return GSVector8i(sum);
+	}
+
+	__forceinline friend GSVector8i operator - (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i diff = _mm256_sub_epi32(v1.m, v2.m);
+
+		return GSVector8i(diff);
+	}
+
+	__forceinline friend GSVector8i operator + (const GSVector8i& v, int i)
+	{
+		const GSVector8i splat(i);
+
+		return v + splat;
+	}
+
+	__forceinline friend GSVector8i operator - (const GSVector8i& v, int i)
+	{
+		const GSVector8i splat(i);
+
+		return v - splat;
+	}
+
+	__forceinline friend GSVector8i operator << (const GSVector8i& v, const int i)
+	{
+		const __m256i shifted = _mm256_slli_epi32(v.m, i);
+
+		return GSVector8i(shifted);
+	}
+
+	// Logical (zero-filling) right shift.
+	__forceinline friend GSVector8i operator >> (const GSVector8i& v, const int i)
+	{
+		const __m256i shifted = _mm256_srli_epi32(v.m, i);
+
+		return GSVector8i(shifted);
+	}
+
+	__forceinline friend GSVector8i operator & (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i bits = _mm256_and_si256(v1.m, v2.m);
+
+		return GSVector8i(bits);
+	}
+
+	__forceinline friend GSVector8i operator | (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i bits = _mm256_or_si256(v1.m, v2.m);
+
+		return GSVector8i(bits);
+	}
+
+	__forceinline friend GSVector8i operator ^ (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i bits = _mm256_xor_si256(v1.m, v2.m);
+
+		return GSVector8i(bits);
+	}
+
+	__forceinline friend GSVector8i operator & (const GSVector8i& v, int i)
+	{
+		const GSVector8i splat(i);
+
+		return v & splat;
+	}
+
+	__forceinline friend GSVector8i operator | (const GSVector8i& v, int i)
+	{
+		const GSVector8i splat(i);
+
+		return v | splat;
+	}
+
+	__forceinline friend GSVector8i operator ^ (const GSVector8i& v, int i)
+	{
+		const GSVector8i splat(i);
+
+		return v ^ splat;
+	}
+
+	// Bitwise NOT: v == v is all-ones in every lane, xor with it flips every bit.
+	__forceinline friend GSVector8i operator ~ (const GSVector8i& v)
+	{
+		const GSVector8i ones(v == v);
+
+		return v ^ ones;
+	}
+
+	// Lane-wise comparisons of the eight 32-bit lanes (signed for the ordering ones).
+	// The result is a mask vector: all-ones in lanes where the relation holds, zero elsewhere.
+
+	__forceinline friend GSVector8i operator == (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i mask = _mm256_cmpeq_epi32(v1.m, v2.m);
+
+		return GSVector8i(mask);
+	}
+
+	__forceinline friend GSVector8i operator != (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const GSVector8i eq(v1 == v2);
+
+		return ~eq;
+	}
+
+	__forceinline friend GSVector8i operator > (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i mask = _mm256_cmpgt_epi32(v1.m, v2.m);
+
+		return GSVector8i(mask);
+	}
+
+	// v1 < v2 is evaluated as v2 > v1.
+	__forceinline friend GSVector8i operator < (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const __m256i mask = _mm256_cmpgt_epi32(v2.m, v1.m);
+
+		return GSVector8i(mask);
+	}
+
+	__forceinline friend GSVector8i operator >= (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const GSVector8i gt(v1 > v2);
+		const GSVector8i eq(v1 == v2);
+
+		return gt | eq;
+	}
+
+	__forceinline friend GSVector8i operator <= (const GSVector8i& v1, const GSVector8i& v2)
+	{
+		const GSVector8i lt(v1 < v2);
+		const GSVector8i eq(v1 == v2);
+
+		return lt | eq;
+	}
+
+	// x = v[31:0] / v[159:128]
+	// y = v[63:32] / v[191:160]
+	// z = v[95:64] / v[223:192]
+	// w = v[127:96] / v[255:224]
+
+	// Generates the swizzle accessors for one four-letter name, e.g. xyzw():
+	//   name()   - 32-bit shuffle (_mm256_shuffle_epi32)
+	//   namel()  - 16-bit shuffle via _mm256_shufflelo_epi16
+	//   nameh()  - 16-bit shuffle via _mm256_shufflehi_epi16
+	//   namelh() - shufflelo followed by shufflehi, both with the same selector
+	// xs..ws are the letters pasted into the method name, xn..wn the matching element
+	// indices that form the _MM_SHUFFLE immediate.
+	#define VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8i xs##ys##zs##ws() const {return GSVector8i(_mm256_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8i xs##ys##zs##ws##l() const {return GSVector8i(_mm256_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8i xs##ys##zs##ws##h() const {return GSVector8i(_mm256_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8i xs##ys##zs##ws##lh() const {return GSVector8i(_mm256_shufflehi_epi16(_mm256_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+	// Fixes the first three letters and expands the fourth over x, y, z, w.
+	#define VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR8i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	// Fixes the first two letters and expands the third over x, y, z, w.
+	#define VECTOR8i_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR8i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	// Fixes the first letter and expands the second over x, y, z, w.
+	#define VECTOR8i_SHUFFLE_1(xs, xn) \
+		VECTOR8i_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR8i_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR8i_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR8i_SHUFFLE_2(xs, xn, w, 3) \
+
+	// Instantiate all 4^4 letter combinations (xxxx ... wwww), four methods each.
+	VECTOR8i_SHUFFLE_1(x, 0)
+	VECTOR8i_SHUFFLE_1(y, 1)
+	VECTOR8i_SHUFFLE_1(z, 2)
+	VECTOR8i_SHUFFLE_1(w, 3)
+
+	// a = v0[127:0]
+	// b = v0[255:128]
+	// c = v1[127:0]
+	// d = v1[255:128]
+	// _ = 0
+
+	// Generates one two-letter 128-bit lane permute, in two overloads:
+	//   name()  - both sources are this vector
+	//   name(v) - first source is this vector, second source is v
+	// The first letter (an) chooses the result's lower lane, the second (bn, shifted
+	// into the high nibble of the immediate) its upper lane. Selector value 8 ('_')
+	// sets the zeroing bit for that lane.
+	#define VECTOR8i_PERMUTE128_2(as, an, bs, bn) \
+		__forceinline GSVector8i as##bs() const {return GSVector8i(_mm256_permute2x128_si256(m, m, an | (bn << 4)));} \
+		__forceinline GSVector8i as##bs(const GSVector8i& v) const {return GSVector8i(_mm256_permute2x128_si256(m, v.m, an | (bn << 4)));} \
+
+	// Fixes the first letter and expands the second over a, b, c, d and '_'.
+	#define VECTOR8i_PERMUTE128_1(as, an) \
+		VECTOR8i_PERMUTE128_2(as, an, a, 0) \
+		VECTOR8i_PERMUTE128_2(as, an, b, 1) \
+		VECTOR8i_PERMUTE128_2(as, an, c, 2) \
+		VECTOR8i_PERMUTE128_2(as, an, d, 3) \
+		VECTOR8i_PERMUTE128_2(as, an, _, 8) \
+
+	// Instantiate all 5 x 5 combinations (aa ... dd, plus the names containing '_').
+	VECTOR8i_PERMUTE128_1(a, 0)
+	VECTOR8i_PERMUTE128_1(b, 1)
+	VECTOR8i_PERMUTE128_1(c, 2)
+	VECTOR8i_PERMUTE128_1(d, 3)
+	VECTOR8i_PERMUTE128_1(_, 8)
+
+	// a = v[63:0]
+	// b = v[127:64]
+	// c = v[191:128]
+	// d = v[255:192]
+
+	// Generates one four-letter 64-bit permute accessor, e.g. abcd(): each letter names
+	// the source quadword (a..d = index 0..3) placed at that position of the result.
+	#define VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \
+		__forceinline GSVector8i as##bs##cs##ds() const {return GSVector8i(_mm256_permute4x64_epi64(m, _MM_SHUFFLE(dn, cn, bn, an)));} \
+
+	// Fixes the first three letters and expands the fourth over a, b, c, d.
+	#define VECTOR8i_PERMUTE64_3(as, an, bs, bn, cs, cn) \
+		VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \
+		VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \
+		VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \
+		VECTOR8i_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \
+
+	// Fixes the first two letters and expands the third over a, b, c, d.
+	#define VECTOR8i_PERMUTE64_2(as, an, bs, bn) \
+		VECTOR8i_PERMUTE64_3(as, an, bs, bn, a, 0) \
+		VECTOR8i_PERMUTE64_3(as, an, bs, bn, b, 1) \
+		VECTOR8i_PERMUTE64_3(as, an, bs, bn, c, 2) \
+		VECTOR8i_PERMUTE64_3(as, an, bs, bn, d, 3) \
+
+	// Fixes the first letter and expands the second over a, b, c, d.
+	#define VECTOR8i_PERMUTE64_1(as, an) \
+		VECTOR8i_PERMUTE64_2(as, an, a, 0) \
+		VECTOR8i_PERMUTE64_2(as, an, b, 1) \
+		VECTOR8i_PERMUTE64_2(as, an, c, 2) \
+		VECTOR8i_PERMUTE64_2(as, an, d, 3) \
+
+	// Instantiate all 4^4 combinations (aaaa ... dddd).
+	VECTOR8i_PERMUTE64_1(a, 0)
+	VECTOR8i_PERMUTE64_1(b, 1)
+	VECTOR8i_PERMUTE64_1(c, 2)
+	VECTOR8i_PERMUTE64_1(d, 3)
+
+	// Rearranges the eight 32-bit elements across the whole register; each lane of mask
+	// holds the index of the source element for that position.
+	__forceinline GSVector8i permute32(const GSVector8i& mask) const
+	{
+		const __m256i shuffled = _mm256_permutevar8x32_epi32(m, mask.m);
+
+		return GSVector8i(shuffled);
+	}
+
+	// Replicate the lowest 8/16/32/64-bit element of this vector into every element of the result.
+
+	__forceinline GSVector8i broadcast8() const
+	{
+		const __m128i lo = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_broadcastb_epi8(lo));
+	}
+
+	__forceinline GSVector8i broadcast16() const
+	{
+		const __m128i lo = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_broadcastw_epi16(lo));
+	}
+
+	__forceinline GSVector8i broadcast32() const
+	{
+		const __m128i lo = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_broadcastd_epi32(lo));
+	}
+
+	__forceinline GSVector8i broadcast64() const
+	{
+		const __m128i lo = _mm256_castsi256_si128(m);
+
+		return GSVector8i(_mm256_broadcastq_epi64(lo));
+	}
+
+	// Replicate the lowest 8/16/32/64-bit element of a 128-bit vector into every element of the result.
+
+	__forceinline static GSVector8i broadcast8(const GSVector4i& v)
+	{
+		const __m256i splat = _mm256_broadcastb_epi8(v.m);
+
+		return GSVector8i(splat);
+	}
+
+	__forceinline static GSVector8i broadcast16(const GSVector4i& v)
+	{
+		const __m256i splat = _mm256_broadcastw_epi16(v.m);
+
+		return GSVector8i(splat);
+	}
+
+	__forceinline static GSVector8i broadcast32(const GSVector4i& v)
+	{
+		const __m256i splat = _mm256_broadcastd_epi32(v.m);
+
+		return GSVector8i(splat);
+	}
+
+	__forceinline static GSVector8i broadcast64(const GSVector4i& v)
+	{
+		const __m256i splat = _mm256_broadcastq_epi64(v.m);
+
+		return GSVector8i(splat);
+	}
+
+	// Copies the 128-bit vector v into both lanes of the result.
+	__forceinline static GSVector8i broadcast128(const GSVector4i& v)
+	{
+		// The instruction only accepts a memory source operand. A compiler that is not
+		// smart enough to use the address of v directly (<= vs2012u3rc2) spills it to a
+		// temporary on the stack first.
+
+		const __m256i both = _mm256_broadcastsi128_si256(v.m); // fastest
+
+		return GSVector8i(both);
+
+		// Alternatives that were measured:
+		//return GSVector8i(v); // almost as fast as broadcast
+		//return cast(v).insert<1>(v); // slow
+		//return cast(v).aa(); // slowest
+	}
+
+	// Replicates the byte stored at p into all 32 byte elements of the result.
+	// Exactly one byte is read from p. Reading a whole int here would touch three
+	// bytes beyond the source element (and could run off the end of a buffer) although
+	// only the lowest byte ever takes part in the broadcast.
+	__forceinline static GSVector8i broadcast8(const void* p)
+	{
+		const int value = *(const unsigned char*)p;
+
+		return GSVector8i(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(value)));
+	}
+
+	// Replicates the 16-bit value stored at p into all 16 word elements of the result.
+	// Exactly two bytes are read from p. Reading a whole int here would touch two bytes
+	// beyond the source element although only the lowest word takes part in the
+	// broadcast. memcpy keeps the access valid for unaligned p.
+	__forceinline static GSVector8i broadcast16(const void* p)
+	{
+		unsigned short value;
+
+		memcpy(&value, p, sizeof(value));
+
+		return GSVector8i(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(value)));
+	}
+
+	// Replicates the 32-bit value stored at p into all eight dword elements.
+	__forceinline static GSVector8i broadcast32(const void* p)
+	{
+		const __m128i src = _mm_cvtsi32_si128(*(const int*)p);
+
+		return GSVector8i(_mm256_broadcastd_epi32(src));
+	}
+
+	// Replicates the 64-bit value stored at p into all four qword elements.
+	__forceinline static GSVector8i broadcast64(const void* p)
+	{
+		const __m128i src = _mm_loadl_epi64((const __m128i*)p);
+
+		return GSVector8i(_mm256_broadcastq_epi64(src));
+	}
+
+	// Copies the 16 bytes stored at p into both 128-bit lanes.
+	__forceinline static GSVector8i broadcast128(const void* p)
+	{
+		const __m128i* src = (const __m128i*)p;
+
+		return GSVector8i(_mm256_broadcastsi128_si256(*src));
+	}
+
+	// All bits clear.
+	__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
+
+	// All bits set: comparing a vector with itself for equality is true in every lane.
+	__forceinline static GSVector8i xffffffff() {return zero() == zero();}
+
+	// Constant generators for 32-bit lane masks. Each one starts from all-ones and
+	// shifts it with srl32()/sll32() (defined elsewhere in the class), so no constant
+	// has to be loaded from memory. The function name spells the value intended for
+	// every 32-bit lane: the srl32 family yields low-bit masks, the sll32 family
+	// high-bit masks.
+
+	__forceinline static GSVector8i x00000001() {return xffffffff().srl32(31);}
+	__forceinline static GSVector8i x00000003() {return xffffffff().srl32(30);}
+	__forceinline static GSVector8i x00000007() {return xffffffff().srl32(29);}
+	__forceinline static GSVector8i x0000000f() {return xffffffff().srl32(28);}
+	__forceinline static GSVector8i x0000001f() {return xffffffff().srl32(27);}
+	__forceinline static GSVector8i x0000003f() {return xffffffff().srl32(26);}
+	__forceinline static GSVector8i x0000007f() {return xffffffff().srl32(25);}
+	__forceinline static GSVector8i x000000ff() {return xffffffff().srl32(24);}
+	__forceinline static GSVector8i x000001ff() {return xffffffff().srl32(23);}
+	__forceinline static GSVector8i x000003ff() {return xffffffff().srl32(22);}
+	__forceinline static GSVector8i x000007ff() {return xffffffff().srl32(21);}
+	__forceinline static GSVector8i x00000fff() {return xffffffff().srl32(20);}
+	__forceinline static GSVector8i x00001fff() {return xffffffff().srl32(19);}
+	__forceinline static GSVector8i x00003fff() {return xffffffff().srl32(18);}
+	__forceinline static GSVector8i x00007fff() {return xffffffff().srl32(17);}
+	__forceinline static GSVector8i x0000ffff() {return xffffffff().srl32(16);}
+	__forceinline static GSVector8i x0001ffff() {return xffffffff().srl32(15);}
+	__forceinline static GSVector8i x0003ffff() {return xffffffff().srl32(14);}
+	__forceinline static GSVector8i x0007ffff() {return xffffffff().srl32(13);}
+	__forceinline static GSVector8i x000fffff() {return xffffffff().srl32(12);}
+	__forceinline static GSVector8i x001fffff() {return xffffffff().srl32(11);}
+	__forceinline static GSVector8i x003fffff() {return xffffffff().srl32(10);}
+	__forceinline static GSVector8i x007fffff() {return xffffffff().srl32( 9);}
+	__forceinline static GSVector8i x00ffffff() {return xffffffff().srl32( 8);}
+	__forceinline static GSVector8i x01ffffff() {return xffffffff().srl32( 7);}
+	__forceinline static GSVector8i x03ffffff() {return xffffffff().srl32( 6);}
+	__forceinline static GSVector8i x07ffffff() {return xffffffff().srl32( 5);}
+	__forceinline static GSVector8i x0fffffff() {return xffffffff().srl32( 4);}
+	__forceinline static GSVector8i x1fffffff() {return xffffffff().srl32( 3);}
+	__forceinline static GSVector8i x3fffffff() {return xffffffff().srl32( 2);}
+	__forceinline static GSVector8i x7fffffff() {return xffffffff().srl32( 1);}
+
+	__forceinline static GSVector8i x80000000() {return xffffffff().sll32(31);}
+	__forceinline static GSVector8i xc0000000() {return xffffffff().sll32(30);}
+	__forceinline static GSVector8i xe0000000() {return xffffffff().sll32(29);}
+	__forceinline static GSVector8i xf0000000() {return xffffffff().sll32(28);}
+	__forceinline static GSVector8i xf8000000() {return xffffffff().sll32(27);}
+	__forceinline static GSVector8i xfc000000() {return xffffffff().sll32(26);}
+	__forceinline static GSVector8i xfe000000() {return xffffffff().sll32(25);}
+	__forceinline static GSVector8i xff000000() {return xffffffff().sll32(24);}
+	__forceinline static GSVector8i xff800000() {return xffffffff().sll32(23);}
+	__forceinline static GSVector8i xffc00000() {return xffffffff().sll32(22);}
+	__forceinline static GSVector8i xffe00000() {return xffffffff().sll32(21);}
+	__forceinline static GSVector8i xfff00000() {return xffffffff().sll32(20);}
+	__forceinline static GSVector8i xfff80000() {return xffffffff().sll32(19);}
+	__forceinline static GSVector8i xfffc0000() {return xffffffff().sll32(18);}
+	__forceinline static GSVector8i xfffe0000() {return xffffffff().sll32(17);}
+	__forceinline static GSVector8i xffff0000() {return xffffffff().sll32(16);}
+	__forceinline static GSVector8i xffff8000() {return xffffffff().sll32(15);}
+	__forceinline static GSVector8i xffffc000() {return xffffffff().sll32(14);}
+	__forceinline static GSVector8i xffffe000() {return xffffffff().sll32(13);}
+	__forceinline static GSVector8i xfffff000() {return xffffffff().sll32(12);}
+	__forceinline static GSVector8i xfffff800() {return xffffffff().sll32(11);}
+	__forceinline static GSVector8i xfffffc00() {return xffffffff().sll32(10);}
+	__forceinline static GSVector8i xfffffe00() {return xffffffff().sll32( 9);}
+	__forceinline static GSVector8i xffffff00() {return xffffffff().sll32( 8);}
+	__forceinline static GSVector8i xffffff80() {return xffffffff().sll32( 7);}
+	__forceinline static GSVector8i xffffffc0() {return xffffffff().sll32( 6);}
+	__forceinline static GSVector8i xffffffe0() {return xffffffff().sll32( 5);}
+	__forceinline static GSVector8i xfffffff0() {return xffffffff().sll32( 4);}
+	__forceinline static GSVector8i xfffffff8() {return xffffffff().sll32( 3);}
+	__forceinline static GSVector8i xfffffffc() {return xffffffff().sll32( 2);}
+	__forceinline static GSVector8i xfffffffe() {return xffffffff().sll32( 1);}
+
+	// Constant generators for 16-bit lane masks: all-ones shifted with srl16()/sll16()
+	// (defined elsewhere in the class). The function name spells the value intended
+	// for every 16-bit lane.
+
+	__forceinline static GSVector8i x0001() {return xffffffff().srl16(15);}
+	__forceinline static GSVector8i x0003() {return xffffffff().srl16(14);}
+	__forceinline static GSVector8i x0007() {return xffffffff().srl16(13);}
+	__forceinline static GSVector8i x000f() {return xffffffff().srl16(12);}
+	__forceinline static GSVector8i x001f() {return xffffffff().srl16(11);}
+	__forceinline static GSVector8i x003f() {return xffffffff().srl16(10);}
+	__forceinline static GSVector8i x007f() {return xffffffff().srl16( 9);}
+	__forceinline static GSVector8i x00ff() {return xffffffff().srl16( 8);}
+	__forceinline static GSVector8i x01ff() {return xffffffff().srl16( 7);}
+	__forceinline static GSVector8i x03ff() {return xffffffff().srl16( 6);}
+	__forceinline static GSVector8i x07ff() {return xffffffff().srl16( 5);}
+	__forceinline static GSVector8i x0fff() {return xffffffff().srl16( 4);}
+	__forceinline static GSVector8i x1fff() {return xffffffff().srl16( 3);}
+	__forceinline static GSVector8i x3fff() {return xffffffff().srl16( 2);}
+	__forceinline static GSVector8i x7fff() {return xffffffff().srl16( 1);}
+
+	__forceinline static GSVector8i x8000() {return xffffffff().sll16(15);}
+	__forceinline static GSVector8i xc000() {return xffffffff().sll16(14);}
+	__forceinline static GSVector8i xe000() {return xffffffff().sll16(13);}
+	__forceinline static GSVector8i xf000() {return xffffffff().sll16(12);}
+	__forceinline static GSVector8i xf800() {return xffffffff().sll16(11);}
+	__forceinline static GSVector8i xfc00() {return xffffffff().sll16(10);}
+	__forceinline static GSVector8i xfe00() {return xffffffff().sll16( 9);}
+	__forceinline static GSVector8i xff00() {return xffffffff().sll16( 8);}
+	__forceinline static GSVector8i xff80() {return xffffffff().sll16( 7);}
+	__forceinline static GSVector8i xffc0() {return xffffffff().sll16( 6);}
+	__forceinline static GSVector8i xffe0() {return xffffffff().sll16( 5);}
+	__forceinline static GSVector8i xfff0() {return xffffffff().sll16( 4);}
+	__forceinline static GSVector8i xfff8() {return xffffffff().sll16( 3);}
+	__forceinline static GSVector8i xfffc() {return xffffffff().sll16( 2);}
+	__forceinline static GSVector8i xfffe() {return xffffffff().sll16( 1);}
+
+	// Same 32-bit lane mask generators as the argument-less family, but the all-ones
+	// seed is produced by comparing the caller-supplied vector v with itself instead
+	// of a freshly zeroed register. The contents of v do not affect the result.
+
+	__forceinline static GSVector8i xffffffff(const GSVector8i& v) {return v == v;}
+
+	__forceinline static GSVector8i x00000001(const GSVector8i& v) {return xffffffff(v).srl32(31);}
+	__forceinline static GSVector8i x00000003(const GSVector8i& v) {return xffffffff(v).srl32(30);}
+	__forceinline static GSVector8i x00000007(const GSVector8i& v) {return xffffffff(v).srl32(29);}
+	__forceinline static GSVector8i x0000000f(const GSVector8i& v) {return xffffffff(v).srl32(28);}
+	__forceinline static GSVector8i x0000001f(const GSVector8i& v) {return xffffffff(v).srl32(27);}
+	__forceinline static GSVector8i x0000003f(const GSVector8i& v) {return xffffffff(v).srl32(26);}
+	__forceinline static GSVector8i x0000007f(const GSVector8i& v) {return xffffffff(v).srl32(25);}
+	__forceinline static GSVector8i x000000ff(const GSVector8i& v) {return xffffffff(v).srl32(24);}
+	__forceinline static GSVector8i x000001ff(const GSVector8i& v) {return xffffffff(v).srl32(23);}
+	__forceinline static GSVector8i x000003ff(const GSVector8i& v) {return xffffffff(v).srl32(22);}
+	__forceinline static GSVector8i x000007ff(const GSVector8i& v) {return xffffffff(v).srl32(21);}
+	__forceinline static GSVector8i x00000fff(const GSVector8i& v) {return xffffffff(v).srl32(20);}
+	__forceinline static GSVector8i x00001fff(const GSVector8i& v) {return xffffffff(v).srl32(19);}
+	__forceinline static GSVector8i x00003fff(const GSVector8i& v) {return xffffffff(v).srl32(18);}
+	__forceinline static GSVector8i x00007fff(const GSVector8i& v) {return xffffffff(v).srl32(17);}
+	__forceinline static GSVector8i x0000ffff(const GSVector8i& v) {return xffffffff(v).srl32(16);}
+	__forceinline static GSVector8i x0001ffff(const GSVector8i& v) {return xffffffff(v).srl32(15);}
+	__forceinline static GSVector8i x0003ffff(const GSVector8i& v) {return xffffffff(v).srl32(14);}
+	__forceinline static GSVector8i x0007ffff(const GSVector8i& v) {return xffffffff(v).srl32(13);}
+	__forceinline static GSVector8i x000fffff(const GSVector8i& v) {return xffffffff(v).srl32(12);}
+	__forceinline static GSVector8i x001fffff(const GSVector8i& v) {return xffffffff(v).srl32(11);}
+	__forceinline static GSVector8i x003fffff(const GSVector8i& v) {return xffffffff(v).srl32(10);}
+	__forceinline static GSVector8i x007fffff(const GSVector8i& v) {return xffffffff(v).srl32( 9);}
+	__forceinline static GSVector8i x00ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 8);}
+	__forceinline static GSVector8i x01ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 7);}
+	__forceinline static GSVector8i x03ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 6);}
+	__forceinline static GSVector8i x07ffffff(const GSVector8i& v) {return xffffffff(v).srl32( 5);}
+	__forceinline static GSVector8i x0fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 4);}
+	__forceinline static GSVector8i x1fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 3);}
+	__forceinline static GSVector8i x3fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 2);}
+	__forceinline static GSVector8i x7fffffff(const GSVector8i& v) {return xffffffff(v).srl32( 1);}
+
+	__forceinline static GSVector8i x80000000(const GSVector8i& v) {return xffffffff(v).sll32(31);}
+	__forceinline static GSVector8i xc0000000(const GSVector8i& v) {return xffffffff(v).sll32(30);}
+	__forceinline static GSVector8i xe0000000(const GSVector8i& v) {return xffffffff(v).sll32(29);}
+	__forceinline static GSVector8i xf0000000(const GSVector8i& v) {return xffffffff(v).sll32(28);}
+	__forceinline static GSVector8i xf8000000(const GSVector8i& v) {return xffffffff(v).sll32(27);}
+	__forceinline static GSVector8i xfc000000(const GSVector8i& v) {return xffffffff(v).sll32(26);}
+	__forceinline static GSVector8i xfe000000(const GSVector8i& v) {return xffffffff(v).sll32(25);}
+	__forceinline static GSVector8i xff000000(const GSVector8i& v) {return xffffffff(v).sll32(24);}
+	__forceinline static GSVector8i xff800000(const GSVector8i& v) {return xffffffff(v).sll32(23);}
+	__forceinline static GSVector8i xffc00000(const GSVector8i& v) {return xffffffff(v).sll32(22);}
+	__forceinline static GSVector8i xffe00000(const GSVector8i& v) {return xffffffff(v).sll32(21);}
+	__forceinline static GSVector8i xfff00000(const GSVector8i& v) {return xffffffff(v).sll32(20);}
+	__forceinline static GSVector8i xfff80000(const GSVector8i& v) {return xffffffff(v).sll32(19);}
+	__forceinline static GSVector8i xfffc0000(const GSVector8i& v) {return xffffffff(v).sll32(18);}
+	__forceinline static GSVector8i xfffe0000(const GSVector8i& v) {return xffffffff(v).sll32(17);}
+	__forceinline static GSVector8i xffff0000(const GSVector8i& v) {return xffffffff(v).sll32(16);}
+	__forceinline static GSVector8i xffff8000(const GSVector8i& v) {return xffffffff(v).sll32(15);}
+	__forceinline static GSVector8i xffffc000(const GSVector8i& v) {return xffffffff(v).sll32(14);}
+	__forceinline static GSVector8i xffffe000(const GSVector8i& v) {return xffffffff(v).sll32(13);}
+	__forceinline static GSVector8i xfffff000(const GSVector8i& v) {return xffffffff(v).sll32(12);}
+	__forceinline static GSVector8i xfffff800(const GSVector8i& v) {return xffffffff(v).sll32(11);}
+	__forceinline static GSVector8i xfffffc00(const GSVector8i& v) {return xffffffff(v).sll32(10);}
+	__forceinline static GSVector8i xfffffe00(const GSVector8i& v) {return xffffffff(v).sll32( 9);}
+	__forceinline static GSVector8i xffffff00(const GSVector8i& v) {return xffffffff(v).sll32( 8);}
+	__forceinline static GSVector8i xffffff80(const GSVector8i& v) {return xffffffff(v).sll32( 7);}
+	__forceinline static GSVector8i xffffffc0(const GSVector8i& v) {return xffffffff(v).sll32( 6);}
+	__forceinline static GSVector8i xffffffe0(const GSVector8i& v) {return xffffffff(v).sll32( 5);}
+	__forceinline static GSVector8i xfffffff0(const GSVector8i& v) {return xffffffff(v).sll32( 4);}
+	__forceinline static GSVector8i xfffffff8(const GSVector8i& v) {return xffffffff(v).sll32( 3);}
+	__forceinline static GSVector8i xfffffffc(const GSVector8i& v) {return xffffffff(v).sll32( 2);}
+	__forceinline static GSVector8i xfffffffe(const GSVector8i& v) {return xffffffff(v).sll32( 1);}
+
+	// 16-bit lane mask generators seeded from the caller-supplied vector v: the
+	// all-ones value comes from xffffffff(v) and is shifted with srl16()/sll16().
+	// The function name spells the value intended for every 16-bit lane.
+
+	__forceinline static GSVector8i x0001(const GSVector8i& v) {return xffffffff(v).srl16(15);}
+	__forceinline static GSVector8i x0003(const GSVector8i& v) {return xffffffff(v).srl16(14);}
+	__forceinline static GSVector8i x0007(const GSVector8i& v) {return xffffffff(v).srl16(13);}
+	__forceinline static GSVector8i x000f(const GSVector8i& v) {return xffffffff(v).srl16(12);}
+	__forceinline static GSVector8i x001f(const GSVector8i& v) {return xffffffff(v).srl16(11);}
+	__forceinline static GSVector8i x003f(const GSVector8i& v) {return xffffffff(v).srl16(10);}
+	__forceinline static GSVector8i x007f(const GSVector8i& v) {return xffffffff(v).srl16( 9);}
+	__forceinline static GSVector8i x00ff(const GSVector8i& v) {return xffffffff(v).srl16( 8);}
+	__forceinline static GSVector8i x01ff(const GSVector8i& v) {return xffffffff(v).srl16( 7);}
+	__forceinline static GSVector8i x03ff(const GSVector8i& v) {return xffffffff(v).srl16( 6);}
+	__forceinline static GSVector8i x07ff(const GSVector8i& v) {return xffffffff(v).srl16( 5);}
+	__forceinline static GSVector8i x0fff(const GSVector8i& v) {return xffffffff(v).srl16( 4);}
+	__forceinline static GSVector8i x1fff(const GSVector8i& v) {return xffffffff(v).srl16( 3);}
+	__forceinline static GSVector8i x3fff(const GSVector8i& v) {return xffffffff(v).srl16( 2);}
+	__forceinline static GSVector8i x7fff(const GSVector8i& v) {return xffffffff(v).srl16( 1);}
+
+	__forceinline static GSVector8i x8000(const GSVector8i& v) {return xffffffff(v).sll16(15);}
+	__forceinline static GSVector8i xc000(const GSVector8i& v) {return xffffffff(v).sll16(14);}
+	__forceinline static GSVector8i xe000(const GSVector8i& v) {return xffffffff(v).sll16(13);}
+	__forceinline static GSVector8i xf000(const GSVector8i& v) {return xffffffff(v).sll16(12);}
+	__forceinline static GSVector8i xf800(const GSVector8i& v) {return xffffffff(v).sll16(11);}
+	__forceinline static GSVector8i xfc00(const GSVector8i& v) {return xffffffff(v).sll16(10);}
+	__forceinline static GSVector8i xfe00(const GSVector8i& v) {return xffffffff(v).sll16( 9);}
+	__forceinline static GSVector8i xff00(const GSVector8i& v) {return xffffffff(v).sll16( 8);}
+	__forceinline static GSVector8i xff80(const GSVector8i& v) {return xffffffff(v).sll16( 7);}
+	__forceinline static GSVector8i xffc0(const GSVector8i& v) {return xffffffff(v).sll16( 6);}
+	__forceinline static GSVector8i xffe0(const GSVector8i& v) {return xffffffff(v).sll16( 5);}
+	__forceinline static GSVector8i xfff0(const GSVector8i& v) {return xffffffff(v).sll16( 4);}
+	__forceinline static GSVector8i xfff8(const GSVector8i& v) {return xffffffff(v).sll16( 3);}
+	__forceinline static GSVector8i xfffc(const GSVector8i& v) {return xffffffff(v).sll16( 2);}
+	__forceinline static GSVector8i xfffe(const GSVector8i& v) {return xffffffff(v).sll16( 1);}
+
+	// Table lookups into the static arrays m_xff / m_x0f (defined outside this view).
+	// n is used as the index without any range check.
+	// NOTE(review): the table sizes and contents are not visible here - confirm the valid range of n.
+	__forceinline static GSVector8i xff(int n) {return m_xff[n];}
+	__forceinline static GSVector8i x0f(int n) {return m_x0f[n];}
+};
+
+#endif
+
+#if _M_SSE >= 0x500
+
+__aligned(class, 32) GSVector8
+{
+public:
+	// Overlapping views of the same 256 bits: named float components (two xyzw / rgba
+	// groups), typed element arrays, the raw __m256 register and its two 128-bit halves.
+	union
+	{
+		struct {float x0, y0, z0, w0, x1, y1, z1, w1;};
+		struct {float r0, g0, b0, a0, r1, g1, b1, a1;};
+		float v[8];
+		float f32[8];
+		int8 i8[32];
+		int16 i16[16];
+		int32 i32[8];
+		int64 i64[4];
+		uint8 u8[32];
+		uint16 u16[16];
+		uint32 u32[8];
+		uint64 u64[4];
+		__m256 m;
+		// m0 = bits [127:0], m1 = bits [255:128]. The pair must be wrapped in a struct:
+		// as two plain union members both would start at offset 0, making m1 an alias
+		// of the lower half instead of the upper one.
+		struct {__m128 m0, m1;};
+	};
+
+	static const GSVector8 m_half;
+	static const GSVector8 m_one;
+	static const GSVector8 m_x7fffffff;
+	static const GSVector8 m_x80000000;
+	static const GSVector8 m_x4b000000;
+	static const GSVector8 m_x4f800000;
+	static const GSVector8 m_max;
+	static const GSVector8 m_min;
+
+	// Default constructor: the contents are left uninitialized.
+	__forceinline GSVector8()
+	{
+	}
+
+	// Component-wise constructor; x0 becomes element 0 and w1 element 7.
+	__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
+	{
+		// _mm256_set_ps takes its arguments from the highest element down
+
+		this->m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
+	}
+
+	// Same element order as the float overload; every int is converted to float.
+	__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
+	{
+		const __m256i ints = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0);
+
+		this->m = _mm256_cvtepi32_ps(ints);
+	}
+
+	// Builds the vector from two 128-bit halves: m0 is inserted at lane 0, m1 at lane 1.
+	__forceinline GSVector8(__m128 m0, __m128 m1)
+	{
+		#if 0 // _MSC_VER >= 1700
+
+		this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
+
+		#else
+
+		// start from a zeroed register and drop both halves in
+
+		GSVector8 lower = zero().insert<0>(m0);
+
+		this->m = lower.insert<1>(m1);
+
+		#endif
+	}
+
+	// Copy constructor.
+	__forceinline GSVector8(const GSVector8& v)
+	{
+		this->m = v.m;
+	}
+
+	// Broadcast constructor: delegates to the float assignment operator.
+	__forceinline explicit GSVector8(float f)
+	{
+		this->operator = (f);
+	}
+
+	// Broadcast constructor for an integer: i is converted to float and placed in all eight elements.
+	__forceinline explicit GSVector8(int i)
+	{
+		#if _M_SSE >= 0x501
+
+		// AVX2: splat the integer first, then convert all lanes at once
+
+		const __m256i splat = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
+
+		this->m = _mm256_cvtepi32_ps(splat);
+
+		#else
+
+		// AVX only: go through the 128-bit classes, then assign the result to *this
+
+		GSVector4i v(i);
+
+		*this = GSVector4(v);
+
+		#endif
+	}
+
+	// Delegates to the __m128 assignment operator, which duplicates m into both lanes.
+	__forceinline explicit GSVector8(__m128 m)
+	{
+		this->operator = (m);
+	}
+
+	// Wraps a raw 256-bit register.
+	__forceinline explicit GSVector8(__m256 m)
+	{
+		this->m = m;
+	}
+
+	#if _M_SSE >= 0x501
+
+	__forceinline explicit GSVector8(const GSVector8i& v);
+
+	__forceinline static GSVector8 cast(const GSVector8i& v);
+
+	#endif
+
+	__forceinline static GSVector8 cast(const GSVector4& v);
+	__forceinline static GSVector8 cast(const GSVector4i& v);
+
+	// Copy assignment.
+	__forceinline void operator = (const GSVector8& v)
+	{
+		this->m = v.m;
+	}
+
+	// Assigns f to all eight elements.
+	__forceinline void operator = (float f)
+	{
+		#if _M_SSE >= 0x501
+
+		const __m128 scalar = _mm_load_ss(&f);
+
+		this->m = _mm256_broadcastss_ps(scalar);
+
+		#else
+
+		this->m = _mm256_set1_ps(f);
+
+		#endif
+	}
+
+	// Duplicates the 128-bit value into both lanes.
+	__forceinline void operator = (__m128 m)
+	{
+		const __m256 lower = _mm256_castps128_ps256(m);
+
+		this->m = _mm256_insertf128_ps(lower, m, 1);
+	}
+
+	// Takes over a raw 256-bit register.
+	__forceinline void operator = (__m256 m)
+	{
+		this->m = m;
+	}
+
+	// Implicit conversion to the raw register so the object can be passed to intrinsics.
+	__forceinline operator __m256() const
+	{
+		return this->m;
+	}
+
+	// Absolute value: masks every element with 0x7fffffff, i.e. clears the sign bit.
+	__forceinline GSVector8 abs() const
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8 mask = cast(GSVector8i::x7fffffff());
+
+		return *this & mask;
+
+		#else
+
+		return *this & m_x7fffffff;
+
+		#endif
+	}
+
+	// Negation: xors every element with 0x80000000, i.e. flips the sign bit.
+	__forceinline GSVector8 neg() const
+	{
+		#if _M_SSE >= 0x501
+
+		const GSVector8 signbit = cast(GSVector8i::x80000000());
+
+		return *this ^ signbit;
+
+		#else
+
+		return *this ^ m_x80000000;
+
+		#endif
+	}
+
+	__forceinline GSVector8 rcp() const // per-lane approximate reciprocal (_mm256_rcp_ps)
+	{
+		return GSVector8(_mm256_rcp_ps(this->m));
+	}
+
+	__forceinline GSVector8 rcpnr() const // rcp() refined with one Newton-Raphson step: 2r - r*r*x
+	{
+		const GSVector8 approx = rcp();
+		const GSVector8 twice = approx + approx;
+		return twice - (approx * approx) * *this;
+	}
+
+	template<int mode> __forceinline GSVector8 round() const // per-lane rounding; mode is passed to _mm256_round_ps
+	{
+		return GSVector8(_mm256_round_ps(this->m, mode));
+	}
+
+	__forceinline GSVector8 floor() const // rounds every lane using the Round_NegInf mode
+	{
+		return GSVector8(_mm256_round_ps(m, Round_NegInf));
+	}
+
+	__forceinline GSVector8 ceil() const // rounds every lane using the Round_PosInf mode
+	{
+		return GSVector8(_mm256_round_ps(m, Round_PosInf));
+	}
+
+	#if _M_SSE >= 0x501
+
+	#define LOG8_POLY0(x, c0) GSVector8(c0) /* LOG8_POLYn(x, c0..cn) evaluates c0 + x*(c1 + x*(... + x*cn)) via nested madd() */
+	#define LOG8_POLY1(x, c0, c1) (LOG8_POLY0(x, c1).madd(x, GSVector8(c0)))
+	#define LOG8_POLY2(x, c0, c1, c2) (LOG8_POLY1(x, c1, c2).madd(x, GSVector8(c0)))
+	#define LOG8_POLY3(x, c0, c1, c2, c3) (LOG8_POLY2(x, c1, c2, c3).madd(x, GSVector8(c0)))
+	#define LOG8_POLY4(x, c0, c1, c2, c3, c4) (LOG8_POLY3(x, c1, c2, c3, c4).madd(x, GSVector8(c0)))
+	#define LOG8_POLY5(x, c0, c1, c2, c3, c4, c5) (LOG8_POLY4(x, c1, c2, c3, c4, c5).madd(x, GSVector8(c0)))
+
+	__forceinline GSVector8 log2(int precision = 5) const // per-lane log2 approximation; precision 3..6 selects the polynomial
+	{
+		// NOTE: see GSVector4::log2
+
+		GSVector8 one = m_one;
+
+		GSVector8i i = GSVector8i::cast(*this); // raw bit pattern of every float lane
+
+		GSVector8 e = GSVector8(((i << 1) >> 24) - GSVector8i::x0000007f()); // top bits after shifting the sign out, minus the x0000007f constant, converted to float (presumably the unbiased exponent)
+		GSVector8 m = GSVector8::cast((i << 9) >> 9) | one; // low 23 bits OR'ed with m_one (presumably the mantissa in [1, 2)); this local shadows the member m
+
+		GSVector8 p;
+
+		switch(precision)
+		{
+		case 3:
+			p = LOG8_POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+			break;
+		case 4:
+			p = LOG8_POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+			break;
+		default: // any precision other than 3, 4 or 6 falls through to the case 5 polynomial
+		case 5:
+			p = LOG8_POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+			break;
+		case 6:
+			p = LOG8_POLY5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+			break;
+		}
+
+		// This effectively increases the polynomial degree by one, but ensures that log2(1) == 0
+
+		p = p * (m - one);
+
+		return p + e; // polynomial part plus the e term computed above
+	}
+
+	#endif
+
+	__forceinline GSVector8 madd(const GSVector8& a, const GSVector8& b) const // *this * a + b
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fmadd_ps(m, a, b));
+		
+		#else
+		// The fused instruction is disabled above; multiply and add separately.
+		const GSVector8 product = *this * a;
+		return product + b;
+		#endif
+	}
+
+	__forceinline GSVector8 msub(const GSVector8& a, const GSVector8& b) const // *this * a - b
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fmsub_ps(m, a, b));
+		
+		#else
+		// The fused instruction is disabled above; multiply and subtract separately.
+		const GSVector8 product = *this * a;
+		return product - b;
+		#endif
+	}
+
+	__forceinline GSVector8 nmadd(const GSVector8& a, const GSVector8& b) const // b - *this * a
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fnmadd_ps(m, a, b));
+		
+		#else
+		// The fused instruction is disabled above; multiply and subtract separately.
+		const GSVector8 product = *this * a;
+		return b - product;
+		#endif
+	}
+
+	__forceinline GSVector8 nmsub(const GSVector8& a, const GSVector8& b) const // -b - *this * a
+	{
+		#if 0//_M_SSE >= 0x501
+
+		return GSVector8(_mm256_fnmsub_ps(m, a, b));
+		
+		#else
+		// The fused instruction is disabled above; negate, multiply and subtract separately.
+		const GSVector8 product = *this * a;
+		return -b - product;
+		#endif
+	}
+
+	__forceinline GSVector8 addm(const GSVector8& a, const GSVector8& b) const // accumulate form of madd(): *this is the addend
+	{
+		return a.madd(b, *this); // *this + a * b
+	}
+
+	__forceinline GSVector8 subm(const GSVector8& a, const GSVector8& b) const // accumulate form of nmadd(): *this is the minuend
+	{
+		return a.nmadd(b, *this); // *this - a * b
+	}
+
+	__forceinline GSVector8 hadd() const // horizontal add of the vector with itself
+	{
+		return GSVector8(_mm256_hadd_ps(this->m, this->m));
+	}
+
+	__forceinline GSVector8 hadd(const GSVector8& v) const // horizontal add of *this (first operand) and v
+	{
+		return GSVector8(_mm256_hadd_ps(this->m, v.m));
+	}
+
+	__forceinline GSVector8 hsub() const // horizontal subtract of the vector with itself
+	{
+		return GSVector8(_mm256_hsub_ps(this->m, this->m));
+	}
+
+	__forceinline GSVector8 hsub(const GSVector8& v) const // horizontal subtract of *this (first operand) and v
+	{
+		return GSVector8(_mm256_hsub_ps(this->m, v.m));
+	}
+
+	template<int i> __forceinline GSVector8 dp(const GSVector8& v) const // dot product; i is the _mm256_dp_ps immediate
+	{
+		return GSVector8(_mm256_dp_ps(this->m, v.m, i));
+	}
+
+	__forceinline GSVector8 sat(const GSVector8& a, const GSVector8& b) const // min(max(*this, a), b) per lane
+	{
+		return max(a).min(b);
+	}
+
+	__forceinline GSVector8 sat(const GSVector8& a) const // lower bounds taken from a.xyxy(), upper bounds from a.zwzw()
+	{
+		return max(a.xyxy()).min(a.zwzw());
+	}
+
+	__forceinline GSVector8 sat(const float scale = 255) const // min(max(*this, 0), scale) per lane
+	{
+		return max(zero()).min(GSVector8(scale));
+	}
+
+	__forceinline GSVector8 clamp(const float scale = 255) const // upper bound only: min(*this, scale)
+	{
+		return GSVector8(_mm256_min_ps(m, GSVector8(scale).m));
+	}
+
+	__forceinline GSVector8 min(const GSVector8& a) const // per-lane minimum of *this and a
+	{
+		return GSVector8(_mm256_min_ps(this->m, a.m));
+	}
+
+	__forceinline GSVector8 max(const GSVector8& a) const // per-lane maximum of *this and a
+	{
+		return GSVector8(_mm256_max_ps(this->m, a.m));
+	}
+
+	template<int mask> __forceinline GSVector8 blend32(const GSVector8& a)  const // lanes whose mask bit is set come from a
+	{
+		return GSVector8(_mm256_blend_ps(this->m, a.m, mask));
+	}
+
+	__forceinline GSVector8 blend32(const GSVector8& a, const GSVector8& mask)  const // variable blend (_mm256_blendv_ps) driven by mask
+	{
+		return GSVector8(_mm256_blendv_ps(this->m, a.m, mask.m));
+	}
+
+	__forceinline GSVector8 upl(const GSVector8& a) const // 32-bit unpack-low of *this and a
+	{
+		return GSVector8(_mm256_unpacklo_ps(this->m, a.m));
+	}
+
+	__forceinline GSVector8 uph(const GSVector8& a) const // 32-bit unpack-high of *this and a
+	{
+		return GSVector8(_mm256_unpackhi_ps(this->m, a.m));
+	}
+
+	__forceinline GSVector8 upl64(const GSVector8& a) const // 64-bit unpack-low, done in the double domain
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(this->m), _mm256_castps_pd(a.m))));
+	}
+
+	__forceinline GSVector8 uph64(const GSVector8& a) const // 64-bit unpack-high, done in the double domain
+	{
+		return GSVector8(_mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(this->m), _mm256_castps_pd(a.m))));
+	}
+
+	__forceinline GSVector8 l2h() const // alias for xyxy(): low pair repeated into the high pair of each lane
+	{
+		return xyxy();
+	}
+
+	__forceinline GSVector8 h2l() const // alias for zwzw(): high pair repeated into the low pair of each lane
+	{
+		return zwzw();
+	}
+
+	__forceinline GSVector8 andnot(const GSVector8& v) const // *this & ~v (note the swapped intrinsic operands)
+	{
+		return GSVector8(_mm256_andnot_ps(v.m, this->m));
+	}
+
+	__forceinline int mask() const // one bit per lane, taken from each lane's sign bit
+	{
+		return _mm256_movemask_ps(this->m);
+	}
+
+	__forceinline bool alltrue() const // true when all eight sign bits are set
+	{
+		return _mm256_movemask_ps(m) == 0xff;
+	}
+
+	__forceinline bool allfalse() const // true when _mm256_testz_ps finds no sign bit set
+	{
+		return _mm256_testz_ps(this->m, this->m) != 0;
+	}
+	
+	__forceinline GSVector8 replace_nan(const GSVector8& v) const // keeps lanes of *this that equal themselves; other lanes (NaN) come from v
+	{
+		return v.blend32(*this, *this == *this);
+	}
+
+	template<int src, int dst> __forceinline GSVector8 insert32(const GSVector8& v) const // copy of *this with element dst replaced by element src of v, within each 128-bit lane
+	{
+		// TODO: use blendps when src == dst
+
+		ASSERT(src < 4 && dst < 4); // not cross lane like extract32()
+
+		switch(dst)
+		{
+		case 0: // first shuffle puts (this.y, v[src]) at x/z; second picks (v[src], this.y) then z/w from *this
+			switch(src)
+			{
+			case 0: return yyxx(v).zxzw(*this);
+			case 1: return yyyy(v).zxzw(*this);
+			case 2: return yyzz(v).zxzw(*this);
+			case 3: return yyww(v).zxzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 1: // first shuffle puts (this.x, v[src]) at x/z; second picks (this.x, v[src]) then z/w from *this
+			switch(src)
+			{
+			case 0: return xxxx(v).xzzw(*this);
+			case 1: return xxyy(v).xzzw(*this);
+			case 2: return xxzz(v).xzzw(*this);
+			case 3: return xxww(v).xzzw(*this);
+			default: __assume(0);
+			}
+			break;
+		case 2: // inner shuffle builds (this.w, ?, v[src], ?); outer keeps x/y of *this and appends (v[src], this.w)
+			switch(src)
+			{
+			case 0: return xyzx(wwxx(v));
+			case 1: return xyzx(wwyy(v));
+			case 2: return xyzx(wwzz(v));
+			case 3: return xyzx(wwww(v));
+			default: __assume(0);
+			}
+			break;
+		case 3: // inner shuffle builds (this.z, ?, v[src], ?); outer keeps x/y of *this and appends (this.z, v[src])
+			switch(src)
+			{
+			case 0: return xyxz(zzxx(v));
+			case 1: return xyxz(zzyy(v));
+			case 2: return xyxz(zzzz(v));
+			case 3: return xyxz(zzww(v));
+			default: __assume(0);
+			}
+			break;
+		default:
+			__assume(0);
+		}
+
+		return *this; // not reached for valid src/dst; present so every path returns a value
+	}
+
+	template<int i> __forceinline int extract32() const // cross-lane element read, i in 0..7
+	{
+		ASSERT(i < 8);
+		// i / 4 picks the 128-bit half, i & 3 the element inside that half.
+		return extract<i / 4>().template extract32<i & 3>();
+	}
+
+	template<int i> __forceinline GSVector8 insert(__m128 m) const // copy of *this with 128-bit half i (0 or 1) replaced by m
+	{
+		ASSERT(i < 2);
+
+		return GSVector8(_mm256_insertf128_ps(this->m, m, i)); // the parameter m shadows the member, hence this->m
+	}
+
+	template<int i> __forceinline GSVector4 extract() const // returns 128-bit half i (0 = low, 1 = high)
+	{
+		ASSERT(i < 2);
+		// The low half is a plain cast; only the high half needs an extract.
+		return (i == 0)
+			? GSVector4(_mm256_castps256_ps128(m))
+			: GSVector4(_mm256_extractf128_ps(m, i));
+	}
+
+	__forceinline static GSVector8 zero() // vector with all 256 bits cleared
+	{
+		return GSVector8(_mm256_setzero_ps());
+	}
+
+	__forceinline static GSVector8 xffffffff() // all-ones mask, obtained as the result of comparing zero with itself
+	{
+		return zero() == zero();
+	}
+
+	// TODO
+
+	__forceinline static GSVector8 loadl(const void* p)
+	{
+		return GSVector8(_mm256_castps128_ps256(_mm_load_ps((float*)p)));
+	}
+
+	__forceinline static GSVector8 loadh(const void* p) // aligned 16-byte load into the high half; the low half is zero
+	{
+		return zero().insert<1>(_mm_load_ps((const float*)p));
+	}
+
+	__forceinline static GSVector8 loadh(const void* p, const GSVector8& v) // v with its high half replaced by 16 aligned bytes from p
+	{
+		return GSVector8(_mm256_insertf128_ps(v.m, _mm_load_ps((const float*)p), 1));
+	}
+
+	__forceinline static GSVector8 load(const void* pl, const void* ph) // low half from pl, high half from ph (both aligned 16-byte loads)
+	{
+		return loadh(ph, loadl(pl));
+	}
+
+	template<bool aligned> __forceinline static GSVector8 load(const void* p) // full 32-byte load; aligned selects _mm256_load_ps over _mm256_loadu_ps
+	{
+		return GSVector8(aligned ? _mm256_load_ps((const float*)p) : _mm256_loadu_ps((const float*)p));
+	}
+
+	// TODO
+
+	__forceinline static void storel(void* p, const GSVector8& v) // aligned 16-byte store of v's low half
+	{
+		_mm_store_ps(static_cast<float*>(p), _mm256_extractf128_ps(v.m, 0));
+	}
+
+	__forceinline static void storeh(void* p, const GSVector8& v) // aligned 16-byte store of v's high half
+	{
+		_mm_store_ps(static_cast<float*>(p), _mm256_extractf128_ps(v.m, 1));
+	}
+
+	template<bool aligned> __forceinline static void store(void* p, const GSVector8& v) // full 32-byte store
+	{
+		if(!aligned) _mm256_storeu_ps(static_cast<float*>(p), v.m);
+		else _mm256_store_ps(static_cast<float*>(p), v.m);
+	}
+
+	//
+
+	__forceinline static void zeroupper() // thin wrapper over _mm256_zeroupper()
+	{
+		_mm256_zeroupper();
+	}
+
+	__forceinline static void zeroall() // thin wrapper over _mm256_zeroall()
+	{
+		_mm256_zeroall();
+	}
+
+	//
+
+	__forceinline GSVector8 operator - () const // unary minus, same as neg()
+	{
+		return neg();
+	}
+
+	__forceinline void operator += (const GSVector8& v) // lane-wise *this = *this + v
+	{
+		this->m = _mm256_add_ps(this->m, v.m);
+	}
+
+	__forceinline void operator -= (const GSVector8& v) // lane-wise *this = *this - v
+	{
+		this->m = _mm256_sub_ps(this->m, v.m);
+	}
+
+	__forceinline void operator *= (const GSVector8& v) // lane-wise *this = *this * v
+	{
+		this->m = _mm256_mul_ps(this->m, v.m);
+	}
+
+	__forceinline void operator /= (const GSVector8& v) // lane-wise *this = *this / v
+	{
+		this->m = _mm256_div_ps(this->m, v.m);
+	}
+
+	__forceinline void operator += (float f) // adds f to every lane
+	{
+		m = _mm256_add_ps(m, GSVector8(f).m);
+	}
+
+	__forceinline void operator -= (float f) // subtracts f from every lane
+	{
+		m = _mm256_sub_ps(m, GSVector8(f).m);
+	}
+
+	__forceinline void operator *= (float f) // multiplies every lane by f
+	{
+		m = _mm256_mul_ps(m, GSVector8(f).m);
+	}
+
+	__forceinline void operator /= (float f) // divides every lane by f
+	{
+		m = _mm256_div_ps(m, GSVector8(f).m);
+	}
+
+	__forceinline void operator &= (const GSVector8& v) // bitwise AND with v
+	{
+		this->m = _mm256_and_ps(this->m, v.m);
+	}
+
+	__forceinline void operator |= (const GSVector8& v) // bitwise OR with v
+	{
+		this->m = _mm256_or_ps(this->m, v.m);
+	}
+
+	__forceinline void operator ^= (const GSVector8& v) // bitwise XOR with v
+	{
+		this->m = _mm256_xor_ps(this->m, v.m);
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v1, const GSVector8& v2) // lane-wise sum
+	{
+		return GSVector8(_mm256_add_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v1, const GSVector8& v2) // lane-wise difference v1 - v2
+	{
+		return GSVector8(_mm256_sub_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v1, const GSVector8& v2) // lane-wise product
+	{
+		return GSVector8(_mm256_mul_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v1, const GSVector8& v2) // lane-wise quotient v1 / v2
+	{
+		return GSVector8(_mm256_div_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator + (const GSVector8& v, float f) // v plus f in every lane
+	{
+		return GSVector8(_mm256_add_ps(v.m, GSVector8(f).m));
+	}
+
+	__forceinline friend GSVector8 operator - (const GSVector8& v, float f) // v minus f in every lane
+	{
+		return GSVector8(_mm256_sub_ps(v.m, GSVector8(f).m));
+	}
+
+	__forceinline friend GSVector8 operator * (const GSVector8& v, float f) // v times f in every lane
+	{
+		return GSVector8(_mm256_mul_ps(v.m, GSVector8(f).m));
+	}
+
+	__forceinline friend GSVector8 operator / (const GSVector8& v, float f) // v divided by f in every lane
+	{
+		return GSVector8(_mm256_div_ps(v.m, GSVector8(f).m));
+	}
+
+	__forceinline friend GSVector8 operator & (const GSVector8& v1, const GSVector8& v2) // bitwise AND
+	{
+		return GSVector8(_mm256_and_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator | (const GSVector8& v1, const GSVector8& v2) // bitwise OR
+	{
+		return GSVector8(_mm256_or_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator ^ (const GSVector8& v1, const GSVector8& v2) // bitwise XOR
+	{
+		return GSVector8(_mm256_xor_ps(v1.m, v2.m));
+	}
+
+	__forceinline friend GSVector8 operator == (const GSVector8& v1, const GSVector8& v2) // per-lane mask; ordered compare, so NaN lanes yield 0
+	{
+		return GSVector8(_mm256_cmp_ps(v1.m, v2.m, _CMP_EQ_OQ));
+	}
+
+	__forceinline friend GSVector8 operator != (const GSVector8& v1, const GSVector8& v2) // per-lane mask; unordered predicate so a NaN lane is "not equal", matching C++ != and _mm_cmpneq_ps
+	{
+		return GSVector8(_mm256_cmp_ps(v1, v2, _CMP_NEQ_UQ));
+	}
+
+	__forceinline friend GSVector8 operator > (const GSVector8& v1, const GSVector8& v2) // per-lane mask; ordered, non-signaling
+	{
+		return GSVector8(_mm256_cmp_ps(v1.m, v2.m, _CMP_GT_OQ));
+	}
+
+	__forceinline friend GSVector8 operator < (const GSVector8& v1, const GSVector8& v2) // per-lane mask; ordered, non-signaling
+	{
+		return GSVector8(_mm256_cmp_ps(v1.m, v2.m, _CMP_LT_OQ));
+	}
+
+	__forceinline friend GSVector8 operator >= (const GSVector8& v1, const GSVector8& v2) // per-lane mask; ordered, non-signaling
+	{
+		return GSVector8(_mm256_cmp_ps(v1.m, v2.m, _CMP_GE_OQ));
+	}
+
+	__forceinline friend GSVector8 operator <= (const GSVector8& v1, const GSVector8& v2) // per-lane mask; ordered, non-signaling
+	{
+		return GSVector8(_mm256_cmp_ps(v1.m, v2.m, _CMP_LE_OQ));
+	}
+
+	// x = v[31:0] / v[159:128]
+	// y = v[63:32] / v[191:160]
+	// z = v[95:64] / v[223:192]
+	// w = v[127:96] / v[255:224]
+	// Generates xxxx() .. wwww(): each letter picks a 32-bit element, applied identically to both 128-bit lanes.
+	// In the (const GSVector8& v) overloads the first two letters read *this and the last two read v.
+	#define VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+		__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+		__forceinline GSVector8 xs##ys##zs##ws(const GSVector8& v) const {return GSVector8(_mm256_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+
+		// vs2012u3 cannot reuse the result of equivalent shuffles done with _mm256_permute_ps (write v.xxxx() twice and it is computed twice), but with _mm256_shuffle_ps it can.
+		//__forceinline GSVector8 xs##ys##zs##ws() const {return GSVector8(_mm256_permute_ps(m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+
+	#define VECTOR8_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+		VECTOR8_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_2(xs, xn, ys, yn) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+		VECTOR8_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+	#define VECTOR8_SHUFFLE_1(xs, xn) \
+		VECTOR8_SHUFFLE_2(xs, xn, x, 0) \
+		VECTOR8_SHUFFLE_2(xs, xn, y, 1) \
+		VECTOR8_SHUFFLE_2(xs, xn, z, 2) \
+		VECTOR8_SHUFFLE_2(xs, xn, w, 3) \
+
+	VECTOR8_SHUFFLE_1(x, 0)
+	VECTOR8_SHUFFLE_1(y, 1)
+	VECTOR8_SHUFFLE_1(z, 2)
+	VECTOR8_SHUFFLE_1(w, 3)
+
+	// a = v0[127:0]   (low lane of *this)
+	// b = v0[255:128] (high lane of *this)
+	// c = v1[127:0]   (low lane of the argument v; of *this again in the no-argument form)
+	// d = v1[255:128] (high lane of the argument v; of *this again in the no-argument form)
+	// _ = 0           (selector value 8 sets the zeroing bit of the immediate)
+	// Generates aa() .. __(): first letter -> result bits [127:0], second letter -> result bits [255:128].
+	#define VECTOR8_PERMUTE128_2(as, an, bs, bn) \
+		__forceinline GSVector8 as##bs() const {return GSVector8(_mm256_permute2f128_ps(m, m, an | (bn << 4)));} \
+		__forceinline GSVector8 as##bs(const GSVector8& v) const {return GSVector8(_mm256_permute2f128_ps(m, v.m, an | (bn << 4)));} \
+
+	#define VECTOR8_PERMUTE128_1(as, an) \
+		VECTOR8_PERMUTE128_2(as, an, a, 0) \
+		VECTOR8_PERMUTE128_2(as, an, b, 1) \
+		VECTOR8_PERMUTE128_2(as, an, c, 2) \
+		VECTOR8_PERMUTE128_2(as, an, d, 3) \
+		VECTOR8_PERMUTE128_2(as, an, _, 8) \
+
+	VECTOR8_PERMUTE128_1(a, 0)
+	VECTOR8_PERMUTE128_1(b, 1)
+	VECTOR8_PERMUTE128_1(c, 2)
+	VECTOR8_PERMUTE128_1(d, 3)
+	VECTOR8_PERMUTE128_1(_, 8)
+
+	#if _M_SSE >= 0x501
+
+	// a = v[63:0]
+	// b = v[127:64]
+	// c = v[191:128]
+	// d = v[255:192]
+	// Generates aaaa() .. dddd(): each letter picks the 64-bit source element for that result position (cross-lane).
+	#define VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, ds, dn) \
+		__forceinline GSVector8 as##bs##cs##ds() const {return GSVector8(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(m), _MM_SHUFFLE(dn, cn, bn, an))));} \
+
+	#define VECTOR8_PERMUTE64_3(as, an, bs, bn, cs, cn) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, a, 0) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, b, 1) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, c, 2) \
+		VECTOR8_PERMUTE64_4(as, an, bs, bn, cs, cn, d, 3) \
+
+	#define VECTOR8_PERMUTE64_2(as, an, bs, bn) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, a, 0) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, b, 1) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, c, 2) \
+		VECTOR8_PERMUTE64_3(as, an, bs, bn, d, 3) \
+
+	#define VECTOR8_PERMUTE64_1(as, an) \
+		VECTOR8_PERMUTE64_2(as, an, a, 0) \
+		VECTOR8_PERMUTE64_2(as, an, b, 1) \
+		VECTOR8_PERMUTE64_2(as, an, c, 2) \
+		VECTOR8_PERMUTE64_2(as, an, d, 3) \
+
+	VECTOR8_PERMUTE64_1(a, 0)
+	VECTOR8_PERMUTE64_1(b, 1)
+	VECTOR8_PERMUTE64_1(c, 2)
+	VECTOR8_PERMUTE64_1(d, 3)
+
+	__forceinline GSVector8 permute32(const GSVector8i& mask) const // cross-lane 32-bit permute; mask holds the source index for each lane
+	{
+		return GSVector8(_mm256_permutevar8x32_ps(this->m, mask));
+	}
+
+	__forceinline GSVector8 broadcast32() const // element 0 of *this replicated to all eight lanes
+	{
+		return GSVector8(_mm256_broadcastss_ps(_mm256_castps256_ps128(this->m)));
+	}
+
+	__forceinline static GSVector8 broadcast32(const GSVector4& v) // element 0 of v replicated to all eight lanes
+	{
+		return GSVector8(_mm256_broadcastss_ps(v.m));
+	}
+
+	__forceinline static GSVector8 broadcast32(const void* f) // loads one float from f and replicates it to all eight lanes
+	{
+		return GSVector8(_mm256_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
+	}
+
+	// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
+
+	#endif
+};
+
+#endif
+
+// conversion
+
+__forceinline GSVector4i::GSVector4i(const GSVector4& v, bool truncate) // float -> int32 per lane; truncate picks cvtt (toward zero) over cvt (current rounding mode)
+{
+	this->m = truncate ? _mm_cvttps_epi32(v.m) : _mm_cvtps_epi32(v.m);
+}
+
+__forceinline GSVector4::GSVector4(const GSVector4i& v) // int32 -> float per lane
+{
+	this->m = _mm_cvtepi32_ps(v.m);
+}
+
+#if _M_SSE >= 0x501
+
+__forceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate) // float -> int32 per lane; truncate picks cvtt (toward zero) over cvt (current rounding mode)
+{
+	this->m = truncate ? _mm256_cvttps_epi32(v.m) : _mm256_cvtps_epi32(v.m);
+}
+
+__forceinline GSVector8::GSVector8(const GSVector8i& v) // int32 -> float per lane
+{
+	this->m = _mm256_cvtepi32_ps(v.m);
+}
+
+#endif
+
+// casting
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector4& v) // reinterprets the 128 float bits as integers (no value conversion)
+{
+	return GSVector4i(_mm_castps_si128(v.m));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector4i& v) // reinterprets the 128 integer bits as floats (no value conversion)
+{
+	return GSVector4(_mm_castsi128_ps(v.m));
+}
+
+#if _M_SSE >= 0x500
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector8& v) // low 128 bits of v, reinterpreted as integers
+{
+	return GSVector4i(_mm_castps_si128(_mm256_castps256_ps128(v.m)));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector8& v) // low 128 bits of v
+{
+	return GSVector4(_mm256_castps256_ps128(v.m));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector4i& v) // v's bits become the low half; the cast intrinsic leaves the upper 128 bits undefined
+{
+	return GSVector8(_mm256_castps128_ps256(_mm_castsi128_ps(v.m)));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector4& v) // v becomes the low half; the cast intrinsic leaves the upper 128 bits undefined
+{
+	return GSVector8(_mm256_castps128_ps256(v.m));
+}
+
+#endif
+
+#if _M_SSE >= 0x501
+
+__forceinline GSVector4i GSVector4i::cast(const GSVector8i& v) // low 128 bits of v
+{
+	return GSVector4i(_mm256_castsi256_si128(v.m));
+}
+
+__forceinline GSVector4 GSVector4::cast(const GSVector8i& v) // low 128 bits of v, reinterpreted as floats
+{
+	return GSVector4(_mm_castsi128_ps(_mm256_castsi256_si128(v.m)));
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector4i& v) // v becomes the low half; the cast intrinsic leaves the upper 128 bits undefined
+{
+	return GSVector8i(_mm256_castsi128_si256(v.m));
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector4& v) // v's bits become the low half; the cast intrinsic leaves the upper 128 bits undefined
+{
+	return GSVector8i(_mm256_castsi128_si256(_mm_castps_si128(v.m)));
+}
+
+__forceinline GSVector8i GSVector8i::cast(const GSVector8& v) // reinterprets the 256 float bits as integers (no value conversion)
+{
+	return GSVector8i(_mm256_castps_si256(v.m));
+}
+
+__forceinline GSVector8 GSVector8::cast(const GSVector8i& v) // reinterprets the 256 integer bits as floats (no value conversion)
+{
+	return GSVector8(_mm256_castsi256_ps(v.m));
+}
+
+#endif
+
+#pragma pack(pop)
diff --git a/plugins/GSdx_legacy/GSVertex.h b/plugins/GSdx_legacy/GSVertex.h
new file mode 100644
index 0000000000..198fbb93d0
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertex.h
@@ -0,0 +1,67 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSVector.h"
+#include "GSVertexHW.h"
+#include "GSVertexSW.h"
+
+#pragma pack(push, 1)
+
+__aligned(struct, 32) GSVertex // 32-byte vertex record, viewable as named registers or as two __m128i
+{
+	union
+	{
+		struct
+		{
+			GIFRegST ST; // S:0, T:4
+			GIFRegRGBAQ RGBAQ; // RGBA:8, Q:12
+			GIFRegXYZ XYZ; // XY:16, Z:20
+			union {uint32 UV; struct {uint16 U, V;};}; // UV:24
+			uint32 FOG; // FOG:28
+		};
+
+		__m128i m[2]; // raw view of the same 32 bytes, used by the assignment below
+	};
+
+	void operator = (const GSVertex& v) {m[0] = v.m[0]; m[1] = v.m[1];} // copies both 128-bit halves
+};
+
+struct GSVertexP // vertex layout with a single four-float member
+{
+	GSVector4 p;
+};
+
+__aligned(struct, 32) GSVertexPT1 // p plus one two-float t member; sizeof(GSVertexPT1) is the vertex-buffer stride in GSVertexBufferStateOGL
+{
+	GSVector4 p;
+	GSVector2 t;
+};
+
+struct GSVertexPT2 // p plus two two-float t members
+{
+	GSVector4 p;
+	GSVector2 t[2];
+};
+
+#pragma pack(pop)
diff --git a/plugins/GSdx_legacy/GSVertexArrayOGL.h b/plugins/GSdx_legacy/GSVertexArrayOGL.h
new file mode 100644
index 0000000000..5c6e31833f
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexArrayOGL.h
@@ -0,0 +1,334 @@
+/*
+ *	Copyright (C) 2011-2011 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "config.h"
+
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+extern uint64 g_vertex_upload_byte;
+#endif
+
+struct GSInputLayoutOGL { // one vertex attribute; fields are forwarded to glVertexAttribPointer / glVertexAttribIPointer
+	GLint   size; // passed as the "size" argument
+	GLenum  type; // passed as the "type" argument; also selects the integer path in set_internal_format()
+	GLboolean normalize; // passed as "normalized"; when false for unsigned short/int types the I-pointer variant is used
+	GLsizei stride; // passed as the "stride" argument
+	const GLvoid* offset; // passed as the "pointer" argument
+};
+
+template<int STRIDE> // STRIDE: size in bytes of one buffer element
+class GSBufferOGL {
+	size_t m_start; // index (in elements) where the current upload begins
+	size_t m_count; // number of elements of the current upload
+	size_t m_limit; // buffer capacity in elements (not bytes)
+	const  GLenum m_target; // binding point handed to every gl* call
+	GLuint m_buffer_name;
+	uint8*  m_buffer_ptr; // persistent mapping when m_buffer_storage is true, NULL otherwise
+	const bool m_buffer_storage; // copy of GLLoader::found_GL_ARB_buffer_storage
+	GLsync m_fence[5]; // fence slots indexed by 2MB chunk (byte offset >> 21)
+
+	public:
+	GSBufferOGL(GLenum target) // creates the buffer; on the buffer-storage path also allocates and maps it (throws GSDXError if mapping fails)
+		: m_start(0)
+		, m_count(0)
+		, m_limit(0)
+		, m_target(target)
+		, m_buffer_storage(GLLoader::found_GL_ARB_buffer_storage)
+	{
+		glGenBuffers(1, &m_buffer_name);
+		// OpenGL works best with 1-4MB buffers.
+		// Warning: m_limit is the number of objects (not the size in bytes)
+		m_limit = 8 * 1024 * 1024 / STRIDE;
+
+		for (size_t i = 0; i < 5; i++) {
+			m_fence[i] = 0;
+		}
+
+		if (m_buffer_storage) {
+			// TODO: if we do the synchronization manually, I'm not sure the size is important. It is worth investigating.
+			// => bigger buffer => less sync
+			bind();
+			// coherency will be done by flushing
+			const GLbitfield common_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+			const GLbitfield map_flags = common_flags | GL_MAP_FLUSH_EXPLICIT_BIT;
+			const GLbitfield create_flags = common_flags | GL_CLIENT_STORAGE_BIT;
+			glBufferStorage(m_target, STRIDE * m_limit, NULL, create_flags );
+			m_buffer_ptr = (uint8*) glMapBufferRange(m_target, 0, STRIDE * m_limit, map_flags);
+			if (!m_buffer_ptr) {
+				fprintf(stderr, "Failed to map buffer\n");
+				glDeleteBuffers(1, &m_buffer_name); // the destructor does not run when the constructor throws
+				throw GSDXError();
+			}
+		} else {
+			m_buffer_ptr = NULL;
+		}
+	}
+
+	~GSBufferOGL() {
+		if (m_buffer_storage) {
+			for (size_t i = 0; i < 5; i++) {
+				glDeleteSync(m_fence[i]);
+			}
+			// Don't know if we must do it
+			bind();
+			glUnmapBuffer(m_target);
+		}
+		glDeleteBuffers(1, &m_buffer_name);
+	}
+
+	void allocate() { allocate(m_limit); } // re-allocates with the current capacity
+
+	void allocate(size_t new_limit) // no-op on the buffer-storage path; otherwise (re)creates the data store and rewinds m_start
+	{
+		if (!m_buffer_storage) {
+			m_start = 0;
+			m_limit = new_limit;
+			glBufferData(m_target,  m_limit * STRIDE, NULL, GL_STREAM_DRAW);
+		}
+	}
+
+	void bind()
+	{
+		glBindBuffer(m_target, m_buffer_name);
+	}
+
+	void subdata_upload(const void* src) // glBufferSubData path: uploads m_count elements at m_start, growing or orphaning first if needed
+	{
+		// Current GPU buffer is really too small need to allocate a new one
+		if (m_count > m_limit) {
+			//fprintf(stderr, "Allocate a new buffer\n %d", STRIDE);
+			allocate(std::max<size_t>(m_count * 3 / 2, m_limit)); // size_t avoids truncating the operands to int
+
+		} else if (m_count > (m_limit - m_start) ) {
+			//fprintf(stderr, "Orphan the buffer %d\n", STRIDE);
+
+			// Not enough left free room. Just go back at the beginning
+			m_start = 0;
+			// Orphan the buffer to avoid synchronization
+			allocate(m_limit);
+		}
+
+		glBufferSubData(m_target,  STRIDE * m_start,  STRIDE * m_count, src);
+	}
+
+	void map_upload(const void* src) // persistent-mapping path: memcpy into the mapping, guarded by per-chunk fences
+	{
+		ASSERT(m_count < m_limit);
+
+		size_t offset = m_start * STRIDE;
+		size_t length = m_count * STRIDE;
+
+		if (m_count > (m_limit - m_start) ) {
+			size_t current_chunk = offset >> 21;
+#ifdef ENABLE_OGL_DEBUG_FENCE
+			fprintf(stderr, "%x: Wrap buffer\n", m_target);
+			fprintf(stderr, "%x: Insert a fence in chunk %d\n", m_target, (int)current_chunk); // cast: %d expects int, not size_t
+#endif
+			ASSERT(current_chunk > 0 && current_chunk < 5);
+			if (m_fence[current_chunk] == 0) {
+				m_fence[current_chunk] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+
+			// Wrap at startup
+			m_start = 0;
+			offset = 0;
+
+			// Only check first chunk
+			if (m_fence[0]) {
+#ifdef ENABLE_OGL_DEBUG_FENCE
+				GLenum status = glClientWaitSync(m_fence[0], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+				if (status != GL_ALREADY_SIGNALED) {
+					fprintf(stderr, "%x: Sync Sync! Buffer too small\n", m_target);
+				}
+#else
+				glClientWaitSync(m_fence[0], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+#endif
+				glDeleteSync(m_fence[0]);
+				m_fence[0] = 0;
+			}
+		}
+
+		// Protect buffer with fences
+		size_t current_chunk = offset >> 21;
+		size_t next_chunk = (offset + length) >> 21;
+		for (size_t c = current_chunk + 1; c <= next_chunk; c++) {
+#ifdef ENABLE_OGL_DEBUG_FENCE
+			fprintf(stderr, "%x: Insert a fence in chunk %d\n", m_target, (int)(c-1)); // cast: %d expects int, not size_t
+#endif
+			ASSERT(c > 0 && c < 5);
+			m_fence[c-1] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); // fence the chunk being left
+			if (m_fence[c]) { // wait for (and retire) any fence still guarding the chunk being entered
+#ifdef ENABLE_OGL_DEBUG_FENCE
+				GLenum status = glClientWaitSync(m_fence[c], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+#else
+				glClientWaitSync(m_fence[c], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+#endif
+				glDeleteSync(m_fence[c]);
+				m_fence[c] = 0;
+
+#ifdef ENABLE_OGL_DEBUG_FENCE
+				if (status != GL_ALREADY_SIGNALED) {
+					fprintf(stderr, "%x: Sync Sync! Buffer too small\n", m_target);
+				}
+#endif
+			}
+		}
+
+		void* dst = m_buffer_ptr + offset;
+
+		memcpy(dst, src, length);
+		glFlushMappedBufferRange(m_target, offset, length); // explicit flush, required by GL_MAP_FLUSH_EXPLICIT_BIT
+	}
+
+	void upload(const void* src, uint32 count) // records count and dispatches to the mapped or sub-data path
+	{
+#ifdef ENABLE_OGL_DEBUG_MEM_BW
+		g_vertex_upload_byte += count * STRIDE;
+#endif
+
+		m_count = count;
+
+		if (m_buffer_storage) {
+			map_upload(src);
+		} else {
+			subdata_upload(src);
+		}
+	}
+
+	void EndScene() // advances past the elements just uploaded
+	{
+		m_start += m_count;
+		m_count = 0;
+	}
+
+	void Draw(GLenum mode) // non-indexed draw of the whole current upload
+	{
+		glDrawArrays(mode, m_start, m_count);
+	}
+
+	void Draw(GLenum mode, int offset, int count) // non-indexed draw of a sub-range of the current upload
+	{
+		glDrawArrays(mode, m_start + offset, count);
+	}
+
+
+	void Draw(GLenum mode, GLint basevertex) // indexed draw of the whole current upload (32-bit indices)
+	{
+		glDrawElementsBaseVertex(mode, m_count, GL_UNSIGNED_INT, (void*)(m_start * STRIDE), basevertex);
+	}
+
+	void Draw(GLenum mode, GLint basevertex, int offset, int count) // indexed draw of a sub-range of the current upload
+	{
+		glDrawElementsBaseVertex(mode, count, GL_UNSIGNED_INT, (void*)((m_start + offset) * STRIDE), basevertex);
+	}
+
+	size_t GetStart() { return m_start; }
+
+};
+
+class GSVertexBufferStateOGL { // owns one vertex array object plus its vertex and index buffers
+	GSBufferOGL<sizeof(GSVertexPT1)> *m_vb;
+	GSBufferOGL<sizeof(uint32)> *m_ib;
+
+	GLuint m_va;
+	GLenum m_topology; // primitive mode used by the Draw* helpers
+
+	// Not copyable: the destructor deletes m_vb / m_ib and the VAO, so a copy would free them twice
+	GSVertexBufferStateOGL(const GSVertexBufferStateOGL& ) = delete;
+	GSVertexBufferStateOGL& operator = (const GSVertexBufferStateOGL& ) = delete;
+public:
+	GSVertexBufferStateOGL(GSInputLayoutOGL* layout, uint32 layout_nbr) : m_vb(NULL), m_ib(NULL), m_topology(0)
+	{
+		glGenVertexArrays(1, &m_va);
+		glBindVertexArray(m_va);
+
+		m_vb = new GSBufferOGL<sizeof(GSVertexPT1)>(GL_ARRAY_BUFFER);
+		m_ib = new GSBufferOGL<sizeof(uint32)>(GL_ELEMENT_ARRAY_BUFFER);
+
+		m_vb->bind();
+		m_ib->bind();
+
+		m_vb->allocate();
+		m_ib->allocate();
+		set_internal_format(layout, layout_nbr);
+	}
+
+	void bind()
+	{
+		// Note: the index array is part of the VA state, so it needs to be bound only once.
+		glBindVertexArray(m_va);
+		if (m_vb)
+			m_vb->bind();
+	}
+
+	void set_internal_format(GSInputLayoutOGL* layout, uint32 layout_nbr) // enables and describes attributes 0..layout_nbr-1
+	{
+		for (uint32 i = 0; i < layout_nbr; i++) {
+			// Note: this function needs both a vertex array object and a GL_ARRAY_BUFFER buffer
+			glEnableVertexAttribArray(i);
+			switch (layout[i].type) {
+				case GL_UNSIGNED_SHORT:
+				case GL_UNSIGNED_INT:
+					if (layout[i].normalize) {
+						glVertexAttribPointer(i, layout[i].size, layout[i].type, layout[i].normalize,  layout[i].stride, layout[i].offset);
+					} else {
+						// Rule: when the shader uses integral (not normalized) inputs you must use glVertexAttribIPointer (note the extra I)
+						glVertexAttribIPointer(i, layout[i].size, layout[i].type, layout[i].stride, layout[i].offset);
+					}
+					break;
+				default:
+					glVertexAttribPointer(i, layout[i].size, layout[i].type, layout[i].normalize,  layout[i].stride, layout[i].offset);
+					break;
+			}
+		}
+	}
+
+	void EndScene()
+	{
+		m_vb->EndScene();
+		m_ib->EndScene();
+	}
+
+	void DrawPrimitive() { m_vb->Draw(m_topology); }
+
+	void DrawPrimitive(int offset, int count) { m_vb->Draw(m_topology, offset, count); }
+
+	void DrawIndexedPrimitive() { m_ib->Draw(m_topology, m_vb->GetStart() ); } // the vertex buffer start is the base vertex
+
+	void DrawIndexedPrimitive(int offset, int count) { m_ib->Draw(m_topology, m_vb->GetStart(), offset, count ); }
+
+	void SetTopology(GLenum topology) { m_topology = topology; }
+
+	void UploadVB(const void* vertices, size_t count) { m_vb->upload(vertices, count); }
+
+	void UploadIB(const void* index, size_t count) {
+		m_ib->upload(index, count);
+	}
+
+	~GSVertexBufferStateOGL()
+	{
+		glDeleteVertexArrays(1, &m_va);
+		delete m_vb;
+		delete m_ib;
+	}
+
+};
diff --git a/plugins/GSdx_legacy/GSVertexHW.h b/plugins/GSdx_legacy/GSVertexHW.h
new file mode 100644
index 0000000000..e0fe308b62
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexHW.h
@@ -0,0 +1,40 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GS.h"
+#include "GSVector.h"
+
+#pragma pack(push, 1)
+
+__aligned(struct, 32) GSVertexHW9 // hardware-renderer vertex: two four-float vectors, t then p
+{
+	GSVector4 t; 
+	GSVector4 p;
+
+	// t.z = union {struct {uint8 r, g, b, a;}; uint32 c0;};
+	// t.w = union {struct {uint8 ta0, ta1, res, f;}; uint32 c1;}
+	// The source is taken by const reference so const objects and temporaries can be assigned too.
+	GSVertexHW9& operator = (const GSVertexHW9& v) {t = v.t; p = v.p; return *this;}
+};
+
+#pragma pack(pop)
diff --git a/plugins/GSdx_legacy/GSVertexList.cpp b/plugins/GSdx_legacy/GSVertexList.cpp
new file mode 100644
index 0000000000..f6477c3ae6
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexList.cpp
@@ -0,0 +1,23 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSVertexList.h"
diff --git a/plugins/GSdx_legacy/GSVertexList.h b/plugins/GSdx_legacy/GSVertexList.h
new file mode 100644
index 0000000000..bd0b016d4c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexList.h
@@ -0,0 +1,87 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+template <class Vertex> class GSVertexList
+{
+	// Fixed-capacity list of countof(m_v) vertices kept in one 32-byte aligned allocation.
+	void* m_base;   // raw aligned storage; NOTE(review): no copy ctor/assignment is declared, so copying an instance would free this twice
+	Vertex* m_v[3]; // slot pointers into m_base; RemoveAt() permutes these instead of moving vertex data
+	int m_count;    // number of slots currently handed out
+
+public:
+	GSVertexList() : m_count(0)
+	{
+		m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 32);
+
+		for(size_t i = 0; i < countof(m_v); i++)
+		{
+			m_v[i] = &((Vertex*)m_base)[i];
+		}
+	}
+
+	virtual ~GSVertexList()
+	{
+		_aligned_free(m_base);
+	}
+
+	void RemoveAll()
+	{
+		m_count = 0;
+	}
+
+	__forceinline Vertex& AddTail() // returns the next slot and advances the count
+	{
+		ASSERT(m_count < (int)countof(m_v)); // capacity follows the array size instead of a hard-coded 3
+
+		return *m_v[m_count++];
+	}
+
+	__forceinline void RemoveAt(int pos, int keep) // moves slot 'pos' behind the 'keep' (1 or 2) following slots, then sets the count to pos + keep
+	{
+		if(keep == 1)
+		{
+			Vertex* tmp = m_v[pos + 0];
+			m_v[pos + 0] = m_v[pos + 1];
+			m_v[pos + 1] = tmp;
+		}
+		else if(keep == 2)
+		{
+			Vertex* tmp = m_v[pos + 0];
+			m_v[pos + 0] = m_v[pos + 1];
+			m_v[pos + 1] = m_v[pos + 2];
+			m_v[pos + 2] = tmp;
+		}
+
+		m_count = pos + keep;
+	}
+
+	__forceinline void GetAt(int i, Vertex& v) const // copies slot i into v; i is not range-checked
+	{
+		v = *m_v[i];
+	}
+
+	int GetCount() const
+	{
+		return m_count;
+	}
+};
diff --git a/plugins/GSdx_legacy/GSVertexSW.cpp b/plugins/GSdx_legacy/GSVertexSW.cpp
new file mode 100644
index 0000000000..2b286741f7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexSW.cpp
@@ -0,0 +1,23 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSVertexSW.h"
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSVertexSW.h b/plugins/GSdx_legacy/GSVertexSW.h
new file mode 100644
index 0000000000..0d977658d7
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexSW.h
@@ -0,0 +1,266 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSVector.h"
+
+__aligned(struct, 32) GSVertexSW
+{
+	GSVector4 p, _pad, t, c; // only p, t and c take part in the operators below; _pad is never read or written in this struct
+
+	__forceinline GSVertexSW() {}
+	__forceinline GSVertexSW(const GSVertexSW& v) {*this = v;} // forwards to operator =, so _pad is not copied
+
+	__forceinline static GSVertexSW zero() // returns a vertex whose p, t and c are zero (_pad is left unset)
+	{
+		GSVertexSW v;
+
+		v.p = GSVector4::zero();
+		v.t = GSVector4::zero();
+		v.c = GSVector4::zero();
+
+		return v;
+	}
+	__forceinline void operator = (const GSVertexSW& v) // copies p, t and c; returns void, so assignments cannot be chained
+	{
+		p = v.p; 
+		t = v.t;
+		c = v.c; 
+	}
+	
+	__forceinline void operator += (const GSVertexSW& v) 
+	{
+		p += v.p; 
+		t += v.t;
+		c += v.c; 
+	}
+
+	__forceinline friend GSVertexSW operator + (const GSVertexSW& a, const GSVertexSW& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p + b.p;
+		v.t = a.t + b.t;
+		v.c = a.c + b.c;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator - (const GSVertexSW& a, const GSVertexSW& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p - b.p;
+		v.t = a.t - b.t;
+		v.c = a.c - b.c;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator * (const GSVertexSW& a, const GSVector4& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p * b;
+		v.t = a.t * b;
+		v.c = a.c * b;
+
+		return v;
+	}
+
+	__forceinline friend GSVertexSW operator / (const GSVertexSW& a, const GSVector4& b)
+	{
+		GSVertexSW v;
+
+		v.p = a.p / b;
+		v.t = a.t / b;
+		v.c = a.c / b;
+
+		return v;
+	}
+
+	static bool IsQuad(const GSVertexSW* v, int& tl, int& br) // reads v[0]..v[5]; on success tl is an index in 0..2 and br an index in 3..5
+	{
+		GSVector4 v0 = v[0].p.xyxy(v[0].t); // presumably p.xy and t.xy packed together (cf. the zwzw note further down) -- TODO confirm
+		GSVector4 v1 = v[1].p.xyxy(v[1].t);
+		GSVector4 v2 = v[2].p.xyxy(v[2].t);
+
+		GSVector4 v01 = v0 == v1;
+		GSVector4 v12 = v1 == v2;
+		GSVector4 v02 = v0 == v2;
+
+		GSVector4 vtl, vbr;
+
+		GSVector4 test;
+
+		int i;
+
+		if(v12.allfalse()) // v1 and v2 share no component, so v0 is taken as the corner candidate
+		{
+			test = (v01 ^ v02) & (v01 ^ v02.zwxy());
+			vtl = v0;
+			vbr = v1 + (v2 - v0);
+			i = 0;
+		}
+		else if(v02.allfalse())
+		{
+			test = (v01 ^ v12) & (v01 ^ v12.zwxy());
+			vtl = v1;
+			vbr = v0 + (v2 - v1);
+			i = 1;
+		}
+		else if(v01.allfalse())
+		{
+			test = (v02 ^ v12) & (v02 ^ v12.zwxy());
+			vtl = v2;
+			vbr = v0 + (v1 - v2);
+			i = 2;
+		}
+		else
+		{
+			return false;
+		}
+
+		if(!test.alltrue())
+		{
+			return false;
+		}
+
+		tl = i; // written before the second triangle is checked, so tl may be modified even when false is returned
+
+		GSVector4 v3 = v[3].p.xyxy(v[3].t);
+		GSVector4 v4 = v[4].p.xyxy(v[4].t);
+		GSVector4 v5 = v[5].p.xyxy(v[5].t);
+
+		GSVector4 v34 = v3 == v4;
+		GSVector4 v45 = v4 == v5;
+		GSVector4 v35 = v3 == v5;
+
+		if(v34.allfalse()) // same scheme for v3..v5, additionally matched against vtl/vbr from the first triangle
+		{
+			test = (v35 ^ v45) & (v35 ^ v45.zwxy()) & (vtl + v5 == v3 + v4) & (vbr == v5);
+			i = 5;
+		}
+		else if(v35.allfalse())
+		{
+			test = (v34 ^ v45) & (v34 ^ v45.zwxy()) & (vtl + v4 == v3 + v5) & (vbr == v4);
+			i = 4;
+		}
+		else if(v45.allfalse())
+		{
+			test = (v34 ^ v35) & (v34 ^ v35.zwxy()) & (vtl + v3 == v5 + v4) & (vbr == v3);
+			i = 3;
+		}
+		else
+		{
+			return false;
+		}
+
+		if(!test.alltrue())
+		{
+			return false;
+		}
+
+		br = i;
+
+		#if _M_SSE >= 0x500
+
+		{
+			// p.z, p.w, t.z, t.w, c.x, c.y, c.z, c.w
+
+			GSVector8 v0 = GSVector8(v[0].p.zwzw(v[0].t), v[0].c);
+			GSVector8 v1 = GSVector8(v[1].p.zwzw(v[1].t), v[1].c);
+			GSVector8 v2 = GSVector8(v[2].p.zwzw(v[2].t), v[2].c);
+			GSVector8 v3 = GSVector8(v[3].p.zwzw(v[3].t), v[3].c);
+			GSVector8 v4 = GSVector8(v[4].p.zwzw(v[4].t), v[4].c);
+			GSVector8 v5 = GSVector8(v[5].p.zwzw(v[5].t), v[5].c);
+
+			GSVector8 test = ((v0 == v1) & (v0 == v2)) & ((v0 == v3) & (v0 == v4)) & (v0 == v5); // all six vertices must agree on every remaining component
+
+			return test.alltrue();
+		}
+		
+		#else
+
+		v0 = v[0].p.zwzw(v[0].t); // first pass: p.z, p.w, t.z, t.w (same components as the GSVector8 path above)
+		v1 = v[1].p.zwzw(v[1].t);
+		v2 = v[2].p.zwzw(v[2].t);
+		v3 = v[3].p.zwzw(v[3].t);
+		v4 = v[4].p.zwzw(v[4].t);
+		v5 = v[5].p.zwzw(v[5].t);
+
+		test = ((v0 == v1) & (v0 == v2)) & ((v0 == v3) & (v0 == v4)) & (v0 == v5);
+
+		if(!test.alltrue())
+		{
+			return false;
+		}
+
+		v0 = v[0].c; // second pass: the c members
+		v1 = v[1].c;
+		v2 = v[2].c;
+		v3 = v[3].c;
+		v4 = v[4].c;
+		v5 = v[5].c;
+
+		test = ((v0 == v1) & (v0 == v2)) & ((v0 == v3) & (v0 == v4)) & (v0 == v5);
+
+		if(!test.alltrue())
+		{
+			return false;
+		}
+
+		return true;
+
+		#endif
+	}
+};
+
+#if _M_SSE >= 0x501
+
+__aligned(struct, 32) GSVertexSW2
+{
+	GSVector4 p, _pad; // _pad is never read or written by the members below
+	GSVector8 tc;
+
+	__forceinline GSVertexSW2() {}
+	__forceinline GSVertexSW2(const GSVertexSW2& src) {p = src.p; tc = src.tc;}
+
+	__forceinline void operator = (const GSVertexSW2& src) // copies p and tc only; _pad keeps its previous contents
+	{
+		tc = src.tc;
+		p = src.p;
+	}
+
+	__forceinline friend GSVertexSW2 operator - (const GSVertexSW2& lhs, const GSVertexSW2& rhs)
+	{
+		// component-wise difference of p and tc; _pad of the result is left unset
+		GSVertexSW2 diff;
+		diff.tc = lhs.tc - rhs.tc;
+		diff.p = lhs.p - rhs.p;
+
+		return diff;
+	}
+};
+
+#endif
+
diff --git a/plugins/GSdx_legacy/GSVertexTrace.cpp b/plugins/GSdx_legacy/GSVertexTrace.cpp
new file mode 100644
index 0000000000..9ef65d86bb
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexTrace.cpp
@@ -0,0 +1,487 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSVertexTrace.h"
+#include "GSUtil.h"
+#include "GSState.h"
+
+const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX); // seeds for FindMinMax: .x starts the running minimum, .y the running maximum
+
+GSVertexTrace::GSVertexTrace(const GSState* state)
+	: m_state(state)
+	, m_primclass(GS_INVALID_CLASS)
+{
+	memset(&m_alpha, 0, sizeof(m_alpha));
+	// One m_fmm entry per FindMinMax instantiation, indexed [color][fst][tme][iip][primclass].
+	#define InitUpdate3(P, IIP, TME, FST, COLOR) \
+		m_fmm[COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax<P, IIP, TME, FST, COLOR>;
+	// All four FST/COLOR combinations for a given P, IIP, TME.
+	#define InitUpdate2(P, IIP, TME) \
+		InitUpdate3(P, IIP, TME, 0, 0) \
+		InitUpdate3(P, IIP, TME, 0, 1) \
+		InitUpdate3(P, IIP, TME, 1, 0) \
+		InitUpdate3(P, IIP, TME, 1, 1)
+	// All four IIP/TME combinations for a given primitive class.
+	#define InitUpdate(P) \
+		InitUpdate2(P, 0, 0) \
+		InitUpdate2(P, 0, 1) \
+		InitUpdate2(P, 1, 0) \
+		InitUpdate2(P, 1, 1)
+
+	InitUpdate(GS_POINT_CLASS);
+	InitUpdate(GS_LINE_CLASS);
+	InitUpdate(GS_TRIANGLE_CLASS);
+	InitUpdate(GS_SPRITE_CLASS);
+}
+
+void GSVertexTrace::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass)
+{
+	const uint32 iip = m_state->PRIM->IIP;
+	const uint32 tme = m_state->PRIM->TME;
+	const uint32 fst = m_state->PRIM->FST;
+	const uint32 color = !m_state->PRIM->TME || m_state->m_context->TEX0.TFX != TFX_DECAL || !m_state->m_context->TEX0.TCC;
+
+	m_primclass = primclass;
+	(this->*m_fmm[color][fst][tme][iip][primclass])(vertex, index, count);
+
+	// One "min == max" bit per component: c mask in the low 16 bits, p mask from bit 16, t mask from bit 20.
+	m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
+	m_alpha.valid = false;
+
+	if(!m_state->PRIM->TME) return; // filter flags and LOD are only refreshed while TME is set
+
+	const GIFRegTEX1& TEX1 = m_state->m_context->TEX1;
+
+	m_filter.mmag = TEX1.IsMagLinear();
+	m_filter.mmin = TEX1.IsMinLinear();
+
+	if(TEX1.MXL == 0) // MXL == 0 => MMIN ignored, tested it on ps2
+	{
+		m_filter.linear = m_filter.mmag;
+		return;
+	}
+
+	const float K = (float)TEX1.K / 16;
+
+	if(TEX1.LCM != 0 || m_state->PRIM->FST != 0) // FST == 1 => Q is not interpolated
+	{
+		m_lod.x = K;
+		m_lod.y = K;
+	}
+	else
+	{
+		// LOD = log2(1/|Q|) * (1 << L) + K
+		GSVector4::storel(&m_lod, m_max.t.uph(m_min.t).log2(3).neg() * (float)(1 << TEX1.L) + K);
+		if(m_lod.x > m_lod.y)
+		{
+			const float lo = m_lod.y;
+			m_lod.y = m_lod.x;
+			m_lod.x = lo;
+		}
+	}
+
+	if(m_lod.y <= 0)
+	{
+		m_filter.linear = m_filter.mmag;
+	}
+	else if(m_lod.x > 0)
+	{
+		m_filter.linear = m_filter.mmin;
+	}
+	else
+	{
+		m_filter.linear = m_filter.mmag | m_filter.mmin;
+	}
+}
+
+template<GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
+void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int count)
+{
+	const GSDrawingContext* context = m_state->m_context;
+	// Walks 'count' indices, n per primitive, and stores the component-wise min/max of position, texture and color in m_min/m_max.
+	int n = 1; // number of indices consumed per primitive
+
+	switch(primclass)
+	{
+	case GS_POINT_CLASS:
+		n = 1;
+		break;
+	case GS_LINE_CLASS:
+	case GS_SPRITE_CLASS:
+		n = 2;
+		break;
+	case GS_TRIANGLE_CLASS:
+		n = 3;
+		break;
+	}
+
+	GSVector4 tmin = s_minmax.xxxx(); // running accumulators: minima start from s_minmax.x, maxima from s_minmax.y
+	GSVector4 tmax = s_minmax.yyyy();
+	GSVector4i cmin = GSVector4i::xffffffff();
+	GSVector4i cmax = GSVector4i::zero();
+
+	#if _M_SSE >= 0x401
+
+	GSVector4i pmin = GSVector4i::xffffffff(); // this path accumulates positions as unsigned integers (min_u32/max_u32 below)
+	GSVector4i pmax = GSVector4i::zero();
+
+	#else
+
+	GSVector4 pmin = s_minmax.xxxx(); // fallback path accumulates positions as floats
+	GSVector4 pmax = s_minmax.yyyy();
+	
+	#endif
+
+	const GSVertex* RESTRICT v = (GSVertex*)vertex;
+
+	for(int i = 0; i < count; i += n)
+	{
+		if(primclass == GS_POINT_CLASS)
+		{
+			GSVector4i c(v[index[i]].m[0]); // m[0] is used both as the color source and, reinterpreted as floats, as the STQ source
+
+			if(color)
+			{
+				cmin = cmin.min_u8(c);
+				cmax = cmax.max_u8(c);
+			}
+
+			if(tme)
+			{
+				if(!fst)
+				{
+					GSVector4 stq = GSVector4::cast(c);
+
+					GSVector4 q = stq.wwww();
+
+					stq = (stq.xyww() * q.rcpnr()).xyww(q); // presumably S/Q and T/Q, with Q kept in the upper lanes -- TODO confirm rcpnr semantics
+
+					tmin = tmin.min(stq);
+					tmax = tmax.max(stq);
+				}
+				else
+				{
+					GSVector4i uv(v[index[i]].m[1]);
+
+					GSVector4 st = GSVector4(uv.uph16()).xyxy();
+
+					tmin = tmin.min(st);
+					tmax = tmax.max(st);
+				}
+			}
+
+			GSVector4i xyzf(v[index[i]].m[1]);
+
+			GSVector4i xy = xyzf.upl16();
+			GSVector4i z = xyzf.yyyy();
+
+			#if _M_SSE >= 0x401
+
+			GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf));
+
+			pmin = pmin.min_u32(p);
+			pmax = pmax.max_u32(p);
+
+			#else
+
+			GSVector4 p = GSVector4(xy.upl64(z.srl32(1).upl32(xyzf.wwww())));
+
+			pmin = pmin.min(p);
+			pmax = pmax.max(p);
+
+			#endif
+		}
+		else if(primclass == GS_LINE_CLASS)
+		{
+			GSVector4i c0(v[index[i + 0]].m[0]);
+			GSVector4i c1(v[index[i + 1]].m[0]);
+
+			if(color)
+			{
+				if(iip)
+				{
+					cmin = cmin.min_u8(c0.min_u8(c1));
+					cmax = cmax.max_u8(c0.max_u8(c1));
+				}
+				else
+				{
+					cmin = cmin.min_u8(c1); // iip == 0: only the last vertex's color contributes
+					cmax = cmax.max_u8(c1);
+				}
+			}
+
+			if(tme)
+			{
+				if(!fst)
+				{
+					GSVector4 stq0 = GSVector4::cast(c0);
+					GSVector4 stq1 = GSVector4::cast(c1);
+
+					GSVector4 q = stq0.wwww(stq1).rcpnr();
+
+					stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
+					stq1 = (stq1.xyww() * q.zzzz()).xyww(stq1);
+
+					tmin = tmin.min(stq0.min(stq1));
+					tmax = tmax.max(stq0.max(stq1));
+				}
+				else
+				{
+					GSVector4i uv0(v[index[i + 0]].m[1]);
+					GSVector4i uv1(v[index[i + 1]].m[1]);
+
+					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
+					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
+
+					tmin = tmin.min(st0.min(st1));
+					tmax = tmax.max(st0.max(st1));
+				}
+			}
+
+			GSVector4i xyzf0(v[index[i + 0]].m[1]);
+			GSVector4i xyzf1(v[index[i + 1]].m[1]);
+
+			GSVector4i xy0 = xyzf0.upl16();
+			GSVector4i z0 = xyzf0.yyyy();
+			GSVector4i xy1 = xyzf1.upl16();
+			GSVector4i z1 = xyzf1.yyyy();
+
+			#if _M_SSE >= 0x401
+
+			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
+			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
+
+			pmin = pmin.min_u32(p0.min_u32(p1));
+			pmax = pmax.max_u32(p0.max_u32(p1));
+
+			#else
+
+			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
+			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
+
+			pmin = pmin.min(p0.min(p1));
+			pmax = pmax.max(p0.max(p1));
+
+			#endif
+		}
+		else if(primclass == GS_TRIANGLE_CLASS)
+		{
+			GSVector4i c0(v[index[i + 0]].m[0]);
+			GSVector4i c1(v[index[i + 1]].m[0]);
+			GSVector4i c2(v[index[i + 2]].m[0]);
+
+			if(color)
+			{
+				if(iip)
+				{
+					cmin = cmin.min_u8(c2).min_u8(c0.min_u8(c1));
+					cmax = cmax.max_u8(c2).max_u8(c0.max_u8(c1));
+				}
+				else
+				{
+					cmin = cmin.min_u8(c2); // iip == 0: only the last vertex's color contributes
+					cmax = cmax.max_u8(c2);
+				}
+			}
+
+			if(tme)
+			{
+				if(!fst)
+				{
+					GSVector4 stq0 = GSVector4::cast(c0);
+					GSVector4 stq1 = GSVector4::cast(c1);
+					GSVector4 stq2 = GSVector4::cast(c2);
+
+					GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr(); // q.x, q.y, q.z are used for vertex 0, 1, 2 respectively below
+
+					stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
+					stq1 = (stq1.xyww() * q.yyyy()).xyww(stq1);
+					stq2 = (stq2.xyww() * q.zzzz()).xyww(stq2);
+
+					tmin = tmin.min(stq2).min(stq0.min(stq1));
+					tmax = tmax.max(stq2).max(stq0.max(stq1));
+				}
+				else
+				{
+					GSVector4i uv0(v[index[i + 0]].m[1]);
+					GSVector4i uv1(v[index[i + 1]].m[1]);
+					GSVector4i uv2(v[index[i + 2]].m[1]);
+
+					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
+					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
+					GSVector4 st2 = GSVector4(uv2.uph16()).xyxy();
+
+					tmin = tmin.min(st2).min(st0.min(st1));
+					tmax = tmax.max(st2).max(st0.max(st1));
+				}
+			}
+
+			GSVector4i xyzf0(v[index[i + 0]].m[1]);
+			GSVector4i xyzf1(v[index[i + 1]].m[1]);
+			GSVector4i xyzf2(v[index[i + 2]].m[1]);
+
+			GSVector4i xy0 = xyzf0.upl16();
+			GSVector4i z0 = xyzf0.yyyy();
+			GSVector4i xy1 = xyzf1.upl16();
+			GSVector4i z1 = xyzf1.yyyy();
+			GSVector4i xy2 = xyzf2.upl16();
+			GSVector4i z2 = xyzf2.yyyy();
+
+			#if _M_SSE >= 0x401
+
+			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
+			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
+			GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2));
+
+			pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1));
+			pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1));
+
+			#else
+
+			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
+			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
+			GSVector4 p2 = GSVector4(xy2.upl64(z2.srl32(1).upl32(xyzf2.wwww())));
+
+			pmin = pmin.min(p2).min(p0.min(p1));
+			pmax = pmax.max(p2).max(p0.max(p1));
+
+			#endif
+		}
+		else if(primclass == GS_SPRITE_CLASS)
+		{
+			GSVector4i c0(v[index[i + 0]].m[0]);
+			GSVector4i c1(v[index[i + 1]].m[0]);
+
+			if(color)
+			{
+				if(iip)
+				{
+					cmin = cmin.min_u8(c0.min_u8(c1));
+					cmax = cmax.max_u8(c0.max_u8(c1));
+				}
+				else
+				{
+					cmin = cmin.min_u8(c1); // iip == 0: only the last vertex's color contributes
+					cmax = cmax.max_u8(c1);
+				}
+			}
+
+			if(tme)
+			{
+				if(!fst)
+				{
+					GSVector4 stq0 = GSVector4::cast(c0);
+					GSVector4 stq1 = GSVector4::cast(c1);
+
+					GSVector4 q = stq1.wwww().rcpnr(); // both sprite vertices are scaled by the second vertex's q
+
+					stq0 = (stq0.xyww() * q).xyww(stq1);
+					stq1 = (stq1.xyww() * q).xyww(stq1);
+
+					tmin = tmin.min(stq0.min(stq1));
+					tmax = tmax.max(stq0.max(stq1));
+				}
+				else
+				{
+					GSVector4i uv0(v[index[i + 0]].m[1]);
+					GSVector4i uv1(v[index[i + 1]].m[1]);
+
+					GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
+					GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
+
+					tmin = tmin.min(st0.min(st1));
+					tmax = tmax.max(st0.max(st1));
+				}
+			}
+
+			GSVector4i xyzf0(v[index[i + 0]].m[1]);
+			GSVector4i xyzf1(v[index[i + 1]].m[1]);
+
+			GSVector4i xy0 = xyzf0.upl16();
+			GSVector4i z0 = xyzf0.yyyy();
+			GSVector4i xy1 = xyzf1.upl16();
+			GSVector4i z1 = xyzf1.yyyy();
+
+			#if _M_SSE >= 0x401
+
+			GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1)); // NOTE(review): mixes in xyzf1, unlike the line/triangle paths (xyzf0) -- consistent with the q handling above, confirm it is intended
+			GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
+
+			pmin = pmin.min_u32(p0.min_u32(p1));
+			pmax = pmax.max_u32(p0.max_u32(p1));
+
+			#else
+
+			GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf1.wwww()))); // NOTE(review): xyzf1 here as well, see the SSE4.1 path
+			GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
+
+			pmin = pmin.min(p0.min(p1));
+			pmax = pmax.max(p0.max(p1));
+
+			#endif
+		}
+	}
+
+	#if _M_SSE >= 0x401
+
+	pmin = pmin.blend16<0x30>(pmin.srl32(1)); // halves one lane so it matches the srl32(1) the float path applies to z per vertex
+	pmax = pmax.blend16<0x30>(pmax.srl32(1));
+
+	#endif
+
+	GSVector4 o(context->XYOFFSET);
+	GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f); // x/y scaled by 1/16, z doubled back after the shift above, w unchanged
+
+	m_min.p = (GSVector4(pmin) - o) * s;
+	m_max.p = (GSVector4(pmax) - o) * s;
+
+	if(tme)
+	{
+		if(fst)
+		{
+			s = GSVector4(1.0f / 16, 1.0f).xxyy();
+		}
+		else
+		{
+			s = GSVector4(1 << context->TEX0.TW, 1 << context->TEX0.TH, 1, 1); // scale by the texture size taken from TEX0.TW/TH
+		}
+
+		m_min.t = tmin * s;
+		m_max.t = tmax * s;
+	}
+	else
+	{
+		m_min.t = GSVector4::zero();
+		m_max.t = GSVector4::zero();
+	}
+
+	if(color)
+	{
+		m_min.c = cmin.zzzz().u8to32();
+		m_max.c = cmax.zzzz().u8to32();
+	}
+	else
+	{
+		m_min.c = GSVector4i::zero(); // color not tracked for this instantiation: min == max == 0
+		m_max.c = GSVector4i::zero();
+	}
+}
diff --git a/plugins/GSdx_legacy/GSVertexTrace.h b/plugins/GSdx_legacy/GSVertexTrace.h
new file mode 100644
index 0000000000..b3ee0b73e6
--- /dev/null
+++ b/plugins/GSdx_legacy/GSVertexTrace.h
@@ -0,0 +1,78 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSDrawingContext.h"
+#include "GSVertex.h"
+#include "GSVertexSW.h"
+#include "GSVertexHW.h"
+#include "GSFunctionMap.h"
+
+class GSState;
+
+__aligned(class, 32) GSVertexTrace : public GSAlignedClass<32>
+{
+public:
+	struct Vertex {GSVector4i c; GSVector4 p, t;}; // c is an integer vector, p and t are float vectors
+	struct VertexAlpha {int min, max; bool valid;};
+
+protected:
+	const GSState* m_state; // its PRIM and m_context members are read by Update() and FindMinMax()
+
+	static const GSVector4 s_minmax;
+
+	typedef void (GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const uint32* index, int count);
+
+	FindMinMaxPtr m_fmm[2][2][2][2][4]; // filled by the constructor, indexed as [color][fst][tme][iip][primclass]
+
+	template<GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
+	void FindMinMax(const void* vertex, const uint32* index, int count);
+
+public:
+	GS_PRIM_CLASS m_primclass; // primitive class passed to the last Update() call
+
+	Vertex m_min; // component-wise extremes written by FindMinMax()
+	Vertex m_max;
+	VertexAlpha m_alpha; // source alpha range after tfx, GSRenderer::GetAlphaMinMax() updates it
+
+	union
+	{
+		uint32 value; // set in Update(): one bit per component, 1 where m_min and m_max compare equal
+		struct {uint32 r:4, g:4, b:4, a:4, x:1, y:1, z:1, f:1, s:1, t:1, q:1, _pad:1;};
+		struct {uint32 rgba:16, xyzf:4, stq:4;};
+	} m_eq;
+
+	union 
+	{
+		struct {uint32 mmag:1, mmin:1, linear:1;}; // linear is derived from mmag/mmin and m_lod in Update()
+	} m_filter;
+
+	GSVector2 m_lod; // x = min, y = max
+
+public:
+	GSVertexTrace(const GSState* state);
+	virtual ~GSVertexTrace() {}
+
+	void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
+
+	bool IsLinear() const {return m_filter.linear;}
+};
diff --git a/plugins/GSdx_legacy/GSWnd.cpp b/plugins/GSdx_legacy/GSWnd.cpp
new file mode 100644
index 0000000000..ebc230f19a
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWnd.cpp
@@ -0,0 +1,177 @@
+/*
+ *	Copyright (C) 2011-2014 Gregory hainaut
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWnd.h"
+
+void GSWndGL::PopulateGlFunction()
+{
+	*(void**)&(gl_ActiveTexture) = GetProcAddress("glActiveTexture");
+	*(void**)&(gl_BlendColor) = GetProcAddress("glBlendColor");
+
+	// Load extra (optional) function pointer
+#define GL_EXT_LOAD_OPT(ext) *(void**)&(ext) = GetProcAddress(#ext, true)
+	// Load mandatory function pointer. NOTE(review): expands identically to GL_EXT_LOAD_OPT (both pass true) -- confirm whether the second argument should differ here
+#define GL_EXT_LOAD(ext)     *(void**)&(ext) = GetProcAddress(#ext, true)
+
+	GL_EXT_LOAD(glBlendEquationSeparate);
+	GL_EXT_LOAD(glBlendFuncSeparate);
+	GL_EXT_LOAD(glAttachShader);
+	GL_EXT_LOAD(glBindBuffer);
+	GL_EXT_LOAD(glBindBufferBase);
+	GL_EXT_LOAD(glBindBufferRange);
+	GL_EXT_LOAD(glBindFramebuffer);
+	GL_EXT_LOAD(glBindSampler);
+	GL_EXT_LOAD(glBindVertexArray);
+	GL_EXT_LOAD(glBlitFramebuffer);
+	GL_EXT_LOAD(glBufferData);
+	GL_EXT_LOAD(glCheckFramebufferStatus);
+	GL_EXT_LOAD(glClearBufferfv);
+	GL_EXT_LOAD(glClearBufferiv);
+	GL_EXT_LOAD(glClearBufferuiv);
+	GL_EXT_LOAD(glColorMaski);
+	GL_EXT_LOAD(glDeleteBuffers);
+	GL_EXT_LOAD(glDeleteFramebuffers);
+	GL_EXT_LOAD(glDeleteSamplers);
+	GL_EXT_LOAD(glDeleteVertexArrays);
+	GL_EXT_LOAD(glDetachShader);
+	GL_EXT_LOAD(glDrawBuffers);
+	GL_EXT_LOAD(glDrawElementsBaseVertex);
+	GL_EXT_LOAD(glEnableVertexAttribArray);
+	GL_EXT_LOAD(glFramebufferRenderbuffer);
+	GL_EXT_LOAD(glFramebufferTexture2D);
+	GL_EXT_LOAD(glGenBuffers);
+	GL_EXT_LOAD(glGenFramebuffers);
+	GL_EXT_LOAD(glGenSamplers);
+	GL_EXT_LOAD(glGenVertexArrays);
+	GL_EXT_LOAD(glGetBufferParameteriv);
+	GL_EXT_LOAD(glGetDebugMessageLogARB);
+	GL_EXT_LOAD_OPT(glDebugMessageCallback);
+	GL_EXT_LOAD(glGetProgramInfoLog);
+	GL_EXT_LOAD(glGetProgramiv);
+	GL_EXT_LOAD(glGetShaderiv);
+	GL_EXT_LOAD(glGetStringi);
+	GL_EXT_LOAD(glIsFramebuffer);
+	GL_EXT_LOAD(glMapBuffer);
+	GL_EXT_LOAD(glMapBufferRange);
+	GL_EXT_LOAD(glProgramParameteri);
+	GL_EXT_LOAD(glSamplerParameterf);
+	GL_EXT_LOAD(glSamplerParameteri);
+	GL_EXT_LOAD(glShaderSource);
+	GL_EXT_LOAD(glUniform1i);
+	GL_EXT_LOAD(glUnmapBuffer);
+	GL_EXT_LOAD(glVertexAttribIPointer);
+	GL_EXT_LOAD(glVertexAttribPointer);
+	GL_EXT_LOAD(glBufferSubData);
+	GL_EXT_LOAD(glFenceSync);
+	GL_EXT_LOAD(glDeleteSync);
+	GL_EXT_LOAD(glClientWaitSync);
+	GL_EXT_LOAD(glFlushMappedBufferRange);
+	// Query object
+	GL_EXT_LOAD(glBeginQuery);
+	GL_EXT_LOAD(glEndQuery);
+	GL_EXT_LOAD(glGetQueryiv);
+	GL_EXT_LOAD(glGetQueryObjectiv);
+	GL_EXT_LOAD(glGetQueryObjectuiv);
+	GL_EXT_LOAD(glQueryCounter);
+	GL_EXT_LOAD(glGetQueryObjecti64v);
+	GL_EXT_LOAD(glGetQueryObjectui64v);
+	GL_EXT_LOAD(glGetInteger64v);
+	// GL4.0
+	GL_EXT_LOAD_OPT(glBlendEquationSeparateiARB);
+	GL_EXT_LOAD_OPT(glBlendFuncSeparateiARB);
+	// GL4.1
+	GL_EXT_LOAD_OPT(glCreateShaderProgramv);
+	GL_EXT_LOAD_OPT(glBindProgramPipeline);
+	GL_EXT_LOAD_OPT(glDeleteProgramPipelines);
+	GL_EXT_LOAD_OPT(glGenProgramPipelines);
+	GL_EXT_LOAD_OPT(glGetProgramPipelineiv);
+	GL_EXT_LOAD_OPT(glGetProgramPipelineInfoLog);
+	GL_EXT_LOAD_OPT(glValidateProgramPipeline);
+	GL_EXT_LOAD_OPT(glUseProgramStages);
+	GL_EXT_LOAD_OPT(glProgramUniform1i); // but no GL4.2
+	GL_EXT_LOAD_OPT(glGetProgramBinary);
+	GL_EXT_LOAD_OPT(glViewportIndexedf);
+	GL_EXT_LOAD_OPT(glViewportIndexedfv);
+	GL_EXT_LOAD_OPT(glScissorIndexed);
+	GL_EXT_LOAD_OPT(glScissorIndexedv);
+	// NO GL4.1
+	GL_EXT_LOAD(glDeleteProgram);
+	GL_EXT_LOAD(glDeleteShader);
+	GL_EXT_LOAD(glCompileShader);
+	GL_EXT_LOAD(glCreateProgram);
+	GL_EXT_LOAD(glCreateShader);
+	GL_EXT_LOAD(glUseProgram);
+	GL_EXT_LOAD(glGetShaderInfoLog);
+	GL_EXT_LOAD(glLinkProgram);
+	// GL4.2
+	GL_EXT_LOAD_OPT(glBindImageTexture);
+	GL_EXT_LOAD_OPT(glMemoryBarrier);
+	GL_EXT_LOAD(glTexStorage2D);
+	// GL4.3
+	GL_EXT_LOAD_OPT(glCopyImageSubData);
+	GL_EXT_LOAD_OPT(glInvalidateTexImage);
+	GL_EXT_LOAD_OPT(glPushDebugGroup);
+	GL_EXT_LOAD_OPT(glPopDebugGroup);
+	GL_EXT_LOAD_OPT(glDebugMessageInsert);
+	GL_EXT_LOAD_OPT(glDebugMessageControl);
+	// GL4.4
+	GL_EXT_LOAD_OPT(glClearTexImage);
+	GL_EXT_LOAD_OPT(glBufferStorage);
+
+	// GL4.5
+	GL_EXT_LOAD_OPT(glCreateTextures);
+	GL_EXT_LOAD_OPT(glTextureStorage2D);
+	GL_EXT_LOAD_OPT(glTextureSubImage2D);
+	GL_EXT_LOAD_OPT(glCopyTextureSubImage2D);
+	GL_EXT_LOAD_OPT(glBindTextureUnit);
+	GL_EXT_LOAD_OPT(glGetTextureImage);
+	GL_EXT_LOAD_OPT(glTextureParameteri);
+
+	GL_EXT_LOAD_OPT(glCreateFramebuffers);
+	GL_EXT_LOAD_OPT(glClearNamedFramebufferfv);
+	GL_EXT_LOAD_OPT(glClearNamedFramebufferuiv);
+	GL_EXT_LOAD_OPT(glClearNamedFramebufferiv);
+	GL_EXT_LOAD_OPT(glNamedFramebufferTexture);
+	GL_EXT_LOAD_OPT(glNamedFramebufferDrawBuffers);
+	GL_EXT_LOAD_OPT(glNamedFramebufferReadBuffer);
+	GL_EXT_LOAD_OPT(glCheckNamedFramebufferStatus);
+
+	GL_EXT_LOAD_OPT(glCreateBuffers);
+	GL_EXT_LOAD_OPT(glNamedBufferStorage);
+	GL_EXT_LOAD_OPT(glNamedBufferData);
+	GL_EXT_LOAD_OPT(glNamedBufferSubData);
+	GL_EXT_LOAD_OPT(glMapNamedBuffer);
+	GL_EXT_LOAD_OPT(glMapNamedBufferRange);
+	GL_EXT_LOAD_OPT(glUnmapNamedBuffer);
+	GL_EXT_LOAD_OPT(glFlushMappedNamedBufferRange);
+
+	GL_EXT_LOAD_OPT(glCreateSamplers);
+	GL_EXT_LOAD_OPT(glCreateProgramPipelines);
+
+	GL_EXT_LOAD_OPT(glClipControl);
+	GL_EXT_LOAD_OPT(glTextureBarrier);
+
+	if (glCreateFramebuffers == NULL) { // the GL4.5 entry point was not found: install the Emulate_DSA replacements
+		Emulate_DSA::Init();
+	}
+}
diff --git a/plugins/GSdx_legacy/GSWnd.h b/plugins/GSdx_legacy/GSWnd.h
new file mode 100644
index 0000000000..fed7a278e2
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWnd.h
@@ -0,0 +1,90 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GSVector.h"
+
+class GSWnd // Abstract window interface; the pure virtual members are implemented once per windowing back end
+{
+protected:
+	bool m_managed; // set true when we're attached to a 3rd-party window that's managed by the emulator
+
+public:
+	GSWnd() : m_managed(false) {};
+	virtual ~GSWnd() {};
+
+	virtual bool Create(const string& title, int w, int h) = 0;
+	virtual bool Attach(void* handle, bool managed = true) = 0;
+	virtual void Detach() = 0;
+	bool IsManaged() const {return m_managed;}
+
+	virtual void* GetDisplay() = 0;
+	virtual void* GetHandle() = 0;
+	virtual GSVector4i GetClientRect() = 0;
+	virtual bool SetWindowText(const char* title) = 0;
+
+	virtual void AttachContext() {}; // no-op by default; GSWndGL redeclares it as pure virtual
+	virtual void DetachContext() {}; // no-op by default; GSWndGL redeclares it as pure virtual
+
+	virtual void Show() = 0;
+	virtual void Hide() = 0;
+	virtual void HideFrame() = 0;
+
+	virtual void Flip() {}; // no-op by default
+	virtual void SetVSync(bool enable) {}; // no-op by default
+
+};
+
+class GSWndGL : public GSWnd // GSWnd variant for OpenGL back ends: every operation, including the context ones, is pure virtual here
+{
+protected:
+	bool m_ctx_attached; // toggled by the AttachContext()/DetachContext() implementations in GSWndEGL and GSWndOGL
+
+	bool IsContextAttached() const { return m_ctx_attached; }
+
+public:
+	GSWndGL() : m_ctx_attached(false) {};
+	virtual ~GSWndGL() {};
+
+	virtual bool Create(const string& title, int w, int h) = 0;
+	virtual bool Attach(void* handle, bool managed = true) = 0;
+	virtual void Detach() = 0;
+
+	virtual void* GetDisplay() = 0;
+	virtual void* GetHandle() = 0;
+	virtual GSVector4i GetClientRect() = 0;
+	virtual bool SetWindowText(const char* title) = 0;
+
+	virtual void AttachContext() = 0;
+	virtual void DetachContext() = 0;
+	virtual void* GetProcAddress(const char* name, bool opt = false) = 0; // the EGL/GLX implementations throw on a missing symbol only when opt is false
+
+	virtual void Show() = 0;
+	virtual void Hide() = 0;
+	virtual void HideFrame() = 0;
+	virtual void Flip() = 0;
+	virtual void SetVSync(bool enable) = 0;
+
+	void PopulateGlFunction(); // non-virtual; the EGL/GLX Create() and Attach() call it after AttachContext()
+};
diff --git a/plugins/GSdx_legacy/GSWndDX.cpp b/plugins/GSdx_legacy/GSWndDX.cpp
new file mode 100644
index 0000000000..52750aa350
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndDX.cpp
@@ -0,0 +1,211 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWndDX.h"
+
+#ifdef _WIN32
+GSWndDX::GSWndDX() // starts with no window handle and with the frame flag set
+	: m_hWnd(NULL)
+	, m_frame(true)
+{
+}
+
+GSWndDX::~GSWndDX() // empty: the window is not destroyed here; in this file DestroyWindow() is only called from Detach()
+{
+}
+
+LRESULT CALLBACK GSWndDX::WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) // static window procedure: forwards each message to the owning GSWndDX instance
+{
+	GSWndDX* wnd = NULL;
+
+	if(message == WM_NCCREATE)
+	{
+		wnd = (GSWndDX*)((LPCREATESTRUCT)lParam)->lpCreateParams; // the `this` pointer that Create() passes as the last CreateWindow argument
+
+		SetWindowLongPtr(hWnd, GWLP_USERDATA, (LONG_PTR)wnd); // stored so the else-branch can recover the instance for later messages
+
+		wnd->m_hWnd = hWnd;
+	}
+	else
+	{
+		wnd = (GSWndDX*)GetWindowLongPtr(hWnd, GWLP_USERDATA);
+	}
+
+	if(wnd == NULL) // no instance recorded for this window: use default processing
+	{
+		return DefWindowProc(hWnd, message, wParam, lParam);
+	}
+
+	return wnd->OnMessage(message, wParam, lParam);
+}
+
+LRESULT GSWndDX::OnMessage(UINT message, WPARAM wParam, LPARAM lParam) // per-instance handler: WM_CLOSE hides the window, WM_DESTROY is swallowed, the rest goes to DefWindowProc
+{
+	switch(message)
+	{
+	case WM_CLOSE:
+		Hide();
+		// DestroyWindow(m_hWnd);
+		return 0;
+	case WM_DESTROY:
+		// This kills the emulator when GS is closed, which *really* isn't desired behavior,
+		// especially in STGS mode (worked in MTGS mode since it only quit the thread, but even
+		// that wasn't needed).
+		//PostQuitMessage(0);
+		return 0;
+	default:
+		break;
+	}
+
+	return DefWindowProc((HWND)m_hWnd, message, wParam, lParam);
+}
+
+bool GSWndDX::Create(const string& title, int w, int h) // creates a desktop-centred top-level window; returns false if one already exists or on failure
+{
+	if(m_hWnd) return false; // a window already exists for this object
+
+	m_managed = true;
+
+	WNDCLASS wc;
+
+	memset(&wc, 0, sizeof(wc));
+
+	wc.style = CS_HREDRAW | CS_VREDRAW | CS_DBLCLKS;
+	wc.lpfnWndProc = WndProc;
+	wc.hInstance = theApp.GetModuleHandle();
+	// TODO: wc.hIcon = ;
+	wc.hCursor = LoadCursor(NULL, IDC_ARROW);
+	wc.hbrBackground = (HBRUSH)GetStockObject(BLACK_BRUSH);
+	wc.lpszClassName = "GSWndDX";
+
+	if(!GetClassInfo(wc.hInstance, wc.lpszClassName, &wc)) // register the class only when GetClassInfo does not find it
+	{
+		if(!RegisterClass(&wc))
+		{
+			return false;
+		}
+	}
+
+	DWORD style = WS_CLIPCHILDREN | WS_CLIPSIBLINGS | WS_OVERLAPPEDWINDOW | WS_BORDER;
+
+	GSVector4i r;
+
+	GetWindowRect(GetDesktopWindow(), r);
+
+	bool remote = !!GetSystemMetrics(SM_REMOTESESSION);
+
+	if(w <= 0 || h <= 0 || remote) // no usable size given, or a remote session: derive the size from the desktop
+	{
+		w = r.width() / 3;
+		h = r.width() / 4; // both derived from the desktop width, so w:h is 4:3
+
+		if(!remote)
+		{
+			w *= 2;
+			h *= 2;
+		}
+	}
+
+	r.left = (r.left + r.right - w) / 2; // centre the w x h rectangle on the desktop rectangle
+	r.top = (r.top + r.bottom - h) / 2;
+	r.right = r.left + w;
+	r.bottom = r.top + h;
+
+	AdjustWindowRect(r, style, FALSE);
+
+	m_hWnd = CreateWindow(wc.lpszClassName, title.c_str(), style, r.left, r.top, r.width(), r.height(), NULL, NULL, wc.hInstance, (LPVOID)this); // `this` is picked up by WndProc on WM_NCCREATE
+
+	return m_hWnd != NULL;
+}
+
+bool GSWndDX::Attach(void* handle, bool managed) // adopts an existing HWND and records the managed flag; always returns true
+{
+	// TODO: subclass
+
+	m_hWnd = (HWND)handle;
+	m_managed = managed;
+
+	return true;
+}
+
+void GSWndDX::Detach() // destroys the window when it is managed, then forgets the handle
+{
+	if(m_hWnd && m_managed)
+	{
+		// close the window, since it's under GSdx care.  It's not taking messages anyway, and
+		// that means it's big, ugly, and in the way.
+
+		DestroyWindow(m_hWnd);
+	}
+
+	m_hWnd = NULL;
+	m_managed = true; // NOTE(review): the GSWnd constructor starts this at false; confirm that resetting to true is intended
+}
+
+GSVector4i GSWndDX::GetClientRect() // returns what the Win32 ::GetClientRect writes for m_hWnd; its result code is not checked
+{
+	GSVector4i r;
+
+	::GetClientRect(m_hWnd, r);
+
+	return r;
+}
+
+// Returns FALSE if the window has no title, or if the window title is under the strict
+// management of the emulator.
+
+bool GSWndDX::SetWindowText(const char* title)
+{
+	if(!m_managed) return false;
+
+	::SetWindowText(m_hWnd, title);
+
+	return m_frame; // false once HideFrame() has cleared the flag
+}
+
+void GSWndDX::Show() // no-op unless managed; brings the window to the foreground, shows it and requests a repaint
+{
+	if(!m_managed) return;
+
+	SetForegroundWindow(m_hWnd);
+	ShowWindow(m_hWnd, SW_SHOWNORMAL);
+	UpdateWindow(m_hWnd);
+}
+
+void GSWndDX::Hide() // no-op unless managed; hides the window without destroying it (also used for WM_CLOSE)
+{
+	if(!m_managed) return;
+
+	ShowWindow(m_hWnd, SW_HIDE);
+}
+
+void GSWndDX::HideFrame() // no-op unless managed; strips caption, sizing border and menu, then clears m_frame
+{
+	if(!m_managed) return;
+
+	SetWindowLong(m_hWnd, GWL_STYLE, GetWindowLong(m_hWnd, GWL_STYLE) & ~(WS_CAPTION|WS_THICKFRAME));
+	SetWindowPos(m_hWnd, NULL, 0, 0, 0, 0, SWP_NOSIZE | SWP_NOMOVE | SWP_NOZORDER | SWP_NOACTIVATE | SWP_FRAMECHANGED); // frame styles are cached; SWP_FRAMECHANGED makes the SetWindowLong change take effect
+	SetMenu(m_hWnd, NULL);
+
+	m_frame = false;
+}
+#endif
diff --git a/plugins/GSdx_legacy/GSWndDX.h b/plugins/GSdx_legacy/GSWndDX.h
new file mode 100644
index 0000000000..1cc96138e0
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndDX.h
@@ -0,0 +1,52 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWnd.h"
+
+#ifdef _WIN32
+class GSWndDX : public GSWnd // Win32 window implementation of GSWnd (no GL context handling)
+{
+	HWND m_hWnd; // NULL while no window is created or attached
+
+	bool m_frame; // true until HideFrame() runs; returned by SetWindowText()
+
+	static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
+	virtual LRESULT OnMessage(UINT message, WPARAM wParam, LPARAM lParam);
+
+public:
+	GSWndDX();
+	virtual ~GSWndDX();
+
+	bool Create(const string& title, int w, int h);
+	bool Attach(void* handle, bool managed = true);
+	void Detach();
+
+	void* GetDisplay() {return m_hWnd;} // same value as GetHandle()
+	void* GetHandle() {return m_hWnd;}
+	GSVector4i GetClientRect();
+	bool SetWindowText(const char* title);
+
+	void Show();
+	void Hide();
+	void HideFrame();
+};
+#endif
diff --git a/plugins/GSdx_legacy/GSWndEGL.cpp b/plugins/GSdx_legacy/GSWndEGL.cpp
new file mode 100644
index 0000000000..3b51fe02d1
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndEGL.cpp
@@ -0,0 +1,300 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWndEGL.h"
+
+#if defined(__linux__) && defined(EGL_SUPPORTED)
+
+GSWndEGL::GSWndEGL() // every native/EGL handle starts at its "none" value, so none is ever read while indeterminate
+	: m_NativeWindow(0), m_NativeDisplay(NULL), m_eglDisplay(EGL_NO_DISPLAY), m_eglSurface(EGL_NO_SURFACE), m_eglContext(EGL_NO_CONTEXT)
+{
+}
+
+void GSWndEGL::CreateContext(int major, int minor) // creates the window surface and a GL context, then makes them current; throws GSDXRecoverableError on failure
+{
+	EGLConfig eglConfig;
+	EGLint numConfigs = 0;
+	EGLint contextAttribs[] =
+	{
+		EGL_CONTEXT_MAJOR_VERSION_KHR, major,
+		EGL_CONTEXT_MINOR_VERSION_KHR, minor,
+#ifdef ENABLE_OGL_DEBUG
+		EGL_CONTEXT_FLAGS_KHR, EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR,
+#else
+		// Open Source isn't happy with an unsupported flags...
+		//EGL_CONTEXT_FLAGS_KHR, GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR,
+#endif
+		EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR,
+		EGL_NONE
+	};
+	EGLint NullContextAttribs[] = { EGL_NONE }; // used for the retry below
+	EGLint attrList[] = {
+		EGL_RED_SIZE, 8,
+		EGL_GREEN_SIZE, 8,
+		EGL_BLUE_SIZE, 8,
+		EGL_DEPTH_SIZE, 24,
+		EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
+		EGL_NONE
+	};
+
+	eglBindAPI(EGL_OPENGL_API);
+
+	eglChooseConfig(m_eglDisplay, attrList, &eglConfig, 1, &numConfigs); // return value unchecked; failure is detected through numConfigs
+	if ( numConfigs == 0 )
+	{
+		fprintf(stderr,"EGL: Failed to get a frame buffer config! (0x%x)\n", eglGetError() );
+		throw GSDXRecoverableError();
+	}
+
+	m_eglSurface = eglCreateWindowSurface(m_eglDisplay, eglConfig, m_NativeWindow, NULL);
+	if ( m_eglSurface == EGL_NO_SURFACE )
+	{
+		fprintf(stderr,"EGL: Failed to get a window surface\n");
+		throw GSDXRecoverableError();
+	}
+
+	m_eglContext = eglCreateContext(m_eglDisplay, eglConfig, EGL_NO_CONTEXT, contextAttribs);
+	EGLint status = eglGetError();
+	if (status == EGL_BAD_ATTRIBUTE || status == EGL_BAD_MATCH) {
+		// Radeon/Gallium don't support advanced attributes. Fall back to an empty attribute list.
+		// Note: Intel gives an EGL_BAD_MATCH. I don't know why but let's be stubborn and retry.
+		fprintf(stderr, "EGL: warning your driver doesn't support advance openGL context attributes\n");
+		m_eglContext = eglCreateContext(m_eglDisplay, eglConfig, EGL_NO_CONTEXT, NullContextAttribs);
+		status = eglGetError();
+	}
+	if ( m_eglContext == EGL_NO_CONTEXT )
+	{
+		fprintf(stderr,"EGL: Failed to create the context\n");
+		fprintf(stderr,"EGL STATUS: %x\n", status);
+		throw GSDXRecoverableError();
+	}
+
+	if ( !eglMakeCurrent(m_eglDisplay, m_eglSurface, m_eglSurface, m_eglContext) ) // m_ctx_attached is not touched here; AttachContext() owns that flag
+	{
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSWndEGL::AttachContext() // makes the EGL context current on the calling thread unless already flagged as attached
+{
+	if (!IsContextAttached()) {
+		// The setting of the API is local to a thread. This function
+		// can be called from 2 threads.
+		eglBindAPI(EGL_OPENGL_API);
+
+		//fprintf(stderr, "Attach the context\n");
+		eglMakeCurrent(m_eglDisplay, m_eglSurface, m_eglSurface, m_eglContext);
+		m_ctx_attached = true; // set unconditionally: the eglMakeCurrent result above is not checked
+	}
+}
+
+void GSWndEGL::DetachContext() // unbinds surface and context from the calling thread if flagged as attached
+{
+	if (IsContextAttached()) {
+		//fprintf(stderr, "Detach the context\n");
+		eglMakeCurrent(m_eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
+		m_ctx_attached = false;
+	}
+}
+
+void GSWndEGL::CheckContext() // despite the name it validates nothing: it logs EGL vendor, version and extensions to stderr
+{
+	fprintf(stderr,"EGL: %s : %s\n", eglQueryString(m_eglDisplay, EGL_VENDOR) , eglQueryString(m_eglDisplay, EGL_VERSION) );
+	fprintf(stderr,"EGL: extensions supported: %s\n", eglQueryString(m_eglDisplay, EGL_EXTENSIONS));
+}
+
+bool GSWndEGL::Attach(void* handle, bool managed) // binds a GL 3.3 context to an existing X11 window; returns true, failures surface as exceptions from the helpers
+{
+	m_NativeWindow = *(Window*)handle; // handle is dereferenced as a pointer to an X11 Window id
+	m_managed = managed;
+
+	m_NativeDisplay = XOpenDisplay(NULL); // opens its own display connection; closed again in Detach()
+	OpenEGLDisplay();
+
+	CreateContext(3, 3);
+
+	AttachContext();
+
+	CheckContext();
+
+	PopulateGlFunction();
+
+	return true;
+}
+
+void GSWndEGL::Detach() // releases context, surface, EGL display and the X11 connection; m_NativeWindow itself is left unchanged
+{
+	// Actually the destructor is not called when there is only a GSclose/GSshutdown
+	// The window still needs to be closed
+	DetachContext();
+	eglDestroyContext(m_eglDisplay, m_eglContext);
+	m_eglContext = NULL;
+	eglDestroySurface(m_eglDisplay, m_eglSurface);
+	m_eglSurface = NULL;
+	CloseEGLDisplay();
+
+	if (m_NativeDisplay) {
+		XCloseDisplay(m_NativeDisplay);
+		m_NativeDisplay = NULL;
+	}
+}
+
+bool GSWndEGL::Create(const string& title, int w, int h) // creates and maps an own X11 window with a GL 3.3 context; returns true, throws GSDXRecoverableError on failure (title is unused)
+{
+	if(m_NativeWindow)
+		throw GSDXRecoverableError();
+
+	if(w <= 0 || h <= 0) {
+		w = theApp.GetConfig("ModeWidth", 640);
+		h = theApp.GetConfig("ModeHeight", 480);
+	}
+
+	m_managed = true;
+
+	// note this part must be only executed when replaying .gs debug file
+	m_NativeDisplay = XOpenDisplay(NULL);
+	if (!m_NativeDisplay) // DefaultRootWindow() below dereferences the display
+		throw GSDXRecoverableError();
+	OpenEGLDisplay();
+
+	m_NativeWindow = XCreateSimpleWindow(m_NativeDisplay, DefaultRootWindow(m_NativeDisplay), 0, 0, w, h, 0, 0, 0);
+	if (m_NativeWindow == 0) // validate the id before it is mapped or handed to EGL
+		throw GSDXRecoverableError();
+	XMapWindow (m_NativeDisplay, m_NativeWindow);
+
+	CreateContext(3, 3);
+
+	AttachContext();
+
+	CheckContext();
+	PopulateGlFunction();
+
+	return true;
+}
+
+void* GSWndEGL::GetProcAddress(const char* name, bool opt) // looks up a GL entry point by name through eglGetProcAddress
+{
+	void* entry = (void*)eglGetProcAddress(name);
+	if (entry != NULL)
+		return entry;
+	fprintf(stderr, "Failed to find %s\n", name); // a miss is always logged; only a mandatory symbol (opt == false) throws
+	if (!opt)
+		throw GSDXRecoverableError();
+	return NULL;
+}
+
+void* GSWndEGL::GetDisplay() // returns the native (X11) display, not the EGLDisplay
+{
+	// note this part must be only executed when replaying .gs debug file
+	return (void*)m_NativeDisplay;
+}
+
+GSVector4i GSWndEGL::GetClientRect() // returns (0, 0, width, height) using the size XGetGeometry reports for the window
+{
+	unsigned int h = 480; // initial values; XGetGeometry receives the addresses of h and w below
+	unsigned int w = 640;
+
+	unsigned int borderDummy;
+	unsigned int depthDummy;
+	Window winDummy;
+    int xDummy;
+    int yDummy;
+
+	if (!m_NativeDisplay) m_NativeDisplay = XOpenDisplay(NULL); // opens a connection on demand when none exists
+	XGetGeometry(m_NativeDisplay, m_NativeWindow, &winDummy, &xDummy, &yDummy, &w, &h, &borderDummy, &depthDummy); // status not checked
+
+	return GSVector4i(0, 0, (int)w, (int)h);
+}
+
+// Sets the X11 window title. Always returns true: an unmanaged window is left untouched because
+// its title is under the strict management of the emulator.
+
+bool GSWndEGL::SetWindowText(const char* title)
+{
+	if (!m_managed) return true;
+
+	XTextProperty prop;
+
+	memset(&prop, 0, sizeof(prop));
+
+	char* ptitle = (char*)title;
+	if (XStringListToTextProperty(&ptitle, 1, &prop)) {
+		XSetWMName(m_NativeDisplay, m_NativeWindow, &prop);
+		XFree(prop.value); // freed only on success: after a failed conversion value is still NULL, which XFree does not accept
+	}
+
+	XFlush(m_NativeDisplay);
+
+	return true;
+}
+
+void GSWndEGL::SetVSync(bool enable) // forwards the flag to eglSwapInterval as the swap interval
+{
+	// 0 -> disable vsync
+	// n -> wait n frame
+	eglSwapInterval(m_eglDisplay, enable); // the bool converts to 0 or 1
+}
+
+void GSWndEGL::Flip() // swaps the buffers of the window surface; the result is not checked
+{
+	eglSwapBuffers(m_eglDisplay, m_eglSurface);
+}
+
+void GSWndEGL::Show() // maps and raises the X11 window, then flushes the request; no m_managed check here
+{
+	XMapRaised(m_NativeDisplay, m_NativeWindow);
+	XFlush(m_NativeDisplay);
+}
+
+void GSWndEGL::Hide() // unmaps the X11 window, then flushes the request; no m_managed check here
+{
+	XUnmapWindow(m_NativeDisplay, m_NativeWindow);
+	XFlush(m_NativeDisplay);
+}
+
+void GSWndEGL::HideFrame() // not implemented: currently does nothing
+{
+	// TODO
+}
+
+void GSWndEGL::CloseEGLDisplay() // counterpart of OpenEGLDisplay(); called from Detach()
+{
+	eglReleaseThread();
+	eglTerminate(m_eglDisplay);
+}
+
+void GSWndEGL::OpenEGLDisplay() // obtains and initializes m_eglDisplay from m_NativeDisplay; throws GSDXRecoverableError on failure
+{
+	// Create an EGL display from the native display
+	m_eglDisplay = eglGetDisplay((EGLNativeDisplayType)m_NativeDisplay);
+	if ( m_eglDisplay == EGL_NO_DISPLAY ) {
+		fprintf(stderr,"EGL: Failed to open a display! (0x%x)\n", eglGetError() );
+		throw GSDXRecoverableError();
+	}
+
+	if ( !eglInitialize(m_eglDisplay, NULL, NULL) ) { // the two NULLs are the version out-parameters, which are not requested
+		fprintf(stderr,"EGL: Failed to initialize the display! (0x%x)\n", eglGetError() );
+		throw GSDXRecoverableError();
+	}
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSWndEGL.h b/plugins/GSdx_legacy/GSWndEGL.h
new file mode 100644
index 0000000000..17804bb726
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndEGL.h
@@ -0,0 +1,68 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "GSWnd.h"
+
+#if defined(__linux__) && defined(EGL_SUPPORTED)
+#include <X11/Xlib.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+class GSWndEGL : public GSWndGL // GSWndGL implementation on X11 + EGL; compiled only on Linux with EGL_SUPPORTED
+{
+	EGLNativeWindowType    m_NativeWindow; // X11 window id; 0 until Create()/Attach()
+	EGLNativeDisplayType   m_NativeDisplay; // X11 display connection opened by this class
+
+	EGLDisplay m_eglDisplay; // assigned in OpenEGLDisplay()
+	EGLSurface m_eglSurface; // assigned in CreateContext()
+	EGLContext m_eglContext; // assigned in CreateContext()
+
+	void CreateContext(int major, int minor);
+	void CheckContext();
+
+	void OpenEGLDisplay();
+	void CloseEGLDisplay();
+
+public:
+	GSWndEGL();
+	virtual ~GSWndEGL() {}; // empty: resources are released by Detach(), not here
+
+	bool Create(const string& title, int w, int h);
+	bool Attach(void* handle, bool managed = true);
+	void Detach();
+
+	void* GetDisplay();
+	void* GetHandle() {return (void*)m_NativeWindow;}
+	GSVector4i GetClientRect();
+	bool SetWindowText(const char* title);
+
+	void AttachContext();
+	void DetachContext();
+	void* GetProcAddress(const char* name, bool opt = false);
+
+	void Show();
+	void Hide();
+	void HideFrame();
+	void Flip();
+	void SetVSync(bool enable);
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/GSWndOGL.cpp b/plugins/GSdx_legacy/GSWndOGL.cpp
new file mode 100644
index 0000000000..5537f2d555
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndOGL.cpp
@@ -0,0 +1,290 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWndOGL.h"
+
+#if defined(__linux__)
+GSWndOGL::GSWndOGL() // starts with no window, display, context or swap-interval entry point
+	: m_NativeWindow(0), m_NativeDisplay(NULL), m_context(0), m_swapinterval(NULL)
+{
+}
+
+static bool ctxError = false; // set by ctxErrorHandler; read by GSWndOGL::CreateContext
+static int  ctxErrorHandler(Display *dpy, XErrorEvent *ev) // X error handler that only records that an error occurred
+{
+	ctxError = true;
+	return 0;
+}
+
+void GSWndOGL::CreateContext(int major, int minor) // creates m_context as a core-profile GL major.minor GLX context; throws GSDXRecoverableError on any failure
+{
+	if ( !m_NativeDisplay || !m_NativeWindow )
+	{
+		fprintf( stderr, "Wrong X11 display/window\n" );
+		throw GSDXRecoverableError();
+	}
+
+	// Get visual information
+	static int attrListDbl[] =
+	{
+		// GLX_X_RENDERABLE: If True is specified, then only frame buffer configurations that have associated X
+		// visuals (and can be used to render to Windows and/or GLX pixmaps) will be considered. The default value is GLX_DONT_CARE.
+		GLX_X_RENDERABLE    , True,
+		GLX_RED_SIZE        , 8,
+		GLX_GREEN_SIZE      , 8,
+		GLX_BLUE_SIZE       , 8,
+		GLX_DEPTH_SIZE      , 24,
+		GLX_DOUBLEBUFFER    , True,
+		None
+	};
+
+	PFNGLXCHOOSEFBCONFIGPROC glX_ChooseFBConfig = (PFNGLXCHOOSEFBCONFIGPROC) glXGetProcAddress((GLubyte *) "glXChooseFBConfig");
+	PFNGLXCREATECONTEXTATTRIBSARBPROC glX_CreateContextAttribsARB = (PFNGLXCREATECONTEXTATTRIBSARBPROC)glXGetProcAddress((const GLubyte*) "glXCreateContextAttribsARB");
+	if (!glX_ChooseFBConfig || !glX_CreateContextAttribsARB) { // both resolved before the config list is allocated, so this throw cannot leak it
+		throw GSDXRecoverableError();
+	}
+
+	int fbcount = 0;
+	GLXFBConfig *fbc = glX_ChooseFBConfig(m_NativeDisplay, DefaultScreen(m_NativeDisplay), attrListDbl, &fbcount);
+	if (!fbc || fbcount < 1) {
+		throw GSDXRecoverableError();
+	}
+
+	// Install a dummy handler to handle gracefully (aka not segfault) the support of GL version; clear the flag left by any earlier attempt
+	ctxError = false;
+	int (*oldHandler)(Display*, XErrorEvent*) = XSetErrorHandler(&ctxErrorHandler);
+	XSync( m_NativeDisplay, false); // Be sure the handler is installed
+
+	// Create a context
+	int context_attribs[] =
+	{
+		GLX_CONTEXT_MAJOR_VERSION_ARB, major,
+		GLX_CONTEXT_MINOR_VERSION_ARB, minor,
+#ifdef ENABLE_OGL_DEBUG
+		GLX_CONTEXT_FLAGS_ARB, GLX_CONTEXT_DEBUG_BIT_ARB,
+#else
+		// Open Source isn't happy with an unsupported flags...
+		//GLX_CONTEXT_FLAGS_ARB, GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR,
+#endif
+		GLX_CONTEXT_PROFILE_MASK_ARB, GLX_CONTEXT_CORE_PROFILE_BIT_ARB,
+		None
+	};
+
+	m_context = glX_CreateContextAttribsARB(m_NativeDisplay, fbc[0], 0, true, context_attribs);
+	XFree(fbc);
+
+	// Get latest error: deliver any pending creation error while our handler is still installed
+	XSync( m_NativeDisplay, false);
+
+	// Only now reinstall the older Handler
+	XSetErrorHandler(oldHandler);
+
+	if (!m_context || ctxError) {
+		fprintf(stderr, "Failed to create the opengl context. Check your drivers support openGL %d.%d. Hint: opensource drivers don't\n", major, minor );
+		throw GSDXRecoverableError();
+	}
+}
+
+void GSWndOGL::AttachContext() // makes m_context current for the window unless already flagged as attached
+{
+	if (!IsContextAttached()) {
+		//fprintf(stderr, "Attach the context\n");
+		glXMakeCurrent(m_NativeDisplay, m_NativeWindow, m_context);
+		m_ctx_attached = true; // set unconditionally: the glXMakeCurrent result above is not checked
+	}
+}
+
+void GSWndOGL::DetachContext() // releases the current context (None drawable, NULL context) if flagged as attached
+{
+	if (IsContextAttached()) {
+		//fprintf(stderr, "Detach the context\n");
+		glXMakeCurrent(m_NativeDisplay, None, NULL);
+		m_ctx_attached = false;
+	}
+}
+
+void GSWndOGL::CheckContext() // logs the GLX version; throws GSDXRecoverableError when the context is not direct-rendering
+{
+	int glxMajorVersion, glxMinorVersion;
+	glXQueryVersion(m_NativeDisplay, &glxMajorVersion, &glxMinorVersion); // result unchecked
+	if (glXIsDirect(m_NativeDisplay, m_context))
+		fprintf(stdout, "glX-Version %d.%d with Direct Rendering\n", glxMajorVersion, glxMinorVersion);
+	else {
+		fprintf(stderr, "glX-Version %d.%d with Indirect Rendering !!! It won't support properly opengl\n", glxMajorVersion, glxMinorVersion);
+		throw GSDXRecoverableError();
+	}
+}
+
+bool GSWndOGL::Attach(void* handle, bool managed) // binds a GL 3.3 context to an existing X11 window; returns true, failures surface as exceptions from the helpers
+{
+	m_NativeWindow = *(Window*)handle; // handle is dereferenced as a pointer to an X11 Window id
+	m_managed = managed;
+
+	m_NativeDisplay = XOpenDisplay(NULL); // a NULL result is caught by the check at the top of CreateContext()
+
+	CreateContext(3, 3);
+
+	AttachContext();
+
+	CheckContext();
+
+	m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT"); // may be NULL; SetVSync() checks before calling it
+
+	PopulateGlFunction();
+
+	return true;
+}
+
+void GSWndOGL::Detach() // releases the GL context and the X11 connection; safe to call more than once
+{
+	// Actually the destructor is not called when there is only a GSclose/GSshutdown
+	// The window still need to be closed
+	DetachContext();
+	if (m_context) glXDestroyContext(m_NativeDisplay, m_context);
+	m_context = NULL; // forget the destroyed context so a repeated Detach() cannot destroy it again with a NULL display
+	if (m_NativeDisplay) {
+		XCloseDisplay(m_NativeDisplay);
+		m_NativeDisplay = NULL;
+	}
+}
+
+bool GSWndOGL::Create(const string& title, int w, int h) // creates and maps an own X11 window with a GL 3.3 context; returns true, throws GSDXRecoverableError on failure (title is unused)
+{
+	if(m_NativeWindow)
+		throw GSDXRecoverableError();
+
+	if(w <= 0 || h <= 0) {
+		w = theApp.GetConfig("ModeWidth", 640);
+		h = theApp.GetConfig("ModeHeight", 480);
+	}
+
+	m_managed = true;
+
+	// note this part must be only executed when replaying .gs debug file
+	m_NativeDisplay = XOpenDisplay(NULL);
+	if (!m_NativeDisplay) // DefaultRootWindow() below dereferences the display
+		throw GSDXRecoverableError();
+
+	m_NativeWindow = XCreateSimpleWindow(m_NativeDisplay, DefaultRootWindow(m_NativeDisplay), 0, 0, w, h, 0, 0, 0);
+	if (m_NativeWindow == 0) // validate the id before the window is mapped
+		throw GSDXRecoverableError();
+	XMapWindow (m_NativeDisplay, m_NativeWindow);
+
+	CreateContext(3, 3);
+
+	AttachContext();
+	CheckContext();
+
+	m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT");
+
+	PopulateGlFunction();
+
+	return true;
+}
+
+void* GSWndOGL::GetProcAddress(const char* name, bool opt) // looks up a GL entry point by name through glXGetProcAddress
+{
+	void* entry = (void*)glXGetProcAddress((const GLubyte*)name);
+	if (entry != NULL)
+		return entry;
+	fprintf(stderr, "Failed to find %s\n", name); // a miss is always logged; only a mandatory symbol (opt == false) throws
+	if (!opt)
+		throw GSDXRecoverableError();
+	return NULL;
+}
+
+void* GSWndOGL::GetDisplay() // returns the X11 Display* held by this object, as void*
+{
+	// note this part must be only executed when replaying .gs debug file
+	return (void*)m_NativeDisplay;
+}
+
+GSVector4i GSWndOGL::GetClientRect() // returns (0, 0, width, height) using the size XGetGeometry reports for the window
+{
+	unsigned int h = 480; // initial values; XGetGeometry receives the addresses of h and w below
+	unsigned int w = 640;
+
+	unsigned int borderDummy;
+	unsigned int depthDummy;
+	Window winDummy;
+    int xDummy;
+    int yDummy;
+
+	if (!m_NativeDisplay) m_NativeDisplay = XOpenDisplay(NULL); // opens a connection on demand when none exists
+	XGetGeometry(m_NativeDisplay, m_NativeWindow, &winDummy, &xDummy, &yDummy, &w, &h, &borderDummy, &depthDummy); // status not checked
+
+	return GSVector4i(0, 0, (int)w, (int)h);
+}
+
+// Sets the X11 window title. Always returns true: an unmanaged window is left untouched because
+// its title is under the strict management of the emulator.
+
+bool GSWndOGL::SetWindowText(const char* title)
+{
+	if (!m_managed) return true;
+
+	XTextProperty prop;
+
+	memset(&prop, 0, sizeof(prop));
+
+	char* ptitle = (char*)title;
+	if (XStringListToTextProperty(&ptitle, 1, &prop)) {
+		XSetWMName(m_NativeDisplay, m_NativeWindow, &prop);
+		XFree(prop.value); // freed only on success: after a failed conversion value is still NULL, which XFree does not accept
+	}
+
+	XFlush(m_NativeDisplay);
+
+	return true;
+}
+
+void GSWndOGL::SetVSync(bool enable) // does nothing when the glXSwapIntervalEXT pointer was not obtained
+{
+	// m_swapinterval uses an integer as parameter
+	// 0 -> disable vsync
+	// n -> wait n frame
+	if (m_swapinterval) m_swapinterval(m_NativeDisplay, m_NativeWindow, (int)enable);
+}
+
+void GSWndOGL::Flip() // swaps the buffers of the X11 window
+{
+	glXSwapBuffers(m_NativeDisplay, m_NativeWindow);
+}
+
+void GSWndOGL::Show() // maps and raises the X11 window, then flushes the request; no m_managed check here
+{
+	XMapRaised(m_NativeDisplay, m_NativeWindow);
+	XFlush(m_NativeDisplay);
+}
+
+void GSWndOGL::Hide() // unmaps the X11 window, then flushes the request; no m_managed check here
+{
+	XUnmapWindow(m_NativeDisplay, m_NativeWindow);
+	XFlush(m_NativeDisplay);
+}
+
+void GSWndOGL::HideFrame() // not implemented: currently does nothing
+{
+	// TODO
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/GSWndOGL.h b/plugins/GSdx_legacy/GSWndOGL.h
new file mode 100644
index 0000000000..7f71c049ab
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndOGL.h
@@ -0,0 +1,63 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "GSWnd.h"
+
+#if defined(__linux__)
+#include <X11/Xlib.h>
+#include <GL/glx.h>
+
+class GSWndOGL final : public GSWndGL // GSWndGL implementation on X11 + GLX; compiled only on Linux
+{
+	Window     m_NativeWindow; // X11 window id; 0 until Create()/Attach()
+	Display*   m_NativeDisplay; // X11 connection opened by this class
+	GLXContext m_context; // created in CreateContext(), destroyed in Detach()
+
+	PFNGLXSWAPINTERVALEXTPROC m_swapinterval; // glXSwapIntervalEXT, or NULL when the lookup fails
+
+	void CreateContext(int major, int minor);
+	void CheckContext();
+
+public:
+	GSWndOGL();
+	virtual ~GSWndOGL() {}; // empty: resources are released by Detach(), not here
+
+	bool Create(const string& title, int w, int h);
+	bool Attach(void* handle, bool managed = true);
+	void Detach();
+
+	void* GetDisplay();
+	void* GetHandle() {return (void*)m_NativeWindow;}
+	GSVector4i GetClientRect();
+	bool SetWindowText(const char* title);
+
+	void AttachContext();
+	void DetachContext();
+	void* GetProcAddress(const char* name, bool opt = false);
+
+	void Show();
+	void Hide();
+	void HideFrame();
+	void Flip();
+	void SetVSync(bool enable);
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/GSWndWGL.cpp b/plugins/GSdx_legacy/GSWndWGL.cpp
new file mode 100644
index 0000000000..e7af7dcae1
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndWGL.cpp
@@ -0,0 +1,381 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSWndWGL.h"
+
+#ifdef _WIN32
+// Start with no window, no device context, no GL context and no swap-interval entry
+// point. m_swapinterval must be NULL here because SetVSync() tests it before calling
+// through it, and it is otherwise only assigned by Attach()/Create().
+GSWndWGL::GSWndWGL()
+	: m_NativeWindow(NULL), m_NativeDisplay(NULL), m_context(NULL), m_swapinterval(NULL)
+{
+}
+
+// Window procedure of the window class registered by Create(). Used by GSReplay.
+// Perhaps the stuff used by GSReplay can be moved out? That way all the GSOpen 1 stuff
+// can be removed. But that'll take a bit of thinking.
+LRESULT CALLBACK GSWndWGL::WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	if (message != WM_CLOSE)
+	{
+		// Everything but WM_CLOSE gets the default processing.
+		return DefWindowProc(hWnd, message, wParam, lParam);
+	}
+
+	// WM_CLOSE takes place before GSClose: only hide the window, do not destroy it,
+	// so that it still exists when the clean up runs.
+	ShowWindow(hWnd, SW_HIDE);
+	return 0;
+}
+
+
+// Create the OpenGL rendering context for m_NativeDisplay and store it in m_context.
+//
+// A legacy context is always created first with wglCreateContext. When major >= 3 it is
+// only a stepping stone: it is made current so that wglGetProcAddress can resolve
+// wglCreateContextAttribsARB, the requested major.minor core profile context is created,
+// and the legacy context is released again whatever the outcome.
+//
+// Returns true when m_context holds the requested context. Returns false otherwise; no
+// context created by this call is kept or left current in that case.
+bool GSWndWGL::CreateContext(int major, int minor)
+{
+	if ( !m_NativeDisplay || !m_NativeWindow )
+	{
+		// Report the problem through the return value: this code runs inside a plugin,
+		// so terminating the process here would take the host application down with it.
+		fprintf( stderr, "Wrong display/window\n" );
+		return false;
+	}
+
+	// GL2 context are quite easy but we need GL3 which is another painful story...
+	m_context = wglCreateContext(m_NativeDisplay);
+	if (!m_context) {
+		fprintf(stderr, "Failed to create a 2.0 context\n");
+		return false;
+	}
+
+	if (major < 3)
+		return true;
+
+	// FIXME test it
+	// Note: albeit every tutorial said that we need an opengl context to use the GL function wglCreateContextAttribsARB
+	// On linux it works without the extra temporary context, not sure the limitation still applied
+	AttachContext();
+
+	// Attributes of the real context.
+	// NOTE(review): GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR is a GL token rather than a
+	// WGL_CONTEXT_FLAGS_ARB bit - confirm that drivers accept it in this position.
+	int context_attribs[] =
+	{
+		WGL_CONTEXT_MAJOR_VERSION_ARB, major,
+		WGL_CONTEXT_MINOR_VERSION_ARB, minor,
+		// Note: don't support deprecated feature (pre opengl 3.1)
+		WGL_CONTEXT_FLAGS_ARB, WGL_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB
+#ifdef ENABLE_OGL_DEBUG
+		| WGL_CONTEXT_DEBUG_BIT_ARB
+#else
+		| GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR
+#endif
+		,
+		WGL_CONTEXT_PROFILE_MASK_ARB, WGL_CONTEXT_CORE_PROFILE_BIT_ARB,
+		0
+	};
+
+	HGLRC context30 = NULL;
+
+	PFNWGLCREATECONTEXTATTRIBSARBPROC wglCreateContextAttribsARB = (PFNWGLCREATECONTEXTATTRIBSARBPROC)wglGetProcAddress("wglCreateContextAttribsARB");
+	if (!wglCreateContextAttribsARB) {
+		fprintf(stderr, "Failed to init wglCreateContextAttribsARB function pointer\n");
+	} else {
+		context30 = wglCreateContextAttribsARB(m_NativeDisplay, NULL, context_attribs);
+		if (!context30)
+			fprintf(stderr, "Failed to create a 3.x context\n");
+	}
+
+	// The temporary 2.0 context has served its purpose, on success and on failure alike.
+	DetachContext();
+	wglDeleteContext(m_context);
+	m_context = context30;
+
+	if (!m_context)
+		return false;
+
+	fprintf(stderr, "3.x GL context successfully created\n");
+	return true;
+}
+
+// Make m_context current on m_NativeDisplay unless IsContextAttached() already reports
+// it as attached. m_ctx_attached is only set when wglMakeCurrent succeeds, so that a
+// failed attempt can be retried and the flag never claims a context that is not current.
+void GSWndWGL::AttachContext()
+{
+	if (IsContextAttached())
+		return;
+
+	if (!wglMakeCurrent(m_NativeDisplay, m_context)) {
+		fprintf(stderr, "Failed to attach the GL context\n");
+		return;
+	}
+
+	m_ctx_attached = true;
+}
+
+// Release the current GL context, but only when IsContextAttached() reports one as
+// attached; the attached flag is cleared afterwards.
+void GSWndWGL::DetachContext()
+{
+	if (!IsContextAttached())
+		return;
+
+	wglMakeCurrent(NULL, NULL);
+	m_ctx_attached = false;
+}
+
+//TODO: DROP ???
+// Placeholder for a sanity check of the created context: nothing is verified for WGL.
+// The only candidate implementation was a disabled GLX direct-rendering query, which
+// has no meaning for a WGL context.
+void GSWndWGL::CheckContext()
+{
+}
+
+// Bind this object to an already existing window and bring OpenGL up on it.
+//
+// handle  : the window, cast to HWND.
+// managed : stored in m_managed; Detach() destroys the window only when it is set.
+//
+// Returns false when the device context / pixel format or the 3.3 context cannot be set up.
+bool GSWndWGL::Attach(void* handle, bool managed)
+{
+	m_managed = managed;
+	m_NativeWindow = (HWND)handle;
+
+	// The device context has to be ready before a context can be created on it.
+	if (!OpenWGLDisplay() || !CreateContext(3, 3))
+		return false;
+
+	AttachContext();
+	CheckContext();
+
+	// Resolved with the context current; stays NULL when the extension is missing.
+	m_swapinterval = (PFNWGLSWAPINTERVALEXTPROC)wglGetProcAddress("wglSwapIntervalEXT");
+	PopulateGlFunction();
+
+	UpdateWindow(m_NativeWindow);
+	return true;
+}
+
+// Tear down everything Attach()/Create() set up: the GL context, the device context
+// and, when m_managed is set, the window itself.
+void GSWndWGL::Detach()
+{
+	// Actually the destructor is not called when there is only a GSclose/GSshutdown
+	// The window still need to be closed
+	DetachContext();
+
+	if (m_context != NULL)
+	{
+		wglDeleteContext(m_context);
+		m_context = NULL;
+	}
+
+	CloseWGLDisplay();
+
+	// The window is destroyed only when m_managed is set. Used by GSReplay.
+	if (!m_managed || !m_NativeWindow)
+		return;
+
+	DestroyWindow(m_NativeWindow);
+	m_NativeWindow = NULL;
+}
+
+// Fetch the device context of m_NativeWindow into m_NativeDisplay and select a pixel
+// format for it: double-buffered RGBA, drawable to a window with OpenGL, 32 colour
+// bits, 24 depth bits, 8 stencil bits and no alpha, accumulation or auxiliary buffers.
+// Every failure is reported with a message box and makes the function return false.
+bool GSWndWGL::OpenWGLDisplay()
+{
+	// Describe the wanted format; every field that is not assigned below stays 0.
+	PIXELFORMATDESCRIPTOR pfd;
+	memset(&pfd, 0, sizeof(pfd));
+	pfd.nSize        = sizeof(PIXELFORMATDESCRIPTOR);
+	pfd.nVersion     = 1;
+	pfd.dwFlags      = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+	pfd.iPixelType   = PFD_TYPE_RGBA;
+	pfd.cColorBits   = 32;
+	pfd.cDepthBits   = 24;
+	pfd.cStencilBits = 8;
+	pfd.iLayerType   = PFD_MAIN_PLANE;
+
+	m_NativeDisplay = GetDC(m_NativeWindow);
+	if (m_NativeDisplay == NULL)
+	{
+		MessageBox(NULL, "(1) Can't Create A GL Device Context.", "ERROR", MB_OK | MB_ICONEXCLAMATION);
+		return false;
+	}
+
+	// Closest format the device offers; 0 means that nothing matched.
+	const GLuint format = ChoosePixelFormat(m_NativeDisplay, &pfd);
+	if (format == 0)
+	{
+		MessageBox(NULL, "(2) Can't Find A Suitable PixelFormat.", "ERROR", MB_OK | MB_ICONEXCLAMATION);
+		return false;
+	}
+
+	if (SetPixelFormat(m_NativeDisplay, format, &pfd))
+		return true;
+
+	MessageBox(NULL, "(3) Can't Set The PixelFormat.", "ERROR", MB_OK | MB_ICONEXCLAMATION);
+	return false;
+}
+
+// Give the device context back to the window (a failed release is reported with a
+// message box) and forget it in every case.
+void GSWndWGL::CloseWGLDisplay()
+{
+	if (m_NativeDisplay != NULL)
+	{
+		const int released = ReleaseDC(m_NativeWindow, m_NativeDisplay);
+		if (!released)
+		{
+			MessageBox(NULL, "Release Device Context Failed.", "SHUTDOWN ERROR", MB_OK | MB_ICONINFORMATION);
+		}
+	}
+
+	m_NativeDisplay = NULL;
+}
+
+//TODO: GSopen 1 => Drop?
+// Used by GSReplay. At least for now.
+// More or less copy pasted from GSWndDX::Create and GSWndWGL::Attach with a few
+// modifications
+//
+// Creates a managed window centred on the desktop and brings OpenGL 3.3 up on it.
+// w, h : wanted client size. When either is <= 0, or in a remote session, a size is
+//        derived from the desktop width instead (a 4:3 rectangle).
+// Returns false when a window already exists or when any step fails.
+bool GSWndWGL::Create(const string& title, int w, int h)
+{
+	if (m_NativeWindow != NULL)
+		return false;
+
+	m_managed = true;
+
+	WNDCLASS wc;
+	memset(&wc, 0, sizeof(wc));
+
+	wc.lpszClassName = "GSWndOGL";
+	wc.hInstance = theApp.GetModuleHandle();
+	wc.lpfnWndProc = WndProc;
+	wc.style = CS_HREDRAW | CS_VREDRAW | CS_DBLCLKS | CS_OWNDC;
+	wc.hCursor = LoadCursor(NULL, IDC_ARROW);
+	wc.hbrBackground = (HBRUSH)GetStockObject(BLACK_BRUSH);
+
+	// Register the window class unless it is already known.
+	if (!GetClassInfo(wc.hInstance, wc.lpszClassName, &wc) && !RegisterClass(&wc))
+		return false;
+
+	const DWORD style = WS_CLIPCHILDREN | WS_CLIPSIBLINGS | WS_OVERLAPPEDWINDOW | WS_BORDER;
+
+	// Start from the desktop rectangle.
+	GSVector4i r;
+	GetWindowRect(GetDesktopWindow(), r);
+
+	// Old GSOpen ModeWidth and ModeHeight are not necessary with this.
+	const bool remote = GetSystemMetrics(SM_REMOTESESSION) != 0;
+
+	if (w <= 0 || h <= 0 || remote)
+	{
+		// A third / a quarter of the desktop width, doubled outside remote sessions.
+		const int scale = remote ? 1 : 2;
+
+		w = (r.width() / 3) * scale;
+		h = (r.width() / 4) * scale;
+	}
+
+	// Centre a w x h client area, then grow the rectangle by the window decorations.
+	r.left = (r.left + r.right - w) / 2;
+	r.top = (r.top + r.bottom - h) / 2;
+	r.right = r.left + w;
+	r.bottom = r.top + h;
+
+	AdjustWindowRect(r, style, FALSE);
+
+	m_NativeWindow = CreateWindow(wc.lpszClassName, title.c_str(), style, r.left, r.top, r.width(), r.height(), NULL, NULL, wc.hInstance, (LPVOID)this);
+
+	if (m_NativeWindow == NULL)
+		return false;
+
+	// Same bring-up sequence as Attach(): device context first, then the context.
+	if (!OpenWGLDisplay() || !CreateContext(3, 3))
+		return false;
+
+	AttachContext();
+
+	m_swapinterval = (PFNWGLSWAPINTERVALEXTPROC)wglGetProcAddress("wglSwapIntervalEXT");
+	PopulateGlFunction();
+
+	return true;
+}
+
+//Same as DX
+// Returns the client rectangle of m_NativeWindow as filled in by the Win32 call.
+GSVector4i GSWndWGL::GetClientRect()
+{
+	GSVector4i client;
+	::GetClientRect(m_NativeWindow, client);
+	return client;
+}
+
+// Resolve a GL entry point by name through wglGetProcAddress.
+// A missing entry point is always logged to stderr. It then yields NULL when opt is
+// true, and throws GSDXRecoverableError when opt is false.
+void* GSWndWGL::GetProcAddress(const char* name, bool opt)
+{
+	void* ptr = (void*)wglGetProcAddress(name);
+	if (ptr != NULL)
+		return ptr;
+
+	fprintf(stderr, "Failed to find %s\n", name);
+
+	if (opt)
+		return NULL;
+
+	throw GSDXRecoverableError();
+}
+
+//TODO: check extensions supported or not
+//FIXME : extension allocation
+// Forward the vsync request to the swap-interval entry point, when one was resolved.
+void GSWndWGL::SetVSync(bool enable)
+{
+	if (m_swapinterval == NULL)
+		return;
+
+	// The entry point takes an integer:
+	// 0 -> disable vsync
+	// n -> wait n frame
+	m_swapinterval(enable ? 1 : 0);
+}
+
+// Present the frame: swap the buffers of the window's device context.
+void GSWndWGL::Flip()
+{
+	SwapBuffers(m_NativeDisplay);
+}
+
+// Bring the window to the foreground, show it and repaint it. Does nothing when
+// m_managed is not set. Used by GSReplay.
+void GSWndWGL::Show()
+{
+	if (m_managed)
+	{
+		SetForegroundWindow(m_NativeWindow);
+		ShowWindow(m_NativeWindow, SW_SHOWNORMAL);
+		UpdateWindow(m_NativeWindow);
+	}
+}
+
+// Empty body: this implementation does nothing when asked to hide the window.
+void GSWndWGL::Hide()
+{
+}
+
+// Empty body: this implementation does nothing when asked to hide the window frame.
+void GSWndWGL::HideFrame()
+{
+}
+
+// Returns FALSE if the window has no title, or if the window title is under the strict
+// management of the emulator.
+//
+// Here that means: the title is only changed when m_managed is set.
+bool GSWndWGL::SetWindowText(const char* title)
+{
+	if (m_managed)
+	{
+		// Used by GSReplay.
+		::SetWindowText(m_NativeWindow, title);
+		return true;
+	}
+
+	return false;
+}
+
+
+#endif
diff --git a/plugins/GSdx_legacy/GSWndWGL.h b/plugins/GSdx_legacy/GSWndWGL.h
new file mode 100644
index 0000000000..4c477b7c6c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSWndWGL.h
@@ -0,0 +1,66 @@
+/*
+ *	Copyright (C) 2007-2012 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "GSWnd.h"
+
+#ifdef _WIN32
+
+// GSWndGL implementation for Windows: wraps an HWND, the device context of that window
+// and a WGL rendering context.
+class GSWndWGL : public GSWndGL
+{
+	HWND	 m_NativeWindow;	// window that is rendered into
+	HDC		 m_NativeDisplay;	// device context of m_NativeWindow
+	HGLRC	 m_context;			// WGL rendering context
+
+	// wglSwapIntervalEXT entry point, NULL when it could not be resolved
+	PFNWGLSWAPINTERVALEXTPROC m_swapinterval;
+
+	bool CreateContext(int major, int minor);
+	void CheckContext();
+
+	void CloseWGLDisplay();
+	bool OpenWGLDisplay();
+
+	static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
+
+public:
+	GSWndWGL();
+	virtual ~GSWndWGL() {}
+
+	// Window creation / binding.
+	bool Create(const string& title, int w, int h);
+	bool Attach(void* handle, bool managed = true);
+	void Detach();
+
+	// Native handles and window properties.
+	// NOTE(review): GetDisplay() hands out the window handle, not m_NativeDisplay - confirm this is intended.
+	void* GetDisplay() {return m_NativeWindow;}
+	void* GetHandle() {return m_NativeWindow;}
+	GSVector4i GetClientRect();
+	bool SetWindowText(const char* title);
+
+	// GL context management and entry-point lookup.
+	// opt defaults to false (a missing entry point is then an error), matching the
+	// declaration in GSWndOGL.
+	void AttachContext();
+	void DetachContext();
+	void* GetProcAddress(const char* name, bool opt = false);
+
+	// Presentation.
+	void Show();
+	void Hide();
+	void HideFrame();
+	void Flip();
+	void SetVSync(bool enable);
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/GSdx.cpp b/plugins/GSdx_legacy/GSdx.cpp
new file mode 100644
index 0000000000..bd2a7ec564
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.cpp
@@ -0,0 +1,335 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSdx.h"
+#include "GS.h"
+
+static void* s_hModule;
+
+#ifdef _WIN32
+
+// DLL entry point. The module handle is remembered when the DLL is attached to the
+// process; thread attach/detach and process detach need no work. Always reports success.
+BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
+{
+	if(ul_reason_for_call == DLL_PROCESS_ATTACH)
+	{
+		s_hModule = hModule;
+	}
+
+	return TRUE;
+}
+
+// Copy a resource embedded in this module into buff.
+//
+// id   : integer resource identifier.
+// buff : receives the resource bytes; it is emptied first and stays empty on failure.
+// type : resource type, RT_RCDATA when NULL.
+//
+// Returns false when the resource cannot be found, loaded or locked, or when it is empty.
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+
+	HMODULE module = (HMODULE)s_hModule;
+
+	HRSRC hRsrc = FindResource(module, MAKEINTRESOURCE(id), type != NULL ? type : RT_RCDATA);
+	if(!hRsrc) return false;
+
+	HGLOBAL hGlobal = ::LoadResource(module, hRsrc);
+	if(!hGlobal) return false;
+
+	DWORD size = SizeofResource(module, hRsrc);
+	if(!size) return false;
+
+	// LockResource can fail too: never copy from a NULL pointer.
+	const unsigned char* data = (const unsigned char*)LockResource(hGlobal);
+	if(!data) return false;
+
+	buff.assign(data, data + size);
+
+	return true;
+}
+
+#else
+
+// Non-Windows stub: there is nothing to load from. buff is emptied, a notice is
+// printed on stdout and the call always fails; id and type are ignored.
+bool GSdxApp::LoadResource(int id, vector<unsigned char>& buff, const char* type)
+{
+	buff.clear();
+	printf("LoadResource not implemented\n");
+	return false;
+}
+
+// Non-Windows stand-in for the Win32 GetPrivateProfileString.
+//
+// Looks lpKeyName up in the settings loaded from lpFileName. lpAppName is ignored: the
+// map is not split into sections. When the key has no value yet, lpDefault (if any) is
+// stored for it, so that later calls see it, and becomes the result.
+//
+// The value is copied into lpReturnedString, truncated so that it fits in nSize bytes
+// including the terminating NUL. Returns the number of characters copied, NUL excluded.
+size_t GSdxApp::GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName)
+{
+	BuildConfigurationMap(lpFileName);
+
+	std::string& value = m_configuration_map[std::string(lpKeyName)];
+	if (value.empty() && lpDefault != NULL) {
+		// save the value for future call
+		value = lpDefault;
+	}
+
+	// Nowhere to copy to.
+	if (lpReturnedString == NULL || nSize == 0)
+		return 0;
+
+	// Never write past the caller's buffer: one byte is kept for the terminator.
+	const size_t length = (value.size() < nSize - 1) ? value.size() : nSize - 1;
+	memcpy(lpReturnedString, value.data(), length);
+	lpReturnedString[length] = '\0';
+
+	return length;
+}
+
+// Non-Windows stand-in for the Win32 WritePrivateProfileString.
+//
+// Stores pString under lpKeyName in the in-memory settings, then rewrites lpFileName
+// completely with one "key = value" line per non-empty setting. lpAppName is ignored:
+// the map is not split into sections.
+//
+// Returns true when the file was written and closed without error, false otherwise.
+bool GSdxApp::WritePrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* pString, const char* lpFileName)
+{
+	BuildConfigurationMap(lpFileName);
+
+	m_configuration_map[std::string(lpKeyName)] = std::string(pString);
+
+	// Save config to a file
+	FILE* f = fopen(lpFileName, "w");
+
+	if (f == NULL) return false; // FIXME print a nice message
+
+	for (const auto& entry : m_configuration_map) {
+		// Do not save the inifile key which is not an option
+		if (entry.first == "inifile") continue;
+
+		if (!entry.second.empty())
+			fprintf(f, "%s = %s\n", entry.first.c_str(), entry.second.c_str());
+	}
+
+	// A buffered write error may only show up when the stream is flushed on close.
+	return fclose(f) == 0;
+}
+
+// Non-Windows stand-in for the Win32 GetPrivateProfileInt.
+//
+// Returns the setting lpKeyName converted with atoi. When the setting is missing or
+// empty, nDefault is recorded through SetConfig() and returned instead. lpAppName is
+// ignored.
+int GSdxApp::GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName)
+{
+	BuildConfigurationMap(lpFileName);
+
+	const std::string stored = m_configuration_map[std::string(lpKeyName)];
+	if (!stored.empty())
+		return atoi(stored.c_str());
+
+	// save the value for future call
+	SetConfig(lpKeyName, nDefault);
+	return nDefault;
+}
+#endif
+
+GSdxApp theApp;
+
+// Sets the default ini file and section, then fills the option tables that are exposed
+// as public members. Each GSSetting is built from a numeric value and two strings
+// (presumably a display name and a short note - confirm against GSSetting.h).
+// Entries are appended in the order written here.
+GSdxApp::GSdxApp()
+{
+	m_ini = "inis/GSdx.ini";
+	m_section = "Settings";
+
+	// GS renderers: the list depends on the platform and on the build flavour.
+#ifdef _WIN32
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::OGL_HW), "OpenGL", "Hardware"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX1011_HW), "Direct3D11", "Hardware"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX9_HW),			"Direct3D9",	"Hardware"));
+
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::OGL_SW), "OpenGL", "Software"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX1011_SW), "Direct3D11", "Software"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX9_SW),		"Direct3D9",	"Software"));
+
+#ifdef _DEBUG
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX9_Null), "Direct3D9", "Null"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX1011_Null), "Direct3D11", "Null"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::Null_SW), "Null", "Software"));
+#endif
+#else // Linux
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::OGL_HW), "OpenGL", "Hardware"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::OGL_SW), "OpenGL", "Software"));
+#endif
+
+	// The null renderer comes right after the platform renderers above, it has use for benchmarking purposes in a release build
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::Null_Null), "None", "Core Benchmark"));
+
+#ifdef ENABLE_OPENCL
+	// OpenCL stuff goes last
+	// FIXME openCL isn't attached to a device (could be impacted by the window management stuff however)
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX9_OpenCL),		"Direct3D9",	"OpenCL"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::DX1011_OpenCL),	"Direct3D11",	"OpenCL"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::Null_OpenCL),	"Null",			"OpenCL"));
+	m_gs_renderers.push_back(GSSetting(static_cast<uint32>(GSRendererType::OGL_OpenCL),		"OpenGL",		"OpenCL"));
+#endif
+
+	// GS option tables.
+	m_gs_interlace.push_back(GSSetting(0, "None", ""));
+	m_gs_interlace.push_back(GSSetting(1, "Weave tff", "saw-tooth"));
+	m_gs_interlace.push_back(GSSetting(2, "Weave bff", "saw-tooth"));
+	m_gs_interlace.push_back(GSSetting(3, "Bob tff", "use blend if shaking"));
+	m_gs_interlace.push_back(GSSetting(4, "Bob bff", "use blend if shaking"));
+	m_gs_interlace.push_back(GSSetting(5, "Blend tff", "slight blur, 1/2 fps"));
+	m_gs_interlace.push_back(GSSetting(6, "Blend bff", "slight blur, 1/2 fps"));
+	m_gs_interlace.push_back(GSSetting(7, "Auto", ""));
+
+	m_gs_aspectratio.push_back(GSSetting(0, "Stretch", ""));
+	m_gs_aspectratio.push_back(GSSetting(1, "4:3", ""));
+	m_gs_aspectratio.push_back(GSSetting(2, "16:9", ""));
+
+	// Note: there is no 7x entry, and "Custom" uses the value 0.
+	m_gs_upscale_multiplier.push_back(GSSetting(1, "Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(2, "2x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(3, "3x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(4, "4x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(5, "5x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(6, "6x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(8, "8x Native", ""));
+	m_gs_upscale_multiplier.push_back(GSSetting(0, "Custom", ""));
+
+	m_gs_max_anisotropy.push_back(GSSetting(0, "Off", ""));
+	m_gs_max_anisotropy.push_back(GSSetting(2, "2x", ""));
+	m_gs_max_anisotropy.push_back(GSSetting(4, "4x", ""));
+	m_gs_max_anisotropy.push_back(GSSetting(8, "8x", ""));
+	m_gs_max_anisotropy.push_back(GSSetting(16, "16x", ""));
+
+	m_gs_filter.push_back(GSSetting(0, "Nearest", ""));
+	m_gs_filter.push_back(GSSetting(1, "Bilinear", "Forced"));
+	m_gs_filter.push_back(GSSetting(2, "Bilinear", "PS2"));
+
+	m_gs_gl_ext.push_back(GSSetting(-1, "Auto", ""));
+	m_gs_gl_ext.push_back(GSSetting(0,  "Force-Disabled", ""));
+	m_gs_gl_ext.push_back(GSSetting(1,  "Force-Enabled", ""));
+
+	m_gs_hack.push_back(GSSetting(0,  "Off", ""));
+	m_gs_hack.push_back(GSSetting(1,  "Half", ""));
+	m_gs_hack.push_back(GSSetting(2,  "Full", ""));
+
+	m_gs_crc_level.push_back(GSSetting(0 , "None", "Debug"));
+	m_gs_crc_level.push_back(GSSetting(1 , "Minimum", "Debug"));
+	m_gs_crc_level.push_back(GSSetting(2 , "Partial", "OpenGL Recommended"));
+	m_gs_crc_level.push_back(GSSetting(3 , "Full", "Safest"));
+	m_gs_crc_level.push_back(GSSetting(4 , "Aggressive", ""));
+
+	m_gs_acc_blend_level.push_back(GSSetting(0, "None", "Fastest"));
+	m_gs_acc_blend_level.push_back(GSSetting(1, "Basic", "Recommended low-end PC"));
+	m_gs_acc_blend_level.push_back(GSSetting(2, "Medium", ""));
+	m_gs_acc_blend_level.push_back(GSSetting(3, "High", "Recommended high-end PC"));
+	m_gs_acc_blend_level.push_back(GSSetting(4, "Full", "Very Slow"));
+	m_gs_acc_blend_level.push_back(GSSetting(5, "Ultra", "Ultra Slow"));
+
+	m_gs_tv_shaders.push_back(GSSetting(0, "None", ""));
+	m_gs_tv_shaders.push_back(GSSetting(1, "Scanline filter", ""));
+	m_gs_tv_shaders.push_back(GSSetting(2, "Diagonal filter", ""));
+	m_gs_tv_shaders.push_back(GSSetting(3, "Triangular filter", ""));
+	m_gs_tv_shaders.push_back(GSSetting(4, "Wave filter", ""));
+
+	// GPU option tables.
+	m_gpu_renderers.push_back(GSSetting(0, "Direct3D9 (Software)", ""));
+	m_gpu_renderers.push_back(GSSetting(1, "Direct3D11 (Software)", ""));
+	m_gpu_renderers.push_back(GSSetting(2, "SDL 1.3 (Software)", ""));
+	m_gpu_renderers.push_back(GSSetting(3, "Null (Software)", ""));
+	//m_gpu_renderers.push_back(GSSetting(4, "Null (Null)", ""));
+
+	m_gpu_filter.push_back(GSSetting(0, "Nearest", ""));
+	m_gpu_filter.push_back(GSSetting(1, "Bilinear (polygons only)", ""));
+	m_gpu_filter.push_back(GSSetting(2, "Bilinear", ""));
+
+	m_gpu_dithering.push_back(GSSetting(0, "Disabled", ""));
+	m_gpu_dithering.push_back(GSSetting(1, "Auto", ""));
+
+	m_gpu_aspectratio.push_back(GSSetting(0, "Stretch", ""));
+	m_gpu_aspectratio.push_back(GSSetting(1, "4:3", ""));
+	m_gpu_aspectratio.push_back(GSSetting(2, "16:9", ""));
+
+	// The scale value packs two fields: the low two bits pair with the H factor of the
+	// label and the bits from bit 2 upwards with the V factor (0 -> x1, 1 -> x2, 2 -> x4).
+	m_gpu_scale.push_back(GSSetting(0 | (0 << 2), "H x 1 - V x 1", ""));
+	m_gpu_scale.push_back(GSSetting(1 | (0 << 2), "H x 2 - V x 1", ""));
+	m_gpu_scale.push_back(GSSetting(0 | (1 << 2), "H x 1 - V x 2", ""));
+	m_gpu_scale.push_back(GSSetting(1 | (1 << 2), "H x 2 - V x 2", ""));
+	m_gpu_scale.push_back(GSSetting(2 | (1 << 2), "H x 4 - V x 2", ""));
+	m_gpu_scale.push_back(GSSetting(1 | (2 << 2), "H x 2 - V x 4", ""));
+	m_gpu_scale.push_back(GSSetting(2 | (2 << 2), "H x 4 - V x 4", ""));
+}
+
+#ifdef __linux__
+// Throw the cached settings away and read them again from the ini file they came from.
+// Does nothing when no file was loaded yet (no "inifile" entry in the map).
+void GSdxApp::ReloadConfig()
+{
+	auto ini_entry = m_configuration_map.find("inifile");
+	if (ini_entry == m_configuration_map.end())
+		return;
+
+	// Copy the path before the map, and with it the entry, is wiped.
+	const std::string path = ini_entry->second;
+
+	m_configuration_map.clear();
+	BuildConfigurationMap(path.c_str());
+}
+
+// Load the "key = value" pairs of lpFileName into m_configuration_map.
+//
+// The file name is remembered under the pseudo key "inifile"; a call naming the file
+// that is already recorded there returns at once. Keys and values are whitespace
+// delimited tokens of at most 255 characters. Entries that do not have the
+// "key = value" shape are skipped. A file that cannot be opened leaves the settings empty.
+void GSdxApp::BuildConfigurationMap(const char* lpFileName)
+{
+	// Check if the map was already built
+	std::string inifile_value(lpFileName);
+	if ( inifile_value.compare(m_configuration_map["inifile"]) == 0 ) return;
+	m_configuration_map["inifile"] = inifile_value;
+
+	// Load config from file
+	FILE* f = fopen(lpFileName, "r");
+
+	if (f == NULL) return; // FIXME print a nice message
+
+	char key[256] = {0};
+	char value[256] = {0};
+	int fields;
+
+	while( (fields = fscanf(f, "%255s = %255s\n", key, value)) != EOF ) {
+		// A partial match leaves value untouched: storing it would attach the value of
+		// the previous entry (or garbage) to this key.
+		if (fields != 2) continue;
+
+		m_configuration_map[std::string(key)] = std::string(value);
+	}
+
+	fclose(f);
+}
+#endif
+
+// Returns the file-static module handle s_hModule. In the code visible here it is only
+// assigned by DllMain on Windows.
+void* GSdxApp::GetModuleHandlePtr()
+{
+	return s_hModule;
+}
+
+// Point m_ini at the GSdx.ini file of the given directory.
+//
+// dir : directory holding GSdx.ini, with or without a trailing separator. NULL selects
+//       the default "inis/GSdx.ini"; an empty string yields a bare "GSdx.ini".
+void GSdxApp::SetConfigDir(const char* dir)
+{
+	if( dir == NULL )
+	{
+		m_ini = "inis/GSdx.ini";
+		return;
+	}
+
+	m_ini = dir;
+
+	// An empty directory has no last character to look at, and prefixing a separator
+	// would turn the relative file name into one at the filesystem root.
+	if(!m_ini.empty() && m_ini[m_ini.length() - 1] != DIRECTORY_SEPARATOR)
+	{
+		m_ini += DIRECTORY_SEPARATOR;
+	}
+
+	m_ini += "GSdx.ini";
+}
+
+// Read the string setting `entry` of section m_section from the ini file m_ini,
+// with `value` as the default. The result is limited by the 4096 byte local buffer.
+string GSdxApp::GetConfig(const char* entry, const char* value)
+{
+	// Zero-filled, so the result is an empty string if nothing gets written.
+	char buff[4096] = {0};
+
+	GetPrivateProfileString(m_section.c_str(), entry, value, buff, countof(buff), m_ini.c_str());
+
+	return string(buff);
+}
+
+// Store the string setting `entry` = `value` in section m_section of the ini file m_ini.
+// The result of the write is not reported to the caller.
+void GSdxApp::SetConfig(const char* entry, const char* value)
+{
+	WritePrivateProfileString(m_section.c_str(), entry, value, m_ini.c_str());
+}
+
+// Read the integer setting `entry` of section m_section from the ini file m_ini,
+// with `value` as the default.
+int GSdxApp::GetConfig(const char* entry, int value)
+{
+	return GetPrivateProfileInt(m_section.c_str(), entry, value, m_ini.c_str());
+}
+
+// Store the integer setting `entry`: the value is formatted in decimal and handed to
+// the string overload of SetConfig.
+void GSdxApp::SetConfig(const char* entry, int value)
+{
+	// Ample room for the decimal form of an int, sign and terminator included.
+	char buff[32] = {0};
+
+	sprintf(buff, "%d", value);
+
+	SetConfig(entry, buff);
+}
diff --git a/plugins/GSdx_legacy/GSdx.def b/plugins/GSdx_legacy/GSdx.def
new file mode 100644
index 0000000000..7ab5563e41
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.def
@@ -0,0 +1,71 @@
+; GSdx.def : Declares the module parameters for the DLL.
+
+EXPORTS
+    ; Explicit exports can go here
+	PS2EgetLibType		
+	PS2EgetLibName		
+	PS2EgetLibVersion2	
+	PS2EgetCpuPlatform
+	GSsetBaseMem		
+	GSinit				
+	GSshutdown			
+	GSopen
+	GSopen2				
+	GSclose				
+	GSreset				
+	GSwriteCSR			
+	GSgifSoftReset
+	GSgifTransfer
+	GSgifTransfer1		
+	GSgifTransfer2		
+	GSgifTransfer3		
+	GSvsync				
+	GSmakeSnapshot		
+	GSkeyEvent			
+	GSfreeze            
+	GSconfigure			
+	GStest				
+	GSabout				
+	GSinitReadFIFO
+	GSreadFIFO
+	GSinitReadFIFO2
+	GSreadFIFO2
+	GSirqCallback
+	GSsetupRecording		
+	GSsetGameCRC		
+	GSsetFrameSkip
+	GSsetFrameLimit		
+	GSsetVsync
+	GSsetExclusive
+	GSsetSettingsDir
+	GSgetLastTag
+	GSReplay
+	GSBenchmark
+	GSgetTitleInfo2
+	PSEgetLibType
+	PSEgetLibName
+	PSEgetLibVersion
+	GPUinit
+	GPUshutdown
+	GPUopen
+	GPUclose
+	GPUconfigure
+	GPUabout
+	GPUtest
+	GPUwriteData
+	GPUwriteStatus
+	GPUreadData
+	GPUreadStatus
+	GPUdmaChain
+	GPUgetMode
+	GPUsetMode
+	GPUupdateLace
+	GPUmakeSnapshot
+    GPUwriteDataMem
+    GPUreadDataMem
+    GPUdisplayText
+    GPUdisplayFlags
+	GPUfreeze
+    GPUshowScreenPic
+    GPUgetScreenPic
+    GPUcursor
diff --git a/plugins/GSdx_legacy/GSdx.gcc.workspace b/plugins/GSdx_legacy/GSdx.gcc.workspace
new file mode 100644
index 0000000000..12ac2a87f4
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.gcc.workspace
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_workspace_file>
+	<Workspace title="Workspace">
+		<Project filename="GSdx.gcc.cbp" active="1">
+			<Depends filename="../../3rdparty/SDL-1.3.0-5387/SDL-1.3/SDL-1.3.cbp" />
+		</Project>
+		<Project filename="../../3rdparty/SDL-1.3.0-5387/SDL-1.3/SDL-1.3.cbp" />
+	</Workspace>
+</CodeBlocks_workspace_file>
diff --git a/plugins/GSdx_legacy/GSdx.h b/plugins/GSdx_legacy/GSdx.h
new file mode 100644
index 0000000000..5ea4201c4c
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.h
@@ -0,0 +1,83 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSSetting.h"
+
+// Application-wide state of the plugin: where the settings live (ini file and section),
+// accessors for reading and writing them, and the option tables as public vectors.
+// A single global instance, theApp, is declared at the end of this header.
+class GSdxApp
+{
+	std::string m_ini;      // path of the ini file
+	std::string m_section;  // ini section passed to the profile functions
+#ifdef __linux__
+	// In-memory copy of the ini file (key -> value)
+	std::map< std::string, std::string > m_configuration_map;
+#endif
+
+public:
+	GSdxApp();
+
+    void* GetModuleHandlePtr();
+
+#ifdef _WIN32
+ 	HMODULE GetModuleHandle() {return (HMODULE)GetModuleHandlePtr();}
+#endif
+
+#ifdef __linux__
+	void BuildConfigurationMap(const char* lpFileName);
+	void ReloadConfig();
+
+	// Member functions named after the Win32 profile API. On other platforms no such
+	// members are declared, so the calls presumably reach the Win32 functions - confirm.
+	size_t GetPrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* lpDefault, char* lpReturnedString, size_t nSize, const char* lpFileName);
+	bool WritePrivateProfileString(const char* lpAppName, const char* lpKeyName, const char* pString, const char* lpFileName);
+	int GetPrivateProfileInt(const char* lpAppName, const char* lpKeyName, int nDefault, const char* lpFileName);
+#endif
+
+	// Fills buff with the resource `id`; `type` is optional.
+	bool LoadResource(int id, vector<unsigned char>& buff, const char* type = NULL);
+
+	// Typed accessors for settings; the second argument of GetConfig is the default.
+	string GetConfig(const char* entry, const char* value);
+	void SetConfig(const char* entry, const char* value);
+	int GetConfig(const char* entry, int value);
+	void SetConfig(const char* entry, int value);
+
+	void SetConfigDir(const char* dir);
+
+	// GS option tables.
+	vector<GSSetting> m_gs_renderers;
+	vector<GSSetting> m_gs_interlace;
+	vector<GSSetting> m_gs_aspectratio;
+	vector<GSSetting> m_gs_upscale_multiplier;
+	vector<GSSetting> m_gs_max_anisotropy;
+	vector<GSSetting> m_gs_filter;
+	vector<GSSetting> m_gs_gl_ext;
+	vector<GSSetting> m_gs_hack;
+	vector<GSSetting> m_gs_crc_level;
+	vector<GSSetting> m_gs_acc_blend_level;
+	vector<GSSetting> m_gs_tv_shaders;
+
+	// GPU option tables.
+	vector<GSSetting> m_gpu_renderers;
+	vector<GSSetting> m_gpu_filter;
+	vector<GSSetting> m_gpu_dithering;
+	vector<GSSetting> m_gpu_aspectratio;
+	vector<GSSetting> m_gpu_scale;
+};
+
+// Empty tag types used as exceptions. GSDXRecoverableError derives from GSDXError, so a
+// handler for GSDXError catches both.
+struct GSDXError {};
+struct GSDXRecoverableError : GSDXError {};
+
+extern GSdxApp theApp;
diff --git a/plugins/GSdx_legacy/GSdx.props b/plugins/GSdx_legacy/GSdx.props
new file mode 100644
index 0000000000..a2fffcb395
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.props
@@ -0,0 +1,20 @@
+﻿<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <_PropertySheetDisplayName>GSdx</_PropertySheetDisplayName>
+    <TargetName>$(ProjectName)-$(SSEtype)</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level4</WarningLevel>
+      <DisableSpecificWarnings>4995;4324;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <AdditionalDependencies>JITProfiling.lib;d3d11_beta.lib;d3dx11.lib;d3d10.lib;d3d10_1.lib;d3dx10.lib;d3d9.lib;d3dx9.lib;ddraw.lib;dxguid.lib;winmm.lib;strmiids.lib;xinput.lib;cg.lib;cgGL.lib;glut32.lib;glew32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>./vtune;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <DelayLoadDLLs>d3d9.dll;d3dx9_41.dll;d3d10.dll;d3d10_1.dll;d3dx10_41.dll;d3d11.dll;d3d11_beta.dll;d3dx11_41.dll;%(DelayLoadDLLs)</DelayLoadDLLs>
+    </Link>
+    <PreBuildEvent>
+      <Command>"$(SolutionDir)common\vsprops\preBuild.cmd" "$(ProjectDir)."</Command>
+    </PreBuildEvent>
+  </ItemDefinitionGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSdx.rc b/plugins/GSdx_legacy/GSdx.rc
new file mode 100644
index 0000000000..eb1f09d72b
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.rc
@@ -0,0 +1,375 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#ifndef APSTUDIO_INVOKED
+#include "targetver.h"
+#endif
+#define APSTUDIO_HIDDEN_SYMBOLS
+#include "windows.h"
+#undef APSTUDIO_HIDDEN_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// English (United States) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE // Only seen when APSTUDIO_INVOKED is defined: holds the name of the symbol header, resource.h.
+BEGIN
+    "resource.h\0"
+END
+
+2 TEXTINCLUDE // Only seen when APSTUDIO_INVOKED is defined: text of the read-only include section emitted at the top of this script (targetver.h and windows.h).
+BEGIN
+    "#ifndef APSTUDIO_INVOKED\r\n"
+    "#include ""targetver.h""\r\n"
+    "#endif\r\n"
+    "#define APSTUDIO_HIDDEN_SYMBOLS\r\n"
+    "#include ""windows.h""\r\n"
+    "#undef APSTUDIO_HIDDEN_SYMBOLS\r\n"
+    "\0"
+END
+
+3 TEXTINCLUDE // Only seen when APSTUDIO_INVOKED is defined: text of the compile-time include section emitted at the end of this script; every line ends in CRLF and the block is NUL-terminated.
+BEGIN
+    "#include ""res/tfx.fx""\r\n"
+    "#include ""res/convert.fx""\r\n"
+    "#include ""res/interlace.fx""\r\n"
+    "#include ""res/merge.fx""\r\n"
+    "#include ""res/fxaa.fx""\r\n"
+    "#include ""res/cs.fx""\r\n"
+    "#include ""res/shadeboost.fx""\r\n"
+    "#include ""res/tfx.cl""\r\n\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// RCDATA
+//
+
+IDR_TFX_FX              RCDATA                  "res\\tfx.fx" // Each RCDATA entry below embeds the named file from res\ as raw data under the given resource ID.
+IDR_CONVERT_FX          RCDATA                  "res\\convert.fx"
+IDR_INTERLACE_FX        RCDATA                  "res\\interlace.fx"
+IDR_MERGE_FX            RCDATA                  "res\\merge.fx"
+IDR_FXAA_FX             RCDATA                  "res\\fxaa.fx"
+IDR_CS_FX               RCDATA                  "res\\cs.fx"
+IDR_SHADEBOOST_FX       RCDATA                  "res\\shadeboost.fx"
+IDR_TFX_CL              RCDATA                  "res\\tfx.cl"
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Bitmap
+//
+
+IDB_LOGO9               BITMAP                  "res\\logo9.bmp" // Logo bitmaps referenced by the static SS_BITMAP controls in IDD_GPUCONFIG and IDD_CONFIG.
+IDB_LOGO10              BITMAP                  "res\\logo10.bmp"
+IDB_LOGOGL              BITMAP                  "res\\logo-ogl.bmp" // Referenced only by IDD_CONFIG in this script.
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Dialog
+//
+
+IDD_HACKS DIALOGEX 0, 0, 161, 200 // "Hacks Configuration" dialog: MSAA, skipdraw, offset/sprite hacks and texture-coordinate offsets, with a single OK button.
+STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
+CAPTION "Hacks Configuration"
+FONT 8, "MS Shell Dlg", 400, 0, 0x1
+BEGIN
+    DEFPUSHBUTTON   "OK",IDOK,88,181,66,14
+    GROUPBOX        "USE AT YOUR OWN RISK!",IDC_STATIC,7,7,147,171,0,WS_EX_TRANSPARENT
+    CONTROL         "Preload Data Frame",IDC_PRELOAD_GS,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,95,80,8
+    RTEXT           "MSAA:",IDC_MSAA_TEXT,62,20,22,8
+    RTEXT           "Skipdraw:",IDC_STATIC,52,36,32,8
+    EDITTEXT        IDC_SKIPDRAWHACKEDIT,88,33,58,14,ES_RIGHT | ES_AUTOHSCROLL
+    CONTROL         "",IDC_SKIPDRAWHACK,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,113,34,11,14 // UDS_AUTOBUDDY: buddies with the preceding control, so this must stay right after its EDITTEXT.
+    CONTROL         "Alpha",IDC_ALPHAHACK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,88,64,34,8
+    CONTROL         "Half-pixel Offset",IDC_OFFSETHACK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,51,70,8
+    COMBOBOX        IDC_MSAACB,88,17,58,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    CONTROL         "Wild Arms Offset",IDC_WILDHACK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,64,70,8
+    CONTROL         "Safe accurate blending",IDC_SAFE_FBMASK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,77,90,8 // Same origin (14,77) as IDC_ALPHASTENCIL below. NOTE(review): presumably only one of the two is shown at run time - confirm in the dialog code.
+    CONTROL         "Alpha Stencil",IDC_ALPHASTENCIL,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,77,57,8
+    CONTROL         "Align Sprite",IDC_ALIGN_SPRITE,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,88,51,58,8
+    RTEXT           "TC Offset X:",IDC_STATIC,40,146,44,8
+    EDITTEXT        IDC_TCOFFSETX2,88,144,58,14,ES_RIGHT | ES_AUTOHSCROLL
+    CONTROL         "",IDC_TCOFFSETX,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,135,145,11,14 // Buddy is the preceding IDC_TCOFFSETX2 edit.
+    EDITTEXT        IDC_TCOFFSETY2,88,162,58,14,ES_RIGHT | ES_AUTOHSCROLL
+    CONTROL         "",IDC_TCOFFSETY,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,135,162,11,14 // Buddy is the preceding IDC_TCOFFSETY2 edit.
+    RTEXT           "TC Offset Y:",IDC_STATIC,36,165,48,8
+    COMBOBOX        IDC_ROUND_SPRITE,88,109,58,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    RTEXT           "Round Sprite:",IDC_STATIC,39,111,45,8
+    RTEXT           "Sprite:",IDC_STATIC,62,127,22,8
+    COMBOBOX        IDC_SPRITEHACK,88,125,58,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+END
+
+IDD_SHADER DIALOGEX 0, 0, 248, 250 // "Shader Configuration" dialog: TV shader and FXAA selection, shade-boost sliders, and external shader file/config paths.
+STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
+CAPTION "Shader Configuration"
+FONT 8, "MS Shell Dlg", 400, 0, 0x1
+BEGIN
+    DEFPUSHBUTTON   "OK",IDOK,69,231,50,14
+    PUSHBUTTON      "Cancel",IDCANCEL,126,231,50,14 // Plain push button: OK must be the dialog's only default button, matching the other dialogs in this script.
+    CONTROL         "Enable Shade Boost",IDC_SHADEBOOST,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,46,90,10
+    LTEXT           "Saturation",IDC_STATIC,15,64,34,8
+    CONTROL         "",IDC_SATURATION_SLIDER,"msctls_trackbar32",TBS_BOTH | TBS_NOTICKS | WS_TABSTOP,55,62,165,15
+    LTEXT           "Brightness",IDC_STATIC,15,89,34,8
+    CONTROL         "",IDC_BRIGHTNESS_SLIDER,"msctls_trackbar32",TBS_BOTH | TBS_NOTICKS | WS_TABSTOP,55,87,165,15
+    LTEXT           "Contrast",IDC_STATIC,15,114,29,8
+    CONTROL         "",IDC_CONTRAST_SLIDER,"msctls_trackbar32",TBS_BOTH | TBS_NOTICKS | WS_TABSTOP,55,111,165,15
+    RTEXT           "100",IDC_SATURATION_TEXT,220,64,15,8 // The three "100" labels are placeholder text next to each slider. NOTE(review): presumably overwritten with the live slider value at run time - confirm.
+    RTEXT           "100",IDC_BRIGHTNESS_TEXT,220,89,15,8
+    RTEXT           "100",IDC_CONTRAST_TEXT,220,114,15,8
+    CONTROL         "Enable FXAA",IDC_FXAA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,188,6,54,10
+    CONTROL         "Enable External Shader",IDC_SHADER_FX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,15,156,90,10
+    LTEXT           "External Shader",IDC_SHADER_FX_TEXT,15,171,75,8
+    EDITTEXT        IDC_SHADER_FX_EDIT,15,179,170,14,ES_AUTOHSCROLL
+    PUSHBUTTON      "Browse",IDC_SHADER_FX_BUTTON,196,179,36,14
+    LTEXT           "External Shader Config",IDC_SHADER_FX_CONF_TEXT,15,196,75,8
+    PUSHBUTTON      "Browse",IDC_SHADER_FX_CONF_BUTTON,196,204,36,14
+    EDITTEXT        IDC_SHADER_FX_CONF_EDIT,15,204,170,14,ES_AUTOHSCROLL
+    COMBOBOX        IDC_TVSHADER,75,4,76,14,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "TV Shader (F7):",IDC_STATIC,15,6,55,8
+    GROUPBOX        "Shade-Boost Settings",IDC_STATIC,6,28,236,106,BS_CENTER
+    GROUPBOX        "External Shader Settings",IDC_STATIC,6,138,236,88,BS_CENTER
+END
+
+IDD_CAPTURE DIALOGEX 0, 0, 279, 71 // "Capture settings" dialog: output file name, codec selection/configuration, frame size and colorspace.
+STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
+CAPTION "Capture settings"
+FONT 8, "MS Shell Dlg", 400, 0, 0x1
+BEGIN
+    EDITTEXT        IDC_FILENAME,7,7,207,14,ES_AUTOHSCROLL
+    PUSHBUTTON      "Browse...",IDC_BROWSE,222,7,50,14
+    COMBOBOX        IDC_CODECS,7,27,207,122,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    PUSHBUTTON      "Config...",IDC_CONFIGURE,222,26,50,14
+    LTEXT           "Size:",IDC_STATIC,6,50,16,8
+    EDITTEXT        IDC_WIDTH,30,47,31,14,ES_RIGHT | ES_AUTOHSCROLL | ES_NUMBER // ES_NUMBER restricts typed input to digits for both size fields.
+    EDITTEXT        IDC_HEIGHT,64,47,31,14,ES_RIGHT | ES_AUTOHSCROLL | ES_NUMBER
+    PUSHBUTTON      "Cancel",IDCANCEL,169,47,50,14
+    DEFPUSHBUTTON   "OK",IDOK,221,47,50,14
+    COMBOBOX        IDC_COLORSPACE,102,47,48,32,CBS_DROPDOWNLIST | WS_TABSTOP
+END
+
+IDD_GPUCONFIG DIALOGEX 0, 0, 189, 199 // "Settings..." dialog holding the GPU-side options: resolution, renderer, filter, dithering, aspect ratio, internal resolution, extra threads and windowed mode.
+STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
+CAPTION "Settings..."
+FONT 8, "MS Shell Dlg", 400, 0, 0x1
+BEGIN
+    CONTROL         IDB_LOGO9,IDC_LOGO9,"Static",SS_BITMAP,7,7,175,44 // Occupies the same corner (7,7) as the IDB_LOGO10 control below. NOTE(review): presumably only one logo is visible at a time - confirm in the dialog code.
+    LTEXT           "Resolution:",IDC_STATIC,7,59,37,8
+    COMBOBOX        IDC_RESOLUTION,80,57,102,125,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Renderer:",IDC_STATIC,7,74,34,8
+    COMBOBOX        IDC_RENDERER,80,72,102,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Texture Filter (Del):",IDC_STATIC,7,90,64,8
+    COMBOBOX        IDC_FILTER,80,87,102,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Dithering (End):",IDC_STATIC,7,105,52,8
+    COMBOBOX        IDC_DITHERING,80,102,102,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Aspect Ratio (PgDn):",IDC_STATIC,7,120,68,8
+    COMBOBOX        IDC_ASPECTRATIO,80,117,102,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Extra Rend. Threads:",IDC_STATIC,7,157,70,8
+    EDITTEXT        IDC_SWTHREADS_EDIT,80,155,35,13,ES_AUTOHSCROLL | ES_NUMBER
+    CONTROL         "",IDC_SWTHREADS,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,99,161,11,14 // UDS_AUTOBUDDY: buddies with the preceding IDC_SWTHREADS_EDIT, so the order of these two lines matters.
+    DEFPUSHBUTTON   "OK",IDOK,43,178,50,14
+    PUSHBUTTON      "Cancel",IDCANCEL,96,178,50,14
+    CONTROL         IDB_LOGO10,IDC_LOGO11,"Static",SS_BITMAP,7,7,173,42
+    LTEXT           "Internal Resolution:",IDC_STATIC,7,135,64,8
+    COMBOBOX        IDC_SCALE,80,132,102,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    CONTROL         "Windowed",IDC_WINDOWED,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,129,157,49,10
+END
+
+IDD_CONFIG DIALOGEX 0, 0, 243, 373 // "GSdx Settings" dialog: adapter/renderer/interlacing selection, a hardware-mode group, a software-mode group, and a button that opens the shader configuration.
+STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
+CAPTION "GSdx Settings"
+FONT 8, "MS Shell Dlg", 400, 0, 0x1
+BEGIN
+    CONTROL         IDB_LOGO10,IDC_LOGO11,"Static",SS_BITMAP | SS_REALSIZECONTROL,35,6,173,42 // Three logo controls are stacked at nearly the same rectangle. NOTE(review): presumably the dialog code shows only the one matching the renderer - confirm.
+    CONTROL         IDB_LOGO9,IDC_LOGO9,"Static",SS_BITMAP | SS_REALSIZECONTROL,34,6,175,44
+    CONTROL         IDB_LOGOGL,IDC_LOGOGL,"Static",SS_BITMAP | SS_REALSIZECONTROL,34,6,175,44
+    LTEXT           "Adapter:",IDC_STATIC,6,57,30,8
+    COMBOBOX        IDC_ADAPTER,71,55,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Renderer:",IDC_STATIC,6,72,34,8
+    COMBOBOX        IDC_RENDERER,71,70,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "OpenCL Device:",IDC_OPENCL_TEXT,6,102,53,8
+    COMBOBOX        IDC_OPENCL_DEVICE,71,100,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Interlacing (F5):",IDC_STATIC,6,87,52,8
+    COMBOBOX        IDC_INTERLACE,71,85,166,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    GROUPBOX        "Hardware Mode Settings",IDC_STATIC,6,116,231,152,BS_CENTER
+    CONTROL         "Allow 8-Bit Textures",IDC_PALTEX,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,128,90,10
+    LTEXT           "Internal Resolution:",IDC_UPSCALE_MULTIPLIER_TEXT,22,144,79,8
+    COMBOBOX        IDC_UPSCALE_MULTIPLIER,105,142,127,98,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Custom Resolution:",IDC_CUSTOM_TEXT,22,160,79,8
+    EDITTEXT        IDC_RESX_EDIT,105,158,61,13,ES_AUTOHSCROLL | ES_NUMBER
+    CONTROL         "",IDC_RESX,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,145,158,11,14 // UDS_AUTOBUDDY: each up-down control buddies with the edit control declared immediately before it.
+    EDITTEXT        IDC_RESY_EDIT,171,158,61,13,ES_AUTOHSCROLL | ES_NUMBER
+    CONTROL         "",IDC_RESY,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,221,158,11,14
+    LTEXT           "Texture Filtering:",IDC_FILTER_TEXT,22,176,79,8
+    COMBOBOX        IDC_FILTER,105,174,127,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Anisotropic Filtering:",IDC_AFCOMBO_TEXT,22,192,79,8
+    COMBOBOX        IDC_AFCOMBO,105,190,127,118,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "CRC Hack Level:",IDC_CRC_LEVEL_TEXT,22,208,79,8
+    COMBOBOX        IDC_CRC_LEVEL,105,206,127,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    CONTROL         "Enable HW Hacks",IDC_HACKS_ENABLED,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,223,71,10
+    PUSHBUTTON      "Configure Hacks",IDC_HACKSBUTTON,105,221,127,14
+    CONTROL         "Accurate Date",IDC_ACCURATE_DATE,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,105,239,67,8 // Same origin (105,239) as IDC_FBA below. NOTE(review): presumably alternatives toggled by renderer - confirm.
+    CONTROL         "Hardware Depth",IDC_TC_DEPTH,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,239,72,8 // Same origin (11,239) as IDC_LOGZ below. NOTE(review): presumably alternatives toggled by renderer - confirm.
+    LTEXT           "Blending Unit Accuracy:",IDC_ACCURATE_BLEND_UNIT_TEXT,22,252,79,10
+    COMBOBOX        IDC_ACCURATE_BLEND_UNIT,105,251,127,63,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP
+    CONTROL         "Logarithmic Z",IDC_LOGZ,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,239,90,8
+    CONTROL         "Alpha Correction",IDC_FBA,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,105,239,74,8
+    CONTROL         "Mipmapping",IDC_MIPMAP,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,304,90,8
+    GROUPBOX        "Software Mode Settings",IDC_STATIC,6,275,231,40,BS_CENTER
+    CONTROL         "Edge Anti-aliasing (AA1)",IDC_AA1,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,11,287,91,10
+    LTEXT           "Rendering threads:",IDC_SWTHREADS_TEXT,124,288,80,8
+    EDITTEXT        IDC_SWTHREADS_EDIT,198,286,34,13,ES_AUTOHSCROLL | ES_NUMBER
+    CONTROL         "",IDC_SWTHREADS,"msctls_updown32",UDS_SETBUDDYINT | UDS_ALIGNRIGHT | UDS_AUTOBUDDY | UDS_ARROWKEYS | UDS_NOTHOUSANDS,221,285,11,14
+    PUSHBUTTON      "Configure",IDC_SHADEBUTTON,105,323,127,14
+    LTEXT           "Shader Configuration:",IDC_STATIC,11,326,90,14
+    DEFPUSHBUTTON   "OK",IDOK,69,353,50,14
+    PUSHBUTTON      "Cancel",IDCANCEL,125,353,50,14
+END
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// DESIGNINFO
+//
+
+#ifdef APSTUDIO_INVOKED
+GUIDELINES DESIGNINFO // Layout margins and guide lines per dialog; wrapped in #ifdef APSTUDIO_INVOKED, so it is not compiled into the binary.
+BEGIN
+    IDD_HACKS, DIALOG
+    BEGIN
+        LEFTMARGIN, 7
+        RIGHTMARGIN, 154
+        VERTGUIDE, 14
+        VERTGUIDE, 84
+        VERTGUIDE, 88
+        VERTGUIDE, 146
+        TOPMARGIN, 7
+        BOTTOMMARGIN, 177
+        HORZGUIDE, 51
+        HORZGUIDE, 64
+        HORZGUIDE, 77
+    END
+
+    IDD_SHADER, DIALOG
+    BEGIN
+        LEFTMARGIN, 6
+        RIGHTMARGIN, 242
+        TOPMARGIN, 7
+        BOTTOMMARGIN, 244
+    END
+
+    IDD_CAPTURE, DIALOG
+    BEGIN
+        VERTGUIDE, 6
+        VERTGUIDE, 30
+        VERTGUIDE, 271
+        HORZGUIDE, 54
+    END
+
+    IDD_GPUCONFIG, DIALOG
+    BEGIN
+        LEFTMARGIN, 7
+        RIGHTMARGIN, 182
+        VERTGUIDE, 80
+        VERTGUIDE, 182
+        TOPMARGIN, 7
+        BOTTOMMARGIN, 192
+    END
+
+    IDD_CONFIG, DIALOG
+    BEGIN
+        LEFTMARGIN, 6
+        RIGHTMARGIN, 237
+        VERTGUIDE, 11
+        VERTGUIDE, 22
+        VERTGUIDE, 101
+        VERTGUIDE, 105
+        VERTGUIDE, 232
+        TOPMARGIN, 6
+        BOTTOMMARGIN, 367
+    END
+END
+#endif    // APSTUDIO_INVOKED
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO // Version resource for the DLL: file and product version 1.0.1.9, with matching string values below.
+ FILEVERSION 1,0,1,9
+ PRODUCTVERSION 1,0,1,9
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L // 0x1 is VS_FF_DEBUG; set only when _DEBUG is defined.
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x4L // 0x4 is VOS__WINDOWS32.
+ FILETYPE 0x2L // 0x2 is VFT_DLL.
+ FILESUBTYPE 0x0L
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904e4" // Language 0x0409 with code page 0x04e4 (1252); must stay in sync with the "Translation" value below.
+        BEGIN
+            VALUE "Comments", "http://guliverkli.sf.net/"
+            VALUE "CompanyName", "Gabest"
+            VALUE "FileDescription", "GS plugin for ps2 emulators"
+            VALUE "FileVersion", "1, 0, 1, 9"
+            VALUE "InternalName", "GSdx.dll"
+            VALUE "LegalCopyright", "Copyright (c) 2007-2008 Gabest.  All rights reserved."
+            VALUE "OriginalFilename", "GSdx.dll"
+            VALUE "ProductName", "GSdx"
+            VALUE "ProductVersion", "1, 0, 1, 9"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1252
+    END
+END
+
+#endif    // English (United States) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+#include "res/tfx.fx"
+#include "res/convert.fx"
+#include "res/interlace.fx"
+#include "res/merge.fx"
+#include "res/fxaa.fx"
+#include "res/cs.fx"
+#include "res/shadeboost.fx"
+#include "res/tfx.cl"
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
+
diff --git a/plugins/GSdx_legacy/GSdx.vcxproj b/plugins/GSdx_legacy/GSdx.vcxproj
new file mode 100644
index 0000000000..33c82d1617
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.vcxproj
@@ -0,0 +1,2104 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug AVX2|Win32">
+      <Configuration>Debug AVX2</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug AVX2|x64">
+      <Configuration>Debug AVX2</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug AVX|Win32">
+      <Configuration>Debug AVX</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug AVX|x64">
+      <Configuration>Debug AVX</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug SSE4|Win32">
+      <Configuration>Debug SSE4</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug SSE4|x64">
+      <Configuration>Debug SSE4</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug SSSE3|Win32">
+      <Configuration>Debug SSSE3</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug SSSE3|x64">
+      <Configuration>Debug SSSE3</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release AVX2|Win32">
+      <Configuration>Release AVX2</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release AVX2|x64">
+      <Configuration>Release AVX2</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release AVX|Win32">
+      <Configuration>Release AVX</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release AVX|x64">
+      <Configuration>Release AVX</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release SSE4|Win32">
+      <Configuration>Release SSE4</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release SSE4|x64">
+      <Configuration>Release SSE4</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release SSSE3|Win32">
+      <Configuration>Release SSSE3</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release SSSE3|x64">
+      <Configuration>Release SSSE3</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectName>GSdx</ProjectName>
+    <ProjectGuid>{18E42F6F-3A62-41EE-B42F-79366C4F1E95}</ProjectGuid>
+    <RootNamespace>GSdx</RootNamespace>
+    <Keyword>Win32Proj</Keyword>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+    <PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse4.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx_vs2013.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse4.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx_vs2013.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\ssse3.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\ssse3.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets"> <!-- Plain Release / Win32 property sheets: optional per-user sheet, then project root, SSE2, common, release and x86 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets"> <!-- Plain Debug / Win32 property sheets: optional per-user sheet, then project root, SSE2, common, debug and x86 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x86.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'" Label="PropertySheets"> <!-- Release SSE4 / x64 property sheets: optional per-user sheet, then project root, SSE4, common, release and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse4.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'" Label="PropertySheets"> <!-- Release AVX / x64 property sheets: optional per-user sheet, then project root, AVX, common, release and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx_vs2013.props" /> <!-- NOTE(review): versioned sheet name (avx_vs2013), unlike the sse2/ssse3/sse4/avx2 sheets imported by the neighbouring groups; confirm it is intended. -->
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'" Label="PropertySheets"> <!-- Release AVX2 / x64 property sheets: optional per-user sheet, then project root, AVX2, common, release and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'" Label="PropertySheets"> <!-- Debug SSE4 / x64 property sheets: optional per-user sheet, then project root, SSE4, common, debug and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse4.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'" Label="PropertySheets"> <!-- Debug AVX / x64 property sheets: optional per-user sheet, then project root, AVX, common, debug and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx_vs2013.props" /> <!-- NOTE(review): versioned sheet name (avx_vs2013), unlike the sse2/ssse3/sse4/avx2 sheets imported by the neighbouring groups; confirm it is intended. -->
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'" Label="PropertySheets"> <!-- Debug AVX2 / x64 property sheets: optional per-user sheet, then project root, AVX2, common, debug and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\avx2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'" Label="PropertySheets"> <!-- Debug SSSE3 / x64 property sheets: optional per-user sheet, then project root, SSSE3, common, debug and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\ssse3.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'" Label="PropertySheets"> <!-- Release SSSE3 / x64 property sheets: optional per-user sheet, then project root, SSSE3, common, release and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\ssse3.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets"> <!-- Plain Release / x64 property sheets: optional per-user sheet, then project root, SSE2, common, release and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\release.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets"> <!-- Plain Debug / x64 property sheets: optional per-user sheet, then project root, SSE2, common, debug and x64 sheets, in that order. -->
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="vsprops\ProjectRootDir.props" />
+    <Import Project="vsprops\sse2.props" />
+    <Import Project="vsprops\common.props" />
+    <Import Project="vsprops\debug.props" />
+    <Import Project="vsprops\x64.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" /> <!-- Empty group labelled UserMacros; it defines no properties. -->
+  <PropertyGroup> <!-- Sets the single property _ProjectFileVersion for all configurations (the group has no Condition). -->
+    <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Default compile and link metadata for Debug / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Debug deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Default compile and link metadata for Debug / x64. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Default compile and link metadata for Release / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Release deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Default compile and link metadata for Release / x64. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'"> <!-- Default compile and link metadata for Release SSSE3 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Release deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'"> <!-- Default compile and link metadata for Release SSSE3 / x64. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'"> <!-- Default compile and link metadata for Debug SSSE3 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Debug deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'"> <!-- Default compile and link metadata for Debug SSSE3 / x64. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'"> <!-- Default compile and link metadata for Debug SSE4 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Debug deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'"> <!-- Default compile and link metadata for Debug AVX / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Debug deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'"> <!-- Default compile and link metadata for Debug AVX2 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Debug deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'"> <!-- Default MIDL, compile and link metadata for Debug SSE4 / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'"> <!-- Default MIDL, compile and link metadata for Debug AVX / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'"> <!-- Default MIDL, compile and link metadata for Debug AVX2 / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'"> <!-- Default compile and link metadata for Release SSE4 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Release deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'"> <!-- Default compile and link metadata for Release AVX / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Release deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'"> <!-- Default compile and link metadata for Release AVX2 / Win32. -->
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- Links with GSdx.def as the module-definition file, targets x86, and prepends the Release deps directory for $(Platform) to the inherited library search path. -->
+      <ModuleDefinitionFile>.\GSdx.def</ModuleDefinitionFile>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalLibraryDirectories>$(SvnRootDir)\deps\$(Platform)\Release;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'"> <!-- Default MIDL, compile and link metadata for Release SSE4 / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'"> <!-- Default MIDL, compile and link metadata for Release AVX / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'"> <!-- Default MIDL, compile and link metadata for Release AVX2 / x64. -->
+    <Midl> <!-- MIDL target environment is X64. -->
+      <TargetEnvironment>X64</TargetEnvironment>
+    </Midl>
+    <ClCompile> <!-- Sources consume the precompiled header (PrecompiledHeader = Use). -->
+      <PrecompiledHeader>Use</PrecompiledHeader>
+    </ClCompile>
+    <Link> <!-- DataExecutionPrevention is given an empty (whitespace-only) value. NOTE(review): no module-definition file or deps library directory is set here, unlike the Win32 groups; confirm they are supplied elsewhere. -->
+      <DataExecutionPrevention>
+      </DataExecutionPrevention>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="GLLoader.cpp" />
+    <ClCompile Include="GLState.cpp" />
+    <ClCompile Include="GPU.cpp" />
+    <ClCompile Include="GPUDrawScanline.cpp" />
+    <ClCompile Include="GPUDrawScanlineCodeGenerator.cpp" />
+    <ClCompile Include="GPULocalMemory.cpp" />
+    <ClCompile Include="GPURenderer.cpp" />
+    <ClCompile Include="GPURendererSW.cpp" />
+    <ClCompile Include="GPUSettingsDlg.cpp" />
+    <ClCompile Include="GPUSetupPrimCodeGenerator.cpp" />
+    <ClCompile Include="GPUState.cpp" />
+    <ClCompile Include="GS.cpp" />
+    <ClCompile Include="GSAlignedClass.cpp" />
+    <ClCompile Include="GSBlock.cpp" />
+    <ClCompile Include="GSCapture.cpp" />
+    <ClCompile Include="GSCaptureDlg.cpp" />
+    <ClCompile Include="GSClut.cpp" />
+    <ClCompile Include="GSCodeBuffer.cpp" />
+    <ClCompile Include="GSCrc.cpp" />
+    <ClCompile Include="GSDevice.cpp" />
+    <ClCompile Include="GSDevice11.cpp" />
+    <ClCompile Include="GSDevice9.cpp" />
+    <ClCompile Include="GSDeviceDX.cpp" />
+    <ClCompile Include="GSDeviceNull.cpp" />
+    <ClCompile Include="GSDeviceOGL.cpp" />
+    <ClCompile Include="GSDeviceSW.cpp" />
+    <ClCompile Include="GSDialog.cpp" />
+    <ClCompile Include="GSDirtyRect.cpp" />
+    <ClCompile Include="GSDrawingContext.cpp" />
+    <ClCompile Include="GSDrawScanline.cpp" />
+    <ClCompile Include="GSDrawScanlineCodeGenerator.cpp" />
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp"> <!-- Excluded from all ten Win32 configurations and from the six non-AVX x64 ones; only the AVX and AVX2 x64 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp"> <!-- Excluded from all ten Win32 configurations and from the AVX and AVX2 x64 ones; only the non-AVX x64 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp"> <!-- Excluded from all ten x64 configurations and from every Win32 configuration except Debug AVX and Release AVX (the AVX2 Win32 ones are excluded too). -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp"> <!-- Excluded from the eight Win32 configurations other than Debug AVX2 and Release AVX2. NOTE(review): no x64 configuration is excluded, unlike the sibling x86.avx.cpp and x86.cpp items; presumably the source guards itself with preprocessor checks, confirm. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp"> <!-- Excluded from all ten x64 configurations and from the AVX and AVX2 Win32 ones; only the non-AVX Win32 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDump.cpp" />
+    <ClCompile Include="GSdx.cpp" />
+    <ClCompile Include="GSFunctionMap.cpp" />
+    <ClCompile Include="GSLocalMemory.cpp" />
+    <ClCompile Include="GSPerfMon.cpp" />
+    <ClCompile Include="GSPng.cpp" />
+    <ClCompile Include="GSRasterizer.cpp"> <!-- Built in every configuration; three Release configurations additionally request an assembly-with-source listing. NOTE(review): looks like a leftover inspection aid, confirm it is still wanted. -->
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">AssemblyAndSourceCode</AssemblerOutput>
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">AssemblyAndSourceCode</AssemblerOutput>
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">AssemblyAndSourceCode</AssemblerOutput>
+    </ClCompile>
+    <ClCompile Include="GSRenderer.cpp" />
+    <ClCompile Include="GSRendererCL.cpp" />
+    <ClCompile Include="GSRendererCS.cpp" />
+    <ClCompile Include="GSRendererDX.cpp" />
+    <ClCompile Include="GSRendererDX11.cpp" />
+    <ClCompile Include="GSRendererDX9.cpp" />
+    <ClCompile Include="GSRendererHW.cpp" />
+    <ClCompile Include="GSRendererNull.cpp" />
+    <ClCompile Include="GSRendererOGL.cpp" />
+    <ClCompile Include="GSRendererSW.cpp" />
+    <ClCompile Include="GSSetting.cpp" />
+    <ClCompile Include="GSSettingsDlg.cpp" />
+    <ClCompile Include="GSSetupPrimCodeGenerator.cpp" />
+    <ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp"> <!-- Excluded from all ten Win32 configurations and from the six non-AVX x64 ones; only the AVX and AVX2 x64 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp"> <!-- Excluded from all ten Win32 configurations and from the AVX and AVX2 x64 ones; only the non-AVX x64 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp"> <!-- Excluded from all ten x64 configurations and from the six non-AVX Win32 ones; the AVX and AVX2 Win32 configurations build this file. NOTE(review): GSDrawScanlineCodeGenerator.x86.avx.cpp also excludes the AVX2 Win32 configurations; confirm the difference is intended. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" /> <!-- NOTE(review): no ExcludedFromBuild entries, so this file is compiled in every configuration, unlike the other GSSetupPrimCodeGenerator variants; presumably the source guards itself with preprocessor checks, confirm. -->
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp"> <!-- Excluded from all ten x64 configurations and from the AVX and AVX2 Win32 ones; only the non-AVX Win32 configurations build this file. -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSShaderOGL.cpp" />
+    <ClCompile Include="GSState.cpp" />
+    <ClCompile Include="GSTables.cpp" />
+    <ClCompile Include="GSTexture.cpp" />
+    <ClCompile Include="GSTexture11.cpp" />
+    <ClCompile Include="GSTexture9.cpp" />
+    <ClCompile Include="GSTextureOGL.cpp" />
+    <ClCompile Include="GSTextureCache.cpp" />
+    <ClCompile Include="GSTextureCache11.cpp" />
+    <ClCompile Include="GSTextureCache9.cpp" />
+    <ClCompile Include="GSTextureCacheOGL.cpp" />
+    <ClCompile Include="GSTextureCacheSW.cpp" />
+    <ClCompile Include="GSTextureFX11.cpp" />
+    <ClCompile Include="GSTextureFX9.cpp" />
+    <ClCompile Include="GSTextureFXOGL.cpp" />
+    <ClCompile Include="GSTextureNull.cpp" />
+    <ClCompile Include="GSTextureSW.cpp" />
+    <ClCompile Include="GSThread.cpp" />
+    <ClCompile Include="GSUtil.cpp" />
+    <ClCompile Include="GSVector.cpp">
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">AssemblyAndSourceCode</AssemblerOutput>
+      <AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">AssemblyAndSourceCode</AssemblerOutput>
+    </ClCompile>
+    <ClCompile Include="GSVertexList.cpp" />
+    <ClCompile Include="GSVertexSW.cpp" />
+    <ClCompile Include="GSVertexTrace.cpp" />
+    <ClCompile Include="GSWnd.cpp" />
+    <ClCompile Include="GSWndDX.cpp" />
+    <ClCompile Include="GSWndWGL.cpp" />
+    <ClCompile Include="stdafx.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amextra.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amfilter.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amvideo.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\combase.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\ctlutil.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\ddmm.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\mtype.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\outputq.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\pstream.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\pullpin.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\refclock.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\renbase.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\schedule.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\seekpt.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\source.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\strmctl.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\sysclock.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\transfrm.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\transip.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\vtrans.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxdebug.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxlist.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxutil.cpp">
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">
+      </PrecompiledHeader>
+      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </PrecompiledHeader>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="config.h" />
+    <ClInclude Include="GLLoader.h" />
+    <ClInclude Include="GLState.h" />
+    <ClInclude Include="GPU.h" />
+    <ClInclude Include="GPUDrawingEnvironment.h" />
+    <ClInclude Include="GPUDrawScanline.h" />
+    <ClInclude Include="GPUDrawScanlineCodeGenerator.h" />
+    <ClInclude Include="GPULocalMemory.h" />
+    <ClInclude Include="GPURenderer.h" />
+    <ClInclude Include="GPURendererSW.h" />
+    <ClInclude Include="GPUScanlineEnvironment.h" />
+    <ClInclude Include="GPUSettingsDlg.h" />
+    <ClInclude Include="GPUSetupPrimCodeGenerator.h" />
+    <ClInclude Include="GPUState.h" />
+    <ClInclude Include="GPUVertex.h" />
+    <ClInclude Include="GS.h" />
+    <ClInclude Include="GSAlignedClass.h" />
+    <ClInclude Include="GSBlock.h" />
+    <ClInclude Include="GSCapture.h" />
+    <ClInclude Include="GSCaptureDlg.h" />
+    <ClInclude Include="GSClut.h" />
+    <ClInclude Include="GSCodeBuffer.h" />
+    <ClInclude Include="GSCrc.h" />
+    <ClInclude Include="GSDevice.h" />
+    <ClInclude Include="GSDevice11.h" />
+    <ClInclude Include="GSDevice9.h" />
+    <ClInclude Include="GSDeviceDX.h" />
+    <ClInclude Include="GSDeviceNull.h" />
+    <ClInclude Include="GSDeviceOGL.h" />
+    <ClInclude Include="GSDeviceSW.h" />
+    <ClInclude Include="GSDialog.h" />
+    <ClInclude Include="GSDirtyRect.h" />
+    <ClInclude Include="GSDrawingContext.h" />
+    <ClInclude Include="GSDrawingEnvironment.h" />
+    <ClInclude Include="GSDrawScanline.h" />
+    <ClInclude Include="GSDrawScanlineCodeGenerator.h" />
+    <ClInclude Include="GSDump.h" />
+    <ClInclude Include="GSdx.h" />
+    <ClInclude Include="GSFunctionMap.h" />
+    <ClInclude Include="GSLocalMemory.h" />
+    <ClInclude Include="GSPerfMon.h" />
+    <ClInclude Include="GSPng.h" />
+    <ClInclude Include="GSRasterizer.h" />
+    <ClInclude Include="GSRenderer.h" />
+    <ClInclude Include="GSRendererCL.h" />
+    <ClInclude Include="GSRendererCS.h" />
+    <ClInclude Include="GSRendererDX.h" />
+    <ClInclude Include="GSRendererDX11.h" />
+    <ClInclude Include="GSRendererDX9.h" />
+    <ClInclude Include="GSRendererHW.h" />
+    <ClInclude Include="GSRendererNull.h" />
+    <ClInclude Include="GSRendererOGL.h" />
+    <ClInclude Include="GSRendererSW.h" />
+    <ClInclude Include="GSScanlineEnvironment.h" />
+    <ClInclude Include="GSSetting.h" />
+    <ClInclude Include="GSSettingsDlg.h" />
+    <ClInclude Include="GSSetupPrimCodeGenerator.h" />
+    <ClInclude Include="GSShaderOGL.h" />
+    <ClInclude Include="GSState.h" />
+    <ClInclude Include="GSTables.h" />
+    <ClInclude Include="GSTexture.h" />
+    <ClInclude Include="GSTexture11.h" />
+    <ClInclude Include="GSTexture9.h" />
+    <ClInclude Include="GSTextureOGL.h" />
+    <ClInclude Include="GSTextureCache.h" />
+    <ClInclude Include="GSTextureCache11.h" />
+    <ClInclude Include="GSTextureCache9.h" />
+    <ClInclude Include="GSTextureCacheOGL.h" />
+    <ClInclude Include="GSTextureCacheSW.h" />
+    <ClInclude Include="GSTextureNull.h" />
+    <ClInclude Include="GSTextureSW.h" />
+    <ClInclude Include="GSThread.h" />
+    <ClInclude Include="GSUniformBufferOGL.h" />
+    <ClInclude Include="GSUtil.h" />
+    <ClInclude Include="GSVector.h" />
+    <ClInclude Include="GSVertex.h" />
+    <ClInclude Include="GSVertexArrayOGL.h" />
+    <ClInclude Include="GSVertexHW.h" />
+    <ClInclude Include="GSVertexList.h" />
+    <ClInclude Include="GSVertexSW.h" />
+    <ClInclude Include="GSVertexTrace.h" />
+    <ClInclude Include="GSWnd.h" />
+    <ClInclude Include="GSWndDX.h" />
+    <ClInclude Include="GSWndWGL.h" />
+    <ClInclude Include="stdafx.h" />
+    <ClInclude Include="svnrev.h" />
+    <ClInclude Include="resource.h" />
+    <ClInclude Include="baseclasses\amextra.h" />
+    <ClInclude Include="baseclasses\amfilter.h" />
+    <ClInclude Include="baseclasses\cache.h" />
+    <ClInclude Include="baseclasses\combase.h" />
+    <ClInclude Include="baseclasses\ctlutil.h" />
+    <ClInclude Include="baseclasses\ddmm.h" />
+    <ClInclude Include="baseclasses\dsschedule.h" />
+    <ClInclude Include="baseclasses\fourcc.h" />
+    <ClInclude Include="baseclasses\measure.h" />
+    <ClInclude Include="baseclasses\msgthrd.h" />
+    <ClInclude Include="baseclasses\mtype.h" />
+    <ClInclude Include="baseclasses\outputq.h" />
+    <ClInclude Include="baseclasses\pstream.h" />
+    <ClInclude Include="baseclasses\pullpin.h" />
+    <ClInclude Include="baseclasses\refclock.h" />
+    <ClInclude Include="baseclasses\reftime.h" />
+    <ClInclude Include="baseclasses\renbase.h" />
+    <ClInclude Include="baseclasses\schedule.h" />
+    <ClInclude Include="baseclasses\seekpt.h" />
+    <ClInclude Include="baseclasses\source.h" />
+    <ClInclude Include="baseclasses\streams.h" />
+    <ClInclude Include="baseclasses\strmctl.h" />
+    <ClInclude Include="baseclasses\sysclock.h" />
+    <ClInclude Include="baseclasses\transfrm.h" />
+    <ClInclude Include="baseclasses\transip.h" />
+    <ClInclude Include="baseclasses\vtrans.h" />
+    <ClInclude Include="baseclasses\wxdebug.h" />
+    <ClInclude Include="baseclasses\wxlist.h" />
+    <ClInclude Include="baseclasses\wxutil.h" />
+    <ClInclude Include="targetver.h" />
+    <ClInclude Include="xbyak\xbyak.h" />
+    <ClInclude Include="xbyak\xbyak_bin2hex.h" />
+    <ClInclude Include="xbyak\xbyak_mnemonic.h" />
+    <ClInclude Include="xbyak\xbyak_util.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="GSdx.def" />
+    <None Include="res\cs.fx" />
+    <None Include="res\fxaa.fx" />
+    <None Include="res\logo10.bmp" />
+    <None Include="res\logo9.bmp" />
+    <None Include="res\logo_ogl.bmp" />
+    <None Include="res\convert.fx" />
+    <None Include="res\interlace.fx" />
+    <None Include="res\merge.fx" />
+    <None Include="res\shadeboost.fx" />
+    <None Include="res\tfx.cl" />
+    <None Include="res\tfx.fx" />
+    <None Include="baseclasses\activex.rcv" />
+    <None Include="baseclasses\activex.ver" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="GSdx.rc" />
+  </ItemGroup>
+  <ItemGroup>
+    <Image Include="res\logo-ogl.bmp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\..\3rdparty\libpng\projects\vstudio\libpng\libpng.vcxproj">
+      <Project>{d6973076-9317-4ef2-a0b8-b7a18ac0713e}</Project>
+    </ProjectReference>
+    <ProjectReference Include="..\..\3rdparty\opencl\opencl.vcxproj">
+      <Project>{d80d4a75-c385-41bd-ae62-83d2e2b595a7}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+  <ProjectExtensions>
+    <VisualStudio>
+      <UserProperties RESOURCE_FILE="GSdx.rc" />
+    </VisualStudio>
+  </ProjectExtensions>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/GSdx.vcxproj.filters b/plugins/GSdx_legacy/GSdx.vcxproj.filters
new file mode 100644
index 0000000000..8e086a0146
--- /dev/null
+++ b/plugins/GSdx_legacy/GSdx.vcxproj.filters
@@ -0,0 +1,778 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav</Extensions>
+    </Filter>
+    <Filter Include="Shaders">
+      <UniqueIdentifier>{6d029896-e5fd-4b46-8576-52d7d90125e6}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Baseclasses">
+      <UniqueIdentifier>{3c2d6a4a-ff5a-420d-a0f7-4c17cc5c19df}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Xbyak">
+      <UniqueIdentifier>{d6fcc23b-bc82-4390-8a9a-928910bc4123}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="GS.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSAlignedClass.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSBlock.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSCapture.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSCaptureDlg.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSClut.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSCodeBuffer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSCrc.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDevice.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDevice11.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDevice9.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDeviceNull.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDeviceOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDialog.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDirtyRect.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanline.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDump.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSdx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSFunctionMap.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSLocalMemory.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSPerfMon.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRasterizer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRenderer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererDX.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererDX11.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererDX9.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererHW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererNull.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetting.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSettingsDlg.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSShaderOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSState.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTables.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTexture.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTexture11.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTexture9.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureCache.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureCache11.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureCache9.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureCacheOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureCacheSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureFX11.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureFX9.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureFXOGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureNull.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSThread.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSUtil.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSVector.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSVertexList.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSVertexSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSVertexTrace.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSWnd.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSWndDX.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSWndWGL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="stdafx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amextra.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amfilter.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\amvideo.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\combase.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\ctlutil.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\ddmm.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\mtype.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\outputq.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\pstream.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\pullpin.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\refclock.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\renbase.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\schedule.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\seekpt.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\source.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\strmctl.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\sysclock.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\transfrm.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\transip.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\vtrans.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxdebug.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxlist.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="baseclasses\wxutil.cpp">
+      <Filter>Baseclasses</Filter>
+    </ClCompile>
+    <ClCompile Include="GLLoader.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GLState.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPU.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUDrawScanline.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUDrawScanlineCodeGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPULocalMemory.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPURenderer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPURendererSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUSettingsDlg.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUSetupPrimCodeGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GPUState.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDeviceDX.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSTextureSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDeviceSW.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererCS.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSRendererCL.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSDrawingContext.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSPng.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="GLLoader.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GLState.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GS.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSAlignedClass.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSBlock.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSCapture.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSCaptureDlg.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSClut.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSCodeBuffer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSCrc.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDevice.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDevice11.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDevice9.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDeviceDX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDeviceNull.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDeviceOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDialog.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDirtyRect.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDrawingContext.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDrawingEnvironment.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDrawScanline.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDrawScanlineCodeGenerator.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDump.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSdx.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSFunctionMap.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSLocalMemory.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSPerfMon.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRasterizer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRenderer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererDX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererDX11.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererDX9.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererHW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererNull.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSScanlineEnvironment.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSSetting.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSSettingsDlg.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSSetupPrimCodeGenerator.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSShaderOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSState.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTables.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTexture.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTexture11.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTexture9.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureCache.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureCache11.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureCache9.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureCacheOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureCacheSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureNull.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSThread.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSUniformBufferOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSUtil.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVector.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertex.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertexArrayOGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertexHW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertexList.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertexSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSVertexTrace.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSWnd.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSWndDX.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSWndWGL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="stdafx.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="svnrev.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="resource.h">
+      <Filter>Resource Files</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\amextra.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\amfilter.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\cache.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\combase.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\ctlutil.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\ddmm.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\dsschedule.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\fourcc.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\measure.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\msgthrd.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\mtype.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\outputq.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\pstream.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\pullpin.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\refclock.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\reftime.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\renbase.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\schedule.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\seekpt.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\source.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\streams.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\strmctl.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\sysclock.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\transfrm.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\transip.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\vtrans.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\wxdebug.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\wxlist.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="baseclasses\wxutil.h">
+      <Filter>Baseclasses</Filter>
+    </ClInclude>
+    <ClInclude Include="xbyak\xbyak.h">
+      <Filter>Xbyak</Filter>
+    </ClInclude>
+    <ClInclude Include="xbyak\xbyak_bin2hex.h">
+      <Filter>Xbyak</Filter>
+    </ClInclude>
+    <ClInclude Include="xbyak\xbyak_mnemonic.h">
+      <Filter>Xbyak</Filter>
+    </ClInclude>
+    <ClInclude Include="xbyak\xbyak_util.h">
+      <Filter>Xbyak</Filter>
+    </ClInclude>
+    <ClInclude Include="GPU.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUDrawingEnvironment.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUDrawScanline.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUDrawScanlineCodeGenerator.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPULocalMemory.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPURenderer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPURendererSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUScanlineEnvironment.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUSettingsDlg.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUSetupPrimCodeGenerator.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUState.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GPUVertex.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSTextureSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSDeviceSW.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="config.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererCS.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="targetver.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSRendererCL.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="GSPng.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="res\logo10.bmp">
+      <Filter>Resource Files</Filter>
+    </None>
+    <None Include="res\logo9.bmp">
+      <Filter>Resource Files</Filter>
+    </None>
+    <None Include="res\convert.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\interlace.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\merge.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\tfx.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="baseclasses\activex.rcv">
+      <Filter>Baseclasses</Filter>
+    </None>
+    <None Include="baseclasses\activex.ver">
+      <Filter>Baseclasses</Filter>
+    </None>
+    <None Include="GSdx.def" />
+    <None Include="res\fxaa.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\shadeboost.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\cs.fx">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\tfx.cl">
+      <Filter>Shaders</Filter>
+    </None>
+    <None Include="res\logo_ogl.bmp">
+      <Filter>Resource Files</Filter>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="GSdx.rc">
+      <Filter>Resource Files</Filter>
+    </ResourceCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <Image Include="res\logo-ogl.bmp">
+      <Filter>Resource Files</Filter>
+    </Image>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/MurmurHash3.cpp b/plugins/GSdx_legacy/MurmurHash3.cpp
new file mode 100644
index 0000000000..66c8f08079
--- /dev/null
+++ b/plugins/GSdx_legacy/MurmurHash3.cpp
@@ -0,0 +1,336 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "stdafx.h"
+#include "MurmurHash3.h"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE	__forceinline
+
+#include <stdlib.h>
+
+#define ROTL32(x,y)	_rotl(x,y)
+#define ROTL64(x,y)	_rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else	// defined(_MSC_VER)
+
+#define	FORCE_INLINE inline __attribute__((always_inline))
+
+inline uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+  return (x << (r & 31)) | (x >> (-r & 31)); // count reduced mod 32; r == 0 no longer shifts right by 32 (undefined)
+}
+// rotl32/rotl64 rotate x left by r bits; both shift counts are masked so they always stay below the operand width.
+inline uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+  return (x << (r & 63)) | (x >> (-r & 63)); // count reduced mod 64; r == 0 no longer shifts right by 64 (undefined)
+}
+
+#define	ROTL32(x,y)	rotl32(x,y)
+#define ROTL64(x,y)	rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
+{
+  return *(p + i); // direct load of the 32-bit word at offset i from p; no byte swapping is done
+}
+
+FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
+{
+  return *(p + i); // direct load of the 64-bit word at offset i from p; no byte swapping is done
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32 ( uint32_t h )
+{
+  // Xor-shift by 16, multiply, xor-shift by 13, multiply, xor-shift by 16.
+  // All arithmetic is unsigned 32-bit, so the products wrap modulo 2^32.
+  h = (h ^ (h >> 16)) * 0x85ebca6b;
+  h = (h ^ (h >> 13)) * 0xc2b2ae35;
+  h = h ^ (h >> 16);
+
+  return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64 ( uint64_t k )
+{
+  // Xor-shift by 33, multiply, xor-shift by 33, multiply, xor-shift by 33.
+  // All arithmetic is unsigned 64-bit, so the products wrap modulo 2^64.
+  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
+  k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+  k = k ^ (k >> 33);
+
+  return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32 ( const void * key, int len,
+                          uint32_t seed, void * out )
+{
+  // Hash len bytes at key with seed; the 32-bit result is stored through out.
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  const int nblocks = len / 4;
+  const uint8_t * tail = (const uint8_t*)key + nblocks*4;
+
+  uint32_t hash = seed;
+
+  //----------
+  // body: the 4-byte words end where the tail starts, so they are read
+  // with a negative index that counts up to zero
+
+  const uint32_t * blocks = (const uint32_t *)tail;
+  int i = -nblocks;
+
+  while(i != 0)
+  {
+    uint32_t word = getblock32(blocks,i);
+    i++;
+
+    word *= c1;
+    word = ROTL32(word,15);
+    word *= c2;
+
+    hash ^= word;
+    hash = ROTL32(hash,13);
+    hash = hash*5+0xe6546b64;
+  }
+
+  //----------
+  // tail: fold in the len & 3 bytes that follow the last whole word
+
+  const int rem = len & 3;
+  uint32_t last = 0;
+
+  if(rem >= 3) last ^= tail[2] << 16;
+  if(rem >= 2) last ^= tail[1] << 8;
+  if(rem >= 1)
+  {
+    last ^= tail[0];
+    last *= c1; last = ROTL32(last,15); last *= c2; hash ^= last;
+  }
+
+  //----------
+  // finalization: mix in the length, then run the final avalanche
+
+  hash ^= len;
+  *(uint32_t*)out = fmix32(hash);
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_128 ( const void * key, const int len,
+                           uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+  // Hashes len bytes at key into four 32-bit lanes (all seeded with seed) that are written to out as uint32_t[4].
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b; 
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5; 
+  const uint32_t c4 = 0xa1e38b93;
+
+  //----------
+  // body: one 16-byte block (four words) per iteration; blocks points just past the last block and i counts up from -nblocks to 0
+
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+
+  for(int i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i*4+0);
+    uint32_t k2 = getblock32(blocks,i*4+1);
+    uint32_t k3 = getblock32(blocks,i*4+2);
+    uint32_t k4 = getblock32(blocks,i*4+3);
+
+    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+  }
+
+  //----------
+  // tail: the len & 15 bytes after the last block; there are no breaks, so each case falls through to all the cases below it
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint32_t k1 = 0;
+  uint32_t k2 = 0;
+  uint32_t k3 = 0;
+  uint32_t k4 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k4 ^= tail[14] << 16;
+  case 14: k4 ^= tail[13] << 8;
+  case 13: k4 ^= tail[12] << 0;
+           k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+  case 12: k3 ^= tail[11] << 24;
+  case 11: k3 ^= tail[10] << 16;
+  case 10: k3 ^= tail[ 9] << 8;
+  case  9: k3 ^= tail[ 8] << 0;
+           k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+  case  8: k2 ^= tail[ 7] << 24;
+  case  7: k2 ^= tail[ 6] << 16;
+  case  6: k2 ^= tail[ 5] << 8;
+  case  5: k2 ^= tail[ 4] << 0;
+           k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+  case  4: k1 ^= tail[ 3] << 24;
+  case  3: k1 ^= tail[ 2] << 16;
+  case  2: k1 ^= tail[ 1] << 8;
+  case  1: k1 ^= tail[ 0] << 0;
+           k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization: xor the length into every lane, cross-add the lanes, run fmix32 on each, then cross-add again
+
+  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  h1 = fmix32(h1);
+  h2 = fmix32(h2);
+  h3 = fmix32(h3);
+  h4 = fmix32(h4);
+
+  h1 += h2; h1 += h3; h1 += h4;
+  h2 += h1; h3 += h1; h4 += h1;
+
+  ((uint32_t*)out)[0] = h1;
+  ((uint32_t*)out)[1] = h2;
+  ((uint32_t*)out)[2] = h3;
+  ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x64_128 ( const void * key, const int len,
+                           const uint32_t seed, void * out )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+  // Hashes len bytes at key into two 64-bit lanes (both seeded with seed) that are written to out as uint64_t[2].
+  uint64_t h1 = seed;
+  uint64_t h2 = seed;
+
+  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+  //----------
+  // body: one 16-byte block (two 64-bit words) per iteration, read forwards from the start of the data
+
+  const uint64_t * blocks = (const uint64_t *)(data);
+
+  for(int i = 0; i < nblocks; i++)
+  {
+    uint64_t k1 = getblock64(blocks,i*2+0);
+    uint64_t k2 = getblock64(blocks,i*2+1);
+
+    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+  }
+
+  //----------
+  // tail: the len & 15 bytes after the last block; no breaks, so each case falls through. Bytes are widened to 64 bits before shifting.
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+  uint64_t k1 = 0;
+  uint64_t k2 = 0;
+
+  switch(len & 15)
+  {
+  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
+  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
+  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
+  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
+  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
+  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
+  case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
+           k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+  case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
+  case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
+  case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
+  case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
+  case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
+  case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
+  case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
+  case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
+           k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization: xor the length into both lanes, cross-add them, run fmix64 on each, then cross-add again
+
+  h1 ^= len; h2 ^= len;
+
+  h1 += h2;
+  h2 += h1;
+
+  h1 = fmix64(h1);
+  h2 = fmix64(h2);
+
+  h1 += h2;
+  h2 += h1;
+
+  ((uint64_t*)out)[0] = h1;
+  ((uint64_t*)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
+
diff --git a/plugins/GSdx_legacy/MurmurHash3.h b/plugins/GSdx_legacy/MurmurHash3.h
new file mode 100644
index 0000000000..de12fb71fb
--- /dev/null
+++ b/plugins/GSdx_legacy/MurmurHash3.h
@@ -0,0 +1,11 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+//-----------------------------------------------------------------------------
+
+#pragma once
+#include <stdint.h>  // uint32_t: the header no longer relies on stdafx.h having been included first
+// Each function hashes len bytes at key with seed. out receives 4 bytes (x86_32) or 16 bytes (both _128 variants).
+void MurmurHash3_x86_32  ( const void * key, int len, uint32_t seed, void * out );
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
diff --git a/plugins/GSdx_legacy/baseclasses/activex.rcv b/plugins/GSdx_legacy/baseclasses/activex.rcv
new file mode 100644
index 0000000000..3894f65096
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/activex.rcv
@@ -0,0 +1,142 @@
+//------------------------------------------------------------------------------
+// File: Activex.rcv
+//
+// Desc: DirectShow base classes - this file defines the version resource 
+//       used for the application.
+//
+//       NOTE: All strings MUST have an explicit \0 for termination!
+//
+//       For a complete description of the Version Resource, search the
+//       Microsoft Developer's Network (MSDN) CD-ROM for 'version resource'.
+//
+// Copyright (c) 1992 - 2002, Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef _ACTIVEX_RCV_
+#define _ACTIVEX_RCV_
+
+#ifndef WIN32
+#define WIN32
+#endif
+#include <winver.h>
+
+#ifndef _ACTIVEX_VER_
+#include <activex.ver>
+#endif
+
+//
+// Version flags.
+//
+// OFFICIAL and FINAL should be defined when appropriate.
+//
+
+#ifndef OFFICIAL
+#define VER_PRIVATEBUILD      VS_FF_PRIVATEBUILD
+#else
+#define VER_PRIVATEBUILD      0
+#endif
+
+#ifndef FINAL
+#define VER_PRERELEASE        VS_FF_PRERELEASE
+#else
+#define VER_PRERELEASE        0
+#endif
+
+#ifdef DEBUG
+#define VER_DEBUG             VS_FF_DEBUG
+#else
+#define VER_DEBUG             0
+#endif
+
+//
+// Version definitions
+//
+
+#define VERSION_RES_FLAGSMASK   0x0030003FL
+#define VERSION_RES_FLAGS       (VER_PRIVATEBUILD|VER_PRERELEASE|VER_DEBUG)
+
+#ifndef VERSION_RES_OS
+#define VERSION_RES_OS          VOS__WINDOWS32
+#endif
+
+#ifndef VERSION_RES_TYPE
+#define VERSION_RES_TYPE        VFT_DLL
+#endif
+
+#ifndef VERSION_RES_SUBTYPE
+#define VERSION_RES_SUBTYPE     VFT2_UNKNOWN
+#endif
+
+#define VERSION_RES_LANGUAGE    0x409
+
+#ifndef VERSION_RES_CHARSET
+#ifdef UNICODE
+#define VERSION_RES_CHARSET     1200
+#else
+#define VERSION_RES_CHARSET     1252
+#endif
+#endif
+
+#ifndef VERSION_RES_ACTIVEX
+#define VERSION_RES_ACTIVEX     "Filter dll\0"
+#endif
+
+#ifdef  AMOVIE_SELF_REGISTER
+#ifndef OLE_SELF_REGISTER
+#define OLE_SELF_REGISTER
+#endif
+#endif
+
+#ifdef  OLE_SELF_REGISTER
+#ifdef  AMOVIE_SELF_REGISTER
+#define VERSION_RES_SELFREGISTER "AM20\0"
+#else
+#define VERSION_RES_SELFREGISTER "\0"
+#endif
+#endif
+
+//
+// Version resource
+//
+
+VS_VERSION_INFO VERSIONINFO
+FILEVERSION     VERSION_RES_MAJOR_VER, VERSION_RES_MINOR_VER, 0, VERSION_RES_BUILD
+PRODUCTVERSION  VERSION_RES_MAJOR_VER, VERSION_RES_MINOR_VER, 0, VERSION_RES_BUILD
+FILEFLAGSMASK   VERSION_RES_FLAGSMASK
+FILEFLAGS       VERSION_RES_FLAGS
+FILEOS          VERSION_RES_OS
+FILETYPE        VERSION_RES_TYPE
+FILESUBTYPE     VERSION_RES_SUBTYPE
+BEGIN
+  BLOCK "StringFileInfo"
+  BEGIN
+    BLOCK "040904E4"
+    BEGIN
+      VALUE "CompanyName",        VERSION_RES_COMPANY_NAME
+      VALUE "Comment",            VERSION_RES_COMMENT
+      VALUE "FileDescription",    VERSION_RES_BIN_DESCRIPTION
+      VALUE "FileVersion",        VERSION_RES_STRING
+      VALUE "InternalName",       VERSION_RES_BIN_NAME
+      VALUE "LegalCopyright",     VERSION_RES_COPYRIGHT
+      VALUE "OriginalFilename",   VERSION_RES_BIN_NAME
+      VALUE "ProductName",        VERSION_RES_PRODUCT_NAME
+#ifdef DEBUG
+      VALUE "ProductVersion",     VERSION_RES_STRING_D
+#else
+      VALUE "ProductVersion",     VERSION_RES_STRING
+#endif
+      VALUE "ActiveMovie",        VERSION_RES_ACTIVEX
+#ifdef OLE_SELF_REGISTER
+      VALUE "OLESelfRegister",    VERSION_RES_SELFREGISTER
+#endif
+    END
+  END
+  BLOCK "VarFileInfo"
+  BEGIN
+    VALUE "Translation", VERSION_RES_LANGUAGE, VERSION_RES_CHARSET
+  END
+END
+
+#endif
+// _ACTIVEX_RCV_
diff --git a/plugins/GSdx_legacy/baseclasses/activex.ver b/plugins/GSdx_legacy/baseclasses/activex.ver
new file mode 100644
index 0000000000..f948934679
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/activex.ver
@@ -0,0 +1,56 @@
+//------------------------------------------------------------------------------
+// File: Activex.ver
+//
+// Desc: DirectShow base classes - common versioning information for 
+//       ACTIVEX binaries.
+//
+// Copyright (c) 1996-2002, Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef _ACTIVEX_VER_
+#define _ACTIVEX_VER_
+
+// NOTE: all string resources that will be used in ACTIVEX.RCV for the
+// version resource information *MUST* have an explicit \0 terminator!
+
+#define VERSION_RES_MAJOR_VER   9
+#define VERSION_RES_MINOR_VER   0
+#define VERSION_RES_BUILD       0
+
+#define VERSION_RES_STRING_D    "9.00 (Debug)\0"
+#define VERSION_RES_STRING      "9.00\0"
+
+#define VERSION_RES_PRODUCT_NAME  "DirectX 9.0 Sample\0"
+#define VERSION_RES_COMMENT       "DirectShow Sample\0"
+#define VERSION_RES_COMPANY_NAME  "Microsoft Corporation\0"
+#define VERSION_RES_COPYRIGHT     "Copyright (C) 1992-2002 Microsoft Corporation\0"
+
+// The following defines are required on a file-by-file basis
+//
+// #define VERSION_RES_BIN_NAME         "sample.ax\0"
+// #define VERSION_RES_BIN_DESCRIPTION  "Sample Filter\0"
+//
+// Also required, if you don't want the defaults, are
+//
+// #define VERSION_RES_ACTIVEX  "Filter dll\0" (the default value)
+//
+// A string defining the type of component.
+//
+// #define VERSION_RES_TYPE     VFT_DLL (default)
+//                              VFT_APP
+//                              VFT_VXD
+//                              VFT_DRV
+//                              VFT_FONT
+//                              VFT_STATIC_LIB
+//                              VFT_UNKNOWN
+//
+// #define VERSION_RES_SUBTYPE  VFT2_UNKNOWN (default)
+//                              VFT2_DRV_INSTALLABLE
+//                              VFT2_DRV_SOUND
+//                              <several other options>
+//
+// See winver.h for further details
+
+#endif
+
diff --git a/plugins/GSdx_legacy/baseclasses/amextra.cpp b/plugins/GSdx_legacy/baseclasses/amextra.cpp
new file mode 100644
index 0000000000..7722d60a2d
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/amextra.cpp
@@ -0,0 +1,111 @@
+//------------------------------------------------------------------------------
+// File: AMExtra.cpp
+//
+// Desc: DirectShow base classes - implements CRenderedInputPin class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"        // DirectShow base class definitions
+#include <mmsystem.h>       // Needed for definition of timeGetTime
+#include <limits.h>         // Standard data type limit definitions
+#include "measure.h"        // Used for time critical log functions
+
+#include "amextra.h"
+
+#pragma warning(disable:4355)
+
+//  Implements CRenderedInputPin class
+
+CRenderedInputPin::CRenderedInputPin(TCHAR *pObjectName,
+                                     CBaseFilter *pFilter,
+                                     CCritSec *pLock,
+                                     HRESULT *phr,
+                                     LPCWSTR pName)
+    : CBaseInputPin(pObjectName, pFilter, pLock, phr, pName)  // every argument goes to the base pin
+    , m_bAtEndOfStream(FALSE)       // no end of stream received yet
+    , m_bCompleteNotified(FALSE)    // EC_COMPLETE not sent yet
+{
+}
+#ifdef UNICODE
+CRenderedInputPin::CRenderedInputPin(CHAR *pObjectName,
+                                     CBaseFilter *pFilter,
+                                     CCritSec *pLock,
+                                     HRESULT *phr,
+                                     LPCWSTR pName)
+    : CBaseInputPin(pObjectName, pFilter, pLock, phr, pName)  // every argument goes to the base pin
+    , m_bAtEndOfStream(FALSE)       // no end of stream received yet
+    , m_bCompleteNotified(FALSE)    // EC_COMPLETE not sent yet
+{
+}
+#endif
+
+// Flush end of stream condition - caller should do any
+// necessary stream level locking before calling this
+
+STDMETHODIMP CRenderedInputPin::EndOfStream()
+{
+    const HRESULT hr = CheckStreaming();
+    // Nothing to record unless CheckStreaming() returned S_OK and this is the first end of stream.
+    if (S_OK != hr || m_bAtEndOfStream) {
+        return hr;
+    }
+    m_bAtEndOfStream = TRUE;
+
+    //  EC_COMPLETE handling happens here only if the filter is running
+    FILTER_STATE fs;
+    EXECUTE_ASSERT(SUCCEEDED(m_pFilter->GetState(0, &fs)));
+    if (fs == State_Running) { DoCompleteHandling(); }
+    return hr;
+}
+
+
+// Called to complete the flush
+
+STDMETHODIMP CRenderedInputPin::EndFlush()
+{
+    CAutoLock lock(m_pLock);
+
+    // Forget any end of stream seen before the flush and any completion
+    // already reported, then let the base pin finish the flush.
+    m_bCompleteNotified = FALSE;
+    m_bAtEndOfStream = FALSE;
+    return CBaseInputPin::EndFlush();
+}
+
+
+// Notify of Run() from filter
+
+HRESULT CRenderedInputPin::Run(REFERENCE_TIME tStart)
+{
+    UNREFERENCED_PARAMETER(tStart);
+
+    // Allow a fresh notification, and send it now if end of stream was already reached.
+    m_bCompleteNotified = FALSE;
+    if (m_bAtEndOfStream) { DoCompleteHandling(); }
+    return S_OK;
+}
+
+
+//  Clear status on going into paused state
+
+HRESULT CRenderedInputPin::Active()
+{
+    m_bCompleteNotified = FALSE;    // no EC_COMPLETE sent yet
+    m_bAtEndOfStream = FALSE;       // no end of stream received yet
+    return CBaseInputPin::Active();
+}
+
+
+//  Do stuff to deliver end of stream
+
+void CRenderedInputPin::DoCompleteHandling()
+{
+    ASSERT(m_bAtEndOfStream);
+    // EC_COMPLETE is sent once; m_bCompleteNotified blocks repeats until it is cleared again.
+    if (m_bCompleteNotified) { return; }
+    m_bCompleteNotified = TRUE;
+    m_pFilter->NotifyEvent(EC_COMPLETE, S_OK, (LONG_PTR)(IBaseFilter *)m_pFilter);
+}
+
diff --git a/plugins/GSdx_legacy/baseclasses/amextra.h b/plugins/GSdx_legacy/baseclasses/amextra.h
new file mode 100644
index 0000000000..6642f5fa40
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/amextra.h
@@ -0,0 +1,56 @@
+//------------------------------------------------------------------------------
+// File: AMExtra.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __AMEXTRA__
+#define __AMEXTRA__
+
+// Simple rendered input pin
+//
+// NOTE if your filter queues stuff before rendering then it may not be
+// appropriate to use this class
+//
+// In that case queue the end of stream condition until the last sample
+// is actually rendered and flush the condition appropriately
+
+class CRenderedInputPin : public CBaseInputPin
+{
+public:
+    // Both constructors forward every argument to CBaseInputPin and clear the two state flags.
+    CRenderedInputPin(TCHAR *pObjectName,
+                      CBaseFilter *pFilter,
+                      CCritSec *pLock,
+                      HRESULT *phr,
+                      LPCWSTR pName);
+#ifdef UNICODE
+    CRenderedInputPin(CHAR *pObjectName,
+                      CBaseFilter *pFilter,
+                      CCritSec *pLock,
+                      HRESULT *phr,
+                      LPCWSTR pName);
+#endif
+
+    // Override methods to track end of stream state
+    STDMETHODIMP EndOfStream();     // records end of stream; notifies at once if the filter is running
+    STDMETHODIMP EndFlush();        // clears both flags under m_pLock, then calls the base class
+    // Active() clears both flags; Run() re-arms the notification and sends it if end of stream was already seen.
+    HRESULT Active();
+    HRESULT Run(REFERENCE_TIME tStart);
+
+protected:
+
+    // Member variables to track state
+    BOOL m_bAtEndOfStream;      // Set by EndOfStream
+    BOOL m_bCompleteNotified;   // Set when we notify for EC_COMPLETE
+
+private:
+    void DoCompleteHandling();  // sends EC_COMPLETE to the filter unless m_bCompleteNotified is already set
+};
+
+#endif // __AMEXTRA__
+
diff --git a/plugins/GSdx_legacy/baseclasses/amfilter.cpp b/plugins/GSdx_legacy/baseclasses/amfilter.cpp
new file mode 100644
index 0000000000..0241a022e7
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/amfilter.cpp
@@ -0,0 +1,5203 @@
+//------------------------------------------------------------------------------
+// File: AMFilter.cpp
+//
+// Desc: DirectShow base classes - implements class hierarchy for streams
+//       architecture.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+//=====================================================================
+//=====================================================================
+// The following classes are declared in this header:
+//
+//
+// CBaseMediaFilter            Basic IMediaFilter support (abstract class)
+// CBaseFilter                 Support for IBaseFilter (incl. IMediaFilter)
+// CEnumPins                   Enumerate input and output pins
+// CEnumMediaTypes             Enumerate the preferred pin formats
+// CBasePin                    Abstract base class for IPin interface
+//    CBaseOutputPin           Adds data provider member functions
+//    CBaseInputPin            Implements IMemInputPin interface
+// CMediaSample                Basic transport unit for IMemInputPin
+// CBaseAllocator              General list guff for most allocators
+//    CMemAllocator            Implements memory buffer allocation
+//
+//=====================================================================
+//=====================================================================
+
+#include "streams.h"
+
+
+
+//=====================================================================
+// Helpers
+//=====================================================================
+STDAPI CreateMemoryAllocator(IMemAllocator **ppAllocator)
+{
+    // Instantiate CLSID_MemoryAllocator in-process, unaggregated, asking for IMemAllocator.
+    void **ppv = (void **)ppAllocator;
+    const HRESULT hr = CoCreateInstance(CLSID_MemoryAllocator, 0, CLSCTX_INPROC_SERVER,
+                                        IID_IMemAllocator, ppv);
+    return hr;
+}
+
+//  Put this one here rather than in ctlutil.cpp to avoid linking
+//  anything brought in by ctlutil.cpp
+STDAPI CreatePosPassThru(
+    LPUNKNOWN pAgg,
+    BOOL bRenderer,
+    IPin *pPin,
+    IUnknown **ppPassThru
+)
+{
+    // On any failure *ppPassThru stays NULL and the object created here is released.
+    *ppPassThru = NULL;
+
+    IUnknown *pSeekUnk;
+    HRESULT hr = CoCreateInstance(CLSID_SeekingPassThru, pAgg, CLSCTX_INPROC_SERVER,
+                                  IID_IUnknown, (void **)&pSeekUnk);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // ISeekingPassThru is held only long enough to call Init with bRenderer and pPin.
+    ISeekingPassThru *pSeeking;
+    hr = pSeekUnk->QueryInterface(IID_ISeekingPassThru, (void**)&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->Init(bRenderer, pPin);
+        pSeeking->Release();
+    }
+
+    if (FAILED(hr)) {
+        pSeekUnk->Release();
+        return hr;
+    }
+
+    // The IUnknown reference obtained from CoCreateInstance is handed to the caller.
+    *ppPassThru = pSeekUnk;
+    return S_OK;
+}
+
+
+
+#define CONNECT_TRACE_LEVEL 3
+
+//=====================================================================
+//=====================================================================
+// Implements CBaseMediaFilter
+//=====================================================================
+//=====================================================================
+
+
+/* Constructor */
+
+CBaseMediaFilter::CBaseMediaFilter(const TCHAR  *pName,
+                                   LPUNKNOWN    pUnk,
+                                   CCritSec     *pLock,
+                                   REFCLSID     clsid)
+    : CUnknown(pName, pUnk)
+    , m_pLock(pLock)            // caller-supplied lock taken by the state and clock methods
+    , m_clsid(clsid)            // reported back by GetClassID
+    , m_State(State_Stopped)    // a new filter starts out stopped
+    , m_pClock(NULL)            // no sync source until SetSyncSource
+{
+}
+
+
+/* Destructor */
+
+CBaseMediaFilter::~CBaseMediaFilter()
+{
+    // The filter must already be stopped: Stop cannot be called from
+    // here because our critsec has been destroyed.
+
+    /* Drop the reference held on the sync clock, if one was set */
+
+    if (m_pClock != NULL) {
+        m_pClock->Release();
+        m_pClock = NULL;
+    }
+}
+
+
+/* Override this to say what interfaces we support and where */
+
+STDMETHODIMP
+CBaseMediaFilter::NonDelegatingQueryInterface(REFIID riid, void ** ppv)
+{
+    // IMediaFilter and IPersist are answered here;
+    // every other interface id is passed on to CUnknown.
+    if (riid == IID_IMediaFilter) {
+        return GetInterface((IMediaFilter *) this, ppv);
+    }
+    if (riid == IID_IPersist) {
+        return GetInterface((IPersist *) this, ppv);
+    }
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+/* Return the filter's clsid */
+STDMETHODIMP CBaseMediaFilter::GetClassID(CLSID *pClsID)
+{
+    // Check the out-pointer, then copy out the CLSID given at construction.
+    CheckPointer(pClsID,E_POINTER);
+    ValidateReadWritePtr(pClsID,sizeof(*pClsID));
+    *pClsID = m_clsid;
+    return NOERROR;
+}
+
+/* Override this if your state changes are not done synchronously */
+
+STDMETHODIMP CBaseMediaFilter::GetState(DWORD dwMSecs, FILTER_STATE *State)
+{
+    // The timeout is not used: the recorded state is reported immediately.
+    UNREFERENCED_PARAMETER(dwMSecs);
+
+    CheckPointer(State,E_POINTER);
+    ValidateReadWritePtr(State,sizeof(*State));
+    *State = m_State;
+    return S_OK;
+}
+
+
+/* Set the clock we will use for synchronisation */
+
+STDMETHODIMP
+CBaseMediaFilter::SetSyncSource(IReferenceClock *pClock)
+{
+    CAutoLock lock(m_pLock);
+
+    // Take the new reference before dropping the old one, so that being
+    // handed the clock we already hold cannot release it too early.
+    if (pClock != NULL) {
+        pClock->AddRef();
+    }
+
+    if (m_pClock != NULL) {
+        m_pClock->Release();
+    }
+
+    // pClock may be NULL, which leaves the filter without a clock.
+    // No check is made that the object really is a clock.
+    m_pClock = pClock;
+
+    return NOERROR;
+}
+
+/* Return the clock we are using for synchronisation */
+STDMETHODIMP
+CBaseMediaFilter::GetSyncSource(IReferenceClock **pClock)
+{
+    CheckPointer(pClock,E_POINTER);
+    ValidateReadWritePtr(pClock,sizeof(*pClock));
+    CAutoLock lock(m_pLock);
+
+    // Hand out the current clock (possibly NULL); a non-NULL clock is AddRef'd for the caller.
+    *pClock = (IReferenceClock*)m_pClock;
+    if (m_pClock != NULL) {
+        m_pClock->AddRef();
+    }
+    return NOERROR;
+}
+
+
+/* Put the filter into a stopped state */
+
+STDMETHODIMP CBaseMediaFilter::Stop()
+{
+    // Only the recorded state changes, while m_pLock is held.
+    CAutoLock lock(m_pLock);
+    m_State = State_Stopped;
+
+    return S_OK;
+}
+
+
+/* Put the filter into a paused state */
+
+STDMETHODIMP CBaseMediaFilter::Pause()
+{
+    // Only the recorded state changes, while m_pLock is held.
+    CAutoLock lock(m_pLock);
+    m_State = State_Paused;
+
+    return S_OK;
+}
+
+
+// Put the filter into a running state.
+
+// The time parameter is the offset to be added to the samples'
+// stream time to get the reference time at which they should be presented.
+//
+// you can either add these two and compare it against the reference clock,
+// or you can call CBaseMediaFilter::StreamTime and compare that against
+// the sample timestamp.
+
+STDMETHODIMP
+CBaseMediaFilter::Run(REFERENCE_TIME tStart)
+{
+    CAutoLock lock(m_pLock);
+
+    // Keep the offset that StreamTime later subtracts from the clock time.
+    m_tStart = tStart;
+
+    // A stopped filter goes through Pause() first; a failure there is
+    // returned as-is and the state is not set to running.
+    const HRESULT hrPause = (m_State == State_Stopped) ? Pause() : S_OK;
+    if (FAILED(hrPause)) {
+        return hrPause;
+    }
+
+    m_State = State_Running;
+    return S_OK;
+}
+
+
+//
+// return the current stream time - samples with start timestamps of this
+// time or before should be rendered by now
+HRESULT
+CBaseMediaFilter::StreamTime(CRefTime& rtStream)
+{
+    // No lock is taken here: synchronisation is the caller's job, so that
+    // worker threads can call this without deadlocking on the filter lock.
+
+    // Without a clock there is no reference time to convert.
+    if (NULL == m_pClock) {
+        return VFW_E_NO_CLOCK;
+    }
+
+    // Read the current reference time straight into the caller's variable.
+    const HRESULT hrTime = m_pClock->GetTime((REFERENCE_TIME*)&rtStream);
+    if (SUCCEEDED(hrTime)) {
+        // Stream time is reference time less the offset passed to Run().
+        rtStream -= m_tStart;
+        return S_OK;
+    }
+
+    return hrTime;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CBaseFilter
+//=====================================================================
+//=====================================================================
+
+
+/* Override this to say what interfaces we support and where */
+
+STDMETHODIMP CBaseFilter::NonDelegatingQueryInterface(REFIID riid,
+                                                      void **ppv)
+{
+    // Interfaces the filter object answers for directly.
+    if (riid == IID_IBaseFilter) {
+        return GetInterface((IBaseFilter *) this, ppv);
+    }
+    if (riid == IID_IMediaFilter) {
+        return GetInterface((IMediaFilter *) this, ppv);
+    }
+    if (riid == IID_IPersist) {
+        return GetInterface((IPersist *) this, ppv);
+    }
+    if (riid == IID_IAMovieSetup) {
+        return GetInterface((IAMovieSetup *) this, ppv);
+    }
+
+    // Everything else is left to the CUnknown base implementation.
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+#ifdef DEBUG
+// Debug-only override of Release: checks that the filter is no longer
+// attached to a filter graph when its final reference is being dropped,
+// then defers to CUnknown for the actual reference counting.
+STDMETHODIMP_(ULONG) CBaseFilter::NonDelegatingRelease()
+{
+    // m_cRef == 1 means this call is about to release the last reference
+    if (m_cRef == 1) {
+        KASSERT(m_pGraph == NULL);
+    }
+    return CUnknown::NonDelegatingRelease();
+}
+#endif
+
+
+/* Constructor */
+
+// Builds a filter that starts out stopped, with no clock, graph, event
+// sink or name. pName and pUnk are forwarded to CUnknown, pLock becomes the
+// lock taken by the filter's methods, and clsid is the value handed back by
+// GetClassID().
+CBaseFilter::CBaseFilter(const TCHAR    *pName,
+             LPUNKNOWN  pUnk,
+             CCritSec   *pLock,
+             REFCLSID   clsid) :
+    CUnknown( pName, pUnk ),
+    m_pLock(pLock),
+    m_clsid(clsid),
+    m_State(State_Stopped),
+    m_pClock(NULL),
+    m_pGraph(NULL),
+    m_pSink(NULL),
+    m_pName(NULL),
+    m_PinVersion(1)
+{
+
+    // A lock is mandatory; this is only verified by assertion.
+    ASSERT(pLock != NULL);
+}
+
+/* Passes in a redundant HRESULT argument */
+
+// Same initialisation as the four-argument constructor: stopped state, no
+// clock, graph, sink or name. The extra phr argument is accepted but never
+// read or written.
+CBaseFilter::CBaseFilter(TCHAR     *pName,
+                         LPUNKNOWN  pUnk,
+                         CCritSec  *pLock,
+                         REFCLSID   clsid,
+                         HRESULT   *phr) :
+    CUnknown( pName, pUnk ),
+    m_pLock(pLock),
+    m_clsid(clsid),
+    m_State(State_Stopped),
+    m_pClock(NULL),
+    m_pGraph(NULL),
+    m_pSink(NULL),
+    m_pName(NULL),
+    m_PinVersion(1)
+{
+
+    // A lock is mandatory; this is only verified by assertion.
+    ASSERT(pLock != NULL);
+    // phr is deliberately unused - this just silences the compiler warning.
+    UNREFERENCED_PARAMETER(phr);
+}
+
+#ifdef UNICODE
+// UNICODE builds additionally provide constructors taking a narrow (CHAR)
+// object name. Both initialise the filter exactly like the TCHAR versions:
+// stopped state, no clock, graph, sink or name.
+CBaseFilter::CBaseFilter(const CHAR *pName,
+             LPUNKNOWN  pUnk,
+             CCritSec   *pLock,
+             REFCLSID   clsid) :
+    CUnknown( pName, pUnk ),
+    m_pLock(pLock),
+    m_clsid(clsid),
+    m_State(State_Stopped),
+    m_pClock(NULL),
+    m_pGraph(NULL),
+    m_pSink(NULL),
+    m_pName(NULL),
+    m_PinVersion(1)
+{
+
+    // A lock is mandatory; this is only verified by assertion.
+    ASSERT(pLock != NULL);
+}
+// Narrow-name variant that also accepts the unused phr argument.
+CBaseFilter::CBaseFilter(CHAR     *pName,
+                         LPUNKNOWN  pUnk,
+                         CCritSec  *pLock,
+                         REFCLSID   clsid,
+                         HRESULT   *phr) :
+    CUnknown( pName, pUnk ),
+    m_pLock(pLock),
+    m_clsid(clsid),
+    m_State(State_Stopped),
+    m_pClock(NULL),
+    m_pGraph(NULL),
+    m_pSink(NULL),
+    m_pName(NULL),
+    m_PinVersion(1)
+{
+
+    // A lock is mandatory; this is only verified by assertion.
+    ASSERT(pLock != NULL);
+    // phr is deliberately unused - this just silences the compiler warning.
+    UNREFERENCED_PARAMETER(phr);
+}
+#endif
+
+/* Destructor */
+
+CBaseFilter::~CBaseFilter()
+{
+    // m_pGraph and m_pSink are not reference counted by this class (holding
+    // them caused a circular reference), so there is nothing to release
+    // for either of them here.
+
+    // Free the copy of the filter name made in JoinFilterGraph.
+    delete[] m_pName;
+
+    // The filter is expected to be stopped already; Stop cannot be called
+    // from here because the critical section has been destroyed.
+
+    // Let go of the reference clock, if one was ever set.
+    if (m_pClock != NULL) {
+        m_pClock->Release();
+        m_pClock = NULL;
+    }
+}
+
+/* Return the filter's clsid */
+STDMETHODIMP
+CBaseFilter::GetClassID(CLSID *pClsID)
+{
+    // Reject a NULL output pointer before writing through it.
+    CheckPointer(pClsID,E_POINTER);
+    ValidateReadWritePtr(pClsID,sizeof(CLSID));
+
+    // Copy out the class id supplied at construction time.
+    *pClsID = m_clsid;
+    return NOERROR;
+}
+
+/* Override this if your state changes are not done synchronously */
+STDMETHODIMP
+CBaseFilter::GetState(DWORD dwMSecs, FILTER_STATE *State)
+{
+    // The timeout is not used: this version reports the stored state
+    // immediately rather than waiting for a transition to finish.
+    UNREFERENCED_PARAMETER(dwMSecs);
+
+    CheckPointer(State,E_POINTER);
+    ValidateReadWritePtr(State,sizeof(FILTER_STATE));
+
+    *State = m_State;
+    return S_OK;
+}
+
+
+/* Set the clock we will use for synchronisation */
+
+STDMETHODIMP
+CBaseFilter::SetSyncSource(IReferenceClock *pClock)
+{
+    CAutoLock lck(m_pLock);
+
+    // Reference the incoming clock before letting go of the current one,
+    // so passing in the clock we already hold cannot drop its last reference.
+    if (pClock != NULL) {
+        pClock->AddRef();
+    }
+
+    // Give up the reference held on the clock being replaced, if any.
+    if (m_pClock != NULL) {
+        m_pClock->Release();
+    }
+
+    // Remember the new clock; NULL means "no clock". The pointer is not
+    // queried to confirm it really is a clock.
+    m_pClock = pClock;
+
+    return NOERROR;
+}
+
+/* Return the clock we are using for synchronisation */
+STDMETHODIMP
+CBaseFilter::GetSyncSource(IReferenceClock **pClock)
+{
+    CheckPointer(pClock,E_POINTER);
+    ValidateReadWritePtr(pClock,sizeof(IReferenceClock *));
+    CAutoLock lck(m_pLock);
+
+    // Hand back whichever clock is currently set (possibly NULL).
+    *pClock = (IReferenceClock*)m_pClock;
+    if (*pClock != NULL) {
+        // An interface pointer is being returned, so the caller gets
+        // its own reference on it.
+        (*pClock)->AddRef();
+    }
+    return NOERROR;
+}
+
+
+
+// override CBaseMediaFilter Stop method, to deactivate any pins this
+// filter has.
+STDMETHODIMP
+CBaseFilter::Stop()
+{
+    CAutoLock cObjectLock(m_pLock);
+    HRESULT hr = NOERROR;
+
+    // Tell every connected pin to go inactive, unless we are already stopped.
+    if (m_State != State_Stopped) {
+        int cPins = GetPinCount();
+        for (int c = 0; c < cPins; c++) {
+
+            CBasePin *pPin = GetPin(c);
+
+            // GetPin can hand back NULL (for instance if the pin list
+            // changed underneath us); stop iterating rather than
+            // dereferencing a NULL pointer.
+            if (NULL == pPin) {
+                break;
+            }
+
+            // Disconnected pins are not activated - this saves pins worrying
+            // about this state themselves. A failure does not abort the
+            // loop, so every pin is inactivated regardless; the first
+            // failure code is remembered and returned. The base input pin
+            // class can return an error if it has no allocator but Stop can
+            // be used to resync the graph state after something has gone bad
+
+            if (pPin->IsConnected()) {
+                HRESULT hrTmp = pPin->Inactive();
+                if (FAILED(hrTmp) && SUCCEEDED(hr)) {
+                    hr = hrTmp;
+                }
+            }
+        }
+    }
+
+
+    // The filter is marked stopped even if a pin reported an error.
+    m_State = State_Stopped;
+    return hr;
+}
+
+
+// override CBaseMediaFilter Pause method to activate any pins
+// this filter has (also called from Run)
+
+STDMETHODIMP
+CBaseFilter::Pause()
+{
+    CAutoLock cObjectLock(m_pLock);
+
+    // Pins only need activating on the stopped -> paused transition.
+    if (m_State == State_Stopped) {
+        int cPins = GetPinCount();
+        for (int c = 0; c < cPins; c++) {
+
+            CBasePin *pPin = GetPin(c);
+
+            // GetPin can hand back NULL (for instance if the pin list
+            // changed underneath us); stop iterating rather than
+            // dereferencing a NULL pointer.
+            if (NULL == pPin) {
+                break;
+            }
+
+            // Disconnected pins are not activated - this saves pins
+            // worrying about this state themselves
+
+            if (pPin->IsConnected()) {
+                HRESULT hr = pPin->Active();
+                if (FAILED(hr)) {
+                    // Give up at the first failure; m_State is left as it was.
+                    return hr;
+                }
+            }
+        }
+    }
+
+
+
+    m_State = State_Paused;
+    return S_OK;
+}
+
+// Put the filter into a running state.
+
+// The time parameter is the offset to be added to the samples'
+// stream time to get the reference time at which they should be presented.
+//
+// you can either add these two and compare it against the reference clock,
+// or you can call CBaseFilter::StreamTime and compare that against
+// the sample timestamp.
+
+STDMETHODIMP
+CBaseFilter::Run(REFERENCE_TIME tStart)
+{
+    CAutoLock cObjectLock(m_pLock);
+
+    // remember the stream time offset
+    m_tStart = tStart;
+
+    // A stopped filter is paused first so that its pins get activated.
+    if (m_State == State_Stopped){
+        HRESULT hr = Pause();
+
+        if (FAILED(hr)) {
+            return hr;
+        }
+    }
+
+    // Tell every connected pin that we are now running.
+    if (m_State != State_Running) {
+        int cPins = GetPinCount();
+        for (int c = 0; c < cPins; c++) {
+
+            CBasePin *pPin = GetPin(c);
+
+            // GetPin can hand back NULL (for instance if the pin list
+            // changed underneath us); stop iterating rather than
+            // dereferencing a NULL pointer.
+            if (NULL == pPin) {
+                break;
+            }
+
+            // Disconnected pins are not activated - this saves pins
+            // worrying about this state themselves
+
+            if (pPin->IsConnected()) {
+                HRESULT hr = pPin->Run(tStart);
+                if (FAILED(hr)) {
+                    // Give up at the first failure; m_State is not advanced.
+                    return hr;
+                }
+            }
+        }
+    }
+
+
+    m_State = State_Running;
+    return S_OK;
+}
+
+//
+// return the current stream time - samples with start timestamps of this
+// time or before should be rendered by now
+HRESULT
+CBaseFilter::StreamTime(CRefTime& rtStream)
+{
+    // No lock is taken here: synchronisation is the caller's job, so that
+    // worker threads can call this without deadlocking on the filter lock.
+
+    // Without a clock there is no reference time to convert.
+    if (NULL == m_pClock) {
+        return VFW_E_NO_CLOCK;
+    }
+
+    // Read the current reference time straight into the caller's variable.
+    const HRESULT hrTime = m_pClock->GetTime((REFERENCE_TIME*)&rtStream);
+    if (SUCCEEDED(hrTime)) {
+        // Stream time is reference time less the offset passed to Run().
+        rtStream -= m_tStart;
+        return S_OK;
+    }
+
+    return hrTime;
+}
+
+
+/* Create an enumerator for the pins attached to this filter */
+
+STDMETHODIMP
+CBaseFilter::EnumPins(IEnumPins **ppEnum)
+{
+    CheckPointer(ppEnum,E_POINTER);
+    ValidateReadWritePtr(ppEnum,sizeof(IEnumPins *));
+
+    // A brand new enumerator (no source to clone from); it is constructed
+    // with a reference count of one, which the caller takes over.
+    CEnumPins *pNewEnum = new CEnumPins(this, NULL);
+    *ppEnum = pNewEnum;
+
+    if (pNewEnum == NULL) {
+        return E_OUTOFMEMORY;
+    }
+    return NOERROR;
+}
+
+
+// default behaviour of FindPin is to assume pins are named
+// by their pin names
+STDMETHODIMP
+CBaseFilter::FindPin(
+    LPCWSTR Id,
+    IPin ** ppPin
+)
+{
+    CheckPointer(ppPin,E_POINTER);
+    ValidateReadWritePtr(ppPin,sizeof(IPin *));
+
+    //  We're going to search the pin list so maintain integrity
+    CAutoLock lck(m_pLock);
+    int iCount = GetPinCount();
+    for (int i = 0; i < iCount; i++) {
+        CBasePin *pPin = GetPin(i);
+
+        //  An ASSERT alone would vanish in release builds and leave a NULL
+        //  dereference below, so treat a missing pin as the end of the
+        //  list and report "not found".
+        if (NULL == pPin) {
+            break;
+        }
+
+        //  Pins are identified by their pin name in this default version
+        if (0 == lstrcmpW(pPin->Name(), Id)) {
+            //  Found one that matches
+            //
+            //  AddRef() and return it
+            *ppPin = pPin;
+            pPin->AddRef();
+            return S_OK;
+        }
+    }
+    *ppPin = NULL;
+    return VFW_E_NOT_FOUND;
+}
+
+/* Return information about this filter */
+
+STDMETHODIMP
+CBaseFilter::QueryFilterInfo(FILTER_INFO * pInfo)
+{
+    CheckPointer(pInfo,E_POINTER);
+    ValidateReadWritePtr(pInfo,sizeof(FILTER_INFO));
+
+    // Report the name given in JoinFilterGraph, truncated to fit the
+    // fixed-size buffer, or an empty string when there is none.
+    if (m_pName == NULL) {
+        pInfo->achName[0] = L'\0';
+    } else {
+        lstrcpynW(pInfo->achName, m_pName, sizeof(pInfo->achName)/sizeof(WCHAR));
+    }
+
+    // The graph pointer is returned with a reference for the caller.
+    pInfo->pGraph = m_pGraph;
+    if (m_pGraph != NULL) {
+        m_pGraph->AddRef();
+    }
+    return NOERROR;
+}
+
+
+/* Provide the filter with a filter graph */
+
+STDMETHODIMP
+CBaseFilter::JoinFilterGraph(
+    IFilterGraph * pGraph,
+    LPCWSTR pName)
+{
+    CAutoLock cObjectLock(m_pLock);
+
+    // NOTE: we no longer hold references on the graph (m_pGraph, m_pSink)
+
+    m_pGraph = pGraph;
+    if (m_pGraph) {
+        // Cache the graph's event sink for NotifyEvent. The reference
+        // QueryInterface gave us is dropped straight away: we do NOT keep
+        // a reference on it.
+        HRESULT hr = m_pGraph->QueryInterface(IID_IMediaEventSink,
+                        (void**) &m_pSink);
+        if (FAILED(hr)) {
+            ASSERT(m_pSink == NULL);
+        }
+        else m_pSink->Release();
+    } else {
+        // if graph pointer is null, then we should
+        // also release the IMediaEventSink on the same object - we don't
+        // refcount it, so just set it to null
+        m_pSink = NULL;
+    }
+
+
+    // Discard any name left over from a previous graph (delete[] on NULL
+    // is a no-op, so no guard is needed).
+    delete[] m_pName;
+    m_pName = NULL;
+
+    if (pName) {
+        // Take a private copy of the name, terminator included.
+        DWORD nameLen = lstrlenW(pName)+1;
+        m_pName = new WCHAR[nameLen];
+        if (m_pName) {
+            CopyMemory(m_pName, pName, nameLen*sizeof(WCHAR));
+        } else {
+            // Report the failure instead of asserting and claiming success.
+            // m_pGraph and m_pSink have already been updated at this point.
+            return E_OUTOFMEMORY;
+        }
+    }
+
+
+    return NOERROR;
+}
+
+
+// return a Vendor information string. Optional - may return E_NOTIMPL.
+// memory returned should be freed using CoTaskMemFree
+// default implementation returns E_NOTIMPL
+STDMETHODIMP
+CBaseFilter::QueryVendorInfo(
+    LPWSTR* pVendorInfo)
+{
+    // This default version supplies no vendor string, so the output
+    // pointer is never touched.
+    UNREFERENCED_PARAMETER(pVendorInfo);
+
+    return E_NOTIMPL;
+}
+
+
+// send an event notification to the filter graph if we know about it.
+// returns S_OK if delivered, S_FALSE if the filter graph does not sink
+// events, or an error otherwise.
+HRESULT
+CBaseFilter::NotifyEvent(
+    long EventCode,
+    LONG_PTR EventParam1,
+    LONG_PTR EventParam2)
+{
+    // Work from a local copy of the sink pointer so no lock is needed.
+    IMediaEventSink *pEventSink = m_pSink;
+
+    // No sink was cached, so there is nowhere to deliver the event.
+    if (pEventSink == NULL) {
+        return E_NOTIMPL;
+    }
+
+    // For EC_COMPLETE the second parameter is replaced by this filter.
+    if (EC_COMPLETE == EventCode) {
+        EventParam2 = (LONG_PTR)(IBaseFilter*)this;
+    }
+
+    return pEventSink->Notify(EventCode, EventParam1, EventParam2);
+}
+
+// Request reconnect
+// pPin is the pin to reconnect
+// pmt is the type to reconnect with - can be NULL
+// Calls ReconnectEx on the filter graph
+HRESULT
+CBaseFilter::ReconnectPin(
+    IPin *pPin,
+    AM_MEDIA_TYPE const *pmt
+)
+{
+    // Only a filter that is in a graph can ask for a reconnection.
+    if (m_pGraph == NULL) {
+        return E_NOINTERFACE;
+    }
+
+    IFilterGraph2 *pGraph2;
+    HRESULT hr = m_pGraph->QueryInterface(IID_IFilterGraph2, (void **)&pGraph2);
+    if (FAILED(hr)) {
+        // Without IFilterGraph2, fall back to the plain reconnect,
+        // which cannot take a media type.
+        return m_pGraph->Reconnect(pPin);
+    }
+
+    // Preferred path: reconnect with the requested type (may be NULL).
+    hr = pGraph2->ReconnectEx(pPin, pmt);
+    pGraph2->Release();
+    return hr;
+}
+
+
+
+/* This is the same idea as the media type version does for type enumeration
+   on pins but for the list of pins available. So if the list of pins you
+   provide changes dynamically then either override this virtual function
+   to provide the version number, or more simply call IncrementPinVersion */
+
+LONG CBaseFilter::GetPinVersion()
+{
+    // Current value of the pin version cookie; CEnumPins snapshots this
+    // when an enumerator is created, reset or refreshed.
+    const LONG lVersion = m_PinVersion;
+    return lVersion;
+}
+
+
+/* Increment the current pin version cookie */
+
+void CBaseFilter::IncrementPinVersion()
+{
+    // Bump the cookie atomically; the new value itself is not needed.
+    (void) InterlockedIncrement(&m_PinVersion);
+}
+
+/* register filter */
+
+// Registers the filter described by GetSetupData() through IFilterMapper.
+// Returns S_FALSE when the filter supplies no setup data, NOERROR otherwise.
+STDMETHODIMP CBaseFilter::Register()
+{
+    // get setup data, if it exists
+    //
+    LPAMOVIESETUP_FILTER psetupdata = GetSetupData();
+
+    // check we've got data
+    //
+    if( NULL == psetupdata ) return S_FALSE;
+
+    // init is ref counted so call just in case
+    // we're being called cold. The result is kept separately so that
+    // CoUninitialize is only called to balance a successful CoInitialize.
+    //
+    HRESULT hrInit = CoInitialize( (LPVOID)NULL );
+    ASSERT( SUCCEEDED(hrInit) );
+
+    // get hold of IFilterMapper
+    //
+    IFilterMapper *pIFM;
+    HRESULT hr = CoCreateInstance( CLSID_FilterMapper
+                             , NULL
+                             , CLSCTX_INPROC_SERVER
+                             , IID_IFilterMapper
+                             , (void **)&pIFM       );
+    if( SUCCEEDED(hr) )
+    {
+        hr = AMovieSetupRegisterFilter( psetupdata, pIFM, TRUE );
+        pIFM->Release();
+    }
+
+    // and clear up
+    //
+    CoFreeUnusedLibraries();
+    if( SUCCEEDED(hrInit) )
+    {
+        CoUninitialize();
+    }
+
+    // NOTE(review): the registration result in hr is not propagated - this
+    // keeps the existing contract of always reporting NOERROR here, unlike
+    // Unregister(). Confirm whether callers rely on that before changing it.
+    return NOERROR;
+}
+
+
+/* unregister filter */
+
+// Removes the registration of the filter described by GetSetupData().
+// Returns S_FALSE when the filter supplies no setup data, NOERROR when the
+// filter was unregistered or was not registered to begin with, and the
+// failing HRESULT otherwise.
+STDMETHODIMP CBaseFilter::Unregister()
+{
+    // get setup data, if it exists
+    //
+    LPAMOVIESETUP_FILTER psetupdata = GetSetupData();
+
+    // check we've got data
+    //
+    if( NULL == psetupdata ) return S_FALSE;
+
+    // OLE init is ref counted so call
+    // just in case we're being called cold. The result is kept separately
+    // so that CoUninitialize is only called to balance a successful
+    // CoInitialize.
+    //
+    HRESULT hrInit = CoInitialize( (LPVOID)NULL );
+    ASSERT( SUCCEEDED(hrInit) );
+
+    // get hold of IFilterMapper
+    //
+    IFilterMapper *pIFM;
+    HRESULT hr = CoCreateInstance( CLSID_FilterMapper
+                             , NULL
+                             , CLSCTX_INPROC_SERVER
+                             , IID_IFilterMapper
+                             , (void **)&pIFM       );
+    if( SUCCEEDED(hr) )
+    {
+        hr = AMovieSetupRegisterFilter( psetupdata, pIFM, FALSE );
+
+        // release interface
+        //
+        pIFM->Release();
+    }
+
+    // clear up
+    //
+    CoFreeUnusedLibraries();
+    if( SUCCEEDED(hrInit) )
+    {
+        CoUninitialize();
+    }
+
+    // handle one acceptable "error" - that
+    // of filter not being registered!
+    // (this is the HRESULT form of ERROR_FILE_NOT_FOUND, 0x80070002)
+    //
+    if( HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND) == hr )
+      return NOERROR;
+    else
+      return hr;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CEnumPins
+//=====================================================================
+//=====================================================================
+
+
+// Creates a pin enumerator for pFilter. With pEnumPins == NULL a fresh
+// enumerator is made, positioned at the start and stamped with the filter's
+// current pin version and pin count. Otherwise the position, count, version
+// and cache of already-returned pins are copied from pEnumPins (used by
+// Clone). The object starts with a reference count of one and holds a
+// reference on the filter until it is destroyed.
+CEnumPins::CEnumPins(CBaseFilter *pFilter,
+             CEnumPins *pEnumPins) :
+    m_Position(0),
+    m_PinCount(0),
+    m_pFilter(pFilter),
+    m_cRef(1),               // Already ref counted
+    m_PinCache(NAME("Pin Cache"))
+{
+
+#ifdef DEBUG
+    m_dwCookie = DbgRegisterObjectCreation("CEnumPins", 0);
+#endif
+
+    /* We must be owned by a filter derived from CBaseFilter */
+
+    ASSERT(pFilter != NULL);
+
+    /* Hold a reference count on our filter */
+    m_pFilter->AddRef();
+
+    /* Are we creating a new enumerator */
+
+    if (pEnumPins == NULL) {
+        m_Version = m_pFilter->GetPinVersion();
+        m_PinCount = m_pFilter->GetPinCount();
+    } else {
+        m_Position = pEnumPins->m_Position;
+        m_PinCount = pEnumPins->m_PinCount;
+        m_Version = pEnumPins->m_Version;
+        m_PinCache.AddTail(&(pEnumPins->m_PinCache));
+
+        // Check the state we copied; asserting before the copy would only
+        // compare the zero-initialised members with each other.
+        ASSERT(m_Position <= m_PinCount);
+    }
+}
+
+
+/* Destructor releases the reference count on our filter NOTE since we hold
+   a reference count on the filter who created us we know it is safe to
+   release it, no access can be made to it afterwards though as we have just
+   caused the last reference count to go and the object to be deleted */
+
+CEnumPins::~CEnumPins()
+{
+    // Drop the reference taken on the filter in the constructor. The filter
+    // must not be touched after this call.
+    m_pFilter->Release();
+
+#ifdef DEBUG
+    // Pairs with DbgRegisterObjectCreation in the constructor.
+    DbgRegisterObjectDestruction(m_dwCookie);
+#endif
+}
+
+
+/* Override this to say what interfaces we support where */
+
+STDMETHODIMP
+CEnumPins::QueryInterface(REFIID riid,void **ppv)
+{
+    CheckPointer(ppv, E_POINTER);
+
+    // Only IEnumPins and IUnknown are available from an enumerator.
+    const bool fSupported = (riid == IID_IEnumPins || riid == IID_IUnknown);
+    if (!fSupported) {
+        *ppv = NULL;
+        return E_NOINTERFACE;
+    }
+
+    return GetInterface((IEnumPins *) this, ppv);
+}
+
+STDMETHODIMP_(ULONG)
+CEnumPins::AddRef()
+{
+    // Atomic increment; the updated count is what gets reported.
+    const LONG cNewRef = InterlockedIncrement(&m_cRef);
+    return cNewRef;
+}
+
+STDMETHODIMP_(ULONG)
+CEnumPins::Release()
+{
+    // Atomic decrement. The result is kept in a local because no member
+    // may be touched once the object has deleted itself.
+    const ULONG cRemaining = InterlockedDecrement(&m_cRef);
+    if (0 == cRemaining) {
+        delete this;
+    }
+    return cRemaining;
+}
+
+/* One of an enumerator's basic member functions allows us to create a cloned
+   interface that initially has the same state. Since we are taking a snapshot
+   of an object (current position and all) we must lock access at the start */
+
+STDMETHODIMP
+CEnumPins::Clone(IEnumPins **ppEnum)
+{
+    CheckPointer(ppEnum,E_POINTER);
+    ValidateReadWritePtr(ppEnum,sizeof(IEnumPins *));
+
+    // A stale enumerator cannot be cloned.
+    if (AreWeOutOfSync() == TRUE) {
+        *ppEnum = NULL;
+        return VFW_E_ENUM_OUT_OF_SYNC;
+    }
+
+    // The copy constructor path duplicates our position, count, version
+    // and cache of already-returned pins.
+    *ppEnum = new CEnumPins(m_pFilter,
+                            this);
+
+    return (*ppEnum == NULL) ? E_OUTOFMEMORY : NOERROR;
+}
+
+
+/* Return the next pin after the current position */
+
+// Fetches up to cPins pins that this enumerator has not returned before.
+// Each returned pin is AddRef'ed. Returns NOERROR when exactly cPins were
+// delivered, S_FALSE when fewer were available, E_INVALIDARG when
+// pcFetched is NULL but more than one pin was requested, and
+// VFW_E_ENUM_OUT_OF_SYNC when the filter fails to supply a pin.
+STDMETHODIMP
+CEnumPins::Next(ULONG cPins,        // place this many pins...
+        IPin **ppPins,      // ...in this array
+        ULONG *pcFetched)   // actual count passed returned here
+{
+    CheckPointer(ppPins,E_POINTER);
+    ValidateReadWritePtr(ppPins,cPins * sizeof(IPin *));
+
+    ASSERT(ppPins);
+
+    if (pcFetched!=NULL) {
+        ValidateWritePtr(pcFetched, sizeof(ULONG));
+        *pcFetched = 0;           // default unless we succeed
+    }
+    // now check that the parameter is valid
+    else if (cPins>1) {   // pcFetched == NULL
+        return E_INVALIDARG;
+    }
+    ULONG cFetched = 0;           // increment as we get each one.
+    IPin **ppFirst = ppPins;      // start of the caller's array, for unwinding
+
+    /* Check we are still in sync with the filter */
+    if (AreWeOutOfSync() == TRUE) {
+        // If we are out of sync, we should refresh the enumerator.
+        // This will reset the position and update the other members, but
+        // will not clear cache of pins we have already returned.
+        Refresh();
+    }
+
+    /* Calculate the number of available pins */
+
+    int cRealPins = min(m_PinCount - m_Position, (int) cPins);
+    if (cRealPins == 0) {
+        return S_FALSE;
+    }
+
+    /* Return each pin interface NOTE GetPin returns CBasePin * not addrefed
+       so we AddRef each pin before handing it out.
+       If while we are retrieving a pin from the filter an error occurs we
+       assume that our internal state is stale with respect to the filter
+       (for example someone has deleted a pin) so we
+       return VFW_E_ENUM_OUT_OF_SYNC                            */
+
+    while (cRealPins && (m_PinCount - m_Position)) {
+
+        /* Get the next pin object from the filter */
+
+        CBasePin *pPin = m_pFilter->GetPin(m_Position++);
+        if (pPin == NULL) {
+            // The caller is told that nothing was fetched, so give back
+            // the references on any pins already placed in the array,
+            // otherwise they would leak. Those pins stay in the cache
+            // until the caller calls Reset().
+            while (cFetched > 0) {
+                cFetched--;
+                ppFirst[cFetched]->Release();
+                ppFirst[cFetched] = NULL;
+            }
+            return VFW_E_ENUM_OUT_OF_SYNC;
+        }
+
+        /* We only want to return this pin, if it is not in our cache */
+        if (0 == m_PinCache.Find(pPin))
+        {
+            /* Hand the pin out with a reference for the caller */
+
+            *ppPins = pPin;
+            pPin->AddRef();
+
+            cFetched++;
+            ppPins++;
+
+            // Remember it so a later refresh does not return it again.
+            m_PinCache.AddTail(pPin);
+
+            cRealPins--;
+
+        }
+    }
+
+    if (pcFetched!=NULL) {
+        *pcFetched = cFetched;
+    }
+
+    return (cPins==cFetched ? NOERROR : S_FALSE);
+}
+
+
+/* Skip over one or more entries in the enumerator */
+
+STDMETHODIMP
+CEnumPins::Skip(ULONG cPins)
+{
+    // A stale enumerator cannot be advanced.
+    if (AreWeOutOfSync() == TRUE) {
+        return VFW_E_ENUM_OUT_OF_SYNC;
+    }
+
+    // Pins between the current position and the end of the list. When
+    // asked to skip more than that the position is left where it is
+    // (rather than being moved to the end) and S_FALSE is returned.
+    const ULONG cRemaining = m_PinCount - m_Position;
+    if (cPins <= cRemaining) {
+        m_Position += cPins;
+        return NOERROR;
+    }
+
+    return S_FALSE;
+}
+
+
+/* Set the current position back to the start */
+/* Reset has 4 simple steps:
+ *
+ * Set position to head of list
+ * Sync enumerator with object being enumerated
+ * Clear the cache of pins already returned
+ * return S_OK
+ */
+
+STDMETHODIMP
+CEnumPins::Reset()
+{
+    // Back to the head of the list.
+    m_Position = 0;
+
+    // Re-synchronise with the filter's current pin version and pin count.
+    m_Version = m_pFilter->GetPinVersion();
+    m_PinCount = m_pFilter->GetPinCount();
+
+    // Forget which pins were already handed out, so all of them can be
+    // returned again.
+    m_PinCache.RemoveAll();
+
+    return S_OK;
+}
+
+
+/* Set the current position back to the start */
+/* Refresh has 3 simple steps:
+ *
+ * Set position to head of list
+ * Sync enumerator with object being enumerated
+ * return S_OK
+ */
+
+STDMETHODIMP
+CEnumPins::Refresh()
+{
+    // Back to the head of the list.
+    m_Position = 0;
+
+    // Re-synchronise with the filter's current pin version and pin count.
+    // Unlike Reset(), the cache of pins already returned is left alone.
+    m_Version = m_pFilter->GetPinVersion();
+    m_PinCount = m_pFilter->GetPinCount();
+
+    return S_OK;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CEnumMediaTypes
+//=====================================================================
+//=====================================================================
+
+
+CEnumMediaTypes::CEnumMediaTypes(CBasePin *pPin,
+                 CEnumMediaTypes *pEnumMediaTypes) :
+    m_Position(0),
+    m_pPin(pPin),
+    m_cRef(1)
+{
+
+#ifdef DEBUG
+    m_dwCookie = DbgRegisterObjectCreation("CEnumMediaTypes", 0);
+#endif
+
+    // The enumerator always belongs to a pin derived from CBasePin.
+
+    ASSERT(pPin != NULL);
+
+    // Keep the pin alive for as long as this enumerator exists.
+    m_pPin->AddRef();
+
+    if (pEnumMediaTypes != NULL) {
+        // Cloning: start from the source enumerator's position and version.
+        m_Position = pEnumMediaTypes->m_Position;
+        m_Version = pEnumMediaTypes->m_Version;
+    } else {
+        // Fresh enumerator: stamp it with the pin's current type version.
+        m_Version = m_pPin->GetMediaTypeVersion();
+    }
+}
+
+
+/* Destructor releases the reference count on our base pin. NOTE since we hold
+   a reference count on the pin who created us we know it is safe to release
+   it, no access can be made to it afterwards though as we might have just
+   caused the last reference count to go and the object to be deleted */
+
+CEnumMediaTypes::~CEnumMediaTypes()
+{
+#ifdef DEBUG
+    // Pairs with DbgRegisterObjectCreation in the constructor.
+    DbgRegisterObjectDestruction(m_dwCookie);
+#endif
+    // Drop the reference taken on the pin in the constructor. The pin must
+    // not be touched after this call.
+    m_pPin->Release();
+}
+
+
+/* Override this to say what interfaces we support where */
+
+STDMETHODIMP
+CEnumMediaTypes::QueryInterface(REFIID riid,void **ppv)
+{
+    CheckPointer(ppv, E_POINTER);
+
+    // Only IEnumMediaTypes and IUnknown are available from an enumerator.
+    const bool fSupported = (riid == IID_IEnumMediaTypes || riid == IID_IUnknown);
+    if (!fSupported) {
+        *ppv = NULL;
+        return E_NOINTERFACE;
+    }
+
+    return GetInterface((IEnumMediaTypes *) this, ppv);
+}
+
+STDMETHODIMP_(ULONG)
+CEnumMediaTypes::AddRef()
+{
+    // Atomic increment; the updated count is what gets reported.
+    const LONG cNewRef = InterlockedIncrement(&m_cRef);
+    return cNewRef;
+}
+
+STDMETHODIMP_(ULONG)
+CEnumMediaTypes::Release()
+{
+    // Atomic decrement. The result is kept in a local because no member
+    // may be touched once the object has deleted itself.
+    const ULONG cRemaining = InterlockedDecrement(&m_cRef);
+    if (0 == cRemaining) {
+        delete this;
+    }
+    return cRemaining;
+}
+
+/* One of an enumerator's basic member functions allows us to create a cloned
+   interface that initially has the same state. Since we are taking a snapshot
+   of an object (current position and all) we must lock access at the start */
+
+STDMETHODIMP
+CEnumMediaTypes::Clone(IEnumMediaTypes **ppEnum)
+{
+    CheckPointer(ppEnum,E_POINTER);
+    ValidateReadWritePtr(ppEnum,sizeof(IEnumMediaTypes *));
+
+    // A stale enumerator cannot be cloned.
+    if (AreWeOutOfSync() == TRUE) {
+        *ppEnum = NULL;
+        return VFW_E_ENUM_OUT_OF_SYNC;
+    }
+
+    // The copy constructor path duplicates our position and version.
+    *ppEnum = new CEnumMediaTypes(m_pPin,
+                                  this);
+
+    return (*ppEnum == NULL) ? E_OUTOFMEMORY : NOERROR;
+}
+
+
+/* Enumerate the next media type(s) after the current position. The client using this
+   interface passes in a pointer to an array of pointers each of which will
+   be filled in with a pointer to a fully initialised media type format
+   Return NOERROR if it all works,
+          S_FALSE if fewer than cMediaTypes were enumerated.
+          VFW_E_ENUM_OUT_OF_SYNC if the enumerator has been broken by
+                                 state changes in the filter
+   The actual count always correctly reflects the number of types in the array.
+*/
+
+// Fetches up to cMediaTypes media types from the pin, starting at the
+// current position. Each returned AM_MEDIA_TYPE is allocated with
+// CoTaskMemAlloc. Returns NOERROR when all requested types were delivered,
+// S_FALSE when fewer were, E_INVALIDARG when pcFetched is NULL but more
+// than one type was requested, and VFW_E_ENUM_OUT_OF_SYNC when the
+// enumerator is stale.
+STDMETHODIMP
+CEnumMediaTypes::Next(ULONG cMediaTypes,          // place this many types...
+              AM_MEDIA_TYPE **ppMediaTypes,   // ...in this array
+              ULONG *pcFetched)           // actual count passed
+{
+    CheckPointer(ppMediaTypes,E_POINTER);
+    ValidateReadWritePtr(ppMediaTypes,cMediaTypes * sizeof(AM_MEDIA_TYPE *));
+    /* Check we are still in sync with the pin */
+    if (AreWeOutOfSync() == TRUE) {
+        return VFW_E_ENUM_OUT_OF_SYNC;
+    }
+
+    if (pcFetched!=NULL) {
+        ValidateWritePtr(pcFetched, sizeof(ULONG));
+        *pcFetched = 0;           // default unless we succeed
+    }
+    // now check that the parameter is valid
+    else if (cMediaTypes>1) {     // pcFetched == NULL
+        return E_INVALIDARG;
+    }
+    ULONG cFetched = 0;           // increment as we get each one.
+
+    /* Return each media type by asking the pin for them in turn. Any
+       result other than S_OK from GetMediaType ends the batch early, and
+       S_FALSE is then returned with the count of types actually fetched */
+
+    while (cMediaTypes) {
+
+        CMediaType cmt;
+
+        HRESULT hr = m_pPin->GetMediaType(m_Position++, &cmt);
+        if (S_OK != hr) {
+            break;
+        }
+
+        /* We now have a CMediaType object that contains the next media type
+           but when we assign it to the array position we CANNOT just assign
+           the AM_MEDIA_TYPE structure because as soon as the object goes out of
+           scope it will delete the memory we have just copied. */
+
+        /*  Transfer across the format block manually to save an allocate
+            and free on the format block and generally go faster */
+
+        *ppMediaTypes = (AM_MEDIA_TYPE *)CoTaskMemAlloc(sizeof(AM_MEDIA_TYPE));
+        if (*ppMediaTypes == NULL) {
+            /*  This type was never delivered, so step back to it - otherwise
+                the next call would silently skip over it */
+            m_Position--;
+            break;
+        }
+
+        /*  Do a regular copy */
+        **ppMediaTypes = (AM_MEDIA_TYPE)cmt;
+
+        /*  Ownership of the format block and pUnk has moved to the copy;
+            make sure the destructor doesn't free these */
+        cmt.pbFormat = NULL;
+        cmt.cbFormat = 0;
+        cmt.pUnk     = NULL;
+
+
+        ppMediaTypes++;
+        cFetched++;
+        cMediaTypes--;
+    }
+
+    if (pcFetched!=NULL) {
+        *pcFetched = cFetched;
+    }
+
+    return ( cMediaTypes==0 ? NOERROR : S_FALSE );
+}
+
+
+/* Skip over one or more entries in the enumerator */
+
+STDMETHODIMP
+CEnumMediaTypes::Skip(ULONG cMediaTypes)
+{
+    //  Skipping nothing always succeeds, even on a stale enumerator.
+    if (0 == cMediaTypes) {
+        return S_OK;
+    }
+
+    //  A stale enumerator cannot be advanced.
+    if (AreWeOutOfSync() == TRUE) {
+        return VFW_E_ENUM_OUT_OF_SYNC;
+    }
+
+    m_Position += cMediaTypes;
+
+    //  Probe the last entry skipped: if the pin cannot supply it, we have
+    //  run past the end of the list.
+    CMediaType cmtProbe;
+    const HRESULT hrProbe = m_pPin->GetMediaType(m_Position - 1, &cmtProbe);
+    if (S_OK == hrProbe) {
+        return S_OK;
+    }
+    return S_FALSE;
+}
+
+
+/* Set the current position back to the start */
+/* Reset has 3 simple steps:
+ *
+ * set position to head of list
+ * sync enumerator with object being enumerated
+ * return S_OK
+ */
+
+STDMETHODIMP
+CEnumMediaTypes::Reset()
+
+{
+    // Pick up the pin's current media type version so the enumerator is
+    // valid on the next call; this may well be a no-op.
+    m_Version = m_pPin->GetMediaTypeVersion();
+
+    // Back to the head of the list.
+    m_Position = 0;
+
+    return NOERROR;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CBasePin
+//=====================================================================
+//=====================================================================
+
+
+/* NOTE The implementation of this class calls the CUnknown constructor with
+   a NULL outer unknown pointer. This has the effect of making us a self
+   contained class, ie any QueryInterface, AddRef or Release calls will be
+   routed to the class's NonDelegatingUnknown methods. You will typically
+   find that the classes that do this then override one or more of these
+   virtual functions to provide more specialised behaviour. A good example
+   of this is where a class wants to keep the QueryInterface internal but
+   still wants its lifetime controlled by the external object */
+
+/* Constructor (TCHAR object name)
+
+   Records the owning filter, the lock and the pin direction, and keeps a
+   private heap copy of pName in m_pName when a name is supplied. phr is
+   accepted but is not written by this constructor. */
+
+CBasePin::CBasePin(TCHAR *pObjectName,
+           CBaseFilter *pFilter,
+           CCritSec *pLock,
+           HRESULT *phr,
+           LPCWSTR pName,
+           PIN_DIRECTION dir) :
+    CUnknown( pObjectName, NULL ),
+    m_pFilter(pFilter),
+    m_pLock(pLock),
+    m_pName(NULL),
+    m_Connected(NULL),
+    m_dir(dir),
+    m_bRunTimeError(FALSE),
+    m_pQSink(NULL),
+    m_TypeVersion(1),
+    m_tStart(),
+    m_tStop(MAX_TIME),
+    m_bCanReconnectWhenActive(false),
+    m_bTryMyTypesFirst(false),
+    m_dRate(1.0)
+{
+    /*  WARNING - pFilter is often not a properly constituted object at
+        this stage (in particular QueryInterface may not work). Its owner
+        is often its containing object, and we have been called from that
+        object's constructor, so the filter's owner has not yet had its
+        CUnknown constructor called
+    */
+
+    ASSERT(pFilter != NULL);
+    ASSERT(pLock != NULL);
+
+    // Duplicate the pin name, terminating null included
+    if (pName != NULL) {
+        const DWORD cchWithNull = lstrlenW(pName) + 1;
+        m_pName = new WCHAR[cchWithNull];
+        if (m_pName != NULL) {
+            CopyMemory(m_pName, pName, cchWithNull * sizeof(WCHAR));
+        }
+    }
+
+    // The pin's own count is only zeroed in DEBUG builds
+#ifdef DEBUG
+    m_cRef = 0;
+#endif
+}
+
+#ifdef UNICODE
+/* Constructor (CHAR object name), compiled only when UNICODE is defined.
+   Apart from the type of pObjectName it does the same work as the TCHAR
+   constructor: it records the filter, lock and direction and copies
+   pName into m_pName. phr is accepted but is not written here. */
+CBasePin::CBasePin(CHAR *pObjectName,
+           CBaseFilter *pFilter,
+           CCritSec *pLock,
+           HRESULT *phr,
+           LPCWSTR pName,
+           PIN_DIRECTION dir) :
+    CUnknown( pObjectName, NULL ),
+    m_pFilter(pFilter),
+    m_pLock(pLock),
+    m_pName(NULL),
+    m_Connected(NULL),
+    m_dir(dir),
+    m_bRunTimeError(FALSE),
+    m_pQSink(NULL),
+    m_TypeVersion(1),
+    m_tStart(),
+    m_tStop(MAX_TIME),
+    m_bCanReconnectWhenActive(false),
+    m_bTryMyTypesFirst(false),
+    m_dRate(1.0)
+{
+    /*  WARNING - pFilter is often not a properly constituted object at
+        this stage (in particular QueryInterface may not work). Its owner
+        is often its containing object, and we have been called from that
+        object's constructor, so the filter's owner has not yet had its
+        CUnknown constructor called
+    */
+
+    ASSERT(pFilter != NULL);
+    ASSERT(pLock != NULL);
+
+    // Duplicate the pin name, terminating null included
+    if (pName != NULL) {
+        const DWORD cchWithNull = lstrlenW(pName) + 1;
+        m_pName = new WCHAR[cchWithNull];
+        if (m_pName != NULL) {
+            CopyMemory(m_pName, pName, cchWithNull * sizeof(WCHAR));
+        }
+    }
+
+    // The pin's own count is only zeroed in DEBUG builds
+#ifdef DEBUG
+    m_cRef = 0;
+#endif
+}
+#endif
+
+/* Destructor. A connected pin holds a reference count on us, so we
+   cannot be deleted unless we are currently not connected */
+
+CBasePin::~CBasePin()
+{
+    //  Disconnect is deliberately not called: if the filter is going
+    //  away, every pin must have a reference count of zero and so must
+    //  already have been disconnected - the assert checks that assumption
+    ASSERT(m_Connected == NULL);
+
+    //  Free the name copied by the constructor
+    delete[] m_pName;
+
+    //  The internal reference count should be back to zero by now
+    ASSERT(m_cRef == 0);
+}
+
+
+/* Hands out the interfaces this pin implements itself (IPin and
+   IQualityControl); every other request is passed to CUnknown.
+   Override this to say what further interfaces you support and where */
+
+STDMETHODIMP
+CBasePin::NonDelegatingQueryInterface(REFIID riid, void ** ppv)
+{
+    if (riid == IID_IPin) {
+        return GetInterface((IPin *) this, ppv);
+    }
+
+    if (riid == IID_IQualityControl) {
+        return GetInterface((IQualityControl *) this, ppv);
+    }
+
+    /* Not one of ours - let the base class decide */
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+/* Override that increments the owning filter's reference count and
+   returns the value the filter reports.
+
+   NOTE(review): the pin's own m_cRef is updated inside ASSERT, so it is
+   presumably only maintained in builds where ASSERT evaluates its
+   argument - confirm against the ASSERT definition */
+
+STDMETHODIMP_(ULONG)
+CBasePin::NonDelegatingAddRef()
+{
+    ASSERT(InterlockedIncrement(&m_cRef) > 0);
+
+    const ULONG cFilterRefs = m_pFilter->AddRef();
+    return cFilterRefs;
+}
+
+
+/* Override that decrements the owning filter's reference count and
+   returns the value the filter reports.
+
+   NOTE(review): the pin's own m_cRef is updated inside ASSERT, so it is
+   presumably only maintained in builds where ASSERT evaluates its
+   argument - confirm against the ASSERT definition */
+
+STDMETHODIMP_(ULONG)
+CBasePin::NonDelegatingRelease()
+{
+    ASSERT(InterlockedDecrement(&m_cRef) >= 0);
+
+    const ULONG cFilterRefs = m_pFilter->Release();
+    return cFilterRefs;
+}
+
+
+/* Logs the names of the two pins taking part in a connection attempt
+   (this pin first, then pReceivePin). DEBUG builds only */
+
+#ifdef DEBUG
+void
+CBasePin::DisplayPinInfo(IPin *pReceivePin)
+{
+    // Nothing to do unless connection tracing is enabled
+    if (!DbgCheckModuleLevel(LOG_TRACE, CONNECT_TRACE_LEVEL)) {
+        return;
+    }
+
+    PIN_INFO piOurs;
+    PIN_INFO piTheirs;
+
+    // Only the names are wanted: after a successful query give back the
+    // filter held in the PIN_INFO, after a failed one use a marker name.
+    if (SUCCEEDED(QueryPinInfo(&piOurs))) {
+        QueryPinInfoReleaseFilter(piOurs);
+    } else {
+        (void)StringCchCopyW(piOurs.achName, NUMELMS(piOurs.achName),L"Bad Pin");
+    }
+
+    if (SUCCEEDED(pReceivePin->QueryPinInfo(&piTheirs))) {
+        QueryPinInfoReleaseFilter(piTheirs);
+    } else {
+        (void)StringCchCopyW(piTheirs.achName, NUMELMS(piTheirs.achName),L"Bad Pin");
+    }
+
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Trying to connect Pins :")));
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("    <%ls>"), piOurs.achName));
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("    <%ls>"), piTheirs.achName));
+}
+#endif
+
+
+/* Logs the major type and subtype names of the media type about to be
+   tried. pPin is not used. DEBUG builds only */
+
+#ifdef DEBUG
+void CBasePin::DisplayTypeInfo(IPin *pPin, const CMediaType *pmt)
+{
+    UNREFERENCED_PARAMETER(pPin);
+
+    // Nothing to do unless connection tracing is enabled
+    if (!DbgCheckModuleLevel(LOG_TRACE, CONNECT_TRACE_LEVEL)) {
+        return;
+    }
+
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Trying media type:")));
+
+    // GuidNames turns each GUID into a printable name
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("    major type:  %hs"),
+           GuidNames[*pmt->Type()]));
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("    sub type  :  %hs"),
+           GuidNames[*pmt->Subtype()]));
+}
+#endif
+
+/* Asked to connect to a pin. A pin is always attached to an owning filter
+   object, so locking is always delegated to that object (m_pLock). The
+   search for a media type both pins accept is done by AgreeMediaType.
+
+   Returns NOERROR on success, VFW_E_ALREADY_CONNECTED if this pin already
+   has a connection, VFW_E_NOT_STOPPED if the filter is active and the pin
+   may not reconnect while active, or the failure from AgreeMediaType */
+
+STDMETHODIMP
+CBasePin::Connect(
+    IPin * pReceivePin,
+    const AM_MEDIA_TYPE *pmt   // optional media type
+)
+{
+    CheckPointer(pReceivePin,E_POINTER);
+    ValidateReadPtr(pReceivePin,sizeof(IPin));
+    CAutoLock cObjectLock(m_pLock);
+    DisplayPinInfo(pReceivePin);
+
+    /* Only one connection at a time */
+
+    if (m_Connected) {
+        DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Already connected")));
+        return VFW_E_ALREADY_CONNECTED;
+    }
+
+    /* An active filter may only connect if the pin explicitly allows it */
+    if (!(IsStopped() || m_bCanReconnectWhenActive)) {
+        return VFW_E_NOT_STOPPED;
+    }
+
+
+    // Find a mutually agreeable media type, handing over the template
+    // type from the caller. A partially specified template is checked
+    // against each enumerated type; a non-null, fully specified one is
+    // the only type that will be tried.
+
+    const CMediaType * pcmtTemplate = (CMediaType*)pmt;
+    const HRESULT hrAgree = AgreeMediaType(pReceivePin, pcmtTemplate);
+    if (SUCCEEDED(hrAgree)) {
+        DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Connection succeeded")));
+        return NOERROR;
+    }
+
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Failed to agree type")));
+
+    // An error code is already being returned, so a BreakConnect failure
+    // cannot be reported any other way than by asserting.
+    EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+    return hrAgree;
+}
+
+// Given a specific media type, attempt a connection. This includes
+// checking that the type is acceptable to this pin. On success the pin is
+// left connected (m_Connected holds a reference on pReceivePin); on any
+// failure BreakConnect is called and m_Connected is released and cleared.
+HRESULT
+CBasePin::AttemptConnection(
+    IPin* pReceivePin,      // connect to this pin
+    const CMediaType* pmt   // using this type
+)
+{
+    // The caller should hold the filter lock because this function
+    // uses m_Connected, and because it calls SetMediaType(), IsStopped()
+    // and CompleteConnect().
+    ASSERT(CritCheckIn(m_pLock));
+
+    // The connection check has to be repeated for every attempt, since
+    // BreakConnect undoes it.
+    HRESULT hr = CheckConnect(pReceivePin);
+    if (FAILED(hr)) {
+        DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("CheckConnect failed")));
+
+        // An error code is already being returned, so a BreakConnect
+        // failure cannot be reported any other way than by asserting.
+        EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+        return hr;
+    }
+
+    DisplayTypeInfo(pReceivePin, pmt);
+
+    /* Will we accept this media type ourselves? */
+
+    hr = CheckMediaType(pmt);
+    if (hr != NOERROR) {
+        // We cannot use this media type. Keep a specific media type
+        // error if there is one, but map success codes (S_FALSE in
+        // particular) and general failures to something more helpful.
+        if (SUCCEEDED(hr) ||
+            (hr == E_FAIL) ||
+            (hr == E_INVALIDARG)) {
+            hr = VFW_E_TYPE_NOT_ACCEPTED;
+        }
+    } else {
+        /*  Make ourselves look connected, otherwise ReceiveConnection
+            may not be able to complete the connection
+        */
+        m_Connected = pReceivePin;
+        m_Connected->AddRef();
+
+        hr = SetMediaType(pmt);
+        if (SUCCEEDED(hr)) {
+            /* See if the other pin will accept this type */
+            hr = pReceivePin->ReceiveConnection((IPin *)this, pmt);
+        }
+        if (SUCCEEDED(hr)) {
+            /* Complete the connection */
+            hr = CompleteConnect(pReceivePin);
+            if (SUCCEEDED(hr)) {
+                return hr;
+            }
+
+            DbgLog((LOG_TRACE,
+                    CONNECT_TRACE_LEVEL,
+                    TEXT("Failed to complete connection")));
+            pReceivePin->Disconnect();
+        }
+    }
+
+    // Every failure ends up here. BreakConnect covers both the case where
+    // CheckMediaType failed and anything set up during a call back made
+    // from ReceiveConnection.
+
+    // An error code is already being returned, so a BreakConnect failure
+    // cannot be reported any other way than by asserting.
+    EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+    /*  Undo the provisional connection, if one was made */
+    if (m_Connected) {
+        m_Connected->Release();
+        m_Connected = NULL;
+    }
+
+    return hr;
+}
+
+/* Given an enumerator, cycle through the media types it proposes. Each
+   one that matches the partial type pmt (if any) is offered to
+   AttemptConnection, which checks it with our derived pin class before
+   trying it on the other pin. Types that our own pin proposes are
+   therefore still checked here. This is deliberate: in simple cases the
+   enumerator can list every media type even when some are not currently
+   available.
+
+   Returns S_OK as soon as one attempt returns S_OK. Once the enumerator
+   is exhausted it returns the first specific failure seen, or
+   VFW_E_NO_ACCEPTABLE_TYPES if there was none */
+
+HRESULT CBasePin::TryMediaTypes(
+    IPin *pReceivePin,
+    const CMediaType *pmt,
+    IEnumMediaTypes *pEnum)
+{
+    /* Always start from the first type on offer */
+
+    HRESULT hr = pEnum->Reset();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    CMediaType *pCandidate = NULL;
+    ULONG cFetched = 0;
+
+    // S_OK here means that no specific failure has been remembered yet
+    HRESULT hrFailure = S_OK;
+
+    for (;;) {
+
+        /* Fetch the next media type. NOTE each call makes the enumerator
+           allocate another AM_MEDIA_TYPE structure, which is freed with
+           DeleteMediaType at the bottom of the loop */
+
+        hr = pEnum->Next(1, (AM_MEDIA_TYPE**)&pCandidate,&cFetched);
+        if (hr != S_OK) {
+            // Out of candidates: report what we remembered, if anything
+            return (S_OK == hrFailure) ? VFW_E_NO_ACCEPTABLE_TYPES
+                                       : hrFailure;
+        }
+
+
+        ASSERT(cFetched == 1);
+        ASSERT(pCandidate);
+
+        // a candidate that does not match the partial type is rejected
+        // without attempting a connection
+
+        if ((pmt != NULL) &&
+            !pCandidate->MatchesPartial(pmt)) {
+            hr = VFW_E_NO_ACCEPTABLE_TYPES;
+        } else {
+
+            hr = AttemptConnection(pReceivePin, pCandidate);
+
+            // remember the first failure that is more specific than
+            // the generic codes
+            const bool bGenericCode = (hr == E_FAIL) ||
+                                      (hr == E_INVALIDARG) ||
+                                      (hr == VFW_E_TYPE_NOT_ACCEPTED);
+            if (FAILED(hr) && SUCCEEDED(hrFailure) && !bGenericCode) {
+                hrFailure = hr;
+            }
+        }
+
+        DeleteMediaType(pCandidate);
+
+        if (S_OK == hr) {
+            return hr;
+        }
+    }
+}
+
+
+/* This is called to make the connection, including the task of finding
+   a media type for the pin connection. pmt is the proposed media type
+   from the Connect call: if it is fully specified, only that type is
+   tried. Otherwise two enumerators are tried in turn, the receiving
+   pin's and our own; m_bTryMyTypesFirst decides which of them goes
+   first. Each enumerated type is checked against pmt (if non-null and
+   partially specified) and has to be accepted by both pins.
+ */
+
+HRESULT CBasePin::AgreeMediaType(
+    IPin *pReceivePin,
+    const CMediaType *pmt)
+{
+    ASSERT(pReceivePin);
+    IEnumMediaTypes *pEnumMediaTypes = NULL;
+
+    // A fully specified type is the only one we are allowed to connect
+    // with, so if it fails the whole connection fails.
+    if ( (pmt != NULL) && (!pmt->IsPartiallySpecified())) {
+        return AttemptConnection(pReceivePin, pmt);
+    }
+
+
+    /* Work through the two enumerators */
+
+    HRESULT hrFailure = VFW_E_NO_ACCEPTABLE_TYPES;
+
+    for (int iPass = 0; iPass < 2; iPass++) {
+        // The receiving pin's enumerator is used on pass 0 unless
+        // m_bTryMyTypesFirst is set, in which case it is used on pass 1
+        HRESULT hr;
+        if (iPass == (int)m_bTryMyTypesFirst) {
+            hr = pReceivePin->EnumMediaTypes(&pEnumMediaTypes);
+        } else {
+            hr = EnumMediaTypes(&pEnumMediaTypes);
+        }
+        if (FAILED(hr)) {
+            // no enumerator on this pass - move on to the next one
+            continue;
+        }
+
+        ASSERT(pEnumMediaTypes);
+        hr = TryMediaTypes(pReceivePin,pmt,pEnumMediaTypes);
+        pEnumMediaTypes->Release();
+        if (SUCCEEDED(hr)) {
+            return NOERROR;
+        }
+
+        // remember specific error codes in preference to generic ones
+        const bool bGenericCode = (hr == E_FAIL) ||
+                                  (hr == E_INVALIDARG) ||
+                                  (hr == VFW_E_TYPE_NOT_ACCEPTED);
+        if (!bGenericCode) {
+            hrFailure = hr;
+        }
+    }
+
+    return hrFailure;
+}
+
+
+/* Called when we want to complete a connection to another filter. A
+   failure here fails the connection and disconnects the other pin as
+   well. This base implementation ignores pReceivePin and succeeds */
+
+HRESULT
+CBasePin::CompleteConnect(IPin *pReceivePin)
+{
+    UNREFERENCED_PARAMETER(pReceivePin);
+
+    /* Nothing to do at this level */
+    return NOERROR;
+}
+
+
+/* This is called to set the format for a pin connection. CheckMediaType
+   will already have been called to check the format, and this (virtual)
+   function is only invoked if it did not return an error code. The type
+   is copied into m_mt; a failed copy is returned to the caller and any
+   success is reported as NOERROR */
+
+HRESULT
+CBasePin::SetMediaType(const CMediaType *pmt)
+{
+    const HRESULT hrCopy = m_mt.Set(*pmt);
+
+    return FAILED(hrCopy) ? hrCopy : NOERROR;
+}
+
+
+/* This is called during Connect() to provide a virtual method that can do
+   any specific check needed for connection such as QueryInterface. This
+   base class method just checks that the pin directions don't match.
+
+   Returns NOERROR when the two directions differ, VFW_E_INVALID_DIRECTION
+   when they are the same, or the failure code from the other pin's
+   QueryDirection call if its direction cannot be obtained */
+
+HRESULT
+CBasePin::CheckConnect(IPin * pPin)
+{
+    /* Check that pin directions DONT match */
+
+    // The result of QueryDirection has to be checked: if the call fails,
+    // pd has not been written and must not be read.
+    PIN_DIRECTION pd;
+    HRESULT hr = pPin->QueryDirection(&pd);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    ASSERT((pd == PINDIR_OUTPUT) || (pd == PINDIR_INPUT));
+    ASSERT((m_dir == PINDIR_OUTPUT) || (m_dir == PINDIR_INPUT));
+
+    // we should allow for non-input and non-output connections?
+    if (pd == m_dir) {
+        return VFW_E_INVALID_DIRECTION;
+    }
+    return NOERROR;
+}
+
+
+/* This is called when we realise we can't make a connection to the pin
+   and must undo anything we did in CheckConnect. Override it to release
+   any interfaces obtained there. This base implementation has nothing
+   to undo and always succeeds */
+
+HRESULT
+CBasePin::BreakConnect()
+{
+    /* Nothing to undo at this level */
+    return NOERROR;
+}
+
+
+/* Called normally by an output pin on an input pin to try and establish a
+   connection.
+
+   Returns NOERROR on success, VFW_E_ALREADY_CONNECTED if this pin already
+   has a connection, VFW_E_NOT_STOPPED if the filter is active and the pin
+   may not reconnect while active, or a failure from CheckConnect,
+   CheckMediaType, SetMediaType or CompleteConnect. BreakConnect is called
+   on every failure after CheckConnect has run.
+*/
+
+STDMETHODIMP
+CBasePin::ReceiveConnection(
+    IPin * pConnector,      // this is the pin who we will connect to
+    const AM_MEDIA_TYPE *pmt    // this is the media type we will exchange
+)
+{
+    CheckPointer(pConnector,E_POINTER);
+    CheckPointer(pmt,E_POINTER);
+    ValidateReadPtr(pConnector,sizeof(IPin));
+    ValidateReadPtr(pmt,sizeof(AM_MEDIA_TYPE));
+    CAutoLock cObjectLock(m_pLock);
+
+    /* Only one connection at a time */
+    if (m_Connected) {
+        return VFW_E_ALREADY_CONNECTED;
+    }
+
+    /* An active filter may only connect if the pin explicitly allows it */
+    if (!(IsStopped() || m_bCanReconnectWhenActive)) {
+        return VFW_E_NOT_STOPPED;
+    }
+
+    HRESULT hr = CheckConnect(pConnector);
+    if (FAILED(hr)) {
+        // An error code is already being returned, so a BreakConnect
+        // failure cannot be reported any other way than by asserting.
+        EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+        return hr;
+    }
+
+    /* Ask the derived class whether this media type is acceptable */
+
+    CMediaType * pcmtProposed = (CMediaType*) pmt;
+    hr = CheckMediaType(pcmtProposed);
+    if (hr != NOERROR) {
+        // no - we don't support this media type
+
+        // An error code is already being returned, so a BreakConnect
+        // failure cannot be reported any other way than by asserting.
+        EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+        // Keep a specific media type error if there is one, but map
+        // success codes (S_FALSE in particular) and general failures
+        // to something more helpful.
+        const bool bNeedsMapping = SUCCEEDED(hr) ||
+                                   (hr == E_FAIL) ||
+                                   (hr == E_INVALIDARG);
+        return bNeedsMapping ? VFW_E_TYPE_NOT_ACCEPTED : hr;
+    }
+
+    /* Complete the connection: hold a reference on the connecting pin
+       while the type is set and the connection is finished off */
+
+    m_Connected = pConnector;
+    m_Connected->AddRef();
+
+    hr = SetMediaType(pcmtProposed);
+    if (SUCCEEDED(hr)) {
+        hr = CompleteConnect(pConnector);
+    }
+    if (SUCCEEDED(hr)) {
+        return NOERROR;
+    }
+
+    DbgLog((LOG_TRACE, CONNECT_TRACE_LEVEL, TEXT("Failed to set the media type or failed to complete the connection.")));
+
+    /* Drop the reference taken above before undoing the connection */
+    m_Connected->Release();
+    m_Connected = NULL;
+
+    // An error code is already being returned, so a BreakConnect failure
+    // cannot be reported any other way than by asserting.
+    EXECUTE_ASSERT( SUCCEEDED( BreakConnect() ) );
+
+    return hr;
+}
+
+
+/* Called when we want to terminate a pin connection. Takes the filter
+   lock, returns VFW_E_NOT_STOPPED if the filter is not stopped, and
+   otherwise returns the result of DisconnectInternal */
+
+STDMETHODIMP
+CBasePin::Disconnect()
+{
+    CAutoLock cObjectLock(m_pLock);
+
+    /* Only a stopped filter can have its pins disconnected */
+    if (IsStopped()) {
+        return DisconnectInternal();
+    }
+
+    return VFW_E_NOT_STOPPED;
+}
+
+/* Does the work of disconnecting; the caller must already hold m_pLock.
+   Returns S_FALSE if there is no connection, the failure code if
+   BreakConnect fails (the connection is then left in place), and S_OK
+   once the connected pin has been released */
+STDMETHODIMP
+CBasePin::DisconnectInternal()
+{
+    ASSERT(CritCheckIn(m_pLock));
+
+    // no connection - not an error
+    if (!m_Connected) {
+        return S_FALSE;
+    }
+
+    const HRESULT hrBreak = BreakConnect();
+    if( FAILED( hrBreak ) ) {
+        // There is usually a bug in the program if BreakConnect() fails.
+        DbgBreak( "WARNING: BreakConnect() failed in CBasePin::Disconnect()." );
+        return hrBreak;
+    }
+
+    // Give up the reference taken when the connection was made
+    m_Connected->Release();
+    m_Connected = NULL;
+
+    return S_OK;
+}
+
+
+/* Return an AddRef()'d pointer to the connected pin if there is one.
+   When there is none, *ppPin is set to NULL and VFW_E_NOT_CONNECTED is
+   returned */
+STDMETHODIMP
+CBasePin::ConnectedTo(
+    IPin **ppPin
+)
+{
+    CheckPointer(ppPin,E_POINTER);
+    ValidateReadWritePtr(ppPin,sizeof(IPin *));
+    //
+    //  No lock is taken here: it would be pointless, and the caller
+    //  should ensure integrity.
+    //
+
+    //  Read m_Connected once so that the test and the AddRef below
+    //  both act on the same pointer
+    IPin * const pPeer = m_Connected;
+    *ppPin = pPeer;
+
+    if (pPeer == NULL) {
+        ASSERT(*ppPin == NULL);
+        return VFW_E_NOT_CONNECTED;
+    }
+
+    pPeer->AddRef();
+    return S_OK;
+}
+
+/* Return the media type of the connection. When the pin is connected,
+   *pmt receives a copy of m_mt and S_OK is returned. Otherwise *pmt is
+   reset with InitMediaType and VFW_E_NOT_CONNECTED is returned */
+STDMETHODIMP
+CBasePin::ConnectionMediaType(
+    AM_MEDIA_TYPE *pmt
+)
+{
+    CheckPointer(pmt,E_POINTER);
+    ValidateReadWritePtr(pmt,sizeof(AM_MEDIA_TYPE));
+    CAutoLock cObjectLock(m_pLock);
+
+    if (!IsConnected()) {
+        /*  No connection - hand back an initialised, empty type */
+        ((CMediaType *)pmt)->InitMediaType();
+        return VFW_E_NOT_CONNECTED;
+    }
+
+    /*  CopyMediaType fills in the caller's structure from m_mt */
+    CopyMediaType( pmt, &m_mt );
+    return S_OK;
+}
+
+/* Return information about this pin: the owning filter (AddRef()'d when
+   there is one), the pin name (an empty string if the pin has none) and
+   the pin direction. Always returns NOERROR once the pointer check has
+   passed */
+
+STDMETHODIMP
+CBasePin::QueryPinInfo(
+    PIN_INFO * pInfo
+)
+{
+    CheckPointer(pInfo,E_POINTER);
+    ValidateReadWritePtr(pInfo,sizeof(PIN_INFO));
+
+    /* The caller receives its own reference on the filter */
+    pInfo->pFilter = m_pFilter;
+    if (m_pFilter != NULL) {
+        m_pFilter->AddRef();
+    }
+
+    /* Copy at most as many characters as achName can hold */
+    if (m_pName == NULL) {
+        pInfo->achName[0] = L'\0';
+    } else {
+        lstrcpynW(pInfo->achName, m_pName, sizeof(pInfo->achName)/sizeof(WCHAR));
+    }
+
+    pInfo->dir = m_dir;
+
+    return NOERROR;
+}
+
+/* Report the direction (m_dir) this pin was constructed with */
+STDMETHODIMP
+CBasePin::QueryDirection(
+    PIN_DIRECTION * pPinDir
+)
+{
+    CheckPointer(pPinDir,E_POINTER);
+    ValidateReadWritePtr(pPinDir,sizeof(PIN_DIRECTION));
+
+    *pPinDir = m_dir;
+
+    return NOERROR;
+}
+
+// Default QueryId: returns the pin's name as its identifier, by handing
+// Name() and the caller's Id pointer to AMGetWideString and returning
+// its result.
+STDMETHODIMP
+CBasePin::QueryId(
+    LPWSTR * Id
+)
+{
+    //  We're not going away because someone's got a pointer to us
+    //  so there's no need to lock
+
+    return AMGetWideString(Name(), Id);
+}
+
+/* Does this pin support this media type? WARNING this interface function
+   does not lock the main object, as it is meant to be asynchronous by
+   nature. If the media types you support depend on internal state that
+   is updated dynamically, you will need to implement locking in a
+   derived class */
+
+STDMETHODIMP
+CBasePin::QueryAccept(
+    const AM_MEDIA_TYPE *pmt
+)
+{
+    CheckPointer(pmt,E_POINTER);
+    ValidateReadPtr(pmt,sizeof(AM_MEDIA_TYPE));
+
+    /* CheckMediaType is entitled to return error codes for a media type
+       it dislikes (E_INVALIDARG, for example). Every failure is folded
+       into S_FALSE here; success codes are passed through unchanged */
+
+    const HRESULT hrCheck = CheckMediaType((CMediaType*)pmt);
+
+    // note that the only defined success codes should be S_OK and S_FALSE...
+    return FAILED(hrCheck) ? S_FALSE : hrCheck;
+}
+
+
+/* This can be called to return an enumerator for the pin's list of
+   preferred media types. An input pin is not obliged to have any
+   preferred formats, although it can do. For example, the window renderer
+   has a preferred type which describes a video image that matches the
+   current window size. All output pins should expose at least one
+   preferred format, otherwise it is possible that neither pin has any
+   types and so no connection is possible */
+
+STDMETHODIMP
+CBasePin::EnumMediaTypes(
+    IEnumMediaTypes **ppEnum
+)
+{
+    CheckPointer(ppEnum,E_POINTER);
+    ValidateReadWritePtr(ppEnum,sizeof(IEnumMediaTypes *));
+
+    /* Create a new enumerator for this pin. The second argument is
+       NULL, i.e. there is no existing enumerator to start from */
+
+    *ppEnum = new CEnumMediaTypes(this,
+                              NULL);
+
+    return (*ppEnum == NULL) ? E_OUTOFMEMORY : NOERROR;
+}
+
+
+
+/* This is a virtual function that returns the media type at place
+   iPosition in the list. The base class supports no media types by
+   default, so it ignores both arguments and returns E_UNEXPECTED.
+   Derived classes should override it */
+
+HRESULT CBasePin::GetMediaType(int iPosition, CMediaType *pMediaType)
+{
+    UNREFERENCED_PARAMETER(pMediaType);
+    UNREFERENCED_PARAMETER(iPosition);
+
+    return E_UNEXPECTED;
+}
+
+
+/* This is a virtual function that returns the current media type
+   version. The base pin starts m_TypeVersion at 1 and by default this
+   function always returns that member. A derived class that changes the
+   list of available media types should increment the version afterwards,
+   either in an override of this method or, more simply, by incrementing
+   the m_TypeVersion base pin variable. The type enumerators call this
+   when they want to see if their enumerations are out of date */
+
+LONG CBasePin::GetMediaTypeVersion()
+{
+    const LONG lVersion = m_TypeVersion;
+    return lVersion;
+}
+
+
+/* Bump the cookie that represents the current media type version. The
+   increment is done with InterlockedIncrement on m_TypeVersion */
+
+void CBasePin::IncrementTypeVersion()
+{
+    (void)InterlockedIncrement(&m_TypeVersion);
+}
+
+
+/* Called by the IMediaFilter implementation when the state changes from
+   Stopped to either paused or running. Derived classes could use it to
+   commit memory and grab hardware resources; the default does nothing
+   and succeeds */
+
+HRESULT
+CBasePin::Active(void)
+{
+    /* Nothing to do at this level */
+    return NOERROR;
+}
+
+/* Called by the IMediaFilter implementation when the state changes from
+   paused to running. Derived classes could use it to commit memory and
+   grab hardware resources; the default ignores tStart, does nothing and
+   succeeds */
+
+HRESULT
+CBasePin::Run(REFERENCE_TIME tStart)
+{
+    UNREFERENCED_PARAMETER(tStart);
+
+    /* Nothing to do at this level */
+    return NOERROR;
+}
+
+
+/* Also called by the IMediaFilter implementation, when the state changes
+   to Stopped. At this point you should decommit allocators and free any
+   hardware resources you grabbed in the Active call. The default only
+   clears the run time error flag */
+
+HRESULT
+CBasePin::Inactive(void)
+{
+    /* Forget any run time error that was recorded */
+    m_bRunTimeError = FALSE;
+
+    return NOERROR;
+}
+
+
+// Called when no more data will arrive. The base class takes no action
+// and reports success.
+STDMETHODIMP
+CBasePin::EndOfStream(void)
+{
+    // Nothing to do at this level
+    return S_OK;
+}
+
+
+// Record piqc in m_pQSink under the filter lock. NULL is accepted and is
+// simply stored; the pointer is stored as is, with no AddRef.
+STDMETHODIMP
+CBasePin::SetSink(IQualityControl * piqc)
+{
+    CAutoLock cObjectLock(m_pLock);
+
+    // Only a non-NULL pointer is worth validating
+    if (piqc != NULL) {
+        ValidateReadPtr(piqc,sizeof(IQualityControl));
+    }
+
+    m_pQSink = piqc;
+    return NOERROR;
+} // SetSink
+
+
+// IQualityControl::Notify is not implemented by the base pin. This
+// version ignores both arguments, raises a debug break to point out the
+// missing override and returns E_NOTIMPL.
+STDMETHODIMP
+CBasePin::Notify(IBaseFilter * pSender, Quality q)
+{
+    UNREFERENCED_PARAMETER(pSender);
+    UNREFERENCED_PARAMETER(q);
+
+    DbgBreak("IQualityControl::Notify not over-ridden from CBasePin.  (IGNORE is OK)");
+
+    return E_NOTIMPL;
+} //Notify
+
+
+// NewSegment notifies the pin of the start time, stop time and rate that
+// apply to the data about to be received. The default implementation
+// stores the three values in m_tStart, m_tStop and m_dRate and returns
+// S_OK.
+// Override this to pass the notification downstream.
+STDMETHODIMP
+CBasePin::NewSegment(
+                REFERENCE_TIME tStart,
+                REFERENCE_TIME tStop,
+                double dRate)
+{
+    // Remember the segment for later use
+    m_tStart = tStart;
+    m_tStop = tStop;
+    m_dRate = dRate;
+
+    return S_OK;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CBaseOutputPin
+//=====================================================================
+//=====================================================================
+
+
+/* Constructor (TCHAR object name). Builds the CBasePin part with the
+   direction fixed to PINDIR_OUTPUT, and starts with no allocator and no
+   input pin interface */
+CBaseOutputPin::CBaseOutputPin(TCHAR *pObjectName,
+                   CBaseFilter *pFilter,
+                   CCritSec *pLock,
+                   HRESULT *phr,
+                   LPCWSTR pName) :
+    CBasePin(pObjectName, pFilter, pLock, phr, pName, PINDIR_OUTPUT),
+    m_pAllocator(NULL),
+    m_pInputPin(NULL)
+{
+    ASSERT(pFilter != NULL);
+}
+
+#ifdef UNICODE
+/* Constructor (CHAR object name), compiled only when UNICODE is defined.
+   Builds the CBasePin part with the direction fixed to PINDIR_OUTPUT,
+   and starts with no allocator and no input pin interface */
+CBaseOutputPin::CBaseOutputPin(CHAR *pObjectName,
+                   CBaseFilter *pFilter,
+                   CCritSec *pLock,
+                   HRESULT *phr,
+                   LPCWSTR pName) :
+    CBasePin(pObjectName, pFilter, pLock, phr, pName, PINDIR_OUTPUT),
+    m_pAllocator(NULL),
+    m_pInputPin(NULL)
+{
+    ASSERT(pFilter != NULL);
+}
+#endif
+
+/*   This is called after a media type has been proposed
+
+     Tries to complete the connection by agreeing the allocator: the
+     result of DecideAllocator, run against m_pInputPin, is returned and
+     the chosen allocator is left in m_pAllocator. pReceivePin is unused
+*/
+HRESULT
+CBaseOutputPin::CompleteConnect(IPin *pReceivePin)
+{
+    UNREFERENCED_PARAMETER(pReceivePin);
+
+    const HRESULT hrAllocator = DecideAllocator(m_pInputPin, &m_pAllocator);
+    return hrAllocator;
+}
+
+
+/* This method is called when the output pin is about to try and connect
+   to an input pin. It is at this point that you should try and grab any
+   extra interfaces that you need, in this case IMemInputPin. Because this
+   is only called if we are not currently connected we do NOT need to call
+   BreakConnect. This also makes it easier to derive classes from us as
+   BreakConnect is only called when we actually have to break a connection
+   (or a partly made connection) and not when we are checking a connection */
+
+/* Overridden from CBasePin */
+
+HRESULT
+CBaseOutputPin::CheckConnect(IPin * pPin)
+{
+    /* The base class check has to pass before anything else is tried */
+    HRESULT hr = CBasePin::CheckConnect(pPin);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // ask the other pin for its IMemInputPin interface, which is kept in
+    // m_pInputPin; any success is reported as NOERROR
+    hr = pPin->QueryInterface(IID_IMemInputPin, (void **) &m_pInputPin);
+
+    return FAILED(hr) ? hr : NOERROR;
+}
+
+
+/* Overridden from CBasePin. Decommits and releases the allocator, then
+   releases the input pin interface. If the Decommit call fails, its
+   error is returned straight away and nothing is released */
+
+HRESULT
+CBaseOutputPin::BreakConnect()
+{
+    /* Let go of the allocator, if we hold one */
+
+    if (m_pAllocator != NULL) {
+        // Always decommit the allocator because a downstream filter may or
+        // may not decommit the connection's allocator.  A memory leak could
+        // occur if the allocator is not decommitted when a connection is
+        // broken.
+        const HRESULT hrDecommit = m_pAllocator->Decommit();
+        if( FAILED( hrDecommit ) ) {
+            return hrDecommit;
+        }
+
+        m_pAllocator->Release();
+        m_pAllocator = NULL;
+    }
+
+    /* Let go of the input pin interface, if we hold one */
+
+    if (m_pInputPin != NULL) {
+        m_pInputPin->Release();
+        m_pInputPin = NULL;
+    }
+
+    return NOERROR;
+}
+
+
+/* This is called when the input pin didn't give us a valid allocator.
+   The request is passed to CreateMemoryAllocator, whose result is
+   returned */
+
+HRESULT
+CBaseOutputPin::InitAllocator(IMemAllocator **ppAlloc)
+{
+    const HRESULT hrCreate = CreateMemoryAllocator(ppAlloc);
+    return hrCreate;
+}
+
+
+/* Decide on an allocator. Override this if you want to use your own
+   allocator, and override DecideBufferSize to call SetProperties. The
+   allocator offered by the input pin is tried first. If any step with it
+   fails, InitAllocator supplies one of our own and the same steps are
+   repeated; if that fails too, the last error is returned and *ppAlloc
+   is left NULL. On success the input pin has been told, through
+   NotifyAllocator, which allocator was selected. NOTE this is called
+   during Connect() which therefore looks after grabbing and locking the
+   object's critical section */
+
+// The input pin is asked for its requested properties, and these are
+// handed to DecideBufferSize so that it can honour the requests it is
+// happy with (eg most people don't care about alignment and are thus
+// happy to use the downstream pin's alignment request).
+
+HRESULT
+CBaseOutputPin::DecideAllocator(IMemInputPin *pPin, IMemAllocator **ppAlloc)
+{
+    *ppAlloc = NULL;
+
+    // Get the downstream property request. The derived class may modify
+    // it in DecideBufferSize, but we assume it will consistently modify
+    // it the same way, so it is only fetched once.
+    ALLOCATOR_PROPERTIES prop;
+    ZeroMemory(&prop, sizeof(prop));
+
+    // The result is deliberately ignored: whatever is returned, prop is
+    // assumed to be either still all zeros or filled in.
+    pPin->GetAllocatorRequirements(&prop);
+
+    // no alignment preference is treated as an alignment of 1
+    if (prop.cbAlign == 0) {
+        prop.cbAlign = 1;
+    }
+
+    /* First choice: the allocator provided by the input pin */
+
+    HRESULT hr = pPin->GetAllocator(ppAlloc);
+    if (SUCCEEDED(hr)) {
+        hr = DecideBufferSize(*ppAlloc, &prop);
+    }
+    if (SUCCEEDED(hr)) {
+        hr = pPin->NotifyAllocator(*ppAlloc, FALSE);
+    }
+    if (SUCCEEDED(hr)) {
+        return NOERROR;
+    }
+
+    /* If GetAllocator failed we may not have an interface to release */
+
+    if (*ppAlloc != NULL) {
+        (*ppAlloc)->Release();
+        *ppAlloc = NULL;
+    }
+
+    /* Second choice: the output pin's own allocator, by the same method */
+
+    hr = InitAllocator(ppAlloc);
+    if (SUCCEEDED(hr)) {
+        // note - prop is the same structure as above and may have been
+        // modified by the earlier call to DecideBufferSize
+        hr = DecideBufferSize(*ppAlloc, &prop);
+    }
+    if (SUCCEEDED(hr)) {
+        hr = pPin->NotifyAllocator(*ppAlloc, FALSE);
+    }
+    if (SUCCEEDED(hr)) {
+        return NOERROR;
+    }
+
+    /* Likewise we may not have an interface to release */
+
+    if (*ppAlloc != NULL) {
+        (*ppAlloc)->Release();
+        *ppAlloc = NULL;
+    }
+    return hr;
+}
+
+
+/* This returns an empty sample buffer from the allocator WARNING the same
+   dangers and restrictions apply here as described below for Deliver() */
+
+HRESULT
+CBaseOutputPin::GetDeliveryBuffer(IMediaSample ** ppSample,
+                                  REFERENCE_TIME * pStartTime,
+                                  REFERENCE_TIME * pEndTime,
+                                  DWORD dwFlags)
+{
+    // without an allocator there is nothing to hand a buffer out from
+    if (m_pAllocator == NULL) {
+        return E_NOINTERFACE;
+    }
+
+    // forward the request unchanged to the allocator
+    return m_pAllocator->GetBuffer(ppSample,pStartTime,pEndTime,dwFlags);
+}
+
+
+/* Deliver a filled-in sample to the connected input pin. NOTE the object must
+   have locked itself before calling us otherwise we may get halfway through
+   executing this method only to find the filter graph has got in and
+   disconnected us from the input pin. If the filter has no worker threads
+   then the lock is best applied on Receive(), otherwise it should be done
+   when the worker thread is ready to deliver. There is a wee snag to worker
+   threads that this shows up. The worker thread must lock the object when
+   it is ready to deliver a sample, but it may have to wait until a state
+   change has completed, but that may never complete because the state change
+   is waiting for the worker thread to complete. The way to handle this is for
+   the state change code to grab the critical section, then set an abort event
+   for the worker thread, then release the critical section and wait for the
+   worker thread to see the event we set and then signal that it has finished
+   (with another event). At which point the state change code can complete */
+
+// note (if you've still got any breath left after reading that) that you
+// need to release the sample yourself after this call. if the connected
+// input pin needs to hold onto the sample beyond the call, it will addref
+// the sample itself.
+
+// of course you must release this one and call GetDeliveryBuffer for the
+// next. You cannot reuse it directly.
+
+HRESULT
+CBaseOutputPin::Deliver(IMediaSample * pSample)
+{
+    // hand the sample to the connected input pin's transport interface,
+    // if there is one
+    if (m_pInputPin != NULL) {
+        return m_pInputPin->Receive(pSample);
+    }
+    return VFW_E_NOT_CONNECTED;
+}
+
+
+// called from elsewhere in our filter to pass EOS downstream to
+// our connected input pin
+HRESULT
+CBaseOutputPin::DeliverEndOfStream(void)
+{
+    // EndOfStream lives on IPin (m_Connected), not on IMemInputPin
+    return (m_Connected == NULL) ? VFW_E_NOT_CONNECTED
+                                 : m_Connected->EndOfStream();
+}
+
+
+/* Commit the allocator's memory, this is called through IMediaFilter
+   which is responsible for locking the object before calling us */
+
+HRESULT
+CBaseOutputPin::Active(void)
+{
+    // commit the allocator's memory if we have an allocator at all
+    if (m_pAllocator != NULL) {
+        return m_pAllocator->Commit();
+    }
+    return VFW_E_NO_ALLOCATOR;
+}
+
+
+/* Free up or unprepare allocator's memory, this is called through
+   IMediaFilter which is responsible for locking the object first */
+
+HRESULT
+CBaseOutputPin::Inactive(void)
+{
+    // the run time error flag is cleared whether or not we have an allocator
+    m_bRunTimeError = FALSE;
+
+    if (m_pAllocator != NULL) {
+        return m_pAllocator->Decommit();
+    }
+    return VFW_E_NO_ALLOCATOR;
+}
+
+// we have a default handling of EndOfStream which is to return
+// an error, since this should be called on input pins only
+STDMETHODIMP CBaseOutputPin::EndOfStream(void)
+{
+    // an output pin never expects to be told about end of stream,
+    // so the default is simply to reject the call
+    return E_UNEXPECTED;
+}
+
+
+// BeginFlush should be called on input pins only
+STDMETHODIMP CBaseOutputPin::BeginFlush(void)
+{
+    // flushing is initiated on input pins; reject it here
+    return E_UNEXPECTED;
+}
+
+// EndFlush should be called on input pins only
+STDMETHODIMP CBaseOutputPin::EndFlush(void)
+{
+    // flushing is completed on input pins; reject it here
+    return E_UNEXPECTED;
+}
+
+// call BeginFlush on the connected input pin
+HRESULT
+CBaseOutputPin::DeliverBeginFlush(void)
+{
+    // BeginFlush lives on IPin (m_Connected), not on IMemInputPin
+    return (m_Connected == NULL) ? VFW_E_NOT_CONNECTED
+                                 : m_Connected->BeginFlush();
+}
+
+// call EndFlush on the connected input pin
+HRESULT
+CBaseOutputPin::DeliverEndFlush(void)
+{
+    // EndFlush lives on IPin (m_Connected), not on IMemInputPin
+    return (m_Connected == NULL) ? VFW_E_NOT_CONNECTED
+                                 : m_Connected->EndFlush();
+}
+// deliver NewSegment to connected pin
+HRESULT
+CBaseOutputPin::DeliverNewSegment(
+    REFERENCE_TIME tStart,
+    REFERENCE_TIME tStop,
+    double dRate)
+{
+    // pass the segment description straight through to the connected pin
+    if (m_Connected != NULL) {
+        return m_Connected->NewSegment(tStart, tStop, dRate);
+    }
+    return VFW_E_NOT_CONNECTED;
+}
+
+
+//=====================================================================
+//=====================================================================
+// Implements CBaseInputPin
+//=====================================================================
+//=====================================================================
+
+
+/* Constructor creates a default allocator object */
+
+CBaseInputPin::CBaseInputPin(
+    TCHAR *pObjectName,
+    CBaseFilter *pFilter,
+    CCritSec *pLock,
+    HRESULT *phr,
+    LPCWSTR pPinName)
+    : CBasePin(pObjectName, pFilter, pLock, phr, pPinName, PINDIR_INPUT)
+    , m_pAllocator(NULL)    // set later by GetAllocator or NotifyAllocator
+    , m_bReadOnly(FALSE)
+    , m_bFlushing(FALSE)
+{
+    // begin with an all-zero sample property block
+    ZeroMemory(&m_SampleProps, sizeof(m_SampleProps));
+}
+
+#ifdef UNICODE
+// Same construction as the TCHAR overload, but taking a narrow object name
+CBaseInputPin::CBaseInputPin(
+    CHAR *pObjectName,
+    CBaseFilter *pFilter,
+    CCritSec *pLock,
+    HRESULT *phr,
+    LPCWSTR pPinName)
+    : CBasePin(pObjectName, pFilter, pLock, phr, pPinName, PINDIR_INPUT)
+    , m_pAllocator(NULL)    // set later by GetAllocator or NotifyAllocator
+    , m_bReadOnly(FALSE)
+    , m_bFlushing(FALSE)
+{
+    // begin with an all-zero sample property block
+    ZeroMemory(&m_SampleProps, sizeof(m_SampleProps));
+}
+#endif
+
+/* Destructor releases its reference count on the default allocator */
+
+CBaseInputPin::~CBaseInputPin()
+{
+    // nothing to do if no allocator was ever created or supplied
+    if (m_pAllocator == NULL) {
+        return;
+    }
+
+    m_pAllocator->Release();
+    m_pAllocator = NULL;
+}
+
+
+// override this to publicise our interfaces
+STDMETHODIMP
+CBaseInputPin::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    /* Anything other than IMemInputPin is the base class's business */
+
+    if (!(riid == IID_IMemInputPin)) {
+        return CBasePin::NonDelegatingQueryInterface(riid, ppv);
+    }
+
+    /* IMemInputPin is the one interface this class adds */
+
+    return GetInterface((IMemInputPin *) this, ppv);
+}
+
+
+/* Return the allocator interface that this input pin would like the output
+   pin to use. NOTE subsequent calls to GetAllocator should all return an
+   interface onto the SAME object so we create one object at the start
+
+   Note:
+       The allocator is Release()'d on disconnect and replaced on
+       NotifyAllocator().
+
+   Override this to provide your own allocator.
+*/
+
+STDMETHODIMP
+CBaseInputPin::GetAllocator(
+    IMemAllocator **ppAllocator)
+{
+    CheckPointer(ppAllocator,E_POINTER);
+    ValidateReadWritePtr(ppAllocator,sizeof(IMemAllocator *));
+    CAutoLock cObjectLock(m_pLock);
+
+    // create the allocator on first use; later calls reuse the same object
+    if (m_pAllocator == NULL) {
+        const HRESULT hrCreate = CreateMemoryAllocator(&m_pAllocator);
+        if (FAILED(hrCreate)) {
+            return hrCreate;
+        }
+    }
+    ASSERT(m_pAllocator != NULL);
+
+    // the caller receives its own reference
+    m_pAllocator->AddRef();
+    *ppAllocator = m_pAllocator;
+    return NOERROR;
+}
+
+
+/* Tell the input pin which allocator the output pin is actually going to use
+   Override this if you care - NOTE the locking we do both here and also in
+   GetAllocator is unnecessary but derived classes that do something useful
+   will undoubtedly have to lock the object so this might help remind people */
+
+STDMETHODIMP
+CBaseInputPin::NotifyAllocator(
+    IMemAllocator * pAllocator,
+    BOOL bReadOnly)
+{
+    CheckPointer(pAllocator,E_POINTER);
+    ValidateReadPtr(pAllocator,sizeof(IMemAllocator));
+    CAutoLock cObjectLock(m_pLock);
+
+    // Take our reference on the incoming allocator before dropping the one
+    // we held, so nothing is released early if both are the same object.
+    pAllocator->AddRef();
+    IMemAllocator *pPrevious = m_pAllocator;
+    m_pAllocator = pAllocator;
+    if (pPrevious != NULL) {
+        pPrevious->Release();
+    }
+
+    // remember whether samples from this allocator must be treated as
+    // readonly - if so, inplace transforms will not be allowed
+    m_bReadOnly = (BYTE)bReadOnly;
+    return NOERROR;
+}
+
+
+HRESULT
+CBaseInputPin::BreakConnect()
+{
+    /* No allocator means there is nothing to tear down */
+    if (!m_pAllocator) {
+        return S_OK;
+    }
+
+    // Decommit unconditionally: a downstream filter may or may not have
+    // decommitted the connection's allocator, and leaving it committed
+    // when the pin is disconnected could leak memory.
+    const HRESULT hrDecommit = m_pAllocator->Decommit();
+    if (FAILED(hrDecommit)) {
+        // the allocator is kept if it could not be decommitted
+        return hrDecommit;
+    }
+
+    m_pAllocator->Release();
+    m_pAllocator = NULL;
+    return S_OK;
+}
+
+
+/* Do something with this media sample - this base class checks to see if the
+   format has changed with this media sample and if so checks that the filter
+   will accept it, generating a run time error if not. Once we have raised a
+   run time error we set a flag so that no more samples will be accepted
+
+   It is important that any filter should override this method and implement
+   synchronization so that samples are not processed when the pin is
+   disconnected etc
+*/
+
+// Base-class handling of an incoming sample: refuse it if CheckStreaming()
+// says we cannot stream, cache the sample's properties in m_SampleProps, and
+// if the sample carries a type change ask CheckMediaType whether it is
+// acceptable.
+//
+// Returns:
+//   E_POINTER                 - pSample is NULL
+//   whatever CheckStreaming() returned, when that is not S_OK
+//   a failure from IMediaSample2::GetProperties
+//   NOERROR                   - no type change, or the new type was accepted
+//   VFW_E_INVALIDMEDIATYPE    - the new type was rejected (a run time error
+//                               is also raised)
+STDMETHODIMP
+CBaseInputPin::Receive(IMediaSample *pSample)
+{
+    CheckPointer(pSample,E_POINTER);
+    ValidateReadPtr(pSample,sizeof(IMediaSample));
+    ASSERT(pSample);
+
+    // any result other than S_OK (including S_FALSE while flushing) is
+    // handed straight back to the caller
+    HRESULT hr = CheckStreaming();
+    if (S_OK != hr) {
+        return hr;
+    }
+
+
+
+    /* Check for IMediaSample2 */
+    IMediaSample2 *pSample2;
+    if (SUCCEEDED(pSample->QueryInterface(IID_IMediaSample2, (void **)&pSample2))) {
+        // fast path: fetch every property in one call
+        hr = pSample2->GetProperties(sizeof(m_SampleProps), (PBYTE)&m_SampleProps);
+        pSample2->Release();
+        if (FAILED(hr)) {
+            return hr;
+        }
+    } else {
+        /*  Get the properties the hard way */
+        m_SampleProps.cbData = sizeof(m_SampleProps);
+        m_SampleProps.dwTypeSpecificFlags = 0;
+        m_SampleProps.dwStreamId = AM_STREAM_MEDIA;
+        m_SampleProps.dwSampleFlags = 0;
+        if (S_OK == pSample->IsDiscontinuity()) {
+            m_SampleProps.dwSampleFlags |= AM_SAMPLE_DATADISCONTINUITY;
+        }
+        if (S_OK == pSample->IsPreroll()) {
+            m_SampleProps.dwSampleFlags |= AM_SAMPLE_PREROLL;
+        }
+        // a sync point is recorded using the AM_SAMPLE_SPLICEPOINT bit
+        if (S_OK == pSample->IsSyncPoint()) {
+            m_SampleProps.dwSampleFlags |= AM_SAMPLE_SPLICEPOINT;
+        }
+        // any success code from GetTime marks both start and stop as valid
+        if (SUCCEEDED(pSample->GetTime(&m_SampleProps.tStart,
+                                       &m_SampleProps.tStop))) {
+            m_SampleProps.dwSampleFlags |= AM_SAMPLE_TIMEVALID |
+                                           AM_SAMPLE_STOPVALID;
+        }
+        // NOTE(review): CMediaSample::GetMediaType returns a newly created
+        // copy, whereas CMediaSample::GetProperties (the path above) hands
+        // out the sample's own pointer. Nothing in this function frees
+        // pMediaType - confirm who owns it on this path.
+        if (S_OK == pSample->GetMediaType(&m_SampleProps.pMediaType)) {
+            m_SampleProps.dwSampleFlags |= AM_SAMPLE_TYPECHANGED;
+        }
+        // the return value of GetPointer is not checked here
+        pSample->GetPointer(&m_SampleProps.pbBuffer);
+        m_SampleProps.lActual = pSample->GetActualDataLength();
+        m_SampleProps.cbBuffer = pSample->GetSize();
+    }
+
+    /* Has the format changed in this sample */
+
+    if (!(m_SampleProps.dwSampleFlags & AM_SAMPLE_TYPECHANGED)) {
+        return NOERROR;
+    }
+
+    /* Check the derived class accepts this format */
+    /* This shouldn't fail as the source must call QueryAccept first */
+
+    hr = CheckMediaType((CMediaType *)m_SampleProps.pMediaType);
+
+    // only an exact NOERROR counts as acceptance
+    if (hr == NOERROR) {
+        return NOERROR;
+    }
+
+    /* Raise a runtime error if we fail the media type */
+
+    // once the flag is set CheckStreaming() fails further samples with
+    // VFW_E_RUNTIME_ERROR until the flag is cleared again
+    m_bRunTimeError = TRUE;
+    EndOfStream();
+    m_pFilter->NotifyEvent(EC_ERRORABORT,VFW_E_TYPE_NOT_ACCEPTED,0);
+    return VFW_E_INVALIDMEDIATYPE;
+}
+
+
+/*  Receive multiple samples */
+// Deliver an array of samples by calling Receive() on each in turn.
+//
+//   pSamples           - array of nSamples sample pointers
+//   nSamples           - number of entries in pSamples
+//   nSamplesProcessed  - receives the count of samples for which Receive()
+//                        returned S_OK
+//
+// Stops at the first sample whose Receive() result is not S_OK and returns
+// that result; returns S_OK if every sample was accepted (or nSamples <= 0).
+// Returns E_POINTER if either pointer argument is NULL.
+STDMETHODIMP
+CBaseInputPin::ReceiveMultiple (
+    IMediaSample **pSamples,
+    long nSamples,
+    long *nSamplesProcessed)
+{
+    CheckPointer(pSamples,E_POINTER);
+    // the out-parameter is written unconditionally below, so it must be
+    // validated as well rather than dereferenced blindly
+    CheckPointer(nSamplesProcessed,E_POINTER);
+    ValidateReadPtr(pSamples,nSamples * sizeof(IMediaSample *));
+
+    HRESULT hr = S_OK;
+    *nSamplesProcessed = 0;
+    while (nSamples-- > 0) {
+         // the processed count doubles as the index of the next sample
+         hr = Receive(pSamples[*nSamplesProcessed]);
+
+         /*  S_FALSE means don't send any more */
+         if (hr != S_OK) {
+             break;
+         }
+         (*nSamplesProcessed)++;
+    }
+    return hr;
+}
+
+/*  See if Receive() might block */
+STDMETHODIMP
+CBaseInputPin::ReceiveCanBlock()
+{
+    /*  Poll the pin connected to each of our filter's output pins.
+        S_OK    - Receive() might block
+        S_FALSE - every connected downstream pin said it will not block
+        With no connected output pins at all we assume we do block.
+    */
+    const int nPins = m_pFilter->GetPinCount();
+    int nConnectedOutputs = 0;
+
+    for (int iPin = 0; iPin < nPins; iPin++) {
+        CBasePin *pPin = m_pFilter->GetPin(iPin);
+
+        PIN_DIRECTION dir;
+        HRESULT hr = pPin->QueryDirection(&dir);
+        if (FAILED(hr)) {
+            return hr;
+        }
+
+        /*  Only output pins are of interest */
+        if (dir != PINDIR_OUTPUT) {
+            continue;
+        }
+
+        /*  Unconnected output pins are ignored */
+        IPin *pPeer;
+        hr = pPin->ConnectedTo(&pPeer);
+        if (FAILED(hr)) {
+            continue;
+        }
+        ASSERT(pPeer != NULL);
+        nConnectedOutputs++;
+
+        IMemInputPin *pPeerInput;
+        hr = pPeer->QueryInterface(
+                                 IID_IMemInputPin,
+                                 (void **)&pPeerInput);
+        pPeer->Release();
+        if (FAILED(hr)) {
+            /*  There's a transport we don't understand here */
+            return S_OK;
+        }
+
+        hr = pPeerInput->ReceiveCanBlock();
+        pPeerInput->Release();
+
+        /*  Anything but a definite "won't block" is treated as blocking */
+        if (hr != S_FALSE) {
+            return S_OK;
+        }
+    }
+
+    if (nConnectedOutputs == 0) {
+        return S_OK;
+    }
+    return S_FALSE;
+}
+
+// Default handling for BeginFlush - call at the beginning
+// of your implementation (makes sure that all Receive calls
+// fail). After calling this, you need to free any queued data
+// and then call downstream.
+STDMETHODIMP CBaseInputPin::BeginFlush(void)
+{
+    //  Flushing is a control action, not a streaming one, so it is
+    //  serialised on the filter lock
+    CAutoLock lck(m_pLock);
+
+    // A nested BeginFlush is harmless but most likely a caller mistake,
+    // so flag it in debug builds
+    ASSERT(!m_bFlushing);
+
+    // From here on CheckStreaming() turns incoming Receive calls away
+    m_bFlushing = TRUE;
+
+    // Discarding queued data and calling downstream is left to the
+    // derived class
+    return S_OK;
+}
+
+// default handling for EndFlush - call at end of your implementation
+// - before calling this, ensure that there is no queued data and no thread
+// pushing any more without a further receive, then call downstream,
+// then call this method to clear the m_bFlushing flag and re-enable
+// receives
+STDMETHODIMP CBaseInputPin::EndFlush(void)
+{
+    //  EndFlush is a control action, not a streaming one, so it is
+    //  serialised on the filter lock
+    CAutoLock lck(m_pLock);
+
+    // Ending a flush that never began is almost certainly a mistake
+    ASSERT(m_bFlushing);
+
+    // The derived class must already have synchronised with its pushing
+    // thread, made sure no more data is going downstream, and called
+    // EndFlush on the downstream pins before getting here.
+
+    // Forget any earlier run time error
+    m_bRunTimeError = FALSE;
+
+    // and let Receive calls through again
+    m_bFlushing = FALSE;
+
+    return S_OK;
+}
+
+
+STDMETHODIMP CBaseInputPin::Notify(IBaseFilter * pSender, Quality q)
+{
+    // Quality messages are not expected on an input pin: validate the
+    // sender, break into the debugger on debug builds, and otherwise
+    // ignore the message.
+    UNREFERENCED_PARAMETER(q);
+    CheckPointer(pSender,E_POINTER);
+    ValidateReadPtr(pSender,sizeof(IBaseFilter));
+    DbgBreak("IQuality::Notify called on an input pin");
+    return NOERROR;
+} // Notify
+
+/* Free up or unprepare allocator's memory, this is called through
+   IMediaFilter which is responsible for locking the object first */
+
+HRESULT
+CBaseInputPin::Inactive(void)
+{
+    // the run time error flag is always cleared
+    m_bRunTimeError = FALSE;
+
+    if (m_pAllocator != NULL) {
+        // the flushing flag is only reset when there is an allocator
+        m_bFlushing = FALSE;
+        return m_pAllocator->Decommit();
+    }
+    return VFW_E_NO_ALLOCATOR;
+}
+
+// what requirements do we have of the allocator - override if you want
+// to support other people's allocators but need a specific alignment
+// or prefix.
+STDMETHODIMP CBaseInputPin::GetAllocatorRequirements(ALLOCATOR_PROPERTIES*pProps)
+{
+    // the base class states no requirements and leaves pProps untouched
+    UNREFERENCED_PARAMETER(pProps);
+    return E_NOTIMPL;
+}
+
+//  Check if it's OK to process data
+//
+HRESULT
+CBaseInputPin::CheckStreaming()
+{
+    //  Data should never arrive on a pin that is not connected
+    ASSERT(IsConnected());
+
+    //  The checks are made in priority order: stopped state first, then
+    //  flushing, then a previously raised run time error
+    HRESULT hrState = S_OK;
+    if (IsStopped()) {
+        hrState = VFW_E_WRONG_STATE;
+    } else if (m_bFlushing) {
+        hrState = S_FALSE;
+    } else if (m_bRunTimeError) {
+        hrState = VFW_E_RUNTIME_ERROR;
+    }
+    return hrState;
+}
+
+// Pass on the Quality notification q to
+// a. Our QualityControl sink (if we have one) or else
+// b. to our upstream filter
+// and if that doesn't work, throw it away with a bad return code
+// Forward a quality notification.
+//
+//   q - the quality message to pass on
+//
+// If a quality sink (m_pQSink) is set the message goes there and its result
+// is returned. Otherwise the connected pin is asked for IQualityControl and,
+// if it provides one, the message is sent upstream through it. Returns
+// VFW_E_NOT_FOUND when there is no sink and no usable upstream interface.
+HRESULT
+CBaseInputPin::PassNotify(Quality& q)
+{
+    DbgLog((LOG_TRACE,3,TEXT("Passing Quality notification through transform")));
+    if (m_pQSink!=NULL) {
+        return m_pQSink->Notify(m_pFilter, q);
+    }
+
+    // no sink set, so pass it upstream
+    HRESULT hr = VFW_E_NOT_FOUND;                   // default
+    if (m_Connected) {
+        // Start from NULL and honour the QueryInterface result so that a
+        // failed query can never leave us calling through a garbage pointer.
+        IQualityControl * pIQC = NULL;
+        HRESULT hrQI = m_Connected->QueryInterface(IID_IQualityControl,
+                                                   (void**)&pIQC);
+        if (SUCCEEDED(hrQI) && pIQC!=NULL) {
+            hr = pIQC->Notify(m_pFilter, q);
+            pIQC->Release();
+        }
+    }
+    return hr;
+
+} // PassNotify
+
+//=====================================================================
+//=====================================================================
+// Memory allocation class, implements CMediaSample
+//=====================================================================
+//=====================================================================
+
+
+/* NOTE The implementation of this class calls the CUnknown constructor with
+   a NULL outer unknown pointer. This has the effect of making us a self
+   contained class, ie any QueryInterface, AddRef or Release calls will be
+   routed to the class's NonDelegatingUnknown methods. You will typically
+   find that the classes that do this then override one or more of these
+   virtual functions to provide more specialised behaviour. A good example
+   of this is where a class wants to keep the QueryInterface internal but
+   still wants its lifetime controlled by the external object */
+
+/* The last two parameters have default values of NULL and zero */
+
+CMediaSample::CMediaSample(
+    TCHAR *pName,
+    CBaseAllocator *pAllocator,
+    HRESULT *phr,
+    LPBYTE pBuffer,
+    LONG length)
+    : m_pBuffer(pBuffer)                // buffer memory (may be set later)
+    , m_cbBuffer(length)                // its size in bytes
+    , m_lActual(length)                 // assume the buffer is full
+    , m_pMediaType(NULL)                // no pending format change
+    , m_dwFlags(0)                      // no sample flags
+    , m_cRef(0)                         // not yet handed out
+    , m_dwTypeSpecificFlags(0)          // no type specific flags
+    , m_dwStreamId(AM_STREAM_MEDIA)     // default stream id
+    , m_pAllocator(pAllocator)          // owning allocator
+{
+
+    /* The owning allocator is mandatory. We keep a plain pointer to it
+       and do NOT take a reference count on it */
+
+    ASSERT(pAllocator);
+}
+
+#ifdef UNICODE
+// Same construction as the TCHAR overload, but taking a narrow object name
+CMediaSample::CMediaSample(
+    CHAR *pName,
+    CBaseAllocator *pAllocator,
+    HRESULT *phr,
+    LPBYTE pBuffer,
+    LONG length)
+    : m_pBuffer(pBuffer)                // buffer memory (may be set later)
+    , m_cbBuffer(length)                // its size in bytes
+    , m_lActual(length)                 // assume the buffer is full
+    , m_pMediaType(NULL)                // no pending format change
+    , m_dwFlags(0)                      // no sample flags
+    , m_cRef(0)                         // not yet handed out
+    , m_dwTypeSpecificFlags(0)          // no type specific flags
+    , m_dwStreamId(AM_STREAM_MEDIA)     // default stream id
+    , m_pAllocator(pAllocator)          // owning allocator
+{
+
+    /* The owning allocator is mandatory. We keep a plain pointer to it
+       and do NOT take a reference count on it */
+
+    ASSERT(pAllocator);
+}
+#endif
+
+/* Destructor deletes the media type memory */
+
+CMediaSample::~CMediaSample()
+{
+    // free the attached media type copy, if one is still held
+    if (m_pMediaType != NULL) {
+        DeleteMediaType(m_pMediaType);
+    }
+}
+
+/* Override this to publicise our interfaces */
+
+STDMETHODIMP
+CMediaSample::QueryInterface(REFIID riid, void **ppv)
+{
+    // only IUnknown, IMediaSample and IMediaSample2 are exposed
+    if (!(riid == IID_IMediaSample ||
+          riid == IID_IMediaSample2 ||
+          riid == IID_IUnknown)) {
+        return E_NOINTERFACE;
+    }
+
+    // all three are served through the same interface pointer
+    return GetInterface((IMediaSample *) this, ppv);
+}
+
+STDMETHODIMP_(ULONG)
+CMediaSample::AddRef()
+{
+    // bump our private reference count and report the new value
+    const LONG lNewCount = InterlockedIncrement(&m_cRef);
+    return lNewCount;
+}
+
+
+// --  CMediaSample lifetimes --
+//
+// On final release of this sample buffer it is not deleted but
+// returned to the freelist of the owning memory allocator
+//
+// The allocator may be waiting for the last buffer to be placed on the free
+// list in order to decommit all the memory, so the ReleaseBuffer() call may
+// result in this sample being deleted. We also need to hold a refcount on
+// the allocator to stop that going away until we have finished with this.
+// However, we cannot release the allocator before the ReleaseBuffer, as the
+// release may cause us to be deleted. Similarly we can't do it afterwards.
+//
+// Thus we must leave it to the allocator to hold an addref on our behalf.
+// When he issues us in GetBuffer, he addref's himself. When ReleaseBuffer
+// is called, he releases himself, possibly causing us and him to be deleted.
+
+
+// Drop one reference. When the count reaches zero the sample is not deleted
+// here: its per-use state is reset and it is handed back to the owning
+// allocator through ReleaseBuffer(). Returns the new reference count.
+STDMETHODIMP_(ULONG)
+CMediaSample::Release()
+{
+    /* Decrement our own private reference count */
+    LONG lRef;
+    if (m_cRef == 1) {
+        // last reference: the count is written directly, without an
+        // interlocked operation
+        lRef = 0;
+        m_cRef = 0;
+    } else {
+        lRef = InterlockedDecrement(&m_cRef);
+    }
+    ASSERT(lRef >= 0);
+
+    DbgLog((LOG_MEMORY,3,TEXT("    Unknown %X ref-- = %d"),
+        this, m_cRef));
+
+    /* Did we release our final reference count */
+    if (lRef == 0) {
+        /* Free all resources */
+        // SetMediaType(NULL) deletes the attached type and clears the flag
+        if (m_dwFlags & Sample_TypeChanged) {
+            SetMediaType(NULL);
+        }
+        ASSERT(m_pMediaType == NULL);
+        // put the remaining per-use state back to its constructed values
+        m_dwFlags = 0;
+        m_dwTypeSpecificFlags = 0;
+        m_dwStreamId = AM_STREAM_MEDIA;
+
+        /* This may cause us to be deleted */
+        // Our refcount is reliably 0 thus no-one will mess with us
+        // no member may be touched after this call; only the local lRef
+        // is used below
+        m_pAllocator->ReleaseBuffer(this);
+    }
+    return (ULONG)lRef;
+}
+
+
+// set the buffer pointer and length. Used by allocators that
+// want variable sized pointers or pointers into already-read data.
+// This is only available through a CMediaSample* not an IMediaSample*
+// and so cannot be changed by clients.
+HRESULT
+CMediaSample::SetPointer(BYTE * ptr, LONG cBytes)
+{
+    // both the capacity and the amount of valid data become cBytes,
+    // i.e. the buffer is assumed to be full
+    m_cbBuffer = cBytes;
+    m_lActual = cBytes;
+
+    // the new buffer area itself (could be null)
+    m_pBuffer = ptr;
+
+    return S_OK;
+}
+
+
+// get me a read/write pointer to this buffer's memory. I will actually
+// want to use sizeUsed bytes.
+STDMETHODIMP
+CMediaSample::GetPointer(BYTE ** ppBuffer)
+{
+    ValidateReadWritePtr(ppBuffer,sizeof(BYTE *));
+
+    // the buffer must already have been supplied, either to the
+    // constructor or through SetPointer
+    ASSERT(m_pBuffer);
+
+    BYTE * const pData = m_pBuffer;
+    *ppBuffer = pData;
+    return NOERROR;
+}
+
+
+// return the size in bytes of this buffer
+STDMETHODIMP_(LONG) CMediaSample::GetSize(void)
+{
+    // total capacity of the buffer, not the amount of valid data
+    return m_cbBuffer;
+}
+
+
+// get the stream time at which this sample should start and finish.
+STDMETHODIMP
+CMediaSample::GetTime(
+    REFERENCE_TIME * pTimeStart,     // receives the start time
+    REFERENCE_TIME * pTimeEnd        // receives the stop time
+)
+{
+    ValidateReadWritePtr(pTimeStart,sizeof(REFERENCE_TIME));
+    ValidateReadWritePtr(pTimeEnd,sizeof(REFERENCE_TIME));
+
+    // a valid stop time means both values can be reported as they are
+    if (m_dwFlags & Sample_StopValid) {
+        *pTimeStart = m_Start;
+        *pTimeEnd = m_End;
+        return NOERROR;
+    }
+
+    // start time only: report a stop time of start + 1 so that old
+    // callers keep working, and say so through the return code
+    if (m_dwFlags & Sample_TimeValid) {
+        *pTimeStart = m_Start;
+        *pTimeEnd = m_Start + 1;
+        return VFW_S_NO_STOP_TIME;
+    }
+
+    // no time stamp has been set at all
+    return VFW_E_SAMPLE_TIME_NOT_SET;
+}
+
+
+// Set the stream time at which this sample should start and finish.
+// NULL pointers means the time is reset
+STDMETHODIMP
+CMediaSample::SetTime(
+    REFERENCE_TIME * pTimeStart,
+    REFERENCE_TIME * pTimeEnd
+)
+{
+    // no start time: the time stamp is reset altogether
+    if (pTimeStart == NULL) {
+        ASSERT(pTimeEnd == NULL);
+        m_dwFlags &= ~(Sample_TimeValid | Sample_StopValid);
+        return NOERROR;
+    }
+
+    // start time only: remember it and mark the stop time as not set
+    if (pTimeEnd == NULL) {
+        m_Start = *pTimeStart;
+        m_dwFlags |= Sample_TimeValid;
+        m_dwFlags &= ~Sample_StopValid;
+        return NOERROR;
+    }
+
+    // both supplied: store the pair and mark both as valid
+    ValidateReadPtr(pTimeStart,sizeof(REFERENCE_TIME));
+    ValidateReadPtr(pTimeEnd,sizeof(REFERENCE_TIME));
+    ASSERT(*pTimeEnd >= *pTimeStart);
+
+    m_Start = *pTimeStart;
+    m_End = *pTimeEnd;
+    m_dwFlags |= Sample_TimeValid | Sample_StopValid;
+    return NOERROR;
+}
+
+
+// get the media times (eg bytes) for this sample
+STDMETHODIMP
+CMediaSample::GetMediaTime(
+    LONGLONG * pTimeStart,
+    LONGLONG * pTimeEnd
+)
+{
+    ValidateReadWritePtr(pTimeStart,sizeof(LONGLONG));
+    ValidateReadWritePtr(pTimeEnd,sizeof(LONGLONG));
+
+    if (m_dwFlags & Sample_MediaTimeValid) {
+        // m_MediaEnd holds the length relative to m_MediaStart (see
+        // SetMediaTime), so the end is rebuilt by adding the two
+        *pTimeStart = m_MediaStart;
+        *pTimeEnd = (m_MediaStart + m_MediaEnd);
+        return NOERROR;
+    }
+
+    return VFW_E_MEDIA_TIME_NOT_SET;
+}
+
+
+// Set the media times for this sample
+// Set or clear the media times for this sample.
+//
+//   pTimeStart - start media time, or NULL to clear the media times
+//   pTimeEnd   - end media time; must be non-NULL when pTimeStart is
+//
+// Returns NOERROR on success, or E_POINTER when a start time is given
+// without an end time.
+STDMETHODIMP
+CMediaSample::SetMediaTime(
+    LONGLONG * pTimeStart,
+    LONGLONG * pTimeEnd
+)
+{
+    if (pTimeStart == NULL) {
+        ASSERT(pTimeEnd == NULL);
+        m_dwFlags &= ~Sample_MediaTimeValid;
+    } else {
+        // a start time without an end time cannot be stored; reject it
+        // instead of dereferencing a NULL pointer below
+        if (pTimeEnd == NULL) {
+            return E_POINTER;
+        }
+        ValidateReadPtr(pTimeStart,sizeof(LONGLONG));
+        ValidateReadPtr(pTimeEnd,sizeof(LONGLONG));
+        ASSERT(*pTimeEnd >= *pTimeStart);
+
+        // the end is kept as a LONG length relative to the start
+        m_MediaStart = *pTimeStart;
+        m_MediaEnd = (LONG)(*pTimeEnd - *pTimeStart);
+        m_dwFlags |= Sample_MediaTimeValid;
+    }
+    return NOERROR;
+}
+
+
+STDMETHODIMP
+CMediaSample::IsSyncPoint(void)
+{
+    // S_OK when the sync point flag is set, S_FALSE otherwise
+    return (m_dwFlags & Sample_SyncPoint) ? S_OK : S_FALSE;
+}
+
+
+STDMETHODIMP
+CMediaSample::SetSyncPoint(BOOL bIsSyncPoint)
+{
+    // any non-zero value sets the flag, zero clears it
+    if (!bIsSyncPoint) {
+        m_dwFlags &= ~Sample_SyncPoint;
+        return NOERROR;
+    }
+    m_dwFlags |= Sample_SyncPoint;
+    return NOERROR;
+}
+
+// returns S_OK if there is a discontinuity in the data (this sample is
+// not a continuation of the previous stream of data
+// - there has been a seek).
+STDMETHODIMP
+CMediaSample::IsDiscontinuity(void)
+{
+    // S_OK when the discontinuity flag is set, S_FALSE otherwise
+    return (m_dwFlags & Sample_Discontinuity) ? S_OK : S_FALSE;
+}
+
+// set the discontinuity property - TRUE if this sample is not a
+// continuation, but a new sample after a seek.
+STDMETHODIMP
+CMediaSample::SetDiscontinuity(BOOL bDiscont)
+{
+    // any non-zero value sets the flag, zero clears it
+    if (!bDiscont) {
+        m_dwFlags &= ~Sample_Discontinuity;
+        return S_OK;
+    }
+    m_dwFlags |= Sample_Discontinuity;
+    return S_OK;
+}
+
+STDMETHODIMP
+CMediaSample::IsPreroll(void)
+{
+    // S_OK when the preroll flag is set, S_FALSE otherwise
+    return (m_dwFlags & Sample_Preroll) ? S_OK : S_FALSE;
+}
+
+
+STDMETHODIMP
+CMediaSample::SetPreroll(BOOL bIsPreroll)
+{
+    // any non-zero value sets the flag, zero clears it
+    if (!bIsPreroll) {
+        m_dwFlags &= ~Sample_Preroll;
+        return NOERROR;
+    }
+    m_dwFlags |= Sample_Preroll;
+    return NOERROR;
+}
+
+STDMETHODIMP_(LONG) CMediaSample::GetActualDataLength(void)
+{
+    // number of valid data bytes, as opposed to the buffer capacity
+    return m_lActual;
+}
+
+
+// Record how many bytes of the buffer hold valid data.
+//
+//   lActual - number of valid bytes; must lie in the range 0..m_cbBuffer
+//
+// Returns NOERROR on success, or VFW_E_BUFFER_OVERFLOW when the length does
+// not fit the buffer, in which case the stored length is left unchanged.
+STDMETHODIMP
+CMediaSample::SetActualDataLength(LONG lActual)
+{
+    // a negative length is as meaningless as one larger than the buffer;
+    // storing it would hand a bogus size to every later reader
+    if (lActual > m_cbBuffer || lActual < 0) {
+        ASSERT(lActual <= GetSize());
+        return VFW_E_BUFFER_OVERFLOW;
+    }
+    m_lActual = lActual;
+    return NOERROR;
+}
+
+
+/* These allow for limited format changes in band */
+
+STDMETHODIMP
+CMediaSample::GetMediaType(AM_MEDIA_TYPE **ppMediaType)
+{
+    ValidateReadWritePtr(ppMediaType,sizeof(AM_MEDIA_TYPE *));
+    ASSERT(ppMediaType);
+
+    /* A type change is pending: give the caller a copy of it */
+
+    if (m_dwFlags & Sample_TypeChanged) {
+        ASSERT(m_pMediaType);
+
+        *ppMediaType = CreateMediaType(m_pMediaType);
+        return (*ppMediaType == NULL) ? E_OUTOFMEMORY : NOERROR;
+    }
+
+    /* No type change on this sample */
+
+    ASSERT(m_pMediaType == NULL);
+    *ppMediaType = NULL;
+    return S_FALSE;
+}
+
+
+/* Mark this sample as having a different format type */
+
+STDMETHODIMP
+CMediaSample::SetMediaType(AM_MEDIA_TYPE *pMediaType)
+{
+    /* Whatever type we currently hold is discarded first */
+
+    if (m_pMediaType != NULL) {
+        DeleteMediaType(m_pMediaType);
+        m_pMediaType = NULL;
+    }
+
+    /* A NULL argument just resets the format type */
+
+    if (pMediaType == NULL) {
+        m_dwFlags &= ~Sample_TypeChanged;
+        return NOERROR;
+    }
+
+    ASSERT(pMediaType);
+    ValidateReadPtr(pMediaType,sizeof(AM_MEDIA_TYPE));
+
+    /* Keep our own copy; the flag is only set if the copy worked */
+
+    m_pMediaType = CreateMediaType(pMediaType);
+    if (m_pMediaType != NULL) {
+        m_dwFlags |= Sample_TypeChanged;
+        return NOERROR;
+    }
+
+    m_dwFlags &= ~Sample_TypeChanged;
+    return E_OUTOFMEMORY;
+}
+
+// Set and get properties (IMediaSample2)
+STDMETHODIMP CMediaSample::GetProperties(
+    DWORD cbProperties,
+    BYTE * pbProperties
+)
+{
+    //  A zero-length request copies nothing and always succeeds
+    if (0 == cbProperties) {
+        return S_OK;
+    }
+    CheckPointer(pbProperties, E_POINTER);
+
+    //  Build the full structure locally, then copy out only as many
+    //  bytes as the caller asked for (never more than the structure)
+    AM_SAMPLE2_PROPERTIES Snapshot;
+    Snapshot.cbData     = (DWORD) (min(cbProperties, sizeof(Snapshot)));
+    Snapshot.dwTypeSpecificFlags = m_dwTypeSpecificFlags;
+    //  the media time flag is masked out of the reported flags
+    Snapshot.dwSampleFlags = m_dwFlags & ~Sample_MediaTimeValid;
+    Snapshot.lActual    = m_lActual;
+    Snapshot.tStart     = m_Start;
+    Snapshot.tStop      = m_End;
+    Snapshot.dwStreamId = m_dwStreamId;
+    //  the media type pointer is only exposed while a change is pending
+    Snapshot.pMediaType =
+        (m_dwFlags & AM_SAMPLE_TYPECHANGED) ? m_pMediaType : NULL;
+    Snapshot.pbBuffer   = m_pBuffer;
+    Snapshot.cbBuffer   = m_cbBuffer;
+
+    CopyMemory(pbProperties, &Snapshot, Snapshot.cbData);
+    return S_OK;
+}
+
+// True when the first 'offset' bytes of a 'type' structure are enough to
+// contain the whole of member 'field', i.e. the member ends at or before
+// 'offset'. The offset argument is parenthesised so that an expression
+// can be passed safely.
+#define CONTAINS_FIELD(type, field, offset) \
+    ((FIELD_OFFSET(type, field) + sizeof(((type *)0)->field)) <= (offset))
+
+//  Apply an AM_SAMPLE2_PROPERTIES block (or a leading prefix of one) to this
+//  sample.
+//
+//  cbProperties - number of bytes supplied at pbProperties
+//  pbProperties - caller's property block
+//
+//  Only fields that lie completely within the supplied length are examined.
+//  All validation is done before any member is modified, so a failure
+//  return leaves the sample unchanged.  If the length does not even cover
+//  cbData nothing is done and S_OK is returned.
+//
+//  Returns S_OK, E_POINTER, E_INVALIDARG or E_OUTOFMEMORY.
+HRESULT CMediaSample::SetProperties(
+    DWORD cbProperties,
+    const BYTE * pbProperties
+)
+{
+
+    /*  Generic properties */
+    //  Copy of the caller's media type, created during validation and
+    //  installed in m_pMediaType during the assignment phase
+    AM_MEDIA_TYPE *pMediaType = NULL;
+
+    if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, cbData, cbProperties)) {
+        CheckPointer(pbProperties, E_POINTER);
+        AM_SAMPLE2_PROPERTIES *pProps =
+            (AM_SAMPLE2_PROPERTIES *)pbProperties;
+
+        /*  Don't use more data than is actually there */
+        if (pProps->cbData < cbProperties) {
+            cbProperties = pProps->cbData;
+        }
+        /*  We only handle IMediaSample2 */
+        if (cbProperties > sizeof(*pProps) ||
+            pProps->cbData > sizeof(*pProps)) {
+            return E_INVALIDARG;
+        }
+        /*  Do the checks first, then the assignments (so a failure needs
+            no backout) */
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, dwSampleFlags, cbProperties)) {
+            /*  Check the flags - anything outside Sample_ValidFlags, or
+                Sample_MediaTimeValid itself, is rejected */
+            if (pProps->dwSampleFlags &
+                    (~Sample_ValidFlags | Sample_MediaTimeValid)) {
+                return E_INVALIDARG;
+            }
+            /*  Check a flag isn't being set for a property
+                not being provided: the time-valid flag may only be newly
+                set if the block is long enough to include tStop
+            */
+            if ((pProps->dwSampleFlags & AM_SAMPLE_TIMEVALID) &&
+                 !(m_dwFlags & AM_SAMPLE_TIMEVALID) &&
+                 !CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, tStop, cbProperties)) {
+                 return E_INVALIDARG;
+            }
+        }
+        /*  NB - can't SET the pointer or size */
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, pbBuffer, cbProperties)) {
+
+            /*  Check pbBuffer - must be 0 or the sample's existing buffer */
+            if (pProps->pbBuffer != 0 && pProps->pbBuffer != m_pBuffer) {
+                return E_INVALIDARG;
+            }
+        }
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, cbBuffer, cbProperties)) {
+
+            /*  Check cbBuffer - must be 0 or the sample's existing size */
+            if (pProps->cbBuffer != 0 && pProps->cbBuffer != m_cbBuffer) {
+                return E_INVALIDARG;
+            }
+        }
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, cbBuffer, cbProperties) &&
+            CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, lActual, cbProperties)) {
+
+            /*  Check lActual - compared against the cbBuffer value in the
+                caller's block, not against m_cbBuffer */
+            if (pProps->cbBuffer < pProps->lActual) {
+                return E_INVALIDARG;
+            }
+        }
+
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, pMediaType, cbProperties)) {
+
+            /*  Check pMediaType - when the caller flags a type change the
+                type is copied now, so that an allocation failure is
+                reported before anything has been modified */
+            if (pProps->dwSampleFlags & AM_SAMPLE_TYPECHANGED) {
+                CheckPointer(pProps->pMediaType, E_POINTER);
+                pMediaType = CreateMediaType(pProps->pMediaType);
+                if (pMediaType == NULL) {
+                    return E_OUTOFMEMORY;
+                }
+            }
+        }
+
+        /*  Now do the assignments - nothing below this point returns an
+            error */
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, dwStreamId, cbProperties)) {
+            m_dwStreamId = pProps->dwStreamId;
+        }
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, dwSampleFlags, cbProperties)) {
+            /*  Set the flags, keeping the sample's own Sample_MediaTimeValid
+                bit (the caller's value for it was rejected above) */
+            m_dwFlags = pProps->dwSampleFlags |
+                                (m_dwFlags & Sample_MediaTimeValid);
+            m_dwTypeSpecificFlags = pProps->dwTypeSpecificFlags;
+        } else {
+            if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, dwTypeSpecificFlags, cbProperties)) {
+                m_dwTypeSpecificFlags = pProps->dwTypeSpecificFlags;
+            }
+        }
+
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, lActual, cbProperties)) {
+            /*  Set lActual */
+            m_lActual = pProps->lActual;
+        }
+
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, tStop, cbProperties)) {
+
+            /*  Set the stop time */
+            m_End   = pProps->tStop;
+        }
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, tStart, cbProperties)) {
+
+            /*  Set the start time */
+            m_Start = pProps->tStart;
+        }
+
+        if (CONTAINS_FIELD(AM_SAMPLE2_PROPERTIES, pMediaType, cbProperties)) {
+            /*  Set pMediaType - same condition as the validation step that
+                created the copy, so pMediaType is non-NULL here */
+            if (pProps->dwSampleFlags & AM_SAMPLE_TYPECHANGED) {
+                if (m_pMediaType != NULL) {
+                    DeleteMediaType(m_pMediaType);
+                }
+                m_pMediaType = pMediaType;
+            }
+        }
+
+        /*  Fix up the type changed flag to correctly reflect the current state
+            If, for instance the input contained no type change but the
+            output does then if we don't do this we'd lose the
+            output media type.
+        */
+        if (m_pMediaType) {
+            m_dwFlags |= Sample_TypeChanged;
+        } else {
+            m_dwFlags &= ~Sample_TypeChanged;
+        }
+    }
+
+    return S_OK;
+}
+
+
+//
+// The streaming thread calls IPin::NewSegment(), IPin::EndOfStream(),
+// IMemInputPin::Receive() and IMemInputPin::ReceiveMultiple() on the
+// connected input pin.  The application thread calls Block().  The
+// following class members can only be called by the streaming thread.
+//
+//    Deliver()
+//    DeliverNewSegment()
+//    StartUsingOutputPin()
+//    StopUsingOutputPin()
+//    ChangeOutputFormat()
+//    ChangeMediaType()
+//    DynamicReconnect()
+//
+// The following class members can only be called by the application thread.
+//
+//    Block()
+//    SynchronousBlockOutputPin()
+//    AsynchronousBlockOutputPin()
+//
+
+CDynamicOutputPin::CDynamicOutputPin(
+    TCHAR *pObjectName,
+    CBaseFilter *pFilter,
+    CCritSec *pLock,
+    HRESULT *phr,
+    LPCWSTR pName) :
+        CBaseOutputPin(pObjectName, pFilter, pLock, phr, pName),
+        m_hStopEvent(NULL),
+        m_pGraphConfig(NULL),
+        m_bPinUsesReadOnlyAllocator(FALSE),
+        m_BlockState(NOT_BLOCKED),
+        m_hUnblockOutputPinEvent(NULL),
+        m_hNotifyCallerPinBlockedEvent(NULL),
+        m_dwBlockCallerThreadID(0),
+        m_dwNumOutstandingOutputPinUsers(0)
+{
+    // Run the shared initialization; *phr is only overwritten on failure.
+    const HRESULT hrInit = Initialize();
+    if( FAILED( hrInit ) ) {
+        *phr = hrInit;
+    }
+}
+
+#ifdef UNICODE
+// Overload taking a CHAR object name, compiled only in UNICODE builds.
+CDynamicOutputPin::CDynamicOutputPin(
+    CHAR *pObjectName,
+    CBaseFilter *pFilter,
+    CCritSec *pLock,
+    HRESULT *phr,
+    LPCWSTR pName) :
+        CBaseOutputPin(pObjectName, pFilter, pLock, phr, pName),
+        m_hStopEvent(NULL),
+        m_pGraphConfig(NULL),
+        m_bPinUsesReadOnlyAllocator(FALSE),
+        m_BlockState(NOT_BLOCKED),
+        m_hUnblockOutputPinEvent(NULL),
+        m_hNotifyCallerPinBlockedEvent(NULL),
+        m_dwBlockCallerThreadID(0),
+        m_dwNumOutstandingOutputPinUsers(0)
+{
+    // Run the shared initialization; *phr is only overwritten on failure.
+    const HRESULT hrInit = Initialize();
+    if( FAILED( hrInit ) ) {
+        *phr = hrInit;
+    }
+}
+#endif
+
+CDynamicOutputPin::~CDynamicOutputPin()
+{
+    // Close whichever event handles this object still holds.  Closing a
+    // valid handle that we have access to is not expected to fail, hence
+    // the EXECUTE_ASSERTs.
+
+    if(m_hUnblockOutputPinEvent != NULL) {
+        EXECUTE_ASSERT(::CloseHandle(m_hUnblockOutputPinEvent));
+    }
+
+    if(m_hNotifyCallerPinBlockedEvent != NULL) {
+        EXECUTE_ASSERT(::CloseHandle(m_hNotifyCallerPinBlockedEvent));
+    }
+}
+
+HRESULT CDynamicOutputPin::Initialize(void)
+{
+    // The unblock event is unnamed, uses the default security descriptor,
+    // is manual reset and starts out signaled.
+    const BOOL fManualReset = TRUE;
+    const BOOL fInitiallySignaled = TRUE;
+
+    m_hUnblockOutputPinEvent = ::CreateEvent( NULL, fManualReset, fInitiallySignaled, NULL );
+
+    // A NULL handle means CreateEvent() failed.
+    if(m_hUnblockOutputPinEvent == NULL) {
+        return AmGetLastErrorToHResult();
+    }
+
+    //  Allow this pin to be reconnected while streaming.
+    SetReconnectWhenActive(true);
+
+    return S_OK;
+}
+
+STDMETHODIMP CDynamicOutputPin::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    // IPinFlowControl is the only interface added at this level.
+    if(riid == IID_IPinFlowControl) {
+        return GetInterface(static_cast<IPinFlowControl*>(this), ppv);
+    }
+
+    // Everything else is answered by the base output pin.
+    return CBaseOutputPin::NonDelegatingQueryInterface(riid, ppv);
+}
+
+STDMETHODIMP CDynamicOutputPin::Disconnect(void)
+{
+    // m_pLock is held for the duration of the disconnect.
+    CAutoLock alPinLock(m_pLock);
+
+    const HRESULT hrDisconnect = DisconnectInternal();
+    return hrDisconnect;
+}
+
+STDMETHODIMP CDynamicOutputPin::Block(DWORD dwBlockFlags, HANDLE hEvent)
+{
+    // AM_PIN_FLOW_CONTROL_BLOCK is the only flag this method accepts.
+    const DWORD VALID_FLAGS = AM_PIN_FLOW_CONTROL_BLOCK;
+    if(0 != (dwBlockFlags & ~VALID_FLAGS)) {
+        return E_INVALIDARG;
+    }
+
+    // Having rejected every other bit, the flags are now either exactly
+    // AM_PIN_FLOW_CONTROL_BLOCK (block) or zero (unblock).
+    const bool fBlockRequested = (0 != (dwBlockFlags & AM_PIN_FLOW_CONTROL_BLOCK));
+
+    if(fBlockRequested) {
+        // For an asynchronous block the caller's event must start unsignaled.
+        if(NULL != hEvent) {
+            if( !::ResetEvent( hEvent ) ) {
+                return AmGetLastErrorToHResult();
+            }
+        }
+    } else if(NULL != hEvent) {
+        // Unblocking is always synchronous, so there is nothing to notify
+        // and an event handle is not allowed.
+        return E_INVALIDARG;
+    }
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+
+    HRESULT hr;
+
+    if(!fBlockRequested) {
+        hr = UnblockOutputPin();
+    } else if(NULL == hEvent) {
+        // No event supplied: block synchronously.
+        hr = SynchronousBlockOutputPin();
+    } else {
+        // Event supplied: block asynchronously and signal it when done.
+        hr = AsynchronousBlockOutputPin(hEvent);
+    }
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+
+    // Failures are passed through; every success code is reported as S_OK.
+    return FAILED(hr) ? hr : S_OK;
+}
+
+HRESULT CDynamicOutputPin::SynchronousBlockOutputPin(void)
+{
+    // A synchronous block is an asynchronous block followed by a wait on a
+    // private event: unnamed, default security, auto reset, initially
+    // unsignaled.
+    HANDLE hNotifyCallerPinBlockedEvent = ::CreateEvent( NULL, FALSE, FALSE, NULL );
+
+    // A NULL handle means CreateEvent() failed.
+    if(hNotifyCallerPinBlockedEvent == NULL) {
+        return AmGetLastErrorToHResult();
+    }
+
+    HRESULT hr = AsynchronousBlockOutputPin(hNotifyCallerPinBlockedEvent);
+
+    // Only wait if the block request was actually accepted.
+    if(SUCCEEDED(hr)) {
+        hr = WaitEvent(hNotifyCallerPinBlockedEvent);
+    }
+
+    // The private event is finished with on every path.  Closing it should
+    // not fail because it is a valid event that we have access to.
+    EXECUTE_ASSERT(::CloseHandle(hNotifyCallerPinBlockedEvent));
+
+    // Failures are passed through; every success code is reported as S_OK.
+    return FAILED(hr) ? hr : S_OK;
+}
+
+HRESULT CDynamicOutputPin::AsynchronousBlockOutputPin(HANDLE hNotifyCallerPinBlockedEvent)
+{
+    // m_BlockState, m_dwBlockCallerThreadID and
+    // m_hNotifyCallerPinBlockedEvent are all used below, so
+    // m_BlockStateLock is held for the whole function.
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    // A block is already pending or in effect: report whether it was
+    // requested by this thread or by another one.
+    if(NOT_BLOCKED != m_BlockState) {
+        const bool fSameThread = (::GetCurrentThreadId() == m_dwBlockCallerThreadID);
+        return fSameThread ? VFW_E_PIN_ALREADY_BLOCKED_ON_THIS_THREAD
+                           : VFW_E_PIN_ALREADY_BLOCKED;
+    }
+
+    // Keep our own duplicate of the caller's event, with only
+    // EVENT_MODIFY_STATE access and not inheritable.
+    const HANDLE hThisProcess = ::GetCurrentProcess();
+    if( !::DuplicateHandle( hThisProcess,
+                            hNotifyCallerPinBlockedEvent,
+                            hThisProcess,
+                            &m_hNotifyCallerPinBlockedEvent,
+                            EVENT_MODIFY_STATE,
+                            FALSE,
+                            0 ) ) {
+        return AmGetLastErrorToHResult();
+    }
+
+    m_dwBlockCallerThreadID = ::GetCurrentThreadId();
+    m_BlockState = PENDING;
+
+    // While the streaming thread is using the output pin (delivering data
+    // downstream or changing the connection) the block has to stay pending.
+    // Otherwise it can take effect right away.
+    if(!StreamingThreadUsingOutputPin()) {
+        BlockOutputPin();
+    }
+
+    return S_OK;
+}
+
+void CDynamicOutputPin::BlockOutputPin(void)
+{
+    // m_BlockState and m_hNotifyCallerPinBlockedEvent are used here, so the
+    // caller is required to already hold m_BlockStateLock.
+    ASSERT(CritCheckIn(&m_BlockStateLock));
+
+    // Blocking is only legal while the streaming thread is neither passing
+    // data downstream nor modifying the connection state.
+    ASSERT(!StreamingThreadUsingOutputPin());
+
+    // Unsignal the unblock event.  We created this event ourselves, so
+    // changing its state is not expected to fail.
+    EXECUTE_ASSERT(::ResetEvent(m_hUnblockOutputPinEvent));
+
+    // Tell the requester the pin is now blocked, then drop the handle that
+    // AsynchronousBlockOutputPin() duplicated.  Neither call is expected to
+    // fail.
+    EXECUTE_ASSERT(::SetEvent(m_hNotifyCallerPinBlockedEvent));
+    EXECUTE_ASSERT(::CloseHandle(m_hNotifyCallerPinBlockedEvent));
+
+    m_hNotifyCallerPinBlockedEvent = NULL;
+    m_BlockState = BLOCKED;
+}
+
+HRESULT CDynamicOutputPin::UnblockOutputPin(void)
+{
+    // m_BlockState, m_dwBlockCallerThreadID and
+    // m_hNotifyCallerPinBlockedEvent are all used below, so
+    // m_BlockStateLock is held for the whole function.
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    // Nothing to undo if no block is pending or in effect.
+    if(m_BlockState == NOT_BLOCKED) {
+        return S_FALSE;
+    }
+
+    // Signal the unblock event.  We created this event ourselves, so
+    // changing its state is not expected to fail.
+    EXECUTE_ASSERT(::SetEvent(m_hUnblockOutputPinEvent));
+
+    // A handle that is still set means the block never completed: signal
+    // the requester anyway and drop the handle that
+    // AsynchronousBlockOutputPin() duplicated.
+    if(m_hNotifyCallerPinBlockedEvent != NULL) {
+        EXECUTE_ASSERT(::SetEvent(m_hNotifyCallerPinBlockedEvent));
+        EXECUTE_ASSERT(::CloseHandle(m_hNotifyCallerPinBlockedEvent));
+    }
+
+    // Return to the idle state.
+    m_hNotifyCallerPinBlockedEvent = NULL;
+    m_dwBlockCallerThreadID = 0;
+    m_BlockState = NOT_BLOCKED;
+
+    return S_OK;
+}
+
+// Registers the calling thread as a user of the output pin.  While the pin
+// is in the BLOCKED state this call waits until either the unblock event or
+// the stop event is signaled.
+//
+// Returns S_OK once the user count has been incremented,
+// VFW_E_STATE_CHANGED if the stop event ended the wait, the converted
+// last-error code if the wait itself failed, or E_UNEXPECTED for any other
+// wait result.  The user count is not incremented on the failure returns.
+HRESULT CDynamicOutputPin::StartUsingOutputPin(void)
+{
+    // The caller should not hold m_BlockStateLock.  If the caller does,
+    // a deadlock could occur.
+    ASSERT(CritCheckOut(&m_BlockStateLock));
+
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+
+    // Are we in the middle of a block operation?
+    while(BLOCKED == m_BlockState) {
+        // Release the lock for the duration of the wait.  It is taken again
+        // below before any return, so alBlockStateLock's release on exit
+        // stays balanced.
+        m_BlockStateLock.Unlock();
+
+        // If this ASSERT fires, a deadlock could occur.  The caller should make sure
+        // that this thread never acquires the Block State lock more than once.
+        ASSERT(CritCheckOut( &m_BlockStateLock ));
+
+        // WaitForMultipleObjects() returns WAIT_OBJECT_0 if the unblock event
+        // is fired.  It returns WAIT_OBJECT_0 + 1 if the stop event is fired.
+        // See the Windows SDK documentation for more information on
+        // WaitForMultipleObjects().
+        const DWORD UNBLOCK = WAIT_OBJECT_0;
+        const DWORD STOP = WAIT_OBJECT_0 + 1;
+
+        // The order of the handles fixes the return values named above.
+        HANDLE ahWaitEvents[] = { m_hUnblockOutputPinEvent, m_hStopEvent };
+        DWORD dwNumWaitEvents = sizeof(ahWaitEvents)/sizeof(HANDLE);
+
+        // Wait for either event (bWaitAll is FALSE) with no timeout.
+        DWORD dwReturnValue = ::WaitForMultipleObjects( dwNumWaitEvents, ahWaitEvents, FALSE, INFINITE );
+
+        m_BlockStateLock.Lock();
+
+        #ifdef DEBUG
+        AssertValid();
+        #endif // DEBUG
+
+        switch( dwReturnValue ) {
+        case UNBLOCK:
+            // Go round again and re-test the block state under the lock.
+            break;
+
+        case STOP:
+            return VFW_E_STATE_CHANGED;
+
+        case WAIT_FAILED:
+            return AmGetLastErrorToHResult();
+
+        default:
+            DbgBreak( "An Unexpected case occured in CDynamicOutputPin::StartUsingOutputPin()." );
+            return E_UNEXPECTED;
+        }
+    }
+
+    // Not blocked: count this caller as an outstanding user of the pin.
+    m_dwNumOutstandingOutputPinUsers++;
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+
+    return S_OK;
+}
+
+void CDynamicOutputPin::StopUsingOutputPin(void)
+{
+    // The user count and the block state are only touched under
+    // m_BlockStateLock.
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+
+    --m_dwNumOutstandingOutputPinUsers;
+
+    // When the last user has gone and the pin is not in the NOT_BLOCKED
+    // state, the block can now be put into effect.
+    const bool fNoUsersLeft = (0 == m_dwNumOutstandingOutputPinUsers);
+    if(fNoUsersLeft && (NOT_BLOCKED != m_BlockState)) {
+        BlockOutputPin();
+    }
+
+    #ifdef DEBUG
+    AssertValid();
+    #endif // DEBUG
+}
+
+bool CDynamicOutputPin::StreamingThreadUsingOutputPin(void)
+{
+    // The user count is read under m_BlockStateLock.
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    // The pin is in use while at least one user is outstanding.
+    const bool fInUse = (m_dwNumOutstandingOutputPinUsers > 0);
+    return fInUse;
+}
+
+void CDynamicOutputPin::SetConfigInfo(IGraphConfig *pGraphConfig, HANDLE hStopEvent)
+{
+    // The stop event handle is stored as given.
+    m_hStopEvent = hStopEvent;
+
+    // The interface pointer is deliberately stored without an AddRef:
+    // filters are not allowed to hold references to the filter graph
+    // manager.  See the documentation for IBaseFilter::JoinFilterGraph()
+    // in the Direct Show SDK for more information.
+    m_pGraphConfig = pGraphConfig;
+}
+
+// Called when the pin goes active.  Fails with E_FAIL unless SetConfigInfo()
+// has supplied both the stop event and the graph configuration interface;
+// otherwise unsignals the stop event and defers to CBaseOutputPin::Active().
+HRESULT CDynamicOutputPin::Active(void)
+{
+    // Make sure the user initialized the object by calling SetConfigInfo().
+    if((NULL == m_hStopEvent) || (NULL == m_pGraphConfig)) {
+        // The message must be a quoted string literal: unquoted, this line
+        // is not valid C++ wherever DbgBreak() expands its argument.
+        DbgBreak( "ERROR: CDynamicOutputPin::Active() failed because m_pGraphConfig and m_hStopEvent were not initialized.  Call SetConfigInfo() to initialize them." );
+        return E_FAIL;
+    }
+
+    // If this ASSERT fires, the user may have passed an invalid event handle to SetConfigInfo().
+    // The ASSERT can also fire if the event is destroyed and then Active() is called.  An event
+    // handle is invalid if 1) the event does not exist or 2) the user does not have the security
+    // permissions to use the event.
+    EXECUTE_ASSERT(ResetEvent(m_hStopEvent));
+
+    return CBaseOutputPin::Active();
+}
+
+HRESULT CDynamicOutputPin::Inactive(void)
+{
+    // Signal the stop event before handing over to the base class.
+    //
+    // If the ASSERT fires, the handle given to SetConfigInfo() is probably
+    // invalid, or the event was destroyed before Inactive() was called.  A
+    // handle is invalid if 1) the event does not exist or 2) the user does
+    // not have the security permissions to use the event.
+    EXECUTE_ASSERT(SetEvent(m_hStopEvent));
+
+    const HRESULT hrBase = CBaseOutputPin::Inactive();
+    return hrBase;
+}
+
+HRESULT CDynamicOutputPin::DeliverBeginFlush(void)
+{
+    // Signal the stop event before passing the flush on via the base class.
+    //
+    // If the ASSERT fires, the handle given to SetConfigInfo() is probably
+    // invalid, or the event was destroyed before DeliverBeginFlush() was
+    // called.  A handle is invalid if 1) the event does not exist or 2) the
+    // user does not have the security permissions to use the event.
+    EXECUTE_ASSERT(SetEvent(m_hStopEvent));
+
+    const HRESULT hrBase = CBaseOutputPin::DeliverBeginFlush();
+    return hrBase;
+}
+
+HRESULT CDynamicOutputPin::DeliverEndFlush(void)
+{
+    // Unsignal the stop event before passing the end-of-flush on via the
+    // base class.
+    //
+    // If the ASSERT fires, the handle given to SetConfigInfo() is probably
+    // invalid, or the event was destroyed before DeliverEndFlush() was
+    // called.  A handle is invalid if 1) the event does not exist or 2) the
+    // user does not have the security permissions to use the event.
+    EXECUTE_ASSERT(ResetEvent(m_hStopEvent));
+
+    const HRESULT hrBase = CBaseOutputPin::DeliverEndFlush();
+    return hrBase;
+}
+
+
+// ChangeOutputFormat() switches the connection to a new media type (which
+// is done either as a dynamic format change or as a dynamic reconnect of
+// the output pin) and then delivers a new segment downstream.
+HRESULT CDynamicOutputPin::ChangeOutputFormat
+    (
+    const AM_MEDIA_TYPE *pmt,
+    REFERENCE_TIME tSegmentStart,
+    REFERENCE_TIME tSegmentStop,
+    double dSegmentRate
+    )
+{
+    // StartUsingOutputPin() must have been called before this method.
+    ASSERT(StreamingThreadUsingOutputPin());
+
+    // A valid media type is required.
+    ASSERT(NULL != pmt);
+
+    CMediaType cmt(*pmt);
+
+    HRESULT hr = ChangeMediaType(&cmt);
+
+    // The new segment is only announced once the type change has succeeded.
+    if (SUCCEEDED(hr)) {
+        hr = DeliverNewSegment(tSegmentStart, tSegmentStop, dSegmentRate);
+    }
+
+    // Failures are passed through; every success code is reported as S_OK.
+    return FAILED(hr) ? hr : S_OK;
+}
+
+HRESULT CDynamicOutputPin::ChangeMediaType(const CMediaType *pmt)
+{
+    // StartUsingOutputPin() must have been called before this method.
+    ASSERT(StreamingThreadUsingOutputPin());
+
+    // The filter graph is assumed to be running.
+    ASSERT(!IsStopped());
+
+    if(!IsConnected()) {
+        return VFW_E_NOT_CONNECTED;
+    }
+
+    /*  Ask the downstream pin, through IPinConnection, whether it will
+        take the new type as a dynamic format change
+    */
+    QzCComPtr<IPinConnection> pConnection;
+
+    m_Connected->QueryInterface(IID_IPinConnection, (void **)&pConnection);
+
+    const bool fDynamicChangeAccepted =
+        (pConnection != NULL) && (S_OK == pConnection->DynamicQueryAccept(pmt));
+
+    /*  No IPinConnection, or the type was not accepted: reconnect instead */
+    if(!fDynamicChangeAccepted) {
+        return DynamicReconnect(pmt);
+    }
+
+    const HRESULT hr = ChangeMediaTypeHelper(pmt);
+
+    // Failures are passed through; every success code is reported as S_OK.
+    return FAILED(hr) ? hr : S_OK;
+}
+
+HRESULT CDynamicOutputPin::ChangeMediaTypeHelper(const CMediaType *pmt)
+{
+    // StartUsingOutputPin() must have been called before this method.
+    ASSERT(StreamingThreadUsingOutputPin());
+
+    // Offer the new type to the connected pin, then adopt it ourselves.
+    HRESULT hr = m_Connected->ReceiveConnection(this, pmt);
+    if(FAILED(hr)) {
+        return hr;
+    }
+
+    hr = SetMediaType(pmt);
+    if(FAILED(hr)) {
+        return hr;
+    }
+
+    // Without an IMemInputPin there is no allocator to renegotiate.
+    if(NULL == m_pInputPin) {
+        return S_OK;
+    }
+
+    // m_pInputPin and m_Connected are assumed to be two interfaces on the
+    // same object.
+    ASSERT(::IsEqualObject(m_Connected, m_pInputPin));
+
+    // Start from all-zero requirements; the result of the query is not
+    // checked, so these values stand if the input pin does not fill them in.
+    ALLOCATOR_PROPERTIES apRequested;
+    apRequested.cBuffers = 0;
+    apRequested.cbBuffer = 0;
+    apRequested.cbAlign = 0;
+    apRequested.cbPrefix = 0;
+
+    m_pInputPin->GetAllocatorRequirements(&apRequested);
+
+    // An alignment of zero is meaningless, so treat it as one.
+    if(0 == apRequested.cbAlign) {
+        apRequested.cbAlign = 1;
+    }
+
+    // Decommit, resize, recommit, then tell the input pin about the
+    // allocator.  The first failure ends the sequence.
+    hr = m_pAllocator->Decommit();
+    if(SUCCEEDED(hr)) {
+        hr = DecideBufferSize(m_pAllocator,  &apRequested);
+    }
+    if(SUCCEEDED(hr)) {
+        hr = m_pAllocator->Commit();
+    }
+    if(SUCCEEDED(hr)) {
+        hr = m_pInputPin->NotifyAllocator(m_pAllocator, m_bPinUsesReadOnlyAllocator);
+    }
+
+    // Failures are passed through; every success code is reported as S_OK.
+    return FAILED(hr) ? hr : S_OK;
+}
+
+// This method must be called from the thread that is pushing data.  The
+// caller is responsible for making sure that thread has no outstanding
+// samples, because they cannot be delivered after a reconnect.
+//
+HRESULT CDynamicOutputPin::DynamicReconnect( const CMediaType* pmt )
+{
+    // StartUsingOutputPin() must have been called before this method.
+    ASSERT(StreamingThreadUsingOutputPin());
+
+    // Both the graph configuration interface and the stop event are needed.
+    if((NULL == m_pGraphConfig) || (NULL == m_hStopEvent)) {
+        return E_FAIL;
+    }
+
+    // Reconnect this output pin using the new media type.  NULL is passed
+    // for the second and fourth arguments.
+    return m_pGraphConfig->Reconnect(
+        this,
+        NULL,
+        pmt,
+        NULL,
+        m_hStopEvent,
+        AM_GRAPH_CONFIG_RECONNECT_CACHE_REMOVED_FILTERS );
+}
+
+HRESULT CDynamicOutputPin::CompleteConnect(IPin *pReceivePin)
+{
+    HRESULT hr = CBaseOutputPin::CompleteConnect(pReceivePin);
+    if(FAILED(hr)) {
+        return hr;
+    }
+
+    // A connection completed while the filter is not stopped needs its
+    // allocator (if there is one) committed straight away.
+    if(!IsStopped() && m_pAllocator) {
+        hr = m_pAllocator->Commit();
+        ASSERT(hr != VFW_E_ALREADY_COMMITTED);
+    }
+
+    return hr;
+}
+
+#ifdef DEBUG
+// Debug-only consistency check.  ASSERTs that the object was initialized,
+// that SetConfigInfo() has been called, and that the block state, the
+// notification handle, the blocking thread ID and the outstanding user
+// count agree with one another.
+void CDynamicOutputPin::AssertValid(void)
+{
+    // Make sure the object was correctly initialized.
+
+    // This ASSERT only fires if the object failed to initialize
+    // and the user ignored the constructor's return code (phr).
+    ASSERT(NULL != m_hUnblockOutputPinEvent);
+
+    // If either of these ASSERTs fire, the user did not correctly call
+    // SetConfigInfo().
+    ASSERT(NULL != m_hStopEvent);
+    ASSERT(NULL != m_pGraphConfig);
+
+    // Make sure the block state is consistent.
+
+    // The remaining checks read the block state members, so they are made
+    // under m_BlockStateLock.
+    CAutoLock alBlockStateLock(&m_BlockStateLock);
+
+    // BLOCK_STATE variables only have three legal values: PENDING, BLOCKED and NOT_BLOCKED.
+    ASSERT((NOT_BLOCKED == m_BlockState) || (PENDING == m_BlockState) || (BLOCKED == m_BlockState));
+
+    // m_hNotifyCallerPinBlockedEvent is only needed when a block operation cannot complete
+    // immediately, i.e. it is non-NULL exactly when the state is PENDING.
+    ASSERT(((NULL == m_hNotifyCallerPinBlockedEvent) && (PENDING != m_BlockState)) ||
+           ((NULL != m_hNotifyCallerPinBlockedEvent) && (PENDING == m_BlockState)) );
+
+    // m_dwBlockCallerThreadID should always be 0 if the pin is not blocked and
+    // the user is not trying to block the pin.
+    ASSERT((0 == m_dwBlockCallerThreadID) || (NOT_BLOCKED != m_BlockState));
+
+    // If this ASSERT fires, the streaming thread is using the output pin and the
+    // output pin is blocked.  (With no outstanding users any block state is
+    // accepted by the last two clauses.)
+    ASSERT(((0 != m_dwNumOutstandingOutputPinUsers) && (BLOCKED != m_BlockState)) ||
+           ((0 == m_dwNumOutstandingOutputPinUsers) && (NOT_BLOCKED != m_BlockState)) ||
+           ((0 == m_dwNumOutstandingOutputPinUsers) && (NOT_BLOCKED == m_BlockState)) );
+}
+#endif // DEBUG
+
+HRESULT CDynamicOutputPin::WaitEvent(HANDLE hEvent)
+{
+    // Wait, with no timeout, for the event to be signaled.
+    const DWORD dwWaitResult = ::WaitForSingleObject(hEvent, INFINITE);
+
+    // WAIT_OBJECT_0 means the event was signaled.
+    if(WAIT_OBJECT_0 == dwWaitResult) {
+        return S_OK;
+    }
+
+    if(WAIT_FAILED == dwWaitResult) {
+        return AmGetLastErrorToHResult();
+    }
+
+    // No other result is expected from this wait.
+    DbgBreak( "An Unexpected case occured in CDynamicOutputPin::WaitEvent()." );
+    return E_UNEXPECTED;
+}
+
+//=====================================================================
+//=====================================================================
+// Implements CBaseAllocator
+//=====================================================================
+//=====================================================================
+
+
+/* Constructor. Every counter and state flag starts out cleared. When
+   bEvent is set a semaphore is also created; if that fails the error is
+   reported through phr */
+
+CBaseAllocator::CBaseAllocator(TCHAR *pName,
+                               LPUNKNOWN pUnk,
+                               HRESULT *phr,
+                               BOOL bEvent,
+                               BOOL fEnableReleaseCallback
+                               ) :
+    CUnknown(pName, pUnk),
+    m_lAllocated(0),
+    m_bChanged(FALSE),
+    m_bCommitted(FALSE),
+    m_bDecommitInProgress(FALSE),
+    m_lSize(0),
+    m_lCount(0),
+    m_lAlignment(0),
+    m_lPrefix(0),
+    m_hSem(NULL),
+    m_lWaiting(0),
+    m_fEnableReleaseCallback(fEnableReleaseCallback),
+    m_pNotify(NULL)
+{
+    /* No semaphore was asked for */
+
+    if (!bEvent) {
+        return;
+    }
+
+    /* Unnamed semaphore, initial count zero, maximum count 0x7FFFFFFF */
+
+    m_hSem = CreateSemaphore(NULL, 0, 0x7FFFFFFF, NULL);
+    if (m_hSem == NULL) {
+        *phr = E_OUTOFMEMORY;
+    }
+}
+
+#ifdef UNICODE
+/* Overload taking a CHAR name, compiled only in UNICODE builds */
+
+CBaseAllocator::CBaseAllocator(CHAR *pName,
+                               LPUNKNOWN pUnk,
+                               HRESULT *phr,
+                               BOOL bEvent,
+                               BOOL fEnableReleaseCallback) :
+    CUnknown(pName, pUnk),
+    m_lAllocated(0),
+    m_bChanged(FALSE),
+    m_bCommitted(FALSE),
+    m_bDecommitInProgress(FALSE),
+    m_lSize(0),
+    m_lCount(0),
+    m_lAlignment(0),
+    m_lPrefix(0),
+    m_hSem(NULL),
+    m_lWaiting(0),
+    m_fEnableReleaseCallback(fEnableReleaseCallback),
+    m_pNotify(NULL)
+{
+    /* No semaphore was asked for */
+
+    if (!bEvent) {
+        return;
+    }
+
+    /* Unnamed semaphore, initial count zero, maximum count 0x7FFFFFFF */
+
+    m_hSem = CreateSemaphore(NULL, 0, 0x7FFFFFFF, NULL);
+    if (m_hSem == NULL) {
+        *phr = E_OUTOFMEMORY;
+    }
+}
+#endif
+
+/* Destructor - closes the semaphore and releases the notify interface */
+
+CBaseAllocator::~CBaseAllocator()
+{
+    // Decommit cannot be called from here: that would be a call to a pure
+    // virtual from a destructor.  The derived class is assumed to have
+    // already gone into the decommitted state in its own destructor.
+
+    ASSERT(!m_bCommitted);
+
+    if (NULL != m_hSem) {
+        EXECUTE_ASSERT(CloseHandle(m_hSem));
+    }
+
+    if (m_pNotify) {
+        m_pNotify->Release();
+    }
+}
+
+
+/* Expose the interfaces implemented by the allocator */
+
+STDMETHODIMP
+CBaseAllocator::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    /* IMemAllocator is always available; IMemAllocatorCallbackTemp only
+       when the release callback was enabled at construction */
+
+    if ((riid == IID_IMemAllocator) ||
+        ((riid == IID_IMemAllocatorCallbackTemp) && m_fEnableReleaseCallback)) {
+        return GetInterface((IMemAllocatorCallbackTemp *) this, ppv);
+    }
+
+    /* Anything else is passed to the base class */
+
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+/* This sets the size and count of the required samples. The memory isn't
+   actually allocated until Commit() is called, if memory has already been
+   allocated then assuming no samples are outstanding the user may call us
+   to change the buffering, the memory will be released in Commit().
+
+   pRequest - the properties being asked for
+   pActual  - receives the properties actually set (zeroed on failure)
+
+   Returns NOERROR, E_POINTER, VFW_E_BADALIGN (alignment other than 1),
+   VFW_E_ALREADY_COMMITTED or VFW_E_BUFFERS_OUTSTANDING */
+
+STDMETHODIMP
+CBaseAllocator::SetProperties(
+                ALLOCATOR_PROPERTIES* pRequest,
+                ALLOCATOR_PROPERTIES* pActual)
+{
+    CheckPointer(pRequest, E_POINTER);
+    CheckPointer(pActual, E_POINTER);
+    ValidateReadWritePtr(pActual, sizeof(ALLOCATOR_PROPERTIES));
+    CAutoLock cObjectLock(this);
+
+    /* Take a copy of the request before clearing the output structure, so
+       that a caller passing the same structure for both arguments does not
+       have its request wiped out (and then rejected as a bad alignment) */
+
+    const ALLOCATOR_PROPERTIES Request = *pRequest;
+
+    ZeroMemory(pActual, sizeof(ALLOCATOR_PROPERTIES));
+
+    ASSERT(Request.cbBuffer > 0);
+
+    /*  Check the alignment requested */
+    if (Request.cbAlign != 1) {
+        DbgLog((LOG_ERROR, 2, TEXT("Alignment requested was 0x%x, not 1"),
+               Request.cbAlign));
+        return VFW_E_BADALIGN;
+    }
+
+    /* Can't do this if already committed, there is an argument that says we
+       should not reject the SetProperties call if there are buffers still
+       active. However this is called by the source filter, which is the same
+       person who is holding the samples. Therefore it is not unreasonable
+       for them to free all their samples before changing the requirements */
+
+    if (m_bCommitted) {
+        return VFW_E_ALREADY_COMMITTED;
+    }
+
+    /* Must be no outstanding buffers */
+
+    if (m_lAllocated != m_lFree.GetCount()) {
+        return VFW_E_BUFFERS_OUTSTANDING;
+    }
+
+    /* There isn't any real need to check the parameters as they
+       will just be rejected when the user finally calls Commit */
+
+    pActual->cbBuffer = m_lSize = Request.cbBuffer;
+    pActual->cBuffers = m_lCount = Request.cBuffers;
+    pActual->cbAlign = m_lAlignment = Request.cbAlign;
+    pActual->cbPrefix = m_lPrefix = Request.cbPrefix;
+
+    /* Record that the properties have changed */
+
+    m_bChanged = TRUE;
+    return NOERROR;
+}
+
+/* Report the buffer count, size, alignment and prefix currently recorded
+   by the allocator */
+
+STDMETHODIMP
+CBaseAllocator::GetProperties(
+    ALLOCATOR_PROPERTIES * pActual)
+{
+    CheckPointer(pActual,E_POINTER);
+    ValidateReadWritePtr(pActual,sizeof(ALLOCATOR_PROPERTIES));
+
+    /* The four values are read together under the object lock */
+
+    CAutoLock cObjectLock(this);
+    pActual->cBuffers = m_lCount;
+    pActual->cbBuffer = m_lSize;
+    pActual->cbAlign = m_lAlignment;
+    pActual->cbPrefix = m_lPrefix;
+    return NOERROR;
+}
+
+// Get a container for a sample. Blocking, synchronous call to get the
+// next free buffer (as represented by an IMediaSample interface).
+// On return, the time etc properties will be invalid, but the buffer
+// pointer and size will be correct.
+//
+// ppBuffer             - receives the sample; set to NULL until one is found
+// pStartTime, pEndTime - not used by this implementation
+// dwFlags              - if AM_GBF_NOWAIT is set the call does not block when
+//                        the free list is empty
+//
+// Returns E_POINTER if ppBuffer is NULL, VFW_E_NOT_COMMITTED if the allocator
+// is not committed (including when it is decommitted while we wait),
+// VFW_E_TIMEOUT if AM_GBF_NOWAIT was given and no sample was free, otherwise
+// NOERROR.
+
+HRESULT CBaseAllocator::GetBuffer(IMediaSample **ppBuffer,
+                                  REFERENCE_TIME *pStartTime,
+                                  REFERENCE_TIME *pEndTime,
+                                  DWORD dwFlags
+                                  )
+{
+    UNREFERENCED_PARAMETER(pStartTime);
+    UNREFERENCED_PARAMETER(pEndTime);
+
+    // ppBuffer is written unconditionally below, so reject NULL first
+    CheckPointer(ppBuffer, E_POINTER);
+
+    CMediaSample *pSample = NULL;
+
+    *ppBuffer = NULL;
+    for (;;)
+    {
+        {  // scope for lock
+            CAutoLock cObjectLock(this);
+
+            /* Check we are committed */
+            if (!m_bCommitted) {
+                return VFW_E_NOT_COMMITTED;
+            }
+            pSample = (CMediaSample *) m_lFree.RemoveHead();
+            if (pSample == NULL) {
+                // register as a waiter while still holding the lock so that
+                // NotifySample() knows to signal the semaphore for us
+                SetWaiting();
+            }
+        }
+
+        /* If we didn't get a sample then wait for the list to signal */
+
+        if (pSample) {
+            break;
+        }
+        if (dwFlags & AM_GBF_NOWAIT) {
+            return VFW_E_TIMEOUT;
+        }
+        ASSERT(m_hSem != NULL);
+        WaitForSingleObject(m_hSem, INFINITE);
+    }
+
+    /* Addref the buffer up to one. On release
+       back to zero instead of being deleted, it will requeue itself by
+       calling the ReleaseBuffer member function. NOTE the owner of a
+       media sample must always be derived from CBaseAllocator */
+
+
+    ASSERT(pSample->m_cRef == 0);
+    pSample->m_cRef = 1;
+    *ppBuffer = pSample;
+
+
+    return NOERROR;
+}
+
+
+/* Final release of a CMediaSample will call this.
+
+   Puts the sample back on the free list, wakes any GetBuffer callers that
+   registered as waiting, and completes a pending Decommit once every
+   allocated sample is back on the list. Returns E_POINTER if pSample is
+   NULL, otherwise NOERROR. */
+
+STDMETHODIMP
+CBaseAllocator::ReleaseBuffer(IMediaSample * pSample)
+{
+    CheckPointer(pSample,E_POINTER);
+    ValidateReadPtr(pSample,sizeof(IMediaSample));
+
+
+
+    // set when this call completes a deferred Decommit; the matching
+    // Release() is issued only after the object lock has been dropped
+    BOOL bRelease = FALSE;
+    {
+        CAutoLock cal(this);
+
+        /* Put back on the free list */
+
+        m_lFree.Add((CMediaSample *)pSample);
+        if (m_lWaiting != 0) {
+            NotifySample();
+        }
+
+        // if there is a pending Decommit, then we need to complete it by
+        // calling Free() when the last buffer is placed on the free list
+
+        LONG l1 = m_lFree.GetCount();
+        if (m_bDecommitInProgress && (l1 == m_lAllocated)) {
+            Free();
+            m_bDecommitInProgress = FALSE;
+            bRelease = TRUE;
+        }
+    }
+
+    if (m_pNotify) {
+
+        ASSERT(m_fEnableReleaseCallback);
+
+        //
+        // Note that this is not synchronized with setting up a notification
+        // method: m_pNotify is read here after the object lock was released.
+        //
+        m_pNotify->NotifyRelease();
+    }
+
+    /* Balances the AddRef() made by a successful Commit(), now that the
+       deferred Decommit has completed. This may cause the allocator and
+       all samples to be deleted */
+
+    if (bRelease) {
+        Release();
+    }
+    return NOERROR;
+}
+
+// Install (or, with pNotify == NULL, remove) the callback whose
+// NotifyRelease() is invoked from ReleaseBuffer. The allocator holds one
+// reference on the installed callback. Always returns S_OK.
+STDMETHODIMP
+CBaseAllocator::SetNotify(
+    IMemAllocatorNotifyCallbackTemp* pNotify
+    )
+{
+    ASSERT(m_fEnableReleaseCallback);
+    CAutoLock lck(this);
+    // AddRef the new callback before releasing the old one so that passing
+    // the currently installed callback again cannot drop its last reference
+    if (pNotify) {
+        pNotify->AddRef();
+    }
+    if (m_pNotify) {
+        m_pNotify->Release();
+    }
+    m_pNotify = pNotify;
+    return S_OK;
+}
+
+// Report how many buffers could currently be handed out: the configured
+// count (m_lCount) minus those allocated (m_lAllocated) plus those sitting
+// on the free list.
+//
+// plBuffersFree - receives the count
+//
+// Returns E_POINTER if plBuffersFree is NULL, otherwise NOERROR.
+STDMETHODIMP
+CBaseAllocator::GetFreeCount(
+    LONG* plBuffersFree
+    )
+{
+    ASSERT(m_fEnableReleaseCallback);
+
+    // the out pointer is written unconditionally below, so reject NULL
+    CheckPointer(plBuffersFree, E_POINTER);
+
+    CAutoLock cObjectLock(this);
+    *plBuffersFree = m_lCount - m_lAllocated + m_lFree.GetCount();
+    return NOERROR;
+}
+
+// Wake every caller currently recorded in m_lWaiting by releasing the
+// semaphore that many times, then reset the waiter count. Does nothing when
+// nobody is waiting.
+void
+CBaseAllocator::NotifySample()
+{
+    if (m_lWaiting == 0) {
+        return;
+    }
+
+    ASSERT(m_hSem != NULL);
+    ReleaseSemaphore(m_hSem, m_lWaiting, 0);
+    m_lWaiting = 0;
+}
+
+// Move the allocator into the committed state so that GetBuffer calls can
+// succeed. Calls Alloc() to create the samples unless a Decommit is still
+// pending, in which case that Decommit is simply cancelled. Returns NOERROR
+// on success (including when already committed) or the failure code from
+// Alloc().
+STDMETHODIMP
+CBaseAllocator::Commit()
+{
+    /* Check we are not decommitted */
+    CAutoLock cObjectLock(this);
+
+    // cannot need to alloc or re-alloc if we are committed
+    if (m_bCommitted) {
+        return NOERROR;
+    }
+
+    /* Allow GetBuffer calls */
+
+    m_bCommitted = TRUE;
+
+    // is there a pending decommit ? if so, just cancel it
+    if (m_bDecommitInProgress) {
+        m_bDecommitInProgress = FALSE;
+
+        // don't call Alloc at this point. He cannot allow SetProperties
+        // between Decommit and the last free, so the buffer size cannot have
+        // changed. And because some of the buffers are not free yet, he
+        // cannot re-alloc anyway.
+        return NOERROR;
+    }
+
+    DbgLog((LOG_MEMORY, 1, TEXT("Allocating: %ldx%ld"), m_lCount, m_lSize));
+
+    // actually need to allocate the samples
+    HRESULT hr = Alloc();
+    if (FAILED(hr)) {
+        // roll back the committed flag set above
+        m_bCommitted = FALSE;
+        return hr;
+    }
+    // hold a reference on ourselves while committed; it is dropped by the
+    // Release() issued when a Decommit completes
+    AddRef();
+    return NOERROR;
+}
+
+
+// Leave the committed state so that further GetBuffer calls fail. If every
+// sample is already on the free list the decommit completes here (Free() is
+// called and the reference taken by Commit() is released); otherwise it is
+// marked as in progress and ReleaseBuffer completes it when the last sample
+// comes back. Always returns NOERROR.
+STDMETHODIMP
+CBaseAllocator::Decommit()
+{
+    // set when the decommit completes in this call; the Release() is issued
+    // only after the object lock has been dropped
+    BOOL bRelease = FALSE;
+    {
+        /* Check we are not already decommitted */
+        CAutoLock cObjectLock(this);
+        if (m_bCommitted == FALSE) {
+            if (m_bDecommitInProgress == FALSE) {
+                return NOERROR;
+            }
+        }
+
+        /* No more GetBuffer calls will succeed */
+        m_bCommitted = FALSE;
+
+        // are any buffers outstanding?
+        if (m_lFree.GetCount() < m_lAllocated) {
+            // please complete the decommit when last buffer is freed
+            m_bDecommitInProgress = TRUE;
+        } else {
+            m_bDecommitInProgress = FALSE;
+
+            // need to complete the decommit here as there are no
+            // outstanding buffers
+
+            Free();
+            bRelease = TRUE;
+        }
+
+        // Tell anyone waiting that they can go now so we can
+        // reject their call
+        NotifySample();
+    }
+
+    if (bRelease) {
+        Release();
+    }
+    return NOERROR;
+}
+
+
+/* Base-class half of allocation: decides whether a derived Alloc() has any
+   work to do. Returns VFW_E_SIZENOTSET if the count, size or alignment has
+   not been set to a positive value, S_FALSE if the requirements are
+   unchanged since the last allocation, and NOERROR if the derived class
+   should go ahead and (re)allocate. */
+
+HRESULT
+CBaseAllocator::Alloc(void)
+{
+    /* All three of these must have been given positive values */
+    const BOOL bPropertiesSet =
+        (m_lCount > 0) && (m_lSize > 0) && (m_lAlignment > 0);
+    if (!bPropertiesSet) {
+        return VFW_E_SIZENOTSET;
+    }
+
+    /* We should never be asked to allocate while samples are still out */
+    ASSERT(m_lFree.GetCount() == m_lAllocated);
+
+    /* Nothing changed means nothing to reallocate */
+    return (m_bChanged == FALSE) ? S_FALSE : NOERROR;
+}
+
+/*  CBaseAllocator::CSampleList::Remove(pSample)
+    Unlinks pSample from the singly linked list headed by m_List and
+    decrements the element count. Hits DbgBreak if the sample is not found.
+*/
+void
+CBaseAllocator::CSampleList::Remove(CMediaSample * pSample)
+{
+    /* ppLink always addresses the pointer that refers to the node under
+       inspection (the list head, or the previous node's next field), so
+       unlinking is a single store with no special case for the head */
+    CMediaSample **ppLink = &m_List;
+    while (*ppLink != NULL) {
+        if (*ppLink == pSample) {
+            *ppLink = CBaseAllocator::NextSample(pSample);
+            CBaseAllocator::NextSample(pSample) = NULL;
+            m_nOnList--;
+            return;
+        }
+        ppLink = &(CBaseAllocator::NextSample(*ppLink));
+    }
+    DbgBreak("Couldn't find sample in list");
+}
+
+//=====================================================================
+//=====================================================================
+// Implements CMemAllocator
+//=====================================================================
+//=====================================================================
+
+
+/* Factory entry point (for the factory template table): creates a new
+   CMemAllocator and hands it back as a CUnknown. pUnk and phr are passed
+   straight through to the constructor. */
+CUnknown *CMemAllocator::CreateInstance(LPUNKNOWN pUnk, HRESULT *phr)
+{
+    return new CMemAllocator(NAME("CMemAllocator"), pUnk, phr);
+}
+
+// Construct an allocator that owns no buffer memory yet (m_pBuffer is NULL
+// until Alloc() runs). pName, pUnk and phr are forwarded to CBaseAllocator.
+// NOTE(review): the two TRUE arguments are CBaseAllocator constructor flags
+// declared outside this section - confirm their meaning there.
+CMemAllocator::CMemAllocator(
+    TCHAR *pName,
+    LPUNKNOWN pUnk,
+    HRESULT *phr)
+    : CBaseAllocator(pName, pUnk, phr, TRUE, TRUE),
+    m_pBuffer(NULL)
+{
+}
+
+#ifdef UNICODE
+// Same as above, taking a narrow-character name; only compiled in UNICODE
+// builds, where TCHAR and CHAR are distinct types.
+CMemAllocator::CMemAllocator(
+    CHAR *pName,
+    LPUNKNOWN pUnk,
+    HRESULT *phr)
+    : CBaseAllocator(pName, pUnk, phr, TRUE, TRUE),
+    m_pBuffer(NULL)
+{
+}
+#endif
+
+/* This sets the size and count of the required samples. The memory isn't
+   actually allocated until Commit() is called, if memory has already been
+   allocated then assuming no samples are outstanding the user may call us
+   to change the buffering, the memory will be released in Commit()
+
+   pRequest - the properties wanted by the caller
+   pActual  - receives the properties actually stored; cbBuffer is rounded
+              up so that cbBuffer + cbPrefix is a multiple of cbAlign
+
+   Returns E_POINTER for a NULL argument, VFW_E_BADALIGN if cbAlign is zero
+   or does not divide the system allocation granularity,
+   VFW_E_ALREADY_COMMITTED, VFW_E_BUFFERS_OUTSTANDING, or NOERROR. */
+STDMETHODIMP
+CMemAllocator::SetProperties(
+                ALLOCATOR_PROPERTIES* pRequest,
+                ALLOCATOR_PROPERTIES* pActual)
+{
+    // pRequest is dereferenced throughout, so reject NULL just as
+    // CBaseAllocator::SetProperties does
+    CheckPointer(pRequest,E_POINTER);
+    CheckPointer(pActual,E_POINTER);
+    ValidateReadWritePtr(pActual,sizeof(ALLOCATOR_PROPERTIES));
+    CAutoLock cObjectLock(this);
+
+    ZeroMemory(pActual, sizeof(ALLOCATOR_PROPERTIES));
+
+    ASSERT(pRequest->cbBuffer > 0);
+
+    SYSTEM_INFO SysInfo;
+    GetSystemInfo(&SysInfo);
+
+    /*  Check the alignment request is a power of 2. Note that a failure
+        here is only logged; the request is not rejected by this test */
+    if ((-pRequest->cbAlign & pRequest->cbAlign) != pRequest->cbAlign) {
+        DbgLog((LOG_ERROR, 1, TEXT("Alignment requested 0x%x not a power of 2!"),
+               pRequest->cbAlign));
+    }
+    /*  Check the alignment requested: it must be non-zero (it is used as a
+        divisor below) and compatible with the allocation granularity */
+    if (pRequest->cbAlign == 0 ||
+        (SysInfo.dwAllocationGranularity & (pRequest->cbAlign - 1)) != 0) {
+        DbgLog((LOG_ERROR, 1, TEXT("Invalid alignment 0x%x requested - granularity = 0x%x"),
+               pRequest->cbAlign, SysInfo.dwAllocationGranularity));
+        return VFW_E_BADALIGN;
+    }
+
+    /* Can't do this if already committed, there is an argument that says we
+       should not reject the SetProperties call if there are buffers still
+       active. However this is called by the source filter, which is the same
+       person who is holding the samples. Therefore it is not unreasonable
+       for them to free all their samples before changing the requirements */
+
+    if (m_bCommitted == TRUE) {
+        return VFW_E_ALREADY_COMMITTED;
+    }
+
+    /* Must be no outstanding buffers */
+
+    if (m_lFree.GetCount() < m_lAllocated) {
+        return VFW_E_BUFFERS_OUTSTANDING;
+    }
+
+    /* There isn't any real need to check the parameters as they
+       will just be rejected when the user finally calls Commit */
+
+    // round length up to alignment - remember that prefix is included in
+    // the alignment
+    LONG lSize = pRequest->cbBuffer + pRequest->cbPrefix;
+    LONG lRemainder = lSize % pRequest->cbAlign;
+    if (lRemainder != 0) {
+        lSize = lSize - lRemainder + pRequest->cbAlign;
+    }
+    pActual->cbBuffer = m_lSize = (lSize - pRequest->cbPrefix);
+
+    pActual->cBuffers = m_lCount = pRequest->cBuffers;
+    pActual->cbAlign = m_lAlignment = pRequest->cbAlign;
+    pActual->cbPrefix = m_lPrefix = pRequest->cbPrefix;
+
+    m_bChanged = TRUE;
+    return NOERROR;
+}
+
+// override this to allocate our resources when Commit is called.
+//
+// note that our resources may be already allocated when this is called,
+// since we don't free them on Decommit. We will only be called when in
+// decommit state with all buffers free.
+//
+// object locked by caller
+//
+// Returns the failure code from CBaseAllocator::Alloc() if the properties
+// are unusable, E_OUTOFMEMORY if the total size does not fit in a LONG or
+// an allocation fails, otherwise NOERROR (also when nothing had to be
+// reallocated).
+HRESULT
+CMemAllocator::Alloc(void)
+{
+    CAutoLock lck(this);
+
+    /* Check he has called SetProperties */
+    HRESULT hr = CBaseAllocator::Alloc();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    /* If the requirements haven't changed then don't reallocate */
+    if (hr == S_FALSE) {
+        ASSERT(m_pBuffer);
+        return NOERROR;
+    }
+    ASSERT(hr == S_OK); // we use this fact in the loop below
+
+    /* Free the old resources */
+    if (m_pBuffer) {
+        ReallyFree();
+    }
+
+    /* Compute the aligned size of one sample (buffer plus prefix). The
+       arithmetic is done in 64 bits so that values too large for a LONG
+       are detected instead of silently wrapping and under-allocating */
+    LONGLONG llAlignedSize = (LONGLONG)m_lSize + m_lPrefix;
+    if (m_lAlignment > 1) {
+        LONGLONG llRemainder = llAlignedSize % m_lAlignment;
+        if (llRemainder != 0) {
+            llAlignedSize += (m_lAlignment - llRemainder);
+        }
+    }
+
+    /* Create the contiguous memory block for the samples
+       making sure it's properly aligned (64K should be enough!)
+    */
+    ASSERT(llAlignedSize % m_lAlignment == 0);
+
+    const LONGLONG llMaxLong = 0x7FFFFFFF;      // largest value a LONG holds
+    const LONGLONG llToAllocate = m_lCount * llAlignedSize;
+
+    /*  Check overflow */
+    if (llAlignedSize > llMaxLong || llToAllocate > llMaxLong) {
+        return E_OUTOFMEMORY;
+    }
+    const LONG lAlignedSize = (LONG)llAlignedSize;
+
+    m_pBuffer = (PBYTE)VirtualAlloc(NULL,
+                    (LONG)llToAllocate,
+                    MEM_COMMIT,
+                    PAGE_READWRITE);
+
+    if (m_pBuffer == NULL) {
+        return E_OUTOFMEMORY;
+    }
+
+    LPBYTE pNext = m_pBuffer;
+    CMediaSample *pSample;
+
+    ASSERT(m_lAllocated == 0);
+
+    // Create the new samples - we have allocated m_lSize bytes for each sample
+    // plus m_lPrefix bytes per sample as a prefix. We set the pointer to
+    // the memory after the prefix - so that GetPointer() will return a pointer
+    // to m_lSize bytes.
+    for (; m_lAllocated < m_lCount; m_lAllocated++, pNext += lAlignedSize) {
+
+        pSample = new CMediaSample(
+                            NAME("Default memory media sample"),
+                            this,
+                            &hr,
+                            pNext + m_lPrefix,      // GetPointer() value
+                            m_lSize);               // not including prefix
+
+        ASSERT(SUCCEEDED(hr));
+        if (pSample == NULL) {
+            // m_bChanged stays TRUE, so the next Alloc() calls ReallyFree()
+            // on this partial allocation before trying again
+            return E_OUTOFMEMORY;
+        }
+
+        // This CANNOT fail
+        m_lFree.Add(pSample);
+    }
+
+    m_bChanged = FALSE;
+    return NOERROR;
+}
+
+
+// Hook called by the base class when a Decommit completes, i.e. once all
+// buffers are back on the free list.
+//
+// caller has already locked the object.
+
+// Deliberately empty: this allocator keeps its memory after Decommit so a
+// later Commit can reuse it. The memory is actually released by
+// ReallyFree(), which runs from the destructor (and from Alloc when the
+// requirements have changed).
+void
+CMemAllocator::Free(void)
+{
+}
+
+
+// Actually release everything: delete every CMediaSample on the free list,
+// reset the allocated count and hand the buffer block back to the system.
+// Called from the destructor and from Alloc when the size/count changed.
+void
+CMemAllocator::ReallyFree(void)
+{
+    /* Every sample must be back on the free list before we tear down */
+
+    ASSERT(m_lAllocated == m_lFree.GetCount());
+
+    /* Drain the free list, deleting each sample as it comes off */
+
+    CMediaSample *pSample;
+    while ((pSample = m_lFree.RemoveHead()) != NULL) {
+        delete pSample;
+    }
+
+    m_lAllocated = 0;
+
+    // release the block of buffer memory, if we ever allocated one
+    if (m_pBuffer) {
+        EXECUTE_ASSERT(VirtualFree(m_pBuffer, 0, MEM_RELEASE));
+        m_pBuffer = NULL;
+    }
+}
+
+
+/* Destructor frees our memory resources: Decommit() first, then
+   ReallyFree() to delete the samples and release the buffer block (Free()
+   itself is a no-op for this class) */
+
+CMemAllocator::~CMemAllocator()
+{
+    Decommit();
+    ReallyFree();
+}
+
+// ------------------------------------------------------------------------
+// Filter registration through IFilterMapper. Used if IFilterMapper is
+// not found (Quartz 1.0 install).
+//
+// psetupdata - description of the filter, its pins and their media types
+// pIFM       - mapper used to perform the (un)registration
+// bRegister  - TRUE to register; FALSE to only unregister
+//
+// The filter is always unregistered first. Returns S_FALSE when there is no
+// setup data, NOERROR when the last call succeeded or failed with
+// 0x80070002, otherwise the HRESULT of the last mapper call made.
+
+STDAPI
+AMovieSetupRegisterFilter( const AMOVIESETUP_FILTER * const psetupdata
+                         , IFilterMapper *                  pIFM
+                         , BOOL                             bRegister  )
+{
+  DbgLog((LOG_TRACE, 3, TEXT("= AMovieSetupRegisterFilter")));
+
+  // nothing to do without setup data
+  //
+  if( psetupdata == NULL )
+  {
+    return S_FALSE;
+  }
+
+  // always start by unregistering the filter
+  // (pins are subkeys of the filter's CLSID key,
+  // so they go away with it).
+  //
+  DbgLog((LOG_TRACE, 3, TEXT("= = unregister filter")));
+  HRESULT hr = pIFM->UnregisterFilter( *(psetupdata->clsID) );
+
+  if( bRegister )
+  {
+    // the filter itself
+    //
+    DbgLog((LOG_TRACE, 3, TEXT("= = register filter")));
+    hr = pIFM->RegisterFilter( *(psetupdata->clsID)
+                             , psetupdata->strName
+                             , psetupdata->dwMerit    );
+
+    if( SUCCEEDED(hr) )
+    {
+      // then each pin, stopping at the first failure
+      //
+      DbgLog((LOG_TRACE, 3, TEXT("= = register filter pins")));
+      for( UINT iPin=0; iPin < psetupdata->nPins; iPin++ )
+      {
+        hr = pIFM->RegisterPin( *(psetupdata->clsID)
+                              , psetupdata->lpPin[iPin].strName
+                              , psetupdata->lpPin[iPin].bRendered
+                              , psetupdata->lpPin[iPin].bOutput
+                              , psetupdata->lpPin[iPin].bZero
+                              , psetupdata->lpPin[iPin].bMany
+                              , *(psetupdata->lpPin[iPin].clsConnectsToFilter)
+                              , psetupdata->lpPin[iPin].strConnectsToPin );
+        if( FAILED(hr) )
+        {
+          break;
+        }
+
+        // and the media types of that pin
+        //
+        DbgLog((LOG_TRACE, 3, TEXT("= = register filter pin types")));
+        for( UINT iType=0; iType < psetupdata->lpPin[iPin].nMediaTypes; iType++ )
+        {
+          hr = pIFM->RegisterPinType( *(psetupdata->clsID)
+                                    , psetupdata->lpPin[iPin].strName
+                                    , *(psetupdata->lpPin[iPin].lpMediaType[iType].clsMajorType)
+                                    , *(psetupdata->lpPin[iPin].lpMediaType[iType].clsMinorType) );
+          if( FAILED(hr) )
+          {
+            break;
+          }
+        }
+
+        // a failed media type also ends the pin loop
+        if( FAILED(hr) )
+        {
+          break;
+        }
+      }
+    }
+  }
+
+  // one "error" is acceptable - that of the
+  // filter not being registered in the first
+  // place (no suitable #define'd name was
+  // found for this code).
+  //
+  if( 0x80070002 == hr )
+  {
+    hr = NOERROR;
+  }
+  return hr;
+}
+
+//  Remove warnings about unreferenced inline functions
+#pragma warning(disable:4514)
+
diff --git a/plugins/GSdx_legacy/baseclasses/amfilter.h b/plugins/GSdx_legacy/baseclasses/amfilter.h
new file mode 100644
index 0000000000..70dbb44547
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/amfilter.h
@@ -0,0 +1,1587 @@
+//------------------------------------------------------------------------------
+// File: AMFilter.h
+//
+// Desc: DirectShow base classes - defines class hierarchy for streams
+//       architecture.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __FILTER__
+#define __FILTER__
+
+/* The following classes are declared in this header: */
+
+class CBaseMediaFilter;     // IMediaFilter support
+class CBaseFilter;          // IBaseFilter,IMediaFilter support
+class CBasePin;             // Abstract base class for IPin interface
+class CEnumPins;            // Enumerate input and output pins
+class CEnumMediaTypes;      // Enumerate the pin's preferred formats
+class CBaseOutputPin;       // Adds data provider member functions
+class CBaseInputPin;        // Implements IMemInputPin interface
+class CMediaSample;         // Basic transport unit for IMemInputPin
+class CBaseAllocator;       // General list guff for most allocators
+class CMemAllocator;        // Implements memory buffer allocation
+
+
+//=====================================================================
+//=====================================================================
+//
+// QueryFilterInfo and QueryPinInfo AddRef the interface pointers
+// they return.  You can use the macro below to release the interface.
+//
+//=====================================================================
+//=====================================================================
+
+// Releases fi.pGraph if it is non-NULL. The expansion is an unbraced `if`
+// that already ends in a semicolon, so do not use it as the body of an
+// if/else without wrapping the call in braces.
+#define QueryFilterInfoReleaseGraph(fi) if ((fi).pGraph) (fi).pGraph->Release();
+
+// Releases pi.pFilter if it is non-NULL. Same expansion caveat as above.
+#define QueryPinInfoReleaseFilter(pi) if ((pi).pFilter) (pi).pFilter->Release();
+
+//=====================================================================
+//=====================================================================
+// Defines CBaseMediaFilter
+//
+// Abstract base class implementing IMediaFilter.
+//
+// Typically you will derive your filter from CBaseFilter rather than
+// this,  unless you are implementing an object such as a plug-in
+// distributor that needs to support IMediaFilter but not IBaseFilter.
+//
+// Note that IMediaFilter is derived from IPersist to allow query of
+// class id.
+//=====================================================================
+//=====================================================================
+
+// Declaration only: state, clock, start offset, class id and lock, plus the
+// IPersist / IMediaFilter methods and two helpers.
+class AM_NOVTABLE CBaseMediaFilter : public CUnknown,
+                                     public IMediaFilter
+{
+
+protected:
+
+    FILTER_STATE    m_State;            // current state: running, paused
+    IReferenceClock *m_pClock;          // this filter's reference clock
+    // note: all filters in a filter graph use the same clock
+
+    // offset from stream time to reference time
+    CRefTime        m_tStart;
+
+    CLSID	    m_clsid;            // This filters clsid
+                                        // used for serialization
+    CCritSec        *m_pLock;           // Object we use for locking
+
+public:
+
+    // NOTE(review): the parameters presumably have the same meaning as in
+    // CBaseFilter's constructor (object description, delegating IUnknown,
+    // lock owner, clsid) - confirm against the implementation
+    CBaseMediaFilter(
+        const TCHAR     *pName,
+        LPUNKNOWN pUnk,
+        CCritSec  *pLock,
+	REFCLSID   clsid);
+
+    virtual ~CBaseMediaFilter();
+
+    DECLARE_IUNKNOWN
+
+    // override this to say what interfaces we support where
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void ** ppv);
+
+    //
+    // --- IPersist method ---
+    //
+
+    STDMETHODIMP GetClassID(CLSID *pClsID);
+
+    // --- IMediaFilter methods ---
+
+    STDMETHODIMP GetState(DWORD dwMSecs, FILTER_STATE *State);
+
+    STDMETHODIMP SetSyncSource(IReferenceClock *pClock);
+
+    STDMETHODIMP GetSyncSource(IReferenceClock **pClock);
+
+    // default implementation of Stop and Pause just record the
+    // state. Override to activate or de-activate your filter.
+    // Note that Run when called from Stopped state will call Pause
+    // to ensure activation, so if you are a source or transform
+    // you will probably not need to override Run.
+    STDMETHODIMP Stop();
+    STDMETHODIMP Pause();
+
+
+    // the start parameter is the difference to be added to the
+    // sample's stream time to get the reference time for
+    // its presentation
+    STDMETHODIMP Run(REFERENCE_TIME tStart);
+
+    // --- helper methods ---
+
+    // return the current stream time - ie find out what
+    // stream time should be appearing now
+    virtual HRESULT StreamTime(CRefTime& rtStream);
+
+    // Is the filter currently active? (running or paused)
+    // Holds m_pLock while the state is examined.
+    BOOL IsActive() {
+        CAutoLock cObjectLock(m_pLock);
+        return ((m_State == State_Paused) || (m_State == State_Running));
+    };
+};
+
+//=====================================================================
+//=====================================================================
+// Defines CBaseFilter
+//
+// An abstract class providing basic IBaseFilter support for pin
+// enumeration and filter information reading.
+//
+// We cannot derive from CBaseMediaFilter since methods in IMediaFilter
+// are also in IBaseFilter and would be ambiguous. Since much of the code
+// assumes that they derive from a class that has m_State and other state
+// directly available, we duplicate code from CBaseMediaFilter rather than
+// having a member variable.
+//
+// Derive your filter from this, or from a derived object such as
+// CTransformFilter.
+//=====================================================================
+//=====================================================================
+
+
+// Declaration only: filter state shared with the pins (CBasePin is a
+// friend), the IPersist / IMediaFilter / IBaseFilter / IAMovieSetup
+// methods, and the two pure virtuals (GetPinCount, GetPin) that a derived
+// filter must supply.
+class AM_NOVTABLE CBaseFilter : public CUnknown,        // Handles an IUnknown
+                    public IBaseFilter,     // The Filter Interface
+                    public IAMovieSetup     // For un/registration
+{
+
+friend class CBasePin;
+
+protected:
+    FILTER_STATE    m_State;            // current state: running, paused
+    IReferenceClock *m_pClock;          // this graph's ref clock
+    CRefTime        m_tStart;           // offset from stream time to reference time
+    CLSID	    m_clsid;            // This filters clsid
+                                        // used for serialization
+    CCritSec        *m_pLock;           // Object we use for locking
+
+    WCHAR           *m_pName;           // Full filter name
+    IFilterGraph    *m_pGraph;          // Graph we belong to
+    IMediaEventSink *m_pSink;           // Called with notify events
+    LONG            m_PinVersion;       // Current pin version
+
+public:
+
+    // Four constructor overloads follow: with and without the trailing phr
+    // out-parameter, each also available with a CHAR name in UNICODE builds.
+
+    CBaseFilter(
+        const TCHAR *pName,     // Object description
+        LPUNKNOWN pUnk,         // IUnknown of delegating object
+        CCritSec  *pLock,       // Object who maintains lock
+	REFCLSID   clsid);      // The clsid to be used to serialize this filter
+
+    CBaseFilter(
+        TCHAR     *pName,       // Object description
+        LPUNKNOWN pUnk,         // IUnknown of delegating object
+        CCritSec  *pLock,       // Object who maintains lock
+	REFCLSID   clsid,       // The clsid to be used to serialize this filter
+        HRESULT   *phr);        // General OLE return code
+#ifdef UNICODE
+    CBaseFilter(
+        const CHAR *pName,     // Object description
+        LPUNKNOWN pUnk,         // IUnknown of delegating object
+        CCritSec  *pLock,       // Object who maintains lock
+	REFCLSID   clsid);      // The clsid to be used to serialize this filter
+
+    CBaseFilter(
+        CHAR     *pName,       // Object description
+        LPUNKNOWN pUnk,         // IUnknown of delegating object
+        CCritSec  *pLock,       // Object who maintains lock
+	REFCLSID   clsid,       // The clsid to be used to serialize this filter
+        HRESULT   *phr);        // General OLE return code
+#endif
+    ~CBaseFilter();
+
+    DECLARE_IUNKNOWN
+
+    // override this to say what interfaces we support where
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void ** ppv);
+#ifdef DEBUG
+    // only declared in DEBUG builds
+    STDMETHODIMP_(ULONG) NonDelegatingRelease();
+#endif
+
+    //
+    // --- IPersist method ---
+    //
+
+    STDMETHODIMP GetClassID(CLSID *pClsID);
+
+    // --- IMediaFilter methods ---
+
+    STDMETHODIMP GetState(DWORD dwMSecs, FILTER_STATE *State);
+
+    STDMETHODIMP SetSyncSource(IReferenceClock *pClock);
+
+    STDMETHODIMP GetSyncSource(IReferenceClock **pClock);
+
+
+    // override Stop and Pause so we can activate the pins.
+    // Note that Run will call Pause first if activation needed.
+    // Override these if you want to activate your filter rather than
+    // your pins.
+    STDMETHODIMP Stop();
+    STDMETHODIMP Pause();
+
+    // the start parameter is the difference to be added to the
+    // sample's stream time to get the reference time for
+    // its presentation
+    STDMETHODIMP Run(REFERENCE_TIME tStart);
+
+    // --- helper methods ---
+
+    // return the current stream time - ie find out what
+    // stream time should be appearing now
+    virtual HRESULT StreamTime(CRefTime& rtStream);
+
+    // Is the filter currently active? (paused or running)
+    // Holds m_pLock while the state is examined.
+    BOOL IsActive() {
+        CAutoLock cObjectLock(m_pLock);
+        return ((m_State == State_Paused) || (m_State == State_Running));
+    };
+
+    // Is this filter stopped (without locking)
+    BOOL IsStopped() {
+        return (m_State == State_Stopped);
+    };
+
+    //
+    // --- IBaseFilter methods ---
+    //
+
+    // pin enumerator
+    STDMETHODIMP EnumPins(
+                    IEnumPins ** ppEnum);
+
+
+    // default behaviour of FindPin assumes pin ids are their names
+    STDMETHODIMP FindPin(
+        LPCWSTR Id,
+        IPin ** ppPin
+    );
+
+    STDMETHODIMP QueryFilterInfo(
+                    FILTER_INFO * pInfo);
+
+    STDMETHODIMP JoinFilterGraph(
+                    IFilterGraph * pGraph,
+                    LPCWSTR pName);
+
+    // return a Vendor information string. Optional - may return E_NOTIMPL.
+    // memory returned should be freed using CoTaskMemFree
+    // default implementation returns E_NOTIMPL
+    STDMETHODIMP QueryVendorInfo(
+                    LPWSTR* pVendorInfo
+            );
+
+    // --- helper methods ---
+
+    // send an event notification to the filter graph if we know about it.
+    // returns S_OK if delivered, S_FALSE if the filter graph does not sink
+    // events, or an error otherwise.
+    HRESULT NotifyEvent(
+        long EventCode,
+        LONG_PTR EventParam1,
+        LONG_PTR EventParam2);
+
+    // return the filter graph we belong to
+    // (the stored pointer is returned as is; no AddRef is made here)
+    IFilterGraph *GetFilterGraph() {
+        return m_pGraph;
+    }
+
+    // Request reconnect
+    // pPin is the pin to reconnect
+    // pmt is the type to reconnect with - can be NULL
+    // Calls ReconnectEx on the filter graph
+    HRESULT ReconnectPin(IPin *pPin, AM_MEDIA_TYPE const *pmt);
+
+    // find out the current pin version (used by enumerators)
+    virtual LONG GetPinVersion();
+    void IncrementPinVersion();
+
+    // you need to supply these to access the pins from the enumerator
+    // and for default Stop and Pause/Run activation.
+    virtual int GetPinCount() PURE;
+    virtual CBasePin *GetPin(int n) PURE;
+
+    // --- IAMovieSetup methods ---
+
+    STDMETHODIMP Register();    // ask filter to register itself
+    STDMETHODIMP Unregister();  // and unregister itself
+
+    // --- setup helper methods ---
+    // (override to return filters setup data)
+    // the default returns NULL, i.e. no setup data
+
+    virtual LPAMOVIESETUP_FILTER GetSetupData(){ return NULL; }
+
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CBasePin
+//
+// Abstract class that supports the basics of IPin
+//=====================================================================
+//=====================================================================
+
+class  AM_NOVTABLE CBasePin : public CUnknown, public IPin, public IQualityControl
+{
+
+protected:
+
+    WCHAR *         m_pName;		        // This pin's name
+    IPin            *m_Connected;               // Pin we have connected to
+    PIN_DIRECTION   m_dir;                      // Direction of this pin
+    CCritSec        *m_pLock;                   // Object we use for locking
+    bool            m_bRunTimeError;            // Run time error generated
+    bool            m_bCanReconnectWhenActive;  // OK to reconnect when active
+    bool            m_bTryMyTypesFirst;         // When connecting enumerate
+                                                // this pin's types first
+    CBaseFilter    *m_pFilter;                  // Filter we were created by
+    IQualityControl *m_pQSink;                  // Target for Quality messages
+    LONG            m_TypeVersion;              // Holds current type version
+    CMediaType      m_mt;                       // Media type of connection
+
+    CRefTime        m_tStart;                   // time from NewSegment call
+    CRefTime        m_tStop;                    // time from NewSegment
+    double          m_dRate;                    // rate from NewSegment
+
+#ifdef DEBUG
+    LONG            m_cRef;                     // Ref count tracing
+#endif
+
+    // displays pin connection information
+
+#ifdef DEBUG
+    void DisplayPinInfo(IPin *pReceivePin);
+    void DisplayTypeInfo(IPin *pPin, const CMediaType *pmt);
+#else
+    void DisplayPinInfo(IPin *pReceivePin) {};
+    void DisplayTypeInfo(IPin *pPin, const CMediaType *pmt) {};
+#endif
+
+    // used to agree a media type for a pin connection
+
+    // given a specific media type, attempt a connection (includes
+    // checking that the type is acceptable to this pin)
+    HRESULT
+    AttemptConnection(
+        IPin* pReceivePin,      // connect to this pin
+        const CMediaType* pmt   // using this type
+    );
+
+    // try all the media types in this enumerator - for each that
+    // we accept, try to connect using ReceiveConnection.
+    HRESULT TryMediaTypes(
+                        IPin *pReceivePin,      // connect to this pin
+                        const CMediaType *pmt,        // proposed type from Connect
+                        IEnumMediaTypes *pEnum);    // try this enumerator
+
+    // establish a connection with a suitable mediatype. Needs to
+    // propose a media type if the pmt pointer is null or partially
+    // specified - use TryMediaTypes on both our and then the other pin's
+    // enumerator until we find one that works.
+    HRESULT AgreeMediaType(
+                        IPin *pReceivePin,      // connect to this pin
+                        const CMediaType *pmt);       // proposed type from Connect
+
+public:
+
+    CBasePin(
+        TCHAR *pObjectName,         // Object description
+        CBaseFilter *pFilter,       // Owning filter who knows about pins
+        CCritSec *pLock,            // Object who implements the lock
+        HRESULT *phr,               // General OLE return code
+        LPCWSTR pName,              // Pin name for us
+        PIN_DIRECTION dir);         // Either PINDIR_INPUT or PINDIR_OUTPUT
+#ifdef UNICODE
+    CBasePin(
+        CHAR *pObjectName,         // Object description
+        CBaseFilter *pFilter,       // Owning filter who knows about pins
+        CCritSec *pLock,            // Object who implements the lock
+        HRESULT *phr,               // General OLE return code
+        LPCWSTR pName,              // Pin name for us
+        PIN_DIRECTION dir);         // Either PINDIR_INPUT or PINDIR_OUTPUT
+#endif
+    virtual ~CBasePin();
+
+    DECLARE_IUNKNOWN
+
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void ** ppv);
+    STDMETHODIMP_(ULONG) NonDelegatingRelease();
+    STDMETHODIMP_(ULONG) NonDelegatingAddRef();
+
+    // --- IPin methods ---
+
+    // take lead role in establishing a connection. Media type pointer
+    // may be null, or may point to partially-specified mediatype
+    // (subtype or format type may be GUID_NULL).
+    STDMETHODIMP Connect(
+        IPin * pReceivePin,
+        const AM_MEDIA_TYPE *pmt   // optional media type
+    );
+
+    // (passive) accept a connection from another pin
+    STDMETHODIMP ReceiveConnection(
+        IPin * pConnector,      // this is the initiating connecting pin
+        const AM_MEDIA_TYPE *pmt   // this is the media type we will exchange
+    );
+
+    STDMETHODIMP Disconnect();
+
+    STDMETHODIMP ConnectedTo(IPin **pPin);
+
+    STDMETHODIMP ConnectionMediaType(AM_MEDIA_TYPE *pmt);
+
+    STDMETHODIMP QueryPinInfo(
+        PIN_INFO * pInfo
+    );
+
+    STDMETHODIMP QueryDirection(
+    	PIN_DIRECTION * pPinDir
+    );
+
+    STDMETHODIMP QueryId(
+        LPWSTR * Id
+    );
+
+    // does the pin support this media type
+    STDMETHODIMP QueryAccept(
+        const AM_MEDIA_TYPE *pmt
+    );
+
+    // return an enumerator for this pins preferred media types
+    STDMETHODIMP EnumMediaTypes(
+        IEnumMediaTypes **ppEnum
+    );
+
+    // return an array of IPin* - the pins that this pin internally connects to
+    // All pins put in the array must be AddReffed (but no others)
+    // Errors: "Can't say" - FAIL, not enough slots - return S_FALSE
+    // Default: return E_NOTIMPL
+    // The filter graph will interpret NOT_IMPL as any input pin connects to
+    // all visible output pins and vice versa.
+    // apPin can be NULL if nPin==0 (not otherwise).
+    STDMETHODIMP QueryInternalConnections(
+        IPin* *apPin,     // caller-supplied array of IPin* slots
+        ULONG *nPin       // in: number of slots, out: number of pins
+    )
+    {
+        // Default implementation: neither argument is touched and the
+        // call simply reports "not implemented".
+        return E_NOTIMPL;
+    }
+
+    // Called when no more data will be sent
+    STDMETHODIMP EndOfStream(void);
+
+    // Begin/EndFlush still PURE
+
+    // NewSegment notifies of the start/stop/rate applying to the data
+    // about to be received. Default implementation records data and
+    // returns S_OK.
+    // Override this to pass downstream.
+    STDMETHODIMP NewSegment(
+                    REFERENCE_TIME tStart,
+                    REFERENCE_TIME tStop,
+                    double dRate);
+
+    //================================================================================
+    // IQualityControl methods
+    //================================================================================
+
+    STDMETHODIMP Notify(IBaseFilter * pSender, Quality q);
+
+    STDMETHODIMP SetSink(IQualityControl * piqc);
+
+    // --- helper methods ---
+
+    // Returns true if the pin is connected. false otherwise.
+    BOOL IsConnected(void)
+    {
+        // Connected exactly when a peer pin pointer is held.
+        return (NULL != m_Connected);
+    }
+    // Return the pin this is connected to (if any)
+    IPin * GetConnected()
+    {
+        // The stored pointer is handed back as-is; no AddRef is performed
+        // here.
+        return m_Connected;
+    }
+
+    // Check if our filter is currently stopped
+    BOOL IsStopped()
+    {
+        // Reads the owning filter's state directly.
+        const bool bStopped = (State_Stopped == m_pFilter->m_State);
+        return bStopped;
+    }
+
+    // find out the current type version (used by enumerators)
+    virtual LONG GetMediaTypeVersion();
+    void IncrementTypeVersion();
+
+    // switch the pin to active (paused or running) mode
+    // not an error to call this if already active
+    virtual HRESULT Active(void);
+
+    // switch the pin to inactive state - may already be inactive
+    virtual HRESULT Inactive(void);
+
+    // Notify of Run() from filter
+    virtual HRESULT Run(REFERENCE_TIME tStart);
+
+    // check if the pin can support this specific proposed type and format
+    virtual HRESULT CheckMediaType(const CMediaType *) PURE;
+
+    // set the connection to use this format (previously agreed)
+    virtual HRESULT SetMediaType(const CMediaType *);
+
+    // check that the connection is ok before verifying it
+    // can be overridden eg to check what interfaces will be supported.
+    virtual HRESULT CheckConnect(IPin *);
+
+    // Set and release resources required for a connection
+    virtual HRESULT BreakConnect();
+    virtual HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // returns the preferred formats for a pin
+    virtual HRESULT GetMediaType(int iPosition,CMediaType *pMediaType);
+
+    // access to NewSegment values
+    REFERENCE_TIME CurrentStopTime()
+    {
+        // Stop time held in m_tStop (the value from NewSegment).
+        const REFERENCE_TIME rtStop = m_tStop;
+        return rtStop;
+    }
+    REFERENCE_TIME CurrentStartTime()
+    {
+        // Start time held in m_tStart (the value from NewSegment).
+        const REFERENCE_TIME rtStart = m_tStart;
+        return rtStart;
+    }
+    double CurrentRate()
+    {
+        // Rate held in m_dRate (the value from NewSegment).
+        const double dCurrentRate = m_dRate;
+        return dCurrentRate;
+    }
+
+    //  Access name
+    LPWSTR Name()
+    {
+        // Hands back the stored name pointer itself, not a copy.
+        return m_pName;
+    }
+
+    //  Can reconnect when active?
+    // Record whether this pin may be reconnected while active.
+    void SetReconnectWhenActive(bool bCanReconnect) { m_bCanReconnectWhenActive = bCanReconnect; }
+
+    // Report the flag last stored by SetReconnectWhenActive().
+    bool CanReconnectWhenActive() { return m_bCanReconnectWhenActive; }
+
+protected:
+    STDMETHODIMP DisconnectInternal();
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CEnumPins
+//
+// Pin enumerator class that works by calling CBaseFilter. This interface
+// is provided by CBaseFilter::EnumPins and calls GetPinCount() and
+// GetPin() to enumerate existing pins. Needs to be a separate object so
+// that it can be cloned (creating an existing object at the same
+// position in the enumeration)
+//
+//=====================================================================
+//=====================================================================
+
+class CEnumPins : public IEnumPins      // The interface we support
+{
+    // Enumerator state. m_Version is compared with the filter's
+    // GetPinVersion() in AreWeOutOfSync() to detect a stale enumerator.
+    int m_Position;                 // Current ordinal position
+    int m_PinCount;                 // Number of pins available
+    CBaseFilter *m_pFilter;         // The filter who owns us
+    LONG m_Version;                 // Pin version information
+    LONG m_cRef;                    // Reference count (presumably driven by
+                                    // AddRef/Release - TODO confirm)
+
+    typedef CGenericList<CBasePin> CPinList;
+
+    // These pointers have not been AddRef'ed and so they should not be
+    // dereferenced.  They are merely kept to identify which pins have
+    // already been enumerated.
+    CPinList m_PinCache;
+
+#ifdef DEBUG
+    DWORD m_dwCookie;               // Debug builds only; use not visible here
+#endif
+
+    /* If while we are retrieving a pin for example from the filter an error
+       occurs we assume that our internal state is stale with respect to the
+       filter (someone may have deleted all the pins). We can check before
+       starting whether or not the operation is likely to fail by asking the
+       filter what its current version number is. If the filter has not
+       overridden the GetPinVersion method then this will always match */
+
+    // Returns TRUE when the filter's pin version differs from the version
+    // stored in m_Version, FALSE when they match.
+    BOOL AreWeOutOfSync() {
+        return (m_pFilter->GetPinVersion() == m_Version ? FALSE : TRUE);
+    };
+
+    /* This method performs the same operations as Reset, except it does not
+       clear the cache of pins already enumerated. */
+
+    STDMETHODIMP Refresh();
+
+public:
+
+    // pFilter   - the filter whose pins are enumerated.
+    // pEnumPins - presumably the enumerator being cloned, or NULL for a
+    //             fresh enumeration - TODO confirm against the definition.
+    CEnumPins(
+        CBaseFilter *pFilter,
+        CEnumPins *pEnumPins);
+
+    virtual ~CEnumPins();
+
+    // IUnknown
+    STDMETHODIMP QueryInterface(REFIID riid, void **ppv);
+    STDMETHODIMP_(ULONG) AddRef();
+    STDMETHODIMP_(ULONG) Release();
+
+    // IEnumPins
+    STDMETHODIMP Next(
+        ULONG cPins,         // place this many pins...
+        IPin ** ppPins,      // ...in this array of IPin*
+        ULONG * pcFetched    // actual count fetched is returned here
+    );
+
+    STDMETHODIMP Skip(ULONG cPins);
+    STDMETHODIMP Reset();
+    STDMETHODIMP Clone(IEnumPins **ppEnum);
+
+
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CEnumMediaTypes
+//
+// Enumerates the preferred formats for input and output pins
+//=====================================================================
+//=====================================================================
+
+class CEnumMediaTypes : public IEnumMediaTypes    // The interface we support
+{
+    // Enumerator state. m_Version is compared with the pin's
+    // GetMediaTypeVersion() in AreWeOutOfSync() to detect a stale enumerator.
+    int m_Position;           // Current ordinal position
+    CBasePin *m_pPin;         // The pin who owns us
+    LONG m_Version;           // Media type version value
+    LONG m_cRef;              // Reference count (presumably driven by
+                              // AddRef/Release - TODO confirm)
+#ifdef DEBUG
+    DWORD m_dwCookie;         // Debug builds only; use not visible here
+#endif
+
+    /* The media types a filter supports can be quite dynamic so we add to
+       the general IEnumXXXX interface the ability to be signaled when they
+       change via an event handle the connected filter supplies. Until the
+       Reset method is called after the state changes all further calls to
+       the enumerator (except Reset) will return E_UNEXPECTED error code */
+
+    // Returns TRUE when the pin's media type version differs from the
+    // version stored in m_Version, FALSE when they match.
+    BOOL AreWeOutOfSync() {
+        return (m_pPin->GetMediaTypeVersion() == m_Version ? FALSE : TRUE);
+    };
+
+public:
+
+    // pPin            - the pin whose preferred media types are enumerated.
+    // pEnumMediaTypes - presumably the enumerator being cloned, or NULL for
+    //                   a fresh enumeration - TODO confirm against the
+    //                   definition.
+    CEnumMediaTypes(
+        CBasePin *pPin,
+        CEnumMediaTypes *pEnumMediaTypes);
+
+    virtual ~CEnumMediaTypes();
+
+    // IUnknown
+    STDMETHODIMP QueryInterface(REFIID riid, void **ppv);
+    STDMETHODIMP_(ULONG) AddRef();
+    STDMETHODIMP_(ULONG) Release();
+
+    // IEnumMediaTypes
+    STDMETHODIMP Next(
+        ULONG cMediaTypes,          // place this many media types...
+        AM_MEDIA_TYPE ** ppMediaTypes,  // ...in this array
+        ULONG * pcFetched           // actual count passed
+    );
+
+    STDMETHODIMP Skip(ULONG cMediaTypes);
+    STDMETHODIMP Reset();
+    STDMETHODIMP Clone(IEnumMediaTypes **ppEnum);
+};
+
+
+
+
+//=====================================================================
+//=====================================================================
+// Defines CBaseOutputPin
+//
+// class derived from CBasePin that can pass buffers to a connected pin
+// that supports IMemInputPin. Supports IPin.
+//
+// Derive your output pin from this.
+//
+//=====================================================================
+//=====================================================================
+
+class  AM_NOVTABLE CBaseOutputPin : public CBasePin
+{
+    // Output-pin specialisation of CBasePin. Still abstract:
+    // DecideBufferSize() is PURE and must be supplied by the derived pin.
+
+protected:
+
+    IMemAllocator *m_pAllocator;      // allocator used by this pin (see
+                                      // DecideAllocator / InitAllocator)
+    IMemInputPin *m_pInputPin;        // interface on the downstream input pin
+                                      // set up in CheckConnect when we connect.
+
+public:
+
+    // Parameters mirror the CBasePin constructor minus the direction.
+    CBaseOutputPin(
+        TCHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+#ifdef UNICODE
+    // Extra overload taking a narrow object name; only declared in
+    // UNICODE builds, where TCHAR is not CHAR.
+    CBaseOutputPin(
+        CHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+#endif
+    // override CompleteConnect() so we can negotiate an allocator
+    virtual HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // negotiate the allocator and its buffer size/count and other properties
+    // Calls DecideBufferSize to set properties
+    virtual HRESULT DecideAllocator(IMemInputPin * pPin, IMemAllocator ** pAlloc);
+
+    // override this to set the buffer size and count. Return an error
+    // if the size/count is not to your liking.
+    // The allocator properties passed in are those requested by the
+    // input pin - use eg the alignment and prefix members if you have
+    // no preference on these.
+    virtual HRESULT DecideBufferSize(
+        IMemAllocator * pAlloc,
+        ALLOCATOR_PROPERTIES * ppropInputRequest
+    ) PURE;
+
+    // returns an empty sample buffer from the allocator
+    virtual HRESULT GetDeliveryBuffer(IMediaSample ** ppSample,
+                                      REFERENCE_TIME * pStartTime,
+                                      REFERENCE_TIME * pEndTime,
+                                      DWORD dwFlags);
+
+    // deliver a filled-in sample to the connected input pin
+    // note - you need to release it after calling this. The receiving
+    // pin will addref the sample if it needs to hold it beyond the
+    // call.
+    virtual HRESULT Deliver(IMediaSample *);
+
+    // override this to control the connection
+    virtual HRESULT InitAllocator(IMemAllocator **ppAlloc);
+    HRESULT CheckConnect(IPin *pPin);
+    HRESULT BreakConnect();
+
+    // override to call Commit and Decommit
+    HRESULT Active(void);
+    HRESULT Inactive(void);
+
+    // we have a default handling of EndOfStream which is to return
+    // an error, since this should be called on input pins only
+    STDMETHODIMP EndOfStream(void);
+
+    // called from elsewhere in our filter to pass EOS downstream to
+    // our connected input pin
+    virtual HRESULT DeliverEndOfStream(void);
+
+    // same for Begin/EndFlush - we handle Begin/EndFlush since it
+    // is an error on an output pin, and we have Deliver methods to
+    // call the methods on the connected pin
+    STDMETHODIMP BeginFlush(void);
+    STDMETHODIMP EndFlush(void);
+    virtual HRESULT DeliverBeginFlush(void);
+    virtual HRESULT DeliverEndFlush(void);
+
+    // deliver NewSegment to connected pin - you will need to
+    // override this if you queue any data in your output pin.
+    virtual HRESULT DeliverNewSegment(
+                        REFERENCE_TIME tStart,
+                        REFERENCE_TIME tStop,
+                        double dRate);
+
+    //================================================================================
+    // IQualityControl methods
+    //================================================================================
+
+    // All inherited from CBasePin and not overridden here.
+    // STDMETHODIMP Notify(IBaseFilter * pSender, Quality q);
+    // STDMETHODIMP SetSink(IQualityControl * piqc);
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CBaseInputPin
+//
+// derive your standard input pin from this.
+// you need to supply GetMediaType and CheckConnect etc (see CBasePin),
+// and you need to supply Receive to do something more useful.
+//
+//=====================================================================
+//=====================================================================
+
+class AM_NOVTABLE CBaseInputPin : public CBasePin,
+                                  public IMemInputPin
+{
+    // Input-pin specialisation of CBasePin that additionally exposes
+    // IMemInputPin (allocator negotiation and Receive).
+
+protected:
+
+    IMemAllocator *m_pAllocator;    // Default memory allocator
+
+    // allocator is read-only, so received samples
+    // cannot be modified (probably only relevant to in-place
+    // transforms)
+    BYTE m_bReadOnly;
+
+    // in flushing state (between BeginFlush and EndFlush)
+    // if TRUE, all Receives are returned with S_FALSE
+    BYTE m_bFlushing;
+
+    // Sample properties - initialized in Receive
+    AM_SAMPLE2_PROPERTIES m_SampleProps;
+
+public:
+
+    // Parameters mirror the CBasePin constructor minus the direction.
+    CBaseInputPin(
+        TCHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+#ifdef UNICODE
+    // Extra overload taking a narrow object name; only declared in
+    // UNICODE builds, where TCHAR is not CHAR.
+    CBaseInputPin(
+        CHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+#endif
+    virtual ~CBaseInputPin();
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    // return the allocator interface that this input pin
+    // would like the output pin to use
+    STDMETHODIMP GetAllocator(IMemAllocator ** ppAllocator);
+
+    // tell the input pin which allocator the output pin is actually
+    // going to use.
+    STDMETHODIMP NotifyAllocator(
+                    IMemAllocator * pAllocator,
+                    BOOL bReadOnly);
+
+    // do something with this media sample
+    STDMETHODIMP Receive(IMediaSample *pSample);
+
+    // do something with these media samples
+    STDMETHODIMP ReceiveMultiple (
+        IMediaSample **pSamples,
+        long nSamples,
+        long *nSamplesProcessed);
+
+    // See if Receive() blocks
+    STDMETHODIMP ReceiveCanBlock();
+
+    // Default handling for BeginFlush - call at the beginning
+    // of your implementation (makes sure that all Receive calls
+    // fail). After calling this, you need to free any queued data
+    // and then call downstream.
+    STDMETHODIMP BeginFlush(void);
+
+    // default handling for EndFlush - call at end of your implementation
+    // - before calling this, ensure that there is no queued data and no thread
+    // pushing any more without a further receive, then call downstream,
+    // then call this method to clear the m_bFlushing flag and re-enable
+    // receives
+    STDMETHODIMP EndFlush(void);
+
+    // this method is optional (can return E_NOTIMPL).
+    // default implementation returns E_NOTIMPL. Override if you have
+    // specific alignment or prefix needs, but could use an upstream
+    // allocator
+    STDMETHODIMP GetAllocatorRequirements(ALLOCATOR_PROPERTIES*pProps);
+
+    // Release the pin's allocator.
+    HRESULT BreakConnect();
+
+    // helper method to check the read-only flag
+    // (returns the BYTE member m_bReadOnly widened to BOOL)
+    BOOL IsReadOnly() {
+        return m_bReadOnly;
+    };
+
+    // helper method to see if we are flushing
+    // (returns the BYTE member m_bFlushing widened to BOOL)
+    BOOL IsFlushing() {
+        return m_bFlushing;
+    };
+
+    //  Override this for checking whether it's OK to process samples
+    //  Also call this from EndOfStream.
+    virtual HRESULT CheckStreaming();
+
+    // Pass a Quality notification on to the appropriate sink
+    HRESULT PassNotify(Quality& q);
+
+
+    //================================================================================
+    // IQualityControl methods (from CBasePin)
+    //================================================================================
+
+    STDMETHODIMP Notify(IBaseFilter * pSender, Quality q);
+
+    // no need to override:
+    // STDMETHODIMP SetSink(IQualityControl * piqc);
+
+
+    // switch the pin to inactive state - may already be inactive
+    virtual HRESULT Inactive(void);
+
+    // Return sample properties pointer
+    // Asserts (debug builds) that m_SampleProps.cbData is non-zero before
+    // handing out the address of the member.
+    AM_SAMPLE2_PROPERTIES * SampleProps() {
+        ASSERT(m_SampleProps.cbData != 0);
+        return &m_SampleProps;
+    }
+
+};
+
+///////////////////////////////////////////////////////////////////////////
+// CDynamicOutputPin
+//
+
+class CDynamicOutputPin : public CBaseOutputPin,
+                          public IPinFlowControl
+{
+    // Output pin that additionally implements IPinFlowControl::Block() and
+    // tracks, via StartUsingOutputPin()/StopUsingOutputPin(), when the
+    // streaming thread is using the pin.
+public:
+#ifdef UNICODE
+    // Extra overload taking a narrow object name; only declared in
+    // UNICODE builds, where TCHAR is not CHAR.
+    CDynamicOutputPin(
+        CHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+#endif
+
+    CDynamicOutputPin(
+        TCHAR *pObjectName,
+        CBaseFilter *pFilter,
+        CCritSec *pLock,
+        HRESULT *phr,
+        LPCWSTR pName);
+
+    ~CDynamicOutputPin();
+
+    // IUnknown Methods
+    DECLARE_IUNKNOWN
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    // IPin Methods
+    STDMETHODIMP Disconnect(void);
+
+    // IPinFlowControl Methods
+    STDMETHODIMP Block(DWORD dwBlockFlags, HANDLE hEvent);
+
+    //  Set graph config info
+    void SetConfigInfo(IGraphConfig *pGraphConfig, HANDLE hStopEvent);
+
+    // Debug-only overrides. Their inline definitions (after this class)
+    // assert StreamingThreadUsingOutputPin() and then forward to the
+    // CBaseOutputPin implementation.
+    #ifdef DEBUG
+    virtual HRESULT Deliver(IMediaSample *pSample);
+    virtual HRESULT DeliverEndOfStream(void);
+    virtual HRESULT DeliverNewSegment(REFERENCE_TIME tStart, REFERENCE_TIME tStop, double dRate);
+    #endif // DEBUG
+
+    HRESULT DeliverBeginFlush(void);
+    HRESULT DeliverEndFlush(void);
+
+    HRESULT Inactive(void);
+    HRESULT Active(void);
+    virtual HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // Bracket every use of the output pin by the streaming thread with
+    // StartUsingOutputPin()/StopUsingOutputPin(); CAutoUsingOutputPin
+    // wraps this pairing.
+    virtual HRESULT StartUsingOutputPin(void);
+    virtual void StopUsingOutputPin(void);
+    virtual bool StreamingThreadUsingOutputPin(void);
+
+    HRESULT ChangeOutputFormat
+        (
+        const AM_MEDIA_TYPE *pmt,
+        REFERENCE_TIME tSegmentStart,
+        REFERENCE_TIME tSegmentStop,
+        double dSegmentRate
+        );
+    HRESULT ChangeMediaType(const CMediaType *pmt);
+    HRESULT DynamicReconnect(const CMediaType *pmt);
+
+protected:
+    HRESULT SynchronousBlockOutputPin(void);
+    HRESULT AsynchronousBlockOutputPin(HANDLE hNotifyCallerPinBlockedEvent);
+    HRESULT UnblockOutputPin(void);
+
+    void BlockOutputPin(void);
+    void ResetBlockState(void);
+
+    static HRESULT WaitEvent(HANDLE hEvent);
+
+    // Values stored in m_BlockState.
+    enum BLOCK_STATE
+    {
+        NOT_BLOCKED,
+        PENDING,
+        BLOCKED
+    };
+
+    // This lock should be held when the following class members are
+    // being used: m_hNotifyCallerPinBlockedEvent, m_BlockState,
+    // m_dwBlockCallerThreadID and m_dwNumOutstandingOutputPinUsers.
+    CCritSec m_BlockStateLock;
+
+    // This event should be signaled when the output pin is
+    // not blocked.  This is a manual reset event.  For more
+    // information on events, see the documentation for
+    // CreateEvent() in the Windows SDK.
+    HANDLE m_hUnblockOutputPinEvent;
+
+    // This event will be signaled when the block operation succeeds or
+    // when the user cancels the block operation.  The block operation
+    // can be canceled by calling IPinFlowControl::Block( 0, NULL )
+    // while the block operation is pending.
+    HANDLE m_hNotifyCallerPinBlockedEvent;
+
+    // The state of the current block operation.
+    BLOCK_STATE m_BlockState;
+
+    // The ID of the thread which last called IPinFlowControl::Block().
+    // For more information on thread IDs, see the documentation for
+    // GetCurrentThreadId() in the Windows SDK.
+    DWORD m_dwBlockCallerThreadID;
+
+    // The number of times StartUsingOutputPin() has been successfully
+    // called and a corresponding call to StopUsingOutputPin() has not
+    // been made.  When this variable is greater than 0, the streaming
+    // thread is calling IPin::NewSegment(), IPin::EndOfStream(),
+    // IMemInputPin::Receive() or IMemInputPin::ReceiveMultiple().  The
+    // streaming thread could also be calling: DynamicReconnect(),
+    // ChangeMediaType() or ChangeOutputFormat().  The output pin cannot
+    // be blocked while the output pin is being used.
+    DWORD m_dwNumOutstandingOutputPinUsers;
+
+    // This event should be set when the IMediaFilter::Stop() is called.
+    // This is a manual reset event.  It is also set when the output pin
+    // delivers a flush to the connected input pin.
+    HANDLE m_hStopEvent;
+    IGraphConfig* m_pGraphConfig;
+
+    // TRUE if the output pin's allocator's samples are read only.
+    // Otherwise FALSE.  For more information, see the documentation
+    // for IMemInputPin::NotifyAllocator().
+    BOOL m_bPinUsesReadOnlyAllocator;
+
+private:
+    HRESULT Initialize(void);
+    HRESULT ChangeMediaTypeHelper(const CMediaType *pmt);
+
+    #ifdef DEBUG
+    void AssertValid(void);
+    #endif // DEBUG
+};
+
+// Scope guard for CDynamicOutputPin usage: the constructor calls
+// StartUsingOutputPin() and, only if that call succeeded, the destructor
+// calls the matching StopUsingOutputPin().
+class CAutoUsingOutputPin
+{
+public:
+    // pOutputPin - pin to mark as in use; must not be NULL.
+    // phr        - must point to an HRESULT already set to S_OK; it is
+    //              overwritten with the failure code when
+    //              StartUsingOutputPin() fails and left untouched otherwise.
+    CAutoUsingOutputPin( CDynamicOutputPin* pOutputPin, HRESULT* phr );
+    ~CAutoUsingOutputPin();
+
+private:
+    // Not copyable: a copy would issue a second StopUsingOutputPin() for a
+    // single StartUsingOutputPin(). Declared but intentionally never
+    // defined, so accidental copies fail to compile or link.
+    CAutoUsingOutputPin( const CAutoUsingOutputPin& );
+    CAutoUsingOutputPin& operator=( const CAutoUsingOutputPin& );
+
+    // Pin to release in the destructor; stays NULL when
+    // StartUsingOutputPin() failed.
+    CDynamicOutputPin* m_pOutputPin;
+};
+
+inline CAutoUsingOutputPin::CAutoUsingOutputPin( CDynamicOutputPin* pOutputPin, HRESULT* phr ) :
+    m_pOutputPin(NULL)
+{
+    // Preconditions (checked in debug builds only): both pointers are
+    // valid and the caller pre-initialised *phr to S_OK.
+    ASSERT( pOutputPin != NULL );
+    ASSERT( phr != NULL );
+    ASSERT( *phr == S_OK );
+
+    const HRESULT hrStart = pOutputPin->StartUsingOutputPin();
+    if( !FAILED( hrStart ) )
+    {
+        // Remember the pin so the destructor releases it.
+        m_pOutputPin = pOutputPin;
+    }
+    else
+    {
+        // Report the failure; m_pOutputPin stays NULL so the destructor
+        // does nothing.
+        *phr = hrStart;
+    }
+}
+
+inline CAutoUsingOutputPin::~CAutoUsingOutputPin()
+{
+    // Nothing to undo when the constructor never acquired the pin.
+    if( !m_pOutputPin )
+        return;
+
+    m_pOutputPin->StopUsingOutputPin();
+}
+
+#ifdef DEBUG
+
+inline HRESULT CDynamicOutputPin::Deliver(IMediaSample *pSample)
+{
+    // Debug-only guard: StartUsingOutputPin() must have been called by
+    // the caller before delivering.
+    ASSERT(this->StreamingThreadUsingOutputPin());
+
+    // Apart from the assertion, defer entirely to the base class.
+    const HRESULT hrDeliver = CBaseOutputPin::Deliver(pSample);
+    return hrDeliver;
+}
+
+inline HRESULT CDynamicOutputPin::DeliverEndOfStream(void)
+{
+    // Debug-only guard: StartUsingOutputPin() must have been called by
+    // the caller before signalling end of stream.
+    ASSERT(this->StreamingThreadUsingOutputPin());
+
+    // Apart from the assertion, defer entirely to the base class.
+    const HRESULT hrEndOfStream = CBaseOutputPin::DeliverEndOfStream();
+    return hrEndOfStream;
+}
+
+inline HRESULT CDynamicOutputPin::DeliverNewSegment(REFERENCE_TIME tStart, REFERENCE_TIME tStop, double dRate)
+{
+    // Debug-only guard: StartUsingOutputPin() must have been called by
+    // the caller before announcing a new segment.
+    ASSERT(this->StreamingThreadUsingOutputPin());
+
+    // Apart from the assertion, defer entirely to the base class.
+    const HRESULT hrNewSegment = CBaseOutputPin::DeliverNewSegment(tStart, tStop, dRate);
+    return hrNewSegment;
+}
+
+#endif // DEBUG
+
+//=====================================================================
+//=====================================================================
+// Memory allocators
+//
+// the shared memory transport between pins requires the input pin
+// to provide a memory allocator that can provide sample objects. A
+// sample object supports the IMediaSample interface.
+//
+// CBaseAllocator handles the management of free and busy samples. It
+// allocates CMediaSample objects. CBaseAllocator is an abstract class:
+// in particular it has no method of initializing the list of free
+// samples. CMemAllocator is derived from CBaseAllocator and initializes
+// the list of samples using memory from the standard IMalloc interface.
+//
+// If you want your buffers to live in some special area of memory,
+// derive your allocator object from CBaseAllocator. If you derive your
+// IMemInputPin interface object from CBaseInputPin, you will get
+// CMemAllocator-based allocation etc for free and will just need to
+// supply the Receive handling, and media type / format negotiation.
+//=====================================================================
+//=====================================================================
+
+
+//=====================================================================
+//=====================================================================
+// Defines CMediaSample
+//
+// an object of this class supports IMediaSample and represents a buffer
+// for media data with some associated properties. Releasing it returns
+// it to a freelist managed by a CBaseAllocator derived object.
+//=====================================================================
+//=====================================================================
+
+class CMediaSample : public IMediaSample2    // The interface we support
+{
+    // Buffer-plus-properties object handed out by a CBaseAllocator, which
+    // is declared a friend below.
+
+protected:
+
+    friend class CBaseAllocator;
+
+    /*  Values for dwFlags - these are used for backward compatibility
+        only now - use AM_SAMPLE_xxx
+    */
+    enum { Sample_SyncPoint       = 0x01,   /* Is this a sync point */
+           Sample_Preroll         = 0x02,   /* Is this a preroll sample */
+           Sample_Discontinuity   = 0x04,   /* Set if start of new segment */
+           Sample_TypeChanged     = 0x08,   /* Has the type changed */
+           Sample_TimeValid       = 0x10,   /* Set if time is valid */
+           Sample_MediaTimeValid  = 0x20,   /* Is the media time valid */
+           Sample_TimeDiscontinuity = 0x40, /* Time discontinuity */
+           Sample_StopValid       = 0x100,  /* Stop time valid */
+           Sample_ValidFlags      = 0x1FF
+         };
+
+    /* Properties, the media sample class can be a container for a format
+       change in which case we take a copy of a type through the SetMediaType
+       interface function and then return it when GetMediaType is called. As
+       we do no internal processing on it we leave it as a pointer */
+
+    DWORD            m_dwFlags;         /* Flags for this sample */
+                                        /* Type specific flags are packed
+                                           into the top word
+                                        */
+    DWORD            m_dwTypeSpecificFlags; /* Media type specific flags */
+    LPBYTE           m_pBuffer;         /* Pointer to the complete buffer */
+    LONG             m_lActual;         /* Length of data in this sample */
+    LONG             m_cbBuffer;        /* Size of the buffer */
+    CBaseAllocator  *m_pAllocator;      /* The allocator who owns us */
+    CMediaSample     *m_pNext;          /* Chaining in free list */
+    REFERENCE_TIME   m_Start;           /* Start sample time */
+    REFERENCE_TIME   m_End;             /* End sample time */
+    LONGLONG         m_MediaStart;      /* Real media start position */
+    LONG             m_MediaEnd;        /* A difference to get the end */
+    AM_MEDIA_TYPE    *m_pMediaType;     /* Media type change data */
+    DWORD            m_dwStreamId;      /* Stream id */
+public:
+    LONG             m_cRef;            /* Reference count */
+
+
+public:
+
+    // pName      - object name/description.
+    // pAllocator - the allocator that owns this sample.
+    // phr        - receives a construction result code.
+    // pBuffer    - optional data buffer (defaults to NULL).
+    // length     - size of pBuffer (defaults to 0).
+    CMediaSample(
+        TCHAR *pName,
+        CBaseAllocator *pAllocator,
+        HRESULT *phr,
+        LPBYTE pBuffer = NULL,
+        LONG length = 0);
+#ifdef UNICODE
+    // Extra overload taking a narrow name; only declared in UNICODE
+    // builds, where TCHAR is not CHAR.
+    CMediaSample(
+        CHAR *pName,
+        CBaseAllocator *pAllocator,
+        HRESULT *phr,
+        LPBYTE pBuffer = NULL,
+        LONG length = 0);
+#endif
+
+    virtual ~CMediaSample();
+
+    /* Note the media sample does not delegate to its owner */
+
+    STDMETHODIMP QueryInterface(REFIID riid, void **ppv);
+    STDMETHODIMP_(ULONG) AddRef();
+    STDMETHODIMP_(ULONG) Release();
+
+    // set the buffer pointer and length. Used by allocators that
+    // want variable sized pointers or pointers into already-read data.
+    // This is only available through a CMediaSample* not an IMediaSample*
+    // and so cannot be changed by clients.
+    HRESULT SetPointer(BYTE * ptr, LONG cBytes);
+
+    // Get me a read/write pointer to this buffer's memory.
+    STDMETHODIMP GetPointer(BYTE ** ppBuffer);
+
+    STDMETHODIMP_(LONG) GetSize(void);
+
+    // get the stream time at which this sample should start and finish.
+    STDMETHODIMP GetTime(
+        REFERENCE_TIME * pTimeStart,     // put time here
+        REFERENCE_TIME * pTimeEnd
+    );
+
+    // Set the stream time at which this sample should start and finish.
+    STDMETHODIMP SetTime(
+        REFERENCE_TIME * pTimeStart,     // take start time from here
+        REFERENCE_TIME * pTimeEnd
+    );
+    STDMETHODIMP IsSyncPoint(void);
+    STDMETHODIMP SetSyncPoint(BOOL bIsSyncPoint);
+    STDMETHODIMP IsPreroll(void);
+    STDMETHODIMP SetPreroll(BOOL bIsPreroll);
+
+    STDMETHODIMP_(LONG) GetActualDataLength(void);
+    STDMETHODIMP SetActualDataLength(LONG lActual);
+
+    // these allow for limited format changes in band
+
+    STDMETHODIMP GetMediaType(AM_MEDIA_TYPE **ppMediaType);
+    STDMETHODIMP SetMediaType(AM_MEDIA_TYPE *pMediaType);
+
+    // returns S_OK if there is a discontinuity in the data (this sample is
+    // not a continuation of the previous stream of data
+    // - there has been a seek).
+    STDMETHODIMP IsDiscontinuity(void);
+    // set the discontinuity property - TRUE if this sample is not a
+    // continuation, but a new sample after a seek.
+    STDMETHODIMP SetDiscontinuity(BOOL bDiscontinuity);
+
+    // get the media times for this sample
+    STDMETHODIMP GetMediaTime(
+    	LONGLONG * pTimeStart,
+	LONGLONG * pTimeEnd
+    );
+
+    // Set the media times for this sample
+    STDMETHODIMP SetMediaTime(
+    	LONGLONG * pTimeStart,
+	LONGLONG * pTimeEnd
+    );
+
+    // Set and get properties (IMediaSample2)
+    STDMETHODIMP GetProperties(
+        DWORD cbProperties,
+        BYTE * pbProperties
+    );
+
+    STDMETHODIMP SetProperties(
+        DWORD cbProperties,
+        const BYTE * pbProperties
+    );
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CBaseAllocator
+//
+// Abstract base class that manages a list of media samples
+//
+// This class provides support for getting buffers from the free list,
+// including handling of commit and (asynchronous) decommit.
+//
+// Derive from this class and override the Alloc and Free functions to
+// allocate your CMediaSample (or derived) objects and add them to the
+// free list, preparing them as necessary.
+//=====================================================================
+//=====================================================================
+
+class AM_NOVTABLE CBaseAllocator : public CUnknown,// A non delegating IUnknown
+                       public IMemAllocatorCallbackTemp, // The interface we support
+                       public CCritSec             // Provides object locking
+{
+    class CSampleList;
+    friend class CSampleList;
+
+    /*  Trick to get at protected member in CMediaSample: returns a reference to pSample->m_pNext so CSampleList can read and relink it */
+    static CMediaSample * &NextSample(CMediaSample *pSample)
+    {
+        return pSample->m_pNext;
+    };
+
+    /*  Mini list class for the free list: samples are chained through their next pointers and counted in m_nOnList */
+    class CSampleList
+    {
+    public:
+        CSampleList() : m_List(NULL), m_nOnList(0) {};
+#ifdef DEBUG
+        ~CSampleList()
+        {
+            ASSERT(m_nOnList == 0);
+        };
+#endif
+        CMediaSample *Head() const { return m_List; };
+        CMediaSample *Next(CMediaSample *pSample) const { return CBaseAllocator::NextSample(pSample); };
+        int GetCount() const { return m_nOnList; };
+        void Add(CMediaSample *pSample)
+        {
+            ASSERT(pSample != NULL);
+            CBaseAllocator::NextSample(pSample) = m_List;
+            m_List = pSample;
+            m_nOnList++;
+        };
+        CMediaSample *RemoveHead()
+        {
+            CMediaSample *pSample = m_List;
+            if (pSample != NULL) {
+                m_List = CBaseAllocator::NextSample(m_List);
+                m_nOnList--;
+            }
+            return pSample;
+        };
+        void Remove(CMediaSample *pSample);
+
+    public:
+        CMediaSample *m_List;
+        int           m_nOnList;
+    };
+protected:
+
+    CSampleList m_lFree;        // Free list
+
+    /*  Note to overriders of CBaseAllocator.
+
+        We use a lazy signalling mechanism for waiting for samples.
+        This means we don't call the OS if no waits occur.
+
+        In order to implement this:
+
+        1. When a new sample is added to m_lFree call NotifySample() which
+           calls ReleaseSemaphore on m_hSem with a count of m_lWaiting and
+           sets m_lWaiting to 0.
+           This must all be done holding the allocator's critical section.
+
+        2. When waiting for a sample call SetWaiting() which increments
+           m_lWaiting BEFORE leaving the allocator's critical section.
+
+        3. Actually wait by calling WaitForSingleObject(m_hSem, INFINITE)
+           having left the allocator's critical section.  The effect of
+           this is to remove 1 from the semaphore's count.  You MUST call
+           this once having incremented m_lWaiting.
+
+        The following are then true when the critical section is not held :
+            (let nWaiting = number about to wait or waiting)
+
+            (1) if (m_lFree.GetCount() != 0) then (m_lWaiting == 0)
+            (2) m_lWaiting + Semaphore count == nWaiting
+
+        We would deadlock if
+           nWaiting != 0 &&
+           m_lFree.GetCount() != 0 &&
+           Semaphore count == 0
+
+           But from (1) if m_lFree.GetCount() != 0 then m_lWaiting == 0 so
+           from (2) Semaphore count == nWaiting (which is non-0) so the
+           deadlock can't happen.
+    */
+
+    HANDLE m_hSem;              // For signalling
+    long m_lWaiting;            // Waiting for a free element
+    long m_lCount;              // how many buffers we have agreed to provide
+    long m_lAllocated;          // how many buffers are currently allocated
+    long m_lSize;               // agreed size of each buffer
+    long m_lAlignment;          // agreed alignment
+    long m_lPrefix;             // agreed prefix (precedes GetPointer() value)
+    BOOL m_bChanged;            // Have the buffer requirements changed
+
+    // NOTE(review): original read "if true, we are decommitted and can't allocate memory"; the name suggests TRUE means committed - confirm
+    BOOL m_bCommitted;
+    // if true, the decommit has happened, but we haven't called Free yet
+    // as there are still outstanding buffers
+    BOOL m_bDecommitInProgress;
+
+    //  Notification interface
+    IMemAllocatorNotifyCallbackTemp *m_pNotify;
+
+    BOOL m_fEnableReleaseCallback;
+
+    // called to decommit the memory when the last buffer is freed
+    // pure virtual - need to override this
+    virtual void Free(void) PURE;
+
+    // override to allocate the memory when commit called
+    virtual HRESULT Alloc(void);
+
+public:
+
+    CBaseAllocator(
+        TCHAR *, LPUNKNOWN, HRESULT *,
+        BOOL bEvent = TRUE, BOOL fEnableReleaseCallback = FALSE);
+#ifdef UNICODE
+    CBaseAllocator(
+        CHAR *, LPUNKNOWN, HRESULT *,
+        BOOL bEvent = TRUE, BOOL fEnableReleaseCallback = FALSE);
+#endif
+    virtual ~CBaseAllocator();
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    STDMETHODIMP SetProperties(
+		    ALLOCATOR_PROPERTIES* pRequest,
+		    ALLOCATOR_PROPERTIES* pActual);
+
+    // return the properties actually being used on this allocator
+    STDMETHODIMP GetProperties(
+		    ALLOCATOR_PROPERTIES* pProps);
+
+    // override Commit to allocate memory. We handle the GetBuffer
+    // state changes
+    STDMETHODIMP Commit();
+
+    // override this to handle the memory freeing. We handle any outstanding
+    // GetBuffer calls
+    STDMETHODIMP Decommit();
+
+    // get container for a sample. Blocking, synchronous call to get the
+    // next free buffer (as represented by an IMediaSample interface).
+    // on return, the time etc properties will be invalid, but the buffer
+    // pointer and size will be correct. The two time parameters are
+    // optional and either may be NULL, they may alternatively be set to
+    // the start and end times the sample will have attached to it
+    // bPrevFramesSkipped is not used (used only by the video renderer's
+    // allocator where it affects quality management in direct draw).
+
+    STDMETHODIMP GetBuffer(IMediaSample **ppBuffer,
+                           REFERENCE_TIME * pStartTime,
+                           REFERENCE_TIME * pEndTime,
+                           DWORD dwFlags);
+
+    // final release of a CMediaSample will call this
+    STDMETHODIMP ReleaseBuffer(IMediaSample *pBuffer);
+    // obsolete:: virtual void PutOnFreeList(CMediaSample * pSample);
+
+    STDMETHODIMP SetNotify(IMemAllocatorNotifyCallbackTemp *pNotify);
+
+    STDMETHODIMP GetFreeCount(LONG *plBuffersFree);
+
+    // Notify that a sample is available
+    void NotifySample();
+
+    // Notify that we're waiting for a sample
+    void SetWaiting() { m_lWaiting++; };
+};
+
+
+//=====================================================================
+//=====================================================================
+// Defines CMemAllocator
+//
+// this is an allocator based on CBaseAllocator that allocates sample
+// buffers in main memory (from 'new'). You must call SetProperties
+// before calling Commit.
+//
+// we don't free the memory when going into Decommit state. The simplest
+// way to implement this without complicating CBaseAllocator is to
+// have a Free() function, called to go into decommit state, that does
+// nothing and a ReallyFree function called from our destructor that
+// actually frees the memory.
+//=====================================================================
+//=====================================================================
+
+//  Make me one from quartz.dll
+STDAPI CreateMemoryAllocator(IMemAllocator **ppAllocator);
+
+class CMemAllocator : public CBaseAllocator
+{
+
+protected:
+
+    LPBYTE m_pBuffer;   // combined memory for all buffers
+
+    // override to free the memory when decommit completes
+    // - we actually do nothing, and save the memory until deletion.
+    void Free(void);
+
+    // called from the destructor (and from Alloc if changing size/count) to
+    // actually free up the memory
+    void ReallyFree(void);
+
+    // overridden to allocate the memory when commit called
+    HRESULT Alloc(void);
+
+public:
+    /* This goes in the factory template table to create new instances */
+    static CUnknown *CreateInstance(LPUNKNOWN, HRESULT *);
+
+    STDMETHODIMP SetProperties(
+		    ALLOCATOR_PROPERTIES* pRequest,
+		    ALLOCATOR_PROPERTIES* pActual);
+
+    CMemAllocator(TCHAR *, LPUNKNOWN, HRESULT *);
+#ifdef UNICODE
+    CMemAllocator(CHAR *, LPUNKNOWN, HRESULT *);
+#endif
+    ~CMemAllocator();
+};
+
+// helper used by IAMovieSetup implementation
+STDAPI
+AMovieSetupRegisterFilter( const AMOVIESETUP_FILTER * const psetupdata
+                         , IFilterMapper *                  pIFM
+                         , BOOL                             bRegister  );
+
+
+///////////////////////////////////////////////////////////////////////////
+// ------------------------------------------------------------------------
+// ------------------------------------------------------------------------
+// ------------------------------------------------------------------------
+// ------------------------------------------------------------------------
+///////////////////////////////////////////////////////////////////////////
+
+#endif /* __FILTER__ */
+
+
+
diff --git a/plugins/GSdx_legacy/baseclasses/amvideo.cpp b/plugins/GSdx_legacy/baseclasses/amvideo.cpp
new file mode 100644
index 0000000000..35cba88d4c
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/amvideo.cpp
@@ -0,0 +1,275 @@
+//------------------------------------------------------------------------------
+// File: AMVideo.cpp
+//
+// Desc: DirectShow base classes - implements helper functions for
+//       bitmap formats.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include <limits.h>
+
+// Bit field masks for true colour devices, three DWORDs per format
+const DWORD bits555[] = { 0x007C00, 0x0003E0, 0x00001F };
+const DWORD bits565[] = { 0x00F800, 0x0007E0, 0x00001F };
+const DWORD bits888[] = { 0xFF0000, 0x00FF00, 0x0000FF };
+
+// Lookup table from bitmap subtype to its bits per pixel value and a name.
+// Both an ANSI and a wide copy of each name are stored because the lookup
+// functions have to return a pointer to a static string. The GUID_NULL row
+// marks the end of the table.
+const struct {
+    const GUID *pSubtype;
+    WORD BitCount;
+    CHAR *pName;
+    WCHAR *wszName;
+} BitCountMap[] = {
+    { &MEDIASUBTYPE_RGB1,    1,  "RGB Monochrome",   L"RGB Monochrome"   },
+    { &MEDIASUBTYPE_RGB4,    4,  "RGB VGA",          L"RGB VGA"          },
+    { &MEDIASUBTYPE_RGB8,    8,  "RGB 8",            L"RGB 8"            },
+    { &MEDIASUBTYPE_RGB565,  16, "RGB 565 (16 bit)", L"RGB 565 (16 bit)" },
+    { &MEDIASUBTYPE_RGB555,  16, "RGB 555 (16 bit)", L"RGB 555 (16 bit)" },
+    { &MEDIASUBTYPE_RGB24,   24, "RGB 24",           L"RGB 24"           },
+    { &MEDIASUBTYPE_RGB32,   32, "RGB 32",           L"RGB 32"           },
+    { &MEDIASUBTYPE_ARGB32,  32, "ARGB 32",          L"ARGB 32"          },
+    { &MEDIASUBTYPE_Overlay, 0,  "Overlay",          L"Overlay"          },
+    { &GUID_NULL,            0,  "UNKNOWN",          L"UNKNOWN"          }
+};
+
+// Return the size of the bitmap described by this header (via DIBSIZE)
+STDAPI_(DWORD) GetBitmapSize(const BITMAPINFOHEADER *pHeader)
+{
+    const DWORD cbBitmap = DIBSIZE(*pHeader);
+    return cbBitmap;
+}
+
+
+// Called for headers with a 16 bit colour depth to work out the detailed
+// type (RGB 555 or RGB 565) from the bit fields. GUID_NULL is returned when
+// the masks match neither layout.
+STDAPI_(const GUID) GetTrueColorType(const BITMAPINFOHEADER *pbmiHeader)
+{
+    ASSERT(pbmiHeader->biBitCount == 16);
+
+    // Plain BI_RGB is taken to be RGB 555 without looking at any masks
+
+    if (pbmiHeader->biCompression == BI_RGB) {
+        return MEDIASUBTYPE_RGB555;
+    }
+
+    // Treat the header as the start of a BITMAPINFO and read its colour
+    // table area as an array of DWORD masks
+
+    BITMAPINFO *pInfo = (BITMAPINFO *) pbmiHeader;
+    DWORD *pdwMasks = (DWORD *) pInfo->bmiColors;
+
+    // All three masks must agree with the RGB 555 table
+
+    if (pdwMasks[0] == bits555[0] &&
+        pdwMasks[1] == bits555[1] &&
+        pdwMasks[2] == bits555[2]) {
+        return MEDIASUBTYPE_RGB555;
+    }
+
+    // Failing that, all three must agree with the RGB 565 table
+
+    if (pdwMasks[0] == bits565[0] &&
+        pdwMasks[1] == bits565[1] &&
+        pdwMasks[2] == bits565[2]) {
+        return MEDIASUBTYPE_RGB565;
+    }
+
+    return GUID_NULL;
+}
+
+
+// Given a BITMAPINFOHEADER structure this returns the GUID sub type used to
+// describe it in format negotiations. For example a video codec fills in the
+// format block with a VIDEOINFO structure, the major type with
+// MEDIATYPE_VIDEO and the subtype with a GUID that matches the bit count,
+// so an eight bit image is given MEDIASUBTYPE_RGB8
+
+STDAPI_(const GUID) GetBitmapSubtype(const BITMAPINFOHEADER *pbmiHeader)
+{
+    ASSERT(pbmiHeader);
+
+    // Anything that is neither BI_RGB nor BI_BITFIELDS gets a GUID that
+    // FOURCCMap builds from the compression type
+
+    const DWORD dwCompression = pbmiHeader->biCompression;
+    if (dwCompression != BI_RGB && dwCompression != BI_BITFIELDS) {
+        FOURCCMap FourCCMap(dwCompression);
+        return (const GUID) FourCCMap;
+    }
+
+    // Map the RGB DIB bit depth to an image GUID
+
+    switch(pbmiHeader->biBitCount) {
+        case 1    :   return MEDIASUBTYPE_RGB1;
+        case 4    :   return MEDIASUBTYPE_RGB4;
+        case 8    :   return MEDIASUBTYPE_RGB8;
+        case 16   :   return GetTrueColorType(pbmiHeader);
+        case 24   :   return MEDIASUBTYPE_RGB24;
+        case 32   :   return MEDIASUBTYPE_RGB32;
+    }
+    return GUID_NULL;
+}
+
+
+// Given a video bitmap subtype we return the number of bits per pixel it
+// uses, as a WORD since that's what the BITMAPINFOHEADER holds. A subtype
+// that is not in the table yields the invalid value USHRT_MAX
+
+STDAPI_(WORD) GetBitCount(const GUID *pSubtype)
+{
+    ASSERT(pSubtype);
+
+    // Walk the mapping list up to its GUID_NULL terminator, returning the
+    // bit count of the first entry whose subtype matches the source GUID
+
+    for (INT iEntry = 0;
+         !IsEqualGUID(*BitCountMap[iEntry].pSubtype,GUID_NULL);
+         iEntry++) {
+        if (IsEqualGUID(*BitCountMap[iEntry].pSubtype,*pSubtype)) {
+            return BitCountMap[iEntry].BitCount;
+        }
+    }
+
+    // Reached the terminator without a match, so hand back the invalid
+    // USHRT_MAX marker
+
+    return USHRT_MAX;
+}
+
+
+// Given a bitmap subtype this returns the position of the matching entry in
+// BitCountMap. If the subtype isn't found in the lookup table we return the
+// position of the GUID_NULL entry that terminates it (the one named UNKNOWN)
+// so the result can always be used to index the table. The GetSubtypeName
+// functions use it to pick the description returned to their callers.
+
+int LocateSubtype(const GUID *pSubtype)
+{
+    ASSERT(pSubtype);
+
+    INT iEntry = 0;
+    const GUID *pEntryType = BitCountMap[iEntry].pSubtype;
+
+    // Scan the mapping list until an entry matches the source GUID or we
+    // arrive at the GUID_NULL entry that terminates the list
+
+    while (!IsEqualGUID(*pEntryType,*pSubtype) &&
+           !IsEqualGUID(*pEntryType,GUID_NULL))
+    {
+        iEntry++;
+        pEntryType = BitCountMap[iEntry].pSubtype;
+    }
+
+    // iEntry is now the match, or else the terminating entry
+
+    return iEntry;
+}
+
+STDAPI_(WCHAR *) GetSubtypeNameW(const GUID *pSubtype)
+{
+    const int iEntry = LocateSubtype(pSubtype);   // table position to read
+    return BitCountMap[iEntry].wszName;           // wide name from the table
+}
+
+STDAPI_(CHAR *) GetSubtypeNameA(const GUID *pSubtype)
+{
+    const int iEntry = LocateSubtype(pSubtype);   // table position to read
+    return BitCountMap[iEntry].pName;             // ANSI name from the table
+}
+
+#ifndef GetSubtypeName
+#error wxutil.h should have defined GetSubtypeName
+#endif
+#undef GetSubtypeName
+// With the macro undefined, a real function of that name can be defined
+// for people that linked to it directly; most people would use the header
+// file that picks the A or W version. It forwards to the ANSI version.
+STDAPI_(CHAR *) GetSubtypeName(const GUID *pSubtype)
+{
+    return GetSubtypeNameA(pSubtype);
+}
+
+
+// The mechanism for describing a bitmap format is with the BITMAPINFOHEADER
+// This is really messy to deal with because it invariably has fields that
+// follow it holding bit fields, palettes and the rest. This function gives
+// the number of bytes required to hold a VIDEOINFO that represents it. This
+// count includes the prefix information (like the rcSource rectangle) the
+// BITMAPINFOHEADER field, and any other colour information on the end.
+//
+// WARNING If you want to copy a BITMAPINFOHEADER into a VIDEOINFO always make
+// sure that you use the HEADER macro because the BITMAPINFOHEADER field isn't
+// right at the start of the VIDEOINFO (there are a number of other fields),
+//
+//     CopyMemory(HEADER(pVideoInfo),pbmi,sizeof(BITMAPINFOHEADER));
+//
+
+STDAPI_(LONG) GetBitmapFormatSize(const BITMAPINFOHEADER *pHeader)
+{
+    // Everyone has this to start with: the prefix fields plus the header
+    LONG Size = SIZE_PREHEADER + pHeader->biSize;
+
+    ASSERT(pHeader->biSize >= sizeof(BITMAPINFOHEADER));
+
+    // Does this format use a palette, if the number of colours actually used
+    // is zero then it is set to the maximum that are allowed for that colour
+    // depth (an example is 256 for eight bits). Truecolour formats may also
+    // pass a palette with them in which case the used count is non zero
+
+    // This would scare me.
+    ASSERT(pHeader->biBitCount <= iPALETTE || pHeader->biClrUsed == 0);
+
+    if (pHeader->biBitCount <= iPALETTE || pHeader->biClrUsed) {
+        // Use the explicit colour count if there is one; only otherwise take
+        // the maximum for the depth (shifting by >= 32 bits is undefined)
+        LONG Entries = pHeader->biClrUsed ? (LONG) pHeader->biClrUsed
+                                          : (LONG) ((DWORD) 1 << pHeader->biBitCount);
+        Size += Entries * sizeof(RGBQUAD);
+    }
+
+    // Truecolour formats may have a BI_BITFIELDS specifier for compression
+    // type which means that room for three DWORDs should be allocated that
+    // specify where in each pixel the RGB colour components may be found
+
+    if (pHeader->biCompression == BI_BITFIELDS) {
+        Size += SIZE_MASKS;
+    }
+
+    // A BITMAPINFO for a palettised image may also contain a palette map that
+    // provides the information to map from a source palette to a destination
+    // palette during a BitBlt for example, because this information is only
+    // ever processed during drawing you don't normally store the palette map
+    // nor have any way of knowing if it is present in the data structure
+
+    return Size;
+}
+
+
+// Returns TRUE if the VIDEOINFO contains a palette: either the format is
+// palettised or the header reports a non zero count of colours used
+
+STDAPI_(BOOL) ContainsPalette(const VIDEOINFOHEADER *pVideoInfo)
+{
+    if (PALETTISED(pVideoInfo) != FALSE) {
+        return TRUE;
+    }
+    const BOOL bHasColours = (pVideoInfo->bmiHeader.biClrUsed != 0);
+    return bHasColours ? TRUE : FALSE;
+}
+
+
+// Return a pointer to the first entry in a palette. BI_BITFIELDS formats
+// take it from the TRUECOLOR view, everything else from COLORS
+STDAPI_(const RGBQUAD *) GetBitmapPalette(const VIDEOINFOHEADER *pVideoInfo)
+{
+    if (pVideoInfo->bmiHeader.biCompression != BI_BITFIELDS) {
+        return COLORS(pVideoInfo);
+    }
+    return TRUECOLOR(pVideoInfo)->bmiColors;
+}
diff --git a/plugins/GSdx_legacy/baseclasses/cache.h b/plugins/GSdx_legacy/baseclasses/cache.h
new file mode 100644
index 0000000000..2ef996f4bc
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/cache.h
@@ -0,0 +1,74 @@
+//------------------------------------------------------------------------------
+// File: Cache.h
+//
+// Desc: DirectShow base classes - defines a non-MFC generic cache class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+/* This class implements a simple cache. A cache object is instantiated
+   with the number of items it is to hold. An item is a pointer to an
+   object derived from CBaseObject (helps reduce memory leaks). The cache
+   can then have objects added to it and removed from it. The cache size
+   is fixed at construction time and may therefore run out or be flooded.
+   If it runs out it returns a NULL pointer, if it fills up it also returns
+   a NULL pointer instead of a pointer to the object just inserted */
+
+/* Making these classes inherit from CBaseObject does nothing for their
+   functionality but it allows us to check there are no memory leaks */
+
+/* WARNING Be very careful when using this class, what it lets you do is
+   store and retrieve objects so that you can minimise object creation
+   which in turns improves efficiency. However the object you store is
+   exactly the same as the object you get back which means that it short
+   circuits the constructor initialisation phase. This means any class
+   variables the object has (eg pointers) are highly likely to be invalid.
+   Therefore ensure you reinitialise the object before using it again */
+
+
+#ifndef __CACHE__
+#define __CACHE__
+
+
+class CCache : CBaseObject {
+    /* NOTE: no access specifier is given for the base, so CBaseObject is a private base */
+    /* Make copy constructor and assignment operator inaccessible (declared private) */
+
+    CCache(const CCache &refCache);
+    CCache &operator=(const CCache &refCache);
+
+private:
+
+    /* These are initialised in the constructor. The first variable points to
+       an array of pointers, each of which points to a CBaseObject derived
+       object. The m_iCacheSize is the static fixed size for the cache and the
+       m_iUsed defines the number of places filled with objects at any time.
+       We fill the array of pointers from the start (ie m_ppObjects[0] first)
+       and then only add and remove objects from the end position, so in this
+       respect the array of object pointers should be treated as a stack */
+
+    CBaseObject **m_ppObjects;
+    const INT m_iCacheSize;
+    INT m_iUsed;
+
+public:
+
+    CCache(TCHAR *pName,INT iItems);
+    virtual ~CCache();
+
+    /* Add an item to the cache (the notes at the top of this file describe the NULL return) */
+    CBaseObject *AddToCache(CBaseObject *pObject);
+
+    /* Remove an item from the cache (the notes at the top of this file describe the NULL return) */
+    CBaseObject *RemoveFromCache();
+
+    /* Delete all the objects held in the cache */
+    void RemoveAll(void);
+
+    /* Return the cache size which is set during construction */
+    INT GetCacheSize(void) const {return m_iCacheSize;};
+};
+
+#endif /* __CACHE__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/combase.cpp b/plugins/GSdx_legacy/baseclasses/combase.cpp
new file mode 100644
index 0000000000..d8c193a9dc
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/combase.cpp
@@ -0,0 +1,256 @@
+//------------------------------------------------------------------------------
+// File: ComBase.cpp
+//
+// Desc: DirectShow base classes - implements class hierarchy for creating
+//       COM objects.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#pragma warning( disable : 4514 )   // Disable warnings re unused inline functions
+
+
+/* Define the static member variable */
+
+LONG CBaseObject::m_cObjects = 0;
+
+
+/* Constructors - each one counts the new object and, in DEBUG builds, */
+/* registers it under the name supplied and keeps the cookie returned  */
+
+CBaseObject::CBaseObject(const TCHAR *pName)
+{
+    /* One more object is active */
+    InterlockedIncrement(&m_cObjects);
+
+    /* The name is passed in the second slot when it is wide (UNICODE) */
+    /* and in the first slot otherwise, with 0 in the unused slot      */
+#if defined(DEBUG) && defined(UNICODE)
+    m_dwCookie = DbgRegisterObjectCreation(0, pName);
+#elif defined(DEBUG)
+    m_dwCookie = DbgRegisterObjectCreation(pName, 0);
+#endif
+}
+
+/* char string flavour, compiled for UNICODE builds only */
+#if defined(UNICODE)
+CBaseObject::CBaseObject(const char *pName)
+{
+    /* One more object is active */
+    InterlockedIncrement(&m_cObjects);
+
+#if defined(DEBUG)
+    m_dwCookie = DbgRegisterObjectCreation(pName, 0);
+#endif
+}
+#endif
+
+/* Module handle filled in by LoadOLEAut32 and released by the destructor */
+/* below; it is 0 while no library handle is held                         */
+HINSTANCE hlibOLEAut32;
+
+/* Destructor */
+
+CBaseObject::~CBaseObject()
+{
+    /* One object fewer; when the last one goes and a library handle is */
+    /* held, the library is freed and the handle forgotten              */
+    const LONG cRemaining = InterlockedDecrement(&m_cObjects);
+    if (cRemaining == 0 && hlibOLEAut32 != 0) {
+        FreeLibrary(hlibOLEAut32);
+        hlibOLEAut32 = 0;
+    }
+
+#if defined(DEBUG)
+    DbgRegisterObjectDestruction(m_dwCookie);
+#endif
+}
+
+/* Loads OleAut32.dll unless a handle is already held; returns the handle */
+static const TCHAR szOle32Aut[]   = TEXT("OleAut32.dll");
+
+HINSTANCE LoadOLEAut32()
+{
+    if (hlibOLEAut32 != 0) {
+        return hlibOLEAut32;
+    }
+    hlibOLEAut32 = LoadLibrary(szOle32Aut);
+    return hlibOLEAut32;
+}
+
+
+/* Constructor */
+
+// We know we use "this" in the initialization list, we also know we don't modify *phr.
+#pragma warning( disable : 4355 4100 )
+CUnknown::CUnknown(const TCHAR *pName, LPUNKNOWN pUnk)
+: CBaseObject(pName)
+/* Start the object with a reference count of zero - when the      */
+/* object is queried for its first interface this may be           */
+/* incremented depending on whether or not this object is          */
+/* currently being aggregated upon                                 */
+, m_cRef(0)
+/* Set our pointer to our IUnknown interface.                      */
+/* If we have an outer, use its, otherwise use ours.               */
+/* This pointer effectively points to the owner of                 */
+/* this object and can be accessed by the GetOwner() method.       */
+, m_pUnknown( pUnk != 0 ? pUnk : reinterpret_cast<LPUNKNOWN>( static_cast<PNDUNKNOWN>(this) ) )
+ /* Why the double cast?  Well, the inner cast is a type-safe cast */
+ /* to pointer to a type from which we inherit.  The second is     */
+ /* type-unsafe but works because INonDelegatingUnknown "behaves   */
+ /* like" IUnknown. (Only the names on the methods change.)        */
+{
+    // Everything we need to do has been done in the initializer list
+}
+
+// This does the same as above except it has a useless HRESULT argument
+// use the previous constructor, this is just left for compatibility...
+CUnknown::CUnknown(TCHAR *pName, LPUNKNOWN pUnk,HRESULT *phr) :
+    CBaseObject(pName),
+    m_cRef(0),
+    m_pUnknown( pUnk != 0 ? pUnk : reinterpret_cast<LPUNKNOWN>( static_cast<PNDUNKNOWN>(this) ) )
+{
+}
+// CHAR string versions of the two constructors above, for UNICODE builds only
+#ifdef UNICODE
+CUnknown::CUnknown(const CHAR *pName, LPUNKNOWN pUnk)
+: CBaseObject(pName), m_cRef(0),
+    m_pUnknown( pUnk != 0 ? pUnk : reinterpret_cast<LPUNKNOWN>( static_cast<PNDUNKNOWN>(this) ) )
+{ }
+
+CUnknown::CUnknown(CHAR *pName, LPUNKNOWN pUnk,HRESULT *phr) :
+    CBaseObject(pName), m_cRef(0),
+    m_pUnknown( pUnk != 0 ? pUnk : reinterpret_cast<LPUNKNOWN>( static_cast<PNDUNKNOWN>(this) ) )
+{ }
+
+#endif
+
+#pragma warning( default : 4355 4100 )
+
+
+/* QueryInterface - the base class answers for IUnknown and nothing else */
+
+STDMETHODIMP CUnknown::NonDelegatingQueryInterface(REFIID riid, void ** ppv)
+{
+    CheckPointer(ppv,E_POINTER);
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+
+    /* Any interface other than IUnknown is refused with a NULL result */
+
+    if (!(riid == IID_IUnknown)) {
+        *ppv = NULL;
+        return E_NOINTERFACE;
+    }
+
+    GetInterface((LPUNKNOWN) (PNDUNKNOWN) this, ppv);
+    return NOERROR;
+}
+
+/* A function is used rather than a max macro, since a macro will typically */
+/* evaluate a parameter twice and m_cRef may change between the two reads.  */
+template<class T> inline static T ourmax( const T & a, const T & b )
+{
+    if (a > b) {
+        return a;
+    }
+    return b;
+}
+
+/* AddRef - reports the count produced by the interlocked increment itself; */
+/* m_cRef is not read again since another thread may have changed it        */
+STDMETHODIMP_(ULONG) CUnknown::NonDelegatingAddRef()
+{
+    LONG lRef = InterlockedIncrement( &m_cRef );
+    ASSERT(lRef > 0);
+    DbgLog((LOG_MEMORY,3,TEXT("    Obj %d ref++ = %d"),
+           m_dwCookie, lRef));
+    return ourmax(ULONG(lRef), 1ul);
+}
+
+
+/* Release - reports the count produced by the interlocked decrement */
+
+STDMETHODIMP_(ULONG) CUnknown::NonDelegatingRelease()
+{
+    /* If the reference count drops to zero delete ourselves */
+
+    LONG lRef = InterlockedDecrement( &m_cRef );
+    ASSERT(lRef >= 0);
+
+    DbgLog((LOG_MEMORY,3,TEXT("    Object %d ref-- = %d"),
+	    m_dwCookie, lRef));
+    if (lRef == 0) {
+        // COM rules say we must protect against re-entrancy.
+        // If we are an aggregator and we hold our own interfaces
+        // on the aggregatee, the QI for these interfaces will
+        // addref ourselves. So after doing the QI we must release
+        // a ref count on ourselves. Then, before releasing the
+        // private interface, we must addref ourselves. When we do
+        // this from the destructor here it will result in the ref
+        // count going to 1 and then back to 0 causing us to
+        // re-enter the destructor. Hence we add an extra refcount here
+        // once we know we will delete the object.
+        // for an example aggregator see filgraph\distrib.cpp.
+
+        m_cRef++;
+        delete this;
+        return ULONG(0);
+    } else {
+        // Don't read m_cRef again here: another thread may just have
+        // released the object too, so report the value we were given
+        return ourmax(ULONG(lRef), 1ul);
+    }
+}
+
+
+/* Return an interface pointer to a requesting client: pUnk is stored in
+   *ppv and then AddRef'd. ppv goes through CheckPointer; pUnk is not checked */
+
+STDAPI GetInterface(LPUNKNOWN pUnk, void **ppv)
+{
+    CheckPointer(ppv, E_POINTER);
+    *ppv = pUnk;
+    pUnk->AddRef();
+    return NOERROR;
+}
+
+
+/* Compares two interfaces and returns TRUE if they are on the same object. */
+/* FALSE if exactly one pointer is NULL or a query for IUnknown fails.      */
+BOOL WINAPI IsEqualObject(IUnknown *pFirst, IUnknown *pSecond)
+{
+    /*  Different objects can't have the same interface pointer for
+        any interface
+    */
+    if (pFirst == pSecond) {
+        return TRUE;
+    }
+
+    /*  A NULL pointer can't be queried and can't match a different pointer */
+    ASSERT(pFirst);
+    ASSERT(pSecond);
+    if (pFirst == NULL || pSecond == NULL) {
+        return FALSE;
+    }
+
+    /*  OK - do it the hard way - check if they have the same
+        IUnknown pointers - a single object can only have one of these
+    */
+    LPUNKNOWN pUnknown1 = NULL;     // IUnknown of the first object
+    LPUNKNOWN pUnknown2 = NULL;     // IUnknown of the second object
+
+    HRESULT hr1 = pFirst->QueryInterface(IID_IUnknown,(void **) &pUnknown1);
+    HRESULT hr2 = pSecond->QueryInterface(IID_IUnknown,(void **) &pUnknown2);
+    ASSERT(SUCCEEDED(hr1) && SUCCEEDED(hr2));
+
+    /* Release only what a successful query handed back; after a failed */
+    /* query the pointer is not trusted and nothing is released for it  */
+    const BOOL bHave1 = (SUCCEEDED(hr1) && pUnknown1 != NULL);
+    const BOOL bHave2 = (SUCCEEDED(hr2) && pUnknown2 != NULL);
+    if (bHave1) { pUnknown1->Release(); }
+    if (bHave2) { pUnknown2->Release(); }
+    return (bHave1 && bHave2 && pUnknown1 == pUnknown2);
+}
+
diff --git a/plugins/GSdx_legacy/baseclasses/combase.h b/plugins/GSdx_legacy/baseclasses/combase.h
new file mode 100644
index 0000000000..901056b77e
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/combase.h
@@ -0,0 +1,319 @@
+//------------------------------------------------------------------------------
+// File: ComBase.h
+//
+// Desc: DirectShow base classes - defines a class hierarchy for creating
+//       COM objects.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+/*
+
+a. Derive your COM object from CUnknown
+
+b. Make a static CreateInstance function that takes an LPUNKNOWN, an HRESULT *
+   and a TCHAR *. The LPUNKNOWN defines the object to delegate IUnknown calls
+   to. The HRESULT * allows error codes to be passed around constructors and
+   the TCHAR * is a descriptive name that can be printed on the debugger.
+
+   It is important that constructors only change the HRESULT * if they have
+   to set an ERROR code, if it was successful then leave it alone or you may
+   overwrite an error code from an object previously created.
+
+   When you call a constructor the descriptive name should be in static store
+   as we do not copy the string. To stop large amounts of memory being used
+   in retail builds by all these static strings use the NAME macro,
+
+   CMyFilter = new CImplFilter(NAME("My filter"),pUnknown,phr);
+   if (FAILED(hr)) {
+       return hr;
+   }
+
+   In retail builds NAME(_x_) compiles to NULL, the base CBaseObject class
+   knows not to do anything with objects that don't have a name.
+
+c. Have a constructor for your object that passes the LPUNKNOWN, HRESULT * and
+   TCHAR * to the CUnknown constructor. You can set the HRESULT if you have an
+   error, or just simply pass it through to the constructor.
+
+   The object creation will fail in the class factory if the HRESULT indicates
+   an error (ie FAILED(HRESULT) == TRUE)
+
+d. Create a FactoryTemplate with your object's class id and CreateInstance
+   function.
+
+Then (for each interface) either
+
+Multiple inheritance
+
+1. Also derive it from ISomeInterface
+2. Include DECLARE_IUNKNOWN in your class definition to declare
+   implementations of QueryInterface, AddRef and Release that
+   call the outer unknown
+3. Override NonDelegatingQueryInterface to expose ISomeInterface by
+   code something like
+
+     if (riid == IID_ISomeInterface) {
+         return GetInterface((ISomeInterface *) this, ppv);
+     } else {
+         return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+     }
+
+4. Declare and implement the member functions of ISomeInterface.
+
+or: Nested interfaces
+
+1. Declare a class derived from CUnknown
+2. Include DECLARE_IUNKNOWN in your class definition
+3. Override NonDelegatingQueryInterface to expose ISomeInterface by
+   code something like
+
+     if (riid == IID_ISomeInterface) {
+         return GetInterface((ISomeInterface *) this, ppv);
+     } else {
+         return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+     }
+
+4. Implement the member functions of ISomeInterface. Use GetOwner() to
+   access the COM object class.
+
+And in your COM object class:
+
+5. Make the nested class a friend of the COM object class, and declare
+   an instance of the nested class as a member of the COM object class.
+
+   NOTE that because you must always pass the outer unknown and an hResult
+   to the CUnknown constructor you cannot use a default constructor, in
+   other words you will have to make the member variable a pointer to the
+   class and make a NEW call in your constructor to actually create it.
+
+6. override the NonDelegatingQueryInterface with code like this:
+
+     if (riid == IID_ISomeInterface) {
+         return m_pImplFilter->
+            NonDelegatingQueryInterface(IID_ISomeInterface, ppv);
+     } else {
+         return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+     }
+
+You can have mixed classes which support some interfaces via multiple
+inheritance and some via nested classes
+
+*/
+
+#ifndef __COMBASE__
+#define __COMBASE__
+
+// Filter Setup data structures now defined in axextend.idl
+
+// AMOVIESETUP_MEDIATYPE is another name for REGPINTYPES; the P- and LP-
+// prefixed names are pointer-to-AMOVIESETUP_MEDIATYPE typedefs.
+typedef REGPINTYPES
+AMOVIESETUP_MEDIATYPE, * PAMOVIESETUP_MEDIATYPE, * FAR LPAMOVIESETUP_MEDIATYPE;
+
+// AMOVIESETUP_PIN is another name for REGFILTERPINS, again with P-/LP-
+// pointer typedefs alongside it.
+typedef REGFILTERPINS
+AMOVIESETUP_PIN, * PAMOVIESETUP_PIN, * FAR LPAMOVIESETUP_PIN;
+
+// Setup record for one filter. CFactoryTemplate holds a pointer to one of
+// these (m_pAMovieSetup_Filter). The field notes below are inferred from
+// the names only - NOTE(review): confirm against the registration code.
+typedef struct _AMOVIESETUP_FILTER
+{
+  const CLSID * clsID;              // presumably the filter's class id
+  const WCHAR * strName;            // presumably the filter's display name
+  DWORD      dwMerit;               // presumably the filter merit value
+  UINT       nPins;                 // presumably the element count of lpPin
+  const AMOVIESETUP_PIN * lpPin;    // pin descriptions
+}
+AMOVIESETUP_FILTER, * PAMOVIESETUP_FILTER, * FAR LPAMOVIESETUP_FILTER;
+
+/* The DLLENTRY module initialises the module handle on loading */
+
+extern HINSTANCE g_hInst;
+
+/* On DLL load remember which platform we are running on */
+
+extern DWORD g_amPlatform;
+extern OSVERSIONINFO g_osInfo;     // Filled in by GetVersionEx
+
+/* Version of IUnknown that is renamed to allow a class to support both
+   non delegating and delegating IUnknowns in the same COM object */
+
+// The INONDELEGATINGUNKNOWN_DEFINED guard lets another header supply this
+// interface first without causing a redefinition here.
+#ifndef INONDELEGATINGUNKNOWN_DEFINED
+DECLARE_INTERFACE(INonDelegatingUnknown)
+{
+    // The three methods mirror QueryInterface / AddRef / Release under
+    // different names, so one class can carry both sets of entry points.
+    STDMETHOD(NonDelegatingQueryInterface) (THIS_ REFIID, LPVOID *) PURE;
+    STDMETHOD_(ULONG, NonDelegatingAddRef)(THIS) PURE;
+    STDMETHOD_(ULONG, NonDelegatingRelease)(THIS) PURE;
+};
+#define INONDELEGATINGUNKNOWN_DEFINED
+#endif
+
+typedef INonDelegatingUnknown *PNDUNKNOWN;
+
+
+/* Base class that supports active object counting. As part of the debug
+   facilities every C++ object creation and destruction is traced. Because
+   virtual functions cannot be called from a constructor, the object name
+   has to be handed up the derivation chain explicitly, which is why every
+   constructor in the hierarchy takes a name parameter. */
+
+class CBaseObject
+{
+
+private:
+
+    static LONG m_cObjects;     /* Total number of objects active */
+
+    // Copy construction and assignment are declared but never implemented,
+    //   so passing or assigning these objects by value fails at compile or
+    //   link time rather than misbehaving at run time.
+    CBaseObject(const CBaseObject& objectSrc);          // no implementation
+    void operator=(const CBaseObject& objectSrc);       // no implementation
+
+protected:
+#ifdef DEBUG
+    DWORD m_dwCookie;           /* Cookie identifying this object */
+#endif
+
+public:
+
+    /* Construction and destruction increment and decrement the number
+       of active objects */
+
+    CBaseObject(const TCHAR *pName);
+#ifdef UNICODE
+    CBaseObject(const char *pName);
+#endif
+    ~CBaseObject();
+
+    /* Reports the current value of the active object counter */
+
+    static LONG ObjectsActive()
+    {
+        return m_cObjects;
+    }
+};
+
+
+/* Base class for an object that supports one or more COM interfaces. It
+   inherits the object counting of CBaseObject (used for DLLCanUnloadNow
+   support) and declares the core non delegating IUnknown methods */
+
+class AM_NOVTABLE CUnknown : public INonDelegatingUnknown,
+                 public CBaseObject
+{
+private:
+    const LPUNKNOWN m_pUnknown; /* Owner of this object */
+
+protected:                      /* So we can override NonDelegatingRelease() */
+    volatile LONG m_cRef;       /* Number of reference counts */
+
+public:
+
+    CUnknown(const TCHAR *pName, LPUNKNOWN pUnk);
+    virtual ~CUnknown() {}
+
+    // Redundant overload kept for existing callers: prefer the two
+    //   argument constructor, the HRESULT is never touched here
+    CUnknown(TCHAR *pName, LPUNKNOWN pUnk,HRESULT *phr);
+#ifdef UNICODE
+    CUnknown(const char *pName, LPUNKNOWN pUnk);
+    CUnknown(char *pName, LPUNKNOWN pUnk,HRESULT *phr);
+#endif
+
+    /* Return the owner of this object */
+
+    LPUNKNOWN GetOwner() const
+    {
+        return m_pUnknown;
+    }
+
+    /* The class factory creates instances through a static function of
+       the shape shown below. It is not declared here, so each derived
+       class has to provide its own */
+
+    /* static CUnknown *CreateInstance(LPUNKNOWN, HRESULT *) */
+
+    /* Non delegating unknown implementation */
+
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID, void **);
+    STDMETHODIMP_(ULONG) NonDelegatingAddRef();
+    STDMETHODIMP_(ULONG) NonDelegatingRelease();
+};
+
+// Compatibility shim for compilers reporting _MSC_VER <= 1200.
+// NOTE(review): an undefined _MSC_VER evaluates as 0 inside #if, so this
+// section is also compiled by non-Microsoft compilers - confirm that is
+// intended.
+#if (_MSC_VER <= 1200)
+#pragma warning(disable:4211)
+
+/* The standard InterlockedXXX functions won't take volatiles */
+// Each overload strips the volatile qualifier and forwards to the
+// LONG * version of the same function.
+static inline LONG WINAPI InterlockedIncrement( volatile LONG * plong )
+{ return InterlockedIncrement( const_cast<LONG*>( plong ) ); }
+
+static inline LONG WINAPI InterlockedDecrement( volatile LONG * plong )
+{ return InterlockedDecrement( const_cast<LONG*>( plong ) ); }
+
+#pragma warning(default:4211)
+#endif
+
+
+/* Return an interface pointer to a requesting client
+   performing a thread safe AddRef as necessary */
+
+STDAPI GetInterface(LPUNKNOWN pUnk, void **ppv);
+
+/* A function that can create a new COM object */
+
+typedef CUnknown *(CALLBACK *LPFNNewCOMObject)(LPUNKNOWN pUnkOuter, HRESULT *phr);
+
+/*  A function (can be NULL) which is called from the DLL entrypoint
+    routine for each factory template:
+
+    bLoading - TRUE on DLL load, FALSE on DLL unload
+    rclsid   - the m_ClsID of the entry
+*/
+typedef void (CALLBACK *LPFNInitRoutine)(BOOL bLoading, const CLSID *rclsid);
+
+/* One entry per creatable object class. The entries are kept in an array
+   so that the default class factory code can create new instances */
+
+class CFactoryTemplate {
+
+public:
+
+    const WCHAR *              m_Name;
+    const CLSID *              m_ClsID;
+    LPFNNewCOMObject           m_lpfnNew;
+    LPFNInitRoutine            m_lpfnInit;
+    const AMOVIESETUP_FILTER * m_pAMovieSetup_Filter;
+
+    /* Does this entry describe the class identified by rclsid */
+
+    BOOL IsClassID(REFCLSID rclsid) const
+    {
+        return (IsEqualCLSID(*m_ClsID,rclsid));
+    }
+
+    /* Create an instance through the m_lpfnNew callback. phr is handed to
+       CheckPointer together with a NULL result before the callback runs */
+
+    CUnknown *CreateInstance(LPUNKNOWN pUnk, HRESULT *phr) const
+    {
+        CheckPointer(phr,NULL);
+        return m_lpfnNew(pUnk, phr);
+    }
+};
+
+
+/* DECLARE_IUNKNOWN expands to inline QueryInterface, AddRef and Release
+   members that each forward to the object returned by GetOwner(). Place
+   it inside the definition of a class that provides GetOwner() (CUnknown
+   does).
+
+   To expose your own interfaces, override NonDelegatingQueryInterface and
+   return the interface pointers with GetInterface; pass anything you do
+   not recognise on to CUnknown::NonDelegatingQueryInterface */
+
+#define DECLARE_IUNKNOWN                                        \
+    STDMETHODIMP QueryInterface(REFIID riid, void **ppv) {      \
+        return GetOwner()->QueryInterface(riid,ppv);            \
+    };                                                          \
+    STDMETHODIMP_(ULONG) AddRef() {                             \
+        return GetOwner()->AddRef();                            \
+    };                                                          \
+    STDMETHODIMP_(ULONG) Release() {                            \
+        return GetOwner()->Release();                           \
+    };
+
+
+
+HINSTANCE	LoadOLEAut32();
+
+
+#endif /* __COMBASE__ */
+
+
+
+
diff --git a/plugins/GSdx_legacy/baseclasses/ctlutil.cpp b/plugins/GSdx_legacy/baseclasses/ctlutil.cpp
new file mode 100644
index 0000000000..a234f838a1
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/ctlutil.cpp
@@ -0,0 +1,2531 @@
+//------------------------------------------------------------------------------
+// File: CtlUtil.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// Base classes implementing IDispatch parsing for the basic control dual
+// interfaces. Derive from these and implement just the custom method and
+// property methods. We also implement CPosPassThru that can be used by
+// renderers and transforms to pass by IMediaPosition and IMediaSeeking
+
+
+#include "streams.h"
+#include <limits.h>
+#include "seekpt.h"
+
+// 'bool' non standard reserved word
+#pragma warning(disable:4237)
+
+
+// --- CBaseDispatch implementation ----------
+// Drop the reference held on the cached type info, if one was ever loaded.
+CBaseDispatch::~CBaseDispatch()
+{
+    if (m_pti == NULL) {
+        return;
+    }
+    m_pti->Release();
+}
+
+
+// Report how many type information interfaces are available: always 1.
+//
+// pctinfo - receives the count; passed to CheckPointer with E_POINTER
+//
+// The debug pointer validation now covers sizeof(UINT), the size of the
+// object actually written. It previously asked for sizeof(UINT *) bytes,
+// which is larger than a UINT on 64-bit targets.
+
+STDMETHODIMP
+CBaseDispatch::GetTypeInfoCount(UINT * pctinfo)
+{
+    CheckPointer(pctinfo,E_POINTER);
+    ValidateReadWritePtr(pctinfo,sizeof(UINT));
+    *pctinfo = 1;
+    return S_OK;
+}
+
+
+typedef HRESULT (STDAPICALLTYPE *LPLOADTYPELIB)(
+			    const OLECHAR FAR *szFile,
+			    ITypeLib FAR* FAR* pptlib);
+
+typedef HRESULT (STDAPICALLTYPE *LPLOADREGTYPELIB)(REFGUID rguid,
+			    WORD wVerMajor,
+			    WORD wVerMinor,
+			    LCID lcid,
+			    ITypeLib FAR* FAR* pptlib);
+
+// Attempt to find our type library and return the type info for riid.
+//
+// riid    - interface whose ITypeInfo is wanted; only consulted the first
+//           time, because the result is cached in m_pti afterwards
+// itinfo  - must be 0, otherwise TYPE_E_ELEMENTNOTFOUND is returned
+// lcid    - locale handed to LoadRegTypeLib
+// pptinfo - receives an AddRef'd ITypeInfo on success, NULL on failure
+//
+// The registered type library (LIBID_QuartzTypeLib, version 1.0) is tried
+// first; if that fails "control.tlb" is loaded directly. Both loader
+// functions are resolved at run time from the module that LoadOLEAut32()
+// returns.
+//
+// The old "NULL == pptinfo" test that followed the write through pptinfo
+// has been removed: it could never fire, since the pointer had already
+// been dereferenced (and is checked by CheckPointer on entry).
+
+STDMETHODIMP
+CBaseDispatch::GetTypeInfo(
+  REFIID riid,
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    CheckPointer(pptinfo,E_POINTER);
+    ValidateReadWritePtr(pptinfo,sizeof(ITypeInfo *));
+    HRESULT hr;
+
+    *pptinfo = NULL;
+
+    // we only support one type element
+    if (0 != itinfo) {
+        return TYPE_E_ELEMENTNOTFOUND;
+    }
+
+    // always look for neutral
+    if (NULL == m_pti) {
+
+        LPLOADTYPELIB       lpfnLoadTypeLib;
+        LPLOADREGTYPELIB    lpfnLoadRegTypeLib;
+        ITypeLib            *ptlib;
+        HINSTANCE           hInst;
+
+        static const char  szTypeLib[]    = "LoadTypeLib";
+        static const char  szRegTypeLib[] = "LoadRegTypeLib";
+        static const WCHAR szControl[]    = L"control.tlb";
+
+        //
+        // Try to get the OLE automation module handle.
+        //
+
+        hInst = LoadOLEAut32();
+        if (hInst == NULL) {
+            DWORD dwError = GetLastError();
+            return AmHresultFromWin32(dwError);
+        }
+        lpfnLoadRegTypeLib = (LPLOADREGTYPELIB)GetProcAddress(hInst,
+                                                              szRegTypeLib);
+        if (lpfnLoadRegTypeLib == NULL) {
+            DWORD dwError = GetLastError();
+            return AmHresultFromWin32(dwError);
+        }
+
+        hr = (*lpfnLoadRegTypeLib)(LIBID_QuartzTypeLib, 1, 0, // version 1.0
+                                   lcid, &ptlib);
+
+        if (FAILED(hr)) {
+
+            // attempt to load directly - this will fill the
+            // registry in if it finds it
+
+            lpfnLoadTypeLib = (LPLOADTYPELIB)GetProcAddress(hInst, szTypeLib);
+            if (lpfnLoadTypeLib == NULL) {
+                DWORD dwError = GetLastError();
+                return AmHresultFromWin32(dwError);
+            }
+
+            hr = (*lpfnLoadTypeLib)(szControl, &ptlib);
+            if (FAILED(hr)) {
+                return hr;
+            }
+        }
+
+        hr = ptlib->GetTypeInfoOfGuid(
+                    riid,
+                    &m_pti);
+
+        // the library itself is no longer needed once the type info
+        // has been asked for
+        ptlib->Release();
+
+        if (FAILED(hr)) {
+            return hr;
+        }
+    }
+
+    // hand out the cached type info with a reference for the caller
+    *pptinfo = m_pti;
+    m_pti->AddRef();
+    return S_OK;
+}
+
+
+// Map member names to DISPIDs using the type info for riid.
+//
+// Although the IDispatch riid is dead, it is used here to carry the iid
+// we are talking about from the interface implementation class to us.
+// Returns the GetTypeInfo failure code if the type info is unavailable,
+// otherwise whatever ITypeInfo::GetIDsOfNames returns.
+STDMETHODIMP
+CBaseDispatch::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    ITypeInfo *pTypeInfo;
+    HRESULT hr = GetTypeInfo(riid, 0, lcid, &pTypeInfo);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    hr = pTypeInfo->GetIDsOfNames(rgszNames, cNames, rgdispid);
+    pTypeInfo->Release();
+    return hr;
+}
+
+
+// --- CMediaControl implementation ---------
+
+// Construct the object; both arguments are passed straight on to the
+// CUnknown base class and nothing else is initialised here.
+CMediaControl::CMediaControl(const TCHAR * name,LPUNKNOWN pUnk) :
+    CUnknown(name, pUnk)
+{
+}
+
+// Expose IMediaControl; every other interface request is left to CUnknown.
+
+STDMETHODIMP
+CMediaControl::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+    if (riid != IID_IMediaControl) {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+    return GetInterface( (IMediaControl *) this, ppv);
+}
+
+
+// IDispatch::GetTypeInfoCount - handled entirely by the embedded
+// CBaseDispatch helper (m_basedisp), whose result is returned unchanged.
+
+STDMETHODIMP
+CMediaControl::GetTypeInfoCount(UINT * pctinfo)
+{
+    return m_basedisp.GetTypeInfoCount(pctinfo);
+}
+
+
+// IDispatch::GetTypeInfo - asks the CBaseDispatch helper for the type
+// information describing IID_IMediaControl.
+
+STDMETHODIMP
+CMediaControl::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IMediaControl, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+// IDispatch::GetIDsOfNames - the incoming riid is ignored; the helper is
+// always asked about IID_IMediaControl.
+STDMETHODIMP
+CMediaControl::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IMediaControl, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+// IDispatch::Invoke - dispatch the call through our type information.
+//
+// riid must be IID_NULL (the parameter is a dead leftover from an earlier
+// interface); anything else yields DISP_E_UNKNOWNINTERFACE. Otherwise the
+// result is the GetTypeInfo failure code or whatever ITypeInfo::Invoke
+// returns.
+STDMETHODIMP
+CMediaControl::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    ITypeInfo *pTypeInfo;
+    HRESULT hr = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (SUCCEEDED(hr)) {
+        hr = pTypeInfo->Invoke((IMediaControl *)this,
+                               dispidMember,
+                               wFlags,
+                               pdispparams,
+                               pvarResult,
+                               pexcepinfo,
+                               puArgErr);
+        pTypeInfo->Release();
+    }
+    return hr;
+}
+
+
+// --- CMediaEvent implementation ----------
+
+
+// Construct the object; both arguments are passed straight on to the
+// CUnknown base class and nothing else is initialised here.
+CMediaEvent::CMediaEvent(const TCHAR * name,LPUNKNOWN pUnk) :
+    CUnknown(name, pUnk)
+{
+}
+
+
+// Expose IMediaEvent and IMediaEventEx (both answered with the
+// IMediaEventEx pointer); every other request is left to CUnknown.
+
+STDMETHODIMP
+CMediaEvent::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+    if (riid != IID_IMediaEvent && riid != IID_IMediaEventEx) {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+    return GetInterface( (IMediaEventEx *) this, ppv);
+}
+
+
+// IDispatch::GetTypeInfoCount - handled entirely by the embedded
+// CBaseDispatch helper (m_basedisp), whose result is returned unchanged.
+
+STDMETHODIMP
+CMediaEvent::GetTypeInfoCount(UINT * pctinfo)
+{
+    return m_basedisp.GetTypeInfoCount(pctinfo);
+}
+
+
+// IDispatch::GetTypeInfo - asks the CBaseDispatch helper for the type
+// information describing IID_IMediaEvent.
+
+STDMETHODIMP
+CMediaEvent::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IMediaEvent, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+// IDispatch::GetIDsOfNames - the incoming riid is ignored; the helper is
+// always asked about IID_IMediaEvent.
+STDMETHODIMP
+CMediaEvent::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IMediaEvent, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+// IDispatch::Invoke - dispatch the call through our type information.
+//
+// riid must be IID_NULL (the parameter is a dead leftover from an earlier
+// interface); anything else yields DISP_E_UNKNOWNINTERFACE. Otherwise the
+// result is the GetTypeInfo failure code or whatever ITypeInfo::Invoke
+// returns.
+STDMETHODIMP
+CMediaEvent::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    ITypeInfo *pTypeInfo;
+    HRESULT hr = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (SUCCEEDED(hr)) {
+        hr = pTypeInfo->Invoke((IMediaEvent *)this,
+                               dispidMember,
+                               wFlags,
+                               pdispparams,
+                               pvarResult,
+                               pexcepinfo,
+                               puArgErr);
+        pTypeInfo->Release();
+    }
+    return hr;
+}
+
+
+// --- CMediaPosition implementation ----------
+
+
+// Construct the object; both arguments are passed straight on to the
+// CUnknown base class and nothing else is initialised here.
+CMediaPosition::CMediaPosition(const TCHAR * name,LPUNKNOWN pUnk) :
+    CUnknown(name, pUnk)
+{
+}
+
+// Three argument overload. phr is accepted for signature compatibility
+// only: it is never read or written, and UNREFERENCED_PARAMETER marks it
+// as deliberately unused.
+CMediaPosition::CMediaPosition(const TCHAR * name,
+                               LPUNKNOWN pUnk,
+                               HRESULT * phr) :
+    CUnknown(name, pUnk)
+{
+    UNREFERENCED_PARAMETER(phr);
+}
+
+
+// Expose IMediaPosition; every other interface request is left to CUnknown.
+
+STDMETHODIMP
+CMediaPosition::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+    if (riid != IID_IMediaPosition) {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+    return GetInterface( (IMediaPosition *) this, ppv);
+}
+
+
+// IDispatch::GetTypeInfoCount - handled entirely by the embedded
+// CBaseDispatch helper (m_basedisp), whose result is returned unchanged.
+
+STDMETHODIMP
+CMediaPosition::GetTypeInfoCount(UINT * pctinfo)
+{
+    return m_basedisp.GetTypeInfoCount(pctinfo);
+}
+
+
+// IDispatch::GetTypeInfo - asks the CBaseDispatch helper for the type
+// information describing IID_IMediaPosition.
+
+STDMETHODIMP
+CMediaPosition::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IMediaPosition, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+// IDispatch::GetIDsOfNames - the incoming riid is ignored; the helper is
+// always asked about IID_IMediaPosition.
+STDMETHODIMP
+CMediaPosition::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IMediaPosition, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+// IDispatch::Invoke - dispatch the call through our type information.
+//
+// riid must be IID_NULL (the parameter is a dead leftover from an earlier
+// interface); anything else yields DISP_E_UNKNOWNINTERFACE. Otherwise the
+// result is the GetTypeInfo failure code or whatever ITypeInfo::Invoke
+// returns.
+STDMETHODIMP
+CMediaPosition::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    ITypeInfo *pTypeInfo;
+    HRESULT hr = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (SUCCEEDED(hr)) {
+        hr = pTypeInfo->Invoke((IMediaPosition *)this,
+                               dispidMember,
+                               wFlags,
+                               pdispparams,
+                               pvarResult,
+                               pexcepinfo,
+                               puArgErr);
+        pTypeInfo->Release();
+    }
+    return hr;
+}
+
+
+// --- IMediaPosition and IMediaSeeking pass through class ----------
+
+
+// Remember the pin whose connected peer will service our calls. The pin
+// pointer is stored as given; no reference is taken on it here. A NULL
+// pin is reported by writing E_POINTER through phr, which is otherwise
+// left untouched.
+CPosPassThru::CPosPassThru(const TCHAR *pName,
+                           LPUNKNOWN pUnk,
+                           HRESULT *phr,
+                           IPin *pPin) :
+    CMediaPosition(pName,pUnk),
+    m_pPin(pPin)
+{
+    if (NULL == pPin) {
+        *phr = E_POINTER;
+    }
+}
+
+
+// Expose IMediaSeeking here; IMediaPosition and everything else is left
+// to CMediaPosition. *ppv is cleared before either path runs.
+
+STDMETHODIMP
+CPosPassThru::NonDelegatingQueryInterface(REFIID riid,void **ppv)
+{
+    CheckPointer(ppv,E_POINTER);
+    *ppv = NULL;
+
+    if (riid != IID_IMediaSeeking) {
+        return CMediaPosition::NonDelegatingQueryInterface(riid,ppv);
+    }
+    return GetInterface( static_cast<IMediaSeeking *>(this), ppv);
+}
+
+
+// Fetch IMediaPosition from the pin connected to m_pPin.
+//
+// *ppMP is cleared first. On success it receives a referenced interface
+// that the caller must Release, and S_OK is returned. If there is no
+// connected pin, or the peer lacks the interface, the result is E_NOTIMPL.
+
+HRESULT
+CPosPassThru::GetPeer(IMediaPosition ** ppMP)
+{
+    *ppMP = NULL;
+
+    IPin *pPeerPin;
+    if (FAILED(m_pPin->ConnectedTo(&pPeerPin))) {
+        return E_NOTIMPL;
+    }
+
+    IMediaPosition *pPosition;
+    const HRESULT hrQuery =
+        pPeerPin->QueryInterface(IID_IMediaPosition, (void **) &pPosition);
+
+    // the pin itself is not needed whether or not the query worked
+    pPeerPin->Release();
+
+    if (FAILED(hrQuery)) {
+        return E_NOTIMPL;
+    }
+
+    *ppMP = pPosition;
+    return S_OK;
+}
+
+
+// Fetch IMediaSeeking from the pin connected to m_pPin.
+//
+// *ppMS is cleared first. On success it receives a referenced interface
+// that the caller must Release, and S_OK is returned. If there is no
+// connected pin, or the peer lacks the interface, the result is E_NOTIMPL.
+
+HRESULT
+CPosPassThru::GetPeerSeeking(IMediaSeeking ** ppMS)
+{
+    *ppMS = NULL;
+
+    IPin *pPeerPin;
+    if (FAILED(m_pPin->ConnectedTo(&pPeerPin))) {
+        return E_NOTIMPL;
+    }
+
+    IMediaSeeking *pSeeking;
+    const HRESULT hrQuery =
+        pPeerPin->QueryInterface(IID_IMediaSeeking, (void **) &pSeeking);
+
+    // the pin itself is not needed whether or not the query worked
+    pPeerPin->Release();
+
+    if (FAILED(hrQuery)) {
+        return E_NOTIMPL;
+    }
+
+    *ppMS = pSeeking;
+    return S_OK;
+}
+
+
+// --- IMediaSeeking methods ----------
+
+
+// Forward GetCapabilities to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::GetCapabilities(DWORD * pCaps)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->GetCapabilities(pCaps);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+// Forward CheckCapabilities to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::CheckCapabilities(DWORD * pCaps)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->CheckCapabilities(pCaps);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+// Forward IsFormatSupported to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::IsFormatSupported(const GUID * pFormat)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->IsFormatSupported(pFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward QueryPreferredFormat to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::QueryPreferredFormat(GUID *pFormat)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->QueryPreferredFormat(pFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward SetTimeFormat to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::SetTimeFormat(const GUID * pFormat)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->SetTimeFormat(pFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward GetTimeFormat to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::GetTimeFormat(GUID *pFormat)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->GetTimeFormat(pFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward IsUsingTimeFormat to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::IsUsingTimeFormat(const GUID * pFormat)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->IsUsingTimeFormat(pFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward ConvertTimeFormat to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::ConvertTimeFormat(LONGLONG * pTarget, const GUID * pTargetFormat,
+                                LONGLONG    Source, const GUID * pSourceFormat )
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->ConvertTimeFormat(pTarget, pTargetFormat,
+                                         Source, pSourceFormat);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward SetPositions to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::SetPositions( LONGLONG * pCurrent, DWORD CurrentFlags
+                          , LONGLONG * pStop, DWORD StopFlags )
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->SetPositions(pCurrent, CurrentFlags, pStop, StopFlags);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+// Forward GetPositions to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::GetPositions(LONGLONG *pCurrent, LONGLONG * pStop)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->GetPositions(pCurrent,pStop);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+// Shared helper for the IMediaSeeking getters that fill in one LONGLONG.
+//
+// pMethod - pointer to the IMediaSeeking member to call on the peer
+// pll     - passed through to that member as its only argument
+//
+// Returns the GetPeerSeeking failure code if no peer interface is
+// available, otherwise whatever the selected member returns.
+HRESULT
+CPosPassThru::GetSeekingLongLong
+( HRESULT (__stdcall IMediaSeeking::*pMethod)( LONGLONG * )
+, LONGLONG * pll
+)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    hr = (pSeeking->*pMethod)(pll);
+    pSeeking->Release();
+    return hr;
+}
+
+// Report the current position. GetMediaTime is tried first; any success
+// code from it is normalised to NOERROR. Only when it fails is the peer's
+// IMediaSeeking::GetCurrentPosition consulted instead.
+
+STDMETHODIMP
+CPosPassThru::GetCurrentPosition(LONGLONG *pCurrent)
+{
+    if (SUCCEEDED(GetMediaTime(pCurrent,NULL))) {
+        return NOERROR;
+    }
+    return GetSeekingLongLong( &IMediaSeeking::GetCurrentPosition, pCurrent );
+}
+
+
+// Ask the peer's IMediaSeeking for the stop position.
+STDMETHODIMP
+CPosPassThru::GetStopPosition(LONGLONG *pStop)
+{
+    const HRESULT hr =
+        GetSeekingLongLong( &IMediaSeeking::GetStopPosition, pStop );
+    return hr;
+}
+
+// Ask the peer's IMediaSeeking for the duration.
+STDMETHODIMP
+CPosPassThru::GetDuration(LONGLONG *pDuration)
+{
+    const HRESULT hr =
+        GetSeekingLongLong( &IMediaSeeking::GetDuration, pDuration );
+    return hr;
+}
+
+
+// Ask the peer's IMediaSeeking for the preroll.
+STDMETHODIMP
+CPosPassThru::GetPreroll(LONGLONG *pllPreroll)
+{
+    const HRESULT hr =
+        GetSeekingLongLong( &IMediaSeeking::GetPreroll, pllPreroll );
+    return hr;
+}
+
+
+// Forward GetAvailable to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::GetAvailable( LONGLONG *pEarliest, LONGLONG *pLatest )
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->GetAvailable( pEarliest, pLatest );
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward GetRate to the peer's IMediaSeeking; returns the
+// GetPeerSeeking failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::GetRate(double * pdRate)
+{
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->GetRate(pdRate);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+// Forward SetRate to the peer's IMediaSeeking. A rate of exactly zero is
+// refused with E_INVALIDARG before the peer is even looked up.
+STDMETHODIMP
+CPosPassThru::SetRate(double dRate)
+{
+    if (dRate == 0.0) {
+        return E_INVALIDARG;
+    }
+
+    IMediaSeeking *pSeeking;
+    HRESULT hr = GetPeerSeeking(&pSeeking);
+    if (SUCCEEDED(hr)) {
+        hr = pSeeking->SetRate(dRate);
+        pSeeking->Release();
+    }
+    return hr;
+}
+
+
+
+
+// --- IMediaPosition methods ----------
+
+
+// Forward get_Duration to the peer's IMediaPosition; returns the GetPeer
+// failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::get_Duration(REFTIME * plength)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->get_Duration(plength);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward get_CurrentPosition to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::get_CurrentPosition(REFTIME * pllTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->get_CurrentPosition(pllTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward put_CurrentPosition to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::put_CurrentPosition(REFTIME llTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->put_CurrentPosition(llTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward get_StopTime to the peer's IMediaPosition; returns the GetPeer
+// failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::get_StopTime(REFTIME * pllTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->get_StopTime(pllTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward put_StopTime to the peer's IMediaPosition; returns the GetPeer
+// failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::put_StopTime(REFTIME llTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->put_StopTime(llTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward get_PrerollTime to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::get_PrerollTime(REFTIME * pllTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->get_PrerollTime(pllTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward put_PrerollTime to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::put_PrerollTime(REFTIME llTime)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->put_PrerollTime(llTime);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward get_Rate to the peer's IMediaPosition; returns the GetPeer
+// failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::get_Rate(double * pdRate)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->get_Rate(pdRate);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward put_Rate to the peer's IMediaPosition. A rate of exactly zero
+// is refused with E_INVALIDARG before the peer is even looked up.
+STDMETHODIMP
+CPosPassThru::put_Rate(double dRate)
+{
+    if (dRate == 0.0) {
+        return E_INVALIDARG;
+    }
+
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->put_Rate(dRate);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward CanSeekForward to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::CanSeekForward(LONG *pCanSeekForward)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->CanSeekForward(pCanSeekForward);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// Forward CanSeekBackward to the peer's IMediaPosition; returns the
+// GetPeer failure code if no peer interface is available.
+STDMETHODIMP
+CPosPassThru::CanSeekBackward(LONG *pCanSeekBackward)
+{
+    IMediaPosition *pPosition;
+    HRESULT hr = GetPeer(&pPosition);
+    if (SUCCEEDED(hr)) {
+        hr = pPosition->CanSeekBackward(pCanSeekBackward);
+        pPosition->Release();
+    }
+    return hr;
+}
+
+
+// --- Implements the CRendererPosPassThru class ----------
+
+
+// Media times (eg current frame, field, sample etc) are passed through the
+// filtergraph in media samples. When a renderer gets a sample with media
+// times in it, it will call one of the RegisterMediaTime methods we expose
+// (one takes an IMediaSample, the other takes the media times direct). We
+// store the media times internally and return them in GetCurrentPosition.
+
+// All four arguments go straight to the CPosPassThru base constructor.
+// The stored media times start at zero with m_bReset TRUE, the state in
+// which GetMediaTime reports E_FAIL.
+CRendererPosPassThru::CRendererPosPassThru(const TCHAR *pName,
+					   LPUNKNOWN pUnk,
+					   HRESULT *phr,
+					   IPin *pPin) :
+    CPosPassThru(pName,pUnk,phr,pPin),
+    m_StartMedia(0),
+    m_EndMedia(0),
+    m_bReset(TRUE)
+{
+}
+
+
+// Sets the media times the object should report, taking them from a sample.
+//
+// The start and end values are read with IMediaSample::GetTime while
+// m_PositionLock is held. If that call fails nothing is stored and its
+// HRESULT is returned (debug builds assert that the failure is
+// VFW_E_SAMPLE_TIME_NOT_SET). On success the values are stored, m_bReset
+// is cleared and NOERROR is returned.
+
+HRESULT
+CRendererPosPassThru::RegisterMediaTime(IMediaSample *pMediaSample)
+{
+    ASSERT(pMediaSample);
+    LONGLONG llStart;
+    LONGLONG llEnd;
+
+    CAutoLock lock(&m_PositionLock);
+
+    const HRESULT hr = pMediaSample->GetTime(&llStart,&llEnd);
+    if (SUCCEEDED(hr)) {
+        m_StartMedia = llStart;
+        m_EndMedia = llEnd;
+        m_bReset = FALSE;
+        return NOERROR;
+    }
+
+    ASSERT(hr == VFW_E_SAMPLE_TIME_NOT_SET);
+    return hr;
+}
+
+
+// Sets the media times the object should report from explicit values.
+// The values are stored under m_PositionLock and m_bReset is cleared;
+// the call always returns NOERROR.
+
+HRESULT
+CRendererPosPassThru::RegisterMediaTime(LONGLONG StartTime,LONGLONG EndTime)
+{
+    CAutoLock lock(&m_PositionLock);
+
+    m_bReset = FALSE;
+    m_StartMedia = StartTime;
+    m_EndMedia = EndTime;
+
+    return NOERROR;
+}
+
+
+// Return the current media times registered in the object.
+//
+// pStartTime - receives the converted start time (asserted non-NULL)
+// pEndTime   - optional; when non-NULL and the start conversion succeeded
+//              it receives the converted end time
+//
+// Returns E_FAIL while m_bReset is TRUE. Otherwise the stored values are
+// run through ConvertTimeFormat with TIME_FORMAT_MEDIA_TIME as the source
+// format and the result of the last conversion performed is returned.
+// m_PositionLock is held for the whole call.
+
+HRESULT
+CRendererPosPassThru::GetMediaTime(LONGLONG *pStartTime,LONGLONG *pEndTime)
+{
+    ASSERT(pStartTime);
+
+    CAutoLock lock(&m_PositionLock);
+    if (m_bReset == TRUE) {
+        return E_FAIL;
+    }
+
+    const HRESULT hrStart =
+        ConvertTimeFormat( pStartTime, 0, m_StartMedia, &TIME_FORMAT_MEDIA_TIME );
+
+    // We don't have to return the end time
+    if (pEndTime == NULL || FAILED(hrStart)) {
+        return hrStart;
+    }
+    return ConvertTimeFormat( pEndTime, 0, m_EndMedia, &TIME_FORMAT_MEDIA_TIME );
+}
+
+
+// Resets the media times we hold
+
+HRESULT
+CRendererPosPassThru::ResetMediaTime()
+{
+    // Invalidate the stored times; GetMediaTime fails until new ones arrive
+    CAutoLock cAutoLock(&m_PositionLock);
+
+    m_bReset = TRUE;
+    m_EndMedia = 0;
+    m_StartMedia = 0;
+
+    return NOERROR;
+}
+
+// Intended to be called by the owning filter during EOS processing so
+// that the media times can be adjusted to the stop time.  This ensures
+// that GetCurrentPosition will actually get to the stop position.
+HRESULT
+CRendererPosPassThru::EOS()
+{
+    // Without registered media times there is nothing to adjust
+    if (m_bReset == TRUE) {
+        return E_FAIL;
+    }
+
+    LONGLONG llStop;
+    const HRESULT hr = GetStopPosition(&llStop);
+    if (SUCCEEDED(hr)) {
+        // Snap both stored times to the stop position
+        CAutoLock cAutoLock(&m_PositionLock);
+        m_EndMedia = llStop;
+        m_StartMedia = llStop;
+    }
+    return hr;
+}
+
+// -- CSourceSeeking implementation ------------
+
+CSourceSeeking::CSourceSeeking(
+    const TCHAR * pName,
+    LPUNKNOWN pUnk,
+    HRESULT* phr,
+    CCritSec * pLock) :
+        CUnknown(pName, pUnk),
+        m_pLock(pLock),
+        m_rtStart((long)0)
+{
+    // normal playback rate
+    m_dRateSeeking = 1.0;
+
+    // stop position and duration both start at half the largest 64-bit value
+    m_rtStop = _I64_MAX / 2;
+    m_rtDuration = m_rtStop;
+
+    // capabilities advertised through GetCapabilities/CheckCapabilities
+    const DWORD dwDefaultCaps =
+          AM_SEEKING_CanSeekForwards
+        | AM_SEEKING_CanSeekBackwards
+        | AM_SEEKING_CanSeekAbsolute
+        | AM_SEEKING_CanGetStopPos
+        | AM_SEEKING_CanGetDuration;
+    m_dwSeekingCaps = dwDefaultCaps;
+}
+
+HRESULT CSourceSeeking::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    // Anything other than IMediaSeeking is handled by the base class
+    if(riid != IID_IMediaSeeking) {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+
+    CheckPointer(ppv, E_POINTER);
+    IMediaSeeking * const pSeeking = static_cast<IMediaSeeking *>(this);
+    return GetInterface(pSeeking, ppv);
+}
+
+
+HRESULT CSourceSeeking::IsFormatSupported(const GUID * pFormat)
+{
+    CheckPointer(pFormat, E_POINTER);
+
+    // seeking in REFERENCE_TIME units is the one format on offer
+    if(*pFormat == TIME_FORMAT_MEDIA_TIME) {
+        return S_OK;
+    }
+    return S_FALSE;
+}
+
+HRESULT CSourceSeeking::QueryPreferredFormat(GUID *pFormat)
+{
+    // The preferred format is always TIME_FORMAT_MEDIA_TIME.
+    CheckPointer(pFormat, E_POINTER);
+
+    *pFormat = TIME_FORMAT_MEDIA_TIME;
+
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::SetTimeFormat(const GUID * pFormat)
+{
+    CheckPointer(pFormat, E_POINTER);
+
+    // There is no state to change: accept TIME_FORMAT_MEDIA_TIME and
+    // reject every other format.
+    if(*pFormat == TIME_FORMAT_MEDIA_TIME) {
+        return S_OK;
+    }
+    return E_INVALIDARG;
+}
+
+HRESULT CSourceSeeking::IsUsingTimeFormat(const GUID * pFormat)
+{
+    CheckPointer(pFormat, E_POINTER);
+
+    // TIME_FORMAT_MEDIA_TIME is the only format ever in use
+    if(*pFormat == TIME_FORMAT_MEDIA_TIME) {
+        return S_OK;
+    }
+    return S_FALSE;
+}
+
+HRESULT CSourceSeeking::GetTimeFormat(GUID *pFormat)
+{
+    // The current format is always TIME_FORMAT_MEDIA_TIME.
+    CheckPointer(pFormat, E_POINTER);
+
+    *pFormat = TIME_FORMAT_MEDIA_TIME;
+
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::GetDuration(LONGLONG *pDuration)
+{
+    CheckPointer(pDuration, E_POINTER);
+
+    // read the duration under the seeking lock
+    {
+        CAutoLock lock(m_pLock);
+        *pDuration = m_rtDuration;
+    }
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::GetStopPosition(LONGLONG *pStop)
+{
+    CheckPointer(pStop, E_POINTER);
+
+    // read the stop position under the seeking lock
+    {
+        CAutoLock lock(m_pLock);
+        *pStop = m_rtStop;
+    }
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::GetCurrentPosition(LONGLONG *pCurrent)
+{
+    // Not implemented here: reporting the current position is normally
+    // a renderer's job rather than a source filter's. pCurrent is
+    // left untouched.
+    return E_NOTIMPL;
+}
+
+HRESULT CSourceSeeking::GetCapabilities( DWORD * pCapabilities )
+{
+    // Report the capability mask as it currently stands in m_dwSeekingCaps.
+    CheckPointer(pCapabilities, E_POINTER);
+
+    *pCapabilities = m_dwSeekingCaps;
+
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::CheckCapabilities( DWORD * pCapabilities )
+{
+    CheckPointer(pCapabilities, E_POINTER);
+
+    // any requested bit missing from our mask means "not all supported"
+    const DWORD dwMissing = *pCapabilities & ~m_dwSeekingCaps;
+    if(dwMissing) {
+        return S_FALSE;
+    }
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::ConvertTimeFormat( LONGLONG * pTarget, const GUID * pTargetFormat,
+                           LONGLONG    Source, const GUID * pSourceFormat )
+{
+    CheckPointer(pTarget, E_POINTER);
+
+    // A null format pointer stands for the current format, which is always
+    // TIME_FORMAT_MEDIA_TIME, so the only "conversion" offered is a copy.
+    if(pTargetFormat != 0 && !(*pTargetFormat == TIME_FORMAT_MEDIA_TIME)) {
+        return E_INVALIDARG;
+    }
+    if(pSourceFormat != 0 && !(*pSourceFormat == TIME_FORMAT_MEDIA_TIME)) {
+        return E_INVALIDARG;
+    }
+
+    *pTarget = Source;
+    return S_OK;
+}
+
+
+// Update the start and/or stop position and notify the derived class.
+//
+// pCurrent/CurrentFlags describe the new start position (absolute or
+// relative); pStop/StopFlags describe the new stop position (absolute,
+// relative or incremental, i.e. relative to the new start). A flags value
+// of zero leaves that position alone. Returns E_POINTER for a missing
+// pointer, E_INVALIDARG for unsupported flags, otherwise the result of the
+// ChangeStop/ChangeStart notifications.
+HRESULT CSourceSeeking::SetPositions( LONGLONG * pCurrent,  DWORD CurrentFlags
+                      , LONGLONG * pStop,  DWORD StopFlags )
+{
+    DWORD StopPosBits = StopFlags & AM_SEEKING_PositioningBitsMask;
+    DWORD StartPosBits = CurrentFlags & AM_SEEKING_PositioningBitsMask;
+
+    if(StopFlags) {
+        CheckPointer(pStop, E_POINTER);
+
+        // accept only relative, incremental, or absolute positioning
+        // (no modifier bits outside the positioning mask)
+        if(StopPosBits != StopFlags) {
+            return E_INVALIDARG;
+        }
+    }
+
+    if(CurrentFlags) {
+        CheckPointer(pCurrent, E_POINTER);
+        if(StartPosBits != AM_SEEKING_AbsolutePositioning &&
+           StartPosBits != AM_SEEKING_RelativePositioning) {
+            return E_INVALIDARG;
+        }
+    }
+
+
+    // scope for autolock
+    {
+        CAutoLock lock(m_pLock);
+
+        // set start position
+        if(StartPosBits == AM_SEEKING_AbsolutePositioning)
+        {
+            m_rtStart = *pCurrent;
+        }
+        else if(StartPosBits == AM_SEEKING_RelativePositioning)
+        {
+            m_rtStart += *pCurrent;
+        }
+
+        // set stop position (incremental is measured from the new start)
+        if(StopPosBits == AM_SEEKING_AbsolutePositioning)
+        {
+            m_rtStop = *pStop;
+        }
+        else if(StopPosBits == AM_SEEKING_IncrementalPositioning)
+        {
+            m_rtStop = m_rtStart + *pStop;
+        }
+        else if(StopPosBits == AM_SEEKING_RelativePositioning)
+        {
+            m_rtStop = m_rtStop + *pStop;
+        }
+    }
+
+
+    // Notify the derived class outside the lock. Both notifications are
+    // always attempted; a failure from ChangeStop is reported to the caller
+    // even when the following ChangeStart succeeds.
+    HRESULT hr = S_OK;
+    if(StopPosBits) {
+        hr = ChangeStop();
+    }
+    if(StartPosBits) {
+        const HRESULT hrStart = ChangeStart();
+        if(SUCCEEDED(hr)) {
+            hr = hrStart;
+        }
+    }
+
+    return hr;
+}
+
+
+// Report the current start and stop positions. Either pointer may be NULL
+// if the caller is not interested in that value. Always returns S_OK.
+HRESULT CSourceSeeking::GetPositions( LONGLONG * pCurrent, LONGLONG * pStop )
+{
+    // SetPositions updates both values under m_pLock, so read them under
+    // the same lock to get a consistent pair.
+    CAutoLock lock(m_pLock);
+
+    if(pCurrent) {
+        *pCurrent = m_rtStart;
+    }
+    if(pStop) {
+        *pStop = m_rtStop;
+    }
+
+    return S_OK;
+}
+
+
+HRESULT CSourceSeeking::GetAvailable( LONGLONG * pEarliest, LONGLONG * pLatest )
+{
+    // The available range runs from zero to the duration; both
+    // output pointers are optional.
+    if(pLatest) {
+        CAutoLock lock(m_pLock);
+        *pLatest = m_rtDuration;
+    }
+    if(pEarliest) {
+        *pEarliest = 0;
+    }
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::SetRate( double dRate)
+{
+    // Store the rate under the lock, then notify outside of it
+    m_pLock->Lock();
+    m_dRateSeeking = dRate;
+    m_pLock->Unlock();
+
+    return ChangeRate();
+}
+
+HRESULT CSourceSeeking::GetRate( double * pdRate)
+{
+    CheckPointer(pdRate, E_POINTER);
+
+    // read the rate under the seeking lock
+    {
+        CAutoLock lock(m_pLock);
+        *pdRate = m_dRateSeeking;
+    }
+    return S_OK;
+}
+
+HRESULT CSourceSeeking::GetPreroll(LONGLONG *pPreroll)
+{
+    // The preroll is always reported as zero.
+    CheckPointer(pPreroll, E_POINTER);
+
+    *pPreroll = 0;
+
+    return S_OK;
+}
+
+
+
+
+
+// --- CSourcePosition implementation ----------
+
+
+CSourcePosition::CSourcePosition(const TCHAR * pName,
+                                 LPUNKNOWN pUnk,
+                                 HRESULT* phr,
+                                 CCritSec * pLock)
+    : CMediaPosition(pName, pUnk)
+    , m_pLock(pLock)
+    , m_Start(CRefTime((LONGLONG)0))
+{
+    // normal rate, and a stop time at the largest 64-bit value
+    m_Rate = 1.0;
+    m_Stop = _I64_MAX;
+}
+
+
+STDMETHODIMP
+CSourcePosition::get_Duration(REFTIME * plength)
+{
+    CheckPointer(plength,E_POINTER);
+    ValidateReadWritePtr(plength,sizeof(REFTIME));
+
+    // copy the duration out under the lock
+    {
+        CAutoLock lock(m_pLock);
+        *plength = m_Duration;
+    }
+    return S_OK;
+}
+
+
+STDMETHODIMP
+CSourcePosition::put_CurrentPosition(REFTIME llTime)
+{
+    // store the new start under the lock, then notify outside of it
+    {
+        CAutoLock lock(m_pLock);
+        m_Start = llTime;
+    }
+    return ChangeStart();
+}
+
+
+STDMETHODIMP
+CSourcePosition::get_StopTime(REFTIME * pllTime)
+{
+    CheckPointer(pllTime,E_POINTER);
+    ValidateReadWritePtr(pllTime,sizeof(REFTIME));
+
+    // copy the stop time out under the lock
+    {
+        CAutoLock lock(m_pLock);
+        *pllTime = m_Stop;
+    }
+    return S_OK;
+}
+
+
+STDMETHODIMP
+CSourcePosition::put_StopTime(REFTIME llTime)
+{
+    // store the new stop time under the lock, then notify outside of it
+    {
+        CAutoLock lock(m_pLock);
+        m_Stop = llTime;
+    }
+    return ChangeStop();
+}
+
+
+STDMETHODIMP
+CSourcePosition::get_PrerollTime(REFTIME * pllTime)
+{
+    // The pointer is still validated, but preroll is not implemented
+    // and nothing is written through it.
+    CheckPointer(pllTime,E_POINTER);
+    ValidateReadWritePtr(pllTime,sizeof(REFTIME));
+
+    return E_NOTIMPL;
+}
+
+
+STDMETHODIMP
+CSourcePosition::put_PrerollTime(REFTIME llTime)
+{
+    // Preroll is not implemented; llTime is ignored.
+    return E_NOTIMPL;
+}
+
+
+STDMETHODIMP
+CSourcePosition::get_Rate(double * pdRate)
+{
+    CheckPointer(pdRate,E_POINTER);
+    ValidateReadWritePtr(pdRate,sizeof(double));
+
+    // copy the rate out under the lock
+    {
+        CAutoLock lock(m_pLock);
+        *pdRate = m_Rate;
+    }
+    return S_OK;
+}
+
+
+STDMETHODIMP
+CSourcePosition::put_Rate(double dRate)
+{
+    // store the new rate under the lock, then notify outside of it
+    {
+        CAutoLock lock(m_pLock);
+        m_Rate = dRate;
+    }
+    return ChangeRate();
+}
+
+
+// By default we can seek forwards
+
+STDMETHODIMP
+CSourcePosition::CanSeekForward(LONG *pCanSeekForward)
+{
+    CheckPointer(pCanSeekForward,E_POINTER);
+
+    // answered with the OLE Automation flavour of "true"
+    *pCanSeekForward = OATRUE;
+
+    return S_OK;
+}
+
+
+// By default we can seek backwards
+
+STDMETHODIMP
+CSourcePosition::CanSeekBackward(LONG *pCanSeekBackward)
+{
+    CheckPointer(pCanSeekBackward,E_POINTER);
+
+    // answered with the OLE Automation flavour of "true"
+    *pCanSeekBackward = OATRUE;
+
+    return S_OK;
+}
+
+
+// --- Implementation of CBasicAudio class ----------
+
+
+CBasicAudio::CBasicAudio(const TCHAR * pName,LPUNKNOWN punk)
+    : CUnknown(pName, punk)
+{
+    // Both arguments go straight to CUnknown; no other setup is done here.
+}
+
+// overridden to publicise our interfaces
+
+STDMETHODIMP
+CBasicAudio::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+
+    // IBasicAudio is answered here; everything else goes to the base class
+    if (riid == IID_IBasicAudio) {
+        IBasicAudio * const pAudio = (IBasicAudio *) this;
+        return GetInterface(pAudio, ppv);
+    }
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+STDMETHODIMP
+CBasicAudio::GetTypeInfoCount(UINT * pctinfo)
+{
+    // forwarded to the shared dispatch helper
+    const HRESULT hr = m_basedisp.GetTypeInfoCount(pctinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBasicAudio::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    // the dispatch helper looks up the type info for IBasicAudio
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IBasicAudio, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBasicAudio::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    // riid is not consulted; names are always resolved against IBasicAudio
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IBasicAudio, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBasicAudio::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    // riid is a leftover parameter and has to be IID_NULL
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    // fetch our type info, which performs the actual dispatch
+    ITypeInfo * pTypeInfo;
+    const HRESULT hrInfo = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (FAILED(hrInfo)) {
+        return hrInfo;
+    }
+
+    const HRESULT hrInvoke = pTypeInfo->Invoke(
+        (IBasicAudio *)this,
+        dispidMember,
+        wFlags,
+        pdispparams,
+        pvarResult,
+        pexcepinfo,
+        puArgErr);
+
+    pTypeInfo->Release();
+    return hrInvoke;
+}
+
+
+// --- IVideoWindow implementation ----------
+
+CBaseVideoWindow::CBaseVideoWindow(const TCHAR * pName,LPUNKNOWN punk)
+    : CUnknown(pName, punk)
+{
+    // Both arguments go straight to CUnknown; no other setup is done here.
+}
+
+
+// overridden to publicise our interfaces
+
+STDMETHODIMP
+CBaseVideoWindow::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+
+    // IVideoWindow is answered here; everything else goes to the base class
+    if (riid == IID_IVideoWindow) {
+        IVideoWindow * const pWindow = (IVideoWindow *) this;
+        return GetInterface(pWindow, ppv);
+    }
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+STDMETHODIMP
+CBaseVideoWindow::GetTypeInfoCount(UINT * pctinfo)
+{
+    // forwarded to the shared dispatch helper
+    const HRESULT hr = m_basedisp.GetTypeInfoCount(pctinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseVideoWindow::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    // the dispatch helper looks up the type info for IVideoWindow
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IVideoWindow, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseVideoWindow::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    // riid is not consulted; names are always resolved against IVideoWindow
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IVideoWindow, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseVideoWindow::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    // riid is a leftover parameter and has to be IID_NULL
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    // fetch our type info, which performs the actual dispatch
+    ITypeInfo * pTypeInfo;
+    const HRESULT hrInfo = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (FAILED(hrInfo)) {
+        return hrInfo;
+    }
+
+    const HRESULT hrInvoke = pTypeInfo->Invoke(
+        (IVideoWindow *)this,
+        dispidMember,
+        wFlags,
+        pdispparams,
+        pvarResult,
+        pexcepinfo,
+        puArgErr);
+
+    pTypeInfo->Release();
+    return hrInvoke;
+}
+
+
+// --- IBasicVideo implementation ----------
+
+
+CBaseBasicVideo::CBaseBasicVideo(const TCHAR * pName,LPUNKNOWN punk)
+    : CUnknown(pName, punk)
+{
+    // Both arguments go straight to CUnknown; no other setup is done here.
+}
+
+
+// overridden to publicise our interfaces
+
+STDMETHODIMP
+CBaseBasicVideo::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+
+    // Both IBasicVideo and IBasicVideo2 are served through the
+    // IBasicVideo2 pointer; everything else goes to the base class.
+    const bool bOurs = (riid == IID_IBasicVideo) || (riid == IID_IBasicVideo2);
+    if (!bOurs) {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+    return GetInterface( static_cast<IBasicVideo2 *>(this), ppv);
+}
+
+
+STDMETHODIMP
+CBaseBasicVideo::GetTypeInfoCount(UINT * pctinfo)
+{
+    // forwarded to the shared dispatch helper
+    const HRESULT hr = m_basedisp.GetTypeInfoCount(pctinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseBasicVideo::GetTypeInfo(
+  UINT itinfo,
+  LCID lcid,
+  ITypeInfo ** pptinfo)
+{
+    // the dispatch helper looks up the type info for IBasicVideo
+    const HRESULT hr =
+        m_basedisp.GetTypeInfo(IID_IBasicVideo, itinfo, lcid, pptinfo);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseBasicVideo::GetIDsOfNames(
+  REFIID riid,
+  OLECHAR  ** rgszNames,
+  UINT cNames,
+  LCID lcid,
+  DISPID * rgdispid)
+{
+    // riid is not consulted; names are always resolved against IBasicVideo
+    const HRESULT hr = m_basedisp.GetIDsOfNames(
+        IID_IBasicVideo, rgszNames, cNames, lcid, rgdispid);
+    return hr;
+}
+
+
+STDMETHODIMP
+CBaseBasicVideo::Invoke(
+  DISPID dispidMember,
+  REFIID riid,
+  LCID lcid,
+  WORD wFlags,
+  DISPPARAMS * pdispparams,
+  VARIANT * pvarResult,
+  EXCEPINFO * pexcepinfo,
+  UINT * puArgErr)
+{
+    // riid is a leftover parameter and has to be IID_NULL
+    if (IID_NULL != riid) {
+        return DISP_E_UNKNOWNINTERFACE;
+    }
+
+    // fetch our type info, which performs the actual dispatch
+    ITypeInfo * pTypeInfo;
+    const HRESULT hrInfo = GetTypeInfo(0, lcid, &pTypeInfo);
+    if (FAILED(hrInfo)) {
+        return hrInfo;
+    }
+
+    const HRESULT hrInvoke = pTypeInfo->Invoke(
+        (IBasicVideo *)this,
+        dispidMember,
+        wFlags,
+        pdispparams,
+        pvarResult,
+        pexcepinfo,
+        puArgErr);
+
+    pTypeInfo->Release();
+    return hrInvoke;
+}
+
+
+// --- Implementation of Deferred Commands ----------
+
+
+// Build a private copy of nArgs VARIANT arguments from pArgs.
+//
+// Simple value types are copied by value, BSTRs are duplicated with
+// SysAllocStringLen, and IUnknown/IDispatch pointers are AddRef'd, so the
+// copy stays usable after the caller's arguments have gone away. On
+// allocation failure *phr (if phr is non-NULL) is set to E_OUTOFMEMORY and
+// cArgs is reduced to the number of entries that were fully copied, so the
+// destructor only cleans up valid entries. *phr is left untouched on success.
+CDispParams::CDispParams(UINT nArgs, VARIANT* pArgs, HRESULT *phr)
+{
+    cNamedArgs = 0;
+    rgdispidNamedArgs = NULL;
+    cArgs = nArgs;
+    rgvarg = NULL;
+
+    if (cArgs == 0) {
+        return;
+    }
+
+    rgvarg = new VARIANT[cArgs];
+    if (NULL == rgvarg) {
+        cArgs = 0;
+        if (phr) {
+            *phr = E_OUTOFMEMORY;
+        }
+        return;
+    }
+
+    for (UINT i = 0; i < cArgs; i++) {
+
+        VARIANT * pDest = &rgvarg[i];
+        VARIANT * pSrc = &pArgs[i];
+
+        pDest->vt = pSrc->vt;
+        switch(pDest->vt) {
+
+        case VT_I4:
+            pDest->lVal = pSrc->lVal;
+            break;
+
+        case VT_UI1:
+            pDest->bVal = pSrc->bVal;
+            break;
+
+        case VT_I2:
+            pDest->iVal = pSrc->iVal;
+            break;
+
+        case VT_R4:
+            pDest->fltVal = pSrc->fltVal;
+            break;
+
+        case VT_R8:
+            pDest->dblVal = pSrc->dblVal;
+            break;
+
+        case VT_BOOL:
+            pDest->boolVal = pSrc->boolVal;
+            break;
+
+        case VT_ERROR:
+            pDest->scode = pSrc->scode;
+            break;
+
+        case VT_CY:
+            pDest->cyVal = pSrc->cyVal;
+            break;
+
+        case VT_DATE:
+            pDest->date = pSrc->date;
+            break;
+
+        case VT_BSTR:
+            if (pSrc->bstrVal == NULL) {
+                pDest->bstrVal = NULL;
+            } else {
+                // Duplicate the string through the BSTR allocator so the
+                // copy is a genuine BSTR (length prefix and terminator
+                // included) that we own and can free independently.
+                pDest->bstrVal = SysAllocStringLen(pSrc->bstrVal,
+                                                   SysStringLen(pSrc->bstrVal));
+                if (pDest->bstrVal == NULL) {
+                    // only entries [0, i) are valid from here on
+                    cArgs = i;
+                    if (phr) {
+                        *phr = E_OUTOFMEMORY;
+                    }
+                    return;
+                }
+            }
+            break;
+
+        case VT_UNKNOWN:
+            pDest->punkVal = pSrc->punkVal;
+            if (pDest->punkVal) {
+                pDest->punkVal->AddRef();
+            }
+            break;
+
+        case VT_DISPATCH:
+            pDest->pdispVal = pSrc->pdispVal;
+            if (pDest->pdispVal) {
+                pDest->pdispVal->AddRef();
+            }
+            break;
+
+        default:
+            // a type we haven't got round to adding yet!
+            ASSERT(0);
+            break;
+        }
+    }
+}
+
+
+// Release everything the constructor took ownership of: duplicated BSTRs,
+// interface references, and the VARIANT array itself.
+CDispParams::~CDispParams()
+{
+    for (UINT i = 0; i < cArgs; i++) {
+        switch(rgvarg[i].vt) {
+        case VT_BSTR:
+            // SysFreeString accepts NULL
+            SysFreeString(rgvarg[i].bstrVal);
+            break;
+
+        case VT_UNKNOWN:
+            if (rgvarg[i].punkVal) {
+                rgvarg[i].punkVal->Release();
+            }
+            break;
+
+        case VT_DISPATCH:
+            if (rgvarg[i].pdispVal) {
+                rgvarg[i].pdispVal->Release();
+            }
+            break;
+        }
+    }
+    delete[] rgvarg;
+}
+
+
+// lifetime is controlled by refcounts (see defer.h)
+
+// Construct a deferred command and, if every check passes, add it to pQ.
+//
+// pUnkExecutor is the object that will execute the command; it is not
+// AddRef'd here. time is converted from REFTIME to REFERENCE_TIME. The
+// arguments in pDispParams are copied into m_DispParams. Any failure is
+// reported through *phr and leaves the command off the queue.
+CDeferredCommand::CDeferredCommand(
+    CCmdQueue * pQ,
+    LPUNKNOWN   pUnk,
+    HRESULT *   phr,
+    LPUNKNOWN   pUnkExecutor,
+    REFTIME     time,
+    GUID*       iid,
+    long        dispidMethod,
+    short       wFlags,
+    long        nArgs,
+    VARIANT*    pDispParams,
+    VARIANT*    pvarResult,
+    short*      puArgErr,
+    BOOL        bStream
+    ) :
+        CUnknown(NAME("DeferredCommand"), pUnk),
+        m_pQueue(pQ),
+        m_pUnk(pUnkExecutor),
+        m_iid(iid),
+        m_dispidMethod(dispidMethod),
+        m_wFlags(wFlags),
+        m_DispParams(nArgs, pDispParams, phr),
+        m_pvarResult(pvarResult),
+        m_bStream(bStream),
+        m_hrResult(E_ABORT)
+
+{
+    // convert REFTIME to REFERENCE_TIME
+    COARefTime convertor(time);
+    m_time = convertor;
+
+    // no check of time validity - it's ok to queue a command that's
+    // already late
+
+    // m_DispParams reports an allocation failure through phr. A command
+    // whose arguments could not be copied must not reach the queue.
+    if (phr && FAILED(*phr)) {
+        return;
+    }
+
+    // check iid is supportable on pUnk by QueryInterface for it
+    IUnknown * pInterface;
+    HRESULT hr = m_pUnk->QueryInterface(GetIID(), (void**) &pInterface);
+    if (FAILED(hr)) {
+        if (phr) {
+            *phr = hr;
+        }
+        return;
+    }
+    pInterface->Release();
+
+
+    // !!! check dispidMethod and param/return types using typelib
+    ITypeInfo *pti;
+    hr = m_Dispatch.GetTypeInfo(*iid, 0, 0, &pti);
+    if (FAILED(hr)) {
+        if (phr) {
+            *phr = hr;
+        }
+        return;
+    }
+    // !!! some sort of ITypeInfo validity check here
+    pti->Release();
+
+
+    // Fix up the dispid for put and get
+    if (wFlags == DISPATCH_PROPERTYPUT) {
+        m_DispParams.cNamedArgs = 1;
+        m_DispId = DISPID_PROPERTYPUT;
+        m_DispParams.rgdispidNamedArgs = &m_DispId;
+    }
+
+    // all checks ok - add to queue (the queue takes its own reference)
+    hr = pQ->Insert(this);
+    if (FAILED(hr) && phr) {
+        *phr = hr;
+    }
+}
+
+
+// refcounts are held by caller of InvokeAt... and by list. So if
+// we get here, we can't be on the list
+
+#if 0
+CDeferredCommand::~CDeferredCommand()
+{
+    // this assert is invalid since if the queue is deleted while we are
+    // still on the queue, we will have been removed by the queue and this
+    // m_pQueue will not have been modified.
+    // ASSERT(m_pQueue == NULL);
+
+    // we don't hold a ref count on pUnk, which is the object that should
+    // execute the command.
+    // This is because there would otherwise be a circular refcount problem
+    // since pUnk probably owns the CmdQueue object that has a refcount
+    // on us.
+    // The lifetime of pUnk is guaranteed by it being part of, or lifetime
+    // controlled by, our parent object. As long as we are on the list, pUnk
+    // must be valid. Once we are off the list, we do not use pUnk.
+
+}
+#endif
+
+
+// overridden to publicise our interfaces
+
+STDMETHODIMP
+CDeferredCommand::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+
+    // IDeferredCommand is answered here; everything else goes to the base
+    if (riid == IID_IDeferredCommand) {
+        IDeferredCommand * const pCommand = (IDeferredCommand *) this;
+        return GetInterface(pCommand, ppv);
+    }
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+// remove from q. this will reduce the refcount by one (since the q
+// holds a count) but can't make us go away since he must have a
+// refcount in order to call this method.
+
+STDMETHODIMP
+CDeferredCommand::Cancel()
+{
+    // a NULL queue pointer means we have already left the queue
+    if (m_pQueue == NULL) {
+        return VFW_E_ALREADY_CANCELLED;
+    }
+
+    // take ourselves off the queue; forget the queue only on success
+    const HRESULT hrRemove = m_pQueue->Remove(this);
+    if (SUCCEEDED(hrRemove)) {
+        m_pQueue = NULL;
+        return S_OK;
+    }
+    return hrRemove;
+}
+
+
+STDMETHODIMP
+CDeferredCommand::Confidence(LONG* pConfidence)
+{
+    // Not implemented; pConfidence is left untouched.
+    return E_NOTIMPL;
+}
+
+
+STDMETHODIMP
+CDeferredCommand::GetHResult(HRESULT * phrResult)
+{
+    CheckPointer(phrResult,E_POINTER);
+    ValidateReadWritePtr(phrResult,sizeof(HRESULT));
+
+    // While we still point at a queue there is no result to report yet
+    if (m_pQueue == NULL) {
+        *phrResult = m_hrResult;
+        return S_OK;
+    }
+    return E_ABORT;
+}
+
+
+// set the time to be a new time (checking that it is valid) and
+// then requeue
+
+// Move this command to a new time and requeue it.
+//
+// Returns VFW_E_ALREADY_CANCELLED if the command is no longer queued,
+// VFW_E_TIME_ALREADY_PASSED if newtime is already in the past, or the
+// result of removing/re-inserting the command.
+STDMETHODIMP
+CDeferredCommand::Postpone(REFTIME newtime)
+{
+    // Cancel and Invoke clear m_pQueue; there is nothing to requeue then
+    if (m_pQueue == NULL) {
+        return VFW_E_ALREADY_CANCELLED;
+    }
+
+    // convert REFTIME to REFERENCE_TIME
+    COARefTime convertor(newtime);
+
+    // check that the time has not passed
+    if (m_pQueue->CheckTime(convertor, IsStreamTime())) {
+        return VFW_E_TIME_ALREADY_PASSED;
+    }
+
+    // extract from list (drops the reference the queue held on us; the
+    // caller still holds one in order to be calling this method)
+    HRESULT hr = m_pQueue->Remove(this);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // change time
+    m_time = convertor;
+
+    // requeue at the position matching the new time
+    hr = m_pQueue->Insert(this);
+
+    return hr;
+}
+
+
+// Execute the command on the target object and take it off the queue.
+//
+// The result of the dispatch call is stored in m_hrResult (readable through
+// GetHResult). The return value reports whether the command could be
+// dispatched and dequeued, not the result of the invoked method.
+HRESULT
+CDeferredCommand::Invoke()
+{
+    // check that we are still outstanding
+    if (m_pQueue == NULL) {
+        return VFW_E_ALREADY_CANCELLED;
+    }
+
+    // get the type info
+    ITypeInfo* pti;
+    HRESULT hr = m_Dispatch.GetTypeInfo(GetIID(), 0, 0, &pti);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // qi for the expected interface and then invoke it. Note that we have to
+    // treat the returned interface as IUnknown since we don't know its type.
+    IUnknown* pInterface;
+
+    hr = m_pUnk->QueryInterface(GetIID(), (void**) &pInterface);
+    if (FAILED(hr)) {
+        pti->Release();
+        return hr;
+    }
+
+    // start from a cleared EXCEPINFO so the strings can be freed safely
+    EXCEPINFO expinfo;
+    ZeroMemory(&expinfo, sizeof(expinfo));
+    UINT uArgErr = 0;
+    m_hrResult = pti->Invoke(
+        pInterface,
+        GetMethod(),
+        GetFlags(),
+        GetParams(),
+        GetResult(),
+        &expinfo,
+        &uArgErr);
+
+    // the callee may have filled in exception strings that we own
+    SysFreeString(expinfo.bstrSource);
+    SysFreeString(expinfo.bstrDescription);
+    SysFreeString(expinfo.bstrHelpFile);
+
+    // release the interface we QI'd for
+    pInterface->Release();
+    pti->Release();
+
+
+    // remove from list whether or not successful
+    // or we loop indefinitely. Remove drops the queue's reference on us,
+    // so clear m_pQueue first and touch no member afterwards.
+    CCmdQueue * const pQueue = m_pQueue;
+    m_pQueue = NULL;
+    return pQueue->Remove(this);
+}
+
+
+
+// --- CCmdQueue methods ----------
+
+
+CCmdQueue::CCmdQueue()
+    : m_listPresentation(NAME("Presentation time command list"))
+    , m_listStream(NAME("Stream time command list"))
+    , m_evDue(TRUE)         // manual reset
+    , m_dwAdvise(0)         // no advise outstanding
+    , m_pClock(NULL)        // no clock until SetSyncSource
+    , m_bRunning(FALSE)
+{
+    // Both lists start empty; nothing else to set up.
+}
+
+
+CCmdQueue::~CCmdQueue()
+{
+    // Each list entry carries a reference taken by Insert: drop it for
+    // every entry and then empty the list.
+    for (WXLIST_POSITION posPresent = m_listPresentation.GetHeadPosition(); posPresent; ) {
+        CDeferredCommand* pEntry = m_listPresentation.GetNext(posPresent);
+        pEntry->Release();
+    }
+    m_listPresentation.RemoveAll();
+
+    for (WXLIST_POSITION posStream = m_listStream.GetHeadPosition(); posStream; ) {
+        CDeferredCommand* pEntry = m_listStream.GetNext(posStream);
+        pEntry->Release();
+    }
+    m_listStream.RemoveAll();
+
+    // No clock means no advise to cancel and nothing to release
+    if (!m_pClock) {
+        return;
+    }
+    if (m_dwAdvise) {
+        m_pClock->Unadvise(m_dwAdvise);
+        m_dwAdvise = 0;
+    }
+    m_pClock->Release();
+}
+
+
+// returns a new CDeferredCommand object that will be initialised with
+// the parameters and will be added to the queue during construction.
+// returns S_OK if successfully created otherwise an error and
+// no object has been queued.
+
+// Create a CDeferredCommand; its constructor adds it to this queue.
+//
+// On success *ppCmd receives the new command and S_OK is returned. On any
+// failure *ppCmd is NULL, nothing is left on the queue and the partially
+// constructed command is destroyed.
+HRESULT
+CCmdQueue::New(
+    CDeferredCommand **ppCmd,
+    LPUNKNOWN   pUnk,           // this object will execute command
+    REFTIME     time,
+    GUID*       iid,
+    long        dispidMethod,
+    short       wFlags,
+    long        cArgs,
+    VARIANT*    pDispParams,
+    VARIANT*    pvarResult,
+    short*      puArgErr,
+    BOOL        bStream
+)
+{
+    CAutoLock lock(&m_Lock);
+
+    HRESULT hr = S_OK;
+    *ppCmd = NULL;
+
+    CDeferredCommand* pCmd;
+    pCmd = new CDeferredCommand(
+                    this,
+                    NULL,           // not aggregated
+                    &hr,
+                    pUnk,           // this guy will execute
+                    time,
+                    iid,
+                    dispidMethod,
+                    wFlags,
+                    cArgs,
+                    pDispParams,
+                    pvarResult,
+                    puArgErr,
+                    bStream);
+
+    if (pCmd == NULL) {
+        return E_OUTOFMEMORY;
+    }
+
+    if (FAILED(hr)) {
+        // Construction failed. If the command still reached the queue,
+        // Remove drops the queue's reference, which is the only one and so
+        // destroys it. Otherwise nobody holds a reference and we delete it.
+        if (FAILED(Remove(pCmd))) {
+            delete pCmd;
+        }
+        return hr;
+    }
+
+    *ppCmd = pCmd;
+    return hr;
+}
+
+
+HRESULT
+CCmdQueue::Insert(CDeferredCommand* pCmd)
+{
+    CAutoLock lock(&m_Lock);
+
+    // the list holds its own reference on every entry
+    pCmd->AddRef();
+
+    // stream-time and presentation-time commands live on separate lists
+    CGenericList<CDeferredCommand> * const pTargetList =
+        pCmd->IsStreamTime() ? &m_listStream : &m_listPresentation;
+
+    // walk forward to the first entry that is due later than pCmd
+    WXLIST_POSITION posInsert = pTargetList->GetHeadPosition();
+    while (posInsert) {
+        if (!(pTargetList->Get(posInsert)->GetTime() <= pCmd->GetTime())) {
+            break;
+        }
+        pTargetList->GetNext(posInsert);
+    }
+
+    // either go in front of that entry or, if there is none, at the tail
+    if (posInsert) {
+        pTargetList->AddBefore(posInsert, pCmd);
+    } else {
+        pTargetList->AddTail(pCmd);
+    }
+
+    SetTimeAdvise();
+    return S_OK;
+}
+
+
+HRESULT
+CCmdQueue::Remove(CDeferredCommand* pCmd)
+{
+    CAutoLock lock(&m_Lock);
+
+    // look on the list that matches the command's time base
+    CGenericList<CDeferredCommand> * const pSourceList =
+        pCmd->IsStreamTime() ? &m_listStream : &m_listPresentation;
+
+    // linear search for the entry
+    WXLIST_POSITION posFound = pSourceList->GetHeadPosition();
+    while (posFound) {
+        if (pSourceList->Get(posFound) == pCmd) {
+            break;
+        }
+        pSourceList->GetNext(posFound);
+    }
+
+    // ran off the end without finding it
+    if (!posFound) {
+        return VFW_E_NOT_FOUND;
+    }
+
+    // unlink it and give back the reference Insert took
+    pSourceList->Remove(posFound);
+    pCmd->Release();
+
+    // the earliest due time may have changed
+    SetTimeAdvise();
+    return S_OK;
+}
+
+
+// set the clock used for timing
+
+// Set (or clear, with NULL) the reference clock used for timing.
+//
+// Any advise outstanding on the previous clock is cancelled and a fresh one
+// is requested from the new clock for the earliest queued command.
+HRESULT
+CCmdQueue::SetSyncSource(IReferenceClock* pClock)
+{
+    CAutoLock lock(&m_Lock);
+
+    // addref the new clock first in case they are the same
+    if (pClock) {
+        pClock->AddRef();
+    }
+
+    // kill any advise on the old clock
+    if (m_pClock) {
+        if (m_dwAdvise) {
+            m_pClock->Unadvise(m_dwAdvise);
+            m_dwAdvise = 0;
+        }
+        m_pClock->Release();
+    }
+    m_pClock = pClock;
+
+    // SetTimeAdvise only asks for an advise when the earliest time differs
+    // from m_tCurrentAdvise. The advise for that time belonged to the old
+    // clock, so forget it or the new clock would never be asked.
+    m_tCurrentAdvise = CRefTime();
+
+    // set up a new advise
+    SetTimeAdvise();
+    return S_OK;
+}
+
+
+// set up a timer event with the reference clock
+
+void
+CCmdQueue::SetTimeAdvise(void)
+{
+    // without a clock there is nobody to ask for an advise
+    if (!m_pClock) {
+	return;
+    }
+
+    // reset the event whenever we are requesting a new signal
+    m_evDue.Reset();
+
+    // earliest due time found so far; it is compared against TimeZero below to mean "none"
+    CRefTime current;
+
+    // the head of the presentation list is its earliest entry (Insert keeps the list sorted)
+    if (m_listPresentation.GetCount() > 0) {
+
+	WXLIST_POSITION pos = m_listPresentation.GetHeadPosition();
+	current = m_listPresentation.Get(pos)->GetTime();
+    }
+
+    // stream times are only considered while m_bRunning is set
+    if (m_bRunning) {
+
+	CRefTime t;
+
+	if (m_listStream.GetCount() > 0) {
+
+	    WXLIST_POSITION pos = m_listStream.GetHeadPosition();
+	    t = m_listStream.Get(pos)->GetTime();
+
+	    // add on stream time offset to get presentation time
+	    t += m_StreamTimeOffset;
+
+	    // take it if nothing was found yet or if it is earlier
+	    if ((current == TimeZero) || (t < current)) {
+		current = t;
+	    }
+	}
+    }
+
+    // only re-advise when there is a due time and it differs from the last one requested
+    if ((current > TimeZero) && (current != m_tCurrentAdvise)) {
+	if (m_dwAdvise) {
+	    m_pClock->Unadvise(m_dwAdvise);
+	    // reset the event whenever we are requesting a new signal
+	    m_evDue.Reset();
+	}
+
+	// ask for time advice - the first two params are either
+	// stream time offset and stream time or
+	// presentation time and 0. we always use the latter
+	HRESULT hr = m_pClock->AdviseTime(
+		    (REFERENCE_TIME)current,
+		    TimeZero,
+		    (HEVENT) HANDLE(m_evDue),
+		    &m_dwAdvise);
+
+	ASSERT(SUCCEEDED(hr));
+	m_tCurrentAdvise = current;    // remembered even if AdviseTime failed (only asserted above)
+    }
+}
+
+
+// switch to run mode. Streamtime to Presentation time mapping known.
+
+HRESULT
+CCmdQueue::Run(REFERENCE_TIME tStreamTimeOffset)
+{
+    CAutoLock lock(&m_Lock);
+
+    // Stream times can now be mapped to presentation times
+    m_bRunning = TRUE;
+    m_StreamTimeOffset = tStreamTimeOffset;
+
+    // stream-time commands now count towards the earliest due time
+    SetTimeAdvise();
+
+    return S_OK;
+}
+
+
+// switch to Stopped or Paused mode. Time mapping not known.
+
+HRESULT
+CCmdQueue::EndRun()
+{
+    CAutoLock lock(&m_Lock);
+
+    // Stream-time commands stop being considered from now on
+    m_bRunning = FALSE;
+
+    // recompute the advise with stream times left out
+    SetTimeAdvise();
+
+    return S_OK;
+}
+
+
+// return a pointer to the next due command. Blocks for msTimeout
+// milliseconds until there is a due command.
+// Stream-time commands will only become due between Run and Endrun calls.
+// The command remains queued until invoked or cancelled.
+// Returns E_ABORT if timeout occurs, otherwise S_OK (or other error).
+//
+// returns an AddRef'd object
+
+HRESULT
+CCmdQueue::GetDueCommand(CDeferredCommand ** ppCmd, long msTimeout)
+{
+    // loop until we timeout or find a due command
+    for (;;) {
+
+	{
+	    CAutoLock lock(&m_Lock);    // held only while inspecting the lists, not while waiting
+
+
+	    // find the earliest command
+	    CDeferredCommand * pCmd = NULL;
+
+	    // check the presentation time and the
+	    // stream time list to find the earliest
+
+	    if (m_listPresentation.GetCount() > 0) {
+		WXLIST_POSITION pos = m_listPresentation.GetHeadPosition();
+		pCmd = m_listPresentation.Get(pos);
+	    }
+
+	    if (m_bRunning && (m_listStream.GetCount() > 0)) {    // stream list is ignored unless running
+		WXLIST_POSITION pos = m_listStream.GetHeadPosition();
+		CDeferredCommand* pStrm = m_listStream.Get(pos);
+
+		CRefTime t = pStrm->GetTime() + m_StreamTimeOffset;    // stream time plus offset, for comparison
+		if (!pCmd || (t < pCmd->GetTime())) {
+		    pCmd = pStrm;
+		}
+	    }
+
+	    //	if we have found one, is it due?
+	    if (pCmd) {
+		if (CheckTime(pCmd->GetTime(), pCmd->IsStreamTime())) {
+
+		    // yes it's due - addref it
+		    pCmd->AddRef();
+		    *ppCmd = pCmd;
+		    return S_OK;
+		}
+	    }
+	}
+
+	// block until the advise is signalled; any other wait result ends the call with E_ABORT
+	if (WaitForSingleObject(m_evDue, msTimeout) != WAIT_OBJECT_0) {
+	    return E_ABORT;    // *ppCmd is not written on this path
+	}
+    }
+}
+
+
+// return a pointer to a command that will be due for a given time.
+// Pass in a stream time here. The stream time offset will be passed
+// in via the Run method.
+// Commands remain queued until invoked or cancelled.
+// This method will not block. It returns VFW_E_NOT_FOUND if there are no
+// commands due yet, in which case *ppCmd is not written.
+//
+// returns an AddRef'd object
+
+HRESULT
+CCmdQueue::GetCommandDueFor(REFERENCE_TIME rtStream, CDeferredCommand**ppCmd)
+{
+    CAutoLock lock(&m_Lock);
+
+    CRefTime tStream(rtStream);
+
+    // find the earliest stream and presentation time commands
+    // (the head of each list, since Insert keeps them sorted)
+    CDeferredCommand* pStream = NULL;
+    if (m_listStream.GetCount() > 0) {
+        WXLIST_POSITION pos = m_listStream.GetHeadPosition();
+        pStream = m_listStream.Get(pos);
+    }
+    CDeferredCommand* pPresent = NULL;
+    if (m_listPresentation.GetCount() > 0) {
+        WXLIST_POSITION pos = m_listPresentation.GetHeadPosition();
+        pPresent = m_listPresentation.Get(pos);
+    }
+
+    // is there a presentation time that has passed already
+    if (pPresent && CheckTime(pPresent->GetTime(), FALSE)) {
+        pPresent->AddRef();
+        *ppCmd = pPresent;
+        return S_OK;
+    }
+
+    // is there a stream time command due before this stream time
+    if (pStream && (pStream->GetTime() <= tStream)) {
+        // reference the command we actually hand out; pPresent may be NULL
+        pStream->AddRef();
+        *ppCmd = pStream;
+        return S_OK;
+    }
+
+    // if we are running, we can map presentation times to
+    // stream time. In this case, is there a presentation time command
+    // that will be due before this stream time is presented?
+    if (m_bRunning && pPresent) {
+
+        // this stream time will appear at...
+        tStream += m_StreamTimeOffset;
+
+        // due before that?
+        if (pPresent->GetTime() <= tStream) {
+            // every successful return hands out an AddRef'd command
+            pPresent->AddRef();
+            *ppCmd = pPresent;
+            return S_OK;
+        }
+    }
+
+    // no commands due yet
+    return VFW_E_NOT_FOUND;
+}
+
diff --git a/plugins/GSdx_legacy/baseclasses/ctlutil.h b/plugins/GSdx_legacy/baseclasses/ctlutil.h
new file mode 100644
index 0000000000..ce4490cdef
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/ctlutil.h
@@ -0,0 +1,919 @@
+//------------------------------------------------------------------------------
+// File: CtlUtil.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// Base classes implementing IDispatch parsing for the basic control dual
+// interfaces. Derive from these and implement just the custom method and
+// property methods. We also implement CPosPassThru that can be used by
+// renderers and transforms to pass by IMediaPosition and IMediaSeeking
+
+#ifndef __CTLUTIL__
+#define __CTLUTIL__
+
+// OLE Automation has different ideas of TRUE and FALSE
+
+#define OATRUE (-1)
+#define OAFALSE (0)
+
+
+// It's possible that we could replace this class with CreateStdDispatch
+
+class CBaseDispatch
+{
+    ITypeInfo * m_pti;      // starts out NULL (see the constructor below)
+
+public:
+
+    CBaseDispatch() : m_pti(NULL) {}
+    ~CBaseDispatch();
+
+    /* Helpers mirroring the IDispatch methods (this class itself has no base class) */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+    // note the extra leading riid compared with the GetTypeInfo declared by the classes below
+    STDMETHODIMP GetTypeInfo(
+      REFIID riid,
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+};
+
+
+class AM_NOVTABLE CMediaControl :
+    public IMediaControl,
+    public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+public:
+
+    CMediaControl(const TCHAR *, LPUNKNOWN);
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods -- the only IMediaControl members this class declares */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+};
+
+
+class AM_NOVTABLE CMediaEvent :
+    public IMediaEventEx,
+    public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+public:
+
+    CMediaEvent(const TCHAR *, LPUNKNOWN);
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods -- the only IMediaEventEx members this class declares */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+};
+
+
+class AM_NOVTABLE CMediaPosition :
+    public IMediaPosition,
+    public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+
+public:
+
+    CMediaPosition(const TCHAR *, LPUNKNOWN);
+    CMediaPosition(const TCHAR *, LPUNKNOWN, HRESULT *phr);   // same, plus an HRESULT pointer
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods -- the only IMediaPosition members this class declares */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+
+};
+
+
+// OA-compatibility means that we must use double as the RefTime value,
+// and REFERENCE_TIME (essentially a LONGLONG) within filters.
+// this class converts between the two
+
+class COARefTime : public CRefTime {
+public:
+    // NOTE(review): the factor 10000000 presumably converts between seconds and
+    // 100 ns units; nothing in this header states the units, so confirm.
+
+    // Default construction: nothing beyond CRefTime's own default constructor.
+    COARefTime() {
+    }
+
+    // Wrap an existing CRefTime value.
+    COARefTime(CRefTime t) : CRefTime(t) {
+    }
+
+    // Wrap a raw REFERENCE_TIME value.
+    COARefTime(REFERENCE_TIME t) : CRefTime(t) {
+    }
+
+    // From an OA double: scaled by 10000000 and truncated to LONGLONG.
+    COARefTime(double d) {
+        m_time = static_cast<LONGLONG>(d * 10000000);
+    }
+
+    // To an OA double: the stored value divided by 10000000.
+    operator double() {
+        return static_cast<double>(m_time) / 10000000;
+    }
+
+    // To the raw stored value.
+    operator REFERENCE_TIME() {
+        return m_time;
+    }
+
+    // Assign from an OA double (same scaling as the double constructor).
+    COARefTime& operator=(const double& rd) {
+        m_time = static_cast<LONGLONG>(rd * 10000000);
+        return *this;
+    }
+
+    // Assign a raw REFERENCE_TIME value unchanged.
+    COARefTime& operator=(const REFERENCE_TIME& rt) {
+        m_time = rt;
+        return *this;
+    }
+
+    // Comparisons: each one compares the stored m_time values directly.
+    BOOL operator==(const COARefTime& rhs) {
+        return m_time == rhs.m_time;
+    }
+
+    BOOL operator!=(const COARefTime& rhs) {
+        return m_time != rhs.m_time;
+    }
+
+    BOOL operator < (const COARefTime& rhs) {
+        return m_time < rhs.m_time;
+    }
+
+    BOOL operator > (const COARefTime& rhs) {
+        return m_time > rhs.m_time;
+    }
+
+    BOOL operator >= (const COARefTime& rhs) {
+        return m_time >= rhs.m_time;
+    }
+
+    BOOL operator <= (const COARefTime& rhs) {
+        return m_time <= rhs.m_time;
+    }
+
+    // Arithmetic: works on m_time and returns a new COARefTime.
+    COARefTime operator+(const COARefTime& rhs) {
+        return COARefTime(m_time + rhs.m_time);
+    }
+
+    COARefTime operator-(const COARefTime& rhs) {
+        return COARefTime(m_time - rhs.m_time);
+    }
+
+    // Multiply the stored value by an integer factor.
+    COARefTime operator*(LONG factor) {
+        return COARefTime(m_time * factor);
+    }
+
+    // Divide the stored value by an integer divisor (no zero check).
+    COARefTime operator/(LONG divisor) {
+        return COARefTime(m_time / divisor);
+    }
+
+private:
+    //  Prevent bugs from constructing from LONG (which gets
+    //  converted to double and then multiplied by 10000000
+    COARefTime(LONG);
+    int operator=(LONG);
+};
+
+
+// A utility class that handles IMediaPosition and IMediaSeeking on behalf
+// of single-input pin renderers, or transform filters.
+//
+// Renderers will expose this from the filter; transform filters will
+// expose it from the output pin and not the renderer.
+//
+// Create one of these, giving it your IPin* for your input pin, and delegate
+// all IMediaPosition methods to it. It will query the input pin for
+// IMediaPosition and respond appropriately.
+//
+// Call ForceRefresh if the pin connection changes.
+//
+// This class no longer caches the upstream IMediaPosition or IMediaSeeking
+// it acquires it on each method call. This means ForceRefresh is not needed.
+// The method is kept for source compatibility and to minimise the changes
+// if we need to put it back later for performance reasons.
+
+class CPosPassThru : public IMediaSeeking, public CMediaPosition
+{
+    IPin *m_pPin;
+    // NOTE(review): presumably the per-call upstream lookups described in the comment above this class -- confirm in the .cpp
+    HRESULT GetPeer(IMediaPosition **ppMP);
+    HRESULT GetPeerSeeking(IMediaSeeking **ppMS);
+
+public:
+
+    CPosPassThru(const TCHAR *, LPUNKNOWN, HRESULT*, IPin *);
+    DECLARE_IUNKNOWN
+    // no-op that always reports S_OK
+    HRESULT ForceRefresh() {
+        return S_OK;
+    };
+
+    // override to return an accurate current position; this base version fails with E_FAIL and writes neither out-parameter
+    virtual HRESULT GetMediaTime(LONGLONG *pStartTime,LONGLONG *pEndTime) {
+        return E_FAIL;
+    }
+
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid,void **ppv);
+
+    // IMediaSeeking methods
+    STDMETHODIMP GetCapabilities( DWORD * pCapabilities );
+    STDMETHODIMP CheckCapabilities( DWORD * pCapabilities );
+    STDMETHODIMP SetTimeFormat(const GUID * pFormat);
+    STDMETHODIMP GetTimeFormat(GUID *pFormat);
+    STDMETHODIMP IsUsingTimeFormat(const GUID * pFormat);
+    STDMETHODIMP IsFormatSupported( const GUID * pFormat);
+    STDMETHODIMP QueryPreferredFormat( GUID *pFormat);
+    STDMETHODIMP ConvertTimeFormat(LONGLONG * pTarget, const GUID * pTargetFormat,
+                                   LONGLONG    Source, const GUID * pSourceFormat );
+    STDMETHODIMP SetPositions( LONGLONG * pCurrent, DWORD CurrentFlags
+                             , LONGLONG * pStop, DWORD StopFlags );
+
+    STDMETHODIMP GetPositions( LONGLONG * pCurrent, LONGLONG * pStop );
+    STDMETHODIMP GetCurrentPosition( LONGLONG * pCurrent );
+    STDMETHODIMP GetStopPosition( LONGLONG * pStop );
+    STDMETHODIMP SetRate( double dRate);
+    STDMETHODIMP GetRate( double * pdRate);
+    STDMETHODIMP GetDuration( LONGLONG *pDuration);
+    STDMETHODIMP GetAvailable( LONGLONG *pEarliest, LONGLONG *pLatest );
+    STDMETHODIMP GetPreroll( LONGLONG *pllPreroll );
+
+    // IMediaPosition properties
+    STDMETHODIMP get_Duration(REFTIME * plength);
+    STDMETHODIMP put_CurrentPosition(REFTIME llTime);
+    STDMETHODIMP get_StopTime(REFTIME * pllTime);
+    STDMETHODIMP put_StopTime(REFTIME llTime);
+    STDMETHODIMP get_PrerollTime(REFTIME * pllTime);
+    STDMETHODIMP put_PrerollTime(REFTIME llTime);
+    STDMETHODIMP get_Rate(double * pdRate);
+    STDMETHODIMP put_Rate(double dRate);
+    STDMETHODIMP get_CurrentPosition(REFTIME * pllTime);
+    STDMETHODIMP CanSeekForward(LONG *pCanSeekForward);
+    STDMETHODIMP CanSeekBackward(LONG *pCanSeekBackward);
+
+private:
+    HRESULT GetSeekingLongLong( HRESULT (__stdcall IMediaSeeking::*pMethod)( LONGLONG * ),   // pointer to an IMediaSeeking method taking one LONGLONG*
+                                LONGLONG * pll );   // NOTE(review): presumably forwarded to pMethod on the peer -- confirm in the .cpp
+};
+
+
+// Adds the ability to return a current position
+
+class CRendererPosPassThru : public CPosPassThru
+{
+    CCritSec m_PositionLock;    // Locks access to our position
+    LONGLONG m_StartMedia;      // Start media time last seen
+    LONGLONG m_EndMedia;        // And likewise the end media
+    BOOL m_bReset;              // Have media times been set
+
+public:
+
+    // Used to help with passing media times through graph
+
+    CRendererPosPassThru(const TCHAR *, LPUNKNOWN, HRESULT*, IPin *);
+    HRESULT RegisterMediaTime(IMediaSample *pMediaSample);
+    HRESULT RegisterMediaTime(LONGLONG StartTime,LONGLONG EndTime);
+    HRESULT GetMediaTime(LONGLONG *pStartTime,LONGLONG *pEndTime);   // same signature as the virtual CPosPassThru::GetMediaTime, so it overrides it
+    HRESULT ResetMediaTime();
+    HRESULT EOS();
+};
+
+STDAPI CreatePosPassThru(     // NOTE(review): implementation not visible here; the parameter roles below are inferred from names only
+    LPUNKNOWN pAgg,           // presumably the aggregating outer unknown -- confirm
+    BOOL bRenderer,           // presumably selects CRendererPosPassThru over CPosPassThru -- confirm
+    IPin *pPin,               // presumably the input pin given to the created object -- confirm
+    IUnknown **ppPassThru     // presumably receives the created object -- confirm
+);
+
+// A class that handles the IDispatch part of IBasicAudio and leaves the
+// properties and methods themselves pure virtual.
+
+class AM_NOVTABLE CBasicAudio : public IBasicAudio, public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+public:
+
+    CBasicAudio(const TCHAR *, LPUNKNOWN);
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods -- the only IBasicAudio members this class declares */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+};
+
+
+// A class that handles the IDispatch part of IBasicVideo and leaves the
+// properties and methods themselves pure virtual.
+
+class AM_NOVTABLE CBaseBasicVideo : public IBasicVideo2, public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+public:
+
+    CBaseBasicVideo(const TCHAR *, LPUNKNOWN);
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+    // inline stub: always reports E_NOTIMPL and writes neither out-parameter
+    STDMETHODIMP GetPreferredAspectRatio(
+      long *plAspectX,
+      long *plAspectY)
+    {
+        return E_NOTIMPL;
+    }
+};
+
+
+// A class that handles the IDispatch part of IVideoWindow and leaves the
+// properties and methods themselves pure virtual.
+
+class AM_NOVTABLE CBaseVideoWindow : public IVideoWindow, public CUnknown
+{
+    CBaseDispatch m_basedisp;   // dispatch helper; how the methods below use it is not visible in this header
+
+public:
+
+    CBaseVideoWindow(const TCHAR *, LPUNKNOWN);
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    /* IDispatch methods -- the only IVideoWindow members this class declares */
+    STDMETHODIMP GetTypeInfoCount(UINT * pctinfo);
+
+    STDMETHODIMP GetTypeInfo(
+      UINT itinfo,
+      LCID lcid,
+      ITypeInfo ** pptinfo);
+
+    STDMETHODIMP GetIDsOfNames(
+      REFIID riid,
+      OLECHAR  ** rgszNames,
+      UINT cNames,
+      LCID lcid,
+      DISPID * rgdispid);
+
+    STDMETHODIMP Invoke(
+      DISPID dispidMember,
+      REFIID riid,
+      LCID lcid,
+      WORD wFlags,
+      DISPPARAMS * pdispparams,
+      VARIANT * pvarResult,
+      EXCEPINFO * pexcepinfo,
+      UINT * puArgErr);
+};
+
+
+// abstract class to help source filters with their implementation
+// of IMediaPosition. Derive from this and set the duration (and stop
+// position). Also override NotifyChange to do something when the properties
+// change.
+
+class AM_NOVTABLE CSourcePosition : public CMediaPosition
+{
+
+public:
+    CSourcePosition(const TCHAR *, LPUNKNOWN, HRESULT*, CCritSec *);
+
+    // IMediaPosition methods
+    STDMETHODIMP get_Duration(REFTIME * plength);
+    STDMETHODIMP put_CurrentPosition(REFTIME llTime);
+    STDMETHODIMP get_StopTime(REFTIME * pllTime);
+    STDMETHODIMP put_StopTime(REFTIME llTime);
+    STDMETHODIMP get_PrerollTime(REFTIME * pllTime);
+    STDMETHODIMP put_PrerollTime(REFTIME llTime);
+    STDMETHODIMP get_Rate(double * pdRate);
+    STDMETHODIMP put_Rate(double dRate);
+    STDMETHODIMP CanSeekForward(LONG *pCanSeekForward);
+    STDMETHODIMP CanSeekBackward(LONG *pCanSeekBackward);
+
+    // override if you can return the data you are actually working on; this default reports E_NOTIMPL and does not write pllTime
+    STDMETHODIMP get_CurrentPosition(REFTIME * pllTime) {
+        return E_NOTIMPL;
+    };
+
+protected:
+
+    // we call this to notify changes. Override to handle them (all three are pure virtual)
+    virtual HRESULT ChangeStart() PURE;
+    virtual HRESULT ChangeStop() PURE;
+    virtual HRESULT ChangeRate() PURE;
+
+    COARefTime m_Duration;
+    COARefTime m_Start;
+    COARefTime m_Stop;
+    double m_Rate;
+
+    CCritSec * m_pLock;   // NOTE(review): presumably the CCritSec passed to the constructor -- confirm in the .cpp
+};
+
+class AM_NOVTABLE CSourceSeeking :
+    public IMediaSeeking,
+    public CUnknown
+{
+
+public:
+
+    DECLARE_IUNKNOWN;
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    // IMediaSeeking methods
+
+    STDMETHODIMP IsFormatSupported(const GUID * pFormat);
+    STDMETHODIMP QueryPreferredFormat(GUID *pFormat);
+    STDMETHODIMP SetTimeFormat(const GUID * pFormat);
+    STDMETHODIMP IsUsingTimeFormat(const GUID * pFormat);
+    STDMETHODIMP GetTimeFormat(GUID *pFormat);
+    STDMETHODIMP GetDuration(LONGLONG *pDuration);
+    STDMETHODIMP GetStopPosition(LONGLONG *pStop);
+    STDMETHODIMP GetCurrentPosition(LONGLONG *pCurrent);
+    STDMETHODIMP GetCapabilities( DWORD * pCapabilities );
+    STDMETHODIMP CheckCapabilities( DWORD * pCapabilities );
+    STDMETHODIMP ConvertTimeFormat( LONGLONG * pTarget, const GUID * pTargetFormat,
+                                    LONGLONG    Source, const GUID * pSourceFormat );
+
+    STDMETHODIMP SetPositions( LONGLONG * pCurrent,  DWORD CurrentFlags
+			     , LONGLONG * pStop,  DWORD StopFlags );
+
+    STDMETHODIMP GetPositions( LONGLONG * pCurrent, LONGLONG * pStop );
+
+    STDMETHODIMP GetAvailable( LONGLONG * pEarliest, LONGLONG * pLatest );
+    STDMETHODIMP SetRate( double dRate);
+    STDMETHODIMP GetRate( double * pdRate);
+    STDMETHODIMP GetPreroll(LONGLONG *pPreroll);
+
+
+protected:
+
+    // ctor (protected, so only derived classes can construct this)
+    CSourceSeeking(const TCHAR *, LPUNKNOWN, HRESULT*, CCritSec *);
+
+    // we call this to notify changes. Override to handle them (all three are pure virtual)
+    virtual HRESULT ChangeStart() PURE;
+    virtual HRESULT ChangeStop() PURE;
+    virtual HRESULT ChangeRate() PURE;
+
+    CRefTime m_rtDuration;      // length of stream
+    CRefTime m_rtStart;         // source will start here
+    CRefTime m_rtStop;          // source will stop here
+    double m_dRateSeeking;
+
+    // seeking capabilities
+    DWORD m_dwSeekingCaps;
+
+    CCritSec * m_pLock;         // NOTE(review): presumably the CCritSec passed to the constructor -- confirm in the .cpp
+};
+
+
+// Base classes supporting Deferred commands.
+
+// Deferred commands are queued by calls to methods on the IQueueCommand
+// interface, exposed by the filtergraph and by some filters. A successful
+// call to one of these methods will return an IDeferredCommand interface
+// representing the queued command.
+//
+// A CDeferredCommand object represents a single deferred command, and exposes
+// the IDeferredCommand interface as well as other methods permitting time
+// checks and actual execution. It contains a reference to the CCmdQueue
+// object on which it is queued.
+//
+// CCmdQueue is a base class providing a queue of CDeferredCommand
+// objects, and methods to add, remove, check status and invoke the queued
+// commands. A CCmdQueue object would be part of an object that
+// implemented IQueueCommand.
+
+class CCmdQueue;
+
+// take a copy of the params and store them. Release any allocated
+// memory in destructor
+
+class CDispParams : public DISPPARAMS
+{
+public:
+    CDispParams(UINT nArgs, VARIANT* pArgs, HRESULT *phr = NULL);   // phr is optional (defaults to NULL)
+    ~CDispParams();
+};
+
+
+// CDeferredCommand lifetime is controlled by refcounts. Caller of
+// InvokeAt.. gets a refcounted interface pointer, and the CCmdQueue
+// object also holds a refcount on us. Calling Cancel or Invoke takes
+// us off the CCmdQueue and thus reduces the refcount by 1. Once taken
+// off the queue we cannot be put back on the queue.
+
+class CDeferredCommand
+    : public CUnknown,
+      public IDeferredCommand
+{
+public:
+
+    CDeferredCommand(
+        CCmdQueue * pQ,
+        LPUNKNOWN   pUnk,               // aggregation outer unk
+        HRESULT *   phr,
+        LPUNKNOWN   pUnkExecutor,       // object that will execute this cmd
+        REFTIME     time,               // a REFTIME here, whereas m_time below is a REFERENCE_TIME
+        GUID*       iid,
+        long        dispidMethod,
+        short       wFlags,
+        long        cArgs,
+        VARIANT*    pDispParams,
+        VARIANT*    pvarResult,
+        short*      puArgErr,
+        BOOL        bStream
+        );
+
+    DECLARE_IUNKNOWN
+
+    // override this to publicise our interfaces
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    // IDeferredCommand methods
+    STDMETHODIMP Cancel();
+    STDMETHODIMP Confidence(
+                    LONG* pConfidence);
+    STDMETHODIMP Postpone(
+                    REFTIME newtime);
+    STDMETHODIMP GetHResult(
+                    HRESULT* phrResult);
+
+    // other public methods
+
+    HRESULT Invoke();
+
+    // access methods (plain inline getters over the stored command data)
+
+    // returns TRUE if streamtime, FALSE if presentation time
+    BOOL IsStreamTime() {
+       return m_bStream;
+    };
+
+    CRefTime GetTime() {
+        return m_time;      // REFERENCE_TIME member converted to the CRefTime return type
+    };
+
+    REFIID GetIID() {
+        return *m_iid;      // dereferenced without a NULL check
+    };
+
+    long GetMethod() {
+        return m_dispidMethod;
+    };
+
+    short GetFlags() {
+        return m_wFlags;
+    };
+
+    DISPPARAMS* GetParams() {
+        return &m_DispParams;   // address of the embedded member, not a copy
+    };
+
+    VARIANT* GetResult() {
+        return m_pvarResult;
+    };
+
+protected:
+
+    CCmdQueue* m_pQueue;
+
+    // pUnk for the interface that we will execute the command on
+    LPUNKNOWN   m_pUnk;
+
+    // stored command data
+    REFERENCE_TIME     m_time;
+    GUID*       m_iid;
+    long        m_dispidMethod;
+    short       m_wFlags;
+    VARIANT*    m_pvarResult;
+    BOOL        m_bStream;
+    CDispParams m_DispParams;
+    DISPID      m_DispId;         //  For get and put
+
+    // we use this for ITypeInfo access
+    CBaseDispatch   m_Dispatch;
+
+    // save retval here
+    HRESULT     m_hrResult;
+};
+
+
+// a list of CDeferredCommand objects. this is a base class providing
+// the basics of access to the list. If you want to use CDeferredCommand
+// objects then your queue needs to be derived from this class.
+
+class AM_NOVTABLE CCmdQueue
+{
+public:
+    CCmdQueue();
+    virtual ~CCmdQueue();
+
+    // returns a new CDeferredCommand object that will be initialised with
+    // the parameters and will be added to the queue during construction.
+    // returns S_OK if successfully created otherwise an error and
+    // no object has been queued.
+    virtual HRESULT  New(
+        CDeferredCommand **ppCmd,
+        LPUNKNOWN   pUnk,
+        REFTIME     time,
+        GUID*       iid,
+        long        dispidMethod,
+        short       wFlags,
+        long        cArgs,
+        VARIANT*    pDispParams,
+        VARIANT*    pvarResult,
+        short*      puArgErr,
+        BOOL        bStream
+    );
+
+    // called by the CDeferredCommand object to add and remove itself
+    // from the queue
+    virtual HRESULT Insert(CDeferredCommand* pCmd);
+    virtual HRESULT Remove(CDeferredCommand* pCmd);
+
+    // Command-Due Checking
+    //
+    // There are two schemes of synchronisation: coarse and accurate. In
+    // coarse mode, you wait till the time arrives and then execute the cmd.
+    // In accurate mode, you wait until you are processing the sample that
+    // will appear at the time, and then execute the command. It's up to the
+    // filter which one it will implement. The filtergraph will always
+    // implement coarse mode for commands queued at the filtergraph.
+    //
+    // If you want coarse sync, you probably want to wait until there is a
+    // command due, and then execute it. You can do this by calling
+    // GetDueCommand. If you have several things to wait for, get the
+    // event handle from GetDueHandle() and when this is signalled then call
+    // GetDueCommand. Stream time will only advance between calls to Run and
+    // EndRun. Note that to avoid an extra thread there is no guarantee that
+    // if the handle is set there will be a command ready. Each time the
+    // event is signalled, call GetDueCommand (probably with a 0 timeout);
+    // This may return E_ABORT.
+    //
+    // If you want accurate sync, you must call GetCommandDueFor, passing
+    // as a parameter the stream time of the samples you are about to process.
+    // This will return:
+    //   -- a stream-time command due at or before that stream time
+    //   -- a presentation-time command due at or before the
+    //      time that stream time will be presented (only between Run
+    //      and EndRun calls, since outside of this, the mapping from
+    //      stream time to presentation time is not known).
+    //   -- any presentation-time command due now.
+    // This means that if you want accurate synchronisation on samples that
+    // might be processed during Paused mode, you need to use
+    // stream-time commands.
+    //
+    // In all cases, commands remain queued until Invoked or Cancelled. The
+    // setting and resetting of the event handle is managed entirely by this
+    // queue object.
+
+    // set the clock used for timing
+    virtual HRESULT SetSyncSource(IReferenceClock*);
+
+    // switch to run mode. Streamtime to Presentation time mapping known.
+    virtual HRESULT Run(REFERENCE_TIME tStreamTimeOffset);
+
+    // switch to Stopped or Paused mode. Time mapping not known.
+    virtual HRESULT EndRun();
+
+    // return a pointer to the next due command. Blocks for msTimeout
+    // milliseconds until there is a due command.
+    // Stream-time commands will only become due between Run and Endrun calls.
+    // The command remains queued until invoked or cancelled.
+    // Returns E_ABORT if timeout occurs, otherwise S_OK (or other error).
+    // Returns an AddRef-ed object
+    virtual HRESULT GetDueCommand(CDeferredCommand ** ppCmd, long msTimeout);
+
+    // return the event handle that will be signalled whenever
+    // there are deferred commands due for execution (when GetDueCommand
+    // will not block).
+    HANDLE GetDueHandle() {
+        return HANDLE(m_evDue);
+    };
+
+    // return a pointer to a command that will be due for a given time.
+    // Pass in a stream time here. The stream time offset will be passed
+    // in via the Run method.
+    // Commands remain queued until invoked or cancelled.
+    // This method will not block. It will report VFW_E_NOT_FOUND if there
+    // are no commands due yet.
+    // Returns an AddRef-ed object
+    virtual HRESULT GetCommandDueFor(REFERENCE_TIME tStream, CDeferredCommand**ppCmd);
+
+    // check if a given time is due (TRUE if it is already due)
+    BOOL CheckTime(CRefTime time, BOOL bStream) {
+
+        // if no clock, nothing is due!
+        if (!m_pClock) {
+            return FALSE;
+        }
+
+        // stream time
+        if (bStream) {
+
+            // not valid if not running
+            if (!m_bRunning) {
+                return FALSE;
+            }
+            // add on known stream time offset to get presentation time
+            time += m_StreamTimeOffset;     // time is a by-value copy, so the caller's value is unaffected
+        }
+
+        CRefTime Now;
+        m_pClock->GetTime((REFERENCE_TIME*)&Now);   // the returned HRESULT is not checked
+        return (time <= Now);
+    };
+
+protected:
+
+    // protect access to lists etc
+    CCritSec m_Lock;
+
+    // commands queued in presentation time are stored here
+    CGenericList<CDeferredCommand> m_listPresentation;
+
+    // commands queued in stream time are stored here
+    CGenericList<CDeferredCommand> m_listStream;
+
+    // set when any commands are due
+    CAMEvent m_evDue;
+
+    // creates an advise for the earliest time required, if any
+    void SetTimeAdvise(void);
+
+    // advise id from reference clock (0 if no outstanding advise)
+    DWORD_PTR m_dwAdvise;
+
+    // advise time is for this presentation time
+    CRefTime m_tCurrentAdvise;
+
+    // the reference clock we are using (addrefed)
+    IReferenceClock* m_pClock;
+
+    // true when running
+    BOOL m_bRunning;
+
+    // contains stream time offset when m_bRunning is true
+    CRefTime m_StreamTimeOffset;
+};
+
+#endif // __CTLUTIL__
diff --git a/plugins/GSdx_legacy/baseclasses/ddmm.cpp b/plugins/GSdx_legacy/baseclasses/ddmm.cpp
new file mode 100644
index 0000000000..3cf6fc1490
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/ddmm.cpp
@@ -0,0 +1,130 @@
+//------------------------------------------------------------------------------
+// File: DDMM.cpp
+//
+// Desc: DirectShow base classes - implements routines for using DirectDraw
+//       on a multimonitor system.
+//
+// Copyright (c)  Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+//#include <strsafe.h>
+#include <strmif.h>
+#include <mmsystem.h>
+#include "ddmm.h"
+
+/*
+ * FindDeviceCallback
+ */
+typedef struct {
+	LPSTR   szDevice;   // in:  device name to look for (the callbacks compare it with lstrcmpiA)
+	GUID*   lpGUID;     // out: NULL or a pointer to the GUID member below; only written on a match
+	GUID    GUID;       // out: copy of the matched device's GUID, when the enumerator supplied one
+	BOOL    fFound;     // out: set TRUE by the callbacks on a match; the callers clear it beforehand
+}   FindDeviceData;
+
+BOOL CALLBACK FindDeviceCallback(GUID* lpGUID, LPSTR szName, LPSTR szDevice, LPVOID lParam)
+{
+	FindDeviceData *pSearch = (FindDeviceData*)lParam;
+	// Not the device we are after: report TRUE and leave the record alone.
+	if (lstrcmpiA(pSearch->szDevice, szDevice) != 0) {
+		return TRUE;
+	}
+	// Match: store a copy of the GUID (a NULL GUID is recorded as NULL).
+	pSearch->lpGUID = NULL;
+	if (lpGUID) {
+		pSearch->GUID = *lpGUID;
+		pSearch->lpGUID = &pSearch->GUID;
+	}
+	pSearch->fFound = TRUE;
+	return FALSE;
+}
+
+
+BOOL CALLBACK FindDeviceCallbackEx(GUID* lpGUID, LPSTR szName, LPSTR szDevice, LPVOID lParam, HMONITOR hMonitor)
+{
+	// Same matching rule as FindDeviceCallback; szName and hMonitor are unused.
+	FindDeviceData *pSearch = (FindDeviceData*)lParam;
+	const BOOL fMatch = (lstrcmpiA(pSearch->szDevice, szDevice) == 0);
+
+	if (fMatch) {
+		// Store a copy of the GUID in the record and point lpGUID at that copy.
+		if (lpGUID != NULL) {
+			pSearch->GUID = *lpGUID;
+		}
+		pSearch->lpGUID = (lpGUID != NULL) ? &pSearch->GUID : NULL;
+		pSearch->fFound = TRUE;
+	}
+	return fMatch ? FALSE : TRUE;
+}
+
+
+/*
+ * DirectDrawCreateFromDevice
+ *
+ * create a DirectDraw object for the device named szDevice (NULL szDevice: create with a NULL GUID)
+ */
+IDirectDraw * DirectDrawCreateFromDevice(LPSTR szDevice, PDRAWCREATE DirectDrawCreateP, PDRAWENUM DirectDrawEnumerateP)
+{
+	IDirectDraw*    pDirectDraw = NULL;
+	FindDeviceData  search;
+
+	if (!szDevice) {
+		DirectDrawCreateP(NULL, &pDirectDraw, NULL);
+		return pDirectDraw;
+	}
+
+	// look the device up by name; hand back the still-NULL pointer if it is not listed
+	search.szDevice = szDevice;
+	search.fFound   = FALSE;
+	DirectDrawEnumerateP(FindDeviceCallback, (LPVOID)&search);
+	if (!search.fFound) {
+		return pDirectDraw;
+	}
+
+	//
+	// In 4bpp mode the following DDraw call causes a message box to be popped
+	// up by DDraw (!?!).  It's DDraw's fault, but we don't like it.  So we
+	// make sure it doesn't happen.
+	//
+	const UINT savedErrorMode = SetErrorMode(SEM_FAILCRITICALERRORS);
+	DirectDrawCreateP(search.lpGUID, &pDirectDraw, NULL);
+	SetErrorMode(savedErrorMode);
+	return pDirectDraw;
+}
+
+
+/*
+ * DirectDrawCreateFromDeviceEx
+ *
+ * as DirectDrawCreateFromDevice, but enumerates through DirectDrawEnumerateExP with DDENUM_ATTACHEDSECONDARYDEVICES
+ */
+IDirectDraw * DirectDrawCreateFromDeviceEx(LPSTR szDevice, PDRAWCREATE DirectDrawCreateP, LPDIRECTDRAWENUMERATEEXA DirectDrawEnumerateExP)
+{
+	IDirectDraw*    pDirectDraw = NULL;
+	FindDeviceData  search;
+
+	if (!szDevice) {
+		DirectDrawCreateP(NULL, &pDirectDraw, NULL);
+		return pDirectDraw;
+	}
+
+	// look the device up by name; hand back the still-NULL pointer if it is not listed
+	search.szDevice = szDevice;
+	search.fFound   = FALSE;
+	DirectDrawEnumerateExP(FindDeviceCallbackEx, (LPVOID)&search,
+					DDENUM_ATTACHEDSECONDARYDEVICES);
+	if (!search.fFound) {
+		return pDirectDraw;
+	}
+
+	//
+	// In 4bpp mode the following DDraw call causes a message box to be popped
+	// up by DDraw (!?!).  It's DDraw's fault, but we don't like it.  So we
+	// make sure it doesn't happen.
+	//
+	const UINT savedErrorMode = SetErrorMode(SEM_FAILCRITICALERRORS);
+	DirectDrawCreateP(search.lpGUID, &pDirectDraw, NULL);
+	SetErrorMode(savedErrorMode);
+	return pDirectDraw;
+}
diff --git a/plugins/GSdx_legacy/baseclasses/ddmm.h b/plugins/GSdx_legacy/baseclasses/ddmm.h
new file mode 100644
index 0000000000..4d60e3b10b
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/ddmm.h
@@ -0,0 +1,28 @@
+//------------------------------------------------------------------------------
+// File: DDMM.h
+//
+// Desc: DirectShow base classes - defines routines for using DirectDraw
+//       on a multimonitor system.
+//
+// Copyright (c)  Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifdef __cplusplus
+extern "C" {            /* Assume C declarations for C++ */
+#endif  /* __cplusplus */
+// NOTE(review): there is no include guard anywhere in this header.
+// DDRAW.H might not include these
+#ifndef DDENUM_ATTACHEDSECONDARYDEVICES
+#define DDENUM_ATTACHEDSECONDARYDEVICES     0x00000001L
+#endif
+// Pointer types for the create / enumerate entry points that callers pass in.
+typedef HRESULT (*PDRAWCREATE)(IID *,LPDIRECTDRAW *,LPUNKNOWN);   // NOTE(review): no explicit calling convention -- confirm it matches the real entry point under this build's default
+typedef HRESULT (*PDRAWENUM)(LPDDENUMCALLBACKA, LPVOID);
+// Both take a device name (NULL is accepted) plus the entry points to call through.
+IDirectDraw * DirectDrawCreateFromDevice(LPSTR, PDRAWCREATE, PDRAWENUM);
+IDirectDraw * DirectDrawCreateFromDeviceEx(LPSTR, PDRAWCREATE, LPDIRECTDRAWENUMERATEEXA);
+
+#ifdef __cplusplus
+}
+#endif	/* __cplusplus */
diff --git a/plugins/GSdx_legacy/baseclasses/dsschedule.h b/plugins/GSdx_legacy/baseclasses/dsschedule.h
new file mode 100644
index 0000000000..a81c5760a4
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/dsschedule.h
@@ -0,0 +1,128 @@
+//------------------------------------------------------------------------------
+// File: DSSchedule.h (replaces DirectX 8's schedule.h)
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __CAMSchedule__
+#define __CAMSchedule__
+// CAMSchedule keeps a time-ordered list of advise packets (see the member comments below).
+class CAMSchedule : private CBaseObject
+{
+public:
+    virtual ~CAMSchedule();
+    // ev is the event we should fire if the advise time needs re-evaluating
+    CAMSchedule( HANDLE ev );
+
+    DWORD GetAdviseCount();
+    REFERENCE_TIME GetNextAdviseTime();
+
+    // We need a method for derived classes to add advise packets, we return the cookie
+    DWORD_PTR AddAdvisePacket( const REFERENCE_TIME & time1, const REFERENCE_TIME & time2, HANDLE h, BOOL periodic );
+    // And a way to cancel
+    HRESULT Unadvise(DWORD_PTR dwAdviseCookie);
+
+    // Tell us the time please, and we'll dispatch the expired events.  We return the time of the next event.
+    // NB: The time returned will be "useless" if you start adding extra Advises.  But that's the problem of
+    // whoever is using this helper class (typically a clock).
+    REFERENCE_TIME Advise( const REFERENCE_TIME & rtTime );
+
+    // Get the event handle which will be set if advise time requires re-evaluation.
+    HANDLE GetEvent() const { return m_ev; }
+
+private:
+    // We define the nodes that will be used in our singly linked list
+    // of advise packets.  The list is ordered by time, with the
+    // elements that will expire first at the front.
+    class CAdvisePacket
+    {
+    public:
+        CAdvisePacket()
+        {}                                  // leaves every member uninitialised
+
+        CAdvisePacket * m_next;
+        DWORD_PTR       m_dwAdviseCookie;
+        REFERENCE_TIME  m_rtEventTime;      // Time at which event should be set
+        REFERENCE_TIME  m_rtPeriod;         // Periodic time
+        HANDLE          m_hNotify;          // Handle to event or semaphore
+        BOOL            m_bPeriodic;        // TRUE => Periodic event
+
+        CAdvisePacket( CAdvisePacket * next, LONGLONG time ) : m_next(next), m_rtEventTime(time)
+        {}                                  // only the link and the event time are set
+        // Link p into the list directly after this node
+        void InsertAfter( CAdvisePacket * p )
+        {
+            p->m_next = m_next;
+            m_next    = p;
+        }
+
+        int IsZ() const // That is, is it the node that represents the end of the list
+        { return m_next == 0; }
+        // Unlink the node after this one and return it (m_next must not be null)
+        CAdvisePacket * RemoveNext()
+        {
+            CAdvisePacket *const next = m_next;
+            CAdvisePacket *const new_next = next->m_next;
+            m_next = new_next;
+            return next;
+        }
+        // Unlink the node after this one and delete it
+        void DeleteNext()
+        {
+            delete RemoveNext();
+        }
+        // The following node, or 0 when that node is the end-of-list marker
+        CAdvisePacket * Next() const
+        {
+            CAdvisePacket * result = m_next;
+            if (result->IsZ()) result = 0;
+            return result;
+        }
+
+        DWORD_PTR Cookie() const
+        { return m_dwAdviseCookie; }
+    };
+
+    // Structure is:
+    // head -> elmt1 -> elmt2 -> z -> null
+    // So an empty list is:       head -> z -> null
+    // Having head & z as links makes insertion,
+    // deletion and shunting much easier.
+    CAdvisePacket   head, z;            // z is both a tail and a sentry
+
+    volatile DWORD_PTR  m_dwNextCookie;     // Strictly increasing
+    volatile DWORD  m_dwAdviseCount;    // Number of elements on list
+
+    CCritSec        m_Serialize;
+
+    // AddAdvisePacket: adds the packet, returns the cookie (0 if failed)
+    DWORD_PTR AddAdvisePacket( CAdvisePacket * pPacket );
+    // Event that we should set if the packet added above will be the next to fire.
+    const HANDLE m_ev;
+
+    // A Shunt is where we have changed the first element in the
+    // list and want it re-evaluating (i.e. repositioned) in
+    // the list.
+    void ShuntHead();
+
+    // Rather than delete advise packets, we cache them for future use
+    CAdvisePacket * m_pAdviseCache;
+    DWORD           m_dwCacheCount;
+    enum { dwCacheMax = 5 };             // Don't bother caching more than five
+
+    void Delete( CAdvisePacket * pLink );// This "Delete" will cache the Link
+
+// Attributes and methods for debugging
+public:
+#ifdef DEBUG
+    void DumpLinkedList();
+#else
+    void DumpLinkedList() {}
+#endif
+
+};
+
+#endif // __CAMSchedule__
diff --git a/plugins/GSdx_legacy/baseclasses/fourcc.h b/plugins/GSdx_legacy/baseclasses/fourcc.h
new file mode 100644
index 0000000000..ae20ab3425
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/fourcc.h
@@ -0,0 +1,101 @@
+//------------------------------------------------------------------------------
+// File: FourCC.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// FOURCCMap
+//
+// provides a mapping between old-style multimedia format DWORDs
+// and new-style GUIDs.
+//
+// A range of 4 billion GUIDs has been allocated to ensure that this
+// mapping can be done straightforwardly one-to-one in both directions.
+//
+// January 95
+
+
+#ifndef __FOURCC__
+#define __FOURCC__
+
+
+// Multimedia format types are marked with DWORDs built from four 8-bit
+// chars and known as FOURCCs. New multimedia AM_MEDIA_TYPE definitions include
+// a subtype GUID. In order to simplify the mapping, GUIDs in the range:
+//    XXXXXXXX-0000-0010-8000-00AA00389B71
+// are reserved for FOURCCs.
+
+// GUID whose Data1 carries a FOURCC; the remaining fields are filled by InitGUID.
+class FOURCCMap : public GUID
+{
+public:
+    FOURCCMap();                        // FOURCC of zero
+    FOURCCMap(DWORD Fourcc);            // wrap the given FOURCC
+    FOURCCMap(const GUID * pGuid);      // take the FOURCC from pGuid's Data1
+
+    // Accessors for the FOURCC held in Data1.
+    DWORD GetFOURCC();
+    void SetFOURCC(DWORD fourcc);
+    void SetFOURCC(const GUID * pGuid);
+
+private:
+    void InitGUID();                    // fill Data2, Data3 and Data4 from the GUID_Data* constants
+};
+
+#define GUID_Data2      0            // value InitGUID stores in Data2
+#define GUID_Data3     0x10          // value InitGUID stores in Data3
+#define GUID_Data4_1   0xaa000080    // value InitGUID stores in the first DWORD of Data4
+#define GUID_Data4_2   0x719b3800    // value InitGUID stores in the second DWORD of Data4
+
+// Stamp the fixed GUID fields (everything except Data1).
+inline void FOURCCMap::InitGUID() {
+    Data2 = GUID_Data2;
+    Data3 = GUID_Data3;
+    reinterpret_cast<DWORD *>(Data4)[0] = GUID_Data4_1;
+    reinterpret_cast<DWORD *>(Data4)[1] = GUID_Data4_2;
+}
+
+// Default: fixed GUID fields plus a FOURCC of zero (the cast picks the DWORD overload).
+inline FOURCCMap::FOURCCMap() {
+    InitGUID();
+    SetFOURCC(static_cast<DWORD>(0));
+}
+
+// Build the GUID that carries the given FOURCC in Data1.
+inline FOURCCMap::FOURCCMap(DWORD fourcc)
+{
+    InitGUID();
+    Data1 = fourcc;     // the same store SetFOURCC(DWORD) performs
+}
+
+// Build from another GUID: only its Data1 is taken, the other fields come from InitGUID.
+inline FOURCCMap::FOURCCMap(const GUID * pGuid)
+{
+    InitGUID();
+    Data1 = pGuid->Data1;
+}
+
+// Adopt the FOURCC (Data1) of another GUID; none of its other fields are read.
+inline void FOURCCMap::SetFOURCC(const GUID * pGuid)
+{
+    const DWORD fourcc = pGuid->Data1;  // what GetFOURCC() would return for it
+    SetFOURCC(fourcc);
+}
+
+// Store the FOURCC in Data1.
+inline void FOURCCMap::SetFOURCC(DWORD fourcc)
+{
+    this->Data1 = fourcc;
+}
+
+// Read the FOURCC back out of Data1.
+inline DWORD FOURCCMap::GetFOURCC(void)
+{
+    return this->Data1;
+}
+
+#endif /* __FOURCC__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/measure.h b/plugins/GSdx_legacy/baseclasses/measure.h
new file mode 100644
index 0000000000..f90eb15897
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/measure.h
@@ -0,0 +1,222 @@
+//------------------------------------------------------------------------------
+// File: Measure.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+/*
+   The idea is to pepper the source code with interesting measurements and
+   have the last few thousand of these recorded in a circular buffer that
+   can be post-processed to give interesting numbers.
+
+   WHAT THE LOG LOOKS LIKE:
+
+  Time (sec)   Type        Delta  Incident_Name
+    0.055,41  NOTE      -.       Incident Nine  - Another note
+    0.055,42  NOTE      0.000,01 Incident Nine  - Another note
+    0.055,44  NOTE      0.000,02 Incident Nine  - Another note
+    0.055,45  STOP      -.       Incident Eight - Also random
+    0.055,47  START     -.       Incident Seven - Random
+    0.055,49  NOTE      0.000,05 Incident Nine  - Another note
+    ------- <etc.  there is a lot of this> ----------------
+    0.125,60  STOP      0.000,03 Msr_Stop
+    0.125,62  START     -.       Msr_Start
+    0.125,63  START     -.       Incident Two   - Start/Stop
+    0.125,65  STOP      0.000,03 Msr_Start
+    0.125,66  START     -.       Msr_Stop
+    0.125,68  STOP      0.000,05 Incident Two   - Start/Stop
+    0.125,70  STOP      0.000,04 Msr_Stop
+    0.125,72  START     -.       Msr_Start
+    0.125,73  START     -.       Incident Two   - Start/Stop
+    0.125,75  STOP      0.000,03 Msr_Start
+    0.125,77  START     -.       Msr_Stop
+    0.125,78  STOP      0.000,05 Incident Two   - Start/Stop
+    0.125,80  STOP      0.000,03 Msr_Stop
+    0.125,81  NOTE      -.       Incident Three - single Note
+    0.125,83  START     -.       Incident Four  - Start, no stop
+    0.125,85  START     -.       Incident Five  - Single Start/Stop
+    0.125,87  STOP      0.000,02 Incident Five  - Single Start/Stop
+
+Number      Average       StdDev     Smallest      Largest Incident_Name
+    10     0.000,58     0.000,10     0.000,55     0.000,85 Incident One   - Note
+    50     0.000,05     0.000,00     0.000,05     0.000,05 Incident Two   - Start/Stop
+     1     -.           -.           -.           -.       Incident Three - single Note
+     0     -.           -.           -.           -.       Incident Four  - Start, no stop
+     1     0.000,02     -.           0.000,02     0.000,02 Incident Five  - Single Start/Stop
+     0     -.           -.           -.           -.       Incident Six   - zero occurrences
+   100     0.000,25     0.000,12     0.000,02     0.000,62 Incident Seven - Random
+   100     0.000,79     0.000,48     0.000,02     0.001,92 Incident Eight - Also random
+  5895     0.000,01     0.000,01     0.000,01     0.000,56 Incident Nine  - Another note
+    10     0.000,03     0.000,00     0.000,03     0.000,04 Msr_Note
+    50     0.000,03     0.000,00     0.000,03     0.000,04 Msr_Start
+    50     0.000,04     0.000,03     0.000,03     0.000,31 Msr_Stop
+
+  WHAT IT MEANS:
+    The log shows what happened and when.  Each line shows the time at which
+    something happened (see WHAT YOU CODE below) what it was that happened
+    and (if appropriate) the time since the corresponding previous event
+    (that's the delta column).
+
+    The statistics show how many times each event occurred, what the average
+    delta time was, also the standard deviation, largest and smallest delta.
+
+   WHAT YOU CODE:
+
+   Before anything else executes: - register your ids
+
+    int id1     = Msr_Register("Incident One   - Note");
+    int id2     = Msr_Register("Incident Two   - Start/Stop");
+    int id3     = Msr_Register("Incident Three - single Note");
+    etc.
+
+   At interesting moments:
+
+       // To measure a repetitive event - e.g. end of bitblt to screen
+       Msr_Note(Id9);             // e.g. "video frame hitting the screen NOW!"
+
+           or
+
+       // To measure an elapsed time e.g. time taken to decode an MPEG B-frame
+       Msr_Start(Id2);            // e.g. "Starting to decode MPEG B-frame"
+         . . .
+       Msr_Stop(Id2);             //      "Finished MPEG decode"
+
+   At the end:
+
+       HANDLE hFile;
+       hFile = CreateFile("Perf.log", GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, 0, NULL);
+       Msr_Dump(hFile);           // This writes the log out to the file
+       CloseHandle(hFile);
+
+           or
+
+       Msr_Dump(NULL);            // This writes it to DbgLog((LOG_TRACE,0, ... ));
+                                  // but if you are writing it out to the debugger
+                                  // then the times are probably all garbage because
+                                  // the debugger can make things run awfully slow.
+
+    A given id should be used either for start / stop or Note calls.  If Notes
+    are mixed in with Starts and Stops their statistics will be gibberish.
+
+    If you code the calls in upper case i.e. MSR_START(idMunge); then you get
+    macros which will turn into nothing unless PERF is defined.
+
+    You can reset the statistical counts for a given id by calling Msr_Reset(Id).
+    They are reset by default at the start.
+    It logs Reset as a special incident, so you can see it in the log.
+
+    The log is a circular buffer in storage (to try to minimise disk I/O).
+    It overwrites the oldest entries once full.  The statistics include ALL
+    incidents since the last Reset, whether still visible in the log or not.
+*/
+
+#ifndef __MEASURE__
+#define __MEASURE__
+// Upper-case wrappers: call the Msr_* functions when PERF is defined, otherwise expand to no-ops (MSR_REGISTER yields 0).
+#ifdef PERF
+#define MSR_INIT() Msr_Init()
+#define MSR_TERMINATE() Msr_Terminate()
+#define MSR_REGISTER(a) Msr_Register(a)
+#define MSR_RESET(a) Msr_Reset(a)
+#define MSR_CONTROL(a) Msr_Control(a)
+#define MSR_START(a) Msr_Start(a)
+#define MSR_STOP(a) Msr_Stop(a)
+#define MSR_NOTE(a) Msr_Note(a)
+#define MSR_INTEGER(a,b) Msr_Integer(a,b)
+#define MSR_DUMP(a) Msr_Dump(a)
+#define MSR_DUMPSTATS(a) Msr_DumpStats(a)
+#else
+#define MSR_INIT() ((void)0)
+#define MSR_TERMINATE() ((void)0)
+#define MSR_REGISTER(a) 0
+#define MSR_RESET(a) ((void)0)
+#define MSR_CONTROL(a) ((void)0)
+#define MSR_START(a) ((void)0)
+#define MSR_STOP(a) ((void)0)
+#define MSR_NOTE(a) ((void)0)
+#define MSR_INTEGER(a,b) ((void)0)
+#define MSR_DUMP(a) ((void)0)
+#define MSR_DUMPSTATS(a) ((void)0)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This must be called first - (called by the DllEntry)
+
+void WINAPI Msr_Init(void);
+
+
+// Call this last to clean up (or just let it fall off the end - who cares?)
+
+void WINAPI Msr_Terminate(void);
+
+
+// Call this to get an Id for an "incident" that you can pass to Start, Stop or Note
+// everything that's logged is called an "incident".
+
+int  WINAPI Msr_Register(LPTSTR Incident);
+
+
+// Reset the statistical counts for an incident
+
+void WINAPI Msr_Reset(int Id);
+
+
+// Reset all the counts for all incidents
+#define MSR_RESET_ALL 0
+#define MSR_PAUSE 1
+#define MSR_RUN 2
+
+void WINAPI Msr_Control(int iAction);
+
+
+// log the start of an operation
+
+void WINAPI Msr_Start(int Id);
+
+
+// log the end of an operation
+
+void WINAPI Msr_Stop(int Id);
+
+
+// log a one-off or repetitive operation
+
+void WINAPI Msr_Note(int Id);
+
+
+// log an integer (on which we can see statistics later)
+void WINAPI Msr_Integer(int Id, int n);
+
+
+// print out all the available log (it may have wrapped) and then the statistics.
+// When the log wraps you lose log but the statistics are still complete.
+// hFile==NULL => use DbgLog
+// otherwise hFile must have come from CreateFile or OpenFile.
+
+void WINAPI Msr_Dump(HANDLE hFile);
+
+
+// just dump the statistics - never mind the log
+
+void WINAPI Msr_DumpStats(HANDLE hFile);
+
+// Type definitions in case you want to declare a pointer to the dump functions
+// (makes it a trifle easier to do dynamic linking
+// i.e. LoadModule, GetProcAddress and call that)
+
+// Typedefs so can declare MSR_DUMPPROC *MsrDumpStats; or whatever
+typedef void WINAPI MSR_DUMPPROC(HANDLE hFile);
+typedef void WINAPI MSR_CONTROLPROC(int iAction);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __MEASURE__
diff --git a/plugins/GSdx_legacy/baseclasses/msgthrd.h b/plugins/GSdx_legacy/baseclasses/msgthrd.h
new file mode 100644
index 0000000000..88a1cc3959
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/msgthrd.h
@@ -0,0 +1,120 @@
+//------------------------------------------------------------------------------
+// File: MsgThrd.h
+//
+// Desc: DirectShow base classes - provides support for a worker thread
+//       class to which one can asynchronously post messages.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// Message class - really just a structure.
+//
+class CMsg {
+public:
+    UINT      uMsg;      // message identifier
+    DWORD     dwFlags;   // flags supplied with the message
+    LPVOID    lpParam;   // opaque parameter supplied with the message
+    CAMEvent *pEvent;    // event supplied with the message (may be NULL)
+
+    // Empty message: every field zero / NULL.
+    CMsg() : uMsg(0), dwFlags(0L), lpParam(NULL), pEvent(NULL) {}
+    // Message carrying the given values verbatim.
+    CMsg(UINT msg, DWORD flags, LPVOID param, CAMEvent *event)
+        : uMsg(msg), dwFlags(flags), lpParam(param), pEvent(event) {}
+};
+
+// This is the actual thread class.  It exports all the usual thread control
+// functions.  The created thread is different from a normal WIN32 thread in
+// that it is prompted to perform particular tasks by responding to messages
+// posted to its message queue.
+//
+class AM_NOVTABLE CMsgThread {
+private:
+    static DWORD WINAPI DefaultThreadProc(LPVOID lpParam);
+    DWORD               m_ThreadId;
+    HANDLE              m_hThread;
+
+protected:
+
+    // if you want to override GetThreadMsg to block on other things
+    // as well as this queue, you need access to this
+    CGenericList<CMsg>        m_ThreadQueue;
+    CCritSec                  m_Lock;
+    HANDLE                    m_hSem;
+    LONG                      m_lWaiting;
+
+public:
+    CMsgThread()
+        // initializers are written in member declaration order, which is
+        // the order they always run in; the list has a cache of 5 items
+        : m_ThreadId(0),
+        m_hThread(NULL),
+        m_ThreadQueue(NAME("MsgThread list"), 5),
+        m_hSem(NULL),
+        m_lWaiting(0)
+        {}
+
+    ~CMsgThread();  // NOTE(review): non-virtual; confirm nothing deletes through a CMsgThread *
+    // override this if you want to block on other things as well
+    // as the message loop
+    virtual void GetThreadMsg(CMsg *msg);
+
+    // override this if you want to do something on thread startup
+    virtual void OnThreadInit() {
+    }
+
+    BOOL CreateThread();
+    // waits (no timeout) on the thread handle, then reads its exit code; FALSE when no handle is held
+    BOOL WaitForThreadExit(LPDWORD lpdwExitCode) {
+        if (m_hThread != NULL) {
+            WaitForSingleObject(m_hThread, INFINITE);
+            return GetExitCodeThread(m_hThread, lpdwExitCode);
+        }
+        return FALSE;
+    }
+    // the wrappers below apply the Win32 call of the same name to m_hThread
+    DWORD ResumeThread() {
+        return ::ResumeThread(m_hThread);
+    }
+
+    DWORD SuspendThread() {
+        return ::SuspendThread(m_hThread);
+    }
+
+    int GetThreadPriority() {
+        return ::GetThreadPriority(m_hThread);
+    }
+
+    BOOL SetThreadPriority(int nPriority) {
+        return ::SetThreadPriority(m_hThread, nPriority);
+    }
+
+    HANDLE GetThreadHandle() {
+        return m_hThread;
+    }
+
+    DWORD GetThreadId() {
+        return m_ThreadId;
+    }
+
+    // Under m_Lock: append a new CMsg to the queue and, if m_lWaiting is non-zero, release m_hSem that many times.
+    void PutThreadMsg(UINT uMsg, DWORD dwMsgFlags,
+                      LPVOID lpMsgParam, CAMEvent *pEvent = NULL) {
+        CAutoLock lck(&m_Lock);
+        CMsg* pMsg = new CMsg(uMsg, dwMsgFlags, lpMsgParam, pEvent);
+        m_ThreadQueue.AddTail(pMsg);
+        if (m_lWaiting != 0) {
+            ReleaseSemaphore(m_hSem, m_lWaiting, 0);
+            m_lWaiting = 0;
+        }
+    }
+
+    // This is the function prototype of the function that the client
+    // supplies.  It is always called on the created thread, never on
+    // the creator thread.
+    //
+    virtual LRESULT ThreadMessageProc(
+        UINT uMsg, DWORD dwFlags, LPVOID lpParam, CAMEvent *pEvent) = 0;
+};
+
diff --git a/plugins/GSdx_legacy/baseclasses/mtype.cpp b/plugins/GSdx_legacy/baseclasses/mtype.cpp
new file mode 100644
index 0000000000..0d86343e26
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/mtype.cpp
@@ -0,0 +1,477 @@
+//------------------------------------------------------------------------------
+// File: MType.cpp
+//
+// Desc: DirectShow base classes - implements a class that holds and
+//       manages media type information.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// helper class that derived pin objects can use to compare media
+// types etc. Has same data members as the struct AM_MEDIA_TYPE defined
+// in the streams IDL file, but also has (non-virtual) functions
+
+#include "streams.h"
+#include <mmreg.h>
+// Destructor: FreeMediaType releases the format block and pUnk, if held.
+CMediaType::~CMediaType() {
+    FreeMediaType(static_cast<AM_MEDIA_TYPE &>(*this));
+}
+
+// Default constructor: begin from the cleared state produced by InitMediaType.
+CMediaType::CMediaType()
+{
+    this->InitMediaType();
+}
+
+// Construct a cleared media type whose major type is *type.
+CMediaType::CMediaType(const GUID * type)
+{
+    this->InitMediaType();
+    this->majortype = *type;
+}
+
+
+// copy constructor does a deep copy of the format block; a failure is
+// reported through phr only when the caller supplied one
+CMediaType::CMediaType(const AM_MEDIA_TYPE& rt, HRESULT* phr)
+{
+    const HRESULT hrCopy = CopyMediaType(this, &rt);
+    if ((phr != NULL) && FAILED(hrCopy)) {
+        *phr = hrCopy;
+    }
+}
+
+// deep-copies another CMediaType; a failure is reported through phr when one is supplied
+CMediaType::CMediaType(const CMediaType& rt, HRESULT* phr)
+{
+    const HRESULT hrCopy = CopyMediaType(this, &rt);
+    if ((phr != NULL) && FAILED(hrCopy)) {
+        *phr = hrCopy;
+    }
+}
+
+
+// this class inherits publicly from AM_MEDIA_TYPE so the compiler could generate
+// the following assignment operator itself, however it could introduce some
+// memory conflicts and leaks in the process because the structure contains
+// a dynamically allocated block (pbFormat) which it will not copy correctly
+
+// Assign via Set() (deep copy); Set's HRESULT is not surfaced to the caller.
+CMediaType& CMediaType::operator=(const AM_MEDIA_TYPE& rt)
+{
+    (void)Set(rt);
+    return *this;
+}
+
+// Same-class assignment: forwards to the AM_MEDIA_TYPE overload.
+CMediaType& CMediaType::operator=(const CMediaType& rt)
+{
+    const AM_MEDIA_TYPE& rtBase = rt;
+    return (*this = rtBase);
+}
+
+BOOL CMediaType::operator == (const CMediaType& rt) const
+{
+    // Sample size and the temporal compression flag are deliberately not
+    // compared: they are expected to duplicate information already present
+    // in the type, subtype and format block, and exist as separate flags
+    // only for code that does not understand the format representation.
+    if (IsEqualGUID(majortype, rt.majortype) != TRUE) return FALSE;
+    if (IsEqualGUID(subtype, rt.subtype) != TRUE) return FALSE;
+    if (IsEqualGUID(formattype, rt.formattype) != TRUE) return FALSE;
+    if (cbFormat != rt.cbFormat) return FALSE;
+
+    // equal (possibly zero) lengths: an empty block has nothing to compare
+    if (cbFormat == 0) {
+        return TRUE;
+    }
+    return (memcmp(pbFormat, rt.pbFormat, cbFormat) == 0) ? TRUE : FALSE;
+}
+
+
+// Exact negation of operator==.
+BOOL CMediaType::operator != (const CMediaType& rt) const
+{
+    const BOOL bSame = (*this == rt);
+    if (bSame) {
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+
+// Forwards to Set(const AM_MEDIA_TYPE&) through the base-class view of rt.
+HRESULT CMediaType::Set(const CMediaType& rt)
+{
+    return Set(static_cast<const AM_MEDIA_TYPE &>(rt));
+}
+
+
+// Replace the contents with a deep copy of rt; assigning an object to itself
+// is a no-op.  Any copy failure is reported as E_OUTOFMEMORY.
+HRESULT CMediaType::Set(const AM_MEDIA_TYPE& rt)
+{
+    if (&rt == this) {
+        return S_OK;
+    }
+
+    // release what is currently held before overwriting it
+    FreeMediaType(*this);
+    const HRESULT hrCopy = CopyMediaType(this, &rt);
+    return FAILED(hrCopy) ? E_OUTOFMEMORY : S_OK;
+}
+
+
+// A media type counts as valid once its major type is not GUID_NULL.
+BOOL CMediaType::IsValid() const
+{
+    return IsEqualGUID(majortype, GUID_NULL) ? FALSE : TRUE;
+}
+
+
+// Copy *ptype into the major type field.
+void CMediaType::SetType(const GUID* ptype)
+{
+    this->majortype = *ptype;
+}
+
+
+// Copy *ptype into the subtype field.
+void CMediaType::SetSubtype(const GUID* ptype)
+{
+    this->subtype = *ptype;
+}
+
+
+// Fixed sample size, or 0 when samples are not fixed-size.
+ULONG CMediaType::GetSampleSize() const
+{
+    if (!IsFixedSize()) {
+        return 0;
+    }
+    return lSampleSize;
+}
+
+// 0 selects variable-sized samples; any other value becomes the fixed size.
+void CMediaType::SetSampleSize(ULONG sz)
+{
+    if (sz != 0) {
+        lSampleSize = sz;
+        bFixedSizeSamples = TRUE;
+    } else {
+        SetVariableSize();
+    }
+}
+
+
+// Mark samples as variable-sized; lSampleSize itself is left as it was.
+void CMediaType::SetVariableSize() {
+    this->bFixedSizeSamples = FALSE;
+}
+
+
+// Store bCompressed in bTemporalCompression.
+void CMediaType::SetTemporalCompression(BOOL bCompressed) {
+    this->bTemporalCompression = bCompressed;
+}
+
+// Get a buffer for cb bytes from AllocFormatBuffer and copy cb bytes of pformat into it; FALSE if none.
+BOOL CMediaType::SetFormat(BYTE * pformat, ULONG cb)
+{
+    if (AllocFormatBuffer(cb) == NULL) {
+        return FALSE;
+    }
+    ASSERT(pbFormat);
+    memcpy(pbFormat, pformat, cb);
+    return TRUE;
+}
+
+
+// set the type of the media type format block, this type defines what you
+// will actually find in the format pointer. For example FORMAT_VideoInfo or
+// FORMAT_WaveFormatEx. In the future this may be an interface pointer to a
+// property set. Before sending out media types this should be filled in.
+
+void CMediaType::SetFormatType(const GUID *pformattype)
+{
+    // plain GUID copy; the format block itself is not touched
+    this->formattype = *pformattype;
+}
+
+
+// reset the format buffer
+// (the block is freed only while a non-zero length is recorded)
+void CMediaType::ResetFormatBuffer()
+{
+    if (cbFormat != 0) {
+        CoTaskMemFree(static_cast<PVOID>(pbFormat));
+    }
+    pbFormat = NULL;
+    cbFormat = 0;
+}
+
+
+// allocate length bytes for the format and return a read/write pointer
+// If we cannot allocate the new block of memory we return NULL leaving
+// the original block of memory untouched (as does ReallocFormatBuffer)
+
+BYTE* CMediaType::AllocFormatBuffer(ULONG length)
+{
+    ASSERT(length);
+
+    // an existing block of exactly this size is handed back unchanged
+    if (length == cbFormat) {
+        return pbFormat;
+    }
+
+    BYTE * const pFresh = static_cast<BYTE *>(CoTaskMemAlloc(length));
+
+    if (pFresh != NULL) {
+        // swap in the new block; the old contents are discarded, not copied
+        if (cbFormat != 0) {
+            ASSERT(pbFormat);
+            CoTaskMemFree(static_cast<PVOID>(pbFormat));
+        }
+        cbFormat = length;
+        pbFormat = pFresh;
+        return pbFormat;
+    }
+
+    // Allocation failed.  A request no larger than the current block still
+    // fits in it, so that block is handed back; otherwise report failure.
+    // NOTE(review): cbFormat keeps its old (larger) value on this path.
+    if (length <= cbFormat) {
+        return pbFormat;
+    }
+    return NULL;
+}
+
+
+// reallocate length bytes for the format and return a read/write pointer
+// to it. We keep as much information as we can given the new buffer size
+// if this fails the original format buffer is left untouched. The caller
+// is responsible for ensuring the size of memory required is non zero
+
+BYTE* CMediaType::ReallocFormatBuffer(ULONG length)
+{
+    ASSERT(length);
+
+    // an existing block of exactly this size is handed back unchanged
+    if (length == cbFormat) {
+        return pbFormat;
+    }
+
+    BYTE * const pFresh = static_cast<BYTE *>(CoTaskMemAlloc(length));
+    if (pFresh == NULL) {
+        // Allocation failed.  A request no larger than the current block
+        // still fits in it, so that block is handed back; otherwise fail.
+        // NOTE(review): cbFormat keeps its old (larger) value on this path.
+        if (length <= cbFormat) {
+            return pbFormat;
+        }
+        return NULL;
+    }
+
+    // carry over as much of the previous format as fits, then release it
+    if (cbFormat != 0) {
+        ASSERT(pbFormat);
+        const ULONG cbKeep = min(length, cbFormat);
+        memcpy(pFresh, pbFormat, cbKeep);
+        CoTaskMemFree(static_cast<PVOID>(pbFormat));
+    }
+
+    cbFormat = length;
+    pbFormat = pFresh;
+    return pFresh;
+}
+
+// initialise a media type structure
+// (all fields zeroed, then fixed-size samples of size 1 as the defaults)
+void CMediaType::InitMediaType()
+{
+    ZeroMemory(static_cast<PVOID>(this), sizeof(CMediaType));
+    bFixedSizeSamples = TRUE;
+    lSampleSize = 1;
+}
+
+
+// a partially specified media type can be passed to IPin::Connect
+// as a constraint on the media type used in the connection.
+// the type, subtype or format type can be null.
+BOOL CMediaType::IsPartiallySpecified(void) const
+{
+    // only the major type and the format type are examined here;
+    // the subtype plays no part in the answer
+    if (majortype == GUID_NULL) {
+        return TRUE;
+    }
+
+    return (formattype == GUID_NULL) ? TRUE : FALSE;
+}
+
+// Test this (full) type against a partial constraint: each GUID_NULL field of
+// *ppartial is a wildcard, every other field has to match.
+BOOL CMediaType::MatchesPartial(const CMediaType* ppartial) const
+{
+    const CMediaType& want = *ppartial;
+
+    if ((want.majortype != GUID_NULL) && (majortype != want.majortype)) {
+        return FALSE;
+    }
+    if ((want.subtype != GUID_NULL) && (subtype != want.subtype)) {
+        return FALSE;
+    }
+
+    // an unspecified format type leaves the format block unconstrained
+    if (want.formattype == GUID_NULL) {
+        return TRUE;
+    }
+    // if the format block is specified then it must match exactly
+    if (formattype != want.formattype) {
+        return FALSE;
+    }
+    if (cbFormat != want.cbFormat) {
+        return FALSE;
+    }
+    if (cbFormat == 0) {
+        return TRUE;
+    }
+    return (memcmp(pbFormat, want.pbFormat, cbFormat) == 0) ? TRUE : FALSE;
+}
+
+
+
+// general purpose function to delete a heap allocated AM_MEDIA_TYPE structure
+// which is useful when calling IEnumMediaTypes::Next as the interface
+// implementation allocates the structures which you must later delete
+// the format block may also be a pointer to an interface to release
+
+void WINAPI DeleteMediaType(AM_MEDIA_TYPE *pmt)
+{
+    // a NULL pointer is accepted and simply ignored
+    if (pmt != NULL) {
+        // first whatever the structure holds ...
+        FreeMediaType(*pmt);
+
+        // ... then the structure itself
+        CoTaskMemFree(static_cast<PVOID>(pmt));
+    }
+}
+
+
+// this also comes in useful when using the IEnumMediaTypes interface so
+// that you can copy a media type, you can do nearly the same by creating
+// a CMediaType object but as soon as it goes out of scope the destructor
+// will delete the memory it allocated (this takes a copy of the memory)
+
+AM_MEDIA_TYPE * WINAPI CreateMediaType(AM_MEDIA_TYPE const *pSrc)
+{
+    ASSERT(pSrc);
+
+    // Task-allocate the structure itself; NULL tells the caller it failed
+    void * const pRaw = CoTaskMemAlloc(sizeof(AM_MEDIA_TYPE));
+    if (pRaw == NULL) {
+        return NULL;
+    }
+
+    AM_MEDIA_TYPE * const pCopy = static_cast<AM_MEDIA_TYPE *>(pRaw);
+
+    // Deep-copy the source (including the variable length format block)
+    if (SUCCEEDED(CopyMediaType(pCopy, pSrc))) {
+        return pCopy;
+    }
+
+    // The copy failed: give the structure back and report failure.
+    // Only the structure itself is freed here.
+    CoTaskMemFree(pRaw);
+    return NULL;
+}
+
+
+//  Copy 1 media type to another.  The format block is duplicated and pUnk is
+//  AddRef'd; on E_OUTOFMEMORY the target holds neither a block nor a pUnk.
+HRESULT WINAPI CopyMediaType(AM_MEDIA_TYPE *pmtTarget, const AM_MEDIA_TYPE *pmtSource)
+{
+    //  We'll leak if we copy onto one that already exists - there's one
+    //  case we can check like that - copying to itself.
+    ASSERT(pmtSource != pmtTarget);
+    *pmtTarget = *pmtSource;
+    if (pmtSource->cbFormat != 0) {
+        ASSERT(pmtSource->pbFormat != NULL);
+        pmtTarget->pbFormat = (PBYTE)CoTaskMemAlloc(pmtSource->cbFormat);
+        if (pmtTarget->pbFormat == NULL) {
+            //  pUnk was copied without an AddRef: clear it so a later free cannot Release it.
+            pmtTarget->cbFormat = 0;
+            pmtTarget->pUnk = NULL;
+            return E_OUTOFMEMORY;
+        }
+        CopyMemory((PVOID)pmtTarget->pbFormat, (PVOID)pmtSource->pbFormat,
+                   pmtTarget->cbFormat);
+    }
+    if (pmtTarget->pUnk != NULL) {
+        pmtTarget->pUnk->AddRef();
+    }
+    return S_OK;
+}
+
+//  Free an existing media type (ie free resources it holds)
+
+void WINAPI FreeMediaType(AM_MEDIA_TYPE& mt)
+{
+    // the format block is released only while a non-zero length is recorded
+    if (mt.cbFormat != 0) {
+        CoTaskMemFree(static_cast<PVOID>(mt.pbFormat));
+        mt.cbFormat = 0;        // not strictly needed, but leaves mt tidy
+        mt.pbFormat = NULL;
+    }
+    // drop the reference held on pUnk, if there is one
+    if (mt.pUnk != NULL) {
+        mt.pUnk->Release();
+        mt.pUnk = NULL;
+    }
+}
+
+//  Initialize a media type from a WAVEFORMATEX.  With bSetFormat a task-allocated
+//  copy of the format is attached; without it cbFormat/pbFormat are not touched.
+//  On E_OUTOFMEMORY no format block is attached and cbFormat is reset to 0.
+STDAPI CreateAudioMediaType(
+    const WAVEFORMATEX *pwfx,
+    AM_MEDIA_TYPE *pmt,
+    BOOL bSetFormat
+)
+{
+    const BOOL bPcm = (pwfx->wFormatTag == WAVE_FORMAT_PCM);
+    pmt->majortype            = MEDIATYPE_Audio;
+    if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
+        pmt->subtype = ((PWAVEFORMATEXTENSIBLE)pwfx)->SubFormat;
+    } else {
+        pmt->subtype              = FOURCCMap(pwfx->wFormatTag);
+    }
+    pmt->formattype           = FORMAT_WaveFormatEx;
+    pmt->bFixedSizeSamples    = TRUE;
+    pmt->bTemporalCompression = FALSE;
+    pmt->lSampleSize          = pwfx->nBlockAlign;
+    pmt->pUnk                 = NULL;
+    if (bSetFormat) {
+        // PCM is stored as a bare WAVEFORMATEX; other tags keep their cbSize extra bytes
+        pmt->cbFormat = bPcm ? sizeof(WAVEFORMATEX) : sizeof(WAVEFORMATEX) + pwfx->cbSize;
+        pmt->pbFormat = (PBYTE)CoTaskMemAlloc(pmt->cbFormat);
+        if (pmt->pbFormat == NULL) {
+            pmt->cbFormat = 0;  // never report a length without a block behind it
+            return E_OUTOFMEMORY;
+        }
+        if (bPcm) {
+            CopyMemory(pmt->pbFormat, pwfx, sizeof(PCMWAVEFORMAT));
+            ((WAVEFORMATEX *)pmt->pbFormat)->cbSize = 0;
+        } else {
+            CopyMemory(pmt->pbFormat, pwfx, pmt->cbFormat);
+        }
+    }
+    return S_OK;
+}
+
+// eliminate very many spurious warnings from MS compiler
+#pragma warning(disable:4514)
diff --git a/plugins/GSdx_legacy/baseclasses/mtype.h b/plugins/GSdx_legacy/baseclasses/mtype.h
new file mode 100644
index 0000000000..41d5829829
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/mtype.h
@@ -0,0 +1,89 @@
+//------------------------------------------------------------------------------
+// File: MtType.h
+//
+// Desc: DirectShow base classes - defines a class that holds and manages
+//       media type information.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __MTYPE__
+#define __MTYPE__
+
+/* Helper class that derived pin objects can use to compare media
+   types etc. Has the same data members as the struct AM_MEDIA_TYPE defined
+   in the streams IDL file, but also has (non-virtual) member functions */
+
+class CMediaType : public _AMMediaType {
+
+public:
+    // construction and destruction
+    ~CMediaType();
+    CMediaType();
+    CMediaType(const GUID * majortype);
+    CMediaType(const AM_MEDIA_TYPE&, HRESULT* phr = NULL);
+    CMediaType(const CMediaType&, HRESULT* phr = NULL);
+    // assignment from another CMediaType or from a raw AM_MEDIA_TYPE
+    CMediaType& operator=(const CMediaType&);
+    CMediaType& operator=(const AM_MEDIA_TYPE&);
+    // comparison operators
+    BOOL operator == (const CMediaType&) const;
+    BOOL operator != (const CMediaType&) const;
+    // Set() overloads return an HRESULT, unlike the assignment operators
+    HRESULT Set(const CMediaType& rt);
+    HRESULT Set(const AM_MEDIA_TYPE& rt);
+
+    BOOL IsValid() const;
+    // accessors for the inherited majortype and subtype GUIDs
+    const GUID *Type() const { return &majortype;} ;
+    void SetType(const GUID *);
+    const GUID *Subtype() const { return &subtype;} ;
+    void SetSubtype(const GUID *);
+    // sample size and the bFixedSizeSamples / bTemporalCompression flags
+    BOOL IsFixedSize() const {return bFixedSizeSamples; };
+    BOOL IsTemporalCompressed() const {return bTemporalCompression; };
+    ULONG GetSampleSize() const;
+
+    void SetSampleSize(ULONG sz);
+    void SetVariableSize();
+    void SetTemporalCompression(BOOL bCompressed);
+
+    // read/write pointer to format - can't change length without
+    // calling SetFormat, AllocFormatBuffer or ReallocFormatBuffer
+
+    BYTE*   Format() const {return pbFormat; };
+    ULONG   FormatLength() const { return cbFormat; };
+    // format type GUID and format buffer management
+    void SetFormatType(const GUID *);
+    const GUID *FormatType() const {return &formattype; };
+    BOOL SetFormat(BYTE *pFormat, ULONG length);
+    void ResetFormatBuffer();
+    BYTE* AllocFormatBuffer(ULONG length);
+    BYTE* ReallocFormatBuffer(ULONG length);
+
+    void InitMediaType();
+
+    BOOL MatchesPartial(const CMediaType* ppartial) const;
+    BOOL IsPartiallySpecified(void) const;
+};
+
+
+/* General purpose functions to copy and delete a task allocated AM_MEDIA_TYPE
+   structure which is useful when using the IEnumMediaTypes interface as
+   the implementation allocates the structures which you must later delete */
+
+void WINAPI DeleteMediaType(AM_MEDIA_TYPE *pmt);
+AM_MEDIA_TYPE * WINAPI CreateMediaType(AM_MEDIA_TYPE const *pSrc);
+HRESULT WINAPI CopyMediaType(AM_MEDIA_TYPE *pmtTarget, const AM_MEDIA_TYPE *pmtSource);
+void WINAPI FreeMediaType(AM_MEDIA_TYPE& mt);  // frees what mt holds, not mt itself
+
+//  Initialize a media type from a WAVEFORMATEX. When bSetFormat is TRUE the
+//  format block is allocated and filled in; E_OUTOFMEMORY if that fails.
+STDAPI CreateAudioMediaType(
+    const WAVEFORMATEX *pwfx,
+    AM_MEDIA_TYPE *pmt,
+    BOOL bSetFormat);
+
+#endif /* __MTYPE__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/outputq.cpp b/plugins/GSdx_legacy/baseclasses/outputq.cpp
new file mode 100644
index 0000000000..7f7532f72a
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/outputq.cpp
@@ -0,0 +1,794 @@
+//------------------------------------------------------------------------------
+// File: OutputQ.cpp
+//
+// Desc: DirectShow base classes - implements COutputQueue class used by an
+//       output pin which may sometimes want to queue output samples on a
+//       separate thread and sometimes call Receive() directly on the input
+//       pin.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+
+
+//
+//  COutputQueue Constructor :
+//
+//  Determines if a thread is to be created and creates resources
+//
+//     pInputPin  - the downstream input pin we're queueing samples to
+//
+//     phr        - changed to a failure code if this function fails
+//                  (otherwise unchanged)
+//
+//     bAuto      - Ask pInputPin if it can block in Receive by calling
+//                  its ReceiveCanBlock method and create a thread if
+//                  it can block, otherwise not.
+//
+//     bQueue     - if bAuto == FALSE then we create a thread if and only
+//                  if bQueue == TRUE
+//
+//     lBatchSize - work in batches of lBatchSize
+//
+//     bBatchExact - Use exact batch sizes so don't send until the
+//                  batch is full or SendAnyway() is called
+//
+//     lListSize  - If we create a thread make the list of samples queued
+//                  to the thread have this size cache
+//
+//     dwPriority - If we create a thread set its priority to this
+//
+COutputQueue::COutputQueue(
+             IPin         *pInputPin,          //  Pin to send stuff to
+             HRESULT      *phr,                //  'Return code'
+             BOOL          bAuto,              //  Ask pin if queue or not
+             BOOL          bQueue,             //  Send through queue
+             LONG          lBatchSize,         //  Batch
+             BOOL          bBatchExact,        //  Batch exactly to BatchSize
+             LONG          lListSize,          //  Cache size of sample list
+             DWORD         dwPriority,         //  Priority of queue thread
+             bool          bFlushingOpt        //  Flushing optimization
+            ) : m_pPin(pInputPin),             //  (initializers listed in
+                m_pInputPin(NULL),             //   member declaration order)
+                m_bBatchExact(bBatchExact && (lBatchSize > 1)),
+                m_lBatchSize(lBatchSize),
+                m_List(NULL),
+                m_hSem(NULL),
+                m_hThread(NULL),
+                m_ppSamples(NULL),
+                m_nBatched(0),
+                m_lWaiting(0),
+                m_bFlushing(FALSE),
+                m_bFlushed(TRUE),
+                m_bFlushingOpt(bFlushingOpt),
+                m_bTerminate(FALSE),
+                m_bSendAnyway(FALSE),
+                m_hr(S_OK),
+                m_hEventPop(NULL)
+{
+    ASSERT(m_lBatchSize > 0);
+
+    //  Nothing to do if the caller has already recorded a failure
+
+    if (FAILED(*phr)) {
+        return;
+    }
+
+    //  The downstream pin must expose IMemInputPin; cache the interface
+
+    *phr = pInputPin->QueryInterface(IID_IMemInputPin, (void **)&m_pInputPin);
+    if (FAILED(*phr)) {
+        return;
+    }
+
+    //  In auto mode a successful ReceiveCanBlock() overrides bQueue :
+    //  we queue if and only if it returned S_OK
+
+    if (bAuto) {
+        const HRESULT hrCanBlock = m_pInputPin->ReceiveCanBlock();
+        if (SUCCEEDED(hrCanBlock)) {
+            bQueue = (S_OK == hrCanBlock);
+        }
+    }
+
+    //  The batch array is needed whether or not we queue
+
+    m_ppSamples = new PMEDIASAMPLE[m_lBatchSize];
+    if (NULL == m_ppSamples) {
+        *phr = E_OUTOFMEMORY;
+        return;
+    }
+
+    //  Without a queue there is nothing more to set up
+
+    if (!bQueue) {
+        DbgLog((LOG_TRACE, 2, TEXT("Calling input pin directly - no thread")));
+        return;
+    }
+
+    //  Queueing : create the semaphore, the sample list and the thread
+
+    DbgLog((LOG_TRACE, 2, TEXT("Creating thread for output pin")));
+    m_hSem = CreateSemaphore(NULL, 0, 0x7FFFFFFF, NULL);
+    if (NULL == m_hSem) {
+        const DWORD dwSemError = GetLastError();
+        *phr = AmHresultFromWin32(dwSemError);
+        return;
+    }
+    m_List = new CSampleList(NAME("Sample Queue List"),
+                             lListSize,
+                             FALSE         // No lock
+                            );
+    if (NULL == m_List) {
+        *phr = E_OUTOFMEMORY;
+        return;
+    }
+
+    DWORD dwThreadId;
+    m_hThread = CreateThread(NULL, 0, InitialThreadProc, (LPVOID)this,
+                             0, &dwThreadId);
+    if (NULL == m_hThread) {
+        const DWORD dwThreadError = GetLastError();
+        *phr = AmHresultFromWin32(dwThreadError);
+        return;
+    }
+    SetThreadPriority(m_hThread, dwPriority);
+}
+
+//
+//  COutputQueue Destructor :
+//
+//  Free all resources -
+//
+//      Thread, semaphore and sample list (when they were created),
+//      Batched samples and the cached IMemInputPin
+//
+COutputQueue::~COutputQueue()
+{
+    DbgLog((LOG_TRACE, 3, TEXT("COutputQueue::~COutputQueue")));
+    if (m_hThread != NULL) {
+        {
+            CAutoLock lck(this);
+            m_bTerminate = TRUE;
+            m_hr = S_FALSE;
+            NotifyThread();
+        }
+        DbgWaitForSingleObject(m_hThread);
+        EXECUTE_ASSERT(CloseHandle(m_hThread));
+        //  The thread frees the samples when asked to terminate
+        ASSERT(m_List->GetCount() == 0);
+    } else {
+        FreeSamples();      // no thread to do it for us
+    }
+    //  The list exists without a thread if CreateThread failed in the
+    //  constructor, so free it on both paths (deleting NULL is harmless)
+    delete m_List;
+    if (m_hSem != NULL) {
+        EXECUTE_ASSERT(CloseHandle(m_hSem));
+    }
+    delete [] m_ppSamples;
+    //  Release the pin last, once the thread can no longer be using it
+    if (m_pInputPin != NULL) {
+        m_pInputPin->Release();
+    }
+}
+
+//
+//  Static thread entry point : pv is the COutputQueue to run. Its
+//  ThreadProc() is bracketed by CoInitializeHelper() / CoUninitialize()
+//
+DWORD WINAPI COutputQueue::InitialThreadProc(LPVOID pv)
+{
+    const HRESULT hrInit = CAMThread::CoInitializeHelper();
+    const bool bUninit = (S_OK == hrInit);  // only undo an S_OK init
+
+    const DWORD dwExit = static_cast<COutputQueue *>(pv)->ThreadProc();
+
+    if (bUninit) {
+        CoUninitialize();
+    }
+    return dwExit;
+}
+
+//
+//  Thread sending the samples downstream :
+//
+//  When there is nothing to do the thread sets m_lWaiting (while
+//  holding the critical section) and then waits for m_hSem to be
+//  set (not holding the critical section)
+//  Returns 0, and only once m_bTerminate has been seen set
+DWORD COutputQueue::ThreadProc()
+{
+    while (TRUE) {
+        BOOL          bWait = FALSE;
+        IMediaSample *pSample;
+        LONG          lNumberToSend; // Local copy
+        NewSegmentPacket* ppacket = NULL; // only set for NEW_SEGMENT
+
+        //
+        //  Get a batch of samples and send it if possible
+        //  In any case exit the loop if there is a control action
+        //  requested
+        //
+        {
+            CAutoLock lck(this);
+            while (TRUE) {
+
+                if (m_bTerminate) {
+                    FreeSamples();
+                    return 0;
+                }
+                if (m_bFlushing) {
+                    FreeSamples();
+                    SetEvent(m_evFlushComplete);
+                }
+
+                //  Get a sample off the list
+
+                pSample = m_List->RemoveHead();
+                // inform derived class we took something off the queue
+                if (m_hEventPop) {
+                    //DbgLog((LOG_TRACE,3,TEXT("Queue: Delivered  SET EVENT")));
+                    SetEvent(m_hEventPop);
+                }
+
+                if (pSample != NULL &&
+                    !IsSpecialSample(pSample)) {
+
+                    //  If its just a regular sample just add it to the batch
+                    //  and exit the loop if the batch is full
+
+                    m_ppSamples[m_nBatched++] = pSample;
+                    if (m_nBatched == m_lBatchSize) {
+                        break;
+                    }
+                } else {
+
+                    //  If there was nothing in the queue and there's nothing
+                    //  to send (either because there's nothing or the batch
+                    //  isn't full) then prepare to wait
+
+                    if (pSample == NULL &&
+                        (m_bBatchExact || m_nBatched == 0)) {
+
+                        //  Tell other thread to set the event when there's
+                        //  something to do
+
+                        ASSERT(m_lWaiting == 0);
+                        m_lWaiting++;
+                        bWait      = TRUE;
+                    } else {
+
+                        //  We break out of the loop on SEND_PACKET unless
+                        //  there's nothing to send
+
+                        if (pSample == SEND_PACKET && m_nBatched == 0) {
+                            continue;
+                        }
+
+                        if (pSample == NEW_SEGMENT) {
+                            // now we need the parameters - we are
+                            // guaranteed that the next packet contains them
+                            ppacket = (NewSegmentPacket *) m_List->RemoveHead();
+                            // we took something off the queue
+                            if (m_hEventPop) {
+                                //DbgLog((LOG_TRACE,3,TEXT("Queue: Delivered  SET EVENT")));
+                                SetEvent(m_hEventPop);
+                            }
+
+                            ASSERT(ppacket);
+                        }
+                        //  EOS_PACKET falls through here and we exit the loop
+                        //  In this way it acts like SEND_PACKET
+                    }
+                    break;
+                }
+            }
+            if (!bWait) {
+                // We look at m_nBatched from the client side so keep
+                // it up to date inside the critical section
+                lNumberToSend = m_nBatched;  // Local copy
+                m_nBatched = 0;
+            }
+        }
+
+        //  Wait for some more data
+
+        if (bWait) {
+            DbgWaitForSingleObject(m_hSem);
+            continue;
+        }
+
+
+
+        //  OK - send it if there's anything to send
+        //  We DON'T check m_bBatchExact here because either we've got
+        //  a full batch or we dropped through because we got
+        //  SEND_PACKET or EOS_PACKET - both of which imply we should
+        //  flush our batch
+
+        if (lNumberToSend != 0) {
+            long nProcessed;
+            if (m_hr == S_OK) {
+                ASSERT(!m_bFlushed);
+                HRESULT hr = m_pInputPin->ReceiveMultiple(m_ppSamples,
+                                                          lNumberToSend,
+                                                          &nProcessed);
+                /*  Don't overwrite a flushing state HRESULT */
+                CAutoLock lck(this);
+                if (m_hr == S_OK) {
+                    m_hr = hr;
+                }
+                ASSERT(!m_bFlushed);
+            }
+            while (lNumberToSend != 0) {
+                m_ppSamples[--lNumberToSend]->Release();
+            }
+            if (m_hr != S_OK) {
+
+                //  In any case wait for more data - S_OK just
+                //  means there wasn't an error
+
+                DbgLog((LOG_ERROR, 2, TEXT("ReceiveMultiple returned %8.8X"),
+                       m_hr));
+            }
+        }
+
+        //  Check for end of stream
+
+        if (pSample == EOS_PACKET) {
+
+            //  We don't send even end of stream on if we've previously
+            //  returned something other than S_OK
+            //  This is because in that case the pin which returned
+            //  something other than S_OK should have either sent
+            //  EndOfStream() or notified the filter graph
+
+            if (m_hr == S_OK) {
+                DbgLog((LOG_TRACE, 2, TEXT("COutputQueue sending EndOfStream()")));
+                HRESULT hr = m_pPin->EndOfStream();
+                if (FAILED(hr)) {
+                    DbgLog((LOG_ERROR, 2, TEXT("COutputQueue got code 0x%8.8X from EndOfStream()"), hr));
+                }
+            }
+        }
+
+        //  Data from a new source
+
+        if (pSample == RESET_PACKET) {
+            m_hr = S_OK;
+            SetEvent(m_evFlushComplete);
+        }
+
+        if (pSample == NEW_SEGMENT) {
+            m_pPin->NewSegment(ppacket->tStart, ppacket->tStop, ppacket->dRate);
+            delete ppacket;
+        }
+    }
+}
+
+//  Force out whatever is currently batched, even if the batch isn't full
+void COutputQueue::SendAnyway()
+{
+    if (IsQueued()) {
+        //  Queued : post a SEND_PACKET marker for the thread to act on
+        CAutoLock lck(this);
+        QueueSample(SEND_PACKET);
+        NotifyThread();
+        return;
+    }
+
+    //  Direct : an empty ReceiveMultiple() call sends the batch while the
+    //  private m_bSendAnyway flag is raised
+    LONG cIgnored;
+    m_bSendAnyway = TRUE;
+    ReceiveMultiple(NULL, 0, &cIgnored);
+    m_bSendAnyway = FALSE;
+}
+
+void
+COutputQueue::NewSegment(
+    REFERENCE_TIME tStart,
+    REFERENCE_TIME tStop,
+    double dRate)
+{
+    //  A new segment is only passed on while the sticky code is S_OK
+    if (m_hr != S_OK) {
+        return;
+    }
+
+    if (!IsQueued()) {
+        //  Direct : send any partial batch first, then tell the pin
+        if (m_bBatchExact) {
+            SendAnyway();
+        }
+        m_pPin->NewSegment(tStart, tStop, dRate);
+        return;
+    }
+
+    //  Queued : the new segment must appear in order in the data, but
+    //  it also needs parameters. Rather than wrap every single sample
+    //  so special ones can be told apart, a special NEW_SEGMENT pointer
+    //  is queued and we guarantee (by holding the critical section
+    //  across both QueueSample calls) that the entry immediately after
+    //  it is a NewSegmentPacket containing the parameters.
+    NewSegmentPacket * const pParams = new NewSegmentPacket;
+    if (NULL == pParams) {
+        return;
+    }
+    pParams->tStart = tStart;
+    pParams->tStop  = tStop;
+    pParams->dRate  = dRate;
+
+    CAutoLock lck(this);
+    QueueSample(NEW_SEGMENT);
+    QueueSample((IMediaSample *) pParams);
+    NotifyThread();
+}
+
+
+//
+//  End of stream : sent straight to the pin, or queued in order behind
+//  the data when we have a thread. Dropped if m_hr is not S_OK.
+void COutputQueue::EOS()
+{
+    CAutoLock lck(this);
+    if (!IsQueued()) {
+        if (m_bBatchExact) {
+            SendAnyway();       // push out any partial batch first
+        }
+        if (m_hr == S_OK) {
+            DbgLog((LOG_TRACE, 2, TEXT("COutputQueue sending EndOfStream()")));
+            m_bFlushed = FALSE;
+            HRESULT hr = m_pPin->EndOfStream();
+            if (FAILED(hr)) {
+                DbgLog((LOG_ERROR, 2, TEXT("COutputQueue got code 0x%8.8X from EndOfStream()"), hr));
+            }
+        }
+    } else {
+        if (m_hr == S_OK) {
+            m_bFlushed = FALSE;
+            QueueSample(EOS_PACKET);    // the thread calls EndOfStream()
+            NotifyThread();
+        }
+    }
+}
+
+//
+//  Enter the flushing state : queued data is discarded, m_hr is made
+//  non-S_OK so new samples are rejected, and the pin is told downstream
+void COutputQueue::BeginFlush()
+{
+    if (IsQueued()) {
+        {
+            CAutoLock lck(this);
+
+            // block receives -- we assume this is done by the
+            // filter in which we are a component
+
+            // discard all queued data
+            // (the thread frees the samples when it sees m_bFlushing)
+            m_bFlushing = TRUE;
+
+            //  Make sure we discard all samples from now on
+
+            if (m_hr == S_OK) {
+                m_hr = S_FALSE;
+            }
+
+            // Optimize so we don't keep calling downstream all the time :
+            // m_bFlushed means nothing was received since the last flush
+            if (m_bFlushed && m_bFlushingOpt) {
+                return;
+            }
+
+            // Make sure we really wait for the flush to complete
+            m_evFlushComplete.Reset();
+
+            NotifyThread();
+        }
+
+        // pass this downstream
+        // (done after the critical section has been released)
+        m_pPin->BeginFlush();
+    } else {
+        // pass downstream first to avoid deadlocks
+        m_pPin->BeginFlush();
+        CAutoLock lck(this);
+        // discard all queued data
+        // (with no thread the batch is actually freed in EndFlush())
+        m_bFlushing = TRUE;
+
+        //  Make sure we discard all samples from now on
+
+        if (m_hr == S_OK) {
+            m_hr = S_FALSE;
+        }
+    }
+
+}
+
+//
+// leave flush mode - pass this downstream and make m_hr S_OK again
+void COutputQueue::EndFlush()
+{
+    {
+        CAutoLock lck(this);
+        ASSERT(m_bFlushing);
+        if (m_bFlushingOpt && m_bFlushed && IsQueued()) {
+            m_bFlushing = FALSE;    // same condition as BeginFlush()'s early
+            m_hr = S_OK;            // return : no downstream call is made
+            return;
+        }
+    }
+
+    // sync with pushing thread -- done in BeginFlush
+    // ensure no more data to go downstream -- done in BeginFlush
+    //
+    // Because we are synching here there is no need to hold the critical
+    // section (in fact we'd deadlock if we did!)
+
+    if (IsQueued()) {
+        m_evFlushComplete.Wait();   // set by the thread while m_bFlushing
+    } else {
+        FreeSamples();              // no thread : drop the batch ourselves
+    }
+
+    //  Be daring - the caller has guaranteed no samples will arrive
+    //  before EndFlush() returns
+
+    m_bFlushing = FALSE;
+    m_bFlushed  = TRUE;
+
+    // call EndFlush on downstream pins
+
+    m_pPin->EndFlush();
+
+    m_hr = S_OK;
+}
+
+//  COutputQueue::QueueSample
+//
+//  Append a sample (or special packet) to the tail of the thread's list.
+//  If the list can't take it, a real sample is Release()'d instead.
+//  The critical section MUST be held when this is called
+void COutputQueue::QueueSample(IMediaSample *pSample)
+{
+    const bool bAdded = (m_List->AddTail(pSample) != NULL);
+    if (bAdded || IsSpecialSample(pSample)) {
+        return;
+    }
+    pSample->Release();
+}
+
+//
+//  COutputQueue::Receive()
+//
+//  Send a single sample by the multiple sample route, i.e. as an array
+//  of one (NOTE - this could be optimized if necessary)
+//
+//  The caller's reference on the sample is consumed
+//
+HRESULT COutputQueue::Receive(IMediaSample *pSample)
+{
+    LONG cIgnored;                      // required out-parameter, unused
+    IMediaSample **ppOne = &pSample;    // one-element "array"
+    return ReceiveMultiple(ppOne, 1, &cIgnored);
+}
+
+//
+//  COutputQueue::ReceiveMultiple()
+//
+//  Send a set of samples to the downstream pin (direct mode) or hand
+//  them to the queue thread (queued mode)
+//      ppSamples           - array of samples
+//      nSamples            - how many
+//      nSamplesProcessed   - How many were processed
+//  Returns the sticky m_hr in direct mode or when discarding; S_OK when
+//  the samples were queued. The caller's references are always consumed
+//  (Release()'d here, or later from the batch / by the queue thread)
+//
+HRESULT COutputQueue::ReceiveMultiple (
+    IMediaSample **ppSamples,
+    long nSamples,
+    long *nSamplesProcessed)
+{
+    CAutoLock lck(this);
+    //  Either call directly or queue up the samples
+
+    if (!IsQueued()) {
+
+        //  If we already had a bad return code then just return
+
+        if (S_OK != m_hr) {
+
+            //  If we've never received anything since the last Flush()
+            //  and the sticky return code is not S_OK we must be
+            //  flushing
+            //  ((!A || B) is equivalent to A implies B)
+            ASSERT(!m_bFlushed || m_bFlushing);
+
+            //  We're supposed to Release() them anyway!
+            *nSamplesProcessed = 0;
+            for (int i = 0; i < nSamples; i++) {
+                DbgLog((LOG_TRACE, 3, TEXT("COutputQueue (direct) : Discarding %d samples code 0x%8.8X"),
+                        nSamples, m_hr));
+                ppSamples[i]->Release();
+            }
+
+            return m_hr;
+        }
+        //
+        //  If we're flushing the sticky return code should be S_FALSE
+        //
+        ASSERT(!m_bFlushing);
+        m_bFlushed = FALSE;
+
+        ASSERT(m_nBatched < m_lBatchSize);
+        ASSERT(m_nBatched == 0 || m_bBatchExact);
+
+        //  Loop processing the samples in batches
+
+        LONG iLost = 0;
+        long iDone;
+        for (iDone = 0;
+             iDone < nSamples || (m_nBatched != 0 && m_bSendAnyway);
+            ) {
+// NOTE(review): a partial batch is only sent when nSamples == 0 (&& binds tighter than ||) - confirm intended
+//pragma message (REMIND("Implement threshold scheme"))
+            ASSERT(m_nBatched < m_lBatchSize);
+            if (iDone < nSamples) {
+                m_ppSamples[m_nBatched++] = ppSamples[iDone++];
+            }
+            if (m_nBatched == m_lBatchSize ||
+                nSamples == 0 && (m_bSendAnyway || !m_bBatchExact)) {
+                LONG nDone;
+                DbgLog((LOG_TRACE, 4, TEXT("Batching %d samples"),
+                       m_nBatched));
+
+                if (m_hr == S_OK) {
+                    m_hr = m_pInputPin->ReceiveMultiple(m_ppSamples,
+                                                        m_nBatched,
+                                                        &nDone);
+                } else {
+                    nDone = 0;
+                }
+                iLost += m_nBatched - nDone;
+                for (LONG i = 0; i < m_nBatched; i++) {
+                    m_ppSamples[i]->Release();
+                }
+                m_nBatched = 0;
+            }
+        }
+        *nSamplesProcessed = iDone - iLost;
+        if (*nSamplesProcessed < 0) {
+            *nSamplesProcessed = 0;
+        }
+        return m_hr;
+    } else {
+        /*  We're sending to our thread */
+
+        if (m_hr != S_OK) {
+            *nSamplesProcessed = 0;
+            DbgLog((LOG_TRACE, 3, TEXT("COutputQueue (queued) : Discarding %d samples code 0x%8.8X"),
+                    nSamples, m_hr));
+            for (int i = 0; i < nSamples; i++) {
+                ppSamples[i]->Release();
+            }
+            return m_hr;
+        }
+        m_bFlushed = FALSE;
+        for (long i = 0; i < nSamples; i++) {
+            QueueSample(ppSamples[i]);
+        }
+        *nSamplesProcessed = nSamples;
+        if (!m_bBatchExact ||
+            m_nBatched + m_List->GetCount() >= m_lBatchSize) {
+            NotifyThread();
+        }
+        return S_OK;
+    }
+}
+//  Get ready for new data - cancels sticky m_hr
+void COutputQueue::Reset()
+{
+    if (!IsQueued()) {
+        m_hr = S_OK;
+    } else {
+        {   //  scoped : the thread needs this lock to pop RESET_PACKET
+            CAutoLock lck(this);
+            QueueSample(RESET_PACKET);
+            NotifyThread();
+        }
+        m_evFlushComplete.Wait();   //  signalled by the thread afterwards
+    }
+}
+//  Remove and Release() all queued and Batched samples
+//  (m_hEventPop, if set, is signalled after every RemoveHead() call)
+void COutputQueue::FreeSamples()
+{
+    CAutoLock lck(this);
+
+    //  First drain the thread's list, if there is one
+    if (IsQueued()) {
+        for (;;) {
+            IMediaSample * const pEntry = m_List->RemoveHead();
+            if (m_hEventPop) {
+                SetEvent(m_hEventPop);  // inform derived class of the pop
+            }
+            if (NULL == pEntry) {
+                break;                  // list is empty
+            }
+
+            if (pEntry == NEW_SEGMENT) {
+                //  The marker is always followed by its parameter packet,
+                //  which NewSegment() allocated with new
+                NewSegmentPacket * const pParams =
+                    (NewSegmentPacket *) m_List->RemoveHead();
+                if (m_hEventPop) {
+                    SetEvent(m_hEventPop);
+                }
+                ASSERT(pParams != NULL);
+                delete pParams;
+            } else if (!IsSpecialSample(pEntry)) {
+                pEntry->Release();      // ordinary sample
+            }
+            //  other special packets carry nothing to free
+        }
+    }
+
+    //  Then release whatever is sitting in the batch array
+    for (LONG iBatch = 0; iBatch < m_nBatched; ++iBatch) {
+        m_ppSamples[iBatch]->Release();
+    }
+    m_nBatched = 0;
+}
+
+//  Wake the queue thread if it is blocked on m_hSem waiting for work
+//
+//  The critical section MUST be held when this is called
+void COutputQueue::NotifyThread()
+{
+    ASSERT(IsQueued());
+    if (0 == m_lWaiting) {
+        return;     //  Optimize - no need to signal if it's not waiting
+    }
+    ReleaseSemaphore(m_hSem, m_lWaiting, NULL);
+    m_lWaiting = 0;
+}
+
+//  See if there's any work to do
+//  Returns
+//      TRUE  if there is nothing on the queue and nothing in the batch
+//            and all data has been sent
+//      FALSE otherwise
+//
+BOOL COutputQueue::IsIdle()
+{
+    CAutoLock lck(this);
+
+    //  A queue thread that is not parked waiting for work is busy
+    const bool bThreadBusy = IsQueued() && (m_lWaiting == 0);
+    if (bThreadBusy) {
+        return FALSE;
+    }
+
+    //  Samples still sitting in the current batch are unsent work
+    if (m_nBatched != 0) {
+        return FALSE;
+    }
+
+    //  If we're idle it shouldn't be possible for there
+    //  to be anything on the work queue
+
+    ASSERT(!IsQueued() || m_List->GetCount() == 0);
+    return TRUE;
+}
+
+//  Remember an event to SetEvent() each time the queue's list is popped
+void COutputQueue::SetPopEvent(HANDLE hEvent)
+{
+    m_hEventPop = hEvent;
+}
diff --git a/plugins/GSdx_legacy/baseclasses/outputq.h b/plugins/GSdx_legacy/baseclasses/outputq.h
new file mode 100644
index 0000000000..596893730b
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/outputq.h
@@ -0,0 +1,137 @@
+//------------------------------------------------------------------------------
+// File: OutputQ.h
+//
+// Desc: DirectShow base classes -  defines the COutputQueue class, which
+//       makes a queue of samples and sends them to an output pin.  The
+//       class will optionally send the samples to the pin directly.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+typedef CGenericList<IMediaSample> CSampleList;
+
+class COutputQueue : public CCritSec
+{
+public:
+    //  Constructor
+    COutputQueue(IPin      *pInputPin,          //  Pin to send stuff to
+                 HRESULT   *phr,                //  'Return code'
+                 BOOL       bAuto = TRUE,       //  Ask pin if blocks
+                 BOOL       bQueue = TRUE,      //  Send through queue (ignored if
+                                                //  bAuto set)
+                 LONG       lBatchSize = 1,     //  Batch
+                 BOOL       bBatchExact = FALSE,//  Batch exactly to BatchSize
+                 LONG       lListSize =         //  Likely number in the list
+                                DEFAULTCACHE,
+                 DWORD      dwPriority =        //  Priority of thread to create
+                                THREAD_PRIORITY_NORMAL,
+                 bool       bFlushingOpt = false // flushing optimization
+                );
+    ~COutputQueue();
+
+    // enter flush state - discard all data
+    void BeginFlush();      // Begin flushing samples
+
+    // re-enable receives (pass this downstream)
+    void EndFlush();        // Complete flush of samples - downstream
+                            // pin guaranteed not to block at this stage
+
+    void EOS();             // Call this on End of stream
+
+    void SendAnyway();      // Send batched samples anyway (if bBatchExact set)
+
+    void NewSegment(
+            REFERENCE_TIME tStart,
+            REFERENCE_TIME tStop,
+            double dRate);
+
+    HRESULT Receive(IMediaSample *pSample);
+
+    // do something with these media samples
+    HRESULT ReceiveMultiple (
+        IMediaSample **pSamples,
+        long nSamples,
+        long *nSamplesProcessed);
+
+    void Reset();           // Reset m_hr ready for more data
+
+    //  See if its idle or not
+    BOOL IsIdle();
+
+    // give the class an event to fire each time something is removed from the queue
+    void SetPopEvent(HANDLE hEvent);
+
+protected:
+    static DWORD WINAPI InitialThreadProc(LPVOID pv);
+    DWORD ThreadProc();
+    BOOL  IsQueued()
+    {
+        return m_List != NULL;
+    }
+
+    //  The critical section MUST be held when this is called
+    void QueueSample(IMediaSample *pSample);
+
+    BOOL IsSpecialSample(IMediaSample *pSample)
+    {
+        return (DWORD_PTR)pSample > (DWORD_PTR)(LONG_PTR)(-16);
+    }
+
+    //  Remove and Release() batched and queued samples
+    void FreeSamples();
+
+    //  Notify the thread there is something to do
+    void NotifyThread();
+
+
+protected:
+    //  Queue 'messages'
+    #define SEND_PACKET      ((IMediaSample *)(LONG_PTR)(-2))  // Send batch
+    #define EOS_PACKET       ((IMediaSample *)(LONG_PTR)(-3))  // End of stream
+    #define RESET_PACKET     ((IMediaSample *)(LONG_PTR)(-4))  // Reset m_hr
+    #define NEW_SEGMENT      ((IMediaSample *)(LONG_PTR)(-5))  // send NewSegment
+
+    // new segment packet is always followed by one of these
+    struct NewSegmentPacket {
+        REFERENCE_TIME tStart;
+        REFERENCE_TIME tStop;
+        double dRate;
+    };
+
+    // Remember input stuff
+    IPin          * const m_pPin;
+    IMemInputPin  *       m_pInputPin;
+    BOOL            const m_bBatchExact;
+    LONG            const m_lBatchSize;
+
+    CSampleList   *       m_List;
+    HANDLE                m_hSem;
+    CAMEvent                m_evFlushComplete;
+    HANDLE                m_hThread;
+    IMediaSample  **      m_ppSamples;
+    LONG                  m_nBatched;
+
+    //  Wait optimization
+    LONG                  m_lWaiting;
+    //  Flush synchronization
+    BOOL                  m_bFlushing;
+
+    // flushing optimization. some downstream filters have trouble
+    // with the queue's flushing optimization. other rely on it
+    BOOL                  m_bFlushed;
+    bool                  m_bFlushingOpt;
+
+    //  Terminate now
+    BOOL                  m_bTerminate;
+
+    //  Send anyway flag for batching
+    BOOL                  m_bSendAnyway;
+
+    //  Deferred 'return code' - holds HRESULT values, so typed as one
+    HRESULT volatile      m_hr;
+
+    // an event that can be fired each time the queue is popped
+    HANDLE m_hEventPop;
+};
+
diff --git a/plugins/GSdx_legacy/baseclasses/pstream.cpp b/plugins/GSdx_legacy/baseclasses/pstream.cpp
new file mode 100644
index 0000000000..3a15b39e91
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/pstream.cpp
@@ -0,0 +1,196 @@
+//------------------------------------------------------------------------------
+// File: PStream.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+
+#ifdef PERF
+#include <measure.h>
+#endif
+
+
+//
+// Constructor - punk and phr are accepted but not used here
+//
+CPersistStream::CPersistStream(IUnknown *punk, HRESULT *phr)
+    : mPS_dwFileVersion(GetSoftwareVersion())  // assumed version until Load replaces it
+    , mPS_fDirty(FALSE)                        // a new object has nothing unsaved
+{
+}
+
+
+//
+// Destructor - the class only holds a DWORD and a BOOL, so nothing is released
+//
+CPersistStream::~CPersistStream()
+{
+}
+
+#if 0
+SAMPLE CODE TO COPY - not active at the moment
+
+//
+// NonDelegatingQueryInterface
+//
+// This object supports IPersist & IPersistStream
+STDMETHODIMP CPersistStream::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    if (riid == IID_IPersist) {
+        return GetInterface((IPersist *) this, ppv);
+    }
+    else if (riid == IID_IPersistStream) {
+        return GetInterface((IPersistStream *) this, ppv);
+    }
+    else {
+        return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+    }
+}
+#endif
+
+
+//
+// WriteToStream
+//
+// Writes to the stream (default action is to write nothing)
+HRESULT CPersistStream::WriteToStream(IStream *pStream)
+{
+    // You can override this to do things like
+    // hr = pStream->Write(MyStructure, sizeof(MyStructure), NULL);
+    // This base version leaves pStream untouched and reports success.
+    return NOERROR;
+}
+
+// ReadFromStream
+// Reads from the stream (default action is to read nothing)
+HRESULT CPersistStream::ReadFromStream(IStream * pStream)
+{
+    // You can override this to do things like
+    // hr = pStream->Read(MyStructure, sizeof(MyStructure), NULL);
+    // This base version leaves pStream untouched and reports success.
+    return NOERROR;
+}
+
+
+//
+// Load
+//
+// Restores the object from pStm: the version number comes first and is
+// stored in mPS_dwFileVersion, the rest is left to ReadFromStream.
+STDMETHODIMP CPersistStream::Load(LPSTREAM pStm)
+{
+    HRESULT hrVersion;
+    const int nFileVersion = ReadInt(pStm, hrVersion);
+
+    // the member is assigned whether or not the read worked
+    mPS_dwFileVersion = nFileVersion;
+
+    return FAILED(hrVersion) ? hrVersion : ReadFromStream(pStm);
+}  // Load
+
+
+
+//
+// Save
+//
+// Writes the software version followed by the derived class's data to pStm.
+// The dirty flag is only updated when both writes succeed.
+STDMETHODIMP CPersistStream::Save(LPSTREAM pStm, BOOL fClearDirty)
+{
+    // Version number goes first so that Load can read it back
+    HRESULT hrResult = WriteInt(pStm, GetSoftwareVersion());
+
+    if (SUCCEEDED(hrResult)) {
+        hrResult = WriteToStream(pStm);
+
+        if (SUCCEEDED(hrResult)) {
+            // fClearDirty == FALSE marks the object dirty
+            mPS_fDirty = !fClearDirty;
+        }
+    }
+
+    return hrResult;
+} // Save
+
+
+// WriteInt
+//
+// Writes an integer to an IStream as 11 UNICODE characters followed by one space.
+// Any value of up to 32 bits (shorts, unsigneds, ...) can be passed as long as
+// squeezing it into an int does not truncate it.  An unsigned 0x80000000 is
+// written as -2147483648 and comes back as 0x80000000 through ReadInt, so
+// cast as you please.
+
+STDAPI WriteInt(IStream *pIStream, int n)
+{
+    WCHAR wszText[13];  // 12 characters are written; the 13th slot takes the null
+    (void)StringCchPrintfW(wszText, NUMELMS(wszText), L"%011d ", n);
+    return pIStream->Write(wszText, 12*sizeof(WCHAR), NULL);
+} // WriteInt
+
+
+// ReadInt
+//
+// Reads an integer from an IStream in the format written by WriteInt: an
+// optional '-', decimal digits, then a terminator (space, tab, CR, LF, NUL or
+// the end of the stream).  On failure hr holds the error and 0 is returned.
+// Stripped down subset of what sscanf can do (without dragging in the C runtime)
+
+STDAPI_(int) ReadInt(IStream *pIStream, HRESULT &hr)
+{
+    int Sign = 1;
+    unsigned int n = 0;    // result will be n*Sign
+    WCHAR wch;
+    ULONG cbRead = 0;      // bytes delivered by the most recent Read
+
+    hr = pIStream->Read( &wch, sizeof(wch), &cbRead);
+    if (FAILED(hr)) {
+        return 0;
+    }
+    if (cbRead != sizeof(wch)) wch = L'\0';   // short read: treat end of stream as a terminator
+
+    if (wch==L'-'){
+        Sign = -1;
+        hr = pIStream->Read( &wch, sizeof(wch), &cbRead);
+        if (FAILED(hr)) {
+            return 0;
+        }
+        if (cbRead != sizeof(wch)) wch = L'\0';
+    }
+
+    for( ; ; ) {
+        if (wch>=L'0' && wch<=L'9') {
+            n = 10*n+(int)(wch-L'0');
+        } else if (  wch == L' '
+                  || wch == L'\t'
+                  || wch == L'\r'
+                  || wch == L'\n'
+                  || wch == L'\0'
+                  ) {
+            break;
+        } else {
+            hr = VFW_E_INVALID_FILE_FORMAT;
+            return 0;
+        }
+        hr = pIStream->Read( &wch, sizeof(wch), &cbRead);
+        if (FAILED(hr)) {
+            return 0;
+        }
+        if (cbRead != sizeof(wch)) wch = L'\0';   // else a stream ending on a digit would loop forever
+    }
+
+    // 0x80000000 with a '-' is the negative number that has no positive version
+    if (n==0x80000000 && Sign==-1) return (int)n;
+    return (int)n * Sign;
+} // ReadInt
+
+
+// The Microsoft C/C++ compiler generates level 4 warnings to the effect that
+// a particular inline function (from some base class) was not needed.
+// This line gets rid of hundreds of such unwanted messages and makes
+// -W4 compilation feasible:
+#pragma warning(disable: 4514)
diff --git a/plugins/GSdx_legacy/baseclasses/pstream.h b/plugins/GSdx_legacy/baseclasses/pstream.h
new file mode 100644
index 0000000000..0611a4e311
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/pstream.h
@@ -0,0 +1,114 @@
+//------------------------------------------------------------------------------
+// File: PStream.h
+//
+// Desc: DirectShow base classes - defines a class for persistent properties
+//       of filters.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __PSTREAM__
+#define __PSTREAM__
+
+// Base class for persistent properties of filters
+// (i.e. filter properties in saved graphs)
+
+// The simplest way to use this is:
+// 1. Arrange for your filter to inherit this class
+// 2. Implement in your class WriteToStream and ReadFromStream
+//    These will override the "do nothing" functions here.
+// 3. Change your NonDelegatingQueryInterface to handle IPersistStream
+// 4. Implement SizeMax to return the number of bytes of data you save.
+//    If you save UNICODE data, don't forget a char is 2 bytes.
+// 5. Whenever your data changes, call SetDirty()
+//
+// At some point you may decide to alter, or extend the format of your data.
+// At that point you will wish that you had a version number in all the old
+// saved graphs, so that you can tell, when you read them, whether they
+// represent the old or new form.  To assist you in this, this class
+// writes and reads a version number.
+// When it writes, it calls GetSoftwareVersion()  to enquire what version
+// of the software we have at the moment.  (In effect this is a version number
+// of the data layout in the file).  It writes this as the first thing in the data.
+// If you want to change the version, implement (override) GetSoftwareVersion().
+// It reads this from the file into mPS_dwFileVersion before calling ReadFromStream,
+// so in ReadFromStream you can check mPS_dwFileVersion to see if you are reading
+// an old version file.
+// Normally you should accept files whose version is no newer than the software
+// version that's reading them.
+
+
+// CPersistStream
+//
+// Implements IPersistStream.
+// See 'OLE Programmers Reference (Vol 1):Structured Storage Overview' for
+// more implementation information.
+class CPersistStream : public IPersistStream {
+    private:
+
+        // Internal state: (none at present)
+
+    protected:
+        DWORD     mPS_dwFileVersion;         // version number of file (being read)
+        BOOL      mPS_fDirty;                // TRUE while there are unsaved changes
+
+    public:
+
+        // IPersistStream methods
+
+        STDMETHODIMP IsDirty()
+            {return (mPS_fDirty ? S_OK : S_FALSE);}  // note S_FALSE means clean
+        STDMETHODIMP Load(LPSTREAM pStm);
+        STDMETHODIMP Save(LPSTREAM pStm, BOOL fClearDirty);
+        STDMETHODIMP GetSizeMax(ULARGE_INTEGER * pcbSize)
+                         // Allow 24 bytes (12 WCHARs, as written by WriteInt) for version.
+                         { pcbSize->QuadPart = 12*sizeof(WCHAR)+SizeMax(); return NOERROR; }
+
+        // implementation
+        // the constructor records GetSoftwareVersion() as the file version and starts clean
+        CPersistStream(IUnknown *punk, HRESULT *phr);
+        ~CPersistStream();
+        // set or clear the dirty flag reported by IsDirty; always returns NOERROR
+        HRESULT SetDirty(BOOL fDirty)
+            { mPS_fDirty = fDirty; return NOERROR;}
+
+
+        // override to reveal IPersist & IPersistStream
+        // STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+        // --- IPersist ---
+
+        // You must override this to provide your own class id
+        STDMETHODIMP GetClassID(CLSID *pClsid) PURE;
+
+        // overrideable if you want
+        // file version number.  Override it if you ever change format
+        virtual DWORD GetSoftwareVersion(void) { return 0; }
+
+
+        //=========================================================================
+        // OVERRIDE THESE to read and write your data
+        // SizeMax: bytes of your own data; GetSizeMax adds the 24 bytes of the version
+        // WriteToStream/ReadFromStream: called by Save/Load once the version is handled
+
+        virtual int SizeMax() {return 0;}
+        virtual HRESULT WriteToStream(IStream *pStream);
+        virtual HRESULT ReadFromStream(IStream *pStream);
+        //=========================================================================
+
+    private:
+
+};
+
+
+// --- Useful helpers ---
+
+
+// Writes an int to an IStream as UNICODE.
+STDAPI WriteInt(IStream *pIStream, int n);
+
+// inverse of WriteInt
+STDAPI_(int) ReadInt(IStream *pIStream, HRESULT &hr);
+
+#endif // __PSTREAM__
diff --git a/plugins/GSdx_legacy/baseclasses/pullpin.cpp b/plugins/GSdx_legacy/baseclasses/pullpin.cpp
new file mode 100644
index 0000000000..81d0199351
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/pullpin.cpp
@@ -0,0 +1,527 @@
+//------------------------------------------------------------------------------
+// File: PullPin.cpp
+//
+// Desc: DirectShow base classes - implements CPullPin class that pulls data
+//       from IAsyncReader.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include "pullpin.h"
+
+
+// Starts disconnected, with every member in a defined state
+CPullPin::CPullPin()
+  : m_pReader(NULL),
+    m_tStart(0), m_tStop(0), m_tDuration(0), m_bSync(FALSE),  // replaced by Connect()
+    m_State(TM_Exit), m_pAlloc(NULL)
+{
+}
+
+CPullPin::~CPullPin()
+{
+    Disconnect();   // stops the worker thread, then releases the reader and allocator
+}
+
+// Connects to pUnk's IAsyncReader, agrees an allocator (pAlloc is offered as
+// the preferred one) and defaults the range to 0..duration.  Returns S_OK,
+// VFW_E_ALREADY_CONNECTED if a reader is already held, or the failing HRESULT
+// (Disconnect() undoes the connection if the allocator or length query fails).
+HRESULT
+CPullPin::Connect(IUnknown* pUnk, IMemAllocator* pAlloc, BOOL bSync)
+{
+    CAutoLock lock(&m_AccessLock);
+
+    if (m_pReader) {
+	return VFW_E_ALREADY_CONNECTED;
+    }
+
+    HRESULT hr = pUnk->QueryInterface(IID_IAsyncReader, (void**)&m_pReader);
+    if (FAILED(hr)) {
+	return(hr);
+    }
+
+    hr = DecideAllocator(pAlloc, NULL);
+    if (FAILED(hr)) {
+	Disconnect();
+	return hr;
+    }
+
+    LONGLONG llTotal, llAvail;
+    hr = m_pReader->Length(&llTotal, &llAvail);
+    if (FAILED(hr)) {
+	Disconnect();
+	return hr;
+    }
+
+    // convert from file position to reference time
+    m_tDuration = llTotal * UNITS;
+    m_tStop = m_tDuration;
+    m_tStart = 0;
+
+    m_bSync = bSync;    // TRUE selects the SyncReadAligned loop in Process()
+
+    return S_OK;
+}
+
+// disconnect any connection made in Connect; always returns S_OK
+HRESULT
+CPullPin::Disconnect()
+{
+    CAutoLock lock(&m_AccessLock);
+    // the worker thread uses the reader and allocator, so it is stopped first
+    StopThread();
+
+    if (m_pReader) {
+	m_pReader->Release();
+	m_pReader = NULL;
+    }
+
+    if (m_pAlloc) {
+	m_pAlloc->Release();
+	m_pAlloc = NULL;
+    }
+
+    return S_OK;
+}
+
+// Agree an allocator with the upstream reader via RequestAllocator.
+// pProps carries the caller's requirements; when it is NULL a default
+// request of 3 buffers of 64K (no alignment, no prefix) is made instead.
+// pAlloc is only offered as a preferred allocator.  The reader's HRESULT
+// is returned and the resulting allocator is stored in m_pAlloc.
+HRESULT
+CPullPin::DecideAllocator(
+    IMemAllocator * pAlloc,
+    ALLOCATOR_PROPERTIES * pProps)
+{
+    // fallback requirements - only consulted when the caller gave none
+    ALLOCATOR_PROPERTIES Defaults;
+    Defaults.cBuffers = 3;
+    Defaults.cbBuffer = 64*1024;
+    Defaults.cbAlign = 0;
+    Defaults.cbPrefix = 0;
+
+    // the caller's request wins whenever there is one
+    ALLOCATOR_PROPERTIES * const pWanted = (pProps != NULL) ? pProps : &Defaults;
+
+    // the reader decides; whatever it hands back is kept in m_pAlloc
+    return m_pReader->RequestAllocator(
+        pAlloc,
+        pWanted,
+        &m_pAlloc);
+
+}
+
+// start pulling data - the worker thread must not exist yet (asserted)
+HRESULT
+CPullPin::Active(void)
+{
+    ASSERT(!ThreadExists());
+    return StartThread();   // commits the allocator, creates the thread and sends TM_Start
+}
+
+// stop pulling data
+HRESULT
+CPullPin::Inactive(void)
+{
+    StopThread();
+    // StopThread's result is not propagated; S_OK is always returned
+    return S_OK;
+}
+
+HRESULT
+CPullPin::Seek(REFERENCE_TIME tStart, REFERENCE_TIME tStop)
+{
+    CAutoLock lock(&m_AccessLock);
+    // Change the range to pull; if we were pulling, restart at the new position
+    ThreadMsg AtStart = m_State;
+    // when running: flush downstream and park the worker before touching the range
+    if (AtStart == TM_Start) {
+	BeginFlush();
+	PauseThread();
+	EndFlush();
+    }
+    // Process() reads these the next time it is started
+    m_tStart = tStart;
+    m_tStop = tStop;
+
+    HRESULT hr = S_OK;
+    if (AtStart == TM_Start) {
+	hr = StartThread();
+    }
+
+    return hr;
+}
+
+HRESULT
+CPullPin::Duration(REFERENCE_TIME* ptDuration)
+{
+    *ptDuration = m_tDuration;   // assigned in Connect() from the reader's length; ptDuration is not NULL-checked
+    return S_OK;
+}
+
+
+HRESULT
+CPullPin::StartThread()
+{
+    CAutoLock lock(&m_AccessLock);
+    // Start (or resume) pulling; E_UNEXPECTED unless a reader and an allocator are in place
+    if (!m_pAlloc || !m_pReader) {
+	return E_UNEXPECTED;
+    }
+
+    HRESULT hr;
+    if (!ThreadExists()) {
+
+	// commit allocator
+	hr = m_pAlloc->Commit();
+	if (FAILED(hr)) {
+	    return hr;
+	}
+
+	// start thread
+	if (!Create()) {
+	    return E_FAIL;    // NOTE: the allocator stays committed on this path
+	}
+    }
+    // tell the worker to start pulling and hand back its reply
+    m_State = TM_Start;
+    hr = (HRESULT) CallWorker(m_State);
+    return hr;
+}
+
+HRESULT
+CPullPin::PauseThread()
+{
+    CAutoLock lock(&m_AccessLock);
+    // Park the worker: it stops pulling but the thread is kept (E_UNEXPECTED if there is none)
+    if (!ThreadExists()) {
+	return E_UNEXPECTED;
+    }
+
+    // need to flush to ensure the thread is not blocked
+    // in WaitForNext
+    HRESULT hr = m_pReader->BeginFlush();
+    if (FAILED(hr)) {
+	return hr;
+    }
+
+    m_State = TM_Pause;
+    hr = CallWorker(TM_Pause);
+    // the flush was only there to unblock the worker; end it before returning
+    m_pReader->EndFlush();
+    return hr;
+}
+
+HRESULT
+CPullPin::StopThread()
+{
+    CAutoLock lock(&m_AccessLock);
+    // Shut the worker down: S_FALSE if there was no thread, S_OK once it has been closed
+    if (!ThreadExists()) {
+	return S_FALSE;
+    }
+
+    // need to flush to ensure the thread is not blocked
+    // in WaitForNext
+    HRESULT hr = m_pReader->BeginFlush();
+    if (FAILED(hr)) {
+	return hr;    // NOTE: the thread is left running in this case
+    }
+
+    m_State = TM_Exit;
+    hr = CallWorker(TM_Exit);    // the reply is not used; S_OK is returned below
+
+    m_pReader->EndFlush();
+
+    // wait for thread to completely exit
+    Close();
+
+    // decommit allocator
+    if (m_pAlloc) {
+	m_pAlloc->Decommit();
+    }
+
+    return S_OK;
+}
+
+
+DWORD
+CPullPin::ThreadProc(void)
+{
+    while(1) {    // one iteration per TM_* command sent through CallWorker
+	DWORD cmd = GetRequest();
+	switch(cmd) {
+	case TM_Exit:
+	    Reply(S_OK);
+	    return 0;
+
+	case TM_Pause:
+	    // we are paused already
+	    Reply(S_OK);
+	    break;
+
+	case TM_Start:
+	    Reply(S_OK);    // reply first; Process() then runs until done or a new request arrives
+	    Process();
+	    break;
+	}
+
+	// at this point, there should be no outstanding requests on the
+	// upstream filter.
+	// We should force begin/endflush to ensure that this is true.
+	// !!!Note that we may currently be inside a BeginFlush/EndFlush pair
+	// on another thread, but the premature EndFlush will do no harm now
+	// that we are idle.
+	m_pReader->BeginFlush();
+	CleanupCancelled();
+	m_pReader->EndFlush();
+    }
+}
+
+HRESULT
+CPullPin::QueueSample(
+    REFERENCE_TIME& tCurrent,     // in: start of this read; out: start of the next one
+    REFERENCE_TIME tAlignStop,    // upper bound for the end of the read
+    BOOL bDiscontinuity           // flag stamped on the sample
+    )
+{
+    IMediaSample* pSample;
+    // Get a buffer, stamp it with the next range to read and issue an async request for it
+    HRESULT hr = m_pAlloc->GetBuffer(&pSample, NULL, NULL, 0);
+    if (FAILED(hr)) {
+	return hr;
+    }
+    // the sample covers GetSize() bytes from tCurrent, clipped to tAlignStop
+    LONGLONG tStopThis = tCurrent + (pSample->GetSize() * UNITS);
+    if (tStopThis > tAlignStop) {
+	tStopThis = tAlignStop;
+    }
+    pSample->SetTime(&tCurrent, &tStopThis);
+    tCurrent = tStopThis;
+
+    pSample->SetDiscontinuity(bDiscontinuity);
+
+    hr = m_pReader->Request(
+			pSample,
+			0);
+    if (FAILED(hr)) {
+	pSample->Release();
+	// collect anything that is still queued, then report the failure
+	CleanupCancelled();
+	OnError(hr);
+    }
+    return hr;
+}
+
+HRESULT
+CPullPin::CollectAndDeliver(
+    REFERENCE_TIME tStart,    // aligned start, used by DeliverSample to rebase the times
+    REFERENCE_TIME tStop)     // stop position that sample end times are clipped to
+{
+    IMediaSample* pSample = NULL;   // better be sure pSample is set
+    DWORD_PTR dwUnused;
+    HRESULT hr = m_pReader->WaitForNext(
+			INFINITE,
+			&pSample,
+			&dwUnused);
+    if (FAILED(hr)) {
+	if (pSample) {    // a sample handed back with a failure still has to be released
+	    pSample->Release();
+	}
+    } else {
+	hr = DeliverSample(pSample, tStart, tStop);
+    }
+    if (FAILED(hr)) {    // covers both a failed wait and a failed delivery
+	CleanupCancelled();
+	OnError(hr);
+    }
+    return hr;    // may be a success code other than S_OK coming from Receive
+
+}
+
+HRESULT
+CPullPin::DeliverSample(
+    IMediaSample* pSample,    // released here before returning
+    REFERENCE_TIME tStart,    // (aligned) start the sample times are made relative to
+    REFERENCE_TIME tStop      // the sample's end time is clipped to this
+    )
+{
+    // fix up sample if past actual stop (for sector alignment)
+    REFERENCE_TIME t1, t2;
+    pSample->GetTime(&t1, &t2);
+    if (t2 > tStop) {
+	t2 = tStop;
+    }
+
+    // adjust times to be relative to (aligned) start time
+    t1 -= tStart;
+    t2 -= tStart;
+    pSample->SetTime(&t1, &t2);
+
+    // hand the sample to the derived class; our reference is dropped whatever it returns
+    HRESULT hr = Receive(pSample);
+    pSample->Release();
+    return hr;
+}
+
+// Worker-thread pull loop for one TM_Start: reads m_tStart..m_tStop (clipped to
+// the duration, widened to the allocator's alignment), delivers each sample and
+// calls EndOfStream() unless an error, a stop or a new request ends it early.
+void
+CPullPin::Process(void)
+{
+    // is there anything to do?
+    if (m_tStop <= m_tStart) {
+	EndOfStream();
+	return;
+    }
+
+    BOOL bDiscontinuity = TRUE;
+
+    // if there is more than one sample at the allocator,
+    // then try to queue 2 at once in order to overlap.
+    // -- get buffer count and required alignment
+    ALLOCATOR_PROPERTIES Actual;
+    HRESULT hr = m_pAlloc->GetProperties(&Actual);
+    if (FAILED(hr)) {
+	// Actual may not have been filled in, so it must not be used
+	OnError(hr);
+	return;
+    }
+
+    // align the start position downwards
+    REFERENCE_TIME tStart = AlignDown(m_tStart / UNITS, Actual.cbAlign) * UNITS;
+    REFERENCE_TIME tCurrent = tStart;
+
+    REFERENCE_TIME tStop = m_tStop;
+    if (tStop > m_tDuration) {
+	tStop = m_tDuration;
+    }
+
+    // align the stop position - may be past stop, but that
+    // doesn't matter
+    REFERENCE_TIME tAlignStop = AlignUp(tStop / UNITS, Actual.cbAlign) * UNITS;
+
+    DWORD dwRequest;
+    if (!m_bSync) {
+	//  Break out of the loop either if we get to the end or we're asked
+	//  to do something else
+	while (tCurrent < tAlignStop) {
+
+	    // Break out without calling EndOfStream if we're asked to
+	    // do something different
+	    if (CheckRequest(&dwRequest)) {
+		return;
+	    }
+
+	    // queue a first sample
+	    if (Actual.cBuffers > 1) {
+
+		hr = QueueSample(tCurrent, tAlignStop, TRUE);
+		bDiscontinuity = FALSE;
+
+		if (FAILED(hr)) {
+		    return;
+		}
+	    }
+
+	    // loop queueing second and waiting for first..
+	    while (tCurrent < tAlignStop) {
+
+		hr = QueueSample(tCurrent, tAlignStop, bDiscontinuity);
+		bDiscontinuity = FALSE;
+
+		if (FAILED(hr)) {
+		    return;
+		}
+
+		hr = CollectAndDeliver(tStart, tStop);
+		if (S_OK != hr) {
+
+		    // stop if error, or if downstream filter said
+		    // to stop.
+		    return;
+		}
+	    }
+
+	    if (Actual.cBuffers > 1) {
+		hr = CollectAndDeliver(tStart, tStop);
+		if (FAILED(hr)) {
+		    return;
+		}
+	    }
+	}
+    } else {
+	// sync version of above loop
+	while (tCurrent < tAlignStop) {
+
+	    // Break out without calling EndOfStream if we're asked to
+	    // do something different
+	    if (CheckRequest(&dwRequest)) {
+		return;
+	    }
+
+	    IMediaSample* pSample;
+	    hr = m_pAlloc->GetBuffer(&pSample, NULL, NULL, 0);
+	    if (FAILED(hr)) {
+		OnError(hr);
+		return;
+	    }
+
+	    LONGLONG tStopThis = tCurrent + (pSample->GetSize() * UNITS);
+	    if (tStopThis > tAlignStop) {
+		tStopThis = tAlignStop;
+	    }
+	    pSample->SetTime(&tCurrent, &tStopThis);
+	    tCurrent = tStopThis;
+
+	    if (bDiscontinuity) {
+		pSample->SetDiscontinuity(TRUE);
+		bDiscontinuity = FALSE;
+	    }
+
+	    hr = m_pReader->SyncReadAligned(pSample);
+	    if (FAILED(hr)) {
+		pSample->Release();
+		OnError(hr);
+		return;
+	    }
+
+	    hr = DeliverSample(pSample, tStart, tStop);
+	    if (hr != S_OK) {
+		if (FAILED(hr)) {
+		    OnError(hr);
+		}
+		return;
+	    }
+	}
+    }
+
+    EndOfStream();
+}
+
+// after a flush, cancelled i/o will be waiting for collection
+// and release
+void
+CPullPin::CleanupCancelled(void)
+{
+    while (1) {
+	IMediaSample * pSample = NULL;  // WaitForNext may fail without touching it
+	DWORD_PTR dwUnused;
+	// the HRESULT is not needed: a NULL sample is what ends the loop
+	m_pReader->WaitForNext(
+			    0,          // no wait
+			    &pSample,
+			    &dwUnused);
+	if(pSample) {
+	    pSample->Release();
+	} else {
+	    // no more samples
+	    return;
+	}
+    }
+}
diff --git a/plugins/GSdx_legacy/baseclasses/pullpin.h b/plugins/GSdx_legacy/baseclasses/pullpin.h
new file mode 100644
index 0000000000..654670138c
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/pullpin.h
@@ -0,0 +1,152 @@
+//------------------------------------------------------------------------------
+// File: PullPin.h
+//
+// Desc: DirectShow base classes - defines CPullPin class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __PULLPIN_H__
+#define __PULLPIN_H__
+
+//
+// CPullPin
+//
+// object supporting pulling data from an IAsyncReader interface.
+// Given a start/stop position, calls a pure Receive method with each
+// IMediaSample received.
+//
+// This is essentially for use in a MemInputPin when it finds itself
+// connected to an IAsyncReader pin instead of a pushing pin.
+//
+
+class CPullPin : public CAMThread
+{
+    IAsyncReader*       m_pReader;      // upstream reader, NULL while disconnected
+    REFERENCE_TIME      m_tStart;       // start of the range to pull
+    REFERENCE_TIME      m_tStop;        // end of the range to pull
+    REFERENCE_TIME      m_tDuration;    // total length reported by the reader, times UNITS
+    BOOL                m_bSync;        // TRUE: use sync reads instead of the async methods
+
+    enum ThreadMsg {
+	TM_Pause,       // stop pulling and wait for next message
+	TM_Start,       // start pulling
+	TM_Exit,        // stop and exit
+    };
+
+    ThreadMsg m_State;                  // last message sent to the worker thread
+
+    // override pure thread proc from CAMThread
+    DWORD ThreadProc(void);
+
+    // running pull method (check m_bSync)
+    void Process(void);
+
+    // clean up any cancelled i/o after a flush
+    void CleanupCancelled(void);
+
+    // suspend thread from pulling, eg during seek
+    HRESULT PauseThread();
+
+    // start thread pulling - create thread if necy
+    HRESULT StartThread();
+
+    // stop and close thread
+    HRESULT StopThread();
+
+    // called from Process (async path) to queue and collect requests
+    HRESULT QueueSample(
+		REFERENCE_TIME& tCurrent,
+		REFERENCE_TIME tAlignStop,
+		BOOL bDiscontinuity);
+
+    HRESULT CollectAndDeliver(
+		REFERENCE_TIME tStart,
+		REFERENCE_TIME tStop);
+
+    HRESULT DeliverSample(
+		IMediaSample* pSample,
+		REFERENCE_TIME tStart,
+		REFERENCE_TIME tStop);
+
+protected:
+    IMemAllocator *     m_pAlloc;       // allocator agreed in DecideAllocator, NULL while disconnected
+
+public:
+    CPullPin();
+    virtual ~CPullPin();
+
+    // returns S_OK if successfully connected to an IAsyncReader interface
+    // from this object
+    // Optional allocator should be proposed as a preferred allocator if
+    // necessary
+    // bSync is TRUE if we are to use sync reads instead of the
+    // async methods.
+    HRESULT Connect(IUnknown* pUnk, IMemAllocator* pAlloc, BOOL bSync);
+
+    // disconnect any connection made in Connect
+    HRESULT Disconnect();
+
+    // agree an allocator using RequestAllocator - optional
+    // props param specifies your requirements (non-zero fields).
+    // returns an error code if fail to match requirements.
+    // optional IMemAllocator interface is offered as a preferred allocator
+    // but no error occurs if it can't be met.
+    virtual HRESULT DecideAllocator(
+		IMemAllocator* pAlloc,
+		ALLOCATOR_PROPERTIES * pProps);
+
+    // set start and stop position. if active, will start immediately at
+    // the new position. Default is 0 to duration
+    HRESULT Seek(REFERENCE_TIME tStart, REFERENCE_TIME tStop);
+
+    // return the total duration
+    HRESULT Duration(REFERENCE_TIME* ptDuration);
+
+    // start pulling data
+    HRESULT Active(void);
+
+    // stop pulling data
+    HRESULT Inactive(void);
+
+    // helper functions (the mask only aligns correctly when lAlign is a power of two)
+    LONGLONG AlignDown(LONGLONG ll, LONG lAlign) {
+	// aligning downwards is just truncation; an lAlign of 0 masks everything to 0
+	return ll & ~(lAlign-1);
+    };
+
+    LONGLONG AlignUp(LONGLONG ll, LONG lAlign) {
+	// align up: round up to next boundary (unchanged if already on one)
+	return (ll + (lAlign -1)) & ~(lAlign -1);
+    };
+
+    // GetReader returns the (addrefed) IAsyncReader interface
+    // for SyncRead etc
+    IAsyncReader* GetReader() {
+	m_pReader->AddRef();    // m_pReader is not NULL-checked here
+	return m_pReader;
+    };
+
+    // -- pure --
+
+    // override this to handle data arrival
+    // return value other than S_OK will stop data
+    virtual HRESULT Receive(IMediaSample*) PURE;
+
+    // override this to handle end-of-stream
+    virtual HRESULT EndOfStream(void) PURE;
+
+    // called on runtime errors that will have caused pulling
+    // to stop
+    // these errors are all returned from the upstream filter, who
+    // will have already reported any errors to the filtergraph.
+    virtual void OnError(HRESULT hr) PURE;
+
+    // flush this pin and all downstream
+    virtual HRESULT BeginFlush() PURE;
+    virtual HRESULT EndFlush() PURE;
+
+};
+
+#endif //__PULLPIN_H__
diff --git a/plugins/GSdx_legacy/baseclasses/refclock.cpp b/plugins/GSdx_legacy/baseclasses/refclock.cpp
new file mode 100644
index 0000000000..636351a806
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/refclock.cpp
@@ -0,0 +1,340 @@
+//------------------------------------------------------------------------------
+// File: RefClock.cpp
+//
+// Desc: DirectShow base classes - implements the IReferenceClock interface.
+//
+// Copyright (c)  Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include <limits.h>
+
+
+
+// 'this' used in constructor list
+#pragma warning(disable:4355)
+
+
+// Hands out IReferenceClock from this object and passes every other
+// interface request on to CUnknown.  The HRESULT of whichever call
+// handled the request is returned.
+STDMETHODIMP CBaseReferenceClock::NonDelegatingQueryInterface(
+    REFIID riid,
+    void ** ppv)
+{
+    // IReferenceClock is the only interface handled at this level
+    if (riid == IID_IReferenceClock)
+    {
+        return GetInterface((IReferenceClock *) this, ppv);
+    }
+
+    // anything else is the base class's business
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+CBaseReferenceClock::~CBaseReferenceClock()
+{
+    // undo the timeBeginPeriod made in the constructor
+    if (m_TimerResolution) timeEndPeriod(m_TimerResolution);
+    // NOTE(review): the constructor has already deleted m_pSchedule if it failed to start its thread - confirm
+    m_pSchedule->DumpLinkedList();
+    // a thread only exists when the constructor also created the schedule and its event
+    if (m_hThread)
+    {
+        m_bAbort = TRUE;
+        TriggerThread();
+        WaitForSingleObject( m_hThread, INFINITE );
+        EXECUTE_ASSERT( CloseHandle(m_hThread) );
+        m_hThread = 0;
+        EXECUTE_ASSERT( CloseHandle(m_pSchedule->GetEvent()) );
+	delete m_pSchedule;
+    }
+}
+
+// A derived class may supply its own schedule (pShed) if it has its own thread that will take care
+// of calling the scheduler's Advise method; no thread is created here in that case.  (Refer to
+// CBaseReferenceClock::AdviseThread() to see what such a thread has to do.)
+CBaseReferenceClock::CBaseReferenceClock( TCHAR *pName, LPUNKNOWN pUnk, HRESULT *phr, CAMSchedule * pShed )
+: CUnknown( pName, pUnk )
+, m_rtLastGotTime(0)
+, m_TimerResolution(0)
+, m_bAbort( FALSE )
+, m_pSchedule( pShed ? pShed : new CAMSchedule(CreateEvent(NULL, FALSE, FALSE, NULL)) )
+, m_hThread(0)
+{
+    // NOTE(review): a plain new reports failure by throwing, so this NULL test may never fire - confirm
+
+    ASSERT(m_pSchedule);
+    if (!m_pSchedule)
+    {
+	*phr = E_OUTOFMEMORY;
+    }
+    else
+    {
+	// Set up the highest resolution timer we can manage
+	TIMECAPS tc;
+	m_TimerResolution = (TIMERR_NOERROR == timeGetDevCaps(&tc, sizeof(tc)))
+			    ? tc.wPeriodMin
+			    : 1;
+
+	timeBeginPeriod(m_TimerResolution);
+
+	/* Initialise our system times - the derived clock should set the right values */
+	m_dwPrevSystemTime = timeGetTime();
+	m_rtPrivateTime = (UNITS / MILLISECONDS) * m_dwPrevSystemTime;
+
+	#ifdef PERF
+	    m_idGetSystemTime = MSR_REGISTER(TEXT("CBaseReferenceClock::GetTime"));
+	#endif
+
+	if ( !pShed )
+	{
+	    DWORD ThreadID;
+	    m_hThread = ::CreateThread(NULL,                  // Security attributes
+				       (DWORD) 0,             // Initial stack size
+				       AdviseThreadFunction,  // Thread start address
+				       (LPVOID) this,         // Thread parameter
+				       (DWORD) 0,             // Creation flags
+				       &ThreadID);            // Thread identifier
+
+	    if (m_hThread)
+	    {
+		SetThreadPriority( m_hThread, THREAD_PRIORITY_TIME_CRITICAL );
+	    }
+	    else
+	    {
+		*phr = E_FAIL;
+		EXECUTE_ASSERT( CloseHandle(m_pSchedule->GetEvent()) );
+		delete m_pSchedule;    // NOTE(review): pointer is left dangling; the destructor still dereferences it - confirm
+	    }
+	}
+    }
+}
+
+void CBaseReferenceClock::Restart (IN REFERENCE_TIME rtMinTime)
+{
+    // hold the object lock for the duration of the update
+    CAutoLock cObjectLock(this);
+    m_rtLastGotTime = rtMinTime ;
+}
+
+STDMETHODIMP CBaseReferenceClock::GetTime(REFERENCE_TIME *pTime)
+{
+    // nowhere to put the answer
+    if (!pTime)
+    {
+        return E_POINTER;
+    }
+
+    HRESULT hrResult = S_FALSE;     // S_FALSE: no later than the time handed out before
+
+    Lock();
+    const REFERENCE_TIME rtCurrent = GetPrivateTime();
+    if (rtCurrent > m_rtLastGotTime)
+    {
+        // the reported time only ever moves forwards
+        m_rtLastGotTime = rtCurrent;
+        hrResult = S_OK;
+    }
+    *pTime = m_rtLastGotTime;
+    Unlock();
+
+    // PERF instrumentation of the value handed back
+    MSR_INTEGER(m_idGetSystemTime, LONG((*pTime) / (UNITS/MILLISECONDS)) );
+    return hrResult;
+}
+
+/* Ask for an async notification that a time has elapsed */
+
+STDMETHODIMP CBaseReferenceClock::AdviseTime(
+    REFERENCE_TIME baseTime,         // base reference time
+    REFERENCE_TIME streamTime,       // stream offset time
+    HEVENT hEvent,                  // advise via this event
+    DWORD_PTR *pdwAdviseCookie)         // where your cookie goes
+{
+    CheckPointer(pdwAdviseCookie, E_POINTER);
+    *pdwAdviseCookie = 0;
+
+    // Check that the event is not already set
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject(HANDLE(hEvent),0));
+
+    // the advise is made for the sum of the two times; that sum
+    // has to be positive and must not be MAX_TIME
+    const REFERENCE_TIME rtWhen = baseTime + streamTime;
+    if ( rtWhen <= 0 || rtWhen == MAX_TIME )
+    {
+        return E_INVALIDARG;
+    }
+
+    // the period argument is 0 here;
+    // a zero cookie from the schedule is reported as E_OUTOFMEMORY
+    *pdwAdviseCookie = m_pSchedule->AddAdvisePacket( rtWhen, 0, HANDLE(hEvent), FALSE );
+    return *pdwAdviseCookie ? NOERROR : E_OUTOFMEMORY;
+
+}
+
+
+/* Ask for an asynchronous periodic notification that a time has elapsed */
+
+STDMETHODIMP CBaseReferenceClock::AdvisePeriodic(
+    REFERENCE_TIME StartTime,         // starting at this time
+    REFERENCE_TIME PeriodTime,        // time between notifications
+    HSEMAPHORE hSemaphore,           // advise via a semaphore
+    DWORD_PTR *pdwAdviseCookie)          // where your cookie goes
+{
+    CheckPointer(pdwAdviseCookie, E_POINTER);
+    *pdwAdviseCookie = 0;
+
+    // both times must be positive and the start must not be MAX_TIME
+    if (StartTime <= 0 || PeriodTime <= 0 || StartTime == MAX_TIME )
+    {
+        return E_INVALIDARG;
+    }
+
+    // a zero cookie from the schedule is reported as E_OUTOFMEMORY
+    *pdwAdviseCookie = m_pSchedule->AddAdvisePacket( StartTime, PeriodTime, HANDLE(hSemaphore), TRUE );
+    return *pdwAdviseCookie ? NOERROR : E_OUTOFMEMORY;
+}
+
+
+STDMETHODIMP CBaseReferenceClock::Unadvise(DWORD_PTR dwAdviseCookie)
+{
+    return m_pSchedule->Unadvise(dwAdviseCookie);   // forwarded to the schedule; its result is returned unchanged
+}
+
+
+REFERENCE_TIME CBaseReferenceClock::GetPrivateTime()
+{
+    CAutoLock cObjectLock(this);
+
+    /* The system time is a DWORD that can wrap, in which case the current
+     * value is less than the one recorded last time.  Taking the elapsed
+     * time as a DWORD difference adds on the extra milliseconds.
+     *
+     * The time period is long enough so that the likelihood of
+     * successive calls spanning the clock cycle is not considered.
+     */
+    const DWORD dwNow = timeGetTime();
+    const DWORD dwElapsed = dwNow - m_dwPrevSystemTime;
+
+    // advance the private clock by the elapsed milliseconds
+    m_rtPrivateTime += Int32x32To64(UNITS / MILLISECONDS, dwElapsed);
+    m_dwPrevSystemTime = dwNow;
+
+    return m_rtPrivateTime;
+}
+
+
+/* Adjust the current time by the input value.  This allows an
+   external time source to work out some of the latency of the clock
+   system and adjust the "current" time accordingly.  The intent is
+   that the time returned to the user is synchronised to a clock
+   source and allows drift to be catered for.
+
+   For example: if the clock source detects a drift it can pass a delta
+   to the current time rather than having to set an explicit time.
+*/
+
+STDMETHODIMP CBaseReferenceClock::SetTimeDelta(const REFERENCE_TIME & TimeDelta)
+{
+#ifdef DEBUG
+    // Debug builds only: sanity-check and log the size of the adjustment
+    // Just log (the break is commented out) if passed an improper time delta value
+    LONGLONG llDelta = TimeDelta > 0 ? TimeDelta : -TimeDelta;
+    if (llDelta > UNITS * 1000) {
+        DbgLog((LOG_TRACE, 0, TEXT("Bad Time Delta")));
+        //DebugBreak();
+    }
+
+    // We're going to calculate a "severity" for the time change. Max -1
+    // min 8.  We'll then use this as the debug logging level for a
+    // debug log message.
+    const LONG usDelta = LONG(TimeDelta/10);      // Delta in micro-secs
+
+    DWORD delta        = abs(usDelta);            // varying delta
+    // Severity == 8 - ceil(log<base 8>(abs( micro-secs delta)))
+    int   Severity     = 8;
+    while ( delta > 0 )
+    {
+        delta >>= 3;                              // div 8
+        Severity--;
+    }
+
+    // Sev == 0 => > 2 second delta!
+    DbgLog((LOG_TIMING, Severity < 0 ? 0 : Severity,
+        TEXT("Sev %2i: CSystemClock::SetTimeDelta(%8ld us) %lu -> %lu ms."),
+        Severity, usDelta, DWORD(ConvertToMilliseconds(m_rtPrivateTime)),
+        DWORD(ConvertToMilliseconds(TimeDelta+m_rtPrivateTime)) ));
+
+    // Don't want the DbgBreak to fire when running stress on debug-builds.
+    #ifdef BREAK_ON_SEVERE_TIME_DELTA
+        if (Severity < 0)
+            DbgBreakPoint(TEXT("SetTimeDelta > 16 seconds!"),
+                          TEXT(__FILE__),__LINE__);
+    #endif
+
+#endif
+    // The adjustment itself: shift the private time under the object lock
+    CAutoLock cObjectLock(this);
+    m_rtPrivateTime += TimeDelta;
+    // If time goes forwards, and we have advises, then we need to
+    // trigger the thread so that it can re-evaluate its wait time.
+    // Since we don't want the cost of the thread switches if the change
+    // is really small, only do it if clock goes forward by more than
+    // 0.5 millisecond.  If the time goes backwards, the thread will
+    // wake up "early" (relatively speaking) and will re-evaluate at
+    // that time.
+    if ( TimeDelta > 5000 && m_pSchedule->GetAdviseCount() > 0 ) TriggerThread();
+    return NOERROR;
+}
+
+// Thread stuff
+// Entry point given to CreateThread by the constructor; p is the clock's this pointer
+DWORD __stdcall CBaseReferenceClock::AdviseThreadFunction(LPVOID p)
+{
+    return DWORD(reinterpret_cast<CBaseReferenceClock*>(p)->AdviseThread());
+}
+
+HRESULT CBaseReferenceClock::AdviseThread()
+{
+    DWORD dwWait = INFINITE;
+    // Body of the advise thread: loops until m_bAbort is set, then returns NOERROR
+    // The first thing we do is wait until something interesting happens
+    // (meaning a first advise or shutdown).  This prevents us calling
+    // GetPrivateTime immediately which is goodness as that is a virtual
+    // routine and the derived class may not yet be constructed.  (This
+    // thread is created in the base class constructor.)
+
+    while ( !m_bAbort )
+    {
+        // Wait for an interesting event to happen
+        DbgLog((LOG_TIMING, 3, TEXT("CBaseRefClock::AdviseThread() Delay: %lu ms"), dwWait ));
+        WaitForSingleObject(m_pSchedule->GetEvent(), dwWait);
+        if (m_bAbort) break;
+
+        // There are several reasons why we need to work from the internal
+        // time, mainly to do with what happens when time goes backwards.
+        // Mainly, it stops us looping madly if an event is just about to
+        // expire when the clock goes backward (i.e. GetTime stops for a
+        // while).
+        const REFERENCE_TIME  rtNow = GetPrivateTime();
+        // %lu takes a 32 bit value, so the LONGLONG is narrowed explicitly
+        DbgLog((LOG_TIMING, 3,
+              TEXT("CBaseRefClock::AdviseThread() Woke at = %lu ms"),
+              DWORD(ConvertToMilliseconds(rtNow)) ));
+
+        // We must add in a millisecond, since this is the resolution of our
+        // WaitForSingleObject timer.  Failure to do so will cause us to loop
+        // frantically for (approx) a millisecond.
+        m_rtNextAdvise = m_pSchedule->Advise( 10000 + rtNow );
+        LONGLONG llWait = m_rtNextAdvise - rtNow;
+
+        ASSERT( llWait > 0 );
+
+        llWait = ConvertToMilliseconds(llWait);
+        // DON'T replace this with a max!! (The type's of these things is VERY important)
+        dwWait = (llWait > REFERENCE_TIME(UINT_MAX)) ? UINT_MAX : DWORD(llWait);
+    };
+    return NOERROR;
+}
diff --git a/plugins/GSdx_legacy/baseclasses/refclock.h b/plugins/GSdx_legacy/baseclasses/refclock.h
new file mode 100644
index 0000000000..a47ae7b033
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/refclock.h
@@ -0,0 +1,171 @@
+//------------------------------------------------------------------------------
+// File: RefClock.h
+//
+// Desc: DirectShow base classes - defines the IReferenceClock interface.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __BASEREFCLOCK__
+#define __BASEREFCLOCK__
+
+#include "dsschedule.h"
+
+const UINT RESOLUTION = 1;                      /* High resolution timer */
+const INT ADVISE_CACHE = 4;                     /* Default cache size */
+const LONGLONG MAX_TIME = 0x7FFFFFFFFFFFFFFF;   /* Maximum LONGLONG value */
+
+inline LONGLONG WINAPI ConvertToMilliseconds(const REFERENCE_TIME& RT)
+{
+    /* Scale a reference time down to whole milliseconds so that it can be
+       handed to system calls.  UNITS / MILLISECONDS is the number of
+       reference time units in one millisecond; the division is an integer
+       division, so any fractional millisecond is discarded. */
+
+    const LONGLONG llUnitsPerMillisecond = UNITS / MILLISECONDS;
+    return RT / llUnitsPerMillisecond;
+}
+
+/* This class hierarchy will support an IReferenceClock interface so
+   that an audio card (or other externally driven clock) can update the
+   system wide clock that everyone uses.
+
+   The interface will be pretty thin with probably just one update method
+   This interface has not yet been defined.
+ */
+
+/* This abstract base class implements the IReferenceClock
+ * interface.  Classes that actually provide clock signals (from
+ * whatever source) have to be derived from this class.
+ *
+ * The abstract class provides implementations for:
+ * 	CUnknown support
+ *      locking support (CCritSec)
+ *	client advise code (creates a thread)
+ *
+ * Question: what can we do about quality?  Change the timer
+ * resolution to lower the system load?  Up the priority of the
+ * timer thread to force more responsive signals?
+ *
+ * During class construction we create a worker thread that is destroyed during
+ * destruction.  This thread executes a series of WaitForSingleObject calls,
+ * waking up when a command is given to the thread or the next wake up point
+ * is reached.  The wakeup points are determined by clients making Advise
+ * calls.
+ *
+ * Each advise call defines a point in time when they wish to be notified.  A
+ * periodic advise is a series of these such events.  We maintain a list of
+ * advise links and calculate when the nearest event notification is due for.
+ * We then call WaitForSingleObject with a timeout equal to this time.  The
+ * handle we wait on is used by the class to signal that something has changed
+ * and that we must reschedule the next event.  This typically happens when
+ * someone comes in and asks for an advise link while we are waiting for an
+ * event to timeout.
+ *
+ * While we are modifying the list of advise requests we
+ * are protected from interference through a critical section.  Clients are NOT
+ * advised through callbacks.  One shot clients have an event set, while
+ * periodic clients have a semaphore released for each event notification.  A
+ * semaphore allows a client to be kept up to date with the number of events
+ * actually triggered and be assured that they can't miss multiple events being
+ * set.
+ *
+ * Keeping track of advises is taken care of by the CAMSchedule class.
+ */
+
+// Abstract base for reference clocks.  It combines CUnknown (COM identity),
+// IReferenceClock (the public clock interface) and CCritSec (so the object
+// itself can be used as a lock).  Advise bookkeeping is delegated to the
+// CAMSchedule object held in m_pSchedule.
+class CBaseReferenceClock
+: public CUnknown, public IReferenceClock, public CCritSec
+{
+protected:
+    virtual ~CBaseReferenceClock();     // Don't let me be created on the stack!
+public:
+    // pSched defaults to 0; what the constructor does in that case is not
+    // visible in this header - see the implementation file.
+    CBaseReferenceClock(TCHAR *pName, LPUNKNOWN pUnk, HRESULT *phr, CAMSchedule * pSched = 0 );
+
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid,void ** ppv);
+
+    DECLARE_IUNKNOWN
+
+    /* IReferenceClock methods */
+    // Derived classes must implement GetPrivateTime().  All our GetTime
+    // does is call GetPrivateTime and then check so that time does not
+    // go backwards.  A return code of S_FALSE implies that the internal
+    // clock has gone backwards and GetTime time has halted until internal
+    // time has caught up. (Don't know if this will be much use to folk,
+    // but it seems odd not to use the return code for something useful.)
+    STDMETHODIMP GetTime(REFERENCE_TIME *pTime);
+    // When this is called, it sets m_rtLastGotTime to the time it returns.
+
+    /* Provide standard mechanisms for scheduling events */
+
+    /* Ask for an async notification that a time has elapsed */
+    STDMETHODIMP AdviseTime(
+        REFERENCE_TIME baseTime,        // base reference time
+        REFERENCE_TIME streamTime,      // stream offset time
+        HEVENT hEvent,                  // advise via this event
+        DWORD_PTR *pdwAdviseCookie          // where your cookie goes
+    );
+
+    /* Ask for an asynchronous periodic notification that a time has elapsed */
+    STDMETHODIMP AdvisePeriodic(
+        REFERENCE_TIME StartTime,       // starting at this time
+        REFERENCE_TIME PeriodTime,      // time between notifications
+        HSEMAPHORE hSemaphore,          // advise via a semaphore
+        DWORD_PTR *pdwAdviseCookie          // where your cookie goes
+    );
+
+    /* Cancel a request for notification(s) - if the notification was
+     * a one shot timer then this function doesn't need to be called
+     * as the advise is automatically cancelled, however it does no
+     * harm to explicitly cancel a one-shot advise.  It is REQUIRED that
+     * clients call Unadvise to clear a Periodic advise setting.
+     */
+
+    STDMETHODIMP Unadvise(DWORD_PTR dwAdviseCookie);
+
+    /* Methods for the benefit of derived classes or outer objects */
+
+    // GetPrivateTime() is the REAL clock.  GetTime is just a cover for
+    // it.  Derived classes will probably override this method but not
+    // GetTime() itself.
+    // The important point about GetPrivateTime() is it's allowed to go
+    // backwards.  Our GetTime() will keep returning the LastGotTime
+    // until GetPrivateTime() catches up.
+    virtual REFERENCE_TIME GetPrivateTime();
+
+    /* Provide a method for correcting drift */
+    STDMETHODIMP SetTimeDelta( const REFERENCE_TIME& TimeDelta );
+
+    // Accessor for the schedule object that tracks the advise requests.
+    CAMSchedule * GetSchedule() const { return m_pSchedule; }
+
+private:
+    REFERENCE_TIME m_rtPrivateTime;     // Current best estimate of time
+    DWORD          m_dwPrevSystemTime;  // Last value we got from timeGetTime
+    REFERENCE_TIME m_rtLastGotTime;     // Last time returned by GetTime
+    REFERENCE_TIME m_rtNextAdvise;      // Time of next advise
+    UINT           m_TimerResolution;   // NOTE(review): presumably the timer
+                                        // period in use - confirm in the .cpp
+
+#ifdef PERF
+    int m_idGetSystemTime;              // Only present in PERF builds
+#endif
+
+// Thread stuff
+public:
+    // Wakes the advise thread up by setting the schedule's event.  Need to
+    // do this if the time to the next advise needs reevaluating.
+    void TriggerThread()
+    {
+	EXECUTE_ASSERT(SetEvent(m_pSchedule->GetEvent()));
+    }
+
+
+private:
+    BOOL           m_bAbort;            // Flag used for thread shutdown
+    HANDLE         m_hThread;           // Thread handle
+
+    HRESULT AdviseThread();             // Method in which the advise thread runs
+    static DWORD __stdcall AdviseThreadFunction(LPVOID); // Function used to get there
+
+protected:
+    // The pointer itself is const, so it can only be set in the constructor.
+    CAMSchedule * const m_pSchedule;
+
+    // NOTE: the I64 literal suffix on the default argument is a Microsoft
+    // compiler extension.
+    void Restart (IN REFERENCE_TIME rtMinTime = 0I64) ;
+};
+
+#endif
+
diff --git a/plugins/GSdx_legacy/baseclasses/reftime.h b/plugins/GSdx_legacy/baseclasses/reftime.h
new file mode 100644
index 0000000000..2cf70df998
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/reftime.h
@@ -0,0 +1,116 @@
+//------------------------------------------------------------------------------
+// File: RefTime.h
+//
+// Desc: DirectShow base classes - defines CRefTime, a class that manages
+//       reference times.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------------------------
+
+
+//
+// CRefTime
+//
+// Manage reference times.
+// Shares same data layout as REFERENCE_TIME, but adds some (nonvirtual)
+// functions providing simple comparison, conversion and arithmetic.
+//
+// A reference time (at the moment) is a unit of seconds represented in
+// 100ns units as is used in the Win32 FILETIME structure. BUT the time
+// a REFERENCE_TIME represents is NOT the time elapsed since 1/1/1601 it
+// will either be stream time or reference time depending upon context
+//
+// This class provides simple arithmetic operations on reference times
+//
+// keep non-virtual otherwise the data layout will not be the same as
+// REFERENCE_TIME
+
+
+// -----
+// note that you are safe to cast a CRefTime* to a REFERENCE_TIME*, but
+// you will need to do so explicitly
+// -----
+
+
+#ifndef __REFTIME__
+#define __REFTIME__
+
+
+const LONGLONG MILLISECONDS = (1000);            // 10 ^ 3
+const LONGLONG NANOSECONDS = (1000000000);       // 10 ^ 9
+const LONGLONG UNITS = (NANOSECONDS / 100);      // 10 ^ 7
+
+/*  Unfortunately an inline function here generates a call to __allmul
+    - even for constants!
+*/
+#define MILLISECONDS_TO_100NS_UNITS(lMs) \
+    Int32x32To64((lMs), (UNITS / MILLISECONDS))
+
+// Thin value wrapper around a REFERENCE_TIME.  It adds construction from
+// milliseconds, conversion back to REFERENCE_TIME, and simple assignment and
+// compound arithmetic, without changing the data layout.
+class CRefTime
+{
+public:
+
+    // *MUST* be the only data member so that this class is exactly
+    // equivalent to a REFERENCE_TIME.
+    // Also, must be *no virtual functions*
+
+    REFERENCE_TIME m_time;
+
+    // Default to time zero
+    inline CRefTime()
+    {
+        m_time = 0;
+    }
+
+    // Construct from a millisecond count, scaled up to reference time units
+    inline CRefTime(LONG msecs)
+    {
+        m_time = MILLISECONDS_TO_100NS_UNITS(msecs);
+    }
+
+    // Construct from a value that is already in reference time units
+    inline CRefTime(REFERENCE_TIME rt)
+    {
+        m_time = rt;
+    }
+
+    // Implicit conversion back to the raw REFERENCE_TIME value
+    inline operator REFERENCE_TIME() const
+    {
+        return m_time;
+    }
+
+    inline CRefTime& operator=(const CRefTime& rt)
+    {
+        m_time = rt.m_time;
+        return *this;
+    }
+
+    // Assign a raw value that is already in reference time units
+    inline CRefTime& operator=(const LONGLONG ll)
+    {
+        m_time = ll;
+        return *this;
+    }
+
+    // Operate on m_time directly instead of going through the implicit
+    // conversion to REFERENCE_TIME and the LONGLONG assignment operator;
+    // the result is the same.
+    inline CRefTime& operator+=(const CRefTime& rt)
+    {
+        m_time += rt.m_time;
+        return *this;
+    }
+
+    inline CRefTime& operator-=(const CRefTime& rt)
+    {
+        m_time -= rt.m_time;
+        return *this;
+    }
+
+    // Value in whole milliseconds.  The result is narrowed to LONG, so a
+    // very large time will not fit.  const so that it can be called on a
+    // const CRefTime.
+    inline LONG Millisecs(void) const
+    {
+        return (LONG)(m_time / (UNITS / MILLISECONDS));
+    }
+
+    // Raw value in reference time units.  const so that it can be called on
+    // a const CRefTime.
+    inline LONGLONG GetUnits(void) const
+    {
+        return m_time;
+    }
+};
+
+const LONGLONG TimeZero = 0;
+
+#endif /* __REFTIME__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/renbase.cpp b/plugins/GSdx_legacy/baseclasses/renbase.cpp
new file mode 100644
index 0000000000..8eb8a85dfc
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/renbase.cpp
@@ -0,0 +1,2844 @@
+//------------------------------------------------------------------------------
+// File: RenBase.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"        // DirectShow base class definitions
+#include <mmsystem.h>       // Needed for definition of timeGetTime
+#include <limits.h>         // Standard data type limit definitions
+#include "measure.h"        // Used for time critical log functions
+
+#pragma warning(disable:4355)
+
+//  Helper function for clamping time differences
+int inline TimeDiff(REFERENCE_TIME rt)
+{
+    // Anything outside the window of +/- (50 * UNITS) is pinned to the
+    // nearest edge of that window; values inside it are simply narrowed.
+    if (rt > 50 * UNITS) {
+        return 50 * UNITS;
+    }
+    if (rt < - (50 * UNITS)) {
+        return -(50 * UNITS);
+    }
+    return (int)rt;
+}
+
+// Implements the CBaseRenderer class
+
+// Constructor.  Hands the name, owner, the renderer's own m_InterfaceLock and
+// the class id on to CBaseFilter, puts every member into its idle value (no
+// pin, no position helper, no sample, no advise, not streaming) and finally
+// calls Ready().  Neither the input pin nor the position helper is created
+// here; both are created on demand elsewhere in this file.
+// NOTE(review): phr is not written by this constructor body - confirm that
+// callers initialise it themselves.
+CBaseRenderer::CBaseRenderer(REFCLSID RenderClass, // CLSID for this renderer
+                             TCHAR *pName,         // Debug ONLY description
+                             LPUNKNOWN pUnk,       // Aggregated owner object
+                             HRESULT *phr) :       // General OLE return code
+
+    CBaseFilter(pName,pUnk,&m_InterfaceLock,RenderClass),
+    m_evComplete(TRUE),
+    m_bAbort(FALSE),
+    m_pPosition(NULL),
+    m_ThreadSignal(TRUE),
+    m_bStreaming(FALSE),
+    m_bEOS(FALSE),
+    m_bEOSDelivered(FALSE),
+    m_pMediaSample(NULL),
+    m_dwAdvise(0),
+    m_pQSink(NULL),
+    m_pInputPin(NULL),
+    m_bRepaintStatus(TRUE),
+    m_SignalTime(0),
+    m_bInReceive(FALSE),
+    m_EndOfStreamTimer(0)
+{
+    Ready();
+#ifdef PERF
+    // Register the performance log entries used by PERF builds
+    m_idBaseStamp = MSR_REGISTER(TEXT("BaseRenderer: sample time stamp"));
+    m_idBaseRenderTime = MSR_REGISTER(TEXT("BaseRenderer: draw time (msec)"));
+    m_idBaseAccuracy = MSR_REGISTER(TEXT("BaseRenderer: Accuracy (msec)"));
+#endif
+}
+
+
+// Delete the dynamically allocated IMediaPosition and IMediaSeeking helper
+// object. The object is created when somebody queries us. These are standard
+// control interfaces for seeking and setting start/stop positions and rates.
+// We will probably also have made an input pin based on CRendererInputPin
+// that has to be deleted, it's created when an enumerator calls our GetPin
+
+CBaseRenderer::~CBaseRenderer()
+{
+    // We expect to be torn down with streaming stopped and no end of stream
+    // timer outstanding
+    ASSERT(m_bStreaming == FALSE);
+    ASSERT(m_EndOfStreamTimer == 0);
+    StopStreaming();
+    ClearPendingSample();
+
+    // Free the position helper and the input pin if they were ever created.
+    // Deleting a NULL pointer does nothing, so no explicit test is needed.
+
+    delete m_pPosition;
+    m_pPosition = NULL;
+
+    delete m_pInputPin;
+    m_pInputPin = NULL;
+
+    // Any quality sink should already have been let go
+
+    ASSERT(m_pQSink == NULL);
+}
+
+
+// This returns the IMediaPosition and IMediaSeeking interfaces
+
+HRESULT CBaseRenderer::GetMediaPositionInterface(REFIID riid,void **ppv)
+{
+    CAutoLock cObjectCreationLock(&m_ObjectCreationLock);
+
+    // The pass through helper is only built the first time somebody asks
+    // for a position interface, since many graphs never seek at all. The
+    // helper takes the position calls and passes them upstream
+
+    if (m_pPosition == NULL) {
+
+        HRESULT hrCreate = NOERROR;
+
+        m_pPosition = new CRendererPosPassThru(NAME("Renderer CPosPassThru"),
+                                               CBaseFilter::GetOwner(),
+                                               (HRESULT *) &hrCreate,
+                                               GetPin(0));
+        if (m_pPosition == NULL) {
+            return E_OUTOFMEMORY;
+        }
+
+        // Throw a half constructed helper away again
+
+        if (FAILED(hrCreate)) {
+            delete m_pPosition;
+            m_pPosition = NULL;
+            return E_NOINTERFACE;
+        }
+    }
+
+    // Either it already existed or we have just made it
+
+    return m_pPosition->NonDelegatingQueryInterface(riid,ppv);
+}
+
+
+// Overridden to say what interfaces we support and where
+
+STDMETHODIMP CBaseRenderer::NonDelegatingQueryInterface(REFIID riid,void **ppv)
+{
+    // The two position interfaces come from the position helper object,
+    // everything else is left to the base filter class
+
+    const bool bPositionInterface =
+        (riid == IID_IMediaPosition) || (riid == IID_IMediaSeeking);
+
+    if (!bPositionInterface) {
+        return CBaseFilter::NonDelegatingQueryInterface(riid,ppv);
+    }
+    return GetMediaPositionInterface(riid,ppv);
+}
+
+
+// This is called whenever we change states. We have a manual reset event that
+// is signalled whenever we don't want the source filter thread to wait in us
+// (such as in a stopped state) and likewise is not signalled whenever it can
+// wait (during paused and running). This function sets or resets the thread
+// event. The event is used to stop source filter threads waiting in Receive
+
+// Resets m_ThreadSignal when the source thread may wait (bCanWait non-zero)
+// and sets it when it may not.  Always returns NOERROR.
+HRESULT CBaseRenderer::SourceThreadCanWait(BOOL bCanWait)
+{
+    // Test for non-zero rather than for equality with TRUE: a BOOL is an
+    // integer, so a caller passing any other non-zero value still means
+    // "can wait" and must not fall into the "cannot wait" branch.
+    if (bCanWait) {
+        m_ThreadSignal.Reset();
+    } else {
+        m_ThreadSignal.Set();
+    }
+    return NOERROR;
+}
+
+
+#ifdef DEBUG
+// Dump the current renderer state to the debug terminal. The hardest part of
+// the renderer is the window where we unlock everything to wait for a clock
+// to signal it is time to draw or for the application to cancel everything
+// by stopping the filter. If we get things wrong we can leave the thread in
+// WaitForRenderTime with no way for it to ever get out and we will deadlock
+
+// DEBUG only helper: writes the renderer's state variables, the clock time
+// and the timing of any pending sample to the debug log.  It is called from
+// WaitForRenderTime each time the wait there times out.  Nothing is changed.
+void CBaseRenderer::DisplayRendererState()
+{
+    DbgLog((LOG_TIMING, 1, TEXT("\nTimed out in WaitForRenderTime")));
+
+    // No way should this be signalled at this point
+
+    BOOL bSignalled = m_ThreadSignal.Check();
+    DbgLog((LOG_TIMING, 1, TEXT("Signal sanity check %d"),bSignalled));
+
+    // Now output the current renderer state variables
+
+    DbgLog((LOG_TIMING, 1, TEXT("Filter state %d"),m_State));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Abort flag %d"),m_bAbort));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Streaming flag %d"),m_bStreaming));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Clock advise link %d"),m_dwAdvise));
+
+    // The sample is a pointer so it has to be logged with %p. Using %x
+    // reads only an int sized value, which truncates the pointer where
+    // pointers are wider than int
+
+    DbgLog((LOG_TIMING, 1, TEXT("Current media sample %p"),m_pMediaSample));
+
+    DbgLog((LOG_TIMING, 1, TEXT("EOS signalled %d"),m_bEOS));
+
+    DbgLog((LOG_TIMING, 1, TEXT("EOS delivered %d"),m_bEOSDelivered));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Repaint status %d"),m_bRepaintStatus));
+
+
+    // Output the delayed end of stream timer information
+
+    DbgLog((LOG_TIMING, 1, TEXT("End of stream timer %x"),m_EndOfStreamTimer));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Deliver time %s"),CDisp((LONGLONG)m_SignalTime)));
+
+
+    // Should never timeout during a flushing state. The input pin is only
+    // created on demand by GetPin so make sure it exists before asking it
+
+    if (m_pInputPin != NULL) {
+        BOOL bFlushing = m_pInputPin->IsFlushing();
+        DbgLog((LOG_TIMING, 1, TEXT("Flushing sanity check %d"),bFlushing));
+    }
+
+    // Display the time we were told to start at
+    DbgLog((LOG_TIMING, 1, TEXT("Last run time %s"),CDisp((LONGLONG)m_tStart.m_time)));
+
+    // Have we got a reference clock
+    if (m_pClock == NULL) return;
+
+    // Get the current time from the wall clock
+
+    CRefTime CurrentTime,StartTime,EndTime;
+    m_pClock->GetTime((REFERENCE_TIME*) &CurrentTime);
+    CRefTime Offset = CurrentTime - m_tStart;
+
+    // Display the current time from the clock
+
+    DbgLog((LOG_TIMING, 1, TEXT("Clock time %s"),CDisp((LONGLONG)CurrentTime.m_time)));
+
+    DbgLog((LOG_TIMING, 1, TEXT("Time difference %dms"),Offset.Millisecs()));
+
+
+    // Do we have a sample ready to render
+    if (m_pMediaSample == NULL) return;
+
+    m_pMediaSample->GetTime((REFERENCE_TIME*)&StartTime, (REFERENCE_TIME*)&EndTime);
+    DbgLog((LOG_TIMING, 1, TEXT("Next sample stream times (Start %d End %d ms)"),
+           StartTime.Millisecs(),EndTime.Millisecs()));
+
+    // Calculate how long it is until it is due for rendering
+    CRefTime Wait = (m_tStart + StartTime) - CurrentTime;
+    DbgLog((LOG_TIMING, 1, TEXT("Wait required %d ms"),Wait.Millisecs()));
+}
+#endif
+
+
+// Wait until the clock sets the timer event or we're otherwise signalled. We
+// set an arbitrary timeout for this wait and if it fires then we display the
+// current renderer state on the debugger. It will often fire if the filter's
+// left paused in an application however it may also fire during stress tests
+// if the synchronisation with application seeks and state changes is faulty
+
+#define RENDER_TIMEOUT 10000
+
+HRESULT CBaseRenderer::WaitForRenderTime()
+{
+    HANDLE WaitObjects[] = { m_ThreadSignal, m_RenderEvent };
+    DWORD Result;
+
+    // Keep waiting in RENDER_TIMEOUT slices until one of the two events is
+    // signalled. In debug builds every slice that times out dumps our state
+
+    OnWaitStart();
+    do {
+        Result = WaitForMultipleObjects(2,WaitObjects,FALSE,RENDER_TIMEOUT);
+
+#ifdef DEBUG
+        if (Result == WAIT_TIMEOUT) DisplayRendererState();
+#endif
+
+    } while (Result == WAIT_TIMEOUT);
+    OnWaitEnd();
+
+    // Anything other than the thread signal (the first wait object) is
+    // treated as the timer having fired
+
+    if (Result != WAIT_OBJECT_0) {
+        SignalTimerFired();
+        return NOERROR;
+    }
+
+    // We were woken by the thread signal rather than by the timer
+
+    return VFW_E_STATE_CHANGED;
+}
+
+
+// Poll waiting for Receive to complete.  This really matters when
+// Receive may set the palette and cause window messages
+// The problem is that if we don't really wait for a renderer to
+// stop processing we can deadlock waiting for a transform which
+// is calling the renderer's Receive() method because the transform's
+// Stop method doesn't know to process window messages to unblock
+// the renderer's Receive processing
+void CBaseRenderer::WaitForReceiveToComplete()
+{
+    // Spin in one millisecond steps for as long as Receive is in progress
+    while (m_bInReceive) {
+
+        //  Receive all interthread sendmessages
+        MSG msg;
+        PeekMessage(&msg, NULL, WM_NULL, WM_NULL, PM_NOREMOVE);
+
+        Sleep(1);
+    }
+
+    // If the wakebit for QS_POSTMESSAGE is set, the PeekMessage call
+    // above just cleared the changebit which will cause some messaging
+    // calls to block (waitMessage, MsgWaitFor...) now.
+    // Post a dummy message to set the QS_POSTMESSAGE bit again
+    const DWORD dwQueueStatus = GetQueueStatus(QS_POSTMESSAGE);
+    if (HIWORD(dwQueueStatus) & QS_POSTMESSAGE) {
+        //  Send dummy message
+        PostThreadMessage(GetCurrentThreadId(), WM_NULL, 0, 0);
+    }
+}
+
+// A filter can have four discrete states, namely Stopped, Running, Paused,
+// Intermediate. We are in an intermediate state if we are currently trying
+// to pause but haven't yet got the first sample (or if we have been flushed
+// in paused state and therefore still have to wait for a sample to arrive)
+
+// This class contains an event called m_evComplete which is signalled when
+// the current state is completed and is not signalled when we are waiting to
+// complete the last state transition. As mentioned above the only time we
+// use this at the moment is when we wait for a media sample in paused state
+// If while we are waiting we receive an end of stream notification from the
+// source filter then we know no data is imminent so we can reset the event
+// This means that when we transition to paused the source filter must call
+// end of stream on us or send us an image otherwise we'll hang indefinitely
+
+
+// Simple internal way of getting the real state
+
+// Returns m_State directly, without waiting on m_evComplete the way GetState
+// does. No lock is taken here.
+FILTER_STATE CBaseRenderer::GetRealState() {
+    return m_State;
+}
+
+
+// The renderer doesn't complete the full transition to paused states until
+// it has got one media sample to render. If you ask it for its state while
+// it's waiting it will return the state along with VFW_S_STATE_INTERMEDIATE
+
+STDMETHODIMP CBaseRenderer::GetState(DWORD dwMSecs,FILTER_STATE *State)
+{
+    CheckPointer(State,E_POINTER);
+
+    // Give the pending transition up to dwMSecs to complete. The state is
+    // reported either way, only the return code says whether the wait on
+    // the completion event timed out
+
+    const DWORD dwWaitResult = WaitDispatchingMessages(m_evComplete, dwMSecs);
+    *State = m_State;
+
+    return (dwWaitResult == WAIT_TIMEOUT) ? VFW_S_STATE_INTERMEDIATE : NOERROR;
+}
+
+
+// If we're pausing and we have no samples we don't complete the transition
+// to State_Paused and we return S_FALSE. However if the m_bAbort flag has
+// been set then all samples are rejected so there is no point waiting for
+// one. If we do have a sample then return NOERROR. We will only ever return
+// VFW_S_STATE_INTERMEDIATE from GetState after being paused with no sample
+// (calling GetState after either being stopped or Run will NOT return this)
+
+HRESULT CBaseRenderer::CompleteStateChange(FILTER_STATE OldState)
+{
+    // The transition completes straight away when any of these hold, and
+    // they are tested in this order:
+    //      The input pin is not connected (we can pause when disconnected)
+    //      We have already run off the end of the stream
+    //      We hold a sample and did not come from stopped (after a stop we
+    //      insist on fresh data)
+
+    const bool bComplete =
+        (m_pInputPin->IsConnected() == FALSE) ||
+        (IsEndOfStream() == TRUE) ||
+        (HaveCurrentSample() == TRUE && OldState != State_Stopped);
+
+    if (bComplete) {
+        Ready();
+        return S_OK;
+    }
+
+    // Otherwise we stay intermediate until a sample turns up
+
+    NotReady();
+    return S_FALSE;
+}
+
+
+// When we stop the filter the things we do are:-
+
+//      Decommit the allocator being used in the connection
+//      Release the source filter if it's waiting in Receive
+//      Cancel any advise link we set up with the clock
+//      Any end of stream signalled is now obsolete so reset
+//      Allow us to be stopped when we are not connected
+
+// Moves the filter to State_Stopped while holding m_InterfaceLock. Always
+// returns NOERROR. Note that the return value of CBaseFilter::Stop() is
+// not checked.
+STDMETHODIMP CBaseRenderer::Stop()
+{
+    CAutoLock cRendererLock(&m_InterfaceLock);
+
+    // Make sure there really is a state change
+
+    if (m_State == State_Stopped) {
+        return NOERROR;
+    }
+
+    // Is our input pin connected. If it is not then only the state
+    // variable is updated and the base class is not called
+
+    if (m_pInputPin->IsConnected() == FALSE) {
+        NOTE("Input pin is not connected");
+        m_State = State_Stopped;
+        return NOERROR;
+    }
+
+    CBaseFilter::Stop();
+
+    // If we are going into a stopped state then we must decommit whatever
+    // allocator we are using it so that any source filter waiting in the
+    // GetBuffer can be released and unlock themselves for a state change
+
+    if (m_pInputPin->Allocator()) {
+        m_pInputPin->Allocator()->Decommit();
+    }
+
+    // Cancel any scheduled rendering
+
+    SetRepaintStatus(TRUE);
+    StopStreaming();
+    SourceThreadCanWait(FALSE);
+    ResetEndOfStream();
+    CancelNotification();
+
+    // There should be no outstanding clock advise. The advise was already
+    // cancelled by the call above, so the call inside the ASSERT is only
+    // there to check that nothing was left to cancel
+    ASSERT(CancelNotification() == S_FALSE);
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+    ASSERT(m_EndOfStreamTimer == 0);
+
+    // Signal the state change as complete, then wait for any Receive call
+    // still in progress to finish before clearing the abort flag
+    Ready();
+    WaitForReceiveToComplete();
+    m_bAbort = FALSE;
+
+    return NOERROR;
+}
+
+
+// When we pause the filter the things we do are:-
+
+//      Commit the allocator being used in the connection
+//      Allow a source filter thread to wait in Receive
+//      Cancel any clock advise link (we may be running)
+//      Possibly complete the state change if we have data
+//      Allow us to be paused when we are not connected
+
+// Moves the filter to State_Paused while holding m_InterfaceLock. The
+// result comes from CompleteStateChange (S_OK when the transition has
+// completed, S_FALSE when we are still waiting for a sample) unless the
+// base class Pause fails, in which case its error is returned.
+STDMETHODIMP CBaseRenderer::Pause()
+{
+    CAutoLock cRendererLock(&m_InterfaceLock);
+    FILTER_STATE OldState = m_State;
+    ASSERT(m_pInputPin->IsFlushing() == FALSE);
+
+    // Make sure there really is a state change
+
+    if (m_State == State_Paused) {
+        return CompleteStateChange(State_Paused);
+    }
+
+    // Has our input pin been connected. If it has not then only the state
+    // variable is updated and the base class is not called
+
+    if (m_pInputPin->IsConnected() == FALSE) {
+        NOTE("Input pin is not connected");
+        m_State = State_Paused;
+        return CompleteStateChange(State_Paused);
+    }
+
+    // Pause the base filter class
+
+    HRESULT hr = CBaseFilter::Pause();
+    if (FAILED(hr)) {
+        NOTE("Pause failed");
+        return hr;
+    }
+
+    // Enable EC_REPAINT events again
+
+    SetRepaintStatus(TRUE);
+    StopStreaming();
+    SourceThreadCanWait(TRUE);
+    CancelNotification();
+    ResetEndOfStreamTimer();
+
+    // If we are going into a paused state then we must commit whatever
+    // allocator we are using it so that any source filter can call the
+    // GetBuffer and expect to get a buffer without returning an error
+
+    if (m_pInputPin->Allocator()) {
+        m_pInputPin->Allocator()->Commit();
+    }
+
+    // There should be no outstanding advise. The advise was already
+    // cancelled by the call above, so the call inside the ASSERT is only
+    // there to check that nothing was left to cancel
+    ASSERT(CancelNotification() == S_FALSE);
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+    ASSERT(m_EndOfStreamTimer == 0);
+    ASSERT(m_pInputPin->IsFlushing() == FALSE);
+
+    // When we come out of a stopped state we must clear any image we were
+    // holding onto for frame refreshing. Since renderers see state changes
+    // first we can reset ourselves ready to accept the source thread data
+    // Paused or running after being stopped causes the current position to
+    // be reset so we're not interested in passing end of stream signals
+
+    if (OldState == State_Stopped) {
+        m_bAbort = FALSE;
+        ClearPendingSample();
+    }
+
+    // OldState is the state we were in when Pause was entered
+    return CompleteStateChange(OldState);
+}
+
+
+// When we run the filter the things we do are:-
+
+//      Commit the allocator being used in the connection
+//      Allow a source filter thread to wait in Receive
+//      Signal the render event just to get us going
+//      Start the base class by calling StartStreaming
+//      Allow us to be run when we are not connected
+//      Signal EC_COMPLETE if we are not connected
+
+// Moves the filter to State_Running while holding m_InterfaceLock. Returns
+// the result of StartStreaming, or the error from the base class Run if
+// that fails, or NOERROR for the two early exits below.
+STDMETHODIMP CBaseRenderer::Run(REFERENCE_TIME StartTime)
+{
+    CAutoLock cRendererLock(&m_InterfaceLock);
+    FILTER_STATE OldState = m_State;
+
+    // Make sure there really is a state change
+
+    if (m_State == State_Running) {
+        return NOERROR;
+    }
+
+    // Send EC_COMPLETE if we're not connected. In that case only the state
+    // variable is updated and the base class is not called
+
+    if (m_pInputPin->IsConnected() == FALSE) {
+        NotifyEvent(EC_COMPLETE,S_OK,(LONG_PTR)(IBaseFilter *)this);
+        m_State = State_Running;
+        return NOERROR;
+    }
+
+    Ready();
+
+    // Run the base filter class
+
+    HRESULT hr = CBaseFilter::Run(StartTime);
+    if (FAILED(hr)) {
+        NOTE("Run failed");
+        return hr;
+    }
+
+    // Allow the source thread to wait
+    ASSERT(m_pInputPin->IsFlushing() == FALSE);
+    SourceThreadCanWait(TRUE);
+    SetRepaintStatus(FALSE);
+
+    // There should be no outstanding advise
+    // NOTE(review): unlike Stop and Pause, CancelNotification is only called
+    // from inside the ASSERT here, so whether it executes at all depends on
+    // how ASSERT is defined for the build - confirm this is intended
+    ASSERT(CancelNotification() == S_FALSE);
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+    ASSERT(m_EndOfStreamTimer == 0);
+    ASSERT(m_pInputPin->IsFlushing() == FALSE);
+
+    // If we are going into a running state then we must commit whatever
+    // allocator we are using it so that any source filter can call the
+    // GetBuffer and expect to get a buffer without returning an error
+
+    if (m_pInputPin->Allocator()) {
+        m_pInputPin->Allocator()->Commit();
+    }
+
+    // When we come out of a stopped state we must clear any image we were
+    // holding onto for frame refreshing. Since renderers see state changes
+    // first we can reset ourselves ready to accept the source thread data
+    // Paused or running after being stopped causes the current position to
+    // be reset so we're not interested in passing end of stream signals
+
+    if (OldState == State_Stopped) {
+        m_bAbort = FALSE;
+        ClearPendingSample();
+    }
+    return StartStreaming();
+}
+
+
+// Return the number of input pins we support
+
+// Always one: GetPin only ever hands out the single input pin (index zero)
+int CBaseRenderer::GetPinCount()
+{
+    return 1;
+}
+
+
+// We only support one input pin and it is numbered zero
+
+CBasePin *CBaseRenderer::GetPin(int n)
+{
+    CAutoLock cObjectCreationLock(&m_ObjectCreationLock);
+
+    // Pin zero is the only one we have
+    ASSERT(n == 0);
+
+    if (n != 0) {
+        return NULL;
+    }
+
+    // Hand back the existing pin if one has already been made
+
+    if (m_pInputPin != NULL) {
+        return m_pInputPin;
+    }
+
+    // The pin constructor only touches the HRESULT when something goes
+    // wrong, so it has to start out as NOERROR
+    HRESULT hrPin = NOERROR;
+
+    m_pInputPin = new CRendererInputPin(this,&hrPin,L"In");
+
+    // Throw the pin away again if its construction reported a failure
+
+    if (m_pInputPin != NULL && FAILED(hrPin)) {
+        delete m_pInputPin;
+        m_pInputPin = NULL;
+    }
+
+    // This is NULL if the allocation or the construction failed
+
+    return m_pInputPin;
+}
+
+
+// If "In" then return the IPin for our input pin, otherwise NULL and error
+
+// Looks a pin up by its identifier. Returns:
+//      E_POINTER        if ppPin is NULL
+//      VFW_E_NOT_FOUND  if Id is not "In" (*ppPin is set to NULL)
+//      E_OUTOFMEMORY    if the input pin could not be created (*ppPin is
+//                       set to NULL)
+//      NOERROR          otherwise, with *ppPin holding an AddRef'd pin that
+//                       the caller must release
+STDMETHODIMP CBaseRenderer::FindPin(LPCWSTR Id, IPin **ppPin)
+{
+    CheckPointer(ppPin,E_POINTER);
+
+    if (0==lstrcmpW(Id,L"In")) {
+        *ppPin = GetPin(0);
+
+        // GetPin returns NULL when the pin cannot be allocated or fails to
+        // construct. That used to be caught only by an ASSERT, so a release
+        // build went on to call AddRef through a NULL pointer
+
+        if (*ppPin == NULL) {
+            return E_OUTOFMEMORY;
+        }
+        (*ppPin)->AddRef();
+    } else {
+        *ppPin = NULL;
+        return VFW_E_NOT_FOUND;
+    }
+    return NOERROR;
+}
+
+
+// Called when the input pin receives an EndOfStream notification. If we have
+// not got a sample, then notify EC_COMPLETE now. If we have samples, then set
+// m_bEOS and check for this on completing samples. If we're waiting to pause
+// then complete the transition to paused state by setting the state event
+
+HRESULT CBaseRenderer::EndOfStream()
+{
+    // Nothing at all happens if we are stopped
+
+    if (m_State != State_Stopped) {
+
+        // Remember that the end of stream has been signalled. While we
+        // still hold a sample nothing more is done here
+
+        m_bEOS = TRUE;
+
+        if (m_pMediaSample == NULL) {
+
+            // No sample is going to arrive now, so anybody waiting in
+            // GetState for a pause to complete can be released
+
+            Ready();
+
+            // Completion is only signalled straight away while streaming,
+            // otherwise it is left until we do run in StartStreaming. A
+            // seek causes a pause, where an early completion would mislead
+
+            if (m_bStreaming) {
+                SendEndOfStream();
+            }
+        }
+    }
+    return NOERROR;
+}
+
+
+// When we are told to flush we should release the source thread
+
+HRESULT CBaseRenderer::BeginFlush()
+{
+    // Starts a flush: marks a paused filter as not ready, stops the source
+    // thread from waiting, cancels any clock advise, drops the held sample
+    // and then waits for any Receive call in progress. Always returns NOERROR
+
+    // If paused then report state intermediate until we get some data
+
+    if (m_State == State_Paused) {
+        NotReady();
+    }
+
+    // The steps below run in this order: release the waiting source thread
+    // first, then cancel the advise and clear the sample it was waiting on
+
+    SourceThreadCanWait(FALSE);
+    CancelNotification();
+    ClearPendingSample();
+    //  Wait for Receive to complete
+    WaitForReceiveToComplete();
+
+    return NOERROR;
+}
+
+
+// After flushing the source thread can wait in Receive again
+
+HRESULT CBaseRenderer::EndFlush()
+{
+    // Ends a flush: resets the stored media time (when a position object
+    // exists) and lets the source thread wait again. Always returns NOERROR
+
+    // Reset the current sample media time
+    if (m_pPosition) m_pPosition->ResetMediaTime();
+
+    // There should be no outstanding advise
+    // NOTE(review): CancelNotification() is evaluated inside ASSERT, so if
+    // ASSERT expands to nothing in non-debug builds the call (and the event
+    // reset it performs) will not happen there - confirm this is intended
+
+    ASSERT(CancelNotification() == S_FALSE);
+    SourceThreadCanWait(TRUE);
+    return NOERROR;
+}
+
+
+// We can now send EC_REPAINTs if so required
+
+HRESULT CBaseRenderer::CompleteConnect(IPin *pReceivePin)
+{
+    // CBaseFilter::m_State is read below, which is only safe while the
+    // caller owns the interface lock.
+    ASSERT(CritCheckIn(&m_InterfaceLock));
+
+    // A fresh connection clears any earlier abort request
+    m_bAbort = FALSE;
+
+    // When not running, simply allow repaint notifications again
+    if (State_Running != GetRealState()) {
+        SetRepaintStatus(TRUE);
+        return NOERROR;
+    }
+
+    // Connected while running: start streaming straight away and hold
+    // back repaints, passing on any failure to the caller
+    const HRESULT hrStart = StartStreaming();
+    if (FAILED(hrStart)) {
+        return hrStart;
+    }
+
+    SetRepaintStatus(FALSE);
+    return NOERROR;
+}
+
+
+// Called when we go paused or running
+
+HRESULT CBaseRenderer::Active()
+{
+    // The base implementation does nothing and reports success
+    return NOERROR;
+}
+
+
+// Called when we go into a stopped state
+
+HRESULT CBaseRenderer::Inactive()
+{
+    // Forget the media time of the last sample if we track positions
+    if (m_pPosition != NULL) {
+        m_pPosition->ResetMediaTime();
+    }
+
+    //  Drop the sample we are holding. Derived classes that want to keep
+    //  it around in some circumstances may override this behaviour
+    ClearPendingSample();
+
+    return NOERROR;
+}
+
+
+// Tell derived classes about the media type agreed
+
+HRESULT CBaseRenderer::SetMediaType(const CMediaType *pmt)
+{
+    // The base implementation ignores pmt and reports success
+    return NOERROR;
+}
+
+
+// When we break the input pin connection we should reset the EOS flags. When
+// we are asked for either IMediaPosition or IMediaSeeking we will create a
+// CPosPassThru object to handle media time pass through. When we're handed
+// samples we store (by calling CPosPassThru::RegisterMediaTime) their media
+// times so we can then return a real current position of data being rendered
+
+HRESULT CBaseRenderer::BreakConnect()
+{
+    // Let go of any quality management sink we were given
+
+    if (m_pQSink != NULL) {
+        m_pQSink->Release();
+        m_pQSink = NULL;
+    }
+
+    // There is nothing to break if the input pin has no connection
+
+    if (!m_pInputPin->IsConnected()) {
+        return S_FALSE;
+    }
+
+    // Disconnecting while active is only allowed when the pin says it can
+    // be reconnected in that state
+    if (m_State != State_Stopped) {
+        if (!m_pInputPin->CanReconnectWhenActive()) {
+            return VFW_E_NOT_STOPPED;
+        }
+    }
+
+    // Return the per-connection state to its defaults
+    SetRepaintStatus(FALSE);
+    ResetEndOfStream();
+    ClearPendingSample();
+    m_bAbort = FALSE;
+
+    if (m_State == State_Running) {
+        StopStreaming();
+    }
+
+    return NOERROR;
+}
+
+
+// Retrieves the sample times for this sample (note the sample times are
+// passed in by reference not value). We return S_FALSE to say schedule this
+// sample according to the times on the sample. We also return S_OK in
+// which case the object should simply render the sample data immediately
+
+HRESULT CBaseRenderer::GetSampleTimes(IMediaSample *pMediaSample,
+                                      REFERENCE_TIME *pStartTime,
+                                      REFERENCE_TIME *pEndTime)
+{
+    ASSERT(m_dwAdvise == 0);
+    ASSERT(pMediaSample);
+
+    // A sample that carries no time stamps cannot be scheduled, so the
+    // caller is told to draw it straight away
+
+    if (FAILED(pMediaSample->GetTime(pStartTime, pEndTime))) {
+        return S_OK;
+    }
+
+    // A sample whose end time lies before its start time is rejected.
+    // Source filters should always fill in the start and end times properly!
+
+    if (*pEndTime < *pStartTime) {
+        return VFW_E_START_TIME_AFTER_END;
+    }
+
+    // Without a clock there is nothing to synchronise against, so S_OK is
+    // returned to have the sample rendered immediately rather than paying
+    // for a timer advise link
+
+    if (m_pClock == NULL) {
+        return S_OK;
+    }
+
+    // Otherwise the decision is left to ShouldDrawSampleNow
+    return ShouldDrawSampleNow(pMediaSample,pStartTime,pEndTime);
+}
+
+
+// By default all samples are drawn according to their time stamps so we
+// return S_FALSE. Returning S_OK means draw immediately, this is used
+// by the derived video renderer class in its quality management.
+
+HRESULT CBaseRenderer::ShouldDrawSampleNow(IMediaSample *pMediaSample,
+                                           REFERENCE_TIME *ptrStart,
+                                           REFERENCE_TIME *ptrEnd)
+{
+    // The base implementation ignores all three arguments and always asks
+    // for the sample to be scheduled by its time stamps (S_FALSE)
+    return S_FALSE;
+}
+
+
+// We must always reset the current advise time to zero after a timer fires
+// because there are several possible ways which lead us not to do any more
+// scheduling such as the pending image being cleared after state changes
+
+void CBaseRenderer::SignalTimerFired()
+{
+    // Zero means there is no advise link outstanding
+    m_dwAdvise = 0;
+}
+
+
+// Cancel any notification currently scheduled. This is called by the owning
+// window object when it is told to stop streaming. If there is no timer link
+// outstanding then calling this is benign otherwise we go ahead and cancel
+// We must always reset the render event as the quality management code can
+// signal immediate rendering by setting the event without setting an advise
+// link. If we're subsequently stopped and run the first attempt to setup an
+// advise link with the reference clock will find the event still signalled
+
+HRESULT CBaseRenderer::CancelNotification()
+{
+    // An advise cookie can only exist if we have a clock to hold it
+    ASSERT(m_dwAdvise == 0 || m_pClock);
+
+    // S_FALSE reports that there was no advise link to cancel
+    HRESULT hrStatus = S_FALSE;
+
+    // Tear down the live advise link, if there is one
+
+    if (m_dwAdvise != 0) {
+        m_pClock->Unadvise(m_dwAdvise);
+        SignalTimerFired();
+        ASSERT(m_dwAdvise == 0);
+        hrStatus = S_OK;
+    }
+
+    // The render event is reset whether or not a link was outstanding
+
+    m_RenderEvent.Reset();
+    return hrStatus;
+}
+
+
+// Responsible for setting up one shot advise links with the clock
+// Return FALSE if the sample is to be dropped (not drawn at all)
+// Return TRUE if the sample is to be drawn and in this case also
+// arrange for m_RenderEvent to be set at the appropriate time
+
+BOOL CBaseRenderer::ScheduleSample(IMediaSample *pMediaSample)
+{
+    // Returns TRUE when the sample is to be drawn: m_RenderEvent has either
+    // been set directly or will be set by the clock advise created here.
+    // Returns FALSE for a NULL sample, when GetSampleTimes fails, or when
+    // the advise link could not be created
+
+    REFERENCE_TIME StartSample, EndSample;
+
+    // Is someone pulling our leg
+
+    if (pMediaSample == NULL) {
+        return FALSE;
+    }
+
+    // Get the times for this sample. GetSampleTimes returns an error if the
+    // sample should not be drawn. Otherwise it yields the sample times and
+    // returns S_OK if the sample is due now, or S_FALSE if it is to be
+    // scheduled for when it comes due
+
+    HRESULT hr = GetSampleTimes(pMediaSample, &StartSample, &EndSample);
+    if (FAILED(hr)) {
+        return FALSE;
+    }
+
+    // If we don't have a reference clock then we cannot set up the advise
+    // time so we simply set the event indicating an image to render. This
+    // will cause us to run flat out without any timing or synchronisation
+
+    if (hr == S_OK) {
+        EXECUTE_ASSERT(SetEvent((HANDLE) m_RenderEvent));
+        return TRUE;
+    }
+
+    // From here on we expect: no advise outstanding, a clock available and
+    // the render event not yet signalled
+
+    ASSERT(m_dwAdvise == 0);
+    ASSERT(m_pClock);
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+
+    // We do have a valid reference clock interface so we can ask it to
+    // set an event when the image comes due for rendering. We pass in
+    // the reference time we were told to start at and also the current
+    // stream time which is the offset from the start reference time
+
+    hr = m_pClock->AdviseTime(
+            (REFERENCE_TIME) m_tStart,          // Start run time
+            StartSample,                        // Stream time
+            (HEVENT)(HANDLE) m_RenderEvent,     // Render notification
+            &m_dwAdvise);                       // Advise cookie
+
+    if (SUCCEEDED(hr)) {
+        return TRUE;
+    }
+
+    // We could not schedule the next sample for rendering despite the fact
+    // we have a valid sample here. This is a fair indication that either
+    // the system clock is wrong or the time stamp for the sample is duff
+
+    ASSERT(m_dwAdvise == 0);
+    return FALSE;
+}
+
+
+// This is called when a sample comes due for rendering. We pass the sample
+// on to the derived class. After rendering we will initialise the timer for
+// the next sample, NOTE signal that the last one fired first, if we don't
+// do this it thinks there is still one outstanding that hasn't completed
+
+HRESULT CBaseRenderer::Render(IMediaSample *pMediaSample)
+{
+    // There is nothing to draw in two situations:
+    //  - the sample is NULL: the clock told us a sample was due but in the
+    //    meantime someone stopped us streaming and the sample was released
+    //  - streaming has been switched off: whoever reset the flag leaves the
+    //    pending sample alone so it can still refresh an output device
+
+    if (pMediaSample == NULL || m_bStreaming == FALSE) {
+        return S_FALSE;
+    }
+
+    // Bracket the actual drawing with the start/end hooks so the time
+    // spent rendering can be measured
+
+    OnRenderStart(pMediaSample);
+    DoRenderSample(pMediaSample);
+    OnRenderEnd(pMediaSample);
+
+    return NOERROR;
+}
+
+
+// Checks if there is a sample waiting at the renderer
+
+BOOL CBaseRenderer::HaveCurrentSample()
+{
+    // The sample pointer is only examined while holding the renderer lock
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    if (m_pMediaSample != NULL) {
+        return TRUE;
+    }
+    return FALSE;
+}
+
+
+// Returns the current sample waiting at the video renderer. We AddRef the
+// sample before returning so that should it come due for rendering the
+// person who called this method will hold the remaining reference count
+// that will stop the sample being added back onto the allocator free list
+
+IMediaSample *CBaseRenderer::GetCurrentSample()
+{
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    // Hand back the held sample (possibly NULL). A non-NULL sample gets an
+    // extra reference which the caller must release
+    IMediaSample *pCurrent = m_pMediaSample;
+    if (pCurrent != NULL) {
+        pCurrent->AddRef();
+    }
+    return pCurrent;
+}
+
+
+// Called when the source delivers us a sample. We go through a few checks to
+// make sure the sample can be rendered. If we are running (streaming) then we
+// have the sample scheduled with the reference clock, if we are not streaming
+// then we have received a sample in paused mode so we can complete any state
+// transition. On leaving this function everything will be unlocked so an app
+// thread may get in and change our state to stopped (for example) in which
+// case it will also signal the thread event so that our wait call is stopped
+
+HRESULT CBaseRenderer::PrepareReceive(IMediaSample *pMediaSample)
+{
+    // Validates an incoming sample and stores it as the pending sample.
+    // Returns NOERROR with m_bInReceive left TRUE on success. Every failure
+    // path sets m_bInReceive back to FALSE before returning: E_FAIL when
+    // the base pin does not return NOERROR, the SetMediaType error for a
+    // failed format change, E_UNEXPECTED when a sample is already pending
+    // or EOS/abort is flagged, and VFW_E_SAMPLE_REJECTED when streaming
+    // and the sample could not be scheduled
+
+    CAutoLock cInterfaceLock(&m_InterfaceLock);
+    m_bInReceive = TRUE;
+
+    // Check our flushing and filter state
+
+    // This function must hold the interface lock because it calls
+    // CBaseInputPin::Receive() and CBaseInputPin::Receive() uses
+    // CBasePin::m_bRunTimeError.
+    HRESULT hr = m_pInputPin->CBaseInputPin::Receive(pMediaSample);
+
+    // Anything other than NOERROR (including success codes such as
+    // S_FALSE) is reported to the caller as E_FAIL
+    if (hr != NOERROR) {
+        m_bInReceive = FALSE;
+        return E_FAIL;
+    }
+
+    // Has the type changed on a media sample. We do all rendering
+    // synchronously on the source thread, which has a side effect
+    // that only one buffer is ever outstanding. Therefore when we
+    // have Receive called we can go ahead and change the format
+    // Since the format change can cause a SendMessage we just don't
+    // lock
+    if (m_pInputPin->SampleProps()->pMediaType) {
+        hr = m_pInputPin->SetMediaType(
+                (CMediaType *)m_pInputPin->SampleProps()->pMediaType);
+        if (FAILED(hr)) {
+            m_bInReceive = FALSE;
+            return hr;
+        }
+    }
+
+
+    // From here on the renderer lock is held as well as the interface lock
+    CAutoLock cSampleLock(&m_RendererLock);
+
+    ASSERT(IsActive() == TRUE);
+    ASSERT(m_pInputPin->IsFlushing() == FALSE);
+    ASSERT(m_pInputPin->IsConnected() == TRUE);
+    ASSERT(m_pMediaSample == NULL);
+
+    // Return an error if we already have a sample waiting for rendering
+    // source pins must serialise the Receive calls - we also check that
+    // no data is being sent after the source signalled an end of stream
+
+    if (m_pMediaSample || m_bEOS || m_bAbort) {
+        Ready();
+        m_bInReceive = FALSE;
+        return E_UNEXPECTED;
+    }
+
+    // Store the media times from this sample
+    if (m_pPosition) m_pPosition->RegisterMediaTime(pMediaSample);
+
+    // Schedule the next sample if we are streaming
+
+    if ((m_bStreaming == TRUE) && (ScheduleSample(pMediaSample) == FALSE)) {
+        ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+        ASSERT(CancelNotification() == S_FALSE);
+        m_bInReceive = FALSE;
+        return VFW_E_SAMPLE_REJECTED;
+    }
+
+    // Store the sample end time for EC_COMPLETE handling
+    m_SignalTime = m_pInputPin->SampleProps()->tStop;
+
+    // BEWARE we sometimes keep the sample even after returning the thread to
+    // the source filter such as when we go into a stopped state (we keep it
+    // to refresh the device with) so we must AddRef it to keep it safely. If
+    // we start flushing the source thread is released and any sample waiting
+    // will be released otherwise GetBuffer may never return (see BeginFlush)
+
+    m_pMediaSample = pMediaSample;
+    m_pMediaSample->AddRef();
+
+    // A sample received while not streaming re-enables repaint notifications
+    if (m_bStreaming == FALSE) {
+        SetRepaintStatus(TRUE);
+    }
+    return NOERROR;
+}
+
+
+// Called by the source filter when we have a sample to render. Under normal
+// circumstances we set an advise link with the clock, wait for the time to
+// arrive and then render the data using the PURE virtual DoRenderSample that
+// the derived class will have overridden. After rendering the sample we may
+// also signal EOS if it was the last one sent before EndOfStream was called
+
+HRESULT CBaseRenderer::Receive(IMediaSample *pSample)
+{
+    // Takes a sample from the source thread, waits until it is due, renders
+    // it and releases it. A sample rejected by PrepareReceive and a failed
+    // wait are both reported as NOERROR; other PrepareReceive failures are
+    // returned unchanged
+
+    ASSERT(pSample);
+
+    // It may return VFW_E_SAMPLE_REJECTED code to say don't bother
+
+    HRESULT hr = PrepareReceive(pSample);
+    ASSERT(m_bInReceive == SUCCEEDED(hr));
+    if (FAILED(hr)) {
+        if (hr == VFW_E_SAMPLE_REJECTED) {
+            return NOERROR;
+        }
+        return hr;
+    }
+
+    // We realize the palette in "PrepareRender()" so we have to give away the
+    // filter lock here.
+    if (m_State == State_Paused) {
+        PrepareRender();
+        // no need to use InterlockedExchange
+        m_bInReceive = FALSE;
+        {
+            // We must hold both these locks
+            // (note: cRendererLock below guards m_InterfaceLock, despite
+            // its name; cSampleLock guards m_RendererLock)
+            CAutoLock cRendererLock(&m_InterfaceLock);
+            if (m_State == State_Stopped)
+                return NOERROR;
+
+            m_bInReceive = TRUE;
+            CAutoLock cSampleLock(&m_RendererLock);
+            OnReceiveFirstSample(pSample);
+        }
+        Ready();
+    }
+    // Having set an advise link with the clock we sit and wait. We may be
+    // awoken by the clock firing or by a state change. The rendering call
+    // will lock the critical section and check we can still render the data
+
+    hr = WaitForRenderTime();
+    if (FAILED(hr)) {
+        m_bInReceive = FALSE;
+        return NOERROR;
+    }
+
+    PrepareRender();
+
+    //  Set this here and poll it until we work out the locking correctly
+    //  It can't be right that the streaming stuff grabs the interface
+    //  lock - after all we want to be able to wait for this stuff
+    //  to complete
+    m_bInReceive = FALSE;
+
+    // We must hold both these locks
+    // (the interface lock is taken first, then the renderer lock)
+    CAutoLock cRendererLock(&m_InterfaceLock);
+
+    // since we gave away the filter wide lock, the state of the filter could
+    // have changed to Stopped
+    if (m_State == State_Stopped)
+        return NOERROR;
+
+    CAutoLock cSampleLock(&m_RendererLock);
+
+    // Deal with this sample
+
+    Render(m_pMediaSample);
+    ClearPendingSample();
+    SendEndOfStream();
+    CancelNotification();
+    return NOERROR;
+}
+
+
+// This is called when we stop or are inactivated to clear the pending sample
+// We release the media sample interface so that they can be allocated to the
+// source filter again, unless of course we are changing state to inactive in
+// which case GetBuffer will return an error. We must also reset the current
+// media sample to NULL so that we know we do not currently have an image
+
+HRESULT CBaseRenderer::ClearPendingSample()
+{
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    // Nothing to do when no sample is being held
+    if (m_pMediaSample == NULL) {
+        return NOERROR;
+    }
+
+    // Give up our reference and forget the pointer so that we know we do
+    // not currently have a sample
+    m_pMediaSample->Release();
+    m_pMediaSample = NULL;
+    return NOERROR;
+}
+
+
+// Used to signal end of stream according to the sample end time
+
+void CALLBACK EndOfStreamTimer(UINT uID,        // Timer identifier
+                               UINT uMsg,       // Not currently used
+                               DWORD_PTR dwUser,// User information
+                               DWORD_PTR dw1,   // Windows reserved
+                               DWORD_PTR dw2)   // is also reserved
+{
+    // The user value carries the renderer that set the timer up; log the
+    // call and pass the work on to that object
+    NOTE1("EndOfStreamTimer called (%d)",uID);
+    reinterpret_cast<CBaseRenderer *>(dwUser)->TimerCallback();
+}
+
+//  Do the timer callback work
+void CBaseRenderer::TimerCallback()
+{
+    //  Serialise against the rest of the renderer (this lock must not be
+    //  held when calling timeKillEvent)
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    // If the timer has already been cleared there is nothing left to do
+
+    if (m_EndOfStreamTimer == 0) {
+        return;
+    }
+
+    // Mark the timer as finished and then try to signal end of stream
+
+    m_EndOfStreamTimer = 0;
+    SendEndOfStream();
+}
+
+
+// If we are at the end of the stream signal the filter graph but do not set
+// the state flag back to FALSE. Once we drop off the end of the stream we
+// leave the flag set (until a subsequent ResetEndOfStream). Each sample we
+// get delivered will update m_SignalTime to be the last sample's end time.
+// We must wait this long before signalling end of stream to the filtergraph
+
+#define TIMEOUT_DELIVERYWAIT 50    // Delays (msec) below this notify at once
+#define TIMEOUT_RESOLUTION 10      // Resolution passed when setting the timer
+
+HRESULT CBaseRenderer::SendEndOfStream()
+{
+    // Must be called with the renderer lock held. Does nothing unless an
+    // end of stream is flagged, has not yet been delivered and no timer is
+    // already pending. Otherwise it either notifies immediately or starts
+    // a one shot timer that calls back when the last sample's end time
+    // is reached
+
+    ASSERT(CritCheckIn(&m_RendererLock));
+    if (m_bEOS == FALSE || m_bEOSDelivered || m_EndOfStreamTimer) {
+        return NOERROR;
+    }
+
+    // If there is no clock then signal immediately
+    if (m_pClock == NULL) {
+        return NotifyEndOfStream();
+    }
+
+    // How long into the future is the delivery time
+
+    REFERENCE_TIME Signal = m_tStart + m_SignalTime;
+    REFERENCE_TIME CurrentTime;
+    m_pClock->GetTime(&CurrentTime);
+    LONG Delay = LONG((Signal - CurrentTime) / 10000);  // UNITS -> msec
+
+    // Dump the timing information to the debugger
+
+    NOTE1("Delay until end of stream delivery %d",Delay);
+    NOTE1("Current %s",(LPCTSTR)CDisp((LONGLONG)CurrentTime));
+    NOTE1("Signal %s",(LPCTSTR)CDisp((LONGLONG)Signal));
+
+    // Wait for the delivery time to arrive
+    // (a short or negative delay is not worth a timer, so notify now)
+
+    if (Delay < TIMEOUT_DELIVERYWAIT) {
+        return NotifyEndOfStream();
+    }
+
+    // Signal a timer callback on another worker thread
+
+    m_EndOfStreamTimer = CompatibleTimeSetEvent((UINT) Delay, // Period of timer
+                                      TIMEOUT_RESOLUTION,     // Timer resolution
+                                      EndOfStreamTimer,       // Callback function
+                                      DWORD_PTR(this),        // Used information
+                                      TIME_ONESHOT);          // Type of callback
+
+    // If the timer could not be created fall back to notifying right away
+    if (m_EndOfStreamTimer == 0) {
+        return NotifyEndOfStream();
+    }
+    return NOERROR;
+}
+
+
+// Signals EC_COMPLETE to the filtergraph manager
+
+HRESULT CBaseRenderer::NotifyEndOfStream()
+{
+    // Sends EC_COMPLETE and records that it has been delivered. Nothing is
+    // sent when we are not streaming. Returns the NotifyEvent result, or
+    // NOERROR when nothing was sent
+
+    CAutoLock cRendererLock(&m_RendererLock);
+    ASSERT(m_bEOSDelivered == FALSE);
+    ASSERT(m_EndOfStreamTimer == 0);
+
+    // Has the filter changed state
+
+    if (m_bStreaming == FALSE) {
+        ASSERT(m_EndOfStreamTimer == 0);
+        return NOERROR;
+    }
+
+    // Reset the end of stream timer
+    m_EndOfStreamTimer = 0;
+
+    // If we've been using the IMediaPosition interface, set its start
+    // and end media "times" to the stop position by hand.  This ensures
+    // that we actually get to the end, even if the MPEG guestimate has
+    // been bad or if the quality management dropped the last few frames
+
+    if (m_pPosition) m_pPosition->EOS();
+    m_bEOSDelivered = TRUE;
+    NOTE("Sending EC_COMPLETE...");
+    return NotifyEvent(EC_COMPLETE,S_OK,(LONG_PTR)(IBaseFilter *)this);
+}
+
+
+// Reset the end of stream flag, this is typically called when we transfer to
+// stopped states since that resets the current position back to the start so
+// we will receive more samples or another EndOfStream if there aren't any. We
+// keep two separate flags one to say we have run off the end of the stream
+// (this is the m_bEOS flag) and another to say we have delivered EC_COMPLETE
+// to the filter graph. We need the latter otherwise we can end up sending an
+// EC_COMPLETE every time the source changes state and calls our EndOfStream
+
+HRESULT CBaseRenderer::ResetEndOfStream()
+{
+    // Kill any end of stream timer BEFORE taking the renderer lock;
+    // ResetEndOfStreamTimer asserts that the lock is not held
+    ResetEndOfStreamTimer();
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    // Clear both end of stream flags and the stored signal time
+    m_bEOS = FALSE;
+    m_bEOSDelivered = FALSE;
+    m_SignalTime = 0;
+
+    return NOERROR;
+}
+
+
+// Kills any outstanding end of stream timer
+
+void CBaseRenderer::ResetEndOfStreamTimer()
+{
+    // The renderer lock must not be held when this is called
+    ASSERT(CritCheckOut(&m_RendererLock));
+
+    // No timer outstanding means there is nothing to kill
+    if (m_EndOfStreamTimer == 0) {
+        return;
+    }
+
+    timeKillEvent(m_EndOfStreamTimer);
+    m_EndOfStreamTimer = 0;
+}
+
+
+// This is called when we start running so that we can schedule any pending
+// image we have with the clock and display any timing information. If we
+// don't have any sample but we have queued an EOS flag then we send it. If
+// we do have a sample then we wait until that has been rendered before we
+// signal the filter graph otherwise we may change state before it's done
+
+HRESULT CBaseRenderer::StartStreaming()
+{
+    // Switches the renderer into streaming mode. Does nothing if already
+    // streaming. With no pending sample it returns the SendEndOfStream
+    // result; otherwise it schedules the pending sample and returns NOERROR
+
+    CAutoLock cRendererLock(&m_RendererLock);
+    if (m_bStreaming == TRUE) {
+        return NOERROR;
+    }
+
+    // Reset the streaming times ready for running
+
+    m_bStreaming = TRUE;
+
+    // Paired with the timeEndPeriod(1) call in StopStreaming
+    timeBeginPeriod(1);
+    OnStartStreaming();
+
+    // There should be no outstanding advise
+    ASSERT(WAIT_TIMEOUT == WaitForSingleObject((HANDLE)m_RenderEvent,0));
+    ASSERT(CancelNotification() == S_FALSE);
+
+    // If we have an EOS and no data then deliver it now
+
+    if (m_pMediaSample == NULL) {
+        return SendEndOfStream();
+    }
+
+    // Have the data rendered
+    // (if it cannot be scheduled the render event is set directly)
+
+    ASSERT(m_pMediaSample);
+    if (!ScheduleSample(m_pMediaSample))
+        m_RenderEvent.Set();
+
+    return NOERROR;
+}
+
+
+// This is called when we stop streaming so that we can set our internal flag
+// indicating we are not now to schedule any more samples arriving. The state
+// change methods in the filter implementation take care of cancelling any
+// clock advise link we have set up and clearing any pending sample we have
+
+HRESULT CBaseRenderer::StopStreaming()
+{
+    CAutoLock cRendererLock(&m_RendererLock);
+
+    // The delivered flag is always cleared, whether or not we were streaming
+    m_bEOSDelivered = FALSE;
+
+    // Already stopped, nothing more to undo
+    if (m_bStreaming != TRUE) {
+        return NOERROR;
+    }
+
+    // Leave streaming mode, tell the derived class and give back the timer
+    // period that StartStreaming requested
+    m_bStreaming = FALSE;
+    OnStopStreaming();
+    timeEndPeriod(1);
+    return NOERROR;
+}
+
+
+// We have a boolean flag that is reset when we have signalled EC_REPAINT to
+// the filter graph. We set this when we receive an image so that should any
+// conditions arise again we can send another one. By having a flag we ensure
+// we don't flood the filter graph with redundant calls. We do not set the
+// event when we receive an EndOfStream call since there is no point in us
+// sending further EC_REPAINTs. In particular the AutoShowWindow method and
+// the DirectDraw object use this method to control the window repainting
+
+void CBaseRenderer::SetRepaintStatus(BOOL bRepaint)
+{
+    // Store the flag under the renderer lock. SendRepaint only sends an
+    // EC_REPAINT while this flag is TRUE
+    CAutoLock cSampleLock(&m_RendererLock);
+    m_bRepaintStatus = bRepaint;
+}
+
+
+// Pass the window handle to the upstream filter
+
+void CBaseRenderer::SendNotifyWindow(IPin *pPin,HWND hwnd)
+{
+    IMediaEventSink *pEventSink;
+
+    // If the pin exposes IMediaEventSink, tell it about the window directly
+    if (SUCCEEDED(pPin->QueryInterface(IID_IMediaEventSink,
+                                       (void **)&pEventSink))) {
+        pEventSink->Notify(EC_NOTIFY_WINDOW,LONG_PTR(hwnd),0);
+        pEventSink->Release();
+    }
+
+    // The event is always sent through our own NotifyEvent as well
+    NotifyEvent(EC_NOTIFY_WINDOW,LONG_PTR(hwnd),0);
+}
+
+
+// Signal an EC_REPAINT to the filter graph. This can be used to have data
+// sent to us. For example when a video window is first displayed it may
+// not have an image to display, at which point it signals EC_REPAINT. The
+// filtergraph will either pause the graph if stopped or if already paused
+// it will call put_CurrentPosition of the current position. Setting the
+// current position to itself has the stream flushed and the image resent
+
+#define RLOG(_x_) DbgLog((LOG_TRACE,1,TEXT(_x_)));
+
+void CBaseRenderer::SendRepaint()
+{
+    CAutoLock cSampleLock(&m_RendererLock);
+    ASSERT(m_pInputPin);
+
+    // A repaint notification is held back, in this order of checking, when
+    //    - we have aborted a video playback
+    //    - the input pin is not connected
+    //    - our input pin is being flushed
+    //    - an end of stream has been notified
+    //    - there is a repaint already sent
+
+    if (m_bAbort != FALSE) {
+        return;
+    }
+    if (m_pInputPin->IsConnected() != TRUE) {
+        return;
+    }
+    if (m_pInputPin->IsFlushing() != FALSE) {
+        return;
+    }
+    if (IsEndOfStream() != FALSE) {
+        return;
+    }
+    if (m_bRepaintStatus != TRUE) {
+        return;
+    }
+
+    // Send the event with our input pin as its parameter and clear the
+    // flag so that no second repaint goes out until it is set again
+
+    IPin *pPin = (IPin *) m_pInputPin;
+    NotifyEvent(EC_REPAINT,(LONG_PTR) pPin,0);
+    SetRepaintStatus(FALSE);
+    RLOG("Sending repaint");
+}
+
+
+// When a video window detects a display change (WM_DISPLAYCHANGE message) it
+// can send an EC_DISPLAY_CHANGED event code along with the renderer pin. The
+// filtergraph will stop everyone and reconnect our input pin. As we're then
+// reconnected we can accept the media type that matches the new display mode
+// since we may no longer be able to draw the current image type efficiently
+
+BOOL CBaseRenderer::OnDisplayChange()
+{
+    // Returns FALSE if the input pin is not connected. Otherwise sends
+    // EC_DISPLAY_CHANGED, sets the abort signal, drops the pending sample
+    // and returns TRUE
+
+    // Ignore if we are not connected yet
+
+    CAutoLock cSampleLock(&m_RendererLock);
+    if (m_pInputPin->IsConnected() == FALSE) {
+        return FALSE;
+    }
+
+    RLOG("Notification of EC_DISPLAY_CHANGE");
+
+    // Pass our input pin as parameter on the event
+    // (the pin is AddRef'd for the duration of the calls below and
+    // released again before returning)
+
+    IPin *pPin = (IPin *) m_pInputPin;
+    m_pInputPin->AddRef();
+    NotifyEvent(EC_DISPLAY_CHANGED,(LONG_PTR) pPin,0);
+    SetAbortSignal(TRUE);
+    ClearPendingSample();
+    m_pInputPin->Release();
+
+    return TRUE;
+}
+
+
+// Called just before we start drawing.
+// Store the current time in m_trRenderStart to allow the rendering time to be
+// logged.  Log the time stamp of the sample and how late it is (neg is early)
+
+void CBaseRenderer::OnRenderStart(IMediaSample *pMediaSample)
+{
+    // Only does anything when built with PERF defined; otherwise empty
+#ifdef PERF
+    REFERENCE_TIME trStart, trEnd;
+    pMediaSample->GetTime(&trStart, &trEnd);
+
+    MSR_INTEGER(m_idBaseStamp, (int)trStart);     // dump low order 32 bits
+
+    // NOTE(review): m_pClock is used without a NULL check here although
+    // other code in this file allows for it being NULL - confirm that
+    // PERF builds always run with a clock
+    m_pClock->GetTime(&m_trRenderStart);
+    MSR_INTEGER(0, (int)m_trRenderStart);
+    REFERENCE_TIME trStream;
+    trStream = m_trRenderStart-m_tStart;     // convert reftime to stream time
+    MSR_INTEGER(0,(int)trStream);
+
+    // Positive means the sample is late, negative means it is early
+    const int trLate = (int)(trStream - trStart);
+    MSR_INTEGER(m_idBaseAccuracy, trLate/10000);  // dump in mSec
+#endif
+
+} // OnRenderStart
+
+
+// Called directly after drawing an image.
+// calculate the time spent drawing and log it.
+
+void CBaseRenderer::OnRenderEnd(IMediaSample *pMediaSample)
+{
+    // Only does anything when built with PERF defined; pMediaSample is
+    // not used. Logs the time elapsed since m_trRenderStart was recorded
+#ifdef PERF
+    REFERENCE_TIME trNow;
+    m_pClock->GetTime(&trNow);
+    MSR_INTEGER(0,(int)trNow);
+    int t = (int)((trNow - m_trRenderStart)/10000);   // convert UNITS->msec
+    MSR_INTEGER(m_idBaseRenderTime, t);
+#endif
+} // OnRenderEnd
+
+
+
+
+// Constructor must be passed the base renderer object
+
+CRendererInputPin::CRendererInputPin(CBaseRenderer *pRenderer,
+                                     HRESULT *phr,
+                                     LPCWSTR pPinName)
+    : CBaseInputPin(NAME("Renderer pin"),           // Debug object name
+                    pRenderer,                      // Owning filter
+                    &pRenderer->m_InterfaceLock,    // Lock shared with it
+                    phr,                            // Construction result
+                    pPinName)                       // Name of this pin
+    , m_pRenderer(pRenderer)
+{
+    // The pin is of no use without the renderer it forwards calls to
+    ASSERT(m_pRenderer);
+}
+
+
+// Signals end of data stream on the input pin
+
+STDMETHODIMP CRendererInputPin::EndOfStream()
+{
+    // Interface lock first, then the renderer lock
+    CAutoLock cRendererLock(&m_pRenderer->m_InterfaceLock);
+    CAutoLock cSampleLock(&m_pRenderer->m_RendererLock);
+
+    // Anything other than NOERROR from the streaming check is handed
+    // straight back to the caller
+
+    const HRESULT hrStreaming = CheckStreaming();
+    if (hrStreaming != NOERROR) {
+        return hrStreaming;
+    }
+
+    // Let the renderer see the end of stream; the base pin is only told
+    // about it when the renderer accepted it
+
+    const HRESULT hrRenderer = m_pRenderer->EndOfStream();
+    if (FAILED(hrRenderer)) {
+        return hrRenderer;
+    }
+    return CBaseInputPin::EndOfStream();
+}
+
+
+// Signals start of flushing on the input pin - we do the final reset end of
+// stream with the renderer lock unlocked but with the interface lock locked
+// We must do this because we call timeKillEvent, our timer callback method
+// has to take the renderer lock to serialise our state. Therefore holding a
+// renderer lock when calling timeKillEvent could cause a deadlock condition
+
+STDMETHODIMP CRendererInputPin::BeginFlush()
+{
+    // The interface lock is held for the whole call
+    CAutoLock cRendererLock(&m_pRenderer->m_InterfaceLock);
+    {
+        // The renderer lock is only held inside this scope so that it has
+        // been released again before ResetEndOfStream is called below
+        CAutoLock cSampleLock(&m_pRenderer->m_RendererLock);
+        CBaseInputPin::BeginFlush();
+        m_pRenderer->BeginFlush();
+    }
+    return m_pRenderer->ResetEndOfStream();
+}
+
+
+// Signals end of flushing on the input pin
+
+STDMETHODIMP CRendererInputPin::EndFlush()
+{
+    // Interface lock first, then the renderer lock
+    CAutoLock cRendererLock(&m_pRenderer->m_InterfaceLock);
+    CAutoLock cSampleLock(&m_pRenderer->m_RendererLock);
+
+    // The renderer finishes its flush first; the base pin is only told
+    // about it when that succeeded
+    const HRESULT hrRenderer = m_pRenderer->EndFlush();
+    if (FAILED(hrRenderer)) {
+        return hrRenderer;
+    }
+    return CBaseInputPin::EndFlush();
+}
+
+
+// Pass the sample straight through to the renderer object
+
+STDMETHODIMP CRendererInputPin::Receive(IMediaSample *pSample)
+{
+    // Forwards the sample to the renderer and returns its result. When the
+    // renderer fails outside of stopping, flushing or aborting, and no
+    // error has been reported yet, EC_ERRORABORT is sent once and the
+    // run time error flag is set
+
+    HRESULT hr = m_pRenderer->Receive(pSample);
+    if (FAILED(hr)) {
+
+        // A deadlock could occur if the caller holds the renderer lock and
+        // attempts to acquire the interface lock.
+        ASSERT(CritCheckOut(&m_pRenderer->m_RendererLock));
+
+        {
+            // The interface lock must be held when the filter is calling
+            // IsStopped() or IsFlushing().  The interface lock must also
+            // be held because the function uses m_bRunTimeError.
+            CAutoLock cRendererLock(&m_pRenderer->m_InterfaceLock);
+
+            // We do not report errors which occur while the filter is stopping,
+            // flushing or if the m_bAbort flag is set .  Errors are expected to
+            // occur during these operations and the streaming thread correctly
+            // handles the errors.
+            if (!IsStopped() && !IsFlushing() && !m_pRenderer->m_bAbort && !m_bRunTimeError) {
+
+                // EC_ERRORABORT's first parameter is the error which caused
+                // the event and its last parameter is 0.  See the Direct
+                // Show SDK documentation for more information.
+                m_pRenderer->NotifyEvent(EC_ERRORABORT,hr,0);
+
+                {
+                    // Also deliver end of stream if we are streaming and it
+                    // has not been delivered yet
+                    CAutoLock alRendererLock(&m_pRenderer->m_RendererLock);
+                    if (m_pRenderer->IsStreaming() && !m_pRenderer->IsEndOfStreamDelivered()) {
+                        m_pRenderer->NotifyEndOfStream();
+                    }
+                }
+
+                // Remember that the error has been reported so that later
+                // failures do not report it again
+                m_bRunTimeError = TRUE;
+            }
+        }
+    }
+
+    return hr;
+}
+
+
+// Disconnect handling: the renderer is notified first, and the base class
+// BreakConnect only runs when the renderer did not report a failure.
+HRESULT CRendererInputPin::BreakConnect()
+{
+    const HRESULT hrRenderer = m_pRenderer->BreakConnect();
+    if (SUCCEEDED(hrRenderer)) {
+        return CBaseInputPin::BreakConnect();
+    }
+    return hrRenderer;
+}
+
+
+// Connection completed: the renderer is told first, and the base class
+// CompleteConnect is chained to only if that call succeeded.
+HRESULT CRendererInputPin::CompleteConnect(IPin *pReceivePin)
+{
+    const HRESULT hrRenderer = m_pRenderer->CompleteConnect(pReceivePin);
+    if (SUCCEEDED(hrRenderer)) {
+        return CBaseInputPin::CompleteConnect(pReceivePin);
+    }
+    return hrRenderer;
+}
+
+
+// Returns the identifier "In" for this pin in a CoTaskMemAlloc'd buffer that
+// is handed to the caller through *Id (E_POINTER if Id itself is NULL).
+STDMETHODIMP CRendererInputPin::QueryId(LPWSTR *Id)
+{
+    CheckPointer(Id,E_POINTER);
+    const size_t cchId = 4;   // buffer capacity in WCHARs
+    LPWSTR pwszId = (LPWSTR)CoTaskMemAlloc(cchId * sizeof(WCHAR));
+    *Id = pwszId;
+    if (pwszId == NULL) {
+        return E_OUTOFMEMORY;
+    }
+    (void)StringCchCopyW(pwszId, cchId, L"In");
+    return NOERROR;
+}
+
+
+// Media type negotiation is delegated to the renderer; its answer is returned.
+HRESULT CRendererInputPin::CheckMediaType(const CMediaType *pmt)
+{
+    const HRESULT hrAccept = m_pRenderer->CheckMediaType(pmt);
+    return hrAccept;
+}
+
+
+// Forwards the transition to paused or running to the renderer.
+HRESULT CRendererInputPin::Active()
+{
+    const HRESULT hrActive = m_pRenderer->Active();
+    return hrActive;
+}
+
+
+// Stop transition: re-arms run-time error reporting (clears m_bRunTimeError)
+// and then forwards the call to the renderer.
+HRESULT CRendererInputPin::Inactive()
+{
+    // m_bRunTimeError is protected by the interface lock, which the caller
+    // is required to own already.
+    ASSERT(CritCheckIn(&m_pRenderer->m_InterfaceLock));
+    m_bRunTimeError = FALSE;
+
+    const HRESULT hrInactive = m_pRenderer->Inactive();
+    return hrInactive;
+}
+
+
+// Records the agreed media type in the base pin first; the renderer is told
+// about it only when the base class call succeeded.
+HRESULT CRendererInputPin::SetMediaType(const CMediaType *pmt)
+{
+    const HRESULT hrBase = CBaseInputPin::SetMediaType(pmt);
+    if (SUCCEEDED(hrBase)) {
+        return m_pRenderer->SetMediaType(pmt);
+    }
+    return hrBase;
+}
+
+
+// We do not keep an event object to use when setting up a timer link with
+// the clock but are given a pointer to one by the owning object through the
+// SetNotificationObject method - this must be initialised before starting.
+// We can override the default quality management process to have it always
+// draw late frames, this is currently done by having the following registry
+// key (actually an INI key) called DrawLateFrames set to 1 (default is 0)
+// AMQUALITY is the section and DRAWLATEFRAMES the key given to GetProfileInt.
+const TCHAR AMQUALITY[] = TEXT("ActiveMovie");
+const TCHAR DRAWLATEFRAMES[] = TEXT("DrawLateFrames");
+
+CBaseVideoRenderer::CBaseVideoRenderer(
+      REFCLSID RenderClass, // CLSID for this renderer
+      TCHAR *pName,         // Debug ONLY description
+      LPUNKNOWN pUnk,       // Aggregated owner object
+      HRESULT *phr) :       // General OLE return code
+    // All arguments are forwarded to CBaseRenderer; counters start at zero.
+    CBaseRenderer(RenderClass,pName,pUnk,phr),
+    m_cFramesDropped(0),
+    m_cFramesDrawn(0),
+    m_bSupplierHandlingQuality(FALSE)
+{
+    ResetStreamingTimes();
+    // NOTE(review): m_bDrawLateFrames is only set under PERF - confirm intended.
+#ifdef PERF
+    m_idTimeStamp       = MSR_REGISTER(TEXT("Frame time stamp"));
+    m_idEarliness       = MSR_REGISTER(TEXT("Earliness fudge"));
+    m_idTarget          = MSR_REGISTER(TEXT("Target (mSec)"));
+    m_idSchLateTime     = MSR_REGISTER(TEXT("mSec late when scheduled"));
+    m_idDecision        = MSR_REGISTER(TEXT("Scheduler decision code"));
+    m_idQualityRate     = MSR_REGISTER(TEXT("Quality rate sent"));
+    m_idQualityTime     = MSR_REGISTER(TEXT("Quality time sent"));
+    m_idWaitReal        = MSR_REGISTER(TEXT("Render wait"));
+    // m_idWait            = MSR_REGISTER(TEXT("wait time recorded (msec)"));
+    m_idFrameAccuracy   = MSR_REGISTER(TEXT("Frame accuracy (msecs)"));
+    m_bDrawLateFrames = GetProfileInt(AMQUALITY, DRAWLATEFRAMES, FALSE);
+    //m_idSendQuality      = MSR_REGISTER(TEXT("Processing Quality message"));
+    // Ids for the averages and duration logged while scheduling, and the throttle.
+    m_idRenderAvg       = MSR_REGISTER(TEXT("Render draw time Avg"));
+    m_idFrameAvg        = MSR_REGISTER(TEXT("FrameAvg"));
+    m_idWaitAvg         = MSR_REGISTER(TEXT("WaitAvg"));
+    m_idDuration        = MSR_REGISTER(TEXT("Duration"));
+    m_idThrottle        = MSR_REGISTER(TEXT("Audio-video throttle wait"));
+    // m_idDebug           = MSR_REGISTER(TEXT("Debug stuff"));
+#endif // PERF
+} // Constructor
+
+
+// Destructor: performs no cleanup of its own.  The ASSERT only checks that
+// m_dwAdvise is zero by the time the object is destroyed.
+CBaseVideoRenderer::~CBaseVideoRenderer()
+{
+    ASSERT(m_dwAdvise == 0);
+}
+
+
+// The timing functions in this class are called by the window object and by
+// the renderer's allocator.
+// The windows object calls timing functions as it receives media sample
+// images for drawing using GDI.
+// The allocator calls timing functions when it starts passing DCI/DirectDraw
+// surfaces which are not rendered in the same way; The decompressor writes
+// directly to the surface with no separate rendering, so those code paths
+// call direct into us.  Since we only ever hand out DCI/DirectDraw surfaces
+// when we have allocated one and only one image we know there cannot be any
+// conflict between the two.
+//
+// We use timeGetTime to return the timing counts we use (since it's relative
+// performance we are interested in rather than absolute compared to a clock)
+// The window object sets the accuracy of the system clock (normally 1ms) by
+// calling timeBeginPeriod/timeEndPeriod when it changes streaming states
+
+
+// Reset all times controlling streaming.
+// Set them so that
+// 1. Frames will not initially be dropped
+// 2. The first frame will definitely be drawn (achieved by saying that there
+//    has not been a frame drawn for a long time).
+HRESULT CBaseVideoRenderer::ResetStreamingTimes()
+{
+    m_tStreamingStart = timeGetTime();   // wall-clock start of streaming
+    m_tRenderStart = 0;
+    m_trLastDraw = -1000;     // a small negative stream time (-1000 UNITS)
+    // Counters and the sums behind the lateness / inter-frame statistics.
+    m_cFramesDrawn = 0;
+    m_cFramesDropped = 0;
+    m_iTotAcc = 0;
+    m_iSumSqAcc = 0;
+    m_iSumFrameTime = 0;
+    m_iSumSqFrameTime = 0;
+    // Running averages; a negative frame average means "not known yet".
+    m_trRenderAvg = 0;
+    m_trRenderLast = 0;
+    m_trFrameAvg = -1;
+    m_trWaitAvg = 0;
+    m_trDuration = 0;
+    // Scheduling state and the values staged for the performance report.
+    m_nNormal = 0;
+    m_trEarliness = 0;
+    m_trTarget = -300000;     // 30mSec early
+    m_trThrottle = 0;
+    m_trFrame = 0;
+    m_trLate = 0;
+    m_trRememberStampForPerf = 0;
+#ifdef PERF
+    m_trRememberFrameForPerf = 0;
+#endif
+    return NOERROR;
+} // ResetStreamingTimes
+
+
+// Streaming is starting: discard the statistics gathered so far.  The
+// rendering event does not need to be set here to release the source
+// filter - that happens as part of Run processing, which also draws any
+// image that is waiting (possibly already shown once as a poster frame
+// while paused).
+
+HRESULT CBaseVideoRenderer::OnStartStreaming()
+{
+    (void)ResetStreamingTimes();   // its result is not propagated
+    return NOERROR;
+} // OnStartStreaming
+
+
+// End of streaming: m_tStreamingStart now becomes the elapsed streaming time.
+HRESULT CBaseVideoRenderer::OnStopStreaming()
+{
+    const DWORD tNow = timeGetTime();
+    m_tStreamingStart = tNow - m_tStreamingStart;
+    return NOERROR;
+} // OnStopStreaming
+
+
+// Called when we start waiting for a rendering event.  Starts the
+// "Render wait" performance measurement (m_idWaitReal); OnWaitEnd stops it
+// again in PERF builds.
+void CBaseVideoRenderer::OnWaitStart()
+{
+    MSR_START(m_idWaitReal);
+} // OnWaitStart
+
+
+// Called when we are awoken from the wait in the window OR by our allocator
+// when it is hanging around until the next sample is due for rendering on a
+// DCI/DirectDraw surface.  Only PERF builds do any work here: they stop the
+// "Render wait" measurement and stage the real lateness and inter-frame
+// time of the frame for the statistics.
+//
+// NOTE(review): this comment used to say the interface lock is grabbed
+// here, but no lock is taken in this function - confirm how it is
+// serialised against ResetStreamingTimes.
+
+void CBaseVideoRenderer::OnWaitEnd()
+{
+#ifdef PERF
+    MSR_STOP(m_idWaitReal);
+
+    // For a perf build we want to know just exactly how late we REALLY are.
+    // The reference clock is not consulted (calling it here caused a W95
+    // deadlock); instead timeGetTime() is combined with the offset cached in
+    // m_llTimeOffset.  Overflows are knowingly discarded and timeGetTime()
+    // can wrap, which is accepted because this is only for PERF.
+    const REFERENCE_TIME trNow = timeGetTime()*10000;
+    REFERENCE_TIME trStreamNow = trNow + m_llTimeOffset;
+    trStreamNow -= m_tStart;     // convert to stream time (this is a reftime)
+
+    // A zero stamp is probably the poster frame at the start, which is not
+    // scheduled in the usual way at all: it is recorded with zero figures.
+    // The stamp gets set in ShouldDrawSampleNow, so zeros are recorded
+    // until we actually start playing.
+    int trLateness = 0;
+    int trInterFrame = 0;
+    if (m_trRememberStampForPerf != 0) {
+        trLateness = (int)(trStreamNow - m_trRememberStampForPerf);
+        trInterFrame = (int)(trNow - m_trRememberFrameForPerf);
+    }
+    PreparePerformanceData(trLateness, trInterFrame);
+
+    // Remember this wake-up as the base for the next inter-frame time.
+    m_trRememberFrameForPerf = trNow;
+#endif //PERF
+} // OnWaitEnd
+
+
+// Stages the lateness and inter-frame time of the current frame.  At this
+// point it is not yet known whether the frame will actually be drawn (in
+// direct draw mode that is the upstream filter's decision and it may change
+// its mind).  The staged values are folded into the statistics later, when
+// OnRenderStart or OnDirectRender passes them to RecordFrameLateness.
+//
+void CBaseVideoRenderer::PreparePerformanceData(int trLate, int trFrame)
+{
+    // Stage only; nothing is accumulated here.
+    m_trFrame = trFrame;
+    m_trLate = trLate;
+} // PreparePerformanceData
+
+
+// update the statistics:
+// m_iTotAcc, m_iSumSqAcc, m_iSumSqFrameTime, m_iSumFrameTime, m_cFramesDrawn
+// Note that because the properties page reports using these variables,
+// 1. We need to be inside a critical section
+// 2. They must all be updated together.  Updating the sums here and the count
+// elsewhere can result in imaginary jitter (i.e. attempts to find square roots
+// of negative numbers) in the property page code.
+
+void CBaseVideoRenderer::RecordFrameLateness(int trLate, int trFrame)
+{
+    // Lateness in milliseconds (the argument is divided by 10000).
+    int msLate = trLate/10000;
+
+    // Best estimate of the moment of appearing on the screen is the average
+    // of start and end draw times.  Only the end time is available here, so
+    // we may look spuriously late by up to half a frame time.
+    MSR_INTEGER( m_idFrameAccuracy, msLate );
+
+    // Kludge: very late frames (especially at start-up) would invalidate
+    // the statistics, so anything more than one second off is zeroed for
+    // the first frames and clamped to +/-1000 mSec afterwards.
+    const bool fOutOfRange = (msLate > 1000) || (msLate < -1000);
+    if (fOutOfRange) {
+        if (m_cFramesDrawn <= 1) {
+            msLate = 0;
+        } else {
+            msLate = (msLate > 0) ? 1000 : -1000;
+        }
+    }
+
+    // The very first frame often has an invalid time, so it is left out of
+    // the accuracy sums.
+    if (m_cFramesDrawn > 1) {
+        m_iTotAcc += msLate;
+        m_iSumSqAcc += (msLate*msLate);
+    }
+
+    // Inter-frame time makes no sense for the first frame, and the second
+    // frame suffers from the first frame's invalid stamp.
+    if (m_cFramesDrawn > 2) {
+        // Work in mSec, and treat anything negative or longer than a
+        // second (e.g. after a pause) as exactly one second.
+        int msFrame = trFrame/10000;
+        if (msFrame < 0 || msFrame > 1000) {
+            msFrame = 1000;
+        }
+        m_iSumSqFrameTime += msFrame*msFrame;
+        ASSERT(m_iSumSqFrameTime>=0);
+        m_iSumFrameTime += msFrame;
+    }
+
+    // The frame count is advanced here, together with the sums.
+    ++m_cFramesDrawn;
+} // RecordFrameLateness
+
+// Sleeps for the current throttle period (m_trThrottle), or yields if none.
+void CBaseVideoRenderer::ThrottleWait()
+{
+    if (m_trThrottle <= 0) {
+        Sleep(0);              // no throttling requested: just yield
+        return;
+    }
+    const int msThrottle = m_trThrottle/10000;    // convert to mSec
+    MSR_INTEGER( m_idThrottle, msThrottle);
+    DbgLog((LOG_TRACE, 0, TEXT("Throttle %d ms"), msThrottle));
+    Sleep(msThrottle);
+} // ThrottleWait
+
+
+// Whenever a frame is rendered it goes through either OnRenderStart
+// or OnDirectRender.  Data that are generated during ShouldDrawSample
+// are added to the statistics by calling RecordFrameLateness from both
+// these two places.
+
+// Called in place of OnRenderStart..OnRenderEnd
+// When a DirectDraw image is drawn
+void CBaseVideoRenderer::OnDirectRender(IMediaSample *pMediaSample)
+{
+    // No draw time is averaged in this mode.  The last render time is set
+    // to half a second so that, if we mode switch, it does NOT inhibit the
+    // new average getting going.
+    m_trRenderLast = 5000000;
+    m_trRenderAvg = 0;
+    RecordFrameLateness(m_trLate, m_trFrame);
+    ThrottleWait();
+} // OnDirectRender
+
+
+// Called just before we start drawing.  The lateness data staged for this
+// frame is folded into the statistics, then the current system time is
+// stored in m_tRenderStart because it is not needed until OnRenderEnd
+// works out how long the drawing took.
+
+void CBaseVideoRenderer::OnRenderStart(IMediaSample *pMediaSample)
+{
+    RecordFrameLateness(m_trLate, m_trFrame);
+    m_tRenderStart = timeGetTime();
+} // OnRenderStart
+
+
+// Called directly after drawing an image.  We calculate the time spent in the
+// drawing code and if this doesn't appear to have any odd looking spikes in
+// it then we add it to the current average draw time.  Measurement spikes may
+// occur if the drawing thread is interrupted and switched to somewhere else.
+
+void CBaseVideoRenderer::OnRenderEnd(IMediaSample *pMediaSample)
+{
+    // Draw time for this frame, converted from mSec to UNITS.
+    const int trDraw = (timeGetTime() - m_tRenderStart)*10000;
+    // Interruptions make draw times spiky (e.g. 9,10,9,9,83,9), so a sample
+    // only enters the moving average when it is less than twice the current
+    // average or less than twice the previous draw time.
+    const bool fPlausible = (trDraw < m_trRenderAvg*2) || (trDraw < 2 * m_trRenderLast);
+    if (fPlausible) {
+        m_trRenderAvg = (trDraw + (AVGPERIOD-1)*m_trRenderAvg)/AVGPERIOD;
+    }
+    m_trRenderLast = trDraw;
+    ThrottleWait();
+} // OnRenderEnd
+
+
+STDMETHODIMP CBaseVideoRenderer::SetSink( IQualityControl * piqc)
+{
+    // Remember the sink; the pointer is stored as given, without AddRef.
+    m_pQSink = piqc;
+    // SendQuality notifies this sink when it is non-NULL.
+    return NOERROR;
+} // SetSink
+
+
+STDMETHODIMP CBaseVideoRenderer::Notify( IBaseFilter * pSelf, Quality q)
+{
+    // NOTE:  We are NOT getting any locks here.  We could be called
+    // asynchronously and possibly even on a time critical thread of
+    // someone else's - so we do the minimum.  We only set one state
+    // variable (an integer) and if that happens to be in the middle
+    // of another thread reading it they will just get either the new
+    // or the old value.  Locking would achieve no more than this.
+
+    // It might be nice to check that we are being called from m_pGraph, but
+    // it turns out to be a millisecond or so per throw!
+
+    // This is heuristics, these numbers are aimed at being "what works"
+    // rather than anything based on some theory.
+    // We use a hyperbola because it's easy to calculate and it includes
+    // a panic button asymptote (which we push off just to the left)
+    // The throttling fits the following table (roughly)
+    // Proportion   Throttle (msec)
+    //     >=1000         0
+    //        900         3
+    //        800         7
+    //        700        11
+    //        600        17
+    //        500        25
+    //        400        35
+    //        300        50
+    //        200        72
+    //        125       100
+    //        100       112
+    //         50       146
+    //          0       200
+
+    // (some evidence that we could go for a sharper kink - e.g. no throttling
+    // until below the 750 mark - might give fractionally more frames on a
+    // P60-ish machine).  The easy way to get these coefficients is to use
+    // Renbase.xls follow the instructions therein using excel solver.
+    if (q.Proportion>=1000) { m_trThrottle = 0; }
+    else {
+        // Negative proportions are treated as 0 (bottom of the table): the
+        // hyperbola blows up near -167 and would divide by zero exactly there.
+        const long lProportion = (q.Proportion<0) ? 0 : q.Proportion;
+        m_trThrottle = -330000 + (388880000/(lProportion+167));
+    }
+    return NOERROR;
+} // Notify
+
+
+// Send a message to indicate what our supplier should do about quality.
+// Theory:
+// What a supplier wants to know is "is the frame I'm working on NOW
+// going to be late?".
+// F1 is the frame at the supplier (as above)
+// Tf1 is the due time for F1
+// T1 is the time at that point (NOW!)
+// Tr1 is the time that f1 WILL actually be rendered
+// L1 is the latency of the graph for frame F1 = Tr1-T1
+// D1 (for delay) is how late F1 will be beyond its due time i.e.
+// D1 = (Tr1-Tf1) which is what the supplier really wants to know.
+// Unfortunately Tr1 is in the future and is unknown, so is L1
+//
+// We could estimate L1 by its value for a previous frame,
+// L0 = Tr0-T0 and work off
+// D1' = ((T1+L0)-Tf1) = (T1 + (Tr0-T0) -Tf1)
+// Rearranging terms:
+// D1' = (T1-T0) + (Tr0-Tf1)
+//       adding (Tf0-Tf0) and rearranging again:
+//     = (T1-T0) + (Tr0-Tf0) + (Tf0-Tf1)
+//     = (T1-T0) - (Tf1-Tf0) + (Tr0-Tf0)
+// But (Tr0-Tf0) is just D0 - how late frame zero was, and this is the
+// Late field in the quality message that we send.
+// The other two terms just state what correction should be applied before
+// using the lateness of F0 to predict the lateness of F1.
+// (T1-T0) says how much time has actually passed (we have lost this much)
+// (Tf1-Tf0) says how much time should have passed if we were keeping pace
+// (we have gained this much).
+//
+// Suppliers should therefore work off:
+//    Quality.Late + (T1-T0)  - (Tf1-Tf0)
+// and see if this is "acceptably late" or even early (i.e. negative).
+// They get T1 and T0 by polling the clock, they get Tf1 and Tf0 from
+// the time stamps in the frames.  They get Quality.Late from us.
+//
+
+HRESULT CBaseVideoRenderer::SendQuality(REFERENCE_TIME trLate,
+                                        REFERENCE_TIME trRealStream)
+{
+    // trLate       - how late the current frame is (UNITS, negative if early)
+    // trRealStream - current stream time, sent as the message time stamp
+    // Returns the sink's Notify() result, or S_FALSE if there is no sink.
+    Quality q;
+    HRESULT hr;
+
+    // If we are the main user of time, then report this as Flood/Dry.
+    // If our suppliers are, then report it as Famine/Glut.
+    //
+    // We need to take action, but avoid hunting.  Hunting is caused by
+    // 1. Taking too much action too soon and overshooting
+    // 2. Taking too long to react (so averaging can CAUSE hunting).
+    //
+    // The reason why we use trLate as well as Wait is to reduce hunting;
+    // if the wait time is coming down and about to go into the red, we do
+    // NOT want to rely on some average which is only telling us that it used
+    // to be OK once.
+
+    q.TimeStamp = (REFERENCE_TIME)trRealStream;
+
+    if (m_trFrameAvg<0) {
+        q.Type = Famine;      // guess
+    }
+    // Is the greater part of the time taken bltting or something else
+    else if (m_trFrameAvg > 2*m_trRenderAvg) {
+        q.Type = Famine;                        // mainly other
+    } else {
+        q.Type = Flood;                         // mainly bltting
+    }
+
+    q.Proportion = 1000;               // default
+
+    if (m_trFrameAvg<0) {
+        // leave it alone - we don't know enough
+    }
+    else if ( trLate> 0 ) {
+        // try to catch up over the next second
+        // We could be Really, REALLY late, but rendering all the frames
+        // anyway, just because it's so cheap.
+
+        q.Proportion = 1000 - (int)((trLate)/(UNITS/1000));
+        if (q.Proportion<500) {
+           q.Proportion = 500;      // don't go daft. (could've been negative!)
+        }
+
+    } else if (  m_trWaitAvg>20000
+              && trLate<-20000
+              ){
+        // Go cautiously faster - aim at 2mSec wait.
+        if (m_trWaitAvg>=m_trFrameAvg) {
+            // This can happen because of some fudges.
+            // The waitAvg is how long we originally planned to wait
+            // The frameAvg is more honest.
+            // It means that we are spending a LOT of time waiting
+            q.Proportion = 2000;    // double.
+        } else {
+            if (m_trFrameAvg+20000 > m_trWaitAvg) {
+                // Scale by 1000 BEFORE dividing: dividing first truncates
+                // the ratio to a whole number, so the result could only be
+                // an exact multiple of 1000.  64-bit avoids overflow.
+                const LONGLONG llDivisor = m_trFrameAvg + 20000 - m_trWaitAvg;
+                q.Proportion = (long)(1000 * (LONGLONG)m_trFrameAvg / llDivisor);
+            } else {
+                // We're apparently spending more than the whole frame time waiting.
+                // Assume that the averages are slightly out of kilter, but that we
+                // are indeed doing a lot of waiting.  (This leg probably never
+                // happens, but the code avoids any potential divide by zero).
+                q.Proportion = 2000;
+            }
+        }
+
+        if (q.Proportion>2000) {
+            q.Proportion = 2000;    // don't go crazy.
+        }
+    }
+
+    // Tell the supplier how late frames are when they get rendered
+    // That's how late we are now.
+    // If we are in directdraw mode then the guy upstream can see the drawing
+    // times and we'll just report on the start time.  He can figure out any
+    // offset to apply.  If we are in DIB Section mode then we will apply an
+    // extra offset which is half of our drawing time.  This is usually small
+    // but can sometimes be the dominant effect.  For this we will use the
+    // average drawing time rather than the last frame.  If the last frame took
+    // a long time to draw and made us late, that's already in the lateness
+    // figure.  We should not add it in again unless we expect the next frame
+    // to be the same.  We don't, we expect the average to be a better shot.
+    // In direct draw mode the RenderAvg will be zero.
+
+    q.Late = trLate + m_trRenderAvg/2;
+
+    // log what we're doing
+    MSR_INTEGER(m_idQualityRate, q.Proportion);
+    MSR_INTEGER( m_idQualityTime, (int)q.Late / 10000 );
+
+    // A specific sink interface may be set through IPin
+
+    if (m_pQSink==NULL) {
+        // Get our input pin's peer.  We send quality management messages
+        // to any nominated receiver of these things (set in the IPin
+        // interface), or else to our source filter.
+        IQualityControl *pQC = NULL;
+        IPin *pOutputPin = m_pInputPin->GetConnected();
+        ASSERT(pOutputPin != NULL);
+        // And get an AddRef'd quality control interface
+        hr = pOutputPin->QueryInterface(IID_IQualityControl,(void**) &pQC);
+        if (SUCCEEDED(hr)) {
+            m_pQSink = pQC;
+        }
+    }
+    if (m_pQSink) {
+        return m_pQSink->Notify(this,q);
+    }
+    return S_FALSE;
+} // SendQuality
+
+
+// We are called with a valid IMediaSample image to decide whether this is to
+// be drawn or not.  There must be a reference clock in operation.
+// Return S_OK if it is to be drawn Now (as soon as possible)
+// Return S_FALSE if it is to be drawn when it's due
+// Return an error if we want to drop it
+// m_nNormal=-1 indicates that we dropped the previous frame and so this
+// one should be drawn early.  Respect it and update it.
+// Use current stream time plus a number of heuristics (detailed below)
+// to make the decision
+
+HRESULT CBaseVideoRenderer::ShouldDrawSampleNow(IMediaSample *pMediaSample,
+                                                REFERENCE_TIME *ptrStart,
+                                                REFERENCE_TIME *ptrEnd)
+{
+
+    // Don't call us unless there's a clock interface to synchronise with
+    ASSERT(m_pClock);
+
+    MSR_INTEGER(m_idTimeStamp, (int)((*ptrStart)>>32));   // high order 32 bits
+    MSR_INTEGER(m_idTimeStamp, (int)(*ptrStart));         // low order 32 bits
+
+    // We lose a bit of time depending on the monitor type waiting for the next
+    // screen refresh.  On average this might be about 8mSec - so it will be
+    // later than we think when the picture appears.  To compensate a bit
+    // we bias the media samples by -8mSec i.e. 80000 UNITs.
+    // We don't ever make a stream time negative (call it paranoia)
+    if (*ptrStart>=80000) {
+        *ptrStart -= 80000;
+        *ptrEnd -= 80000;       // bias stop to to retain valid frame duration
+    }
+
+    // Cache the time stamp now.  We will want to compare what we did with what
+    // we started with (after making the monitor allowance).
+    m_trRememberStampForPerf = *ptrStart;
+
+    // Get reference times (current and late)
+    REFERENCE_TIME trRealStream;     // the real time now expressed as stream time.
+    m_pClock->GetTime(&trRealStream);
+#ifdef PERF
+    // While the reference clock is expensive:
+    // Remember the offset from timeGetTime and use that.
+    // This overflows all over the place, but when we subtract to get
+    // differences the overflows all cancel out.
+    m_llTimeOffset = trRealStream-timeGetTime()*10000;
+#endif
+    trRealStream -= m_tStart;     // convert to stream time (this is a reftime)
+
+    // We have to wory about two versions of "lateness".  The truth, which we
+    // try to work out here and the one measured against m_trTarget which
+    // includes long term feedback.  We report statistics against the truth
+    // but for operational decisions we work to the target.
+    // We use TimeDiff to make sure we get an integer because we
+    // may actually be late (or more likely early if there is a big time
+    // gap) by a very long time.
+    const int trTrueLate = TimeDiff(trRealStream - *ptrStart);
+    const int trLate = trTrueLate;
+
+    MSR_INTEGER(m_idSchLateTime, trTrueLate/10000);
+
+    // Send quality control messages upstream, measured against target
+    HRESULT hr = SendQuality(trLate, trRealStream);
+    // Note: the filter upstream is allowed to this FAIL meaning "you do it".
+    m_bSupplierHandlingQuality = (hr==S_OK);
+
+    // Decision time!  Do we drop, draw when ready or draw immediately?
+
+    const int trDuration = (int)(*ptrEnd - *ptrStart);
+    {
+        // We need to see if the frame rate of the file has just changed.
+        // This would make comparing our previous frame rate with the current
+        // frame rate inefficent.  Hang on a moment though.  I've seen files
+        // where the frames vary between 33 and 34 mSec so as to average
+        // 30fps.  A minor variation like that won't hurt us.
+        int t = m_trDuration/32;
+        if (  trDuration > m_trDuration+t
+           || trDuration < m_trDuration-t
+           ) {
+            // There's a major variation.  Reset the average frame rate to
+            // exactly the current rate to disable decision 9002 for this frame,
+            // and remember the new rate.
+            m_trFrameAvg = trDuration;
+            m_trDuration = trDuration;
+        }
+    }
+
+    MSR_INTEGER(m_idEarliness, m_trEarliness/10000);
+    MSR_INTEGER(m_idRenderAvg, m_trRenderAvg/10000);
+    MSR_INTEGER(m_idFrameAvg, m_trFrameAvg/10000);
+    MSR_INTEGER(m_idWaitAvg, m_trWaitAvg/10000);
+    MSR_INTEGER(m_idDuration, trDuration/10000);
+
+#ifdef PERF
+    if (S_OK==pMediaSample->IsDiscontinuity()) {
+        MSR_INTEGER(m_idDecision, 9000);
+    }
+#endif
+
+    // Control the graceful slide back from slow to fast machine mode.
+    // After a frame drop accept an early frame and set the earliness to here
+    // If this frame is already later than the earliness then slide it to here
+    // otherwise do the standard slide (reduce by about 12% per frame).
+    // Note: earliness is normally NEGATIVE
+    BOOL bJustDroppedFrame
+        = (  m_bSupplierHandlingQuality
+          //  Can't use the pin sample properties because we might
+          //  not be in Receive when we call this
+          && (S_OK == pMediaSample->IsDiscontinuity())          // he just dropped one
+          )
+       || (m_nNormal==-1);                          // we just dropped one
+
+
+    // Set m_trEarliness (slide back from slow to fast machine mode)
+    if (trLate>0) {
+        m_trEarliness = 0;   // we are no longer in fast machine mode at all!
+    } else if (  (trLate>=m_trEarliness) || bJustDroppedFrame) {
+        m_trEarliness = trLate;  // Things have slipped of their own accord
+    } else {
+        m_trEarliness = m_trEarliness - m_trEarliness/8;  // graceful slide
+    }
+
+    // prepare the new wait average - but don't pollute the old one until
+    // we have finished with it.
+    int trWaitAvg;
+    {
+        // We never mix in a negative wait.  This causes us to believe in fast machines
+        // slightly more.
+        int trL = trLate<0 ? -trLate : 0;
+        trWaitAvg = (trL + m_trWaitAvg*(AVGPERIOD-1))/AVGPERIOD;
+    }
+
+
+    int trFrame;
+    {
+        REFERENCE_TIME tr = trRealStream - m_trLastDraw; // Cd be large - 4 min pause!
+        if (tr>10000000) {
+            tr = 10000000;   // 1 second - arbitrarily.
+        }
+        trFrame = int(tr);
+    }
+
+    // We will DRAW this frame IF...
+    if (
+          // ...the time we are spending drawing is a small fraction of the total
+          // observed inter-frame time so that dropping it won't help much.
+          (3*m_trRenderAvg <= m_trFrameAvg)
+
+         // ...or our supplier is NOT handling things and the next frame would
+         // be less timely than this one or our supplier CLAIMS to be handling
+         // things, and is now less than a full FOUR frames late.
+       || ( m_bSupplierHandlingQuality
+          ? (trLate <= trDuration*4)
+          : (trLate+trLate < trDuration)
+          )
+
+          // ...or we are on average waiting for over eight milliseconds then
+          // this may be just a glitch.  Draw it and we'll hope to catch up.
+       || (m_trWaitAvg > 80000)
+
+          // ...or we haven't drawn an image for over a second.  We will update
+          // the display, which stops the video looking hung.
+          // Do this regardless of how late this media sample is.
+       || ((trRealStream - m_trLastDraw) > UNITS)
+
+    ) {
+        HRESULT Result;
+
+        // We are going to play this frame.  We may want to play it early.
+        // We will play it early if we think we are in slow machine mode.
+        // If we think we are NOT in slow machine mode, we will still play
+        // it early by m_trEarliness as this controls the graceful slide back.
+        // and in addition we aim at being m_trTarget late rather than "on time".
+
+        BOOL bPlayASAP = FALSE;
+
+        // we will play it AT ONCE (slow machine mode) if...
+
+            // ...we are playing catch-up
+        if ( bJustDroppedFrame) {
+            bPlayASAP = TRUE;
+            MSR_INTEGER(m_idDecision, 9001);
+        }
+
+            // ...or if we are running below the true frame rate
+            // exact comparisons are glitchy, for these measurements,
+            // so add an extra 5% or so
+        else if (  (m_trFrameAvg > trDuration + trDuration/16)
+
+                   // It's possible to get into a state where we are losing ground, but
+                   // are a very long way ahead.  To avoid this or recover from it
+                   // we refuse to play early by more than 10 frames.
+                && (trLate > - trDuration*10)
+                ){
+            bPlayASAP = TRUE;
+            MSR_INTEGER(m_idDecision, 9002);
+        }
+#if 0
+            // ...or if we have been late and are less than one frame early
+        else if (  (trLate + trDuration > 0)
+                && (m_trWaitAvg<=20000)
+                ) {
+            bPlayASAP = TRUE;
+            MSR_INTEGER(m_idDecision, 9003);
+        }
+#endif
+        // We will NOT play it at once if we are grossly early.  On very slow frame
+        // rate movies - e.g. clock.avi - it is not a good idea to leap ahead just
+        // because we got starved (for instance by the net) and dropped one frame
+        // some time or other.  If we are more than 900mSec early, then wait.
+        if (trLate<-9000000) {
+            bPlayASAP = FALSE;
+        }
+
+        if (bPlayASAP) {
+
+            m_nNormal = 0;
+            MSR_INTEGER(m_idDecision, 0);
+            // When we are here, we are in slow-machine mode.  trLate may well
+            // oscillate between negative and positive when the supplier is
+            // dropping frames to keep sync.  We should not let that mislead
+            // us into thinking that we have as much as zero spare time!
+            // We just update with a zero wait.
+            m_trWaitAvg = (m_trWaitAvg*(AVGPERIOD-1))/AVGPERIOD;
+
+            // Assume that we draw it immediately.  Update inter-frame stats
+            m_trFrameAvg = (trFrame + m_trFrameAvg*(AVGPERIOD-1))/AVGPERIOD;
+#ifndef PERF
+            // If this is NOT a perf build, then report what we know so far
+            // without looking at the clock any more.  This assumes that we
+            // actually wait for exactly the time we hope to.  It also reports
+            // how close we get to the manipulated time stamps that we now have
+            // rather than the ones we originally started with.  It will
+            // therefore be a little optimistic.  However it's fast.
+            PreparePerformanceData(trTrueLate, trFrame);
+#endif
+            m_trLastDraw = trRealStream;
+            if (m_trEarliness > trLate) {
+                m_trEarliness = trLate;  // if we are actually early, this is neg
+            }
+            Result = S_OK;                   // Draw it now
+
+        } else {
+            ++m_nNormal;
+            // Set the average frame rate to EXACTLY the ideal rate.
+            // If we are exiting slow-machine mode then we will have caught up
+            // and be running ahead, so as we slide back to exact timing we will
+            // have a longer than usual gap at this point.  If we record this
+            // real gap then we'll think that we're running slow and go back
+            // into slow-machine mode and never get it straight.
+            m_trFrameAvg = trDuration;
+            MSR_INTEGER(m_idDecision, 1);
+
+            // Play it early by m_trEarliness and by m_trTarget
+
+            {
+                int trE = m_trEarliness;
+                if (trE < -m_trFrameAvg) {
+                    trE = -m_trFrameAvg;
+                }
+                *ptrStart += trE;           // N.B. earliness is negative
+            }
+
+            int Delay = -trTrueLate;
+            Result = Delay<=0 ? S_OK : S_FALSE;     // OK = draw now, FALSE = wait
+
+            m_trWaitAvg = trWaitAvg;
+
+            // Predict when it will actually be drawn and update frame stats
+
+            if (Result==S_FALSE) {   // We are going to wait
+                trFrame = TimeDiff(*ptrStart-m_trLastDraw);
+                m_trLastDraw = *ptrStart;
+            } else {
+                // trFrame is already = trRealStream-m_trLastDraw;
+                m_trLastDraw = trRealStream;
+            }
+#ifndef PERF
+            int iAccuracy;
+            if (Delay>0) {
+                // Report lateness based on when we intend to play it
+                iAccuracy = TimeDiff(*ptrStart-m_trRememberStampForPerf);
+            } else {
+                // Report lateness based on playing it *now*.
+                iAccuracy = trTrueLate;     // trRealStream-RememberStampForPerf;
+            }
+            PreparePerformanceData(iAccuracy, trFrame);
+#endif
+        }
+        return Result;
+    }
+
+    // We are going to drop this frame!
+    // Of course in DirectDraw mode the guy upstream may draw it anyway.
+
+    // This will probably give a large negative wack to the wait avg.
+    m_trWaitAvg = trWaitAvg;
+
+#ifdef PERF
+    // Respect registry setting - debug only!
+    if (m_bDrawLateFrames) {
+       return S_OK;                        // draw it when it's ready
+    }                                      // even though it's late.
+#endif
+
+    // We are going to drop this frame so draw the next one early
+    // n.b. if the supplier is doing direct draw then he may draw it anyway
+    // but he's doing something funny to arrive here in that case.
+
+    MSR_INTEGER(m_idDecision, 2);
+    m_nNormal = -1;
+    return E_FAIL;                         // drop it
+
+} // ShouldDrawSampleNow
+
+
+// NOTE we're called by both the window thread and the source filter thread
+// so we have to be protected by a critical section (locked before called)
+// Also, when the window thread gets signalled to render an image, it always
+// does so regardless of how late it is. All the degradation is done when we
+// are scheduling the next sample to be drawn. Hence when we start an advise
+// link to draw a sample, that sample's time will always become the last one
+// drawn - unless of course we stop streaming in which case we cancel links
+
+BOOL CBaseVideoRenderer::ScheduleSample(IMediaSample *pMediaSample)
+{
+    // The scheduling decision itself is delegated to the base renderer
+    // (quality management is added by overriding ShouldDrawSampleNow).
+    // This override only layers the dropped-frame bookkeeping on top.
+    const BOOL bScheduled = CBaseRenderer::ScheduleSample(pMediaSample);
+
+    if (bScheduled != FALSE) {
+        // m_cFramesDrawn is deliberately NOT touched here: it has to be
+        // updated in RecordFrameLateness together with the other statistics.
+        return TRUE;
+    }
+
+    // The base class declined to schedule the sample - count it as dropped.
+    ++m_cFramesDropped;
+    return FALSE;
+}
+
+
+// Implementation of IQualProp interface needed to support the property page
+// This is how the property page gets the data out of the scheduler. We are
+// passed into the constructor the owning object in the COM sense, this will
+// either be the video renderer or an external IUnknown if we're aggregated.
+// We initialise our CUnknown base class with this interface pointer. Then
+// all we have to do is to override NonDelegatingQueryInterface to expose
+// our IQualProp interface. The AddRef and Release are handled automatically
+// by the base class and will be passed on to the appropriate outer object
+
+STDMETHODIMP CBaseVideoRenderer::get_FramesDroppedInRenderer(int *pcFramesDropped)
+{
+    // IQualProp: hand back the number of frames dropped in this renderer.
+    // The out pointer is checked first (E_POINTER is the failure code given
+    // to CheckPointer); the counter is then read under the interface lock.
+    CheckPointer(pcFramesDropped,E_POINTER);
+
+    CAutoLock cVideoLock(&m_InterfaceLock);
+    const int cDropped = m_cFramesDropped;
+    *pcFramesDropped = cDropped;
+    return NOERROR;
+} // get_FramesDroppedInRenderer
+
+
+// Set *pcFramesDrawn to the number of frames drawn since
+// streaming started.
+
+STDMETHODIMP CBaseVideoRenderer::get_FramesDrawn( int *pcFramesDrawn)
+{
+    // IQualProp: hand back the drawn-frame counter.  The out pointer is
+    // checked first (E_POINTER is the failure code given to CheckPointer);
+    // the counter is then read under the interface lock.
+    CheckPointer(pcFramesDrawn,E_POINTER);
+
+    CAutoLock cVideoLock(&m_InterfaceLock);
+    const int cDrawn = m_cFramesDrawn;
+    *pcFramesDrawn = cDrawn;
+    return NOERROR;
+} // get_FramesDrawn
+
+
+// Set iAvgFrameRate to the frames per hundred secs since
+// streaming started.  0 otherwise.
+
+STDMETHODIMP CBaseVideoRenderer::get_AvgFrameRate( int *piAvgFrameRate)
+{
+    CheckPointer(piAvgFrameRate,E_POINTER);
+    CAutoLock cVideoLock(&m_InterfaceLock);
+
+    // Elapsed streaming time in mSec.  While streaming, m_tStreamingStart is
+    // treated as a start stamp and subtracted from the current timeGetTime();
+    // otherwise its value is used directly as the elapsed time.
+    int tElapsed = m_tStreamingStart;
+    if (m_bStreaming) {
+        tElapsed = timeGetTime()-m_tStreamingStart;
+    }
+
+    if (tElapsed > 0) {
+        // frames * 100000 / mSec  ==  frames per hundred seconds
+        *piAvgFrameRate = MulDiv(100000, m_cFramesDrawn, tElapsed);
+    } else {
+        // No usable elapsed time: report zero (and expect nothing drawn).
+        *piAvgFrameRate = 0;
+        ASSERT(m_cFramesDrawn == 0);
+    }
+    return NOERROR;
+} // get_AvgFrameRate
+
+
+// Set *piAvg to the average sync offset since streaming started
+// in mSec.  The sync offset is the time in mSec between when the frame
+// should have been drawn and when the frame was actually drawn.
+
+STDMETHODIMP CBaseVideoRenderer::get_AvgSyncOffset( int *piAvg)
+{
+    CheckPointer(piAvg,E_POINTER);
+    CAutoLock cVideoLock(&m_InterfaceLock);
+
+    // Report zero when there is no clock, or when fewer than two frames
+    // have been drawn: no stats are gathered for the first frame, so the
+    // divisor below is m_cFramesDrawn-1 and must be at least one.
+    if (NULL==m_pClock || m_cFramesDrawn<=1) {
+        *piAvg = 0;
+        return NOERROR;
+    }
+
+    // Mean of the accumulated per-frame accuracies.
+    *piAvg = (int)(m_iTotAcc / (m_cFramesDrawn-1));
+    return NOERROR;
+} // get_AvgSyncOffset
+
+
+// To avoid dragging in the maths library - a cheap
+// approximate integer square root.
+// We do this by getting a starting guess which is between 1
+// and 2 times too large, followed by THREE iterations of
+// Newton Raphson.  (That will give accuracy to the nearest mSec
+// for the range in question - roughly 0..1000)
+//
+// It would be faster to use a linear interpolation and ONE NR, but
+// who cares.  If anyone does - the best linear interpolation is
+// to approximate sqrt(x) by
+// y = x * (sqrt(2)-1) + 1 - 1/sqrt(2) + 1/(8*(sqrt(2)-1))
+// or y = x*0.41421 + 0.59467
+// This minimises the maximal error in the range in question.
+// (error is about +0.008883 and then one NR will give error .0000something
+// (Of course these are integers, so you can't just multiply by 0.41421
+// you'd have to do some sort of MulDiv).
+// Anyone wanna check my maths?  (This is only for a property display!)
+
+// Cheap approximate integer square root (used only for property display).
+//
+//   x        value to take the root of
+//   returns  0 for x <= 0, otherwise an approximation of sqrt(x)
+//
+// Method: start from a power of two that is between 1 and 2 times the true
+// root, then apply three Newton-Raphson steps.
+//
+// The intermediates are 64 bit on purpose.  With plain int arithmetic the
+// term s*s+x overflows for x close to 2^30, and negative inputs such as -1
+// or -2 drive the first step to s == 0, which then divides by zero.
+// Callers narrow a 64-bit value to LONG before calling, so negative inputs
+// are reachable and are answered with 0.
+int isqrt(int x)
+{
+    if (x <= 0) {
+        return 0;   // also keeps every division below away from zero
+    }
+
+    // Initial guess: smallest power of two whose square is >= x.
+    // 64-bit s means s*s cannot overflow for any positive int x.
+    long long s = 1;
+    while (s*s < x) {
+        s = 2*s;
+    }
+
+    // Three NR iterations.  For x >= 1 and s >= 1 the quotient
+    // (s*s+x)/(2*s) is always >= 1, so s never reaches zero.
+    for (int i = 0; i < 3; ++i) {
+        s = (s*s + x) / (2*s);
+    }
+    return (int)s;
+}
+
+//
+//  Do estimates for standard deviations for per-frame
+//  statistics
+//
+// Estimate a standard deviation from running totals.
+//
+//   nSamples  number of observations N behind the two totals
+//   piResult  receives the estimate; E_POINTER is the failure code given
+//             to CheckPointer for this argument
+//   llSumSq   S, the sum of the squares of the observations
+//   iTot      T, the sum of the observations
+//
+// *piResult is set to 0 when there is no clock or when nSamples <= 1.
+HRESULT CBaseVideoRenderer::GetStdDev(
+    int nSamples,
+    int *piResult,
+    LONGLONG llSumSq,
+    LONGLONG iTot
+)
+{
+    CheckPointer(piResult,E_POINTER);
+    CAutoLock cVideoLock(&m_InterfaceLock);
+
+    if (NULL==m_pClock) {
+        *piResult = 0;
+        return NOERROR;
+    }
+
+    // If S is the Sum of the Squares of observations and
+    //    T the Total (i.e. sum) of the observations and there were
+    //    N observations, then an estimate of the standard deviation is
+    //      sqrt( (S - T**2/N) / (N-1) )
+
+    if (nSamples<=1) {
+        *piResult = 0;
+    } else {
+        LONGLONG x;
+        // N is whatever the caller passes in nSamples; the callers have
+        // already discounted the leading frames that carry no valid stats.
+        // NOTE(review): llMulDiv is presumably (iTot*iTot)/nSamples with a
+        // rounding term of 0 - confirm against its definition.
+        x = llSumSq - llMulDiv(iTot, iTot, nSamples, 0);
+        x = x / (nSamples-1);
+        ASSERT(x>=0);
+
+        // isqrt takes a 32-bit value.  Clamp before narrowing so that a
+        // large (or unexpectedly negative) 64-bit variance cannot wrap
+        // into an arbitrary or negative LONG.
+        if (x < 0) {
+            x = 0;
+        } else if (x > 0x7FFFFFFF) {
+            x = 0x7FFFFFFF;
+        }
+        *piResult = isqrt((LONG)x);
+    }
+    return NOERROR;
+}
+
+// Set *piDev to the standard deviation in mSec of the sync offset
+// of each frame since streaming started.
+
+STDMETHODIMP CBaseVideoRenderer::get_DevSyncOffset( int *piDev)
+{
+    // The first frame has an invalid stamp and yields no datum, so two
+    // frames are needed for one observation: N is m_cFramesDrawn-1.
+    const int nObservations = m_cFramesDrawn - 1;
+
+    return GetStdDev(nObservations, piDev, m_iSumSqAcc, m_iTotAcc);
+} // get_DevSyncOffset
+
+
+// Set *piJitter to the standard deviation in mSec of the inter-frame time
+// of frames since streaming started.
+
+STDMETHODIMP CBaseVideoRenderer::get_Jitter( int *piJitter)
+{
+    // The first frame has an invalid stamp, which in turn makes the second
+    // frame's inter-frame time invalid.  Three frames are therefore needed
+    // for one observation: N is m_cFramesDrawn-2.
+    const int nObservations = m_cFramesDrawn - 2;
+
+    return GetStdDev(nObservations, piJitter, m_iSumSqFrameTime, m_iSumFrameTime);
+} // get_Jitter
+
+
+// Overridden to return our IQualProp interface
+
+STDMETHODIMP
+CBaseVideoRenderer::NonDelegatingQueryInterface(REFIID riid,VOID **ppv)
+{
+    // Two interfaces are answered here; every other request is passed
+    // on to the base renderer.
+
+    if (riid == IID_IQualProp) {
+        IQualProp *const pQualProp = (IQualProp *)this;
+        return GetInterface(pQualProp, ppv);
+    }
+
+    if (riid == IID_IQualityControl) {
+        IQualityControl *const pQualityControl = (IQualityControl *)this;
+        return GetInterface(pQualityControl, ppv);
+    }
+
+    return CBaseRenderer::NonDelegatingQueryInterface(riid,ppv);
+}
+
+
+// Override JoinFilterGraph so that, just before leaving
+// the graph we can send an EC_WINDOW_DESTROYED event
+
+STDMETHODIMP
+CBaseVideoRenderer::JoinFilterGraph(IFilterGraph *pGraph,LPCWSTR pName)
+{
+    // Since we send EC_ACTIVATE, we also need to ensure
+    // we send EC_WINDOW_DESTROYED or the resource manager may be
+    // holding us as a focus object
+    if (!pGraph && m_pGraph) {
+
+        // We were in a graph and now we're not
+        // Do this properly in case we are aggregated
+        //
+        // pFilter starts out NULL and the QueryInterface result is checked:
+        // if the query fails there is no valid pointer to pass to
+        // NotifyEvent or to Release, so the notification is skipped rather
+        // than dereferencing an uninitialised pointer.
+        IBaseFilter* pFilter = NULL;
+        HRESULT hr = QueryInterface(IID_IBaseFilter,(void **) &pFilter);
+        if (SUCCEEDED(hr) && pFilter != NULL) {
+            NotifyEvent(EC_WINDOW_DESTROYED, (LPARAM) pFilter, 0);
+            pFilter->Release();
+        }
+    }
+
+    // Joining or leaving the graph itself is handled by CBaseFilter.
+    return CBaseFilter::JoinFilterGraph(pGraph, pName);
+}
+
+
+// This removes a large number of level 4 warnings from the
+// Microsoft compiler which in this case are not very useful
+#pragma warning(disable: 4514)
+
diff --git a/plugins/GSdx_legacy/baseclasses/renbase.h b/plugins/GSdx_legacy/baseclasses/renbase.h
new file mode 100644
index 0000000000..5352e873b4
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/renbase.h
@@ -0,0 +1,478 @@
+//------------------------------------------------------------------------------
+// File: RenBase.h
+//
+// Desc: DirectShow base classes - defines a generic ActiveX base renderer
+//       class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __RENBASE__
+#define __RENBASE__
+
+// Forward class declarations
+
+class CBaseRenderer;
+class CBaseVideoRenderer;
+class CRendererInputPin;
+
+// This is our input pin class that channels calls to the renderer
+
+// Input pin used by CBaseRenderer.  It derives from CBaseInputPin and keeps
+// a pointer to a CBaseRenderer.  Only declarations are visible here; the
+// member functions are defined elsewhere.
+class CRendererInputPin : public CBaseInputPin
+{
+protected:
+
+    // Renderer associated with this pin.
+    // NOTE(review): presumably the pRenderer passed to the constructor and
+    // not reference counted - confirm in the implementation file.
+    CBaseRenderer *m_pRenderer;
+
+public:
+
+    // pRenderer - renderer the pin is created for
+    // phr       - HRESULT out parameter
+    // Name      - wide-string name for the pin
+    CRendererInputPin(CBaseRenderer *pRenderer,
+                      HRESULT *phr,
+                      LPCWSTR Name);
+
+    // Overridden from the base pin classes
+
+    HRESULT BreakConnect();
+    HRESULT CompleteConnect(IPin *pReceivePin);
+    HRESULT SetMediaType(const CMediaType *pmt);
+    HRESULT CheckMediaType(const CMediaType *pmt);
+    HRESULT Active();
+    HRESULT Inactive();
+
+    // Add rendering behaviour to interface functions
+
+    STDMETHODIMP QueryId(LPWSTR *Id);
+    STDMETHODIMP EndOfStream();
+    STDMETHODIMP BeginFlush();
+    STDMETHODIMP EndFlush();
+    STDMETHODIMP Receive(IMediaSample *pMediaSample);
+
+    // Helper
+    // Returns m_pAllocator as-is.  That member is not declared in this
+    // class, so it comes from a base class; no AddRef is done here.
+    IMemAllocator inline *Allocator() const
+    {
+        return m_pAllocator;
+    }
+};
+
+// Main renderer class that handles synchronisation and state changes
+
+// Generic renderer filter built on CBaseFilter.  It declares the state used
+// for synchronisation and state changes, a set of virtual hooks for derived
+// classes, and two pure virtuals (DoRenderSample, CheckMediaType) that every
+// concrete renderer has to supply.  Only declarations and trivial inline
+// accessors are visible here.
+class CBaseRenderer : public CBaseFilter
+{
+protected:
+
+    // The input pin class and the end-of-stream timer callback are given
+    // access to the protected members below.
+    friend class CRendererInputPin;
+
+    friend void CALLBACK EndOfStreamTimer(UINT uID,      // Timer identifier
+                                          UINT uMsg,     // Not currently used
+                                          DWORD_PTR dwUser,  // User information
+                                          DWORD_PTR dw1,     // Windows reserved
+                                          DWORD_PTR dw2);    // Is also reserved
+
+    CRendererPosPassThru *m_pPosition;  // Media seeking pass by object
+    CAMEvent m_RenderEvent;             // Used to signal timer events
+    CAMEvent m_ThreadSignal;            // Signalled to release worker thread
+    CAMEvent m_evComplete;              // Signalled when state complete
+    BOOL m_bAbort;                      // Stop us from rendering more data
+    BOOL m_bStreaming;                  // Are we currently streaming
+    DWORD_PTR m_dwAdvise;                   // Timer advise cookie
+    IMediaSample *m_pMediaSample;       // Current image media sample
+    BOOL m_bEOS;                        // Any more samples in the stream
+    BOOL m_bEOSDelivered;               // Have we delivered an EC_COMPLETE
+    CRendererInputPin *m_pInputPin;     // Our renderer input pin object
+    CCritSec m_InterfaceLock;           // Critical section for interfaces
+    CCritSec m_RendererLock;            // Controls access to internals
+    IQualityControl * m_pQSink;         // QualityControl sink
+    BOOL m_bRepaintStatus;              // Can we signal an EC_REPAINT
+    //  Avoid some deadlocks by tracking filter during stop
+    volatile BOOL  m_bInReceive;        // Inside Receive between PrepareReceive
+                                        // And actually processing the sample
+    REFERENCE_TIME m_SignalTime;        // Time when we signal EC_COMPLETE
+    UINT m_EndOfStreamTimer;            // Used to signal end of stream
+    CCritSec m_ObjectCreationLock;      // This lock protects the creation and
+                                        // of m_pPosition and m_pInputPin.  It
+                                        // ensures that two threads cannot create
+                                        // either object simultaneously.
+
+public:
+
+    CBaseRenderer(REFCLSID RenderClass, // CLSID for this renderer
+                  TCHAR *pName,         // Debug ONLY description
+                  LPUNKNOWN pUnk,       // Aggregated owner object
+                  HRESULT *phr);        // General OLE return code
+
+    ~CBaseRenderer();
+
+    // Overridden to say what interfaces we support and where
+
+    virtual HRESULT GetMediaPositionInterface(REFIID riid,void **ppv);
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID, void **);
+
+    virtual HRESULT SourceThreadCanWait(BOOL bCanWait);
+
+#ifdef DEBUG
+    // Debug only dump of the renderer state
+    void DisplayRendererState();
+#endif
+    virtual HRESULT WaitForRenderTime();
+    virtual HRESULT CompleteStateChange(FILTER_STATE OldState);
+
+    // Return internal information about this filter
+    // (plain reads/writes of the flags above; no locking is done here)
+
+    BOOL IsEndOfStream() { return m_bEOS; };
+    BOOL IsEndOfStreamDelivered() { return m_bEOSDelivered; };
+    BOOL IsStreaming() { return m_bStreaming; };
+    void SetAbortSignal(BOOL bAbort) { m_bAbort = bAbort; };
+    // Empty by default; a derived class may override it.
+    virtual void OnReceiveFirstSample(IMediaSample *pMediaSample) { };
+    CAMEvent *GetRenderEvent() { return &m_RenderEvent; };
+
+    // Permit access to the transition state
+    // (thin wrappers over m_evComplete: Set, Reset and Check)
+
+    void Ready() { m_evComplete.Set(); };
+    void NotReady() { m_evComplete.Reset(); };
+    BOOL CheckReady() { return m_evComplete.Check(); };
+
+    virtual int GetPinCount();
+    virtual CBasePin *GetPin(int n);
+    FILTER_STATE GetRealState();
+    void SendRepaint();
+    void SendNotifyWindow(IPin *pPin,HWND hwnd);
+    BOOL OnDisplayChange();
+    void SetRepaintStatus(BOOL bRepaint);
+
+    // Override the filter and pin interface functions
+
+    STDMETHODIMP Stop();
+    STDMETHODIMP Pause();
+    STDMETHODIMP Run(REFERENCE_TIME StartTime);
+    STDMETHODIMP GetState(DWORD dwMSecs,FILTER_STATE *State);
+    STDMETHODIMP FindPin(LPCWSTR Id, IPin **ppPin);
+
+    // These are available for a quality management implementation
+    // The inline ones do nothing (or just return NOERROR) by default.
+
+    virtual void OnRenderStart(IMediaSample *pMediaSample);
+    virtual void OnRenderEnd(IMediaSample *pMediaSample);
+    virtual HRESULT OnStartStreaming() { return NOERROR; };
+    virtual HRESULT OnStopStreaming() { return NOERROR; };
+    virtual void OnWaitStart() { };
+    virtual void OnWaitEnd() { };
+    virtual void PrepareRender() { };
+
+#ifdef PERF
+    // Only present in PERF builds, so the object layout differs between
+    // PERF and non-PERF builds.
+    REFERENCE_TIME m_trRenderStart; // Just before we started drawing
+                                    // Set in OnRenderStart, Used in OnRenderEnd
+    int m_idBaseStamp;              // MSR_id for frame time stamp
+    int m_idBaseRenderTime;         // MSR_id for true wait time
+    int m_idBaseAccuracy;           // MSR_id for time frame is late (int)
+#endif
+
+    // Quality management implementation for scheduling rendering
+
+    virtual BOOL ScheduleSample(IMediaSample *pMediaSample);
+    virtual HRESULT GetSampleTimes(IMediaSample *pMediaSample,
+                                   REFERENCE_TIME *pStartTime,
+                                   REFERENCE_TIME *pEndTime);
+
+    virtual HRESULT ShouldDrawSampleNow(IMediaSample *pMediaSample,
+                                        REFERENCE_TIME *ptrStart,
+                                        REFERENCE_TIME *ptrEnd);
+
+    // Lots of end of stream complexities
+
+    void TimerCallback();
+    void ResetEndOfStreamTimer();
+    HRESULT NotifyEndOfStream();
+    virtual HRESULT SendEndOfStream();
+    virtual HRESULT ResetEndOfStream();
+    virtual HRESULT EndOfStream();
+
+    // Rendering is based around the clock
+
+    void SignalTimerFired();
+    virtual HRESULT CancelNotification();
+    virtual HRESULT ClearPendingSample();
+
+    // Called when the filter changes state
+
+    virtual HRESULT Active();
+    virtual HRESULT Inactive();
+    virtual HRESULT StartStreaming();
+    virtual HRESULT StopStreaming();
+    virtual HRESULT BeginFlush();
+    virtual HRESULT EndFlush();
+
+    // Deal with connections and type changes
+
+    virtual HRESULT BreakConnect();
+    virtual HRESULT SetMediaType(const CMediaType *pmt);
+    virtual HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // These look after the handling of data samples
+
+    virtual HRESULT PrepareReceive(IMediaSample *pMediaSample);
+    virtual HRESULT Receive(IMediaSample *pMediaSample);
+    virtual BOOL HaveCurrentSample();
+    virtual IMediaSample *GetCurrentSample();
+    virtual HRESULT Render(IMediaSample *pMediaSample);
+
+    // Derived classes MUST override these
+    // (declared PURE, so CBaseRenderer itself cannot be instantiated)
+    virtual HRESULT DoRenderSample(IMediaSample *pMediaSample) PURE;
+    virtual HRESULT CheckMediaType(const CMediaType *) PURE;
+
+    // Helper
+    void WaitForReceiveToComplete();
+};
+
+
+// CBaseVideoRenderer is a renderer class (see its ancestor class) and
+// it handles scheduling of media samples so that they are drawn at the
+// correct time by the reference clock.  It implements a degradation
+// strategy.  Possible degradation modes are:
+//    Drop frames here (only useful if the drawing takes significant time)
+//    Signal supplier (upstream) to drop some frame(s) - i.e. one-off skip.
+//    Signal supplier to change the frame rate - i.e. ongoing skipping.
+//    Or any combination of the above.
+// In order to determine what's useful to try we need to know what's going
+// on.  This is done by timing various operations (including the supplier).
+// This timing is done by using timeGetTime as it is accurate enough and
+// usually cheaper than calling the reference clock.  It also tells the
+// truth if there is an audio break and the reference clock stops.
+// We provide a number of public entry points (named OnXxxStart, OnXxxEnd)
+// which the rest of the renderer calls at significant moments.  These do
+// the timing.
+
+// the number of frames that the sliding averages are averaged over.
+// the rule is (1024*NewObservation + (AVGPERIOD-1) * PreviousAverage)/AVGPERIOD
+#define AVGPERIOD 4
+// Updates avg in place from a new observation obs, following the rule
+// stated above.  Both arguments are fully parenthesised so that compound
+// expressions expand as intended: DO_MOVING_AVG(a, x+y) previously expanded
+// to 1024*x+y.  Note that avg is evaluated twice.
+#define DO_MOVING_AVG(avg,obs) ((avg) = (1024*(obs) + (AVGPERIOD-1)*(avg))/AVGPERIOD)
+// An earlier note here said the macro "doesn't work" without saying why.
+// NOTE(review): the missing parentheses are one concrete cause; confirm
+// whether the 1024 scaling of the observation is also intended.
+
+// Video renderer base class: CBaseRenderer plus quality management.
+// It implements IQualProp (statistics shown on a property page) and
+// IQualityControl (throttling), and overrides the scheduling hooks of
+// CBaseRenderer.  The PERF-only members below are compiled only in PERF
+// builds, so the object layout differs between PERF and non-PERF builds.
+class CBaseVideoRenderer : public CBaseRenderer,    // Base renderer class
+                           public IQualProp,        // Property page guff
+                           public IQualityControl   // Allow throttling
+{
+protected:
+
+    // Hungarian:
+    //     tFoo is the time Foo in mSec (beware m_tStart from filter.h)
+    //     trBar is the time Bar by the reference clock
+
+    //******************************************************************
+    // State variables to control synchronisation
+    //******************************************************************
+
+    // Control of sending Quality messages.  We need to know whether
+    // we are in trouble (e.g. frames being dropped) and where the time
+    // is being spent.
+
+    // When we drop a frame we play the next one early.
+    // The frame after that is likely to wait before drawing and counting this
+    // wait as spare time is unfair, so we count it as a zero wait.
+    // We therefore need to know whether we are playing frames early or not.
+
+    int m_nNormal;                  // The number of consecutive frames
+                                    // drawn at their normal time (not early)
+                                    // -1 means we just dropped a frame.
+
+#ifdef PERF
+    BOOL m_bDrawLateFrames;         // Don't drop any frames (debug and I'm
+                                    // not keen on people using it!)
+#endif
+
+    BOOL m_bSupplierHandlingQuality;// The response to Quality messages says
+                                    // our supplier is handling things.
+                                    // We will allow things to go extra late
+                                    // before dropping frames.  We will play
+                                    // very early after he has dropped one.
+
+    // Control of scheduling, frame dropping etc.
+    // We need to know where the time is being spent so as to tell whether
+    // we should be taking action here, signalling supplier or what.
+    // The variables are initialised to a mode of NOT dropping frames.
+    // They will tell the truth after a few frames.
+    // We typically record a start time for an event, later we get the time
+    // again and subtract to get the elapsed time, and we average this over
+    // a few frames.  The average is used to tell what mode we are in.
+
+    // Although these are reference times (64 bit) they are all DIFFERENCES
+    // between times which are small.  An int will go up to 214 secs before
+    // overflow.  Avoiding 64 bit multiplications and divisions seems
+    // worth while.
+
+
+
+    // Audio-video throttling.  If the user has turned up audio quality
+    // very high (in principle it could be any other stream, not just audio)
+    // then we can receive cries for help via the graph manager.  In this case
+    // we put in a wait for some time after rendering each frame.
+    int m_trThrottle;
+
+    // The time taken to render (i.e. BitBlt) frames controls which component
+    // needs to degrade.  If the blt is expensive, the renderer degrades.
+    // If the blt is cheap it's done anyway and the supplier degrades.
+    int m_trRenderAvg;              // Time frames are taking to blt
+    int m_trRenderLast;             // Time for last frame blt
+    int m_tRenderStart;             // Just before we started drawing (mSec)
+                                    // derived from timeGetTime.
+
+    // When frames are dropped we will play the next frame as early as we can.
+    // If it was a false alarm and the machine is fast we slide gently back to
+    // normal timing.  To do this, we record the offset showing just how early
+    // we really are.  This will normally be negative meaning early or zero.
+    int m_trEarliness;
+
+    // Target provides slow long-term feedback to try to reduce the
+    // average sync offset to zero.  Whenever a frame is actually rendered
+    // early we add a msec or two, whenever late we take off a few.
+    // We add or take off 1/32 of the error time.
+    // Eventually we should be hovering around zero.  For a really bad case
+    // where we were (say) 300mSec off, it might take 100 odd frames to
+    // settle down.  The rate of change of this is intended to be slower
+    // than any other mechanism in Quartz, thereby avoiding hunting.
+    int m_trTarget;
+
+    // The proportion of time spent waiting for the right moment to blt
+    // controls whether we bother to drop a frame or whether we reckon that
+    // we're doing well enough that we can stand a one-frame glitch.
+    int m_trWaitAvg;                // Average of last few wait times
+                                    // (actually we just average how early
+                                    // we were).  Negative here means LATE.
+
+    // The average inter-frame time.
+    // This is used to calculate the proportion of the time used by the
+    // three operations (supplying us, waiting, rendering)
+    int m_trFrameAvg;               // Average inter-frame time
+    int m_trDuration;               // duration of last frame.
+
+#ifdef PERF
+    // Performance logging identifiers
+    int m_idTimeStamp;              // MSR_id for frame time stamp
+    int m_idEarliness;              // MSR_id for earliness fudge
+    int m_idTarget;                 // MSR_id for Target fudge
+    int m_idWaitReal;               // MSR_id for true wait time
+    int m_idWait;                   // MSR_id for wait time recorded
+    int m_idFrameAccuracy;          // MSR_id for time frame is late (int)
+    int m_idRenderAvg;              // MSR_id for Render time recorded (int)
+    int m_idSchLateTime;            // MSR_id for lateness at scheduler
+    int m_idQualityRate;            // MSR_id for Quality rate requested
+    int m_idQualityTime;            // MSR_id for Quality time requested
+    int m_idDecision;               // MSR_id for decision code
+    int m_idDuration;               // MSR_id for duration of a frame
+    int m_idThrottle;               // MSR_id for audio-video throttling
+    //int m_idDebug;                  // MSR_id for trace style debugging
+    //int m_idSendQuality;          // MSR_id for timing the notifications per se
+#endif // PERF
+    // Declared unconditionally (outside the PERF blocks around it).
+    REFERENCE_TIME m_trRememberStampForPerf;  // original time stamp of frame
+                                              // with no earliness fudges etc.
+#ifdef PERF
+    REFERENCE_TIME m_trRememberFrameForPerf;  // time when previous frame rendered
+
+    // debug...
+    int m_idFrameAvg;
+    int m_idWaitAvg;
+#endif
+
+    // PROPERTY PAGE
+    // This has edit fields that show the user what's happening
+    // These member variables hold these counts.
+
+    int m_cFramesDropped;           // cumulative frames dropped IN THE RENDERER
+    int m_cFramesDrawn;             // Frames since streaming started seen BY THE
+                                    // RENDERER (some may be dropped upstream)
+
+    // Next two support average sync offset and standard deviation of sync offset.
+    LONGLONG m_iTotAcc;                  // Sum of accuracies in mSec
+    LONGLONG m_iSumSqAcc;           // Sum of squares of (accuracies in mSec)
+
+    // Next two allow jitter calculation.  Jitter is std deviation of frame time.
+    REFERENCE_TIME m_trLastDraw;    // Time of prev frame (for inter-frame times)
+    LONGLONG m_iSumSqFrameTime;     // Sum of squares of (inter-frame time in mSec)
+    LONGLONG m_iSumFrameTime;            // Sum of inter-frame times in mSec
+
+    // To get performance statistics on frame rate, jitter etc, we need
+    // to record the lateness and inter-frame time.  What we actually need are the
+    // data above (sum, sum of squares and number of entries for each) but the data
+    // is generated just ahead of time and only later do we discover whether the
+    // frame was actually drawn or not.  So we have to hang on to the data
+    int m_trLate;                   // hold onto frame lateness
+    int m_trFrame;                  // hold onto inter-frame time
+
+    int m_tStreamingStart;          // if streaming then time streaming started
+                                    // else time of last streaming session
+                                    // used for property page statistics
+#ifdef PERF
+    LONGLONG m_llTimeOffset;        // timeGetTime()*10000+m_llTimeOffset==ref time
+#endif
+
+public:
+
+
+    CBaseVideoRenderer(REFCLSID RenderClass, // CLSID for this renderer
+                       TCHAR *pName,         // Debug ONLY description
+                       LPUNKNOWN pUnk,       // Aggregated owner object
+                       HRESULT *phr);        // General OLE return code
+
+    ~CBaseVideoRenderer();
+
+    // IQualityControl methods - Notify allows audio-video throttling
+
+    STDMETHODIMP SetSink( IQualityControl * piqc);
+    STDMETHODIMP Notify( IBaseFilter * pSelf, Quality q);
+
+    // These provide a full video quality management implementation
+    // (they redeclare hooks that CBaseRenderer declares virtual)
+
+    void OnRenderStart(IMediaSample *pMediaSample);
+    void OnRenderEnd(IMediaSample *pMediaSample);
+    void OnWaitStart();
+    void OnWaitEnd();
+    HRESULT OnStartStreaming();
+    HRESULT OnStopStreaming();
+    void ThrottleWait();
+
+    // Handle the statistics gathering for our quality management
+
+    void PreparePerformanceData(int trLate, int trFrame);
+    virtual void RecordFrameLateness(int trLate, int trFrame);
+    virtual void OnDirectRender(IMediaSample *pMediaSample);
+    virtual HRESULT ResetStreamingTimes();
+    BOOL ScheduleSample(IMediaSample *pMediaSample);
+    HRESULT ShouldDrawSampleNow(IMediaSample *pMediaSample,
+                                REFERENCE_TIME *ptrStart,
+                                REFERENCE_TIME *ptrEnd);
+
+    virtual HRESULT SendQuality(REFERENCE_TIME trLate, REFERENCE_TIME trRealStream);
+    STDMETHODIMP JoinFilterGraph(IFilterGraph * pGraph, LPCWSTR pName);
+
+    //
+    //  Do estimates for standard deviations for per-frame
+    //  statistics
+    //
+    //  With N = nSamples, S = llSumSq and T = iTot:
+    //  *piResult = sqrt( (S - T*T/N) / (N - 1) )
+    //  or 0 if nSamples <= 1 (or if there is no clock)
+    //
+    //  The callers pass m_cFramesDrawn-1 (sync offset) or
+    //  m_cFramesDrawn-2 (jitter) as nSamples.
+    //
+    HRESULT GetStdDev(
+        int nSamples,
+        int *piResult,
+        LONGLONG llSumSq,
+        LONGLONG iTot
+    );
+public:
+
+    // IQualProp property page support
+
+    STDMETHODIMP get_FramesDroppedInRenderer(int *cFramesDropped);
+    STDMETHODIMP get_FramesDrawn(int *pcFramesDrawn);
+    STDMETHODIMP get_AvgFrameRate(int *piAvgFrameRate);
+    STDMETHODIMP get_Jitter(int *piJitter);
+    STDMETHODIMP get_AvgSyncOffset(int *piAvg);
+    STDMETHODIMP get_DevSyncOffset(int *piDev);
+
+    // Implement an IUnknown interface and expose IQualProp
+
+    DECLARE_IUNKNOWN
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid,VOID **ppv);
+};
+
+#endif // __RENBASE__
+
diff --git a/plugins/GSdx_legacy/baseclasses/schedule.cpp b/plugins/GSdx_legacy/baseclasses/schedule.cpp
new file mode 100644
index 0000000000..4a3ca5930e
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/schedule.cpp
@@ -0,0 +1,287 @@
+//------------------------------------------------------------------------------
+// File: Schedule.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+
+// DbgLog values (all on LOG_TIMING):
+//
+// 2 for scheduling, firing and shunting of events
+// 3 for wait delays and wake-up times of event thread
+// 4 for details of whats on the list when the thread awakes
+
+/* Construct & destructors */
+
+// Builds an empty schedule: head links straight to the sentinel z (time MAX_TIME).
+CAMSchedule::CAMSchedule( HANDLE ev )
+    : CBaseObject(TEXT("CAMSchedule")), head(&z, 0), z(0, MAX_TIME),
+      m_dwNextCookie(0), m_dwAdviseCount(0), m_ev(ev),
+      m_pAdviseCache(0), m_dwCacheCount(0)
+{
+    z.m_dwAdviseCookie = 0;     // neither sentinel carries a real cookie
+    head.m_dwAdviseCookie = 0;
+}
+
+// Tears the schedule down: frees the cached packets and, should any advises
+// still be outstanding, unlinks and deletes those as well.
+CAMSchedule::~CAMSchedule()
+{
+    CAutoLock lck(&m_Serialize);
+
+    // Release every packet parked in the reuse cache
+    for (CAdvisePacket * pCached = m_pAdviseCache; pCached != 0; )
+    {
+        CAdvisePacket *const pFollowing = pCached->m_next;
+        delete pCached;
+        pCached = pFollowing;
+    }
+
+    // By now every advise is expected to have been cancelled
+    ASSERT( m_dwAdviseCount == 0 );
+
+    if ( m_dwAdviseCount > 0 )
+    {
+        // Clean up anyway rather than leave packets behind
+        DumpLinkedList();
+        for ( ; !head.m_next->IsZ(); --m_dwAdviseCount )
+        {
+            head.DeleteNext();
+        }
+    }
+
+    // Failing this second assertion in a debug build means that, on top of
+    // the leftover advises, m_dwAdviseCount had drifted away from the number
+    // of packets actually linked into the list.
+    ASSERT( m_dwAdviseCount == 0 );
+}
+
+/* Public methods */
+
+DWORD CAMSchedule::GetAdviseCount()
+{
+    // Read without taking m_Serialize: the counter is a volatile 32-bit value
+    const DWORD dwOutstanding = m_dwAdviseCount;
+    return dwOutstanding;
+}
+
+REFERENCE_TIME CAMSchedule::GetNextAdviseTime()
+{
+    CAutoLock lck(&m_Serialize); // keep the list stable while we look at it
+    const CAdvisePacket * const pFirst = head.m_next;
+    return pFirst->m_rtEventTime;
+}
+
+// Queues a new advise.  time1 becomes the packet's event time and time2 its
+// period; h is the handle to signal and periodic marks a periodic advise.
+// Returns the cookie given to the packet, or 0 if no packet could be obtained.
+DWORD_PTR CAMSchedule::AddAdvisePacket
+( const REFERENCE_TIME & time1
+, const REFERENCE_TIME & time2
+, HANDLE h, BOOL periodic
+)
+{
+    // MAX_TIME belongs to the sentinel node, so it can never be accepted
+    // as a genuine notification time
+    ASSERT( time1 < MAX_TIME );
+
+    CAutoLock lck(&m_Serialize);
+
+    // Prefer a recycled packet from the cache over a fresh allocation
+    CAdvisePacket * pNew = m_pAdviseCache;
+    if (pNew)
+    {
+        m_pAdviseCache = pNew->m_next;
+        --m_dwCacheCount;
+    }
+    else
+    {
+        pNew = new CAdvisePacket();
+    }
+
+    if (!pNew) return 0;
+
+    pNew->m_rtEventTime = time1;
+    pNew->m_rtPeriod    = time2;
+    pNew->m_hNotify     = h;
+    pNew->m_bPeriodic   = periodic;
+
+    return AddAdvisePacket( pNew );
+}
+
+// Cancels the advise identified by dwAdviseCookie.
+// Returns S_OK if a matching packet was removed, S_FALSE if none was found.
+HRESULT CAMSchedule::Unadvise(DWORD_PTR dwAdviseCookie)
+{
+    CAutoLock lck(&m_Serialize);
+
+    // Next() yields NULL once the sentinel z is reached, which ends the walk
+    CAdvisePacket * pBefore = &head;
+    for (CAdvisePacket * pCur = pBefore->Next(); pCur != NULL; pCur = pBefore->Next())
+    {
+        if ( pCur->m_dwAdviseCookie != dwAdviseCookie )
+        {
+            pBefore = pCur;
+            continue;
+        }
+
+        Delete( pBefore->RemoveNext() );
+        --m_dwAdviseCount;
+
+        // One cookie matched, so none of the remaining packets should
+        #ifdef DEBUG
+            while ((pCur = pBefore->Next()) != NULL)
+            {
+                ASSERT(pCur->m_dwAdviseCookie != dwAdviseCookie);
+                pBefore = pCur;
+            }
+        #endif
+        return S_OK;
+    }
+    return S_FALSE;
+}
+
+REFERENCE_TIME CAMSchedule::Advise( const REFERENCE_TIME & rtTime )
+{
+    REFERENCE_TIME  rtNextTime;
+    CAdvisePacket * pAdvise;
+    // Signals every advise due at or before rtTime; returns the next event time.
+    DbgLog((LOG_TIMING, 2,
+        TEXT("CAMSchedule::Advise( %lu ms )"), ULONG(rtTime / (UNITS / MILLISECONDS))));
+    // Held to the end of the function so the list stays consistent throughout
+    CAutoLock lck(&m_Serialize);
+
+    #ifdef DEBUG
+        if (DbgCheckModuleLevel(LOG_TIMING, 4)) DumpLinkedList();
+    #endif
+
+    //  Note - DON'T cache the difference, it might overflow
+    while ( rtTime >= (rtNextTime = (pAdvise=head.m_next)->m_rtEventTime) &&
+            !pAdvise->IsZ() )
+    {
+        ASSERT(pAdvise->m_dwAdviseCookie); // If this is zero, it's the head or the tail!!
+
+        ASSERT(pAdvise->m_hNotify != INVALID_HANDLE_VALUE);
+        // Periodic: signal, push the event time on by one period, then re-sort
+        if (pAdvise->m_bPeriodic == TRUE)
+        {
+            ReleaseSemaphore(pAdvise->m_hNotify,1,NULL);
+            pAdvise->m_rtEventTime += pAdvise->m_rtPeriod;
+            ShuntHead();
+        }
+        else
+        {
+            ASSERT( pAdvise->m_bPeriodic == FALSE );
+            EXECUTE_ASSERT(SetEvent(pAdvise->m_hNotify));
+            --m_dwAdviseCount;
+            Delete( head.RemoveNext() );
+        }
+
+    }
+    // Here pAdvise is the first packet that is not yet due (possibly the sentinel z)
+    DbgLog((LOG_TIMING, 3,
+            TEXT("CAMSchedule::Advise() Next time stamp: %lu ms, for advise %lu."),
+            DWORD(rtNextTime / (UNITS / MILLISECONDS)), pAdvise->m_dwAdviseCookie ));
+
+    return rtNextTime;
+}
+
+/* Private methods */
+
+// Links pPacket into the time-ordered list, ahead of any packet with an equal
+// event time, and returns its freshly assigned cookie.  Caller holds m_Serialize.
+DWORD_PTR CAMSchedule::AddAdvisePacket( CAdvisePacket * pPacket )
+{
+    ASSERT(pPacket->m_rtEventTime >= 0 && pPacket->m_rtEventTime < MAX_TIME);
+    ASSERT(CritCheckIn(&m_Serialize));
+
+    CAdvisePacket * p_prev = &head;
+    CAdvisePacket * p_n;
+
+    const DWORD_PTR Result = pPacket->m_dwAdviseCookie = ++m_dwNextCookie;
+    // This relies on the fact that z is a sentry with a maximal m_rtEventTime
+    for(;;p_prev = p_n)
+    {
+        p_n = p_prev->m_next;
+        if ( p_n->m_rtEventTime >= pPacket->m_rtEventTime ) break;
+    }
+    p_prev->InsertAfter( pPacket );
+    ++m_dwAdviseCount;
+    // Arguments are narrowed to ULONG so each one matches its %lu specifier
+    DbgLog((LOG_TIMING, 2, TEXT("Added advise %lu, for thread 0x%02X, scheduled at %lu"),
+        ULONG(pPacket->m_dwAdviseCookie), GetCurrentThreadId(), ULONG(pPacket->m_rtEventTime / (UNITS / MILLISECONDS)) ));
+    // If packet added at the head, then clock needs to re-evaluate wait time.
+    if ( p_prev == &head ) SetEvent( m_ev );
+    return Result;
+}
+
+// Retires an unlinked packet: cached for reuse unless the cache is already full.
+void CAMSchedule::Delete( CAdvisePacket * pPacket )
+{
+    if ( m_dwCacheCount < dwCacheMax )
+    {
+        CAutoLock lck(&m_Serialize);
+        pPacket->m_next = m_pAdviseCache;
+        m_pAdviseCache = pPacket;
+        ++m_dwCacheCount;
+    }
+    else delete pPacket;
+}
+
+
+// Takes the first real packet on the list and moves it to the position its
+// (already updated) event time calls for, after any packets with an equal time.
+void CAMSchedule::ShuntHead()
+{
+    CAdvisePacket * p_prev = &head;
+    CAdvisePacket * p_n;
+    m_Serialize.Lock();
+    CAdvisePacket *const pPacket = head.m_next;
+    // This will catch both an empty list,
+    // and if somehow a MAX_TIME time gets into the list
+    // (which would also break this method).
+    ASSERT( pPacket->m_rtEventTime < MAX_TIME );
+
+    // This relies on the fact that z is a sentry with a maximal m_rtEventTime
+    for(;;p_prev = p_n)
+    {
+        p_n = p_prev->m_next;
+        if ( p_n->m_rtEventTime > pPacket->m_rtEventTime ) break;
+    }
+    // If p_prev == pPacket then we're already in the right place
+    if (p_prev != pPacket)
+    {
+        head.m_next = pPacket->m_next;
+        (p_prev->m_next = pPacket)->m_next = p_n;
+    }
+    #ifdef DEBUG
+        // Narrow both values to ULONG so they match the %lu specifiers
+        DbgLog((LOG_TIMING, 2, TEXT("Periodic advise %lu, shunted to %lu"),
+            ULONG(pPacket->m_dwAdviseCookie), ULONG(pPacket->m_rtEventTime / (UNITS / MILLISECONDS)) ));
+    #endif
+    m_Serialize.Unlock();
+}
+
+
+#ifdef DEBUG
+// Debug-only: logs every node from head through the sentinel z, one line each.
+void CAMSchedule::DumpLinkedList()
+{
+    CAutoLock lck(&m_Serialize);
+    DbgLog((LOG_TIMING, 1, TEXT("CAMSchedule::DumpLinkedList() this = 0x%p"), this));
+
+    ULONG i = 0;
+    for ( CAdvisePacket * p = &head; p; p = p->m_next, i++ )
+    {
+        // Each argument is converted to the exact type its specifier reads:
+        // %lu <- ULONG, %d <- int, %lu <- ULONG.
+        DbgLog((LOG_TIMING, 1, TEXT("Advise List # %lu, Cookie %d,  RefTime %lu"),
+            i,
+            int(p->m_dwAdviseCookie),
+            ULONG(p->m_rtEventTime / (UNITS / MILLISECONDS))
+            ));
+    }
+}
+#endif
diff --git a/plugins/GSdx_legacy/baseclasses/schedule.h b/plugins/GSdx_legacy/baseclasses/schedule.h
new file mode 100644
index 0000000000..e7873072c0
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/schedule.h
@@ -0,0 +1,128 @@
+//------------------------------------------------------------------------------
+// File: Schedule.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __CAMSchedule__
+#define __CAMSchedule__
+
+class CAMSchedule : private CBaseObject
+{
+public:
+    virtual ~CAMSchedule();
+    // ev is the event we should fire if the advise time needs re-evaluating
+    CAMSchedule( HANDLE ev );
+    // Number of advises on the list / event time of the first node after head
+    DWORD GetAdviseCount();
+    REFERENCE_TIME GetNextAdviseTime();
+
+    // We need a method for derived classes to add advise packets, we return the cookie
+    DWORD_PTR AddAdvisePacket( const REFERENCE_TIME & time1, const REFERENCE_TIME & time2, HANDLE h, BOOL periodic );
+    // And a way to cancel
+    HRESULT Unadvise(DWORD_PTR dwAdviseCookie);
+
+    // Tell us the time please, and we'll dispatch the expired events.  We return the time of the next event.
+    // NB: The time returned will be "useless" if you start adding extra Advises.  But that's the problem of
+    // whoever is using this helper class (typically a clock).
+    REFERENCE_TIME Advise( const REFERENCE_TIME & rtTime );
+
+    // Get the event handle which will be set if advise time requires re-evaluation.
+    HANDLE GetEvent() const { return m_ev; }
+
+private:
+    // We define the nodes that will be used in our singly linked list
+    // of advise packets.  The list is ordered by time, with the
+    // elements that will expire first at the front.
+    class CAdvisePacket
+    {
+    public:
+        CAdvisePacket()
+        {}
+
+        CAdvisePacket * m_next;
+        DWORD_PTR       m_dwAdviseCookie;
+        REFERENCE_TIME  m_rtEventTime;      // Time at which event should be set
+        REFERENCE_TIME  m_rtPeriod;         // Periodic time
+        HANDLE          m_hNotify;          // Handle to event or semaphore
+        BOOL            m_bPeriodic;        // TRUE => Periodic event
+        // Sets only the link and the event time; every other field is left untouched
+        CAdvisePacket( CAdvisePacket * next, LONGLONG time ) : m_next(next), m_rtEventTime(time)
+        {}
+        // Links p in directly behind this node
+        void InsertAfter( CAdvisePacket * p )
+        {
+            p->m_next = m_next;
+            m_next    = p;
+        }
+
+        int IsZ() const // That is, is it the node that represents the end of the list
+        { return m_next == 0; }
+        // Unlinks the node after this one and returns it (it is not deleted)
+        CAdvisePacket * RemoveNext()
+        {
+            CAdvisePacket *const next = m_next;
+            CAdvisePacket *const new_next = next->m_next;
+            m_next = new_next;
+            return next;
+        }
+        // Unlinks and deletes the node after this one
+        void DeleteNext()
+        {
+            delete RemoveNext();
+        }
+        // Like m_next, but yields NULL when the next node is the sentinel z
+        CAdvisePacket * Next() const
+        {
+            CAdvisePacket * result = m_next;
+            if (result->IsZ()) result = 0;
+            return result;
+        }
+
+        DWORD_PTR Cookie() const
+        { return m_dwAdviseCookie; }
+    };
+
+    // Structure is:
+    // head -> elmt1 -> elmt2 -> z -> null
+    // So an empty list is:       head -> z -> null
+    // Having head & z as links makes insertion,
+    // deletion and shunting much easier.
+    CAdvisePacket   head, z;            // z is both a tail and a sentry
+
+    volatile DWORD_PTR  m_dwNextCookie;     // Strictly increasing
+    volatile DWORD  m_dwAdviseCount;    // Number of elements on list
+
+    CCritSec        m_Serialize;
+
+    // AddAdvisePacket: adds the packet, returns the cookie (0 if failed)
+    DWORD_PTR AddAdvisePacket( CAdvisePacket * pPacket );
+    // Event that we should set if the packet added above will be the next to fire.
+    const HANDLE m_ev;
+
+    // A Shunt is where we have changed the first element in the
+    // list and want it re-evaluating (i.e. repositioned) in
+    // the list.
+    void ShuntHead();
+
+    // Rather than delete advise packets, we cache them for future use
+    CAdvisePacket * m_pAdviseCache;
+    DWORD           m_dwCacheCount;
+    enum { dwCacheMax = 5 };             // Don't bother caching more than five
+
+    void Delete( CAdvisePacket * pLink );// This "Delete" will cache the Link
+
+// Attributes and methods for debugging
+public:
+#ifdef DEBUG
+    void DumpLinkedList();
+#else
+    void DumpLinkedList() {}
+#endif
+
+};
+
+#endif // __CAMSchedule__
diff --git a/plugins/GSdx_legacy/baseclasses/seekpt.cpp b/plugins/GSdx_legacy/baseclasses/seekpt.cpp
new file mode 100644
index 0000000000..d01eea326e
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/seekpt.cpp
@@ -0,0 +1,83 @@
+//------------------------------------------------------------------------------
+// File: SeekPT.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include "seekpt.h"
+
+//==================================================================
+// CreateInstance
+// This goes in the factory template table to create new instances.
+// Every call allocates a brand new CSeekingPassThru; nothing is cached.
+//==================================================================
+
+CUnknown * CSeekingPassThru::CreateInstance(LPUNKNOWN pUnk, HRESULT *phr)
+{
+    CSeekingPassThru * const pObject = new CSeekingPassThru(NAME("Seeking PassThru"), pUnk, phr);
+    return pObject;
+}
+
+
+// ISeekingPassThru is served by this object itself; IMediaSeeking and
+// IMediaPosition are forwarded to the inner pass-through once it exists.
+STDMETHODIMP CSeekingPassThru::NonDelegatingQueryInterface(REFIID riid, void ** ppv)
+{
+    if (riid == IID_ISeekingPassThru) {
+        return GetInterface((ISeekingPassThru *) this, ppv);
+    }
+    if (m_pPosPassThru != NULL &&
+        (riid == IID_IMediaSeeking || riid == IID_IMediaPosition)) {
+        return m_pPosPassThru->NonDelegatingQueryInterface(riid,ppv);
+    }
+    // Everything else is left to the base class
+    return CUnknown::NonDelegatingQueryInterface(riid, ppv);
+}
+
+
+CSeekingPassThru::CSeekingPassThru( TCHAR *pName, LPUNKNOWN pUnk, HRESULT *phr )
+    : CUnknown(pName, pUnk, phr)
+    , m_pPosPassThru(NULL)   // created later by Init()
+{
+}
+
+
+CSeekingPassThru::~CSeekingPassThru()
+{
+    delete m_pPosPassThru;   // the inner object from Init(); a no-op while still NULL
+}
+
+// Creates the inner pass-through object for pPin: a CRendererPosPassThru when
+// bRendererSeeking is set, a plain CPosPassThru otherwise.  Returns E_FAIL if
+// the inner object already exists.
+STDMETHODIMP CSeekingPassThru::Init(BOOL bRendererSeeking, IPin *pPin)
+{
+    if (m_pPosPassThru) {
+        return E_FAIL;
+    }
+
+    HRESULT hr = NOERROR;
+    if (bRendererSeeking) {
+        m_pPosPassThru = new CRendererPosPassThru(
+            NAME("Render Seeking COM object"), (IUnknown *)this, &hr, pPin);
+    } else {
+        m_pPosPassThru = new CPosPassThru(
+            NAME("Render Seeking COM object"), (IUnknown *)this, &hr, pPin);
+    }
+
+    if (!m_pPosPassThru) {
+        return E_OUTOFMEMORY;
+    }
+
+    // hr was handed to the constructor; on failure discard the new object
+    if (FAILED(hr)) {
+        delete m_pPosPassThru;
+        m_pPosPassThru = NULL;
+    }
+    return hr;
+}
+
diff --git a/plugins/GSdx_legacy/baseclasses/seekpt.h b/plugins/GSdx_legacy/baseclasses/seekpt.h
new file mode 100644
index 0000000000..1d26dfee0a
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/seekpt.h
@@ -0,0 +1,30 @@
+//------------------------------------------------------------------------------
+// File: SeekPT.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __seekpt_h__
+#define __seekpt_h__
+
+
+class CSeekingPassThru : public ISeekingPassThru, public CUnknown
+{
+public:
+    static CUnknown *CreateInstance(LPUNKNOWN pUnk, HRESULT *phr);
+    CSeekingPassThru(TCHAR *pName, LPUNKNOWN pUnk, HRESULT *phr);
+    ~CSeekingPassThru();
+    // ISeekingPassThru is answered directly; seeking interfaces via m_pPosPassThru
+    DECLARE_IUNKNOWN;
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void ** ppv);
+    // Creates the inner pass-through object for pPin; E_FAIL if it already exists
+    STDMETHODIMP Init(BOOL bSupportRendering, IPin *pPin);
+
+private:
+    CPosPassThru              *m_pPosPassThru;   // NULL until Init() succeeds; deleted in the destructor
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/baseclasses/source.cpp b/plugins/GSdx_legacy/baseclasses/source.cpp
new file mode 100644
index 0000000000..4a21d18fbe
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/source.cpp
@@ -0,0 +1,522 @@
+//------------------------------------------------------------------------------
+// File: Source.cpp
+//
+// Desc: DirectShow  base classes - implements CSource, which is a Quartz
+//       source filter 'template.'
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// Locking Strategy.
+//
+// Hold the filter critical section (m_pFilter->pStateLock()) to serialise
+// access to functions. Note that, in general, this lock may be held
+// by a function when the worker thread may want to hold it. Therefore
+// if you wish to access shared state from the worker thread you will
+// need to add another critical section object. The exception is during
+// the thread's processing loop, when it is safe to get the filter critical
+// section from within FillBuffer().
+
+#include "streams.h"
+
+
+//
+// CSource::Constructor
+//
+// Starts with no pins and no pin array; pins register later through AddPin().
+CSource::CSource(TCHAR *pName, LPUNKNOWN lpunk, CLSID clsid)
+    : CBaseFilter(pName, lpunk, &m_cStateLock, clsid)
+    , m_iPins(0)
+    , m_paStreams(NULL)
+{
+    // m_cStateLock doubles as the lock handed to CBaseFilter above
+}
+
+CSource::CSource(TCHAR *pName, LPUNKNOWN lpunk, CLSID clsid, HRESULT *phr)
+    : CBaseFilter(pName, lpunk, &m_cStateLock, clsid)
+    , m_iPins(0)
+    , m_paStreams(NULL)
+{
+    UNREFERENCED_PARAMETER(phr);    // accepted but never written
+}
+
+#ifdef UNICODE
+// Narrow-string (CHAR) overloads of the two constructors above, for UNICODE builds
+CSource::CSource(CHAR *pName, LPUNKNOWN lpunk, CLSID clsid)
+    : CBaseFilter(pName, lpunk, &m_cStateLock, clsid)
+    , m_iPins(0)
+    , m_paStreams(NULL)
+{
+}
+CSource::CSource(CHAR *pName, LPUNKNOWN lpunk, CLSID clsid, HRESULT *phr)
+    : CBaseFilter(pName, lpunk, &m_cStateLock, clsid)
+    , m_iPins(0)
+    , m_paStreams(NULL)
+{
+    UNREFERENCED_PARAMETER(phr);
+}
+#endif
+
+//
+// CSource::Destructor
+//
+// Deletes the remaining pins, last one first.  Each pin's destructor calls
+// RemovePin(), which is what shrinks m_iPins and finally frees the array.
+CSource::~CSource()
+{
+    for (int cLeft = m_iPins; cLeft != 0; cLeft = m_iPins) {
+        CSourceStream * const pLast = m_paStreams[cLeft - 1];
+        delete pLast;
+    }
+    ASSERT(m_paStreams == NULL);
+}
+
+
+//
+//  Add a new pin
+//
+//  Grows the pin array by one slot (under the filter lock) and appends pStream.
+HRESULT CSource::AddPin(CSourceStream *pStream)
+{
+    CAutoLock lock(&m_cStateLock);
+    const int cExisting = m_iPins;
+    CSourceStream **paGrown = new CSourceStream *[cExisting + 1];
+    if (paGrown == NULL) {
+        return E_OUTOFMEMORY;
+    }
+    /*  Carry the existing pins across, then drop the old array */
+    if (m_paStreams != NULL) {
+        CopyMemory((PVOID)paGrown, (PVOID)m_paStreams,
+                   cExisting * sizeof(m_paStreams[0]));
+        delete [] m_paStreams;
+    }
+    paGrown[cExisting] = pStream;
+    m_paStreams = paGrown;
+    m_iPins = cExisting + 1;
+    return S_OK;
+}
+
+//
+//  Remove a pin - pStream is NOT deleted
+//  Returns S_OK if pStream was in the array, S_FALSE if it was not.
+//
+HRESULT CSource::RemovePin(CSourceStream *pStream)
+{
+    int iSlot = 0;
+    while (iSlot < m_iPins && m_paStreams[iSlot] != pStream) {
+        ++iSlot;
+    }
+    if (iSlot >= m_iPins) return S_FALSE;   // not one of ours
+    if (m_iPins == 1) {
+        delete [] m_paStreams;
+        m_paStreams = NULL;
+    } else {
+        /*  no need to reallocate - just close the gap */
+        for (int iNext = iSlot + 1; iNext < m_iPins; ++iNext)
+            m_paStreams[iNext - 1] = m_paStreams[iNext];
+    }
+    m_iPins--;
+    return S_OK;
+}
+
+//
+// FindPin
+//
+// Set *ppPin to the IPin* that has the id Id (AddRef'd for the caller),
+// or to NULL, returning VFW_E_NOT_FOUND, if the Id cannot be matched.
+STDMETHODIMP CSource::FindPin(LPCWSTR Id, IPin **ppPin)
+{
+    CheckPointer(ppPin,E_POINTER);
+    ValidateReadWritePtr(ppPin,sizeof(IPin *));
+
+    // QueryId numbers the pins from 1, so take that 1 back off.  A string
+    // WstrToInt turns into 0 therefore becomes index -1, which GetPin rejects.
+    const int iPin = WstrToInt(Id) - 1;
+    *ppPin = GetPin(iPin);
+    if (*ppPin == NULL) {
+        return VFW_E_NOT_FOUND;
+    }
+    (*ppPin)->AddRef();
+    return NOERROR;
+}
+
+//
+// FindPinNumber
+//
+// return the zero-based array index of the pin with this IPin*, or -1 if none
+int CSource::FindPinNumber(IPin *iPin) {
+    int iFound = -1;
+    for (int i = 0; i < m_iPins && iFound < 0; ++i) {
+        if ((IPin *)(m_paStreams[i]) == iPin) {
+            iFound = i;
+        }
+    }
+    return iFound;
+}
+
+//
+// GetPinCount
+//
+// Returns the number of pins this filter has, read under the filter lock
+int CSource::GetPinCount(void) {
+    CAutoLock lock(&m_cStateLock);
+    const int cPins = m_iPins;
+    return cPins;
+}
+
+
+//
+// GetPin
+//
+// Return a non-addref'd pointer to pin n, or NULL when n is not a valid index
+// needed by CBaseFilter
+CBasePin *CSource::GetPin(int n) {
+
+    CAutoLock lock(&m_cStateLock);
+
+    // Valid indices are 0..m_iPins-1.  Rejecting everything else also covers
+    // the no-pins case, since no n can satisfy 0 <= n < 0.
+    const bool bInRange = (n >= 0) && (n < m_iPins);
+    if (!bInRange) {
+        return NULL;
+    }
+
+    ASSERT(m_paStreams[n]);
+    return m_paStreams[n];
+}
+
+
+//
+
+
+// *
+// * --- CSourceStream ----
+// *
+
+//
+// Set *Id to a CoTaskMemAlloc'd string holding this pin's 1-based number.
+// Returns VFW_E_NOT_FOUND if the filter does not know this pin.
+STDMETHODIMP CSourceStream::QueryId(LPWSTR *Id) {
+    CheckPointer(Id,E_POINTER);
+    ValidateReadWritePtr(Id,sizeof(LPWSTR));
+    // Pin ids are 1,2,...; FindPinNumber returns -1 for an invalid pin
+    int i = 1+ m_pFilter->FindPinNumber(this);
+    if (i<1) return VFW_E_NOT_FOUND;
+    // 12 WCHARs hold any 32-bit int in decimal: sign, 10 digits, terminator.
+    // (The previous 4-WCHAR buffer only had room for ids up to 999.)
+    enum { cchId = 12 };
+    *Id = (LPWSTR)CoTaskMemAlloc(cchId * sizeof(WCHAR));
+    if (*Id==NULL) return E_OUTOFMEMORY;
+    IntToWstr(i, *Id, cchId);
+    return NOERROR;
+}
+
+
+
+//
+// CSourceStream::Constructor
+//
+// Registers the new pin with its filter; *phr receives AddPin's result.
+CSourceStream::CSourceStream(
+    TCHAR *pObjectName,
+    HRESULT *phr,
+    CSource *ps,
+    LPCWSTR pPinName)
+    : CBaseOutputPin(pObjectName, ps, ps->pStateLock(), phr, pPinName)
+    , m_pFilter(ps)
+{
+    *phr = m_pFilter->AddPin(this);
+}
+
+#ifdef UNICODE
+// Narrow-string (char) overload for UNICODE builds; otherwise identical
+CSourceStream::CSourceStream(
+    char *pObjectName,
+    HRESULT *phr,
+    CSource *ps,
+    LPCWSTR pPinName)
+    : CBaseOutputPin(pObjectName, ps, ps->pStateLock(), phr, pPinName)
+    , m_pFilter(ps) {
+    *phr = m_pFilter->AddPin(this);
+}
+#endif
+//
+// CSourceStream::Destructor
+//
+// Takes this pin back out of the owning filter's pin array (the pin count drops by one)
+CSourceStream::~CSourceStream(void) {
+
+     m_pFilter->RemovePin(this);
+}
+
+
+//
+// CheckMediaType
+//
+// Default single-type support: accept pMediaType only if it equals the type
+// produced by GetMediaType/1, otherwise answer E_FAIL.
+HRESULT CSourceStream::CheckMediaType(const CMediaType *pMediaType) {
+
+    CAutoLock lock(m_pFilter->pStateLock());
+
+    // Fetch the one type we offer (the HRESULT of this call is not inspected)
+    CMediaType mtOurs;
+    GetMediaType(&mtOurs);
+
+    const HRESULT hrMatch = (mtOurs == *pMediaType) ? NOERROR : E_FAIL;
+
+    return hrMatch;
+}
+
+
+//
+// GetMediaType/3
+//
+// By default we support only one type, so position 0 is the only valid index:
+// negative positions are E_INVALIDARG, positive ones VFW_S_NO_MORE_ITEMS.
+HRESULT CSourceStream::GetMediaType(int iPosition, CMediaType *pMediaType) {
+
+    CAutoLock lock(m_pFilter->pStateLock());
+
+    if (iPosition == 0) {
+        return GetMediaType(pMediaType);
+    }
+    if (iPosition < 0) {
+        return E_INVALIDARG;
+    }
+    return VFW_S_NO_MORE_ITEMS;
+}
+
+
+//
+// Active
+//
+// The pin is active - start up the worker thread
+HRESULT CSourceStream::Active(void) {
+    CAutoLock lock(m_pFilter->pStateLock());
+
+    HRESULT hr;
+    if (m_pFilter->IsActive()) {
+        return S_FALSE; // succeeded, but did not allocate resources (they already exist...)
+    }
+    // do nothing if not connected - its ok not to connect to
+    // all pins of a source filter
+    if (!IsConnected()) {
+        return NOERROR;
+    }
+
+    hr = CBaseOutputPin::Active();
+    if (FAILED(hr)) {
+        return hr;
+    }
+    ASSERT(!ThreadExists());
+
+    // start the thread
+    if (!Create()) {
+        return E_FAIL;
+    }
+
+    // Tell thread to initialize. If OnThreadCreate Fails, so does this.
+    hr = Init();
+    if (FAILED(hr)) {
+        // ThreadProc returns straight after replying with the failure, so
+        // reap the thread here.  Left alone, ThreadExists() would stay true
+        // and the ASSERT above would no longer hold on the next Active().
+        Close();
+        return hr;
+    }
+    return Pause();
+}
+
+
+//
+// Inactive
+//
+// Pin is inactive - shut down the worker thread
+// Waits for the worker to exit before returning.
+// Returns NOERROR on success, otherwise the first failing HRESULT.
+HRESULT CSourceStream::Inactive(void) {
+
+    CAutoLock lock(m_pFilter->pStateLock());
+
+    // An unconnected pin never started anything - it is fine to leave
+    // some pins of a source filter unconnected
+    if (!IsConnected()) {
+        return NOERROR;
+    }
+
+    HRESULT hr;
+
+    // !!! The allocator must be decommitted before we try to stop the
+    // thread, because the worker may be stuck waiting for our own
+    // allocator.  CBaseOutputPin::Inactive() is therefore called first.
+    hr = CBaseOutputPin::Inactive();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // Nothing more to do when no worker is running
+    if (!ThreadExists()) {
+        return NOERROR;
+    }
+
+    // Shut the worker down in stages; a failing stage aborts the shutdown
+    // and its HRESULT is handed back unchanged.
+    hr = Stop();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    hr = Exit();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    Close();    // Wait for the thread to exit, then tidy up.
+
+    return NOERROR;
+}
+
+
+//
+// ThreadProc
+//
+// When this returns the thread exits
+// Return codes > 0 indicate an error occurred
+DWORD CSourceStream::ThreadProc(void) {
+
+    HRESULT hr;  // the return code from calls
+    Command com;
+    // Refuse everything until CMD_INIT arrives; other commands get E_UNEXPECTED
+    do {
+	com = GetRequest();
+	if (com != CMD_INIT) {
+	    DbgLog((LOG_ERROR, 1, TEXT("Thread expected init command")));
+	    Reply((DWORD) E_UNEXPECTED);
+	}
+    } while (com != CMD_INIT);
+
+    DbgLog((LOG_TRACE, 1, TEXT("CSourceStream worker thread initializing")));
+
+    hr = OnThreadCreate(); // perform set up tasks
+    if (FAILED(hr)) {
+        DbgLog((LOG_ERROR, 1, TEXT("CSourceStream::OnThreadCreate failed. Aborting thread.")));
+        OnThreadDestroy();
+	Reply(hr);	// send failed return code from OnThreadCreate
+        return 1;
+    }
+
+    // Initialisation succeeded
+    Reply(NOERROR);
+    // Main command loop: runs until CMD_EXIT has been acknowledged
+    Command cmd;
+    do {
+	cmd = GetRequest();
+
+	switch (cmd) {
+
+	case CMD_EXIT:
+	    Reply(NOERROR);
+	    break;
+
+	case CMD_RUN:
+	    DbgLog((LOG_ERROR, 1, TEXT("CMD_RUN received before a CMD_PAUSE???")));
+	    // !!! fall through??? (no break here, so CMD_RUN is handled as CMD_PAUSE)
+
+	case CMD_PAUSE:
+	    Reply(NOERROR);
+	    DoBufferProcessingLoop();
+	    break;
+
+	case CMD_STOP:
+	    Reply(NOERROR);
+	    break;
+
+	default:
+	    DbgLog((LOG_ERROR, 1, TEXT("Unknown command %d received!"), cmd));
+	    Reply((DWORD) E_NOTIMPL);
+	    break;
+	}
+    } while (cmd != CMD_EXIT);
+
+    hr = OnThreadDestroy();	// tidy up.
+    if (FAILED(hr)) {
+        DbgLog((LOG_ERROR, 1, TEXT("CSourceStream::OnThreadDestroy failed. Exiting thread.")));
+        return 1;
+    }
+
+    DbgLog((LOG_TRACE, 1, TEXT("CSourceStream worker thread exiting")));
+    return 0;
+}
+
+
+//
+// DoBufferProcessingLoop
+//
+// Grabs a buffer and calls the user's processing function.
+// Overridable, so that different delivery styles can be catered for.
+HRESULT CSourceStream::DoBufferProcessingLoop(void) {
+    // Returns S_FALSE once CMD_STOP is seen, S_OK when delivery ends early, or FillBuffer's error
+    Command com;
+    // Called once, before the first buffer is requested
+    OnThreadStartPlay();
+
+    do {
+	while (!CheckRequest(&com)) {
+
+	    IMediaSample *pSample;
+
+	    HRESULT hr = GetDeliveryBuffer(&pSample,NULL,NULL,0);
+	    if (FAILED(hr)) {
+                Sleep(1);
+		continue;	// go round again. Perhaps the error will go away
+			    // or the allocator is decommitted & we will be asked to
+			    // exit soon.
+	    }
+
+	    // Virtual function user will override.
+	    hr = FillBuffer(pSample);
+
+	    if (hr == S_OK) {
+		hr = Deliver(pSample);
+                pSample->Release();
+
+                // downstream filter returns S_FALSE if it wants us to
+                // stop or an error if it's reporting an error.
+                if(hr != S_OK)
+                {
+                  DbgLog((LOG_TRACE, 2, TEXT("Deliver() returned %08x; stopping"), hr));
+                  return S_OK;
+                }
+
+	    } else if (hr == S_FALSE) {
+                // derived class wants us to stop pushing data
+		pSample->Release();
+		DeliverEndOfStream();
+		return S_OK;
+	    } else {
+                // derived class encountered an error
+                pSample->Release();
+		DbgLog((LOG_ERROR, 1, TEXT("Error %08lX from FillBuffer!!!"), hr));
+                DeliverEndOfStream();
+                m_pFilter->NotifyEvent(EC_ERRORABORT, hr, 0);
+                return hr;
+	    }
+
+            // all paths release the sample
+	}
+
+        // For all commands sent to us there must be a Reply call!
+        // CMD_STOP gets no Reply here - NOTE(review): presumably ThreadProc's CMD_STOP case sends it; confirm
+	if (com == CMD_RUN || com == CMD_PAUSE) {
+	    Reply(NOERROR);
+	} else if (com != CMD_STOP) {
+	    Reply((DWORD) E_UNEXPECTED);
+	    DbgLog((LOG_ERROR, 1, TEXT("Unexpected command!!!")));
+	}
+    } while (com != CMD_STOP);
+
+    return S_FALSE;
+}
+
diff --git a/plugins/GSdx_legacy/baseclasses/source.h b/plugins/GSdx_legacy/baseclasses/source.h
new file mode 100644
index 0000000000..bee5356837
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/source.h
@@ -0,0 +1,172 @@
+//------------------------------------------------------------------------------
+// File: Source.h
+//
+// Desc: DirectShow base classes - defines classes to simplify creation of
+//       ActiveX source filters that support continuous generation of data.
+//       No support is provided for IMediaControl or IMediaPosition.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+//
+// Derive your source filter from CSource.
+// During construction either:
+//    Create some CSourceStream objects to manage your pins
+//    Provide the user with a means of doing so eg, an IPersistFile interface.
+//
+// CSource provides:
+//    IBaseFilter interface management
+//    IMediaFilter interface management, via CBaseFilter
+//    Pin counting for CBaseFilter
+//
+// Derive a class from CSourceStream to manage your output pin types
+//  Implement GetMediaType/1 to return the type you support. If you support multiple
+//   types then override GetMediaType/3, CheckMediaType and GetMediaTypeCount.
+//  Implement FillBuffer() to put data into one buffer.
+//
+// CSourceStream provides:
+//    IPin management via CBaseOutputPin
+//    Worker thread management
+
+#ifndef __CSOURCE__
+#define __CSOURCE__
+
+class CSourceStream;  // The class that will handle each pin
+
+
+//
+// CSource
+//
+// Override construction to provide a means of creating
+// CSourceStream derived objects - ie a way of creating pins.
+class CSource : public CBaseFilter {
+public:
+    // Constructors: with or without an HRESULT out-parameter (CHAR* variants are added under UNICODE).
+    CSource(TCHAR *pName, LPUNKNOWN lpunk, CLSID clsid, HRESULT *phr);
+    CSource(TCHAR *pName, LPUNKNOWN lpunk, CLSID clsid);
+#ifdef UNICODE
+    CSource(CHAR *pName, LPUNKNOWN lpunk, CLSID clsid, HRESULT *phr);
+    CSource(CHAR *pName, LPUNKNOWN lpunk, CLSID clsid);
+#endif
+    ~CSource();
+    // Pin enumeration.  NOTE(review): presumably these override CBaseFilter - confirm.
+    int       GetPinCount(void);
+    CBasePin *GetPin(int n);
+
+    // -- Utilities --
+
+    CCritSec*	pStateLock(void) { return &m_cStateLock; }	// provide our critical section
+    // Register / unregister a stream (pin) object with this filter.
+    HRESULT     AddPin(CSourceStream *);
+    HRESULT     RemovePin(CSourceStream *);
+    // Look a pin up by its string Id; the result is handed back through ppPin.
+    STDMETHODIMP FindPin(
+        LPCWSTR Id,
+        IPin ** ppPin
+    );
+    // NOTE(review): presumably maps iPin to its index in m_paStreams - confirm in source.cpp.
+    int FindPinNumber(IPin *iPin);
+
+protected:
+
+    int             m_iPins;       // The number of pins on this filter. Updated by CSourceStream
+    	   			   // constructors & destructors.
+    CSourceStream **m_paStreams;   // the pins on this filter.
+
+    CCritSec m_cStateLock;	// Lock this to serialize function accesses to the filter state
+
+};
+
+
+//
+// CSourceStream
+//
+// Use this class to manage a stream of data that comes from a
+// pin.
+// Uses a worker thread to put data on the pin.
+class CSourceStream : public CAMThread, public CBaseOutputPin {
+public:
+    // pms is the owning CSource.  NOTE(review): *phr presumably receives a construction failure code - confirm.
+    CSourceStream(TCHAR *pObjectName,
+                  HRESULT *phr,
+                  CSource *pms,
+                  LPCWSTR pName);
+#ifdef UNICODE
+    CSourceStream(CHAR *pObjectName,
+                  HRESULT *phr,
+                  CSource *pms,
+                  LPCWSTR pName);
+#endif
+    virtual ~CSourceStream(void);  // virtual destructor ensures derived class destructors are called too.
+
+protected:
+
+    CSource *m_pFilter;	// The parent of this stream
+
+    // *
+    // * Data Source
+    // *
+    // * The following three functions: FillBuffer, OnThreadCreate/Destroy, are
+    // * called from within the ThreadProc. They are used in the creation of
+    // * the media samples this pin will provide
+    // *
+    // FillBuffer results as handled by DoBufferProcessingLoop: S_OK = deliver, S_FALSE = end of stream, else error.
+    // Override this to provide the worker thread a means
+    // of processing a buffer
+    virtual HRESULT FillBuffer(IMediaSample *pSamp) PURE;
+
+    // Called as the thread is created/destroyed - use to perform
+    // jobs such as start/stop streaming mode
+    // If OnThreadCreate returns an error the thread will exit.
+    virtual HRESULT OnThreadCreate(void) {return NOERROR;};
+    virtual HRESULT OnThreadDestroy(void) {return NOERROR;};
+    virtual HRESULT OnThreadStartPlay(void) {return NOERROR;};
+    // The three default hooks above do nothing and report NOERROR.
+    // *
+    // * Worker Thread
+    // *
+
+    HRESULT Active(void);    // Starts up the worker thread
+    HRESULT Inactive(void);  // Exits the worker thread.
+
+public:
+    // thread commands
+    enum Command {CMD_INIT, CMD_PAUSE, CMD_RUN, CMD_STOP, CMD_EXIT};
+    HRESULT Init(void) { return CallWorker(CMD_INIT); }
+    HRESULT Exit(void) { return CallWorker(CMD_EXIT); }
+    HRESULT Run(void) { return CallWorker(CMD_RUN); }
+    HRESULT Pause(void) { return CallWorker(CMD_PAUSE); }
+    HRESULT Stop(void) { return CallWorker(CMD_STOP); }
+    // Each helper above forwards exactly one Command to the worker through CallWorker().
+protected:
+    Command GetRequest(void) { return (Command) CAMThread::GetRequest(); }
+    BOOL    CheckRequest(Command *pCom) { return CAMThread::CheckRequest( (DWORD *) pCom); }
+    // The two wrappers above only cast between Command and the DWORD used by CAMThread.
+    // override these if you want to add thread commands
+    virtual DWORD ThreadProc(void);  		// the thread function
+    // Streaming loop: runs until CMD_STOP is received or streaming ends early.
+    virtual HRESULT DoBufferProcessingLoop(void);    // the loop executed whilst running
+
+
+    // *
+    // * AM_MEDIA_TYPE support
+    // *
+
+    // If you support more than one media type then override these 2 functions
+    virtual HRESULT CheckMediaType(const CMediaType *pMediaType);
+    virtual HRESULT GetMediaType(int iPosition, CMediaType *pMediaType);  // List pos. 0-n
+
+    // If you support only one type then override this fn.
+    // This will only be called by the default implementations
+    // of CheckMediaType and GetMediaType(int, CMediaType*)
+    // You must override this fn. or the above 2!
+    virtual HRESULT GetMediaType(CMediaType *pMediaType) {return E_UNEXPECTED;}
+    // (the single-type default above always fails with E_UNEXPECTED)
+    STDMETHODIMP QueryId(
+        LPWSTR * Id
+    );
+};
+
+#endif // __CSOURCE__
+
diff --git a/plugins/GSdx_legacy/baseclasses/streams.h b/plugins/GSdx_legacy/baseclasses/streams.h
new file mode 100644
index 0000000000..35b2dee3d6
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/streams.h
@@ -0,0 +1,253 @@
+//------------------------------------------------------------------------------
+// File: Streams.h
+//
+// Desc: DirectShow base classes - defines overall streams architecture.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __STREAMS__
+#define __STREAMS__
+
+#ifdef	_MSC_VER
+// disable some level-4 warnings, use #pragma warning(enable:###) to re-enable
+#pragma warning(disable:4100) // warning C4100: unreferenced formal parameter
+#pragma warning(disable:4127) // warning C4127: conditional expression is constant
+#pragma warning(disable:4189) // warning C4189: local variable is initialized but not referenced
+#pragma warning(disable:4201) // warning C4201: nonstandard extension used : nameless struct/union
+#pragma warning(disable:4511) // warning C4511: copy constructor could not be generated
+#pragma warning(disable:4512) // warning C4512: assignment operator could not be generated
+#pragma warning(disable:4514) // warning C4514: unreferenced inline function has been removed
+#pragma warning(disable:4710) // warning C4710: 'function' not inlined
+// AM_NOVTABLE: __declspec(novtable) from _MSC_VER 1100 on, empty for older MSVC; not defined at all without _MSC_VER.
+#if _MSC_VER>=1100
+#define AM_NOVTABLE __declspec(novtable)
+#else
+#define AM_NOVTABLE
+#endif
+#endif	// _MSC_VER
+
+// Because of differences between Visual C++ and older Microsoft SDKs,
+// you may have defined _DEBUG without defining DEBUG.  This logic
+// ensures that both will be set if Visual C++ sets _DEBUG.
+#ifdef _DEBUG
+#ifndef DEBUG
+#define DEBUG
+#endif
+#endif
+
+#include <windows.h>
+#include <windowsx.h>
+#include <olectl.h>
+
+// Disable warning message for C4201 - use of nameless struct/union
+// Otherwise, strmif.h will generate warnings for Win32 debug builds
+#pragma warning( disable : 4201 )
+
+#include <mmsystem.h>
+// NUMELMS(aa): element count of a true array (sizeof-based, so it must not be given a pointer).
+#ifndef NUMELMS
+   #define NUMELMS(aa) (sizeof(aa)/sizeof((aa)[0]))
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// The following definitions come from the Platform SDK and are required if
+// the application is being compiled with the headers from Visual C++ 6.0.
+///////////////////////////////////////////////////////////////////////////
+#ifndef InterlockedExchangePointer // NOTE(review): fallback casts through PLONG/LONG, so it only suits 32-bit pointers - confirm no 64-bit build reaches it
+	#define InterlockedExchangePointer(Target, Value) \
+   (PVOID)InterlockedExchange((PLONG)(Target), (LONG)(Value))
+#endif
+// Fallback WAVEFORMATEXTENSIBLE declaration, used only when _WAVEFORMATEXTENSIBLE_ is not already defined.
+#ifndef _WAVEFORMATEXTENSIBLE_
+#define _WAVEFORMATEXTENSIBLE_
+typedef struct {
+    WAVEFORMATEX    Format;
+    union {
+        WORD wValidBitsPerSample;       /* bits of precision  */
+        WORD wSamplesPerBlock;          /* valid if wBitsPerSample==0 */
+        WORD wReserved;                 /* If neither applies, set to zero. */
+    } Samples;
+    DWORD           dwChannelMask;      /* which channels are */
+                                        /* present in stream  */
+    GUID            SubFormat;
+} WAVEFORMATEXTENSIBLE, *PWAVEFORMATEXTENSIBLE;
+#endif // !_WAVEFORMATEXTENSIBLE_
+// Format tag value that goes with the structure above, defined only if still missing.
+#if !defined(WAVE_FORMAT_EXTENSIBLE)
+#define  WAVE_FORMAT_EXTENSIBLE                 0xFFFE
+#endif // !defined(WAVE_FORMAT_EXTENSIBLE)
+// Without GetWindowLongPtr, map the *Ptr names onto the plain GetWindowLongA/W functions.
+#ifndef GetWindowLongPtr
+  #define GetWindowLongPtrA   GetWindowLongA
+  #define GetWindowLongPtrW   GetWindowLongW
+  #ifdef UNICODE
+    #define GetWindowLongPtr  GetWindowLongPtrW
+  #else
+    #define GetWindowLongPtr  GetWindowLongPtrA
+  #endif // !UNICODE
+#endif // !GetWindowLongPtr
+// Same mapping for the setter: SetWindowLongPtr* falls back to SetWindowLongA/W.
+#ifndef SetWindowLongPtr
+  #define SetWindowLongPtrA   SetWindowLongA
+  #define SetWindowLongPtrW   SetWindowLongW
+  #ifdef UNICODE
+    #define SetWindowLongPtr  SetWindowLongPtrW
+  #else
+    #define SetWindowLongPtr  SetWindowLongPtrA
+  #endif // !UNICODE
+#endif // !SetWindowLongPtr
+// Window-long index fallbacks; each GWLP_* name is defined only when the SDK headers lack it.
+#ifndef GWLP_WNDPROC
+  #define GWLP_WNDPROC        (-4)
+#endif
+#ifndef GWLP_HINSTANCE
+  #define GWLP_HINSTANCE      (-6)
+#endif
+#ifndef GWLP_HWNDPARENT
+  #define GWLP_HWNDPARENT     (-8)
+#endif
+#ifndef GWLP_USERDATA
+  #define GWLP_USERDATA       (-21)
+#endif
+#ifndef GWLP_ID
+  #define GWLP_ID             (-12)
+#endif
+#ifndef DWLP_MSGRESULT
+  #define DWLP_MSGRESULT  0 // dialog-long offset fallbacks; each one is used only when the SDK headers lack it
+#endif
+#ifndef DWLP_DLGPROC
+  #define DWLP_DLGPROC    (DWLP_MSGRESULT + sizeof(LRESULT)) // parenthesized so the sum stays intact next to other operators
+#endif
+#ifndef DWLP_USER
+  #define DWLP_USER       (DWLP_DLGPROC + sizeof(DLGPROC)) // parenthesized for the same reason
+#endif
+///////////////////////////////////////////////////////////////////////////
+// End Platform SDK definitions
+///////////////////////////////////////////////////////////////////////////
+
+#include <initguid.h>
+
+#pragma warning(disable:4201) // warning C4201: nonstandard extension used : nameless struct/union
+#include <strmif.h>     // Generated IDL header file for streams interfaces
+
+#include "reftime.h"    // Helper class for REFERENCE_TIME management
+#include "wxdebug.h"    // Debug support for logging and ASSERTs
+#include "amvideo.h"    // ActiveMovie video interfaces and definitions
+//include amaudio.h explicitly if you need it.  it requires the DirectX SDK.
+//#include "amaudio.h"    // ActiveMovie audio interfaces and definitions
+#include "wxutil.h"     // General helper classes for threads etc
+#include "combase.h"    // Base COM classes to support IUnknown
+//#include "dllsetup.h"   // Filter registration support functions
+#include "measure.h"    // Performance measurement
+#include "comlite.h"    // Light weight com function prototypes
+
+#include "cache.h"      // Simple cache container class
+#include "wxlist.h"     // Non MFC generic list class
+#include "msgthrd.h"	// CMsgThread
+#include "mtype.h"      // Helper class for managing media types
+#include "fourcc.h"     // conversions between FOURCCs and GUIDs
+#include "control.h"    // generated from control.odl
+#include "ctlutil.h"    // control interface utility classes
+#include "evcode.h"     // event code definitions
+#include "amfilter.h"   // Main streams architecture class hierachy
+#include "transfrm.h"   // Generic transform filter
+#include "transip.h"    // Generic transform-in-place filter
+#include "uuids.h"      // declaration of type GUIDs and well-known clsids
+#include "source.h"	// Generic source filter
+#include "outputq.h"    // Output pin queueing
+#include "errors.h"     // HRESULT status and error definitions
+#include "renbase.h"    // Base class for writing ActiveX renderers
+//#include "winutil.h"    // Helps with filters that manage windows
+//#include "winctrl.h"    // Implements the IVideoWindow interface
+//#include "videoctl.h"   // Specifically video related classes
+#include "refclock.h"	// Base clock class
+#include "sysclock.h"	// System clock
+#include "pstream.h"    // IPersistStream helper class
+#include "vtrans.h"     // Video Transform Filter base class
+#include "amextra.h"
+//#include "cprop.h"      // Base property page class
+#include "strmctl.h"    // IAMStreamControl support
+#include "edevdefs.h"   // External device control interface defines
+#include "audevcod.h"   // audio filter device error event codes
+
+#include <tchar.h>
+
+#define NO_SHLWAPI_STRFCNS
+//#include <atlbase.h>
+#include <strsafe.h>
+// Repeats the NUMELMS guard from earlier in this header; it has no effect once NUMELMS is defined.
+#ifndef NUMELMS
+   #define NUMELMS(aa) (sizeof(aa)/sizeof((aa)[0]))
+#endif
+// PIN_INFO wrapper: pFilter starts out NULL and is released when the object is destroyed.
+class CPinInfo : public PIN_INFO
+{
+public:
+	CPinInfo() { this->pFilter = NULL; }
+	~CPinInfo() { if(this->pFilter != NULL) { this->pFilter->Release(); } }
+};
+// FILTER_INFO wrapper: pGraph starts out NULL and is released when the object is destroyed.
+class CFilterInfo : public FILTER_INFO
+{
+public:
+	CFilterInfo() { this->pGraph = NULL; }
+	~CFilterInfo() { if(this->pGraph != NULL) { this->pGraph->Release(); } }
+};
+// Loop over the filters of pFilterGraph (skipped if it is NULL or EnumFilters fails); needs CComPtr from the includer.
+#define BeginEnumFilters(pFilterGraph, pEnumFilters, pBaseFilter) \
+	{CComPtr<IEnumFilters> pEnumFilters; \
+	if(pFilterGraph && SUCCEEDED(pFilterGraph->EnumFilters(&pEnumFilters))) \
+	{ \
+		for(CComPtr<IBaseFilter> pBaseFilter; S_OK == pEnumFilters->Next(1, &pBaseFilter, 0); pBaseFilter = NULL) \
+		{ \
+
+#define EndEnumFilters }}} // closes the three scopes opened by BeginEnumFilters
+// Loop over the filters returned by pGraphConfig->EnumCacheFilter (skipped if pGraphConfig is NULL or the call fails).
+#define BeginEnumCachedFilters(pGraphConfig, pEnumFilters, pBaseFilter) \
+	{CComPtr<IEnumFilters> pEnumFilters; \
+	if(pGraphConfig && SUCCEEDED(pGraphConfig->EnumCacheFilter(&pEnumFilters))) \
+	{ \
+		for(CComPtr<IBaseFilter> pBaseFilter; S_OK == pEnumFilters->Next(1, &pBaseFilter, 0); pBaseFilter = NULL) \
+		{ \
+
+#define EndEnumCachedFilters }}} // closes the three scopes opened by BeginEnumCachedFilters
+// Loop over the pins of pBaseFilter (skipped if it is NULL or EnumPins fails); pPin is reset to NULL between rounds.
+#define BeginEnumPins(pBaseFilter, pEnumPins, pPin) \
+	{CComPtr<IEnumPins> pEnumPins; \
+	if(pBaseFilter && SUCCEEDED(pBaseFilter->EnumPins(&pEnumPins))) \
+	{ \
+		for(CComPtr<IPin> pPin; S_OK == pEnumPins->Next(1, &pPin, 0); pPin = NULL) \
+		{ \
+
+#define EndEnumPins }}} // closes the three scopes opened by BeginEnumPins
+// Loop over the media types of pPin; each AM_MEDIA_TYPE is freed with DeleteMediaType after its round.
+#define BeginEnumMediaTypes(pPin, pEnumMediaTypes, pMediaType) \
+	{CComPtr<IEnumMediaTypes> pEnumMediaTypes; \
+	if(pPin && SUCCEEDED(pPin->EnumMediaTypes(&pEnumMediaTypes))) \
+	{ \
+		AM_MEDIA_TYPE* pMediaType = NULL; \
+		for(; S_OK == pEnumMediaTypes->Next(1, &pMediaType, NULL); DeleteMediaType(pMediaType), pMediaType = NULL) \
+		{ \
+
+#define EndEnumMediaTypes(pMediaType) } if(pMediaType) DeleteMediaType(pMediaType); }} // also frees a type left over when the loop body exits early
+// Loop over the device monikers of category clsid; the loop is skipped when either COM call fails or no enumerator comes back.
+#define BeginEnumSysDev(clsid, pMoniker) \
+	{CComPtr<ICreateDevEnum> pDevEnum4$##clsid; \
+	CComPtr<IEnumMoniker> pClassEnum4$##clsid; \
+	if(SUCCEEDED(pDevEnum4$##clsid.CoCreateInstance(CLSID_SystemDeviceEnum)) \
+	&& SUCCEEDED(pDevEnum4$##clsid->CreateClassEnumerator(clsid, &pClassEnum4$##clsid, 0)) \
+	&& pClassEnum4$##clsid) \
+	{ \
+		for(CComPtr<IMoniker> pMoniker; pClassEnum4$##clsid->Next(1, &pMoniker, 0) == S_OK; pMoniker = NULL) \
+		{ \
+
+#define EndEnumSysDev }}} // closes the three scopes opened by BeginEnumSysDev
+
+#else
+    #ifdef DEBUG
+    #pragma message("STREAMS.H included TWICE")
+    #endif
+#endif // __STREAMS__
+
diff --git a/plugins/GSdx_legacy/baseclasses/strmctl.cpp b/plugins/GSdx_legacy/baseclasses/strmctl.cpp
new file mode 100644
index 0000000000..cba7e9b6ec
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/strmctl.cpp
@@ -0,0 +1,401 @@
+//------------------------------------------------------------------------------
+// File: StrmCtl.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include "strmctl.h"
+// Starts FLOWING with no scheduled start/stop; every scalar/pointer member gets a defined value, in declaration order.
+CBaseStreamControl::CBaseStreamControl()
+: m_StreamState(STREAM_FLOWING)
+, m_StreamStateOnStop(STREAM_FLOWING) // means no pending stop
+, m_tStartTime(MAX_TIME)
+, m_tStopTime(MAX_TIME)
+, m_dwStartCookie(0)
+, m_dwStopCookie(0)
+, m_bIsFlushing(FALSE)
+, m_bStopSendExtra(FALSE), m_bStopExtraSent(FALSE)
+, m_pRefClock(NULL), m_pSink(NULL) // the sink stays NULL until SetFilterGraph() supplies one
+, m_FilterState(State_Stopped), m_tRunStart(0)
+{}
+// Destructor: gives up the reference held on the sync-source clock, if any.
+CBaseStreamControl::~CBaseStreamControl()
+{
+    // SetSyncSource(NULL) releases m_pRefClock when one is set
+    // and leaves the pointer NULL.
+    SetSyncSource(NULL);
+}
+
+// Schedule a stop.  ptStop NULL: discard from now on; *ptStop == MAX_TIME: cancel a pending stop; else stop at *ptStop.
+STDMETHODIMP CBaseStreamControl::StopAt(const REFERENCE_TIME * ptStop, BOOL bSendExtra, DWORD dwCookie)
+{
+    CAutoLock lck(&m_CritSec);
+    m_bStopSendExtra = FALSE;	// reset
+    m_bStopExtraSent = FALSE;
+    if (ptStop)
+    {
+        if (*ptStop == MAX_TIME)
+        {
+            DbgLog((LOG_TRACE,2,TEXT("StopAt: Cancel stop")));
+            CancelStop();
+	    // If there's now a command to start in the future, we assume
+	    // they want to be stopped when the graph is first run
+	    if (m_FilterState == State_Stopped && m_tStartTime < MAX_TIME) {
+	        m_StreamState = STREAM_DISCARDING;
+                DbgLog((LOG_TRACE,2,TEXT("graph will begin by DISCARDING")));
+	    }
+            return NOERROR; // note: this path returns without signalling m_StreamEvent
+        }
+        DbgLog((LOG_TRACE,2,TEXT("StopAt: %dms extra=%d"),
+				(int)(*ptStop/10000), bSendExtra));
+	// if the first command is to stop in the future, then we assume they
+        // want to be started when the graph is first run
+	if (m_FilterState == State_Stopped && m_tStartTime > *ptStop) {
+	    m_StreamState = STREAM_FLOWING;
+            DbgLog((LOG_TRACE,2,TEXT("graph will begin by FLOWING")));
+	}
+        m_bStopSendExtra = bSendExtra;
+        m_tStopTime = *ptStop;
+        m_dwStopCookie = dwCookie;
+        m_StreamStateOnStop = STREAM_DISCARDING; // state ExecuteStop() will switch to
+    }
+    else
+    {
+        DbgLog((LOG_TRACE,2,TEXT("StopAt: now")));
+	// sending an extra frame when told to stop now would mess people up
+        m_bStopSendExtra = FALSE;
+        m_tStopTime = MAX_TIME;
+        m_dwStopCookie = 0;
+        m_StreamState = STREAM_DISCARDING;
+        m_StreamStateOnStop = STREAM_FLOWING;	// no pending stop
+    }
+    // we might change our mind what to do with a sample we're blocking
+    m_StreamEvent.Set();
+    return NOERROR;
+}
+// Schedule a start.  ptStart NULL: flow from now on; *ptStart == MAX_TIME: cancel a pending start; else start at *ptStart.
+STDMETHODIMP CBaseStreamControl::StartAt
+( const REFERENCE_TIME *ptStart, DWORD dwCookie )
+{
+    CAutoLock lck(&m_CritSec);
+    if (ptStart)
+    {
+        if (*ptStart == MAX_TIME)
+        {
+            DbgLog((LOG_TRACE,2,TEXT("StartAt: Cancel start")));
+            CancelStart();
+	    // If there's now a command to stop in the future, we assume
+	    // they want to be started when the graph is first run
+	    if (m_FilterState == State_Stopped && m_tStopTime < MAX_TIME) {
+                DbgLog((LOG_TRACE,2,TEXT("graph will begin by FLOWING")));
+	        m_StreamState = STREAM_FLOWING;
+	    }
+            return NOERROR; // note: this path returns without signalling m_StreamEvent
+        }
+        DbgLog((LOG_TRACE,2,TEXT("StartAt: %dms"), (int)(*ptStart/10000)));
+	// if the first command is to start in the future, then we assume they
+        // want to be stopped when the graph is first run
+	if (m_FilterState == State_Stopped && m_tStopTime >= *ptStart) {
+            DbgLog((LOG_TRACE,2,TEXT("graph will begin by DISCARDING")));
+	    m_StreamState = STREAM_DISCARDING;
+	}
+        m_tStartTime = *ptStart;
+        m_dwStartCookie = dwCookie;
+        // if (m_tStopTime == m_tStartTime) CancelStop();
+    }
+    else
+    {
+        DbgLog((LOG_TRACE,2,TEXT("StartAt: now")));
+        m_tStartTime = MAX_TIME;
+        m_dwStartCookie = 0;
+        m_StreamState = STREAM_FLOWING;
+    }
+    // we might change our mind what to do with a sample we're blocking
+    m_StreamEvent.Set();
+    return NOERROR;
+}
+//  Retrieve information about current settings (start/stop times, cookies, state flags).
+//  Returns E_POINTER if pInfo is NULL, otherwise S_OK.
+STDMETHODIMP CBaseStreamControl::GetInfo(AM_STREAM_INFO *pInfo)
+{
+    if (pInfo == NULL)
+	return E_POINTER;
+    CAutoLock lck(&m_CritSec);	// take one consistent snapshot; StartAt/StopAt change these fields under this lock
+    pInfo->tStart = m_tStartTime;
+    pInfo->tStop  = m_tStopTime;
+    pInfo->dwStartCookie = m_dwStartCookie;
+    pInfo->dwStopCookie  = m_dwStopCookie;
+    pInfo->dwFlags = m_bStopSendExtra ? AM_STREAM_INFO_STOP_SEND_EXTRA : 0;
+    pInfo->dwFlags |= m_tStartTime == MAX_TIME ? 0 : AM_STREAM_INFO_START_DEFINED;
+    pInfo->dwFlags |= m_tStopTime == MAX_TIME ? 0 : AM_STREAM_INFO_STOP_DEFINED;
+    switch (m_StreamState) {
+    default:
+        DbgBreak("Invalid stream state");	// then falls through to the FLOWING case
+    case STREAM_FLOWING:
+        break;
+    case STREAM_DISCARDING:
+        pInfo->dwFlags |= AM_STREAM_INFO_DISCARDING;
+        break;
+    }
+    return S_OK;
+}
+
+// Apply the pending stop now (m_CritSec ownership is asserted); notify the sink when a cookie and a sink are both set.
+void CBaseStreamControl::ExecuteStop()
+{
+    ASSERT(CritCheckIn(&m_CritSec));
+    m_StreamState = m_StreamStateOnStop;
+    const bool fNotify = (m_dwStopCookie != 0) && (m_pSink != NULL);
+    if (fNotify) {
+        DbgLog((LOG_TRACE,2,TEXT("*sending EC_STREAM_CONTROL_STOPPED (%d)"), m_dwStopCookie));
+        m_pSink->Notify(EC_STREAM_CONTROL_STOPPED, (LONG_PTR)this, m_dwStopCookie);
+    }
+    CancelStop(); // clears the stop time, the cookie and the on-stop state
+}
+// Switch to FLOWING now (m_CritSec ownership is asserted); notify the sink only when a cookie AND a sink are set.
+void CBaseStreamControl::ExecuteStart()
+{
+    ASSERT(CritCheckIn(&m_CritSec));
+    m_StreamState = STREAM_FLOWING;
+    if (m_dwStartCookie && m_pSink) {	// same m_pSink guard as ExecuteStop(): no sink, no notification
+	DbgLog((LOG_TRACE,2,TEXT("*sending EC_STREAM_CONTROL_STARTED (%d)"),
+							m_dwStartCookie));
+        m_pSink->Notify(EC_STREAM_CONTROL_STARTED, (LONG_PTR)this, m_dwStartCookie);
+    }
+    CancelStart(); // This will do the tidy up
+}
+// Forget any scheduled stop: FLOWING after a stop, no cookie, no stop time.
+void CBaseStreamControl::CancelStop()
+{
+    ASSERT(CritCheckIn(&m_CritSec));
+    m_StreamStateOnStop = STREAM_FLOWING;
+    m_dwStopCookie = 0;
+    m_tStopTime = MAX_TIME;
+}
+// Forget any scheduled start: no cookie and no start time.
+void CBaseStreamControl::CancelStart()
+{
+    ASSERT(CritCheckIn(&m_CritSec));
+    m_dwStartCookie = 0;
+    m_tStartTime = MAX_TIME;
+}
+
+
+// This function returns one of the two StreamControlState values.  Here's what the caller
+// should do for each one:
+//
+// STREAM_FLOWING:      Proceed as usual (render or pass the sample on)
+// STREAM_DISCARDING:   Calculate the time 'til *pSampleStart and wait that long
+//                      for the event handle (GetStreamEventHandle()).  If the
+//                      wait expires, throw the sample away.  If the event
+//			fires, call me back, I've changed my mind.
+//			I use pSampleStart (not Stop) so that live sources don't
+// 			block for the duration of their samples, since the clock
+//			will always read approximately pSampleStart when called
+
+
+// All through this code, you'll notice the following rules:
+// - When start and stop time are the same, it's as if start was first
+// - An event is considered inside the sample when it's >= sample start time
+//   but < sample stop time
+// - if any part of the sample is supposed to be sent, we'll send the whole
+//   thing since we don't break it into smaller pieces
+// - If we skip over a start or stop without doing it, we still signal the event
+//   and reset ourselves in case somebody's waiting for the event, and to make
+//   sure we notice that the event is past and should be forgotten
+// Here are the 19 cases that have to be handled (x=start o=stop <-->=sample):
+//
+// 1.	xo<-->		start then stop
+// 2.	ox<-->		stop then start
+// 3.	 x<o->		start
+// 4.	 o<x->		stop then start
+// 5.	 x<-->o		start
+// 6.	 o<-->x		stop
+// 7.	  <x->o		start
+// 8.	  <o->x		no change
+// 9.	  <xo>		start
+// 10.	  <ox>		stop then start
+// 11.	  <-->xo	no change
+// 12.	  <-->ox	no change
+// 13.	 x<-->		start
+// 14.    <x->		start
+// 15.    <-->x		no change
+// 16.   o<-->		stop
+// 17.	  <o->		no change
+// 18.	  <-->o		no change
+// 19.    <-->		no change
+
+// Compare the pending start/stop times with [*pSampleStart, *pSampleStop), execute whichever are due, return m_StreamState.
+enum CBaseStreamControl::StreamControlState CBaseStreamControl::CheckSampleTimes
+( const REFERENCE_TIME * pSampleStart, const REFERENCE_TIME * pSampleStop )
+{
+    CAutoLock lck(&m_CritSec);
+    // ExecuteStart/ExecuteStop assert that m_CritSec is held, hence the lock above.
+    ASSERT(!m_bIsFlushing);
+    ASSERT(pSampleStart && pSampleStop);
+
+    // Don't ask me how I came up with the code below to handle all 19 cases
+    // - DannyMi
+    // Stop time is at or after the sample start (MAX_TIME, i.e. no stop pending, lands here too).
+    if (m_tStopTime >= *pSampleStart)
+    {
+        if (m_tStartTime >= *pSampleStop)
+	    return m_StreamState;		// cases  8 11 12 15 17 18 19
+	if (m_tStopTime < m_tStartTime)
+	    ExecuteStop();			// case 10
+	ExecuteStart();                         // cases 3 5 7 9 13 14
+	return m_StreamState;
+    }
+    // From here on the stop time lies before the sample start.
+    if (m_tStartTime >= *pSampleStop)
+    {
+        ExecuteStop();                          // cases 6 16
+        return m_StreamState;
+    }
+    // Stop precedes the sample and start precedes the sample end: run both in time order (tie: start first).
+    if (m_tStartTime <= m_tStopTime)
+    {
+	ExecuteStart();
+	ExecuteStop();
+        return m_StreamState;		// case 1
+    }
+    else
+    {
+	ExecuteStop();
+	ExecuteStart();
+        return m_StreamState;		// cases 2 4
+    }
+}
+
+// Decide whether pSample should flow or be discarded; may block on the stream event until that is known.
+enum CBaseStreamControl::StreamControlState CBaseStreamControl::CheckStreamState( IMediaSample * pSample )
+{
+    // A NULL sample or a failing GetTime() means there are no time stamps to schedule against.
+    REFERENCE_TIME rtBufferStart, rtBufferStop;
+    const BOOL bNoBufferTimes =
+              pSample == NULL ||
+              FAILED(pSample->GetTime(&rtBufferStart, &rtBufferStop));
+
+    StreamControlState state;
+    LONG lWait;
+    // m_CritSec is not taken in this function itself; CheckSampleTimes() takes it internally.
+    do
+        {
+ 	    // something has to break out of the blocking
+            if (m_bIsFlushing || m_FilterState == State_Stopped)
+		return STREAM_DISCARDING;
+
+            if (bNoBufferTimes) {
+                //  Can't do anything until we get a time stamp
+                state = m_StreamState;
+                break;
+            } else {
+                state = CheckSampleTimes( &rtBufferStart, &rtBufferStop );
+                if (state == STREAM_FLOWING)
+		    break;
+
+		// we aren't supposed to send this, but we've been
+		// told to send one more than we were supposed to
+		// (and the stop isn't still pending and we're streaming)
+		if (m_bStopSendExtra && !m_bStopExtraSent &&
+					m_tStopTime == MAX_TIME &&
+					m_FilterState != State_Stopped) {
+		    m_bStopExtraSent = TRUE;
+		    DbgLog((LOG_TRACE,2,TEXT("%d sending an EXTRA frame"),
+							    m_dwStopCookie));
+		    state = STREAM_FLOWING;
+		    break;
+		}
+            }
+
+            // We're in discarding mode
+
+            // If we've no clock, discard as fast as we can
+            if (!m_pRefClock) {
+		break;
+
+	    // If we're paused, we can't discard in a timely manner because
+	    // there's no such thing as stream times.  We must block until
+	    // we run or stop, or we'll end up throwing the whole stream away
+	    // as quickly as possible
+	    } else if (m_FilterState == State_Paused) {
+		lWait = INFINITE;
+
+	    } else {
+	        // wait until it's time for the sample until we say "discard"
+	        // ("discard in a timely fashion")
+	        REFERENCE_TIME rtNow;
+                EXECUTE_ASSERT(SUCCEEDED(m_pRefClock->GetTime(&rtNow)));
+                rtNow -= m_tRunStart;   // Into relative ref-time
+                lWait = LONG((rtBufferStart - rtNow)/10000); // 100ns -> ms
+                if (lWait < 10) break; // Not worth waiting - discard early
+	    }
+    // Wait up to lWait ms; if the event fires first, loop and evaluate again.
+    } while(WaitForSingleObject(GetStreamEventHandle(), lWait) != WAIT_TIMEOUT);
+    // Reached through a break or a wait timeout; state holds the last evaluation.
+    return state;
+}
+
+// Track the owning filter's state; tStart is stored as m_tRunStart when entering State_Running.
+void CBaseStreamControl::NotifyFilterState( FILTER_STATE new_state, REFERENCE_TIME tStart )
+{
+    CAutoLock lck(&m_CritSec);
+
+    // or we will get confused
+    if (m_FilterState == new_state)
+	return;
+
+    switch (new_state)
+    {
+        case State_Stopped:
+
+            DbgLog((LOG_TRACE,2,TEXT("Filter is STOPPED")));
+
+	    // execute any pending starts and stops in the right order,
+	    // to make sure all notifications get sent, and we end up
+	    // in the right state to begin next time (??? why not?)
+	    // MAX_TIME marks "nothing pending" for either time.
+	    if (m_tStartTime != MAX_TIME && m_tStopTime == MAX_TIME) {
+		ExecuteStart();
+	    } else if (m_tStopTime != MAX_TIME && m_tStartTime == MAX_TIME) {
+		ExecuteStop();
+	    } else if (m_tStopTime != MAX_TIME && m_tStartTime != MAX_TIME) {
+		if (m_tStartTime <= m_tStopTime) {
+		    ExecuteStart();
+		    ExecuteStop();
+		} else {
+		    ExecuteStop();
+		    ExecuteStart();
+		}
+	    }
+	    // always start off flowing when the graph starts streaming
+	    // unless told otherwise
+	    m_StreamState = STREAM_FLOWING;
+            m_FilterState = new_state;
+            break;
+
+        case State_Running:
+
+            DbgLog((LOG_TRACE,2,TEXT("Filter is RUNNING")));
+
+            m_tRunStart = tStart;
+            // fall-through
+
+        default: // case State_Paused:
+            m_FilterState = new_state;
+    }
+    // unblock!
+    m_StreamEvent.Set();
+}
+
+// Record whether a flush is in progress and signal the stream event.
+void CBaseStreamControl::Flushing(BOOL bInProgress)
+{
+    CAutoLock guard(&m_CritSec);
+    m_bIsFlushing = bInProgress;
+    m_StreamEvent.Set();	// wakes a CheckStreamState() call waiting on this event
+}
diff --git a/plugins/GSdx_legacy/baseclasses/strmctl.h b/plugins/GSdx_legacy/baseclasses/strmctl.h
new file mode 100644
index 0000000000..99732877c9
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/strmctl.h
@@ -0,0 +1,157 @@
+//------------------------------------------------------------------------------
+// File: StrmCtl.h
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __strmctl_h__
+#define __strmctl_h__
+// IAMStreamControl helper: tracks scheduled start/stop times and tells a pin whether each sample flows or is discarded.
+class CBaseStreamControl : public IAMStreamControl
+{
+public:
+    // Used by the implementation
+    enum StreamControlState
+    { STREAM_FLOWING = 0x1000,
+      STREAM_DISCARDING
+    };
+
+private:
+    enum StreamControlState	m_StreamState;		// Current stream state
+    enum StreamControlState	m_StreamStateOnStop;	// State after next stop
+						// (i.e. Flowing or Discarding)
+
+    REFERENCE_TIME	m_tStartTime;	    // MAX_TIME implies none
+    REFERENCE_TIME	m_tStopTime;	    // MAX_TIME implies none
+    DWORD		m_dwStartCookie;    // Cookie for notification to app
+    DWORD		m_dwStopCookie;	    // Cookie for notification to app
+    volatile BOOL       m_bIsFlushing;        // No optimization pls!
+    volatile BOOL	m_bStopSendExtra;   // bSendExtra was set
+    volatile BOOL	m_bStopExtraSent;   // the extra one was sent
+
+    CCritSec		m_CritSec;	    // CritSec to guard above attributes
+
+    // Event to fire when we can come
+    // out of blocking, or to come out of waiting
+    // to discard if we change our minds.
+    //
+    CAMEvent			m_StreamEvent;
+
+    // All of these methods execute immediately.  Helpers for others.
+    //
+    void ExecuteStop();
+    void ExecuteStart();
+    void CancelStop();
+    void CancelStart();
+
+    // Some things we need to be told by our owning filter
+    // Your pin must also expose IAMStreamControl when QI'd for it!
+    //
+    IReferenceClock *	m_pRefClock;	    // Need it to set advises
+					    // Filter must tell us via
+					    // SetSyncSource
+    IMediaEventSink *   m_pSink;            // Event sink
+					    // Filter must tell us after it
+					    // creates it in JoinFilterGraph()
+    FILTER_STATE	m_FilterState;	    // Just need it!
+					    // Filter must tell us via
+					    // NotifyFilterState
+    REFERENCE_TIME	m_tRunStart;	    // Per the Run call to the filter
+
+    // This function returns one of the two StreamControlState values.  Here's what
+    // the caller should do for each one:
+    //
+    // STREAM_FLOWING:		Proceed as usual (render or pass the sample on)
+    // STREAM_DISCARDING:	Calculate the time 'til *pSampleStart and wait
+    //				that long for the event handle
+    //				(GetStreamEventHandle()).  If the wait
+    //				expires, throw the sample away.  If the event
+    //				fires, call me back - I've changed my mind.
+    //
+    enum StreamControlState CheckSampleTimes( const REFERENCE_TIME * pSampleStart,
+					      const REFERENCE_TIME * pSampleStop );
+
+public:
+    // You don't have to tell us much when we're created, but there are other
+    // obligations that must be met.  See SetSyncSource & NotifyFilterState
+    // below.
+    //
+    CBaseStreamControl();
+    ~CBaseStreamControl();
+
+    // If you want this class to work properly, there are things you need to
+    // (keep) telling it.  Filters with pins that use this class
+    // should ensure that they pass through to this method any calls they
+    // receive on their SetSyncSource.
+
+    // We need a clock to see what time it is.  This is for the
+    // "discard in a timely fashion" logic.  If we discard everything as
+    // quick as possible, a whole 60 minute file could get discarded in the
+    // first 10 seconds, and if somebody wants to turn streaming on at 30
+    // minutes into the file, and they make the call more than a few seconds
+    // after the graph is run, it may be too late!
+    // So we hold every sample until its time has gone, then we discard it.
+    // The filter should call this when it gets a SetSyncSource
+    //
+    void SetSyncSource( IReferenceClock * pRefClock )
+    {
+	CAutoLock lck(&m_CritSec);
+	if (m_pRefClock) m_pRefClock->Release();
+	m_pRefClock = pRefClock;
+	if (m_pRefClock) m_pRefClock->AddRef();
+    }
+
+    // Set event sink for notifications
+    // The filter should call this in its JoinFilterGraph after it creates the
+    // IMediaEventSink
+    // (the pointer is stored as-is: no AddRef and no lock are taken here)
+    void SetFilterGraph( IMediaEventSink *pSink ) {
+        m_pSink = pSink;
+    }
+
+    // Since we schedule in stream time, we need the tStart and must track the
+    // state of our owning filter.
+    // The app should call this on every state change
+    //
+    void NotifyFilterState( FILTER_STATE new_state, REFERENCE_TIME tStart = 0 );
+
+    // Filter should call Flushing(TRUE) in BeginFlush,
+    // and Flushing(FALSE) in EndFlush.
+    //
+    void Flushing( BOOL bInProgress );
+
+
+    // The two main methods of IAMStreamControl
+
+    // Class adds default values suitable for immediate
+    // muting and unmuting of the stream.
+
+    STDMETHODIMP StopAt( const REFERENCE_TIME * ptStop = NULL,
+			 BOOL bSendExtra = FALSE,
+			 DWORD dwCookie = 0 );
+    STDMETHODIMP StartAt( const REFERENCE_TIME * ptStart = NULL,
+		    	  DWORD dwCookie = 0 );
+    STDMETHODIMP GetInfo( AM_STREAM_INFO *pInfo);
+
+    // Helper function for pin's receive method.  Call this with
+    // the sample and we'll tell you what to do with it.  We'll do a
+    // WaitForSingleObject within this call if one is required.  This is
+    // a "What should I do with this sample?" kind of call. We'll tell the
+    // caller to either flow it or discard it.
+    // If pSample is NULL we evaluate based on the current state
+    // settings
+    enum StreamControlState CheckStreamState( IMediaSample * pSample );
+
+private:
+    // These don't require locking, but we are relying on the fact that
+    // m_StreamState can be retrieved with integrity, and is a snap shot that
+    // may have just been, or may be just about to be, changed.
+    HANDLE GetStreamEventHandle() const { return m_StreamEvent; }
+    enum StreamControlState GetStreamState() const { return m_StreamState; }
+    BOOL IsStreaming() const { return m_StreamState == STREAM_FLOWING; }
+};
+
+#endif
diff --git a/plugins/GSdx_legacy/baseclasses/sysclock.cpp b/plugins/GSdx_legacy/baseclasses/sysclock.cpp
new file mode 100644
index 0000000000..5b6cf0c7fa
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/sysclock.cpp
@@ -0,0 +1,74 @@
+//------------------------------------------------------------------------------
+// File: SysClock.cpp
+//
+// Desc: DirectShow base classes - implements a system clock based on
+//       IReferenceClock.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include <limits.h>
+
+
+#ifdef FILTER_DLL
+
+/* List of class IDs and creator functions for the class factory. This
+   provides the link between the OLE entry point in the DLL and an object
+   being created. The class factory will call the static CreateInstance
+   function when it is asked to create a CLSID_SystemClock object */
+
+CFactoryTemplate g_Templates[1] = {
+    {&CLSID_SystemClock, CSystemClock::CreateInstance}
+};
+
+int g_cTemplates = sizeof(g_Templates) / sizeof(g_Templates[0]);
+#endif
+
+/* This goes in the factory template table to create new instances */
+CUnknown * WINAPI CSystemClock::CreateInstance(LPUNKNOWN pUnk,HRESULT *phr)
+{
+    return new CSystemClock(NAME("System reference clock"),pUnk, phr);
+}
+
+
+CSystemClock::CSystemClock(TCHAR *pName,LPUNKNOWN pUnk,HRESULT *phr) :
+    CBaseReferenceClock(pName, pUnk, phr)
+{
+}
+
+STDMETHODIMP CSystemClock::NonDelegatingQueryInterface(
+    REFIID riid,
+    void ** ppv)
+{
+    // Interfaces implemented directly by this class are answered here.
+    if (riid == IID_IPersist)
+    {
+        return GetInterface(static_cast<IPersist *>(this), ppv);
+    }
+    if (riid == IID_IAMClockAdjust)
+    {
+        return GetInterface(static_cast<IAMClockAdjust *>(this), ppv);
+    }
+
+    // Anything else is left to the base reference clock.
+    return CBaseReferenceClock::NonDelegatingQueryInterface(riid, ppv);
+}
+
+/* Return the clock's clsid */
+STDMETHODIMP
+CSystemClock::GetClassID(CLSID *pClsID)
+{
+    CheckPointer(pClsID,E_POINTER);
+    ValidateReadWritePtr(pClsID,sizeof(CLSID));
+    *pClsID = CLSID_SystemClock;
+    return NOERROR;
+}
+
+
+STDMETHODIMP
+CSystemClock::SetClockDelta(REFERENCE_TIME rtDelta)
+{
+    return SetTimeDelta(rtDelta);
+}
diff --git a/plugins/GSdx_legacy/baseclasses/sysclock.h b/plugins/GSdx_legacy/baseclasses/sysclock.h
new file mode 100644
index 0000000000..d55015f2f5
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/sysclock.h
@@ -0,0 +1,39 @@
+//------------------------------------------------------------------------------
+// File: SysClock.h
+//
+// Desc: DirectShow base classes - defines a system clock implementation of
+//       IReferenceClock.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __SYSTEMCLOCK__
+#define __SYSTEMCLOCK__
+
+//
+// Base clock.  Uses timeGetTime ONLY
+// Uses most of the code in the base reference clock.
+// Provides GetTime
+//
+
+class CSystemClock : public CBaseReferenceClock, public IAMClockAdjust, public IPersist
+{
+public:
+    // Static factory used to create an instance of ourselves, plus the constructor
+    static CUnknown * WINAPI CreateInstance(LPUNKNOWN pUnk, HRESULT *phr);
+    CSystemClock(TCHAR *pName, LPUNKNOWN pUnk, HRESULT *phr);
+
+    DECLARE_IUNKNOWN
+    // Adds IPersist and IAMClockAdjust on top of what the base clock exposes
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid,void ** ppv);
+
+    // Yield up our class id so that we can be persisted
+    // (the required IPersist method)
+    STDMETHODIMP GetClassID(CLSID *pClsID);
+
+    //  IAMClockAdjust methods
+    STDMETHODIMP SetClockDelta(REFERENCE_TIME rtDelta);
+}; //CSystemClock
+
+#endif /* __SYSTEMCLOCK__ */
diff --git a/plugins/GSdx_legacy/baseclasses/transfrm.cpp b/plugins/GSdx_legacy/baseclasses/transfrm.cpp
new file mode 100644
index 0000000000..6ba8af266a
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/transfrm.cpp
@@ -0,0 +1,1016 @@
+//------------------------------------------------------------------------------
+// File: Transfrm.cpp
+//
+// Desc: DirectShow base classes - implements class for simple transform
+//       filters such as video decompressors.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include "measure.h"
+
+
+// =================================================================
+// Implements the CTransformFilter class
+// =================================================================
+
+CTransformFilter::CTransformFilter(TCHAR     *pName,
+                                   LPUNKNOWN pUnk,
+                                   REFCLSID  clsid) :
+    CBaseFilter(pName,pUnk,&m_csFilter, clsid),
+    m_pInput(NULL),
+    m_pOutput(NULL),
+    m_bEOSDelivered(FALSE),
+    m_bQualityChanged(FALSE),
+    m_bSampleSkipped(FALSE)
+{
+#ifdef PERF
+    RegisterPerfId();
+#endif //  PERF
+}
+
+#ifdef UNICODE
+CTransformFilter::CTransformFilter(char     *pName,
+                                   LPUNKNOWN pUnk,
+                                   REFCLSID  clsid) :
+    CBaseFilter(pName,pUnk,&m_csFilter, clsid),
+    m_pInput(NULL),
+    m_pOutput(NULL),
+    m_bEOSDelivered(FALSE),
+    m_bQualityChanged(FALSE),
+    m_bSampleSkipped(FALSE)
+{
+#ifdef PERF
+    RegisterPerfId();
+#endif //  PERF
+}
+#endif
+
+// destructor
+
+CTransformFilter::~CTransformFilter()
+{
+    // Delete the pins
+
+    delete m_pInput;
+    delete m_pOutput;
+}
+
+
+// Transform place holder - should never be called
+HRESULT CTransformFilter::Transform(IMediaSample * pIn, IMediaSample *pOut)
+{
+    UNREFERENCED_PARAMETER(pIn);
+    UNREFERENCED_PARAMETER(pOut);
+    DbgBreak("CTransformFilter::Transform() should never be called");
+    return E_UNEXPECTED;
+}
+
+
+// return the number of pins we provide
+
+int CTransformFilter::GetPinCount()
+{
+    return 2;
+}
+
+
+// return a non-addrefed CBasePin * for the user to addref if he holds onto it
+// for longer than his pointer to us. We create the pins dynamically when they
+// are asked for rather than in the constructor. This is because we want to
+// give the derived class an opportunity to return different pin objects
+
+// We return the objects as and when they are needed. If either of these fails
+// then we return NULL, the assumption being that the caller will realise the
+// whole deal is off and destroy us - which in turn will delete everything.
+
+CBasePin *
+CTransformFilter::GetPin(int n)
+{
+    // Both pins are created together, lazily, on the first request so that
+    // a derived class overriding GetPin can supply its own pin objects.
+    if (m_pInput == NULL) {
+
+        HRESULT hrPin = S_OK;
+
+        m_pInput = new CTransformInputPin(NAME("Transform input pin"),
+                                          this,              // Owner filter
+                                          &hrPin,            // Result code
+                                          L"XForm In");      // Pin name
+
+        // The pin constructor is not expected to report a failure
+        ASSERT(SUCCEEDED(hrPin));
+        if (m_pInput == NULL) {
+            return NULL;
+        }
+
+        m_pOutput = new CTransformOutputPin(NAME("Transform output pin"),
+                                            this,            // Owner filter
+                                            &hrPin,          // Result code
+                                            L"XForm Out");   // Pin name
+
+        // The pin constructor is not expected to report a failure
+        ASSERT(SUCCEEDED(hrPin));
+
+        // Without an output pin the input pin is useless; discard it so
+        // that both members end up NULL.
+        if (m_pOutput == NULL) {
+            delete m_pInput;
+            m_pInput = NULL;
+        }
+    }
+
+    // Index 0 is the input pin, index 1 the output pin
+    switch (n) {
+    case 0:
+        return m_pInput;
+    case 1:
+        return m_pOutput;
+    default:
+        return NULL;
+    }
+}
+
+
+//
+// FindPin
+//
+// If Id is In or Out then return the IPin* for that pin
+// creating the pin if need be.  Otherwise return NULL with an error.
+
+STDMETHODIMP CTransformFilter::FindPin(LPCWSTR Id, IPin **ppPin)
+{
+    CheckPointer(ppPin,E_POINTER);
+    ValidateReadWritePtr(ppPin,sizeof(IPin *));
+
+    // Map the pin id string onto the index understood by GetPin
+    int iPin;
+    if (lstrcmpW(Id,L"In") == 0) {
+        iPin = 0;
+    } else if (lstrcmpW(Id,L"Out") == 0) {
+        iPin = 1;
+    } else {
+        *ppPin = NULL;
+        return VFW_E_NOT_FOUND;
+    }
+    //  GetPin creates the pins on demand and could fail if memory is low.
+    *ppPin = GetPin(iPin);
+    if (*ppPin == NULL) {
+        return E_OUTOFMEMORY;  // probably.  There's no pin anyway.
+    }
+    (*ppPin)->AddRef();        // hand the caller its own reference
+    return NOERROR;
+}
+
+
+// override these two functions if you want to inform something
+// about entry to or exit from streaming state.
+
+HRESULT
+CTransformFilter::StartStreaming()
+{
+    return NOERROR;
+}
+
+
+HRESULT
+CTransformFilter::StopStreaming()
+{
+    return NOERROR;
+}
+
+
+// override this to grab extra interfaces on connection
+
+HRESULT
+CTransformFilter::CheckConnect(PIN_DIRECTION dir,IPin *pPin)
+{
+    UNREFERENCED_PARAMETER(dir);
+    UNREFERENCED_PARAMETER(pPin);
+    return NOERROR;
+}
+
+
+// place holder to allow derived classes to release any extra interfaces
+
+HRESULT
+CTransformFilter::BreakConnect(PIN_DIRECTION dir)
+{
+    UNREFERENCED_PARAMETER(dir);
+    return NOERROR;
+}
+
+
+// Let derived classes know about connection completion
+
+HRESULT
+CTransformFilter::CompleteConnect(PIN_DIRECTION direction,IPin *pReceivePin)
+{
+    UNREFERENCED_PARAMETER(direction);
+    UNREFERENCED_PARAMETER(pReceivePin);
+    return NOERROR;
+}
+
+
+// override this to know when the media type is really set
+
+HRESULT
+CTransformFilter::SetMediaType(PIN_DIRECTION direction,const CMediaType *pmt)
+{
+    UNREFERENCED_PARAMETER(direction);
+    UNREFERENCED_PARAMETER(pmt);
+    return NOERROR;
+}
+
+
+// Set up our output sample: fetch a buffer and copy the input's properties
+HRESULT
+CTransformFilter::InitializeOutputSample(IMediaSample *pSample, IMediaSample **ppOutSample)
+{
+    IMediaSample *pOutSample = NULL;    // defined value even if GetBuffer fails
+
+    // default - times are the same
+
+    AM_SAMPLE2_PROPERTIES * const pProps = m_pInput->SampleProps();
+    DWORD dwFlags = m_bSampleSkipped ? AM_GBF_PREVFRAMESKIPPED : 0;
+
+    // This will prevent the image renderer from switching us to DirectDraw
+    // when we can't do it without skipping frames because we're not on a
+    // keyframe.  If it really has to switch us, it still will, but then we
+    // will have to wait for the next keyframe
+    if (!(pProps->dwSampleFlags & AM_SAMPLE_SPLICEPOINT)) {
+        dwFlags |= AM_GBF_NOTASYNCPOINT;
+    }
+
+    ASSERT(m_pOutput->m_pAllocator != NULL);
+    HRESULT hr = m_pOutput->m_pAllocator->GetBuffer(
+             &pOutSample
+             , pProps->dwSampleFlags & AM_SAMPLE_TIMEVALID ?
+                   &pProps->tStart : NULL
+             , pProps->dwSampleFlags & AM_SAMPLE_STOPVALID ?
+                   &pProps->tStop : NULL
+             , dwFlags
+         );
+    *ppOutSample = pOutSample;
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    ASSERT(pOutSample);
+    IMediaSample2 *pOutSample2;
+    if (SUCCEEDED(pOutSample->QueryInterface(IID_IMediaSample2,
+                                             (void **)&pOutSample2))) {
+        /*  Modify it */
+        AM_SAMPLE2_PROPERTIES OutProps;
+        EXECUTE_ASSERT(SUCCEEDED(pOutSample2->GetProperties(
+            FIELD_OFFSET(AM_SAMPLE2_PROPERTIES, tStart), (PBYTE)&OutProps)
+        ));
+        OutProps.dwTypeSpecificFlags = pProps->dwTypeSpecificFlags;
+        OutProps.dwSampleFlags =
+            (OutProps.dwSampleFlags & AM_SAMPLE_TYPECHANGED) |
+            (pProps->dwSampleFlags & ~AM_SAMPLE_TYPECHANGED);
+        OutProps.tStart = pProps->tStart;
+        OutProps.tStop  = pProps->tStop;
+        OutProps.cbData = FIELD_OFFSET(AM_SAMPLE2_PROPERTIES, dwStreamId);
+        hr = pOutSample2->SetProperties(
+            FIELD_OFFSET(AM_SAMPLE2_PROPERTIES, dwStreamId),
+            (PBYTE)&OutProps
+        );
+        if (pProps->dwSampleFlags & AM_SAMPLE_DATADISCONTINUITY) {
+            m_bSampleSkipped = FALSE;
+        }
+        pOutSample2->Release();
+    } else {
+        if (pProps->dwSampleFlags & AM_SAMPLE_TIMEVALID) {
+            pOutSample->SetTime(&pProps->tStart,
+                                &pProps->tStop);
+        }
+        if (pProps->dwSampleFlags & AM_SAMPLE_SPLICEPOINT) {
+            pOutSample->SetSyncPoint(TRUE);
+        }
+        if (pProps->dwSampleFlags & AM_SAMPLE_DATADISCONTINUITY) {
+            pOutSample->SetDiscontinuity(TRUE);
+            m_bSampleSkipped = FALSE;
+        }
+        // Copy the media times
+
+        LONGLONG MediaStart, MediaEnd;
+        if (pSample->GetMediaTime(&MediaStart,&MediaEnd) == NOERROR) {
+            pOutSample->SetMediaTime(&MediaStart,&MediaEnd);
+        }
+    }
+    return S_OK;
+}
+
+// override this to customize the transform process
+
+HRESULT
+CTransformFilter::Receive(IMediaSample *pSample)
+{
+    // If no output to deliver to then no point sending us data
+    ASSERT(pSample);
+    ASSERT(m_pOutput != NULL);
+
+    /*  Check for other streams and pass them on */
+    AM_SAMPLE2_PROPERTIES * const pProps = m_pInput->SampleProps();
+    if (pProps->dwStreamId != AM_STREAM_MEDIA) {
+        return m_pOutput->m_pInputPin->Receive(pSample);
+    }
+
+    HRESULT hr;
+    IMediaSample * pOutSample;
+
+    // Set up the output sample
+    hr = InitializeOutputSample(pSample, &pOutSample);
+
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // Start timing the transform (if PERF is defined)
+    MSR_START(m_idTransform);
+
+    // have the derived class transform the data
+
+    hr = Transform(pSample, pOutSample);
+
+    // Stop the clock and log it (if PERF is defined)
+    MSR_STOP(m_idTransform);
+
+    if (FAILED(hr)) {
+        DbgLog((LOG_TRACE,1,TEXT("Error from transform")));
+    } else {
+        // the Transform() function can return S_FALSE to indicate that the
+        // sample should not be delivered; we only deliver the sample if it's
+        // really S_OK (same as NOERROR, of course.)
+        if (hr == NOERROR) {
+            hr = m_pOutput->m_pInputPin->Receive(pOutSample);
+            m_bSampleSkipped = FALSE;   // last thing no longer dropped
+        } else {
+            // S_FALSE returned from Transform is a PRIVATE agreement
+            // We should return NOERROR from Receive() in this case because returning S_FALSE
+            // from Receive() means that this is the end of the stream and no more data should
+            // be sent.
+            if (S_FALSE == hr) {
+
+                //  Release the sample before calling notify to avoid
+                //  deadlocks if the sample holds a lock on the system
+                //  such as DirectDraw buffers do
+                pOutSample->Release();
+                m_bSampleSkipped = TRUE;
+                if (!m_bQualityChanged) {
+                    NotifyEvent(EC_QUALITY_CHANGE,0,0);
+                    m_bQualityChanged = TRUE;
+                }
+                return NOERROR;
+            }
+        }
+    }
+
+    // release the output buffer. If the connected pin still needs it,
+    // it will have addrefed it itself.
+    pOutSample->Release();
+
+    return hr;
+}
+
+
+// Return S_FALSE to mean "pass the note on upstream"
+// Return NOERROR (Same as S_OK)
+// to mean "I've done something about it, don't pass it on"
+HRESULT CTransformFilter::AlterQuality(Quality q)
+{
+    UNREFERENCED_PARAMETER(q);
+    return S_FALSE;
+}
+
+
+// EndOfStream received. Default behaviour is to deliver straight
+// downstream, since we have no queued data. If you overrode Receive
+// and have queue data, then you need to handle this and deliver EOS after
+// all queued data is sent
+HRESULT
+CTransformFilter::EndOfStream(void)
+{
+    HRESULT hr = NOERROR;
+    if (m_pOutput != NULL) {
+        hr = m_pOutput->DeliverEndOfStream();
+    }
+
+    return hr;
+}
+
+
+// enter flush state. Receives already blocked
+// must override this if you have queued data or a worker thread
+HRESULT
+CTransformFilter::BeginFlush(void)
+{
+    HRESULT hr = NOERROR;
+    if (m_pOutput != NULL) {
+	// block receives -- done by caller (CBaseInputPin::BeginFlush)
+
+	// discard queued data -- we have no queued data
+
+	// free anyone blocked on receive - not possible in this filter
+
+	// call downstream
+	hr = m_pOutput->DeliverBeginFlush();
+    }
+    return hr;
+}
+
+
+// leave flush state. must override this if you have queued data
+// or a worker thread
+HRESULT
+CTransformFilter::EndFlush(void)
+{
+    // sync with pushing thread -- we have no worker thread
+    // ensure no more data to go downstream -- we have no queued data
+
+    // call EndFlush on downstream pins.  A missing output pin is still a
+    // logic error (hence the ASSERT), but retail builds now return quietly
+    // as BeginFlush and EndOfStream do rather than dereference NULL.
+    // The caller (the input pin's method) will unblock Receives.
+    ASSERT (m_pOutput != NULL);
+    return (m_pOutput != NULL) ? m_pOutput->DeliverEndFlush() : NOERROR;
+}
+
+
+// override these so that the derived filter can catch them
+
+STDMETHODIMP
+CTransformFilter::Stop()
+{
+    CAutoLock lck1(&m_csFilter);
+    if (m_State == State_Stopped) {
+        return NOERROR;
+    }
+
+    // Succeed the Stop if we are not completely connected
+
+    ASSERT(m_pInput == NULL || m_pOutput != NULL);
+    if (m_pInput == NULL || m_pInput->IsConnected() == FALSE ||
+        m_pOutput->IsConnected() == FALSE) {
+                m_State = State_Stopped;
+                m_bEOSDelivered = FALSE;
+                return NOERROR;
+    }
+
+    ASSERT(m_pInput);
+    ASSERT(m_pOutput);
+
+    // decommit the input pin before locking or we can deadlock
+    m_pInput->Inactive();
+
+    // synchronize with Receive calls
+
+    CAutoLock lck2(&m_csReceive);
+    m_pOutput->Inactive();
+
+    // allow a class derived from CTransformFilter
+    // to know about starting and stopping streaming
+
+    HRESULT hr = StopStreaming();
+    if (SUCCEEDED(hr)) {
+	// complete the state transition
+	m_State = State_Stopped;
+	m_bEOSDelivered = FALSE;
+    }
+    return hr;
+}
+
+
+STDMETHODIMP
+CTransformFilter::Pause()
+{
+    CAutoLock lck(&m_csFilter);
+    HRESULT hr = NOERROR;
+
+    if (m_State == State_Paused) {
+        // (This space left deliberately blank)
+    }
+
+    // If we have no input pin or it isn't yet connected then when we are
+    // asked to pause we deliver an end of stream to the downstream filter.
+    // This makes sure that it doesn't sit there forever waiting for
+    // samples which we cannot ever deliver without an input connection.
+
+    else if (m_pInput == NULL || m_pInput->IsConnected() == FALSE) {
+        if (m_pOutput && m_bEOSDelivered == FALSE) {
+            m_pOutput->DeliverEndOfStream();
+            m_bEOSDelivered = TRUE;
+        }
+        m_State = State_Paused;
+    }
+
+    // We may have an input connection but no output connection
+    // However, if we have an input pin we do have an output pin
+
+    else if (m_pOutput->IsConnected() == FALSE) {
+        m_State = State_Paused;
+    }
+
+    else {
+	if (m_State == State_Stopped) {
+	    // allow a class derived from CTransformFilter
+	    // to know about starting and stopping streaming
+            CAutoLock lck2(&m_csReceive);
+	    hr = StartStreaming();
+	}
+	if (SUCCEEDED(hr)) {
+	    hr = CBaseFilter::Pause();
+	}
+    }
+
+    m_bSampleSkipped = FALSE;
+    m_bQualityChanged = FALSE;
+    return hr;
+}
+
+HRESULT
+CTransformFilter::NewSegment(
+    REFERENCE_TIME tStart,
+    REFERENCE_TIME tStop,
+    double dRate)
+{
+    if (m_pOutput != NULL) {
+        return m_pOutput->DeliverNewSegment(tStart, tStop, dRate);
+    }
+    return S_OK;
+}
+
+// Check streaming status
+HRESULT
+CTransformInputPin::CheckStreaming()
+{
+    // Nothing can stream unless the output side is connected
+    ASSERT(m_pTransformFilter->m_pOutput != NULL);
+    if (!m_pTransformFilter->m_pOutput->IsConnected()) {
+        return VFW_E_NOT_CONNECTED;
+    }
+
+    //  Shouldn't be able to get any data if we're not connected!
+    ASSERT(IsConnected());
+
+    //  The checks below run in this order: flushing, stopped, run-time error
+    if (m_bFlushing) {
+        return S_FALSE;
+    }
+    if (IsStopped()) {
+        return VFW_E_WRONG_STATE;
+    }
+    if (m_bRunTimeError) {
+        return VFW_E_RUNTIME_ERROR;
+    }
+    return S_OK;
+}
+
+
+// =================================================================
+// Implements the CTransformInputPin class
+// =================================================================
+
+
+// constructor
+
+CTransformInputPin::CTransformInputPin(
+    TCHAR *pObjectName,
+    CTransformFilter *pTransformFilter,
+    HRESULT * phr,
+    LPCWSTR pName)
+    : CBaseInputPin(pObjectName, pTransformFilter, &pTransformFilter->m_csFilter, phr, pName)
+{
+    DbgLog((LOG_TRACE,2,TEXT("CTransformInputPin::CTransformInputPin")));
+    m_pTransformFilter = pTransformFilter;
+}
+
+#ifdef UNICODE
+CTransformInputPin::CTransformInputPin(
+    CHAR *pObjectName,
+    CTransformFilter *pTransformFilter,
+    HRESULT * phr,
+    LPCWSTR pName)
+    : CBaseInputPin(pObjectName, pTransformFilter, &pTransformFilter->m_csFilter, phr, pName)
+{
+    DbgLog((LOG_TRACE,2,TEXT("CTransformInputPin::CTransformInputPin")));
+    m_pTransformFilter = pTransformFilter;
+}
+#endif
+
+// provides derived filter a chance to grab extra interfaces
+
+HRESULT
+CTransformInputPin::CheckConnect(IPin *pPin)
+{
+    HRESULT hr = m_pTransformFilter->CheckConnect(PINDIR_INPUT,pPin);
+    if (FAILED(hr)) {
+    	return hr;
+    }
+    return CBaseInputPin::CheckConnect(pPin);
+}
+
+
+// provides derived filter a chance to release its extra interfaces
+
+HRESULT
+CTransformInputPin::BreakConnect()
+{
+    //  Can't disconnect unless stopped
+    ASSERT(IsStopped());
+    m_pTransformFilter->BreakConnect(PINDIR_INPUT);
+    return CBaseInputPin::BreakConnect();
+}
+
+
+// Let derived class know when the input pin is connected
+
+HRESULT
+CTransformInputPin::CompleteConnect(IPin *pReceivePin)
+{
+    HRESULT hr = m_pTransformFilter->CompleteConnect(PINDIR_INPUT,pReceivePin);
+    if (FAILED(hr)) {
+        return hr;
+    }
+    return CBaseInputPin::CompleteConnect(pReceivePin);
+}
+
+
+// check that we can support a given media type
+
+HRESULT
+CTransformInputPin::CheckMediaType(const CMediaType* pmt)
+{
+    // The filter must accept the type as an input format; anything other
+    // than S_OK is handed straight back to the caller.
+    const HRESULT hrInput = m_pTransformFilter->CheckInputType(pmt);
+    if (hrInput != S_OK) {
+        return hrInput;
+    }
+
+    // With no connected output pin the input check alone decides
+    if ((m_pTransformFilter->m_pOutput == NULL) ||
+        !m_pTransformFilter->m_pOutput->IsConnected()) {
+        return hrInput;
+    }
+
+    // if the output pin is still connected, then we have
+    // to check the transform not just the input format
+    return m_pTransformFilter->CheckTransform(
+               pmt,
+               &m_pTransformFilter->m_pOutput->CurrentMediaType());
+}
+
+
+// set the media type for this connection
+
+HRESULT
+CTransformInputPin::SetMediaType(const CMediaType* mtIn)
+{
+    // Set the base class media type (should always succeed)
+    HRESULT hr = CBasePin::SetMediaType(mtIn);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // check the transform can be done (should always succeed)
+    ASSERT(SUCCEEDED(m_pTransformFilter->CheckInputType(mtIn)));
+
+    return m_pTransformFilter->SetMediaType(PINDIR_INPUT,mtIn);
+}
+
+
+// =================================================================
+// Implements IMemInputPin interface
+// =================================================================
+
+
+// provide EndOfStream that passes straight downstream
+// (there is no queued data)
+STDMETHODIMP
+CTransformInputPin::EndOfStream(void)
+{
+    CAutoLock lck(&m_pTransformFilter->m_csReceive);
+    HRESULT hr = CheckStreaming();
+    if (S_OK == hr) {
+       hr = m_pTransformFilter->EndOfStream();
+    }
+    return hr;
+}
+
+
+// enter flushing state. Call default handler to block Receives, then
+// pass to overridable method in filter
+STDMETHODIMP
+CTransformInputPin::BeginFlush(void)
+{
+    CAutoLock lck(&m_pTransformFilter->m_csFilter);
+    //  Are we actually doing anything?
+    ASSERT(m_pTransformFilter->m_pOutput != NULL);
+    if (!IsConnected() ||
+        !m_pTransformFilter->m_pOutput->IsConnected()) {
+        return VFW_E_NOT_CONNECTED;
+    }
+    HRESULT hr = CBaseInputPin::BeginFlush();
+    if (FAILED(hr)) {
+    	return hr;
+    }
+
+    return m_pTransformFilter->BeginFlush();
+}
+
+
+// leave flushing state.
+// Pass to overridable method in filter, then call base class
+// to unblock receives (finally)
+STDMETHODIMP
+CTransformInputPin::EndFlush(void)
+{
+    CAutoLock lck(&m_pTransformFilter->m_csFilter);
+    //  Are we actually doing anything?
+    ASSERT(m_pTransformFilter->m_pOutput != NULL);
+    if (!IsConnected() ||
+        !m_pTransformFilter->m_pOutput->IsConnected()) {
+        return VFW_E_NOT_CONNECTED;
+    }
+
+    HRESULT hr = m_pTransformFilter->EndFlush();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    return CBaseInputPin::EndFlush();
+}
+
+
+// here's the next block of data from the stream.
+// AddRef it yourself if you need to hold it beyond the end
+// of this call.
+
+HRESULT
+CTransformInputPin::Receive(IMediaSample * pSample)
+{
+    HRESULT hr;
+    CAutoLock lck(&m_pTransformFilter->m_csReceive);
+    ASSERT(pSample);
+
+    // check all is well with the base class
+    hr = CBaseInputPin::Receive(pSample);
+    if (S_OK == hr) {
+        hr = m_pTransformFilter->Receive(pSample);
+    }
+    return hr;
+}
+
+
+
+
+// override to pass downstream
+STDMETHODIMP
+CTransformInputPin::NewSegment(
+    REFERENCE_TIME tStart,
+    REFERENCE_TIME tStop,
+    double dRate)
+{
+    //  Save the values in the pin
+    CBasePin::NewSegment(tStart, tStop, dRate);
+    return m_pTransformFilter->NewSegment(tStart, tStop, dRate);
+}
+
+
+
+
+// =================================================================
+// Implements the CTransformOutputPin class
+// =================================================================
+
+
+// constructor
+
+CTransformOutputPin::CTransformOutputPin(
+    TCHAR *pObjectName,
+    CTransformFilter *pTransformFilter,
+    HRESULT * phr,
+    LPCWSTR pPinName)
+    : CBaseOutputPin(pObjectName, pTransformFilter, &pTransformFilter->m_csFilter, phr, pPinName),
+      m_pPosition(NULL)
+{
+    DbgLog((LOG_TRACE,2,TEXT("CTransformOutputPin::CTransformOutputPin")));
+    m_pTransformFilter = pTransformFilter;
+
+}
+
+#ifdef UNICODE
+CTransformOutputPin::CTransformOutputPin(
+    CHAR *pObjectName,
+    CTransformFilter *pTransformFilter,
+    HRESULT * phr,
+    LPCWSTR pPinName)
+    : CBaseOutputPin(pObjectName, pTransformFilter, &pTransformFilter->m_csFilter, phr, pPinName),
+      m_pPosition(NULL)
+{
+    DbgLog((LOG_TRACE,2,TEXT("CTransformOutputPin::CTransformOutputPin")));
+    m_pTransformFilter = pTransformFilter;
+
+}
+#endif
+
+// destructor
+
+CTransformOutputPin::~CTransformOutputPin()
+{
+    DbgLog((LOG_TRACE,2,TEXT("CTransformOutputPin::~CTransformOutputPin")));
+
+    if (m_pPosition) m_pPosition->Release();
+}
+
+
+// overridden to expose IMediaPosition and IMediaSeeking control interfaces
+
+STDMETHODIMP
+CTransformOutputPin::NonDelegatingQueryInterface(REFIID riid, void **ppv)
+{
+    CheckPointer(ppv,E_POINTER);
+    ValidateReadWritePtr(ppv,sizeof(PVOID));
+    *ppv = NULL;
+
+    // Everything except the two seeking interfaces belongs to the base pin
+    if (!(riid == IID_IMediaPosition || riid == IID_IMediaSeeking)) {
+        return CBaseOutputPin::NonDelegatingQueryInterface(riid, ppv);
+    }
+
+    // we should have an input pin by now
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+
+    // Build the pass-through helper on first use; m_pPosition keeps it and
+    // later queries reuse the same object.
+    if (m_pPosition == NULL) {
+        const HRESULT hrCreate = CreatePosPassThru(
+                                     GetOwner(),
+                                     FALSE,
+                                     (IPin *)m_pTransformFilter->m_pInput,
+                                     &m_pPosition);
+        if (FAILED(hrCreate)) {
+            return hrCreate;
+        }
+    }
+    return m_pPosition->QueryInterface(riid, ppv);
+}
+
+
+// provides derived filter a chance to grab extra interfaces
+
+HRESULT
+CTransformOutputPin::CheckConnect(IPin *pPin)
+{
+    // we should have an input connection first
+
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+    if ((m_pTransformFilter->m_pInput->IsConnected() == FALSE)) {
+	    return E_UNEXPECTED;
+    }
+
+    HRESULT hr = m_pTransformFilter->CheckConnect(PINDIR_OUTPUT,pPin);
+    if (FAILED(hr)) {
+	    return hr;
+    }
+    return CBaseOutputPin::CheckConnect(pPin);
+}
+
+
+// provides derived filter a chance to release its extra interfaces
+
+HRESULT
+CTransformOutputPin::BreakConnect()
+{
+    //  Can't disconnect unless stopped
+    ASSERT(IsStopped());
+    m_pTransformFilter->BreakConnect(PINDIR_OUTPUT);
+    return CBaseOutputPin::BreakConnect();
+}
+
+
+// Let derived class know when the output pin is connected
+
+HRESULT
+CTransformOutputPin::CompleteConnect(IPin *pReceivePin)
+{
+    HRESULT hr = m_pTransformFilter->CompleteConnect(PINDIR_OUTPUT,pReceivePin);
+    if (FAILED(hr)) {
+        return hr;
+    }
+    return CBaseOutputPin::CompleteConnect(pReceivePin);
+}
+
+
+// check a given transform - must have selected input type first
+
+HRESULT
+CTransformOutputPin::CheckMediaType(const CMediaType* pmtOut)
+{
+    // must have selected input first
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+    if ((m_pTransformFilter->m_pInput->IsConnected() == FALSE)) {
+	        return E_INVALIDARG;
+    }
+
+    return m_pTransformFilter->CheckTransform(
+				    &m_pTransformFilter->m_pInput->CurrentMediaType(),
+				    pmtOut);
+}
+
+
+// called after we have agreed a media type to actually set it in which case
+// we run the CheckTransform function to get the output format type again
+
+HRESULT
+CTransformOutputPin::SetMediaType(const CMediaType* pmtOut)
+{
+    HRESULT hr = NOERROR;
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+
+    ASSERT(m_pTransformFilter->m_pInput->CurrentMediaType().IsValid());
+
+    // Set the base class media type (should always succeed)
+    hr = CBasePin::SetMediaType(pmtOut);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+#ifdef DEBUG
+    if (FAILED(m_pTransformFilter->CheckTransform(&m_pTransformFilter->
+					m_pInput->CurrentMediaType(),pmtOut))) {
+	DbgLog((LOG_ERROR,0,TEXT("*** This filter is accepting an output media type")));
+	DbgLog((LOG_ERROR,0,TEXT("    that it can't currently transform to.  I hope")));
+	DbgLog((LOG_ERROR,0,TEXT("    it's smart enough to reconnect its input.")));
+    }
+#endif
+
+    return m_pTransformFilter->SetMediaType(PINDIR_OUTPUT,pmtOut);
+}
+
+
+// pass the buffer size decision through to the main transform class
+
+HRESULT
+CTransformOutputPin::DecideBufferSize(
+    IMemAllocator * pAllocator,
+    ALLOCATOR_PROPERTIES* pProp)
+{
+    return m_pTransformFilter->DecideBufferSize(pAllocator, pProp);
+}
+
+
+
+// return a specific media type indexed by iPosition
+
+HRESULT
+CTransformOutputPin::GetMediaType(
+    int iPosition,
+    CMediaType *pMediaType)
+{
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+
+    //  We don't have any media types if our input is not connected
+
+    if (m_pTransformFilter->m_pInput->IsConnected()) {
+        return m_pTransformFilter->GetMediaType(iPosition,pMediaType);
+    } else {
+        return VFW_S_NO_MORE_ITEMS;
+    }
+}
+
+
+// Override this if you can do something constructive to act on the
+// quality message.  Consider passing it upstream as well
+
+// Pass the quality message on upstream.
+
+STDMETHODIMP
+CTransformOutputPin::Notify(IBaseFilter * pSender, Quality q)
+{
+    UNREFERENCED_PARAMETER(pSender);
+    ValidateReadPtr(pSender,sizeof(IBaseFilter));
+
+    // First see if we want to handle this ourselves
+    HRESULT hr = m_pTransformFilter->AlterQuality(q);
+    if (hr!=S_FALSE) {
+        return hr;        // either S_OK or a failure
+    }
+
+    // S_FALSE means we pass the message on.
+    // Find the quality sink for our input pin and send it there
+
+    ASSERT(m_pTransformFilter->m_pInput != NULL);
+
+    return m_pTransformFilter->m_pInput->PassNotify(q);
+
+} // Notify
+
+
+// the following removes a very large number of level 4 warnings from the microsoft
+// compiler output, which are not useful at all in this case.
+#pragma warning(disable:4514)
diff --git a/plugins/GSdx_legacy/baseclasses/transfrm.h b/plugins/GSdx_legacy/baseclasses/transfrm.h
new file mode 100644
index 0000000000..695e009bd7
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/transfrm.h
@@ -0,0 +1,304 @@
+//------------------------------------------------------------------------------
+// File: Transfrm.h
+//
+// Desc: DirectShow base classes - defines classes from which simple
+//       transform codecs may be derived.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// It assumes the codec has one input and one output stream, and has no
+// interest in memory management, interface negotiation or anything else.
+//
+// derive your class from this, and supply Transform and the media type/format
+// negotiation functions. Implement that class, compile and link and
+// you're done.
+
+
+#ifndef __TRANSFRM__
+#define __TRANSFRM__
+
+// ======================================================================
+// This is the com object that represents a simple transform filter. It
+// supports IBaseFilter, IMediaFilter and two pins through nested interfaces
+// ======================================================================
+
+class CTransformFilter;
+
+// ==================================================
+// CTransformInputPin - implements the input pin of a CTransformFilter.
+// The pin keeps a pointer to its filter and declares the filter a friend.
+// ==================================================
+class CTransformInputPin : public CBaseInputPin
+{
+    friend class CTransformFilter;
+
+protected:
+    CTransformFilter *m_pTransformFilter;   // the filter this pin belongs to
+
+
+public:
+    // pObjectName: object name, phr: receives a result code, pName: pin name
+    CTransformInputPin(
+        TCHAR *pObjectName,
+        CTransformFilter *pTransformFilter,
+        HRESULT * phr,
+        LPCWSTR pName);
+#ifdef UNICODE
+    CTransformInputPin(
+        char *pObjectName,
+        CTransformFilter *pTransformFilter,
+        HRESULT * phr,
+        LPCWSTR pName);
+#endif
+    // pin identifier: hands the literal L"In" to AMGetWideString to fill in *Id
+    STDMETHODIMP QueryId(LPWSTR * Id)
+    {
+        return AMGetWideString(L"In", Id);
+    }
+
+    // Grab and release extra interfaces if required
+
+    HRESULT CheckConnect(IPin *pPin);
+    HRESULT BreakConnect();
+    HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // check that we can support this input type (mtIn)
+    HRESULT CheckMediaType(const CMediaType* mtIn);
+
+    // set the connection media type
+    HRESULT SetMediaType(const CMediaType* mt);
+
+    // --- IMemInputPin -----
+
+    // here's the next block of data from the stream.
+    // AddRef it yourself if you need to hold it beyond the end
+    // of this call.
+    STDMETHODIMP Receive(IMediaSample * pSample);
+
+    // provide EndOfStream that passes straight downstream
+    // (there is no queued data)
+    STDMETHODIMP EndOfStream(void);
+
+    // passes it to CTransformFilter::BeginFlush
+    STDMETHODIMP BeginFlush(void);
+
+    // passes it to CTransformFilter::EndFlush
+    STDMETHODIMP EndFlush(void);
+    // new segment notification; same parameters as CTransformFilter::NewSegment
+    STDMETHODIMP NewSegment(
+                        REFERENCE_TIME tStart,
+                        REFERENCE_TIME tStop,
+                        double dRate);
+
+    // Check if it's OK to process samples
+    virtual HRESULT CheckStreaming();
+
+    // Media type: returns a non-const reference to the pin's m_mt member
+public:
+    CMediaType& CurrentMediaType() { return m_mt; };
+
+};
+
+// ==================================================
+// CTransformOutputPin - implements the output pin of a CTransformFilter.
+// The pin keeps a pointer to its filter and declares the filter a friend.
+// ==================================================
+class CTransformOutputPin : public CBaseOutputPin
+{
+    friend class CTransformFilter;
+
+protected:
+    CTransformFilter *m_pTransformFilter;   // the filter this pin belongs to
+
+public:
+
+    // implement IMediaPosition by passing upstream
+    IUnknown * m_pPosition;
+    // pObjectName: object name, phr: receives a result code, pName: pin name
+    CTransformOutputPin(
+        TCHAR *pObjectName,
+        CTransformFilter *pTransformFilter,
+        HRESULT * phr,
+        LPCWSTR pName);
+#ifdef UNICODE
+    CTransformOutputPin(
+        CHAR *pObjectName,
+        CTransformFilter *pTransformFilter,
+        HRESULT * phr,
+        LPCWSTR pName);
+#endif
+    ~CTransformOutputPin();
+
+    // override to expose IMediaPosition
+    STDMETHODIMP NonDelegatingQueryInterface(REFIID riid, void **ppv);
+
+    // --- CBaseOutputPin ------------
+    // pin identifier: hands the literal L"Out" to AMGetWideString to fill in *Id
+    STDMETHODIMP QueryId(LPWSTR * Id)
+    {
+        return AMGetWideString(L"Out", Id);
+    }
+
+    // Grab and release extra interfaces if required
+
+    HRESULT CheckConnect(IPin *pPin);
+    HRESULT BreakConnect();
+    HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // check that we can support this output type
+    HRESULT CheckMediaType(const CMediaType* mtOut);
+
+    // set the connection media type
+    HRESULT SetMediaType(const CMediaType *pmt);
+
+    // called from CBaseOutputPin during connection to ask for
+    // the count and size of buffers we need (forwarded to the filter).
+    HRESULT DecideBufferSize(
+                IMemAllocator * pAlloc,
+                ALLOCATOR_PROPERTIES *pProp);
+
+    // returns the preferred formats for a pin (none until the filter's input pin is connected)
+    HRESULT GetMediaType(int iPosition,CMediaType *pMediaType);
+
+    // inherited from IQualityControl via CBasePin
+    STDMETHODIMP Notify(IBaseFilter * pSender, Quality q);
+
+    // Media type: returns a non-const reference to the pin's m_mt member
+public:
+    CMediaType& CurrentMediaType() { return m_mt; };
+};
+
+
+class AM_NOVTABLE CTransformFilter : public CBaseFilter
+{
+    // Base class for a transform filter with one input pin (m_pInput) and one output pin (m_pOutput).
+public:
+
+    // map getpin/getpincount for base enum of pins to owner
+    // override this to return more specialised pin objects
+
+    virtual int GetPinCount();
+    virtual CBasePin * GetPin(int n);
+    STDMETHODIMP FindPin(LPCWSTR Id, IPin **ppPin);
+
+    // override state changes to allow derived transform filter
+    // to control streaming start/stop
+    STDMETHODIMP Stop();
+    STDMETHODIMP Pause();
+
+public:
+    // constructor arguments: a name, an LPUNKNOWN and the filter's class id
+    CTransformFilter(TCHAR *, LPUNKNOWN, REFCLSID clsid);
+#ifdef UNICODE
+    CTransformFilter(CHAR *, LPUNKNOWN, REFCLSID clsid);
+#endif
+    ~CTransformFilter();
+
+    // =================================================================
+    // ----- override these bits ---------------------------------------
+    // =================================================================
+
+    // These must be supplied in a derived class
+    // (note: Transform is not declared PURE here, unlike the four methods after it)
+    virtual HRESULT Transform(IMediaSample * pIn, IMediaSample *pOut);
+
+    // check if you can support mtIn
+    virtual HRESULT CheckInputType(const CMediaType* mtIn) PURE;
+
+    // check if you can support the transform from this input to this output
+    virtual HRESULT CheckTransform(const CMediaType* mtIn, const CMediaType* mtOut) PURE;
+
+    // this goes in the factory template table to create new instances
+    // static CCOMObject * CreateInstance(LPUNKNOWN, HRESULT *);
+
+    // call the SetProperties function with appropriate arguments
+    virtual HRESULT DecideBufferSize(
+                        IMemAllocator * pAllocator,
+                        ALLOCATOR_PROPERTIES *pprop) PURE;
+
+    // override to suggest OUTPUT pin media types
+    virtual HRESULT GetMediaType(int iPosition, CMediaType *pMediaType) PURE;
+
+
+
+    // =================================================================
+    // ----- Optional Override Methods           -----------------------
+    // =================================================================
+
+    // you can also override these if you want to know about streaming
+    virtual HRESULT StartStreaming();
+    virtual HRESULT StopStreaming();
+
+    // override if you can do anything constructive with quality notifications
+    virtual HRESULT AlterQuality(Quality q);
+
+    // override this to know when the media type is actually set
+    virtual HRESULT SetMediaType(PIN_DIRECTION direction,const CMediaType *pmt);
+
+    // chance to grab extra interfaces on connection
+    virtual HRESULT CheckConnect(PIN_DIRECTION dir,IPin *pPin);
+    virtual HRESULT BreakConnect(PIN_DIRECTION dir);
+    virtual HRESULT CompleteConnect(PIN_DIRECTION direction,IPin *pReceivePin);
+
+    // chance to customize the transform process
+    virtual HRESULT Receive(IMediaSample *pSample);
+
+    // Standard setup for output sample
+    HRESULT InitializeOutputSample(IMediaSample *pSample, IMediaSample **ppOutSample);
+
+    // if you override Receive, you may need to override these three too
+    virtual HRESULT EndOfStream(void);
+    virtual HRESULT BeginFlush(void);
+    virtual HRESULT EndFlush(void);
+    virtual HRESULT NewSegment(
+                        REFERENCE_TIME tStart,
+                        REFERENCE_TIME tStop,
+                        double dRate);
+
+#ifdef PERF
+    // Override to register performance measurement with a less generic string
+    // You should do this to avoid confusion with other filters
+    virtual void RegisterPerfId()
+         {m_idTransform = MSR_REGISTER(TEXT("Transform"));}
+#endif // PERF
+
+
+// implementation details
+
+protected:
+
+#ifdef PERF
+    int m_idTransform;                 // performance measuring id
+#endif
+    BOOL m_bEOSDelivered;              // have we sent EndOfStream
+    BOOL m_bSampleSkipped;             // Did we just skip a frame
+    BOOL m_bQualityChanged;            // Have we degraded?
+
+    // critical section protecting filter state.
+
+    CCritSec m_csFilter;
+
+    // critical section stopping state changes (ie Stop) while we're
+    // processing a sample.
+    //
+    // This critical section is held when processing
+    // events that occur on the receive thread - Receive() and EndOfStream().
+    //
+    // If you want to hold both m_csReceive and m_csFilter then grab
+    // m_csFilter FIRST - like CTransformFilter::Stop() does.
+
+    CCritSec m_csReceive;
+
+    // these hold our input and output pins
+    // (the pin classes are friends so that they can reach these members)
+    friend class CTransformInputPin;
+    friend class CTransformOutputPin;
+    CTransformInputPin *m_pInput;
+    CTransformOutputPin *m_pOutput;
+};
+
+#endif /* __TRANSFRM__ */
+
+
diff --git a/plugins/GSdx_legacy/baseclasses/transip.cpp b/plugins/GSdx_legacy/baseclasses/transip.cpp
new file mode 100644
index 0000000000..d58fd726dc
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/transip.cpp
@@ -0,0 +1,966 @@
+//------------------------------------------------------------------------------
+// File: TransIP.cpp
+//
+// Desc: DirectShow base classes - implements class for simple Transform-
+//       In-Place filters such as audio.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// How allocators are decided.
+//
+// An in-place transform tries to do its work in someone else's buffers.
+// It tries to persuade the filters on either side to use the same allocator
+// (and for that matter the same media type).  In desperation, if the downstream
+// filter refuses to supply an allocator and the upstream filter offers only
+// a read-only one then it will provide an allocator.
+// if the upstream filter insists on a read-only allocator then the transform
+// filter will (reluctantly) copy the data before transforming it.
+//
+// In order to pass an allocator through it needs to remember the one it got
+// from the first connection to pass it on to the second one.
+//
+// It is good if we can avoid insisting on a particular order of connection
+// (There is a precedent for insisting on the input
+// being connected first.  Insisting on the output being connected first is
+// not allowed.  That would break RenderFile.)
+//
+// The base pin classes (CBaseOutputPin and CBaseInputPin) both have a
+// m_pAllocator member which is used in places like
+// CBaseOutputPin::GetDeliveryBuffer and CBaseInputPin::Inactive.
+// To avoid lots of extra overriding, we should keep these happy
+// by using these pointers.
+//
+// When each pin is connected, it will set the corresponding m_pAllocator
+// and will have a single ref-count on that allocator.
+//
+// Refcounts are acquired by GetAllocator calls which return AddReffed
+// allocators and are released in one of:
+//     CBaseInputPin::Disconnect
+//     CBaseOutputPin::BreakConnect
+// In each case m_pAllocator is set to NULL after the release, so this
+// is the last chance to ever release it.  If there should ever be
+// multiple refcounts associated with the same pointer, this had better
+// be cleared up before that happens.  To avoid such problems, we'll
+// stick with one per pointer.
+
+
+
+// RECONNECTING and STATE CHANGES
+//
+// Each pin could be disconnected, connected with a read-only allocator,
+// connected with an upstream read/write allocator, connected with an
+// allocator from downstream or connected with its own allocator.
+// Five states for each pin gives a data space of 25 states.
+//
+// Notation:
+//
+// R/W == read/write
+// R-O == read-only
+//
+// <input pin state> <output pin state> <comments>
+//
+// 00 means an unconnected pin.
+// <- means using a R/W allocator from the upstream filter
+// <= means using a R-O allocator from an upstream filter
+// || means using our own (R/W) allocator.
+// -> means using a R/W allocator from a downstream filter
+//    (a R-O allocator from downstream is nonsense, it can't ever work).
+//
+//
+// That makes 25 possible states.  Some states are nonsense (two different
+// allocators from the same place).  These are just an artifact of the notation.
+//        <=  <-  Nonsense.
+//        <-  <=  Nonsense
+// Some states are illegal (the output pin never accepts a R-O allocator):
+//        00  <=  !! Error !!
+//        <=  <=  !! Error !!
+//        ||  <=  !! Error !!
+//        ->  <=  !! Error !!
+// Three states appear to be inaccessible:
+//        ->  ||  Inaccessible
+//        ||  ->  Inaccessible
+//        ||  <-  Inaccessible
+// Some states only ever occur as intermediates with a pending reconnect which
+// is guaranteed to finish in another state.
+//        ->  00  ?? unstable goes to || 00
+//        00  <-  ?? unstable goes to 00 ||
+//        ->  <-  ?? unstable goes to -> ->
+//        <-  ||  ?? unstable goes to <- <-
+//        <-  ->  ?? unstable goes to <- <-
+// And that leaves 11 possible resting states:
+// 1      00  00  Nothing connected.
+// 2      <-  00  Input pin connected.
+// 3      <=  00  Input pin connected using R-O allocator.
+// 4      ||  00  Needs several state changes to get here.
+// 5      00  ||  Output pin connected using our allocator
+// 6      00  ->  Downstream only connected
+// 7      ||  ||  Undesirable but can be forced upon us.
+// 8      <=  ||  Copy forced.  <=  -> is preferable
+// 9      <=  ->  OK - forced to copy.
+// 10     <-  <-  Transform in place (ideal)
+// 11     ->  ->  Transform in place (ideal)
+//
+// The object of the exercise is to ensure that we finish up in states
+// 10 or 11 whenever possible.  State 10 is only possible if the upstream
+// filter has a R/W allocator (the AVI splitter notoriously
+// doesn't) and state 11 is only possible if the downstream filter does
+// offer an allocator.
+//
+// The transition table (entries marked * go via a reconnect)
+//
+// There are 8 possible transitions:
+// A: Connect upstream to filter with R-O allocator that insists on using it.
+// B: Connect upstream to filter with R-O allocator but chooses not to use it.
+// C: Connect upstream to filter with R/W allocator and insists on using it.
+// D: Connect upstream to filter with R/W allocator but chooses not to use it.
+// E: Connect downstream to a filter that offers an allocator
+// F: Connect downstream to a filter that does not offer an allocator
+// G: disconnect upstream
+// H: Disconnect downstream
+//
+//            A      B      C      D      E      F      G      H
+//           ---------------------------------------------------------
+// 00  00 1 | 3      3      2      2      6      5      .      .      |1  00  00
+// <-  00 2 | .      .      .      .      *10/11 10     1      .      |2  <-  00
+// <=  00 3 | .      .      .      .      *9/11  *7/8   1      .      |3  <=  00
+// ||  00 4 | .      .      .      .      *8     *7     1      .      |4  ||  00
+// 00  || 5 | 8      7      *10    7      .      .      .      1      |5  00  ||
+// 00  -> 6 | 9      11     *10    11     .      .      .      1      |6  00  ->
+// ||  || 7 | .      .      .      .      .      .      5      4      |7  ||  ||
+// <=  || 8 | .      .      .      .      .      .      5      3      |8  <=  ||
+// <=  -> 9 | .      .      .      .      .      .      6      3      |9  <=  ->
+// <-  <- 10| .      .      .      .      .      .      *5/6   2      |10 <-  <-
+// ->  -> 11| .      .      .      .      .      .      6      *2/3   |11 ->  ->
+//           ---------------------------------------------------------
+//            A      B      C      D      E      F      G      H
+//
+// All these states are accessible without requiring any filter to
+// change its behaviour but not all transitions are accessible, for
+// instance a transition from state 4 to anywhere other than
+// state 8 requires that the upstream filter first offer a R-O allocator
+// and then change its mind and offer R/W.  This is NOT allowable - it
+// leads to things like the output pin getting a R/W allocator from
+// upstream and then the input pin being told it can only have a R-O one.
+// Note that you CAN change (say) the upstream filter for a different one, but
+// only as a disconnect / connect, not as a Reconnect.  (Exercise for
+// the reader is to see how you get into state 4).
+//
+// The reconnection stuff goes as follows (some of the cases shown here as
+// "no reconnect" may get one to finalise media type - an old story).
+// If there is a reconnect where it says "no reconnect" here then the
+// reconnection must not change the allocator choice.
+//
+// state 2: <- 00 transition E <- <- case C <- <- (no change)
+//                                   case D -> <- and then to -> ->
+//
+// state 2: <- 00 transition F <- <- (no reconnect)
+//
+// state 3: <= 00 transition E <= -> case A <= -> (no change)
+//                                   case B -> ->
+//                transition F <= || case A <= || (no change)
+//                                   case B || ||
+//
+// state 4: || 00 transition E || || case B -> || and then all cases to -> ->
+//                           F || || case B || || (no change)
+//
+// state 5: 00 || transition A <= || (no reconnect)
+//                           B || || (no reconnect)
+//                           C <- || all cases     <- <-
+//                           D || || (unfortunate, but upstream's choice)
+//
+// state 6: 00 -> transition A <= -> (no reconnect)
+//                           B -> -> (no reconnect)
+//                           C <- -> all cases <- <-
+//                           D -> -> (no reconnect)
+//
+// state 10:<- <- transition G 00 <- case E 00 ->
+//                                   case F 00 ||
+//
+// state 11:-> -> transition H -> 00 case A <= 00 (schizo)
+//                                   case B <= 00
+//                                   case C <- 00 (schizo)
+//                                   case D <- 00
+//
+// The Rules:
+// To sort out media types:
+// The input is reconnected
+//    if the input pin is connected and the output pin connects
+// The output is reconnected
+//    If the output pin is connected
+//    and the input pin connects to a different media type
+//
+// To sort out allocators:
+// The input is reconnected
+//    if the output disconnects and the input was using a downstream allocator
+// The output pin calls SetAllocator to pass on a new allocator
+//    if the output is connected and
+//       if the input disconnects and the output was using an upstream allocator
+//       if the input acquires an allocator different from the output one
+//          and that new allocator is not R-O
+//
+// Data is copied (i.e. call getbuffer and copy the data before transforming it)
+//    if the two allocators are different.
+
+
+
+// CHAINS of filters:
+//
+// We sit between two filters (call them A and Z).  We should finish up
+// with the same allocator on both of our pins and that should be the
+// same one that A and Z would have agreed on if we hadn't been in the
+// way.  Furthermore, it should not matter how many in-place transforms
+// are in the way.  Let B, C, D... be in-place transforms ("us").
+// Here's how it goes:
+//
+// 1.
+// A connects to B.  They agree on A's allocator.
+//   A-a->B
+//
+// 2.
+// B connects to C.  Same story. There is no point in a reconnect, but
+// B will request an input reconnect anyway.
+//   A-a->B-a->C
+//
+// 3.
+// C connects to Z.
+// C insists on using A's allocator, but compromises by requesting a reconnect
+// of C's input.
+//   A-a->B-?->C-a->Z
+//
+// We now have pending reconnects on both A--->B and B--->C
+//
+// 4.
+// The A--->B link is reconnected.
+// A asks B for an allocator.  B sees that it has a downstream connection so
+// asks its downstream input pin i.e. C's input pin for an allocator.  C sees
+// that it too has a downstream connection so asks Z for an allocator.
+//
+// Even though Z's input pin is connected, it is being asked for an allocator.
+// It could refuse, in which case the chain is done and will use A's allocator
+// Alternatively, Z may supply one.  A chooses either Z's or A's own one.
+// B's input pin gets NotifyAllocator called to tell it the decision and it
+// propagates this downstream by calling ReceiveAllocator on its output pin
+// which calls NotifyAllocator on the next input pin downstream etc.
+// If the choice is Z then it goes:
+//   A-z->B-a->C-a->Z
+//   A-z->B-z->C-a->Z
+//   A-z->B-z->C-z->Z
+//
+// And that's IT!!  Any further (essentially spurious) reconnects peter out
+// with no change in the chain.
+
+#include "streams.h"
+#include "measure.h"
+#include "transip.h"
+
+
+// =================================================================
+// Implements the CTransInPlaceFilter class
+// =================================================================
+
+CTransInPlaceFilter::CTransInPlaceFilter
+   ( TCHAR     *pName,          // forwarded to the CTransformFilter constructor
+     LPUNKNOWN  pUnk,           // forwarded to the CTransformFilter constructor
+     REFCLSID   clsid,          // forwarded to the CTransformFilter constructor
+     HRESULT   *phr,            // not read or written by this constructor
+     bool       bModifiesData   // stored in m_bModifiesData
+   )
+   : CTransformFilter(pName, pUnk, clsid),
+     m_bModifiesData(bModifiesData)
+{
+#ifdef PERF
+    RegisterPerfId();           // only compiled in when PERF is defined
+#endif //  PERF
+    // nothing else to set up: the pins are created later, in GetPin()
+} // constructor
+
+#ifdef UNICODE
+CTransInPlaceFilter::CTransInPlaceFilter
+   ( CHAR     *pName,           // CHAR-named overload, compiled only when UNICODE is defined
+     LPUNKNOWN  pUnk,           // forwarded to the CTransformFilter constructor
+     REFCLSID   clsid,          // forwarded to the CTransformFilter constructor
+     HRESULT   *phr,            // not read or written by this constructor
+     bool       bModifiesData   // stored in m_bModifiesData
+   )
+   : CTransformFilter(pName, pUnk, clsid),
+     m_bModifiesData(bModifiesData)
+{
+#ifdef PERF
+    RegisterPerfId();           // only compiled in when PERF is defined
+#endif //  PERF
+    // nothing else to set up: the pins are created later, in GetPin()
+} // constructor
+#endif
+
+// Returns the pin with index n (0 = input, 1 = output) as a CBasePin *
+// that has NOT been AddRef'd - the caller must AddRef it if it is kept
+// for longer than the caller's pointer to this filter.  Pins are built on
+// demand rather than in the constructor so that a derived class has the
+// opportunity to return different pin objects.
+//
+// Both pins are created the first time either is needed (unlike the usual
+// transform filter) because enumerators, allocators etc are passed through
+// from one pin to the other.  If either cannot be created, neither is kept.
+
+CBasePin *
+CTransInPlaceFilter::GetPin(int n)
+{
+    HRESULT hr = S_OK;
+
+    // Make sure the input pin exists
+
+    if (NULL == m_pInput) {
+
+        m_pInput = new CTransInPlaceInputPin( NAME("TransInPlace input pin")
+                                            , this        // Owner filter
+                                            , &hr         // Result code
+                                            , L"Input"    // Pin name
+                                            );
+
+        // Constructor for CTransInPlaceInputPin can't fail
+        ASSERT(SUCCEEDED(hr));
+    }
+
+    // Make sure the output pin exists, provided the input pin does
+
+    if (NULL != m_pInput && NULL == m_pOutput) {
+
+        m_pOutput = new CTransInPlaceOutputPin( NAME("TransInPlace output pin")
+                                              , this       // Owner filter
+                                              , &hr        // Result code
+                                              , L"Output"  // Pin name
+                                              );
+
+        ASSERT(SUCCEEDED(hr));
+
+        // Without an output pin the input pin is discarded as well
+        if (NULL == m_pOutput) {
+            delete m_pInput;
+            m_pInput = NULL;
+        }
+    }
+
+    // Select the requested pin; anything but 0 or 1 yields NULL
+    ASSERT (n>=0 && n<=1);
+    switch (n) {
+    case 0:
+        return m_pInput;
+    case 1:
+        return m_pOutput;
+    default:
+        return NULL;
+    }
+
+} // GetPin
+
+
+
+// Called when one of our pins has completed a connection.
+// dir is the direction of our pin; pReceivePin (the peer pin) is unused.
+HRESULT CTransInPlaceFilter::CompleteConnect(PIN_DIRECTION dir,IPin *pReceivePin)
+{
+    UNREFERENCED_PARAMETER(pReceivePin);
+    ASSERT(m_pInput);
+    ASSERT(m_pOutput);
+
+    // if we are not part of a graph, then don't indirect the pointer
+    // this probably prevents use of the filter without a filtergraph
+    if (NULL == m_pGraph) {
+        return VFW_E_NOT_IN_GRAPH;
+    }
+
+    // Output pin connected: always reconnect the input (if it has a
+    // connection) to account for buffering changes.
+    //
+    // Because we don't get to suggest a type on ReceiveConnection
+    // we need another way of making sure the right type gets used.
+    //
+    // One way would be to have our EnumMediaTypes return our output
+    // connection type first but more deterministic and simple is to
+    // call ReconnectEx passing the type we want to reconnect with
+    // via the base class ReconnectPin method.
+
+    if (dir == PINDIR_OUTPUT) {
+        if( !m_pInput->IsConnected() ) {
+            return NOERROR;
+        }
+        return ReconnectPin( m_pInput, &m_pOutput->CurrentMediaType() );
+    }
+
+    ASSERT(dir == PINDIR_INPUT);
+
+    // Input pin connected: the output needs a reconnect only when it is
+    // connected and its media type differs from the input pin's
+
+    const bool bOutputStale =
+           m_pOutput->IsConnected()
+        && m_pInput->CurrentMediaType() != m_pOutput->CurrentMediaType();
+    if (bOutputStale) {
+        return ReconnectPin( m_pOutput, &m_pInput->CurrentMediaType() );
+    }
+    return NOERROR;
+
+} // CompleteConnect
+
+
+//
+// DecideBufferSize
+//
+// Requests buffers from the output pin's allocator (pAlloc) that match
+// what the input side uses.  pProperties receives the request we make.
+// Fails if the input allocator cannot be queried or if the output
+// allocator grants less than was requested.
+//
+
+HRESULT CTransInPlaceFilter::DecideBufferSize
+            ( IMemAllocator *pAlloc
+            , ALLOCATOR_PROPERTIES *pProperties
+            )
+{
+    ALLOCATOR_PROPERTIES Request, Actual;
+    HRESULT hr;
+
+    if (!m_pInput->IsConnected()) {
+        // Nothing upstream to ask, so we're reduced to blind guessing.
+        // Guess one buffer of one byte; if this isn't enough it can be
+        // revised when the other pin does get connected.
+        ZeroMemory(&Request, sizeof(Request));
+        Request.cBuffers = 1;
+        Request.cbBuffer = 1;
+    } else {
+        // Connected upstream: take the properties of the allocator the
+        // input pin is using as the starting point.
+
+        hr = InputPin()->PeekAllocator()->GetProperties(&Request);
+        if (FAILED(hr)) {
+            // Input connected but with a secretive allocator - enough!
+            return hr;
+        }
+    }
+
+
+    DbgLog((LOG_MEMORY,1,TEXT("Setting Allocator Requirements")));
+    DbgLog((LOG_MEMORY,1,TEXT("Count %d, Size %d"),
+           Request.cBuffers, Request.cbBuffer));
+
+    // Hand the requirements to the output side.  Count and size are
+    // raised to at least 1 first, otherwise we'd just hit asserts in
+    // the allocator.
+
+    pProperties->cBuffers = (Request.cBuffers > 0) ? Request.cBuffers : 1;
+    pProperties->cbBuffer = (Request.cbBuffer > 0) ? Request.cbBuffer : 1;
+    pProperties->cbAlign = Request.cbAlign;
+
+    hr = pAlloc->SetProperties(pProperties, &Actual);
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    DbgLog((LOG_MEMORY,1,TEXT("Obtained Allocator Requirements")));
+    DbgLog((LOG_MEMORY,1,TEXT("Count %d, Size %d, Alignment %d"),
+           Actual.cBuffers, Actual.cbBuffer, Actual.cbAlign));
+
+    // The allocator must grant at least the count, size and alignment
+    // that were found on (or guessed for) the input side.
+
+    const bool bGranted = (Actual.cBuffers >= Request.cBuffers)
+                       && (Actual.cbBuffer >= Request.cbBuffer)
+                       && (Actual.cbAlign  >= Request.cbAlign);
+    if (!bGranted) {
+        return E_FAIL;
+    }
+    return NOERROR;
+
+} // DecideBufferSize
+
+//
+// Copy
+//
+// Returns a new sample, taken from the output pin's allocator, holding a
+// copy of pSource's properties, times, media type and data.  The caller
+// owns the returned reference and must Release() it.  Returns NULL if no
+// buffer could be obtained, the properties could not be set, or the data
+// cannot be copied safely (bad pointer, bad length, buffer too small).
+IMediaSample * CTransInPlaceFilter::Copy(IMediaSample *pSource)
+{
+    IMediaSample * pDest;
+    HRESULT hr;
+    REFERENCE_TIME tStart, tStop;
+    const BOOL bTime = S_OK == pSource->GetTime( &tStart, &tStop);
+
+    // this may block for an indeterminate amount of time
+    hr = OutputPin()->PeekAllocator()->GetBuffer(
+              &pDest
+              , bTime ? &tStart : NULL
+              , bTime ? &tStop : NULL
+              , m_bSampleSkipped ? AM_GBF_PREVFRAMESKIPPED : 0
+              );
+    if (FAILED(hr)) {
+        return NULL;
+    }
+
+    ASSERT(pDest);
+    IMediaSample2 *pSample2;
+    if (SUCCEEDED(pDest->QueryInterface(IID_IMediaSample2, (void **)&pSample2))) {
+        // Copy the input properties up to (not including) the buffer pointer
+        hr = pSample2->SetProperties(
+            FIELD_OFFSET(AM_SAMPLE2_PROPERTIES, pbBuffer),
+            (PBYTE)m_pInput->SampleProps());
+        pSample2->Release();
+        if (FAILED(hr)) {
+            pDest->Release();
+            return NULL;
+        }
+    } else {
+        if (bTime) {
+            pDest->SetTime(&tStart, &tStop);
+        }
+        if (S_OK == pSource->IsSyncPoint()) {
+            pDest->SetSyncPoint(TRUE);
+        }
+        if (S_OK == pSource->IsDiscontinuity() || m_bSampleSkipped) {
+            pDest->SetDiscontinuity(TRUE);
+        }
+        if (S_OK == pSource->IsPreroll()) {
+            pDest->SetPreroll(TRUE);
+        }
+
+        // Copy the media type
+        AM_MEDIA_TYPE *pMediaType;
+        if (S_OK == pSource->GetMediaType(&pMediaType)) {
+            pDest->SetMediaType(pMediaType);
+            DeleteMediaType( pMediaType );
+        }
+    }
+
+    m_bSampleSkipped = FALSE;
+
+    // Copy the sample media times
+    REFERENCE_TIME TimeStart, TimeEnd;
+    if (pSource->GetMediaTime(&TimeStart,&TimeEnd) == NOERROR) {
+        pDest->SetMediaTime(&TimeStart,&TimeEnd);
+    }
+
+    // Copy the actual data length and the actual data.  ASSERT is only a
+    // debug-build check, so every precondition of CopyMemory is also tested
+    // at run time: failing the copy is better than overrunning the buffer.
+    const long lDataLength = pSource->GetActualDataLength();
+    BYTE *pSourceBuffer = NULL, *pDestBuffer = NULL;
+    const long lSourceSize = pSource->GetSize();
+    const long lDestSize = pDest->GetSize();
+
+    ASSERT(lDestSize >= lSourceSize && lDestSize >= lDataLength);
+
+    if (   lDataLength < 0 || lDestSize < lDataLength
+        || FAILED(pDest->SetActualDataLength(lDataLength))
+        || FAILED(pSource->GetPointer(&pSourceBuffer))
+        || FAILED(pDest->GetPointer(&pDestBuffer))) {
+        pDest->Release();
+        return NULL;
+    }
+    ASSERT(lDestSize == 0 || pSourceBuffer != NULL && pDestBuffer != NULL);
+
+    CopyMemory( (PVOID) pDestBuffer, (PVOID) pSourceBuffer, lDataLength );
+    return pDest;
+
+} // Copy
+
+
+// override this to customize the transform process
+// (this default copies the sample if the allocators differ, Transform()s it and Deliver()s it)
+HRESULT
+CTransInPlaceFilter::Receive(IMediaSample *pSample)
+{
+    /*  Check for other streams and pass them on */
+    AM_SAMPLE2_PROPERTIES * const pProps = m_pInput->SampleProps();
+    if (pProps->dwStreamId != AM_STREAM_MEDIA) {
+        return m_pOutput->Deliver(pSample);
+    }
+    HRESULT hr;
+
+    // Start timing the TransInPlace (if PERF is defined)
+    MSR_START(m_idTransInPlace);
+
+    if (UsingDifferentAllocators()) {
+
+        // We have to copy the data.  From here on pSample is the copy,
+        // which is Release()d below on both the failure and the normal path.
+        pSample = Copy(pSample);
+
+        if (pSample==NULL) {
+            MSR_STOP(m_idTransInPlace);
+            return E_UNEXPECTED;
+        }
+    }
+
+    // have the derived class transform the data
+    hr = Transform(pSample);
+
+    // Stop the clock and log it (if PERF is defined)
+    MSR_STOP(m_idTransInPlace);
+
+    if (FAILED(hr)) {
+        DbgLog((LOG_TRACE, 1, TEXT("Error from TransInPlace")));
+        if (UsingDifferentAllocators()) {
+            pSample->Release();
+        }
+        return hr;
+    }
+
+    // the Transform() function can return S_FALSE to indicate that the
+    // sample should not be delivered; we only deliver the sample if it's
+    // really S_OK (same as NOERROR, of course.)
+    if (hr == NOERROR) {
+        hr = m_pOutput->Deliver(pSample);
+    } else {
+        //  But it would be an error to return this private workaround
+        //  to the caller ...
+        if (S_FALSE == hr) {
+            // S_FALSE returned from Transform is a PRIVATE agreement
+            // We should return NOERROR from Receive() in this case because
+            // returning S_FALSE from Receive() means that this is the end
+            // of the stream and no more data should be sent.
+            m_bSampleSkipped = TRUE;
+            if (!m_bQualityChanged) {
+                NotifyEvent(EC_QUALITY_CHANGE,0,0);
+                m_bQualityChanged = TRUE;
+            }
+            hr = NOERROR;
+        }
+    }
+
+    // release the output buffer. If the connected pin still needs it,
+    // it will have addrefed it itself.
+    if (UsingDifferentAllocators()) {
+        pSample->Release();
+    }
+
+    return hr;
+
+} // Receive
+
+
+
+// =================================================================
+// Implements the CTransInPlaceInputPin class
+// =================================================================
+
+
+// constructor
+
+// Builds the input pin on top of CTransformInputPin and remembers the
+// owning in-place filter.  The stream starts out as not read-only.
+CTransInPlaceInputPin::CTransInPlaceInputPin
+    ( TCHAR               *pObjectName
+    , CTransInPlaceFilter *pFilter
+    , HRESULT             *phr
+    , LPCWSTR              pName
+    )
+    : CTransformInputPin(pObjectName, pFilter, phr, pName)
+    // Members are listed in declaration order (m_pTIPFilter is declared
+    // before m_bReadOnly), which is the order they are initialised in.
+    , m_pTIPFilter(pFilter)
+    , m_bReadOnly(FALSE)
+{
+    DbgLog((LOG_TRACE, 2
+           , TEXT("CTransInPlaceInputPin::CTransInPlaceInputPin")));
+} // constructor
+
+
+// =================================================================
+// Implements IMemInputPin interface
+// =================================================================
+
+
+// If the downstream filter has one then offer that (even if our own output
+// pin is not using it yet.  If the upstream filter chooses it then we will
+// tell our output pin to ReceiveAllocator).
+// Else if our output pin is using an allocator then offer that.
+//     ( This could mean offering the upstream filter his own allocator,
+//       it could mean offering our own
+//     ) or it could mean offering the one from downstream
+// Else fail to offer any allocator at all.
+
+STDMETHODIMP CTransInPlaceInputPin::GetAllocator(IMemAllocator ** ppAllocator)
+{
+    CheckPointer(ppAllocator,E_POINTER);
+    ValidateReadWritePtr(ppAllocator,sizeof(IMemAllocator *));
+    CAutoLock cObjectLock(m_pLock);
+
+    if ( !m_pTIPFilter->m_pOutput->IsConnected() ) {
+        //  Output not connected yet: fall back to the base class so the
+        //  upstream filter (eg a TIP filter which is having to do a copy)
+        //  gets a temporary allocator.  It is never really used, because
+        //  this pin is reconnected once our output gets connected.
+        return CTransformInputPin::GetAllocator( ppAllocator );
+    }
+
+    //  Output connected: offer the allocator of the downstream input pin
+    //  and, when one was obtained, record it on our output pin as well.
+    CTransInPlaceOutputPin * const pOutPin = m_pTIPFilter->OutputPin();
+    const HRESULT hrDownstream =
+        pOutPin->ConnectedIMemInputPin()->GetAllocator( ppAllocator );
+    if (SUCCEEDED(hrDownstream)) {
+        pOutPin->SetAllocator( *ppAllocator );
+    }
+
+    return hrDownstream;
+
+} // GetAllocator
+
+
+
+/* Get told which allocator the upstream output pin is actually going to use */
+
+
+STDMETHODIMP
+CTransInPlaceInputPin::NotifyAllocator(
+    IMemAllocator * pAllocator,
+    BOOL bReadOnly)
+{
+    HRESULT hr = S_OK;
+    CheckPointer(pAllocator,E_POINTER);
+    ValidateReadPtr(pAllocator,sizeof(IMemAllocator));
+
+    CAutoLock cObjectLock(m_pLock);
+
+    m_bReadOnly = bReadOnly;
+    //  Note that the flag above is stored before any of the checks below,
+    //  so it is updated even if the allocator ends up being rejected.
+
+    //  If our output is not connected just accept the allocator
+    //  We're never going to use this allocator because when our
+    //  output pin is connected we'll reconnect this pin
+    if (!m_pTIPFilter->OutputPin()->IsConnected()) {
+        return CTransformInputPin::NotifyAllocator(pAllocator, bReadOnly);
+    }
+
+    //  If the allocator is read-only and we're modifying data we cannot
+    //  work in place: the output needs an allocator of its own, and the
+    //  offered one is rejected if it is the same as the output pin's
+    if (bReadOnly && m_pTIPFilter->m_bModifiesData) {
+        IMemAllocator *pOutputAllocator =
+            m_pTIPFilter->OutputPin()->PeekAllocator();
+
+        //  Make sure we have an output allocator
+        if (pOutputAllocator == NULL) {
+            hr = m_pTIPFilter->OutputPin()->ConnectedIMemInputPin()->
+                                      GetAllocator(&pOutputAllocator);
+            if(FAILED(hr)) {
+                hr = CreateMemoryAllocator(&pOutputAllocator);
+            }
+            if (SUCCEEDED(hr)) {
+                //  SetAllocator() AddRefs, so drop the reference obtained above
+                m_pTIPFilter->OutputPin()->SetAllocator(pOutputAllocator);
+                pOutputAllocator->Release();
+            }
+        }
+        if (pAllocator == pOutputAllocator) {
+            hr = E_FAIL;
+        } else if(SUCCEEDED(hr)) {
+            //  Must copy so set the allocator properties on the output
+            ALLOCATOR_PROPERTIES Props, Actual;
+            hr = pAllocator->GetProperties(&Props);
+            if (SUCCEEDED(hr)) {
+                hr = pOutputAllocator->SetProperties(&Props, &Actual);
+            }
+            if (SUCCEEDED(hr)) {
+                //  Fail if the output got less than the input asked for
+                if (  (Props.cBuffers > Actual.cBuffers)
+                   || (Props.cbBuffer > Actual.cbBuffer)
+                   || (Props.cbAlign  > Actual.cbAlign)
+                   ) {
+                    hr =  E_FAIL;
+                }
+            }
+
+            //  Tell the downstream input pin about the (writable) output allocator
+            if (SUCCEEDED(hr)) {
+                hr = m_pTIPFilter->OutputPin()->ConnectedIMemInputPin()
+                                       ->NotifyAllocator( pOutputAllocator, FALSE );
+            }
+        }
+    } else {
+        hr = m_pTIPFilter->OutputPin()->ConnectedIMemInputPin()
+                                   ->NotifyAllocator( pAllocator, bReadOnly );
+        if (SUCCEEDED(hr)) {
+            m_pTIPFilter->OutputPin()->SetAllocator( pAllocator );
+        }
+    }
+
+    if (SUCCEEDED(hr)) {
+        // It's possible that the old and the new are the same thing.
+        // AddRef before release ensures that we don't unload it.
+        pAllocator->AddRef();
+
+        if( m_pAllocator != NULL )
+            m_pAllocator->Release();
+
+        m_pAllocator = pAllocator;    // We have an allocator for the input pin
+    }
+
+    return hr;
+} // NotifyAllocator
+
+
+// EnumMediaTypes
+// - pass through to our downstream filter
+STDMETHODIMP CTransInPlaceInputPin::EnumMediaTypes( IEnumMediaTypes **ppEnum )
+{
+    // The types we can offer are exactly those of the pin connected to
+    // our output, so without that connection there is nothing to ask.
+    return m_pTIPFilter->m_pOutput->IsConnected()
+         ? m_pTIPFilter->m_pOutput->GetConnected()->EnumMediaTypes( ppEnum )
+         : VFW_E_NOT_CONNECTED;
+
+} // EnumMediaTypes
+
+
+// CheckMediaType
+// - the filter's CheckInputType must accept the type first; if not connected
+// that is all, otherwise also pass through to the downstream filter.
+// This assumes that the filter does not change the media type.
+
+HRESULT CTransInPlaceInputPin::CheckMediaType(const CMediaType *pmt )
+{
+    // The filter itself gets the first say; anything but S_OK is final.
+    const HRESULT hrFilter = m_pTIPFilter->CheckInputType(pmt);
+    if (S_OK != hrFilter) {
+        return hrFilter;
+    }
+    // With no downstream connection there is nobody else to consult.
+    if( !m_pTIPFilter->m_pOutput->IsConnected() ) return S_OK;
+    return m_pTIPFilter->m_pOutput->GetConnected()->QueryAccept( pmt );
+} // CheckMediaType
+
+
+// If upstream asks us what our requirements are, we will try to ask downstream
+// if that doesn't work, we'll just take the defaults.
+STDMETHODIMP
+CTransInPlaceInputPin::GetAllocatorRequirements(ALLOCATOR_PROPERTIES *pProps)
+{
+    // Without a downstream connection we have no requirements to report.
+    if( !m_pTIPFilter->m_pOutput->IsConnected() )
+        return E_NOTIMPL;
+    // Otherwise the question is simply forwarded to the downstream pin.
+    IMemInputPin * const pDownstream =
+        m_pTIPFilter->OutputPin()->ConnectedIMemInputPin();
+    return pDownstream->GetAllocatorRequirements( pProps );
+} // GetAllocatorRequirements
+
+
+// CTransInPlaceInputPin::CompleteConnect() calls CBaseInputPin::CompleteConnect()
+// and then calls CTransInPlaceFilter::CompleteConnect().  It does this because
+// CTransInPlaceFilter::CompleteConnect() can reconnect a pin and we do not
+// want to reconnect a pin if CBaseInputPin::CompleteConnect() fails.
+HRESULT
+CTransInPlaceInputPin::CompleteConnect(IPin *pReceivePin)
+{
+    // Base pin first; the filter is only told about the connection (and
+    // so only gets the chance to reconnect a pin) once that succeeded.
+    const HRESULT hrBase = CBaseInputPin::CompleteConnect(pReceivePin);
+    return FAILED(hrBase)
+         ? hrBase
+         : m_pTransformFilter->CompleteConnect(PINDIR_INPUT,pReceivePin);
+} // CompleteConnect
+
+
+// =================================================================
+// Implements the CTransInPlaceOutputPin class
+// =================================================================
+
+
+// constructor
+
+// Builds the output pin on top of CTransformOutputPin and keeps a typed
+// pointer to the owning in-place filter.
+CTransInPlaceOutputPin::CTransInPlaceOutputPin
+    ( TCHAR               *pObjectName
+    , CTransInPlaceFilter *pFilter
+    , HRESULT             *phr
+    , LPCWSTR              pPinName
+    )
+    : CTransformOutputPin(pObjectName, pFilter, phr, pPinName)
+    , m_pTIPFilter(pFilter)
+{
+    DbgLog(( LOG_TRACE, 2
+           , TEXT("CTransInPlaceOutputPin::CTransInPlaceOutputPin")));
+
+} // constructor
+
+
+// EnumMediaTypes
+// - pass through to our upstream filter
+STDMETHODIMP CTransInPlaceOutputPin::EnumMediaTypes( IEnumMediaTypes **ppEnum )
+{
+    // Our types are those of the pin feeding our input, so the request
+    // can only be answered while the input pin is connected.
+    return m_pTIPFilter->m_pInput->IsConnected()
+         ? m_pTIPFilter->m_pInput->GetConnected()->EnumMediaTypes( ppEnum )
+         : VFW_E_NOT_CONNECTED;
+
+} // EnumMediaTypes
+
+
+
+// CheckMediaType
+// - agree to anything if not connected,
+// otherwise pass through to the upstream filter.
+
+HRESULT CTransInPlaceOutputPin::CheckMediaType(const CMediaType *pmt )
+{
+    // While we are copying between two allocators and the filter is not
+    // stopped it is too late to resize the input allocator, so the only
+    // type still acceptable is the one already in use on this pin.
+    if (m_pTIPFilter->UsingDifferentAllocators() && !m_pFilter->IsStopped()) {
+        return (*pmt == m_mt) ? S_OK : VFW_E_TYPE_NOT_ACCEPTED;
+    }
+
+    // The filter is assumed not to change the type, which is why the
+    // OUTPUT pin asks CheckINPUTType here.  Anything but S_OK is final.
+    const HRESULT hrFilter = m_pTIPFilter->CheckInputType(pmt);
+    if (S_OK != hrFilter) {
+        return hrFilter;
+    }
+
+    // Not connected upstream: nobody else to consult.
+    if( !m_pTIPFilter->m_pInput->IsConnected() ) {
+        return S_OK;
+    }
+
+    // Connected: the upstream pin must be able to accept the type too.
+    return m_pTIPFilter->m_pInput->GetConnected()->QueryAccept( pmt );
+} // CheckMediaType
+
+
+/* Save the allocator pointer in the output pin
+*/
+void
+CTransInPlaceOutputPin::SetAllocator(IMemAllocator * pAllocator)
+{
+    // Take the new reference before dropping the old one, so that
+    // re-setting the allocator we already hold cannot destroy it.
+    pAllocator->AddRef();
+    if (NULL != m_pAllocator) m_pAllocator->Release();
+    m_pAllocator = pAllocator;
+} // SetAllocator
+
+
+// CTransInPlaceOutputPin::CompleteConnect() calls CBaseOutputPin::CompleteConnect()
+// and then calls CTransInPlaceFilter::CompleteConnect().  It does this because
+// CTransInPlaceFilter::CompleteConnect() can reconnect a pin and we do not want to
+// reconnect a pin if CBaseOutputPin::CompleteConnect() fails.
+// CBaseOutputPin::CompleteConnect() often fails when our output pin is being connected
+// to the Video Mixing Renderer.
+HRESULT
+CTransInPlaceOutputPin::CompleteConnect(IPin *pReceivePin)
+{
+    // Base pin first; the filter is only told about the connection (and
+    // so only gets the chance to reconnect a pin) once that succeeded.
+    const HRESULT hrBase = CBaseOutputPin::CompleteConnect(pReceivePin);
+    return FAILED(hrBase)
+         ? hrBase
+         : m_pTransformFilter->CompleteConnect(PINDIR_OUTPUT,pReceivePin);
+} // CompleteConnect
diff --git a/plugins/GSdx_legacy/baseclasses/transip.h b/plugins/GSdx_legacy/baseclasses/transip.h
new file mode 100644
index 0000000000..4945f62143
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/transip.h
@@ -0,0 +1,250 @@
+//------------------------------------------------------------------------------
+// File: TransIP.h
+//
+// Desc: DirectShow base classes - defines classes from which simple
+//       Transform-In-Place filters may be derived.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+//
+// The difference between this and Transfrm.h is that Transfrm copies the data.
+//
+// It assumes the filter has one input and one output stream, and has no
+// interest in memory management, interface negotiation or anything else.
+//
+// Derive your class from this, and supply Transform and the media type/format
+// negotiation functions. Implement that class, compile and link and
+// you're done.
+
+
+#ifndef __TRANSIP__
+#define __TRANSIP__
+
+// ======================================================================
+// This is the com object that represents a simple transform filter. It
+// supports IBaseFilter, IMediaFilter and two pins through nested interfaces
+// ======================================================================
+
+class CTransInPlaceFilter;
+
+// Several of the pin functions call filter functions to do the work,
+// so you can often use the pin classes unaltered, just overriding the
+// functions in CTransInPlaceFilter.  If that's not enough and you want
+// to derive your own pin class, override GetPin in the filter to supply
+// your own pin classes to the filter.
+
+// ==================================================
+// Implements the input pin
+// ==================================================
+
+class CTransInPlaceInputPin : public CTransformInputPin
+{
+
+protected:
+    CTransInPlaceFilter * const m_pTIPFilter;    // our filter
+    BOOL                 m_bReadOnly;    // incoming stream is read only
+
+public:
+
+    CTransInPlaceInputPin(
+        TCHAR               *pObjectName,
+        CTransInPlaceFilter *pFilter,
+        HRESULT             *phr,
+        LPCWSTR              pName);
+
+    // --- IMemInputPin -----
+
+    // Provide an enumerator for media types by getting one from downstream
+    STDMETHODIMP EnumMediaTypes( IEnumMediaTypes **ppEnum );
+
+    // Say whether media type is acceptable.
+    HRESULT CheckMediaType(const CMediaType* pmt);
+
+    // Offer an allocator: the downstream one if our output is connected
+    STDMETHODIMP GetAllocator(IMemAllocator ** ppAllocator);
+
+    // get told which allocator the upstream output pin is actually
+    // going to use.
+    STDMETHODIMP NotifyAllocator(IMemAllocator * pAllocator,
+                                 BOOL bReadOnly);
+
+    // Allow the filter to see what allocator we have
+    // N.B. This does NOT AddRef
+    IMemAllocator * PeekAllocator() const
+        {  return m_pAllocator; }
+
+    // Pass this on downstream if it ever gets called.
+    STDMETHODIMP GetAllocatorRequirements(ALLOCATOR_PROPERTIES *pProps);
+
+    HRESULT CompleteConnect(IPin *pReceivePin);
+
+    // The read-only flag recorded by the last NotifyAllocator() call.
+    BOOL ReadOnly() const { return m_bReadOnly ; }
+};  // CTransInPlaceInputPin
+
+// ==================================================
+// Implements the output pin
+// ==================================================
+
+class CTransInPlaceOutputPin : public CTransformOutputPin
+{
+
+protected:
+    // m_pFilter points to our CBaseFilter; this is the same filter, typed
+    CTransInPlaceFilter * const m_pTIPFilter;
+
+public:
+
+    CTransInPlaceOutputPin(
+        TCHAR               *pObjectName,
+        CTransInPlaceFilter *pFilter,
+        HRESULT             *phr,
+        LPCWSTR              pName);
+
+
+    // --- CBaseOutputPin ------------
+
+    // negotiate the allocator and its buffer size/count
+    // Insists on using our own allocator.  (Actually the one upstream of us).
+    // We don't override this - instead we just agree the default
+    // then let the upstream filter decide for itself on reconnect
+    // virtual HRESULT DecideAllocator(IMemInputPin * pPin, IMemAllocator ** pAlloc);
+
+    // Provide a media type enumerator.  Get it from upstream.
+    STDMETHODIMP EnumMediaTypes( IEnumMediaTypes **ppEnum );
+
+    // Say whether media type is acceptable.
+    HRESULT CheckMediaType(const CMediaType* pmt);
+
+    //  This just saves the allocator being used on the output pin
+    //  (AddRefs it, Releases the previous one).  Also called by input pin's GetAllocator()
+    void SetAllocator(IMemAllocator * pAllocator);
+    // The stored downstream IMemInputPin pointer.  N.B. This does NOT AddRef
+    IMemInputPin * ConnectedIMemInputPin()
+        { return m_pInputPin; }
+
+    // Allow the filter to see what allocator we have
+    // N.B. This does NOT AddRef
+    IMemAllocator * PeekAllocator() const
+        {  return m_pAllocator; }
+
+    HRESULT CompleteConnect(IPin *pReceivePin);
+
+};  // CTransInPlaceOutputPin
+
+
+class AM_NOVTABLE CTransInPlaceFilter : public CTransformFilter
+{
+
+public:
+
+    // map getpin/getpincount for base enum of pins to owner
+    // override this to return more specialised pin objects
+
+    virtual CBasePin *GetPin(int n);
+
+public:
+
+    //  Set bModifiesData == false if your derived filter does
+    //  not modify the data samples (for instance it's just copying
+    //  them somewhere else or looking at the timestamps).
+
+    CTransInPlaceFilter(TCHAR *, LPUNKNOWN, REFCLSID clsid, HRESULT *,
+                        bool bModifiesData = true);
+#ifdef UNICODE
+    CTransInPlaceFilter(CHAR *, LPUNKNOWN, REFCLSID clsid, HRESULT *,
+                        bool bModifiesData = true);
+#endif
+    // The following are defined to avoid undefined pure virtuals.
+    // Even if they are never called, they will give linkage warnings/errors
+
+    // We override EnumMediaTypes to bypass the transform class enumerator
+    // which would otherwise call this.
+    HRESULT GetMediaType(int iPosition, CMediaType *pMediaType)
+        {   DbgBreak("CTransInPlaceFilter::GetMediaType should never be called");
+            return E_UNEXPECTED;
+        }
+
+    // This is called when we actually have to provide our own allocator.
+    HRESULT DecideBufferSize(IMemAllocator*, ALLOCATOR_PROPERTIES *);
+
+    // The functions which call this in CTransform are overridden in this
+    // class to call CheckInputType with the assumption that the type
+    // does not change.  In Debug builds some calls will be made and
+    // we just ensure that they do not assert.
+    HRESULT CheckTransform(const CMediaType *mtIn, const CMediaType *mtOut)
+    {
+        return S_OK;
+    };
+
+
+    // =================================================================
+    // ----- You may want to override this -----------------------------
+    // =================================================================
+
+    HRESULT CompleteConnect(PIN_DIRECTION dir,IPin *pReceivePin);
+
+    // chance to customize the transform process
+    virtual HRESULT Receive(IMediaSample *pSample);
+
+    // =================================================================
+    // ----- You MUST override these -----------------------------------
+    // =================================================================
+
+    virtual HRESULT Transform(IMediaSample *pSample) PURE;
+
+    // this goes in the factory template table to create new instances
+    // static CCOMObject * CreateInstance(LPUNKNOWN, HRESULT *);
+
+
+#ifdef PERF
+    // Override to register performance measurement with a less generic string
+    // You should do this to avoid confusion with other filters
+    virtual void RegisterPerfId()
+         {m_idTransInPlace = MSR_REGISTER(TEXT("TransInPlace"));}
+#endif // PERF
+
+
+// implementation details
+
+protected:
+    // Copy of pSource used by Receive() when the allocators differ.  Declared
+    // unqualified: a CTransInPlaceFilter:: prefix is ill-formed in-class.
+    IMediaSample * Copy(IMediaSample *pSource);
+#ifdef PERF
+    int m_idTransInPlace;                 // performance measuring id
+#endif // PERF
+    bool  m_bModifiesData;                // Does this filter change the data?
+
+    // our pin classes use the protected members and the accessors below
+
+    friend class CTransInPlaceInputPin;
+    friend class CTransInPlaceOutputPin;
+
+    CTransInPlaceInputPin  *InputPin() const
+    {
+        return (CTransInPlaceInputPin *)m_pInput;
+    };
+    CTransInPlaceOutputPin *OutputPin() const
+    {
+        return (CTransInPlaceOutputPin *)m_pOutput;
+    };
+
+    //  Helper to see if the input and output types match
+    BOOL TypesMatch()
+    {
+        return InputPin()->CurrentMediaType() ==
+               OutputPin()->CurrentMediaType();
+    }
+
+    //  Are the input and output allocators different?
+    BOOL UsingDifferentAllocators() const
+    {
+        return InputPin()->PeekAllocator() != OutputPin()->PeekAllocator();
+    }
+}; // CTransInPlaceFilter
+
+#endif /* __TRANSIP__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/vtrans.cpp b/plugins/GSdx_legacy/baseclasses/vtrans.cpp
new file mode 100644
index 0000000000..2fb587508b
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/vtrans.cpp
@@ -0,0 +1,468 @@
+//------------------------------------------------------------------------------
+// File: Vtrans.cpp
+//
+// Desc: DirectShow base classes.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include "measure.h"
+// #include <vtransfr.h>         // now in precomp file streams.h
+
+CVideoTransformFilter::CVideoTransformFilter(TCHAR *pName, LPUNKNOWN pUnk, REFCLSID clsid)
+    : CTransformFilter(pName, pUnk, clsid)
+    , m_itrLate(0)
+    , m_nKeyFramePeriod(0)      // No QM until we see at least 2 key frames
+    , m_nFramesSinceKeyFrame(0)
+    , m_bSkipping(FALSE)
+    , m_tDecodeStart(0)
+    , m_itrAvgDecode(300000)    // 30mSec - probably allows skipping
+    , m_bQualityChanged(FALSE)
+{
+    m_nWaitForKey = 0;          // Receive() tests/decrements it; not set by StartStreaming()
+#ifdef PERF
+    RegisterPerfId();
+#endif //  PERF
+}
+
+
+CVideoTransformFilter::~CVideoTransformFilter()
+{
+  // Intentionally empty: this class adds no clean-up of its own.
+}
+
+
+// Reset our quality management state
+
+HRESULT CVideoTransformFilter::StartStreaming()
+{
+    m_bSkipping = FALSE;            // skip / quality flags all start clear
+    m_bSampleSkipped = FALSE;
+    m_bQualityChanged = FALSE;
+    m_itrLate = 0;
+    m_nFramesSinceKeyFrame = 0;
+    m_nKeyFramePeriod = 0;          // No QM until we see at least 2 key frames
+    m_tDecodeStart = 0;
+    m_itrAvgDecode = 300000;        // 30mSec - probably allows skipping
+    return NOERROR;
+}
+
+
+// Overridden to reset quality management information
+
+HRESULT CVideoTransformFilter::EndFlush()
+{
+    // The inner scope bounds the lifetime of the lock object: it is gone
+    // again before the base class EndFlush() is called.
+    {
+        CAutoLock receiveLock(&m_csReceive);
+
+        // Reset our quality management stats.  The call is explicitly
+        // qualified on purpose: a derived class' StartStreaming() override
+        // must not run here, only our own variable reset.
+        CVideoTransformFilter::StartStreaming();
+    }
+
+    return CTransformFilter::EndFlush();
+}
+
+
+HRESULT CVideoTransformFilter::AbortPlayback(HRESULT hr)
+{
+    NotifyEvent(EC_ERRORABORT, hr, 0);  // report hr as an error-abort event
+    m_pOutput->DeliverEndOfStream();    // then send end of stream via the output pin
+    return hr;                          // and hand the same code back to the caller
+}
+
+
+// Receive()
+//
+// Accept a sample from upstream, decide whether to process it
+// or drop it.  If we process it then get a buffer from the
+// allocator of the downstream connection, transform it into the
+// new buffer and deliver it to the downstream filter.
+// If we decide not to process it then we do not get a buffer.
+
+// Remember that although this code will notice format changes coming into
+// the input pin, it will NOT change its output format if that results
+// in the filter needing to make a corresponding output format change.  Your
+// derived filter will have to take care of that.  (eg. a palette change if
+// the input and output is an 8 bit format).  If the input sample is discarded
+// and nothing is sent out for this Receive, please remember to put the format
+// change on the first output sample that you actually do send.
+// If your filter will produce the same output type even when the input type
+// changes, then this base class code will do everything you need.
+
+HRESULT CVideoTransformFilter::Receive(IMediaSample *pSample)
+{
+    // If the next filter downstream is the video renderer, then it may
+    // be able to operate in DirectDraw mode which saves copying the data
+    // and gives higher performance.  In that case the buffer which we
+    // get from GetDeliveryBuffer will be a DirectDraw buffer, and
+    // drawing into this buffer draws directly onto the display surface.
+    // This means that any waiting for the correct time to draw occurs
+    // during GetDeliveryBuffer, and that once the buffer is given to us
+    // the video renderer will count it in its statistics as a frame drawn.
+    // This means that any decision to drop the frame must be taken before
+    // calling GetDeliveryBuffer.
+
+    ASSERT(CritCheckIn(&m_csReceive));
+    AM_MEDIA_TYPE *pmtOut = NULL, *pmt = NULL;  // stay NULL unless GetMediaType() hands back a type
+#ifdef DEBUG
+    FOURCCMap fccOut;
+#endif
+    HRESULT hr;
+    ASSERT(pSample);
+    IMediaSample * pOutSample;
+
+    // If no output pin to deliver to then no point sending us data
+    ASSERT (m_pOutput != NULL) ;
+
+    // The source filter may dynamically ask us to start transforming from a
+    // different media type than the one we're using now.  If we don't, we'll
+    // draw garbage. (typically, this is a palette change in the movie,
+    // but could be something more sinister like the compression type changing,
+    // or even the video size changing)
+
+#define rcS1 ((VIDEOINFOHEADER *)(pmt->pbFormat))->rcSource
+#define rcT1 ((VIDEOINFOHEADER *)(pmt->pbFormat))->rcTarget
+
+    pSample->GetMediaType(&pmt);
+    if (pmt != NULL && pmt->pbFormat != NULL) {
+        // spew some debug output
+        ASSERT(!IsEqualGUID(pmt->majortype, GUID_NULL));
+#ifdef DEBUG
+        fccOut.SetFOURCC(&pmt->subtype);
+        LONG lCompression = HEADER(pmt->pbFormat)->biCompression;
+        LONG lBitCount = HEADER(pmt->pbFormat)->biBitCount;
+        LONG lStride = (HEADER(pmt->pbFormat)->biWidth * lBitCount + 7) / 8;
+        lStride = (lStride + 3) & ~3;
+        DbgLog((LOG_TRACE,3,TEXT("*Changing input type on the fly to")));
+        DbgLog((LOG_TRACE,3,TEXT("FourCC: %lx Compression: %lx BitCount: %ld"),
+                fccOut.GetFOURCC(), lCompression, lBitCount));
+        DbgLog((LOG_TRACE,3,TEXT("biHeight: %ld rcDst: (%ld, %ld, %ld, %ld)"),
+                HEADER(pmt->pbFormat)->biHeight,
+                rcT1.left, rcT1.top, rcT1.right, rcT1.bottom));
+        DbgLog((LOG_TRACE,3,TEXT("rcSrc: (%ld, %ld, %ld, %ld) Stride: %ld"),
+                rcS1.left, rcS1.top, rcS1.right, rcS1.bottom,
+                lStride));
+#endif
+        // now switch to using the new format.  I am assuming that the
+        // derived filter will do the right thing when its media type is
+        // switched and streaming is restarted.
+        StopStreaming();
+        m_pInput->CurrentMediaType() = *pmt;
+        DeleteMediaType(pmt);
+        // if this fails, playback will stop, so signal an error
+        hr = StartStreaming();
+        if (FAILED(hr)) {
+            return AbortPlayback(hr);
+        }
+    } else if (pmt != NULL) {
+        // A type without a format block is not acted on, but must still be freed.
+        DeleteMediaType(pmt);
+    }
+
+    // Now that we have noticed any format changes on the input sample, it's
+    // OK to discard it.
+
+    if (ShouldSkipFrame(pSample)) {
+        MSR_NOTE(m_idSkip);
+        m_bSampleSkipped = TRUE;
+        return NOERROR;
+    }
+
+    // Set up the output sample
+    hr = InitializeOutputSample(pSample, &pOutSample);
+
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    m_bSampleSkipped = FALSE;
+
+    // The renderer may ask us to on-the-fly to start transforming to a
+    // different format.  If we don't obey it, we'll draw garbage
+
+#define rcS ((VIDEOINFOHEADER *)(pmtOut->pbFormat))->rcSource
+#define rcT ((VIDEOINFOHEADER *)(pmtOut->pbFormat))->rcTarget
+
+    pOutSample->GetMediaType(&pmtOut);
+    if (pmtOut != NULL && pmtOut->pbFormat != NULL) {
+        // spew some debug output
+        ASSERT(!IsEqualGUID(pmtOut->majortype, GUID_NULL));
+#ifdef DEBUG
+        fccOut.SetFOURCC(&pmtOut->subtype);
+        LONG lCompression = HEADER(pmtOut->pbFormat)->biCompression;
+        LONG lBitCount = HEADER(pmtOut->pbFormat)->biBitCount;
+        LONG lStride = (HEADER(pmtOut->pbFormat)->biWidth * lBitCount + 7) / 8;
+        lStride = (lStride + 3) & ~3;
+        DbgLog((LOG_TRACE,3,TEXT("*Changing output type on the fly to")));
+        DbgLog((LOG_TRACE,3,TEXT("FourCC: %lx Compression: %lx BitCount: %ld"),
+                fccOut.GetFOURCC(), lCompression, lBitCount));
+        DbgLog((LOG_TRACE,3,TEXT("biHeight: %ld rcDst: (%ld, %ld, %ld, %ld)"),
+                HEADER(pmtOut->pbFormat)->biHeight,
+                rcT.left, rcT.top, rcT.right, rcT.bottom));
+        DbgLog((LOG_TRACE,3,TEXT("rcSrc: (%ld, %ld, %ld, %ld) Stride: %ld"),
+                rcS.left, rcS.top, rcS.right, rcS.bottom,
+                lStride));
+#endif
+        // now switch to using the new format.  I am assuming that the
+        // derived filter will do the right thing when its media type is
+        // switched and streaming is restarted.
+        StopStreaming();
+        m_pOutput->CurrentMediaType() = *pmtOut;
+        DeleteMediaType(pmtOut);
+        hr = StartStreaming();
+
+        if (SUCCEEDED(hr)) {
+            // a new format, means a new empty buffer, so wait for a keyframe
+            // before passing anything on to the renderer.
+            // !!! a keyframe may never come, so give up after 30 frames
+            DbgLog((LOG_TRACE,3,TEXT("Output format change means we must wait for a keyframe")));
+            m_nWaitForKey = 30;
+
+        // if this fails, playback will stop, so signal an error
+        } else {
+
+            //  Must release the sample before calling AbortPlayback
+            //  because we might be holding the win16 lock or
+            //  ddraw lock
+            pOutSample->Release();
+            AbortPlayback(hr);
+            return hr;
+        }
+    } else if (pmtOut != NULL) {
+        // A type without a format block is not acted on, but must still be freed.
+        DeleteMediaType(pmtOut);
+    }
+
+    // After a discontinuity, we need to wait for the next key frame
+    if (pSample->IsDiscontinuity() == S_OK) {
+        DbgLog((LOG_TRACE,3,TEXT("Non-key discontinuity - wait for keyframe")));
+        m_nWaitForKey = 30;
+    }
+
+    // Start timing the transform (and log it if PERF is defined)
+
+    if (SUCCEEDED(hr)) {
+        m_tDecodeStart = timeGetTime();
+        MSR_START(m_idTransform);
+
+        // have the derived class transform the data
+        hr = Transform(pSample, pOutSample);
+
+        // Stop the clock (and log it if PERF is defined)
+        MSR_STOP(m_idTransform);
+        m_tDecodeStart = timeGetTime()-m_tDecodeStart;
+        m_itrAvgDecode = m_tDecodeStart*(10000/16) + 15*(m_itrAvgDecode/16);
+
+        // Maybe we're waiting for a keyframe still?
+        if (m_nWaitForKey)
+            m_nWaitForKey--;
+        if (m_nWaitForKey && pSample->IsSyncPoint() == S_OK)
+            m_nWaitForKey = 0;
+
+        // if so, then we don't want to pass this on to the renderer
+        if (m_nWaitForKey && hr == NOERROR) {
+            DbgLog((LOG_TRACE,3,TEXT("still waiting for a keyframe")));
+            hr = S_FALSE;
+        }
+    }
+
+    if (FAILED(hr)) {
+        DbgLog((LOG_TRACE,1,TEXT("Error from video transform")));
+    } else {
+        // the Transform() function can return S_FALSE to indicate that the
+        // sample should not be delivered; we only deliver the sample if it's
+        // really S_OK (same as NOERROR, of course.)
+        // Try not to return S_FALSE to a direct draw buffer (it's wasteful)
+        // Try to take the decision earlier - before you get it.
+
+        if (hr == NOERROR) {
+            hr = m_pOutput->Deliver(pOutSample);
+        } else {
+            // S_FALSE returned from Transform is a PRIVATE agreement
+            // We should return NOERROR from Receive() in this case because returning S_FALSE
+            // from Receive() means that this is the end of the stream and no more data should
+            // be sent.
+            if (S_FALSE == hr) {
+
+                //  We must Release() the sample before doing anything
+                //  like calling the filter graph because having the
+                //  sample means we may have the DirectDraw lock
+                //  (== win16 lock on some versions)
+                pOutSample->Release();
+                m_bSampleSkipped = TRUE;
+                if (!m_bQualityChanged) {
+                    m_bQualityChanged = TRUE;
+                    NotifyEvent(EC_QUALITY_CHANGE,0,0);
+                }
+                return NOERROR;
+            }
+        }
+    }
+
+    // release the output buffer. If the connected pin still needs it,
+    // it will have addrefed it itself.
+    pOutSample->Release();
+    ASSERT(CritCheckIn(&m_csReceive));
+
+    return hr;
+}
+
+
+
+// Decide whether the incoming sample pIn should be dropped rather than
+// decoded.  Returns the (possibly updated) value of m_bSkipping.
+//
+// Side effects visible in this function:
+//   - m_nKeyFramePeriod / m_nFramesSinceKeyFrame track the largest observed
+//     spacing between sync points (key frames).
+//   - m_bSkipping is cleared on every sync point and set once we decide we
+//     are late enough to drop frames until the next key frame.
+//   - m_itrLate is reduced by one frame duration for each frame skipped.
+//   - EC_QUALITY_CHANGE is sent once (guarded by m_bQualityChanged) when
+//     skipping is in effect.
+BOOL CVideoTransformFilter::ShouldSkipFrame( IMediaSample * pIn)
+{
+    REFERENCE_TIME trStart, trStopAt;
+    HRESULT hr = pIn->GetTime(&trStart, &trStopAt);
+
+    // Don't skip frames with no timestamps
+    // (anything other than S_OK is treated as "no usable time stamps").
+    if (hr != S_OK)
+	return FALSE;
+
+    int itrFrame = (int)(trStopAt - trStart);  // frame duration
+
+    if(S_OK==pIn->IsSyncPoint()) {
+        // Key frame: remember the longest gap seen so far, restart the
+        // counter and stop any skipping that was in progress.
+        MSR_INTEGER(m_idFrameType, 1);
+        if ( m_nKeyFramePeriod < m_nFramesSinceKeyFrame ) {
+            // record the max
+            m_nKeyFramePeriod = m_nFramesSinceKeyFrame;
+        }
+        m_nFramesSinceKeyFrame = 0;
+        m_bSkipping = FALSE;
+    } else {
+        MSR_INTEGER(m_idFrameType, 2);
+        if (  m_nFramesSinceKeyFrame>m_nKeyFramePeriod
+           && m_nKeyFramePeriod>0
+           ) {
+            // We haven't seen the key frame yet, but we were clearly being
+            // overoptimistic about how frequent they are.
+            m_nKeyFramePeriod = m_nFramesSinceKeyFrame;
+        }
+    }
+
+
+    // Whatever we might otherwise decide,
+    // if we are taking only a small fraction of the required frame time to decode
+    // then any quality problems are actually coming from somewhere else.
+    // Could be a net problem at the source for instance.  In this case there's
+    // no point in us skipping frames here.
+    // The test below is therefore the opposite case: skipping is only
+    // considered when the average decode time exceeds a quarter of the
+    // frame duration.
+    if (m_itrAvgDecode*4>itrFrame) {
+
+        // Don't skip unless we are at least a whole frame late.
+        // (We would skip B frames if more than 1/2 frame late, but they're safe).
+        if ( m_itrLate > itrFrame ) {
+
+            // Don't skip unless the anticipated key frame would be no more than
+            // 1 frame early.  If the renderer has not been waiting (we *guess*
+            // it hasn't because we're late) then it will allow frames to be
+            // played early by up to a frame.
+
+            // Let T = Stream time from now to anticipated next key frame
+            // = (frame duration) * (KeyFramePeriod - FramesSinceKeyFrame)
+            // So we skip if T - Late < one frame  i.e.
+            //   (duration) * (freq - FramesSince) - Late < duration
+            // or (duration) * (freq - FramesSince - 1) < Late
+
+            // We don't dare skip until we have seen some key frames and have
+            // some idea how often they occur and they are reasonably frequent.
+            if (m_nKeyFramePeriod>0) {
+                // It would be crazy - but we could have a stream with key frames
+                // a very long way apart - and if they are further than about
+                // 3.5 minutes apart then we could get arithmetic overflow in
+                // reference time units.  Therefore we switch to mSec at this point
+                // (hence the divisions by 10000 below).
+                int it = (itrFrame/10000)
+                         * (m_nKeyFramePeriod-m_nFramesSinceKeyFrame -  1);
+                MSR_INTEGER(m_idTimeTillKey, it);
+
+                // For debug - might want to see the details - dump them as scratch pad
+#ifdef VTRANSPERF
+                MSR_INTEGER(0, itrFrame);
+                MSR_INTEGER(0, m_nFramesSinceKeyFrame);
+                MSR_INTEGER(0, m_nKeyFramePeriod);
+#endif
+                if (m_itrLate/10000 > it) {
+                    m_bSkipping = TRUE;
+                    // Now we are committed.  Once we start skipping, we
+                    // cannot stop until we hit a key frame.
+                } else {
+#ifdef VTRANSPERF
+                    MSR_INTEGER(0, 777770);  // not near enough to next key
+#endif
+                }
+            } else {
+#ifdef VTRANSPERF
+                MSR_INTEGER(0, 777771);  // Next key not predictable
+#endif
+            }
+        } else {
+#ifdef VTRANSPERF
+            MSR_INTEGER(0, 777772);  // Less than one frame late
+            MSR_INTEGER(0, m_itrLate);
+            MSR_INTEGER(0, itrFrame);
+#endif
+        }
+    } else {
+#ifdef VTRANSPERF
+        MSR_INTEGER(0, 777773);  // Decode time short - not worth skipping
+        MSR_INTEGER(0, m_itrAvgDecode);
+        MSR_INTEGER(0, itrFrame);
+#endif
+    }
+
+    // Counted for every time-stamped frame, including sync points (which
+    // reset the counter to 0 above before it is incremented here).
+    ++m_nFramesSinceKeyFrame;
+
+    if (m_bSkipping) {
+        // We will count down the lateness as we skip each frame.
+        // We re-assess each frame.  The key frame might not arrive when expected.
+        // We reset m_itrLate if we get a new Quality message, but actually that's
+        // not likely because we're not sending frames on to the Renderer.  In
+        // fact if we DID get another one it would mean that there's a long
+        // pipe between us and the renderer and we might need an altogether
+        // better strategy to avoid hunting!
+        m_itrLate = m_itrLate - itrFrame;
+    }
+
+    MSR_INTEGER(m_idLate, (int)m_itrLate/10000 ); // Note how late we think we are
+    if (m_bSkipping) {
+        // Tell the application once that quality has been degraded.
+        if (!m_bQualityChanged) {
+            m_bQualityChanged = TRUE;
+            NotifyEvent(EC_QUALITY_CHANGE,0,0);
+        }
+    }
+    return m_bSkipping;
+}
+
+
+// Record the lateness reported by a downstream Quality message.
+//
+// q.Late is a 64-bit reference time; m_itrLate is deliberately an int (see
+// below), so the incoming value is clamped to +/- 30 seconds BEFORE it is
+// narrowed.  Only q.Late is used; the other fields of q are ignored.
+//
+// Always returns E_FAIL, which tells the caller that this filter has not
+// handled the message and that quality should be handled elsewhere.
+HRESULT CVideoTransformFilter::AlterQuality(Quality q)
+{
+    // to reduce the amount of 64 bit arithmetic, m_itrLate is an int.
+    // +, -, >, == etc  are not too bad, but * and / are painful.
+
+    // Avoid overflow and silliness - more than 30 secs late is already silly.
+    // The limit must be applied to the incoming 64-bit value: testing the
+    // previously stored m_itrLate would let a huge q.Late be truncated by
+    // the cast and stored as an arbitrary (possibly negative) lateness.
+    const REFERENCE_TIME itrLimit = 300000000;
+
+    if (q.Late > itrLimit) {
+        m_itrLate = 300000000;
+    } else if (q.Late < -itrLimit) {
+        // Symmetric guard so a very early sample cannot wrap round into a
+        // large positive lateness when narrowed to int.
+        m_itrLate = -300000000;
+    } else {
+        m_itrLate = (int)q.Late;
+    }
+    // We ignore the other fields
+
+    // We're actually not very good at handling this.  In non-direct draw mode
+    // most of the time can be spent in the renderer which can skip any frame.
+    // In that case we'd rather the renderer handled things.
+    // Nevertheless we will keep an eye on it and if we really start getting
+    // a very long way behind then we will actually skip - but we'll still tell
+    // the renderer (or whoever is downstream) that they should handle quality.
+
+    return E_FAIL;     // Tell the renderer to do his thing.
+
+}
+
+
+
+// This will avoid several hundred useless warnings if compiled -W4 by MS VC++ v4
+// (C4514 is MSVC's "unreferenced inline function has been removed" warning).
+#pragma warning(disable:4514)
+
diff --git a/plugins/GSdx_legacy/baseclasses/vtrans.h b/plugins/GSdx_legacy/baseclasses/vtrans.h
new file mode 100644
index 0000000000..05a8aefc1c
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/vtrans.h
@@ -0,0 +1,143 @@
+//------------------------------------------------------------------------------
+// File: VTrans.h
+//
+// Desc: DirectShow base classes - defines a video transform class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+// This class is derived from CTransformFilter, but is specialised to handle
+// the requirements of video quality control by frame dropping.
+// This is a non-in-place transform, (i.e. it copies the data) such as a decoder.
+
+// Video transform filter with built-in quality control: when the stream is
+// running late it drops frames up to the next sync point (key frame).
+class CVideoTransformFilter : public CTransformFilter
+{
+  public:
+
+    CVideoTransformFilter(TCHAR *, LPUNKNOWN, REFCLSID clsid);
+    ~CVideoTransformFilter();
+    HRESULT EndFlush();
+
+    // =================================================================
+    // ----- override these bits ---------------------------------------
+    // =================================================================
+    // The following methods are in CTransformFilter which is inherited.
+    // They are mentioned here for completeness
+    //
+    // These MUST be supplied in a derived class
+    //
+    // NOTE:
+    // virtual HRESULT Transform(IMediaSample * pIn, IMediaSample *pOut);
+    // virtual HRESULT CheckInputType(const CMediaType* mtIn) PURE;
+    // virtual HRESULT CheckTransform
+    //     (const CMediaType* mtIn, const CMediaType* mtOut) PURE;
+    // static CCOMObject * CreateInstance(LPUNKNOWN, HRESULT *);
+    // virtual HRESULT DecideBufferSize
+    //     (IMemAllocator * pAllocator, ALLOCATOR_PROPERTIES *pprop) PURE;
+    // virtual HRESULT GetMediaType(int iPosition, CMediaType *pMediaType) PURE;
+    //
+    // These MAY also be overridden
+    //
+    // virtual HRESULT StopStreaming();
+    // virtual HRESULT SetMediaType(PIN_DIRECTION direction,const CMediaType *pmt);
+    // virtual HRESULT CheckConnect(PIN_DIRECTION dir,IPin *pPin);
+    // virtual HRESULT BreakConnect(PIN_DIRECTION dir);
+    // virtual HRESULT CompleteConnect(PIN_DIRECTION direction,IPin *pReceivePin);
+    // virtual HRESULT EndOfStream(void);
+    // virtual HRESULT BeginFlush(void);
+    // virtual HRESULT EndFlush(void);
+    // virtual HRESULT NewSegment
+    //     (REFERENCE_TIME tStart,REFERENCE_TIME tStop,double dRate);
+#ifdef PERF
+
+    // Registers the performance-logging ids used by this class, then chains
+    // to the base class.  Only compiled when PERF is defined.
+    // If you override this - ensure that you register all these ids
+    // as well as any of your own,
+    virtual void RegisterPerfId() {
+        m_idSkip        = MSR_REGISTER(TEXT("Video Transform Skip frame"));
+        m_idFrameType   = MSR_REGISTER(TEXT("Video transform frame type"));
+        m_idLate        = MSR_REGISTER(TEXT("Video Transform Lateness"));
+        m_idTimeTillKey = MSR_REGISTER(TEXT("Video Transform Estd. time to next key"));
+        CTransformFilter::RegisterPerfId();
+    }
+#endif
+
+  protected:
+
+    // =========== QUALITY MANAGEMENT IMPLEMENTATION ========================
+    // Frames are assumed to come in three types:
+    // Type 1: an AVI key frame or an MPEG I frame.
+    //        This frame can be decoded with no history.
+    //        Dropping this frame means that no further frame can be decoded
+    //        until the next type 1 frame.
+    //        Type 1 frames are sync points.
+    // Type 2: an AVI non-key frame or an MPEG P frame.
+    //        This frame cannot be decoded unless the previous type 1 frame was
+    //        decoded and all type 2 frames since have been decoded.
+    //        Dropping this frame means that no further frame can be decoded
+    //        until the next type 1 frame.
+    // Type 3: An MPEG B frame.
+    //        This frame cannot be decoded unless the previous type 1 or 2 frame
+    //        has been decoded AND the subsequent type 1 or 2 frame has also
+    //        been decoded.  (This requires decoding the frames out of sequence).
+    //        Dropping this frame affects no other frames.  This implementation
+    //        does not allow for these.  All non-sync-point frames are treated
+    //        as being type 2.
+    //
+    // The spacing of frames of type 1 in a file is not guaranteed.  There MUST
+    // be a type 1 frame at (well, near) the start of the file in order to start
+    // decoding at all.  After that there could be one every half second or so,
+    // there could be one at the start of each scene (aka "cut", "shot") or
+    // there could be no more at all.
+    // If there is only a single type 1 frame then NO FRAMES CAN BE DROPPED
+    // without losing all the rest of the movie.  There is no way to tell whether
+    // this is the case, so we find that we are in the gambling business.
+    // To try to improve the odds, we record the greatest interval between type 1s
+    // that we have seen and we bet on things being no worse than this in the
+    // future.
+
+    // You can tell if it's a type 1 frame by calling IsSyncPoint().
+    // There is no architected way to test for a type 3, so you should override
+    // the quality management here if you have B-frames.
+
+    int m_nKeyFramePeriod; // the largest observed interval between type 1 frames
+                           // 1 means every frame is type 1, 2 means every other.
+
+    int m_nFramesSinceKeyFrame; // Used to count frames since the last type 1.
+                                // becomes the new m_nKeyFramePeriod if greater.
+
+    BOOL m_bSkipping;           // we are skipping to the next type 1 frame
+
+#ifdef PERF
+    int m_idFrameType;          // MSR id Frame type.  1=Key, 2="non-key"
+    int m_idSkip;               // MSR id skipping
+    int m_idLate;               // MSR id lateness
+    int m_idTimeTillKey;        // MSR id for guessed time till next key frame.
+#endif
+
+    virtual HRESULT StartStreaming();
+
+    HRESULT AbortPlayback(HRESULT hr);	// if something bad happens
+
+    // Receives one input sample, has the derived class Transform() it and
+    // delivers the result downstream unless the frame is being dropped.
+    HRESULT Receive(IMediaSample *pSample);
+
+    // Updates m_itrLate from a Quality message; returns E_FAIL so that
+    // quality is still handled elsewhere as well.
+    HRESULT AlterQuality(Quality q);
+
+    // Returns TRUE if the sample pIn should be dropped instead of decoded.
+    BOOL ShouldSkipFrame(IMediaSample * pIn);
+
+    int m_itrLate;              // lateness from last Quality message
+                                // (this overflows at 214 secs late).
+    int m_tDecodeStart;         // timeGetTime when decode started; Receive()
+                                // then reuses it to hold the elapsed decode
+                                // time of the sample just transformed.
+    int m_itrAvgDecode;         // Average decode time in reference units
+                                // (running average; each new sample is
+                                // weighted 1/16 in Receive()).
+
+    BOOL m_bNoSkip;             // debug - no skipping.
+
+    // We send an EC_QUALITY_CHANGE notification to the app if we have to degrade.
+    // We send one when we start degrading, not one for every frame, this means
+    // we track whether we've sent one yet.
+    BOOL m_bQualityChanged;
+
+    // When non-zero, don't pass anything to renderer until next keyframe
+    // If there are few keys, give up and eventually draw something
+    // (Receive() decrements this for every sample and zeroes it when a
+    // sync point arrives).
+    int m_nWaitForKey;
+};
diff --git a/plugins/GSdx_legacy/baseclasses/wxdebug.cpp b/plugins/GSdx_legacy/baseclasses/wxdebug.cpp
new file mode 100644
index 0000000000..3b8667dad3
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxdebug.cpp
@@ -0,0 +1,1418 @@
+//------------------------------------------------------------------------------
+// File: WXDebug.cpp
+//
+// Desc: DirectShow base classes - implements ActiveX system debugging
+//       facilities.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+#include <stdarg.h>
+#include <stdio.h>
+
+// In DEBUG builds make sure _UNICODE is defined whenever UNICODE is.
+#ifdef DEBUG
+#ifdef UNICODE
+#ifndef _UNICODE
+#define _UNICODE
+#endif // _UNICODE
+#endif // UNICODE
+#endif // DEBUG
+
+#ifdef DEBUG
+
+// The Win32 wsprintf() function writes a maximum of 1024 characters to its output buffer.
+// See the documentation for wsprintf()'s lpOut parameter for more information.
+const INT iDEBUGINFO = 1024;                 // Used to format strings
+
+/* For every module and executable we store a debugging level for each of
+   the logging categories listed below (eg LOG_ERROR and LOG_TIMING). This
+   makes it easy to isolate and debug individual modules without seeing
+   everybody else's spurious debug output. The keys are stored in the registry
+   under the HKEY_LOCAL_MACHINE\SOFTWARE\Debug\<Module Name>\<KeyName> key
+   values.
+   NOTE these must be in the same order as their enumeration definition */
+
+TCHAR *pKeyNames[] = {
+    TEXT("TIMING"),      // Timing and performance measurements
+    TEXT("TRACE"),       // General step point call tracing
+    TEXT("MEMORY"),      // Memory and object allocation/destruction
+    TEXT("LOCKING"),     // Locking/unlocking of critical sections
+    TEXT("ERROR"),       // Debug error notification
+    TEXT("CUSTOM1"),
+    TEXT("CUSTOM2"),
+    TEXT("CUSTOM3"),
+    TEXT("CUSTOM4"),
+    TEXT("CUSTOM5")
+    };
+
+// Format strings belonging to CAutoTrace (declared outside this file).
+const TCHAR CAutoTrace::_szEntering[] = TEXT("->: %s");
+const TCHAR CAutoTrace::_szLeaving[]  = TEXT("<-: %s");
+
+const INT iMAXLEVELS = NUMELMS(pKeyNames);  // Maximum debug categories
+
+HINSTANCE m_hInst;                          // Module instance handle
+TCHAR m_ModuleName[iDEBUGINFO];             // Cut down module name
+DWORD m_Levels[iMAXLEVELS];                 // Debug level per category
+CRITICAL_SECTION m_CSDebug;                 // Controls access to list
+DWORD m_dwNextCookie;                       // Next active object ID
+ObjectDesc *pListHead = NULL;               // First active object
+DWORD m_dwObjectCount;                      // Active object count
+BOOL m_bInit = FALSE;                       // Have we been initialised
+HANDLE m_hOutput = INVALID_HANDLE_VALUE;    // Optional output written here
+DWORD dwWaitTimeout = INFINITE;             // Default timeout value
+DWORD dwTimeOffset;			    // Set from timeGetTime() in DbgInitialise
+bool g_fUseKASSERT = false;                 // don't create messagebox
+bool g_fDbgInDllEntryPoint = false;         // set while in the DLL entry point;
+                                            // message boxes are then shown on
+                                            // the calling thread
+bool g_fAutoRefreshLevels = false;
+
+const TCHAR *pBaseKey = TEXT("SOFTWARE\\Debug");   // Root of all debug settings
+const TCHAR *pGlobalKey = TEXT("GLOBAL");           // Subkey for global settings
+static CHAR *pUnknownName = "UNKNOWN";
+
+TCHAR *TimeoutName = TEXT("TIMEOUT");       // Registry value holding dwWaitTimeout
+
+/* Start-up entry point of the debug library. Remembers the module instance
+   handle (used to look up the module's file name), creates the critical
+   section, derives the module name, optionally breaks into the debugger if
+   the profile setting "BreakOnLoad" is non-zero for this module, loads the
+   per-module and then the global registry settings, and records the current
+   timeGetTime() value in dwTimeOffset */
+
+void WINAPI DbgInitialise(HINSTANCE hInst)
+{
+    m_hInst = hInst;
+
+    InitializeCriticalSection(&m_CSDebug);
+    m_bInit = TRUE;
+
+    DbgInitModuleName();
+
+    const UINT uBreakOnLoad = GetProfileInt(m_ModuleName, TEXT("BreakOnLoad"), 0);
+    if (uBreakOnLoad != 0) {
+        DebugBreak();
+    }
+
+    /* Module values are taken as they are; global values can only raise them */
+    DbgInitModuleSettings(false);
+    DbgInitGlobalSettings(true);
+
+    dwTimeOffset = timeGetTime();
+}
+
+
+/* Releases what the debug library holds: the optional output handle is
+   closed (and reset to INVALID_HANDLE_VALUE), the critical section is
+   deleted and the library is marked as uninitialised. Registry values are
+   only read during initialisation, so a changed setting is not picked up
+   until the application is restarted */
+
+void WINAPI DbgTerminate()
+{
+    const bool fOutputOpen = (m_hOutput != INVALID_HANDLE_VALUE);
+
+    if (fOutputOpen) {
+        EXECUTE_ASSERT(CloseHandle(m_hOutput));
+        m_hOutput = INVALID_HANDLE_VALUE;
+    }
+
+    DeleteCriticalSection(&m_CSDebug);
+    m_bInit = FALSE;
+}
+
+
+/* Loads the debug level of every logging category, plus the hang-detection
+   timeout, from the values stored under the already opened registry key
+   hKey. When fTakeMax is true a stored level only ever raises the current
+   level; otherwise it replaces the current level unless that level carries
+   the LOG_FORCIBLY_SET flag. Missing or wrongly typed values are recreated
+   in the registry with their defaults (0 for levels, INFINITE for the
+   timeout) */
+
+void WINAPI DbgInitKeyLevels(HKEY hKey, bool fTakeMax)
+{
+    LONG lStatus;               // Result of the last registry call
+    DWORD cbData;               // Buffer size in, bytes transferred out
+    DWORD dwType;               // Registry type of the value just read
+    DWORD dwLevel;              // Level read for the current category
+
+    /* One registry value per category, named by pKeyNames[] */
+    for (LONG iCategory = 0; iCategory < iMAXLEVELS; ++iCategory) {
+
+        cbData = sizeof(DWORD);
+        lStatus = RegQueryValueEx(hKey,
+                                  pKeyNames[iCategory],
+                                  NULL,
+                                  &dwType,
+                                  (LPBYTE) &dwLevel,
+                                  &cbData);
+
+        const bool fUsable = (lStatus == ERROR_SUCCESS) && (dwType == REG_DWORD);
+
+        if (!fUsable) {
+            /* Fall back to level 0 and try to store that as a DWORD so the
+               value exists next time round */
+            dwLevel = 0;
+            lStatus = RegSetValueEx(hKey,
+                                    pKeyNames[iCategory],
+                                    (DWORD) 0,
+                                    REG_DWORD,
+                                    (PBYTE) &dwLevel,
+                                    sizeof(DWORD));
+
+            if (lStatus != ERROR_SUCCESS) {
+                DbgLog((LOG_ERROR,0,TEXT("Could not create subkey %s"),pKeyNames[iCategory]));
+            }
+        }
+
+        if (fTakeMax) {
+            /* Only ever raise the level */
+            if (dwLevel > m_Levels[iCategory]) {
+                m_Levels[iCategory] = dwLevel;
+            }
+        } else if ((m_Levels[iCategory] & LOG_FORCIBLY_SET) == 0) {
+            m_Levels[iCategory] = dwLevel;
+        }
+    }
+
+    /*  Read the timeout value for catching hangs straight into dwWaitTimeout */
+    cbData = sizeof(DWORD);
+    lStatus = RegQueryValueEx(hKey,
+                              TimeoutName,
+                              NULL,
+                              &dwType,
+                              (LPBYTE) &dwWaitTimeout,
+                              &cbData);
+
+    if (lStatus == ERROR_SUCCESS && dwType == REG_DWORD) {
+        return;
+    }
+
+    /* No usable timeout stored: wait forever and try to record that value */
+    dwWaitTimeout = INFINITE;
+    lStatus = RegSetValueEx(hKey,
+                            TimeoutName,
+                            (DWORD) 0,
+                            REG_DWORD,
+                            (PBYTE) &dwWaitTimeout,
+                            sizeof(DWORD));
+
+    if (lStatus != ERROR_SUCCESS) {
+        DbgLog((LOG_ERROR,0,TEXT("Could not create subkey %s"),TimeoutName));
+    }
+}
+
+/* Writes the string psz to the current debug output. If an output handle
+   has been opened (m_hOutput) the text is written there as ANSI bytes
+   without a terminating NUL; otherwise it goes to OutputDebugString().
+
+   In UNICODE builds the text is first converted to the ANSI code page. The
+   number of bytes written is the number of bytes the conversion actually
+   produced - NOT the number of wide characters - because the two differ for
+   multi-byte code pages. If the conversion fails (for example because the
+   text does not fit in the local buffer) nothing is written to the handle
+   and the text is sent to OutputDebugString() instead, rather than writing
+   uninitialised stack memory to the log */
+
+void WINAPI DbgOutString(LPCTSTR psz)
+{
+    if (m_hOutput != INVALID_HANDLE_VALUE) {
+        DWORD dw;
+#ifdef UNICODE
+        CHAR szDest[2048];
+        // With an input length of -1 the returned byte count includes the
+        // terminating NUL; zero means the conversion failed.
+        int cbDest = WideCharToMultiByte(CP_ACP, 0, psz, -1, szDest, NUMELMS(szDest), 0, 0);
+        if (cbDest > 0) {
+            WriteFile (m_hOutput, szDest, (DWORD)(cbDest - 1), &dw, NULL);
+        } else {
+            OutputDebugString (psz);
+        }
+#else
+        UINT  cb = lstrlen(psz);
+        WriteFile (m_hOutput, psz, cb, &dw, NULL);
+#endif
+    } else {
+        OutputDebugString (psz);
+    }
+}
+
+/* Sets up the alternate logging destination from the "LogToFile" string
+   value stored under the open registry key hKey (called from
+   DbgInitModuleSettings with the module's key).
+
+     - value missing or not a string : an empty "LogToFile" value is created
+                                       and no alternate output is used
+     - "Console"                     : output goes to the standard output
+                                       handle, allocating a console if needed
+     - "Debug", "Debugger", "Deb"    : no handle is opened, so output goes
+                                       to the debugger
+     - anything else                 : treated as a file name, opened for
+                                       appending
+
+   Any previously opened output handle is closed first */
+
+void WINAPI DbgInitLogTo (
+    HKEY hKey)
+{
+    LONG  lReturn;
+    DWORD dwKeyType;
+    DWORD dwKeySize;
+    TCHAR szFile[MAX_PATH] = {0};
+    static const TCHAR cszKey[] = TEXT("LogToFile");
+
+    // The size is in BYTES. Keep the last TCHAR back for a terminator:
+    // strings stored in the registry are not guaranteed to be terminated.
+    dwKeySize = sizeof(szFile) - sizeof(TCHAR);
+    lReturn = RegQueryValueEx(
+        hKey,                       // Handle to an open key
+        cszKey,                     // Subkey name derivation
+        NULL,                       // Reserved field
+        &dwKeyType,                 // Returns the field type
+        (LPBYTE) szFile,            // Returns the field's value
+        &dwKeySize);                // Number of bytes transferred
+
+    // create an empty key if it does not already exist
+    //
+    if (lReturn != ERROR_SUCCESS || dwKeyType != REG_SZ)
+       {
+       // Whatever the failed or wrongly typed query left in the buffer is
+       // not a file name - discard it so that an empty string is stored and
+       // nothing is opened below.
+       szFile[0] = 0;
+       dwKeySize = sizeof(TCHAR);
+       lReturn = RegSetValueEx(
+            hKey,                   // Handle of an open key
+            cszKey,                 // Address of subkey name
+            (DWORD) 0,              // Reserved field
+            REG_SZ,                 // Type of the key field
+            (PBYTE)szFile,          // Value for the field
+            dwKeySize);             // Size of the field buffer
+       }
+
+    // Guarantee termination whatever the registry returned.
+    szFile[NUMELMS(szFile) - 1] = 0;
+
+    // if an output-to was specified.  try to open it.
+    //
+    if (m_hOutput != INVALID_HANDLE_VALUE) {
+       EXECUTE_ASSERT(CloseHandle (m_hOutput));
+       m_hOutput = INVALID_HANDLE_VALUE;
+    }
+    if (szFile[0] != 0)
+       {
+       if (!lstrcmpi(szFile, TEXT("Console"))) {
+          // GetStdHandle() reports "no standard handle" as NULL (and errors
+          // as INVALID_HANDLE_VALUE), so both mean we have no console yet.
+          m_hOutput = GetStdHandle (STD_OUTPUT_HANDLE);
+          if (m_hOutput == INVALID_HANDLE_VALUE || m_hOutput == NULL) {
+             AllocConsole ();
+             m_hOutput = GetStdHandle (STD_OUTPUT_HANDLE);
+          }
+          // The rest of the library only recognises INVALID_HANDLE_VALUE as
+          // "no output handle"; never leave a NULL handle behind.
+          if (m_hOutput == NULL) {
+             m_hOutput = INVALID_HANDLE_VALUE;
+          }
+          SetConsoleTitle (TEXT("ActiveX Debug Output"));
+       } else if (lstrcmpi(szFile, TEXT("Debug")) &&
+                lstrcmpi(szFile, TEXT("Debugger")) &&
+                lstrcmpi(szFile, TEXT("Deb")))
+          {
+            m_hOutput = CreateFile(szFile, GENERIC_WRITE,
+                                 FILE_SHARE_READ,
+                                 NULL, OPEN_ALWAYS,
+                                 FILE_ATTRIBUTE_NORMAL,
+                                 NULL);
+          if (INVALID_HANDLE_VALUE != m_hOutput)
+              {
+              // Append to whatever is already in the file.
+              static const TCHAR cszBar[] = TEXT("\r\n\r\n=====DbgInitialize()=====\r\n\r\n");
+              SetFilePointer (m_hOutput, 0, NULL, FILE_END);
+              DbgOutString (cszBar);
+              }
+          }
+       }
+}
+
+
+
+/* Reads the global debug settings, i.e. the values stored under
+   HKEY_LOCAL_MACHINE\SOFTWARE\Debug\GLOBAL, and applies them through
+   DbgInitKeyLevels(). Each module normally has its own values for the
+   different debug categories; the global key applies to ALL modules. The
+   key is created if it does not exist yet. If it cannot be opened an error
+   is logged and nothing is changed */
+
+void WINAPI DbgInitGlobalSettings(bool fTakeMax)
+{
+    TCHAR szKeyPath[iDEBUGINFO];    // "<base key>\<global key>"
+    HKEY hKeyGlobal;                // Receives the opened key
+
+    (void)StringCchPrintf(szKeyPath,NUMELMS(szKeyPath),TEXT("%s\\%s"),pBaseKey,pGlobalKey);
+
+    const LONG lStatus = RegCreateKeyEx(HKEY_LOCAL_MACHINE,   // Parent key
+                                        szKeyPath,            // Subkey path
+                                        (DWORD) 0,            // Reserved
+                                        NULL,                 // Class name
+                                        (DWORD) 0,            // Options
+                                        KEY_ALL_ACCESS,       // Access wanted
+                                        NULL,                 // Security
+                                        &hKeyGlobal,          // Result handle
+                                        NULL);                // Disposition
+
+    if (lStatus == ERROR_SUCCESS) {
+        DbgInitKeyLevels(hKeyGlobal, fTakeMax);
+        RegCloseKey(hKeyGlobal);
+    } else {
+        DbgLog((LOG_ERROR,0,TEXT("Could not access GLOBAL module key")));
+    }
+}
+
+
+/* Loads the debug settings that belong to this module. The key
+   HKEY_LOCAL_MACHINE\SOFTWARE\Debug\<module name> is opened (or created if
+   it is not there yet); the logging destination is then set up from it via
+   DbgInitLogTo() and the per-category levels are read via
+   DbgInitKeyLevels(). If the key cannot be opened an error is logged and
+   nothing is changed */
+
+void WINAPI DbgInitModuleSettings(bool fTakeMax)
+{
+    TCHAR szKeyPath[iDEBUGINFO];    // "<base key>\<module name>"
+    HKEY hKeyModule;                // Receives the opened key
+
+    (void)StringCchPrintf(szKeyPath,NUMELMS(szKeyPath), TEXT("%s\\%s"),pBaseKey,m_ModuleName);
+
+    const LONG lStatus = RegCreateKeyEx(HKEY_LOCAL_MACHINE,   // Parent key
+                                        szKeyPath,            // Subkey path
+                                        (DWORD) 0,            // Reserved
+                                        NULL,                 // Class name
+                                        (DWORD) 0,            // Options
+                                        KEY_ALL_ACCESS,       // Access wanted
+                                        NULL,                 // Security
+                                        &hKeyModule,          // Result handle
+                                        NULL);                // Disposition
+
+    if (lStatus == ERROR_SUCCESS) {
+        DbgInitLogTo(hKeyModule);
+        DbgInitKeyLevels(hKeyModule, fTakeMax);
+        RegCloseKey(hKeyModule);
+    } else {
+        DbgLog((LOG_ERROR,0,TEXT("Could not access module key")));
+    }
+}
+
+
+/* Initialise the module file name: m_ModuleName receives the file name of
+   the module identified by m_hInst with the directory part removed (the
+   text after the last backslash, or the whole path if there is none).
+
+   If the path cannot be obtained the name is left empty instead of being
+   derived from an uninitialised buffer */
+
+void WINAPI DbgInitModuleName()
+{
+    TCHAR FullName[iDEBUGINFO] = {0};   // Load the full path and module name
+    TCHAR *pName;                       // Searches from the end for a backslash
+
+    if (GetModuleFileName(m_hInst,FullName,NUMELMS(FullName)) == 0) {
+        // Call failed - the buffer contents are not meaningful.
+        FullName[0] = 0;
+    }
+    // A truncated path is not guaranteed to be terminated on every version
+    // of Windows, so terminate it ourselves before searching it.
+    FullName[NUMELMS(FullName) - 1] = 0;
+
+    pName = _tcsrchr(FullName,TEXT('\\'));
+    if (pName == NULL) {
+        pName = FullName;
+    } else {
+        pName++;
+    }
+    (void)StringCchCopy(m_ModuleName,NUMELMS(m_ModuleName), pName);
+}
+
+// Request/response record handed to MsgBoxThread(): the arguments for one
+// MessageBox() call plus a slot for its result.
+//
+// NOTE(review): the two string members are named the wrong way round
+// relative to how they are used. MsgBoxThread() passes szTitle as the
+// message text and szMessage as the caption of the box.
+struct MsgBoxMsg
+{
+    HWND hwnd;          // owner window passed to MessageBox()
+    TCHAR *szTitle;     // passed as the message box TEXT (see note above)
+    TCHAR *szMessage;   // passed as the message box CAPTION (see note above)
+    DWORD dwFlags;      // MB_* flags passed to MessageBox()
+    INT iResult;        // filled in with the MessageBox() return value
+};
+
+//
+// Thread procedure that shows one message box. MessageBox() is run on a
+// thread of its own because calling it on random threads at bad times can
+// confuse the host (eg IE).
+//
+// lpParameter points at the caller's MsgBoxMsg; the MessageBox() result is
+// stored in its iResult member. The thread exit code is always 0.
+//
+DWORD WINAPI MsgBoxThread(
+  LPVOID lpParameter   // thread data
+  )
+{
+    MsgBoxMsg * const pRequest = static_cast<MsgBoxMsg *>(lpParameter);
+
+    const INT iAnswer = MessageBox(pRequest->hwnd,
+                                   pRequest->szTitle,
+                                   pRequest->szMessage,
+                                   pRequest->dwFlags);
+    pRequest->iResult = iAnswer;
+
+    return 0;
+}
+
+// Shows a message box and returns the MessageBox() result. Normally the box
+// is displayed by a freshly created thread (MsgBoxThread) while this thread
+// waits for it to finish. While g_fDbgInDllEntryPoint is set the box is
+// shown directly on the calling thread instead. If the thread cannot be
+// created IDCANCEL is returned.
+//
+// Note that the arguments are forwarded to MessageBox() in this order, so
+// szTitle ends up as the text of the box and szMessage as its caption.
+INT MessageBoxOtherThread(
+    HWND hwnd,
+    TCHAR *szTitle,
+    TCHAR *szMessage,
+    DWORD dwFlags)
+{
+    if (g_fDbgInDllEntryPoint) {
+        // can't wait on another thread because we have the loader
+        // lock held in the dll entry point.
+        return MessageBox(hwnd, szTitle, szMessage, dwFlags);
+    }
+
+    MsgBoxMsg request = {hwnd, szTitle, szMessage, dwFlags, 0};
+    DWORD dwThreadId;
+    HANDLE hWorker = CreateThread(0,                    // security
+                                  0,                    // stack size
+                                  MsgBoxThread,
+                                  (void *)&request,     // arg
+                                  0,                    // flags
+                                  &dwThreadId);
+    if (hWorker == NULL) {
+        // Callers treat IDCANCEL as "break into the debugger".
+        return IDCANCEL;
+    }
+
+    WaitForSingleObject(hWorker, INFINITE);
+    CloseHandle(hWorker);
+    return request.iResult;
+}
+
+/* Reports a failed assertion. With g_fUseKASSERT set the report goes to
+   DbgKernelAssert(); otherwise a message box shows the condition text, line
+   number and file name and offers three choices:
+     Yes    - ignore the assertion and continue
+     No     - terminate the application
+     Cancel - break into the debugger */
+
+void WINAPI DbgAssert(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine)
+{
+    if (g_fUseKASSERT) {
+        DbgKernelAssert(pCondition, pFileName, iLine);
+        return;
+    }
+
+    TCHAR szReport[iDEBUGINFO];
+
+    (void)StringCchPrintf(szReport, NUMELMS(szReport), TEXT("%s \nAt line %d of %s\nContinue? (Cancel to debug)"),
+             pCondition, iLine, pFileName);
+
+    const INT iChoice = MessageBoxOtherThread(NULL,szReport,TEXT("ASSERT Failed"),
+                                              MB_SYSTEMMODAL |
+                                              MB_ICONHAND |
+                                              MB_YESNOCANCEL |
+                                              MB_SETFOREGROUND);
+
+    if (iChoice == IDNO) {
+        /* Kill the application */
+        FatalAppExit(FALSE, TEXT("Application terminated"));
+    } else if (iChoice == IDCANCEL) {
+        /* Break into the debugger */
+        DebugBreak();
+    }
+    /* IDYES (or anything else): ignore assertion, continue execution */
+}
+
+/* Reports a hard coded break point. With g_fUseKASSERT set the report goes
+   to DbgKernelAssert(); otherwise a message box shows the text, line number
+   and file name and offers three choices:
+     Yes    - ignore the break point and continue
+     No     - terminate the application
+     Cancel - break into the debugger */
+
+void WINAPI DbgBreakPoint(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine)
+{
+    if (g_fUseKASSERT) {
+        DbgKernelAssert(pCondition, pFileName, iLine);
+        return;
+    }
+
+    TCHAR szReport[iDEBUGINFO];
+
+    (void)StringCchPrintf(szReport, NUMELMS(szReport), TEXT("%s \nAt line %d of %s\nContinue? (Cancel to debug)"),
+             pCondition, iLine, pFileName);
+
+    const INT iChoice = MessageBoxOtherThread(NULL,szReport,TEXT("Hard coded break point"),
+                                              MB_SYSTEMMODAL |
+                                              MB_ICONHAND |
+                                              MB_YESNOCANCEL |
+                                              MB_SETFOREGROUND);
+
+    if (iChoice == IDNO) {
+        /* Kill the application */
+        FatalAppExit(FALSE, TEXT("Application terminated"));
+    } else if (iChoice == IDCANCEL) {
+        /* Break into the debugger */
+        DebugBreak();
+    }
+    /* IDYES (or anything else): ignore break point, continue execution */
+}
+
+void WINAPI DbgBreakPoint(const TCHAR *pFileName,INT iLine,const TCHAR* szFormatString,...)
+{
+    // Capacity, in TCHARs, of the formatted break point message. Raise this
+    // constant if a particular message needs more room.
+    const DWORD MAX_BREAK_POINT_MESSAGE_SIZE = 2000;
+
+    TCHAR szBreakPointMessage[MAX_BREAK_POINT_MESSAGE_SIZE];
+
+    // Expand the caller's printf-style arguments into the local buffer.
+    va_list args;
+    va_start(args, szFormatString);
+    const HRESULT hrFormat = StringCchVPrintf(szBreakPointMessage,
+                                              sizeof(szBreakPointMessage) / sizeof(TCHAR),
+                                              szFormatString,
+                                              args);
+    va_end(args);
+
+    // Only a fully formatted message is handed to the three-argument overload.
+    if (S_OK == hrFormat) {
+        ::DbgBreakPoint(szBreakPointMessage, pFileName, iLine);
+        return;
+    }
+
+    // Anything other than S_OK is reported as a break of its own.
+    DbgBreak( "ERROR in DbgBreakPoint().  The variable length debug message could not be displayed because _vsnprintf() failed." );
+}
+
+
+/* When we initialised the library we stored in the m_Levels array the current
+   debug output level for this module for each of the five categories. When
+   some debug logging is sent to us it can be sent with a combination of the
+   categories (if it is applicable to many for example) in which case we map
+   the type's categories into their current debug levels and see if any of
+   them can be accepted. The function looks at each bit position in turn from
+   the input type field and then compares its debug level with the module's.
+
+   A level of 0 means that output is always sent to the debugger.  This is
+   due to producing output if the input level is <= m_Levels.
+*/
+
+
+BOOL WINAPI DbgCheckModuleLevel(DWORD Type,DWORD Level)
+{
+    if (g_fAutoRefreshLevels) {
+        // Reload the settings when more than a second (1000 ms) has passed
+        // since the previous reload. RegNotify() is not used because it is
+        // not available on win9x.
+        static DWORD g_dwLastRefresh = 0;
+        const DWORD dwNow = timeGetTime();
+        if (dwNow - g_dwLastRefresh > 1000) {
+            g_dwLastRefresh = dwNow;
+
+            // Unsynchronised on purpose: several threads may refresh at the
+            // same time, which is harmless here.
+            DbgInitModuleSettings(false);
+        }
+    }
+
+    // A type with none of the valid category bits set is never accepted.
+    if ((Type & ((1<<iMAXLEVELS)-1)) == 0) {
+        return FALSE;
+    }
+
+    // Level zero is unconditional output, so skip the per-category scan.
+    if (Level == 0) {
+        return TRUE;
+    }
+
+    // Accept as soon as one requested category is at or above the level.
+    for (LONG lKeyPos = 0; lKeyPos < iMAXLEVELS; lKeyPos++) {
+        const DWORD dwCategoryBit = (DWORD)0x01 << lKeyPos;
+        if ((Type & dwCategoryBit) == 0) {
+            continue;
+        }
+        if (Level <= (m_Levels[lKeyPos] & ~LOG_FORCIBLY_SET)) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+
+/* Set debug levels to a given value */
+
+void WINAPI DbgSetModuleLevel(DWORD Type, DWORD Level)
+{
+    /* Store the level, tagged with LOG_FORCIBLY_SET, for every category
+       whose bit is present in Type */
+
+    for (LONG lKeyPos = 0; lKeyPos < iMAXLEVELS; lKeyPos++) {
+        const DWORD dwCategoryBit = (DWORD)0x01 << lKeyPos;
+        if ((Type & dwCategoryBit) != 0) {
+            m_Levels[lKeyPos] = Level | LOG_FORCIBLY_SET;
+        }
+    }
+}
+
+/* whether to check registry values periodically. this isn't turned
+   automatically because of the potential performance hit. */
+void WINAPI DbgSetAutoRefreshLevels(bool fAuto)
+{
+    /* Record the flag; DbgCheckModuleLevel consults it on every call */
+    g_fAutoRefreshLevels = fAuto;
+}
+
+#ifdef UNICODE
+//
+// warning -- this function is implemented twice for ansi applications
+// linking to the unicode library
+//
+void WINAPI DbgLogInfo(DWORD Type,DWORD Level,const CHAR *pFormat,...)
+{
+    /* ANSI-format flavour of DbgLogInfo for UNICODE builds. The prefix is
+       built as TCHARs, converted to ANSI so the caller's narrow format can
+       be expanded behind it, and the finished line is converted back to
+       wide characters for DbgOutString */
+
+    /* Check the current level for this type combination */
+
+    BOOL bAccept = DbgCheckModuleLevel(Type,Level);
+    if (bAccept == FALSE) {
+        return;
+    }
+
+    /* Prefix: module name, thread id and time since dwTimeOffset */
+
+    TCHAR szInfo[2000];
+    (void)StringCchCopy(szInfo,NUMELMS(szInfo),m_ModuleName);
+    size_t len = lstrlen(szInfo);
+    (void)StringCchPrintf(szInfo + len,
+             NUMELMS(szInfo) - len,
+             TEXT("(tid %x) %8d : "),
+             GetCurrentThreadId(), timeGetTime() - dwTimeOffset);
+
+    CHAR szInfoA[2000];
+    WideCharToMultiByte(CP_ACP, 0, szInfo, -1, szInfoA, NUMELMS(szInfoA), 0, 0);
+
+    /* Format the variable length parameter list after the prefix */
+
+    va_list va;
+    va_start(va, pFormat);
+    len = lstrlenA(szInfoA);
+    (void)StringCchVPrintfA(szInfoA + len, NUMELMS(szInfoA) - len, pFormat, va);
+    va_end(va);
+
+    /* StringCchCatA expects the TOTAL capacity of the destination buffer,
+       not the space left in it. Passing the remaining space made the call
+       fail, and the line ending vanish, as soon as the text filled more
+       than half of the buffer */
+
+    (void)StringCchCatA(szInfoA, NUMELMS(szInfoA), "\r\n");
+
+    WCHAR wszOutString[2000];
+    MultiByteToWideChar(CP_ACP, 0, szInfoA, -1, wszOutString, NUMELMS(wszOutString));
+    DbgOutString(wszOutString);
+}
+
+
+void WINAPI DbgAssert(const CHAR *pCondition,const CHAR *pFileName,INT iLine)
+{
+    /* Narrow-string overload: kernel-style assertions are reported through
+       DbgKernelAssert rather than through a message box */
+
+    if (g_fUseKASSERT) {
+        DbgKernelAssert(pCondition, pFileName, iLine);
+        return;
+    }
+
+    /* Describe the failed condition and where it was raised */
+
+    TCHAR szMessage[iDEBUGINFO];
+    (void)StringCchPrintf(szMessage, NUMELMS(szMessage),
+                          TEXT("%S \nAt line %d of %S\nContinue? (Cancel to debug)"),
+                          pCondition, iLine, pFileName);
+
+    /* Ask what to do next: Yes continues, No terminates, Cancel breaks */
+
+    const INT iChoice = MessageBoxOtherThread(NULL, szMessage, TEXT("ASSERT Failed"),
+                                              MB_SYSTEMMODAL | MB_ICONHAND |
+                                              MB_YESNOCANCEL | MB_SETFOREGROUND);
+    if (iChoice == IDNO) {
+        /* Kill the application */
+        FatalAppExit(FALSE, TEXT("Application terminated"));
+    } else if (iChoice == IDCANCEL) {
+        /* Break into the debugger */
+        DebugBreak();
+    }
+    /* IDYES, or any other result, ignores the assertion and carries on */
+}
+
+/* Displays a message box at a break point */
+
+void WINAPI DbgBreakPoint(const CHAR *pCondition,const CHAR *pFileName,INT iLine)
+{
+    /* Narrow-string overload: kernel-style reporting bypasses the message
+       box completely */
+
+    if (g_fUseKASSERT) {
+        DbgKernelAssert(pCondition, pFileName, iLine);
+        return;
+    }
+
+    /* Build the text shown to the user: message, line and file */
+
+    TCHAR szMessage[iDEBUGINFO];
+    (void)StringCchPrintf(szMessage, NUMELMS(szMessage),
+                          TEXT("%S \nAt line %d of %S\nContinue? (Cancel to debug)"),
+                          pCondition, iLine, pFileName);
+
+    /* Yes continues, No terminates the process, Cancel breaks */
+
+    const INT iChoice = MessageBoxOtherThread(NULL, szMessage, TEXT("Hard coded break point"),
+                                              MB_SYSTEMMODAL | MB_ICONHAND |
+                                              MB_YESNOCANCEL | MB_SETFOREGROUND);
+    if (iChoice == IDNO) {
+        /* Kill the application */
+        FatalAppExit(FALSE, TEXT("Application terminated"));
+    } else if (iChoice == IDCANCEL) {
+        /* Break into the debugger */
+        DebugBreak();
+    }
+    /* IDYES, or any other result, ignores the break point and carries on */
+}
+
+void WINAPI DbgKernelAssert(const CHAR *pCondition,const CHAR *pFileName,INT iLine)
+{
+    /* Narrow-string overload: write the failure to the debug log as a
+       level 0 LOG_ERROR entry, then break into the debugger */
+
+    DbgLog((LOG_ERROR, 0,
+            TEXT("Assertion FAILED (%hs) at line %d in file %hs"),
+            pCondition, iLine, pFileName));
+
+    DebugBreak();
+}
+
+#endif
+
+/* Print a formatted string to the debugger prefixed with this module's name
+   Because the COMBASE classes are linked statically every module loaded will
+   have their own copy of this code. It therefore helps if the module name is
+   included on the output so that the offending code can be easily found */
+
+//
+// warning -- this function is implemented twice for ansi applications
+// linking to the unicode library
+//
+void WINAPI DbgLogInfo(DWORD Type,DWORD Level,const TCHAR *pFormat,...)
+{
+    /* Nothing is formatted unless this type/level combination is enabled */
+
+    if (DbgCheckModuleLevel(Type,Level) == FALSE) {
+        return;
+    }
+
+    /* Start the line with the module name */
+
+    TCHAR szInfo[2000];
+    (void)StringCchCopy(szInfo, NUMELMS(szInfo), m_ModuleName);
+
+    /* Then the thread id and the time elapsed since dwTimeOffset */
+
+    size_t cchUsed = lstrlen(szInfo);
+    (void)StringCchPrintf(szInfo + cchUsed,
+                          NUMELMS(szInfo) - cchUsed,
+                          TEXT("(tid %x) %8d : "),
+                          GetCurrentThreadId(), timeGetTime() - dwTimeOffset);
+
+    /* Then the caller's message, expanded from the variable arguments */
+
+    cchUsed = lstrlen(szInfo);
+
+    va_list args;
+    va_start(args, pFormat);
+    (void)StringCchVPrintf(szInfo + cchUsed, NUMELMS(szInfo) - cchUsed, pFormat, args);
+    va_end(args);
+
+    /* Terminate the line and send it to the debug output */
+
+    (void)StringCchCat(szInfo, NUMELMS(szInfo), TEXT("\r\n"));
+    DbgOutString(szInfo);
+}
+
+
+/* If we are executing as a pure kernel filter we cannot display message
+   boxes to the user, this provides an alternative which puts the error
+   condition on the debugger output with a suitable eye catching message */
+
+void WINAPI DbgKernelAssert(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine)
+{
+    /* Write the failure to the debug log as a level 0 LOG_ERROR entry,
+       then break into the debugger */
+
+    DbgLog((LOG_ERROR, 0,
+            TEXT("Assertion FAILED (%s) at line %d in file %s"),
+            pCondition, iLine, pFileName));
+
+    DebugBreak();
+}
+
+
+
+/* Each time we create an object derived from CBaseObject the constructor will
+   call us to register the creation of the new object. We are passed a string
+   description which we store away. We return a cookie that the constructor
+   uses to identify the object when it is destroyed later on. We update the
+   total number of active objects in the DLL mainly for debugging purposes */
+
+DWORD WINAPI DbgRegisterObjectCreation(const CHAR *szObjectName,
+                                       const WCHAR *wszObjectName)
+{
+    /* Exactly one of the two names is expected. If this fires you have a
+       mixed DEBUG/RETAIL build */
+
+    ASSERT(!!szObjectName ^ !!wszObjectName);
+
+    /* Allocate the list node that will describe this object */
+
+    ObjectDesc *pObject = new ObjectDesc;
+    ASSERT(pObject);
+    if (!pObject) {
+        return FALSE;
+    }
+
+    /* Objects with global scope are constructed by the C++ run time before
+       WinMain is called, so the library may not have been initialised yet */
+
+    if (m_bInit == FALSE) {
+        DbgInitialise(GetModuleHandle(NULL));
+    }
+
+    /* Everything below touches the shared list, so take its lock */
+
+    EnterCriticalSection(&m_CSDebug);
+
+    /* An object registered with no name at all is listed as UNKNOWN */
+
+    if (szObjectName == NULL && wszObjectName == NULL) {
+        szObjectName = pUnknownName;
+    }
+
+    /* Fill the node in and push it onto the head of the list */
+
+    const DWORD ObjectCookie = ++m_dwNextCookie;
+
+    pObject->m_szName = szObjectName;
+    pObject->m_wszName = wszObjectName;
+    pObject->m_dwCookie = ObjectCookie;
+    pObject->m_pNext = pListHead;
+    pListHead = pObject;
+
+    ++m_dwObjectCount;
+
+    ASSERT(ObjectCookie);
+
+    /* Log the creation using whichever flavour of name was supplied */
+
+    if (wszObjectName != NULL) {
+        DbgLog((LOG_MEMORY,2,TEXT("Object created   %d (%ls) %d Active"),
+                ObjectCookie, wszObjectName, m_dwObjectCount));
+    } else {
+        DbgLog((LOG_MEMORY,2,TEXT("Object created   %d (%hs) %d Active"),
+                ObjectCookie, szObjectName, m_dwObjectCount));
+    }
+
+    LeaveCriticalSection(&m_CSDebug);
+    return ObjectCookie;
+}
+
+
+/* This is called by the CBaseObject destructor when an object is about to be
+   destroyed, we are passed the cookie we returned during construction that
+   identifies this object. We scan the object list for a matching cookie and
+   remove the object if successful. We also update the active object count */
+
+BOOL WINAPI DbgRegisterObjectDestruction(DWORD dwCookie)
+{
+    /* The list is shared, so take its lock for the whole operation */
+
+    EnterCriticalSection(&m_CSDebug);
+
+    /* Follow the chain of links (the head pointer first, then each node's
+       m_pNext) until one of them points at the node owning the cookie */
+
+    ObjectDesc **ppLink = &pListHead;
+    while (*ppLink != NULL && (*ppLink)->m_dwCookie != dwCookie) {
+        ppLink = &(*ppLink)->m_pNext;
+    }
+
+    ObjectDesc *pFound = *ppLink;
+    if (pFound == NULL) {
+        DbgBreak("Apparently destroying a bogus object");
+        LeaveCriticalSection(&m_CSDebug);
+        return FALSE;
+    }
+
+    /* Unlink the node; this covers both the head and the interior case */
+
+    *ppLink = pFound->m_pNext;
+
+    /* Update the housekeeping information and log the destruction */
+
+    --m_dwObjectCount;
+
+    if (pFound->m_wszName != NULL) {
+        DbgLog((LOG_MEMORY,2,TEXT("Object destroyed %d (%ls) %d Active"),
+                pFound->m_dwCookie, pFound->m_wszName, m_dwObjectCount));
+    } else {
+        DbgLog((LOG_MEMORY,2,TEXT("Object destroyed %d (%hs) %d Active"),
+                pFound->m_dwCookie, pFound->m_szName, m_dwObjectCount));
+    }
+
+    delete pFound;
+    LeaveCriticalSection(&m_CSDebug);
+    return TRUE;
+}
+
+
+/* This runs through the active object list displaying their details */
+
+void WINAPI DbgDumpObjectRegister()
+{
+    /* Writes one LOG_MEMORY line per registered object (cookie, address of
+       its descriptor, name) followed by the total object count */
+
+    TCHAR szInfo[iDEBUGINFO];
+
+    /* Grab the list critical section */
+
+    EnterCriticalSection(&m_CSDebug);
+
+    /* Scan the object list displaying the name and cookie */
+
+    DbgLog((LOG_MEMORY,2,TEXT("")));
+    DbgLog((LOG_MEMORY,2,TEXT("   ID             Object Description")));
+    DbgLog((LOG_MEMORY,2,TEXT("")));
+
+    for (ObjectDesc *pObject = pListHead; pObject; pObject = pObject->m_pNext) {
+
+        /* Print the descriptor pointer itself. The previous code passed
+           &pObject, the address of the local variable, which is the same
+           on every row. %ls and %hs select wide/narrow strings whatever
+           TCHAR is, as the other logging calls in this file already do */
+
+        if (pObject->m_wszName) {
+            (void)StringCchPrintf(szInfo, NUMELMS(szInfo), TEXT("%5d (%p) %30ls"),
+                                  pObject->m_dwCookie, pObject, pObject->m_wszName);
+        } else {
+            (void)StringCchPrintf(szInfo, NUMELMS(szInfo), TEXT("%5d (%p) %30hs"),
+                                  pObject->m_dwCookie, pObject, pObject->m_szName);
+        }
+
+        /* Log the finished text as data, never as the format string, so a
+           '%' inside an object name cannot be interpreted a second time */
+
+        DbgLog((LOG_MEMORY,2,TEXT("%s"),szInfo));
+    }
+
+    (void)StringCchPrintf(szInfo,NUMELMS(szInfo),TEXT("Total object count %5d"),m_dwObjectCount);
+    DbgLog((LOG_MEMORY,2,TEXT("")));
+    DbgLog((LOG_MEMORY,1,TEXT("%s"),szInfo));
+    LeaveCriticalSection(&m_CSDebug);
+}
+
+/*  Debug infinite wait stuff */
+DWORD WINAPI DbgWaitForSingleObject(HANDLE h)
+{
+    /* Wait in slices of dwWaitTimeout. Every slice that does not end with
+       WAIT_OBJECT_0 raises an assertion; a timed out slice is retried */
+
+    DWORD dwWaitResult;
+    for (;;) {
+        dwWaitResult = WaitForSingleObject(h, dwWaitTimeout);
+        ASSERT(dwWaitResult == WAIT_OBJECT_0);
+        if (dwWaitResult != WAIT_TIMEOUT) {
+            break;
+        }
+    }
+    return dwWaitResult;
+}
+DWORD WINAPI DbgWaitForMultipleObjects(DWORD nCount,
+                                CONST HANDLE *lpHandles,
+                                BOOL bWaitAll)
+{
+    /* Wait in slices of dwWaitTimeout. Every slice whose result is not one
+       of the WAIT_OBJECT_0 based codes raises an assertion; a timed out
+       slice is retried */
+
+    DWORD dwWaitResult;
+    for (;;) {
+        dwWaitResult = WaitForMultipleObjects(nCount, lpHandles, bWaitAll, dwWaitTimeout);
+        ASSERT((DWORD)(dwWaitResult - WAIT_OBJECT_0) < MAXIMUM_WAIT_OBJECTS);
+        if (dwWaitResult != WAIT_TIMEOUT) {
+            break;
+        }
+    }
+    return dwWaitResult;
+}
+
+void WINAPI DbgSetWaitTimeout(DWORD dwTimeout)
+{
+    /* Length of each wait slice used by the DbgWaitFor... helpers */
+    dwWaitTimeout = dwTimeout;
+}
+
+#endif /* DEBUG */
+
+#ifdef _OBJBASE_H_
+
+    /*  Stuff for printing out our GUID names */
+
+    // Table pairing the spelling of each GUID identifier with its value.
+    // OUR_GUID_ENTRY is defined so that one invocation expands to a single
+    // { "name", { GUID fields } } initialiser; uuids.h is then included
+    // inside the array braces so that its entries become the table rows.
+    // NOTE(review): assumes uuids.h consists of OUR_GUID_ENTRY invocations.
+    GUID_STRING_ENTRY g_GuidNames[] = {
+    #define OUR_GUID_ENTRY(name, l, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \
+    { #name, { l, w1, w2, { b1, b2,  b3,  b4,  b5,  b6,  b7,  b8 } } },
+        #include <uuids.h>
+    };
+
+    // Global lookup object; its operator [] searches g_GuidNames.
+    CGuidNameList GuidNames;
+    // Number of rows in g_GuidNames.
+    int g_cGuidNames = sizeof(g_GuidNames) / sizeof(g_GuidNames[0]);
+
+    char *CGuidNameList::operator [] (const GUID &guid)
+    {
+        // Linear search of the name table for an exact GUID match.
+        int iEntry = 0;
+        while (iEntry < g_cGuidNames) {
+            if (g_GuidNames[iEntry].guid == guid) {
+                return g_GuidNames[iEntry].szName;
+            }
+            ++iEntry;
+        }
+
+        // GUID_NULL is reported by name even if the table has no row for it.
+        if (guid == GUID_NULL) {
+            return "GUID_NULL";
+        }
+
+        // !!! add something to print FOURCC guids?
+
+        // shouldn't this print the hex CLSID?
+        return "Unknown GUID Name";
+    }
+
+#endif /* _OBJBASE_H_ */
+
+/*  CDisp class - display our data types */
+
+// clashes with REFERENCE_TIME
+CDisp::CDisp(LONGLONG ll, int Format)
+{
+    // Formats a 64-bit integer into m_String, in decimal for CDISP_DEC and
+    // in hexadecimal for CDISP_HEX or any other Format value.
+    //
+    // note: this could be combined with CDisp(LONGLONG) by
+    // introducing a default format of CDISP_REFTIME
+    switch (Format) {
+    case CDISP_DEC:
+    {
+        // Room for a sign, the 19 digits of the largest magnitude and the
+        // terminator, with a little slack.
+        TCHAR temp[24];
+        int pos = (int)(sizeof(temp) / sizeof(temp[0]));
+        temp[--pos] = 0;
+
+        // Work on the magnitude as an unsigned value so that the most
+        // negative LONGLONG is representable too.
+        const bool fNegative = (ll < 0);
+        ULONGLONG ullValue = fNegative ? ((ULONGLONG)0 - (ULONGLONG)ll)
+                                       : (ULONGLONG)ll;
+
+        // Always output at least one digit. The digit has to come from the
+        // full 64-bit value: taking it from the low 32 bits only is wrong
+        // for anything at or above 2^32, because 2^32 is not a multiple
+        // of 10.
+        do {
+            temp[--pos] = (TCHAR)(TEXT('0') + (int)(ullValue % 10));
+            ullValue /= 10;
+        } while (ullValue);
+
+        if (fNegative) {
+            temp[--pos] = TEXT('-');
+        }
+
+        (void)StringCchPrintf(m_String, NUMELMS(m_String), TEXT("%s"), temp+pos);
+        break;
+    }
+    case CDISP_HEX:
+    default:
+    {
+        LARGE_INTEGER li;
+        li.QuadPart = ll;
+        (void)StringCchPrintf(m_String, NUMELMS(m_String), TEXT("0x%X%8.8X"), li.HighPart, li.LowPart);
+        break;
+    }
+    }
+}
+
+CDisp::CDisp(REFCLSID clsid)
+{
+    // StringFromGUID2 always produces a wide string, so the format that
+    // copies it into the TCHAR member depends on the build flavour.
+    #ifdef UNICODE
+    LPCTSTR FORMAT_STRING = TEXT("%s");
+    #else
+    LPCTSTR FORMAT_STRING = TEXT("%S");
+    #endif
+
+    WCHAR strClass[CHARS_IN_GUID+1];
+    StringFromGUID2(clsid, strClass, sizeof(strClass) / sizeof(strClass[0]));
+
+    // The fixed member buffer must be able to hold a complete GUID string.
+    ASSERT(sizeof(m_String)/sizeof(m_String[0]) >= CHARS_IN_GUID+1);
+
+    (void)StringCchPrintf(m_String, NUMELMS(m_String), FORMAT_STRING, strClass);
+}
+
+#ifdef __STREAMS__
+/*  Display stuff */
+CDisp::CDisp(CRefTime llTime)
+{
+    // Renders the time into m_String as "[-][D days ][H hrs ][M mins ]S.mmm sec".
+    // pszCursor/cchLeft track the unwritten tail of the buffer.
+    LPTSTR pszCursor = m_String;
+    size_t cchLeft = NUMELMS(m_String);
+
+    // Negative times are shown as a minus sign followed by the magnitude.
+    if (llTime < 0) {
+        llTime = -llTime;
+        (void)StringCchPrintf(pszCursor, cchLeft, TEXT("-"));
+        const size_t cchWritten = lstrlen(pszCursor);
+        pszCursor += cchWritten;
+        cchLeft -= cchWritten;
+    }
+
+    // Larger units first; each row is the unit's length in 100ns ticks and
+    // the text used when the time contains at least one whole unit.
+    const struct {
+        LONGLONG llUnit;
+        LPCTSTR  pszFormat;
+    } aUnits[] = {
+        { (LONGLONG)24 * 3600 * 10000000, TEXT("%d days ") },
+        { (LONGLONG)3600 * 10000000,      TEXT("%d hrs ")  },
+        { (LONGLONG)60 * 10000000,        TEXT("%d mins ") },
+    };
+
+    for (size_t iUnit = 0; iUnit < sizeof(aUnits) / sizeof(aUnits[0]); iUnit++) {
+        LONGLONG llDiv = aUnits[iUnit].llUnit;
+        if (llTime >= llDiv) {
+            (void)StringCchPrintf(pszCursor, cchLeft, aUnits[iUnit].pszFormat, (LONG)(llTime / llDiv));
+            const size_t cchWritten = lstrlen(pszCursor);
+            pszCursor += cchWritten;
+            cchLeft -= cchWritten;
+            llTime = llTime % llDiv;
+        }
+    }
+
+    // What is left is under a minute: whole seconds and milliseconds.
+    (void)StringCchPrintf(pszCursor, cchLeft, TEXT("%d.%3.3d sec"),
+             (LONG)llTime / 10000000,
+             (LONG)((llTime % 10000000) / 10000));
+}
+
+#endif // __STREAMS__
+
+
+/*  Display pin */
+CDisp::CDisp(IPin *pPin)
+{
+    // Builds "<filter class name>(<pin name>)" in a heap buffer owned by
+    // m_pString. A NULL pin, a failing QueryPinInfo and a pin without an
+    // owning filter are all handled; the class then shows as GUID_NULL.
+    TCHAR str[MAX_PIN_NAME];
+
+    // Must start out defined: it is looked up below on every path.
+    CLSID clsid = GUID_NULL;
+
+    if (!pPin) {
+       (void)StringCchCopy(str, NUMELMS(str), TEXT("NULL IPin"));
+    } else {
+       PIN_INFO pi;
+       if (FAILED(pPin->QueryPinInfo(&pi))) {
+          // pi holds nothing usable, so there is no name and no filter.
+          (void)StringCchCopy(str, NUMELMS(str), TEXT("Unknown IPin"));
+       } else {
+          // The filter pointer is checked before use rather than assumed.
+          if (pi.pFilter == NULL || FAILED(pi.pFilter->GetClassID(&clsid))) {
+             clsid = GUID_NULL;
+          }
+          QueryPinInfoReleaseFilter(pi);
+         #ifndef UNICODE
+          // On conversion failure fall back to an empty, terminated name.
+          if (0 == WideCharToMultiByte(GetACP(), 0, pi.achName, lstrlenW(pi.achName) + 1,
+                                       str, MAX_PIN_NAME, NULL, NULL)) {
+             str[0] = 0;
+          }
+         #else
+          (void)StringCchCopy(str, NUMELMS(str), pi.achName);
+         #endif
+       }
+    }
+
+    // Pin name plus 64 characters for the class name, brackets and the
+    // terminator; StringCchPrintf truncates if that is ever not enough.
+    size_t len = lstrlen(str)+64;
+    m_pString = (TCHAR*) new TCHAR[len];
+    if (!m_pString) {
+        return;
+    }
+
+    // GuidNames[] yields a narrow string, hence %S in UNICODE builds.
+    #ifdef UNICODE
+    LPCTSTR FORMAT_STRING = TEXT("%S(%s)");
+    #else
+    LPCTSTR FORMAT_STRING = TEXT("%s(%s)");
+    #endif
+
+    (void)StringCchPrintf(m_pString, len, FORMAT_STRING, GuidNames[clsid], str);
+}
+
+/*  Display filter or pin */
+CDisp::CDisp(IUnknown *pUnk)
+{
+    // Describes an object that is either a filter (its name) or a pin (the
+    // text produced by CDisp(IPin *)). For anything else, including NULL,
+    // m_pString is left exactly as the base class set it up.
+    if (!pUnk) {
+        return;
+    }
+
+    IBaseFilter *pf = NULL;
+    HRESULT hr = pUnk->QueryInterface(IID_IBaseFilter, (void **)&pf);
+    if(SUCCEEDED(hr))
+    {
+        FILTER_INFO fi;
+        hr = pf->QueryFilterInfo(&fi);
+        if(SUCCEEDED(hr))
+        {
+            QueryFilterInfoReleaseGraph(fi);
+
+            size_t len = lstrlenW(fi.achName)  + 1;
+            TCHAR *pszName = new TCHAR[len];
+            if(pszName)
+            {
+                // achName is always wide; choose the matching specifier.
+                #ifdef UNICODE
+                LPCTSTR FORMAT_STRING = TEXT("%s");
+                #else
+                LPCTSTR FORMAT_STRING = TEXT("%S");
+                #endif
+
+                (void)StringCchPrintf(pszName, len, FORMAT_STRING, fi.achName);
+                m_pString = pszName;
+            }
+        }
+
+        pf->Release();
+
+        return;
+    }
+
+    IPin *pp = NULL;
+    hr = pUnk->QueryInterface(IID_IPin, (void **)&pp);
+    if(SUCCEEDED(hr))
+    {
+        // The statement "CDisp::CDisp(pp);" does not initialise *this*
+        // object: at best it builds and discards a temporary, so the pin
+        // description never reached m_pString. Build the description in a
+        // local object and keep a private copy of its text instead.
+        CDisp pinDisp(pp);
+        pp->Release();
+
+        if (pinDisp.m_pString) {
+            size_t len = lstrlen(pinDisp.m_pString) + 1;
+            TCHAR *pszCopy = new TCHAR[len];
+            if (pszCopy) {
+                (void)StringCchCopy(pszCopy, len, pinDisp.m_pString);
+                m_pString = pszCopy;
+            }
+        }
+        return;
+    }
+}
+
+
+CDisp::~CDisp()
+{
+    // Nothing to release here; the string buffer is freed by ~CDispBasic.
+}
+
+CDispBasic::~CDispBasic()
+{
+    // The embedded m_String array needs no cleanup; only a separately
+    // allocated buffer is released.
+    if (m_pString == m_String) {
+        return;
+    }
+    delete [] m_pString;
+}
+
+CDisp::CDisp(double d)
+{
+    // Formats a double into m_String. DEBUG builds print it with %.16g;
+    // other builds print the whole part and three decimals using integer
+    // formatting only.
+#ifdef DEBUG
+    (void)StringCchPrintf(m_String, NUMELMS(m_String), TEXT("%.16g"), d);
+#else
+    // Split off the sign first. Formatting the signed parts separately
+    // printed negative values as e.g. "-1.-500", and as "0.-500" when the
+    // whole part was zero.
+    const bool fNegative = (d < 0);
+    const double dMagnitude = fNegative ? -d : d;
+    const int iWhole = (int) dMagnitude;
+    const int iThousandths = (int) ((dMagnitude - iWhole) * 1000);
+
+    (void)StringCchPrintf(m_String, NUMELMS(m_String), TEXT("%s%d.%03d"),
+                          fNegative ? TEXT("-") : TEXT(""), iWhole, iThousandths);
+#endif
+}
+
+
+/* If built for debug this will display the media type details. We convert the
+   major and subtypes into strings and also ask the base classes for a string
+   description of the subtype, so MEDIASUBTYPE_RGB565 becomes RGB 565 16 bit
+   We also display the fields in the BITMAPINFOHEADER structure, this should
+   succeed as we do not accept input types unless the format is big enough */
+
+#ifdef DEBUG
+void WINAPI DisplayType(LPTSTR label, const AM_MEDIA_TYPE *pmtIn, DWORD dwLevel)
+{
+    /* Logs a description of a media type at LOG_TRACE / dwLevel: the major
+       type and subtype names, the generic flags, then the format block for
+       video info and audio types. The format block is only read after
+       checking that it is present and large enough */
+
+    /* Dump the GUID types and a short description */
+
+    DbgLog((LOG_TRACE,dwLevel,TEXT("")));
+    DbgLog((LOG_TRACE,dwLevel,TEXT("%s  M type %hs  S type %hs"), label,
+            GuidNames[pmtIn->majortype],
+            GuidNames[pmtIn->subtype]));
+    DbgLog((LOG_TRACE,dwLevel,TEXT("Subtype description %s"),GetSubtypeName(&pmtIn->subtype)));
+
+    /* Dump the generic media types */
+
+    if (pmtIn->bTemporalCompression) {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Temporally compressed")));
+    } else {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Not temporally compressed")));
+    }
+
+    if (pmtIn->bFixedSizeSamples) {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Sample size %d"),pmtIn->lSampleSize));
+    } else {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Variable size samples")));
+    }
+
+    if (pmtIn->formattype == FORMAT_VideoInfo) {
+
+        /* The type may come from any filter (DumpGraph passes connection
+           types straight in), so never read past a short or missing block */
+
+        if (pmtIn->pbFormat == NULL || pmtIn->cbFormat < sizeof(VIDEOINFOHEADER)) {
+            DbgLog((LOG_TRACE,dwLevel,TEXT("Video format block missing or too small (%d bytes)"),
+                    pmtIn->cbFormat));
+            return;
+        }
+
+        /* Dump the contents of the BITMAPINFOHEADER structure */
+        BITMAPINFOHEADER *pbmi = HEADER(pmtIn->pbFormat);
+        VIDEOINFOHEADER *pVideoInfo = (VIDEOINFOHEADER *)pmtIn->pbFormat;
+
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Source rectangle (Left %d Top %d Right %d Bottom %d)"),
+               pVideoInfo->rcSource.left,
+               pVideoInfo->rcSource.top,
+               pVideoInfo->rcSource.right,
+               pVideoInfo->rcSource.bottom));
+
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Target rectangle (Left %d Top %d Right %d Bottom %d)"),
+               pVideoInfo->rcTarget.left,
+               pVideoInfo->rcTarget.top,
+               pVideoInfo->rcTarget.right,
+               pVideoInfo->rcTarget.bottom));
+
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Size of BITMAPINFO structure %d"),pbmi->biSize));
+
+        /* Values below 256 are printed as a number, anything else as the
+           four characters stored in biCompression */
+        if (pbmi->biCompression < 256) {
+            DbgLog((LOG_TRACE,dwLevel,TEXT("%dx%dx%d bit  (%d)"),
+                    pbmi->biWidth, pbmi->biHeight,
+                    pbmi->biBitCount, pbmi->biCompression));
+        } else {
+            DbgLog((LOG_TRACE,dwLevel,TEXT("%dx%dx%d bit '%4.4hs'"),
+                    pbmi->biWidth, pbmi->biHeight,
+                    pbmi->biBitCount, &pbmi->biCompression));
+        }
+
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Image size %d"),pbmi->biSizeImage));
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Planes %d"),pbmi->biPlanes));
+        DbgLog((LOG_TRACE,dwLevel,TEXT("X Pels per metre %d"),pbmi->biXPelsPerMeter));
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Y Pels per metre %d"),pbmi->biYPelsPerMeter));
+        DbgLog((LOG_TRACE,dwLevel,TEXT("Colours used %d"),pbmi->biClrUsed));
+
+    } else if (pmtIn->majortype == MEDIATYPE_Audio) {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("     Format type %hs"),
+            GuidNames[pmtIn->formattype]));
+        DbgLog((LOG_TRACE,dwLevel,TEXT("     Subtype %hs"),
+            GuidNames[pmtIn->subtype]));
+
+        if ((pmtIn->subtype != MEDIASUBTYPE_MPEG1Packet)
+          && (pmtIn->pbFormat != NULL)
+          && (pmtIn->cbFormat >= sizeof(PCMWAVEFORMAT)))
+        {
+            /* Dump the contents of the WAVEFORMATEX type-specific format structure */
+
+            WAVEFORMATEX *pwfx = (WAVEFORMATEX *) pmtIn->pbFormat;
+            DbgLog((LOG_TRACE,dwLevel,TEXT("wFormatTag %u"), pwfx->wFormatTag));
+            DbgLog((LOG_TRACE,dwLevel,TEXT("nChannels %u"), pwfx->nChannels));
+            DbgLog((LOG_TRACE,dwLevel,TEXT("nSamplesPerSec %lu"), pwfx->nSamplesPerSec));
+            DbgLog((LOG_TRACE,dwLevel,TEXT("nAvgBytesPerSec %lu"), pwfx->nAvgBytesPerSec));
+            DbgLog((LOG_TRACE,dwLevel,TEXT("nBlockAlign %u"), pwfx->nBlockAlign));
+            DbgLog((LOG_TRACE,dwLevel,TEXT("wBitsPerSample %u"), pwfx->wBitsPerSample));
+
+            /* PCM uses a WAVEFORMAT and does not have the extra size field */
+
+            if (pmtIn->cbFormat >= sizeof(WAVEFORMATEX)) {
+                DbgLog((LOG_TRACE,dwLevel,TEXT("cbSize %u"), pwfx->cbSize));
+            }
+        }
+
+    } else {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("     Format type %hs"),
+            GuidNames[pmtIn->formattype]));
+        // !!!! should add code to dump wave format, others
+    }
+}
+
+
+void WINAPI DumpGraph(IFilterGraph *pGraph, DWORD dwLevel)
+{
+    /* Logs every filter in the graph and, for each filter, every pin with
+       its direction and the pin it is connected to. For connected output
+       pins the connection media type is dumped too. A NULL graph is
+       ignored */
+
+    if( !pGraph )
+    {
+        return;
+    }
+
+    DbgLog((LOG_TRACE,dwLevel,TEXT("DumpGraph [%x]"), pGraph));
+
+    /* Without an enumerator there is nothing to walk. The previous code
+       only logged the failure and then used the uninitialised pointer */
+
+    IEnumFilters *pFilters = NULL;
+    if (FAILED(pGraph->EnumFilters(&pFilters)) || pFilters == NULL) {
+        DbgLog((LOG_TRACE,dwLevel,TEXT("EnumFilters failed!")));
+        return;
+    }
+
+    IBaseFilter *pFilter;
+    ULONG       n;
+    while (pFilters->Next(1, &pFilter, &n) == S_OK) {
+        FILTER_INFO filterInfo;
+
+        if (FAILED(pFilter->QueryFilterInfo(&filterInfo))) {
+            DbgLog((LOG_TRACE,dwLevel,TEXT("    Filter [%x]  -- failed QueryFilterInfo"), pFilter));
+        } else {
+            QueryFilterInfoReleaseGraph(filterInfo);
+
+            // !!! should QueryVendorInfo here!
+
+            DbgLog((LOG_TRACE,dwLevel,TEXT("    Filter [%x]  '%ls'"), pFilter, filterInfo.achName));
+
+            IEnumPins *pins;
+
+            if (FAILED(pFilter->EnumPins(&pins))) {
+                DbgLog((LOG_TRACE,dwLevel,TEXT("EnumPins failed!")));
+            } else {
+
+                IPin *pPin;
+                while (pins->Next(1, &pPin, &n) == S_OK) {
+
+                    /* Named separately from filterInfo; the two used to
+                       share one name, the inner hiding the outer */
+                    PIN_INFO pinInfo;
+
+                    if (FAILED(pPin->QueryPinInfo(&pinInfo))) {
+                        DbgLog((LOG_TRACE,dwLevel,TEXT("          Pin [%x]  -- failed QueryPinInfo"), pPin));
+                    } else {
+                        QueryPinInfoReleaseFilter(pinInfo);
+
+                        IPin *pPinConnected = NULL;
+
+                        HRESULT hr = pPin->ConnectedTo(&pPinConnected);
+
+                        if (pPinConnected) {
+                            DbgLog((LOG_TRACE,dwLevel,TEXT("          Pin [%x]  '%ls' [%sput]")
+                                                           TEXT("  Connected to pin [%x]"),
+                                    pPin, pinInfo.achName,
+                                    pinInfo.dir == PINDIR_INPUT ? TEXT("In") : TEXT("Out"),
+                                    pPinConnected));
+
+                            pPinConnected->Release();
+
+                            // perhaps we should really dump the type both ways as a sanity
+                            // check?
+                            if (pinInfo.dir == PINDIR_OUTPUT) {
+                                AM_MEDIA_TYPE mt;
+
+                                hr = pPin->ConnectionMediaType(&mt);
+
+                                if (SUCCEEDED(hr)) {
+                                    DisplayType(TEXT("Connection type"), &mt);
+
+                                    FreeMediaType(mt);
+                                }
+                            }
+                        } else {
+                            DbgLog((LOG_TRACE,dwLevel,
+                                    TEXT("          Pin [%x]  '%ls' [%sput]"),
+                                    pPin, pinInfo.achName,
+                                    pinInfo.dir == PINDIR_INPUT ? TEXT("In") : TEXT("Out")));
+
+                        }
+                    }
+
+                    pPin->Release();
+
+                }
+
+                pins->Release();
+            }
+
+        }
+
+        pFilter->Release();
+    }
+
+    pFilters->Release();
+
+}
+
+#endif
+
diff --git a/plugins/GSdx_legacy/baseclasses/wxdebug.h b/plugins/GSdx_legacy/baseclasses/wxdebug.h
new file mode 100644
index 0000000000..02d11b9028
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxdebug.h
@@ -0,0 +1,393 @@
+//------------------------------------------------------------------------------
+// File: WXDebug.h
+//
+// Desc: DirectShow base classes - provides debugging facilities.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __WXDEBUG__
+#define __WXDEBUG__
+
+// This library provides fairly straight forward debugging functionality, this
+// is split into two main sections. The first is assertion handling, there are
+// three types of assertions provided here. The most commonly used one is the
+// ASSERT(condition) macro which will pop up a message box including the file
+// and line number if the condition evaluates to FALSE. Then there is the
+// EXECUTE_ASSERT macro which is the same as ASSERT except the condition will
+// still be executed in NON debug builds. The final type of assertion is the
+// KASSERT macro which is more suitable for pure (perhaps kernel) filters as
+// the condition is printed onto the debugger rather than in a message box.
+//
+// The other part of the debug module facilities is general purpose logging.
+// This is accessed by calling DbgLog(). The function takes a type and level
+// field which define the type of informational string you are presenting and
+// its relative importance. The type field can be a combination (one or more)
+// of LOG_TIMING, LOG_TRACE, LOG_MEMORY, LOG_LOCKING and LOG_ERROR. The level
+// is a DWORD value where zero defines highest importance. Use of zero as the
+// debug logging level is to be encouraged ONLY for major errors or events as
+// they will ALWAYS be displayed on the debugger. Other debug output has its
+// level matched against the current debug output level stored in the registry
+// for this module and if less than the current setting it will be displayed.
+//
+// Each module or executable has its own debug output level for each of the
+// five types. These are read in when the DbgInitialise function is called.
+// For DLLs linking to STRMBASE.LIB this is done automatically when the DLL
+// is loaded; executables must call it explicitly with the module instance
+// handle given to them through the WINMAIN entry point. An executable must
+// also call DbgTerminate when they have finished to clean up the resources
+// the debug library uses; once again this is done automatically for DLLs.
+
+// These are the categories of logging information: five standard ones plus
+// five LOG_CUSTOMn values. Each value is a distinct bit so types can be OR'd.
+enum {  LOG_TIMING = 0x01,    // Timing and performance measurements
+        LOG_TRACE = 0x02,     // General step point call tracing
+        LOG_MEMORY =  0x04,   // Memory and object allocation/destruction
+        LOG_LOCKING = 0x08,   // Locking/unlocking of critical sections
+        LOG_ERROR = 0x10,     // Debug error notification
+        LOG_CUSTOM1 = 0x20,   // Custom category 1 (meaning not defined in this header)
+        LOG_CUSTOM2 = 0x40,   // Custom category 2 (meaning not defined in this header)
+        LOG_CUSTOM3 = 0x80,   // Custom category 3 (meaning not defined in this header)
+        LOG_CUSTOM4 = 0x100,  // Custom category 4 (meaning not defined in this header)
+        LOG_CUSTOM5 = 0x200,  // Custom category 5 (meaning not defined in this header)
+};
+
+#define LOG_FORCIBLY_SET 0x80000000   // bit 31; NOTE(review): presumably marks a level that was set explicitly - confirm in wxdebug.cpp
+
+enum {  CDISP_HEX = 0x01,    // CDisp(LONGLONG, Format): display the value in hexadecimal form
+        CDISP_DEC = 0x02};   // CDisp(LONGLONG, Format): display the value in decimal form
+
+// For each object created derived from CBaseObject (in debug builds) we
+// create a descriptor that holds its name (statically allocated memory)
+// and a cookie we assign it. We keep a list of all the active objects
+// we have registered so that we can dump a list of remaining objects
+
+typedef struct tag_ObjectDesc {
+    const CHAR *m_szName;       // object name as a narrow (CHAR) string
+    const WCHAR *m_wszName;     // object name as a wide (WCHAR) string
+    DWORD m_dwCookie;           // cookie assigned to the object
+    tag_ObjectDesc *m_pNext;    // next descriptor in the singly linked list
+} ObjectDesc;
+
+#define DLLIMPORT __declspec(dllimport)   // shorthand for the dllimport storage-class attribute
+#define DLLEXPORT __declspec(dllexport)   // shorthand for the dllexport storage-class attribute
+
+#ifdef DEBUG
+    // DEBUG builds declare the real entry points of the debug library
+    #define NAME(x) TEXT(x)   // object names are kept as TCHAR string literals
+
+    // These are used internally by the debug library (PRIVATE)
+
+    void WINAPI DbgInitKeyLevels(HKEY hKey, bool fTakeMax);
+    void WINAPI DbgInitGlobalSettings(bool fTakeMax);
+    void WINAPI DbgInitModuleSettings(bool fTakeMax);
+    void WINAPI DbgInitModuleName();
+    DWORD WINAPI DbgRegisterObjectCreation(
+        const CHAR *szObjectName, const WCHAR *wszObjectName);
+    // NOTE(review): dwCookie is presumably the DWORD returned above - confirm
+    BOOL WINAPI DbgRegisterObjectDestruction(DWORD dwCookie);
+
+    // These are the PUBLIC entry points
+
+    BOOL WINAPI DbgCheckModuleLevel(DWORD Type,DWORD Level);
+    void WINAPI DbgSetModuleLevel(DWORD Type,DWORD Level);
+    void WINAPI DbgSetAutoRefreshLevels(bool fAuto);
+
+    // Initialise the library with the module handle; DbgTerminate cleans up
+
+    void WINAPI DbgInitialise(HINSTANCE hInst);
+    void WINAPI DbgTerminate();
+
+    void WINAPI DbgDumpObjectRegister();
+
+    // Display error and logging to the user
+    // (the macros further down pass the condition text, __FILE__ and __LINE__)
+    void WINAPI DbgAssert(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine);
+    void WINAPI DbgBreakPoint(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine);
+    void WINAPI DbgBreakPoint(const TCHAR *pFileName,INT iLine,const TCHAR* szFormatString,...);
+
+    void WINAPI DbgKernelAssert(const TCHAR *pCondition,const TCHAR *pFileName,INT iLine);
+    void WINAPI DbgLogInfo(DWORD Type,DWORD Level,const TCHAR *pFormat,...);
+#ifdef UNICODE   // UNICODE builds additionally get overloads taking narrow (CHAR) strings
+    void WINAPI DbgLogInfo(DWORD Type,DWORD Level,const CHAR *pFormat,...);
+    void WINAPI DbgAssert(const CHAR *pCondition,const CHAR *pFileName,INT iLine);
+    void WINAPI DbgBreakPoint(const CHAR *pCondition,const CHAR *pFileName,INT iLine);
+    void WINAPI DbgKernelAssert(const CHAR *pCondition,const CHAR *pFileName,INT iLine);
+#endif
+    void WINAPI DbgOutString(LPCTSTR psz);
+
+    //  Debug wait wrappers (retail builds map the first two to INFINITE waits)
+    DWORD WINAPI DbgWaitForSingleObject(HANDLE h);
+    DWORD WINAPI DbgWaitForMultipleObjects(DWORD nCount,
+                                    CONST HANDLE *lpHandles,
+                                    BOOL bWaitAll);
+    void WINAPI DbgSetWaitTimeout(DWORD dwTimeout);
+
+#ifdef __strmif_h__   // presumably the include guard of strmif.h (AM_MEDIA_TYPE, IFilterGraph) - confirm
+    // Display a media type: Terse at level 2, verbose at level 5
+    void WINAPI DisplayType(LPTSTR label, const AM_MEDIA_TYPE *pmtIn, DWORD dwLevel = 5);
+
+    // Dump lots of information about a filter graph
+    void WINAPI DumpGraph(IFilterGraph *pGraph, DWORD dwLevel);
+#endif
+
+    // Kernel-style assert. Expands to one expression: safe in unbraced if/else
+    #define KASSERT(_x_) (!(_x_) ?                                        \
+        DbgKernelAssert(TEXT(#_x_),TEXT(__FILE__),__LINE__) : (void)0)
+
+    //  Break on the debugger without putting up a message box;
+    //  the message goes to the debugger instead
+    #define KDbgBreak(_x_)                   \
+        DbgKernelAssert(TEXT(#_x_),TEXT(__FILE__),__LINE__)
+
+    // We chose a common name for our ASSERT macro, MFC also uses this name.
+    // Whatever defines ASSERT first is left as the handler (i.e. MFC). Our
+    // version is one void expression rather than a bare `if`, so a following
+    // `else` at the call site cannot bind to the macro by accident.
+    #ifndef ASSERT
+        #define ASSERT(_x_) (!(_x_) ?                                     \
+            DbgAssert(TEXT(#_x_),TEXT(__FILE__),__LINE__) : (void)0)
+    #endif
+
+    #define DbgAssertAligned( _ptr_, _alignment_ ) ASSERT( ((DWORD_PTR) (_ptr_)) % (_alignment_) == 0)
+
+    //  Put up a message box informing the user of a halt
+    //  condition in the program
+
+    #define DbgBreak(_x_)                   \
+        DbgBreakPoint(TEXT(#_x_),TEXT(__FILE__),__LINE__)
+
+    #define EXECUTE_ASSERT(_x_) ASSERT(_x_)   // the condition is also evaluated in retail builds
+    #define DbgLog(_x_) DbgLogInfo _x_        // _x_ is a complete parenthesised argument list
+    // MFC style trace macros: log the text (plus arguments) as LOG_TRACE at level 5
+
+    #define NOTE(_x_)             DbgLog((LOG_TRACE,5,TEXT(_x_)))
+    #define NOTE1(_x_,a)          DbgLog((LOG_TRACE,5,TEXT(_x_),a))
+    #define NOTE2(_x_,a,b)        DbgLog((LOG_TRACE,5,TEXT(_x_),a,b))
+    #define NOTE3(_x_,a,b,c)      DbgLog((LOG_TRACE,5,TEXT(_x_),a,b,c))
+    #define NOTE4(_x_,a,b,c,d)    DbgLog((LOG_TRACE,5,TEXT(_x_),a,b,c,d))
+    #define NOTE5(_x_,a,b,c,d,e)  DbgLog((LOG_TRACE,5,TEXT(_x_),a,b,c,d,e))
+
+#else
+
+    // Retail builds make public debug functions inert  - WARNING the source
+    // files do not define or build any of the entry points in retail builds
+    // (public entry points compile to nothing) so if you go trying to call
+    // any of the private entry points in your source they won't compile
+
+    #define NAME(_x_) ((TCHAR *) NULL)
+
+    #define DbgInitialise(hInst)
+    #define DbgTerminate()
+    #define DbgLog(_x_) 0
+    #define DbgOutString(psz)
+    #define DbgAssertAligned( _ptr_, _alignment_ ) 0
+    // NOTE(review): one parameter here, but the DEBUG prototype takes two names
+    #define DbgRegisterObjectCreation(pObjectName)
+    #define DbgRegisterObjectDestruction(dwCookie)
+    #define DbgDumpObjectRegister()
+    // NOTE: these expand to nothing, so DbgCheckModuleLevel yields no value here
+    #define DbgCheckModuleLevel(Type,Level)
+    #define DbgSetModuleLevel(Type,Level)
+    #define DbgSetAutoRefreshLevels(fAuto)
+
+    #define DbgWaitForSingleObject(h)  WaitForSingleObject(h, INFINITE)
+    #define DbgWaitForMultipleObjects(nCount, lpHandles, bWaitAll)     \
+               WaitForMultipleObjects(nCount, lpHandles, bWaitAll, INFINITE)
+    #define DbgSetWaitTimeout(dwTimeout)
+
+    #define KDbgBreak(_x_)
+    #define DbgBreak(_x_)
+
+    #define KASSERT(_x_) ((void)0)
+    #ifndef ASSERT
+	#define ASSERT(_x_) ((void)0)
+    #endif
+    #define EXECUTE_ASSERT(_x_) ((void)(_x_))   // still evaluates _x_; the result is discarded
+
+    // MFC style trace macros: the arguments are dropped without being evaluated
+
+    #define NOTE(_x_) ((void)0)
+    #define NOTE1(_x_,a) ((void)0)
+    #define NOTE2(_x_,a,b) ((void)0)
+    #define NOTE3(_x_,a,b,c) ((void)0)
+    #define NOTE4(_x_,a,b,c,d) ((void)0)
+    #define NOTE5(_x_,a,b,c,d,e) ((void)0)
+    // NOTE(review): exactly two arguments; the DEBUG DisplayType also has an optional dwLevel
+    #define DisplayType(label, pmtIn) ((void)0)
+    #define DumpGraph(pGraph, label) ((void)0)
+#endif
+
+
+// Checks a pointer which should be non NULL: when p is NULL the enclosing
+// function returns ret straight away. It can be used as follows.
+#define CheckPointer(p,ret) {if((p)==NULL) return (ret);}
+
+//   HRESULT Foo(VOID *pBar)
+//   {
+//       CheckPointer(pBar,E_INVALIDARG)
+//   }
+//
+//   Or if the function returns a boolean
+//
+//   BOOL Foo(VOID *pBar)
+//   {
+//       CheckPointer(pBar,FALSE)
+//   }
+
+// These validate pointers when symbol VFWROBUST is defined.
+// This will normally be defined in debug not retail builds; otherwise the
+// macros below expand to 0 and perform no check.
+#ifdef DEBUG
+    #define VFWROBUST
+#endif
+
+#ifdef VFWROBUST
+    // The IsBad*Ptr calls report a bad pointer as non-zero, not specifically TRUE
+    #define ValidateReadPtr(p,cb) \
+        {if(IsBadReadPtr((PVOID)(p),(cb))) \
+            DbgBreak("Invalid read pointer");}
+
+    #define ValidateWritePtr(p,cb) \
+        {if(IsBadWritePtr((PVOID)(p),(cb))) \
+            DbgBreak("Invalid write pointer");}
+
+    #define ValidateReadWritePtr(p,cb) \
+        {ValidateReadPtr(p,cb) ValidateWritePtr(p,cb)}
+
+    #define ValidateStringPtr(p) \
+        {if(IsBadStringPtr((LPCTSTR)(p),INFINITE)) \
+            DbgBreak("Invalid string pointer");}
+
+    #define ValidateStringPtrA(p) \
+        {if(IsBadStringPtrA((LPCSTR)(p),INFINITE)) \
+            DbgBreak("Invalid ANSI string pointer");}
+
+    #define ValidateStringPtrW(p) \
+        {if(IsBadStringPtrW((LPCWSTR)(p),INFINITE)) \
+            DbgBreak("Invalid UNICODE string pointer");}
+
+#else
+    #define ValidateReadPtr(p,cb) 0
+    #define ValidateWritePtr(p,cb) 0
+    #define ValidateReadWritePtr(p,cb) 0
+    #define ValidateStringPtr(p) 0
+    #define ValidateStringPtrA(p) 0
+    #define ValidateStringPtrW(p) 0
+#endif
+
+
+#ifdef _OBJBASE_H_
+    // Declared only when _OBJBASE_H_ is defined (presumably by objbase.h - confirm)
+    //  Outputting GUID names.  If you want to include the name
+    //  associated with a GUID (eg CLSID_...) then
+    //
+    //      GuidNames[yourGUID]
+    //
+    //  Returns the name defined in uuids.h as a string
+
+    typedef struct {
+        CHAR   *szName;     // name of the GUID as a narrow string
+        GUID    guid;       // the GUID that the name belongs to
+    } GUID_STRING_ENTRY;
+
+    class CGuidNameList {
+    public:
+        CHAR *operator [] (const GUID& guid);   // look up the name for guid; result for an unknown GUID is not visible here
+    };
+
+    extern CGuidNameList GuidNames;   // the single lookup object, defined in another source file
+
+#endif
+
+#ifndef REMIND
+    //  REMIND macro - generates a build message as reminder to complete coding
+    //  (eg) usage:
+    //
+    //  #pragma message (REMIND("Add automation support"))
+    //  QQUOTE expands its argument (e.g. __LINE__) before QUOTE stringizes it,
+    //  so the resulting literal reads  file(line) :  text
+    #define QUOTE(x) #x
+    #define QQUOTE(y) QUOTE(y)
+    #define REMIND(str) __FILE__ "(" QQUOTE(__LINE__) ") :  " str
+#endif
+
+//  Helper classes that format a value as text for the debug log
+//
+//  For example, to show a LONGLONG ll in a debug string:
+//
+//  DbgLog((LOG_TRACE, n, TEXT("Value is %s"), (LPCTSTR)CDisp(ll, CDISP_HEX)));
+//
+//  CDispBasic holds the text that CDisp exposes through operator LPCTSTR.
+class CDispBasic
+{
+public:
+    CDispBasic() : m_pString(m_String) {}   // start out pointing at the inline buffer
+    ~CDispBasic();
+protected:
+    TCHAR* m_pString;   // the text to show; m_String unless there is too much data
+    TCHAR m_String[50]; // inline buffer that m_pString normally points to
+};
+class CDisp : public CDispBasic
+{
+public:
+    CDisp(LONGLONG ll, int Format = CDISP_HEX); // a LONGLONG, in CDISP_HEX or CDISP_DEC form
+    CDisp(REFCLSID clsid);                      // a GUID
+    CDisp(double d);                            // a floating point number
+#ifdef __strmif_h__
+#ifdef __STREAMS__
+    CDisp(CRefTime t);                          // a Reference Time
+#endif
+    CDisp(IPin *pPin);                          // a pin, as {filter clsid}(pin name)
+    CDisp(IUnknown *pUnk);                      // a filter or pin
+#endif // __strmif_h__
+    ~CDisp();
+    //  Conversion to LPCTSTR so the object can be passed to the logger as text
+    operator LPCTSTR()
+    {
+        LPCTSTR pszText = m_pString;
+        return pszText;
+    }
+};
+
+
+#if defined(DEBUG)   // CAutoTrace exists in DEBUG builds only
+class CAutoTrace
+{
+private:
+    const TCHAR* _szBlkName;            // name passed to both log messages
+    const int _level;                   // DbgLog level used for both messages
+    static const TCHAR _szEntering[];   // format string logged by the constructor (defined outside this header)
+    static const TCHAR _szLeaving[];    // format string logged by the destructor (defined outside this header)
+public:
+    CAutoTrace(const TCHAR* szBlkName, const int level = 15)
+        : _szBlkName(szBlkName), _level(level)
+    {DbgLog((LOG_TRACE, _level, _szEntering, _szBlkName));}
+    // The destructor logs the matching message when the tracer is destroyed
+    ~CAutoTrace()
+    {DbgLog((LOG_TRACE, _level, _szLeaving, _szBlkName));}
+};
+
+#if defined (__FUNCTION__)
+// NOTE(review): __FUNCTION__ may not be a preprocessor macro on the compiler in use, in which case this branch is never taken - confirm
+#define AMTRACEFN()  CAutoTrace __trace(TEXT(__FUNCTION__))
+#define AMTRACE(_x_) CAutoTrace __trace(TEXT(__FUNCTION__))
+
+#else
+// Here AMTRACE expects a parenthesised CAutoTrace constructor argument list
+#define AMTRACE(_x_) CAutoTrace __trace _x_
+#define AMTRACEFN()
+
+#endif
+
+#else
+// Non-DEBUG builds: both macros expand to nothing
+#define AMTRACE(_x_)
+#define AMTRACEFN()
+
+#endif
+
+#endif // __WXDEBUG__
+
+
diff --git a/plugins/GSdx_legacy/baseclasses/wxlist.cpp b/plugins/GSdx_legacy/baseclasses/wxlist.cpp
new file mode 100644
index 0000000000..1270e8d3d4
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxlist.cpp
@@ -0,0 +1,885 @@
+//------------------------------------------------------------------------------
+// File: WXList.cpp
+//
+// Desc: DirectShow base classes - implements a non-MFC based generic list
+//       template class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+/* A generic list of pointers to objects.
+   Objectives: avoid using MFC libraries in ndm kernel mode and
+   provide a really useful list type.
+
+   The class is thread safe in that separate threads may add and
+   delete items in the list concurrently although the application
+   must ensure that constructor and destructor access is suitably
+   synchronised.
+
+   The list name must not conflict with MFC classes as an
+   application may use both
+
+   The nodes form a doubly linked, NULL terminated chain with an anchor
+   block (the list object per se) holding pointers to the first and last
+   nodes and a count of the nodes.
+   There is a node cache to reduce the allocation and freeing overhead.
+   It optionally (determined at construction time) has an Event which is
+   set whenever the list becomes non-empty and reset whenever it becomes
+   empty.
+   It optionally (determined at construction time) has a Critical Section
+   which is entered during the important part of each operation.  (About
+   all you can do outside it is some parameter checking).
+
+   The node cache is a repository of nodes that are NOT in the list to speed
+   up storage allocation.  Each list has its own cache to reduce locking and
+   serialising.  The list accesses are serialised anyway for a given list - a
+   common cache would mean that we would have to separately serialise access
+   of all lists within the cache.  Because the cache only stores nodes that are
+   not in the list, releasing the cache does not release any list nodes.  This
+   means that list nodes can be copied or rechained from one list to another
+   without danger of creating a dangling reference if the original cache goes
+   away.
+
+   Questionable design decisions:
+   1. Retaining the warts for compatibility
+   2. Keeping an element count -i.e. counting whenever we do anything
+      instead of only when we want the count.
+   3. Making the chain pointers NULL terminated.  If the list object
+      itself looks just like a node and the list is kept as a ring then
+      it reduces the number of special cases.  All inserts look the same.
+*/
+
+
+#include "streams.h"
+
+/* set cursor to the position of each element of list in turn, head to tail (expands to a for(...) header; the loop body follows the macro) */
+#define INTERNALTRAVERSELIST(list, cursor)               \
+for ( cursor = (list).GetHeadPositionI()           \
+    ; cursor!=NULL                               \
+    ; cursor = (list).Next(cursor)                \
+    )
+
+
+/* set cursor to the position of each element of list in turn
+   in reverse order, tail to head (expands to a for(...) header; the loop
+   body follows the macro, and both arguments are evaluated repeatedly) */
+#define INTERNALREVERSETRAVERSELIST(list, cursor)        \
+for ( cursor = (list).GetTailPositionI()           \
+    ; cursor!=NULL                               \
+    ; cursor = (list).Prev(cursor)                \
+    )
+
+/* Construct an empty list.
+
+   The constructors only initialise members: the first and last node
+   pointers are NULL, the element count is zero and the node cache is
+   constructed from iItems (DEFAULTCACHE when no size is given).
+   In DEBUG builds the name is passed on to CBaseObject; in other
+   builds the constructors do not use it.
+*/
+CBaseList::CBaseList(TCHAR *pName,    // Descriptive list name
+                     INT iItems) :    // Node cache size
+#ifdef DEBUG
+    CBaseObject(pName),
+#endif
+    m_pFirst(NULL),
+    m_pLast(NULL),
+    m_Count(0),
+    m_Cache(iItems)
+{
+} // constructor
+
+CBaseList::CBaseList(TCHAR *pName) :  // Descriptive list name (used in DEBUG builds only)
+#ifdef DEBUG
+    CBaseObject(pName),
+#endif
+    m_pFirst(NULL),           // no nodes yet
+    m_pLast(NULL),
+    m_Count(0),
+    m_Cache(DEFAULTCACHE)     // node cache built with the default size
+{
+} // constructor (empty list, default cache size)
+
+#ifdef UNICODE   // UNICODE builds also accept a narrow (CHAR) list name
+CBaseList::CBaseList(CHAR *pName,    // Descriptive list name
+                     INT iItems) :    // Node cache size
+#ifdef DEBUG
+    CBaseObject(pName),
+#endif
+    m_pFirst(NULL),
+    m_pLast(NULL),
+    m_Count(0),
+    m_Cache(iItems)
+{
+} // constructor (narrow name, caller-chosen cache size)
+
+CBaseList::CBaseList(CHAR *pName) :  // Descriptive list name
+#ifdef DEBUG
+    CBaseObject(pName),
+#endif
+    m_pFirst(NULL),
+    m_pLast(NULL),
+    m_Count(0),
+    m_Cache(DEFAULTCACHE)
+{
+} // constructor (narrow name, default cache size)
+
+#endif
+
+/* The destructor calls RemoveAll() to delete every node still chained in
+   the list (nodes parked in m_Cache are not handled by code in this
+   function). We do not do any processing on the objects that the list
+   holds (i.e. points to) so if they represent interfaces for example the
+   creator of the list should ensure that each is released before deleting us
+*/
+CBaseList::~CBaseList()
+{
+    /* Delete all our list nodes */
+
+    RemoveAll();
+
+} // destructor
+
+/* Delete every node in the list and leave the list empty.
+   The objects that the nodes point to are not touched - releasing or
+   freeing them remains the responsibility of whoever created them.
+   The nodes are deleted outright rather than being handed to the node
+   cache, and nothing other than the chain, the first/last pointers and
+   the count is changed by this routine.
+*/
+void CBaseList::RemoveAll()
+{
+    /* Walk the chain from the head.  The successor has to be fetched
+       before the current node is deleted because the link lives inside
+       that node.  Deleted nodes are not offered to the cache: this is
+       expected to be called at times of wholesale change only. */
+
+    for (CNode *pWalk = m_pFirst; pWalk != NULL; ) {
+        CNode *pDoomed = pWalk;
+        pWalk = pDoomed->Next();
+        delete pDoomed;
+    }
+
+    /* Nothing is chained any more: clear the anchors and the count */
+
+    m_pLast = NULL;
+    m_pFirst = NULL;
+    m_Count = 0;
+
+} // RemoveAll
+
+
+
+/* Return a position enumerator for the entire list.
+   A position enumerator is a pointer to a node object cast to a
+   transparent type so all we do is return the head/tail node
+   pointer in the list (NULL when the list is empty).
+   WARNING because the position is a pointer to a node there is
+   an implicit assumption for users of the list class that after
+   deleting an object from the list any other position
+   enumerators that you have may be invalid (since the node
+   may be gone).
+*/
+WXLIST_POSITION CBaseList::GetHeadPositionI() const
+{
+    return (WXLIST_POSITION) m_pFirst;
+} // GetHeadPosition
+
+
+
+WXLIST_POSITION CBaseList::GetTailPositionI() const   // position of the last node
+{
+    return (WXLIST_POSITION) m_pLast;   // the last-node pointer, viewed as a position
+} // GetTailPosition
+
+
+
+/* Get the number of objects in the list.
+   The stored count is returned directly; no lock is taken here.
+   NOTE(review): an earlier version of this comment described taking
+   a lock first so that operations in progress (for example another
+   list being added to this one) would be complete before the count
+   was read.  Nothing in this function does that - confirm whether
+   callers are expected to serialise access themselves.
+*/
+int CBaseList::GetCountI() const
+{
+    return m_Count;
+} // GetCount
+
+
+
+/* Return the object stored at position rp and advance rp to the
+   following position (NULL once the last object has been passed).
+   Calling again with rp already NULL is allowed: NULL is returned
+   and rp stays NULL.
+*/
+void *CBaseList::GetNextI(WXLIST_POSITION& rp) const
+{
+    /* Only a non-NULL position refers to a node */
+
+    if (rp != NULL) {
+
+        /* A position is just a node pointer in disguise, so cast it
+           back to get at the node */
+
+        CNode *pn = (CNode *) rp;
+        ASSERT(pn != NULL);
+
+        /* Move the caller's position on, then read the payload of
+           the node it was at; a NULL payload is legal. */
+
+        rp = (WXLIST_POSITION) pn->Next();
+        return pn->GetData();
+    }
+
+    /* Already past the end of the list: nothing to hand back and
+       the position is left as it was */
+
+    return NULL;
+} //GetNext
+
+
+
+/* Return the object stored at position p without moving anything.
+   A NULL position simply yields NULL (no assertion is raised).
+   No lock is taken and the list is not modified.  The stored
+   pointer is returned as-is, so the result may itself be NULL
+   because NULL objects are allowed in the list; a NULL result
+   therefore does not by itself mean that p was NULL.
+*/
+void *CBaseList::GetI(WXLIST_POSITION p) const
+{
+    /* p is either NULL or a CNode pointer in disguise */
+
+    CNode *pHolder = (CNode *) p;
+    if (pHolder != NULL) {
+        return pHolder->GetData();
+    }
+
+    return NULL;
+} //Get
+
+
+
+/* Search from the head for the first position whose stored pointer
+   equals pObj.  Return that position, or NULL if there is none.
+*/
+WXLIST_POSITION CBaseList::FindI( void * pObj) const
+{
+    WXLIST_POSITION posScan = GetHeadPositionI();
+    while (posScan != NULL && GetI(posScan) != pObj) {
+        posScan = Next(posScan);
+    }
+
+    /* posScan is the first match, or NULL when nothing matched */
+    return posScan;
+} // Find
+
+
+
+/* Take the first node out of the list.  Only the list's pointer to the
+   object is dropped; the object itself is not freed.
+   Return the object pointer that node held, or NULL if the list is empty
+*/
+void *CBaseList::RemoveHeadI()
+{
+    /* Delegate to RemoveI with the head position.  Some of the checks
+       RemoveI makes are redundant for the head (there is no previous
+       node) but a special case would add complexity for little gain.
+       With an empty list the position is NULL and RemoveI returns NULL. */
+
+    WXLIST_POSITION posHead = (WXLIST_POSITION) m_pFirst;
+    void *pRemoved = RemoveI(posHead);
+    return pRemoved;
+} // RemoveHead
+
+
+
+/* Take the last node out of the list.  Only the list's pointer to the
+   object is dropped; the object itself is not freed.
+   Return the object pointer that node held, or NULL if the list is empty
+*/
+void *CBaseList::RemoveTailI()
+{
+    /* Delegate to RemoveI with the tail position.  Some of the checks
+       RemoveI makes are redundant for the tail (there is no following
+       node) but a special case would add complexity for little gain.
+       With an empty list the position is NULL and RemoveI returns NULL. */
+
+    WXLIST_POSITION posTail = (WXLIST_POSITION) m_pLast;
+    void *pRemoved = RemoveI(posTail);
+    return pRemoved;
+} // RemoveTail
+
+
+
+/* Unlink the node at position pos and return the object pointer it held.
+   The neighbouring nodes (or the list's first/last pointers when pos is
+   at an end) are re-chained around the gap and the element count drops
+   by one.  The unlinked node itself is handed to the node cache.
+   The object that the node pointed to is neither freed nor examined.
+   RemoveI(NULL) is a harmless no-op that returns NULL.
+   pos must be a position in this list; that is not checked here.
+*/
+void *CBaseList::RemoveI(WXLIST_POSITION pos)
+{
+    /* Removing "nothing" is explicitly allowed */
+
+    if (pos == NULL) {
+        return NULL;
+    }
+
+    CNode *pCurrent = (CNode *) pos;
+    ASSERT(pCurrent != NULL);
+
+    /* Note both neighbours first; either of them may be NULL when the
+       node sits at an end of the chain */
+
+    CNode *pLeft  = pCurrent->Prev();
+    CNode *pRight = pCurrent->Next();
+
+    /* Whatever pointed forward at this node now points at pRight */
+
+    if (pLeft != NULL) {
+        pLeft->SetNext(pRight);
+    } else {
+        m_pFirst = pRight;
+    }
+
+    /* Whatever pointed back at this node now points at pLeft */
+
+    if (pRight != NULL) {
+        pRight->SetPrev(pLeft);
+    } else {
+        m_pLast = pLeft;
+    }
+
+    /* Fetch the payload while the node is still ours; a NULL payload
+       is legal because NULL pointers may be stored in the list */
+
+    void *pPayload = pCurrent->GetData();
+
+    /* Give the node to the cache for reuse by a later Add.  The cache
+       size is fixed when the list is constructed (DEFAULTCACHE unless
+       the caller asked for something else).
+       NOTE(review): what AddToCache does with the node when the cache
+       is already full is not visible in this file - confirm in the
+       cache class.
+    */
+
+    m_Cache.AddToCache(pCurrent);
+
+    /* One element fewer; the count must never go negative */
+
+    --m_Count;
+    ASSERT(m_Count >= 0);
+    return pPayload;
+} // Remove
+
+
+
+/* Append pObject to the end of the list.
+   Return the position of the new tail node, or NULL if no node was obtained.
+*/
+
+WXLIST_POSITION CBaseList::AddTailI(void *pObject)
+{
+    /* NULL is an acceptable pObject - the list may hold NULL pointers */
+
+    /* Prefer a recycled node from the cache; fall back to the heap
+       only when the cache has nothing to offer */
+
+    CNode *pFresh = (CNode *) m_Cache.RemoveFromCache();
+    if (pFresh == NULL) {
+        pFresh = new CNode;
+    }
+
+    /* Still nothing?  Then report failure to the caller.
+       NOTE(review): this check only matters if operator new returns
+       NULL on failure rather than throwing - confirm for the
+       toolchain in use. */
+
+    if (pFresh == NULL) {
+        return NULL;
+    }
+
+    /* A node from the cache carries whatever it held before, so
+       every field is set explicitly */
+
+    pFresh->SetData(pObject);
+    pFresh->SetNext(NULL);
+    pFresh->SetPrev(m_pLast);
+
+    /* Hook it in behind the current tail; in an empty list it
+       becomes the head as well */
+
+    if (m_pLast != NULL) {
+        m_pLast->SetNext(pFresh);
+    } else {
+        m_pFirst = pFresh;
+    }
+
+    /* The new node is now the last one and the list has grown by
+       one element */
+
+    m_pLast = pFresh;
+    ++m_Count;
+
+    return (WXLIST_POSITION) pFresh;
+} // AddTail(object)
+
+
+
+/* Insert pObject at the front of the list.
+   Return the position of the new head node, or NULL if no node was obtained.
+*/
+WXLIST_POSITION CBaseList::AddHeadI(void *pObject)
+{
+    /* NULL is an acceptable pObject - the list may hold NULL pointers */
+
+    /* Prefer a recycled node from the cache; fall back to the heap
+       only when the cache has nothing to offer */
+
+    CNode *pFresh = (CNode *) m_Cache.RemoveFromCache();
+    if (pFresh == NULL) {
+        pFresh = new CNode;
+    }
+
+    /* Still nothing?  Then report failure to the caller */
+
+    if (pFresh == NULL) {
+        return NULL;
+    }
+
+    /* A node from the cache carries whatever it held before, so
+       every field is set explicitly */
+
+    pFresh->SetData(pObject);
+    pFresh->SetPrev(NULL);
+    pFresh->SetNext(m_pFirst);
+
+    /* Hook it in ahead of the current head; in an empty list it
+       becomes the tail as well */
+
+    if (m_pFirst != NULL) {
+        m_pFirst->SetPrev(pFresh);
+    } else {
+        m_pLast = pFresh;
+    }
+
+    /* The new node is now the first one and the list has grown */
+    m_pFirst = pFresh;
+    ++m_Count;
+
+    return (WXLIST_POSITION) pFresh;
+} // AddHead(object)
+
+
+
+/* Append every element pointer held in *pList to the tail of this list.
+   Return TRUE if every element was added, FALSE as soon as one add fails
+   (elements added before the failure stay in this list).
+*/
+BOOL CBaseList::AddTail(CBaseList *pList)
+{
+    /* No lock is taken on either list here.  pList is walked from its
+       head; GetNextI hands back the element and steps posSrc forward.
+       pList must not be this (non-empty) list: the walk would keep
+       meeting the nodes it has just appended and never reach the end. */
+    WXLIST_POSITION posSrc = pList->GetHeadPositionI();
+    for ( ; posSrc != NULL; ) {
+        void *pItem = pList->GetNextI(posSrc);
+        if (AddTailI(pItem) == NULL) {
+            return FALSE;
+        }
+    }
+
+    return TRUE;
+} // AddTail(list)
+
+
+
+/* Insert every element pointer held in *pList at the head of this list.
+   Return TRUE if every element was added, FALSE as soon as one add fails
+   (elements added before the failure stay in this list).
+*/
+BOOL CBaseList::AddHead(CBaseList *pList)
+{
+    /* No lock is taken on either list here.  The source is walked from
+       its tail towards its head and each element is pushed onto our
+       head, so the elements end up in their original order rather
+       than reversed. */
+
+    WXLIST_POSITION posSrc = pList->GetTailPositionI();
+    while (posSrc != NULL) {
+        if (AddHeadI(pList->GetI(posSrc)) == NULL) {
+            return FALSE;
+        }
+
+        /* step towards the head of the source list */
+        posSrc = pList->Prev(posSrc);
+    }
+
+    return TRUE;
+} // AddHead(list)
+
+
+
+/* Insert pObj immediately after position pos.
+   pos remains a valid position afterwards.
+   A NULL pos inserts at the start of the list, exactly like AddHeadI.
+   Return the position of the new element, or NULL if no node was obtained.
+*/
+WXLIST_POSITION  CBaseList::AddAfterI(WXLIST_POSITION pos, void * pObj)
+{
+    /* A NULL position means "before everything" */
+
+    if (pos == NULL) {
+        return AddHeadI(pObj);
+    }
+
+    /* After the last node is just an ordinary append */
+
+    CNode *pAfter = (CNode *) pos;
+    ASSERT(pAfter != NULL);
+    if (m_pLast == pAfter) {
+        return AddTailI(pObj);
+    }
+
+    /* Interior insert: obtain a node, from the cache when possible */
+
+    CNode *pFresh = (CNode *) m_Cache.RemoveFromCache();
+    if (pFresh == NULL) {
+        pFresh = new CNode;
+    }
+
+    if (pFresh == NULL) {
+        return NULL;
+    }
+
+    /* A recycled node may hold an old payload, so set it afresh */
+
+    pFresh->SetData(pObj);
+
+    /* pAfter is not the tail, so it has a successor.  The new node
+       goes between the two: pAfter <-> new node <-> pBefore */
+
+    CNode * pBefore = pAfter->Next();
+    ASSERT(pBefore != NULL);
+
+    /* Four links change: two on the new node, one on each neighbour */
+
+    pFresh->SetPrev(pAfter);
+    pFresh->SetNext(pBefore);
+    pBefore->SetPrev(pFresh);
+    pAfter->SetNext(pFresh);
+
+    ++m_Count;
+
+    return (WXLIST_POSITION) pFresh;
+
+} // AddAfter(object)
+
+
+
+BOOL CBaseList::AddAfter(WXLIST_POSITION p, CBaseList *pList)
+{
+    /* Copy pList head to tail; p advances to each newly added position */
+    WXLIST_POSITION posSrc = pList->GetHeadPositionI();
+    for (; posSrc != NULL; posSrc = pList->Next(posSrc)) {
+        p = AddAfterI(p, pList->GetI(posSrc));
+        if (p == NULL) { return FALSE; }
+    }
+    return TRUE;
+} // AddAfter(list)
+
+
+
+/* Mirror image of AddAfterI:
+   Insert pObj immediately before position pos.
+   pos remains a valid position afterwards.
+   A NULL pos inserts at the end of the list, exactly like AddTailI.
+*/
+WXLIST_POSITION CBaseList::AddBeforeI(WXLIST_POSITION pos, void * pObj)
+{
+    /* A NULL position means "after everything" */
+
+    if (pos == NULL) {
+        return AddTailI(pObj);
+    }
+
+    /* Before the first node is just an ordinary insert at the head */
+
+    CNode *pBefore = (CNode *) pos;
+    ASSERT(pBefore != NULL);
+    if (m_pFirst == pBefore) {
+        return AddHeadI(pObj);
+    }
+
+    /* Interior insert: obtain a node, from the cache when possible */
+
+    CNode *pFresh = (CNode *) m_Cache.RemoveFromCache();
+    if (pFresh == NULL) {
+        pFresh = new CNode;
+    }
+
+    if (pFresh == NULL) {
+        return NULL;
+    }
+
+    pFresh->SetData(pObj);
+
+    /* pBefore is not the head, so it has a predecessor.  The new node
+       goes between the two: pAfter <-> new node <-> pBefore */
+
+    CNode * pAfter = pBefore->Prev();
+    ASSERT(pAfter != NULL);
+
+    /* Four links change: two on the new node, one on each neighbour */
+
+    pFresh->SetPrev(pAfter);
+    pFresh->SetNext(pBefore);
+    pBefore->SetPrev(pFresh);
+    pAfter->SetNext(pFresh);
+
+    ++m_Count;
+
+    return (WXLIST_POSITION) pFresh;
+
+} // Addbefore(object)
+
+
+
+/* Add a node for every element of *pList to *this, starting before
+   position p.  Returns FALSE as soon as one insertion fails (nodes
+   added before the failure remain in *this), TRUE otherwise.
+*/
+BOOL CBaseList::AddBefore(WXLIST_POSITION p, CBaseList *pList)
+{
+    WXLIST_POSITION posSource;
+    INTERNALREVERSETRAVERSELIST(*pList, posSource) {
+        /* each new node goes before the one added on the previous pass */
+        WXLIST_POSITION posAdded = AddBeforeI(p, pList->GetI(posSource));
+        if (posAdded == NULL) {
+            return FALSE;
+        }
+        p = posAdded;
+    }
+    return TRUE;
+} // AddBefore(list)
+
+
+
+/* Split *this after position p in *this
+   Retain as *this the tail portion of the original *this
+   Add the head portion to the tail end of *pList
+   Return TRUE if it all worked, FALSE if it didn't.
+
+   e.g.
+      foo->MoveToTail(foo->GetHeadPosition(), bar);
+          moves one element from the head of foo to the tail of bar
+      foo->MoveToTail(NULL, bar);
+          is a no-op
+      foo->MoveToTail(foo->GetTailPosition(), bar);
+          concatenates foo onto the end of bar and empties foo.
+
+   A better, except excessively long name might be
+       MoveElementsFromHeadThroughPositionToOtherTail
+*/
+BOOL CBaseList::MoveToTail(WXLIST_POSITION pos, CBaseList *pList)
+{
+    /* Outline:
+       the sequence "*pList followed by *this" is the same before and
+       after the call, only the place where it is cut changes.  So:
+       count the nodes that move, splice the two chains into one,
+       repoint the first/last members of both lists, cut the chain
+       just after pos, and finally fix up both counts.
+    */
+
+    if (pos == NULL) {
+        return TRUE;            // nothing to move
+    }
+
+    CNode * const pSplit = (CNode *) pos;
+
+    /* pos and everything in front of it moves: count those nodes */
+    int cMove = 0;
+    for (CNode *pWalk = pSplit; pWalk != NULL; pWalk = pWalk->Prev()) {
+        ++cMove;
+    }
+
+    /* splice: tail of *pList <-> head of *this */
+    if (pList->m_pLast != NULL) {
+        pList->m_pLast->SetNext(m_pFirst);
+    }
+    if (m_pFirst != NULL) {
+        m_pFirst->SetPrev(pList->m_pLast);
+    }
+
+    /* repoint the list ends */
+    if (pList->m_pFirst == NULL) {
+        pList->m_pFirst = m_pFirst;
+    }
+    m_pFirst = pSplit->Next();
+    if (m_pFirst == NULL) {
+        m_pLast = NULL;         // everything moved, *this is now empty
+    }
+    pList->m_pLast = pSplit;
+
+    /* cut the chain just after the split node */
+    if (m_pFirst != NULL) {
+        m_pFirst->SetPrev(NULL);
+    }
+    pSplit->SetNext(NULL);
+
+    /* account for the nodes that changed owner */
+    m_Count -= cMove;
+    pList->m_Count += cMove;
+
+    return TRUE;
+
+} // MoveToTail
+
+
+
+/* Mirror image of MoveToTail:
+   Split *this before position p in *this.
+   Retain in *this the head portion of the original *this
+   Add the tail portion to the start (i.e. head) of *pList
+   Return TRUE if it all worked, FALSE if it didn't.
+
+   e.g.
+      foo->MoveToHead(foo->GetTailPosition(), bar);
+          moves one element from the tail of foo to the head of bar
+      foo->MoveToHead(NULL, bar);
+          is a no-op
+      foo->MoveToHead(foo->GetHeadPosition(), bar);
+          concatenates foo onto the start of bar and empties foo.
+*/
+BOOL CBaseList::MoveToHead(WXLIST_POSITION pos, CBaseList *pList)
+{
+    /* Same scheme as MoveToTail with the directions swapped */
+
+    if (pos == NULL) {
+        return TRUE;            // nothing to move
+    }
+
+    CNode * const pSplit = (CNode *) pos;
+
+    /* pos and everything behind it moves: count those nodes */
+    int cMove = 0;
+    for (CNode *pWalk = pSplit; pWalk != NULL; pWalk = pWalk->Next()) {
+        ++cMove;
+    }
+
+    /* splice: tail of *this <-> head of *pList */
+    if (pList->m_pFirst != NULL) {
+        pList->m_pFirst->SetPrev(m_pLast);
+    }
+    if (m_pLast != NULL) {
+        m_pLast->SetNext(pList->m_pFirst);
+    }
+
+    /* repoint the list ends */
+    if (pList->m_pLast == NULL) {
+        pList->m_pLast = m_pLast;
+    }
+    m_pLast = pSplit->Prev();
+    if (m_pLast == NULL) {
+        m_pFirst = NULL;        // everything moved, *this is now empty
+    }
+    pList->m_pFirst = pSplit;
+
+    /* cut the chain just before the split node */
+    if (m_pLast != NULL) {
+        m_pLast->SetNext(NULL);
+    }
+    pSplit->SetPrev(NULL);
+
+    /* account for the nodes that changed owner */
+    m_Count -= cMove;
+    pList->m_Count += cMove;
+
+    return TRUE;
+
+} // MoveToHead
+
+
+
+/* Reverse the order of the [pointers to] objects in *this
+*/
+void CBaseList::Reverse()
+{
+    /* Walk the list once in its original order and swap the two links
+       of every node.  The old forward link is read before it is
+       overwritten, so the walk never loses track of the node it has
+       to visit next.
+    */
+    for (CNode *pCurrent = m_pFirst; pCurrent != NULL; ) {
+        CNode * const pOldNext = pCurrent->Next();
+        pCurrent->SetNext(pCurrent->Prev());
+        pCurrent->SetPrev(pOldNext);
+        pCurrent = pOldNext;
+    }
+
+    /* the two ends of the list change places as well */
+    CNode * const pOldFirst = m_pFirst;
+    m_pFirst = m_pLast;
+    m_pLast = pOldFirst;
+
+} // Reverse
diff --git a/plugins/GSdx_legacy/baseclasses/wxlist.h b/plugins/GSdx_legacy/baseclasses/wxlist.h
new file mode 100644
index 0000000000..fd5c84a004
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxlist.h
@@ -0,0 +1,543 @@
+//------------------------------------------------------------------------------
+// File: WXList.h
+//
+// Desc: DirectShow base classes - defines a non-MFC generic template list
+//       class.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+/* A generic list of pointers to objects.
+   No storage management or copying is done on the objects pointed to.
+   Objectives: avoid using MFC libraries in ndm kernel mode and
+   provide a really useful list type.
+
+   The class is thread safe in that separate threads may add and
+   delete items in the list concurrently although the application
+   must ensure that constructor and destructor access is suitably
+   synchronised. An application can cause deadlock with operations
+   which use two lists by simultaneously calling
+   list1->Operation(list2) and list2->Operation(list1).  So don't!
+
+   The names must not conflict with MFC classes as an application
+   may use both.
+   */
+
+#ifndef __WXLIST__
+#define __WXLIST__
+
+   /* A POSITION represents (in some fashion that's opaque) a cursor
+      on the list that can be set to identify any element.  NULL is
+      a valid value and several operations regard NULL as the position
+      "one step off the end of the list".  (In an n element list there
+      are n+1 places to insert and NULL is that "n+1-th" value).
+      The POSITION of an element in the list is only invalidated if
+      that element is deleted.  Move operations may mean that what
+      was a valid POSITION in one list is now a valid POSITION in
+      a different list.
+
+      Some operations which at first sight are illegal are allowed as
+      harmless no-ops.  For instance RemoveHead is legal on an empty
+      list and it returns NULL.  This allows an atomic way to test if
+      there is an element there, and if so, get it.  The two operations
+      AddTail and RemoveHead thus implement a MONITOR (See Hoare's paper).
+
+      Single element operations return POSITIONs, non-NULL means it worked.
+      whole list operations return a BOOL.  TRUE means it all worked.
+
+      This definition is the same as the POSITION type for MFCs, so we must
+      avoid defining it twice.
+   */
+/* Dummy struct that exists only to give WXLIST_POSITION a distinct
+   pointer type; the list code in this header casts positions to and
+   from CNode *.
+*/
+struct __WXLIST_POSITION { int unused; };
+typedef __WXLIST_POSITION* WXLIST_POSITION;
+
+const int DEFAULTCACHE = 10;    /* Default node object cache size */
+
+/* CBaseList is a doubly linked list of untyped (void *) object pointers.
+   Its nested CNode class represents one node in a list.
+   Each node knows a pointer to its adjacent nodes and also a pointer
+   to the object that it looks after.
+   All of these pointers can be retrieved or set through member functions.
+*/
+class CBaseList
+#ifdef DEBUG
+    : public CBaseObject
+#endif
+{
+    /* Making these classes inherit from CBaseObject does nothing
+       functionally but it allows us to check there are no memory
+       leaks in debug builds.
+    */
+
+public:
+
+    /* One node of the list: links to both neighbours plus the untyped
+       pointer to the object the node looks after.
+    */
+#ifdef DEBUG
+    class CNode : public CBaseObject {
+#else
+    class CNode {
+#endif
+
+        CNode *m_pPrev;         /* Previous node in the list */
+        CNode *m_pNext;         /* Next node in the list */
+        void *m_pObject;      /* Pointer to the object */
+
+    public:
+
+        /* Constructor - initialise the object's pointers */
+        CNode()
+#ifdef DEBUG
+            : CBaseObject(NAME("List node"))
+#endif
+        {
+            /* Start from a known state so that a fresh node never
+               exposes indeterminate pointers through Prev/Next/GetData
+            */
+            m_pPrev = NULL;
+            m_pNext = NULL;
+            m_pObject = NULL;
+        };
+
+
+        /* Return the previous node before this one */
+        CNode *Prev() const { return m_pPrev; };
+
+
+        /* Return the next node after this one */
+        CNode *Next() const { return m_pNext; };
+
+
+        /* Set the previous node before this one */
+        void SetPrev(CNode *p) { m_pPrev = p; };
+
+
+        /* Set the next node after this one */
+        void SetNext(CNode *p) { m_pNext = p; };
+
+
+        /* Get the pointer to the object for this node */
+        void *GetData() const { return m_pObject; };
+
+
+        /* Set the pointer to the object for this node */
+        void SetData(void *p) { m_pObject = p; };
+    };
+
+    /* A bounded free list of CNode objects, chained through their next
+       pointers, so that nodes can be recycled instead of reallocated.
+    */
+    class CNodeCache
+    {
+    public:
+        /* iCacheSize is the largest number of nodes the cache retains.
+           The initialisers are listed in member declaration order,
+           which is the order in which they are actually run.
+        */
+        CNodeCache(INT iCacheSize) : m_iCacheSize(iCacheSize),
+                                     m_iUsed(0),
+                                     m_pHead(NULL)
+                                     {};
+
+        /* Delete every node still held in the cache */
+        ~CNodeCache() {
+            CNode *pNode = m_pHead;
+            while (pNode) {
+                CNode *pCurrent = pNode;
+                pNode = pNode->Next();
+                delete pCurrent;
+            }
+        };
+
+        /* Take ownership of pNode: keep it for reuse while there is
+           room, otherwise delete it straight away.
+        */
+        void AddToCache(CNode *pNode)
+        {
+            if (m_iUsed < m_iCacheSize) {
+                pNode->SetNext(m_pHead);
+                m_pHead = pNode;
+                m_iUsed++;
+            } else {
+                delete pNode;
+            }
+        };
+
+        /* Hand out a cached node, or NULL when the cache is empty.
+           Only the cache chain is unlinked here; the caller is expected
+           to set every field of the node it gets back.
+        */
+        CNode *RemoveFromCache()
+        {
+            CNode *pNode = m_pHead;
+            if (pNode != NULL) {
+                m_pHead = pNode->Next();
+                m_iUsed--;
+                ASSERT(m_iUsed >= 0);
+            } else {
+                ASSERT(m_iUsed == 0);
+            }
+            return pNode;
+        };
+    private:
+        INT m_iCacheSize;       /* Maximum number of cached nodes */
+        INT m_iUsed;            /* Number of nodes currently cached */
+        CNode *m_pHead;         /* First cached node, NULL if none */
+    };
+
+protected:
+
+    CNode* m_pFirst;    /* Pointer to first node in the list */
+    CNode* m_pLast;     /* Pointer to the last node in the list */
+    LONG m_Count;       /* Number of nodes currently in the list */
+
+private:
+
+    CNodeCache m_Cache; /* Cache of unused node pointers */
+
+private:
+
+    /* These override the default copy constructor and assignment
+       operator for all list classes. They are in the private class
+       declaration section so that anybody trying to pass a list
+       object by value will generate a compile time error of
+       "cannot access the private member function". If these were
+       not here then the compiler will create default constructors
+       and assignment operators which when executed first take a
+       copy of all member variables and then during destruction
+       delete them all. This must not be done for any heap
+       allocated data.
+    */
+    CBaseList(const CBaseList &refList);
+    CBaseList &operator=(const CBaseList &refList);
+
+public:
+
+    CBaseList(TCHAR *pName,
+              INT iItems);
+
+    CBaseList(TCHAR *pName);
+#ifdef UNICODE
+    CBaseList(CHAR *pName,
+              INT iItems);
+
+    CBaseList(CHAR *pName);
+#endif
+    ~CBaseList();
+
+    /* Remove all the nodes from *this i.e. make the list empty */
+    void RemoveAll();
+
+
+    /* Return a cursor which identifies the first element of *this */
+    WXLIST_POSITION GetHeadPositionI() const;
+
+
+    /* Return a cursor which identifies the last element of *this */
+    WXLIST_POSITION GetTailPositionI() const;
+
+
+    /* Return the number of objects in *this */
+    int GetCountI() const;
+
+protected:
+    /* Return the pointer to the object at rp,
+       Update rp to the next node in *this
+       but make it NULL if it was at the end of *this.
+       This is a wart retained for backwards compatibility.
+       GetPrev is not implemented.
+       Use Next, Prev and Get separately.
+    */
+    void *GetNextI(WXLIST_POSITION& rp) const;
+
+
+    /* Return a pointer to the object at p
+       Asking for the object at NULL will return NULL harmlessly.
+    */
+    void *GetI(WXLIST_POSITION p) const;
+
+public:
+    /* return the next / prev position in *this
+       return NULL when going past the end/start.
+       Next(NULL) is same as GetHeadPosition()
+       Prev(NULL) is same as GetTailPosition()
+       An n element list therefore behaves like a n+1 element
+       cycle with NULL at the start/end.
+
+       !!WARNING!! - This handling of NULL is DIFFERENT from GetNext.
+
+       Some reasons are:
+       1. For a list of n items there are n+1 positions to insert
+          These are conveniently encoded as the n POSITIONs and NULL.
+       2. If you are keeping a list sorted (fairly common) and you
+          search forward for an element to insert before and don't
+          find it you finish up with NULL as the element before which
+          to insert.  You then want that NULL to be a valid WXLIST_POSITION
+          so that you can insert before it and you want that insertion
+          point to mean the (n+1)-th one that doesn't have a WXLIST_POSITION.
+          (symmetrically if you are working backwards through the list).
+       3. It simplifies the algebra which the methods generate.
+          e.g. AddBefore(p,x) is identical to AddAfter(Prev(p),x)
+          in ALL cases.  All the other arguments probably are reflections
+          of the algebraic point.
+    */
+    WXLIST_POSITION Next(WXLIST_POSITION pos) const
+    {
+        if (pos == NULL) {
+            return (WXLIST_POSITION) m_pFirst;
+        }
+        CNode *pn = (CNode *) pos;
+        return (WXLIST_POSITION) pn->Next();
+    } //Next
+
+    // See Next
+    WXLIST_POSITION Prev(WXLIST_POSITION pos) const
+    {
+        if (pos == NULL) {
+            return (WXLIST_POSITION) m_pLast;
+        }
+        CNode *pn = (CNode *) pos;
+        return (WXLIST_POSITION) pn->Prev();
+    } //Prev
+
+
+    /* Return the first position in *this which holds the given
+       pointer.  Return NULL if the pointer was not found.
+    */
+protected:
+    WXLIST_POSITION FindI( void * pObj) const;
+
+    /* Remove the first node in *this (deletes the pointer to its
+       object from the list, does not free the object itself).
+       Return the pointer to its object.
+       If *this was already empty it will harmlessly return NULL.
+    */
+    void *RemoveHeadI();
+
+
+    /* Remove the last node in *this (deletes the pointer to its
+       object from the list, does not free the object itself).
+       Return the pointer to its object.
+       If *this was already empty it will harmlessly return NULL.
+    */
+    void *RemoveTailI();
+
+
+    /* Remove the node identified by p from the list (deletes the pointer
+       to its object from the list, does not free the object itself).
+       Asking to Remove the object at NULL will harmlessly return NULL.
+       Return the pointer to the object removed.
+    */
+    void *RemoveI(WXLIST_POSITION p);
+
+    /* Add single object *pObj to become a new last element of the list.
+       Return the new tail position, NULL if it fails.
+       If you are adding a COM object, you might want to AddRef it first.
+       Other existing POSITIONs in *this are still valid
+    */
+    WXLIST_POSITION AddTailI(void * pObj);
+public:
+
+
+    /* Add all the elements in *pList to the tail of *this.
+       This duplicates all the nodes in *pList (i.e. duplicates
+       all its pointers to objects).  It does not duplicate the objects.
+       If you are adding a list of pointers to COM objects into the list
+       it's a good idea to AddRef them all when you AddTail them.
+       Return TRUE if it all worked, FALSE if it didn't.
+       If it fails some elements may have been added.
+       Existing POSITIONs in *this are still valid
+
+       If you actually want to MOVE the elements, use MoveToTail instead.
+    */
+    BOOL AddTail(CBaseList *pList);
+
+
+    /* Mirror images of AddTail: */
+
+    /* Add single object to become a new first element of the list.
+       Return the new head position, NULL if it fails.
+       Existing POSITIONs in *this are still valid
+    */
+protected:
+    WXLIST_POSITION AddHeadI(void * pObj);
+public:
+
+    /* Add all the elements in *pList to the head of *this.
+       Same warnings apply as for AddTail.
+       Return TRUE if it all worked, FALSE if it didn't.
+       If it fails some of the objects may have been added.
+
+       If you actually want to MOVE the elements, use MoveToHead instead.
+    */
+    BOOL AddHead(CBaseList *pList);
+
+
+    /* Add the object *pObj to *this after position p in *this.
+       AddAfter(NULL,x) adds x to the start - equivalent to AddHead
+       Return the position of the object added, NULL if it failed.
+       Existing POSITIONs in *this are undisturbed, including p.
+    */
+protected:
+    WXLIST_POSITION AddAfterI(WXLIST_POSITION p, void * pObj);
+public:
+
+    /* Add the list *pList to *this after position p in *this
+       AddAfter(NULL,x) adds x to the start - equivalent to AddHead
+       Return TRUE if it all worked, FALSE if it didn't.
+       If it fails, some of the objects may be added
+       Existing POSITIONs in *this are undisturbed, including p.
+    */
+    BOOL AddAfter(WXLIST_POSITION p, CBaseList *pList);
+
+
+    /* Mirror images:
+       Add the object *pObj to *this before position p in *this.
+       AddBefore(NULL,x) adds x to the end - equivalent to AddTail
+       Return the position of the new object, NULL if it fails
+       Existing POSITIONs in *this are undisturbed, including p.
+    */
+    protected:
+    WXLIST_POSITION AddBeforeI(WXLIST_POSITION p, void * pObj);
+    public:
+
+    /* Add the list *pList to *this before position p in *this
+       AddBefore(NULL,x) adds x to the end - equivalent to AddTail
+       Return TRUE if it all worked, FALSE if it didn't.
+       If it fails, some of the objects may be added
+       Existing POSITIONs in *this are undisturbed, including p.
+    */
+    BOOL AddBefore(WXLIST_POSITION p, CBaseList *pList);
+
+
+    /* Note that AddAfter(p,x) is equivalent to AddBefore(Next(p),x)
+       even in cases where p is NULL or Next(p) is NULL.
+       Similarly for mirror images etc.
+       This may make it easier to argue about programs.
+    */
+
+
+
+    /* The following operations do not copy any elements.
+       They move existing blocks of elements around by switching pointers.
+       They are about as efficient for long lists as for short lists.
+       (Alas, the Count slows things down).
+
+       They split the list into two parts.
+       One part remains as the original list, the other part
+       is appended to the second list.  There are eight possible
+       variations:
+       Split the list {after/before} a given element
+       keep the {head/tail} portion in the original list
+       append the rest to the {head/tail} of the new list.
+
+       Since After is strictly equivalent to Before Next
+       we are not in serious need of the Before/After variants.
+       That leaves only four.
+
+       If you are processing a list left to right and dumping
+       the bits that you have processed into another list as
+       you go, the Tail/Tail variant gives the most natural result.
+       If you are processing in reverse order, Head/Head is best.
+
+       By using NULL positions and empty lists judiciously either
+       of the other two can be built up in two operations.
+
+       The definition of NULL (see Next/Prev etc) means that
+       degenerate cases include
+          "move all elements to new list"
+          "Split a list into two lists"
+          "Concatenate two lists"
+          (and quite a few no-ops)
+
+       !!WARNING!! The type checking won't buy you much if you get list
+       positions muddled up - e.g. use a WXLIST_POSITION that's in a different
+       list and see what a mess you get!
+    */
+
+    /* Split *this after position p in *this
+       Retain as *this the tail portion of the original *this
+       Add the head portion to the tail end of *pList
+       Return TRUE if it all worked, FALSE if it didn't.
+
+       e.g.
+          foo->MoveToTail(foo->GetHeadPosition(), bar);
+              moves one element from the head of foo to the tail of bar
+          foo->MoveToTail(NULL, bar);
+              is a no-op, returns TRUE
+          foo->MoveToTail(foo->GetTailPosition(), bar);
+              concatenates foo onto the end of bar and empties foo.
+
+       A better, except excessively long name might be
+           MoveElementsFromHeadThroughPositionToOtherTail
+    */
+    BOOL MoveToTail(WXLIST_POSITION pos, CBaseList *pList);
+
+
+    /* Mirror image:
+       Split *this before position p in *this.
+       Retain in *this the head portion of the original *this
+       Add the tail portion to the start (i.e. head) of *pList
+       Return TRUE if it all worked, FALSE if it didn't.
+
+       e.g.
+          foo->MoveToHead(foo->GetTailPosition(), bar);
+              moves one element from the tail of foo to the head of bar
+          foo->MoveToHead(NULL, bar);
+              is a no-op, returns TRUE
+          foo->MoveToHead(foo->GetHeadPosition(), bar);
+              concatenates foo onto the start of bar and empties foo.
+    */
+    BOOL MoveToHead(WXLIST_POSITION pos, CBaseList *pList);
+
+
+    /* Reverse the order of the [pointers to] objects in *this
+    */
+    void Reverse();
+
+
+    /* set cursor to the position of each element of list in turn  */
+    #define TRAVERSELIST(list, cursor)               \
+    for ( cursor = (list).GetHeadPosition()           \
+        ; cursor!=NULL                               \
+        ; cursor = (list).Next(cursor)                \
+        )
+
+
+    /* set cursor to the position of each element of list in turn
+       in reverse order
+    */
+    #define REVERSETRAVERSELIST(list, cursor)        \
+    for ( cursor = (list).GetTailPosition()           \
+        ; cursor!=NULL                               \
+        ; cursor = (list).Prev(cursor)                \
+        )
+
+}; // end of class declaration
+
+/* Typed facade over CBaseList: every member forwards to the untyped
+   base implementation and converts between OBJECT * and void *.
+*/
+template<class OBJECT> class CGenericList : public CBaseList
+{
+public:
+    /* bLock and bAlert are accepted but not used */
+    CGenericList(TCHAR *pName,
+                 INT iItems,
+                 BOOL bLock = TRUE,
+                 BOOL bAlert = FALSE)
+        : CBaseList(pName, iItems)
+    {
+        UNREFERENCED_PARAMETER(bAlert);
+        UNREFERENCED_PARAMETER(bLock);
+    }
+
+    CGenericList(TCHAR *pName)
+        : CBaseList(pName)
+    {
+    }
+
+    /* Cursors and size */
+    WXLIST_POSITION GetHeadPosition() const
+    {
+        return reinterpret_cast<WXLIST_POSITION>(m_pFirst);
+    }
+    WXLIST_POSITION GetTailPosition() const
+    {
+        return reinterpret_cast<WXLIST_POSITION>(m_pLast);
+    }
+    int GetCount() const
+    {
+        return m_Count;
+    }
+
+    /* Element access */
+    OBJECT *GetNext(WXLIST_POSITION& rp) const
+    {
+        return static_cast<OBJECT *>(GetNextI(rp));
+    }
+    OBJECT *Get(WXLIST_POSITION p) const
+    {
+        return static_cast<OBJECT *>(GetI(p));
+    }
+    OBJECT *GetHead() const
+    {
+        return Get(GetHeadPosition());
+    }
+
+    /* Removal - the object itself is handed back, not freed */
+    OBJECT *RemoveHead()
+    {
+        return static_cast<OBJECT *>(RemoveHeadI());
+    }
+    OBJECT *RemoveTail()
+    {
+        return static_cast<OBJECT *>(RemoveTailI());
+    }
+    OBJECT *Remove(WXLIST_POSITION p)
+    {
+        return static_cast<OBJECT *>(RemoveI(p));
+    }
+
+    /* Single-object insertion */
+    WXLIST_POSITION AddBefore(WXLIST_POSITION p, OBJECT * pObj)
+    {
+        return AddBeforeI(p, pObj);
+    }
+    WXLIST_POSITION AddAfter(WXLIST_POSITION p, OBJECT * pObj)
+    {
+        return AddAfterI(p, pObj);
+    }
+    WXLIST_POSITION AddHead(OBJECT * pObj)
+    {
+        return AddHeadI(pObj);
+    }
+    WXLIST_POSITION AddTail(OBJECT * pObj)
+    {
+        return AddTailI(pObj);
+    }
+
+    /* Whole-list insertion */
+    BOOL AddTail(CGenericList<OBJECT> *pList)
+    {
+        return CBaseList::AddTail(static_cast<CBaseList *>(pList));
+    }
+    BOOL AddHead(CGenericList<OBJECT> *pList)
+    {
+        return CBaseList::AddHead(static_cast<CBaseList *>(pList));
+    }
+    BOOL AddAfter(WXLIST_POSITION p, CGenericList<OBJECT> *pList)
+    {
+        return CBaseList::AddAfter(p, static_cast<CBaseList *>(pList));
+    }
+    BOOL AddBefore(WXLIST_POSITION p, CGenericList<OBJECT> *pList)
+    {
+        return CBaseList::AddBefore(p, static_cast<CBaseList *>(pList));
+    }
+
+    /* Search */
+    WXLIST_POSITION Find( OBJECT * pObj) const
+    {
+        return FindI(pObj);
+    }
+}; // end of class declaration
+
+
+
+/* These define the standard list types */
+
+typedef CGenericList<CBaseObject> CBaseObjectList;     /* list of CBaseObject pointers */
+typedef CGenericList<IUnknown> CBaseInterfaceList;     /* list of IUnknown pointers */
+
+#endif /* __WXLIST__ */
+
diff --git a/plugins/GSdx_legacy/baseclasses/wxutil.cpp b/plugins/GSdx_legacy/baseclasses/wxutil.cpp
new file mode 100644
index 0000000000..b210b134b8
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxutil.cpp
@@ -0,0 +1,1246 @@
+//------------------------------------------------------------------------------
+// File: WXUtil.cpp
+//
+// Desc: DirectShow base classes - implements helper classes for building
+//       multimedia filters.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#include "streams.h"
+
+//
+//  Declare function from largeint.h we need so that PPC can build
+//
+
+//
+// Enlarged integer divide - 64-bits / 32-bits > 32-bits
+//
+
+#ifndef _X86_
+
+// Reinterpret the storage of x as a single unsigned 64-bit value
+#define LLtoU64(x) (*(unsigned __int64*)(void*)(&(x)))
+
+// Portable version: divides the 64-bit Dividend by the 32-bit Divisor
+// using 64-bit arithmetic.  Returns the quotient cast down to a ULONG
+// and, when Remainder is not NULL, stores the remainder through it.
+__inline
+ULONG
+WINAPI
+EnlargedUnsignedDivide (
+    IN ULARGE_INTEGER Dividend,
+    IN ULONG Divisor,
+    IN PULONG Remainder
+    )
+{
+        // return remainder if necessary
+        if (Remainder != NULL)
+                *Remainder = (ULONG)(LLtoU64(Dividend) % Divisor);
+        return (ULONG)(LLtoU64(Dividend) / Divisor);
+}
+
+#else
+// x86 version: same parameters and results, implemented with one DIV
+// instruction on edx:eax.  The remainder is stored only when Remainder
+// is not NULL.
+// NOTE(review): DIV faults when the quotient does not fit in 32 bits,
+// whereas the portable version above truncates - confirm that callers
+// never depend on that difference.
+__inline
+ULONG
+WINAPI
+EnlargedUnsignedDivide (
+    IN ULARGE_INTEGER Dividend,
+    IN ULONG Divisor,
+    IN PULONG Remainder
+    )
+{
+    ULONG ulResult;
+    _asm {
+        mov eax,Dividend.LowPart
+        mov edx,Dividend.HighPart
+        mov ecx,Remainder
+        div Divisor
+        or  ecx,ecx
+        jz  short label
+        mov [ecx],edx
+label:
+        mov ulResult,eax
+    }
+    return ulResult;
+}
+#endif
+
+// --- CAMEvent -----------------------
+CAMEvent::CAMEvent(BOOL fManualReset)
+{
+    // unnamed event with default security, created non-signalled
+    const BOOL fInitiallySignalled = FALSE;
+    m_hEvent = CreateEvent(NULL, fManualReset, fInitiallySignalled, NULL);
+}
+
+CAMEvent::~CAMEvent()
+{
+    // creation may have failed, in which case there is nothing to close
+    if (m_hEvent != NULL) {
+        EXECUTE_ASSERT(CloseHandle(m_hEvent));
+    }
+}
+
+
+// --- CAMMsgEvent -----------------------
+// One routine.  The rest is handled in CAMEvent
+
+BOOL CAMMsgEvent::WaitMsg(DWORD dwTimeout)
+{
+    // Block until the event is signalled or dwTimeout milliseconds have
+    // gone by, letting SENT messages through while waiting.
+
+    // remaining time to wait; shrinks each time a message wakes us up
+    DWORD dwWaitTime = dwTimeout;
+
+    // reference point for working out how much of a finite timeout has
+    // been used up (never read when waiting forever)
+    const DWORD dwStartTime = (dwWaitTime != INFINITE) ? timeGetTime() : 0;
+
+    DWORD dwWait;
+    for (;;) {
+        dwWait = MsgWaitForMultipleObjects(1, &m_hEvent, FALSE, dwWaitTime, QS_SENDMESSAGE);
+
+        // anything other than "a message arrived" ends the wait
+        if (dwWait != WAIT_OBJECT_0 + 1) {
+            break;
+        }
+
+        MSG Message;
+        PeekMessage(&Message, NULL, 0, 0, PM_NOREMOVE);
+
+        // A finite timeout has to be reduced by the time already spent;
+        // an INFINITE one is left alone.
+        if (dwWaitTime != INFINITE) {
+            const DWORD dwElapsed = timeGetTime() - dwStartTime;
+
+            if (dwElapsed >= dwTimeout) {
+                dwWaitTime = 0;     // next wait returns WAIT_TIMEOUT at once
+            } else {
+                dwWaitTime = dwTimeout - dwElapsed;
+            }
+        }
+    }
+
+    // TRUE when the event handle woke us, FALSE otherwise (e.g. timeout)
+    return (dwWait == WAIT_OBJECT_0);
+}
+
+// --- CAMThread ----------------------
+
+
+CAMThread::CAMThread()
+    : m_EventSend(TRUE)     // manual-reset: CheckRequest() relies on it
+{
+    // no worker thread until Create() is called
+    m_hThread = NULL;
+}
+
+CAMThread::~CAMThread()
+{
+    // release whatever Close() is responsible for
+    Close();
+}
+
+
+// when the thread starts, it calls this function. We unwrap the 'this'
+//pointer and call ThreadProc.
+DWORD WINAPI
+CAMThread::InitialThreadProc(LPVOID pv)
+{
+    // COM initialisation failure is logged but does not stop the thread
+    const HRESULT hrCoInit = CAMThread::CoInitializeHelper();
+    if (FAILED(hrCoInit)) {
+        DbgLog((LOG_ERROR, 1, TEXT("CoInitializeEx failed.")));
+    }
+
+    // pv is the CAMThread object handed to CreateThread
+    CAMThread * const pSelf = (CAMThread *) pv;
+    const HRESULT hrThread = pSelf->ThreadProc();
+
+    // balance the initialisation only if it actually succeeded
+    if (SUCCEEDED(hrCoInit)) {
+        CoUninitialize();
+    }
+
+    return hrThread;
+}
+
+BOOL
+CAMThread::Create()
+{
+    CAutoLock lock(&m_AccessLock);
+
+    // refuse to start a second thread for the same object
+    if (ThreadExists()) {
+        return FALSE;
+    }
+
+    // default security and stack size, runs immediately, 'this' is the
+    // argument unwrapped by InitialThreadProc
+    DWORD dwThreadId;
+    m_hThread = CreateThread(NULL, 0, CAMThread::InitialThreadProc, this, 0, &dwThreadId);
+
+    // TRUE only when a thread handle was obtained
+    return (m_hThread != NULL) ? TRUE : FALSE;
+}
+
+DWORD
+CAMThread::CallWorker(DWORD dwParam)
+{
+    // held until this function returns
+    CAutoLock lock(&m_AccessLock);
+
+    if (!ThreadExists()) {
+        return (DWORD) E_FAIL;
+    }
+
+    // publish the request first, then signal that it is there
+    m_dwParam = dwParam;
+    m_EventSend.Set();
+
+    // block until completion is signalled (see Reply)
+    m_EventComplete.Wait();
+
+    // the value stored by Reply is the result of the call
+    return m_dwReturnVal;
+}
+
+// Wait for a request from the client
+DWORD
+CAMThread::GetRequest()
+{
+    // block until a request has been signalled, then hand back its parameter
+    m_EventSend.Wait();
+
+    return m_dwParam;
+}
+
+// is there a request?
+BOOL
+CAMThread::CheckRequest(DWORD * pParam)
+{
+    // nothing pending
+    if (!m_EventSend.Check()) {
+        return FALSE;
+    }
+
+    // the caller may pass NULL when it does not need the parameter
+    if (pParam != NULL) {
+        *pParam = m_dwParam;
+    }
+    return TRUE;
+}
+
+// reply to the request
+void
+CAMThread::Reply(DWORD dw)
+{
+    // value that CallWorker will return
+    m_dwReturnVal = dw;
+
+    // Order matters: clear the request event BEFORE signalling
+    // completion.  Done the other way round, the client could Set the
+    // event for its next request and this Reset would then wipe it out.
+    // From here on CheckRequest reports no pending request.
+    m_EventSend.Reset();
+
+    // let the waiting client continue
+    m_EventComplete.Set();
+}
+
+// Initialise COM for the calling thread through CoInitializeEx(), which is
+// looked up dynamically because older versions of ole32.dll do not export
+// it.  The call tells OLE not to create a window: this thread probably will
+// not dispatch messages and would otherwise hang on broadcast messages.
+//
+// Returns the HRESULT from CoInitializeEx(), or E_FAIL when ole32.dll is not
+// loaded in the process or does not export CoInitializeEx.  Threads that
+// never call CoCreate are unaffected by a failure; threads that do have to
+// handle it themselves.
+HRESULT CAMThread::CoInitializeHelper()
+{
+    typedef HRESULT (STDAPICALLTYPE *PCoInitializeEx)(
+        LPVOID pvReserved, DWORD dwCoInit);
+
+    const HINSTANCE hOle32 = GetModuleHandle(TEXT("ole32.dll"));
+    if (!hOle32) {
+        // caller must load ole32.dll
+        DbgBreak("couldn't locate ole32.dll");
+        return E_FAIL;
+    }
+
+    PCoInitializeEx pfnCoInitEx =
+        (PCoInitializeEx)(GetProcAddress(hOle32, "CoInitializeEx"));
+    if (!pfnCoInitEx) {
+        // older ole32.dll without CoInitializeEx
+        return E_FAIL;
+    }
+
+    return (*pfnCoInitEx)(0, COINIT_DISABLE_OLE1DDE );
+}
+
+
+// Destructor: wait for the worker thread to finish, then discard any
+// messages still sitting in the queue and release the semaphore.
+CMsgThread::~CMsgThread()
+{
+    if (m_hThread) {
+        // Block until the thread has exited before tearing down its state.
+        WaitForSingleObject(m_hThread, INFINITE);
+        EXECUTE_ASSERT(CloseHandle(m_hThread));
+    }
+
+    // Free every CMsg that was queued but never consumed.
+    for (WXLIST_POSITION pos = m_ThreadQueue.GetHeadPosition(); pos; ) {
+        CMsg * pLeftover = m_ThreadQueue.GetNext(pos);
+        delete pLeftover;
+    }
+    m_ThreadQueue.RemoveAll();
+
+    if (m_hSem) {
+        EXECUTE_ASSERT(CloseHandle(m_hSem));
+    }
+}
+
+// Create the wake-up semaphore and start the message-pump thread.
+// Returns non-zero on success, FALSE if either the semaphore or the thread
+// could not be created.
+BOOL
+CMsgThread::CreateThread()
+{
+    // Semaphore with an initial count of 0; GetThreadMsg() waits on it.
+    m_hSem = CreateSemaphore(NULL, 0, 0x7FFFFFFF, NULL);
+    if (!m_hSem) {
+        return FALSE;
+    }
+
+    // The Win32 API is named with '::' because the unqualified name would
+    // resolve to this member function.
+    m_hThread = ::CreateThread(NULL,
+                               0,
+                               DefaultThreadProc,
+                               (LPVOID)this,
+                               0,
+                               &m_ThreadId);
+    return m_hThread != NULL;
+}
+
+
+// Message pump for the worker thread.  Messages are fetched and handed to
+// the client's ThreadMessageProc() until it returns a non-zero value; that
+// value stops the pump and becomes the thread's exit code.
+DWORD WINAPI
+CMsgThread::DefaultThreadProc(
+    LPVOID lpParam
+    )
+{
+    CMsgThread * const pThread = (CMsgThread *)lpParam;
+    CMsg msgCurrent;
+    LRESULT lrStop = 0L;
+
+    // COM stays initialised for the lifetime of the pump.
+    CoInitialize(NULL);
+
+    // allow a derived class to handle thread startup
+    pThread->OnThreadInit();
+
+    while (lrStop == 0L) {
+        pThread->GetThreadMsg(&msgCurrent);
+        lrStop = pThread->ThreadMessageProc(msgCurrent.uMsg,
+                                            msgCurrent.dwFlags,
+                                            msgCurrent.lpParam,
+                                            msgCurrent.pEvent);
+    }
+
+    CoUninitialize();
+
+    return (DWORD)lrStop;
+}
+
+
+// Block until a message is available on m_ThreadQueue, copy it into the
+// caller-supplied *msg and free the queued copy.
+void
+CMsgThread::GetThreadMsg(CMsg *msg)
+{
+    CMsg * pQueued = NULL;
+
+    // keep trying until a message appears
+    for (;;) {
+        {
+            // The queue and the waiter count are only touched under m_Lock.
+            CAutoLock lck(&m_Lock);
+            pQueued = m_ThreadQueue.RemoveHead();
+            if (pQueued != NULL) {
+                break;
+            }
+            // Nothing queued: count ourselves as a waiter before sleeping.
+            m_lWaiting++;
+        }
+        // Sleep outside the lock; the semaphore will be signalled when the
+        // queue is non-empty.
+        WaitForSingleObject(m_hSem, INFINITE);
+    }
+
+    // copy fields to caller's CMsg
+    *msg = *pQueued;
+
+    // this CMsg was allocated by the 'new' in PutThreadMsg
+    delete pQueued;
+}
+
+
+// NOTE: as we need to use the same binaries on Win95 as on NT this code should
+// be compiled WITHOUT unicode being defined.  Otherwise we will not pick up
+// these internal routines and the binary will not run on Win95.
+
+#ifndef UNICODE
+// Windows 95 doesn't implement this, so we provide an implementation.
+// LPWSTR
+// WINAPI
+// lstrcpyWInternal(
+//     LPWSTR lpString1,
+//     LPCWSTR lpString2
+//     )
+// {
+//     LPWSTR  lpReturn = lpString1;
+//     while (*lpString1++ = *lpString2++);
+//
+//     return lpReturn;
+// }
+
+// Windows 95 doesn't implement lstrcpynW, so we provide an implementation.
+//
+// Copies at most iMaxLength - 1 characters from lpString2 into lpString1
+// and always NUL-terminates the destination.
+//
+// Parameters:
+//    lpString1  - destination buffer of at least iMaxLength WCHARs
+//    lpString2  - NUL-terminated source string
+//    iMaxLength - capacity of lpString1 in characters, including the
+//                 terminator; must be non-zero (asserted).  When it is zero
+//                 nothing is written.
+// Returns:
+//    lpString1
+LPWSTR
+WINAPI
+lstrcpynWInternal(
+    LPWSTR lpString1,
+    LPCWSTR lpString2,
+    int     iMaxLength
+    )
+{
+    ASSERT(iMaxLength);
+    LPWSTR  lpReturn = lpString1;
+    if (iMaxLength) {
+        // Stop either when only the terminator slot is left or when the
+        // end of the source has been reached.
+        while (--iMaxLength && *lpString2) {
+            *lpString1++ = *lpString2++;
+        }
+
+        // Terminate unconditionally.  The loop never copies the source's
+        // NUL, so the terminator has to be written both when we ran out of
+        // room and when the source was shorter than the buffer; writing it
+        // only in the out-of-room case left short copies unterminated.
+        *lpString1 = L'\0';
+    }
+    return lpReturn;
+}
+
+// Compare two NUL-terminated wide strings by character value.
+// Returns 0 when they are identical, otherwise the difference between the
+// first pair of characters that differ (lpString1's minus lpString2's).
+int
+WINAPI
+lstrcmpWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    )
+{
+    for (;; ++lpString1, ++lpString2) {
+        const WCHAR ch1 = *lpString1;
+        const WCHAR ch2 = *lpString2;
+
+        if (ch1 != ch2) {
+            return (int) ch1 - (int) ch2;
+        }
+        if (ch1 == 0) {
+            // both strings ended at the same position
+            return 0;
+        }
+    }
+}
+
+
+// Compare two NUL-terminated wide strings, treating the letters 'A'..'Z'
+// as equal to 'a'..'z'; no other characters are case-folded.
+// Returns 0 when the folded strings are identical, otherwise the difference
+// between the first pair of folded characters that differ.
+int
+WINAPI
+lstrcmpiWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    )
+{
+    for (;; ++lpString1, ++lpString2) {
+        WCHAR ch1 = *lpString1;
+        WCHAR ch2 = *lpString2;
+
+        // fold upper-case ASCII letters to lower case
+        if (ch1 >= L'A' && ch1 <= L'Z') {
+            ch1 = (WCHAR) (ch1 + (L'a' - L'A'));
+        }
+        if (ch2 >= L'A' && ch2 <= L'Z') {
+            ch2 = (WCHAR) (ch2 + (L'a' - L'A'));
+        }
+
+        if (ch1 != ch2) {
+            return (int) ch1 - (int) ch2;
+        }
+        if (ch1 == 0) {
+            // both strings ended at the same position
+            return 0;
+        }
+    }
+}
+
+
+// Return the number of characters in lpString, not counting the
+// terminating NUL.
+int
+WINAPI
+lstrlenWInternal(
+    LPCWSTR lpString
+    )
+{
+    int cch = 0;
+    while (lpString[cch] != 0) {
+        ++cch;
+    }
+    return cch;
+}
+
+
+// int WINAPIV wsprintfWInternal(LPWSTR wszOut, LPCWSTR pszFmt, ...)
+// {
+//     char fmt[256]; // !!!
+//     char ach[256]; // !!!
+//     int i;
+//
+//     va_list va;
+//     va_start(va, pszFmt);
+//     WideCharToMultiByte(GetACP(), 0, pszFmt, -1, fmt, 256, NULL, NULL);
+//     (void)StringCchVPrintf(ach, NUMELMS(ach), fmt, va);
+//     i = lstrlenA(ach);
+//     va_end(va);
+//
+//     MultiByteToWideChar(CP_ACP, 0, ach, -1, wszOut, i+1);
+//
+//     return i;
+// }
+#else
+
+// need to provide the implementations in unicode for non-unicode
+// builds linking with the unicode strmbase.lib
+//LPWSTR WINAPI lstrcpyWInternal(
+//    LPWSTR lpString1,
+//    LPCWSTR lpString2
+//    )
+//{
+//    return lstrcpyW(lpString1, lpString2);
+//}
+
+// UNICODE build: forward straight to the system lstrcpynW().
+LPWSTR WINAPI lstrcpynWInternal(
+    LPWSTR lpString1,
+    LPCWSTR lpString2,
+    int     iMaxLength
+    )
+{
+    LPWSTR lpCopied = lstrcpynW(lpString1, lpString2, iMaxLength);
+    return lpCopied;
+}
+
+// UNICODE build: forward straight to the system lstrcmpW().
+int WINAPI lstrcmpWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    )
+{
+    const int nOrder = lstrcmpW(lpString1, lpString2);
+    return nOrder;
+}
+
+
+// UNICODE build: forward straight to the system lstrcmpiW().
+int WINAPI lstrcmpiWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    )
+{
+    const int nOrder = lstrcmpiW(lpString1, lpString2);
+    return nOrder;
+}
+
+
+// UNICODE build: forward straight to the system lstrlenW().
+int WINAPI lstrlenWInternal(
+    LPCWSTR lpString
+    )
+{
+    const int cch = lstrlenW(lpString);
+    return cch;
+}
+
+
+//int WINAPIV wsprintfWInternal(
+//    LPWSTR wszOut, LPCWSTR pszFmt, ...)
+//{
+//    va_list va;
+//    va_start(va, pszFmt);
+//    int i = wvsprintfW(wszOut, pszFmt, va);
+//    va_end(va);
+//    return i;
+//}
+#endif
+
+
+// Helper function - write the decimal representation of i into the wide
+// character buffer wstr, whose capacity is len characters.
+void WINAPI IntToWstr(int i, LPWSTR wstr, size_t len)
+{
+#ifdef UNICODE
+    // TCHAR is already wide: format straight into the caller's buffer.
+    (void)StringCchPrintf(wstr, len, L"%d", i);
+#else
+    // Format as ANSI text first, then widen it into the caller's buffer.
+    TCHAR szAnsi[32];
+    (void)StringCchPrintf(szAnsi, NUMELMS(szAnsi), "%d", i);
+    MultiByteToWideChar(CP_ACP, 0, szAnsi, -1, wstr, int(len) );
+#endif
+} // IntToWstr
+
+
+#if 0
+// (Compiled out by the enclosing #if 0.)
+// Scan the first sz bytes at pv for a byte equal to c.  Returns a pointer
+// to the first match, or NULL when there is none.
+void * memchrInternal(const void *pv, int c, size_t sz)
+{
+    BYTE *pbCur = (BYTE *) pv;
+    for (; sz != 0; --sz, ++pbCur) {
+        if (*pbCur == c) {
+            return (void *) pbCur;
+        }
+    }
+    return NULL;
+}
+#endif
+
+
+// Constants used by the x86 assembly in memmoveInternal(): the bulk of a
+// forward copy is done in 4-byte units (count >> 2), the remainder
+// (count & 3) byte by byte.
+// NOTE(review): MEMORY_ALIGNMENT_MASK expands to the unparenthesised
+// expression "4 - 1"; harmless where it is used below, but it would bind
+// incorrectly inside a larger C expression.
+#define MEMORY_ALIGNMENT        4
+#define MEMORY_ALIGNMENT_LOG2   2
+#define MEMORY_ALIGNMENT_MASK   MEMORY_ALIGNMENT - 1
+
+// Copy count bytes from src to dst, producing the correct result even when
+// the two ranges overlap.  Returns dst.
+//
+// On x86 (_X86_ defined) the copy is hand-written inline assembly; on every
+// other target it is delegated to MoveMemory().
+void * __stdcall memmoveInternal(void * dst, const void * src, size_t count)
+{
+    void * ret = dst;
+
+#ifdef _X86_
+    // A forward copy is safe when dst starts at or below src, or at or
+    // beyond the end of the source range.
+    if (dst <= src || (char *)dst >= ((char *)src + count)) {
+
+        /*
+         * Non-Overlapping Buffers
+         * copy from lower addresses to higher addresses
+         *
+         * count >> 2 dwords are moved with "rep movsd"; ecx is then zero,
+         * so "or ecx,edx" loads the remaining count & 3 bytes into ecx
+         * (and sets ZF when there are none) for the trailing "rep movsb".
+         */
+        _asm {
+            mov     esi,src
+            mov     edi,dst
+            mov     ecx,count
+            cld
+            mov     edx,ecx
+            and     edx,MEMORY_ALIGNMENT_MASK
+            shr     ecx,MEMORY_ALIGNMENT_LOG2
+            rep     movsd
+            or      ecx,edx
+            jz      memmove_done
+            rep     movsb
+memmove_done:
+        }
+    }
+    else {
+
+        /*
+         * Overlapping Buffers
+         * copy from higher addresses to lower addresses
+         *
+         * Both pointers are moved to the last byte of their range and the
+         * direction flag is set so that "rep movsb" walks downwards; the
+         * flag is cleared again afterwards.
+         */
+        _asm {
+            mov     esi,src
+            mov     edi,dst
+            mov     ecx,count
+            std
+            add     esi,ecx
+            add     edi,ecx
+            dec     esi
+            dec     edi
+            rep     movsb
+            cld
+        }
+    }
+#else
+    MoveMemory(dst, src, count);
+#endif
+
+    return ret;
+}
+
+/*  Arithmetic functions to help with time format conversions
+*/
+
+#ifdef _M_ALPHA
+// work around bug in version 12.00.8385 of the alpha compiler where
+// UInt32x32To64 sign-extends its arguments (?)
+#undef UInt32x32To64
+#define UInt32x32To64(a, b) (((ULONGLONG)((ULONG)(a)) & 0xffffffff) * ((ULONGLONG)((ULONG)(b)) & 0xffffffff))
+#endif
+
+/*   Compute (a * b + d) / c
+
+     The product a * b is formed as a 128 bit magnitude held in p[1]:p[0]
+     (p[1] is the most significant 64 bits), with its sign tracked
+     separately in bSign.  d is then added to that magnitude, the result
+     is divided by |c|, and the sign is applied to the quotient last.
+
+     Returns the quotient, or - when c == 0 or the quotient does not fit -
+     0x7FFFFFFFFFFFFFFF for a positive result and 0x8000000000000000 for a
+     negative one.
+
+     NOTE(review): the absolute values are taken with a plain unary minus,
+     so the most negative LONGLONG looks unhandled for a, b and c -
+     confirm callers never pass it.
+*/
+LONGLONG WINAPI llMulDiv(LONGLONG a, LONGLONG b, LONGLONG c, LONGLONG d)
+{
+    /*  Compute the absolute values to avoid signed arithmetic problems */
+    ULARGE_INTEGER ua, ub;
+    DWORDLONG uc;
+
+    ua.QuadPart = (DWORDLONG)(a >= 0 ? a : -a);
+    ub.QuadPart = (DWORDLONG)(b >= 0 ? b : -b);
+    uc          = (DWORDLONG)(c >= 0 ? c : -c);
+    /*  bSign is TRUE when a * b is negative */
+    BOOL bSign = (a < 0) ^ (b < 0);
+
+    /*  Do long multiplication */
+    ULARGE_INTEGER p[2];
+    p[0].QuadPart  = UInt32x32To64(ua.LowPart, ub.LowPart);
+
+    /*  This next computation cannot overflow into p[1].HighPart because
+        the max number we can compute here is:
+
+                 (2 ** 32 - 1) * (2 ** 32 - 1) +  // ua.LowPart * ub.LowPart
+    (2 ** 32) *  (2 ** 31) * (2 ** 32 - 1) * 2    // x.LowPart * y.HighPart * 2
+
+    == 2 ** 96 - 2 ** 64 + (2 ** 64 - 2 ** 33 + 1)
+    == 2 ** 96 - 2 ** 33 + 1
+    < 2 ** 96
+    */
+
+    /*  x collects the two cross products plus the carry out of the low
+        product; its low half becomes bits 32..63 of the result and its
+        high half carries into p[1] */
+    ULARGE_INTEGER x;
+    x.QuadPart     = UInt32x32To64(ua.LowPart, ub.HighPart) +
+                     UInt32x32To64(ua.HighPart, ub.LowPart) +
+                     p[0].HighPart;
+    p[0].HighPart  = x.LowPart;
+    p[1].QuadPart  = UInt32x32To64(ua.HighPart, ub.HighPart) + x.HighPart;
+
+    if (d != 0) {
+        /*  ud is d as a 128 bit two's complement value, negated first
+            when the product is negative since p holds a magnitude:
+            ud[0] is the low 64 bits and ud[1] the sign extension */
+        ULARGE_INTEGER ud[2];
+        if (bSign) {
+            ud[0].QuadPart = (DWORDLONG)(-d);
+            if (d > 0) {
+                /*  -d < 0 */
+                ud[1].QuadPart = (DWORDLONG)(LONGLONG)-1;
+            } else {
+                ud[1].QuadPart = (DWORDLONG)0;
+            }
+        } else {
+            ud[0].QuadPart = (DWORDLONG)d;
+            if (d < 0) {
+                ud[1].QuadPart = (DWORDLONG)(LONGLONG)-1;
+            } else {
+                ud[1].QuadPart = (DWORDLONG)0;
+            }
+        }
+        /*  Now do extended addition */
+        ULARGE_INTEGER uliTotal;
+
+        /*  Add ls DWORDs */
+        uliTotal.QuadPart  = (DWORDLONG)ud[0].LowPart + p[0].LowPart;
+        p[0].LowPart       = uliTotal.LowPart;
+
+        /*  Propagate carry */
+        uliTotal.LowPart   = uliTotal.HighPart;
+        uliTotal.HighPart  = 0;
+
+        /*  Add 2nd most ls DWORDs */
+        uliTotal.QuadPart += (DWORDLONG)ud[0].HighPart + p[0].HighPart;
+        p[0].HighPart      = uliTotal.LowPart;
+
+        /*  Propagate carry */
+        uliTotal.LowPart   = uliTotal.HighPart;
+        uliTotal.HighPart  = 0;
+
+        /*  Add MS DWORDLONGs - no carry expected */
+        p[1].QuadPart     += ud[1].QuadPart + uliTotal.QuadPart;
+
+        /*  Now see if we got a sign change from the addition */
+        if ((LONG)p[1].HighPart < 0) {
+            bSign = !bSign;
+
+            /*  Negate the current value (ugh!) */
+            /*  128 bit two's complement negate: invert both halves, add 1
+                to the low half and carry into the high half when the low
+                half wrapped round to zero */
+            p[0].QuadPart  = ~p[0].QuadPart;
+            p[1].QuadPart  = ~p[1].QuadPart;
+            p[0].QuadPart += 1;
+            p[1].QuadPart += (p[0].QuadPart == 0);
+        }
+    }
+
+    /*  Now for the division */
+    if (c < 0) {
+        bSign = !bSign;
+    }
+
+
+    /*  This will catch c == 0 and overflow */
+    if (uc <= p[1].QuadPart) {
+        return bSign ? (LONGLONG)0x8000000000000000 :
+                       (LONGLONG)0x7FFFFFFFFFFFFFFF;
+    }
+
+    DWORDLONG ullResult;
+
+    /*  Do the division */
+    /*  If the dividend is a DWORD_LONG use the compiler */
+    if (p[1].QuadPart == 0) {
+        ullResult = p[0].QuadPart / uc;
+        return bSign ? -(LONGLONG)ullResult : (LONGLONG)ullResult;
+    }
+
+    /*  If the divisor is a DWORD then it's simpler */
+    ULARGE_INTEGER ulic;
+    ulic.QuadPart = uc;
+    if (ulic.HighPart == 0) {
+        /*  Schoolbook division in two steps: the top 64 bits of the
+            96 bit dividend are divided first, and the remainder is fed
+            back into p[0].HighPart for the second division */
+        ULARGE_INTEGER uliDividend;
+        ULARGE_INTEGER uliResult;
+        DWORD dwDivisor = (DWORD)uc;
+        // ASSERT(p[1].HighPart == 0 && p[1].LowPart < dwDivisor);
+        uliDividend.HighPart = p[1].LowPart;
+        uliDividend.LowPart = p[0].HighPart;
+#ifndef USE_LARGEINT
+        uliResult.HighPart = (DWORD)(uliDividend.QuadPart / dwDivisor);
+        p[0].HighPart = (DWORD)(uliDividend.QuadPart % dwDivisor);
+        uliResult.LowPart = 0;
+        uliResult.QuadPart = p[0].QuadPart / dwDivisor + uliResult.QuadPart;
+#else
+        /*  NOTE - this routine will take exceptions if
+            the result does not fit in a DWORD
+        */
+        if (uliDividend.QuadPart >= (DWORDLONG)dwDivisor) {
+            uliResult.HighPart = EnlargedUnsignedDivide(
+                                     uliDividend,
+                                     dwDivisor,
+                                     &p[0].HighPart);
+        } else {
+            uliResult.HighPart = 0;
+        }
+        uliResult.LowPart = EnlargedUnsignedDivide(
+                                 p[0],
+                                 dwDivisor,
+                                 NULL);
+#endif
+        return bSign ? -(LONGLONG)uliResult.QuadPart :
+                        (LONGLONG)uliResult.QuadPart;
+    }
+
+
+    ullResult = 0;
+
+    /*  OK - do long division */
+    /*  Shift-and-subtract: each of the 64 iterations shifts the 128 bit
+        dividend left by one bit and produces one quotient bit */
+    for (int i = 0; i < 64; i++) {
+        ullResult <<= 1;
+
+        /*  Shift 128 bit p left 1 */
+        p[1].QuadPart <<= 1;
+        if ((p[0].HighPart & 0x80000000) != 0) {
+            p[1].LowPart++;
+        }
+        p[0].QuadPart <<= 1;
+
+        /*  Compare */
+        if (uc <= p[1].QuadPart) {
+            p[1].QuadPart -= uc;
+            ullResult += 1;
+        }
+    }
+
+    return bSign ? - (LONGLONG)ullResult : (LONGLONG)ullResult;
+}
+
+/*   Compute (a * b + d) / c for a 64 bit a and 32 bit b, c and d.
+
+     The product is formed as a 96 bit magnitude held in p1:p0 (p1 is the
+     most significant 32 bits), with its sign tracked separately in bSign.
+     d is then added, the result is divided by |c| with
+     EnlargedUnsignedDivide(), and the sign is applied to the quotient.
+
+     Returns the quotient, or - when c == 0 or the quotient does not fit -
+     0x7FFFFFFFFFFFFFFF for a positive result and 0x8000000000000000 for a
+     negative one.
+
+     NOTE(review): the absolute values of a, b and c are taken with a
+     plain unary minus, so the most negative value of each type looks
+     unhandled - confirm callers never pass it.
+*/
+LONGLONG WINAPI Int64x32Div32(LONGLONG a, LONG b, LONG c, LONG d)
+{
+    ULARGE_INTEGER ua;
+    DWORD ub;
+    DWORD uc;
+
+    /*  Compute the absolute values to avoid signed arithmetic problems */
+    ua.QuadPart = (DWORDLONG)(a >= 0 ? a : -a);
+    ub = (DWORD)(b >= 0 ? b : -b);
+    uc = (DWORD)(c >= 0 ? c : -c);
+    /*  bSign is TRUE when a * b is negative */
+    BOOL bSign = (a < 0) ^ (b < 0);
+
+    /*  Do long multiplication */
+    ULARGE_INTEGER p0;
+    DWORD p1;
+    p0.QuadPart  = UInt32x32To64(ua.LowPart, ub);
+
+    /*  The high half of a only contributes when it is non-zero */
+    if (ua.HighPart != 0) {
+        ULARGE_INTEGER x;
+        x.QuadPart     = UInt32x32To64(ua.HighPart, ub) + p0.HighPart;
+        p0.HighPart  = x.LowPart;
+        p1   = x.HighPart;
+    } else {
+        p1 = 0;
+    }
+
+    if (d != 0) {
+        /*  ud1:ud0 is d as a 96 bit two's complement value, negated
+            first when the product is negative since p holds a magnitude */
+        ULARGE_INTEGER ud0;
+        DWORD ud1;
+
+        if (bSign) {
+            //
+            //  Cast d to LONGLONG first otherwise -0x80000000 sign extends
+            //  incorrectly
+            //
+            ud0.QuadPart = (DWORDLONG)(-(LONGLONG)d);
+            if (d > 0) {
+                /*  -d < 0 */
+                ud1 = (DWORD)-1;
+            } else {
+                ud1 = (DWORD)0;
+            }
+        } else {
+            ud0.QuadPart = (DWORDLONG)d;
+            if (d < 0) {
+                ud1 = (DWORD)-1;
+            } else {
+                ud1 = (DWORD)0;
+            }
+        }
+        /*  Now do extended addition */
+        ULARGE_INTEGER uliTotal;
+
+        /*  Add ls DWORDs */
+        uliTotal.QuadPart  = (DWORDLONG)ud0.LowPart + p0.LowPart;
+        p0.LowPart       = uliTotal.LowPart;
+
+        /*  Propagate carry */
+        uliTotal.LowPart   = uliTotal.HighPart;
+        uliTotal.HighPart  = 0;
+
+        /*  Add 2nd most ls DWORDs */
+        uliTotal.QuadPart += (DWORDLONG)ud0.HighPart + p0.HighPart;
+        p0.HighPart      = uliTotal.LowPart;
+
+        /*  Add MS DWORDs (the carry is in uliTotal.HighPart) - no carry
+            out expected */
+        p1 += ud1 + uliTotal.HighPart;
+
+        /*  Now see if we got a sign change from the addition */
+        if ((LONG)p1 < 0) {
+            bSign = !bSign;
+
+            /*  Negate the current value (ugh!) */
+            /*  96 bit two's complement negate: invert both parts, add 1
+                to the low part and carry into p1 when it wrapped to zero */
+            p0.QuadPart  = ~p0.QuadPart;
+            p1 = ~p1;
+            p0.QuadPart += 1;
+            p1 += (p0.QuadPart == 0);
+        }
+    }
+
+    /*  Now for the division */
+    if (c < 0) {
+        bSign = !bSign;
+    }
+
+
+    /*  This will catch c == 0 and overflow */
+    if (uc <= p1) {
+        return bSign ? (LONGLONG)0x8000000000000000 :
+                       (LONGLONG)0x7FFFFFFFFFFFFFFF;
+    }
+
+    /*  Do the division */
+
+    /*  The divisor is always a DWORD here, so the division is done in two
+        steps: the top 64 bits of the dividend first, with the remainder
+        fed back into p0.HighPart for the second step */
+    ULARGE_INTEGER uliDividend;
+    ULARGE_INTEGER uliResult;
+    DWORD dwDivisor = uc;
+    uliDividend.HighPart = p1;
+    uliDividend.LowPart = p0.HighPart;
+    /*  NOTE - this routine will take exceptions if
+        the result does not fit in a DWORD
+    */
+    if (uliDividend.QuadPart >= (DWORDLONG)dwDivisor) {
+        uliResult.HighPart = EnlargedUnsignedDivide(
+                                 uliDividend,
+                                 dwDivisor,
+                                 &p0.HighPart);
+    } else {
+        uliResult.HighPart = 0;
+    }
+    uliResult.LowPart = EnlargedUnsignedDivide(
+                             p0,
+                             dwDivisor,
+                             NULL);
+    return bSign ? -(LONGLONG)uliResult.QuadPart :
+                    (LONGLONG)uliResult.QuadPart;
+}
+
+#ifdef DEBUG
+/******************************Public*Routine******************************\
+* Debug CCritSec helpers
+*
+* We provide debug versions of the Constructor, destructor, Lock and Unlock
+* routines.  The debug code tracks who owns each critical section by
+* maintaining a depth count.
+*
+* History:
+*
+\**************************************************************************/
+
+// Debug constructor: initialise the critical section and start with no
+// owner, a zero lock depth and tracing switched off.
+CCritSec::CCritSec(DWORD id)
+{
+    InitializeCriticalSection(&m_CritSec);
+
+    m_id           = id;
+    m_lockCount    = 0;
+    m_currentOwner = 0;
+    m_fTrace       = FALSE;
+}
+
+// Debug destructor: release the underlying critical section.
+CCritSec::~CCritSec()
+{
+    DeleteCriticalSection( &m_CritSec );
+}
+
+// Debug Lock(): enter the critical section, keeping the owner and depth
+// bookkeeping up to date and logging when tracing is enabled.
+void CCritSec::Lock()
+{
+    const DWORD dwSelf = GetCurrentThreadId();
+    const DWORD dwOwnerSnapshot = m_currentOwner;
+
+    // The "now owns" message is normally logged at level 3.  If we log the
+    // "about to wait" message it is promoted to level 2, so anyone who saw
+    // the wait also sees the acquisition.
+    UINT uAcquireLevel = 3;
+
+    // already owned, but not by us
+    const bool fOwnedByOther = dwOwnerSnapshot && (dwOwnerSnapshot != dwSelf);
+    if (fOwnedByOther && m_fTrace) {
+        DbgLog((LOG_LOCKING, 2, TEXT("Thread %d about to wait for lock %x owned by %d"),
+            GetCurrentThreadId(), &m_CritSec, dwOwnerSnapshot));
+        uAcquireLevel = 2;
+    }
+
+    EnterCriticalSection(&m_CritSec);
+
+    // The depth count is bumped on every entry; only the outermost entry
+    // records the owner.
+    if (0 == m_lockCount++) {
+        m_currentOwner = dwSelf;
+
+        if (m_fTrace) {
+            DbgLog((LOG_LOCKING, uAcquireLevel, TEXT("Thread %d now owns lock %x"), m_currentOwner, &m_CritSec));
+        }
+    }
+}
+
+// Debug Unlock(): drop one level of ownership, clearing the recorded owner
+// when the outermost level is released, then leave the critical section.
+void CCritSec::Unlock()
+{
+    --m_lockCount;
+
+    if (m_lockCount == 0) {
+        // about to be unowned
+        if (m_fTrace) {
+            DbgLog((LOG_LOCKING, 3, TEXT("Thread %d releasing lock %x"), m_currentOwner, &m_CritSec));
+        }
+
+        m_currentOwner = 0;
+    }
+
+    LeaveCriticalSection(&m_CritSec);
+}
+
+// Switch lock tracing on (fTrace non-zero) or off for one critical section.
+void WINAPI DbgLockTrace(CCritSec * pcCrit, BOOL fTrace)
+{
+    (*pcCrit).m_fTrace = fTrace;
+}
+
+// Returns non-zero when the calling thread is the recorded owner of the
+// critical section.
+BOOL WINAPI CritCheckIn(CCritSec * pcCrit)
+{
+    const DWORD dwSelf = GetCurrentThreadId();
+    return (dwSelf == pcCrit->m_currentOwner);
+}
+
+// Overload of CritCheckIn for a const critical section.
+BOOL WINAPI CritCheckIn(const CCritSec * pcCrit)
+{
+    const DWORD dwSelf = GetCurrentThreadId();
+    return (dwSelf == pcCrit->m_currentOwner);
+}
+
+// Returns non-zero when the calling thread is NOT the recorded owner of
+// the critical section.
+BOOL WINAPI CritCheckOut(CCritSec * pcCrit)
+{
+    const DWORD dwSelf = GetCurrentThreadId();
+    return (dwSelf != pcCrit->m_currentOwner);
+}
+
+// Overload of CritCheckOut for a const critical section.
+BOOL WINAPI CritCheckOut(const CCritSec * pcCrit)
+{
+    const DWORD dwSelf = GetCurrentThreadId();
+    return (dwSelf != pcCrit->m_currentOwner);
+}
+#endif
+
+
+// Store a newly allocated BSTR copy of szSrc in *pstrDest.
+// Returns NOERROR on success, or E_OUTOFMEMORY when SysAllocString()
+// returned NULL (*pstrDest is then NULL).
+STDAPI WriteBSTR(BSTR *pstrDest, LPCWSTR szSrc)
+{
+    BSTR bstrCopy = SysAllocString( szSrc );
+    *pstrDest = bstrCopy;
+
+    return bstrCopy ? NOERROR : E_OUTOFMEMORY;
+}
+
+
+// Free the BSTR that *pstr refers to.
+// Returns NOERROR after freeing, or S_FALSE when *pstr is NULL and there is
+// nothing to free.  *pstr itself is left unchanged.
+STDAPI FreeBSTR(BSTR* pstr)
+{
+    if( *pstr != NULL ) {
+        SysFreeString( *pstr );
+        return NOERROR;
+    }
+
+    return S_FALSE;
+}
+
+
+// Return a copy of the wide string psz in memory allocated with
+// CoTaskMemAlloc; the copy is stored in *ppszReturn.
+// Returns:
+//    S_OK          - no error
+//    E_POINTER     - ppszReturn == NULL, or psz == NULL (in which case
+//                    *ppszReturn is set to NULL)
+//    E_OUTOFMEMORY - can't allocate memory for returned string
+STDAPI AMGetWideString(LPCWSTR psz, LPWSTR *ppszReturn)
+{
+    CheckPointer(ppszReturn, E_POINTER);
+    ValidateReadWritePtr(ppszReturn, sizeof(LPWSTR));
+
+    // lstrlenW() reports 0 for a NULL string, so without this check the
+    // CopyMemory() below would read the terminator from a NULL source.
+    if (psz == NULL) {
+        *ppszReturn = NULL;
+        return E_POINTER;
+    }
+
+    // size in bytes, including the terminating NUL
+    DWORD nameLen = sizeof(WCHAR) * (lstrlenW(psz)+1);
+    *ppszReturn = (LPWSTR)CoTaskMemAlloc(nameLen);
+    if (*ppszReturn == NULL) {
+       return E_OUTOFMEMORY;
+    }
+    CopyMemory(*ppszReturn, psz, nameLen);
+    return NOERROR;
+}
+
+// Waits for the HANDLE hObject.  While waiting messages sent
+// to windows on our thread by SendMessage will be processed.
+// Using this function to do waits and mutual exclusion
+// avoids some deadlocks in objects with windows.
+//
+// Parameters:
+//    hObject - handle to wait for
+//    dwWait  - timeout in milliseconds, or INFINITE
+//    hwnd    - optional window; when non-NULL, posted messages also wake
+//              the wait and messages uMsg for this window are dispatched
+//    uMsg    - the message dispatched for hwnd
+//    hEvent  - optional second handle to wait for (NULL for none)
+//
+// Return codes are the same as for WaitForSingleObject (when hEvent is
+// supplied and is the handle that became signalled, the result is
+// WAIT_OBJECT_0 + 1).
+DWORD WINAPI WaitDispatchingMessages(
+    HANDLE hObject,
+    DWORD dwWait,
+    HWND hwnd,
+    UINT uMsg,
+    HANDLE hEvent)
+{
+    BOOL bPeeked = FALSE;
+    DWORD dwResult;
+    DWORD dwStart = 0;
+
+    // Thread priorities are signed (the below-normal levels are negative),
+    // so the saved priority must be an int: held in a DWORD, a negative
+    // priority compared as a huge unsigned value and such a thread was
+    // never raised.
+    int nThreadPriority = THREAD_PRIORITY_NORMAL;
+
+    static UINT uMsgId = 0;
+
+    HANDLE hObjects[2] = { hObject, hEvent };
+    if (dwWait != INFINITE && dwWait != 0) {
+        dwStart = GetTickCount();
+    }
+    for (; ; ) {
+        DWORD nCount = NULL != hEvent ? 2 : 1;
+
+        //  Minimize the chance of actually dispatching any messages
+        //  by seeing if we can lock immediately.
+        dwResult = WaitForMultipleObjects(nCount, hObjects, FALSE, 0);
+        if (dwResult < WAIT_OBJECT_0 + nCount) {
+            break;
+        }
+
+        //  Wait in slices of at most 10ms so that messages get looked at
+        //  regularly even when nothing wakes the wait.
+        DWORD dwTimeOut = dwWait;
+        if (dwTimeOut > 10) {
+            dwTimeOut = 10;
+        }
+        dwResult = MsgWaitForMultipleObjects(
+                             nCount,
+                             hObjects,
+                             FALSE,
+                             dwTimeOut,
+                             hwnd == NULL ? QS_SENDMESSAGE :
+                                            QS_SENDMESSAGE + QS_POSTMESSAGE);
+
+        //  Carry on either when a message woke us, or when only a slice
+        //  (rather than the whole remaining wait) timed out.
+        if (dwResult == WAIT_OBJECT_0 + nCount ||
+            (dwResult == WAIT_TIMEOUT && dwTimeOut != dwWait)) {
+            MSG msg;
+            if (hwnd != NULL) {
+                while (PeekMessage(&msg, hwnd, uMsg, uMsg, PM_REMOVE)) {
+                    DispatchMessage(&msg);
+                }
+            }
+            // Do this anyway - the previous peek doesn't flush out the
+            // messages
+            PeekMessage(&msg, NULL, 0, 0, PM_NOREMOVE);
+
+            if (dwWait != INFINITE && dwWait != 0) {
+                DWORD dwNow = GetTickCount();
+
+                // Working with differences handles wrap-around
+                DWORD dwDiff = dwNow - dwStart;
+                if (dwDiff > dwWait) {
+                    dwWait = 0;
+                } else {
+                    dwWait -= dwDiff;
+                }
+                dwStart = dwNow;
+            }
+            if (!bPeeked) {
+                //  Raise our priority to prevent our message queue
+                //  building up
+                nThreadPriority = GetThreadPriority(GetCurrentThread());
+                if (nThreadPriority < THREAD_PRIORITY_HIGHEST) {
+                    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
+                }
+                bPeeked = TRUE;
+            }
+        } else {
+            break;
+        }
+    }
+    if (bPeeked) {
+        //  Put the priority back to what it was before the first peek
+        SetThreadPriority(GetCurrentThread(), nThreadPriority);
+        if (HIWORD(GetQueueStatus(QS_POSTMESSAGE)) & QS_POSTMESSAGE) {
+            if (uMsgId == 0) {
+                uMsgId = RegisterWindowMessage(TEXT("AMUnblock"));
+            }
+            if (uMsgId != 0) {
+                MSG msg;
+                //  Remove old ones
+                while (PeekMessage(&msg, (HWND)-1, uMsgId, uMsgId, PM_REMOVE)) {
+                }
+            }
+            //  Posted even if the registration failed (uMsgId == 0)
+            PostThreadMessage(GetCurrentThreadId(), uMsgId, 0, 0);
+        }
+    }
+    return dwResult;
+}
+
+HRESULT AmGetLastErrorToHResult()
+{
+    DWORD dwLastError = GetLastError();
+    if(dwLastError != 0)
+    {
+        return HRESULT_FROM_WIN32(dwLastError);
+    }
+    else
+    {
+        return E_FAIL;
+    }
+}
+
+// Assign lp to the interface pointer slot *pp with correct reference
+// counting, and return lp.  The new pointer is AddRef'd before the old one
+// is Released.
+IUnknown* QzAtlComPtrAssign(IUnknown** pp, IUnknown* lp)
+{
+    if (lp) {
+        lp->AddRef();
+    }
+
+    if (IUnknown * pPrevious = *pp) {
+        pPrevious->Release();
+    }
+
+    *pp = lp;
+    return lp;
+}
+
+/******************************************************************************
+
+CompatibleTimeSetEvent
+
+    CompatibleTimeSetEvent() sets the TIME_KILL_SYNCHRONOUS flag before calling
+timeSetEvent() if the current operating system supports it.  TIME_KILL_SYNCHRONOUS
+is supported on Windows XP and later operating systems.
+
+Parameters:
+- The same parameters as timeSetEvent().  See timeSetEvent()'s documentation in
+the Platform SDK for more information.
+
+Return Value:
+- The same return value as timeSetEvent().  See timeSetEvent()'s documentation in
+the Platform SDK for more information.
+
+******************************************************************************/
+MMRESULT CompatibleTimeSetEvent( UINT uDelay, UINT uResolution, LPTIMECALLBACK lpTimeProc, DWORD_PTR dwUser, UINT fuEvent )
+{
+    #if WINVER >= 0x0501
+    {
+        // The OS version is probed on the first call only; the answer is
+        // cached in function-local statics for later calls.
+        static bool fProbed = false;
+        static bool fKillSyncSupported = false;
+
+        if( !fProbed ) {
+            fKillSyncSupported = TimeKillSynchronousFlagAvailable();
+            fProbed = true;
+        }
+
+        if( fKillSyncSupported ) {
+            fuEvent |= TIME_KILL_SYNCHRONOUS;
+        }
+    }
+    #endif // WINVER >= 0x0501
+
+    // All other arguments are passed through to timeSetEvent() untouched.
+    return timeSetEvent( uDelay, uResolution, lpTimeProc, dwUser, fuEvent );
+}
+
+// Report whether the running OS is version 5.1 or later.  Windows XP is
+// version 5.1, and timeSetEvent() started supporting the
+// TIME_KILL_SYNCHRONOUS flag in Windows XP.
+// Returns false when the version cannot be queried.
+bool TimeKillSynchronousFlagAvailable( void )
+{
+    OSVERSIONINFO osvi;
+    osvi.dwOSVersionInfoSize = sizeof(osvi);
+
+    if( !GetVersionEx( &osvi ) ) {
+        return false;
+    }
+
+    // Any major version other than 5 is decided by the major version alone.
+    if( osvi.dwMajorVersion != 5 ) {
+        return osvi.dwMajorVersion > 5;
+    }
+
+    // Major version 5: the minor version must be at least 1.
+    return osvi.dwMinorVersion >= 1;
+}
diff --git a/plugins/GSdx_legacy/baseclasses/wxutil.h b/plugins/GSdx_legacy/baseclasses/wxutil.h
new file mode 100644
index 0000000000..f0d598d23f
--- /dev/null
+++ b/plugins/GSdx_legacy/baseclasses/wxutil.h
@@ -0,0 +1,541 @@
+//------------------------------------------------------------------------------
+// File: WXUtil.h
+//
+// Desc: DirectShow base classes - defines helper classes and functions for
+//       building multimedia filters.
+//
+// Copyright (c) Microsoft Corporation.  All rights reserved.
+//------------------------------------------------------------------------------
+
+
+#ifndef __WXUTIL__
+#define __WXUTIL__
+
+// eliminate spurious "statement has no effect" warnings.
+#pragma warning(disable: 4705)
+
+// wrapper for whatever critical section we have
+class CCritSec {
+
+    // Copying is disabled: the copy constructor and assignment operator are
+    // private and only declared here.
+    CCritSec(const CCritSec &refCritSec);
+    CCritSec &operator=(const CCritSec &refCritSec);
+
+    CRITICAL_SECTION m_CritSec;
+
+#ifdef DEBUG
+public:
+    // Extra bookkeeping members that exist in DEBUG builds only.
+    DWORD   m_id;
+    DWORD   m_currentOwner;
+    DWORD   m_lockCount;
+    BOOL    m_fTrace;        // Trace this one
+public:
+    // DEBUG builds only declare these; the definitions live outside this header.
+    CCritSec(DWORD id = 0);
+    ~CCritSec();
+    void Lock();
+    void Unlock();
+#else
+
+public:
+    // Non-DEBUG builds: each member maps 1:1 onto the matching Win32 call.
+    CCritSec()  { InitializeCriticalSection(&m_CritSec); }
+    ~CCritSec() { DeleteCriticalSection(&m_CritSec); }
+
+    void Lock()   { EnterCriticalSection(&m_CritSec); }
+    void Unlock() { LeaveCriticalSection(&m_CritSec); }
+#endif
+};
+
+//
+// To make deadlocks easier to track it is useful to insert in the
+// code an assertion that says whether we own a critical section or
+// not.  We make the routines that do the checking globals to avoid
+// having different numbers of member functions in the debug and
+// retail class implementations of CCritSec.  In addition we provide
+// a routine that allows usage of specific critical sections to be
+// traced.  This is NOT on by default - there are far too many.
+//
+
+#ifdef DEBUG
+    BOOL WINAPI CritCheckIn(CCritSec * pcCrit);
+    BOOL WINAPI CritCheckIn(const CCritSec * pcCrit);
+    BOOL WINAPI CritCheckOut(CCritSec * pcCrit);
+    BOOL WINAPI CritCheckOut(const CCritSec * pcCrit);
+    void WINAPI DbgLockTrace(CCritSec * pcCrit, BOOL fTrace);
+#else
+    #define CritCheckIn(x) TRUE
+    #define CritCheckOut(x) TRUE
+    #define DbgLockTrace(pc, fT)
+#endif
+
+
+// locks a critical section, and unlocks it automatically
+// when the lock goes out of scope
+class CAutoLock {
+
+    // Copying is disabled: both operations are private and only declared here.
+    CAutoLock(const CAutoLock &refAutoLock);
+    CAutoLock &operator=(const CAutoLock &refAutoLock);
+
+protected:
+    CCritSec * m_pLock;
+
+public:
+    // Takes the lock immediately. plock is dereferenced without a null check,
+    // so the caller must pass a valid critical section.
+    CAutoLock(CCritSec * plock)
+        : m_pLock(plock)
+    {
+        m_pLock->Lock();
+    }
+
+    // Releases the lock acquired by the constructor.
+    ~CAutoLock()
+    {
+        m_pLock->Unlock();
+    }
+};
+
+
+
+// wrapper for event objects
+class CAMEvent
+{
+
+    // Copying is disabled: both operations are private and only declared here.
+    CAMEvent(const CAMEvent &refEvent);
+    CAMEvent &operator=(const CAMEvent &refEvent);
+
+protected:
+    HANDLE m_hEvent;
+
+public:
+    CAMEvent(BOOL fManualReset = FALSE);
+    ~CAMEvent();
+
+    // Read-only conversion to the underlying HANDLE (not usable as an lvalue).
+    operator HANDLE () const { return m_hEvent; }
+
+    // Signals the event; the SetEvent() result is checked through EXECUTE_ASSERT.
+    void Set()
+    {
+        EXECUTE_ASSERT(SetEvent(m_hEvent));
+    }
+
+    // Waits up to dwTimeout milliseconds; TRUE only when the wait result is
+    // WAIT_OBJECT_0 (i.e. the event was signalled).
+    BOOL Wait(DWORD dwTimeout = INFINITE)
+    {
+        const DWORD dwResult = WaitForSingleObject(m_hEvent, dwTimeout);
+        return (dwResult == WAIT_OBJECT_0);
+    }
+
+    // Clears the event; the ResetEvent() result is ignored.
+    void Reset() { ResetEvent(m_hEvent); }
+
+    // Non-blocking poll: a wait with a zero timeout.
+    BOOL Check() { return Wait(0); }
+};
+
+
+// wrapper for event objects that do message processing
+// This adds ONE method to the CAMEvent object to allow sent
+// messages to be processed while waiting
+
+class CAMMsgEvent : public CAMEvent
+{
+
+public:
+
+    // Allow SEND messages to be processed while waiting.
+    // dwTimeout defaults to INFINITE. Only declared here; the implementation
+    // (and the exact meaning of the BOOL result) lives outside this header.
+    BOOL WaitMsg(DWORD dwTimeout = INFINITE);
+};
+
+// old name supported for the time being
+#define CTimeoutEvent CAMEvent
+
+// support for a worker thread
+
+// simple thread class supports creation of worker thread, synchronization
+// and communication. Can be derived to simplify parameter passing
+class AM_NOVTABLE CAMThread {
+
+    // Copying is disabled: both operations are private and only declared here.
+    CAMThread(const CAMThread &refThread);
+    CAMThread &operator=(const CAMThread &refThread);
+
+    CAMEvent m_EventSend;
+    CAMEvent m_EventComplete;
+
+    DWORD m_dwParam;
+    DWORD m_dwReturnVal;
+
+protected:
+    HANDLE m_hThread;
+
+    // Body of the worker thread, run on startup.
+    // Must be supplied by the derived class.
+    virtual DWORD ThreadProc() = 0;
+
+public:
+    CAMThread();
+    virtual ~CAMThread();
+
+    CCritSec m_AccessLock;  // locks access by client threads
+    CCritSec m_WorkerLock;  // locks access to shared objects
+
+    // Initial thread entry point. pv is actually 'this'; the function
+    // recovers the object and calls ThreadProc.
+    static DWORD WINAPI InitialThreadProc(LPVOID pv);
+
+    // Start the thread running - error if already running.
+    BOOL Create();
+
+    // Signal the thread, and block for a response.
+    DWORD CallWorker(DWORD);
+
+    // Called by the accessor thread when it is done with the worker (having
+    // told it to exit). Atomically takes ownership of the handle, then waits
+    // for the thread to finish and closes the handle.
+    void Close()
+    {
+        #pragma warning( push )
+        // C4312: 'type cast' : conversion from 'LONG' to 'PVOID' of greater size
+        //
+        // This code works correctly on 32-bit and 64-bit systems.
+        #pragma warning( disable : 4312 )
+        HANDLE hThread = (HANDLE)InterlockedExchangePointer(&m_hThread, 0);
+        #pragma warning( pop )
+
+        // No thread was running (or Close was already called): nothing to do.
+        if (!hThread) {
+            return;
+        }
+
+        WaitForSingleObject(hThread, INFINITE);
+        CloseHandle(hThread);
+    }
+
+    // TRUE while a thread handle is held, FALSE otherwise.
+    BOOL ThreadExists(void) const
+    {
+        return (m_hThread != 0) ? TRUE : FALSE;
+    }
+
+    // Wait for the next request.
+    DWORD GetRequest();
+
+    // Is there a request?
+    BOOL CheckRequest(DWORD * pParam);
+
+    // Reply to the request.
+    void Reply(DWORD);
+
+    // If you want to do WaitForMultipleObjects you'll need to include
+    // this handle in your wait list or you won't be responsive.
+    HANDLE GetRequestHandle() const { return m_EventSend; }
+
+    // Find out what the request was.
+    DWORD GetRequestParam() const { return m_dwParam; }
+
+    // Call CoInitializeEx (COINIT_DISABLE_OLE1DDE) if
+    // available. S_FALSE means it's not available.
+    static HRESULT CoInitializeHelper();
+};
+
+
+// CQueue
+//
+// Implements a simple Queue ADT.  The queue contains a finite number of
+// objects, access to which is controlled by a semaphore.  The semaphore
+// is created with an initial count (N).  Each time an object is added
+// a call to WaitForSingleObject is made on the semaphore's handle.  When
+// this function returns a slot has been reserved in the queue for the new
+// object.  If no slots are available the function blocks until one becomes
+// available.  Each time an object is removed from the queue ReleaseSemaphore
+// is called on the semaphore's handle, thus freeing a slot in the queue.
+// If no objects are present in the queue the function blocks until an
+// object has been added.
+
+#define DEFAULT_QUEUESIZE   2
+
+template <class T> class CQueue {
+private:
+    HANDLE          hSemPut;        // Semaphore controlling queue "putting"
+    HANDLE          hSemGet;        // Semaphore controlling queue "getting"
+    CRITICAL_SECTION CritSect;      // Thread serialization
+    int             nMax;           // Max objects allowed in queue
+    int             iNextPut;       // Array index of next "PutMsg", always in [0, nMax)
+    int             iNextGet;       // Array index of next "GetMsg", always in [0, nMax)
+    T              *QueueObjects;   // Array of nMax objects
+
+    // Shared constructor body: creates an empty queue with room for n objects.
+    // hSemPut starts at n (all slots free), hSemGet starts at 0 (nothing to get).
+    void Initialize(int n) {
+        iNextPut = iNextGet = 0;
+        nMax = n;
+        InitializeCriticalSection(&CritSect);
+        hSemPut = CreateSemaphore(NULL, n, n, NULL);
+        hSemGet = CreateSemaphore(NULL, 0, n, NULL);
+        QueueObjects = new T[n];
+    }
+
+
+public:
+    // Queue holding at most n objects.
+    CQueue(int n) {
+        Initialize(n);
+    }
+
+    // Queue holding at most DEFAULT_QUEUESIZE objects.
+    CQueue() {
+        Initialize(DEFAULT_QUEUESIZE);
+    }
+
+    ~CQueue() {
+        delete [] QueueObjects;
+        DeleteCriticalSection(&CritSect);
+        CloseHandle(hSemPut);
+        CloseHandle(hSemGet);
+    }
+
+    // Removes and returns the oldest object, blocking while the queue is empty.
+    T GetQueueObject() {
+        int iSlot;
+        T Object;
+        LONG lPrevious;
+
+        // Wait for someone to put something on our queue, returns straight
+        // away if there is already an object on the queue.
+        //
+        WaitForSingleObject(hSemGet, INFINITE);
+
+        EnterCriticalSection(&CritSect);
+        // Keep the index inside [0, nMax) instead of letting it grow forever:
+        // an ever-increasing int eventually overflows and '%' would then
+        // produce a negative array index.
+        iSlot = iNextGet;
+        iNextGet = (iNextGet + 1) % nMax;
+        Object = QueueObjects[iSlot];
+        LeaveCriticalSection(&CritSect);
+
+        // Release anyone waiting to put an object onto our queue as there
+        // is now space available in the queue.
+        //
+        ReleaseSemaphore(hSemPut, 1L, &lPrevious);
+        return Object;
+    }
+
+    // Appends Object, blocking while the queue is full.
+    void PutQueueObject(T Object) {
+        int iSlot;
+        LONG lPrevious;
+
+        // Wait for someone to get something from our queue, returns straight
+        // away if there is already an empty slot on the queue.
+        //
+        WaitForSingleObject(hSemPut, INFINITE);
+
+        EnterCriticalSection(&CritSect);
+        // Same wrap-around handling as in GetQueueObject().
+        iSlot = iNextPut;
+        iNextPut = (iNextPut + 1) % nMax;
+        QueueObjects[iSlot] = Object;
+        LeaveCriticalSection(&CritSect);
+
+        // Release anyone waiting to remove an object from our queue as there
+        // is now an object available to be removed.
+        //
+        ReleaseSemaphore(hSemGet, 1L, &lPrevious);
+    }
+};
+
+// miscellaneous string conversion functions
+// NOTE: as we need to use the same binaries on Win95 as on NT this code should
+// be compiled WITHOUT unicode being defined.  Otherwise we will not pick up
+// these internal routines and the binary will not run on Win95.
+
+// int WINAPIV wsprintfWInternal(LPWSTR, LPCWSTR, ...);
+
+//LPWSTR
+//WINAPI
+//lstrcpyWInternal(
+//    LPWSTR lpString1,
+//    LPCWSTR lpString2
+//    );
+LPWSTR
+WINAPI
+lstrcpynWInternal(
+    LPWSTR lpString1,
+    LPCWSTR lpString2,
+    int     iMaxLength
+    );
+int
+WINAPI
+lstrcmpWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    );
+int
+WINAPI
+lstrcmpiWInternal(
+    LPCWSTR lpString1,
+    LPCWSTR lpString2
+    );
+int
+WINAPI
+lstrlenWInternal(
+    LPCWSTR lpString
+    );
+
+#ifndef UNICODE
+#define wsprintfW wsprintfWInternal
+#define lstrcpyW lstrcpyWInternal
+#define lstrcpynW lstrcpynWInternal
+#define lstrcmpW lstrcmpWInternal
+#define lstrcmpiW lstrcmpiWInternal
+#define lstrlenW lstrlenWInternal
+#endif
+
+extern "C"
+void * __stdcall memmoveInternal(void *, const void *, size_t);
+
+inline void * __cdecl memchrInternal(const void *buf, int chr, size_t cnt)
+{
+    // Returns a pointer to the first byte equal to (unsigned char)chr within
+    // the first cnt bytes of buf, or NULL when there is no such byte.
+
+    // Nothing to scan. This guard matters for the x86 path below: with ecx == 0
+    // 'repne scasb' performs no comparison and leaves the Z flag untouched, so
+    // the following 'jnz' would branch on a stale flag and could return buf - 1.
+    if (cnt == 0) {
+        return NULL;
+    }
+
+#ifdef _X86_
+    void *pRet = NULL;
+
+    _asm {
+        cld                 // make sure we get the direction right
+        mov     ecx, cnt    // num of bytes to scan
+        mov     edi, buf    // pointer byte stream
+        mov     eax, chr    // byte to scan for
+        repne   scasb       // look for the byte in the byte stream
+        jnz     exit_memchr // Z flag set if byte found
+        dec     edi         // scasb always increments edi even when it
+                            // finds the required byte
+        mov     pRet, edi
+exit_memchr:
+    }
+    return pRet;
+
+#else
+    // Portable fallback: advance until the byte matches or the count runs out.
+    while ( cnt && (*(unsigned char *)buf != (unsigned char)chr) ) {
+        buf = (unsigned char *)buf + 1;
+        cnt--;
+    }
+
+    return(cnt ? (void *)buf : NULL);
+#endif
+}
+
+void WINAPI IntToWstr(int i, LPWSTR wstr, size_t len);
+
+#define WstrToInt(sz) _wtoi(sz)
+#define atoiW(sz) _wtoi(sz)
+#define atoiA(sz) atoi(sz)
+
+// These are available to help managing bitmap VIDEOINFOHEADER media structures
+
+extern const DWORD bits555[3];
+extern const DWORD bits565[3];
+extern const DWORD bits888[3];
+
+// These help convert between VIDEOINFOHEADER and BITMAPINFO structures
+
+STDAPI_(const GUID) GetTrueColorType(const BITMAPINFOHEADER *pbmiHeader);
+STDAPI_(const GUID) GetBitmapSubtype(const BITMAPINFOHEADER *pbmiHeader);
+STDAPI_(WORD) GetBitCount(const GUID *pSubtype);
+
+// strmbase.lib implements this for compatibility with people who
+// managed to link to this directly.  we don't want to advertise it.
+//
+// STDAPI_(/* T */ CHAR *) GetSubtypeName(const GUID *pSubtype);
+
+STDAPI_(CHAR *) GetSubtypeNameA(const GUID *pSubtype);
+STDAPI_(WCHAR *) GetSubtypeNameW(const GUID *pSubtype);
+
+#ifdef UNICODE
+#define GetSubtypeName GetSubtypeNameW
+#else
+#define GetSubtypeName GetSubtypeNameA
+#endif
+
+STDAPI_(LONG) GetBitmapFormatSize(const BITMAPINFOHEADER *pHeader);
+STDAPI_(DWORD) GetBitmapSize(const BITMAPINFOHEADER *pHeader);
+STDAPI_(BOOL) ContainsPalette(const VIDEOINFOHEADER *pVideoInfo);
+STDAPI_(const RGBQUAD *) GetBitmapPalette(const VIDEOINFOHEADER *pVideoInfo);
+
+
+// Compares two interfaces and returns TRUE if they are on the same object
+BOOL WINAPI IsEqualObject(IUnknown *pFirst, IUnknown *pSecond);
+
+// This is for comparing pins
+#define EqualPins(pPin1, pPin2) IsEqualObject(pPin1, pPin2)
+
+
+// Arithmetic helper functions
+
+// Compute (a * b + rnd) / c
+LONGLONG WINAPI llMulDiv(LONGLONG a, LONGLONG b, LONGLONG c, LONGLONG rnd);
+LONGLONG WINAPI Int64x32Div32(LONGLONG a, LONG b, LONG c, LONG rnd);
+
+
+// Avoids us dyna-linking to SysAllocString to copy BSTR strings
+STDAPI WriteBSTR(BSTR * pstrDest, LPCWSTR szSrc);
+STDAPI FreeBSTR(BSTR* pstr);
+
+// Return a wide string - allocating memory for it
+// Returns:
+//    S_OK          - no error
+//    E_POINTER     - ppszReturn == NULL
+//    E_OUTOFMEMORY - can't allocate memory for returned string
+STDAPI AMGetWideString(LPCWSTR pszString, LPWSTR *ppszReturn);
+
+// Special wait for objects owning windows
+DWORD WINAPI WaitDispatchingMessages(
+    HANDLE hObject,
+    DWORD dwWait,
+    HWND hwnd = NULL,
+    UINT uMsg = 0,
+    HANDLE hEvent = NULL);
+
+// HRESULT_FROM_WIN32 converts ERROR_SUCCESS to a success code, but in
+// our use of HRESULT_FROM_WIN32, it typically means a function failed
+// to call SetLastError(), and we still want a failure code.
+//
+#define AmHresultFromWin32(x) (MAKE_HRESULT(SEVERITY_ERROR, FACILITY_WIN32, x))
+
+// call GetLastError and return an HRESULT value that will fail the
+// SUCCEEDED() macro.
+HRESULT AmGetLastErrorToHResult(void);
+
+// duplicate of ATL's CComPtr to avoid linker conflicts.
+
+IUnknown* QzAtlComPtrAssign(IUnknown** pp, IUnknown* lp);
+
+template <class T>
+class QzCComPtr
+{
+public:
+	typedef T _PtrClass;
+
+	// Empty pointer.
+	QzCComPtr() {p=NULL;}
+
+	// Shares ownership of lp: AddRef is called when lp is non-NULL.
+	QzCComPtr(T* lp)
+	{
+		if ((p = lp) != NULL)
+			p->AddRef();
+	}
+
+	// Copy: both smart pointers hold their own reference afterwards.
+	QzCComPtr(const QzCComPtr<T>& lp)
+	{
+		if ((p = lp.p) != NULL)
+			p->AddRef();
+	}
+
+	// Drops the held reference, if any.
+	~QzCComPtr() {if (p) p->Release();}
+
+	// Drops the held reference and leaves the pointer NULL.
+	// p is cleared BEFORE calling Release(): if releasing the interface ends up
+	// destroying the object that owns this smart pointer, writing p afterwards
+	// would touch freed memory.
+	void Release()
+	{
+		T* pTemp = p;
+		p = NULL;
+		if (pTemp)
+			pTemp->Release();
+	}
+
+	operator T*() {return (T*)p;}
+	T& operator*() {ASSERT(p!=NULL); return *p; }
+	//The assert on operator& usually indicates a bug.  If this is really
+	//what is needed, however, take the address of the p member explicitly.
+	T** operator&() { ASSERT(p==NULL); return &p; }
+	T* operator->() { ASSERT(p!=NULL); return p; }
+
+	// Both assignments delegate the reference bookkeeping to QzAtlComPtrAssign.
+	T* operator=(T* lp){return (T*)QzAtlComPtrAssign((IUnknown**)&p, lp);}
+	T* operator=(const QzCComPtr<T>& lp)
+	{
+		return (T*)QzAtlComPtrAssign((IUnknown**)&p, lp.p);
+	}
+#if _MSC_VER>1020
+	bool operator!(){return (p == NULL);}
+#else
+	BOOL operator!(){return (p == NULL) ? TRUE : FALSE;}
+#endif
+	T* p;
+};
+
+MMRESULT CompatibleTimeSetEvent( UINT uDelay, UINT uResolution, LPTIMECALLBACK lpTimeProc, DWORD_PTR dwUser, UINT fuEvent );
+bool TimeKillSynchronousFlagAvailable( void );
+
+#endif /* __WXUTIL__ */
diff --git a/plugins/GSdx_legacy/boost_spsc_queue.hpp b/plugins/GSdx_legacy/boost_spsc_queue.hpp
new file mode 100644
index 0000000000..c1104a5de7
--- /dev/null
+++ b/plugins/GSdx_legacy/boost_spsc_queue.hpp
@@ -0,0 +1,177 @@
+// This file (boost_spsc_queue.hpp) is a stripped-down version of boost/lockfree/spsc_queue.hpp
+// Rationale
+// * Performance is better on Linux than the standard std::queue
+// * Performance is the same on Windows
+// => a 100-200MB dependency feels rather unfriendly
+
+// Potential optimization
+// * plug condition variable into the queue directly to avoid redundant m_count
+
+// * Restore boost optimization
+//   => unlikely or replace it with a % (if size is 2^n)
+
+
+//  lock-free single-producer/single-consumer ringbuffer
+//  this algorithm is implemented in various projects (linux kernel)
+//
+//  Copyright (C) 2009-2013 Tim Blechmann
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+// Boost Software License - Version 1.0 - August 17th, 2003
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+
+// Fixed-capacity single-producer/single-consumer ring buffer.
+// One slot is always kept free to tell "full" from "empty", so at most
+// max_size - 1 elements can be queued at a time.
+template <typename T, size_t max_size>
+class ringbuffer_base
+{
+    static const int padding_size = 64 - sizeof(size_t);
+
+    atomic<size_t> write_index_;
+    char padding1[padding_size]; /* force read_index and write_index to different cache lines */
+    atomic<size_t> read_index_;
+
+    T *buffer;
+
+    ringbuffer_base(ringbuffer_base const &) = delete;
+    ringbuffer_base(ringbuffer_base &&)      = delete;
+    const ringbuffer_base& operator=( const ringbuffer_base& ) = delete;
+
+    // Runs the destructor of every element still queued and advances the
+    // read index up to the write index. Not thread-safe.
+    void destroy_pending(void)
+    {
+        const size_t write_index = write_index_.load(memory_order_acquire);
+        size_t read_index = read_index_.load(memory_order_relaxed);
+
+        while (read_index != write_index) {
+            buffer[read_index].~T();
+            read_index = next_index(read_index);
+        }
+
+        read_index_.store(read_index, memory_order_release);
+    }
+
+public:
+    ringbuffer_base(void):
+        write_index_(0), read_index_(0)
+    {
+        // Use raw dynamic allocation here so that no T object is constructed
+        // up front. Otherwise the ringbuffer_base destructor would call the
+        // destructor of T for every slot, which crashes if T is an (invalid)
+        // shared_ptr.
+        //
+        // Note: another solution would be a char buffer in a union with T.
+        // NOTE(review): the allocation result is not checked for NULL.
+        buffer = (T*)_aligned_malloc(sizeof(T)*max_size, 32);
+    }
+
+    ~ringbuffer_base(void) {
+        // destroy all remaining items
+        destroy_pending();
+
+        _aligned_free(buffer);
+    }
+
+
+    // Index following arg, wrapping back to 0 at max_size.
+    static size_t next_index(size_t arg)
+    {
+        size_t ret = arg + 1;
+#if 0
+        while (unlikely(ret >= max_size))
+#else
+        while (ret >= max_size)
+#endif
+            ret -= max_size;
+        return ret;
+    }
+
+    // Producer side: copy-constructs t into the next free slot.
+    // Returns false (and stores nothing) when the buffer is full.
+    bool push(T const & t)
+    {
+        const size_t write_index = write_index_.load(memory_order_relaxed);  // only written from push thread
+        const size_t next = next_index(write_index);
+
+        if (next == read_index_.load(memory_order_acquire))
+            return false; /* ringbuffer is full */
+
+        new (buffer + write_index) T(t); // copy-construct
+
+        write_index_.store(next, memory_order_release);
+
+        return true;
+    }
+
+    // Consumer side: copy-assigns the oldest element into ret and destroys the
+    // stored copy. Returns false (ret untouched) when the buffer is empty.
+    bool pop (T & ret)
+    {
+        const size_t write_index = write_index_.load(memory_order_acquire);
+        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
+        if (empty(write_index, read_index))
+            return false;
+
+        ret = buffer[read_index];
+        buffer[read_index].~T();
+
+        size_t next = next_index(read_index);
+        read_index_.store(next, memory_order_release);
+        return true;
+    }
+
+    // Consumer side: calls f on the oldest element in place, then destroys it.
+    // Returns false (f not called) when the buffer is empty.
+    template <typename Functor>
+    bool consume_one(Functor & f)
+    {
+        const size_t write_index = write_index_.load(memory_order_acquire);
+        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
+        if (empty(write_index, read_index))
+            return false;
+
+        f(buffer[read_index]);
+        buffer[read_index].~T();
+
+        size_t next = next_index(read_index);
+        read_index_.store(next, memory_order_release);
+        return true;
+    }
+
+public:
+    /** reset the ringbuffer
+     *
+     * Elements that are still queued are destroyed first; simply zeroing the
+     * indexes would skip their destructors and leak whatever they own.
+     *
+     * \note Not thread-safe
+     * */
+    void reset(void)
+    {
+        destroy_pending();
+
+        write_index_.store(0, memory_order_relaxed);
+        read_index_.store(0, memory_order_release);
+    }
+
+    /** Check if the ringbuffer is empty
+     *
+     * \return true, if the ringbuffer is empty, false otherwise
+     * \note Due to the concurrent nature of the ringbuffer the result may be inaccurate.
+     * */
+    bool empty(void)
+    {
+        return empty(write_index_.load(memory_order_relaxed), read_index_.load(memory_order_relaxed));
+    }
+
+    /**
+     * \return true, if implementation is lock-free.
+     *
+     * */
+    bool is_lock_free(void) const
+    {
+        return write_index_.is_lock_free() && read_index_.is_lock_free();
+    }
+
+private:
+    // The buffer is empty exactly when both indexes coincide.
+    bool empty(size_t write_index, size_t read_index)
+    {
+        return write_index == read_index;
+    }
+};
diff --git a/plugins/GSdx_legacy/config.h b/plugins/GSdx_legacy/config.h
new file mode 100644
index 0000000000..9527c023c9
--- /dev/null
+++ b/plugins/GSdx_legacy/config.h
@@ -0,0 +1,54 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+//#define ENABLE_VTUNE
+
+#define ENABLE_JIT_RASTERIZER
+
+#define EXTERNAL_SHADER_LOADING 1
+
+//#define ENABLE_DYNAMIC_CRC_HACK
+#define DYNA_DLL_PATH "c:/dev/pcsx2/trunk/tools/dynacrchack/DynaCrcHack.dll"
+
+//#define DISABLE_HW_TEXTURE_CACHE // Slow but fixes a lot of bugs
+
+//#define DISABLE_BITMASKING
+
+//#define DISABLE_COLCLAMP
+
+//#define DISABLE_DATE
+
+
+// OpenGL debugging helpers are only switched on for debug/devel builds.
+#if defined(_DEBUG) || defined(_DEVEL)
+#define ENABLE_OGL_DEBUG   // Create a debug context and check OpenGL command status. Also allows dumping various textures/states.
+//#define ENABLE_OGL_DEBUG_FENCE
+#endif
+//#define ENABLE_OGL_DEBUG_MEM_BW // compute the quantity of data transferred (debug purpose)
+
+// Linux release builds (neither _DEBUG nor _DEVEL) turn the performance monitor off.
+#if defined(__linux__) && !(defined(_DEBUG) || defined(_DEVEL))
+#define DISABLE_PERF_MON // Perf monitoring burns cycles for nothing in release mode
+#endif
+
+#ifdef _WIN32
+//#define ENABLE_OPENCL
+#endif
diff --git a/plugins/GSdx_legacy/cpp_check.sh b/plugins/GSdx_legacy/cpp_check.sh
new file mode 100644
index 0000000000..a9e8b321d5
--- /dev/null
+++ b/plugins/GSdx_legacy/cpp_check.sh
@@ -0,0 +1 @@
+cppcheck --enable=warning,style,missingInclude -j 16 --platform=unix32 -D__linux__ -UENABLE_VTUNE -U_WINDOWS -U_M_AMD64 -U_MSC_VER . 2>&1 | tee cpp_check.log # '2>&1 |' rather than the bash-only '|&': this script has no shebang and may be run by plain sh
diff --git a/plugins/GSdx_legacy/docs/TextureCache.odg b/plugins/GSdx_legacy/docs/TextureCache.odg
new file mode 100644
index 0000000000..1c7203d897
Binary files /dev/null and b/plugins/GSdx_legacy/docs/TextureCache.odg differ
diff --git a/plugins/GSdx_legacy/linux_replay.cpp b/plugins/GSdx_legacy/linux_replay.cpp
new file mode 100644
index 0000000000..5aae1e6c8a
--- /dev/null
+++ b/plugins/GSdx_legacy/linux_replay.cpp
@@ -0,0 +1,100 @@
+/*
+ *	Copyright (C) 2011-2012 Hainaut gregory
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include <dlfcn.h>
+
+static void* handle;
+
+void help()
+{
+	// Usage text, written to stderr one entry at a time, in order.
+	static const char* const usage[] = {
+		"Loader gs file\n",
+		"ARG1 GSdx plugin\n",
+		"ARG2 .gs file\n",
+		"ARG3 Ini directory\n",
+	};
+
+	for (size_t i = 0; i < sizeof(usage) / sizeof(usage[0]); i++)
+		fprintf(stderr, "%s", usage[i]);
+
+	// Unload the plugin if it was already opened, then terminate with status 1.
+	if (handle != NULL)
+		dlclose(handle);
+
+	exit(1);
+}
+
+char* read_env(const char* var) {
+	// Returns the value of environment variable 'var'. When it is not set, an
+	// error is printed and help() is called, which exits the process.
+	char* value = getenv(var);
+	if (value != NULL)
+		return value;
+
+	fprintf(stderr, "Failed to get %s\n", var);
+	help();
+	return value;
+}
+
+// Replays a .gs dump through a GSdx plugin loaded with dlopen().
+//
+// Accepted command lines:
+//   <gs>                    plugin from $GSDUMP_SO, ini dir from $GSDUMP_CONF
+//   <plugin> <gs>           ini dir defaults to $HOME/.config/pcsx2/inis (XDG_STD builds only)
+//   <plugin> <gs> <inidir>  everything given explicitly
+int main ( int argc, char *argv[] )
+{
+	// At least the .gs file is required. argv[0] always counts, so the check
+	// has to be "< 2": with "< 1" a bare invocation slipped through and
+	// passed argv[1] == NULL on to the plugin.
+	if (argc < 2) help();
+
+	char* plugin;
+	char* gs;
+	if (argc > 2) {
+		plugin = argv[1];
+		gs = argv[2];
+	} else {
+		plugin = read_env("GSDUMP_SO");
+		gs = argv[1];
+	}
+
+	handle = dlopen(plugin, RTLD_LAZY|RTLD_GLOBAL);
+	if (handle == NULL) {
+		fprintf(stderr, "Failed to dlopen plugin %s\n", plugin);
+		help();
+	}
+
+	__attribute__((stdcall)) void (*GSsetSettingsDir_ptr)(const char*);
+	__attribute__((stdcall)) void (*GSReplay_ptr)(char*, int);
+
+	*(void**)(&GSsetSettingsDir_ptr) = dlsym(handle, "GSsetSettingsDir");
+	*(void**)(&GSReplay_ptr) = dlsym(handle, "GSReplay");
+
+	// A library that lacks either entry point would otherwise be called
+	// through a NULL function pointer below.
+	if (GSsetSettingsDir_ptr == NULL || GSReplay_ptr == NULL) {
+		fprintf(stderr, "Failed to find GSsetSettingsDir/GSReplay in plugin %s\n", plugin);
+		help();
+	}
+
+	if (argc == 2) {
+		char *ini = read_env("GSDUMP_CONF");
+
+		GSsetSettingsDir_ptr(ini);
+
+	} else if (argc == 4) {
+		(void)GSsetSettingsDir_ptr(argv[3]);
+
+	} else if ( argc == 3) {
+#ifdef XDG_STD
+		char *val = read_env("HOME");
+
+		std::string ini_dir(val);
+		ini_dir += "/.config/pcsx2/inis";
+
+		GSsetSettingsDir_ptr(ini_dir.c_str());
+#else
+		fprintf(stderr, "default ini dir only supported on XDG\n");
+		help();
+#endif
+	}
+
+	// NOTE(review): the meaning of the constant 12 is defined by the plugin's
+	// GSReplay and is not visible from this file - confirm.
+	GSReplay_ptr(gs, 12);
+
+	if (handle) {
+		dlclose(handle);
+	}
+}
diff --git a/plugins/GSdx_legacy/res/convert.fx b/plugins/GSdx_legacy/res/convert.fx
new file mode 100644
index 0000000000..c482722391
--- /dev/null
+++ b/plugins/GSdx_legacy/res/convert.fx
@@ -0,0 +1,324 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+#if SHADER_MODEL >= 0x400
+
+struct VS_INPUT
+{
+	float4 p : POSITION; 
+	float2 t : TEXCOORD0;
+};
+
+struct VS_OUTPUT
+{
+	float4 p : SV_Position;
+	float2 t : TEXCOORD0;
+};
+
+Texture2D Texture;
+SamplerState TextureSampler;
+
+float4 sample_c(float2 uv)
+{
+	return Texture.Sample(TextureSampler, uv);
+}
+
+struct PS_INPUT
+{
+	float4 p : SV_Position;
+	float2 t : TEXCOORD0;
+};
+
+struct PS_OUTPUT
+{
+	float4 c : SV_Target0;
+};
+
+#elif SHADER_MODEL <= 0x300
+
+struct VS_INPUT
+{
+	float4 p : POSITION; 
+	float2 t : TEXCOORD0;
+};
+
+struct VS_OUTPUT
+{
+	float4 p : POSITION;
+	float2 t : TEXCOORD0;
+};
+
+struct PS_INPUT
+{
+#if SHADER_MODEL < 0x300
+	float4 p : TEXCOORD1;
+#else
+	float4 p : VPOS;
+#endif
+	float2 t : TEXCOORD0;
+};
+
+struct PS_OUTPUT
+{
+	float4 c : COLOR;
+};
+
+sampler Texture : register(s0);
+
+float4 sample_c(float2 uv)
+{
+	return tex2D(Texture, uv);
+}
+
+#endif
+
+VS_OUTPUT vs_main(VS_INPUT input)
+{
+	// Pass-through vertex shader: position and texture coordinate are
+	// forwarded unchanged.
+	VS_OUTPUT result;
+
+	result.t = input.t;
+	result.p = input.p;
+
+	return result;
+}
+
+PS_OUTPUT ps_main0(PS_INPUT input)
+{
+	// Plain copy: output the texel sampled at the interpolated coordinate.
+	float4 texel = sample_c(input.t);
+
+	PS_OUTPUT result;
+	result.c = texel;
+	return result;
+}
+
+PS_OUTPUT ps_main7(PS_INPUT input)
+{
+	// Keeps the sampled RGB and replaces alpha with a weighted sum of the
+	// colour channels (weights 0.299 / 0.587 / 0.114 for R / G / B).
+	float4 texel = sample_c(input.t);
+	texel.a = dot(texel.rgb, float3(0.299, 0.587, 0.114));
+
+	PS_OUTPUT result;
+	result.c = texel;
+	return result;
+}
+
+float4 ps_crt(PS_INPUT input, int i)
+{
+	// One mask per index i in [0, 3]: red only, green only, blue only, or all
+	// three colour channels. Alpha is 0 in every mask.
+	float4 mask[4] =
+	{
+		float4(1, 0, 0, 0),
+		float4(0, 1, 0, 0),
+		float4(0, 0, 1, 0),
+		float4(1, 1, 1, 0)
+	};
+
+	// After adding 0.5 and saturating, selected channels scale by 1.0 and all
+	// others (alpha included) by 0.5.
+	float4 gain = saturate(mask[i] + 0.5f);
+
+	return sample_c(input.t) * gain;
+}
+
+float4 ps_scanlines(PS_INPUT input, int i)
+{
+	// Index 0 selects the bright mask, index 1 the dark one.
+	float4 mask[2] =
+	{
+		float4(1, 1, 1, 0),
+		float4(0, 0, 0, 0)
+	};
+
+	// After adding 0.5 and saturating: i == 0 keeps RGB at full strength
+	// (alpha halved), i == 1 halves every channel.
+	float4 gain = saturate(mask[i] + 0.5f);
+
+	return sample_c(input.t) * gain;
+}
+
+#if SHADER_MODEL >= 0x400
+
+uint ps_main1(PS_INPUT input) : SV_Target0
+{
+	float4 c = sample_c(input.t);
+
+	c.a *= 256.0f / 127; // hm, 0.5 won't give us 1.0 if we just multiply with 2
+
+	uint4 i = c * float4(0x001f, 0x03e0, 0x7c00, 0x8000);
+
+	return (i.x & 0x001f) | (i.y & 0x03e0) | (i.z & 0x7c00) | (i.w & 0x8000);	
+}
+
+PS_OUTPUT ps_main2(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	clip(sample_c(input.t).a - 127.5f / 255); // >= 0x80 pass
+	
+	output.c = 0;
+
+	return output;
+}
+
+PS_OUTPUT ps_main3(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	clip(127.5f / 255 - sample_c(input.t).a); // < 0x80 pass (== 0x80 should not pass)
+	
+	output.c = 0;
+
+	return output;
+}
+
+PS_OUTPUT ps_main4(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	output.c = fmod(sample_c(input.t) * 255 + 0.5f, 256) / 255;
+
+	return output;
+}
+
+PS_OUTPUT ps_main5(PS_INPUT input) // scanlines
+{
+	PS_OUTPUT output;
+	
+	uint4 p = (uint4)input.p;
+
+	output.c = ps_scanlines(input, p.y % 2);
+
+	return output;
+}
+
+PS_OUTPUT ps_main6(PS_INPUT input) // diagonal
+{
+	PS_OUTPUT output;
+
+	uint4 p = (uint4)input.p;
+
+	output.c = ps_crt(input, (p.x + (p.y % 3)) % 3);
+
+	return output;
+}
+
+PS_OUTPUT ps_main8(PS_INPUT input) // triangular
+{
+	PS_OUTPUT output;
+
+	uint4 p = (uint4)input.p;
+
+	// output.c = ps_crt(input, ((p.x + (p.y & 1) * 3) >> 1) % 3); 
+	output.c = ps_crt(input, ((p.x + ((p.y >> 1) & 1) * 3) >> 1) % 3);
+
+	return output;
+}
+
+static const float PI = 3.14159265359f;
+PS_OUTPUT ps_main9(PS_INPUT input) // cosine-modulated scanlines
+{
+	PS_OUTPUT output;
+
+	// Size of the source texture in texels.
+	float2 texdim;
+	Texture.GetDimensions(texdim.x, texdim.y);
+
+	if (ddy(input.t.y) * texdim.y > 0.5)
+		// Each output row advances by more than half a source row:
+		// use the plain sample without any modulation.
+		output.c = sample_c(input.t);
+	else
+		// Sample at the vertical centre of the current source row and scale it
+		// by a cosine with a period of one source row (factor 0.5 .. 1.3).
+		output.c = (0.9 - 0.4 * cos(2 * PI * input.t.y * texdim.y)) * sample_c(float2(input.t.x, (floor(input.t.y * texdim.y) + 0.5) / texdim.y));
+
+	return output;
+}
+
+#elif SHADER_MODEL <= 0x300
+
+PS_OUTPUT ps_main1(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	float4 c = sample_c(input.t);
+	
+	c.a *= 128.0f / 255; // *= 0.5f is no good here, need to do this in order to get 0x80 for 1.0f (instead of 0x7f)
+	
+	output.c = c;
+
+	return output;
+}
+
+PS_OUTPUT ps_main2(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	clip(sample_c(input.t).a - 255.0f / 255); // >= 0x80 pass
+	
+	output.c = 0;
+
+	return output;
+}
+
+PS_OUTPUT ps_main3(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	clip(254.95f / 255 - sample_c(input.t).a); // < 0x80 pass (== 0x80 should not pass)
+	
+	output.c = 0;
+
+	return output;
+}
+
+PS_OUTPUT ps_main4(PS_INPUT input)
+{
+	PS_OUTPUT output;
+	
+	output.c = 1;
+	
+	return output;
+}
+
+PS_OUTPUT ps_main5(PS_INPUT input) // scanlines
+{
+	PS_OUTPUT output;
+	
+	int4 p = (int4)input.p;
+
+	output.c = ps_scanlines(input, p.y % 2);
+
+	return output;
+}
+
+PS_OUTPUT ps_main6(PS_INPUT input) // diagonal
+{
+	PS_OUTPUT output;
+
+	int4 p = (int4)input.p;
+
+	output.c = ps_crt(input, (p.x + (p.y % 3)) % 3);
+
+	return output;
+}
+
+PS_OUTPUT ps_main8(PS_INPUT input) // triangular
+{
+	PS_OUTPUT output;
+
+	int4 p = (int4)input.p;
+
+	// output.c = ps_crt(input, ((p.x + (p.y % 2) * 3) / 2) % 3);
+	output.c = ps_crt(input, ((p.x + ((p.y / 2) % 2) * 3) / 2) % 3);
+
+	return output;
+}
+
+static const float PI = 3.14159265359f;
+PS_OUTPUT ps_main9(PS_INPUT input) // triangular
+{
+	PS_OUTPUT output;
+
+	// Needs DX9 conversion
+	/*float2 texdim, halfpixel; 
+	Texture.GetDimensions(texdim.x, texdim.y); 
+	if (ddy(input.t.y) * texdim.y > 0.5) 
+		output.c = sample_c(input.t); 
+	else
+		output.c = (0.5 - 0.5 * cos(2 * PI * input.t.y * texdim.y)) * sample_c(float2(input.t.x, (floor(input.t.y * texdim.y) + 0.5) / texdim.y));
+*/
+
+	// replacement shader
+	int4 p = (int4)input.p;
+	output.c = ps_crt(input, ((p.x + ((p.y / 2) % 2) * 3) / 2) % 3);
+
+	return output;
+}
+
+#endif
+#endif
diff --git a/plugins/GSdx_legacy/res/cs.fx b/plugins/GSdx_legacy/res/cs.fx
new file mode 100644
index 0000000000..c84211ba95
--- /dev/null
+++ b/plugins/GSdx_legacy/res/cs.fx
@@ -0,0 +1,387 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+
+// Default vertex-shader permutation, used when VS_TME was not defined by
+// whoever compiles this file. vs_main reads VS_TME and VS_FST to choose the
+// texture coordinate source.
+#ifndef VS_TME
+#define VS_TME 1
+#define VS_FST 1
+#endif
+
+// Default geometry-shader permutation. GS_PRIM selects which gs_main variant
+// is compiled (0 point, 1 line, 2 triangle, 3 line expanded to a quad);
+// GS_IIP == 0 makes every vertex of a primitive take the last vertex's colour.
+#ifndef GS_IIP
+#define GS_IIP 0
+#define GS_PRIM 2
+#endif
+
+// Default pixel-shader permutation. PS_BATCH_SIZE sizes the per-pixel index
+// array in ps_main1; PS_FPSM / PS_ZPSM are the formats passed to WritePixel.
+// The PSM_* names used here are defined further down, which is fine because
+// macros are only expanded at the point of use.
+#ifndef PS_BATCH_SIZE
+#define PS_BATCH_SIZE 2048
+#define PS_FPSM PSM_PSMCT32
+#define PS_ZPSM PSM_PSMZ16
+#endif
+
+// Format codes understood by WritePixel.
+// NOTE(review): presumably these mirror the GS pixel storage mode values - confirm.
+#define PSM_PSMCT32		0
+#define PSM_PSMCT24		1
+#define PSM_PSMCT16		2
+#define PSM_PSMCT16S	10
+#define PSM_PSMT8		19
+#define PSM_PSMT4		20
+#define PSM_PSMT8H		27
+#define PSM_PSMT4HL		36
+#define PSM_PSMT4HH		44
+#define PSM_PSMZ32		48
+#define PSM_PSMZ24		49
+#define PSM_PSMZ16		50
+#define PSM_PSMZ16S		58
+
+// Per-vertex input consumed by vs_main.
+struct VS_INPUT
+{
+	// Texture coordinate used when VS_FST is 0 (paired with q).
+	float2 st : TEXCOORD0;
+	// Vertex colour, passed through unchanged.
+	float4 c : COLOR0;
+	// Copied to the output t.w when VS_FST is 0.
+	float q : TEXCOORD1;
+	// Integer position; scaled by VertexScale and offset by VertexOffset.
+	uint2 p : POSITION0;
+	// 32-bit depth; vs_main splits it into low and high 16-bit halves.
+	uint z : POSITION1;
+	// Integer texture coordinate used when VS_FST is non-zero.
+	uint2 uv : TEXCOORD2;
+	// Only the red component is read; it is forwarded in the output t.z.
+	float4 f : COLOR1;
+};
+
+// Vertex shader output / geometry shader input.
+struct VS_OUTPUT
+{
+	// Transformed position.
+	float4 p : SV_Position;
+	// Depth split in two: x = low 16 bits, y = high 16 bits.
+	float2 z : TEXCOORD0;
+	// xy = texture coordinate, z = input.f.r, w = q (or 1.0f).
+	float4 t : TEXCOORD1;
+	// Vertex colour.
+	float4 c : COLOR0;
+};
+
+// Geometry shader output / pixel shader input: the VS_OUTPUT fields plus the
+// id of the primitive the vertex belongs to.
+struct GS_OUTPUT
+{
+	float4 p : SV_Position;
+	// Depth split in two: x = low 16 bits, y = high 16 bits.
+	float2 z : TEXCOORD0;
+	float4 t : TEXCOORD1;
+	float4 c : COLOR0;
+	// Primitive id; ps_main0 stores it per fragment and ps_main1 sorts on it.
+	uint id : SV_PrimitiveID;
+};
+
+// Vertex-stage constants: vs_main computes position * VertexScale - VertexOffset.
+cbuffer VSConstantBuffer : register(c0)
+{
+	float4 VertexScale;
+	float4 VertexOffset;
+};
+
+// Pixel-stage constants. WriteMask is currently referenced only from
+// commented-out code in ps_main1.
+cbuffer PSConstantBuffer : register(c0)
+{
+	uint2 WriteMask;
+};
+
+// One node of a per-pixel fragment list, written by ps_main0.
+//   c    - colour packed by CompressColor32
+//   z    - 32-bit depth rebuilt from the two 16-bit halves
+//   id   - primitive id of the fragment
+//   next - index of the node that was previously the list head (0 ends the list)
+struct FragmentLinkItem
+{
+	uint c, z, id, next;
+};
+
+// Byte-addressed target memory accessed through ReadPixel / WritePixel.
+RWByteAddressBuffer VideoMemory : register(u0);
+// Pool of fragment nodes; ps_main0 allocates slots with IncrementCounter().
+RWStructuredBuffer<FragmentLinkItem> FragmentLinkBuffer : register(u1);
+// Per-pixel list heads, one uint at byte offset ((y << 11) + x) * 4.
+RWByteAddressBuffer StartOffsetBuffer : register(u2);
+//RWTexture2D<uint> VideoMemory : register(u2); // 8192 * 512 R8_UINT
+
+// Row and column offset tables; ps_main1 adds one entry of each and shifts
+// the sum left by 1 to get the colour (x) and depth (y) byte addresses.
+Buffer<int2> FZRowOffset : register(t0);
+Buffer<int2> FZColOffset : register(t1);
+// Not referenced by the shaders visible in this file.
+Texture2D<float4> Palette : register(t2);
+Texture2D<float4> Texture : register(t3);
+
+// Vertex shader: scales and offsets the integer position, splits the 32-bit
+// depth into two 16-bit halves and picks the texture coordinate source from
+// the VS_TME / VS_FST permutation macros.
+VS_OUTPUT vs_main(VS_INPUT input)
+{
+	VS_OUTPUT result;
+
+	result.c = input.c;
+
+	// z.x carries the low 16 bits of depth, z.y the high 16 bits.
+	result.z = float2(input.z & 0xffff, input.z >> 16); // TODO: min(input.z, 0xffffff00) ?
+	result.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset;
+
+	// Values used when texturing is disabled; overridden below otherwise.
+	result.t.xy = 0;
+	result.t.w = 1.0f;
+	result.t.z = input.f.r;
+
+	if(VS_TME)
+	{
+		if(VS_FST)
+		{
+			result.t.xy = input.uv;
+		}
+		else
+		{
+			result.t.xy = input.st;
+			result.t.w = input.q;
+		}
+	}
+
+	return result;
+}
+
+#if GS_PRIM == 0
+
+// Point variant: forwards the single vertex and tags it with the primitive id.
+[maxvertexcount(1)]
+void gs_main(point VS_OUTPUT input[1], inout PointStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
+{
+	VS_OUTPUT src = input[0];
+
+	GS_OUTPUT vtx;
+
+	vtx.id = id;
+	vtx.c = src.c;
+	vtx.t = src.t;
+	vtx.z = src.z;
+	vtx.p = src.p;
+
+	stream.Append(vtx);
+}
+
+#elif GS_PRIM == 1
+
+// Line variant: forwards both vertices tagged with the primitive id. With
+// GS_IIP == 0 both vertices take the colour of the second vertex.
+[maxvertexcount(2)]
+void gs_main(line VS_OUTPUT input[2], inout LineStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
+{
+	[unroll]
+	for(int v = 0; v < 2; v++)
+	{
+		GS_OUTPUT vtx;
+
+		vtx.id = id;
+		vtx.p = input[v].p;
+		vtx.z = input[v].z;
+		vtx.t = input[v].t;
+
+#if GS_IIP == 0
+		vtx.c = input[1].c;
+#else
+		vtx.c = input[v].c;
+#endif
+
+		stream.Append(vtx);
+	}
+}
+
+#elif GS_PRIM == 2
+
+// Triangle variant: forwards the three vertices tagged with the primitive id.
+// With GS_IIP == 0 all vertices take the colour of the third vertex.
+[maxvertexcount(3)]
+void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
+{
+	[unroll]
+	for(int v = 0; v < 3; v++)
+	{
+		GS_OUTPUT vtx;
+
+		vtx.id = id;
+		vtx.p = input[v].p;
+		vtx.z = input[v].z;
+		vtx.t = input[v].t;
+
+#if GS_IIP == 0
+		vtx.c = input[2].c;
+#else
+		vtx.c = input[v].c;
+#endif
+
+		stream.Append(vtx);
+	}
+}
+
+#elif GS_PRIM == 3
+
+// Sprite variant: expands a two-vertex line (opposite corners) into a
+// four-vertex triangle strip. Depth and t.zw always come from the second
+// vertex; with GS_IIP == 0 so does the colour.
+[maxvertexcount(4)]
+void gs_main(line VS_OUTPUT input[2], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
+{
+	GS_OUTPUT lt, rb, lb, rt;
+
+	// Right-bottom corner is the second input vertex as-is.
+	rb.id = id;
+	rb.p = input[1].p;
+	rb.z = input[1].z;
+	rb.t = input[1].t;
+	rb.c = input[1].c;
+
+	// Left-top corner: position and t.xy from the first vertex.
+	lt.id = id;
+	lt.p = input[0].p;
+	lt.z = input[1].z;
+	lt.t = float4(input[0].t.xy, input[1].t.zw);
+
+#if GS_IIP == 0
+	lt.c = input[1].c;
+#else
+	lt.c = input[0].c;
+#endif
+
+	// Remaining corners swap the y components of position and texcoord.
+	lb = lt;
+	lb.p.y = rb.p.y;
+	lb.t.y = rb.t.y;
+
+	rt = rb;
+	rt.p.y = lt.p.y;
+	rt.t.y = lt.t.y;
+
+	stream.Append(lt);
+	stream.Append(lb);
+	stream.Append(rt);
+	stream.Append(rb);
+}
+
+#endif
+
+// Packs a float4 colour into one uint: each component is multiplied by 255,
+// truncated to an integer and placed at bits 0-7 (r), 8-15 (g), 16-23 (b)
+// and 24-31 (a). Components are not clamped before packing.
+uint CompressColor32(float4 f)
+{
+	uint4 q = (uint4)(f * 0xff);
+
+	return q.r | (q.g << 8) | (q.b << 16) | (q.a << 24);
+}
+
+// Expands a 16-bit 1:5:5:5 colour (a:b:g:r from the top bit down) into the
+// 32-bit layout produced by CompressColor32. Each 5-bit channel is moved to
+// the top five bits of its byte and the 1-bit alpha to bit 31, the top bit of
+// the alpha byte.
+uint DecompressColor16(uint c)
+{
+	uint r = (c & 0x001f) << 3;
+	uint g = (c & 0x03e0) << 6;
+	uint b = (c & 0x7c00) << 9;
+	// Bit 15 must land on bit 31; a shift of 15 would leave it on bit 30.
+	uint a = (c & 0x8000) << 16;
+
+	return r | g | b | a;
+}
+
+// Reads the value stored at byte address addr in VideoMemory. When bit 1 of
+// addr is set (the upper 16-bit half of a dword) the loaded dword is shifted
+// right by 16 so the addressed half ends up in the low bits. The upper bits
+// of the result are not masked.
+uint ReadPixel(uint addr)
+{
+	uint shift = (addr & 2) << 3;
+
+	// Raw buffer loads require a 4-byte aligned address, so load the dword
+	// that contains addr rather than addr itself.
+	return VideoMemory.Load(addr & ~3u) >> shift;
+}
+
+// Writes value to byte address addr in VideoMemory using format psm.
+//   32/24-bit colour and depth formats: the whole dword is stored.
+//   16-bit colour and depth formats: only the addressed 16-bit half of the
+//   containing dword is replaced.
+// Any other psm value leaves memory untouched.
+void WritePixel(uint addr, uint value, uint psm)
+{
+	uint shift;
+	uint word;
+	uint previous;
+
+	switch(psm)
+	{
+	case PSM_PSMCT32:
+	case PSM_PSMZ32:
+	case PSM_PSMCT24:
+	case PSM_PSMZ24:
+		VideoMemory.Store(addr, value);
+		break;
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		shift = (addr & 2) << 3;
+		// Raw buffer accesses require a 4-byte aligned address, so operate on
+		// the dword that contains addr.
+		word = addr & ~3u;
+		// XOR-ing with (new ^ old) restricted to the target half turns that
+		// half into the new value and leaves the other half unchanged.
+		// NOTE(review): the Load and the InterlockedXor are separate
+		// operations, so a concurrent write to the same half in between
+		// would not be handled - confirm this cannot happen.
+		value = ((value << shift) ^ VideoMemory.Load(word)) & (0x0000ffff << shift);
+		VideoMemory.InterlockedXor(word, value, previous);
+		break;
+	}
+}
+
+// Capture pass: allocates a node for the incoming fragment and pushes it on
+// the front of the linked list belonging to the fragment's pixel.
+void ps_main0(GS_OUTPUT input)
+{
+	uint2 pos = (uint2)input.p.xy;
+
+	// Reserve a slot in the node pool.
+	uint slot = FragmentLinkBuffer.IncrementCounter();
+
+	// Swap the new slot in as list head; the old head becomes our successor.
+	uint headOffset = ((pos.y << 11) + pos.x) * 4;
+	uint previousHead = 0;
+
+	StartOffsetBuffer.InterlockedExchange(headOffset, slot, previousHead);
+
+	// TODO: preprocess color (tfx, alpha test), z-test
+
+	FragmentLinkItem node;
+
+	node.next = previousHead;
+	node.id = input.id;
+	// Rebuild the 32-bit depth from its high (z.y) and low (z.x) halves.
+	node.z = (uint)(input.z.y * 0x10000 + input.z.x);
+	node.c = CompressColor32(input.c);
+
+	FragmentLinkBuffer[slot] = node;
+}
+
+// Resolve pass: collects the fragment list that ps_main0 recorded for this
+// pixel, orders the fragments by primitive id with a bitonic sorting network,
+// then folds them over the current colour/depth values and writes the result
+// to VideoMemory.
+void ps_main1(GS_OUTPUT input)
+{
+	uint2 pos = (uint2)input.p.xy;
+
+	// sort fragments
+
+	uint StartOffsetIndex = (pos.y << 11) + pos.x;
+
+	int index[PS_BATCH_SIZE];
+	int count = 0;
+
+	uint next = StartOffsetBuffer.Load(StartOffsetIndex * 4);
+
+	// Detach the list from the pixel (0 marks an empty list).
+	StartOffsetBuffer.Store(StartOffsetIndex * 4, 0);
+
+	// Walk the list into index[]. The walk stops when the array is full so a
+	// list longer than PS_BATCH_SIZE cannot write past the end of index[];
+	// the fragments beyond that limit are dropped.
+	[allow_uav_condition]
+	while(next != 0 && count < PS_BATCH_SIZE)
+	{
+		index[count++] = next;
+
+		next = FragmentLinkBuffer[next].next;
+	}
+
+	// Round the element count up to a power of two for the sorting network.
+	int N2 = 1 << (int)(ceil(log2(count)));
+
+	// Pad the unused tail with node 0.
+	// NOTE(review): padded entries take part in the sort using the id stored
+	// in FragmentLinkBuffer[0] - confirm they cannot displace real fragments
+	// out of the first `count` slots.
+	[allow_uav_condition]
+	for(int i = count; i < N2; i++)
+	{
+		index[i] = 0;
+	}
+
+	// Bitonic sort on FragmentLinkItem::id.
+	[allow_uav_condition]
+	for(int k = 2; k <= N2; k = 2 * k)
+	{
+		[allow_uav_condition]
+		for(int j = k >> 1; j > 0 ; j = j >> 1)
+		{
+			[allow_uav_condition]
+			for(int i = 0; i < N2; i++)
+			{
+				uint i_id = FragmentLinkBuffer[index[i]].id;
+
+				int ixj = i ^ j;
+
+				if(ixj > i)
+				{
+					uint ixj_id = FragmentLinkBuffer[index[ixj]].id;
+
+					// Ascending where (i & k) == 0, descending otherwise.
+					bool ascending = (i & k) == 0;
+
+					if((ascending && i_id > ixj_id) || (!ascending && i_id < ixj_id))
+					{
+						int temp = index[i];
+						index[i] = index[ixj];
+						index[ixj] = temp;
+					}
+				}
+			}
+		}
+	}
+
+	// x = colour address, y = depth address.
+	uint2 addr = (uint2)(FZRowOffset[pos.y] + FZColOffset[pos.x]) << 1;
+
+	uint dc = ReadPixel(addr.x);
+	uint dz = ReadPixel(addr.y);
+
+	uint sc = dc;
+	uint sz = dz;
+
+	// Visit the first `count` entries from last to first; a fragment replaces
+	// the running colour/depth when its depth is greater.
+	[allow_uav_condition]
+	while(--count >= 0)
+	{
+		FragmentLinkItem f = FragmentLinkBuffer[index[count]];
+
+		// TODO
+
+		if(sz < f.z)
+		{
+			sc = f.c;
+			sz = f.z;
+		}
+	}
+
+	uint c = sc; // (dc & ~WriteMask.x) | (sc & WriteMask.x);
+	uint z = 0;//sz; //(dz & ~WriteMask.y) | (sz & WriteMask.y);
+
+	WritePixel(addr.x, c, PS_FPSM);
+	WritePixel(addr.y, z, PS_ZPSM);
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/res/fxaa.fx b/plugins/GSdx_legacy/res/fxaa.fx
new file mode 100644
index 0000000000..d38d54623b
--- /dev/null
+++ b/plugins/GSdx_legacy/res/fxaa.fx
@@ -0,0 +1,588 @@
+#if defined(SHADER_MODEL) || defined(FXAA_GLSL_130)
+
+#ifndef FXAA_GLSL_130
+    #define FXAA_GLSL_130 0
+#endif
+
+#define UHQ_FXAA 1          //High Quality Fast Approximate Anti Aliasing. Adapted for GSdx from Timothy Lottes FXAA 3.11.
+#define FxaaSubpixMax 0.0   //[0.00 to 1.00] Amount of subpixel aliasing removal. 0.00: Edge only antialiasing (no blurring)
+#define FxaaEarlyExit 1     //[0 or 1] Use Fxaa early exit pathing. When disabled, the entire scene is antialiased(FSAA). 0 is off, 1 is on.
+
+/*------------------------------------------------------------------------------
+							 [GLOBALS|FUNCTIONS]
+------------------------------------------------------------------------------*/
+#if (FXAA_GLSL_130 == 1)
+
+struct vertex_basic
+{
+    vec4 p;
+    vec2 t;
+};
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+
+in SHADER
+{
+    vec4 p;
+    vec2 t;
+} PSin;
+
+layout(location = 0) out vec4 SV_Target0;
+
+#else
+
+#if (SHADER_MODEL >= 0x400)
+Texture2D Texture : register(t0);
+SamplerState TextureSampler : register(s0);
+#else
+texture2D Texture : register(t0);
+sampler2D TextureSampler : register(s0);
+#define SamplerState sampler2D
+#endif
+
+cbuffer cb0
+{
+	float4 _rcpFrame : register(c0);
+};
+
+struct VS_INPUT
+{
+	float4 p : POSITION;
+	float2 t : TEXCOORD0;
+};
+
+struct VS_OUTPUT
+{
+	#if (SHADER_MODEL >= 0x400)
+	float4 p : SV_Position;
+	#else
+	float4 p : TEXCOORD1;
+	#endif
+	float2 t : TEXCOORD0;
+};
+
+struct PS_OUTPUT
+{
+	#if (SHADER_MODEL >= 0x400)
+	float4 c : SV_Target0;
+	#else
+	float4 c : COLOR0;
+	#endif
+};
+
+#endif
+
+/*------------------------------------------------------------------------------
+                             [FXAA CODE SECTION]
+------------------------------------------------------------------------------*/
+
+#if (SHADER_MODEL >= 0x500)
+#define FXAA_HLSL_5 1
+#define FXAA_GATHER4_ALPHA 1
+#elif (SHADER_MODEL >= 0x400)
+#define FXAA_HLSL_4 1
+#define FXAA_GATHER4_ALPHA 0
+#elif (FXAA_GLSL_130 == 1)
+#define FXAA_GATHER4_ALPHA 1
+#else
+#define FXAA_HLSL_3 1
+#define FXAA_GATHER4_ALPHA 0
+#endif
+
+#if (FXAA_HLSL_5 == 1)
+struct FxaaTex { SamplerState smpl; Texture2D tex; };
+#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
+#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)
+#define FxaaTexAlpha4(t, p) t.tex.GatherAlpha(t.smpl, p)
+#define FxaaTexOffAlpha4(t, p, o) t.tex.GatherAlpha(t.smpl, p, o)
+#define FxaaDiscard clip(-1)
+#define FxaaSat(x) saturate(x)
+
+#elif (FXAA_HLSL_4 == 1)
+struct FxaaTex { SamplerState smpl; Texture2D tex; };
+#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)
+#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)
+#define FxaaDiscard clip(-1)
+#define FxaaSat(x) saturate(x)
+
+#elif (FXAA_HLSL_3 == 1)
+#define FxaaTex sampler2D
+#define int2 float2
+#define FxaaSat(x) saturate(x)
+#define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0))
+#define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0))
+
+#elif (FXAA_GLSL_130 == 1)
+
+#define int2 ivec2
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define FxaaDiscard discard
+#define FxaaSat(x) clamp(x, 0.0, 1.0)
+#define FxaaTex sampler2D
+#define FxaaTexTop(t, p) textureLod(t, p, 0.0)
+#define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o)
+#if (FXAA_GATHER4_ALPHA == 1)
+// use #extension GL_ARB_gpu_shader5 : enable
+#define FxaaTexAlpha4(t, p) textureGather(t, p, 3)
+#define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
+#endif
+
+#endif
+
+#define FxaaEdgeThreshold 0.063
+#define FxaaEdgeThresholdMin 0.00
+#define FXAA_QUALITY__P0 1.0
+#define FXAA_QUALITY__P1 1.5
+#define FXAA_QUALITY__P2 2.0
+#define FXAA_QUALITY__P3 2.0
+#define FXAA_QUALITY__P4 2.0
+#define FXAA_QUALITY__P5 2.0
+#define FXAA_QUALITY__P6 2.0
+#define FXAA_QUALITY__P7 2.0
+#define FXAA_QUALITY__P8 2.0
+#define FXAA_QUALITY__P9 2.0
+#define FXAA_QUALITY__P10 4.0
+#define FXAA_QUALITY__P11 8.0
+#define FXAA_QUALITY__P12 8.0
+
+/*------------------------------------------------------------------------------
+                        [GAMMA PREPASS CODE SECTION]
+------------------------------------------------------------------------------*/
+// Returns the weighted sum of the red, green and blue components using the
+// weights 0.2126729 / 0.7151522 / 0.0721750.
+float RGBLuminance(float3 color)
+{
+	return dot(color, float3(0.2126729, 0.7151522, 0.0721750));
+}
+
+#if (FXAA_GLSL_130 == 0)
+#define PixelSize float2(_rcpFrame.x, _rcpFrame.y)
+#endif
+
+
+// Converts a gamma-encoded colour to linear, channel by channel. The input is
+// clamped to [0, 1] first. Values at or below 0.0404482362771082 use the
+// linear segment (divide by 12.92); larger values use the power segment with
+// the supplied exponent.
+float3 RGBGammaToLinear(float3 color, float gamma)
+{
+	color = FxaaSat(color);
+
+	if(color.r <= 0.0404482362771082) { color.r = color.r / 12.92; }
+	else { color.r = pow((color.r + 0.055) / 1.055, gamma); }
+
+	if(color.g <= 0.0404482362771082) { color.g = color.g / 12.92; }
+	else { color.g = pow((color.g + 0.055) / 1.055, gamma); }
+
+	if(color.b <= 0.0404482362771082) { color.b = color.b / 12.92; }
+	else { color.b = pow((color.b + 0.055) / 1.055, gamma); }
+
+	return color;
+}
+
+// Converts a linear colour to gamma-encoded, channel by channel. The input is
+// clamped to [0, 1] first. Values at or below 0.00313066844250063 use the
+// linear segment (multiply by 12.92); larger values use the power segment
+// with exponent 1 / gamma.
+float3 LinearToRGBGamma(float3 color, float gamma)
+{
+	color = FxaaSat(color);
+
+	if(color.r <= 0.00313066844250063) { color.r = color.r * 12.92; }
+	else { color.r = 1.055 * pow(color.r, 1.0 / gamma) - 0.055; }
+
+	if(color.g <= 0.00313066844250063) { color.g = color.g * 12.92; }
+	else { color.g = 1.055 * pow(color.g, 1.0 / gamma) - 0.055; }
+
+	if(color.b <= 0.00313066844250063) { color.b = color.b * 12.92; }
+	else { color.b = 1.055 * pow(color.b, 1.0 / gamma) - 0.055; }
+
+	return color;
+}
+
+// Samples the source texture at uv0, runs the colour through a gamma-to-linear
+// and linear-to-gamma round trip, and stores the luminance of the result in
+// the alpha channel. The incoming colour value is ignored: it is overwritten
+// by the texture sample.
+float4 PreGammaPass(float4 color, float2 uv0)
+{
+	const float GammaConst = 2.233;
+
+	#if (SHADER_MODEL >= 0x400)
+		color = Texture.Sample(TextureSampler, uv0);
+	#elif (FXAA_GLSL_130 == 1)
+		color = texture(TextureSampler, uv0);
+	#else
+		color = tex2D(TextureSampler, uv0);
+	#endif
+
+	color.rgb = LinearToRGBGamma(RGBGammaToLinear(color.rgb, GammaConst), GammaConst);
+	color.a = RGBLuminance(color.rgb);
+
+	return color;
+}
+
+
+/*------------------------------------------------------------------------------
+                        [FXAA CODE SECTION]
+------------------------------------------------------------------------------*/
+
+// Returns the luminance of the colour's rgb part; the incoming w is ignored.
+float FxaaLuma(float4 rgba)
+{
+	return RGBLuminance(rgba.xyz);
+}
+
+// FXAA filter for one pixel.
+//   pos                  - texture coordinate of the pixel being filtered
+//   tex                  - texture/sampler handle used by the FxaaTex* macros
+//   fxaaRcpFrame         - step between neighbouring texels in x and y
+//   fxaaSubpix           - scale applied to the sub-pixel blend term
+//   fxaaEdgeThreshold    - relative contrast needed to treat the pixel as an edge
+//   fxaaEdgeThresholdMin - absolute lower bound for that contrast
+// Returns the filtered rgb with the centre pixel's luma in w. When the local
+// contrast is below the threshold and FxaaEarlyExit is 1, the unfiltered
+// centre sample is returned instead.
+float4 FxaaPixelShader(float2 pos, FxaaTex tex, float2 fxaaRcpFrame, float fxaaSubpix, float fxaaEdgeThreshold, float fxaaEdgeThresholdMin)
+{
+	float2 posM;
+	posM.x = pos.x;
+	posM.y = pos.y;
+
+	// Centre sample plus the luma of its neighbours. In the gather path the
+	// neighbour lumas are read from the texture's alpha channel and the
+	// luma* names are macros over the gathered vectors.
+	// NOTE(review): the gather path assumes alpha already holds luma - confirm.
+	#if (FXAA_GATHER4_ALPHA == 1)
+	float4 rgbyM = FxaaTexTop(tex, posM);
+	float4 luma4A = FxaaTexAlpha4(tex, posM);
+	float4 luma4B = FxaaTexOffAlpha4(tex, posM, int2(-1, -1));
+	rgbyM.w = RGBLuminance(rgbyM.xyz);
+
+	#define lumaM rgbyM.w
+	#define lumaE luma4A.z
+	#define lumaS luma4A.x
+	#define lumaSE luma4A.y
+	#define lumaNW luma4B.w
+	#define lumaN luma4B.z
+	#define lumaW luma4B.x
+    
+	#else
+	float4 rgbyM = FxaaTexTop(tex, posM);
+	rgbyM.w = RGBLuminance(rgbyM.xyz);
+	#define lumaM rgbyM.w
+
+	float lumaS = FxaaLuma(FxaaTexOff(tex, posM, int2( 0, 1), fxaaRcpFrame.xy));
+	float lumaE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1, 0), fxaaRcpFrame.xy));
+	float lumaN = FxaaLuma(FxaaTexOff(tex, posM, int2( 0,-1), fxaaRcpFrame.xy));
+	float lumaW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 0), fxaaRcpFrame.xy));
+	#endif
+
+	// Local contrast: luma range over the centre and its four direct neighbours.
+	float maxSM = max(lumaS, lumaM);
+	float minSM = min(lumaS, lumaM);
+	float maxESM = max(lumaE, maxSM);
+	float minESM = min(lumaE, minSM);
+	float maxWN = max(lumaN, lumaW);
+	float minWN = min(lumaN, lumaW);
+
+	float rangeMax = max(maxWN, maxESM);
+	float rangeMin = min(minWN, minESM);
+	float range = rangeMax - rangeMin;
+	float rangeMaxScaled = rangeMax * fxaaEdgeThreshold;
+	float rangeMaxClamped = max(fxaaEdgeThresholdMin, rangeMaxScaled);
+
+	bool earlyExit = range < rangeMaxClamped;
+	#if (FxaaEarlyExit == 1)
+	if(earlyExit) { return rgbyM; }
+	#endif
+
+	// Diagonal neighbours (the gather path already has NW and SE).
+	#if (FXAA_GATHER4_ALPHA == 0)
+	float lumaNW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1,-1), fxaaRcpFrame.xy));
+	float lumaSE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1, 1), fxaaRcpFrame.xy));
+	float lumaNE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1,-1), fxaaRcpFrame.xy));
+	float lumaSW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 1), fxaaRcpFrame.xy));
+	#else
+	float lumaNE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1,-1), fxaaRcpFrame.xy));
+	float lumaSW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 1), fxaaRcpFrame.xy));
+	#endif
+
+	// Horizontal vs. vertical edge measures built from the 3x3 neighbourhood.
+	float lumaNS = lumaN + lumaS;
+	float lumaWE = lumaW + lumaE;
+	float subpixRcpRange = 1.0/range;
+	float subpixNSWE = lumaNS + lumaWE;
+	float edgeHorz1 = (-2.0 * lumaM) + lumaNS;
+	float edgeVert1 = (-2.0 * lumaM) + lumaWE;
+	float lumaNESE = lumaNE + lumaSE;
+	float lumaNWNE = lumaNW + lumaNE;
+	float edgeHorz2 = (-2.0 * lumaE) + lumaNESE;
+	float edgeVert2 = (-2.0 * lumaN) + lumaNWNE;
+
+	float lumaNWSW = lumaNW + lumaSW;
+	float lumaSWSE = lumaSW + lumaSE;
+	float edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);
+	float edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);
+	float edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;
+	float edgeVert3 = (-2.0 * lumaS) + lumaSWSE;
+	float edgeHorz = abs(edgeHorz3) + edgeHorz4;
+	float edgeVert = abs(edgeVert3) + edgeVert4;
+
+	// From here on lumaN / lumaS hold the pair across the chosen edge
+	// direction: for a vertical span they are replaced by lumaW / lumaE.
+	float subpixNWSWNESE = lumaNWSW + lumaNESE;
+	float lengthSign = fxaaRcpFrame.x;
+	bool horzSpan = edgeHorz >= edgeVert;
+	float subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;
+	if(!horzSpan) lumaN = lumaW;
+	if(!horzSpan) lumaS = lumaE;
+	if(horzSpan) lengthSign = fxaaRcpFrame.y;
+	float subpixB = (subpixA * (1.0/12.0)) - lumaM;
+
+	// Pick the side with the steeper gradient; lengthSign points towards it.
+	float gradientN = lumaN - lumaM;
+	float gradientS = lumaS - lumaM;
+	float lumaNN = lumaN + lumaM;
+	float lumaSS = lumaS + lumaM;
+	bool pairN = abs(gradientN) >= abs(gradientS);
+	float gradient = max(abs(gradientN), abs(gradientS));
+	if(pairN) lengthSign = -lengthSign;
+	float subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);
+
+	// Start position half a texel towards the edge; offNP is the step along it.
+	float2 posB;
+	posB.x = posM.x;
+	posB.y = posM.y;
+	float2 offNP;
+	offNP.x = (!horzSpan) ? 0.0 : fxaaRcpFrame.x;
+	offNP.y = ( horzSpan) ? 0.0 : fxaaRcpFrame.y;
+	if(!horzSpan) posB.x += lengthSign * 0.5;
+	if( horzSpan) posB.y += lengthSign * 0.5;
+
+	float2 posN;
+	posN.x = posB.x - offNP.x * FXAA_QUALITY__P0;
+	posN.y = posB.y - offNP.y * FXAA_QUALITY__P0;
+	float2 posP;
+	posP.x = posB.x + offNP.x * FXAA_QUALITY__P0;
+	posP.y = posB.y + offNP.y * FXAA_QUALITY__P0;
+	float subpixD = ((-2.0)*subpixC) + 3.0;
+	float lumaEndN = FxaaLuma(FxaaTexTop(tex, posN));
+	float subpixE = subpixC * subpixC;
+	float lumaEndP = FxaaLuma(FxaaTexTop(tex, posP));
+
+	if(!pairN) lumaNN = lumaSS;
+	float gradientScaled = gradient * 1.0/4.0;
+	float lumaMM = lumaM - lumaNN * 0.5;
+	float subpixF = subpixD * subpixE;
+	bool lumaMLTZero = lumaMM < 0.0;
+	lumaEndN -= lumaNN * 0.5;
+	lumaEndP -= lumaNN * 0.5;
+	bool doneN = abs(lumaEndN) >= gradientScaled;
+	bool doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1;
+	bool doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1;
+
+	// Unrolled end-of-edge search. Despite its name, doneNP is true while at
+	// least one direction is still searching. Each nested level re-samples
+	// the unfinished ends and advances them by the next FXAA_QUALITY__Pn step.
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11;
+
+	if(doneNP) {
+	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+	doneN = abs(lumaEndN) >= gradientScaled;
+	doneP = abs(lumaEndP) >= gradientScaled;
+	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12;
+	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12;
+	doneNP = (!doneN) || (!doneP);
+	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12;
+	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12;
+	}}}}}}}}}}}
+
+	// Distances from the centre to both ends of the edge span.
+	float dstN = posM.x - posN.x;
+	float dstP = posP.x - posM.x;
+	if(!horzSpan) dstN = posM.y - posN.y;
+	if(!horzSpan) dstP = posP.y - posM.y;
+
+	bool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;
+	float spanLength = (dstP + dstN);
+	bool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;
+	float spanLengthRcp = 1.0/spanLength;
+
+	// Offset derived from the nearer end, combined with the sub-pixel term.
+	bool directionN = dstN < dstP;
+	float dst = min(dstN, dstP);
+	bool goodSpan = directionN ? goodSpanN : goodSpanP;
+	float subpixG = subpixF * subpixF;
+	float pixelOffset = (dst * (-spanLengthRcp)) + 0.5;
+	float subpixH = subpixG * fxaaSubpix;
+
+	// Shift the sample position across the edge and re-sample.
+	float pixelOffsetGood = goodSpan ? pixelOffset : 0.0;
+	float pixelOffsetSubpix = max(pixelOffsetGood, subpixH);
+	if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;
+	if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;
+
+	return float4(FxaaTexTop(tex, posM).xyz, lumaM);
+}
+
+// Runs FxaaPixelShader for the pixel at uv0 using the compile-time settings
+// FxaaSubpixMax, FxaaEdgeThreshold and FxaaEdgeThresholdMin. The incoming
+// FxaaColor value is not read; every build variant overwrites it with the
+// filter result before returning it.
+#if (FXAA_GLSL_130 == 1)
+float4 FxaaPass(float4 FxaaColor, float2 uv0)
+#else
+float4 FxaaPass(float4 FxaaColor : COLOR0, float2 uv0 : TEXCOORD0)
+#endif
+{
+
+	#if (SHADER_MODEL >= 0x400)
+	// Shader model 4+: bundle the texture and sampler into an FxaaTex.
+	FxaaTex tex;
+	tex.tex = Texture;
+	tex.smpl = TextureSampler;
+
+	// NOTE(review): in non-GLSL builds PixelSize is a macro that expands to
+	// float2(_rcpFrame.x, _rcpFrame.y), so the out-arguments here are not a
+	// named variable - verify that the queried dimensions really reach the
+	// 1.0/PixelSize.xy expression on the next line.
+	Texture.GetDimensions(PixelSize.x, PixelSize.y);
+	FxaaColor = FxaaPixelShader(uv0, tex, 1.0/PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);
+
+    #elif (FXAA_GLSL_130 == 1)
+
+	// GLSL: the texel step is the reciprocal of the level-0 texture size.
+	vec2 PixelSize = textureSize(TextureSampler, 0);
+	FxaaColor = FxaaPixelShader(uv0, TextureSampler, 1.0/PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);
+
+	#else
+	// Older shader models: PixelSize (taken from _rcpFrame) is passed as-is.
+	FxaaTex tex;
+	tex = TextureSampler;
+	FxaaColor = FxaaPixelShader(uv0, tex, PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);
+	#endif
+
+	return FxaaColor;
+}
+
+/*------------------------------------------------------------------------------
+                      [MAIN() & COMBINE PASS CODE SECTION]
+------------------------------------------------------------------------------*/
+#if (FXAA_GLSL_130 == 1)
+
+// GLSL entry point: samples the source texture, runs the gamma pre-pass and
+// then the FXAA pass, and writes the result to SV_Target0.
+void ps_main()
+{
+    vec4 sampled  = texture(TextureSampler, PSin.t);
+    vec4 prepared = PreGammaPass(sampled, PSin.t);
+
+    SV_Target0 = FxaaPass(prepared, PSin.t);
+}
+
+#else
+
+// HLSL entry point: samples the source texture, runs the gamma pre-pass and
+// then the FXAA pass, and returns the result as the pixel colour. Only the
+// sampling call differs between shader models.
+PS_OUTPUT ps_main(VS_OUTPUT input)
+{
+	#if (SHADER_MODEL >= 0x400)
+	float4 color = Texture.Sample(TextureSampler, input.t);
+	#else
+	float4 color = tex2D(TextureSampler, input.t);
+	#endif
+
+	color = PreGammaPass(color, input.t);
+	color = FxaaPass(color, input.t);
+
+	PS_OUTPUT result;
+
+	result.c = color;
+
+	return result;
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/convert.glsl b/plugins/GSdx_legacy/res/glsl/convert.glsl
new file mode 100644
index 0000000000..18bd32aeeb
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/convert.glsl
@@ -0,0 +1,406 @@
+//#version 420 // Keep it for editor detection
+
+struct vertex_basic
+{
+    vec4 p;
+    vec2 t;
+};
+
+
+#ifdef VERTEX_SHADER
+
+out gl_PerVertex {
+    vec4 gl_Position;
+    float gl_PointSize;
+#if !pGL_ES
+    float gl_ClipDistance[1];
+#endif
+};
+
+layout(location = 0) in vec2 POSITION;
+layout(location = 1) in vec2 TEXCOORD0;
+
+// FIXME set the interpolation (don't know what dx do)
+// flat means that there is no interpolation. The value given to the fragment shader is based on the provoking vertex conventions.
+//
+// noperspective means that there will be linear interpolation in window-space. This is usually not what you want, but it can have its uses.
+//
+// smooth, the default, means to do perspective-correct interpolation.
+//
+// The centroid qualifier only matters when multisampling. If this qualifier is not present, then the value is interpolated to the pixel's center, anywhere in the pixel, or to one of the pixel's samples. This sample may lie outside of the actual primitive being rendered, since a primitive can cover only part of a pixel's area. The centroid qualifier is used to prevent this; the interpolation point must fall within both the pixel's area and the primitive's area.
+out SHADER
+{
+    vec4 p;
+    vec2 t;
+} VSout;
+
+#define VSout_p (VSout.p)
+#define VSout_t (VSout.t)
+
+void vs_main()
+{
+    gl_Position = vec4(POSITION, 0.5f, 1.0f); // 2D input position with fixed z = 0.5 and w = 1
+    VSout_p = gl_Position; // NOTE: the varying duplicates gl_Position; unclear whether both can be merged
+    VSout_t = TEXCOORD0;   // texture coordinate is forwarded untouched
+}
+
+#endif
+
+#ifdef FRAGMENT_SHADER
+
+in SHADER
+{
+    vec4 p;
+    vec2 t;
+} PSin;
+
+#define PSin_p (PSin.p)
+#define PSin_t (PSin.t)
+
+// Give a different name so I remember there is a special case!
+#if defined(ps_main1) || defined(ps_main10)
+layout(location = 0) out uint SV_Target1;
+#else
+layout(location = 0) out vec4 SV_Target0;
+#endif
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+
+layout(std140, binding = 15) uniform cb15
+{
+    ivec4 ScalingFactor;
+};
+
+vec4 sample_c()
+{
+    return texture(TextureSampler, PSin.t); // source texel at the interpolated texture coordinate
+}
+
+vec4 ps_crt(uint i)
+{
+    vec4 channel_mask[4] = vec4[4](
+        vec4(1, 0, 0, 0),
+        vec4(0, 1, 0, 0),
+        vec4(0, 0, 1, 0),
+        vec4(1, 1, 1, 0));
+    // mask + 0.5 clamped to [0;1]: selected channels get a gain of 1.0, all others (alpha too) 0.5
+    vec4 gain = clamp(channel_mask[i] + 0.5f, 0.0f, 1.0f);
+    return sample_c() * gain;
+}
+
+#ifdef ps_main0
+void ps_main0()
+{
+    SV_Target0 = sample_c(); // plain copy of the sampled texel
+}
+#endif
+
+#ifdef ps_main1
+void ps_main1()
+{
+    // Input Color is RGBA8
+    // The result is written to the unsigned integer output SV_Target1
+    // We want to output a pixel on the PSMCT16* format
+    // A1-BGR5
+
+#if 0
+    // Note: dot is a good idea from pseudo. However we must be careful about float accuracy.
+    // Here a global idea example:
+    //
+    // SV_Target1 = dot(round(sample_c() * vec4(31.f, 31.f, 31.f, 1.f)), vec4(1.f, 32.f, 1024.f, 32768.f));
+    //
+
+    // For me this code is more accurate but it will require some tests
+
+    vec4 c = sample_c() * 255.0f + 0.5f; // Denormalize value to avoid float precision issue
+
+    // shift Red: -3
+    // shift Green: -3 + 5
+    // shift Blue: -3 + 10
+    // shift Alpha: -7 + 15
+    highp uvec4 i = uvec4(c * vec4(1/8.0f, 4.0f, 128.0f, 256.0f)); // Shift value
+
+    // bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
+    SV_Target1 = (i.r & uint(0x001f)) | (i.g & uint(0x03e0)) | (i.b & uint(0x7c00)) | (i.a & uint(0x8000));
+
+#else
+    // Old code which is likely wrong.
+
+    vec4 c = sample_c();
+
+    c.a *= 256.0f / 127.0f; // hm, 0.5 won't give us 1.0 if we just multiply with 2
+    // Scale every channel by the mask of its 16 bits field, then keep only the bits of that field
+    highp uvec4 i = uvec4(c * vec4(uint(0x001f), uint(0x03e0), uint(0x7c00), uint(0x8000)));
+
+    // bit field operation requires GL4 HW.
+    SV_Target1 = (i.x & uint(0x001f)) | (i.y & uint(0x03e0)) | (i.z & uint(0x7c00)) | (i.w & uint(0x8000));
+#endif
+
+
+}
+#endif
+
+#ifdef ps_main10
+void ps_main10()
+{
+    float depth = sample_c().r; // the GL_FLOAT32 depth is read from the red channel
+    SV_Target1 = uint(exp2(32.0f) * depth); // rescale by 2^32 and store as a 32 bits unsigned integer
+}
+#endif
+
+#ifdef ps_main11
+void ps_main11()
+{
+    // Pack a GL_FLOAT32 depth value into the four channels of an RGBA colour
+    const vec4 shift = vec4(exp2(24.0f), exp2(16.0f), exp2(8.0f), exp2(0.0f));
+    const vec4 carry = vec4(0.0, 1.0/256.0, 1.0/256.0, 1.0/256.0);
+    float depth = sample_c().r;
+
+    vec4 parts = fract(vec4(depth) * shift);
+    SV_Target0 = (parts - parts.xxyz * carry) * 256.0f/255.0f;
+}
+#endif
+
+#ifdef ps_main12
+void ps_main12()
+{
+    // Turn the 16 low bits of a GL_FLOAT32 depth into an RGB5A1 colour
+    const vec4 shift = vec4(exp2(32.0f), exp2(27.0f), exp2(22.0f), exp2(17.0f));
+    const uvec4 field_mask = uvec4(0x1F, 0x1F, 0x1F, 0x1);
+    float depth = sample_c().r;
+    uvec4 fields = uvec4(vec4(depth) * shift) & field_mask;
+    SV_Target0 = vec4(fields) / vec4(32.0f, 32.0f, 32.0f, 1.0f);
+}
+#endif
+
+#ifdef ps_main13
+void ps_main13()
+{
+    // Convert a RGBA texture into a float depth texture: each channel carries 8 bits of the
+    // depth, weighted 2^-32, 2^-24, 2^-16 and 2^-8 (x255 to denormalize). FIXME: accuracy is unverified
+    const vec4 bitSh = vec4(exp2(-32.0f), exp2(-24.0f), exp2(-16.0f), exp2(-8.0f)) * vec4(255.0);
+    gl_FragDepth = dot(sample_c(), bitSh);
+}
+#endif
+
+#ifdef ps_main14
+void ps_main14()
+{
+    // 24 bits Z variant: rebuild a float depth from the R, G and B channels
+    // only; the alpha channel is ignored.
+    // FIXME: accuracy of this float reconstruction is unverified
+    const vec3 weights = vec3(exp2(-32.0f), exp2(-24.0f), exp2(-16.0f)) * vec3(255.0);
+    vec3 rgb = sample_c().rgb;
+    gl_FragDepth = dot(rgb, weights);
+}
+#endif
+
+#ifdef ps_main15
+void ps_main15()
+{
+    // 16 bits Z variant: rebuild a float depth from the R and G channels
+    // only; the blue and alpha channels are ignored.
+    // FIXME: accuracy of this float reconstruction is unverified
+    const vec2 weights = vec2(exp2(-32.0f), exp2(-24.0f)) * vec2(255.0);
+    vec2 rg = sample_c().rg;
+    gl_FragDepth = dot(rg, weights);
+}
+#endif
+
+#ifdef ps_main16
+void ps_main16()
+{
+    // Convert a RGB5A1 (saved as RGBA8) color to a 16 bit Z. FIXME: accuracy is unverified
+    // Bit weights are powers of two for all four fields: 2^-32 (R), 2^-27 (G), 2^-22 (B), 2^-17 (A)
+    const vec4 bitSh = vec4(exp2(-32.0f), exp2(-27.0f), exp2(-22.0f), exp2(-17.0f));
+    // Trunc color to drop useless lsb (keeps 5 bits for R/G/B and 1 bit for A)
+    vec4 color = trunc(sample_c() * vec4(255.0f) / vec4(8.0f, 8.0f, 8.0f, 128.0f));
+    gl_FragDepth = dot(vec4(color), bitSh);
+}
+#endif
+
+#ifdef ps_main17
+void ps_main17()
+{
+    // Writes, for every output pixel, one channel of one of two fetched source texels (cN or cH).
+    // Potential speed optimization. There is a high probability that
+    // game only want to extract a single channel (blue). It will allow
+    // to remove most of the conditional operation and yield a +2/3 fps
+    // boost on MGS3
+    //
+    // Hypothesis wrong in Prince of Persia ... Seriously WTF !
+    //#define ONLY_BLUE;
+
+    // Convert a RGBA texture into a 8 bits packed texture
+    // Input column: 8x2 RGBA pixels
+    // 0: 8 RGBA
+    // 1: 8 RGBA
+    // Output column: 16x4 Index pixels
+    // 0: 8 R | 8 B
+    // 1: 8 R | 8 B
+    // 2: 8 G | 8 A
+    // 3: 8 G | 8 A
+    float c;
+
+    uvec2 sel = uvec2(gl_FragCoord.xy) % uvec2(16u, 16u);
+    ivec2 tb  = ((ivec2(gl_FragCoord.xy) & ~ivec2(15, 3)) >> 1);
+
+    int ty   = tb.y | (int(gl_FragCoord.y) & 1);
+    int txN  = tb.x | (int(gl_FragCoord.x) & 7);
+    int txH  = tb.x | ((int(gl_FragCoord.x) + 4) & 7);
+    // txH is the counterpart of txN shifted by 4 texels, wrapping inside the same group of 8
+    txN *= ScalingFactor.x;
+    txH *= ScalingFactor.x;
+    ty  *= ScalingFactor.y;
+
+    // TODO investigate texture gather
+    vec4 cN = texelFetch(TextureSampler, ivec2(txN, ty), 0);
+    vec4 cH = texelFetch(TextureSampler, ivec2(txH, ty), 0);
+
+    // Bit 2 of sel.y selects the column; in a column the first 2 lines read R/B, the last 2 read G/A (sel.x < 8 picks the first of each pair)
+    if ((sel.y & 4u) == 0u) {
+        // Column 0 and 2
+#ifdef ONLY_BLUE
+        c = cN.b;
+#else
+        if ((sel.y & 3u) < 2u) {
+            // first 2 lines of the col
+            if (sel.x < 8u)
+                c = cN.r;
+            else
+                c = cN.b;
+        } else {
+            if (sel.x < 8u)
+                c = cH.g;
+            else
+                c = cH.a;
+        }
+#endif
+    } else {
+#ifdef ONLY_BLUE
+        c = cH.b;
+#else
+        // Column 1 and 3
+        if ((sel.y & 3u) < 2u) {
+            // first 2 lines of the col
+            if (sel.x < 8u)
+                c = cH.r;
+            else
+                c = cH.b;
+        } else {
+            if (sel.x < 8u)
+                c = cN.g;
+            else
+                c = cN.a;
+        }
+#endif
+    }
+
+
+    SV_Target0 = vec4(c);
+}
+#endif
+
+#ifdef ps_main7
+void ps_main7()
+{
+    // Keep RGB as sampled and store a weighted sum of the three colour
+    // channels (weights 0.299 / 0.587 / 0.114) in alpha.
+    vec4 texel = sample_c();
+    float weighted = dot(texel.rgb, vec3(0.299, 0.587, 0.114));
+    SV_Target0 = vec4(texel.rgb, weighted);
+}
+#endif
+
+#ifdef ps_main5
+vec4 ps_scanlines(uint i)
+{
+    // Entry 0 selects the three colour channels, entry 1 selects nothing
+    vec4 row_mask[2] = vec4[2](
+        vec4(1, 1, 1, 0),
+        vec4(0, 0, 0, 0));
+    // mask + 0.5 clamped to [0;1]: gain is 1.0 where the mask is set and 0.5 elsewhere
+    vec4 gain = clamp(row_mask[i] + 0.5f, 0.0f, 1.0f);
+    return sample_c() * gain;
+}
+
+void ps_main5() // scanlines
+{
+    // Alternate between the two scanline masks on every window-space row
+    highp uint row = uint(gl_FragCoord.y);
+    uint parity = row % 2u;
+
+    SV_Target0 = ps_scanlines(parity);
+}
+#endif
+
+#ifdef ps_main6
+void ps_main6() // diagonal
+{
+    // The mask index cycles 0..2 along x and is shifted by the row number, modulo 3
+    highp uvec2 px = uvec2(gl_FragCoord.xy);
+    uint row_shift = px.y % 3u;
+    uint mask_index = (px.x + row_shift) % 3u;
+    SV_Target0 = ps_crt(mask_index);
+}
+#endif
+
+#ifdef ps_main8
+void ps_main8() // triangular
+{
+    highp uvec2 px = uvec2(gl_FragCoord.xy);
+    // Every other pair of rows is shifted by 3 pixels before x is halved
+    uint pair_shift = ((px.y >> 1u) & 1u) * 3u;
+    uint mask_index = ((px.x + pair_shift) >> 1u) % 3u;
+    SV_Target0 = ps_crt(mask_index);
+}
+#endif
+
+#ifdef ps_main9
+void ps_main9()
+{
+    const float PI = 3.14159265359f;
+    float tex_height = float(textureSize(TextureSampler, 0).y);
+    float v = PSin_t.y;
+
+    if (dFdy(v) * v > 0.5f) {
+        SV_Target0 = sample_c();
+    } else {
+        // Brightness follows a cosine with one period per texel row (factor in 0.5 .. 1.3),
+        // and the fetch is snapped to the vertical centre of that row.
+        float row = floor(v * tex_height);
+        float factor = (0.9f - 0.4f * cos(2.0f * PI * v * tex_height));
+        vec2 snapped = vec2(PSin_t.x, (row + 0.5f) / tex_height);
+        SV_Target0 = factor * texture(TextureSampler, snapped);
+    }
+}
+#endif
+
+// Used for DATE (stencil)
+// DATM == 1
+#ifdef ps_main2
+void ps_main2()
+{
+    float alpha = sample_c().a; // only alpha values >= 0x80 pass
+    if (alpha < (127.5f / 255.0f)) discard;
+}
+#endif
+
+// Used for DATE (stencil)
+// DATM == 0
+#ifdef ps_main3
+void ps_main3()
+{
+    float alpha = sample_c().a; // only alpha values < 0x80 pass (== 0x80 should not pass)
+    if ((127.5f / 255.0f) < alpha) discard;
+}
+#endif
+
+#ifdef ps_main4
+void ps_main4()
+{
+    SV_Target0 = mod(round(sample_c() * 255.0f), 256.0f) / 255.0f; // denormalize to 0..255, wrap modulo 256, renormalize
+}
+#endif
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/fxaa.fx b/plugins/GSdx_legacy/res/glsl/fxaa.fx
new file mode 120000
index 0000000000..f8a26fe1af
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/fxaa.fx
@@ -0,0 +1 @@
+../fxaa.fx
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/res/glsl/interlace.glsl b/plugins/GSdx_legacy/res/glsl/interlace.glsl
new file mode 100644
index 0000000000..8a79806fef
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/interlace.glsl
@@ -0,0 +1,67 @@
+//#version 420 // Keep it for editor detection
+
+struct vertex_basic
+{
+    vec4 p;
+    vec2 t;
+};
+
+in SHADER
+{
+    vec4 p;
+    vec2 t;
+} PSin;
+
+#define PSin_p (PSin.p)
+#define PSin_t (PSin.t)
+
+#ifdef FRAGMENT_SHADER
+
+layout(location = 0) out vec4 SV_Target0;
+
+layout(std140, binding = 11) uniform cb11
+{
+    vec2 ZrH;
+    float hH;
+};
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+
+// TODO ensure that clip (discard) is < 0 and not <= 0 ???
+void ps_main0()
+{
+    // Sample before the conditional discard: implicit derivatives are undefined
+    // once control flow becomes non-uniform.
+    // see: http://www.opengl.org/wiki/GLSL_Sampler#Non-uniform_flow_control
+    vec4 c = texture(TextureSampler, PSin_t);
+    if (fract(PSin_t.y * hH) - 0.5 < 0.0) // drop the fragment when the fractional part of t.y * hH is below 0.5
+        discard;
+    SV_Target0 = c;
+}
+
+void ps_main1()
+{
+    // Sample before the conditional discard: implicit derivatives are undefined
+    // once control flow becomes non-uniform.
+    // see: http://www.opengl.org/wiki/GLSL_Sampler#Non-uniform_flow_control
+    vec4 c = texture(TextureSampler, PSin_t);
+    if (0.5 - fract(PSin_t.y * hH) < 0.0) // drop the fragment when the fractional part of t.y * hH is above 0.5
+        discard;
+    SV_Target0 = c;
+}
+
+void ps_main2()
+{
+    // 1-2-1 weighted average of the texel and its two neighbours at -ZrH / +ZrH
+    vec4 before = texture(TextureSampler, PSin_t - ZrH);
+    vec4 centre = texture(TextureSampler, PSin_t);
+    vec4 after  = texture(TextureSampler, PSin_t + ZrH);
+    SV_Target0 = (before + centre * 2.0f + after) / 4.0f;
+}
+
+void ps_main3()
+{
+    SV_Target0 = texture(TextureSampler, PSin_t); // plain copy of the sampled texel
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/merge.glsl b/plugins/GSdx_legacy/res/glsl/merge.glsl
new file mode 100644
index 0000000000..31ba8abdc3
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/merge.glsl
@@ -0,0 +1,44 @@
+//#version 420 // Keep it for editor detection
+
+struct vertex_basic
+{
+    vec4 p;
+    vec2 t;
+};
+
+in SHADER
+{
+    vec4 p;
+    vec2 t;
+} PSin;
+
+#define PSin_p (PSin.p)
+#define PSin_t (PSin.t)
+
+#ifdef FRAGMENT_SHADER
+
+layout(location = 0) out vec4 SV_Target0;
+
+layout(std140, binding = 10) uniform cb10
+{
+    vec4 BGColor;
+};
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+
+void ps_main0()
+{
+    vec4 texel = texture(TextureSampler, PSin_t);
+    // Alpha is doubled without clamping here.
+    // Note: clamping will be done by fixed unit
+    SV_Target0 = vec4(texel.rgb, texel.a * 2.0f);
+}
+
+void ps_main1()
+{
+    // Sampled colour with its alpha replaced by the alpha of the BGColor uniform
+    vec3 rgb = texture(TextureSampler, PSin_t).rgb;
+    SV_Target0 = vec4(rgb, BGColor.a);
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/nvidia_throughput.txt b/plugins/GSdx_legacy/res/glsl/nvidia_throughput.txt
new file mode 100644
index 0000000000..ee824f8d19
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/nvidia_throughput.txt
@@ -0,0 +1,26 @@
+Table 2. Throughput of Native Arithmetic Instructions. (Number of Operations per Clock Cycle per Multiprocessor) Compute Capability
+
+Architecture                                                                    , FER , FER , KPL , MAX
+32-bit floating-point add multiply multiply-add                                 , 32  , 48  , 192 , 128
+64-bit floating-point add multiply multiply-add                                 , 16  , 4   , 8   , 1
+32-bit floating-point reciprocal reciprocal square root log2f/exp2f/sine/cosine , 4   , 8   , 32  , 32
+32-bit integer add extended-precision add subtract extended-precision subtract  , 32  , 48  , 160 , 128
+32-bit integer multiply multiply-add extended-precision multiply-add            , 16  , 16  , 32  , Multiple instructions
+32-bit integer shift                                                            , 16  , 16  , 32  , 64
+compare minimum maximum                                                         , 32  , 48  , 160 , 64
+32-bit integer bit reverse bit field extract/insert                             , 16  , 16  , 32  , 64
+32-bit bitwise AND / OR / XOR                                                   , 32  , 160 , 160 , 128
+count of leading zeros most significant non-sign bit                            , 16  , 16  , 32  , Multiple instructions
+population count                                                                , 16  , 16  , 32  , 32
+warp shuffle                                                                    , N/A , N/A , 32  , 32
+sum of absolute difference                                                      , 16  , 16  , 32  , 64
+SIMD video instructions vabsdiff2                                               , N/A , N/A , 160 , Multiple instructions
+SIMD video instructions vabsdiff4                                               , N/A , N/A , 160 , Multiple instructions
+All other SIMD video instructions                                               , 16  , 16  , 32  , Multiple instructions
+Type conversions from 8/16-bit integer to 32-bit types                          , 16  , 16  , 128 , 32
+Type conversions from and to 64-bit types                                       , 16  , 4   , 8   , 4
+All other type conversions                                                      , 16  , 16  , 32  , 32
+
+
+Some tips:
+* bit field operations are as fast as shift operations.
diff --git a/plugins/GSdx_legacy/res/glsl/shadeboost.glsl b/plugins/GSdx_legacy/res/glsl/shadeboost.glsl
new file mode 100644
index 0000000000..717e8fe133
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/shadeboost.glsl
@@ -0,0 +1,71 @@
+//#version 420 // Keep it for editor detection
+
+/*
+** Contrast, saturation, brightness
+** Code of this function is from TGM's shader pack
+** http://irrlicht.sourceforge.net/phpBB2/viewtopic.php?t=21057
+** TGM's author comment about the license (included in the previous link)
+** "do with it, what you want! its total free!
+** (but would be nice, if you say that you used my shaders  :wink: ) but not necessary"
+*/
+
+struct vertex_basic
+{
+    vec4 p;
+    vec2 t;
+};
+
+#ifdef FRAGMENT_SHADER
+
+in SHADER
+{
+    vec4 p;
+    vec2 t;
+} PSin;
+
+#define PSin_p (PSin.p)
+#define PSin_t (PSin.t)
+
+layout(location = 0) out vec4 SV_Target0;
+
+layout(std140, binding = 12) uniform cb12
+{
+    vec4 BGColor;
+};
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+
+// For all settings: 1.0 = 100% 0.5=50% 1.5 = 150% 
+vec4 ContrastSaturationBrightness(vec4 color)
+{
+    // Each SB_* macro is divided by 50, so a macro value of 50 gives a
+    // factor of 1.0 (100%).
+    const float saturation = SB_SATURATION / 50.0;
+    const float brightness = SB_BRIGHTNESS / 50.0;
+    const float contrast   = SB_CONTRAST / 50.0;
+
+    // Pivot of the contrast blend; increase or decrease the components to
+    // adjust the r, g and b color channels separately
+    const vec3 pivot = vec3(0.5, 0.5, 0.5);
+
+    const vec3 luma_weights = vec3(0.2125, 0.7154, 0.0721);
+
+    // brightness scales the colour, saturation blends away from its grey
+    // level, contrast blends away from the pivot
+    vec3 bright = color.rgb * brightness;
+    vec3 grey = vec3(dot(bright, luma_weights));
+    vec3 saturated = mix(grey, bright, saturation);
+    vec3 contrasted = mix(pivot, saturated, contrast);
+
+    return vec4(contrasted, color.a);
+}
+
+
+void ps_main()
+{
+    // Sample the source texel and output its contrast/saturation/brightness adjusted colour
+    SV_Target0 = ContrastSaturationBrightness(texture(TextureSampler, PSin_t));
+}
+
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/tfx_fs.glsl b/plugins/GSdx_legacy/res/glsl/tfx_fs.glsl
new file mode 100644
index 0000000000..cb756352d0
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/tfx_fs.glsl
@@ -0,0 +1,614 @@
+//#version 420 // Keep it for text editor detection
+
+// Require for bit operation
+//#extension GL_ARB_gpu_shader5 : enable
+
+#define FMT_32 0
+#define FMT_24 1
+#define FMT_16 2
+
+#define PS_PAL_FMT (PS_TEX_FMT >> 2)
+#define PS_AEM_FMT (PS_TEX_FMT & 3)
+
+// APITRACE_DEBUG enables forced pixel output to easily detect
+// the fragment computed by primitive
+#define APITRACE_DEBUG 0
+// TEX_COORD_DEBUG output the uv coordinate as color. It is useful
+// to detect bad sampling due to upscaling
+//#define TEX_COORD_DEBUG
+// Just copy directly the texture coordinate
+#ifdef TEX_COORD_DEBUG
+#define PS_TFX 1
+#define PS_TCC 1
+#endif
+
+#define SW_BLEND (PS_BLEND_A || PS_BLEND_B || PS_BLEND_D)
+
+#ifdef FRAGMENT_SHADER
+
+in SHADER
+{
+    vec4 t_float;
+    vec4 t_int;
+    vec4 c;
+    flat vec4 fc;
+} PSin;
+
+#define PSin_c (PSin.c)
+#define PSin_fc (PSin.fc)
+
+// Same buffer but 2 colors for dual source blending
+layout(location = 0, index = 0) out vec4 SV_Target0;
+layout(location = 0, index = 1) out vec4 SV_Target1;
+
+layout(binding = 0) uniform sampler2D TextureSampler;
+layout(binding = 1) uniform sampler2D PaletteSampler;
+layout(binding = 3) uniform sampler2D RtSampler; // note 2 already use by the image below
+
+#ifndef DISABLE_GL42_image
+#if PS_DATE > 0
+// FIXME how to declare memory access
+layout(r32i, binding = 2) uniform iimage2D img_prim_min;
+// WARNING:
+// You can't enable it if you discard the fragment. The depth is still
+// updated (shadow in Shin Megami Tensei Nocturne)
+//
+// early_fragment_tests must still be enabled in the first pass of the 2 passes algo
+// First pass search the first primitive that will write the bad alpha value. Value
+// won't be written if the fragment fails the depth test.
+//
+// In theory the best solution will be do
+// 1/ copy the depth buffer
+// 2/ do the full depth (current depth writes are disabled)
+// 3/ restore the depth buffer for 2nd pass
+// Of course, it is likely too costly.
+#if PS_DATE == 1 || PS_DATE == 2
+layout(early_fragment_tests) in;
+#endif
+
+// I don't remember why I set this parameter but it is surely useless
+//layout(pixel_center_integer) in vec4 gl_FragCoord;
+#endif
+#else
+// use basic stencil
+#endif
+
+
+// Warning: this uniform block is duplicated in both GLSL files (tfx_fs.glsl and tfx_vgs.glsl)
+layout(std140, binding = 21) uniform cb21
+{
+    vec3 FogColor;
+    float AREF;
+
+    vec4 WH;
+
+    vec2 TA;
+    float _pad0;
+    float Af;
+
+    uvec4 MskFix;
+
+    uvec4 FbMask;
+
+    vec4 HalfTexel;
+
+    vec4 MinMax;
+
+    vec2 TextureScale;
+    vec2 TC_OffsetHack;
+};
+
+vec4 sample_c(vec2 uv)
+{
+    return texture(TextureSampler, uv); // texel of the main texture at uv
+}
+
+vec4 sample_p(float idx)
+{
+    return texture(PaletteSampler, vec2(idx, 0.0f)); // palette lookup: idx is the x coordinate, y is always 0
+}
+
+vec4 clamp_wrap_uv(vec4 uv)
+{
+    vec4 uv_out = uv;
+    // Only modes 2 (clamp to MinMax) and 3 (mask/fix with MskFix) are handled here; other modes return uv unchanged
+#if PS_WMS == PS_WMT
+    // Same mode on both axes: all four components are processed at once
+#if PS_WMS == 2
+    uv_out = clamp(uv, MinMax.xyxy, MinMax.zwzw);
+#elif PS_WMS == 3
+    uv_out = vec4((ivec4(uv * WH.xyxy) & ivec4(MskFix.xyxy)) | ivec4(MskFix.zwzw)) / WH.xyxy;
+#endif
+
+#else // PS_WMS != PS_WMT
+    // Different modes: PS_WMS drives the x/z components and PS_WMT the y/w components
+#if PS_WMS == 2
+    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);
+
+#elif PS_WMS == 3
+    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;
+
+#endif
+
+#if PS_WMT == 2
+    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);
+
+#elif PS_WMT == 3
+
+    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;
+#endif
+
+#endif
+
+    return uv_out;
+}
+
+mat4 sample_4c(vec4 uv)
+{
+    // uv packs two coordinate pairs (xy and zw); the four combinations of
+    // their components are sampled, one texel per matrix column.
+    //
+    // Note: texture gather can't be used because of special clamping/wrapping
+    // Also it doesn't support lod
+    vec4 tex_xy = sample_c(uv.xy);
+    vec4 tex_zy = sample_c(uv.zy);
+    vec4 tex_xw = sample_c(uv.xw);
+    vec4 tex_zw = sample_c(uv.zw);
+    return mat4(tex_xy, tex_zy, tex_xw, tex_zw);
+}
+
+vec4 sample_4_index(vec4 uv)
+{
+    // The index is always read from the alpha channel:
+    // - either GSdx sent a texture that contains a single channel, in which
+    //   case the red channel is remapped as alpha channel,
+    // - or the source is an old RT (ie RGBA8) that contains index (4/8) in
+    //   the alpha channel.
+    //
+    // Note: texture gather can't be used because of special clamping/wrapping
+    // Also it doesn't support lod
+    vec4 index = vec4(sample_c(uv.xy).a,
+                      sample_c(uv.zy).a,
+                      sample_c(uv.xw).a,
+                      sample_c(uv.zw).a);
+
+    // Denormalized (0..255) copy, used by the 4 bits formats below
+    uvec4 raw = uvec4(index * 255.0f + 0.5f);
+
+#if PS_PAL_FMT == 1
+    // 4HL: keep the low nibble
+    return vec4(raw & 0xFu) / 255.0f;
+
+#elif PS_PAL_FMT == 2
+    // 4HH: keep the high nibble
+    return vec4(raw >> 4u) / 255.0f;
+
+#else
+    // 8 bits: most of texture will hit this code so the normalized float
+    // value is returned untouched
+
+    return index;
+#endif
+
+}
+
+mat4 sample_4p(vec4 u)
+{
+    // One palette lookup per component of u; column k of the result
+    // holds the palette colour selected by component k.
+    vec4 col0 = sample_p(u.x);
+    vec4 col1 = sample_p(u.y);
+    vec4 col2 = sample_p(u.z);
+    vec4 col3 = sample_p(u.w);
+
+    return mat4(col0, col1, col2, col3);
+}
+
+vec4 sample_color(vec2 st)
+{
+#if (PS_TCOFFSETHACK == 1)
+    st += TC_OffsetHack.xy;
+#endif
+    // Returns the texture colour at st, denormalized to the 0..255 range (see the final trunc)
+    vec4 t;
+    mat4 c;
+    vec2 dd;
+
+    // FIXME I'm not sure this condition is useful (I think code will be optimized)
+#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)
+    // No software LTF and pure 32 bits RGBA texture without special texture wrapping
+    c[0] = sample_c(st);
+#ifdef TEX_COORD_DEBUG
+    c[0].rg = st.xy;
+#endif
+
+#else
+    vec4 uv;
+
+    if(PS_LTF != 0)
+    {
+        uv = st.xyxy + HalfTexel;
+        dd = fract(uv.xy * WH.zw);
+#if (PS_FST == 0)
+        // Background in Shin Megami Tensei Lucifers
+        // I suspect that uv isn't a standard number, so fract is outside of the [0;1] range
+        // Note: it is free on GPU but let's do it only for float coordinate
+        // Strangely Dx doesn't suffer from this issue.
+        dd = clamp(dd, vec2(0.0f), vec2(1.0f));
+#endif
+    }
+    else
+    {
+        uv = st.xyxy;
+    }
+
+    uv = clamp_wrap_uv(uv);
+
+#if PS_PAL_FMT != 0
+    c = sample_4p(sample_4_index(uv));
+#else
+    c = sample_4c(uv);
+#endif
+
+#ifdef TEX_COORD_DEBUG
+    c[0].rg = uv.xy;
+    c[1].rg = uv.xy;
+    c[2].rg = uv.xy;
+    c[3].rg = uv.xy;
+#endif
+
+#endif
+    // Alpha is rewritten below for the 24 and 16 bits formats only (the loop body is empty for FMT_32)
+	// PERF note: using dot product reduces by 1 the number of instruction
+	// but I'm not sure it is equivalent neither faster.
+	for (int i = 0; i < 4; i++)
+	{
+        //float sum = dot(c[i].rgb, vec3(1.0f));
+#if (PS_AEM_FMT == FMT_24)
+		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;
+		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#elif (PS_AEM_FMT == FMT_16)
+		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;
+		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;
+#endif
+    }
+
+#if(PS_LTF != 0)
+    t = mix(mix(c[0], c[1], dd.x), mix(c[2], c[3], dd.x), dd.y);
+#else
+    t = c[0];
+#endif
+
+    // The 0.05f helps to fix the overbloom of sotc
+    // I think the issue is related to the rounding of texture coordinate. The linear (from fixed unit)
+    // interpolation could be slightly below the correct one.
+    return trunc(t * 255.0f + 0.05f);
+}
+
+vec4 tfx(vec4 T, vec4 C)
+{
+    vec4 C_out;
+    vec4 FxT = trunc(trunc(C) * T / 128.0f);
+    // FxT = trunc(C) * T / 128, truncated; modes 0, 2 and 3 build on it, mode 1 returns T, any other mode returns C
+#if (PS_TFX == 0)
+    C_out = FxT;
+#elif (PS_TFX == 1)
+    C_out = T;
+#elif (PS_TFX == 2)
+    C_out.rgb = FxT.rgb + C.a;
+    C_out.a = T.a + C.a;
+#elif (PS_TFX == 3)
+    C_out.rgb = FxT.rgb + C.a;
+    C_out.a = T.a;
+#else
+    C_out = C;
+#endif
+    // PS_TCC == 0: the output alpha always comes from C, whatever the mode above
+#if (PS_TCC == 0)
+    C_out.a = C.a;
+#endif
+
+#if (PS_TFX == 0) || (PS_TFX == 2) || (PS_TFX == 3)
+    // Clamp only when it is useful
+    C_out = min(C_out, 255.0f);
+#endif
+
+    return C_out;
+}
+
+void atst(vec4 C)
+{
+    // FIXME use integer cmp
+    float a = C.a;
+    // Every test discards when its expression is < 0; the +/-0.5 offsets presumably assume integral a and AREF - TODO confirm
+#if (PS_ATST == 0) // never
+    discard;
+#elif (PS_ATST == 1) // always
+    // nothing to do
+#elif (PS_ATST == 2) // l
+    if ((AREF - a - 0.5f) < 0.0f)
+        discard;
+#elif (PS_ATST == 3 ) // le
+    if ((AREF - a + 0.5f) < 0.0f)
+        discard;
+#elif (PS_ATST == 4) // e
+    if ((0.5f - abs(a - AREF)) < 0.0f)
+        discard;
+#elif (PS_ATST == 5) // ge
+    if ((a-AREF + 0.5f) < 0.0f)
+        discard;
+#elif (PS_ATST == 6) // g
+    if ((a-AREF - 0.5f) < 0.0f)
+        discard;
+#elif (PS_ATST == 7) // ne
+    if ((abs(a - AREF) - 0.5f) < 0.0f)
+        discard;
+#endif
+}
+
+void fog(inout vec4 C, float f)
+{
+#if PS_FOG != 0
+    C.rgb = trunc(mix(FogColor, C.rgb, f)); // f = 0 gives FogColor, f = 1 keeps C.rgb; alpha is untouched
+#endif
+}
+
+vec4 ps_color()
+{
+    //FIXME: maybe we can set gl_Position.w = q in VS
+#if (PS_FST == 0)
+    vec4 T = sample_color(PSin.t_float.xy / vec2(PSin.t_float.w));
+#else
+    // Note xy are normalized coordinate
+    vec4 T = sample_color(PSin.t_int.xy);
+#endif
+    // PS_IIP selects the interpolated colour (c) or the flat colour (fc) as vertex colour
+#if PS_IIP == 1
+    vec4 C = tfx(T, PSin_c);
+#else
+    vec4 C = tfx(T, PSin_fc);
+#endif
+    // atst() may discard the fragment
+    atst(C);
+    // the fog factor is carried in t_float.z
+    fog(C, PSin.t_float.z);
+
+#if (PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes
+    C.rgb = vec3(255.0f);
+#endif
+
+    return C;
+}
+
+void ps_fbmask(inout vec4 C)
+{
+    // FIXME do I need special case for 16 bits
+#if PS_FBMASK
+    vec4 RT = trunc(texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0) * 255.0f + 0.1f); // current render target texel, denormalized to 0..255
+    C = vec4((uvec4(C) & ~FbMask) | (uvec4(RT) & FbMask)); // bits set in FbMask keep the render target value, the others take C
+#endif
+}
+
+void ps_blend(inout vec4 Color, float As)
+{
+#if SW_BLEND
+    vec4 RT = trunc(texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0) * 255.0f + 0.1f);
+
+#if PS_DFMT == FMT_24
+    float Ad = 1.0f;
+#else
+    // FIXME FMT_16 case
+    // FIXME Ad or Ad * 2?
+    float Ad = RT.a / 128.0f;
+#endif
+
+    // Let the compiler do its jobs !
+    vec3 Cd = RT.rgb;
+    vec3 Cs = Color.rgb;
+    // Blend equation: Color.rgb = (A - B) * C + D, each operand being selected by PS_BLEND_A..D
+#if PS_BLEND_A == 0
+    vec3 A = Cs;
+#elif PS_BLEND_A == 1
+    vec3 A = Cd;
+#else
+    vec3 A = vec3(0.0f);
+#endif
+
+#if PS_BLEND_B == 0
+    vec3 B = Cs;
+#elif PS_BLEND_B == 1
+    vec3 B = Cd;
+#else
+    vec3 B = vec3(0.0f);
+#endif
+
+#if PS_BLEND_C == 0
+    float C = As;
+#elif PS_BLEND_C == 1
+    float C = Ad;
+#else
+    float C = Af;
+#endif
+
+#if PS_BLEND_D == 0
+    vec3 D = Cs;
+#elif PS_BLEND_D == 1
+    vec3 D = Cd;
+#else
+    vec3 D = vec3(0.0f);
+#endif
+    // A == B cancels the (A - B) * C term, so only D remains
+#if PS_BLEND_A == PS_BLEND_B
+    Color.rgb = D;
+#else
+    Color.rgb = trunc((A - B) * C + D);
+#endif
+
+    // FIXME dithering
+
+    // Correct the Color value based on the output format
+#if PS_COLCLIP == 0 && PS_HDR == 0
+    // Standard Clamp
+    Color.rgb = clamp(Color.rgb, vec3(0.0f), vec3(255.0f));
+#endif
+
+    // FIXME rounding of negative float?
+    // compiler uses trunc but it might need floor
+
+    // Warning: normally blending equation is mult(A, B) = A * B >> 7. GPU have the full accuracy
+    // GS: Color = 1, Alpha = 255 => output 1
+    // GPU: Color = 1/255, Alpha = 255/255 * 255/128 => output 1.9921875
+#if PS_DFMT == FMT_16
+    // In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania
+
+    Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xF8));
+#elif PS_COLCLIP == 1 && PS_HDR == 0
+    Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xFF));
+#endif
+
+#endif
+}
+
+void ps_main()
+{
+#if ((PS_DATE & 3) == 1 || (PS_DATE & 3) == 2)
+
+#if PS_WRITE_RG == 1
+    // Pseudo 16 bits access.
+    float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).g;
+#else
+    float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;
+#endif
+
+#if (PS_DATE & 3) == 1
+    // DATM == 0: Pixel with alpha equal to 1 will fail
+    bool bad = (127.5f / 255.0f) < rt_a;
+#elif (PS_DATE & 3) == 2
+    // DATM == 1: Pixel with alpha equal to 0 will fail
+    bool bad = rt_a < (127.5f / 255.0f);
+#endif
+
+    if (bad) {
+#if PS_DATE >= 5 || defined(DISABLE_GL42_image)
+        discard;
+#else
+        imageStore(img_prim_min, ivec2(gl_FragCoord.xy), ivec4(-1));
+        return;
+#endif
+    }
+
+#endif
+
+#if PS_DATE == 3 && !defined(DISABLE_GL42_image)
+    int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy)).r;
+    // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
+    // the bad alpha value so we must keep it.
+
+    if (gl_PrimitiveID > stencil_ceil) {
+        discard;
+    }
+#endif
+
+    vec4 C = ps_color();
+#if (APITRACE_DEBUG & 1) == 1
+    C.r = 255.0f;
+#endif
+#if (APITRACE_DEBUG & 2) == 2
+    C.g = 255.0f;
+#endif
+#if (APITRACE_DEBUG & 4) == 4
+    C.b = 255.0f;
+#endif
+#if (APITRACE_DEBUG & 8) == 8
+    C.a = 128.0f;
+#endif
+
+#if PS_SHUFFLE
+    uvec4 denorm_c = uvec4(C);
+    uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);
+
+    // Write RB part. Mask will take care of the correct destination
+#if PS_READ_BA
+    C.rb = C.bb;
+#else
+    C.rb = C.rr;
+#endif
+
+    // FIXME precompute my_TA & 0x80
+
+    // Write GA part. Mask will take care of the correct destination
+    // Note: GLSL 4.50/GL_EXT_shader_integer_mix support a mix instruction to select a component
+    // However Nvidia emulate it with an if (at least on kepler arch) ...
+#if PS_READ_BA
+    // bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
+    // uint my_ta = (bool(bitfieldExtract(denorm_c.a, 7, 1))) ? denorm_TA.y : denorm_TA.x;
+    // denorm_c.a = bitfieldInsert(denorm_c.a, bitfieldExtract(my_ta, 7, 1), 7, 1);
+    // c.ga = vec2(float(denorm_c.a));
+
+    if (bool(denorm_c.a & 0x80u))
+        C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
+    else
+        C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
+
+#else
+    if (bool(denorm_c.g & 0x80u))
+        C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
+    else
+        C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
+
+    // Nice idea but step/mix requires 4 instructions
+    // set / trunc / I2F / Mad
+    //
+    // float sel = step(128.0f, c.g);
+    // vec2 c_shuffle = vec2((denorm_c.gg & 0x7Fu) | (denorm_TA & 0x80u));
+    // c.ga = mix(c_shuffle.xx, c_shuffle.yy, sel);
+#endif
+
+#endif
+
+    // Must be done before alpha correction
+    float alpha_blend = C.a / 128.0f;
+
+    // Correct the ALPHA value based on the output format
+#if (PS_DFMT == FMT_16)
+    float A_one = 128.0f; // alpha output will be 0x80
+    C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;
+#elif (PS_DFMT == FMT_32) && (PS_FBA != 0)
+    if(C.a < 128.0f) C.a += 128.0f;
+#endif
+
+    // Get first primitive that will write a failing alpha value
+#if PS_DATE == 1 && !defined(DISABLE_GL42_image)
+    // DATM == 0
+    // Pixel with alpha equal to 1 will fail (128-255)
+    if (C.a > 127.5f) {
+        imageAtomicMin(img_prim_min, ivec2(gl_FragCoord.xy), gl_PrimitiveID);
+    }
+    return;
+#elif PS_DATE == 2 && !defined(DISABLE_GL42_image)
+    // DATM == 1
+    // Pixel with alpha equal to 0 will fail (0-127)
+    if (C.a < 127.5f) {
+        imageAtomicMin(img_prim_min, ivec2(gl_FragCoord.xy), gl_PrimitiveID);
+    }
+    return;
+#endif
+
+    ps_blend(C, alpha_blend);
+
+    ps_fbmask(C);
+
+#if PS_HDR == 1
+    // Use negative value to avoid overflow of the texture (in accumulation mode)
+    // Note: code were initially done for an Half-Float texture. Due to overflow
+    // the texture was upgraded to a full float. Maybe this code is useless now!
+    // Good testcase is castlevania
+    if (any(greaterThan(C.rgb, vec3(128.0f)))) {
+        C.rgb = (C.rgb - 256.0f);
+    }
+#endif
+    SV_Target0 = C / 255.0f; // colour is renormalized to 0..1 for the first blending source
+    SV_Target1 = vec4(alpha_blend); // second source (index 1) carries the alpha taken before correction
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl/tfx_vgs.glsl b/plugins/GSdx_legacy/res/glsl/tfx_vgs.glsl
new file mode 100644
index 0000000000..c89720d644
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl/tfx_vgs.glsl
@@ -0,0 +1,263 @@
+//#version 420 // Keep it for text editor detection
+
+layout(std140, binding = 20) uniform cb20
+{
+    vec2 VertexScale;
+    vec2 VertexOffset;
+    vec2 _removed_TextureScale;
+    vec2 PointSize;
+};
+
+// Warning: this uniform block is duplicated in both GLSL files, keep the copies identical
+layout(std140, binding = 21) uniform cb21
+{
+    vec3 FogColor;
+    float AREF;
+
+    vec4 WH;
+
+    vec2 TA;
+    float _pad0;
+    float Af;
+
+    uvec4 MskFix;
+
+    uvec4 FbMask;
+
+    vec4 HalfTexel;
+
+    vec4 MinMax;
+
+    vec2 TextureScale;
+    vec2 TC_OffsetHack;
+};
+
+#ifdef VERTEX_SHADER
+layout(location = 0) in vec2  i_st;
+layout(location = 2) in vec4  i_c;
+layout(location = 3) in float i_q;
+layout(location = 4) in uvec2 i_p;
+layout(location = 5) in uint  i_z;
+layout(location = 6) in uvec2 i_uv;
+layout(location = 7) in vec4  i_f;
+
+out SHADER
+{
+    vec4 t_float;
+    vec4 t_int;
+    vec4 c;
+    flat vec4 fc;
+} VSout;
+
+#define VSout_c (VSout.c)
+#define VSout_fc (VSout.fc)
+
+out gl_PerVertex {
+    vec4 gl_Position;
+    float gl_PointSize;
+#if !pGL_ES
+    float gl_ClipDistance[1];
+#endif
+};
+
+#ifdef ZERO_TO_ONE_DEPTH
+const float exp_min32 = exp2(-32.0f);
+#else
+const float exp_min31 = exp2(-31.0f);
+#endif
+
+void texture_coord()
+{
+    // With VS_WILDHACK == 1 every integer UV component is masked with 0x3FEF first.
+    uvec2 uv_bits = (VS_WILDHACK == 1) ? (i_uv & uvec2(0x3FEF, 0x3FEF)) : i_uv;
+    vec2 texel = vec2(uv_bits);
+
+    // Float path: S/T go to .xy and Q goes to .w (.z is written by vs_main).
+    VSout.t_float.w  = i_q;
+    VSout.t_float.xy = i_st;
+
+    // Integer path: .xy = UV scaled by TextureScale, .zw = unscaled UV.
+    VSout.t_int = vec4(texel * TextureScale, texel);
+}
+
+void vs_main()
+{
+    // Keep only the depth bits selected by VS_BPPZ (any other value keeps all 32 bits).
+    highp uint z = i_z;
+    if(VS_BPPZ == 1)      // 24 bits
+        z &= uint(0xffffff);
+    else if(VS_BPPZ == 2) // 16 bits
+        z &= uint(0xffff);
+    float zf = float(z);
+
+    // The position is biased by -0.05 (1/320 pixel) to avoid rounding problems; the integral
+    // part of the position is usually 5 digits, so 0.05 is about as low as we can go.
+    // Example: ceil(afterseveralvertextransformations(y = 133)) => 134 => line 133 stays empty.
+    // Input granularity is 1/16 pixel, anything smaller won't step drawing up/left by one pixel:
+    // 133.0625 (133 + 1/16) should start from line 134, ceil(133.0625 - 0.05) is still above 133.
+    vec2 biased = vec2(i_p) - vec2(0.05f, 0.05f);
+
+    vec4 p;
+    p.xy = biased * VertexScale - VertexOffset;
+    p.w = 1.0f;
+#ifdef ZERO_TO_ONE_DEPTH
+    // z scaled by 2^-32 (or log2(z) / 32): depth starts at 0
+    p.z = (VS_LOGZ == 1) ? max(0.0f, log2(zf)) / 32.0f
+                         : zf * exp_min32;
+#else
+    // z scaled by 2^-31 (or log2(z) / 31) then shifted by -1: depth starts at -1
+    p.z = (VS_LOGZ == 1) ? max(0.0f, log2(zf)) / 31.0f - 1.0f
+                         : zf * exp_min31 - 1.0f;
+#endif
+    gl_Position = p;
+
+    // Fills t_float.xyw and t_int
+    texture_coord();
+
+    // The same vertex color feeds both the interpolated and the flat output.
+    VSout_fc = i_c;
+    VSout_c = i_c;
+
+    // Fog is packed in the remaining z component of the float texture coordinate.
+    VSout.t_float.z = i_f.x;
+}
+
+#endif
+
+#ifdef GEOMETRY_SHADER
+
+in gl_PerVertex {
+    vec4 gl_Position;
+    float gl_PointSize;
+#if !pGL_ES
+    float gl_ClipDistance[1];
+#endif
+} gl_in[];
+//in int gl_PrimitiveIDIn;
+
+out gl_PerVertex {
+    vec4 gl_Position;
+    float gl_PointSize;
+#if !pGL_ES
+    float gl_ClipDistance[1];
+#endif
+};
+//out int gl_PrimitiveID;
+
+in SHADER
+{
+    vec4 t_float;
+    vec4 t_int;
+    vec4 c;
+    flat vec4 fc;
+} GSin[];
+
+out SHADER
+{
+    vec4 t_float;
+    vec4 t_int;
+    vec4 c;
+    flat vec4 fc;
+} GSout;
+
+layout(std140, binding = 22) uniform cb22
+{
+    vec4 rt_size;
+};
+
+
+struct vertex // local copy of the SHADER block members, without the flat `fc` one
+{
+    vec4 t_float; // float texture coordinate block (gs_main treats .zw as fog / texture perspective)
+    vec4 t_int;   // integer texture coordinate block
+    vec4 c;       // interpolated color
+};
+
+void out_vertex(in vertex v)
+{
+    // Flat color: taken from input vertex 0 for points, from input vertex 1 otherwise.
+#if GS_POINT == 1
+    GSout.fc = GSin[0].fc;
+#else
+    GSout.fc = GSin[1].fc;
+#endif
+    GSout.c       = v.c;       // interpolated attributes come from the caller's vertex
+    GSout.t_int   = v.t_int;
+    GSout.t_float = v.t_float;
+    gl_PrimitiveID = gl_PrimitiveIDIn; // forward the input primitive id unchanged
+    EmitVertex();                      // gl_Position must already be set by the caller
+}
+
+#if GS_POINT == 1
+layout(points) in;
+#else
+layout(lines) in;
+#endif
+layout(triangle_strip, max_vertices = 6) out;
+
+void gs_main()
+{
+    // Expands the input primitive into an axis-aligned rectangle emitted as two triangles.
+    // Line input: GSin[0] is the left top corner and GSin[1] is the right bottom corner.
+#if GS_POINT == 1
+    vertex rb = vertex(GSin[0].t_float, GSin[0].t_int, GSin[0].c);
+#else
+    vertex rb = vertex(GSin[1].t_float, GSin[1].t_int, GSin[1].c);
+#endif
+    vertex lt = vertex(GSin[0].t_float, GSin[0].t_int, GSin[0].c);
+
+#if GS_POINT == 1
+    vec4 rb_p = gl_in[0].gl_Position + vec4(PointSize.x, PointSize.y, 0.0f, 0.0f); // point input: opposite corner is offset by PointSize
+#else
+    vec4 rb_p = gl_in[1].gl_Position;
+#endif
+    vec4 lb_p = rb_p;
+    vec4 rt_p = rb_p;
+    vec4 lt_p = gl_in[0].gl_Position;
+
+#if GS_POINT == 0
+    // flat depth: every corner uses the depth of the right bottom vertex
+    lt_p.z = rb_p.z;
+    // flat fog and texture perspective (t_float.z and t_float.w)
+    lt.t_float.zw = rb.t_float.zw;
+    // flat color
+    lt.c = rb.c;
+#endif
+
+    // Left bottom corner: right bottom vertex with the x components (texture and position) of left top
+    vertex lb    = rb;
+    lb.t_float.x = lt.t_float.x;
+    lb.t_int.x   = lt.t_int.x;
+    lb.t_int.z   = lt.t_int.z;
+    lb_p.x       = lt_p.x;
+
+    vertex rt    = rb; // right top corner: right bottom vertex with the y components of left top
+    rt_p.y       = lt_p.y;
+    rt.t_float.y = lt.t_float.y;
+    rt.t_int.y   = lt.t_int.y;
+    rt.t_int.w   = lt.t_int.w;
+
+    // Triangle 1: left top, left bottom, right top
+    gl_Position = lt_p;
+    out_vertex(lt);
+
+    gl_Position = lb_p;
+    out_vertex(lb);
+
+    gl_Position = rt_p;
+    out_vertex(rt);
+    EndPrimitive();
+
+    // Triangle 2: left bottom, right top, right bottom
+    gl_Position = lb_p;
+    out_vertex(lb);
+
+    gl_Position = rt_p;
+    out_vertex(rt);
+
+    gl_Position = rb_p;
+    out_vertex(rb);
+    EndPrimitive();
+}
+
+#endif
diff --git a/plugins/GSdx_legacy/res/glsl_source.h b/plugins/GSdx_legacy/res/glsl_source.h
new file mode 100644
index 0000000000..2c86638ffb
--- /dev/null
+++ b/plugins/GSdx_legacy/res/glsl_source.h
@@ -0,0 +1,2097 @@
+/*
+ *  This file was generated by glsl2h.pl script
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "stdafx.h"
+
+static const char* convert_glsl =
+	"//#version 420 // Keep it for editor detection\n"
+	"\n"
+	"struct vertex_basic\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"};\n"
+	"\n"
+	"\n"
+	"#ifdef VERTEX_SHADER\n"
+	"\n"
+	"out gl_PerVertex {\n"
+	"    vec4 gl_Position;\n"
+	"    float gl_PointSize;\n"
+	"#if !pGL_ES\n"
+	"    float gl_ClipDistance[1];\n"
+	"#endif\n"
+	"};\n"
+	"\n"
+	"layout(location = 0) in vec2 POSITION;\n"
+	"layout(location = 1) in vec2 TEXCOORD0;\n"
+	"\n"
+	"// FIXME set the interpolation (don't know what dx do)\n"
+	"// flat means that there is no interpolation. The value given to the fragment shader is based on the provoking vertex conventions.\n"
+	"//\n"
+	"// noperspective means that there will be linear interpolation in window-space. This is usually not what you want, but it can have its uses.\n"
+	"//\n"
+	"// smooth, the default, means to do perspective-correct interpolation.\n"
+	"//\n"
+	"// The centroid qualifier only matters when multisampling. If this qualifier is not present, then the value is interpolated to the pixel's center, anywhere in the pixel, or to one of the pixel's samples. This sample may lie outside of the actual primitive being rendered, since a primitive can cover only part of a pixel's area. The centroid qualifier is used to prevent this; the interpolation point must fall within both the pixel's area and the primitive's area.\n"
+	"out SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} VSout;\n"
+	"\n"
+	"#define VSout_p (VSout.p)\n"
+	"#define VSout_t (VSout.t)\n"
+	"\n"
+	"void vs_main()\n"
+	"{\n"
+	"    VSout_p = vec4(POSITION, 0.5f, 1.0f);\n"
+	"    VSout_t = TEXCOORD0;\n"
+	"    gl_Position = vec4(POSITION, 0.5f, 1.0f); // NOTE I don't know if it is possible to merge POSITION_OUT and gl_Position\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#ifdef FRAGMENT_SHADER\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} PSin;\n"
+	"\n"
+	"#define PSin_p (PSin.p)\n"
+	"#define PSin_t (PSin.t)\n"
+	"\n"
+	"// Give a different name so I remember there is a special case!\n"
+	"#if defined(ps_main1) || defined(ps_main10)\n"
+	"layout(location = 0) out uint SV_Target1;\n"
+	"#else\n"
+	"layout(location = 0) out vec4 SV_Target0;\n"
+	"#endif\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"\n"
+	"layout(std140, binding = 15) uniform cb15\n"
+	"{\n"
+	"    ivec4 ScalingFactor;\n"
+	"};\n"
+	"\n"
+	"vec4 sample_c()\n"
+	"{\n"
+	"    return texture(TextureSampler, PSin_t);\n"
+	"}\n"
+	"\n"
+	"vec4 ps_crt(uint i)\n"
+	"{\n"
+	"    vec4 mask[4] = vec4[4]\n"
+	"        (\n"
+	"         vec4(1, 0, 0, 0),\n"
+	"         vec4(0, 1, 0, 0),\n"
+	"         vec4(0, 0, 1, 0),\n"
+	"         vec4(1, 1, 1, 0)\n"
+	"        );\n"
+	"    return sample_c() * clamp((mask[i] + 0.5f), 0.0f, 1.0f);\n"
+	"}\n"
+	"\n"
+	"#ifdef ps_main0\n"
+	"void ps_main0()\n"
+	"{\n"
+	"    SV_Target0 = sample_c();\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main1\n"
+	"void ps_main1()\n"
+	"{\n"
+	"    // Input Color is RGBA8\n"
+	"\n"
+	"    // We want to output a pixel on the PSMCT16* format\n"
+	"    // A1-BGR5\n"
+	"\n"
+	"#if 0\n"
+	"    // Note: dot is a good idea from pseudo. However we must be careful about float accuraccy.\n"
+	"    // Here a global idea example:\n"
+	"    //\n"
+	"    // SV_Target1 = dot(round(sample_c() * vec4(31.f, 31.f, 31.f, 1.f)), vec4(1.f, 32.f, 1024.f, 32768.f));\n"
+	"    //\n"
+	"\n"
+	"    // For me this code is more accurate but it will require some tests\n"
+	"\n"
+	"    vec4 c = sample_c() * 255.0f + 0.5f; // Denormalize value to avoid float precision issue\n"
+	"\n"
+	"    // shift Red: -3\n"
+	"    // shift Green: -3 + 5\n"
+	"    // shift Blue: -3 + 10\n"
+	"    // shift Alpha: -7 + 15\n"
+	"    highp uvec4 i = uvec4(c * vec4(1/8.0f, 4.0f, 128.0f, 256.0f)); // Shift value\n"
+	"\n"
+	"    // bit field operation requires GL4 HW. Could be nice to merge it with step/mix below\n"
+	"    SV_Target1 = (i.r & uint(0x001f)) | (i.g & uint(0x03e0)) | (i.b & uint(0x7c00)) | (i.a & uint(0x8000));\n"
+	"\n"
+	"#else\n"
+	"    // Old code which is likely wrong.\n"
+	"\n"
+	"    vec4 c = sample_c();\n"
+	"\n"
+	"    c.a *= 256.0f / 127.0f; // hm, 0.5 won't give us 1.0 if we just multiply with 2\n"
+	"\n"
+	"    highp uvec4 i = uvec4(c * vec4(uint(0x001f), uint(0x03e0), uint(0x7c00), uint(0x8000)));\n"
+	"\n"
+	"    // bit field operation requires GL4 HW.\n"
+	"    SV_Target1 = (i.x & uint(0x001f)) | (i.y & uint(0x03e0)) | (i.z & uint(0x7c00)) | (i.w & uint(0x8000));\n"
+	"#endif\n"
+	"\n"
+	"\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main10\n"
+	"void ps_main10()\n"
+	"{\n"
+	"    // Convert a GL_FLOAT32 depth texture into a 32 bits UINT texture\n"
+	"    SV_Target1 = uint(exp2(32.0f) * sample_c().r);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main11\n"
+	"void ps_main11()\n"
+	"{\n"
+	"    // Convert a GL_FLOAT32 depth texture into a RGBA color texture\n"
+	"    const vec4 bitSh = vec4(exp2(24.0f), exp2(16.0f), exp2(8.0f), exp2(0.0f));\n"
+	"    const vec4 bitMsk = vec4(0.0, 1.0/256.0, 1.0/256.0, 1.0/256.0);\n"
+	"\n"
+	"    vec4 res = fract(vec4(sample_c().r) * bitSh);\n"
+	"\n"
+	"    SV_Target0 = (res - res.xxyz * bitMsk) * 256.0f/255.0f;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main12\n"
+	"void ps_main12()\n"
+	"{\n"
+	"    // Convert a GL_FLOAT32 (only 16 lsb) depth into a RGB5A1 color texture\n"
+	"    const vec4 bitSh = vec4(exp2(32.0f), exp2(27.0f), exp2(22.0f), exp2(17.0f));\n"
+	"    const uvec4 bitMsk = uvec4(0x1F, 0x1F, 0x1F, 0x1);\n"
+	"    uvec4 color = uvec4(vec4(sample_c().r) * bitSh) & bitMsk;\n"
+	"\n"
+	"    SV_Target0 = vec4(color) / vec4(32.0f, 32.0f, 32.0f, 1.0f);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main13\n"
+	"void ps_main13()\n" // RGBA8 color -> 32 bits depth: depth = (R + G*2^8 + B*2^16 + A*2^24) / 2^32
+	"{\n"
+	"    // Convert a RRGBA texture into a float depth texture\n"
+	"    // FIXME: I'm afraid of the accuracy\n"
+	"    const vec4 bitSh = vec4(exp2(-32.0f), exp2(-24.0f), exp2(-16.0f), exp2(-8.0f)) * vec4(255.0);\n" // alpha weight is 2^-8: exp2(), not exp()
+	"    gl_FragDepth = dot(sample_c(), bitSh);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main14\n"
+	"void ps_main14()\n"
+	"{\n"
+	"    // Same as above but without the alpha channel (24 bits Z)\n"
+	"\n"
+	"    // Convert a RRGBA texture into a float depth texture\n"
+	"    // FIXME: I'm afraid of the accuracy\n"
+	"    const vec3 bitSh = vec3(exp2(-32.0f), exp2(-24.0f), exp2(-16.0f)) * vec3(255.0);\n"
+	"    gl_FragDepth = dot(sample_c().rgb, bitSh);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main15\n"
+	"void ps_main15()\n"
+	"{\n"
+	"    // Same as above but without the A/B channels (16 bits Z)\n"
+	"\n"
+	"    // Convert a RRGBA texture into a float depth texture\n"
+	"    // FIXME: I'm afraid of the accuracy\n"
+	"    const vec2 bitSh = vec2(exp2(-32.0f), exp2(-24.0f)) * vec2(255.0);\n"
+	"    gl_FragDepth = dot(sample_c().rg, bitSh);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main16\n"
+	"void ps_main16()\n" // RGB5A1 color -> 16 bits depth: depth = (R5 + G5*2^5 + B5*2^10 + A1*2^15) / 2^32
+	"{\n"
+	"    // Convert a RGB5A1 (saved as RGBA8) color to a 16 bit Z\n"
+	"    // FIXME: I'm afraid of the accuracy\n"
+	"    const vec4 bitSh = vec4(exp2(-32.0f), exp2(-27.0f), exp2(-22.0f), exp2(-17.0f));\n" // alpha weight is 2^-17: exp2(), not exp()
+	"    // Trunc color to drop useless lsb\n"
+	"    vec4 color = trunc(sample_c() * vec4(255.0f) / vec4(8.0f, 8.0f, 8.0f, 128.0f));\n"
+	"    gl_FragDepth = dot(vec4(color), bitSh);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main17\n"
+	"void ps_main17()\n"
+	"{\n"
+	"\n"
+	"    // Potential speed optimization. There is a high probability that\n"
+	"    // game only want to extract a single channel (blue). It will allow\n"
+	"    // to remove most of the conditional operation and yield a +2/3 fps\n"
+	"    // boost on MGS3\n"
+	"    //\n"
+	"    // Hypothesis wrong in Prince of Persia ... Seriously WTF !\n"
+	"    //#define ONLY_BLUE;\n"
+	"\n"
+	"    // Convert a RGBA texture into a 8 bits packed texture\n"
+	"    // Input column: 8x2 RGBA pixels\n"
+	"    // 0: 8 RGBA\n"
+	"    // 1: 8 RGBA\n"
+	"    // Output column: 16x4 Index pixels\n"
+	"    // 0: 8 R | 8 B\n"
+	"    // 1: 8 R | 8 B\n"
+	"    // 2: 8 G | 8 A\n"
+	"    // 3: 8 G | 8 A\n"
+	"    float c;\n"
+	"\n"
+	"    uvec2 sel = uvec2(gl_FragCoord.xy) % uvec2(16u, 16u);\n"
+	"    ivec2 tb  = ((ivec2(gl_FragCoord.xy) & ~ivec2(15, 3)) >> 1);\n"
+	"\n"
+	"    int ty   = tb.y | (int(gl_FragCoord.y) & 1);\n"
+	"    int txN  = tb.x | (int(gl_FragCoord.x) & 7);\n"
+	"    int txH  = tb.x | ((int(gl_FragCoord.x) + 4) & 7);\n"
+	"\n"
+	"    txN *= ScalingFactor.x;\n"
+	"    txH *= ScalingFactor.x;\n"
+	"    ty  *= ScalingFactor.y;\n"
+	"\n"
+	"    // TODO investigate texture gather\n"
+	"    vec4 cN = texelFetch(TextureSampler, ivec2(txN, ty), 0);\n"
+	"    vec4 cH = texelFetch(TextureSampler, ivec2(txH, ty), 0);\n"
+	"\n"
+	"\n"
+	"    if ((sel.y & 4u) == 0u) {\n"
+	"        // Column 0 and 2\n"
+	"#ifdef ONLY_BLUE\n"
+	"        c = cN.b;\n"
+	"#else\n"
+	"        if ((sel.y & 3u) < 2u) {\n"
+	"            // first 2 lines of the col\n"
+	"            if (sel.x < 8u)\n"
+	"                c = cN.r;\n"
+	"            else\n"
+	"                c = cN.b;\n"
+	"        } else {\n"
+	"            if (sel.x < 8u)\n"
+	"                c = cH.g;\n"
+	"            else\n"
+	"                c = cH.a;\n"
+	"        }\n"
+	"#endif\n"
+	"    } else {\n"
+	"#ifdef ONLY_BLUE\n"
+	"        c = cH.b;\n"
+	"#else\n"
+	"        // Column 1 and 3\n"
+	"        if ((sel.y & 3u) < 2u) {\n"
+	"            // first 2 lines of the col\n"
+	"            if (sel.x < 8u)\n"
+	"                c = cH.r;\n"
+	"            else\n"
+	"                c = cH.b;\n"
+	"        } else {\n"
+	"            if (sel.x < 8u)\n"
+	"                c = cN.g;\n"
+	"            else\n"
+	"                c = cN.a;\n"
+	"        }\n"
+	"#endif\n"
+	"    }\n"
+	"\n"
+	"\n"
+	"    SV_Target0 = vec4(c);\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main7\n"
+	"void ps_main7()\n"
+	"{\n"
+	"    vec4 c = sample_c();\n"
+	"\n"
+	"    c.a = dot(c.rgb, vec3(0.299, 0.587, 0.114));\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main5\n"
+	"vec4 ps_scanlines(uint i)\n"
+	"{\n"
+	"    vec4 mask[2] =\n"
+	"    {\n"
+	"        vec4(1, 1, 1, 0),\n"
+	"        vec4(0, 0, 0, 0)\n"
+	"    };\n"
+	"\n"
+	"    return sample_c() * clamp((mask[i] + 0.5f), 0.0f, 1.0f);\n"
+	"}\n"
+	"\n"
+	"void ps_main5() // scanlines\n"
+	"{\n"
+	"    highp uvec4 p = uvec4(gl_FragCoord);\n"
+	"\n"
+	"    vec4 c = ps_scanlines(p.y % 2u);\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main6\n"
+	"void ps_main6() // diagonal\n"
+	"{\n"
+	"    highp uvec4 p = uvec4(gl_FragCoord);\n"
+	"\n"
+	"    vec4 c = ps_crt((p.x + (p.y % 3u)) % 3u);\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main8\n"
+	"void ps_main8() // triangular\n"
+	"{\n"
+	"    highp uvec4 p = uvec4(gl_FragCoord);\n"
+	"\n"
+	"    vec4 c = ps_crt(((p.x + ((p.y >> 1u) & 1u) * 3u) >> 1u) % 3u);\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main9\n"
+	"void ps_main9()\n"
+	"{\n"
+	"\n"
+	"    const float PI = 3.14159265359f;\n"
+	"\n"
+	"    vec2 texdim = vec2(textureSize(TextureSampler, 0));\n"
+	"\n"
+	"    vec4 c;\n"
+	"    if (dFdy(PSin_t.y) * PSin_t.y > 0.5f) {\n"
+	"        c = sample_c();\n"
+	"    } else {\n"
+	"        float factor = (0.9f - 0.4f * cos(2.0f * PI * PSin_t.y * texdim.y));\n"
+	"        c =  factor * texture(TextureSampler, vec2(PSin_t.x, (floor(PSin_t.y * texdim.y) + 0.5f) / texdim.y));\n"
+	"    }\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"// Used for DATE (stencil)\n"
+	"// DATM == 1\n"
+	"#ifdef ps_main2\n"
+	"void ps_main2()\n"
+	"{\n"
+	"    if(sample_c().a < (127.5f / 255.0f)) // >= 0x80 pass\n"
+	"        discard;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"// Used for DATE (stencil)\n"
+	"// DATM == 0\n"
+	"#ifdef ps_main3\n"
+	"void ps_main3()\n"
+	"{\n"
+	"    if((127.5f / 255.0f) < sample_c().a) // < 0x80 pass (== 0x80 should not pass)\n"
+	"        discard;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#ifdef ps_main4\n"
+	"void ps_main4()\n"
+	"{\n"
+	"    SV_Target0 = mod(round(sample_c() * 255.0f), 256.0f) / 255.0f;\n"
+	"}\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* interlace_glsl =
+	"//#version 420 // Keep it for editor detection\n"
+	"\n"
+	"struct vertex_basic\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"};\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} PSin;\n"
+	"\n"
+	"#define PSin_p (PSin.p)\n"
+	"#define PSin_t (PSin.t)\n"
+	"\n"
+	"#ifdef FRAGMENT_SHADER\n"
+	"\n"
+	"layout(location = 0) out vec4 SV_Target0;\n"
+	"\n"
+	"layout(std140, binding = 11) uniform cb11\n"
+	"{\n"
+	"    vec2 ZrH;\n"
+	"    float hH;\n"
+	"};\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"\n"
+	"// TODO ensure that clip (discard) is < 0 and not <= 0 ???\n"
+	"void ps_main0()\n"
+	"{\n"
+	"    if (fract(PSin_t.y * hH) - 0.5 < 0.0)\n"
+	"        discard;\n"
+	"    // I'm not sure it impact us but be safe to lookup texture before conditional if\n"
+	"    // see: http://www.opengl.org/wiki/GLSL_Sampler#Non-uniform_flow_control\n"
+	"    vec4 c = texture(TextureSampler, PSin_t);\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"\n"
+	"void ps_main1()\n"
+	"{\n"
+	"    if (0.5 - fract(PSin_t.y * hH) < 0.0)\n"
+	"        discard;\n"
+	"    // I'm not sure it impact us but be safe to lookup texture before conditional if\n"
+	"    // see: http://www.opengl.org/wiki/GLSL_Sampler#Non-uniform_flow_control\n"
+	"    vec4 c = texture(TextureSampler, PSin_t);\n"
+	"\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"\n"
+	"void ps_main2()\n"
+	"{\n"
+	"    vec4 c0 = texture(TextureSampler, PSin_t - ZrH);\n"
+	"    vec4 c1 = texture(TextureSampler, PSin_t);\n"
+	"    vec4 c2 = texture(TextureSampler, PSin_t + ZrH);\n"
+	"\n"
+	"    SV_Target0 = (c0 + c1 * 2.0f + c2) / 4.0f;\n"
+	"}\n"
+	"\n"
+	"void ps_main3()\n"
+	"{\n"
+	"    SV_Target0 = texture(TextureSampler, PSin_t);\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* merge_glsl =
+	"//#version 420 // Keep it for editor detection\n"
+	"\n"
+	"struct vertex_basic\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"};\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} PSin;\n"
+	"\n"
+	"#define PSin_p (PSin.p)\n"
+	"#define PSin_t (PSin.t)\n"
+	"\n"
+	"#ifdef FRAGMENT_SHADER\n"
+	"\n"
+	"layout(location = 0) out vec4 SV_Target0;\n"
+	"\n"
+	"layout(std140, binding = 10) uniform cb10\n"
+	"{\n"
+	"    vec4 BGColor;\n"
+	"};\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"\n"
+	"void ps_main0()\n"
+	"{\n"
+	"    vec4 c = texture(TextureSampler, PSin_t);\n"
+	"    // Note: clamping will be done by fixed unit\n"
+	"    c.a *= 2.0f;\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"\n"
+	"void ps_main1()\n"
+	"{\n"
+	"    vec4 c = texture(TextureSampler, PSin_t);\n"
+	"    c.a = BGColor.a;\n"
+	"    SV_Target0 = c;\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* shadeboost_glsl =
+	"//#version 420 // Keep it for editor detection\n"
+	"\n"
+	"/*\n"
+	"** Contrast, saturation, brightness\n"
+	"** Code of this function is from TGM's shader pack\n"
+	"** http://irrlicht.sourceforge.net/phpBB2/viewtopic.php?t=21057\n"
+	"** TGM's author comment about the license (included in the previous link)\n"
+	"** \"do with it, what you want! its total free!\n"
+	"** (but would be nice, if you say that you used my shaders  :wink: ) but not necessary\"\n"
+	"*/\n"
+	"\n"
+	"struct vertex_basic\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"};\n"
+	"\n"
+	"#ifdef FRAGMENT_SHADER\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} PSin;\n"
+	"\n"
+	"#define PSin_p (PSin.p)\n"
+	"#define PSin_t (PSin.t)\n"
+	"\n"
+	"layout(location = 0) out vec4 SV_Target0;\n"
+	"\n"
+	"layout(std140, binding = 12) uniform cb12\n"
+	"{\n"
+	"    vec4 BGColor;\n"
+	"};\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"\n"
+	"// For all settings: 1.0 = 100% 0.5=50% 1.5 = 150% \n"
+	"vec4 ContrastSaturationBrightness(vec4 color)\n"
+	"{\n"
+	"	const float sat = SB_SATURATION / 50.0;\n"
+	"	const float brt = SB_BRIGHTNESS / 50.0;\n"
+	"	const float con = SB_CONTRAST / 50.0;\n"
+	"	\n"
+	"	// Increase or decrease these values to adjust r, g and b color channels separately\n"
+	"	const float AvgLumR = 0.5;\n"
+	"	const float AvgLumG = 0.5;\n"
+	"	const float AvgLumB = 0.5;\n"
+	"	\n"
+	"	const vec3 LumCoeff = vec3(0.2125, 0.7154, 0.0721);\n"
+	"	\n"
+	"	vec3 AvgLumin = vec3(AvgLumR, AvgLumG, AvgLumB);\n"
+	"	vec3 brtColor = color.rgb * brt;\n"
+	"    float dot_intensity = dot(brtColor, LumCoeff);\n"
+	"	vec3 intensity = vec3(dot_intensity, dot_intensity, dot_intensity);\n"
+	"	vec3 satColor = mix(intensity, brtColor, sat);\n"
+	"	vec3 conColor = mix(AvgLumin, satColor, con);\n"
+	"\n"
+	"	color.rgb = conColor;	\n"
+	"	return color;\n"
+	"}\n"
+	"\n"
+	"\n"
+	"void ps_main()\n"
+	"{\n"
+	"    vec4 c = texture(TextureSampler, PSin_t);\n"
+	"	SV_Target0 = ContrastSaturationBrightness(c);\n"
+	"}\n"
+	"\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* tfx_vgs_glsl =
+	"//#version 420 // Keep it for text editor detection\n"
+	"\n"
+	"layout(std140, binding = 20) uniform cb20\n"
+	"{\n"
+	"    vec2 VertexScale;\n"
+	"    vec2 VertexOffset;\n"
+	"    vec2 _removed_TextureScale;\n"
+	"    vec2 PointSize;\n"
+	"};\n"
+	"\n"
+	"// Warning duplicated in both GLSL file\n"
+	"layout(std140, binding = 21) uniform cb21\n"
+	"{\n"
+	"    vec3 FogColor;\n"
+	"    float AREF;\n"
+	"\n"
+	"    vec4 WH;\n"
+	"\n"
+	"    vec2 TA;\n"
+	"    float _pad0;\n"
+	"    float Af;\n"
+	"\n"
+	"    uvec4 MskFix;\n"
+	"\n"
+	"    uvec4 FbMask;\n"
+	"\n"
+	"    vec4 HalfTexel;\n"
+	"\n"
+	"    vec4 MinMax;\n"
+	"\n"
+	"    vec2 TextureScale;\n"
+	"    vec2 TC_OffsetHack;\n"
+	"};\n"
+	"\n"
+	"#ifdef VERTEX_SHADER\n"
+	"layout(location = 0) in vec2  i_st;\n"
+	"layout(location = 2) in vec4  i_c;\n"
+	"layout(location = 3) in float i_q;\n"
+	"layout(location = 4) in uvec2 i_p;\n"
+	"layout(location = 5) in uint  i_z;\n"
+	"layout(location = 6) in uvec2 i_uv;\n"
+	"layout(location = 7) in vec4  i_f;\n"
+	"\n"
+	"out SHADER\n"
+	"{\n"
+	"    vec4 t_float;\n"
+	"    vec4 t_int;\n"
+	"    vec4 c;\n"
+	"    flat vec4 fc;\n"
+	"} VSout;\n"
+	"\n"
+	"#define VSout_c (VSout.c)\n"
+	"#define VSout_fc (VSout.fc)\n"
+	"\n"
+	"out gl_PerVertex {\n"
+	"    vec4 gl_Position;\n"
+	"    float gl_PointSize;\n"
+	"#if !pGL_ES\n"
+	"    float gl_ClipDistance[1];\n"
+	"#endif\n"
+	"};\n"
+	"\n"
+	"#ifdef ZERO_TO_ONE_DEPTH\n"
+	"const float exp_min32 = exp2(-32.0f);\n"
+	"#else\n"
+	"const float exp_min31 = exp2(-31.0f);\n"
+	"#endif\n"
+	"\n"
+	"void texture_coord()\n"
+	"{\n"
+	"    vec2 uv = (VS_WILDHACK == 1) ? vec2(i_uv &  uvec2(0x3FEF, 0x3FEF)) : vec2(i_uv);\n"
+	"\n"
+	"    // Float coordinate\n"
+	"    VSout.t_float.xy = i_st;\n"
+	"    VSout.t_float.w  = i_q;\n"
+	"\n"
+	"    // Integer coordinate => normalized\n"
+	"    VSout.t_int.xy = uv * TextureScale;\n"
+	"    // Integer coordinate => integral\n"
+	"    VSout.t_int.zw = uv;\n"
+	"}\n"
+	"\n"
+	"void vs_main()\n"
+	"{\n"
+	"    highp uint z;\n"
+	"    if(VS_BPPZ == 1) // 24\n"
+	"        z = i_z & uint(0xffffff);\n"
+	"    else if(VS_BPPZ == 2) // 16\n"
+	"        z = i_z & uint(0xffff);\n"
+	"    else\n"
+	"        z = i_z;\n"
+	"\n"
+	"    // pos -= 0.05 (1/320 pixel) helps avoiding rounding problems (integral part of pos is usually 5 digits, 0.05 is about as low as we can go)\n"
+	"    // example: ceil(afterseveralvertextransformations(y = 133)) => 134 => line 133 stays empty\n"
+	"    // input granularity is 1/16 pixel, anything smaller than that won't step drawing up/left by one pixel\n"
+	"    // example: 133.0625 (133 + 1/16) should start from line 134, ceil(133.0625 - 0.05) still above 133\n"
+	"    vec4 p;\n"
+	"\n"
+	"    p.xy = vec2(i_p) - vec2(0.05f, 0.05f);\n"
+	"    p.xy = p.xy * VertexScale - VertexOffset;\n"
+	"    p.w = 1.0f;\n"
+	"#ifdef ZERO_TO_ONE_DEPTH\n"
+	"    if(VS_LOGZ == 1) {\n"
+	"        p.z = max(0.0f, log2(float(z))) / 32.0f;\n"
+	"    } else {\n"
+	"        p.z = float(z) * exp_min32;\n"
+	"    }\n"
+	"#else\n"
+	"    if(VS_LOGZ == 1) {\n"
+	"        p.z = max(0.0f, log2(float(z))) / 31.0f - 1.0f;\n"
+	"    } else {\n"
+	"        p.z = float(z) * exp_min31 - 1.0f;\n"
+	"    }\n"
+	"#endif\n"
+	"\n"
+	"    gl_Position = p;\n"
+	"\n"
+	"    texture_coord();\n"
+	"\n"
+	"    VSout_c = i_c;\n"
+	"    VSout_fc = i_c;\n"
+	"    VSout.t_float.z = i_f.x; // pack for with texture\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#ifdef GEOMETRY_SHADER\n"
+	"\n"
+	"in gl_PerVertex {\n"
+	"    vec4 gl_Position;\n"
+	"    float gl_PointSize;\n"
+	"#if !pGL_ES\n"
+	"    float gl_ClipDistance[1];\n"
+	"#endif\n"
+	"} gl_in[];\n"
+	"//in int gl_PrimitiveIDIn;\n"
+	"\n"
+	"out gl_PerVertex {\n"
+	"    vec4 gl_Position;\n"
+	"    float gl_PointSize;\n"
+	"#if !pGL_ES\n"
+	"    float gl_ClipDistance[1];\n"
+	"#endif\n"
+	"};\n"
+	"//out int gl_PrimitiveID;\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 t_float;\n"
+	"    vec4 t_int;\n"
+	"    vec4 c;\n"
+	"    flat vec4 fc;\n"
+	"} GSin[];\n"
+	"\n"
+	"out SHADER\n"
+	"{\n"
+	"    vec4 t_float;\n"
+	"    vec4 t_int;\n"
+	"    vec4 c;\n"
+	"    flat vec4 fc;\n"
+	"} GSout;\n"
+	"\n"
+	"layout(std140, binding = 22) uniform cb22\n"
+	"{\n"
+	"    vec4 rt_size;\n"
+	"};\n"
+	"\n"
+	"\n"
+	"struct vertex\n"
+	"{\n"
+	"    vec4 t_float;\n"
+	"    vec4 t_int;\n"
+	"    vec4 c;\n"
+	"};\n"
+	"\n"
+	"void out_vertex(in vertex v)\n"
+	"{\n"
+	"    GSout.t_float  = v.t_float;\n"
+	"    GSout.t_int    = v.t_int;\n"
+	"    GSout.c        = v.c;\n"
+	"    // Flat output\n"
+	"#if GS_POINT == 1\n"
+	"    GSout.fc       = GSin[0].fc;\n"
+	"#else\n"
+	"    GSout.fc       = GSin[1].fc;\n"
+	"#endif\n"
+	"    gl_PrimitiveID = gl_PrimitiveIDIn;\n"
+	"    EmitVertex();\n"
+	"}\n"
+	"\n"
+	"#if GS_POINT == 1\n"
+	"layout(points) in;\n"
+	"#else\n"
+	"layout(lines) in;\n"
+	"#endif\n"
+	"layout(triangle_strip, max_vertices = 6) out;\n"
+	"\n"
+	"void gs_main()\n"
+	"{\n"
+	"    // left top     => GSin[0];\n"
+	"    // right bottom => GSin[1];\n"
+	"#if GS_POINT == 1\n"
+	"    vertex rb = vertex(GSin[0].t_float, GSin[0].t_int, GSin[0].c);\n"
+	"#else\n"
+	"    vertex rb = vertex(GSin[1].t_float, GSin[1].t_int, GSin[1].c);\n"
+	"#endif\n"
+	"    vertex lt = vertex(GSin[0].t_float, GSin[0].t_int, GSin[0].c);\n"
+	"\n"
+	"#if GS_POINT == 1\n"
+	"    vec4 rb_p = gl_in[0].gl_Position + vec4(PointSize.x, PointSize.y, 0.0f, 0.0f);\n"
+	"#else\n"
+	"    vec4 rb_p = gl_in[1].gl_Position;\n"
+	"#endif\n"
+	"    vec4 lb_p = rb_p;\n"
+	"    vec4 rt_p = rb_p;\n"
+	"    vec4 lt_p = gl_in[0].gl_Position;\n"
+	"\n"
+	"#if GS_POINT == 0\n"
+	"    // flat depth\n"
+	"    lt_p.z = rb_p.z;\n"
+	"    // flat fog and texture perspective\n"
+	"    lt.t_float.zw = rb.t_float.zw;\n"
+	"    // flat color\n"
+	"    lt.c = rb.c;\n"
+	"#endif\n"
+	"\n"
+	"    // Swap texture and position coordinate\n"
+	"    vertex lb    = rb;\n"
+	"    lb.t_float.x = lt.t_float.x;\n"
+	"    lb.t_int.x   = lt.t_int.x;\n"
+	"    lb.t_int.z   = lt.t_int.z;\n"
+	"    lb_p.x       = lt_p.x;\n"
+	"\n"
+	"    vertex rt    = rb;\n"
+	"    rt_p.y       = lt_p.y;\n"
+	"    rt.t_float.y = lt.t_float.y;\n"
+	"    rt.t_int.y   = lt.t_int.y;\n"
+	"    rt.t_int.w   = lt.t_int.w;\n"
+	"\n"
+	"    // Triangle 1\n"
+	"    gl_Position = lt_p;\n"
+	"    out_vertex(lt);\n"
+	"\n"
+	"    gl_Position = lb_p;\n"
+	"    out_vertex(lb);\n"
+	"\n"
+	"    gl_Position = rt_p;\n"
+	"    out_vertex(rt);\n"
+	"    EndPrimitive();\n"
+	"\n"
+	"    // Triangle 2\n"
+	"    gl_Position = lb_p;\n"
+	"    out_vertex(lb);\n"
+	"\n"
+	"    gl_Position = rt_p;\n"
+	"    out_vertex(rt);\n"
+	"\n"
+	"    gl_Position = rb_p;\n"
+	"    out_vertex(rb);\n"
+	"    EndPrimitive();\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* tfx_fs_all_glsl = // GLSL source text; the shader code itself sits inside the FRAGMENT_SHADER guard below
+	"//#version 420 // Keep it for text editor detection\n"
+	"\n"
+	"// Require for bit operation\n"
+	"//#extension GL_ARB_gpu_shader5 : enable\n"
+	"\n"
+	"#define FMT_32 0\n"
+	"#define FMT_24 1\n"
+	"#define FMT_16 2\n"
+	"\n"
+	"#define PS_PAL_FMT (PS_TEX_FMT >> 2)\n" // PS_TEX_FMT (like the other PS_* selectors) is not defined in this text; presumably prepended by the host code - TODO confirm
+	"#define PS_AEM_FMT (PS_TEX_FMT & 3)\n"
+	"\n"
+	"// APITRACE_DEBUG enables forced pixel output to easily detect\n"
+	"// the fragment computed by primitive\n"
+	"#define APITRACE_DEBUG 0\n" // bit mask tested in ps_main: 1 forces C.r, 2 forces C.g, 4 forces C.b, 8 forces C.a
+	"// TEX_COORD_DEBUG output the uv coordinate as color. It is useful\n"
+	"// to detect bad sampling due to upscaling\n"
+	"//#define TEX_COORD_DEBUG\n"
+	"// Just copy directly the texture coordinate\n"
+	"#ifdef TEX_COORD_DEBUG\n"
+	"#define PS_TFX 1\n"
+	"#define PS_TCC 1\n"
+	"#endif\n"
+	"\n"
+	"#define SW_BLEND (PS_BLEND_A || PS_BLEND_B || PS_BLEND_D)\n"
+	"\n"
+	"#ifdef FRAGMENT_SHADER\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 t_float;\n"
+	"    vec4 t_int;\n"
+	"    vec4 c;\n"
+	"    flat vec4 fc;\n"
+	"} PSin;\n"
+	"\n"
+	"#define PSin_c (PSin.c)\n"
+	"#define PSin_fc (PSin.fc)\n"
+	"\n"
+	"// Same buffer but 2 colors for dual source blending\n"
+	"layout(location = 0, index = 0) out vec4 SV_Target0;\n"
+	"layout(location = 0, index = 1) out vec4 SV_Target1;\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"layout(binding = 1) uniform sampler2D PaletteSampler;\n"
+	"layout(binding = 3) uniform sampler2D RtSampler; // note 2 already use by the image below\n"
+	"\n"
+	"#ifndef DISABLE_GL42_image\n"
+	"#if PS_DATE > 0\n"
+	"// FIXME how to declare memory access\n"
+	"layout(r32i, binding = 2) uniform iimage2D img_prim_min;\n"
+	"// WARNING:\n"
+	"// You can't enable it if you discard the fragment. The depth is still\n"
+	"// updated (shadow in Shin Megami Tensei Nocturne)\n"
+	"//\n"
+	"// early_fragment_tests must still be enabled in the first pass of the 2 passes algo\n"
+	"// First pass search the first primitive that will write the bad alpha value. Value\n"
+	"// won't be written if the fragment fails the depth test.\n"
+	"//\n"
+	"// In theory the best solution will be do\n"
+	"// 1/ copy the depth buffer\n"
+	"// 2/ do the full depth (current depth writes are disabled)\n"
+	"// 3/ restore the depth buffer for 2nd pass\n"
+	"// Of course, it is likely too costly.\n"
+	"#if PS_DATE == 1 || PS_DATE == 2\n"
+	"layout(early_fragment_tests) in;\n"
+	"#endif\n"
+	"\n"
+	"// I don't remember why I set this parameter but it is surely useless\n"
+	"//layout(pixel_center_integer) in vec4 gl_FragCoord;\n"
+	"#endif\n"
+	"#else\n"
+	"// use basic stencil\n"
+	"#endif\n"
+	"\n"
+	"\n"
+	"// Warning duplicated in both GLSL file\n"
+	"layout(std140, binding = 21) uniform cb21\n"
+	"{\n"
+	"    vec3 FogColor;\n"
+	"    float AREF;\n"
+	"\n"
+	"    vec4 WH;\n"
+	"\n"
+	"    vec2 TA;\n"
+	"    float _pad0;\n"
+	"    float Af;\n"
+	"\n"
+	"    uvec4 MskFix;\n"
+	"\n"
+	"    uvec4 FbMask;\n"
+	"\n"
+	"    vec4 HalfTexel;\n"
+	"\n"
+	"    vec4 MinMax;\n"
+	"\n"
+	"    vec2 TextureScale;\n"
+	"    vec2 TC_OffsetHack;\n"
+	"};\n"
+	"\n"
+	"vec4 sample_c(vec2 uv)\n"
+	"{\n"
+	"    return texture(TextureSampler, uv);\n"
+	"}\n"
+	"\n"
+	"vec4 sample_p(float idx)\n"
+	"{\n"
+	"    return texture(PaletteSampler, vec2(idx, 0.0f));\n"
+	"}\n"
+	"\n"
+	"vec4 clamp_wrap_uv(vec4 uv)\n"
+	"{\n"
+	"    vec4 uv_out = uv;\n"
+	"\n"
+	"#if PS_WMS == PS_WMT\n"
+	"\n"
+	"#if PS_WMS == 2\n"
+	"    uv_out = clamp(uv, MinMax.xyxy, MinMax.zwzw);\n"
+	"#elif PS_WMS == 3\n"
+	"    uv_out = vec4((ivec4(uv * WH.xyxy) & ivec4(MskFix.xyxy)) | ivec4(MskFix.zwzw)) / WH.xyxy;\n"
+	"#endif\n"
+	"\n"
+	"#else // PS_WMS != PS_WMT\n"
+	"\n"
+	"#if PS_WMS == 2\n"
+	"    uv_out.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);\n"
+	"\n"
+	"#elif PS_WMS == 3\n"
+	"    uv_out.xz = vec2((ivec2(uv.xz * WH.xx) & ivec2(MskFix.xx)) | ivec2(MskFix.zz)) / WH.xx;\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#if PS_WMT == 2\n"
+	"    uv_out.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);\n"
+	"\n"
+	"#elif PS_WMT == 3\n"
+	"\n"
+	"    uv_out.yw = vec2((ivec2(uv.yw * WH.yy) & ivec2(MskFix.yy)) | ivec2(MskFix.ww)) / WH.yy;\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"    return uv_out;\n"
+	"}\n"
+	"\n"
+	"mat4 sample_4c(vec4 uv)\n"
+	"{\n"
+	"    mat4 c;\n"
+	"\n"
+	"    // Note: texture gather can't be used because of special clamping/wrapping\n"
+	"    // Also it doesn't support lod\n"
+	"    c[0] = sample_c(uv.xy);\n"
+	"    c[1] = sample_c(uv.zy);\n"
+	"    c[2] = sample_c(uv.xw);\n"
+	"    c[3] = sample_c(uv.zw);\n"
+	"\n"
+	"    return c;\n"
+	"}\n"
+	"\n"
+	"vec4 sample_4_index(vec4 uv)\n"
+	"{\n"
+	"    vec4 c;\n"
+	"\n"
+	"    // Either GSdx will send a texture that contains a single channel\n"
+	"    // in this case the red channel is remapped as alpha channel\n"
+	"    //\n"
+	"    // Or we have an old RT (ie RGBA8) that contains index (4/8) in the alpha channel\n"
+	"\n"
+	"    // Note: texture gather can't be used because of special clamping/wrapping\n"
+	"    // Also it doesn't support lod\n"
+	"    c.x = sample_c(uv.xy).a;\n"
+	"    c.y = sample_c(uv.zy).a;\n"
+	"    c.z = sample_c(uv.xw).a;\n"
+	"    c.w = sample_c(uv.zw).a;\n"
+	"\n"
+	"    uvec4 i = uvec4(c * 255.0f + 0.5f); // Denormalize value\n"
+	"\n"
+	"#if PS_PAL_FMT == 1\n"
+	"	// 4HL\n"
+	"    return vec4(i & 0xFu) / 255.0f;\n"
+	"\n"
+	"#elif PS_PAL_FMT == 2\n"
+	"	// 4HH\n"
+	"    return vec4(i >> 4u) / 255.0f;\n"
+	"\n"
+	"#else\n"
+	"    // Most of texture will hit this code so keep normalized float value\n"
+	"\n"
+	"    // 8 bits\n"
+	"    return c;\n"
+	"#endif\n"
+	"\n"
+	"}\n"
+	"\n"
+	"mat4 sample_4p(vec4 u)\n"
+	"{\n"
+	"    mat4 c;\n"
+	"\n"
+	"    c[0] = sample_p(u.x);\n"
+	"    c[1] = sample_p(u.y);\n"
+	"    c[2] = sample_p(u.z);\n"
+	"    c[3] = sample_p(u.w);\n"
+	"\n"
+	"    return c;\n"
+	"}\n"
+	"\n"
+	"vec4 sample_color(vec2 st)\n"
+	"{\n"
+	"#if (PS_TCOFFSETHACK == 1)\n"
+	"    st += TC_OffsetHack.xy;\n"
+	"#endif\n"
+	"\n"
+	"    vec4 t;\n"
+	"    mat4 c;\n"
+	"    vec2 dd;\n"
+	"\n"
+	"    // FIXME I'm not sure this condition is useful (I think code will be optimized)\n"
+	"#if (PS_LTF == 0 && PS_AEM_FMT == FMT_32 && PS_PAL_FMT == 0 && PS_WMS < 2 && PS_WMT < 2)\n"
+	"    // No software LTF and pure 32 bits RGBA texure without special texture wrapping\n"
+	"    c[0] = sample_c(st);\n"
+	"#ifdef TEX_COORD_DEBUG\n"
+	"    c[0].rg = st.xy;\n"
+	"#endif\n"
+	"\n"
+	"#else\n"
+	"    vec4 uv;\n"
+	"\n"
+	"    if(PS_LTF != 0)\n"
+	"    {\n"
+	"        uv = st.xyxy + HalfTexel;\n"
+	"        dd = fract(uv.xy * WH.zw);\n"
+	"#if (PS_FST == 0)\n"
+	"        // Background in Shin Megami Tensei Lucifers\n"
+	"        // I suspect that uv isn't a standard number, so fract is outside of the [0;1] range\n"
+	"        // Note: it is free on GPU but let's do it only for float coordinate\n"
+	"        // Strangely Dx doesn't suffer from this issue.\n"
+	"        dd = clamp(dd, vec2(0.0f), vec2(1.0f));\n"
+	"#endif\n"
+	"    }\n"
+	"    else\n"
+	"    {\n"
+	"        uv = st.xyxy;\n"
+	"    }\n"
+	"\n"
+	"    uv = clamp_wrap_uv(uv);\n"
+	"\n"
+	"#if PS_PAL_FMT != 0\n"
+	"    c = sample_4p(sample_4_index(uv));\n"
+	"#else\n"
+	"    c = sample_4c(uv);\n"
+	"#endif\n"
+	"\n"
+	"#ifdef TEX_COORD_DEBUG\n"
+	"    c[0].rg = uv.xy;\n"
+	"    c[1].rg = uv.xy;\n"
+	"    c[2].rg = uv.xy;\n"
+	"    c[3].rg = uv.xy;\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"	// PERF note: using dot product reduces by 1 the number of instruction\n"
+	"	// but I'm not sure it is equivalent neither faster.\n"
+	"	for (int i = 0; i < 4; i++)\n"
+	"	{\n"
+	"        //float sum = dot(c[i].rgb, vec3(1.0f));\n"
+	"#if (PS_AEM_FMT == FMT_24)\n"
+	"		c[i].a = ( (PS_AEM == 0) || any(bvec3(c[i].rgb))  ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#elif (PS_AEM_FMT == FMT_16)\n"
+	"		c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || any(bvec3(c[i].rgb)) ) ? TA.x : 0.0f;\n"
+	"		//c[i].a = c[i].a >= 0.5 ? TA.y : ( (PS_AEM == 0) || (sum > 0.0f) ) ? TA.x : 0.0f;\n"
+	"#endif\n"
+	"    }\n"
+	"\n"
+	"#if(PS_LTF != 0)\n"
+	"    t = mix(mix(c[0], c[1], dd.x), mix(c[2], c[3], dd.x), dd.y);\n"
+	"#else\n"
+	"    t = c[0];\n"
+	"#endif\n"
+	"\n"
+	"    // The 0.05f helps to fix the overbloom of sotc\n"
+	"    // I think the issue is related to the rounding of texture coodinate. The linear (from fixed unit)\n"
+	"    // interpolation could be slightly below the correct one.\n"
+	"    return trunc(t * 255.0f + 0.05f);\n"
+	"}\n"
+	"\n"
+	"vec4 tfx(vec4 T, vec4 C)\n"
+	"{\n"
+	"    vec4 C_out;\n"
+	"    vec4 FxT = trunc(trunc(C) * T / 128.0f);\n"
+	"\n"
+	"#if (PS_TFX == 0)\n"
+	"    C_out = FxT;\n"
+	"#elif (PS_TFX == 1)\n"
+	"    C_out = T;\n"
+	"#elif (PS_TFX == 2)\n"
+	"    C_out.rgb = FxT.rgb + C.a;\n"
+	"    C_out.a = T.a + C.a;\n"
+	"#elif (PS_TFX == 3)\n"
+	"    C_out.rgb = FxT.rgb + C.a;\n"
+	"    C_out.a = T.a;\n"
+	"#else\n"
+	"    C_out = C;\n"
+	"#endif\n"
+	"\n"
+	"#if (PS_TCC == 0)\n"
+	"    C_out.a = C.a;\n"
+	"#endif\n"
+	"\n"
+	"#if (PS_TFX == 0) || (PS_TFX == 2) || (PS_TFX == 3)\n"
+	"    // Clamp only when it is useful\n"
+	"    C_out = min(C_out, 255.0f);\n"
+	"#endif\n"
+	"\n"
+	"    return C_out;\n"
+	"}\n"
+	"\n"
+	"void atst(vec4 C)\n"
+	"{\n"
+	"    // FIXME use integer cmp\n"
+	"    float a = C.a;\n"
+	"\n"
+	"#if (PS_ATST == 0) // never\n"
+	"    discard;\n"
+	"#elif (PS_ATST == 1) // always\n"
+	"    // nothing to do\n"
+	"#elif (PS_ATST == 2) // l\n"
+	"    if ((AREF - a - 0.5f) < 0.0f)\n"
+	"        discard;\n"
+	"#elif (PS_ATST == 3 ) // le\n"
+	"    if ((AREF - a + 0.5f) < 0.0f)\n"
+	"        discard;\n"
+	"#elif (PS_ATST == 4) // e\n"
+	"    if ((0.5f - abs(a - AREF)) < 0.0f)\n"
+	"        discard;\n"
+	"#elif (PS_ATST == 5) // ge\n"
+	"    if ((a-AREF + 0.5f) < 0.0f)\n"
+	"        discard;\n"
+	"#elif (PS_ATST == 6) // g\n"
+	"    if ((a-AREF - 0.5f) < 0.0f)\n"
+	"        discard;\n"
+	"#elif (PS_ATST == 7) // ne\n"
+	"    if ((abs(a - AREF) - 0.5f) < 0.0f)\n"
+	"        discard;\n"
+	"#endif\n"
+	"}\n"
+	"\n"
+	"void fog(inout vec4 C, float f)\n"
+	"{\n"
+	"#if PS_FOG != 0\n"
+	"    C.rgb = trunc(mix(FogColor, C.rgb, f));\n"
+	"#endif\n"
+	"}\n"
+	"\n"
+	"vec4 ps_color()\n"
+	"{\n"
+	"    //FIXME: maybe we can set gl_Position.w = q in VS\n"
+	"#if (PS_FST == 0)\n"
+	"    vec4 T = sample_color(PSin.t_float.xy / vec2(PSin.t_float.w));\n"
+	"#else\n"
+	"    // Note xy are normalized coordinate\n"
+	"    vec4 T = sample_color(PSin.t_int.xy);\n"
+	"#endif\n"
+	"\n"
+	"#if PS_IIP == 1\n"
+	"    vec4 C = tfx(T, PSin_c);\n"
+	"#else\n"
+	"    vec4 C = tfx(T, PSin_fc);\n"
+	"#endif\n"
+	"\n"
+	"    atst(C);\n"
+	"\n"
+	"    fog(C, PSin.t_float.z);\n"
+	"\n"
+	"#if (PS_CLR1 != 0) // needed for Cd * (As/Ad/F + 1) blending modes\n"
+	"    C.rgb = vec3(255.0f);\n"
+	"#endif\n"
+	"\n"
+	"    return C;\n"
+	"}\n"
+	"\n"
+	"void ps_fbmask(inout vec4 C)\n"
+	"{\n"
+	"    // FIXME do I need special case for 16 bits\n"
+	"#if PS_FBMASK\n"
+	"    vec4 RT = trunc(texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0) * 255.0f + 0.1f);\n"
+	"    C = vec4((uvec4(C) & ~FbMask) | (uvec4(RT) & FbMask));\n"
+	"#endif\n"
+	"}\n"
+	"\n"
+	"void ps_blend(inout vec4 Color, float As)\n"
+	"{\n"
+	"#if SW_BLEND\n"
+	"    vec4 RT = trunc(texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0) * 255.0f + 0.1f);\n"
+	"\n"
+	"#if PS_DFMT == FMT_24\n"
+	"    float Ad = 1.0f;\n"
+	"#else\n"
+	"    // FIXME FMT_16 case\n"
+	"    // FIXME Ad or Ad * 2?\n"
+	"    float Ad = RT.a / 128.0f;\n"
+	"#endif\n"
+	"\n"
+	"    // Let the compiler do its jobs !\n"
+	"    vec3 Cd = RT.rgb;\n"
+	"    vec3 Cs = Color.rgb;\n"
+	"\n"
+	"#if PS_BLEND_A == 0\n"
+	"    vec3 A = Cs;\n"
+	"#elif PS_BLEND_A == 1\n"
+	"    vec3 A = Cd;\n"
+	"#else\n"
+	"    vec3 A = vec3(0.0f);\n"
+	"#endif\n"
+	"\n"
+	"#if PS_BLEND_B == 0\n"
+	"    vec3 B = Cs;\n"
+	"#elif PS_BLEND_B == 1\n"
+	"    vec3 B = Cd;\n"
+	"#else\n"
+	"    vec3 B = vec3(0.0f);\n"
+	"#endif\n"
+	"\n"
+	"#if PS_BLEND_C == 0\n"
+	"    float C = As;\n"
+	"#elif PS_BLEND_C == 1\n"
+	"    float C = Ad;\n"
+	"#else\n"
+	"    float C = Af;\n"
+	"#endif\n"
+	"\n"
+	"#if PS_BLEND_D == 0\n"
+	"    vec3 D = Cs;\n"
+	"#elif PS_BLEND_D == 1\n"
+	"    vec3 D = Cd;\n"
+	"#else\n"
+	"    vec3 D = vec3(0.0f);\n"
+	"#endif\n"
+	"\n"
+	"#if PS_BLEND_A == PS_BLEND_B\n"
+	"    Color.rgb = D;\n"
+	"#else\n"
+	"    Color.rgb = trunc((A - B) * C + D);\n"
+	"#endif\n"
+	"\n"
+	"    // FIXME dithering\n"
+	"\n"
+	"    // Correct the Color value based on the output format\n"
+	"#if PS_COLCLIP == 0 && PS_HDR == 0\n"
+	"    // Standard Clamp\n"
+	"    Color.rgb = clamp(Color.rgb, vec3(0.0f), vec3(255.0f));\n"
+	"#endif\n"
+	"\n"
+	"    // FIXME rouding of negative float?\n"
+	"    // compiler uses trunc but it might need floor\n"
+	"\n"
+	"    // Warning: normally blending equation is mult(A, B) = A * B >> 7. GPU have the full accuracy\n"
+	"    // GS: Color = 1, Alpha = 255 => output 1\n"
+	"    // GPU: Color = 1/255, Alpha = 255/255 * 255/128 => output 1.9921875\n"
+	"#if PS_DFMT == FMT_16\n"
+	"    // In 16 bits format, only 5 bits of colors are used. It impacts shadows computation of Castlevania\n"
+	"\n"
+	"    Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xF8));\n"
+	"#elif PS_COLCLIP == 1 && PS_HDR == 0\n"
+	"    Color.rgb = vec3(ivec3(Color.rgb) & ivec3(0xFF));\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	"}\n"
+	"\n"
+	"void ps_main()\n"
+	"{\n"
+	"#if ((PS_DATE & 3) == 1 || (PS_DATE & 3) == 2)\n"
+	"\n"
+	"#if PS_WRITE_RG == 1\n"
+	"    // Pseudo 16 bits access.\n"
+	"    float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).g;\n"
+	"#else\n"
+	"    float rt_a = texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0).a;\n"
+	"#endif\n"
+	"\n"
+	"#if (PS_DATE & 3) == 1\n"
+	"    // DATM == 0: Pixel with alpha equal to 1 will failed\n"
+	"    bool bad = (127.5f / 255.0f) < rt_a;\n"
+	"#elif (PS_DATE & 3) == 2\n"
+	"    // DATM == 1: Pixel with alpha equal to 0 will failed\n"
+	"    bool bad = rt_a < (127.5f / 255.0f);\n"
+	"#endif\n"
+	"\n"
+	"    if (bad) {\n"
+	"#if PS_DATE >= 5 || defined(DISABLE_GL42_image)\n"
+	"        discard;\n"
+	"#else\n"
+	"        imageStore(img_prim_min, ivec2(gl_FragCoord.xy), ivec4(-1));\n"
+	"        return;\n"
+	"#endif\n"
+	"    }\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#if PS_DATE == 3 && !defined(DISABLE_GL42_image)\n"
+	"    int stencil_ceil = imageLoad(img_prim_min, ivec2(gl_FragCoord.xy)).r;\n"
+	"    // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update\n"
+	"    // the bad alpha value so we must keep it.\n"
+	"\n"
+	"    if (gl_PrimitiveID > stencil_ceil) {\n"
+	"        discard;\n"
+	"    }\n"
+	"#endif\n"
+	"\n"
+	"    vec4 C = ps_color();\n"
+	"#if (APITRACE_DEBUG & 1) == 1\n"
+	"    C.r = 255.0f;\n" // was "255f": a GLSL float literal without a decimal point needs an exponent, so the debug path failed to compile
+	"#endif\n"
+	"#if (APITRACE_DEBUG & 2) == 2\n"
+	"    C.g = 255.0f;\n" // same literal fix as C.r
+	"#endif\n"
+	"#if (APITRACE_DEBUG & 4) == 4\n"
+	"    C.b = 255.0f;\n" // same literal fix as C.r
+	"#endif\n"
+	"#if (APITRACE_DEBUG & 8) == 8\n"
+	"    C.a = 128.0f;\n" // was "128f"; same literal fix as C.r
+	"#endif\n"
+	"\n"
+	"#if PS_SHUFFLE\n"
+	"    uvec4 denorm_c = uvec4(C);\n"
+	"    uvec2 denorm_TA = uvec2(vec2(TA.xy) * 255.0f + 0.5f);\n"
+	"\n"
+	"    // Write RB part. Mask will take care of the correct destination\n"
+	"#if PS_READ_BA\n"
+	"    C.rb = C.bb;\n"
+	"#else\n"
+	"    C.rb = C.rr;\n"
+	"#endif\n"
+	"\n"
+	"    // FIXME precompute my_TA & 0x80\n"
+	"\n"
+	"    // Write GA part. Mask will take care of the correct destination\n"
+	"    // Note: GLSL 4.50/GL_EXT_shader_integer_mix support a mix instruction to select a component\\n\"\n"
+	"    // However Nvidia emulate it with an if (at least on kepler arch) ...\\n\"\n"
+	"#if PS_READ_BA\n"
+	"    // bit field operation requires GL4 HW. Could be nice to merge it with step/mix below\n"
+	"    // uint my_ta = (bool(bitfieldExtract(denorm_c.a, 7, 1))) ? denorm_TA.y : denorm_TA.x;\n"
+	"    // denorm_c.a = bitfieldInsert(denorm_c.a, bitfieldExtract(my_ta, 7, 1), 7, 1);\n"
+	"    // c.ga = vec2(float(denorm_c.a));\n"
+	"\n"
+	"    if (bool(denorm_c.a & 0x80u))\n"
+	"        C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));\n"
+	"    else\n"
+	"        C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));\n"
+	"\n"
+	"#else\n"
+	"    if (bool(denorm_c.g & 0x80u))\n"
+	"        C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));\n"
+	"    else\n"
+	"        C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));\n"
+	"\n"
+	"    // Nice idea but step/mix requires 4 instructions\n"
+	"    // set / trunc / I2F / Mad\n"
+	"    //\n"
+	"    // float sel = step(128.0f, c.g);\n"
+	"    // vec2 c_shuffle = vec2((denorm_c.gg & 0x7Fu) | (denorm_TA & 0x80u));\n"
+	"    // c.ga = mix(c_shuffle.xx, c_shuffle.yy, sel);\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"    // Must be done before alpha correction\n"
+	"    float alpha_blend = C.a / 128.0f;\n"
+	"\n"
+	"    // Correct the ALPHA value based on the output format\n"
+	"#if (PS_DFMT == FMT_16)\n"
+	"    float A_one = 128.0f; // alpha output will be 0x80\n"
+	"    C.a = (PS_FBA != 0) ? A_one : step(128.0f, C.a) * A_one;\n"
+	"#elif (PS_DFMT == FMT_32) && (PS_FBA != 0)\n"
+	"    if(C.a < 128.0f) C.a += 128.0f;\n"
+	"#endif\n"
+	"\n"
+	"    // Get first primitive that will write a failling alpha value\n"
+	"#if PS_DATE == 1 && !defined(DISABLE_GL42_image)\n"
+	"    // DATM == 0\n"
+	"    // Pixel with alpha equal to 1 will failed (128-255)\n"
+	"    if (C.a > 127.5f) {\n"
+	"        imageAtomicMin(img_prim_min, ivec2(gl_FragCoord.xy), gl_PrimitiveID);\n"
+	"    }\n"
+	"    return;\n"
+	"#elif PS_DATE == 2 && !defined(DISABLE_GL42_image)\n"
+	"    // DATM == 1\n"
+	"    // Pixel with alpha equal to 0 will failed (0-127)\n"
+	"    if (C.a < 127.5f) {\n"
+	"        imageAtomicMin(img_prim_min, ivec2(gl_FragCoord.xy), gl_PrimitiveID);\n"
+	"    }\n"
+	"    return;\n"
+	"#endif\n"
+	"\n"
+	"    ps_blend(C, alpha_blend);\n"
+	"\n"
+	"    ps_fbmask(C);\n"
+	"\n"
+	"#if PS_HDR == 1\n"
+	"    // Use negative value to avoid overflow of the texture (in accumulation mode)\n"
+	"    // Note: code were initially done for an Half-Float texture. Due to overflow\n"
+	"    // the texture was upgraded to a full float. Maybe this code is useless now!\n"
+	"    // Good testcase is castlevania\n"
+	"    if (any(greaterThan(C.rgb, vec3(128.0f)))) {\n"
+	"        C.rgb = (C.rgb - 256.0f);\n"
+	"    }\n"
+	"#endif\n"
+	"    SV_Target0 = C / 255.0f;\n"
+	"    SV_Target1 = vec4(alpha_blend);\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	;
+
+static const char* fxaa_fx =
+	"#if defined(SHADER_MODEL) || defined(FXAA_GLSL_130)\n"
+	"\n"
+	"#ifndef FXAA_GLSL_130\n"
+	"    #define FXAA_GLSL_130 0\n"
+	"#endif\n"
+	"\n"
+	"#define UHQ_FXAA 1          //High Quality Fast Approximate Anti Aliasing. Adapted for GSdx from Timothy Lottes FXAA 3.11.\n"
+	"#define FxaaSubpixMax 0.0   //[0.00 to 1.00] Amount of subpixel aliasing removal. 0.00: Edge only antialiasing (no blurring)\n"
+	"#define FxaaEarlyExit 1     //[0 or 1] Use Fxaa early exit pathing. When disabled, the entire scene is antialiased(FSAA). 0 is off, 1 is on.\n"
+	"\n"
+	"/*------------------------------------------------------------------------------\n"
+	"							 [GLOBALS|FUNCTIONS]\n"
+	"------------------------------------------------------------------------------*/\n"
+	"#if (FXAA_GLSL_130 == 1)\n"
+	"\n"
+	"struct vertex_basic\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"};\n"
+	"\n"
+	"layout(binding = 0) uniform sampler2D TextureSampler;\n"
+	"\n"
+	"in SHADER\n"
+	"{\n"
+	"    vec4 p;\n"
+	"    vec2 t;\n"
+	"} PSin;\n"
+	"\n"
+	"layout(location = 0) out vec4 SV_Target0;\n"
+	"\n"
+	"#else\n"
+	"\n"
+	"#if (SHADER_MODEL >= 0x400)\n"
+	"Texture2D Texture : register(t0);\n"
+	"SamplerState TextureSampler : register(s0);\n"
+	"#else\n"
+	"texture2D Texture : register(t0);\n"
+	"sampler2D TextureSampler : register(s0);\n"
+	"#define SamplerState sampler2D\n"
+	"#endif\n"
+	"\n"
+	"cbuffer cb0\n"
+	"{\n"
+	"	float4 _rcpFrame : register(c0);\n"
+	"};\n"
+	"\n"
+	"struct VS_INPUT\n"
+	"{\n"
+	"	float4 p : POSITION;\n"
+	"	float2 t : TEXCOORD0;\n"
+	"};\n"
+	"\n"
+	"struct VS_OUTPUT\n"
+	"{\n"
+	"	#if (SHADER_MODEL >= 0x400)\n"
+	"	float4 p : SV_Position;\n"
+	"	#else\n"
+	"	float4 p : TEXCOORD1;\n"
+	"	#endif\n"
+	"	float2 t : TEXCOORD0;\n"
+	"};\n"
+	"\n"
+	"struct PS_OUTPUT\n"
+	"{\n"
+	"	#if (SHADER_MODEL >= 0x400)\n"
+	"	float4 c : SV_Target0;\n"
+	"	#else\n"
+	"	float4 c : COLOR0;\n"
+	"	#endif\n"
+	"};\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"/*------------------------------------------------------------------------------\n"
+	"                             [FXAA CODE SECTION]\n"
+	"------------------------------------------------------------------------------*/\n"
+	"\n"
+	"#if (SHADER_MODEL >= 0x500)\n"
+	"#define FXAA_HLSL_5 1\n"
+	"#define FXAA_GATHER4_ALPHA 1\n"
+	"#elif (SHADER_MODEL >= 0x400)\n"
+	"#define FXAA_HLSL_4 1\n"
+	"#define FXAA_GATHER4_ALPHA 0\n"
+	"#elif (FXAA_GLSL_130 == 1)\n"
+	"#define FXAA_GATHER4_ALPHA 1\n"
+	"#else\n"
+	"#define FXAA_HLSL_3 1\n"
+	"#define FXAA_GATHER4_ALPHA 0\n"
+	"#endif\n"
+	"\n"
+	"#if (FXAA_HLSL_5 == 1)\n"
+	"struct FxaaTex { SamplerState smpl; Texture2D tex; };\n"
+	"#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)\n"
+	"#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)\n"
+	"#define FxaaTexAlpha4(t, p) t.tex.GatherAlpha(t.smpl, p)\n"
+	"#define FxaaTexOffAlpha4(t, p, o) t.tex.GatherAlpha(t.smpl, p, o)\n"
+	"#define FxaaDiscard clip(-1)\n"
+	"#define FxaaSat(x) saturate(x)\n"
+	"\n"
+	"#elif (FXAA_HLSL_4 == 1)\n"
+	"struct FxaaTex { SamplerState smpl; Texture2D tex; };\n"
+	"#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0)\n"
+	"#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o)\n"
+	"#define FxaaDiscard clip(-1)\n"
+	"#define FxaaSat(x) saturate(x)\n"
+	"\n"
+	"#elif (FXAA_HLSL_3 == 1)\n"
+	"#define FxaaTex sampler2D\n"
+	"#define int2 float2\n"
+	"#define FxaaSat(x) saturate(x)\n"
+	"#define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0))\n"
+	"#define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0))\n"
+	"\n"
+	"#elif (FXAA_GLSL_130 == 1)\n"
+	"\n"
+	"#define int2 ivec2\n"
+	"#define float2 vec2\n"
+	"#define float3 vec3\n"
+	"#define float4 vec4\n"
+	"#define FxaaDiscard discard\n"
+	"#define FxaaSat(x) clamp(x, 0.0, 1.0)\n"
+	"#define FxaaTex sampler2D\n"
+	"#define FxaaTexTop(t, p) textureLod(t, p, 0.0)\n"
+	"#define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o)\n"
+	"#if (FXAA_GATHER4_ALPHA == 1)\n"
+	"// use #extension GL_ARB_gpu_shader5 : enable\n"
+	"#define FxaaTexAlpha4(t, p) textureGather(t, p, 3)\n"
+	"#define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#define FxaaEdgeThreshold 0.063\n"
+	"#define FxaaEdgeThresholdMin 0.00\n"
+	"#define FXAA_QUALITY__P0 1.0\n"
+	"#define FXAA_QUALITY__P1 1.5\n"
+	"#define FXAA_QUALITY__P2 2.0\n"
+	"#define FXAA_QUALITY__P3 2.0\n"
+	"#define FXAA_QUALITY__P4 2.0\n"
+	"#define FXAA_QUALITY__P5 2.0\n"
+	"#define FXAA_QUALITY__P6 2.0\n"
+	"#define FXAA_QUALITY__P7 2.0\n"
+	"#define FXAA_QUALITY__P8 2.0\n"
+	"#define FXAA_QUALITY__P9 2.0\n"
+	"#define FXAA_QUALITY__P10 4.0\n"
+	"#define FXAA_QUALITY__P11 8.0\n"
+	"#define FXAA_QUALITY__P12 8.0\n"
+	"\n"
+	"/*------------------------------------------------------------------------------\n"
+	"                        [GAMMA PREPASS CODE SECTION]\n"
+	"------------------------------------------------------------------------------*/\n"
+	"float RGBLuminance(float3 color)\n"
+	"{\n"
+	"	const float3 lumCoeff = float3(0.2126729, 0.7151522, 0.0721750);\n"
+	"	return dot(color.rgb, lumCoeff);\n"
+	"}\n"
+	"\n"
+	"#if (FXAA_GLSL_130 == 0)\n"
+	"#define PixelSize float2(_rcpFrame.x, _rcpFrame.y)\n"
+	"#endif\n"
+	"\n"
+	"\n"
+	"float3 RGBGammaToLinear(float3 color, float gamma)\n"
+	"{\n"
+	"	color = FxaaSat(color);\n"
+	"	color.r = (color.r <= 0.0404482362771082) ?\n"
+	"	color.r / 12.92 : pow((color.r + 0.055) / 1.055, gamma);\n"
+	"	color.g = (color.g <= 0.0404482362771082) ?\n"
+	"	color.g / 12.92 : pow((color.g + 0.055) / 1.055, gamma);\n"
+	"	color.b = (color.b <= 0.0404482362771082) ?\n"
+	"	color.b / 12.92 : pow((color.b + 0.055) / 1.055, gamma);\n"
+	"\n"
+	"	return color;\n"
+	"}\n"
+	"\n"
+	"float3 LinearToRGBGamma(float3 color, float gamma)\n"
+	"{\n"
+	"	color = FxaaSat(color);\n"
+	"	color.r = (color.r <= 0.00313066844250063) ?\n"
+	"	color.r * 12.92 : 1.055 * pow(color.r, 1.0 / gamma) - 0.055;\n"
+	"	color.g = (color.g <= 0.00313066844250063) ?\n"
+	"	color.g * 12.92 : 1.055 * pow(color.g, 1.0 / gamma) - 0.055;\n"
+	"	color.b = (color.b <= 0.00313066844250063) ?\n"
+	"	color.b * 12.92 : 1.055 * pow(color.b, 1.0 / gamma) - 0.055;\n"
+	"\n"
+	"	return color;\n"
+	"}\n"
+	"\n"
+	"float4 PreGammaPass(float4 color, float2 uv0)\n"
+	"{\n"
+	"	#if (SHADER_MODEL >= 0x400)\n"
+	"		color = Texture.Sample(TextureSampler, uv0);\n"
+	"    #elif (FXAA_GLSL_130 == 1)\n"
+	"		color = texture(TextureSampler, uv0);\n"
+	"	#else\n"
+	"		color = tex2D(TextureSampler, uv0);\n"
+	"	#endif\n"
+	"\n"
+	"	const float GammaConst = 2.233;\n"
+	"	color.rgb = RGBGammaToLinear(color.rgb, GammaConst);\n"
+	"	color.rgb = LinearToRGBGamma(color.rgb, GammaConst);\n"
+	"	color.a = RGBLuminance(color.rgb);\n"
+	"\n"
+	"	return color;\n"
+	"}\n"
+	"\n"
+	"\n"
+	"/*------------------------------------------------------------------------------\n"
+	"                        [FXAA CODE SECTION]\n"
+	"------------------------------------------------------------------------------*/\n"
+	"\n"
+	"float FxaaLuma(float4 rgba)\n"
+	"{ \n"
+	"	rgba.w = RGBLuminance(rgba.xyz);\n"
+	"	return rgba.w; \n"
+	"}\n"
+	"\n"
+	"float4 FxaaPixelShader(float2 pos, FxaaTex tex, float2 fxaaRcpFrame, float fxaaSubpix, float fxaaEdgeThreshold, float fxaaEdgeThresholdMin)\n"
+	"{\n"
+	"	float2 posM;\n"
+	"	posM.x = pos.x;\n"
+	"	posM.y = pos.y;\n"
+	"\n"
+	"	#if (FXAA_GATHER4_ALPHA == 1)\n"
+	"	float4 rgbyM = FxaaTexTop(tex, posM);\n"
+	"	float4 luma4A = FxaaTexAlpha4(tex, posM);\n"
+	"	float4 luma4B = FxaaTexOffAlpha4(tex, posM, int2(-1, -1));\n"
+	"	rgbyM.w = RGBLuminance(rgbyM.xyz);\n"
+	"\n"
+	"	#define lumaM rgbyM.w\n"
+	"	#define lumaE luma4A.z\n"
+	"	#define lumaS luma4A.x\n"
+	"	#define lumaSE luma4A.y\n"
+	"	#define lumaNW luma4B.w\n"
+	"	#define lumaN luma4B.z\n"
+	"	#define lumaW luma4B.x\n"
+	"    \n"
+	"	#else\n"
+	"	float4 rgbyM = FxaaTexTop(tex, posM);\n"
+	"	rgbyM.w = RGBLuminance(rgbyM.xyz);\n"
+	"	#define lumaM rgbyM.w\n"
+	"\n"
+	"	float lumaS = FxaaLuma(FxaaTexOff(tex, posM, int2( 0, 1), fxaaRcpFrame.xy));\n"
+	"	float lumaE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1, 0), fxaaRcpFrame.xy));\n"
+	"	float lumaN = FxaaLuma(FxaaTexOff(tex, posM, int2( 0,-1), fxaaRcpFrame.xy));\n"
+	"	float lumaW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 0), fxaaRcpFrame.xy));\n"
+	"	#endif\n"
+	"\n"
+	"	float maxSM = max(lumaS, lumaM);\n"
+	"	float minSM = min(lumaS, lumaM);\n"
+	"	float maxESM = max(lumaE, maxSM);\n"
+	"	float minESM = min(lumaE, minSM);\n"
+	"	float maxWN = max(lumaN, lumaW);\n"
+	"	float minWN = min(lumaN, lumaW);\n"
+	"\n"
+	"	float rangeMax = max(maxWN, maxESM);\n"
+	"	float rangeMin = min(minWN, minESM);\n"
+	"	float range = rangeMax - rangeMin;\n"
+	"	float rangeMaxScaled = rangeMax * fxaaEdgeThreshold;\n"
+	"	float rangeMaxClamped = max(fxaaEdgeThresholdMin, rangeMaxScaled);\n"
+	"\n"
+	"	bool earlyExit = range < rangeMaxClamped;\n"
+	"	#if (FxaaEarlyExit == 1)\n"
+	"	if(earlyExit) { return rgbyM; }\n"
+	"	#endif\n"
+	"\n"
+	"	#if (FXAA_GATHER4_ALPHA == 0)\n"
+	"	float lumaNW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1,-1), fxaaRcpFrame.xy));\n"
+	"	float lumaSE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1, 1), fxaaRcpFrame.xy));\n"
+	"	float lumaNE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1,-1), fxaaRcpFrame.xy));\n"
+	"	float lumaSW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 1), fxaaRcpFrame.xy));\n"
+	"	#else\n"
+	"	float lumaNE = FxaaLuma(FxaaTexOff(tex, posM, int2( 1,-1), fxaaRcpFrame.xy));\n"
+	"	float lumaSW = FxaaLuma(FxaaTexOff(tex, posM, int2(-1, 1), fxaaRcpFrame.xy));\n"
+	"	#endif\n"
+	"\n"
+	"	float lumaNS = lumaN + lumaS;\n"
+	"	float lumaWE = lumaW + lumaE;\n"
+	"	float subpixRcpRange = 1.0/range;\n"
+	"	float subpixNSWE = lumaNS + lumaWE;\n"
+	"	float edgeHorz1 = (-2.0 * lumaM) + lumaNS;\n"
+	"	float edgeVert1 = (-2.0 * lumaM) + lumaWE;\n"
+	"	float lumaNESE = lumaNE + lumaSE;\n"
+	"	float lumaNWNE = lumaNW + lumaNE;\n"
+	"	float edgeHorz2 = (-2.0 * lumaE) + lumaNESE;\n"
+	"	float edgeVert2 = (-2.0 * lumaN) + lumaNWNE;\n"
+	"\n"
+	"	float lumaNWSW = lumaNW + lumaSW;\n"
+	"	float lumaSWSE = lumaSW + lumaSE;\n"
+	"	float edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);\n"
+	"	float edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);\n"
+	"	float edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;\n"
+	"	float edgeVert3 = (-2.0 * lumaS) + lumaSWSE;\n"
+	"	float edgeHorz = abs(edgeHorz3) + edgeHorz4;\n"
+	"	float edgeVert = abs(edgeVert3) + edgeVert4;\n"
+	"\n"
+	"	float subpixNWSWNESE = lumaNWSW + lumaNESE;\n"
+	"	float lengthSign = fxaaRcpFrame.x;\n"
+	"	bool horzSpan = edgeHorz >= edgeVert;\n"
+	"	float subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;\n"
+	"	if(!horzSpan) lumaN = lumaW;\n"
+	"	if(!horzSpan) lumaS = lumaE;\n"
+	"	if(horzSpan) lengthSign = fxaaRcpFrame.y;\n"
+	"	float subpixB = (subpixA * (1.0/12.0)) - lumaM;\n"
+	"\n"
+	"	float gradientN = lumaN - lumaM;\n"
+	"	float gradientS = lumaS - lumaM;\n"
+	"	float lumaNN = lumaN + lumaM;\n"
+	"	float lumaSS = lumaS + lumaM;\n"
+	"	bool pairN = abs(gradientN) >= abs(gradientS);\n"
+	"	float gradient = max(abs(gradientN), abs(gradientS));\n"
+	"	if(pairN) lengthSign = -lengthSign;\n"
+	"	float subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);\n"
+	"\n"
+	"	float2 posB;\n"
+	"	posB.x = posM.x;\n"
+	"	posB.y = posM.y;\n"
+	"	float2 offNP;\n"
+	"	offNP.x = (!horzSpan) ? 0.0 : fxaaRcpFrame.x;\n"
+	"	offNP.y = ( horzSpan) ? 0.0 : fxaaRcpFrame.y;\n"
+	"	if(!horzSpan) posB.x += lengthSign * 0.5;\n"
+	"	if( horzSpan) posB.y += lengthSign * 0.5;\n"
+	"\n"
+	"	float2 posN;\n"
+	"	posN.x = posB.x - offNP.x * FXAA_QUALITY__P0;\n"
+	"	posN.y = posB.y - offNP.y * FXAA_QUALITY__P0;\n"
+	"	float2 posP;\n"
+	"	posP.x = posB.x + offNP.x * FXAA_QUALITY__P0;\n"
+	"	posP.y = posB.y + offNP.y * FXAA_QUALITY__P0;\n"
+	"	float subpixD = ((-2.0)*subpixC) + 3.0;\n"
+	"	float lumaEndN = FxaaLuma(FxaaTexTop(tex, posN));\n"
+	"	float subpixE = subpixC * subpixC;\n"
+	"	float lumaEndP = FxaaLuma(FxaaTexTop(tex, posP));\n"
+	"\n"
+	"	if(!pairN) lumaNN = lumaSS;\n"
+	"	float gradientScaled = gradient * 1.0/4.0;\n"
+	"	float lumaMM = lumaM - lumaNN * 0.5;\n"
+	"	float subpixF = subpixD * subpixE;\n"
+	"	bool lumaMLTZero = lumaMM < 0.0;\n"
+	"	lumaEndN -= lumaNN * 0.5;\n"
+	"	lumaEndP -= lumaNN * 0.5;\n"
+	"	bool doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	bool doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1;\n"
+	"	bool doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11;\n"
+	"\n"
+	"	if(doneNP) {\n"
+	"	if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));\n"
+	"	if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));\n"
+	"	if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;\n"
+	"	if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;\n"
+	"	doneN = abs(lumaEndN) >= gradientScaled;\n"
+	"	doneP = abs(lumaEndP) >= gradientScaled;\n"
+	"	if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12;\n"
+	"	if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12;\n"
+	"	doneNP = (!doneN) || (!doneP);\n"
+	"	if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12;\n"
+	"	if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12;\n"
+	"	}}}}}}}}}}}\n"
+	"\n"
+	"	float dstN = posM.x - posN.x;\n"
+	"	float dstP = posP.x - posM.x;\n"
+	"	if(!horzSpan) dstN = posM.y - posN.y;\n"
+	"	if(!horzSpan) dstP = posP.y - posM.y;\n"
+	"\n"
+	"	bool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;\n"
+	"	float spanLength = (dstP + dstN);\n"
+	"	bool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;\n"
+	"	float spanLengthRcp = 1.0/spanLength;\n"
+	"\n"
+	"	bool directionN = dstN < dstP;\n"
+	"	float dst = min(dstN, dstP);\n"
+	"	bool goodSpan = directionN ? goodSpanN : goodSpanP;\n"
+	"	float subpixG = subpixF * subpixF;\n"
+	"	float pixelOffset = (dst * (-spanLengthRcp)) + 0.5;\n"
+	"	float subpixH = subpixG * fxaaSubpix;\n"
+	"\n"
+	"	float pixelOffsetGood = goodSpan ? pixelOffset : 0.0;\n"
+	"	float pixelOffsetSubpix = max(pixelOffsetGood, subpixH);\n"
+	"	if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;\n"
+	"	if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;\n"
+	"\n"
+	"	return float4(FxaaTexTop(tex, posM).xyz, lumaM);\n"
+	"}\n"
+	"\n"
+	"#if (FXAA_GLSL_130 == 1)\n"
+	"float4 FxaaPass(float4 FxaaColor, float2 uv0)\n"
+	"#else\n"
+	"float4 FxaaPass(float4 FxaaColor : COLOR0, float2 uv0 : TEXCOORD0)\n"
+	"#endif\n"
+	"{\n"
+	"\n"
+	"	#if (SHADER_MODEL >= 0x400)\n"
+	"	FxaaTex tex;\n"
+	"	tex.tex = Texture;\n"
+	"	tex.smpl = TextureSampler;\n"
+	"\n"
+	"	Texture.GetDimensions(PixelSize.x, PixelSize.y);\n"
+	"	FxaaColor = FxaaPixelShader(uv0, tex, 1.0/PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);\n"
+	"\n"
+	"    #elif (FXAA_GLSL_130 == 1)\n"
+	"\n"
+	"	vec2 PixelSize = textureSize(TextureSampler, 0);\n"
+	"	FxaaColor = FxaaPixelShader(uv0, TextureSampler, 1.0/PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);\n"
+	"\n"
+	"	#else\n"
+	"	FxaaTex tex;\n"
+	"	tex = TextureSampler;\n"
+	"	FxaaColor = FxaaPixelShader(uv0, tex, PixelSize.xy, FxaaSubpixMax, FxaaEdgeThreshold, FxaaEdgeThresholdMin);\n"
+	"	#endif\n"
+	"\n"
+	"	return FxaaColor;\n"
+	"}\n"
+	"\n"
+	"/*------------------------------------------------------------------------------\n"
+	"                      [MAIN() & COMBINE PASS CODE SECTION]\n"
+	"------------------------------------------------------------------------------*/\n"
+	"#if (FXAA_GLSL_130 == 1)\n"
+	"\n"
+	"void ps_main()\n"
+	"{\n"
+	"    vec4 color = texture(TextureSampler, PSin.t);\n"
+	"    color      = PreGammaPass(color, PSin.t);\n"
+	"    color      = FxaaPass(color, PSin.t);\n"
+	"\n"
+	"    SV_Target0 = color;\n"
+	"}\n"
+	"\n"
+	"#else\n"
+	"\n"
+	"PS_OUTPUT ps_main(VS_OUTPUT input)\n"
+	"{\n"
+	"	PS_OUTPUT output;\n"
+	"\n"
+	"	#if (SHADER_MODEL >= 0x400)\n"
+	"		float4 color = Texture.Sample(TextureSampler, input.t);\n"
+	"\n"
+	"		color = PreGammaPass(color, input.t);\n"
+	"		color = FxaaPass(color, input.t);\n"
+	"	#else\n"
+	"		float4 color = tex2D(TextureSampler, input.t);\n"
+	"\n"
+	"		color = PreGammaPass(color, input.t);\n"
+	"		color = FxaaPass(color, input.t);\n"
+	"	#endif\n"
+	"\n"
+	"	output.c = color;\n"
+	"	\n"
+	"	return output;\n"
+	"}\n"
+	"\n"
+	"#endif\n"
+	"\n"
+	"#endif\n"
+	;
diff --git a/plugins/GSdx_legacy/res/interlace.fx b/plugins/GSdx_legacy/res/interlace.fx
new file mode 100644
index 0000000000..9e6c76b02e
--- /dev/null
+++ b/plugins/GSdx_legacy/res/interlace.fx
@@ -0,0 +1,87 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+
+#if SHADER_MODEL >= 0x400
+
+Texture2D Texture; // source image sampled by every ps_main* in this branch
+SamplerState Sampler; // sampler state used for all Texture.Sample calls below
+
+cbuffer cb0
+{
+	float2 ZrH; // texcoord offset added/subtracted around the center tap in ps_main2
+	float hH; // scale applied to t.y before frac() in ps_main0 / ps_main1
+};
+
+struct PS_INPUT
+{
+	float4 p : SV_Position; // not read by the pixel shaders below
+	float2 t : TEXCOORD0; // texture coordinate used for sampling
+};
+
+float4 ps_main0(PS_INPUT input) : SV_Target0
+{
+	float phase = frac(input.t.y * hH); // fractional part of the scaled vertical coordinate
+	clip(phase - 0.5); // discard the pixel when phase < 0.5
+	return Texture.Sample(Sampler, input.t);
+}
+
+float4 ps_main1(PS_INPUT input) : SV_Target0
+{
+	float phase = frac(input.t.y * hH); // fractional part of the scaled vertical coordinate
+	clip(0.5 - phase); // discard the pixel when phase > 0.5 (complement of ps_main0)
+	return Texture.Sample(Sampler, input.t);
+}
+
+float4 ps_main2(PS_INPUT input) : SV_Target0
+{
+	// Three taps spaced ZrH apart, blended with weights 1:2:1.
+	float4 tapMinus = Texture.Sample(Sampler, input.t - ZrH);
+	float4 tapMid = Texture.Sample(Sampler, input.t);
+	float4 tapPlus = Texture.Sample(Sampler, input.t + ZrH);
+	return (tapMinus + tapMid * 2 + tapPlus) / 4;
+}
+
+float4 ps_main3(PS_INPUT input) : SV_Target0 // pass-through: no clip, no blending
+{
+	return Texture.Sample(Sampler, input.t);
+}
+
+#elif SHADER_MODEL <= 0x300
+
+sampler s0 : register(s0); // source image sampled by every ps_main* in this branch
+
+float4 Params1 : register(c0); // packed constants: xy = ZrH, z = hH (w is not used in this branch)
+
+#define ZrH (Params1.xy) // texcoord offset used by ps_main2
+#define hH  (Params1.z) // scale applied to tex.y before frac() in ps_main0 / ps_main1
+
+float4 ps_main0(float2 tex : TEXCOORD0) : COLOR
+{
+	float phase = frac(tex.y * hH); // fractional part of the scaled vertical coordinate
+	clip(phase - 0.5); // discard the pixel when phase < 0.5
+	return tex2D(s0, tex);
+}
+
+float4 ps_main1(float2 tex : TEXCOORD0) : COLOR
+{
+	float phase = frac(tex.y * hH); // fractional part of the scaled vertical coordinate
+	clip(0.5 - phase); // discard the pixel when phase > 0.5 (complement of ps_main0)
+	return tex2D(s0, tex);
+}
+
+float4 ps_main2(float2 tex : TEXCOORD0) : COLOR
+{
+	// Three taps spaced ZrH apart, blended with weights 1:2:1.
+	float4 tapMinus = tex2D(s0, tex - ZrH);
+	float4 tapMid = tex2D(s0, tex);
+	float4 tapPlus = tex2D(s0, tex + ZrH);
+	return (tapMinus + tapMid * 2 + tapPlus) / 4;
+}
+
+float4 ps_main3(float2 tex : TEXCOORD0) : COLOR // pass-through: no clip, no blending
+{
+	return tex2D(s0, tex);
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx_legacy/res/logo-ogl.bmp b/plugins/GSdx_legacy/res/logo-ogl.bmp
new file mode 100644
index 0000000000..03b077f322
Binary files /dev/null and b/plugins/GSdx_legacy/res/logo-ogl.bmp differ
diff --git a/plugins/GSdx_legacy/res/logo10.bmp b/plugins/GSdx_legacy/res/logo10.bmp
new file mode 100644
index 0000000000..25f6b177c7
Binary files /dev/null and b/plugins/GSdx_legacy/res/logo10.bmp differ
diff --git a/plugins/GSdx_legacy/res/logo9.bmp b/plugins/GSdx_legacy/res/logo9.bmp
new file mode 100644
index 0000000000..24cfdbeea8
Binary files /dev/null and b/plugins/GSdx_legacy/res/logo9.bmp differ
diff --git a/plugins/GSdx_legacy/res/merge.fx b/plugins/GSdx_legacy/res/merge.fx
new file mode 100644
index 0000000000..4d8236b11c
--- /dev/null
+++ b/plugins/GSdx_legacy/res/merge.fx
@@ -0,0 +1,60 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+#if SHADER_MODEL >= 0x400
+
+Texture2D Texture; // source image sampled by ps_main0 / ps_main1
+SamplerState Sampler; // sampler state used for all Texture.Sample calls below
+
+cbuffer cb0
+{
+	float4 BGColor; // only the alpha component is read in this branch (ps_main1)
+};
+
+struct PS_INPUT
+{
+	float4 p : SV_Position; // not read by the pixel shaders below
+	float2 t : TEXCOORD0; // texture coordinate used for sampling
+};
+
+float4 ps_main0(PS_INPUT input) : SV_Target0
+{
+	float4 texel = Texture.Sample(Sampler, input.t);
+	texel.a = min(texel.a * 2, 1); // double the sampled alpha, capped at 1
+	return texel;
+}
+
+float4 ps_main1(PS_INPUT input) : SV_Target0
+{
+	// Keep the sampled color but take alpha from the BGColor constant.
+	float3 rgb = Texture.Sample(Sampler, input.t).rgb;
+	return float4(rgb, BGColor.a);
+}
+
+#elif SHADER_MODEL <= 0x300
+
+sampler Texture : register(s0); // source image sampled by ps_main0 / ps_main1
+
+float4 g_params[1]; // shader constants; element 0 is exposed as BGColor
+
+#define BGColor	(g_params[0]) // only the alpha component is read in this branch (ps_main1)
+
+struct PS_INPUT
+{
+	float2 t : TEXCOORD0; // texture coordinate used for sampling
+};
+
+float4 ps_main0(PS_INPUT input) : COLOR // samples the texture and returns it with red and blue swapped (.bgra)
+{
+	float4 c = tex2D(Texture, input.t);
+	// NOTE(review): alpha is passed through unmodified here, unlike the SHADER_MODEL >= 0x400 ps_main0 which doubles and caps it - confirm this is intended.
+	return c.bgra;
+}
+
+float4 ps_main1(PS_INPUT input) : COLOR
+{
+	// Output the sampled color with red and blue swapped and alpha taken from BGColor.
+	float4 texel = tex2D(Texture, input.t);
+	return float4(texel.bgr, BGColor.a);
+}
+
+#endif
+#endif
diff --git a/plugins/GSdx_legacy/res/shadeboost.fx b/plugins/GSdx_legacy/res/shadeboost.fx
new file mode 100644
index 0000000000..8a30067440
--- /dev/null
+++ b/plugins/GSdx_legacy/res/shadeboost.fx
@@ -0,0 +1,76 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+
+/*
+** Contrast, saturation, brightness
+** Code of this function is from TGM's shader pack
+** http://irrlicht.sourceforge.net/phpBB2/viewtopic.php?t=21057
+*/
+
+// SB_SATURATION, SB_BRIGHTNESS and SB_CONTRAST are divided by 50, so a setting of 50 gives a factor of 1.0 (100%).
+float4 ContrastSaturationBrightness(float4 color) // Ported to HLSL
+{
+	// Normalized factors: 1.0 = 100%, 0.5 = 50%, 1.5 = 150%.
+	const float satFactor = SB_SATURATION / 50.0;
+	const float brtFactor = SB_BRIGHTNESS / 50.0;
+	const float conFactor = SB_CONTRAST / 50.0;
+
+	// Increase or decrease these values to adjust r, g and b color channels separately
+	const float3 pivot = float3(0.5, 0.5, 0.5);
+
+	// Per-channel weights used to reduce the color to a single intensity value.
+	const float3 lumaWeights = float3(0.2125, 0.7154, 0.0721);
+
+	// 1) scale by brightness, 2) blend intensity -> color by saturation, 3) blend pivot -> color by contrast.
+	float3 lit = color.rgb * brtFactor;
+	float3 gray = dot(lit, lumaWeights);
+	float3 saturated = lerp(gray, lit, satFactor);
+	float3 adjusted = lerp(pivot, saturated, conFactor);
+
+	color.rgb = adjusted;
+	return color;
+}
+
+#if SHADER_MODEL >= 0x400
+
+Texture2D Texture; // source image sampled by ps_main
+SamplerState Sampler; // sampler state used by ps_main
+
+cbuffer cb0
+{
+	float4 BGColor; // declared but not referenced by ps_main in this file
+};
+
+struct PS_INPUT
+{
+	float4 p : SV_Position; // not read by ps_main
+	float2 t : TEXCOORD0; // texture coordinate used for sampling
+};
+
+float4 ps_main(PS_INPUT input) : SV_Target0
+{
+	// Sample the source texel and apply the brightness/saturation/contrast adjustment.
+	return ContrastSaturationBrightness(Texture.Sample(Sampler, input.t));
+}
+
+
+#elif SHADER_MODEL <= 0x300
+
+sampler Texture : register(s0); // source image sampled by ps_main
+
+float4 g_params[1]; // shader constants; element 0 is exposed as BGColor
+
+#define BGColor	(g_params[0]) // declared but not referenced by ps_main in this file
+
+struct PS_INPUT
+{
+	float2 t : TEXCOORD0; // texture coordinate used for sampling
+};
+
+float4 ps_main(PS_INPUT input) : COLOR
+{
+	// Sample the source texel and apply the brightness/saturation/contrast adjustment.
+	return ContrastSaturationBrightness(tex2D(Texture, input.t));
+}
+
+#endif
+#endif
diff --git a/plugins/GSdx_legacy/res/tfx.cl b/plugins/GSdx_legacy/res/tfx.cl
new file mode 100644
index 0000000000..91bebeed11
--- /dev/null
+++ b/plugins/GSdx_legacy/res/tfx.cl
@@ -0,0 +1,1629 @@
+#if defined(CL_VERSION_2_0)
+// This source deliberately refuses to build when the compiler defines CL_VERSION_2_0.
+#error "tfx.cl: compilers that define CL_VERSION_2_0 (OpenCL 2.0) are not supported"
+
+#endif
+
+#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
+
+#ifdef cl_amd_printf
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#endif
+
+#ifdef cl_amd_media_ops
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#else
+#endif
+
+#ifdef cl_amd_media_ops2
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+#else
+#endif
+
+#ifndef CL_FLT_EPSILON // fallback for compilers that do not predefine it
+#define CL_FLT_EPSILON 1.1920928955078125e-7f // 2^-23
+#endif
+
+#if MAX_PRIM_PER_BATCH == 64u // BIN_TYPE is the element type of gs_env.bin; its bit width equals MAX_PRIM_PER_BATCH
+	#define BIN_TYPE ulong
+#elif MAX_PRIM_PER_BATCH == 32u
+	#define BIN_TYPE uint
+#else
+	#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
+#endif
+
+#define TFX_ABA(sel) (((sel).x >> 24) & 3) // 2-bit field at bits 24-25 of sel.x
+#define TFX_ABB(sel) (((sel).x >> 26) & 3) // 2-bit field at bits 26-27 of sel.x
+#define TFX_ABC(sel) (((sel).x >> 28) & 3) // 2-bit field at bits 28-29 of sel.x
+#define TFX_ABD(sel) (((sel).x >> 30) & 3) // 2-bit field at bits 30-31 of sel.x
+#define TFX_WMS(sel) (((sel).y >>  8) & 3) // 2-bit field at bits 8-9 of sel.y
+#define TFX_WMT(sel) (((sel).y >> 10) & 3) // 2-bit field at bits 10-11 of sel.y
+
+typedef struct // one input vertex: two 16-byte unions
+{
+	union {float4 p; struct {float x, y; uint z, f;};}; // p overlays x, y (float) and z, f (uint); KERNEL_PRIM reads p.xy and the uint z
+	union {float4 tc; struct {float s, t, q; uchar4 c;};}; // tc overlays s, t, q (float) and the packed uchar4 c
+} gs_vertex;
+
+typedef struct // per-primitive record written by KERNEL_PRIM
+{
+	gs_vertex v[3]; // up to three vertices (points use v[0], sprites v[0..1], triangles v[0..2])
+	uint zmin, zmax; // smallest / largest vertex z; written for triangles only
+	uint pb_index; // parameter block index, taken from the top 8 bits of the primitive's first index
+	uint _pad; // padding; not accessed in the visible code
+} gs_prim;
+
+typedef struct // triangle setup data written by KERNEL_PRIM and read by tile_in_triangle
+{
+	float4 dx, dy; // xyz: coefficients multiplied by the x / y distance from v0; w: v0 position (dx.w = v0.x, dy.w = v0.y)
+	float4 zero; // xyz: per-coordinate threshold (0 or CL_FLT_EPSILON) that the evaluated value must reach
+	float4 reject_corner; // xyz: bias added before the threshold test in tile_in_triangle
+} gs_barycentric;
+
+typedef struct // shared working state passed to the kernels as `env`
+{
+	struct {uint first, last;} bounds[MAX_BIN_PER_BATCH]; // reset to first = -1, last = 0 by the CLEAR variant of KERNEL_TILE
+	BIN_TYPE bin[MAX_BIN_COUNT]; // indexed from batch_index * bin_count in KERNEL_TILE
+	uchar4 bbox[MAX_PRIM_COUNT]; // per-primitive bounding box in bin units, written by KERNEL_PRIM
+	gs_prim prim[MAX_PRIM_COUNT]; // per-primitive vertex data, written by KERNEL_PRIM
+	gs_barycentric barycentric[MAX_PRIM_COUNT]; // per-triangle setup data, written by KERNEL_PRIM
+} gs_env;
+
+typedef struct // per-draw parameter block; KERNEL_PRIM locates it at pb_base + pb_start + pb_index * TFX_PARAM_SIZE
+{
+	int4 scissor; // xy = minimum, zw = maximum used to clamp the bounding box in KERNEL_PRIM
+	char dimx[4][4];
+	uint2 sel; // packed selector bits decoded by the TFX_* macros
+	int fbp, zbp, bw; // presumably frame / z buffer base pointers and buffer width - TODO confirm against the kernels that read them
+	uint fm, zm;
+	uchar4 fog; // rgb
+	uchar aref, afix;
+	uchar ta0, ta1;
+	int tbp[7], tbw[7];
+	int minu, maxu, minv, maxv;
+	int lod; // lcm == 1
+	int mxl;
+	float l; // TEX1.L * -0x10000
+	float k; // TEX1.K * 0x10000
+	uchar4 clut[256]; // TODO: this could be an index to a separate buffer, it may be the same across several gs_params following each other
+} gs_param;
+
+enum GS_PRIM_CLASS // primitive kind; compared against the PRIM macro and mapped to a vertex count by GetVertexPerPrim
+{
+	GS_POINT_CLASS, // 1 vertex
+	GS_LINE_CLASS, // 2 vertices
+	GS_TRIANGLE_CLASS, // 3 vertices
+	GS_SPRITE_CLASS // 2 vertices
+};
+
+enum GS_PSM // pixel storage mode; the order matters: is16bit/is24bit/is32bit test (psm < 8) and (psm & 3)
+{
+	PSM_PSMCT32, // 0
+	PSM_PSMCT24, // 1
+	PSM_PSMCT16, // 2
+	PSM_PSMCT16S, // 3
+	PSM_PSMZ32, // 4
+	PSM_PSMZ24, // 5
+	PSM_PSMZ16, // 6
+	PSM_PSMZ16S, // 7
+	PSM_PSMT8, // 8
+	PSM_PSMT4, // 9
+	PSM_PSMT8H, // 10
+	PSM_PSMT4HL, // 11
+	PSM_PSMT4HH, // 12
+};
+
+enum GS_TFX // presumably the texture function selector - not referenced in the visible part of this file, confirm against its users
+{
+	TFX_MODULATE	= 0,
+	TFX_DECAL		= 1,
+	TFX_HIGHLIGHT	= 2,
+	TFX_HIGHLIGHT2	= 3,
+	TFX_NONE		= 4,
+};
+
+enum GS_CLAMP // presumably the values produced by TFX_WMS / TFX_WMT (both 2-bit fields) - TODO confirm against the sampling code
+{
+	CLAMP_REGION_REPEAT	= 0,
+	CLAMP_REPEAT		= 1,
+	CLAMP_CLAMP			= 2,
+	CLAMP_REGION_CLAMP	= 3,
+};
+
+enum GS_ZTST // depth test mode names; not referenced in the visible part of this file
+{
+	ZTST_NEVER		= 0,
+	ZTST_ALWAYS		= 1,
+	ZTST_GEQUAL		= 2,
+	ZTST_GREATER	= 3,
+};
+
+enum GS_ATST // alpha test mode names; not referenced in the visible part of this file
+{
+	ATST_NEVER		= 0,
+	ATST_ALWAYS		= 1,
+	ATST_LESS		= 2,
+	ATST_LEQUAL		= 3,
+	ATST_EQUAL		= 4,
+	ATST_GEQUAL		= 5,
+	ATST_GREATER	= 6,
+	ATST_NOTEQUAL	= 7,
+};
+
+enum GS_AFAIL // presumably the action taken when the alpha test fails - not referenced in the visible part of this file, confirm
+{
+	AFAIL_KEEP		= 0,
+	AFAIL_FB_ONLY	= 1,
+	AFAIL_ZB_ONLY	= 2,
+	AFAIL_RGB_ONLY	= 3,
+};
+
+__constant uchar blockTable32[4][8] = // indexed [(y >> 3) & 3][(x >> 3) & 7] by BlockNumber32; a permutation of 0..31
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable32Z[4][8] = // indexed [(y >> 3) & 3][(x >> 3) & 7] by BlockNumber32Z; a permutation of 0..31
+{
+	{ 24, 25, 28, 29,  8,  9, 12, 13},
+	{ 26, 27, 30, 31, 10, 11, 14, 15},
+	{ 16, 17, 20, 21,  0,  1,  4,  5},
+	{ 18, 19, 22, 23,  2,  3,  6,  7}
+};
+
+__constant uchar blockTable16[8][4] = // indexed [(y >> 3) & 7][(x >> 4) & 3] by BlockNumber16; a permutation of 0..31
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar blockTable16S[8][4] = // indexed [(y >> 3) & 7][(x >> 4) & 3] by BlockNumber16S; a permutation of 0..31
+{
+	{  0,  2, 16, 18 },
+	{  1,  3, 17, 19 },
+	{  8, 10, 24, 26 },
+	{  9, 11, 25, 27 },
+	{  4,  6, 20, 22 },
+	{  5,  7, 21, 23 },
+	{ 12, 14, 28, 30 },
+	{ 13, 15, 29, 31 }
+};
+
+__constant uchar blockTable16Z[8][4] = // indexed [(y >> 3) & 7][(x >> 4) & 3] by BlockNumber16Z; a permutation of 0..31
+{
+	{ 24, 26, 16, 18 },
+	{ 25, 27, 17, 19 },
+	{ 28, 30, 20, 22 },
+	{ 29, 31, 21, 23 },
+	{  8, 10,  0,  2 },
+	{  9, 11,  1,  3 },
+	{ 12, 14,  4,  6 },
+	{ 13, 15,  5,  7 }
+};
+
+__constant uchar blockTable16SZ[8][4] = // indexed [(y >> 3) & 7][(x >> 4) & 3] by BlockNumber16SZ; a permutation of 0..31
+{
+	{ 24, 26,  8, 10 },
+	{ 25, 27,  9, 11 },
+	{ 16, 18,  0,  2 },
+	{ 17, 19,  1,  3 },
+	{ 28, 30, 12, 14 },
+	{ 29, 31, 13, 15 },
+	{ 20, 22,  4,  6 },
+	{ 21, 23,  5,  7 }
+};
+
+__constant uchar blockTable8[4][8] = // indexed [(y >> 4) & 3][(x >> 4) & 7] by BlockNumber8; same contents as blockTable32
+{
+	{  0,  1,  4,  5, 16, 17, 20, 21},
+	{  2,  3,  6,  7, 18, 19, 22, 23},
+	{  8,  9, 12, 13, 24, 25, 28, 29},
+	{ 10, 11, 14, 15, 26, 27, 30, 31}
+};
+
+__constant uchar blockTable4[8][4] = // indexed [(y >> 4) & 7][(x >> 5) & 3] by BlockNumber4; same contents as blockTable16
+{
+	{  0,  2,  8, 10 },
+	{  1,  3,  9, 11 },
+	{  4,  6, 12, 14 },
+	{  5,  7, 13, 15 },
+	{ 16, 18, 24, 26 },
+	{ 17, 19, 25, 27 },
+	{ 20, 22, 28, 30 },
+	{ 21, 23, 29, 31 }
+};
+
+__constant uchar columnTable32[8][8] = // indexed [y & 7][x & 7] by PixelAddress32 / PixelAddress32Z; offset 0..63 inside a block
+{
+	{  0,  1,  4,  5,  8,  9, 12, 13 },
+	{  2,  3,  6,  7, 10, 11, 14, 15 },
+	{ 16, 17, 20, 21, 24, 25, 28, 29 },
+	{ 18, 19, 22, 23, 26, 27, 30, 31 },
+	{ 32, 33, 36, 37, 40, 41, 44, 45 },
+	{ 34, 35, 38, 39, 42, 43, 46, 47 },
+	{ 48, 49, 52, 53, 56, 57, 60, 61 },
+	{ 50, 51, 54, 55, 58, 59, 62, 63 },
+};
+
+__constant uchar columnTable16[8][16] = // indexed [y & 7][x & 15] by the PixelAddress16* functions; offset 0..127 inside a block
+{
+	{   0,   2,   8,  10,  16,  18,  24,  26,
+	    1,   3,   9,  11,  17,  19,  25,  27 },
+	{   4,   6,  12,  14,  20,  22,  28,  30,
+	    5,   7,  13,  15,  21,  23,  29,  31 },
+	{  32,  34,  40,  42,  48,  50,  56,  58,
+	   33,  35,  41,  43,  49,  51,  57,  59 },
+	{  36,  38,  44,  46,  52,  54,  60,  62,
+	   37,  39,  45,  47,  53,  55,  61,  63 },
+	{  64,  66,  72,  74,  80,  82,  88,  90,
+	   65,  67,  73,  75,  81,  83,  89,  91 },
+	{  68,  70,  76,  78,  84,  86,  92,  94,
+	   69,  71,  77,  79,  85,  87,  93,  95 },
+	{  96,  98, 104, 106, 112, 114, 120, 122,
+	   97,  99, 105, 107, 113, 115, 121, 123 },
+	{ 100, 102, 108, 110, 116, 118, 124, 126,
+	  101, 103, 109, 111, 117, 119, 125, 127 },
+};
+
+__constant uchar columnTable8[16][16] = // indexed [y & 15][x & 15] by PixelAddress8; offset 0..255 inside a block
+{
+	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
+	    2,   6,  18,  22,  34,  38,  50,  54 },
+	{   8,  12,  24,  28,  40,  44,  56,  60,
+	   10,  14,  26,  30,  42,  46,  58,  62 },
+	{  33,  37,  49,  53,   1,   5,  17,  21,
+	   35,  39,  51,  55,   3,   7,  19,  23 },
+	{  41,  45,  57,  61,   9,  13,  25,  29,
+	   43,  47,  59,  63,  11,  15,  27,  31 },
+	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
+	   98, 102, 114, 118,  66,  70,  82,  86 },
+	{ 104, 108, 120, 124,  72,  76,  88,  92,
+	  106, 110, 122, 126,  74,  78,  90,  94 },
+	{  65,  69,  81,  85,  97, 101, 113, 117,
+	   67,  71,  83,  87,  99, 103, 115, 119 },
+	{  73,  77,  89,  93, 105, 109, 121, 125,
+	   75,  79,  91,  95, 107, 111, 123, 127 },
+	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
+	  130, 134, 146, 150, 162, 166, 178, 182 },
+	{ 136, 140, 152, 156, 168, 172, 184, 188,
+	  138, 142, 154, 158, 170, 174, 186, 190 },
+	{ 161, 165, 177, 181, 129, 133, 145, 149,
+	  163, 167, 179, 183, 131, 135, 147, 151 },
+	{ 169, 173, 185, 189, 137, 141, 153, 157,
+	  171, 175, 187, 191, 139, 143, 155, 159 },
+	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
+	  226, 230, 242, 246, 194, 198, 210, 214 },
+	{ 232, 236, 248, 252, 200, 204, 216, 220,
+	  234, 238, 250, 254, 202, 206, 218, 222 },
+	{ 193, 197, 209, 213, 225, 229, 241, 245,
+	  195, 199, 211, 215, 227, 231, 243, 247 },
+	{ 201, 205, 217, 221, 233, 237, 249, 253,
+	  203, 207, 219, 223, 235, 239, 251, 255 },
+};
+
+__constant ushort columnTable4[16][32] = // indexed [y & 15][x & 31] by PixelAddress4; offset 0..511 inside a block (hence ushort)
+{
+	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
+	    2,  10,  34,  42,  66,  74,  98, 106,
+	    4,  12,  36,  44,  68,  76, 100, 108,
+	    6,  14,  38,  46,  70,  78, 102, 110 },
+	{  16,  24,  48,  56,  80,  88, 112, 120,
+	   18,  26,  50,  58,  82,  90, 114, 122,
+	   20,  28,  52,  60,  84,  92, 116, 124,
+	   22,  30,  54,  62,  86,  94, 118, 126 },
+	{  65,  73,  97, 105,   1,   9,  33,  41,
+	   67,  75,  99, 107,   3,  11,  35,  43,
+	   69,  77, 101, 109,   5,  13,  37,  45,
+	   71,  79, 103, 111,   7,  15,  39,  47 },
+	{  81,  89, 113, 121,  17,  25,  49,  57,
+	   83,  91, 115, 123,  19,  27,  51,  59,
+	   85,  93, 117, 125,  21,  29,  53,  61,
+	   87,  95, 119, 127,  23,  31,  55,  63 },
+	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
+	  194, 202, 226, 234, 130, 138, 162, 170,
+	  196, 204, 228, 236, 132, 140, 164, 172,
+	  198, 206, 230, 238, 134, 142, 166, 174 },
+	{ 208, 216, 240, 248, 144, 152, 176, 184,
+	  210, 218, 242, 250, 146, 154, 178, 186,
+	  212, 220, 244, 252, 148, 156, 180, 188,
+	  214, 222, 246, 254, 150, 158, 182, 190 },
+	{ 129, 137, 161, 169, 193, 201, 225, 233,
+	  131, 139, 163, 171, 195, 203, 227, 235,
+	  133, 141, 165, 173, 197, 205, 229, 237,
+	  135, 143, 167, 175, 199, 207, 231, 239 },
+	{ 145, 153, 177, 185, 209, 217, 241, 249,
+	  147, 155, 179, 187, 211, 219, 243, 251,
+	  149, 157, 181, 189, 213, 221, 245, 253,
+	  151, 159, 183, 191, 215, 223, 247, 255 },
+	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
+	  258, 266, 290, 298, 322, 330, 354, 362,
+	  260, 268, 292, 300, 324, 332, 356, 364,
+	  262, 270, 294, 302, 326, 334, 358, 366 },
+	{ 272, 280, 304, 312, 336, 344, 368, 376,
+	  274, 282, 306, 314, 338, 346, 370, 378,
+	  276, 284, 308, 316, 340, 348, 372, 380,
+	  278, 286, 310, 318, 342, 350, 374, 382 },
+	{ 321, 329, 353, 361, 257, 265, 289, 297,
+	  323, 331, 355, 363, 259, 267, 291, 299,
+	  325, 333, 357, 365, 261, 269, 293, 301,
+	  327, 335, 359, 367, 263, 271, 295, 303 },
+	{ 337, 345, 369, 377, 273, 281, 305, 313,
+	  339, 347, 371, 379, 275, 283, 307, 315,
+	  341, 349, 373, 381, 277, 285, 309, 317,
+	  343, 351, 375, 383, 279, 287, 311, 319 },
+	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
+	  450, 458, 482, 490, 386, 394, 418, 426,
+	  452, 460, 484, 492, 388, 396, 420, 428,
+	  454, 462, 486, 494, 390, 398, 422, 430 },
+	{ 464, 472, 496, 504, 400, 408, 432, 440,
+	  466, 474, 498, 506, 402, 410, 434, 442,
+	  468, 476, 500, 508, 404, 412, 436, 444,
+	  470, 478, 502, 510, 406, 414, 438, 446 },
+	{ 385, 393, 417, 425, 449, 457, 481, 489,
+	  387, 395, 419, 427, 451, 459, 483, 491,
+	  389, 397, 421, 429, 453, 461, 485, 493,
+	  391, 399, 423, 431, 455, 463, 487, 495 },
+	{ 401, 409, 433, 441, 465, 473, 497, 505,
+	  403, 411, 435, 443, 467, 475, 499, 507,
+	  405, 413, 437, 445, 469, 477, 501, 509,
+	  407, 415, 439, 447, 471, 479, 503, 511 },
+};
+
+int BlockNumber32(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable32 entry
+{
+	return blockTable32[(y >> 3) & 3][(x >> 3) & 7] + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber16(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable16 entry
+{
+	return blockTable16[(y >> 3) & 7][(x >> 4) & 3] + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber16S(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable16S entry
+{
+	return blockTable16S[(y >> 3) & 7][(x >> 4) & 3] + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber32Z(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable32Z entry
+{
+	return blockTable32Z[(y >> 3) & 3][(x >> 3) & 7] + mad24(y & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber16Z(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable16Z entry
+{
+	return blockTable16Z[(y >> 3) & 7][(x >> 4) & 3] + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber16SZ(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 row/column term + blockTable16SZ entry
+{
+	return blockTable16SZ[(y >> 3) & 7][(x >> 4) & 3] + mad24((y >> 1) & ~0x1f, bw, (x >> 1) & ~0x1f) + bp;
+}
+
+int BlockNumber8(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 term (uses bw >> 1) + blockTable8 entry
+{
+	return blockTable8[(y >> 4) & 3][(x >> 4) & 7] + mad24((y >> 1) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + bp;
+}
+
+int BlockNumber4(int x, int y, int bp, int bw) // block number for pixel (x, y): base bp + mad24 term (uses bw >> 1) + blockTable4 entry
+{
+	return blockTable4[(y >> 4) & 7][(x >> 5) & 3] + mad24((y >> 2) & ~0x1f, bw >> 1, (x >> 2) & ~0x1f) + bp;
+}
+
+int PixelAddress32(int x, int y, int bp, int bw) // pixel index: block number * 64 plus the in-block offset from columnTable32
+{
+	return columnTable32[y & 7][x & 7] + (BlockNumber32(x, y, bp, bw) << 6);
+}
+
+int PixelAddress16(int x, int y, int bp, int bw) // pixel index: block number * 128 plus the in-block offset from columnTable16
+{
+	return columnTable16[y & 7][x & 15] + (BlockNumber16(x, y, bp, bw) << 7);
+}
+
+int PixelAddress16S(int x, int y, int bp, int bw) // pixel index: block number * 128 plus the in-block offset from columnTable16
+{
+	return columnTable16[y & 7][x & 15] + (BlockNumber16S(x, y, bp, bw) << 7);
+}
+
+int PixelAddress32Z(int x, int y, int bp, int bw) // pixel index: block number * 64 plus the in-block offset from columnTable32
+{
+	return columnTable32[y & 7][x & 7] + (BlockNumber32Z(x, y, bp, bw) << 6);
+}
+
+int PixelAddress16Z(int x, int y, int bp, int bw) // pixel index: block number * 128 plus the in-block offset from columnTable16
+{
+	return columnTable16[y & 7][x & 15] + (BlockNumber16Z(x, y, bp, bw) << 7);
+}
+
+int PixelAddress16SZ(int x, int y, int bp, int bw) // pixel index: block number * 128 plus the in-block offset from columnTable16
+{
+	return columnTable16[y & 7][x & 15] + (BlockNumber16SZ(x, y, bp, bw) << 7);
+}
+
+int PixelAddress8(int x, int y, int bp, int bw) // pixel index: block number * 256 plus the in-block offset from columnTable8
+{
+	return columnTable8[y & 15][x & 15] + (BlockNumber8(x, y, bp, bw) << 8);
+}
+
+int PixelAddress4(int x, int y, int bp, int bw) // pixel index: block number * 512 plus the in-block offset from columnTable4
+{
+	return columnTable4[y & 15][x & 31] + (BlockNumber4(x, y, bp, bw) << 9);
+}
+
+int PixelAddress(int x, int y, int bp, int bw, int psm) // dispatches on psm to the matching PixelAddress* helper; unlisted modes use the 32-bit layout
+{
+	switch(psm)
+	{
+	case PSM_PSMT4:
+		return PixelAddress4(x, y, bp, bw);
+	case PSM_PSMT8:
+		return PixelAddress8(x, y, bp, bw);
+	case PSM_PSMZ16S:
+		return PixelAddress16SZ(x, y, bp, bw);
+	case PSM_PSMZ16:
+		return PixelAddress16Z(x, y, bp, bw);
+	case PSM_PSMZ32:
+	case PSM_PSMZ24:
+		return PixelAddress32Z(x, y, bp, bw);
+	case PSM_PSMCT16S:
+		return PixelAddress16S(x, y, bp, bw);
+	case PSM_PSMCT16:
+		return PixelAddress16(x, y, bp, bw);
+	case PSM_PSMCT32:
+	case PSM_PSMCT24:
+	case PSM_PSMT8H:
+	case PSM_PSMT4HL:
+	case PSM_PSMT4HH:
+	default:
+		return PixelAddress32(x, y, bp, bw);
+	}
+}
+
+uint ReadFrame(__global uchar* vm, int addr, int psm) // reads element addr of vm as ushort for the 16-bit modes, as uint otherwise
+{
+	switch(psm)
+	{
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		return ((__global ushort*)vm)[addr];
+	case PSM_PSMCT32:
+	case PSM_PSMCT24:
+	case PSM_PSMZ32:
+	case PSM_PSMZ24:
+	default:
+		return ((__global uint*)vm)[addr];
+	}
+}
+
+void WriteFrame(__global uchar* vm, int addr, int psm, uint value) // stores value at element addr: truncated to ushort for the 16-bit modes, as uint otherwise
+{
+	switch(psm)
+	{
+	case PSM_PSMCT16:
+	case PSM_PSMCT16S:
+	case PSM_PSMZ16:
+	case PSM_PSMZ16S:
+		((__global ushort*)vm)[addr] = (ushort)value;
+		break;
+	case PSM_PSMCT32:
+	case PSM_PSMCT24:
+	case PSM_PSMZ32:
+	case PSM_PSMZ24:
+	default:
+		((__global uint*)vm)[addr] = value;
+		break;
+	}
+}
+
+bool is16bit(int psm) // true for psm 2, 3, 6, 7 (the CT16/CT16S/Z16/Z16S enumerators)
+{
+	return (psm & 2) != 0 && psm < 8;
+}
+
+bool is24bit(int psm) // true for psm 1 and 5 (the CT24/Z24 enumerators)
+{
+	return psm < 8 ? (psm & 3) == 1 : false;
+}
+
+bool is32bit(int psm) // true for psm 0 and 4 (the CT32/Z32 enumerators)
+{
+	return psm < 8 ? (psm & 3) == 0 : false;
+}
+
+#ifdef PRIM
+
+int GetVertexPerPrim(int prim_class) // number of vertices per primitive of the given class; unknown classes count as points
+{
+	switch(prim_class)
+	{
+	case GS_TRIANGLE_CLASS: return 3;
+	case GS_LINE_CLASS:
+	case GS_SPRITE_CLASS: return 2;
+	case GS_POINT_CLASS:
+	default: return 1;
+	}
+}
+
+#define VERTEX_PER_PRIM GetVertexPerPrim(PRIM)
+
+#endif
+
+#ifdef KERNEL_PRIM
+
+__kernel void KERNEL_PRIM( // primitive setup: one work-item per primitive; fills env->prim, env->barycentric and env->bbox
+	__global gs_env* env, // receives the per-primitive setup data
+	__global uchar* vb_base, // vertex buffer base; gs_vertex records start at vb_base + vb_start
+	__global uchar* ib_base, // index buffer base; uint indices start at ib_base + ib_start
+	__global uchar* pb_base, // parameter buffer base; gs_param records start at pb_base + pb_start
+	uint vb_start,
+	uint ib_start,
+	uint pb_start)
+{
+	size_t prim_index = get_global_id(0);
+
+	__global gs_vertex* vb = (__global gs_vertex*)(vb_base + vb_start);
+	__global uint* ib = (__global uint*)(ib_base + ib_start);
+	__global gs_prim* prim = &env->prim[prim_index];
+	
+	ib += prim_index * VERTEX_PER_PRIM;
+	// Each index packs a 24-bit vertex index (low bits); the first index also carries the parameter block index in its top 8 bits.
+	uint pb_index = ib[0] >> 24;
+
+	prim->pb_index = pb_index;
+
+	__global gs_param* pb = (__global gs_param*)(pb_base + pb_start + pb_index * TFX_PARAM_SIZE);
+	// NOTE(review): ib[1] and ib[2] are read for every primitive class, including those with fewer than three vertices - confirm the index buffer is padded accordingly.
+	__global gs_vertex* v0 = &vb[ib[0] & 0x00ffffff];
+	__global gs_vertex* v1 = &vb[ib[1] & 0x00ffffff];
+	__global gs_vertex* v2 = &vb[ib[2] & 0x00ffffff];
+
+	int2 pmin, pmax;
+
+	if(PRIM == GS_POINT_CLASS)
+	{
+		pmin = pmax = convert_int2_rte(v0->p.xy);
+
+		prim->v[0].p = v0->p;
+		prim->v[0].tc = v0->tc;
+	}
+	else if(PRIM == GS_LINE_CLASS)
+	{
+		// Only the bounding box is computed for lines; no vertex data is stored in prim->v here.
+		int2 p0 = convert_int2_rte(v0->p.xy);
+		int2 p1 = convert_int2_rte(v1->p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+	}
+	else if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+		int2 p2 = convert_int2_rtp(v2->p.xy);
+
+		pmin = min(min(p0, p1), p2);
+		pmax = max(max(p0, p1), p2);
+
+		// z needs special care, since it's a 32 bit uint, float cannot encode it exactly
+		// only the values relative to zmin (hopefully small) are stored; as_float keeps the uint bit pattern in the float4 lane
+
+		uint zmin = min(min(v0->z, v1->z), v2->z);
+		uint zmax = max(max(v0->z, v1->z), v2->z);
+		
+		prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
+		prim->v[0].tc = v0->tc;
+		prim->v[1].p = (float4)(v1->p.x, v1->p.y, as_float(v1->z - zmin), v1->p.w);
+		prim->v[1].tc = v1->tc;
+		prim->v[2].p = (float4)(v2->p.x, v2->p.y, as_float(v2->z - zmin), v2->p.w);
+		prim->v[2].tc = v2->tc;
+
+		prim->zmin = zmin;
+		prim->zmax = zmax;
+
+		float4 dp0 = v1->p - v0->p;
+		float4 dp1 = v0->p - v2->p;
+		float4 dp2 = v2->p - v1->p;
+		// 2D cross product of two edges; zero means the triangle has no area.
+		float cp = dp0.x * dp1.y - dp0.y * dp1.x;
+
+		if(cp != 0.0f)
+		{
+			cp = native_recip(cp);
+
+			float2 u = dp0.xy * cp;
+			float2 v = -dp1.xy * cp;
+
+			// v0 has the (0, 0, 1) barycentric coord, v1: (0, 1, 0), v2: (1, 0, 0)
+
+			gs_barycentric b;
+
+			b.dx = (float4)(-v.y, u.y, v.y - u.y, v0->p.x);
+			b.dy = (float4)(v.x, -u.x, u.x - v.x, v0->p.y);
+			// Orient the edge vectors by the sign of cp (the reciprocal has the same sign as the original cross product).
+			dp0.xy = dp0.xy * sign(cp);
+			dp1.xy = dp1.xy * sign(cp);
+			dp2.xy = dp2.xy * sign(cp);
+			// Per-coordinate threshold: CL_FLT_EPSILON for edges pointing up, or horizontal edges pointing right; 0 otherwise.
+			b.zero.x = select(0.0f, CL_FLT_EPSILON, (dp1.y < 0) | ((dp1.y == 0) & (dp1.x > 0)));
+			b.zero.y = select(0.0f, CL_FLT_EPSILON, (dp0.y < 0) | ((dp0.y == 0) & (dp0.x > 0)));
+			b.zero.z = select(0.0f, CL_FLT_EPSILON, (dp2.y < 0) | ((dp2.y == 0) & (dp2.x > 0)));
+			
+			// any barycentric(reject_corner) < 0, tile outside the triangle
+
+			b.reject_corner.x = 0.0f + max(max(max(b.dx.x + b.dy.x, b.dx.x), b.dy.x), 0.0f) * BIN_SIZE;
+			b.reject_corner.y = 0.0f + max(max(max(b.dx.y + b.dy.y, b.dx.y), b.dy.y), 0.0f) * BIN_SIZE;
+			b.reject_corner.z = 1.0f + max(max(max(b.dx.z + b.dy.z, b.dx.z), b.dy.z), 0.0f) * BIN_SIZE;
+
+			// TODO: accept_corner, at min value, all barycentric(accept_corner) >= 0, tile fully inside, no per pixel hittest needed
+
+			env->barycentric[prim_index] = b;
+		}
+		else // triangle has zero area
+		{
+			pmax = -1; // won't get included in any tile
+		}
+	}
+	else if(PRIM == GS_SPRITE_CLASS)
+	{
+		int2 p0 = convert_int2_rtp(v0->p.xy);
+		int2 p1 = convert_int2_rtp(v1->p.xy);
+
+		pmin = min(p0, p1);
+		pmax = max(p0, p1);
+		// Per-component swap mask for x and y only; the z and w lanes are never swapped.
+		int4 mask = (int4)(v0->p.xy > v1->p.xy, 0, 0);
+
+		prim->v[0].p = select(v0->p, v1->p, mask); // pmin
+		prim->v[0].tc = select(v0->tc, v1->tc, mask);
+		prim->v[1].p = select(v1->p, v0->p, mask); // pmax
+		prim->v[1].tc = select(v1->tc, v0->tc, mask);
+		prim->v[1].tc.xy = (prim->v[1].tc.xy - prim->v[0].tc.xy) / (prim->v[1].p.xy - prim->v[0].p.xy); // v[1].tc.xy becomes the texcoord change per unit of position
+	}
+
+	int4 scissor = pb->scissor;
+	// Clamp the bounding box to the scissor rectangle (xy = minimum, zw = maximum).
+	pmin = select(pmin, scissor.xy, pmin < scissor.xy);
+	pmax = select(pmax, scissor.zw, pmax > scissor.zw);
+	// Convert to bin units: minimum rounds down, maximum rounds up; the result is saturated to 0..255 below.
+	int4 r = (int4)(pmin, pmax + (int2)(BIN_SIZE - 1)) >> BIN_SIZE_BITS;
+
+	env->bbox[prim_index] = convert_uchar4_sat(r);
+}
+
+#endif
+
+#ifdef KERNEL_TILE
+
+int tile_in_triangle(float2 p, gs_barycentric b)
+{
+	// Evaluate the three barycentric values at p, biased by the per-primitive reject corner.
+	float2 rel = (float2)(p.x - b.dx.w, p.y - b.dy.w);
+	float3 bary = b.dx.xyz * rel.x + b.dy.xyz * rel.y + b.reject_corner.xyz;
+	bary = select(bary, (float3)(0.0f), fabs(bary) < (float3)(CL_FLT_EPSILON * 10)); // flush near-zero values to exactly 0
+	return all(bary >= b.zero.xyz); // non-zero only when every value reaches its threshold
+}
+
+#if CLEAR == 1
+
+// Clear pass: resets the batch range of one bounds entry per work-item
+// (first = -1, last = 0), indexed by the global id in dimension 0.
+__kernel void KERNEL_TILE(__global gs_env* env)
+{
+	const size_t slot = get_global_id(0);
+
+	env->bounds[slot].first = -1;
+	env->bounds[slot].last = 0;
+}
+
+#elif MODE < 3
+
+#if MAX_PRIM_PER_BATCH != 32
+	#error "MAX_PRIM_PER_BATCH != 32"
+#endif
+
+#define MAX_PRIM_PER_GROUP (32u >> MODE)
+
+// Binning kernel for MODE 0..2: builds, for every bin, a bitmask of the primitives
+// handled by this work-group that may touch the bin.
+//
+// get_group_id(2) selects the slice of work: the batch is group_id >> MODE and the
+// first primitive is group_id << (5 - MODE), so one group covers MAX_PRIM_PER_GROUP
+// (32 >> MODE) primitives. Local ids 0/1 select the bin, local id 2 the primitive.
+//
+// env        - reads bbox/barycentric, writes bin and bounds
+// prim_count - number of primitives; used to clamp this group's slice
+// bin_count  - bins per batch (== bin_dim.z * bin_dim.w)
+// bin_dim    - x/y are added to the local bin coordinates before testing
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	uint batch_index = get_group_id(2) >> MODE;
+	uint prim_start = get_group_id(2) << (5 - MODE);
+	uint group_prim_index = get_local_id(2);
+	uint bin_index = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+	__global uchar4* bbox = &env->bbox[prim_start];
+	__global gs_barycentric* barycentric = &env->barycentric[prim_start];
+
+	__local uchar4 bbox_cache[MAX_PRIM_PER_GROUP];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_GROUP];
+	// NOTE(review): indexed by bin_index, so this assumes
+	// get_local_size(0) * get_local_size(1) <= 8 << MODE - confirm against the host launch
+	__local uint visible[8 << MODE];
+
+	// one work-item per bin clears the accumulator before anyone ORs into it
+	if(get_local_id(2) == 0)
+	{
+		visible[bin_index] = 0;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// NOTE(review): unsigned subtraction; wraps if prim_start > prim_count - presumably the host never launches such a group
+	uint group_prim_count = min(prim_count - prim_start, MAX_PRIM_PER_GROUP);
+
+	event_t e = async_work_group_copy(bbox_cache, bbox, group_prim_count, 0);
+
+	wait_group_events(1, &e);
+
+	if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		// gs_barycentric is copied as a run of float4 elements
+		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, group_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	if(group_prim_index < group_prim_count)
+	{
+		int x = bin_dim.x + get_local_id(0);
+		int y = bin_dim.y + get_local_id(1);
+
+		uchar4 r = bbox_cache[group_prim_index];
+
+		// bounding box test: r.xy inclusive, r.zw exclusive
+		uint test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
+
+		if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+		{
+			test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[group_prim_index]);
+		}
+
+		// the first primitive of the group ends up in the highest used bit
+		atomic_or(&visible[bin_index], test << ((MAX_PRIM_PER_GROUP - 1) - get_local_id(2)));
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(get_local_id(2) == 0)
+	{
+		// each group stores its part of the 32-bit bin word; groups with a lower id
+		// write the higher-indexed ushort/uchar element
+		#if MODE == 0
+		((__global uint*)&bin[bin_index])[0] = visible[bin_index];
+		#elif MODE == 1
+		((__global ushort*)&bin[bin_index])[1 - (get_group_id(2) & 1)] = visible[bin_index];
+		#elif MODE == 2
+		((__global uchar*)&bin[bin_index])[3 - (get_group_id(2) & 3)] = visible[bin_index];
+		#endif
+
+		// widen the [first, last] batch range of this bin
+		if(visible[bin_index] != 0)
+		{
+			atomic_min(&env->bounds[bin_index].first, batch_index);
+			atomic_max(&env->bounds[bin_index].last, batch_index);
+		}
+	}
+}
+
+#elif MODE == 3
+
+// Binning kernel for MODE 3: one work-group per batch (get_group_id(0)).
+// The work-items of the group stride over all bins; for each bin every primitive of
+// the batch is tested in turn and the resulting bitmask is stored in env->bin.
+//
+// env        - reads bbox/barycentric, writes bin and bounds
+// prim_count - number of primitives; used to clamp the size of this batch
+// bin_count  - bins per batch (== bin_dim.z * bin_dim.w)
+// bin_dim    - x/y: offset added to the bin coordinates, z: divisor used to split bin_index into x/y
+__kernel void KERNEL_TILE(
+	__global gs_env* env,
+	uint prim_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim)
+{
+	size_t batch_index = get_group_id(0);
+	size_t local_id = get_local_id(0);
+	size_t local_size = get_local_size(0);
+
+	uint batch_prim_count = min(prim_count - (batch_index << MAX_PRIM_PER_BATCH_BITS), MAX_PRIM_PER_BATCH);
+		
+	__global BIN_TYPE* bin = &env->bin[batch_index * bin_count];
+	__global uchar4* bbox = &env->bbox[batch_index << MAX_PRIM_PER_BATCH_BITS];
+	__global gs_barycentric* barycentric = &env->barycentric[batch_index << MAX_PRIM_PER_BATCH_BITS];
+
+	__local uchar4 bbox_cache[MAX_PRIM_PER_BATCH];
+	__local gs_barycentric barycentric_cache[MAX_PRIM_PER_BATCH];
+	
+	event_t e = async_work_group_copy(bbox_cache, bbox, batch_prim_count, 0);
+
+	wait_group_events(1, &e);
+
+	if(PRIM == GS_TRIANGLE_CLASS)
+	{
+		// gs_barycentric is copied as a run of float4 elements
+		e = async_work_group_copy((__local float4*)barycentric_cache, (__global float4*)barycentric, batch_prim_count * (sizeof(gs_barycentric) / sizeof(float4)), 0);
+		
+		wait_group_events(1, &e);
+	}
+
+	for(uint bin_index = local_id; bin_index < bin_count; bin_index += local_size)
+	{
+		int y = bin_index / bin_dim.z; // TODO: very expensive, no integer divider on current hardware
+		int x = bin_index - y * bin_dim.z;
+
+		x += bin_dim.x;
+		y += bin_dim.y;
+
+		BIN_TYPE visible = 0;
+
+		for(uint i = 0; i < batch_prim_count; i++)
+		{
+			uchar4 r = bbox_cache[i];
+
+			// bounding box test: r.xy inclusive, r.zw exclusive
+			BIN_TYPE test = (r.x <= x) & (r.z > x) & (r.y <= y) & (r.w > y);
+
+			if(PRIM == GS_TRIANGLE_CLASS && test != 0)
+			{
+				test = tile_in_triangle(convert_float2((int2)(x, y) << BIN_SIZE_BITS), barycentric_cache[i]);
+			}
+
+			// primitive 0 of the batch maps to bit MAX_PRIM_PER_BATCH - 1
+			visible |= test << ((MAX_PRIM_PER_BATCH - 1) - i);
+		}
+
+		bin[bin_index] = visible;
+
+		// widen the [first, last] batch range of this bin
+		if(visible != 0)
+		{
+			atomic_min(&env->bounds[bin_index].first, batch_index);
+			atomic_max(&env->bounds[bin_index].last, batch_index);
+		}
+	}
+}
+
+#endif
+
+#endif
+
+#ifdef KERNEL_TFX
+
+// Depth test of source depth zs against stored depth zd.
+// Always passes when ZTEST is off; otherwise zd is first cut to 24 bits for
+// 24-bit ZPSM formats and then compared according to ZTST.
+bool ZTest(uint zs, uint zd)
+{
+	if(!ZTEST)
+	{
+		return true;
+	}
+
+	if(is24bit(ZPSM))
+	{
+		zd &= 0x00ffffff;
+	}
+
+	if(ZTST == ZTST_NEVER)
+	{
+		return false;
+	}
+
+	if(ZTST == ZTST_GEQUAL)
+	{
+		return zs >= zd;
+	}
+
+	if(ZTST == ZTST_GREATER)
+	{
+		return zs > zd;
+	}
+
+	// ZTST_ALWAYS and any value not handled above
+	return true;
+}
+
+// Alpha test for one fragment.
+//
+// alpha, aref - values compared according to ATST
+// fm, zm      - in/out masks; for AFAIL modes other than AFAIL_KEEP a failed test
+//               sets bits in these masks instead of rejecting the fragment
+//
+// Returns false only for AFAIL_KEEP with a failed test; every other path returns true.
+bool AlphaTest(int alpha, int aref, uint* fm, uint* zm)
+{
+	// early outs: the test result is not evaluated at all for these combinations
+	// (presumably because the fail action could only affect output that is disabled)
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		break;
+	case AFAIL_FB_ONLY:
+		if(!ZWRITE) return true;
+		break;
+	case AFAIL_ZB_ONLY:
+		if(!FWRITE) return true;
+		break;
+	case AFAIL_RGB_ONLY:
+		if(!ZWRITE && is24bit(FPSM)) return true;
+		break;
+	}
+
+	uint pass;
+	
+	switch(ATST)
+	{
+	case ATST_NEVER:
+		pass = false;
+		break;
+	case ATST_ALWAYS:
+		return true;
+	case ATST_LESS:
+		pass = alpha < aref;
+		break;
+	case ATST_LEQUAL:
+		pass = alpha <= aref;
+		break;
+	case ATST_EQUAL:
+		pass = alpha == aref;
+		break;
+	case ATST_GEQUAL:
+		pass = alpha >= aref;
+		break;
+	case ATST_GREATER:
+		pass = alpha > aref;
+		break;
+	case ATST_NOTEQUAL:
+		pass = alpha != aref;
+		break;
+	}
+
+	// on failure, OR the affected bits into the masks; on success the masks are left untouched
+	switch(AFAIL)
+	{
+	case AFAIL_KEEP:
+		return pass;
+	case AFAIL_FB_ONLY:
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_ZB_ONLY:
+		*fm |= pass ? 0 : 0xffffffff;
+		break;
+	case AFAIL_RGB_ONLY:
+		// only the alpha bits of the frame mask are set (top byte for 32-bit, bit 15 and above for 16-bit)
+		if(is32bit(FPSM)) *fm |= pass ? 0 : 0xff000000;
+		if(is16bit(FPSM)) *fm |= pass ? 0 : 0xffff8000;
+		*zm |= pass ? 0 : 0xffffffff;
+		break;
+	}
+
+	return true;
+}
+
+// Destination alpha test on the packed frame value fd.
+// With DATE off, or for formats that are neither 32-bit nor 16-bit, the test passes.
+// Otherwise the alpha flag (bit 31 for 32-bit, bit 15 for 16-bit) must be set
+// when DATM is non-zero and clear when DATM is zero.
+bool DestAlphaTest(uint fd)
+{
+	if(!DATE)
+	{
+		return true;
+	}
+
+	uint alpha_bit = 0;
+
+	if(is32bit(FPSM))
+	{
+		alpha_bit = 0x80000000;
+	}
+	else if(is16bit(FPSM))
+	{
+		alpha_bit = 0x00008000;
+	}
+
+	if(alpha_bit == 0)
+	{
+		return true;
+	}
+
+	bool flag_set = (fd & alpha_bit) != 0;
+
+	return DATM ? flag_set : !flag_set;
+}
+
+// Applies a texture wrap mode to the integer texel coordinate a.
+//
+// a    - coordinate to wrap
+// b, c - mode dependent parameters: mask and fix bits for the repeat modes,
+//        lower/upper limit for the clamp modes (CLAMP_CLAMP uses 0 as lower limit)
+// mode - one of the CLAMP_* constants
+//
+// With MERGED set the mode is reduced to bit 1: clamp(a, b, c) when it is set,
+// (a & b) | c otherwise.
+// A mode that matches none of the CLAMP_* cases returns a unchanged; previously
+// control fell off the end of this non-void function in that case.
+int Wrap(int a, int b, int c, int mode)
+{
+	if(MERGED)
+	{
+		return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0);
+	}
+
+	switch(mode)
+	{
+	case CLAMP_REGION_REPEAT:
+		return (a & b) | c;
+	case CLAMP_REPEAT:
+		return a & b;
+	case CLAMP_CLAMP:
+		return clamp(a, 0, c);
+	case CLAMP_REGION_CLAMP:
+		return clamp(a, b, c);
+	}
+
+	// unknown mode: keep the result defined
+	return a;
+}
+
+// Blends the source color c with the packed destination frame value fd.
+//
+// The rgb result is ((A - B) * C >> 7) + D, where A, B and D select the source color,
+// the destination color or 0, and C selects source alpha, destination alpha or afix.
+// The selectors come from the ABA/ABB/ABC/ABD macros, or from sel when MERGED is set.
+//
+// c    - source color, one channel per component
+// fd   - packed destination pixel, unpacked here according to FPSM
+// afix - fixed alpha used when C selects it
+// sel  - packed selectors, only decoded when MERGED is set
+//
+// Returns c unchanged unless FWRITE and (ABE or AA1) are set; c.w is never modified.
+int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel)
+{
+	if(FWRITE && (ABE || AA1))
+	{
+		int4 cs = c;
+		int4 cd;
+
+		// unpack the destination color only when one of the selectors can reference it
+		if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED)
+		{
+			if(is32bit(FPSM) || is24bit(FPSM))
+			{
+				cd.x = fd & 0xff;
+				cd.y = (fd >> 8) & 0xff;
+				cd.z = (fd >> 16) & 0xff;
+				cd.w = fd >> 24;
+			}
+			else if(is16bit(FPSM))
+			{
+				// 5-bit channels are moved to the top of each byte, the alpha bit becomes 0x80
+				cd.x = (fd << 3) & 0xf8;
+				cd.y = (fd >> 2) & 0xf8;
+				cd.z = (fd >> 7) & 0xf8;
+				cd.w = (fd >> 8) & 0x80;
+			}
+		}
+
+		if(MERGED)
+		{
+			int aba = TFX_ABA(sel);
+			int abb = TFX_ABB(sel);
+			int abc = TFX_ABC(sel);
+			int abd = TFX_ABD(sel);
+
+			// 24-bit formats use a constant 0x80 as destination alpha
+			int ad = !is24bit(FPSM) ? cd.w : 0x80;
+
+			int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0;
+			int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0;
+			int C = abc == 0 ? cs.w : abc == 1 ? ad : afix;
+			int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0;
+
+			c.xyz = (mul24(A - B, C) >> 7) + D;
+		}
+		else
+		{
+			if(ABA != ABB)
+			{
+				switch(ABA)
+				{
+				case 0: break; // c.xyz = cs.xyz;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
+
+				switch(ABB)
+				{
+				case 0: c.xyz -= cs.xyz; break;
+				case 1: c.xyz -= cd.xyz; break;
+				case 2: break;
+				}
+
+				// the multiply is skipped for 24-bit destination alpha; with the constant
+				// 0x80 used for that case above, (x * 0x80) >> 7 would leave x unchanged
+				if(!(is24bit(FPSM) && ABC == 1))
+				{
+					int a = 0;
+
+					switch(ABC)
+					{
+					case 0: a = cs.w; break;
+					case 1: a = cd.w; break;
+					case 2: a = afix; break;
+					}
+
+					c.xyz = c.xyz * a >> 7;
+				}
+
+				switch(ABD)
+				{
+				case 0: c.xyz += cs.xyz; break;
+				case 1: c.xyz += cd.xyz; break;
+				case 2: break;
+				}
+			}
+			else
+			{
+				// A == B: the product term is zero, only D remains
+				switch(ABD)
+				{
+				case 0: break;
+				case 1: c.xyz = cd.xyz; break;
+				case 2: c.xyz = 0; break;
+				}
+			}
+		}
+
+		if(PABE)
+		{
+			// cs.w << 24 moves bit 7 of the source alpha into the sign bit that the vector
+			// select() examines: blended color when it is set, source color otherwise
+			c.xyz = select(cs.xyz, c.xyz, (int3)(cs.w << 24));
+		}
+	}
+
+	return c;
+}
+
+// Expands the low 24 bits of rgba to four 8-bit channels.
+// Alpha is ta0, except that with AEM set an all-zero rgb value gets alpha 0.
+uchar4 Expand24To32(uint rgba, uchar ta0)
+{
+	uchar4 texel;
+	uchar alpha = 0;
+
+	if(!AEM || (rgba & 0xffffff) != 0)
+	{
+		alpha = ta0;
+	}
+
+	texel.x = rgba & 0xff;
+	texel.y = (rgba >> 8) & 0xff;
+	texel.z = (rgba >> 16) & 0xff;
+	texel.w = alpha;
+
+	return texel;
+}
+
+// Expands a 16-bit pixel (5 bits per color channel, bit 15 as alpha flag) to four
+// 8-bit channels. Alpha is ta1 when bit 15 is set and ta0 otherwise, except that
+// with AEM set a pixel whose low 15 bits are all zero gets alpha 0.
+uchar4 Expand16To32(ushort rgba, uchar ta0, uchar ta1)
+{
+	uchar4 texel;
+	uchar alpha = 0;
+
+	if(!AEM || (rgba & 0x7fff) != 0)
+	{
+		alpha = (rgba & 0x8000) ? ta1 : ta0;
+	}
+
+	texel.x = (rgba << 3) & 0xf8;
+	texel.y = (rgba >> 2) & 0xf8;
+	texel.z = (rgba >> 7) & 0xf8;
+	texel.w = alpha;
+
+	return texel;
+}
+
+// Reads the texel at (x, y) of the given mip level and returns it as int4,
+// one channel per component.
+//
+// vm    - memory the texture is read from
+// level - index into pb->tbp / pb->tbw
+// pb    - parameter block providing tbp/tbw, ta0/ta1 and the clut
+//
+// The storage format is selected by TPSM: 32-bit texels are read directly, 24/16-bit
+// texels are expanded with pb->ta0/ta1 and indexed formats are looked up in pb->clut.
+int4 ReadTexel(__global uchar* vm, int x, int y, int level, __global gs_param* pb)
+{
+	uchar4 c;
+
+	uint addr = PixelAddress(x, y, pb->tbp[level], pb->tbw[level], TPSM);
+
+	// the same memory viewed as 16-bit and 32-bit elements; addr is an element index
+	__global ushort* vm16 = (__global ushort*)vm;
+	__global uint* vm32 = (__global uint*)vm;
+
+	switch(TPSM)
+	{
+	default:
+	case PSM_PSMCT32: 
+	case PSM_PSMZ32:
+		c = ((__global uchar4*)vm)[addr];
+		break;
+	case PSM_PSMCT24: 
+	case PSM_PSMZ24: 
+		c = Expand24To32(vm32[addr], pb->ta0);
+		break;
+	case PSM_PSMCT16: 
+	case PSM_PSMCT16S: 
+	case PSM_PSMZ16: 
+	case PSM_PSMZ16S: 
+		c = Expand16To32(vm16[addr], pb->ta0, pb->ta1);
+		break;
+	case PSM_PSMT8:
+		c = pb->clut[vm[addr]];
+		break;
+	case PSM_PSMT4:
+		// two 4-bit indices per byte; odd addresses use the high nibble
+		c = pb->clut[(vm[addr >> 1] >> ((addr & 1) << 2)) & 0x0f];
+		break;
+	case PSM_PSMT8H:
+		// index stored in the top byte of a 32-bit word
+		c = pb->clut[vm32[addr] >> 24];
+		break;
+	case PSM_PSMT4HL:
+		// index stored in bits 24..27
+		c = pb->clut[(vm32[addr] >> 24) & 0x0f];
+		break;
+	case PSM_PSMT4HH:
+		// index stored in bits 28..31
+		c = pb->clut[(vm32[addr] >> 28) & 0x0f];
+		break;
+	}
+
+	//printf("[%d %d] %05x %d %d %08x | %v4hhd | %08x\n", x, y, pb->tbp[level], pb->tbw[level], TPSM, addr, c, vm[addr]);
+
+	return convert_int4(c);
+}
+
+// Samples the texture at coordinate t and returns the color as int4.
+//
+// tex - memory the texels are read from (passed to ReadTexel)
+// pb  - parameter block providing the wrap limits (minu/maxu/minv/maxv) and sel
+// t   - texture coordinate; xy is divided by z unless FST is set
+//
+// The coordinate is treated as fixed point with 4 fractional bits. With LTF set the
+// four neighbouring texels are blended bilinearly, otherwise the first texel is returned.
+// Only mip level 0 is sampled; the MMIN path is not implemented.
+int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
+{
+	int4 c;
+
+	if(0)//if(MMIN)
+	{
+		// TODO
+	}
+	else
+	{
+		int2 uv;
+
+		if(!FST)
+		{
+			uv = convert_int2_rte(t.xy * native_recip(t.z));
+		}
+		else
+		{
+			// sfex capcom logo third drawing call at (0,223) calculated as:
+			// t0 + (p - p0) * (t - t0) / (p1 - p0)  
+			// 0.5 + (223 - 0) * (112.5 - 0.5) / (224 - 0) = 112
+			// due to rounding errors (multiply-add instruction maybe):
+			// t.y = 111.999..., uv0.y = 111, uvf.y = 15/16, off by 1/16 texel vertically after interpolation
+			// TODO: sw renderer samples at 112 exactly, check which one is correct
+
+			// last line error in persona 3 movie clips if rounding is enabled
+
+			uv = convert_int2(t.xy); 
+		}
+
+		// shift by half a texel (8/16) for linear filtering
+		if(LTF) uv -= 0x0008;
+
+		// fractional part (1/16 texel units) used as the blend weight
+		int2 uvf = uv & 0x000f;
+
+		int2 uv0 = uv >> 4;
+		int2 uv1 = uv0 + 1;
+
+		uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
+		uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
+		uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
+
+		// all four neighbours are fetched here; only c00 is used when LTF is off
+		int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
+		int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
+		int4 c10 = ReadTexel(tex, uv0.x, uv1.y, 0, pb);
+		int4 c11 = ReadTexel(tex, uv1.x, uv1.y, 0, pb);
+
+		if(LTF)
+		{
+			// horizontal blend of both rows, then vertical blend of the two results
+			c00 = (mul24(c01 - c00, uvf.x) >> 4) + c00;
+			c10 = (mul24(c11 - c10, uvf.x) >> 4) + c10;
+			c00 = (mul24(c10 - c00, uvf.y) >> 4) + c00;
+		}
+
+		c = c00;
+	}
+
+	return c;
+}
+
+// TODO: 2x2 MSAA idea
+// downsize the rendering tile to 16x8 or 8x8 and render 2x2 sub-pixels to __local
+// hittest and ztest 2x2 (create write mask, only skip if all -1) 
+// calculate color 1x1, alpha tests 1x1
+// use mask to filter failed sub-pixels when writing to __local
+// needs the tile data to be fetched at the beginning, even if rfb/zfb is not set, unless we know the tile is fully covered
+// multiple work-items may render different prims to the same 2x2 sub-pixel, averaging can only be done after a barrier at the very end
+// pb->fm? alpha channel and following alpha tests? some games may depend on exact results, not some average
+
+// Per-pixel rendering kernel: one work-item per pixel, (x, y) = global id, in 8x8 work-groups.
+//
+// For the bin that contains the pixel, the kernel walks the per-batch visibility masks
+// in env->bin and, for every flagged primitive, runs the scissor, coverage, z, alpha and
+// destination alpha tests, then texturing, fog and blending. The running frame and z
+// values are kept in fd/zd and written back once at the end, and only when at least
+// one fragment passed all tests.
+//
+// env                    - bounds, bin masks, primitives and barycentric data
+// vm                     - memory holding the frame and z buffers
+// tex                    - memory passed to SampleTexture
+// pb_base, pb_start      - byte base and offset of the gs_param blocks (indexed by prim->pb_index * TFX_PARAM_SIZE)
+// prim_start, prim_count - range of primitives to draw
+// bin_count, bin_dim     - bins per batch and bin grid origin (x/y) / width (z)
+// fbp, zbp, bw           - frame base, z base and buffer width passed to PixelAddress
+__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
+	__global gs_env* env,
+	__global uchar* vm,
+	__global uchar* tex,
+	__global uchar* pb_base, 
+	uint pb_start,
+	uint prim_start, 
+	uint prim_count,
+	uint bin_count, // == bin_dim.z * bin_dim.w
+	uchar4 bin_dim,
+	uint fbp, 
+	uint zbp, 
+	uint bw)
+{
+	uint x = get_global_id(0);
+	uint y = get_global_id(1);
+
+	uint bin_x = (x >> BIN_SIZE_BITS) - bin_dim.x;
+	uint bin_y = (y >> BIN_SIZE_BITS) - bin_dim.y;
+	uint bin_index = mad24(bin_y, (uint)bin_dim.z, bin_x);
+
+	uint batch_first = env->bounds[bin_index].first;
+	uint batch_last = env->bounds[bin_index].last;
+	uint batch_start = prim_start >> MAX_PRIM_PER_BATCH_BITS;
+
+	// empty range: no batch touched this bin
+	if(batch_last < batch_first)
+	{
+		return;
+	}
+
+	uint skip;
+	
+	if(batch_start < batch_first)
+	{
+		// jump ahead to the first batch that touches this bin; n = primitives skipped by doing so
+		uint n = (batch_first - batch_start) * MAX_PRIM_PER_BATCH - (prim_start & (MAX_PRIM_PER_BATCH - 1));
+
+		if(n > prim_count) 
+		{
+			return;
+		}
+
+		skip = 0;
+		prim_count -= n;
+		batch_start = batch_first;
+	}
+	else
+	{
+		// start inside a batch: count from the batch start and mask off the leading primitives below
+		skip = prim_start & (MAX_PRIM_PER_BATCH - 1);
+		prim_count += skip;
+	}
+
+	if(batch_start > batch_last) 
+	{
+		return;
+	}
+	
+	prim_count = min(prim_count, (batch_last - batch_start + 1) << MAX_PRIM_PER_BATCH_BITS);
+
+	//
+
+	int2 pi = (int2)(x, y);
+	float2 pf = convert_float2(pi);
+
+	int faddr = PixelAddress(x, y, fbp, bw, FPSM);
+	int zaddr = PixelAddress(x, y, zbp, bw, ZPSM);
+
+	// NOTE(review): fd/zd stay uninitialized when RFB/RZB are off - presumably the
+	// tests that read them are disabled in those configurations; confirm on the host side
+	uint fd, zd; // TODO: fd as int4 and only pack before writing out?
+
+	if(RFB) 
+	{
+		fd = ReadFrame(vm, faddr, FPSM);
+	}
+
+	if(RZB)
+	{
+		zd = ReadFrame(vm, zaddr, ZPSM);
+	}
+
+	// early destination alpha test
+
+	if(!DestAlphaTest(fd))
+	{
+		return;
+	}
+
+	//
+
+	uint fragments = 0;
+
+	__global BIN_TYPE* bin = &env->bin[bin_index + batch_start * bin_count]; // TODO: not needed for "one tile case"
+	__global gs_prim* prim_base = &env->prim[batch_start << MAX_PRIM_PER_BATCH_BITS];
+	__global gs_barycentric* barycentric = &env->barycentric[batch_start << MAX_PRIM_PER_BATCH_BITS];
+
+	pb_base += pb_start;
+
+	// clear the top 'skip' bits, i.e. the primitives of the first batch that precede prim_start
+	BIN_TYPE bin_value = *bin & ((BIN_TYPE)-1 >> skip);
+
+	for(uint prim_index = 0; prim_index < prim_count; prim_index += MAX_PRIM_PER_BATCH)
+	{
+		while(bin_value != 0)
+		{
+			// highest set bit = lowest numbered primitive still pending in this batch
+			uint i = clz(bin_value);
+
+			if(prim_index + i >= prim_count)
+			{
+				break;
+			}
+
+			bin_value ^= (BIN_TYPE)1 << ((MAX_PRIM_PER_BATCH - 1) - i); // bin_value &= (ulong)-1 >> (i + 1);
+
+			__global gs_prim* prim = &prim_base[prim_index + i];
+			__global gs_param* pb = (__global gs_param*)(pb_base + prim->pb_index * TFX_PARAM_SIZE);
+
+			if(!NOSCISSOR)
+			{
+				// scissor.xy inclusive, scissor.zw exclusive
+				if(!all((pi >= pb->scissor.xy) & (pi < pb->scissor.zw)))
+				{
+					continue;
+				}
+			}
+			
+			uint2 zf;
+			float3 t;
+			int4 c;
+
+			 // TODO: do not hittest if we know the tile is fully inside the prim
+
+			if(PRIM == GS_POINT_CLASS)
+			{
+				float2 dpf = pf - prim->v[0].p.xy;
+
+				if(!all((dpf <= 0.5f) & (dpf > -0.5f)))
+				{
+					continue;
+				}
+
+				zf = as_uint2(prim->v[0].p.zw);
+				t = prim->v[0].tc.xyz;
+				c = convert_int4(prim->v[0].c);
+			}
+			else if(PRIM == GS_LINE_CLASS)
+			{
+				// TODO: find point on line perpendicular to (x,y), distance.x < 0.5f || distance.y < 0.5f
+				// TODO: aa1: coverage ~ distance.x/y, slope selects x or y, zwrite disabled
+				// TODO: do not draw last pixel of the line
+
+				continue;
+			}
+			else if(PRIM == GS_TRIANGLE_CLASS)
+			{
+				// TODO: aa1: draw edge as a line
+
+				// early reject using the primitive's zmax before doing any interpolation
+				if(!ZTest(prim->zmax, zd))
+				{
+					continue;
+				}
+
+				__global gs_barycentric* b = &barycentric[prim_index + i];
+
+				// same edge functions as tile_in_triangle, with the constant (0, 0, 1) instead of reject_corner
+				float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
+
+				if(!all(select(f, (float3)(0.0f), fabs(f) < (float3)(CL_FLT_EPSILON * 10)) >= b->zero.xyz))
+				{
+					continue;
+				}
+
+				float2 zf0 = convert_float2(as_uint2(prim->v[0].p.zw));
+				float2 zf1 = convert_float2(as_uint2(prim->v[1].p.zw));
+				float2 zf2 = convert_float2(as_uint2(prim->v[2].p.zw));
+
+				// f.z weights vertex 0, f.x vertex 1 and f.y vertex 2; prim->zmin is added back to z
+				zf.x = convert_uint_rte(zf0.x * f.z + zf1.x * f.x + zf2.x * f.y) + prim->zmin;
+				zf.y = convert_uint_rte(zf0.y * f.z + zf1.y * f.x + zf2.y * f.y);
+
+				t = prim->v[0].tc.xyz * f.z + prim->v[1].tc.xyz * f.x + prim->v[2].tc.xyz * f.y;
+
+				if(IIP)
+				{
+					float4 c0 = convert_float4(prim->v[0].c);
+					float4 c1 = convert_float4(prim->v[1].c);
+					float4 c2 = convert_float4(prim->v[2].c);
+
+					c = convert_int4_rte(c0 * f.z + c1 * f.x + c2 * f.y);
+				}
+				else
+				{
+					// flat shading: color of the last vertex
+					c = convert_int4(prim->v[2].c);
+				}
+			}
+			else if(PRIM == GS_SPRITE_CLASS)
+			{
+				int2 tl = convert_int2_rtp(prim->v[0].p.xy);
+				int2 br = convert_int2_rtp(prim->v[1].p.xy);
+
+				if(!all((pi >= tl) & (pi < br)))
+				{
+					continue;
+				}
+
+				zf = as_uint2(prim->v[1].p.zw);
+				
+				// v[1].tc.xy is used as a per-pixel texture coordinate step here
+				t.xy = prim->v[0].tc.xy + prim->v[1].tc.xy * (pf - prim->v[0].p.xy);
+				t.z = prim->v[0].tc.z;
+
+				c = convert_int4(prim->v[1].c);
+			}
+
+			// z test
+
+			uint zs = zf.x;
+
+			if(!ZTest(zs, zd))
+			{
+				continue;
+			}
+
+			// sample texture
+
+			int4 ct;
+
+			if(TFX != TFX_NONE)
+			{
+				ct = SampleTexture(tex, pb, t);
+			}
+
+			// alpha tfx
+
+			// keep the vertex alpha, the highlight modes add it to rgb further down
+			int alpha = c.w;
+
+			if(FB)
+			{
+				if(TCC)
+				{
+					switch(TFX)
+					{
+					case TFX_MODULATE:
+						c.w = clamp(mul24(ct.w, c.w) >> 7, 0, 0xff);
+						break;
+					case TFX_DECAL:
+						c.w = ct.w;
+						break;
+					case TFX_HIGHLIGHT:
+						c.w = clamp(ct.w + c.w, 0, 0xff);
+						break;
+					case TFX_HIGHLIGHT2:
+						c.w = ct.w;
+						break;
+					}
+				}
+
+				if(AA1)
+				{
+					if(!ABE || c.w == 0x80)
+					{
+						c.w = 0x80; // TODO: edge ? coverage : 0x80
+					}
+				}
+			}
+
+			// read mask
+
+			uint fm = pb->fm;
+			uint zm = pb->zm;
+
+			// alpha test
+
+			if(!AlphaTest(c.w, pb->aref, &fm, &zm))
+			{
+				continue;
+			}
+
+			// all tests done, we have a new output
+
+			fragments++;
+
+			// write z
+
+			if(ZWRITE)
+			{
+				// bits set in zm keep the old value
+				zd = RZB ? bitselect(zs, zd, zm) : zs;
+			}
+
+			// rgb tfx
+
+			if(FWRITE)
+			{
+				switch(TFX)
+				{
+				case TFX_MODULATE:
+					c.xyz = clamp(mul24(ct.xyz, c.xyz) >> 7, 0, 0xff);
+					break;
+				case TFX_DECAL:
+					c.xyz = ct.xyz;
+					break;
+				case TFX_HIGHLIGHT:
+				case TFX_HIGHLIGHT2:					
+					c.xyz = clamp((mul24(ct.xyz, c.xyz) >> 7) + alpha, 0, 0xff);
+					break;
+				}
+			}
+
+			// fog
+
+			if(FWRITE && FGE)
+			{
+				int fog = (int)zf.y;
+
+				int3 fv = mul24(c.xyz, fog) >> 8;
+				int3 fc = mul24(convert_int4(pb->fog).xyz, 0xff - fog) >> 8;
+
+				c.xyz = fv + fc;
+			}
+
+			// alpha blend
+
+			c = AlphaBlend(c, fd, pb->afix, pb->sel);
+
+			// write frame
+
+			if(FWRITE)
+			{
+				if(DTHE && is16bit(FPSM))
+				{
+					c.xyz += pb->dimx[y & 3][x & 3];
+				}
+
+				c = COLCLAMP ? clamp(c, 0, 0xff) : c & 0xff;
+				
+				if(FBA && !is24bit(FPSM))
+				{
+					c.w |= 0x80;
+				}
+
+				uint fs;
+
+				if(is32bit(FPSM))
+				{
+					fs = (c.w << 24) | (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is24bit(FPSM))
+				{
+					fs = (c.z << 16) | (c.y << 8) | c.x;
+				}
+				else if(is16bit(FPSM))
+				{
+					fs = ((c.w & 0x80) << 8) | ((c.z & 0xf8) << 7) | ((c.y & 0xf8) << 2) | (c.x >> 3);
+				}
+
+				// bits set in fm keep the old value
+				fd = RFB ? bitselect(fs, fd, fm) : fs;
+
+				// dest alpha test for the next loop
+
+				if(!DestAlphaTest(fd))
+				{
+					prim_index = prim_count; // game over
+
+					break;
+				}
+			}
+		}
+
+		// NOTE(review): this also loads the mask that follows the last processed batch;
+		// presumably env->bin is large enough for that read - confirm
+		bin += bin_count;
+		bin_value = *bin;
+	}
+
+	if(fragments > 0)
+	{
+		if(ZWRITE)
+		{
+			WriteFrame(vm, zaddr, ZPSM, zd);
+		}
+
+		if(FWRITE)
+		{
+			WriteFrame(vm, faddr, FPSM, fd);
+		}
+	}
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx_legacy/res/tfx.fx b/plugins/GSdx_legacy/res/tfx.fx
new file mode 100644
index 0000000000..518e884a6e
--- /dev/null
+++ b/plugins/GSdx_legacy/res/tfx.fx
@@ -0,0 +1,834 @@
+#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency
+#define FMT_32 0
+#define FMT_24 1
+#define FMT_16 2
+#define FMT_PAL 4 /* flag bit */
+
+// And I say this as an ATI user.
+#define ATI_SUCKS 1
+
+#if SHADER_MODEL >= 0x400
+
+#ifndef VS_BPPZ
+#define VS_BPPZ 0
+#define VS_TME 1
+#define VS_FST 1
+#endif
+
+#ifndef GS_IIP
+#define GS_IIP 0
+#define GS_PRIM 3
+#endif
+
+#ifndef PS_FST
+#define PS_FST 0
+#define PS_WMS 0
+#define PS_WMT 0
+#define PS_FMT FMT_32
+#define PS_AEM 0
+#define PS_TFX 0
+#define PS_TCC 1
+#define PS_ATST 1
+#define PS_FOG 0
+#define PS_CLR1 0
+#define PS_FBA 0
+#define PS_AOUT 0
+#define PS_LTF 1
+#define PS_COLCLIP 0
+#define PS_DATE 0
+#define PS_SPRITEHACK 0
+#define PS_TCOFFSETHACK 0
+#define PS_POINT_SAMPLER 0
+#define PS_SHUFFLE 0
+#define PS_READ_BA 0
+#endif
+
+// Vertex shader input (shader model 4+).
+struct VS_INPUT
+{
+	float2 st : TEXCOORD0; // texture coordinate used when VS_FST is 0
+	float4 c : COLOR0; // vertex color
+	float q : TEXCOORD1; // passed on as t.w when VS_FST is 0
+	uint2 p : POSITION0; // x/y position
+	uint z : POSITION1; // depth, masked to 24/16 bits depending on VS_BPPZ
+	uint2 uv : TEXCOORD2; // texture coordinate used when VS_FST is 1 (scaled by TextureScale)
+	float4 f : COLOR1; // f.r is passed on as the fog factor (t.z)
+};
+
+// Vertex shader output (shader model 4+).
+struct VS_OUTPUT
+{
+	float4 p : SV_Position;
+	float4 t : TEXCOORD0; // xy: texture coordinate, z: fog factor, w: q
+#if VS_RTCOPY
+	float4 tp : TEXCOORD1; // position remapped with * (0.5, -0.5) + 0.5, see vs_main
+#endif
+	float4 c : COLOR0;
+};
+
+// Pixel shader input (shader model 4+).
+struct PS_INPUT
+{
+	float4 p : SV_Position;
+	float4 t : TEXCOORD0; // xy: texture coordinate, z: fog factor, w: q
+#if PS_DATE > 0
+	float4 tp : TEXCOORD1; // coordinate used by datst() to sample RTCopy
+#endif
+	float4 c : COLOR0;
+};
+
+// Pixel shader output (shader model 4+).
+struct PS_OUTPUT
+{
+	float4 c0 : SV_Target0; // final color
+	float4 c1 : SV_Target1; // alpha * 2 in all channels, used for alpha blending
+};
+
+Texture2D<float4> Texture : register(t0);
+Texture2D<float4> Palette : register(t1);
+Texture2D<float4> RTCopy : register(t2);
+SamplerState TextureSampler : register(s0);
+SamplerState PaletteSampler : register(s1);
+SamplerState RTCopySampler : register(s2);
+
+// Vertex shader constants (shader model 4+).
+cbuffer cb0
+{
+	float4 VertexScale; // output position = p * VertexScale - VertexOffset
+	float4 VertexOffset;
+	float2 TextureScale; // multiplies the integer uv when VS_FST is set
+};
+
+// Pixel shader constants (shader model 4+).
+cbuffer cb1
+{
+	float3 FogColor; // color blended in by fog()
+	float AREF; // alpha test reference, compared against alpha * 255
+	float4 HalfTexel; // offset added to st.xyxy for linear filtering
+	float4 WH; // texture size factors used to convert between uv and texel units
+	float4 MinMax; // clamp limits (xy: min, zw: max)
+	float2 MinF; // lower clamp limit used by clampuv()
+	float2 TA; // alpha values for 24/16-bit texture formats (x, y)
+	uint4 MskFix; // xy: and-mask, zw: or-bits for wrap mode 3
+	float4 TC_OffsetHack; // xy added to st when PS_TCOFFSETHACK is set
+};
+
+// Samples the color texture at uv (shader model 4+).
+// With ATI_SUCKS and PS_POINT_SAMPLER set, uv is first snapped to the centre of its texel.
+float4 sample_c(float2 uv)
+{
+	if (ATI_SUCKS && PS_POINT_SAMPLER)
+	{
+		// Weird issue with ATI cards (happens on at least HD 4xxx and 5xxx),
+		// it looks like they add 127/128 of a texel to sampling coordinates
+		// occasionally causing point sampling to erroneously round up.
+		// I'm manually adjusting coordinates to the centre of texels here,
+		// though the centre is just paranoia, the top left corner works fine.
+		uv = (trunc(uv * WH.zw) + float2(0.5, 0.5)) / WH.zw;
+	}
+	return Texture.Sample(TextureSampler, uv);
+}
+
+// Looks up the palette entry at coordinate u (shader model 4+).
+float4 sample_p(float u)
+{
+	float4 entry = Palette.Sample(PaletteSampler, u);
+
+	return entry;
+}
+
+// Samples the render target copy at uv (shader model 4+).
+float4 sample_rt(float2 uv)
+{
+	float4 dest = RTCopy.Sample(RTCopySampler, uv);
+
+	return dest;
+}
+
+#elif SHADER_MODEL <= 0x300
+
+#ifndef VS_BPPZ
+#define VS_BPPZ 0
+#define VS_TME 1
+#define VS_FST 1
+#define VS_LOGZ 1
+#endif
+
+#ifndef PS_FST
+#define PS_FST 0
+#define PS_WMS 0
+#define PS_WMT 0
+#define PS_FMT FMT_32
+#define PS_AEM 0
+#define PS_TFX 0
+#define PS_TCC 0
+#define PS_ATST 4
+#define PS_FOG 0
+#define PS_CLR1 0
+#define PS_RT 0
+#define PS_LTF 0
+#define PS_COLCLIP 0
+#define PS_DATE 0
+#endif
+
+// Vertex shader input (shader model 3 and below).
+struct VS_INPUT
+{
+	float4 p : POSITION0; // xyz: position, w: passed on as t.w when VS_FST is 0
+	float2 t : TEXCOORD0; // texture coordinate (scaled by TextureScale when VS_FST is 1)
+	float4 c : COLOR0; // vertex color
+	float4 f : COLOR1; // f.b is passed on as the fog factor (t.z)
+};
+
+// Vertex shader output (shader model 3 and below).
+struct VS_OUTPUT
+{
+	float4 p : POSITION;
+	float4 t : TEXCOORD0; // xy: texture coordinate, z: fog factor, w: q
+#if VS_RTCOPY
+	float4 tp : TEXCOORD1; // position remapped with * (0.5, -0.5) + 0.5, see vs_main
+#endif
+	float4 c : COLOR0;
+};
+
+// Pixel shader input (shader model 3 and below).
+struct PS_INPUT
+{
+	float4 t : TEXCOORD0; // xy: texture coordinate, z: fog factor, w: q
+#if PS_DATE > 0
+	float4 tp : TEXCOORD1; // coordinate used by datst() to sample RTCopy
+#endif
+	float4 c : COLOR0;
+};
+
+sampler Texture : register(s0);
+sampler Palette : register(s1);
+sampler RTCopy : register(s2);
+sampler1D UMSKFIX : register(s3);
+sampler1D VMSKFIX : register(s4);
+
+float4 vs_params[3];
+
+#define VertexScale vs_params[0]
+#define VertexOffset vs_params[1]
+#define TextureScale vs_params[2].xy
+
+float4 ps_params[7];
+
+#define FogColor	ps_params[0].bgr
+#define AREF		ps_params[0].a
+#define HalfTexel	ps_params[1]
+#define WH			ps_params[2]
+#define MinMax		ps_params[3]
+#define MinF		ps_params[4].xy
+#define TA			ps_params[4].zw
+
+#define TC_OffsetHack ps_params[6]
+
+// Samples the color texture at uv (shader model 3 and below).
+float4 sample_c(float2 uv)
+{
+	float4 texel = tex2D(Texture, uv);
+
+	return texel;
+}
+
+// Looks up the palette entry at coordinate u (shader model 3 and below).
+float4 sample_p(float u)
+{
+	float4 entry = tex2D(Palette, u);
+
+	return entry;
+}
+
+// Samples the render target copy at uv (shader model 3 and below).
+float4 sample_rt(float2 uv)
+{
+	float4 dest = tex2D(RTCopy, uv);
+
+	return dest;
+}
+
+#endif
+
+// Applies wrap modes 2 (clamp to MinMax) and 3 (mask/fix) to two texture
+// coordinates packed as uv.xy and uv.zw. x/z follow PS_WMS, y/w follow PS_WMT.
+// Modes 0 and 1 are not handled here (their code is commented out below).
+// Mode 3 uses integer and/or with MskFix on shader model 4+ and the UMSKFIX/VMSKFIX
+// lookup textures on shader model 3 and below.
+float4 wrapuv(float4 uv)
+{
+	// same mode on both axes: handle all four components at once
+	if(PS_WMS == PS_WMT)
+	{
+/*
+		if(PS_WMS == 0)
+		{
+			uv = frac(uv);
+		}
+		else if(PS_WMS == 1)
+		{
+			uv = saturate(uv);
+		}
+		else
+*/ 
+		if(PS_WMS == 2)
+		{
+			uv = clamp(uv, MinMax.xyxy, MinMax.zwzw);
+		}
+		else if(PS_WMS == 3)
+		{
+			#if SHADER_MODEL >= 0x400
+			uv = (float4)(((int4)(uv * WH.xyxy) & MskFix.xyxy) | MskFix.zwzw) / WH.xyxy;
+			#elif SHADER_MODEL <= 0x300
+			uv.x = tex1D(UMSKFIX, uv.x);
+			uv.y = tex1D(VMSKFIX, uv.y);
+			uv.z = tex1D(UMSKFIX, uv.z);
+			uv.w = tex1D(VMSKFIX, uv.w);
+			#endif
+		}
+	}
+	else
+	{
+		// different modes: horizontal components (x, z) first, then vertical (y, w)
+/*	
+		if(PS_WMS == 0)
+		{
+			uv.xz = frac(uv.xz);
+		}
+		else if(PS_WMS == 1)
+		{
+			uv.xz = saturate(uv.xz);
+		}
+		else 
+*/		
+		if(PS_WMS == 2)
+		{
+			uv.xz = clamp(uv.xz, MinMax.xx, MinMax.zz);
+		}
+		else if(PS_WMS == 3)
+		{
+			#if SHADER_MODEL >= 0x400
+			uv.xz = (float2)(((int2)(uv.xz * WH.xx) & MskFix.xx) | MskFix.zz) / WH.xx;
+			#elif SHADER_MODEL <= 0x300
+			uv.x = tex1D(UMSKFIX, uv.x);
+			uv.z = tex1D(UMSKFIX, uv.z);
+			#endif
+		}
+/*
+		if(PS_WMT == 0)
+		{
+			uv.yw = frac(uv.yw);
+		}
+		else if(PS_WMT == 1)
+		{
+			uv.yw = saturate(uv.yw);
+		}
+		else 
+*/
+		if(PS_WMT == 2)
+		{
+			uv.yw = clamp(uv.yw, MinMax.yy, MinMax.ww);
+		}
+		else if(PS_WMT == 3)
+		{
+			#if SHADER_MODEL >= 0x400
+			uv.yw = (float2)(((int2)(uv.yw * WH.yy) & MskFix.yy) | MskFix.ww) / WH.yy;
+			#elif SHADER_MODEL <= 0x300
+			uv.y = tex1D(VMSKFIX, uv.y);
+			uv.w = tex1D(VMSKFIX, uv.w);
+			#endif
+		}
+	}
+	
+	return uv;
+}
+
+// Clamps a single texture coordinate for wrap mode 2: each axis whose mode is 2
+// is limited to [MinF, MinMax.zw]; the other axes pass through unchanged.
+float2 clampuv(float2 uv)
+{
+	// the axes are independent, so each one is handled on its own
+	if(PS_WMS == 2)
+	{
+		uv.x = clamp(uv.x, MinF.x, MinMax.z);
+	}
+
+	if(PS_WMT == 2)
+	{
+		uv.y = clamp(uv.y, MinF.y, MinMax.w);
+	}
+
+	return uv;
+}
+
+// Fetches the four colors at the corner combinations of uv.xy and uv.zw.
+// Rows of the result, in order: (x, y), (z, y), (x, w), (z, w).
+float4x4 sample_4c(float4 uv)
+{
+	float4 c_xy = sample_c(uv.xy);
+	float4 c_zy = sample_c(uv.zy);
+	float4 c_xw = sample_c(uv.xw);
+	float4 c_zw = sample_c(uv.zw);
+
+	return float4x4(c_xy, c_zy, c_xw, c_zw);
+}
+
+// Fetches only the alpha channel at the four corner combinations of uv.xy and uv.zw,
+// in the order (x, y), (z, y), (x, w), (z, w), and remaps it with * 255/256 + 0.5/256
+// for use as a palette coordinate. On shader model 3 and below the alpha is scaled
+// by 128/255 first when PS_RT is set.
+float4 sample_4a(float4 uv)
+{
+	float4 a = float4(
+		sample_c(uv.xy).a,
+		sample_c(uv.zy).a,
+		sample_c(uv.xw).a,
+		sample_c(uv.zw).a);
+
+	#if SHADER_MODEL <= 0x300
+	if(PS_RT) a *= 128.0f / 255;
+	#endif
+
+	return a * 255./256 + 0.5/256;
+}
+
+// Looks up four palette entries, one per component of u; row i of the result
+// holds the entry for the i-th component.
+float4x4 sample_4p(float4 u)
+{
+	float4 p_x = sample_p(u.x);
+	float4 p_y = sample_p(u.y);
+	float4 p_z = sample_p(u.z);
+	float4 p_w = sample_p(u.w);
+
+	return float4x4(p_x, p_y, p_z, p_w);
+}
+
+// Samples the texture for one pixel and returns the texel color.
+//
+// st - texture coordinate; divided by q unless PS_FST is set
+// q  - perspective divisor
+//
+// Non-filtered, non-paletted textures with wrap modes below 3 take a single clamped
+// sample. Everything else goes through wrapuv() and fetches four texels (through the
+// palette when PS_FMT has FMT_PAL set), which are blended bilinearly when PS_LTF is set.
+// The alpha channel is then rewritten for 24-bit and 16-bit formats using TA and PS_AEM.
+float4 sample(float2 st, float q)
+{
+	if(!PS_FST) st /= q;
+
+	#if PS_TCOFFSETHACK
+	st += TC_OffsetHack.xy;
+	#endif 
+
+	float4 t;
+	float4x4 c;
+	float2 dd;
+
+/*	
+	if(!PS_LTF && PS_FMT <= FMT_16 && PS_WMS < 2 && PS_WMT < 2)
+	{
+		c[0] = sample_c(st);
+	}
+*/
+	// fast path: only c[0] is filled in, and only c[0] is used for the result below
+	if (!PS_LTF && PS_FMT <= FMT_16 && PS_WMS < 3 && PS_WMT < 3)
+	{
+		c[0] = sample_c(clampuv(st));
+	}
+	else
+	{
+		float4 uv;
+
+		if(PS_LTF)
+		{
+			uv = st.xyxy + HalfTexel;
+			// fractional texel position, used as the bilinear weights
+			dd = frac(uv.xy * WH.zw);
+		}
+		else
+		{
+			uv = st.xyxy;
+		}
+
+		uv = wrapuv(uv);
+
+		if(PS_FMT & FMT_PAL)
+		{
+			// the texture alpha holds the palette index
+			c = sample_4p(sample_4a(uv));
+		}
+		else
+		{
+			c = sample_4c(uv);
+		}
+	}
+
+	[unroll]
+	for (uint i = 0; i < 4; i++)
+	{
+		if((PS_FMT & ~FMT_PAL) == FMT_32)
+		{
+			#if SHADER_MODEL <= 0x300
+			if(PS_RT) c[i].a *= 128.0f / 255;
+			#endif
+		}
+		else if((PS_FMT & ~FMT_PAL) == FMT_24)
+		{
+			// alpha is TA.x, or 0 for all-zero rgb when PS_AEM is set
+			c[i].a = !PS_AEM || any(c[i].rgb) ? TA.x : 0;
+		}
+		else if((PS_FMT & ~FMT_PAL) == FMT_16)
+		{
+			// alpha >= 0.5 selects TA.y, otherwise the same rule as for 24-bit applies
+			c[i].a = c[i].a >= 0.5 ? TA.y : !PS_AEM || any(c[i].rgb) ? TA.x : 0; 
+		}
+	}
+
+	if(PS_LTF)
+	{	
+		t = lerp(lerp(c[0], c[1], dd.x), lerp(c[2], c[3], dd.x), dd.y);
+	}
+	else
+	{
+		t = c[0];
+	}
+
+	return t;
+}
+
+// Combines the texel color t with the vertex color c according to PS_TFX and
+// returns the saturated result.
+//   0: c * t * 255/128 (rgb only unless PS_TCC)
+//   1: c = t (rgb only unless PS_TCC)
+//   2: rgb = c.rgb * t.rgb * 255/128 + c.a; with PS_TCC alpha becomes c.a + t.a
+//   3: rgb as in 2; with PS_TCC alpha becomes t.a
+// Any other PS_TFX value only saturates c.
+float4 tfx(float4 t, float4 c)
+{
+	if(PS_TFX == 0)
+	{
+		if(PS_TCC) 
+		{
+			c = c * t * 255.0f / 128;
+		}
+		else
+		{
+			c.rgb = c.rgb * t.rgb * 255.0f / 128;
+		}
+	}
+	else if(PS_TFX == 1)
+	{
+		if(PS_TCC) 
+		{
+			c = t;
+		}
+		else
+		{
+			c.rgb = t.rgb;
+		}
+	}
+	else if(PS_TFX == 2)
+	{
+		// rgb uses the vertex alpha, so it is computed before alpha is changed
+		c.rgb = c.rgb * t.rgb * 255.0f / 128 + c.a;
+
+		if(PS_TCC) 
+		{
+			c.a += t.a;
+		}
+	}
+	else if(PS_TFX == 3)
+	{
+		// rgb uses the vertex alpha, so it is computed before alpha is changed
+		c.rgb = c.rgb * t.rgb * 255.0f / 128 + c.a;
+
+		if(PS_TCC) 
+		{
+			c.a = t.a;
+		}
+	}
+	
+	return saturate(c);
+}
+
+// Destination alpha test; compiled to nothing unless PS_DATE > 0.
+// Reads the alpha of the render target copy at input.tp.xy and discards the pixel
+// when PS_DATE is 1 and alpha >= threshold, or when PS_DATE is 2 and alpha < threshold.
+// The threshold is 128/255 on shader model 4+ and 1 otherwise.
+void datst(PS_INPUT input)
+{
+#if PS_DATE > 0
+	float alpha = sample_rt(input.tp.xy).a;
+#if SHADER_MODEL >= 0x400
+	float alpha0x80 = 128. / 255;
+#else
+	float alpha0x80 = 1;
+#endif
+
+	if (PS_DATE == 1 && alpha >= alpha0x80)
+		discard;
+	else if (PS_DATE == 2 && alpha < alpha0x80)
+		discard;
+#endif
+}
+
+// Alpha test: discards the pixel when c.a fails the comparison selected by PS_ATST
+// against AREF. Alpha is first converted to an integer in 0..255; the +/- 0.5 offsets
+// turn the integer comparisons into sign tests, since clip() discards on negative values.
+void atst(float4 c)
+{
+	// the small bias guards against values that land just below an integer
+	float a = trunc(c.a * 255 + 0.01);
+	
+	if(PS_ATST == 0) // never
+	{
+		discard;
+	}
+	else if(PS_ATST == 1) // always
+	{
+		// nothing to do
+	}
+	else if(PS_ATST == 2) // l
+	{
+		// the "less" test is skipped entirely when PS_SPRITEHACK is enabled
+		#if PS_SPRITEHACK == 0
+		clip(AREF - a - 0.5f);
+		#endif				
+	}
+	else if(PS_ATST == 3) // le
+	{
+		clip(AREF - a + 0.5f);
+	}
+	else if(PS_ATST == 4) // e
+	{
+		clip(0.5f - abs(a - AREF));
+	}
+	else if(PS_ATST == 5) // ge
+	{
+		clip(a - AREF + 0.5f);
+	}
+	else if(PS_ATST == 6) // g
+	{
+		clip(a - AREF - 0.5f);
+	}
+	else if(PS_ATST == 7) // ne
+	{
+		clip(abs(a - AREF) - 0.5f);
+	}
+}
+
+// Blends the rgb of c towards FogColor when PS_FOG is set: f = 1 keeps the color,
+// f = 0 yields FogColor. Alpha is passed through unchanged.
+float4 fog(float4 c, float f)
+{
+	if(!PS_FOG)
+	{
+		return c;
+	}
+
+	return float4(lerp(FogColor, c.rgb, f), c.a);
+}
+
+// Shared pixel pipeline for both shader models: destination alpha test, texture
+// sampling, texture function, alpha test, fog and the color clip / CLR1 adjustments.
+// Returns the resulting color; the pixel may be discarded by datst() or atst().
+float4 ps_color(PS_INPUT input)
+{
+	datst(input);
+
+	// t.xy: texture coordinate, t.w: q
+	float4 t = sample(input.t.xy, input.t.w);
+
+	float4 c = tfx(t, input.c);
+
+	atst(c);
+
+	// t.z: fog factor
+	c = fog(c, input.t.z);
+
+	if (PS_COLCLIP == 2)
+	{
+		c.rgb = 256./255. - c.rgb;
+	}
+	if (PS_COLCLIP > 0)
+	{
+		// zero every channel that is not below 128/255
+		c.rgb *= c.rgb < 128./255;
+	}
+
+	if(PS_CLR1) // needed for Cd * (As/Ad/F + 1) blending modes
+	{
+		c.rgb = 1; 
+	}
+
+	return c;
+}
+
+#if SHADER_MODEL >= 0x400
+
+// Vertex shader (shader model 4+).
+// Masks z to the depth format selected by VS_BPPZ, converts the integer position with
+// VertexScale/VertexOffset and forwards texture coordinate, q, color and fog factor.
+VS_OUTPUT vs_main(VS_INPUT input)
+{
+	if(VS_BPPZ == 1) // 24
+	{
+		input.z = input.z & 0xffffff; 
+	}
+	else if(VS_BPPZ == 2) // 16
+	{
+		input.z = input.z & 0xffff;
+	}
+
+	VS_OUTPUT output;
+	
+	// pos -= 0.05 (1/320 pixel) helps avoiding rounding problems (integral part of pos is usually 5 digits, 0.05 is about as low as we can go)
+	// example: ceil(afterseveralvertextransformations(y = 133)) => 134 => line 133 stays empty
+	// input granularity is 1/16 pixel, anything smaller than that won't step drawing up/left by one pixel
+	// example: 133.0625 (133 + 1/16) should start from line 134, ceil(133.0625 - 0.05) still above 133
+	
+	float4 p = float4(input.p, input.z, 0) - float4(0.05f, 0.05f, 0, 0); 
+
+	output.p = p * VertexScale - VertexOffset;
+#if VS_RTCOPY
+	output.tp = (p * VertexScale - VertexOffset) * float4(0.5, -0.5, 0, 0) + 0.5;
+#endif
+
+	if(VS_TME)
+	{
+		if(VS_FST)
+		{
+			// integer uv input, no perspective division needed (q = 1)
+			output.t.xy = input.uv * TextureScale;
+			output.t.w = 1.0f;
+		}
+		else
+		{
+			output.t.xy = input.st;
+			output.t.w = input.q;
+		}
+	}
+	else
+	{
+		output.t.xy = 0;
+		output.t.w = 1.0f;
+	}
+
+	output.c = input.c;
+	// fog factor travels in t.z
+	output.t.z = input.f.r;
+
+	return output;
+}
+
+#if GS_PRIM == 0
+
+// Geometry shader for GS_PRIM == 0: passes the point through unchanged.
+[maxvertexcount(1)]
+void gs_main(point VS_OUTPUT input[1], inout PointStream<VS_OUTPUT> stream)
+{
+	stream.Append(input[0]);
+}
+
+#elif GS_PRIM == 1
+
+// Geometry shader for GS_PRIM == 1: passes the line through; with GS_IIP == 0 both
+// vertices get the color of the second vertex.
+[maxvertexcount(2)]
+void gs_main(line VS_OUTPUT input[2], inout LineStream<VS_OUTPUT> stream)
+{
+	#if GS_IIP == 0
+	input[0].c = input[1].c;
+	#endif
+
+	stream.Append(input[0]);
+	stream.Append(input[1]);
+}
+
+#elif GS_PRIM == 2
+
+// Geometry shader for GS_PRIM == 2: passes the triangle through; with GS_IIP == 0 all
+// vertices get the color of the third vertex.
+[maxvertexcount(3)]
+void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream<VS_OUTPUT> stream)
+{
+	#if GS_IIP == 0
+	input[0].c = input[2].c;
+	input[1].c = input[2].c;
+	#endif
+
+	stream.Append(input[0]);
+	stream.Append(input[1]);
+	stream.Append(input[2]);
+}
+
+#elif GS_PRIM == 3
+
+// Geometry shader for GS_PRIM == 3: expands a two-vertex line into an axis-aligned
+// quad, emitted as a 4-vertex triangle strip. Depth and t.zw are taken from the
+// second vertex for all corners, and so is the color when GS_IIP == 0.
+[maxvertexcount(4)]
+void gs_main(line VS_OUTPUT input[2], inout TriangleStream<VS_OUTPUT> stream)
+{
+	input[0].p.z = input[1].p.z;
+	input[0].t.zw = input[1].t.zw;
+
+	#if GS_IIP == 0
+	input[0].c = input[1].c;
+	#endif
+
+	// corner with x from the first vertex and y from the second
+	VS_OUTPUT lb = input[1];
+
+	lb.p.x = input[0].p.x;
+	lb.t.x = input[0].t.x;
+
+	// corner with x from the second vertex and y from the first
+	VS_OUTPUT rt = input[1];
+
+	rt.p.y = input[0].p.y;
+	rt.t.y = input[0].t.y;
+
+	stream.Append(input[0]);
+	stream.Append(lb);
+	stream.Append(rt);
+	stream.Append(input[1]);
+}
+
+#endif
+
+// Pixel shader entry point (shader model 4+).
+// Runs ps_color(), optionally rearranges channels for PS_SHUFFLE, then writes
+// alpha * 2 to c1 (used for alpha blending) and the final color to c0.
+// PS_AOUT / PS_FBA adjust the alpha stored in c0 only; c1 is assigned before them.
+PS_OUTPUT ps_main(PS_INPUT input)
+{
+	float4 c = ps_color(input);
+
+	PS_OUTPUT output;
+
+	if (PS_SHUFFLE){
+		// integer copies of the color and of TA, taken before c is modified below
+		uint4 denorm_c = uint4(c * 255.0f + 0.5f);
+		uint2 denorm_TA = uint2(float2(TA.xy) * 255.0f + 0.5f);
+
+		// Mask will take care of the correct destination
+		if (PS_READ_BA){
+			c.rb = c.bb;
+		}
+		else {
+			c.rb = c.rr;
+		}
+		c.g = c.a;
+		// new alpha: low 7 bits of the source channel, top bit from TA.y or TA.x
+		// depending on the top bit of that channel
+		if (PS_READ_BA){
+			if (denorm_c.a & 0x80)
+				c.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)) / 255.0f;
+			else
+				c.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)) / 255.0f;
+
+			//c.g = c.a;
+		}
+		else {
+			if (denorm_c.g & 0x80)
+				c.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)) / 255.0f;
+			else
+				c.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)) / 255.0f;
+
+			//c.g = c.a;
+		}
+		//Probably not right :/
+		//c.g = c.b;
+	}
+
+	output.c1 = c.a * 2; // used for alpha blending
+
+	if(PS_AOUT) // 16 bit output
+	{
+		float a = 128.0f / 255; // alpha output will be 0x80
+		
+		c.a = PS_FBA ? a : step(0.5, c.a) * a;
+	}
+	else if(PS_FBA)
+	{
+		if(c.a < 0.5) c.a += 0.5;
+	}
+
+	output.c0 = c;
+
+	return output;
+}
+
+#elif SHADER_MODEL <= 0x300
+
+// Vertex shader (shader model 3 and below).
+// Same steps as the shader model 4 version, but on float inputs: z is reduced with
+// fmod instead of a bit mask, and with VS_LOGZ the output depth is log2(1 + z) / 32.
+VS_OUTPUT vs_main(VS_INPUT input)
+{
+	if(VS_BPPZ == 1) // 24
+	{
+		input.p.z = fmod(input.p.z, 0x1000000); 
+	}
+	else if(VS_BPPZ == 2) // 16
+	{
+		input.p.z = fmod(input.p.z, 0x10000);
+	}
+
+	VS_OUTPUT output;
+	
+	// pos -= 0.05 (1/320 pixel) helps avoiding rounding problems (integral part of pos is usually 5 digits, 0.05 is about as low as we can go)
+	// example: ceil(afterseveralvertextransformations(y = 133)) => 134 => line 133 stays empty
+	// input granularity is 1/16 pixel, anything smaller than that won't step drawing up/left by one pixel
+	// example: 133.0625 (133 + 1/16) should start from line 134, ceil(133.0625 - 0.05) still above 133
+
+	float4 p = input.p - float4(0.05f, 0.05f, 0, 0);
+
+	output.p = p * VertexScale - VertexOffset;
+#if VS_RTCOPY
+	output.tp = (p * VertexScale - VertexOffset) * float4(0.5, -0.5, 0, 0) + 0.5;
+#endif
+
+	if(VS_LOGZ)
+	{
+		// replaces the linear depth computed above
+		output.p.z = log2(1.0f + input.p.z) / 32;
+	}
+	
+	if(VS_TME)
+	{
+		if(VS_FST)
+		{
+            output.t.xy = input.t * TextureScale;
+			output.t.w = 1.0f;
+		}
+		else
+		{
+			// q is carried in p.w
+			output.t.xy = input.t;
+			output.t.w = input.p.w;
+		}
+	}
+	else
+	{
+		output.t.xy = 0;
+		output.t.w = 1.0f;
+	}
+
+	output.c = input.c;
+	// fog factor travels in t.z; this version reads it from f.b
+	output.t.z = input.f.b;
+	
+	return output;
+}
+
+// Pixel shader entry point (shader model 3 and below): runs ps_color() and
+// returns its color with alpha doubled.
+float4 ps_main(PS_INPUT input) : COLOR
+{
+	float4 shaded = ps_color(input);
+
+	return float4(shaded.rgb, shaded.a * 2);
+}
+
+#endif
+#endif
diff --git a/plugins/GSdx_legacy/resource.h b/plugins/GSdx_legacy/resource.h
new file mode 100644
index 0000000000..19e54c8d19
--- /dev/null
+++ b/plugins/GSdx_legacy/resource.h
@@ -0,0 +1,115 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by GSdx.rc
+//
+
+#define IDC_PALTEX                      2001
+#define IDC_LOGZ                        2002
+#define IDC_CODECS                      2003
+#define IDC_RESOLUTION                  2004
+#define IDC_RESX_EDIT                   2005
+#define IDC_RESY_EDIT                   2006
+#define IDC_AA1                         2007
+#define IDC_SWTHREADS_TEXT              2008
+#define IDC_SWTHREADS                   2009
+#define IDC_SWTHREADS_EDIT              2010
+#define IDC_FILTER_TEXT                 2011
+#define IDC_FILTER                      2012
+#define IDC_DITHERING                   2013
+#define IDC_RESX                        2014
+#define IDC_RESY                        2015
+#define IDD_CONFIG                      2016
+#define IDB_LOGO9                       2017
+#define IDB_LOGO10                      2018
+#define IDB_LOGOGL                      2019
+#define IDC_FBA                         2020
+#define IDC_LOGO9                       2021
+#define IDC_LOGO11                      2022
+#define IDC_LOGOGL                      2023
+#define IDD_CAPTURE                     2024
+#define IDD_GPUCONFIG                   2025
+#define IDC_RENDERER                    2026
+#define IDC_INTERLACE                   2027
+#define IDC_ASPECTRATIO                 2028
+#define IDC_ALPHAHACK                   2029
+#define IDC_SCALE                       2030
+#define IDC_UPSCALE_MULTIPLIER          2031
+#define IDC_BROWSE                      2032
+#define IDC_OFFSETHACK                  2033
+#define IDC_FILENAME                    2034
+#define IDC_SKIPDRAWHACK                2035
+#define IDC_WIDTH                       2036
+#define IDC_HEIGHT                      2037
+#define IDC_CONFIGURE                   2038
+#define IDC_ACCURATE_BLEND_UNIT_TEXT    2039
+#define IDC_WINDOWED                    2040
+#define IDC_SKIPDRAWHACKEDIT            2041
+#define IDC_SPRITEHACK                  2042
+#define IDC_SATURATION_SLIDER           2043
+#define IDC_BRIGHTNESS_SLIDER           2044
+#define IDC_CONTRAST_SLIDER             2045
+#define IDC_SHADEBUTTON                 2046
+#define IDC_SHADEBOOST                  2047
+#define IDC_HACKS_ENABLED               2048
+#define IDC_SATURATION_TEXT             2049
+#define IDC_BRIGHTNESS_TEXT             2050
+#define IDC_CONTRAST_TEXT               2051
+#define IDC_MSAACB                      2052
+#define IDC_MSAA_TEXT                   2053
+#define IDC_HACKSBUTTON                 2054
+#define IDC_WILDHACK                    2055
+#define IDC_CHECK_DISABLE_ALL_HACKS     2056
+#define IDC_ALPHASTENCIL                2057
+#define IDC_ADAPTER                     2058
+#define IDC_TCOFFSETX                   2059
+#define IDC_TCOFFSETX2                  2060
+#define IDC_TCOFFSETY                   2061
+#define IDC_TCOFFSETY2                  2062
+#define IDC_FXAA                        2063
+#define IDC_SHADER_FX                   2064
+#define IDC_AFCOMBO_TEXT                2065
+#define IDC_AFCOMBO                     2066
+#define IDC_OPENCL_DEVICE               2067
+#define IDC_OPENCL_TEXT                 2068
+#define IDC_ACCURATE_BLEND_UNIT         2069
+#define IDC_ACCURATE_DATE               2070
+#define IDC_ROUND_SPRITE                2071
+#define IDC_ALIGN_SPRITE                2072
+#define IDC_CRC_LEVEL                   2073
+#define IDC_CRC_LEVEL_TEXT              2074
+#define IDC_TC_DEPTH                    2075
+#define IDC_COLORSPACE                  2076
+#define IDC_SHADER_FX_EDIT              2077
+#define IDC_SHADER_FX_CONF_EDIT         2078
+#define IDC_SHADER_FX_BUTTON            2079
+#define IDC_SHADER_FX_CONF_BUTTON       2080
+#define IDC_SHADER_FX_TEXT              2081
+#define IDC_SHADER_FX_CONF_TEXT         2082
+#define IDC_CUSTOM_TEXT                 2083
+#define IDC_UPSCALE_MULTIPLIER_TEXT     2084
+#define IDC_MIPMAP                      2085
+#define IDC_PRELOAD_GS                  2086
+#define IDC_TVSHADER                    2087
+#define IDC_SAFE_FBMASK                 2088
+#define IDR_CONVERT_FX                  10000
+#define IDR_TFX_FX                      10001
+#define IDR_MERGE_FX                    10002
+#define IDR_INTERLACE_FX                10003
+#define IDR_FXAA_FX                     10004
+#define IDR_CS_FX                       10005
+#define IDD_SHADER                      10006
+#define IDR_SHADEBOOST_FX               10007
+#define IDR_TFX_CL                      10008
+#define IDD_HACKS                       10009
+#define IDC_STATIC                      -1
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        10013
+#define _APS_NEXT_COMMAND_VALUE         32771
+#define _APS_NEXT_CONTROL_VALUE         2091
+#define _APS_NEXT_SYMED_VALUE           5000
+#endif
+#endif
diff --git a/plugins/GSdx_legacy/stdafx.cpp b/plugins/GSdx_legacy/stdafx.cpp
new file mode 100644
index 0000000000..b36ceb77cb
--- /dev/null
+++ b/plugins/GSdx_legacy/stdafx.cpp
@@ -0,0 +1,121 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+// stdafx.cpp : source file that includes just the standard includes
+// GSdx.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
+
+string format(const char* fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+
+	int result = -1, length = 256;
+
+	char* buffer = NULL;
+
+	while(result == -1)
+	{
+		if(buffer) delete [] buffer;
+
+		buffer = new char[length + 1];
+
+		memset(buffer, 0, length + 1);
+
+		result = vsnprintf(buffer, length, fmt, args);
+
+		length *= 2;
+	}
+
+	va_end(args);
+
+	string s(buffer);
+
+	delete [] buffer;
+
+	return s;
+}
+
+#ifdef _WIN32
+
+void* vmalloc(size_t size, bool code) // reserves and commits 'size' bytes in a single VirtualAlloc call
+{
+	return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, code ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE); // executable protection only when 'code' is set
+}
+
+void vmfree(void* ptr, size_t size) // 'size' is not used by this variant
+{
+	VirtualFree(ptr, 0, MEM_RELEASE); // MEM_RELEASE is passed together with a size of 0
+}
+
+#else
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+// POSIX: map 'size' bytes (rounded up to whole pages) of private anonymous memory,
+// executable when 'code' is set. Returns NULL on failure, like the VirtualAlloc variant.
+void* vmalloc(size_t size, bool code)
+{
+	size_t mask = getpagesize() - 1;
+
+	size = (size + mask) & ~mask;
+
+	int prot = PROT_READ | PROT_WRITE | (code ? PROT_EXEC : 0);
+
+	void* ptr = mmap(NULL, size, prot, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	// mmap reports failure as MAP_FAILED, which is not a null pointer
+	return ptr != MAP_FAILED ? ptr : NULL;
+}
+
+void vmfree(void* ptr, size_t size) // unmaps 'size' bytes at ptr, rounded up to whole pages
+{
+	const size_t page = getpagesize();
+
+	// round up to the next multiple of the page size before unmapping
+	size_t bytes = (size + page - 1) & ~(page - 1);
+	munmap(ptr, bytes);
+}
+
+#endif
+
+#if !defined(_MSC_VER)
+
+// declare Linux equivalents (alignment must be a power of 2: 1, 2, 4 ... 2^15)
+
+#if !defined(__USE_ISOC11) || defined(ASAN_WORKAROUND)
+
+void* _aligned_malloc(size_t size, size_t alignment) // allocation aligned to 'alignment', or NULL
+{
+	void* ret = NULL;
+	if(alignment < sizeof(void*)) alignment = sizeof(void*); // posix_memalign rejects smaller alignments
+	return posix_memalign(&ret, alignment, size) == 0 ? ret : NULL; // non-zero result means failure
+}
+
+#endif
+
+#endif
diff --git a/plugins/GSdx_legacy/stdafx.h b/plugins/GSdx_legacy/stdafx.h
new file mode 100644
index 0000000000..e1ffe1ed74
--- /dev/null
+++ b/plugins/GSdx_legacy/stdafx.h
@@ -0,0 +1,456 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+
+#pragma once
+
+#include "config.h"
+
+#ifdef _WIN32
+
+#include "targetver.h"
+
+#define WIN32_LEAN_AND_MEAN             // Exclude rarely-used stuff from Windows headers
+
+#include <windows.h>
+#include <commctrl.h>
+#include <commdlg.h>
+#include <shellapi.h>
+#include <d3dcompiler.h>
+#include <d3d11.h>
+#include <d3d9.h>
+#include <comutil.h>
+#include <atlcomcli.h>
+
+#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
+
+#endif
+
+
+#ifdef ENABLE_OPENCL
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+
+#endif
+
+// put these into vc9/common7/ide/usertype.dat to have them highlighted
+
+typedef unsigned char uint8;
+typedef signed char int8;
+typedef unsigned short uint16;
+typedef signed short int16;
+typedef unsigned int uint32;
+typedef signed int int32;
+typedef unsigned long long uint64;
+typedef signed long long int64;
+#if defined(__x86_64__) || defined(_M_AMD64) || defined(_WIN64) // 64-bit target, gcc/clang or MSVC spelling
+typedef uint64 uptr; // unsigned integer as wide as a pointer
+#else
+typedef uint32 uptr;
+#endif
+
+
+// xbyak compatibilities
+typedef int64 sint64;
+#define MIE_INTEGER_TYPE_DEFINED
+
+// stdc
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <time.h>
+#include <limits.h>
+
+#include <complex>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <list>
+#include <map>
+#include <set>
+#include <queue>
+#include <algorithm>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+using namespace std;
+
+#include <memory>
+
+#include <zlib.h>
+
+#if _MSC_VER >= 1800 || !defined(_WIN32)
+#include <unordered_map>
+#include <unordered_set>
+#define hash_map unordered_map
+#define hash_set unordered_set
+#else
+#include <hash_map>
+#include <hash_set>
+using namespace stdext;
+#endif
+
+#ifdef _WIN32
+
+	// Note: use GL/glcorearb.h in the future
+	#include <GL/gl.h>
+	#include <GL/glext.h>
+	#include <GL/wglext.h>
+	#include "GLLoader.h"
+
+	// hashing algorithms at: http://www.cris.com/~Ttwang/tech/inthash.htm
+	// the default hash_compare does ldiv and other crazy stuff that slows it down
+
+	template<> class hash_compare<uint32> // hash + ordering traits specialised for 32-bit keys
+	{
+	public:
+		enum {bucket_size = 1};
+
+		size_t operator()(uint32 key) const // hash: scrambles the key with a fixed shift/add/xor sequence
+		{
+			key += ~(key << 15);
+			key ^= (key >> 10);
+			key += (key << 3);
+			key ^= (key >> 6);
+			key += ~(key << 11);
+			key ^= (key >> 16);
+
+			return (size_t)key;
+		}
+
+		bool operator()(uint32 a, uint32 b) const // ordering: plain less-than on the key values
+		{
+			return a < b;
+		}
+	};
+
+	template<> class hash_compare<uint64> // hash + ordering traits specialised for 64-bit keys
+	{
+	public:
+		enum {bucket_size = 1};
+
+		size_t operator()(uint64 key) const // hash: scrambles the key with a fixed shift/add/xor sequence
+		{
+			key += ~(key << 32);
+			key ^= (key >> 22);
+			key += ~(key << 13);
+			key ^= (key >> 8);
+			key += (key << 3);
+			key ^= (key >> 15);
+			key += ~(key << 27);
+			key ^= (key >> 31);
+
+			return (size_t)key; // the cast narrows the result wherever size_t is smaller than 64 bits
+		}
+
+		bool operator()(uint64 a, uint64 b) const // ordering: plain less-than on the key values
+		{
+			return a < b;
+		}
+	};
+
+	#define vsnprintf _vsnprintf
+	#define snprintf _snprintf
+
+	#define DIRECTORY_SEPARATOR '\\'
+
+#else
+
+	// Note: use GL/glcorearb.h in the future
+	#include <GL/gl.h>
+	#include <GL/glext.h>
+	#include "GLLoader.h"
+
+	#include <sys/stat.h> // mkdir
+
+	#define DIRECTORY_SEPARATOR '/'
+
+#endif
+
+#ifdef _MSC_VER
+
+    #define __aligned(t, n) __declspec(align(n)) t
+
+    #define EXPORT_C_(type) extern "C" __declspec(dllexport) type __stdcall
+    #define EXPORT_C EXPORT_C_(void)
+
+    #define ALIGN_STACK(n) __aligned(int, n) __dummy;
+
+#else
+
+    #define __aligned(t, n) t __attribute__((aligned(n)))
+    #define __fastcall __attribute__((fastcall))
+
+    #define EXPORT_C_(type) extern "C" __attribute__((stdcall,externally_visible,visibility("default"))) type
+    #define EXPORT_C EXPORT_C_(void)
+
+    #ifdef __GNUC__
+
+        #include "assert.h"
+        #define __forceinline __inline__ __attribute__((always_inline,unused))
+        // #define __forceinline __inline__ __attribute__((__always_inline__,__gnu_inline__))
+        #define __assume(c) do { if (!(c)) __builtin_unreachable(); } while(0)
+
+        // GCC removes the variable as dead code and generates some warnings.
+        // Stack is automatically realigned due to SSE/AVX operations
+        #define ALIGN_STACK(n) (void)0;
+
+    #else
+
+        // TODO Check clang behavior
+        #define ALIGN_STACK(n) __aligned(int, n) __dummy;
+
+    #endif
+
+
+#endif
+
+extern string format(const char* fmt, ...);
+
+struct delete_object {template<class T> void operator()(T& p) {delete p;}}; // functor: deletes the element itself
+struct delete_first {template<class T> void operator()(T& p) {delete p.first;}}; // functor: deletes the element's .first member
+struct delete_second {template<class T> void operator()(T& p) {delete p.second;}}; // functor: deletes the element's .second member
+struct aligned_free_object {template<class T> void operator()(T& p) {_aligned_free(p);}}; // functor: _aligned_free() on the element itself
+struct aligned_free_first {template<class T> void operator()(T& p) {_aligned_free(p.first);}}; // functor: _aligned_free() on the element's .first member
+struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_free(p.second);}}; // functor: _aligned_free() on the element's .second member
+
+#define countof(a) (sizeof(a) / sizeof(a[0]))
+
+#ifndef RESTRICT
+
+    #ifdef __INTEL_COMPILER
+
+        #define RESTRICT restrict
+
+    #elif defined(_MSC_VER)
+
+        #define RESTRICT __restrict
+
+    #elif defined(__GNUC__)
+
+        #define RESTRICT __restrict__
+
+    #else
+
+        #define RESTRICT
+
+    #endif
+
+#endif
+
+#if defined(_DEBUG) //&& defined(_MSC_VER)
+
+	#include <assert.h>
+	#define ASSERT assert
+
+#else
+
+	#define ASSERT(exp) ((void)0)
+
+#endif
+
+#ifdef __x86_64__
+
+	#define _M_AMD64
+
+#endif
+
+// sse
+#if defined(__GNUC__) && !defined(__x86_64__)
+// Convert the gcc SSE defines into the GSdx (Windows) _M_SSE define
+#if defined(__AVX2__)
+	#define _M_SSE 0x501
+#elif defined(__AVX__)
+	#define _M_SSE 0x500
+#elif defined(__SSE4_2__)
+	#define _M_SSE 0x402
+#elif defined(__SSE4_1__)
+	#define _M_SSE 0x401
+#elif defined(__SSSE3__)
+	#define _M_SSE 0x301
+#elif defined(__SSE2__)
+	#define _M_SSE 0x200
+#elif defined(__SSE__)
+	#define _M_SSE 0x100
+#endif
+
+#endif
+
+#if !defined(_M_SSE) && (!defined(_WIN32) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+
+	#define _M_SSE 0x200
+
+#endif
+
+#if _M_SSE >= 0x200
+
+	#include <xmmintrin.h>
+	#include <emmintrin.h>
+
+	#ifndef _MM_DENORMALS_ARE_ZERO
+	#define _MM_DENORMALS_ARE_ZERO 0x0040
+	#endif
+
+	#define MXCSR (_MM_DENORMALS_ARE_ZERO | _MM_MASK_MASK | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON)
+
+	#define _MM_TRANSPOSE4_SI128(row0, row1, row2, row3) \
+	{ \
+		__m128 tmp0 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0x44); \
+		__m128 tmp2 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0xEE); \
+		__m128 tmp1 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0x44); \
+		__m128 tmp3 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0xEE); \
+		(row0) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0x88)); \
+		(row1) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0xDD)); \
+		(row2) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0x88)); \
+		(row3) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); \
+	}
+
+#else
+
+#error TODO: GSVector4 and GSRasterizer needs SSE2
+
+#endif
+
+#if _M_SSE >= 0x301
+
+	#include <tmmintrin.h>
+
+#endif
+
+#if _M_SSE >= 0x401
+
+	#include <smmintrin.h>
+
+#endif
+
+#if _M_SSE >= 0x500
+
+	#include <immintrin.h>
+
+#endif
+
+#undef min
+#undef max
+#undef abs
+
+#if !defined(_MSC_VER)
+	#if defined(__USE_ISOC11) && !defined(ASAN_WORKAROUND) // not supported yet on gcc 4.9
+
+	#define _aligned_malloc(size, a) aligned_alloc(a, size)
+
+	#else
+
+	extern void* _aligned_malloc(size_t size, size_t alignment);
+
+	#endif
+
+	static inline void _aligned_free(void* p) { // non-MSVC stand-in: simply forwards to free()
+		free(p);
+	}
+
+	// http://svn.reactos.org/svn/reactos/trunk/reactos/include/crt/mingw32/intrin_x86.h?view=markup
+
+	__forceinline unsigned char _BitScanForward(unsigned long* const Index, const unsigned long Mask)
+	{
+		if(Mask == 0) return 0; // no bit set: *Index is left untouched (__builtin_ctzl is undefined for 0)
+		*Index = __builtin_ctzl(Mask); // lowest set bit over the full width of unsigned long, not just the low 32 bits
+		return 1;
+	}
+
+	#ifdef __GNUC__
+
+	// gcc 4.8 defines __rdtsc, but unfortunately the compiler crashes on it...
+	// The redefinition below lets us bypass gcc's own __rdtsc -- Gregory
+	#define __rdtsc _lnx_rdtsc
+	//__forceinline unsigned long long __rdtsc()
+	__forceinline unsigned long long _lnx_rdtsc() // returns the 64-bit value produced by the rdtsc instruction
+	{
+		#if defined(__amd64__) || defined(__x86_64__)
+		unsigned long long low, high;
+		__asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high)); // "=a"/"=d": the two halves arrive in separate registers
+		return low | (high << 32); // stitch the halves back into one 64-bit value
+		#else
+		unsigned long long retval;
+		__asm__ __volatile__("rdtsc" : "=A"(retval)); // "=A": the edx:eax register pair read as one 64-bit value
+		return retval;
+		#endif
+	}
+
+	#endif
+
+#endif
+
+extern void* vmalloc(size_t size, bool code);
+extern void vmfree(void* ptr, size_t size);
+
+#ifdef _WIN32
+
+	#ifdef ENABLE_VTUNE
+
+	#include <JITProfiling.h>
+
+	#pragma comment(lib, "jitprofiling.lib")
+
+	#endif
+
+#endif
+
+#define GL_INSERT(type, code, sev, ...) \
+	do if (glDebugMessageInsert) glDebugMessageInsert(GL_DEBUG_SOURCE_APPLICATION, type, code, sev, -1, format(__VA_ARGS__).c_str()); while(0);
+
+// Every sane driver except Apple's supports this extension
+#if defined(_DEBUG)
+#define GL_CACHE(...) GL_INSERT(GL_DEBUG_TYPE_OTHER, 0xFEAD, GL_DEBUG_SEVERITY_NOTIFICATION, __VA_ARGS__)
+#else
+#define GL_CACHE(...) (0);
+#endif
+
+#if defined(ENABLE_OGL_DEBUG)
+#define GL_PUSH(...)	do if (glPushDebugGroup) glPushDebugGroup(GL_DEBUG_SOURCE_APPLICATION, 0xBAD, -1, format(__VA_ARGS__).c_str()); while(0);
+#define GL_POP()        do if (glPopDebugGroup) glPopDebugGroup(); while(0);
+#define GL_INS(...)		GL_INSERT(GL_DEBUG_TYPE_ERROR, 0xDEAD, GL_DEBUG_SEVERITY_MEDIUM, __VA_ARGS__)
+#define GL_PERF(...)	GL_INSERT(GL_DEBUG_TYPE_PERFORMANCE, 0xFEE1, GL_DEBUG_SEVERITY_NOTIFICATION, __VA_ARGS__)
+#else
+#define GL_PUSH(...) (0);
+#define GL_POP()     (0);
+#define GL_INS(...)  (0);
+#define GL_PERF(...) (0);
+#endif
+
+// Helper path to dump texture
+#ifdef _WIN32
+const std::string root_sw("c:\\temp1\\_");
+const std::string root_hw("c:\\temp2\\_");
+#else
+const std::string root_sw("/tmp/GS_SW_dump/");
+const std::string root_hw("/tmp/GS_HW_dump/");
+#endif
diff --git a/plugins/GSdx_legacy/targetver.h b/plugins/GSdx_legacy/targetver.h
new file mode 100644
index 0000000000..ba3f81934a
--- /dev/null
+++ b/plugins/GSdx_legacy/targetver.h
@@ -0,0 +1,24 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#define _WIN32_WINNT 0x0600
diff --git a/plugins/GSdx_legacy/vsprops/ProjectRootDir.props b/plugins/GSdx_legacy/vsprops/ProjectRootDir.props
new file mode 100644
index 0000000000..b78b467682
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/ProjectRootDir.props
@@ -0,0 +1,26 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <ProjectRootDir>$(ProjectDir).</ProjectRootDir>
+    <SvnRootDir>$(ProjectRootDir)\..\..</SvnRootDir>
+    <SvnCommonDir>$(SvnRootDir)\common</SvnCommonDir>
+    <PcsxSubsection>plugins</PcsxSubsection>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <BuildMacro Include="ProjectRootDir">
+      <Value>$(ProjectRootDir)</Value>
+    </BuildMacro>
+    <BuildMacro Include="SvnRootDir">
+      <Value>$(SvnRootDir)</Value>
+    </BuildMacro>
+    <BuildMacro Include="SvnCommonDir">
+      <Value>$(SvnCommonDir)</Value>
+    </BuildMacro>
+    <BuildMacro Include="PcsxSubsection">
+      <Value>$(PcsxSubsection)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/avx2.props b/plugins/GSdx_legacy/vsprops/avx2.props
new file mode 100644
index 0000000000..928fc5b3ed
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/avx2.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>AVX2</SSEtype>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x501;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype">
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/avx_vs10.props b/plugins/GSdx_legacy/vsprops/avx_vs10.props
new file mode 100644
index 0000000000..876419d57e
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/avx_vs10.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>AVX</SSEtype>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype">
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/avx_vs2012.props b/plugins/GSdx_legacy/vsprops/avx_vs2012.props
new file mode 100644
index 0000000000..42d81e1764
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/avx_vs2012.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>AVX</SSEtype>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype">
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/avx_vs2013.props b/plugins/GSdx_legacy/vsprops/avx_vs2013.props
new file mode 100644
index 0000000000..42d81e1764
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/avx_vs2013.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>AVX</SSEtype>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x500;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype">
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/common.props b/plugins/GSdx_legacy/vsprops/common.props
new file mode 100644
index 0000000000..d5b4b2c6f7
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/common.props
@@ -0,0 +1,31 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+    <OutDir>$(SolutionDir)bin\$(PcsxSubsection)\</OutDir>
+    <IntDir>$(PlatformName)\$(Configuration)\</IntDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+      <RuntimeTypeInfo>false</RuntimeTypeInfo>
+      <WarningLevel>Level4</WarningLevel>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <DisableSpecificWarnings>4456;4458;4996;4995;4324;4100;4101;4201;4556;4127;4512;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <AdditionalIncludeDirectories>$(VTUNE_AMPLIFIER_XE_2015_DIR)include;$(SolutionDir)3rdparty;$(SolutionDir)3rdparty\libpng;$(SolutionDir)3rdparty\opencl;$(SolutionDir)3rdparty\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <Link>
+      <AdditionalDependencies>d3d11.lib;d3d10_1.lib;d3d9.lib;dxgi.lib;dxguid.lib;winmm.lib;strmiids.lib;opengl32.lib;opencl.lib;comsuppw.lib;comctl32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <DelayLoadDLLs>d3d9.dll;d3d11.dll;dxgi.dll;opengl32.dll;%(DelayLoadDLLs)</DelayLoadDLLs>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Windows</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <AdditionalLibraryDirectories>$(VTUNE_AMPLIFIER_XE_2015_DIR)lib32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+    <PreBuildEvent>
+      <Command>"$(SvnCommonDir)\vsprops\preBuild.cmd" "$(ProjectRootDir)"</Command>
+    </PreBuildEvent>
+  </ItemDefinitionGroup>
+</Project>
diff --git a/plugins/GSdx_legacy/vsprops/debug.props b/plugins/GSdx_legacy/vsprops/debug.props
new file mode 100644
index 0000000000..197ceaf5ce
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/debug.props
@@ -0,0 +1,16 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+    <TargetName>$(ProjectName)$(PlatformArchitecture)-$(SSEtype)-dbg</TargetName>
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>StackFrameRuntimeCheck</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+    </ClCompile>
+  </ItemDefinitionGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/release.props b/plugins/GSdx_legacy/vsprops/release.props
new file mode 100644
index 0000000000..595be69332
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/release.props
@@ -0,0 +1,26 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>$(ProjectName)$(PlatformArchitecture)-$(SSEtype)</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <PreprocessorDefinitions>NDEBUG;_SECURE_SCL=0;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+    </ClCompile>
+    <Link>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+    </Link>
+  </ItemDefinitionGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/sse2.props b/plugins/GSdx_legacy/vsprops/sse2.props
new file mode 100644
index 0000000000..97cc8ed3a8
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/sse2.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>SSE2</SSEtype>
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x200;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype">
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/sse4.props b/plugins/GSdx_legacy/vsprops/sse4.props
new file mode 100644
index 0000000000..493174b82f
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/sse4.props
@@ -0,0 +1,20 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>SSE4</SSEtype> <!-- user macro naming the SIMD level this sheet selects -->
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x401;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- adds _M_SSE=0x401 in front of the inherited definitions -->
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet> <!-- NOTE(review): compiler option stays at SSE2; only the _M_SSE define marks this sheet as SSE4 - confirm intended -->
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype"> <!-- exposes the SSEtype property above as a build macro -->
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/ssse3.props b/plugins/GSdx_legacy/vsprops/ssse3.props
new file mode 100644
index 0000000000..cb14dd851b
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/ssse3.props
@@ -0,0 +1,21 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Label="UserMacros">
+    <SSEtype>SSSE3</SSEtype> <!-- user macro naming the SIMD level this sheet selects -->
+  </PropertyGroup>
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.30128.1</_ProjectFileVersion>
+    <_PropertySheetDisplayName>sse3</_PropertySheetDisplayName> <!-- NOTE(review): display name reads "sse3" although SSEtype is SSSE3 - confirm whether intentional -->
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <PreprocessorDefinitions>_M_SSE=0x301;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- adds _M_SSE=0x301 in front of the inherited definitions -->
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet> <!-- compiler option stays at SSE2; the _M_SSE define carries the SSSE3 level -->
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <BuildMacro Include="SSEtype"> <!-- exposes the SSEtype property above as a build macro -->
+      <Value>$(SSEtype)</Value>
+    </BuildMacro>
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/plugins/GSdx_legacy/vsprops/x64.props b/plugins/GSdx_legacy/vsprops/x64.props
new file mode 100644
index 0000000000..55af6278c1
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/x64.props
@@ -0,0 +1,8 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemDefinitionGroup>
+    <Link>
+      <AdditionalLibraryDirectories>$(ProjectDir)vtune\x64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> <!-- puts the project's vtune\x64 folder ahead of the inherited library search paths -->
+    </Link>
+  </ItemDefinitionGroup>
+</Project>
diff --git a/plugins/GSdx_legacy/vsprops/x86.props b/plugins/GSdx_legacy/vsprops/x86.props
new file mode 100644
index 0000000000..c6dd1d02de
--- /dev/null
+++ b/plugins/GSdx_legacy/vsprops/x86.props
@@ -0,0 +1,8 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemDefinitionGroup>
+    <Link>
+      <AdditionalLibraryDirectories>$(ProjectDir)vtune\x86;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> <!-- puts the project's vtune\x86 folder ahead of the inherited library search paths -->
+    </Link>
+  </ItemDefinitionGroup>
+</Project>
diff --git a/plugins/GSdx_legacy/xbyak/xbyak.h b/plugins/GSdx_legacy/xbyak/xbyak.h
new file mode 100644
index 0000000000..4f0f85e8bf
--- /dev/null
+++ b/plugins/GSdx_legacy/xbyak/xbyak.h
@@ -0,0 +1,2156 @@
+/* Copyright (c) 2007 MITSUNARI Shigeo
+* All rights reserved.
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+* 
+* Redistributions of source code must retain the above copyright notice, this
+* list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+* Neither the name of the copyright owner nor the names of its contributors may
+* be used to endorse or promote products derived from this software without
+* specific prior written permission.
+* 
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+* THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+#ifndef XBYAK_XBYAK_H_
+#define XBYAK_XBYAK_H_
+/*!
+	@file xbyak.h
+	@brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++
+	@author herumi
+	@url https://github.com/herumi/xbyak, http://homepage1.nifty.com/herumi/soft/xbyak_e.html
+	@note modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#ifndef XBYAK_NO_OP_NAMES
+	#if not +0 // trick to detect whether 'not' is operator or not
+		#error "use -fno-operator-names option if you want to use and(), or(), xor(), not() as function names, Or define XBYAK_NO_OP_NAMES and use and_(), or_(), xor_(), not_()."
+	#endif
+#endif
+
+#include <stdio.h> // for debug print
+#include <assert.h>
+#include <list>
+#include <string>
+#include <algorithm>
+#ifndef NDEBUG
+#include <iostream>
+#endif
+
+//#define XBYAK_USE_MMAP_ALLOCATOR
+#if !defined(__GNUC__) || defined(__MINGW32__)
+	#undef XBYAK_USE_MMAP_ALLOCATOR
+#endif
+
+// This covers -std=(gnu|c)++(0x|11|1y), -stdlib=libc++, and modern Microsoft.
+#if ((defined(_MSC_VER) && (_MSC_VER >= 1600)) || defined(_LIBCPP_VERSION) ||\
+	 			 ((__cplusplus >= 201103) || defined(__GXX_EXPERIMENTAL_CXX0X__)))
+	#include <unordered_map>
+	#define XBYAK_STD_UNORDERED_MAP std::unordered_map
+	#define XBYAK_STD_UNORDERED_MULTIMAP std::unordered_multimap
+
+// Clang/llvm-gcc and ICC-EDG in 'GCC-mode' always claim to be GCC 4.2, using
+// libstdcxx 20070719 (from GCC 4.2.1, the last GPL 2 version).
+// These headers have been expanded/fixed in various forks.
+// In F.S.F. 'real' GCC, issues with the tr headers were resolved in GCC 4.5.
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && ((__GNUC_MINOR__ >= 5) || \
+								 ((__GLIBCXX__ >= 20070719) && (__GNUC_MINOR__ >= 2) && \
+									(defined(__INTEL_COMPILER) || defined(__llvm__))))
+	#include <tr1/unordered_map>
+	#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
+	#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
+
+#elif defined(_MSC_VER) && (_MSC_VER >= 1500) && (_MSC_VER < 1600)
+	#include <unordered_map>
+	#define XBYAK_STD_UNORDERED_MAP std::tr1::unordered_map
+	#define XBYAK_STD_UNORDERED_MULTIMAP std::tr1::unordered_multimap
+
+#else
+	#include <map>
+	#define XBYAK_STD_UNORDERED_MAP std::map
+	#define XBYAK_STD_UNORDERED_MULTIMAP std::multimap
+#endif
+#ifdef _WIN32
+	#include <windows.h>
+	#include <malloc.h>
+#elif defined(__GNUC__)
+	#include <unistd.h>
+	#include <sys/mman.h>
+	#include <stdlib.h>
+#endif
+#if !defined(_MSC_VER) || (_MSC_VER >= 1600)
+	#include <stdint.h>
+#endif
+
+#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__))
+	#define XBYAK64_WIN
+#elif defined(__x86_64__)
+	#define XBYAK64_GCC
+#endif
+#if !defined(XBYAK64) && !defined(XBYAK32)
+	#if defined(XBYAK64_GCC) || defined(XBYAK64_WIN)
+		#define XBYAK64
+	#else
+		#define XBYAK32
+	#endif
+#endif
+
+#if (__cplusplus >= 201103) || (_MSC_VER >= 1800)
+	#define XBYAK_VARIADIC_TEMPLATE
+#endif
+
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4514) /* remove inline function */
+	#pragma warning(disable : 4786) /* identifier is too long */
+	#pragma warning(disable : 4503) /* name is too long */
+	#pragma warning(disable : 4127) /* constant expression */
+#endif
+
+namespace Xbyak {
+
+#include "xbyak_bin2hex.h"
+
+enum {
+	DEFAULT_MAX_CODE_SIZE = 4096, // smallest buffer size CodeArray::growMemory will allocate when it enlarges the buffer
+	VERSION = 0x4840 /* 0xABCD = A.BC(D) */
+};
+
+#ifndef MIE_INTEGER_TYPE_DEFINED
+#define MIE_INTEGER_TYPE_DEFINED
+#ifdef _MSC_VER
+	typedef unsigned __int64 uint64;
+	typedef __int64 sint64;
+#else
+	typedef uint64_t uint64;
+	typedef int64_t sint64;
+#endif
+typedef unsigned int uint32;
+typedef unsigned short uint16;
+typedef unsigned char uint8;
+#endif
+
+#ifndef MIE_ALIGN
+	#ifdef _MSC_VER
+		#define MIE_ALIGN(x) __declspec(align(x))
+	#else
+		#define MIE_ALIGN(x) __attribute__((aligned(x)))
+	#endif
+#endif
+#ifndef MIE_PACK // for shufps
+	#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
+#endif
+
+enum { // error codes carried by Error; Error::what() indexes its message table with these values, so keep both in the same order
+	ERR_NONE = 0,
+	ERR_BAD_ADDRESSING,
+	ERR_CODE_IS_TOO_BIG,
+	ERR_BAD_SCALE,
+	ERR_ESP_CANT_BE_INDEX,
+	ERR_BAD_COMBINATION,
+	ERR_BAD_SIZE_OF_REGISTER,
+	ERR_IMM_IS_TOO_BIG,
+	ERR_BAD_ALIGN,
+	ERR_LABEL_IS_REDEFINED,
+	ERR_LABEL_IS_TOO_FAR,
+	ERR_LABEL_IS_NOT_FOUND,
+	ERR_CODE_ISNOT_COPYABLE,
+	ERR_BAD_PARAMETER,
+	ERR_CANT_PROTECT,
+	ERR_CANT_USE_64BIT_DISP,
+	ERR_OFFSET_IS_TOO_BIG,
+	ERR_MEM_SIZE_IS_NOT_SPECIFIED,
+	ERR_BAD_MEM_SIZE,
+	ERR_BAD_ST_COMBINATION,
+	ERR_OVER_LOCAL_LABEL, // not used
+	ERR_UNDER_LOCAL_LABEL,
+	ERR_CANT_ALLOC,
+	ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW,
+	ERR_BAD_PROTECT_MODE,
+	ERR_BAD_PNUM,
+	ERR_BAD_TNUM,
+	ERR_BAD_VSIB_ADDRESSING,
+	ERR_CANT_CONVERT,
+	ERR_LABEL_ISNOT_SET_BY_L,
+	ERR_LABEL_IS_ALREADY_SET_BY_L,
+	ERR_BAD_LABEL_STR,
+	ERR_MUNMAP,
+	ERR_INTERNAL // must stay last: Error's constructor rejects any value above it
+};
+
+class Error : public std::exception {
+	int err_;
+public:
+	explicit Error(int err) : err_(err)
+	{
+		if (err_ < 0 || err_ > ERR_INTERNAL) {
+			fprintf(stderr, "bad err=%d in Xbyak::Error\n", err_);
+			exit(1);
+		}
+	}
+	operator int() const { return err_; }
+	const char *what() const throw()
+	{
+		static const char *errTbl[] = {
+			"none",
+			"bad addressing",
+			"code is too big",
+			"bad scale",
+			"esp can't be index",
+			"bad combination",
+			"bad size of register",
+			"imm is too big",
+			"bad align",
+			"label is redefined",
+			"label is too far",
+			"label is not found",
+			"code is not copyable",
+			"bad parameter",
+			"can't protect",
+			"can't use 64bit disp(use (void*))",
+			"offset is too big",
+			"MEM size is not specified",
+			"bad mem size",
+			"bad st combination",
+			"over local label",
+			"under local label",
+			"can't alloc",
+			"T_SHORT is not supported in AutoGrow",
+			"bad protect mode",
+			"bad pNum",
+			"bad tNum",
+			"bad vsib addressing",
+			"can't convert",
+			"label is not set by L()",
+			"label is already set by L()",
+			"bad label string",
+			"err munmap",
+			"internal error",
+		};
+		assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
+		return errTbl[err_];
+	}
+};
+
+inline const char *ConvertErrorToString(Error err) // free-function form of err.what(); the returned text is a static string
+{
+	return err.what();
+}
+
+inline void *AlignedMalloc(size_t size, size_t alignment) // allocates size bytes aligned to alignment; release with AlignedFree
+{
+#ifdef __MINGW32__
+	return __mingw_aligned_malloc(size, alignment);
+#elif defined(_WIN32)
+	return _aligned_malloc(size, alignment);
+#else
+	void *block = 0;
+	if (posix_memalign(&block, alignment, size) != 0) return 0; // any non-zero status is reported as a null pointer
+	return block;
+#endif
+}
+
+inline void AlignedFree(void *p) // releases a block obtained from AlignedMalloc
+{
+#ifdef __MINGW32__
+	__mingw_aligned_free(p);
+#elif defined(_WIN32) // same guard as AlignedMalloc, so _aligned_malloc memory is never handed to free()
+	_aligned_free(p);
+#else
+	free(p); // posix_memalign memory is released with plain free()
+#endif
+}
+
+template<class To, class From>
+inline const To CastTo(From p) throw() // converts p to To by way of size_t using C-style casts
+{
+	return (const To)(size_t)(p);
+}
+namespace inner {
+
+static const size_t ALIGN_PAGE_SIZE = 4096;
+
+inline bool IsInDisp8(uint32 x) { return x <= 0x7F || x >= 0xFFFFFF80; } // true when x, read as a signed 32-bit value, lies in [-128, 127]
+inline bool IsInInt32(uint64 x) { return x <= 0x7FFFFFFFU || x >= ~uint64(0x7fffffffu); } // true when x, read as a signed 64-bit value, fits in 32 bits
+
+inline uint32 VerifyInInt32(uint64 x) // truncates x to 32 bits; 64-bit builds first throw ERR_OFFSET_IS_TOO_BIG if it does not fit
+{
+#ifdef XBYAK64
+	if (IsInInt32(x) == false) throw Error(ERR_OFFSET_IS_TOO_BIG);
+#endif
+	return uint32(x);
+}
+
+enum LabelMode {
+	LasIs, // as is
+	Labs, // absolute
+	LaddTop // (addr + top) for mov(reg, label) with AutoGrow
+};
+
+} // inner
+
+/*
+	custom allocator
+*/
+struct Allocator {
+	virtual uint8 *alloc(size_t size) { return static_cast<uint8*>(AlignedMalloc(size, inner::ALIGN_PAGE_SIZE)); } // block aligned to inner::ALIGN_PAGE_SIZE
+	virtual void free(uint8 *p) { AlignedFree(p); }
+	virtual ~Allocator() {}
+	/* CodeArray calls protect() itself while this returns true; override to return false if you call protect() manually */
+	virtual bool useProtect() const { return true; }
+};
+
+#ifdef XBYAK_USE_MMAP_ALLOCATOR
+class MmapAllocator : Allocator {
+	typedef XBYAK_STD_UNORDERED_MAP<uintptr_t, size_t> SizeList;
+	SizeList sizeList_; // mapped address -> mapped length, looked up again by free()
+public:
+	uint8 *alloc(size_t size)
+	{
+		const size_t pageMask = inner::ALIGN_PAGE_SIZE - 1;
+		size = (size + pageMask) & ~pageMask; // round the request up to a multiple of ALIGN_PAGE_SIZE
+#ifdef MAP_ANONYMOUS
+		const int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#elif defined(MAP_ANON)
+		const int flags = MAP_PRIVATE | MAP_ANON;
+#else
+		#error "not supported"
+#endif
+		void *const mapped = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+		if (mapped == MAP_FAILED) throw Error(ERR_CANT_ALLOC);
+		assert(mapped);
+		sizeList_[(uintptr_t)mapped] = size;
+		return (uint8*)mapped;
+	}
+	void free(uint8 *p)
+	{
+		if (p == 0) return;
+		SizeList::iterator entry = sizeList_.find((uintptr_t)p);
+		if (entry == sizeList_.end()) throw Error(ERR_BAD_PARAMETER); // p did not come from alloc()
+		if (munmap((void*)entry->first, entry->second) < 0) throw Error(ERR_MUNMAP);
+		sizeList_.erase(entry);
+	}
+};
+#endif
+
+class Operand {
+private:
+	uint8 idx_; // 0..15, MSB = 1 if spl/bpl/sil/dil
+	uint8 kind_; // Kind value(s) of this operand; 0 means NONE
+	uint16 bit_; // operand width in bits (power of two, or 0)
+public:
+	enum Kind { // single-bit flags, so is() can test several kinds with one mask
+		NONE = 0,
+		MEM = 1 << 1,
+		IMM = 1 << 2,
+		REG = 1 << 3,
+		MMX = 1 << 4,
+		XMM = 1 << 5,
+		FPU = 1 << 6,
+		YMM = 1 << 7
+	};
+	enum Code { // register indices; names of different widths share the same values
+#ifdef XBYAK64
+		RAX = 0, RCX, RDX, RBX, RSP, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,
+		R8D = 8, R9D, R10D, R11D, R12D, R13D, R14D, R15D,
+		R8W = 8, R9W, R10W, R11W, R12W, R13W, R14W, R15W,
+		R8B = 8, R9B, R10B, R11B, R12B, R13B, R14B, R15B,
+		SPL = 4, BPL, SIL, DIL,
+#endif
+		EAX = 0, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+		AX = 0, CX, DX, BX, SP, BP, SI, DI,
+		AL = 0, CL, DL, BL, AH, CH, DH, BH
+	};
+	Operand() : idx_(0), kind_(0), bit_(0) { }
+	Operand(int idx, Kind kind, int bit, bool ext8bit = 0)
+		: idx_(static_cast<uint8>(idx | (ext8bit ? 0x80 : 0))) // ext8bit is stored in the top bit of idx_
+		, kind_(static_cast<uint8>(kind))
+		, bit_(static_cast<uint16>(bit))
+	{
+		assert((bit_ & (bit_ - 1)) == 0); // bit must be power of two
+	}
+	Kind getKind() const { return static_cast<Kind>(kind_); }
+	int getIdx() const { return idx_ & 15; } // register index without the ext8bit flag
+	bool isNone() const { return kind_ == 0; }
+	bool isMMX() const { return is(MMX); }
+	bool isXMM() const { return is(XMM); }
+	bool isYMM() const { return is(YMM); }
+	bool isREG(int bit = 0) const { return is(REG, bit); }
+	bool isMEM(int bit = 0) const { return is(MEM, bit); }
+	bool isFPU() const { return is(FPU); }
+	bool isExt8bit() const { return (idx_ & 0x80) != 0; }
+	// ah, ch, dh, bh?
+	bool isHigh8bit() const
+	{
+		if (!isBit(8)) return false;
+		if (isExt8bit()) return false;
+		const int idx = getIdx();
+		return AH <= idx && idx <= BH;
+	}
+	// any bit is acceptable if bit == 0
+	bool is(int kind, uint32 bit = 0) const
+	{
+		return (kind_ & kind) && (bit == 0 || (bit_ & bit)); // cf. you can set (8|16)
+	}
+	bool isBit(uint32 bit) const { return (bit_ & bit) != 0; }
+	uint32 getBit() const { return bit_; }
+	const char *toString() const // static register name; throws Error(ERR_INTERNAL) for kinds without one
+	{
+		const int idx = getIdx();
+		if (kind_ == REG) {
+			if (isExt8bit()) {
+				static const char *tbl[4] = { "spl", "bpl", "sil", "dil" };
+				return tbl[idx - 4]; // SPL..DIL are indices 4..7
+			}
+			static const char *tbl[4][16] = {
+				{ "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh", "r8b", "r9b", "r10b",  "r11b", "r12b", "r13b", "r14b", "r15b" },
+				{ "ax", "cx", "dx", "bx", "sp", "bp", "si", "di", "r8w", "r9w", "r10w",  "r11w", "r12w", "r13w", "r14w", "r15w" },
+				{ "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi", "r8d", "r9d", "r10d",  "r11d", "r12d", "r13d", "r14d", "r15d" },
+				{ "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10",  "r11", "r12", "r13", "r14", "r15" },
+			};
+			return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx]; // row chosen by width: 8, 16, 32, anything else uses the 64-bit row
+		} else if (isYMM()) {
+			static const char *tbl[16] = { "ym0", "ym1", "ym2", "ym3", "ym4", "ym5", "ym6", "ym7", "ym8", "ym9", "ym10", "ym11", "ym12", "ym13", "ym14", "ym15" };
+			return tbl[idx];
+		} else if (isXMM()) {
+			static const char *tbl[16] = { "xm0", "xm1", "xm2", "xm3", "xm4", "xm5", "xm6", "xm7", "xm8", "xm9", "xm10", "xm11", "xm12", "xm13", "xm14", "xm15" };
+			return tbl[idx];
+		} else if (isMMX()) {
+			static const char *tbl[8] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
+			return tbl[idx];
+		} else if (isFPU()) {
+			static const char *tbl[8] = { "st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7" };
+			return tbl[idx];
+		}
+		throw Error(ERR_INTERNAL);
+	}
+	bool operator==(const Operand& rhs) const { return idx_ == rhs.idx_ && kind_ == rhs.kind_ && bit_ == rhs.bit_; } // compares the raw idx_, so the ext8bit flag counts
+	bool operator!=(const Operand& rhs) const { return !operator==(rhs); }
+};
+
+class Label;
+
+struct Reg8;
+struct Reg16;
+struct Reg32;
+#ifdef XBYAK64
+struct Reg64;
+#endif
+class Reg : public Operand { // register operand; adds REX-prefix computation and width conversions to Operand
+	bool hasRex() const { return isExt8bit() | isREG(64) | isExtIdx(); } // any of these makes getRex() return a non-zero prefix
+public:
+	Reg() { }
+	Reg(int idx, Kind kind, int bit = 0, bool ext8bit = false) : Operand(idx, kind, bit, ext8bit) { }
+	Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); } // same register, different width
+	bool isExtIdx() const { return getIdx() > 7; }
+	uint8 getRex(const Reg& base = Reg()) const // 0 when neither register needs a prefix, else 0x40 plus the flag bits below
+	{
+		return (hasRex() || base.hasRex()) ? uint8(0x40 | ((isREG(64) | base.isREG(64)) ? 8 : 0) | (isExtIdx() ? 4 : 0)| (base.isExtIdx() ? 1 : 0)) : 0; // 8: either is a 64-bit REG, 4: this index > 7, 1: base index > 7
+	}
+	Reg8 cvt8() const; // the cvtN() members return the N-bit register with this index; defined after the register structs
+	Reg16 cvt16() const;
+	Reg32 cvt32() const;
+#ifdef XBYAK64
+	Reg64 cvt64() const;
+#endif
+};
+
+struct Reg8 : public Reg { // 8-bit general-purpose register
+	explicit Reg8(int idx = 0, bool ext8bit = false) : Reg(idx, Operand::REG, 8, ext8bit) { }
+};
+
+struct Reg16 : public Reg { // 16-bit general-purpose register
+	explicit Reg16(int idx = 0) : Reg(idx, Operand::REG, 16) { }
+};
+
+struct Mmx : public Reg { // 64-bit MMX register by default; kind and bit are parameters so Xmm can reuse the constructor
+	explicit Mmx(int idx = 0, Kind kind = Operand::MMX, int bit = 64) : Reg(idx, kind, bit) { }
+};
+
+struct Xmm : public Mmx { // 128-bit XMM register by default; kind and bit are parameters so Ymm can reuse the constructor
+	explicit Xmm(int idx = 0, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { }
+};
+
+struct Ymm : public Xmm { // 256-bit YMM register
+	explicit Ymm(int idx = 0) : Xmm(idx, Operand::YMM, 256) { }
+};
+
+struct Fpu : public Reg { // FPU register; constructed with kind FPU and bit = 32
+	explicit Fpu(int idx = 0) : Reg(idx, Operand::FPU, 32) { }
+};
+
+struct Reg32e : public Reg { // general-purpose register whose width is chosen by the caller; base of Reg32
+	explicit Reg32e(int idx, int bit) : Reg(idx, Operand::REG, bit) {}
+};
+struct Reg32 : public Reg32e { // 32-bit general-purpose register
+	explicit Reg32(int idx = 0) : Reg32e(idx, 32) {}
+};
+#ifdef XBYAK64
+struct Reg64 : public Reg32e { // 64-bit general-purpose register (only declared when XBYAK64 is defined)
+	explicit Reg64(int idx = 0) : Reg32e(idx, 64) {}
+};
+struct RegRip { // operand for AddressFrame::operator[](const RegRip&): a displacement plus an optional Label
+	sint64 disp_;
+	Label* label_; // 0 when no label is attached
+	explicit RegRip(sint64 disp = 0, Label* label = 0) : disp_(disp), label_(label) {}
+	friend const RegRip operator+(const RegRip& r, sint64 disp) {
+		return RegRip(r.disp_ + disp, r.label_);
+	}
+	friend const RegRip operator-(const RegRip& r, sint64 disp) {
+		return RegRip(r.disp_ - disp, r.label_);
+	}
+	friend const RegRip operator+(const RegRip& r, Label& label) {
+		if (r.label_) throw Error(ERR_BAD_ADDRESSING); // only one label may be attached
+		return RegRip(r.disp_, &label);
+	}
+};
+#endif
+
+inline Reg8 Reg::cvt8() const
+{
+	const int regNo = getIdx();
+	if (isBit(8)) return Reg8(regNo, isExt8bit()); // already 8-bit: rebuild it with the same ext8bit flag
+#ifdef XBYAK32
+	if (regNo > 3) throw Error(ERR_CANT_CONVERT); // 32-bit builds convert only indices 0..3
+#endif
+	return Reg8(regNo, regNo >= 4 && regNo <= 7); // indices 4..7 are returned with ext8bit set (spl/bpl/sil/dil)
+}
+
+inline Reg16 Reg::cvt16() const
+{
+	// isHigh8bit() is true exactly for ah/ch/dh/bh: 8-bit, not ext8bit, index 4..7
+	if (isHigh8bit()) throw Error(ERR_CANT_CONVERT);
+	return Reg16(getIdx());
+}
+
+inline Reg32 Reg::cvt32() const
+{
+	// isHigh8bit() is true exactly for ah/ch/dh/bh: 8-bit, not ext8bit, index 4..7
+	if (isHigh8bit()) throw Error(ERR_CANT_CONVERT);
+	return Reg32(getIdx());
+}
+
+#ifdef XBYAK64
+inline Reg64 Reg::cvt64() const
+{
+	// isHigh8bit() is true exactly for ah/ch/dh/bh: 8-bit, not ext8bit, index 4..7
+	if (isHigh8bit()) throw Error(ERR_CANT_CONVERT);
+	return Reg64(getIdx());
+}
+#endif
+
+class RegExp { // address expression [base_ + index_ * scale_ + disp_] built up by the operators below the class
+public:
+	struct SReg {
+		uint16 bit:9; // 32/64/128/256 none if 0
+		uint16 idx:7;
+		SReg() : bit(0), idx(0) { }
+		void set(const Reg& r) { this->bit = uint16(r.getBit()); this->idx = uint16(r.getIdx()); }
+		bool operator==(const SReg& rhs) const { return bit == rhs.bit && idx == rhs.idx; }
+	};
+	RegExp(size_t disp = 0) : disp_(disp), scale_(0) { } // displacement-only expression; implicit so integers convert
+	RegExp(const Reg& r, int scale = 1) // r * scale; throws ERR_BAD_SIZE_OF_REGISTER or ERR_BAD_SCALE on invalid input
+		: disp_(0)
+		, scale_(scale)
+	{
+		if (!r.is(Reg::REG, 32|64) && !r.is(Reg::XMM|Reg::YMM)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
+		if (scale != 1 && scale != 2 && scale != 4 && scale != 8) throw Error(ERR_BAD_SCALE);
+		if (r.getBit() >= 128 || scale != 1) { // xmm/ymm is always index
+			index_.set(r);
+		} else {
+			base_.set(r);
+		}
+	}
+	bool isVsib() const { return index_.bit >= 128; } // index register is xmm or ymm
+	bool isYMM() const { return index_.bit >= 256; }
+	RegExp optimize() const // select smaller size
+	{
+		// [reg * 2] => [reg + reg]
+		if (!isVsib() && !base_.bit && index_.bit && scale_ == 2) {
+			RegExp ret = *this;
+			ret.base_ = index_;
+			ret.scale_ = 1;
+			return ret;
+		}
+		return *this;
+	}
+	bool operator==(const RegExp& rhs) const // equal when base, index, disp and (if an index exists) scale all match
+	{
+		return base_ == rhs.base_ && index_ == rhs.index_ && disp_ == rhs.disp_ && (!index_.bit || scale_ == rhs.scale_); // scale_ carries no meaning without an index register
+	}
+	const SReg& getBase() const { return base_; }
+	const SReg& getIndex() const { return index_; }
+	int getScale() const { return scale_; }
+	uint32 getDisp() const { return uint32(disp_); } // displacement truncated to 32 bits
+	void verify() const // throws on combinations that cannot be encoded
+	{
+		if (base_.bit >= 128) throw Error(ERR_BAD_SIZE_OF_REGISTER); // xmm/ymm cannot be a base
+		if (index_.bit && index_.bit <= 64) {
+			if (index_.idx == Operand::ESP) throw Error(ERR_ESP_CANT_BE_INDEX);
+			if (base_.bit && base_.bit != index_.bit) throw Error(ERR_BAD_SIZE_OF_REGISTER); // base and index must have the same width
+		}
+	}
+private:
+	friend RegExp operator+(const RegExp& a, const RegExp& b);
+	friend RegExp operator-(const RegExp& e, size_t disp);
+	/*
+		[base_ + index_ * scale_ + disp_]
+		base : Reg32e, index : Reg32e(w/o esp), Xmm, Ymm
+	*/
+	size_t disp_;
+	int scale_;
+	SReg base_;
+	SReg index_;
+};
+
+inline RegExp operator+(const RegExp& a, const RegExp& b) // merges two expressions; throws ERR_BAD_ADDRESSING if they cannot share one base/index pair
+{
+	if (a.index_.bit && b.index_.bit) throw Error(ERR_BAD_ADDRESSING); // at most one index register
+	RegExp ret = a;
+	if (!ret.index_.bit) { ret.index_ = b.index_; ret.scale_ = b.scale_; } // a has no index: take b's index and scale (even if b has none)
+	if (b.base_.bit) {
+		if (ret.base_.bit) {
+			if (ret.index_.bit) throw Error(ERR_BAD_ADDRESSING); // two bases plus an index do not fit
+			// base + base => base + index * 1
+			ret.index_ = b.base_;
+			// [reg + esp] => [esp + reg]
+			if (ret.index_.idx == Operand::ESP) std::swap(ret.base_, ret.index_);
+			ret.scale_ = 1;
+		} else {
+			ret.base_ = b.base_;
+		}
+	}
+	ret.disp_ += b.disp_;
+	return ret;
+}
+inline RegExp operator*(const Reg& r, int scale) // reg * scale; validation is done by the RegExp(const Reg&, int) constructor
+{
+	return RegExp(r, scale);
+}
+inline RegExp operator-(const RegExp& e, size_t disp) // copy of e with disp subtracted from its displacement
+{
+	RegExp shifted(e);
+	shifted.disp_ = shifted.disp_ - disp; // size_t arithmetic: wraps around instead of going negative
+	return shifted;
+}
+
+// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
+void *const AutoGrow = (void*)1;
+
+class CodeArray { // byte buffer that machine code is appended to; backed by a user buffer, a fixed allocation or a growing allocation
+	enum Type {
+		USER_BUF = 1, // use userPtr(non alignment, non protect)
+		ALLOC_BUF, // use new(alignment, protect)
+		AUTO_GROW // automatically move and grow memory if necessary
+	};
+	CodeArray(const CodeArray& rhs); // private copy operations: a CodeArray is not meant to be copied
+	void operator=(const CodeArray&);
+	bool isAllocType() const { return type_ == ALLOC_BUF || type_ == AUTO_GROW; } // true when the buffer came from alloc_
+	struct AddrInfo { // one pending address fix-up, applied by calcJmpAddress()
+		size_t codeOffset; // position to write
+		size_t jmpAddr; // value to write
+		int jmpSize; // size of jmpAddr
+		inner::LabelMode mode;
+		AddrInfo(size_t _codeOffset, size_t _jmpAddr, int _jmpSize, inner::LabelMode _mode)
+			: codeOffset(_codeOffset), jmpAddr(_jmpAddr), jmpSize(_jmpSize), mode(_mode) {}
+		uint64 getVal(const uint8 *top) const
+		{
+			uint64 disp = (mode == inner::LaddTop) ? jmpAddr + size_t(top) : (mode == inner::LasIs) ? jmpAddr : jmpAddr - size_t(top); // LaddTop: add top, LasIs: unchanged, otherwise (Labs): subtract top
+			if (jmpSize == 4) disp = inner::VerifyInInt32(disp);
+			return disp;
+		}
+	};
+	typedef std::list<AddrInfo> AddrInfoList;
+	AddrInfoList addrInfoList_;
+	const Type type_;
+#ifdef XBYAK_USE_MMAP_ALLOCATOR
+	MmapAllocator defaultAllocator_;
+#else
+	Allocator defaultAllocator_;
+#endif
+	Allocator *alloc_; // allocator in use: the one passed to the constructor, or defaultAllocator_
+protected:
+	size_t maxSize_; // capacity of top_ in bytes
+	uint8 *top_; // start of the code buffer
+	size_t size_; // number of bytes written so far
+
+	/*
+		allocate new memory and copy old data to the new area
+	*/
+	void growMemory()
+	{
+		const size_t newSize = (std::max<size_t>)(DEFAULT_MAX_CODE_SIZE, maxSize_ * 2); // double the capacity, but never below DEFAULT_MAX_CODE_SIZE
+		uint8 *newTop = alloc_->alloc(newSize);
+		if (newTop == 0) throw Error(ERR_CANT_ALLOC);
+		for (size_t i = 0; i < size_; i++) newTop[i] = top_[i];
+		alloc_->free(top_);
+		top_ = newTop;
+		maxSize_ = newSize;
+	}
+	/*
+		calc jmp address for AutoGrow mode
+	*/
+	void calcJmpAddress()
+	{
+		for (AddrInfoList::const_iterator i = addrInfoList_.begin(), ie = addrInfoList_.end(); i != ie; ++i) {
+			uint64 disp = i->getVal(top_);
+			rewrite(i->codeOffset, disp, i->jmpSize);
+		}
+		if (alloc_->useProtect() && !protect(top_, size_, true)) throw Error(ERR_CANT_PROTECT); // make the written bytes executable
+	}
+public:
+	explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) // userPtr == AutoGrow: growing buffer; other non-null userPtr: caller's buffer; null: allocate maxSize bytes
+		: type_(userPtr == AutoGrow ? AUTO_GROW : userPtr ? USER_BUF : ALLOC_BUF)
+		, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
+		, maxSize_(maxSize)
+		, top_(type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1))) // at least 1 byte is requested even when maxSize is 0
+		, size_(0)
+	{
+		if (maxSize_ > 0 && top_ == 0) throw Error(ERR_CANT_ALLOC);
+		if ((type_ == ALLOC_BUF && alloc_->useProtect()) && !protect(top_, maxSize, true)) { // only ALLOC_BUF is made executable here; AUTO_GROW is handled in calcJmpAddress()
+			alloc_->free(top_);
+			throw Error(ERR_CANT_PROTECT);
+		}
+	}
+	virtual ~CodeArray()
+	{
+		if (isAllocType()) {
+			if (alloc_->useProtect()) protect(top_, maxSize_, false); // drop exec permission before the memory is released
+			alloc_->free(top_);
+		}
+	}
+	void resetSize() // forget all written bytes and pending fix-ups; the buffer itself is kept
+	{
+		size_ = 0;
+		addrInfoList_.clear();
+	}
+	void db(int code) // append one byte; grows the buffer in AUTO_GROW mode, otherwise throws ERR_CODE_IS_TOO_BIG when full
+	{
+		if (size_ >= maxSize_) {
+			if (type_ == AUTO_GROW) {
+				growMemory();
+			} else {
+				throw Error(ERR_CODE_IS_TOO_BIG);
+			}
+		}
+		top_[size_++] = static_cast<uint8>(code);
+	}
+	void db(const uint8 *code, int codeSize) // append codeSize bytes from code
+	{
+		for (int i = 0; i < codeSize; i++) db(code[i]);
+	}
+	void db(uint64 code, int codeSize) // append the low codeSize bytes of code, least significant byte first
+	{
+		if (codeSize > 8) throw Error(ERR_BAD_PARAMETER);
+		for (int i = 0; i < codeSize; i++) db(static_cast<uint8>(code >> (i * 8)));
+	}
+	void dw(uint32 code) { db(code, 2); }
+	void dd(uint32 code) { db(code, 4); }
+	void dq(uint64 code) { db(code, 8); }
+	const uint8 *getCode() const { return top_; }
+	template<class F>
+	const F getCode() const { return CastTo<F>(top_); } // start of the buffer converted to F via CastTo
+	const uint8 *getCurr() const { return &top_[size_]; } // address where the next byte will be written
+	template<class F>
+	const F getCurr() const { return CastTo<F>(&top_[size_]); }
+	size_t getSize() const { return size_; }
+	void setSize(size_t size)
+	{
+		if (size > maxSize_) throw Error(ERR_OFFSET_IS_TOO_BIG);
+		size_ = size;
+	}
+	void dump() const // prints the written bytes as hex to stdout, 16 per row, at most 4 rows
+	{
+		const uint8 *p = getCode();
+		size_t bufSize = getSize();
+		size_t remain = bufSize;
+		for (int i = 0; i < 4; i++) {
+			size_t disp = 16;
+			if (remain < 16) {
+				disp = remain;
+			}
+			for (size_t j = 0; j < 16; j++) {
+				if (j < disp) {
+					printf("%02X", p[i * 16 + j]);
+				}
+			}
+			putchar('\n');
+			remain -= disp;
+			if (remain <= 0) { // remain is unsigned, so this is effectively remain == 0
+				break;
+			}
+		}
+	}
+	/*
+		@param offset [in] offset from top
+		@param disp [in] offset from the next of jmp
+		@param size [in] write size(1, 2, 4, 8)
+	*/
+	void rewrite(size_t offset, uint64 disp, size_t size)
+	{
+		assert(offset < maxSize_);
+		if (size != 1 && size != 2 && size != 4 && size != 8) throw Error(ERR_BAD_PARAMETER);
+		uint8 *const data = top_ + offset;
+		for (size_t i = 0; i < size; i++) {
+			data[i] = static_cast<uint8>(disp >> (i * 8)); // least significant byte first
+		}
+	}
+	void save(size_t offset, size_t val, int size, inner::LabelMode mode) // queue a fix-up for calcJmpAddress()
+	{
+		addrInfoList_.push_back(AddrInfo(offset, val, size, mode));
+	}
+	bool isAutoGrow() const { return type_ == AUTO_GROW; }
+	/**
+		change exec permission of memory
+		@param addr [in] buffer address
+		@param size [in] buffer size
+		@param canExec [in] true(enable to exec), false(disable to exec)
+		@return true(success), false(failure)
+	*/
+	static inline bool protect(const void *addr, size_t size, bool canExec)
+	{
+#if defined(_WIN32)
+		DWORD oldProtect;
+		return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
+#elif defined(__GNUC__)
+		size_t pageSize = sysconf(_SC_PAGESIZE);
+		size_t iaddr = reinterpret_cast<size_t>(addr);
+		size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); // round addr down to the start of its page
+		int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0);
+		return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0; // length extended by the amount addr was rounded down
+#else
+		return true;
+#endif
+	}
+	/**
+		get aligned memory pointer
+		@param addr [in] address
+		@param alignedSize [in] power of two
+		@return aligned addr by alignedSize
+	*/
+	static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = 16)
+	{
+		return reinterpret_cast<uint8*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1))); // rounds addr up to the next multiple of alignedSize
+	}
+};
+
+class Address : public Operand { // memory operand: holds the encoded addressing bytes plus the flags code emission asks for
+	mutable uint8 top_[6]; // 6 = 1(ModRM) + 1(SIB) + 4(disp)
+	uint8 size_; // number of bytes used in top_
+	uint8 rex_;
+	size_t disp_;
+	const Label* label_; // 0 unless set with setLabel()
+	bool isOnlyDisp_;
+	bool is64bitDisp_;
+	bool is32bit_;
+	mutable bool isVsib_;
+	bool isYMM_;
+	void verify() const { if (isVsib_) throw Error(ERR_BAD_VSIB_ADDRESSING); } // the accessors that call this reject VSIB addresses
+public:
+	Address(uint32 sizeBit, bool isOnlyDisp, size_t disp, bool is32bit, bool is64bitDisp = false, bool isVsib = false, bool isYMM = false)
+		: Operand(0, MEM, sizeBit)
+		, size_(0)
+		, rex_(0)
+		, disp_(disp)
+		, label_(0)
+		, isOnlyDisp_(isOnlyDisp)
+		, is64bitDisp_(is64bitDisp)
+		, is32bit_(is32bit)
+		, isVsib_(isVsib)
+		, isYMM_(isYMM)
+	{
+	}
+	void db(int code) // append one encoded byte; throws ERR_CODE_IS_TOO_BIG once all 6 bytes of top_ are used
+	{
+		if (size_ >= sizeof(top_)) throw Error(ERR_CODE_IS_TOO_BIG);
+		top_[size_++] = static_cast<uint8>(code);
+	}
+	void dd(uint32 code) { for (int i = 0; i < 4; i++) db(code >> (i * 8)); } // append 4 bytes, least significant first
+	const uint8 *getCode() const { return top_; }
+	size_t getSize() const { return size_; }
+	void updateRegField(uint8 regIdx) const // stores the low 3 bits of regIdx into bits 3..5 of the first byte
+	{
+		*top_ = (*top_ & B11000111) | ((regIdx << 3) & B00111000);
+	}
+	void setVsib(bool isVsib) const { isVsib_ = isVsib; } // const because isVsib_ is mutable
+	bool isVsib() const { return isVsib_; }
+	bool isYMM() const { return isYMM_; }
+	bool is32bit() const { verify(); return is32bit_; }
+	bool isOnlyDisp() const { verify(); return isOnlyDisp_; } // for mov eax
+	size_t getDisp() const { verify(); return disp_; }
+	uint8 getRex() const { verify(); return rex_; }
+	bool is64bitDisp() const { verify(); return is64bitDisp_; } // for moffset
+	void setRex(uint8 rex) { rex_ = rex; }
+	void setLabel(const Label* label) { label_ = label; }
+	const Label* getLabel() const { return label_; }
+};
+
+class AddressFrame { // factory for Address operands of a fixed size (bit_), used through operator[]
+private:
+	void operator=(const AddressFrame&);
+	Address makeAddress(const RegExp& e) const // encodes e as ModRM [+ SIB] [+ disp8/disp32] and stores the REX bits it needs
+	{
+		e.verify();
+		const bool isVsib = e.isVsib();
+		const bool isYMM = e.isYMM();
+		const RegExp::SReg& base = e.getBase();
+		const RegExp::SReg& index = e.getIndex();
+		const uint32 disp = e.getDisp();
+		Address frame(bit_, (!base.bit && !index.bit), disp, base.bit == 32 || index.bit == 32, false, isVsib, isYMM);
+		enum {
+			mod00 = 0, mod01 = 1, mod10 = 2
+		};
+		int mod;
+		if (!base.bit || ((base.idx & 7) != Operand::EBP && disp == 0)) { // no base, or zero disp with a base whose low 3 index bits differ from EBP's
+			mod = mod00;
+		} else if (inner::IsInDisp8(disp)) {
+			mod = mod01;
+		} else {
+			mod = mod10;
+		}
+		const int baseIdx = base.bit ? (base.idx & 7) : Operand::EBP; // without a base register, EBP's index goes into the base/rm field
+		/* ModR/M = [2:3:3] = [Mod:reg/code:R/M] */
+		bool hasSIB = index.bit || (base.idx & 7) == Operand::ESP;
+#ifdef XBYAK64
+		if (!base.bit && !index.bit) hasSIB = true; // 64-bit builds emit displacement-only operands with a SIB byte
+#endif
+		if (hasSIB) {
+			frame.db((mod << 6) | Operand::ESP);
+			/* SIB = [2:3:3] = [SS:index:base(=rm)] */
+			const int indexIdx = index.bit ? (index.idx & 7) : Operand::ESP; // ESP's index in the index field when there is no index register
+			const int scale = e.getScale();
+			const int ss = (scale == 8) ? 3 : (scale == 4) ? 2 : (scale == 2) ? 1 : 0;
+			frame.db((ss << 6) | (indexIdx << 3) | baseIdx);
+		} else {
+			frame.db((mod << 6) | baseIdx);
+		}
+		if (mod == mod01) {
+			frame.db(disp);
+		} else if (mod == mod10 || (mod == mod00 && !base.bit)) { // a missing base always carries a 32-bit displacement
+			frame.dd(disp);
+		}
+		int rex = ((index.idx >> 3) << 1) | (base.idx >> 3); // bit 1 from the index register's index >> 3, bit 0 from the base's
+		if (rex) rex |= 0x40;
+		frame.setRex(uint8(rex));
+		return frame;
+	}
+public:
+	const uint32 bit_;
+	explicit AddressFrame(uint32 bit) : bit_(bit) { }
+	Address operator[](const void *disp) const // absolute address; 64-bit builds throw ERR_OFFSET_IS_TOO_BIG above 0xFFFFFFFF
+	{
+		size_t adr = reinterpret_cast<size_t>(disp);
+#ifdef XBYAK64
+		if (adr > 0xFFFFFFFFU) throw Error(ERR_OFFSET_IS_TOO_BIG);
+#endif
+		RegExp e(static_cast<uint32>(adr));
+		return operator[](e);
+	}
+#ifdef XBYAK64
+	Address operator[](uint64 disp) const // displacement-only Address with is64bitDisp set; its size is fixed at 64, not bit_
+	{
+		return Address(64, true, disp, false, true);
+	}
+	Address operator[](const RegRip& addr) const
+	{
+		Address frame(bit_, true, addr.disp_, false);
+		frame.db(0x05); // first encoded byte of the RegRip form
+		if (addr.label_) {
+			frame.setLabel(addr.label_); // no displacement bytes are written here when a label is attached
+		} else {
+			frame.dd(inner::VerifyInInt32(addr.disp_));
+		}
+		return frame;
+	}
+#endif
+	Address operator[](const RegExp& e) const
+	{
+		return makeAddress(e.optimize());
+	}
+};
+
+// Record of a jump/reference whose target label is not yet known.
+struct JmpLabel {
+	size_t endOfJmp; /* offset from top to the end address of jmp */
+	int jmpSize; // size in bytes of the field to patch
+	inner::LabelMode mode;
+	size_t disp; // disp for [rip + disp]
+	explicit JmpLabel(size_t endOfJmp = 0, int jmpSize = 0, inner::LabelMode mode = inner::LasIs, size_t disp = 0)
+	{
+		this->endOfJmp = endOfJmp;
+		this->jmpSize = jmpSize;
+		this->mode = mode;
+		this->disp = disp;
+	}
+};
+
+class LabelManager;
+
+/*
+	Handle for a code label. The id starts at 0 (not yet assigned) and mgr at
+	null; both are mutable so that LabelManager can fill them in through a
+	const reference.
+*/
+class Label {
+	mutable LabelManager *mgr;
+	mutable int id;
+	friend class LabelManager;
+public:
+	Label()
+		: mgr(0)
+		, id(0)
+	{
+	}
+	Label(const Label& rhs);
+	Label& operator=(const Label& rhs);
+	~Label();
+	int getId() const
+	{
+		return id;
+	}
+
+	// backward compatibility
+	// Format num as a dot followed by eight lowercase hex digits.
+	static std::string toStr(int num)
+	{
+		char buf[16];
+#ifdef _MSC_VER
+		_snprintf_s(buf, sizeof(buf), ".%08x", num);
+#else
+		snprintf(buf, sizeof(buf), ".%08x", num);
+#endif
+		return std::string(buf);
+	}
+};
+
+/*
+	Bookkeeping for labels used by a code generator.
+
+	Two independent label families are tracked:
+	- string labels, kept per scope in stateList_ (front() is the global
+	  scope, back() is the innermost local scope; names starting with '.'
+	  are looked up in the local scope);
+	- Label objects, identified by an integer id, kept in clabelDefList_ /
+	  clabelUndefList_.
+	For each family a "def" map stores the offset of defined labels and an
+	"undef" multimap stores references that still wait for their target.
+*/
+class LabelManager {
+	// for string label
+	struct SlabelVal {
+		size_t offset;
+		SlabelVal(size_t offset) : offset(offset) {}
+	};
+	typedef XBYAK_STD_UNORDERED_MAP<std::string, SlabelVal> SlabelDefList;
+	typedef XBYAK_STD_UNORDERED_MULTIMAP<std::string, const JmpLabel> SlabelUndefList;
+	struct SlabelState {
+		SlabelDefList defList;
+		SlabelUndefList undefList;
+	};
+	typedef std::list<SlabelState> StateList;
+	// for Label class
+	struct ClabelVal {
+		ClabelVal(size_t offset = 0) : offset(offset), refCount(1) {}
+		size_t offset;
+		int refCount;
+	};
+	typedef XBYAK_STD_UNORDERED_MAP<int, ClabelVal> ClabelDefList;
+	typedef XBYAK_STD_UNORDERED_MULTIMAP<int, const JmpLabel> ClabelUndefList;
+
+	CodeArray *base_;
+	// global : stateList_.front(), local : stateList_.back()
+	StateList stateList_;
+	mutable int labelId_;
+	ClabelDefList clabelDefList_;
+	ClabelUndefList clabelUndefList_;
+
+	// Return the id of label, handing out a fresh one if it has none yet (id == 0).
+	int getId(const Label& label) const
+	{
+		if (label.id == 0) label.id = labelId_++;
+		return label.id;
+	}
+	/*
+		Register labelId at addrOffset and patch every pending reference to it.
+		Throws ERR_LABEL_IS_REDEFINED when labelId is already defined, and
+		ERR_OFFSET_IS_TOO_BIG / ERR_LABEL_IS_TOO_FAR when a pending reference
+		cannot hold the resulting displacement.
+	*/
+	template<class DefList, class UndefList, class T>
+	void define_inner(DefList& defList, UndefList& undefList, const T& labelId, size_t addrOffset)
+	{
+		// add label
+		typename DefList::value_type item(labelId, addrOffset);
+		std::pair<typename DefList::iterator, bool> ret = defList.insert(item);
+		if (!ret.second) throw Error(ERR_LABEL_IS_REDEFINED);
+		// search undefined label
+		for (;;) {
+			typename UndefList::iterator itr = undefList.find(labelId);
+			if (itr == undefList.end()) break;
+			const JmpLabel *jmp = &itr->second;
+			// start of the field that has to be rewritten
+			const size_t offset = jmp->endOfJmp - jmp->jmpSize;
+			size_t disp;
+			if (jmp->mode == inner::LaddTop) {
+				disp = addrOffset;
+			} else if (jmp->mode == inner::Labs) {
+				disp = size_t(base_->getCurr());
+			} else {
+				disp = addrOffset - jmp->endOfJmp + jmp->disp;
+#ifdef XBYAK64
+				if (jmp->jmpSize <= 4 && !inner::IsInInt32(disp)) throw Error(ERR_OFFSET_IS_TOO_BIG);
+#endif
+				if (jmp->jmpSize == 1 && !inner::IsInDisp8((uint32)disp)) throw Error(ERR_LABEL_IS_TOO_FAR);
+			}
+			if (base_->isAutoGrow()) {
+				base_->save(offset, disp, jmp->jmpSize, jmp->mode);
+			} else {
+				base_->rewrite(offset, disp, jmp->jmpSize);
+			}
+			undefList.erase(itr);
+		}
+	}
+	// Look label up in defList; on success store its offset and return true.
+	template<class DefList, class T>
+	bool getOffset_inner(const DefList& defList, size_t *offset, const T& label) const
+	{
+		typename DefList::const_iterator i = defList.find(label);
+		if (i == defList.end()) return false;
+		*offset = i->second.offset;
+		return true;
+	}
+	friend class Label;
+	// Add one reference to an already defined Label id. Unknown ids are
+	// ignored: using operator[] here would silently create a definition
+	// at offset 0 for an id that was never defined.
+	void incRefCount(int id)
+	{
+		ClabelDefList::iterator i = clabelDefList_.find(id);
+		if (i == clabelDefList_.end()) return;
+		++i->second.refCount;
+	}
+	// Drop one reference; the definition is removed with the last reference.
+	void decRefCount(int id)
+	{
+		ClabelDefList::iterator i = clabelDefList_.find(id);
+		if (i == clabelDefList_.end()) return;
+		if (i->second.refCount == 1) {
+			clabelDefList_.erase(id);
+		} else {
+			--i->second.refCount;
+		}
+	}
+	// True when list still holds pending references; in builds without
+	// NDEBUG each pending key is also written to std::cerr.
+	template<class T>
+	bool hasUndefinedLabel_inner(const T& list) const
+	{
+#ifndef NDEBUG
+		for (typename T::const_iterator i = list.begin(); i != list.end(); ++i) {
+			std::cerr << "undefined label:" << i->first << std::endl;
+		}
+#endif
+		return !list.empty();
+	}
+public:
+	LabelManager()
+	{
+		reset();
+	}
+	/*
+		Return to the initial state: no code array, ids restart at 1, one
+		global and one local string-label scope, and no Label definitions or
+		pending Label references. The Label maps must be cleared together
+		with labelId_, otherwise ids handed out after the reset would collide
+		with stale definitions.
+	*/
+	void reset()
+	{
+		base_ = 0;
+		labelId_ = 1;
+		stateList_.clear();
+		stateList_.push_back(SlabelState());
+		stateList_.push_back(SlabelState());
+		clabelDefList_.clear();
+		clabelUndefList_.clear();
+	}
+	// Open a nested local scope for '.'-prefixed string labels.
+	void enterLocal()
+	{
+		stateList_.push_back(SlabelState());
+	}
+	// Close the innermost local scope. Throws ERR_UNDER_LOCAL_LABEL when only
+	// the two initial scopes remain and ERR_LABEL_IS_NOT_FOUND when the scope
+	// still has unresolved references.
+	void leaveLocal()
+	{
+		if (stateList_.size() <= 2) throw Error(ERR_UNDER_LOCAL_LABEL);
+		if (hasUndefinedLabel_inner(stateList_.back().undefList)) throw Error(ERR_LABEL_IS_NOT_FOUND);
+		stateList_.pop_back();
+	}
+	void set(CodeArray *base) { base_ = base; }
+	/*
+		Define a string label at the current size of the code array.
+		"@@" is an anonymous label: it is stored alternately under the keys
+		"@f" and "@b" in the global scope. "@b" and "@f" themselves cannot be
+		defined (ERR_BAD_LABEL_STR).
+	*/
+	void defineSlabel(std::string label)
+	{
+		if (label == "@b" || label == "@f") throw Error(ERR_BAD_LABEL_STR);
+		if (label == "@@") {
+			SlabelDefList& defList = stateList_.front().defList;
+			SlabelDefList::iterator i = defList.find("@f");
+			if (i != defList.end()) {
+				defList.erase(i);
+				label = "@b";
+			} else {
+				i = defList.find("@b");
+				if (i != defList.end()) {
+					defList.erase(i);
+				}
+				label = "@f";
+			}
+		}
+		SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
+		define_inner(st.defList, st.undefList, label, base_->getSize());
+	}
+	// Define a Label object at the current size of the code array.
+	void defineClabel(const Label& label)
+	{
+		define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getSize());
+		label.mgr = this;
+	}
+	/*
+		Define dst at the offset of the already defined src.
+		Throws ERR_LABEL_ISNOT_SET_BY_L when src has no definition.
+		dst is given an id first if it has none: defining it under the
+		placeholder id 0 would lose the definition as soon as dst receives its
+		real id, and a second such call would fail as a redefinition of id 0.
+	*/
+	void assign(Label& dst, const Label& src)
+	{
+		ClabelDefList::const_iterator i = clabelDefList_.find(src.id);
+		if (i == clabelDefList_.end()) throw Error(ERR_LABEL_ISNOT_SET_BY_L);
+		define_inner(clabelDefList_, clabelUndefList_, getId(dst), i->second.offset);
+		dst.mgr = this;
+	}
+	/*
+		Resolve a string label. "@b"/"@f" are rewritten in place to the key
+		the anonymous label is currently stored under; "@b" without any
+		anonymous label throws ERR_LABEL_IS_NOT_FOUND.
+		Returns false when the label is not defined (yet).
+	*/
+	bool getOffset(size_t *offset, std::string& label) const
+	{
+		const SlabelDefList& defList = stateList_.front().defList;
+		if (label == "@b") {
+			if (defList.find("@f") != defList.end()) {
+				label = "@f";
+			} else if (defList.find("@b") == defList.end()) {
+				throw Error(ERR_LABEL_IS_NOT_FOUND);
+			}
+		} else if (label == "@f") {
+			if (defList.find("@f") != defList.end()) {
+				label = "@b";
+			}
+		}
+		const SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
+		return getOffset_inner(st.defList, offset, label);
+	}
+	// Resolve a Label object; assigns it an id as a side effect if it has none.
+	bool getOffset(size_t *offset, const Label& label) const
+	{
+		return getOffset_inner(clabelDefList_, offset, getId(label));
+	}
+	// Remember a reference to a string label that is not defined yet.
+	void addUndefinedLabel(const std::string& label, const JmpLabel& jmp)
+	{
+		SlabelState& st = *label.c_str() == '.' ? stateList_.back() : stateList_.front();
+		st.undefList.insert(SlabelUndefList::value_type(label, jmp));
+	}
+	// Remember a reference to a Label object that is not defined yet.
+	void addUndefinedLabel(const Label& label, const JmpLabel& jmp)
+	{
+		clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp));
+	}
+	// True when any scope still has unresolved string-label references.
+	bool hasUndefSlabel() const
+	{
+		for (StateList::const_iterator i = stateList_.begin(), ie = stateList_.end(); i != ie; ++i) {
+			if (hasUndefinedLabel_inner(i->undefList)) return true;
+		}
+		return false;
+	}
+	// True when there are unresolved references to Label objects.
+	bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); }
+};
+
+// Copy a label handle; when the source is bound to a manager, register one
+// more reference to its id there.
+inline Label::Label(const Label& rhs)
+	: mgr(rhs.mgr)
+	, id(rhs.id)
+{
+	if (mgr != 0) {
+		mgr->incRefCount(id);
+	}
+}
+// Take over id and manager from rhs. Only allowed while this label has no id
+// yet; otherwise ERR_LABEL_IS_ALREADY_SET_BY_L is thrown.
+inline Label& Label::operator=(const Label& rhs)
+{
+	if (id != 0) {
+		throw Error(ERR_LABEL_IS_ALREADY_SET_BY_L);
+	}
+	id = rhs.id;
+	mgr = rhs.mgr;
+	if (mgr != 0) {
+		mgr->incRefCount(id);
+	}
+	return *this;
+}
+// Release this handle's reference, if it has an id and is bound to a manager.
+inline Label::~Label()
+{
+	if (id == 0 || mgr == 0) return;
+	mgr->decRefCount(id);
+}
+
+class CodeGenerator : public CodeArray {
+public:
+	enum LabelType {
+		T_SHORT,
+		T_NEAR,
+		T_AUTO // T_SHORT if possible
+	};
+private:
+	CodeGenerator operator=(const CodeGenerator&); // don't call
+#ifdef XBYAK64
+	enum { i32e = 32 | 64, BIT = 64 };
+	static const size_t dummyAddr = (size_t(0x11223344) << 32) | 55667788;
+	typedef Reg64 NativeReg;
+#else
+	enum { i32e = 32, BIT = 32 };
+	static const size_t dummyAddr = 0x12345678;
+	typedef Reg32 NativeReg;
+#endif
+	// Operand-combination predicates. Each comment gives the accepted
+	// (op1, op2) kinds.
+
+	// (XMM, XMM|MEM)
+	static inline bool isXMM_XMMorMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isXMM()) return false;
+		return op2.isXMM() || op2.isMEM();
+	}
+	// (MMX, MMX|MEM) or (XMM, XMM|MEM)
+	static inline bool isXMMorMMX_MEM(const Operand& op1, const Operand& op2)
+	{
+		if (op1.isMMX() && (op2.isMMX() || op2.isMEM())) return true;
+		return isXMM_XMMorMEM(op1, op2);
+	}
+	// (XMM, MMX|MEM)
+	static inline bool isXMM_MMXorMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isXMM()) return false;
+		return op2.isMMX() || op2.isMEM();
+	}
+	// (MMX, XMM|MEM)
+	static inline bool isMMX_XMMorMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isMMX()) return false;
+		return op2.isXMM() || op2.isMEM();
+	}
+	// (XMM, REG32|MEM)
+	static inline bool isXMM_REG32orMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isXMM()) return false;
+		return op2.isREG(i32e) || op2.isMEM();
+	}
+	// (REG32, XMM|MEM)
+	static inline bool isREG32_XMMorMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isREG(i32e)) return false;
+		return op2.isXMM() || op2.isMEM();
+	}
+	// (REG32, REG32|MEM); two registers must have the same width
+	static inline bool isREG32_REG32orMEM(const Operand& op1, const Operand& op2)
+	{
+		if (!op1.isREG(i32e)) return false;
+		if (op2.isREG(i32e) && op1.getBit() == op2.getBit()) return true;
+		return op2.isMEM();
+	}
+	/*
+		Emit the prefixes that precede the opcode for an operand pair:
+		0x67 (64-bit build with a 32-bit address), 0x66 (16-bit operand) and
+		the REX byte when it is non-zero.
+		Throws ERR_BAD_COMBINATION when both operands are memory.
+	*/
+	void rex(const Operand& op1, const Operand& op2 = Operand())
+	{
+		const Operand *regSide = &op1;
+		const Operand *rmSide = &op2;
+		// keep a possible memory operand on the r/m side
+		if (regSide->isMEM()) std::swap(regSide, rmSide);
+		if (regSide->isMEM()) throw Error(ERR_BAD_COMBINATION);
+		uint8 prefix = 0;
+		if (rmSide->isMEM()) {
+			const Address& addr = static_cast<const Address&>(*rmSide);
+			if (BIT == 64 && addr.is32bit()) db(0x67);
+			prefix = addr.getRex() | static_cast<const Reg&>(*regSide).getRex();
+		} else {
+			// ModRM(reg, base);
+			prefix = static_cast<const Reg&>(op2).getRex(static_cast<const Reg&>(op1));
+		}
+		// except movsx(16bit, 32/64bit)
+		const bool needs66 = (op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e));
+		if (needs66) db(0x66);
+		if (prefix) db(prefix);
+	}
+	// Bit flags describing the implied prefix (PP_*) and the opcode map
+	// (MM_*) of a VEX-encoded instruction.
+	enum AVXtype {
+		PP_NONE = 0x01,
+		PP_66 = 0x02,
+		PP_F3 = 0x04,
+		PP_F2 = 0x08,
+		MM_RESERVED = 0x10,
+		MM_0F = 0x20,
+		MM_0F38 = 0x40,
+		MM_0F3A = 0x80
+	};
+	/*
+		Emit a VEX prefix. The two-byte form (0xC5) is used when b, x and w
+		are all clear and the opcode map is MM_0F; otherwise the three-byte
+		form (0xC4) is emitted.
+	*/
+	void vex(bool r, int idx, bool is256, int type, bool x = false, bool b = false, int w = 1)
+	{
+		uint32 pp = 0;
+		if (type & PP_66) {
+			pp = 1;
+		} else if (type & PP_F3) {
+			pp = 2;
+		} else if (type & PP_F2) {
+			pp = 3;
+		}
+		// inverted register index, vector-length bit, implied prefix
+		const uint32 vvvv = (((~idx) & 15) << 3) | (is256 ? 4 : 0) | pp;
+		const uint32 rBit = r ? 0 : 0x80;
+		const bool twoByteForm = !b && !x && !w && (type & MM_0F);
+		if (twoByteForm) {
+			db(0xC5);
+			db(rBit | vvvv);
+			return;
+		}
+		uint32 mmmm = 0;
+		if (type & MM_0F) {
+			mmmm = 1;
+		} else if (type & MM_0F38) {
+			mmmm = 2;
+		} else if (type & MM_0F3A) {
+			mmmm = 3;
+		}
+		db(0xC4);
+		db(rBit | (x ? 0 : 0x40) | (b ? 0 : 0x20) | mmmm);
+		db((w << 7) | vvvv);
+	}
+	LabelManager labelMgr_;
+	// True when x, read as a signed 32-bit value, lies in [-0x8000, 0x7FFF].
+	bool isInDisp16(uint32 x) const
+	{
+		if (x <= 0x7FFF) return true;
+		return x >= 0xFFFF8000;
+	}
+	// Pack mod (2 bits), r1 (3 bits) and r2 (3 bits) into one ModR/M byte.
+	uint8 getModRM(int mod, int r1, int r2) const
+	{
+		const int regField = (r1 & 7) << 3;
+		const int rmField = r2 & 7;
+		return static_cast<uint8>((mod << 6) | regField | rmField);
+	}
+	// Register-register form: prefixes, up to three opcode bytes (bit 0 of
+	// code0 is set unless reg1 is 8-bit) and a mod=3 ModR/M byte.
+	void opModR(const Reg& reg1, const Reg& reg2, int code0, int code1 = NONE, int code2 = NONE)
+	{
+		rex(reg2, reg1);
+		const int wideBit = reg1.isBit(8) ? 0 : 1;
+		db(code0 | wideBit);
+		if (code1 != NONE) db(code1);
+		if (code2 != NONE) db(code2);
+		db(getModRM(3, reg1.getIdx(), reg2.getIdx()));
+	}
+	// Memory-register form: prefixes, up to three opcode bytes (bit 0 of
+	// code0 is set unless reg is 8-bit), then the address bytes with reg's
+	// index written into the reg field.
+	// Throws ERR_CANT_USE_64BIT_DISP for addresses with a 64-bit displacement.
+	void opModM(const Address& addr, const Reg& reg, int code0, int code1 = NONE, int code2 = NONE)
+	{
+		if (addr.is64bitDisp()) throw Error(ERR_CANT_USE_64BIT_DISP);
+		rex(addr, reg);
+		const int wideBit = reg.isBit(8) ? 0 : 1;
+		db(code0 | wideBit);
+		if (code1 != NONE) db(code1);
+		if (code2 != NONE) db(code2);
+		addr.updateRegField(static_cast<uint8>(reg.getIdx()));
+		opAddr(addr);
+	}
+	/*
+		Emit a jump to a known target. disp is measured from the start of the
+		instruction; the encoded displacement is relative to its end, hence
+		the size corrections. The short form is chosen whenever it is allowed
+		(type != T_NEAR) and the displacement fits in 8 bits.
+		Throws ERR_LABEL_IS_TOO_FAR when T_SHORT was requested but does not fit.
+	*/
+	void makeJmp(uint32 disp, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref)
+	{
+		const int shortJmpSize = 2;
+		const int longHeaderSize = longPref ? 2 : 1;
+		const int longJmpSize = longHeaderSize + 4;
+		if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) {
+			db(shortCode);
+			db(disp - shortJmpSize);
+			return;
+		}
+		if (type == T_SHORT) throw Error(ERR_LABEL_IS_TOO_FAR);
+		if (longPref) db(longPref);
+		db(longCode);
+		dd(disp - longJmpSize);
+	}
+	/*
+		Jump to a label (string or Label). A known label is encoded directly;
+		for an unknown one a zero placeholder is emitted (4 bytes for T_NEAR,
+		1 byte otherwise) and the reference is queued in the label manager.
+	*/
+	template<class T>
+	void opJmp(T& label, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref)
+	{
+		if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); /* avoid splitting code of jmp */
+		size_t offset = 0;
+		if (labelMgr_.getOffset(&offset, label)) { /* label exists */
+			makeJmp(inner::VerifyInInt32(offset - size_), type, shortCode, longCode, longPref);
+			return;
+		}
+		int jmpSize;
+		if (type == T_NEAR) {
+			if (longPref) db(longPref);
+			db(longCode);
+			dd(0);
+			jmpSize = 4;
+		} else {
+			db(shortCode);
+			db(0);
+			jmpSize = 1;
+		}
+		const JmpLabel pending(size_, jmpSize, inner::LasIs);
+		labelMgr_.addUndefinedLabel(label, pending);
+	}
+	/*
+		Jump to an absolute address. With a fixed buffer the displacement is
+		computed right away; in auto-grow mode only T_NEAR is accepted
+		(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW otherwise) and the 4-byte
+		field is recorded through save() instead.
+	*/
+	void opJmpAbs(const void *addr, LabelType type, uint8 shortCode, uint8 longCode)
+	{
+		if (!isAutoGrow()) {
+			makeJmp(inner::VerifyInInt32(reinterpret_cast<const uint8*>(addr) - getCurr()), type, shortCode, longCode, 0);
+			return;
+		}
+		if (type != T_NEAR) throw Error(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW);
+		if (size_ + 16 >= maxSize_) growMemory();
+		db(longCode);
+		dd(0);
+		save(size_ - 4, size_t(addr) - size_, 4, inner::Labs);
+	}
+	// Emit the encoded address bytes; an address bound to a label
+	// ([rip + Label]) is followed by a relative label reference.
+	void opAddr(const Address &addr)
+	{
+		db(addr.getCode(), static_cast<int>(addr.getSize()));
+		const Label *target = addr.getLabel();
+		if (target == 0) return;
+		putL_inner(*target, true, addr.getDisp());
+	}
+	/*
+		Generic 0x0F-escaped instruction: optional prefix byte, 0x0F, optional
+		preCode (for SSSE3/SSE4), code, operands, optional imm8.
+		isValid may be null; when given and it rejects (reg, op),
+		ERR_BAD_COMBINATION is thrown.
+	*/
+	void opGen(const Operand& reg, const Operand& op, int code, int pref, bool isValid(const Operand&, const Operand&), int imm8 = NONE, int preCode = NONE)
+	{
+		if (isValid && !isValid(reg, op)) throw Error(ERR_BAD_COMBINATION);
+		if (pref != NONE) db(pref);
+		const Reg& r = static_cast<const Reg&>(reg);
+		if (op.isMEM()) {
+			opModM(static_cast<const Address&>(op), r, 0x0F, preCode, code);
+		} else {
+			opModR(r, static_cast<const Reg&>(op), 0x0F, preCode, code);
+		}
+		if (imm8 != NONE) db(imm8);
+	}
+	// MMX/XMM instruction with an immediate; ext goes into the ModR/M reg
+	// field and 0x66 is prepended for XMM operands.
+	void opMMX_IMM(const Mmx& mmx, int imm8, int code, int ext)
+	{
+		if (mmx.isXMM()) {
+			db(0x66);
+		}
+		opModR(Reg32(ext), mmx, 0x0F, code);
+		db(imm8);
+	}
+	// MMX/XMM instruction; pref is emitted only when mmx is an XMM register.
+	void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE)
+	{
+		const int usedPref = mmx.isXMM() ? pref : NONE;
+		opGen(mmx, op, code, usedPref, isXMMorMMX_MEM, imm8, preCode);
+	}
+	// XMM <-> memory move. (XMM, MEM) uses code, (MEM, XMM) uses code | 1;
+	// anything else throws ERR_BAD_COMBINATION.
+	void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
+	{
+		if (pref != NONE) db(pref);
+		if (op1.isXMM() && op2.isMEM()) {
+			opModM(static_cast<const Address&>(op2), static_cast<const Reg&>(op1), 0x0F, code);
+			return;
+		}
+		if (op1.isMEM() && op2.isXMM()) {
+			opModM(static_cast<const Address&>(op1), static_cast<const Reg&>(op2), 0x0F, code | 1);
+			return;
+		}
+		throw Error(ERR_BAD_COMBINATION);
+	}
+	// Extract/insert style instruction with an immediate. With hasMMX2 and a
+	// 32/64-bit register destination the dedicated short encoding is used.
+	void opExt(const Operand& op, const Mmx& mmx, int code, int imm, bool hasMMX2 = false)
+	{
+		const bool specialForm = hasMMX2 && op.isREG(i32e); /* pextrw is special */
+		if (!specialForm) {
+			opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, B00111010);
+			return;
+		}
+		if (mmx.isXMM()) db(0x66);
+		opModR(static_cast<const Reg&>(op), mmx, 0x0F, B11000101);
+		db(imm);
+	}
+	/*
+		One-operand instruction whose ModR/M reg field holds the opcode
+		extension ext. op must be a register accepted by isREG(bit) or a
+		memory operand, otherwise ERR_BAD_COMBINATION is thrown.
+		With disableRex a 64-bit operand is encoded as 32-bit.
+	*/
+	void opR_ModM(const Operand& op, int bit, int ext, int code0, int code1 = NONE, int code2 = NONE, bool disableRex = false)
+	{
+		const int declaredBit = op.getBit();
+		const int opBit = (disableRex && declaredBit == 64) ? 32 : declaredBit;
+		if (op.isREG(bit)) {
+			opModR(Reg(ext, Operand::REG, opBit), static_cast<const Reg&>(op).changeBit(opBit), code0, code1, code2);
+			return;
+		}
+		if (op.isMEM()) {
+			opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, opBit), code0, code1, code2);
+			return;
+		}
+		throw Error(ERR_BAD_COMBINATION);
+	}
+	// Shift/rotate by an immediate. A count of 1 selects the opcode without
+	// an immediate byte (bit 4 set); any other count is appended as one byte.
+	void opShift(const Operand& op, int imm, int ext)
+	{
+		verifyMemHasSize(op);
+		const bool byOne = (imm == 1);
+		const int opcode = byOne ? (B11000000 | (1 << 4)) : B11000000;
+		opR_ModM(op, 0, ext, opcode);
+		if (!byOne) db(imm);
+	}
+	// Shift/rotate by CL; any other 8-bit register throws ERR_BAD_COMBINATION.
+	void opShift(const Operand& op, const Reg8& cl, int ext)
+	{
+		if (cl.getIdx() != Operand::CL) {
+			throw Error(ERR_BAD_COMBINATION);
+		}
+		opR_ModM(op, 0, ext, B11010010);
+	}
+	// Dispatch to the register form when condR holds, else to the memory
+	// form when condM holds, else throw ERR_BAD_COMBINATION.
+	void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE)
+	{
+		if (condR) {
+			opModR(static_cast<const Reg&>(op1), static_cast<const Reg&>(op2), code0, code1, code2);
+			return;
+		}
+		if (!condM) {
+			throw Error(ERR_BAD_COMBINATION);
+		}
+		opModM(static_cast<const Address&>(op2), static_cast<const Reg&>(op1), code0, code1, code2);
+	}
+	// Double-precision shift. When cl is given it must be CL and the
+	// "by CL" opcode (code | 1) is used; otherwise imm is appended.
+	void opShxd(const Operand& op, const Reg& reg, uint8 imm, int code, const Reg8 *cl = 0)
+	{
+		if (cl && cl->getIdx() != Operand::CL) throw Error(ERR_BAD_COMBINATION);
+		const bool regForm = op.isREG(16 | i32e) && op.getBit() == reg.getBit();
+		const bool memForm = op.isMEM() && (reg.isREG(16 | i32e));
+		opModRM(reg, op, regForm, memForm, 0x0F, code | (cl ? 1 : 0));
+		if (!cl) db(imm);
+	}
+	// (REG, REG|MEM), (MEM, REG)
+	// The (REG, MEM) direction sets bit 1 of the opcode.
+	void opRM_RM(const Operand& op1, const Operand& op2, int code)
+	{
+		if (op1.isREG() && op2.isMEM()) {
+			opModM(static_cast<const Address&>(op2), static_cast<const Reg&>(op1), code | 2);
+			return;
+		}
+		const bool regForm = op1.isREG() && op1.getKind() == op2.getKind();
+		const bool memForm = op1.isMEM() && op2.isREG();
+		opModRM(op2, op1, regForm, memForm, code);
+	}
+	// (REG|MEM, IMM)
+	/*
+		Arithmetic/logic instruction with an immediate. The immediate is
+		emitted in the smallest width that is legal for the operand; register
+		index 0 (al/ax/eax/rax) uses the short accumulator opcode when the
+		widths match.
+		Throws ERR_IMM_IS_TOO_BIG when the immediate is wider than the operand.
+	*/
+	void opRM_I(const Operand& op, uint32 imm, int code, int ext)
+	{
+		verifyMemHasSize(op);
+		uint32 immBit;
+		if (inner::IsInDisp8(imm)) {
+			immBit = 8;
+		} else if (isInDisp16(imm)) {
+			immBit = 16;
+		} else {
+			immBit = 32;
+		}
+		if (op.isBit(8)) immBit = 8;
+		if (op.getBit() < immBit) throw Error(ERR_IMM_IS_TOO_BIG);
+		if (op.isBit(32|64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
+		const bool accumulatorForm = op.isREG() && op.getIdx() == 0 && (op.getBit() == immBit || (op.isBit(64) && immBit == 32)); // rax, eax, ax, al
+		if (accumulatorForm) {
+			rex(op);
+			db(code | 4 | (immBit == 8 ? 0 : 1));
+		} else {
+			// bit 1 marks an 8-bit immediate on a wider operand
+			const int shortImm = immBit < (std::min)(op.getBit(), 32U) ? 2 : 0;
+			opR_ModM(op, 0, ext, B10000000 | shortImm);
+		}
+		db(imm, immBit / 8);
+	}
+	// inc/dec. Outside XBYAK64 builds a non-8-bit register uses the one-byte
+	// form (code | register index); everything else goes through ModR/M
+	// with ext in the reg field.
+	void opIncDec(const Operand& op, int code, int ext)
+	{
+		verifyMemHasSize(op);
+#ifndef XBYAK64
+		if (op.isREG() && !op.isBit(8)) {
+			rex(op);
+			db(code | op.getIdx());
+			return;
+		}
+#endif
+		code = B11111110;
+		const Reg extReg(ext, Operand::REG, op.getBit());
+		if (op.isREG()) {
+			opModR(extReg, static_cast<const Reg&>(op), code);
+			return;
+		}
+		opModM(static_cast<const Address&>(op), extReg, code);
+	}
+	// push/pop. Registers use the one-byte form (alt | low 3 index bits),
+	// preceded by 0x66 for 16-bit and 0x41 for index >= 8; memory operands
+	// use code with ext in the reg field. Other operands throw
+	// ERR_BAD_COMBINATION.
+	void opPushPop(const Operand& op, int code, int ext, int alt)
+	{
+		if (op.isREG()) {
+			if (op.isBit(16)) db(0x66);
+			if (static_cast<const Reg&>(op).getIdx() >= 8) db(0x41);
+			db(alt | (op.getIdx() & 7));
+			return;
+		}
+		if (!op.isMEM()) {
+			throw Error(ERR_BAD_COMBINATION);
+		}
+		opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
+	}
+	// Reject memory operands whose width is 0 (ERR_MEM_SIZE_IS_NOT_SPECIFIED).
+	void verifyMemHasSize(const Operand& op) const
+	{
+		if (!op.isMEM()) return;
+		if (op.getBit() == 0) throw Error(ERR_MEM_SIZE_IS_NOT_SPECIFIED);
+	}
+	// movsx/movzx style move: the source must be narrower than the
+	// destination register and must not be 32-bit; bit 0 of the opcode is
+	// set for a 16-bit source.
+	void opMovxx(const Reg& reg, const Operand& op, uint8 code)
+	{
+		if (op.isBit(32)) throw Error(ERR_BAD_COMBINATION);
+		const int w = op.isBit(16) ? 1 : 0;
+#ifdef XBYAK64
+		if (op.isHigh8bit()) throw Error(ERR_BAD_COMBINATION);
+#endif
+		const bool widens = reg.isREG() && (reg.getBit() > op.getBit());
+		const bool regForm = widens && op.isREG();
+		const bool memForm = widens && op.isMEM();
+		opModRM(reg, op, regForm, memForm, 0x0F, code | w);
+	}
+	/*
+		x87 instruction with a memory operand. The opcode is picked by the
+		operand width (m16/m32/m64); a zero opcode means the width is not
+		supported (ERR_BAD_MEM_SIZE). m64ext, when non-zero, replaces ext for
+		64-bit operands.
+	*/
+	void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
+	{
+		if (addr.is64bitDisp()) throw Error(ERR_CANT_USE_64BIT_DISP);
+		uint8 code = 0;
+		if (addr.isBit(16)) {
+			code = m16;
+		} else if (addr.isBit(32)) {
+			code = m32;
+		} else if (addr.isBit(64)) {
+			code = m64;
+		}
+		if (!code) throw Error(ERR_BAD_MEM_SIZE);
+		if (m64ext && addr.isBit(64)) ext = m64ext;
+
+		rex(addr, st0);
+		db(code);
+		addr.updateRegField(ext);
+		opAddr(addr);
+	}
+	// use code1 if reg1 == st0
+	// use code2 if reg1 != st0 && reg2 == st0
+	// A resulting code of 0 throws ERR_BAD_ST_COMBINATION. The high byte of
+	// the code is emitted first, then the low byte merged with the indices.
+	void opFpuFpu(const Fpu& reg1, const Fpu& reg2, uint32 code1, uint32 code2)
+	{
+		uint32 code = 0;
+		if (reg1.getIdx() == 0) {
+			code = code1;
+		} else if (reg2.getIdx() == 0) {
+			code = code2;
+		}
+		if (!code) throw Error(ERR_BAD_ST_COMBINATION);
+		db(uint8(code >> 8));
+		db(uint8(code | (reg1.getIdx() | reg2.getIdx())));
+	}
+	// Two-byte x87 instruction; the register index is merged into code2.
+	void opFpu(const Fpu& reg, uint8 code1, uint8 code2)
+	{
+		db(code1);
+		db(code2 | reg.getIdx());
+	}
+	/*
+		Emit a VEX-encoded instruction: r goes into the ModR/M reg field, p1
+		(may be null, then index 0) into vvvv and p2 is the r/m operand.
+		w == -1 means "not fixed by the caller": it is taken from bit 2 of the
+		address REX byte in 64-bit builds and is 0 otherwise.
+	*/
+	void opVex(const Reg& r, const Operand *p1, const Operand *p2, int type, int code, int w)
+	{
+		const bool rmIsMem = p2->isMEM();
+		bool x, b;
+		if (rmIsMem) {
+			const Address& addr = static_cast<const Address&>(*p2);
+			const uint8 rex = addr.getRex();
+			x = (rex & 2) != 0;
+			b = (rex & 1) != 0;
+			if (BIT == 64 && addr.is32bit()) db(0x67);
+			if (BIT == 64 && w == -1) w = (rex & 4) ? 1 : 0;
+		} else {
+			x = false;
+			b = static_cast<const Reg&>(*p2).isExtIdx();
+		}
+		if (w == -1) w = 0;
+		const int vvvvIdx = p1 ? p1->getIdx() : 0;
+		vex(r.isExtIdx(), vvvvIdx, r.isYMM(), type, x, b, w);
+		db(code);
+		if (!rmIsMem) {
+			db(getModRM(3, r.getIdx(), p2->getIdx()));
+			return;
+		}
+		const Address& addr = static_cast<const Address&>(*p2);
+		addr.updateRegField(static_cast<uint8>(r.getIdx()));
+		opAddr(addr);
+	}
+	// (r, r, r/m) if isR_R_RM
+	// (r, r/m, r)
+	// All register operands must have the width of r, otherwise
+	// ERR_BAD_COMBINATION is thrown; w is set for 64-bit registers.
+	void opGpr(const Reg32e& r, const Operand& op1, const Operand& op2, int type, uint8 code, bool isR_R_RM)
+	{
+		const Operand *vvvvOp = isR_R_RM ? &op1 : &op2;
+		const Operand *rmOp = isR_R_RM ? &op2 : &op1;
+		const unsigned int bit = r.getBit();
+		const bool widthMismatch = vvvvOp->getBit() != bit || (rmOp->isREG() && rmOp->getBit() != bit);
+		if (widthMismatch) throw Error(ERR_BAD_COMBINATION);
+		const int w = (bit == 64) ? 1 : 0;
+		opVex(r, vvvvOp, rmOp, type, code, w);
+	}
+	/*
+		AVX instruction of the form (x1, x2, op). When op2 is "none" the
+		two-operand spelling was used and x1 doubles as x2 with op1 as the
+		r/m operand. Both vector registers must be XMM, or both YMM when
+		supportYMM is set; otherwise ERR_BAD_COMBINATION is thrown.
+	*/
+	void opAVX_X_X_XM(const Xmm& x1, const Operand& op1, const Operand& op2, int type, int code0, bool supportYMM, int w = -1)
+	{
+		const Xmm *x2 = &x1;
+		const Operand *op = &op1;
+		if (!op2.isNone()) {
+			const bool op1IsVector = op1.isXMM() || (supportYMM && op1.isYMM());
+			if (!op1IsVector) throw Error(ERR_BAD_COMBINATION);
+			x2 = static_cast<const Xmm*>(&op1);
+			op = &op2;
+		}
+		// (x1, x2, op)
+		const bool bothXmm = x1.isXMM() && x2->isXMM();
+		if (!(bothXmm || (supportYMM && x1.isYMM() && x2->isYMM()))) throw Error(ERR_BAD_COMBINATION);
+		opVex(x1, x2, op, type, code0, w);
+	}
+	// When cvt is set, op2 is replaced by a register of the requested kind
+	// (Xmm for Operand::XMM, Ymm otherwise) with op2's index; otherwise op2
+	// is passed through unchanged.
+	void opAVX_X_X_XMcvt(const Xmm& x1, const Operand& op1, const Operand& op2, bool cvt, Operand::Kind kind, int type, int code0, bool supportYMM, int w = -1)
+	{
+		if (!cvt) {
+			opAVX_X_X_XM(x1, op1, op2, type, code0, supportYMM, w);
+			return;
+		}
+		// bind the temporaries to const Operand& directly so no copy is made
+		if (kind == Operand::XMM) {
+			const Xmm converted(op2.getIdx());
+			opAVX_X_X_XM(x1, op1, static_cast<const Operand&>(converted), type, code0, supportYMM, w);
+		} else {
+			const Ymm converted(op2.getIdx());
+			opAVX_X_X_XM(x1, op1, static_cast<const Operand&>(converted), type, code0, supportYMM, w);
+		}
+	}
+	// support (x, x/m, imm), (y, y/m, imm)
+	// Register 0 of the matching kind is passed as the middle operand.
+	void opAVX_X_XM_IMM(const Xmm& x, const Operand& op, int type, int code, bool supportYMM, int w = -1, int imm = NONE)
+	{
+		if (x.isXMM()) {
+			opAVX_X_X_XM(x, xm0, op, type, code, supportYMM, w);
+		} else {
+			opAVX_X_X_XM(x, ym0, op, type, code, supportYMM, w);
+		}
+		if (imm != NONE) db((uint8)imm);
+	}
+	// QQQ:need to refactor
+	// Prefixed two-opcode instruction on (reg, reg|mem). 8-bit registers are
+	// rejected with ERR_BAD_SIZE_OF_REGISTER; 16-bit operands get a 0x66
+	// prefix emitted before pref.
+	void opSp1(const Reg& reg, const Operand& op, uint8 pref, uint8 code0, uint8 code1)
+	{
+		if (reg.isBit(8)) throw Error(ERR_BAD_SIZE_OF_REGISTER);
+		const bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
+		if (!is16bit) {
+			const bool is32or64 = reg.isREG(i32e) && (op.isREG(reg.getBit()) || op.isMEM());
+			if (!is32or64) throw Error(ERR_BAD_COMBINATION);
+		}
+		if (is16bit) db(0x66);
+		db(pref);
+		opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, code0, code1);
+	}
+	/*
+		Gather instruction with VSIB addressing. addr must be a VSIB address
+		(ERR_BAD_VSIB_ADDRESSING otherwise). Unless everything is XMM, the
+		register/index combination must match mode:
+		  0: (ymm, xmm index, ymm)   1: (ymm, ymm index, ymm)
+		  other: (xmm, ymm index, xmm)
+		The VSIB flag of addr is cleared for the duration of the encoding
+		call and set again afterwards.
+	*/
+	void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8 code, int w, int mode)
+	{
+		if (!addr.isVsib()) throw Error(ERR_BAD_VSIB_ADDRESSING);
+		const int y_vx_y = 0;
+		const int y_vy_y = 1;
+//		const int x_vy_x = 2;
+		const bool isAddrYMM = addr.isYMM();
+		const bool allXmm = x1.isXMM() && !isAddrYMM && x2.isXMM();
+		if (!allXmm) {
+			bool isOK;
+			switch (mode) {
+			case y_vx_y:
+				isOK = x1.isYMM() && !isAddrYMM && x2.isYMM();
+				break;
+			case y_vy_y:
+				isOK = x1.isYMM() && isAddrYMM && x2.isYMM();
+				break;
+			default: // x_vy_x
+				isOK = !x1.isYMM() && isAddrYMM && !x2.isYMM();
+				break;
+			}
+			if (!isOK) throw Error(ERR_BAD_VSIB_ADDRESSING);
+		}
+		addr.setVsib(false);
+		opAVX_X_X_XM(isAddrYMM ? Ymm(x1.getIdx()) : x1, isAddrYMM ? Ymm(x2.getIdx()) : x2, addr, type, code, true, w);
+		addr.setVsib(true);
+	}
+public:
+	// Library version constant (VERSION).
+	unsigned int getVersion() const
+	{
+		return VERSION;
+	}
+	using CodeArray::db;
+	const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+	const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+	const Ymm ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+	const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7;
+	const Ymm &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7;
+	const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi;
+	const Reg16 ax, cx, dx, bx, sp, bp, si, di;
+	const Reg8 al, cl, dl, bl, ah, ch, dh, bh;
+	const AddressFrame ptr, byte, word, dword, qword;
+	const Fpu st0, st1, st2, st3, st4, st5, st6, st7;
+#ifdef XBYAK64
+	const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15;
+	const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d;
+	const Reg16 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w;
+	const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b;
+	const Reg8 spl, bpl, sil, dil;
+	const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+	const Ymm ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+	const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15; // for my convenience
+	const Ymm &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15;
+	const RegRip rip;
+#endif
+	// Define a string label at the current position.
+	void L(const std::string& label)
+	{
+		labelMgr_.defineSlabel(label);
+	}
+	// Define a Label object at the current position.
+	void L(const Label& label)
+	{
+		labelMgr_.defineClabel(label);
+	}
+	/*
+		Assign src to dst.
+		Requirements:
+		dst : has not been defined by L()
+		src : has been defined by L()
+	*/
+	void assignL(Label& dst, const Label& src)
+	{
+		labelMgr_.assign(dst, src);
+	}
+	// Enter / leave a scope for local string labels.
+	void inLocalLabel()
+	{
+		labelMgr_.enterLocal();
+	}
+	void outLocalLabel()
+	{
+		labelMgr_.leaveLocal();
+	}
+	// jmp to a string label (short opcode 0xEB, near opcode 0xE9).
+	void jmp(std::string label, LabelType type = T_AUTO)
+	{
+		const uint8 shortCode = B11101011;
+		const uint8 nearCode = B11101001;
+		opJmp(label, type, shortCode, nearCode, 0);
+	}
+	// jmp to a Label object.
+	void jmp(const Label& label, LabelType type = T_AUTO)
+	{
+		const uint8 shortCode = B11101011;
+		const uint8 nearCode = B11101001;
+		opJmp(label, type, shortCode, nearCode, 0);
+	}
+	// jmp to a string label given as a C string.
+	void jmp(const char *label, LabelType type = T_AUTO)
+	{
+		jmp(std::string(label), type);
+	}
+	// jmp to an absolute address.
+	void jmp(const void *addr, LabelType type = T_AUTO)
+	{
+		const uint8 shortCode = B11101011;
+		const uint8 nearCode = B11101001;
+		opJmpAbs(addr, type, shortCode, nearCode);
+	}
+	// Indirect jmp through a native-width register or a memory operand
+	// (opcode 0xFF, extension 4).
+	void jmp(const Operand& op)
+	{
+		const int ext = 4;
+		opR_ModM(op, BIT, ext, 0xFF, NONE, NONE, true);
+	}
+	// Indirect call through a register or a memory operand
+	// (opcode 0xFF, extension 2).
+	void call(const Operand& op)
+	{
+		const int ext = 2;
+		opR_ModM(op, 16 | i32e, ext, 0xFF, NONE, NONE, true);
+	}
+	// (REG|MEM, REG)
+	void test(const Operand& op, const Reg& reg)
+	{
+		const bool regForm = op.isREG() && (op.getKind() == reg.getKind());
+		opModRM(reg, op, regForm, op.isMEM(), B10000100);
+	}
+	// (REG|MEM, IMM)
+	// Register index 0 (al, ax, eax) uses the short accumulator opcode; the
+	// immediate is written in the operand's width, capped at 4 bytes.
+	void test(const Operand& op, uint32 imm)
+	{
+		verifyMemHasSize(op);
+		const bool accumulator = op.isREG() && op.getIdx() == 0; // al, ax, eax
+		if (!accumulator) {
+			opR_ModM(op, 0, 0, B11110110);
+		} else {
+			rex(op);
+			db(B10101000 | (op.isBit(8) ? 0 : 1));
+		}
+		db(imm, (std::min)(op.getBit() / 8, 4U));
+	}
+	// ret; a non-zero imm selects the form followed by a 16-bit immediate.
+	void ret(int imm = 0)
+	{
+		if (imm == 0) {
+			db(B11000011);
+			return;
+		}
+		db(B11000010);
+		dw(imm);
+	}
+	// (REG16|REG32, REG16|REG32|MEM)
+	void imul(const Reg& reg, const Operand& op)
+	{
+		const bool regForm = op.isREG() && (reg.getKind() == op.getKind());
+		opModRM(reg, op, regForm, op.isMEM(), 0x0F, B10101111);
+	}
+	// Three-operand imul. An immediate that fits in 8 bits sets bit 1 of the
+	// opcode and is written as one byte; otherwise it takes 2 bytes for a
+	// 16-bit register and 4 bytes in all other cases.
+	void imul(const Reg& reg, const Operand& op, int imm)
+	{
+		const bool fitsImm8 = inner::IsInDisp8(imm);
+		const bool regForm = op.isREG() && (reg.getKind() == op.getKind());
+		opModRM(reg, op, regForm, op.isMEM(), B01101001 | (fitsImm8 ? 2 : 0));
+		int size;
+		if (fitsImm8) {
+			size = 1;
+		} else if (reg.isREG(16)) {
+			size = 2;
+		} else {
+			size = 4;
+		}
+		db(imm, size);
+	}
+	// pop into a register or memory operand.
+	void pop(const Operand& op)
+	{
+		opPushPop(op, B10001111, 0, B01011000);
+	}
+	// push a register or memory operand.
+	void push(const Operand& op)
+	{
+		opPushPop(op, B11111111, 6, B01010000);
+	}
+	// push an immediate with an explicit size hint: the 8-bit and 16-bit
+	// forms are used only when the frame asks for them and the value fits;
+	// every other case pushes a 32-bit immediate.
+	void push(const AddressFrame& af, uint32 imm)
+	{
+		if (af.bit_ == 8 && inner::IsInDisp8(imm)) {
+			db(B01101010);
+			db(imm);
+			return;
+		}
+		if (af.bit_ == 16 && isInDisp16(imm)) {
+			db(0x66);
+			db(B01101000);
+			dw(imm);
+			return;
+		}
+		db(B01101000);
+		dd(imm);
+	}
+	/* use "push(word, 4)" if you want "push word 4" */
+	// Picks the byte form when imm fits in 8 bits, the dword form otherwise.
+	void push(uint32 imm)
+	{
+		const AddressFrame& frame = inner::IsInDisp8(imm) ? byte : dword;
+		push(frame, imm);
+	}
+	// bswap: encoded through opModR with a 32-bit register of index 1 in
+	// the ModR/M reg field.
+	void bswap(const Reg32e& reg)
+	{
+		const Reg32 regField(1);
+		opModR(regField, reg, 0x0F);
+	}
+	/*
+		mov between registers and memory. Register index 0 (al/ax/eax/rax)
+		combined with a memory operand is a candidate for the short
+		accumulator opcodes:
+		- XBYAK64: used for addresses with a 64-bit displacement; such an
+		  address with any other register throws ERR_BAD_COMBINATION;
+		- otherwise: used when the address is a bare displacement.
+		Everything else is encoded through opRM_RM.
+	*/
+	void mov(const Operand& reg1, const Operand& reg2)
+	{
+		const Reg *reg = 0;
+		const Address *addr = 0;
+		uint8 code = 0;
+		if (reg1.isREG() && reg1.getIdx() == 0 && reg2.isMEM()) { // mov eax|ax|al, [disp]
+			reg = &static_cast<const Reg&>(reg1);
+			addr = &static_cast<const Address&>(reg2);
+			code = B10100000;
+		} else if (reg1.isMEM() && reg2.isREG() && reg2.getIdx() == 0) { // mov [disp], eax|ax|al
+			reg = &static_cast<const Reg&>(reg2);
+			addr = &static_cast<const Address&>(reg1);
+			code = B10100010;
+		}
+#ifdef XBYAK64
+		if (addr && addr->is64bitDisp()) {
+			if (!code) {
+				throw Error(ERR_BAD_COMBINATION);
+			}
+			rex(*reg);
+			db(reg1.isREG(8) ? 0xA0 : reg1.isREG() ? 0xA1 : reg2.isREG(8) ? 0xA2 : 0xA3);
+			db(addr->getDisp(), 8);
+		} else
+#else
+		if (code && addr->isOnlyDisp()) {
+			rex(*reg, *addr);
+			db(code | (reg->isBit(8) ? 0 : 1));
+			dd(static_cast<uint32>(addr->getDisp()));
+		} else
+#endif
+		{
+			opRM_RM(reg1, reg2, B10001000);
+		}
+	}
+private:
+	/*
+		mov(r, imm) = db(imm, mov_imm(r, imm))
+		Emits prefix and opcode for "mov reg, imm" and returns the number of
+		immediate bytes the caller still has to write. For a 64-bit register
+		the 32-bit encoding is used when the upper 32 bits of imm are zero,
+		and the 0xC7-based encoding with a 32-bit immediate when imm passes
+		IsInInt32.
+	*/
+	int mov_imm(const Reg& reg, size_t imm)
+	{
+		int immBit = reg.getBit();
+		const int idx = reg.getIdx();
+		int opcode = B10110000 | ((immBit == 8 ? 0 : 1) << 3);
+		const bool upperHalfIsZero = (imm & ~size_t(0xffffffffu)) == 0;
+		if (immBit == 64 && upperHalfIsZero) {
+			rex(Reg32(idx));
+			immBit = 32;
+		} else {
+			rex(reg);
+			if (immBit == 64 && inner::IsInInt32(imm)) {
+				db(B11000111);
+				opcode = B11000000;
+				immBit = 32;
+			}
+		}
+		db(opcode | (idx & 7));
+		return immBit / 8;
+	}
+	/*
+		Write a reference to label at the current position.
+		relative: a 4-byte displacement (plus disp) measured from the end of
+		the field; otherwise a pointer-sized absolute address.
+		Unknown labels get a zero placeholder and are queued in the label
+		manager; in auto-grow mode absolute values are recorded via save().
+	*/
+	template<class T>
+	void putL_inner(T& label, bool relative = false, size_t disp = 0)
+	{
+		const int jmpSize = relative ? 4 : (int)sizeof(size_t);
+		if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory();
+		size_t offset = 0;
+		if (!labelMgr_.getOffset(&offset, label)) {
+			db(uint64(0), jmpSize);
+			const inner::LabelMode mode = relative ? inner::LasIs : isAutoGrow() ? inner::LaddTop : inner::Labs;
+			const JmpLabel pending(size_, jmpSize, mode, disp);
+			labelMgr_.addUndefinedLabel(label, pending);
+			return;
+		}
+		if (relative) {
+			db(inner::VerifyInInt32(offset + disp - size_ - jmpSize), jmpSize);
+			return;
+		}
+		if (isAutoGrow()) {
+			db(uint64(0), jmpSize);
+			save(size_ - jmpSize, offset, jmpSize, inner::LaddTop);
+			return;
+		}
+		db(size_t(top_) + offset, jmpSize);
+	}
+public:
+	// mov an immediate into a register or a sized memory operand. For
+	// memory the immediate is truncated to 32 bits and at most 4 bytes are
+	// written. Other operand kinds throw ERR_BAD_COMBINATION.
+	void mov(const Operand& op, size_t imm)
+	{
+		verifyMemHasSize(op);
+		if (op.isREG()) {
+			const int size = mov_imm(static_cast<const Reg&>(op), imm);
+			db(imm, size);
+			return;
+		}
+		if (!op.isMEM()) {
+			throw Error(ERR_BAD_COMBINATION);
+		}
+		opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110);
+		int size = op.getBit() / 8;
+		if (size > 4) size = 4;
+		db(static_cast<uint32>(imm), size);
+	}
+	// Load the address of a string label into a native-width register.
+	// A null pointer is treated as the immediate 0.
+	void mov(const NativeReg& reg, const char *label) // can't use std::string
+	{
+		if (label != 0) {
+			mov_imm(reg, dummyAddr);
+			putL(label);
+			return;
+		}
+		mov(static_cast<const Operand&>(reg), 0); // call imm
+	}
+	// Load the address of a Label object into a native-width register.
+	// dummyAddr only selects the opcode; the address itself is written by putL.
+	void mov(const NativeReg& reg, const Label& label)
+	{
+		mov_imm(reg, dummyAddr);
+		putL(label);
+	}
+	void movbe(const Reg& reg, const Address& addr) { opModM(addr, reg, 0x0F, 0x38, 0xF0); }
+	void movbe(const Address& addr, const Reg& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF1); }
+	/*
+		Write the absolute address of a label into the code buffer at the current position.
+		@note the written field is sizeof(size_t) bytes: 4 on 32-bit builds, 8 on 64-bit builds
+	*/
+	void putL(std::string label) { putL_inner(label); } // taken by value: putL_inner needs a non-const lvalue (T&)
+	void putL(const Label& label) { putL_inner(label); } // same operation for a Label object
+	void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } // opcode 0xF6 with prefix 0x66; no immediate (NONE)
+	void adox(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0xF3, isREG32_REG32orMEM, NONE, 0x38); } // same as adcx except the prefix byte is 0xF3
+	void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B11000111); } // memory operand only; opcode 0F C7 with register field 1
+#ifdef XBYAK64
+	void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, B11000111); } // 64-bit builds only: same opcode, but passes a Reg64 instead of a Reg32
+#endif
+	void xadd(const Operand& op, const Reg& reg) // xadd op, reg: op is a memory operand or a register of the same width as reg
+	{
+		opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, B11000000 | (reg.isBit(8) ? 0 : 1)); // opcode 0F C0 for 8-bit reg, 0F C1 otherwise
+	}
+	void cmpxchg(const Operand& op, const Reg& reg) // cmpxchg op, reg: op is a memory operand or a register of the same width as reg
+	{
+		opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, 0xb0 | (reg.isBit(8) ? 0 : 1)); // opcode 0F B0 for 8-bit reg, 0F B1 otherwise
+	}
+	void xchg(const Operand& op1, const Operand& op2) // exchange two operands; uses the one-byte 0x90+idx form when one side is a 16/32(/64)-bit index-0 register
+	{
+		const Operand *p1 = &op1, *p2 = &op2;
+		if (p1->isMEM() || (p2->isREG(16 | i32e) && p2->getIdx() == 0)) { // normalise: move a memory operand to p2, or an index-0 register to p1
+			p1 = &op2; p2 = &op1;
+		}
+		if (p1->isMEM()) throw Error(ERR_BAD_COMBINATION); // still memory after the swap means both operands were memory
+		if (p2->isREG() && (p1->isREG(16 | i32e) && p1->getIdx() == 0)
+#ifdef XBYAK64
+			&& (p2->getIdx() != 0 || !p1->isREG(32)) // 64-bit builds: skip the short form for the 32-bit index-0/index-0 pair, which would be the bare byte 0x90
+#endif
+		) {
+			rex(*p2, *p1); db(0x90 | (p2->getIdx() & 7)); // short form: 0x90 plus the low three bits of the other register's index
+			return;
+		}
+		opModRM(*p1, *p2, (p1->isREG() && p2->isREG() && (p1->getBit() == p2->getBit())), p2->isMEM(), B10000110 | (p1->isBit(8) ? 0 : 1)); // general form: opcode 0x86 (8-bit) or 0x87
+	}
+	void call(std::string label) { opJmp(label, T_NEAR, 0, B11101000, 0); } // near call to a string-named label, opcode 0xE8
+	// call(string label) given as a C string; forwards to the std::string overload
+	void call(const char *label) { call(std::string(label)); }
+	void call(const Label& label) { opJmp(label, T_NEAR, 0, B11101000, 0); } // near call to a Label object, opcode 0xE8
+	// call(function pointer): casts the pointer to const void* and forwards to call(const void*)
+#ifdef XBYAK_VARIADIC_TEMPLATE
+	template<class Ret, class... Params>
+	void call(Ret(*func)(Params...)) { call(CastTo<const void*>(func)); }
+#endif
+	void call(const void *addr) { opJmpAbs(addr, T_NEAR, 0, B11101000); } // near call to an absolute address, opcode 0xE8
+	// special case
+	void movd(const Address& addr, const Mmx& mmx) // movd mem, mmx/xmm: opcode 0F 7E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModM(addr, mmx, 0x0F, B01111110);
+	}
+	void movd(const Reg32& reg, const Mmx& mmx) // movd reg32, mmx/xmm: opcode 0F 7E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModR(mmx, reg, 0x0F, B01111110);
+	}
+	void movd(const Mmx& mmx, const Address& addr) // movd mmx/xmm, mem: opcode 0F 6E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModM(addr, mmx, 0x0F, B01101110);
+	}
+	void movd(const Mmx& mmx, const Reg32& reg) // movd mmx/xmm, reg32: opcode 0F 6E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModR(mmx, reg, 0x0F, B01101110);
+	}
+	void movq2dq(const Xmm& xmm, const Mmx& mmx) // movq2dq xmm, mmx
+	{
+		db(0xF3); opModR(xmm, mmx, 0x0F, B11010110); // prefix F3, then opcode 0F D6
+	}
+	void movdq2q(const Mmx& mmx, const Xmm& xmm) // movdq2q mmx, xmm
+	{
+		db(0xF2); opModR(mmx, xmm, 0x0F, B11010110); // prefix F2, then opcode 0F D6
+	}
+	void movq(const Mmx& mmx, const Operand& op) // movq mmx/xmm, op: op is memory or a register of the same kind as mmx
+	{
+		if (mmx.isXMM()) db(0xF3); // XMM form takes an F3 prefix
+		opModRM(mmx, op, (mmx.getKind() == op.getKind()), op.isMEM(), 0x0F, mmx.isXMM() ? B01111110 : B01101111); // opcode 0F 7E for XMM, 0F 6F for MMX
+	}
+	void movq(const Address& addr, const Mmx& mmx) // movq mem, mmx/xmm
+	{
+		if (mmx.isXMM()) db(0x66); // XMM form takes a 66 prefix
+		opModM(addr, mmx, 0x0F, mmx.isXMM() ? B11010110 : B01111111); // opcode 0F D6 for XMM, 0F 7F for MMX
+	}
+#ifdef XBYAK64
+	void movq(const Reg64& reg, const Mmx& mmx) // movq reg64, mmx/xmm (64-bit builds only): opcode 0F 7E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModR(mmx, reg, 0x0F, B01111110);
+	}
+	void movq(const Mmx& mmx, const Reg64& reg) // movq mmx/xmm, reg64 (64-bit builds only): opcode 0F 6E
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the register is an XMM register
+		opModR(mmx, reg, 0x0F, B01101110);
+	}
+	void pextrq(const Operand& op, const Xmm& xmm, uint8 imm) // pextrq reg64/mem, xmm, imm
+	{
+		if (!op.isREG(64) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); // destination must be a 64-bit register or memory
+		opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, B00111010); // force to 64bit: the xmm index is passed as a Reg64 (presumably so the 64-bit operand size is encoded — confirm)
+	}
+	void pinsrq(const Xmm& xmm, const Operand& op, uint8 imm) // pinsrq xmm, reg64/mem, imm
+	{
+		if (!op.isREG(64) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); // source must be a 64-bit register or memory
+		opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, B00111010); // force to 64bit: the xmm index is passed as a Reg64 (presumably so the 64-bit operand size is encoded — confirm)
+	}
+	void movsxd(const Reg64& reg, const Operand& op) // movsxd reg64, reg32/mem32
+	{
+		if (!op.isBit(32)) throw Error(ERR_BAD_COMBINATION); // source operand must be 32 bits wide
+		opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); // single-byte opcode 0x63
+	}
+#endif
+	// MMX2 : pextrw : reg, mmx/xmm, imm
+	// SSE4 : pextrw, pextrb, pextrd, extractps : reg/mem, mmx/xmm, imm
+	void pextrw(const Operand& op, const Mmx& xmm, uint8 imm) { opExt(op, xmm, 0x15, imm, true); } // only this one accepts an Mmx and passes the extra `true` flag to opExt
+	void pextrb(const Operand& op, const Xmm& xmm, uint8 imm) { opExt(op, xmm, 0x14, imm); } // opExt code 0x14
+	void pextrd(const Operand& op, const Xmm& xmm, uint8 imm) { opExt(op, xmm, 0x16, imm); } // opExt code 0x16
+	void extractps(const Operand& op, const Xmm& xmm, uint8 imm) { opExt(op, xmm, 0x17, imm); } // opExt code 0x17
+	void pinsrw(const Mmx& mmx, const Operand& op, int imm) // pinsrw mmx/xmm, reg32/mem, imm
+	{
+		if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); // source must be a 32-bit register or memory
+		opGen(mmx, op, B11000100, mmx.isXMM() ? 0x66 : NONE, 0, imm); // opcode 0xC4; prefix 0x66 for XMM, no prefix (NONE) for MMX
+	}
+	void insertps(const Xmm& xmm, const Operand& op, uint8 imm) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, imm, B00111010); } // code 0x21; operands checked by isXMM_XMMorMEM
+	void pinsrb(const Xmm& xmm, const Operand& op, uint8 imm) { opGen(xmm, op, 0x20, 0x66, isXMM_REG32orMEM, imm, B00111010); } // code 0x20; operands checked by isXMM_REG32orMEM
+	void pinsrd(const Xmm& xmm, const Operand& op, uint8 imm) { opGen(xmm, op, 0x22, 0x66, isXMM_REG32orMEM, imm, B00111010); } // code 0x22; operands checked by isXMM_REG32orMEM
+
+	void pmovmskb(const Reg32e& reg, const Mmx& mmx) // pmovmskb reg, mmx/xmm: opcode 0F D7
+	{
+		if (mmx.isXMM()) db(0x66); // prefix 0x66 only when the source is an XMM register
+		opModR(reg, mmx, 0x0F, B11010111);
+	}
+	void maskmovq(const Mmx& reg1, const Mmx& reg2) // maskmovq mmx, mmx: opcode 0F F7
+	{
+		if (!reg1.isMMX() || !reg2.isMMX()) throw Error(ERR_BAD_COMBINATION); // both operands must be MMX registers (no XMM form here)
+		opModR(reg1, reg2, 0x0F, B11110111);
+	}
+	void lea(const Reg32e& reg, const Address& addr) { opModM(addr, reg, B10001101); } // lea reg, mem: single-byte opcode 0x8D
+
+	void movmskps(const Reg32e& reg, const Xmm& xmm) { opModR(reg, xmm, 0x0F, B01010000); } // opcode 0F 50
+	void movmskpd(const Reg32e& reg, const Xmm& xmm) { db(0x66); movmskps(reg, xmm); } // prefix 0x66 followed by the movmskps encoding
+	void movntps(const Address& addr, const Xmm& xmm) { opModM(addr, Mmx(xmm.getIdx()), 0x0F, B00101011); } // opcode 0F 2B; the xmm index is passed as an Mmx (NOTE(review): reason not visible here — confirm)
+	void movntdqa(const Xmm& xmm, const Address& addr) { db(0x66); opModM(addr, xmm, 0x0F, 0x38, 0x2A); } // prefix 66, opcode 0F 38 2A
+	void lddqu(const Xmm& xmm, const Address& addr) { db(0xF2); opModM(addr, xmm, 0x0F, B11110000); } // prefix F2, opcode 0F F0
+	void movnti(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, B11000011); } // opcode 0F C3
+	void movntq(const Address& addr, const Mmx& mmx) // movntq mem, mmx: opcode 0F E7
+	{
+		if (!mmx.isMMX()) throw Error(ERR_BAD_COMBINATION); // MMX registers only
+		opModM(addr, mmx, 0x0F, B11100111);
+	}
+	void crc32(const Reg32e& reg, const Operand& op) // crc32 reg, reg/mem
+	{
+		if (reg.isBit(32) && op.isBit(16)) db(0x66); // prefix 0x66 only for a 16-bit source with a 32-bit destination
+		db(0xF2); // prefix F2 is always emitted
+		opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1)); // opcode 0F 38 F0 for 8-bit sources, 0F 38 F1 otherwise
+	}
+	void rdrand(const Reg& r) { if (r.isBit(8)) throw Error(ERR_BAD_SIZE_OF_REGISTER); opModR(Reg(6, Operand::REG, r.getBit()), r, 0x0f, 0xc7); } // rejects 8-bit registers; opcode 0F C7 with register field 6
+	void rdseed(const Reg& r) { if (r.isBit(8)) throw Error(ERR_BAD_SIZE_OF_REGISTER); opModR(Reg(7, Operand::REG, r.getBit()), r, 0x0f, 0xc7); } // rejects 8-bit registers; opcode 0F C7 with register field 7
+	void rorx(const Reg32e& r, const Operand& op, uint8 imm) { opGpr(r, op, Reg32e(0, r.getBit()), MM_0F3A | PP_F2, 0xF0, false); db(imm); } // encoded via opGpr with an index-0 placeholder register; the immediate byte is appended afterwards
+	enum { NONE = 256 }; // passed where an optional prefix/immediate byte is absent; 256 lies outside the 0..255 byte range
+	CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0, Allocator *allocator = 0) // builds the code buffer and initialises every register/address-frame member
+		: CodeArray(maxSize, userPtr, allocator) // all three arguments are forwarded unchanged to the CodeArray base
+		, mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7)
+		, xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7)
+		, ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7)
+		, xm0(xmm0), xm1(xmm1), xm2(xmm2), xm3(xmm3), xm4(xmm4), xm5(xmm5), xm6(xmm6), xm7(xmm7) // for my convenience: short names initialised from xmm0..xmm7
+		, ym0(ymm0), ym1(ymm1), ym2(ymm2), ym3(ymm3), ym4(ymm4), ym5(ymm5), ym6(ymm6), ym7(ymm7) // for my convenience: short names initialised from ymm0..ymm7
+		, eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI)
+		, ax(Operand::AX), cx(Operand::CX), dx(Operand::DX), bx(Operand::BX), sp(Operand::SP), bp(Operand::BP), si(Operand::SI), di(Operand::DI)
+		, al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH)
+		, ptr(0), byte(8), word(16), dword(32), qword(64) // address frames constructed with their width in bits; ptr uses 0
+		, st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7)
+#ifdef XBYAK64
+		, rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15)
+		, r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D)
+		, r8w(Operand::R8W), r9w(Operand::R9W), r10w(Operand::R10W), r11w(Operand::R11W), r12w(Operand::R12W), r13w(Operand::R13W), r14w(Operand::R14W), r15w(Operand::R15W)
+		, r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B)
+		, spl(Operand::SPL, true), bpl(Operand::BPL, true), sil(Operand::SIL, true), dil(Operand::DIL, true) // these four pass an extra `true` argument (NOTE(review): meaning not visible here — confirm)
+		, xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15)
+		, ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15)
+		, xm8(xmm8), xm9(xmm9), xm10(xmm10), xm11(xmm11), xm12(xmm12), xm13(xmm13), xm14(xmm14), xm15(xmm15) // for my convenience: short names initialised from xmm8..xmm15
+		, ym8(ymm8), ym9(ymm9), ym10(ymm10), ym11(ymm11), ym12(ymm12), ym13(ymm13), ym14(ymm14), ym15(ymm15) // for my convenience: short names initialised from ymm8..ymm15
+		, rip()
+#endif
+	{
+		labelMgr_.set(this); // give the label manager a pointer back to this generator
+	}
+	void reset() // return the generator to an empty state so it can be reused
+	{
+		resetSize(); // reset the code buffer size
+		labelMgr_.reset(); // reset the label manager's state
+		labelMgr_.set(this); // re-attach the label manager to this generator, as the constructor does
+	}
+	bool hasUndefinedLabel() const { return labelMgr_.hasUndefSlabel() || labelMgr_.hasUndefClabel(); } // true if either label-manager check reports an undefined label (Slabel/Clabel presumably = string-named vs. Label-object — confirm)
+	/*
+		Call ready() after emitting all code in AutoGrow mode to complete code generation.
+	*/
+	void ready()
+	{
+		if (hasUndefinedLabel()) throw Error(ERR_LABEL_IS_NOT_FOUND); // refuse to finish while any referenced label is still undefined
+		calcJmpAddress(); // presumably patches the address fields recorded during emission — confirm
+	}
+#ifdef XBYAK_TEST
+	void dump(bool doClear = true) // test-only helper (compiled under XBYAK_TEST): dump the buffer, then optionally empty it
+	{
+		CodeArray::dump();
+		if (doClear) size_ = 0; // discard the dumped code so the next emission starts at offset 0
+	}
+#endif
+
+#ifndef XBYAK_DONT_READ_LIST
+#include "xbyak_mnemonic.h"
+	void align(int x = 16) // pad with nop() until the current address is a multiple of x
+	{
+		if (x == 1) return; // every address is already 1-aligned
+		if (x < 1 || (x & (x - 1))) throw Error(ERR_BAD_ALIGN); // x must be a positive power of two
+		if (isAutoGrow() && x > (int)inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", x); // warning only: padding still proceeds below
+		while (size_t(getCurr()) % x) { // tests the actual address returned by getCurr(), not the offset within the buffer
+			nop();
+		}
+	}
+#endif
+};
+
+namespace util { // namespace-scope constants with the same names and initialisers as the register members set up in the CodeGenerator constructor
+static const Mmx mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7);
+static const Xmm xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7);
+static const Ymm ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7);
+static const Reg32 eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI);
+static const Reg16 ax(Operand::AX), cx(Operand::CX), dx(Operand::DX), bx(Operand::BX), sp(Operand::SP), bp(Operand::BP), si(Operand::SI), di(Operand::DI);
+static const Reg8 al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH);
+static const AddressFrame ptr(0), byte(8), word(16), dword(32), qword(64); // constructed with their width in bits; ptr uses 0
+static const Fpu st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7);
+#ifdef XBYAK64
+static const Reg64 rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15);
+static const Reg32 r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D);
+static const Reg16 r8w(Operand::R8W), r9w(Operand::R9W), r10w(Operand::R10W), r11w(Operand::R11W), r12w(Operand::R12W), r13w(Operand::R13W), r14w(Operand::R14W), r15w(Operand::R15W);
+static const Reg8 r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B), spl(Operand::SPL, 1), bpl(Operand::BPL, 1), sil(Operand::SIL, 1), dil(Operand::DIL, 1); // spl/bpl/sil/dil pass 1 where the constructor passes `true`
+static const Xmm xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15);
+static const Ymm ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15);
+static const RegRip rip;
+#endif
+} // util
+
+#ifdef _MSC_VER
+	#pragma warning(pop)
+#endif
+
+} // end of namespace
+
+#endif // XBYAK_XBYAK_H_
diff --git a/plugins/GSdx_legacy/xbyak/xbyak_bin2hex.h b/plugins/GSdx_legacy/xbyak/xbyak_bin2hex.h
new file mode 100644
index 0000000000..1eb447f4db
--- /dev/null
+++ b/plugins/GSdx_legacy/xbyak/xbyak_bin2hex.h
@@ -0,0 +1,286 @@
+/* Copyright (c) 2007 MITSUNARI Shigeo
+* All rights reserved.
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+* 
+* Redistributions of source code must retain the above copyright notice, this
+* list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+* Neither the name of the copyright owner nor the names of its contributors may
+* be used to endorse or promote products derived from this software without
+* specific prior written permission.
+* 
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+* THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+enum { // each Bxxxxxxxx name spells its 8-bit value in binary (B00000000 = 0 ... B11111111 = 255)
+	B00000000= 0,
+	B00000001= 1,
+	B00000010= 2,
+	B00000011= 3,
+	B00000100= 4,
+	B00000101= 5,
+	B00000110= 6,
+	B00000111= 7,
+	B00001000= 8,
+	B00001001= 9,
+	B00001010= 10,
+	B00001011= 11,
+	B00001100= 12,
+	B00001101= 13,
+	B00001110= 14,
+	B00001111= 15,
+	B00010000= 16,
+	B00010001= 17,
+	B00010010= 18,
+	B00010011= 19,
+	B00010100= 20,
+	B00010101= 21,
+	B00010110= 22,
+	B00010111= 23,
+	B00011000= 24,
+	B00011001= 25,
+	B00011010= 26,
+	B00011011= 27,
+	B00011100= 28,
+	B00011101= 29,
+	B00011110= 30,
+	B00011111= 31,
+	B00100000= 32,
+	B00100001= 33,
+	B00100010= 34,
+	B00100011= 35,
+	B00100100= 36,
+	B00100101= 37,
+	B00100110= 38,
+	B00100111= 39,
+	B00101000= 40,
+	B00101001= 41,
+	B00101010= 42,
+	B00101011= 43,
+	B00101100= 44,
+	B00101101= 45,
+	B00101110= 46,
+	B00101111= 47,
+	B00110000= 48,
+	B00110001= 49,
+	B00110010= 50,
+	B00110011= 51,
+	B00110100= 52,
+	B00110101= 53,
+	B00110110= 54,
+	B00110111= 55,
+	B00111000= 56,
+	B00111001= 57,
+	B00111010= 58,
+	B00111011= 59,
+	B00111100= 60,
+	B00111101= 61,
+	B00111110= 62,
+	B00111111= 63,
+	B01000000= 64,
+	B01000001= 65,
+	B01000010= 66,
+	B01000011= 67,
+	B01000100= 68,
+	B01000101= 69,
+	B01000110= 70,
+	B01000111= 71,
+	B01001000= 72,
+	B01001001= 73,
+	B01001010= 74,
+	B01001011= 75,
+	B01001100= 76,
+	B01001101= 77,
+	B01001110= 78,
+	B01001111= 79,
+	B01010000= 80,
+	B01010001= 81,
+	B01010010= 82,
+	B01010011= 83,
+	B01010100= 84,
+	B01010101= 85,
+	B01010110= 86,
+	B01010111= 87,
+	B01011000= 88,
+	B01011001= 89,
+	B01011010= 90,
+	B01011011= 91,
+	B01011100= 92,
+	B01011101= 93,
+	B01011110= 94,
+	B01011111= 95,
+	B01100000= 96,
+	B01100001= 97,
+	B01100010= 98,
+	B01100011= 99,
+	B01100100= 100,
+	B01100101= 101,
+	B01100110= 102,
+	B01100111= 103,
+	B01101000= 104,
+	B01101001= 105,
+	B01101010= 106,
+	B01101011= 107,
+	B01101100= 108,
+	B01101101= 109,
+	B01101110= 110,
+	B01101111= 111,
+	B01110000= 112,
+	B01110001= 113,
+	B01110010= 114,
+	B01110011= 115,
+	B01110100= 116,
+	B01110101= 117,
+	B01110110= 118,
+	B01110111= 119,
+	B01111000= 120,
+	B01111001= 121,
+	B01111010= 122,
+	B01111011= 123,
+	B01111100= 124,
+	B01111101= 125,
+	B01111110= 126,
+	B01111111= 127,
+	B10000000= 128,
+	B10000001= 129,
+	B10000010= 130,
+	B10000011= 131,
+	B10000100= 132,
+	B10000101= 133,
+	B10000110= 134,
+	B10000111= 135,
+	B10001000= 136,
+	B10001001= 137,
+	B10001010= 138,
+	B10001011= 139,
+	B10001100= 140,
+	B10001101= 141,
+	B10001110= 142,
+	B10001111= 143,
+	B10010000= 144,
+	B10010001= 145,
+	B10010010= 146,
+	B10010011= 147,
+	B10010100= 148,
+	B10010101= 149,
+	B10010110= 150,
+	B10010111= 151,
+	B10011000= 152,
+	B10011001= 153,
+	B10011010= 154,
+	B10011011= 155,
+	B10011100= 156,
+	B10011101= 157,
+	B10011110= 158,
+	B10011111= 159,
+	B10100000= 160,
+	B10100001= 161,
+	B10100010= 162,
+	B10100011= 163,
+	B10100100= 164,
+	B10100101= 165,
+	B10100110= 166,
+	B10100111= 167,
+	B10101000= 168,
+	B10101001= 169,
+	B10101010= 170,
+	B10101011= 171,
+	B10101100= 172,
+	B10101101= 173,
+	B10101110= 174,
+	B10101111= 175,
+	B10110000= 176,
+	B10110001= 177,
+	B10110010= 178,
+	B10110011= 179,
+	B10110100= 180,
+	B10110101= 181,
+	B10110110= 182,
+	B10110111= 183,
+	B10111000= 184,
+	B10111001= 185,
+	B10111010= 186,
+	B10111011= 187,
+	B10111100= 188,
+	B10111101= 189,
+	B10111110= 190,
+	B10111111= 191,
+	B11000000= 192,
+	B11000001= 193,
+	B11000010= 194,
+	B11000011= 195,
+	B11000100= 196,
+	B11000101= 197,
+	B11000110= 198,
+	B11000111= 199,
+	B11001000= 200,
+	B11001001= 201,
+	B11001010= 202,
+	B11001011= 203,
+	B11001100= 204,
+	B11001101= 205,
+	B11001110= 206,
+	B11001111= 207,
+	B11010000= 208,
+	B11010001= 209,
+	B11010010= 210,
+	B11010011= 211,
+	B11010100= 212,
+	B11010101= 213,
+	B11010110= 214,
+	B11010111= 215,
+	B11011000= 216,
+	B11011001= 217,
+	B11011010= 218,
+	B11011011= 219,
+	B11011100= 220,
+	B11011101= 221,
+	B11011110= 222,
+	B11011111= 223,
+	B11100000= 224,
+	B11100001= 225,
+	B11100010= 226,
+	B11100011= 227,
+	B11100100= 228,
+	B11100101= 229,
+	B11100110= 230,
+	B11100111= 231,
+	B11101000= 232,
+	B11101001= 233,
+	B11101010= 234,
+	B11101011= 235,
+	B11101100= 236,
+	B11101101= 237,
+	B11101110= 238,
+	B11101111= 239,
+	B11110000= 240,
+	B11110001= 241,
+	B11110010= 242,
+	B11110011= 243,
+	B11110100= 244,
+	B11110101= 245,
+	B11110110= 246,
+	B11110111= 247,
+	B11111000= 248,
+	B11111001= 249,
+	B11111010= 250,
+	B11111011= 251,
+	B11111100= 252,
+	B11111101= 253,
+	B11111110= 254,
+	B11111111= 255
+};
diff --git a/plugins/GSdx_legacy/xbyak/xbyak_mnemonic.h b/plugins/GSdx_legacy/xbyak/xbyak_mnemonic.h
new file mode 100644
index 0000000000..d551c61323
--- /dev/null
+++ b/plugins/GSdx_legacy/xbyak/xbyak_mnemonic.h
@@ -0,0 +1,1489 @@
+/* Copyright (c) 2007 MITSUNARI Shigeo
+* All rights reserved.
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+* 
+* Redistributions of source code must retain the above copyright notice, this
+* list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+* Neither the name of the copyright owner nor the names of its contributors may
+* be used to endorse or promote products derived from this software without
+* specific prior written permission.
+* 
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+* THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+const char *getVersionString() const { return "4.84"; } // returns the hard-coded version string of this header
+void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); } // this run: each mnemonic forwards (dest, src, one opcode byte) to opMMX
+void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); }
+void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); }
+void pand(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDB); }
+void pandn(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDF); }
+void pmaddwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF5); }
+void pmulhuw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE4); }
+void pmulhw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE5); }
+void pmullw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD5); }
+void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); }
+void punpckhbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x68); } // punpckh*: opcodes 0x68..0x6A
+void punpckhwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x69); }
+void punpckhdq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6A); }
+void punpcklbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x60); } // punpckl*: opcodes 0x60..0x62
+void punpcklwd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x61); }
+void punpckldq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x62); }
+void pxor(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEF); }
+void pavgb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE0); }
+void pavgw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE3); }
+void pmaxsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEE); }
+void pmaxub(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDE); }
+void pminsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEA); }
+void pminub(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDA); }
+void psadbw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF6); }
+void paddq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD4); }
+void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); }
+void psubq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFB); }
+void paddb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFC); } // paddb/w/d: consecutive opcodes 0xFC..0xFE
+void paddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFD); }
+void paddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFE); }
+void paddsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEC); }
+void paddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xED); }
+void paddusb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDC); }
+void paddusw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDD); }
+void pcmpeqb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x74); } // pcmpeqb/w/d: consecutive opcodes 0x74..0x76
+void pcmpeqw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x75); }
+void pcmpeqd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x76); }
+void pcmpgtb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x64); } // pcmpgtb/w/d: consecutive opcodes 0x64..0x66
+void pcmpgtw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x65); }
+void pcmpgtd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x66); }
+void psllw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF1); } // shifts below take the count as an Operand; the int imm8 overloads are defined separately in this file
+void pslld(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF2); }
+void psllq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF3); }
+void psraw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE1); }
+void psrad(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE2); }
+void psrlw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD1); }
+void psrld(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD2); }
+void psrlq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD3); }
+void psubb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF8); } // psubb/w/d: consecutive opcodes 0xF8..0xFA
+void psubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF9); }
+void psubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xFA); }
+void psubsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE8); }
+void psubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE9); }
+void psubusb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD8); }
+void psubusw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xD9); }
+void psllw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 6); } // immediate-count shifts: opMMX_IMM(reg, imm8, opcode, n); n is presumably the ModRM /digit - confirm against opMMX_IMM
+void pslld(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 6); }
+void psllq(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x73, 6); }
+void psraw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 4); } // same opcode byte as psllw/psrlw; only the last argument differs (6 / 4 / 2)
+void psrad(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 4); }
+void psrlw(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x71, 2); }
+void psrld(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x72, 2); }
+void psrlq(const Mmx& mmx, int imm8) { opMMX_IMM(mmx, imm8, 0x73, 2); }
+void pslldq(const Xmm& xmm, int imm8) { opMMX_IMM(xmm, imm8, 0x73, 7); } // the *dq forms accept only Xmm and reuse opcode 0x73 with selectors 7 and 3
+void psrldq(const Xmm& xmm, int imm8) { opMMX_IMM(xmm, imm8, 0x73, 3); }
+void pshufw(const Mmx& mmx, const Operand& op, uint8 imm8) { opMMX(mmx, op, 0x70, 0x00, imm8); } // all four shuffles share opcode 0x70; the 4th argument selects the variant (presumably the mandatory prefix byte - confirm against opMMX)
+void pshuflw(const Mmx& mmx, const Operand& op, uint8 imm8) { opMMX(mmx, op, 0x70, 0xF2, imm8); }
+void pshufhw(const Mmx& mmx, const Operand& op, uint8 imm8) { opMMX(mmx, op, 0x70, 0xF3, imm8); }
+void pshufd(const Mmx& mmx, const Operand& op, uint8 imm8) { opMMX(mmx, op, 0x70, 0x66, imm8); } // imm8 is forwarded unchanged as the trailing argument
+void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); } // load form: the prefix value is passed to opMMX as the 4th argument
+void movdqa(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x7F); } // store form: the prefix byte is emitted by hand before opModM
+void movdqu(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0xF3); }
+void movdqu(const Address& addr, const Xmm& xmm) { db(0xF3); opModM(addr, xmm, 0x0F, 0x7F); }
+void movaps(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, 0x100); } // 0x100 looks like a "no prefix" sentinel (the store overload below emits no prefix byte) - confirm against opMMX
+void movaps(const Address& addr, const Xmm& xmm) { opModM(addr, xmm, 0x0F, 0x29); }
+void movss(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0xF3); }
+void movss(const Address& addr, const Xmm& xmm) { db(0xF3); opModM(addr, xmm, 0x0F, 0x11); }
+void movups(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0x100); }
+void movups(const Address& addr, const Xmm& xmm) { opModM(addr, xmm, 0x0F, 0x11); }
+void movapd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, 0x66); }
+void movapd(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x29); }
+void movsd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0xF2); }
+void movsd(const Address& addr, const Xmm& xmm) { db(0xF2); opModM(addr, xmm, 0x0F, 0x11); }
+void movupd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, 0x66); } // movups/movss/movsd/movupd share opcodes 0x10 (load) and 0x11 (store); only the prefix differs
+void movupd(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x11); }
+void addps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0x100, isXMM_XMMorMEM); } // this run: opGen(dest, src, opcode, prefix selector, operand check); ps/ss/pd/sd variants use 0x100/0xF3/0x66/0xF2
+void addss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0xF3, isXMM_XMMorMEM); }
+void addpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0x66, isXMM_XMMorMEM); }
+void addsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x58, 0xF2, isXMM_XMMorMEM); }
+void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); } // the bitwise ops (andn/and/or/xor) are defined only in ps and pd variants
+void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXMM_XMMorMEM); }
+void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); }
+void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); }
+void cmpps(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x100, isXMM_XMMorMEM, imm8); } // cmp*: imm8 is forwarded as an extra trailing argument
+void cmpss(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF3, isXMM_XMMorMEM, imm8); }
+void cmppd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0x66, isXMM_XMMorMEM, imm8); }
+void cmpsd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC2, 0xF2, isXMM_XMMorMEM, imm8); }
+void divps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0x100, isXMM_XMMorMEM); }
+void divss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0xF3, isXMM_XMMorMEM); }
+void divpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0x66, isXMM_XMMorMEM); }
+void divsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5E, 0xF2, isXMM_XMMorMEM); }
+void maxps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0x100, isXMM_XMMorMEM); }
+void maxss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0xF3, isXMM_XMMorMEM); }
+void maxpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0x66, isXMM_XMMorMEM); }
+void maxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5F, 0xF2, isXMM_XMMorMEM); }
+void minps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0x100, isXMM_XMMorMEM); }
+void minss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0xF3, isXMM_XMMorMEM); }
+void minpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0x66, isXMM_XMMorMEM); }
+void minsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5D, 0xF2, isXMM_XMMorMEM); }
+void mulps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0x100, isXMM_XMMorMEM); }
+void mulss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0xF3, isXMM_XMMorMEM); }
+void mulpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0x66, isXMM_XMMorMEM); }
+void mulsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x59, 0xF2, isXMM_XMMorMEM); }
+void orps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x56, 0x100, isXMM_XMMorMEM); }
+void orpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x56, 0x66, isXMM_XMMorMEM); }
+void rcpps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x53, 0x100, isXMM_XMMorMEM); } // rcp* and rsqrt* are defined only in ps and ss variants
+void rcpss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x53, 0xF3, isXMM_XMMorMEM); }
+void rsqrtps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x52, 0x100, isXMM_XMMorMEM); }
+void rsqrtss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x52, 0xF3, isXMM_XMMorMEM); }
+void shufps(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC6, 0x100, isXMM_XMMorMEM, imm8); } // shuf*: imm8 is forwarded as an extra trailing argument
+void shufpd(const Xmm& xmm, const Operand& op, uint8 imm8) { opGen(xmm, op, 0xC6, 0x66, isXMM_XMMorMEM, imm8); }
+void sqrtps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0x100, isXMM_XMMorMEM); }
+void sqrtss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0xF3, isXMM_XMMorMEM); }
+void sqrtpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0x66, isXMM_XMMorMEM); }
+void sqrtsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x51, 0xF2, isXMM_XMMorMEM); }
+void subps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x100, isXMM_XMMorMEM); }
+void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); }
+void subpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0x66, isXMM_XMMorMEM); }
+void subsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF2, isXMM_XMMorMEM); }
+void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); }
+void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); }
+void unpcklps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x100, isXMM_XMMorMEM); }
+void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); }
+void xorps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x57, 0x100, isXMM_XMMorMEM); }
+void xorpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x57, 0x66, isXMM_XMMorMEM); }
+void maskmovdqu(const Xmm& reg1, const Xmm& reg2) { db(0x66);  opModR(reg1, reg2, 0x0F, 0xF7); } // register-register only: 0x66 is emitted by hand, then opModR
+void movhlps(const Xmm& reg1, const Xmm& reg2) {  opModR(reg1, reg2, 0x0F, 0x12); } // register-register only (both parameters are Xmm); no prefix byte is emitted
+void movlhps(const Xmm& reg1, const Xmm& reg2) {  opModR(reg1, reg2, 0x0F, 0x16); }
+void punpckhqdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x6D, 0x66, isXMM_XMMorMEM); } // from here on: opGen with the isXMM_XMMorMEM operand check
+void punpcklqdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x6C, 0x66, isXMM_XMMorMEM); }
+void comiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2F, 0x100, isXMM_XMMorMEM); } // comis*/ucomis*: opcodes 0x2F/0x2E, ss and sd split by 0x100 vs 0x66
+void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); }
+void comisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2F, 0x66, isXMM_XMMorMEM); }
+void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); }
+void cvtpd2ps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0x66, isXMM_XMMorMEM); } // the four 0x5A conversions differ only in the prefix selector
+void cvtps2pd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0x100, isXMM_XMMorMEM); }
+void cvtsd2ss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0xF2, isXMM_XMMorMEM); }
+void cvtss2sd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5A, 0xF3, isXMM_XMMorMEM); }
+void cvtpd2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0xF2, isXMM_XMMorMEM); } // the three 0xE6 conversions differ only in the prefix selector
+void cvttpd2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0x66, isXMM_XMMorMEM); }
+void cvtdq2pd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xE6, 0xF3, isXMM_XMMorMEM); }
+void cvtps2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0x66, isXMM_XMMorMEM); } // the three 0x5B conversions differ only in the prefix selector
+void cvttps2dq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0xF3, isXMM_XMMorMEM); }
+void cvtdq2ps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5B, 0x100, isXMM_XMMorMEM); }
+void addsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xD0, 0x66, isXMM_XMMorMEM); } // addsub/hadd/hsub: pd uses 0x66, ps uses 0xF2
+void addsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xD0, 0xF2, isXMM_XMMorMEM); }
+void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); }
+void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); }
+void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); }
+void hsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0xF2, isXMM_XMMorMEM); }
+void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM); } // movddup and movsldup share opcode 0x12 and differ only in the prefix selector
+void movshdup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x16, 0xF3, isXMM_XMMorMEM); }
+void movsldup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF3, isXMM_XMMorMEM); }
+void cvtpi2ps(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0x100, isXMM_MMXorMEM); } // both parameters are generic Operands; the allowed register classes come from the check passed last
+void cvtps2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0x100, isMMX_XMMorMEM); }
+void cvtsi2ss(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0xF3, isXMM_REG32orMEM); }
+void cvtss2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0xF3, isREG32_XMMorMEM); }
+void cvttps2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0x100, isMMX_XMMorMEM); } // the cvtt* forms use opcode 0x2C; their non-cvtt counterparts use 0x2D
+void cvttss2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0xF3, isREG32_XMMorMEM); }
+void cvtpi2pd(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0x66, isXMM_MMXorMEM); }
+void cvtpd2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0x66, isMMX_XMMorMEM); }
+void cvtsi2sd(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2A, 0xF2, isXMM_REG32orMEM); }
+void cvtsd2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2D, 0xF2, isREG32_XMMorMEM); }
+void cvttpd2pi(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0x66, isMMX_XMMorMEM); }
+void cvttsd2si(const Operand& reg, const Operand& op) { opGen(reg, op, 0x2C, 0xF2, isREG32_XMMorMEM); }
+void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B00011000); } // all four prefetches share opcode bytes 0x0F, B00011000; only the Reg32(n) argument differs (presumably the ModRM reg field - confirm against opModM)
+void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, B00011000); }
+void prefetcht2(const Address& addr) { opModM(addr, Reg32(3), 0x0F, B00011000); }
+void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, B00011000); }
+void movhps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0x100); } // movh*: opcode 0x16; ps/pd variants pass 0x100/0x66
+void movlps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x100); } // movl*: opcode 0x12
+void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0x66); }
+void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); }
+void cmovo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 0); } // condition 0: every cmovcc passes 0x0F and B01000000 | cc
+void jo(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); } // every jcc passes 0x70 + cc, 0x80 + cc, 0x0F (presumably short opcode, long opcode, long-form prefix - confirm against opJmp); label is taken by value
+void jo(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); } // Label-object overload, same encoding arguments
+void seto(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 0); } // every setcc passes 8, 0, 0x0F and B10010000 | cc
+void cmovno(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 1); } // condition 1
+void jno(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }
+void jno(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }
+void setno(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 1); }
+void cmovb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } // condition 2: b, c and nae are three names for the same encoding
+void jb(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void jb(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
+void cmovc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } // same arguments as cmovb
+void jc(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void jc(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void setc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
+void cmovnae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } // same arguments as cmovb
+void jnae(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void jnae(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
+void setnae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
+void cmovnb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } // condition 3: nb, ae and nc are three names for the same encoding
+void jnb(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void jnb(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void setnb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
+void cmovae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } // same arguments as cmovnb
+void jae(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void jae(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
+void cmovnc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } // same arguments as cmovnb
+void jnc(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void jnc(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
+void setnc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
+void cmove(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); } // condition 4: e and z are two names for the same encoding
+void je(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
+void je(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
+void sete(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
+void cmovz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); } // same arguments as cmove
+void jz(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
+void jz(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
+void setz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
+void cmovne(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); } // condition 5: ne and nz are two names for the same encoding
+void jne(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
+void jne(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
+void setne(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
+void cmovnz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); } // same arguments as cmovne
+void jnz(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
+void jnz(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
+void setnz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
+void cmovbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); } // condition 6: be and na are two names for the same encoding
+void jbe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
+void jbe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
+void setbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
+void cmovna(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); } // same arguments as cmovbe
+void jna(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
+void jna(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
+void setna(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
+void cmovnbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); } // condition 7: nbe and a are two names for the same encoding
+void jnbe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
+void jnbe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
+void setnbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
+void cmova(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); } // same arguments as cmovnbe
+void ja(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
+void ja(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
+void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
+void cmovs(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 8); } // condition 8
+void js(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }
+void js(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }
+void sets(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 8); }
+void cmovns(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 9); } // condition 9
+void jns(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }
+void jns(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }
+void setns(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 9); }
+void cmovp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); } // condition 10: p and pe are two names for the same encoding
+void jp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
+void jp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
+void setp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
+void cmovpe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); } // same arguments as cmovp
+void jpe(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
+void jpe(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
+void setpe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
+void cmovnp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); } // condition 11: np and po are two names for the same encoding
+void jnp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
+void jnp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
+void setnp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
+void cmovpo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); } // same arguments as cmovnp
+void jpo(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
+void jpo(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
+void setpo(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
+void cmovl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); } // condition 12: l and nge are two names for the same encoding
+void jl(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
+void jl(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
+void setl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
+void cmovnge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); } // same arguments as cmovl
+void jnge(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
+void jnge(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
+void setnge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
+void cmovnl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); } // condition 13: nl and ge are two names for the same encoding
+void jnl(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
+void jnl(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
+void setnl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
+void cmovge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); } // same arguments as cmovnl
+void jge(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
+void jge(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
+void setge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
+void cmovle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); } // condition 14: le and ng are two names for the same encoding
+void jle(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
+void jle(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
+void setle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
+void cmovng(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); } // same arguments as cmovle
+void jng(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
+void jng(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
+void setng(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
+void cmovnle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); } // condition 15: nle and g are two names for the same encoding
+void jnle(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
+void jnle(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
+void setnle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
+void cmovg(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); } // same arguments as cmovnle
+void jg(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
+void jg(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
+void setg(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
+#ifdef XBYAK32
+void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } // XBYAK32 build: jcxz is the 0xE3 jump preceded by a 0x67 byte; always T_SHORT, long-form arguments are 0
+void jcxz(const Label& label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
+void jecxz(std::string label) { opJmp(label, T_SHORT, 0xe3, 0, 0); } // XBYAK32 build: jecxz is the bare 0xE3 jump
+void jecxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
+#else
+void jecxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } // non-XBYAK32 build: jecxz now takes the 0x67 byte and jcxz is not defined
+void jecxz(const Label& label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); }
+void jrcxz(std::string label) { opJmp(label, T_SHORT, 0xe3, 0, 0); } // non-XBYAK32 build: jrcxz is the bare 0xE3 jump
+void jrcxz(const Label& label) { opJmp(label, T_SHORT, 0xe3, 0, 0); }
+#endif
+#ifdef XBYAK64
+void cdqe() { db(0x48); db(0x98); } // XBYAK64 only: 0x48 followed by the same byte cwde emits (0x98)
+void cqo() { db(0x48); db(0x99); } // XBYAK64 only: 0x48 followed by the same byte cdq emits (0x99)
+#else
+void aaa() { db(0x37); } // everything in this #else branch exists only when XBYAK64 is not defined
+void aad() { db(0xD5); db(0x0A); } // second byte is fixed at 0x0A; no way to pass a different value
+void aam() { db(0xD4); db(0x0A); } // second byte is fixed at 0x0A; no way to pass a different value
+void aas() { db(0x3F); }
+void daa() { db(0x27); }
+void das() { db(0x2F); }
+void popad() { db(0x61); }
+void popfd() { db(0x9D); } // same byte as popf, which is defined unconditionally in this file
+void pusha() { db(0x60); } // pusha and pushad emit the same byte
+void pushad() { db(0x60); }
+void pushfd() { db(0x9C); } // same byte as pushf, which is defined unconditionally in this file
+void popa() { db(0x61); } // popa and popad emit the same byte
+#endif
+void cbw() { db(0x66); db(0x98); } // this run: no-operand instructions emitted as fixed byte sequences via db(); cbw is cwde's byte behind a 0x66 prefix
+void cdq() { db(0x99); }
+void clc() { db(0xF8); }
+void cld() { db(0xFC); }
+void cli() { db(0xFA); }
+void cmc() { db(0xF5); }
+void cpuid() { db(0x0F); db(0xA2); }
+void cwd() { db(0x66); db(0x99); } // cdq's byte behind a 0x66 prefix
+void cwde() { db(0x98); }
+void lahf() { db(0x9F); }
+void lock() { db(0xF0); } // emits only the prefix byte; the caller emits the instruction it applies to next
+void nop() { db(0x90); }
+void sahf() { db(0x9E); }
+void stc() { db(0xF9); }
+void std() { db(0xFD); }
+void sti() { db(0xFB); }
+void emms() { db(0x0F); db(0x77); }
+void pause() { db(0xF3); db(0x90); } // 0xF3 followed by the nop byte
+void sfence() { db(0x0F); db(0xAE); db(0xF8); } // the three fences share 0x0F 0xAE and differ in the last byte
+void lfence() { db(0x0F); db(0xAE); db(0xE8); }
+void mfence() { db(0x0F); db(0xAE); db(0xF0); }
+void monitor() { db(0x0F); db(0x01); db(0xC8); }
+void mwait() { db(0x0F); db(0x01); db(0xC9); }
+void rdmsr() { db(0x0F); db(0x32); }
+void rdpmc() { db(0x0F); db(0x33); }
+void rdtsc() { db(0x0F); db(0x31); }
+void rdtscp() { db(0x0F); db(0x01); db(0xF9); }
+void ud2() { db(0x0F); db(0x0B); }
+void wait() { db(0x9B); } // wait and fwait emit the same byte
+void fwait() { db(0x9B); }
+void wbinvd() { db(0x0F); db(0x09); }
+void wrmsr() { db(0x0F); db(0x30); }
+void xlatb() { db(0xD7); }
+void popf() { db(0x9D); }
+void pushf() { db(0x9C); }
+void stac() { db(0x0F); db(0x01); db(0xCB); }
+void vzeroall() { db(0xC5); db(0xFC); db(0x77); } // hard-coded three-byte sequence; differs from vzeroupper only in the middle byte
+void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
+void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
+void f2xm1() { db(0xD9); db(0xF0); } // this run: x87 instructions with no explicit operands, each a fixed two-byte sequence unless noted
+void fabs() { db(0xD9); db(0xE1); }
+void faddp() { db(0xDE); db(0xC1); } // no-argument form: the register selection is baked into the second byte
+void fchs() { db(0xD9); db(0xE0); }
+void fcom() { db(0xD8); db(0xD1); }
+void fcomp() { db(0xD8); db(0xD9); }
+void fcompp() { db(0xDE); db(0xD9); }
+void fcos() { db(0xD9); db(0xFF); }
+void fdecstp() { db(0xD9); db(0xF6); }
+void fdivp() { db(0xDE); db(0xF9); }
+void fdivrp() { db(0xDE); db(0xF1); }
+void fincstp() { db(0xD9); db(0xF7); }
+void finit() { db(0x9B); db(0xDB); db(0xE3); } // the wait/fwait byte (0x9B) followed by the fninit bytes
+void fninit() { db(0xDB); db(0xE3); }
+void fld1() { db(0xD9); db(0xE8); } // the fld<constant> forms use consecutive second bytes 0xE8..0xEE
+void fldl2t() { db(0xD9); db(0xE9); }
+void fldl2e() { db(0xD9); db(0xEA); }
+void fldpi() { db(0xD9); db(0xEB); }
+void fldlg2() { db(0xD9); db(0xEC); }
+void fldln2() { db(0xD9); db(0xED); }
+void fldz() { db(0xD9); db(0xEE); }
+void fmulp() { db(0xDE); db(0xC9); }
+void fnop() { db(0xD9); db(0xD0); }
+void fpatan() { db(0xD9); db(0xF3); }
+void fprem() { db(0xD9); db(0xF8); }
+void fprem1() { db(0xD9); db(0xF5); }
+void fptan() { db(0xD9); db(0xF2); }
+void frndint() { db(0xD9); db(0xFC); }
+void fscale() { db(0xD9); db(0xFD); }
+void fsin() { db(0xD9); db(0xFE); }
+void fsincos() { db(0xD9); db(0xFB); }
+void fsqrt() { db(0xD9); db(0xFA); }
+void fsubp() { db(0xDE); db(0xE9); }
+void fsubrp() { db(0xDE); db(0xE1); }
+void ftst() { db(0xD9); db(0xE4); }
+void fucom() { db(0xDD); db(0xE1); }
+void fucomp() { db(0xDD); db(0xE9); }
+void fucompp() { db(0xDA); db(0xE9); }
+void fxam() { db(0xD9); db(0xE5); }
+void fxch() { db(0xD9); db(0xC9); } // no-argument form: the register selection is baked into the second byte
+void fxtract() { db(0xD9); db(0xF4); }
+void fyl2x() { db(0xD9); db(0xF1); }
+void fyl2xp1() { db(0xD9); db(0xF9); }
+void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } // two-operand form: opRM_RM with the operation's base opcode
+void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); } // immediate form: opRM_I with the same base opcode; throughout this family the last argument equals (base opcode >> 3)
+void add(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x00); }
+void add(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x00, 0); }
+void and_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x20); } // underscore-suffixed name is always declared
+void and_(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x20, 4); }
+#ifndef XBYAK_NO_OP_NAMES // `and` is a C++ alternative operator token; this spelling is declared only when XBYAK_NO_OP_NAMES is not defined
+void and(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x20); } // same arguments as and_
+void and(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x20, 4); }
+#endif
+void cmp(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x38); }
+void cmp(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x38, 7); }
+void or_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x08); }
+void or_(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x08, 1); }
+#ifndef XBYAK_NO_OP_NAMES // `or` is a C++ alternative operator token; see and above
+void or(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x08); } // same arguments as or_
+void or(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x08, 1); }
+#endif
+void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); }
+void sbb(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x18, 3); }
+void sub(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x28); }
+void sub(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x28, 5); }
+void xor_(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); }
+void xor_(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x30, 6); }
+#ifndef XBYAK_NO_OP_NAMES // `xor` is a C++ alternative operator token; see and above
+void xor(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); } // same arguments as xor_
+void xor(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x30, 6); }
+#endif
+void dec(const Operand& op) { opIncDec(op, 0x48, 1); } // dec/inc share opIncDec and differ in both trailing arguments (0x48,1 vs 0x40,0)
+void inc(const Operand& op) { opIncDec(op, 0x40, 0); }
+void bt(const Operand& op, const Reg& reg) { opModRM(reg, op, op.isREG(16|32|64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xa3); } // register-index form: the two boolean arguments are "op is a 16/32/64-bit register of the same width as reg" and "op is memory" (presumably the accepted operand shapes - confirm in opModRM); opcode 0F A3
+void bt(const Operand& op, uint8 imm) { opR_ModM(op, 16|32|64, 4, 0x0f, 0xba); db(imm); } // immediate-index form: opcode 0F BA with selector 4, then the imm byte appended via db()
+void bts(const Operand& op, const Reg& reg) { opModRM(reg, op, op.isREG(16|32|64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xab); } // as bt, opcode 0F AB
+void bts(const Operand& op, uint8 imm) { opR_ModM(op, 16|32|64, 5, 0x0f, 0xba); db(imm); } // as bt, selector 5
+void btr(const Operand& op, const Reg& reg) { opModRM(reg, op, op.isREG(16|32|64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xb3); } // as bt, opcode 0F B3
+void btr(const Operand& op, uint8 imm) { opR_ModM(op, 16|32|64, 6, 0x0f, 0xba); db(imm); } // as bt, selector 6
+void btc(const Operand& op, const Reg& reg) { opModRM(reg, op, op.isREG(16|32|64) && op.getBit() == reg.getBit(), op.isMEM(), 0x0f, 0xbb); } // as bt, opcode 0F BB
+void btc(const Operand& op, uint8 imm) { opR_ModM(op, 16|32|64, 7, 0x0f, 0xba); db(imm); } // as bt, selector 7
+void div(const Operand& op) { opR_ModM(op, 0, 6, 0xF6); } // one-operand group: all share opcode 0xF6 and a second argument of 0; only the third argument (2..7) distinguishes the operation
+void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
+void imul(const Operand& op) { opR_ModM(op, 0, 5, 0xF6); }
+void mul(const Operand& op) { opR_ModM(op, 0, 4, 0xF6); }
+void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
+void not_(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); } // underscore-suffixed name is always declared
+#ifndef XBYAK_NO_OP_NAMES // `not` is a C++ alternative operator token; this spelling is declared only when XBYAK_NO_OP_NAMES is not defined
+void not(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); } // same arguments as not_
+#endif
+void rcl(const Operand& op, int imm) { opShift(op, imm, 2); } // shift/rotate family: count is either an int immediate or a Reg8 (named _cl); the last opShift argument selects the operation
+void rcl(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 2); }
+void rcr(const Operand& op, int imm) { opShift(op, imm, 3); }
+void rcr(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 3); }
+void rol(const Operand& op, int imm) { opShift(op, imm, 0); }
+void rol(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 0); }
+void ror(const Operand& op, int imm) { opShift(op, imm, 1); }
+void ror(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 1); }
+void sar(const Operand& op, int imm) { opShift(op, imm, 7); }
+void sar(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 7); }
+void shl(const Operand& op, int imm) { opShift(op, imm, 4); }
+void shl(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 4); }
+void shr(const Operand& op, int imm) { opShift(op, imm, 5); }
+void shr(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 5); }
+void sal(const Operand& op, int imm) { opShift(op, imm, 4); } // same selector (4) as shl: sal is emitted identically
+void sal(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 4); }
+void shld(const Operand& op, const Reg& reg, uint8 imm) { opShxd(op, reg, imm, 0xA4); } // immediate-count form: imm passed through, no Reg8 pointer
+void shld(const Operand& op, const Reg& reg, const Reg8& _cl) { opShxd(op, reg, 0, 0xA4, &_cl); } // register-count form: imm is 0 and the Reg8 is passed by address
+void shrd(const Operand& op, const Reg& reg, uint8 imm) { opShxd(op, reg, imm, 0xAC); } // as shld, opcode argument 0xAC
+void shrd(const Operand& op, const Reg& reg, const Reg8& _cl) { opShxd(op, reg, 0, 0xAC, &_cl); }
+void bsf(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); } // op must be a register matching (16 | i32e) or memory; i32e is defined outside this view (presumably 32, plus 64 on 64-bit builds - TODO confirm); opcode 0F BC
+void bsr(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); } // as bsf, opcode 0F BD
+void popcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); } // popcnt/tzcnt/lzcnt: opSp1 with bytes F3, 0F and a per-instruction final byte
+void tzcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); } // final byte 0xBC is the bsf opcode byte
+void lzcnt(const Reg&reg, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBD); } // final byte 0xBD is the bsr opcode byte
+void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); } // opMMX family: per-instruction code byte, then the constants 0x66, NONE (no immediate passed) and 0x38
+void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, NONE, 0x38); }
+void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, NONE, 0x38); }
+void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, NONE, 0x38); }
+void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, NONE, 0x38); }
+void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, NONE, 0x38); }
+void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, NONE, 0x38); }
+void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, NONE, 0x38); }
+void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, NONE, 0x38); }
+void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, NONE, 0x38); }
+void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, NONE, 0x38); }
+void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, NONE, 0x38); }
+void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, NONE, 0x38); }
+void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, NONE, 0x38); }
+void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, NONE, 0x38); }
+void palignr(const Mmx& mmx, const Operand& op, int imm) { opMMX(mmx, op, 0x0f, 0x66, static_cast<uint8>(imm), 0x3a); } // only member of this run with an immediate: imm is truncated to uint8 and the last argument is 0x3a instead of 0x38
+void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, NONE, 0x38); } // opGen family without immediate: per-instruction code byte, 0x66, the isXMM_XMMorMEM operand check, NONE, and 0x38
+void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, NONE, 0x38); } // pmovsx*: consecutive codes 0x20..0x25
+void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, NONE, 0x38); } // pmovzx*: consecutive codes 0x30..0x35, i.e. the pmovsx codes + 0x10
+void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, NONE, 0x38); } // pmin*/pmax*: consecutive codes 0x38..0x3F
+void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesdec(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDE, 0x66, isXMM_XMMorMEM, NONE, 0x38); } // aes*: codes 0xDB..0xDF
+void aesdeclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesenc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDC, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesenclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDD, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void aesimc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDB, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
+void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); } // opGen family with immediate: imm is truncated to uint8 and the last argument is 0x3A
+void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void dppd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void dpps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void mpsadbw(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x42, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pblendw(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0E, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void roundps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x08, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); } // round*: consecutive codes 0x08..0x0B
+void roundpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x09, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void roundss(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0A, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void roundsd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0B, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pcmpestrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x60, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); } // pcmp?str?: consecutive codes 0x60..0x63
+void pcmpestri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x61, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pcmpistrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x62, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pcmpistri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x63, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void aeskeygenassist(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
+void pclmullqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x00); } // convenience wrappers: pclmulqdq with a fixed immediate (0x00, 0x01, 0x10, 0x11)
+void pclmulhqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x01); }
+void pclmullqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); }
+void pclmulhqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); }
+void ldmxcsr(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0xAE); } // memory-only forms sharing bytes 0F AE; the Reg32 index (2/3/7) is what differs (presumably the ModRM reg-field selector - confirm in opModM)
+void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); }
+void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
+void fldcw(const Address& addr) { opModM(addr, Reg32(5), 0xD9, 0x100); } // NOTE(review): 0x100 sits where fstcw passes NONE - presumably the same "no second opcode byte" value; confirm against the NONE definition
+void fstcw(const Address& addr) { db(0x9B); opModM(addr, Reg32(7), 0xD9, NONE); } // emits the wait/fwait byte 0x9B first, then the D9-based memory form
+void movntpd(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0x2B); } // the Xmm index is re-wrapped as a Reg16 (presumably so opModM emits a 0x66 operand-size prefix - TODO confirm)
+void movntdq(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0xE7); } // same Reg16 re-wrapping as movntpd, final byte 0xE7
+void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); } // movsx/movzx share opMovxx and differ only in the code byte
+void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); }
+void fadd(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 0, 0); } // x87 memory-operand forms: three opcode bytes (presumably chosen by the address's 16/32/64-bit width, 0x00 meaning "width not supported" - TODO confirm in opFpuMem) followed by two selector values
+void fiadd(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 0, 0); } // integer variants (fi*) fill the first two opcode slots and leave the third 0x00
+void fcom(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 2, 0); }
+void fcomp(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 3, 0); }
+void fdiv(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 6, 0); }
+void fidiv(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 6, 0); }
+void fdivr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 7, 0); }
+void fidivr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 7, 0); }
+void ficom(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 2, 0); }
+void ficomp(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 3, 0); }
+void fild(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 0, 5); } // one of two entries with a non-zero last argument (5); presumably a separate selector for the third opcode slot - confirm
+void fist(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0x00, 2, 0); }
+void fistp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 3, 7); } // the other entry with a non-zero last argument (7)
+void fisttp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDD, 1, 0); }
+void fld(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 0, 0); }
+void fmul(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 1, 0); }
+void fimul(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 1, 0); }
+void fst(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 2, 0); }
+void fstp(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 3, 0); }
+void fsub(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 4, 0); }
+void fisub(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 4, 0); }
+void fsubr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 5, 0); }
+void fisubr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 5, 0); }
+void fadd(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C0, 0xDCC0); } // x87 register/register forms: opFpuFpu takes two 16-bit opcode words (presumably one per operand order, st0-first vs st0-second - TODO confirm)
+void fadd(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8C0, 0xDCC0); } // one-argument overload: st0 is supplied as the first operand
+void faddp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC0); } // *p forms pass 0x0000 as the first opcode word
+void faddp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEC0); } // one-argument *p overload: st0 is supplied as the second operand
+void fcmovb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC0, 0x00C0); } // fcmov*/f(u)comi(p): second opcode word has a 0x00 high byte
+void fcmovb(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAC0, 0x00C0); }
+void fcmove(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC8, 0x00C8); }
+void fcmove(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAC8, 0x00C8); }
+void fcmovbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD0, 0x00D0); }
+void fcmovbe(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAD0, 0x00D0); }
+void fcmovu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD8, 0x00D8); }
+void fcmovu(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDAD8, 0x00D8); }
+void fcmovnb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC0, 0x00C0); } // negated conditions use high byte 0xDB instead of 0xDA
+void fcmovnb(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBC0, 0x00C0); }
+void fcmovne(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC8, 0x00C8); }
+void fcmovne(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBC8, 0x00C8); }
+void fcmovnbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD0, 0x00D0); }
+void fcmovnbe(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBD0, 0x00D0); }
+void fcmovnu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD8, 0x00D8); }
+void fcmovnu(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBD8, 0x00D8); }
+void fcomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBF0, 0x00F0); }
+void fcomi(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBF0, 0x00F0); }
+void fcomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFF0, 0x00F0); }
+void fcomip(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDFF0, 0x00F0); } // unlike the arithmetic *p forms, st0 is the first operand here
+void fucomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBE8, 0x00E8); }
+void fucomi(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDBE8, 0x00E8); }
+void fucomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFE8, 0x00E8); }
+void fucomip(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xDFE8, 0x00E8); }
+void fdiv(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F0, 0xDCF8); } // fdiv/fdivr (and fsub/fsubr) swap low bytes between the two opcode words: F0/F8 here, F8/F0 for fdivr
+void fdiv(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8F0, 0xDCF8); }
+void fdivp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF8); }
+void fdivp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEF8); }
+void fdivr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F8, 0xDCF0); }
+void fdivr(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8F8, 0xDCF0); }
+void fdivrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF0); }
+void fdivrp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEF0); }
+void fmul(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C8, 0xDCC8); }
+void fmul(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8C8, 0xDCC8); }
+void fmulp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC8); }
+void fmulp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEC8); }
+void fsub(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E0, 0xDCE8); }
+void fsub(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8E0, 0xDCE8); }
+void fsubp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE8); }
+void fsubp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEE8); }
+void fsubr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E8, 0xDCE0); }
+void fsubr(const Fpu& reg1) { opFpuFpu(st0, reg1, 0xD8E8, 0xDCE0); }
+void fsubrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE0); }
+void fsubrp(const Fpu& reg1) { opFpuFpu(reg1, st0, 0x0000, 0xDEE0); }
+void fcom(const Fpu& reg) { opFpu(reg, 0xD8, 0xD0); } // single-register x87 forms: opFpu receives the register plus two bytes (presumably first opcode byte and a base second byte combined with the register index - TODO confirm)
+void fcomp(const Fpu& reg) { opFpu(reg, 0xD8, 0xD8); }
+void ffree(const Fpu& reg) { opFpu(reg, 0xDD, 0xC0); }
+void fld(const Fpu& reg) { opFpu(reg, 0xD9, 0xC0); }
+void fst(const Fpu& reg) { opFpu(reg, 0xDD, 0xD0); }
+void fstp(const Fpu& reg) { opFpu(reg, 0xDD, 0xD8); }
+void fucom(const Fpu& reg) { opFpu(reg, 0xDD, 0xE0); } // the operand-less fucom() emits DD E1, i.e. this base byte + 1
+void fucomp(const Fpu& reg) { opFpu(reg, 0xDD, 0xE8); } // the operand-less fucomp() emits DD E9, i.e. this base byte + 1
+void fxch(const Fpu& reg) { opFpu(reg, 0xD9, 0xC8); } // the operand-less fxch() emits D9 C9, i.e. this base byte + 1
+void vaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x58, true); } // AVX arithmetic/logic: op2 defaults to an empty Operand() so both 2- and 3-operand calls compile; suffix picks the PP_* flag (pd: PP_66, ps: none, sd: PP_F2, ss: PP_F3)
+void vaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x58, true); } // final bool is true for packed (pd/ps) and false for scalar (sd/ss) entries - presumably "wider (YMM) registers allowed"; confirm in opAVX_X_X_XM
+void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x58, false); }
+void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x58, false); }
+void vsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5C, true); }
+void vsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5C, true); }
+void vsubsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5C, false); }
+void vsubss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5C, false); }
+void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x59, true); }
+void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x59, true); }
+void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x59, false); }
+void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x59, false); }
+void vdivpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5E, true); }
+void vdivps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5E, true); }
+void vdivsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5E, false); }
+void vdivss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5E, false); }
+void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5F, true); }
+void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5F, true); }
+void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5F, false); }
+void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5F, false); }
+void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5D, true); }
+void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5D, true); }
+void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5D, false); }
+void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5D, false); }
+void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x54, true); } // logic ops exist only in packed pd/ps flavours; codes 0x54..0x57
+void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x54, true); }
+void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x55, true); }
+void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x55, true); }
+void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x56, true); }
+void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x56, true); }
+void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x57, true); }
+void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x57, true); }
+void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); } // AVX imm8 family: MM_0F3A | PP_66, an explicit trailing 0 argument, then the immediate byte appended with db(imm)
+void vblendpd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); } // two-operand overload: xmm is passed as both the first and second register argument
+void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
+void vblendps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
+void vdppd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); } // bool is false here while vdpps passes true
+void vdppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); }
+void vdpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
+void vdpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
+void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x42, true, 0); db(imm); }
+void vmpsadbw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x42, true, 0); db(imm); }
+void vpblendw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0E, true, 0); db(imm); }
+void vpblendw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0E, true, 0); db(imm); }
+void vpblendd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x02, true, 0); db(imm); }
+void vpblendd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x02, true, 0); db(imm); }
+void vroundsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); } // scalar entries (vroundsd/vroundss) pass false for the bool
+void vroundsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); }
+void vroundss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
+void vroundss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
+void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
+void vpclmulqdq(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
+void vpermilps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x0C, true, 0); } // three-operand-only entries in the MM_0F38 | PP_66 space; no two-operand overload is declared in this run
+void vpermilpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x0D, true, 0); }
+void vpsllvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x47, true, 0); } // d/q pairs share the code byte and differ only in the last argument (0 for d, 1 for q) - presumably a W-bit selector, confirm in opAVX_X_X_XM
+void vpsllvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x47, true, 1); }
+void vpsravd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x46, true, 0); } // no q counterpart is declared for code 0x46
+void vpsrlvd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x45, true, 0); }
+void vpsrlvq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x45, true, 1); }
+void vcmppd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); } // vcmp*: shared code 0xC2, last argument -1, predicate byte appended with db(imm); suffix picks the PP_* flag as in vadd*
+void vcmppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); } // two-operand overload: xmm is passed as both the first and second register argument
+void vcmpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F, 0xC2, true, -1); db(imm); }
+void vcmpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0xC2, true, -1); db(imm); }
+void vcmpsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
+void vcmpsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
+void vcmpss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
+void vcmpss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
+void vcvtsd2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x5A, false, -1); } // both scalar conversions share code 0x5A and differ only in PP_F2 vs PP_F3
+void vcvtsd2ss(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0x5A, false, -1); }
+void vcvtss2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x5A, false, -1); }
+void vcvtss2sd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0x5A, false, -1); }
+void vinsertps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x21, false, 0); db(imm); } // MM_0F3A entry with immediate: last argument is 0 rather than -1
+void vinsertps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x21, false, 0); db(imm); }
+// vpack* family. Each two-operand overload forwards to its three-operand sibling,
+// passing the destination as the first source so the opcode is spelled only once.
+void vpacksswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x63, true, -1); }
+void vpacksswb(const Xmm& xmm, const Operand& op) { vpacksswb(xmm, xmm, op); }
+void vpackssdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x6B, true, -1); }
+void vpackssdw(const Xmm& xmm, const Operand& op) { vpackssdw(xmm, xmm, op); }
+void vpackuswb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x67, true, -1); }
+void vpackuswb(const Xmm& xmm, const Operand& op) { vpackuswb(xmm, xmm, op); }
+void vpackusdw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x2B, true, -1); }
+void vpackusdw(const Xmm& xmm, const Operand& op) { vpackusdw(xmm, xmm, op); }
+// vpadd{b,w,d,q}, vpadds{b,w}, vpaddus{b,w}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpaddb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xFC, true, -1); }
+void vpaddb(const Xmm& xmm, const Operand& op) { vpaddb(xmm, xmm, op); }
+void vpaddw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xFD, true, -1); }
+void vpaddw(const Xmm& xmm, const Operand& op) { vpaddw(xmm, xmm, op); }
+void vpaddd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xFE, true, -1); }
+void vpaddd(const Xmm& xmm, const Operand& op) { vpaddd(xmm, xmm, op); }
+void vpaddq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD4, true, -1); }
+void vpaddq(const Xmm& xmm, const Operand& op) { vpaddq(xmm, xmm, op); }
+void vpaddsb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xEC, true, -1); }
+void vpaddsb(const Xmm& xmm, const Operand& op) { vpaddsb(xmm, xmm, op); }
+void vpaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xED, true, -1); }
+void vpaddsw(const Xmm& xmm, const Operand& op) { vpaddsw(xmm, xmm, op); }
+void vpaddusb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDC, true, -1); }
+void vpaddusb(const Xmm& xmm, const Operand& op) { vpaddusb(xmm, xmm, op); }
+void vpaddusw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDD, true, -1); }
+void vpaddusw(const Xmm& xmm, const Operand& op) { vpaddusw(xmm, xmm, op); }
+// vpalignr (opcode 0x0F, immediate emitted afterwards with db()).
+void vpalignr(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x0F, true, -1); db(imm); }
+void vpalignr(const Xmm& xmm, const Operand& op, uint8 imm) { vpalignr(xmm, xmm, op, imm); }
+// vpand, vpandn, vpavgb, vpavgw.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpand(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDB, true, -1); }
+void vpand(const Xmm& xmm, const Operand& op) { vpand(xmm, xmm, op); }
+void vpandn(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDF, true, -1); }
+void vpandn(const Xmm& xmm, const Operand& op) { vpandn(xmm, xmm, op); }
+void vpavgb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE0, true, -1); }
+void vpavgb(const Xmm& xmm, const Operand& op) { vpavgb(xmm, xmm, op); }
+void vpavgw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE3, true, -1); }
+void vpavgw(const Xmm& xmm, const Operand& op) { vpavgw(xmm, xmm, op); }
+// vpcmpeq{b,w,d,q} and vpcmpgt{b,w,d,q}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x74, true, -1); }
+void vpcmpeqb(const Xmm& xmm, const Operand& op) { vpcmpeqb(xmm, xmm, op); }
+void vpcmpeqw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x75, true, -1); }
+void vpcmpeqw(const Xmm& xmm, const Operand& op) { vpcmpeqw(xmm, xmm, op); }
+void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x76, true, -1); }
+void vpcmpeqd(const Xmm& xmm, const Operand& op) { vpcmpeqd(xmm, xmm, op); }
+void vpcmpeqq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x29, true, -1); }
+void vpcmpeqq(const Xmm& xmm, const Operand& op) { vpcmpeqq(xmm, xmm, op); }
+void vpcmpgtb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x64, true, -1); }
+void vpcmpgtb(const Xmm& xmm, const Operand& op) { vpcmpgtb(xmm, xmm, op); }
+void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x65, true, -1); }
+void vpcmpgtw(const Xmm& xmm, const Operand& op) { vpcmpgtw(xmm, xmm, op); }
+void vpcmpgtd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x66, true, -1); }
+void vpcmpgtd(const Xmm& xmm, const Operand& op) { vpcmpgtd(xmm, xmm, op); }
+void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x37, true, -1); }
+void vpcmpgtq(const Xmm& xmm, const Operand& op) { vpcmpgtq(xmm, xmm, op); }
+// vphadd{w,d,sw} and vphsub{w,d,sw}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vphaddw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x01, true, -1); }
+void vphaddw(const Xmm& xmm, const Operand& op) { vphaddw(xmm, xmm, op); }
+void vphaddd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x02, true, -1); }
+void vphaddd(const Xmm& xmm, const Operand& op) { vphaddd(xmm, xmm, op); }
+void vphaddsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x03, true, -1); }
+void vphaddsw(const Xmm& xmm, const Operand& op) { vphaddsw(xmm, xmm, op); }
+void vphsubw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x05, true, -1); }
+void vphsubw(const Xmm& xmm, const Operand& op) { vphsubw(xmm, xmm, op); }
+void vphsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x06, true, -1); }
+void vphsubd(const Xmm& xmm, const Operand& op) { vphsubd(xmm, xmm, op); }
+void vphsubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x07, true, -1); }
+void vphsubsw(const Xmm& xmm, const Operand& op) { vphsubsw(xmm, xmm, op); }
+// vpmaddwd / vpmaddubsw. Two-operand overloads reuse the destination as first source.
+void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF5, true, -1); }
+void vpmaddwd(const Xmm& xmm, const Operand& op) { vpmaddwd(xmm, xmm, op); }
+void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x04, true, -1); }
+void vpmaddubsw(const Xmm& xmm, const Operand& op) { vpmaddubsw(xmm, xmm, op); }
+// vpmax{sb,sw,sd,ub,uw,ud} and vpmin{sb,sw,sd,ub,uw,ud}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpmaxsb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3C, true, -1); }
+void vpmaxsb(const Xmm& xmm, const Operand& op) { vpmaxsb(xmm, xmm, op); }
+void vpmaxsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xEE, true, -1); }
+void vpmaxsw(const Xmm& xmm, const Operand& op) { vpmaxsw(xmm, xmm, op); }
+void vpmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3D, true, -1); }
+void vpmaxsd(const Xmm& xmm, const Operand& op) { vpmaxsd(xmm, xmm, op); }
+void vpmaxub(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDE, true, -1); }
+void vpmaxub(const Xmm& xmm, const Operand& op) { vpmaxub(xmm, xmm, op); }
+void vpmaxuw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3E, true, -1); }
+void vpmaxuw(const Xmm& xmm, const Operand& op) { vpmaxuw(xmm, xmm, op); }
+void vpmaxud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3F, true, -1); }
+void vpmaxud(const Xmm& xmm, const Operand& op) { vpmaxud(xmm, xmm, op); }
+void vpminsb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x38, true, -1); }
+void vpminsb(const Xmm& xmm, const Operand& op) { vpminsb(xmm, xmm, op); }
+void vpminsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xEA, true, -1); }
+void vpminsw(const Xmm& xmm, const Operand& op) { vpminsw(xmm, xmm, op); }
+void vpminsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x39, true, -1); }
+void vpminsd(const Xmm& xmm, const Operand& op) { vpminsd(xmm, xmm, op); }
+void vpminub(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xDA, true, -1); }
+void vpminub(const Xmm& xmm, const Operand& op) { vpminub(xmm, xmm, op); }
+void vpminuw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3A, true, -1); }
+void vpminuw(const Xmm& xmm, const Operand& op) { vpminuw(xmm, xmm, op); }
+void vpminud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x3B, true, -1); }
+void vpminud(const Xmm& xmm, const Operand& op) { vpminud(xmm, xmm, op); }
+// vpmulhuw, vpmulhrsw, vpmulhw, vpmullw, vpmulld.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpmulhuw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE4, true, -1); }
+void vpmulhuw(const Xmm& xmm, const Operand& op) { vpmulhuw(xmm, xmm, op); }
+void vpmulhrsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x0B, true, -1); }
+void vpmulhrsw(const Xmm& xmm, const Operand& op) { vpmulhrsw(xmm, xmm, op); }
+void vpmulhw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE5, true, -1); }
+void vpmulhw(const Xmm& xmm, const Operand& op) { vpmulhw(xmm, xmm, op); }
+void vpmullw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD5, true, -1); }
+void vpmullw(const Xmm& xmm, const Operand& op) { vpmullw(xmm, xmm, op); }
+void vpmulld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x40, true, -1); }
+void vpmulld(const Xmm& xmm, const Operand& op) { vpmulld(xmm, xmm, op); }
+// vpmuludq (opcode 0xF4 in the 66 0F map).
+// The sixth argument was `false`, unlike vpmuldq and every other packed-integer op in this table,
+// which blocked the 256-bit form. VPMULUDQ has a VEX.256 encoding (AVX2), so the flag is now `true`;
+// xmm callers encode exactly as before.
+void vpmuludq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF4, true, -1); }
+// Two-operand shorthand: the destination is also the first source.
+void vpmuludq(const Xmm& xmm, const Operand& op) { vpmuludq(xmm, xmm, op); }
+// vpmuldq (opcode 0x28). The two-operand overload reuses the destination as first source.
+void vpmuldq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x28, true, -1); }
+void vpmuldq(const Xmm& xmm, const Operand& op) { vpmuldq(xmm, xmm, op); }
+// vpor / vpsadbw. Two-operand overloads reuse the destination as first source.
+void vpor(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xEB, true, -1); }
+void vpor(const Xmm& xmm, const Operand& op) { vpor(xmm, xmm, op); }
+void vpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF6, true, -1); }
+void vpsadbw(const Xmm& xmm, const Operand& op) { vpsadbw(xmm, xmm, op); }
+// vpshufb: only the three-operand form is defined next to its siblings here (no two-operand shorthand).
+void vpshufb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x00, true, -1); }
+// vpsign{b,w,d}. Two-operand overloads forward with the destination as first source.
+void vpsignb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x08, true, -1); }
+void vpsignb(const Xmm& xmm, const Operand& op) { vpsignb(xmm, xmm, op); }
+void vpsignw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x09, true, -1); }
+void vpsignw(const Xmm& xmm, const Operand& op) { vpsignw(xmm, xmm, op); }
+void vpsignd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F38 | PP_66, 0x0A, true, -1); }
+void vpsignd(const Xmm& xmm, const Operand& op) { vpsignd(xmm, xmm, op); }
+// Shifts whose count comes from a register/memory operand: vpsll{w,d,q}, vpsra{w,d}, vpsrl{w,d,q}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpsllw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF1, true, -1); }
+void vpsllw(const Xmm& xmm, const Operand& op) { vpsllw(xmm, xmm, op); }
+void vpslld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF2, true, -1); }
+void vpslld(const Xmm& xmm, const Operand& op) { vpslld(xmm, xmm, op); }
+void vpsllq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF3, true, -1); }
+void vpsllq(const Xmm& xmm, const Operand& op) { vpsllq(xmm, xmm, op); }
+void vpsraw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE1, true, -1); }
+void vpsraw(const Xmm& xmm, const Operand& op) { vpsraw(xmm, xmm, op); }
+void vpsrad(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE2, true, -1); }
+void vpsrad(const Xmm& xmm, const Operand& op) { vpsrad(xmm, xmm, op); }
+void vpsrlw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD1, true, -1); }
+void vpsrlw(const Xmm& xmm, const Operand& op) { vpsrlw(xmm, xmm, op); }
+void vpsrld(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD2, true, -1); }
+void vpsrld(const Xmm& xmm, const Operand& op) { vpsrld(xmm, xmm, op); }
+void vpsrlq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD3, true, -1); }
+void vpsrlq(const Xmm& xmm, const Operand& op) { vpsrlq(xmm, xmm, op); }
+// vpsub{b,w,d,q}, vpsubs{b,w}, vpsubus{b,w}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpsubb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF8, true, -1); }
+void vpsubb(const Xmm& xmm, const Operand& op) { vpsubb(xmm, xmm, op); }
+void vpsubw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xF9, true, -1); }
+void vpsubw(const Xmm& xmm, const Operand& op) { vpsubw(xmm, xmm, op); }
+void vpsubd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xFA, true, -1); }
+void vpsubd(const Xmm& xmm, const Operand& op) { vpsubd(xmm, xmm, op); }
+void vpsubq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xFB, true, -1); }
+void vpsubq(const Xmm& xmm, const Operand& op) { vpsubq(xmm, xmm, op); }
+void vpsubsb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE8, true, -1); }
+void vpsubsb(const Xmm& xmm, const Operand& op) { vpsubsb(xmm, xmm, op); }
+void vpsubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xE9, true, -1); }
+void vpsubsw(const Xmm& xmm, const Operand& op) { vpsubsw(xmm, xmm, op); }
+void vpsubusb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD8, true, -1); }
+void vpsubusb(const Xmm& xmm, const Operand& op) { vpsubusb(xmm, xmm, op); }
+void vpsubusw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xD9, true, -1); }
+void vpsubusw(const Xmm& xmm, const Operand& op) { vpsubusw(xmm, xmm, op); }
+// vpunpckh{bw,wd,dq,qdq} and vpunpckl{bw,wd,dq,qdq}.
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vpunpckhbw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x68, true, -1); }
+void vpunpckhbw(const Xmm& xmm, const Operand& op) { vpunpckhbw(xmm, xmm, op); }
+void vpunpckhwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x69, true, -1); }
+void vpunpckhwd(const Xmm& xmm, const Operand& op) { vpunpckhwd(xmm, xmm, op); }
+void vpunpckhdq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x6A, true, -1); }
+void vpunpckhdq(const Xmm& xmm, const Operand& op) { vpunpckhdq(xmm, xmm, op); }
+void vpunpckhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x6D, true, -1); }
+void vpunpckhqdq(const Xmm& xmm, const Operand& op) { vpunpckhqdq(xmm, xmm, op); }
+void vpunpcklbw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x60, true, -1); }
+void vpunpcklbw(const Xmm& xmm, const Operand& op) { vpunpcklbw(xmm, xmm, op); }
+void vpunpcklwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x61, true, -1); }
+void vpunpcklwd(const Xmm& xmm, const Operand& op) { vpunpcklwd(xmm, xmm, op); }
+void vpunpckldq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x62, true, -1); }
+void vpunpckldq(const Xmm& xmm, const Operand& op) { vpunpckldq(xmm, xmm, op); }
+void vpunpcklqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x6C, true, -1); }
+void vpunpcklqdq(const Xmm& xmm, const Operand& op) { vpunpcklqdq(xmm, xmm, op); }
+// vpxor (opcode 0xEF). The two-operand overload reuses the destination as first source.
+void vpxor(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xEF, true, -1); }
+void vpxor(const Xmm& xmm, const Operand& op) { vpxor(xmm, xmm, op); }
+// vrcpss / vrsqrtss. Two-operand overloads reuse the destination as first source.
+void vrcpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x53, false, -1); }
+void vrcpss(const Xmm& xmm, const Operand& op) { vrcpss(xmm, xmm, op); }
+void vrsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x52, false, -1); }
+void vrsqrtss(const Xmm& xmm, const Operand& op) { vrsqrtss(xmm, xmm, op); }
+// vshufpd / vshufps (opcode 0xC6, immediate emitted afterwards with db()).
+// The shorter overloads forward to the full ones with the destination repeated as first source.
+void vshufpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0xC6, true, -1); db(imm); }
+void vshufpd(const Xmm& xmm, const Operand& op, uint8 imm) { vshufpd(xmm, xmm, op, imm); }
+void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { opAVX_X_X_XM(x1, x2, op, MM_0F, 0xC6, true, -1); db(imm); }
+void vshufps(const Xmm& xmm, const Operand& op, uint8 imm) { vshufps(xmm, xmm, op, imm); }
+// vsqrtsd / vsqrtss (opcode 0x51). Two-operand overloads reuse the destination as first source.
+void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x51, false, -1); }
+void vsqrtsd(const Xmm& xmm, const Operand& op) { vsqrtsd(xmm, xmm, op); }
+void vsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x51, false, -1); }
+void vsqrtss(const Xmm& xmm, const Operand& op) { vsqrtss(xmm, xmm, op); }
+// vunpckh{pd,ps} (opcode 0x15) and vunpckl{pd,ps} (opcode 0x14).
+// Two-operand overloads forward to the three-operand ones with the destination as first source.
+void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x15, true, -1); }
+void vunpckhpd(const Xmm& xmm, const Operand& op) { vunpckhpd(xmm, xmm, op); }
+void vunpckhps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F, 0x15, true, -1); }
+void vunpckhps(const Xmm& xmm, const Operand& op) { vunpckhps(xmm, xmm, op); }
+void vunpcklpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F | PP_66, 0x14, true, -1); }
+void vunpcklpd(const Xmm& xmm, const Operand& op) { vunpcklpd(xmm, xmm, op); }
+void vunpcklps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, MM_0F, 0x14, true, -1); }
+void vunpcklps(const Xmm& xmm, const Operand& op) { vunpcklps(xmm, xmm, op); }
+// Register, register/memory, immediate forms. All use the MM_0F3A | PP_66 type and hand the
+// 8-bit immediate to opAVX_X_XM_IMM as its final argument instead of emitting it with db().
+void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0xDF, false, 0, imm); }
+void vroundpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x09, true, 0, imm); }
+void vroundps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x08, true, 0, imm); }
+void vpermilpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x05, true, 0, imm); }
+void vpermilps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x04, true, 0, imm); }
+void vpcmpestri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x61, false, 0, imm); }
+void vpcmpestrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x60, false, 0, imm); }
+void vpcmpistri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x63, false, 0, imm); }
+void vpcmpistrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x62, false, 0, imm); }
+// Two-operand forms routed through opAVX_X_XM_IMM without the trailing immediate argument:
+// vtestps/vtestpd, vcomisd/vcomiss, and the dword<->single conversions sharing opcode 0x5B
+// (distinguished only by the PP_* prefix bits in the type argument).
+void vtestps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0E, true, 0); }
+void vtestpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0F, true, 0); }
+void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2F, false, -1); }
+void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x2F, false, -1); }
+void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x5B, true, -1); }
+void vcvtps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x5B, true, -1); }
+void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x5B, true, -1); }
+// Moves and duplicating moves with a register destination and a register/memory source.
+// The (Address, Xmm) overloads of the vmov[au]p[sd] / vmovdq[au] names further down use different opcodes.
+void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x28, true, -1); }
+void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x28, true, -1); }
+void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x12, true, -1); }
+void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x6F, true, -1); }
+void vmovdqu(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x6F, true, -1); }
+void vmovshdup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x16, true, -1); }
+void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x12, true, -1); }
+void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x10, true, -1); }
+void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x10, true, -1); }
+// Unary integer ops in the MM_0F38 | PP_66 group: vpabs{b,w,d} (0x1C-0x1E), vphminposuw (0x41),
+// vpmovsx* (0x20-0x25) and vpmovzx* (0x30-0x35). vphminposuw is the only one passing `false`
+// as the fifth argument.
+void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1C, true, -1); }
+void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1D, true, -1); }
+void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1E, true, -1); }
+void vphminposuw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x41, false, -1); }
+void vpmovsxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x20, true, -1); }
+void vpmovsxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x21, true, -1); }
+void vpmovsxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x22, true, -1); }
+void vpmovsxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x23, true, -1); }
+void vpmovsxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x24, true, -1); }
+void vpmovsxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x25, true, -1); }
+void vpmovzxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x30, true, -1); }
+void vpmovzxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x31, true, -1); }
+void vpmovzxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x32, true, -1); }
+void vpmovzxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x33, true, -1); }
+void vpmovzxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x34, true, -1); }
+void vpmovzxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x35, true, -1); }
+// vpshufd / vpshufhw / vpshuflw share opcode 0x70 and differ only in the PP_* prefix bits;
+// their immediate is passed through to opAVX_X_XM_IMM. vptest takes no immediate.
+void vpshufd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x70, true, -1, imm); }
+void vpshufhw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x70, true, -1, imm); }
+void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x70, true, -1, imm); }
+void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, true, -1); }
+// Packed reciprocal / reciprocal-sqrt / sqrt (0x53, 0x52, 0x51) and the unordered scalar
+// compares vucomisd/vucomiss (0x2E). None of these takes an immediate.
+void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x53, true, -1); }
+void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x52, true, -1); }
+void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x51, true, -1); }
+void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x51, true, -1); }
+void vucomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2E, false, -1); }
+void vucomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x2E, false, -1); }
+// Address-destination overloads of the vmov* names: the memory operand comes first in the
+// C++ signature, but the register is still passed first to opAVX_X_XM_IMM. Each uses a
+// different opcode from its register-destination counterpart (e.g. 0x29 vs 0x28, 0x7F vs 0x6F).
+void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x29, true, -1); }
+void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x29, true, -1); }
+void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x7F, true, -1); }
+void vmovdqu(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_F3, 0x7F, true, -1); }
+void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x11, true, -1); }
+void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x11, true, -1); }
+// vaddsub*, vhadd*, vhsub*: a single overload each, where the last operand defaults to an
+// empty Operand() and both operands are forwarded unchanged to opAVX_X_X_XM.
+// NOTE(review): presumably opAVX_X_X_XM treats the defaulted op2 as "two-operand form" - confirm in its definition.
+void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0xD0, true, -1); }
+void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0xD0, true, -1); }
+void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7C, true, -1); }
+void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7C, true, -1); }
+void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7D, true, -1); }
+void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7D, true, -1); }
+// AES round instructions (opcodes 0xDC-0xDF in the MM_0F38 | PP_66 group), using the same
+// defaulted-last-operand signature as vaddsub*/vhadd*. All pass `false` and 0 as the
+// final two arguments.
+void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDC, false, 0); }
+void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDD, false, 0); }
+void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDE, false, 0); }
+void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDF, false, 0); }
+// Masked moves. Each mnemonic has a register-destination overload and an address-destination
+// overload with its own opcode. In the address-destination overloads the register arguments
+// are deliberately swapped when forwarded (x2 first, then x1).
+// vpmaskmovd and vpmaskmovq share opcodes 0x8C/0x8E and differ only in the last argument (0 vs 1);
+// NOTE(review): that argument is presumably the VEX.W bit - confirm against opAVX_X_X_XM.
+void vmaskmovps(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, MM_0F38 | PP_66, 0x2C, true, 0); }
+void vmaskmovps(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, MM_0F38 | PP_66, 0x2E, true, 0); }
+void vmaskmovpd(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, MM_0F38 | PP_66, 0x2D, true, 0); }
+void vmaskmovpd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, MM_0F38 | PP_66, 0x2F, true, 0); }
+void vpmaskmovd(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, MM_0F38 | PP_66, 0x8C, true, 0); }
+void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, MM_0F38 | PP_66, 0x8E, true, 0); }
+void vpmaskmovq(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, MM_0F38 | PP_66, 0x8C, true, 1); }
+void vpmaskmovq(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, MM_0F38 | PP_66, 0x8E, true, 1); }
+// Permutes whose register parameters are typed Ymm rather than Xmm.
+// vpermd/vpermps take three operands; vpermq/vpermpd take a source plus an immediate and
+// pass 1 (instead of 0) as the sixth argument to opAVX_X_XM_IMM.
+void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, MM_0F38 | PP_66, 0x36, true, 0); }
+void vpermps(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, MM_0F38 | PP_66, 0x16, true, 0); }
+void vpermq(const Ymm& y, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(y, op, MM_0F3A | PP_66, 0x00, true, 1, imm); }
+void vpermpd(const Ymm& y, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(y, op, MM_0F3A | PP_66, 0x01, true, 1, imm); }
+// Named packed-double compare aliases for predicate immediates 0..7.
+// Each predicate gets three wrappers: cmp<pred>pd -> cmppd(x, op, n), plus three- and
+// two-operand vcmp<pred>pd -> vcmppd(..., n). Predicate numbering used here:
+// 0 eq, 1 lt, 2 le, 3 unord, 4 neq, 5 nlt, 6 nle, 7 ord.
+void cmpeqpd(const Xmm& x, const Operand& op) { cmppd(x, op, 0); }
+void vcmpeqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 0); }
+void vcmpeqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 0); }
+void cmpltpd(const Xmm& x, const Operand& op) { cmppd(x, op, 1); }
+void vcmpltpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 1); }
+void vcmpltpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 1); }
+void cmplepd(const Xmm& x, const Operand& op) { cmppd(x, op, 2); }
+void vcmplepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 2); }
+void vcmplepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 2); }
+void cmpunordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 3); }
+void vcmpunordpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 3); }
+void vcmpunordpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 3); }
+void cmpneqpd(const Xmm& x, const Operand& op) { cmppd(x, op, 4); }
+void vcmpneqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 4); }
+void vcmpneqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 4); }
+void cmpnltpd(const Xmm& x, const Operand& op) { cmppd(x, op, 5); }
+void vcmpnltpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 5); }
+void vcmpnltpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 5); }
+void cmpnlepd(const Xmm& x, const Operand& op) { cmppd(x, op, 6); }
+void vcmpnlepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 6); }
+void vcmpnlepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 6); }
+void cmpordpd(const Xmm& x, const Operand& op) { cmppd(x, op, 7); }
+void vcmpordpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 7); }
+void vcmpordpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 7); }
+void vcmpeq_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 8); } // vcmp<pred>pd aliases for immediates 8..31: only v-prefixed forms are defined in this run, as a three-operand/two-operand pair per predicate
+void vcmpeq_uqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 8); } // two-operand alias: forwards to the (x, op, imm) overload of vcmppd
+void vcmpngepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 9); }
+void vcmpngepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 9); }
+void vcmpngtpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 10); }
+void vcmpngtpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 10); }
+void vcmpfalsepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 11); }
+void vcmpfalsepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 11); }
+void vcmpneq_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 12); }
+void vcmpneq_oqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 12); }
+void vcmpgepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 13); }
+void vcmpgepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 13); }
+void vcmpgtpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 14); }
+void vcmpgtpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 14); }
+void vcmptruepd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 15); }
+void vcmptruepd(const Xmm& x, const Operand& op) { vcmppd(x, op, 15); }
+void vcmpeq_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 16); }
+void vcmpeq_ospd(const Xmm& x, const Operand& op) { vcmppd(x, op, 16); }
+void vcmplt_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 17); }
+void vcmplt_oqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 17); }
+void vcmple_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 18); }
+void vcmple_oqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 18); }
+void vcmpunord_spd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 19); }
+void vcmpunord_spd(const Xmm& x, const Operand& op) { vcmppd(x, op, 19); }
+void vcmpneq_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 20); }
+void vcmpneq_uspd(const Xmm& x, const Operand& op) { vcmppd(x, op, 20); }
+void vcmpnlt_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 21); }
+void vcmpnlt_uqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 21); }
+void vcmpnle_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 22); }
+void vcmpnle_uqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 22); }
+void vcmpord_spd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 23); }
+void vcmpord_spd(const Xmm& x, const Operand& op) { vcmppd(x, op, 23); }
+void vcmpeq_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 24); }
+void vcmpeq_uspd(const Xmm& x, const Operand& op) { vcmppd(x, op, 24); }
+void vcmpnge_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 25); }
+void vcmpnge_uqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 25); }
+void vcmpngt_uqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 26); }
+void vcmpngt_uqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 26); }
+void vcmpfalse_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 27); }
+void vcmpfalse_ospd(const Xmm& x, const Operand& op) { vcmppd(x, op, 27); }
+void vcmpneq_ospd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 28); }
+void vcmpneq_ospd(const Xmm& x, const Operand& op) { vcmppd(x, op, 28); }
+void vcmpge_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 29); }
+void vcmpge_oqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 29); }
+void vcmpgt_oqpd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 30); }
+void vcmpgt_oqpd(const Xmm& x, const Operand& op) { vcmppd(x, op, 30); }
+void vcmptrue_uspd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmppd(x1, x2, op, 31); }
+void vcmptrue_uspd(const Xmm& x, const Operand& op) { vcmppd(x, op, 31); }
+void cmpeqps(const Xmm& x, const Operand& op) { cmpps(x, op, 0); } // cmp<pred>ps aliases: each fixes the predicate immediate (0 here) and forwards to cmpps; the same three-overload pattern repeats for immediates 1..7
+void vcmpeqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 0); } // three-operand alias: forwards to vcmpps with the same immediate
+void vcmpeqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 0); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpps
+void cmpltps(const Xmm& x, const Operand& op) { cmpps(x, op, 1); }
+void vcmpltps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 1); }
+void vcmpltps(const Xmm& x, const Operand& op) { vcmpps(x, op, 1); }
+void cmpleps(const Xmm& x, const Operand& op) { cmpps(x, op, 2); }
+void vcmpleps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 2); }
+void vcmpleps(const Xmm& x, const Operand& op) { vcmpps(x, op, 2); }
+void cmpunordps(const Xmm& x, const Operand& op) { cmpps(x, op, 3); }
+void vcmpunordps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 3); }
+void vcmpunordps(const Xmm& x, const Operand& op) { vcmpps(x, op, 3); }
+void cmpneqps(const Xmm& x, const Operand& op) { cmpps(x, op, 4); }
+void vcmpneqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 4); }
+void vcmpneqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 4); }
+void cmpnltps(const Xmm& x, const Operand& op) { cmpps(x, op, 5); }
+void vcmpnltps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 5); }
+void vcmpnltps(const Xmm& x, const Operand& op) { vcmpps(x, op, 5); }
+void cmpnleps(const Xmm& x, const Operand& op) { cmpps(x, op, 6); }
+void vcmpnleps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 6); }
+void vcmpnleps(const Xmm& x, const Operand& op) { vcmpps(x, op, 6); }
+void cmpordps(const Xmm& x, const Operand& op) { cmpps(x, op, 7); }
+void vcmpordps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 7); }
+void vcmpordps(const Xmm& x, const Operand& op) { vcmpps(x, op, 7); }
+void vcmpeq_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 8); } // vcmp<pred>ps aliases for immediates 8..31: only v-prefixed forms are defined in this run, as a three-operand/two-operand pair per predicate
+void vcmpeq_uqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 8); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpps
+void vcmpngeps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 9); }
+void vcmpngeps(const Xmm& x, const Operand& op) { vcmpps(x, op, 9); }
+void vcmpngtps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 10); }
+void vcmpngtps(const Xmm& x, const Operand& op) { vcmpps(x, op, 10); }
+void vcmpfalseps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 11); }
+void vcmpfalseps(const Xmm& x, const Operand& op) { vcmpps(x, op, 11); }
+void vcmpneq_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 12); }
+void vcmpneq_oqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 12); }
+void vcmpgeps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 13); }
+void vcmpgeps(const Xmm& x, const Operand& op) { vcmpps(x, op, 13); }
+void vcmpgtps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 14); }
+void vcmpgtps(const Xmm& x, const Operand& op) { vcmpps(x, op, 14); }
+void vcmptrueps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 15); }
+void vcmptrueps(const Xmm& x, const Operand& op) { vcmpps(x, op, 15); }
+void vcmpeq_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 16); }
+void vcmpeq_osps(const Xmm& x, const Operand& op) { vcmpps(x, op, 16); }
+void vcmplt_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 17); }
+void vcmplt_oqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 17); }
+void vcmple_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 18); }
+void vcmple_oqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 18); }
+void vcmpunord_sps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 19); }
+void vcmpunord_sps(const Xmm& x, const Operand& op) { vcmpps(x, op, 19); }
+void vcmpneq_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 20); }
+void vcmpneq_usps(const Xmm& x, const Operand& op) { vcmpps(x, op, 20); }
+void vcmpnlt_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 21); }
+void vcmpnlt_uqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 21); }
+void vcmpnle_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 22); }
+void vcmpnle_uqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 22); }
+void vcmpord_sps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 23); }
+void vcmpord_sps(const Xmm& x, const Operand& op) { vcmpps(x, op, 23); }
+void vcmpeq_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 24); }
+void vcmpeq_usps(const Xmm& x, const Operand& op) { vcmpps(x, op, 24); }
+void vcmpnge_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 25); }
+void vcmpnge_uqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 25); }
+void vcmpngt_uqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 26); }
+void vcmpngt_uqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 26); }
+void vcmpfalse_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 27); }
+void vcmpfalse_osps(const Xmm& x, const Operand& op) { vcmpps(x, op, 27); }
+void vcmpneq_osps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 28); }
+void vcmpneq_osps(const Xmm& x, const Operand& op) { vcmpps(x, op, 28); }
+void vcmpge_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 29); }
+void vcmpge_oqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 29); }
+void vcmpgt_oqps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 30); }
+void vcmpgt_oqps(const Xmm& x, const Operand& op) { vcmpps(x, op, 30); }
+void vcmptrue_usps(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpps(x1, x2, op, 31); }
+void vcmptrue_usps(const Xmm& x, const Operand& op) { vcmpps(x, op, 31); }
+void cmpeqsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 0); } // cmp<pred>sd aliases: each fixes the predicate immediate (0 here) and forwards to the three-argument cmpsd; the same three-overload pattern repeats for immediates 1..7
+void vcmpeqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 0); } // three-operand alias: forwards to vcmpsd with the same immediate
+void vcmpeqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 0); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpsd
+void cmpltsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 1); }
+void vcmpltsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 1); }
+void vcmpltsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 1); }
+void cmplesd(const Xmm& x, const Operand& op) { cmpsd(x, op, 2); }
+void vcmplesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 2); }
+void vcmplesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 2); }
+void cmpunordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 3); }
+void vcmpunordsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 3); }
+void vcmpunordsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 3); }
+void cmpneqsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 4); }
+void vcmpneqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 4); }
+void vcmpneqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 4); }
+void cmpnltsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 5); }
+void vcmpnltsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 5); }
+void vcmpnltsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 5); }
+void cmpnlesd(const Xmm& x, const Operand& op) { cmpsd(x, op, 6); }
+void vcmpnlesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 6); }
+void vcmpnlesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 6); }
+void cmpordsd(const Xmm& x, const Operand& op) { cmpsd(x, op, 7); }
+void vcmpordsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 7); }
+void vcmpordsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 7); }
+void vcmpeq_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 8); } // vcmp<pred>sd aliases for immediates 8..31: only v-prefixed forms are defined in this run, as a three-operand/two-operand pair per predicate
+void vcmpeq_uqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 8); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpsd
+void vcmpngesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 9); }
+void vcmpngesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 9); }
+void vcmpngtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 10); }
+void vcmpngtsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 10); }
+void vcmpfalsesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 11); }
+void vcmpfalsesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 11); }
+void vcmpneq_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 12); }
+void vcmpneq_oqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 12); }
+void vcmpgesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 13); }
+void vcmpgesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 13); }
+void vcmpgtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 14); }
+void vcmpgtsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 14); }
+void vcmptruesd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 15); }
+void vcmptruesd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 15); }
+void vcmpeq_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 16); }
+void vcmpeq_ossd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 16); }
+void vcmplt_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 17); }
+void vcmplt_oqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 17); }
+void vcmple_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 18); }
+void vcmple_oqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 18); }
+void vcmpunord_ssd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 19); }
+void vcmpunord_ssd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 19); }
+void vcmpneq_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 20); }
+void vcmpneq_ussd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 20); }
+void vcmpnlt_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 21); }
+void vcmpnlt_uqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 21); }
+void vcmpnle_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 22); }
+void vcmpnle_uqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 22); }
+void vcmpord_ssd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 23); }
+void vcmpord_ssd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 23); }
+void vcmpeq_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 24); }
+void vcmpeq_ussd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 24); }
+void vcmpnge_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 25); }
+void vcmpnge_uqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 25); }
+void vcmpngt_uqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 26); }
+void vcmpngt_uqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 26); }
+void vcmpfalse_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 27); }
+void vcmpfalse_ossd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 27); }
+void vcmpneq_ossd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 28); }
+void vcmpneq_ossd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 28); }
+void vcmpge_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 29); }
+void vcmpge_oqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 29); }
+void vcmpgt_oqsd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 30); }
+void vcmpgt_oqsd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 30); }
+void vcmptrue_ussd(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpsd(x1, x2, op, 31); }
+void vcmptrue_ussd(const Xmm& x, const Operand& op) { vcmpsd(x, op, 31); }
+void cmpeqss(const Xmm& x, const Operand& op) { cmpss(x, op, 0); } // cmp<pred>ss aliases: each fixes the predicate immediate (0 here) and forwards to cmpss; the same three-overload pattern repeats for immediates 1..7
+void vcmpeqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 0); } // three-operand alias: forwards to vcmpss with the same immediate
+void vcmpeqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 0); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpss
+void cmpltss(const Xmm& x, const Operand& op) { cmpss(x, op, 1); }
+void vcmpltss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 1); }
+void vcmpltss(const Xmm& x, const Operand& op) { vcmpss(x, op, 1); }
+void cmpless(const Xmm& x, const Operand& op) { cmpss(x, op, 2); }
+void vcmpless(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 2); }
+void vcmpless(const Xmm& x, const Operand& op) { vcmpss(x, op, 2); }
+void cmpunordss(const Xmm& x, const Operand& op) { cmpss(x, op, 3); }
+void vcmpunordss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 3); }
+void vcmpunordss(const Xmm& x, const Operand& op) { vcmpss(x, op, 3); }
+void cmpneqss(const Xmm& x, const Operand& op) { cmpss(x, op, 4); }
+void vcmpneqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 4); }
+void vcmpneqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 4); }
+void cmpnltss(const Xmm& x, const Operand& op) { cmpss(x, op, 5); }
+void vcmpnltss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 5); }
+void vcmpnltss(const Xmm& x, const Operand& op) { vcmpss(x, op, 5); }
+void cmpnless(const Xmm& x, const Operand& op) { cmpss(x, op, 6); }
+void vcmpnless(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 6); }
+void vcmpnless(const Xmm& x, const Operand& op) { vcmpss(x, op, 6); }
+void cmpordss(const Xmm& x, const Operand& op) { cmpss(x, op, 7); }
+void vcmpordss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 7); }
+void vcmpordss(const Xmm& x, const Operand& op) { vcmpss(x, op, 7); }
+void vcmpeq_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 8); } // vcmp<pred>ss aliases for immediates 8..31: only v-prefixed forms are defined in this run, as a three-operand/two-operand pair per predicate
+void vcmpeq_uqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 8); } // two-operand alias: forwards to the (x, op, imm) overload of vcmpss
+void vcmpngess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 9); }
+void vcmpngess(const Xmm& x, const Operand& op) { vcmpss(x, op, 9); }
+void vcmpngtss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 10); }
+void vcmpngtss(const Xmm& x, const Operand& op) { vcmpss(x, op, 10); }
+void vcmpfalsess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 11); }
+void vcmpfalsess(const Xmm& x, const Operand& op) { vcmpss(x, op, 11); }
+void vcmpneq_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 12); }
+void vcmpneq_oqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 12); }
+void vcmpgess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 13); }
+void vcmpgess(const Xmm& x, const Operand& op) { vcmpss(x, op, 13); }
+void vcmpgtss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 14); }
+void vcmpgtss(const Xmm& x, const Operand& op) { vcmpss(x, op, 14); }
+void vcmptruess(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 15); }
+void vcmptruess(const Xmm& x, const Operand& op) { vcmpss(x, op, 15); }
+void vcmpeq_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 16); }
+void vcmpeq_osss(const Xmm& x, const Operand& op) { vcmpss(x, op, 16); }
+void vcmplt_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 17); }
+void vcmplt_oqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 17); }
+void vcmple_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 18); }
+void vcmple_oqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 18); }
+void vcmpunord_sss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 19); }
+void vcmpunord_sss(const Xmm& x, const Operand& op) { vcmpss(x, op, 19); }
+void vcmpneq_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 20); }
+void vcmpneq_usss(const Xmm& x, const Operand& op) { vcmpss(x, op, 20); }
+void vcmpnlt_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 21); }
+void vcmpnlt_uqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 21); }
+void vcmpnle_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 22); }
+void vcmpnle_uqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 22); }
+void vcmpord_sss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 23); }
+void vcmpord_sss(const Xmm& x, const Operand& op) { vcmpss(x, op, 23); }
+void vcmpeq_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 24); }
+void vcmpeq_usss(const Xmm& x, const Operand& op) { vcmpss(x, op, 24); }
+void vcmpnge_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 25); }
+void vcmpnge_uqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 25); }
+void vcmpngt_uqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 26); }
+void vcmpngt_uqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 26); }
+void vcmpfalse_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 27); }
+void vcmpfalse_osss(const Xmm& x, const Operand& op) { vcmpss(x, op, 27); }
+void vcmpneq_osss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 28); }
+void vcmpneq_osss(const Xmm& x, const Operand& op) { vcmpss(x, op, 28); }
+void vcmpge_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 29); }
+void vcmpge_oqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 29); }
+void vcmpgt_oqss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 30); }
+void vcmpgt_oqss(const Xmm& x, const Operand& op) { vcmpss(x, op, 30); }
+void vcmptrue_usss(const Xmm& x1, const Xmm& x2, const Operand& op) { vcmpss(x1, x2, op, 31); }
+void vcmptrue_usss(const Xmm& x, const Operand& op) { vcmpss(x, op, 31); }
+void vmovhpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x16, false); }
+void vmovhpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x17, false); }
+void vmovhps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, op1, op2, MM_0F, 0x16, false); }
+void vmovhps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x17, false); }
+void vmovlpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x12, false); }
+void vmovlpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x13, false); }
+void vmovlps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !op2.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x, op1, op2, MM_0F, 0x12, false); }
+void vmovlps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x13, false); }
+void vfmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 1); } // vfmadd{132,213,231}{pd,ps,sd,ss}: all forward to opAVX_X_X_XM with MM_0F38 | PP_66; op2 defaults to an empty Operand(); opcodes 0x98/0xA8/0xB8 for the packed forms
+void vfmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 1); }
+void vfmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 1); }
+void vfmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 0); } // ps/ss forms pass 0 as the final argument where pd/sd pass 1 (presumably VEX.W -- confirm against opAVX_X_X_XM)
+void vfmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 0); }
+void vfmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 0); }
+void vfmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 1); } // scalar forms use opcodes 0x99/0xA9/0xB9 and pass false as the sixth argument where packed forms pass true (presumably the YMM-support flag -- confirm)
+void vfmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 1); }
+void vfmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 1); }
+void vfmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 0); }
+void vfmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 0); }
+void vfmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 0); }
+void vfmaddsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 1); } // vfmaddsub{132,213,231}{pd,ps}: packed-only in this run; opcodes 0x96/0xA6/0xB6; final argument is 1 for pd and 0 for ps
+void vfmaddsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 1); }
+void vfmaddsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 1); }
+void vfmaddsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 0); }
+void vfmaddsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 0); }
+void vfmaddsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 0); }
+void vfmsubadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 1); } // vfmsubadd{132,213,231}{pd,ps}: packed-only in this run; opcodes 0x97/0xA7/0xB7; final argument is 1 for pd and 0 for ps
+void vfmsubadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 1); }
+void vfmsubadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 1); }
+void vfmsubadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 0); }
+void vfmsubadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 0); }
+void vfmsubadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 0); }
+void vfmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 1); } // vfmsub{132,213,231}{pd,ps,sd,ss}: packed forms use opcodes 0x9A/0xAA/0xBA; final argument is 1 for pd/sd and 0 for ps/ss
+void vfmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 1); }
+void vfmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 1); }
+void vfmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 0); }
+void vfmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 0); }
+void vfmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 0); }
+void vfmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 1); } // scalar forms use opcodes 0x9B/0xAB/0xBB and pass false as the sixth argument
+void vfmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 1); }
+void vfmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 1); }
+void vfmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 0); }
+void vfmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 0); }
+void vfmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 0); }
+void vfnmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 1); } // vfnmadd{132,213,231}{pd,ps,sd,ss}: packed forms use opcodes 0x9C/0xAC/0xBC; final argument is 1 for pd/sd and 0 for ps/ss
+void vfnmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 1); }
+void vfnmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 1); }
+void vfnmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 0); }
+void vfnmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 0); }
+void vfnmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 0); }
+void vfnmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 1); } // scalar forms use opcodes 0x9D/0xAD/0xBD and pass false as the sixth argument
+void vfnmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 1); }
+void vfnmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 1); }
+void vfnmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 0); }
+void vfnmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 0); }
+void vfnmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 0); }
+void vfnmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 1); } // vfnmsub{132,213,231}{pd,ps,sd,ss}: packed forms use opcodes 0x9E/0xAE/0xBE; final argument is 1 for pd/sd and 0 for ps/ss
+void vfnmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 1); }
+void vfnmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 1); }
+void vfnmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 0); }
+void vfnmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 0); }
+void vfnmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 0); }
+void vfnmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 1); } // scalar forms use opcodes 0x9F/0xAF/0xBF and pass false as the sixth argument
+void vfnmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 1); }
+void vfnmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 1); }
+void vfnmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 0); }
+void vfnmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 0); }
+void vfnmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 0); }
+// vaesimc and the broadcast family; all go through opAVX_X_XM_IMM with MM_0F38 | PP_66.
+// vbroadcastf128/vbroadcasti128 accept only a Ymm destination and an Address source (by signature).
+// vbroadcastsd requires a Ymm destination by signature. vbroadcastsd/ss and vpbroadcastb/w/d/q throw
+// ERR_BAD_COMBINATION unless the source operand is an XMM register or memory.
+void vaesimc(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0xDB, false, 0); }
+void vbroadcastf128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x1A, true, 0); }
+void vbroadcasti128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x5A, true, 0); }
+void vbroadcastsd(const Ymm& y, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(y, op, MM_0F38 | PP_66, 0x19, true, 0); }
+void vbroadcastss(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x18, true, 0); }
+void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x78, true, 0); }
+void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x79, true, 0); }
+void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x58, true, 0); }
+void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x59, true, 0); }
+// 128-bit lane extract/insert/permute plus vextractps. Each entry emits the instruction through
+// opAVX_X_X_XMcvt / opAVX_X_X_XM and then appends the immediate byte with db(imm).
+// vextractps throws ERR_BAD_COMBINATION unless op is a 32-bit register or memory and x is not a YMM register.
+// Extract forms have no second source, so xm0/ym0 is passed in that slot.
+void vextractf128(const Operand& op, const Ymm& y, uint8 imm) { opAVX_X_X_XMcvt(y, y.isXMM() ? xm0 : ym0, op, op.isXMM(), Operand::YMM, MM_0F3A | PP_66, 0x19, true, 0); db(imm); }
+void vextracti128(const Operand& op, const Ymm& y, uint8 imm) { opAVX_X_X_XMcvt(y, y.isXMM() ? xm0 : ym0, op, op.isXMM(), Operand::YMM, MM_0F3A | PP_66, 0x39, true, 0); db(imm); }
+void vextractps(const Operand& op, const Xmm& x, uint8 imm) { if (!(op.isREG(32) || op.isMEM()) || x.isYMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x.isXMM() ? xm0 : ym0, op, op.isREG(), Operand::XMM, MM_0F3A | PP_66, 0x17, false, 0); db(imm); }
+void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XMcvt(y1, y2, op, op.isXMM(), Operand::YMM, MM_0F3A | PP_66, 0x18, true, 0); db(imm); }
+void vinserti128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XMcvt(y1, y2, op, op.isXMM(), Operand::YMM, MM_0F3A | PP_66, 0x38, true, 0); db(imm); }
+void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, MM_0F3A | PP_66, 0x06, true, 0); db(imm); }
+void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, MM_0F3A | PP_66, 0x46, true, 0); db(imm); }
+// vlddqu, MXCSR load/store and vmaskmovdqu.
+// vldmxcsr and vstmxcsr share opcode byte 0xAE and differ only in the fixed first register (xm2 vs xm3).
+// NOTE(review): that register presumably supplies the ModRM.reg opcode extension (/2, /3) -- confirm against opAVX_X_X_XM.
+void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_F2, 0xF0, true, 0); }
+void vldmxcsr(const Address& addr) { opAVX_X_X_XM(xm2, xm0, addr, MM_0F, 0xAE, false, -1); }
+void vstmxcsr(const Address& addr) { opAVX_X_X_XM(xm3, xm0, addr, MM_0F, 0xAE, false, -1); }
+void vmaskmovdqu(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_66, 0xF7, false, -1); }
+// Element extract (vpextrb/w/d) and insert (vpinsrb/w/d). Each appends the immediate byte with db(imm).
+// Apart from the two vpextrw overloads, which rely on their parameter types, every entry throws
+// ERR_BAD_COMBINATION unless the general-purpose operand is a register of the accepted width or memory.
+// The two-operand vpinsr* overloads pass x as both destination and first source.
+void vpextrb(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(i32e) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, xm0, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x14, false); db(imm); }
+void vpextrw(const Reg& r, const Xmm& x, uint8 imm) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xC5, false, r.isBit(64) ? 1 : 0); db(imm); }
+void vpextrw(const Address& addr, const Xmm& x, uint8 imm) { opAVX_X_X_XM(x, xm0, addr, MM_0F3A | PP_66, 0x15, false); db(imm); }
+void vpextrd(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, xm0, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x16, false, 0); db(imm); }
+void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x1, x2, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x20, false); db(imm); }
+void vpinsrb(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x20, false); db(imm); }
+void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x1, x2, op, !op.isMEM(), Operand::XMM, MM_0F | PP_66, 0xC4, false); db(imm); }
+void vpinsrw(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x, op, !op.isMEM(), Operand::XMM, MM_0F | PP_66, 0xC4, false); db(imm); }
+void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x1, x2, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x22, false, 0); db(imm); }
+void vpinsrd(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x22, false, 0); db(imm); }
+// vpmovmskb followed by the shift-by-immediate family.
+// vpmovmskb wraps the general-purpose destination as Xmm/Ymm (matching the source width) to reuse opAVX_X_X_XM.
+// Shift entries pass a fixed register (xm2/xm3/xm4/xm6/xm7, or the ym* twin when the destination is YMM) as
+// the first argument and append the shift count with db(imm).
+// NOTE(review): the fixed register presumably encodes the opcode-extension digit in ModRM.reg -- confirm against opAVX_X_X_XM.
+// Opcode byte 0x71 is used by the *w forms, 0x72 by the *d forms, 0x73 by the *q and *dq forms.
+// The single-register overloads pass x for both remaining register operands.
+void vpmovmskb(const Reg32e& r, const Xmm& x) { bool isYMM= x.isYMM(); opAVX_X_X_XM(isYMM ? Ymm(r.getIdx()) : Xmm(r.getIdx()), isYMM ? ym0 : xm0, x, MM_0F | PP_66, 0xD7, true); }
+void vpslldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym7 : xm7, x1, x2, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpslldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym7 : xm7, x, x, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsrldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym3 : xm3, x1, x2, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsrldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym3 : xm3, x, x, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsllw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym6 : xm6, x1, x2, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpsllw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym6 : xm6, x, x, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpslld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym6 : xm6, x1, x2, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpslld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym6 : xm6, x, x, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpsllq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym6 : xm6, x1, x2, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsllq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym6 : xm6, x, x, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsraw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym4 : xm4, x1, x2, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpsraw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym4 : xm4, x, x, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpsrad(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym4 : xm4, x1, x2, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpsrad(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym4 : xm4, x, x, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpsrlw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym2 : xm2, x1, x2, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpsrlw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym2 : xm2, x, x, MM_0F | PP_66, 0x71, true); db(imm); }
+void vpsrld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym2 : xm2, x1, x2, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpsrld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym2 : xm2, x, x, MM_0F | PP_66, 0x72, true); db(imm); }
+void vpsrlq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(x1.isYMM() ? ym2 : xm2, x1, x2, MM_0F | PP_66, 0x73, true); db(imm); }
+void vpsrlq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(x.isYMM() ? ym2 : xm2, x, x, MM_0F | PP_66, 0x73, true); db(imm); }
+// Variable blends (vblendvpd, vblendvps, vpblendvb). The selector register x4 is emitted as a trailing
+// byte carrying its index in the upper four bits (db(x4.getIdx() << 4)).
+// The three-argument overloads pass x1 as both destination and first source.
+void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
+void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
+void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
+void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
+void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
+void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
+// vmovd / vmovq between XMM registers, 32-bit general-purpose registers and memory.
+// General-purpose registers are wrapped as Xmm(reg.getIdx()) so they can be passed to opAVX_X_X_XM.
+// vmovd uses opcode byte 0x6E when the XMM register is the destination and 0x7E when it is the source.
+// vmovq uses PP_F3 with 0x7E for loads and register moves, and PP_66 with 0xD6 for the store to memory.
+void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); }
+void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); }
+void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); }
+void vmovd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x7E, false, 0); }
+void vmovq(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x7E, false, -1); }
+void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0xD6, false, -1); }
+void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_F3, 0x7E, false, -1); }
+// vmovhlps/vmovlhps, sign-mask extraction and non-temporal stores.
+// vmovhlps/vmovlhps throw ERR_BAD_COMBINATION when the optional third operand is present but not an XMM register.
+// vmovmskpd/vmovmskps throw ERR_BAD_COMBINATION unless r satisfies isBit(i32e); r is then wrapped as
+// Xmm or Ymm (matching the source width) so it can be passed to opAVX_X_X_XM.
+// The vmovnt* stores take the memory destination first and pass xm0/ym0 in the unused source slot.
+void vmovhlps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, MM_0F, 0x12, false); }
+void vmovlhps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, MM_0F, 0x16, false); }
+void vmovmskpd(const Reg& r, const Xmm& x) { if (!r.isBit(i32e)) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), x.isXMM() ? xm0 : ym0, x, MM_0F | PP_66, 0x50, true, 0); }
+void vmovmskps(const Reg& r, const Xmm& x) { if (!r.isBit(i32e)) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x.isXMM() ? Xmm(r.getIdx()) : Ymm(r.getIdx()), x.isXMM() ? xm0 : ym0, x, MM_0F, 0x50, true, 0); }
+void vmovntdq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0xE7, true); }
+void vmovntpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0x2B, true); }
+void vmovntps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F, 0x2B, true); }
+// Non-temporal aligned load from memory. The unused source slot is filled with xm0/ym0 to match the
+// destination width, spelled with the same short register aliases as the neighbouring vmovnt* entries.
+void vmovntdqa(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F38 | PP_66, 0x2A, true); }
+// Scalar moves vmovsd (PP_F2) and vmovss (PP_F3).
+// Register forms throw ERR_BAD_COMBINATION when the optional third operand is present but not an XMM register.
+// Loads use opcode byte 0x10, stores to memory use 0x11; memory forms pass xm0 in the unused source slot.
+void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x10, false); }
+void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x10, false); }
+void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x11, false); }
+void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x10, false); }
+void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x10, false); }
+void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x11, false); }
+// Conversion instructions.
+// vcvt(t)ss2si / vcvt(t)sd2si (Reg32 destination): r is wrapped as Xmm(r.getIdx()) and 0 is passed as the last argument.
+// vcvtsi2ss / vcvtsi2sd: throw ERR_BAD_COMBINATION when op2 is present but is neither an i32e register nor memory;
+//   the last argument is -1 if either operand is memory, 0 if either is a 32-bit register, otherwise 1.
+// vcvtps2pd / vcvtdq2pd: the source must be memory or an XMM register, else ERR_BAD_COMBINATION.
+// vcvtpd2ps / vcvtpd2dq / vcvttpd2dq: the destination must not be YMM; it is re-wrapped as Ymm when the source is YMM.
+// vcvtph2ps / vcvtps2ph: call opVex directly; op must be memory or an XMM register. vcvtps2ph appends imm with db().
+void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 0); }
+void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 0); }
+void vcvtsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 0); }
+void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 0); }
+void vcvtsi2ss(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, op1, op2, op2.isREG(), Operand::XMM, MM_0F | PP_F3, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
+void vcvtsi2sd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, op1, op2, op2.isREG(), Operand::XMM, MM_0F | PP_F2, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
+void vcvtps2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x.isXMM() ? xm0 : ym0, op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM, MM_0F, 0x5A, true); }
+void vcvtdq2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x.isXMM() ? xm0 : ym0, op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM, MM_0F | PP_F3, 0xE6, true); }
+void vcvtpd2ps(const Xmm& x, const Operand& op) { if (x.isYMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0x5A, true); }
+void vcvtpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_F2, 0xE6, true); }
+void vcvttpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0xE6, true); }
+void vcvtph2ps(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opVex(x, NULL, &op, MM_0F38 | PP_66, 0x13, 0); }
+void vcvtps2ph(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isMEM() && !op.isXMM()) throw Error(ERR_BAD_COMBINATION); opVex(x, NULL, &op, MM_0F3A | PP_66, 0x1d, 0); db(imm); }
+// Overloads compiled only when XBYAK64 is defined: the Reg64 / 64-bit-element variants of
+// vmovq, vpextrq, vpinsrq and the scalar-to-integer conversions. Each passes 1 as the final argument.
+// NOTE(review): that final 1 presumably sets VEX.W for 64-bit operand size -- confirm against opAVX_X_X_XM.
+// vpextrq / vpinsrq throw ERR_BAD_COMBINATION unless op is a 64-bit register or memory.
+#ifdef XBYAK64
+void vmovq(const Xmm& x, const Reg64& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 1); }
+void vmovq(const Reg64& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 1); }
+void vpextrq(const Operand& op, const Xmm& x, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, xm0, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x16, false, 1); db(imm); }
+void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x1, x2, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x22, false, 1); db(imm); }
+void vpinsrq(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(64) && !op.isMEM()) throw Error(ERR_BAD_COMBINATION); opAVX_X_X_XMcvt(x, x, op, !op.isMEM(), Operand::XMM, MM_0F3A | PP_66, 0x22, false, 1); db(imm); }
+void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 1); }
+void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); }
+void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); }
+void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); }
+#endif
+// General-purpose VEX instructions, all encoded through opGpr in the MM_0F38 map.
+// andn/mulx/pdep/pext pass (r1, r2, op) with a final `true`; bextr/bzhi/sarx/shlx/shrx pass (r1, op, r2)
+// with a final `false`.
+// NOTE(review): the final flag presumably tells opGpr which operand order is in use -- confirm.
+// blsi/blsmsk/blsr share opcode byte 0xf3 and differ only in the synthetic first register
+// Reg32e(3 / 2 / 1, r.getBit()) -- presumably the ModRM.reg opcode extension; confirm against opGpr.
+void andn(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, MM_0F38, 0xf2, true); }
+void mulx(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, MM_0F38 | PP_F2, 0xf6, true); }
+void pdep(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, MM_0F38 | PP_F2, 0xf5, true); }
+void pext(const Reg32e& r1, const Reg32e& r2, const Operand& op) { opGpr(r1, r2, op, MM_0F38 | PP_F3, 0xf5, true); }
+void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, MM_0F38, 0xf7, false); }
+void bzhi(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, MM_0F38, 0xf5, false); }
+void sarx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, MM_0F38 | PP_F3, 0xf7, false); }
+void shlx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, MM_0F38 | PP_66, 0xf7, false); }
+void shrx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, MM_0F38 | PP_F2, 0xf7, false); }
+void blsi(const Reg32e& r, const Operand& op) { opGpr(Reg32e(3, r.getBit()), op, r, MM_0F38, 0xf3, false); }
+void blsmsk(const Reg32e& r, const Operand& op) { opGpr(Reg32e(2, r.getBit()), op, r, MM_0F38, 0xf3, false); }
+void blsr(const Reg32e& r, const Operand& op) { opGpr(Reg32e(1, r.getBit()), op, r, MM_0F38, 0xf3, false); }
+// Gather instructions, all encoded through opGather with MM_0F38 | PP_66.
+// Floating-point gathers use opcode bytes 0x92 (dword index) / 0x93 (qword index); integer gathers use 0x90 / 0x91.
+// NOTE(review): the two trailing integers vary per mnemonic; the first is 1 for the 64-bit element forms
+// (*pd, *q) and 0 otherwise, the second presumably selects the register-width rule -- confirm against opGather.
+void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x92, 1, 0); }
+void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x93, 1, 1); }
+void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x92, 0, 1); }
+void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x93, 0, 2); }
+void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 0, 1); }
+void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0, 2); }
+void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1, 0); }
+void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1, 1); }
diff --git a/plugins/GSdx_legacy/xbyak/xbyak_util.h b/plugins/GSdx_legacy/xbyak/xbyak_util.h
new file mode 100644
index 0000000000..22f0371f2b
--- /dev/null
+++ b/plugins/GSdx_legacy/xbyak/xbyak_util.h
@@ -0,0 +1,561 @@
+/* Copyright (c) 2007 MITSUNARI Shigeo
+* All rights reserved.
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+* 
+* Redistributions of source code must retain the above copyright notice, this
+* list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+* Neither the name of the copyright owner nor the names of its contributors may
+* be used to endorse or promote products derived from this software without
+* specific prior written permission.
+* 
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+* THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef XBYAK_XBYAK_UTIL_H_
+#define XBYAK_XBYAK_UTIL_H_
+
+/**
+	utility class and functions for Xbyak
+	Xbyak::util::Clock ; rdtsc timer
+	Xbyak::util::Cpu ; detect CPU
+	@note this header is UNDER CONSTRUCTION!
+*/
+#include "xbyak.h"
+
+// Select a CPUID primitive for the current toolchain:
+//   - MSVC older than 1400 on 32-bit: hand-written naked __cpuid(int data[4], int eaxIn)
+//   - other MSVC: the <intrin.h> intrinsics
+//   - GCC-compatible >= 4.3 (non-Apple): <cpuid.h>
+//   - everything else: inline-asm __cpuid / __cpuid_count macros defined below
+#ifdef _MSC_VER
+	#if (_MSC_VER < 1400) && defined(XBYAK32)
+		static inline __declspec(naked) void __cpuid(int[4], int)
+		{
+			__asm {
+				push	ebx
+				push	esi
+				// two pushes + return address precede the arguments on the stack
+				mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
+				cpuid
+				mov		esi, dword ptr [esp + 4 * 2 + 4] // data
+				mov		dword ptr [esi], eax
+				mov		dword ptr [esi + 4], ebx
+				mov		dword ptr [esi + 8], ecx
+				mov		dword ptr [esi + 12], edx
+				pop		esi
+				pop		ebx
+				ret
+			}
+		}
+	#else
+		#include <intrin.h> // for __cpuid
+	#endif
+#else
+	#ifndef __GNUC_PREREQ
+		#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
+	#endif
+	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
+		#include <cpuid.h>
+	#else
+		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
+			// EBX cannot be named as an output here, so it is saved, its CPUID result is copied
+			// into ESI (the "=S" output bound to b), and the original EBX is restored.
+			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
+		#else
+			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
+			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
+		#endif
+	#endif
+#endif
+
+// On MSVC, declare __xgetbv explicitly; Cpu::getXfeature() calls it as __xgetbv(0).
+#ifdef _MSC_VER
+extern "C" unsigned __int64 __xgetbv(int);
+#endif
+
+namespace Xbyak { namespace util {
+
+/**
+	CPU detection class.
+	The constructor executes CPUID (and XGETBV when OSXSAVE is reported) once and
+	records the detected features as a bit set of t* flags, queried with has().
+	Family / model / stepping decoded from CPUID leaf 1 are exposed as public fields.
+*/
+class Cpu {
+	uint64 type_; // bitwise OR of the t* flags detected by the constructor
+	// Pack the first four chars of x into one word with x[0] in the lowest byte.
+	// The constructor compares the result with the ECX word of CPUID leaf 0 (vendor string tail).
+	unsigned int get32bitAsBE(const char *x) const
+	{
+		return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
+	}
+	// Word with the low n bits set; n must be smaller than the width of unsigned int.
+	unsigned int mask(int n) const
+	{
+		return (1U << n) - 1;
+	}
+	// Decode stepping/model/family and their extended fields from EAX of CPUID leaf 1,
+	// then derive displayFamily and displayModel.
+	void setFamily()
+	{
+		unsigned int data[4];
+		getCpuid(1, data);
+		stepping = data[0] & mask(4);
+		model = (data[0] >> 4) & mask(4);
+		family = (data[0] >> 8) & mask(4);
+		// type = (data[0] >> 12) & mask(2);
+		extModel = (data[0] >> 16) & mask(4);
+		extFamily = (data[0] >> 20) & mask(8);
+		// the extended family only contributes when the base family is 0x0f
+		if (family == 0x0f) {
+			displayFamily = family + extFamily;
+		} else {
+			displayFamily = family;
+		}
+		// the extended model forms the high nibble for families 6 and 0x0f
+		if (family == 6 || family == 0x0f) {
+			displayModel = (extModel << 4) + model;
+		} else {
+			displayModel = model;
+		}
+	}
+public:
+	int model;
+	int family;
+	int stepping;
+	int extModel;
+	int extFamily;
+	int displayFamily; // family + extFamily
+	int displayModel; // model + extModel
+	/**
+		run CPUID
+		@param eaxIn [in] leaf number loaded into EAX
+		@param data [out] EAX, EBX, ECX, EDX in this order
+	*/
+	static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
+	{
+#ifdef _MSC_VER
+		__cpuid(reinterpret_cast<int*>(data), eaxIn);
+#else
+		__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
+#endif
+	}
+	/**
+		run CPUID with a sub-leaf
+		@param eaxIn [in] leaf number loaded into EAX
+		@param ecxIn [in] sub-leaf number loaded into ECX
+		@param data [out] EAX, EBX, ECX, EDX in this order
+	*/
+	static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
+	{
+#ifdef _MSC_VER
+		__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
+#else
+		__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
+#endif
+	}
+	/**
+		run XGETBV with ECX = 0
+		@return EDX:EAX combined into one 64-bit value
+		@note the constructor calls this only after CPUID reports OSXSAVE
+	*/
+	static inline uint64 getXfeature()
+	{
+#ifdef _MSC_VER
+		return __xgetbv(0);
+#else
+		unsigned int eax, edx;
+		// xgetbv is not supported on gcc 4.2, so its opcode bytes are emitted directly
+//		__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+		__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
+		return ((uint64)edx << 32) | eax;
+#endif
+	}
+	// Feature flags; bits 31 and above are built from uint64(1) because they do not fit a positive int shift.
+	typedef uint64 Type;
+	static const Type NONE = 0;
+	static const Type tMMX = 1 << 0;
+	static const Type tMMX2 = 1 << 1;
+	static const Type tCMOV = 1 << 2;
+	static const Type tSSE = 1 << 3;
+	static const Type tSSE2 = 1 << 4;
+	static const Type tSSE3 = 1 << 5;
+	static const Type tSSSE3 = 1 << 6;
+	static const Type tSSE41 = 1 << 7;
+	static const Type tSSE42 = 1 << 8;
+	static const Type tPOPCNT = 1 << 9;
+	static const Type tAESNI = 1 << 10;
+	static const Type tSSE5 = 1 << 11;
+	static const Type tOSXSAVE = 1 << 12;
+	static const Type tPCLMULQDQ = 1 << 13;
+	static const Type tAVX = 1 << 14;
+	static const Type tFMA = 1 << 15;
+
+	static const Type t3DN = 1 << 16;
+	static const Type tE3DN = 1 << 17;
+	static const Type tSSE4a = 1 << 18;
+	static const Type tRDTSCP = 1 << 19;
+	static const Type tAVX2 = 1 << 20;
+	static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
+	static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
+	static const Type tLZCNT = 1 << 23;
+
+	static const Type tINTEL = 1 << 24;
+	static const Type tAMD = 1 << 25;
+
+	static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
+	static const Type tRDRAND = 1 << 27;
+	static const Type tADX = 1 << 28; // adcx, adox
+	static const Type tRDSEED = 1 << 29; // rdseed
+	static const Type tSMAP = 1 << 30; // stac
+	static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
+	static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
+	static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
+	static const Type tMOVBE = uint64(1) << 34; // movbe
+
+	/**
+		detect the features of the running CPU
+		order: vendor (leaf 0), vendor specific extended leaf 0x80000001,
+		leaf 1 feature bits, XGETBV gate for AVX/FMA, leaf 7 bits, family/model
+	*/
+	Cpu()
+		: type_(NONE)
+	{
+		unsigned int data[4];
+		getCpuid(0, data);
+		const unsigned int maxNum = data[0];
+		// Capture the vendor word now: data[] is reused for leaf 0x80000001 inside the
+		// vendor branches, so testing data[2] again afterwards would compare feature bits.
+		const unsigned int vendorEcx = data[2];
+		static const char intel[] = "ntel";
+		static const char amd[] = "cAMD";
+		if (vendorEcx == get32bitAsBE(amd)) {
+			type_ |= tAMD;
+			getCpuid(0x80000001, data);
+			if (data[3] & (1U << 31)) type_ |= t3DN;
+			if (data[3] & (1U << 15)) type_ |= tCMOV;
+			if (data[3] & (1U << 30)) type_ |= tE3DN;
+			if (data[3] & (1U << 22)) type_ |= tMMX2;
+			if (data[3] & (1U << 27)) type_ |= tRDTSCP;
+		}
+		// NOTE(review): tLZCNT is only probed on the Intel path.
+		if (vendorEcx == get32bitAsBE(intel)) {
+			type_ |= tINTEL;
+			getCpuid(0x80000001, data);
+			if (data[3] & (1U << 27)) type_ |= tRDTSCP;
+			if (data[2] & (1U << 5)) type_ |= tLZCNT;
+		}
+		getCpuid(1, data);
+		if (data[2] & (1U << 0)) type_ |= tSSE3;
+		if (data[2] & (1U << 9)) type_ |= tSSSE3;
+		if (data[2] & (1U << 19)) type_ |= tSSE41;
+		if (data[2] & (1U << 20)) type_ |= tSSE42;
+		if (data[2] & (1U << 22)) type_ |= tMOVBE;
+		if (data[2] & (1U << 23)) type_ |= tPOPCNT;
+		if (data[2] & (1U << 25)) type_ |= tAESNI;
+		if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
+		if (data[2] & (1U << 27)) type_ |= tOSXSAVE;
+		if (data[2] & (1U << 30)) type_ |= tRDRAND;
+		if (data[2] & (1U << 29)) type_ |= tF16C;
+
+		if (data[3] & (1U << 15)) type_ |= tCMOV;
+		if (data[3] & (1U << 23)) type_ |= tMMX;
+		if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
+		if (data[3] & (1U << 26)) type_ |= tSSE2;
+
+		if (type_ & tOSXSAVE) {
+			// check XFEATURE_ENABLED_MASK[2:1] = '11b'
+			uint64 bv = getXfeature();
+			if ((bv & 6) == 6) {
+				// data[] still holds leaf 1 here
+				if (data[2] & (1U << 28)) type_ |= tAVX;
+				if (data[2] & (1U << 12)) type_ |= tFMA;
+			}
+		}
+		if (maxNum >= 7) {
+			getCpuidEx(7, 0, data);
+			// AVX2 is reported only when AVX itself passed the XGETBV gate above
+			if (type_ & tAVX && data[1] & 0x20) type_ |= tAVX2;
+			if (data[1] & (1U << 3)) type_ |= tBMI1;
+			if (data[1] & (1U << 8)) type_ |= tBMI2;
+			if (data[1] & (1U << 9)) type_ |= tENHANCED_REP;
+			if (data[1] & (1U << 18)) type_ |= tRDSEED;
+			if (data[1] & (1U << 19)) type_ |= tADX;
+			if (data[1] & (1U << 20)) type_ |= tSMAP;
+			if (data[1] & (1U << 4)) type_ |= tHLE;
+			if (data[1] & (1U << 11)) type_ |= tRTM;
+		}
+		setFamily();
+	}
+	// Print the decoded family/model/stepping fields to stdout.
+	void putFamily()
+	{
+		printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
+			family, model, stepping, extFamily, extModel);
+		printf("display:family=%X, model=%X\n", displayFamily, displayModel);
+	}
+	/**
+		@param type [in] one t* flag or an OR of several
+		@return true if at least one of the given flags was detected
+	*/
+	bool has(Type type) const
+	{
+		return (type & type_) != 0;
+	}
+};
+
+/*
+	Accumulates time-stamp-counter ticks over begin()/end() pairs and counts
+	the number of completed intervals.
+*/
+class Clock {
+public:
+	// Read the time-stamp counter as a single 64-bit value.
+	static inline uint64 getRdtsc()
+	{
+#ifdef _MSC_VER
+		return __rdtsc();
+#else
+		unsigned int lo, hi;
+		__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
+		return (static_cast<uint64>(hi) << 32) | lo;
+#endif
+	}
+	Clock() : clock_(0), count_(0) {}
+	// Open an interval: subtracting now means the matching end() adds the elapsed ticks.
+	void begin() { clock_ -= getRdtsc(); }
+	// Close an interval and count it.
+	void end()
+	{
+		clock_ += getRdtsc();
+		++count_;
+	}
+	int getCount() const { return count_; }
+	uint64 getClock() const { return clock_; }
+	// Reset both the accumulated ticks and the interval count.
+	void clear()
+	{
+		clock_ = 0;
+		count_ = 0;
+	}
+private:
+	uint64 clock_; // accumulated ticks
+	int count_; // number of end() calls since construction or clear()
+};
+
+#ifdef XBYAK64
+// Flag bits 6 and 7. The StackFrame parameter documentation in this file lists them alongside tNum
+// ("0 <= tNum <= 10, with UseRCX, UseRDX"); presumably they are OR-ed into tNum to request rcx/rdx -- confirm.
+const int UseRCX = 1 << 6;
+const int UseRDX = 1 << 7;
+
+class Pack {
+	static const size_t maxTblNum = 10;
+	const Xbyak::Reg64 *tbl_[maxTblNum];
+	size_t n_;
+public:
+	Pack() : n_(0) {}
+	Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
+	Pack(const Pack& rhs)
+		: n_(rhs.n_)
+	{
+		if (n_ > maxTblNum) throw Error(ERR_INTERNAL);
+		for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
+	}
+	Pack(const Xbyak::Reg64& t0)
+	{ n_ = 1; tbl_[0] = &t0; }
+	Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
+	Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
+	Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
+	Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
+	Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
+	Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
+	Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
+	Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
+	Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
+	{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
+	/*
+		append a register to the end of the pack
+		@param t [in] register to add; only its address is stored, so the object must stay alive while the pack is used
+		@return *this, so that calls can be chained
+		@note prints a message to stderr and throws Error(ERR_BAD_PARAMETER) if the pack is already full
+	*/
+	Pack& append(const Xbyak::Reg64& t)
+	{
+		// use the same capacity constant as init() rather than a hard-coded 10
+		if (n_ >= maxTblNum) {
+			fprintf(stderr, "ERR Pack::can't append\n");
+			throw Error(ERR_BAD_PARAMETER);
+		}
+		tbl_[n_++] = &t;
+		return *this;
+	}
+	/*
+		make the pack refer to tbl[0], ..., tbl[n - 1]
+		@param tbl [in] array of registers; pointers to its elements are stored, not copies
+		@param n [in] number of elements to take (n <= maxTblNum)
+		@note prints a message to stderr and throws Error(ERR_BAD_PARAMETER) if n is too large
+	*/
+	void init(const Xbyak::Reg64 *tbl, size_t n)
+	{
+		if (!(n <= maxTblNum)) {
+			fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
+			throw Error(ERR_BAD_PARAMETER);
+		}
+		const Xbyak::Reg64 *src = tbl;
+		size_t idx = 0;
+		while (idx != n) {
+			tbl_[idx] = src;
+			++idx;
+			++src;
+		}
+		n_ = n;
+	}
+	/*
+		get the n-th register of the pack
+		@param n [in] index (0 <= n < size())
+		@note prints a message to stderr and throws Error(ERR_BAD_PARAMETER) if n is out of range
+	*/
+	const Xbyak::Reg64& operator[](size_t n) const
+	{
+		if (n < n_) {
+			return *tbl_[n];
+		}
+		fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
+		throw Error(ERR_BAD_PARAMETER);
+	}
+	// number of registers currently held by the pack
+	size_t size() const
+	{
+		return n_;
+	}
+	/*
+		get tbl[pos, pos + num)
+		@param pos [in] index of the first register to take (0 <= pos <= size())
+		@param num [in] number of registers to take; size_t(-1) (the default) means "everything from pos to the end"
+		@return a new Pack referring to the selected registers
+		@note prints a message to stderr and throws Error(ERR_BAD_PARAMETER) if the range does not fit in the pack
+	*/
+	Pack sub(size_t pos, size_t num = size_t(-1)) const
+	{
+		const bool toEnd = (num == size_t(-1));
+		// Check pos on its own and compare num against the remaining length.
+		// A test of the form "pos + num > n_" wraps around when pos > n_
+		// (with the default num) or when pos + num overflows, and would let
+		// an out-of-range request reach the copy loop below.
+		if (pos > n_ || (!toEnd && num > n_ - pos)) {
+			fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
+			throw Error(ERR_BAD_PARAMETER);
+		}
+		if (toEnd) num = n_ - pos;
+		Pack pack;
+		pack.n_ = num;
+		for (size_t i = 0; i < num; i++) {
+			pack.tbl_[i] = tbl_[pos + i];
+		}
+		return pack;
+	}
+	// print the name of every register in the pack on one line of stdout (debugging aid)
+	void put() const
+	{
+		size_t idx = 0;
+		while (idx < n_) {
+			printf("%s ", tbl_[idx]->toString());
+			++idx;
+		}
+		printf("\n");
+	}
+};
+
+/*
+	Emits the prolog of a generated function in its constructor and the
+	matching epilog in close() (called from the destructor when makeEpilog
+	is true). Parameter and temporary registers are handed out through the
+	public packs p and t.
+*/
+class StackFrame {
+	// noSaveNum : number of leading entries of getOrderTbl() that are used without being saved
+	// rcxPos/rdxPos : index of rcx/rdx in getOrderTbl()
+#ifdef XBYAK64_WIN
+	static const int noSaveNum = 6;
+	static const int rcxPos = 0;
+	static const int rdxPos = 1;
+#else
+	static const int noSaveNum = 8;
+	static const int rcxPos = 3;
+	static const int rdxPos = 2;
+#endif
+	// capacity of tTbl_; the number of temporary registers must not exceed it
+	static const int maxTmpRegNum = 10;
+	Xbyak::CodeGenerator *code_;
+	int pNum_;
+	int tNum_; // number of temporary registers, without the UseRCX/UseRDX flags
+	bool useRcx_;
+	bool useRdx_;
+	int saveNum_; // number of registers saved by the prolog and restored by close()
+	int P_; // number of bytes subtracted from rsp by the prolog
+	bool makeEpilog_;
+	Xbyak::Reg64 pTbl_[4];
+	Xbyak::Reg64 tTbl_[maxTmpRegNum];
+	Pack p_;
+	Pack t_;
+	// not copyable (declared, never defined)
+	StackFrame(const StackFrame&);
+	void operator=(const StackFrame&);
+public:
+	const Pack& p; // parameter registers
+	const Pack& t; // temporary registers
+	/*
+		make stack frame
+		@param code [in] code generator that receives the prolog/epilog
+		@param pNum [in] num of function parameter(0 <= pNum <= 4)
+		@param tNum [in] num of temporary register(0 <= tNum <= 10), optionally or-ed with UseRCX and/or UseRDX
+		@param stackSizeByte [in] local stack size
+		@param makeEpilog [in] automatically call close() if true
+		@note throws Error(ERR_BAD_PNUM) / Error(ERR_BAD_TNUM) for out-of-range register counts,
+		before any code is emitted
+
+		you can use
+		rax
+		gp0, ..., gp(pNum - 1)
+		gt0, ..., gt(tNum-1)
+		rcx if tNum & UseRCX
+		rdx if tNum & UseRDX
+		rsp[0..stackSizeByte - 1]
+	*/
+	StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
+		: code_(code)
+		, pNum_(pNum)
+		, tNum_(tNum & ~(UseRCX | UseRDX))
+		, useRcx_((tNum & UseRCX) != 0)
+		, useRdx_((tNum & UseRDX) != 0)
+		, saveNum_(0)
+		, P_(0)
+		, makeEpilog_(makeEpilog)
+		, p(p_)
+		, t(t_)
+	{
+		using namespace Xbyak;
+		if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
+		// tTbl_ holds at most maxTmpRegNum entries; the allRegNum test below
+		// alone would accept e.g. pNum = 0, tNum = 12 and overrun the array
+		if (tNum_ < 0 || tNum_ > maxTmpRegNum) throw Error(ERR_BAD_TNUM);
+		const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
+		if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
+		const Reg64& _rsp = code->rsp;
+		const AddressFrame& _ptr = code->ptr;
+		saveNum_ = (std::max)(0, allRegNum - noSaveNum);
+		// registers to save start right after the noSaveNum ones in the order table
+		const int *tbl = getOrderTbl() + noSaveNum;
+		P_ = saveNum_ + (stackSizeByte + 7) / 8;
+		if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
+		P_ *= 8;
+		if (P_ > 0) code->sub(_rsp, P_);
+#ifdef XBYAK64_WIN
+		// the first four saved registers are stored above the return address
+		// (presumably the caller-provided shadow space of the Win64 convention)
+		for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
+			code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
+		}
+		// the remaining ones go to the top of the area allocated above
+		for (int i = 4; i < saveNum_; i++) {
+			code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
+		}
+#else
+		// saved registers occupy the top saveNum_ slots of the allocated area
+		for (int i = 0; i < saveNum_; i++) {
+			code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
+		}
+#endif
+		// hand out registers in table order: parameters first, then temporaries
+		int pos = 0;
+		for (int i = 0; i < pNum; i++) {
+			pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
+		}
+		for (int i = 0; i < tNum_; i++) {
+			tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
+		}
+		// when rcx/rdx is requested as a free register but also carries one of
+		// the pNum parameters, move that parameter to r10/r11 (getRegIdx
+		// reports r10/r11 for that slot)
+		if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
+		if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
+		p_.init(pTbl_, pNum);
+		t_.init(tTbl_, tNum_);
+	}
+	/*
+		make epilog manually
+		@param callRet [in] call ret() if true
+
+		restores the registers saved by the constructor and releases the stack area
+	*/
+	void close(bool callRet = true)
+	{
+		using namespace Xbyak;
+		const Reg64& _rsp = code_->rsp;
+		const AddressFrame& _ptr = code_->ptr;
+		const int *tbl = getOrderTbl() + noSaveNum;
+		// reload from the same slots the constructor stored to
+#ifdef XBYAK64_WIN
+		for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
+			code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
+		}
+		for (int i = 4; i < saveNum_; i++) {
+			code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
+		}
+#else
+		for (int i = 0; i < saveNum_; i++) {
+			code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
+		}
+#endif
+		if (P_ > 0) code_->add(_rsp, P_);
+
+		if (callRet) code_->ret();
+	}
+	/*
+		emit the epilog (close()) unless makeEpilog was false;
+		an exception raised while emitting it terminates the process,
+		since it must not leave the destructor
+	*/
+	~StackFrame()
+	{
+		if (!makeEpilog_) return;
+		try {
+			close();
+		} catch (std::exception& e) {
+			printf("ERR:StackFrame %s\n", e.what());
+			exit(1);
+		} catch (...) {
+			printf("ERR:StackFrame otherwise\n");
+			exit(1);
+		}
+	}
+private:
+	/*
+		table of register indices in the order they are handed out:
+		the first noSaveNum entries are used without saving,
+		the rest are saved by the prolog when needed
+	*/
+	const int *getOrderTbl() const
+	{
+		using namespace Xbyak;
+		static const int tbl[] = {
+#ifdef XBYAK64_WIN
+			Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
+#else
+			Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
+#endif
+			Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
+		};
+		return &tbl[0];
+	}
+	/*
+		get the next register index from the order table
+		@param pos [in,out] current position in the table; advanced past every entry consumed
+		@return register index to use
+
+		when rcx (resp. rdx) is reserved by UseRCX (UseRDX), the rcx (rdx)
+		slot yields r10 (r11) instead, and the r10 (r11) slot itself is skipped
+	*/
+	int getRegIdx(int& pos) const
+	{
+		assert(pos < 14);
+		using namespace Xbyak;
+		const int *tbl = getOrderTbl();
+		int r = tbl[pos++];
+		if (useRcx_) {
+			if (r == Operand::RCX) { return Operand::R10; }
+			// r10 stands in for rcx, so skip it; fall through so that the
+			// next entry is still checked against the rdx rules below
+			if (r == Operand::R10) { r = tbl[pos++]; }
+		}
+		if (useRdx_) {
+			if (r == Operand::RDX) { return Operand::R11; }
+			if (r == Operand::R11) { return tbl[pos++]; }
+		}
+		return r;
+	}
+};
+#endif
+
+} } // end of util
+#endif