zzogl: painfully merge the zzogl-dev branch

* new memory management * asm was replaced by intrinsic * new GLSL backend (AMD only) Cmake is probably broken anyway with the 2 plugins... * and lots of others stuff that I forgot about it ;) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5166 96395faa-99c1-11dd-bbfe-3dabce05a288
2012-04-19 21:22:08 +00:00 · 2012-04-19 21:22:08 +00:00 · e3c741bb2a
parent 8fcadb3616
commit e3c741bb2a
47 changed files with 7412 additions and 5710 deletions
--- a/cmake/BuildParameters.cmake
+++ b/cmake/BuildParameters.cmake
@ -8,6 +8,7 @@
 # Use soundtouch  internal lib: -DFORCE_INTERNAL_SOUNDTOUCH=TRUE
 # Use zlib        internal lib: -DFORCE_INTERNAL_ZLIB=TRUE
 # Use sdl1.3      internal lib: -DFORCE_INTERNAL_SDL=TRUE # Not supported yet
+# Use GLSL API(else NVIDIA_CG): -DGLSL_API=TRUE

 ### GCC optimization options
 # control C flags             : -DUSER_CMAKE_C_FLAGS="cflags"
@ -183,3 +184,10 @@ if(PACKAGE_MODE)
    # Compile all source codes with these 2 defines
    add_definitions(-DPLUGIN_DIR_COMPILATION=${PLUGIN_DIR} -DGAMEINDEX_DIR_COMPILATION=${GAMEINDEX_DIR})
 endif(PACKAGE_MODE)
+
+#-------------------------------------------------------------------------------
+# Select nvidia cg shader api by default (override with -DGLSL_API=TRUE)
+#-------------------------------------------------------------------------------
+if(NOT DEFINED GLSL_API)
+	set(GLSL_API FALSE)
+endif(NOT DEFINED GLSL_API)
--- a/cmake/SearchForStuff.cmake
+++ b/cmake/SearchForStuff.cmake
@ -53,7 +53,9 @@ if(NOT FORCE_INTERNAL_ZLIB)
 endif(NOT FORCE_INTERNAL_ZLIB)

 ## Use pcsx2 package to find module
+if(NOT GLSL_API) # Cg is only searched for when the GLSL shader api is not selected
 	include(FindCg)
+endif(NOT GLSL_API)
 include(FindGlew)
 include(FindLibc)
 include(FindPortAudio)
--- a/cmake/SelectPcsx2Plugins.cmake
+++ b/cmake/SelectPcsx2Plugins.cmake
@ -6,11 +6,15 @@ set(msg_dep_pcsx2       "check these libraries -> wxWidgets (>=2.8.10), gtk2 (>=
 set(msg_dep_cdvdiso     "check these libraries -> bzip2 (>=1.0.5), gtk2 (>=2.16)")
 set(msg_dep_zerogs      "check these libraries -> glew (>=1.5), opengl, X11, nvidia-cg-toolkit (>=2.1)")
 set(msg_dep_gsdx        "check these libraries -> opengl, X11, pcsx2 SDL")
-set(msg_dep_zzogl       "check these libraries -> glew (>=1.5), jpeg (>=6.2), opengl, X11, nvidia-cg-toolkit (>=2.1), pcsx2 common libs")
 set(msg_dep_onepad      "check these libraries -> sdl (>=1.2)")
 set(msg_dep_zeropad     "check these libraries -> sdl (>=1.2)")
 set(msg_dep_spu2x       "check these libraries -> soundtouch (>=1.5), alsa, portaudio (>=1.9), pcsx2 common libs")
 set(msg_dep_zerospu2    "check these libraries -> soundtouch (>=1.5), alsa")
+if(GLSL_API) # the GLSL build does not list nvidia-cg-toolkit as a dependency
+	set(msg_dep_zzogl       "check these libraries -> glew (>=1.5), jpeg (>=6.2), opengl, X11, pcsx2 common libs")
+else(GLSL_API)
+	set(msg_dep_zzogl       "check these libraries -> glew (>=1.5), jpeg (>=6.2), opengl, X11, nvidia-cg-toolkit (>=2.1), pcsx2 common libs")
+endif(GLSL_API)

 #-------------------------------------------------------------------------------
 #								Pcsx2 core & common libs
@ -153,17 +157,17 @@ endif(GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND CG_FOUND)
 # requires:	-GLEW
 #			-OpenGL
 #			-X11
-#			-CG
+#			-CG (only with cg build)
 #			-JPEG
 #           -common_libs
 #---------------------------------------
-if(GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND CG_FOUND AND JPEG_FOUND AND common_libs)
+if((GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND JPEG_FOUND AND common_libs) AND (CG_FOUND OR GLSL_API))
 	set(zzogl TRUE)
-else(GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND CG_FOUND AND JPEG_FOUND AND common_libs)
+else((GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND JPEG_FOUND AND common_libs) AND (CG_FOUND OR GLSL_API))
 	set(zzogl FALSE)
    message(STATUS "Skip build of zzogl: miss some dependencies")
    message(STATUS "${msg_dep_zzogl}")
-endif(GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND CG_FOUND AND JPEG_FOUND AND common_libs)
+endif((GLEW_FOUND AND OPENGL_FOUND AND X11_FOUND AND JPEG_FOUND AND common_libs) AND (CG_FOUND OR GLSL_API))
 #---------------------------------------

 #---------------------------------------
--- a/pcsx2-codeblocks.workspace
+++ b/pcsx2-codeblocks.workspace
@ -27,7 +27,7 @@
 		<Project filename="plugins/zerospu2/Linux/ZeroSPU2.cbp">
 			<Depends filename="3rdparty/SoundTouch/SoundTouch.cbp" />
 		</Project>
-		<Project filename="plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp" />
+		<Project filename="plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp" active="1" />
 		<Project filename="plugins/GSdx/GSdx.gcc.cbp" active="1" />
 		<Project filename="3rdparty/SDL-1.3.0-5387/SDL-1.3/SDL-1.3.cbp" />
 	</Workspace>
--- a/plugins/zzogl-pg/opengl/CMakeLists.txt
+++ b/plugins/zzogl-pg/opengl/CMakeLists.txt
@ -45,6 +45,14 @@ if(CMAKE_BUILD_TYPE STREQUAL Release)
    add_definitions(${CommonFlags} ${OptimizationFlags} -W)
 endif(CMAKE_BUILD_TYPE STREQUAL Release)

+# Select the shader API 
+if(NOT GLSL_API)
+	add_definitions(-DNVIDIA_CG_API) # nvidia cg shader api
+else()
+	add_definitions(-DGLSL_API)      # GLSL shader api
+endif()
+
+
 # zzogl sources
 set(zzoglSources
    GifTransfer.cpp
@ -54,17 +62,17 @@ set(zzoglSources
    GSmain.cpp
    HostMemory.cpp
    Mem.cpp
-    # memcpy_amd.cpp
    Mem_Swizzle.cpp
    Mem_Tables.cpp
    Profile.cpp
    rasterfont.cpp
-    NewRegs.cpp
    Regs.cpp
    targets.cpp
-    x86.cpp
    zerogs.cpp
    zpipe.cpp
+    ZZDepthTargets.cpp
+    ZZMemoryTargets.cpp
+    ZZRenderTargets.cpp
    ZZClut.cpp
    ZZHacks.cpp
    ZZKeyboard.cpp
@ -74,8 +82,10 @@ set(zzoglSources
    ZZoglCRTC.cpp
    ZZoglFlush.cpp
    ZZoglFlushHack.cpp
+    ZZoglMem.cpp
    ZZoglSave.cpp
    ZZoglShaders.cpp
+    ZZoglShadersGLSL.cpp
    ZZoglShoots.cpp
    ZZoglVB.cpp
    )
@ -94,7 +104,6 @@ set(zzoglHeaders
    Mem_Transmit.h
    Profile.h
    rasterfont.h
-    NewRegs.h
    Regs.h
    targets.h
    Util.h
@ -102,21 +111,19 @@ set(zzoglHeaders
    zerogs.h
    zpipe.h
    ZZClut.h
+    ZZoglFlush.h
    ZZGl.h
    ZZHacks.h
    ZZoglDrawing.h
    ZZLog.h
    ZZoglCRTC.h
    ZZoglMath.h
+    ZZoglMem.h
    ZZoglShaders.h
    ZZoglShoots.h
    ZZoglVB.h
    )

-# zzogl S sources
-set(zzoglSSources
-    x86-32.S)
-
 # zzogl shader sources
 set(zzoglShaderSources
    ctx0/ps2hw_ctx.fx
@ -131,9 +138,6 @@ set(zzoglLinuxSources
 set(zzoglLinuxHeaders
    Linux/Linux.h)

-# change language of .S-files to c++
-set_source_files_properties(${zzoglSSources} PROPERTIES LANGUAGE CXX)
-
 # add additional include directories
 include_directories(.
    Linux)
@ -142,7 +146,6 @@ include_directories(.
 add_library(${Output} SHARED
    ${zzoglSources}
    ${zzoglHeaders}
-    ${zzoglSSources}
    ${zzoglShaderSources}
    ${zzoglLinuxSources}
    ${zzoglLinuxHeaders})
@ -154,7 +157,9 @@ set_target_properties(${Output} PROPERTIES COMPILE_DEFINITIONS USE_GSOPEN2)
 target_link_libraries(${Output} Utilities)

 # link target with Cg
+if(NOT GLSL_API) # Cg libraries are only linked when the GLSL shader api is not selected
 	target_link_libraries(${Output} ${CG_LIBRARIES})
+endif(NOT GLSL_API)

 # link target with glew
 target_link_libraries(${Output} ${GLEW_LIBRARY})
@ -183,10 +188,18 @@ endif(NOT USER_CMAKE_LD_FLAGS STREQUAL "")

 if(PACKAGE_MODE)
     install(TARGETS ${Output} DESTINATION ${PLUGIN_DIR})
+    if(GLSL_API) # GLSL build installs ps2hw.glsl in place of ps2hw.dat
+        install(FILES ${PROJECT_SOURCE_DIR}/plugins/zzogl-pg/opengl/ps2hw.glsl DESTINATION ${PLUGIN_DIR})
+    else(GLSL_API)
         install(FILES ${PROJECT_SOURCE_DIR}/plugins/zzogl-pg/opengl/ps2hw.dat DESTINATION ${PLUGIN_DIR})
+    endif(GLSL_API)
 else(PACKAGE_MODE)
     install(TARGETS ${Output} DESTINATION ${CMAKE_SOURCE_DIR}/bin/plugins)
+    if(GLSL_API) # same shader file selection, installed next to the plugin in bin/plugins
+        install(FILES ${PROJECT_SOURCE_DIR}/plugins/zzogl-pg/opengl/ps2hw.glsl DESTINATION ${CMAKE_SOURCE_DIR}/bin/plugins)
+    else(GLSL_API)
         install(FILES ${PROJECT_SOURCE_DIR}/plugins/zzogl-pg/opengl/ps2hw.dat DESTINATION ${CMAKE_SOURCE_DIR}/bin/plugins)
+    endif(GLSL_API)
 endif(PACKAGE_MODE)

 ################################### Replay Loader
--- a/plugins/zzogl-pg/opengl/GLWinX11.cpp
+++ b/plugins/zzogl-pg/opengl/GLWinX11.cpp
@ -198,13 +198,24 @@ void GLWindow::CreateContextGL()
 	GLXFBConfig *framebuffer_config = glXChooseFBConfig(glDisplay, DefaultScreen(glDisplay), NULL, &fbcount);
 	if (!framebuffer_config or !fbcount) return;

+#if 1
 	// At least create a 3.0 context with compatibility profile
 	int attribs[] = {
 		GLX_CONTEXT_MAJOR_VERSION_ARB, 3,
 		GLX_CONTEXT_MINOR_VERSION_ARB, 0,
+		// GLX_CONTEXT_PROFILE_MASK_ARB, GLX_CONTEXT_CORE_PROFILE_BIT_ARB,
 		GLX_CONTEXT_PROFILE_MASK_ARB, GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB,
 		0
 	};
+#else
+	// Create a 3.2 core context without compatibility profile
+	int attribs[] = {
+		GLX_CONTEXT_MAJOR_VERSION_ARB, 3,
+		GLX_CONTEXT_MINOR_VERSION_ARB, 2,
+		GLX_CONTEXT_PROFILE_MASK_ARB, GLX_CONTEXT_CORE_PROFILE_BIT_ARB,
+		0
+	};
+#endif
 	GLXContext context_temp = glXCreateContextAttribsARB(glDisplay, framebuffer_config[0], NULL, true, attribs);
 	if (context_temp) {
 		ZZLog::Error_Log("Create a 3.0 opengl context");
--- a/plugins/zzogl-pg/opengl/GS.h
+++ b/plugins/zzogl-pg/opengl/GS.h
@ -21,7 +21,7 @@
 #define __GS_H__


-#define USE_OLD_REGS
+#define ZZNORMAL_MEMORY

 #include "Util.h"
 #include "GifTransfer.h"
@ -39,6 +39,8 @@ extern float fFPS;

 extern int g_LastCRC;

+#define VB_NUMBUFFERS			   512 // number of vbo buffer allocated
+
 struct Vector_16F
 {
 	u16 x, y, z, w;
@ -132,8 +134,7 @@ extern GSconf conf;

 // PSM values
 // PSM types == Texture Storage Format
-enum PSM_value
-{
+enum PSM_value{
 	PSMCT32		= 0,		// 000000
 	PSMCT24		= 1,		// 000001
 	PSMCT16		= 2,		// 000010
@ -147,6 +148,8 @@ enum PSM_value
 	PSMT24Z		= 49,		// 110001
 	PSMT16Z		= 50,		// 110010
 	PSMT16SZ	= 58,		// 111010
+
+	PSMT_BAD_PSM 	= 63		// for every unknown psm.
 };

 // Check target bit mode. PSMCT32 and 32Z return 0, 24 and 24Z - 1
@ -461,7 +464,6 @@ typedef struct
 {
 	u16 aem;
 	u8 ta[2];
-	float fta[2];
 } texaInfo;

 typedef struct
@ -503,6 +505,14 @@ typedef struct
 	int fba;
 } fbaInfo;

+enum transfer_types // values compared against gs.imageTransfer
+{
+	XFER_HOST_TO_LOCAL = 0, // the GIF image path calls TransferHostLocal() for this value
+	XFER_LOCAL_TO_HOST = 1, // asserted by TransferLocalHost()
+	XFER_LOCAL_TO_LOCAL = 2, // asserted by TransferLocalLocal()
+	XFER_DEACTIVATED = 3 // NOTE(review): presumably "no transfer selected" - confirm
+};
+
 typedef struct
 {
 	Vertex gsvertex[4]; // circular buffer that contains the vertex
@ -537,15 +547,20 @@ typedef struct
 	texaInfo texa;
 	trxposInfo trxpos, trxposnew;

-	int imageWtemp, imageHtemp;
-
 	int imageTransfer;
-	int imageWnew, imageHnew, imageX, imageY, imageEndX, imageEndY;
+	bool transferring;
+	
+	Point image, imageEnd;
+	Size imageNew, imageTemp;

 	pathInfo path[4];
 	GIFRegDIMX dimx;
 	GSMemory mem;
 	GSClut clut_buffer;
+	
+	// Subject to change.
+	int vsync, interlace;
+	
 	int primNext(int inc = 1)
 	{
        // Note: ArraySize(gsvertex) == 2^n => modulo is replaced by an and instruction
@ -615,7 +630,7 @@ static __forceinline u32 RGBA16to32(u16 c)
 		   (((c) & 0x8000) ? 0xff000000 : 0);
 }

-#if 0
+#ifndef ZZNORMAL_MEMORY
 // converts float16 [0,1] to BYTE [0,255] (assumes value is in range, otherwise will take lower 8bits)
 // f is a u16
 static __forceinline u16 Float16ToBYTE(u16 f)
@ -984,4 +999,39 @@ inline void CluttingForFlushedTex(tex0Info* tex0, u32 Data, int ictx)
 #define CPSM_CSA_BITMASK 0x1f780000
 #define CPSM_CSA_NOTMASK 0xe0870000

+// I'll find a good place for these later.
+
+extern PSM_value PSM_value_Table[64];
+extern bool allowed_psm[256];				// in ZZoglMem.cpp
+inline void FillAlowedPsnTable() {
+	// Fills both lookup tables for the 13 PSM values below: first flag each one in allowed_psm...
+	allowed_psm[PSMCT32] = true;
+	allowed_psm[PSMCT24] = true;
+	allowed_psm[PSMCT16] = true;
+	allowed_psm[PSMCT16S] = true;
+	allowed_psm[PSMT8] = true;
+	allowed_psm[PSMT4] = true;
+	allowed_psm[PSMT8H] = true;
+	allowed_psm[PSMT4HH] = true;
+	allowed_psm[PSMT4HL] = true;
+	allowed_psm[PSMT32Z] = true;
+	allowed_psm[PSMT24Z] = true;
+	allowed_psm[PSMT16Z] = true;
+	allowed_psm[PSMT16SZ] = true;
+	// ...then map each of the same values onto itself in PSM_value_Table.
+	PSM_value_Table[PSMCT32]  = PSMCT32;
+	PSM_value_Table[PSMCT24]  = PSMCT24;
+	PSM_value_Table[PSMCT16]  = PSMCT16;
+	PSM_value_Table[PSMCT16S] = PSMCT16S;
+	PSM_value_Table[PSMT8]    = PSMT8;
+	PSM_value_Table[PSMT4]    = PSMT4;
+	PSM_value_Table[PSMT8H]   = PSMT8H;
+	PSM_value_Table[PSMT4HH]  = PSMT4HH;
+	PSM_value_Table[PSMT4HL]  = PSMT4HL;
+	PSM_value_Table[PSMT32Z]  = PSMT32Z;
+	PSM_value_Table[PSMT24Z]  = PSMT24Z;
+	PSM_value_Table[PSMT16Z]  = PSMT16Z;
+	PSM_value_Table[PSMT16SZ] = PSMT16SZ;
+}
+
 #endif
--- a/plugins/zzogl-pg/opengl/GSmain.cpp
+++ b/plugins/zzogl-pg/opengl/GSmain.cpp
@ -22,6 +22,7 @@
 #include "Profile.h"
 #include "GLWin.h"
 #include "ZZoglFlushHack.h"
+#include "ZZoglShaders.h"


 using namespace std;
@ -82,7 +83,7 @@ extern int ZZSave(s8* pbydata);
 extern bool ZZLoad(s8* pbydata);

 // switches the render target to the real target, flushes the current render targets and renders the real image
-extern void RenderCRTC(int interlace);
+extern void RenderCRTC();

 #if defined(_WIN32) && defined(_DEBUG)
 HANDLE g_hCurrentThread = NULL;
@ -91,37 +92,37 @@ HANDLE g_hCurrentThread = NULL;
 extern int VALIDATE_THRESH;
 extern u32 TEXDESTROY_THRESH;

-u32 CALLBACK PS2EgetLibType()
+EXPORT_C_(u32) PS2EgetLibType()
 {
 	return PS2E_LT_GS;
 }

-char* CALLBACK PS2EgetLibName()
+EXPORT_C_(char*) PS2EgetLibName()
 {
 	return libraryName;
 }

-u32 CALLBACK PS2EgetLibVersion2(u32 type)
+EXPORT_C_(u32) PS2EgetLibVersion2(u32 type)
 {
 	return (zgsversion << 16) | (zgsrevision << 8) | zgsbuild | (zgsminor << 24);
 }

-void CALLBACK GSsetBaseMem(void* pmem)
+EXPORT_C_(void) GSsetBaseMem(void* pmem)
 {
 	g_pBasePS2Mem = (u8*)pmem;
 }

-void CALLBACK GSsetSettingsDir(const char* dir)
+EXPORT_C_(void) GSsetSettingsDir(const char* dir)
 {
 	s_strIniPath = (dir == NULL) ? wxString(L"inis") : wxString(dir, wxConvFile);
 }

-void CALLBACK GSsetLogDir(const char* dir)
+EXPORT_C_(void) GSsetLogDir(const char* dir)
 {
 	ZZLog::SetDir(dir);
 }

-void CALLBACK GSsetGameCRC(int crc, int options)
+EXPORT_C_(void) GSsetGameCRC(int crc, int options)
 {
    // build a list of function pointer for GetSkipCount (SkipDraw)
 	static GetSkipCount GSC_list[NUMBER_OF_TITLES];
@ -217,7 +218,7 @@ void CALLBACK GSsetGameCRC(int crc, int options)
 	ListHacks();
 }

-void CALLBACK GSsetFrameSkip(int frameskip)
+EXPORT_C_(void) GSsetFrameSkip(int frameskip)
 {
 	FUNCLOG
 	s_frameskipping |= frameskip;
@ -232,7 +233,7 @@ void CALLBACK GSsetFrameSkip(int frameskip)
 	}
 }

-void CALLBACK GSreset()
+EXPORT_C_(void) GSreset()
 {
 	FUNCLOG

@ -242,11 +243,11 @@ void CALLBACK GSreset()

 	gs.prac = 1;
 	prim = &gs._prim[0];
-	gs.imageTransfer = -1;
+	gs.transferring = false;
 	gs.q = 1;
 }

-void CALLBACK GSgifSoftReset(u32 mask)
+EXPORT_C_(void) GSgifSoftReset(u32 mask)
 {
 	FUNCLOG

@ -254,11 +255,11 @@ void CALLBACK GSgifSoftReset(u32 mask)
 	if (mask & 2) memset(&gs.path[1], 0, sizeof(gs.path[1]));
 	if (mask & 4) memset(&gs.path[2], 0, sizeof(gs.path[2]));

-	gs.imageTransfer = -1;
+	gs.transferring = false;
 	gs.q = 1;
 }

-s32 CALLBACK GSinit()
+EXPORT_C_(s32) GSinit()
 {
 	FUNCLOG

@ -281,7 +282,7 @@ __forceinline void InitMisc()
 	ResetRegs();
 }

-s32 CALLBACK GSopen(void *pDsp, char *Title, int multithread)
+EXPORT_C_(s32) GSopen(void *pDsp, char *Title, int multithread)
 {
 	FUNCLOG

@ -337,32 +338,34 @@ EXPORT_C_(s32) GSopen2( void* pDsp, u32 flags )
 }
 #endif

-void CALLBACK GSshutdown()
+EXPORT_C_(void) GSshutdown()
 {
 	FUNCLOG

 	ZZLog::Close();
 }
-void CALLBACK GSclose()
+EXPORT_C_(void) GSclose()
 {
 	FUNCLOG

 	ZZDestroy();
 	GLWin.CloseWindow();

+	// Free allocated memory. The plugin can be closed without closing pcsx2, so we SHOULD free all allocated resources
+	ZZshExitCleaning();
 	SaveStateFile = NULL;
 	SaveStateExists = true; // default value
    g_LastCRC = 0;
 }

-void CALLBACK GSirqCallback(void (*callback)())
+EXPORT_C_(void) GSirqCallback(void (*callback)())
 {
 	FUNCLOG

 	GSirq = callback;
 }

-void CALLBACK GSwriteCSR(u32 write)
+EXPORT_C_(void) GSwriteCSR(u32 write)
 {
 	FUNCLOG

@ -373,7 +376,7 @@ void CALLBACK GSwriteCSR(u32 write)
 #define access _access
 #endif

-void CALLBACK GSchangeSaveState(int newstate, const char* filename)
+EXPORT_C_(void) GSchangeSaveState(int newstate, const char* filename)
 {
 	FUNCLOG

@ -428,7 +431,7 @@ static bool get_snapshot_filename(char *filename, char* path, const char* extens
 	return true;
 }

-void CALLBACK GSmakeSnapshot(char *path)
+EXPORT_C_(void) GSmakeSnapshot(char *path)
 {
 	FUNCLOG

@ -474,7 +477,16 @@ static __forceinline void SetGSTitle()
 	GLWin.SetTitle(strtitle);
 }

-void CALLBACK GSvsync(int interlace)
+// GSsetVsync was not implemented before; for now it only logs the request and records it in gs.vsync until this gets a closer look.
+EXPORT_C_(void) GSsetVsync(int enabled)
+{
+	FUNCLOG
+
+	ZZLog::Debug_Log("Setting VSync to 0x%x.", enabled);
+	gs.vsync = enabled;
+}
+
+EXPORT_C_(void) GSvsync(int current_interlace)
 {
 	FUNCLOG

@ -506,8 +518,9 @@ void CALLBACK GSvsync(int interlace)

 	g_nRealFrame++;

-	// !interlace? Hmmm... Fixme.
-	RenderCRTC(!interlace);
+	// The value passed seems to either be 0 or 0x2000, and we want 0 or 1. Perhaps !! would be better...
+	gs.interlace = !current_interlace;
+	RenderCRTC();

 	GLWin.ProcessEvents();

@ -559,7 +572,7 @@ void CALLBACK GSvsync(int interlace)

 }

-void CALLBACK GSreadFIFO(u64 *pMem)
+EXPORT_C_(void) GSreadFIFO(u64 *pMem)
 {
 	FUNCLOG

@ -571,7 +584,7 @@ void CALLBACK GSreadFIFO(u64 *pMem)
 	TransferLocalHost((u32*)pMem, 1);
 }

-void CALLBACK GSreadFIFO2(u64 *pMem, int qwc)
+EXPORT_C_(void) GSreadFIFO2(u64 *pMem, int qwc)
 {
 	FUNCLOG

@ -583,7 +596,7 @@ void CALLBACK GSreadFIFO2(u64 *pMem, int qwc)
 	TransferLocalHost((u32*)pMem, qwc);
 }

-int CALLBACK GSsetupRecording(int start, void* pData)
+EXPORT_C_(int) GSsetupRecording(int start, void* pData)
 {
 	FUNCLOG

@ -595,7 +608,7 @@ int CALLBACK GSsetupRecording(int start, void* pData)
 	return 1;
 }

-s32 CALLBACK GSfreeze(int mode, freezeData *data)
+EXPORT_C_(s32) GSfreeze(int mode, freezeData *data)
 {
 	FUNCLOG

--- a/plugins/zzogl-pg/opengl/GifTransfer.cpp
+++ b/plugins/zzogl-pg/opengl/GifTransfer.cpp
@ -27,7 +27,7 @@ static int path1_count = 0;

 static int nPath3Hack = 0;

-void CALLBACK GSgetLastTag(u64* ptag)
+EXPORT_C_(void) GSgetLastTag(u64* ptag)
 {
 	FUNCLOG

@ -166,13 +166,15 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
 					int len = min(size, path->nloop);
 					//ZZLog::Error_Log("GIF_FLG_IMAGE(%d)=%d", gs.imageTransfer, len);
 					
+					if (gs.transferring)
+					{
 						switch (gs.imageTransfer)
 						{
-						case 0:
+							case XFER_HOST_TO_LOCAL:
 								TransferHostLocal(pMem, len * 4);
 								break;

-						case 1:
+							case XFER_LOCAL_TO_HOST:
 								// This can't happen; downloads can not be started or performed as part of
 								// a GIFtag operation.  They're an entirely separate process that can only be
 								// done through the ReverseFIFO transfer (aka ReadFIFO). --air
@ -180,11 +182,11 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
 								//TransferLocalHost(pMem, len);
 								break;

-						case 2:
+							case XFER_LOCAL_TO_LOCAL:
 								//TransferLocalLocal();
 								break;

-						case 3:
+							case XFER_DEACTIVATED:
 								//assert(0);
 								break;

@ -193,6 +195,8 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
 								break;
 						}
 						
+					}
+
 					pMem += len * 4;

 					path->nloop -= len;
@ -236,7 +240,7 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
 	}
 }

-void CALLBACK GSgifTransfer1(u32 *pMem, u32 addr)
+EXPORT_C_(void) GSgifTransfer1(u32 *pMem, u32 addr)
 {
 	FUNCLOG

@ -250,7 +254,7 @@ void CALLBACK GSgifTransfer1(u32 *pMem, u32 addr)
 	_GSgifTransfer<0>((u32*)((u8*)pMem + addr), (0x4000 - addr) / 16);
 }

-void CALLBACK GSgifTransfer2(u32 *pMem, u32 size)
+EXPORT_C_(void) GSgifTransfer2(u32 *pMem, u32 size)
 {
 	FUNCLOG

@ -259,7 +263,7 @@ void CALLBACK GSgifTransfer2(u32 *pMem, u32 size)
 	_GSgifTransfer<1>(const_cast<u32*>(pMem), size);
 }

-void CALLBACK GSgifTransfer3(u32 *pMem, u32 size)
+EXPORT_C_(void) GSgifTransfer3(u32 *pMem, u32 size)
 {
 	FUNCLOG

@ -268,7 +272,7 @@ void CALLBACK GSgifTransfer3(u32 *pMem, u32 size)
 	_GSgifTransfer<2>(const_cast<u32*>(pMem), size);
 }

-void CALLBACK GSgifTransfer(const u32 *pMem, u32 size)
+EXPORT_C_(void) GSgifTransfer(const u32 *pMem, u32 size)
 {
 	FUNCLOG

--- a/plugins/zzogl-pg/opengl/HostMemory.cpp
+++ b/plugins/zzogl-pg/opengl/HostMemory.cpp
@ -18,8 +18,6 @@
 */

 #include "GS.h"
-#include <Cg/cg.h>
-#include <Cg/cgGL.h>

 #include <stdlib.h>
 #include "Mem.h"
@ -117,14 +115,14 @@ void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h

    if (PSMT_ISZTEX(psm))
    {
-        // Somehow, I doubt this code is right. I'll have to look into it. For the moment, I'm keeping it the
-        // way it was. --arcum42
-
+    	// This still needs an eye kept on it.
        const BLOCK& b = m_Blocks[psm];
+		const int x2 = x + w + b.width - 1;
+		const int y2 = y + h - 1;
+        bw = bw / b.width;
        
-        bw = (bw + b.width - 1) / b.width;
-        start = bp * 256 + ((y / b.height) * bw + (x / b.width)) * 0x2000;
-        end = bp * 256  + (((y + h - 1) / b.height) * bw + (x + w + b.width - 1) / b.width) * 0x2000;
+        start = (bp + ((y / b.height) * bw + (x / b.width)) * 0x20) * 0x100;
+        end = (bp + ((y2 / b.height) * bw + (x2 / b.width)) * 0x20) * 0x100;
        return;
    }
 
@ -139,46 +137,101 @@ void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h
    }
    else
    {
-        // This is what it used to do, which doesn't seem right.
-        // Keeping it for reference, in case removing it breaks anything.
- 
-        //int newx = ((x + w - 1 + 31) & ~31) - 1;
-        //int newy = ((y + h - 1 + 15) & ~15) - 1;
-        //start = getPixelAddress4(x, y, bp, bw) / 2;
-        //end = (getPixelAddress4(max(newx, x), max(newy, y), bp, bw) + 2) / 2;
- 
        start /= 2;
        end /= 2;
    }
 }

+// Same as GetRectMemAddress, except that we know x & y are zero, so it's simplified a bit; the resulting range is written to start and end.
+void GetRectMemAddressZero(int& start, int& end, int psm, int w, int h, int bp, int bw)
+{
+    FUNCLOG
+    u32 bits = 0;
+
+    if (m_Blocks[psm].bpp == 0) // a bpp of 0 is reported as a bad psm
+    {
+        ZZLog::Error_Log("ZeroGS: Bad psm 0x%x.", psm);
+        start = 0;
+        end = MEMORY_END; // fall back to the range 0..MEMORY_END
+        return;
+    }
+
+    if (PSMT_ISZTEX(psm)) // NOTE(review): presumably the depth (Z) formats - confirm against PSMT_ISZTEX
+    {
+    	// This still needs an eye kept on it.
+        const BLOCK& b = m_Blocks[psm];
+		const int x2 = w + b.width - 1;
+		const int y2 = h - 1;
+        bw = bw / b.width; // only the local copy of bw is changed
+        
+		start = bp * 0x100; // with x = y = 0 the block offset term of GetRectMemAddress is zero
+        end = (bp + ((y2 / b.height) * bw + (x2 / b.width)) * 0x20) * 0x100;
+        return;
+    }
+ 
+    bits = PSMT_BITS_NUM(psm);
+    start = getPixelFun[psm](0, 0, bp, bw);
+    end = getPixelFun[psm](w - 1, h - 1, bp, bw) + 1; // one past the address of the last pixel (w - 1, h - 1)
+ 
+    if (bits > 0)
+    {
+        start *= bits;
+        end *= bits;
+    }
+    else // NOTE(review): bits == 0 halves both values - presumably the 4-bit formats; confirm
+    {
+        start /= 2;
+        end /= 2;
+    }
+}
+ 
+// Convenience overloads: unpack Point / Size arguments and forward to the int-based GetRectMemAddress / GetRectMemAddressZero.
+void GetRectMemAddress(int& start, int& end, int psm, Point p, Size s, int bp, int bw)
+{
+	GetRectMemAddress(start, end, psm, p.x, p.y, s.w, s.h, bp, bw);
+}
+
+void GetRectMemAddress(int& start, int& end, int psm, int x, int y, Size s, int bp, int bw)
+{
+	GetRectMemAddress(start, end, psm, x, y, s.w, s.h, bp, bw);
+}
+
+void GetRectMemAddressZero(int& start, int& end, int psm, Size s, int bp, int bw)
+{
+	GetRectMemAddressZero(start, end, psm, s.w, s.h, bp, bw);
+}
+
 void InitTransferHostLocal()
 {
    FUNCLOG
 
 #if defined(_DEBUG)
    // Xenosaga 1.
-    if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw)
-        ZZLog::Debug_Log("Transfer error, width exceeded. (0x%x > 0X%x)", gs.trxpos.dx + gs.imageWnew, gs.dstbuf.bw);
+    if (gs.trxpos.dx + gs.imageNew.w > gs.dstbuf.bw)
+        ZZLog::Debug_Log("Transfer error, width exceeded. (0x%x > 0X%x)", gs.trxpos.dx + gs.imageNew.w, gs.dstbuf.bw);
 #endif
 
    //bool bHasFlushed = false;
 
-    gs.imageX = gs.trxpos.dx;
-    gs.imageY = gs.trxpos.dy;
+    gs.image.x = gs.trxpos.dx;
+    gs.image.y = gs.trxpos.dy;
 
-    gs.imageEndX = gs.imageX + gs.imageWnew;
-    gs.imageEndY = gs.imageY + gs.imageHnew;
+    gs.imageEnd.x = gs.image.x + gs.imageNew.w;
+    gs.imageEnd.y = gs.image.y + gs.imageNew.h;
 
-    assert(gs.imageEndX < 2048 && gs.imageEndY < 2048);
+    assert(gs.imageEnd.x < 2048 && gs.imageEnd.y < 2048);
 
    // This needs to be looked in to, since psm should *not* be 63.
    // hack! viewful joe
-    if (gs.dstbuf.psm == 63) gs.dstbuf.psm = 0;
+    if (gs.dstbuf.psm == 63) 
+    {
+    	ZZLog::WriteLn("gs.dstbuf.psm set to 0!");
+    	gs.dstbuf.psm = 0;
+    }
 
    int start, end;
 
-    GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
+    GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageNew, gs.dstbuf.bp, gs.dstbuf.bw);
 
    if (end > MEMORY_END)
    {
@ -189,7 +242,7 @@ void InitTransferHostLocal()
        // MEMORY_END is 0x400000...
 
        ZZLog::Warn_Log("Init host local out of bounds! (end == 0x%x)", end);
-        //gs.imageTransfer = -1;
+		//gs.transferring = false;
        end = MEMORY_END;
    }
 
@ -198,17 +251,18 @@ void InitTransferHostLocal()
    if (vb[0].nCount > 0) Flush(0);
    if (vb[1].nCount > 0) Flush(1);
 
-    //ZZLog::Prim_Log("trans: bp:%x x:%x y:%x w:%x h:%x\n", gs.dstbuf.bp, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew);
+    //ZZLog::Prim_Log("trans: bp:%x x:%x y:%x w:%x h:%x\n", gs.dstbuf.bp, gs.trxpos.dx, gs.trxpos.dy, gs.imageNew.w, gs.imageNew.h);
 }
 
 void TransferHostLocal(const void* pbyMem, u32 nQWordSize)
 {
    FUNCLOG
 
-    int start, end;
+    int start = -1, end = -1;
 
-    GetRectMemAddress(start, end, gs.dstbuf.psm, gs.imageX, gs.imageY, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
+    GetRectMemAddress(start, end, gs.dstbuf.psm, gs.image, gs.imageNew, gs.dstbuf.bp, gs.dstbuf.bw);
 	
+	if ((start == -1) || (end == -1)) ZZLog::WriteLn("start == %d, end == %d", start, end);
    assert(start < gs_imageEnd);
    end = gs_imageEnd;
 
@ -272,8 +326,8 @@ void TransferHostLocal(const void* pbyMem, u32 nQWordSize)
    {
        tex0Info t;
        t.tbp0 = gs.dstbuf.bp;
-        t.tw = gs.imageWnew;
-        t.th = gs.imageHnew;
+        t.tw = gs.imageNew.w;
+        t.th = gs.imageNew.h;
        t.tbw = gs.dstbuf.bw;
        t.psm = gs.dstbuf.psm;
        SaveTex(&t, 0);
@ -285,24 +339,24 @@ void TransferHostLocal(const void* pbyMem, u32 nQWordSize)
 void InitTransferLocalHost()
 {
    FUNCLOG
-    assert(gs.trxpos.sx + gs.imageWnew <= 2048 && gs.trxpos.sy + gs.imageHnew <= 2048);
+    assert(gs.trxpos.sx + gs.imageNew.w <= 2048 && gs.trxpos.sy + gs.imageNew.h <= 2048);
 
 #if defined(_DEBUG)
-    if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw)
-        ZZLog::Debug_Log("Transfer error, width exceeded. (0x%x > 0x%x)", gs.trxpos.sx + gs.imageWnew, gs.srcbuf.bw);
+    if (gs.trxpos.sx + gs.imageNew.w > gs.srcbuf.bw)
+        ZZLog::Debug_Log("Transfer error, width exceeded. (0x%x > 0x%x)", gs.trxpos.sx + gs.imageNew.w, gs.srcbuf.bw);
 #endif
 
-    gs.imageX = gs.trxpos.sx;
-    gs.imageY = gs.trxpos.sy;
+    gs.image.x = gs.trxpos.sx;
+    gs.image.y = gs.trxpos.sy;
 
-    gs.imageEndX = gs.imageX + gs.imageWnew;
-    gs.imageEndY = gs.imageY + gs.imageHnew;
+    gs.imageEnd.x = gs.image.x + gs.imageNew.w;
+    gs.imageEnd.y = gs.image.y + gs.imageNew.h;
 
    s_vTransferCache.resize(0);
 
    int start, end;
 
-    GetRectMemAddress(start, end, gs.srcbuf.psm, gs.trxpos.sx, gs.trxpos.sy, gs.imageWnew, gs.imageHnew, gs.srcbuf.bp, gs.srcbuf.bw);
+    GetRectMemAddress(start, end, gs.srcbuf.psm, gs.trxpos.sx, gs.trxpos.sy, gs.imageNew, gs.srcbuf.bp, gs.srcbuf.bw);
 
    ResolveInRange(start, end);
 }
@ -316,16 +370,16 @@ void TransferLocalHost(void* pbyMem, u32 nQWordSize, int& x, int& y, u8 *pstart)
    T* pbuf = (T*)pbyMem;
    u32 nSize = nQWordSize * 16 / sizeof(T);
 
-    for (; i < gs.imageEndY; ++i)
+    for (; i < gs.imageEnd.y; ++i)
    {
-        for (; j < gs.imageEndX && nSize > 0; ++j, --nSize)
+        for (; j < gs.imageEnd.x && nSize > 0; ++j, --nSize)
        {
            *pbuf++ = rp(pstart, j % 2048, i % 2048, gs.srcbuf.bw);
        }
 
-        if (j >= gs.imageEndX)
+        if (j >= gs.imageEnd.x)
        {
-            assert(j == gs.imageEndX);
+            assert(j == gs.imageEnd.x);
            j = gs.trxpos.sx;
        }
        else
@ -344,9 +398,9 @@ void TransferLocalHost_24(void* pbyMem, u32 nQWordSize, int& x, int& y, u8 *psta
    u8* pbuf = (u8*)pbyMem;
    u32 nSize = nQWordSize * 16 / 3;
 
-    for (; i < gs.imageEndY; ++i)
+    for (; i < gs.imageEnd.y; ++i)
    {
-        for (; j < gs.imageEndX && nSize > 0; ++j, --nSize)
+        for (; j < gs.imageEnd.x && nSize > 0; ++j, --nSize)
        {
            u32 p = rp(pstart, j % 2048, i % 2048, gs.srcbuf.bw);
            pbuf[0] = (u8)p;
@ -355,9 +409,9 @@ void TransferLocalHost_24(void* pbyMem, u32 nQWordSize, int& x, int& y, u8 *psta
            pbuf += 3;
        }
 
-        if (j >= gs.imageEndX)
+        if (j >= gs.imageEnd.x)
        {
-            assert(j == gs.imageEndX);
+            assert(j == gs.imageEnd.x);
            j = gs.trxpos.sx;
        }
        else
@ -372,34 +426,34 @@ void TransferLocalHost_24(void* pbyMem, u32 nQWordSize, int& x, int& y, u8 *psta
 void TransferLocalHost(void* pbyMem, u32 nQWordSize)
 {
    FUNCLOG
-    assert(gs.imageTransfer == 1);
+    assert(gs.imageTransfer == XFER_LOCAL_TO_HOST);
 
    u8* pstart = g_pbyGSMemory + 256 * gs.srcbuf.bp;
 
    switch(PSMT_BITMODE(gs.srcbuf.psm))
    {
 		case 0:
-        TransferLocalHost<u32>(pbyMem, nQWordSize, gs.imageY, gs.imageX, pstart);
+			TransferLocalHost<u32>(pbyMem, nQWordSize, gs.image.y, gs.image.x, pstart);
 			break;
 		case 1:
-        TransferLocalHost_24(pbyMem, nQWordSize, gs.imageY, gs.imageX, pstart);
+			TransferLocalHost_24(pbyMem, nQWordSize, gs.image.y, gs.image.x, pstart);
 			break;
 		case 2:
-        TransferLocalHost<u16>(pbyMem, nQWordSize, gs.imageY, gs.imageX, pstart);
+			TransferLocalHost<u16>(pbyMem, nQWordSize, gs.image.y, gs.image.x, pstart);
 			break;
 		case 3:
-        TransferLocalHost<u8>(pbyMem, nQWordSize, gs.imageY, gs.imageX, pstart);
+			TransferLocalHost<u8>(pbyMem, nQWordSize, gs.image.y, gs.image.x, pstart);
 			break;
 		default:
 			assert(0);
 			break;
    }
 
-    if (gs.imageY >= gs.imageEndY)
+    if (gs.image.y >= gs.imageEnd.y)
    {
-        ZZLog::Error_Log("gs.imageY >= gs.imageEndY!");
-        assert(gs.imageY == gs.imageEndY);
-        gs.imageTransfer = -1;
+        ZZLog::Error_Log("gs.image.y >= gs.imageEnd.y!");
+        assert(gs.image.y == gs.imageEnd.y);
+		gs.transferring = false;
    }
 }
 
@ -411,11 +465,11 @@ __forceinline void _TransferLocalLocal()
    u8* pSrcBuf = g_pbyGSMemory + gs.srcbuf.bp * 256;
    u8* pDstBuf = g_pbyGSMemory + gs.dstbuf.bp * 256;
    u32 widthlimit = 4;
-    u32 maxX = gs.trxpos.sx + gs.imageWnew;
-    u32 maxY = gs.trxpos.sy + gs.imageHnew;
+    u32 maxX = gs.trxpos.sx + gs.imageNew.w;
+    u32 maxY = gs.trxpos.sy + gs.imageNew.h;
 
    if (PSMT_BITMODE(gs.srcbuf.psm) == 0) widthlimit = 2;
-    if ((gs.imageWnew & widthlimit) != 0) return;
+    if ((gs.imageNew.w & widthlimit) != 0) return;
 
    for(u32 i = gs.trxpos.sy, i2 = gs.trxpos.dy; i < maxY; i++, i2++)
    {
@ -447,10 +501,10 @@ __forceinline void _TransferLocalLocal_4()
    _getPixelAddress_0 gdp = getPixelFun_0[gs.dstbuf.psm];
    u8* pSrcBuf = g_pbyGSMemory + gs.srcbuf.bp * 256;
    u8* pDstBuf = g_pbyGSMemory + gs.dstbuf.bp * 256;
-    u32 maxX = gs.trxpos.sx + gs.imageWnew;
-    u32 maxY = gs.trxpos.sy + gs.imageHnew;
+    u32 maxX = gs.trxpos.sx + gs.imageNew.w;
+    u32 maxY = gs.trxpos.sy + gs.imageNew.h;
 
-    assert((gs.imageWnew % 8) == 0);
+    assert((gs.imageNew.w % 8) == 0);
 
    for(u32 i = gs.trxpos.sy, i2 = gs.trxpos.dy; i < maxY; ++i, ++i2)
    {
@ -498,21 +552,21 @@ void TransferLocalLocal()
    FUNCLOG
 
    //ZZLog::Error_Log("I'z in your code, transferring your memory...");
-    assert(gs.imageTransfer == 2);
-    assert(gs.trxpos.sx + gs.imageWnew < 2048 && gs.trxpos.sy + gs.imageHnew < 2048);
-    assert(gs.trxpos.dx + gs.imageWnew < 2048 && gs.trxpos.dy + gs.imageHnew < 2048);
+    assert(gs.imageTransfer == XFER_LOCAL_TO_LOCAL);
+    assert(gs.trxpos.sx + gs.imageNew.w < 2048 && gs.trxpos.sy + gs.imageNew.h < 2048);
+    assert(gs.trxpos.dx + gs.imageNew.w < 2048 && gs.trxpos.dy + gs.imageNew.h < 2048);
    assert((gs.srcbuf.psm&0x7) == (gs.dstbuf.psm&0x7));
 
-    if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw)
-        ZZLog::Debug_Log("Transfer error, src width exceeded.(0x%x > 0x%x)", gs.trxpos.sx + gs.imageWnew, gs.srcbuf.bw);
+    if (gs.trxpos.sx + gs.imageNew.w > gs.srcbuf.bw)
+        ZZLog::Debug_Log("Transfer error, src width exceeded.(0x%x > 0x%x)", gs.trxpos.sx + gs.imageNew.w, gs.srcbuf.bw);
 
-    if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw)
-        ZZLog::Debug_Log("Transfer error, dst width exceeded.(0x%x > 0x%x)", gs.trxpos.dx + gs.imageWnew, gs.dstbuf.bw);
+    if (gs.trxpos.dx + gs.imageNew.w > gs.dstbuf.bw)
+        ZZLog::Debug_Log("Transfer error, dst width exceeded.(0x%x > 0x%x)", gs.trxpos.dx + gs.imageNew.w, gs.dstbuf.bw);
 
    int srcstart, srcend, dststart, dstend;
 
-    GetRectMemAddress(srcstart, srcend, gs.srcbuf.psm, gs.trxpos.sx, gs.trxpos.sy, gs.imageWnew, gs.imageHnew, gs.srcbuf.bp, gs.srcbuf.bw);
-    GetRectMemAddress(dststart, dstend, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
+    GetRectMemAddress(srcstart, srcend, gs.srcbuf.psm, gs.trxpos.sx, gs.trxpos.sy, gs.imageNew, gs.srcbuf.bp, gs.srcbuf.bw);
+    GetRectMemAddress(dststart, dstend, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageNew, gs.dstbuf.bp, gs.dstbuf.bw);
 
    // resolve the targs
    ResolveInRange(srcstart, srcend);
@ -547,15 +601,15 @@ void TransferLocalLocal()
    {
        tex0Info t;
        t.tbp0 = gs.dstbuf.bp;
-        t.tw = gs.imageWnew;
-        t.th = gs.imageHnew;
+        t.tw = gs.imageNew.w;
+        t.th = gs.imageNew.h;
        t.tbw = gs.dstbuf.bw;
        t.psm = gs.dstbuf.psm;
        SaveTex(&t, 0);
 
        t.tbp0 = gs.srcbuf.bp;
-        t.tw = gs.imageWnew;
-        t.th = gs.imageHnew;
+        t.tw = gs.imageNew.w;
+        t.th = gs.imageNew.h;
        t.tbw = gs.srcbuf.bw;
        t.psm = gs.srcbuf.psm;
        SaveTex(&t, 0);
@ -564,15 +618,3 @@ void TransferLocalLocal()
 #endif
 }

-__forceinline void TerminateLocalHost() 
-{
-	FUNCLOG
-	//ZZLog::Error_Log("Terminate Local Host!");
-}
-
-__forceinline void TerminateHostLocal() 
-{
-	FUNCLOG
-	gs.imageTransfer = -1;
-}
-
--- a/plugins/zzogl-pg/opengl/HostMemory.h
+++ b/plugins/zzogl-pg/opengl/HostMemory.h
@ -96,7 +96,10 @@ inline u8* _MemoryAddress(int x)
 }

 extern void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw);
-
+extern void GetRectMemAddress(int& start, int& end, int psm, Point p, Size s, int bp, int bw);
+extern void GetRectMemAddress(int& start, int& end, int psm, int x, int y, Size s, int bp, int bw);
+extern void GetRectMemAddressZero(int& start, int& end, int psm, int w, int h, int bp, int bw);
+extern void GetRectMemAddressZero(int& start, int& end, int psm, Size s, int bp, int bw);

 // called when trxdir is accessed. If host is involved, transfers memory to temp buffer byTransferBuf.
 // Otherwise performs the transfer. TODO: Perhaps divide the transfers into chunks?
@ -108,7 +111,4 @@ extern void TransferLocalHost(void* pbyMem, u32 nQWordSize);

 extern void TransferLocalLocal();

-extern void TerminateLocalHost();
-extern void TerminateHostLocal();
-
 #endif // HOSTMEMORY_H_INCLUDED
--- a/plugins/zzogl-pg/opengl/Linux/Linux.cpp
+++ b/plugins/zzogl-pg/opengl/Linux/Linux.cpp
@ -35,7 +35,7 @@ extern bool THR_bCtrl;
 static map<string, confOptsStruct> mapConfOpts;
 static gameHacks tempHacks;

-void CALLBACK GSkeyEvent(keyEvent *ev)
+EXPORT_C_(void) GSkeyEvent(keyEvent *ev)
 {
 	static bool bAlt = false;

@ -450,7 +450,7 @@ void DisplayDialog()
 	gtk_widget_destroy(dialog);
 }

-void CALLBACK GSconfigure()
+EXPORT_C_(void) GSconfigure()
 {
 	char strcurdir[256];
 	getcwd(strcurdir, 256);
@ -484,12 +484,12 @@ void SysMessage(const char *fmt, ...)
 	gtk_widget_destroy(dialog);
 }

-void CALLBACK GSabout()
+EXPORT_C_(void) GSabout()
 {
 	SysMessage("ZZOgl PG: by Zeydlitz (PG version worked on by arcum42, gregory, and the pcsx2 development team). Based off of ZeroGS, by zerofrog.");
 }

-s32 CALLBACK GStest()
+EXPORT_C_(s32) GStest()
 {
 	return 0;
 }
--- a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp
+++ b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp
@ -16,11 +16,14 @@
 				<Compiler>
 					<Add option="-Wall" />
 					<Add option="-g" />
+					<Add option="-I/opt/cg/include" />
+					<Add option="-L/opt/cg/lib" />
 					<Add option="-DZEROGS_DEVBUILD" />
 					<Add option="-D_DEBUG" />
 				</Compiler>
 				<Linker>
 					<Add library="../../../../../deps/debug/libUtilities.a" />
+					<Add library="Cg" />
 				</Linker>
 			</Target>
 			<Target title="Devel">
@ -33,12 +36,15 @@
 				<Compiler>
 					<Add option="-O2" />
 					<Add option="-g" />
+					<Add option="-I/opt/cg/include" />
+					<Add option="-L/opt/cg/lib" />
 					<Add option="-W" />
 					<Add option="-DZEROGS_DEVBUILD" />
 					<Add option="-DNDEBUG" />
 				</Compiler>
 				<Linker>
 					<Add library="../../../../../deps/release/libUtilities.a" />
+					<Add library="Cg" />
 				</Linker>
 			</Target>
 			<Target title="Release">
@ -50,10 +56,69 @@
 				<Option createStaticLib="1" />
 				<Compiler>
 					<Add option="-O2" />
+					<Add option="-I/opt/cg/include" />
+					<Add option="-L/opt/cg/lib" />
 					<Add option="-W" />
 					<Add option="-DRELEASE_TO_PUBLIC" />
 					<Add option="-DNDEBUG" />
 				</Compiler>
+				<Linker>
+					<Add option="-s" />
+					<Add library="../../../../../deps/release/libUtilities.a" />
+					<Add library="Cg" />
+				</Linker>
+			</Target>
+			<Target title="GLSL - Debug">
+				<Option output="../../../../../bin/plugins/ZZOgl-PG-GLSL-dbg.so" prefix_auto="0" extension_auto="0" />
+				<Option object_output="obj/Debug/" />
+				<Option type="3" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Option createStaticLib="1" />
+				<Compiler>
+					<Add option="-Wall" />
+					<Add option="-g" />
+					<Add option="-DZEROGS_DEVBUILD" />
+					<Add option="-D_DEBUG" />
+					<Add option="-DGLSL_API" />
+				</Compiler>
+				<Linker>
+					<Add library="../../../../../deps/debug/libUtilities.a" />
+				</Linker>
+			</Target>
+			<Target title="GLSL - Devel">
+				<Option output="../../../../../bin/plugins/ZZOgl-PG-GLSL-dev" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Release/" />
+				<Option type="3" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Option createStaticLib="1" />
+				<Compiler>
+					<Add option="-O2" />
+					<Add option="-g" />
+					<Add option="-W" />
+					<Add option="-DZEROGS_DEVBUILD" />
+					<Add option="-DNDEBUG" />
+					<Add option="-DGLSL_API" />
+				</Compiler>
+				<Linker>
+					<Add library="../../../../../deps/release/libUtilities.a" />
+				</Linker>
+			</Target>
+			<Target title="GLSL - Release">
+				<Option output="../../../../../bin/plugins/ZZOgl-PG-GLSL" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Release/" />
+				<Option type="3" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Option createStaticLib="1" />
+				<Compiler>
+					<Add option="-O2" />
+					<Add option="-W" />
+					<Add option="-DRELEASE_TO_PUBLIC" />
+					<Add option="-DNDEBUG" />
+					<Add option="-DGLSL_API" />
+				</Compiler>
 				<Linker>
 					<Add option="-s" />
 					<Add library="../../../../../deps/release/libUtilities.a" />
@ -68,8 +133,6 @@
 			<Add option="-Wno-unused-value" />
 			<Add option="-Wunused-variable" />
 			<Add option="-m32" />
-			<Add option="-I/opt/cg/include" />
-			<Add option="-L/opt/cg/lib" />
 			<Add option="-msse2" />
 			<Add option="-fno-regmove" />
 			<Add option="-fno-strict-aliasing" />
@ -96,10 +159,10 @@
 			<Add library="z" />
 			<Add library="dl" />
 			<Add library="stdc++" />
-			<Add library="Cg" />
 		</Linker>
 		<ExtraCommands>
 			<Add after="cp $(PROJECT_DIR)/../../ps2hw.dat $(TARGET_OUTPUT_DIR)/" />
+			<Add after="cp $(PROJECT_DIR)/../../ps2hw.glsl $(TARGET_OUTPUT_DIR)/" />
 			<Mode after="always" />
 		</ExtraCommands>
 		<Unit filename="../../CRC.h" />
@ -121,8 +184,6 @@
 		<Unit filename="../../Mem_Swizzle.h" />
 		<Unit filename="../../Mem_Tables.cpp" />
 		<Unit filename="../../Mem_Transmit.h" />
-		<Unit filename="../../NewRegs.cpp" />
-		<Unit filename="../../NewRegs.h" />
 		<Unit filename="../../Profile.cpp" />
 		<Unit filename="../../Profile.h" />
 		<Unit filename="../../Regs.cpp" />
@ -147,24 +208,29 @@
 		</Unit>
 		<Unit filename="../../ZZClut.cpp" />
 		<Unit filename="../../ZZClut.h" />
+		<Unit filename="../../ZZDepthTargets.cpp" />
 		<Unit filename="../../ZZGl.h" />
 		<Unit filename="../../ZZHacks.cpp" />
 		<Unit filename="../../ZZHacks.h" />
 		<Unit filename="../../ZZKeyboard.cpp" />
 		<Unit filename="../../ZZLog.cpp" />
 		<Unit filename="../../ZZLog.h" />
+		<Unit filename="../../ZZMemoryTargets.cpp" />
+		<Unit filename="../../ZZRenderTargets.cpp" />
 		<Unit filename="../../ZZoglCRTC.cpp" />
 		<Unit filename="../../ZZoglCRTC.h" />
 		<Unit filename="../../ZZoglCreate.cpp" />
 		<Unit filename="../../ZZoglDrawing.cpp" />
 		<Unit filename="../../ZZoglDrawing.h" />
 		<Unit filename="../../ZZoglFlush.cpp" />
+		<Unit filename="../../ZZoglFlush.h" />
 		<Unit filename="../../ZZoglFlushHack.cpp" />
 		<Unit filename="../../ZZoglFlushHack.h" />
 		<Unit filename="../../ZZoglMath.h" />
 		<Unit filename="../../ZZoglSave.cpp" />
 		<Unit filename="../../ZZoglShaders.cpp" />
 		<Unit filename="../../ZZoglShaders.h" />
+		<Unit filename="../../ZZoglShadersGLSL.cpp" />
 		<Unit filename="../../ZZoglShoots.cpp" />
 		<Unit filename="../../ZZoglShoots.h" />
 		<Unit filename="../../ZZoglVB.cpp" />
--- a/plugins/zzogl-pg/opengl/Mem.cpp
+++ b/plugins/zzogl-pg/opengl/Mem.cpp
@ -25,9 +25,11 @@
 #include "Mem_Transmit.h"
 #include "Mem_Swizzle.h"
 #ifdef ZEROGS_SSE2
-#include <emmintrin.h>
+#include <immintrin.h>
 #endif

+#ifdef ZZNORMAL_MEMORY
+
 BLOCK m_Blocks[0x40]; // do so blocks are indexable

 PCSX2_ALIGNED16(u32 tempblock[64]);
@ -53,41 +55,41 @@ u8* pstart;
 template <class T>
 static __forceinline const T* AlignOnBlockBoundry(TransferData data, TransferFuncts fun, Point alignedPt, int& endY, const T* pbuf)
 {
-	bool bCanAlign = ((MOD_POW2(gs.trxpos.dx, data.blockwidth) == 0) && (gs.imageX == gs.trxpos.dx) &&
+	bool bCanAlign = ((MOD_POW2(gs.trxpos.dx, data.blockwidth) == 0) && (gs.image.x == gs.trxpos.dx) &&
 					  (alignedPt.y > endY) && (alignedPt.x > gs.trxpos.dx));

-	if ((gs.imageEndX - gs.trxpos.dx) % data.widthlimit)
+	if ((gs.imageEnd.x - gs.trxpos.dx) % data.widthlimit)
 	{
 		/* hack */
 		int testwidth = (int)nSize -
-						(gs.imageEndY - gs.imageY) * (gs.imageEndX - gs.trxpos.dx)
-						+ (gs.imageX - gs.trxpos.dx);
+						(gs.imageEnd.y - gs.image.y) * (gs.imageEnd.x - gs.trxpos.dx)
+						+ (gs.image.x - gs.trxpos.dx);

 		if ((testwidth <= data.widthlimit) && (testwidth >= -data.widthlimit))
 		{
 			/* don't transfer */
-			/*ZZLog::Debug_Log("Bad texture %s: %d %d %d", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/
+			/*ZZLog::Debug_Log("Bad texture %s: %d %d %d", #psm, gs.trxpos.dx, gs.imageEnd.x, nQWordSize);*/
 			//ZZLog::Error_Log("Bad texture: testwidth = %d; data.widthlimit = %d", testwidth, data.widthlimit);
-			gs.imageTransfer = -1;
+			gs.transferring = false;
 		}

 		bCanAlign = false;
 	}

 	/* first align on block boundary */
-	if (MOD_POW2(gs.imageY, data.blockheight) || !bCanAlign)
+	if (MOD_POW2(gs.image.y, data.blockheight) || !bCanAlign)
 	{
 		u32 transwidth;

 		if (!bCanAlign)
-			endY = gs.imageEndY; /* transfer the whole image */
+			endY = gs.imageEnd.y; /* transfer the whole image */
 		else
-			assert(endY < gs.imageEndY);  /* part of alignment condition */
+			assert(endY < gs.imageEnd.y);  /* part of alignment condition */

-		if (((gs.imageEndX - gs.trxpos.dx) % data.widthlimit) || ((gs.imageEndX - gs.imageX) % data.widthlimit))
+		if (((gs.imageEnd.x - gs.trxpos.dx) % data.widthlimit) || ((gs.imageEnd.x - gs.image.x) % data.widthlimit))
 		{
 			/* transmit with a width of 1 */
-			transwidth = (1 + (DSTPSM == PSMT4));
+			transwidth = (1 + (gs.dstbuf.psm == PSMT4));
 		}
 		else
 		{
@ -98,7 +100,7 @@ static __forceinline const T* AlignOnBlockBoundry(TransferData data, TransferFun

 		if (pbuf == NULL) return NULL;

-		if (nSize == 0 || tempY == gs.imageEndY) return NULL;
+		if (nSize == 0 || tempY == gs.imageEnd.y) return NULL;
 	}

 	return pbuf;
@ -112,14 +114,14 @@ static __forceinline const T* TransferAligningToBlocks(TransferData data, Transf
 	_SwizzleBlock swizzle;

 	/* can align! */
-	pitch = gs.imageEndX - gs.trxpos.dx;
+	pitch = gs.imageEnd.x - gs.trxpos.dx;
 	area = pitch * data.blockheight;
-	fracX = gs.imageEndX - alignedPt.x;
+	fracX = gs.imageEnd.x - alignedPt.x;

 	/* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */
 	bAligned = !((uptr)pbuf & 0xf) && (TransPitch(pitch, data.transfersize) & 0xf) == 0;

-	if (bAligned || ((DSTPSM == PSMCT24) || (DSTPSM == PSMT8H) || (DSTPSM == PSMT4HH) || (DSTPSM == PSMT4HL)))
+	if (bAligned || ((gs.dstbuf.psm == PSMCT24) || (gs.dstbuf.psm == PSMT8H) || (gs.dstbuf.psm == PSMT4HH) || (gs.dstbuf.psm == PSMT4HL)))
 		swizzle = (fun.Swizzle);
 	else
 		swizzle = (fun.Swizzle_u);
@ -140,7 +142,7 @@ static __forceinline const T* TransferAligningToBlocks(TransferData data, Transf
 #endif

 		/* transfer the rest */
-		if (alignedPt.x < gs.imageEndX)
+		if (alignedPt.x < gs.imageEnd.x)
 		{
 			pbuf = TransmitHostLocalX<T>(data.psm, fun.wp, data.widthlimit, data.blockheight, alignedPt.x, pbuf);

@ -161,19 +163,19 @@ static __forceinline const T* TransferAligningToBlocks(TransferData data, Transf

 static __forceinline int FinishTransfer(TransferData data, int nLeftOver)
 {
-	if (tempY >= gs.imageEndY)
+	if (tempY >= gs.imageEnd.y)
 	{
-		assert(gs.imageTransfer == -1 || tempY == gs.imageEndY);
-		gs.imageTransfer = -1;
+		assert( gs.transferring == false || tempY == gs.imageEnd.y);
+		gs.transferring = false;
 		/*int start, end;
-		GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
+		GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageNew, gs.dstbuf.bp, gs.dstbuf.bw);
 		g_MemTargs.ClearRange(start, end);*/
 	}
 	else
 	{
 		/* update new params */
-		gs.imageY = tempY;
-		gs.imageX = tempX;
+		gs.image.y = tempY;
+		gs.image.x = tempX;
 	}

 	return (nSize * TransPitch(2, data.transfersize) + nLeftOver) / 2;
@ -182,23 +184,23 @@ static __forceinline int FinishTransfer(TransferData data, int nLeftOver)
 template <class T>
 static __forceinline int RealTransfer(u32 psm, const void* pbyMem, u32 nQWordSize)
 {
-	assert(gs.imageTransfer == 0);
+	assert(gs.imageTransfer == XFER_HOST_TO_LOCAL);
 	TransferData data = tData[psm];
 	TransferFuncts fun(psm);
 	pstart = g_pbyGSMemory + gs.dstbuf.bp * 256;
 	const T* pbuf = (const T*)pbyMem;
 	const int tp2 = TransPitch(2, data.transfersize);
 	int nLeftOver = (nQWordSize * 4 * 2) % tp2;
-	tempY = gs.imageY;
-	tempX = gs.imageX;
+	tempY = gs.image.y;
+	tempX = gs.image.x;
 	Point alignedPt;

 	nSize = (nQWordSize * 4 * 2) / tp2;
-	nSize = min(nSize, gs.imageWnew * gs.imageHnew);
+	nSize = min(nSize, gs.imageNew.w * gs.imageNew.h);

-	int endY = ROUND_UPPOW2(gs.imageY, data.blockheight);
-	alignedPt.y = ROUND_DOWNPOW2(gs.imageEndY, data.blockheight);
-	alignedPt.x = ROUND_DOWNPOW2(gs.imageEndX, data.blockwidth);
+	int endY = ROUND_UPPOW2(gs.image.y, data.blockheight);
+	alignedPt.y = ROUND_DOWNPOW2(gs.imageEnd.y, data.blockheight);
+	alignedPt.x = ROUND_DOWNPOW2(gs.imageEnd.x, data.blockwidth);

 	pbuf = AlignOnBlockBoundry<T>(data, fun, alignedPt, endY, pbuf);

@ -210,12 +212,12 @@ static __forceinline int RealTransfer(u32 psm, const void* pbyMem, u32 nQWordSiz

 	if (TransPitch(nSize, data.transfersize) / 4 > 0)
 	{
-		pbuf = TransmitHostLocalY<T>(psm, fun.wp, data.widthlimit, gs.imageEndY, pbuf);
+		pbuf = TransmitHostLocalY<T>(psm, fun.wp, data.widthlimit, gs.imageEnd.y, pbuf);

 		if (pbuf == NULL) return FinishTransfer(data, nLeftOver);

 		/* sometimes wrong sizes are sent (tekken tag) */
-		assert(gs.imageTransfer == -1 || TransPitch(nSize, data.transfersize) / 4 <= 2);
+		assert(gs.transferring == false || TransPitch(nSize, data.transfersize) / 4 <= 2);
 	}

 	return FinishTransfer(data, nLeftOver);
@ -382,3 +384,5 @@ void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, in
 	m_Blocks[PSMT4] = b;
 	m_Blocks[PSMT4].SetFun(PSMT4);
 }
+
+#endif
--- a/plugins/zzogl-pg/opengl/Mem.h
+++ b/plugins/zzogl-pg/opengl/Mem.h
@ -32,7 +32,13 @@ static __forceinline int MOD_POW2(int val, int base) { return ((val)&(base - 1))
 const int BLOCK_TEXWIDTH = 128;
 const int BLOCK_TEXHEIGHT = 512;

-extern PCSX2_ALIGNED16(u32 tempblock[64]);
+// PSM is a 6-bit value, so we MUST guarantee that we don't crash on an incorrect psm.
+#define MAX_PSM 64
+#define TABLE_WIDTH 8
+
+#ifndef ZZNORMAL_MEMORY
+#include "ZZoglMem.h"
+#endif

 typedef u32(*_getPixelAddress)(int x, int y, u32 bp, u32 bw);
 typedef u32(*_getPixelAddress_0)(int x, int y, u32 bw);
@ -54,6 +60,7 @@ extern _SwizzleBlock swizzleBlockUnFun[64];
 extern _TransferHostLocal TransferHostLocalFun[64];
 extern _TransferLocalHost TransferLocalHostFun[64];

+
 // Both of the following structs should probably be local class variables or in a namespace,
 // but this works for the moment.

@ -68,6 +75,9 @@ struct TransferData
 	u32 psm;
 };

+#ifdef ZZNORMAL_MEMORY
+extern PCSX2_ALIGNED16(u32 tempblock[64]);
+
 struct TransferFuncts
 {
 	_writePixel_0 wp;
@ -500,6 +510,8 @@ static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw)

 ///////////////

+#endif
+
 extern int TransferHostLocal32(const void* pbyMem, u32 nQWordSize);
 extern int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize);
 extern int TransferHostLocal24(const void* pbyMem, u32 nQWordSize);
--- a/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp
+++ b/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp
@ -21,15 +21,11 @@
 #include "Mem.h"
 #include "Mem_Swizzle.h"
 #ifdef ZEROGS_SSE2
-#include <emmintrin.h>
+#include <immintrin.h>
 #endif

 // WARNING a sfence instruction must be call after SwizzleBlock sse2 function

-// Current port of the ASM function to intrinsic
-#define INTRINSIC_PORT_16
-#define INTRINSIC_PORT_8
-#define INTRINSIC_PORT_4
 #ifdef ZEROGS_SSE2
 static const __aligned16 u32 mask_24b_H[4] = {0xFF000000, 0x0000FFFF, 0xFF000000, 0x0000FFFF};
 static const __aligned16 u32 mask_24b_L[4] = {0x00FFFFFF, 0x00000000, 0x00FFFFFF, 0x00000000};
@ -501,29 +497,17 @@ __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)

 __forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_16
 	SwizzleBlock16_sse2_I<true>(dst, src, pitch);
-#else
-	SwizzleBlock16_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_8
 	SwizzleBlock8_sse2_I<true>(dst, src, pitch);
-#else
-	SwizzleBlock8_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_4
 	SwizzleBlock4_sse2_I<true>(dst, src, pitch);
-#else
-	SwizzleBlock4_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
@ -533,29 +517,17 @@ __forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)

 __forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_16
 	SwizzleBlock16_sse2_I<false>(dst, src, pitch);
-#else
-	SwizzleBlock16u_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_8
 	SwizzleBlock8_sse2_I<false>(dst, src, pitch);
-#else
-	SwizzleBlock8u_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_4
 	SwizzleBlock4_sse2_I<false>(dst, src, pitch);
-#else
-	SwizzleBlock4u_sse2(dst, src, pitch);
-#endif
 }

 __forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
--- a/plugins/zzogl-pg/opengl/Mem_Tables.cpp
+++ b/plugins/zzogl-pg/opengl/Mem_Tables.cpp
@ -250,6 +250,92 @@ u32 g_pageTable16SZ[64][64];
 u32 g_pageTable8[64][128];
 u32 g_pageTable4[128][128];

+
+// The highest PSM used below is 58, but PSM is a 6-bit value, so our arrays have MAX_PSM elements
+
+// This table is used for fast access to memory storage data. The meaning of each field is as follows:
+// 0 -- (1 << [psm][0]) is the number of pixels per storage format. It's 0 if 1 pixel is stored, 1 for 2 pixels (16-bit), 2 for 4 pixels (PSMT8) and 3 for 8 (PSMT4)
+// 5 -- is 3 - [psm][0]. Just for speed
+// 3, 4 -- size-1 of pageTable for psm. It is used to clamp x, y that fall outside the boundaries.
+// 1, 2 -- the numbers (1 << [psm][1]) and (1 << [psm][2]) are also the size of pageTable. So [psm][3] = (1 << [psm][1]) - 1
+//	Also note that [psm][1] = 5 + ([psm][0] + 1) / 2, and [psm][2] = 6 + [psm][0] / 2.
+// 6 -- pixel mask; when applied to a word, it leaves only the bits of the pixel format (e.g. 0x00ffffff for PSMCT24)
+// 7 -- starting bit position of the data in the word; PSMT8H, 4HL and 4HH data is not stored from the beginning.
+u32 ZZ_DT[MAX_PSM][TABLE_WIDTH] = {
+	{0, 5, 6,  31,  63, 3, 0xffffffff, 0}, // 0 PSMCT32	
+	{0, 5, 6,  31,  63, 3, 0x00ffffff, 0}, // 1 PSMCT24
+	{1, 6, 6,  63,  63, 2, 0x0000ffff, 0}, // 2 PSMCT16
+	{0, }, // 3
+	{0, }, // 4
+	{0, }, // 5
+	{0, }, // 6
+	{0, }, // 7
+	{0, }, // 8
+	{0, }, // 9
+	{1, 6, 6,  63,  63, 2, 0x0000ffff, 0}, // 10 PSMCT16S
+ 	{0, }, // 11
+	{0, }, // 12
+	{0, }, // 13
+	{0, }, // 14
+	{0, }, // 15
+	{0, }, // 16
+	{0, }, // 17
+	{0, }, // 18
+	{2, 6, 7,  63, 127, 1, 0x000000ff, 0}, // 19 PSMT8
+	{3, 7, 7, 127, 127, 0, 0x0000000f, 0}, // 20 PSMT4
+	{0, }, // 21
+	{0, }, // 22
+	{0, }, // 23
+	{0, }, // 24
+	{0, }, // 25
+	{0, }, // 26
+	{0, 5, 6,  31,  63, 3, 0x000000ff, 24}, // 27 PSMT8H
+	{0, }, // 28
+	{0, }, // 29
+	{0, }, // 30
+	{0, }, // 31
+	{0, }, // 32
+	{0, }, // 33
+	{0, }, // 34
+	{0, }, // 35
+	{0, 5, 6,  31,  63, 3, 0x0000000f, 24}, // 36 PSMT4HL
+	{0, }, // 37
+	{0, }, // 38
+	{0, }, // 39
+	{0, }, // 40
+	{0, }, // 41
+	{0, }, // 42
+	{0, }, // 43
+	{0, 5, 6,  31,  63, 3, 0x0000000f, 28}, // 44 PSMT4HH
+	{0, }, // 45
+	{0, }, // 46
+	{0, }, // 47
+	{0, 5, 6,  31,  63, 3, 0xffffffff, 0}, // 48 PSMCT32Z
+	{0, 5, 6,  31,  63, 3, 0x00ffffff, 0}, // 49 PSMCT24Z
+	{1, 6, 6,  63,  63, 2, 0x0000ffff, 0}, // 50 PSMCT16Z
+	{0, }, // 51
+	{0, }, // 52
+	{0, }, // 53
+	{0, }, // 54
+	{0, }, // 55
+	{0, }, // 56
+	{0, }, // 57
+	{1, 6, 6,  63,  63, 2, 0x0000ffff, 0}, // 58 PSMCT16SZ
+	{0, }, // 59
+	{0, }, // 60
+	{0, }, // 61
+	{0, }, // 62
+	{0, }, // 63
+};
+
+
+// The highest PSM used above is 58, but PSM is a 6-bit value, so our arrays have MAX_PSM elements
+u32** g_pageTable[MAX_PSM] = {NULL,};
+u32** g_blockTable[MAX_PSM] = {NULL, };
+u32** g_columnTable[MAX_PSM] = {NULL, };
+u32 g_pageTable2[MAX_PSM][127][127] = {0, };
+u32** g_pageTableNew[MAX_PSM] = {NULL,};
+
 /* PSM reference array
 { 	32, 24, 16, NULL, NULL, NULL, NULL, NULL,
 	NULL, NULL, 16S, NULL, NULL, NULL, NULL, NULL,
--- a/plugins/zzogl-pg/opengl/Mem_Transmit.h
+++ b/plugins/zzogl-pg/opengl/Mem_Transmit.h
@ -23,7 +23,6 @@
 #include "GS.h"
 #include "Mem.h"

-#define DSTPSM gs.dstbuf.psm
 extern int tempX, tempY;
 extern int pitch, area, fracX;
 extern int nSize;
@ -37,13 +36,13 @@ static __forceinline const T *TransmitHostLocalY_(_writePixel_0 wp, s32 widthlim
 {
 	assert((nSize % widthlimit) == 0 && widthlimit <= 4);

-	if ((gs.imageEndX - gs.trxpos.dx) % widthlimit)
+	if ((gs.imageEnd.x - gs.trxpos.dx) % widthlimit)
 	{
-		// ZZLog::GS_Log("Bad Transmission! %d %d, psm: %d", gs.trxpos.dx, gs.imageEndX, DSTPSM);
+		// ZZLog::GS_Log("Bad Transmission! %d %d, psm: %d", gs.trxpos.dx, gs.imageEnd.x, gs.dstbuf.psm);

 		for (; tempY < endY; ++tempY)
 		{
-			for (; tempX < gs.imageEndX && nSize > 0; tempX += 1, nSize -= 1, buf += 1)
+			for (; tempX < gs.imageEnd.x && nSize > 0; tempX += 1, nSize -= 1, buf += 1)
 			{
 				/* write as many pixel at one time as possible */
 				wp(pstart, tempX % 2048, tempY % 2048, buf[0], gs.dstbuf.bw);
@ -53,7 +52,7 @@ static __forceinline const T *TransmitHostLocalY_(_writePixel_0 wp, s32 widthlim

 	for (; tempY < endY; ++tempY)
 	{
-		for (; tempX < gs.imageEndX && nSize > 0; tempX += widthlimit, nSize -= widthlimit, buf += widthlimit)
+		for (; tempX < gs.imageEnd.x && nSize > 0; tempX += widthlimit, nSize -= widthlimit, buf += widthlimit)
 		{

 			/* write as many pixel at one time as possible */
@ -77,14 +76,14 @@ static __forceinline const T *TransmitHostLocalY_(_writePixel_0 wp, s32 widthlim
 			}
 		}

-		if (tempX >= gs.imageEndX)
+		if (tempX >= gs.imageEnd.x)
 		{
-			assert(tempX == gs.imageEndX);
+			assert(tempX == gs.imageEnd.x);
 			tempX = gs.trxpos.dx;
 		}
 		else
 		{
-			assert(gs.imageTransfer == -1 || nSize*sizeof(T) / 4 == 0);
+			assert(gs.transferring == false || nSize*sizeof(T) / 4 == 0);
 			return NULL;
 		}
 	}
@ -96,24 +95,24 @@ static __forceinline const T *TransmitHostLocalY_(_writePixel_0 wp, s32 widthlim
 template <class T>
 static __forceinline const T *TransmitHostLocalY_24(_writePixel_0 wp, s32 widthlimit, int endY, const T *buf)
 {
-	if (widthlimit != 8 || ((gs.imageEndX - gs.trxpos.dx) % widthlimit))
+	if (widthlimit != 8 || ((gs.imageEnd.x - gs.trxpos.dx) % widthlimit))
 	{
-		//ZZLog::GS_Log("Bad Transmission! %d %d, psm: %d", gs.trxpos.dx, gs.imageEndX, DSTPSM);
+		//ZZLog::GS_Log("Bad Transmission! %d %d, psm: %d", gs.trxpos.dx, gs.imageEnd.x, gs.dstbuf.psm);
 		for (; tempY < endY; ++tempY)
 		{
-			for (; tempX < gs.imageEndX && nSize > 0; tempX += 1, nSize -= 1, buf += 3)
+			for (; tempX < gs.imageEnd.x && nSize > 0; tempX += 1, nSize -= 1, buf += 3)
 			{
 				wp(pstart, tempX % 2048, tempY % 2048, *(u32*)(buf), gs.dstbuf.bw);
 			}

-			if (tempX >= gs.imageEndX)
+			if (tempX >= gs.imageEnd.x)
 			{
-				assert(gs.imageTransfer == -1 || tempX == gs.imageEndX);
+				assert(gs.transferring == false || tempX == gs.imageEnd.x);
 				tempX = gs.trxpos.dx;
 			}
 			else
 			{
-				assert(gs.imageTransfer == -1 || nSize == 0);
+				assert(gs.transferring == false || nSize == 0);
 				return NULL;
 			}
 		}
@ -124,7 +123,7 @@ static __forceinline const T *TransmitHostLocalY_24(_writePixel_0 wp, s32 widthl

 		for (; tempY < endY; ++tempY)
 		{
-			for (; tempX < gs.imageEndX && nSize > 0; tempX += widthlimit, nSize -= widthlimit, buf += 3 * widthlimit)
+			for (; tempX < gs.imageEnd.x && nSize > 0; tempX += widthlimit, nSize -= widthlimit, buf += 3 * widthlimit)
 			{
 				if (nSize < widthlimit) return NULL;

@ -140,9 +139,9 @@ static __forceinline const T *TransmitHostLocalY_24(_writePixel_0 wp, s32 widthl
 				wp(pstart, (tempX + 7) % 2048, tempY % 2048, *(u32*)(buf + 21), gs.dstbuf.bw);
 			}

-			if (tempX >= gs.imageEndX)
+			if (tempX >= gs.imageEnd.x)
 			{
-				assert(gs.imageTransfer == -1 || tempX == gs.imageEndX);
+				assert(gs.transferring == false || tempX == gs.imageEnd.x);
 				tempX = gs.trxpos.dx;
 			}
 			else
@ -155,7 +154,7 @@ static __forceinline const T *TransmitHostLocalY_24(_writePixel_0 wp, s32 widthl
 					nSize = 0;
 				}

-				assert(gs.imageTransfer == -1 || nSize == 0);
+				assert(gs.transferring == false || nSize == 0);

 				return NULL;
 			}
@ -171,7 +170,7 @@ static __forceinline const T *TransmitHostLocalY_4(_writePixel_0 wp, s32 widthli
 {
 	for (; tempY < endY; ++tempY)
 	{
-		for (; tempX < gs.imageEndX && nSize > 0; tempX += widthlimit, nSize -= widthlimit)
+		for (; tempX < gs.imageEnd.x && nSize > 0; tempX += widthlimit, nSize -= widthlimit)
 		{
 			/* write as many pixel at one time as possible */
 			wp(pstart, tempX % 2048, tempY % 2048, *buf&0x0f, gs.dstbuf.bw);
@ -200,13 +199,13 @@ static __forceinline const T *TransmitHostLocalY_4(_writePixel_0 wp, s32 widthli
 			}
 		}

-		if (tempX >= gs.imageEndX)
+		if (tempX >= gs.imageEnd.x)
 		{
 			tempX = gs.trxpos.dx;
 		}
 		else
 		{
-			assert(gs.imageTransfer == -1 || (nSize / 32) == 0);
+			assert(gs.transferring == false || (nSize / 32) == 0);
 			return NULL;
 		}
 	}
@ -238,7 +237,7 @@ static __forceinline const T *TransmitHostLocalX_(_writePixel_0 wp, u32 widthlim
 {
 	for (u32 tempi = 0; tempi < blockheight; ++tempi)
 	{
-		for (tempX = startX; tempX < gs.imageEndX; tempX++, buf++)
+		for (tempX = startX; tempX < gs.imageEnd.x; tempX++, buf++)
 		{
 			wp(pstart, tempX % 2048, (tempY + tempi) % 2048, buf[0], gs.dstbuf.bw);
 		}
@ -255,7 +254,7 @@ static __forceinline const T *TransmitHostLocalX_24(_writePixel_0 wp, u32 widthl
 {
 	for (u32 tempi = 0; tempi < blockheight; ++tempi)
 	{
-		for (tempX = startX; tempX < gs.imageEndX; tempX++, buf += 3)
+		for (tempX = startX; tempX < gs.imageEnd.x; tempX++, buf += 3)
 		{
 			wp(pstart, tempX % 2048, (tempY + tempi) % 2048, *(u32*)buf, gs.dstbuf.bw);
 		}
@ -272,7 +271,7 @@ static __forceinline const T *TransmitHostLocalX_4(_writePixel_0 wp, u32 widthli
 {
 	for (u32 tempi = 0; tempi < blockheight; ++tempi)
 	{
-		for (tempX = startX; tempX < gs.imageEndX; tempX += 2, buf++)
+		for (tempX = startX; tempX < gs.imageEnd.x; tempX += 2, buf++)
 		{
 			wp(pstart, tempX % 2048, (tempY + tempi) % 2048, buf[0]&0x0f, gs.dstbuf.bw);
 			wp(pstart, (tempX + 1) % 2048, (tempY + tempi) % 2048, buf[0] >> 4, gs.dstbuf.bw);
--- a/plugins/zzogl-pg/opengl/NewRegs.cpp
+++ b/plugins/zzogl-pg/opengl/NewRegs.cpp
--- a/plugins/zzogl-pg/opengl/NewRegs.h
+++ b/plugins/zzogl-pg/opengl/NewRegs.h
@ -1,973 +0,0 @@
-/*  ZZ Open GL graphics plugin
- *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
- *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
- */
-
-#ifndef NEWREGS_H_INCLUDED
-#define NEWREGS_H_INCLUDED
-
-#ifdef USE_OLD_REGS
-#include "Regs.h"
-#else
-
-enum GIF_REG
-{
-	GIF_REG_PRIM	= 0x00,
-	GIF_REG_RGBA	= 0x01,
-	GIF_REG_STQ		= 0x02,
-	GIF_REG_UV		= 0x03,
-	GIF_REG_XYZF2	= 0x04,
-	GIF_REG_XYZ2	= 0x05,
-	GIF_REG_TEX0_1	= 0x06,
-	GIF_REG_TEX0_2	= 0x07,
-	GIF_REG_CLAMP_1	= 0x08,
-	GIF_REG_CLAMP_2	= 0x09,
-	GIF_REG_FOG		= 0x0a,
-	GIF_REG_XYZF3	= 0x0c,
-	GIF_REG_XYZ3	= 0x0d,
-	GIF_REG_A_D		= 0x0e,
-	GIF_REG_NOP		= 0x0f,
-};
-
-enum GIF_A_D_REG
-{
-	GIF_A_D_REG_PRIM		= 0x00,
-	GIF_A_D_REG_RGBAQ		= 0x01,
-	GIF_A_D_REG_ST			= 0x02,
-	GIF_A_D_REG_UV			= 0x03,
-	GIF_A_D_REG_XYZF2		= 0x04,
-	GIF_A_D_REG_XYZ2		= 0x05,
-	GIF_A_D_REG_TEX0_1		= 0x06,
-	GIF_A_D_REG_TEX0_2		= 0x07,
-	GIF_A_D_REG_CLAMP_1		= 0x08,
-	GIF_A_D_REG_CLAMP_2		= 0x09,
-	GIF_A_D_REG_FOG			= 0x0a,
-	GIF_A_D_REG_XYZF3		= 0x0c,
-	GIF_A_D_REG_XYZ3		= 0x0d,
-	GIF_A_D_REG_NOP			= 0x0f,
-	GIF_A_D_REG_TEX1_1		= 0x14,
-	GIF_A_D_REG_TEX1_2		= 0x15,
-	GIF_A_D_REG_TEX2_1		= 0x16,
-	GIF_A_D_REG_TEX2_2		= 0x17,
-	GIF_A_D_REG_XYOFFSET_1	= 0x18,
-	GIF_A_D_REG_XYOFFSET_2	= 0x19,
-	GIF_A_D_REG_PRMODECONT	= 0x1a,
-	GIF_A_D_REG_PRMODE		= 0x1b,
-	GIF_A_D_REG_TEXCLUT		= 0x1c,
-	GIF_A_D_REG_SCANMSK		= 0x22,
-	GIF_A_D_REG_MIPTBP1_1	= 0x34,
-	GIF_A_D_REG_MIPTBP1_2	= 0x35,
-	GIF_A_D_REG_MIPTBP2_1	= 0x36,
-	GIF_A_D_REG_MIPTBP2_2	= 0x37,
-	GIF_A_D_REG_TEXA		= 0x3b,
-	GIF_A_D_REG_FOGCOL		= 0x3d,
-	GIF_A_D_REG_TEXFLUSH	= 0x3f,
-	GIF_A_D_REG_SCISSOR_1	= 0x40,
-	GIF_A_D_REG_SCISSOR_2	= 0x41,
-	GIF_A_D_REG_ALPHA_1		= 0x42,
-	GIF_A_D_REG_ALPHA_2		= 0x43,
-	GIF_A_D_REG_DIMX		= 0x44,
-	GIF_A_D_REG_DTHE		= 0x45,
-	GIF_A_D_REG_COLCLAMP	= 0x46,
-	GIF_A_D_REG_TEST_1		= 0x47,
-	GIF_A_D_REG_TEST_2		= 0x48,
-	GIF_A_D_REG_PABE		= 0x49,
-	GIF_A_D_REG_FBA_1		= 0x4a,
-	GIF_A_D_REG_FBA_2		= 0x4b,
-	GIF_A_D_REG_FRAME_1		= 0x4c,
-	GIF_A_D_REG_FRAME_2		= 0x4d,
-	GIF_A_D_REG_ZBUF_1		= 0x4e,
-	GIF_A_D_REG_ZBUF_2		= 0x4f,
-	GIF_A_D_REG_BITBLTBUF	= 0x50,
-	GIF_A_D_REG_TRXPOS		= 0x51,
-	GIF_A_D_REG_TRXREG		= 0x52,
-	GIF_A_D_REG_TRXDIR		= 0x53,
-	GIF_A_D_REG_HWREG		= 0x54,
-	GIF_A_D_REG_SIGNAL		= 0x60,
-	GIF_A_D_REG_FINISH		= 0x61,
-	GIF_A_D_REG_LABEL		= 0x62,
-};
-// In case we want to change to/from __fastcall for GIF register handlers:
-#define __gifCall __fastcall
-
-typedef void __gifCall FnType_GIFRegHandler(const u32* data);
-typedef FnType_GIFRegHandler* GIFRegHandler;
-
-extern FnType_GIFRegHandler GIFPackedRegHandlerNull;
-extern FnType_GIFRegHandler GIFPackedRegHandlerRGBA;
-extern FnType_GIFRegHandler GIFPackedRegHandlerSTQ;
-extern FnType_GIFRegHandler GIFPackedRegHandlerUV;
-extern FnType_GIFRegHandler GIFPackedRegHandlerXYZF2;
-extern FnType_GIFRegHandler GIFPackedRegHandlerXYZ2;
-extern FnType_GIFRegHandler GIFPackedRegHandlerFOG;
-extern FnType_GIFRegHandler GIFPackedRegHandlerA_D;
-extern FnType_GIFRegHandler GIFPackedRegHandlerNOP;
-
-// These are unimplemented, and fall back on the non-packed versions.
-extern FnType_GIFRegHandler GIFPackedRegHandlerPRIM;
-
-template<u32 i>
-extern FnType_GIFRegHandler GIFPackedRegHandlerTEX0;
-
-template<u32 i>
-extern FnType_GIFRegHandler GIFPackedRegHandlerCLAMP;
-
-extern FnType_GIFRegHandler GIFPackedRegHandlerXYZF3;
-extern FnType_GIFRegHandler GIFPackedRegHandlerXYZ3;
-
-extern FnType_GIFRegHandler GIFRegHandlerNull;
-extern FnType_GIFRegHandler GIFRegHandlerPRIM;
-extern FnType_GIFRegHandler GIFRegHandlerRGBAQ;
-extern FnType_GIFRegHandler GIFRegHandlerST;
-extern FnType_GIFRegHandler GIFRegHandlerUV;
-extern FnType_GIFRegHandler GIFRegHandlerXYZF2;
-extern FnType_GIFRegHandler GIFRegHandlerXYZ2;
-
-template<u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerTEX0;
-
-template<u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerCLAMP;
-
-extern FnType_GIFRegHandler GIFRegHandlerFOG;
-extern FnType_GIFRegHandler GIFRegHandlerXYZF3;
-extern FnType_GIFRegHandler GIFRegHandlerXYZ3;
-extern FnType_GIFRegHandler GIFRegHandlerNOP;
-
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerTEX1;
-
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerTEX2;
-
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerXYOFFSET;
-
-extern FnType_GIFRegHandler GIFRegHandlerPRMODECONT;
-extern FnType_GIFRegHandler GIFRegHandlerPRMODE;
-extern FnType_GIFRegHandler GIFRegHandlerTEXCLUT;
-extern FnType_GIFRegHandler GIFRegHandlerSCANMSK;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerMIPTBP1;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerMIPTBP2;
-extern FnType_GIFRegHandler GIFRegHandlerTEXA;
-extern FnType_GIFRegHandler GIFRegHandlerFOGCOL;
-extern FnType_GIFRegHandler GIFRegHandlerTEXFLUSH;
-
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerSCISSOR;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerALPHA;
-
-extern FnType_GIFRegHandler GIFRegHandlerDIMX;
-extern FnType_GIFRegHandler GIFRegHandlerDTHE;
-extern FnType_GIFRegHandler GIFRegHandlerCOLCLAMP;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerTEST;
-extern FnType_GIFRegHandler GIFRegHandlerPABE;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerFBA;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerFRAME;
-template <u32 i>
-extern FnType_GIFRegHandler GIFRegHandlerZBUF;
-extern FnType_GIFRegHandler GIFRegHandlerBITBLTBUF;
-extern FnType_GIFRegHandler GIFRegHandlerTRXPOS;
-extern FnType_GIFRegHandler GIFRegHandlerTRXREG;
-extern FnType_GIFRegHandler GIFRegHandlerTRXDIR;
-extern FnType_GIFRegHandler GIFRegHandlerHWREG;
-extern FnType_GIFRegHandler GIFRegHandlerSIGNAL;
-extern FnType_GIFRegHandler GIFRegHandlerFINISH;
-extern FnType_GIFRegHandler GIFRegHandlerLABEL;
-
-// GifReg & GifPackedReg structs from GSdx, slightly modified
-
-enum GS_ATST
-{
-	ATST_NEVER		= 0,
-	ATST_ALWAYS		= 1,
-	ATST_LESS		= 2,
-	ATST_LEQUAL		= 3,
-	ATST_EQUAL		= 4,
-	ATST_GEQUAL		= 5,
-	ATST_GREATER	= 6,
-	ATST_NOTEQUAL	= 7,
-};
-
-enum GS_AFAIL
-{
-	AFAIL_KEEP		= 0,
-	AFAIL_FB_ONLY	= 1,
-	AFAIL_ZB_ONLY	= 2,
-	AFAIL_RGB_ONLY	= 3,
-};
-
-// GIFReg
-
-REG64_(GIFReg, ALPHA)
-	u32 A:2;
-	u32 B:2;
-	u32 C:2;
-	u32 D:2;
-	u32 _PAD1:24;
-	u32 FIX:8;
-	u32 _PAD2:24;
-REG_END2
-	// opaque => output will be Cs/As
-	__forceinline bool IsOpaque() const {return (A == B || C == 2 && FIX == 0) && D == 0 || (A == 0 && B == D && C == 2 && FIX == 0x80);}
-	__forceinline bool IsOpaque(int amin, int amax) const {return (A == B || amax == 0) && D == 0 || A == 0 && B == D && amin == 0x80 && amax == 0x80;}
-REG_END2
-
-REG64_(GIFReg, BITBLTBUF)
-	u32 SBP:14;
-	u32 _PAD1:2;
-	u32 SBW:6;
-	u32 _PAD2:2;
-	u32 SPSM:6;
-	u32 _PAD3:2;
-	u32 DBP:14;
-	u32 _PAD4:2;
-	u32 DBW:6;
-	u32 _PAD5:2;
-	u32 DPSM:6;
-	u32 _PAD6:2;
-REG_END
-
-REG64_(GIFReg, CLAMP)
-union
-{
-	struct
-	{
-		u32 WMS:2;
-		u32 WMT:2;
-		u32 MINU:10;
-		u32 MAXU:10;
-		u32 _PAD1:8;
-		u32 _PAD2:2;
-		u32 MAXV:10;
-		u32 _PAD3:20;
-	};
-
-	struct
-	{
-		u64 _PAD4:24;
-		u64 MINV:10;
-		u64 _PAD5:30;
-	};
-};
-REG_END
-
-REG64_(GIFReg, COLCLAMP)
-	u32 CLAMP:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, DIMX)
-	s32 DM00:3;
-	s32 _PAD00:1;
-	s32 DM01:3;
-	s32 _PAD01:1;
-	s32 DM02:3;
-	s32 _PAD02:1;
-	s32 DM03:3;
-	s32 _PAD03:1;
-	s32 DM10:3;
-	s32 _PAD10:1;
-	s32 DM11:3;
-	s32 _PAD11:1;
-	s32 DM12:3;
-	s32 _PAD12:1;
-	s32 DM13:3;
-	s32 _PAD13:1;
-	s32 DM20:3;
-	s32 _PAD20:1;
-	s32 DM21:3;
-	s32 _PAD21:1;
-	s32 DM22:3;
-	s32 _PAD22:1;
-	s32 DM23:3;
-	s32 _PAD23:1;
-	s32 DM30:3;
-	s32 _PAD30:1;
-	s32 DM31:3;
-	s32 _PAD31:1;
-	s32 DM32:3;
-	s32 _PAD32:1;
-	s32 DM33:3;
-	s32 _PAD33:1;
-REG_END
-
-REG64_(GIFReg, DTHE)
-	u32 DTHE:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, FBA)
-	u32 FBA:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, FINISH)
-	u32 _PAD1:32;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, FOG)
-	u32 _PAD1:32;
-	u32 _PAD2:24;
-	u32 F:8;
-REG_END
-
-REG64_(GIFReg, FOGCOL)
-	u32 FCR:8;
-	u32 FCG:8;
-	u32 FCB:8;
-	u32 _PAD1:8;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, FRAME)
-	u32 FBP:9;
-	u32 _PAD1:7;
-	u32 FBW:6;
-	u32 _PAD2:2;
-	u32 PSM:6;
-	u32 _PAD3:2;
-	u32 FBMSK:32;
-REG_END2
-	u32 Block() const {return FBP << 5;}
-REG_END2
-
-REG64_(GIFReg, HWREG)
-	u32 DATA_LOWER:32;
-	u32 DATA_UPPER:32;
-REG_END
-
-REG64_(GIFReg, LABEL)
-	u32 ID:32;
-	u32 IDMSK:32;
-REG_END
-
-REG64_(GIFReg, MIPTBP1)
-	u64 TBP1:14;
-	u64 TBW1:6;
-	u64 TBP2:14;
-	u64 TBW2:6;
-	u64 TBP3:14;
-	u64 TBW3:6;
-	u64 _PAD:4;
-REG_END
-
-REG64_(GIFReg, MIPTBP2)
-	u64 TBP4:14;
-	u64 TBW4:6;
-	u64 TBP5:14;
-	u64 TBW5:6;
-	u64 TBP6:14;
-	u64 TBW6:6;
-	u64 _PAD:4;
-REG_END
-
-REG64_(GIFReg, NOP)
-	u32 _PAD1:32;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, PABE)
-	u32 PABE:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, PRIM)
-	u32 PRIM:3;
-	u32 IIP:1;
-	u32 TME:1;
-	u32 FGE:1;
-	u32 ABE:1;
-	u32 AA1:1;
-	u32 FST:1;
-	u32 CTXT:1;
-	u32 FIX:1;
-	u32 _PAD1:21;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, PRMODE)
-	u32 _PRIM:3;
-	u32 IIP:1;
-	u32 TME:1;
-	u32 FGE:1;
-	u32 ABE:1;
-	u32 AA1:1;
-	u32 FST:1;
-	u32 CTXT:1;
-	u32 FIX:1;
-	u32 _PAD2:21;
-	u32 _PAD3:32;
-REG_END
-
-REG64_(GIFReg, PRMODECONT)
-	u32 AC:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, RGBAQ)
-	u32 R:8;
-	u32 G:8;
-	u32 B:8;
-	u32 A:8;
-	float Q;
-REG_END
-
-REG64_(GIFReg, SCANMSK)
-	u32 MSK:2;
-	u32 _PAD1:30;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, SCISSOR)
-	u32 SCAX0:11;
-	u32 _PAD1:5;
-	u32 SCAX1:11;
-	u32 _PAD2:5;
-	u32 SCAY0:11;
-	u32 _PAD3:5;
-	u32 SCAY1:11;
-	u32 _PAD4:5;
-REG_END
-
-REG64_(GIFReg, SIGNAL)
-	u32 ID:32;
-	u32 IDMSK:32;
-REG_END
-
-REG64_(GIFReg, ST)
-	float S;
-	float T;
-REG_END
-
-REG64_(GIFReg, TEST)
-	u32 ATE:1;
-	u32 ATST:3;
-	u32 AREF:8;
-	u32 AFAIL:2;
-	u32 DATE:1;
-	u32 DATM:1;
-	u32 ZTE:1;
-	u32 ZTST:2;
-	u32 _PAD1:13;
-	u32 _PAD2:32;
-REG_END2
-	__forceinline bool DoFirstPass() {return !ATE || ATST != ATST_NEVER;} // not all pixels fail automatically
-	__forceinline bool DoSecondPass() {return ATE && ATST != ATST_ALWAYS && AFAIL != AFAIL_KEEP;} // pixels may fail, write fb/z
-	__forceinline bool NoSecondPass() {return ATE && ATST != ATST_ALWAYS && AFAIL == AFAIL_KEEP;} // pixels may fail, no output
-REG_END2
-
-REG64_(GIFReg, TEX0)
-union
-{
-	struct
-	{
-		u32 TBP0:14;
-		u32 TBW:6;
-		u32 PSM:6;
-		u32 TW:4;
-		u32 _PAD1:2;
-		u32 _PAD2:2;
-		u32 TCC:1;
-		u32 TFX:2;
-		u32 CBP:14;
-		u32 CPSM:4;
-		u32 CSM:1;
-		u32 CSA:5;
-		u32 CLD:3;
-	};
-
-	struct
-	{
-		u64 _PAD3:30;
-		u64 TH:4;
-		u64 _PAD4:30;
-	};
-};
-REG_END2 
-	__forceinline bool IsRepeating() {return (u32)((u32)1 << TW) > (u32)(TBW << (u32)6);}
-REG_END2
-
-REG64_(GIFReg, TEX1)
-	u32 LCM:1;
-	u32 _PAD1:1;
-	u32 MXL:3;
-	u32 MMAG:1;
-	u32 MMIN:3;
-	u32 MTBA:1;
-	u32 _PAD2:9;
-	u32 L:2;
-	u32 _PAD3:11;
-	s32  K:12; // 1:7:4
-	u32 _PAD4:20;
-REG_END2
-	bool IsMinLinear() const {return (MMIN == 1) || (MMIN & 4);}
-	bool IsMagLinear() const {return MMAG;}
-REG_END2
-
-REG64_(GIFReg, TEX2)
-	u32 _PAD1:20;
-	u32 PSM:6;
-	u32 _PAD2:6;
-	u32 _PAD3:5;
-	u32 CBP:14;
-	u32 CPSM:4;
-	u32 CSM:1;
-	u32 CSA:5;
-	u32 CLD:3;
-REG_END
-
-REG64_(GIFReg, TEXA)
-	u32 TA0:8;
-	u32 _PAD1:7;
-	u32 AEM:1;
-	u32 _PAD2:16;
-	u32 TA1:8;
-	u32 _PAD3:24;
-REG_END
-
-REG64_(GIFReg, TEXCLUT)
-	u32 CBW:6;
-	u32 COU:6;
-	u32 COV:10;
-	u32 _PAD1:10;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, TEXFLUSH)
-	u32 _PAD1:32;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, TRXDIR)
-	u32 XDIR:2;
-	u32 _PAD1:30;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GIFReg, TRXPOS)
-	u32 SSAX:11;
-	u32 _PAD1:5;
-	u32 SSAY:11;
-	u32 _PAD2:5;
-	u32 DSAX:11;
-	u32 _PAD3:5;
-	u32 DSAY:11;
-	u32 DIRY:1;
-	u32 DIRX:1;
-	u32 _PAD4:3;
-REG_END
-
-REG64_(GIFReg, TRXREG)
-	u32 RRW:12;
-	u32 _PAD1:20;
-	u32 RRH:12;
-	u32 _PAD2:20;
-REG_END
-
-REG64_(GIFReg, UV)
-	u32 U:14;
-	u32 _PAD1:2;
-	u32 V:14;
-	u32 _PAD2:2;
-	u32 _PAD3:32;
-REG_END
-
-REG64_(GIFReg, XYOFFSET)
-	u32 OFX:16; 
-	u32 _PAD1:16;
-	u32 OFY:16; 
-	u32 _PAD2:16;
-REG_END
-
-REG64_(GIFReg, XYZ)
-	u32 X:16;
-	u32 Y:16;
-	u32 Z:32;
-REG_END
-
-REG64_(GIFReg, XYZF)
-	u32 X:16;
-	u32 Y:16;
-	u32 Z:24;
-	u32 F:8;
-REG_END
-
-REG64_(GIFReg, ZBUF)
-	u32 ZBP:9;
-	u32 _PAD1:15;
-	// u32 PSM:4;
-	// u32 _PAD2:4;
-	u32 PSM:6;
-	u32 _PAD2:2;
-	u32 ZMSK:1;
-	u32 _PAD3:31;
-REG_END2
-	u32 Block() const {return ZBP << 5;}
-REG_END2
-
-REG64_SET(GIFReg)
-	GIFRegALPHA			ALPHA;
-	GIFRegBITBLTBUF		BITBLTBUF;
-	GIFRegCLAMP			CLAMP;
-	GIFRegCOLCLAMP		COLCLAMP;
-	GIFRegDIMX			DIMX;
-	GIFRegDTHE			DTHE;
-	GIFRegFBA			FBA;
-	GIFRegFINISH		FINISH;
-	GIFRegFOG			FOG;
-	GIFRegFOGCOL		FOGCOL;
-	GIFRegFRAME			FRAME;
-	GIFRegHWREG			HWREG;
-	GIFRegLABEL			LABEL;
-	GIFRegMIPTBP1		MIPTBP1;
-	GIFRegMIPTBP2		MIPTBP2;
-	GIFRegNOP			NOP;
-	GIFRegPABE			PABE;
-	GIFRegPRIM			PRIM;
-	GIFRegPRMODE		PRMODE;
-	GIFRegPRMODECONT	PRMODECONT;
-	GIFRegRGBAQ			RGBAQ;
-	GIFRegSCANMSK		SCANMSK;
-	GIFRegSCISSOR		SCISSOR;
-	GIFRegSIGNAL		SIGNAL;
-	GIFRegST			ST;
-	GIFRegTEST			TEST;
-	GIFRegTEX0			TEX0;
-	GIFRegTEX1			TEX1;
-	GIFRegTEX2			TEX2;
-	GIFRegTEXA			TEXA;
-	GIFRegTEXCLUT		TEXCLUT;
-	GIFRegTEXFLUSH		TEXFLUSH;
-	GIFRegTRXDIR		TRXDIR;
-	GIFRegTRXPOS		TRXPOS;
-	GIFRegTRXREG		TRXREG;
-	GIFRegUV			UV;
-	GIFRegXYOFFSET		XYOFFSET;
-	GIFRegXYZ			XYZ;
-	GIFRegXYZF			XYZF;
-	GIFRegZBUF			ZBUF;
-REG_SET_END
-
-// GIFPacked
-
-REG128_(GIFPacked, PRIM)
-	u32 PRIM:11;
-	u32 _PAD1:21;
-	u32 _PAD2:32;
-	u32 _PAD3:32;
-	u32 _PAD4:32;
-REG_END
-
-REG128_(GIFPacked, RGBA)
-	u32 R:8;
-	u32 _PAD1:24;
-	u32 G:8;
-	u32 _PAD2:24;
-	u32 B:8;
-	u32 _PAD3:24;
-	u32 A:8;
-	u32 _PAD4:24;
-REG_END
-
-REG128_(GIFPacked, STQ)
-	float S;
-	float T;
-	float Q;
-	u32 _PAD1:32;
-REG_END
-
-REG128_(GIFPacked, UV)
-	u32 U:14;
-	u32 _PAD1:18;
-	u32 V:14;
-	u32 _PAD2:18;
-	u32 _PAD3:32;
-	u32 _PAD4:32;
-REG_END
-
-REG128_(GIFPacked, XYZF2)
-	u32 X:16;
-	u32 _PAD1:16;
-	u32 Y:16;
-	u32 _PAD2:16;
-	u32 _PAD3:4;
-	u32 Z:24;
-	u32 _PAD4:4;
-	u32 _PAD5:4;
-	u32 F:8;
-	u32 _PAD6:3;
-	u32 ADC:1;
-	u32 _PAD7:16;
-REG_END
-
-REG128_(GIFPacked, XYZ2)
-	u32 X:16;
-	u32 _PAD1:16;
-	u32 Y:16;
-	u32 _PAD2:16;
-	u32 Z:32;
-	u32 _PAD3:15;
-	u32 ADC:1;
-	u32 _PAD4:16;
-REG_END
-
-REG128_(GIFPacked, FOG)
-	u32 _PAD1:32;
-	u32 _PAD2:32;
-	u32 _PAD3:32;
-	u32 _PAD4:4;
-	u32 F:8;
-	u32 _PAD5:20;
-REG_END
-
-REG128_(GIFPacked, A_D)
-	u64 DATA:64;
-	u32 ADDR:8; // enum GIF_A_D_REG
-	u32 _PAD1:24;
-	u32 _PAD2:32;
-REG_END
-
-REG128_(GIFPacked, NOP)
-	u32 _PAD1:32;
-	u32 _PAD2:32;
-	u32 _PAD3:32;
-	u32 _PAD4:32;
-REG_END
-
-REG128_SET(GIFPackedReg)
-	GIFReg			r;
-	GIFPackedPRIM	PRIM;
-	GIFPackedRGBA	RGBA;
-	GIFPackedSTQ	STQ;
-	GIFPackedUV		UV;
-	GIFPackedXYZF2	XYZF2;
-	GIFPackedXYZ2	XYZ2;
-	GIFPackedFOG	FOG;
-	GIFPackedA_D	A_D;
-	GIFPackedNOP	NOP;
-REG_SET_END
-
-REG64_(GSReg, BGCOLOR)
-	u32 R:8;
-	u32 G:8;
-	u32 B:8;
-	u32 _PAD1:8;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GSReg, BUSDIR)
-	u32 DIR:1;
-	u32 _PAD1:31;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GSReg, CSR)
-	u32 SIGNAL:1;
-	u32 FINISH:1;
-	u32 HSINT:1;
-	u32 VSINT:1;
-	u32 EDWINT:1;
-	u32 ZERO1:1;
-	u32 ZERO2:1;
-	u32 _PAD1:1;
-	u32 FLUSH:1;
-	u32 RESET:1;
-	u32 _PAD2:2;
-	u32 NFIELD:1;
-	u32 FIELD:1;
-	u32 FIFO:2;
-	u32 REV:8;
-	u32 ID:8;
-	u32 _PAD3:32;
-REG_END
-
-REG64_(GSReg, DISPFB) // (-1/2)
-	u32 FBP:9;
-	u32 FBW:6;
-	u32 PSM:5;
-	u32 _PAD:12;
-	u32 DBX:11;
-	u32 DBY:11;
-	u32 _PAD2:10;
-REG_END
-
-REG64_(GSReg, DISPLAY) // (-1/2)
-	u32 DX:12;
-	u32 DY:11;
-	u32 MAGH:4;
-	u32 MAGV:2;
-	u32 _PAD:3;
-	u32 DW:12;
-	u32 DH:11;
-	u32 _PAD2:9;
-REG_END
-
-REG64_(GSReg, EXTBUF)
-	u32 EXBP:14;
-	u32 EXBW:6;
-	u32 FBIN:2;
-	u32 WFFMD:1;
-	u32 EMODA:2;
-	u32 EMODC:2;
-	u32 _PAD1:5;
-	u32 WDX:11;
-	u32 WDY:11;
-	u32 _PAD2:10;
-REG_END
-
-REG64_(GSReg, EXTDATA)
-	u32 SX:12;
-	u32 SY:11;
-	u32 SMPH:4;
-	u32 SMPV:2;
-	u32 _PAD1:3;
-	u32 WW:12;
-	u32 WH:11;
-	u32 _PAD2:9;
-REG_END
-
-REG64_(GSReg, EXTWRITE)
-	u32 WRITE;
-	u32 _PAD2:32;
-REG_END
-
-REG64_(GSReg, IMR)
-	u32 _PAD1:8;
-	u32 SIGMSK:1;
-	u32 FINISHMSK:1;
-	u32 HSMSK:1;
-	u32 VSMSK:1;
-	u32 EDWMSK:1;
-	u32 _PAD2:19;
-	u32 _PAD3:32;
-REG_END
-
-REG64_(GSReg, PMODE)
-	u32 EN1:1;
-	u32 EN2:1;
-	u32 CRTMD:3;
-	u32 MMOD:1;
-	u32 AMOD:1;
-	u32 SLBG:1;
-	u32 ALP:8;
-	u32 _PAD:16;
-	u32 _PAD1:32;
-REG_END
-
-REG64_(GSReg, SIGLBLID)
-	u32 SIGID:32;
-	u32 LBLID:32;
-REG_END
-
-REG64_(GSReg, SMODE1)
-	u32 RC:3;
-	u32 LC:7;
-	u32 T1248:2;
-	u32 SLCK:1;
-	u32 CMOD:2;
-	u32 EX:1;
-	u32 PRST:1;
-	u32 SINT:1;
-	u32 XPCK:1;
-	u32 PCK2:2;
-	u32 SPML:4;
-	u32 GCONT:1;
-	u32 PHS:1;
-	u32 PVS:1;
-	u32 PEHS:1;
-	u32 PEVS:1;
-	u32 CLKSEL:2;
-	u32 NVCK:1;
-	u32 SLCK2:1;
-	u32 VCKSEL:2;
-	u32 VHP:1;
-	u32 _PAD1:27;
-REG_END
-
-REG64_(GSReg, SMODE2)
-	u32 INT:1;
-	u32 FFMD:1;
-	u32 DPMS:2;
-	u32 _PAD2:28;
-	u32 _PAD3:32;
-REG_END
-
-REG64_(GSReg, SIGBLID)
-	u32 SIGID;
-	u32 LBLID;
-REG_END
-
-#define PMODE ((GSRegPMODE*)(g_pBasePS2Mem+0x0000))
-#define SMODE1 ((GSRegSMODE1*)(g_pBasePS2Mem+0x0010))
-#define SMODE2 ((GSRegSMODE2*)(g_pBasePS2Mem+0x0020))
-// SRFSH
-#define SYNCH1 ((GSRegSYNCH1*)(g_pBasePS2Mem+0x0040))
-#define SYNCH2 ((GSRegSYNCH2*)(g_pBasePS2Mem+0x0050))
-#define SYNCV ((GSRegSYNCV*)(g_pBasePS2Mem+0x0060))
-#define DISPFB1 ((GSRegDISPFB*)(g_pBasePS2Mem+0x0070))
-#define DISPLAY1 ((GSRegDISPLAY*)(g_pBasePS2Mem+0x0080))
-#define DISPFB2 ((GSRegDISPFB*)(g_pBasePS2Mem+0x0090))
-#define DISPLAY2 ((GSRegDISPLAY*)(g_pBasePS2Mem+0x00a0))
-#define EXTBUF ((GSRegEXTBUF*)(g_pBasePS2Mem+0x00b0))
-#define EXTDATA ((GSRegEXTDATA*)(g_pBasePS2Mem+0x00c0))
-#define EXTWRITE ((GSRegEXTWRITE*)(g_pBasePS2Mem+0x00d0))
-#define BGCOLOR ((GSRegBGCOLOR*)(g_pBasePS2Mem+0x00e0))
-#define CSR ((GSRegCSR*)(g_pBasePS2Mem+0x1000))
-#define IMR ((GSRegIMR*)(g_pBasePS2Mem+0x1010))
-#define BUSDIR ((GSRegBUSDIR*)(g_pBasePS2Mem+0x1040))
-#define SIGLBLID ((GSRegSIGBLID*)(g_pBasePS2Mem+0x1080))
-
-//
-// sps2tags.h
-//
-#define GET_GIF_REG(tag, reg) \
-	(((tag).ai32[2 + ((reg) >> 3)] >> (((reg) & 7) << 2)) & 0xf)
-
-#define GET_GSFPS (((SMODE1->CMOD&1) ? 50 : 60) / (SMODE2->INT ? 1 : 2))
-
-extern void WriteTempRegs();
-extern void SetFrameSkip(bool skip);
-extern void ResetRegs();
-
-extern void SetTexFlush();
-extern void SetFogColor(u32 fog);
-extern void SetFogColor(GIFRegFOGCOL* fog);
-extern bool CheckChangeInClut(u32 highdword, u32 psm); // returns true if clut will change after this tex0 op
-
-// flush current vertices, call before setting new registers (the main render method)
-void Flush(int context);
-void FlushBoth();
-
-// called on a primitive switch
-void Prim();
-
-#endif
-
-#endif // NEWREGS_H_INCLUDED
--- a/plugins/zzogl-pg/opengl/Regs.cpp
+++ b/plugins/zzogl-pg/opengl/Regs.cpp
@ -26,7 +26,6 @@
 #include "ZZoglVB.h"
 #include "ZZoglDrawing.h"

-#ifdef USE_OLD_REGS

 #ifdef _MSC_VER
 #pragma warning(disable:4244)
@ -157,6 +156,9 @@ void __gifCall GIFPackedRegHandlerFOG(const u32* data)
 void __gifCall GIFPackedRegHandlerA_D(const u32* data)
 {
 	FUNCLOG
+//	GIFPackedA_D* r = (GIFPackedA_D*)(data);
+//	
+//	g_GIFRegHandlers[r->ADDR](data);

 	if ((data[2] & 0xff) < 100)
 		g_GIFRegHandlers[data[2] & 0xff](data);
@ -188,21 +190,20 @@ void __gifCall GIFRegHandlerNull(const u32* data)
 void __gifCall GIFRegHandlerPRIM(const u32 *data)
 {
 	FUNCLOG
+	GIFRegPRIM* r = (GIFRegPRIM*)(data);

 	//if (data[0] & ~0x3ff)
 	//{
 		//ZZLog::Warn_Log("Warning: unknown bits in prim %8.8lx_%8.8lx", data[1], data[0]);
 	//}
 	
-
 	gs.primC = 0;
-    u16 prim_type = (data[0]) & 0x7;
-	prim->prim = prim_type;
-	gs._prim[0].prim = prim_type;
-	gs._prim[1].prim = prim_type;
-	gs._prim[1]._val = (data[0] >> 3) & 0xff;
+	prim->prim = r->PRIM;
+	gs._prim[0].prim = r->PRIM;
+	gs._prim[1].prim = r->PRIM;
+	gs._prim[1]._val = (data[0] >> 3) & 0xff; // Setting the next 8 flags after prim at once.

-    gs.new_tri_fan = !(prim_type ^ PRIM_TRIANGLE_FAN);
+    gs.new_tri_fan = !(r->PRIM ^ PRIM_TRIANGLE_FAN);
    ZZKick->DirtyValidPrevPrim();

 	Prim();
@ -211,6 +212,10 @@ void __gifCall GIFRegHandlerPRIM(const u32 *data)
 void __gifCall GIFRegHandlerRGBAQ(const u32* data)
 {
 	FUNCLOG
+//	GIFRegRGBAQ* r = (GIFRegRGBAQ*)(data);
+//	gs.rgba = (r->R | (r->G <<  8) | (r->B << 16) | (r->A << 24));
+//	gs.vertexregs.rgba = gs.rgba;
+//	gs.vertexregs.q = r->Q;
 	gs.rgba = data[0];
 	gs.vertexregs.rgba = data[0];
 	*(u32*)&gs.vertexregs.q = data[1];
@ -219,6 +224,9 @@ void __gifCall GIFRegHandlerRGBAQ(const u32* data)
 void __gifCall GIFRegHandlerST(const u32* data)
 {
 	FUNCLOG
+//	GIFRegST* r = (GIFRegST*)(data);
+//	gs.vertexregs.s = r->S;
+//	gs.vertexregs.t = r->T;
 	*(u32*)&gs.vertexregs.s = data[0] & 0xffffff00;
 	*(u32*)&gs.vertexregs.t = data[1] & 0xffffff00;
 	//*(u32*)&gs.q = data[2];
@ -445,7 +453,10 @@ void __gifCall GIFRegHandlerXYOFFSET(const u32* data)
 void __gifCall GIFRegHandlerPRMODECONT(const u32* data)
 {
 	FUNCLOG
-	gs.prac = data[0] & 0x1;
+	// Turns all the text into colored blocks on the initial Mana Khemia dialog if not run.
+	GIFRegPRMODECONT* r = (GIFRegPRMODECONT*)(data);
+//	gs.prac = data[0] & 0x1;
+	gs.prac = r->AC;
 	prim = &gs._prim[gs.prac];

 	Prim();
@ -468,6 +479,7 @@ void __gifCall GIFRegHandlerTEXCLUT(const u32* data)
 	vb[0].FlushTexData();
 	vb[1].FlushTexData();

+	//Fixme
 	gs.clut.cbw = r->CBW << 6;
 	gs.clut.cou = r->COU << 4;
 	gs.clut.cov = r->COV;
@ -477,9 +489,6 @@ void __gifCall GIFRegHandlerSCANMSK(const u32* data)
 {
 	FUNCLOG
 	GIFRegSCANMSK* r = (GIFRegSCANMSK*)(data);
-//  FlushBoth();
-//  ResolveC(&vb[0]);
-//  ResolveZ(&vb[0]);

 	gs.smask = r->MSK;
 	REG_LOG("Scanmsk == %d", gs.smask);
@ -534,23 +543,20 @@ void __gifCall GIFRegHandlerMIPTBP2(const u32* data)
 void __gifCall GIFRegHandlerTEXA(const u32* data)
 {
 	FUNCLOG
-	texaInfo newinfo;
-	newinfo.aem = (data[0] >> 15) & 0x1;
-	newinfo.ta[0] = data[0] & 0xff;
-	newinfo.ta[1] = data[1] & 0xff;
+	// Background of initial Mana Khemia dialog.
+	GIFRegTEXA* r = (GIFRegTEXA*)(data);
 	
-	if (*(u32*)&newinfo != *(u32*)&gs.texa)
+	if ((r->AEM != gs.texa.aem) || (r->TA0 != gs.texa.ta[0]) || (r->TA1 != gs.texa.ta[1]))
 	{
 		FlushBoth();

-		*(u32*)&gs.texa = *(u32*) & newinfo;
-		
-		gs.texa.fta[0] = newinfo.ta[0] / 255.0f;
-		gs.texa.fta[1] = newinfo.ta[1] / 255.0f;
-
 		vb[0].bTexConstsSync = false;
 		vb[1].bTexConstsSync = false;
 	}
+		
+	gs.texa.aem = r->AEM;
+	gs.texa.ta[0] = r->TA0;
+	gs.texa.ta[1] = r->TA1;
 }

 void __gifCall GIFRegHandlerFOGCOL(const u32* data)
@ -564,6 +570,7 @@ void __gifCall GIFRegHandlerFOGCOL(const u32* data)
 void __gifCall GIFRegHandlerTEXFLUSH(const u32* data)
 {
 	FUNCLOG
+	// Not actually handled by GSDX.
 	SetTexFlush();
 }

@ -597,6 +604,12 @@ void __gifCall GIFRegHandlerALPHA(const u32* data)
 {
 	FUNCLOG
 	alphaInfo newalpha;
+//	newalpha.a = r->A;
+//	newalpha.b = r->B;
+//	newalpha.c = r->C;
+//	newalpha.d = r->D;
+//	newalpha.fix = r->FIX;
+
 	newalpha.abcd = *(u8*)data;
 	newalpha.fix = *(u8*)(data + 1);

@ -610,9 +623,11 @@ void __gifCall GIFRegHandlerALPHA(const u32* data)
 		if (newalpha.d == 3) newalpha.d = 0;

 		*(u16*)&vb[ctxt].alpha = *(u16*) & newalpha;
+//		vb[i].alpha = newalpha;
 	}
 }

+// DIMX & DTHE are both for dithering, and not currently implemented.
 void __gifCall GIFRegHandlerDIMX(const u32* data)
 {
 	FUNCLOG
@ -647,14 +662,18 @@ template <u32 ctxt>
 void __gifCall GIFRegHandlerTEST(const u32* data)
 {
 	FUNCLOG
+//	GIFRegTEST* r = (GIFRegTEST*)(data);
 	
 	pixTest* test = &vb[ctxt].test;

-	if ((*(u32*)test & 0x0007ffff) == (data[0] & 0x0007ffff)) return;
-
+	//if (test->_val != r->ai32[0])
+	if ((*(u32*)test & 0x0007ffff) != (data[0] & 0x0007ffff))
+	{
 		Flush(ctxt);

 		*(u32*)test = data[0];
+		//test->_val = r->ai32[0];
+	}

 //  test.ate   = (data[0]	  ) & 0x1;
 //  test.atst  = (data[0] >>  1) & 0x7;
@ -688,6 +707,48 @@ void __gifCall GIFRegHandlerFBA(const u32* data)
 	vb[ctxt].fba.fba = r->FBA;
 }

+/*
+template<u32 i>
+void __gifCall GIFRegHandlerFRAME(const u32* data)
+{
+	FUNCLOG
+	// Affects opening dialogs, movie, and menu on Mana Khemia.
+	
+	GIFRegFRAME* r = (GIFRegFRAME*)(data);
+	frameInfo& gsfb = vb[i].gsfb;
+	
+	int fbw = r->FBW * 64;
+	int fbp = r->FBP * 32;
+	int fbh = 0;
+	
+	if (gs.dthe != 0)
+	{
+		// Dither here.
+		//ZZLog::Error_Log("frameWrite: Dither!");
+	}
+	
+	if ((gsfb.fbp == fbp) &&
+			(gsfb.fbw == fbw) &&
+			(gsfb.psm == r->PSM) &&
+			(gsfb.fbm == ZZOglGet_fbm_FrameBitsFix(data[0], data[1])))
+	{
+		return;
+	}
+
+	FlushBoth();
+	if (r->FBW > 0) fbh = ZZOgl_fbh_Calc(r->FBP, r->FBW, r->PSM);
+
+	gsfb.fbp = fbp;
+	gsfb.fbw = fbw;
+	gsfb.psm = r->PSM;
+	gsfb.fbh = fbh;
+	gsfb.fbm = ZZOglGet_fbm_FrameBitsFix(data[0], data[1]);
+	
+
+	vb[i].bNeedFrameCheck = 1;
+	ZZLog::Greg_Log("FRAME_%d", i);
+}*/
+
 template <u32 ctxt>
 void __gifCall GIFRegHandlerFRAME(const u32* data)
 {
@ -715,6 +776,41 @@ void __gifCall GIFRegHandlerFRAME(const u32* data)
 	vb[ctxt].bNeedFrameCheck = 1;
 }

+/*
+template <u32 i>
+void __gifCall GIFRegHandlerZBUF(const u32* data)
+{
+	FUNCLOG
+	// I'll wait a bit on this one.
+	GIFRegZBUF* r = (GIFRegZBUF*)(data);
+	ZZLog::Greg_Log("ZBUF_1");
+	
+	zbufInfo& zbuf = vb[i].zbuf;
+	int psm = (0x30 | r->PSM);
+	int zbp = r->ZBP * 32;
+
+	if (zbuf.zbp == zbp &&
+			zbuf.psm == psm &&
+			zbuf.zmsk == r->ZMSK)
+	{
+		return;
+	}
+
+	// error detection
+	if (m_Blocks[psm].bpp == 0) return;
+
+	FlushBoth();
+
+	zbuf.zbp = zbp;
+	zbuf.psm = psm;
+	zbuf.zmsk = r->ZMSK;
+
+	vb[i].zprimmask = 0xffffffff;
+
+	if (zbuf.psm > 0x31) vb[i].zprimmask = 0xffff;
+
+	vb[i].bNeedZCheck = 1;
+}*/
 template <u32 ctxt>
 void __gifCall GIFRegHandlerZBUF(const u32* data)
 {
@ -758,6 +854,17 @@ void __gifCall GIFRegHandlerBITBLTBUF(const u32* data)
 	gs.dstbufnew.psm = r->DPSM;

 	if (gs.dstbufnew.bw == 0) gs.dstbufnew.bw = 64;
+	// GSdx does this:
+	
+	/*if((gs.srcbufnew.bw & 1) && (gs.srcbufnew.psm == PSM_PSMT8 || gs.srcbufnew.psm == PSM_PSMT4))
+	{
+		gs.srcbufnew.bw &= ~1;
+	}
+
+	if((gs.dstbufnew.bw & 1) && (gs.dstbufnew.psm == PSM_PSMT8 || gs.dstbufnew.psm == PSM_PSMT4))
+	{
+		gs.dstbufnew.bw &= ~1; // namcoXcapcom: 5, 11, referred to as 4, 10 in TEX0.TBW later
+	}*/
 }

 void __gifCall GIFRegHandlerTRXPOS(const u32* data)
@ -777,53 +884,44 @@ void __gifCall GIFRegHandlerTRXREG(const u32* data)
 {
 	FUNCLOG
 	GIFRegTRXREG* r = (GIFRegTRXREG*)(data);
-	gs.imageWtemp = r->RRW;
-	gs.imageHtemp = r->RRH;
+	gs.imageTemp.w = r->RRW;
+	gs.imageTemp.h = r->RRH;
 }

 void __gifCall GIFRegHandlerTRXDIR(const u32* data)
 {
 	FUNCLOG
-	// terminate any previous transfers
-
-	switch (gs.imageTransfer)
-	{
-
-		case 0: // host->loc
-			TerminateHostLocal();
-			break;
-
-		case 1: // loc->host
-			TerminateLocalHost();
-			break;
-	}
+	GIFRegTRXDIR* r = (GIFRegTRXDIR*)(data);

 	gs.srcbuf = gs.srcbufnew;
-
 	gs.dstbuf = gs.dstbufnew;
-	gs.trxpos = gs.trxposnew;
-	gs.imageTransfer = data[0] & 0x3;
-	gs.imageWnew = gs.imageWtemp;
-	gs.imageHnew = gs.imageHtemp;
 	
-	if (gs.imageWnew > 0 && gs.imageHnew > 0)
+	gs.imageNew.w = gs.imageTemp.w;
+	gs.imageNew.h = gs.imageTemp.h;
+	
+	gs.trxpos = gs.trxposnew;
+	gs.imageTransfer = r->XDIR;
+	gs.transferring = true;
+
+	if (gs.imageNew.w > 0 && gs.imageNew.h > 0)
 	{
 		switch (gs.imageTransfer)
 		{
-			case 0: // host->loc
+			case XFER_HOST_TO_LOCAL: // host->loc
 				InitTransferHostLocal();
 				break;

-			case 1: // loc->host
+			case XFER_LOCAL_TO_HOST: // loc->host
 				InitTransferLocalHost();
 				break;

-			case 2:
+			case XFER_LOCAL_TO_LOCAL:
 				TransferLocalLocal();
 				break;

-			case 3:
-				gs.imageTransfer = -1;
+			case XFER_DEACTIVATED:
+				ZZLog::WriteLn("Image Transfer = 3?");
+				gs.transferring = false;
 				break;

 			default:
@ -833,9 +931,9 @@ void __gifCall GIFRegHandlerTRXDIR(const u32* data)
 	else
 	{
 #if defined(ZEROGS_DEVBUILD)
-		ZZLog::Warn_Log("Dummy transfer.");
+		//ZZLog::Warn_Log("Dummy transfer.");
 #endif
-		gs.imageTransfer = -1;
+		gs.transferring = false;
 	}
 }

@ -843,7 +941,7 @@ void __gifCall GIFRegHandlerHWREG(const u32* data)
 {
 	FUNCLOG

-	if (gs.imageTransfer == 0)
+	if (gs.transferring && gs.imageTransfer == XFER_HOST_TO_LOCAL)
 	{
 		TransferHostLocal(data, 2);
 	}
@ -866,14 +964,9 @@ void __gifCall GIFRegHandlerSIGNAL(const u32* data)
 	{
 		SIGLBLID->SIGID = (SIGLBLID->SIGID & ~data[1]) | (data[0] & data[1]);

-//	  if (gs.CSRw & 0x1) CSR->SIGNAL = 1;
-//	  if (!IMR->SIGMSK && GSirq)
-//		  GSirq();
-
 		if (gs.CSRw & 0x1)
 		{
 			CSR->SIGNAL = 1;
-			//gs.CSRw &= ~1;
 		}

 		if (!IMR->SIGMSK && GSirq) GSirq();
@ -889,17 +982,6 @@ void __gifCall GIFRegHandlerFINISH(const u32* data)
 		if (gs.CSRw & 0x2) CSR->FINISH = 1;

 		if (!IMR->FINISHMSK && GSirq) GSirq();
-
-//	  if( gs.CSRw & 2 ) {
-//		  //gs.CSRw &= ~2;
-//		  //CSR->FINISH = 0;
-//
-//
-//	  }
-//	  CSR->FINISH = 1;
-//
-//	  if( !IMR->FINISHMSK && GSirq )
-//		  GSirq();
 	}
 }

@ -913,7 +995,6 @@ void __gifCall GIFRegHandlerLABEL(const u32* data)
 	}
 }

-
 void SetMultithreaded()
 {
 	// Some older versions of PCSX2 didn't properly set the irq callback to NULL
@ -1077,4 +1158,3 @@ void SetFrameSkip(bool skip)
 	}
 }

-#endif
--- a/plugins/zzogl-pg/opengl/Regs.h
+++ b/plugins/zzogl-pg/opengl/Regs.h
@ -20,9 +20,6 @@
 #ifndef __GSREGS_H__
 #define __GSREGS_H__

-
-#ifdef USE_OLD_REGS
-
 enum GIF_REG
 {
 	GIF_REG_PRIM	= 0x00,
@ -193,7 +190,7 @@ enum GS_ATST
 	ATST_EQUAL		= 4,
 	ATST_GEQUAL		= 5,
 	ATST_GREATER	= 6,
-	ATST_NOTEQUAL	= 7,
+	ATST_NOTEQUAL	= 7
 };

 enum GS_AFAIL
@ -201,9 +198,24 @@ enum GS_AFAIL
 	AFAIL_KEEP		= 0,
 	AFAIL_FB_ONLY	= 1,
 	AFAIL_ZB_ONLY	= 2,
-	AFAIL_RGB_ONLY	= 3,
+	AFAIL_RGB_ONLY	= 3
 };
 
+enum GS_TFX
+{
+	TFX_MODULATE	= 0,
+	TFX_DECAL		= 1,
+	TFX_HIGHLIGHT	= 2,
+	TFX_HIGHLIGHT2	= 3
+};
+
+enum GS_CLAMP
+{
+	CLAMP_REPEAT		= 0,
+	CLAMP_CLAMP			= 1,
+	CLAMP_REGION_CLAMP	= 2,
+	CLAMP_REGION_REPEAT	= 3
+};
 // GIFReg

 REG64_(GIFReg, ALPHA)
@ -763,7 +775,8 @@ REG128_SET(GIFPackedReg)
 	GIFPackedNOP	NOP;
 REG_SET_END

-
+// This register stores the background color. Theoretically it'd get blended with the image in some cases, but we don't appear to be
+// using it. See PMODE->SLBG. GSDx *is* using it.
 REG64_(GSReg, BGCOLOR)
 	u32 R:8;
 	u32 G:8;
@ -772,12 +785,15 @@ REG64_(GSReg, BGCOLOR)
 	u32 _PAD2:32;
 REG_END

+// This register switches the direction of Fifo. 0 - Host -> Local; 1 - Local -> Host. Fifo is supposed to be empty at the time.
+// Unchecked by GSdx or ZZOgl.
 REG64_(GSReg, BUSDIR)
 	u32 DIR:1;
 	u32 _PAD1:31;
 	u32 _PAD2:32;
 REG_END

+// Mostly looks handled by pcsx2.
 REG64_(GSReg, CSR)
 	u32 SIGNAL:1;
 	u32 FINISH:1;
@ -798,6 +814,12 @@ REG64_(GSReg, CSR)
 	u32 _PAD3:32;
 REG_END

+// Settings for whichever circuit we're using. (Again, see PMODE.)
+//    --  FBP - Frame Buffer Pointer. address / 2048.
+//    --  FBW - Frame Buffer Width. width / 64.
+//    --  PSM - psm, but 5 bit. 0 - PSMCT32; 1 - PSMCT24; 2 - PSMCT16; 10 - PSMCT16S; 18  - PS-GPU24?
+//    --  DBX - Upper left x coords of rectangle.
+//    --  DBY - Upper left y coords of rectangle.
 REG64_(GSReg, DISPFB) // (-1/2)
 	u32 FBP:9;
 	u32 FBW:6;
@ -808,6 +830,14 @@ REG64_(GSReg, DISPFB) // (-1/2)
 	u32 _PAD2:10;
 REG_END

+// Settings for whichever display we're using.
+//    --  DX - X position in the display area.
+//    --  DY - Y position in the display area.
+//    --  MAGH - Horizontal Magnification; x1 - x16.
+//    --  MAGV - Vertical Magnification; x1 - x4.
+//    --  DW - Display Area Width - 1.
+//    --  DH - Display Area Height - 1.
+
 REG64_(GSReg, DISPLAY) // (-1/2)
 	u32 DX:12;
 	u32 DY:11;
@ -819,6 +849,16 @@ REG64_(GSReg, DISPLAY) // (-1/2)
 	u32 _PAD2:9;
 REG_END

+// This register has settings for the frame buffer when writing back. These next three registers are unused in ZZOgl & GSDx.
+//   --  EXBP - Base pointer of the buffer / 64.
+//   --  EXBW - Width of the buffer / 64.
+//   --  FBIN - Whether we use OUT1 or OUT2. 0 - OUT1; 1 - OUT2.
+//   --  WFFMD - Interlace Mode; 0 - Field; 1 - Frame.
+//   --  EMODA - When processing an input alpha value; 0 - write it as is; 1 - Convert from RGB to luminance value Y. 2 - Same as 1, only /2. 3 - 0.
+//   --  EMODC - When processing an input color value; 0 - write it as is; 1 - Convert from RGB to luminance value Y. 2 - Convert to YCbCr. 3 - Write Alpha to RGB.
+//   --  WDX - X coords.
+//   --  WDY - Y coords.
+
 REG64_(GSReg, EXTBUF)
 	u32 EXBP:14;
 	u32 EXBW:6;
@ -832,6 +872,14 @@ REG64_(GSReg, EXTBUF)
 	u32 _PAD2:10;
 REG_END

+// Sets where you read when the write above is performed.
+//   --  SX - X coords.
+//   --  SY - Y coords.
+//   --  SMPH - Horiz Sampling rate.
+//   --  SMPV -  Vert Sampling rate.
+//   --  WW - Rect Width - 1
+//   --  WH - Rect Height - 1
+
 REG64_(GSReg, EXTDATA)
 	u32 SX:12;
 	u32 SY:11;
@ -843,11 +891,13 @@ REG64_(GSReg, EXTDATA)
 	u32 _PAD2:9;
 REG_END

+// Starts or stops the aforementioned write.
 REG64_(GSReg, EXTWRITE)
 	u32 WRITE;
 	u32 _PAD2:32;
 REG_END

+// Pcsx2 handles this.
 REG64_(GSReg, IMR)
 	u32 _PAD1:8;
 	u32 SIGMSK:1;
@ -859,6 +909,16 @@ REG64_(GSReg, IMR)
 	u32 _PAD3:32;
 REG_END

+// The fields of PMODE are:
+//    --  EN1 - Read Circuit 1; 0 - off, 1 - on.
+//    --  EN2 - Read Circuit 2; 0 - off, 1 - on.
+//    --  CRTMD - Always 1.
+//    --  MMOD - For Alpha blending, the selection is: 0 - The Alpha value of circuit 1, 1 - The ALP register value.
+//    --  AMOD - The OUT1 Alpha value selection: 0 - Read circuit 1, 1 - Read Circuit 2.
+//    --  SLBG - The Alpha blending type: 0 - blended with the output of Read circuit 1, 1 - blended with the background color.
+//    --  ALP - The fixed Alpha value.
+//
+
 REG64_(GSReg, PMODE)
 	u32 EN1:1;
 	u32 EN2:1;
@ -871,11 +931,13 @@ REG64_(GSReg, PMODE)
 	u32 _PAD1:32;
 REG_END

+// Pcsx2 handles this.
 REG64_(GSReg, SIGLBLID)
 	u32 SIGID:32;
 	u32 LBLID:32;
 REG_END

+// Not sure about this one...
 REG64_(GSReg, SMODE1)
 	u32 RC:3;
 	u32 LC:7;
@ -901,6 +963,11 @@ REG64_(GSReg, SMODE1)
 	u32 _PAD1:27;
 REG_END

+// The fields of SMODE2 are:
+//    --  INT - 0 for non-interlaced; 1 for interlaced.
+//    --  FFMD - 0 for field mode (read every other line); 1 for frame mode (read every line)
+//    --  DPMS - VESA DPMS mode setting; 0 - on, 1 - standby, 2 - suspend, 3 - off.
+//
 REG64_(GSReg, SMODE2)
 	u32 INT:1;
 	u32 FFMD:1;
@ -914,6 +981,8 @@ REG64_(GSReg, SIGBLID)
 	u32 LBLID;
 REG_END

+extern u8* g_pBasePS2Mem;
+
 #define PMODE ((GSRegPMODE*)(g_pBasePS2Mem+0x0000))
 #define SMODE1 ((GSRegSMODE1*)(g_pBasePS2Mem+0x0010))
 #define SMODE2 ((GSRegSMODE2*)(g_pBasePS2Mem+0x0020))
@ -942,6 +1011,21 @@ REG_END

 #define GET_GSFPS (((SMODE1->CMOD&1) ? 50 : 60) / (SMODE2->INT ? 1 : 2))

+static __forceinline GSRegDISPLAY* Display_Reg(int circuit)
+{
+	return (circuit) ? DISPLAY2 : DISPLAY1;
+}
+
+static __forceinline GSRegDISPFB* Dispfb_Reg(int circuit)
+{
+	return (circuit) ? DISPFB2 : DISPFB1;
+}
+
+static __forceinline bool Circuit_Enabled(int circuit)
+{
+	return (circuit) ? PMODE->EN2 : PMODE->EN1;
+}
+
 extern void WriteTempRegs();
 extern void SetFrameSkip(bool skip);
 extern void ResetRegs();
@ -958,7 +1042,4 @@ void FlushBoth();
 // called on a primitive switch
 void Prim();

-#else
-#include "NewRegs.h"
-#endif
 #endif
--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@ -58,10 +58,12 @@
 #include "CRC.h"
 #include "ZZLog.h"

+#ifdef _WIN32
 // need C definitions -- no mangling please!
 extern "C" u32   CALLBACK PS2EgetLibType(void);
 extern "C" u32   CALLBACK PS2EgetLibVersion2(u32 type);
 extern "C" char* CALLBACK PS2EgetLibName(void);
+#endif

 #include "ZZoglMath.h"
 #include "Profile.h"
--- a/plugins/zzogl-pg/opengl/ZZClut.cpp
+++ b/plugins/zzogl-pg/opengl/ZZClut.cpp
@ -22,7 +22,7 @@
 #include "Util.h"

 #if defined(ZEROGS_SSE2)
-#include <emmintrin.h>
+#include <immintrin.h>
 #endif

 // Local Clut buffer:
--- a/plugins/zzogl-pg/opengl/ZZDepthTargets.cpp
+++ b/plugins/zzogl-pg/opengl/ZZDepthTargets.cpp
@ -0,0 +1,314 @@
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#include <stdlib.h>
+#include <math.h>
+
+#include "GS.h"
+#include "Mem.h"
+#include "x86.h"
+#include "targets.h"
+#include "ZZoglShaders.h"
+#include "ZZClut.h"
+#include "ZZoglVB.h"
+
+#ifdef ZEROGS_SSE2
+#include <immintrin.h>
+#endif
+
+extern bool g_bUpdateStencil;
+
+void _Resolve(const void* psrc, int fbp, int fbw, int fbh, int psm, u32 fbm, bool mode);
+void SetWriteDepth();
+bool IsWriteDepth();
+bool IsWriteDestAlphaTest();
+
+const float g_filog32 = 0.999f / (32.0f * logf(2.0f));
+
+CDepthTarget::CDepthTarget() : CRenderTarget(), pdepth(0), pstencil(0), icount(0) {}
+
+CDepthTarget::~CDepthTarget()
+{
+	FUNCLOG
+
+	Destroy();
+}
+
+bool CDepthTarget::Create(const frameInfo& frame)
+{
+	FUNCLOG
+
+	if (!CRenderTarget::Create(frame)) return false;
+
+	GL_REPORT_ERROR();
+
+	glGenRenderbuffersEXT(1, &pdepth);
+	glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, pdepth);
+	glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH24_STENCIL8_EXT, RW(fbw), RH(fbh));
+
+	if (glGetError() != GL_NO_ERROR)
+	{
+		// try a separate depth and stencil buffer
+		glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, pdepth);
+		glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT24, RW(fbw), RH(fbh));
+
+		if (g_bUpdateStencil)
+		{
+			glGenRenderbuffersEXT(1, &pstencil);
+			glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, pstencil);
+			glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_STENCIL_INDEX8_EXT, RW(fbw), RH(fbh));
+
+			if (glGetError() != GL_NO_ERROR)
+			{
+				ZZLog::Error_Log("Failed to create depth buffer %dx%d.", RW(fbw), RH(fbh));
+				return false;
+			}
+		}
+		else
+		{
+			pstencil = 0;
+		}
+	}
+	else
+	{
+		pstencil = pdepth;
+	}
+
+	status = TS_NeedUpdate;
+
+	return true;
+}
+
+void CDepthTarget::Destroy()
+{
+	FUNCLOG
+
+	if (status)     // In this case Framebuffer extension is off-use and lead to segfault
+	{
+		ResetRenderTarget(1);
+		FB::Attach(GL_DEPTH_ATTACHMENT_EXT);
+		FB::Attach(GL_STENCIL_ATTACHMENT_EXT);
+		GL_REPORT_ERRORD();
+
+		if (pstencil != 0)
+		{
+			if (pstencil != pdepth) glDeleteRenderbuffersEXT(1, &pstencil);
+			pstencil = 0;
+		}
+
+		if (pdepth != 0)
+		{
+			glDeleteRenderbuffersEXT(1, &pdepth);
+			pdepth = 0;
+		}
+
+		GL_REPORT_ERRORD();
+	}
+
+	CRenderTarget::Destroy();
+}
+
+
+extern int g_nDepthUsed; // > 0 if depth is used
+
+void CDepthTarget::Resolve()
+{
+	FUNCLOG
+
+	if (g_nDepthUsed > 0 && conf.mrtdepth && !(status & TS_Virtual) && IsWriteDepth() && !(conf.settings().no_depth_resolve))
+		CRenderTarget::Resolve();
+	else
+	{
+		// flush if necessary
+		FlushIfNecesary(this);
+
+		if (!(status & TS_Virtual)) status |= TS_Resolved;
+	}
+
+	if (!(status&TS_Virtual))
+	{
+		SetWriteDepth();
+	}
+}
+
+void CDepthTarget::Resolve(int startrange, int endrange)
+{
+	FUNCLOG
+
+	if (g_nDepthUsed > 0 && conf.mrtdepth && !(status&TS_Virtual) && IsWriteDepth())
+	{
+		CRenderTarget::Resolve(startrange, endrange);
+	}
+	else
+	{
+		// flush if necessary
+		FlushIfNecesary(this) ;
+
+		if (!(status & TS_Virtual))
+			status |= TS_Resolved;
+	}
+
+	if (!(status&TS_Virtual))
+	{
+		SetWriteDepth();
+	}
+}
+
+void CDepthTarget::Update(int context, CRenderTarget* prndr)
+{
+	FUNCLOG
+
+	assert(!(status & TS_Virtual));
+
+	// align the rect to the nearest page
+	// note that fbp is always aligned on page boundaries
+	tex0Info texframe;
+	texframe.tbp0 = fbp;
+	texframe.tbw = fbw;
+	texframe.tw = fbw;
+	texframe.th = fbh;
+	texframe.psm = psm;
+    // FIXME: some fields of texframe are not initialized,
+    // in particular the CLUT-related ones.
+    assert(!PSMT_ISCLUT(psm));
+
+	DisableAllgl();
+
+	VB& curvb = vb[context];
+
+	if (curvb.test.zte == 0) return;
+
+	SetShaderCaller("CDepthTarget::Update");
+
+	glEnable(GL_DEPTH_TEST);
+
+	glDepthMask(!curvb.zbuf.zmsk);
+
+	static const u32 g_dwZCmp[] = { GL_NEVER, GL_ALWAYS, GL_GEQUAL, GL_GREATER };
+
+	glDepthFunc(g_dwZCmp[curvb.test.ztst]);
+
+	// write color and zero out stencil buf, always 0 context!
+	SetTexVariablesInt(0, 0, texframe, false, &ppsBitBltDepth, 1);
+	ZZshGLSetTextureParameter(ppsBitBltDepth.prog, ppsBitBltDepth.sMemory, vb[0].pmemtarg->ptex->tex, "BitBltDepth");
+
+	float4 v = DefaultBitBltPos();
+
+	v = DefaultBitBltTex();
+
+	v.x = 1;
+	v.y = 2;
+	v.z = PSMT_IS16Z(psm) ? 1.0f : 0.0f;
+	v.w = g_filog32;
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sOneColor, v, "g_fOneColor");
+
+	float4 vdepth = g_vdepth;
+
+	if (psm == PSMT24Z)
+	{
+		vdepth.w = 0;
+	}
+	else if (psm != PSMT32Z)
+	{
+		vdepth.z = vdepth.w = 0;
+	}
+
+	assert(ppsBitBltDepth.sBitBltZ != 0);
+
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sBitBltZ, (vdepth*(255.0f / 256.0f)), "g_fBitBltZ");
+
+	assert(pdepth != 0);
+	//GLint w1 = 0;
+	//GLint h1 = 0;
+
+	FB::Attach2D(0, ptex);
+	//glGetRenderbufferParameterivEXT(GL_RENDERBUFFER_EXT, GL_RENDERBUFFER_WIDTH_EXT, &w1);
+	//glGetRenderbufferParameterivEXT(GL_RENDERBUFFER_EXT, GL_RENDERBUFFER_HEIGHT_EXT, &h1);
+	SetDepthStencilSurface();
+
+	FB::Attach2D(1);
+
+	GLenum buffer = GL_COLOR_ATTACHMENT0_EXT;
+
+	//ZZLog::Error_Log("CDepthTarget::Update: w1 = 0x%x; h1 = 0x%x", w1, h1);
+	DrawBuffers(&buffer);
+
+	SetViewport();
+
+	if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+
+	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
+
+	SET_STREAM();
+	ZZshSetVertexShader(pvsBitBlt.prog);
+	ZZshSetPixelShader(ppsBitBltDepth.prog);
+
+	DrawTriangleArray();
+
+	status = TS_Resolved;
+
+	if (!IsWriteDepth())
+	{
+		ResetRenderTarget(1);
+	}
+
+	if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
+
+	glEnable(GL_SCISSOR_TEST);
+
+#ifdef _DEBUG
+	if (g_bSaveZUpdate)
+	{
+		SaveTex(&texframe, 1);
+		SaveTexture("frame1.tga", GL_TEXTURE_RECTANGLE_NV, ptex, RW(fbw), RH(fbh));
+	}
+#endif
+}
+
+void CDepthTarget::SetDepthStencilSurface()
+{
+	FUNCLOG
+	FB::Attach(GL_DEPTH_ATTACHMENT_EXT, pdepth);
+
+	if (pstencil)
+	{
+		// there's a bug with attaching stencil and depth buffers
+		FB::Attach(GL_STENCIL_ATTACHMENT_EXT, pstencil);
+
+		if (icount++ < 8)    // not going to fail if succeeded 4 times
+		{
+			GL_REPORT_ERRORD();
+
+			if (FB::State() != GL_FRAMEBUFFER_COMPLETE_EXT)
+			{
+				FB::Attach(GL_STENCIL_ATTACHMENT_EXT);
+
+				if (pstencil != pdepth) glDeleteRenderbuffersEXT(1, &pstencil);
+
+				pstencil = 0;
+				g_bUpdateStencil = 0;
+			}
+		}
+	}
+	else
+	{
+		FB::Attach(GL_STENCIL_ATTACHMENT_EXT);
+	}
+}
+
--- a/plugins/zzogl-pg/opengl/ZZGl.h
+++ b/plugins/zzogl-pg/opengl/ZZGl.h
@ -47,8 +47,11 @@ inline void* wglGetProcAddress(const char* x)

 #endif

+#include "Mem.h"
+
 extern u32 s_stencilfunc, s_stencilref, s_stencilmask;
-// Defines
+extern GLenum s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha; // set by zgsBlendFuncSeparateEXT
+extern GLenum s_rgbeq, s_alphaeq;

 #ifndef GL_DEPTH24_STENCIL8_EXT // allows FBOs to support stencils
 #	define GL_DEPTH_STENCIL_EXT 0x84F9
@ -57,28 +60,50 @@ extern u32 s_stencilfunc, s_stencilref, s_stencilmask;
 #	define GL_TEXTURE_STENCIL_SIZE_EXT 0x88F1
 #endif

-#define GL_STENCILFUNC(func, ref, mask) { \
-	s_stencilfunc  = func; \
-	s_stencilref = ref; \
-	s_stencilmask = mask; \
-	glStencilFunc(func, ref, mask); \
+#ifdef _WIN32
+#define GL_LOADFN(name) { \
+		if( (*(void**)&name = (void*)wglGetProcAddress(#name)) == NULL ) { \
+		ZZLog::Error_Log("Failed to find %s, exiting.", #name); \
+	} \
+}
+#else
+// let GLEW take care of it
+#define GL_LOADFN(name)
+#endif
+
+static __forceinline void GL_STENCILFUNC(GLenum func, GLint ref, GLuint mask)
+{
+	s_stencilfunc  = func; 
+	s_stencilref = ref; 
+	s_stencilmask = mask; 
+	glStencilFunc(func, ref, mask); 
 }

-#define GL_STENCILFUNC_SET() glStencilFunc(s_stencilfunc, s_stencilref, s_stencilmask)
-
+static __forceinline void GL_STENCILFUNC_SET()
+{
+	glStencilFunc(s_stencilfunc, s_stencilref, s_stencilmask); 
+}

 // sets the data stream
-#define SET_STREAM() { \
-	glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)8); \
-	glSecondaryColorPointerEXT(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)12); \
-	glTexCoordPointer(3, GL_FLOAT, sizeof(VertexGPU), (void*)16); \
-	glVertexPointer(4, GL_SHORT, sizeof(VertexGPU), (void*)0); \
+static __forceinline void SET_STREAM()
+{
+	glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)8);
+	glSecondaryColorPointerEXT(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)12);
+	glTexCoordPointer(3, GL_FLOAT, sizeof(VertexGPU), (void*)16);
+	glVertexPointer(4, GL_SHORT, sizeof(VertexGPU), (void*)0);
 }

-
 // global alpha blending settings
 extern GLenum g_internalRGBAFloat16Fmt;

+//static __forceinline void SAFE_RELEASE_TEX(u32& x)
+//{
+//	if (x != 0) 
+//	{ 
+//		glDeleteTextures(1, &x);
+//		x = 0; 
+//	}
+//}
 #define SAFE_RELEASE_TEX(x) { if( (x) != 0 ) { glDeleteTextures(1, &(x)); x = 0; } }

 // inline for an extremely often used sequence
@ -104,12 +129,192 @@ extern void (APIENTRY *zgsBlendEquationSeparateEXT)(GLenum, GLenum);
 extern void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum);
 #endif

+static __forceinline void DrawTriangleArray()
+{
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+	GL_REPORT_ERRORD();
+}

-// ------------------------ Types -------------------------
+static __forceinline void DrawBuffers(GLenum *buffer)
+{
+	if (glDrawBuffers != NULL) 
+	{
+		glDrawBuffers(1, buffer);
+	}

-/////////////////////
-// graphics resources
-extern GLenum s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha; // set by zgsBlendFuncSeparateEXT
+	GL_REPORT_ERRORD();
+}
+
+
+namespace FB
+{	
+	extern u32 buf;
+
+	static __forceinline void Create()
+	{
+		glGenFramebuffersEXT(1, &buf);
+	}
+	
+	static __forceinline void Bind()
+	{
+		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, buf);
+	}
+	
+	static __forceinline void Unbind()
+	{
+		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
+	}
+		
+	static __forceinline GLenum State()
+	{
+		return glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
+	}
+
+	static __forceinline void Attach2D(int attach, int id = 0)
+	{
+		glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT + attach, GL_TEXTURE_RECTANGLE_NV, id, 0);
+		GL_REPORT_ERRORD();
+	}
+
+	static __forceinline void Attach(GLenum rend, GLuint id = 0)
+	{
+		glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, rend, GL_RENDERBUFFER_EXT, id);
+	}	
+};
+
+static __forceinline void ResetRenderTarget(int index)
+{
+	FB::Attach2D(index);
+}
+
+static __forceinline void TextureImage(GLenum tex_type, GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	glTexImage2D(tex_type, 0, iFormat, width, height, 0, format, type, pixels);
+}
+
+static __forceinline void Texture2D(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	TextureImage(GL_TEXTURE_2D, iFormat, width, height, format, type, pixels);
+}
+
+static __forceinline void Texture2D(GLint iFormat, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	TextureImage(GL_TEXTURE_2D, iFormat, BLOCK_TEXWIDTH, BLOCK_TEXHEIGHT, format, type, pixels);
+}
+	
+static __forceinline void TextureRect(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	TextureImage(GL_TEXTURE_RECTANGLE_NV, iFormat, width, height, format, type, pixels);
+}
+
+static __forceinline void TextureRect2(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	TextureImage(GL_TEXTURE_RECTANGLE, iFormat, width, height, format, type, pixels);
+}
+
+static __forceinline void Texture3D(GLint iFormat, GLint width, GLint height, GLint depth, GLenum format, GLenum type, const GLvoid* pixels)
+{
+	glTexImage3D(GL_TEXTURE_3D, 0, iFormat, width, height, depth, 0, format, type, pixels);
+}
+
+static __forceinline void setTex2DFilters(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, type);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, type);
+}
+
+static __forceinline void setTex2DWrap(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, type);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, type);
+}
+
+static __forceinline void setTex3DFilters(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, type);
+	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, type);
+}
+
+static __forceinline void setTex3DWrap(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, type);
+	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, type);
+	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_R, type);
+}
+
+static __forceinline void setRectFilters(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, type);
+	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, type);
+}
+
+static __forceinline void setRectWrap(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, type);
+	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, type);
+}
+
+static __forceinline void setRectWrap2(GLint type)
+{
+	glTexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_WRAP_S, type);
+	glTexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_WRAP_T, type);
+}
+
+static __forceinline void GL_BLEND_SET()
+{
+	zgsBlendFuncSeparateEXT(s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha);
+}
+
+static __forceinline void GL_BLEND_RGB(GLenum src, GLenum dst)
+{
+	s_srcrgb = src;
+	s_dstrgb = dst;
+	GL_BLEND_SET();
+}
+
+static __forceinline void GL_BLEND_ALPHA(GLenum src, GLenum dst)
+{
+	s_srcalpha = src;
+	s_dstalpha = dst;
+	GL_BLEND_SET();
+}
+
+static __forceinline void GL_BLEND_ALL(GLenum srcrgb, GLenum dstrgb, GLenum srcalpha, GLenum dstalpha)
+{
+	s_srcrgb = srcrgb;
+	s_dstrgb = dstrgb;
+	s_srcalpha = srcalpha;
+	s_dstalpha = dstalpha;
+	GL_BLEND_SET();
+}
+
+static __forceinline void GL_ZTEST(bool enable)
+{
+	if (enable) 
+		glEnable(GL_DEPTH_TEST);
+	else 
+		glDisable(GL_DEPTH_TEST);
+}
+
+static __forceinline void GL_ALPHATEST(bool enable)
+{
+	if (enable) 
+		glEnable(GL_ALPHA_TEST);
+	else 
+		glDisable(GL_ALPHA_TEST);
+}
+
+static __forceinline void GL_BLENDEQ_RGB(GLenum eq)
+{
+	s_rgbeq = eq;
+	zgsBlendEquationSeparateEXT(s_rgbeq, s_alphaeq);
+}
+
+static __forceinline void GL_BLENDEQ_ALPHA(GLenum eq)
+{
+	s_alphaeq = eq;
+	zgsBlendEquationSeparateEXT(s_rgbeq, s_alphaeq);
+}

 // GL prototypes
 extern PFNGLISRENDERBUFFEREXTPROC glIsRenderbufferEXT;
--- a/plugins/zzogl-pg/opengl/ZZLog.cpp
+++ b/plugins/zzogl-pg/opengl/ZZLog.cpp
@ -27,7 +27,7 @@ extern GSconf conf;
 using namespace std;

 static list<MESSAGE> listMsgs;
-
+const char* logging_prefix = "ZZOgl-PG";
 void ProcessMessages()
 {
 	FUNCLOG
@ -130,12 +130,14 @@ void _Log(const char *str)

 void _WriteToConsole(const char *str)
 {
-	fprintf(stderr,"ZZogl-PG: %s", str);
+	fprintf(stderr,"%s:  ", logging_prefix);
+	fprintf(stderr,"%s", str);
 }

 void _Print(const char *str)
 {
-	fprintf(stderr,"ZZogl-PG: %s", str);
+	fprintf(stderr,"%s:  ", logging_prefix);
+	fprintf(stderr,"%s", str);

 	if (IsLogging()) fprintf(gsLog, str);
 }
@ -169,7 +171,7 @@ void WriteToConsole(const char *fmt, ...)

 	va_start(list, fmt);

-	fprintf(stderr, "ZZogl-PG: ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	va_end(list);
 }
@ -182,7 +184,7 @@ void Print(const char *fmt, ...)

 	if (IsLogging()) vfprintf(gsLog, fmt, list);
 	
-	fprintf(stderr, "ZZogl-PG: ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);

 	va_end(list);
@ -197,7 +199,7 @@ void WriteLn(const char *fmt, ...)

 	if (IsLogging()) vfprintf(gsLog, fmt, list);
 	
-	fprintf(stderr, "ZZogl-PG: ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	va_end(list);
 	fprintf(stderr,"\n");
@ -237,7 +239,7 @@ void Prim_Log(const char *fmt, ...)
 	{
 		if (IsLogging()) vfprintf(gsLog, fmt, list);

-		fprintf(stderr, "ZZogl-PG(PRIM): ");
+		fprintf(stderr, "%s(PRIM):  ", logging_prefix);
 		vfprintf(stderr, fmt, list);

 		vprintf(fmt, list);
@ -262,7 +264,7 @@ void GS_Log(const char *fmt, ...)
 		fprintf(gsLog, "\n");
 	}
 	
-	fprintf(stderr, "ZZogl-PG: ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	fprintf(stderr, "\n");
 	
@ -283,7 +285,7 @@ void Warn_Log(const char *fmt, ...)
 		fprintf(gsLog, "\n");
 	}

-	fprintf(stderr, "ZZogl-PG:  ");
+	fprintf(stderr, "%s(Warning):  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	fprintf(stderr, "\n");
 	
@ -304,7 +306,7 @@ void Dev_Log(const char *fmt, ...)
 		fprintf(gsLog, "\n");
 	}

-	fprintf(stderr, "ZZogl-PG:  ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	fprintf(stderr, "\n");
 	
@ -325,7 +327,7 @@ void Debug_Log(const char *fmt, ...)
 		fprintf(gsLog, "\n");
 	}

-	fprintf(stderr, "ZZogl-PG:  ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	fprintf(stderr, "\n");
 	
@ -345,7 +347,7 @@ void Error_Log(const char *fmt, ...)
 		fprintf(gsLog, "\n");
 	}

-	fprintf(stderr, "ZZogl-PG:  ");
+	fprintf(stderr, "%s:  ", logging_prefix);
 	vfprintf(stderr, fmt, list);
 	fprintf(stderr, "\n");
 	
--- a/plugins/zzogl-pg/opengl/ZZMemoryTargets.cpp
+++ b/plugins/zzogl-pg/opengl/ZZMemoryTargets.cpp
@ -0,0 +1,620 @@
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
+#include <stdlib.h>
+#include <math.h>
+
+#include "GS.h"
+#include "Mem.h"
+#include "targets.h"
+#include "ZZClut.h"
+
+#ifdef ZEROGS_SSE2
+#include <immintrin.h>
+#endif
+
+extern int g_TransferredToGPU;
+
+extern int VALIDATE_THRESH;
+extern u32 TEXDESTROY_THRESH;
+#define FORCE_TEXDESTROY_THRESH (3) // destroy texture after FORCE_TEXDESTROY_THRESH frames
+
+void CMemoryTargetMngr::Destroy()
+{
+	FUNCLOG
+	listTargets.clear();
+	listClearedTargets.clear();
+}
+
+bool CMemoryTarget::ValidateTex(const tex0Info& tex0, int starttex, int endtex, bool bDeleteBadTex)
+{
+	FUNCLOG
+
+	if (clearmaxy == 0) return true;
+
+	int checkstarty = max(starttex, clearminy);
+	int checkendy = min(endtex, clearmaxy);
+
+	if (checkstarty >= checkendy) return true;
+
+	if (validatecount++ > VALIDATE_THRESH)
+	{
+		height = 0;
+		return false;
+	}
+
+	// lock and compare
+	assert(ptex != NULL && ptex->memptr != NULL);
+
+	int result = memcmp_mmx(ptex->memptr + MemorySize(checkstarty-realy), MemoryAddress(checkstarty), MemorySize(checkendy-checkstarty));
+	
+	if (result == 0)
+	{
+		clearmaxy = 0;
+		return true;
+	}
+
+	if (!bDeleteBadTex) return false;
+
+	// delete clearminy, clearmaxy range (not the checkstarty, checkendy range)
+	//int newstarty = 0;
+	if (clearminy <= starty)
+	{
+		if (clearmaxy < starty + height)
+		{
+			// preserve end
+			height = starty + height - clearmaxy;
+			starty = clearmaxy;
+			assert(height > 0);
+		}
+		else
+		{
+			// destroy
+			height = 0;
+		}
+	}
+	else
+	{
+		// beginning can be preserved
+		height = clearminy - starty;
+	}
+
+	clearmaxy = 0;
+
+	assert((starty >= realy) && ((starty + height) <= (realy + realheight)));
+
+	return false;
+}
+
+#define TARGET_THRESH 0x500
+
+extern int g_MaxTexWidth, g_MaxTexHeight; // Maximum height & width of supported texture.
+
+//#define SORT_TARGETS
+inline list<CMemoryTarget>::iterator CMemoryTargetMngr::DestroyTargetIter(list<CMemoryTarget>::iterator& it)
+{
+	// find the target and destroy
+	list<CMemoryTarget>::iterator itprev = it;
+	++it;
+	listClearedTargets.splice(listClearedTargets.end(), listTargets, itprev);
+
+	if (listClearedTargets.size() > TEXDESTROY_THRESH)
+	{
+		listClearedTargets.pop_front();
+	}
+
+	return it;
+}
+
+// Compare target to current texture info
+// Not same format -> 1
+// Same format, not same data (clut only) -> 2
+// identical -> 0
+int CMemoryTargetMngr::CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize)
+{
+	if (PSMT_ISCLUT(it->psm) != PSMT_ISCLUT(tex0.psm))
+		return 1;
+
+	if (PSMT_ISCLUT(tex0.psm)) {
+		if (it->psm != tex0.psm || it->cpsm != tex0.cpsm || it->clutsize != clutsize)
+			return 1;
+
+		if	(PSMT_IS32BIT(tex0.cpsm)) {
+			if (Cmp_ClutBuffer_SavedClut<u32>((u32*)&it->clut[0], tex0.csa, clutsize))
+				return 2;
+		} else {
+			if (Cmp_ClutBuffer_SavedClut<u16>((u16*)&it->clut[0], tex0.csa, clutsize))
+				return 2;
+		}
+
+	} else {
+		if (PSMT_IS16BIT(tex0.psm) != PSMT_IS16BIT(it->psm))
+			return 1;
+    }
+
+	return 0;
+}
+
+void CMemoryTargetMngr::GetClutVariables(int& clutsize, const tex0Info& tex0)
+{
+	clutsize = 0;
+
+	if (PSMT_ISCLUT(tex0.psm))
+	{
+		int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
+
+		if (PSMT_IS32BIT(tex0.cpsm))
+			clutsize = min(entries, 256 - tex0.csa * 16) * 4;
+		else
+			clutsize = min(entries, 512 - tex0.csa * 16) * 2;
+	}
+}
+
+void CMemoryTargetMngr::GetMemAddress(int& start, int& end,  const tex0Info& tex0)
+{
+	int nbStart, nbEnd;
+	GetRectMemAddressZero(nbStart, nbEnd, tex0.psm, tex0.tw, tex0.th, tex0.tbp0, tex0.tbw);
+	assert(nbStart < nbEnd);
+	nbEnd = min(nbEnd, MEMORY_END);
+
+	start = nbStart / (4 * GPU_TEXWIDTH);
+	end = (nbEnd + GPU_TEXWIDTH * 4 - 1) / (4 * GPU_TEXWIDTH);
+	assert(start < end);
+
+}
+
+CMemoryTarget* CMemoryTargetMngr::SearchExistTarget(int start, int end, int clutsize, const tex0Info& tex0, int forcevalidate)
+{
+	for (list<CMemoryTarget>::iterator it = listTargets.begin(); it != listTargets.end();)
+	{
+
+		if (it->starty <= start && it->starty + it->height >= end)
+		{
+
+			int res = CompareTarget(it, tex0, clutsize);
+
+			if (res == 1)
+			{
+				if (it->validatecount++ > VALIDATE_THRESH)
+				{
+					it = DestroyTargetIter(it);
+
+					if (listTargets.size() == 0) break;
+				}
+				else
+					++it;
+
+				continue;
+			}
+			else if (res == 2)
+			{
+				++it;
+				continue;
+			}
+
+			if (forcevalidate)   //&& listTargets.size() < TARGET_THRESH ) {
+			{
+				// do more validation checking. delete if not been used for a while
+
+				if (!it->ValidateTex(tex0, start, end, curstamp > it->usedstamp + FORCE_TEXDESTROY_THRESH))
+				{
+
+					if (it->height <= 0)
+					{
+						it = DestroyTargetIter(it);
+
+						if (listTargets.size() == 0) break;
+					}
+					else
+						++it;
+
+					continue;
+				}
+			}
+
+			it->usedstamp = curstamp;
+
+			it->validatecount = 0;
+
+			return &(*it);
+		}
+
+#ifdef SORT_TARGETS
+		else if (it->starty >= end) break;
+
+#endif
+
+		++it;
+	}
+
+	return NULL;
+}
+
+CMemoryTarget* CMemoryTargetMngr::ClearedTargetsSearch(int fmt, int widthmult, int channels, int height)
+{
+	CMemoryTarget* targ = NULL;
+
+	if (listClearedTargets.size() > 0)
+	{
+		list<CMemoryTarget>::iterator itbest = listClearedTargets.begin();
+
+		while (itbest != listClearedTargets.end())
+		{
+			if ((height == itbest->realheight) && (itbest->fmt == fmt) && (itbest->widthmult == widthmult) && (itbest->channels == channels))
+			{
+				// check channels
+				if (PIXELS_PER_WORD(itbest->psm) == channels) break;
+			}
+
+			++itbest;
+		}
+
+		if (itbest != listClearedTargets.end())
+		{
+			listTargets.splice(listTargets.end(), listClearedTargets, itbest);
+			targ = &listTargets.back();
+			targ->validatecount = 0;
+		}
+		else
+		{
+			// create a new
+			listTargets.push_back(CMemoryTarget());
+			targ = &listTargets.back();
+		}
+	}
+	else
+	{
+		listTargets.push_back(CMemoryTarget());
+		targ = &listTargets.back();
+	}
+
+	return targ;
+}
+
+CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forcevalidate)
+{
+	FUNCLOG
+	int start, end, clutsize;
+
+	GetClutVariables(clutsize, tex0);
+	GetMemAddress(start, end, tex0);
+
+	CMemoryTarget* it = SearchExistTarget(start, end, clutsize, tex0, forcevalidate);
+
+	if (it != NULL) return it;
+
+	// couldn't find so create
+	CMemoryTarget* targ;
+
+	u32 fmt;
+    u32 internal_fmt;
+	if (PSMT_ISHALF_STORAGE(tex0)) {
+        // RGBA_5551 storage format
+        fmt = GL_UNSIGNED_SHORT_1_5_5_5_REV;
+        internal_fmt = GL_RGB5_A1;
+    } else {
+        // RGBA_8888 storage format
+        fmt = GL_UNSIGNED_BYTE;
+        internal_fmt = GL_RGBA;
+    }
+
+	int widthmult = 1, channels = 1;
+
+	// Handle a texture that is too tall to fit in a single GPU texture. Pretty rare on modern cards.
+	if ((g_MaxTexHeight < 4096) && (end - start > g_MaxTexHeight)) 
+	{
+		// In this rare case we make the texture half as high and twice as wide (widthmult = 2).
+		ZZLog::Debug_Log("Making a half height texture (start - end == 0x%x)", (end-start));
+		widthmult = 2;
+	}
+	
+	channels = PIXELS_PER_WORD(tex0.psm);
+
+	targ = ClearedTargetsSearch(fmt, widthmult, channels, end - start);
+
+	if (targ->ptex != NULL)
+	{
+		assert(end - start <= targ->realheight && targ->fmt == fmt && targ->widthmult == widthmult);
+
+		// good enough, so init
+		targ->realy = targ->starty = start;
+		targ->usedstamp = curstamp;
+		targ->psm = tex0.psm;
+		targ->cpsm = tex0.cpsm;
+		targ->height = end - start;
+	} else {
+		// not initialized yet
+		targ->fmt = fmt;
+		targ->realy = targ->starty = start;
+		targ->realheight = targ->height = end - start;
+		targ->usedstamp = curstamp;
+		targ->psm = tex0.psm;
+		targ->cpsm = tex0.cpsm;
+		targ->widthmult = widthmult;
+		targ->channels = channels;
+		targ->texH = (targ->realheight + widthmult - 1)/widthmult;
+		targ->texW = GPU_TEXWIDTH *  widthmult * channels;
+
+		// alloc the mem
+		targ->ptex = new CMemoryTarget::TEXTURE();
+		targ->ptex->ref = 1;
+	}
+
+#if defined(ZEROGS_DEVBUILD)
+	g_TransferredToGPU += MemorySize(channels * targ->height);
+#endif
+
+	// fill with data
+	if (targ->ptex->memptr == NULL)
+	{
+		targ->ptex->memptr = (u8*)_aligned_malloc(MemorySize(targ->realheight), 16);
+		assert(targ->ptex->ref > 0);
+	}
+
+	memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
+
+	__aligned16 u8* ptexdata = NULL;
+	bool has_data = false;
+
+	if (PSMT_ISCLUT(tex0.psm))
+	{
+		assert(clutsize > 0);
+
+        // Local clut parameter
+		targ->cpsm = tex0.cpsm;
+
+        // Allocate a local clut array
+        targ->clutsize = clutsize;
+        if(targ->clut == NULL)
+            targ->clut = (u8*)_aligned_malloc(clutsize, 16);
+        else {
+            // In case this could occur
+            // realloc would be better but you need to get it from libutilies first
+            // _aligned_realloc is brought in from ScopedAlloc.h now. --arcum42
+            _aligned_free(targ->clut);
+            targ->clut = (u8*)_aligned_malloc(clutsize, 16);
+        }
+
+        // texture parameter
+		ptexdata = (u8*)_aligned_malloc(CLUT_PIXEL_SIZE(tex0.cpsm) * targ->texH * targ->texW, 16);
+		has_data = true;
+
+		u8* psrc = (u8*)(MemoryAddress(targ->realy));
+
+        // Fill a local clut then build the real texture
+		if (PSMT_IS32BIT(tex0.cpsm))
+		{
+            ClutBuffer_to_Array<u32>((u32*)targ->clut, tex0.csa, clutsize);
+			Build_Clut_Texture<u32>(tex0.psm, targ->height, (u32*)targ->clut, psrc, (u32*)ptexdata);
+		}
+		else
+		{
+            ClutBuffer_to_Array<u16>((u16*)targ->clut, tex0.csa, clutsize);
+			Build_Clut_Texture<u16>(tex0.psm, targ->height, (u16*)targ->clut, psrc, (u16*)ptexdata);
+		}
+
+        assert(targ->clutsize > 0);
+	}
+	else if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
+    {
+        ptexdata = (u8*)_aligned_malloc(4 * targ->texH * targ->texW, 16);
+        has_data = true;
+
+        // needs to be 8 bit, use xmm for unpacking
+        u16* dst = (u16*)ptexdata;
+        u16* src = (u16*)(MemoryAddress(targ->realy));
+
+#ifdef ZEROGS_SSE2
+        assert(((u32)(uptr)dst) % 16 == 0);
+
+        __m128i zero_128 = _mm_setzero_si128();
+        // NOTE: future performance improvement
+        // SSE4.1 support uncacheable load 128bits. Maybe it can
+        // avoid some cache pollution
+        // NOTE2: I create multiple _n variable to mimic the previous ASM behavior
+        // but I'm not sure there are real gains.
+        for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
+        {
+            // Convert 16 bits pixels to 32bits (zero extended)
+            // Batch 64 bytes (32 pixels) at once.
+            __m128i pixels_1 = _mm_load_si128((__m128i*)src);
+            __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
+            __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
+            __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
+
+            __m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
+            __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
+            __m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
+            __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
+
+            // Note: bypass cache
+            _mm_stream_si128((__m128i*)dst, pix_low_1);
+            _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
+            _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
+            _mm_stream_si128((__m128i*)(dst+24), pix_high_2);
+
+            __m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
+            __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
+            __m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
+            __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
+
+            // Note: bypass cache
+            _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
+            _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
+            _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
+            _mm_stream_si128((__m128i*)(dst+56), pix_high_4);
+
+            src += 32;
+            dst += 64;
+        }
+        // It is advised to use a fence instruction after non-temporal move (_mm_stream) instructions...
+        // The store fence ensures that the previous stores are complete before new ones are executed.
+        _mm_sfence();
+#else // ZEROGS_SSE2
+
+        for (int i = 0; i < targ->height; ++i)
+        {
+            for (int j = 0; j < GPU_TEXWIDTH; ++j)
+            {
+                dst[0] = src[0];
+                dst[1] = 0;
+                dst[2] = src[1];
+                dst[3] = 0;
+                dst += 4;
+                src += 2;
+            }
+        }
+
+#endif // ZEROGS_SSE2
+    }
+    else
+    {
+        ptexdata = targ->ptex->memptr;
+        // We really don't want to deallocate memptr. As a reminder...
+        has_data = false;
+    }
+
+	// create the texture
+	GL_REPORT_ERRORD();
+
+	assert(ptexdata != NULL);
+
+	if (targ->ptex->tex == 0) glGenTextures(1, &targ->ptex->tex);
+
+	glBindTexture(GL_TEXTURE_RECTANGLE_NV, targ->ptex->tex);
+
+    TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);
+
+	while (glGetError() != GL_NO_ERROR)
+	{
+		// release resources until can create
+		if (listClearedTargets.size() > 0)
+		{
+			listClearedTargets.pop_front();
+		}
+		else
+		{
+			if (listTargets.size() == 0)
+			{
+				ZZLog::Error_Log("Failed to create %dx%x texture.", targ->texW, targ->texH);
+				channels = 1;
+				if (has_data) _aligned_free(ptexdata);
+				return NULL;
+			}
+
+			DestroyOldest();
+		}
+
+        TextureRect(internal_fmt, targ->texW, targ->texH, GL_RGBA, fmt, ptexdata);
+	}
+
+	setRectWrap(GL_CLAMP);
+	if (has_data) _aligned_free(ptexdata);
+
+	assert(tex0.psm != 0xd);
+
+	return targ;
+}
+
+// Records, for every entry of listTargets that overlaps the given range, which part of it was cleared.
+void CMemoryTargetMngr::ClearRange(int nbStartY, int nbEndY)
+{
+	FUNCLOG
+
+	// Scale both offsets down by 4 * GPU_TEXWIDTH so they compare against starty/height (end is rounded up).
+	const int rowSize = 4 * GPU_TEXWIDTH;
+	const int firstRow = nbStartY / rowSize;
+	const int lastRow = (nbEndY + rowSize - 1) / rowSize;
+
+	for (list<CMemoryTarget>::iterator targ = listTargets.begin(); targ != listTargets.end(); ++targ)
+	{
+		// Targets that do not intersect [firstRow, lastRow) are left untouched.
+		if (!(targ->starty < lastRow && (targ->starty + targ->height) > firstRow)) continue;
+
+		// Clip the cleared range to the target's own extent.
+		int miny = max(targ->starty, firstRow);
+		int maxy = min(targ->starty + targ->height, lastRow);
+		assert(miny < maxy);
+
+		if (targ->clearmaxy == 0)
+		{
+			// clearmaxy == 0 is treated as "no span recorded yet": store this one as is.
+			targ->clearminy = miny;
+			targ->clearmaxy = maxy;
+			continue;
+		}
+
+		// Otherwise widen the recorded span so that it also covers this one.
+		if (targ->clearminy > miny) targ->clearminy = miny;
+		if (targ->clearmaxy < maxy) targ->clearmaxy = maxy;
+	}
+}
+
+// Erases stale entries from listClearedTargets (every call) and from listTargets
+// (only when curstamp is a multiple of FORCE_TEXDESTROY_THRESH), then increments curstamp.
+void CMemoryTargetMngr::DestroyCleared()
+{
+	FUNCLOG
+
+	// Cleared targets: drop entries whose usedstamp is below curstamp - (FORCE_TEXDESTROY_THRESH - 1).
+	list<CMemoryTarget>::iterator cleared = listClearedTargets.begin();
+	while (cleared != listClearedTargets.end())
+	{
+		if (cleared->usedstamp < curstamp - (FORCE_TEXDESTROY_THRESH -1))
+			cleared = listClearedTargets.erase(cleared);	// erase() hands back the next valid iterator
+		else
+			++cleared;
+	}
+
+	// purge old targets every FORCE_TEXDESTROY_THRESH frames
+	if ((curstamp % FORCE_TEXDESTROY_THRESH) == 0)
+	{
+		list<CMemoryTarget>::iterator live = listTargets.begin();
+		while (live != listTargets.end())
+		{
+			if (live->usedstamp < curstamp - FORCE_TEXDESTROY_THRESH)
+				live = listTargets.erase(live);
+			else
+				++live;
+		}
+	}
+
+	// Advance the stamp that the usedstamp comparisons above are made against.
+	++curstamp;
+}
+
+// Erases the entry of listTargets that has the smallest usedstamp.
+// When several entries share that stamp, the one closest to the front of the list is erased.
+void CMemoryTargetMngr::DestroyOldest()
+{
+	FUNCLOG
+
+	// Nothing to erase in an empty list.
+	if (listTargets.empty()) return;
+
+	list<CMemoryTarget>::iterator oldest = listTargets.begin();
+
+	// Linear scan; the strict '<' keeps the earliest entry when stamps are equal.
+	for (list<CMemoryTarget>::iterator cur = listTargets.begin(); cur != listTargets.end(); ++cur)
+	{
+		if (cur->usedstamp < oldest->usedstamp) oldest = cur;
+	}
+
+	listTargets.erase(oldest);
+}
--- a/plugins/zzogl-pg/opengl/ZZRenderTargets.cpp
+++ b/plugins/zzogl-pg/opengl/ZZRenderTargets.cpp
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
@ -58,6 +58,17 @@ extern void ZZDestroy();
 extern void ChangeDeviceSize(int nNewWidth, int nNewHeight);

 extern GLuint vboRect;
+
+// I'm making this variable global for the moment in the course of fiddling with the interlace code 
+// to try and make it more straightforward.
+int interlace_mode = 0; // 0 - not interlacing, 1 - interlacing.
+bool bUsingStencil = false;
+
+bool INTERLACE_COUNT() // True only when interlace_mode is non-zero and gs.interlace equals conf.interlace.
+{
+	return interlace_mode ? (gs.interlace == conf.interlace) : false;
+}
+
 // Adjusts vertex shader BitBltPos vector v to preserve aspect ratio. It used to emulate 4:3 or 16:9.
 void AdjustTransToAspect(float4& v)
 {
@ -151,20 +162,20 @@ inline void FrameSavingHelper()
 }

 // Function populated tex0Info[2] array
-inline void FrameObtainDispinfo(u32 bInterlace, tex0Info* dispinfo)
+inline void FrameObtainDispinfo(tex0Info* dispinfo)
 {
 	for (int i = 0; i < 2; ++i)
 	{
-
-		if (!(*(u32*)(PMODE) & (1 << i)))
+		if (!Circuit_Enabled(i))
 		{
 			dispinfo[i].tw = 0;
 			dispinfo[i].th = 0;
 			continue;
 		}

-		GSRegDISPFB* pfb = i ? DISPFB2 : DISPFB1;
-		GSRegDISPLAY* pd = i ? DISPLAY2 : DISPLAY1;
+		GSRegDISPFB* pfb = Dispfb_Reg(i);
+		GSRegDISPLAY* pd = Display_Reg(i);
+		
 		int magh = pd->MAGH + 1;
 		int magv = pd->MAGV + 1;

@ -177,7 +188,8 @@ inline void FrameObtainDispinfo(u32 bInterlace, tex0Info* dispinfo)
 		// hack!!
 		// 2 * dispinfo[i].tw / dispinfo[i].th <= 1, metal slug 4

-		if (bInterlace && 2 * dispinfo[i].tw / dispinfo[i].th <= 1 && !(conf.settings().interlace_2x))
+		// Note: This is what causes the double image if interlace is off on the Final Fantasy X-2 opening.
+		if (interlace_mode && 2 * dispinfo[i].tw / dispinfo[i].th <= 1 && !(conf.settings().interlace_2x))
 		{
 			dispinfo[i].th >>= 1;
 		}
@ -187,9 +199,9 @@ inline void FrameObtainDispinfo(u32 bInterlace, tex0Info* dispinfo)
 extern bool s_bWriteDepth;

 // Something should be done before Renderering the picture.
-inline void RenderStartHelper(u32 bInterlace)
+inline void RenderStartHelper()
 {
-	if (conf.mrtdepth && pvs[8] == NULL)
+	if (conf.mrtdepth && !ZZshExistProgram(pvs[8])) // keep the meaning of the replaced `pvs[8] == NULL` test: disable MRT depth only when the shader is missing
 	{
 		conf.mrtdepth = 0;
 		s_bWriteDepth = false;
@ -209,14 +221,13 @@ inline void RenderStartHelper(u32 bInterlace)
 	vb[0].fba.fba = 0;
 	vb[1].fba.fba = 0;

-	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);   // switch to the backbuffer
+	FB::Unbind();   // switch to the backbuffer

 	glViewport(0, 0, GLWin.backbuffer.w, GLWin.backbuffer.h);

 	// if interlace, only clear every other vsync
-	if (!bInterlace)
+	if (!interlace_mode)
 	{
-		//u32 color = COLOR_ARGB(0, BGCOLOR->R, BGCOLOR->G, BGCOLOR->B);
 		glClear(GL_COLOR_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
 	}

@ -232,7 +243,7 @@ inline void RenderStartHelper(u32 bInterlace)

 	GL_REPORT_ERRORD();

-	if (bInterlace) g_PrevBitwiseTexX = -1;  // reset since will be using
+	if (interlace_mode) g_PrevBitwiseTexX = -1;  // reset since will be using
 }

 // Settings for interlace texture multiplied vector;
@ -240,14 +251,14 @@ inline void RenderStartHelper(u32 bInterlace)
 // on image y coords. So if we write valpha.z * F + valpha.w + 0.5, it would be switching odd
 // and even strings at each frame.
 // valpha.x and y are used for image blending.
-inline float4 RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTSHADER* prog)
+inline float4 RenderGetForClip(int psm, CRTC_TYPE render_type)
 {
 	SetShaderCaller("RenderGetForClip");
-
+	FRAGMENTSHADER* prog = curr_pps(render_type);
 	float4 valpha;
 	// first render the current render targets, then from ptexMem

-	if (psm == 1)
+	if (psm == PSMCT24)
 	{
 		valpha.x = 1;
 		valpha.y = 0;
@ -258,9 +269,9 @@ inline float4 RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTS
 		valpha.y = 1;
 	}

-	if (bInterlace)
+	if (interlace_mode)
 	{
-		if (interlace == (conf.interlace & 1))
+		if (gs.interlace == (conf.interlace & 1))
 		{
 			// pass if odd
 			valpha.z = 1.0f;
@ -286,17 +297,21 @@ inline float4 RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTS
 }

 // Put interlaced texture in use for shader prog.
-// Note: if frame interlaced it's th is halved, so we should x2 it.
-inline void RenderCreateInterlaceTex(u32 bInterlace, int th, FRAGMENTSHADER* prog)
+// Note: if the frame is interlaced, its th is halved, so we should multiply it by 2.
+inline void RenderCreateInterlaceTex(int th, CRTC_TYPE render_type)
 {
-	if (!bInterlace) return;
+	FRAGMENTSHADER* prog;
+	int interlacetex;
 	
-	int interlacetex = CreateInterlaceTex(2 * th);
+	if (!interlace_mode) return;
+	
+	prog = curr_pps(render_type);
+	interlacetex = CreateInterlaceTex(2 * th);

 	ZZshGLSetTextureParameter(prog->prog, prog->sInterlace, interlacetex, "Interlace");
 }

-// Well, do blending setup prior to second pass of half-frame drawing
+// Do blending setup prior to second pass of half-frame drawing.
 inline void RenderSetupBlending()
 {
 	// setup right blending
@ -305,12 +320,14 @@ inline void RenderSetupBlending()

 	if (PMODE->MMOD)
 	{
+		// Use the ALP register for alpha blending.
 		glBlendColorEXT(PMODE->ALP*(1 / 255.0f), PMODE->ALP*(1 / 255.0f), PMODE->ALP*(1 / 255.0f), 0.5f);
 		s_srcrgb = GL_CONSTANT_COLOR_EXT;
 		s_dstrgb = GL_ONE_MINUS_CONSTANT_COLOR_EXT;
 	}
 	else
 	{
+		// Use the alpha value of circuit 1 for alpha blending.
 		s_srcrgb = GL_SRC_ALPHA;
 		s_dstrgb = GL_ONE_MINUS_SRC_ALPHA;
 	}
@ -332,17 +349,19 @@ inline void RenderSetupBlending()
 // each frame could be drawn in two stages, so blending should be different for them
 inline void RenderSetupStencil(int i)
 {
-	glStencilMask(1 << i);
 	s_stencilmask = 1 << i;
+	glStencilMask(s_stencilmask);
 	GL_STENCILFUNC_SET();
 }

 // do stencil check for each found target i -- texturing stage
-inline void RenderUpdateStencil(int i, bool* bUsingStencil)
+inline void RenderUpdateStencil(int i)
 {
-	if (!(*bUsingStencil)) glClear(GL_STENCIL_BUFFER_BIT);
-
-	*bUsingStencil = 1;
+	if (!bUsingStencil) 
+	{
+		glClear(GL_STENCIL_BUFFER_BIT);
+		bUsingStencil = true;
+	}

 	glEnable(GL_STENCIL_TEST);
 	GL_STENCILFUNC(GL_NOTEQUAL, 3, 1 << i);
@ -351,16 +370,16 @@ inline void RenderUpdateStencil(int i, bool* bUsingStencil)
 }

 // CRTC24 could not be rendered
-inline void RenderCRTC24helper(u32 bInterlace, int interlace, int psm)
+/*inline void RenderCRTC24helper(int psm)
 {
 	ZZLog::Debug_Log("ZZogl: CRTC24!!! I'm trying to show something.");
 	SetShaderCaller("RenderCRTC24helper");
 	// assume that data is already in ptexMem (do Resolve?)
-	RenderGetForClip(bInterlace, interlace, psm, &ppsCRTC24[bInterlace]);
-	ZZshSetPixelShader(ppsCRTC24[bInterlace].prog);
+	RenderGetForClip(psm, CRTC_RENDER_24);
+	ZZshSetPixelShader(curr_ppsCRTC24()->prog);
 	
 	DrawTriangleArray();
-}
+}*/

 // Maybe I do this function global-defined. Calculate bits per pixel for
 // each psm. It's the only place with PSMCT16 which have a different bpp.
@ -394,7 +413,7 @@ inline int RenderGetOffsets(int* dby, int* movy, tex0Info& texframe, CRenderTarg
 }

 // BltBit shader calculate vertex (4 coord's pixel) position at the viewport.
-inline float4 RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
+inline float4 RenderSetTargetBitPos(int dh, int th, int movy)
 {
 	SetShaderCaller("RenderSetTargetBitPos");
 	float4 v;
@ -408,7 +427,7 @@ inline float4 RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)

 	AdjustTransToAspect(v);

-	if (isInterlace)
+	if (INTERLACE_COUNT())
 	{
 		// move down by 1 pixel
 		v.w += 1.0f / (float)dh ;
@ -423,7 +442,7 @@ inline float4 RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 // For example, use tw / X and tw / X magnify the viewport.
 // Interlaced output is little out of VB, it could be seen as an evil blinking line on top
 // and bottom, so we try to remove it.
-inline float4 RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool isInterlace)
+inline float4 RenderSetTargetBitTex(float th, float tw, float dh, float dw)
 {
 	SetShaderCaller("RenderSetTargetBitTex");

@ -432,7 +451,7 @@ inline float4 RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool

 	// Incorrect Aspect ratio on interlaced frames

-	if (isInterlace)
+	if (INTERLACE_COUNT())
 	{
 		v.y -= 1.0f / conf.height;
 		v.w += 1.0f / conf.height;
@ -455,10 +474,11 @@ inline float4 RenderSetTargetBitTrans(int th)

 // use g_fInvTexDims to store inverse texture dims
 // Seems, that Targ shader does not use it
-inline float4 RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHADER* prog)
+inline float4 RenderSetTargetInvTex(int tw, int th, CRTC_TYPE render_type)
 {
 	SetShaderCaller("RenderSetTargetInvTex");

+	FRAGMENTSHADER* prog = curr_pps(render_type);
 	float4 v = float4(0, 0, 0, 0);

 	if (prog->sInvTexDims)
@ -496,17 +516,20 @@ inline bool RenderLookForABetterTarget(int fbp, int tbp, list<CRenderTarget*>& l
 	return false;
 }

-inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int i, bool* bUsingStencil, int interlace, int bInterlace);
+inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int circuit);

 // First try to draw frame from targets. 
-inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& listTargs, int i, bool* bUsingStencil, int interlace, int bInterlace)
+inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& listTargs, int circuit)
 {
 	// get the start and end addresses of the buffer
 	int bpp = RenderGetBpp(texframe.psm);
-	GSRegDISPFB* pfb = i ? DISPFB2 : DISPFB1;
+	GSRegDISPFB* pfb = Dispfb_Reg(circuit);

 	int start, end;
-	GetRectMemAddress(start, end, texframe.psm, 0, 0, texframe.tw, texframe.th, texframe.tbp0, texframe.tbw);
+	int tex_th = (interlace_mode) ? texframe.th * 2 : texframe.th;
+	
+	//ZZLog::WriteLn("Render checking for targets, circuit %d", circuit);
+	GetRectMemAddressZero(start, end, texframe.psm, texframe.tw, tex_th, texframe.tbp0, texframe.tbw);

 	// We need share list of targets between functions
 	s_RTs.GetTargs(start, end, listTargs);
@ -517,10 +540,14 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list

 		if (ptarg->fbw == texframe.tbw && !(ptarg->status&CRenderTarget::TS_NeedUpdate) && ((256 / bpp)*(texframe.tbp0 - ptarg->fbp)) % texframe.tbw == 0)
 		{
+			FRAGMENTSHADER* pps;
 			int dby = pfb->DBY;
 			int movy = 0;
 			
-			if (RenderLookForABetterTarget(ptarg->fbp, texframe.tbp0, listTargs, it)) continue;
+			if (RenderLookForABetterTarget(ptarg->fbp, texframe.tbp0, listTargs, it))
+			{
+				continue;
+			} 

 			if (g_bSaveFinalFrame) SaveTexture("frame1.tga", GL_TEXTURE_RECTANGLE_NV, ptarg->ptex, RW(ptarg->fbw), RH(ptarg->fbh));

@ -529,37 +556,42 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list

 			if (dh >= 64)
 			{
-
-				if (ptarg->fbh - dby < texframe.th - movy && !(*bUsingStencil))
-					RenderUpdateStencil(i, bUsingStencil);
-				else if (ptarg->fbh - dby > 2 * ( texframe.th - movy )) 
+				if (ptarg->fbh - dby < tex_th - movy && !bUsingStencil)
+				{
+					RenderUpdateStencil(circuit);
+				}
+				else if (ptarg->fbh - dby > 2 * ( tex_th - movy )) // I'm not sure this is needed any more.
 				{
 					// Sometimes calculated position onscreen is misaligned, ie in FFX-2 intro. In such case some part of image are out of
 					// border's and we should move it manually.
-					dby -= ((ptarg->fbh - dby) >> 2) -  ((texframe.th + movy) >> 1) ;
+					dby -= ((ptarg->fbh - dby) >> 2) -  ((tex_th + movy) >> 1);
 				}

 				SetShaderCaller("RenderCheckForTargets");

 				// Texture
-				float4 v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby), INTERLACE_COUNT);
+				float4 v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby));
 				
 				// dest rect
-				v = RenderSetTargetBitPos(dh, texframe.th, movy, INTERLACE_COUNT);
+				v = RenderSetTargetBitPos(dh, texframe.th, movy);
 				v = RenderSetTargetBitTrans(ptarg->fbh);
-				v = RenderSetTargetInvTex(bInterlace, texframe.tbw, ptarg->fbh, &ppsCRTCTarg[bInterlace]) ; 	// FIXME. This is no use
+				v = RenderSetTargetInvTex(texframe.tbw, ptarg->fbh, CRTC_RENDER_TARG); 	// FIXME. This is no use

-				float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTCTarg[bInterlace]);
+				float4 valpha = RenderGetForClip(texframe.psm, CRTC_RENDER_TARG);
+				pps = curr_ppsCRTCTarg();

 				// inside vb[0]'s target area, so render that region only
-				ZZshGLSetTextureParameter(ppsCRTCTarg[bInterlace].prog, ppsCRTCTarg[bInterlace].sFinal, ptarg->ptex, "CRTC target");
-				RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTCTarg[bInterlace]);
+				ZZshGLSetTextureParameter(pps->prog, pps->sFinal, ptarg->ptex, "CRTC target");
+				RenderCreateInterlaceTex(texframe.th, CRTC_RENDER_TARG);

-				ZZshSetPixelShader(ppsCRTCTarg[bInterlace].prog);
+				ZZshSetPixelShader(pps->prog);

 				DrawTriangleArray();

-				if (abs(dh - (int)texframe.th) <= 1) return;
+				if (abs(dh - (int)texframe.th) <= 1) 
+				{
+					return;
+				}

 				if (abs(dh - (int)ptarg->fbh) <= 1)
 				{
@ -571,14 +603,14 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list

 		++it;
 	}
-	RenderCheckForMemory(texframe, listTargs, i, bUsingStencil, interlace, bInterlace);
+	RenderCheckForMemory(texframe, listTargs, circuit);
 }


 // The same as the previous, but from memory.
 // If you ever wondered why a picture from a minute ago suddenly flashes on the screen (say, in Mana Khemia),
 // this is the function that does it.
-inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int i, bool* bUsingStencil, int interlace, int bInterlace)
+inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int circuit)
 {
 	float4 v;
 	
@ -588,9 +620,9 @@ inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listT
 	}

 	// context has to be 0
-	if (bInterlace >= 2) ZZLog::Error_Log("CRCR Check for memory shader fault.");
+	if (interlace_mode >= 2) ZZLog::Error_Log("CRCR Check for memory shader fault.");

-	//if (!(*bUsingStencil)) RenderUpdateStencil(i, bUsingStencil);
+	//if (!bUsingStencil) RenderUpdateStencil(i);
 		
 	SetShaderCaller("RenderCheckForMemory");

@ -601,7 +633,7 @@ inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listT
 		h1 = texframe.th;
 		w2 = -0.5f;
 		h2 = -0.5f;
-		SetTexVariablesInt(0, 2, texframe, false, &ppsCRTC[bInterlace], 1);
+		SetTexVariablesInt(0, 2, texframe, false, curr_ppsCRTC(), 1);
 	}
 	else
 	{
@ -609,24 +641,24 @@ inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listT
 		h1 = 1;
 		w2 = -0.5f / (float)texframe.tw;
 		h2 = -0.5f / (float)texframe.th;
-		SetTexVariablesInt(0, 0, texframe, false, &ppsCRTC[bInterlace], 1);
+		SetTexVariablesInt(0, 0, texframe, false, curr_ppsCRTC(), 1);
 	}
 	
 	if (g_bSaveFinalFrame) SaveTex(&texframe, g_bSaveFinalFrame - 1 > 0);
 	
 	// Fixme: Why is this here?
 	// We should probably call RenderSetTargetBitTex instead.
-	v = RenderSetTargetBitTex(w1, h1, w2, h2, INTERLACE_COUNT);
+	v = RenderSetTargetBitTex(w1, h1, w2, h2);

 	// finally render from the memory (note that the stencil buffer will keep previous regions)
-	v = RenderSetTargetBitPos(1, 1, 0, INTERLACE_COUNT);
+	v = RenderSetTargetBitPos(1, 1, 0);
 	v = RenderSetTargetBitTrans(texframe.th);
-	v = RenderSetTargetInvTex(bInterlace, texframe.tw, texframe.th, &ppsCRTC[bInterlace]);
-	float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTC[bInterlace]);
+	v = RenderSetTargetInvTex(texframe.tw, texframe.th, CRTC_RENDER);
+	float4 valpha = RenderGetForClip(texframe.psm, CRTC_RENDER);

-	ZZshGLSetTextureParameter(ppsCRTC[bInterlace].prog, ppsCRTC[bInterlace].sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
-	RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTC[bInterlace]);
-	ZZshSetPixelShader(ppsCRTC[bInterlace].prog);
+	ZZshGLSetTextureParameter(curr_ppsCRTC()->prog, curr_ppsCRTC()->sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
+	RenderCreateInterlaceTex(texframe.th, CRTC_RENDER); // same shader type as the other calls on this path (replaces &ppsCRTC[bInterlace], not the Targ shader)
+	ZZshSetPixelShader(curr_ppsCRTC()->prog);
 	
 	DrawTriangleArray();
 }
@ -657,7 +689,7 @@ inline void DisplayFPS()
 	DrawText(str, left, top, 0xffc0ffff);
 }

-// SnapeShoot helper
+// Snapshot helper
 inline void MakeSnapshot()
 {
 	
@ -694,7 +726,7 @@ void ZZReset()
 	s_nLastResolveReset = 0;

 	icurctx = -1;
-	g_vsprog = g_psprog = 0;
+	g_vsprog = g_psprog = sZero;

 	ZZGSStateReset();
 	ZZDestroy();
@ -759,12 +791,10 @@ inline void AfterRendererUnimportantJob()
 	maxmin = 608;
 }

-extern u32 s_uFramebuffer;
-
 // Swich Framebuffers
 inline void AfterRendererSwitchBackToTextures()
 {
-	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, s_uFramebuffer);
+	FB::Bind();

 	g_MemTargs.DestroyCleared();

@ -815,18 +845,6 @@ inline void AfterRendererAutoresetTargets()

 				s_RTs.ResolveAll();
 				return;
-//				s_RTs.Destroy();
-//				s_DepthRTs.ResolveAll();
-//				s_DepthRTs.Destroy();
-//
-//				vb[0].prndr = NULL;
-//				vb[0].pdepth = NULL;
-//				vb[0].bNeedFrameCheck = 1;
-//				vb[0].bNeedZCheck = 1;
-//				vb[1].prndr = NULL;
-//				vb[1].pdepth = NULL;
-//				vb[1].bNeedFrameCheck = 1;
-//				vb[1].bNeedZCheck = 1;
 			}
 		}

@ -840,49 +858,59 @@ inline void AfterRendererAutoresetTargets()
 }

 int count = 0;
+
 // The main renderer function
-void RenderCRTC(int interlace)
+void RenderCRTC()
 {
-	if (FrameSkippingHelper()) return;
-
-	u32 bInterlace = SMODE2->INT && SMODE2->FFMD && (conf.interlace < 2);
-
-	RenderStartHelper(bInterlace);
-
-	bool bUsingStencil = false;
 	tex0Info dispinfo[2];
 	
-	FrameObtainDispinfo(bInterlace, dispinfo);
+	if (FrameSkippingHelper()) return;
+	
+	// If we are in frame mode and interlacing, and we haven't forced interlacing off, interlace_mode is 1.
+	interlace_mode = SMODE2->INT && SMODE2->FFMD && (conf.interlace < 2);
+	bUsingStencil = false;
+
+	RenderStartHelper();
+
+	FrameObtainDispinfo(dispinfo);
 	
 	// start from the last circuit
 	for (int i = !PMODE->SLBG; i >= 0; --i)
 	{
+		if (!Circuit_Enabled(i)) continue;
 		tex0Info& texframe = dispinfo[i];

-		if (texframe.th <= 1) continue;
+		// I don't think this is necessary, now that we make sure the circuit we are working with is enabled.
+		/*if (texframe.th <= 1) 
+		{
+			continue;
+		}*/
+		
 		if (SMODE2->INT && SMODE2->FFMD) 
 		{
 			texframe.th >>= 1;
 			
 			// Final Fantasy X-2 issue here.
-			if (conf.interlace == 2 && texframe.th >= 512) 
+			/*if (conf.interlace == 2 && texframe.th >= 512) 
+			{
 				texframe.th >>= 1;
+			}*/
 		}
 		
 		if (i == 0) RenderSetupBlending();
 		if (bUsingStencil) RenderSetupStencil(i);

-		if (texframe.psm == 0x12)
+		/*if (texframe.psm == 0x12) // Probably broken - 0x12 isn't a valid psm. 24 bit is 1.
 		{
-			RenderCRTC24helper(bInterlace, interlace, texframe.psm);
+			RenderCRTC24helper(texframe.psm);
 			continue;
-		}
+		}*/

 		// We shader targets between two functions, so declare it here;
 		list<CRenderTarget*> listTargs;

-		// if we could not draw image from target's do it from memory
-		RenderCheckForTargets(texframe, listTargs, i, &bUsingStencil, interlace, bInterlace);
+		// if we could not draw image from target's, do it from memory
+		RenderCheckForTargets(texframe, listTargs, i);
 	}

 	GL_REPORT_ERRORD();
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.h
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.h
@ -21,11 +21,8 @@
 #define ZZOGLCRTC_H_INCLUDED

 #include <stdlib.h>
-
 #include "targets.h"

-#define INTERLACE_COUNT (bInterlace && interlace == (conf.interlace))
-
 #ifdef _WIN32
 extern HDC		hDC;	   // Private GDI Device Context
 extern HGLRC	hRC;	   // Permanent Rendering Context
--- a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
@ -35,42 +35,6 @@
 #	include "Win32.h"
 #endif

-//------------------ Defines
-
-#ifdef _WIN32
-#define GL_LOADFN(name) { \
-		if( (*(void**)&name = (void*)wglGetProcAddress(#name)) == NULL ) { \
-		ZZLog::Error_Log("Failed to find %s, exiting.", #name); \
-	} \
-}
-#else
-// let GLEW take care of it
-#define GL_LOADFN(name)
-#endif
-
-#define GL_BLEND_RGB(src, dst) { \
-	s_srcrgb = src; \
-	s_dstrgb = dst; \
-	zgsBlendFuncSeparateEXT(s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha); \
-}
-
-#define GL_BLEND_ALPHA(src, dst) { \
-	s_srcalpha = src; \
-	s_dstalpha = dst; \
-	zgsBlendFuncSeparateEXT(s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha); \
-}
-
-#define GL_BLEND_ALL(srcrgb, dstrgb, srcalpha, dstalpha) { \
-	s_srcrgb = srcrgb; \
-	s_dstrgb = dstrgb; \
-	s_srcalpha = srcalpha; \
-	s_dstalpha = dstalpha; \
-	zgsBlendFuncSeparateEXT(s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha); \
-}
-
-#define GL_BLEND_SET() zgsBlendFuncSeparateEXT(s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha)
-#define VB_NUMBUFFERS			   512
-
 // ----------------- Types
 typedef void (APIENTRYP _PFNSWAPINTERVAL)(int);

@ -81,8 +45,8 @@ extern bool ZZshLoadExtraEffects();
 extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);

 GLuint vboRect = 0;
-vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
-int g_nCurVBOIndex = 0;
+GLuint g_vboBuffers[VB_NUMBUFFERS]; // VBOs for all drawing commands
+u32 g_nCurVBOIndex = 0;

 inline bool CreateImportantCheck();
 inline void CreateOtherCheck();
@ -125,10 +89,10 @@ void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum) = NULL;
 extern u8* s_lpShaderResources;

 // String's for shader file in developer mode
-#ifdef ZEROGS_DEVBUILD
+//#ifdef ZEROGS_DEVBUILD
 char* EFFECT_NAME = "";
 char* EFFECT_DIR = "";
-#endif
+//#endif

 /////////////////////
 // graphics resources
@ -143,12 +107,17 @@ GLenum g_internalRGBAFloat16Fmt = GL_RGBA_FLOAT16_ATI;
 u32 ptexLogo = 0;
 int nLogoWidth, nLogoHeight;
 u32 s_ptexInterlace = 0;		 // holds interlace fields
+static bool vb_buffer_allocated = false;

 //------------------ Global Variables
 int GPU_TEXWIDTH = 512;
 float g_fiGPU_TEXWIDTH = 1/512.0f;
 int g_MaxTexWidth = 4096, g_MaxTexHeight = 4096;
-u32 s_uFramebuffer = 0;
+
+namespace FB
+{
+	u32 buf = 0;
+};

 RasterFont* font_p = NULL;
 float g_fBlockMult = 1;
@ -157,7 +126,7 @@ float g_fBlockMult = 1;
 u32 ptexBlocks = 0, ptexConv16to32 = 0;	 // holds information on block tiling
 u32 ptexBilinearBlocks = 0;
 u32 ptexConv32to16 = 0;
-int g_nDepthBias = 0;
+// int g_nDepthBias = 0;

 extern void Delete_Avi_Capture();
 extern void ZZDestroy();
@ -505,7 +474,12 @@ bool ZZCreate(int _width, int _height)
 	GPU_TEXWIDTH = min (g_MaxTexWidth/8, 1024);
 	g_fiGPU_TEXWIDTH = 1.0f / GPU_TEXWIDTH;

+	// FIXME: not clean maybe re integrate the function in shader files --greg
+#ifndef GLSL_API
 	if (!CreateOpenShadersFile()) return false;
+#else
+	if (!ZZshCreateOpenShadersFile()) return false;
+#endif

 	GL_REPORT_ERROR();

@ -520,16 +494,16 @@ bool ZZCreate(int _width, int _height)

 	if (err != GL_NO_ERROR) bSuccess = false;

-	glGenFramebuffersEXT(1, &s_uFramebuffer);
+	FB::Create();

-	if (s_uFramebuffer == 0)
+	if (FB::buf == 0)
 	{
 		ZZLog::Error_Log("Failed to create the renderbuffer.");
 	}

 	GL_REPORT_ERRORD();

-	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, s_uFramebuffer);
+	FB::Bind();

 	DrawBuffers(s_drawbuffers);
 		
@ -600,14 +574,15 @@ bool ZZCreate(int _width, int _height)

 	g_nCurVBOIndex = 0;

-	g_vboBuffers.resize(VB_NUMBUFFERS);
-	glGenBuffers((GLsizei)g_vboBuffers.size(), &g_vboBuffers[0]);
-
-	for (int i = 0; i < (int)g_vboBuffers.size(); ++i)
+    if (!vb_buffer_allocated) {
+        glGenBuffers((GLsizei)ArraySize(g_vboBuffers), g_vboBuffers);
+        for (int i = 0; i < ArraySize(g_vboBuffers); ++i)
        {
            glBindBuffer(GL_ARRAY_BUFFER, g_vboBuffers[i]);
            glBufferData(GL_ARRAY_BUFFER, 0x100*sizeof(VertexGPU), NULL, GL_STREAM_DRAW);
        }
+        vb_buffer_allocated = true; // mark the buffer allocated
+    }

 	GL_REPORT_ERROR();
 	if (err != GL_NO_ERROR) bSuccess = false;
@ -616,6 +591,11 @@ bool ZZCreate(int _width, int _height)
 	g_fBlockMult = 1;
 	bool do_not_use_billinear = false;

+#ifndef ZZNORMAL_MEMORY
+	FillAlowedPsnTable();
+	FillBlockTables();
+#endif
+
 	vector<char> vBlockData, vBilinearData;
 	BLOCK::FillBlocks(vBlockData, vBilinearData, 1);

@ -781,7 +761,7 @@ bool ZZCreate(int _width, int _height)
 	// This was changed in SetAA - should we be changing it back?
 	glPointSize(1.0f);

-	g_nDepthBias = 0;
+	// g_nDepthBias = 0;

 	glEnable(GL_POLYGON_OFFSET_FILL);
 	glEnable(GL_POLYGON_OFFSET_LINE);
@ -791,7 +771,7 @@ bool ZZCreate(int _width, int _height)
 	vb[0].Init(VB_BUFFERSIZE);
 	vb[1].Init(VB_BUFFERSIZE);

-	g_vsprog = g_psprog = 0;
+	g_vsprog = g_psprog = sZero;

 	if (glGetError() == GL_NO_ERROR)
 	{
@ -823,10 +803,10 @@ void ZZDestroy()
 	vb[0].Destroy();
 	vb[1].Destroy();

-	if (g_vboBuffers.size() > 0)
+	if (vb_buffer_allocated)
 	{
-		glDeleteBuffers((GLsizei)g_vboBuffers.size(), &g_vboBuffers[0]);
-		g_vboBuffers.clear();
+		glDeleteBuffers((GLsizei)ArraySize(g_vboBuffers), g_vboBuffers);
+        vb_buffer_allocated = false; // mark the buffer unallocated
 	}

 	g_nCurVBOIndex = 0;
@ -864,8 +844,8 @@ void ZZDestroy()
 	SAFE_RELEASE_PROG(ppsCRTCTarg[1].prog);
 	SAFE_RELEASE_PROG(ppsCRTC[0].prog);
 	SAFE_RELEASE_PROG(ppsCRTC[1].prog);
-	SAFE_RELEASE_PROG(ppsCRTC24[0].prog);
-	SAFE_RELEASE_PROG(ppsCRTC24[1].prog);
+//	SAFE_RELEASE_PROG(ppsCRTC24[0].prog);
+//	SAFE_RELEASE_PROG(ppsCRTC24[1].prog);
 	SAFE_RELEASE_PROG(ppsOne.prog);

 	safe_delete(font_p);
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.h
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.h
@ -0,0 +1,123 @@
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+ 
+#ifndef ZZOGLFLUSH_H_INCLUDED
+#define ZZOGLFLUSH_H_INCLUDED
+
+#ifndef ZEROGS_DEVBUILD
+
+#define INC_GENVARS()
+#define INC_TEXVARS()
+#define INC_ALPHAVARS()
+#define INC_RESOLVE()
+
+#define g_bUpdateEffect 0
+#define g_bSaveTex 0
+#define g_bSaveResolved 0
+
+#else // defined(ZEROGS_DEVBUILD)
+
+#define INC_GENVARS() ++g_nGenVars
+#define INC_TEXVARS() ++g_nTexVars
+#define INC_ALPHAVARS() ++g_nAlphaVars
+#define INC_RESOLVE() ++g_nResolve
+
+extern bool g_bUpdateEffect;
+extern bool g_bSaveTex;	// saves the current texture
+extern bool g_bSaveResolved;
+#endif // !defined(ZEROGS_DEVBUILD)
+
+// Bit flags stored in the stencil buffer. Each enumerator is a distinct single bit, so
+// the values can be OR-ed together.
+enum StencilBits
+{
+	STENCIL_ALPHABIT = 1,		// if set, dest alpha >= 0x80
+	STENCIL_PIXELWRITE = 2,		// if set, pixel just written (reset after every Flush)
+	STENCIL_FBA = 4,			// if set, just written pixel's alpha >= 0 (reset after every Flush)
+	STENCIL_SPECIAL = 8		// if set, indicates that pixel passed its alpha test (reset after every Flush)
+	//STENCIL_PBE = 16	
+};
+// Mask of bits 2, 4, 8 and 16: STENCIL_PIXELWRITE | STENCIL_FBA | STENCIL_SPECIAL plus the
+// value 16 that matches the commented-out STENCIL_PBE. STENCIL_ALPHABIT is not included.
+#define STENCIL_CLEAR	   (2|4|8|16)
+
+// One bit per color channel; combine with | to describe which channels are writable.
+enum ColorMask 
+{
+	COLORMASK_RED = 1,
+	COLORMASK_GREEN = 2,
+	COLORMASK_BLUE = 4,
+	COLORMASK_ALPHA = 8
+	
+};
+// Apply a ColorMask bit set through glColorMask: each channel bit is normalized to 0 or 1
+// with !! before being passed as the matching red/green/blue/alpha argument.
+#define GL_COLORMASK(mask) glColorMask(!!((mask)&COLORMASK_RED), !!((mask)&COLORMASK_GREEN), !!((mask)&COLORMASK_BLUE), !!((mask)&COLORMASK_ALPHA))
+
+// extern int g_nDepthBias;
+extern float g_fBlockMult; // used for old cards, that do not support Alpha-32float textures. We store block data in u16 and use it.
+extern u32 g_nCurVBOIndex;
+extern u8* g_pbyGSClut;
+extern int ppf;
+
+extern bool s_bTexFlush;
+
+extern vector<u32> s_vecTempTextures;		   // temporary textures, released at the end of every frame
+extern GLuint g_vboBuffers[VB_NUMBUFFERS]; // VBOs for all drawing commands
+extern CRangeManager s_RangeMngr; // manages overwritten memory				// zz
+
+#if 0
+typedef union
+{
+	struct
+	{
+		u8 _bNeedAlphaColor;		// set if vAlphaBlendColor needs to be set
+		u8 _b2XAlphaTest;		   // Only valid when bNeedAlphaColor is set. if 1st bit set set, double all alpha testing values
+		// otherwise alpha testing needs to be done separately.
+		u8 _bDestAlphaColor;		// set to 1 if blending with dest color (process only one tri at a time). If 2, dest alpha is always 1.
+		u8 _bAlphaClamping;	 // if first bit is set, do min; if second bit, do max
+	};
+
+	u32 _bAlphaState;
+} g_flag_vars;
+
+extern g_flag_vars g_vars;
+#endif
+
+//#define bNeedAlphaColor g_vars._bNeedAlphaColor
+//#define b2XAlphaTest g_vars._b2XAlphaTest
+//#define bDestAlphaColor g_vars._bDestAlphaColor
+//#define bAlphaClamping g_vars._bAlphaClamping
+
+void FlushTransferRanges(const tex0Info* ptex);						//zz
+
+// use to update the state
+void SetTexVariables(int context, FRAGMENTSHADER* pfragment);			// zz
+void SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint);		// zz
+void SetAlphaVariables(const alphaInfo& ainfo);					// zzz
+//void ResetAlphaVariables();
+
+inline void SetAlphaTestInt(pixTest curtest);
+
+inline void RenderAlphaTest(const VB& curvb, ZZshParameter sOneColor);
+inline void RenderStencil(const VB& curvb, u32 dwUsingSpecialTesting);
+inline void ProcessStencil(const VB& curvb);
+inline void RenderFBA(const VB& curvb, ZZshParameter sOneColor);
+inline void ProcessFBA(const VB& curvb, ZZshParameter sOneColor);			// zz
+
+void SetContextTarget(int context);
+
+void SetWriteDepth();
+bool IsWriteDepth();
+void SetDestAlphaTest();
+
+#endif // ZZOGLFLUSH_H_INCLUDED
--- a/plugins/zzogl-pg/opengl/ZZoglMath.h
+++ b/plugins/zzogl-pg/opengl/ZZoglMath.h
@ -212,7 +212,7 @@ typedef Vector4<float> float4;

 // Reimplement, swiping a bunch of code from GSdx and adapting it. (specifically GSVector.h)
 // This doesn't include more then half of the functions in there, as well as some of the structs...
-#include <xmmintrin.h>
+#include <immintrin.h>

 #include "Pcsx2Types.h"

--- a/plugins/zzogl-pg/opengl/ZZoglMem.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglMem.cpp
@ -0,0 +1,564 @@
+/*  ZeroGS KOSMOS
+ *  Copyright (C) 2005-2006 zerofrog@gmail.com
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "GS.h"
+#include "Util.h"
+#include "ZZoglMem.h"
+#include "targets.h"
+#include "x86.h"
+
+#include "Mem_Swizzle.h"
+
+#ifndef ZZNORMAL_MEMORY
+
+bool allowed_psm[256] = {false, };			// Sometimes we got strange unknown psm
+PSM_value PSM_value_Table[64] = {PSMT_BAD_PSM, };	// for int -> PSM_value
+
+// Build a row-pointer view over a flat table so that it can be indexed as array[row][col].
+// bh is the number of rows and bw the number of u32 entries per row; when they match the
+// real dimensions of table, array[i][j] refers to the same element as table[i][j].
+// Only the array of row pointers is allocated (with malloc); the table data is not copied.
+// The caller owns the returned array and releases it with free().
+// Returns NULL when the allocation fails.
+inline u32** InitTable(int bh, int bw, u32* table) {
+	u32** array = (u32**)malloc(bh * sizeof(u32*));
+	if (array == NULL)
+		return NULL;	// allocation failed: never write through a null pointer
+
+	for (int i = 0; i < bh; i++) {
+		array[i] = &table[i * bw];
+	}
+	return array;
+}
+
+// Set up the row-pointer tables (u32**) for one pixel storage format.
+// For every psm handled below, g_pageTable[psm], g_blockTable[psm] and g_columnTable[psm]
+// are pointed at row-pointer arrays built by InitTable over the matching static tables.
+// A psm value without a case is ignored (there is no default branch).
+// PSMCT24, PSMT8H, PSMT4HL and PSMT4HH copy the page-table pointer of PSMCT32, and PSMT24Z
+// copies the one of PSMT32Z, so those base formats must already have been set up.
+// NOTE(review): every call allocates new arrays without releasing earlier ones — confirm
+// that callers pair FillBlockTables with DestroyBlockTables.
+inline void SetTable(int psm) {
+	switch (psm) {
+		case PSMCT32:
+			g_pageTable[psm]   = InitTable( 32,  64, &g_pageTable32[0][0]);
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable32[0][0]);
+			g_columnTable[psm] = InitTable(  8,   8, &g_columnTable32[0][0]);
+			break;
+			
+		case PSMCT24:
+			// shares the PSMCT32 page table; block/column row pointers are allocated separately
+			g_pageTable[psm]   = g_pageTable[PSMCT32];;
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable32[0][0]);
+			g_columnTable[psm] = InitTable(  8,   8, &g_columnTable32[0][0]);
+			break;
+			
+		case PSMCT16:
+			g_pageTable[psm]   = InitTable( 64,  64, &g_pageTable16[0][0]);
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable16[0][0]);
+			g_columnTable[psm] = InitTable(  8,  16, &g_columnTable16[0][0]);
+			break;
+			
+		case PSMCT16S:
+			g_pageTable[psm]   = InitTable( 64,  64, &g_pageTable16S[0][0]);
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable16S[0][0]);
+			g_columnTable[psm] = InitTable(  8,  16, &g_columnTable16[0][0]);
+			break;
+			
+		case PSMT8:
+			g_pageTable[psm]   = InitTable( 64, 128, &g_pageTable8[0][0]);
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable8[0][0]);
+			g_columnTable[psm] = InitTable( 16,  16, &g_columnTable8[0][0]);
+			break;
+			
+		case PSMT8H:
+			// page table shared with PSMCT32, block/column tables taken from the 8-bit format
+			g_pageTable[psm]   = g_pageTable[PSMCT32];
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable8[0][0]);
+			g_columnTable[psm] = InitTable( 16,  16, &g_columnTable8[0][0]);
+			break;
+			
+		case PSMT4:
+			g_pageTable[psm]   = InitTable(128, 128, &g_pageTable4[0][0]);
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable4[0][0]);
+			g_columnTable[psm] = InitTable( 16,  32, &g_columnTable4[0][0]);
+			break;	
+					
+		case PSMT4HL:
+		case PSMT4HH:
+			// page table shared with PSMCT32, block/column tables taken from the 4-bit format
+			g_pageTable[psm]   = g_pageTable[PSMCT32];
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable4[0][0]);
+			g_columnTable[psm] = InitTable( 16,  32, &g_columnTable4[0][0]);
+			break;
+			
+		case PSMT32Z:
+			g_pageTable[psm]   = InitTable( 32,  64, &g_pageTable32Z[0][0]);
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable32Z[0][0]);
+			g_columnTable[psm] = InitTable(  8,   8, &g_columnTable32[0][0]);
+			break;
+			
+		case PSMT24Z:
+			// shares the PSMT32Z page table
+			g_pageTable[psm]   = g_pageTable[PSMT32Z];
+			g_blockTable[psm]  = InitTable(  4,   8, &g_blockTable32Z[0][0]);
+			g_columnTable[psm] = InitTable(  8,   8, &g_columnTable32[0][0]);
+			break;
+			
+		case PSMT16Z:
+			g_pageTable[psm]   = InitTable( 64,  64, &g_pageTable16Z[0][0]);
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable16Z[0][0]);
+			g_columnTable[psm] = InitTable(  8,  16, &g_columnTable16[0][0]);
+			break;
+			
+		case PSMT16SZ:
+			g_pageTable[psm]   = InitTable( 64,  64, &g_pageTable16SZ[0][0]);
+			g_blockTable[psm]  = InitTable(  8,   4, &g_blockTable16SZ[0][0]);
+			g_columnTable[psm] = InitTable(  8,  16, &g_columnTable16[0][0]);
+			break;
+	}
+}
+
+// Run SetTable for every format index below MAX_PSM so that the u32** lookup
+// tables of all handled formats are allocated and filled.
+void FillBlockTables() {
+	for (int psm = 0; psm < MAX_PSM; ++psm) {
+		SetTable(psm);
+	}
+}
+
+// Release the row-pointer arrays created by SetTable/InitTable.
+// Formats that only borrow another format's page table (PSMCT24, PSMT8H, PSMT4HL, PSMT4HH
+// borrow PSMCT32's; PSMT24Z borrows PSMT32Z's) must not free it a second time.
+// Every slot is reset to NULL after release so that no dangling pointer remains and a
+// repeated call cannot double-free.
+void DestroyBlockTables() {
+	for (int i = 0; i < MAX_PSM; i++) {
+		const bool borrowsPageTable = (i == PSMT8H || i == PSMT4HL || i == PSMT4HH || i == PSMCT24 || i == PSMT24Z);
+
+		if (!borrowsPageTable)
+			free(g_pageTable[i]);	// free(NULL) is a no-op, so unused slots are fine
+		g_pageTable[i] = NULL;
+
+		free(g_blockTable[i]);
+		g_blockTable[i] = NULL;
+
+		free(g_columnTable[i]);
+		g_columnTable[i] = NULL;
+	}
+}
+
+// Build g_pageTable2 / g_pageTableNew for every psm flagged in allowed_psm.
+// Each entry combines two values derived from the regular page table entry `address`:
+//   address >> ZZ_DT[psm][0]   in the low part, and
+//   shift << 16                above it,
+// with shift = (((address << ZZ_DT[psm][5]) & 0x7) << 3) + ZZ_DT[psm][7].
+// k counts the allowed formats seen so far and selects the g_pageTable2 slice.
+// NOTE(review): the loops fill indices 0..126 only, while InitTable is told the slice is
+// 128 x 128 — confirm against the declared dimensions of g_pageTable2.
+void FillNewPageTable() {
+	int k = 0;
+	for (int psm = 0; psm < MAX_PSM; psm ++)
+		if (allowed_psm[psm]) {
+			for (u32 i = 0; i < 127; i++)
+				for(u32 j = 0; j < 127; j++) {
+					u32 address;
+					u32 shift;
+					
+					// indices are masked with ZZ_DT[psm][3] / ZZ_DT[psm][4] before the lookup
+					address = g_pageTable[psm][i & ZZ_DT[psm][3]][j & ZZ_DT[psm][4]];
+					shift = (((address << ZZ_DT[psm][5]) & 0x7 ) << 3)+ ZZ_DT[psm][7]; 				// ZZ_DT[psm][7] is the extra offset for 8H, 4HL and 4HH (original note: "data from 24 and 28 byte" — presumably bit offsets, confirm)
+					g_pageTable2[k][i][j] = (address >> ZZ_DT[psm][0]) + (shift << 16); 			// low part: address >> ZZ_DT[psm][0];
+																	// shift is stored from bit 16 upward.
+				}
+			g_pageTableNew[psm]   = InitTable( 128,  128, &g_pageTable2[k][0][0]);
+			k++;;
+		}
+}
+
+BLOCK m_Blocks[MAX_PSM]; // Do so that blocks are indexable.
+
+// Single-pixel transfer used for the unaligned start and end of each row.
+// Writes pixels one at a time while j < endj and pixels remain (nSize > 0), advancing
+// j and k and decrementing nSize for each pixel written.
+// Returns true when no pixels are left to transfer (nSize == 0).
+// widthlimit is accepted for symmetry with the callers but is not used here.
+template <int psm>
+inline bool DoOneTransmitStep(void* pstart, int& nSize, int endj, const void* pbuf, int& k, int& i, int& j, int widthlimit) {
+	while (j < endj && nSize > 0) {
+		writePixelMem<psm, false>((u32*)pstart, j%2048, i%2048, (u32*)(pbuf), k, gs.dstbuf.bw);
+		++j;
+		++k;
+		--nSize;
+	}
+
+	const bool finished = (nSize == 0);
+	return finished;
+}
+
+// FFX has PSMT8 transmit (starting intro -- sword and hairs).
+// Persona 4 texts at start are PSMCT32 (and there is also a PSMCT16 transmit somewhere after that).
+// Tekken V has PSMCT24 and PSMT4 transfers
+
+// Transfer pixels row by row ("Y" direction) from pbuf into GS memory at pstart, for rows
+// i up to (but not including) endY. i, j and k are updated in place: j is the current x,
+// k the running pixel index into pbuf, nSize the number of pixels still to transfer.
+// Within a row, Unaligned_Start pixels are written one at a time (DoOneTransmitStep),
+// then groups of widthlimit pixels (writePixelsFromMemory), then single pixels again up
+// to gs.imageEnd.x; j is reset to gs.trxpos.dx before the next row.
+// Returns true as soon as a single-pixel step reports nSize == 0, false once every row up
+// to endY has been processed.
+// Original author's note: unlike Zerofrog's code, which only took the fast path when the
+// transferred width was a multiple of widthlimit, this version handles the unaligned start
+// and end of each row separately; a fast path for the case where both are 0 was left as
+// something to check.
+template <int psm, int widthlimit>
+inline bool TRANSMIT_HOSTLOCAL_Y(u32* pbuf, int& nSize, u8* pstart, int endY, int& i, int& j, int& k) {
+//	if (psm != PSMT8 && psm != 0 && psm != PSMT4 && psm != PSMCT24)
+//		ERROR_LOG("This is usable function TRANSMIT_HOSTLOCAL_Y at ZZoglMem.cpp %d %d %d %d %d\n", psm, widthlimit, i, j, nSize);
+
+	// NOTE(review): q is passed as the end column (endj), not as a pixel count — confirm this is intended.
+	int q = (gs.trxpos.dx - j) % widthlimit; 
+	if (DoOneTransmitStep<psm>(pstart, nSize, q, pbuf, k, i, j, widthlimit)) return true;						// Intended result: j and dx are congruent modulo widthlimit
+	
+	int Unaligned_Start = (gs.trxpos.dx % widthlimit == 0) ? 0 : widthlimit - gs.trxpos.dx % widthlimit;					// gs.trxpos.dx + Unaligned_Start is a multiple of widthlimit
+	for (; i < endY; ++i) {
+		if (DoOneTransmitStep<psm>(pstart, nSize, j + Unaligned_Start, pbuf, k, i, j, widthlimit)) return true;			// Intended result: j % widthlimit == 0.
+		//assert (j % widthlimit != 0);												 
+
+		// k is not advanced by this loop header; presumably writePixelsFromMemory updates it by reference — confirm.
+		for (; j < gs.imageEnd.x - widthlimit + 1 && nSize >= widthlimit; j += widthlimit, nSize -= widthlimit) { 			
+			writePixelsFromMemory<psm, true, widthlimit>(pstart, pbuf, k, j % 2048, i % 2048,  gs.dstbuf.bw);
+		}
+	
+		assert ( gs.imageEnd.x - j < widthlimit || nSize < widthlimit);	
+		if (DoOneTransmitStep<psm>(pstart, nSize, gs.imageEnd.x, pbuf, k, i, j, widthlimit)) return true;				// The previous loop ends for one of two reasons: 1) nSize < widthlimit,
+																		// 2) j > gs.imageEnd.x - widthlimit. Either way fewer than widthlimit
+																		// pixels are written here, one at a time, up to gs.imageEnd.x.
+		j = gs.trxpos.dx; 
+	}	
+
+	return false;
+}
+
+// Transfers the columns from startX up to gs.imageEnd.x for blockheight consecutive rows
+// starting at row i, one pixel at a time. After each row k skips (pitch - fracX) source
+// pixels. Logs through ZZLog::Error_Log when instantiated for a format other than PSMT8
+// or PSMT4 (original note: PSMT4 is used by Tekken V).
+template <int psm, int widthlimit>
+inline void TRANSMIT_HOSTLOCAL_X(u32* pbuf, int& nSize, u8* pstart, int& i, int& j, int& k, int blockheight, int startX, int pitch, int fracX) {
+	if (!(psm == PSMT8 || psm == PSMT4))
+		ZZLog::Error_Log("This is usable function TRANSMIT_HOSTLOCAL_X at ZZoglMem.cpp %d %d %d %d %d\n", psm, widthlimit, i, j, nSize);
+
+	for (int row = 0; row < blockheight; ++row) {
+		j = startX;
+		while (j < gs.imageEnd.x) {
+			writePixelMem<psm, false>((u32*)pstart, j%2048, (i + row)%2048, (u32*)(pbuf), k, gs.dstbuf.bw);
+			j++;
+			k++;
+		}
+		// skip the part of the source row that is not handled by this function
+		k += ( pitch - fracX );
+	}
+}
+
+// Size in bytes of `pitch` pixels of format psm: bits per pixel times pixel count,
+// divided by 8 with a right shift.
+template <int psm>
+inline int TRANSMIT_PITCH(int pitch) {
+	return (pitch * PSM_BITS_PER_PIXEL<psm>()) >> 3;
+}
+
+// ------------------------
+// |              Y       |
+// ------------------------
+// |        block     |   |
+// |   aligned area   | X |
+// |                  |   |
+// ------------------------
+// |              Y       |
+// ------------------------
+
+
+// Common exit path of TransferHostLocal.
+// While rows remain (i < gs.imageEnd.y) the current position is stored in gs.image so the
+// next call can resume there; otherwise the transfer is marked as finished.
+// Returns (nSize * TRANSMIT_PITCH<psmX>(2) + nLeftOver) / 2.
+template <int psmX>
+int FinishTransfer(int i, int j, int nSize, int nLeftOver)
+{
+	if (i < gs.imageEnd.y)
+	{
+		/* remember where to resume */
+		gs.image.y = i;
+		gs.image.x = j;
+	}
+	else
+	{
+		assert( gs.transferring == false || i == gs.imageEnd.y );
+		gs.transferring = false;
+	}
+
+	const int pitchOfTwo = TRANSMIT_PITCH<psmX>(2);
+	return (nSize * pitchOfTwo + nLeftOver)/2;
+}
+ 
+// Host -> local transfer of image data for one pixel format.
+// Template arguments: psmX is the pixel storage format, widthlimit the pixel count used
+// for grouped writes, blockbits the factor that scales getPixelAddress results into byte
+// offsets, blockwidth/blockheight the block size used for alignment.
+// pbyMem is the source data; nQWordSize is converted to a pixel count
+// (nQWordSize*8 / TRANSMIT_PITCH<psmX>(2)) and clamped to gs.imageNew.w * gs.imageNew.h.
+// The transfer resumes at (gs.image.x, gs.image.y) and runs in up to three phases:
+//   1. row by row until i reaches a block boundary, or for the whole image when the
+//      block-aligned path cannot be used;
+//   2. block by block with SwizzleBlock, plus TRANSMIT_HOSTLOCAL_X for the columns to the
+//      right of the last full block;
+//   3. row by row for whatever remains.
+// Returns the value computed by FinishTransfer, which also updates gs.image or
+// gs.transferring.
+template<int psmX, int widthlimit, int blockbits, int blockwidth, int blockheight>
+int TransferHostLocal(const void* pbyMem, u32 nQWordSize) 
+{ 
+	assert( gs.imageTransfer == XFER_HOST_TO_LOCAL );
+	u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256;
+
+	int i = gs.image.y, j = gs.image.x;
+	
+	const u8* pbuf = (const u8*)pbyMem;
+	// remainder and quotient of the input size relative to the byte size of two pixels
+	int nLeftOver = (nQWordSize*4*2)%(TRANSMIT_PITCH<psmX>(2));
+	int nSize = nQWordSize*4*2/TRANSMIT_PITCH<psmX>(2);
+	nSize = min(nSize, gs.imageNew.w * gs.imageNew.h);
+
+	int pitch, area, fracX;
+	int endY = ROUND_UPPOW2(i, blockheight);
+	Point alignedPt;
+	
+	// largest block-aligned coordinates that do not exceed the image end
+	alignedPt.x = ROUND_DOWNPOW2(gs.imageEnd.x, blockwidth);
+	alignedPt.y = ROUND_DOWNPOW2(gs.imageEnd.y, blockheight);
+	
+	bool bAligned;
+	// the block path needs a block-aligned start column, the cursor at the start of a row,
+	// and at least one full block row and block column inside the image
+	bool bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedPt.y > endY) && alignedPt.x > gs.trxpos.dx;
+
+	if( (gs.imageEnd.x - gs.trxpos.dx) % widthlimit ) {
+		/* hack */
+		int testwidth = (int)nSize - (gs.imageEnd.y - i) * (gs.imageEnd.x - gs.trxpos.dx) + (j - gs.trxpos.dx);
+		if((testwidth <= widthlimit) && (testwidth >= -widthlimit)) {
+			/* don't transfer */
+			/*ZZLog::Debug_Log("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEnd.x, nQWordSize);*/
+			gs.transferring = false;
+		}
+		bCanAlign = false;
+	}
+
+	/* first align on block boundary */
+	if( MOD_POW2(i, blockheight) || !bCanAlign ) {
+	
+		if( !bCanAlign )
+			endY = gs.imageEnd.y; /* transfer the whole image */
+		else
+			assert( endY < gs.imageEnd.y); /* part of alignment condition */
+		
+		// NOTE(review): limit is computed here but never read; the call below uses the
+		// widthlimit template argument instead.
+		int limit = widthlimit;
+		if (((gs.imageEnd.x - gs.trxpos.dx) % widthlimit) || ((gs.imageEnd.x - j) % widthlimit)) 
+			/* transmit with a width of 1 */
+			limit = 1 + (gs.dstbuf.psm == PSMT4);
+		/*TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, limit, endY)*/
+		int k = 0;
+		
+		if (TRANSMIT_HOSTLOCAL_Y<psmX, widthlimit>((u32*)pbuf, nSize, pstart, endY, i, j, k)) 
+			return FinishTransfer<psmX>(i, j, nSize, nLeftOver);
+		
+		// advance the source pointer past the k pixels consumed above
+		pbuf += TRANSMIT_PITCH<psmX>(k);
+		
+		if (nSize == 0 || i == gs.imageEnd.y) return FinishTransfer<psmX>(i, j, nSize, nLeftOver);
+	}
+
+	assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx);
+
+	/* can align! */
+	pitch = gs.imageEnd.x - gs.trxpos.dx;
+	area = pitch * blockheight;
+	fracX = gs.imageEnd.x - alignedPt.x;
+
+	/* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */
+	// NOTE(review): bAligned is assigned here but not read again in this function.
+	bAligned = !((uptr)pbuf & 0xf) && ((TRANSMIT_PITCH<psmX>(pitch)&0xf) == 0);
+	
+	/* transfer aligning to blocks */
+	for(; i < alignedPt.y && nSize >= area; i += blockheight, nSize -= area) {
+	
+		for(int tempj = gs.trxpos.dx; tempj < alignedPt.x; tempj += blockwidth, pbuf += TRANSMIT_PITCH<psmX>(blockwidth)) {
+			SwizzleBlock<psmX>((u32*)(pstart + getPixelAddress<psmX>(tempj, i, gs.dstbuf.bw)*blockbits/8),
+				(u32*)pbuf, TRANSMIT_PITCH<psmX>(pitch));
+		}
+	
+		/* transfer the rest */
+		if( alignedPt.x < gs.imageEnd.x ) {
+			int k = 0;
+			TRANSMIT_HOSTLOCAL_X<psmX, widthlimit>((u32*)pbuf, nSize, pstart, i, j, k, blockheight, alignedPt.x, pitch, fracX);
+			pbuf += TRANSMIT_PITCH<psmX>(k - alignedPt.x + gs.trxpos.dx);
+		}
+		else pbuf += (blockheight-1)*TRANSMIT_PITCH<psmX>(pitch);
+		j = gs.trxpos.dx;
+	}
+
+	// remaining rows that did not fill a whole block row
+	if( TRANSMIT_PITCH<psmX>(nSize)/4 > 0 ) {
+		int k = 0;
+		TRANSMIT_HOSTLOCAL_Y<psmX, widthlimit>((u32*)pbuf, nSize, pstart, gs.imageEnd.y, i, j, k);
+		pbuf += TRANSMIT_PITCH<psmX>(k);
+		/* sometimes wrong sizes are sent (tekken tag) */
+		assert( gs.transferring == false || TRANSMIT_PITCH<psmX>(nSize)/4 <= 2 );
+	}
+	
+	return FinishTransfer<psmX>(i, j, nSize, nLeftOver);
+}
+
+// Per-format entry points for host -> local transfers. Each wrapper only selects the
+// TransferHostLocal template arguments, in this order: psmX, widthlimit, blockbits,
+// blockwidth, blockheight, and forwards pbyMem / nQWordSize unchanged.
+// BLOCK::FillBlocks stores these functions in BLOCK::TransferHostLocal.
+inline int TransferHostLocal32(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMCT32, 2, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT32Z, 2, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal24(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMCT24, 8, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal24Z(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT24Z, 8, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal16(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMCT16, 4, 16, 16, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal16S(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMCT16S, 4, 16, 16, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal16Z(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT16Z, 4, 16, 16, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal16SZ(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT16SZ, 4, 16, 16, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal8(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT8, 4, 8, 16, 16>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal4(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT4, 8, 4, 32, 16>( pbyMem, nQWordSize);
+}
+
+// The "high bits" formats below use the 32-bit block geometry (blockbits 32, 8x8 blocks).
+inline int TransferHostLocal8H(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT8H, 4, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal4HL(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT4HL, 8, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+inline int TransferHostLocal4HH(const void* pbyMem, u32 nQWordSize) 
+{
+	return TransferHostLocal<PSMT4HH, 8, 32, 8, 8>( pbyMem, nQWordSize);
+}
+
+// Local -> host transfer entry points, one per pixel format. In this file every body
+// consists solely of FUNCLOG, so neither argument is used here.
+// BLOCK::FillBlocks stores these functions in BLOCK::TransferLocalHost.
+void TransferLocalHost32(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost24(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost16(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost16S(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost8(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost4(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost8H(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost4HL(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost4HH(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost32Z(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost24Z(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost16Z(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+void TransferLocalHost16SZ(void* pbyMem, u32 nQWordSize) { FUNCLOG }
+
+// Fill the BLOCK description `b` for format psmX and write its lookup data into the
+// shared block textures.
+//   floatfmt      - non-zero: vBlockData holds floats and vBilinearData is filled as well;
+//                   zero: vBlockData holds u16 values and vBilinearData is left untouched.
+//   vBlockData    - destination, addressed as rows of BLOCK_TEXWIDTH elements.
+//   vBilinearData - destination for float4 entries (only when floatfmt is set).
+//   ox, oy        - element offset of this format's region inside the block texture.
+// The page size comes from ZZ_DT: bw = ZZ_DT[psmX][4] + 1, bh = ZZ_DT[psmX][3] + 1, and
+// mult = 1 << ZZ_DT[psmX][0].
+// Side effect: b.pageTable is the same pointer as g_pageTable[psmX], so the loop below
+// also overwrites the global page table of this format.
+inline void FILL_BLOCK(BLOCK& b, int floatfmt, vector<char>& vBlockData, vector<char>& vBilinearData, int ox, int oy, int psmX) { 
+	int bw = ZZ_DT[psmX][4] + 1;
+	int bh = ZZ_DT[psmX][3] + 1;
+	int mult = 1 << ZZ_DT[psmX][0];
+
+	b.vTexDims = float4 (BLOCK_TEXWIDTH/(float)(bw), BLOCK_TEXHEIGHT/(float)(bh), 0, 0); 
+	b.vTexBlock = float4( (float)bw/BLOCK_TEXWIDTH, (float)bh/BLOCK_TEXHEIGHT, ((float)ox+0.2f)/BLOCK_TEXWIDTH, ((float)oy+0.05f)/BLOCK_TEXHEIGHT); 
+	b.width = bw; 
+	b.height = bh; 
+	b.colwidth = bh / 4; 
+	b.colheight = bw / 8; 
+	b.bpp = 32/mult; 
+	
+	b.pageTable = g_pageTable[psmX]; 
+	b.blockTable = g_blockTable[psmX]; 
+	b.columnTable = g_columnTable[psmX]; 
+	
+	// This is never true.
+	//assert( sizeof(g_pageTable[psmX]) == bw*bh*sizeof(g_pageTable[psmX][0][0]) ); 
+	// Two views of the same destination: float elements or u16 elements, both offset by
+	// (ox, oy) in units of their own element type.
+	float* psrcf = (float*)&vBlockData[0] + ox + oy * BLOCK_TEXWIDTH; 
+	u16* psrcw = (u16*)&vBlockData[0] + ox + oy * BLOCK_TEXWIDTH; 
+	for(int i = 0; i < bh; ++i) { 
+		for(int j = 0; j < bw; ++j) { 
+			/* fill the table */ 
+			// entry = block index * 64 * mult + position inside the column
+			u32 u = g_blockTable[psmX][(i / b.colheight)][(j / b.colwidth)] * 64 * mult + g_columnTable[psmX][i%b.colheight][j%b.colwidth]; 
+			b.pageTable[i][j] = u; 
+			if( floatfmt ) { 
+				psrcf[i*BLOCK_TEXWIDTH+j] = (float)(u) / (float)(GPU_TEXWIDTH*mult); 
+			} 
+			else { 
+				psrcw[i*BLOCK_TEXWIDTH+j] = u; 
+			} 
+		} 
+	} 
+	
+	if( floatfmt ) { 
+		// For each entry store itself (x), its right neighbour (y), the entry below (z) and
+		// the diagonal one (w); neighbours wrap around at bw / bh.
+		float4* psrcv = (float4*)&vBilinearData[0] + ox + oy * BLOCK_TEXWIDTH; 
+		for(int i = 0; i < bh; ++i) { 
+			for(int j = 0; j < bw; ++j) { 
+				float4* pv = &psrcv[i*BLOCK_TEXWIDTH+j]; 
+				pv->x = psrcf[i*BLOCK_TEXWIDTH+j]; 
+				pv->y = psrcf[i*BLOCK_TEXWIDTH+((j+1)%bw)]; 
+				pv->z = psrcf[((i+1)%bh)*BLOCK_TEXWIDTH+j]; 
+				pv->w = psrcf[((i+1)%bh)*BLOCK_TEXWIDTH+((j+1)%bw)]; 
+			} 
+		} 
+	} 
+}
+
+// Populate m_Blocks and the block lookup data for every supported pixel format.
+//   vBlockData    - resized to BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT elements of 4 bytes
+//                   (floatfmt set) or 2 bytes (floatfmt zero) and filled by FILL_BLOCK.
+//   vBilinearData - resized and filled only when floatfmt is set.
+// m_Blocks is zeroed first, so formats without an assignment below stay cleared.
+// The local `b` is reused: formats that share a layout with the preceding FILL_BLOCK call
+// copy it and only replace the two transfer function pointers, so statement order matters.
+void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, int floatfmt)
+{
+	FUNCLOG
+	vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * (floatfmt ? 4 : 2));
+	
+	if (floatfmt)
+		vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
+
+	BLOCK b;
+
+	memset(m_Blocks, 0, sizeof(m_Blocks));
+
+	// 32
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData, 0, 0, PSMCT32);
+	b.TransferHostLocal = TransferHostLocal32;
+	b.TransferLocalHost = TransferLocalHost32;
+	m_Blocks[PSMCT32] = b;
+
+	// 24 (same as 32 except write/readPixel are different)
+	b.TransferHostLocal = TransferHostLocal24;
+	b.TransferLocalHost = TransferLocalHost24;
+	 m_Blocks[PSMCT24] = b;
+
+	// 8H (same as 32 except write/readPixel are different)
+	b.TransferHostLocal = TransferHostLocal8H;
+	b.TransferLocalHost = TransferLocalHost8H;	 
+	m_Blocks[PSMT8H] = b;
+
+	// 4HL (reuses the 32-bit block data still held in b)
+	b.TransferHostLocal = TransferHostLocal4HL;
+	b.TransferLocalHost = TransferLocalHost4HL;
+	m_Blocks[PSMT4HL] = b;
+
+	// 4HH (reuses the 32-bit block data still held in b)
+	b.TransferHostLocal = TransferHostLocal4HH;
+	b.TransferLocalHost = TransferLocalHost4HH;
+	m_Blocks[PSMT4HH] = b;
+
+	// 32z
+	FILL_BLOCK(b, floatfmt, vBlockData, vBilinearData, 64, 0, PSMT32Z);
+	b.TransferHostLocal = TransferHostLocal32Z;
+	b.TransferLocalHost = TransferLocalHost32Z;
+	m_Blocks[PSMT32Z] = b;
+
+	// 24Z (same as 32Z except write/readPixel are different)
+	b.TransferHostLocal = TransferHostLocal24Z;
+	b.TransferLocalHost = TransferLocalHost24Z;
+	m_Blocks[PSMT24Z] = b;
+
+	// 16
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData,  0, 32, PSMCT16);
+	b.TransferHostLocal = TransferHostLocal16;
+	b.TransferLocalHost = TransferLocalHost16;
+	m_Blocks[PSMCT16] = b;
+
+	// 16s
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData,  64, 32, PSMCT16S);
+	b.TransferHostLocal = TransferHostLocal16S;
+	b.TransferLocalHost = TransferLocalHost16S;
+	m_Blocks[PSMCT16S] = b;
+
+	// 16z
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData,  0, 96, PSMT16Z);
+	b.TransferHostLocal = TransferHostLocal16Z;
+	b.TransferLocalHost = TransferLocalHost16Z;
+	m_Blocks[PSMT16Z] = b;
+
+	// 16sz
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData, 64, 96, PSMT16SZ);
+	b.TransferHostLocal = TransferHostLocal16SZ;
+	b.TransferLocalHost = TransferLocalHost16SZ;
+	m_Blocks[PSMT16SZ] = b;
+
+	// 8
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData,  0, 160, PSMT8);
+	b.TransferHostLocal = TransferHostLocal8;
+	b.TransferLocalHost = TransferLocalHost8;
+	m_Blocks[PSMT8] = b;
+
+	// 4
+	FILL_BLOCK(b, floatfmt,  vBlockData, vBilinearData,  0, 224, PSMT4);
+	b.TransferHostLocal = TransferHostLocal4;
+	b.TransferLocalHost = TransferLocalHost4;
+	m_Blocks[PSMT4] = b;
+}
+
+#endif
--- a/plugins/zzogl-pg/opengl/ZZoglMem.h
+++ b/plugins/zzogl-pg/opengl/ZZoglMem.h
@ -0,0 +1,790 @@
+/*  ZeroGS KOSMOS
+ *  Copyright (C) 2005-2006 zerofrog@gmail.com
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __ZZOGL_MEM_H__
+#define __ZZOGL_MEM_H__
+
+#include <assert.h>
+#include <vector>
+#include "GS.h"
+#include "Util.h"
+#include "Mem.h"
+
+#ifndef ZZNORMAL_MEMORY
+
+extern u32 g_blockTable32[4][8];
+extern u32 g_blockTable32Z[4][8];
+extern u32 g_blockTable16[8][4];
+extern u32 g_blockTable16S[8][4];
+
+extern u32 g_blockTable16Z[8][4];
+
+extern u32 g_blockTable16SZ[8][4];
+
+extern u32 g_blockTable8[4][8];
+extern u32 g_blockTable4[8][4];
+
+extern u32 g_columnTable32[8][8];
+extern u32 g_columnTable16[8][16];
+extern u32 g_columnTable8[16][16];
+extern u32 g_columnTable4[16][32];
+
+//--
+
+extern u32 g_pageTable32[32][64];
+extern u32 g_pageTable32Z[32][64];
+extern u32 g_pageTable16[64][64];
+extern u32 g_pageTable16S[64][64];
+extern u32 g_pageTable16Z[64][64];
+extern u32 g_pageTable16SZ[64][64];
+extern u32 g_pageTable8[64][128];
+extern u32 g_pageTable4[128][128];
+
+
+//maximum PSM is 58, so our arrays have 58 + 1 = 59 elements
+
+// This table is used for fast access to memory storage data. 
+extern u32 ZZ_DT[MAX_PSM][TABLE_WIDTH];
+
+
+//maximum PSM is 58, so our arrays have 58 + 1 = 59 elements
+extern u32** g_pageTable[MAX_PSM];
+extern u32** g_blockTable[MAX_PSM];
+extern u32** g_columnTable[MAX_PSM];
+extern u32 g_pageTable2[MAX_PSM][127][127];
+extern u32** g_pageTableNew[MAX_PSM];
+
+// rest not visible externally
+// Description of one pixel storage format. FillBlocks() stores one BLOCK per format
+// into m_Blocks[], indexed by the PSM constant.
+struct BLOCK
+{
+	// Every member (including the function pointers) starts out as zero bytes.
+	BLOCK() { memset(this, 0, sizeof(BLOCK)); }
+
+	// shader constants for this block
+	float4 vTexBlock;
+	float4 vTexDims;
+	int width, height;	// dims of one page in pixels
+	int bpp;
+	int colwidth, colheight;
+	u32** pageTable;	// offset inside each page
+	u32** blockTable;
+	u32** columnTable;
+
+	// Nobody uses the accessors below any more; they are kept commented out until they are removed.
+//	u32 (*getPixelAddress)(int x, int y, u32 bp, u32 bw);
+//	u32 (*getPixelAddress_0)(int x, int y, u32 bw);
+//	void (*writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw);
+//	void (*writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw);
+//	u32 (*readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw);
+//	u32 (*readPixel_0)(const void* pmem, int x, int y, u32 bw);
+	// Per-format transfer routines; FillBlocks() assigns the TransferHostLocal*/TransferLocalHost* functions here.
+	int (*TransferHostLocal)(const void* pbyMem, u32 nQWordSize);
+	void (*TransferLocalHost)(void* pbyMem, u32 nQWordSize);
+
+	// texture must be of dims BLOCK_TEXWIDTH and BLOCK_TEXHEIGHT
+	static void FillBlocks(std::vector<char>& vBlockData, std::vector<char>& vBilinearData, int floatfmt);
+};
+
+void FillBlockTables();
+void DestroyBlockTables();
+void FillNewPageTable();
+
+extern BLOCK m_Blocks[];
+
+extern u32 g_blockTable32[4][8];
+extern u32 g_blockTable32Z[4][8];
+extern u32 g_blockTable16[8][4];
+extern u32 g_blockTable16S[8][4];
+extern u32 g_blockTable16Z[8][4];
+extern u32 g_blockTable16SZ[8][4];
+extern u32 g_blockTable8[4][8];
+extern u32 g_blockTable4[8][4];
+
+extern u32 g_columnTable32[8][8];
+extern u32 g_columnTable16[8][16];
+extern u32 g_columnTable8[16][16];
+extern u32 g_columnTable4[16][32];
+
+extern u32 g_pageTable32[32][64];
+extern u32 g_pageTable32Z[32][64];
+extern u32 g_pageTable16[64][64];
+extern u32 g_pageTable16S[64][64];
+extern u32 g_pageTable16Z[64][64];
+extern u32 g_pageTable16SZ[64][64];
+extern u32 g_pageTable8[64][128];
+extern u32 g_pageTable4[128][128];
+
+
+extern u32** g_pageTable[MAX_PSM];
+extern u32** g_blockTable[MAX_PSM];
+extern u32** g_columnTable[MAX_PSM];
+extern u32 ZZ_DT[MAX_PSM][TABLE_WIDTH];
+extern u32** g_pageTableNew[MAX_PSM];
+
+// Store pixel into *dst, replacing only the bits selected by mask.
+// A full mask (0xffffffff) degenerates into a plain store without reading *dst.
+static __forceinline void MaskedOR(u32* dst, u32 pixel, u32 mask = 0xffffffff) {
+	if (mask != 0xffffffff) {
+		const u32 kept = *dst & (~mask);
+		*dst = kept | (pixel & mask);
+		return;
+	}
+
+	*dst = pixel;
+}
+
+// These two macros may look pointless, but they serve one important purpose: inside each case the
+// runtime psm value (and psm2 in the second macro) is turned into a compile-time constant
+// (psmC / psmC1), so the optimiser can select the proper template instantiation.
+#define PSM_SWITCHCASE(X) { \
+	switch (psm) { \
+		case PSMCT32: { \
+     			const int psmC = PSMCT32; \
+			X; } \
+			break; \
+		case PSMT32Z: { \
+			const int psmC = PSMT32Z; \
+			X; } \
+			break; \
+		case PSMCT24: { \
+      			const int psmC = PSMCT24; \
+			X; }  \
+			break; \
+		case PSMT24Z: { \
+			const int psmC = PSMT24Z; \
+			X; }  \
+			break; \
+		case PSMCT16: { \
+      			const int psmC = PSMCT16; \
+			X; }  \
+			break; \
+		case PSMCT16S: { \
+      			const int psmC = PSMCT16S; \
+			X; }  \
+			break; \
+		case PSMT16Z: { \
+			const int psmC = PSMT16Z; \
+			X; }  \
+			break; \
+		case PSMT16SZ: { \
+			const int psmC = PSMT16SZ; \
+			X; }  \
+			break; \
+		case PSMT8: { \
+			const int psmC = PSMT8; \
+			X; }  \
+			break; \
+		case PSMT8H: { \
+			const int psmC = PSMT8H; \
+			X; }  \
+			break; \
+		case PSMT4HH: { \
+			const int psmC = PSMT4HH; \
+			X; }  \
+			break; \
+		case PSMT4HL: { \
+			const int psmC = PSMT4HL; \
+			X; }  \
+			break; \
+		case PSMT4: { \
+			const int psmC = PSMT4; \
+			X; }  \
+			break; \
+	}\
+}
+
+// Two-format variant of PSM_SWITCHCASE: dispatches on the runtime values psm and psm2 and expands X
+// with the compile-time constants psmC (== psm) and psmC1 (derived from psm2).
+// Only formats of the same pixel width are paired: 32/32Z, 24/24Z, the four 16-bit formats,
+// 8/8H and 4/4HL/4HH. In the two-format groups psm2 is compared against the first format of the
+// pair (PSMCT32, PSMCT24, PSMT8); any other value selects the second one (PSMT32Z, PSMT24Z, PSMT8H).
+// In the switch-based groups an unlisted psm2 expands nothing.
+#define PSM_SWITCHCASE_2(X) { \
+	switch (psm) { \
+		case PSMCT32: \
+			if( psm2 == PSMCT32 ) 		{ const int psmC = PSMCT32, psmC1 = PSMCT32; X; } \
+			else 				{ const int psmC = PSMCT32, psmC1 = PSMT32Z; X; } \
+			break; \
+		case PSMCT24: \
+			if( psm2 == PSMCT24 ) 		{ const int psmC = PSMCT24, psmC1 = PSMCT24; X; } \
+			else 				{ const int psmC = PSMCT24, psmC1 = PSMT24Z; X; } \
+			break; \
+		case PSMT32Z: \
+			if( psm2 == PSMCT32 ) 		{ const int psmC = PSMT32Z, psmC1 = PSMCT32; X; } \
+			else  				{ const int psmC = PSMT32Z, psmC1 = PSMT32Z; X; } \
+			break; \
+		case PSMT24Z: \
+			if( psm2 == PSMCT24 ) 		{ const int psmC = PSMT24Z, psmC1 = PSMCT24; X; } \
+			else 				{ const int psmC = PSMT24Z, psmC1 = PSMT24Z; X; } \
+			break; \
+		case PSMCT16: \
+			switch(psm2) { \
+				case PSMCT16: 		{ const int psmC = PSMCT16, psmC1 = PSMCT16; X; }  break; \
+				case PSMCT16S:  	{ const int psmC = PSMCT16, psmC1 = PSMCT16S; X; } break; \
+				case PSMT16Z: 	      	{ const int psmC = PSMCT16, psmC1 = PSMT16Z; X; }  break; \
+				case PSMT16SZ: 	      	{ const int psmC = PSMCT16, psmC1 = PSMT16SZ; X; } break; \
+			} \
+			break; \
+		case PSMCT16S: \
+			switch(psm2) { \
+				case PSMCT16: 		{ const int psmC = PSMCT16S, psmC1 = PSMCT16; X; }  break; \
+				case PSMCT16S:  	{ const int psmC = PSMCT16S, psmC1 = PSMCT16S; X; } break; \
+				case PSMT16Z: 	      	{ const int psmC = PSMCT16S, psmC1 = PSMT16Z; X; }  break; \
+				case PSMT16SZ: 	      	{ const int psmC = PSMCT16S, psmC1 = PSMT16SZ; X; } break; \
+			} \
+			break; \
+		case PSMT16Z: \
+			switch(psm2) { \
+				case PSMCT16: 		{ const int psmC = PSMT16Z, psmC1 = PSMCT16; X; }  break; \
+				case PSMCT16S:  	{ const int psmC = PSMT16Z, psmC1 = PSMCT16S; X; } break; \
+				case PSMT16Z: 	      	{ const int psmC = PSMT16Z, psmC1 = PSMT16Z; X; }  break; \
+				case PSMT16SZ: 	      	{ const int psmC = PSMT16Z, psmC1 = PSMT16SZ; X; } break; \
+			} \
+			break; \
+		case PSMT16SZ: \
+			switch(psm2) { \
+				case PSMCT16: 		{ const int psmC = PSMT16SZ, psmC1 = PSMCT16; X; }  break; \
+				case PSMCT16S:  	{ const int psmC = PSMT16SZ, psmC1 = PSMCT16S; X; } break; \
+				case PSMT16Z: 	      	{ const int psmC = PSMT16SZ, psmC1 = PSMT16Z; X; }  break; \
+				case PSMT16SZ: 	      	{ const int psmC = PSMT16SZ, psmC1 = PSMT16SZ; X; } break; \
+			} \
+			break; \
+		case PSMT8: \
+			if( psm2 == PSMT8 ) 		{ const int psmC = PSMT8, psmC1 = PSMT8; X; }   \
+			else		  		{ const int psmC = PSMT8, psmC1 = PSMT8H; X; }  \
+			break; \
+		case PSMT8H: \
+			if( psm2 == PSMT8 ) 		{ const int psmC = PSMT8H, psmC1 = PSMT8; X; }  \
+			else		  		{ const int psmC = PSMT8H, psmC1 = PSMT8H; X; } \
+			break; \
+		case PSMT4: \
+			switch(psm2) { \
+				case PSMT4: 		{ const int psmC = PSMT4, psmC1 = PSMT4; X; }  break; \
+				case PSMT4HL:  		{ const int psmC = PSMT4, psmC1 = PSMT4HL; X; } break; \
+				case PSMT4HH: 	      	{ const int psmC = PSMT4, psmC1 = PSMT4HH; X; }  break; \
+			} \
+			break; \
+		case PSMT4HL: \
+			switch(psm2) { \
+				case PSMT4: 		{ const int psmC = PSMT4HL, psmC1 = PSMT4; X; }  break; \
+				case PSMT4HL:  		{ const int psmC = PSMT4HL, psmC1 = PSMT4HL; X; } break; \
+				case PSMT4HH: 	      	{ const int psmC = PSMT4HL, psmC1 = PSMT4HH; X; }  break; \
+			} \
+			break; \
+		case PSMT4HH: \
+			switch(psm2) { \
+				case PSMT4: 		{ const int psmC = PSMT4HH, psmC1 = PSMT4; X; }  break; \
+				case PSMT4HL:  		{ const int psmC = PSMT4HH, psmC1 = PSMT4HL; X; } break; \
+				case PSMT4HH: 	      	{ const int psmC = PSMT4HH, psmC1 = PSMT4HH; X; }  break; \
+			} \
+			break; \
+		} \
+}
+
+// Fill the per-format constants used by the address helpers in this file. psm is a template
+// parameter, so the switch is resolved at compile time. How the outputs are consumed here:
+//   A -- right shift applied to y when computing the basepage (D == (1 << A) - 1)
+//   B -- right shift applied to x and to bw when computing the basepage (E == (1 << B) - 1)
+//   C -- scale shift: getPixelAddress shifts the page start left by C,
+//        getPixelOffset shifts the page table entry left by (3 - C)
+//   D -- mask applied to y before indexing g_pageTable[psm]
+//   E -- mask applied to x before indexing g_pageTable[psm]
+//   F -- bit position at which the pixel starts inside its 32-bit word (non-zero for 8H/4HL/4HH)
+//   G -- bit mask of one pixel, before shifting
+//   H -- not read by any helper visible in this header; presumably a per-word pixel count -- TODO confirm
+// A psm that is not listed leaves all outputs untouched.
+template <int psm> 
+static __forceinline void setPsmtConstantsX(u8& A, u8& B, u8& C, u8& D, u8& E, u8& F, u32& G, u8& H)  { 
+	switch (psm) { 
+		case PSMCT32: 
+		case PSMT32Z: 
+			A = 5; B = 6; C = 0; D = 31; E = 63; F = 0; H = 1; G = 0xffffffff; 
+			break; 
+
+		case PSMCT24: 
+		case PSMT24Z: 
+			A = 5; B = 6; C = 0; D = 31; E = 63; F = 0; H = 1; G = 0xffffff; 
+			break; 
+
+		case PSMT8H: 
+			A = 5; B = 6; C = 0; D = 31; E = 63; F = 24; H = 4; G = 0xff;
+			break;
+
+		case PSMT4HH: 
+			A = 5; B = 6; C = 0; D = 31; E = 63; F = 28; H = 8; G = 0xf;
+			break;
+	       	
+		case PSMT4HL: 
+			A = 5; B = 6; C = 0; D = 31; E = 63; F = 24; H = 8; G = 0xf;
+			break; 
+
+		case PSMCT16: 
+		case PSMT16Z: 
+		case PSMCT16S: 
+		case PSMT16SZ: 
+			A = 6; B = 6; C = 1; D = 63; E = 63; F = 0; H = 2; G = 0xffff;
+			break; 
+
+		case PSMT8: 
+			A = 6; B = 7; C = 2; D = 63; E = 127; F = 0; H = 4; G = 0xff;
+			break; 
+
+		case PSMT4: 
+			A = 7; B = 7; C = 3; D = 127; E = 127; F = 0; H = 8; G = 0xf; 
+			break; 
+	} 
+}
+
+// This is where the NEW_CODE define used to be.
+
+// ------------------------------------------ get Address functions ------------------------------------
+// Yes, only 1 function to all cases of life! 
+// Warning! We switch bp and bw for usage of default value, so be warned! It's
+// not C, it's C++, so not it.
+// Address of pixel (x, y) in units of one pixel of format psm: the start of the basepage that
+// holds the pixel, scaled by the format's C constant, plus the g_pageTable offset inside the page.
+// Note the argument order: bw comes before bp so that bp can default to 0.
+template <int psm>
+static __forceinline u32 getPixelAddress(int x, int y, u32 bw, u32 bp = 0) {
+	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
+	u32 G = 0;
+	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);
+
+	const u32 basepage = ((y >> A) * (bw >> B)) + (x >> B);
+	const u32 pageStart = (bp * 64 + basepage * 2048) << C;
+
+	return pageStart + g_pageTable[psm][y & D][x & E];
+}
+
+// It's Zerofrog's function. I need to eliminate them all! All access should be 32-bit aligned.
+// Runtime-psm wrapper: PSM_SWITCHCASE turns psm into the compile-time constant psmC and forwards
+// to getPixelAddress<psmC>. Returns 0 when psm matches none of the cases of the macro.
+static __forceinline u32 getPixelAddress(int psm, int x, int y, u32 bw, u32 bp = 0) {
+	PSM_SWITCHCASE(return getPixelAddress<psmC>(x, y, bw, bp) ;)
+	return 0;
+}
+
+// This is compatibility code, for reference,
+#define Def_getPixelAddress(psmT, psmX) \
+	static __forceinline u32 getPixelAddress##psmT(int x, int y, u32 bp, u32 bw) { \
+		return getPixelAddress<psmX>(x, y, bw, bp); } \
+	static __forceinline u32 getPixelAddress##psmT##_0(int x, int y, u32 bw) { \
+		return getPixelAddress<psmX>(x, y, bw); } \
+
+Def_getPixelAddress(32, PSMCT32)
+Def_getPixelAddress(16, PSMCT16)
+Def_getPixelAddress(16S, PSMCT16S)
+Def_getPixelAddress(8, PSMT8)
+Def_getPixelAddress(4, PSMT4)
+Def_getPixelAddress(32Z, PSMT32Z)
+Def_getPixelAddress(16Z, PSMT16Z)
+Def_getPixelAddress(16SZ, PSMT16SZ)
+
+#define getPixelAddress24 getPixelAddress32
+#define getPixelAddress24_0 getPixelAddress32_0
+#define getPixelAddress8H getPixelAddress32
+#define getPixelAddress8H_0 getPixelAddress32_0
+#define getPixelAddress4HL getPixelAddress32
+#define getPixelAddress4HL_0 getPixelAddress32_0
+#define getPixelAddress4HH getPixelAddress32
+#define getPixelAddress4HH_0 getPixelAddress32_0
+#define getPixelAddress24Z getPixelAddress32Z
+#define getPixelAddress24Z_0 getPixelAddress32Z_0
+
+// Check FFX-1 (very begining) for PSMT8
+// Check Tekken menu for PSMT4
+// ZZ_DT[7] is needed only for PSMT8H, PSMT4HL and PSMT4HH -- at this case word contain data not from a begining.
+
+// This function return shift from 32-bit aligned address and shift -- number of byte in u32 order.
+// so if ((u32*)mem + getPixelAddress_Aligned32) is exact location of u32, where our pixel data stored. 
+// Just for remember:
+// PMSCT32, 24, 32Z, 24Z, 8HH, 4HL and 4HH have ZZ_DT[psm] == 3, so shift is always 0.
+// PSMCT16, 16S, 16SZ, 16Z have 		ZZ_DT[psm] == 2, so shift is 0 or 16.
+// PSMT8					ZZ_DT[psm] == 1,    shift is 0, 8, 16, 24
+// PSMT4					ZZ_DT[psm] == 0,    shift is 0, 4, 8, 12, 16, 20, 24, 28.
+
+// It allow us to made a fast access to pixels in the same basepage: if x % N == 0 (N = 1, 2, 4, 8, .. 64)
+// than we could guarantee that all pixels form x to x + N - 1 are in the same basepage.
+// Pointer to the first 32-bit word of the basepage that contains pixel (x, y):
+// pmem advanced by bp * 64 + basepage * 2048 words.
+template <int psm>
+static __forceinline u32* getPixelBasepage(const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
+	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
+	u32 G = 0;
+	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);
+
+	const u32 basepage = ((y >> A) * (bw >> B)) + (x >> B);
+	u32* const words = (u32*)pmem;
+
+	return words + (bp * 64 + basepage * 2048);
+}
+
+// And this is offset for this pixels.
+// Locate pixel (x, y) relative to pmem using only the in-page table; no basepage is added here,
+// so pmem is expected to point at the pixel's basepage already (see getPixelBasepage).
+//   mask  -- in/out: ANDed with the pixel's bit mask (G moved to the pixel's bit position)
+//   shift -- out: bit position of the pixel inside the returned word
+// Returns the address of the 32-bit word that contains the pixel.
+template <int psm>
+static __forceinline u32* getPixelOffset(u32& mask, u32& shift, const void* pmem, int x, int y) {
+	u32 word;
+
+	u8 A = 0, B = 0, C = 0 , D = 0, E = 0, F = 0; u32 G = 0; u8 H = 0;
+	setPsmtConstantsX<psm> (A, B, C, D, E, F, G, H);
+
+	// Page table entry rescaled to 4-bit units: the low 3 bits select the nibble inside a 32-bit
+	// word (times 4 gives the bit shift), the remaining bits are the word index.
+	word = (g_pageTable[psm][y&D][x&E] << (3 - C));
+	shift = ((word & 0x7) << 2) + F;
+	mask &= G << shift; 
+
+	return ((u32*)pmem + ((word & ~0x7) >> 3));
+}
+
+
+// 32-bit aligned address of pixel (x, y): getPixelBasepage selects the page, getPixelOffset the
+// word inside it. mask is narrowed to the pixel's bits and shift receives its bit position.
+template <int psm>
+static __forceinline u32* getPixelAddress_A32(u32& mask, u32& shift, const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
+	return getPixelOffset<psm>(mask, shift, getPixelBasepage<psm>(pmem, x, y, bw, bp), x, y);
+
+}
+
+// Same word address as getPixelAddress_A32<psm> computes, but without producing a mask or a shift.
+template <int psm>
+static __forceinline u32* getPixelBaseAddress_A32(const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
+	u32 word;
+	
+	u8 A = 0, B = 0, C = 0 , D = 0, E = 0, F = 0; u32 G = 0; u8 H = 0;
+	setPsmtConstantsX<psm> (A, B, C, D, E, F, G, H);
+
+	// Page table entry rescaled to 4-bit units; dropping the low 3 bits leaves the 32-bit word index.
+	word = (g_pageTable[psm][y&D][x&E] << (3 - C));
+	return ((u32*)getPixelBasepage<psm>(pmem, x, y, bw, bp) + ((word & ~0x7) >> 3));
+}
+
+// Wrapper for cases, where psm is not constant, should be avoided inside cycles
+// Dispatches the runtime psm to getPixelAddress_A32<psmC> through PSM_SWITCHCASE.
+// Returns 0 (a null pointer) when psm matches none of the cases of the macro.
+static __forceinline u32* getPixelAddress_A32(u32& mask, u32& shift, int psm, const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
+	PSM_SWITCHCASE( return getPixelAddress_A32<psmC>(mask, shift, pmem, x, y, bw, bp) );
+	return 0;
+}
+
+// Address of the CLUT entry selected by tex0.csa: 64 bytes per (csa & 15) step from pmem.
+// When PSMT_ISHALF(tex0.cpsm) holds, csa values of 16 and above add a further 2 bytes.
+static __forceinline u32* getClutAddress(u8* pmem, const tex0Info& tex0) {
+	u8* entry = pmem + 64 * (tex0.csa & 15);
+
+	if (PSMT_ISHALF(tex0.cpsm) && tex0.csa >= 16)
+		entry += 2;
+
+	return (u32*)entry;
+}
+
+//--------------------------------------------- Write Pixel -----------------------------------------------------------
+// Set proper mask for transfering multiple bytes per word.
+// Expand a per-pixel write mask into a mask for a whole 32-bit word.
+//  - PSMT8H / PSMT4HL / PSMT4HH: the pixel does not start at bit 0, so a fixed mask covering the
+//    pixel's bits is returned and Writemask is ignored.
+//  - every other format: Writemask is cut down to the G = PSM_BITS_PER_PIXEL<psm>() low bits and
+//    that pattern is repeated: once more when G < 24, four times in total when G < 16 and
+//    eight times in total when G < 8.
+template <int psm>
+inline u32 HandleWritemask(u32 Writemask) {
+	const u8 G = PSM_BITS_PER_PIXEL<psm>();
+
+	switch (psm) {
+		case PSMT8H:						// modes with non-zero start bit should be handled differently
+			return 0xff000000;
+		case PSMT4HL:
+			return 0x0f000000;
+		case PSMT4HH:
+			return 0xf0000000;
+		default:
+			break;
+	}
+
+	// Drop all bits in writemask that could not be used. Shifting a 32-bit value by 32 or more bits
+	// is undefined behaviour, so a pixel that fills the whole word keeps the mask as it is instead
+	// of evaluating (1 << 32) - 1.
+	const u32 dmask = (G >= 32) ? Writemask : (Writemask & ((1u << G) - 1));
+	u32 mask = dmask;						// 32 targets and lower
+
+	if (G < 24) {
+		mask |= dmask << G;					// 16 targets and lower
+
+		if (G < 16) {
+			mask |= dmask << (2 * G);			// 8 targets and lower
+			mask |= dmask << (3 * G);
+
+			if (G < 8) {
+				mask |= dmask << (4 * G);		// 4 targets
+				mask |= dmask << (5 * G);
+				mask |= dmask << (6 * G);
+				mask |= dmask << (7 * G);
+			}
+		}
+	}
+
+	return mask;
+}
+
+//push pixel data at position x,y, according psm storage format. pixel do not need to be properly masked, wrong bit's would not be used
+//mask should be made according PSM.
+// Write one pixel at (x, y). getPixelAddress_A32 narrows mask to the pixel's bits and reports
+// the bit position; the pixel is moved to that position and merged into the word with MaskedOR.
+template <int psm>
+static __forceinline void writePixel(void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 shift;
+	u32* p = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
+
+	MaskedOR (p, pixel << shift, mask);
+}
+
+// Runtime-psm wrapper around writePixel<psmC>; does nothing when psm matches none of the
+// cases of PSM_SWITCHCASE.
+static __forceinline void writePixel(int psm, void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	PSM_SWITCHCASE(writePixel<psmC>(pmem, x, y, pixel, bw, bp, mask)); 
+}
+
+// Put pixel data from memory. Pixel is p, memory start from pixel, and we should count pmove words and shift resulting word to shift 
+// 24 targets could be outside of 32-bit borders.
+// Merge source data into the destination word *p under mask.
+//   pixel -- start of the source data
+//   pmove -- number of whole 32-bit words to skip in the source
+//   shift -- positive: move the source word left by shift bits, otherwise right by -shift bits
+// For every format except the 24-bit ones a single source word is read. A 24-bit pixel can
+// straddle two source words, so those formats read 64 bits and shift right; the caller must make
+// sure that shift is not positive there and that the 8 bytes at pixel + pmove are readable.
+template <int psm>
+static __forceinline void pushPixelMem(u32* p, u32* pixel, int pmove, int shift, u32 mask = 0xffffffff) {
+	// Both comparisons must hold: with '||' the condition is always true and the 24-bit branch
+	// below would never be taken.
+	if (psm != PSMCT24 && psm != PSMT24Z) {
+		if (shift > 0)
+			MaskedOR (p, (*(pixel + pmove)) << (shift), mask);
+		else
+			MaskedOR (p, (*(pixel + pmove)) >> (-shift), mask);
+	}
+	else {									// for 24 and 24Z psm data could be not-aligned by 32.
+		u64 pixel64 = (*(u64*)(pixel + pmove) ) >> (-shift);		// we read more data, but for 24 targets shift is never positive and resulting data is u32
+		MaskedOR(p, (u32)pixel64, mask);				// drop upper part, we don't need it. all data is stored in lower part of u64 after shift
+	}
+}
+
+// use it if pixel already shifted by needed number of bytes. 
+// offseted mean that we should skip basepage calculation, pmem is link to basepage'ed memory. Just a little quicker.
+// Write pixel number count of the packed source buffer pixel to position (x, y).
+// With offseted != 0, pmem is used as given and only the in-page offset is applied
+// (getPixelOffset); otherwise the full address is computed from bw and bp.
+template <int psm, int offseted>
+static __forceinline void writePixelMem(const void* pmem, int x, int y, u32* pixel, int count, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 shift;
+	u32* p;
+
+	if (offseted)	
+		p = getPixelOffset<psm>(mask, shift, pmem, x, y);
+	else
+		p = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
+
+	int A = PSM_BITS_PER_PIXEL<psm>();
+
+	// Position of source pixel number count: pmove whole words plus pshift bits into the next word.
+	int pmove = (count * A) >> 5;
+	int pshift = (count * A) & 31;			// we assume, that if shift outside word, than user want next pixel data
+
+	// The difference between destination and source bit positions tells pushPixelMem which way to shift.
+	pushPixelMem<psm>(p, pixel, pmove, (int)shift - pshift, mask);
+}	
+
+
+// This function push several pixels. Note, that for 32, 24, 8HH, 4HL, 4HH it's simply write (and pixel should not be properly masked), 16 do push 2 pixels (and x should be even).
+// 8 push 4 pixels: 0,0; 0,1; 1,0 and 1,1. 4 push 8: 0,0; 0,1; 1,0; 1,1; 2,0, 2,1; 3,0; 3,1.
+// Write a whole, already assembled word to the location of pixel (x, y).
+// maskA only absorbs the narrowing done by getPixelAddress_A32; the store itself uses the
+// caller's mask unchanged, and pixel is not shifted.
+template <int psm>
+static __forceinline void writePixelWord(const void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 maskA = mask, shift;
+	u32* p = getPixelAddress_A32<psm>(maskA, shift, pmem, x, y, bw, bp);
+
+/*	if (PSM_NON_FULL_WORD<psm>())			
+		maskA = maskA & mask;
+	else
+		maskA = mask;*/
+	
+	MaskedOR (p, pixel, mask);
+}	
+
+// ------------------------------------- Read Pixel ---------------------------------------
+// Read one pixel at (x, y): the containing word is masked with the pixel's bits (further limited
+// by the caller's mask) and moved down so that the pixel starts at bit 0.
+template <int psm>
+static __forceinline u32 readPixel(const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 shift;
+	u32* p = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
+
+	return ((*p & mask) >> shift);
+}
+
+// Runtime-psm wrapper around readPixel<psmC>; returns 0 when psm matches none of the
+// cases of PSM_SWITCHCASE.
+static __forceinline u32 readPixel(int psm, const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	PSM_SWITCHCASE(return readPixel<psmC>(pmem, x, y, bw, bp, mask););
+	return 0;
+}	
+
+// Read the word that contains pixel (x, y) without shifting it.
+// When PSM_NON_FULL_WORD<psm>() holds, mask is first narrowed to the pixel's own bits by
+// getPixelAddress_A32; otherwise the narrowing goes into the scratch maskA and the caller's
+// mask is applied to the word as given.
+template <int psm>
+static __forceinline u32 readPixelWord(const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 maskA = 0xffffffff, shift;
+	if (PSM_NON_FULL_WORD<psm>())
+		return *getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp) & mask;
+	else
+		return *getPixelAddress_A32<psm>(maskA, shift, pmem, x, y, bw, bp) & mask;
+}
+
+template <int psm>
+static __forceinline void fillMemoryFromPixels(u32* dst, const void* pmem, int& count, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 pixel;
+
+	u8 I  = PSM_BITS_PER_PIXEL<psm>(); 
+	int K = count / PSM_PIXELS_STORED_PER_WORD<psm>();				// offset for pmem, count for 32, count / 2 for 16, etc.
+
+		pixel = readPixel<psm>(pmem, x, y, bw, bp, mask);			// I prefer not to use for here. It's slow
+	if (I < 32) {
+		pixel += readPixel<psm>(pmem, x + 1, y, bw, bp, mask) << I;
+	if (I < 16) {									// 8 and 4 targets
+		pixel += readPixel<psm>(pmem, x + 2, y, bw, bp, mask) << (2 * I);
+		pixel += readPixel<psm>(pmem, x + 3, y, bw, bp, mask) << (3 * I);
+	if (I < 8) {									// This is for 4, 4HH and 4HL
+		pixel += readPixel<psm>(pmem, x + 4, y, bw, bp, mask) << (4 * I);
+		pixel += readPixel<psm>(pmem, x + 5, y, bw, bp, mask) << (5 * I);
+		pixel += readPixel<psm>(pmem, x + 6, y, bw, bp, mask) << (6 * I);
+		pixel += readPixel<psm>(pmem, x + 7, y, bw, bp, mask) << (7 * I);
+	}}}
+	
+	if  (I != 24) {										
+		*(dst + K) = pixel;										
+	}
+	else {										// 24. should have special care.
+//		ERROR_LOG("special care %d\n", count);
+		MaskedOR((u32*)((u8*)dst + 3 * count), pixel, 0xffffff);
+	}
+	count +=  PSM_PIXELS_STORED_PER_WORD<psm>();
+}
+
+
+// Fill count pixels form continues memory region, starting from pmem, First pixel to read have number shift in this region.
+// Read no more than count pixels. We could assert, that all this pixels would be place in the same basepage 
+// Shift is automaticaly increased by count (or decreased if count < 0)
+template <int psm, bool offseted, int count>
+static __forceinline void writePixelsFromMemory(void* dst, const void* pmem, int& shift, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	const void* base;
+	if (offseted)
+		base = getPixelBasepage<psm>(dst, x, y, bw, bp);
+	else
+		base = (const void*)dst;
+
+	shift += count;
+	writePixelMem<psm, offseted>(base, x, y, (u32*)pmem, shift - count, bw, bp, mask);				  		// I prefer not to use for here. It's slow
+	if (count < 2) return;
+	writePixelMem<psm, offseted>(base, x + 1, y, (u32*)pmem, shift - count + 1, bw, bp, mask);				  	
+	if (count < 3) return; 	
+	writePixelMem<psm, offseted>(base, x + 2, y, (u32*)pmem, shift - count + 2, bw, bp, mask);				  
+	if (count < 4) return;
+	writePixelMem<psm, offseted>(base, x + 3, y, (u32*)pmem, shift - count + 3, bw, bp, mask);				  
+	if (count < 5) return;
+	writePixelMem<psm, offseted>(base, x + 4, y, (u32*)pmem, shift - count + 4, bw, bp, mask);
+	if (count < 6) return;
+	writePixelMem<psm, offseted>(base, x + 5, y, (u32*)pmem, shift - count + 5, bw, bp, mask);
+	if (count < 7) return;
+	writePixelMem<psm, offseted>(base, x + 6, y, (u32*)pmem, shift - count + 6, bw, bp, mask);
+	if (count < 8) return;
+	writePixelMem<psm, offseted>(base, x + 7, y, (u32*)pmem, shift - count + 7, bw, bp, mask);				  	
+}
+
+// Use it if we don't know that starting pixel is aligned for multiple-pixel write
+template <int psm, bool offseted>
+static __forceinline void writeUnalignedPixelsFromMemory(void* dst, int div, const void* pmem, int& shift, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	switch (div){
+		case 0: return; 											// Pixels are aligned, so we could move on
+		case 1: writePixelsFromMemory<psm, offseted, 1>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 2: writePixelsFromMemory<psm, offseted, 2>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 3: writePixelsFromMemory<psm, offseted, 3>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 4: writePixelsFromMemory<psm, offseted, 4>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 5: writePixelsFromMemory<psm, offseted, 5>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 6: writePixelsFromMemory<psm, offseted, 6>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+		case 7: writePixelsFromMemory<psm, offseted, 7>(dst, pmem, shift, x, y, bw, bp, mask);
+			return;
+	}
+}
+
+// This little swizzle function used to convert data form memory. z is first byte in destination block, and y is number of word, in which we look look for data.
+// s is shift by number of pixels, that should be used in masking
+template <int psm, int y, int z>
+static __forceinline u32 BitmaskinPSM(u32* pmem, u8 x) {
+
+	u8 H = PSM_BITCOUNT<psm>();
+	u8 I = PSM_BITS_PER_PIXEL<psm>() ;							// length of bitmask in bits. 
+
+
+	if (PSM_BITMODE<psm>() != 1) {								// PSMCT24 and 24Z should be handle separated, as it could pass 32-bit storage.	
+		u8 k = (x & (H - 1)) * I;							// shift of PC data -- in PC we use pixels from constant position: x / H word and k is shift: x = ( x % H ) * H + k / I
+												// in PS2 we use all bit position from 0 by I pixels.
+	 
+		u32 J = ((1 << I) - 1) << k;							// bitmask (of length ) & mask, moved by position k
+
+		// gcc complains repeatedly about this always being false. I'll investigate later.
+		if (z > k)	
+			return ((*(pmem + x/H + y)) & J) << (z - k);				// we use PX data from *mem + and properly shift
+		else										// This formula loo little swizzled. 
+			return ((*(pmem + x/H + y)) & J) >> (k - z);
+	}
+	else {											// only 24 targets
+		u8* mem = ((u8*)pmem + (x * 3) + 4 * y); 					// Our pixel's is disaligned on 32-bit. So just use u8*.
+		return *(u32*)mem;								// Mask would be handled later
+	}
+}
+
+// We use this function to limit number of memory R/W. This function fill all pixels for data with coordindates x, y. inside block data.
+// Only rule is x, y should be < 8 (it automatically fill all needed pixels, that lie in blockdata, but have coords more than 8).
+template <int psm>
+static __forceinline void fillPixelsFromMemory(u32* dst, u32* pmem, int x, int y, int pitch, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
+	u32 pixel = 0;
+	const u8 H = PSM_PIXELS_PER_WORD<psm>();
+
+	if (PSM_PIXEL_SHIFT<psm>() == 0)								// We could not use calculated constants as templated parameters.
+		pixel = BitmaskinPSM<psm, 0, 0>(pmem, x);						// First pixel x,y is the common part of all psmt path's
+	else {
+		if (PSM_PIXEL_SHIFT<psm>() == 24) 							// 8H and 4HL have 1 pixel, but shifted to 24 bits. 4HH -- 28 bits.	
+			pixel = BitmaskinPSM<psm, 0, 24>(pmem, x);					
+		else
+			pixel = BitmaskinPSM<psm, 0, 28>(pmem, x);
+	}
+	if (H > 1) {
+		const u8 G = psm & 0x7;									// Bitmode, we use it for better chance of switch optimization
+		int div = ( x < 4 ) ? 4 : -4;								// secondary row have shift by +4 or -4 pixels
+
+		switch (G) {
+			case 2:
+				pixel |= BitmaskinPSM<psm, 4, 16>(pmem, x);
+				break;
+			case 3:	
+				pixel |= BitmaskinPSM<psm, 2, 16>(pmem, x);			
+				pixel |= BitmaskinPSM<psm, 0, 8>(pmem + 2 * pitch, x + div);
+				pixel |= BitmaskinPSM<psm, 2, 24>(pmem + 2 * pitch, x + div);
+				break;	
+			case 4:
+				pixel |= BitmaskinPSM<psm, 1, 8>(pmem, x);			
+				pixel |= BitmaskinPSM<psm, 2, 16>(pmem, x);		
+				pixel |= BitmaskinPSM<psm, 3, 24>(pmem, x);			
+
+				pixel |= BitmaskinPSM<psm, 0, 4>(pmem + 2 * pitch, x + div);
+				pixel |= BitmaskinPSM<psm, 1, 12>(pmem + 2 * pitch, x + div);			
+				pixel |= BitmaskinPSM<psm, 2, 20>(pmem + 2 * pitch, x + div);			
+				pixel |= BitmaskinPSM<psm, 3, 28>(pmem + 2 * pitch, x + div);			
+
+				break;				
+		}
+	}
+	writePixelWord<psm>(dst, x, y, pixel, bw, bp, HandleWritemask<psm>(mask));				// use it for 32, 24, 8H, 4HL and 4HH
+}
+
+// Store a whole word. The formats PSMT4HH, PSMT8H, PSMT4HL, PSMCT24 and PSMT24Z merge it under
+// mask with MaskedOR; every other format overwrites the destination word and ignores mask.
+template <int psm>
+void writeWordPixel(u32* pmem, u32 pixel, u32 mask) {
+	switch (psm) {
+		case PSMT4HH:
+		case PSMT8H:
+		case PSMT4HL:
+		case PSMCT24:
+		case PSMT24Z:
+			MaskedOR(pmem, pixel, mask);
+			break;
+
+		default:
+			*pmem = pixel;
+			break;
+	}
+}
+
+// Get the word holding pixel (sx, sy) from src and put it into dst at pixel (dx, dy).
+// Both buffers must have the same psm, and source and destination must sit at the same position
+// inside their pages: ((sx - dx) & E) == 0 and ((sy - dy) & D) == 0. This is required because
+// one in-page word offset, taken from the source coordinates, is applied to both buffers.
+// In this case the whole word can be transferred.
+template <int psm>
+void transferPixelFast(void* dst, void* src, int dx, int dy, int sx, int sy, u32 dbw, u32 sbw ) {
+	u32 Dbasepage, Sbasepage;
+	u32 word;
+
+	u8 A = 0, B = 0, C = 0 , D = 0, E = 0, F = 0; u32 G = 0; u8 H = 0;
+	setPsmtConstantsX<psm> (A, B, C, D, E, F, G, H);
+	// '==' binds tighter than '&', so each masked difference needs its own parentheses.
+	assert ( (((sx - dx) & E) == 0) && (((sy - dy) & D) == 0) );
+
+	Dbasepage = ((dy>>A) * (dbw>>B)) + (dx>>B);
+	Sbasepage = ((sy>>A) * (sbw>>B)) + (sx>>B);
+
+	// In-page offset of the source pixel, scaled down by C to a 32-bit word index.
+	word = (g_pageTable[psm][sy&D][sx&E] >> C);
+
+	u32* dstp = (u32*)dst + Dbasepage * 2048 + word;
+	u32* srcp = (u32*)src + Sbasepage * 2048 + word;
+
+	// G << F is the pixel's bit mask at its position inside the word.
+	writeWordPixel<psm>(dstp, *srcp, G << F);
+}
+
+// Use this when we cannot guarantee that both pixels share the same page table position.
+// Source and destination word addresses are computed independently (with bp == 0); mask is
+// narrowed by both address computations and shift ends up holding the source pixel's shift.
+template <int psm>
+void transferPixel(void* dst, void* src, int dx, int dy, int sx, int sy, u32 dbw, u32 sbw ) {
+	u32 mask = 0xffffffff, shift;
+	u32* dstp = getPixelAddress_A32<psm>(mask, shift, dst, dx, dy, dbw);
+	u32* srcp = getPixelAddress_A32<psm>(mask, shift, src, sx, sy, sbw);
+	writeWordPixel<psm>(dstp, *srcp, mask);								// write whole word
+}
+
+#define Def_getReadWrite(psmT, psmX) \
+	static __forceinline void writePixel##psmT(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { \
+		writePixel<psmX>(pmem, x, y, pixel, bw, bp); } \
+	static __forceinline u32 readPixel##psmT(const void* pmem, int x, int y, u32 bp, u32 bw) { \
+		return readPixel<psmX>(pmem, x, y, bw, bp); } \
+	static __forceinline void writePixel##psmT##_0(void* pmem, int x, int y, u32 pixel, u32 bw) { \
+		writePixel<psmX>(pmem, x, y, pixel, bw); } \
+	static __forceinline u32 readPixel##psmT##_0(const void* pmem, int x, int y, u32 bw) { \
+		return readPixel<psmX>(pmem, x, y, bw); }  
+
+// Compatibility read/write wrappers, one set per storage format. Every suffix is bound to its
+// own PSM constant (the same pairing as the Def_getPixelAddress list), so the S and Z variants
+// go through their own g_pageTable entries instead of those of the plain colour formats.
+Def_getReadWrite(32, PSMCT32);
+Def_getReadWrite(24, PSMCT24);
+Def_getReadWrite(16, PSMCT16);
+Def_getReadWrite(16S, PSMCT16S);
+Def_getReadWrite(8, PSMT8);
+Def_getReadWrite(8H, PSMT8H);
+Def_getReadWrite(4, PSMT4);
+Def_getReadWrite(4HH, PSMT4HH);
+Def_getReadWrite(4HL, PSMT4HL);
+Def_getReadWrite(32Z, PSMT32Z);
+Def_getReadWrite(24Z, PSMT24Z);
+Def_getReadWrite(16Z, PSMT16Z);
+Def_getReadWrite(16SZ, PSMT16SZ);
+
+#endif // Zeydlitz's code
+
+#endif /* __ZZOGL_MEM_H__ */
--- a/plugins/zzogl-pg/opengl/ZZoglSave.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglSave.cpp
@ -89,8 +89,7 @@ int ZZSave(s8* pbydata)
 	return 0;
 }

-extern u32 s_uFramebuffer;
-extern int g_nCurVBOIndex;
+extern u32 g_nCurVBOIndex;

 bool ZZLoad(s8* pbydata)
 {
@ -163,7 +162,7 @@ bool ZZLoad(s8* pbydata)

 		icurctx = -1;

-		glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, s_uFramebuffer);   // switch to the backbuffer
+		FB::Bind();   // switch to the backbuffer
 		SetFogColor(gs.fogcol);

 		GL_REPORT_ERRORD();
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
@ -17,7 +17,12 @@
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

-//#ifdef NVIDIA_CG_API 		// This code is only for NVIDIA cg-toolkit API
+// By default enable nvidia cg api
+#if !defined(GLSL_API) && !defined(NVIDIA_CG_API)
+#define NVIDIA_CG_API
+#endif
+
+#ifdef NVIDIA_CG_API 		// This code is only for NVIDIA cg-toolkit API
 // ZZogl Shader manipulation functions.

 //------------------- Includes
@ -85,10 +90,10 @@ ZZshProgram 	pvs[16] = {NULL};
 ZZshProgram 	g_vsprog = 0, g_psprog = 0;							// 2 -- ZZ
 ZZshParameter 	g_vparamPosXY[2] = {0}, g_fparamFogColor = 0;

-#ifdef DEVBUILD
-char* EFFECT_NAME;		// All this variables used for testing and set manually
-char* EFFECT_DIR;
-#endif
+//#ifdef DEVBUILD
+extern char* EFFECT_NAME;		// All this variables used for testing and set manually
+extern char* EFFECT_DIR;
+//#endif

 bool g_bCRTCBilinear = true;

@ -96,14 +101,9 @@ float4 g_vdepth, vlogz;
 FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;
 FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
 FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
-FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
+FRAGMENTSHADER ppsCRTC[2], /*ppsCRTC24[2],*/ ppsCRTCTarg[2];
 VERTEXSHADER pvsBitBlt;

-extern u32 ptexBlocks;		// holds information on block tiling. It's texture number in OpenGL -- if 0 than such texture
-extern u32 ptexConv16to32;	// does not exists. This textures should be created on start and released on finish.  
-extern u32 ptexBilinearBlocks;
-extern u32 ptexConv32to16;
-
 inline bool LoadEffects();
 extern bool s_bWriteDepth;

@ -198,6 +198,10 @@ bool ZZshStartUsingShaders() {
 	return true;
 }

+void ZZshExitCleaning() {
+	// nothing to do with cg
+}
+
 // open shader file according to build target
 bool ZZshCreateOpenShadersFile() {
 #ifndef DEVBUILD
@ -483,6 +487,48 @@ void SetupVertexProgramParameters(ZZshProgram prog, int context)
 }

 #ifndef DEVBUILD
+#if 0
+// NOTE(review): disabled drafts of function replacements for the LOAD_VS / LOAD_PS macros.
+// They cannot be enabled as written: both are declared void yet contain `return false`,
+// both are named LOAD_VS although the second one creates a program with the fragment profile
+// (cgfProf), and `prog` / `fragment` are received by value, so the program created here
+// is stored in a local copy only.
+static __forceinline void LOAD_VS(int Index, ZZshProgram prog)
+{
+	assert(mapShaderResources.find(Index) != mapShaderResources.end());
+	header = mapShaderResources[Index];
+	assert((header) != NULL && (header)->index == (Index));
+	prog = cgCreateProgram(g_cgcontext, CG_OBJECT, (char*)(s_lpShaderResources + (header)->offset), cgvProf, NULL, NULL);
+	if (!cgIsProgram(prog)) 
+	{
+		ZZLog::Error_Log("Failed to load vs %d: \n%s", Index, cgGetLastListing(g_cgcontext));
+		return false;
+	}
+	cgGLLoadProgram(prog);
+	
+	if (cgGetError() != CG_NO_ERROR) ZZLog::Error_Log("Failed to load program %d.", Index);
+	SetupVertexProgramParameters(prog, !!(Index&SH_CONTEXT1));	
+}
+
+
+// Fragment-program counterpart of the draft above (see the review note at the top of this region).
+static __forceinline void LOAD_VS(int Index, FRAGMENTSHADER fragment)
+{
+	bLoadSuccess = true;
+	assert(mapShaderResources.find(Index) != mapShaderResources.end());
+	header = mapShaderResources[Index];
+	fragment.prog = cgCreateProgram(g_cgcontext, CG_OBJECT, (char*)(s_lpShaderResources + (header)->offset), cgfProf, NULL, NULL);
+	if (!cgIsProgram(fragment.prog)) 
+	{
+		ZZLog::Error_Log("Failed to load ps %d: \n%s", Index, cgGetLastListing(g_cgcontext));
+		return false;
+	}
+	
+	cgGLLoadProgram(fragment.prog);
+	
+	if (cgGetError() != CG_NO_ERROR) 
+	{
+		ZZLog::Error_Log("failed to load program %d.", Index);
+		bLoadSuccess = false;
+	}
+	
+	SetupFragmentProgramParameters(&fragment, !!(Index&SH_CONTEXT1), 0);
+}
+#endif

 #define LOAD_VS(Index, prog) {						  \
 	assert( mapShaderResources.find(Index) != mapShaderResources.end() ); \
@ -612,8 +658,8 @@ bool ZZshLoadExtraEffects()
 	if( !bLoadSuccess )
 		ZZLog::Error_Log("Failed to create CRTC shaders.");

-	LOAD_PS(SH_CRTC24PS, ppsCRTC24[0]);
-	LOAD_PS(SH_CRTC24INTERPS, ppsCRTC24[1]);
+//	LOAD_PS(SH_CRTC24PS, ppsCRTC24[0]);
+//	LOAD_PS(SH_CRTC24INTERPS, ppsCRTC24[1]);
 	LOAD_PS(SH_ZEROPS, ppsOne);
 	LOAD_PS(SH_BASETEXTUREPS, ppsBaseTexture);
 	LOAD_PS(SH_CONVERT16TO32PS, ppsConvert16to32);
@ -811,7 +857,7 @@ bool ZZshLoadExtraEffects()
 	if( !bLoadSuccess )
 		ZZLog::Error_Log("Failed to create CRTC shaders.");

-	LOAD_PS("CRTC24PS", ppsCRTC24[0], cgfProf); LOAD_PS("CRTC24InterPS", ppsCRTC24[1], cgfProf);
+//	LOAD_PS("CRTC24PS", ppsCRTC24[0], cgfProf); LOAD_PS("CRTC24InterPS", ppsCRTC24[1], cgfProf);
 	LOAD_PS("ZeroPS", ppsOne, cgfProf);
 	LOAD_PS("BaseTexturePS", ppsBaseTexture, cgfProf);
 	LOAD_PS("Convert16to32PS", ppsConvert16to32, cgfProf);
@ -886,4 +932,4 @@ FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testae

 #endif // RELEASE_TO_PUBLIC

-//#endif // NVIDIA_CG_API
+#endif // NVIDIA_CG_API
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.h
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.h
@ -33,8 +33,10 @@
 #include "ZZoglMath.h"
 #include "GS.h"

-// For output
+// By default enable nvidia cg api
+#if !defined(GLSL_API) && !defined(NVIDIA_CG_API)
 #define NVIDIA_CG_API
+#endif
 // --------------------------- API abstraction level --------------------------------

 #ifdef NVIDIA_CG_API				// Code for NVIDIA cg-toolkit API
@ -56,9 +58,60 @@ inline bool ZZshActiveParameter(ZZshParameter param) {return (param !=NULL); }

 #endif					// end NVIDIA cg-toolkit API

+#ifdef GLSL_API
+
+enum ZZshPARAMTYPE {
+	ZZ_UNDEFINED,
+	ZZ_TEXTURE_2D,
+	ZZ_TEXTURE_RECT,
+	ZZ_TEXTURE_3D,
+	ZZ_FLOAT4,
+};
+
+typedef struct {
+	const char* 	ShName;		// Name of uniform
+	ZZshPARAMTYPE	type;		// Choose between parameter type
+
+	float 		fvalue[4];
+	GLuint		sampler;	// Number of texture unit in array 
+	GLint		texid;		// Number of texture - texid. 
+
+	bool		Constant;	// Uniform could be constants, does not change at program flow
+	bool 		Settled;	// Check if Uniform value was set.
+} ZZshParamInfo;
+
+typedef struct {
+	void*	 	link;
+	bool		isFragment;
+} ZZshShaderLink; 
+
+#define ZZshProgram 		GLuint
+#define ZZshShader 		GLuint
+#define ZZshParameter 		GLint
+#define ZZshContext		int
+#define ZZshProfile		int
+#define ZZshError		int
+#define ZZshIndex		GLuint
+
+const ZZshParamInfo  qZero = {ShName:"", type:ZZ_UNDEFINED, fvalue:{0}, sampler: -1, texid: 0, Constant: false, Settled: false};
+
+#define pZero			0
+
+const ZZshShaderLink sZero = {link: NULL, isFragment: false};
+
+inline bool ZZshActiveParameter(ZZshParameter param) {return (param > -1); }
+#define SAFE_RELEASE_PROG(x) 	{ /*don't know what to do*/ }
+
+// ---------------------------
+
+#endif
+
+
+
+
 //const static char* g_pPsTexWrap[] = { "-DREPEAT", "-DCLAMP", "-DREGION_REPEAT", NULL };

-enum ZZshShaderType {ZZ_SH_ZERO, ZZ_SH_REGULAR, ZZ_SH_REGULAR_FOG, ZZ_SH_TEXTURE, ZZ_SH_TEXTURE_FOG, ZZ_SH_CRTC};
+enum ZZshShaderType {ZZ_SH_ZERO, ZZ_SH_REGULAR, ZZ_SH_REGULAR_FOG, ZZ_SH_TEXTURE, ZZ_SH_TEXTURE_FOG, ZZ_SH_CRTC, ZZ_SH_NONE};
 // We have "compatible" shaders, as RegularFogVS and RegularFogPS. if don't need to wory about incompatible shaders
 // It used only in GLSL mode. 

@ -92,6 +145,7 @@ struct FRAGMENTSHADER
 	string filename;
 #endif

+#ifdef NVIDIA_CG_API
 	void set_uniform_param(ZZshParameter &var, const char *name)
 	{
 		ZZshParameter p;
@ -161,6 +215,7 @@ struct FRAGMENTSHADER

 		return false;
 	}
+#endif
 };

 struct VERTEXSHADER
@ -183,8 +238,32 @@ struct VERTEXSHADER
 	extern FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;

 	extern FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
-	extern FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
+	extern FRAGMENTSHADER ppsCRTC[2], /*ppsCRTC24[2],*/ ppsCRTCTarg[2];

+	extern int interlace_mode;
+
+	enum CRTC_TYPE
+	{
+		CRTC_RENDER,
+		//CRTC_RENDER_24,
+		CRTC_RENDER_TARG
+	};
+
+	static __forceinline FRAGMENTSHADER* curr_ppsCRTC() { return &ppsCRTC[interlace_mode]; }
+	//static __forceinline FRAGMENTSHADER* curr_ppsCRTC24() { return &ppsCRTC24[interlace_mode]; }
+	static __forceinline FRAGMENTSHADER* curr_ppsCRTCTarg() { return &ppsCRTCTarg[interlace_mode]; }
+	
+	static __forceinline FRAGMENTSHADER* curr_pps(CRTC_TYPE render_type) 
+	{
+		switch (render_type)
+		{
+			case CRTC_RENDER: return curr_ppsCRTC();
+			//case CRTC_RENDER_24: return curr_ppsCRTC24();
+			case CRTC_RENDER_TARG: return curr_ppsCRTCTarg();
+			default: return NULL;
+		}
+		
+	}
 // ------------------------- Functions -------------------------------

 #ifdef NVIDIA_CG_API
@ -192,6 +271,11 @@ inline bool ZZshExistProgram(FRAGMENTSHADER* pf) {return (pf->prog != NULL); };
 inline bool ZZshExistProgram(VERTEXSHADER* pf) {return (pf->prog != NULL); };
 inline bool ZZshExistProgram(ZZshShaderLink prog) {return (prog != NULL); };
 #endif
+#ifdef GLSL_API
+inline bool ZZshExistProgram(FRAGMENTSHADER* pf) {return (pf->Shader != 0); };
+inline bool ZZshExistProgram(VERTEXSHADER* pf) {return (pf->Shader != 0); };
+inline bool ZZshExistProgram(ZZshShaderLink prog) {return (prog.link != NULL); }		// This is used for pvs mainly. No NULL means that we do LOAD_VS
+#endif

 extern const char* ShaderCallerName;
 extern const char* ShaderHandleName;
@ -222,10 +306,17 @@ extern void ZZshDefaultOneColor( FRAGMENTSHADER ptr );
 extern void ZZshSetVertexShader(ZZshShaderLink prog);
 extern void ZZshSetPixelShader(ZZshShaderLink prog);
 extern bool ZZshLoadExtraEffects();
+extern void ZZshExitCleaning();

 extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);

 // only sets a limited amount of state (for Update)
 void SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force);

+extern u32 ptexBlocks;		// holds information on block tiling. It's texture number in OpenGL -- if 0 than such texture
+extern u32 ptexConv16to32;	// does not exists. This textures should be created on start and released on finish.
+extern u32 ptexBilinearBlocks;
+extern u32 ptexConv32to16;
+
+
 #endif
--- a/plugins/zzogl-pg/opengl/ZZoglShadersGLSL.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShadersGLSL.cpp
@ -0,0 +1,979 @@
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009 zeydlitz@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2006
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef GLSL_API 		// This code is only for GLSL API
+// ZZogl Shader manipulation functions.
+
+/*
+ * used cg calls:
+ * cgGLIsProfileSupported		-- don't needed
+ * cgGetErrorString 			-- later	
+ * cgGetLastListing			-- later
+ * cgSetErrorHandler			-- later	
+ * cgCreateContext			-- think that don't need
+ * cgGLEnableProfile			-- don't need
+ * cgGLSetOptimalOptions		-- don't need?
+ * cgGLSetManageTextureParameters	-- what's this?	
+ * cgCreateParameter			-- don't need
+ * cgGLLoadProgram			void LinkProgram(uint program)
+ * cgGetError				-- later
+ * cgGLDisableProfile			-- don't need
+ * cgGLSetParameter4fv
+ * cgGetNamedParameter
+ * cgGLEnableTextureParameter
+ * cgIsParameterUsed
+ * cgGLBindProgram			void UseProgram(uint program)
+ * cgConnectParameter
+ * cgIsProgram				bool IsProgram(uint program)
+ * cgCreateProgramFromFile
+ */
+
+//------------------- Includes
+#include "Util.h"
+#include "ZZoglShaders.h"
+#include "zpipe.h"
+#include <math.h>
+#include <map>
+#include  <fcntl.h>			// this for open(). Maybe linux-specific
+#include <sys/mman.h>			// and this for mmap
+
+// ----------------- Defines
+
+#define TEXWRAP_REPEAT 0
+#define TEXWRAP_CLAMP 1
+#define TEXWRAP_REGION_REPEAT 2
+#define TEXWRAP_REPEAT_CLAMP 3
+
+#ifdef DEVBUILD
+#	define UNIFORM_ERROR_LOG 	ZZLog::Error_Log
+#else
+#	define UNIFORM_ERROR_LOG
+#endif
+
+// Set it to 0 to disable context usage, 1 -- to enable. FFX-1 has a strange issue with ClampExt.
+#define NOCONTEXT		0
+#define NUMBER_OF_SAMPLERS 	11
+#define MAX_SHADER_NAME_SIZE	25
+#define MAX_UNIFORM_NAME_SIZE	20
+#define DEFINE_STRING_SIZE	256
+//------------------ Constants
+
+// Used in a logarithmic Z-test, as (1-o(1))/log(MAX_U32).
+const float g_filog32 = 0.999f / (32.0f * logf(2.0f));
+
+const static char* g_pTexTypes[] = { "32", "tex32", "clut32", "tex32to16", "tex16to8h" };
+const static char* g_pShaders[4] = { "full", "reduced", "accurate", "accurate-reduced" };
+
+// ----------------- Global Variables 
+
+ZZshContext	g_cgcontext;
+ZZshProfile 	cgvProf, cgfProf;
+int 		g_nPixelShaderVer = 0; 		// default
+u8* 		s_lpShaderResources = NULL;
+ZZshShaderLink 	pvs[16] = {sZero}, g_vsprog = sZero, g_psprog = sZero;							// 2 -- ZZ
+ZZshParameter 	g_vparamPosXY[2] = {pZero}, g_fparamFogColor = pZero;
+
+ZZshProgram	ZZshMainProgram;
+char*		ZZshSource;			// Shader's source data.	
+off_t		ZZshSourceSize;
+
+extern char* EFFECT_NAME;				// All this variables used for testing and set manually
+extern char* EFFECT_DIR;
+
+bool g_bCRTCBilinear = true;
+
+float4 g_vdepth, vlogz;
+FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;
+FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
+FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
+FRAGMENTSHADER ppsCRTC[2], /*ppsCRTC24[2],*/ ppsCRTCTarg[2];
+VERTEXSHADER pvsStore[16];
+VERTEXSHADER pvsBitBlt;
+
+inline bool LoadEffects();
+extern bool s_bWriteDepth;
+
+struct SHADERHEADER
+{
+	unsigned int index, offset, size; // if highest bit of index is set, pixel shader
+};
+map<int, SHADERHEADER*> mapShaderResources;
+
+// Debug variable, store name of the function that call the shader.
+const char* ShaderCallerName = "";
+const char* ShaderHandleName = "";
+
+int NumActiveUniforms, NumGlobalUniforms;
+ZZshParamInfo UniformsIndex[MAX_ACTIVE_UNIFORMS] = {qZero};
+const char* ShaderNames[MAX_ACTIVE_SHADERS] = {""};
+ZZshShaderType ShaderTypes[MAX_ACTIVE_SHADERS] = {ZZ_SH_NONE};
+
+ZZshProgram CompiledPrograms[MAX_ACTIVE_SHADERS][MAX_ACTIVE_SHADERS] = {{0}};
+const char* TextureUnits[NUMBER_OF_SAMPLERS] = 
+	{"g_sMemory[0]", 	"g_sMemory[1]", 	"g_sSrcFinal", 		"g_sBitwiseANDX", 	"g_sBitwiseANDY",  "g_sInterlace", \
+		"g_sCLUT", 		"g_sBlocks", 		"g_sBilinearBlocks", 		"g_sConv16to32", 	"g_sConv32to16"};
+ZZshPARAMTYPE TextureTypes[NUMBER_OF_SAMPLERS] = 
+	{ZZ_TEXTURE_RECT, 	ZZ_TEXTURE_RECT, 	ZZ_TEXTURE_RECT, 	ZZ_TEXTURE_RECT, 	ZZ_TEXTURE_RECT, 	ZZ_TEXTURE_RECT, \
+		ZZ_TEXTURE_2D, 		ZZ_TEXTURE_2D, 		ZZ_TEXTURE_2D,			ZZ_TEXTURE_2D,		 ZZ_TEXTURE_3D} ;
+
+//------------------ Code
+
+// Flatten a shader configuration into a single integer index.
+// type, texfilter and texwrap are mixed-radix digits (bases NUM_TYPES, NUM_FILTERS, NUM_TEXWRAPS);
+// the remaining arguments are packed as bit weights 1, 2, 4, 8, 16 and 32 into the top digit.
+inline int GET_SHADER_INDEX(int type, int texfilter, int texwrap, int fog, int writedepth, int testaem, int exactcolor, int context, int ps) {
+	const int optionBits = fog + 2*writedepth + 4*testaem + 8*exactcolor + 16*context + 32*ps;
+	const int wrapStride = NUM_FILTERS*NUM_TYPES;
+
+	int index = type;
+	index += texfilter*NUM_TYPES;
+	index += wrapStride*texwrap;
+	index += NUM_TEXWRAPS*wrapStride*optionBits;
+	return index;
+}
+
+// Counterpart of the Cg profile check; the GLSL backend has nothing to verify here
+// and always reports success.
+bool ZZshCheckProfilesSupport() {	
+	return true;
+}
+
+// Error handler. Setup in ZZogl_Create once.
+// In the GLSL backend this is an empty stub: the Cg implementation is kept below, commented out,
+// so all three arguments are currently ignored.
+void HandleCgError(ZZshContext ctx, ZZshError err, void* appdata)
+{/*
+	ZZLog::Error_Log("%s->%s: %s", ShaderCallerName, ShaderHandleName, cgGetErrorString(err));
+	const char* listing = cgGetLastListing(g_cgcontext);
+	if (listing != NULL) 
+		ZZLog::Debug_Log("	last listing: %s", listing);
+*/
+}
+
+float ZeroFloat4[4] = {0};
+
+// Copy the four components of v into f. Both pointers must address at least four floats.
+inline void SettleFloat(float* f, const float* v) {
+	for (int component = 0; component < 4; component++)
+		f[component] = v[component];
+}
+
+// Build a ZZshParamInfo record from its individual fields.
+// ShName is stored by pointer, not copied: the caller must keep the string alive for as long as
+// the returned record is in use. fvalue must address at least four floats.
+inline ZZshParamInfo ParamInfo(const char* ShName, ZZshPARAMTYPE type, const float fvalue[], GLuint sampler, GLint texid, bool Constant, bool Settled) {
+	ZZshParamInfo x;
+	// Keep the caller's pointer. No private buffer is allocated: it would be overwritten by this
+	// assignment anyway and simply leak.
+	x.ShName = ShName;
+	x.type = type;
+	SettleFloat(x.fvalue, fvalue);
+	x.sampler = sampler;
+	x.texid = texid;
+	x.Constant = Constant;
+	x.Settled = Settled;
+	return x;
+}
+
+// Register a float4 uniform called `name` in the next free slot of UniformsIndex and store the
+// slot number in *param. The record starts zero-valued, non-constant and unsettled.
+// When the table is already full, *param is set to -1 (inactive) instead of writing past its end.
+inline void SetGlobalUniform(ZZshParameter* param, const char* name) {
+	if (NumActiveUniforms >= (int)MAX_ACTIVE_UNIFORMS) {
+		ZZLog::Error_Log("Uniform table is full, can't register %s.", name);
+		*param = -1;
+		return;
+	}
+
+	*param = NumActiveUniforms;
+	UniformsIndex[NumActiveUniforms] = ParamInfo(name, ZZ_FLOAT4, ZeroFloat4, -1, 0, false, false);
+	NumActiveUniforms++;
+}
+
+// Bring up the GLSL backend: load the effects, try a sample fragment shader to choose
+// g_nPixelShaderVer, create ZZshMainProgram, register the three global uniforms and load the
+// extra effects. Returns false when glCreateShader is unavailable (and, presumably, when either
+// B_G-wrapped loader fails -- B_G is defined elsewhere, TODO confirm).
+bool ZZshStartUsingShaders() {
+	
+	ZZLog::Error_Log("Creating effects.");
+	B_G(LoadEffects(), return false);
+	// NOTE(review): this availability check runs only after LoadEffects(); confirm LoadEffects
+	// does not already need GLSL entry points.
+	if (!glCreateShader) 
+	{
+		ZZLog::Error_Log("GLSL shaders is not supported, stop.");
+		return false;
+	}
+
+	// create a sample shader
+	clampInfo temp;
+	memset(&temp, 0, sizeof(temp));
+	temp.wms = 3; temp.wmt = 3;
+
+	g_nPixelShaderVer = 0;//SHADER_ACCURATE;
+	// test
+	bool bFailed;
+	FRAGMENTSHADER* pfrag = ZZshLoadShadeEffect(0, 1, 1, 1, 1, temp, 0, &bFailed);
+	if( bFailed || pfrag == NULL ) {
+		// First attempt failed: retry with the accurate+reduced variant.
+		g_nPixelShaderVer = SHADER_ACCURATE|SHADER_REDUCED;
+
+		pfrag = ZZshLoadShadeEffect(0, 0, 1, 1, 0, temp, 0, &bFailed);
+		// NOTE(review): pfrag->Shader is handed to glLinkProgram here, while the rest of this file
+		// passes Shader to glAttachShader as a shader object -- confirm a program handle is intended.
+		if( pfrag != NULL )
+			glLinkProgram(pfrag->Shader);
+		if( bFailed || pfrag == NULL || glGetError() != GL_NO_ERROR) {
+			g_nPixelShaderVer = SHADER_REDUCED;
+			ZZLog::Error_Log("Basic shader test failed.");
+		}
+	}
+	ZZshMainProgram = glCreateProgram();
+	// The uniforms registered from here up to NumGlobalUniforms are the "global" ones that
+	// PutParametersAndRun uploads for every program.
+	NumActiveUniforms = 0;
+	SetGlobalUniform(&g_fparamFogColor, "g_fFogColor"); 
+	SetGlobalUniform(&g_vparamPosXY[0], "g_fPosXY[0]");
+	// With NOCONTEXT == 0 both g_vparamPosXY entries refer to "g_fPosXY[0]".
+	SetGlobalUniform(&g_vparamPosXY[1], NOCONTEXT?"g_fPosXY[1]":"g_fPosXY[0]");
+	NumGlobalUniforms = NumActiveUniforms;
+
+	if (g_nPixelShaderVer & SHADER_REDUCED)
+		conf.bilinear = 0;
+
+	ZZLog::Error_Log("Creating extra effects.");
+	B_G(ZZshLoadExtraEffects(), return false);
+
+	// g_pShaders has four entries; this assumes g_nPixelShaderVer stays within 0..3 -- TODO confirm.
+	ZZLog::Error_Log("Using %s shaders.", g_pShaders[g_nPixelShaderVer]);	
+
+	return true;
+}
+
+// Locate ps2hw.glsl and map it into memory as the GLSL source (ZZshSource / ZZshSourceSize).
+// "plugins/ps2hw.glsl" is tried first, then PLUGIN_DIR_COMPILATION when that is defined.
+// The last byte of the private mapping is overwritten with 0 to terminate the source string.
+// Returns false, after logging, when no usable file is found or the mapping fails.
+bool ZZshCreateOpenShadersFile() {
+	std::string ShaderFileName("plugins/ps2hw.glsl");
+	int ShaderFD = open(ShaderFileName.c_str(), O_RDONLY);
+	struct stat sb;
+	if ((ShaderFD == -1) || (fstat(ShaderFD, &sb) == -1)) {
+		// open() may have succeeded while fstat() failed: do not leak that descriptor.
+		if (ShaderFD != -1) {
+			close(ShaderFD);
+			ShaderFD = -1;
+		}
+		// Each linux distributions have his rules for path so we give them the possibility to
+		// change it with compilation flags. -- Gregory
+#ifdef PLUGIN_DIR_COMPILATION
+#define xPLUGIN_DIR_str(s) PLUGIN_DIR_str(s)
+#define PLUGIN_DIR_str(s) #s
+		ShaderFileName = string(xPLUGIN_DIR_str(PLUGIN_DIR_COMPILATION)) + "/ps2hw.glsl";
+		ShaderFD = open(ShaderFileName.c_str(), O_RDONLY);
+#endif
+		if ((ShaderFD == -1) || (fstat(ShaderFD, &sb) == -1)) {
+			ZZLog::Error_Log("No source for %s: \n", ShaderFileName.c_str());
+			if (ShaderFD != -1)
+				close(ShaderFD);
+			return false;
+		}
+	}
+
+	// An empty file cannot be mapped and would make the terminator write below hit index -1.
+	if (sb.st_size <= 0) {
+		ZZLog::Error_Log("Shader source %s is empty.", ShaderFileName.c_str());
+		close(ShaderFD);
+		return false;
+	}
+
+	// This function directly maps the file into memory.
+	void* mapping = mmap(NULL, sb.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, ShaderFD, 0);
+	close(ShaderFD);	// the mapping stays valid after the descriptor is closed
+	if (mapping == MAP_FAILED) {
+		ZZLog::Error_Log("Failed to map shader source %s.", ShaderFileName.c_str());
+		return false;
+	}
+
+	ZZshSourceSize = sb.st_size;
+	ZZshSource = (char*)mapping;
+	ZZshSource[ ZZshSourceSize - 1] = 0;	// Make the source null-terminated.
+	return true;
+}
+
+// Release the shader source mapped by ZZshCreateOpenShadersFile.
+// Safe to call when nothing is mapped and safe to call twice: the pointer is cleared after the
+// unmap, so a later call cannot unmap an unrelated mapping that reused the address.
+void ZZshExitCleaning() {
+	if (ZZshSource == NULL)
+		return;
+
+	munmap(ZZshSource, ZZshSourceSize);
+	ZZshSource = NULL;
+	ZZshSourceSize = 0;
+}
+
+// GLSL counterpart of disabling the Cg profile: unbinds the current program (glUseProgram(0)),
+// so no shader program of this backend stays active.
+void ZZshGLDisableProfile() {			// This stop all other shader programs from running;
+	glUseProgram(0);
+}
+// GLSL counterpart of enabling the Cg profile: intentionally empty in this backend.
+void ZZshGLEnableProfile() {
+}
+//-------------------------------------------------------------------------------------
+
+// Texture flavour of the parameter setters (stands in for cgGLEnableTextureParameter as well).
+// Nothing is sent to GL here: the texture id is only cached in UniformsIndex[param] and the slot
+// is marked as settled. Inactive parameters (param < 0) are ignored; `name` is debug-only.
+void ZZshGLSetTextureParameter(ZZshParameter param, GLuint texobj, const char* name) {
+	if (param <= -1)
+		return;
+
+	ZZshParamInfo& slot = UniformsIndex[param];
+	slot.texid = texobj;
+	slot.Settled = true;
+}
+
+// Same as above; `prog` is accepted for interface compatibility and is not used.
+void ZZshGLSetTextureParameter(ZZshShaderLink prog, ZZshParameter param, GLuint texobj, const char* name) {
+	if (param <= -1)
+		return;
+
+	ZZshParamInfo& slot = UniformsIndex[param];
+	slot.texid = texobj;
+	slot.Settled = true;
+}
+
+// GLSL stand-in for cgGLSetParameter4fv. The four floats at v are cached in UniformsIndex[param]
+// and the slot is marked as settled; nothing is sent to GL here.
+// `name` may be any string and is only meant for debugging. `prog` is not used.
+// Inactive parameters (param < 0) are ignored.
+void ZZshSetParameter4fv(ZZshShaderLink prog, ZZshParameter param, const float* v, const char* name) {
+	if (param <= -1)
+		return;
+
+	ZZshParamInfo& slot = UniformsIndex[param];
+	SettleFloat(slot.fvalue, v);
+	slot.Settled = true;
+}
+
+// Overload without a program link; behaves exactly like the one above.
+void ZZshSetParameter4fv(ZZshParameter param, const float* v, const char* name) {
+	if (param <= -1)
+		return;
+
+	ZZshParamInfo& slot = UniformsIndex[param];
+	SettleFloat(slot.fvalue, v);
+	slot.Settled = true;
+}
+
+// Pointer-taking variant of ZZshSetParameter4fv: does nothing for a NULL `param`, otherwise
+// forwards *param, prog, v and name unchanged. No retry is performed in the GLSL backend.
+void ZZshSetParameter4fvWithRetry(ZZshParameter* param, ZZshShaderLink prog, const float* v, const char* name) {
+	if (param == NULL)
+		return;
+
+	ZZshSetParameter4fv(prog, *param, v, name);
+}
+
+// Set the shader's sOneColor parameter to (1, 1, 1, 1).
+// Also records the operation name in ShaderHandleName for debugging.
+void ZZshDefaultOneColor( FRAGMENTSHADER ptr ) {
+	ShaderHandleName = "Set Default One colot";
+
+	float4 allOnes( 1, 1, 1, 1 );
+	ZZshSetParameter4fv(ptr.prog, ptr.sOneColor, allOnes, "DegaultOne");
+}
+//-------------------------------------------------------------------------------------
+
+// Minimal pass-through sources used as a fallback when a real shader can't be used.
+const GLchar * EmptyVertex = "void main(void) {gl_Position = ftransform();}"; 
+const GLchar * EmptyFragment = "void main(void) {gl_FragColor = gl_Color;}"; 
+
+// Build a program containing only the minimal vertex or fragment shader above.
+//   name       -- used in log messages only.
+//   shaderType -- GL_VERTEX_SHADER selects EmptyVertex, anything else EmptyFragment.
+// Returns the linked program, or (ZZshProgram)-1 after logging when linking fails.
+inline ZZshProgram UseEmptyProgram(const char* name, GLenum shaderType) {
+	GLuint shader = glCreateShader(shaderType);
+	if (shaderType == GL_VERTEX_SHADER)
+		glShaderSource(shader, 1, &EmptyVertex, NULL);
+	else
+		glShaderSource(shader, 1, &EmptyFragment, NULL);
+
+	glCompileShader(shader);
+	ZZshProgram prog = glCreateProgram();
+	glAttachShader(prog, shader);
+	glLinkProgram(prog);
+	// Only flags the shader for deletion: it stays alive while attached and is freed together
+	// with the program, so the object is no longer leaked.
+	glDeleteShader(shader);
+
+	// glIsProgram() is true for any created program, so also ask whether the link succeeded.
+	GLint linked = GL_FALSE;
+	glGetProgramiv(prog, GL_LINK_STATUS, &linked);
+	if( !glIsProgram(prog) || linked != GL_TRUE || glGetError() != GL_NO_ERROR ) {
+		ZZLog::Error_Log("Failed to load empty shader for %s:", name);
+		glDeleteProgram(prog);
+		return (ZZshProgram)-1;
+	}
+	ZZLog::Error_Log("Used Empty program for %s... Ok.",name);
+	return prog;
+}
+
+// Classify a shader by the prefix of its name. The table is scanned in order, so the longer
+// "...Fog" prefixes must stay ahead of their shorter relatives. Names matching no prefix are
+// reported as ZZ_SH_CRTC.
+ZZshShaderType ZZshGetShaderType(const char* name) {
+	static const struct { const char* prefix; ZZshShaderType type; } knownPrefixes[] = {
+		{ "TextureFog", ZZ_SH_TEXTURE_FOG },
+		{ "Texture",    ZZ_SH_TEXTURE },
+		{ "RegularFog", ZZ_SH_REGULAR_FOG },
+		{ "Regular",    ZZ_SH_REGULAR },
+		{ "Zero",       ZZ_SH_ZERO },
+	};
+	const size_t count = sizeof(knownPrefixes) / sizeof(knownPrefixes[0]);
+
+	for (size_t i = 0; i < count; i++) {
+		const char* prefix = knownPrefixes[i].prefix;
+		if (strncmp(name, prefix, strlen(prefix)) == 0)
+			return knownPrefixes[i].type;
+	}
+	return ZZ_SH_CRTC;
+}
+
+// Create and compile the minimal vertex or fragment shader and register its name and type in
+// ShaderNames / ShaderTypes under the shader's GL id.
+//   name       -- stored by pointer and used in log messages.
+//   shaderType -- GL_VERTEX_SHADER selects EmptyVertex, anything else EmptyFragment.
+// Returns the shader object id. Ids that do not fit the tables are logged and not registered.
+inline ZZshShader UseEmptyShader(const char* name, GLenum shaderType) {
+	GLuint shader = glCreateShader(shaderType);
+	if (shaderType == GL_VERTEX_SHADER)
+		glShaderSource(shader, 1, &EmptyVertex, NULL);
+	else
+		glShaderSource(shader, 1, &EmptyFragment, NULL);
+
+	glCompileShader(shader);
+
+	// The tables are indexed by GL id: never write outside of them.
+	if (shader < (GLuint)MAX_ACTIVE_SHADERS) {
+		ShaderNames[shader] = name;
+		ShaderTypes[shader] = ZZshGetShaderType(name);
+	}
+	else
+		ZZLog::Error_Log("Shader id %d for %s does not fit the shader tables.", shader, name);
+
+	ZZLog::Error_Log("Used Empty shader for %s... Ok.",name);
+	return shader;
+}
+
+// Check the compile status of `shader`.
+// Returns true when compilation succeeded; otherwise writes the driver's info log to the error
+// log and returns false.
+inline bool GetCompilationLog(GLuint shader) {
+	GLint CompileStatus;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &CompileStatus);
+	if (CompileStatus == GL_TRUE)
+		return true;
+
+	GLint infologlength = 0;
+	glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infologlength);
+	if (infologlength < 1)
+		infologlength = 1;			// always keep room for the terminator
+
+	char* InfoLog = new char[infologlength];
+	InfoLog[0] = 0;					// stays a valid empty string if GL writes nothing
+	// The returned length is not needed, so NULL is passed; GL writes through any non-NULL
+	// pointer given here, which must therefore never be an uninitialised one.
+	glGetShaderInfoLog(shader, infologlength, NULL, InfoLog);
+	ZZLog::Error_Log("Compiling... %d:\t %s", shader, InfoLog);
+	delete[] InfoLog;
+
+	return false;
+}
+
+// Compile DefineString followed by the mapped shader file (ZZshSource) as one shader of the given
+// type and register its name and type in ShaderNames / ShaderTypes under the shader's GL id.
+//   shader       -- receives the new shader object id.
+//   DefineString -- text placed in front of the common source.
+//   name         -- stored by pointer; also used in log messages.
+// Returns false when compilation fails, or when the id does not fit the id-indexed tables
+// (in that case the shader is deleted and `shader` is reset to 0).
+inline bool CompileShader(ZZshProgram& shader, const char* DefineString, const char* name, GLenum shaderType) {
+	const GLchar* ShaderSource[2];
+	ShaderSource[0] = (const GLchar*)DefineString;
+	ShaderSource[1] = (const GLchar*)ZZshSource;
+
+	shader = glCreateShader(shaderType);
+	glShaderSource(shader, 2, &ShaderSource[0], NULL);
+	glCompileShader(shader);
+	ZZLog::Debug_Log("Creating shader %d for %s", shader, name);
+
+	if (!GetCompilationLog(shader)) {
+		ZZLog::Error_Log("Failed to compile shader for %s:", name);
+		return false;
+	}
+
+	// The tables below are indexed by GL id; an id beyond their size would corrupt memory.
+	if (shader >= (GLuint)MAX_ACTIVE_SHADERS) {
+		ZZLog::Error_Log("Shader id %d for %s does not fit the shader tables.", shader, name);
+		glDeleteShader(shader);
+		shader = 0;
+		return false;
+	}
+
+	ShaderTypes[shader] = ZZshGetShaderType(name);
+	ShaderNames[shader] = name;
+
+	GL_REPORT_ERRORD();
+	return true;
+}
+
+// Thin logging wrapper around CompileShader: forwards all arguments, logs the outcome and
+// returns CompileShader's result.
+inline bool LoadShaderFromFile(ZZshShader& shader, const char* DefineString, const char* name, GLenum ShaderType) {			// Linux specific, as I presume
+	const bool compiled = CompileShader(shader, DefineString, name, ShaderType);
+
+	if (compiled)
+		ZZLog::Error_Log("Used shader for %s... Ok",name);
+	else
+		ZZLog::Error_Log("Failed to compile shader for %s: ", name);
+
+	return compiled;
+}
+
+// Check the link status of `prog`.
+// Returns true when the program linked and is a valid program object. On failure returns false;
+// DEVBUILD builds additionally write a non-empty driver info log to the error log.
+inline bool GetLinkLog(ZZshProgram prog) {
+	GLint LinkStatus;
+	glGetProgramiv(prog, GL_LINK_STATUS, &LinkStatus);
+
+	int unif, atrib;
+	glGetProgramiv(prog, GL_ACTIVE_UNIFORMS, &unif);
+	glGetProgramiv(prog, GL_ACTIVE_ATTRIBUTES, &atrib);
+	UNIFORM_ERROR_LOG("Uniforms %d, attributes %d", unif, atrib);
+
+	if (LinkStatus == GL_TRUE && glIsProgram(prog)) return true;
+
+#ifdef DEVBUILD
+	GLint infologlength = 0;
+	glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &infologlength);
+	if (infologlength > 0) {
+		char* InfoLog = new char[infologlength];
+		InfoLog[0] = 0;
+		// The returned length is not needed, so NULL is passed; GL writes through any non-NULL
+		// pointer given here, which must therefore never be an uninitialised one.
+		glGetProgramInfoLog(prog, infologlength, NULL, InfoLog);
+		ZZLog::Error_Log("Linking %d... %d:\t %s", prog, infologlength, InfoLog);
+		delete[] InfoLog;
+	}
+#endif
+
+	return false;
+
+}
+
+//-------------------------------------------------------------------------------------
+// Link `shader` (and `shader2` when it is non-zero) into a new program and return it.
+// `shader` is detached again after linking. When linking fails the failed program is deleted and
+// the result of UseEmptyProgram(name, GL_FRAGMENT_SHADER) is returned instead.
+// `name` is used in log messages only.
+inline ZZshProgram madeProgram(ZZshShader shader, ZZshShader shader2, const char* name) {
+	ZZshProgram prog = glCreateProgram();
+	glAttachShader(prog, shader);
+	if (shader2 != 0)
+		glAttachShader(prog, shader2);
+	glLinkProgram(prog);
+
+	const bool linked = GetLinkLog(prog);
+	// Detach from the program the shader was attached to, before `prog` may be replaced by the
+	// fallback program (which never had this shader attached).
+	glDetachShader(prog, shader);
+
+	if (!linked) {
+		ZZLog::Error_Log("Failed to link shader for %s: ", name);
+		glDeleteProgram(prog);		// don't leak the program that failed to link
+		prog = UseEmptyProgram(name, GL_FRAGMENT_SHADER);
+	}
+
+	ZZLog::Error_Log("Made shader program for %s... Ok",name);
+	return prog;
+}
+
+// Upload the cached uniforms UniformsIndex[start .. finish-1] to ZZshMainProgram.
+// float4 entries are sent with glUniform4fv; texture entries are bound to texture unit
+// GL_TEXTURE0 + sampler using the target that matches their type. Entries that are neither
+// settled nor constant are skipped.
+void PutParametersInProgam(int start, int finish) {
+	for (int i = start; i < finish; i++) {
+		ZZshParamInfo param = UniformsIndex[i];
+		GLint location = glGetUniformLocation(ZZshMainProgram, param.ShName);
+
+		if (location != -1 && param.type != ZZ_UNDEFINED) {
+			UNIFORM_ERROR_LOG("\tTry uniform %d %d %d %s...\t\t", i, location, param.type, param.ShName);
+
+			if (!param.Settled && !param.Constant) {
+				UNIFORM_ERROR_LOG("\tUnsettled, non-constant uniform, could be bug: %d %s", param.type, param.ShName);
+				continue;
+			}
+
+			if (param.type == ZZ_FLOAT4) {
+				glUniform4fv(location, 1, param.fvalue);
+			}
+			else
+			{
+				glActiveTexture(GL_TEXTURE0 + param.sampler);
+				if (param.type == ZZ_TEXTURE_2D)
+					glBindTexture(GL_TEXTURE_2D, param.texid);
+				else if (param.type == ZZ_TEXTURE_3D)
+					glBindTexture(GL_TEXTURE_3D, param.texid);
+				else
+					glBindTexture(GL_TEXTURE_RECTANGLE, param.texid);
+				GL_REPORT_ERRORD();
+			}
+
+			if (glGetError() == GL_NO_ERROR)
+				UNIFORM_ERROR_LOG("Ok. Param name %s, location %d, type %d", param.ShName, location, param.type);
+			else
+				ZZLog::Error_Log("error in PutParametersInProgam param name %s, location %d, type %d", param.ShName, location, param.type);
+
+			// NOTE(review): the statement below is a comparison (==), not an assignment, so it has
+			// no effect and Settled is never cleared. Turning it into `=` would make later calls
+			// skip this uniform until it is set again, which changes what gets re-uploaded to other
+			// programs -- verify the intended behaviour before touching it.
+			if (!param.Constant)								// Unset used parameters 
+				UniformsIndex[i].Settled == false;
+		}
+		else if (start != 0 && location == -1 && param.Settled)					// No global variable
+			ZZLog::Error_Log("Warning! Unused, but set uniform %d, %s", location, param.ShName);
+	}
+	GL_REPORT_ERRORD();
+}
+
+// Point every sampler uniform in UniformsIndex[start .. finish-1] at its texture unit:
+// each non-float4 entry that exists in ZZshMainProgram gets glUniform1i(location, sampler).
+void PutSInProgam(int start, int finish) {
+	for (int i = start; i < finish; i++) {
+		ZZshParamInfo param = UniformsIndex[i];
+		GLint location = glGetUniformLocation(ZZshMainProgram, param.ShName);
+
+		if (location != -1 && param.type != ZZ_UNDEFINED) {
+			if (param.type != ZZ_FLOAT4) {
+				UNIFORM_ERROR_LOG("\tTry sampler %d %d %d %s %d...\t\t", i, location, param.type, param.ShName, param.sampler);
+				// Set the sampler first, then look at the error state, so that the Ok / error
+				// report is about this call.
+				glUniform1i(location, param.sampler);
+				if (glGetError() == GL_NO_ERROR)
+					UNIFORM_ERROR_LOG("Ok");
+				else
+					UNIFORM_ERROR_LOG("error!");
+			}
+		}
+	}
+	GL_REPORT_ERRORD();
+}
+
+// Report whether `Prog` is valid for the current GL state.
+// When the stored validation status is false the program is (re)validated, the driver's info log
+// is written to the error log, and the refreshed status is returned.
+bool ValidateProgram(ZZshProgram Prog) {
+	GLint isValid = GL_FALSE;
+	glGetProgramiv(Prog, GL_VALIDATE_STATUS, &isValid);
+
+	if (!isValid) {
+		glValidateProgram(Prog);
+		// GL_VALIDATE_STATUS describes the last validation: read it again so the return value
+		// reflects the validation that has just been done.
+		glGetProgramiv(Prog, GL_VALIDATE_STATUS, &isValid);
+
+		GLint infologlength = 0;
+		glGetProgramiv(Prog, GL_INFO_LOG_LENGTH, &infologlength);
+		const GLint bufferSize = (infologlength > 0) ? infologlength : 1;
+		char* InfoLog = new char[bufferSize];
+		InfoLog[0] = 0;				// stays a valid empty string if GL writes nothing
+		// The returned length is not needed, so NULL is passed instead of a pointer.
+		glGetProgramInfoLog(Prog, bufferSize, NULL, InfoLog);
+		ZZLog::Error_Log("Validation %d... %d:\t %s", Prog, infologlength, InfoLog);
+		delete[] InfoLog;
+	}
+	return (isValid != 0);
+}
+
+// Make ZZshMainProgram current and feed it: sampler units for vs and ps first, then the global
+// uniforms, then the vs and ps parameter ranges; finally the program is validated.
+// If binding the program raises a GL error, the program is unbound and nothing is uploaded.
+void PutParametersAndRun(VERTEXSHADER* vs, FRAGMENTSHADER* ps) {
+	UNIFORM_ERROR_LOG("Run program %s(%d) \t+\t%s(%d)", ShaderNames[vs->Shader], vs->Shader, ShaderNames[ps->Shader], ps->Shader);
+
+	glUseProgram(ZZshMainProgram);
+	const bool bindFailed = (glGetError() != GL_NO_ERROR);
+	if (bindFailed) {
+		ZZLog::Error_Log("Something weird happened on Linking stage.");
+		glUseProgram(0);
+		return;
+	}
+
+	// Samplers: vertex stage, then fragment stage.
+	PutSInProgam(vs->ParametersStart, vs->ParametersFinish);
+	PutSInProgam(ps->ParametersStart, ps->ParametersFinish);
+
+	// Values: globals, vertex stage, fragment stage.
+	PutParametersInProgam(0, NumGlobalUniforms);
+	PutParametersInProgam(vs->ParametersStart, vs->ParametersFinish);
+	PutParametersInProgam(ps->ParametersStart, ps->ParametersFinish);
+
+	ValidateProgram(ZZshMainProgram);
+	GL_REPORT_ERRORD();
+}
+
+// Attach the shaders of vs and ps (when their ids are non-zero) to ZZshMainProgram, link it and,
+// if the link succeeded, upload the parameters through PutParametersAndRun.
+// A failed link is logged and the function returns without running anything.
+void CreateAndRunMain(VERTEXSHADER* vs, FRAGMENTSHADER* ps) {
+	ZZLog::Error_Log("\n--->  New shader program %d, %s(%d) \t+\t%s(%d).", ZZshMainProgram, ShaderNames[vs->Shader], vs->Shader, ShaderNames[ps->Shader], ps->Shader);
+
+	if (vs->Shader != 0) glAttachShader(ZZshMainProgram, vs->Shader);
+	if (ps->Shader != 0) glAttachShader(ZZshMainProgram, ps->Shader);
+
+	glLinkProgram(ZZshMainProgram);
+
+	const bool linkedOk = GetLinkLog(ZZshMainProgram);
+	if (linkedOk) {
+		GL_REPORT_ERRORD();
+
+		PutParametersAndRun(vs, ps);
+		GL_REPORT_ERRORD();
+	}
+	else {
+		ZZLog::Error_Log("Main program linkage error, don't use any shader for this stage.");
+	}
+}
+
+// Decide whether vs and ps may be linked together.
+// A NULL vs is never compatible. A vs of type ZZ_SH_ZERO is accepted with any ps, even NULL.
+// Otherwise ps must be non-NULL and have the same ShaderType as vs.
+// NOTE(review): the original remark spoke of "ZeroPS" while the test is made on the vertex
+// shader's type -- confirm which shader was meant.
+inline bool ZZshCheckShaderCompatibility(VERTEXSHADER* vs, FRAGMENTSHADER* ps) {
+	return vs != NULL
+		&& (vs->ShaderType == ZZ_SH_ZERO
+			|| (ps != NULL && vs->ShaderType == ps->ShaderType));
+}
+
+// Activate the program made of vs and ps.
+// Incompatible pairs and pairs with a zero shader id are ignored. Programs are cached in
+// CompiledPrograms by the two shader ids: a cached program is reused, otherwise a new one is
+// created, remembered and linked through CreateAndRunMain.
+void ZZshSetShader(VERTEXSHADER* vs, FRAGMENTSHADER* ps) {
+	if (!ZZshCheckShaderCompatibility(vs, ps)) 				// We don't need to link uncompatible shaders
+		return;
+
+	const GLuint vss = (vs!=NULL)?vs->Shader:0;
+	const GLuint pss = (ps!=NULL)?ps->Shader:0;
+
+	if (vss == 0 || pss == 0)
+		return;
+
+	// CompiledPrograms is indexed by GL ids; ids beyond its size would read and write out of bounds.
+	if (vss >= (GLuint)MAX_ACTIVE_SHADERS || pss >= (GLuint)MAX_ACTIVE_SHADERS) {
+		ZZLog::Error_Log("Shader ids %d/%d do not fit the program cache.", vss, pss);
+		return;
+	}
+
+	if (CompiledPrograms[vss][pss] != 0 && glIsProgram(CompiledPrograms[vss][pss])) {
+		ZZshMainProgram = CompiledPrograms[vss][pss];
+		PutParametersAndRun(vs, ps);
+	}
+	else {
+		ZZshProgram NewProgram = glCreateProgram();
+		ZZshMainProgram = NewProgram;
+		CompiledPrograms[vss][pss] = NewProgram;
+		CreateAndRunMain(vs, ps) ;
+	}
+}
+
+// Remember `prog` as the current vertex shader link, then (re)select the program for the
+// current vertex/pixel pair. The links are cast back to the shader structures they point to.
+void ZZshSetVertexShader(ZZshShaderLink prog) {
+	g_vsprog = prog;
+	ZZshSetShader((VERTEXSHADER*)(g_vsprog.link), (FRAGMENTSHADER*)(g_psprog.link)) ;
+}
+
+// Remember `prog` as the current pixel shader link, then (re)select the program for the
+// current vertex/pixel pair.
+void ZZshSetPixelShader(ZZshShaderLink prog) {
+	g_psprog = prog;
+	ZZshSetShader((VERTEXSHADER*)(g_vsprog.link), (FRAGMENTSHADER*)(g_psprog.link)) ;
+}
+
+//------------------------------------------------------------------------------------------------------------------
+
+// A texture object can't be stored in a sampler directly, only through the GL_TEXTUREi units,
+// so each sampler uniform needs its fixed unit. Looks `name` up in TextureUnits; on the first
+// match the unit index and the matching TextureTypes entry are written into *param.
+// Names that are not samplers leave *param untouched.
+inline void SettleTextureUnit(ZZshParamInfo* param, const char* name) {
+	int unit = 0;
+	while (unit < NUMBER_OF_SAMPLERS && strcmp(TextureUnits[unit], name) != 0)
+		unit++;
+
+	if (unit == NUMBER_OF_SAMPLERS)
+		return;
+
+	param->sampler = unit;
+	param->type = TextureTypes[unit];
+}
+
+// Look `name` up in `prog`. When the uniform exists it gets the next free slot of UniformsIndex
+// (float4 by default, switched to a texture type when the name is a known sampler) and *param
+// receives the slot number. When it does not exist, or the table is full, *param is set to -1.
+// Returns the GL uniform location (-1 when the uniform is not found).
+inline int SetUniformParam(ZZshProgram prog, ZZshParameter* param, const char* name) {
+	GLint p = glGetUniformLocation(prog, name);
+	if (p > -1 && NumActiveUniforms < (int)MAX_ACTIVE_UNIFORMS) {
+		*param = NumActiveUniforms;
+		UniformsIndex[NumActiveUniforms] = ParamInfo(name, ZZ_FLOAT4, ZeroFloat4, -1, 0, false, false);		// By define Uniform is FLOAT4
+
+		SettleTextureUnit(&(UniformsIndex[NumActiveUniforms]), name);
+		UNIFORM_ERROR_LOG("uniform %s \t\t%d %d", name, p, UniformsIndex[NumActiveUniforms].type);
+
+		NumActiveUniforms++;
+	}
+	else {
+		// A found uniform can only land here because the table has no room left.
+		if (p > -1)
+			ZZLog::Error_Log("Uniform table is full, can't register %s.", name);
+		*param = -1;
+	}
+	return p;
+}
+
+// The three helpers below expect `prog`, `pf` and `p` to exist in the scope where they are used.
+
+// Register uniform `name` and store its slot in pf->var (-1 when the program has no such uniform).
+#define SET_UNIFORMPARAM(var, name) { \
+	p = SetUniformParam(prog, &(pf->var), name); \
+} 
+
+// Register sampler `name` as a constant and cache texture `tex` for it.
+// The slot is -1 when the program does not use the sampler; it must not be used as an index then.
+#define INIT_SAMPLERPARAM(tex, name) { \
+	ZZshParameter x; \
+	p = SetUniformParam(prog, &x, name); \
+	if (x > -1) \
+		(UniformsIndex[x]).Constant = true; \
+	ZZshGLSetTextureParameter(pf->prog, x, tex, name); \
+}
+
+// Register uniform `name` as a constant and cache the float4 value `var` for it.
+// The slot is -1 when the program does not use the uniform; it must not be used as an index then.
+#define INIT_UNIFORMPARAM(var, name) { \
+	ZZshParameter x; \
+	p = SetUniformParam(prog, &x, name); \
+	if (x > -1) \
+		(UniformsIndex[x]).Constant = true; \
+	ZZshSetParameter4fv(pf->prog, x, var, name); \
+} 
+
+// Return a newly allocated string "<name>[<context * NOCONTEXT>]".
+// The buffer holds MAX_UNIFORM_NAME_SIZE bytes and is owned by the caller; a result that would
+// not fit is truncated (and logged) instead of overrunning the buffer.
+char* AddContextToName(const char* name, int context) {
+	char* newname = new char[MAX_UNIFORM_NAME_SIZE];
+	const int written = snprintf(newname, MAX_UNIFORM_NAME_SIZE, "%s[%d]", name, context * NOCONTEXT);
+	if (written < 0 || written >= MAX_UNIFORM_NAME_SIZE)
+		ZZLog::Error_Log("Uniform name %s[%d] is too long, truncated.", name, context * NOCONTEXT);
+	return newname;
+}
+
+// Make the program for fragment shader `pf` current and register every uniform/sampler it uses.
+//   pf      - shader record; prog.link, prog.isFragment, ShaderType and the
+//             ParametersStart/ParametersFinish range are filled in here
+//   context - passed to AddContextToName to build the indexed ("name[i]") per-context uniform names
+//   type    - selects the second lookup texture: 3 -> g_sConv16to32, 4 -> g_sConv32to16,
+//             anything else -> g_sBilinearBlocks
+// The order of the SET_/INIT_ macros below determines the UniformsIndex slot of each uniform.
+// NOTE(review): the strings returned by AddContextToName are new[]-allocated and never freed in
+// this function -- presumably the uniform table keeps the pointer; confirm.
+void SetupFragmentProgramParameters(FRAGMENTSHADER* pf, int context, int type)
+{
+	// uniform parameters
+	GLint p;
+	pf->prog.link = (void*)pf;			// Setting autolink
+	pf->prog.isFragment = true;			// Setting autolink
+	pf->ShaderType = ShaderTypes[pf->Shader];
+
+	pf->ParametersStart = NumActiveUniforms;
+	ZZshProgram prog = madeProgram(pf->Shader, 0, "");
+	glUseProgram(prog);
+	GL_REPORT_ERRORD();
+
+	SET_UNIFORMPARAM(sOneColor, "g_fOneColor");
+	SET_UNIFORMPARAM(sBitBltZ, "g_fBitBltZ");
+	SET_UNIFORMPARAM(sInvTexDims, "g_fInvTexDims");
+	SET_UNIFORMPARAM(fTexAlpha2, AddContextToName("fTexAlpha2", context));
+	SET_UNIFORMPARAM(fTexOffset, AddContextToName("g_fTexOffset", context));
+	SET_UNIFORMPARAM(fTexDims, AddContextToName("g_fTexDims", context));
+	SET_UNIFORMPARAM(fTexBlock, AddContextToName("g_fTexBlock", context));
+	SET_UNIFORMPARAM(fClampExts, AddContextToName("g_fClampExts",  context)); 		// FIXME: There is a bug, that lead FFX-1 to incorrect CLAMP if this uniform have context.
+	SET_UNIFORMPARAM(fTexWrapMode, AddContextToName("TexWrapMode", context));
+	SET_UNIFORMPARAM(fRealTexDims, AddContextToName("g_fRealTexDims", context));
+	SET_UNIFORMPARAM(fTestBlack, AddContextToName("g_fTestBlack", context));
+	SET_UNIFORMPARAM(fPageOffset, AddContextToName("g_fPageOffset", context));
+	SET_UNIFORMPARAM(fTexAlpha, AddContextToName("fTexAlpha", context));
+	GL_REPORT_ERRORD();
+
+	// textures
+	INIT_SAMPLERPARAM(ptexBlocks, "g_sBlocks");
+	if (type == 3) 
+		{INIT_SAMPLERPARAM(ptexConv16to32, "g_sConv16to32");}
+	else if (type == 4) 
+		{INIT_SAMPLERPARAM(ptexConv32to16, "g_sConv32to16");}
+	else 
+		{INIT_SAMPLERPARAM(ptexBilinearBlocks, "g_sBilinearBlocks");}
+	GL_REPORT_ERRORD();
+
+	SET_UNIFORMPARAM(sMemory, AddContextToName("g_sMemory", context));
+	SET_UNIFORMPARAM(sFinal, "g_sSrcFinal");
+	SET_UNIFORMPARAM(sBitwiseANDX, "g_sBitwiseANDX");
+	SET_UNIFORMPARAM(sBitwiseANDY, "g_sBitwiseANDY");
+	SET_UNIFORMPARAM(sCLUT, "g_sCLUT");
+	SET_UNIFORMPARAM(sInterlace, "g_sInterlace");
+	GL_REPORT_ERRORD();
+
+	// set global shader constants
+	INIT_UNIFORMPARAM(float4(0.5f, (conf.settings().exact_color)?0.9f/256.0f:0.5f/256.0f, 0,1/255.0f), "g_fExactColor");
+	INIT_UNIFORMPARAM(float4(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f ), "g_fBilinear");
+	INIT_UNIFORMPARAM(float4(1.0f/256.0f, 1.0004f, 1, 0.5f), "g_fZBias");
+	INIT_UNIFORMPARAM(float4(0,1, 0.001f, 0.5f), "g_fc0");
+	INIT_UNIFORMPARAM(float4(1/1024.0f, 0.2f/1024.0f, 1/128.0f, 1/512.0f), "g_fMult");
+	pf->ParametersFinish = NumActiveUniforms;
+	// This limit check only reports: it runs after all slots above have already been written.
+	if (NumActiveUniforms > MAX_ACTIVE_UNIFORMS)
+		ZZLog::Error_Log("Too many shader variables. You may increase the limit in source %d.", NumActiveUniforms);
+		
+	glUseProgram(0);
+	GL_REPORT_ERRORD();
+}
+
+// Make the program for vertex shader `pf` current and register every uniform it uses.
+//   pf      - shader record; prog.link, prog.isFragment, ShaderType and the
+//             ParametersStart/ParametersFinish range are filled in here
+//   context - not referenced in this function body
+// Also writes the g_vdepth and vlogz values according to conf.settings().no_logz.
+// The order of the INIT_/SET_ macros below determines the UniformsIndex slot of each uniform.
+void SetupVertexProgramParameters(VERTEXSHADER* pf, int context)
+{
+	GLint p;
+	pf->prog.link = (void*)pf;			// Setting autolink
+	pf->prog.isFragment = false;			// Setting autolink
+	pf->ShaderType = ShaderTypes[pf->Shader];
+
+	pf->ParametersStart = NumActiveUniforms;
+
+	ZZshProgram prog = madeProgram(pf->Shader, 0, "");
+	glUseProgram(prog);
+
+	GL_REPORT_ERRORD();
+
+	// Set Z-test, log or no log;
+	if (conf.settings().no_logz) {
+       		g_vdepth = float4( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
+		vlogz = float4( 1.0f, 0.0f, 0.0f, 0.0f);
+	}
+	else {
+		g_vdepth = float4( 256.0f*65536.0f, 65536.0f, 256.0f, 65536.0f*65536.0f);	
+		vlogz = float4( 0.0f, 1.0f, 0.0f, 0.0f);
+	}
+
+	INIT_UNIFORMPARAM(g_vdepth, "g_fZ");
+	// g_fZMin is only looked up when g_fZ was found; a missing g_fZMin is reported as an
+	// outdated shader file.
+	if (p > -1) {
+		INIT_UNIFORMPARAM(vlogz, "g_fZMin");
+		if (p == -1)  ZZLog::Error_Log ("Shader file version is outdated! Only log-Z is possible.");
+	}
+	GL_REPORT_ERRORD();
+
+	float4 vnorm = float4(g_filog32, 0, 0,0);
+	INIT_UNIFORMPARAM(vnorm, "g_fZNorm");
+	INIT_UNIFORMPARAM(float4(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f ),  "g_fBilinear");
+	INIT_UNIFORMPARAM(float4(1.0f/256.0f, 1.0004f, 1, 0.5f),  "g_fZBias") ;
+	INIT_UNIFORMPARAM(float4(0,1, 0.001f, 0.5f), "g_fc0");
+
+	SET_UNIFORMPARAM(sBitBltPos, "g_fBitBltPos");
+	SET_UNIFORMPARAM(sBitBltTex, "g_fBitBltTex");
+	SET_UNIFORMPARAM(fBitBltTrans, "g_fBitBltTrans");
+	pf->ParametersFinish = NumActiveUniforms;
+	// This limit check only reports: it runs after all slots above have already been written.
+	if (NumActiveUniforms > MAX_ACTIVE_UNIFORMS)
+		ZZLog::Error_Log("Too many shader variables. You may increase the limit in the source.");
+
+	glUseProgram(0);
+	GL_REPORT_ERRORD();
+}
+
+// Value written on the #version line of every shader (original note: sampler2DRect appears in 1.3).
+const int GLSL_VERSION = 130;
+
+// Shaders are always compiled from source for GLSL.
+// Writes the common header into header_string: the #version line, a #define that maps the
+// entry point `name` onto main, and the extra preprocessor text passed in `depth`.
+// header_string must be large enough for the result; no bound is checked here.
+static __forceinline void GlslHeaderString(char* header_string, const char* name, const char* depth)
+{
+	sprintf(header_string,
+		"#version %d\n"
+		"#define %s main\n"
+		"%s\n",
+		GLSL_VERSION, name, depth);
+}
+
+// Compile vertex shader entry point `name` into `vertex` and register its uniform parameters.
+//   DefineString - scratch buffer that receives the generated preamble (GLSL header plus the
+//                  VERTEX_SHADER and CTX defines); must be large enough to hold it
+//   vertex       - shader record to fill. Taken by reference: LoadShaderFromFile writes the
+//                  handle into vertex.Shader and SetupVertexProgramParameters stores the record's
+//                  own address in prog.link, so working on a by-value copy would lose the
+//                  result and leave prog.link pointing at a dead stack object.
+//   shaderver    - unused
+//   context      - context number, multiplied by NOCONTEXT for the CTX define
+//   depth        - extra preprocessor text appended to the header ("" for none)
+// Returns the result of LoadShaderFromFile.
+static __forceinline bool LOAD_VS(char* DefineString, const char* name, VERTEXSHADER& vertex, int shaderver, ZZshProfile context, const char* depth)
+{
+	char temp[200];
+	GlslHeaderString(temp, name, depth);
+	sprintf(DefineString, "%s#define VERTEX_SHADER 1\n#define CTX %d\n", temp, context * NOCONTEXT);
+	//ZZLog::WriteLn("Define for VS == '%s'", DefineString);
+	const bool flag = LoadShaderFromFile(vertex.Shader, DefineString, name, GL_VERTEX_SHADER);
+	SetupVertexProgramParameters(&vertex, context);
+	return flag;
+}
+
+// Compile fragment shader entry point `name` into `fragment` and register its uniform parameters.
+//   DefineString - scratch buffer that receives the generated preamble (GLSL header plus the
+//                  FRAGMENT_SHADER and CTX defines); must be large enough to hold it
+//   fragment     - shader record to fill. Taken by reference: LoadShaderFromFile writes the
+//                  handle into fragment.Shader and SetupFragmentProgramParameters stores the
+//                  record's own address in prog.link, so working on a by-value copy would lose
+//                  the result and leave prog.link pointing at a dead stack object.
+//   shaderver    - unused
+//   context      - context number, multiplied by NOCONTEXT for the CTX define
+//   depth        - extra preprocessor text appended to the header ("" for none)
+// Returns the result of LoadShaderFromFile.
+static __forceinline bool LOAD_PS(char* DefineString, const char* name, FRAGMENTSHADER& fragment, int shaderver, ZZshProfile context, const char* depth)
+{
+	char temp[200];
+	GlslHeaderString(temp, name, depth);
+	sprintf(DefineString, "%s#define FRAGMENT_SHADER 1\n#define CTX %d\n", temp, context * NOCONTEXT);
+	//ZZLog::WriteLn("Define for PS == '%s'", DefineString);
+	const bool flag = LoadShaderFromFile(fragment.Shader, DefineString, name, GL_FRAGMENT_SHADER);
+	SetupFragmentProgramParameters(&fragment, context, 0);
+	return flag;
+}
+
+// Release the program of every entry in ppsTexture; in non-debug builds the whole table is
+// zeroed afterwards. Always returns true.
+inline bool LoadEffects()
+{
+	const u32 total = ArraySize(ppsTexture);
+	for (u32 slot = 0; slot < total; slot++) {
+		SAFE_RELEASE_PROG(ppsTexture[slot].prog);
+	}
+
+#ifndef _DEBUG
+	memset(ppsTexture, 0, sizeof(ppsTexture));
+#endif
+
+	return true;
+}
+
+// Compile the fixed set of shaders: the 16 regular/texture/fog vertex shader variants, BitBltVS,
+// and the regular, bitblt, CRTC, zero, base-texture and conversion fragment shaders.
+// bLoadSuccess is sticky: once any load has failed, every later `if (!bLoadSuccess)` fallback
+// triggers as well. The function returns true regardless of bLoadSuccess; failures are only
+// visible through the log, conf.mrtdepth and g_bCRTCBilinear.
+bool ZZshLoadExtraEffects() {
+	bool bLoadSuccess = true;
+	char DefineString[DEFINE_STRING_SIZE] = "";
+	const char* writedepth = "#define WRITE_DEPTH 1\n";	// should we write depth field
+
+
+	const char* pvsshaders[4] = { "RegularVS", "TextureVS", "RegularFogVS", "TextureFogVS" };
+
+	// Slots 0-7: both contexts without depth write; slots 8-15: the same with WRITE_DEPTH.
+	for (int i = 0; i < 4; ++i) {
+		if (!LOAD_VS(DefineString, pvsshaders[i], pvsStore[2 * i], cgvProf, 0, "")) bLoadSuccess = false;
+		if (!LOAD_VS(DefineString, pvsshaders[i], pvsStore[2 *i + 1 ], cgvProf, 1, "")) bLoadSuccess = false;
+		if (!LOAD_VS(DefineString, pvsshaders[i], pvsStore[2 *i + 8 ], cgvProf, 0, writedepth)) bLoadSuccess = false;
+		if (!LOAD_VS(DefineString, pvsshaders[i], pvsStore[2 *i + 8 + 1], cgvProf, 1, writedepth)) bLoadSuccess = false;
+	}
+	for (int i = 0; i < 16; ++i)
+		pvs[i] = pvsStore[i].prog;
+
+	if (!LOAD_VS(DefineString, "BitBltVS", pvsBitBlt, cgvProf, 0, "")) bLoadSuccess = false;
+	GL_REPORT_ERRORD();
+
+	if (!LOAD_PS(DefineString, "RegularPS", ppsRegular[0], cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "RegularFogPS", ppsRegular[1], cgfProf, 0, "")) bLoadSuccess = false;
+
+	if( conf.mrtdepth ) {
+		if (!LOAD_PS(DefineString, "RegularPS", ppsRegular[2], cgfProf, 0, writedepth)) bLoadSuccess = false;
+		if (!bLoadSuccess) conf.mrtdepth = 0;
+
+		if (!LOAD_PS(DefineString, "RegularFogPS", ppsRegular[3], cgfProf, 0, writedepth)) bLoadSuccess = false;
+		if (!bLoadSuccess) conf.mrtdepth = 0;
+	}
+
+	if (!LOAD_PS(DefineString, "BitBltPS", ppsBitBlt[0], cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "BitBltAAPS", ppsBitBlt[1], cgfProf, 0, "")) bLoadSuccess = false;
+	if (!bLoadSuccess) {
+		ZZLog::Error_Log("Failed to load BitBltAAPS, using BitBltPS.");
+		if (!LOAD_PS(DefineString, "BitBltPS", ppsBitBlt[1], cgfProf, 0, "")) bLoadSuccess = false;
+	}
+
+	if (!LOAD_PS(DefineString, "BitBltDepthPS", ppsBitBltDepth, cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "CRTCTargPS", ppsCRTCTarg[0], cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "CRTCTargInterPS", ppsCRTCTarg[1], cgfProf, 0, "")) bLoadSuccess = false;
+
+	g_bCRTCBilinear = true;
+	if (!LOAD_PS(DefineString, "CRTCPS", ppsCRTC[0], cgfProf, 0, "")) bLoadSuccess = false;
+	if( !bLoadSuccess ) {
+		// switch to simpler
+		g_bCRTCBilinear = false;
+		if (!LOAD_PS(DefineString, "CRTCPS_Nearest", ppsCRTC[0], cgfProf, 0, "")) bLoadSuccess = false;
+		// The interlaced variant belongs in slot 1 (as in the bilinear branch below); loading
+		// it into slot 0 would overwrite CRTCPS_Nearest and leave slot 1 empty.
+		if (!LOAD_PS(DefineString, "CRTCInterPS_Nearest", ppsCRTC[1], cgfProf, 0, "")) bLoadSuccess = false;
+	}
+	else {
+		if (!LOAD_PS(DefineString, "CRTCInterPS", ppsCRTC[1], cgfProf, 0, "")) bLoadSuccess = false;
+	}
+
+	if( !bLoadSuccess )
+		ZZLog::Error_Log("Failed to create CRTC shaders.");
+
+	// if (!LOAD_PS(DefineString, "CRTC24PS", ppsCRTC24[0], cgfProf, 0, "")) bLoadSuccess = false;
+	// if (!LOAD_PS(DefineString, "CRTC24InterPS", ppsCRTC24[1], cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "ZeroPS", ppsOne, cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "BaseTexturePS", ppsBaseTexture, cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "Convert16to32PS", ppsConvert16to32, cgfProf, 0, "")) bLoadSuccess = false;
+	if (!LOAD_PS(DefineString, "Convert32to16PS", ppsConvert32to16, cgfProf, 0, "")) bLoadSuccess = false;
+
+	GL_REPORT_ERRORD();
+	return true;
+}
+
+// Preprocessor line selecting the ps2addr() variant, indexed by texwrap.
+const static char* g_pPsTexWrap[] = { "#define REPEAT 1\n", "#define CLAMP 1\n", "#define REGION_REPEAT 1\n", "" };
+
+// Compile the texture fragment shader "Texture[Fog]<texfilter>_<type name>PS" with the defines
+// that correspond to the given options and return the compiled shader, or the result of
+// UseEmptyShader when compilation fails.
+//   srcdir, srcfile - unused
+//   type            - index into g_pTexTypes (asserted < NUM_TYPES)
+//   texwrap         - index into g_pPsTexWrap (asserted < NUM_TEXWRAPS)
+//   fog, writedepth, testaem, exactcolor - non-zero enables the matching variant/define
+//   ps              - ACCURATE_DECOMPRESSION is defined when SHADER_ACCURATE is set in it
+//   context         - multiplied by NOCONTEXT for the CTX define
+// NOTE(review): `name` is new[]-allocated and never freed here -- presumably CompileShader /
+// UseEmptyShader keep the pointer; confirm before changing the allocation.
+static ZZshShader LoadShaderFromType(const char* srcdir, const char* srcfile, int type, int texfilter, int texwrap, int fog, int writedepth, int testaem, int exactcolor, int ps, int context) {
+
+	assert( texwrap < NUM_TEXWRAPS);
+	assert( type < NUM_TYPES );
+	//ZZLog::Error_Log("\n");
+
+	char* name = new char[MAX_SHADER_NAME_SIZE];
+	sprintf(name, "Texture%s%d_%sPS", fog?"Fog":"", texfilter, g_pTexTypes[type]);
+
+	ZZLog::Debug_Log("Starting shader for %s", name);
+
+	const char* AddWrap 	= g_pPsTexWrap[texwrap];
+	const char* AddDepth 	= writedepth?"#define WRITE_DEPTH 1\n":"";
+	const char* AddAEM	= testaem?"#define TEST_AEM 1\n":"";
+	const char* AddExcolor	= exactcolor?"#define EXACT_COLOR 1\n":"";
+	const char* AddAccurate  = (ps & SHADER_ACCURATE)?"#define ACCURATE_DECOMPRESSION 1\n":"";
+	char DefineString[DEFINE_STRING_SIZE] = "";
+	char temp[200];
+	// The wrap define goes into the common header; the remaining defines follow it.
+	GlslHeaderString(temp, name, AddWrap);
+	sprintf(DefineString, "%s#define FRAGMENT_SHADER 1\n%s%s%s%s\n#define CTX %d\n", temp, AddDepth, AddAEM, AddExcolor, AddAccurate, context * NOCONTEXT);
+
+	ZZshShader shader;
+	if (!CompileShader(shader, DefineString, name, GL_FRAGMENT_SHADER))
+		return UseEmptyShader(name, GL_FRAGMENT_SHADER);
+
+	ZZLog::Debug_Log("Used shader for type:%d filter:%d wrap:%d for:%d depth:%d aem:%d color:%d decompression:%d ctx:%d... Ok \n", type, texfilter, texwrap, fog, writedepth, testaem, exactcolor, ps, context);
+
+	GL_REPORT_ERRORD();
+	return shader;
+}
+
+// Return the texture fragment shader for the given option combination from the ppsTexture
+// table, compiling it and registering its parameters on first use.
+//   clamp    - wms/wmt wrap modes, mapped onto one TEXWRAP_* shader variant
+//   pbFailed - optional; set to false on entry and to true when creation or setup fails
+// Returns NULL when the shader could not be created. When the shader exists but a GL error is
+// pending after parameter setup, *pbFailed is set and the shader is still returned.
+FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed)
+{
+	assert( texfilter < NUM_FILTERS );
+	//assert( g_nPixelShaderVer == SHADER_30 );
+
+	// Mixed modes fall back to repeat/clamp unless one of them is region repeat (3).
+	int texwrap = TEXWRAP_REPEAT_CLAMP;
+	if (clamp.wms == clamp.wmt) {
+		if (clamp.wms == 0)
+			texwrap = TEXWRAP_REPEAT;
+		else if (clamp.wms == 1 || clamp.wms == 2)
+			texwrap = TEXWRAP_CLAMP;
+		else
+			texwrap = TEXWRAP_REGION_REPEAT;
+	}
+	else if (clamp.wms == 3 || clamp.wmt == 3) {
+		texwrap = TEXWRAP_REGION_REPEAT;
+	}
+
+	int index = GET_SHADER_INDEX(type, texfilter, texwrap, fog, s_bWriteDepth, testaem, exactcolor, context, 0);
+
+	if (pbFailed != NULL)
+		*pbFailed = false;
+
+	FRAGMENTSHADER* pf = ppsTexture+index;
+
+	// Already compiled earlier: reuse it.
+	if (ZZshExistProgram(pf))
+		return pf;
+
+	pf->Shader = LoadShaderFromType(EFFECT_DIR, EFFECT_NAME, type, texfilter, texwrap, fog, s_bWriteDepth, testaem, exactcolor, g_nPixelShaderVer, context);
+
+	if (!ZZshExistProgram(pf)) {
+		ZZLog::Error_Log("Failed to create shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+		if (pbFailed != NULL)
+			*pbFailed = true;
+
+		GL_REPORT_ERRORD();
+		return NULL;
+	}
+
+	SetupFragmentProgramParameters(pf, context, type);
+	GL_REPORT_ERRORD();
+
+	if (glGetError() != GL_NO_ERROR) {
+		ZZLog::Error_Log("Failed to load shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+		if (pbFailed != NULL)
+			*pbFailed = true;
+	}
+
+	return pf;
+}
+
+#endif // GLSL_API
--- a/plugins/zzogl-pg/opengl/ps2hw.glsl
+++ b/plugins/zzogl-pg/opengl/ps2hw.glsl
@ -0,0 +1,812 @@
+// GLSL shaders for PS2 GS emulation
+
+// divides by z for every pixel, instead of in vertex shader
+// fixes kh textures
+
+// sampler2DRect comes from the rectangle-texture extension; GLSL extension names carry the
+// GL_ prefix, so the unprefixed name would not match the extension.
+#extension GL_ARB_texture_rectangle: enable
+#define GL_compatibility_profile 1
+// divide the texture coordinate by z per pixel (see TEX_XY)
+#define PERSPECTIVE_CORRECT_TEX
+
+// When writing GLSL code, the Cg semantics have to be replaced by the GLSL built-in variables listed below.
+// Note that the in and out variables differ!
+// in POSITION  	set by glVertexPointer		goes 	to gl_Vertex;
+// out POSITION						goes    to gl_position
+// in COLOR0							gl_Color
+// out COLOR0							gl_FrontColor
+// in TEXCOORD0							gl_MultiTexCoord0
+// out TEXCOORD0						gl_TexCoord[0]
+
+//in Fragments:
+// in TEXCOORD0							gl_TexCoord[0]
+// out COLOR0							gl_FragData[0]
+
+//#define TEST_AEM // tests AEM for black pixels
+//#define REGION_REPEAT // set if texture wrapping mode is region repeat
+//#define WRITE_DEPTH // set if depth is also written in a MRT
+//#define ACCURATE_DECOMPRESSION // set for less capable hardware ATI Radeon 9000 series
+//#define EXACT_COLOR	// make sure the output color is clamped to 1/255 boundaries (for alpha testing)
+
+// TEX_XY is the 2D coordinate taken from a shader's `tex` argument: divided by tex.z when
+// PERSPECTIVE_CORRECT_TEX is defined, plain tex.xy otherwise. TEX_DECL (the argument type) is
+// vec4 in both branches.
+#ifdef PERSPECTIVE_CORRECT_TEX
+#define TEX_XY tex.xy/tex.z
+#define TEX_DECL vec4
+#else
+#define TEX_XY tex.xy
+#define TEX_DECL vec4
+#endif
+
+// DOZWRITE(x) keeps x only when WRITE_DEPTH is defined; otherwise it expands to nothing.
+#ifdef WRITE_DEPTH
+#define DOZWRITE(x) x
+#else
+#define DOZWRITE(x)
+#endif
+
+// NVidia CG-data types
+// (half and float variants both map onto the same GLSL vector types)
+#define half2 vec2
+#define half3 vec3
+#define half4 vec4
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+
+// Uniform declarations. The two-element arrays are indexed with the CTX define throughout this
+// file.
+
+// main ps2 memory, each pixel is stored in 32bit color
+uniform sampler2DRect g_sMemory[2];
+
+// per context pixel shader constants
+uniform half4 fTexAlpha2[2];
+
+uniform float4 g_fTexOffset[2];			// converts the page and block offsets into the mem addr/1024
+uniform float4 g_fTexDims[2];				// mult by tex dims when accessing the block texture
+uniform float4 g_fTexBlock[2];
+
+uniform float4 g_fClampExts[2];		// if clamping the texture, use (minu, minv, maxu, maxv)
+uniform float4 TexWrapMode[2];			// 0 - repeat/clamp, 1 - region rep (use fRegRepMask)
+
+uniform float4 g_fRealTexDims[2]; // tex dims used for linear filtering (w,h,1/w,1/h)
+
+// (alpha0, alpha1, 1 if highlight2 and tcc is rgba, 1-y)
+uniform half4 g_fTestBlack[2];	// used for aem bit
+
+uniform float4 g_fPageOffset[2];
+
+uniform half4 fTexAlpha[2];
+
+// vertex shader constants
+uniform float4 g_fPosXY[2];
+
+// used to get the tiled offset into a page given the linear offset
+uniform sampler2DRect g_sSrcFinal;
+uniform sampler2D g_sBlocks;
+uniform sampler2D g_sBilinearBlocks;
+uniform sampler2D g_sConv16to32;
+uniform sampler3D g_sConv32to16;
+uniform sampler2DRect g_sBitwiseANDX;
+uniform sampler2DRect g_sBitwiseANDY;
+uniform sampler2DRect g_sInterlace;
+
+// used only on rare cases where the render target is PSMT8H
+uniform sampler2D g_sCLUT;
+
+// global pixel shader constants
+uniform float4 g_fInvTexDims; // similar to g_fClutOff
+uniform float4 g_fFogColor;
+
+// used for rectblitting
+uniform float4 g_fBitBltZ;
+
+uniform half4 g_fOneColor; // col*.xxxy+.zzzw
+
+// vertex shader constants
+uniform float4 g_fBitBltPos;
+uniform float4 g_fZ;				// transforms d3dcolor z into float z
+uniform float4 g_fZNorm;
+uniform float4 g_fZMin = float4(0.0f, 1.0f, 0.0f, 0.0f);
+uniform float4 g_fBitBltTex;
+
+// pixel shader consts
+// .z is used for the addressing fn
+// The initializers below are only the in-shader defaults.
+uniform half4 g_fExactColor = half4(0.5,0.5/256.0f,0,1/255.0f);
+uniform float4 g_fBilinear = float4(-0.7f, -0.65f, 0.9,1/32767.0f);
+uniform float4 g_fZBias = half4(1.0f/256.0f, 1.0004f, 1, 0.5); // also for vs
+uniform float4 g_fc0 = float4(0,1, 0.001, 0.5f); // also for vs
+uniform float4 g_fMult = float4(1/1024.0f, 0.2f/1024.0f, 1/128.0f, 1/512.0f);
+
+// vertex shader consts
+uniform float4 g_fBitBltTrans = float4(0.5f, -0.5f, 0.5, 0.5 + 0.4/416.0f);
+
+// given a local tex coord, returns the coord in the memory
+//   realtex - local texture coordinate
+//   returns - 2D coordinate used to sample g_sMemory[CTX]
+// Steps: scale/offset by g_fTexDims, split into a whole part and the fraction inside it, look
+// the fraction up in g_sBlocks, combine both through g_fTexOffset into one linear value, then
+// split that value into fraction/integer and scale them with g_fPageOffset.
+float2 ps2memcoord(float2 realtex)
+{
+	float4 off;
+
+	// block off
+	realtex.xy = realtex.xy * g_fTexDims[CTX].xy + g_fTexDims[CTX].zw;
+	realtex.xy = (realtex.xy - fract(realtex.xy)) * g_fMult.zw;
+	float2 fblock = fract(realtex.xy);
+	off.xy = realtex.xy-fblock.xy;
+
+#ifdef ACCURATE_DECOMPRESSION
+	// Variant that keeps the fractional and whole parts separate while summing the terms.
+	off.zw = texture(g_sBlocks, g_fTexBlock[CTX].xy*fblock + g_fTexBlock[CTX].zw).ar;
+	off.x = dot(off.xy, g_fTexOffset[CTX].xy); 
+	float r = g_fTexOffset[CTX].w;
+	float f = fract(off.x);
+	float fadd = g_fTexOffset[CTX].z * off.z;
+	off.w = off.x + fadd + r;
+	off.x = fract(f + fadd + r);
+	off.w -= off.x ;
+#else
+	off.z = texture(g_sBlocks, g_fTexBlock[CTX].xy*fblock + g_fTexBlock[CTX].zw).a;
+
+	// combine the two
+    off.x = dot(off.xyz, g_fTexOffset[CTX].xyz)+g_fTexOffset[CTX].w;
+	// modf: off.x keeps the fractional part, off.w receives the integer part
+	off.x = modf(off.x, off.w);
+#endif
+
+	off.xy = off.xw * g_fPageOffset[CTX].zy + g_fPageOffset[CTX].wx;
+    //off.y = off.w * g_fPageOffset[CTX].y + g_fPageOffset[CTX].x;
+	return off.xy;
+}
+
+// find all texcoords for bilinear filtering
+// assume that orgtex are already on boundaries
+//   orgtex - two local coordinates packed as (x0, y0, x1, y1)
+//   off0   - out: memory coordinates for (x0,y0) in .xy and (x1,y0) in .zw
+//   off1   - out: memory coordinates for (x0,y1) in .xy and (x1,y1) in .zw
+void ps2memcoord4(float4 orgtex, out float4 off0, out float4 off1)
+{
+	//float4 off0, off1, off2, off3;
+	float4 realtex;
+
+	// block off
+	realtex = (orgtex * g_fTexDims[CTX].xyxy + g_fTexDims[CTX].zwzw);// * g_fMult.zwzw;
+	float4 fblock = fract(realtex.xyzw);
+	float4 ftransblock = g_fTexBlock[CTX].xyxy*fblock + g_fTexBlock[CTX].zwzw;
+	realtex -= fblock;
+
+	// one linear value per corner, in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1)
+	float4 transvals = g_fTexOffset[CTX].x * realtex.xzxz + g_fTexOffset[CTX].y * realtex.yyww + g_fTexOffset[CTX].w;
+
+	float4 colors;// = texture(g_sBilinearBlocks, ftransblock.xy);
+
+	// this is faster on ffx ingame
+	colors.x = texture(g_sBlocks, ftransblock.xy).a;
+	colors.y = texture(g_sBlocks, ftransblock.zy).a;
+	colors.z = texture(g_sBlocks, ftransblock.xw).a;
+	colors.w = texture(g_sBlocks, ftransblock.zw).a;
+
+	float4 fr, rem;
+
+#ifdef ACCURATE_DECOMPRESSION
+	fr = fract(transvals);
+	float4 fadd = colors * g_fTexOffset[CTX].z;
+	rem = transvals + fadd;
+	fr = fract(fr + fadd);
+	rem -= fr;
+#else
+	transvals += colors * g_fTexOffset[CTX].z;
+
+	// modf: fr keeps the fractional parts, rem receives the integer parts
+	fr = modf(transvals, rem);
+#endif
+
+	rem = rem * g_fPageOffset[CTX].y + g_fPageOffset[CTX].x;
+	fr = fr * g_fPageOffset[CTX].z + g_fPageOffset[CTX].w;
+
+	// combine
+	// g_fc0.yxyx / g_fc0.xyxy act as masks that interleave fr and rem into (fr, rem) pairs
+	off0 = g_fc0.yxyx * fr.xxyy + g_fc0.xyxy * rem.xxyy;
+	off1 = g_fc0.yxyx * fr.zzww + g_fc0.xyxy * rem.zzww;
+}
+
+// Variant of ps2memcoord4 with the same parameters. Differences visible in the code: the four
+// block values come from a single fetch of g_sBilinearBlocks (using only the first corner's
+// block fraction), and g_fPageOffset[CTX].w is not added to fr.
+void ps2memcoord4_fast(float4 orgtex, out float4 off0, out float4 off1)
+{
+	float4 realtex;
+
+	realtex = (orgtex * g_fTexDims[CTX].xyxy + g_fTexDims[CTX].zwzw);// * g_fMult.zwzw;
+	float4 fblock = fract(realtex.xyzw);
+	float2 ftransblock = g_fTexBlock[CTX].xy*fblock.xy + g_fTexBlock[CTX].zw;
+	realtex -= fblock;
+
+	float4 transvals = g_fTexOffset[CTX].x * realtex.xzxz + g_fTexOffset[CTX].y * realtex.yyww + g_fTexOffset[CTX].w;
+
+	float4 colors = texture(g_sBilinearBlocks, ftransblock.xy);
+	float4 fr, rem;
+
+#ifdef ACCURATE_DECOMPRESSION
+	fr = fract(transvals);
+	float4 fadd = colors * g_fTexOffset[CTX].z;
+	rem = transvals + fadd;
+	fr = fract(fr + fadd);
+	rem -= fr;
+#else
+	transvals += colors * g_fTexOffset[CTX].z;
+
+	// modf: fr keeps the fractional parts, rem receives the integer parts
+	fr = modf(transvals, rem);
+#endif
+
+	rem = rem * g_fPageOffset[CTX].y + g_fPageOffset[CTX].x;
+	fr = fr * g_fPageOffset[CTX].z;
+
+	off0 = g_fc0.yxyx * fr.xxyy + g_fc0.xyxy * rem.xxyy;
+	off1 = g_fc0.yxyx * fr.zzww + g_fc0.xyxy * rem.zzww;
+}
+
+// Wrapping modes
+// Exactly one ps2addr() is compiled, chosen by the REPEAT / CLAMP / REGION_REPEAT define;
+// when none of them is defined the clamp-then-fract fallback at the end is used.
+#if defined(REPEAT)
+
+// repeat: keep only the fractional part
+float2 ps2addr(float2 coord)
+{
+	float2 wrapped = fract(coord.xy);
+	return wrapped;
+}
+
+#elif defined(CLAMP)
+
+// clamp to the g_fClampExts extents
+float2 ps2addr(float2 coord)
+{
+	float2 lower = g_fClampExts[CTX].xy;
+	float2 upper = g_fClampExts[CTX].zw;
+	return clamp(coord.xy, lower, upper);
+}
+
+#elif defined(REGION_REPEAT)
+
+// computes the local tex coord along with addressing modes
+float2 ps2addr(float2 coord)
+{
+	float2 wrapped = fract(clamp(coord.xy, g_fClampExts[CTX].xy, g_fClampExts[CTX].zw));
+
+	// region repeat mode for x (umsk&x)|ufix
+	if( TexWrapMode[CTX].x > g_fBilinear.z ) {
+		float maskedx = texture(g_sBitwiseANDX, abs(coord.x)*TexWrapMode[CTX].zx).x;
+		wrapped.x = maskedx * g_fClampExts[CTX].x + g_fClampExts[CTX].z;
+	}
+
+	// region repeat mode for y (vmsk&y)|vfix
+	if( TexWrapMode[CTX].y > g_fBilinear.z ) {
+		float maskedy = texture(g_sBitwiseANDY, abs(coord.y)*TexWrapMode[CTX].wy).x;
+		wrapped.y = maskedy * g_fClampExts[CTX].y + g_fClampExts[CTX].w;
+	}
+
+	return wrapped;
+}
+
+#else
+
+// no wrap define: clamp to the extents, then keep the fractional part
+float2 ps2addr(float2 coord)
+{
+	float2 clamped = clamp(coord.xy, g_fClampExts[CTX].xy, g_fClampExts[CTX].zw);
+	return fract(clamped);
+}
+
+#endif
+
+// Sample g_sMemory at the address ps2memcoord() computes for tex0.
+half4 tex2DPS_32(float2 tex0)
+{
+	float2 memaddr = ps2memcoord(tex0).xy;
+	return texture(g_sMemory[CTX], memaddr);
+}
+
+// use when texture is not tiled -- shader 1
+// Samples g_sMemory directly at the scaled/offset coordinate.
+half4 tex2DPS_tex32(float2 tex0)
+{
+	float2 memaddr = g_fTexDims[CTX].xy*tex0+g_fTexDims[CTX].zw;
+	half4 texel = texture(g_sMemory[CTX], memaddr);
+	return texel*g_fZBias.zzzw+g_fPageOffset[CTX].w;
+}
+
+// use when texture is not tiled -- shader 2
+// Takes the alpha of the memory texel as an index into g_sCLUT.
+half4 tex2DPS_clut32(float2 tex0)
+{
+	float2 memaddr = g_fTexDims[CTX].xy*tex0+g_fTexDims[CTX].zw;
+	float index = texture(g_sMemory[CTX], memaddr).a+g_fPageOffset[CTX].w;
+	float2 clutaddr = index*g_fExactColor.xz+g_fExactColor.yz;
+	return texture(g_sCLUT, clutaddr);
+}
+
+// Shader 3
+// use when texture is not tiled and converting from 32bit to 16bit
+// don't convert on the block level, only on the column level
+// so every other 8 pixels, use the upper bits instead of lower
+// The final color is looked up in g_sConv16to32 from two channels of the memory texel:
+// (x, w) when `upper` is set, (z, y) otherwise.
+half4 tex2DPS_tex32to16(float2 tex0)
+{
+	bool upper = false;
+	tex0.y += g_fPageOffset[CTX].z;
+	float2 ffrac = mod(tex0, g_fTexOffset[CTX].xy);
+	tex0.xy = g_fc0.ww * (tex0.xy + ffrac);
+	if( ffrac.x > g_fTexOffset[CTX].z ) {
+		tex0.x -= g_fTexOffset[CTX].z;
+		upper = true;
+	}
+	if( ffrac.y >= g_fTexOffset[CTX].w ) {
+		tex0.y -= g_fTexOffset[CTX].w;
+		tex0.x += g_fc0.w;
+	}
+
+	half4 color = texture(g_sMemory[CTX], g_fTexDims[CTX].xy*tex0+g_fTexDims[CTX].zw)*g_fZBias.zzzw+g_fPageOffset[CTX].w;
+	float2 uv = upper ? color.xw : color.zy;
+	return texture(g_sConv16to32, uv+g_fPageOffset[CTX].xy);
+}
+
+// Shader 4
+// used when a 16 bit texture is used as an 8h
+// Reads the memory texel at the adjusted coordinate, converts it through g_sConv32to16 into an
+// index and returns the g_sCLUT entry for that index.
+half4 tex2DPS_tex16to8h(float2 tex0)
+{
+	float2 ffrac = mod(tex0+g_fPageOffset[CTX].zw, g_fTexOffset[CTX].xy);
+	tex0.xy = g_fPageOffset[CTX].xy * tex0.xy - ffrac * g_fc0.yw;
+
+	if( ffrac.x > g_fTexOffset[CTX].x*g_fc0.w )
+		tex0.x += g_fTexOffset[CTX].x*g_fc0.w;
+	if( tex0.x >= g_fc0.y ) tex0 += g_fTexOffset[CTX].zw;
+
+	float4 upper = texture(g_sMemory[CTX], g_fTexDims[CTX].xy*tex0+g_fTexDims[CTX].zw);
+
+	// only need alpha
+	float index = texture(g_sConv32to16, upper.zyx-g_fc0.z).y + upper.w*g_fc0.w*g_fc0.w;
+	return texture(g_sCLUT, index+g_fExactColor.yz);
+}
+
+// Shader 5
+// used when a 16 bit texture is used a 32bit one
+// Builds the result from two texels converted through g_sConv32to16: `lower` (read from
+// g_sSrcFinal) fills .zy and `upper` (read from g_sMemory) fills .xw.
+half4 tex2DPS_tex16to32(float2 tex0)
+{
+	float4 final;
+	float2 ffrac = mod(tex0+g_fPageOffset[CTX].zw, g_fTexOffset[CTX].xy);
+	//tex0.xy = g_fPageOffset[CTX].xy * tex0.xy - ffrac * g_fc0.yw;
+	tex0.y += g_fPageOffset[CTX].y * ffrac.y;
+
+	if( ffrac.x > g_fTexOffset[CTX].z ) {
+		tex0.x -= g_fTexOffset[CTX].z;
+		tex0.y += g_fTexOffset[CTX].w;
+	}
+
+	// g_fc0.w squared
+	float fconst = g_fc0.w*g_fc0.w;
+	float4 lower = texture(g_sSrcFinal, g_fTexDims[CTX].xy*tex0);
+	float4 upper = texture(g_sMemory[CTX], g_fTexDims[CTX].xy*tex0+g_fTexDims[CTX].zw);
+
+	final.zy = texture(g_sConv32to16, lower.zyx).xy + lower.ww*fconst;
+	final.xw = texture(g_sConv32to16, upper.zyx).xy + upper.ww*fconst;
+	return final;
+}
+
+// Placeholder: ignores tex0 and always returns an all-zero color.
+half4 tex2DPS_tex16to32h(float2 tex0)
+{
+	return vec4(0.0);
+}
+
+//half4 f;
+//f.w = old.y > (127.2f/255.0f) ? 1 : 0;
+//old.y -= 0.5f * f.w;
+//f.xyz = fract(old.yyx*half3(2.002*255.0f/256.0f, 64.025f*255.0f/256.0f, 8.002*255.0f/256.0f));
+//f.y += old.x * (0.25f*255.0f/256.0f);
+
+////////////////////////////////
+// calculates the texture color
+////////////////////////////////
+
+// decl_ps2shade(num) expands decl_ps2shade_<num> once per texture-format suffix, defining the
+// ps2shade<num><suffix> function family.
+#define decl_ps2shade(num) \
+decl_ps2shade_##num(_32) \
+decl_ps2shade_##num(_tex32) \
+decl_ps2shade_##num(_clut32) \
+decl_ps2shade_##num(_tex32to16) \
+decl_ps2shade_##num(_tex16to8h) \
+decl_ps2shade_##num(_tex16to32h) 
+
+// nearest
+// ps2shade0<bit>: wraps the coordinate with ps2addr and fetches with tex2DPS<bit>.
+#define decl_ps2shade_0(bit) \
+float4 ps2shade0##bit( TEX_DECL tex) \
+{ \
+    return tex2DPS##bit( ps2addr(TEX_XY)); \
+} 
+
+// do fast memcoord4 calcs when textures behave well
+// NOTE(review): both branches currently select ps2memcoord4, so ps2memcoord4_fast is never
+// picked through this macro -- looks deliberate, confirm.
+#ifdef REPEAT
+#define PS2MEMCOORD4 ps2memcoord4
+#else
+#define PS2MEMCOORD4 ps2memcoord4
+#endif
+
+
+// Defines BilinearFilter<bit>(tex0): offsets tex0 by g_fBilinear.xy texels, snaps it to the
+// texel grid, wraps the two corner coordinates with ps2addr, converts them to four memory
+// addresses with PS2MEMCOORD4 and blends the four g_sMemory samples with the fractional
+// position as weights.
+// The `addrfn` parameter is not referenced in the body (ps2addr is called directly), and the
+// generated bodies are identical for every `bit`.
+#define decl_BilinearFilter(bit, addrfn) \
+half4 BilinearFilter##bit(float2 tex0) \
+{ \
+	float4 off0, off1; \
+	float4 ftex; \
+	float2 ffrac; \
+	ftex.xy = tex0 + g_fBilinear.xy * g_fRealTexDims[CTX].zw; \
+	ffrac = fract(ftex.xy*g_fRealTexDims[CTX].xy); \
+	ftex.xy -= ffrac.xy * g_fRealTexDims[CTX].zw; \
+	\
+ 	ftex.zw = ps2addr(ftex.xy + g_fRealTexDims[CTX].zw); \
+ 	ftex.xy = ps2addr(ftex.xy); \
+ 	\
+ 	PS2MEMCOORD4(ftex, off0, off1); \
+	half4 c0 = texture(g_sMemory[CTX], off0.xy); \
+	half4 c1 = texture(g_sMemory[CTX], off0.zw); \
+	half4 c2 = texture(g_sMemory[CTX], off1.xy); \
+	half4 c3 = texture(g_sMemory[CTX], off1.zw); \
+	return mix( mix(c0, c1, vec4(ffrac.x)), mix(c2, c3, ffrac.x), vec4(ffrac.y) ); \
+} 
+
+decl_BilinearFilter(_32, ps2addr)
+decl_BilinearFilter(_tex32, ps2addr)
+decl_BilinearFilter(_clut32, ps2addr)
+decl_BilinearFilter(_tex32to16, ps2addr)
+decl_BilinearFilter(_tex16to8h, ps2addr)
+decl_BilinearFilter(_tex16to32h, ps2addr)
+
+//TODO! For mip maps, only apply when LOD >= 0
+// lcm == 0, LOD = log(1/Q)*L + K, lcm == 1, LOD = K
+
+// Filter variants 1-5. As written, the mip variants do no mip selection: 2 and 3 have the same
+// body as the nearest variant, 4 and 5 the same body as the bilinear variant.
+
+// bilinear
+#define decl_ps2shade_1(bit) \
+half4 ps2shade1##bit(TEX_DECL tex) \
+{ \
+	return BilinearFilter##bit(TEX_XY); \
+} 
+
+// nearest, mip nearest
+#define decl_ps2shade_2(bit) \
+half4 ps2shade2##bit(TEX_DECL tex) \
+{ \
+    return tex2DPS##bit( ps2addr(TEX_XY)); \
+} 
+
+// nearest, mip linear
+#define decl_ps2shade_3(bit) \
+half4 ps2shade3##bit(TEX_DECL tex) \
+{ \
+    return tex2DPS##bit(ps2addr(TEX_XY)); \
+} 
+
+// linear, mip nearest
+#define decl_ps2shade_4(bit) \
+half4 ps2shade4##bit(TEX_DECL tex) \
+{ \
+    return BilinearFilter##bit(TEX_XY); \
+} 
+
+// linear, mip linear
+#define decl_ps2shade_5(bit) \
+half4 ps2shade5##bit(TEX_DECL tex) \
+{ \
+    return BilinearFilter##bit(TEX_XY); \
+} 
+
+// instantiate every filter variant for every texture-format suffix
+decl_ps2shade(0)
+decl_ps2shade(1)
+decl_ps2shade(2)
+decl_ps2shade(3)
+decl_ps2shade(4)
+decl_ps2shade(5)
+
+
+// Combine the fetched texel `texcol` with the vertex `color` using the fTexAlpha / fTexAlpha2
+// constants and return the result.
+// With TEST_AEM defined, the alpha assignment below becomes the else-branch of the test
+// against g_fTestBlack: alpha is set to g_fc0.x when the test passes, otherwise it is
+// scaled/biased. Without TEST_AEM the scale/bias is always applied.
+half4 ps2CalcShade(half4 texcol, half4 color)
+{
+#ifdef TEST_AEM
+	if( dot(texcol.xyzw, g_fTestBlack[CTX].xyzw) <= g_fc0.z )
+		texcol.w = g_fc0.x;
+	else
+#endif
+		texcol.w = texcol.w * fTexAlpha[CTX].y + fTexAlpha[CTX].x;
+
+	texcol = texcol * (fTexAlpha2[CTX].zzzw * color + fTexAlpha2[CTX].xxxy) + fTexAlpha[CTX].zzzw * color.wwww;
+
+	return texcol;
+}
+
+// final ops on the color
+#ifdef EXACT_COLOR
+
+// Scale/bias the color with g_fOneColor, then floor alpha and rescale it by g_fExactColor.w.
+half4 ps2FinalColor(half4 col)
+{
+	// g_fOneColor has to scale by 255
+	half4 scaled = col * g_fOneColor.xxxy + g_fOneColor.zzzw;
+	scaled.w = floor(scaled.w)*g_fExactColor.w;
+	return scaled;
+}
+
+#else
+
+// Scale/bias the color with g_fOneColor.
+half4 ps2FinalColor(half4 col)
+{
+	half4 scaled = col * g_fOneColor.xxxy + g_fOneColor.zzzw;
+	return scaled;
+}
+
+#endif
+
+#ifdef FRAGMENT_SHADER 			// This is code only for FRAGMENTS (pixel shader)
+
+// Untextured primitive: outputs the finalized vertex color; with WRITE_DEPTH it also copies
+// gl_TexCoord[0] into the second render target.
+void RegularPS() {
+	// whenever outputting depth, make sure to mult by 255/256 and 1
+	gl_FragData[0] = ps2FinalColor(gl_Color);
+	DOZWRITE(gl_FragData[1] = gl_TexCoord[0];)
+}
+
+// DECL_TEXPS(num, bit) defines the entry point Texture<num><bit>PS: shade with
+// ps2shade<num><bit>, combine with the vertex color and finalize. The WRITE_DEPTH flavour
+// additionally copies gl_TexCoord[1] into the second render target.
+#ifdef WRITE_DEPTH
+
+#define DECL_TEXPS(num, bit) \
+void Texture##num##bit##PS() \
+{ \
+	gl_FragData[0] = ps2FinalColor(ps2CalcShade(ps2shade##num##bit(gl_TexCoord[0]), gl_Color)); \
+	gl_FragData[1] = gl_TexCoord[1]; \
+} 
+
+#else
+
+#define DECL_TEXPS(num, bit) \
+void Texture##num##bit##PS() \
+{ \
+	gl_FragData[0] = ps2FinalColor(ps2CalcShade(ps2shade##num##bit(gl_TexCoord[0]), gl_Color)); \
+} 
+
+#endif
+
+
+// DECL_TEXPS_(num) instantiates filter variant `num` for five texture-format suffixes.
+#define DECL_TEXPS_(num) \
+DECL_TEXPS(num, _32) \
+DECL_TEXPS(num, _tex32) \
+DECL_TEXPS(num, _clut32) \
+DECL_TEXPS(num, _tex32to16) \
+DECL_TEXPS(num, _tex16to8h) 
+
+DECL_TEXPS_(0)
+DECL_TEXPS_(1)
+DECL_TEXPS_(2)
+DECL_TEXPS_(3)
+DECL_TEXPS_(4)
+DECL_TEXPS_(5)
+
+// Untextured primitive with fog: blends from g_fFogColor to the vertex color by
+// gl_TexCoord[0].x and keeps the vertex alpha; with WRITE_DEPTH it also copies gl_TexCoord[1]
+// into the second render target.
+void RegularFogPS() {
+	float blend = gl_TexCoord[0].x;
+	half4 fogged = half4(mix(g_fFogColor.xyz, gl_Color.xyz, vec3(blend)), gl_Color.w);
+	gl_FragData[0] = ps2FinalColor(fogged);
+	DOZWRITE(gl_FragData[1] = gl_TexCoord[1];)
+}
+
+// DECL_TEXFOGPS(num, bit) defines the entry point TextureFog<num><bit>PS: shade with
+// ps2shade<num><bit>, blend from g_fFogColor by gl_TexCoord[1].x and finalize. The WRITE_DEPTH
+// flavour additionally copies gl_TexCoord[2] into the second render target.
+#ifdef WRITE_DEPTH
+
+#define DECL_TEXFOGPS(num, bit) \
+void TextureFog##num##bit##PS() \
+{ \
+	half4 c = ps2CalcShade(ps2shade##num##bit(gl_TexCoord[0]), gl_Color); \
+	c.xyz = mix(g_fFogColor.xyz, c.xyz, vec3(gl_TexCoord[1].x)); \
+	gl_FragData[0] = ps2FinalColor(c); \
+   	gl_FragData[1] = gl_TexCoord[2]; \
+} 
+
+#else
+
+#define DECL_TEXFOGPS(num, bit) \
+void TextureFog##num##bit##PS() \
+{ \
+	half4 c = ps2CalcShade(ps2shade##num##bit(gl_TexCoord[0]), gl_Color); \
+	c.xyz = mix(g_fFogColor.xyz, c.xyz, vec3(gl_TexCoord[1].x)); \
+	gl_FragData[0] = ps2FinalColor(c); \
+} 
+
+#endif
+
+// DECL_TEXFOGPS_(num) instantiates filter variant `num` for five texture-format suffixes.
+#define DECL_TEXFOGPS_(num) \
+DECL_TEXFOGPS(num, _32) \
+DECL_TEXFOGPS(num, _tex32) \
+DECL_TEXFOGPS(num, _clut32) \
+DECL_TEXFOGPS(num, _tex32to16) \
+DECL_TEXFOGPS(num, _tex16to8h) 
+
+DECL_TEXFOGPS_(0)
+DECL_TEXFOGPS_(1)
+DECL_TEXFOGPS_(2)
+DECL_TEXFOGPS_(3)
+DECL_TEXFOGPS_(4)
+DECL_TEXFOGPS_(5)
+
+//-------------------------------------------------------
+// Techniques not related to the main primitive commands
+
+// Blend the four g_sMemory samples around tex0. The corner addresses come from
+// ps2memcoord4_fast and the weights are the fractional texel position.
+half4 BilinearBitBlt(float2 tex0)
+{
+	float2 weight = fract(tex0*g_fRealTexDims[CTX].xy);
+
+	// .xy = first corner, .zw = first corner plus one texel
+	float4 corners;
+	corners.xy = tex0 - weight.xy * g_fRealTexDims[CTX].zw;
+	corners.zw = corners.xy + g_fRealTexDims[CTX].zw;
+
+	float4 addr0, addr1;
+	ps2memcoord4_fast(corners, addr0, addr1);
+
+	half4 s00 = texture(g_sMemory[CTX], addr0.xy);
+	half4 s10 = texture(g_sMemory[CTX], addr0.zw);
+	half4 s01 = texture(g_sMemory[CTX], addr1.xy);
+	half4 s11 = texture(g_sMemory[CTX], addr1.zw);
+
+	half4 row0 = mix(s00, s10, vec4(weight.x));
+	half4 row1 = mix(s01, s11, vec4(weight.x));
+	return mix(row0, row1, vec4(weight.y));
+}
+
+// Single-sample blit: one g_sMemory fetch at the ps2memcoord address, scaled by g_fOneColor.xxxy.
+void BitBltPS() {
+	float2 memaddr = ps2memcoord(gl_TexCoord[0].xy).xy;
+	gl_FragData[0] = texture(g_sMemory[CTX], memaddr)*g_fOneColor.xxxy;
+}
+
+// used when AA
+// Same as BitBltPS but with the bilinear fetch.
+void BitBltAAPS() {
+	half4 filtered = BilinearBitBlt(gl_TexCoord[0].xy);
+	gl_FragData[0] = filtered * g_fOneColor.xxxy;
+}
+
+// Blit that also writes depth: the color output is the memory texel plus g_fZBias.y, and
+// gl_FragDepth mixes a log term and a linear term of dot(data, g_fBitBltZ), weighted by
+// g_fZMin.y and g_fZMin.x respectively.
+void BitBltDepthPS() {
+	vec4 data = texture(g_sMemory[CTX], ps2memcoord(gl_TexCoord[0].xy));
+	float z = dot(data, g_fBitBltZ);
+
+	gl_FragData[0] = data + g_fZBias.y;
+	gl_FragDepth   = (log(g_fc0.y + z) * g_fOneColor.w) * g_fZMin.y + z * g_fZMin.x ;
+}
+
+// Same as BitBltDepthPS, additionally writing g_fc0.x into the x channel of the second
+// render target.
+void BitBltDepthMRTPS() {
+	vec4 data = texture(g_sMemory[CTX], ps2memcoord(gl_TexCoord[0].xy));
+	float z = dot(data, g_fBitBltZ);
+
+	gl_FragData[0] = data + g_fZBias.y;
+	gl_FragData[1].x = g_fc0.x;
+	gl_FragDepth = (log(g_fc0.y + z) * g_fOneColor.w) * g_fZMin.y + z * g_fZMin.x ;
+}
+
+/*static const float BlurKernel[9] = {
+	0.027601,
+	0.066213,
+	0.123701,
+	0.179952,
+	0.205065,
+	0.179952,
+	0.123701,
+	0.066213,
+	0.027601
+};*/
+
+// Plain fetch from g_sSrcFinal at tex0 (no manual filtering is done here).
+half4 BilinearFloat16(float2 tex0)
+{
+	half4 texel = texture(g_sSrcFinal, tex0.xy);
+	return texel;
+}
+
+// Output the g_sSrcFinal texel with its alpha halved (g_fc0.w), scaled/biased by
+// g_fOneColor.xy and multiplied by the interlace factor read from g_sInterlace.
+void CRTCTargInterPS() {
+	float interlace = texture(g_sInterlace, gl_TexCoord[1].yy).x * g_fOneColor.z + g_fOneColor.w + g_fc0.w;
+	float4 color = BilinearFloat16(gl_TexCoord[0].xy);
+	color.w = ( g_fc0.w*color.w * g_fOneColor.x + g_fOneColor.y ) * interlace;
+	gl_FragData[0] = color;
+}
+
+// Same as CRTCTargInterPS without the interlace factor.
+void CRTCTargPS() {
+	float4 color = BilinearFloat16(gl_TexCoord[0].xy);
+	color.w = g_fc0.w * color.w * g_fOneColor.x + g_fOneColor.y;
+	gl_FragData[0] = color;
+}
+
+void CRTCInterPS() {
+	float finter = texture(g_sInterlace, gl_TexCoord[1].yy).x * g_fOneColor.z + g_fOneColor.w + g_fc0.w;
+	float2 filtcoord = trunc(gl_TexCoord[0].xy) * g_fInvTexDims.xy + g_fInvTexDims.zw;
+	half4 c = BilinearBitBlt(filtcoord);
+	c.w = (c.w * g_fOneColor.x + g_fOneColor.y)*finter;
+	gl_FragData[0] = c;
+}
+
+// Simpler variant of CRTCInterPS: one direct lookup in g_sMemory[CTX] through
+// ps2memcoord() instead of BilinearBitBlt(). Alpha is rescaled with
+// g_fOneColor.x/.y and multiplied by the interlace factor from g_sInterlace.
+void CRTCInterPS_Nearest() {
+	half4 color = texture(g_sMemory[CTX], ps2memcoord(gl_TexCoord[0].xy).xy);
+	float interlace = texture(g_sInterlace, gl_TexCoord[1].yy).x * g_fOneColor.z + g_fOneColor.w + g_fc0.w;
+
+	color.w = (color.w * g_fOneColor.x + g_fOneColor.y) * interlace;
+	gl_FragData[0] = color;
+}
+
+// CRTC output from memory without interlacing. Texcoord 0 is scaled/offset by
+// g_fInvTexDims, fetched through BilinearBitBlt(), and alpha is rescaled with
+// g_fOneColor.x/.y.
+void CRTCPS() {
+	float2 coord = gl_TexCoord[0].xy * g_fInvTexDims.xy + g_fInvTexDims.zw;
+
+	half4 color = BilinearBitBlt(coord);
+	color.w = color.w * g_fOneColor.x + g_fOneColor.y;
+	gl_FragData[0] = color;
+}
+
+// Simpler variant of CRTCPS: one direct lookup in g_sMemory[CTX] through
+// ps2memcoord(), followed by the same alpha rescale (g_fOneColor.x/.y).
+void CRTCPS_Nearest() {
+	half4 color = texture(g_sMemory[CTX], ps2memcoord(gl_TexCoord[0].xy).xy);
+	color.w = color.w * g_fOneColor.x + g_fOneColor.y;
+	gl_FragData[0] = color;
+}
+
+// CRTC24 output with interlacing. Texcoord 0 is truncated, scaled/offset by
+// g_fInvTexDims and mapped through ps2memcoord() for a single lookup in
+// g_sMemory[CTX]. Alpha is rescaled with g_fOneColor.x/.y and multiplied by
+// the interlace factor from g_sInterlace.
+void CRTC24InterPS() {
+	float2 coord = trunc(gl_TexCoord[0].xy) * g_fInvTexDims.xy + g_fInvTexDims.zw;
+	float interlace = texture(g_sInterlace, gl_TexCoord[1].yy).x * g_fOneColor.z + g_fOneColor.w + g_fc0.w;
+
+	half4 color = texture(g_sMemory[CTX], ps2memcoord(coord).xy);
+	color.w = (color.w * g_fOneColor.x + g_fOneColor.y) * interlace;
+	gl_FragData[0] = color;
+}
+
+// CRTC24 output without interlacing. Texcoord 0 is truncated, scaled/offset
+// by g_fInvTexDims and mapped through ps2memcoord() for a single lookup in
+// g_sMemory[CTX]; alpha is then rescaled with g_fOneColor.x/.y.
+void CRTC24PS() {
+	float2 coord = trunc(gl_TexCoord[0].xy) * g_fInvTexDims.xy + g_fInvTexDims.zw;
+
+	half4 color = texture(g_sMemory[CTX], ps2memcoord(coord).xy);
+	color.w = color.w * g_fOneColor.x + g_fOneColor.y;
+	gl_FragData[0] = color;
+}
+
+// Writes the constant colour g_fOneColor to the first colour output; no
+// texture is sampled.
+void ZeroPS() {
+	gl_FragData[0] = g_fOneColor;
+}
+
+// Samples g_sSrcFinal at texcoord 0 and modulates all four channels by
+// g_fOneColor.
+void BaseTexturePS() {
+	vec4 texel = texture(g_sSrcFinal, gl_TexCoord[0].xy);
+	gl_FragData[0] = texel * g_fOneColor;
+}
+
+// Builds one output texel from two texels of g_sSrcFinal.
+// 'lower' is read at the adjusted coordinate tex0 and 'upper' at
+// tex0 + g_fPageOffset[CTX].xy. Each is pushed through the g_sConv32to16
+// lookup texture (indexed by its .zyx) and has its alpha, scaled by
+// g_fPageOffset[CTX].zw, added on top. 'lower' fills final.zy, 'upper' fills
+// final.xw.
+void Convert16to32PS() {
+	float4 final;
+	// Position of this fragment inside one g_fTexOffset[CTX].xy sized cell.
+	float2 ffrac = mod ( gl_TexCoord[0].xy + g_fTexDims[CTX].zw, g_fTexOffset[CTX].xy);
+	float2 tex0 = g_fTexDims[CTX].xy * gl_TexCoord[0].xy - ffrac * g_fc0.yw;
+
+	// Fragments in the far part of the cell (x beyond g_fTexOffset.x * g_fc0.w) are shifted by that amount.
+	if (ffrac.x > g_fTexOffset[CTX].x*g_fc0.w)
+		tex0.x += g_fTexOffset[CTX].x*g_fc0.w;
+	// Once x reaches g_fc0.y, continue at the g_fTexOffset[CTX].zw offset.
+	if (tex0.x >= g_fc0.y) 
+		tex0 += g_fTexOffset[CTX].zw;
+
+	float4 lower = texture(g_sSrcFinal, tex0);
+	float4 upper = texture(g_sSrcFinal, tex0 + g_fPageOffset[CTX].xy);
+
+	final.zy = texture(g_sConv32to16, lower.zyx).xy + lower.ww*g_fPageOffset[CTX].zw;
+	final.xw = texture(g_sConv32to16, upper.zyx).xy + upper.ww*g_fPageOffset[CTX].zw;
+
+	gl_FragData[0]= final;
+}
+
+// Used when the texture is not tiled and is being converted from 32 bit to 16 bit.
+// The conversion is not done on the block level, only on the column level,
+// so for every other 8 pixels the upper bits are used instead of the lower ones.
+//
+// One texel of g_sSrcFinal is fetched; either its .xw (upper) or .zy (lower)
+// pair is used to index the g_sConv16to32 lookup texture.
+void Convert32to16PS() {
+	bool upper = false;
+	// Position of this fragment inside one g_fTexOffset[CTX].xy sized cell.
+	float2 ffrac = mod(gl_TexCoord[0].xy + g_fTexDims[CTX].zw, g_fTexOffset[CTX].xy);
+	float2 tex0 = g_fc0.ww * (gl_TexCoord[0].xy + ffrac);
+	// Beyond g_fTexOffset.z in x: step back by that amount and select the upper pair.
+	if( ffrac.x > g_fTexOffset[CTX].z ) {
+		tex0.x -= g_fTexOffset[CTX].z;
+		upper = true;
+	}
+	// At or beyond g_fTexOffset.w in y: step back in y and advance x by g_fc0.w.
+	if( ffrac.y >= g_fTexOffset[CTX].w ) {
+		tex0.y -= g_fTexOffset[CTX].w;
+		tex0.x += g_fc0.w;
+	}
+
+	half4 color = texture(g_sSrcFinal, tex0*g_fTexDims[CTX].xy)*g_fc0.yyyw;
+	float2 uv = upper ? color.xw : color.zy;
+	gl_FragData[0] = texture(g_sConv16to32, uv*g_fPageOffset[CTX].xy+g_fPageOffset[CTX].zw)*g_fTexDims[CTX].xxxy;
+}
+#endif 			//FRAGMENT_SHADER 
+
+#ifdef VERTEX_SHADER
+
+// Builds the output position for a vertex.
+//   vertex : input position; only .xy is used
+//   xy     : vertex.xy scaled by g_fPosXY[CTX].xy and offset by g_fPosXY[CTX].zw
+//   z      : zval = dot(g_fZ, gl_SecondaryColor.zyxw); a logarithmic term
+//            (scaled/offset by g_fZNorm, weighted by g_fZMin.y) is added to
+//            the linear term zval * g_fZMin.x
+//   w      : g_fc0.y
+// The xy term now honours the 'vertex' argument. It used to read gl_Vertex
+// directly, which silently ignored the parameter; callers that pass gl_Vertex
+// get the same result as before.
+float4 OutPosition(float4 vertex) {
+	// Evaluate the shared dot product once instead of twice.
+	float zval = dot(g_fZ, gl_SecondaryColor.zyxw);
+
+	float4 Position;
+	Position.xy = vertex.xy * g_fPosXY[CTX].xy + g_fPosXY[CTX].zw;
+	Position.z = (log(g_fc0.y + zval) * g_fZNorm.x + g_fZNorm.y) * g_fZMin.y + zval * g_fZMin.x;
+	Position.w = g_fc0.y;
+	return Position;
+}
+
+// Plain smooth shading: forwards the vertex colour and the position from
+// OutPosition(). Inside DOZWRITE() the secondary colour, scaled and biased by
+// g_fZBias, is written to gl_TexCoord[0] with w set to g_fc0.y.
+void RegularVS() {
+	gl_FrontColor = gl_Color;
+	gl_Position = OutPosition(gl_Vertex);
+	DOZWRITE(gl_TexCoord[0] = gl_SecondaryColor * g_fZBias.x + g_fZBias.y; gl_TexCoord[0].w = g_fc0.y;)
+}
+
+// Diffuse texture mapping.
+// With PERSPECTIVE_CORRECT_TEX the xyz texture coordinate is forwarded
+// unchanged; otherwise xy is divided by z here in the vertex shader.
+// Inside DOZWRITE() the secondary colour, scaled and biased by g_fZBias, is
+// written to gl_TexCoord[1] with w set to g_fc0.y.
+void TextureVS() {
+	gl_FrontColor = gl_Color;
+	gl_Position = OutPosition(gl_Vertex);
+#ifdef PERSPECTIVE_CORRECT_TEX
+	gl_TexCoord[0].xyz = gl_MultiTexCoord0.xyz;
+#else
+	gl_TexCoord[0].xy = gl_MultiTexCoord0.xy / gl_MultiTexCoord0.z;
+#endif
+	DOZWRITE(gl_TexCoord[1] = gl_SecondaryColor * g_fZBias.x + g_fZBias.y; gl_TexCoord[1].w = g_fc0.y;)
+}
+
+// Untextured vertex shader with fog.
+//   gl_Position      : OutPosition(gl_Vertex)
+//   gl_FrontColor    : vertex colour
+//   gl_TexCoord[0].x : fog factor, gl_Vertex.z * g_fBilinear.w
+//   gl_TexCoord[1]   : inside DOZWRITE(), secondary colour scaled and biased
+//                      by g_fZBias, with w set to g_fc0.y
+void RegularFogVS() {
+	gl_Position = OutPosition(gl_Vertex);
+	gl_FrontColor = gl_Color;
+	// The fog factor is taken from the z component of the incoming vertex, as
+	// TextureFogVS does. The z returned by OutPosition() is the depth derived
+	// from gl_SecondaryColor, so using it here fed depth into the fog term.
+	gl_TexCoord[0].x = gl_Vertex.z * g_fBilinear.w;
+	DOZWRITE(gl_TexCoord[1] = gl_SecondaryColor * g_fZBias.x + g_fZBias.y; gl_TexCoord[1].w = g_fc0.y;)
+}
+
+// Textured vertex shader with fog.
+//   gl_TexCoord[0]   : texture coordinate (xyz forwarded unchanged with
+//                      PERSPECTIVE_CORRECT_TEX, otherwise xy divided by z)
+//   gl_TexCoord[1].x : fog factor, gl_Vertex.z * g_fBilinear.w
+//   gl_TexCoord[2]   : inside DOZWRITE(), secondary colour scaled and biased
+//                      by g_fZBias, with w set to g_fc0.y
+void TextureFogVS() {
+	gl_FrontColor = gl_Color;
+	gl_Position = OutPosition(gl_Vertex);
+#ifdef PERSPECTIVE_CORRECT_TEX
+	gl_TexCoord[0].xyz = gl_MultiTexCoord0.xyz;
+#else
+	gl_TexCoord[0].xy = gl_MultiTexCoord0.xy / gl_MultiTexCoord0.z;
+#endif
+	gl_TexCoord[1].x = gl_Vertex.z * g_fBilinear.w;
+	DOZWRITE(gl_TexCoord[2] = gl_SecondaryColor * g_fZBias.x + g_fZBias.y; gl_TexCoord[2].w = g_fc0.y;)
+}
+
+// Vertex shader for the bit-blit passes.
+//   gl_Position    : xy = vertex xy scaled/offset by g_fBitBltPos, zw = g_fc0.xy
+//   gl_TexCoord[0] : input texture coordinate scaled/offset by g_fBitBltTex
+//   gl_TexCoord[1] : output xy position scaled/offset by g_fBitBltTrans
+void BitBltVS() {
+	vec4 clipPos = vec4(gl_Vertex.xy * g_fBitBltPos.xy + g_fBitBltPos.zw, g_fc0.xy);
+	gl_Position = clipPos;
+
+	gl_TexCoord[0].xy = gl_MultiTexCoord0.xy * g_fBitBltTex.xy + g_fBitBltTex.zw;
+	gl_TexCoord[1].xy = clipPos.xy * g_fBitBltTrans.xy + g_fBitBltTrans.zw;
+}
+
+#endif 			//VERTEX_SHADER
+
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
--- a/plugins/zzogl-pg/opengl/targets.h
+++ b/plugins/zzogl-pg/opengl/targets.h
@ -34,6 +34,9 @@

 #define VB_BUFFERSIZE			   0x4000

+extern void FlushIfNecesary(void* ptr);
+extern bool g_bSaveZUpdate;
+
 // all textures have this width
 extern int GPU_TEXWIDTH;
 extern float g_fiGPU_TEXWIDTH;
@ -101,10 +104,10 @@ class CRenderTarget
 			TS_Virtual = 4, // currently not mapped to memory
 			TS_FeedbackReady = 8, // feedback effect is ready and doesn't need to be updated
 			TS_NeedConvert32 = 16,
-			TS_NeedConvert16 = 32,
+			TS_NeedConvert16 = 32
 		};
-		inline float4 DefaultBitBltPos();
-		inline float4 DefaultBitBltTex();
+		float4 DefaultBitBltPos();
+		float4 DefaultBitBltTex();

 	private:
 		void _CreateFeedback();
@ -310,9 +313,12 @@ class CRenderTargetMngr
 			return ptarg;
 		}

-		static void DestroyTarg(CRenderTarget* ptarg);
+		void DestroyTarg(CRenderTarget* ptarg);
 		void PrintTargets();
 		MAPTARGETS mapTargets, mapDummyTargs;
+	private:
+		
+		void DestroyAllTargetsHelper(void* ptr);
 };

 class CMemoryTargetMngr
@ -487,108 +493,6 @@ inline u32 GetFrameKeyDummy(CRenderTarget* frame)
 	return GetFrameKeyDummy(frame->fbp, frame->fbw, frame->fbh, frame->psm);
 }

-#include "Mem.h"
-
-static __forceinline void DrawTriangleArray()
-{
-	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-	GL_REPORT_ERRORD();
-}
-
-static __forceinline void DrawBuffers(GLenum *buffer)
-{
-	if (glDrawBuffers != NULL) 
-	{
-		glDrawBuffers(1, buffer);
-	}
-
-	GL_REPORT_ERRORD();
-}
-
-static __forceinline void FBTexture(int attach, int id = 0)
-{
-	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT + attach, GL_TEXTURE_RECTANGLE_NV, id, 0);
-	GL_REPORT_ERRORD();
-}
-
-static __forceinline void ResetRenderTarget(int index)
-{
-	FBTexture(index);
-}
-
-static __forceinline void Texture2D(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
-{
-	glTexImage2D(GL_TEXTURE_2D, 0, iFormat, width, height, 0, format, type, pixels);
-}
-
-static __forceinline void Texture2D(GLint iFormat, GLenum format, GLenum type, const GLvoid* pixels)
-{
-	glTexImage2D(GL_TEXTURE_2D, 0, iFormat, BLOCK_TEXWIDTH, BLOCK_TEXHEIGHT, 0, format, type, pixels);
-}
-
-static __forceinline void Texture3D(GLint iFormat, GLint width, GLint height, GLint depth, GLenum format, GLenum type, const GLvoid* pixels)
-{
-	glTexImage3D(GL_TEXTURE_3D, 0, iFormat, width, height, depth, 0, format, type, pixels);
-}
-	
-static __forceinline void TextureRect(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
-{
-	glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, iFormat, width, height, 0, format, type, pixels);
-}
-
-static __forceinline void TextureRect2(GLint iFormat, GLint width, GLint height, GLenum format, GLenum type, const GLvoid* pixels)
-{
-	glTexImage2D(GL_TEXTURE_RECTANGLE, 0, iFormat, width, height, 0, format, type, pixels);
-}
-
-static __forceinline void TextureRect(GLenum attach, GLuint id = 0)
-{
-	glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, attach, GL_RENDERBUFFER_EXT, id);
-}
-
-static __forceinline void setTex2DFilters(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, type);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, type);
-}
-
-static __forceinline void setTex2DWrap(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, type);
-	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, type);
-}
-
-static __forceinline void setTex3DFilters(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MAG_FILTER, type);
-	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_MIN_FILTER, type);
-}
-
-static __forceinline void setTex3DWrap(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_S, type);
-	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_T, type);
-	glTexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_R, type);
-}
-
-static __forceinline void setRectFilters(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, type);
-	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, type);
-}
-
-static __forceinline void setRectWrap(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, type);
-	glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, type);
-}
-
-static __forceinline void setRectWrap2(GLint type)
-{
-	glTexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_WRAP_S, type);
-	glTexParameteri(GL_TEXTURE_RECTANGLE, GL_TEXTURE_WRAP_T, type);
-}
-
 //------------------------ Inlines -------------------------

 // Calculate maximum height for target
@ -604,4 +508,12 @@ inline int get_maxheight(int fbp, int fbw, int psm)
 	return ret;
 }

+// Memory size of one texture row: RW(fbw) multiplied by 4
+// (the multiplier is presumably the number of bytes per pixel).
+inline u32 Pitch(int fbw)
+{
+	return RW(fbw) * 4;
+}
+
+// Memory size of the whole texture: the number of rows, RH(fbh), multiplied
+// by the memory size of one row, Pitch(fbw).
+inline u32 Tex_Memory_Size(int fbw, int fbh)
+{
+	return RH(fbh) * Pitch(fbw);
+}
+
+
 #endif
--- a/plugins/zzogl-pg/opengl/x86.cpp
+++ b/plugins/zzogl-pg/opengl/x86.cpp
@ -22,9 +22,13 @@
 #include "x86.h"

 #if defined(ZEROGS_SSE2)
-#include <emmintrin.h>
+#include <immintrin.h>
 #endif

+
+// Note: all the code in this file is deprecated. It is kept for reference only.
+
+
 // swizzling

 //These were only used in the old version of RESOLVE_32_BITS. Keeping for reference.
--- a/plugins/zzogl-pg/opengl/x86.h
+++ b/plugins/zzogl-pg/opengl/x86.h
@ -22,153 +22,99 @@

 #include "GS.h"

-extern "C" void __fastcall SwizzleBlock32_sse2(u8* dst, u8* src, int srcpitch, u32 WriteMask = 0xffffffff);
-extern "C" void __fastcall SwizzleBlock16_sse2(u8* dst, u8* src, int srcpitch);
-extern "C" void __fastcall SwizzleBlock8_sse2(u8* dst, u8* src, int srcpitch);
-extern "C" void __fastcall SwizzleBlock4_sse2(u8* dst, u8* src, int srcpitch);
-extern "C" void __fastcall SwizzleBlock32u_sse2(u8* dst, u8* src, int srcpitch, u32 WriteMask = 0xffffffff);
-extern "C" void __fastcall SwizzleBlock16u_sse2(u8* dst, u8* src, int srcpitch);
-extern "C" void __fastcall SwizzleBlock8u_sse2(u8* dst, u8* src, int srcpitch);
-extern "C" void __fastcall SwizzleBlock4u_sse2(u8* dst, u8* src, int srcpitch);
+#ifndef ZZNORMAL_MEMORY
+// StarOcean uses 24 in its logo and 4HH and 4HL in the menu subfont.
+// Tony Hawk uses 16, but has a lot of trouble with it.
+// This function moves one blockwidth * blockheight data block from src to dst, on the assumption that dst stores swizzled data.
+template <int psm>
+inline void __fastcall SwizzleBlock(u32* dst, u32* src, int pitch, u32 WriteMask = 0xffffffff) {
+	u8 B = (PSM_PIXELS_PER_WORD<psm>() > 2)? 4 : 2;

-// frame swizzling
+	assert ((pitch & 3) == 0 );

-#if 0
-// no AA
-extern "C" void __fastcall FrameSwizzleBlock32_sse2(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall FrameSwizzleBlock16_sse2(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32Z_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16Z_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+	u32* src1 = src;
+	u32* src2 = src + pitch / 4;

-// AA 2x
-extern "C" void __fastcall FrameSwizzleBlock32A2_sse2(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall FrameSwizzleBlock16A2_sse2(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32A2_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32ZA2_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16A2_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16ZA2_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+	for(int j = 0; j < 4 ; j++, src1 += B * pitch / 4, src2 += B * pitch / 4)
+		for(int i = 0; i < 8; i++) {
+			fillPixelsFromMemory<psm>(dst, src1, i, B * j, pitch /4, 0, 0, WriteMask);
+			fillPixelsFromMemory<psm>(dst, src2, i, B * j + 1, pitch / 4 , 0, 0, WriteMask);
+		}
+}

-// AA 4x
-extern "C" void __fastcall FrameSwizzleBlock32A4_sse2(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall FrameSwizzleBlock16A4_sse2(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32A4_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock32ZA4_sse2(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16A4_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern "C" void __fastcall Frame16SwizzleBlock16ZA4_sse2(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+// Simple AA downsampling: selects ONE source sample for output pixel j.
+// The source row is indexed with a stride of (1 << AA). Rather than src[j << AA]
+// (the first sample of the group), a more central sample is preferred.
+// Neighbouring samples are not blended, because that gives no noticeable quality
+// bonus while the speed penalty is big.
+//
+//   AA  - AA level; dedicated cases exist for 0 (stride 1), 1 (stride 2) and 2 (stride 4)
+//   src - start of the source row
+//   j   - index of the output pixel within the row
+template <u8 AA>
+inline u32 mixed_pixel(u32* src, int j) {
+	if (AA == 0)
+		return src[j] ;
 
-/*extern void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch, u32 WriteMask = 0xffffffff);
-extern void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch);
-extern void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch);
-extern void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch);*/
+	if (AA == 1) 
+		return src[(j << 1) + 1];	
 
-// no AA
-extern void __fastcall FrameSwizzleBlock32_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock24_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock16_c(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32Z_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16Z_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+	if (AA == 2)
+		return src[(j << 2) + 2];
+
+	// Every path has to return a value: flowing off the end of a non-void function
+	// is undefined behaviour. AA levels without a dedicated case fall back to the
+	// first sample of the group, the same indexing convert_pixel() uses for float
+	// sources.
+	return src[j << AA];
+}

-// AA 2x
-extern void __fastcall FrameSwizzleBlock32A2_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock24A2_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock16A2_c(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32A2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32ZA2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16A2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16ZA2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+// Produces the destination word for pixel number j (j < 8).
+// For 16-bit storage (PSM_ISHALF<psm>()) two pixels share one word: pixel j goes
+// into the lower half and pixel j + 8 into the upper half, each converted to
+// 16-bit ARGB. Otherwise the word holds the single pixel j.
+// WARNING: the floating point storage path has never been tested.
+template <int psm, bool is_float, u8 AA>
+inline u32 convert_pixel(u32* src, int j) {
+	if (!is_float) {
+		if (!PSM_ISHALF<psm>())
+			return mixed_pixel<AA>(src, j);
+
+		return RGBA32to16(mixed_pixel<AA>(src, j)) + (RGBA32to16(mixed_pixel<AA>(src, j + 8)) << 16);
+	}
+
+	// Simplified code for float sources; it does not seem to be used anyway.
+	Vector_16F* fsrc = (Vector_16F*)src;
+
+	if (!PSM_ISHALF<psm>())
+		return Float16ToARGB ( fsrc[j << AA] );
+
+	return Float16ToARGB16 ( fsrc[j << AA]) + (Float16ToARGB16(fsrc[(j + 8) << AA]) << 16);
+}

-// AA 4x
-extern void __fastcall FrameSwizzleBlock32A4_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock24A4_c(u32* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall FrameSwizzleBlock16A4_c(u16* dst, u32* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32A4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock32ZA4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16A4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
-extern void __fastcall Frame16SwizzleBlock16ZA4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask);
+// Stores the converted pixel (x, y), with x, y < 8, into word 'pix' of the swizzled
+// destination block. For a 16-bit target convert_pixel() packs two pixels,
+// (x, y) and (x + 8, y), into the same word.
+//   srcpitch - row stride of src, in u32 elements
+//   mask     - write mask applied by MaskedOR
+template <int pix, int x, int y, int psm, bool is_float, u8 AA>
+inline void SettleSwizzlePixel(u32* dst, u32* src, int srcpitch, u32 mask) {
+	u32* row = src + y * srcpitch;
+	u32 pixel = convert_pixel<psm, is_float, AA>(row, x);
+
+	// Do not forget the mask: only the bits it allows may be changed.
+	MaskedOR (dst + pix, pixel, mask);
+}
+
+// Writes the swizzled block for src into the dst memory location. No pixel address is
+// calculated here: the destination word order is hard-coded in the table below.
+//
+// Each of the 4 loop iterations consumes two source rows (y = 0 and y = 1, columns
+// x = 0..7) and fills 16 destination words, so one call reads 8 source rows and
+// writes 64 words.
+//   dst       - start of the destination block
+//   sj, si    - not used by this function body
+//   src       - first source row; srcpitch is its row stride in u32 elements
+//   WriteMask - raw write mask, converted by HandleWritemask<psm>() before use
+template <int psm, bool is_float, u8 AA>
+void __fastcall FrameSwizzleBlock(u32* dst, int sj, int si, u32* src, int srcpitch, u32 WriteMask) {
+	u32 mask = HandleWritemask<psm>(WriteMask);									// This function builds the correct mask for 32, 24 and 16 bit targets
+
+	for (int i = 0; i < 4; i++) {
+		SettleSwizzlePixel<0, 0, 0, psm, is_float, AA>(dst, src, srcpitch, mask);				// a single inner loop could be used here, but it is unclear which is faster
+		SettleSwizzlePixel<1, 1, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<2, 0, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<3, 1, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<4, 2, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<5, 3, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<6, 2, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<7, 3, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<8, 4, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<9, 5, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<10, 4, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<11, 5, 1, psm, is_float, AA>(dst, src, srcpitch, mask);		
+		SettleSwizzlePixel<12, 6, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<13, 7, 0, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<14, 6, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+		SettleSwizzlePixel<15, 7, 1, psm, is_float, AA>(dst, src, srcpitch, mask);
+
+		// Advance to the next pair of source rows and the next 16 destination words.
+		src += 2 * srcpitch; 
+		dst += 16;
+	}
+}
 #endif
-
-extern void __fastcall SwizzleColumn32_c(int y, u8* dst, u8* src, int srcpitch, u32 WriteMask = 0xffffffff);
-extern void __fastcall SwizzleColumn16_c(int y, u8* dst, u8* src, int srcpitch);
-extern void __fastcall SwizzleColumn8_c(int y, u8* dst, u8* src, int srcpitch);
-extern void __fastcall SwizzleColumn4_c(int y, u8* dst, u8* src, int srcpitch);
-
-// extern "C" void __fastcall WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32* clut);
-extern "C" void __fastcall WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa);
-extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut);
-// extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut);
-extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32 csa);
-extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut);
-extern void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* vm, u32* clut);
-extern void __fastcall WriteCLUT_T32_I8_CSM1_c(u32* vm, u32* clut);
-
-extern void __fastcall WriteCLUT_T16_I4_CSM1_c(u32* vm, u32* clut);
-extern void __fastcall WriteCLUT_T32_I4_CSM1_c(u32* vm, u32* clut);
-
-extern void SSE2_UnswizzleZ16Target(u16* dst, u16* src, int iters);
-
-#ifdef ZEROGS_SSE2
-
-#define FrameSwizzleBlock32 FrameSwizzleBlock32_c
-#define FrameSwizzleBlock24 FrameSwizzleBlock24_c
-#define FrameSwizzleBlock16 FrameSwizzleBlock16_c
-#define Frame16SwizzleBlock32 Frame16SwizzleBlock32_c
-#define Frame16SwizzleBlock32Z Frame16SwizzleBlock32Z_c
-#define Frame16SwizzleBlock16 Frame16SwizzleBlock16_c
-#define Frame16SwizzleBlock16Z Frame16SwizzleBlock16Z_c
-
-#define FrameSwizzleBlock32A2 FrameSwizzleBlock32A2_c
-#define FrameSwizzleBlock24A2 FrameSwizzleBlock24A2_c
-#define FrameSwizzleBlock16A2 FrameSwizzleBlock16A2_c
-#define Frame16SwizzleBlock32A2 Frame16SwizzleBlock32A2_c
-#define Frame16SwizzleBlock32ZA2 Frame16SwizzleBlock32ZA2_c
-#define Frame16SwizzleBlock16A2 Frame16SwizzleBlock16A2_c
-#define Frame16SwizzleBlock16ZA2 Frame16SwizzleBlock16ZA2_c
-
-#define FrameSwizzleBlock32A4 FrameSwizzleBlock32A4_c
-#define FrameSwizzleBlock24A4 FrameSwizzleBlock24A4_c
-#define FrameSwizzleBlock16A4 FrameSwizzleBlock16A4_c
-#define Frame16SwizzleBlock32A4 Frame16SwizzleBlock32A4_c
-#define Frame16SwizzleBlock32ZA4 Frame16SwizzleBlock32ZA4_c
-#define Frame16SwizzleBlock16A4 Frame16SwizzleBlock16A4_c
-#define Frame16SwizzleBlock16ZA4 Frame16SwizzleBlock16ZA4_c
-
-#define WriteCLUT_T16_I8_CSM1 WriteCLUT_T16_I8_CSM1_sse2
-#define WriteCLUT_T32_I8_CSM1 WriteCLUT_T32_I8_CSM1_sse2
-#define WriteCLUT_T16_I4_CSM1 WriteCLUT_T16_I4_CSM1_sse2
-#define WriteCLUT_T32_I4_CSM1 WriteCLUT_T32_I4_CSM1_sse2
-
-#else
-
-#define FrameSwizzleBlock32 FrameSwizzleBlock32_c
-#define FrameSwizzleBlock16 FrameSwizzleBlock16_c
-#define Frame16SwizzleBlock32 Frame16SwizzleBlock32_c
-#define Frame16SwizzleBlock32Z Frame16SwizzleBlock32Z_c
-#define Frame16SwizzleBlock16 Frame16SwizzleBlock16_c
-#define Frame16SwizzleBlock16Z Frame16SwizzleBlock16Z_c
-
-#define FrameSwizzleBlock32A2 FrameSwizzleBlock32A2_c
-#define FrameSwizzleBlock16A2 FrameSwizzleBlock16A2_c
-#define Frame16SwizzleBlock32A2 Frame16SwizzleBlock32A2_c
-#define Frame16SwizzleBlock32ZA2 Frame16SwizzleBlock32ZA2_c
-#define Frame16SwizzleBlock16A2 Frame16SwizzleBlock16A2_c
-#define Frame16SwizzleBlock16ZA2 Frame16SwizzleBlock16ZA2_c
-
-#define FrameSwizzleBlock32A4 FrameSwizzleBlock32A4_c
-#define FrameSwizzleBlock16A4 FrameSwizzleBlock16A4_c
-#define Frame16SwizzleBlock32A4 Frame16SwizzleBlock32A4_c
-#define Frame16SwizzleBlock32ZA4 Frame16SwizzleBlock32ZA4_c
-#define Frame16SwizzleBlock16A4 Frame16SwizzleBlock16A4_c
-#define Frame16SwizzleBlock16ZA4 Frame16SwizzleBlock16ZA4_c
-
-#define WriteCLUT_T16_I8_CSM1 WriteCLUT_T16_I8_CSM1_c
-#define WriteCLUT_T32_I8_CSM1 WriteCLUT_T32_I8_CSM1_c
-#define WriteCLUT_T16_I4_CSM1 WriteCLUT_T16_I4_CSM1_c
-#define WriteCLUT_T32_I4_CSM1 WriteCLUT_T32_I4_CSM1_c
-
-#endif
-
 #endif
--- a/plugins/zzogl-pg/opengl/zerogs.cpp
+++ b/plugins/zzogl-pg/opengl/zerogs.cpp
@ -43,7 +43,7 @@ void HandleGLError()
 {
 	FUNCLOG
 	// check the error status of this framebuffer */
-	GLenum error = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
+	GLenum error = FB::State();

 	// if error != GL_FRAMEBUFFER_COMPLETE_EXT, there's an error of some sort

@ -210,7 +210,7 @@ void SetAA(int mode)
 //	GL_REPORT_ERROR();
 //
 //	fAlpha = 1;
-//	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);   // switch to the backbuffer
+//	FB::Unbind();   // switch to the backbuffer
 //
 //	DisableAllgl() ;
 //	SetShaderCaller("RenderCustom");
@ -311,7 +311,7 @@ void ExtWrite()
 //  else if (PSMT_ISHALF(texframe.psm)) bpp = 2;
 //
 //  // get the start and end addresses of the buffer
-//  GetRectMemAddress(start, end, texframe.psm, 0, 0, texframe.tw, texframe.th, texframe.tbp0, texframe.tbw);
+//  GetRectMemAddressZero(start, end, texframe.psm, texframe.tw, texframe.th, texframe.tbp0, texframe.tbw);
 }

 ////////////