zzogl-pg: Merge back GregMiscellaneous branch (3867)

* Various cleanups * Replace ASM with intrinsics (much more portable) * Various performance tuning (expect a 10%-20% speedup ^_^) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3868 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-10-03 12:35:57 +00:00 · 2010-10-03 12:35:57 +00:00 · c7a929a530
parent c875caec15
commit c7a929a530
27 changed files with 2006 additions and 972 deletions
--- a/plugins/zzogl-pg/opengl/GS.h
+++ b/plugins/zzogl-pg/opengl/GS.h
@ -205,6 +205,8 @@ inline bool PSMT_HAS_SHARED_BITS (int fpsm, int tpsm) {
 	return (SUM == 0x15 || SUM == 0x1D || SUM == 0x2C || SUM == 0x30);
 }

+// Size in bytes of a single clut pixel: 4 for a 32-bit color clut (cpsm <= 1), 2 for a 16-bit clut (any other cpsm).
+inline int CLUT_PIXEL_SIZE(int cpsm) {return ((cpsm <= 1) ? 4 : 2); }

 //----------------------- Data from registers -----------------------

@ -542,7 +544,9 @@ typedef struct

 extern GSinternal gs;

-static __forceinline u16 RGBA32to16(u32 c)
+// Note: this function is used as a template parameter, so it must be declared extern.
+// Note 2: in this case extern is not compatible with __forceinline, so it is just declared inline.
+extern inline u16 RGBA32to16(u32 c)
 {
 	return (u16)((((c) & 0x000000f8) >>  3) |
 				 (((c) & 0x0000f800) >>  6) |
@ -558,6 +562,7 @@ static __forceinline u32 RGBA16to32(u16 c)
 		   (((c) & 0x8000) ? 0xff000000 : 0);
 }

+#if 0
 // converts float16 [0,1] to BYTE [0,255] (assumes value is in range, otherwise will take lower 8bits)
 // f is a u16
 static __forceinline u16 Float16ToBYTE(u16 f)
@ -603,6 +608,7 @@ static __forceinline u16 Float16ToALPHA(u16 f)
 // used for Z values
 #define Float16ToARGB_Z(f) COLOR_ARGB((u32)Float16ToBYTE_2(f.w), Float16ToBYTE_2(f.x), Float16ToBYTE_2(f.y), Float16ToBYTE_2(f.z))
 #define Float16ToARGB16_Z(f) ((Float16ToBYTE_2(f.y)<<8)|Float16ToBYTE_2(f.z))
+#endif


 inline float Clamp(float fx, float fmin, float fmax)
--- a/plugins/zzogl-pg/opengl/GSmain.cpp
+++ b/plugins/zzogl-pg/opengl/GSmain.cpp
@ -38,6 +38,7 @@ using namespace std;
 #include "targets.h"
 #include "ZZoglShaders.h"
 #include "ZZoglFlushHack.h"
+#include "ZZoglFlushHack.h"

 #ifdef _MSC_VER
 #pragma warning(disable:4244)
@ -68,7 +69,7 @@ extern const char* pbilinear[];
 // statistics
 u32 g_nGenVars = 0, g_nTexVars = 0, g_nAlphaVars = 0, g_nResolve = 0;

-#define VER 2
+#define VER 3
 const unsigned char zgsversion	= PS2E_GS_VERSION;
 unsigned char zgsrevision = 0; // revision and build gives plugin version
 unsigned char zgsbuild	= VER;
@ -143,6 +144,7 @@ void ReportHacks(gameHacks hacks)
 	if (hacks.quick_resolve_1) ZZLog::WriteLn("'Quick resolve 1' enabled.");
 	if (hacks.no_quick_resolve) ZZLog::WriteLn("'No Quick resolve' hack enabled.");
 	if (hacks.no_target_clut) ZZLog::WriteLn("'No target clut' hack enabled.");
+	if (hacks.no_stencil) ZZLog::WriteLn("'No stencil' hack enabled.");
 	if (hacks.vss_hack_off) ZZLog::WriteLn("VSS hack enabled.");
 	if (hacks.no_depth_resolve) ZZLog::WriteLn("'No depth resolve' hack enabled.");
 	if (hacks.full_16_bit_res) ZZLog::WriteLn("'Full 16 bit resolution' hack enabled.");
@ -151,7 +153,7 @@ void ReportHacks(gameHacks hacks)
 	if (hacks.no_alpha_test) ZZLog::WriteLn("'No alpha test' hack enabled.");
 	if (hacks.disable_mrt_depth) ZZLog::WriteLn("'Disable mrt depth' hack enabled.");
 	if (hacks.args_32_bit) ZZLog::WriteLn("'Args 32 bit' hack enabled.");
-	if (hacks.path3) ZZLog::WriteLn("'Path3' hack enabled.");
+	//if (hacks.path3) ZZLog::WriteLn("'Path3' hack enabled.");
 	if (hacks.parallel_context) ZZLog::WriteLn("'Parallel context' hack enabled.");
 	if (hacks.xenosaga_spec) ZZLog::WriteLn("'Xenosaga spec' hack enabled.");
 	if (hacks.partial_pointers) ZZLog::WriteLn("'Partial pointers' hack enabled.");
@ -382,6 +384,7 @@ void CALLBACK GSclose()

 	SaveStateFile = NULL;
 	SaveStateExists = true; // default value
+    g_LastCRC = 0;
 }

 void CALLBACK GSirqCallback(void (*callback)())
--- a/plugins/zzogl-pg/opengl/GifTransfer.cpp
+++ b/plugins/zzogl-pg/opengl/GifTransfer.cpp
@ -87,9 +87,7 @@ template<int index> void _GSgifTransfer(const u32 *pMem, u32 size)
 			path->setTag(pMem);
 			pMem += 4;
 			size--;
-
-			if ((conf.settings().path3) && (index == 2) && path->eop) nPath3Hack = 1;
-
+			
 			// eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and
 			// values other than the EOP field are disregarded."
 			if (path->nloop > 0)
--- a/plugins/zzogl-pg/opengl/HostMemory.cpp
+++ b/plugins/zzogl-pg/opengl/HostMemory.cpp
@ -78,7 +78,12 @@
 	
 	static vector<u8> s_vTempBuffer, s_vTransferCache;
 	static int gs_imageEnd = 0;
-	
+
+//	From the start of monster labs. In all 3 cases, psm == 0.
+//	ZZogl-PG:  GetRectMemAddress(0x3f4000, 0x404000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f40, 0x100);
+//	ZZogl-PG:  GetRectMemAddress(0x3f8000, 0x408000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f80, 0x100);
+//	ZZogl-PG:  GetRectMemAddress(0x3fc000, 0x40c000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3fc0, 0x100);
+
 	void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw)
 	{
 		FUNCLOG
@ -108,7 +113,7 @@
 		bits = PSMT_BITS_NUM(psm);
 		start = getPixelFun[psm](x, y, bp, bw);
 		end = getPixelFun[psm](x + w - 1, y + h - 1, bp, bw) + 1;
-		
+
 		if (bits > 0)
 		{
 			start *= bits;
@ -158,7 +163,7 @@

 		if (end > MEMORY_END)
 		{
-			ZZLog::Warn_Log("Host local out of bounds!");
+			ZZLog::Warn_Log("Init host local out of bounds! (end == 0x%x)", end);
 			//gs.imageTransfer = -1;
 			end = MEMORY_END;
 		}
@ -178,9 +183,8 @@
 		int start, end;

 		GetRectMemAddress(start, end, gs.dstbuf.psm, gs.imageX, gs.imageY, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw);
-
+		
 		assert(start < gs_imageEnd);
-
 		end = gs_imageEnd;

 		// sometimes games can decompress to alpha channel of render target only, in this case
@ -434,20 +438,20 @@ __forceinline void _TransferLocalLocal_4()
 			write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
 			pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
 	
-			read = gsp((j+2)%2048, i%2048, gs.srcbuf.bw);
-			write = gdp((j2+2)%2048, i2%2048, gs.dstbuf.bw);
+			read = gsp((j+4)%2048, i%2048, gs.srcbuf.bw);
+			write = gdp((j2+4)%2048, i2%2048, gs.dstbuf.bw);
 			pDstBuf[write] = (pDstBuf[write]&0xf0)|(pSrcBuf[read]&0x0f);
 	
-			read = gsp((j+3)%2048, i%2048, gs.srcbuf.bw);
-			write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
+			read = gsp((j+5)%2048, i%2048, gs.srcbuf.bw);
+			write = gdp((j2+5)%2048, i2%2048, gs.dstbuf.bw);
 			pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
 	
-			read = gsp((j+2)%2048, i%2048, gs.srcbuf.bw);
-			write = gdp((j2+2)%2048, i2%2048, gs.dstbuf.bw);
+			read = gsp((j+6)%2048, i%2048, gs.srcbuf.bw);
+			write = gdp((j2+6)%2048, i2%2048, gs.dstbuf.bw);
 			pDstBuf[write] = (pDstBuf[write]&0xf0)|(pSrcBuf[read]&0x0f);
 	
-			read = gsp((j+3)%2048, i%2048, gs.srcbuf.bw);
-			write = gdp((j2+3)%2048, i2%2048, gs.dstbuf.bw);
+			read = gsp((j+7)%2048, i%2048, gs.srcbuf.bw);
+			write = gdp((j2+7)%2048, i2%2048, gs.dstbuf.bw);
 			pDstBuf[write] = (pDstBuf[write]&0x0f)|(pSrcBuf[read]&0xf0);
 		}
 	}
--- a/plugins/zzogl-pg/opengl/Linux/Linux.cpp
+++ b/plugins/zzogl-pg/opengl/Linux/Linux.cpp
@ -132,28 +132,35 @@ void CreateGameHackTable(GtkWidget *treeview, gameHacks hacks)
 	mapConfOpts.clear();

 	add_map_entry(GAME_TEXTURETARGS, "00000001", "Tex Target checking - 00000001\nLego Racers");
-	add_map_entry(GAME_AUTORESET, "00000002", "Auto reset targs - 00000002\nShadow Hearts, Samurai Warriors.  Use when game is slow and toggling AA fixes it.");
-	add_map_entry(GAME_NOTARGETRESOLVE, "00000010", "No target resolves - 00000010\nStops all resolving of targets.  Try this first for really slow games. Dark Cloud 1");
-	add_map_entry(GAME_EXACTCOLOR, "00000020", "Exact color testing - 00000020\nFixes overbright or shadow/black artifacts (Crash 'n Burn).");
-	add_map_entry(GAME_NOCOLORCLAMP, "00000040", "No color clamping - 00000040\nSpeeds up games, but might be too bright or too dim.");
-	add_map_entry(GAME_NOALPHAFAIL, "00000100", "Alpha Fail hack - 00000100\nFor Sonic Unleashed, Shadow the Hedgehog, Ghost in the Shell. Remove vertical stripes or other coloring artefacts. Break Persona 4 and MGS3");
+	add_map_entry(GAME_AUTORESET, "00000002", "Auto reset targs - 00000002\nUse when game is slow and toggling AA fixes it. Samurai Warriors. (Automatically on for Shadow Hearts)");
+	add_map_entry(GAME_INTERLACE2X, "00000004", "Interlace 2X - 00000004\nFixes 2x bigger screen. Gradius 3.");
+	//GAME_TEXAHACK (still implemented)
+	add_map_entry(GAME_NOTARGETRESOLVE, "00000010", "No target resolves - 00000010\nStops all resolving of targets.  Try this first for really slow games. (Automatically on for Dark Cloud 1.)");
+	add_map_entry(GAME_EXACTCOLOR, "00000020", "Exact color testing - 00000020\nFixes overbright or shadow/black artifacts. Crash 'n Burn.");
+	//add_map_entry(GAME_NOCOLORCLAMP, "00000040", "No color clamping - 00000040\nSpeeds up games, but might be too bright or too dim.");
+	//GAME_FFXHACK
+	add_map_entry(GAME_NOALPHAFAIL, "00000100", "Alpha Fail hack - 00000100\nRemove vertical stripes or other coloring artifacts. Breaks Persona 4 and MGS3. (Automatically on for Sonic Unleashed, Shadow the Hedgehog, & Ghost in the Shell.)");
 	add_map_entry(GAME_NODEPTHUPDATE, "00000200", "Disable depth updates - 00000200");
-	add_map_entry(GAME_QUICKRESOLVE1, "00000400", "Resolve Hack #1 - 00000400\nKingdom Hearts.  Speeds some games.");
-	add_map_entry(GAME_NOQUICKRESOLVE, "00000800", "Resolve Hack #2 - 00000800\nShadow Hearts, Urbz. Destroy FFX");
+	add_map_entry(GAME_QUICKRESOLVE1, "00000400", "Resolve Hack #1 - 00000400\n Speeds some games. Kingdom Hearts.");
+	add_map_entry(GAME_NOQUICKRESOLVE, "00000800", "Resolve Hack #2 - 00000800\nShadow Hearts, Urbz. Destroys FFX.");
 	add_map_entry(GAME_NOTARGETCLUT, "00001000", "No target CLUT - 00001000\nResident Evil 4, or foggy scenes.");
-	add_map_entry(GAME_NOSTENCIL, "00002000", "Disable stencil buffer - 00002000\nUsually safe to do for simple scenes. Harvest Moon");
+	add_map_entry(GAME_NOSTENCIL, "00002000", "Disable stencil buffer - 00002000\nUsually safe to do for simple scenes. Harvest Moon.");
+	//GAME_VSSHACKOFF (still implemented)
 	add_map_entry(GAME_NODEPTHRESOLVE, "00008000", "No depth resolve - 00008000\nMight give z buffer artifacts.");
 	add_map_entry(GAME_FULL16BITRES, "00010000", "Full 16 bit resolution - 00010000\nUse when half the screen is missing.");
 	add_map_entry(GAME_RESOLVEPROMOTED, "00020000", "Resolve Hack #3 - 00020000\nNeopets");
-	add_map_entry(GAME_FASTUPDATE, "00040000", "Fast Update - 00040000\nOkami. Speeds some games. Needs for Sonic Unleashed");
+	add_map_entry(GAME_FASTUPDATE, "00040000", "Fast Update - 00040000\n Speeds some games. Needed for Sonic Unleashed. Okami.");
 	add_map_entry(GAME_NOALPHATEST, "00080000", "Disable alpha testing - 00080000");
 	add_map_entry(GAME_DISABLEMRTDEPTH, "00100000", "Enable Multiple RTs - 00100000");
-	add_map_entry(GAME_XENOSPECHACK, "01000000", "Specular Highlights - 01000000\nMakes Xenosaga and Okage graphics faster by removing highlights");
-	add_map_entry(GAME_PARTIALPOINTERS, "02000000", "Partial targets - 02000000");
+	//GAME_32BITTARGS
+	//GAME_PATH3HACK
+	//GAME_DOPARALLELCTX
+	add_map_entry(GAME_XENOSPECHACK, "01000000", "Specular Highlights - 01000000\nMakes graphics faster by removing highlights. (Automatically on for Xenosaga, Okami, & Okage.)");
+	//add_map_entry(GAME_PARTIALPOINTERS, "02000000", "Partial targets - 02000000");
 	add_map_entry(GAME_PARTIALDEPTH, "04000000", "Partial depth - 04000000");
-	add_map_entry(GAME_GUSTHACK, "10000000", "Gust fix, made gustgame more clean and fast - 10000000");
-	add_map_entry(GAME_NOLOGZ, "20000000", "No logarithmic Z, could decrease number of Z-artefacts - 20000000");
-	add_map_entry(GAME_INTERLACE2X, "00000004", "Interlace 2X - 00000004\nFixes 2x bigger screen (Gradius 3).");
+	//GAME_REGETHACK (commented out in code)
+	add_map_entry(GAME_GUSTHACK, "10000000", "Gust fix - 10000000. Makes gust games cleaner and faster. (Automatically on for most Gust games)");
+	add_map_entry(GAME_NOLOGZ, "20000000", "No logarithmic Z - 20000000. Could decrease number of Z-artifacts.");
 	add_map_entry(GAME_AUTOSKIPDRAW, "40000000", "Remove blur effect on some games\nSlow games.");

 	for (map<string, confOptsStruct>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
@ -255,7 +262,7 @@ void DisplayDialog()
 	GtkWidget *option_frame, *option_box;
 	GtkWidget *log_check;
 	GtkWidget *int_label, *int_box, *int_holder;
-	GtkWidget *bilinear_check;
+	GtkWidget *bilinear_label, *bilinear_box, *bilinear_holder;
 	GtkWidget *aa_label, *aa_box, *aa_holder;
 	GtkWidget *snap_label, *snap_box, *snap_holder;
 	GtkWidget  *fullscreen_label, *widescreen_check;
@ -293,10 +300,18 @@ void DisplayDialog()
 	gtk_box_pack_start(GTK_BOX(int_holder), int_label, false, false, 2);
 	gtk_box_pack_start(GTK_BOX(int_holder), int_box, false, false, 2);

-
-	bilinear_check = gtk_check_button_new_with_label("Bilinear Filtering");
-	gtk_widget_set_tooltip_text(bilinear_check, "Best quality is off. Turn on for speed. Toggled by pressing Shift + F5 when running.");
-
+	bilinear_label = gtk_label_new("Bilinear Filtering:");
+	bilinear_box = gtk_combo_box_new_text();
+	
+	gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Off");
+	gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Normal");
+	gtk_combo_box_append_text(GTK_COMBO_BOX(bilinear_box), "Forced");
+	gtk_combo_box_set_active(GTK_COMBO_BOX(bilinear_box), conf.bilinear);
+	gtk_widget_set_tooltip_text(bilinear_box, "Best quality is off. Turn on for speed. Toggled by pressing Shift + F5 when running.");
+	bilinear_holder = gtk_hbox_new(false, 5);
+	gtk_box_pack_start(GTK_BOX(bilinear_holder), bilinear_label, false, false, 2);
+	gtk_box_pack_start(GTK_BOX(bilinear_holder), bilinear_box, false, false, 2);
+	
 	aa_label = gtk_label_new("Anti-Aliasing:");
 	aa_box = gtk_combo_box_new_text();

@ -352,7 +367,7 @@ void DisplayDialog()
 	gtk_frame_set_shadow_type(GTK_FRAME(option_frame), GTK_SHADOW_NONE);

 	gtk_box_pack_start(GTK_BOX(option_box), log_check, false, false, 2);
-	gtk_box_pack_start(GTK_BOX(option_box), bilinear_check, false, false, 2);
+	gtk_box_pack_start(GTK_BOX(option_box), bilinear_holder, false, false, 2);
 	gtk_box_pack_start(GTK_BOX(option_box), int_holder, false, false, 2);
 	gtk_box_pack_start(GTK_BOX(option_box), aa_holder, false, false, 2);
 	gtk_box_pack_start(GTK_BOX(option_box), snap_holder, false, false, 2);
@ -370,7 +385,6 @@ void DisplayDialog()
 	gtk_box_pack_start(GTK_BOX(main_box), option_frame, false, false, 2);

 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(log_check), conf.log);
-	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(bilinear_check), conf.bilinear);
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(widescreen_check), (conf.widescreen()));

 	gtk_container_add(GTK_CONTAINER(GTK_DIALOG(dialog)->vbox), main_frame);
@ -389,9 +403,11 @@ void DisplayDialog()

 		if (gtk_combo_box_get_active(GTK_COMBO_BOX(aa_box)) != -1)
 			conf.aa = gtk_combo_box_get_active(GTK_COMBO_BOX(aa_box));
+			
+		if (gtk_combo_box_get_active(GTK_COMBO_BOX(bilinear_box)) != -1)
+			conf.bilinear = gtk_combo_box_get_active(GTK_COMBO_BOX(bilinear_box));

 		conf.log = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(log_check));
-		conf.bilinear = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(bilinear_check));
 		fake_options.widescreen = gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(widescreen_check));
 		fake_options.tga_snap = gtk_combo_box_get_active(GTK_COMBO_BOX(snap_box));
 		
@ -445,7 +461,7 @@ void SysMessage(const char *fmt, ...)

 void CALLBACK GSabout()
 {
-	SysMessage("ZZOgl PG: by Zeydlitz (PG version worked on by arcum42). Based off of ZeroGS, by zerofrog.");
+	SysMessage("ZZOgl PG: by Zeydlitz (PG version worked on by arcum42, gregory, and the pcsx2 development team). Based off of ZeroGS, by zerofrog.");	// plugin credits, reported through SysMessage
 }

 s32 CALLBACK GStest()
--- a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp
+++ b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp
@ -152,6 +152,7 @@
 		<Unit filename="../../ZZoglFlush.cpp" />
 		<Unit filename="../../ZZoglFlushHack.cpp" />
 		<Unit filename="../../ZZoglFlushHack.h" />
+		<Unit filename="../../ZZoglMath.h" />
 		<Unit filename="../../ZZoglSave.cpp" />
 		<Unit filename="../../ZZoglShaders.cpp" />
 		<Unit filename="../../ZZoglShaders.h" />
@ -171,7 +172,6 @@
 		<Unit filename="../../x86.h" />
 		<Unit filename="../../zerogs.cpp" />
 		<Unit filename="../../zerogs.h" />
-		<Unit filename="../../zerogsmath.h" />
 		<Unit filename="../../zpipe.cpp" />
 		<Unit filename="../../zpipe.h" />
 		<Extensions>
--- a/plugins/zzogl-pg/opengl/Mem.cpp
+++ b/plugins/zzogl-pg/opengl/Mem.cpp
@ -184,7 +184,7 @@ static __forceinline int RealTransfer(u32 psm, const void* pbyMem, u32 nQWordSiz
 	tempY = gs.imageY;
 	tempX = gs.imageX;
 	Point alignedPt;
-	
+
 	nSize = (nQWordSize * 4 * 2) / tp2;
 	nSize = min(nSize, gs.imageWnew * gs.imageHnew);

@ -241,237 +241,136 @@ void TransferLocalHost24Z(void* pbyMem, u32 nQWordSize)		{FUNCLOG}
 void TransferLocalHost16Z(void* pbyMem, u32 nQWordSize)		{FUNCLOG}
 void TransferLocalHost16SZ(void* pbyMem, u32 nQWordSize)	{FUNCLOG}

-#define FILL_BLOCK(psm, psmcol) \
-{ \
-	b.pageTable = &g_pageTable##psm[0][0]; \
-	b.blockTable = &g_blockTable##psm[0][0]; \
-	b.columnTable = &g_columnTable##psmcol[0][0]; \
-	\
-	assert( sizeof(g_pageTable##psm) == b.width * b.height * sizeof(g_pageTable##psm[0][0]) ); \
-	\
-	psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
-	psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
-	\
-	for(i = 0; i < b.height; ++i) \
-	{ \
-		u32 i_width = i*BLOCK_TEXWIDTH; \
-		for(j = 0; j < b.width; ++j) \
-		{ \
-			/* fill the table */ \
-			u32 u = g_blockTable##psm[(i / b.colheight)][(j / b.colwidth)] * 64 * b.mult + g_columnTable##psmcol[i%b.colheight][j%b.colwidth]; \
-			b.pageTable[i * b.width + j] = u; \
-			psrcf[i_width + j] = (float)(u) / (float)(GPU_TEXWIDTH * b.mult); \
-		} \
-	} \
-	\
-	psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
-	\
-	for(i = 0; i < b.height; ++i) \
-	{ \
-		u32 i_width = i*BLOCK_TEXWIDTH; \
-		u32 i_width2 = ((i+1)%b.height)*BLOCK_TEXWIDTH; \
-		for(j = 0; j < b.width; ++j) \
-		{ \
-			u32 temp = ((j + 1) % b.width); \
-			float4* pv = &psrcv[i_width + j]; \
-			pv->x = psrcf[i_width + j]; \
-			pv->y = psrcf[i_width + temp]; \
-			pv->z = psrcf[i_width2 + j]; \
-			pv->w = psrcf[i_width2 + temp]; \
-		} \
-	} \
-} 
-
-#define FILL_BLOCK_NF(psm, psmcol) \
-{ \
-	b.pageTable = &g_pageTable##psm[0][0]; \
-	b.blockTable = &g_blockTable##psm[0][0]; \
-	b.columnTable = &g_columnTable##psmcol[0][0]; \
-	\
-	assert( sizeof(g_pageTable##psm) == b.width * b.height * sizeof(g_pageTable##psm[0][0]) ); \
-	\
-	psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
-	psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; \
-	\
-	for(i = 0; i < b.height; ++i) \
-	{ \
-		u32 i_width = i*BLOCK_TEXWIDTH; \
-		for(j = 0; j < b.width; ++j) \
-		{ \
-			/* fill the table */ \
-			u32 u = g_blockTable##psm[(i / b.colheight)][(j / b.colwidth)] * 64 * b.mult + g_columnTable##psmcol[i%b.colheight][j%b.colwidth]; \
-			b.pageTable[i * b.width + j] = u; \
-			psrcw[i_width + j] = u; \
-		} \
-	} \
-} 
-
-void FillBlocksNF(vector<char>& vBlockData, vector<char>& vBilinearData)
+void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData, int floatfmt) // fills b.pageTable and b's region of vBlockData (plus vBilinearData when floatfmt is set)
 {
-	FUNCLOG
-	vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
+	float* psrcf = (float*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH; // float view of vBlockData, starting at this block's (ox, oy) offset
+    u16* psrcw = NULL;
+    if (!floatfmt) // the u16 view is only used for the non-float format
+        psrcw = (u16*)&vBlockData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;

-	int i, j;
-	BLOCK b;
-	float* psrcf = NULL;
-	u16* psrcw = NULL;
+	for(int i = 0; i < b.height; ++i)
+	{
+		u32 i_width = i*BLOCK_TEXWIDTH;
+		for(int j = 0; j < b.width; ++j)
+		{
+			/* fill the table */
+            u32 bt = b.blockTable[(i / b.colheight)*(b.width/b.colwidth) + (j / b.colwidth)]; // flat row-major lookup: b.width/b.colwidth entries per block-table row
+            u32 ct = b.columnTable[(i%b.colheight)*b.colwidth + (j%b.colwidth)]; // flat row-major lookup: b.colwidth entries per column-table row
+            u32 u = bt * 64 * b.mult + ct;
+			b.pageTable[i * b.width + j] = u;
+            if (floatfmt)
+                psrcf[i_width + j] = (float)(u) / (float)(GPU_TEXWIDTH * b.mult); // float format stores u divided by GPU_TEXWIDTH * b.mult
+            else
+                psrcw[i_width + j] = u; // non-float format stores u directly as a u16

-	memset(m_Blocks, 0, sizeof(m_Blocks));
+		}
+	}

-	// 32
-	b.SetDim(64, 32, 0, 0, 1);
-	FILL_BLOCK_NF(32, 32);
-	m_Blocks[PSMCT32] = b;
-	m_Blocks[PSMCT32].SetFun(PSMCT32);
+    if (floatfmt) { // bilinear data: each entry packs a value with its right, lower and lower-right neighbours
+        float4* psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;

-	// 24 (same as 32 except write/readPixel are different)
-	m_Blocks[PSMCT24] = b;
-	m_Blocks[PSMCT24].SetFun(PSMCT24);
-
-	// 8H (same as 32 except write/readPixel are different)
-	m_Blocks[PSMT8H] = b;
-	m_Blocks[PSMT8H].SetFun(PSMT8H);
-
-	m_Blocks[PSMT4HL] = b;
-	m_Blocks[PSMT4HL].SetFun(PSMT4HL);
-	
-	m_Blocks[PSMT4HH] = b;
-	m_Blocks[PSMT4HH].SetFun(PSMT4HH);
-
-	// 32z
-	b.SetDim(64, 32, 64, 0, 1);
-	FILL_BLOCK_NF(32Z, 32);
-	m_Blocks[PSMT32Z] = b;
-	m_Blocks[PSMT32Z].SetFun(PSMT32Z);
-
-	// 24Z (same as 32Z except write/readPixel are different)
-	m_Blocks[PSMT24Z] = b;
-	m_Blocks[PSMT24Z].SetFun(PSMT24Z);
-
-	// 16
-	b.SetDim(64, 64, 0, 32, 2);
-	FILL_BLOCK_NF(16, 16);
-	m_Blocks[PSMCT16] = b;
-	m_Blocks[PSMCT16].SetFun(PSMCT16);
-
-	// 16s
-	b.SetDim(64, 64, 64, 32, 2);
-	FILL_BLOCK_NF(16S, 16);
-	m_Blocks[PSMCT16S] = b;
-	m_Blocks[PSMCT16S].SetFun(PSMCT16S);
-
-	// 16z
-	b.SetDim(64, 64, 0, 96, 2);
-	FILL_BLOCK_NF(16Z, 16);
-	m_Blocks[PSMT16Z] = b;
-	m_Blocks[PSMT16Z].SetFun(PSMT16Z);
-
-	// 16sz
-	b.SetDim(64, 64, 64, 96, 2);
-	FILL_BLOCK_NF(16SZ, 16);
-	m_Blocks[PSMT16SZ] = b;
-	m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
-
-	// 8
-	b.SetDim(128, 64, 0, 160, 4);
-	FILL_BLOCK_NF(8, 8);
-	m_Blocks[PSMT8] = b;
-	m_Blocks[PSMT8].SetFun(PSMT8);
-
-	// 4
-	b.SetDim(128, 128, 0, 224, 8);
-	FILL_BLOCK_NF(4, 4);
-	m_Blocks[PSMT4] = b;
-	m_Blocks[PSMT4].SetFun(PSMT4);
-}
-
-
-void FillBlocksF(vector<char>& vBlockData, vector<char>& vBilinearData)
-{
-	FUNCLOG
-	vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
-	vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
-
-	int i, j;
-	BLOCK b;
-	float* psrcf = NULL;
-	u16* psrcw = NULL;
-	float4* psrcv = NULL;
-
-	memset(m_Blocks, 0, sizeof(m_Blocks));
-
-	// 32
-	b.SetDim(64, 32, 0, 0, 1);
-	FILL_BLOCK(32, 32);
-	m_Blocks[PSMCT32] = b;
-	m_Blocks[PSMCT32].SetFun(PSMCT32);
-
-	// 24 (same as 32 except write/readPixel are different)
-	m_Blocks[PSMCT24] = b;
-	m_Blocks[PSMCT24].SetFun(PSMCT24);
-
-	// 8H (same as 32 except write/readPixel are different)
-	m_Blocks[PSMT8H] = b;
-	m_Blocks[PSMT8H].SetFun(PSMT8H);
-
-	m_Blocks[PSMT4HL] = b;
-	m_Blocks[PSMT4HL].SetFun(PSMT4HL);
-	
-	m_Blocks[PSMT4HH] = b;
-	m_Blocks[PSMT4HH].SetFun(PSMT4HH);
-
-	// 32z
-	b.SetDim(64, 32, 64, 0, 1);
-	FILL_BLOCK(32Z, 32);
-	m_Blocks[PSMT32Z] = b;
-	m_Blocks[PSMT32Z].SetFun(PSMT32Z);
-
-	// 24Z (same as 32Z except write/readPixel are different)
-	m_Blocks[PSMT24Z] = b;
-	m_Blocks[PSMT24Z].SetFun(PSMT24Z);
-
-	// 16
-	b.SetDim(64, 64, 0, 32, 2);
-	FILL_BLOCK(16, 16);
-	m_Blocks[PSMCT16] = b;
-	m_Blocks[PSMCT16].SetFun(PSMCT16);
-
-	// 16s
-	b.SetDim(64, 64, 64, 32, 2);
-	FILL_BLOCK(16S, 16);
-	m_Blocks[PSMCT16S] = b;
-	m_Blocks[PSMCT16S].SetFun(PSMCT16S);
-
-	// 16z
-	b.SetDim(64, 64, 0, 96, 2);
-	FILL_BLOCK(16Z, 16);
-	m_Blocks[PSMT16Z] = b;
-	m_Blocks[PSMT16Z].SetFun(PSMT16Z);
-
-	// 16sz
-	b.SetDim(64, 64, 64, 96, 2);
-	FILL_BLOCK(16SZ, 16);
-	m_Blocks[PSMT16SZ] = b;
-	m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
-
-	// 8
-	b.SetDim(128, 64, 0, 160, 4);
-	FILL_BLOCK(8, 8);
-	m_Blocks[PSMT8] = b;
-	m_Blocks[PSMT8].SetFun(PSMT8);
-
-	// 4
-	b.SetDim(128, 128, 0, 224, 8);
-	FILL_BLOCK(4, 4);
-	m_Blocks[PSMT4] = b;
-	m_Blocks[PSMT4].SetFun(PSMT4);
+        for(int i = 0; i < b.height; ++i)
+        {
+            u32 i_width = i*BLOCK_TEXWIDTH;
+            u32 i_width2 = ((i+1)%b.height)*BLOCK_TEXWIDTH; // next row, wrapping around at b.height
+            for(int j = 0; j < b.width; ++j)
+            {
+                u32 temp = ((j + 1) % b.width); // next column, wrapping around at b.width
+                float4* pv = &psrcv[i_width + j];
+                pv->x = psrcf[i_width + j];
+                pv->y = psrcf[i_width + temp];
+                pv->z = psrcf[i_width2 + j];
+                pv->w = psrcf[i_width2 + temp];
+            }
+        }
+    }
 }

 void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, int floatfmt)
 {
 	FUNCLOG
-	if (floatfmt) 
-		FillBlocksF(vBlockData, vBilinearData);
-	else
-		FillBlocksNF(vBlockData, vBilinearData);
+    if (floatfmt) { // float format: fill_block writes vBlockData as float and also fills vBilinearData
+        vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
+        vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
+    } else { // non-float format: fill_block writes vBlockData as u16; vBilinearData is not resized
+        vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
+    }
+
+	BLOCK b;
+
+	memset(m_Blocks, 0, sizeof(m_Blocks)); // zero every entry; only the formats handled below are filled in
+
+	// 32
+	b.SetDim(64, 32, 0, 0, 1);
+    b.SetTable(PSMCT32); // called after SetDim: SetTable asserts the page table size against width * height
+    fill_block(b, vBlockData, vBilinearData, floatfmt); // fills the page table and this format's region of the block data
+	m_Blocks[PSMCT32] = b;
+	m_Blocks[PSMCT32].SetFun(PSMCT32);
+
+	// 24 (same as 32 except write/readPixel are different)
+	m_Blocks[PSMCT24] = b;
+	m_Blocks[PSMCT24].SetFun(PSMCT24);
+
+	// 8H (same as 32 except write/readPixel are different)
+	m_Blocks[PSMT8H] = b;
+	m_Blocks[PSMT8H].SetFun(PSMT8H);
+
+	m_Blocks[PSMT4HL] = b;
+	m_Blocks[PSMT4HL].SetFun(PSMT4HL);
+
+	m_Blocks[PSMT4HH] = b;
+	m_Blocks[PSMT4HH].SetFun(PSMT4HH);
+
+	// 32z
+	b.SetDim(64, 32, 64, 0, 1);
+    b.SetTable(PSMT32Z);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMT32Z] = b;
+	m_Blocks[PSMT32Z].SetFun(PSMT32Z);
+
+	// 24Z (same as 32Z except write/readPixel are different)
+	m_Blocks[PSMT24Z] = b;
+	m_Blocks[PSMT24Z].SetFun(PSMT24Z);
+
+	// 16
+	b.SetDim(64, 64, 0, 32, 2);
+    b.SetTable(PSMCT16);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMCT16] = b;
+	m_Blocks[PSMCT16].SetFun(PSMCT16);
+
+	// 16s
+	b.SetDim(64, 64, 64, 32, 2);
+    b.SetTable(PSMCT16S);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMCT16S] = b;
+	m_Blocks[PSMCT16S].SetFun(PSMCT16S);
+
+	// 16z
+	b.SetDim(64, 64, 0, 96, 2);
+    b.SetTable(PSMT16Z);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMT16Z] = b;
+	m_Blocks[PSMT16Z].SetFun(PSMT16Z);
+
+	// 16sz
+	b.SetDim(64, 64, 64, 96, 2);
+    b.SetTable(PSMT16SZ);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMT16SZ] = b;
+	m_Blocks[PSMT16SZ].SetFun(PSMT16SZ);
+
+	// 8
+	b.SetDim(128, 64, 0, 160, 4);
+    b.SetTable(PSMT8);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMT8] = b;
+	m_Blocks[PSMT8].SetFun(PSMT8);
+
+	// 4
+	b.SetDim(128, 128, 0, 224, 8);
+    b.SetTable(PSMT4);
+    fill_block(b, vBlockData, vBilinearData, floatfmt);
+	m_Blocks[PSMT4] = b;
+	m_Blocks[PSMT4].SetFun(PSMT4);
 }
--- a/plugins/zzogl-pg/opengl/Mem.h
+++ b/plugins/zzogl-pg/opengl/Mem.h
@ -92,6 +92,29 @@ struct TransferFuncts
 extern TransferData tData[64];
 // rest not visible externally

+extern u32 g_blockTable32[4][8];
+extern u32 g_blockTable32Z[4][8];
+extern u32 g_blockTable16[8][4];
+extern u32 g_blockTable16S[8][4];
+extern u32 g_blockTable16Z[8][4];
+extern u32 g_blockTable16SZ[8][4];
+extern u32 g_blockTable8[4][8];
+extern u32 g_blockTable4[8][4];
+
+extern u32 g_columnTable32[8][8];
+extern u32 g_columnTable16[8][16];
+extern u32 g_columnTable8[16][16];
+extern u32 g_columnTable4[16][32];
+
+extern u32 g_pageTable32[32][64];
+extern u32 g_pageTable32Z[32][64];
+extern u32 g_pageTable16[64][64];
+extern u32 g_pageTable16S[64][64];
+extern u32 g_pageTable16Z[64][64];
+extern u32 g_pageTable16SZ[64][64];
+extern u32 g_pageTable8[64][128];
+extern u32 g_pageTable4[128][128];
+
 struct BLOCK
 {
 	BLOCK() { memset(this, 0, sizeof(BLOCK)); }
@ -142,47 +165,69 @@ struct BLOCK
 		TransferHostLocal = TransferHostLocalFun[psm];
 		TransferLocalHost = TransferLocalHostFun[psm];
 	}
+
+    void SetTable(u32 psm) // points pageTable/blockTable/columnTable at the global tables for psm; any other psm gets NULL pointers
+    {
+        switch (psm) {
+            case PSMCT32:
+                assert( sizeof(g_pageTable32) == width * height * sizeof(g_pageTable32[0][0]) ); // page table must hold exactly width * height entries
+                pageTable = &g_pageTable32[0][0];
+                blockTable = &g_blockTable32[0][0];
+                columnTable = &g_columnTable32[0][0];
+                break;
+            case PSMT32Z:
+                assert( sizeof(g_pageTable32Z) == width * height * sizeof(g_pageTable32Z[0][0]) );
+                pageTable = &g_pageTable32Z[0][0];
+                blockTable = &g_blockTable32Z[0][0];
+                columnTable = &g_columnTable32[0][0]; // 32Z uses the same column table as 32
+                break;
+            case PSMCT16:
+                assert( sizeof(g_pageTable16) == width * height * sizeof(g_pageTable16[0][0]) );
+                pageTable = &g_pageTable16[0][0];
+                blockTable = &g_blockTable16[0][0];
+                columnTable = &g_columnTable16[0][0];
+                break;
+            case PSMCT16S:
+                assert( sizeof(g_pageTable16S) == width * height * sizeof(g_pageTable16S[0][0]) );
+                pageTable = &g_pageTable16S[0][0];
+                blockTable = &g_blockTable16S[0][0];
+                columnTable = &g_columnTable16[0][0]; // 16S uses the same column table as 16
+                break;
+            case PSMT16Z:
+                assert( sizeof(g_pageTable16Z) == width * height * sizeof(g_pageTable16Z[0][0]) );
+                pageTable = &g_pageTable16Z[0][0];
+                blockTable = &g_blockTable16Z[0][0];
+                columnTable = &g_columnTable16[0][0]; // 16Z uses the same column table as 16
+                break;
+            case PSMT16SZ:
+                assert( sizeof(g_pageTable16SZ) == width * height * sizeof(g_pageTable16SZ[0][0]) );
+                pageTable = &g_pageTable16SZ[0][0];
+                blockTable = &g_blockTable16SZ[0][0];
+                columnTable = &g_columnTable16[0][0]; // 16SZ uses the same column table as 16
+                break;
+            case PSMT8:
+                assert( sizeof(g_pageTable8) == width * height * sizeof(g_pageTable8[0][0]) );
+                pageTable = &g_pageTable8[0][0];
+                blockTable = &g_blockTable8[0][0];
+                columnTable = &g_columnTable8[0][0];
+                break;
+            case PSMT4:
+                assert( sizeof(g_pageTable4) == width * height * sizeof(g_pageTable4[0][0]) );
+                pageTable = &g_pageTable4[0][0];
+                blockTable = &g_blockTable4[0][0];
+                columnTable = &g_columnTable4[0][0];
+                break;
+            default: // no tables are assigned for any other psm value
+                pageTable = NULL;
+                blockTable = NULL;
+                columnTable = NULL;
+                break;
+        }
+    }
 };

 extern BLOCK m_Blocks[];

-extern u32 g_blockTable32[4][8];
-extern u32 g_blockTable32Z[4][8];
-extern u32 g_blockTable16[8][4];
-extern u32 g_blockTable16S[8][4];
-extern u32 g_blockTable16Z[8][4];
-extern u32 g_blockTable16SZ[8][4];
-extern u32 g_blockTable8[4][8];
-extern u32 g_blockTable4[8][4];
-
-extern u32 g_columnTable32[8][8];
-extern u32 g_columnTable16[8][16];
-extern u32 g_columnTable8[16][16];
-extern u32 g_columnTable4[16][32];
-
-extern u32 g_pageTable32[32][64];
-extern u32 g_pageTable32Z[32][64];
-extern u32 g_pageTable16[64][64];
-extern u32 g_pageTable16S[64][64];
-extern u32 g_pageTable16Z[64][64];
-extern u32 g_pageTable16SZ[64][64];
-extern u32 g_pageTable8[64][128];
-extern u32 g_pageTable4[128][128];
-
-static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw)
-{
-	u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);
-	u32 word = bp * 64 + basepage * 2048 + g_pageTable32[y&31][x&63];
-	return word;
-}
-
-static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 2048 + g_pageTable32[y&31][x&63];
-	return word;
-}
-
 #define getPixelAddress24 getPixelAddress32
 #define getPixelAddress24_0 getPixelAddress32_0
 #define getPixelAddress8H getPixelAddress32
@ -191,6 +236,15 @@ static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw)
 #define getPixelAddress4HL_0 getPixelAddress32_0
 #define getPixelAddress4HH getPixelAddress32
 #define getPixelAddress4HH_0 getPixelAddress32_0
+#define getPixelAddress24Z getPixelAddress32Z
+#define getPixelAddress24Z_0 getPixelAddress32Z_0
+
+static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw)	// element index of pixel (x, y); callers index a u32 array with it
+{
+	u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);	// page number: pages cover 64 x 32 pixels, (bw >> 6) pages per page row
+	u32 word = bp * 64 + basepage * 2048 + g_pageTable32[y&31][x&63];	// bp is scaled by 64, each page holds 2048 entries, the table gives the offset inside the page
+	return word;
+}

 static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw)
 {
@ -199,13 +253,6 @@ static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 4096 + g_pageTable16[y&63][x&63];
-	return word;
-}
-
 static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -213,13 +260,6 @@ static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 4096 + g_pageTable16S[y&63][x&63];
-	return word;
-}
-
 static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 6) * ((bw + 127) >> 7)) + (x >> 7);
@ -227,13 +267,6 @@ static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 6) * ((bw + 127) >> 7)) + (x >> 7);
-	u32 word = basepage * 8192 + g_pageTable8[y&63][x&127];
-	return word;
-}
-
 static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 7) * ((bw + 127) >> 7)) + (x >> 7);
@ -241,13 +274,6 @@ static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 7) * ((bw + 127) >> 7)) + (x >> 7);
-	u32 word = basepage * 16384 + g_pageTable4[y&127][x&127];
-	return word;
-}
-
 static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);
@ -255,16 +281,6 @@ static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 5) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 2048 + g_pageTable32Z[y&31][x&63];
-	return word;
-}
-
-#define getPixelAddress24Z getPixelAddress32Z
-#define getPixelAddress24Z_0 getPixelAddress32Z_0
-
 static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -272,13 +288,6 @@ static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 4096 + g_pageTable16Z[y&63][x&63];
-	return word;
-}
-
 static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw)
 {
 	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
@ -286,15 +295,7 @@ static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw)
 	return word;
 }

-static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw)
-{
-	u32 basepage = ((y >> 6) * (bw >> 6)) + (x >> 6);
-	u32 word = basepage * 4096 + g_pageTable16SZ[y&63][x&63];
-	return word;
-}
-
-//#define getPixelAddress_0(psm,x,y,bw) getPixelAddress##psm##_0(x,y,bw)
-//#define getPixelAddress(psm,x,y,bp,bw) getPixelAddress##psm##(x,y,bp,bw)
+///////////////

 static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw)
 {
@ -375,7 +376,6 @@ static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u3
 	((u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)] = pixel;
 }

-
 ///////////////

 static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw)
@ -457,161 +457,48 @@ static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u
 // Functions that take 0 bps //
 ///////////////////////////////

-static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u32*)pmem)[getPixelAddress32_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	u8 *buf = (u8*) & ((u32*)pmem)[getPixelAddress32_0(x, y, bw)];
-	u8 *pix = (u8*) & pixel;
-	buf[0] = pix[0];
-	buf[1] = pix[1];
-	buf[2] = pix[2];
-}
-
-static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u16*)pmem)[getPixelAddress16_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u16*)pmem)[getPixelAddress16S_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u8*)pmem)[getPixelAddress8_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u8*)pmem)[4*getPixelAddress32_0(x, y, bw)+3] = pixel;
-}
-
-static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	u32 addr = getPixelAddress4_0(x, y, bw);
-	u8 pix = ((u8*)pmem)[addr/2];
-
-	if (addr & 0x1)((u8*)pmem)[addr/2] = (pix & 0x0f) | (pixel << 4);
-	else ((u8*)pmem)[addr/2] = (pix & 0xf0) | (pixel);
-}
-
-static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	u8 *p = (u8*)pmem + 4 * getPixelAddress4HL_0(x, y, bw) + 3;
-	*p = (*p & 0xf0) | pixel;
-}
-
-static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	u8 *p = (u8*)pmem + 4 * getPixelAddress4HH_0(x, y, bw) + 3;
-	*p = (*p & 0x0f) | (pixel << 4);
-}
-
-static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	u8 *buf = (u8*)pmem + 4 * getPixelAddress32Z_0(x, y, bw);
-	u8 *pix = (u8*) & pixel;
-	buf[0] = pix[0];
-	buf[1] = pix[1];
-	buf[2] = pix[2];
-}
-
-static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u16*)pmem)[getPixelAddress16Z_0(x, y, bw)] = pixel;
-}
-
-static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw)
-{
-	((u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)] = pixel;
-}
+static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { return getPixelAddress32(x, y, 0, bw); }	// every getPixelAddress*_0 helper forwards to the general routine with bp fixed at 0
+static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) { return getPixelAddress16(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) { return getPixelAddress16S(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) { return getPixelAddress8(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) { return getPixelAddress4(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) { return getPixelAddress32Z(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) { return getPixelAddress16Z(x, y, 0, bw); }
+static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) { return getPixelAddress16SZ(x, y, 0, bw); }

 ///////////////

-static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)];
-}
-
-static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)] & 0xffffff;
-}
-
-static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u16*)pmem)[getPixelAddress16_0(x, y, bw)];
-}
-
-static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u16*)pmem)[getPixelAddress16S_0(x, y, bw)];
-}
-
-static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u8*)pmem)[getPixelAddress8_0(x, y, bw)];
-}
-
-static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u8*)pmem)[4*getPixelAddress32_0(x, y, bw) + 3];
-}
-
-static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw)
-{
-	u32 addr = getPixelAddress4_0(x, y, bw);
-	u8 pix = ((const u8*)pmem)[addr/2];
-
-	if (addr & 0x1)
-		return pix >> 4;
-	else
-		return pix & 0xf;
-}
-
-static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw)
-{
-	const u8 *p = (const u8*)pmem + 4 * getPixelAddress4HL_0(x, y, bw) + 3;
-	return *p & 0x0f;
-}
-
-static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw)
-{
-	const u8 *p = (const u8*)pmem + 4 * getPixelAddress4HH_0(x, y, bw) + 3;
-	return *p >> 4;
-}
+static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel32(pmem, x, y, pixel, 0, bw); }	// every writePixel*_0 helper forwards to the matching writePixel* with bp fixed at 0
+static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel24(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16S(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel8(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel8H(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4HL(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel4HH(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel32Z(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel24Z(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16Z(pmem, x, y, pixel, 0, bw); }
+static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) { writePixel16SZ(pmem, x, y, pixel, 0, bw); }

 ///////////////

-static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)];
-}
+static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) { return readPixel32(pmem, x, y, 0, bw); }	// every readPixel*_0 helper forwards to the matching readPixel* with bp fixed at 0
+static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) { return readPixel24(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) { return readPixel16(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) { return readPixel16S(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) { return readPixel8(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) { return readPixel8H(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) { return readPixel4(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) { return readPixel4HL(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) { return readPixel4HH(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel32Z(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel24Z(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) { return readPixel16Z(pmem, x, y, 0, bw); }
+static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) { return readPixel16SZ(pmem, x, y, 0, bw); }

-static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] & 0xffffff;
-}
-
-static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u16*)pmem)[getPixelAddress16Z_0(x, y, bw)];
-}
-
-static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw)
-{
-	return ((const u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)];
-}
+///////////////

 extern int TransferHostLocal32(const void* pbyMem, u32 nQWordSize);
 extern int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize);
--- a/plugins/zzogl-pg/opengl/Mem_Tables.cpp
+++ b/plugins/zzogl-pg/opengl/Mem_Tables.cpp
@ -120,9 +120,9 @@ u32 g_columnTable32[8][8] =
 u32 g_columnTable16[8][16] =
 {
 	{   0,   2,   8,  10,  16,  18,  24,  26,
-		1,   3,   9,  11,  17,  19,  25,  27 },
+	    1,   3,   9,  11,  17,  19,  25,  27 },
 	{   4,   6,  12,  14,  20,  22,  28,  30,
-		5,   7,  13,  15,  21,  23,  29,  31 },
+	    5,   7,  13,  15,  21,  23,  29,  31 },
 	{  32,  34,  40,  42,  48,  50,  56,  58,
 	   33,  35,  41,  43,  49,  51,  57,  59 },
 	{  36,  38,  44,  46,  52,  54,  60,  62,
@ -139,15 +139,15 @@ u32 g_columnTable16[8][16] =

 u32 g_columnTable8[16][16] =
 {
-	{   0,   4,  16,  20,  32,  36,  48,  52,   // column 0
-		2,   6,  18,  22,  34,  38,  50,  54 },
+	{   0,   4,  16,  20,  32,  36,  48,  52,	// column 0
+	    2,   6,  18,  22,  34,  38,  50,  54 },
 	{   8,  12,  24,  28,  40,  44,  56,  60,
-		10,  14,  26,  30,  42,  46,  58,  62 },
+	   10,  14,  26,  30,  42,  46,  58,  62 },
 	{  33,  37,  49,  53,   1,   5,  17,  21,
 	   35,  39,  51,  55,   3,   7,  19,  23 },
 	{  41,  45,  57,  61,   9,  13,  25,  29,
 	   43,  47,  59,  63,  11,  15,  27,  31 },
-	{  96, 100, 112, 116,  64,  68,  80,  84,   // column 1
+	{  96, 100, 112, 116,  64,  68,  80,  84, 	// column 1
 	   98, 102, 114, 118,  66,  70,  82,  86 },
 	{ 104, 108, 120, 124,  72,  76,  88,  92,
 	  106, 110, 122, 126,  74,  78,  90,  94 },
@ -155,7 +155,7 @@ u32 g_columnTable8[16][16] =
 	   67,  71,  83,  87,  99, 103, 115, 119 },
 	{  73,  77,  89,  93, 105, 109, 121, 125,
 	   75,  79,  91,  95, 107, 111, 123, 127 },
-	{ 128, 132, 144, 148, 160, 164, 176, 180,   // column 2
+	{ 128, 132, 144, 148, 160, 164, 176, 180,	// column 2
 	  130, 134, 146, 150, 162, 166, 178, 182 },
 	{ 136, 140, 152, 156, 168, 172, 184, 188,
 	  138, 142, 154, 158, 170, 174, 186, 190 },
@ -163,7 +163,7 @@ u32 g_columnTable8[16][16] =
 	  163, 167, 179, 183, 131, 135, 147, 151 },
 	{ 169, 173, 185, 189, 137, 141, 153, 157,
 	  171, 175, 187, 191, 139, 143, 155, 159 },
-	{ 224, 228, 240, 244, 192, 196, 208, 212,   // column 3
+	{ 224, 228, 240, 244, 192, 196, 208, 212,	// column 3
 	  226, 230, 242, 246, 194, 198, 210, 214 },
 	{ 232, 236, 248, 252, 200, 204, 216, 220,
 	  234, 238, 250, 254, 202, 206, 218, 222 },
@ -175,10 +175,10 @@ u32 g_columnTable8[16][16] =

 u32 g_columnTable4[16][32] =
 {
-	{   0,   8,  32,  40,  64,  72,  96, 104,   // column 0
-		2,  10,  34,  42,  66,  74,  98, 106,
-		4,  12,  36,  44,  68,  76, 100, 108,
-		6,  14,  38,  46,  70,  78, 102, 110 },
+	{   0,   8,  32,  40,  64,  72,  96, 104,	// column 0
+	    2,  10,  34,  42,  66,  74,  98, 106,
+	    4,  12,  36,  44,  68,  76, 100, 108,
+	    6,  14,  38,  46,  70,  78, 102, 110 },
 	{  16,  24,  48,  56,  80,  88, 112, 120,
 	   18,  26,  50,  58,  82,  90, 114, 122,
 	   20,  28,  52,  60,  84,  92, 116, 124,
@ -191,7 +191,7 @@ u32 g_columnTable4[16][32] =
 	   83,  91, 115, 123,  19,  27,  51,  59,
 	   85,  93, 117, 125,  21,  29,  53,  61,
 	   87,  95, 119, 127,  23,  31,  55,  63 },
-	{ 192, 200, 224, 232, 128, 136, 160, 168,   // column 1
+	{ 192, 200, 224, 232, 128, 136, 160, 168,	// column 1
 	  194, 202, 226, 234, 130, 138, 162, 170,
 	  196, 204, 228, 236, 132, 140, 164, 172,
 	  198, 206, 230, 238, 134, 142, 166, 174 },
@ -207,7 +207,7 @@ u32 g_columnTable4[16][32] =
 	  147, 155, 179, 187, 211, 219, 243, 251,
 	  149, 157, 181, 189, 213, 221, 245, 253,
 	  151, 159, 183, 191, 215, 223, 247, 255 },
-	{ 256, 264, 288, 296, 320, 328, 352, 360,   // column 2
+	{ 256, 264, 288, 296, 320, 328, 352, 360,	// column 2
 	  258, 266, 290, 298, 322, 330, 354, 362,
 	  260, 268, 292, 300, 324, 332, 356, 364,
 	  262, 270, 294, 302, 326, 334, 358, 366 },
@ -223,7 +223,7 @@ u32 g_columnTable4[16][32] =
 	  339, 347, 371, 379, 275, 283, 307, 315,
 	  341, 349, 373, 381, 277, 285, 309, 317,
 	  343, 351, 375, 383, 279, 287, 311, 319 },
-	{ 448, 456, 480, 488, 384, 392, 416, 424,   // column 3
+	{ 448, 456, 480, 488, 384, 392, 416, 424,	// column 3
 	  450, 458, 482, 490, 386, 394, 418, 426,
 	  452, 460, 484, 492, 388, 396, 420, 428,
 	  454, 462, 486, 494, 390, 398, 422, 430 },
--- a/plugins/zzogl-pg/opengl/NewRegs.cpp
+++ b/plugins/zzogl-pg/opengl/NewRegs.cpp
@ -638,7 +638,7 @@ void __gifCall GIFRegHandlerSCISSOR(const u32* data)
 		Flush();
 	}

-	m_env.CTXT[i].SCISSOR = (Vector4i)r->SCISSOR;
+	m_env.CTXT[i].SCISSOR = (GSVector4i)r->SCISSOR;

 	m_env.CTXT[i].UpdateScissor();*/
 	ZZLog::Greg_Log("SCISSOR%d", i);
--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@ -56,6 +56,7 @@ extern "C" char* CALLBACK PS2EgetLibName(void);

 #include <vector>
 #include <string>
+#include <cstring>

 extern std::string s_strIniPath; // Air's new (r2361) new constant for ini file path

@ -87,6 +88,9 @@ static __forceinline void pcsx2_aligned_free(void* pmem)
 #define _aligned_malloc pcsx2_aligned_malloc
 #define _aligned_free pcsx2_aligned_free

+#endif
+
+#ifdef __LINUX__
 #include <sys/timeb.h>	// ftime(), struct timeb

 inline unsigned long timeGetTime()
@ -97,6 +101,15 @@ inline unsigned long timeGetTime()
 	return (unsigned long)(t.time*1000 + t.millitm);
 }

+#include <time.h>
+// Returns the CLOCK_PROCESS_CPUTIME_ID reading in nanoseconds.
+// The result is reduced modulo the range of unsigned long, so only the
+// (unsigned) difference between two nearby readings is meaningful; where
+// unsigned long is 32 bits wide that difference wraps after about 4.29 seconds.
+// Returns 0 if the clock cannot be read.
+inline unsigned long timeGetPreciseTime()
+{
+    timespec t;
+
+    // Do not read an uninitialised timespec when the clock is unavailable.
+    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t) != 0) return 0;
+
+    // tv_nsec alone restarts from 0 every second; fold the seconds in so that
+    // intervals crossing a second boundary are measured correctly.
+    // Unsigned arithmetic is used on purpose: it wraps instead of overflowing.
+    return (unsigned long)t.tv_sec * 1000000000UL + (unsigned long)t.tv_nsec;
+}
+
 struct RECT
 {
 	int left, top;
@ -138,6 +151,7 @@ enum GSWindowDim
 	GSDim_1024,
 	GSDim_1280,
 };
+
 typedef union 
 {
 	struct
@ -217,7 +231,7 @@ typedef struct
 	gameHacks settings() // returns a copy holding the combined hack bits
 	{
 		gameHacks tempHack;
-		tempHack._u32 = (hacks._u32 | def_hacks._u32 | GAME_PATH3HACK);
+		tempHack._u32 = (hacks._u32 | def_hacks._u32);	// bitwise OR of hacks and def_hacks
 		 return tempHack; 
 	}
 	bool fullscreen() { return !!(zz_options.fullscreen); }
--- a/plugins/zzogl-pg/opengl/Win32/Conf.cpp
+++ b/plugins/zzogl-pg/opengl/Win32/Conf.cpp
@ -25,6 +25,8 @@ void SaveConfig()
 	WritePrivateProfileString("Settings", "Width", szValue, iniFile.c_str());
 	sprintf(szValue, "%u", conf.height);
 	WritePrivateProfileString("Settings", "Height", szValue, iniFile.c_str());
+	sprintf(szValue, "%u", conf.SkipDraw);
+	WritePrivateProfileString("Settings", "SkipDraw", szValue, iniFile.c_str());
 }

 void LoadConfig()
@ -40,6 +42,7 @@ void LoadConfig()
 	conf.bilinear = 1;
 	conf.width = 640;
 	conf.height = 480;
+	conf.SkipDraw = 0;

 	FILE *fp = fopen(iniFile.c_str(), "rt");

@ -67,6 +70,8 @@ void LoadConfig()
 	conf.width = strtoul(szValue, NULL, 10);
 	GetPrivateProfileString("Settings", "Height", NULL, szValue, 20, iniFile.c_str());
 	conf.height = strtoul(szValue, NULL, 10);
+	GetPrivateProfileString("Settings", "SkipDraw", NULL, szValue, 20, iniFile.c_str());
+	conf.SkipDraw = strtoul(szValue, NULL, 10);

 	if (conf.aa < 0 || conf.aa > 4) conf.aa = 0;

--- a/plugins/zzogl-pg/opengl/ZZKeyboard.cpp
+++ b/plugins/zzogl-pg/opengl/ZZKeyboard.cpp
@ -116,7 +116,7 @@ typedef struct GameHackStruct
 	u32 HackMask;
 } GameHack;

-#define HACK_NUMBER 30
+#define HACK_NUMBER 25

 GameHack HackinshTable[HACK_NUMBER] =
 {
@ -127,30 +127,31 @@ GameHack HackinshTable[HACK_NUMBER] =
 	{"*** 4 TexA hack", GAME_TEXAHACK},
 	{"*** 5 No Target Resolve", GAME_NOTARGETRESOLVE},
 	{"*** 6 Exact color", GAME_EXACTCOLOR},
-	{"*** 7 No color clamp", GAME_NOCOLORCLAMP},
-	{"*** 8 FFX hack", GAME_FFXHACK},
-	{"*** 9 No Alpha Fail", GAME_NOALPHAFAIL},
-	{"***10 No Depth Update", GAME_NODEPTHUPDATE},
-	{"***11 Quick Resolve 1", GAME_QUICKRESOLVE1},
-	{"***12 No quick resolve", GAME_NOQUICKRESOLVE},
-	{"***13 Notaget clut", GAME_NOTARGETCLUT},
-	{"***14 No Stencil", GAME_NOSTENCIL},
-	{"***15 No Depth resolve", GAME_NODEPTHRESOLVE},
-	{"***16 Full 16 bit", GAME_FULL16BITRES},
-	{"***17 Resolve promoted", GAME_RESOLVEPROMOTED},
-	{"***18 Fast Update", GAME_FASTUPDATE},
-	{"***19 No Alpha Test", GAME_NOALPHATEST},
-	{"***20 Disable MRT deprh", GAME_DISABLEMRTDEPTH},
-	{"***21 32 bit targes", GAME_32BITTARGS},
-	{"***22 path 3 hack", GAME_PATH3HACK},
-	{"***23 parallelise calls", GAME_DOPARALLELCTX},
-	{"***24 specular highligths", GAME_XENOSPECHACK},
-	{"***25 partial pointers", GAME_PARTIALPOINTERS},
-	{"***26 partial depth", GAME_PARTIALDEPTH},
-	{"***27 reget hack", GAME_REGETHACK},
+	//{"***xx No color clamp", GAME_NOCOLORCLAMP},
+	//{"***xx FFX hack", GAME_FFXHACK},
+	{"*** 7 No Alpha Fail", GAME_NOALPHAFAIL},
+	{"*** 8 No Depth Update", GAME_NODEPTHUPDATE},
+	{"*** 9 Quick Resolve 1", GAME_QUICKRESOLVE1},
+	{"***10 No quick resolve", GAME_NOQUICKRESOLVE},
+	{"***11 Notaget clut", GAME_NOTARGETCLUT},
+	{"***12 No Stencil", GAME_NOSTENCIL},
+	{"***13 No Depth resolve", GAME_NODEPTHRESOLVE},
+	{"***14 Full 16 bit", GAME_FULL16BITRES},
+	{"***15 Resolve promoted", GAME_RESOLVEPROMOTED},
+	{"***16 Fast Update", GAME_FASTUPDATE},
+	{"***17 No Alpha Test", GAME_NOALPHATEST},
+	{"***18 Disable MRT depth", GAME_DISABLEMRTDEPTH},
+	//{"***xx 32 bit targs", GAME_32BITTARGS},
+	//{"***xx Path 3 hack", GAME_PATH3HACK},
+	//{"***xx Parallel calls", GAME_DOPARALLELCTX},
+	{"***19 Specular highlights", GAME_XENOSPECHACK},
+	//{"***xx Partial pointers", GAME_PARTIALPOINTERS},
+	{"***20 Partial depth", GAME_PARTIALDEPTH},
+	{"***21 Reget hack", GAME_REGETHACK},

-	{"***28 gust hack", GAME_GUSTHACK},
-	{"***29 log-Z", GAME_NOLOGZ}
+	{"***22 Gust hack", GAME_GUSTHACK},
+	{"***23 Log-Z", GAME_NOLOGZ},
+	{"***24 Auto skipdraw", GAME_AUTOSKIPDRAW}
 };

 int CurrentHackSetting = 0;
@ -172,7 +173,7 @@ void ProcessHackSetting(bool reverse)
 	{
 		CurrentHackSetting++;

-		if (CurrentHackSetting == HACK_NUMBER) CurrentHackSetting = 0;
+		if (CurrentHackSetting >= HACK_NUMBER) CurrentHackSetting = 0;
 	}

 	conf.hacks._u32 |= HackinshTable[CurrentHackSetting].HackMask;
--- a/plugins/zzogl-pg/opengl/ZZLog.cpp
+++ b/plugins/zzogl-pg/opengl/ZZLog.cpp
@ -244,6 +244,27 @@ void Warn_Log(const char *fmt, ...)
 #endif
 }

+// printf-style log call that is only active when ZEROGS_DEVBUILD is defined;
+// otherwise the body is empty.
+// The message goes to gsLog when IsLogging() returns true, and is always
+// echoed to stderr with a "ZZogl-PG:  " prefix. A newline is appended to both.
+void Dev_Log(const char *fmt, ...)
+{
+#ifdef ZEROGS_DEVBUILD
+	va_list list;
+
+	if (IsLogging())
+	{
+		va_start(list, fmt);
+		vfprintf(gsLog, fmt, list);
+		va_end(list);
+		fprintf(gsLog, "\n");
+	}
+
+	// A va_list is indeterminate once vfprintf has consumed it, so the
+	// argument walk is restarted for the second output instead of reusing it.
+	va_start(list, fmt);
+	fprintf(stderr, "ZZogl-PG:  ");
+	vfprintf(stderr, fmt, list);
+	fprintf(stderr, "\n");
+	va_end(list);
+#endif
+}
+
 void Debug_Log(const char *fmt, ...)
 {
 #if _DEBUG
--- a/plugins/zzogl-pg/opengl/ZZLog.h
+++ b/plugins/zzogl-pg/opengl/ZZLog.h
@ -185,6 +185,7 @@ extern void Prim_Log(const char *fmt, ...);
 extern void GS_Log(const char *fmt, ...);

 extern void Debug_Log(const char *fmt, ...);
+extern void Dev_Log(const char *fmt, ...);
 extern void Warn_Log(const char *fmt, ...);
 extern void Error_Log(const char *fmt, ...);
 };
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
@ -54,6 +54,7 @@ void ZeroGS::AdjustTransToAspect(float4& v)
 {
 	double temp;
 	float f;
+	const float mult = 1 / 32767.0f;

 	if (conf.width * nBackbufferHeight > conf.height * nBackbufferWidth) // limited by width
 	{
@ -74,7 +75,7 @@ void ZeroGS::AdjustTransToAspect(float4& v)
 		v.z *= f;
 	}

-	v *= 1 / 32767.0f;
+	v  *= mult;
 }

 inline bool FrameSkippingHelper()
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.h
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.h
@ -17,13 +17,13 @@
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

-#ifndef ZZOGLCRTC_H_INCLUDED
-#define ZZOGLCRTC_H_INCLUDED
+#ifndef ZZOGLCRTC_H_INCLUDED
+#define ZZOGLCRTC_H_INCLUDED

 #include <stdlib.h>

 #include "zerogs.h"
-#include "targets.h"
+#include "targets.h"

 #define INTERLACE_COUNT (bInterlace && interlace == (conf.interlace))

@ -89,12 +89,12 @@ inline u32 CreateInterlaceTex(int width)

 	glGenTextures(1, &s_ptexInterlace);
 	glBindTexture(GL_TEXTURE_RECTANGLE_NV, s_ptexInterlace);
-	TextureRect(4, width, 1, GL_RGBA, GL_UNSIGNED_BYTE, &data[0]);
+	TextureRect(GL_RGBA, width, 1, GL_RGBA, GL_UNSIGNED_BYTE, &data[0]);
 	setRectFilters(GL_NEAREST);
 	GL_REPORT_ERRORD();

 	return s_ptexInterlace;
 }
-}
-
-#endif // ZZOGLCRTC_H_INCLUDED
+}
+
+#endif // ZZOGLCRTC_H_INCLUDED
--- a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
@ -75,8 +75,6 @@ map<string, GLbyte> mapGLExtensions;

 namespace ZeroGS
 {
-RenderFormatType g_RenderFormatType = RFT_float16;
-
 extern void KickPoint();
 extern void KickLine();
 extern void KickTriangle();
@ -84,8 +82,8 @@ extern void KickTriangleFan();
 extern void KickSprite();
 extern void KickDummy();
 extern bool LoadEffects();
-extern bool LoadExtraEffects();
-extern FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
+extern bool ZZshLoadExtraEffects();
+extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);

 GLuint vboRect = 0;
 vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
@ -270,19 +268,6 @@ inline void ZeroGS::CreateOtherCheck()
 	if (Max_Texture_Size_NV < 1024)
 		ZZLog::Error_Log("Could not properly make bitmasks, so some textures will be missed.");

-	/* Zeydlitz: we don't support 128-bit targets yet. they are slow and weirdo
-	if( conf.settings() & GAME_32BITTARGS ) {
-		g_RenderFormatType = RFT_byte8;
-		ZZLog::Error_Log("Setting 32 bit render target.");
-	}
-	else {
-		if( !IsGLExt("GL_NV_float_buffer") && !IsGLExt("GL_ARB_color_buffer_float") && !IsGLExt("ATI_pixel_format_float") ) {
-			ZZLog::Error_Log("******\nZZogl: GS WARNING: Floating point render targets not supported, switching to 32bit\nZZogl: *********");
-			g_RenderFormatType = RFT_byte8;
-		}
-	}*/
-	g_RenderFormatType = RFT_byte8;
-
 #ifdef _WIN32
 	if (IsGLExt("WGL_EXT_swap_control") || IsGLExt("EXT_swap_control"))
 		wglSwapIntervalEXT(0);
@ -469,8 +454,6 @@ bool ZeroGS::Create(int _width, int _height)
 	Destroy(1);
 	GSStateReset();

-	g_RenderFormatType = RFT_float16;
-
 	if (!Create_Window(_width, _height)) return false;
 	if (!CreateFillExtensionsMap()) return false;
 	if (!CreateImportantCheck()) return false;
@ -574,7 +557,7 @@ bool ZeroGS::Create(int _width, int _height)
 	PBITMAPINFO pinfo = (PBITMAPINFO)LockResource(hBitmapGlob);

 	GLenum tempFmt = (pinfo->bmiHeader.biBitCount == 32) ? GL_RGBA : GL_RGB;
-	TextureRect(4, pinfo->bmiHeader.biWidth, pinfo->bmiHeader.biHeight, tempFmt, GL_UNSIGNED_BYTE, (u8*)pinfo + pinfo->bmiHeader.biSize);
+	TextureRect(GL_RGBA, pinfo->bmiHeader.biWidth, pinfo->bmiHeader.biHeight, tempFmt, GL_UNSIGNED_BYTE, (u8*)pinfo + pinfo->bmiHeader.biSize);

 	nLogoWidth = pinfo->bmiHeader.biWidth;
 	nLogoHeight = pinfo->bmiHeader.biHeight;
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
@ -207,8 +207,6 @@ int icurctx = -1;
 extern CRangeManager s_RangeMngr; // manages overwritten memory				// zz
 void FlushTransferRanges(const tex0Info* ptex);						//zz

-RenderFormatType GetRenderFormat() { return g_RenderFormatType; }					//zz
-
 // use to update the state
 void SetTexVariables(int context, FRAGMENTSHADER* pfragment);			// zz
 void SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint);		// zz
@ -859,7 +857,7 @@ inline float4 FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRen
 	// zoe2
 	if (PSMT_ISZTEX(ptextarg->psm)) vpageoffset.w = -1.0f;

-	ZZshSetParameter4fv(pfragment->prog, pfragment->fPageOffset, vpageoffset, "g_fPageOffset");
+	ZZshSetParameter4fv(pfragment->fPageOffset, vpageoffset, "g_fPageOffset");

 	return vpageoffset;
 }
@ -877,7 +875,7 @@ inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 		v.y = 16.0f / (float)curvb.tex0.th;
 		v.z = 0.5f * v.x;
 		v.w = 0.5f * v.y;
-		ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
+		ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
 	}
 	else if (shadertype == 4)
 	{
@ -886,7 +884,7 @@ inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 		v.y = 16.0f / (float)ptextarg->fbh;
 		v.z = -1;
 		v.w = 8.0f / (float)ptextarg->fbh;
-		ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
+		ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
 	}

 	return v;
@ -920,7 +918,7 @@ inline float4 FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& cu
 	if (shadertype == 4)
 		vTexDims.z += 8.0f;

-	ZZshSetParameter4fv(pfragment->prog, pfragment->fTexDims, vTexDims, "g_fTexDims");
+	ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");

 	return vTexDims;
 }
@ -970,7 +968,7 @@ inline FRAGMENTSHADER* FlushUseExistRenderTarget(VB& curvb, CRenderTarget* ptext
 	float4 vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);

 	if (pfragment->sCLUT != NULL && ptexclut != 0)
-		ZZshGLSetTextureParameter(pfragment->prog, pfragment->sCLUT, ptexclut, "CLUT");
+		ZZshGLSetTextureParameter(pfragment->sCLUT, ptexclut, "CLUT");

 	FlushApplyResizeFilter(curvb, dwFilterOpts, ptextarg, context);

@ -1016,13 +1014,13 @@ inline void FlushSetTexture(VB& curvb, FRAGMENTSHADER* pfragment, CRenderTarget*

 	// have to enable the texture parameters(curtest.atst)
 	if( curvb.ptexClamp[0] != 0 ) 
-		ZZshGLSetTextureParameter(pfragment->prog, pfragment->sBitwiseANDX, curvb.ptexClamp[0], "Clamp 0");
+		ZZshGLSetTextureParameter(pfragment->sBitwiseANDX, curvb.ptexClamp[0], "Clamp 0");
 	
 	if( curvb.ptexClamp[1] != 0 ) 
-		ZZshGLSetTextureParameter(pfragment->prog, pfragment->sBitwiseANDY, curvb.ptexClamp[1], "Clamp 1");
+		ZZshGLSetTextureParameter(pfragment->sBitwiseANDY, curvb.ptexClamp[1], "Clamp 1");
 	
 	if( pfragment->sMemory != NULL && s_ptexCurSet[context] != 0) 
-		ZZshGLSetTextureParameter(pfragment->prog, pfragment->sMemory, s_ptexCurSet[context], "Clamp memory");
+		ZZshGLSetTextureParameter(pfragment->sMemory, s_ptexCurSet[context], "Clamp memory");

 }

@ -1170,13 +1168,13 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
 			v.w *= 255;
 		}

-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
 	}
 	else
 	{
 		// not using blending so set to defaults
 		float4 v = exactcolor ? float4(1, 510 * 255.0f / 256.0f, 0, 0) : float4(1, 2 * 255.0f / 256.0f, 0, 0);
-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");

 	}

@ -1267,7 +1265,7 @@ inline void AlphaPabe(VB& curvb, FRAGMENTSHADER* pfragment, int exactcolor)

 		if (exactcolor) v.y *= 255;

-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");

 		Draw(curvb);

@ -1336,7 +1334,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE

 		if (exactcolor) { v.y *= 255; v.w *= 255; }

-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");

 		glEnable(GL_BLEND);
 		GL_STENCILFUNC(GL_EQUAL, s_stencilref | STENCIL_FBA, s_stencilmask | STENCIL_FBA);
@ -1360,7 +1358,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE

 		if (exactcolor) v.y *= 255;

-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");

 		Draw(curvb);

@ -1412,7 +1410,7 @@ inline void AlphaSpecialTesting(VB& curvb, FRAGMENTSHADER* pfragment, u32 dwUsin
 		glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);

 		float4 v = float4(0, exactcolor ? 510.0f : 2.0f, 0, 0);
-		ZZshSetParameter4fv(pfragment->prog, pfragment->sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
 		Draw(curvb);

 		// don't need to restore
@ -1468,66 +1466,6 @@ inline void AlphaSaveTarget(VB& curvb)
 #endif
 }

-inline void AlphaColorClamping(VB& curvb, const pixTest curtest)
-{
-	// clamp the final colors, when enabled ffx2 credits mess up
-	//if (gs.colclamp) ZZLog::Error_Log("ColClamp!");
-	if ((curvb.curprim.abe && bAlphaClamping) && (GetRenderFormat() != RFT_byte8) && !(conf.settings().no_color_clamp))   // if !colclamp, skip
-	{
-		//ZZLog::Error_Log("Clamped.");
-		ResetAlphaVariables();
-
-		// if processing the clamping case, make sure can write to the front buffer
-		glDisable(GL_STENCIL_TEST);
-		glEnable(GL_BLEND);
-		glDisable(GL_ALPHA_TEST);
-		glDisable(GL_DEPTH_TEST);
-		glDepthMask(0);
-		glColorMask(1, 1, 1, 0);
-
-		if (s_bWriteDepth) ResetRenderTarget(1);
-
-		SetShaderCaller("AlphaColorClamping");
-
-		ZZshSetPixelShader(ppsOne.prog);
-		GL_BLEND_RGB(GL_ONE, GL_ONE);
-
-		float f;
-
-		if (bAlphaClamping & 1)    // min
-		{
-			f = 0;
-			ZZshSetParameter4fv(ppsOne.prog, ppsOne.sOneColor, &f, "g_fOneColor");
-			GL_BLENDEQ_RGB(GL_MAX_EXT);
-			Draw(curvb);
-		}
-
-		// bios shows white screen
-		if (bAlphaClamping & 2)    // max
-		{
-			f = 1;
-			ZZshSetParameter4fv(ppsOne.prog, ppsOne.sOneColor, &f, "g_fOneColor");
-			GL_BLENDEQ_RGB(GL_MIN_EXT);
-			Draw(curvb);
-		}
-
-		if (!curvb.zbuf.zmsk)
-		{
-			glDepthMask(1);
-
-			if (s_bWriteDepth)
-			{
-				assert(curvb.pdepth != NULL);
-				curvb.pdepth->SetRenderTarget(1);
-			}
-		}
-
-		if (curvb.test.ate && USEALPHATESTING) glEnable(GL_ALPHA_TEST);
-
-		GL_ZTEST(curtest.zte);
-	}
-}
-
 inline void FlushUndoFiter(u32 dwFilterOpts)
 {
 	if (dwFilterOpts)
@ -1585,7 +1523,6 @@ void ZeroGS::Flush(int context)
 	
 	GL_REPORT_ERRORD();

-	AlphaColorClamping(curvb, curtest);
 	FlushUndoFiter(dwFilterOpts);

 	ppf += curvb.nCount + 0x100000;
@ -1988,7 +1925,7 @@ void ZeroGS::SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint)
 }

 // clamp relies on texture width
-inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment ) 
+void SetTexClamping(int context, FRAGMENTSHADER* pfragment)
 {
 	FUNCLOG
 	SetShaderCaller("SetTexClamping");
@ -1998,68 +1935,84 @@ inline void SetTexClamping(int context, FRAGMENTSHADER* pfragment )
 	u32* ptex = ZeroGS::vb[context].ptexClamp;
 	ptex[0] = ptex[1] = 0;

-	float fw = ZeroGS::vb[context].tex0.tw;
-	float fh = ZeroGS::vb[context].tex0.th;
+	float fw = ZeroGS::vb[context].tex0.tw ;
+	float fh = ZeroGS::vb[context].tex0.th ;

-	switch(pclamp->wms) 
+	switch (pclamp->wms)
 	{
 		case 0:
-			v2.x = -1e10;   v2.z = 1e10;
+			v2.x = -1e10;
+			v2.z = 1e10;
 			break;
+
 		case 1: // pclamp
 			// suikoden5 movie text
-			v2.x = 0; v2.z = 1-0.5f/fw;
+			v2.x = 0;
+			v2.z = 1 - 0.5f / fw;
 			break;
+
 		case 2: // reg pclamp
-			v2.x = (pclamp->minu+0.5f)/fw;  v2.z = (pclamp->maxu-0.5f)/fw;
+			v2.x = (pclamp->minu + 0.5f) / fw;
+			v2.z = (pclamp->maxu - 0.5f) / fw;
 			break;

 		case 3: // region rep x
 			v.x = 0.9999f;
-			v.z = (float)fw ;  
+			v.z = (float)fw;
 			v2.x = (float)GPU_TEXMASKWIDTH / fw;
 			v2.z = pclamp->maxu / fw;
 			int correctMinu = pclamp->minu & (~pclamp->maxu);		// (A && B) || C == (A && (B && !C)) + C

-			if (correctMinu != g_PrevBitwiseTexX) 
+			if (correctMinu != g_PrevBitwiseTexX)
 			{
 				g_PrevBitwiseTexX = correctMinu;
 				ptex[0] = ZeroGS::s_BitwiseTextures.GetTex(correctMinu, 0);
 			}
+
 			break;
 	}

-	switch(pclamp->wmt) 
+	switch (pclamp->wmt)
 	{
+
 		case 0:
-			v2.y = -1e10;   v2.w = 1e10;
+			v2.y = -1e10;
+			v2.w = 1e10;
 			break;
+
 		case 1: // pclamp
 			// suikoden5 movie text
-			v2.y = 0;   v2.w = 1-0.5f/fh;
+			v2.y = 0;
+			v2.w = 1 - 0.5f / fh;
 			break;
+
 		case 2: // reg pclamp
-			v2.y = (pclamp->minv+0.5f)/fh; v2.w = (pclamp->maxv-0.5f)/fh;
+			v2.y = (pclamp->minv + 0.5f) / fh;
+			v2.w = (pclamp->maxv - 0.5f) / fh;
 			break;

 		case 3: // region rep y
 			v.y = 0.9999f;
-			v.w = (float)fh ;
+			v.w = (float)fh;
 			v2.y = (float)GPU_TEXMASKWIDTH / fh;
 			v2.w = pclamp->maxv / fh;
 			int correctMinv = pclamp->minv & (~pclamp->maxv);		// (A && B) || C == (A && (B && !C)) + C

-			if (correctMinv != g_PrevBitwiseTexY) {
+			if (correctMinv != g_PrevBitwiseTexY)
+			{
 				g_PrevBitwiseTexY = correctMinv;
 				ptex[1] = ZeroGS::s_BitwiseTextures.GetTex(correctMinv, ptex[0]);
 			}
 			break;
-		}
+	}
+
+	if (pfragment->fTexWrapMode != 0)
+		ZZshSetParameter4fv(pfragment->fTexWrapMode, v, "g_fTexWrapMode");
+
+	if (pfragment->fClampExts != 0)
+		ZZshSetParameter4fv(pfragment->fClampExts, v2, "g_fClampExts");
+

-	if (ZZshActiveParameter(pfragment->fTexWrapMode))
-		ZZshSetParameter4fv(pfragment->prog, pfragment->fTexWrapMode, v, "g_fTexWrapMode");
-	if (ZZshActiveParameter( pfragment->fClampExts))
-		ZZshSetParameter4fv(pfragment->prog, pfragment->fClampExts, v2, "g_fClampExts");
 }

 // Fixme should be in float4 lib
@ -2230,11 +2183,11 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)

 		// Test;*/

-		ZZshSetParameter4fv(pfragment->prog, pfragment->fTexAlpha, valpha, "g_fTexAlpha");
-		ZZshSetParameter4fv(pfragment->prog, pfragment->fTexAlpha2, valpha2, "g_fTexAlpha2");
+		ZZshSetParameter4fv(pfragment->fTexAlpha, valpha, "g_fTexAlpha");
+		ZZshSetParameter4fv(pfragment->fTexAlpha2, valpha2, "g_fTexAlpha2");

 		if (IsAlphaTestExpansion(tex0))
-			ZZshSetParameter4fv(pfragment->prog, pfragment->fTestBlack, vblack, "g_fTestBlack");
+			ZZshSetParameter4fv(pfragment->fTestBlack, vblack, "g_fTestBlack");

 		SetTexClamping(context, pfragment);

@ -2280,7 +2233,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
 		v.w = 1.0f / (float)fh;

 		if (pfragment->fRealTexDims)
-			ZZshSetParameter4fv(pfragment->prog, pfragment->fRealTexDims, v, "g_fRealTexDims");
+			ZZshSetParameter4fv(pfragment->fRealTexDims, v, "g_fRealTexDims");
 		else
 			ZZshSetParameter4fv(cgGetNamedParameter(pfragment->prog,"g_fRealTexDims"),v, "g_fRealTexDims");	
 	}
@ -2336,15 +2289,15 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
 		v.z *= b.bpp * (1 / 32.0f);
 	}

-	ZZshSetParameter4fv(pfragment->prog, pfragment->fTexDims, vTexDims, "g_fTexDims");
+	ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");

-//	ZZshSetParameter4fv(pfragment->prog, pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from float4 to float[4] is ok.
-	ZZshSetParameter4fv(pfragment->prog, pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
-	ZZshSetParameter4fv(pfragment->prog, pfragment->fTexOffset, v, "g_fTexOffset");
+//	ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from float4 to float[4] is ok.
+	ZZshSetParameter4fv(pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
+	ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");

 	// get hardware texture dims
-	//int texheight = (pmemtarg->realheight+pmemtarg->widthmult-1)/pmemtarg->widthmult;
-	int texwidth = GPU_TEXWIDTH * pmemtarg->widthmult * pmemtarg->channels;
+	//int texheight = pmemtarg->texH;
+	int texwidth = pmemtarg->texW;

 	v.y = 1.0f;
 	v.x = (fpageint - (float)pmemtarg->realy / (float)pmemtarg->widthmult + 0.5f);//*v.y;
--- a/plugins/zzogl-pg/opengl/ZZoglMath.h
+++ b/plugins/zzogl-pg/opengl/ZZoglMath.h
@ -1,83 +1,493 @@
- /* ZeroGS KOSMOS
-  *
-  * Zerofrog's ZeroGS KOSMOS (c)2005-2008
-  *
-  * Zerofrog forgot to write any copyright notice after releasing the plugin into GPLv2
-  * If someone can contact him successfully to clarify this matter that would be great.
-  */
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */

-// Now that it's down to 82 lines, and most of it's fairly obvious, perhaps it'd be easier to 
-// just reimplement it... -arcum42
-
-#ifndef ZZOGLMATH_H_INCLUDED
-#define ZZOGLMATH_H_INCLUDED
+#ifndef ZZOGLMATH_H_INCLUDED
+#define ZZOGLMATH_H_INCLUDED

+//Remind me to check and see if this is necessary, and what uses it. --arcum42
 #ifndef _WIN32
 #include <alloca.h>
 #endif

-#include <string.h>
-#include <math.h>
 #include <assert.h>

-typedef float dReal;
+//#define ZZ_MMATH

-// class used for 3 and 4 dim vectors and quaternions
-// It is better to use this for a 3 dim vector because it is 16byte aligned and SIMD instructions can be used
+#ifndef ZZ_MMATH

-class float4
+template <class T>
+class Vector4
 {
 	public:
-		dReal x, y, z, w;
-
-		float4() : x(0), y(0), z(0), w(0) {}
-		float4(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
-		float4(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
-		float4(const float4 &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
-		float4(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
-		dReal  operator[](int i) const	   { return (&x)[i]; }
-		dReal& operator[](int i)			 { return (&x)[i]; }
+		T x, y, z, w;
 		
-		// casting operators
-		operator dReal*() { return &x; }
-		operator const dReal*() const { return (const dReal*)&x; }
+		Vector4(T x1 = 0, T y1 = 0, T z1 = 0, T w1 = 0) 
+		{ 
+			x = x1; 
+			y = y1; 
+			z = z1; 
+			w = w1; 
+		}
 		
-		// SCALAR FUNCTIONS
-		inline dReal dot(const float4 &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
-		inline void Set3(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; }
-		inline void Set4(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; w = pvals[3]; }
-		inline void SetColor(u32 color)
+		// Copy constructor. Takes a const reference so that it can also bind
+		// to temporaries and to const vectors.
+		Vector4(const Vector4<T> &f)
+		{
+			x = f.x;
+			y = f.y;
+			z = f.z;
+			w = f.w;
+		}
+		
+		// Builds a vector from the first four elements of the array f.
+		// NOTE(review): the float4 constructor this replaces read only f[0..2]
+		// and set w to 0; confirm every caller passes at least four elements.
+		Vector4(T* f) 
+		{
+			x = f[0]; 
+			y = f[1]; 
+			z = f[2]; 
+			w = f[3]; // For some reason, the old code set this to 0. 
+		}
+		
+		// Component access by index: 0 = x, 1 = y, 2 = z, 3 = w.
+		T& operator[](int i)
+		{
+			switch(i)
+			{
+				case 0: return x;
+				case 1: return y;
+				case 2: return z;
+				case 3: return w;
+				// An out-of-range index is a caller bug. Still return a valid
+				// reference so that builds with NDEBUG (assert compiled out)
+				// do not fall off the end of a non-void function.
+				default: assert(0); return w;
+			}
+		}
+		
+		operator T*()
+		{
+			return (T*) this;
+		}
+		
+		operator const T*() const
+		{
+			return (const T*) this;
+		}
+		
+		Vector4<T>& operator =(const Vector4<T>& v)
+		{
+			x = v.x;
+			y = v.y;
+			z = v.z;
+			w = v.w;
+			return *this;
+		}
+		
+		// Component-wise equality. Declared const so that const vectors can
+		// be compared by value as well.
+		bool operator ==(const Vector4<T>& v) const
+		{
+			return !!(	x == v.x &&
+						y == v.y &&
+						z == v.z &&
+						w == v.w	);
+		}
+		
+		Vector4<T> operator +(const Vector4<T>& v) const
+		{
+			return Vector4<T>(x + v.x, y + v.y, z + v.z, w + v.w);
+		}
+		
+		Vector4<T> operator -(const Vector4<T>& v) const
+		{
+			return Vector4<T>(x - v.x, y - v.y, z - v.z, w - v.w);
+		}
+		
+		Vector4<T> operator *(const Vector4<T>& v) const
+		{
+			return Vector4<T>(x * v.x, y * v.y, z * v.z, w * v.w);
+		}
+		
+		Vector4<T> operator /(const Vector4<T>& v) const
+		{
+			return Vector4<T>(x / v.x, y / v.y, z / v.z, w / v.w);
+		}
+		Vector4<T> operator +(T val) const
+		{
+			return Vector4<T>(x + val, y + val, z + val, w + val);
+		}
+		
+		Vector4<T> operator -(T val) const
+		{
+			return Vector4<T>(x - val, y - val, z - val, w - val);
+		}
+		
+		Vector4<T> operator *(T val) const
+		{
+			return Vector4<T>(x * val, y * val, z * val, w * val);
+		}
+		
+		Vector4<T> operator /(T val) const
+		{
+			return Vector4<T>(x / val, y / val, z / val, w / val);
+		}
+		
+		Vector4<T>& operator +=(const Vector4<T>& v)
+		{
+			*this = *this + v;
+			return *this;
+		}
+		
+		Vector4<T>& operator -=(const Vector4<T>& v)
+		{
+			*this = *this - v;
+			return *this;
+		}
+		
+		Vector4<T>& operator *=(const Vector4<T>& v)
+		{
+			*this = *this * v;
+			return *this;
+		}
+		
+		// Component-wise division of this vector by v, in place.
+		Vector4<T>& operator /=(const Vector4<T>& v)
+		{
+			*this = *this / v;
+			return *this;
+		}
+		
+		Vector4<T>& operator +=(T val)
+		{
+			*this = *this + (T)val;
+			return *this;
+		}
+		
+		Vector4<T>& operator -=(T val)
+		{
+			*this = *this - (T)val;
+			return *this;
+		}
+		
+		Vector4<T>& operator *=(T val)
+		{
+			*this = *this * (T)val;
+			return *this;
+		}
+		
+		Vector4<T>& operator /=(T val)
+		{
+			*this = *this / (T)val;
+			return *this;
+		}
+		
+		// Probably doesn't belong here, but I'll leave it in for the moment.
+		void SetColor(u32 color)
 		{
 			x = (color & 0xff) / 255.0f;
 			y = ((color >> 8) & 0xff) / 255.0f;
 			z = ((color >> 16) & 0xff) / 255.0f;
 		}
-
-		// 3 dim cross product, w is not touched
-		/// this = this x v
-		/// this = u x v
-		inline float4 operator-() const { float4 v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
-		inline float4 operator+(const float4 &r) const { float4 v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
-		inline float4 operator-(const float4 &r) const { float4 v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
-		inline float4 operator*(const float4 &r) const { float4 v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
-		inline float4 operator*(dReal k) const { float4 v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
-		inline float4& operator += (const float4& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
-		inline float4& operator -= (const float4& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
-		inline float4& operator *= (const float4& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
-		inline float4& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
-		inline float4& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
-		friend float4 operator*(float f, const float4& v);
-		//friend ostream& operator<<(ostream& O, const float4& v);
-		//friend istream& operator>>(istream& I, float4& v);
 };

-inline float4 operator*(float f, const float4& left)
+typedef Vector4<float> float4;
+
+#else
+
+// Reimplement, swiping a bunch of code from GSdx and adapting it. (specifically GSVector.h)
+// This doesn't include more than half of the functions in there, as well as some of the structs...
+#include <xmmintrin.h>
+
+#include "Pcsx2Types.h"
+
+class float4
 {
-	float4 v;
-	v.x = f * left.x;
-	v.y = f * left.y;
-	v.z = f * left.z;
-	return v;
-}
-
-#endif // ZZOGLMATH_H_INCLUDED
+	public:
+		union
+		{
+			struct {float x, y, z, w;};
+			struct {float r, g, b, a;};
+			struct {float left, top, right, bottom;};
+			float v[4];
+			float f32[4];
+			s8 _s8[16];
+			s16 _s16[8];
+			s32 _s32[4];
+			s64 _s64[2];
+			u8 _u8[16];
+			u16 _u16[8];
+			u32 _u32[4];
+			u64 _u64[2];
+			__m128 m;
+		};
+		
+		float4() 
+		{ 
+			m = _mm_setzero_ps();
+		}
+		
+		float4(float x, float y, float z, float w = 0) 
+		{ 
+			m = _mm_set_ps(w, z, y, x);
+		}
+		
+		// Copy constructor. Takes a const reference so that it can also bind
+		// to temporaries (e.g. operator results) and to const vectors.
+		float4(const float4 &f)
+		{
+			m = f.m;
+		}
+
+		float4(float x, float y)
+		{
+			m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y));
+		}
+
+		// Builds (x, y, 0, 0) from two integers converted to float.
+		// NOTE(review): _mm_cvtepi32_ps, _mm_unpacklo_epi32 and _mm_cvtsi32_si128
+		// are SSE2 intrinsics (<emmintrin.h>), but only <xmmintrin.h> is included
+		// here; confirm another header provides them before enabling ZZ_MMATH.
+		float4(int x, int y)
+		{
+			m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
+		}
+
+		explicit float4(float f)
+		{
+			m = _mm_set1_ps(f);
+		}
+
+		explicit float4(__m128 m)
+		{
+			this->m = m;
+		}
+
+		// Builds a vector from the first four elements of the array f.
+		// NOTE(review): the float4 constructor this replaces read only f[0..2]
+		// and set w to 0; confirm every caller passes at least four elements.
+		float4(float* f) 
+		{
+			x = f[0]; 
+			y = f[1]; 
+			z = f[2]; 
+			w = f[3]; // For some reason, the old code set this to 0. 
+		}
+		
+		// Component access by index: 0 = x, 1 = y, 2 = z, 3 = w.
+		float& operator[](int i)
+		{
+			switch(i)
+			{
+				case 0: return x;
+				case 1: return y;
+				case 2: return z;
+				case 3: return w;
+				// An out-of-range index is a caller bug. Still return a valid
+				// reference so that builds with NDEBUG (assert compiled out)
+				// do not fall off the end of a non-void function.
+				default: assert(0); return w;
+			}
+		}
+		
+		operator float*()
+		{
+			return (float*) this;
+		}
+		
+		operator const float*() const
+		{
+			return (const float*) this;
+		}
+
+		void operator = (float f)
+		{
+			m = _mm_set1_ps(f);
+		}
+
+		void operator = (__m128 m)
+		{
+			this->m = m;
+		}
+
+		
+		void operator += (const float4& v)
+		{
+			m = _mm_add_ps(m, v.m);
+		}
+
+		void operator -= (const float4& v)
+		{
+			m = _mm_sub_ps(m, v.m);
+		}
+
+		void operator *= (const float4& v)
+		{
+			m = _mm_mul_ps(m, v.m);
+		}
+
+		void operator /= (const float4& v)
+		{
+			m = _mm_div_ps(m, v.m);
+		}
+
+		void operator += (float f)
+		{
+			*this += float4(f);
+		}
+
+		void operator -= (float f)
+		{
+			*this -= float4(f);
+		}
+
+		void operator *= (float f)
+		{
+			*this *= float4(f);
+		}
+
+		void operator /= (float f)
+		{
+			*this /= float4(f);
+		}
+
+		void operator &= (const float4& v)
+		{
+			m = _mm_and_ps(m, v.m);
+		}
+
+		void operator |= (const float4& v)
+		{
+			m = _mm_or_ps(m, v.m);
+		}
+
+		void operator ^= (const float4& v)
+		{
+			m = _mm_xor_ps(m, v.m);
+		}
+
+		friend float4 operator + (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_add_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator - (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_sub_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator * (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_mul_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator / (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_div_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator + (const float4& v, float f)
+		{
+			return v + float4(f);
+		}
+
+		friend float4 operator - (const float4& v, float f)
+		{
+			return v - float4(f);
+		}
+
+		friend float4 operator * (const float4& v, float f)
+		{
+			return v * float4(f);
+		}
+
+		friend float4 operator / (const float4& v, float f)
+		{
+			return v / float4(f);
+		}
+
+		friend float4 operator & (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_and_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator | (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_or_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator ^ (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_xor_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator == (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmpeq_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator != (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmpneq_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator > (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmpgt_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator < (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmplt_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator >= (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmpge_ps(v1.m, v2.m));
+		}
+
+		friend float4 operator <= (const float4& v1, const float4& v2)
+		{
+			return float4(_mm_cmple_ps(v1.m, v2.m));
+		}
+		
+		// This looked interesting, so I thought I'd include it...
+		
+		template<int i> float4 shuffle() const
+		{
+			return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i)));
+		}
+
+		#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+			float4 xs##ys##zs##ws() const {return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+			float4 xs##ys##zs##ws(const float4& v) const {return float4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+		#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+			VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+			VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+			VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+			VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+		#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+			VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+			VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+			VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+			VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+		#define VECTOR4_SHUFFLE_1(xs, xn) \
+			float4 xs##4() const {return float4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+			float4 xs##4(const float4& v) const {return float4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(xn, xn, xn, xn)));} \
+			VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+			VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+			VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+			VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+		VECTOR4_SHUFFLE_1(x, 0)
+		VECTOR4_SHUFFLE_1(y, 1)
+		VECTOR4_SHUFFLE_1(z, 2)
+		VECTOR4_SHUFFLE_1(w, 3)
+	
+		// Probably doesn't belong here, but I'll leave it in for the moment.
+		void SetColor(u32 color)
+		{
+			x = (color & 0xff) / 255.0f;
+			y = ((color >> 8) & 0xff) / 255.0f;
+			z = ((color >> 16) & 0xff) / 255.0f;
+		}
+};
+
+#endif
+
+#endif
--- a/plugins/zzogl-pg/opengl/ZZoglShoots.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShoots.cpp
@ -392,16 +392,16 @@ ZeroGS::SaveTex(tex0Info* ptex, int usevid)
 		assert(pmemtarg != NULL);

 		glBindTexture(GL_TEXTURE_RECTANGLE_NV, pmemtarg->ptex->tex);
-		srcdata.resize(pmemtarg->realheight * GPU_TEXWIDTH * pmemtarg->widthmult * 4 * 8); // max of 8 cannels
+		srcdata.resize(4 * pmemtarg->texW * pmemtarg->texH);

 		glGetTexImage(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, pmemtarg->fmt, &srcdata[0]);

-		u32 offset = pmemtarg->realy * 4 * GPU_TEXWIDTH;
+		u32 offset = MemorySize(pmemtarg->realy);

 		if (ptex->psm == PSMT8)
-			offset *= PSMT_IS32BIT(ptex->cpsm) ? 4 : 2;
+			offset *= CLUT_PIXEL_SIZE(ptex->cpsm);
 		else if (ptex->psm == PSMT4)
-			offset *= PSMT_IS32BIT(ptex->cpsm) ? 8 : 4;
+			offset *= CLUT_PIXEL_SIZE(ptex->cpsm) * 2;

 		psrc = &srcdata[0] - offset;
 	}
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
--- a/plugins/zzogl-pg/opengl/targets.h
+++ b/plugins/zzogl-pg/opengl/targets.h
@ -50,7 +50,9 @@ class CRenderTargetMngr

 		void Destroy();
 		static MAPTARGETS::iterator GetOldestTarg(MAPTARGETS& m);
-
+		
+		bool isFound(const frameInfo& frame, MAPTARGETS::iterator& it, u32 opts, u32 key, int maxposheight);
+		
 		CRenderTarget* GetTarg(const frameInfo& frame, u32 Options, int maxposheight);
 		inline CRenderTarget* GetTarg(int fbp, int fbw, VB& curvb)
 		{
@ -119,13 +121,13 @@ class CRenderTargetMngr

 class CMemoryTargetMngr
 {
-
 	public:
 		CMemoryTargetMngr() : curstamp(0) {}

 		CMemoryTarget* GetMemoryTarget(const tex0Info& tex0, int forcevalidate); // pcbp is pointer to start of clut
-		CMemoryTarget* MemoryTarget_SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate);
-		CMemoryTarget* MemoryTarget_ClearedTargetsSearch(int fmt, int widthmult, int channels, int height);
+		CMemoryTarget* SearchExistTarget(int start, int end, int nClutOffset, int clutsize, const tex0Info& tex0, int forcevalidate);
+		CMemoryTarget* ClearedTargetsSearch(int fmt, int widthmult, int channels, int height);
+		int CompareTarget(list<CMemoryTarget>::iterator& it, const tex0Info& tex0, int clutsize, int nClutOffset);

 		void Destroy(); // destroy all targs

@ -138,6 +140,8 @@ class CMemoryTargetMngr

 	private:
 		list<CMemoryTarget>::iterator DestroyTargetIter(list<CMemoryTarget>::iterator& it);
+		void GetClutVariables(int& nClutOffset, int& clutsize, const tex0Info& tex0);
+		void GetMemAddress(int& start, int& end,  const tex0Info& tex0);
 };

 class CBitwiseTextureMngr
--- a/plugins/zzogl-pg/opengl/x86-32.S
+++ b/plugins/zzogl-pg/opengl/x86-32.S
@ -4,15 +4,15 @@
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation either ve%rsion 2, or (at your option)
 #  any later ve%rsion.
-#   
+#
 #  This Program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 #  GNU General Public License for more details.
-#   
+#
 #  You should have received a copy of the GNU General Public License
 #  along with GNU Make see the file COPYING.  If not, write to
-#  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+#  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 #  http://www.gnu.org/copyleft/gpl.html
 #
 #
@ -20,6 +20,11 @@

 #ifdef ZEROGS_SSE2
 // SSE2 extensions
+
+// Note: pshufd 0xe4 <=> movdqa !!!
+// What the function does is
+// Interleave s1 and sd0 -> d1 (high) & sd0 (low)
+// Interleave s3 and sd2 -> d3 (high) & sd2 (low)
 #define punpck(op, sd0, sd2, s1, s3, d1, d3) \
 	movdqa %xmm##d1, %xmm##sd0; \
 	pshufd %xmm##d3, %xmm##sd2, 0xe4; \
@ -28,7 +33,16 @@
 	punpckl##op %xmm##sd2, %xmm##s3; \
 	punpckh##op %xmm##d3, %xmm##s3; \

-	
+
+// Input xmm7 == 0x0F0F0F0F 0x0F0F0F0F  0x0F0F0F0F 0x0F0F0F0F
+// DATA xmm[0-3]
+// This function does a 4-bits interleaving of 4 xmm registers
+//
+// ARG Can not put comment in the middle of the define...
+// After the first por
+// low 32bits (4bits packed) == 1.6 0.6 1.4 0.4  1.2 0.2 1.0 0.0
+// After the second one
+// low 32bits (4bits packed) == 1.7 0.7 1.5 0.5  1.3 0.3 1.1 0.1
 #define punpcknb \
 	movdqa	%xmm4, %xmm0; \
 	pshufd	%xmm5, %xmm1, 0xe4; \
@ -48,6 +62,7 @@
        \
 	movdqa	%xmm1, %xmm4; \
        \
+        \
 	movdqa	%xmm4, %xmm2; \
 	pshufd	%xmm5, %xmm3, 0xe4; \
        \
@ -66,7 +81,13 @@
        \
 	movdqa	%xmm3, %xmm4; \
        \
-	punpck(bw, 0, 2, 1, 3, 4, 6); \
+	punpck(bw, 0, 2, 1, 3, 4, 6);\
+
+// output
+// low 32 bits 0 (4 bits packed) == 1.3  0.3  1.2  0.2    1.1  0.1  1.0  0.0
+// low 32 bits 4 (4 bits packed) == 1.19 0.19 1.18 0.18   1.17 0.17 1.16 0.16
+// low 32 bits 2 (4 bits packed) == 3.3  2.3  3.2  2.2    3.1  2.1  3.0  2.0
+// low 32 bits 6 (4 bits packed) == 3.19 2.19 3.18 2.18   3.17 2.17 3.16 2.16


 //
@ -84,11 +105,15 @@ SwizzleBlock32_sse2:
 	push		%esi
 	push		%edi

+    // save dst
 	mov			%edi, %ecx
+    // save src
 	mov			%esi, %edx
+    // get pitch
 	mov			%edx, [%esp+4+8]
 	mov			%ecx, 4

+    // get WriteMask
 	mov			%eax, [%esp+8+8]
 	cmp			%eax, 0xffffffff
 	jne			SwizzleBlock32_sse2_2
@ -100,6 +125,8 @@ SwizzleBlock32_sse2_1:
 	movdqa		%xmm1, [%esi+%edx]
 	movdqa		%xmm5, [%esi+%edx+16]

+    // 64bits interleave 1&0 -> 2&0
+    // 64bits interleave 5&4 -> 6&4
 	punpck(qdq, 0, 4, 1, 5, 2, 6)

 	movntps		[%edi+16*0], %xmm0
@ -107,6 +134,7 @@ SwizzleBlock32_sse2_1:
 	movntps		[%edi+16*2], %xmm4
 	movntps		[%edi+16*3], %xmm6

+    // update ptr
 	lea			%esi, [%esi+%edx*2]
 	add			%edi, 64

@ -120,9 +148,10 @@ SwizzleBlock32_sse2_1:

 SwizzleBlock32_sse2_2:

+    // WriteMask: 32bits to 4*32bits
 	movd		%xmm7, %eax
 	pshufd		%xmm7, %xmm7, 0
-	
+
 	.align 16
 SwizzleBlock32_sse2_3:
 	movdqa		%xmm0, [%esi]
@ -130,13 +159,19 @@ SwizzleBlock32_sse2_3:
 	movdqa		%xmm1, [%esi+%edx]
 	movdqa		%xmm5, [%esi+%edx+16]

+    // 64bits interleave 1&0 -> 2&0
+    // 64bits interleave 5&4 -> 6&4
 	punpck(qdq, 0, 4, 1, 5, 2, 6)

+    // save a mask copy
 	movdqa		%xmm3, %xmm7
 	pshufd		%xmm5, %xmm7, 0xe4

+    // *dst & ~WriteMask
 	pandn		%xmm3, [%edi+16*0]
+    // *src & WriteMask
 	pand		%xmm0, %xmm7
+    // Final value to save
 	por			%xmm0, %xmm3
 	movntps		[%edi+16*0], %xmm0

@ -158,6 +193,7 @@ SwizzleBlock32_sse2_3:
 	por			%xmm6, %xmm5
 	movntps		[%edi+16*3], %xmm6

+    // update ptr
 	lea			%esi, [%esi+%edx*2]
 	add			%edi, 64

@ -179,6 +215,7 @@ SwizzleBlock16_sse2:

 	push		%ebx

+    // srcpitch
 	mov			%ebx, [%esp+4+4]
 	mov			%eax, 4

@ -189,7 +226,11 @@ SwizzleBlock16_sse2_1:
 	movdqa		%xmm2, [%edx+%ebx]
 	movdqa		%xmm3, [%edx+%ebx+16]

+    // 16bits interleave 1&0 -> 4&0
+    // 16bits interleave 3&2 -> 6&2
 	punpck(wd, 0, 2, 1, 3, 4, 6)
+    // 64bits interleave 2&0 -> 1&0
+    // 64bits interleave 6&4 -> 5&4
 	punpck(qdq, 0, 4, 2, 6, 1, 5)

 	movntps		[%ecx+16*0], %xmm0
@ -197,6 +238,7 @@ SwizzleBlock16_sse2_1:
 	movntps		[%ecx+16*2], %xmm4
 	movntps		[%ecx+16*3], %xmm5

+    // update ptr
 	lea			%edx, [%edx+%ebx*2]
 	add			%ecx, 64

@ -217,7 +259,9 @@ SwizzleBlock8_sse2:

 	push		%ebx

+    // load srcpitch
 	mov			%ebx, [%esp+4+4]
+    // basic counter
 	mov			%eax, 2

 	.align 16
@ -226,14 +270,23 @@ SwizzleBlock8_sse2_1:

 	movdqa		%xmm0, [%edx]
 	movdqa		%xmm2, [%edx+%ebx]
+    // update src pointer
 	lea			%edx, [%edx+%ebx*2]

+    // 2 3  0 1
 	pshufd		%xmm1, [%edx], 0xb1
 	pshufd		%xmm3, [%edx+%ebx], 0xb1
+    // update src pointer
 	lea			%edx, [%edx+%ebx*2]

+    // 8bits interleave 1&0 -> 4&0
+    // 8bits interleave 3&2 -> 6&2
 	punpck(bw, 0, 2, 1, 3, 4, 6)
+    // 16bits interleave 4&0 -> 1&0
+    // 16bits interleave 6&2 -> 3&2
 	punpck(wd, 0, 2, 4, 6, 1, 3)
+    // 64bits interleave 2&0 -> 4&0
+    // 64bits interleave 3&1 -> 5&1
 	punpck(qdq, 0, 1, 2, 3, 4, 5)

 	movntps		[%ecx+16*0], %xmm0
@ -241,18 +294,27 @@ SwizzleBlock8_sse2_1:
 	movntps		[%ecx+16*2], %xmm1
 	movntps		[%ecx+16*3], %xmm5

-	// col 1, 3
+	// col 1, 3 (same as previous column)

+    // 2 3  0 1
 	pshufd		%xmm0, [%edx], 0xb1
 	pshufd		%xmm2, [%edx+%ebx], 0xb1
+    // update src pointer
 	lea			%edx, [%edx+%ebx*2]

 	movdqa		%xmm1, [%edx]
 	movdqa		%xmm3, [%edx+%ebx]
+    // update src pointer
 	lea			%edx, [%edx+%ebx*2]

+    // 8bits interleave 1&0 -> 4&0
+    // 8bits interleave 3&2 -> 6&2
 	punpck(bw, 0, 2, 1, 3, 4, 6)
+    // 16bits interleave 4&0 -> 1&0
+    // 16bits interleave 6&2 -> 3&2
 	punpck(wd, 0, 2, 4, 6, 1, 3)
+    // 64bits interleave 2&0 -> 4&0
+    // 64bits interleave 3&1 -> 5&1
 	punpck(qdq, 0, 1, 2, 3, 4, 5)

 	movntps		[%ecx+16*4], %xmm0
@ -260,6 +322,7 @@ SwizzleBlock8_sse2_1:
 	movntps		[%ecx+16*6], %xmm1
 	movntps		[%ecx+16*7], %xmm5

+    // update dst pointer
 	add			%ecx, 128

 	dec			%eax
@ -278,11 +341,13 @@ SwizzleBlock8_sse2_1:
 SwizzleBlock4_sse2:

 	push		%ebx
-	
+
+    // load 4 0x0F0F0F0F
 	mov         %eax, 0xf0f0f0f
-	movd        %xmm7, %eax 
+	movd        %xmm7, %eax
 	pshufd      %xmm7, %xmm7, 0

+    // load srcpitch
 	mov			%ebx, [%esp+4+4]
 	mov			%eax, 2

@ -292,20 +357,32 @@ SwizzleBlock4_sse2_1:

 	movdqa		%xmm0, [%edx]
 	movdqa		%xmm2, [%edx+%ebx]
+    //update src pointer
 	lea			%edx, [%edx+%ebx*2]

 	movdqa		%xmm1, [%edx]
 	movdqa		%xmm3, [%edx+%ebx]
+    // update src pointer
 	lea			%edx, [%edx+%ebx*2]

+    // - - - -  2 3 0 1
 	pshuflw		%xmm1, %xmm1, 0xb1
 	pshuflw		%xmm3, %xmm3, 0xb1
+    // 6 7 4 5  - - - -
 	pshufhw		%xmm1, %xmm1, 0xb1
 	pshufhw		%xmm3, %xmm3, 0xb1

+    // 4bits interleave 1&0 -> 4&0
+    // 4bits interleave 3&2 -> 6&2
 	punpcknb
+    // 8bits interleave 4&0 -> 1&0
+    // 8bits interleave 6&2 -> 3&2
 	punpck(bw, 0, 2, 4, 6, 1, 3)
+    // 8bits interleave 1&0 -> 4&0
+    // 8bits interleave 3&2 -> 6&2
 	punpck(bw, 0, 2, 1, 3, 4, 6)
+    // 64bits interleave 2&0 -> 1&0
+    // 64bits interleave 6&4 -> 3&4
 	punpck(qdq, 0, 4, 2, 6, 1, 3)

 	movntps		[%ecx+16*0], %xmm0
@ -313,7 +390,7 @@ SwizzleBlock4_sse2_1:
 	movntps		[%ecx+16*2], %xmm4
 	movntps		[%ecx+16*3], %xmm3

-	// col 1, 3
+	// col 1, 3 (same as previous column)

 	movdqa		%xmm0, [%edx]
 	movdqa		%xmm2, [%edx+%ebx]
@ -349,6 +426,9 @@ SwizzleBlock4_sse2_1:

 //
 // swizzling with unaligned reads
+// Same functions as above, with movdqu instead of movdqa for the reads.
+// movdqu is as fast as movdqa when the address is aligned, so do not bother
+// choosing between them: use movdqu directly.
 //

 //
@ -400,7 +480,7 @@ SwizzleBlock32u_sse2_2:

 	movd		%xmm7, %eax
 	pshufd		%xmm7, %xmm7, 0
-	
+
 	.align 16
 SwizzleBlock32u_sse2_3:
 	movdqu		%xmm0, [%esi]
@ -480,7 +560,7 @@ SwizzleBlock16u_sse2_1:

 	dec			%eax
 	jnz			SwizzleBlock16u_sse2_1
-        
+
 	pop			%ebx

 	ret			4
@ -560,9 +640,9 @@ SwizzleBlock8u_sse2_1:
 SwizzleBlock4u_sse2:

 	push		%ebx
-	
+
 	mov         %eax, 0xf0f0f0f
-	movd        %xmm7, %eax 
+	movd        %xmm7, %eax
 	pshufd      %xmm7, %xmm7, 0

 	mov			%ebx, [%esp+4+4]
@ -628,7 +708,7 @@ SwizzleBlock4u_sse2_1:
 	pop			%ebx

 	ret			4
-                        
+
 #endif

 #if defined(__linux__) && defined(__ELF__)
--- a/plugins/zzogl-pg/opengl/x86.cpp
+++ b/plugins/zzogl-pg/opengl/x86.cpp
@ -22,7 +22,6 @@
 #include "x86.h"

 #if defined(ZEROGS_SSE2)
-#include <xmmintrin.h>
 #include <emmintrin.h>
 #endif

@ -64,23 +63,17 @@ void __fastcall FrameSwizzleBlock32A2_c(u32* dst, u32* src, int srcpitch, u32 Wr
 { 
 	u32* d = &g_columnTable32[0][0]; 
 	
-	if( WriteMask == 0xffffffff ) 
-	{ 
-		for(int i = 0; i < 8; ++i, d += 8) 
-		{ 
-			for(int j = 0; j < 8; ++j) 
-			{ 
+	if( WriteMask == 0xffffffff ) { 
+		for(int i = 0; i < 8; ++i, d += 8) { 
+			for(int j = 0; j < 8; ++j) { 
 				dst[d[j]] = ((src[2*j] + src[2*j+1]) >> 1); 
 			} 
 			src += srcpitch; 
 		} 
 	} 
-	else 
-	{ 
-		for(int i = 0; i < 8; ++i, d += 8) 
-		{ 
-			for(int j = 0; j < 8; ++j) 
-			{ 
+	else { 
+		for(int i = 0; i < 8; ++i, d += 8) { 
+			for(int j = 0; j < 8; ++j) { 
 				dst[d[j]] = (((src[2*j] + src[2*j+1]) >> 1)&WriteMask)|(dst[d[j]]&~WriteMask); 
 			} 
 			src += srcpitch; 
@ -92,23 +85,17 @@ void __fastcall FrameSwizzleBlock32A4_c(u32* dst, u32* src, int srcpitch, u32 Wr
 { 
 	u32* d = &g_columnTable32[0][0]; 
 	
-	if( WriteMask == 0xffffffff ) 
-	{ 
-		for(int i = 0; i < 8; ++i, d += 8) 
-		{ 
-			for(int j = 0; j < 8; ++j) 
-			{ 
+	if( WriteMask == 0xffffffff ) { 
+		for(int i = 0; i < 8; ++i, d += 8) { 
+			for(int j = 0; j < 8; ++j) { 
 				dst[d[j]] = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2); 
 			} 
 			src += srcpitch << 1; 
 		} 
 	} 
-	else 
-	{ 
-		for(int i = 0; i < 8; ++i, d += 8) 
-		{ 
-			for(int j = 0; j < 8; ++j) 
-			{ 
+	else { 
+		for(int i = 0; i < 8; ++i, d += 8) { 
+			for(int j = 0; j < 8; ++j) { 
 				dst[d[j]] = (((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2)&WriteMask)|(dst[d[j]]&~WriteMask); 
 			} 
 			src += srcpitch << 1; 
@ -663,6 +650,120 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0

 extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
 {
+#define YET_ANOTHER_INTRINSIC
+#ifdef YET_ANOTHER_INTRINSIC
+    __m128i vm0 = _mm_load_si128((__m128i*)vm);
+    __m128i vm1 = _mm_load_si128((__m128i*)vm+1);
+    __m128i vm2 = _mm_load_si128((__m128i*)vm+2);
+    __m128i vm3 = _mm_load_si128((__m128i*)vm+3);
+
+    // rearrange 16bits words
+    vm0 = _mm_shufflehi_epi16(vm0, 0x88);
+    vm0 = _mm_shufflelo_epi16(vm0, 0x88); // 6 4 6 4  2 0 2 0
+    vm1 = _mm_shufflehi_epi16(vm1, 0x88);
+    vm1 = _mm_shufflelo_epi16(vm1, 0x88); // 14 12 14 12  10 8 10 8
+
+    // Note: MSVC complains about direct c-cast...
+    // vm0 = (__m128i)_mm_shuffle_ps((__m128)vm0, (__m128)vm1, 0x88); // 14 12 10 8  6 4 2 0
+    __m128 vm0_f = (_mm_shuffle_ps((__m128&)vm0, (__m128&)vm1, 0x88)); // 14 12 10 8  6 4 2 0
+    vm0 = (__m128i&)vm0_f;
+    vm0 = _mm_shuffle_epi32(vm0, 0xD8); // 14 12 6 4  10 8 2 0
+
+    // *** Same jobs for vm2 and vm3
+    vm2 = _mm_shufflehi_epi16(vm2, 0x88);
+    vm2 = _mm_shufflelo_epi16(vm2, 0x88);
+    vm3 = _mm_shufflehi_epi16(vm3, 0x88);
+    vm3 = _mm_shufflelo_epi16(vm3, 0x88);
+
+    // Note: MSVC complains about direct c-cast...
+    // vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
+    __m128 vm2_f = (_mm_shuffle_ps((__m128&)vm2, (__m128&)vm3, 0x88)); // 14 12 10 8  6 4 2 0
+    vm2 = (__m128i&)vm2_f;
+    vm2 = _mm_shuffle_epi32(vm2, 0xD8);
+
+    // Create a zero register.
+    __m128i zero_128 = _mm_setzero_si128();
+
+    if ((u32)clut & 0x0F) {
+        // Unaligned write.
+
+        u16* clut_word_ptr = (u16*)clut;
+        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask2);
+
+        // Load previous data and clear high 16 bits of double words
+        __m128i clut_0 = _mm_load_si128((__m128i*)(clut_word_ptr-1)); // 6 5 4 3  2 1 0 x
+        __m128i clut_2 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+2); // 22 21 20 19  18 17 16 15
+        clut_0 = _mm_and_si128(clut_0, clut_mask); // - 5 - 3  - 1 - x
+        clut_2 = _mm_and_si128(clut_2, clut_mask); // - 21 - 19  - 17 - 15
+
+        // Convert vm0 from 16 bits to 32 bits (zero extended)
+        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8  - 2 - 0
+        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12  - 6  - 4
+
+        // Shift the values to align them with clut
+        vm0_low = _mm_slli_epi32(vm0_low, 16); // 10 - 8 -  2 - 0 -
+        vm0_high = _mm_slli_epi32(vm0_high, 16); // 14 - 12 -  6 - 4 -
+
+        // Interlace old and new data
+        clut_0 = _mm_or_si128(clut_0, vm0_low); // 10 5 8 3  2 1 0 x
+        clut_2 = _mm_or_si128(clut_2, vm0_high); // 14 21 12 19  6 17 4 15
+
+        // Save the result
+        _mm_store_si128((__m128i*)(clut_word_ptr-1), clut_0);
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+2, clut_2);
+
+        // *** Same jobs for clut_1 and clut_3
+        __m128i clut_1 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+1);
+        __m128i clut_3 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+3);
+        clut_1 = _mm_and_si128(clut_1, clut_mask);
+        clut_3 = _mm_and_si128(clut_3, clut_mask);
+
+        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
+        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
+        vm2_low = _mm_slli_epi32(vm2_low, 16);
+        vm2_high = _mm_slli_epi32(vm2_high, 16);
+
+        clut_1 = _mm_or_si128(clut_1, vm2_low);
+        clut_3 = _mm_or_si128(clut_3, vm2_high);
+
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+1, clut_1);
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+3, clut_3);
+    } else {
+        // Standard write
+
+        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask);
+
+        // Load previous data and clear low 16 bits of double words
+        __m128i clut_0 = _mm_and_si128(_mm_load_si128((__m128i*)clut), clut_mask); // 7 - 5 -  3 - 1 -
+        __m128i clut_2 = _mm_and_si128(_mm_load_si128((__m128i*)clut+2), clut_mask); // 23 - 21 -  19 - 17 -
+
+        // Convert vm0 from 16 bits to 32 bits (zero extended)
+        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8  - 2 - 0
+        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12  - 6  - 4
+
+        // Interlace old and new data
+        clut_0 = _mm_or_si128(clut_0, vm0_low); // 7 10 5 8  3 2 1 0
+        clut_2 = _mm_or_si128(clut_2, vm0_high); // 23 14 21 12  19 6 17 4
+
+        // Save the result
+        _mm_store_si128((__m128i*)clut, clut_0);
+        _mm_store_si128((__m128i*)clut+2, clut_2);
+
+        // *** Same jobs for clut_1 and clut_3
+        __m128i clut_1 = _mm_and_si128(_mm_load_si128((__m128i*)clut+1), clut_mask);
+        __m128i clut_3 = _mm_and_si128(_mm_load_si128((__m128i*)clut+3), clut_mask);
+
+        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
+        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
+
+        clut_1 = _mm_or_si128(clut_1, vm2_low);
+        clut_3 = _mm_or_si128(clut_3, vm2_high);
+
+        _mm_store_si128((__m128i*)clut+1, clut_1);
+        _mm_store_si128((__m128i*)clut+3, clut_3);
+    }
+
+#else
 #if defined(_MSC_VER)
 	__asm
 	{
@ -893,6 +994,7 @@ End:
    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
       );
 #endif // _MSC_VER
+#endif
 }

 #endif // ZEROGS_SSE2
@ -1115,3 +1217,4 @@ Z16Loop:
       );
 #endif // _MSC_VER
 }
+
--- a/plugins/zzogl-pg/opengl/zerogs.cpp
+++ b/plugins/zzogl-pg/opengl/zerogs.cpp
@ -32,6 +32,9 @@
 #include "targets.h"
 #include "GLWin.h"
 #include "ZZoglShaders.h"
+#ifdef ZEROGS_SSE2
+#include <emmintrin.h>
+#endif

 //----------------------- Defines

@ -95,7 +98,6 @@ namespace ZeroGS
 //	float4 g_vdepth = float4( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);

 extern CRangeManager s_RangeMngr; // manages overwritten memory
-GLenum GetRenderTargetFormat() { return GetRenderFormat() == RFT_byte8 ? 4 : g_internalRGBAFloat16Fmt; }

 // returns the first and last addresses aligned to a page that cover
 void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw);
@ -541,7 +543,7 @@ __forceinline void MOVFOG(VertexGPU *p, Vertex gsf)

 int Values[100] = {0, };

-void SET_VERTEX(VertexGPU *p, int Index, const VB& curvb)
+inline void SET_VERTEX(VertexGPU *p, int Index, const VB& curvb)
 {
 	int index = Index;
 	p->x = ((((int)gs.gsvertex[index].x - curvb.offset.x) >> 1) & 0xffff);
@ -852,6 +854,55 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)

 	bool bRet = false;

+    // FIXME: the code generated by the intrinsics is the same as the Linux asm.
+    // However, there is no equivalent of "cmp %%esi, 0x90" in the Windows asm!
+    // So the control flow must be checked.
+#define TEST_THIS
+#ifdef TEST_THIS
+    // Compare src against dst one 64-byte group (16 dwords) per iteration and stop
+    // at the first group that differs.
+    while(entries != 0) {
+#ifdef ZEROGS_SSE2
+        __m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst));
+
+        __m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1));
+        result = _mm_and_si128(result, result_tmp);
+
+        result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2));
+        result = _mm_and_si128(result, result_tmp);
+
+        result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3));
+        result = _mm_and_si128(result, result_tmp);
+
+        // _mm_movemask_epi8 returns one bit per byte of the register, i.e. 16 bits.
+        // All 16 bits are set only when every compared dword matched.
+        u32 result_int = _mm_movemask_epi8(result);
+        if (result_int != 0xFFFF) {
+            bRet = true;
+            break;
+        }
+#else
+        // I see no point to keep an mmx version. SSE2 versions is probably faster.
+        // Keep a slow portable C version for reference/debug
+        for (int i=0; i < 16 ; i++) {
+            if (*((u32*)src+i) != *((u32*)dst+i)) {
+                bRet = true;
+                break;
+            }
+        }
+        // The break above only leaves the inner for loop; leave the outer loop as well,
+        // like the SSE2 path does.
+        if (bRet) break;
+#endif
+
+        if (entries & 0x10) {
+            src -= 56; // go back and down one column
+        }
+
+        src += 32; // go to the right block
+
+        if (entries == 0x90) {
+            src += 32; // skip whole block
+        }
+
+        dst += 8;
+        entries -= 16;
+    }
+#else
+
 	// do a fast test with MMX
 #ifdef _MSC_VER
 	int storeebx;
@ -978,6 +1029,7 @@ Return:
 	".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");

 #endif // _WIN32
+#endif
 	return bRet;
 }

--- a/plugins/zzogl-pg/opengl/zerogs.h
+++ b/plugins/zzogl-pg/opengl/zerogs.h
@ -29,6 +29,7 @@
 #include <vector>
 #include <map>
 #include <string>
+#include <math.h>

 #include "ZZGl.h"
 #include "GS.h"
@ -100,12 +101,6 @@ namespace ZeroGS

 typedef void (*DrawFn)();

-enum RenderFormatType
-{
-	RFT_byte8 = 0,	  // A8R8G8B8
-	RFT_float16 = 1,	// A32R32B32G32
-};
-
 // managers render-to-texture targets

 class CRenderTarget
@ -237,6 +232,8 @@ class CMemoryTarget
 			clearminy = r.clearminy;
 			clearmaxy = r.clearmaxy;
 			widthmult = r.widthmult;
+			texH = r.texH;
+			texW = r.texW;
 			channels = r.channels;
 			validatecount = r.validatecount;
 			fmt = r.fmt;
@ -267,14 +264,20 @@ class CMemoryTarget

 		int starty, height; // assert(starty >= realy)
 		int realy, realheight; // this is never touched once allocated
+		// realy is start pointer of data in 4M data block (start) and size (end-start).
+		
 		u32 usedstamp;
 		u8 psm, cpsm; // texture and clut format. For psm, only 16bit/32bit differentiation matters

 		u32 fmt;

-		int widthmult;
-		int channels;
-		int clearminy, clearmaxy; // when maxy > 0, need to check for clearing
+		int widthmult;	// Either 1 or 2.
+		int channels;	// The number of pixels per PSM format word. channels == PIXELS_PER_WORD(psm)
+		// texW/texH: the real drawing size, in pixels, of the texture in the renderbuffer.
+		// NOTE(review): the two formulas below look swapped (a width derived from realheight,
+		// a height derived from GPU_TEXWIDTH) -- confirm against the code that assigns them.
+		int texW;		// (realheight + widthmult - 1)/widthmult == realheight or [(realheight+1)/2]
+		int texH;		// GPU_TEXWIDTH * widthmult * channels
+
+		int clearminy, clearmaxy;	// when maxy > 0, need to check for clearing

 		int validatecount; // count how many times has been validated, if too many, destroy

@ -415,7 +418,6 @@ extern float fiTexWidth[2], fiTexHeight[2];	// current tex width and height
 extern vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
 extern GLuint vboRect;
 extern int g_nCurVBOIndex;
-extern RenderFormatType g_RenderFormatType;

 void AddMessage(const char* pstr, u32 ms = 5000);
 void DrawText(const char* pstr, int left, int top, u32 color);
@ -479,8 +481,6 @@ bool CheckChangeInClut(u32 highdword, u32 psm); // returns true if clut will cha

 // call to load CLUT data (depending on CLD)
 void texClutWrite(int ctx);
-RenderFormatType GetRenderFormat();
-GLenum GetRenderTargetFormat();

 int Save(s8* pbydata);
 bool Load(s8* pbydata);
@ -523,7 +523,25 @@ inline void CluttingForFlushedTex(tex0Info* tex0, u32 Data, int ictx)
 	tex0->cld  = ZZOglGet_cld_TexBits(Data);

 	ZeroGS::texClutWrite(ictx);
+ };
+ 
+// Returns the size in bytes of x "strings" of texture (presumably rows of
+// texture memory -- TODO confirm): 4 * GPU_TEXWIDTH bytes per string.
+inline int MemorySize(int x) 
+{
+	return 4 * GPU_TEXWIDTH * x;
 }
-};

+// Returns the address in memory of the data block for string x,
+// i.e. g_pbyGSMemory advanced by MemorySize(x).
+inline u8* MemoryAddress(int x) 
+{
+	return g_pbyGSMemory + MemorySize(x);
+}
+
+// Variant of MemoryAddress() whose per-unit stride is the compile-time
+// template argument mult: returns g_pbyGSMemory + mult * x.
+template <u32 mult>
+inline u8* _MemoryAddress(int x) 
+{
+	return g_pbyGSMemory + mult * x;
+}
+
+};
 #endif