diff --git a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp index b9659b4c01..bbcc2d170f 100644 --- a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp +++ b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp @@ -78,6 +78,9 @@ + + + diff --git a/plugins/zzogl-pg/opengl/Makefile.am b/plugins/zzogl-pg/opengl/Makefile.am index 679a611d0b..c055ca4c82 100644 --- a/plugins/zzogl-pg/opengl/Makefile.am +++ b/plugins/zzogl-pg/opengl/Makefile.am @@ -26,7 +26,9 @@ libzzoglpg_LDADD=$(libzzoglpg_a_OBJECTS) libzzoglpg_a_SOURCES = \ GSmain.cpp GifTransfer.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp Mem.cpp \ rasterfont.cpp targets.cpp zerogs.cpp ZZoglVB.cpp ZZoglShoots.cpp ZZoglCreate.cpp \ -ZZoglShaders.cpp ZZoglCRTC.cpp ZZoglSave.cpp ZZoglFlush.cpp +ZZoglShaders.cpp ZZoglCRTC.cpp ZZoglSave.cpp ZZoglFlush.cpp \ +Mem_Swizzle.h Mem_Tables.cpp Mem_Transmit.h + libzzoglpg_a_SOURCES += x86-32.S diff --git a/plugins/zzogl-pg/opengl/Mem.cpp b/plugins/zzogl-pg/opengl/Mem.cpp index 8d82f6ebca..3d67880f85 100644 --- a/plugins/zzogl-pg/opengl/Mem.cpp +++ b/plugins/zzogl-pg/opengl/Mem.cpp @@ -22,481 +22,12 @@ #include "targets.h" #include "x86.h" -u32 g_blockTable32[4][8] = { - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -u32 g_blockTable32Z[4][8] = { - { 24, 25, 28, 29, 8, 9, 12, 13}, - { 26, 27, 30, 31, 10, 11, 14, 15}, - { 16, 17, 20, 21, 0, 1, 4, 5}, - { 18, 19, 22, 23, 2, 3, 6, 7} -}; - -u32 g_blockTable16[8][4] = { - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -u32 g_blockTable16S[8][4] = { - { 0, 2, 16, 18 }, - { 1, 3, 17, 19 }, - { 8, 10, 24, 26 }, - { 9, 11, 25, 27 }, - { 4, 6, 20, 22 }, - { 5, 7, 21, 23 }, - { 12, 14, 28, 30 }, - { 13, 15, 29, 31 } -}; - -u32 g_blockTable16Z[8][4] = { - { 24, 26, 16, 18 }, - { 25, 27, 17, 19 }, - { 28, 30, 20, 22 }, - { 29, 31, 21, 23 }, - { 8, 10, 0, 2 }, - { 9, 11, 1, 3 }, - { 12, 14, 4, 6 }, - { 13, 15, 5, 7 } -}; - -u32 g_blockTable16SZ[8][4] = { - { 24, 26, 8, 10 }, - { 25, 27, 9, 11 }, - { 16, 18, 0, 2 }, - { 17, 19, 1, 3 }, - { 28, 30, 12, 14 }, - { 29, 31, 13, 15 }, - { 20, 22, 4, 6 }, - { 21, 23, 5, 7 } -}; - -u32 g_blockTable8[4][8] = { - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -u32 g_blockTable4[8][4] = { - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -u32 g_columnTable32[8][8] = { - { 0, 1, 4, 5, 8, 9, 12, 13 }, - { 2, 3, 6, 7, 10, 11, 14, 15 }, - { 16, 17, 20, 21, 24, 25, 28, 29 }, - { 18, 19, 22, 23, 26, 27, 30, 31 }, - { 32, 33, 36, 37, 40, 41, 44, 45 }, - { 34, 35, 38, 39, 42, 43, 46, 47 }, - { 48, 49, 52, 53, 56, 57, 60, 61 }, - { 50, 51, 54, 55, 58, 59, 62, 63 }, -}; - -u32 g_columnTable16[8][16] = { - { 0, 2, 8, 10, 16, 18, 24, 26, - 1, 3, 9, 11, 17, 19, 25, 27 }, - { 4, 6, 12, 14, 20, 22, 28, 30, - 5, 7, 13, 15, 21, 23, 29, 31 }, - { 32, 34, 40, 42, 48, 50, 56, 58, - 33, 35, 41, 43, 49, 51, 57, 59 }, - { 36, 38, 44, 46, 52, 54, 60, 62, - 37, 39, 45, 47, 53, 55, 61, 63 }, - { 64, 66, 72, 74, 80, 82, 88, 90, - 65, 67, 73, 75, 81, 83, 89, 91 }, - { 68, 70, 76, 78, 84, 86, 92, 94, - 69, 71, 77, 79, 85, 87, 93, 95 }, - { 96, 98, 104, 106, 112, 114, 120, 
122, - 97, 99, 105, 107, 113, 115, 121, 123 }, - { 100, 102, 108, 110, 116, 118, 124, 126, - 101, 103, 109, 111, 117, 119, 125, 127 }, -}; - -u32 g_columnTable8[16][16] = { - { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 - 2, 6, 18, 22, 34, 38, 50, 54 }, - { 8, 12, 24, 28, 40, 44, 56, 60, - 10, 14, 26, 30, 42, 46, 58, 62 }, - { 33, 37, 49, 53, 1, 5, 17, 21, - 35, 39, 51, 55, 3, 7, 19, 23 }, - { 41, 45, 57, 61, 9, 13, 25, 29, - 43, 47, 59, 63, 11, 15, 27, 31 }, - { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 - 98, 102, 114, 118, 66, 70, 82, 86 }, - { 104, 108, 120, 124, 72, 76, 88, 92, - 106, 110, 122, 126, 74, 78, 90, 94 }, - { 65, 69, 81, 85, 97, 101, 113, 117, - 67, 71, 83, 87, 99, 103, 115, 119 }, - { 73, 77, 89, 93, 105, 109, 121, 125, - 75, 79, 91, 95, 107, 111, 123, 127 }, - { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 - 130, 134, 146, 150, 162, 166, 178, 182 }, - { 136, 140, 152, 156, 168, 172, 184, 188, - 138, 142, 154, 158, 170, 174, 186, 190 }, - { 161, 165, 177, 181, 129, 133, 145, 149, - 163, 167, 179, 183, 131, 135, 147, 151 }, - { 169, 173, 185, 189, 137, 141, 153, 157, - 171, 175, 187, 191, 139, 143, 155, 159 }, - { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 - 226, 230, 242, 246, 194, 198, 210, 214 }, - { 232, 236, 248, 252, 200, 204, 216, 220, - 234, 238, 250, 254, 202, 206, 218, 222 }, - { 193, 197, 209, 213, 225, 229, 241, 245, - 195, 199, 211, 215, 227, 231, 243, 247 }, - { 201, 205, 217, 221, 233, 237, 249, 253, - 203, 207, 219, 223, 235, 239, 251, 255 }, -}; - -u32 g_columnTable4[16][32] = { - { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 - 2, 10, 34, 42, 66, 74, 98, 106, - 4, 12, 36, 44, 68, 76, 100, 108, - 6, 14, 38, 46, 70, 78, 102, 110 }, - { 16, 24, 48, 56, 80, 88, 112, 120, - 18, 26, 50, 58, 82, 90, 114, 122, - 20, 28, 52, 60, 84, 92, 116, 124, - 22, 30, 54, 62, 86, 94, 118, 126 }, - { 65, 73, 97, 105, 1, 9, 33, 41, - 67, 75, 99, 107, 3, 11, 35, 43, - 69, 77, 101, 109, 5, 13, 37, 45, - 71, 79, 103, 111, 7, 15, 39, 47 }, - { 81, 89, 113, 121, 17, 25, 49, 57, - 83, 91, 115, 123, 19, 27, 51, 59, - 85, 93, 117, 125, 21, 29, 53, 61, - 87, 95, 119, 127, 23, 31, 55, 63 }, - { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 - 194, 202, 226, 234, 130, 138, 162, 170, - 196, 204, 228, 236, 132, 140, 164, 172, - 198, 206, 230, 238, 134, 142, 166, 174 }, - { 208, 216, 240, 248, 144, 152, 176, 184, - 210, 218, 242, 250, 146, 154, 178, 186, - 212, 220, 244, 252, 148, 156, 180, 188, - 214, 222, 246, 254, 150, 158, 182, 190 }, - { 129, 137, 161, 169, 193, 201, 225, 233, - 131, 139, 163, 171, 195, 203, 227, 235, - 133, 141, 165, 173, 197, 205, 229, 237, - 135, 143, 167, 175, 199, 207, 231, 239 }, - { 145, 153, 177, 185, 209, 217, 241, 249, - 147, 155, 179, 187, 211, 219, 243, 251, - 149, 157, 181, 189, 213, 221, 245, 253, - 151, 159, 183, 191, 215, 223, 247, 255 }, - { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 - 258, 266, 290, 298, 322, 330, 354, 362, - 260, 268, 292, 300, 324, 332, 356, 364, - 262, 270, 294, 302, 326, 334, 358, 366 }, - { 272, 280, 304, 312, 336, 344, 368, 376, - 274, 282, 306, 314, 338, 346, 370, 378, - 276, 284, 308, 316, 340, 348, 372, 380, - 278, 286, 310, 318, 342, 350, 374, 382 }, - { 321, 329, 353, 361, 257, 265, 289, 297, - 323, 331, 355, 363, 259, 267, 291, 299, - 325, 333, 357, 365, 261, 269, 293, 301, - 327, 335, 359, 367, 263, 271, 295, 303 }, - { 337, 345, 369, 377, 273, 281, 305, 313, - 339, 347, 371, 379, 275, 283, 307, 315, - 341, 349, 373, 381, 277, 285, 309, 317, - 343, 351, 375, 383, 279, 287, 311, 319 }, - { 
448, 456, 480, 488, 384, 392, 416, 424, // column 3 - 450, 458, 482, 490, 386, 394, 418, 426, - 452, 460, 484, 492, 388, 396, 420, 428, - 454, 462, 486, 494, 390, 398, 422, 430 }, - { 464, 472, 496, 504, 400, 408, 432, 440, - 466, 474, 498, 506, 402, 410, 434, 442, - 468, 476, 500, 508, 404, 412, 436, 444, - 470, 478, 502, 510, 406, 414, 438, 446 }, - { 385, 393, 417, 425, 449, 457, 481, 489, - 387, 395, 419, 427, 451, 459, 483, 491, - 389, 397, 421, 429, 453, 461, 485, 493, - 391, 399, 423, 431, 455, 463, 487, 495 }, - { 401, 409, 433, 441, 465, 473, 497, 505, - 403, 411, 435, 443, 467, 475, 499, 507, - 405, 413, 437, 445, 469, 477, 501, 509, - 407, 415, 439, 447, 471, 479, 503, 511 }, -}; - -u32 g_pageTable32[32][64]; -u32 g_pageTable32Z[32][64]; -u32 g_pageTable16[64][64]; -u32 g_pageTable16S[64][64]; -u32 g_pageTable16Z[64][64]; -u32 g_pageTable16SZ[64][64]; -u32 g_pageTable8[64][128]; -u32 g_pageTable4[128][128]; +#include "Mem_Transmit.h" +#include "Mem_Swizzle.h" BLOCK m_Blocks[0x40]; // do so blocks are indexable -static PCSX2_ALIGNED16(u32 tempblock[64]); -#define DSTPSM gs.dstbuf.psm - -#define START_HOSTLOCAL() \ - assert( gs.imageTransfer == 0 ); \ - u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; \ - \ - /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ \ - int i = gs.imageY, j = gs.imageX; \ - -#define END_HOSTLOCAL() \ -End: \ - if( i >= gs.imageEndY ) { \ - assert( gs.imageTransfer == -1 || i == gs.imageEndY ); \ - gs.imageTransfer = -1; \ - /*int start, end; \ - ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); \ - ZeroGS::g_MemTargs.ClearRange(start, end);*/ \ - } \ - else { \ - /* update new params */ \ - gs.imageY = i; \ - gs.imageX = j; \ - } \ - -// transfers whole rows -#define TRANSMIT_HOSTLOCAL_Y_(psm, T, widthlimit, endY) { \ - assert( (nSize%widthlimit) == 0 && widthlimit <= 4 ); \ - if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { \ - /*GS_LOG("Bad Transmission! 
%d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 1) { \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ - } \ - } \ - } \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += widthlimit) { \ - /* write as many pixel at one time as possible */ \ - if( nSize < widthlimit ) goto End; \ - writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ - \ - if( widthlimit > 1 ) { \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, pbuf[1], gs.dstbuf.bw); \ - \ - if( widthlimit > 2 ) { \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, pbuf[2], gs.dstbuf.bw); \ - \ - if( widthlimit > 3 ) { \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, pbuf[3], gs.dstbuf.bw); \ - } \ - } \ - } \ - } \ - \ - if( j >= gs.imageEndX ) { assert(j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || nSize*sizeof(T)/4 == 0 ); goto End; } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define TRANSMIT_HOSTLOCAL_X_(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j++, pbuf++) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); \ - } \ - pbuf += pitch-fracX; \ - } \ -} \ - -// transfers whole rows -#define TRANSMIT_HOSTLOCAL_Y_24(psm, T, widthlimit, endY) { \ - if( widthlimit != 8 || ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { \ - /*GS_LOG("Bad Transmission! %d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 3) { \ - writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf), gs.dstbuf.bw); \ - } \ - \ - if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || nSize == 0 ); goto End; } \ - } \ - } \ - else { \ - assert( /*(nSize%widthlimit) == 0 &&*/ widthlimit == 8 ); \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += 3*widthlimit) { \ - if( nSize < widthlimit ) goto End; \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf+0), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *(u32*)(pbuf+3), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *(u32*)(pbuf+6), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *(u32*)(pbuf+9), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *(u32*)(pbuf+12), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *(u32*)(pbuf+15), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *(u32*)(pbuf+18), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *(u32*)(pbuf+21), gs.dstbuf.bw); \ - } \ - \ - if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { \ - if( nSize < 0 ) { \ - /* extracted too much */ \ - assert( (nSize%3)==0 && nSize > -24 ); \ - j += nSize/3; \ - nSize = 0; \ - } \ - assert( gs.imageTransfer == -1 || nSize == 0 ); \ - goto End; \ - } \ - } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define 
TRANSMIT_HOSTLOCAL_X_24(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j++, pbuf += 3) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, *(u32*)pbuf, gs.dstbuf.bw); \ - } \ - pbuf += 3*(pitch-fracX); \ - } \ -} \ - -// meant for 4bit transfers -#define TRANSMIT_HOSTLOCAL_Y_4(psm, T, widthlimit, endY) { \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit) { \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - if( widthlimit > 2 ) { \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - \ - if( widthlimit > 4 ) { \ - writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - \ - if( widthlimit > 6 ) { \ - writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - } \ - } \ - } \ - } \ - \ - if( j >= gs.imageEndX ) { j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || (nSize/32) == 0 ); goto End; } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define TRANSMIT_HOSTLOCAL_X_4(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j+=2, pbuf++) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0]&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, (i+tempi)%2048, pbuf[0]>>4, gs.dstbuf.bw); \ - } \ - pbuf += (pitch-fracX)/2; \ - } \ -} \ - -// calculate pitch in source buffer -#define TRANSMIT_PITCH_(pitch, T) (pitch*sizeof(T)) -#define TRANSMIT_PITCH_24(pitch, T) (pitch*3) -#define TRANSMIT_PITCH_4(pitch, T) (pitch/2) - -// special swizzle macros -#define SwizzleBlock24(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch-24) { \ - for(int bx = 0; bx < 8; ++bx, pnewsrc += 3) { \ - pblock[bx] = *(u32*)pnewsrc; \ - } \ - } \ - for(int bx = 0; bx < 7; ++bx, pnewsrc += 3) { \ - /* might be 1 byte out of bounds of GS memory */ \ - pblock[bx] = *(u32*)pnewsrc; \ - } \ - /* do 3 bytes for the last copy */ \ - *((u8*)pblock+28) = pnewsrc[0]; \ - *((u8*)pblock+29) = pnewsrc[1]; \ - *((u8*)pblock+30) = pnewsrc[2]; \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff); \ -} \ - -#define SwizzleBlock24u SwizzleBlock24 - -#define SwizzleBlock8H(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = *(u32*)pnewsrc; \ - pblock[0] = u<<24; \ - pblock[1] = u<<16; \ - pblock[2] = u<<8; \ - pblock[3] = u; \ - u = *(u32*)(pnewsrc+4); \ - pblock[4] = u<<24; \ - pblock[5] = u<<16; \ - pblock[6] = u<<8; \ - pblock[7] = u; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000); \ -} \ - -#define SwizzleBlock8Hu SwizzleBlock8H - -#define SwizzleBlock4HH(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = 
*(u32*)pnewsrc; \ - pblock[0] = u<<28; \ - pblock[1] = u<<24; \ - pblock[2] = u<<20; \ - pblock[3] = u<<16; \ - pblock[4] = u<<12; \ - pblock[5] = u<<8; \ - pblock[6] = u<<4; \ - pblock[7] = u; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000); \ -} \ - -#define SwizzleBlock4HHu SwizzleBlock4HH - -#define SwizzleBlock4HL(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = *(u32*)pnewsrc; \ - pblock[0] = u<<24; \ - pblock[1] = u<<20; \ - pblock[2] = u<<16; \ - pblock[3] = u<<12; \ - pblock[4] = u<<8; \ - pblock[5] = u<<4; \ - pblock[6] = u; \ - pblock[7] = u>>4; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000); \ -} \ - -#define SwizzleBlock4HLu SwizzleBlock4HL +PCSX2_ALIGNED16(u32 tempblock[64]); // ------------------------ // | Y | @@ -510,11 +41,15 @@ End: \ #define DEFINE_TRANSFERLOCAL(psm, T, widthlimit, blockbits, blockwidth, blockheight, TransSfx, SwizzleBlock) \ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ { \ - START_HOSTLOCAL(); \ + assert( gs.imageTransfer == 0 ); \ + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; \ + \ + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ \ + int i = gs.imageY, j = gs.imageX; \ \ const T* pbuf = (const T*)pbyMem; \ - int nLeftOver = (nQWordSize*4*2)%(TRANSMIT_PITCH##TransSfx(2, T)); \ - int nSize = nQWordSize*4*2/TRANSMIT_PITCH##TransSfx(2, T); \ + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch##TransSfx(2)); \ + int nSize = nQWordSize*4*2/TransmitPitch##TransSfx(2); \ nSize = min(nSize, gs.imageWnew * gs.imageHnew); \ \ int pitch, area, fracX; \ @@ -544,10 +79,10 @@ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ \ if( ((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit) ) { \ /* transmit with a width of 1 */ \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, (1+(DSTPSM == 0x14)), endY); \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, (1+(DSTPSM == 0x14)), endY); \ } \ else { \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, widthlimit, endY); \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, widthlimit, endY); \ } \ \ if( nSize == 0 || i == gs.imageEndY ) \ @@ -561,44 +96,1710 @@ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ area = pitch*blockheight; \ fracX = gs.imageEndX-alignedX; \ \ - /* on top of checking whether pbuf is alinged, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ \ - bAligned = !((uptr)pbuf & 0xf) && (TRANSMIT_PITCH##TransSfx(pitch, T)&0xf) == 0; \ + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ \ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch##TransSfx(pitch) & 0xf) == 0; \ \ /* transfer aligning to blocks */ \ for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) { \ \ if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) { \ - for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH##TransSfx(blockwidth, T)/sizeof(T)) { \ - SwizzleBlock(pstart + getPixelAddress##psm##_0(tempj, i, gs.dstbuf.bw)*blockbits/8, \ - (u8*)pbuf, TRANSMIT_PITCH##TransSfx(pitch, T)); \ + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch##TransSfx(blockwidth)/sizeof(T)) { \ + SwizzleBlock(pstart + getPixelAddress_0(psm,tempj, i, gs.dstbuf.bw)*blockbits/8, \ + (u8*)pbuf, 
TransmitPitch##TransSfx(pitch)); \ } \ } \ else { \ - for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH##TransSfx(blockwidth, T)/sizeof(T)) { \ - SwizzleBlock##u(pstart + getPixelAddress##psm##_0(tempj, i, gs.dstbuf.bw)*blockbits/8, \ - (u8*)pbuf, TRANSMIT_PITCH##TransSfx(pitch, T)); \ + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch##TransSfx(blockwidth)/sizeof(T)) { \ + SwizzleBlock##u(pstart + getPixelAddress_0(psm,tempj, i, gs.dstbuf.bw)*blockbits/8, \ + (u8*)pbuf, TransmitPitch##TransSfx(pitch)); \ } \ } \ \ /* transfer the rest */ \ if( alignedX < gs.imageEndX ) { \ - TRANSMIT_HOSTLOCAL_X##TransSfx(psm, T, widthlimit, blockheight, alignedX); \ - pbuf -= TRANSMIT_PITCH##TransSfx((alignedX-gs.trxpos.dx), T)/sizeof(T); \ + TRANSMIT_HOSTLOCAL_X(TransSfx,psm, T, widthlimit, blockheight, alignedX); \ + pbuf -= TransmitPitch##TransSfx(alignedX-gs.trxpos.dx)/sizeof(T); \ } \ - else pbuf += (blockheight-1)*TRANSMIT_PITCH##TransSfx(pitch, T)/sizeof(T); \ + else pbuf += (blockheight-1)*TransmitPitch##TransSfx(pitch)/sizeof(T); \ j = gs.trxpos.dx; \ } \ \ - if( TRANSMIT_PITCH##TransSfx(nSize, T)/4 > 0 ) { \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, widthlimit, gs.imageEndY); \ + if( TransmitPitch##TransSfx(nSize)/4 > 0 ) { \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, widthlimit, gs.imageEndY); \ /* sometimes wrong sizes are sent (tekken tag) */ \ - assert( gs.imageTransfer == -1 || TRANSMIT_PITCH##TransSfx(nSize,T)/4 <= 2 ); \ + assert( gs.imageTransfer == -1 || TransmitPitch##TransSfx(nSize)/4 <= 2 ); \ } \ \ - END_HOSTLOCAL(); \ - return (nSize * TRANSMIT_PITCH##TransSfx(2, T) + nLeftOver)/2; \ +End: \ + if( i >= gs.imageEndY ) { \ + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); \ + gs.imageTransfer = -1; \ + /*int start, end; \ + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); \ + ZeroGS::g_MemTargs.ClearRange(start, end);*/ \ + } \ + else { \ + /* update new params */ \ + gs.imageY = i; \ + gs.imageX = j; \ + } \ + return (nSize * TransmitPitch##TransSfx(2) + nLeftOver)/2; \ } \ +//#define NEW_TRANSFER +#ifdef NEW_TRANSFER + +//DEFINE_TRANSFERLOCAL(32, u32, 2, 32, 8, 8, _, SwizzleBlock32); +int TransferHostLocal32(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 2; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u32); +// _SwizzleBlock swizzle; + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u32* pbuf = (const u32*)pbyMem; + const int tp2 = TransmitPitch_(2); + int nLeftOver = (nQWordSize*4*2)%tp2; + int nSize = (nQWordSize*4*2)/tp2; + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d 
%d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(32, u32, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(32, u32, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch) & 0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + + if ( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL))) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; + SwizzleBlock32(temp, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; + SwizzleBlock32u(temp, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + +// if ( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL))) +// { +// swizzle = SwizzleBlock32; +// } +// else +// { +// swizzle = SwizzleBlock32u; +// } +// +// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) +// { +// u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; +// swizzle(temp, (u8*)pbuf, TransmitPitch_(pitch), 0xffffffff); +// } + + /* transfer the rest */ + if( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(32, u32, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + + j = gs.trxpos.dx; + } + + if ( TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(32, u32, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + +End: + if( i >= gs.imageEndY ) + { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else + { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(32Z, u32, 2, 32, 8, 8, _, SwizzleBlock32); +int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 2; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = 
sizeof(u32); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u32* pbuf = (const u32*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(32Z, u32, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(32Z, u32, widthlimit, endY); + } + + if ( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u32)) + { + SwizzleBlock32(pstart + getPixelAddress_0(32Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u32)) + { + SwizzleBlock32u(pstart + getPixelAddress_0(32Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_( 32Z, u32, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX - gs.trxpos.dx)/sizeof(u32); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/sizeof(u32); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_( 32Z, u32, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, 
gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(24, u8, 8, 32, 8, 8, _24, SwizzleBlock24); +int TransferHostLocal24(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_24(2)); + int nSize = nQWordSize*4*2/TransmitPitch_24(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_24(24, T, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_24(24, T, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_24(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24(pstart + getPixelAddress_0(24,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24u(pstart + getPixelAddress_0(24,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_24(24, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_24((alignedX-gs.trxpos.dx))/sizeof(u8); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_24(pitch)/sizeof(u8); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_24(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_24(24, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_24(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_24(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(24Z, u8, 8, 32, 8, 8, _24, SwizzleBlock24); +int TransferHostLocal24Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_24(2)); + int nSize = nQWordSize*4*2/TransmitPitch_24(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = 
gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_24(16, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_24(16, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_24(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24u(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_24(16, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_24(alignedX-gs.trxpos.dx)/sizeof(u8); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_24(pitch)/sizeof(u8); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_24(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_24(24, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_24(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_24(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if 
((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u16)) + { + SwizzleBlock16(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u16)) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/sizeof(u16); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/sizeof(u16); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16S, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16S(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; 
+ int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16S, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16S, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16S,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16S,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16S, u16, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16S, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16Z, 
u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16Z, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16SZ, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16SZ(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole 
image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16SZ,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16SZ,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16SZ, u16, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(8, u8, 4, 8, 16, 16, _, SwizzleBlock8); +int TransferHostLocal8(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 8; + const u32 blockwidth = 16; + const u32 blockheight = 16; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = 
(int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(8, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(8, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8(pstart + getPixelAddress_0(8,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8u(pstart + getPixelAddress_0(8,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(8, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TRANSMIT_PITCH_(nSize, u8)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(8, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4, u8, 8, 4, 32, 16, _4, SwizzleBlock4); +int TransferHostLocal4(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 4; + const u32 blockwidth = 32; + const u32 blockheight = 16; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = 
min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4(pstart + getPixelAddress_0(4,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4u(pstart + getPixelAddress_0(4,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(8H, u8, 4, 32, 8, 8, _, SwizzleBlock8H); +int TransferHostLocal8H(const void* pbyMem, u32 nQWordSize) +{ + const u32 
widthlimit = 4; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(8H, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(8H, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8H(pstart + getPixelAddress_0(8H,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8Hu(pstart + getPixelAddress_0(8H,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(8H, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TRANSMIT_PITCH_(nSize, u8)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(8H, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4HL, u8, 8, 32, 8, 8, _4, SwizzleBlock4HL); +int TransferHostLocal4HL(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image 
*/ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HL(pstart + getPixelAddress_0(4HL,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HLu(pstart + getPixelAddress_0(4HL,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4HL, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4HH, u8, 8, 32, 8, 8, _4, SwizzleBlock4HH); +int TransferHostLocal4HH(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int 
testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HH(pstart + getPixelAddress_0(4HH,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HHu(pstart + getPixelAddress_0(4HH,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4HH,u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} +#else + DEFINE_TRANSFERLOCAL(32, u32, 2, 32, 8, 8, _, SwizzleBlock32); DEFINE_TRANSFERLOCAL(32Z, u32, 2, 32, 8, 8, _, SwizzleBlock32); DEFINE_TRANSFERLOCAL(24, u8, 8, 32, 8, 8, _24, SwizzleBlock24); @@ -613,98 +1814,7 @@ DEFINE_TRANSFERLOCAL(8H, u8, 4, 32, 8, 8, _, SwizzleBlock8H); DEFINE_TRANSFERLOCAL(4HL, u8, 8, 32, 8, 8, _4, SwizzleBlock4HL); DEFINE_TRANSFERLOCAL(4HH, u8, 8, 32, 8, 8, _4, SwizzleBlock4HH); -//#define T u8 -//#define widthlimit 8 -//#define blockbits 4 -//#define blockwidth 32 -//#define blockheight 16 -// -//void TransferHostLocal4(const void* pbyMem, 
u32 nQWordSize) -//{ -// START_HOSTLOCAL(); -// -// const T* pbuf = (const T*)pbyMem; -// u32 nSize = nQWordSize*16*2/TRANSMIT_PITCH_4(2, T); -// nSize = min(nSize, gs.imageWnew * gs.imageHnew); -// -// int endY = ROUND_UPPOW2(i, blockheight); -// int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); -// int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); -// bool bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; -// -// if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { -// /* hack */ -// if( abs((int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx)) <= widthlimit ) { -// /* don't transfer */ -// /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ -// gs.imageTransfer = -1; -// } -// bCanAlign = false; -// } -// -// /* first align on block boundary */ -// if( MOD_POW2(i, blockheight) || !bCanAlign ) { -// -// if( !bCanAlign ) -// endY = gs.imageEndY; /* transfer the whole image */ -// else -// assert( endY < gs.imageEndY); /* part of alignment condition */ -// -// if( (DSTPSM == 0x13 || DSTPSM == 0x14) && ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { -// /* transmit with a width of 1 */ -// TRANSMIT_HOSTLOCAL_Y_4(4, T, (1+(DSTPSM == 0x14)), endY); -// } -// else { -// TRANSMIT_HOSTLOCAL_Y_4(4, T, widthlimit, endY); -// } -// -// if( nSize == 0 || i == gs.imageEndY ) -// goto End; -// } -// -// assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); -// -// /* can align! */ -// int pitch = gs.imageEndX-gs.trxpos.dx; -// u32 area = pitch*blockheight; -// int fracX = gs.imageEndX-alignedX; -// -// /* on top of checking whether pbuf is alinged, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ -// bool bAligned = !((u32)pbuf & 0xf) && (TRANSMIT_PITCH_4(pitch, T)&0xf) == 0; -// -// /* transfer aligning to blocks */ -// for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) { -// -// if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) { -// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH_4(blockwidth, T)/sizeof(T)) { -// SwizzleBlock4(pstart + getPixelAddress4_0(tempj, i, gs.dstbuf.bw)*blockbits/8, -// (u8*)pbuf, TRANSMIT_PITCH_4(pitch, T)); -// } -// } -// else { -// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH_4(blockwidth, T)/sizeof(T)) { -// SwizzleBlock4u(pstart + getPixelAddress4_0(tempj, i, gs.dstbuf.bw)*blockbits/8, -// (u8*)pbuf, TRANSMIT_PITCH_4(pitch, T)); -// } -// } -// -// /* transfer the rest */ -// if( alignedX < gs.imageEndX ) { -// TRANSMIT_HOSTLOCAL_X_4(4, T, widthlimit, blockheight, alignedX); -// pbuf -= TRANSMIT_PITCH_4((alignedX-gs.trxpos.dx), T)/sizeof(T); -// } -// else pbuf += (blockheight-1)*TRANSMIT_PITCH_4(pitch, T)/sizeof(T); -// j = 0; -// } -// -// if( TRANSMIT_PITCH_4(nSize, T)/4 > 0 ) { -// TRANSMIT_HOSTLOCAL_Y_4(4, T, widthlimit, gs.imageEndY); -// /* sometimes wrong sizes are sent (tekken tag) */ -// assert( gs.imageTransfer == -1 || TRANSMIT_PITCH_4(nSize,T)/4 <= 2 ); -// } -// -// END_HOSTLOCAL(); -//} +#endif void TransferLocalHost32(void* pbyMem, u32 nQWordSize) {FUNCLOG diff --git a/plugins/zzogl-pg/opengl/Mem.h b/plugins/zzogl-pg/opengl/Mem.h index 63317313a8..d29fa93ca0 100644 --- a/plugins/zzogl-pg/opengl/Mem.h +++ b/plugins/zzogl-pg/opengl/Mem.h @@ -23,14 +23,26 @@ #include // works only when base is a power of 2 -#define 
ROUND_UPPOW2(val, base) (((val)+(base-1))&~(base-1)) -#define ROUND_DOWNPOW2(val, base) ((val)&~(base-1)) -#define MOD_POW2(val, base) ((val)&(base-1)) +static __forceinline int ROUND_UPPOW2(int val, int base) { return (((val)+(base-1))&~(base-1)); } +static __forceinline int ROUND_DOWNPOW2(int val, int base) { return ((val)&~(base-1)); } +static __forceinline int MOD_POW2(int val, int base) { return ((val)&(base-1)); } // d3d texture dims -#define BLOCK_TEXWIDTH 128 -#define BLOCK_TEXHEIGHT 512 +const int BLOCK_TEXWIDTH = 128; +const int BLOCK_TEXHEIGHT = 512; +extern PCSX2_ALIGNED16(u32 tempblock[64]); + + +typedef u32 ( *_getPixelAddress)(int x, int y, u32 bp, u32 bw); +typedef u32 (*_getPixelAddress_0)(int x, int y, u32 bw); +typedef void (*_writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw); +typedef void (*_writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw); +typedef u32 (*_readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw); +typedef u32 (*_readPixel_0)(const void* pmem, int x, int y, u32 bw); +typedef int (*_TransferHostLocal)(const void* pbyMem, u32 nQWordSize); +typedef void (*_TransferLocalHost)(void* pbyMem, u32 nQWordSize); +typedef void (__fastcall *_SwizzleBlock)(u8 *dst, u8 *src, int pitch, u32 WriteMask); // rest not visible externally struct BLOCK { @@ -46,14 +58,14 @@ struct BLOCK u32* blockTable; u32* columnTable; - u32 (*getPixelAddress)(int x, int y, u32 bp, u32 bw); - u32 (*getPixelAddress_0)(int x, int y, u32 bw); - void (*writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw); - void (*writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw); - u32 (*readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw); - u32 (*readPixel_0)(const void* pmem, int x, int y, u32 bw); - int (*TransferHostLocal)(const void* pbyMem, u32 nQWordSize); - void (*TransferLocalHost)(void* pbyMem, u32 nQWordSize); + _getPixelAddress getPixelAddress; + _getPixelAddress_0 getPixelAddress_0; + _writePixel writePixel; + _writePixel_0 writePixel_0; + _readPixel readPixel; + _readPixel_0 readPixel_0; + _TransferHostLocal TransferHostLocal; + _TransferLocalHost TransferLocalHost; // texture must be of dims BLOCK_TEXWIDTH and BLOCK_TEXHEIGHT static void FillBlocks(std::vector& vBlockData, std::vector& vBilinearData, int floatfmt); @@ -84,19 +96,17 @@ extern u32 g_pageTable16SZ[64][64]; extern u32 g_pageTable8[64][128]; extern u32 g_pageTable4[128][128]; -static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = bp * 64 + basepage * 2048 + g_pageTable32[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } -static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = basepage * 2048 + g_pageTable32[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } @@ -109,210 +119,221 @@ static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { #define getPixelAddress4HH getPixelAddress32 #define getPixelAddress4HH_0 getPixelAddress32_0 -static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16[y&63][x&63]; - //assert (word 
< 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16S[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16S[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * ((bw+127)>>7)) + (x>>7); u32 word = bp * 256 + basepage * 8192 + g_pageTable8[y&63][x&127]; - //assert (word < 0x400000); - //word = min(word, 0x3fffff); return word; } -static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * ((bw+127)>>7)) + (x>>7); u32 word = basepage * 8192 + g_pageTable8[y&63][x&127]; - //assert (word < 0x400000); - //word = min(word, 0x3fffff); return word; } -static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>7) * ((bw+127)>>7)) + (x>>7); u32 word = bp * 512 + basepage * 16384 + g_pageTable4[y&127][x&127]; - //assert (word < 0x800000); - //word = min(word, 0x7fffff); return word; } -static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>7) * ((bw+127)>>7)) + (x>>7); u32 word = basepage * 16384 + g_pageTable4[y&127][x&127]; - //assert (word < 0x800000); - //word = min(word, 0x7fffff); return word; } -static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = bp * 64 + basepage * 2048 + g_pageTable32Z[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } -static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = basepage * 2048 + g_pageTable32Z[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } #define getPixelAddress24Z getPixelAddress32Z #define getPixelAddress24Z_0 getPixelAddress32Z_0 -static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16Z[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) { 
+static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16Z[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16SZ[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16SZ[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +#define getPixelAddress_0(psm,x,y,bw) getPixelAddress##psm##_0(x,y,bw) +#define getPixelAddress(psm,x,y,bp,bw) getPixelAddress##psm##(x,y,bp,bw) + + + +static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u32*)pmem)[getPixelAddress32(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel24(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel24(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32(x, y, bp, bw)]; u8 *pix = (u8*)&pixel; buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; } -static __forceinline void writePixel16(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel16S(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16S(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16S(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel8(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel8(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u8*)pmem)[getPixelAddress8(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel8H(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel8H(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u8*)pmem)[4*getPixelAddress32(x, y, bp, bw)+3] = pixel; } -static __forceinline void writePixel4(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u32 addr = getPixelAddress4(x, y, bp, bw); u8 pix = ((u8*)pmem)[addr/2]; if (addr & 0x1) ((u8*)pmem)[addr/2] = (pix & 0x0f) | (pixel << 4); else ((u8*)pmem)[addr/2] = (pix & 0xf0) | (pixel); } -static __forceinline void writePixel4HL(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4HL(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HL(x, y, bp, bw)+3; *p = (*p & 0xf0) | pixel; } -static __forceinline void writePixel4HH(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4HH(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HH(x, y, bp, 
bw)+3; *p = (*p & 0x0f) | (pixel<<4); } -static __forceinline void writePixel32Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel32Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel24Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel24Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *buf = (u8*)pmem + 4*getPixelAddress32Z(x, y, bp, bw); u8 *pix = (u8*)&pixel; buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; } -static __forceinline void writePixel16Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16Z(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)] = pixel; } /////////////// -static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32(x, y, bp, bw)]; } -static __forceinline u32 readPixel24(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel24(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32(x, y, bp, bw)] & 0xffffff; } -static __forceinline u32 readPixel16(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16(x, y, bp, bw)]; } -static __forceinline u32 readPixel16S(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16S(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16S(x, y, bp, bw)]; } -static __forceinline u32 readPixel8(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel8(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u8*)pmem)[getPixelAddress8(x, y, bp, bw)]; } -static __forceinline u32 readPixel8H(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel8H(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u8*)pmem)[4*getPixelAddress32(x, y, bp, bw) + 3]; } -static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 bw) +{ u32 addr = getPixelAddress4(x, y, bp, bw); u8 pix = ((const u8*)pmem)[addr/2]; if (addr & 0x1) @@ -320,31 +341,37 @@ static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 else return pix & 0xf; } -static __forceinline u32 readPixel4HL(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4HL(const void* pmem, int x, int y, u32 bp, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HL(x, y, bp, bw)+3; return *p & 0x0f; } -static __forceinline u32 readPixel4HH(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4HH(const void* pmem, int x, int y, u32 bp, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HH(x, y, bp, bw) + 3; return *p >> 4; } /////////////// -static 
__forceinline u32 readPixel32Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel32Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)]; } -static __forceinline u32 readPixel24Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel24Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)] & 0xffffff; } -static __forceinline u32 readPixel16Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16Z(x, y, bp, bw)]; } -static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)]; } @@ -352,135 +379,154 @@ static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, // Functions that take 0 bps // /////////////////////////////// -static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u32*)pmem)[getPixelAddress32_0(x, y, bw)] = pixel; } -static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32_0(x, y, bw)]; u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } -static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16_0(x, y, bw)] = pixel; } -static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16S_0(x, y, bw)] = pixel; } -static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u8*)pmem)[getPixelAddress8_0(x, y, bw)] = pixel; } -static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u8*)pmem)[4*getPixelAddress32_0(x, y, bw)+3] = pixel; } -static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u32 addr = getPixelAddress4_0(x, y, bw); u8 pix = ((u8*)pmem)[addr/2]; if (addr & 0x1) ((u8*)pmem)[addr/2] = (pix & 0x0f) | (pixel << 4); else ((u8*)pmem)[addr/2] = (pix & 0xf0) | (pixel); } -static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HL_0(x, y, bw)+3; *p = (*p & 0xf0) | pixel; } -static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *p = (u8*)pmem + 
4*getPixelAddress4HH_0(x, y, bw)+3; *p = (*p & 0x0f) | (pixel<<4); } -static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] = pixel; } -static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *buf = (u8*)pmem + 4*getPixelAddress32Z_0(x, y, bw); u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } -static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16Z_0(x, y, bw)] = pixel; } -static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)] = pixel; } /////////////// -static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)]; } -static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)] & 0xffffff; } -static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16_0(x, y, bw)]; } -static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16S_0(x, y, bw)]; } -static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u8*)pmem)[getPixelAddress8_0(x, y, bw)]; } -static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u8*)pmem)[4*getPixelAddress32_0(x, y, bw) + 3]; } -static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) +{ u32 addr = getPixelAddress4_0(x, y, bw); u8 pix = ((const u8*)pmem)[addr/2]; if (addr & 0x1) - return pix >> 4; - else return pix & 0xf; + return pix >> 4; + else + return pix & 0xf; } -static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HL_0(x, y, bw)+3; return *p & 0x0f; } -static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HH_0(x, y, bw) + 3; return *p >> 4; } /////////////// -static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 
readPixel32Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)]; } -static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] & 0xffffff; } -static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16Z_0(x, y, bw)]; } -static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)]; } diff --git a/plugins/zzogl-pg/opengl/Mem_Swizzle.h b/plugins/zzogl-pg/opengl/Mem_Swizzle.h new file mode 100644 index 0000000000..b40d6673ed --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Swizzle.h @@ -0,0 +1,123 @@ +/* ZeroGS KOSMOS + * Copyright (C) 2005-2006 zerofrog@gmail.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef MEM_SWIZZLE_H_INCLUDED +#define MEM_SWIZZLE_H_INCLUDED + +#include "GS.h" +#include "Mem.h" + +// special swizzle macros - which I converted to functions. 
+ +static __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch-24) + { + for(int bx = 0; bx < 8; ++bx, pnewsrc += 3) + { + pblock[bx] = *(u32*)pnewsrc; + } + } + + for(int bx = 0; bx < 7; ++bx, pnewsrc += 3) + { + /* might be 1 byte out of bounds of GS memory */ + pblock[bx] = *(u32*)pnewsrc; + } + + /* do 3 bytes for the last copy */ + *((u8*)pblock+28) = pnewsrc[0]; + *((u8*)pblock+29) = pnewsrc[1]; + *((u8*)pblock+30) = pnewsrc[2]; + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff); +} + +#define SwizzleBlock24u SwizzleBlock24 + +static __forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<24; + pblock[1] = u<<16; + pblock[2] = u<<8; + pblock[3] = u; + u = *(u32*)(pnewsrc+4); + pblock[4] = u<<24; + pblock[5] = u<<16; + pblock[6] = u<<8; + pblock[7] = u; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000); +} + +#define SwizzleBlock8Hu SwizzleBlock8H + +static __forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<28; + pblock[1] = u<<24; + pblock[2] = u<<20; + pblock[3] = u<<16; + pblock[4] = u<<12; + pblock[5] = u<<8; + pblock[6] = u<<4; + pblock[7] = u; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000); +} + +#define SwizzleBlock4HHu SwizzleBlock4HH + +static __forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<24; + pblock[1] = u<<20; + pblock[2] = u<<16; + pblock[3] = u<<12; + pblock[4] = u<<8; + pblock[5] = u<<4; + pblock[6] = u; + pblock[7] = u>>4; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000); +} + +#define SwizzleBlock4HLu SwizzleBlock4HL + + +#endif // MEM_SWIZZLE_H_INCLUDED diff --git a/plugins/zzogl-pg/opengl/Mem_Tables.cpp b/plugins/zzogl-pg/opengl/Mem_Tables.cpp new file mode 100644 index 0000000000..2c40ae1b4c --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Tables.cpp @@ -0,0 +1,236 @@ +/* ZeroGS KOSMOS + * Copyright (C) 2005-2006 zerofrog@gmail.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "GS.h" + +u32 g_blockTable32[4][8] = { + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +u32 g_blockTable32Z[4][8] = { + { 24, 25, 28, 29, 8, 9, 12, 13}, + { 26, 27, 30, 31, 10, 11, 14, 15}, + { 16, 17, 20, 21, 0, 1, 4, 5}, + { 18, 19, 22, 23, 2, 3, 6, 7} +}; + +u32 g_blockTable16[8][4] = { + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +u32 g_blockTable16S[8][4] = { + { 0, 2, 16, 18 }, + { 1, 3, 17, 19 }, + { 8, 10, 24, 26 }, + { 9, 11, 25, 27 }, + { 4, 6, 20, 22 }, + { 5, 7, 21, 23 }, + { 12, 14, 28, 30 }, + { 13, 15, 29, 31 } +}; + +u32 g_blockTable16Z[8][4] = { + { 24, 26, 16, 18 }, + { 25, 27, 17, 19 }, + { 28, 30, 20, 22 }, + { 29, 31, 21, 23 }, + { 8, 10, 0, 2 }, + { 9, 11, 1, 3 }, + { 12, 14, 4, 6 }, + { 13, 15, 5, 7 } +}; + +u32 g_blockTable16SZ[8][4] = { + { 24, 26, 8, 10 }, + { 25, 27, 9, 11 }, + { 16, 18, 0, 2 }, + { 17, 19, 1, 3 }, + { 28, 30, 12, 14 }, + { 29, 31, 13, 15 }, + { 20, 22, 4, 6 }, + { 21, 23, 5, 7 } +}; + +u32 g_blockTable8[4][8] = { + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +u32 g_blockTable4[8][4] = { + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +u32 g_columnTable32[8][8] = { + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 }, +}; + +u32 g_columnTable16[8][16] = { + { 0, 2, 8, 10, 16, 18, 24, 26, + 1, 3, 9, 11, 17, 19, 25, 27 }, + { 4, 6, 12, 14, 20, 22, 28, 30, + 5, 7, 13, 15, 21, 23, 29, 31 }, + { 32, 34, 40, 42, 48, 50, 56, 58, + 33, 35, 41, 43, 49, 51, 57, 59 }, + { 36, 38, 44, 46, 52, 54, 60, 62, + 37, 39, 45, 47, 53, 55, 61, 63 }, + { 64, 66, 72, 74, 80, 82, 88, 90, + 65, 67, 73, 75, 81, 83, 89, 91 }, + { 68, 70, 76, 78, 84, 86, 92, 94, + 69, 71, 77, 79, 85, 87, 93, 95 }, + { 96, 98, 104, 106, 112, 114, 120, 122, + 97, 99, 105, 107, 113, 115, 121, 123 }, + { 100, 102, 108, 110, 116, 118, 124, 126, + 101, 103, 109, 111, 117, 119, 125, 127 }, +}; + +u32 g_columnTable8[16][16] = { + { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 + 2, 6, 18, 22, 34, 38, 50, 54 }, + { 8, 12, 24, 28, 40, 44, 56, 60, + 10, 14, 26, 30, 42, 46, 58, 62 }, + { 33, 37, 49, 53, 1, 5, 17, 21, + 35, 39, 51, 55, 3, 7, 19, 23 }, + { 41, 45, 57, 61, 9, 13, 25, 29, + 43, 47, 59, 63, 11, 15, 27, 31 }, + { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 + 98, 102, 114, 118, 66, 70, 82, 86 }, + { 104, 108, 120, 124, 72, 76, 88, 92, + 106, 110, 122, 126, 74, 78, 90, 94 }, + { 65, 69, 81, 85, 97, 101, 113, 117, + 67, 71, 83, 87, 99, 103, 115, 119 }, + { 73, 77, 89, 93, 105, 109, 121, 125, + 75, 79, 91, 95, 107, 111, 123, 127 }, + { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 + 130, 134, 146, 150, 162, 166, 178, 182 }, + { 136, 140, 152, 156, 168, 172, 184, 188, + 138, 142, 154, 158, 170, 
174, 186, 190 }, + { 161, 165, 177, 181, 129, 133, 145, 149, + 163, 167, 179, 183, 131, 135, 147, 151 }, + { 169, 173, 185, 189, 137, 141, 153, 157, + 171, 175, 187, 191, 139, 143, 155, 159 }, + { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 + 226, 230, 242, 246, 194, 198, 210, 214 }, + { 232, 236, 248, 252, 200, 204, 216, 220, + 234, 238, 250, 254, 202, 206, 218, 222 }, + { 193, 197, 209, 213, 225, 229, 241, 245, + 195, 199, 211, 215, 227, 231, 243, 247 }, + { 201, 205, 217, 221, 233, 237, 249, 253, + 203, 207, 219, 223, 235, 239, 251, 255 }, +}; + +u32 g_columnTable4[16][32] = { + { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 + 2, 10, 34, 42, 66, 74, 98, 106, + 4, 12, 36, 44, 68, 76, 100, 108, + 6, 14, 38, 46, 70, 78, 102, 110 }, + { 16, 24, 48, 56, 80, 88, 112, 120, + 18, 26, 50, 58, 82, 90, 114, 122, + 20, 28, 52, 60, 84, 92, 116, 124, + 22, 30, 54, 62, 86, 94, 118, 126 }, + { 65, 73, 97, 105, 1, 9, 33, 41, + 67, 75, 99, 107, 3, 11, 35, 43, + 69, 77, 101, 109, 5, 13, 37, 45, + 71, 79, 103, 111, 7, 15, 39, 47 }, + { 81, 89, 113, 121, 17, 25, 49, 57, + 83, 91, 115, 123, 19, 27, 51, 59, + 85, 93, 117, 125, 21, 29, 53, 61, + 87, 95, 119, 127, 23, 31, 55, 63 }, + { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 + 194, 202, 226, 234, 130, 138, 162, 170, + 196, 204, 228, 236, 132, 140, 164, 172, + 198, 206, 230, 238, 134, 142, 166, 174 }, + { 208, 216, 240, 248, 144, 152, 176, 184, + 210, 218, 242, 250, 146, 154, 178, 186, + 212, 220, 244, 252, 148, 156, 180, 188, + 214, 222, 246, 254, 150, 158, 182, 190 }, + { 129, 137, 161, 169, 193, 201, 225, 233, + 131, 139, 163, 171, 195, 203, 227, 235, + 133, 141, 165, 173, 197, 205, 229, 237, + 135, 143, 167, 175, 199, 207, 231, 239 }, + { 145, 153, 177, 185, 209, 217, 241, 249, + 147, 155, 179, 187, 211, 219, 243, 251, + 149, 157, 181, 189, 213, 221, 245, 253, + 151, 159, 183, 191, 215, 223, 247, 255 }, + { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 + 258, 266, 290, 298, 322, 330, 354, 362, + 260, 268, 292, 300, 324, 332, 356, 364, + 262, 270, 294, 302, 326, 334, 358, 366 }, + { 272, 280, 304, 312, 336, 344, 368, 376, + 274, 282, 306, 314, 338, 346, 370, 378, + 276, 284, 308, 316, 340, 348, 372, 380, + 278, 286, 310, 318, 342, 350, 374, 382 }, + { 321, 329, 353, 361, 257, 265, 289, 297, + 323, 331, 355, 363, 259, 267, 291, 299, + 325, 333, 357, 365, 261, 269, 293, 301, + 327, 335, 359, 367, 263, 271, 295, 303 }, + { 337, 345, 369, 377, 273, 281, 305, 313, + 339, 347, 371, 379, 275, 283, 307, 315, + 341, 349, 373, 381, 277, 285, 309, 317, + 343, 351, 375, 383, 279, 287, 311, 319 }, + { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 + 450, 458, 482, 490, 386, 394, 418, 426, + 452, 460, 484, 492, 388, 396, 420, 428, + 454, 462, 486, 494, 390, 398, 422, 430 }, + { 464, 472, 496, 504, 400, 408, 432, 440, + 466, 474, 498, 506, 402, 410, 434, 442, + 468, 476, 500, 508, 404, 412, 436, 444, + 470, 478, 502, 510, 406, 414, 438, 446 }, + { 385, 393, 417, 425, 449, 457, 481, 489, + 387, 395, 419, 427, 451, 459, 483, 491, + 389, 397, 421, 429, 453, 461, 485, 493, + 391, 399, 423, 431, 455, 463, 487, 495 }, + { 401, 409, 433, 441, 465, 473, 497, 505, + 403, 411, 435, 443, 467, 475, 499, 507, + 405, 413, 437, 445, 469, 477, 501, 509, + 407, 415, 439, 447, 471, 479, 503, 511 }, +}; + +u32 g_pageTable32[32][64]; +u32 g_pageTable32Z[32][64]; +u32 g_pageTable16[64][64]; +u32 g_pageTable16S[64][64]; +u32 g_pageTable16Z[64][64]; +u32 g_pageTable16SZ[64][64]; +u32 g_pageTable8[64][128]; +u32 g_pageTable4[128][128]; diff --git 
a/plugins/zzogl-pg/opengl/Mem_Transmit.h b/plugins/zzogl-pg/opengl/Mem_Transmit.h new file mode 100644 index 0000000000..9e84ef2dc9 --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Transmit.h @@ -0,0 +1,184 @@ +#ifndef MEM_TRANSMIT_H_INCLUDED +#define MEM_TRANSMIT_H_INCLUDED + +#include "GS.h" +#include "Mem.h" + +#define DSTPSM gs.dstbuf.psm + +// transfers whole rows +#define TRANSMIT_HOSTLOCAL_Y_(psm, T, widthlimit, endY) { \ + assert( (nSize%widthlimit) == 0 && widthlimit <= 4 ); \ + if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { \ + /*GS_LOG("Bad Transmission! %d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 1) { \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ + } \ + } \ + } \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += widthlimit) { \ + /* write as many pixel at one time as possible */ \ + if( nSize < widthlimit ) goto End; \ + writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ + \ + if( widthlimit > 1 ) { \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, pbuf[1], gs.dstbuf.bw); \ + \ + if( widthlimit > 2 ) { \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, pbuf[2], gs.dstbuf.bw); \ + \ + if( widthlimit > 3 ) { \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, pbuf[3], gs.dstbuf.bw); \ + } \ + } \ + } \ + } \ + \ + if( j >= gs.imageEndX ) { assert(j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || nSize*sizeof(T)/4 == 0 ); goto End; } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = startX; j < gs.imageEndX; j++, pbuf++) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); \ + } \ + pbuf += pitch-fracX; \ + } \ +} \ + +//template +//static __forceinline void TransmitHostLocalX_(_writePixel_0 wp, u32 widthlimit, u32 blockheight, u32 startX) +//{ +// for(int tempi = 0; tempi < blockheight; ++tempi) +// { +// for(j = startX; j < gs.imageEndX; j++, pbuf++) +// { +// wp(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); +// } +// pbuf += pitch - fracX; +// } +//} + +// transfers whole rows +#define TRANSMIT_HOSTLOCAL_Y_24(psm, T, widthlimit, endY) { \ + if( widthlimit != 8 || ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { \ + /*GS_LOG("Bad Transmission! 
%d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 3) { \ + writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf), gs.dstbuf.bw); \ + } \ + \ + if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || nSize == 0 ); goto End; } \ + } \ + } \ + else { \ + assert( /*(nSize%widthlimit) == 0 &&*/ widthlimit == 8 ); \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += 3*widthlimit) { \ + if( nSize < widthlimit ) goto End; \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf+0), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *(u32*)(pbuf+3), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *(u32*)(pbuf+6), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *(u32*)(pbuf+9), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *(u32*)(pbuf+12), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *(u32*)(pbuf+15), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *(u32*)(pbuf+18), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *(u32*)(pbuf+21), gs.dstbuf.bw); \ + } \ + \ + if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { \ + if( nSize < 0 ) { \ + /* extracted too much */ \ + assert( (nSize%3)==0 && nSize > -24 ); \ + j += nSize/3; \ + nSize = 0; \ + } \ + assert( gs.imageTransfer == -1 || nSize == 0 ); \ + goto End; \ + } \ + } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_24(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = startX; j < gs.imageEndX; j++, pbuf += 3) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, *(u32*)pbuf, gs.dstbuf.bw); \ + } \ + pbuf += 3*(pitch-fracX); \ + } \ +} \ + + +// meant for 4bit transfers +#define TRANSMIT_HOSTLOCAL_Y_4(psm, T, widthlimit, endY) { \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit) { \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + if( widthlimit > 2 ) { \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + \ + if( widthlimit > 4 ) { \ + writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + \ + if( widthlimit > 6 ) { \ + writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + } \ + } \ + } \ + } \ + \ + if( j >= gs.imageEndX ) { j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || (nSize/32) == 0 ); goto End; } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_4(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = 
startX; j < gs.imageEndX; j+=2, pbuf++) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0]&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, (i+tempi)%2048, pbuf[0]>>4, gs.dstbuf.bw); \ + } \ + pbuf += (pitch-fracX)/2; \ + } \ +} \ + +#define TRANSMIT_HOSTLOCAL_X(th, psm, T, widthlimit, blockheight, startX) \ + TRANSMIT_HOSTLOCAL_X##th(psm, T, widthlimit, blockheight, startX) +#define TRANSMIT_HOSTLOCAL_Y(th, psm, T, widthlimit, endY) \ + TRANSMIT_HOSTLOCAL_Y##th(psm,T,widthlimit,endY) +// calculate pitch in source buffer + + +template +static __forceinline int TransmitPitch_(int pitch) { return (pitch * sizeof(T)); } +template +static __forceinline int TransmitPitch_24(int pitch) { return (pitch * 3); } +template +static __forceinline int TransmitPitch_4(int pitch) { return (pitch/2); } + +#define TRANSMIT_PITCH_(pitch, T) TransmitPitch_(pitch) +#define TRANSMIT_PITCH_24(pitch, T) TransmitPitch_24(pitch) +#define TRANSMIT_PITCH_4(pitch, T) TransmitPitch_4(pitch) + +#endif // MEM_TRANSMIT_H_INCLUDED diff --git a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj index ebdb1bac16..74ce2d3ad4 100644 --- a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj +++ b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj @@ -323,6 +323,10 @@ RelativePath="..\Mem.cpp" > + + @@ -439,6 +443,15 @@ RelativePath="..\Mem.h" > + + + + +
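The block-transfer setup repeated in each TransferHostLocal* variant above rests on the power-of-two helpers added to Mem.h and the per-format TransmitPitch_* helpers added to Mem_Transmit.h. The standalone sketch below uses the same formulas, but with hypothetical transfer-window and block sizes that are not taken from the patch; it shows how endY, alignedY and alignedX split a transfer into a slow per-pixel prologue/epilogue and a block-swizzled middle, and how the source pitch differs per pixel format.

#include <cstdio>

// Same power-of-two helpers the patch moves into Mem.h (base must be a power of 2).
static inline int RoundUpPow2(int val, int base)   { return (val + (base - 1)) & ~(base - 1); }
static inline int RoundDownPow2(int val, int base) { return val & ~(base - 1); }
static inline int ModPow2(int val, int base)       { return val & (base - 1); }

// Per-format source pitch in bytes, mirroring TransmitPitch_/_24/_4.
static inline int Pitch8(int texels)  { return texels * 1; }   // 8-bit: one byte per texel
static inline int Pitch24(int texels) { return texels * 3; }   // 24-bit: three bytes per texel
static inline int Pitch4(int texels)  { return texels / 2; }   // 4-bit: two texels per byte

int main()
{
    const int imageY = 5, imageEndY = 70, imageEndX = 100;    // hypothetical transfer window
    const int blockwidth = 16, blockheight = 16;              // hypothetical block size

    const int endY     = RoundUpPow2(imageY, blockheight);      // 16: first block-aligned row
    const int alignedY = RoundDownPow2(imageEndY, blockheight); // 64: end of the last full block row
    const int alignedX = RoundDownPow2(imageEndX, blockwidth);  // 96: end of the last full block column

    std::printf("endY=%d alignedY=%d alignedX=%d misalignment=%d\n",
                endY, alignedY, alignedX, ModPow2(imageY, blockheight));
    std::printf("source bytes per row of %d texels: 8bpp=%d 24bpp=%d 4bpp=%d\n",
                imageEndX, Pitch8(imageEndX), Pitch24(imageEndX), Pitch4(imageEndX));
    return 0;
}

With these inputs, rows 5-15 and 64-69 would go through the per-pixel TRANSMIT_HOSTLOCAL_Y_* path, rows 16-63 are swizzled a full block at a time, and any columns past alignedX are finished by TRANSMIT_HOSTLOCAL_X_*, matching the structure of the functions above.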