diff --git a/pcsx2/COP0.cpp b/pcsx2/COP0.cpp
index 0911eefde6..40f2bd7ae5 100644
--- a/pcsx2/COP0.cpp
+++ b/pcsx2/COP0.cpp
@@ -20,9 +20,10 @@
 u32 s_iLastCOP0Cycle = 0;
 u32 s_iLastPERFCycle[2] = { 0, 0 };
 
-__ri void UpdateCP0Status() {
-	//currently the 2 memory modes are not implemented. Given this function is called so much,
-	//it's commented out for now. Only the interrupt test is needed. (rama)
+// Updates the CPU's mode of operation (Kernel, Supervisor, or User mode).
+// Currently the different modes are not implemented.
+// Given this function is called so much, its body is commented out for now. (rama)
+__ri void cpuUpdateOperationMode() {
 
 	//u32 value = cpuRegs.CP0.n.Status.val;
 
@@ -32,7 +33,6 @@ __ri void UpdateCP0Status() {
 	//} else { // User Mode
 	//	memSetUserMode();
 	//}
-	cpuTestHwInts();
 }
 
 void __fastcall WriteCP0Status(u32 value) {
diff --git a/pcsx2/Hw.cpp b/pcsx2/Hw.cpp
index 44a0003d6a..1202a1911e 100644
--- a/pcsx2/Hw.cpp
+++ b/pcsx2/Hw.cpp
@@ -50,8 +50,7 @@ void hwReset()
 {
 	hwInit();
 
-	memzero_ptr<Ps2MemSize::Hardware>( eeHw );
-	//memset(eeHw+0x2000, 0, 0x0000e000);
+	memzero( eeHw );
 
 	psHu32(SBUS_F260) = 0x1D000060;
 
@@ -73,16 +72,16 @@ void hwReset()
 	ipuDmaReset();
 }
 
-__fi void intcInterrupt()
+__fi uint intcInterrupt()
 {
 	if ((psHu32(INTC_STAT)) == 0) {
 		//DevCon.Warning("*PCSX2*: intcInterrupt already cleared");
-        return;
+        return 0;
 	}
 	if ((psHu32(INTC_STAT) & psHu32(INTC_MASK)) == 0) 
 	{
 		//DevCon.Warning("*PCSX2*: No valid interrupt INTC_MASK: %x INTC_STAT: %x", psHu32(INTC_MASK), psHu32(INTC_STAT));
-		return;
+		return 0;
 	}
 
 	HW_LOG("intcInterrupt %x", psHu32(INTC_STAT) & psHu32(INTC_MASK));
@@ -91,27 +90,29 @@ __fi void intcInterrupt()
 		counters[1].hold = rcntRcount(1);
 	}
 
-	cpuException(0x400, cpuRegs.branch);
+	//cpuException(0x400, cpuRegs.branch);
+	return 0x400;
 }
 
-__fi void dmacInterrupt()
+__fi uint dmacInterrupt()
 {
 	if( ((psHu16(DMAC_STAT + 2) & psHu16(DMAC_STAT)) == 0 ) &&
 		( psHu16(DMAC_STAT) & 0x8000) == 0 ) 
 	{
 		//DevCon.Warning("No valid DMAC interrupt MASK %x STAT %x", psHu16(DMAC_STAT+2), psHu16(DMAC_STAT));
-		return;
+		return 0;
 	}
 
-	if (!(dmacRegs.ctrl.DMAE) || psHu8(DMAC_ENABLER+2) == 1) 
+	if (!dmacRegs.ctrl.DMAE || psHu8(DMAC_ENABLER+2) == 1) 
 	{
 		//DevCon.Warning("DMAC Suspended or Disabled on interrupt");
-		return;
+		return 0;
 	}
 	HW_LOG("dmacInterrupt %x", (psHu16(DMAC_STAT + 2) & psHu16(DMAC_STAT) |
-								  psHu16(DMAC_STAT) & 0x8000));
+								psHu16(DMAC_STAT) & 0x8000));
 
-	cpuException(0x800, cpuRegs.branch);
+	//cpuException(0x800, cpuRegs.branch);
+	return 0x800;
 }
 
 void hwIntcIrq(int n)
diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp
index 165e2a0b40..5e19cd9124 100644
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@@ -55,9 +55,6 @@ int coded_block_pattern = 0;
 u8 indx4[16*16/2];
 __aligned16 decoder_t decoder;
 
-__aligned16 u8 _readbits[80];	//local buffer (ring buffer)
-u8* readbits = _readbits;		// always can decrement by one 1qw
-
 __fi void IPUProcessInterrupt()
 {
 	if (ipuRegs.ctrl.BUSY && g_BP.IFC) IPUWorker();
@@ -96,8 +93,6 @@ void ReportIPU()
 	Console.WriteLn("g_decoder = 0x%x.", &decoder);
 	Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
 	Console.WriteLn(ipu_cmd.desc());
-	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
-		_readbits, readbits - _readbits);
 	Console.Newline();
 }
 
@@ -114,15 +109,6 @@ void SaveStateBase::ipuFreeze()
 	Freeze(coded_block_pattern);
 	Freeze(decoder);
 	Freeze(ipu_cmd);
-	Freeze(_readbits);
-
-	int temp = readbits - _readbits;
-	Freeze(temp);
-
-	if (IsLoading())
-	{
-		readbits = _readbits;
-	}
 }
 
 void tIPU_CMD_IDEC::log() const
@@ -213,21 +199,27 @@ __fi u32 ipuRead32(u32 mem)
 	switch (mem)
 	{
 		ipucase(IPU_CTRL): // IPU_CTRL
+		{
 			ipuRegs.ctrl.IFC = g_BP.IFC;
 			ipuRegs.ctrl.CBP = coded_block_pattern;
 
 			if (!ipuRegs.ctrl.BUSY)
 				IPU_LOG("read32: IPU_CTRL=0x%08X", ipuRegs.ctrl._u32);
 
-		return ipuRegs.ctrl._u32;
+			return ipuRegs.ctrl._u32;
+		}		
 
 		ipucase(IPU_BP): // IPU_BP
+		{
+			pxAssume(g_BP.FP <= 2);
+			
 			ipuRegs.ipubp = g_BP.BP & 0x7f;
 			ipuRegs.ipubp |= g_BP.IFC << 8;
-			ipuRegs.ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
+			ipuRegs.ipubp |= g_BP.FP << 16;
 
 			IPU_LOG("read32: IPU_BP=0x%08X", ipuRegs.ipubp);
-		return ipuRegs.ipubp;
+			return ipuRegs.ipubp;
+		}
 
 		default:
 			IPU_LOG("read32: Addr=0x%08X Value = 0x%08X", mem, psHu32(IPU_CMD + mem));
@@ -283,9 +275,7 @@ void ipuSoftReset()
 	ipu_cmd.clear();
 	ipuRegs.cmd.BUSY = 0;
 
-	g_BP.BP = 0;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+	memzero(g_BP);
 }
 
 __fi bool ipuWrite32(u32 mem, u32 value)
@@ -354,12 +344,11 @@ static void ipuBCLR(u32 val)
 {
 	ipu_fifo.in.clear();
 
+	memzero(g_BP);
 	g_BP.BP = val & 0x7F;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+
 	ipuRegs.ctrl.BUSY = 0;
 	ipuRegs.cmd.BUSY = 0;
-	memzero(_readbits);
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }
 
@@ -370,7 +359,7 @@ static bool ipuIDEC(u32 val, bool resume)
 	if (!resume)
 	{
 		idec.log();
-		g_BP.BP += idec.FB;//skip FB bits
+		g_BP.Advance(idec.FB);
 
 	//from IPU_CTRL
 		ipuRegs.ctrl.PCT = I_TYPE; //Intra DECoding;)
@@ -407,7 +396,7 @@ static __fi bool ipuBDEC(u32 val, bool resume)
 		bdec.log(s_bdec);
 		if (IsDebugBuild) s_bdec++;
 
-	g_BP.BP += bdec.FB;//skip FB bits
+		g_BP.Advance(bdec.FB);
 		decoder.coding_type			= I_TYPE;
 		decoder.mpeg1				= ipuRegs.ctrl.MP1;
 		decoder.q_scale_type		= ipuRegs.ctrl.QST;
@@ -433,11 +422,7 @@ static bool __fastcall ipuVDEC(u32 val)
 	switch (ipu_cmd.pos[0])
 	{
 		case 0:
-			ipuRegs.cmd.DATA = 0;
-			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return false;
-
-			decoder.bitstream_bits = -16;
-			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
+			if (!bitstream_init()) return false;
 
 			switch ((val >> 26) & 3)
 			{
@@ -459,17 +444,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				case 3://DMVector
 					ipuRegs.cmd.DATA = get_dmv();
 					break;
+
+				jNO_DEFAULT
 			}
 
-			g_BP.BP += (int)decoder.bitstream_bits + 16;
+			ipuRegs.cmd.DATA &= 0xFFFF;
+			ipuRegs.cmd.DATA |= 0x10000;
 
-			if ((int)g_BP.BP < 0)
-			{
-				g_BP.BP += 128;
-				ReorderBitstream();
-			}
-
-			ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
+			//ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);
 
 		case 1:
@@ -479,14 +461,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				return false;
 			}
 
-			BigEndian(ipuRegs.top, ipuRegs.top);
+			ipuRegs.top = BigEndian(ipuRegs.top);
 
 			IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
 			        ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
 			        ((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
 			return true;
 
-			jNO_DEFAULT
+		jNO_DEFAULT
 	}
 
 	return false;
@@ -496,7 +478,7 @@ static __fi bool ipuFDEC(u32 val)
 {
 	if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false;
 
-	BigEndian(ipuRegs.cmd.DATA, ipuRegs.cmd.DATA);
+	ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA);
 	ipuRegs.top = ipuRegs.cmd.DATA;
 
 	IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
@@ -553,11 +535,10 @@ static bool ipuSETVQ(u32 val)
 		if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false;
 	}
 
-	IPU_LOG("SETVQ command.\nRead VQCLUT table from FIFO.");
-	IPU_LOG(
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	IPU_LOG("SETVQ command.   Read VQCLUT table from FIFO.\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
 	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
 	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
 	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
@@ -723,148 +704,48 @@ __fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
 	Console.Error("IPU: VQ not implemented");
 }
 
-__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
-{
-	const u8	*s = (const u8*)&mb8;
-	s16	*d = (s16*)&mb16;
-	int i;
-	for (i = 0; i < 256; i++) *d++ = *s++;		//Y  bias	- 16
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cr bias	- 128
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cb bias	- 128
-}
-
 
 // --------------------------------------------------------------------------------------
 //  Buffer reader
 // --------------------------------------------------------------------------------------
 
-// move the readbits queue
-__fi void inc_readbits()
+__ri u32 UBITS(uint bits)
 {
-	readbits += 16;
-	if (readbits >= _readbits + 64)
-	{
-		// move back
-		*(u64*)(_readbits) = *(u64*)(_readbits + 64);
-		*(u64*)(_readbits + 8) = *(u64*)(_readbits + 72);
-		readbits = _readbits;
-	}
+	uint readpos8 = g_BP.BP/8;
+
+	uint result = BigEndian(*(u32*)( (u8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
+
+	return result;
 }
 
-// returns the pointer of readbits moved by 1 qword
-__fi u8* next_readbits()
+__ri s32 SBITS(uint bits)
 {
-	return readbits + 16;
-}
+	// Read an unaligned 32 bit value and then shift the bits up and then back down.
 
-// returns the pointer of readbits moved by 1 qword
-u8* prev_readbits()
-{
-	if (readbits < _readbits + 16) return _readbits + 48 - (readbits - _readbits);
+	uint readpos8 = g_BP.BP/8;
 
-	return readbits - 16;
-}
+	int result = BigEndian(*(s32*)( (s8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
 
-void ReorderBitstream()
-{
-	readbits = prev_readbits();
-	g_BP.FP = 2;
-}
-
-// IPU has a 2qword internal buffer whose status is pointed by FP.
-// If FP is 1, there's 1 qword in buffer. Second qword is only loaded
-// incase there are less than 32bits available in the first qword.
-// \return Number of bits available (clamps at 16 bits)
-u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
-{
-	if (g_BP.FP == 0)
-	{
-		if (ipu_fifo.in.read(next_readbits()) == 0) return 0;
-
-		inc_readbits();
-		g_BP.FP = 1;
-	}
-
-	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
-	{
-		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
-	}
-
-	if (*(int*)pointer >= 128)
-	{
-		pxAssert(g_BP.FP >= 1);
-
-		if (g_BP.FP > 1) inc_readbits();
-
-		if (advance)
-		{
-			g_BP.FP--;
-			*pointer &= 127;
-		}
-	}
-
-	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
+	return result;
 }
 
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits128(u8 *address, u32 advance)
+u8 getBits64(u8 *address, bool advance)
 {
-	u64 mask2;
-	u128 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(64)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask2 = 0xff >> shift;
-		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
-		mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);		
-
-		u128 notMask;
-		u128 data = *(u128*)(readpos + 1);
-		notMask.lo = ~mask.lo & data.lo;
-		notMask.hi = ~mask.hi & data.hi;
-		notMask.lo >>= 8 - shift;
-		notMask.lo |= (notMask.hi & (ULLONG_MAX >> (64 - shift))) << (64 - shift);
-		notMask.hi >>= 8 - shift;
-
-		mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift));
-		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
-		
-		notMask.lo |= mask.lo;
-		notMask.hi |= mask.hi;
-		*(u128*)address = notMask;
-	}
-	else
-	{
-		*(u128*)address = *(u128*)readpos;
-	}
-
-	if (advance) g_BP.BP += 128;
-
-	return 1;
-}
-
-// whenever reading fractions of bytes. The low bits always come from the next byte
-// while the high bits come from the current byte
-u8 __fastcall getBits64(u8 *address, u32 advance)
-{
-	register u64 mask = 0;
-	u8* readpos;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
-	{
-		mask = (0xff >> shift);
+		u64 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);
 
 		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
@@ -874,89 +755,76 @@ u8 __fastcall getBits64(u8 *address, u32 advance)
 		*(u64*)address = *(u64*)readpos;
 	}
 
-	if (advance) g_BP.BP += 64;
+	if (advance) g_BP.Advance(64);
 
 	return 1;
 }
 
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits32(u8 *address, u32 advance)
+__fi u8 getBits32(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(32)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 32) < 32) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
+	const u8* readpos = &g_BP.internal_qwc->_u8[g_BP.BP/8];
+	
+	if(uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		u32 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24);
 
 		*(u32*)address = ((~mask & *(u32*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u32*)readpos) << shift);
 	}
 	else
 	{
+		// Bit position-aligned -- no masking/shifting necessary
 		*(u32*)address = *(u32*)readpos;
 	}
 
-	if (advance) g_BP.BP += 32;
+	if (advance) g_BP.Advance(32);
 
 	return 1;
 }
 
-__fi u8 __fastcall getBits16(u8 *address, u32 advance)
+__fi u8 getBits16(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(16)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 16) < 16) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		mask = mask | (mask << 8);
-
 		*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
-			}
+	}
 	else
 	{
 		*(u16*)address = *(u16*)readpos;
-			}
+	}
 
-	if (advance) g_BP.BP += 16;
+	if (advance) g_BP.Advance(16);
 
 	return 1;
 }
 
-u8 __fastcall getBits8(u8 *address, u32 advance)
+u8 getBits8(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(8)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8)
-		return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
-			{
-		mask = (0xff >> shift);
+	{
+		uint mask = (0xff >> shift);
 		*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
-			}
+	}
 	else
 	{
 		*(u8*)address = *(u8*)readpos;
-		}
+	}
 
-	if (advance) g_BP.BP += 8;
+	if (advance) g_BP.Advance(8);
 
 	return 1;
 }
@@ -983,7 +851,7 @@ void IPUCMD_WRITE(u32 val)
 
 		case SCE_IPU_VDEC:
 
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 
 			// check if enough data in queue
 			if (ipuVDEC(val)) return;
@@ -993,9 +861,11 @@ void IPUCMD_WRITE(u32 val)
 			break;
 
 		case SCE_IPU_FDEC:
-			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, FP %d, CHCR 0x%x",
-			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, g_BP.FP, ipu1dma.chcr._u32);
-			g_BP.BP += val & 0x3F;
+			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, CHCR 0x%x",
+			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, ipu1dma.chcr._u32);
+
+			g_BP.Advance(val & 0x3F);
+
 			if (ipuFDEC(val)) return;
 			ipuRegs.cmd.BUSY = 0x80000000;
 			ipuRegs.topbusy = 0x80000000;
@@ -1009,7 +879,7 @@ void IPUCMD_WRITE(u32 val)
 		case SCE_IPU_SETIQ:
 			IPU_LOG("SETIQ command.");
 			if (val & 0x3f) IPU_LOG("Skip %d bits.", val & 0x3f);
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 			if (ipuSETIQ(val)) return;
 			break;
 
diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h
index e33c211b3e..6759e547db 100644
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@@ -67,11 +67,66 @@ union tIPU_CTRL {
 	void reset() { _u32 = 0; }
 };
 
-struct tIPU_BP {
-	u32 BP;		// Bit stream point
-	u16 IFC;	// Input FIFO counter
-	u8 FP;		// FIFO point
-	u8 bufferhasnew; // Always 0.
+struct __aligned16 tIPU_BP {
+	__aligned16 u128 internal_qwc[2];
+
+	u32 BP;		// Bit stream point (0 to 128*2)
+	u32 IFC;	// Input FIFO counter (8QWC) (0 to 8)
+	u32 FP;		// internal FIFO (2QWC) fill status (0 to 2)
+
+	__fi void Align()	// Rounds BP up to the next multiple of 8 bits (byte boundary).
+	{
+		BP = (BP + 7) & ~7;
+		Advance(0);	// zero-length advance: lets Advance() wrap BP / update FP if rounding pushed BP past 127
+	}
+
+	__fi void Advance(uint bits)	// Moves the bit pointer forward by 'bits', retiring the first quadword once BP passes it.
+	{
+		BP += bits;
+		pxAssume( BP <= 256 );	// BP is documented on the struct as ranging over the two quadwords (0 to 128*2)
+
+		if (BP > 127)
+		{
+			BP -= 128;
+
+			if (FP == 2)
+			{
+				// When BP reaches 128 or more we're reading data from the second quadword.  Shift that one
+				// to the front; FillBuffer() refills the second QWC later (it's a manualized ringbuffer!)
+
+				CopyQWC(&internal_qwc[0], &internal_qwc[1]);
+				FP = 1;
+			}
+			else
+			{
+				// if FP == 1 then the buffer has been completely drained.
+				// if FP == 0 then an already-drained buffer is being advanced.
+				// In either case we just assign FP to 0.
+
+				FP = 0;
+			}
+		}
+	}
+
+	__fi bool FillBuffer(u32 bits)	// Tops up the internal buffer from the input FIFO; true when 'bits' bits can be read at BP.
+	{
+		while (FP < 2)
+		{
+			if (ipu_fifo.in.read(&internal_qwc[FP]) == 0)
+			{
+				// Here we *try* to fill the entire internal QWC buffer; however that may not necessarily
+				// be possible -- so if the fill fails we only report failure when the quadword already
+				// held in the internal buffer does not contain enough remaining bits for the request.
+
+				return ((FP!=0) && (BP + bits) <= 128);
+			}
+
+			++FP;
+		}
+
+		return true;
+	}
+
 	wxString desc() const
 	{
 		return wxsFormat(L"Ipu BP: bp = 0x%x, IFC = 0x%x, FP = 0x%x.", BP, IFC, FP);
@@ -217,10 +272,9 @@ extern void IPUCMD_WRITE(u32 val);
 extern void ipuSoftReset();
 extern void IPUProcessInterrupt();
 
-extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
-extern u8 __fastcall getBits128(u8 *address, u32 advance);
-extern u8 __fastcall getBits64(u8 *address, u32 advance);
-extern u8 __fastcall getBits32(u8 *address, u32 advance);
-extern u8 __fastcall getBits16(u8 *address, u32 advance);
-extern u8 __fastcall getBits8(u8 *address, u32 advance);
+extern u8 getBits128(u8 *address, bool advance);
+extern u8 getBits64(u8 *address, bool advance);
+extern u8 getBits32(u8 *address, bool advance);
+extern u8 getBits16(u8 *address, bool advance);
+extern u8 getBits8(u8 *address, bool advance);
 
diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp
index 2c2902cf6f..25b0aad6f5 100644
--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@@ -19,7 +19,6 @@
 #include "IPU/IPUdma.h"
 #include "mpeg2lib/Mpeg.h"
 
-
 __aligned16 IPU_Fifo ipu_fifo;
 
 void IPU_Fifo::init()
@@ -75,10 +74,7 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
 
 	while (transsize-- > 0)
 	{
-		for (int i = 0; i <= 3; i++)
-		{
-			data[writepos + i] = pMem[i];
-		}
+		CopyQWC(&data[writepos], pMem);
 		writepos = (writepos + 4) & 31;
 		pMem += 4;
 	}
@@ -86,118 +82,100 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
 	return firsttrans;
 }
 
-int IPU_Fifo_Output::write(const u32 *value, int size)
-{
-	int transsize, firsttrans;
-
-	if ((int)ipuRegs.ctrl.OFC >= 8) IPU0dma();
-
-	transsize = min(size, 8 - (int)ipuRegs.ctrl.OFC);
-	firsttrans = transsize;
-
-	while (transsize-- > 0)
-	{
-		for (int i = 0; i <= 3; i++)
-		{
-			data[writepos + i] = ((u32*)value)[i];
-		}
-		writepos = (writepos + 4) & 31;
-		value += 4;
-	}
-
-	ipuRegs.ctrl.OFC += firsttrans;
-	IPU0dma();
-
-	return firsttrans;
-}
-
 int IPU_Fifo_Input::read(void *value)
 {
 	// wait until enough data to ensure proper streaming.
-	if (g_BP.IFC < 4)
+	if (g_BP.IFC < 3)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
-			CPU_INT( DMAC_TO_IPU, 4 );
+			CPU_INT( DMAC_TO_IPU, 32 );
 		}
-		
+
 		if (g_BP.IFC == 0) return 0;
 		pxAssert(g_BP.IFC > 0);
 	}
 
-	// transfer 1 qword, split into two transfers
-	for (int i = 0; i <= 3; i++)
-	{
-		((u32*)value)[i] = data[readpos + i];
-		data[readpos + i] = 0;
-	}
+	CopyQWC(value, &data[readpos]);
 
 	readpos = (readpos + 4) & 31;
 	g_BP.IFC--;
 	return 1;
 }
 
-void IPU_Fifo_Output::_readsingle(void *value)
+int IPU_Fifo_Output::write(const u32 *value, uint size)	// Queues up to 'size' QWCs into the output FIFO; returns how many were accepted.
 {
-	// transfer 1 qword, split into two transfers
-	for (int i = 0; i <= 3; i++)
+	pxAssumeMsg(size>0, "Invalid size==0 when calling IPU_Fifo_Output::write");
+
+	uint origsize = size;
+	do {
+		IPU0dma();	// let the DMA drain queued QWCs before measuring free space
+	
+		uint transsize = min(size, 8 - (uint)ipuRegs.ctrl.OFC);	// free slots: the FIFO count tops out at 8 QWCs
+		if(!transsize) break;	// either everything was sent or the FIFO is still full
+		// Accumulate (not assign): QWCs that IPU0dma() could not drain are still queued and must stay counted.
+		ipuRegs.ctrl.OFC += transsize;
+		size -= transsize;
+		while (transsize > 0)
+		{
+			CopyQWC(&data[writepos], value);
+			writepos = (writepos + 4) & 31;	// data[] is indexed in 32-bit words: 4 per QWC, wrapping at 32
+			value += 4;
+			--transsize;
+		}
+	} while(true);
+
+	return origsize - size;
+
+#if 0
+	if (ipuRegs.ctrl.OFC >= 8) IPU0dma();
+
+	uint transsize = min(size, 8 - (uint)ipuRegs.ctrl.OFC);
+	uint firsttrans = transsize;
+
+	while (transsize > 0)
 	{
-		((u32*)value)[i] = data[readpos + i];
-		data[readpos + i] = 0;
+		CopyQWC(&data[writepos], value);
+		writepos = (writepos + 4) & 31;
+		value += 4;
+		--transsize;
 	}
-	readpos = (readpos + 4) & 31;
+
+	ipuRegs.ctrl.OFC += firsttrans;
+	IPU0dma();
+
+	return firsttrans;
+#endif
 }
 
-void IPU_Fifo_Output::read(void *value, int size)
+void IPU_Fifo_Output::read(void *value, uint size)
 {
+	pxAssume(ipuRegs.ctrl.OFC >= size);
 	ipuRegs.ctrl.OFC -= size;
+	
+	// Zeroing the read data is not needed, since the ringbuffer design will never read back
+	// the zero'd data anyway. --air
+
+	//__m128 zeroreg = _mm_setzero_ps();
 	while (size > 0)
 	{
-		_readsingle(value);
-		value = (u32*)value + 4;
-		size--;
+		CopyQWC(value, &data[readpos]);
+		//_mm_store_ps((float*)&data[readpos], zeroreg);
+
+		readpos = (readpos + 4) & 31;
+		value = (u128*)value + 1;
+		--size;
 	}
 }
 
-void IPU_Fifo_Output::readsingle(void *value)
-{
-	if (ipuRegs.ctrl.OFC > 0)
-	{
-		ipuRegs.ctrl.OFC--;
-		_readsingle(value);
-	}
-}
-
-__fi bool decoder_t::ReadIpuData(u128* out)
-{
-	if(ipu0_data == 0)
-	{
-		IPU_LOG( "ReadFIFO/IPUout -> (fifo empty/no data available)" );
-		return false;
-	}
-
-	CopyQWC(out, GetIpuDataPtr());
-
-	--ipu0_data;
-	++ipu0_idx;
-
-	IPU_LOG( "ReadFIFO/IPUout -> %ls", out->ToString().c_str() );
-
-	return true;
-}
-
 void __fastcall ReadFIFO_IPUout(mem128_t* out)
 {
-	// FIXME!  When ReadIpuData() doesn't succeed (returns false), the EE should probably stall
-	// until a value becomes available.  This isn't exactly easy to do since the virtualized EE
-	// in PCSX2 *has* to be running in order for the IPU DMA to upload new input data to allow
-	// IPUout's FIFO to fill.  Thus if we implement an EE stall, PCSX2 deadlocks.  Grr.  --air
+	if (!pxAssertDev( ipuRegs.ctrl.OFC > 0, "Attempted read from IPUout's FIFO, but the FIFO is empty!" )) return;
+	ipu_fifo.out.read(out, 1);	// pops exactly one QWC into 'out' and decrements OFC
 
-	if (decoder.ReadIpuData(out))
-	{
-		ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
-	}
+	// Games should always check the FIFO before reading from it -- so if the FIFO has no data
+	// it's either a glitchy game or a bug in pcsx2.
 }
 
 void __fastcall WriteFIFO_IPUin(const mem128_t* value)
diff --git a/pcsx2/IPU/IPU_Fifo.h b/pcsx2/IPU/IPU_Fifo.h
index 10a1e940d3..69d2eab597 100644
--- a/pcsx2/IPU/IPU_Fifo.h
+++ b/pcsx2/IPU/IPU_Fifo.h
@@ -37,13 +37,10 @@ struct IPU_Fifo_Output
 	int readpos, writepos;
 
 	// returns number of qw read
-	int write(const u32 * value, int size);
-	void read(void *value,int size);
-	void readsingle(void *value);
+	int write(const u32 * value, uint size);
+	void read(void *value, uint size);
 	void clear();
 	wxString desc() const;
-
-	void _readsingle(void *value);
 };
 
 struct IPU_Fifo
diff --git a/pcsx2/IPU/IPUdma.cpp b/pcsx2/IPU/IPUdma.cpp
index 5a5949e0b0..26e9e80a6a 100644
--- a/pcsx2/IPU/IPUdma.cpp
+++ b/pcsx2/IPU/IPUdma.cpp
@@ -189,7 +189,7 @@ int IPU1dma()
 			{
 				if(!WaitGSPaths())
 				{ // legacy WaitGSPaths() for now
-					IPU_INT_TO(4); //Give it a short wait.
+					IPU_INT_TO(32); //Give it a short wait.
 					return totalqwc;
 				}
 				IPU_LOG("Processing Normal QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@@ -203,7 +203,7 @@ int IPU1dma()
 				{
 					if(!WaitGSPaths())
 					{ // legacy WaitGSPaths() for now
-						IPU_INT_TO(4); //Give it a short wait.
+						IPU_INT_TO(32); //Give it a short wait.
 						return totalqwc;
 					}
 					IPU_LOG("Processing Chain QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@@ -283,7 +283,7 @@ int IPU1dma()
 
 					if(!WaitGSPaths() && ipu1dma.qwc > 0)
 					{ // legacy WaitGSPaths() for now
-						IPU_INT_TO(4); //Give it a short wait.
+						IPU_INT_TO(32); //Give it a short wait.
 						return totalqwc;
 					}
 					IPU_LOG("Processing Start Chain QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@@ -312,8 +312,9 @@ int IPU1dma()
 
 int IPU0dma()
 {
+	if(!ipuRegs.ctrl.OFC) return 0;
+
 	int readsize;
-	static int totalsize = 0;
 	tDMA_TAG* pMem;
 
 	if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0))
@@ -329,7 +330,6 @@ int IPU0dma()
 	pMem = dmaGetAddr(ipu0dma.madr, true);
 
 	readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC);
-	totalsize+=readsize;
 	ipu_fifo.out.read(pMem, readsize);
 
 	ipu0dma.madr += readsize << 4;
@@ -363,7 +363,6 @@ int IPU0dma()
 		//This broke vids in Digital Devil Saga
 		//Note that interrupting based on totalsize is just guessing..
 		IPU_INT_FROM( readsize * BIAS );
-		totalsize = 0;
 	}
 
 	return readsize;
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
index 27edd38f89..bdef4b2d49 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@@ -47,10 +47,14 @@ const int non_linear_quantizer_scale [] =
 	into 1st slot is copied to the 2nd slot. Which will later be copied
 	back to the 1st slot when 128bits have been read.
 */
-extern void ReorderBitstream();
 const DCTtab * tab;
 int mbaCount = 0;
 
+int bitstream_init ()	// Ensures the IPU internal buffer can serve a 32-bit read; returns FillBuffer(32)'s result (0 on failure).
+{
+	return g_BP.FillBuffer(32);
+}
+
 int get_macroblock_modes()
 {
 	int macroblock_modes;
@@ -221,9 +225,7 @@ int __fi get_motion_delta(const int f_code)
 
 int __fi get_dmv()
 {
-	const DMVtab * tab;
-
-	tab = DMV_2 + UBITS(2);
+	const DMVtab* tab = DMV_2 + UBITS(2);
 	DUMPBITS(tab->len);
 	return tab->dmv;
 }
@@ -239,22 +241,21 @@ int get_macroblock_address_increment()
 	else if (code >= 768)
 		mba = MBA.mba11 + (UBITS(11) - 24);
 	else switch (UBITS(11))
-		{
+	{
+		case 8:		/* macroblock_escape */
+			DUMPBITS(11);
+			return 0x23;
 
-			case 8:		/* macroblock_escape */
+		case 15:	/* macroblock_stuffing (MPEG1 only) */
+			if (decoder.mpeg1)
+			{
 				DUMPBITS(11);
-				return 0x23;
+				return 0x22;
+			}
 
-			case 15:	/* macroblock_stuffing (MPEG1 only) */
-				if (decoder.mpeg1)
-				{
-					DUMPBITS(11);
-					return 0x22;
-				}
-
-			default:
-				return 0;//error
-		}
+		default:
+			return 0;//error
+	}
 
 	DUMPBITS(mba->len);
 
@@ -336,11 +337,8 @@ do {							\
 	val = (((s32)val) >> 31) ^ 2047;			\
 } while (0)
 
-static __fi bool get_intra_block()
+static bool get_intra_block()
 {
-	int i;
-	int j;
-	int val;
 	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
 	const u8 (&quant_matrix)[64] = decoder.iq;
 	int quantizer_scale = decoder.quantizer_scale;
@@ -348,7 +346,7 @@ static __fi bool get_intra_block()
 	u16 code; 
 
 	/* decode AC coefficients */
-  for (i=1 + ipu_cmd.pos[4]; ; i++)
+  for (int i=1 + ipu_cmd.pos[4]; ; i++)
   {
 	  switch (ipu_cmd.pos[5])
 	  {
@@ -427,60 +425,65 @@ static __fi bool get_intra_block()
 			return true;
 		}
 		
-		i+= tab->run == 65 ? GETBITS(6) : tab->run;
+		i += (tab->run == 65) ? GETBITS(6) : tab->run;
 		if (i >= 64)
 		{
 			ipu_cmd.pos[4] = 0;
 			return true;
 		}
+
 	  case 1:
-		if (!GETWORD())
-		{
-		  ipu_cmd.pos[4] = i - 1;
-		  ipu_cmd.pos[5] = 1;
-		  return false;
+	  {
+			if (!GETWORD())
+			{
+				ipu_cmd.pos[4] = i - 1;
+				ipu_cmd.pos[5] = 1;
+				return false;
+			}
+
+			uint j = scan[i];
+			int val;
+
+			if (tab->run==65) /* escape */
+			{
+				if(!decoder.mpeg1)
+				{
+				  val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
+				  DUMPBITS(12);
+				}
+				else
+				{
+				  val = SBITS(8);
+				  DUMPBITS(8);
+
+				  if (!(val & 0x7f))
+				  {
+					val = GETBITS(8) + 2 * val;
+				  }
+
+				  val = (val * quantizer_scale * quant_matrix[i]) >> 4;
+				  val = (val + ~ (((s32)val) >> 31)) | 1;
+				}
+			}
+			else
+			{
+				val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+				if(decoder.mpeg1)
+				{
+					/* oddification */
+					val = (val - 1) | 1;
+				}
+
+				/* if (bitstream_get (1)) val = -val; */
+				int bit1 = SBITS(1);
+				val = (val ^ bit1) - bit1;
+				DUMPBITS(1);
+			}
+
+			SATURATE(val);
+			dest[j] = val;
+			ipu_cmd.pos[5] = 0;
 		}
-
-		j = scan[i];
-
-		if (tab->run==65) /* escape */
-		{
-		  if(!decoder.mpeg1)
-		  {
-			  val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
-			  DUMPBITS(12);
-		  }
-		  else
-		  {
-			  val = SBITS(8);
-			  DUMPBITS(8);
-
-			  if (!(val & 0x7f))
-			  {
-				val = GETBITS(8) + 2 * val;
-			  }
-			
-			  val = (val * quantizer_scale * quant_matrix[i]) >> 4;
-			  val = (val + ~ (((s32)val) >> 31)) | 1;
-		  }
-		}
-		else
-		{
-		  val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
-		  if(decoder.mpeg1)
-		  {
-			/* oddification */
-			val = (val - 1) | 1;
-		  }
-
- 		  /* if (bitstream_get (1)) val = -val; */
-		  val = (val ^ SBITS(1)) - SBITS(1);
-		  DUMPBITS(1);
-		}
-
-		SATURATE(val);
-		dest[j] = val;
-		ipu_cmd.pos[5] = 0;
 	 }
   }
 
@@ -488,7 +491,7 @@ static __fi bool get_intra_block()
   return true;
 }
 
-static __fi bool get_non_intra_block(int * last)
+static bool get_non_intra_block(int * last)
 {
 	int i;
 	int j;
@@ -614,8 +617,9 @@ static __fi bool get_non_intra_block(int * last)
 			}
 			else
 			{
+				int bit1 = SBITS(1);
 				val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-				val = (val ^ SBITS(1)) - SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}
 
@@ -682,25 +686,11 @@ void __fi finishmpeg2sliceIDEC()
 {
 	ipuRegs.ctrl.SCD = 0;
 	coded_block_pattern = decoder.coded_block_pattern;
-
-	g_BP.BP += decoder.bitstream_bits - 16;
-
-	if ((int)g_BP.BP < 0)
-	{
-		g_BP.BP = 128 + (int)g_BP.BP;
-
-		// After BP is positioned correctly, we need to reload the old buffer
-		// so that reading may continue properly
-		ReorderBitstream();
-	}
-
-	FillInternalBuffer(&g_BP.BP, 1, 0);
 }
 
 bool mpeg2sliceIDEC()
 {
 	u16 code;
-	u8 bit8;
 
 	switch (ipu_cmd.pos[0])
 	{
@@ -798,6 +788,9 @@ bool mpeg2sliceIDEC()
 						ipu_cmd.pos[2] = 6;
 						return false;
 					}
+					break;
+
+				jNO_DEFAULT;
 				}
 
 				// Send The MacroBlock via DmaIpuFrom
@@ -812,23 +805,23 @@ bool mpeg2sliceIDEC()
 				}
 
 			case 2:
-				while (decoder.ipu0_data > 0)
-				{
-					uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+			{
+				pxAssume(decoder.ipu0_data > 0);
 
-					if (read == 0)
-					{
-						ipu_cmd.pos[1] = 2;
-						return false;
-					}
-					else
-					{
-						decoder.AdvanceIpuDataBy(read);
-					}
+				uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+				decoder.AdvanceIpuDataBy(read);
+
+				if (decoder.ipu0_data != 0)
+				{
+					// IPU FIFO filled up -- Will have to finish transferring later.
+					ipu_cmd.pos[1] = 2;
+					return false;
 				}
 
 				decoder.mbc++;
 				mbaCount = 0;
+			}
+			
 			case 3:
 				while (1)
 				{
@@ -851,18 +844,18 @@ bool mpeg2sliceIDEC()
 					}
 					else switch (UBITS(11))
 					{
-							case 8:		/* macroblock_escape */
-								mbaCount += 33;
-								/* pass through */
+						case 8:		/* macroblock_escape */
+							mbaCount += 33;
+							/* fall through */
 
-							case 15:	/* macroblock_stuffing (MPEG1 only) */
-								DUMPBITS(11);
-								continue;
+						case 15:	/* macroblock_stuffing (MPEG1 only) */
+							DUMPBITS(11);
+							continue;
 
-							default:	/* end of slice/frame, or error? */
-							{
-								goto finish_idec;	
-							}
+						default:	/* end of slice/frame, or error? */
+						{
+							goto finish_idec;	
+						}
 					}
 				}
 
@@ -886,17 +879,20 @@ bool mpeg2sliceIDEC()
 				}
 
 				break;
+
+			jNO_DEFAULT;
 			}
 
 			ipu_cmd.pos[1] = 0;
 			ipu_cmd.pos[2] = 0;
 		}
-		
+
 finish_idec:
 		finishmpeg2sliceIDEC();
 
 	case 3:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 3;
@@ -905,10 +901,10 @@ finish_idec:
 
 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
+	}
 
 	case 4:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
@@ -917,8 +913,10 @@ finish_idec:
 			return false;
 		}
 
-		BigEndian(ipuRegs.top, ipuRegs.top);
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
+
+	jNO_DEFAULT;
 	}
 
 	return true;
@@ -927,7 +925,6 @@ finish_idec:
 bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
-	u8 bit8;
 
 	macroblock_8& mb8 = decoder.mb8;
 	macroblock_16& mb16 = decoder.mb16;
@@ -1010,9 +1007,35 @@ bool mpeg2_slice()
 					return false;
 				}
 				break;
+
+			jNO_DEFAULT;
 			}
 
-			ipu_copy(mb8, mb16);
+			// Copy macroblock8 to macroblock16 - without sign extension.
+			// Manually inlined due to MSVC refusing to inline the SSE-optimized version.
+			{
+				const u8	*s = (const u8*)&mb8;
+				u16			*d = (u16*)&mb16;
+
+				//Y  bias	- 16 * 16
+				//Cr bias	- 8 * 8
+				//Cb bias	- 8 * 8
+
+				__m128i zeroreg = _mm_setzero_si128();
+
+				for (uint i = 0; i < (256+64+64) / 32; ++i)
+				{
+					//*d++ = *s++;
+					__m128i woot1 = _mm_load_si128((__m128i*)s);
+					__m128i woot2 = _mm_load_si128((__m128i*)s+1);
+					_mm_store_si128((__m128i*)d,	_mm_unpacklo_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+1,	_mm_unpackhi_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+2,	_mm_unpacklo_epi8(woot2, zeroreg));
+					_mm_store_si128((__m128i*)d+3,	_mm_unpackhi_epi8(woot2, zeroreg));
+					s += 32;
+					d += 32;
+				}
+			}
 		}
 		else
 		{
@@ -1077,6 +1100,8 @@ bool mpeg2_slice()
 						}
 					}
 					break;
+
+				jNO_DEFAULT;
 				}
 			}
 		}
@@ -1084,40 +1109,31 @@ bool mpeg2_slice()
 		// Send The MacroBlock via DmaIpuFrom
 		ipuRegs.ctrl.SCD = 0;
 		coded_block_pattern = decoder.coded_block_pattern;
-		g_BP.BP += (int)decoder.bitstream_bits - 16;
-
-		// BP goes from 0 to 128, so negative values mean to read old buffer
-		// so we minus from 128 to get the correct BP
-		if ((int)g_BP.BP < 0)
-		{
-			g_BP.BP = 128 + (int)g_BP.BP;
-
-			// After BP is positioned correctly, we need to reload the old buffer
-			// so that reading may continue properly
-			ReorderBitstream();
-		}
 
 		decoder.mbc = 1;
 		decoder.SetOutputTo(mb16);
 
 	case 3:
-		while (decoder.ipu0_data > 0)
-		{
-			uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+	{
+		pxAssume(decoder.ipu0_data > 0);
 
-			if (size == 0)
-			{
-				ipu_cmd.pos[0] = 3;
-				return false;
-			}
-			else
-			{
-				decoder.AdvanceIpuDataBy(size);
-			}
+		uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+		decoder.AdvanceIpuDataBy(read);
+
+		if (decoder.ipu0_data != 0)
+		{
+			// IPU FIFO filled up -- Will have to finish transferring later.
+			ipu_cmd.pos[0] = 3;
+			return false;
 		}
 
+		decoder.mbc++;
+		mbaCount = 0;
+	}
+	
 	case 4:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 4;
@@ -1126,11 +1142,11 @@ bool mpeg2_slice()
 
 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
-
+	}
+	
 	case 5:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
 		{
@@ -1138,8 +1154,7 @@ bool mpeg2_slice()
 			return false;
 		}
 
-		BigEndian(ipuRegs.top, ipuRegs.top);
-		decoder.bitstream_bits = 0;
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
 	}
 
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h
index 13431eb396..5ea46631e7 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@@ -148,12 +148,12 @@ struct decoder_t {
 	macroblock_rgb32 rgb32;
 	macroblock_rgb16 rgb16;
 
-	uint ipu0_data;
+	uint ipu0_data;		// amount of data in the output macroblock (in QWC)
 	uint ipu0_idx;
 
 	/* bit parsing stuff */
-	u32 bitstream_buf;		/* current 32 bit working set */
-	int bitstream_bits;			/* used bits in working set */
+	//u32 bitstream_buf;		/* current 32 bit working set */
+	//int bitstream_bits;			/* used bits in working set */
 
 	int quantizer_scale;	/* remove */
 	int dmv_offset;		/* remove */
@@ -230,7 +230,7 @@ struct decoder_t {
 		ipu0_data -= amt;
 	}
 	
-	bool ReadIpuData(u128* out);
+	__fi bool ReadIpuData(u128* out);
 };
 
 struct mpeg2_scan_pack
@@ -241,6 +241,10 @@ struct mpeg2_scan_pack
 	mpeg2_scan_pack();
 };
 
+extern int bitstream_init ();
+extern u32 UBITS(uint bits);
+extern s32 SBITS(uint bits);
+
 extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
 extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
 
@@ -258,20 +262,19 @@ extern int get_dmv();
 extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
 extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
 extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
-extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);
 
 extern int slice (u8 * buffer);
 
 #ifdef _MSC_VER
-#define BigEndian(out, in) out = _byteswap_ulong(in)
+#define BigEndian(in) _byteswap_ulong(in)
 #else
-#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
+#define BigEndian(in) __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif
 
 #ifdef _MSC_VER
-#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#define BigEndian64(in) _byteswap_uint64(in)
 #else
-#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#define BigEndian64(in) __builtin_bswap64(in) // or we could use the asm function bswap...
 #endif
 
 extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
diff --git a/pcsx2/IPU/mpeg2lib/Vlc.h b/pcsx2/IPU/mpeg2lib/Vlc.h
index 0b30d1b8bb..cac61dd40c 100644
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@@ -30,65 +30,24 @@
 #ifndef __VLC_H__
 #define __VLC_H__
 
-//static u8 word[4];
-//static u8 dword[8];
-//static u8 qword[16];
-
 static __fi int GETWORD()
 {
-	static u8 data[2];
-
-	if (decoder.bitstream_bits > 0)
-	{
-		if(!getBits16(data,1))
-		{
-			return 0;
-		}
-		
-		/*u32 data;
-		BigEndian(data, *(u32*)word);
-		decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
-		decoder.bitstream_bits -= 32;*/
-		decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
-		decoder.bitstream_bits -= 16;
-	}
-
-	return 1;
+	return g_BP.FillBuffer(16);
 }
 
-static __fi int bitstream_init ()
+// Removes bits from the bitstream.  This is done independently of UBITS/SBITS because a
+// lot of mpeg streams have to read ahead and rewind bits and re-read them at different
+// bit depths or signedness.
+static __fi void DUMPBITS(uint num)
 {
-	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
-	{
-		return 0;
-	}
-
-	decoder.bitstream_bits = -16;
-	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
-	/*decoder.bitstream_buf = *(u64*)dword;
-	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/
-
-	return 1;
+	g_BP.Advance(num);
+	//pxAssume(g_BP.FP != 0);
 }
 
-/* remove num valid bits from bit_buf */
-static __fi void DUMPBITS(int num)
+static __fi u32 GETBITS(uint num)
 {
-	decoder.bitstream_buf <<= num;
-    decoder.bitstream_bits += num;
-}
-
-/* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* Get bits from bitstream */
-static __fi u32 GETBITS(int num)
-{
-	u16 retVal = UBITS(num);
-	DUMPBITS(num);
+	uint retVal = UBITS(num);
+	g_BP.Advance(num);
 
 	return retVal;
 }
diff --git a/pcsx2/R5900.cpp b/pcsx2/R5900.cpp
index 1138594ecd..01223d6bd5 100644
--- a/pcsx2/R5900.cpp
+++ b/pcsx2/R5900.cpp
@@ -130,7 +130,7 @@ __ri void cpuException(u32 code, u32 bd)
 			//Reset / NMI
 			cpuRegs.pc = 0xBFC00000;
 			Console.Warning("Reset request");
-			UpdateCP0Status();
+			cpuUpdateOperationMode();
 			return;
 		}
 		else if((code & 0x38000) == 0x10000)
@@ -167,7 +167,7 @@ __ri void cpuException(u32 code, u32 bd)
 	else
 		cpuRegs.pc = 0xBFC00200 + offset;
 
-	UpdateCP0Status();
+	cpuUpdateOperationMode();
 }
 
 void cpuTlbMiss(u32 addr, u32 bd, u32 excode)
@@ -196,7 +196,7 @@ void cpuTlbMiss(u32 addr, u32 bd, u32 excode)
 	}
 
 	cpuRegs.CP0.n.Status.b.EXL = 1;
-	UpdateCP0Status();
+	cpuUpdateOperationMode();
 //	Log=1; varLog|= 0x40000000;
 }
 
@@ -208,33 +208,6 @@ void cpuTlbMissW(u32 addr, u32 bd) {
 	cpuTlbMiss(addr, bd, EXC_CODE_TLBS);
 }
 
-__fi void _cpuTestMissingINTC() {
-	if (cpuRegs.CP0.n.Status.val & 0x400 &&
-		psHu32(INTC_STAT) & psHu32(INTC_MASK)) {
-		if ((cpuRegs.interrupt & (1 << 30)) == 0) {
-			Console.Error("*PCSX2*: Error, missing INTC Interrupt");
-		}
-	}
-}
-
-__fi void _cpuTestMissingDMAC() {
-	if (cpuRegs.CP0.n.Status.val & 0x800 &&
-		(psHu16(0xe012) & psHu16(0xe010) ||
-		 psHu16(0xe010) & 0x8000)) {
-		if ((cpuRegs.interrupt & (1 << 31)) == 0) {
-			Console.Error("*PCSX2*: Error, missing DMAC Interrupt");
-		}
-	}
-}
-
-void cpuTestMissingHwInts() {
-	if ((cpuRegs.CP0.n.Status.val & 0x10007) == 0x10001) {
-		_cpuTestMissingINTC();
-		_cpuTestMissingDMAC();
-//		_cpuTestTIMR();
-	}
-}
-
 // sets a branch test to occur some time from an arbitrary starting point.
 __fi void cpuSetNextEvent( u32 startCycle, s32 delta )
 {
@@ -253,7 +226,7 @@ __fi void cpuSetNextEventDelta( s32 delta )
 	cpuSetNextEvent( cpuRegs.cycle, delta );
 }
 
-// tests the cpu cycle agaisnt the given start and delta values.
+// tests the cpu cycle against the given start and delta values.
 // Returns true if the delta time has passed.
 __fi int cpuTestCycle( u32 startCycle, s32 delta )
 {
@@ -361,8 +334,8 @@ static bool cpuIntsEnabled(int Interrupt)
 {
 	bool IntType = !!(cpuRegs.CP0.n.Status.val & Interrupt); //Choose either INTC or DMAC, depending on what called it
 
-	return cpuRegs.CP0.n.Status.b.EIE && cpuRegs.CP0.n.Status.b.IE &&
-		!cpuRegs.CP0.n.Status.b.EXL && (cpuRegs.CP0.n.Status.b.ERL == 0) && IntType;
+	return IntType && cpuRegs.CP0.n.Status.b.EIE && cpuRegs.CP0.n.Status.b.IE &&
+		!cpuRegs.CP0.n.Status.b.EXL && (cpuRegs.CP0.n.Status.b.ERL == 0);
 }
 
 // if cpuRegs.cycle is greater than this cycle, should check cpuEventTest for updates
@@ -375,10 +348,19 @@ __fi void _cpuEventTest_Shared()
 	ScopedBool etest(eeEventTestIsActive);
 	g_nextEventCycle = cpuRegs.cycle + eeWaitCycles;
 
+	// ---- INTC / DMAC (CPU-level Exceptions) -----------------
+	// Done first because exceptions raised during event tests need to be postponed a few
+	// cycles (fixes Grandia II [PAL], which does a spin loop on a vsync and expects to
+	// be able to read the value before the exception handler clears it).
+
+	uint mask = intcInterrupt() | dmacInterrupt();
+	if (cpuIntsEnabled(mask)) cpuException(mask, cpuRegs.branch);
+
+
 	// ---- Counters -------------
 	// Important: the vsync counter must be the first to be checked.  It includes emulation
 	// escape/suspend hooks, and it's really a good idea to suspend/resume emulation before
-	// doing any actual meaninful branchtest logic.
+	// doing any actual meaningful branchtest logic.
 
 	if( cpuTestCycle( nextsCounter, nextCounter ) )
 	{
@@ -391,10 +373,10 @@ __fi void _cpuEventTest_Shared()
 	_cpuTestTIMR();
 
 	// ---- Interrupts -------------
-	// Handles all interrupts except 30 and 31, which are handled later.
+	// These are basically just DMAC-related events, which also piggy-back the same bits as
+	// the PS2's own DMA channel IRQs and IRQ Masks.
 
-	if( cpuRegs.interrupt & ~(3<<30) )
-		_cpuTestInterrupts();
+	_cpuTestInterrupts();
 
 	// ---- IOP -------------
 	// * It's important to run a iopEventTest before calling ExecuteBlock. This
@@ -418,11 +400,7 @@ __fi void _cpuEventTest_Shared()
 		//if( EEsCycle < -450 )
 		//	Console.WriteLn( " IOP ahead by: %d cycles", -EEsCycle );
 
-		// Experimental and Probably Unnecessary Logic -->
-		// Check if the EE already has an exception pending, and if so we shouldn't
-		// waste too much time updating the IOP.  Theory being that the EE and IOP should
-		// run closely in sync during raised exception events.  But in practice it didn't
-		// seem to make much of a difference.
+		EEsCycle = psxCpu->ExecuteBlock( EEsCycle );
 
 		iopEventAction = false;
 	}
@@ -456,22 +434,10 @@ __fi void _cpuEventTest_Shared()
 
 	// Apply vsync and other counter nextCycles
 	cpuSetNextEvent( nextsCounter, nextCounter );
-
-	// ---- INTC / DMAC Exceptions -----------------
-	// Raise the INTC and DMAC interrupts here, which usually throw exceptions.
-	// This should be done last since the IOP and the VU0 can raise several EE
-	// exceptions.
-
-	//if ((cpuRegs.CP0.n.Status.val & 0x10007) == 0x10001)
-	if( cpuIntsEnabled(0x400) ) TESTINT(30, intcInterrupt);
-	if( cpuIntsEnabled(0x800) ) TESTINT(31, dmacInterrupt);
 }
 
 __ri void cpuTestINTCInts()
 {
-	// Check the internal Event System -- if one's already scheduled then don't bother:
-	if( cpuRegs.interrupt & (1 << 30) ) return;
-
 	// Check the COP0's Status register for general interrupt disables, and the 0x400
 	// bit (which is INTC master toggle).
 	if( !cpuIntsEnabled(0x400) ) return;
@@ -488,9 +454,6 @@ __ri void cpuTestINTCInts()
 
 __fi void cpuTestDMACInts()
 {
-	// Check the internal Event System -- if one's already scheduled then don't bother:
-	if ( cpuRegs.interrupt & (1 << 31) ) return;
-
 	// Check the COP0's Status register for general interrupt disables, and the 0x800
 	// bit (which is the DMAC master toggle).
 	if( !cpuIntsEnabled(0x800) ) return;
diff --git a/pcsx2/R5900.h b/pcsx2/R5900.h
index eac96559b3..d865a3b2a8 100644
--- a/pcsx2/R5900.h
+++ b/pcsx2/R5900.h
@@ -403,8 +403,8 @@ enum EE_EventType
 };
 
 extern void CPU_INT( EE_EventType n, s32 ecycle );
-extern void intcInterrupt();
-extern void dmacInterrupt();
+extern uint intcInterrupt();
+extern uint dmacInterrupt();
 
 
 extern void cpuInit();
diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h
index bf506a3d58..a981837907 100644
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@@ -24,7 +24,7 @@
 //  the lower 16 bit value.  IF the change is breaking of all compatibility with old
 //  states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
 
-static const u32 g_SaveVersion = 0x8b4a0000;
+static const u32 g_SaveVersion = 0x8b4b0000;
 
 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)
diff --git a/plugins/zzogl-pg/opengl/CMakeLists.txt b/plugins/zzogl-pg/opengl/CMakeLists.txt
index 3d9c4e913b..1f83451cb4 100644
--- a/plugins/zzogl-pg/opengl/CMakeLists.txt
+++ b/plugins/zzogl-pg/opengl/CMakeLists.txt
@@ -92,9 +92,9 @@ set(zzoglHeaders
     Util.h
     x86.h
     zerogs.h
-    zerogsmath.h
     zpipe.h
     ZZoglCRTC.h
+    ZZoglMath.h
     ZZoglShaders.h
     ZZGl.h
     ZZLog.h)
diff --git a/plugins/zzogl-pg/opengl/GLWin32.cpp b/plugins/zzogl-pg/opengl/GLWin32.cpp
index da64f16fe8..0c66206fda 100644
--- a/plugins/zzogl-pg/opengl/GLWin32.cpp
+++ b/plugins/zzogl-pg/opengl/GLWin32.cpp
@@ -32,7 +32,6 @@ LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
 
 	switch (msg)
 	{
-
 		case WM_DESTROY:
 			PostQuitMessage(0);
 			return 0;
@@ -76,21 +75,21 @@ bool GLWindow::CreateWindow(void *pDisplay)
 	rc.bottom = conf.height;
 
 	WNDCLASSEX wc;
-	HINSTANCE hInstance = GetModuleHandle(NULL);
+	HINSTANCE hInstance = GetModuleHandle(NULL); // Grab An Instance For Our Window
 	DWORD dwExStyle, dwStyle;
 
 	wc.cbSize = sizeof(WNDCLASSEX);
-	wc.style = CS_CLASSDC;
-	wc.lpfnWndProc = (WNDPROC) MsgProc;
-	wc.cbClsExtra = 0;
-	wc.cbWndExtra = 0;
-	wc.hInstance = hInstance;
-	wc.hIcon = NULL;
-	wc.hIconSm = NULL;
-	wc.hCursor = NULL;
-	wc.hbrBackground = NULL;
-	wc.lpszMenuName = NULL;
-	wc.lpszClassName = "PS2EMU_ZEROGS";
+	wc.style		= CS_HREDRAW | CS_VREDRAW | CS_OWNDC;		// Redraw On Move, And Own DC For Window
+	wc.lpfnWndProc		= (WNDPROC) MsgProc;					// MsgProc Handles Messages
+	wc.cbClsExtra		= 0;									// No Extra Window Data
+	wc.cbWndExtra		= 0;									// No Extra Window Data
+	wc.hInstance		= hInstance;							// Set The Instance
+	wc.hIcon		= NULL;			
+	wc.hIconSm		= NULL;										// Load The Default Icon
+	wc.hCursor		= LoadCursor(NULL, IDC_ARROW);				// Load The Arrow Pointer
+	wc.hbrBackground	= (HBRUSH)GetStockObject(BLACK_BRUSH);	// Black Background Brush
+	wc.lpszMenuName		= NULL;									// We Don't Want A Menu
+	wc.lpszClassName	= "PS2EMU_ZEROGS";						// Set The Class Name
 
 	RegisterClassEx(&wc);
 
@@ -102,26 +101,26 @@ bool GLWindow::CreateWindow(void *pDisplay)
 	else
 	{
 		dwExStyle = WS_EX_APPWINDOW | WS_EX_WINDOWEDGE;
-		dwStyle = WS_OVERLAPPEDWINDOW;
+		dwStyle = WS_OVERLAPPEDWINDOW | WS_BORDER;
 	}
 
+	dwStyle |= WS_CLIPSIBLINGS | WS_CLIPCHILDREN;
 	AdjustWindowRectEx(&rc, dwStyle, false, dwExStyle);
 
 	GetWindowRect(GetDesktopWindow(), &rcdesktop);
 
-	GShwnd = CreateWindowEx(
-				 dwExStyle,
-				 "PS2EMU_ZEROGS",
-				 "ZeroGS",
-				 dwStyle,
-				 (rcdesktop.right - (rc.right - rc.left)) / 2,
-				 (rcdesktop.bottom - (rc.bottom - rc.top)) / 2,
-				 rc.right - rc.left,
-				 rc.bottom - rc.top,
-				 NULL,
-				 NULL,
-				 hInstance,
-				 NULL);
+	GShwnd = CreateWindowEx(	dwExStyle,				// Extended Style For The Window
+					"PS2EMU_ZEROGS",				// Class Name
+					"ZZOgl",					// Window Title
+					dwStyle,				// Selected Window Style
+					(rcdesktop.right - (rc.right - rc.left)) / 2,  // Window Position
+					(rcdesktop.bottom - (rc.bottom - rc.top)) / 2, // Window Position
+					rc.right - rc.left,	// Calculate Adjusted Window Width
+					rc.bottom - rc.top,	// Calculate Adjusted Window Height
+					NULL,					// No Parent Window
+					NULL,					// No Menu
+					hInstance,				// Instance
+					NULL);					// Don't Pass Anything To WM_CREATE
 
 	if (GShwnd == NULL) return false;
 
@@ -197,6 +196,7 @@ bool GLWindow::DisplayWindow(int _width, int _height)
 		dwExStyle = WS_EX_APPWINDOW | WS_EX_WINDOWEDGE;
 		dwStyle = WS_OVERLAPPEDWINDOW;
 	}
+	dwStyle |= WS_CLIPSIBLINGS | WS_CLIPCHILDREN;
 
 	RECT rc;
 
diff --git a/plugins/zzogl-pg/opengl/HostMemory.cpp b/plugins/zzogl-pg/opengl/HostMemory.cpp
index 889423bd53..419ad6ed8d 100644
--- a/plugins/zzogl-pg/opengl/HostMemory.cpp
+++ b/plugins/zzogl-pg/opengl/HostMemory.cpp
@@ -469,10 +469,10 @@ __forceinline void _TransferLocalLocal_4()
 		assert((gs.srcbuf.psm&0x7) == (gs.dstbuf.psm&0x7));
 
 		if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw)
-			ZZLog::Warn_Log("Transfer error, src width exceeded.");
+			ZZLog::Debug_Log("Transfer error, src width exceeded.");
 
 		if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw)
-			ZZLog::Warn_Log("Transfer error, dst width exceeded.");
+			ZZLog::Debug_Log("Transfer error, dst width exceeded.");
 
 		int srcstart, srcend, dststart, dstend;
 
diff --git a/plugins/zzogl-pg/opengl/Mem.cpp b/plugins/zzogl-pg/opengl/Mem.cpp
index 82a46bab47..1a65a91e2e 100644
--- a/plugins/zzogl-pg/opengl/Mem.cpp
+++ b/plugins/zzogl-pg/opengl/Mem.cpp
@@ -267,7 +267,7 @@ void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData,
 	}
 
     if (floatfmt) {
-        Vector* psrcv = (Vector*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
+        float4* psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
 
         for(int i = 0; i < b.height; ++i)
         {
@@ -276,7 +276,7 @@ void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData,
             for(int j = 0; j < b.width; ++j)
             {
                 u32 temp = ((j + 1) % b.width);
-                Vector* pv = &psrcv[i_width + j];
+                float4* pv = &psrcv[i_width + j];
                 pv->x = psrcf[i_width + j];
                 pv->y = psrcf[i_width + temp];
                 pv->z = psrcf[i_width2 + j];
@@ -291,7 +291,7 @@ void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, in
 	FUNCLOG
     if (floatfmt) {
         vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
-        vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(Vector));
+        vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
     } else {
         vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
     }
diff --git a/plugins/zzogl-pg/opengl/Mem.h b/plugins/zzogl-pg/opengl/Mem.h
index 95689692cf..19ee8e5d07 100644
--- a/plugins/zzogl-pg/opengl/Mem.h
+++ b/plugins/zzogl-pg/opengl/Mem.h
@@ -120,8 +120,8 @@ struct BLOCK
 	BLOCK() { memset(this, 0, sizeof(BLOCK)); }
 
 	// shader constants for this block
-	Vector vTexBlock;
-	Vector vTexDims;
+	float4 vTexBlock;
+	float4 vTexDims;
 	int width, height;	// dims of one page in pixels
 	int ox, oy, mult;
 	int bpp;
@@ -147,8 +147,8 @@ struct BLOCK
 		ox = ox2;
 		oy = oy2;
 		mult = mult2;
-		vTexDims = Vector(BLOCK_TEXWIDTH/(float)(bw), BLOCK_TEXHEIGHT/(float)bh, 0, 0); 
-		vTexBlock = Vector((float)bw/BLOCK_TEXWIDTH, (float)bh/BLOCK_TEXHEIGHT, ((float)ox+0.2f)/BLOCK_TEXWIDTH, ((float)oy+0.05f)/BLOCK_TEXHEIGHT);
+		vTexDims = float4(BLOCK_TEXWIDTH/(float)(bw), BLOCK_TEXHEIGHT/(float)bh, 0, 0); 
+		vTexBlock = float4((float)bw/BLOCK_TEXWIDTH, (float)bh/BLOCK_TEXHEIGHT, ((float)ox+0.2f)/BLOCK_TEXWIDTH, ((float)oy+0.05f)/BLOCK_TEXHEIGHT);
 		width = bw;
 		height = bh;
 		colwidth = bh / 4;
diff --git a/plugins/zzogl-pg/opengl/NewRegs.cpp b/plugins/zzogl-pg/opengl/NewRegs.cpp
index c258d4f475..3c51353c8d 100644
--- a/plugins/zzogl-pg/opengl/NewRegs.cpp
+++ b/plugins/zzogl-pg/opengl/NewRegs.cpp
@@ -638,7 +638,7 @@ void __gifCall GIFRegHandlerSCISSOR(const u32* data)
 		Flush();
 	}
 
-	m_env.CTXT[i].SCISSOR = (GSVector4i)r->SCISSOR;
+	m_env.CTXT[i].SCISSOR = (Vector4i)r->SCISSOR;
 
 	m_env.CTXT[i].UpdateScissor();*/
 	ZZLog::Greg_Log("SCISSOR%d", i);
diff --git a/plugins/zzogl-pg/opengl/Regs.cpp b/plugins/zzogl-pg/opengl/Regs.cpp
index aafd648ab5..8ca39fe749 100644
--- a/plugins/zzogl-pg/opengl/Regs.cpp
+++ b/plugins/zzogl-pg/opengl/Regs.cpp
@@ -55,7 +55,7 @@ inline bool NoHighlights(int i)
 	
 //	if ( results[resultA] == 0 ) {
 //		results[resultA] = 1;
-//		ZZLog::ERROR_LOG("%x = %d %d %d %d %d %d %d %d \n", resultA, prim->iip, (prim->tme), (prim->fge), (prim->abe) , (prim->aa1) ,(prim->fst), (prim->ctxt), (prim->fix)) ;
+//		ZZLog::Error_Log("%x = %d %d %d %d %d %d %d %d \n", resultA, prim->iip, (prim->tme), (prim->fge), (prim->abe) , (prim->aa1) ,(prim->fst), (prim->ctxt), (prim->fix)) ;
 //	}
 //	if (resultA == 0xb && ZeroGS::vb[i].zbuf.zmsk ) return false; //ATF
 
diff --git a/plugins/zzogl-pg/opengl/Util.h b/plugins/zzogl-pg/opengl/Util.h
index 75fa384167..15964f6e93 100644
--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@@ -52,7 +52,7 @@ extern "C" u32   CALLBACK PS2EgetLibType(void);
 extern "C" u32   CALLBACK PS2EgetLibVersion2(u32 type);
 extern "C" char* CALLBACK PS2EgetLibName(void);
 
-#include "zerogsmath.h"
+#include "ZZoglMath.h"
 
 #include <vector>
 #include <string>
diff --git a/plugins/zzogl-pg/opengl/Win32/Win32.cpp b/plugins/zzogl-pg/opengl/Win32/Win32.cpp
index 03ee157668..e52c6909b1 100644
--- a/plugins/zzogl-pg/opengl/Win32/Win32.cpp
+++ b/plugins/zzogl-pg/opengl/Win32/Win32.cpp
@@ -37,62 +37,27 @@ void CALLBACK GSkeyEvent(keyEvent *ev)
 
 #include "Win32/resource.h"
 
-BOOL CALLBACK LoggingDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
-{
-	switch (uMsg)
-	{
-
-		case WM_INITDIALOG:
-
-			if (conf.log) CheckDlgButton(hW, IDC_LOG, true);
-
-			return true;
-
-		case WM_COMMAND:
-			switch (LOWORD(wParam))
-			{
-				case IDCANCEL:
-					EndDialog(hW, true);
-					return true;
-
-				case IDOK:
-
-					if (IsDlgButtonChecked(hW, IDC_LOG))
-						conf.log = 1;
-					else
-						conf.log = 0;
-
-					SaveConfig();
-
-					EndDialog(hW, false);
-
-					return true;
-			}
-	}
-
-	return false;
-}
-
 map<int, int> mapConfOpts;
 #define PUT_CONF(id) mapConfOpts[IDC_CONFOPT_##id] = 0x##id;
 
-void OnInitDialog(HWND hW)
+void OnAdvOK(HWND hW)
 {
-	if (!(conf.zz_options.loaded)) LoadConfig();
+	conf.hacks._u32 = 0;
 
-	CheckDlgButton(hW, IDC_CONFIG_INTERLACE, conf.interlace);
-	CheckDlgButton(hW, IDC_CONFIG_BILINEAR, conf.bilinear);
-	CheckDlgButton(hW, IDC_CONFIG_DEPTHWRITE, conf.mrtdepth);
-	CheckRadioButton(hW, IDC_CONFIG_AANONE, IDC_CONFIG_AA4, IDC_CONFIG_AANONE + conf.aa);
-	CheckDlgButton(hW, IDC_CONFIG_WIREFRAME, (conf.wireframe()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_CAPTUREAVI, (conf.captureAvi()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_FULLSCREEN, (conf.fullscreen()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_WIDESCREEN, (conf.widescreen()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_BMPSS, (conf.zz_options.tga_snap) ? 1 : 0);
-	CheckRadioButton(hW, IDC_CONF_WIN640, IDC_CONF_WIN1280, IDC_CONF_WIN640 + conf.zz_options.dimensions);
+	for (map<int, int>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
+	{
+		if (IsDlgButtonChecked(hW, it->first)) conf.hacks._u32 |= it->second;
+	}
 
-	prevbilinearfilter = conf.bilinear;
+	GSsetGameCRC(g_LastCRC, conf.hacks._u32);
 
+	SaveConfig();
+
+	EndDialog(hW, false);
+}
+
+void OnInitAdvDialog(HWND hW)
+{
 	mapConfOpts.clear();
 
 	PUT_CONF(00000001);
@@ -129,45 +94,87 @@ void OnInitDialog(HWND hW)
 	}
 }
 
-void OnOK(HWND hW)
+BOOL CALLBACK AdvancedDialogProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
+{
+	switch (uMsg)
+	{
+		case WM_INITDIALOG:
+			OnInitAdvDialog(hW);
+			return true;
+
+		case WM_COMMAND:
+
+			switch (LOWORD(wParam))
+			{
+				case IDCANCEL:
+					EndDialog(hW, true);
+					return true;
+
+				case IDOK:
+					OnAdvOK(hW);
+					return true;
+			}
+	}
+
+	return false;
+}
+
+void CALLBACK AdvancedDialog()
+{
+	DialogBox(hInst,
+			  MAKEINTRESOURCE(IDD_ADV_OPTIONS),
+			  GetActiveWindow(),
+			  (DLGPROC)AdvancedDialogProc);
+}
+
+void OnInitConfDialog(HWND hW)
+{
+	if (!(conf.zz_options.loaded)) LoadConfig();
+
+     TCHAR *aaName[] = {"None", "x2", "x4", "x8", "x16"};
+ 
+     for(int i=0; i<5; i++)
+	 {
+		 ComboBox_AddString(GetDlgItem(hW, IDC_AA_COMBO), (LPARAM)aaName[i]);
+	 }
+	ComboBox_SelectString(GetDlgItem(hW, IDC_AA_COMBO), -1, (LPARAM)aaName[conf.aa]);
+
+    TCHAR *sizeName[] = {"640 x 480", "800 x 600", "1024 x 768", "1280 x 960"};
+ 
+    for(int i=0; i<4; i++)
+	{
+		ComboBox_AddString(GetDlgItem(hW, IDC_WIN_SIZE_COMBO), (LPARAM)sizeName[i]);
+	}
+	ComboBox_SelectString(GetDlgItem(hW, IDC_WIN_SIZE_COMBO), -1, (LPARAM)sizeName[conf.zz_options.dimensions]);
+
+	CheckDlgButton(hW, IDC_CONFIG_INTERLACE, conf.interlace);
+	CheckDlgButton(hW, IDC_CONFIG_BILINEAR, conf.bilinear);
+	CheckDlgButton(hW, IDC_CONFIG_DEPTHWRITE, conf.mrtdepth);
+	CheckDlgButton(hW, IDC_CONFIG_WIREFRAME, (conf.wireframe()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_CAPTUREAVI, (conf.captureAvi()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_FULLSCREEN, (conf.fullscreen()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_WIDESCREEN, (conf.widescreen()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_BMPSS, (conf.zz_options.tga_snap) ? 1 : 0);
+
+	prevbilinearfilter = conf.bilinear;
+}
+
+void OnConfOK(HWND hW)
 {
 	u32 newinterlace = IsDlgButtonChecked(hW, IDC_CONFIG_INTERLACE);
 
-	if (!conf.interlace) conf.interlace = newinterlace;
-	else if (!newinterlace) conf.interlace = 2;  // off
+	if (!conf.interlace) 
+		conf.interlace = newinterlace;
+	else if (!newinterlace) 
+		conf.interlace = 2;  // off
 
 	conf.bilinear = IsDlgButtonChecked(hW, IDC_CONFIG_BILINEAR);
 
 	// restore
-	if (conf.bilinear && prevbilinearfilter)
-		conf.bilinear = prevbilinearfilter;
+	if (conf.bilinear && prevbilinearfilter) conf.bilinear = prevbilinearfilter;
 
-	//conf.mrtdepth = 1;//IsDlgButtonChecked(hW, IDC_CONFIG_DEPTHWRITE);
-
-	if (SendDlgItemMessage(hW, IDC_CONFIG_AANONE, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 0;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA2, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 1;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA4, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 2;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA8, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 3;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA16, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 4;
-	}
-	else 
-	{
-		conf.aa = 0;
-	}
+	if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_AA_COMBO)) != -1)
+		conf.aa = ComboBox_GetCurSel(GetDlgItem(hW, IDC_AA_COMBO));
 
 	conf.zz_options._u32 = 0;
 
@@ -177,22 +184,13 @@ void OnOK(HWND hW)
 	conf.zz_options.widescreen = IsDlgButtonChecked(hW, IDC_CONFIG_WIDESCREEN) ? 1 : 0;
 	conf.zz_options.tga_snap = IsDlgButtonChecked(hW, IDC_CONFIG_BMPSS) ? 1 : 0;
 
-	conf.hacks._u32 = 0;
-
-	for (map<int, int>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
-	{
-		if (IsDlgButtonChecked(hW, it->first)) conf.hacks._u32 |= it->second;
-	}
-
-	GSsetGameCRC(g_LastCRC, conf.hacks._u32);
-
-	if (SendDlgItemMessage(hW, IDC_CONF_WIN640, BM_GETCHECK, 0, 0)) 
+	if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 0) 
 		conf.zz_options.dimensions = GSDim_640;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN800, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 1) 
 		conf.zz_options.dimensions = GSDim_800;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN1024, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 2) 
 		conf.zz_options.dimensions = GSDim_1024;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN1280, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 3) 
 		conf.zz_options.dimensions = GSDim_1280;
 
 	SaveConfig();
@@ -205,19 +203,26 @@ BOOL CALLBACK ConfigureDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	switch (uMsg)
 	{
 		case WM_INITDIALOG:
-			OnInitDialog(hW);
+			OnInitConfDialog(hW);
 			return true;
 
 		case WM_COMMAND:
 
 			switch (LOWORD(wParam))
 			{
+                case IDC_AA_COMBO: 
+					break; 
+
+				case IDC_ADV_BTN:
+					AdvancedDialog();
+					return true;
+
 				case IDCANCEL:
 					EndDialog(hW, true);
 					return true;
 
 				case IDOK:
-					OnOK(hW);
+					OnConfOK(hW);
 					return true;
 			}
 	}
@@ -225,13 +230,26 @@ BOOL CALLBACK ConfigureDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	return false;
 }
 
+void CALLBACK GSconfigure() // Plugin entry point: shows the ZZOgl options dialog (IDD_CONFIG2) and post-processes the result.
+{
+	DialogBox(hInst, // DialogBox is modal: it returns only after ConfigureDlgProc ends the dialog.
+			  MAKEINTRESOURCE(IDD_CONFIG2),
+			  GetActiveWindow(),
+			  (DLGPROC)ConfigureDlgProc);
+
+	if (g_nPixelShaderVer == SHADER_REDUCED) conf.bilinear = 0; // Bilinear is forced off for SHADER_REDUCED, whatever the user picked (presumably unsupported there -- TODO confirm).
+}
+
+s32 CALLBACK GStest() // Plugin self-test hook; performs no checks.
+{
+	return 0; // Always 0 (presumably "OK" in the plugin API -- TODO confirm against the caller).
+}
+
 BOOL CALLBACK AboutDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
 	switch (uMsg)
 	{
 		case WM_INITDIALOG:
-			//ZeroGS uses floating point render targets because A8R8G8B8 format is not sufficient for ps2 blending and this requires alpha blending on floating point render targets
-			//There might be a problem with pixel shader precision with older geforce models (textures will look blocky).
 			return true;
 
 		case WM_COMMAND:
@@ -246,21 +264,6 @@ BOOL CALLBACK AboutDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	return false;
 }
 
-void CALLBACK GSconfigure()
-{
-	DialogBox(hInst,
-			  MAKEINTRESOURCE(IDD_CONFIG),
-			  GetActiveWindow(),
-			  (DLGPROC)ConfigureDlgProc);
-
-	if (g_nPixelShaderVer == SHADER_REDUCED) conf.bilinear = 0;
-}
-
-s32 CALLBACK GStest()
-{
-	return 0;
-}
-
 void CALLBACK GSabout()
 {
 	DialogBox(hInst,
diff --git a/plugins/zzogl-pg/opengl/Win32/resrc1.h b/plugins/zzogl-pg/opengl/Win32/resrc1.h
index c4259633ae..0c2e913e95 100644
--- a/plugins/zzogl-pg/opengl/Win32/resrc1.h
+++ b/plugins/zzogl-pg/opengl/Win32/resrc1.h
@@ -5,7 +5,6 @@
 #define IDC_CONF_DEFAULT                3
 #define IDR_DATA1                       112
 #define IDD_ADV_OPTIONS                 113
-#define IDD_DIALOG1                     114
 #define IDD_CONFIG2                     114
 #define IDC_ABOUTTEXT                   1015
 #define IDC_CONFIG_AA                   1016
@@ -52,12 +51,15 @@
 #define IDC_CONFOPT_00004000            1047
 #define IDC_BUTTON1                     1048
 #define IDC_CONFOPT_COMPUTEOR           1048
+#define IDC_ADV_BTN                     1048
 #define IDC_CONFOPT_4001                1049
 #define IDC_CONFOPT_00000010            1049
 #define IDC_CONFOPT_00008000            1050
 #define IDC_CONFOPT_00010000            1052
 #define IDC_CONFOPT_00020000            1054
+#define IDC_AA_COMBO                    1054
 #define IDC_CONFOPT_00000002            1055
+#define IDC_WIN_SIZE_COMBO              1055
 #define IDC_CONFOPT_01000000            1056
 #define IDC_CONFOPT_00800000            1057
 #define IDC_CONFOPT_00000008            1058
@@ -80,7 +82,7 @@
 #ifndef APSTUDIO_READONLY_SYMBOLS
 #define _APS_NEXT_RESOURCE_VALUE        116
 #define _APS_NEXT_COMMAND_VALUE         40001
-#define _APS_NEXT_CONTROL_VALUE         1051
+#define _APS_NEXT_CONTROL_VALUE         1056
 #define _APS_NEXT_SYMED_VALUE           101
 #endif
 #endif
diff --git a/plugins/zzogl-pg/opengl/Win32/zerogs.rc b/plugins/zzogl-pg/opengl/Win32/zerogs.rc
index 5655a1bbbd..2aab0fd2d6 100644
--- a/plugins/zzogl-pg/opengl/Win32/zerogs.rc
+++ b/plugins/zzogl-pg/opengl/Win32/zerogs.rc
@@ -206,32 +206,28 @@ BEGIN
                     "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,266,365,8
 END
 
-IDD_CONFIG2 DIALOGEX 0, 0, 171, 217
+IDD_CONFIG2 DIALOGEX 0, 0, 159, 160
 STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
 CAPTION "ZZOgl Options"
 FONT 8, "MS Shell Dlg", 400, 0, 0x1
 BEGIN
-    DEFPUSHBUTTON   "OK",IDOK,55,192,50,14
-    PUSHBUTTON      "Cancel",IDCANCEL,108,192,50,14
-    GROUPBOX        "Static",IDC_STATIC,7,7,152,183
-    CONTROL         "Logging (For Debugging)",1000,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,18,102,10
+    DEFPUSHBUTTON   "OK",IDOK,37,138,50,14
+    PUSHBUTTON      "Cancel",IDCANCEL,91,138,50,14
+    CONTROL         "Logging (For Debugging)",1000,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,7,102,10
     CONTROL         "Interlace Enable (toggle with F5). There are 2 modes + interlace off",IDC_CONFIG_INTERLACE,
-                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,45,137,18
+                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,32,137,18
     CONTROL         "Bilinear Filtering (Shift+F5). Best quality is on, turn off for speed.",IDC_CONFIG_BILINEAR,
-                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,67,137,18
+                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,50,137,18
     CONTROL         "Capture Avi (zerogs.avi) (F12)",IDC_CONFIG_CAPTUREAVI,
-                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,103,109,10
+                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,82,109,10
     CONTROL         "Save Snapshots as BMP(default is JPG)",IDC_CONFIG_BMPSS,
-                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,116,141,10
-    CONTROL         "Wide Screen",IDC_CONFIG_WIDESCREEN,"Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,90,109,10
-    CONTROL         "640 x 480",IDC_CONF_WIN640,"Button",BS_AUTORADIOBUTTON | WS_GROUP,20,140,59,8
-    CONTROL         "800 x 600",IDC_CONF_WIN800,"Button",BS_AUTORADIOBUTTON,21,152,59,8
-    CONTROL         "1024 x 768",IDC_CONF_WIN1024,"Button",BS_AUTORADIOBUTTON,86,140,59,8
-    CONTROL         "1280 x 960",IDC_CONF_WIN1280,"Button",BS_AUTORADIOBUTTON,86,151,53,8
-    GROUPBOX        "Default Window Size (no speed impact)",IDC_STATIC,14,129,137,39
-    COMBOBOX        IDC_COMBO1,59,31,48,30,CBS_DROPDOWNLIST | CBS_SORT | WS_VSCROLL | WS_TABSTOP
-    LTEXT           "Anti-aliasing",IDC_STATIC,15,33,43,13
-    PUSHBUTTON      "Advanced...",IDC_BUTTON1,17,170,134,14
+                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,93,141,10
+    CONTROL         "Wide Screen",IDC_CONFIG_WIDESCREEN,"Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,69,109,10
+    LTEXT           "Anti-aliasing",IDC_STATIC,7,20,43,13
+    PUSHBUTTON      "Advanced...",IDC_ADV_BTN,7,118,134,14
+    COMBOBOX        IDC_AA_COMBO,53,18,48,30,CBS_DROPDOWN | WS_VSCROLL | WS_TABSTOP
+    COMBOBOX        IDC_WIN_SIZE_COMBO,78,104,62,30,CBS_DROPDOWN | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Default Window Size",IDC_STATIC,7,106,68,8
 END
 
 
@@ -277,9 +273,9 @@ BEGIN
     IDD_CONFIG2, DIALOG
     BEGIN
         LEFTMARGIN, 7
-        RIGHTMARGIN, 164
+        RIGHTMARGIN, 152
         TOPMARGIN, 7
-        BOTTOMMARGIN, 210
+        BOTTOMMARGIN, 152
     END
 END
 #endif    // APSTUDIO_INVOKED
@@ -311,27 +307,6 @@ END
 
 #endif    // APSTUDIO_INVOKED
 
-
-/////////////////////////////////////////////////////////////////////////////
-//
-// Dialog Info
-//
-
-IDD_CONFIG2 DLGINIT
-BEGIN
-    IDC_COMBO1, 0x403, 5, 0
-0x6f4e, 0x656e, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5832, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5834, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5838, "\000" 
-    IDC_COMBO1, 0x403, 4, 0
-0x3631, 0x0058, 
-    0
-END
-
 #endif    // English (U.S.) resources
 /////////////////////////////////////////////////////////////////////////////
 
diff --git a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
index 2862c0a2da..9b2502170d 100644
--- a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
+++ b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
@@ -481,10 +481,6 @@
 				RelativePath="..\zerogs.h"
 				>
 			</File>
-			<File
-				RelativePath="..\zerogsmath.h"
-				>
-			</File>
 			<File
 				RelativePath="..\ZZGl.h"
 				>
@@ -497,6 +493,10 @@
 				RelativePath="..\ZZoglFlushHack.h"
 				>
 			</File>
+			<File
+				RelativePath="..\ZZoglMath.h"
+				>
+			</File>
 			<File
 				RelativePath="..\ZZoglShaders.h"
 				>
@@ -528,11 +528,11 @@
 			</File>
 		</Filter>
 		<File
-			RelativePath=".\ps2hw.dat"
+			RelativePath="..\ps2hw.dat"
 			>
 		</File>
 		<File
-			RelativePath="..\ps2hw.dat"
+			RelativePath=".\ps2hw.dat"
 			>
 		</File>
 	</Files>
diff --git a/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp b/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
index 64a6b31374..768eb2d1f4 100644
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
@@ -50,7 +50,7 @@ extern bool g_bMakeSnapshot;
 extern string strSnapshot;
 
 // Adjusts vertex shader BitBltPos vector v to preserve aspect ratio. It used to emulate 4:3 or 16:9.
-void ZeroGS::AdjustTransToAspect(Vector& v)
+void ZeroGS::AdjustTransToAspect(float4& v)
 {
 	double temp;
 	float f;
@@ -242,11 +242,11 @@ inline void RenderStartHelper(u32 bInterlace)
 // on image y coords. So if we write valpha.z * F + valpha.w + 0.5, it would be switching odd
 // and even strings at each frame.
 // valpha.x and y are used for image blending.
-inline Vector RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTSHADER* prog)
+inline float4 RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTSHADER* prog)
 {
 	SetShaderCaller("RenderGetForClip");
 
-	Vector valpha;
+	float4 valpha;
 	// first render the current render targets, then from ptexMem
 
 	if (psm == 1)
@@ -282,7 +282,7 @@ inline Vector RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTS
 		valpha.w = 1;
 	}
 
-	ZZshSetParameter4fv(prog->sOneColor, valpha, "g_fOneColor");
+	ZZshSetParameter4fv(prog->prog, prog->sOneColor, valpha, "g_fOneColor");
 
 	return valpha;
 }
@@ -295,7 +295,7 @@ inline void RenderCreateInterlaceTex(u32 bInterlace, int th, FRAGMENTSHADER* pro
 
 	int interlacetex = CreateInterlaceTex(2 * th);
 
-	ZZshGLSetTextureParameter(prog->sInterlace, interlacetex, "Interlace");
+	ZZshGLSetTextureParameter(prog->prog, prog->sInterlace, interlacetex, "Interlace");
 }
 
 // Well, do blending setup prior to second pass of half-frame drawing
@@ -396,10 +396,10 @@ inline int RenderGetOffsets(int* dby, int* movy, tex0Info& texframe, CRenderTarg
 }
 
 // BltBit shader calculate vertex (4 coord's pixel) position at the viewport.
-inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
+inline float4 RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 {
 	SetShaderCaller("RenderSetTargetBitPos");
-	Vector v;
+	float4 v;
 	// dest rect
 	v.x = 1;
 	v.y = dh / (float)th;
@@ -416,7 +416,7 @@ inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 		v.w += 1.0f / (float)dh ;
 	}
 
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
 
 	return v;
 }
@@ -425,12 +425,12 @@ inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 // For example, use tw / X and tw / X magnify the viewport.
 // Interlaced output is little out of VB, it could be seen as an evil blinking line on top
 // and bottom, so we try to remove it.
-inline Vector RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool isInterlace)
+inline float4 RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool isInterlace)
 {
 	SetShaderCaller("RenderSetTargetBitTex");
 
-	Vector v;
-	v = Vector(th, tw, dh, dw);
+	float4 v;
+	v = float4(th, tw, dh, dw);
 
 	// Incorrect Aspect ratio on interlaced frames
 
@@ -440,28 +440,28 @@ inline Vector RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool
 		v.w += 1.0f / conf.height;
 	}
 
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
 
 	return v;
 }
 
 // Translator for POSITION coordinates (-1.0:+1.0f at x axis, +1.0f:-1.0y at y) into target frame ones.
 // We don't need x coordinate, because interlacing is y-axis only.
-inline Vector RenderSetTargetBitTrans(int th)
+inline float4 RenderSetTargetBitTrans(int th) // th: target height; returns the vector that was passed to the shader parameter.
 {
 	SetShaderCaller("RenderSetTargetBitTrans");
-	Vector v = Vector(float(th), -float(th), float(th), float(th));
-	ZZshSetParameter4fv(pvsBitBlt.fBitBltTrans, v, "g_fBitBltTrans");
+	float4 v = float4(float(th), -float(th), float(th), float(th)); // (x, y, z, w) = (th, -th, th, th); only y is negated.
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.fBitBltTrans, v, "g_fBitBltTrans"); // Now also passes the owning program handle (pvsBitBlt.prog) alongside the parameter.
 	return v;
 }
 
 // use g_fInvTexDims to store inverse texture dims
 // Seems, that Targ shader does not use it
-inline Vector RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHADER* prog)
+inline float4 RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHADER* prog)
 {
 	SetShaderCaller("RenderSetTargetInvTex");
 
-	Vector v = Vector(0, 0, 0, 0);
+	float4 v = float4(0, 0, 0, 0);
 
 	if (prog->sInvTexDims)
 	{
@@ -469,7 +469,7 @@ inline Vector RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHAD
 		v.y = 1.0f / (float)th;
 		v.z = (float)0.0;
 		v.w = -0.5f / (float)th;
-		ZZshSetParameter4fv(prog->sInvTexDims, v, "g_fInvTexDims");
+		ZZshSetParameter4fv(prog->prog, prog->sInvTexDims, v, "g_fInvTexDims");
 	}
 
 	return v;
@@ -544,17 +544,17 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list
 				SetShaderCaller("RenderCheckForTargets");
 
 				// Texture
-				Vector v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby), INTERLACE_COUNT);
+				float4 v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby), INTERLACE_COUNT);
 
 				// dest rect
 				v = RenderSetTargetBitPos(dh, texframe.th, movy, INTERLACE_COUNT);
 				v = RenderSetTargetBitTrans(ptarg->fbh);
 				v = RenderSetTargetInvTex(bInterlace, texframe.tbw, ptarg->fbh, &ppsCRTCTarg[bInterlace]) ; 	// FIXME. This is no use
 
-				Vector valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTCTarg[bInterlace]);
+				float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTCTarg[bInterlace]);
 
 				// inside vb[0]'s target area, so render that region only
-				ZZshGLSetTextureParameter(ppsCRTCTarg[bInterlace].sFinal, ptarg->ptex, "CRTC target");
+				ZZshGLSetTextureParameter(ppsCRTCTarg[bInterlace].prog, ppsCRTCTarg[bInterlace].sFinal, ptarg->ptex, "CRTC target");
 				RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTCTarg[bInterlace]);
 
 				ZZshSetPixelShader(ppsCRTCTarg[bInterlace].prog);
@@ -582,7 +582,7 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list
 // this is the function that does it.
 inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int i, bool* bUsingStencil, int interlace, int bInterlace)
 {
-	Vector v;
+	float4 v;
 	
 	for (list<CRenderTarget*>::iterator it = listTargs.begin(); it != listTargs.end(); ++it)
 	{
@@ -624,9 +624,9 @@ inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listT
 	v = RenderSetTargetBitPos(1, 1, 0, INTERLACE_COUNT);
 	v = RenderSetTargetBitTrans(texframe.th);
 	v = RenderSetTargetInvTex(bInterlace, texframe.tw, texframe.th, &ppsCRTC[bInterlace]);
-	Vector valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTC[bInterlace]);
+	float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTC[bInterlace]);
 
-	ZZshGLSetTextureParameter(ppsCRTC[bInterlace].sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
+	ZZshGLSetTextureParameter(ppsCRTC[bInterlace].prog, ppsCRTC[bInterlace].sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
 	RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTC[bInterlace]);
 	ZZshSetPixelShader(ppsCRTC[bInterlace].prog);
 	
diff --git a/plugins/zzogl-pg/opengl/ZZoglCRTC.h b/plugins/zzogl-pg/opengl/ZZoglCRTC.h
index 3b617fca1e..7a1cb474fd 100644
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.h
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.h
@@ -63,7 +63,7 @@ extern int s_nNewWidth, s_nNewHeight;
 extern CRangeManager s_RangeMngr; // manages overwritten memory
 extern void FlushTransferRanges(const tex0Info* ptex);
 extern void ProcessMessages();
-void AdjustTransToAspect(Vector& v);
+void AdjustTransToAspect(float4& v);
 
 // Interlace texture is lazy 1*(height) array of 1 and 0.
 // If its height (named s_nInterlaceTexWidth here) is hanging we must redo
diff --git a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
index 941a0f4ddf..81d6040284 100644
--- a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
@@ -82,8 +82,8 @@ extern void KickTriangleFan();
 extern void KickSprite();
 extern void KickDummy();
 extern bool LoadEffects();
-extern bool LoadExtraEffects();
-extern FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
+extern bool ZZshLoadExtraEffects();
+extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
 
 GLuint vboRect = 0;
 vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
@@ -127,7 +127,6 @@ void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum) = NULL;
 // State parameters
 
 extern u8* s_lpShaderResources;
-ZZshProgram pvs[16] = {NULL};
 
 // String's for shader file in developer mode
 #ifdef DEVBUILD
diff --git a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
index c47b4cc2d0..a457cebd8a 100644
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
@@ -156,7 +156,7 @@ int s_nWriteDestAlphaTest = 0;					// ZZ
 
 ////////////////////
 // State parameters
-static Vector vAlphaBlendColor;	 // used for GPU_COLOR
+static float4 vAlphaBlendColor;	 // used for GPU_COLOR
 
 static bool bNeedBlendFactorInAlpha;	  // set if the output source alpha is different from the real source alpha (only when blend factor > 0x80)
 static u32 s_dwColorWrite = 0xf;			// the color write mask of the current target
@@ -310,7 +310,7 @@ void ZeroGS::ReloadEffects()
 
 	memset(ppsTexture, 0, sizeof(ppsTexture));
 
-	LoadExtraEffects();
+	ZZshLoadExtraEffects();
 #endif
 }
 
@@ -830,11 +830,11 @@ inline int FlushGetShaderType(VB& curvb, CRenderTarget* ptextarg, GLuint& ptexcl
 
 
 //Set page offsets depends on shader type.
-inline Vector FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRenderTarget* ptextarg)
+inline float4 FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushSetPageOffset");
 
-	Vector vpageoffset;
+	float4 vpageoffset;
 	vpageoffset.w = 0;
 
 	switch (shadertype)
@@ -863,14 +863,14 @@ inline Vector FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRen
 }
 
 //Set texture offsets depends omn shader type.
-inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
+inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushSetTexOffset");
-	Vector v;
+	float4 v;
 
 	if (shadertype == 3)
 	{
-		Vector v;
+		float4 v;
 		v.x = 16.0f / (float)curvb.tex0.tw;
 		v.y = 16.0f / (float)curvb.tex0.th;
 		v.z = 0.5f * v.x;
@@ -879,7 +879,7 @@ inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 	}
 	else if (shadertype == 4)
 	{
-		Vector v;
+		float4 v;
 		v.x = 16.0f / (float)ptextarg->fbw;
 		v.y = 16.0f / (float)ptextarg->fbh;
 		v.z = -1;
@@ -891,10 +891,10 @@ inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 }
 
 // Set dimension (Real!) of texture. z and w
-inline Vector FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
+inline float4 FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushTextureDims");
-	Vector vTexDims;
+	float4 vTexDims;
 	vTexDims.x = (float)RW(curvb.tex0.tw) ;
 	vTexDims.y = (float)RH(curvb.tex0.th) ;
 
@@ -958,14 +958,14 @@ inline FRAGMENTSHADER* FlushUseExistRenderTarget(VB& curvb, CRenderTarget* ptext
 	//int psm = PIXEL_STORAGE_FORMAT(curvb.tex0);
 	int shadertype = FlushGetShaderType(curvb, ptextarg, ptexclut);
 
-	FRAGMENTSHADER* pfragment = LoadShadeEffect(shadertype, 0, curvb.curprim.fge,
+	FRAGMENTSHADER* pfragment = ZZshLoadShadeEffect(shadertype, 0, curvb.curprim.fge,
 								IsAlphaTestExpansion(curvb.tex0), exactcolor, curvb.clamp, context, NULL);
 
-	Vector vpageoffset = FlushSetPageOffset(pfragment, shadertype, ptextarg);
+	float4 vpageoffset = FlushSetPageOffset(pfragment, shadertype, ptextarg);
 
-	Vector v = FlushSetTexOffset(pfragment, shadertype, curvb, ptextarg);
+	float4 v = FlushSetTexOffset(pfragment, shadertype, curvb, ptextarg);
 
-	Vector vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);
+	float4 vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);
 
 	if (pfragment->sCLUT != NULL && ptexclut != 0)
 		ZZshGLSetTextureParameter(pfragment->sCLUT, ptexclut, "CLUT");
@@ -997,7 +997,7 @@ inline FRAGMENTSHADER* FlushMadeNewTarget(VB& curvb, int exactcolor, int context
 		}
 	}
 
-	FRAGMENTSHADER* pfragment = LoadShadeEffect(0, GetTexFilter(curvb.tex1), curvb.curprim.fge,
+	FRAGMENTSHADER* pfragment = ZZshLoadShadeEffect(0, GetTexFilter(curvb.tex1), curvb.curprim.fge,
 								IsAlphaTestExpansion(curvb.tex0), exactcolor, curvb.clamp, context, NULL);
 
 	if (pfragment == NULL)
@@ -1160,7 +1160,7 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
 		}
 
 		// harvest fishing
-		Vector v = vAlphaBlendColor;
+		float4 v = vAlphaBlendColor;
 
 		if (exactcolor)
 		{
@@ -1173,7 +1173,7 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
 	else
 	{
 		// not using blending so set to defaults
-		Vector v = exactcolor ? Vector(1, 510 * 255.0f / 256.0f, 0, 0) : Vector(1, 2 * 255.0f / 256.0f, 0, 0);
+		float4 v = exactcolor ? float4(1, 510 * 255.0f / 256.0f, 0, 0) : float4(1, 2 * 255.0f / 256.0f, 0, 0);
 		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
 
 	}
@@ -1257,7 +1257,7 @@ inline void AlphaPabe(VB& curvb, FRAGMENTSHADER* pfragment, int exactcolor)
 		glDisable(GL_BLEND);
 		GL_STENCILFUNC_SET();
 
-		Vector v;
+		float4 v;
 		v.x = 1;
 		v.y = 2;
 		v.z = 0;
@@ -1330,7 +1330,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE
 	if (gs.pabe && bCanRenderStencil)
 	{
 		// only render the pixels with alpha values >= 0x80
-		Vector v = vAlphaBlendColor;
+		float4 v = vAlphaBlendColor;
 
 		if (exactcolor) { v.y *= 255; v.w *= 255; }
 
@@ -1350,7 +1350,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE
 		glDisable(GL_BLEND);
 		GL_STENCILFUNC_SET();
 
-		Vector v;
+		float4 v;
 		v.x = 1;
 		v.y = 2;
 		v.z = 0;
@@ -1409,7 +1409,7 @@ inline void AlphaSpecialTesting(VB& curvb, FRAGMENTSHADER* pfragment, u32 dwUsin
 		glStencilFunc(GL_EQUAL, STENCIL_SPECIAL | STENCIL_PIXELWRITE, STENCIL_SPECIAL | STENCIL_PIXELWRITE);
 		glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
 
-		Vector v = Vector(0, exactcolor ? 510.0f : 2.0f, 0, 0);
+		float4 v = float4(0, exactcolor ? 510.0f : 2.0f, 0, 0);
 		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
 		Draw(curvb);
 
@@ -1560,7 +1560,7 @@ inline void ZeroGS::RenderFBA(const VB& curvb, ZZshParameter sOneColor)
 
 	glAlphaFunc(GL_GEQUAL, 1);
 
-	Vector v(1,2,0,0);
+	float4 v(1,2,0,0);
 
 	ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");
 
@@ -1599,7 +1599,7 @@ __forceinline void ZeroGS::RenderAlphaTest(const VB& curvb, ZZshParameter sOneCo
 
 	SetShaderCaller("RenderAlphaTest");
 
-	Vector v(1,2,0,0);
+	float4 v(1,2,0,0);
 
 	ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");
 
@@ -1624,7 +1624,7 @@ __forceinline void ZeroGS::RenderAlphaTest(const VB& curvb, ZZshParameter sOneCo
 
 	if (curvb.test.ate && curvb.test.atst > 1 && curvb.test.aref > 0x80)
 	{
-		v = Vector(1,1,0,0);
+		v = float4(1,1,0,0);
 		ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");
 		glAlphaFunc(g_dwAlphaCmp[curvb.test.atst], AlphaReferedValue(curvb.test.aref));
 	}
@@ -1925,12 +1925,12 @@ void ZeroGS::SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint)
 }
 
 // clamp relies on texture width
-void ZeroGS::SetTexClamping(int context, FRAGMENTSHADER* pfragment)
+void SetTexClamping(int context, FRAGMENTSHADER* pfragment)
 {
 	FUNCLOG
 	SetShaderCaller("SetTexClamping");
 	clampInfo* pclamp = &ZeroGS::vb[context].clamp;
-	Vector v, v2;
+	float4 v, v2;
 	v.x = v.y = 0;
 	u32* ptex = ZeroGS::vb[context].ptexClamp;
 	ptex[0] = ptex[1] = 0;
@@ -2015,8 +2015,8 @@ void ZeroGS::SetTexClamping(int context, FRAGMENTSHADER* pfragment)
 
 }
 
-// Fixme should be in Vector lib
-inline bool equal_vectors(Vector a, Vector b)
+// FIXME: this comparison should live in the float4 library (ZZoglMath.h).
+inline bool equal_vectors(float4 a, float4 b)
 {
 	if (abs(a.x - b.x) + abs(a.y - b.y) + abs(a.z - b.z) + abs(a.w - b.w) < 0.01)
 		return true;
@@ -2033,7 +2033,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 
 	assert(!vb[context].bNeedTexCheck);
 
-	Vector v, v2;
+	float4 v, v2;
 
 	tex0Info& tex0 = vb[context].tex0;
 
@@ -2045,14 +2045,14 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 		SetShaderCaller("SetTexVariables");
 
 		// alpha and texture highlighting
-		Vector valpha, valpha2 ;
+		float4 valpha, valpha2 ;
 
 		// if clut, use the frame format
 		int psm = PIXEL_STORAGE_FORMAT(tex0);
 
 //		ZZLog::Error_Log( "A %d psm, is-clut %d. cpsm %d | %d %d", psm,  PSMT_ISCLUT(psm), tex0.cpsm,  tex0.tfx, tex0.tcc );
 
-		Vector vblack;
+		float4 vblack;
 		vblack.x = vblack.y = vblack.z = vblack.w = 10;
 
 		/* tcc -- Tecture Color Component 0=RGB, 1=RGBA + use Alpha from TEXA reg when not in PSM
@@ -2096,7 +2096,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 
 		/*
 		// Test, old code.
-				Vector valpha3, valpha4;
+				float4 valpha3, valpha4;
 		 		switch(tex0.tfx) {
 					case 0:
 						valpha3.z = 0; valpha3.w = 0;
@@ -2206,7 +2206,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force)
 {
 	FUNCLOG
-	Vector v;
+	float4 v;
 	CMemoryTarget* pmemtarg = g_MemTargs.GetMemoryTarget(tex0, 1);
 
 	assert( pmemtarg != NULL && pfragment != NULL && pmemtarg->ptex != NULL);	
@@ -2248,7 +2248,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
 
 	float fbw = (float)tex0.tbw;
 
-	Vector vTexDims;
+	float4 vTexDims;
 
 	vTexDims.x = b.vTexDims.x * (fw);
 	vTexDims.y = b.vTexDims.y * (fh);
@@ -2291,7 +2291,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,
 
 	ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");
 
-//	ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from Vector to float[4] is ok.
+//	ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I changed it, and it works. It seems that casting from float4 to float[4] is OK.
 	ZZshSetParameter4fv(pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
 	ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");
 
@@ -2403,7 +2403,7 @@ void ZeroGS::SetAlphaVariables(const alphaInfo& a)
 	s_rgbeq = 1;
 
 //	s_alphaInfo = a;
-	vAlphaBlendColor = Vector(1, 2 * 255.0f / 256.0f, 0, 0);
+	vAlphaBlendColor = float4(1, 2 * 255.0f / 256.0f, 0, 0);
 	u32 usec = a.c;
 
 
diff --git a/plugins/zzogl-pg/opengl/zerogsmath.h b/plugins/zzogl-pg/opengl/ZZoglMath.h
similarity index 53%
rename from plugins/zzogl-pg/opengl/zerogsmath.h
rename to plugins/zzogl-pg/opengl/ZZoglMath.h
index 3177e710db..dab2440566 100644
--- a/plugins/zzogl-pg/opengl/zerogsmath.h
+++ b/plugins/zzogl-pg/opengl/ZZoglMath.h
@@ -2,12 +2,15 @@
   *
   * Zerofrog's ZeroGS KOSMOS (c)2005-2008
   *
-  * Zerofrog forgot to write any copyright notice after release the plugin into GPLv2
+  * Zerofrog forgot to write any copyright notice after releasing the plugin into GPLv2
   * If someone can contact him successfully to clarify this matter that would be great.
   */
 
-#ifndef ZEROGS_MATH_H
-#define ZEROGS_MATH_H
+// Now that it's down to 82 lines, and most of it's fairly obvious, perhaps it'd be easier to 
+// just reimplement it... -arcum42
+
+#ifndef ZZOGLMATH_H_INCLUDED
+#define ZZOGLMATH_H_INCLUDED
 
 #ifndef _WIN32
 #include <alloca.h>
@@ -22,16 +25,16 @@ typedef float dReal;
 // class used for 3 and 4 dim vectors and quaternions
 // It is better to use this for a 3 dim vector because it is 16byte aligned and SIMD instructions can be used
 
-class Vector
+class float4
 {
 	public:
 		dReal x, y, z, w;
 
-		Vector() : x(0), y(0), z(0), w(0) {}
-		Vector(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
-		Vector(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
-		Vector(const Vector &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
-		Vector(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
+		float4() : x(0), y(0), z(0), w(0) {}
+		float4(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
+		float4(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
+		float4(const float4 &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
+		float4(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
 		dReal  operator[](int i) const	   { return (&x)[i]; }
 		dReal& operator[](int i)			 { return (&x)[i]; }
 		
@@ -40,7 +43,7 @@ class Vector
 		operator const dReal*() const { return (const dReal*)&x; }
 		
 		// SCALAR FUNCTIONS
-		inline dReal dot(const Vector &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
+		inline dReal dot(const float4 &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
 		inline void Set3(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; }
 		inline void Set4(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; w = pvals[3]; }
 		inline void SetColor(u32 color)
@@ -53,28 +56,28 @@ class Vector
 		// 3 dim cross product, w is not touched
 		/// this = this x v
 		/// this = u x v
-		inline Vector operator-() const { Vector v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
-		inline Vector operator+(const Vector &r) const { Vector v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
-		inline Vector operator-(const Vector &r) const { Vector v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
-		inline Vector operator*(const Vector &r) const { Vector v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
-		inline Vector operator*(dReal k) const { Vector v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
-		inline Vector& operator += (const Vector& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
-		inline Vector& operator -= (const Vector& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
-		inline Vector& operator *= (const Vector& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
-		inline Vector& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
-		inline Vector& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
-		friend Vector operator*(float f, const Vector& v);
-		//friend ostream& operator<<(ostream& O, const Vector& v);
-		//friend istream& operator>>(istream& I, Vector& v);
+		inline float4 operator-() const { float4 v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
+		inline float4 operator+(const float4 &r) const { float4 v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
+		inline float4 operator-(const float4 &r) const { float4 v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
+		inline float4 operator*(const float4 &r) const { float4 v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
+		inline float4 operator*(dReal k) const { float4 v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
+		inline float4& operator += (const float4& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
+		inline float4& operator -= (const float4& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
+		inline float4& operator *= (const float4& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
+		inline float4& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
+		inline float4& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
+		friend float4 operator*(float f, const float4& v);
+		//friend ostream& operator<<(ostream& O, const float4& v);
+		//friend istream& operator>>(istream& I, float4& v);
 };
 
-inline Vector operator*(float f, const Vector& left)
+inline float4 operator*(float f, const float4& left)
 {
-	Vector v;
+	float4 v;
 	v.x = f * left.x;
 	v.y = f * left.y;
 	v.z = f * left.z;
 	return v;
-}
-
-#endif
+}
+
+#endif // ZZOGLMATH_H_INCLUDED
diff --git a/plugins/zzogl-pg/opengl/ZZoglShaders.cpp b/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
index 658d763de5..136606cbf6 100644
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
@@ -1,6 +1,6 @@
 /*  ZZ Open GL graphics plugin
- *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
- *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *  Copyright (c)2009 zeydlitz@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2006
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -14,18 +14,27 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+//#ifdef NVIDIA_CG_API 		// This code is only for NVIDIA cg-toolkit API
 // ZZogl Shader manipulation functions.
 
 //------------------- Includes
 #include "zerogs.h"
 #include "ZZoglShaders.h"
 #include "zpipe.h"
+	
+
+#ifdef _WIN32
+#	include "Win32.h"
+extern HINSTANCE hInst;
+#endif
 
 // ----------------- Defines
 
+using namespace ZeroGS; 
+
 #define TEXWRAP_REPEAT 0
 #define TEXWRAP_CLAMP 1
 #define TEXWRAP_REGION_REPEAT 2
@@ -33,7 +42,6 @@
 
 #define SH_WRITEDEPTH 0x2000 // depth is written
 #define SH_CONTEXT1 0x1000 // context1 is used
-
 #define SH_REGULARVS 0x8000
 #define SH_TEXTUREVS 0x8001
 #define SH_REGULARFOGVS 0x8002
@@ -58,35 +66,54 @@
 #define SH_CRTC_NEARESTPS 0x8022
 #define SH_CRTCINTER_NEARESTPS 0x8023
 
-
-using namespace ZeroGS;
 //------------------ Constants
 
-// ----------------- Global Variables
+const static char* g_pTexTypes[] = { "32", "tex32", "clut32", "tex32to16", "tex16to8h" };
 
-namespace ZeroGS
-{
-FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;
-FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
-VERTEXSHADER pvsBitBlt;
+// ----------------- Global Variables 
+
+ZZshContext	g_cgcontext;
+ZZshProfile 	cgvProf, cgfProf;
+int 		g_nPixelShaderVer = 0; 		// default
+u8* 		s_lpShaderResources = NULL;
+ZZshProgram 	pvs[16] = {NULL};
+ZZshProgram 	g_vsprog = 0, g_psprog = 0;							// 2 -- ZZ
+ZZshParameter 	g_vparamPosXY[2] = {0}, g_fparamFogColor = 0;
+
+#ifdef DEVBUILD
+char* EFFECT_NAME;		// These variables are used for testing and are set manually
+char* EFFECT_DIR;
+#endif
+
+bool g_bCRTCBilinear = true;
+
+namespace ZeroGS { 
+	float4 g_vdepth, vlogz;
+	FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;
+	FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
+	FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
+	FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
+	VERTEXSHADER pvsBitBlt;
+	
+	inline bool LoadEffects();
 }
 
+struct SHADERHEADER
+{
+	unsigned int index, offset, size; // if highest bit of index is set, pixel shader
+};
+map<int, SHADERHEADER*> mapShaderResources;
+
 // Debug variable, store name of the function that call the shader.
 const char* ShaderCallerName = "";
 const char* ShaderHandleName = "";
 
-extern u32 ptexBlocks;		// holds information on block tiling. Its texture number in OpenGL -- if 0 than such texture
-extern u32 ptexConv16to32;	// does not exist. This textures should be created on start and released on finish.  
-extern u32 ptexConv32to16;
-bool g_bCRTCBilinear = true;
-u8* s_lpShaderResources = NULL;
-map<int, SHADERHEADER*> mapShaderResources;
-ZZshContext g_cgcontext;
-ZZshProfile cgvProf, cgfProf;
-int g_nPixelShaderVer = 0; 		// default
-
 //------------------ Code
 
+inline int GET_SHADER_INDEX(int type, int texfilter, int texwrap, int fog, int writedepth, int testaem, int exactcolor, int context, int ps) {
+	return type + texfilter*NUM_TYPES + NUM_FILTERS*NUM_TYPES*texwrap + NUM_TEXWRAPS*NUM_FILTERS*NUM_TYPES*(fog+2*writedepth+4*testaem+8*exactcolor+16*context+32*ps) ;
+}
+
 bool ZZshCheckProfilesSupport() {
 	// load the effect, find the best profiles (if any)
 	if (cgGLIsProfileSupported(CG_PROFILE_ARBVP1) != CG_TRUE) {
@@ -103,10 +130,10 @@ bool ZZshCheckProfilesSupport() {
 // Error handler. Setup in ZZogl_Create once.
 void HandleCgError(ZZshContext ctx, ZZshError err, void* appdata)
 {
-	ZZLog::Error_Log("%s->%s: %s", ShaderCallerName, ShaderHandleName, cgGetErrorString(err));
+	ZZLog::Error_Log("%s->%s: %s\n", ShaderCallerName, ShaderHandleName, cgGetErrorString(err));
 	const char* listing = cgGetLastListing(g_cgcontext);
-
-	if (listing != NULL) ZZLog::Debug_Log("	Last listing: %s", listing);
+	if (listing != NULL) 
+		ZZLog::Debug_Log("	last listing: %s\n", listing);
 }
 
 bool ZZshStartUsingShaders() {
@@ -128,7 +155,7 @@ bool ZZshStartUsingShaders() {
 	g_vparamPosXY[1] = cgCreateParameter(g_cgcontext, CG_FLOAT4);
 
 
-	ZZLog::Debug_Log("Creating effects.");
+	ZZLog::GS_Log("Creating effects.");
 	B_G(LoadEffects(), return false);
 
 	// create a sample shader
@@ -139,11 +166,11 @@ bool ZZshStartUsingShaders() {
 	g_nPixelShaderVer = 0;//SHADER_ACCURATE;
 	// test
 	bool bFailed;
-	FRAGMENTSHADER* pfrag = LoadShadeEffect(0, 1, 1, 1, 1, temp, 0, &bFailed);
+	FRAGMENTSHADER* pfrag = ZZshLoadShadeEffect(0, 1, 1, 1, 1, temp, 0, &bFailed);
 	if( bFailed || pfrag == NULL ) {
 		g_nPixelShaderVer = SHADER_ACCURATE|SHADER_REDUCED;
 
-		pfrag = LoadShadeEffect(0, 0, 1, 1, 0, temp, 0, &bFailed);
+		pfrag = ZZshLoadShadeEffect(0, 0, 1, 1, 0, temp, 0, &bFailed);
 		if( pfrag != NULL )
 			cgGLLoadProgram(pfrag->prog);
 		if( bFailed || pfrag == NULL || cgGetError() != CG_NO_ERROR ) {
@@ -155,10 +182,65 @@ bool ZZshStartUsingShaders() {
 	if (g_nPixelShaderVer & SHADER_REDUCED)
 		conf.bilinear = 0;
 
-	ZZLog::Debug_Log("Creating extra effects.");
-	B_G(LoadExtraEffects(), return false);
+	ZZLog::GS_Log("Creating extra effects.");
+	B_G(ZZshLoadExtraEffects(), return false);
 
-	ZZLog::Debug_Log("using %s shaders.", g_pShaders[g_nPixelShaderVer]);	
+	ZZLog::GS_Log("using %s shaders\n", g_pShaders[g_nPixelShaderVer]);	
+	return true;
+}
+
+// Locates the shader data for the current build target and returns false when it cannot be used.
+// Release builds fill s_lpShaderResources from ps2hw.dat; DEVBUILD fills EFFECT_NAME / EFFECT_DIR.
+bool ZZshCreateOpenShadersFile() {
+#ifndef DEVBUILD
+#	ifdef _WIN32
+	HRSRC hShaderSrc = FindResource(hInst, MAKEINTRESOURCE(IDR_SHADERS), RT_RCDATA);
+	assert( hShaderSrc != NULL );
+	HGLOBAL hShaderGlob = LoadResource(hInst, hShaderSrc);
+	assert( hShaderGlob != NULL );
+	s_lpShaderResources = (u8*)LockResource(hShaderGlob);
+#	else // not _WIN32
+	FILE* fres = fopen("ps2hw.dat", "rb");
+	if( fres == NULL ) {
+		fres = fopen("plugins/ps2hw.dat", "rb");
+		if( fres == NULL ) {
+			ZZLog::Error_Log("Cannot find ps2hw.dat in working directory. Exiting.");
+			return false;
+		}
+	}
+	fseek(fres, 0, SEEK_END);
+	const long s = ftell(fres);	// -1 when the size cannot be determined
+	fseek(fres, 0, SEEK_SET);
+	u8* blob = (s >= 0) ? new u8[s+1] : NULL;
+	const bool ok = (blob != NULL) && fread(blob, 1, s, fres) == (size_t)s;
+	fclose(fres);	// the handle was previously never closed
+	if( !ok ) { delete[] blob; ZZLog::Error_Log("Failed to read ps2hw.dat."); return false; }
+	blob[s] = 0;	// keep the trailing zero byte the original appended
+	s_lpShaderResources = blob;
+#	endif // _WIN32
+#else // DEVBUILD
+#	ifndef _WIN32 // NOT WINDOWS
+	// test if ps2hw.fx exists
+	char tempstr[255];
+	char curwd[255];
+	getcwd(curwd, ARRAY_SIZE(curwd));
+	strcpy(tempstr, "/plugins/");
+	sprintf(EFFECT_NAME, "%sps2hw.fx", tempstr);	// NOTE(review): EFFECT_NAME is a bare char* here - confirm it points at a writable buffer
+	FILE* f = fopen(EFFECT_NAME, "r");
+	if( f == NULL ) {
+		strcpy(tempstr, "../../plugins/zzogl-pg/opengl/");
+		sprintf(EFFECT_NAME, "%sps2hw.fx", tempstr);
+		f = fopen(EFFECT_NAME, "r");
+		if( f == NULL ) {
+			ZZLog::Error_Log("Failed to find %s, try compiling a non-devbuild\n", EFFECT_NAME);
+			return false;
+		}
+	}
+	fclose(f);
+	sprintf(EFFECT_DIR, "%s/%s", curwd, tempstr);
+	sprintf(EFFECT_NAME, "%sps2hw.fx", EFFECT_DIR);
+	#endif
+#endif // DEVBUILD
 	return true;
 }
 
@@ -173,37 +255,61 @@ void ZZshGLEnableProfile() {
 	cgGLEnableProfile(cgfProf);
 }
 
-// This is a helper of cgGLSetParameter4fv, made for debugging purposes.
-// The name could be any string. We must use it on compilation time, because the erronious handler does not
-// return it.
-void ZZshSetParameter4fv(ZZshParameter param, const float* v, const char* name)
-{
+// This is a helper for cgGLSetParameter4fv, made for debugging purposes.
+// The name can be any string. It must be supplied at compile time, because the error handler does not
+// report it.
+void ZZshSetParameter4fv(ZZshParameter param, const float* v, const char* name) {
 	ShaderHandleName = name;
 	cgGLSetParameter4fv(param, v);
 }
- 
-// The same function for texture, also to cgGLEnable
+
+void ZZshSetParameter4fv(ZZshProgram prog, ZZshParameter param, const float* v, const char* name) {	
+	ShaderHandleName = name;
+	cgGLSetParameter4fv(param, v);
+}
+
+// Same as above, but when param is NULL the parameter is looked up in prog by name, so name must be the parameter's actual name in prog.
+void ZZshSetParameter4fvWithRetry(ZZshParameter* param, ZZshProgram prog, const float* v, const char* name) {
+	if (param != NULL)
+		ZZshSetParameter4fv(prog, param[0], v, name);
+	else
+		ZZshSetParameter4fv(prog, cgGetNamedParameter(prog, name), v, name);
+}
+
 void ZZshGLSetTextureParameter(ZZshParameter param, GLuint texobj, const char* name) {
 	ShaderHandleName = name;
 	cgGLSetTextureParameter(param, texobj);
 	cgGLEnableTextureParameter(param);
 }
 
+// The same helper for textures; it also enables the texture parameter (cgGLEnableTextureParameter).
+void ZZshGLSetTextureParameter(ZZshProgram prog, ZZshParameter param, GLuint texobj, const char* name) {
+	ShaderHandleName = name;
+	cgGLSetTextureParameter(param, texobj);
+	cgGLEnableTextureParameter(param);
+}
+
 // Used sometimes for color 1.
 void ZZshDefaultOneColor( FRAGMENTSHADER ptr ) {
 	ShaderHandleName = "Set Default One color";
-	Vector v = Vector ( 1, 1, 1, 1 );
-	ZZshSetParameter4fv( ptr.sOneColor, v, "DefaultOne");
+	float4 v = float4 ( 1, 1, 1, 1 );
+	ZZshSetParameter4fv( ptr.prog, ptr.sOneColor, v, "DefaultOne");
 }
 
-void ZZshSetVertexShader(ZZshShader prog) {
+#define SET_UNIFORMPARAM(var, name) { \
+	p = cgGetNamedParameter(pf->prog, name); \
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) \
+		pf->var = p; \
+} \
+
+void ZZshSetVertexShader(ZZshProgram prog) {
 	if ((prog) != g_vsprog) {
 		cgGLBindProgram(prog); 
 		g_vsprog = prog; 
 	}
 }
 
-void ZZshSetPixelShader(ZZshShader prog) {
+void ZZshSetPixelShader(ZZshProgram prog) {
 	if ((prog) != g_psprog) {
 		cgGLBindProgram(prog); 
 		g_psprog = prog; 
@@ -213,125 +319,158 @@ void ZZshSetPixelShader(ZZshShader prog) {
 void SetupFragmentProgramParameters(FRAGMENTSHADER* pf, int context, int type)
 {
 	// uniform parameters
-	pf->connect(g_fparamFogColor, "g_fFogColor");
+	ZZshParameter p;
 
-	pf->set_uniform_param(pf->sOneColor, "g_fOneColor");
-	pf->set_uniform_param(pf->sBitBltZ, "g_fBitBltZ");
-	pf->set_uniform_param(pf->sInvTexDims, "g_fInvTexDims");
-	pf->set_uniform_param(pf->fTexAlpha2, "fTexAlpha2");
-	pf->set_uniform_param(pf->fTexOffset, "g_fTexOffset");
-	pf->set_uniform_param(pf->fTexDims, "g_fTexDims");
-	pf->set_uniform_param(pf->fTexBlock, "g_fTexBlock");
-	pf->set_uniform_param(pf->fClampExts, "g_fClampExts");
-	pf->set_uniform_param(pf->fTexWrapMode, "TexWrapMode");
-	pf->set_uniform_param(pf->fRealTexDims, "g_fRealTexDims");
-	pf->set_uniform_param(pf->fTestBlack, "g_fTestBlack");
-	pf->set_uniform_param(pf->fPageOffset, "g_fPageOffset");
-	pf->set_uniform_param(pf->fTexAlpha, "fTexAlpha");
-
-	// textures
-	pf->set_texture(ptexBlocks, "g_sBlocks");
-
-	// cg parameter usage is wrong, so do it manually
-
-	switch (type)
-	{
-		case 3:
-			pf->set_texture(ptexConv16to32, "g_sConv16to32");
-			break;
-
-		case 4:
-			pf->set_texture(ptexConv32to16, "g_sConv32to16");
-			break;
-
-		default:
-			pf->set_texture(ptexBilinearBlocks, "g_sBilinearBlocks");
-			break;
+	p = cgGetNamedParameter(pf->prog, "g_fFogColor");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		cgConnectParameter(g_fparamFogColor, p);
 	}
 
-	pf->set_texture(pf->sMemory, "g_sMemory");
+	SET_UNIFORMPARAM(sOneColor, "g_fOneColor");
+	SET_UNIFORMPARAM(sBitBltZ, "g_fBitBltZ");
+	SET_UNIFORMPARAM(sInvTexDims, "g_fInvTexDims");
+	SET_UNIFORMPARAM(fTexAlpha2, "fTexAlpha2");
+	SET_UNIFORMPARAM(fTexOffset, "g_fTexOffset");
+	SET_UNIFORMPARAM(fTexDims, "g_fTexDims");
+	SET_UNIFORMPARAM(fTexBlock, "g_fTexBlock");
+	SET_UNIFORMPARAM(fClampExts, "g_fClampExts");
+	SET_UNIFORMPARAM(fTexWrapMode, "TexWrapMode");
+	SET_UNIFORMPARAM(fRealTexDims, "g_fRealTexDims");
+	SET_UNIFORMPARAM(fTestBlack, "g_fTestBlack");
+	SET_UNIFORMPARAM(fPageOffset, "g_fPageOffset");
+	SET_UNIFORMPARAM(fTexAlpha, "fTexAlpha");
 
-	pf->set_texture(pf->sFinal, "g_sSrcFinal");
-	pf->set_texture(pf->sBitwiseANDX, "g_sBitwiseANDX");
-	pf->set_texture(pf->sBitwiseANDY, "g_sBitwiseANDY");
-	pf->set_texture(pf->sCLUT, "g_sCLUT");
-	pf->set_texture(pf->sInterlace, "g_sInterlace");
+	// textures
+	p = cgGetNamedParameter(pf->prog, "g_sBlocks");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		cgGLSetTextureParameter(p, ptexBlocks);
+		cgGLEnableTextureParameter(p);
+	}
+
+	// cg parameter usage is wrong, so do it manually
+	if( type == 3 ) {
+		p = cgGetNamedParameter(pf->prog, "g_sConv16to32");
+		if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+			cgGLSetTextureParameter(p, ptexConv16to32);
+			cgGLEnableTextureParameter(p);
+		}
+	}
+	else if( type == 4 ) {
+		p = cgGetNamedParameter(pf->prog, "g_sConv32to16");
+		if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+			cgGLSetTextureParameter(p, ptexConv32to16);
+			cgGLEnableTextureParameter(p);
+		}
+	}
+	else {
+		p = cgGetNamedParameter(pf->prog, "g_sBilinearBlocks");
+		if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+			cgGLSetTextureParameter(p, ptexBilinearBlocks);
+			cgGLEnableTextureParameter(p);
+		}
+	}
+
+	p = cgGetNamedParameter(pf->prog, "g_sMemory");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sMemory = p;
+	}
+	p = cgGetNamedParameter(pf->prog, "g_sSrcFinal");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sFinal = p;
+	}
+	p = cgGetNamedParameter(pf->prog, "g_sBitwiseANDX");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sBitwiseANDX = p;
+	}
+	p = cgGetNamedParameter(pf->prog, "g_sBitwiseANDY");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sBitwiseANDY = p;
+	}
+	p = cgGetNamedParameter(pf->prog, "g_sCLUT");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sCLUT = p;
+	}
+	p = cgGetNamedParameter(pf->prog, "g_sInterlace");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		//cgGLEnableTextureParameter(p);
+		pf->sInterlace = p;
+	}
 
 	// set global shader constants
-	pf->set_shader_const(Vector(0.5f, (conf.settings().exact_color) ? 0.9f / 256.0f : 0.5f / 256.0f, 0, 1 / 255.0f), "g_fExactColor");
-	pf->set_shader_const(Vector(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f), "g_fBilinear");
-	pf->set_shader_const(Vector(1.0f / 256.0f, 1.0004f, 1, 0.5f), "g_fZBias");
-	pf->set_shader_const(Vector(0, 1, 0.001f, 0.5f), "g_fc0");
-	pf->set_shader_const(Vector(1 / 1024.0f, 0.2f / 1024.0f, 1 / 128.0f, 1 / 512.0f), "g_fMult");
-}
+	p = cgGetNamedParameter(pf->prog, "g_fExactColor");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE ) {
+		cgGLSetParameter4fv(p, float4(0.5f, (conf.settings().exact_color)?0.9f/256.0f:0.5f/256.0f, 0,1/255.0f));
+	}
 
-static bool outdated_shaders = false;
+	p = cgGetNamedParameter(pf->prog, "g_fBilinear");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f ));
+
+	p = cgGetNamedParameter(pf->prog, "g_fZBias");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(1.0f/256.0f, 1.0004f, 1, 0.5f));
+
+	p = cgGetNamedParameter(pf->prog, "g_fc0");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(0,1, 0.001f, 0.5f));
+
+	p = cgGetNamedParameter(pf->prog, "g_fMult");
+	if( p != NULL && cgIsParameterUsed(p, pf->prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(1/1024.0f, 0.2f/1024.0f, 1/128.0f, 1/512.0f));
+}
 
 void SetupVertexProgramParameters(ZZshProgram prog, int context)
 {
 	ZZshParameter p;
 
 	p = cgGetNamedParameter(prog, "g_fPosXY");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )
 		cgConnectParameter(g_vparamPosXY[context], p);
 
 	// Set Z-test, log or no log;
-	if (conf.settings().no_logz)
-	{
-		g_vdepth = Vector(255.0 / 256.0f,  255.0 / 65536.0f, 255.0f / (65535.0f * 256.0f), 1.0f / (65536.0f * 65536.0f));
-		vlogz = Vector(1.0f, 0.0f, 0.0f, 0.0f);
+	if (conf.settings().no_logz) {
+       		g_vdepth = float4( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
+		vlogz = float4( 1.0f, 0.0f, 0.0f, 0.0f);
 	}
-	else
-	{
-		g_vdepth = Vector(256.0f * 65536.0f, 65536.0f, 256.0f, 65536.0f * 65536.0f);
-		vlogz = Vector(0.0f, 1.0f, 0.0f, 0.0f);
+	else {
+		g_vdepth = float4( 256.0f*65536.0f, 65536.0f, 256.0f, 65536.0f*65536.0f);	
+		vlogz = float4( 0.0f, 1.0f, 0.0f, 0.0f);
 	}
 
 	p = cgGetNamedParameter(prog, "g_fZ");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
-	{
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE ) {
 		cgGLSetParameter4fv(p, g_vdepth);
 
 		p = cgGetNamedParameter(prog, "g_fZMin"); // Switch to flat-z when needed
-
-		if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
-		{
-			//ZZLog::Error_Log("Use flat-z");
-			cgGLSetParameter4fv(p, vlogz);
+		if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )  {
+			//ZZLog::Error_Log("Use flat-z\n");
+			cgGLSetParameter4fv(p, vlogz);			
 		}
 		else
-		{
-			if (!outdated_shaders)
-			{
-				outdated_shaders = true;
-				ZZLog::Error_Log("Shader file version is outdated! Only log-Z is possible.");
-			}
-		}
+			ZZLog::Error_Log("Shader file version is outdated! Only log-Z is possible.");
 	}
 
-	Vector vnorm = Vector(g_filog32, 0, 0, 0);
-
+	float4 vnorm = float4(g_filog32, 0, 0,0);
 	p = cgGetNamedParameter(prog, "g_fZNorm");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )
 		cgGLSetParameter4fv(p, vnorm);
 
 	p = cgGetNamedParameter(prog, "g_fBilinear");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
-		cgGLSetParameter4fv(p, Vector(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f));
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(-0.2f, -0.65f, 0.9f, 1.0f / 32767.0f ));
 
 	p = cgGetNamedParameter(prog, "g_fZBias");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
-		cgGLSetParameter4fv(p, Vector(1.0f / 256.0f, 1.0004f, 1, 0.5f));
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(1.0f/256.0f, 1.0004f, 1, 0.5f));
 
 	p = cgGetNamedParameter(prog, "g_fc0");
-
-	if (p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE)
-		cgGLSetParameter4fv(p, Vector(0, 1, 0.001f, 0.5f));
+	if( p != NULL && cgIsParameterUsed(p, prog) == CG_TRUE )
+		cgGLSetParameter4fv(p, float4(0,1, 0.001f, 0.5f));
 }
 
 #ifndef DEVBUILD
@@ -342,21 +481,21 @@ void SetupVertexProgramParameters(ZZshProgram prog, int context)
 	assert( (header) != NULL && (header)->index == (Index) ); \
 	prog = cgCreateProgram(g_cgcontext, CG_OBJECT, (char*)(s_lpShaderResources + (header)->offset), cgvProf, NULL, NULL); \
 	if( !cgIsProgram(prog) ) { \
-		ZZLog::Error_Log("Failed to load vs %d: \n%s.", Index, cgGetLastListing(g_cgcontext)); \
+		ZZLog::Error_Log("Failed to load vs %d: \n%s", Index, cgGetLastListing(g_cgcontext)); \
 		return false; \
 	} \
 	cgGLLoadProgram(prog); \
-	if( cgGetError() != CG_NO_ERROR ) ZZLog::Error_Log("failed to load program %d.", Index); \
+	if( cgGetError() != CG_NO_ERROR ) ZZLog::Error_Log("Failed to load program %d.", Index); \
 	SetupVertexProgramParameters(prog, !!(Index&SH_CONTEXT1));			\
 } \
- 
+
 #define LOAD_PS(Index, fragment) {  \
 	bLoadSuccess = true; \
 	assert( mapShaderResources.find(Index) != mapShaderResources.end() ); \
 	header = mapShaderResources[Index]; \
 	fragment.prog = cgCreateProgram(g_cgcontext, CG_OBJECT, (char*)(s_lpShaderResources + (header)->offset), cgfProf, NULL, NULL); \
 	if( !cgIsProgram(fragment.prog) ) { \
-		ZZLog::Error_Log("Failed to load ps %d: \n%s.", Index, cgGetLastListing(g_cgcontext)); \
+		ZZLog::Error_Log("Failed to load ps %d: \n%s", Index, cgGetLastListing(g_cgcontext)); \
 		return false; \
 	} \
 	cgGLLoadProgram(fragment.prog); \
@@ -366,70 +505,63 @@ void SetupVertexProgramParameters(ZZshProgram prog, int context)
 	} \
 	SetupFragmentProgramParameters(&fragment, !!(Index&SH_CONTEXT1), 0);  \
 } \
- 
-bool ZeroGS::LoadEffects()
+
+inline bool ZeroGS::LoadEffects()
 {
-	assert(s_lpShaderResources != NULL);
+	assert( s_lpShaderResources != NULL );
 
 	// process the header
 	u32 num = *(u32*)s_lpShaderResources;
-	int compressed_size = *(int*)(s_lpShaderResources + 4);
-	int real_size = *(int*)(s_lpShaderResources + 8);
+	int compressed_size = *(int*)(s_lpShaderResources+4);
+	int real_size = *(int*)(s_lpShaderResources+8);
 	int out;
 
 	char* pbuffer = (char*)malloc(real_size);
-	inf((char*)s_lpShaderResources + 12, &pbuffer[0], compressed_size, real_size, &out);
+	inf((char*)s_lpShaderResources+12, &pbuffer[0], compressed_size, real_size, &out);
 	assert(out == real_size);
 
 	s_lpShaderResources = (u8*)pbuffer;
 	SHADERHEADER* header = (SHADERHEADER*)s_lpShaderResources;
 
 	mapShaderResources.clear();
-
-	while (num-- > 0)
-	{
+	while(num-- > 0 ) {
 		mapShaderResources[header->index] = header;
 		++header;
 	}
 
 	// clear the textures
-	for (int i = 0; i < ARRAY_SIZE(ppsTexture); ++i)
-	{
+	for(u16 i = 0; i < ARRAY_SIZE(ppsTexture); ++i) {
 		SAFE_RELEASE_PROG(ppsTexture[i].prog);
 		ppsTexture[i].prog = NULL;
 	}
-
 #ifndef _DEBUG
 	memset(ppsTexture, 0, sizeof(ppsTexture));
-
 #endif
 
 	return true;
 }
 
 // called
-bool ZeroGS::LoadExtraEffects()
+bool ZZshLoadExtraEffects()
 {
 	SHADERHEADER* header;
 	bool bLoadSuccess = true;
 
 	const int vsshaders[4] = { SH_REGULARVS, SH_TEXTUREVS, SH_REGULARFOGVS, SH_TEXTUREFOGVS };
 
-	for (int i = 0; i < 4; ++i)
-	{
+	for(int i = 0; i < 4; ++i) {
 		LOAD_VS(vsshaders[i], pvs[2*i]);
 		LOAD_VS((vsshaders[i] | SH_CONTEXT1), pvs[2*i+1]);
 		//if( conf.mrtdepth ) {
-		LOAD_VS((vsshaders[i] | SH_WRITEDEPTH), pvs[2*i+8]);
-		LOAD_VS((vsshaders[i] | SH_WRITEDEPTH | SH_CONTEXT1), pvs[2*i+8+1]);
+			LOAD_VS((vsshaders[i] | SH_WRITEDEPTH), pvs[2*i+8]);
+			LOAD_VS((vsshaders[i] | SH_WRITEDEPTH | SH_CONTEXT1), pvs[2*i+8+1]);
 //		}
 //		else {
 //			pvs[2*i+8] = pvs[2*i+8+1] = NULL;
 //		}
 	}
-
+	
 	LOAD_VS(SH_BITBLTVS, pvsBitBlt.prog);
-
 	pvsBitBlt.sBitBltPos = cgGetNamedParameter(pvsBitBlt.prog, "g_fBitBltPos");
 	pvsBitBlt.sBitBltTex = cgGetNamedParameter(pvsBitBlt.prog, "g_fBitBltTex");
 	pvsBitBlt.fBitBltTrans = cgGetNamedParameter(pvsBitBlt.prog, "g_fBitBltTrans");
@@ -437,52 +569,40 @@ bool ZeroGS::LoadExtraEffects()
 	LOAD_PS(SH_REGULARPS, ppsRegular[0]);
 	LOAD_PS(SH_REGULARFOGPS, ppsRegular[1]);
 
-	if (conf.mrtdepth)
-	{
+	if( conf.mrtdepth ) {
 		LOAD_PS(SH_REGULARPS, ppsRegular[2]);
-
-		if (!bLoadSuccess)
+		if( !bLoadSuccess )
 			conf.mrtdepth = 0;
-
 		LOAD_PS(SH_REGULARFOGPS, ppsRegular[3]);
-
-		if (!bLoadSuccess)
+		if( !bLoadSuccess )
 			conf.mrtdepth = 0;
 	}
 
 	LOAD_PS(SH_BITBLTPS, ppsBitBlt[0]);
-
 	LOAD_PS(SH_BITBLTAAPS, ppsBitBlt[1]);
-
-	if (!bLoadSuccess)
-	{
+	if( !bLoadSuccess ) {
 		ZZLog::Error_Log("Failed to load BitBltAAPS, using BitBltPS.");
 		LOAD_PS(SH_BITBLTPS, ppsBitBlt[1]);
 	}
-
 	LOAD_PS(SH_BITBLTDEPTHPS, ppsBitBltDepth);
-
 	LOAD_PS(SH_CRTCTARGPS, ppsCRTCTarg[0]);
 	LOAD_PS(SH_CRTCTARGINTERPS, ppsCRTCTarg[1]);
-
+	
 	g_bCRTCBilinear = true;
 	LOAD_PS(SH_CRTCPS, ppsCRTC[0]);
-
-	if (!bLoadSuccess)
-	{
+	if( !bLoadSuccess ) {
 		// switch to simpler
 		g_bCRTCBilinear = false;
 		LOAD_PS(SH_CRTC_NEARESTPS, ppsCRTC[0]);
 		LOAD_PS(SH_CRTCINTER_NEARESTPS, ppsCRTC[0]);
 	}
-	else
-	{
+	else {
 		LOAD_PS(SH_CRTCINTERPS, ppsCRTC[1]);
 	}
 
-	if (!bLoadSuccess)
+	if( !bLoadSuccess )
 		ZZLog::Error_Log("Failed to create CRTC shaders.");
-
+	
 	LOAD_PS(SH_CRTC24PS, ppsCRTC24[0]);
 	LOAD_PS(SH_CRTC24INTERPS, ppsCRTC24[1]);
 	LOAD_PS(SH_ZEROPS, ppsOne);
@@ -493,105 +613,82 @@ bool ZeroGS::LoadExtraEffects()
 	return true;
 }
 
-FRAGMENTSHADER* ZeroGS::LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed)
+FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed)
 {
 	int texwrap;
-	assert(texfilter < NUM_FILTERS);
+	assert( texfilter < NUM_FILTERS );
 
-	if (g_nPixelShaderVer & SHADER_REDUCED) texfilter = 0;
+	if(g_nPixelShaderVer&SHADER_REDUCED)
+		texfilter = 0;
+	assert(!(g_nPixelShaderVer&SHADER_REDUCED) || !exactcolor);
 
-	assert(!(g_nPixelShaderVer & SHADER_REDUCED) || !exactcolor);
-
-	if (clamp.wms == clamp.wmt)
-	{
-		switch (clamp.wms)
-		{
-			case 0:
-				texwrap = TEXWRAP_REPEAT;
-				break;
-
-			case 1:
-				texwrap = TEXWRAP_CLAMP;
-				break;
-
-			case 2:
-				texwrap = TEXWRAP_CLAMP;
-				break;
-
-			default:
-				texwrap = TEXWRAP_REGION_REPEAT;
-				break;
+	if( clamp.wms == clamp.wmt ) {
+		switch( clamp.wms ) {
+			case 0: texwrap = TEXWRAP_REPEAT; break;
+			case 1: texwrap = TEXWRAP_CLAMP; break;
+			case 2: texwrap = TEXWRAP_CLAMP; break;
+			default: texwrap = TEXWRAP_REGION_REPEAT; break;
 		}
 	}
-	else if (clamp.wms == 3 || clamp.wmt == 3)
+	else if( clamp.wms==3||clamp.wmt==3)
 		texwrap = TEXWRAP_REGION_REPEAT;
 	else
 		texwrap = TEXWRAP_REPEAT_CLAMP;
 
 	int index = GET_SHADER_INDEX(type, texfilter, texwrap, fog, s_bWriteDepth, testaem, exactcolor, context, 0);
+	
+	assert( index < ARRAY_SIZE(ppsTexture) );
+	FRAGMENTSHADER* pf = ppsTexture+index;
+	
+	if( pbFailed != NULL ) *pbFailed = false;
 
-	assert(index < ARRAY_SIZE(ppsTexture));
+	if( pf->prog != NULL ) 
+		return pf;
 
-	FRAGMENTSHADER* pf = ppsTexture + index;
-
-	if (pbFailed != NULL) *pbFailed = false;
-
-	if (pf->prog != NULL) return pf;
-
-	if ((g_nPixelShaderVer & SHADER_ACCURATE) && mapShaderResources.find(index + NUM_SHADERS*SHADER_ACCURATE) != mapShaderResources.end())
-		index += NUM_SHADERS * SHADER_ACCURATE;
-
-	assert(mapShaderResources.find(index) != mapShaderResources.end());
+	if( (g_nPixelShaderVer & SHADER_ACCURATE) && mapShaderResources.find(index+NUM_SHADERS*SHADER_ACCURATE) != mapShaderResources.end() )
+		index += NUM_SHADERS*SHADER_ACCURATE;
 
+	assert( mapShaderResources.find(index) != mapShaderResources.end() );
 	SHADERHEADER* header = mapShaderResources[index];
+	if( header == NULL )
+		ZZLog::Error_Log("%d %d", index, g_nPixelShaderVer);
+	assert( header != NULL );
 
-	if (header == NULL) ZZLog::Error_Log("%d %d", index, g_nPixelShaderVer);
-
-	assert(header != NULL);
-
-	//ZZLog::Debug_Log("Shader:\n%s.", (char*)(s_lpShaderResources + (header)->offset));
+	//DEBUG_LOG("shader:\n%s\n", (char*)(s_lpShaderResources + (header)->offset));
 	pf->prog = cgCreateProgram(g_cgcontext, CG_OBJECT, (char*)(s_lpShaderResources + (header)->offset), cgfProf, NULL, NULL);
-
-	if (pf->prog != NULL && cgIsProgram(pf->prog) && cgGetError() == CG_NO_ERROR)
-	{
+	if( pf->prog != NULL && cgIsProgram(pf->prog) && cgGetError() == CG_NO_ERROR ) {
 		SetupFragmentProgramParameters(pf, context, type);
 		cgGLLoadProgram(pf->prog);
-
-		if (cgGetError() != CG_NO_ERROR)
-		{
+		if( cgGetError() != CG_NO_ERROR ) {
 //		  cgGLLoadProgram(pf->prog);
 //		  if( cgGetError() != CG_NO_ERROR ) {
-			ZZLog::Error_Log("Failed to load shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms + clamp.wmt);
-
-			if (pbFailed != NULL) *pbFailed = true;
-
-			return pf;
-
+				ZZLog::Error_Log("Failed to load shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+				if( pbFailed != NULL ) *pbFailed = true;
+				return pf;
 //		  }
 		}
 		return pf;
 	}
 
-	ZZLog::Error_Log("Failed to create shader %d,%d,%d,%d", type, fog, texfilter, 4*clamp.wms + clamp.wmt);
-
-	if (pbFailed != NULL) *pbFailed = true;
+	ZZLog::Error_Log("Failed to create shader %d,%d,%d,%d", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+	if( pbFailed != NULL ) *pbFailed = true;
 
 	return NULL;
 }
-
-#else // defined(ZEROGS_DEVBUILD)
+	
+#else // not RELEASE_TO_PUBLIC
 
 #define LOAD_VS(name, prog, shaderver) { \
 	prog = cgCreateProgramFromFile(g_cgcontext, CG_SOURCE, EFFECT_NAME, shaderver, name, args); \
 	if( !cgIsProgram(prog) ) { \
-		ZZLog::Error_Log("Failed to load vs %s: \n%s.", name, cgGetLastListing(g_cgcontext)); \
+		ZZLog::Error_Log("Failed to load vs %s: \n%s", name, cgGetLastListing(g_cgcontext)); \
 		return false; \
 	} \
 	cgGLLoadProgram(prog); \
-	if( cgGetError() != CG_NO_ERROR ) ZZLog::Error_Log("failed to load program %s.", name); \
+	if( cgGetError() != CG_NO_ERROR ) ZZLog::Error_Log("failed to load program %s", name); \
 	SetupVertexProgramParameters(prog, args[0]==context1); \
 } \
- 
+
 #ifdef _DEBUG
 #define SET_PSFILENAME(frag, name) frag.filename = name
 #else
@@ -602,35 +699,33 @@ FRAGMENTSHADER* ZeroGS::LoadShadeEffect(int type, int texfilter, int fog, int te
 	bLoadSuccess = true; \
 	fragment.prog = cgCreateProgramFromFile(g_cgcontext, CG_SOURCE, EFFECT_NAME, shaderver, name, args); \
 	if( !cgIsProgram(fragment.prog) ) { \
-		ZZLog::Error_Log("Failed to load ps %s: \n%s.", name, cgGetLastListing(g_cgcontext)); \
+		ZZLog::Error_Log("Failed to load ps %s: \n%s", name, cgGetLastListing(g_cgcontext)); \
 		return false; \
 	} \
 	cgGLLoadProgram(fragment.prog); \
 	if( cgGetError() != CG_NO_ERROR ) { \
-		ZZLog::Error_Log("failed to load program %s.", name);		   \
+		ZZLog::Error_Log("failed to load program %s", name);		   \
 		bLoadSuccess = false; \
 	} \
 	SetupFragmentProgramParameters(&fragment, args[0]==context1, 0);  \
 	SET_PSFILENAME(fragment, name); \
 } \
- 
-bool ZeroGS::LoadEffects()
+
+inline bool ZeroGS::LoadEffects()
 {
 	// clear the textures
-	for (int i = 0; i < ARRAY_SIZE(ppsTexture); ++i)
-	{
+	for(int i = 0; i < ARRAY_SIZE(ppsTexture); ++i) {
 		SAFE_RELEASE_PROG(ppsTexture[i].prog);
 	}
 
 #ifndef _DEBUG
 	memset(ppsTexture, 0, sizeof(ppsTexture));
-
 #endif
 
 	return true;
 }
 
-bool ZeroGS::LoadExtraEffects()
+bool ZZshLoadExtraEffects()
 {
 	const char* args[] = { NULL , NULL, NULL, NULL };
 	char context0[255], context1[255];
@@ -641,8 +736,7 @@ bool ZeroGS::LoadExtraEffects()
 
 	const char* pvsshaders[4] = { "RegularVS", "TextureVS", "RegularFogVS", "TextureFogVS" };
 
-	for (int i = 0; i < 4; ++i)
-	{
+	for(int i = 0; i < 4; ++i) {
 		args[0] = context0;
 		args[1] = NULL;
 		LOAD_VS(pvsshaders[i], pvs[2*i], cgvProf);
@@ -650,11 +744,11 @@ bool ZeroGS::LoadExtraEffects()
 		LOAD_VS(pvsshaders[i], pvs[2*i+1], cgvProf);
 
 		//if( conf.mrtdepth ) {
-		args[0] = context0;
-		args[1] = write_depth;
-		LOAD_VS(pvsshaders[i], pvs[2*i+8], cgvProf);
-		args[0] = context1;
-		LOAD_VS(pvsshaders[i], pvs[2*i+8+1], cgvProf);
+			args[0] = context0;
+			args[1] = write_depth;
+			LOAD_VS(pvsshaders[i], pvs[2*i+8], cgvProf);
+			args[0] = context1;
+			LOAD_VS(pvsshaders[i], pvs[2*i+8+1], cgvProf);
 //		}
 //		else {
 //			pvs[2*i+8] = pvs[2*i+8+1] = NULL;
@@ -662,7 +756,6 @@ bool ZeroGS::LoadExtraEffects()
 	}
 
 	args[0] = context0;
-
 	args[1] = NULL;
 	LOAD_VS("BitBltVS", pvsBitBlt.prog, cgvProf);
 	pvsBitBlt.sBitBltPos = cgGetNamedParameter(pvsBitBlt.prog, "g_fBitBltPos");
@@ -672,142 +765,116 @@ bool ZeroGS::LoadExtraEffects()
 	LOAD_PS("RegularPS", ppsRegular[0], cgfProf);
 	LOAD_PS("RegularFogPS", ppsRegular[1], cgfProf);
 
-	if (conf.mrtdepth)
-	{
+	if( conf.mrtdepth ) {
 		args[0] = context0;
 		args[1] = write_depth;
 		LOAD_PS("RegularPS", ppsRegular[2], cgfProf);
-
-		if (!bLoadSuccess) conf.mrtdepth = 0;
-
+		if( !bLoadSuccess )
+			conf.mrtdepth = 0;
 		LOAD_PS("RegularFogPS", ppsRegular[3], cgfProf);
-
-		if (!bLoadSuccess) conf.mrtdepth = 0;
+		if( !bLoadSuccess )
+			conf.mrtdepth = 0;
 	}
 
 	LOAD_PS("BitBltPS", ppsBitBlt[0], cgfProf);
 	LOAD_PS("BitBltAAPS", ppsBitBlt[1], cgfProf);
-
-	if (!bLoadSuccess)
-	{
+	if( !bLoadSuccess ) {
 		ZZLog::Error_Log("Failed to load BitBltAAPS, using BitBltPS.");
 		LOAD_PS("BitBltPS", ppsBitBlt[1], cgfProf);
 	}
 
 	LOAD_PS("BitBltDepthPS", ppsBitBltDepth, cgfProf);
-	LOAD_PS("CRTCTargPS", ppsCRTCTarg[0], cgfProf);
+	LOAD_PS("CRTCTargPS", ppsCRTCTarg[0], cgfProf); 
 	LOAD_PS("CRTCTargInterPS", ppsCRTCTarg[1], cgfProf);
-
+	
 	g_bCRTCBilinear = true;
 	LOAD_PS("CRTCPS", ppsCRTC[0], cgfProf);
-
-	if (!bLoadSuccess)
-	{
+	if( !bLoadSuccess ) {
 		// switch to simpler
 		g_bCRTCBilinear = false;
 		LOAD_PS("CRTCPS_Nearest", ppsCRTC[0], cgfProf);
 		LOAD_PS("CRTCInterPS_Nearest", ppsCRTC[0], cgfProf);
 	}
-	else
-	{
+	else {
 		LOAD_PS("CRTCInterPS", ppsCRTC[1], cgfProf);
 	}
 
-	if (!bLoadSuccess) ZZLog::Error_Log("Failed to create CRTC shaders.");
-
-	LOAD_PS("CRTC24PS", ppsCRTC24[0], cgfProf);
-	LOAD_PS("CRTC24InterPS", ppsCRTC24[1], cgfProf);
+	if( !bLoadSuccess )
+		ZZLog::Error_Log("Failed to create CRTC shaders.");
+	
+	LOAD_PS("CRTC24PS", ppsCRTC24[0], cgfProf); LOAD_PS("CRTC24InterPS", ppsCRTC24[1], cgfProf);
 	LOAD_PS("ZeroPS", ppsOne, cgfProf);
 	LOAD_PS("BaseTexturePS", ppsBaseTexture, cgfProf);
 	LOAD_PS("Convert16to32PS", ppsConvert16to32, cgfProf);
 	LOAD_PS("Convert32to16PS", ppsConvert32to16, cgfProf);
 
 //	if( !conf.mrtdepth ) {
-//		ZZLog::Error_Log("Disabling MRT depth writing.");
-//		s_bWriteDepth = false;
+//		ZZLog::Error_Log("Disabling MRT depth writing,");
+//		s_bWriteDepth = FALSE;
 //	}
 
 	return true;
 }
 
-FRAGMENTSHADER* ZeroGS::LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed)
+FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed)
 {
 	int texwrap;
-
-	assert(texfilter < NUM_FILTERS);
+	
+	assert( texfilter < NUM_FILTERS );
 	//assert( g_nPixelShaderVer == SHADER_30 );
-
-	if (clamp.wms == clamp.wmt)
-	{
-		switch (clamp.wms)
-		{
-			case 0:
-				texwrap = TEXWRAP_REPEAT;
-				break;
-
-			case 1:
-				texwrap = TEXWRAP_CLAMP;
-				break;
-
-			case 2:
-				texwrap = TEXWRAP_CLAMP;
-				break;
-
+	if( clamp.wms == clamp.wmt ) {
+		switch( clamp.wms ) {
+			case 0: texwrap = TEXWRAP_REPEAT; break;
+			case 1: texwrap = TEXWRAP_CLAMP; break;
+			case 2: texwrap = TEXWRAP_CLAMP; break;
 			default:
-				texwrap = TEXWRAP_REGION_REPEAT;
-				break;
+				texwrap = TEXWRAP_REGION_REPEAT; break;
 		}
 	}
-	else if (clamp.wms == 3 || clamp.wmt == 3)
+	else if( clamp.wms==3||clamp.wmt==3)
 		texwrap = TEXWRAP_REGION_REPEAT;
 	else
 		texwrap = TEXWRAP_REPEAT_CLAMP;
 
 	int index = GET_SHADER_INDEX(type, texfilter, texwrap, fog, s_bWriteDepth, testaem, exactcolor, context, 0);
 
-	if (pbFailed != NULL) *pbFailed = false;
+	if( pbFailed != NULL ) *pbFailed = false;
 
-	FRAGMENTSHADER* pf = ppsTexture + index;
-
-	if (pf->prog != NULL) return pf;
+	FRAGMENTSHADER* pf = ppsTexture+index;
 
+	if( pf->prog != NULL ) 
+		return pf;
+	
 	pf->prog = LoadShaderFromType(EFFECT_DIR, EFFECT_NAME, type, texfilter, texwrap, fog, s_bWriteDepth, testaem, exactcolor, g_nPixelShaderVer, context);
 
-	if (pf->prog != NULL)
-	{
+	if( pf->prog != NULL ) {
 #ifdef _DEBUG
 		char str[255];
-		sprintf(str, "Texture%s%d_%sPS", fog ? "Fog" : "", texfilter, g_pTexTypes[type]);
+		sprintf(str, "Texture%s%d_%sPS", fog?"Fog":"", texfilter, g_pTexTypes[type]);
 		pf->filename = str;
 #endif
 		SetupFragmentProgramParameters(pf, context, type);
 		cgGLLoadProgram(pf->prog);
-
-		if (cgGetError() != CG_NO_ERROR)
-		{
+		if( cgGetError() != CG_NO_ERROR ) {
 			// try again
 //			cgGLLoadProgram(pf->prog);
 //			if( cgGetError() != CG_NO_ERROR ) {
-			ZZLog::Error_Log("Failed to load shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms + clamp.wmt);
-
-			if (pbFailed != NULL) *pbFailed = true;
-
-			//assert(0);
-			// NULL makes things crash
-			return pf;
-
+				ZZLog::Error_Log("Failed to load shader %d,%d,%d,%d", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+				if( pbFailed != NULL ) *pbFailed = true;
+				//assert(0);
+				// NULL makes things crash
+				return pf;
 //			}
 		}
-
 		return pf;
 	}
 
-	ZZLog::Error_Log("Failed to create shader %d,%d,%d,%d.", type, fog, texfilter, 4*clamp.wms + clamp.wmt);
-
-	if (pbFailed != NULL) *pbFailed = true;
+	ZZLog::Error_Log("Failed to create shader %d,%d,%d,%d", type, fog, texfilter, 4*clamp.wms+clamp.wmt);
+	if( pbFailed != NULL ) *pbFailed = true;
 
 	return NULL;
 }
 
-#endif // !defined(ZEROGS_DEVBUILD)
+#endif // RELEASE_TO_PUBLIC
 
+//#endif // NVIDIA_CG_API
diff --git a/plugins/zzogl-pg/opengl/ZZoglShaders.h b/plugins/zzogl-pg/opengl/ZZoglShaders.h
index 41b7511e51..a941702c5a 100644
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.h
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.h
@@ -55,16 +55,16 @@ inline bool ZZshActiveParameter(ZZshParameter param) {return (param !=NULL); }
 #endif					// end NVIDIA cg-toolkit API
 
 const static char* g_pPsTexWrap[] = { "-DREPEAT", "-DCLAMP", "-DREGION_REPEAT", NULL };
-const static char* g_pTexTypes[] = { "32", "tex32", "clut32", "tex32to16", "tex16to8h" };
 
 enum ZZshShaderType {ZZ_SH_ZERO, ZZ_SH_REGULAR, ZZ_SH_REGULAR_FOG, ZZ_SH_TEXTURE, ZZ_SH_TEXTURE_FOG, ZZ_SH_CRTC};
-// We have "compatible" shaders, as RegularFogVS and RegularFogPS, if we don't need to worry about incompatible shaders.
-// It's used only in GLSL mode. 
+// We have "compatible" shaders, such as RegularFogVS and RegularFogPS, if we don't need to worry about incompatible shaders.
+// It is used only in GLSL mode.
 
 // ------------------------- Variables -------------------------------
-extern int g_nPixelShaderVer;
-extern ZZshShaderLink pvs[16], g_vsprog, g_psprog;
-extern ZZshParameter g_vparamPosXY[2], g_fparamFogColor;
+
+extern int 		g_nPixelShaderVer;
+extern ZZshShaderLink 	pvs[16], g_vsprog, g_psprog;
+extern ZZshParameter 	g_vparamPosXY[2], g_fparamFogColor;
 
 #define MAX_ACTIVE_UNIFORMS 600
 #define MAX_ACTIVE_SHADERS 400
@@ -73,18 +73,18 @@ struct FRAGMENTSHADER
 {
 	FRAGMENTSHADER() : prog(sZero), Shader(0), sMemory(pZero), sFinal(pZero), sBitwiseANDX(pZero), sBitwiseANDY(pZero), sInterlace(pZero), sCLUT(pZero), sOneColor(pZero), sBitBltZ(pZero),
 		fTexAlpha2(pZero), fTexOffset(pZero), fTexDims(pZero), fTexBlock(pZero), fClampExts(pZero), fTexWrapMode(pZero),
-		fRealTexDims(pZero), fTestBlack(pZero), fPageOffset(pZero), fTexAlpha(pZero) {}
-		
-	ZZshShaderLink prog;						// it links to the FRAGMENTSHADER structure, for compatibility between GLSL and CG.
-	ZZshShader Shader;							// GLSL store shaders not as ready programs, but as shader compiled objects. VS and PS should be linked together to
-												// make a program.
+		fRealTexDims(pZero), fTestBlack(pZero), fPageOffset(pZero), fTexAlpha(pZero)  {}
+	
+	ZZshShaderLink prog;						// it links to the FRAGMENTSHADER structure, for compatibility between GLSL and CG
+	ZZshShader Shader;						// GLSL stores shaders not as ready programs, but as compiled shader objects. VS and PS should be linked together to
+									// make a program.
 	ZZshShaderType ShaderType;					// Not every PS and VS are used together, only compatible ones.
 
 	ZZshParameter sMemory, sFinal, sBitwiseANDX, sBitwiseANDY, sInterlace, sCLUT;
 	ZZshParameter sOneColor, sBitBltZ, sInvTexDims;
 	ZZshParameter fTexAlpha2, fTexOffset, fTexDims, fTexBlock, fClampExts, fTexWrapMode, fRealTexDims, fTestBlack, fPageOffset, fTexAlpha;
 
-	int ParametersStart, ParametersFinish;				// this is part of UniformsIndex array in which parameters of this shader asre stored. The last one is ParametersFinish-1
+	int ParametersStart, ParametersFinish;				// this is the part of the UniformsIndex array in which the parameters of this shader are stored. The last one is ParametersFinish-1
 
 #ifdef _DEBUG
 	string filename;
@@ -145,7 +145,7 @@ struct FRAGMENTSHADER
 		return false;
 	}
 
-	bool set_shader_const(Vector v, const char *name)
+	bool set_shader_const(float4 v, const char *name)
 	{
 		ZZshParameter p;
 
@@ -174,29 +174,17 @@ struct VERTEXSHADER
 	int ParametersStart, ParametersFinish;
 };
 
-namespace ZeroGS {
-	// Shaders variables
-	extern Vector g_vdepth;	
-	extern Vector vlogz;
+namespace ZeroGS { 
+	extern float4 g_vdepth;	
+	extern float4 vlogz;
 	extern VERTEXSHADER pvsBitBlt;
 	extern FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;					// ppsOne used to stop using shaders for draw
 	extern FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
-	bool LoadEffects();
-	bool LoadExtraEffects();
-	FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
 
-	// only sets a limited amount of state (for Update)
-	void SetTexClamping(int context, FRAGMENTSHADER* pfragment);
-	void SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force);
+	extern FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
+	extern FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
 }
 
-// ------------------------- Variables -------------------------------
-
-extern u8* s_lpShaderResources;
-extern ZZshProfile cgvProf, cgfProf;
-extern FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
-extern FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
-
 // ------------------------- Functions -------------------------------
 
 #ifdef NVIDIA_CG_API
@@ -208,7 +196,7 @@ inline bool ZZshExistProgram(ZZshShaderLink prog) {return (prog != NULL); };
 extern const char* ShaderCallerName;
 extern const char* ShaderHandleName;
 
-inline void SetShaderCaller(const char* Name) {
+inline void SetShaderCaller(const char* Name) {	
 	ShaderCallerName = Name;
 }
 
@@ -222,22 +210,23 @@ inline void ResetShaderCounters() {
 
 extern bool ZZshCheckProfilesSupport();
 extern bool ZZshStartUsingShaders();
+extern bool ZZshCreateOpenShadersFile();
 extern void ZZshGLDisableProfile();
 extern void ZZshGLEnableProfile();
+extern void ZZshSetParameter4fv(ZZshShaderLink prog, ZZshParameter param, const float* v, const char* name);
 extern void ZZshSetParameter4fv(ZZshParameter param, const float* v, const char* name);
+extern void ZZshSetParameter4fvWithRetry(ZZshParameter* param, ZZshShaderLink prog, const float* v, const char* name);
+extern void ZZshGLSetTextureParameter(ZZshShaderLink prog, ZZshParameter param, GLuint texobj, const char* name);
 extern void ZZshGLSetTextureParameter(ZZshParameter param, GLuint texobj, const char* name);
 extern void ZZshDefaultOneColor( FRAGMENTSHADER ptr );
-extern void ZZshSetVertexShader(ZZshShader prog);
-extern void ZZshSetPixelShader(ZZshShader prog);
+extern void ZZshSetVertexShader(ZZshShaderLink prog);
+extern void ZZshSetPixelShader(ZZshShaderLink prog);
+extern bool ZZshLoadExtraEffects();
 
-inline int GET_SHADER_INDEX(int type, int texfilter, int texwrap, int fog, int writedepth, int testaem, int exactcolor, int context, int ps)
-{
-	return type + texfilter*NUM_TYPES + NUM_FILTERS*NUM_TYPES*texwrap + NUM_TEXWRAPS*NUM_FILTERS*NUM_TYPES*(fog+2*writedepth+4*testaem+8*exactcolor+16*context+32*ps);
+extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
+
+namespace ZeroGS {
+	// only sets a limited amount of state (for Update)
+	void SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force);
 }
-	
-struct SHADERHEADER
-{
-	unsigned int index, offset, size; // if highest bit of index is set, pixel shader
-};
-
 #endif
diff --git a/plugins/zzogl-pg/opengl/targets.cpp b/plugins/zzogl-pg/opengl/targets.cpp
index 2624ac145f..65b7d2f243 100644
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@@ -122,22 +122,22 @@ inline void FillOnlyStencilBuffer()
 
 // used for transformation from vertex position in GS window.coords (I hope)
 // to view coordinates (in range 0, 1).
-inline Vector ZeroGS::CRenderTarget::DefaultBitBltPos()
+inline float4 ZeroGS::CRenderTarget::DefaultBitBltPos()
 {
-	Vector v = Vector(1, -1, 0.5f / (float)RW(fbw), 0.5f / (float)RH(fbh));
+	float4 v = float4(1, -1, 0.5f / (float)RW(fbw), 0.5f / (float)RH(fbh));
 	v *= 1.0f / 32767.0f;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_sBitBltPos");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_sBitBltPos");
 	return v;
 }
 
 // Used to transform texture coordinates from GS (when 0,0 is upper left) to
 // OpenGL (0,0 - lower left).
-inline Vector ZeroGS::CRenderTarget::DefaultBitBltTex()
+inline float4 ZeroGS::CRenderTarget::DefaultBitBltTex()
 {
 	// I really sure that -0.5 is correct, because OpenGL have no half-offset
 	// issue, DirectX known for.
-	Vector v = Vector(1, -1, 0.5f / (float)RW(fbw), -0.5f / (float)RH(fbh));
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_sBitBltTex");
+	float4 v = float4(1, -1, 0.5f / (float)RW(fbw), -0.5f / (float)RH(fbh));
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_sBitBltTex");
 	return v;
 }
 
@@ -222,7 +222,7 @@ void ZeroGS::CRenderTarget::SetTarget(int fbplocal, const Rect2& scissor, int co
 
 	if (fbplocal != fbp)
 	{
-		Vector v;
+		float4 v;
 
 		// will be rendering to a subregion
 		u32 bpp = PSMT_ISHALF(psm) ? 2 : 4;
@@ -401,7 +401,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 	((CDepthTarget*)pdepth)->SetDepthStencilSurface();
 
 	SetShaderCaller("CRenderTarget::Update");
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 
 	CRenderTargetMngr::MAPTARGETS::iterator ittarg;
 
@@ -432,7 +432,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 
 	if (nUpdateTarg)
 	{
-		ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ittarg->second->ptex, "BaseTexture.final");
+		ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ittarg->second->ptex, "BaseTexture.final");
 
 		//assert( ittarg->second->fbw == fbw );
 		int offset = (fbp - ittarg->second->fbp) * 64 / fbw;
@@ -445,7 +445,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 		v.z = 0.25f;
 		v.w = (float)RH(offset) + 0.25f;
 
-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
 
 //		v = DefaultBitBltTex(); Maybe?
 		ZZshDefaultOneColor ( ppsBaseTexture );
@@ -472,14 +472,14 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 		// Fix in r133 -- FFX movies and Gust backgrounds!
 		//SetTexVariablesInt(0, 0*(AA.x || AA.y) ? 2 : 0, texframe, false, &ppsBitBlt[!!s_AAx], 1);
 		SetTexVariablesInt(0, 0, texframe, false, &ppsBitBlt[bit_idx], 1);
-		ZZshGLSetTextureParameter(ppsBitBlt[bit_idx].sMemory, vb[0].pmemtarg->ptex->tex, "BitBlt.memory");
+		ZZshGLSetTextureParameter(ppsBitBlt[bit_idx].prog, ppsBitBlt[bit_idx].sMemory, vb[0].pmemtarg->ptex->tex, "BitBlt.memory");
 
-		v = Vector(1, 1, 0.0f, 0.0f);
-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+		v = float4(1, 1, 0.0f, 0.0f);
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
 
 		v.x = 1;
 		v.y = 2;
-		ZZshSetParameter4fv(ppsBitBlt[bit_idx].sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(ppsBitBlt[bit_idx].prog, ppsBitBlt[bit_idx].sOneColor, v, "g_fOneColor");
 
 		assert(ptex != 0);
 
@@ -536,26 +536,26 @@ void ZeroGS::CRenderTarget::ConvertTo32()
 	SetShaderCaller("CRenderTarget::ConvertTo32");
 
 	// tex coords, test ffx bikanel island when changing these
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 	v = DefaultBitBltTex();
 
 	v.x = (float)RW(16);
 	v.y = (float)RH(16);
 	v.z = -(float)RW(fbw);
 	v.w = (float)RH(8);
-	ZZshSetParameter4fv(ppsConvert16to32.fTexOffset, v, "g_fTexOffset");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fTexOffset, v, "g_fTexOffset");
 
 	v.x = (float)RW(8);
 	v.y = 0;
 	v.z = 0;
 	v.w = 0.25f;
-	ZZshSetParameter4fv(ppsConvert16to32.fPageOffset, v, "g_fPageOffset");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fPageOffset, v, "g_fPageOffset");
 
 	v.x = (float)RW(2 * fbw);
 	v.y = (float)RH(fbh);
 	v.z = 0;
 	v.w = 0.0001f * (float)RH(fbh);
-	ZZshSetParameter4fv(ppsConvert16to32.fTexDims, v, "g_fTexDims");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fTexDims, v, "g_fTexDims");
 
 //	v.x = 0;
 //	ZZshSetParameter4fv(ppsConvert16to32.fTexBlock, v, "g_fTexBlock");
@@ -568,7 +568,7 @@ void ZeroGS::CRenderTarget::ConvertTo32()
 	ZeroGS::ResetRenderTarget(1);
 
 	BindToSample(&ptex);
-	ZZshGLSetTextureParameter(ppsConvert16to32.sFinal, ptex, "Convert 16 to 32.Final");
+	ZZshGLSetTextureParameter(ppsConvert16to32.prog, ppsConvert16to32.sFinal, ptex, "Convert 16 to 32.Final");
 
 	fbh /= 2; // have 16 bit surfaces are usually 2x higher
 	SetViewport();
@@ -640,26 +640,26 @@ void ZeroGS::CRenderTarget::ConvertTo16()
 	SetShaderCaller("CRenderTarget::ConvertTo16");
 
 	// tex coords, test ffx bikanel island when changing these
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 	v = DefaultBitBltTex();
 
 	v.x = 16.0f / (float)fbw;
 	v.y = 8.0f / (float)fbh;
 	v.z = 0.5f * v.x;
 	v.w = 0.5f * v.y;
-	ZZshSetParameter4fv(ppsConvert32to16.fTexOffset, v, "g_fTexOffset");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fTexOffset, v, "g_fTexOffset");
 
 	v.x = 256.0f / 255.0f;
 	v.y = 256.0f / 255.0f;
 	v.z = 0.05f / 256.0f;
 	v.w = -0.001f / 256.0f;
-	ZZshSetParameter4fv(ppsConvert32to16.fPageOffset, v, "g_fPageOffset");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fPageOffset, v, "g_fPageOffset");
 
 	v.x = (float)RW(fbw);
 	v.y = (float)RH(2 * fbh);
 	v.z = 0;
 	v.w = -0.1f / RH(fbh);
-	ZZshSetParameter4fv(ppsConvert32to16.fTexDims, v, "g_fTexDims");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fTexDims, v, "g_fTexDims");
 
 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
 	SET_STREAM();
@@ -671,7 +671,7 @@ void ZeroGS::CRenderTarget::ConvertTo16()
 
 	BindToSample(&ptex);
 
-	ZZshGLSetTextureParameter(ppsConvert32to16.sFinal, ptex, "Convert 32 to 16");
+	ZZshGLSetTextureParameter(ppsConvert32to16.prog, ppsConvert32to16.sFinal, ptex, "Convert 32 to 16");
 
 //	fbh *= 2; // have 16 bit surfaces are usually 2x higher
 
@@ -748,22 +748,22 @@ void ZeroGS::CRenderTarget::_CreateFeedback()
 	ResetRenderTarget(1);
 
 	// tex coords, test ffx bikanel island when changing these
-	/*	Vector v = DefaultBitBltPos();
-		v = Vector ((float)(RW(fbw+4)), (float)(RH(fbh+4)), +0.25f, -0.25f);
-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "BitBltTex");*/
+	/*	float4 v = DefaultBitBltPos();
+		v = float4 ((float)(RW(fbw+4)), (float)(RH(fbh+4)), +0.25f, -0.25f);
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "BitBltTex");*/
 
 	// tex coords, test ffx bikanel island when changing these
 
-//	Vector v = Vector(1, -1, 0.5f / (fbw << AA.x), 0.5f / (fbh << AA.y));
+//	float4 v = float4(1, -1, 0.5f / (fbw << AA.x), 0.5f / (fbh << AA.y));
 //	v *= 1/32767.0f;
 //	cgGLSetParameter4fv(pvsBitBlt.sBitBltPos, v);
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 
 	v.x = (float)(RW(fbw));
 	v.y = (float)(RH(fbh));
 	v.z = 0.0f;
 	v.w = 0.0f;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "BitBlt.Feedback");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "BitBlt.Feedback");
 	ZZshDefaultOneColor(ppsBaseTexture);
 
 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
@@ -773,7 +773,7 @@ void ZeroGS::CRenderTarget::_CreateFeedback()
 	glBindTexture(GL_TEXTURE_RECTANGLE_NV, ptex);
 	GL_REPORT_ERRORD();
 
-	ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ptex, "BaseTexture.Feedback");
+	ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ptex, "BaseTexture.Feedback");
 
 	SetViewport();
 
@@ -976,9 +976,9 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)
 
 	// write color and zero out stencil buf, always 0 context!
 	SetTexVariablesInt(0, 0, texframe, false, &ppsBitBltDepth, 1);
-	ZZshGLSetTextureParameter(ppsBitBltDepth.sMemory, vb[0].pmemtarg->ptex->tex, "BitBltDepth");
+	ZZshGLSetTextureParameter(ppsBitBltDepth.prog, ppsBitBltDepth.sMemory, vb[0].pmemtarg->ptex->tex, "BitBltDepth");
 
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 
 	v = DefaultBitBltTex();
 
@@ -986,9 +986,9 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)
 	v.y = 2;
 	v.z = PSMT_IS16Z(psm) ? 1.0f : 0.0f;
 	v.w = g_filog32;
-	ZZshSetParameter4fv(ppsBitBltDepth.sOneColor, v, "g_fOneColor");
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sOneColor, v, "g_fOneColor");
 
-	Vector vdepth = g_vdepth;
+	float4 vdepth = g_vdepth;
 
 	if (psm == PSMT24Z)
 	{
@@ -1001,7 +1001,7 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)
 
 	assert(ppsBitBltDepth.sBitBltZ != 0);
 
-	ZZshSetParameter4fv(ppsBitBltDepth.sBitBltZ, ((255.0f / 256.0f)*vdepth), "g_fBitBltZ");
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sBitBltZ, ((255.0f / 256.0f)*vdepth), "g_fBitBltZ");
 
 	assert(pdepth != 0);
 	//GLint w1 = 0;
diff --git a/plugins/zzogl-pg/opengl/targets.h b/plugins/zzogl-pg/opengl/targets.h
index e9ecc8dafb..ee65bfbce2 100644
--- a/plugins/zzogl-pg/opengl/targets.h
+++ b/plugins/zzogl-pg/opengl/targets.h
@@ -228,7 +228,6 @@ inline list<ZeroGS::CRenderTarget*> CreateTargetsList(int start, int end)
 	return listTargs;
 }
 
-extern Vector g_vdepth;
 extern int icurctx;
 extern GLuint vboRect;
 
diff --git a/plugins/zzogl-pg/opengl/zerogs.cpp b/plugins/zzogl-pg/opengl/zerogs.cpp
index bae3235064..88bda55557 100644
--- a/plugins/zzogl-pg/opengl/zerogs.cpp
+++ b/plugins/zzogl-pg/opengl/zerogs.cpp
@@ -29,7 +29,6 @@
 #include "Mem.h"
 #include "x86.h"
 #include "zerogs.h"
-#include "zpipe.h"
 #include "targets.h"
 #include "GLWin.h"
 #include "ZZoglShaders.h"
@@ -51,7 +50,6 @@ extern int g_nFrame, g_nRealFrame;
 //-------------------------- Variables
 
 primInfo *prim;
-ZZshProgram g_vsprog = 0, g_psprog = 0;							// 2 -- ZZ
 
 inline u32 FtoDW(float f) { return (*((u32*)&f)); }
 
@@ -82,7 +80,6 @@ PFNGLDRAWBUFFERSPROC glDrawBuffers = NULL;
 
 /////////////////////
 // graphics resources
-ZZshParameter g_vparamPosXY[2] = {0}, g_fparamFogColor = 0;
 
 bool s_bTexFlush = false;
 int s_nLastResolveReset = 0;
@@ -94,10 +91,8 @@ int nBackbufferWidth, nBackbufferHeight;									// ZZ
 
 namespace ZeroGS
 {
-Vector g_vdepth, vlogz;
-
-//       	= Vector( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
-//	Vector g_vdepth = Vector( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);
+//       	= float4( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
+//	float4 g_vdepth = float4( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);
 
 extern CRangeManager s_RangeMngr; // manages overwritten memory
 
@@ -341,7 +336,7 @@ void ZeroGS::DrawText(const char* pstr, int left, int top, u32 color)
 	FUNCLOG
 	ZZshGLDisableProfile();
 
-	Vector v;
+	float4 v;
 	v.SetColor(color);
 	glColor3f(v.z, v.y, v.x);
 	//glColor3f(((color >> 16) & 0xff) / 255.0f, ((color >> 8) & 0xff)/ 255.0f, (color & 0xff) / 255.0f);
@@ -490,19 +485,19 @@ void ZeroGS::RenderCustom(float fAlpha)
 	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
 
 	// tex coords
-	Vector v = Vector(1 / 32767.0f, 1 / 32767.0f, 0, 0);
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
+	float4 v = float4(1 / 32767.0f, 1 / 32767.0f, 0, 0);
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
 	v.x = (float)nLogoWidth;
 	v.y = (float)nLogoHeight;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
 
 	v.x = v.y = v.z = v.w = fAlpha;
-	ZZshSetParameter4fv(ppsBaseTexture.sOneColor, v, "g_fOneColor");
+	ZZshSetParameter4fv(ppsBaseTexture.prog, ppsBaseTexture.sOneColor, v, "g_fOneColor");
 
 	if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
 
 	// inside vhDCb[0]'s target area, so render that region only
-	ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ptexLogo, "Logo");
+	ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ptexLogo, "Logo");
 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
 
 	SET_STREAM();
@@ -781,7 +776,7 @@ void ZeroGS::SetFogColor(u32 fog)
 	ZeroGS::FlushBoth();
 
 	SetShaderCaller("SetFogColor");
-	Vector v;
+	float4 v;
 
 	// set it immediately
 	v.SetColor(gs.fogcol);
@@ -795,7 +790,7 @@ void ZeroGS::SetFogColor(GIFRegFOGCOL* fog)
 	FUNCLOG
 	
 	SetShaderCaller("SetFogColor");
-	Vector v;
+	float4 v;
 	
 	v.x = fog->FCR / 255.0f;
 	v.y = fog->FCG / 255.0f;
diff --git a/plugins/zzogl-pg/opengl/zerogs.h b/plugins/zzogl-pg/opengl/zerogs.h
index 6883704dcb..c9365593be 100644
--- a/plugins/zzogl-pg/opengl/zerogs.h
+++ b/plugins/zzogl-pg/opengl/zerogs.h
@@ -66,7 +66,10 @@ extern float g_fiGPU_TEXWIDTH;
 #define MASKDIVISOR		0							// Used for decrement bitwise mask texture size if 1024 is too big
 #define GPU_TEXMASKWIDTH	(1024 >> MASKDIVISOR)	// bitwise mask width for region repeat mode
 
+extern u32 ptexBlocks;		// holds information on block tiling. It's a texture number in OpenGL -- if 0 then such a texture
+extern u32 ptexConv16to32;	// does not exist. These textures should be created on start and released on finish.
 extern u32 ptexBilinearBlocks;
+extern u32 ptexConv32to16;
 
 // this is currently *not* used as a bool, in spite of its moniker --air
 // Actually, the only thing written to it is 1 or 0, which makes the (g_bSaveFlushedFrame & 0x80000000) check rather bizzare.
@@ -136,7 +139,7 @@ class CRenderTarget
 		int fbp, fbw, fbh, fbhCalc; // if fbp is negative, virtual target (not mapped to any real addr)
 		int start, end; // in bytes
 		u32 lastused;	// time stamp since last used
-		Vector vposxy;
+		float4 vposxy;
 
 		u32 fbm;
 		u16 status;
@@ -161,8 +164,8 @@ class CRenderTarget
 			TS_NeedConvert32 = 16,
 			TS_NeedConvert16 = 32,
 		};
-		inline Vector DefaultBitBltPos() ;
-		inline Vector DefaultBitBltTex() ;
+		inline float4 DefaultBitBltPos();
+		inline float4 DefaultBitBltTex();
 
 	private:
 		void _CreateFeedback();