GregMiscellaneous: Sync against trunk. (3768:3804)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3805 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-09-19 08:01:48 +00:00 · 2010-09-19 08:01:48 +00:00 · 0a832acabb
parent 28819c2098
commit 0a832acabb
36 changed files with 1176 additions and 1302 deletions
--- a/pcsx2/COP0.cpp
+++ b/pcsx2/COP0.cpp
@ -20,9 +20,10 @@
 u32 s_iLastCOP0Cycle = 0;
 u32 s_iLastPERFCycle[2] = { 0, 0 };

-__ri void UpdateCP0Status() {
-	//currently the 2 memory modes are not implemented. Given this function is called so much,
-	//it's commented out for now. Only the interrupt test is needed. (rama)
+// Updates the CPU's mode of operation (Kernel, Supervisor, or User mode).
+// Currently the different modes are not implemented.
+// Given this function is called so much, its body is commented out for now. (rama)
+__ri void cpuUpdateOperationMode() {

 	//u32 value = cpuRegs.CP0.n.Status.val;

@ -32,7 +33,6 @@ __ri void UpdateCP0Status() {
 	//} else { // User Mode
 	//	memSetUserMode();
 	//}
-	cpuTestHwInts();
 }

 void __fastcall WriteCP0Status(u32 value) {
--- a/pcsx2/Hw.cpp
+++ b/pcsx2/Hw.cpp
@ -50,8 +50,7 @@ void hwReset()
 {
 	hwInit();

-	memzero_ptr<Ps2MemSize::Hardware>( eeHw );
-	//memset(eeHw+0x2000, 0, 0x0000e000);
+	memzero( eeHw );

 	psHu32(SBUS_F260) = 0x1D000060;

@ -73,16 +72,16 @@ void hwReset()
 	ipuDmaReset();
 }

-__fi void intcInterrupt()
+__fi uint intcInterrupt()
 {
 	if ((psHu32(INTC_STAT)) == 0) {
 		//DevCon.Warning("*PCSX2*: intcInterrupt already cleared");
-        return;
+        return 0;
 	}
 	if ((psHu32(INTC_STAT) & psHu32(INTC_MASK)) == 0) 
 	{
 		//DevCon.Warning("*PCSX2*: No valid interrupt INTC_MASK: %x INTC_STAT: %x", psHu32(INTC_MASK), psHu32(INTC_STAT));
-		return;
+		return 0;
 	}

 	HW_LOG("intcInterrupt %x", psHu32(INTC_STAT) & psHu32(INTC_MASK));
@ -91,27 +90,29 @@ __fi void intcInterrupt()
 		counters[1].hold = rcntRcount(1);
 	}

-	cpuException(0x400, cpuRegs.branch);
+	//cpuException(0x400, cpuRegs.branch);
+	return 0x400;
 }

-__fi void dmacInterrupt()
+__fi uint dmacInterrupt()
 {
 	if( ((psHu16(DMAC_STAT + 2) & psHu16(DMAC_STAT)) == 0 ) &&
 		( psHu16(DMAC_STAT) & 0x8000) == 0 ) 
 	{
 		//DevCon.Warning("No valid DMAC interrupt MASK %x STAT %x", psHu16(DMAC_STAT+2), psHu16(DMAC_STAT));
-		return;
+		return 0;
 	}

-	if (!(dmacRegs.ctrl.DMAE) || psHu8(DMAC_ENABLER+2) == 1) 
+	if (!dmacRegs.ctrl.DMAE || psHu8(DMAC_ENABLER+2) == 1) 
 	{
 		//DevCon.Warning("DMAC Suspended or Disabled on interrupt");
-		return;
+		return 0;
 	}
 	HW_LOG("dmacInterrupt %x", (psHu16(DMAC_STAT + 2) & psHu16(DMAC_STAT) |
-								  psHu16(DMAC_STAT) & 0x8000));
+								psHu16(DMAC_STAT) & 0x8000));

-	cpuException(0x800, cpuRegs.branch);
+	//cpuException(0x800, cpuRegs.branch);
+	return 0x800;
 }

 void hwIntcIrq(int n)
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@ -55,9 +55,6 @@ int coded_block_pattern = 0;
 u8 indx4[16*16/2];
 __aligned16 decoder_t decoder;

-__aligned16 u8 _readbits[80];	//local buffer (ring buffer)
-u8* readbits = _readbits;		// always can decrement by one 1qw
-
 __fi void IPUProcessInterrupt()
 {
 	if (ipuRegs.ctrl.BUSY && g_BP.IFC) IPUWorker();
@ -96,8 +93,6 @@ void ReportIPU()
 	Console.WriteLn("g_decoder = 0x%x.", &decoder);
 	Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
 	Console.WriteLn(ipu_cmd.desc());
-	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
-		_readbits, readbits - _readbits);
 	Console.Newline();
 }

@ -114,15 +109,6 @@ void SaveStateBase::ipuFreeze()
 	Freeze(coded_block_pattern);
 	Freeze(decoder);
 	Freeze(ipu_cmd);
-	Freeze(_readbits);
-
-	int temp = readbits - _readbits;
-	Freeze(temp);
-
-	if (IsLoading())
-	{
-		readbits = _readbits;
-	}
 }

 void tIPU_CMD_IDEC::log() const
@ -213,21 +199,27 @@ __fi u32 ipuRead32(u32 mem)
 	switch (mem)
 	{
 		ipucase(IPU_CTRL): // IPU_CTRL
+		{
 			ipuRegs.ctrl.IFC = g_BP.IFC;
 			ipuRegs.ctrl.CBP = coded_block_pattern;

 			if (!ipuRegs.ctrl.BUSY)
 				IPU_LOG("read32: IPU_CTRL=0x%08X", ipuRegs.ctrl._u32);

-		return ipuRegs.ctrl._u32;
+			return ipuRegs.ctrl._u32;
+		}		

 		ipucase(IPU_BP): // IPU_BP
+		{
+			pxAssume(g_BP.FP <= 2);
+			
 			ipuRegs.ipubp = g_BP.BP & 0x7f;
 			ipuRegs.ipubp |= g_BP.IFC << 8;
-			ipuRegs.ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
+			ipuRegs.ipubp |= g_BP.FP << 16;

 			IPU_LOG("read32: IPU_BP=0x%08X", ipuRegs.ipubp);
-		return ipuRegs.ipubp;
+			return ipuRegs.ipubp;
+		}

 		default:
 			IPU_LOG("read32: Addr=0x%08X Value = 0x%08X", mem, psHu32(IPU_CMD + mem));
@ -283,9 +275,7 @@ void ipuSoftReset()
 	ipu_cmd.clear();
 	ipuRegs.cmd.BUSY = 0;

-	g_BP.BP = 0;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+	memzero(g_BP);
 }

 __fi bool ipuWrite32(u32 mem, u32 value)
@ -354,12 +344,11 @@ static void ipuBCLR(u32 val)
 {
 	ipu_fifo.in.clear();

+	memzero(g_BP);
 	g_BP.BP = val & 0x7F;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+
 	ipuRegs.ctrl.BUSY = 0;
 	ipuRegs.cmd.BUSY = 0;
-	memzero(_readbits);
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }

@ -370,7 +359,7 @@ static bool ipuIDEC(u32 val, bool resume)
 	if (!resume)
 	{
 		idec.log();
-		g_BP.BP += idec.FB;//skip FB bits
+		g_BP.Advance(idec.FB);

 	//from IPU_CTRL
 		ipuRegs.ctrl.PCT = I_TYPE; //Intra DECoding;)
@ -407,7 +396,7 @@ static __fi bool ipuBDEC(u32 val, bool resume)
 		bdec.log(s_bdec);
 		if (IsDebugBuild) s_bdec++;

-	g_BP.BP += bdec.FB;//skip FB bits
+		g_BP.Advance(bdec.FB);
 		decoder.coding_type			= I_TYPE;
 		decoder.mpeg1				= ipuRegs.ctrl.MP1;
 		decoder.q_scale_type		= ipuRegs.ctrl.QST;
@ -433,11 +422,7 @@ static bool __fastcall ipuVDEC(u32 val)
 	switch (ipu_cmd.pos[0])
 	{
 		case 0:
-			ipuRegs.cmd.DATA = 0;
-			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return false;
-
-			decoder.bitstream_bits = -16;
-			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
+			if (!bitstream_init()) return false;

 			switch ((val >> 26) & 3)
 			{
@ -459,17 +444,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				case 3://DMVector
 					ipuRegs.cmd.DATA = get_dmv();
 					break;
+
+				jNO_DEFAULT
 			}

-			g_BP.BP += (int)decoder.bitstream_bits + 16;
+			ipuRegs.cmd.DATA &= 0xFFFF;
+			ipuRegs.cmd.DATA |= 0x10000;

-			if ((int)g_BP.BP < 0)
-			{
-				g_BP.BP += 128;
-				ReorderBitstream();
-			}
-
-			ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
+			//ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);

 		case 1:
@ -479,14 +461,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				return false;
 			}

-			BigEndian(ipuRegs.top, ipuRegs.top);
+			ipuRegs.top = BigEndian(ipuRegs.top);

 			IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
 			        ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
 			        ((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
 			return true;

-			jNO_DEFAULT
+		jNO_DEFAULT
 	}

 	return false;
@ -496,7 +478,7 @@ static __fi bool ipuFDEC(u32 val)
 {
 	if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false;

-	BigEndian(ipuRegs.cmd.DATA, ipuRegs.cmd.DATA);
+	ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA);
 	ipuRegs.top = ipuRegs.cmd.DATA;

 	IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
@ -553,11 +535,10 @@ static bool ipuSETVQ(u32 val)
 		if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false;
 	}

-	IPU_LOG("SETVQ command.\nRead VQCLUT table from FIFO.");
-	IPU_LOG(
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	IPU_LOG("SETVQ command.   Read VQCLUT table from FIFO.\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
 	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
 	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
 	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
@ -723,148 +704,48 @@ __fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
 	Console.Error("IPU: VQ not implemented");
 }

-__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
-{
-	const u8	*s = (const u8*)&mb8;
-	s16	*d = (s16*)&mb16;
-	int i;
-	for (i = 0; i < 256; i++) *d++ = *s++;		//Y  bias	- 16
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cr bias	- 128
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cb bias	- 128
-}
-

 // --------------------------------------------------------------------------------------
 //  Buffer reader
 // --------------------------------------------------------------------------------------

-// move the readbits queue
-__fi void inc_readbits()
+__ri u32 UBITS(uint bits)
 {
-	readbits += 16;
-	if (readbits >= _readbits + 64)
-	{
-		// move back
-		*(u64*)(_readbits) = *(u64*)(_readbits + 64);
-		*(u64*)(_readbits + 8) = *(u64*)(_readbits + 72);
-		readbits = _readbits;
-	}
+	uint readpos8 = g_BP.BP/8;
+
+	uint result = BigEndian(*(u32*)( (u8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
+
+	return result;
 }

-// returns the pointer of readbits moved by 1 qword
-__fi u8* next_readbits()
+__ri s32 SBITS(uint bits)
 {
-	return readbits + 16;
-}
+	// Read an unaligned 32 bit value and then shift the bits up and then back down.

-// returns the pointer of readbits moved by 1 qword
-u8* prev_readbits()
-{
-	if (readbits < _readbits + 16) return _readbits + 48 - (readbits - _readbits);
+	uint readpos8 = g_BP.BP/8;

-	return readbits - 16;
-}
+	int result = BigEndian(*(s32*)( (s8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);

-void ReorderBitstream()
-{
-	readbits = prev_readbits();
-	g_BP.FP = 2;
-}
-
-// IPU has a 2qword internal buffer whose status is pointed by FP.
-// If FP is 1, there's 1 qword in buffer. Second qword is only loaded
-// incase there are less than 32bits available in the first qword.
-// \return Number of bits available (clamps at 16 bits)
-u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
-{
-	if (g_BP.FP == 0)
-	{
-		if (ipu_fifo.in.read(next_readbits()) == 0) return 0;
-
-		inc_readbits();
-		g_BP.FP = 1;
-	}
-
-	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
-	{
-		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
-	}
-
-	if (*(int*)pointer >= 128)
-	{
-		pxAssert(g_BP.FP >= 1);
-
-		if (g_BP.FP > 1) inc_readbits();
-
-		if (advance)
-		{
-			g_BP.FP--;
-			*pointer &= 127;
-		}
-	}
-
-	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
+	return result;
 }

 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits128(u8 *address, u32 advance)
+u8 getBits64(u8 *address, bool advance)
 {
-	u64 mask2;
-	u128 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(64)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask2 = 0xff >> shift;
-		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
-		mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);		
-
-		u128 notMask;
-		u128 data = *(u128*)(readpos + 1);
-		notMask.lo = ~mask.lo & data.lo;
-		notMask.hi = ~mask.hi & data.hi;
-		notMask.lo >>= 8 - shift;
-		notMask.lo |= (notMask.hi & (ULLONG_MAX >> (64 - shift))) << (64 - shift);
-		notMask.hi >>= 8 - shift;
-
-		mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift));
-		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
-		
-		notMask.lo |= mask.lo;
-		notMask.hi |= mask.hi;
-		*(u128*)address = notMask;
-	}
-	else
-	{
-		*(u128*)address = *(u128*)readpos;
-	}
-
-	if (advance) g_BP.BP += 128;
-
-	return 1;
-}
-
-// whenever reading fractions of bytes. The low bits always come from the next byte
-// while the high bits come from the current byte
-u8 __fastcall getBits64(u8 *address, u32 advance)
-{
-	register u64 mask = 0;
-	u8* readpos;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
-	{
-		mask = (0xff >> shift);
+		u64 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);

 		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
@ -874,89 +755,76 @@ u8 __fastcall getBits64(u8 *address, u32 advance)
 		*(u64*)address = *(u64*)readpos;
 	}

-	if (advance) g_BP.BP += 64;
+	if (advance) g_BP.Advance(64);

 	return 1;
 }

 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits32(u8 *address, u32 advance)
+__fi u8 getBits32(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(32)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 32) < 32) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
+	const u8* readpos = &g_BP.internal_qwc->_u8[g_BP.BP/8];
+	
+	if(uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		u32 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24);

 		*(u32*)address = ((~mask & *(u32*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u32*)readpos) << shift);
 	}
 	else
 	{
+		// Bit position-aligned -- no masking/shifting necessary
 		*(u32*)address = *(u32*)readpos;
 	}

-	if (advance) g_BP.BP += 32;
+	if (advance) g_BP.Advance(32);

 	return 1;
 }

-__fi u8 __fastcall getBits16(u8 *address, u32 advance)
+__fi u8 getBits16(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(16)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 16) < 16) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		mask = mask | (mask << 8);
-
 		*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
-			}
+	}
 	else
 	{
 		*(u16*)address = *(u16*)readpos;
-			}
+	}

-	if (advance) g_BP.BP += 16;
+	if (advance) g_BP.Advance(16);

 	return 1;
 }

-u8 __fastcall getBits8(u8 *address, u32 advance)
+u8 getBits8(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(8)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8)
-		return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
-			{
-		mask = (0xff >> shift);
+	{
+		uint mask = (0xff >> shift);
 		*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
-			}
+	}
 	else
 	{
 		*(u8*)address = *(u8*)readpos;
-		}
+	}

-	if (advance) g_BP.BP += 8;
+	if (advance) g_BP.Advance(8);

 	return 1;
 }
@ -983,7 +851,7 @@ void IPUCMD_WRITE(u32 val)

 		case SCE_IPU_VDEC:

-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);

 			// check if enough data in queue
 			if (ipuVDEC(val)) return;
@ -993,9 +861,11 @@ void IPUCMD_WRITE(u32 val)
 			break;

 		case SCE_IPU_FDEC:
-			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, FP %d, CHCR 0x%x",
-			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, g_BP.FP, ipu1dma.chcr._u32);
-			g_BP.BP += val & 0x3F;
+			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, CHCR 0x%x",
+			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, ipu1dma.chcr._u32);
+
+			g_BP.Advance(val & 0x3F);
+
 			if (ipuFDEC(val)) return;
 			ipuRegs.cmd.BUSY = 0x80000000;
 			ipuRegs.topbusy = 0x80000000;
@ -1009,7 +879,7 @@ void IPUCMD_WRITE(u32 val)
 		case SCE_IPU_SETIQ:
 			IPU_LOG("SETIQ command.");
 			if (val & 0x3f) IPU_LOG("Skip %d bits.", val & 0x3f);
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 			if (ipuSETIQ(val)) return;
 			break;

--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@ -67,11 +67,66 @@ union tIPU_CTRL {
 	void reset() { _u32 = 0; }
 };

-struct tIPU_BP {
-	u32 BP;		// Bit stream point
-	u16 IFC;	// Input FIFO counter
-	u8 FP;		// FIFO point
-	u8 bufferhasnew; // Always 0.
+struct __aligned16 tIPU_BP {
+	__aligned16 u128 internal_qwc[2];
+
+	u32 BP;		// Bit stream point (0 to 128*2)
+	u32 IFC;	// Input FIFO counter (8QWC) (0 to 8)
+	u32 FP;		// internal FIFO (2QWC) fill status (0 to 2)
+
+	__fi void Align()
+	{
+		BP = (BP + 7) & ~7;
+		Advance(0);
+	}
+
+	__fi void Advance(uint bits)
+	{
+		BP += bits;
+		pxAssume( BP <= 256 );
+
+		if (BP > 127)
+		{
+			BP -= 128;
+
+			if (FP == 2)
+			{
+				// When BP reaches 128 or more, we're reading data from the second quadword.  Shift that one
+				// to the front; a later FillBuffer() call reloads the second QWC (it's a manual ring buffer!)
+
+				CopyQWC(&internal_qwc[0], &internal_qwc[1]);
+				FP = 1;
+			}
+			else
+			{
+				// if FP == 1 then the buffer has been completely drained.
+				// if FP == 0 then an already-drained buffer is being advanced.
+				// In either case we just assign FP to 0.
+
+				FP = 0;
+			}
+		}
+	}
+
+	__fi bool FillBuffer(u32 bits)
+	{
+		while (FP < 2)
+		{
+			if (ipu_fifo.in.read(&internal_qwc[FP]) == 0)
+			{
+				// Here we *try* to fill the entire internal QWC buffer; however that may not necessarily
+				// be possible -- so if the fill fails we only return false when the quadword already
+				// buffered doesn't hold enough remaining bits to satisfy the request.
+
+				return ((FP!=0) && (BP + bits) <= 128);
+			}
+
+			++FP;
+		}
+
+		return true;
+	}
+
 	wxString desc() const
 	{
 		return wxsFormat(L"Ipu BP: bp = 0x%x, IFC = 0x%x, FP = 0x%x.", BP, IFC, FP);
@ -217,10 +272,9 @@ extern void IPUCMD_WRITE(u32 val);
 extern void ipuSoftReset();
 extern void IPUProcessInterrupt();

-extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
-extern u8 __fastcall getBits128(u8 *address, u32 advance);
-extern u8 __fastcall getBits64(u8 *address, u32 advance);
-extern u8 __fastcall getBits32(u8 *address, u32 advance);
-extern u8 __fastcall getBits16(u8 *address, u32 advance);
-extern u8 __fastcall getBits8(u8 *address, u32 advance);
+extern u8 getBits128(u8 *address, bool advance);
+extern u8 getBits64(u8 *address, bool advance);
+extern u8 getBits32(u8 *address, bool advance);
+extern u8 getBits16(u8 *address, bool advance);
+extern u8 getBits8(u8 *address, bool advance);

--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@ -19,7 +19,6 @@
 #include "IPU/IPUdma.h"
 #include "mpeg2lib/Mpeg.h"

-
 __aligned16 IPU_Fifo ipu_fifo;

 void IPU_Fifo::init()
@ -75,10 +74,7 @@ int IPU_Fifo_Input::write(u32* pMem, int size)

 	while (transsize-- > 0)
 	{
-		for (int i = 0; i <= 3; i++)
-		{
-			data[writepos + i] = pMem[i];
-		}
+		CopyQWC(&data[writepos], pMem);
 		writepos = (writepos + 4) & 31;
 		pMem += 4;
 	}
@ -86,118 +82,100 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
 	return firsttrans;
 }

-int IPU_Fifo_Output::write(const u32 *value, int size)
-{
-	int transsize, firsttrans;
-
-	if ((int)ipuRegs.ctrl.OFC >= 8) IPU0dma();
-
-	transsize = min(size, 8 - (int)ipuRegs.ctrl.OFC);
-	firsttrans = transsize;
-
-	while (transsize-- > 0)
-	{
-		for (int i = 0; i <= 3; i++)
-		{
-			data[writepos + i] = ((u32*)value)[i];
-		}
-		writepos = (writepos + 4) & 31;
-		value += 4;
-	}
-
-	ipuRegs.ctrl.OFC += firsttrans;
-	IPU0dma();
-
-	return firsttrans;
-}
-
 int IPU_Fifo_Input::read(void *value)
 {
 	// wait until enough data to ensure proper streaming.
-	if (g_BP.IFC < 4)
+	if (g_BP.IFC < 3)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
-			CPU_INT( DMAC_TO_IPU, 4 );
+			CPU_INT( DMAC_TO_IPU, 32 );
 		}
-		
+
 		if (g_BP.IFC == 0) return 0;
 		pxAssert(g_BP.IFC > 0);
 	}

-	// transfer 1 qword, split into two transfers
-	for (int i = 0; i <= 3; i++)
-	{
-		((u32*)value)[i] = data[readpos + i];
-		data[readpos + i] = 0;
-	}
+	CopyQWC(value, &data[readpos]);

 	readpos = (readpos + 4) & 31;
 	g_BP.IFC--;
 	return 1;
 }

-void IPU_Fifo_Output::_readsingle(void *value)
+int IPU_Fifo_Output::write(const u32 *value, uint size)
 {
-	// transfer 1 qword, split into two transfers
-	for (int i = 0; i <= 3; i++)
+	pxAssumeMsg(size>0, "Invalid size==0 when calling IPU_Fifo_Output::write");
+
+	uint origsize = size;
+	do {
+		IPU0dma();
+	
+		uint transsize = min(size, 8 - (uint)ipuRegs.ctrl.OFC);
+		if(!transsize) break;
+
+		ipuRegs.ctrl.OFC = transsize;
+		size -= transsize;
+		while (transsize > 0)
+		{
+			CopyQWC(&data[writepos], value);
+			writepos = (writepos + 4) & 31;
+			value += 4;
+			--transsize;
+		}
+	} while(true);
+
+	return origsize - size;
+
+#if 0
+	if (ipuRegs.ctrl.OFC >= 8) IPU0dma();
+
+	uint transsize = min(size, 8 - (uint)ipuRegs.ctrl.OFC);
+	uint firsttrans = transsize;
+
+	while (transsize > 0)
 	{
-		((u32*)value)[i] = data[readpos + i];
-		data[readpos + i] = 0;
+		CopyQWC(&data[writepos], value);
+		writepos = (writepos + 4) & 31;
+		value += 4;
+		--transsize;
 	}
-	readpos = (readpos + 4) & 31;
+
+	ipuRegs.ctrl.OFC += firsttrans;
+	IPU0dma();
+
+	return firsttrans;
+#endif
 }

-void IPU_Fifo_Output::read(void *value, int size)
+void IPU_Fifo_Output::read(void *value, uint size)
 {
+	pxAssume(ipuRegs.ctrl.OFC >= size);
 	ipuRegs.ctrl.OFC -= size;
+	
+	// Zeroing the read data is not needed, since the ringbuffer design will never read back
+	// the zero'd data anyway. --air
+
+	//__m128 zeroreg = _mm_setzero_ps();
 	while (size > 0)
 	{
-		_readsingle(value);
-		value = (u32*)value + 4;
-		size--;
+		CopyQWC(value, &data[readpos]);
+		//_mm_store_ps((float*)&data[readpos], zeroreg);
+
+		readpos = (readpos + 4) & 31;
+		value = (u128*)value + 1;
+		--size;
 	}
 }

-void IPU_Fifo_Output::readsingle(void *value)
-{
-	if (ipuRegs.ctrl.OFC > 0)
-	{
-		ipuRegs.ctrl.OFC--;
-		_readsingle(value);
-	}
-}
-
-__fi bool decoder_t::ReadIpuData(u128* out)
-{
-	if(ipu0_data == 0)
-	{
-		IPU_LOG( "ReadFIFO/IPUout -> (fifo empty/no data available)" );
-		return false;
-	}
-
-	CopyQWC(out, GetIpuDataPtr());
-
-	--ipu0_data;
-	++ipu0_idx;
-
-	IPU_LOG( "ReadFIFO/IPUout -> %ls", out->ToString().c_str() );
-
-	return true;
-}
-
 void __fastcall ReadFIFO_IPUout(mem128_t* out)
 {
-	// FIXME!  When ReadIpuData() doesn't succeed (returns false), the EE should probably stall
-	// until a value becomes available.  This isn't exactly easy to do since the virtualized EE
-	// in PCSX2 *has* to be running in order for the IPU DMA to upload new input data to allow
-	// IPUout's FIFO to fill.  Thus if we implement an EE stall, PCSX2 deadlocks.  Grr.  --air
+	if (!pxAssertDev( ipuRegs.ctrl.OFC > 0, "Attempted read from IPUout's FIFO, but the FIFO is empty!" )) return;
+	ipu_fifo.out.read(out, 1);

-	if (decoder.ReadIpuData(out))
-	{
-		ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
-	}
+	// Games should always check the fifo before reading from it -- so if the FIFO has no data
+	// it's either some glitchy game or a bug in pcsx2.
 }

 void __fastcall WriteFIFO_IPUin(const mem128_t* value)
--- a/pcsx2/IPU/IPU_Fifo.h
+++ b/pcsx2/IPU/IPU_Fifo.h
@ -37,13 +37,10 @@ struct IPU_Fifo_Output
 	int readpos, writepos;

 	// returns number of qw read
-	int write(const u32 * value, int size);
-	void read(void *value,int size);
-	void readsingle(void *value);
+	int write(const u32 * value, uint size);
+	void read(void *value, uint size);
 	void clear();
 	wxString desc() const;
-
-	void _readsingle(void *value);
 };

 struct IPU_Fifo
--- a/pcsx2/IPU/IPUdma.cpp
+++ b/pcsx2/IPU/IPUdma.cpp
@ -189,7 +189,7 @@ int IPU1dma()
 			{
 				if(!WaitGSPaths())
 				{ // legacy WaitGSPaths() for now
-					IPU_INT_TO(4); //Give it a short wait.
+					IPU_INT_TO(32); //Give it a short wait.
 					return totalqwc;
 				}
 				IPU_LOG("Processing Normal QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@ -203,7 +203,7 @@ int IPU1dma()
 				{
 					if(!WaitGSPaths())
 					{ // legacy WaitGSPaths() for now
-						IPU_INT_TO(4); //Give it a short wait.
+						IPU_INT_TO(32); //Give it a short wait.
 						return totalqwc;
 					}
 					IPU_LOG("Processing Chain QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@ -283,7 +283,7 @@ int IPU1dma()

 					if(!WaitGSPaths() && ipu1dma.qwc > 0)
 					{ // legacy WaitGSPaths() for now
-						IPU_INT_TO(4); //Give it a short wait.
+						IPU_INT_TO(32); //Give it a short wait.
 						return totalqwc;
 					}
 					IPU_LOG("Processing Start Chain QWC left %x Finished %d In Progress %d", ipu1dma.qwc, IPU1Status.DMAFinished, IPU1Status.InProgress);
@ -312,8 +312,9 @@ int IPU1dma()

 int IPU0dma()
 {
+	if(!ipuRegs.ctrl.OFC) return 0;
+
 	int readsize;
-	static int totalsize = 0;
 	tDMA_TAG* pMem;

 	if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0))
@ -329,7 +330,6 @@ int IPU0dma()
 	pMem = dmaGetAddr(ipu0dma.madr, true);

 	readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC);
-	totalsize+=readsize;
 	ipu_fifo.out.read(pMem, readsize);

 	ipu0dma.madr += readsize << 4;
@ -363,7 +363,6 @@ int IPU0dma()
 		//This broke vids in Digital Devil Saga
 		//Note that interrupting based on totalsize is just guessing..
 		IPU_INT_FROM( readsize * BIAS );
-		totalsize = 0;
 	}

 	return readsize;
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@ -47,10 +47,14 @@ const int non_linear_quantizer_scale [] =
 	into 1st slot is copied to the 2nd slot. Which will later be copied
 	back to the 1st slot when 128bits have been read.
 */
-extern void ReorderBitstream();
 const DCTtab * tab;
 int mbaCount = 0;

+int bitstream_init ()
+{
+	return g_BP.FillBuffer(32);
+}
+
 int get_macroblock_modes()
 {
 	int macroblock_modes;
@ -221,9 +225,7 @@ int __fi get_motion_delta(const int f_code)

 int __fi get_dmv()
 {
-	const DMVtab * tab;
-
-	tab = DMV_2 + UBITS(2);
+	const DMVtab* tab = DMV_2 + UBITS(2);
 	DUMPBITS(tab->len);
 	return tab->dmv;
 }
@ -239,22 +241,21 @@ int get_macroblock_address_increment()
 	else if (code >= 768)
 		mba = MBA.mba11 + (UBITS(11) - 24);
 	else switch (UBITS(11))
-		{
+	{
+		case 8:		/* macroblock_escape */
+			DUMPBITS(11);
+			return 0x23;

-			case 8:		/* macroblock_escape */
+		case 15:	/* macroblock_stuffing (MPEG1 only) */
+			if (decoder.mpeg1)
+			{
 				DUMPBITS(11);
-				return 0x23;
+				return 0x22;
+			}

-			case 15:	/* macroblock_stuffing (MPEG1 only) */
-				if (decoder.mpeg1)
-				{
-					DUMPBITS(11);
-					return 0x22;
-				}
-
-			default:
-				return 0;//error
-		}
+		default:
+			return 0;//error
+	}

 	DUMPBITS(mba->len);

@ -336,11 +337,8 @@ do {							\
 	val = (((s32)val) >> 31) ^ 2047;			\
 } while (0)

-static __fi bool get_intra_block()
+static bool get_intra_block()
 {
-	int i;
-	int j;
-	int val;
 	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
 	const u8 (&quant_matrix)[64] = decoder.iq;
 	int quantizer_scale = decoder.quantizer_scale;
@ -348,7 +346,7 @@ static __fi bool get_intra_block()
 	u16 code; 

 	/* decode AC coefficients */
-  for (i=1 + ipu_cmd.pos[4]; ; i++)
+  for (int i=1 + ipu_cmd.pos[4]; ; i++)
  {
 	  switch (ipu_cmd.pos[5])
 	  {
@ -427,60 +425,65 @@ static __fi bool get_intra_block()
 			return true;
 		}
 		
-		i+= tab->run == 65 ? GETBITS(6) : tab->run;
+		i += (tab->run == 65) ? GETBITS(6) : tab->run;
 		if (i >= 64)
 		{
 			ipu_cmd.pos[4] = 0;
 			return true;
 		}
+
 	  case 1:
-		if (!GETWORD())
-		{
-		  ipu_cmd.pos[4] = i - 1;
-		  ipu_cmd.pos[5] = 1;
-		  return false;
+	  {
+			if (!GETWORD())
+			{
+				ipu_cmd.pos[4] = i - 1;
+				ipu_cmd.pos[5] = 1;
+				return false;
+			}
+
+			uint j = scan[i];
+			int val;
+
+			if (tab->run==65) /* escape */
+			{
+				if(!decoder.mpeg1)
+				{
+				  val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
+				  DUMPBITS(12);
+				}
+				else
+				{
+				  val = SBITS(8);
+				  DUMPBITS(8);
+
+				  if (!(val & 0x7f))
+				  {
+					val = GETBITS(8) + 2 * val;
+				  }
+
+				  val = (val * quantizer_scale * quant_matrix[i]) >> 4;
+				  val = (val + ~ (((s32)val) >> 31)) | 1;
+				}
+			}
+			else
+			{
+				val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+				if(decoder.mpeg1)
+				{
+					/* oddification */
+					val = (val - 1) | 1;
+				}
+
+				/* if (bitstream_get (1)) val = -val; */
+				int bit1 = SBITS(1);
+				val = (val ^ bit1) - bit1;
+				DUMPBITS(1);
+			}
+
+			SATURATE(val);
+			dest[j] = val;
+			ipu_cmd.pos[5] = 0;
 		}
-
-		j = scan[i];
-
-		if (tab->run==65) /* escape */
-		{
-		  if(!decoder.mpeg1)
-		  {
-			  val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
-			  DUMPBITS(12);
-		  }
-		  else
-		  {
-			  val = SBITS(8);
-			  DUMPBITS(8);
-
-			  if (!(val & 0x7f))
-			  {
-				val = GETBITS(8) + 2 * val;
-			  }
-			
-			  val = (val * quantizer_scale * quant_matrix[i]) >> 4;
-			  val = (val + ~ (((s32)val) >> 31)) | 1;
-		  }
-		}
-		else
-		{
-		  val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
-		  if(decoder.mpeg1)
-		  {
-			/* oddification */
-			val = (val - 1) | 1;
-		  }
-
- 		  /* if (bitstream_get (1)) val = -val; */
-		  val = (val ^ SBITS(1)) - SBITS(1);
-		  DUMPBITS(1);
-		}
-
-		SATURATE(val);
-		dest[j] = val;
-		ipu_cmd.pos[5] = 0;
 	 }
  }

@ -488,7 +491,7 @@ static __fi bool get_intra_block()
  return true;
 }

-static __fi bool get_non_intra_block(int * last)
+static bool get_non_intra_block(int * last)
 {
 	int i;
 	int j;
@ -614,8 +617,9 @@ static __fi bool get_non_intra_block(int * last)
 			}
 			else
 			{
+				int bit1 = SBITS(1);
 				val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-				val = (val ^ SBITS(1)) - SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}

@ -682,25 +686,11 @@ void __fi finishmpeg2sliceIDEC()
 {
 	ipuRegs.ctrl.SCD = 0;
 	coded_block_pattern = decoder.coded_block_pattern;
-
-	g_BP.BP += decoder.bitstream_bits - 16;
-
-	if ((int)g_BP.BP < 0)
-	{
-		g_BP.BP = 128 + (int)g_BP.BP;
-
-		// After BP is positioned correctly, we need to reload the old buffer
-		// so that reading may continue properly
-		ReorderBitstream();
-	}
-
-	FillInternalBuffer(&g_BP.BP, 1, 0);
 }

 bool mpeg2sliceIDEC()
 {
 	u16 code;
-	u8 bit8;

 	switch (ipu_cmd.pos[0])
 	{
@ -798,6 +788,9 @@ bool mpeg2sliceIDEC()
 						ipu_cmd.pos[2] = 6;
 						return false;
 					}
+					break;
+
+				jNO_DEFAULT;
 				}

 				// Send The MacroBlock via DmaIpuFrom
@ -812,23 +805,23 @@ bool mpeg2sliceIDEC()
 				}

 			case 2:
-				while (decoder.ipu0_data > 0)
-				{
-					uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+			{
+				pxAssume(decoder.ipu0_data > 0);

-					if (read == 0)
-					{
-						ipu_cmd.pos[1] = 2;
-						return false;
-					}
-					else
-					{
-						decoder.AdvanceIpuDataBy(read);
-					}
+				uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+				decoder.AdvanceIpuDataBy(read);
+
+				if (decoder.ipu0_data != 0)
+				{
+					// IPU FIFO filled up -- Will have to finish transferring later.
+					ipu_cmd.pos[1] = 2;
+					return false;
 				}

 				decoder.mbc++;
 				mbaCount = 0;
+			}
+			
 			case 3:
 				while (1)
 				{
@ -851,18 +844,18 @@ bool mpeg2sliceIDEC()
 					}
 					else switch (UBITS(11))
 					{
-							case 8:		/* macroblock_escape */
-								mbaCount += 33;
-								/* pass through */
+						case 8:		/* macroblock_escape */
+							mbaCount += 33;
+							/* pass through */

-							case 15:	/* macroblock_stuffing (MPEG1 only) */
-								DUMPBITS(11);
-								continue;
+						case 15:	/* macroblock_stuffing (MPEG1 only) */
+							DUMPBITS(11);
+							continue;

-							default:	/* end of slice/frame, or error? */
-							{
-								goto finish_idec;	
-							}
+						default:	/* end of slice/frame, or error? */
+						{
+							goto finish_idec;	
+						}
 					}
 				}

@ -886,17 +879,20 @@ bool mpeg2sliceIDEC()
 				}

 				break;
+
+			jNO_DEFAULT;
 			}

 			ipu_cmd.pos[1] = 0;
 			ipu_cmd.pos[2] = 0;
 		}
-		
+
 finish_idec:
 		finishmpeg2sliceIDEC();

 	case 3:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 3;
@ -905,10 +901,10 @@ finish_idec:

 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
+	}

 	case 4:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
@ -917,8 +913,10 @@ finish_idec:
 			return false;
 		}

-		BigEndian(ipuRegs.top, ipuRegs.top);
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
+
+	jNO_DEFAULT;
 	}

 	return true;
@ -927,7 +925,6 @@ finish_idec:
 bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
-	u8 bit8;

 	macroblock_8& mb8 = decoder.mb8;
 	macroblock_16& mb16 = decoder.mb16;
@ -1010,9 +1007,35 @@ bool mpeg2_slice()
 					return false;
 				}
 				break;
+
+			jNO_DEFAULT;
 			}

-			ipu_copy(mb8, mb16);
+			// Copy macroblock8 to macroblock16 - without sign extension.
+			// Manually inlined due to MSVC refusing to inline the SSE-optimized version.
+			{
+				const u8	*s = (const u8*)&mb8;
+				u16			*d = (u16*)&mb16;
+
+				//Y  bias	- 16 * 16
+				//Cr bias	- 8 * 8
+				//Cb bias	- 8 * 8
+
+				__m128i zeroreg = _mm_setzero_si128();
+
+				for (uint i = 0; i < (256+64+64) / 32; ++i)
+				{
+					//*d++ = *s++;
+					__m128i woot1 = _mm_load_si128((__m128i*)s);
+					__m128i woot2 = _mm_load_si128((__m128i*)s+1);
+					_mm_store_si128((__m128i*)d,	_mm_unpacklo_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+1,	_mm_unpackhi_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+2,	_mm_unpacklo_epi8(woot2, zeroreg));
+					_mm_store_si128((__m128i*)d+3,	_mm_unpackhi_epi8(woot2, zeroreg));
+					s += 32;
+					d += 32;
+				}
+			}
 		}
 		else
 		{
@ -1077,6 +1100,8 @@ bool mpeg2_slice()
 						}
 					}
 					break;
+
+				jNO_DEFAULT;
 				}
 			}
 		}
@ -1084,40 +1109,31 @@ bool mpeg2_slice()
 		// Send The MacroBlock via DmaIpuFrom
 		ipuRegs.ctrl.SCD = 0;
 		coded_block_pattern = decoder.coded_block_pattern;
-		g_BP.BP += (int)decoder.bitstream_bits - 16;
-
-		// BP goes from 0 to 128, so negative values mean to read old buffer
-		// so we minus from 128 to get the correct BP
-		if ((int)g_BP.BP < 0)
-		{
-			g_BP.BP = 128 + (int)g_BP.BP;
-
-			// After BP is positioned correctly, we need to reload the old buffer
-			// so that reading may continue properly
-			ReorderBitstream();
-		}

 		decoder.mbc = 1;
 		decoder.SetOutputTo(mb16);

 	case 3:
-		while (decoder.ipu0_data > 0)
-		{
-			uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+	{
+		pxAssume(decoder.ipu0_data > 0);

-			if (size == 0)
-			{
-				ipu_cmd.pos[0] = 3;
-				return false;
-			}
-			else
-			{
-				decoder.AdvanceIpuDataBy(size);
-			}
+		uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
+		decoder.AdvanceIpuDataBy(read);
+
+		if (decoder.ipu0_data != 0)
+		{
+			// IPU FIFO filled up -- Will have to finish transferring later.
+			ipu_cmd.pos[0] = 3;
+			return false;
 		}

+		decoder.mbc++;
+		mbaCount = 0;
+	}
+	
 	case 4:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 4;
@ -1126,11 +1142,11 @@ bool mpeg2_slice()

 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
-
+	}
+	
 	case 5:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
 		{
@ -1138,8 +1154,7 @@ bool mpeg2_slice()
 			return false;
 		}

-		BigEndian(ipuRegs.top, ipuRegs.top);
-		decoder.bitstream_bits = 0;
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
 	}

--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@ -148,12 +148,12 @@ struct decoder_t {
 	macroblock_rgb32 rgb32;
 	macroblock_rgb16 rgb16;

-	uint ipu0_data;
+	uint ipu0_data;		// amount of data in the output macroblock (in QWC)
 	uint ipu0_idx;

 	/* bit parsing stuff */
-	u32 bitstream_buf;		/* current 32 bit working set */
-	int bitstream_bits;			/* used bits in working set */
+	//u32 bitstream_buf;		/* current 32 bit working set */
+	//int bitstream_bits;			/* used bits in working set */

 	int quantizer_scale;	/* remove */
 	int dmv_offset;		/* remove */
@ -230,7 +230,7 @@ struct decoder_t {
 		ipu0_data -= amt;
 	}
 	
-	bool ReadIpuData(u128* out);
+	__fi bool ReadIpuData(u128* out);
 };

 struct mpeg2_scan_pack
@ -241,6 +241,10 @@ struct mpeg2_scan_pack
 	mpeg2_scan_pack();
 };

+extern int bitstream_init ();
+extern u32 UBITS(uint bits);
+extern s32 SBITS(uint bits);
+
 extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
 extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);

@ -258,20 +262,19 @@ extern int get_dmv();
 extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
 extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
 extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
-extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);

 extern int slice (u8 * buffer);

 #ifdef _MSC_VER
-#define BigEndian(out, in) out = _byteswap_ulong(in)
+#define BigEndian(in) _byteswap_ulong(in)
 #else
-#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
+#define BigEndian(in) __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif

 #ifdef _MSC_VER
-#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#define BigEndian64(in) _byteswap_uint64(in)
 #else
-#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#define BigEndian64(in) __builtin_bswap64(in) // or we could use the asm function bswap...
 #endif

 extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@ -30,65 +30,24 @@
 #ifndef __VLC_H__
 #define __VLC_H__

-//static u8 word[4];
-//static u8 dword[8];
-//static u8 qword[16];
-
 static __fi int GETWORD()
 {
-	static u8 data[2];
-
-	if (decoder.bitstream_bits > 0)
-	{
-		if(!getBits16(data,1))
-		{
-			return 0;
-		}
-		
-		/*u32 data;
-		BigEndian(data, *(u32*)word);
-		decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
-		decoder.bitstream_bits -= 32;*/
-		decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
-		decoder.bitstream_bits -= 16;
-	}
-
-	return 1;
+	return g_BP.FillBuffer(16);
 }

-static __fi int bitstream_init ()
+// Removes bits from the bitstream.  This is done independently of UBITS/SBITS because a
+// lot of mpeg streams have to read ahead and rewind bits and re-read them at different
+// bit depths or sign'age.
+static __fi void DUMPBITS(uint num)
 {
-	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
-	{
-		return 0;
-	}
-
-	decoder.bitstream_bits = -16;
-	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
-	/*decoder.bitstream_buf = *(u64*)dword;
-	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/
-
-	return 1;
+	g_BP.Advance(num);
+	//pxAssume(g_BP.FP != 0);
 }

-/* remove num valid bits from bit_buf */
-static __fi void DUMPBITS(int num)
+static __fi u32 GETBITS(uint num)
 {
-	decoder.bitstream_buf <<= num;
-    decoder.bitstream_bits += num;
-}
-
-/* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* Get bits from bitstream */
-static __fi u32 GETBITS(int num)
-{
-	u16 retVal = UBITS(num);
-	DUMPBITS(num);
+	uint retVal = UBITS(num);
+	g_BP.Advance(num);

 	return retVal;
 }
--- a/pcsx2/R5900.cpp
+++ b/pcsx2/R5900.cpp
@ -130,7 +130,7 @@ __ri void cpuException(u32 code, u32 bd)
 			//Reset / NMI
 			cpuRegs.pc = 0xBFC00000;
 			Console.Warning("Reset request");
-			UpdateCP0Status();
+			cpuUpdateOperationMode();
 			return;
 		}
 		else if((code & 0x38000) == 0x10000)
@ -167,7 +167,7 @@ __ri void cpuException(u32 code, u32 bd)
 	else
 		cpuRegs.pc = 0xBFC00200 + offset;

-	UpdateCP0Status();
+	cpuUpdateOperationMode();
 }

 void cpuTlbMiss(u32 addr, u32 bd, u32 excode)
@ -196,7 +196,7 @@ void cpuTlbMiss(u32 addr, u32 bd, u32 excode)
 	}

 	cpuRegs.CP0.n.Status.b.EXL = 1;
-	UpdateCP0Status();
+	cpuUpdateOperationMode();
 //	Log=1; varLog|= 0x40000000;
 }

@ -208,33 +208,6 @@ void cpuTlbMissW(u32 addr, u32 bd) {
 	cpuTlbMiss(addr, bd, EXC_CODE_TLBS);
 }

-__fi void _cpuTestMissingINTC() {
-	if (cpuRegs.CP0.n.Status.val & 0x400 &&
-		psHu32(INTC_STAT) & psHu32(INTC_MASK)) {
-		if ((cpuRegs.interrupt & (1 << 30)) == 0) {
-			Console.Error("*PCSX2*: Error, missing INTC Interrupt");
-		}
-	}
-}
-
-__fi void _cpuTestMissingDMAC() {
-	if (cpuRegs.CP0.n.Status.val & 0x800 &&
-		(psHu16(0xe012) & psHu16(0xe010) ||
-		 psHu16(0xe010) & 0x8000)) {
-		if ((cpuRegs.interrupt & (1 << 31)) == 0) {
-			Console.Error("*PCSX2*: Error, missing DMAC Interrupt");
-		}
-	}
-}
-
-void cpuTestMissingHwInts() {
-	if ((cpuRegs.CP0.n.Status.val & 0x10007) == 0x10001) {
-		_cpuTestMissingINTC();
-		_cpuTestMissingDMAC();
-//		_cpuTestTIMR();
-	}
-}
-
 // sets a branch test to occur some time from an arbitrary starting point.
 __fi void cpuSetNextEvent( u32 startCycle, s32 delta )
 {
@ -253,7 +226,7 @@ __fi void cpuSetNextEventDelta( s32 delta )
 	cpuSetNextEvent( cpuRegs.cycle, delta );
 }

-// tests the cpu cycle agaisnt the given start and delta values.
+// tests the cpu cycle against the given start and delta values.
 // Returns true if the delta time has passed.
 __fi int cpuTestCycle( u32 startCycle, s32 delta )
 {
@ -361,8 +334,8 @@ static bool cpuIntsEnabled(int Interrupt)
 {
 	bool IntType = !!(cpuRegs.CP0.n.Status.val & Interrupt); //Choose either INTC or DMAC, depending on what called it

-	return cpuRegs.CP0.n.Status.b.EIE && cpuRegs.CP0.n.Status.b.IE &&
-		!cpuRegs.CP0.n.Status.b.EXL && (cpuRegs.CP0.n.Status.b.ERL == 0) && IntType;
+	return IntType && cpuRegs.CP0.n.Status.b.EIE && cpuRegs.CP0.n.Status.b.IE &&
+		!cpuRegs.CP0.n.Status.b.EXL && (cpuRegs.CP0.n.Status.b.ERL == 0);
 }

 // if cpuRegs.cycle is greater than this cycle, should check cpuEventTest for updates
@ -375,10 +348,19 @@ __fi void _cpuEventTest_Shared()
 	ScopedBool etest(eeEventTestIsActive);
 	g_nextEventCycle = cpuRegs.cycle + eeWaitCycles;

+	// ---- INTC / DMAC (CPU-level Exceptions) -----------------
+	// Done first because exceptions raised during event tests need to be postponed a few
+	// cycles (fixes Grandia II [PAL], which does a spin loop on a vsync and expects to
+	// be able to read the value before the exception handler clears it).
+
+	uint mask = intcInterrupt() | dmacInterrupt();
+	if (cpuIntsEnabled(mask)) cpuException(mask, cpuRegs.branch);
+
+
 	// ---- Counters -------------
 	// Important: the vsync counter must be the first to be checked.  It includes emulation
 	// escape/suspend hooks, and it's really a good idea to suspend/resume emulation before
-	// doing any actual meaninful branchtest logic.
+	// doing any actual meaningful branchtest logic.

 	if( cpuTestCycle( nextsCounter, nextCounter ) )
 	{
@ -391,10 +373,10 @@ __fi void _cpuEventTest_Shared()
 	_cpuTestTIMR();

 	// ---- Interrupts -------------
-	// Handles all interrupts except 30 and 31, which are handled later.
+	// These are basically just DMAC-related events, which also piggy-back the same bits as
+	// the PS2's own DMA channel IRQs and IRQ Masks.

-	if( cpuRegs.interrupt & ~(3<<30) )
-		_cpuTestInterrupts();
+	_cpuTestInterrupts();

 	// ---- IOP -------------
 	// * It's important to run a iopEventTest before calling ExecuteBlock. This
@ -418,11 +400,7 @@ __fi void _cpuEventTest_Shared()
 		//if( EEsCycle < -450 )
 		//	Console.WriteLn( " IOP ahead by: %d cycles", -EEsCycle );

-		// Experimental and Probably Unnecessary Logic -->
-		// Check if the EE already has an exception pending, and if so we shouldn't
-		// waste too much time updating the IOP.  Theory being that the EE and IOP should
-		// run closely in sync during raised exception events.  But in practice it didn't
-		// seem to make much of a difference.
+		EEsCycle = psxCpu->ExecuteBlock( EEsCycle );

 		iopEventAction = false;
 	}
@ -456,22 +434,10 @@ __fi void _cpuEventTest_Shared()

 	// Apply vsync and other counter nextCycles
 	cpuSetNextEvent( nextsCounter, nextCounter );
-
-	// ---- INTC / DMAC Exceptions -----------------
-	// Raise the INTC and DMAC interrupts here, which usually throw exceptions.
-	// This should be done last since the IOP and the VU0 can raise several EE
-	// exceptions.
-
-	//if ((cpuRegs.CP0.n.Status.val & 0x10007) == 0x10001)
-	if( cpuIntsEnabled(0x400) ) TESTINT(30, intcInterrupt);
-	if( cpuIntsEnabled(0x800) ) TESTINT(31, dmacInterrupt);
 }

 __ri void cpuTestINTCInts()
 {
-	// Check the internal Event System -- if one's already scheduled then don't bother:
-	if( cpuRegs.interrupt & (1 << 30) ) return;
-
 	// Check the COP0's Status register for general interrupt disables, and the 0x400
 	// bit (which is INTC master toggle).
 	if( !cpuIntsEnabled(0x400) ) return;
@ -488,9 +454,6 @@ __ri void cpuTestINTCInts()

 __fi void cpuTestDMACInts()
 {
-	// Check the internal Event System -- if one's already scheduled then don't bother:
-	if ( cpuRegs.interrupt & (1 << 31) ) return;
-
 	// Check the COP0's Status register for general interrupt disables, and the 0x800
 	// bit (which is the DMAC master toggle).
 	if( !cpuIntsEnabled(0x800) ) return;
--- a/pcsx2/R5900.h
+++ b/pcsx2/R5900.h
@ -403,8 +403,8 @@ enum EE_EventType
 };

 extern void CPU_INT( EE_EventType n, s32 ecycle );
-extern void intcInterrupt();
-extern void dmacInterrupt();
+extern uint intcInterrupt();
+extern uint dmacInterrupt();


 extern void cpuInit();
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@ -24,7 +24,7 @@
 //  the lower 16 bit value.  IF the change is breaking of all compatibility with old
 //  states, increment the upper 16 bit value, and clear the lower 16 bits to 0.

-static const u32 g_SaveVersion = 0x8b4a0000;
+static const u32 g_SaveVersion = 0x8b4b0000;

 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)
--- a/plugins/zzogl-pg/opengl/CMakeLists.txt
+++ b/plugins/zzogl-pg/opengl/CMakeLists.txt
@ -92,9 +92,9 @@ set(zzoglHeaders
    Util.h
    x86.h
    zerogs.h
-    zerogsmath.h
    zpipe.h
    ZZoglCRTC.h
+    ZZoglMath.h
    ZZoglShaders.h
    ZZGl.h
    ZZLog.h)
--- a/plugins/zzogl-pg/opengl/GLWin32.cpp
+++ b/plugins/zzogl-pg/opengl/GLWin32.cpp
@ -32,7 +32,6 @@ LRESULT WINAPI MsgProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)

 	switch (msg)
 	{
-
 		case WM_DESTROY:
 			PostQuitMessage(0);
 			return 0;
@ -76,21 +75,21 @@ bool GLWindow::CreateWindow(void *pDisplay)
 	rc.bottom = conf.height;

 	WNDCLASSEX wc;
-	HINSTANCE hInstance = GetModuleHandle(NULL);
+	HINSTANCE hInstance = GetModuleHandle(NULL); // Grab An Instance For Our Window
 	DWORD dwExStyle, dwStyle;

 	wc.cbSize = sizeof(WNDCLASSEX);
-	wc.style = CS_CLASSDC;
-	wc.lpfnWndProc = (WNDPROC) MsgProc;
-	wc.cbClsExtra = 0;
-	wc.cbWndExtra = 0;
-	wc.hInstance = hInstance;
-	wc.hIcon = NULL;
-	wc.hIconSm = NULL;
-	wc.hCursor = NULL;
-	wc.hbrBackground = NULL;
-	wc.lpszMenuName = NULL;
-	wc.lpszClassName = "PS2EMU_ZEROGS";
+	wc.style		= CS_HREDRAW | CS_VREDRAW | CS_OWNDC;		// Redraw On Move, And Own DC For Window
+	wc.lpfnWndProc		= (WNDPROC) MsgProc;					// MsgProc Handles Messages
+	wc.cbClsExtra		= 0;									// No Extra Window Data
+	wc.cbWndExtra		= 0;									// No Extra Window Data
+	wc.hInstance		= hInstance;							// Set The Instance
+	wc.hIcon		= NULL;			
+	wc.hIconSm		= NULL;										// Load The Default Icon
+	wc.hCursor		= LoadCursor(NULL, IDC_ARROW);				// Load The Arrow Pointer
+	wc.hbrBackground	= (HBRUSH)GetStockObject(BLACK_BRUSH);	// No Background Required For GL
+	wc.lpszMenuName		= NULL;									// We Don't Want A Menu
+	wc.lpszClassName	= "PS2EMU_ZEROGS";						// Set The Class Name

 	RegisterClassEx(&wc);

@ -102,26 +101,26 @@ bool GLWindow::CreateWindow(void *pDisplay)
 	else
 	{
 		dwExStyle = WS_EX_APPWINDOW | WS_EX_WINDOWEDGE;
-		dwStyle = WS_OVERLAPPEDWINDOW;
+		dwStyle = WS_OVERLAPPEDWINDOW | WS_BORDER;
 	}

+	dwStyle |= WS_CLIPSIBLINGS | WS_CLIPCHILDREN;
 	AdjustWindowRectEx(&rc, dwStyle, false, dwExStyle);

 	GetWindowRect(GetDesktopWindow(), &rcdesktop);

-	GShwnd = CreateWindowEx(
-				 dwExStyle,
-				 "PS2EMU_ZEROGS",
-				 "ZeroGS",
-				 dwStyle,
-				 (rcdesktop.right - (rc.right - rc.left)) / 2,
-				 (rcdesktop.bottom - (rc.bottom - rc.top)) / 2,
-				 rc.right - rc.left,
-				 rc.bottom - rc.top,
-				 NULL,
-				 NULL,
-				 hInstance,
-				 NULL);
+	GShwnd = CreateWindowEx(	dwExStyle,				// Extended Style For The Window
+					"PS2EMU_ZEROGS",				// Class Name
+					"ZZOgl",					// Window Title
+					dwStyle,				// Selected Window Style
+					(rcdesktop.right - (rc.right - rc.left)) / 2,  // Window Position
+					(rcdesktop.bottom - (rc.bottom - rc.top)) / 2, // Window Position
+					rc.right - rc.left,	// Calculate Adjusted Window Width
+					rc.bottom - rc.top,	// Calculate Adjusted Window Height
+					NULL,					// No Parent Window
+					NULL,					// No Menu
+					hInstance,				// Instance
+					NULL);					// Don't Pass Anything To WM_CREATE

 	if (GShwnd == NULL) return false;

@ -197,6 +196,7 @@ bool GLWindow::DisplayWindow(int _width, int _height)
 		dwExStyle = WS_EX_APPWINDOW | WS_EX_WINDOWEDGE;
 		dwStyle = WS_OVERLAPPEDWINDOW;
 	}
+	dwStyle |= WS_CLIPSIBLINGS | WS_CLIPCHILDREN;

 	RECT rc;

--- a/plugins/zzogl-pg/opengl/HostMemory.cpp
+++ b/plugins/zzogl-pg/opengl/HostMemory.cpp
@ -469,10 +469,10 @@ __forceinline void _TransferLocalLocal_4()
 		assert((gs.srcbuf.psm&0x7) == (gs.dstbuf.psm&0x7));

 		if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw)
-			ZZLog::Warn_Log("Transfer error, src width exceeded.");
+			ZZLog::Debug_Log("Transfer error, src width exceeded.");

 		if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw)
-			ZZLog::Warn_Log("Transfer error, dst width exceeded.");
+			ZZLog::Debug_Log("Transfer error, dst width exceeded.");

 		int srcstart, srcend, dststart, dstend;

--- a/plugins/zzogl-pg/opengl/Mem.cpp
+++ b/plugins/zzogl-pg/opengl/Mem.cpp
@ -267,7 +267,7 @@ void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData,
 	}

    if (floatfmt) {
-        Vector* psrcv = (Vector*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;
+        float4* psrcv = (float4*)&vBilinearData[0] + b.ox + b.oy * BLOCK_TEXWIDTH;

        for(int i = 0; i < b.height; ++i)
        {
@ -276,7 +276,7 @@ void fill_block(BLOCK b, vector<char>& vBlockData, vector<char>& vBilinearData,
            for(int j = 0; j < b.width; ++j)
            {
                u32 temp = ((j + 1) % b.width);
-                Vector* pv = &psrcv[i_width + j];
+                float4* pv = &psrcv[i_width + j];
                pv->x = psrcf[i_width + j];
                pv->y = psrcf[i_width + temp];
                pv->z = psrcf[i_width2 + j];
@ -291,7 +291,7 @@ void BLOCK::FillBlocks(vector<char>& vBlockData, vector<char>& vBilinearData, in
 	FUNCLOG
    if (floatfmt) {
        vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 4);
-        vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(Vector));
+        vBilinearData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * sizeof(float4));
    } else {
        vBlockData.resize(BLOCK_TEXWIDTH * BLOCK_TEXHEIGHT * 2);
    }
--- a/plugins/zzogl-pg/opengl/Mem.h
+++ b/plugins/zzogl-pg/opengl/Mem.h
@ -120,8 +120,8 @@ struct BLOCK
 	BLOCK() { memset(this, 0, sizeof(BLOCK)); }

 	// shader constants for this block
-	Vector vTexBlock;
-	Vector vTexDims;
+	float4 vTexBlock;
+	float4 vTexDims;
 	int width, height;	// dims of one page in pixels
 	int ox, oy, mult;
 	int bpp;
@ -147,8 +147,8 @@ struct BLOCK
 		ox = ox2;
 		oy = oy2;
 		mult = mult2;
-		vTexDims = Vector(BLOCK_TEXWIDTH/(float)(bw), BLOCK_TEXHEIGHT/(float)bh, 0, 0); 
-		vTexBlock = Vector((float)bw/BLOCK_TEXWIDTH, (float)bh/BLOCK_TEXHEIGHT, ((float)ox+0.2f)/BLOCK_TEXWIDTH, ((float)oy+0.05f)/BLOCK_TEXHEIGHT);
+		vTexDims = float4(BLOCK_TEXWIDTH/(float)(bw), BLOCK_TEXHEIGHT/(float)bh, 0, 0); 
+		vTexBlock = float4((float)bw/BLOCK_TEXWIDTH, (float)bh/BLOCK_TEXHEIGHT, ((float)ox+0.2f)/BLOCK_TEXWIDTH, ((float)oy+0.05f)/BLOCK_TEXHEIGHT);
 		width = bw;
 		height = bh;
 		colwidth = bh / 4;
--- a/plugins/zzogl-pg/opengl/NewRegs.cpp
+++ b/plugins/zzogl-pg/opengl/NewRegs.cpp
@ -638,7 +638,7 @@ void __gifCall GIFRegHandlerSCISSOR(const u32* data)
 		Flush();
 	}

-	m_env.CTXT[i].SCISSOR = (GSVector4i)r->SCISSOR;
+	m_env.CTXT[i].SCISSOR = (Vector4i)r->SCISSOR;

 	m_env.CTXT[i].UpdateScissor();*/
 	ZZLog::Greg_Log("SCISSOR%d", i);
--- a/plugins/zzogl-pg/opengl/Regs.cpp
+++ b/plugins/zzogl-pg/opengl/Regs.cpp
@ -55,7 +55,7 @@ inline bool NoHighlights(int i)
 	
 //	if ( results[resultA] == 0 ) {
 //		results[resultA] = 1;
-//		ZZLog::ERROR_LOG("%x = %d %d %d %d %d %d %d %d \n", resultA, prim->iip, (prim->tme), (prim->fge), (prim->abe) , (prim->aa1) ,(prim->fst), (prim->ctxt), (prim->fix)) ;
+//		ZZLog::Error_Log("%x = %d %d %d %d %d %d %d %d \n", resultA, prim->iip, (prim->tme), (prim->fge), (prim->abe) , (prim->aa1) ,(prim->fst), (prim->ctxt), (prim->fix)) ;
 //	}
 //	if (resultA == 0xb && ZeroGS::vb[i].zbuf.zmsk ) return false; //ATF

--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@ -52,7 +52,7 @@ extern "C" u32   CALLBACK PS2EgetLibType(void);
 extern "C" u32   CALLBACK PS2EgetLibVersion2(u32 type);
 extern "C" char* CALLBACK PS2EgetLibName(void);

-#include "zerogsmath.h"
+#include "ZZoglMath.h"

 #include <vector>
 #include <string>
--- a/plugins/zzogl-pg/opengl/Win32/Win32.cpp
+++ b/plugins/zzogl-pg/opengl/Win32/Win32.cpp
@ -37,62 +37,27 @@ void CALLBACK GSkeyEvent(keyEvent *ev)

 #include "Win32/resource.h"

-BOOL CALLBACK LoggingDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
-{
-	switch (uMsg)
-	{
-
-		case WM_INITDIALOG:
-
-			if (conf.log) CheckDlgButton(hW, IDC_LOG, true);
-
-			return true;
-
-		case WM_COMMAND:
-			switch (LOWORD(wParam))
-			{
-				case IDCANCEL:
-					EndDialog(hW, true);
-					return true;
-
-				case IDOK:
-
-					if (IsDlgButtonChecked(hW, IDC_LOG))
-						conf.log = 1;
-					else
-						conf.log = 0;
-
-					SaveConfig();
-
-					EndDialog(hW, false);
-
-					return true;
-			}
-	}
-
-	return false;
-}
-
 map<int, int> mapConfOpts;
 #define PUT_CONF(id) mapConfOpts[IDC_CONFOPT_##id] = 0x##id;

-void OnInitDialog(HWND hW)
+void OnAdvOK(HWND hW)
 {
-	if (!(conf.zz_options.loaded)) LoadConfig();
+	conf.hacks._u32 = 0;

-	CheckDlgButton(hW, IDC_CONFIG_INTERLACE, conf.interlace);
-	CheckDlgButton(hW, IDC_CONFIG_BILINEAR, conf.bilinear);
-	CheckDlgButton(hW, IDC_CONFIG_DEPTHWRITE, conf.mrtdepth);
-	CheckRadioButton(hW, IDC_CONFIG_AANONE, IDC_CONFIG_AA4, IDC_CONFIG_AANONE + conf.aa);
-	CheckDlgButton(hW, IDC_CONFIG_WIREFRAME, (conf.wireframe()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_CAPTUREAVI, (conf.captureAvi()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_FULLSCREEN, (conf.fullscreen()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_WIDESCREEN, (conf.widescreen()) ? 1 : 0);
-	CheckDlgButton(hW, IDC_CONFIG_BMPSS, (conf.zz_options.tga_snap) ? 1 : 0);
-	CheckRadioButton(hW, IDC_CONF_WIN640, IDC_CONF_WIN1280, IDC_CONF_WIN640 + conf.zz_options.dimensions);
+	for (map<int, int>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
+	{
+		if (IsDlgButtonChecked(hW, it->first)) conf.hacks._u32 |= it->second;
+	}

-	prevbilinearfilter = conf.bilinear;
+	GSsetGameCRC(g_LastCRC, conf.hacks._u32);

+	SaveConfig();
+
+	EndDialog(hW, false);
+}
+
+void OnInitAdvDialog(HWND hW)
+{
 	mapConfOpts.clear();

 	PUT_CONF(00000001);
@ -129,45 +94,87 @@ void OnInitDialog(HWND hW)
 	}
 }

-void OnOK(HWND hW)
+// Dialog procedure for the advanced options dialog.
+// Returns true when the message was handled here and false otherwise.
+BOOL CALLBACK AdvancedDialogProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
+{
+	if (uMsg == WM_INITDIALOG)
+	{
+		OnInitAdvDialog(hW);
+		return true;
+	}
+
+	// Only button commands are of interest beyond initialization.
+	if (uMsg != WM_COMMAND) return false;
+
+	const WORD commandId = LOWORD(wParam);
+
+	if (commandId == IDCANCEL)
+	{
+		EndDialog(hW, true);
+		return true;
+	}
+
+	if (commandId == IDOK)
+	{
+		OnAdvOK(hW);
+		return true;
+	}
+
+	return false;
+}
+
+// Shows the advanced options dialog (IDD_ADV_OPTIONS), owned by the currently
+// active window and driven by AdvancedDialogProc.
+void CALLBACK AdvancedDialog()
+{
+	HWND owner = GetActiveWindow();
+	DLGPROC handler = (DLGPROC)AdvancedDialogProc;
+
+	DialogBox(hInst, MAKEINTRESOURCE(IDD_ADV_OPTIONS), owner, handler);
+}
+
+// Fills the main configuration dialog from the current settings in `conf`,
+// loading the configuration first if it has not been loaded yet.
+//   hW - handle of the configuration dialog whose controls are initialized.
+void OnInitConfDialog(HWND hW)
+{
+	if (!(conf.zz_options.loaded)) LoadConfig();
+
+	// Anti-aliasing choices; the table position is the value kept in conf.aa.
+	static const TCHAR* const aaName[] = { TEXT("None"), TEXT("x2"), TEXT("x4"), TEXT("x8"), TEXT("x16") };
+	const int aaCount = sizeof(aaName) / sizeof(aaName[0]);
+
+	// The stored value is not validated anywhere visible here, so guard the
+	// lookup: an out-of-range setting falls back to the first entry instead of
+	// reading past the end of the table.
+	int aaIndex = (int)conf.aa;
+	if (aaIndex < 0 || aaIndex >= aaCount) aaIndex = 0;
+
+	HWND aaCombo = GetDlgItem(hW, IDC_AA_COMBO);
+	for (int i = 0; i < aaCount; i++)
+	{
+		ComboBox_AddString(aaCombo, aaName[i]);
+	}
+	ComboBox_SelectString(aaCombo, -1, aaName[aaIndex]);
+
+	// Window size choices; the table position is the value kept in
+	// conf.zz_options.dimensions.
+	static const TCHAR* const sizeName[] = { TEXT("640 x 480"), TEXT("800 x 600"), TEXT("1024 x 768"), TEXT("1280 x 960") };
+	const int sizeCount = sizeof(sizeName) / sizeof(sizeName[0]);
+
+	// Same guard as above for the window size setting.
+	int sizeIndex = (int)conf.zz_options.dimensions;
+	if (sizeIndex < 0 || sizeIndex >= sizeCount) sizeIndex = 0;
+
+	HWND sizeCombo = GetDlgItem(hW, IDC_WIN_SIZE_COMBO);
+	for (int i = 0; i < sizeCount; i++)
+	{
+		ComboBox_AddString(sizeCombo, sizeName[i]);
+	}
+	ComboBox_SelectString(sizeCombo, -1, sizeName[sizeIndex]);
+
+	CheckDlgButton(hW, IDC_CONFIG_INTERLACE, conf.interlace);
+	CheckDlgButton(hW, IDC_CONFIG_BILINEAR, conf.bilinear);
+	CheckDlgButton(hW, IDC_CONFIG_DEPTHWRITE, conf.mrtdepth);
+	CheckDlgButton(hW, IDC_CONFIG_WIREFRAME, (conf.wireframe()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_CAPTUREAVI, (conf.captureAvi()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_FULLSCREEN, (conf.fullscreen()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_WIDESCREEN, (conf.widescreen()) ? 1 : 0);
+	CheckDlgButton(hW, IDC_CONFIG_BMPSS, (conf.zz_options.tga_snap) ? 1 : 0);
+
+	// Remember the bilinear setting as it was when the dialog opened.
+	prevbilinearfilter = conf.bilinear;
+}
+
+void OnConfOK(HWND hW)
 {
 	u32 newinterlace = IsDlgButtonChecked(hW, IDC_CONFIG_INTERLACE);

-	if (!conf.interlace) conf.interlace = newinterlace;
-	else if (!newinterlace) conf.interlace = 2;  // off
+	if (!conf.interlace) 
+		conf.interlace = newinterlace;
+	else if (!newinterlace) 
+		conf.interlace = 2;  // off

 	conf.bilinear = IsDlgButtonChecked(hW, IDC_CONFIG_BILINEAR);

 	// restore
-	if (conf.bilinear && prevbilinearfilter)
-		conf.bilinear = prevbilinearfilter;
+	if (conf.bilinear && prevbilinearfilter) conf.bilinear = prevbilinearfilter;

-	//conf.mrtdepth = 1;//IsDlgButtonChecked(hW, IDC_CONFIG_DEPTHWRITE);
-
-	if (SendDlgItemMessage(hW, IDC_CONFIG_AANONE, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 0;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA2, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 1;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA4, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 2;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA8, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 3;
-	}
-	else if (SendDlgItemMessage(hW, IDC_CONFIG_AA16, BM_GETCHECK, 0, 0))
-	{
-		conf.aa = 4;
-	}
-	else 
-	{
-		conf.aa = 0;
-	}
+	if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_AA_COMBO)) != -1)
+		conf.aa = ComboBox_GetCurSel(GetDlgItem(hW, IDC_AA_COMBO));

 	conf.zz_options._u32 = 0;

@ -177,22 +184,13 @@ void OnOK(HWND hW)
 	conf.zz_options.widescreen = IsDlgButtonChecked(hW, IDC_CONFIG_WIDESCREEN) ? 1 : 0;
 	conf.zz_options.tga_snap = IsDlgButtonChecked(hW, IDC_CONFIG_BMPSS) ? 1 : 0;

-	conf.hacks._u32 = 0;
-
-	for (map<int, int>::iterator it = mapConfOpts.begin(); it != mapConfOpts.end(); ++it)
-	{
-		if (IsDlgButtonChecked(hW, it->first)) conf.hacks._u32 |= it->second;
-	}
-
-	GSsetGameCRC(g_LastCRC, conf.hacks._u32);
-
-	if (SendDlgItemMessage(hW, IDC_CONF_WIN640, BM_GETCHECK, 0, 0)) 
+	if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 0) 
 		conf.zz_options.dimensions = GSDim_640;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN800, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 1) 
 		conf.zz_options.dimensions = GSDim_800;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN1024, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 2) 
 		conf.zz_options.dimensions = GSDim_1024;
-	else if (SendDlgItemMessage(hW, IDC_CONF_WIN1280, BM_GETCHECK, 0, 0)) 
+	else if (ComboBox_GetCurSel(GetDlgItem(hW, IDC_WIN_SIZE_COMBO)) == 3) 
 		conf.zz_options.dimensions = GSDim_1280;

 	SaveConfig();
@ -205,19 +203,26 @@ BOOL CALLBACK ConfigureDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	switch (uMsg)
 	{
 		case WM_INITDIALOG:
-			OnInitDialog(hW);
+			OnInitConfDialog(hW);
 			return true;

 		case WM_COMMAND:

 			switch (LOWORD(wParam))
 			{
+                case IDC_AA_COMBO: 
+					break; 
+
+				case IDC_ADV_BTN:
+					AdvancedDialog();
+					return true;
+
 				case IDCANCEL:
 					EndDialog(hW, true);
 					return true;

 				case IDOK:
-					OnOK(hW);
+					OnConfOK(hW);
 					return true;
 			}
 	}
@ -225,13 +230,26 @@ BOOL CALLBACK ConfigureDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	return false;
 }

+// Shows the configuration dialog (IDD_CONFIG2), owned by the currently active
+// window. After the dialog call returns, bilinear filtering is switched off
+// when g_nPixelShaderVer is SHADER_REDUCED.
+void CALLBACK GSconfigure()
+{
+	HWND owner = GetActiveWindow();
+	DLGPROC handler = (DLGPROC)ConfigureDlgProc;
+
+	DialogBox(hInst, MAKEINTRESOURCE(IDD_CONFIG2), owner, handler);
+
+	const bool reducedShaders = (g_nPixelShaderVer == SHADER_REDUCED);
+	if (reducedShaders)
+	{
+		conf.bilinear = 0;
+	}
+}
+
+// Performs no checks and always returns 0.
+s32 CALLBACK GStest()
+{
+	return 0;
+}
+
 BOOL CALLBACK AboutDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
 	switch (uMsg)
 	{
 		case WM_INITDIALOG:
-			//ZeroGS uses floating point render targets because A8R8G8B8 format is not sufficient for ps2 blending and this requires alpha blending on floating point render targets
-			//There might be a problem with pixel shader precision with older geforce models (textures will look blocky).
 			return true;

 		case WM_COMMAND:
@ -246,21 +264,6 @@ BOOL CALLBACK AboutDlgProc(HWND hW, UINT uMsg, WPARAM wParam, LPARAM lParam)
 	return false;
 }

-void CALLBACK GSconfigure()
-{
-	DialogBox(hInst,
-			  MAKEINTRESOURCE(IDD_CONFIG),
-			  GetActiveWindow(),
-			  (DLGPROC)ConfigureDlgProc);
-
-	if (g_nPixelShaderVer == SHADER_REDUCED) conf.bilinear = 0;
-}
-
-s32 CALLBACK GStest()
-{
-	return 0;
-}
-
 void CALLBACK GSabout()
 {
 	DialogBox(hInst,
--- a/plugins/zzogl-pg/opengl/Win32/resrc1.h
+++ b/plugins/zzogl-pg/opengl/Win32/resrc1.h
@ -5,7 +5,6 @@
 #define IDC_CONF_DEFAULT                3
 #define IDR_DATA1                       112
 #define IDD_ADV_OPTIONS                 113
-#define IDD_DIALOG1                     114
 #define IDD_CONFIG2                     114
 #define IDC_ABOUTTEXT                   1015
 #define IDC_CONFIG_AA                   1016
@ -52,12 +51,15 @@
 #define IDC_CONFOPT_00004000            1047
 #define IDC_BUTTON1                     1048
 #define IDC_CONFOPT_COMPUTEOR           1048
+#define IDC_ADV_BTN                     1048
 #define IDC_CONFOPT_4001                1049
 #define IDC_CONFOPT_00000010            1049
 #define IDC_CONFOPT_00008000            1050
 #define IDC_CONFOPT_00010000            1052
 #define IDC_CONFOPT_00020000            1054
+#define IDC_AA_COMBO                    1054
 #define IDC_CONFOPT_00000002            1055
+#define IDC_WIN_SIZE_COMBO              1055
 #define IDC_CONFOPT_01000000            1056
 #define IDC_CONFOPT_00800000            1057
 #define IDC_CONFOPT_00000008            1058
@ -80,7 +82,7 @@
 #ifndef APSTUDIO_READONLY_SYMBOLS
 #define _APS_NEXT_RESOURCE_VALUE        116
 #define _APS_NEXT_COMMAND_VALUE         40001
-#define _APS_NEXT_CONTROL_VALUE         1051
+#define _APS_NEXT_CONTROL_VALUE         1056
 #define _APS_NEXT_SYMED_VALUE           101
 #endif
 #endif
--- a/plugins/zzogl-pg/opengl/Win32/zerogs.rc
+++ b/plugins/zzogl-pg/opengl/Win32/zerogs.rc
@ -206,32 +206,28 @@ BEGIN
                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,266,365,8
 END

-IDD_CONFIG2 DIALOGEX 0, 0, 171, 217
+IDD_CONFIG2 DIALOGEX 0, 0, 159, 160
 STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
 CAPTION "ZZOgl Options"
 FONT 8, "MS Shell Dlg", 400, 0, 0x1
 BEGIN
-    DEFPUSHBUTTON   "OK",IDOK,55,192,50,14
-    PUSHBUTTON      "Cancel",IDCANCEL,108,192,50,14
-    GROUPBOX        "Static",IDC_STATIC,7,7,152,183
-    CONTROL         "Logging (For Debugging)",1000,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,18,102,10
+    DEFPUSHBUTTON   "OK",IDOK,37,138,50,14
+    PUSHBUTTON      "Cancel",IDCANCEL,91,138,50,14
+    CONTROL         "Logging (For Debugging)",1000,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,7,102,10
    CONTROL         "Interlace Enable (toggle with F5). There are 2 modes + interlace off",IDC_CONFIG_INTERLACE,
-                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,45,137,18
+                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,32,137,18
    CONTROL         "Bilinear Filtering (Shift+F5). Best quality is on, turn off for speed.",IDC_CONFIG_BILINEAR,
-                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,67,137,18
+                    "Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,50,137,18
    CONTROL         "Capture Avi (zerogs.avi) (F12)",IDC_CONFIG_CAPTUREAVI,
-                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,103,109,10
+                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,82,109,10
    CONTROL         "Save Snapshots as BMP(default is JPG)",IDC_CONFIG_BMPSS,
-                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,14,116,141,10
-    CONTROL         "Wide Screen",IDC_CONFIG_WIDESCREEN,"Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,14,90,109,10
-    CONTROL         "640 x 480",IDC_CONF_WIN640,"Button",BS_AUTORADIOBUTTON | WS_GROUP,20,140,59,8
-    CONTROL         "800 x 600",IDC_CONF_WIN800,"Button",BS_AUTORADIOBUTTON,21,152,59,8
-    CONTROL         "1024 x 768",IDC_CONF_WIN1024,"Button",BS_AUTORADIOBUTTON,86,140,59,8
-    CONTROL         "1280 x 960",IDC_CONF_WIN1280,"Button",BS_AUTORADIOBUTTON,86,151,53,8
-    GROUPBOX        "Default Window Size (no speed impact)",IDC_STATIC,14,129,137,39
-    COMBOBOX        IDC_COMBO1,59,31,48,30,CBS_DROPDOWNLIST | CBS_SORT | WS_VSCROLL | WS_TABSTOP
-    LTEXT           "Anti-aliasing",IDC_STATIC,15,33,43,13
-    PUSHBUTTON      "Advanced...",IDC_BUTTON1,17,170,134,14
+                    "Button",BS_AUTOCHECKBOX | WS_TABSTOP,7,93,141,10
+    CONTROL         "Wide Screen",IDC_CONFIG_WIDESCREEN,"Button",BS_AUTOCHECKBOX | BS_MULTILINE | WS_TABSTOP,7,69,109,10
+    LTEXT           "Anti-aliasing",IDC_STATIC,7,20,43,13
+    PUSHBUTTON      "Advanced...",IDC_ADV_BTN,7,118,134,14
+    COMBOBOX        IDC_AA_COMBO,53,18,48,30,CBS_DROPDOWN | WS_VSCROLL | WS_TABSTOP
+    COMBOBOX        IDC_WIN_SIZE_COMBO,78,104,62,30,CBS_DROPDOWN | WS_VSCROLL | WS_TABSTOP
+    LTEXT           "Default Window Size",IDC_STATIC,7,106,68,8
 END


@ -277,9 +273,9 @@ BEGIN
    IDD_CONFIG2, DIALOG
    BEGIN
        LEFTMARGIN, 7
-        RIGHTMARGIN, 164
+        RIGHTMARGIN, 152
        TOPMARGIN, 7
-        BOTTOMMARGIN, 210
+        BOTTOMMARGIN, 152
    END
 END
 #endif    // APSTUDIO_INVOKED
@ -311,27 +307,6 @@ END

 #endif    // APSTUDIO_INVOKED

-
-/////////////////////////////////////////////////////////////////////////////
-//
-// Dialog Info
-//
-
-IDD_CONFIG2 DLGINIT
-BEGIN
-    IDC_COMBO1, 0x403, 5, 0
-0x6f4e, 0x656e, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5832, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5834, "\000" 
-    IDC_COMBO1, 0x403, 3, 0
-0x5838, "\000" 
-    IDC_COMBO1, 0x403, 4, 0
-0x3631, 0x0058, 
-    0
-END
-
 #endif    // English (U.S.) resources
 /////////////////////////////////////////////////////////////////////////////

--- a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
+++ b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
@ -481,10 +481,6 @@
 				RelativePath="..\zerogs.h"
 				>
 			</File>
-			<File
-				RelativePath="..\zerogsmath.h"
-				>
-			</File>
 			<File
 				RelativePath="..\ZZGl.h"
 				>
@ -497,6 +493,10 @@
 				RelativePath="..\ZZoglFlushHack.h"
 				>
 			</File>
+			<File
+				RelativePath="..\ZZoglMath.h"
+				>
+			</File>
 			<File
 				RelativePath="..\ZZoglShaders.h"
 				>
@ -528,11 +528,11 @@
 			</File>
 		</Filter>
 		<File
-			RelativePath=".\ps2hw.dat"
+			RelativePath="..\ps2hw.dat"
 			>
 		</File>
 		<File
-			RelativePath="..\ps2hw.dat"
+			RelativePath=".\ps2hw.dat"
 			>
 		</File>
 	</Files>
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.cpp
@ -50,7 +50,7 @@ extern bool g_bMakeSnapshot;
 extern string strSnapshot;

 // Adjusts vertex shader BitBltPos vector v to preserve aspect ratio. It used to emulate 4:3 or 16:9.
-void ZeroGS::AdjustTransToAspect(Vector& v)
+void ZeroGS::AdjustTransToAspect(float4& v)
 {
 	double temp;
 	float f;
@ -242,11 +242,11 @@ inline void RenderStartHelper(u32 bInterlace)
 // on image y coords. So if we write valpha.z * F + valpha.w + 0.5, it would be switching odd
 // and even strings at each frame.
 // valpha.x and y are used for image blending.
-inline Vector RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTSHADER* prog)
+inline float4 RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTSHADER* prog)
 {
 	SetShaderCaller("RenderGetForClip");

-	Vector valpha;
+	float4 valpha;
 	// first render the current render targets, then from ptexMem

 	if (psm == 1)
@ -282,7 +282,7 @@ inline Vector RenderGetForClip(u32 bInterlace, int interlace, int psm, FRAGMENTS
 		valpha.w = 1;
 	}

-	ZZshSetParameter4fv(prog->sOneColor, valpha, "g_fOneColor");
+	ZZshSetParameter4fv(prog->prog, prog->sOneColor, valpha, "g_fOneColor");

 	return valpha;
 }
@ -295,7 +295,7 @@ inline void RenderCreateInterlaceTex(u32 bInterlace, int th, FRAGMENTSHADER* pro

 	int interlacetex = CreateInterlaceTex(2 * th);

-	ZZshGLSetTextureParameter(prog->sInterlace, interlacetex, "Interlace");
+	ZZshGLSetTextureParameter(prog->prog, prog->sInterlace, interlacetex, "Interlace");
 }

 // Well, do blending setup prior to second pass of half-frame drawing
@ -396,10 +396,10 @@ inline int RenderGetOffsets(int* dby, int* movy, tex0Info& texframe, CRenderTarg
 }

 // BltBit shader calculate vertex (4 coord's pixel) position at the viewport.
-inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
+inline float4 RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 {
 	SetShaderCaller("RenderSetTargetBitPos");
-	Vector v;
+	float4 v;
 	// dest rect
 	v.x = 1;
 	v.y = dh / (float)th;
@ -416,7 +416,7 @@ inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 		v.w += 1.0f / (float)dh ;
 	}

-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");

 	return v;
 }
@ -425,12 +425,12 @@ inline Vector RenderSetTargetBitPos(int dh, int th, int movy, bool isInterlace)
 // For example, use tw / X and tw / X magnify the viewport.
 // Interlaced output is little out of VB, it could be seen as an evil blinking line on top
 // and bottom, so we try to remove it.
-inline Vector RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool isInterlace)
+inline float4 RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool isInterlace)
 {
 	SetShaderCaller("RenderSetTargetBitTex");

-	Vector v;
-	v = Vector(th, tw, dh, dw);
+	float4 v;
+	v = float4(th, tw, dh, dw);

 	// Incorrect Aspect ratio on interlaced frames

@ -440,28 +440,28 @@ inline Vector RenderSetTargetBitTex(float th, float tw, float dh, float dw, bool
 		v.w += 1.0f / conf.height;
 	}

-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");

 	return v;
 }

 // Translator for POSITION coordinates (-1.0:+1.0f at x axis, +1.0f:-1.0y at y) into target frame ones.
 // We don't need x coordinate, because interlacing is y-axis only.
-inline Vector RenderSetTargetBitTrans(int th)
+inline float4 RenderSetTargetBitTrans(int th)
 {
 	SetShaderCaller("RenderSetTargetBitTrans");
-	Vector v = Vector(float(th), -float(th), float(th), float(th));
-	ZZshSetParameter4fv(pvsBitBlt.fBitBltTrans, v, "g_fBitBltTrans");
+	float4 v = float4(float(th), -float(th), float(th), float(th));
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.fBitBltTrans, v, "g_fBitBltTrans");
 	return v;
 }

 // use g_fInvTexDims to store inverse texture dims
 // Seems, that Targ shader does not use it
-inline Vector RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHADER* prog)
+inline float4 RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHADER* prog)
 {
 	SetShaderCaller("RenderSetTargetInvTex");

-	Vector v = Vector(0, 0, 0, 0);
+	float4 v = float4(0, 0, 0, 0);

 	if (prog->sInvTexDims)
 	{
@ -469,7 +469,7 @@ inline Vector RenderSetTargetInvTex(int bInterlace, int tw, int th, FRAGMENTSHAD
 		v.y = 1.0f / (float)th;
 		v.z = (float)0.0;
 		v.w = -0.5f / (float)th;
-		ZZshSetParameter4fv(prog->sInvTexDims, v, "g_fInvTexDims");
+		ZZshSetParameter4fv(prog->prog, prog->sInvTexDims, v, "g_fInvTexDims");
 	}

 	return v;
@ -544,17 +544,17 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list
 				SetShaderCaller("RenderCheckForTargets");

 				// Texture
-				Vector v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby), INTERLACE_COUNT);
+				float4 v = RenderSetTargetBitTex((float)RW(texframe.tw), (float)RH(dh), (float)RW(pfb->DBX), (float)RH(dby), INTERLACE_COUNT);

 				// dest rect
 				v = RenderSetTargetBitPos(dh, texframe.th, movy, INTERLACE_COUNT);
 				v = RenderSetTargetBitTrans(ptarg->fbh);
 				v = RenderSetTargetInvTex(bInterlace, texframe.tbw, ptarg->fbh, &ppsCRTCTarg[bInterlace]) ; 	// FIXME. This is no use

-				Vector valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTCTarg[bInterlace]);
+				float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTCTarg[bInterlace]);

 				// inside vb[0]'s target area, so render that region only
-				ZZshGLSetTextureParameter(ppsCRTCTarg[bInterlace].sFinal, ptarg->ptex, "CRTC target");
+				ZZshGLSetTextureParameter(ppsCRTCTarg[bInterlace].prog, ppsCRTCTarg[bInterlace].sFinal, ptarg->ptex, "CRTC target");
 				RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTCTarg[bInterlace]);

 				ZZshSetPixelShader(ppsCRTCTarg[bInterlace].prog);
@ -582,7 +582,7 @@ inline void RenderCheckForTargets(tex0Info& texframe, list<CRenderTarget*>& list
 // this is the function that does it.
 inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listTargs, int i, bool* bUsingStencil, int interlace, int bInterlace)
 {
-	Vector v;
+	float4 v;
 	
 	for (list<CRenderTarget*>::iterator it = listTargs.begin(); it != listTargs.end(); ++it)
 	{
@ -624,9 +624,9 @@ inline void RenderCheckForMemory(tex0Info& texframe, list<CRenderTarget*>& listT
 	v = RenderSetTargetBitPos(1, 1, 0, INTERLACE_COUNT);
 	v = RenderSetTargetBitTrans(texframe.th);
 	v = RenderSetTargetInvTex(bInterlace, texframe.tw, texframe.th, &ppsCRTC[bInterlace]);
-	Vector valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTC[bInterlace]);
+	float4 valpha = RenderGetForClip(bInterlace, interlace, texframe.psm, &ppsCRTC[bInterlace]);

-	ZZshGLSetTextureParameter(ppsCRTC[bInterlace].sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
+	ZZshGLSetTextureParameter(ppsCRTC[bInterlace].prog, ppsCRTC[bInterlace].sMemory, vb[0].pmemtarg->ptex->tex, "CRTC memory");
 	RenderCreateInterlaceTex(bInterlace, texframe.th, &ppsCRTC[bInterlace]);
 	ZZshSetPixelShader(ppsCRTC[bInterlace].prog);
 	
--- a/plugins/zzogl-pg/opengl/ZZoglCRTC.h
+++ b/plugins/zzogl-pg/opengl/ZZoglCRTC.h
@ -63,7 +63,7 @@ extern int s_nNewWidth, s_nNewHeight;
 extern CRangeManager s_RangeMngr; // manages overwritten memory
 extern void FlushTransferRanges(const tex0Info* ptex);
 extern void ProcessMessages();
-void AdjustTransToAspect(Vector& v);
+void AdjustTransToAspect(float4& v);

 // Interlace texture is lazy 1*(height) array of 1 and 0.
 // If its height (named s_nInterlaceTexWidth here) is hanging we must redo
--- a/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglCreate.cpp
@ -82,8 +82,8 @@ extern void KickTriangleFan();
 extern void KickSprite();
 extern void KickDummy();
 extern bool LoadEffects();
-extern bool LoadExtraEffects();
-extern FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
+extern bool ZZshLoadExtraEffects();
+extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);

 GLuint vboRect = 0;
 vector<GLuint> g_vboBuffers; // VBOs for all drawing commands
@ -127,7 +127,6 @@ void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum) = NULL;
 // State parameters

 extern u8* s_lpShaderResources;
-ZZshProgram pvs[16] = {NULL};

 // String's for shader file in developer mode
 #ifdef DEVBUILD
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
@ -156,7 +156,7 @@ int s_nWriteDestAlphaTest = 0;					// ZZ

 ////////////////////
 // State parameters
-static Vector vAlphaBlendColor;	 // used for GPU_COLOR
+static float4 vAlphaBlendColor;	 // used for GPU_COLOR

 static bool bNeedBlendFactorInAlpha;	  // set if the output source alpha is different from the real source alpha (only when blend factor > 0x80)
 static u32 s_dwColorWrite = 0xf;			// the color write mask of the current target
@ -310,7 +310,7 @@ void ZeroGS::ReloadEffects()

 	memset(ppsTexture, 0, sizeof(ppsTexture));

-	LoadExtraEffects();
+	ZZshLoadExtraEffects();
 #endif
 }

@ -830,11 +830,11 @@ inline int FlushGetShaderType(VB& curvb, CRenderTarget* ptextarg, GLuint& ptexcl


 //Set page offsets depends on shader type.
-inline Vector FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRenderTarget* ptextarg)
+inline float4 FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushSetPageOffset");

-	Vector vpageoffset;
+	float4 vpageoffset;
 	vpageoffset.w = 0;

 	switch (shadertype)
@ -863,14 +863,14 @@ inline Vector FlushSetPageOffset(FRAGMENTSHADER* pfragment, int shadertype, CRen
 }

 //Set texture offsets depends omn shader type.
-inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
+inline float4 FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushSetTexOffset");
-	Vector v;
+	float4 v;

 	if (shadertype == 3)
 	{
-		Vector v;
+		float4 v;
 		v.x = 16.0f / (float)curvb.tex0.tw;
 		v.y = 16.0f / (float)curvb.tex0.th;
 		v.z = 0.5f * v.x;
@ -879,7 +879,7 @@ inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 	}
 	else if (shadertype == 4)
 	{
-		Vector v;
+		float4 v;
 		v.x = 16.0f / (float)ptextarg->fbw;
 		v.y = 16.0f / (float)ptextarg->fbh;
 		v.z = -1;
@ -891,10 +891,10 @@ inline Vector FlushSetTexOffset(FRAGMENTSHADER* pfragment, int shadertype, VB& c
 }

 // Set dimension (Real!) of texture. z and w
-inline Vector FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
+inline float4 FlushTextureDims(FRAGMENTSHADER* pfragment, int shadertype, VB& curvb, CRenderTarget* ptextarg)
 {
 	SetShaderCaller("FlushTextureDims");
-	Vector vTexDims;
+	float4 vTexDims;
 	vTexDims.x = (float)RW(curvb.tex0.tw) ;
 	vTexDims.y = (float)RH(curvb.tex0.th) ;

@ -958,14 +958,14 @@ inline FRAGMENTSHADER* FlushUseExistRenderTarget(VB& curvb, CRenderTarget* ptext
 	//int psm = PIXEL_STORAGE_FORMAT(curvb.tex0);
 	int shadertype = FlushGetShaderType(curvb, ptextarg, ptexclut);

-	FRAGMENTSHADER* pfragment = LoadShadeEffect(shadertype, 0, curvb.curprim.fge,
+	FRAGMENTSHADER* pfragment = ZZshLoadShadeEffect(shadertype, 0, curvb.curprim.fge,
 								IsAlphaTestExpansion(curvb.tex0), exactcolor, curvb.clamp, context, NULL);

-	Vector vpageoffset = FlushSetPageOffset(pfragment, shadertype, ptextarg);
+	float4 vpageoffset = FlushSetPageOffset(pfragment, shadertype, ptextarg);

-	Vector v = FlushSetTexOffset(pfragment, shadertype, curvb, ptextarg);
+	float4 v = FlushSetTexOffset(pfragment, shadertype, curvb, ptextarg);

-	Vector vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);
+	float4 vTexDims = FlushTextureDims(pfragment, shadertype, curvb, ptextarg);

 	if (pfragment->sCLUT != NULL && ptexclut != 0)
 		ZZshGLSetTextureParameter(pfragment->sCLUT, ptexclut, "CLUT");
@ -997,7 +997,7 @@ inline FRAGMENTSHADER* FlushMadeNewTarget(VB& curvb, int exactcolor, int context
 		}
 	}

-	FRAGMENTSHADER* pfragment = LoadShadeEffect(0, GetTexFilter(curvb.tex1), curvb.curprim.fge,
+	FRAGMENTSHADER* pfragment = ZZshLoadShadeEffect(0, GetTexFilter(curvb.tex1), curvb.curprim.fge,
 								IsAlphaTestExpansion(curvb.tex0), exactcolor, curvb.clamp, context, NULL);

 	if (pfragment == NULL)
@ -1160,7 +1160,7 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
 		}

 		// harvest fishing
-		Vector v = vAlphaBlendColor;
+		float4 v = vAlphaBlendColor;

 		if (exactcolor)
 		{
@ -1173,7 +1173,7 @@ inline u32 AlphaRenderAlpha(VB& curvb, const pixTest curtest, FRAGMENTSHADER* pf
 	else
 	{
 		// not using blending so set to defaults
-		Vector v = exactcolor ? Vector(1, 510 * 255.0f / 256.0f, 0, 0) : Vector(1, 2 * 255.0f / 256.0f, 0, 0);
+		float4 v = exactcolor ? float4(1, 510 * 255.0f / 256.0f, 0, 0) : float4(1, 2 * 255.0f / 256.0f, 0, 0);
 		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");

 	}
@ -1257,7 +1257,7 @@ inline void AlphaPabe(VB& curvb, FRAGMENTSHADER* pfragment, int exactcolor)
 		glDisable(GL_BLEND);
 		GL_STENCILFUNC_SET();

-		Vector v;
+		float4 v;
 		v.x = 1;
 		v.y = 2;
 		v.z = 0;
@ -1330,7 +1330,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE
 	if (gs.pabe && bCanRenderStencil)
 	{
 		// only render the pixels with alpha values >= 0x80
-		Vector v = vAlphaBlendColor;
+		float4 v = vAlphaBlendColor;

 		if (exactcolor) { v.y *= 255; v.w *= 255; }

@ -1350,7 +1350,7 @@ inline void AlphaFailureTestJob(VB& curvb, const pixTest curtest,  FRAGMENTSHADE
 		glDisable(GL_BLEND);
 		GL_STENCILFUNC_SET();

-		Vector v;
+		float4 v;
 		v.x = 1;
 		v.y = 2;
 		v.z = 0;
@ -1409,7 +1409,7 @@ inline void AlphaSpecialTesting(VB& curvb, FRAGMENTSHADER* pfragment, u32 dwUsin
 		glStencilFunc(GL_EQUAL, STENCIL_SPECIAL | STENCIL_PIXELWRITE, STENCIL_SPECIAL | STENCIL_PIXELWRITE);
 		glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);

-		Vector v = Vector(0, exactcolor ? 510.0f : 2.0f, 0, 0);
+		float4 v = float4(0, exactcolor ? 510.0f : 2.0f, 0, 0);
 		ZZshSetParameter4fv(pfragment->sOneColor, v, "g_fOneColor");
 		Draw(curvb);

@ -1560,7 +1560,7 @@ inline void ZeroGS::RenderFBA(const VB& curvb, ZZshParameter sOneColor)

 	glAlphaFunc(GL_GEQUAL, 1);

-	Vector v(1,2,0,0);
+	float4 v(1,2,0,0);

 	ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");

@ -1599,7 +1599,7 @@ __forceinline void ZeroGS::RenderAlphaTest(const VB& curvb, ZZshParameter sOneCo

 	SetShaderCaller("RenderAlphaTest");

-	Vector v(1,2,0,0);
+	float4 v(1,2,0,0);

 	ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");

@ -1624,7 +1624,7 @@ __forceinline void ZeroGS::RenderAlphaTest(const VB& curvb, ZZshParameter sOneCo

 	if (curvb.test.ate && curvb.test.atst > 1 && curvb.test.aref > 0x80)
 	{
-		v = Vector(1,1,0,0);
+		v = float4(1,1,0,0);
 		ZZshSetParameter4fv(sOneColor, v, "g_fOneColor");
 		glAlphaFunc(g_dwAlphaCmp[curvb.test.atst], AlphaReferedValue(curvb.test.aref));
 	}
@ -1925,12 +1925,12 @@ void ZeroGS::SetTexInt(int context, FRAGMENTSHADER* pfragment, int settexint)
 }

 // clamp relies on texture width
-void ZeroGS::SetTexClamping(int context, FRAGMENTSHADER* pfragment)
+void SetTexClamping(int context, FRAGMENTSHADER* pfragment)
 {
 	FUNCLOG
 	SetShaderCaller("SetTexClamping");
 	clampInfo* pclamp = &ZeroGS::vb[context].clamp;
-	Vector v, v2;
+	float4 v, v2;
 	v.x = v.y = 0;
 	u32* ptex = ZeroGS::vb[context].ptexClamp;
 	ptex[0] = ptex[1] = 0;
@ -2015,8 +2015,8 @@ void ZeroGS::SetTexClamping(int context, FRAGMENTSHADER* pfragment)

 }

-// Fixme should be in Vector lib
-inline bool equal_vectors(Vector a, Vector b)
+// Fixme should be in float4 lib
+inline bool equal_vectors(float4 a, float4 b)
 {
 	if (abs(a.x - b.x) + abs(a.y - b.y) + abs(a.z - b.z) + abs(a.w - b.w) < 0.01)
 		return true;
@ -2033,7 +2033,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)

 	assert(!vb[context].bNeedTexCheck);

-	Vector v, v2;
+	float4 v, v2;

 	tex0Info& tex0 = vb[context].tex0;

@ -2045,14 +2045,14 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 		SetShaderCaller("SetTexVariables");

 		// alpha and texture highlighting
-		Vector valpha, valpha2 ;
+		float4 valpha, valpha2 ;

 		// if clut, use the frame format
 		int psm = PIXEL_STORAGE_FORMAT(tex0);

 //		ZZLog::Error_Log( "A %d psm, is-clut %d. cpsm %d | %d %d", psm,  PSMT_ISCLUT(psm), tex0.cpsm,  tex0.tfx, tex0.tcc );

-		Vector vblack;
+		float4 vblack;
 		vblack.x = vblack.y = vblack.z = vblack.w = 10;

 		/* tcc -- Tecture Color Component 0=RGB, 1=RGBA + use Alpha from TEXA reg when not in PSM
@ -2096,7 +2096,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)

 		/*
 		// Test, old code.
-				Vector valpha3, valpha4;
+				float4 valpha3, valpha4;
 		 		switch(tex0.tfx) {
 					case 0:
 						valpha3.z = 0; valpha3.w = 0;
@ -2206,7 +2206,7 @@ void ZeroGS::SetTexVariables(int context, FRAGMENTSHADER* pfragment)
 void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force)
 {
 	FUNCLOG
-	Vector v;
+	float4 v;
 	CMemoryTarget* pmemtarg = g_MemTargs.GetMemoryTarget(tex0, 1);

 	assert( pmemtarg != NULL && pfragment != NULL && pmemtarg->ptex != NULL);	
@ -2248,7 +2248,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,

 	float fbw = (float)tex0.tbw;

-	Vector vTexDims;
+	float4 vTexDims;

 	vTexDims.x = b.vTexDims.x * (fw);
 	vTexDims.y = b.vTexDims.y * (fh);
@ -2291,7 +2291,7 @@ void ZeroGS::SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0,

 	ZZshSetParameter4fv(pfragment->fTexDims, vTexDims, "g_fTexDims");

-//	ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from Vector to float[4] is ok.
+//	ZZshSetParameter4fv(pfragment->fTexBlock, b.vTexBlock, "g_fTexBlock"); // I change it, and it's working. Seems casting from float4 to float[4] is ok.
 	ZZshSetParameter4fv(pfragment->fTexBlock, &b.vTexBlock.x, "g_fTexBlock");
 	ZZshSetParameter4fv(pfragment->fTexOffset, v, "g_fTexOffset");

@ -2403,7 +2403,7 @@ void ZeroGS::SetAlphaVariables(const alphaInfo& a)
 	s_rgbeq = 1;

 //	s_alphaInfo = a;
-	vAlphaBlendColor = Vector(1, 2 * 255.0f / 256.0f, 0, 0);
+	vAlphaBlendColor = float4(1, 2 * 255.0f / 256.0f, 0, 0);
 	u32 usec = a.c;


--- a/plugins/zzogl-pg/opengl/zerogsmath.h
+++ b/plugins/zzogl-pg/opengl/zerogsmath.h
@ -2,12 +2,15 @@
  *
  * Zerofrog's ZeroGS KOSMOS (c)2005-2008
  *
-  * Zerofrog forgot to write any copyright notice after release the plugin into GPLv2
+  * Zerofrog forgot to write any copyright notice after releasing the plugin into GPLv2
  * If someone can contact him successfully to clarify this matter that would be great.
  */

-#ifndef ZEROGS_MATH_H
-#define ZEROGS_MATH_H
+// Now that it's down to 82 lines, and most of it's fairly obvious, perhaps it'd be easier to 
+// just reimplement it... -arcum42
+
+#ifndef ZZOGLMATH_H_INCLUDED
+#define ZZOGLMATH_H_INCLUDED

 #ifndef _WIN32
 #include <alloca.h>
@ -22,16 +25,16 @@ typedef float dReal;
 // class used for 3 and 4 dim vectors and quaternions
 // It is better to use this for a 3 dim vector because it is 16byte aligned and SIMD instructions can be used

-class Vector
+class float4
 {
 	public:
 		dReal x, y, z, w;

-		Vector() : x(0), y(0), z(0), w(0) {}
-		Vector(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
-		Vector(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
-		Vector(const Vector &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
-		Vector(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
+		float4() : x(0), y(0), z(0), w(0) {}
+		float4(dReal x, dReal y, dReal z) : x(x), y(y), z(z), w(0) {}
+		float4(dReal x, dReal y, dReal z, dReal w) : x(x), y(y), z(z), w(w) {}
+		float4(const float4 &vec) : x(vec.x), y(vec.y), z(vec.z), w(vec.w) {}
+		float4(const dReal* pf) { assert(pf != NULL); x = pf[0]; y = pf[1]; z = pf[2]; w = 0; }
 		dReal  operator[](int i) const	   { return (&x)[i]; }
 		dReal& operator[](int i)			 { return (&x)[i]; }
 		
@ -40,7 +43,7 @@ class Vector
 		operator const dReal*() const { return (const dReal*)&x; }
 		
 		// SCALAR FUNCTIONS
-		inline dReal dot(const Vector &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
+		inline dReal dot(const float4 &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; }
 		inline void Set3(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; }
 		inline void Set4(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; w = pvals[3]; }
 		inline void SetColor(u32 color)
@ -53,28 +56,28 @@ class Vector
 		// 3 dim cross product, w is not touched
 		/// this = this x v
 		/// this = u x v
-		inline Vector operator-() const { Vector v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
-		inline Vector operator+(const Vector &r) const { Vector v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
-		inline Vector operator-(const Vector &r) const { Vector v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
-		inline Vector operator*(const Vector &r) const { Vector v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
-		inline Vector operator*(dReal k) const { Vector v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
-		inline Vector& operator += (const Vector& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
-		inline Vector& operator -= (const Vector& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
-		inline Vector& operator *= (const Vector& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
-		inline Vector& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
-		inline Vector& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
-		friend Vector operator*(float f, const Vector& v);
-		//friend ostream& operator<<(ostream& O, const Vector& v);
-		//friend istream& operator>>(istream& I, Vector& v);
+		inline float4 operator-() const { float4 v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; }
+		inline float4 operator+(const float4 &r) const { float4 v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; }
+		inline float4 operator-(const float4 &r) const { float4 v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; }
+		inline float4 operator*(const float4 &r) const { float4 v; v.x = r.x * x; v.y = r.y * y; v.z = r.z * z; v.w = r.w * w; return v; }
+		inline float4 operator*(dReal k) const { float4 v; v.x = k * x; v.y = k * y; v.z = k * z; v.w = k * w; return v; }
+		inline float4& operator += (const float4& r) { x += r.x; y += r.y; z += r.z; w += r.w; return *this; }
+		inline float4& operator -= (const float4& r) { x -= r.x; y -= r.y; z -= r.z; w -= r.w; return *this; }
+		inline float4& operator *= (const float4& r) { x *= r.x; y *= r.y; z *= r.z; w *= r.w; return *this; }
+		inline float4& operator *= (const dReal k) { x *= k; y *= k; z *= k; w *= k; return *this; }
+		inline float4& operator /= (const dReal _k) { dReal k = 1 / _k; x *= k; y *= k; z *= k; w *= k; return *this; }
+		friend float4 operator*(float f, const float4& v);
+		//friend ostream& operator<<(ostream& O, const float4& v);
+		//friend istream& operator>>(istream& I, float4& v);
 };

-inline Vector operator*(float f, const Vector& left)
+inline float4 operator*(float f, const float4& left)
 {
-	Vector v;
+	float4 v;
 	v.x = f * left.x;
 	v.y = f * left.y;
 	v.z = f * left.z;
 	return v;
-}
-
-#endif
+}
+
+#endif // ZZOGLMATH_H_INCLUDED
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.cpp
--- a/plugins/zzogl-pg/opengl/ZZoglShaders.h
+++ b/plugins/zzogl-pg/opengl/ZZoglShaders.h
@ -55,16 +55,16 @@ inline bool ZZshActiveParameter(ZZshParameter param) {return (param !=NULL); }
 #endif					// end NVIDIA cg-toolkit API

 const static char* g_pPsTexWrap[] = { "-DREPEAT", "-DCLAMP", "-DREGION_REPEAT", NULL };
-const static char* g_pTexTypes[] = { "32", "tex32", "clut32", "tex32to16", "tex16to8h" };

 enum ZZshShaderType {ZZ_SH_ZERO, ZZ_SH_REGULAR, ZZ_SH_REGULAR_FOG, ZZ_SH_TEXTURE, ZZ_SH_TEXTURE_FOG, ZZ_SH_CRTC};
-// We have "compatible" shaders, as RegularFogVS and RegularFogPS, if we don't need to worry about incompatible shaders.
-// It's used only in GLSL mode. 
+// We have "compatible" shaders, such as RegularFogVS and RegularFogPS, if we don't need to worry about incompatible shaders.
+// It's used only in GLSL mode.

 // ------------------------- Variables -------------------------------
-extern int g_nPixelShaderVer;
-extern ZZshShaderLink pvs[16], g_vsprog, g_psprog;
-extern ZZshParameter g_vparamPosXY[2], g_fparamFogColor;
+
+extern int 		g_nPixelShaderVer;
+extern ZZshShaderLink 	pvs[16], g_vsprog, g_psprog;
+extern ZZshParameter 	g_vparamPosXY[2], g_fparamFogColor;

 #define MAX_ACTIVE_UNIFORMS 600
 #define MAX_ACTIVE_SHADERS 400
@ -73,18 +73,18 @@ struct FRAGMENTSHADER
 {
 	FRAGMENTSHADER() : prog(sZero), Shader(0), sMemory(pZero), sFinal(pZero), sBitwiseANDX(pZero), sBitwiseANDY(pZero), sInterlace(pZero), sCLUT(pZero), sOneColor(pZero), sBitBltZ(pZero),
 		fTexAlpha2(pZero), fTexOffset(pZero), fTexDims(pZero), fTexBlock(pZero), fClampExts(pZero), fTexWrapMode(pZero),
-		fRealTexDims(pZero), fTestBlack(pZero), fPageOffset(pZero), fTexAlpha(pZero) {}
-		
-	ZZshShaderLink prog;						// it links to the FRAGMENTSHADER structure, for compatibility between GLSL and CG.
-	ZZshShader Shader;							// GLSL store shaders not as ready programs, but as shader compiled objects. VS and PS should be linked together to
-												// make a program.
+		fRealTexDims(pZero), fTestBlack(pZero), fPageOffset(pZero), fTexAlpha(pZero)  {}
+	
+	ZZshShaderLink prog;						// it links to the FRAGMENTSHADER structure, for compatibility between GLSL and CG
+	ZZshShader Shader;						// GLSL stores shaders not as ready programs, but as compiled shader objects. VS and PS should be linked together to
+									// make a program.
 	ZZshShaderType ShaderType;					// Not every PS and VS are used together, only compatible ones.

 	ZZshParameter sMemory, sFinal, sBitwiseANDX, sBitwiseANDY, sInterlace, sCLUT;
 	ZZshParameter sOneColor, sBitBltZ, sInvTexDims;
 	ZZshParameter fTexAlpha2, fTexOffset, fTexDims, fTexBlock, fClampExts, fTexWrapMode, fRealTexDims, fTestBlack, fPageOffset, fTexAlpha;

-	int ParametersStart, ParametersFinish;				// this is part of UniformsIndex array in which parameters of this shader asre stored. The last one is ParametersFinish-1
+	int ParametersStart, ParametersFinish;				// this is the part of the UniformsIndex array in which the parameters of this shader are stored. The last one is ParametersFinish-1

 #ifdef _DEBUG
 	string filename;
@ -145,7 +145,7 @@ struct FRAGMENTSHADER
 		return false;
 	}

-	bool set_shader_const(Vector v, const char *name)
+	bool set_shader_const(float4 v, const char *name)
 	{
 		ZZshParameter p;

@ -174,29 +174,17 @@ struct VERTEXSHADER
 	int ParametersStart, ParametersFinish;
 };

-namespace ZeroGS {
-	// Shaders variables
-	extern Vector g_vdepth;	
-	extern Vector vlogz;
+namespace ZeroGS { 
+	extern float4 g_vdepth;	
+	extern float4 vlogz;
 	extern VERTEXSHADER pvsBitBlt;
 	extern FRAGMENTSHADER ppsBitBlt[2], ppsBitBltDepth, ppsOne;					// ppsOne used to stop using shaders for draw
 	extern FRAGMENTSHADER ppsBaseTexture, ppsConvert16to32, ppsConvert32to16;
-	bool LoadEffects();
-	bool LoadExtraEffects();
-	FRAGMENTSHADER* LoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);

-	// only sets a limited amount of state (for Update)
-	void SetTexClamping(int context, FRAGMENTSHADER* pfragment);
-	void SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force);
+	extern FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
+	extern FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
 }

-// ------------------------- Variables -------------------------------
-
-extern u8* s_lpShaderResources;
-extern ZZshProfile cgvProf, cgfProf;
-extern FRAGMENTSHADER ppsRegular[4], ppsTexture[NUM_SHADERS];
-extern FRAGMENTSHADER ppsCRTC[2], ppsCRTC24[2], ppsCRTCTarg[2];
-
 // ------------------------- Functions -------------------------------

 #ifdef NVIDIA_CG_API
@ -208,7 +196,7 @@ inline bool ZZshExistProgram(ZZshShaderLink prog) {return (prog != NULL); };
 extern const char* ShaderCallerName;
 extern const char* ShaderHandleName;

-inline void SetShaderCaller(const char* Name) {
+inline void SetShaderCaller(const char* Name) {	
 	ShaderCallerName = Name;
 }

@ -222,22 +210,23 @@ inline void ResetShaderCounters() {

 extern bool ZZshCheckProfilesSupport();
 extern bool ZZshStartUsingShaders();
+extern bool ZZshCreateOpenShadersFile();
 extern void ZZshGLDisableProfile();
 extern void ZZshGLEnableProfile();
+extern void ZZshSetParameter4fv(ZZshShaderLink prog, ZZshParameter param, const float* v, const char* name);
 extern void ZZshSetParameter4fv(ZZshParameter param, const float* v, const char* name);
+extern void ZZshSetParameter4fvWithRetry(ZZshParameter* param, ZZshShaderLink prog, const float* v, const char* name);
+extern void ZZshGLSetTextureParameter(ZZshShaderLink prog, ZZshParameter param, GLuint texobj, const char* name);
 extern void ZZshGLSetTextureParameter(ZZshParameter param, GLuint texobj, const char* name);
 extern void ZZshDefaultOneColor( FRAGMENTSHADER ptr );
-extern void ZZshSetVertexShader(ZZshShader prog);
-extern void ZZshSetPixelShader(ZZshShader prog);
+extern void ZZshSetVertexShader(ZZshShaderLink prog);
+extern void ZZshSetPixelShader(ZZshShaderLink prog);
+extern bool ZZshLoadExtraEffects();

-inline int GET_SHADER_INDEX(int type, int texfilter, int texwrap, int fog, int writedepth, int testaem, int exactcolor, int context, int ps)
-{
-	return type + texfilter*NUM_TYPES + NUM_FILTERS*NUM_TYPES*texwrap + NUM_TEXWRAPS*NUM_FILTERS*NUM_TYPES*(fog+2*writedepth+4*testaem+8*exactcolor+16*context+32*ps);
+extern FRAGMENTSHADER* ZZshLoadShadeEffect(int type, int texfilter, int fog, int testaem, int exactcolor, const clampInfo& clamp, int context, bool* pbFailed);
+
+namespace ZeroGS {
+	// only sets a limited amount of state (for Update)
+	void SetTexVariablesInt(int context, int bilinear, const tex0Info& tex0, bool CheckVB, FRAGMENTSHADER* pfragment, int force);
 }
-	
-struct SHADERHEADER
-{
-	unsigned int index, offset, size; // if highest bit of index is set, pixel shader
-};
-
 #endif
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@ -122,22 +122,22 @@ inline void FillOnlyStencilBuffer()

 // used for transformation from vertex position in GS window.coords (I hope)
 // to view coordinates (in range 0, 1).
-inline Vector ZeroGS::CRenderTarget::DefaultBitBltPos()
+inline float4 ZeroGS::CRenderTarget::DefaultBitBltPos()
 {
-	Vector v = Vector(1, -1, 0.5f / (float)RW(fbw), 0.5f / (float)RH(fbh));
+	float4 v = float4(1, -1, 0.5f / (float)RW(fbw), 0.5f / (float)RH(fbh));
 	v *= 1.0f / 32767.0f;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_sBitBltPos");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_sBitBltPos");
 	return v;
 }

 // Used to transform texture coordinates from GS (when 0,0 is upper left) to
 // OpenGL (0,0 - lower left).
-inline Vector ZeroGS::CRenderTarget::DefaultBitBltTex()
+inline float4 ZeroGS::CRenderTarget::DefaultBitBltTex()
 {
 	// I really sure that -0.5 is correct, because OpenGL have no half-offset
 	// issue, DirectX known for.
-	Vector v = Vector(1, -1, 0.5f / (float)RW(fbw), -0.5f / (float)RH(fbh));
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_sBitBltTex");
+	float4 v = float4(1, -1, 0.5f / (float)RW(fbw), -0.5f / (float)RH(fbh));
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_sBitBltTex");
 	return v;
 }

@ -222,7 +222,7 @@ void ZeroGS::CRenderTarget::SetTarget(int fbplocal, const Rect2& scissor, int co

 	if (fbplocal != fbp)
 	{
-		Vector v;
+		float4 v;

 		// will be rendering to a subregion
 		u32 bpp = PSMT_ISHALF(psm) ? 2 : 4;
@ -401,7 +401,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 	((CDepthTarget*)pdepth)->SetDepthStencilSurface();

 	SetShaderCaller("CRenderTarget::Update");
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();

 	CRenderTargetMngr::MAPTARGETS::iterator ittarg;

@ -432,7 +432,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)

 	if (nUpdateTarg)
 	{
-		ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ittarg->second->ptex, "BaseTexture.final");
+		ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ittarg->second->ptex, "BaseTexture.final");

 		//assert( ittarg->second->fbw == fbw );
 		int offset = (fbp - ittarg->second->fbp) * 64 / fbw;
@ -445,7 +445,7 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 		v.z = 0.25f;
 		v.w = (float)RH(offset) + 0.25f;

-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");

 //		v = DefaultBitBltTex(); Maybe?
 		ZZshDefaultOneColor ( ppsBaseTexture );
@ -472,14 +472,14 @@ void ZeroGS::CRenderTarget::Update(int context, ZeroGS::CRenderTarget* pdepth)
 		// Fix in r133 -- FFX movies and Gust backgrounds!
 		//SetTexVariablesInt(0, 0*(AA.x || AA.y) ? 2 : 0, texframe, false, &ppsBitBlt[!!s_AAx], 1);
 		SetTexVariablesInt(0, 0, texframe, false, &ppsBitBlt[bit_idx], 1);
-		ZZshGLSetTextureParameter(ppsBitBlt[bit_idx].sMemory, vb[0].pmemtarg->ptex->tex, "BitBlt.memory");
+		ZZshGLSetTextureParameter(ppsBitBlt[bit_idx].prog, ppsBitBlt[bit_idx].sMemory, vb[0].pmemtarg->ptex->tex, "BitBlt.memory");

-		v = Vector(1, 1, 0.0f, 0.0f);
-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+		v = float4(1, 1, 0.0f, 0.0f);
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");

 		v.x = 1;
 		v.y = 2;
-		ZZshSetParameter4fv(ppsBitBlt[bit_idx].sOneColor, v, "g_fOneColor");
+		ZZshSetParameter4fv(ppsBitBlt[bit_idx].prog, ppsBitBlt[bit_idx].sOneColor, v, "g_fOneColor");

 		assert(ptex != 0);

@ -536,26 +536,26 @@ void ZeroGS::CRenderTarget::ConvertTo32()
 	SetShaderCaller("CRenderTarget::ConvertTo32");

 	// tex coords, test ffx bikanel island when changing these
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 	v = DefaultBitBltTex();

 	v.x = (float)RW(16);
 	v.y = (float)RH(16);
 	v.z = -(float)RW(fbw);
 	v.w = (float)RH(8);
-	ZZshSetParameter4fv(ppsConvert16to32.fTexOffset, v, "g_fTexOffset");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fTexOffset, v, "g_fTexOffset");

 	v.x = (float)RW(8);
 	v.y = 0;
 	v.z = 0;
 	v.w = 0.25f;
-	ZZshSetParameter4fv(ppsConvert16to32.fPageOffset, v, "g_fPageOffset");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fPageOffset, v, "g_fPageOffset");

 	v.x = (float)RW(2 * fbw);
 	v.y = (float)RH(fbh);
 	v.z = 0;
 	v.w = 0.0001f * (float)RH(fbh);
-	ZZshSetParameter4fv(ppsConvert16to32.fTexDims, v, "g_fTexDims");
+	ZZshSetParameter4fv(ppsConvert16to32.prog, ppsConvert16to32.fTexDims, v, "g_fTexDims");

 //	v.x = 0;
 //	ZZshSetParameter4fv(ppsConvert16to32.fTexBlock, v, "g_fTexBlock");
@ -568,7 +568,7 @@ void ZeroGS::CRenderTarget::ConvertTo32()
 	ZeroGS::ResetRenderTarget(1);

 	BindToSample(&ptex);
-	ZZshGLSetTextureParameter(ppsConvert16to32.sFinal, ptex, "Convert 16 to 32.Final");
+	ZZshGLSetTextureParameter(ppsConvert16to32.prog, ppsConvert16to32.sFinal, ptex, "Convert 16 to 32.Final");

 	fbh /= 2; // have 16 bit surfaces are usually 2x higher
 	SetViewport();
@ -640,26 +640,26 @@ void ZeroGS::CRenderTarget::ConvertTo16()
 	SetShaderCaller("CRenderTarget::ConvertTo16");

 	// tex coords, test ffx bikanel island when changing these
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();
 	v = DefaultBitBltTex();

 	v.x = 16.0f / (float)fbw;
 	v.y = 8.0f / (float)fbh;
 	v.z = 0.5f * v.x;
 	v.w = 0.5f * v.y;
-	ZZshSetParameter4fv(ppsConvert32to16.fTexOffset, v, "g_fTexOffset");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fTexOffset, v, "g_fTexOffset");

 	v.x = 256.0f / 255.0f;
 	v.y = 256.0f / 255.0f;
 	v.z = 0.05f / 256.0f;
 	v.w = -0.001f / 256.0f;
-	ZZshSetParameter4fv(ppsConvert32to16.fPageOffset, v, "g_fPageOffset");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fPageOffset, v, "g_fPageOffset");

 	v.x = (float)RW(fbw);
 	v.y = (float)RH(2 * fbh);
 	v.z = 0;
 	v.w = -0.1f / RH(fbh);
-	ZZshSetParameter4fv(ppsConvert32to16.fTexDims, v, "g_fTexDims");
+	ZZshSetParameter4fv(ppsConvert32to16.prog, ppsConvert32to16.fTexDims, v, "g_fTexDims");

 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
 	SET_STREAM();
@ -671,7 +671,7 @@ void ZeroGS::CRenderTarget::ConvertTo16()

 	BindToSample(&ptex);

-	ZZshGLSetTextureParameter(ppsConvert32to16.sFinal, ptex, "Convert 32 to 16");
+	ZZshGLSetTextureParameter(ppsConvert32to16.prog, ppsConvert32to16.sFinal, ptex, "Convert 32 to 16");

 //	fbh *= 2; // have 16 bit surfaces are usually 2x higher

@ -748,22 +748,22 @@ void ZeroGS::CRenderTarget::_CreateFeedback()
 	ResetRenderTarget(1);

 	// tex coords, test ffx bikanel island when changing these
-	/*	Vector v = DefaultBitBltPos();
-		v = Vector ((float)(RW(fbw+4)), (float)(RH(fbh+4)), +0.25f, -0.25f);
-		ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "BitBltTex");*/
+	/*	float4 v = DefaultBitBltPos();
+		v = float4 ((float)(RW(fbw+4)), (float)(RH(fbh+4)), +0.25f, -0.25f);
+		ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "BitBltTex");*/

 	// tex coords, test ffx bikanel island when changing these

-//	Vector v = Vector(1, -1, 0.5f / (fbw << AA.x), 0.5f / (fbh << AA.y));
+//	float4 v = float4(1, -1, 0.5f / (fbw << AA.x), 0.5f / (fbh << AA.y));
 //	v *= 1/32767.0f;
 //	cgGLSetParameter4fv(pvsBitBlt.sBitBltPos, v);
-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();

 	v.x = (float)(RW(fbw));
 	v.y = (float)(RH(fbh));
 	v.z = 0.0f;
 	v.w = 0.0f;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "BitBlt.Feedback");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "BitBlt.Feedback");
 	ZZshDefaultOneColor(ppsBaseTexture);

 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);
@ -773,7 +773,7 @@ void ZeroGS::CRenderTarget::_CreateFeedback()
 	glBindTexture(GL_TEXTURE_RECTANGLE_NV, ptex);
 	GL_REPORT_ERRORD();

-	ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ptex, "BaseTexture.Feedback");
+	ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ptex, "BaseTexture.Feedback");

 	SetViewport();

@ -976,9 +976,9 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)

 	// write color and zero out stencil buf, always 0 context!
 	SetTexVariablesInt(0, 0, texframe, false, &ppsBitBltDepth, 1);
-	ZZshGLSetTextureParameter(ppsBitBltDepth.sMemory, vb[0].pmemtarg->ptex->tex, "BitBltDepth");
+	ZZshGLSetTextureParameter(ppsBitBltDepth.prog, ppsBitBltDepth.sMemory, vb[0].pmemtarg->ptex->tex, "BitBltDepth");

-	Vector v = DefaultBitBltPos();
+	float4 v = DefaultBitBltPos();

 	v = DefaultBitBltTex();

@ -986,9 +986,9 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)
 	v.y = 2;
 	v.z = PSMT_IS16Z(psm) ? 1.0f : 0.0f;
 	v.w = g_filog32;
-	ZZshSetParameter4fv(ppsBitBltDepth.sOneColor, v, "g_fOneColor");
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sOneColor, v, "g_fOneColor");

-	Vector vdepth = g_vdepth;
+	float4 vdepth = g_vdepth;

 	if (psm == PSMT24Z)
 	{
@ -1001,7 +1001,7 @@ void ZeroGS::CDepthTarget::Update(int context, ZeroGS::CRenderTarget* prndr)

 	assert(ppsBitBltDepth.sBitBltZ != 0);

-	ZZshSetParameter4fv(ppsBitBltDepth.sBitBltZ, ((255.0f / 256.0f)*vdepth), "g_fBitBltZ");
+	ZZshSetParameter4fv(ppsBitBltDepth.prog, ppsBitBltDepth.sBitBltZ, ((255.0f / 256.0f)*vdepth), "g_fBitBltZ");

 	assert(pdepth != 0);
 	//GLint w1 = 0;
--- a/plugins/zzogl-pg/opengl/targets.h
+++ b/plugins/zzogl-pg/opengl/targets.h
@ -228,7 +228,6 @@ inline list<ZeroGS::CRenderTarget*> CreateTargetsList(int start, int end)
 	return listTargs;
 }

-extern Vector g_vdepth;
 extern int icurctx;
 extern GLuint vboRect;

--- a/plugins/zzogl-pg/opengl/zerogs.cpp
+++ b/plugins/zzogl-pg/opengl/zerogs.cpp
@ -29,7 +29,6 @@
 #include "Mem.h"
 #include "x86.h"
 #include "zerogs.h"
-#include "zpipe.h"
 #include "targets.h"
 #include "GLWin.h"
 #include "ZZoglShaders.h"
@ -51,7 +50,6 @@ extern int g_nFrame, g_nRealFrame;
 //-------------------------- Variables

 primInfo *prim;
-ZZshProgram g_vsprog = 0, g_psprog = 0;							// 2 -- ZZ

 inline u32 FtoDW(float f) { return (*((u32*)&f)); }

@ -82,7 +80,6 @@ PFNGLDRAWBUFFERSPROC glDrawBuffers = NULL;

 /////////////////////
 // graphics resources
-ZZshParameter g_vparamPosXY[2] = {0}, g_fparamFogColor = 0;

 bool s_bTexFlush = false;
 int s_nLastResolveReset = 0;
@ -94,10 +91,8 @@ int nBackbufferWidth, nBackbufferHeight;									// ZZ

 namespace ZeroGS
 {
-Vector g_vdepth, vlogz;
-
-//       	= Vector( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
-//	Vector g_vdepth = Vector( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);
+//       	= float4( 255.0 /256.0f,  255.0/65536.0f, 255.0f/(65535.0f*256.0f), 1.0f/(65536.0f*65536.0f));
+//	float4 g_vdepth = float4( 65536.0f*65536.0f, 256.0f*65536.0f, 65536.0f, 256.0f);

 extern CRangeManager s_RangeMngr; // manages overwritten memory

@ -341,7 +336,7 @@ void ZeroGS::DrawText(const char* pstr, int left, int top, u32 color)
 	FUNCLOG
 	ZZshGLDisableProfile();

-	Vector v;
+	float4 v;
 	v.SetColor(color);
 	glColor3f(v.z, v.y, v.x);
 	//glColor3f(((color >> 16) & 0xff) / 255.0f, ((color >> 8) & 0xff)/ 255.0f, (color & 0xff) / 255.0f);
@ -490,19 +485,19 @@ void ZeroGS::RenderCustom(float fAlpha)
 	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);

 	// tex coords
-	Vector v = Vector(1 / 32767.0f, 1 / 32767.0f, 0, 0);
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
+	float4 v = float4(1 / 32767.0f, 1 / 32767.0f, 0, 0);
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltPos, v, "g_fBitBltPos");
 	v.x = (float)nLogoWidth;
 	v.y = (float)nLogoHeight;
-	ZZshSetParameter4fv(pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");
+	ZZshSetParameter4fv(pvsBitBlt.prog, pvsBitBlt.sBitBltTex, v, "g_fBitBltTex");

 	v.x = v.y = v.z = v.w = fAlpha;
-	ZZshSetParameter4fv(ppsBaseTexture.sOneColor, v, "g_fOneColor");
+	ZZshSetParameter4fv(ppsBaseTexture.prog, ppsBaseTexture.sOneColor, v, "g_fOneColor");

 	if (conf.wireframe()) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);

 	// inside vhDCb[0]'s target area, so render that region only
-	ZZshGLSetTextureParameter(ppsBaseTexture.sFinal, ptexLogo, "Logo");
+	ZZshGLSetTextureParameter(ppsBaseTexture.prog, ppsBaseTexture.sFinal, ptexLogo, "Logo");
 	glBindBuffer(GL_ARRAY_BUFFER, vboRect);

 	SET_STREAM();
@ -781,7 +776,7 @@ void ZeroGS::SetFogColor(u32 fog)
 	ZeroGS::FlushBoth();

 	SetShaderCaller("SetFogColor");
-	Vector v;
+	float4 v;

 	// set it immediately
 	v.SetColor(gs.fogcol);
@ -795,7 +790,7 @@ void ZeroGS::SetFogColor(GIFRegFOGCOL* fog)
 	FUNCLOG
 	
 	SetShaderCaller("SetFogColor");
-	Vector v;
+	float4 v;
 	
 	v.x = fog->FCR / 255.0f;
 	v.y = fog->FCG / 255.0f;
--- a/plugins/zzogl-pg/opengl/zerogs.h
+++ b/plugins/zzogl-pg/opengl/zerogs.h
@ -66,7 +66,10 @@ extern float g_fiGPU_TEXWIDTH;
 #define MASKDIVISOR		0							// Used for decrement bitwise mask texture size if 1024 is too big
 #define GPU_TEXMASKWIDTH	(1024 >> MASKDIVISOR)	// bitwise mask width for region repeat mode

+extern u32 ptexBlocks;		// holds information on block tiling. It's the texture number in OpenGL -- if 0, then such a texture
+extern u32 ptexConv16to32;	// does not exist. These textures should be created on start and released on finish.
 extern u32 ptexBilinearBlocks;
+extern u32 ptexConv32to16;

 // this is currently *not* used as a bool, in spite of its moniker --air
 // Actually, the only thing written to it is 1 or 0, which makes the (g_bSaveFlushedFrame & 0x80000000) check rather bizzare.
@ -136,7 +139,7 @@ class CRenderTarget
 		int fbp, fbw, fbh, fbhCalc; // if fbp is negative, virtual target (not mapped to any real addr)
 		int start, end; // in bytes
 		u32 lastused;	// time stamp since last used
-		Vector vposxy;
+		float4 vposxy;

 		u32 fbm;
 		u16 status;
@ -161,8 +164,8 @@ class CRenderTarget
 			TS_NeedConvert32 = 16,
 			TS_NeedConvert16 = 32,
 		};
-		inline Vector DefaultBitBltPos() ;
-		inline Vector DefaultBitBltTex() ;
+		inline float4 DefaultBitBltPos();
+		inline float4 DefaultBitBltTex();

 	private:
 		void _CreateFeedback();