IPU: Removed the MPEG internal 32-bit buffer and all associated logic for "rewinding" bits out of that buffer and back into the IPU's internal 2QWC buffer. This simplifies the IPU's bitstreaming code quite a bit, but isn't really much faster (yet).

(Savestate version upgraded.)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3788 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-09-17 13:10:54 +00:00 · 2010-09-17 13:10:54 +00:00 · 47cf781130
parent da49cce435
commit 47cf781130
7 changed files with 233 additions and 345 deletions
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@ -55,9 +55,6 @@ int coded_block_pattern = 0;
 u8 indx4[16*16/2];
 __aligned16 decoder_t decoder;

-__aligned16 u8 _readbits[80];	//local buffer (ring buffer)
-u8* readbits = _readbits;		// always can decrement by one 1qw
-
 __fi void IPUProcessInterrupt()
 {
 	if (ipuRegs.ctrl.BUSY && g_BP.IFC) IPUWorker();
@ -96,8 +93,6 @@ void ReportIPU()
 	Console.WriteLn("g_decoder = 0x%x.", &decoder);
 	Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
 	Console.WriteLn(ipu_cmd.desc());
-	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
-		_readbits, readbits - _readbits);
 	Console.Newline();
 }

@ -114,15 +109,6 @@ void SaveStateBase::ipuFreeze()
 	Freeze(coded_block_pattern);
 	Freeze(decoder);
 	Freeze(ipu_cmd);
-	Freeze(_readbits);
-
-	int temp = readbits - _readbits;
-	Freeze(temp);
-
-	if (IsLoading())
-	{
-		readbits = _readbits;
-	}
 }

 void tIPU_CMD_IDEC::log() const
@ -213,21 +199,27 @@ __fi u32 ipuRead32(u32 mem)
 	switch (mem)
 	{
 		ipucase(IPU_CTRL): // IPU_CTRL
+		{
 			ipuRegs.ctrl.IFC = g_BP.IFC;
 			ipuRegs.ctrl.CBP = coded_block_pattern;

 			if (!ipuRegs.ctrl.BUSY)
 				IPU_LOG("read32: IPU_CTRL=0x%08X", ipuRegs.ctrl._u32);

-		return ipuRegs.ctrl._u32;
+			return ipuRegs.ctrl._u32;
+		}		

 		ipucase(IPU_BP): // IPU_BP
+		{
+			pxAssume(g_BP.FP <= 2);
+			
 			ipuRegs.ipubp = g_BP.BP & 0x7f;
 			ipuRegs.ipubp |= g_BP.IFC << 8;
-			ipuRegs.ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
+			ipuRegs.ipubp |= g_BP.FP << 16;

 			IPU_LOG("read32: IPU_BP=0x%08X", ipuRegs.ipubp);
-		return ipuRegs.ipubp;
+			return ipuRegs.ipubp;
+		}

 		default:
 			IPU_LOG("read32: Addr=0x%08X Value = 0x%08X", mem, psHu32(IPU_CMD + mem));
@ -283,9 +275,7 @@ void ipuSoftReset()
 	ipu_cmd.clear();
 	ipuRegs.cmd.BUSY = 0;

-	g_BP.BP = 0;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+	memzero(g_BP);
 }

 __fi bool ipuWrite32(u32 mem, u32 value)
@ -354,12 +344,11 @@ static void ipuBCLR(u32 val)
 {
 	ipu_fifo.in.clear();

+	memzero(g_BP);
 	g_BP.BP = val & 0x7F;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+
 	ipuRegs.ctrl.BUSY = 0;
 	ipuRegs.cmd.BUSY = 0;
-	memzero(_readbits);
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }

@ -370,7 +359,7 @@ static bool ipuIDEC(u32 val, bool resume)
 	if (!resume)
 	{
 		idec.log();
-		g_BP.BP += idec.FB;//skip FB bits
+		g_BP.Advance(idec.FB);

 	//from IPU_CTRL
 		ipuRegs.ctrl.PCT = I_TYPE; //Intra DECoding;)
@ -407,7 +396,7 @@ static __fi bool ipuBDEC(u32 val, bool resume)
 		bdec.log(s_bdec);
 		if (IsDebugBuild) s_bdec++;

-	g_BP.BP += bdec.FB;//skip FB bits
+		g_BP.Advance(bdec.FB);
 		decoder.coding_type			= I_TYPE;
 		decoder.mpeg1				= ipuRegs.ctrl.MP1;
 		decoder.q_scale_type		= ipuRegs.ctrl.QST;
@ -433,11 +422,7 @@ static bool __fastcall ipuVDEC(u32 val)
 	switch (ipu_cmd.pos[0])
 	{
 		case 0:
-			ipuRegs.cmd.DATA = 0;
-			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return false;
-
-			decoder.bitstream_bits = -16;
-			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
+			if (!bitstream_init()) return false;

 			switch ((val >> 26) & 3)
 			{
@ -459,17 +444,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				case 3://DMVector
 					ipuRegs.cmd.DATA = get_dmv();
 					break;
+
+				jNO_DEFAULT
 			}

-			g_BP.BP += (int)decoder.bitstream_bits + 16;
+			ipuRegs.cmd.DATA &= 0xFFFF;
+			ipuRegs.cmd.DATA |= 0x10000;

-			if ((int)g_BP.BP < 0)
-			{
-				g_BP.BP += 128;
-				ReorderBitstream();
-			}
-
-			ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
+			//ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);

 		case 1:
@ -479,14 +461,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				return false;
 			}

-			BigEndian(ipuRegs.top, ipuRegs.top);
+			ipuRegs.top = BigEndian(ipuRegs.top);

 			IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
 			        ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
 			        ((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
 			return true;

-			jNO_DEFAULT
+		jNO_DEFAULT
 	}

 	return false;
@ -496,7 +478,7 @@ static __fi bool ipuFDEC(u32 val)
 {
 	if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false;

-	BigEndian(ipuRegs.cmd.DATA, ipuRegs.cmd.DATA);
+	ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA);
 	ipuRegs.top = ipuRegs.cmd.DATA;

 	IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
@ -553,11 +535,10 @@ static bool ipuSETVQ(u32 val)
 		if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false;
 	}

-	IPU_LOG("SETVQ command.\nRead VQCLUT table from FIFO.");
-	IPU_LOG(
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	IPU_LOG("SETVQ command.   Read VQCLUT table from FIFO.\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
 	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
 	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
 	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
@ -723,148 +704,48 @@ __fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
 	Console.Error("IPU: VQ not implemented");
 }

-__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
-{
-	const u8	*s = (const u8*)&mb8;
-	s16	*d = (s16*)&mb16;
-	int i;
-	for (i = 0; i < 256; i++) *d++ = *s++;		//Y  bias	- 16
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cr bias	- 128
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cb bias	- 128
-}
-

 // --------------------------------------------------------------------------------------
 //  Buffer reader
 // --------------------------------------------------------------------------------------

-// move the readbits queue
-__fi void inc_readbits()
+__ri u32 UBITS(uint bits)
 {
-	readbits += 16;
-	if (readbits >= _readbits + 64)
-	{
-		// move back
-		*(u64*)(_readbits) = *(u64*)(_readbits + 64);
-		*(u64*)(_readbits + 8) = *(u64*)(_readbits + 72);
-		readbits = _readbits;
-	}
+	uint readpos8 = g_BP.BP/8;
+
+	uint result = BigEndian(*(u32*)( (u8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
+
+	return result;
 }

-// returns the pointer of readbits moved by 1 qword
-__fi u8* next_readbits()
+__ri s32 SBITS(uint bits)
 {
-	return readbits + 16;
-}
+	// Read an unaligned 32 bit value and then shift the bits up and then back down.

-// returns the pointer of readbits moved by 1 qword
-u8* prev_readbits()
-{
-	if (readbits < _readbits + 16) return _readbits + 48 - (readbits - _readbits);
+	uint readpos8 = g_BP.BP/8;

-	return readbits - 16;
-}
+	int result = BigEndian(*(s32*)( (s8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);

-void ReorderBitstream()
-{
-	readbits = prev_readbits();
-	g_BP.FP = 2;
-}
-
-// IPU has a 2qword internal buffer whose status is pointed by FP.
-// If FP is 1, there's 1 qword in buffer. Second qword is only loaded
-// incase there are less than 32bits available in the first qword.
-// \return Number of bits available (clamps at 16 bits)
-u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
-{
-	if (g_BP.FP == 0)
-	{
-		if (ipu_fifo.in.read(next_readbits()) == 0) return 0;
-
-		inc_readbits();
-		g_BP.FP = 1;
-	}
-
-	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
-	{
-		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
-	}
-
-	if (*(int*)pointer >= 128)
-	{
-		pxAssert(g_BP.FP >= 1);
-
-		if (g_BP.FP > 1) inc_readbits();
-
-		if (advance)
-		{
-			g_BP.FP--;
-			*pointer &= 127;
-		}
-	}
-
-	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
+	return result;
 }

 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits128(u8 *address, u32 advance)
+u8 getBits64(u8 *address, bool advance)
 {
-	u64 mask2;
-	u128 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(64)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask2 = 0xff >> shift;
-		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
-		mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);		
-
-		u128 notMask;
-		u128 data = *(u128*)(readpos + 1);
-		notMask.lo = ~mask.lo & data.lo;
-		notMask.hi = ~mask.hi & data.hi;
-		notMask.lo >>= 8 - shift;
-		notMask.lo |= (notMask.hi & (ULLONG_MAX >> (64 - shift))) << (64 - shift);
-		notMask.hi >>= 8 - shift;
-
-		mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift));
-		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
-		
-		notMask.lo |= mask.lo;
-		notMask.hi |= mask.hi;
-		*(u128*)address = notMask;
-	}
-	else
-	{
-		*(u128*)address = *(u128*)readpos;
-	}
-
-	if (advance) g_BP.BP += 128;
-
-	return 1;
-}
-
-// whenever reading fractions of bytes. The low bits always come from the next byte
-// while the high bits come from the current byte
-u8 __fastcall getBits64(u8 *address, u32 advance)
-{
-	register u64 mask = 0;
-	u8* readpos;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
-	{
-		mask = (0xff >> shift);
+		u64 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);

 		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
@ -874,55 +755,47 @@ u8 __fastcall getBits64(u8 *address, u32 advance)
 		*(u64*)address = *(u64*)readpos;
 	}

-	if (advance) g_BP.BP += 64;
+	if (advance) g_BP.Advance(64);

 	return 1;
 }

 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits32(u8 *address, u32 advance)
+__fi u8 getBits32(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(32)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 32) < 32) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
+	const u8* readpos = &g_BP.internal_qwc->_u8[g_BP.BP/8];
+	
+	if(uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		u32 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24);

 		*(u32*)address = ((~mask & *(u32*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u32*)readpos) << shift);
 	}
 	else
 	{
+		// Bit position-aligned -- no masking/shifting necessary
 		*(u32*)address = *(u32*)readpos;
 	}

-	if (advance) g_BP.BP += 32;
+	if (advance) g_BP.Advance(32);

 	return 1;
 }

-__fi u8 __fastcall getBits16(u8 *address, u32 advance)
+__fi u8 getBits16(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(16)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 16) < 16) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		mask = mask | (mask << 8);
-
 		*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
 	}
 	else
@ -930,25 +803,20 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance)
 		*(u16*)address = *(u16*)readpos;
 	}

-	if (advance) g_BP.BP += 16;
+	if (advance) g_BP.Advance(16);

 	return 1;
 }

-u8 __fastcall getBits8(u8 *address, u32 advance)
+u8 getBits8(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(8)) return 0;

-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8)
-		return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];

 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
 	}
 	else
@ -956,7 +824,7 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
 		*(u8*)address = *(u8*)readpos;
 	}

-	if (advance) g_BP.BP += 8;
+	if (advance) g_BP.Advance(8);

 	return 1;
 }
@ -983,7 +851,7 @@ void IPUCMD_WRITE(u32 val)

 		case SCE_IPU_VDEC:

-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);

 			// check if enough data in queue
 			if (ipuVDEC(val)) return;
@ -993,9 +861,11 @@ void IPUCMD_WRITE(u32 val)
 			break;

 		case SCE_IPU_FDEC:
-			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, FP %d, CHCR 0x%x",
-			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, g_BP.FP, ipu1dma.chcr._u32);
-			g_BP.BP += val & 0x3F;
+			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, CHCR 0x%x",
+			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, ipu1dma.chcr._u32);
+
+			g_BP.Advance(val & 0x3F);
+
 			if (ipuFDEC(val)) return;
 			ipuRegs.cmd.BUSY = 0x80000000;
 			ipuRegs.topbusy = 0x80000000;
@ -1009,7 +879,7 @@ void IPUCMD_WRITE(u32 val)
 		case SCE_IPU_SETIQ:
 			IPU_LOG("SETIQ command.");
 			if (val & 0x3f) IPU_LOG("Skip %d bits.", val & 0x3f);
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 			if (ipuSETIQ(val)) return;
 			break;

--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@ -67,11 +67,66 @@ union tIPU_CTRL {
 	void reset() { _u32 = 0; }
 };

-struct tIPU_BP {
-	u32 BP;		// Bit stream point
-	u16 IFC;	// Input FIFO counter
-	u8 FP;		// FIFO point
-	u8 bufferhasnew; // Always 0.
+__aligned16 struct tIPU_BP {
+	__aligned16 u128 internal_qwc[2];
+
+	u32 BP;		// Bit stream point (0 to 128*2)
+	u32 IFC;	// Input FIFO counter (8QWC) (0 to 8)
+	u32 FP;		// internal FIFO (2QWC) fill status (0 to 2)
+
+	__fi void Align()
+	{
+		BP = (BP + 7) & ~7;
+		Advance(0);
+	}
+
+	__fi void Advance(uint bits)
+	{
+		BP += bits;
+		pxAssume( BP <= 256 );
+
+		if (BP > 127)
+		{
+			BP -= 128;
+
+			if (FP == 2)
+			{
+				// When BP reaches 128 or more, we're reading data from the second quadword.  Shift that one
+				// to the front; the freed second slot is refilled later by FillBuffer (a manual ring buffer).
+
+				CopyQWC(&internal_qwc[0], &internal_qwc[1]);
+				FP = 1;
+			}
+			else
+			{
+				// if FP == 1 then the buffer has been completely drained.
+				// if FP == 0 then an already-drained buffer is being advanced.
+				// In either case we just assign FP to 0.
+
+				FP = 0;
+			}
+		}
+	}
+
+	__fi bool FillBuffer(u32 bits)
+	{
+		while (FP < 2)
+		{
+			if (ipu_fifo.in.read(&internal_qwc[FP]) == 0)
+			{
+				// Here we *try* to fill the entire internal QWC buffer; however that may not necessarily
+				// be possible -- so if the fill fails we'll only return 0 if we don't have enough
+				// remaining bits in the FIFO to fill the request.
+
+				return ((FP!=0) && (BP + bits) <= 128);
+			}
+
+			++FP;
+		}
+
+		return true;
+	}
+
 	wxString desc() const
 	{
 		return wxsFormat(L"Ipu BP: bp = 0x%x, IFC = 0x%x, FP = 0x%x.", BP, IFC, FP);
@ -217,10 +272,9 @@ extern void IPUCMD_WRITE(u32 val);
 extern void ipuSoftReset();
 extern void IPUProcessInterrupt();

-extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
-extern u8 __fastcall getBits128(u8 *address, u32 advance);
-extern u8 __fastcall getBits64(u8 *address, u32 advance);
-extern u8 __fastcall getBits32(u8 *address, u32 advance);
-extern u8 __fastcall getBits16(u8 *address, u32 advance);
-extern u8 __fastcall getBits8(u8 *address, u32 advance);
+extern u8 getBits128(u8 *address, bool advance);
+extern u8 getBits64(u8 *address, bool advance);
+extern u8 getBits32(u8 *address, bool advance);
+extern u8 getBits16(u8 *address, bool advance);
+extern u8 getBits8(u8 *address, bool advance);

--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@ -85,14 +85,14 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
 int IPU_Fifo_Input::read(void *value)
 {
 	// wait until enough data to ensure proper streaming.
-	if (g_BP.IFC < 4)
+	if (g_BP.IFC < 3)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
 			CPU_INT( DMAC_TO_IPU, 32 );
 		}
-		
+
 		if (g_BP.IFC == 0) return 0;
 		pxAssert(g_BP.IFC > 0);
 	}
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@ -47,10 +47,14 @@ const int non_linear_quantizer_scale [] =
 	into 1st slot is copied to the 2nd slot. Which will later be copied
 	back to the 1st slot when 128bits have been read.
 */
-extern void ReorderBitstream();
 const DCTtab * tab;
 int mbaCount = 0;

+int bitstream_init ()
+{
+	return g_BP.FillBuffer(32);
+}
+
 int get_macroblock_modes()
 {
 	int macroblock_modes;
@ -221,9 +225,7 @@ int __fi get_motion_delta(const int f_code)

 int __fi get_dmv()
 {
-	const DMVtab * tab;
-
-	tab = DMV_2 + UBITS(2);
+	const DMVtab* tab = DMV_2 + UBITS(2);
 	DUMPBITS(tab->len);
 	return tab->dmv;
 }
@ -239,22 +241,21 @@ int get_macroblock_address_increment()
 	else if (code >= 768)
 		mba = MBA.mba11 + (UBITS(11) - 24);
 	else switch (UBITS(11))
-		{
+	{
+		case 8:		/* macroblock_escape */
+			DUMPBITS(11);
+			return 0x23;

-			case 8:		/* macroblock_escape */
+		case 15:	/* macroblock_stuffing (MPEG1 only) */
+			if (decoder.mpeg1)
+			{
 				DUMPBITS(11);
-				return 0x23;
+				return 0x22;
+			}

-			case 15:	/* macroblock_stuffing (MPEG1 only) */
-				if (decoder.mpeg1)
-				{
-					DUMPBITS(11);
-					return 0x22;
-				}
-
-			default:
-				return 0;//error
-		}
+		default:
+			return 0;//error
+	}

 	DUMPBITS(mba->len);

@ -336,7 +337,7 @@ do {							\
 	val = (((s32)val) >> 31) ^ 2047;			\
 } while (0)

-static __fi bool get_intra_block()
+static bool get_intra_block()
 {
 	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
 	const u8 (&quant_matrix)[64] = decoder.iq;
@ -474,7 +475,8 @@ static __fi bool get_intra_block()
 				}

 				/* if (bitstream_get (1)) val = -val; */
-				val = (val ^ SBITS(1)) - SBITS(1);
+				int bit1 = SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}

@ -489,7 +491,7 @@ static __fi bool get_intra_block()
  return true;
 }

-static __fi bool get_non_intra_block(int * last)
+static bool get_non_intra_block(int * last)
 {
 	int i;
 	int j;
@ -615,8 +617,9 @@ static __fi bool get_non_intra_block(int * last)
 			}
 			else
 			{
+				int bit1 = SBITS(1);
 				val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-				val = (val ^ SBITS(1)) - SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}

@ -683,25 +686,11 @@ void __fi finishmpeg2sliceIDEC()
 {
 	ipuRegs.ctrl.SCD = 0;
 	coded_block_pattern = decoder.coded_block_pattern;
-
-	g_BP.BP += decoder.bitstream_bits - 16;
-
-	if ((int)g_BP.BP < 0)
-	{
-		g_BP.BP = 128 + (int)g_BP.BP;
-
-		// After BP is positioned correctly, we need to reload the old buffer
-		// so that reading may continue properly
-		ReorderBitstream();
-	}
-
-	FillInternalBuffer(&g_BP.BP, 1, 0);
 }

 bool mpeg2sliceIDEC()
 {
 	u16 code;
-	u8 bit8;

 	switch (ipu_cmd.pos[0])
 	{
@ -855,18 +844,18 @@ bool mpeg2sliceIDEC()
 					}
 					else switch (UBITS(11))
 					{
-							case 8:		/* macroblock_escape */
-								mbaCount += 33;
-								/* pass through */
+						case 8:		/* macroblock_escape */
+							mbaCount += 33;
+							/* pass through */

-							case 15:	/* macroblock_stuffing (MPEG1 only) */
-								DUMPBITS(11);
-								continue;
+						case 15:	/* macroblock_stuffing (MPEG1 only) */
+							DUMPBITS(11);
+							continue;

-							default:	/* end of slice/frame, or error? */
-							{
-								goto finish_idec;	
-							}
+						default:	/* end of slice/frame, or error? */
+						{
+							goto finish_idec;	
+						}
 					}
 				}

@ -897,12 +886,13 @@ bool mpeg2sliceIDEC()
 			ipu_cmd.pos[1] = 0;
 			ipu_cmd.pos[2] = 0;
 		}
-		
+
 finish_idec:
 		finishmpeg2sliceIDEC();

 	case 3:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 3;
@ -911,10 +901,10 @@ finish_idec:

 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
+	}

 	case 4:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
@ -923,7 +913,7 @@ finish_idec:
 			return false;
 		}

-		BigEndian(ipuRegs.top, ipuRegs.top);
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;

 	jNO_DEFAULT;
@ -935,7 +925,6 @@ finish_idec:
 bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
-	u8 bit8;

 	macroblock_8& mb8 = decoder.mb8;
 	macroblock_16& mb16 = decoder.mb16;
@ -1022,7 +1011,31 @@ bool mpeg2_slice()
 			jNO_DEFAULT;
 			}

-			ipu_copy(mb8, mb16);
+			// Copy macroblock8 to macroblock16 - without sign extension.
+			// Manually inlined due to MSVC refusing to inline the SSE-optimized version.
+			{
+				const u8	*s = (const u8*)&mb8;
+				u16			*d = (u16*)&mb16;
+
+				//Y  bias	- 16 * 16
+				//Cr bias	- 8 * 8
+				//Cb bias	- 8 * 8
+
+				__m128i zeroreg = _mm_setzero_si128();
+
+				for (uint i = 0; i < (256+64+64) / 32; ++i)
+				{
+					//*d++ = *s++;
+					__m128i woot1 = _mm_load_si128((__m128i*)s);
+					__m128i woot2 = _mm_load_si128((__m128i*)s+1);
+					_mm_store_si128((__m128i*)d,	_mm_unpacklo_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+1,	_mm_unpackhi_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+2,	_mm_unpacklo_epi8(woot2, zeroreg));
+					_mm_store_si128((__m128i*)d+3,	_mm_unpackhi_epi8(woot2, zeroreg));
+					s += 32;
+					d += 32;
+				}
+			}
 		}
 		else
 		{
@ -1096,18 +1109,6 @@ bool mpeg2_slice()
 		// Send The MacroBlock via DmaIpuFrom
 		ipuRegs.ctrl.SCD = 0;
 		coded_block_pattern = decoder.coded_block_pattern;
-		g_BP.BP += (int)decoder.bitstream_bits - 16;
-
-		// BP goes from 0 to 128, so negative values mean to read old buffer
-		// so we minus from 128 to get the correct BP
-		if ((int)g_BP.BP < 0)
-		{
-			g_BP.BP = 128 + (int)g_BP.BP;
-
-			// After BP is positioned correctly, we need to reload the old buffer
-			// so that reading may continue properly
-			ReorderBitstream();
-		}

 		decoder.mbc = 1;
 		decoder.SetOutputTo(mb16);
@ -1131,7 +1132,8 @@ bool mpeg2_slice()
 	}
 	
 	case 4:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 4;
@ -1140,11 +1142,11 @@ bool mpeg2_slice()

 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
-
+	}
+	
 	case 5:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
 		{
@ -1152,8 +1154,7 @@ bool mpeg2_slice()
 			return false;
 		}

-		BigEndian(ipuRegs.top, ipuRegs.top);
-		decoder.bitstream_bits = 0;
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
 	}

--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@ -152,8 +152,8 @@ struct decoder_t {
 	uint ipu0_idx;

 	/* bit parsing stuff */
-	u32 bitstream_buf;		/* current 32 bit working set */
-	int bitstream_bits;			/* used bits in working set */
+	//u32 bitstream_buf;		/* current 32 bit working set */
+	//int bitstream_bits;			/* used bits in working set */

 	int quantizer_scale;	/* remove */
 	int dmv_offset;		/* remove */
@ -241,6 +241,10 @@ struct mpeg2_scan_pack
 	mpeg2_scan_pack();
 };

+extern int bitstream_init ();
+extern u32 UBITS(uint bits);
+extern s32 SBITS(uint bits);
+
 extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
 extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);

@ -258,20 +262,19 @@ extern int get_dmv();
 extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
 extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
 extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
-extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);

 extern int slice (u8 * buffer);

 #ifdef _MSC_VER
-#define BigEndian(out, in) out = _byteswap_ulong(in)
+#define BigEndian(in) _byteswap_ulong(in)
 #else
-#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
+#define BigEndian(in) __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif

 #ifdef _MSC_VER
-#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#define BigEndian64(in) _byteswap_uint64(in)
 #else
-#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#define BigEndian64(in) __builtin_bswap64(in) // or we could use the asm function bswap...
 #endif

 extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@ -30,64 +30,24 @@
 #ifndef __VLC_H__
 #define __VLC_H__

-//static u8 word[4];
-//static u8 dword[8];
-//static u8 qword[16];
-
 static __fi int GETWORD()
 {
-	if (decoder.bitstream_bits <= 0) return 1;
-
-	static u8 data[2];
-	
-	if(!getBits16(data,1))
-	{
-		return 0;
-	}
-	
-	/*u32 data;
-	BigEndian(data, *(u32*)word);
-	decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
-	decoder.bitstream_bits -= 32;*/
-	decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
-	decoder.bitstream_bits -= 16;
-
-	return 1;
+	return g_BP.FillBuffer(16);
 }

-static __fi int bitstream_init ()
+// Removes bits from the bitstream.  This is done independently of UBITS/SBITS because a
+// lot of MPEG stream parsing has to read ahead, rewind, and re-read bits at a different
+// bit depth or signedness.
+static __fi void DUMPBITS(uint num)
 {
-	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
-	{
-		return 0;
-	}
-
-	decoder.bitstream_bits = -16;
-	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
-	/*decoder.bitstream_buf = *(u64*)dword;
-	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/
-
-	return 1;
+	g_BP.Advance(num);
+	//pxAssume(g_BP.FP != 0);
 }

-/* remove num valid bits from bit_buf */
-static __fi void DUMPBITS(int num)
+static __fi u32 GETBITS(uint num)
 {
-	decoder.bitstream_buf <<= num;
-    decoder.bitstream_bits += num;
-}
-
-/* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* Get bits from bitstream */
-static __fi u32 GETBITS(int num)
-{
-	u16 retVal = UBITS(num);
-	DUMPBITS(num);
+	uint retVal = UBITS(num);
+	g_BP.Advance(num);

 	return retVal;
 }
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@ -24,7 +24,7 @@
 //  the lower 16 bit value.  IF the change is breaking of all compatibility with old
 //  states, increment the upper 16 bit value, and clear the lower 16 bits to 0.

-static const u32 g_SaveVersion = 0x8b4a0000;
+static const u32 g_SaveVersion = 0x8b4b0000;

 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)