diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp
index 55f988dcc3..5e19cd9124 100644
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@@ -55,9 +55,6 @@ int coded_block_pattern = 0;
 u8 indx4[16*16/2];
 __aligned16 decoder_t decoder;
 
-__aligned16 u8 _readbits[80];	//local buffer (ring buffer)
-u8* readbits = _readbits;		// always can decrement by one 1qw
-
 __fi void IPUProcessInterrupt()
 {
 	if (ipuRegs.ctrl.BUSY && g_BP.IFC) IPUWorker();
@@ -96,8 +93,6 @@ void ReportIPU()
 	Console.WriteLn("g_decoder = 0x%x.", &decoder);
 	Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
 	Console.WriteLn(ipu_cmd.desc());
-	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
-		_readbits, readbits - _readbits);
 	Console.Newline();
 }
 
@@ -114,15 +109,6 @@ void SaveStateBase::ipuFreeze()
 	Freeze(coded_block_pattern);
 	Freeze(decoder);
 	Freeze(ipu_cmd);
-	Freeze(_readbits);
-
-	int temp = readbits - _readbits;
-	Freeze(temp);
-
-	if (IsLoading())
-	{
-		readbits = _readbits;
-	}
 }
 
 void tIPU_CMD_IDEC::log() const
@@ -213,21 +199,27 @@ __fi u32 ipuRead32(u32 mem)
 	switch (mem)
 	{
 		ipucase(IPU_CTRL): // IPU_CTRL
+		{
 			ipuRegs.ctrl.IFC = g_BP.IFC;
 			ipuRegs.ctrl.CBP = coded_block_pattern;
 
 			if (!ipuRegs.ctrl.BUSY)
 				IPU_LOG("read32: IPU_CTRL=0x%08X", ipuRegs.ctrl._u32);
 
-		return ipuRegs.ctrl._u32;
+			return ipuRegs.ctrl._u32;
+		}		
 
 		ipucase(IPU_BP): // IPU_BP
+		{
+			pxAssume(g_BP.FP <= 2);
+			
 			ipuRegs.ipubp = g_BP.BP & 0x7f;
 			ipuRegs.ipubp |= g_BP.IFC << 8;
-			ipuRegs.ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
+			ipuRegs.ipubp |= g_BP.FP << 16;
 
 			IPU_LOG("read32: IPU_BP=0x%08X", ipuRegs.ipubp);
-		return ipuRegs.ipubp;
+			return ipuRegs.ipubp;
+		}
 
 		default:
 			IPU_LOG("read32: Addr=0x%08X Value = 0x%08X", mem, psHu32(IPU_CMD + mem));
@@ -283,9 +275,7 @@ void ipuSoftReset()
 	ipu_cmd.clear();
 	ipuRegs.cmd.BUSY = 0;
 
-	g_BP.BP = 0;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+	memzero(g_BP);
 }
 
 __fi bool ipuWrite32(u32 mem, u32 value)
@@ -354,12 +344,11 @@ static void ipuBCLR(u32 val)
 {
 	ipu_fifo.in.clear();
 
+	memzero(g_BP);
 	g_BP.BP = val & 0x7F;
-	g_BP.FP = 0;
-	//g_BP.bufferhasnew = 0;
+
 	ipuRegs.ctrl.BUSY = 0;
 	ipuRegs.cmd.BUSY = 0;
-	memzero(_readbits);
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }
 
@@ -370,7 +359,7 @@ static bool ipuIDEC(u32 val, bool resume)
 	if (!resume)
 	{
 		idec.log();
-		g_BP.BP += idec.FB;//skip FB bits
+		g_BP.Advance(idec.FB);
 
 	//from IPU_CTRL
 		ipuRegs.ctrl.PCT = I_TYPE; //Intra DECoding;)
@@ -407,7 +396,7 @@ static __fi bool ipuBDEC(u32 val, bool resume)
 		bdec.log(s_bdec);
 		if (IsDebugBuild) s_bdec++;
 
-	g_BP.BP += bdec.FB;//skip FB bits
+		g_BP.Advance(bdec.FB);
 		decoder.coding_type			= I_TYPE;
 		decoder.mpeg1				= ipuRegs.ctrl.MP1;
 		decoder.q_scale_type		= ipuRegs.ctrl.QST;
@@ -433,11 +422,7 @@ static bool __fastcall ipuVDEC(u32 val)
 	switch (ipu_cmd.pos[0])
 	{
 		case 0:
-			ipuRegs.cmd.DATA = 0;
-			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return false;
-
-			decoder.bitstream_bits = -16;
-			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
+			if (!bitstream_init()) return false;
 
 			switch ((val >> 26) & 3)
 			{
@@ -459,17 +444,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				case 3://DMVector
 					ipuRegs.cmd.DATA = get_dmv();
 					break;
+
+				jNO_DEFAULT
 			}
 
-			g_BP.BP += (int)decoder.bitstream_bits + 16;
+			ipuRegs.cmd.DATA &= 0xFFFF;
+			ipuRegs.cmd.DATA |= 0x10000;
 
-			if ((int)g_BP.BP < 0)
-			{
-				g_BP.BP += 128;
-				ReorderBitstream();
-			}
-
-			ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
+			//ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0);
 
 		case 1:
@@ -479,14 +461,14 @@ static bool __fastcall ipuVDEC(u32 val)
 				return false;
 			}
 
-			BigEndian(ipuRegs.top, ipuRegs.top);
+			ipuRegs.top = BigEndian(ipuRegs.top);
 
 			IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d",
 			        ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ?
 			        ((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT);
 			return true;
 
-			jNO_DEFAULT
+		jNO_DEFAULT
 	}
 
 	return false;
@@ -496,7 +478,7 @@ static __fi bool ipuFDEC(u32 val)
 {
 	if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false;
 
-	BigEndian(ipuRegs.cmd.DATA, ipuRegs.cmd.DATA);
+	ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA);
 	ipuRegs.top = ipuRegs.cmd.DATA;
 
 	IPU_LOG("FDEC read: 0x%08x", ipuRegs.top);
@@ -553,11 +535,10 @@ static bool ipuSETVQ(u32 val)
 		if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false;
 	}
 
-	IPU_LOG("SETVQ command.\nRead VQCLUT table from FIFO.");
-	IPU_LOG(
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	IPU_LOG("SETVQ command.   Read VQCLUT table from FIFO.\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n"
 	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
 	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
 	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
@@ -723,148 +704,48 @@ __fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4)
 	Console.Error("IPU: VQ not implemented");
 }
 
-__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16)
-{
-	const u8	*s = (const u8*)&mb8;
-	s16	*d = (s16*)&mb16;
-	int i;
-	for (i = 0; i < 256; i++) *d++ = *s++;		//Y  bias	- 16
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cr bias	- 128
-	for (i = 0; i < 64; i++) *d++ = *s++;		//Cb bias	- 128
-}
-
 
 // --------------------------------------------------------------------------------------
 //  Buffer reader
 // --------------------------------------------------------------------------------------
 
-// move the readbits queue
-__fi void inc_readbits()
+__ri u32 UBITS(uint bits)
 {
-	readbits += 16;
-	if (readbits >= _readbits + 64)
-	{
-		// move back
-		*(u64*)(_readbits) = *(u64*)(_readbits + 64);
-		*(u64*)(_readbits + 8) = *(u64*)(_readbits + 72);
-		readbits = _readbits;
-	}
+	uint readpos8 = g_BP.BP/8;
+	// Load 4 bytes starting at the byte that contains BP and byte-swap them into one 32-bit word.
+	uint result = BigEndian(*(u32*)( (u8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
+	// NOTE(review): only 32-(BP&7) bits survive the left shift, and bits==0 would shift by 32 -- confirm callers stay in range.
+	return result;
 }
 
-// returns the pointer of readbits moved by 1 qword
-__fi u8* next_readbits()
+__ri s32 SBITS(uint bits)
 {
-	return readbits + 16;
-}
+	// Read an unaligned 32 bit value, shift left to drop the (BP & 7) bits already consumed, then shift back down so only the requested bits remain.
 
-// returns the pointer of readbits moved by 1 qword
-u8* prev_readbits()
-{
-	if (readbits < _readbits + 16) return _readbits + 48 - (readbits - _readbits);
+	uint readpos8 = g_BP.BP/8;
 
-	return readbits - 16;
-}
+	int result = BigEndian(*(s32*)( (s8*)g_BP.internal_qwc + readpos8 ));
+	uint bp7 = (g_BP.BP & 7);
+	result <<= bp7;
+	result >>= (32 - bits);
 
-void ReorderBitstream()
-{
-	readbits = prev_readbits();
-	g_BP.FP = 2;
-}
-
-// IPU has a 2qword internal buffer whose status is pointed by FP.
-// If FP is 1, there's 1 qword in buffer. Second qword is only loaded
-// incase there are less than 32bits available in the first qword.
-// \return Number of bits available (clamps at 16 bits)
-u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
-{
-	if (g_BP.FP == 0)
-	{
-		if (ipu_fifo.in.read(next_readbits()) == 0) return 0;
-
-		inc_readbits();
-		g_BP.FP = 1;
-	}
-
-	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
-	{
-		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
-	}
-
-	if (*(int*)pointer >= 128)
-	{
-		pxAssert(g_BP.FP >= 1);
-
-		if (g_BP.FP > 1) inc_readbits();
-
-		if (advance)
-		{
-			g_BP.FP--;
-			*pointer &= 127;
-		}
-	}
-
-	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
+	return result;
 }
 
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits128(u8 *address, u32 advance)
+u8 getBits64(u8 *address, bool advance)
 {
-	u64 mask2;
-	u128 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(64)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask2 = 0xff >> shift;
-		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
-		mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);		
-
-		u128 notMask;
-		u128 data = *(u128*)(readpos + 1);
-		notMask.lo = ~mask.lo & data.lo;
-		notMask.hi = ~mask.hi & data.hi;
-		notMask.lo >>= 8 - shift;
-		notMask.lo |= (notMask.hi & (ULLONG_MAX >> (64 - shift))) << (64 - shift);
-		notMask.hi >>= 8 - shift;
-
-		mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift));
-		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
-		
-		notMask.lo |= mask.lo;
-		notMask.hi |= mask.hi;
-		*(u128*)address = notMask;
-	}
-	else
-	{
-		*(u128*)address = *(u128*)readpos;
-	}
-
-	if (advance) g_BP.BP += 128;
-
-	return 1;
-}
-
-// whenever reading fractions of bytes. The low bits always come from the next byte
-// while the high bits come from the current byte
-u8 __fastcall getBits64(u8 *address, u32 advance)
-{
-	register u64 mask = 0;
-	u8* readpos;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
-	{
-		mask = (0xff >> shift);
+		u64 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);
 
 		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
@@ -874,55 +755,47 @@ u8 __fastcall getBits64(u8 *address, u32 advance)
 		*(u64*)address = *(u64*)readpos;
 	}
 
-	if (advance) g_BP.BP += 64;
+	if (advance) g_BP.Advance(64);
 
 	return 1;
 }
 
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
-u8 __fastcall getBits32(u8 *address, u32 advance)
+__fi u8 getBits32(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(32)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 32) < 32) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
-
-	if (uint shift = (g_BP.BP & 7))
+	const u8* readpos = &g_BP.internal_qwc->_u8[g_BP.BP/8];
+	
+	if(uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		u32 mask = (0xff >> shift);
 		mask = mask | (mask << 8) | (mask << 16) | (mask << 24);
 
 		*(u32*)address = ((~mask & *(u32*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u32*)readpos) << shift);
 	}
 	else
 	{
+		// Bit position-aligned -- no masking/shifting necessary
 		*(u32*)address = *(u32*)readpos;
 	}
 
-	if (advance) g_BP.BP += 32;
+	if (advance) g_BP.Advance(32);
 
 	return 1;
 }
 
-__fi u8 __fastcall getBits16(u8 *address, u32 advance)
+__fi u8 getBits16(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(16)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 16) < 16) return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		mask = mask | (mask << 8);
-
 		*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
 	}
 	else
@@ -930,25 +803,20 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance)
 		*(u16*)address = *(u16*)readpos;
 	}
 
-	if (advance) g_BP.BP += 16;
+	if (advance) g_BP.Advance(16);
 
 	return 1;
 }
 
-u8 __fastcall getBits8(u8 *address, u32 advance)
+u8 getBits8(u8 *address, bool advance)
 {
-	u32 mask;
-	u8* readpos;
+	if (!g_BP.FillBuffer(8)) return 0;
 
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8)
-		return 0;
-
-	readpos = readbits + (int)g_BP.BP / 8;
+	const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8];
 
 	if (uint shift = (g_BP.BP & 7))
 	{
-		mask = (0xff >> shift);
+		uint mask = (0xff >> shift);
 		*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
 	}
 	else
@@ -956,7 +824,7 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
 		*(u8*)address = *(u8*)readpos;
 	}
 
-	if (advance) g_BP.BP += 8;
+	if (advance) g_BP.Advance(8);
 
 	return 1;
 }
@@ -983,7 +851,7 @@ void IPUCMD_WRITE(u32 val)
 
 		case SCE_IPU_VDEC:
 
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 
 			// check if enough data in queue
 			if (ipuVDEC(val)) return;
@@ -993,9 +861,11 @@ void IPUCMD_WRITE(u32 val)
 			break;
 
 		case SCE_IPU_FDEC:
-			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, FP %d, CHCR 0x%x",
-			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, g_BP.FP, ipu1dma.chcr._u32);
-			g_BP.BP += val & 0x3F;
+			IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, CHCR 0x%x",
+			        val & 0x3f, g_BP.IFC, (int)g_BP.BP, ipu1dma.chcr._u32);
+
+			g_BP.Advance(val & 0x3F);
+
 			if (ipuFDEC(val)) return;
 			ipuRegs.cmd.BUSY = 0x80000000;
 			ipuRegs.topbusy = 0x80000000;
@@ -1009,7 +879,7 @@ void IPUCMD_WRITE(u32 val)
 		case SCE_IPU_SETIQ:
 			IPU_LOG("SETIQ command.");
 			if (val & 0x3f) IPU_LOG("Skip %d bits.", val & 0x3f);
-			g_BP.BP += val & 0x3F;
+			g_BP.Advance(val & 0x3F);
 			if (ipuSETIQ(val)) return;
 			break;
 
diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h
index e33c211b3e..a719e07474 100644
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@@ -67,11 +67,66 @@ union tIPU_CTRL {
 	void reset() { _u32 = 0; }
 };
 
-struct tIPU_BP {
-	u32 BP;		// Bit stream point
-	u16 IFC;	// Input FIFO counter
-	u8 FP;		// FIFO point
-	u8 bufferhasnew; // Always 0.
+__aligned16 struct tIPU_BP {
+	__aligned16 u128 internal_qwc[2];
+
+	u32 BP;		// Bit stream point (0 to 128*2)
+	u32 IFC;	// Input FIFO counter (8QWC) (0 to 8)
+	u32 FP;		// internal FIFO (2QWC) fill status (0 to 2)
+
+	__fi void Align()
+	{
+		BP = (BP + 7) & ~7;	// round BP up to the next multiple of 8 bits
+		Advance(0);			// re-run the wrap check in case rounding pushed BP up to 128
+	}
+
+	__fi void Advance(uint bits)
+	{
+		BP += bits;
+		pxAssume( BP <= 256 );	// NOTE(review): the wrap below subtracts only one quadword, so BP == 256 would end as 128
+		// Wrap once BP has moved past the first 128-bit quadword.
+		if (BP > 127)
+		{
+			BP -= 128;
+
+			if (FP == 2)
+			{
+				// BP reached 128 or more, so reading has moved into the second quadword.  Shift that one
+				// to the front; FP drops to 1 so FillBuffer reloads the second slot (a manual ring buffer).
+
+				CopyQWC(&internal_qwc[0], &internal_qwc[1]);
+				FP = 1;
+			}
+			else
+			{
+				// if FP == 1 then the buffer has been completely drained.
+				// if FP == 0 then an already-drained buffer is being advanced.
+				// In either case we just assign FP to 0.
+
+				FP = 0;
+			}
+		}
+	}
+
+	__fi bool FillBuffer(u32 bits)
+	{
+		while (FP < 2)
+		{
+			if (ipu_fifo.in.read(&internal_qwc[FP]) == 0)
+			{
+				// Here we *try* to fill the entire internal QWC buffer; however that may not necessarily
+				// be possible -- so if the FIFO read fails we only return false when the first internal
+				// quadword is missing (FP == 0) or holds fewer than 'bits' bits past BP.
+
+				return ((FP!=0) && (BP + bits) <= 128);
+			}
+
+			++FP;
+		}
+		// Both internal quadwords are now loaded.
+		return true;
+	}
+
 	wxString desc() const
 	{
 		return wxsFormat(L"Ipu BP: bp = 0x%x, IFC = 0x%x, FP = 0x%x.", BP, IFC, FP);
@@ -217,10 +272,9 @@ extern void IPUCMD_WRITE(u32 val);
 extern void ipuSoftReset();
 extern void IPUProcessInterrupt();
 
-extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
-extern u8 __fastcall getBits128(u8 *address, u32 advance);
-extern u8 __fastcall getBits64(u8 *address, u32 advance);
-extern u8 __fastcall getBits32(u8 *address, u32 advance);
-extern u8 __fastcall getBits16(u8 *address, u32 advance);
-extern u8 __fastcall getBits8(u8 *address, u32 advance);
+// Each reader copies N bits at the current bit position to address (returns 0 if too few bits are buffered) and consumes them only when advance is set.
+extern u8 getBits64(u8 *address, bool advance);
+extern u8 getBits32(u8 *address, bool advance);
+extern u8 getBits16(u8 *address, bool advance);
+extern u8 getBits8(u8 *address, bool advance);
 
diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp
index 4b749ae339..25b0aad6f5 100644
--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@@ -85,14 +85,14 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
 int IPU_Fifo_Input::read(void *value)
 {
 	// wait until enough data to ensure proper streaming.
-	if (g_BP.IFC < 4)
+	if (g_BP.IFC < 3)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
 			CPU_INT( DMAC_TO_IPU, 32 );
 		}
-		
+
 		if (g_BP.IFC == 0) return 0;
 		pxAssert(g_BP.IFC > 0);
 	}
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
index f222ab68c3..bdef4b2d49 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@@ -47,10 +47,14 @@ const int non_linear_quantizer_scale [] =
 	into 1st slot is copied to the 2nd slot. Which will later be copied
 	back to the 1st slot when 128bits have been read.
 */
-extern void ReorderBitstream();
 const DCTtab * tab;
 int mbaCount = 0;
 
+int bitstream_init ()
+{
+	return g_BP.FillBuffer(32);	// nonzero once 32 bits are readable at BP; no bits are consumed here
+}
+
 int get_macroblock_modes()
 {
 	int macroblock_modes;
@@ -221,9 +225,7 @@ int __fi get_motion_delta(const int f_code)
 
 int __fi get_dmv()
 {
-	const DMVtab * tab;
-
-	tab = DMV_2 + UBITS(2);
+	const DMVtab* tab = DMV_2 + UBITS(2);
 	DUMPBITS(tab->len);
 	return tab->dmv;
 }
@@ -239,22 +241,21 @@ int get_macroblock_address_increment()
 	else if (code >= 768)
 		mba = MBA.mba11 + (UBITS(11) - 24);
 	else switch (UBITS(11))
-		{
+	{
+		case 8:		/* macroblock_escape */
+			DUMPBITS(11);
+			return 0x23;
 
-			case 8:		/* macroblock_escape */
+		case 15:	/* macroblock_stuffing (MPEG1 only) */
+			if (decoder.mpeg1)
+			{
 				DUMPBITS(11);
-				return 0x23;
+				return 0x22;
+			}
 
-			case 15:	/* macroblock_stuffing (MPEG1 only) */
-				if (decoder.mpeg1)
-				{
-					DUMPBITS(11);
-					return 0x22;
-				}
-
-			default:
-				return 0;//error
-		}
+		default:
+			return 0;//error
+	}
 
 	DUMPBITS(mba->len);
 
@@ -336,7 +337,7 @@ do {							\
 	val = (((s32)val) >> 31) ^ 2047;			\
 } while (0)
 
-static __fi bool get_intra_block()
+static bool get_intra_block()
 {
 	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
 	const u8 (&quant_matrix)[64] = decoder.iq;
@@ -474,7 +475,8 @@ static __fi bool get_intra_block()
 				}
 
 				/* if (bitstream_get (1)) val = -val; */
-				val = (val ^ SBITS(1)) - SBITS(1);
+				int bit1 = SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}
 
@@ -489,7 +491,7 @@ static __fi bool get_intra_block()
   return true;
 }
 
-static __fi bool get_non_intra_block(int * last)
+static bool get_non_intra_block(int * last)
 {
 	int i;
 	int j;
@@ -615,8 +617,9 @@ static __fi bool get_non_intra_block(int * last)
 			}
 			else
 			{
+				int bit1 = SBITS(1);
 				val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-				val = (val ^ SBITS(1)) - SBITS(1);
+				val = (val ^ bit1) - bit1;
 				DUMPBITS(1);
 			}
 
@@ -683,25 +686,11 @@ void __fi finishmpeg2sliceIDEC()
 {
 	ipuRegs.ctrl.SCD = 0;
 	coded_block_pattern = decoder.coded_block_pattern;
-
-	g_BP.BP += decoder.bitstream_bits - 16;
-
-	if ((int)g_BP.BP < 0)
-	{
-		g_BP.BP = 128 + (int)g_BP.BP;
-
-		// After BP is positioned correctly, we need to reload the old buffer
-		// so that reading may continue properly
-		ReorderBitstream();
-	}
-
-	FillInternalBuffer(&g_BP.BP, 1, 0);
 }
 
 bool mpeg2sliceIDEC()
 {
 	u16 code;
-	u8 bit8;
 
 	switch (ipu_cmd.pos[0])
 	{
@@ -855,18 +844,18 @@ bool mpeg2sliceIDEC()
 					}
 					else switch (UBITS(11))
 					{
-							case 8:		/* macroblock_escape */
-								mbaCount += 33;
-								/* pass through */
+						case 8:		/* macroblock_escape */
+							mbaCount += 33;
+							/* pass through */
 
-							case 15:	/* macroblock_stuffing (MPEG1 only) */
-								DUMPBITS(11);
-								continue;
+						case 15:	/* macroblock_stuffing (MPEG1 only) */
+							DUMPBITS(11);
+							continue;
 
-							default:	/* end of slice/frame, or error? */
-							{
-								goto finish_idec;	
-							}
+						default:	/* end of slice/frame, or error? */
+						{
+							goto finish_idec;	
+						}
 					}
 				}
 
@@ -897,12 +886,13 @@ bool mpeg2sliceIDEC()
 			ipu_cmd.pos[1] = 0;
 			ipu_cmd.pos[2] = 0;
 		}
-		
+
 finish_idec:
 		finishmpeg2sliceIDEC();
 
 	case 3:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 3;
@@ -911,10 +901,10 @@ finish_idec:
 
 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
+	}
 
 	case 4:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
@@ -923,7 +913,7 @@ finish_idec:
 			return false;
 		}
 
-		BigEndian(ipuRegs.top, ipuRegs.top);
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
 
 	jNO_DEFAULT;
@@ -935,7 +925,6 @@ finish_idec:
 bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
-	u8 bit8;
 
 	macroblock_8& mb8 = decoder.mb8;
 	macroblock_16& mb16 = decoder.mb16;
@@ -1022,7 +1011,31 @@ bool mpeg2_slice()
 			jNO_DEFAULT;
 			}
 
-			ipu_copy(mb8, mb16);
+			// Copy macroblock8 to macroblock16 - without sign extension.
+			// Manually inlined due to MSVC refusing to inline the SSE-optimized version.
+			{
+				const u8	*s = (const u8*)&mb8;
+				u16			*d = (u16*)&mb16;
+
+				// Y  plane: 16 * 16 = 256 bytes
+				// Cr plane:  8 *  8 =  64 bytes
+				// Cb plane:  8 *  8 =  64 bytes
+
+				__m128i zeroreg = _mm_setzero_si128();
+
+				for (uint i = 0; i < (256+64+64) / 32; ++i)
+				{
+					//*d++ = *s++;
+					__m128i woot1 = _mm_load_si128((__m128i*)s);
+					__m128i woot2 = _mm_load_si128((__m128i*)s+1);
+					_mm_store_si128((__m128i*)d,	_mm_unpacklo_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+1,	_mm_unpackhi_epi8(woot1, zeroreg));
+					_mm_store_si128((__m128i*)d+2,	_mm_unpacklo_epi8(woot2, zeroreg));
+					_mm_store_si128((__m128i*)d+3,	_mm_unpackhi_epi8(woot2, zeroreg));
+					s += 32;
+					d += 32;
+				}
+			}
 		}
 		else
 		{
@@ -1096,18 +1109,6 @@ bool mpeg2_slice()
 		// Send The MacroBlock via DmaIpuFrom
 		ipuRegs.ctrl.SCD = 0;
 		coded_block_pattern = decoder.coded_block_pattern;
-		g_BP.BP += (int)decoder.bitstream_bits - 16;
-
-		// BP goes from 0 to 128, so negative values mean to read old buffer
-		// so we minus from 128 to get the correct BP
-		if ((int)g_BP.BP < 0)
-		{
-			g_BP.BP = 128 + (int)g_BP.BP;
-
-			// After BP is positioned correctly, we need to reload the old buffer
-			// so that reading may continue properly
-			ReorderBitstream();
-		}
 
 		decoder.mbc = 1;
 		decoder.SetOutputTo(mb16);
@@ -1131,7 +1132,8 @@ bool mpeg2_slice()
 	}
 	
 	case 4:
-		bit8 = 1;
+	{
+		u8 bit8;
 		if (!getBits8((u8*)&bit8, 0))
 		{
 			ipu_cmd.pos[0] = 4;
@@ -1140,11 +1142,11 @@ bool mpeg2_slice()
 
 		if (bit8 == 0)
 		{
-			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
+			g_BP.Align();
 			ipuRegs.ctrl.SCD = 1;
 		}
-
+	}
+	
 	case 5:
 		if (!getBits32((u8*)&ipuRegs.top, 0))
 		{
@@ -1152,8 +1154,7 @@ bool mpeg2_slice()
 			return false;
 		}
 
-		BigEndian(ipuRegs.top, ipuRegs.top);
-		decoder.bitstream_bits = 0;
+		ipuRegs.top = BigEndian(ipuRegs.top);
 		break;
 	}
 
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h
index 877f4cdd46..5ea46631e7 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@@ -152,8 +152,8 @@ struct decoder_t {
 	uint ipu0_idx;
 
 	/* bit parsing stuff */
-	u32 bitstream_buf;		/* current 32 bit working set */
-	int bitstream_bits;			/* used bits in working set */
+	//u32 bitstream_buf;		/* current 32 bit working set */
+	//int bitstream_bits;			/* used bits in working set */
 
 	int quantizer_scale;	/* remove */
 	int dmv_offset;		/* remove */
@@ -241,6 +241,10 @@ struct mpeg2_scan_pack
 	mpeg2_scan_pack();
 };
 
+extern int bitstream_init ();
+extern u32 UBITS(uint bits);
+extern s32 SBITS(uint bits);
+
 extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
 extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
 
@@ -258,20 +262,19 @@ extern int get_dmv();
 extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
 extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
 extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
-extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);
 
 extern int slice (u8 * buffer);
 
 #ifdef _MSC_VER
-#define BigEndian(out, in) out = _byteswap_ulong(in)
+#define BigEndian(in) _byteswap_ulong(in)
 #else
-#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
+#define BigEndian(in) __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif
 
 #ifdef _MSC_VER
-#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#define BigEndian64(in) _byteswap_uint64(in)
 #else
-#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#define BigEndian64(in) __builtin_bswap64(in) // or we could use the asm function bswap...
 #endif
 
 extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
diff --git a/pcsx2/IPU/mpeg2lib/Vlc.h b/pcsx2/IPU/mpeg2lib/Vlc.h
index 86f9f7ad5e..cac61dd40c 100644
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@@ -30,64 +30,24 @@
 #ifndef __VLC_H__
 #define __VLC_H__
 
-//static u8 word[4];
-//static u8 dword[8];
-//static u8 qword[16];
-
 static __fi int GETWORD()
 {
-	if (decoder.bitstream_bits <= 0) return 1;
-
-	static u8 data[2];
-	
-	if(!getBits16(data,1))
-	{
-		return 0;
-	}
-	
-	/*u32 data;
-	BigEndian(data, *(u32*)word);
-	decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
-	decoder.bitstream_bits -= 32;*/
-	decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
-	decoder.bitstream_bits -= 16;
-
-	return 1;
+	return g_BP.FillBuffer(16);
 }
 
-static __fi int bitstream_init ()
+// Removes bits from the bitstream.  This is done independently of UBITS/SBITS because a
+// lot of mpeg decoding has to read ahead, rewind, and re-read the same bits at a different
+// bit depth or signedness.
+static __fi void DUMPBITS(uint num)
 {
-	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
-	{
-		return 0;
-	}
-
-	decoder.bitstream_bits = -16;
-	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
-	/*decoder.bitstream_buf = *(u64*)dword;
-	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/
-
-	return 1;
+	g_BP.Advance(num);
+	//pxAssume(g_BP.FP != 0);
 }
 
-/* remove num valid bits from bit_buf */
-static __fi void DUMPBITS(int num)
+static __fi u32 GETBITS(uint num)
 {
-	decoder.bitstream_buf <<= num;
-    decoder.bitstream_bits += num;
-}
-
-/* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
-
-/* Get bits from bitstream */
-static __fi u32 GETBITS(int num)
-{
-	u16 retVal = UBITS(num);
-	DUMPBITS(num);
+	uint retVal = UBITS(num);
+	g_BP.Advance(num);
 
 	return retVal;
 }
diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h
index bf506a3d58..a981837907 100644
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@@ -24,7 +24,7 @@
 //  the lower 16 bit value.  IF the change is breaking of all compatibility with old
 //  states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
 
-static const u32 g_SaveVersion = 0x8b4a0000;
+static const u32 g_SaveVersion = 0x8b4b0000;
 
 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)