diff --git a/pcsx2/Dmac.h b/pcsx2/Dmac.h
index 75c1496cc9..f3432c7013 100644
--- a/pcsx2/Dmac.h
+++ b/pcsx2/Dmac.h
@@ -161,8 +161,8 @@ union tDMA_SADR {
 
 union tDMA_MADR {
 	struct {
-		u32 ADDR : 31; // Transfer memory address
-		u32 SPR : 1; // Memory/SPR Address
+		u32 ADDR : 31;	// Transfer memory address
+		u32 SPR : 1;	// Memory/SPR Address
 	};
 	u32 _u32;
 
@@ -175,8 +175,8 @@ union tDMA_MADR {
 
 union tDMA_TADR {
 	struct {
-		u32 ADDR : 31; // Next Tag address
-		u32 SPR : 1; // Memory/SPR Address
+		u32 ADDR : 31;	// Next Tag address
+		u32 SPR : 1;	// Memory/SPR Address
 	};
 	u32 _u32;
 
@@ -190,8 +190,8 @@ union tDMA_TADR {
 // The Address Stack Register
 union tDMA_ASR {
 	struct {
-		u32 ADDR : 31; // Tag memory address
-		u32 SPR : 1; // Memory/SPR Address
+		u32 ADDR : 31;	// Tag memory address
+		u32 SPR : 1;	// Memory/SPR Address
 	};
 	u32 _u32;
 
diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp
index 28122713c4..79ca234fc2 100644
--- a/pcsx2/FiFo.cpp
+++ b/pcsx2/FiFo.cpp
@@ -94,27 +94,7 @@ void __fastcall ReadFIFO_page_6(u32 mem, u64 *out)
 	out[1] = psHu64(GIF_FIFO + 8);
 }
 
-void __fastcall ReadFIFO_page_7(u32 mem, u64 *out)
-{
-	pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) );
-
-	// All addresses in this page map to 0x7000 and 0x7010:
-	mem &= 0x10;
-
-	if( mem == 0 ) // IPUout_FIFO
-	{
-		if( g_nIPU0Data > 0 )
-		{
-			out[0] = *(u64*)(g_pIPU0Pointer);
-			out[1] = *(u64*)(g_pIPU0Pointer+8);
-			ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
-			g_nIPU0Data--;
-			g_pIPU0Pointer += 16;
-		}
-	}
-	else // IPUin_FIFO
-		ipu_fifo.out.readsingle((void*)out);
-}
+// ReadFIFO_page_7 is defined in IPU/IPU_Fifo.cpp
 
 //////////////////////////////////////////////////////////////////////////
 // WriteFIFO Pages
diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp
index 9160f98ec9..fdf9bc6779 100644
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@@ -43,12 +43,6 @@ tIPU_DMA g_nDMATransfer(0);
 tIPU_cmd ipu_cmd;
 IPUStatus IPU1Status;
 
-// FIXME - g_nIPU0Data and Pointer are not saved in the savestate, which breaks savestates for some
-// FMVs at random (if they get saved during the half frame of a 30fps rate).  The fix is complicated
-// since coroutine is such a pita.  (air)
-int g_nIPU0Data = 0; // data left to transfer
-u8* g_pIPU0Pointer = NULL;
-
 void ReorderBitstream();
 
 // the BP doesn't advance and returns -1 if there is no data to be read
@@ -59,72 +53,36 @@ void IPUWorker();
 // Color conversion stuff, the memory layout is a total hack
 // convert_data_buffer is a pointer to the internal rgb struct (the first param in convert_init_t)
 //char convert_data_buffer[sizeof(convert_rgb_t)];
-char convert_data_buffer[0x1C];
+//char convert_data_buffer[0x1C];							// unused?
+//u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'};		// unused?
 
 // Quantization matrix
-// Pointers outside of IPU.cpp point to niq & iq. As such, all hell breaks loose under gcc if you make them static. 
-u8 niq[64];			//non-intraquant matrix
-u8 iq[64];			//intraquant matrix
-u16 vqclut[16];				//clut conversion table
-static u8 s_thresh[2];		//thresholds for color conversions
+static u16 vqclut[16];				//clut conversion table
+static u8 s_thresh[2];				//thresholds for color conversions
 int coded_block_pattern = 0;
 
-__aligned16 macroblock_8 mb8;
-__aligned16 macroblock_16 mb16;
-__aligned16 macroblock_rgb32 rgb32;
-__aligned16 macroblock_rgb16 rgb16;
 
 u8 indx4[16*16/2];
-bool mpeg2_inited = false;		//mpeg2_idct_init() must be called only once
-u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'};
-__aligned16 decoder_t decoder;						//static, only to place it in bss
-
-extern "C"
-{
-	extern u8 mpeg2_scan_norm[64];
-	extern u8 mpeg2_scan_alt[64];
-}
+__aligned16 decoder_t decoder;
 
 __aligned16 u8 _readbits[80];	//local buffer (ring buffer)
-u8* readbits = _readbits; // always can decrement by one 1qw
+u8* readbits = _readbits;		// always can decrement by one 1qw
 
 __forceinline void IPUProcessInterrupt()
 {
 	if (ipuRegs->ctrl.BUSY && g_BP.IFC) IPUWorker();
 }
 
-void init_g_decoder()
-{
-	//other stuff
-	decoder.intra_quantizer_matrix = (u8*)iq;
-	decoder.non_intra_quantizer_matrix = (u8*)niq;
-	decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
-	decoder.stride = 16;
-}
-
-void mpeg2_init()
-{
-	if (!mpeg2_inited)
-	{
-		mpeg2_idct_init();
-		yuv2rgb_init();
-		memzero(mb8.Y);
-		memzero(mb8.Cb);
-		memzero(mb8.Cr);
-		memzero(mb16.Y);
-		memzero(mb16.Cb);
-		memzero(mb16.Cr);
-		mpeg2_inited = true;
-	}
-}
-
 /////////////////////////////////////////////////////////
 // Register accesses (run on EE thread)
 int ipuInit()
 {
 	memzero(*ipuRegs);
 	memzero(g_BP);
-	init_g_decoder();
+	memzero(decoder);
+
+	decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
+
 	g_nDMATransfer.reset();
 	IPU1Status.InProgress = false;
 	IPU1Status.DMAMode = DMA_MODE_NORMAL;
@@ -149,18 +107,16 @@ void ReportIPU()
 	Console.WriteLn(ipu_fifo.in.desc());
 	Console.WriteLn(ipu_fifo.out.desc());
 	Console.WriteLn(g_BP.desc());
-	Console.WriteLn("niq = 0x%x, iq = 0x%x.", niq, iq);
 	Console.WriteLn("vqclut = 0x%x.", vqclut);
 	Console.WriteLn("s_thresh = 0x%x.", s_thresh);
 	Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
-	Console.WriteLn("g_decoder = 0x%x.", decoder);
-	Console.WriteLn("mpeg2: scan_norm = 0x%x, alt = 0x%x.", mpeg2_scan_norm, mpeg2_scan_alt);
+	Console.WriteLn("g_decoder = 0x%x.", &decoder);
+	Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
 	Console.WriteLn(ipu_cmd.desc());
 	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
 		_readbits, readbits - _readbits);
 	Console.Newline();
 }
-// fixme - ipuFreeze looks fairly broken. Should probably take a closer look at some point.
 
 void SaveStateBase::ipuFreeze()
 {
@@ -168,24 +124,15 @@ void SaveStateBase::ipuFreeze()
 	//ReportIPU();
 	FreezeTag("IPU");
 
-	// old versions saved the IPU regs, but they're already saved as part of HW!
-	//FreezeMem(ipuRegs, sizeof(IPUregisters));
-
 	Freeze(g_nDMATransfer);
 	Freeze(ipu_fifo);
 
 	Freeze(g_BP);
-	Freeze(niq);
-	Freeze(iq);
 	Freeze(vqclut);
 	Freeze(s_thresh);
 	Freeze(coded_block_pattern);
 	Freeze(decoder);
-	Freeze(mpeg2_scan_norm);
-	Freeze(mpeg2_scan_alt);
-
 	Freeze(ipu_cmd);
-
 	Freeze(_readbits);
 
 	int temp = readbits - _readbits;
@@ -194,16 +141,9 @@ void SaveStateBase::ipuFreeze()
 	if (IsLoading())
 	{
 		readbits = _readbits;
-		init_g_decoder();
-		mpeg2_init();
 	}
 }
 
-bool ipuCanFreeze()
-{
-	return (ipu_cmd.current == -1);
-}
-
 __forceinline u32 ipuRead32(u32 mem)
 {
 	// Note: It's assumed that mem's input value is always in the 0x10002000 page
@@ -223,7 +163,7 @@ __forceinline u32 ipuRead32(u32 mem)
 			if (!ipuRegs->ctrl.BUSY)
 				IPU_LOG("Ipu read32: IPU_CTRL=0x%08X %x", ipuRegs->ctrl._u32, cpuRegs.pc);
 
-			return ipuRegs->ctrl._u32;
+		return ipuRegs->ctrl._u32;
 
 		ipucase(IPU_BP): // IPU_BP
 			ipuRegs->ipubp = g_BP.BP & 0x7f;
@@ -231,7 +171,8 @@ __forceinline u32 ipuRead32(u32 mem)
 			ipuRegs->ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
 
 			IPU_LOG("Ipu read32: IPU_BP=0x%08X", ipuRegs->ipubp);
-			return ipuRegs->ipubp;
+		return ipuRegs->ipubp;
+
 		default:
 			IPU_LOG("Ipu read32: Addr=0x%x Value = 0x%08X", mem, *(u32*)(((u8*)ipuRegs) + mem));
 	}
@@ -277,7 +218,6 @@ __forceinline u64 ipuRead64(u32 mem)
 
 void ipuSoftReset()
 {
-	mpeg2_init();
 	ipu_fifo.clear();
 
 	coded_block_pattern = 0;
@@ -381,16 +321,16 @@ static BOOL ipuIDEC(u32 val, bool resume)
 		g_BP.BP += idec.FB;//skip FB bits
 		//from IPU_CTRL
 		ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
-		decoder.coding_type = ipuRegs->ctrl.PCT;
-		decoder.mpeg1 = ipuRegs->ctrl.MP1;
-		decoder.q_scale_type	= ipuRegs->ctrl.QST;
-		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+		decoder.coding_type			= ipuRegs->ctrl.PCT;
+		decoder.mpeg1				= ipuRegs->ctrl.MP1;
+		decoder.q_scale_type		= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format	= ipuRegs->ctrl.IVF;
+		decoder.scantype			= ipuRegs->ctrl.AS;
+		decoder.intra_dc_precision	= ipuRegs->ctrl.IDP;
 
 		//from IDEC value
-		decoder.quantizer_scale = idec.QSC;
-		decoder.frame_pred_frame_dct = !idec.DTD;
+		decoder.quantizer_scale		= idec.QSC;
+		decoder.frame_pred_frame_dct= !idec.DTD;
 		decoder.sgn = idec.SGN;
 		decoder.dte = idec.DTE;
 		decoder.ofm = idec.OFM;
@@ -414,21 +354,21 @@ static __forceinline BOOL ipuBDEC(u32 val, bool resume)
 		if (IsDebugBuild) s_bdec++;
 
 		g_BP.BP += bdec.FB;//skip FB bits
-		decoder.coding_type = I_TYPE;
-		decoder.mpeg1 = ipuRegs->ctrl.MP1;
-		decoder.q_scale_type	= ipuRegs->ctrl.QST;
-		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+		decoder.coding_type			= I_TYPE;
+		decoder.mpeg1				= ipuRegs->ctrl.MP1;
+		decoder.q_scale_type		= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format	= ipuRegs->ctrl.IVF;
+		decoder.scantype			= ipuRegs->ctrl.AS;
+		decoder.intra_dc_precision	= ipuRegs->ctrl.IDP;
 
 		//from BDEC value
-		decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
-		decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
-		decoder.dcr = bdec.DCR;
-		decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
+		decoder.quantizer_scale		= decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
+		decoder.macroblock_modes	= bdec.DT ? DCT_TYPE_INTERLACED : 0;
+		decoder.dcr					= bdec.DCR;
+		decoder.macroblock_modes	|= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
 
-		memzero_sse_a(mb8);
-		memzero_sse_a(mb16);
+		memzero_sse_a(decoder.mb8);
+		memzero_sse_a(decoder.mb16);
 	}
 
 	return mpeg2_slice();
@@ -516,6 +456,8 @@ static BOOL ipuSETIQ(u32 val)
 
 	if ((val >> 27) & 1)
 	{
+		u8 (&niq)[64] = decoder.niq;
+
 		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
 		{
 			if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
@@ -531,6 +473,8 @@ static BOOL ipuSETIQ(u32 val)
 	}
 	else
 	{
+		u8 (&iq)[64] = decoder.iq;
+
 		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
 		{
 			if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
@@ -552,7 +496,7 @@ static BOOL ipuSETVQ(u32 val)
 {
 	for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)
 	{
-		if (!getBits64((u8*)vqclut + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+		if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 	}
 
 	IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
@@ -591,17 +535,17 @@ static BOOL __fastcall ipuCSC(u32 val)
 	{
 		for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
 		{
-			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+			if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}
 
-		ipu_csc(mb8, rgb32, 0);
-		if (csc.OFM) ipu_dither(rgb32, rgb16, csc.DTE);
+		ipu_csc(decoder.mb8, decoder.rgb32, 0);
+		if (csc.OFM) ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
 		
 		if (csc.OFM)
 		{
 			while (ipu_cmd.pos[1] < 32)
 			{
-				ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
+				ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
 
 				if (ipu_cmd.pos[1] <= 0) return FALSE;
 			}
@@ -610,7 +554,7 @@ static BOOL __fastcall ipuCSC(u32 val)
 		{
 			while (ipu_cmd.pos[1] < 64)
 			{
-				ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]);
+				ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]);
 
 				if (ipu_cmd.pos[1] <= 0) return FALSE;
 			}
@@ -633,17 +577,17 @@ static BOOL ipuPACK(u32 val)
 	{
 		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
 		{
-			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+			if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}
 
-		ipu_csc(mb8, rgb32, 0);
-		ipu_dither(rgb32, rgb16, csc.DTE);
+		ipu_csc(decoder.mb8, decoder.rgb32, 0);
+		ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
 
-		if (csc.OFM) ipu_vq(rgb16, indx4);
+		if (csc.OFM) ipu_vq(decoder.rgb16, indx4);
 		
 		if (csc.OFM)
 		{
-			ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
+			ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
 
 			if (ipu_cmd.pos[1] < 32) return FALSE;
 		}
diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h
index cc3be3772f..a8f9ed30bd 100644
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@@ -342,8 +342,6 @@ struct tIPU_cmd
 
 extern tIPU_cmd ipu_cmd;
 extern int coded_block_pattern;
-extern int g_nIPU0Data; // or 0x80000000 whenever transferring
-extern u8* g_pIPU0Pointer;
 extern IPUStatus IPU1Status;
 extern tIPU_DMA g_nDMATransfer;
 
diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp
index b6a3b08127..e1467724e2 100644
--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@@ -19,7 +19,7 @@
 #include "mpeg2lib/Mpeg.h"
 
 
-IPU_Fifo ipu_fifo;
+__aligned16 IPU_Fifo ipu_fifo;
 
 void IPU_Fifo::init()
 {
@@ -167,3 +167,32 @@ void IPU_Fifo_Output::readsingle(void *value)
 		_readsingle(value);
 	}
 }
+
+__forceinline bool decoder_t::ReadIpuData(u128* out)
+{
+	// Copies the next pending 16-byte quadword to *out; returns false (out untouched) when none is left.
+	if (ipu0_data == 0) return false;	// test this object's own counter, not the global 'decoder'
+	_mm_store_ps((float*)out, _mm_load_ps((float*)GetIpuDataPtr()));	// aligned load/store: both pointers must be 16-byte aligned
+
+	--ipu0_data;	// one quadword fewer pending
+	++ipu0_idx;	// step to the following quadword
+	return true;
+}
+
+void __fastcall ReadFIFO_page_7(u32 mem, u64 *out)	// 128-bit read handler for the IPU FIFO page
+{
+	pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) );
+
+	// All addresses in this page map to 0x7000 and 0x7010:
+	mem &= 0x10;	// keep only the bit that tells the two FIFO registers apart
+
+	if (mem == 0) // IPUout_FIFO
+	{
+		if (decoder.ReadIpuData((u128*)out))	// copies one quadword into out; out is left untouched when nothing is pending
+		{
+			ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;	// advance by 4 u32 slots, wrapping at 32
+		}
+	}
+	else // IPUin_FIFO
+		ipu_fifo.out.readsingle((void*)out);	// NOTE(review): this branch is labeled IPUin_FIFO but reads ipu_fifo.out -- confirm intent
+}
diff --git a/pcsx2/IPU/IPU_Fifo.h b/pcsx2/IPU/IPU_Fifo.h
index 6ea658cfd7..10a1e940d3 100644
--- a/pcsx2/IPU/IPU_Fifo.h
+++ b/pcsx2/IPU/IPU_Fifo.h
@@ -16,12 +16,14 @@
 #ifndef IPU_FIFO_H_INCLUDED
 #define IPU_FIFO_H_INCLUDED
 
-class IPU_Fifo_Input
-{
-	public:
+// Important!  All FIFO containers in this header should be 'struct' type, not class type.
+// They are saved into the savestate as-is, and keeping them as plain structs (public data
+// members only, no virtual functions) ensures that the layout of their contents is reliable.
 
-	int readpos, writepos;
+struct IPU_Fifo_Input
+{
 	__aligned16 u32 data[32];
+	int readpos, writepos;
 
 	int write(u32* pMem, int size);
 	int read(void *value);
@@ -29,12 +31,10 @@ class IPU_Fifo_Input
 	wxString desc() const;
 };
 
-class IPU_Fifo_Output
+struct IPU_Fifo_Output
 {
-	public:
-
-	int readpos, writepos;
 	__aligned16 u32 data[32];
+	int readpos, writepos;
 
 	// returns number of qw read
 	int write(const u32 * value, int size);
@@ -42,20 +42,19 @@ class IPU_Fifo_Output
 	void readsingle(void *value);
 	void clear();
 	wxString desc() const;
-	private:
+
 	void _readsingle(void *value);
 };
 
-class IPU_Fifo
+struct IPU_Fifo
 {
-	public:
-	IPU_Fifo_Input in;
-	IPU_Fifo_Output out;
+	__aligned16 IPU_Fifo_Input in;
+	__aligned16 IPU_Fifo_Output out;
 
 	void init();
 	void clear();
 };
 
-extern IPU_Fifo ipu_fifo;
+extern __aligned16 IPU_Fifo ipu_fifo;
 
 #endif // IPU_FIFO_H_INCLUDED
diff --git a/pcsx2/IPU/mpeg2lib/Idct.cpp b/pcsx2/IPU/mpeg2lib/Idct.cpp
index cb2012adc1..69daaa89b6 100644
--- a/pcsx2/IPU/mpeg2lib/Idct.cpp
+++ b/pcsx2/IPU/mpeg2lib/Idct.cpp
@@ -22,10 +22,15 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  */
 
+// [TODO] : There are modern SSE versions of idct (idct_mmx.c) in the mpeg2 libs that we
+// should probably upgrade to.  They use their own raw-style intrinsics and not the intel
+// compiler-integrated ones.
+
 #include "PrecompiledHeader.h"
 
 #include "Common.h"
 #include "IPU/IPU.h"
+#include "Mpeg.h"
 
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -36,19 +41,14 @@
 #define clp(val,res)	res = (val < 0) ? 0 : ((val > 255) ? 255 : val);
 #define clp2(val,res)	res = (val < -255) ? -255 : ((val > 255) ? 255 : val);
 
-/* idct main entry point  */
-void (__fastcall *mpeg2_idct_copy) (s16 * block, u8 * dest, int stride);
-/* JayteeMaster: changed dest to 16 bit signed */
-void (__fastcall *mpeg2_idct_add) (int last, s16 * block,
-			 /*u8*/s16 * dest, int stride);
-
 /*
  * In legal streams, the IDCT output should be between -384 and +384.
  * In corrupted streams, it is possible to force the IDCT output to go
  * to +-3826 - this is the worst case for a column IDCT where the
  * column inputs are 16-bit values.
  */
-static u8 clip_lut[1024];
+static __aligned16 u8 clip_lut[1024];
+
 #define CLIP(i) ((clip_lut+384)[(i)])
 
 #if 0
@@ -75,13 +75,13 @@ static __forceinline void idct_row (s16 * const block)
     /* shortcut */
     if (!(block[1] | ((s32 *)block)[1] | ((s32 *)block)[2] |
 		  ((s32 *)block)[3])) {
-	u32 tmp = (u16) (block[0] << 3);
-	tmp |= tmp << 16;
-	((s32 *)block)[0] = tmp;
-	((s32 *)block)[1] = tmp;
-	((s32 *)block)[2] = tmp;
-	((s32 *)block)[3] = tmp;
-	return;
+		u32 tmp = (u16) (block[0] << 3);
+		tmp |= tmp << 16;
+		((s32 *)block)[0] = tmp;
+		((s32 *)block)[1] = tmp;
+		((s32 *)block)[2] = tmp;
+		((s32 *)block)[3] = tmp;
+		return;
     }
 
     d0 = (block[0] << 11) + 128;
@@ -160,122 +160,97 @@ static __forceinline void idct_col (s16 * const block)
     block[8*7] = (a0 - b0) >> 17;
 }
 
-static void __fastcall mpeg2_idct_copy_c (s16 * block, u8 * dest,
-			       const int stride)
+__releaseinline void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)
 {
     int i;
 
     for (i = 0; i < 8; i++)
-	idct_row (block + 8 * i);
+		idct_row (block + 8 * i);
     for (i = 0; i < 8; i++)
-	idct_col (block + i);
+		idct_col (block + i);
+
+	__m128 zero = _mm_setzero_ps();
     do {
-	dest[0] = CLIP (block[0]);
-	dest[1] = CLIP (block[1]);
-	dest[2] = CLIP (block[2]);
-	dest[3] = CLIP (block[3]);
-	dest[4] = CLIP (block[4]);
-	dest[5] = CLIP (block[5]);
-	dest[6] = CLIP (block[6]);
-	dest[7] = CLIP (block[7]);
+		dest[0] = CLIP (block[0]);
+		dest[1] = CLIP (block[1]);
+		dest[2] = CLIP (block[2]);
+		dest[3] = CLIP (block[3]);
+		dest[4] = CLIP (block[4]);
+		dest[5] = CLIP (block[5]);
+		dest[6] = CLIP (block[6]);
+		dest[7] = CLIP (block[7]);
 
-	block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;
-	block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+		_mm_store_ps((float*)block, zero);
 
-	dest += stride;
-	block += 8;
+		dest += stride;
+		block += 8;
     } while (--i);
 }
 
-/* JayteeMaster: changed dest to 16 bit signed */
-static void __fastcall mpeg2_idct_add_c (const int last, s16 * block,
-			      /*u8*/s16 * dest, const int stride)
+
+// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
+__releaseinline void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride)
 {
-    int i;
+	// On the IPU, stride is always a multiple of one quadword, i.e. of 8 s16 units (bottom 3 bits are 0).
 
-    if (last != 129 || (block[0] & 7) == 4) {
-	for (i = 0; i < 8; i++)
-	    idct_row (block + 8 * i);
-	for (i = 0; i < 8; i++)
-	    idct_col (block + i);
-	do {
-	    dest[0] = block[0];
-	    dest[1] = block[1];
-	    dest[2] = block[2];
-	    dest[3] = block[3];
-	    dest[4] = block[4];
-	    dest[5] = block[5];
-	    dest[6] = block[6];
-	    dest[7] = block[7];
+    if (last != 129 || (block[0] & 7) == 4)
+    {
+		int i;
+		for (i = 0; i < 8; i++)
+			idct_row (block + 8 * i);
+		for (i = 0; i < 8; i++)
+			idct_col (block + i);
 
-	    block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;
-	    block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;
+		__m128 zero = _mm_setzero_ps();
+		do {
+			_mm_store_ps((float*)dest, _mm_load_ps((float*)block));
+			_mm_store_ps((float*)block, zero);
 
-	    dest += stride;
-	    block += 8;
-	} while (--i);
-    } else {
-	int DC;
+			dest += stride;
+			block += 8;
+		} while (--i);
 
-	DC = (block[0] + 4) >> 3;
-	block[0] = block[63] = 0;
-	i = 8;
-	do {
-	    dest[0] = DC;
-	    dest[1] = DC;
-	    dest[2] = DC;
-	    dest[3] = DC;
-	    dest[4] = DC;
-	    dest[5] = DC;
-	    dest[6] = DC;
-	    dest[7] = DC;
-	    dest += stride;
-	} while (--i);
+    }
+    else
+    {
+		int DC = (block[0] + 4) >> 3;
+		s16 dcf[2] = { DC, DC };
+		block[0] = block[63] = 0;
+
+		__m128 dc128 = _mm_set_ps1(*(float*)dcf);
+
+		for(int i=0; i<8; ++i)
+			_mm_store_ps((float*)(dest+(stride*i)), dc128);
     }
 }
 
-extern "C"
+mpeg2_scan_pack::mpeg2_scan_pack()
 {
-u8 mpeg2_scan_norm[64] = {
-    /* Zig-Zag scan pattern */
-     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
-};
+	static const u8 mpeg2_scan_norm[64] = {
+		/* Zig-Zag scan pattern */
+		0,  1,  8,  16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+		12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+		35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+		58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+	};
 
-u8 mpeg2_scan_alt[64] = {
-    /* Alternate scan pattern */
-     0, 8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
-    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
-    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
-    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
-};
-};
+	static const u8 mpeg2_scan_alt[64] = {
+		/* Alternate scan pattern */
+		0,  8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
+		41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
+		51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
+		53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
+	};
 
-// The MMX verson wasn't being used and it was only available as a .obj,
-// so I removed it (gigaherz).
-///* idct_mmx.c */
-//void mpeg2_idct_copy_mmxext (s16 * block, u8 * dest, int stride);
-//void mpeg2_idct_add_mmxext (int last, s16 * block,
-//			   s16 * dest, int stride);
-//void mpeg2_idct_copy_mmx (s16 * block, u8 * dest, int stride);
-//void mpeg2_idct_add_mmx (int last, s16 * block,
-//			   s16 * dest, int stride);
-//void mpeg2_idct_mmx_init (void);
-
-void mpeg2_idct_init()
-{
-	   int i, j;
-
-	mpeg2_idct_copy = mpeg2_idct_copy_c;
-	mpeg2_idct_add = mpeg2_idct_add_c;
-	for (i = -384; i < 640; i++)
+	for (int i = -384; i < 640; i++)
 		clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
-	for (i = 0; i < 64; i++) {
-		j = mpeg2_scan_norm[i];
-		mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+
+	for (int i = 0; i < 64; i++) {
+		int j = mpeg2_scan_norm[i];
+		norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 		j = mpeg2_scan_alt[i];
-		mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+		alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 	}
 }
+
+const __aligned16 mpeg2_scan_pack mpeg2_scan;
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
index bf9c68ccfb..9fbc48c155 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@@ -33,7 +33,7 @@
 #include "Mpeg.h"
 #include "Vlc.h"
 
-int non_linear_quantizer_scale [] =
+const int non_linear_quantizer_scale [] =
 {
 	0,  1,  2,  3,  4,  5,	6,	7,
 	8, 10, 12, 14, 16, 18,  20,  22,
@@ -341,8 +341,8 @@ static __forceinline bool get_intra_block()
 	int i;
 	int j;
 	int val;
-	const u8 * scan = decoder.scan;
-	const u8 * quant_matrix = decoder.intra_quantizer_matrix;
+	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
+	const u8 (&quant_matrix)[64] = decoder.iq;
 	int quantizer_scale = decoder.quantizer_scale;
 	s16 * dest = decoder.DCTblock;
 	u16 code; 
@@ -493,8 +493,8 @@ static __forceinline bool get_non_intra_block(int * last)
 	int i;
 	int j;
 	int val;
-	const u8 * scan = decoder.scan;
-	const u8 * quant_matrix = decoder.non_intra_quantizer_matrix;
+	const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
+	const u8 (&quant_matrix)[64] = decoder.niq;
 	int quantizer_scale = decoder.quantizer_scale;
 	s16 * dest = decoder.DCTblock;
 	u16 code;
@@ -699,7 +699,6 @@ void __forceinline finishmpeg2sliceIDEC()
 
 bool mpeg2sliceIDEC()
 {
-	u32 read;
 	u16 code;
 	u8 bit8;
 
@@ -725,6 +724,10 @@ bool mpeg2sliceIDEC()
 		ipu_cmd.pos[0] = 2;
 		while (1)
 		{
+			macroblock_8& mb8 = decoder.mb8;
+			macroblock_rgb16& rgb16 = decoder.rgb16;
+			macroblock_rgb32& rgb32 = decoder.rgb32;
+
 			int DCT_offset, DCT_stride;
 			const MBAtab * mba;
 
@@ -747,13 +750,13 @@ bool mpeg2sliceIDEC()
 
 				if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
 				{
-					DCT_offset = decoder.stride;
-					DCT_stride = decoder.stride * 2;
+					DCT_offset = decoder_stride;
+					DCT_stride = decoder_stride * 2;
 				}
 				else
 				{
-					DCT_offset = decoder.stride * 8;
-					DCT_stride = decoder.stride;
+					DCT_offset = decoder_stride * 8;
+					DCT_stride = decoder_stride;
 				}
 
 				switch (ipu_cmd.pos[2])
@@ -784,13 +787,13 @@ bool mpeg2sliceIDEC()
 						return false;
 					}
 				case 5:
-					if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5))
+					if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[2] == 5))
 					{
 						ipu_cmd.pos[2] = 5;
 						return false;
 					}
 				case 6:
-					if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6))
+					if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[2] == 6))
 					{
 						ipu_cmd.pos[2] = 6;
 						return false;
@@ -801,22 +804,17 @@ bool mpeg2sliceIDEC()
 				ipu_csc(mb8, rgb32, decoder.sgn);
 
 				if (decoder.ofm == 0)
-				{
-					g_nIPU0Data = 64;
-					g_pIPU0Pointer = (u8*)&rgb32;
-				}
+					decoder.SetOutputTo(rgb32);
 				else
 				{
 					ipu_dither(rgb32, rgb16, decoder.dte);
-
-					g_nIPU0Data = 32;
-					g_pIPU0Pointer = (u8*)&rgb16;
+					decoder.SetOutputTo(rgb16);
 				}
 
 			case 2:
-				while (g_nIPU0Data > 0)
+				while (decoder.ipu0_data > 0)
 				{
-					read = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
+					uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
 
 					if (read == 0)
 					{
@@ -825,9 +823,7 @@ bool mpeg2sliceIDEC()
 					}
 					else
 					{
-						g_pIPU0Pointer += read * 16;
-						g_nIPU0Data -= read;
-
+						decoder.AdvanceIpuDataBy(read);
 					}
 				}
 
@@ -932,7 +928,9 @@ bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
 	u8 bit8;
-	u32 size;
+
+	macroblock_8& mb8 = decoder.mb8;
+	macroblock_16& mb16 = decoder.mb16;
 
 	switch (ipu_cmd.pos[0])
 	{
@@ -960,13 +958,13 @@ bool mpeg2_slice()
 
 		if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
 		{
-			DCT_offset = decoder.stride;
-			DCT_stride = decoder.stride * 2;
+			DCT_offset = decoder_stride;
+			DCT_stride = decoder_stride * 2;
 		}
 		else
 		{
-			DCT_offset = decoder.stride * 8;
-			DCT_stride = decoder.stride;
+			DCT_offset = decoder_stride * 8;
+			DCT_stride = decoder_stride;
 		}
 
 		if (decoder.macroblock_modes & MACROBLOCK_INTRA)
@@ -1000,13 +998,13 @@ bool mpeg2_slice()
 					return false;
 				}
 			case 5:
-				if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
+				if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5))
 				{
 					ipu_cmd.pos[1] = 5;
 					return false;
 				}
 			case 6:
-				if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
+				if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6))
 				{
 					ipu_cmd.pos[1] = 6;
 					return false;
@@ -1063,7 +1061,7 @@ bool mpeg2_slice()
 				case 5:
 					if (decoder.coded_block_pattern & 0x2)
 					{
-						if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
+						if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5))
 						{
 							ipu_cmd.pos[1] = 5;
 							return false;
@@ -1072,7 +1070,7 @@ bool mpeg2_slice()
 				case 6:
 					if (decoder.coded_block_pattern & 0x1)
 					{
-						if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
+						if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6))
 						{
 							ipu_cmd.pos[1] = 6;
 							return false;
@@ -1083,8 +1081,7 @@ bool mpeg2_slice()
 			}
 		}
 
-		//Send The MacroBlock via DmaIpuFrom
-		size = 0;	// Reset
+		// Send The MacroBlock via DmaIpuFrom
 		ipuRegs->ctrl.SCD = 0;
 		coded_block_pattern = decoder.coded_block_pattern;
 		g_BP.BP += (int)decoder.bitstream_bits - 16;
@@ -1101,13 +1098,12 @@ bool mpeg2_slice()
 		}
 
 		decoder.mbc = 1;
-		g_nIPU0Data = 48;
-		g_pIPU0Pointer = (u8*)&mb16;
+		decoder.SetOutputTo(mb16);
 
 	case 3:
-		while (g_nIPU0Data > 0)
+		while (decoder.ipu0_data > 0)
 		{
-			size = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
+			uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
 
 			if (size == 0)
 			{
@@ -1116,8 +1112,7 @@ bool mpeg2_slice()
 			}
 			else
 			{
-				g_pIPU0Pointer += size * 16;
-				g_nIPU0Data -= size;
+				decoder.AdvanceIpuDataBy(size);
 			}
 		}
 
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h
index 9c26c1696c..13431eb396 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@@ -66,6 +66,8 @@ __noinline void memzero_sse_a( T& dest )
 #undef MZFqwc
 };
 
+// the IPU is fixed to 16 byte strides (128-bit / QWC resolution):
+static const uint decoder_stride = 16;
 
 enum macroblock_modes
 {
@@ -106,25 +108,25 @@ enum picture_coding_type
 };
 
 struct macroblock_8{
-	unsigned char Y[16][16];	//0
-	unsigned char Cb[8][8];		//1
-	unsigned char Cr[8][8];		//2
+	u8 Y[16][16];		//0
+	u8 Cb[8][8];		//1
+	u8 Cr[8][8];		//2
 };
 
 struct macroblock_16{
-	short Y[16][16];			//0
-	short Cb[8][8];				//1
-	short Cr[8][8];				//2
+	s16 Y[16][16];			//0
+	s16 Cb[8][8];			//1
+	s16 Cr[8][8];			//2
 };
 
 struct macroblock_rgb32{
 	struct {
-		unsigned char r, g, b, a;
+		u8 r, g, b, a;
 	} c[16][16];
 };
 
 struct rgb16_t{
-	unsigned short r:5, g:5, b:5, a:1;
+	u16 r:5, g:5, b:5, a:1;
 };
 
 struct macroblock_rgb16{
@@ -138,24 +140,26 @@ struct decoder_t {
 	/* DCT coefficients - should be kept aligned ! */
 	s16 DCTblock[64];
 
+	u8 niq[64];			//non-intraquant matrix (sequence header)
+	u8 iq[64];			//intraquant matrix (sequence header)
+
+	macroblock_8 mb8;
+	macroblock_16 mb16;
+	macroblock_rgb32 rgb32;
+	macroblock_rgb16 rgb16;
+
+	uint ipu0_data;
+	uint ipu0_idx;
+
 	/* bit parsing stuff */
 	u32 bitstream_buf;		/* current 32 bit working set */
 	int bitstream_bits;			/* used bits in working set */
 
-	int stride;
-
-	/* predictor for DC coefficients in intra blocks */
-	s16 dc_dct_pred[3];
-
 	int quantizer_scale;	/* remove */
 	int dmv_offset;		/* remove */
 
 	/* now non-slice-specific information */
 
-	/* sequence header stuff */
-	u8 *intra_quantizer_matrix;
-	u8 *non_intra_quantizer_matrix;
-
 	/* picture header stuff */
 
 	/* what type of picture this is (I, P, B, D) */
@@ -163,6 +167,9 @@ struct decoder_t {
 
 	/* picture coding extension stuff */
 
+	/* predictor for DC coefficients in intra blocks */
+	s16 dc_dct_pred[3];
+
 	/* quantization factor for intra dc coefficients */
 	int intra_dc_precision;
 	/* top/bottom/both fields */
@@ -195,16 +202,47 @@ struct decoder_t {
 
 	/* stuff derived from bitstream */
 
-	/* pointer to the zigzag scan we're supposed to be using */
-	const u8 * scan;
+	/* the zigzag scan we're supposed to be using, true for alt, false for normal */
+	bool scantype;
 
 	int second_field;
 
 	int mpeg1;
+
+	template< typename T >
+	void SetOutputTo( T& obj )	// Points the ipu0_idx/ipu0_data cursor at obj; presumably obj is a buffer located at or after mb8 in this struct (the unsigned subtraction wraps otherwise).
+	{
+		uint mb_offset = ((uptr)&obj - (uptr)&mb8);		// byte distance of obj from mb8
+		pxAssume( (mb_offset & 15) == 0 );		// the offset must be a multiple of 16 bytes
+		ipu0_idx	= mb_offset / 16;		// start position, in 16-byte units
+		ipu0_data	= sizeof(obj)/16;		// size of obj in 16-byte units (integer division)
+	}
+
+	u128* GetIpuDataPtr()	// Returns the address of the u128 element at index ipu0_idx, counting from the start of mb8.
+	{
+		return ((u128*)&mb8) + ipu0_idx;	// pointer arithmetic in u128 elements, not bytes
+	}
+	
+	void AdvanceIpuDataBy(uint amt)	// Moves the cursor forward by amt units and shrinks the remaining count by the same amount.
+	{
+		pxAssumeDev(ipu0_data>=amt, "IPU FIFO Overflow on advance!" );	// ipu0_data is unsigned, so a larger amt would wrap it around
+		ipu0_idx += amt;
+		ipu0_data -= amt;
+	}
+	
+	bool ReadIpuData(u128* out);
 };
 
-extern void (__fastcall *mpeg2_idct_copy) (s16 * block, u8* dest, int stride);
-extern void (__fastcall *mpeg2_idct_add) (int last, s16 * block, s16* dest, int stride);
+struct mpeg2_scan_pack	// Bundles two 64-entry scan tables into one object.
+{
+	u8 norm[64];	// NOTE(review): presumably the normal zigzag scan order (decoder_t::scantype == false) -- confirm
+	u8 alt[64];		// NOTE(review): presumably the alternate scan order (decoder_t::scantype == true) -- confirm
+
+	mpeg2_scan_pack();	// defined elsewhere; presumably fills both tables -- confirm
+};
+
+extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
+extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
 
 #define IDEC	0
 #define BDEC	1
@@ -217,16 +255,12 @@ extern int get_macroblock_modes();
 extern int get_motion_delta(const int f_code);
 extern int get_dmv();
 
-extern int non_linear_quantizer_scale[];
-
 extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
 extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
 extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
 extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);
 
 extern int slice (u8 * buffer);
-/* idct.c */
-extern void mpeg2_idct_init ();
 
 #ifdef _MSC_VER
 #define BigEndian(out, in) out = _byteswap_ulong(in)
@@ -240,13 +274,12 @@ extern void mpeg2_idct_init ();
 #define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
 #endif
 
+extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
+extern const int non_linear_quantizer_scale[];
+
 // The IPU can only do one task at once and never uses other buffers so all mpeg state variables
 // are made available to mpeg/vlc modules as globals here:
 
 extern __aligned16 tIPU_BP g_BP;
 extern __aligned16 decoder_t decoder;
-extern __aligned16 macroblock_8 mb8;
-extern __aligned16 macroblock_16 mb16;
-extern __aligned16 macroblock_rgb32 rgb32;
-extern __aligned16 macroblock_rgb16 rgb16;
 
diff --git a/pcsx2/IPU/yuv2rgb.cpp b/pcsx2/IPU/yuv2rgb.cpp
index 4c86e436c1..6b0dcb216d 100644
--- a/pcsx2/IPU/yuv2rgb.cpp
+++ b/pcsx2/IPU/yuv2rgb.cpp
@@ -39,6 +39,9 @@
 // conforming implementation for reference, do not optimise
 void yuv2rgb_reference(void)
 {
+	const macroblock_8& mb8 = decoder.mb8;
+	macroblock_rgb32& rgb32 = decoder.rgb32;
+
 	for (int y = 0; y < 16; y++)
 		for (int x = 0; x < 16; x++)
 		{
@@ -124,8 +127,8 @@ __releaseinline void yuv2rgb_sse2(void)
 
 		align 16
 tworows:
-		movq xmm3, qword ptr [mb8+256+esi]
-		movq xmm1, qword ptr [mb8+320+esi]
+		movq xmm3, qword ptr [decoder.mb8+256+esi]
+		movq xmm1, qword ptr [decoder.mb8+320+esi]
 		pxor xmm2, xmm2
 		pxor xmm0, xmm0
 		// could skip the movq but punpck requires 128-bit alignment
@@ -170,7 +173,7 @@ ihatemsvc:
 		movaps xmm4, xmm1
 		movaps xmm5, xmm2
 
-		movaps xmm6, xmmword ptr [mb8+edi]
+		movaps xmm6, xmmword ptr [decoder.mb8+edi]
 		psubusb xmm6, xmmword ptr [edx+Y_BIAS]
 		movaps xmm7, xmm6
 		psllw xmm6, 8                    // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
@@ -235,10 +238,10 @@ ihatemsvc:
 		punpckhwd xmm4, xmm5
 
 		// at last
-		movaps xmmword ptr [rgb32+edi*4+0], xmm0
-		movaps xmmword ptr [rgb32+edi*4+16], xmm1
-		movaps xmmword ptr [rgb32+edi*4+32], xmm3
-		movaps xmmword ptr [rgb32+edi*4+48], xmm4
+		movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
+		movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
+		movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
+		movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
 
 		add edi, 16
 
@@ -255,6 +258,8 @@ ihatemsvc:
 	// offset to the middle of the sse2 table, so that we can use 1-byte address displacement
 	// to access all fields:
 	static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
+	static const u8* mb8 = (const u8*)&decoder.mb8;	// take the ADDRESS of the buffer; kept as a byte pointer because the asm adds raw byte displacements to it
+	static u8* rgb32 = (u8*)&decoder.rgb32;	// same for the output buffer written by the movaps stores
 
 	__asm__ __volatile__ (
 		".intel_syntax noprefix\n"
@@ -262,15 +267,10 @@ ihatemsvc:
 		"xor esi, esi\n"
 		"xor edi, edi\n"
 
-		// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
-		// This saves 2-3 bytes per instruction where these are used. :)
-		//"mov ecx, offset %c[yuv2rgb_temp]\n"
-		//"mov edx, offset %c[sse2_tables]+64\n"
-
 		".align 16\n"
 "tworows:\n"
-		"movq xmm3, qword ptr [mb8+256+esi]\n"
-		"movq xmm1, qword ptr [mb8+320+esi]\n"
+		"movq xmm3, qword ptr [%[mb8]+256+esi]\n"
+		"movq xmm1, qword ptr [%[mb8]+320+esi]\n"
 		"pxor xmm2, xmm2\n"
 		"pxor xmm0, xmm0\n"
 		// could skip the movq but punpck requires 128-bit alignment
@@ -310,7 +310,7 @@ ihatemsvc:
 		"movaps xmm4, xmm1\n"
 		"movaps xmm5, xmm2\n"
 
-		"movaps xmm6, xmmword ptr [mb8+edi]\n"
+		"movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
 		"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
 		"movaps xmm7, xmm6\n"
 		"psllw xmm6, 8\n"                   // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
@@ -375,10 +375,10 @@ ihatemsvc:
 		"punpckhwd xmm4, xmm5\n"
 
 		// at last
-		"movaps xmmword ptr [rgb32+edi*4+0], xmm0\n"
-		"movaps xmmword ptr [rgb32+edi*4+16], xmm1\n"
-		"movaps xmmword ptr [rgb32+edi*4+32], xmm3\n"
-		"movaps xmmword ptr [rgb32+edi*4+48], xmm4\n"
+		"movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
+		"movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
+		"movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
+		"movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
 
 		"add edi, 16\n"
 
@@ -393,15 +393,11 @@ ihatemsvc:
 		:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
 			[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
 			[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
-			[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset)
-		: "eax", "ebx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
+			[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset),
+			[mb8]"r"(mb8), [rgb32]"r"(rgb32)
+		: "eax", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
 	);
 #else
 #	error Unsupported compiler
 #endif
 }
-
-void yuv2rgb_init(void)
-{
-	/* For later reimplementation of C version */
-}
diff --git a/pcsx2/IPU/yuv2rgb.h b/pcsx2/IPU/yuv2rgb.h
index 37d1049028..8d1efb2e14 100644
--- a/pcsx2/IPU/yuv2rgb.h
+++ b/pcsx2/IPU/yuv2rgb.h
@@ -17,6 +17,5 @@
 
 #define yuv2rgb yuv2rgb_sse2
 
-extern void yuv2rgb_reference(void);
-extern void yuv2rgb_sse2(void);
-extern void yuv2rgb_init(void);
+extern void yuv2rgb_reference();
+extern void yuv2rgb_sse2();
diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h
index 3ff9d18b43..5d2783a545 100644
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@@ -24,7 +24,7 @@
 //  the lower 16 bit value.  IF the change is breaking of all compatibility with old
 //  states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
 
-static const u32 g_SaveVersion = 0x8b470000;
+static const u32 g_SaveVersion = 0x8b480000;
 
 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)