diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp
index 10593fbd2c..7b422b6c8d 100644
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@@ -36,12 +36,9 @@
 
 // IPU Inline'd IRQs : Calls the IPU interrupt handlers directly instead of
 // feeding them through the EE's branch test. (see IPU.h for details)
-
-
-
-static tIPU_DMA g_nDMATransfer(0);
-static tIPU_cmd ipu_cmd;
-static IPUStatus IPU1Status;
+tIPU_DMA g_nDMATransfer(0);
+tIPU_cmd ipu_cmd;
+IPUStatus IPU1Status;
 
 // FIXME - g_nIPU0Data and Pointer are not saved in the savestate, which breaks savestates for some
 // FMVs at random (if they get saved during the half frame of a 30fps rate).  The fix is complicated
@@ -53,9 +50,6 @@ void ReorderBitstream();
 
 // the BP doesn't advance and returns -1 if there is no data to be read
 tIPU_BP g_BP;
-static coroutine_t s_routine; // used for executing BDEC/IDEC
-static int s_RoutineDone = 0;
-static u32 s_tempstack[0x4000]; // 64k
 
 void IPUWorker();
 
@@ -78,7 +72,7 @@ __aligned16 macroblock_rgb16 rgb16;
 u8 indx4[16*16/2];
 bool mpeg2_inited = false;		//mpeg2_idct_init() must be called only once
 u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'};
-decoder_t g_decoder;						//static, only to place it in bss
+decoder_t decoder;						// static storage duration (a global, not heap-allocated), only to place it in bss
 decoder_t tempdec;
 
 extern "C"
@@ -98,14 +92,14 @@ __forceinline void IPUProcessInterrupt()
 void init_g_decoder()
 {
 	//other stuff
-	g_decoder.intra_quantizer_matrix = (u8*)iq;
-	g_decoder.non_intra_quantizer_matrix = (u8*)niq;
-	g_decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
-	g_decoder.mb8 = &mb8;
-	g_decoder.mb16 = &mb16;
-	g_decoder.rgb32 = &rgb32;
-	g_decoder.rgb16 = &rgb16;
-	g_decoder.stride = 16;
+	decoder.intra_quantizer_matrix = (u8*)iq;	// iq is the table ipuSETIQ fills for the intra case
+	decoder.non_intra_quantizer_matrix = (u8*)niq;	// niq is the table ipuSETIQ fills for the non-intra case
+	decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
+	decoder.mb8 = &mb8;	// the decoder is pointed at the file's global macroblock buffers (here and below)
+	decoder.mb16 = &mb16;
+	decoder.rgb32 = &rgb32;
+	decoder.rgb16 = &rgb16;
+	decoder.stride = 16;	// NOTE(review): presumably the 16-pixel macroblock width - confirm against mpeg2lib
 }
 
 void mpeg2_init()
@@ -159,7 +153,7 @@ void ReportIPU()
 	Console.WriteLn("vqclut = 0x%x.", vqclut);
 	Console.WriteLn("s_thresh = 0x%x.", s_thresh);
 	Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
-	Console.WriteLn("g_decoder = 0x%x.", g_decoder);
+	Console.WriteLn("g_decoder = 0x%x.", decoder);
 	Console.WriteLn("mpeg2: scan_norm = 0x%x, alt = 0x%x.", mpeg2_scan_norm, mpeg2_scan_alt);
 	Console.WriteLn(ipu_cmd.desc());
 	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
@@ -186,7 +180,7 @@ void SaveStateBase::ipuFreeze()
 	Freeze(vqclut);
 	Freeze(s_thresh);
 	Freeze(coded_block_pattern);
-	Freeze(g_decoder);
+	Freeze(decoder);
 	Freeze(mpeg2_scan_norm);
 	Freeze(mpeg2_scan_alt);
 
@@ -377,72 +371,67 @@ static void ipuBCLR(u32 val)
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }
 
-static BOOL ipuIDEC(u32 val)
+static BOOL ipuIDEC(u32 val, bool resume)	// resume: false when IPUCMD_WRITE starts the command, true when IPUWorker re-enters it
 {
 	tIPU_CMD_IDEC idec(val);
 
-	idec.log();
-	g_BP.BP += idec.FB;//skip FB bits
-	//from IPU_CTRL
-	ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
-	g_decoder.coding_type = ipuRegs->ctrl.PCT;
-	g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-	g_decoder.q_scale_type	= ipuRegs->ctrl.QST;
-	g_decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-	g_decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-	g_decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+	if (!resume)	// first call only: a re-entered command must not skip the FB bits or reload the decoder settings again
+	{
+		idec.log();
+		g_BP.BP += idec.FB;//skip FB bits
+		//from IPU_CTRL
+		ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
+		decoder.coding_type = ipuRegs->ctrl.PCT;
+		decoder.mpeg1 = ipuRegs->ctrl.MP1;
+		decoder.q_scale_type	= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
+		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
+		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
 
-	//from IDEC value
-	g_decoder.quantizer_scale = idec.QSC;
-	g_decoder.frame_pred_frame_dct = !idec.DTD;
-	g_decoder.sgn = idec.SGN;
-	g_decoder.dte = idec.DTE;
-	g_decoder.ofm = idec.OFM;
+		//from IDEC value
+		decoder.quantizer_scale = idec.QSC;
+		decoder.frame_pred_frame_dct = !idec.DTD;
+		decoder.sgn = idec.SGN;
+		decoder.dte = idec.DTE;
+		decoder.ofm = idec.OFM;
 
-	//other stuff
-	g_decoder.dcr = 1; // resets DC prediction value
+		//other stuff
+		decoder.dcr = 1; // resets DC prediction value
+	}
 
-	s_routine = so_create(mpeg2sliceIDEC, &s_RoutineDone, s_tempstack, sizeof(s_tempstack));
-	pxAssert(s_routine != NULL);
-	so_call(s_routine);
-	if (s_RoutineDone) s_routine = NULL;
-
-	return s_RoutineDone;
+	return mpeg2sliceIDEC();	// callers treat a FALSE result as "not finished, call again"; NOTE(review): presumably mpeg2sliceIDEC keeps its own progress between calls - confirm in mpeg2lib
 }
 
 static int s_bdec = 0;
 
-static __forceinline BOOL ipuBDEC(u32 val)
+static __forceinline BOOL ipuBDEC(u32 val, bool resume)	// resume: false when IPUCMD_WRITE starts the command, true when IPUWorker re-enters it
 {
 	tIPU_CMD_BDEC bdec(val);
 
-	bdec.log(s_bdec);
-	if (IsDebugBuild) s_bdec++;
+	if (!resume)	// first call only: a re-entered command must not skip the FB bits or clear the macroblock buffers again
+	{
+		bdec.log(s_bdec);
+		if (IsDebugBuild) s_bdec++;
 
-	g_BP.BP += bdec.FB;//skip FB bits
-	g_decoder.coding_type = I_TYPE;
-	g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-	g_decoder.q_scale_type	= ipuRegs->ctrl.QST;
-	g_decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-	g_decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-	g_decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+		g_BP.BP += bdec.FB;//skip FB bits
+		decoder.coding_type = I_TYPE;
+		decoder.mpeg1 = ipuRegs->ctrl.MP1;
+		decoder.q_scale_type	= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
+		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
+		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
 
-	//from BDEC value
-	/* JayteeMaster: the quantizer (linear/non linear) depends on the q_scale_type */
-	g_decoder.quantizer_scale = g_decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
-	g_decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
-	g_decoder.dcr = bdec.DCR;
-	g_decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
+		//from BDEC value
+		decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;	// q_scale_type selects the non-linear table lookup, otherwise the scale is simply QSC * 2
+		decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
+		decoder.dcr = bdec.DCR;
+		decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
 
-	memzero(mb8);
-	memzero(mb16);
+		memzero(mb8);
+		memzero(mb16);
+	}
 
-	s_routine = so_create(mpeg2_slice, &s_RoutineDone, s_tempstack, sizeof(s_tempstack));
-	pxAssert(s_routine != NULL);
-	so_call(s_routine);
-
-	if (s_RoutineDone) s_routine = NULL;
-	return s_RoutineDone;
+	return mpeg2_slice();	// callers treat a FALSE result as "not finished, call again"; NOTE(review): presumably mpeg2_slice keeps its own progress between calls - confirm in mpeg2lib
 }
 
 static BOOL __fastcall ipuVDEC(u32 val)
@@ -451,34 +440,34 @@ static BOOL __fastcall ipuVDEC(u32 val)
 	{
 		case 0:
 			ipuRegs->cmd.DATA = 0;
-			if (!getBits32((u8*)&g_decoder.bitstream_buf, 0)) return FALSE;
+			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return FALSE;
 
-			g_decoder.bitstream_bits = -16;
-			BigEndian(g_decoder.bitstream_buf, g_decoder.bitstream_buf);
+			decoder.bitstream_bits = -16;
+			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
 
 			switch ((val >> 26) & 3)
 			{
 				case 0://Macroblock Address Increment
-					g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-					ipuRegs->cmd.DATA = get_macroblock_address_increment(&g_decoder);
+					decoder.mpeg1 = ipuRegs->ctrl.MP1;
+					ipuRegs->cmd.DATA = get_macroblock_address_increment();
 					break;
 
-				case 1://Macroblock Type	//known issues: no error detected
-					g_decoder.frame_pred_frame_dct = 1;//prevent DCT_TYPE_INTERLACED
-					g_decoder.coding_type = ipuRegs->ctrl.PCT;
-					ipuRegs->cmd.DATA = get_macroblock_modes(&g_decoder);
+				case 1://Macroblock Type
+					decoder.frame_pred_frame_dct = 1;
+					decoder.coding_type = ipuRegs->ctrl.PCT;
+					ipuRegs->cmd.DATA = get_macroblock_modes();
 					break;
 
-				case 2://Motion Code		//known issues: no error detected
-					ipuRegs->cmd.DATA = get_motion_delta(&g_decoder, 0);
+				case 2://Motion Code
+					ipuRegs->cmd.DATA = get_motion_delta(0);
 					break;
 
 				case 3://DMVector
-					ipuRegs->cmd.DATA = get_dmv(&g_decoder);
+					ipuRegs->cmd.DATA = get_dmv();
 					break;
 			}
 
-			g_BP.BP += (g_decoder.bitstream_bits + 16);
+			g_BP.BP += (int)decoder.bitstream_bits + 16;
 
 			if ((int)g_BP.BP < 0)
 			{
@@ -486,9 +475,7 @@ static BOOL __fastcall ipuVDEC(u32 val)
 				ReorderBitstream();
 			}
 
-			FillInternalBuffer(&g_BP.BP, 1, 0);
-
-			ipuRegs->cmd.DATA = (ipuRegs->cmd.DATA & 0xFFFF) | ((g_decoder.bitstream_bits + 16) << 16);
+			ipuRegs->cmd.DATA = (ipuRegs->cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs->ctrl.ECD = (ipuRegs->cmd.DATA == 0);
 
 		case 1:
@@ -529,7 +516,10 @@ static BOOL ipuSETIQ(u32 val)
 
 	if ((val >> 27) & 1)
 	{
-		ipu_cmd.pos[0] += getBits((u8*)niq + ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1); // 8*8*8
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
+		{
+			if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+		}
 
 		IPU_LOG("Read non-intra quantization matrix from IPU FIFO.");
 		for (i = 0; i < 8; i++)
@@ -541,7 +531,10 @@ static BOOL ipuSETIQ(u32 val)
 	}
 	else
 	{
-		ipu_cmd.pos[0] += getBits((u8*)iq + 8 * ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1);
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
+		{
+			if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+		}
 
 		IPU_LOG("Read intra quantization matrix from IPU FIFO.");
 		for (i = 0; i < 8; i++)
@@ -552,40 +545,40 @@ static BOOL ipuSETIQ(u32 val)
 		}
 	}
 
-	return ipu_cmd.pos[0] == 64;
+	return TRUE;
 }
 
 static BOOL ipuSETVQ(u32 val)
 {
-	ipu_cmd.pos[0] += getBits((u8*)vqclut + ipu_cmd.pos[0], 256 - 8 * ipu_cmd.pos[0], 1); // 16*2*8
-
-	if (ipu_cmd.pos[0] == 32)
+	for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)	// 4 x 64 bits = 16*2*8 bits of table; pos[0] lives in ipu_cmd, so a later call continues at the first unread chunk
 	{
-		IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
-		IPU_LOG(
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
-		    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
-		    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
-		    vqclut[2] >> 10, (vqclut[2] >> 5) & 0x1F, vqclut[2] & 0x1F,
-		    vqclut[3] >> 10, (vqclut[3] >> 5) & 0x1F, vqclut[3] & 0x1F,
-		    vqclut[4] >> 10, (vqclut[4] >> 5) & 0x1F, vqclut[4] & 0x1F,
-		    vqclut[5] >> 10, (vqclut[5] >> 5) & 0x1F, vqclut[5] & 0x1F,
-		    vqclut[6] >> 10, (vqclut[6] >> 5) & 0x1F, vqclut[6] & 0x1F,
-		    vqclut[7] >> 10, (vqclut[7] >> 5) & 0x1F, vqclut[7] & 0x1F,
-		    vqclut[8] >> 10, (vqclut[8] >> 5) & 0x1F, vqclut[8] & 0x1F,
-		    vqclut[9] >> 10, (vqclut[9] >> 5) & 0x1F, vqclut[9] & 0x1F,
-		    vqclut[10] >> 10, (vqclut[10] >> 5) & 0x1F, vqclut[10] & 0x1F,
-		    vqclut[11] >> 10, (vqclut[11] >> 5) & 0x1F, vqclut[11] & 0x1F,
-		    vqclut[12] >> 10, (vqclut[12] >> 5) & 0x1F, vqclut[12] & 0x1F,
-		    vqclut[13] >> 10, (vqclut[13] >> 5) & 0x1F, vqclut[13] & 0x1F,
-		    vqclut[14] >> 10, (vqclut[14] >> 5) & 0x1F, vqclut[14] & 0x1F,
-		    vqclut[15] >> 10, (vqclut[15] >> 5) & 0x1F, vqclut[15] & 0x1F);
+		if (!getBits64((u8*)vqclut + 8 * ipu_cmd.pos[0], 1)) return FALSE;	// fewer than 64 bits available: report "not done" and keep pos[0]
 	}
 
-	return ipu_cmd.pos[0] == 32;
+	IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
+	IPU_LOG(	// one a:b:c triplet per table entry (bits 10 and up, bits 5-9, bits 0-4), all separated by single spaces
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
+	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
+	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
+	    vqclut[2] >> 10, (vqclut[2] >> 5) & 0x1F, vqclut[2] & 0x1F,
+	    vqclut[3] >> 10, (vqclut[3] >> 5) & 0x1F, vqclut[3] & 0x1F,
+	    vqclut[4] >> 10, (vqclut[4] >> 5) & 0x1F, vqclut[4] & 0x1F,
+	    vqclut[5] >> 10, (vqclut[5] >> 5) & 0x1F, vqclut[5] & 0x1F,
+	    vqclut[6] >> 10, (vqclut[6] >> 5) & 0x1F, vqclut[6] & 0x1F,
+	    vqclut[7] >> 10, (vqclut[7] >> 5) & 0x1F, vqclut[7] & 0x1F,
+	    vqclut[8] >> 10, (vqclut[8] >> 5) & 0x1F, vqclut[8] & 0x1F,
+	    vqclut[9] >> 10, (vqclut[9] >> 5) & 0x1F, vqclut[9] & 0x1F,
+	    vqclut[10] >> 10, (vqclut[10] >> 5) & 0x1F, vqclut[10] & 0x1F,
+	    vqclut[11] >> 10, (vqclut[11] >> 5) & 0x1F, vqclut[11] & 0x1F,
+	    vqclut[12] >> 10, (vqclut[12] >> 5) & 0x1F, vqclut[12] & 0x1F,
+	    vqclut[13] >> 10, (vqclut[13] >> 5) & 0x1F, vqclut[13] & 0x1F,
+	    vqclut[14] >> 10, (vqclut[14] >> 5) & 0x1F, vqclut[14] & 0x1F,
+	    vqclut[15] >> 10, (vqclut[15] >> 5) & 0x1F, vqclut[15] & 0x1F);
+
+	return TRUE;	// only reached once all four chunks have been read
 }
 
 // IPU Transfers are split into 8Qwords so we need to send ALL the data
@@ -596,17 +589,14 @@ static BOOL __fastcall ipuCSC(u32 val)
 
 	for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
 	{
-
-		if (ipu_cmd.pos[0] < 3072 / 8)
+		for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
 		{
-			ipu_cmd.pos[0] += getBits((u8*) & mb8 + ipu_cmd.pos[0], 3072 - 8 * ipu_cmd.pos[0], 1);
-
-			if (ipu_cmd.pos[0] < 3072 / 8) return FALSE;
-
-			ipu_csc(&mb8, &rgb32, 0);
-			if (csc.OFM) ipu_dither(&rgb32, &rgb16, csc.DTE);
+			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}
 
+		ipu_csc(&mb8, &rgb32, 0);
+		if (csc.OFM) ipu_dither(&rgb32, &rgb16, csc.DTE);
+		
 		if (csc.OFM)
 		{
 			while (ipu_cmd.pos[1] < 32)
@@ -641,18 +631,16 @@ static BOOL ipuPACK(u32 val)
 
 	for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
 	{
-		if (ipu_cmd.pos[0] < 512)
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
 		{
-			ipu_cmd.pos[0] += getBits((u8*) & mb8 + ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1);
-
-			if (ipu_cmd.pos[0] < 64) return FALSE;
-
-			ipu_csc(&mb8, &rgb32, 0);
-			ipu_dither(&rgb32, &rgb16, csc.DTE);
-
-			if (csc.OFM) ipu_vq(&rgb16, indx4);
+			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}
 
+		ipu_csc(&mb8, &rgb32, 0);
+		ipu_dither(&rgb32, &rgb16, csc.DTE);
+
+		if (csc.OFM) ipu_vq(&rgb16, indx4);
+		
 		if (csc.OFM)
 		{
 			ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
@@ -696,7 +684,7 @@ void IPUCMD_WRITE(u32 val)
 	ipuRegs->ctrl.ECD = 0;
 	ipuRegs->ctrl.SCD = 0; //clear ECD/SCD
 	ipuRegs->cmd.DATA = val;
-	ipu_cmd.pos[0] = 0;
+	ipu_cmd.clear();
 
 	switch (ipuRegs->cmd.CMD)
 	{
@@ -759,29 +747,27 @@ void IPUCMD_WRITE(u32 val)
 			break;
 
 		case SCE_IPU_IDEC:
-			if (ipuIDEC(val))
+			if (ipuIDEC(val, false))
 			{
 				// idec done, ipu0 done too
 				if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
 				return;
 			}
+
 			ipuRegs->topbusy = 0x80000000;
-			// have to resort to the thread
-			ipu_cmd.current = val >> 28;
-			ipuRegs->ctrl.BUSY = 1;
-			return;
+			break;
 
 		case SCE_IPU_BDEC:
-			if (ipuBDEC(val))
+			if (ipuBDEC(val, false))
 			{
 				if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
 				if (ipuRegs->ctrl.SCD || ipuRegs->ctrl.ECD) hwIntcIrq(INTC_IPU);
 				return;
 			}
-			ipuRegs->topbusy = 0x80000000;
-			ipu_cmd.current = val >> 28;
-			ipuRegs->ctrl.BUSY = 1;
-			return;
+			else
+			{
+				ipuRegs->topbusy = 0x80000000;
+			}
 	}
 
 	// have to resort to the thread
@@ -850,8 +836,7 @@ void IPUWorker()
 			break;
 
 		case SCE_IPU_IDEC:
-			so_call(s_routine);
-			if (!s_RoutineDone)
+			if (!ipuIDEC(ipuRegs->cmd.DATA, true))
 			{
 				if(ipu1dma->chcr.STR == false) hwIntcIrq(INTC_IPU);
 				return;
@@ -865,12 +850,10 @@ void IPUWorker()
 
 			// CHECK!: IPU0dma remains when IDEC is done, so we need to clear it
 			if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
-			s_routine = NULL;
 			break;
 
 		case SCE_IPU_BDEC:
-			so_call(s_routine);
-			if (!s_RoutineDone)
+			if (!ipuBDEC(ipuRegs->cmd.DATA, true))
 			{
 				if(ipu1dma->chcr.STR == false) hwIntcIrq(INTC_IPU);
 				return;
@@ -882,7 +865,6 @@ void IPUWorker()
 			ipu_cmd.current = 0xffffffff;
 
 			if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
-			s_routine = NULL;
 			if (ipuRegs->ctrl.SCD || ipuRegs->ctrl.ECD) hwIntcIrq(INTC_IPU);
 			return;
 
@@ -946,7 +928,7 @@ u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
 		g_BP.FP = 1;
 	}
 	
-	if ((g_BP.FP < 2) && (*(int*)pointer + size) >= 128)
+	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
 	{
 		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
 	}
@@ -967,6 +949,83 @@ u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
 	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
 }
 
+// Copies 128 bits of IPU bitstream, starting at bit offset g_BP.BP of readbits, to address.
+// Returns 0 if FillInternalBuffer reports fewer than 128 bits, else 1; this function itself adds 128 to g_BP.BP only if advance != 0.
+u8 __fastcall getBits128(u8 *address, u32 advance)
+{
+	u64 mask2;
+	u128 mask;
+	u32 shift;
+	u8* readpos;
+
+	// Give up unless the internal buffer can supply all 128 bits
+	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
+
+	readpos = readbits + (int)g_BP.BP / 8;
+
+	if (g_BP.BP & 7)
+	{
+		shift = g_BP.BP & 7;
+		mask2 = 0xff >> shift; // selects the low (8 - shift) bits of a byte; replicated into all 8 byte lanes below
+		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
+		mask.hi = mask.lo;
+		// Every output byte = ((current byte & mask) << shift) | ((next byte & ~mask) >> (8 - shift)); same scheme as getBits64.
+		u128 notMask;
+		u128 data = *(u128*)(readpos + 1);
+		notMask.lo = ~mask.lo & data.lo;
+		notMask.hi = ~mask.hi & data.hi;
+		notMask.lo >>= 8 - shift;
+		// No carry between the 64-bit halves: both shifts stay inside their own byte lane (see above). The carry that used to be OR'ed in here leaked bits of the upper half into the top byte of the lower half whenever shift was 5, 6 or 7.
+		notMask.hi >>= 8 - shift;
+
+		mask.hi = ((*(u128*)readpos).hi & mask.hi) << shift;
+		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
+
+		notMask.lo |= mask.lo;
+		notMask.hi |= mask.hi;
+		*(u128*)address = notMask;
+	}
+	else
+	{
+		*(u128*)address = *(u128*)readpos;
+	}
+
+	if (advance) g_BP.BP += 128;
+
+	return 1;
+}
+
+// Copies 64 bits of IPU bitstream, starting at bit offset g_BP.BP of readbits, to address.
+// Returns 0 if FillInternalBuffer reports fewer than 64 bits, else 1; this function itself adds 64 to g_BP.BP only if advance != 0.
+u8 __fastcall getBits64(u8 *address, u32 advance)
+{
+	u64 mask = 0;
+	int shift = 0;
+	u8* readpos;
+
+	// Give up unless the internal buffer can supply all 64 bits
+	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
+
+	readpos = readbits + (int)g_BP.BP / 8;
+
+	if (g_BP.BP & 7)
+	{
+		shift = g_BP.BP & 7;
+		mask = (0xff >> shift); // selects the low (8 - shift) bits of a byte; replicated into all 8 byte lanes below
+		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);
+		// Every output byte takes its high bits from the current input byte and its low bits from the next one; neither shift leaves its byte lane.
+		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
+	}
+	else
+	{
+		*(u64*)address = *(u64*)readpos;
+	}
+
+	if (advance) g_BP.BP += 64;
+
+	return 1;
+}
+
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
 u8 __fastcall getBits32(u8 *address, u32 advance)
@@ -1053,102 +1112,6 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
 	return 1;
 }
 
-int __fastcall getBits(u8 *address, u32 size, u32 advance)
-{
-	register u32 mask = 0, shift = 0, howmuch;
-	u8* oldbits, *oldaddr = address;
-	u32 pointer = 0, temp;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8) return 0;
-
-	oldbits = readbits;
-	// Backup the current BP in case of VDEC/FDEC
-	pointer = g_BP.BP;
-
-	if (pointer & 7)
-	{
-		address--;
-		while (size)
-		{
-			if (shift == 0)
-			{
-				*++address = 0;
-				shift = 8;
-			}
-
-			temp = shift; // Lets not pass a register to min.
-			howmuch = min(min(8 - (pointer & 7), 128 - pointer), min(size, temp));
-
-			if (FillInternalBuffer(&pointer, advance, 8) < 8)
-			{
-				if (advance) g_BP.BP = pointer;
-				return address - oldaddr;
-			}
-
-			mask = ((0xFF >> (pointer & 7)) << (8 - howmuch - (pointer & 7))) & 0xFF;
-			mask &= readbits[((pointer) >> 3)];
-			mask >>= 8 - howmuch - (pointer & 7);
-			pointer += howmuch;
-			size -= howmuch;
-			shift -= howmuch;
-			*address |= mask << shift;
-		}
-		++address;
-	}
-	else
-	{
-		u8* readmem;
-		while (size)
-		{
-			if (FillInternalBuffer(&pointer, advance, 8) < 8)
-			{
-				if (advance) g_BP.BP = pointer;
-				return address -oldaddr;
-			}
-
-			howmuch = min(128 - pointer, size);
-			size -= howmuch;
-
-			readmem = readbits + (pointer >> 3);
-			pointer += howmuch;
-			howmuch >>= 3;
-
-			while (howmuch >= 4)
-			{
-				*(u32*)address = *(u32*)readmem;
-				howmuch -= 4;
-				address += 4;
-				readmem += 4;
-			}
-
-			switch (howmuch)
-			{
-				case 3:
-					address[2] = readmem[2];
-				case 2:
-					address[1] = readmem[1];
-				case 1:
-					address[0] = readmem[0];
-				case 0:
-					break;
-
-					jNO_DEFAULT
-			}
-
-			address += howmuch;
-		}
-	}
-
-	// If not advance then reset the Reading buffer value
-	if (advance)
-		g_BP.BP = pointer;
-	else
-		readbits = oldbits; // restore the last pointer
-
-	return address - oldaddr;
-}
-
 ///////////////////// CORE FUNCTIONS /////////////////
 void Skl_YUV_To_RGB32_MMX(u8 *RGB, const int Dst_BpS, const u8 *Y, const u8 *U, const u8 *V,
                           const int Src_BpS, const int Width, const int Height);
@@ -1244,7 +1207,7 @@ static __forceinline void ipuDmacSrcChain()
 		{
 			case TAG_REFE: // refe
 				//if(IPU1Status.InProgress == false) ipu1dma->tadr += 16;
-				if(IPU1Status.DMAFinished == false) IPU1Status.DMAFinished = true;
+				IPU1Status.DMAFinished = true;
 				break;
 			case TAG_CNT: // cnt
 				// Set the taddr to the next tag
@@ -1264,7 +1227,7 @@ static __forceinline void ipuDmacSrcChain()
 
 			case TAG_END: // end
 				ipu1dma->tadr = ipu1dma->madr;
-				if(IPU1Status.DMAFinished == false) IPU1Status.DMAFinished = true;
+				IPU1Status.DMAFinished = true;
 				break;
 		}
 }
@@ -1300,7 +1263,6 @@ static __forceinline int IPU1chain() {
 
 	if (ipu1dma->qwc > 0 && IPU1Status.InProgress == true)
 	{
-
 		int qwc = ipu1dma->qwc;
 		u32 *pMem;
 
@@ -1308,7 +1270,8 @@ static __forceinline int IPU1chain() {
 
 		if (pMem == NULL)
 		{
-			Console.Error("ipu1dma NULL!"); return totalqwc;
+			Console.Error("ipu1dma NULL!");
+			return totalqwc;
 		}
 
 		//Write our data to the fifo
@@ -1484,7 +1447,6 @@ int IPU1dma()
 	}
 	else 
 	{
-		IPU_LOG("Here");
 		cpuRegs.eCycle[4] = 0x9999;//IPU_INT_TO(2048);
 	}
 
@@ -1601,7 +1563,6 @@ __forceinline void dmaIPU1() // toIPU
 
 		IPU1Status.DMAMode = DMA_MODE_CHAIN;
 		IPU1dma();
-		//if (ipuRegs->ctrl.BUSY) IPUWorker();
 	}
 	else //Normal Mode
 	{
@@ -1623,7 +1584,6 @@ __forceinline void dmaIPU1() // toIPU
 			IPU1Status.DMAFinished = true;
 			IPU1Status.DMAMode = DMA_MODE_NORMAL;
 			IPU1dma();
-			//if (ipuRegs->ctrl.BUSY) IPUWorker();
 		}
 	}
 }
diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h
index ab831329aa..bb23f05e33 100644
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@@ -17,7 +17,6 @@
 #define __IPU_H__
 
 #include "mpeg2lib/Mpeg.h"
-#include "coroutine.h"
 #include "IPU_Fifo.h"
 
 #ifdef _MSC_VER
@@ -327,7 +326,7 @@ struct IPUregisters {
 struct tIPU_cmd
 {
 	int index;
-	int pos[2];
+	int pos[6];
 	int current;
 	void clear()
 	{
@@ -342,12 +341,13 @@ struct tIPU_cmd
 	}
 };
 
-//extern tIPU_cmd ipu_cmd;
+extern tIPU_cmd ipu_cmd;
 extern tIPU_BP g_BP;
 extern int coded_block_pattern;
 extern int g_nIPU0Data; // or 0x80000000 whenever transferring
 extern u8* g_pIPU0Pointer;
-
+extern IPUStatus IPU1Status;
+extern tIPU_DMA g_nDMATransfer;
 // The IPU can only do one task at once and never uses other buffers so these
 // should be made available to functions in other modules to save registers.
 extern __aligned16 macroblock_rgb32	rgb32;
@@ -376,10 +376,11 @@ extern int IPU0dma();
 extern int IPU1dma();
 
 extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
+extern u8 __fastcall getBits128(u8 *address, u32 advance);
+extern u8 __fastcall getBits64(u8 *address, u32 advance);
 extern u8 __fastcall getBits32(u8 *address, u32 advance);
 extern u8 __fastcall getBits16(u8 *address, u32 advance);
 extern u8 __fastcall getBits8(u8 *address, u32 advance);
-extern int __fastcall getBits(u8 *address, u32 size, u32 advance);
 
 
 #endif
diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp
index 66b1af502c..2435903aa3 100644
--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@@ -13,7 +13,6 @@
  *  If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #include "PrecompiledHeader.h"
 #include "Common.h"
 #include "IPU_Fifo.h"
@@ -106,20 +105,18 @@ int IPU_Fifo_Output::write(const u32 *value, int size)
 
 	ipuRegs->ctrl.OFC += firsttrans;
 	IPU0dma();
-	//Console.WriteLn("Written %d qwords, %d", firsttrans,ipuRegs->ctrl.OFC);
 
 	return firsttrans;
 }
 
 int IPU_Fifo_Input::read(void *value)
 {
-	// wait until enough data
-	if (g_BP.IFC < 8)
+	// wait until enough data to ensure proper streaming.
+	if (g_BP.IFC < 4)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
-			//DevCon.Warning("Setting ECycle");
 			CPU_INT( DMAC_TO_IPU, 4 );
 		}
 		
diff --git a/pcsx2/IPU/acoroutine.S b/pcsx2/IPU/acoroutine.S
deleted file mode 100644
index 2c28a2c248..0000000000
--- a/pcsx2/IPU/acoroutine.S
+++ /dev/null
@@ -1,78 +0,0 @@
-.intel_syntax noprefix
-
-.extern g_pCurrentRoutine
-
-.globl so_call
-so_call:
-		mov eax, dword ptr [esp+4]
-        test dword ptr [eax+24], 1
-        jnz RestoreRegs
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov dword ptr [eax+24], 1
-        jmp CallFn
-RestoreRegs:
-        // have to load and save at the same time
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-CallFn:
-        mov [g_pCurrentRoutine], eax
-        mov ecx, esp
-        mov esp, [eax+4]
-        mov [eax+4], ecx
-
-        jmp dword ptr [eax]
-
-.globl so_resume
-so_resume:
-		mov eax, [g_pCurrentRoutine]
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-        // put the return address in pcalladdr
-        mov ecx, [esp]
-        mov [eax], ecx
-        add esp, 4 // remove the return address
-
-        // swap stack pointers
-        mov ecx, [eax+4]
-        mov [eax+4], esp
-        mov esp, ecx
-        ret
-
-.globl so_exit
-so_exit:
-		mov eax, [g_pCurrentRoutine]
-        mov esp, [eax+4]
-        mov ebx, [eax+8]
-        mov esi, [eax+12]
-        mov edi, [eax+16]
-        mov ebp, [eax+20]
-        ret
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
diff --git a/pcsx2/IPU/acoroutine.asm b/pcsx2/IPU/acoroutine.asm
deleted file mode 100644
index d81a5f12d2..0000000000
--- a/pcsx2/IPU/acoroutine.asm
+++ /dev/null
@@ -1,140 +0,0 @@
-; Pcsx2 - Pc Ps2 Emulator
-; Copyright (C) 2002-2008  Pcsx2 Team
-;
-; This program is free software; you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-
-; This program is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with this program; if not, write to the Free Software
-; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
-
-;; x86-64 coroutine fucntions
-extern g_pCurrentRoutine:ptr
-
-.code
-
-so_call proc public
-        test dword ptr [rcx+88], 1
-        jnz so_call_RestoreRegs
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov dword ptr [rcx+88], 1
-        jmp so_call_CallFn
-so_call_RestoreRegs:
-		;; have to load and save at the same time
-        ;; rbp, rbx, r12
-        mov rax, [rcx+24]
-        mov r8, [rcx+16]
-        mov rdx, [rcx+32]
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov rbp, rax
-        mov rbx, r8
-        mov r12, rdx
-        ;; r13, r14, r15
-        mov rax, [rcx+40]
-        mov r8, [rcx+48]
-        mov rdx, [rcx+56]
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov r13, rax
-        mov r14, r8
-        mov r15, rdx
-
-        ;; rsi, rdi
-        mov rax, [rcx+64]
-        mov rdx, [rcx+72]
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov rsi, rax
-        mov rdi, rdx
-        
-so_call_CallFn:
-        mov [g_pCurrentRoutine], rcx
-
-		;; swap the stack
-        mov rax, [rcx+8]
-        mov [rcx+8], rsp
-        mov rsp, rax
-        mov rax, [rcx+0]
-        mov rcx, [rcx+80]
-
-        jmp rax
-
-so_call endp
-
-; so_resume
-so_resume proc public
-        ;; rbp, rbx, r12
-        mov rcx, [g_pCurrentRoutine]
-        mov rax, [rcx+24]
-        mov r8, [rcx+16]
-        mov rdx, [rcx+32]
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov rbp, rax
-        mov rbx, r8
-        mov r12, rdx
-        ;; r13, r14, r15
-        mov rax, [rcx+40]
-        mov r8, [rcx+48]
-        mov rdx, [rcx+56]
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov r13, rax
-        mov r14, r8
-        mov r15, rdx
-        ;; rsi, rdi
-        mov rax, [rcx+64]
-        mov rdx, [rcx+72]
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov rsi, rax
-        mov rdi, rdx
-
-		;; put the return address in pcalladdr
-        mov rax, [rsp]
-        mov [rcx], rax
-        add rsp, 8 ;; remove the return address
-
-		;; swap stack pointers
-        mov rax, [rcx+8]
-        mov [rcx+8], rsp
-        mov rsp, rax
-
-        ret
-
-so_resume endp
-
-so_exit proc public
-        mov rcx, [g_pCurrentRoutine]
-        mov rsp, [rcx+8]
-        mov rbp, [rcx+24]
-        mov rbx, [rcx+16]
-        mov r12, [rcx+32]
-        mov r13, [rcx+40]
-        mov r14, [rcx+48]
-        mov r15, [rcx+56]
-        mov rsi, [rcx+64]
-        mov rdi, [rcx+72]
-        ret
-so_exit endp
-
-end
\ No newline at end of file
diff --git a/pcsx2/IPU/coroutine.cpp b/pcsx2/IPU/coroutine.cpp
deleted file mode 100644
index b2992091b1..0000000000
--- a/pcsx2/IPU/coroutine.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*  PCSX2 - PS2 Emulator for PCs
- *  Copyright (C) 2002-2010  PCSX2 Dev Team
- *
- *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
- *  of the GNU Lesser General Public License as published by the Free Software Found-
- *  ation, either version 3 of the License, or (at your option) any later version.
- *
- *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- *  PURPOSE.  See the GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along with PCSX2.
- *  If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#include "PrecompiledHeader.h"
-
-#include "coroutine.h"
-
-struct coroutine {
-	void* pcalladdr;
-	void *pcurstack;
-
-    uptr storeebx, storeesi, storeedi, storeebp;
-
-    s32 restore; // if nonzero, restore the registers
-    s32 alloc;
-	//struct s_coroutine *caller;
-	//struct s_coroutine *restarget;
-
-};
-
-#define CO_STK_ALIGN 256
-#define CO_STK_COROSIZE ((sizeof(coroutine) + CO_STK_ALIGN - 1) & ~(CO_STK_ALIGN - 1))
-#define CO_MIN_SIZE (4 * 1024)
-
-coroutine* g_pCurrentRoutine;
-
-coroutine_t so_create(void (*func)(void *), void *data, void *stack, int size)
-{
-	void* endstack;
-	int alloc = 0; // r = CO_STK_COROSIZE;
-	coroutine *co;
-
-	if ((size &= ~(sizeof(s32) - 1)) < CO_MIN_SIZE) return NULL;
-	if (!stack) {
-		size = (size + sizeof(coroutine) + CO_STK_ALIGN - 1) & ~(CO_STK_ALIGN - 1);
-		stack = malloc(size);
-		if (!stack) return NULL;
-		alloc = size;
-	}
-	endstack = (char*)stack + size - 64;
-	co = (coroutine*)stack;
-	stack = (char *) stack + CO_STK_COROSIZE;
-	*(void**)endstack = NULL;
-	*(void**)((char*)endstack+sizeof(void*)) = data;
-	co->alloc = alloc;
-	co->pcalladdr = (void*)func;
-	co->pcurstack = endstack;
-	return co;
-}
-
-void so_delete(coroutine_t coro)
-{
-	coroutine *co = (coroutine *) coro;
-	pxAssert( co != NULL );
-	if (co->alloc) free(co);
-}
-
-// see acoroutines.S and acoroutines.asm for other asm implementations
-#if defined(_MSC_VER)
-
-__declspec(naked) void so_call(coroutine_t coro)
-{
-    __asm {
-        mov eax, dword ptr [esp+4]
-        test dword ptr [eax+24], 1
-        jnz RestoreRegs
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov dword ptr [eax+24], 1
-        jmp CallFn
-RestoreRegs:
-        // have to load and save at the same time
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-CallFn:
-        mov [g_pCurrentRoutine], eax
-        mov ecx, esp
-        mov esp, [eax+4]
-        mov [eax+4], ecx
-
-        jmp dword ptr [eax]
-    }
-}
-
-__declspec(naked) void so_resume(void)
-{
-    __asm {
-        mov eax, [g_pCurrentRoutine]
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-        // put the return address in pcalladdr
-        mov ecx, [esp]
-        mov [eax], ecx
-        add esp, 4 // remove the return address
-
-        // swap stack pointers
-        mov ecx, [eax+4]
-        mov [eax+4], esp
-        mov esp, ecx
-        ret
-    }
-}
-
-__declspec(naked) void so_exit(void)
-{
-    __asm {
-        mov eax, [g_pCurrentRoutine]
-        mov esp, [eax+4]
-        mov ebx, [eax+8]
-        mov esi, [eax+12]
-        mov edi, [eax+16]
-        mov ebp, [eax+20]
-        ret
-    }
-}
-#endif
diff --git a/pcsx2/IPU/coroutine.h b/pcsx2/IPU/coroutine.h
deleted file mode 100644
index 7d40348450..0000000000
--- a/pcsx2/IPU/coroutine.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*  PCSX2 - PS2 Emulator for PCs
- *  Copyright (C) 2002-2010  PCSX2 Dev Team
- *
- *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
- *  of the GNU Lesser General Public License as published by the Free Software Found-
- *  ation, either version 3 of the License, or (at your option) any later version.
- *
- *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- *  PURPOSE.  See the GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along with PCSX2.
- *  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef PCSX2_COROUTINE_LIB
-#define PCSX2_COROUTINE_LIB
-
-// low level coroutine library
-typedef void *coroutine_t;
-
-coroutine_t so_create(void (*func)(void *), void *data, void *stack, int size);
-void so_delete(coroutine_t coro);
-
-#include "NakedAsm.h"
-
-#endif
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
index c7c8afca15..7b7a278fa9 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
@@ -48,55 +48,51 @@ int non_linear_quantizer_scale [] =
 	back to the 1st slot when 128bits have been read.
 */
 extern void ReorderBitstream();
+const DCTtab * tab;
+int mbaCount = 0;
 
-int get_macroblock_modes(decoder_t * const decoder)
+int get_macroblock_modes()
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
 	int macroblock_modes;
 	const MBtab * tab;
 
-	switch (decoder->coding_type)
+	switch (decoder.coding_type)
 	{
-
 		case I_TYPE:
-			macroblock_modes = UBITS(bit_buf, 2);
+			macroblock_modes = UBITS(2);
 
 			if (macroblock_modes == 0) return 0;   // error
 
 			tab = MB_I + (macroblock_modes >> 1);
-			DUMPBITS(bit_buf, bits, tab->len);
+			DUMPBITS(tab->len);
 			macroblock_modes = tab->modes;
 
-			if ((!(decoder->frame_pred_frame_dct)) &&
-			        (decoder->picture_structure == FRAME_PICTURE))
+			if ((!(decoder.frame_pred_frame_dct)) &&
+			        (decoder.picture_structure == FRAME_PICTURE))
 			{
-				macroblock_modes |= UBITS(bit_buf, 1) * DCT_TYPE_INTERLACED;
-				DUMPBITS(bit_buf, bits, 1);
+				macroblock_modes |= GETBITS(1) * DCT_TYPE_INTERLACED;
 			}
 			return macroblock_modes;
 
 		case P_TYPE:
-			macroblock_modes = UBITS(bit_buf, 6);
+			macroblock_modes = UBITS(6);
 
 			if (macroblock_modes == 0) return 0;   // error
 
 			tab = MB_P + (macroblock_modes >> 1);
-			DUMPBITS(bit_buf, bits, tab->len);
+			DUMPBITS(tab->len);
 			macroblock_modes = tab->modes;
 
-			if (decoder->picture_structure != FRAME_PICTURE)
+			if (decoder.picture_structure != FRAME_PICTURE)
 			{
 				if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
 				{
-					macroblock_modes |= UBITS(bit_buf, 2) * MOTION_TYPE_BASE;
-					DUMPBITS(bit_buf, bits, 2);
+					macroblock_modes |= GETBITS(2) * MOTION_TYPE_BASE;
 				}
 
 				return macroblock_modes;
 			}
-			else if (decoder->frame_pred_frame_dct)
+			else if (decoder.frame_pred_frame_dct)
 			{
 				if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
 					macroblock_modes |= MC_FRAME;
@@ -107,39 +103,36 @@ int get_macroblock_modes(decoder_t * const decoder)
 			{
 				if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
 				{
-					macroblock_modes |= UBITS(bit_buf, 2) * MOTION_TYPE_BASE;
-					DUMPBITS(bit_buf, bits, 2);
+					macroblock_modes |= GETBITS(2) * MOTION_TYPE_BASE;
 				}
 
 				if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN))
 				{
-					macroblock_modes |= UBITS(bit_buf, 1) * DCT_TYPE_INTERLACED;
-					DUMPBITS(bit_buf, bits, 1);
+					macroblock_modes |= GETBITS(1) * DCT_TYPE_INTERLACED;
 				}
 
 				return macroblock_modes;
 			}
 
 		case B_TYPE:
-			macroblock_modes = UBITS(bit_buf, 6);
+			macroblock_modes = UBITS(6);
 
 			if (macroblock_modes == 0) return 0;   // error
 
 			tab = MB_B + macroblock_modes;
-			DUMPBITS(bit_buf, bits, tab->len);
+			DUMPBITS(tab->len);
 			macroblock_modes = tab->modes;
 
-			if (decoder->picture_structure != FRAME_PICTURE)
+			if (decoder.picture_structure != FRAME_PICTURE)
 			{
 				if (!(macroblock_modes & MACROBLOCK_INTRA))
 				{
-					macroblock_modes |= UBITS(bit_buf, 2) * MOTION_TYPE_BASE;
-					DUMPBITS(bit_buf, bits, 2);
+					macroblock_modes |= GETBITS(2) * MOTION_TYPE_BASE;
 				}
 
 				return macroblock_modes;
 			}
-			else if (decoder->frame_pred_frame_dct)
+			else if (decoder.frame_pred_frame_dct)
 			{
 				/* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
 				macroblock_modes |= MC_FRAME;
@@ -149,968 +142,544 @@ int get_macroblock_modes(decoder_t * const decoder)
 			{
 				if (macroblock_modes & MACROBLOCK_INTRA) goto intra;
 
-				macroblock_modes |= UBITS(bit_buf, 2) * MOTION_TYPE_BASE;
-				DUMPBITS(bit_buf, bits, 2);
+				macroblock_modes |= GETBITS(2) * MOTION_TYPE_BASE;
 
 				if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN))
 				{
 intra:
-					macroblock_modes |= UBITS(bit_buf, 1) * DCT_TYPE_INTERLACED;
-					DUMPBITS(bit_buf, bits, 1);
+					macroblock_modes |= GETBITS(1) * DCT_TYPE_INTERLACED;
 				}
 
 				return macroblock_modes;
 			}
 
 		case D_TYPE:
-			macroblock_modes = UBITS(bit_buf, 1);
+			macroblock_modes = GETBITS(1);
 
 			if (macroblock_modes == 0) return 0;   // error
-
-			DUMPBITS(bit_buf, bits, 1);
 			return MACROBLOCK_INTRA;
 
 		default:
 			return 0;
 	}
-
-#undef bit_buf
-#undef bits
-#undef bit_ptr
 }
 
-static __forceinline int get_quantizer_scale(decoder_t * const decoder)
+static __forceinline int get_quantizer_scale()
 {
 	int quantizer_scale_code;
 
-	quantizer_scale_code = UBITS(decoder->bitstream_buf, 5);
-	DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, 5);
+	quantizer_scale_code = GETBITS(5);
 
-	if (decoder->q_scale_type)
+	if (decoder.q_scale_type)
 		return non_linear_quantizer_scale [quantizer_scale_code];
 	else
 		return quantizer_scale_code << 1;
 }
 
-static __forceinline int get_coded_block_pattern(decoder_t * const decoder)
+static __forceinline int get_coded_block_pattern()
 {
 	const CBPtab * tab;
+	u16 code = UBITS(16);
 
-	NEEDBITS(decoder->bitstream_buf, decoder->bitstream_bits, decoder->bitstream_ptr);
-
-	if (decoder->bitstream_buf >= 0x20000000)
-		tab = CBP_7 + (UBITS(decoder->bitstream_buf, 7) - 16);
+	if (code >= 0x2000)
+		tab = CBP_7 + (UBITS(7) - 16);
 	else
-		tab = CBP_9 + UBITS(decoder->bitstream_buf, 9);
+		tab = CBP_9 + UBITS(9);
 
-	DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, tab->len);
+	DUMPBITS(tab->len);
 	return tab->cbp;
 }
 
-static __forceinline int get_luma_dc_dct_diff(decoder_t * const decoder)
+int __forceinline get_motion_delta(const int f_code)
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
+#define bit_buf (decoder.bitstream_buf)
+#define bits (decoder.bitstream_bits)
+#define bit_ptr (decoder.bitstream_ptr)
 
-	const DCtab * tab;
-	int size;
-	int dc_diff;
+	int delta;
+	int sign;
+	const MVtab * tab;
+	u16 code = UBITS(16);
 
-	if (bit_buf < 0xf8000000)
+	if ((code & 0x8000))
 	{
-		tab = DC_lum_5 + UBITS(bit_buf, 5);
-		size = tab->size;
-
-		if (size)
-		{
-			DUMPBITS(bit_buf, bits, tab->len);
-			bits += size;
-			dc_diff = UBITS(bit_buf, size) - UBITS(SBITS(~bit_buf, 1), size);
-			bit_buf <<= size;
-			return dc_diff;
-		}
-		else
-		{
-			DUMPBITS(bit_buf, bits, 3);
-			return 0;
-		}
+		DUMPBITS(1);
+		return 0x00010000;
+	}
+	else if ((code & 0xf000) || ((code & 0xfc00) == 0x0c00))
+	{
+		tab = MV_4 + UBITS(4);
+	}
+	else
+	{
+		tab = MV_10 + UBITS(10);
 	}
 
-	tab = DC_long + (UBITS(bit_buf, 9) - 0x1e0); //0x1e0);
+	delta = tab->delta + 1;
+	DUMPBITS(tab->len);
+
+	sign = SBITS(1);
+	DUMPBITS(1);
+	return (delta ^ sign) - sign;
 
-	size = tab->size;
-	DUMPBITS(bit_buf, bits, tab->len);
-	NEEDBITS(bit_buf, bits, bit_ptr);
-	dc_diff = UBITS(bit_buf, size) - UBITS(SBITS(~bit_buf, 1), size);
-	DUMPBITS(bit_buf, bits, size);
-	return dc_diff;
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static __forceinline int get_chroma_dc_dct_diff(decoder_t * const decoder)
+int __forceinline get_dmv()
 {
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
+	const DMVtab * tab;
 
-	const DCtab * tab;
+	tab = DMV_2 + UBITS(2);
+	DUMPBITS(tab->len);
+	return tab->dmv;
+}
+
+int get_macroblock_address_increment()
+{
+	const MBAtab *mba;
+	
+	u16 code = UBITS(16);
+
+	if (code >= 4096)
+		mba = MBA_5 + (UBITS(5) - 2);
+	else if (code >= 768)
+		mba = MBA_11 + (UBITS(11) - 24);
+	else switch (UBITS(11))
+		{
+
+			case 8:		/* macroblock_escape */
+				DUMPBITS(11);
+				return 0x23;
+
+			case 15:	/* macroblock_stuffing (MPEG1 only) */
+				if (decoder.mpeg1)
+				{
+					DUMPBITS(11);
+					return 0x22;
+				}
+
+			default:
+				return 0;//error
+		}
+
+	DUMPBITS(mba->len);
+
+	return mba->mba + 1;
+}
+
+static __forceinline int get_luma_dc_dct_diff()
+{
 	int size;
 	int dc_diff;
+	u16 code = UBITS(5);
 
-	if (bit_buf < 0xf8000000)
+	if (code < 31)
 	{
-		tab = DC_chrom_5 + UBITS(bit_buf, 5);
-		size = tab->size;
+		size = DClumtab0[code].size;
+		DUMPBITS(DClumtab0[code].len);
 
-		if (size)
-		{
-			DUMPBITS(bit_buf, bits, tab->len);
-			bits += size;
-			dc_diff = UBITS(bit_buf, size) - UBITS(SBITS(~bit_buf, 1), size);
-			bit_buf <<= size;
-			return dc_diff;
-		}
-		else
-		{
-			DUMPBITS(bit_buf, bits, 2);
-			return 0;
-		}
+		// 5 bits max
+	}
+	else
+	{
+		code = UBITS(9) - 0x1f0;
+		size = DClumtab1[code].size;
+		DUMPBITS(DClumtab1[code].len);
+
+		// 9 bits max
+	}
+	
+	if (size==0)
+		dc_diff = 0;
+	else
+	{
+		dc_diff = GETBITS(size);
+
+		// 6 for tab0 and 11 for tab1
+		if ((dc_diff & (1<<(size-1)))==0)
+		  dc_diff-= (1<<size) - 1;
 	}
 
-	tab = DC_long + (UBITS(bit_buf, 10) - 0x3e0);
-
-	size = tab->size;
-	DUMPBITS(bit_buf, bits, tab->len + 1);
-	NEEDBITS(bit_buf, bits, bit_ptr);
-	dc_diff = UBITS(bit_buf, size) - UBITS(SBITS(~bit_buf, 1), size);
-	DUMPBITS(bit_buf, bits, size);
 	return dc_diff;
-#undef bit_buf
-#undef bits
-#undef bit_ptr
+}
+
+static __forceinline int get_chroma_dc_dct_diff()
+{
+	int size;
+	int dc_diff;
+	u16 code = UBITS(5);
+
+    if (code<31)
+	{
+		size = DCchromtab0[code].size;
+		DUMPBITS(DCchromtab0[code].len);
+	}
+	else
+	{
+	    code = UBITS(10) - 0x3e0;
+	    size = DCchromtab1[code].size;
+		DUMPBITS(DCchromtab1[code].len);
+	}
+	
+	if (size==0)
+	    dc_diff = 0;
+	else
+	{
+		dc_diff = GETBITS(size);
+
+		if ((dc_diff & (1<<(size-1)))==0)
+		{
+			dc_diff-= (1<<size) - 1;
+		}
+	}
+  
+	return dc_diff;
 }
 
 #define SATURATE(val)					\
 do {							\
 	 if (((u32)(val + 2048) > 4095))	\
-	val = SBITS (val, 1) ^ 2047;			\
+	val = (((s32)val) >> 31) ^ 2047;			\
 } while (0)
 
-static __forceinline void get_intra_block_B14(decoder_t * const decoder)
+static __forceinline bool get_intra_block()
 {
 	int i;
 	int j;
 	int val;
-	const u8 * scan = decoder->scan;
-	const u8 * quant_matrix = decoder->intra_quantizer_matrix;
-	int quantizer_scale = decoder->quantizer_scale;
-	int mismatch;
-	const DCTtab * tab;
-	u32 bit_buf;
-	u8 * bit_ptr;
-	int bits;
-	s16 * dest;
+	const u8 * scan = decoder.scan;
+	const u8 * quant_matrix = decoder.intra_quantizer_matrix;
+	int quantizer_scale = decoder.quantizer_scale;
+	s16 * dest = decoder.DCTblock;
+	u16 code; 
 
-	dest = decoder->DCTblock;
-	i = 0;
-	mismatch = ~dest[0];
-
-	bit_buf = decoder->bitstream_buf;
-	bits = decoder->bitstream_bits;
-	bit_ptr = decoder->bitstream_ptr;
-
-	NEEDBITS(bit_buf, bits, bit_ptr);
-
-	while (1)
-	{
-		if (bit_buf >= 0x28000000)
+	/* decode AC coefficients */
+  for (i=1 + ipu_cmd.pos[4]; ; i++)
+  {
+	  switch (ipu_cmd.pos[5])
+	  {
+	  case 0:
+		if (!GETWORD())
 		{
-			tab = DCT_B14AC_5 + (UBITS(bit_buf, 5) - 5);
-			i += tab->run;
-			if (i >= 64) break;	/* end of block */
-
-normal_code:
-			j = scan[i];
-			bit_buf <<= tab->len;
-			bits += tab->len + 1;
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
-
-			/* if (bitstream_get (1)) val = -val; */
-			val = (val ^ SBITS(bit_buf, 1)) - SBITS(bit_buf, 1);
-
-			SATURATE(val);
-			dest[j] = val;
-			mismatch ^= val;
-			bit_buf <<= 1;
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
+		  ipu_cmd.pos[4] = i - 1;
+		  return false;
 		}
-		else if (bit_buf >= 0x04000000)
+
+		code = UBITS(16);
+
+		if (code >= 16384 && (!decoder.intra_vlc_format || decoder.mpeg1))
 		{
-			tab = DCT_B14_8 + (UBITS(bit_buf, 8) - 4);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-
-			/* escape code */
-
-			i += UBITS(bit_buf << 6, 6) - 64;
-
-			if (i >= 64) break;	/* illegal, check needed to avoid buffer overflow */
-
-			j = scan[i];
-
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (SBITS(bit_buf, 12) * quantizer_scale * quant_matrix[i]) / 16;
-
-			SATURATE(val);
-			dest[j] = val;
-			mismatch ^= val;
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-
+		  tab = &DCTtabnext[(code >> 12) - 4];
 		}
-		else if (bit_buf >= 0x02000000)
+		else if (code >= 1024)
 		{
-			tab = DCT_B14_10 + (UBITS(bit_buf, 10) - 8);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
+		  if (decoder.intra_vlc_format && !decoder.mpeg1)
+		  {
+			  tab = &DCTtab0a[(code >> 8) - 4];
+		  }
+		  else
+		  {
+			  tab = &DCTtab0[(code >> 8) - 4];
+		  }
 		}
-		else if (bit_buf >= 0x00800000)
+		else if (code >= 512)
 		{
-			tab = DCT_13 + (UBITS(bit_buf, 13) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
+		  if (decoder.intra_vlc_format && !decoder.mpeg1)
+		  {
+			tab = &DCTtab1a[(code >> 6) - 8];
+		  }
+		  else
+		  {
+			tab = &DCTtab1[(code >> 6) - 8];
+		  }
 		}
-		else if (bit_buf >= 0x00200000)
+		else if (code >= 256)
 		{
-			tab = DCT_15 + (UBITS(bit_buf, 15) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
+		  tab = &DCTtab2[(code >> 4) - 16];
+		}
+		else if (code >= 128)
+		{    
+			tab = &DCTtab3[(code >> 3) - 16];
+		}
+		else if (code >= 64)
+		{    
+			tab = &DCTtab4[(code >> 2) - 16];
+		}
+		else if (code >= 32)
+		{    
+			tab = &DCTtab5[(code >> 1) - 16];
+		}
+		else if (code >= 16)
+		{    
+			tab = &DCTtab6[code - 16];
 		}
 		else
 		{
-			tab = DCT_16 + UBITS(bit_buf, 16);
-			bit_buf <<= 16;
-			GETWORD(&bit_buf, bits + 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
+		  ipu_cmd.pos[4] = 0;
+		  return true;
 		}
 
-		break;	/* illegal, check needed to avoid buffer overflow */
-	}
+		DUMPBITS(tab->len);
 
-	dest[63] ^= mismatch & 1;
+		if (tab->run==64) /* end_of_block */
+		{
+			ipu_cmd.pos[4] = 0;
+			return true;
+		}
+		
+		i+= tab->run == 65 ? GETBITS(6) : tab->run;
+		if (i >= 64)
+		{
+			ipu_cmd.pos[4] = 0;
+			return true;
+		}
+	  case 1:
+		if (!GETWORD())
+		{
+		  ipu_cmd.pos[4] = i - 1;
+		  ipu_cmd.pos[5] = 1;
+		  return false;
+		}
 
-	if ((bit_buf >> 30) != 0x2) ipuRegs->ctrl.ECD = 1;
+		j = scan[i];
 
-	DUMPBITS(bit_buf, bits, tab->len);	/* dump end of block code */
+		if (tab->run==65) /* escape */
+		{
+		  if(!decoder.mpeg1)
+		  {
+			  val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
+			  DUMPBITS(12);
+		  }
+		  else
+		  {
+			  val = SBITS(8);
+			  DUMPBITS(8);
 
-	decoder->bitstream_buf = bit_buf;
-	decoder->bitstream_bits = bits;
+			  if (!(val & 0x7f))
+			  {
+				val = GETBITS(8) + 2 * val;
+			  }
+			
+			  val = (val * quantizer_scale * quant_matrix[i]) >> 4;
+			  val = (val + ~ (((s32)val) >> 31)) | 1;
+		  }
+		}
+		else
+		{
+		  val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+		  if(decoder.mpeg1)
+		  {
+			/* oddification */
+			val = (val - 1) | 1;
+		  }
+
+ 		  /* if (bitstream_get (1)) val = -val; */
+		  val = (val ^ SBITS(1)) - SBITS(1);
+		  DUMPBITS(1);
+		}
+
+		SATURATE(val);
+		dest[j] = val;
+		ipu_cmd.pos[5] = 0;
+	 }
+  }
+
+  ipu_cmd.pos[4] = 0;
+  return true;
 }
 
-static __forceinline void get_intra_block_B15(decoder_t * const decoder)
+static __forceinline bool get_non_intra_block(int * last)
 {
 	int i;
 	int j;
 	int val;
-	const u8 * scan = decoder->scan;
-	const u8 * quant_matrix = decoder->intra_quantizer_matrix;
-	int quantizer_scale = decoder->quantizer_scale;
-	int mismatch;
-	const DCTtab * tab;
-	u32 bit_buf;
-	u8 * bit_ptr;
-	int bits;
-	s16 * dest;
+	const u8 * scan = decoder.scan;
+	const u8 * quant_matrix = decoder.non_intra_quantizer_matrix;
+	int quantizer_scale = decoder.quantizer_scale;
+	s16 * dest = decoder.DCTblock;
+	u16 code;
 
-	dest = decoder->DCTblock;
-	i = 0;
-	mismatch = ~dest[0];
-
-	bit_buf = decoder->bitstream_buf;
-	bits = decoder->bitstream_bits;
-	bit_ptr = decoder->bitstream_ptr;
-
-	NEEDBITS(bit_buf, bits, bit_ptr);
-
-	while (1)
-	{
-		if (bit_buf >= 0x04000000)
+    /* decode AC coefficients */
+    for (i= ipu_cmd.pos[4] ; ; i++)
+    {
+		switch (ipu_cmd.pos[5])
 		{
-			tab = DCT_B15_8 + (UBITS(bit_buf, 8) - 4);
-			i += tab->run;
-
-			if (i < 64)
+		case 0:
+			if (!GETWORD())
 			{
-normal_code:
-				j = scan[i];
-				bit_buf <<= tab->len;
-				bits += tab->len + 1;
-				/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-				val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+				ipu_cmd.pos[4] = i;
+				return false;
+			}
 
-				/* if (bitstream_get (1)) val = -val; */
-				val = (val ^ SBITS(bit_buf, 1)) - SBITS(bit_buf, 1);
+			code = UBITS(16);
 
-				SATURATE(val);
-				dest[j] = val;
-				mismatch ^= val;
-
-				bit_buf <<= 1;
-				NEEDBITS(bit_buf, bits, bit_ptr);
-
-				continue;
+			if (code >= 16384)
+			{
+				if (i==0)
+				{
+					tab = &DCTtabfirst[(code >> 12) - 4];
+				}
+				else
+				{			
+					tab = &DCTtabnext[(code >> 12)- 4];
+				}
+			}
+			else if (code >= 1024)
+			{
+				tab = &DCTtab0[(code >> 8) - 4];
+			}
+			else if (code >= 512)
+			{		
+				tab = &DCTtab1[(code >> 6) - 8];
+			}
+			else if (code >= 256)
+			{		
+				tab = &DCTtab2[(code >> 4) - 16];
+			}
+			else if (code >= 128)
+			{		
+				tab = &DCTtab3[(code >> 3) - 16];
+			}
+			else if (code >= 64)
+			{		
+				tab = &DCTtab4[(code >> 2) - 16];
+			}
+			else if (code >= 32)
+			{		
+				tab = &DCTtab5[(code >> 1) - 16];
+			}
+			else if (code >= 16)
+			{		
+				tab = &DCTtab6[code - 16];
 			}
 			else
 			{
-				/* end of block. I commented out this code because if we */
-				/* dont exit here we will still exit at the later test :) */
-				//if (i >= 128) break;		/* end of block */
-				/* escape code */
-
-				i += UBITS(bit_buf << 6, 6) - 64;
-
-				if (i >= 64)  break;	/* illegal, check against buffer overflow */
-
-				j = scan[i];
-				DUMPBITS(bit_buf, bits, 12);
-				NEEDBITS(bit_buf, bits, bit_ptr);
-
-				/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-				val = (SBITS(bit_buf, 12) * quantizer_scale * quant_matrix[i]) / 16;
-
-				SATURATE(val);
-				dest[j] = val;
-				mismatch ^= val;
-				DUMPBITS(bit_buf, bits, 12);
-				NEEDBITS(bit_buf, bits, bit_ptr);
-				continue;
+				ipu_cmd.pos[4] = 0;
+				return true;
 			}
-		}
-		else if (bit_buf >= 0x02000000)
-		{
-			tab = DCT_B15_10 + (UBITS(bit_buf, 10) - 8);
-			i += tab->run;
 
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00800000)
-		{
-			tab = DCT_13 + (UBITS(bit_buf, 13) - 16);
-			i += tab->run;
+			DUMPBITS(tab->len);
 
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00200000)
-		{
-			tab = DCT_15 + (UBITS(bit_buf, 15) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else
-		{
-			tab = DCT_16 + UBITS(bit_buf, 16);
-			bit_buf <<= 16;
-			GETWORD(&bit_buf, bits + 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-
-		break;	/* illegal, check needed to avoid buffer overflow */
-	}
-
-	dest[63] ^= mismatch & 1;
-
-	if ((bit_buf >> 28) != 0x6)
-		ipuRegs->ctrl.ECD = 1;
-
-	DUMPBITS(bit_buf, bits, tab->len);	/* dump end of block code */
-
-	decoder->bitstream_buf = bit_buf;
-
-	decoder->bitstream_bits = bits;
-}
-
-static __forceinline int get_non_intra_block(decoder_t * const decoder)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-	int i;
-	int j;
-	int val;
-	const u8 * scan = decoder->scan;
-	const u8 * quant_matrix = decoder->non_intra_quantizer_matrix;
-	int quantizer_scale = decoder->quantizer_scale;
-	int mismatch;
-	const DCTtab * tab;
-	s16 * dest;
-
-	i = -1;
-	mismatch = -1;
-	dest = decoder->DCTblock;
-	NEEDBITS(bit_buf, bits, bit_ptr);
-
-	if (bit_buf >= 0x28000000)
-	{
-		tab = DCT_B14DC_5 + (UBITS(bit_buf, 5) - 5);
-		goto entry_1;
-	}
-	else
-		goto entry_2;
-
-	while (1)
-	{
-		if (bit_buf >= 0x28000000)
-		{
-			tab = DCT_B14AC_5 + (UBITS(bit_buf, 5) - 5);
-entry_1:
-			i += tab->run;
-
-			if (i >= 64) break;	/* end of block */
-normal_code:
-			j = scan[i];
-			bit_buf <<= tab->len;
-			bits += tab->len + 1;
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-
-			/* if (bitstream_get (1)) val = -val; */
-			val = (val ^ SBITS(bit_buf, 1)) - SBITS(bit_buf, 1);
-
-			SATURATE(val);
-			dest[j] = val;
-			mismatch ^= val;
-			bit_buf <<= 1;
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-		}
-entry_2:
-
-		if (bit_buf >= 0x04000000)
-		{
-			tab = DCT_B14_8 + (UBITS(bit_buf, 8) - 4);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-
-			/* escape code */
-
-			i += UBITS(bit_buf << 6, 6) - 64;
-
-			if (i >= 64) break;	/* illegal, check needed to avoid buffer overflow */
-
-			j = scan[i];
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			val = 2 * (SBITS(bit_buf, 12) + SBITS(bit_buf, 1)) + 1;
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (val * quantizer_scale * quant_matrix[i]) / 32;
-
-			SATURATE(val);
-			dest[j] = val;
-			mismatch ^= val;
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-		}
-		else if (bit_buf >= 0x02000000)
-		{
-			tab = DCT_B14_10 + (UBITS(bit_buf, 10) - 8);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00800000)
-		{
-			tab = DCT_13 + (UBITS(bit_buf, 13) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00200000)
-		{
-			tab = DCT_15 + (UBITS(bit_buf, 15) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else
-		{
-			tab = DCT_16 + UBITS(bit_buf, 16);
-			bit_buf <<= 16;
-			GETWORD(&bit_buf, bits + 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		break;	/* illegal, check needed to avoid buffer overflow */
-	}
-
-	dest[63] ^= mismatch & 1;
-
-	if ((bit_buf >> 30) != 0x2) ipuRegs->ctrl.ECD = 1;
-
-	DUMPBITS(bit_buf, bits, tab->len);	/* dump end of block code */
-
-	decoder->bitstream_buf = bit_buf;
-	decoder->bitstream_bits = bits;
-	return i;
-
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static __forceinline void get_mpeg1_intra_block(decoder_t * const decoder)
-{
-	int i;
-	int j;
-	int val;
-	const u8 * scan = decoder->scan;
-	const u8 * quant_matrix = decoder->intra_quantizer_matrix;
-	int quantizer_scale = decoder->quantizer_scale;
-	const DCTtab * tab;
-	u32 bit_buf;
-	int bits;
-	u8 * bit_ptr;
-	s16 * dest;
-
-	i = 0;
-	dest = decoder->DCTblock;
-	bit_buf = decoder->bitstream_buf;
-	bits = decoder->bitstream_bits;
-	bit_ptr = decoder->bitstream_ptr;
-	NEEDBITS(bit_buf, bits, bit_ptr);
-
-	while (1)
-	{
-		if (bit_buf >= 0x28000000)
-		{
-			tab = DCT_B14AC_5 + (UBITS(bit_buf, 5) - 5);
-			i += tab->run;
-
-			if (i >= 64) break;	/* end of block */
-
-normal_code:
-			j = scan[i];
-			bit_buf <<= tab->len;
-			bits += tab->len + 1;
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
-
-			/* oddification */
-			val = (val - 1) | 1;
-
-			/* if (bitstream_get (1)) val = -val; */
-			val = (val ^ SBITS(bit_buf, 1)) - SBITS(bit_buf, 1);
-
-			SATURATE(val);
-			dest[j] = val;
-			bit_buf <<= 1;
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-
-		}
-		else if (bit_buf >= 0x04000000)
-		{
-			tab = DCT_B14_8 + (UBITS(bit_buf, 8) - 4);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-
-			/* escape code */
-
-			i += UBITS(bit_buf << 6, 6) - 64;
-
-			if (i >= 64) break;	/* illegal, check needed to avoid buffer overflow */
-
-			j = scan[i];
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			val = SBITS(bit_buf, 8);
-
-			if (!(val & 0x7f))
+			if (tab->run==64) /* end_of_block */
 			{
-				DUMPBITS(bit_buf, bits, 8);
-				val = UBITS(bit_buf, 8) + 2 * val;
+				*last = i;
+				ipu_cmd.pos[4] = 0;
+				return true;
 			}
 
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (val * quantizer_scale * quant_matrix[i]) >> 4;
-
-			/* oddification */
-			val = (val + ~SBITS(val, 1)) | 1;
-
-			SATURATE(val);
-			dest[j] = val;
-			DUMPBITS(bit_buf, bits, 8);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-		}
-		else if (bit_buf >= 0x02000000)
-		{
-			tab = DCT_B14_10 + (UBITS(bit_buf, 10) - 8);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00800000)
-		{
-			tab = DCT_13 + (UBITS(bit_buf, 13) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00200000)
-		{
-			tab = DCT_15 + (UBITS(bit_buf, 15) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else
-		{
-			tab = DCT_16 + UBITS(bit_buf, 16);
-			bit_buf <<= 16;
-			GETWORD(&bit_buf, bits + 16);
-			i += tab->run;
- goto normal_code;
-		}
-
-		break;	/* illegal, check needed to avoid buffer overflow */
-	}
-
-	if ((bit_buf >> 30) != 0x2) ipuRegs->ctrl.ECD = 1;
-
-	DUMPBITS(bit_buf, bits, 2);	/* dump end of block code */
-	decoder->bitstream_buf = bit_buf;
-	decoder->bitstream_bits = bits;
-}
-
-static __forceinline int get_mpeg1_non_intra_block(decoder_t * const decoder)
-{
-	int i;
-	int j;
-	int val;
-	const u8 * scan = decoder->scan;
-	const u8 * quant_matrix = decoder->non_intra_quantizer_matrix;
-	int quantizer_scale = decoder->quantizer_scale;
-	const DCTtab * tab;
-	u32 bit_buf;
-	int bits;
-	u8 * bit_ptr;
-	s16 * dest;
-
-	i = -1;
-	dest = decoder->DCTblock;
-
-	bit_buf = decoder->bitstream_buf;
-	bits = decoder->bitstream_bits;
-	bit_ptr = decoder->bitstream_ptr;
-
-	NEEDBITS(bit_buf, bits, bit_ptr);
-
-	if (bit_buf >= 0x28000000)
-	{
-		tab = DCT_B14DC_5 + (UBITS(bit_buf, 5) - 5);
-		goto entry_1;
-	}
-	else
-		goto entry_2;
-
-	while (1)
-	{
-		if (bit_buf >= 0x28000000)
-		{
-			tab = DCT_B14AC_5 + (UBITS(bit_buf, 5) - 5);
-entry_1:
-			i += tab->run;
-
-			if (i >= 64) break;	/* end of block */
-
-normal_code:
-			j = scan[i];
-			bit_buf <<= tab->len;
-			bits += tab->len + 1;
-
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
-
-			/* oddification */
-			val = (val - 1) | 1;
-
-			/* if (bitstream_get (1)) val = -val; */
-			val = (val ^ SBITS(bit_buf, 1)) - SBITS(bit_buf, 1);
-
-			SATURATE(val);
-			dest[j] = val;
-			bit_buf <<= 1;
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
-		}
-entry_2:
-		if (bit_buf >= 0x04000000)
-		{
-			tab = DCT_B14_8 + (UBITS(bit_buf, 8) - 4);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-
-			/* escape code */
-
-			i += UBITS(bit_buf << 6, 6) - 64;
-
-			if (i >= 64) break;	/* illegal, check needed to avoid buffer overflow */
-
-			j = scan[i];
-			DUMPBITS(bit_buf, bits, 12);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			val = SBITS(bit_buf, 8);
-
-			if (!(val & 0x7f))
+			i += (tab->run == 65) ? GETBITS(6) : tab->run;
+			if (i >= 64)
 			{
-				DUMPBITS(bit_buf, bits, 8);
-				val = UBITS(bit_buf, 8) + 2 * val;
+				*last = i;
+				ipu_cmd.pos[4] = 0;
+				return true;
 			}
 
-			val = 2 * (val + SBITS(val, 1)) + 1;
+		case 1:
+			if (!GETWORD())
+			{
+			  ipu_cmd.pos[4] = i;
+			  ipu_cmd.pos[5] = 1;
+			  return false;
+			}
 
-			/* JayteeMaster: 10 points! Replaced quant_matrix[j] by quant_matrix[i] as should be */
-			val = (val * quantizer_scale * quant_matrix[i]) / 32;
+			j = scan[i];
 
-			/* oddification */
-			val = (val + ~SBITS(val, 1)) | 1;
+			if (tab->run==65) /* escape */
+			{
+				if (!decoder.mpeg1)
+				{
+					val = ((2 * (SBITS(12) + SBITS(1)) + 1) * quantizer_scale * quant_matrix[i]) >> 5;
+					DUMPBITS(12);
+				}
+				else
+				{
+				  val = SBITS(8);
+				  DUMPBITS(8);
+
+				  if (!(val & 0x7f))
+				  {
+					val = GETBITS(8) + 2 * val;
+				  }
+
+				  val = ((2 * (val + (((s32)val) >> 31)) + 1) * quantizer_scale * quant_matrix[i]) / 32;
+				  val = (val + ~ (((s32)val) >> 31)) | 1;
+				}
+			}
+			else
+			{
+				val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5;
+				val = (val ^ SBITS(1)) - SBITS(1);
+				DUMPBITS(1);
+			}
 
 			SATURATE(val);
 			dest[j] = val;
-			DUMPBITS(bit_buf, bits, 8);
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			continue;
+			ipu_cmd.pos[5] = 0;
 		}
-		else if (bit_buf >= 0x02000000)
-		{
-			tab = DCT_B14_10 + (UBITS(bit_buf, 10) - 8);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00800000)
-		{
-			tab = DCT_13 + (UBITS(bit_buf, 13) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else if (bit_buf >= 0x00200000)
-		{
-			tab = DCT_15 + (UBITS(bit_buf, 15) - 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-		else
-		{
-			tab = DCT_16 + UBITS(bit_buf, 16);
-			bit_buf <<= 16;
-			GETWORD(&bit_buf, bits + 16);
-			i += tab->run;
-
-			if (i < 64) goto normal_code;
-		}
-
-		break;	/* illegal, check needed to avoid buffer overflow */
 	}
 
-	if ((bit_buf >> 30) != 0x2) ipuRegs->ctrl.ECD = 1;
-
-	DUMPBITS(bit_buf, bits, 2);	/* dump end of block code */
-	decoder->bitstream_buf = bit_buf;
-	decoder->bitstream_bits = bits;
-	return i;
+	ipu_cmd.pos[4] = 0;
+	return true;
 }
 
-static void __fastcall slice_intra_DCT(decoder_t * const decoder, const int cc,
-                                       u8 * const dest, const int stride)
+static bool __fastcall slice_intra_DCT(const int cc, u8 * const dest, const int stride, const bool skip)	// resumable: returns false when input ran out; callers then call again with skip=true
 {
-	NEEDBITS(decoder->bitstream_buf, decoder->bitstream_bits, decoder->bitstream_ptr);
-	/* Get the intra DC coefficient and inverse quantize it */
-
-	if (cc == 0)
-		decoder->dc_dct_pred[0] += get_luma_dc_dct_diff(decoder);
-	else
-		decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff(decoder);
-
-	decoder->DCTblock[0] = decoder->dc_dct_pred[cc] << (3 - decoder->intra_dc_precision);
-
-	if (decoder->mpeg1)
+	if (!skip || ipu_cmd.pos[3])	// DC stage: runs on a fresh call, or on a retry that stalled before the DC coefficient was read
 	{
-		get_mpeg1_intra_block(decoder);
-	}
-	else if (decoder->intra_vlc_format)
-	{
-		get_intra_block_B15(decoder);
-	}
-	else
-	{
-		get_intra_block_B14(decoder);
+		ipu_cmd.pos[3] = 0;	// clear the "DC still pending" marker before trying
+		if (!GETWORD())
+		{
+			ipu_cmd.pos[3] = 1;	// stalled before the DC coefficient: a retry must redo this stage
+			return false;
+		}
+
+		/* Get the intra DC coefficient and inverse quantize it */
+		if (cc == 0)
+			decoder.dc_dct_pred[0] += get_luma_dc_dct_diff();
+		else
+			decoder.dc_dct_pred[cc] += get_chroma_dc_dct_diff();
+
+		decoder.DCTblock[0] = decoder.dc_dct_pred[cc] << (3 - decoder.intra_dc_precision);
 	}
 
-	mpeg2_idct_copy(decoder->DCTblock, dest, stride);
+	if (!get_intra_block())	// remaining coefficients; if this fails pos[3] is 0, so a retry with skip=true bypasses the DC stage
+	{
+		return false;
+	}
+
+	mpeg2_idct_copy(decoder.DCTblock, dest, stride);
+
+	return true;	// block fully decoded and handed to mpeg2_idct_copy
 }
 
-/* JayteeMaster: changed dest to 16 bit signed */
-static void __fastcall slice_non_intra_DCT(decoder_t * const decoder,
-        /*u8*/s16 * const dest, const int stride)
+static bool __fastcall slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip)
 {
 	int last;
-	memzero(decoder->DCTblock);
 
-	if (decoder->mpeg1)
-		last = get_mpeg1_non_intra_block(decoder);
-	else
-		last = get_non_intra_block(decoder);
-
-	mpeg2_idct_add(last, decoder->DCTblock, dest, stride);
-}
-
-#if defined(_MSC_VER)
-#pragma pack(1)
-#endif
-
-struct TGA_HEADER
-{
-	u8  identsize;		// size of ID field that follows 18 u8 header (0 usually)
-	u8  colourmaptype;	 // type of colour map 0=none, 1=has palette
-	u8  imagetype;		// type of image 0=none,1=indexed,2=rgb,3=grey,+8=rle packed
-
-	s16 colourmapstart;	// first colour map entry in palette
-	s16 colourmaplength;	 // number of colours in palette
-	u8  colourmapbits;	 // number of bits per palette entry 15,16,24,32
-
-    s16 xstart;             // image x origin
-    s16 ystart;             // image y origin
-    s16 width;              // image width in pixels
-    s16 height;             // image height in pixels
-    u8  bits;               // image bits per pixel 8,16,24,32
-    u8  descriptor;         // image descriptor bits (vh flip bits)
-
-    // pixel data follows header
-} __packed;
-
-#if defined(_MSC_VER)
-#	pragma pack()
-#endif
-
-void SaveTGA(const char* filename, int width, int height, void* pdata)
-{
-	TGA_HEADER hdr;
-	FILE* f = fopen(filename, "wb");
-
-	if (f == NULL) return;
-
-	assert(sizeof(TGA_HEADER) == 18 && sizeof(hdr) == 18);
-
-	memzero(hdr);
-	hdr.imagetype = 2;
-	hdr.bits = 32;
-	hdr.width = width;
-	hdr.height = height;
-	hdr.descriptor |= 8 | (1 << 5); // 8bit alpha, flip vertical
-	fwrite(&hdr, sizeof(hdr), 1, f);
-	fwrite(pdata, width*height*4, 1, f);
-	fclose(f);
-}
-
-static int s_index = 0; //, s_frame = 0;
-
-void SaveRGB32(u8* ptr)
-{
-	char filename[255];
-	sprintf(filename, "frames/frame%.4d.tga", s_index++);
-	SaveTGA(filename, 16, 16, ptr);
-}
-
-void waitForSCD()
-{
-	u8 bit8 = 1;
-
-	while (!getBits8((u8*)&bit8, 0))
+	if (!skip)
 	{
-		so_resume();
+		memzero(decoder.DCTblock);
 	}
 
-	if (bit8 == 0)
+	if (!get_non_intra_block(&last))
 	{
-		if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
-
-		ipuRegs->ctrl.SCD = 1;
+		return false;
 	}
 
-	while (!getBits32((u8*)&ipuRegs->top, 0))
-	{
-		so_resume();
-	}
+	mpeg2_idct_add(last, decoder.DCTblock, dest, stride);
 
-	BigEndian(ipuRegs->top, ipuRegs->top);
-
-	/*if(ipuRegs->ctrl.SCD)
-	{
-		switch(ipuRegs->top & 0xFFFFFFF0)
-		{
-			case 0x100:
-			case 0x1A0:
-				break;
-			case 0x1B0:
-				ipuRegs->ctrl.SCD = 0;
-				if(ipuRegs->top == 0x1b4) ipuRegs->ctrl.ECD = 1;
-				//else
-				//{
-				//	do
-				//	{
-				//		while(!getBits32((u8*)&ipuRegs->top, 1))
-				//		{
-				//			so_resume();
-				//		}
-
-				//		BigEndian(ipuRegs->top, ipuRegs->top);
-				//	}
-				//	while((ipuRegs->top & 0xfffffff0) != 0x100);
-				//}
-				break;
-			default:
-				ipuRegs->ctrl.SCD = 0;
-				break;
-		}
-	}*/
+	return true;
 }
 
-void __forceinline finishmpeg2sliceIDEC(decoder_t* &decoder)
+void __forceinline finishmpeg2sliceIDEC()
 {
 	ipuRegs->ctrl.SCD = 0;
-	coded_block_pattern = decoder->coded_block_pattern;
+	coded_block_pattern = decoder.coded_block_pattern;
 
-	g_BP.BP += decoder->bitstream_bits - 16;
+	g_BP.BP += decoder.bitstream_bits - 16;
 
 	if ((int)g_BP.BP < 0)
 	{
@@ -1122,103 +691,133 @@ void __forceinline finishmpeg2sliceIDEC(decoder_t* &decoder)
 	}
 
 	FillInternalBuffer(&g_BP.BP, 1, 0);
-
-	waitForSCD();
 }
 
-void mpeg2sliceIDEC(void* pdone)
+bool mpeg2sliceIDEC()
 {
 	u32 read;
+	u16 code;
+	u8 bit8;
 
-	bool resumed = false;
-	decoder_t *decoder = &g_decoder;
-
-	*(int*)pdone = 0;
-	bitstream_init(decoder);
-
-	decoder->dc_dct_pred[0] =
-	decoder->dc_dct_pred[1] =
-	decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
-
-	decoder->mbc = 0;
-	ipuRegs->ctrl.ECD = 0;
-
-	if (UBITS(decoder->bitstream_buf, 2) == 0)
-	{
-		ipuRegs->ctrl.SCD = 0;
-	}
-	else
+	switch (ipu_cmd.pos[0])
 	{
+	case 0:
+		decoder.dc_dct_pred[0] =
+		decoder.dc_dct_pred[1] =
+		decoder.dc_dct_pred[2] = 128 << decoder.intra_dc_precision;
+
+		decoder.mbc = 0;
+		ipuRegs->top = 0;
+		ipuRegs->ctrl.ECD = 0;
+
+	case 1:
+		ipu_cmd.pos[0] = 1;
+		if (!bitstream_init())
+		{
+			return false;
+		}
+
+	case 2:
+		ipu_cmd.pos[0] = 2;
 		while (1)
 		{
 			int DCT_offset, DCT_stride;
-			int mba_inc;
 			const MBAtab * mba;
 
-			NEEDBITS(decoder->bitstream_buf, decoder->bitstream_bits, decoder->bitstream_ptr);
-			decoder->macroblock_modes = get_macroblock_modes(decoder);
-
-			/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
-
-			if (decoder->macroblock_modes & MACROBLOCK_QUANT) //only IDEC
+			switch (ipu_cmd.pos[1])
 			{
-				decoder->quantizer_scale = get_quantizer_scale(decoder);
-			}
+			case 0:
+				decoder.macroblock_modes = get_macroblock_modes();
 
-			if (decoder->macroblock_modes & DCT_TYPE_INTERLACED)
-			{
-				DCT_offset = decoder->stride;
-				DCT_stride = decoder->stride * 2;
-			}
-			else
-			{
-				DCT_offset = decoder->stride * 8;
-				DCT_stride = decoder->stride;
-			}
-
-			if (decoder->macroblock_modes & MACROBLOCK_INTRA)
-			{
-				decoder->coded_block_pattern = 0x3F;//all 6 blocks
-				//ipuRegs->ctrl.CBP = 0x3f;
-
-				memzero(*decoder->mb8);
-				memzero(*decoder->rgb32);
-
-				slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y, DCT_stride);
-				slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + 8, DCT_stride);
-				slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + DCT_offset, DCT_stride);
-				slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + DCT_offset + 8, DCT_stride);
-				slice_intra_DCT(decoder, 1, (u8*)decoder->mb8->Cb, decoder->stride >> 1);
-				slice_intra_DCT(decoder, 2, (u8*)decoder->mb8->Cr, decoder->stride >> 1);
-
-				// Send The MacroBlock via DmaIpuFrom
-
-				if (decoder->ofm == 0)
+				if (decoder.macroblock_modes & MACROBLOCK_QUANT) //only IDEC
 				{
-					ipu_csc(decoder->mb8, decoder->rgb32, decoder->sgn);
+					decoder.quantizer_scale = get_quantizer_scale();
+				}
 
-					g_nIPU0Data = 64;
-					g_pIPU0Pointer = (u8*)decoder->rgb32;
-					//if ( s_frame >= 39 ) SaveRGB32(g_pIPU0Pointer);
+				decoder.coded_block_pattern = 0x3F;//all 6 blocks
+				memzero(*decoder.mb8);
+				memzero(*decoder.rgb32);
+
+			case 1:
+				ipu_cmd.pos[1] = 1;
+
+				if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
+				{
+					DCT_offset = decoder.stride;
+					DCT_stride = decoder.stride * 2;
 				}
 				else
 				{
-					ipu_csc(decoder->mb8, decoder->rgb32, decoder->sgn);
-					ipu_dither(decoder->rgb32, decoder->rgb16, decoder->dte);
-
-					g_nIPU0Data = 32;
-					g_pIPU0Pointer = (u8*)decoder->rgb16;
-					//if ( s_frame >= 39 ) SaveRGB32(g_pIPU0Pointer);
+					DCT_offset = decoder.stride * 8;
+					DCT_stride = decoder.stride;
 				}
 
+				switch (ipu_cmd.pos[2])
+				{
+				case 0:
+				case 1:
+					if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y, DCT_stride, ipu_cmd.pos[2] == 1))
+					{
+						ipu_cmd.pos[2] = 1;
+						return false;
+					}
+				case 2:
+					if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + 8, DCT_stride, ipu_cmd.pos[2] == 2))
+					{
+						ipu_cmd.pos[2] = 2;
+						return false;
+					}
+				case 3:
+					if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset, DCT_stride, ipu_cmd.pos[2] == 3))
+					{
+						ipu_cmd.pos[2] = 3;
+						return false;
+					}
+				case 4:
+					if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[2] == 4))
+					{
+						ipu_cmd.pos[2] = 4;
+						return false;
+					}
+				case 5:
+					if (!slice_intra_DCT(1, (u8*)decoder.mb8->Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5))
+					{
+						ipu_cmd.pos[2] = 5;
+						return false;
+					}
+				case 6:
+					if (!slice_intra_DCT(2, (u8*)decoder.mb8->Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6))
+					{
+						ipu_cmd.pos[2] = 6;
+						return false;
+					}
+				}
+
+				// Send The MacroBlock via DmaIpuFrom
+				ipu_csc(decoder.mb8, decoder.rgb32, decoder.sgn);
+
+				if (decoder.ofm == 0)
+				{
+					g_nIPU0Data = 64;
+					g_pIPU0Pointer = (u8*)decoder.rgb32;
+				}
+				else
+				{
+					ipu_dither(decoder.rgb32, decoder.rgb16, decoder.dte);
+
+					g_nIPU0Data = 32;
+					g_pIPU0Pointer = (u8*)decoder.rgb16;
+				}
+
+			case 2:
 				while (g_nIPU0Data > 0)
 				{
 					read = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
 
 					if (read == 0)
 					{
-						so_resume();
-						resumed = true;
+						ipu_cmd.pos[1] = 2;
+						return false;
 					}
 					else
 					{
@@ -1228,289 +827,322 @@ void mpeg2sliceIDEC(void* pdone)
 					}
 				}
 
-				decoder->mbc++;
-			}
-
-			NEEDBITS(decoder->bitstream_buf, decoder->bitstream_bits, decoder->bitstream_ptr);
-			mba_inc = 0;
-
-			while (1)
-			{
-				if (decoder->bitstream_buf >= 0x10000000)
+				decoder.mbc++;
+				mbaCount = 0;
+			case 3:
+				while (1)
 				{
-					mba = MBA_5 + (UBITS(decoder->bitstream_buf, 5) - 2);
-					break;
-				}
-				else if (decoder->bitstream_buf >= 0x03000000)
-				{
-					mba = MBA_11 + (UBITS(decoder->bitstream_buf, 11) - 24);
-					break;
-				}
-				else switch (UBITS(decoder->bitstream_buf, 11))
+					if (!GETWORD())
 					{
-
-						case 8:		/* macroblock_escape */
-							mba_inc += 33;
-							/* pass through */
-
-						case 15:	/* macroblock_stuffing (MPEG1 only) */
-							DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, 11);
-							NEEDBITS(decoder->bitstream_buf, decoder->bitstream_bits, decoder->bitstream_ptr);
-							continue;
-
-						default:	/* end of slice/frame, or error? */
-						{
-#ifdef MPEGHACK
-							if (!resumed) so_resume();
-#endif
-							finishmpeg2sliceIDEC(decoder);
-
-							*(int*)pdone = 1;
-							so_exit();
-						}
+						ipu_cmd.pos[1] = 3;
+						return false;
 					}
-			}
 
-			DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, mba->len);
-			mba_inc += mba->mba;
+					code = UBITS(16);
+					if (code >= 0x1000)
+					{
+						mba = MBA_5 + (UBITS(5) - 2);
+						break;
+					}
+					else if (code >= 0x0300)
+					{
+						mba = MBA_11 + (UBITS(11) - 24);
+						break;
+					}
+					else switch (UBITS(11))
+					{
+							case 8:		/* macroblock_escape */
+								mbaCount += 33;
+								/* pass through */
 
-			if (mba_inc)
-			{
-				decoder->dc_dct_pred[0] =
-				decoder->dc_dct_pred[1] =
-				decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
+							case 15:	/* macroblock_stuffing (MPEG1 only) */
+								DUMPBITS(11);
+								continue;
 
-				do
-				{
-					decoder->mbc++;
+							default:	/* end of slice/frame, or error? */
+							{
+								goto finish_idec;	
+							}
+					}
 				}
-				while (--mba_inc);
+
+				DUMPBITS(mba->len);
+				mbaCount += mba->mba;
+
+				if (mbaCount)
+				{
+					decoder.dc_dct_pred[0] =
+					decoder.dc_dct_pred[1] =
+					decoder.dc_dct_pred[2] = 128 << decoder.intra_dc_precision;
+
+					decoder.mbc += mbaCount;
+				}
+
+			case 4:
+				if (!GETWORD())
+				{
+					ipu_cmd.pos[1] = 4;
+					return false;
+				}
+
+				break;
 			}
+
+			ipu_cmd.pos[1] = 0;
+			ipu_cmd.pos[2] = 0;
 		}
+		
+finish_idec:
+		finishmpeg2sliceIDEC();
+
+	case 3:
+		bit8 = 1;
+		if (!getBits8((u8*)&bit8, 0))
+		{
+			ipu_cmd.pos[0] = 3;
+			return false;
+		}
+
+		if (bit8 == 0)
+		{
+			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
+
+			ipuRegs->ctrl.SCD = 1;
+		}
+
+	case 4:
+		if (!getBits32((u8*)&ipuRegs->top, 0))
+		{
+			ipu_cmd.pos[0] = 4;
+			return false;
+		}
+
+		BigEndian(ipuRegs->top, ipuRegs->top);
+		break;
 	}
 
-#ifdef MPEGHACK
-	if (!resumed) so_resume();
-#endif
-
-	finishmpeg2sliceIDEC(decoder);
-
-	*(int*)pdone = 1;
-	so_exit();
+	return true;
 }
 
-void mpeg2_slice(void* pdone)
+bool mpeg2_slice()
 {
 	int DCT_offset, DCT_stride;
-	//u8 bit8=0;
-	//u32 fp = g_BP.FP;
-	u32 bp;
-	decoder_t * decoder = &g_decoder;
-	u32 size = 0;
+	u8 bit8;
+	u32 size;
 
-	*(int*)pdone = 0;
-	ipuRegs->ctrl.ECD = 0;
-
-	memzero(*decoder->mb8);
-	memzero(*decoder->mb16);
-
-	bitstream_init(decoder);
-
-	if (decoder->dcr)
+	switch (ipu_cmd.pos[0])
 	{
-		decoder->dc_dct_pred[0] =
-		decoder->dc_dct_pred[1] =
-		decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
-	}
-
-	if (decoder->macroblock_modes & DCT_TYPE_INTERLACED)
-	{
-		DCT_offset = decoder->stride;
-		DCT_stride = decoder->stride * 2;
-	}
-	else
-	{
-		DCT_offset = decoder->stride * 8;
-		DCT_stride = decoder->stride;
-	}
-
-	if (decoder->macroblock_modes & MACROBLOCK_INTRA)
-	{
-		decoder->coded_block_pattern = 0x3F;//all 6 blocks
-		slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y, DCT_stride);
-		slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + 8, DCT_stride);
-		slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + DCT_offset, DCT_stride);
-		slice_intra_DCT(decoder, 0, (u8*)decoder->mb8->Y + DCT_offset + 8, DCT_stride);
-		slice_intra_DCT(decoder, 1, (u8*)decoder->mb8->Cb, decoder->stride >> 1);
-		slice_intra_DCT(decoder, 2, (u8*)decoder->mb8->Cr, decoder->stride >> 1);
-		ipu_copy(decoder->mb8, decoder->mb16);
-	}
-	else
-	{
-		if (decoder->macroblock_modes & MACROBLOCK_PATTERN)
+	case 0:
+		if (decoder.dcr)
 		{
-			decoder->coded_block_pattern = get_coded_block_pattern(decoder);
-			/* JayteeMaster: changed from mb8 to mb16 and from u8 to s16 */
-
-			if (decoder->coded_block_pattern & 0x20) slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Y, DCT_stride);
-			if (decoder->coded_block_pattern & 0x10) slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Y + 8, DCT_stride);
-			if (decoder->coded_block_pattern & 0x08) slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Y + DCT_offset,	 DCT_stride);
-			if (decoder->coded_block_pattern & 0x04) slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Y + DCT_offset + 8, DCT_stride);
-			if (decoder->coded_block_pattern & 0x2)  slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Cb, decoder->stride >> 1);
-			if (decoder->coded_block_pattern & 0x1)  slice_non_intra_DCT(decoder, (s16*)decoder->mb16->Cr, decoder->stride >> 1);
-
+			decoder.dc_dct_pred[0] =
+			decoder.dc_dct_pred[1] =
+			decoder.dc_dct_pred[2] = 128 << decoder.intra_dc_precision;
 		}
-	}
-
-	//Send The MacroBlock via DmaIpuFrom
-
-	size = 0;	// Reset
-	ipuRegs->ctrl.SCD = 0;
-	coded_block_pattern = decoder->coded_block_pattern;
-	bp = g_BP.BP;
-	g_BP.BP += ((int)decoder->bitstream_bits - 16);
-
-	// BP goes from 0 to 128, so negative values mean to read old buffer
-	// so we minus from 128 to get the correct BP
-	if ((int)g_BP.BP < 0)
-	{
-		g_BP.BP = 128 + (int)g_BP.BP;
-
-		// After BP is positioned correctly, we need to reload the old buffer
-		// so that reading may continue properly
-		ReorderBitstream();
-	}
-
-	FillInternalBuffer(&g_BP.BP, 1, 0);
-
-	decoder->mbc = 1;
-	g_nIPU0Data = 48;
-	g_pIPU0Pointer = (u8*)decoder->mb16;
-
-	while (g_nIPU0Data > 0)
-	{
-		size = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
-
-		if (size == 0)
+			
+		ipuRegs->ctrl.ECD = 0;
+		ipuRegs->top = 0;
+		memzero(*decoder.mb8);
+		memzero(*decoder.mb16);
+	case 1:
+		if (!bitstream_init())
 		{
-			so_resume();
+			ipu_cmd.pos[0] = 1;
+			return false;
+		}
+
+	case 2:
+		ipu_cmd.pos[0] = 2;
+
+		if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
+		{
+			DCT_offset = decoder.stride;
+			DCT_stride = decoder.stride * 2;
 		}
 		else
 		{
-			g_pIPU0Pointer += size * 16;
-			g_nIPU0Data -= size;
-		}
-	}
-	waitForSCD();
-
-	decoder->bitstream_bits = 0;
-	*(int*)pdone = 1;
-	so_exit();
-}
-
-int __forceinline get_motion_delta(decoder_t * const decoder,
-                                   const int f_code)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-
-	int delta;
-	int sign;
-	const MVtab * tab;
-
-	if ((bit_buf & 0x80000000))
-	{
-		DUMPBITS(bit_buf, bits, 1);
-		return 0x00010000;
-	}
-	else if ((bit_buf & 0xf0000000) || ((bit_buf & 0xfc000000) == 0x0c000000))
-	{
-
-		tab = MV_4 + UBITS(bit_buf, 4);
-		delta = (tab->delta << f_code) + 1;
-		bits += tab->len + f_code + 1;
-		bit_buf <<= tab->len;
-
-		sign = SBITS(bit_buf, 1);
-		bit_buf <<= 1;
-
-		if (f_code) delta += UBITS(bit_buf, f_code);
-
-		bit_buf <<= f_code;
-
-		return (delta ^ sign) - sign;
-
-	}
-	else
-	{
-		tab = MV_10 + UBITS(bit_buf, 10);
-		delta = (tab->delta << f_code) + 1;
-		bits += tab->len + 1;
-		bit_buf <<= tab->len;
-
-		sign = SBITS(bit_buf, 1);
-		bit_buf <<= 1;
-
-		if (f_code)
-		{
-			NEEDBITS(bit_buf, bits, bit_ptr);
-			delta += UBITS(bit_buf, f_code);
-			DUMPBITS(bit_buf, bits, f_code);
+			DCT_offset = decoder.stride * 8;
+			DCT_stride = decoder.stride;
 		}
 
-		return (delta ^ sign) - sign;
-
-	}
-
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-int __forceinline get_dmv(decoder_t * const decoder)
-{
-#define bit_buf (decoder->bitstream_buf)
-#define bits (decoder->bitstream_bits)
-#define bit_ptr (decoder->bitstream_ptr)
-
-	const DMVtab * tab;
-
-	tab = DMV_2 + UBITS(bit_buf, 2);
-	DUMPBITS(bit_buf, bits, tab->len);
-	return tab->dmv;
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-int get_macroblock_address_increment(decoder_t * const decoder)
-{
-	const MBAtab *mba;
-
-	if (decoder->bitstream_buf >= 0x10000000)
-		mba = MBA_5 + (UBITS(decoder->bitstream_buf, 5) - 2);
-	else if (decoder->bitstream_buf >= 0x03000000)
-		mba = MBA_11 + (UBITS(decoder->bitstream_buf, 11) - 24);
-	else switch (UBITS(decoder->bitstream_buf, 11))
+		if (decoder.macroblock_modes & MACROBLOCK_INTRA)
 		{
-
-			case 8:		/* macroblock_escape */
-				DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, 11);
-				return 0x23;
-
-			case 15:	/* macroblock_stuffing (MPEG1 only) */
-				if (decoder->mpeg1)
+			switch(ipu_cmd.pos[1])
+			{
+			case 0:
+				decoder.coded_block_pattern = 0x3F;
+			case 1:
+				if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y, DCT_stride, ipu_cmd.pos[1] == 1))
 				{
-					DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, 11);
-					return 0x22;
+					ipu_cmd.pos[1] = 1;
+					return false;
 				}
+			case 2:
+				if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + 8, DCT_stride, ipu_cmd.pos[1] == 2))
+				{
+					ipu_cmd.pos[1] = 2;
+					return false;
+				}
+			case 3:
+				if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3))
+				{
+					ipu_cmd.pos[1] = 3;
+					return false;
+				}
+			case 4:
+				if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4))
+				{
+					ipu_cmd.pos[1] = 4;
+					return false;
+				}
+			case 5:
+				if (!slice_intra_DCT(1, (u8*)decoder.mb8->Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
+				{
+					ipu_cmd.pos[1] = 5;
+					return false;
+				}
+			case 6:
+				if (!slice_intra_DCT(2, (u8*)decoder.mb8->Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
+				{
+					ipu_cmd.pos[1] = 6;
+					return false;
+				}
+				break;
+			}
 
-			default:
-				return 0;//error
+			ipu_copy(decoder.mb8, decoder.mb16);
+		}
+		else
+		{
+			if (decoder.macroblock_modes & MACROBLOCK_PATTERN)
+			{
+				switch(ipu_cmd.pos[1])
+				{
+				case 0:
+					decoder.coded_block_pattern = get_coded_block_pattern();  // max 9bits
+				case 1:
+					if (decoder.coded_block_pattern & 0x20)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Y, DCT_stride, ipu_cmd.pos[1] == 1))
+						{
+							ipu_cmd.pos[1] = 1;
+							return false;
+						}
+					}
+				case 2:
+					if (decoder.coded_block_pattern & 0x10)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + 8, DCT_stride, ipu_cmd.pos[1] == 2))
+						{
+							ipu_cmd.pos[1] = 2;
+							return false;
+						}
+					}
+				case 3:
+					if (decoder.coded_block_pattern & 0x08)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3))
+						{
+							ipu_cmd.pos[1] = 3;
+							return false;
+						}
+					}
+				case 4:
+					if (decoder.coded_block_pattern & 0x04)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4))
+						{
+							ipu_cmd.pos[1] = 4;
+							return false;
+						}
+					}
+				case 5:
+					if (decoder.coded_block_pattern & 0x2)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
+						{
+							ipu_cmd.pos[1] = 5;
+							return false;
+						}
+					}
+				case 6:
+					if (decoder.coded_block_pattern & 0x1)
+					{
+						if (!slice_non_intra_DCT((s16*)decoder.mb16->Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
+						{
+							ipu_cmd.pos[1] = 6;
+							return false;
+						}
+					}
+					break;
+				}
+			}
 		}
 
-	DUMPBITS(decoder->bitstream_buf, decoder->bitstream_bits, mba->len);
+		//Send The MacroBlock via DmaIpuFrom
+		size = 0;	// Reset
+		ipuRegs->ctrl.SCD = 0;
+		coded_block_pattern = decoder.coded_block_pattern;
+		g_BP.BP += (int)decoder.bitstream_bits - 16;
 
-	return mba->mba + 1;
-}
+		// BP goes from 0 to 128, so negative values mean to read old buffer
+		// so we minus from 128 to get the correct BP
+		if ((int)g_BP.BP < 0)
+		{
+			g_BP.BP = 128 + (int)g_BP.BP;
+
+			// After BP is positioned correctly, we need to reload the old buffer
+			// so that reading may continue properly
+			ReorderBitstream();
+		}
+
+		decoder.mbc = 1;
+		g_nIPU0Data = 48;
+		g_pIPU0Pointer = (u8*)decoder.mb16;
+
+	case 3:
+		while (g_nIPU0Data > 0)
+		{
+			size = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
+
+			if (size == 0)
+			{
+				ipu_cmd.pos[0] = 3;
+				return false;
+			}
+			else
+			{
+				g_pIPU0Pointer += size * 16;
+				g_nIPU0Data -= size;
+			}
+		}
+
+	case 4:
+		bit8 = 1;
+		if (!getBits8((u8*)&bit8, 0))
+		{
+			ipu_cmd.pos[0] = 4;
+			return false;
+		}
+
+		if (bit8 == 0)
+		{
+			if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7);
+
+			ipuRegs->ctrl.SCD = 1;
+		}
+
+	case 5:
+		if (!getBits32((u8*)&ipuRegs->top, 0))
+		{
+			ipu_cmd.pos[0] = 5;
+			return false;
+		}
+
+		BigEndian(ipuRegs->top, ipuRegs->top);
+		decoder.bitstream_bits = 0;
+		break;
+	}
+
+	return true;
+}
\ No newline at end of file
diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h
index 3c8cb79e11..2860e4f53b 100644
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@@ -99,7 +99,6 @@ struct decoder_t {
 	/* bit parsing stuff */
 	u32 bitstream_buf;		/* current 32 bit working set */
 	int bitstream_bits;			/* used bits in working set */
-	u8 * bitstream_ptr;			/* buffer with stream data; 128 bits buffer */
 
 	struct macroblock_8		*mb8;
 	struct macroblock_16	*mb16;
@@ -173,13 +172,13 @@ extern void (__fastcall *mpeg2_idct_add) (int last, s16 * block, s16* dest, int
 #define IDEC	0
 #define BDEC	1
 
-void mpeg2sliceIDEC(void* pdone);
-void mpeg2_slice(void* pdone);
-int get_macroblock_address_increment(decoder_t * const decoder);
-int get_macroblock_modes (decoder_t * const decoder);
+bool mpeg2sliceIDEC();
+bool mpeg2_slice();
+int get_macroblock_address_increment();
+int get_macroblock_modes();
 
-extern int get_motion_delta (decoder_t * const decoder, const int f_code);
-extern int get_dmv (decoder_t * const decoder);
+extern int get_motion_delta(const int f_code);
+extern int get_dmv();
 
 extern int non_linear_quantizer_scale[];
 extern decoder_t g_decoder;
@@ -189,7 +188,7 @@ void __fastcall ipu_dither(const macroblock_rgb32* rgb32, macroblock_rgb16 *rgb1
 void __fastcall ipu_vq(macroblock_rgb16 *rgb16, u8* indx4);
 void __fastcall ipu_copy(const macroblock_8 *mb8, macroblock_16 *mb16);
 
-int slice (decoder_t * const decoder, u8 * buffer);
+int slice (u8 * buffer);
 /* idct.c */
 void mpeg2_idct_init ();
 
@@ -199,4 +198,10 @@ void mpeg2_idct_init ();
 #define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif
 
+#ifdef _MSC_VER
+#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#else
+#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#endif
+
 #endif//__MPEG_H__
diff --git a/pcsx2/IPU/mpeg2lib/Vlc.h b/pcsx2/IPU/mpeg2lib/Vlc.h
index 4867b2175c..69727beac6 100644
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@@ -25,55 +25,70 @@
 #ifndef __VLC_H__
 #define __VLC_H__
 
-#include "IPU/coroutine.h"
-
 static u8 data[2];
-static u8 dword[4];
+//static u8 word[4];
+//static u8 dword[8];
+//static u8 qword[16];
 extern tIPU_BP g_BP;
-extern decoder_t g_decoder;
+extern decoder_t decoder;
 extern void ReorderBitstream();
 
-static __forceinline void GETWORD(u32 * bit_buf,int bits)
+static __forceinline int GETWORD()	// tops up decoder.bitstream_buf with 16 more bits when needed; returns 0 if they could not be fetched, 1 otherwise
 {
-	while(!getBits16(data,1))
+	if (decoder.bitstream_bits > 0)	// positive count: fewer than 16 unread bits remain in the 32-bit buffer
 	{
-		so_resume();
+		if(!getBits16(data,1))
+		{
+			return 0;	// no input yet: buffer and count are untouched, so the call can simply be repeated
+		}
+		
+		/*u32 data;
+		BigEndian(data, *(u32*)word);
+		decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
+		decoder.bitstream_bits -= 32;*/
+		decoder.bitstream_buf |= ((u32)(((u16)data[0] << 8) | data[1])) << decoder.bitstream_bits;	// data[0] is the high byte; the word lands directly below the unread bits
+		decoder.bitstream_bits -= 16;	// account for the 16 bits just appended
 	}
-	*bit_buf |= ((data[0] << 8) | data[1]) << (bits);
+
+	return 1;	// at least 16 unread bits are available (already, or after the refill)
 }
 
-static __forceinline void bitstream_init (decoder_t * decoder){
-    decoder->bitstream_bits = -16;
+static __forceinline int bitstream_init ()	// primes decoder.bitstream_buf with the first 32 bits; returns 0 when getBits32 fails, 1 on success
+{
+	if (!getBits32((u8*)&decoder.bitstream_buf, 1))	// bytes are written straight into bitstream_buf
+	{
+		return 0;	// caller is expected to retry later (bitstream_bits has not been set yet)
+	}
 
-	while( !getBits32(dword, 1) )
-		so_resume();
+	decoder.bitstream_bits = -16;	// 32 bits loaded: 16 may be consumed before GETWORD has to refill
+	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);	// 32-bit byte swap; on a little-endian host this matches the removed dword[0]<<24 | ... layout
+	/*decoder.bitstream_buf = *(u64*)dword;
+	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/
 
-	decoder->bitstream_buf = (dword[0] << 24) | (dword[1] << 16) |
-							 (dword[2] <<  8) |dword[3];
+	return 1;
 }
 
-/* make sure that there are at least 16 valid bits in bit_buf */
-#define NEEDBITS(bit_buf,bits,bit_ptr)		\
-do {						\
-    if (bits > 0) {			\
-	GETWORD(&bit_buf,bits);	\
-	bits -= 16;	\
-    }						\
-} while (0)
-
 /* remove num valid bits from bit_buf */
-#define DUMPBITS(bit_buf,bits,num)	\
-do {					\
-	/*IPU_LOG("DUMPBITS %d\n",num);*/	\
-    bit_buf <<= (num);			\
-    bits += (num);			\
-} while (0)
+static __forceinline void DUMPBITS(int num)
+{
+	decoder.bitstream_buf <<= num;	// drop the consumed bits; the next unread bit moves up to bit 31
+    decoder.bitstream_bits += num;	// track consumption; GETWORD refills once this count is positive
+}
 
 /* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(bit_buf,num) (((u32)(bit_buf)) >> (32 - (num)))
+#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))
 
 /* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(bit_buf,num) (((s32)(bit_buf)) >> (32 - (num)))
+#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
+
+/* Get bits from bitstream: returns the top num bits of bitstream_buf (zero extended) and consumes them; num must be 1..32 because UBITS shifts by (32 - num) */
+static __forceinline u32 GETBITS(int num)
+{
+	u32 retVal = UBITS(num);	// same width as UBITS and the return type, so requests wider than 16 bits are no longer truncated
+	DUMPBITS(num);
+
+	return retVal;
+}
 
 struct MBtab {
     u8 modes;
@@ -443,4 +458,247 @@ static const MBAtab MBA_11 [] = {
     { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
     { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
 };
+
+// New
+
+
+/* Table B-1, macroblock_address_increment, codes 00010 ... 011xx (read-only lookup data) */
+static const MBAtab MBAtab1[16] =
+{ {0,0}, {0,0}, {7,5}, {6,5}, {5,4}, {5,4}, {4,4}, {4,4},	/* first two slots lie below the listed code range */
+  {3,3}, {3,3}, {3,3}, {3,3}, {2,3}, {2,3}, {2,3}, {2,3}
+};
+
+/* Table B-1, macroblock_address_increment, codes 00000011000 ... 0000111xxxx (read-only lookup data) */
+static const MBAtab MBAtab2[104] =
+{
+  {33,11}, {32,11}, {31,11}, {30,11}, {29,11}, {28,11}, {27,11}, {26,11},
+  {25,11}, {24,11}, {23,11}, {22,11}, {21,10}, {21,10}, {20,10}, {20,10},
+  {19,10}, {19,10}, {18,10}, {18,10}, {17,10}, {17,10}, {16,10}, {16,10},
+  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},
+  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},
+  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},
+  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},
+  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},
+  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},
+  {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},
+  {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},
+  {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},
+  {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7}
+};
+
+/* Table B-12, dct_dc_size_luminance, codes 00xxx ... 11110 */
+static const DCtab DClumtab0[32] =
+{ {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+  {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+  {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+  {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}, {0, 0}	/* final {0, 0}: index 31 (11111) lies outside the listed codes */
+};
+
+/* Table B-12, dct_dc_size_luminance, codes 111110xxx ... 111111111 */
+static const DCtab DClumtab1[16] =
+{ {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6},
+  {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10,9}, {11,9}
+};
+
+/* Table B-13, dct_dc_size_chrominance, codes 00xxx ... 11110 */
+static const DCtab DCchromtab0[32] =
+{ {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+  {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+  {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+  {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}, {0, 0}	/* final {0, 0}: index 31 (11111) lies outside the listed codes */
+};
+
+/* Table B-13, dct_dc_size_chrominance, codes 111110xxxx ... 1111111111 */
+static const DCtab DCchromtab1[32] =
+{ {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6},
+  {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6},
+  {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7},
+  {8, 8}, {8, 8}, {8, 8}, {8, 8}, {9, 9}, {9, 9}, {10,10}, {11,10}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0100 ... 1xxx (used for first (DC) coefficient)
+ */
+static const DCTtab DCTtabfirst[12] =
+{
+  {0,2,4}, {2,1,4}, {1,1,3}, {1,1,3},
+  {0,1,1}, {0,1,1}, {0,1,1}, {0,1,1},
+  {0,1,1}, {0,1,1}, {0,1,1}, {0,1,1}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0100 ... 1xxx (used for all other coefficients)
+ */
+static const DCTtab DCTtabnext[12] =
+{
+  {0,2,4},  {2,1,4},  {1,1,3},  {1,1,3},
+  {64,0,2}, {64,0,2}, {64,0,2}, {64,0,2}, /* EOB */
+  {0,1,2},  {0,1,2},  {0,1,2},  {0,1,2}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 000001xx ... 00111xxx
+ */
+static const DCTtab DCTtab0[60] =
+{
+  {65,0,6}, {65,0,6}, {65,0,6}, {65,0,6}, /* Escape */
+  {2,2,7}, {2,2,7}, {9,1,7}, {9,1,7},
+  {0,4,7}, {0,4,7}, {8,1,7}, {8,1,7},
+  {7,1,6}, {7,1,6}, {7,1,6}, {7,1,6},
+  {6,1,6}, {6,1,6}, {6,1,6}, {6,1,6},
+  {1,2,6}, {1,2,6}, {1,2,6}, {1,2,6},
+  {5,1,6}, {5,1,6}, {5,1,6}, {5,1,6},
+  {13,1,8}, {0,6,8}, {12,1,8}, {11,1,8},
+  {3,2,8}, {1,3,8}, {0,5,8}, {10,1,8},
+  {0,3,5}, {0,3,5}, {0,3,5}, {0,3,5},
+  {0,3,5}, {0,3,5}, {0,3,5}, {0,3,5},
+  {4,1,5}, {4,1,5}, {4,1,5}, {4,1,5},
+  {4,1,5}, {4,1,5}, {4,1,5}, {4,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5}
+};
+
+/* Table B-15, DCT coefficients table one,
+ * codes 000001xx ... 11111111
+*/
+static const DCTtab DCTtab0a[252] =
+{
+  {65,0,6}, {65,0,6}, {65,0,6}, {65,0,6}, /* Escape */
+  {7,1,7}, {7,1,7}, {8,1,7}, {8,1,7},
+  {6,1,7}, {6,1,7}, {2,2,7}, {2,2,7},
+  {0,7,6}, {0,7,6}, {0,7,6}, {0,7,6},
+  {0,6,6}, {0,6,6}, {0,6,6}, {0,6,6},
+  {4,1,6}, {4,1,6}, {4,1,6}, {4,1,6},
+  {5,1,6}, {5,1,6}, {5,1,6}, {5,1,6},
+  {1,5,8}, {11,1,8}, {0,11,8}, {0,10,8},
+  {13,1,8}, {12,1,8}, {3,2,8}, {1,4,8},
+  {2,1,5}, {2,1,5}, {2,1,5}, {2,1,5},
+  {2,1,5}, {2,1,5}, {2,1,5}, {2,1,5},
+  {1,2,5}, {1,2,5}, {1,2,5}, {1,2,5},
+  {1,2,5}, {1,2,5}, {1,2,5}, {1,2,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4}, /* EOB */
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,4,5}, {0,4,5}, {0,4,5}, {0,4,5},
+  {0,4,5}, {0,4,5}, {0,4,5}, {0,4,5},
+  {0,5,5}, {0,5,5}, {0,5,5}, {0,5,5},
+  {0,5,5}, {0,5,5}, {0,5,5}, {0,5,5},
+  {9,1,7}, {9,1,7}, {1,3,7}, {1,3,7},
+  {10,1,7}, {10,1,7}, {0,8,7}, {0,8,7},
+  {0,9,7}, {0,9,7}, {0,12,8}, {0,13,8},
+  {2,3,8}, {4,2,8}, {0,14,8}, {0,15,8}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0000001000 ... 0000001111
+ */
+static const DCTtab DCTtab1[8] =
+{
+  {16,1,10}, {5,2,10}, {0,7,10}, {2,3,10},
+  {1,4,10}, {15,1,10}, {14,1,10}, {4,2,10}
+};
+
+/* Table B-15, DCT coefficients table one,
+ * codes 000000100x ... 000000111x
+ */
+static const DCTtab DCTtab1a[8] =
+{
+  {5,2,9}, {5,2,9}, {14,1,9}, {14,1,9},
+  {2,4,10}, {16,1,10}, {15,1,9}, {15,1,9}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 000000010000 ... 000000011111
+ */
+static const DCTtab DCTtab2[16] =
+{
+  {0,11,12}, {8,2,12}, {4,3,12}, {0,10,12},
+  {2,4,12}, {7,2,12}, {21,1,12}, {20,1,12},
+  {0,9,12}, {19,1,12}, {18,1,12}, {1,5,12},
+  {3,3,12}, {0,8,12}, {6,2,12}, {17,1,12}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 0000000010000 ... 0000000011111
+ */
+static const DCTtab DCTtab3[16] =
+{
+  {10,2,13}, {9,2,13}, {5,3,13}, {3,4,13},
+  {2,5,13}, {1,7,13}, {1,6,13}, {0,15,13},
+  {0,14,13}, {0,13,13}, {0,12,13}, {26,1,13},
+  {25,1,13}, {24,1,13}, {23,1,13}, {22,1,13}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 00000000010000 ... 00000000011111
+ */
+static const DCTtab DCTtab4[16] =
+{
+  {0,31,14}, {0,30,14}, {0,29,14}, {0,28,14},
+  {0,27,14}, {0,26,14}, {0,25,14}, {0,24,14},
+  {0,23,14}, {0,22,14}, {0,21,14}, {0,20,14},
+  {0,19,14}, {0,18,14}, {0,17,14}, {0,16,14}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 000000000010000 ... 000000000011111
+ */
+static const DCTtab DCTtab5[16] =
+{
+  {0,40,15}, {0,39,15}, {0,38,15}, {0,37,15},
+  {0,36,15}, {0,35,15}, {0,34,15}, {0,33,15},
+  {0,32,15}, {1,14,15}, {1,13,15}, {1,12,15},
+  {1,11,15}, {1,10,15}, {1,9,15}, {1,8,15}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 0000000000010000 ... 0000000000011111
+ */
+static const DCTtab DCTtab6[16] =
+{
+  {1,18,16}, {1,17,16}, {1,16,16}, {1,15,16},
+  {6,3,16}, {16,2,16}, {15,2,16}, {14,2,16},
+  {13,2,16}, {12,2,16}, {11,2,16}, {31,1,16},
+  {30,1,16}, {29,1,16}, {28,1,16}, {27,1,16}
+};
+
 #endif//__VLC_H__
diff --git a/pcsx2/NakedAsm.h b/pcsx2/NakedAsm.h
index c14e6e7d6b..0446e243c8 100644
--- a/pcsx2/NakedAsm.h
+++ b/pcsx2/NakedAsm.h
@@ -17,17 +17,6 @@
 #ifndef NAKED_ASM_H
 #define NAKED_ASM_H
 
-#include "IPU/coroutine.h"
-
-// Common to Windows and Linux
-extern "C"
-{
-	// acoroutine.S
-	void so_call(coroutine_t coro);
-	void so_resume(void);
-	void so_exit(void);
-}
-
 #ifdef __LINUX__
 
 extern "C"
diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
index 9c67ac3571..f76dc73072 100644
--- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
+++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@@ -1254,14 +1254,6 @@
 				<Filter
 					Name="IPU"
 					>
-					<File
-						RelativePath="..\..\IPU\coroutine.cpp"
-						>
-					</File>
-					<File
-						RelativePath="..\..\IPU\coroutine.h"
-						>
-					</File>
 					<File
 						RelativePath="..\..\Ipu\IPU.cpp"
 						>
@@ -1270,7 +1262,7 @@
 							>
 							<Tool
 								Name="VCCLCompilerTool"
-								UsePrecompiledHeader="0"
+								UsePrecompiledHeader="2"
 							/>
 						</FileConfiguration>
 						<FileConfiguration
@@ -1302,7 +1294,7 @@
 							>
 							<Tool
 								Name="VCCLCompilerTool"
-								UsePrecompiledHeader="0"
+								UsePrecompiledHeader="2"
 							/>
 						</FileConfiguration>
 						<FileConfiguration