Added a threadless state managed IPU. The code is still in it's early stages and will now be worked on to optimize for speed. The first optimization is to increase the read size in Vlc.h from 32 bit to 64 bit.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3568 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-07-26 10:18:28 +00:00 · 2010-07-26 10:18:28 +00:00 · 5165350170
parent 83604ec59d
commit 5165350170
12 changed files with 1329 additions and 1893 deletions
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@ -36,12 +36,9 @@

 // IPU Inline'd IRQs : Calls the IPU interrupt handlers directly instead of
 // feeding them through the EE's branch test. (see IPU.h for details)
-
-
-
-static tIPU_DMA g_nDMATransfer(0);
-static tIPU_cmd ipu_cmd;
-static IPUStatus IPU1Status;
+tIPU_DMA g_nDMATransfer(0);
+tIPU_cmd ipu_cmd;
+IPUStatus IPU1Status;

 // FIXME - g_nIPU0Data and Pointer are not saved in the savestate, which breaks savestates for some
 // FMVs at random (if they get saved during the half frame of a 30fps rate).  The fix is complicated
@ -53,9 +50,6 @@ void ReorderBitstream();

 // the BP doesn't advance and returns -1 if there is no data to be read
 tIPU_BP g_BP;
-static coroutine_t s_routine; // used for executing BDEC/IDEC
-static int s_RoutineDone = 0;
-static u32 s_tempstack[0x4000]; // 64k

 void IPUWorker();

@ -78,7 +72,7 @@ __aligned16 macroblock_rgb16 rgb16;
 u8 indx4[16*16/2];
 bool mpeg2_inited = false;		//mpeg2_idct_init() must be called only once
 u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'};
-decoder_t g_decoder;						//static, only to place it in bss
+decoder_t decoder;						//static, only to place it in bss
 decoder_t tempdec;

 extern "C"
@ -98,14 +92,14 @@ __forceinline void IPUProcessInterrupt()
 void init_g_decoder()
 {
 	//other stuff
-	g_decoder.intra_quantizer_matrix = (u8*)iq;
-	g_decoder.non_intra_quantizer_matrix = (u8*)niq;
-	g_decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
-	g_decoder.mb8 = &mb8;
-	g_decoder.mb16 = &mb16;
-	g_decoder.rgb32 = &rgb32;
-	g_decoder.rgb16 = &rgb16;
-	g_decoder.stride = 16;
+	decoder.intra_quantizer_matrix = (u8*)iq;
+	decoder.non_intra_quantizer_matrix = (u8*)niq;
+	decoder.picture_structure = FRAME_PICTURE;	//default: progressive...my guess:P
+	decoder.mb8 = &mb8;
+	decoder.mb16 = &mb16;
+	decoder.rgb32 = &rgb32;
+	decoder.rgb16 = &rgb16;
+	decoder.stride = 16;
 }

 void mpeg2_init()
@ -159,7 +153,7 @@ void ReportIPU()
 	Console.WriteLn("vqclut = 0x%x.", vqclut);
 	Console.WriteLn("s_thresh = 0x%x.", s_thresh);
 	Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
-	Console.WriteLn("g_decoder = 0x%x.", g_decoder);
+	Console.WriteLn("g_decoder = 0x%x.", decoder);
 	Console.WriteLn("mpeg2: scan_norm = 0x%x, alt = 0x%x.", mpeg2_scan_norm, mpeg2_scan_alt);
 	Console.WriteLn(ipu_cmd.desc());
 	Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
@ -186,7 +180,7 @@ void SaveStateBase::ipuFreeze()
 	Freeze(vqclut);
 	Freeze(s_thresh);
 	Freeze(coded_block_pattern);
-	Freeze(g_decoder);
+	Freeze(decoder);
 	Freeze(mpeg2_scan_norm);
 	Freeze(mpeg2_scan_alt);

@ -377,72 +371,67 @@ static void ipuBCLR(u32 val)
 	IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP);
 }

-static BOOL ipuIDEC(u32 val)
+static BOOL ipuIDEC(u32 val, bool resume)
 {
 	tIPU_CMD_IDEC idec(val);

-	idec.log();
-	g_BP.BP += idec.FB;//skip FB bits
-	//from IPU_CTRL
-	ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
-	g_decoder.coding_type = ipuRegs->ctrl.PCT;
-	g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-	g_decoder.q_scale_type	= ipuRegs->ctrl.QST;
-	g_decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-	g_decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-	g_decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+	if (!resume)
+	{
+		idec.log();
+		g_BP.BP += idec.FB;//skip FB bits
+		//from IPU_CTRL
+		ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
+		decoder.coding_type = ipuRegs->ctrl.PCT;
+		decoder.mpeg1 = ipuRegs->ctrl.MP1;
+		decoder.q_scale_type	= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
+		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
+		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;

-	//from IDEC value
-	g_decoder.quantizer_scale = idec.QSC;
-	g_decoder.frame_pred_frame_dct = !idec.DTD;
-	g_decoder.sgn = idec.SGN;
-	g_decoder.dte = idec.DTE;
-	g_decoder.ofm = idec.OFM;
+		//from IDEC value
+		decoder.quantizer_scale = idec.QSC;
+		decoder.frame_pred_frame_dct = !idec.DTD;
+		decoder.sgn = idec.SGN;
+		decoder.dte = idec.DTE;
+		decoder.ofm = idec.OFM;

-	//other stuff
-	g_decoder.dcr = 1; // resets DC prediction value
+		//other stuff
+		decoder.dcr = 1; // resets DC prediction value
+	}

-	s_routine = so_create(mpeg2sliceIDEC, &s_RoutineDone, s_tempstack, sizeof(s_tempstack));
-	pxAssert(s_routine != NULL);
-	so_call(s_routine);
-	if (s_RoutineDone) s_routine = NULL;
-
-	return s_RoutineDone;
+	return mpeg2sliceIDEC();
 }

 static int s_bdec = 0;

-static __forceinline BOOL ipuBDEC(u32 val)
+static __forceinline BOOL ipuBDEC(u32 val, bool resume)
 {
 	tIPU_CMD_BDEC bdec(val);

-	bdec.log(s_bdec);
-	if (IsDebugBuild) s_bdec++;
+	if (!resume)
+	{
+		bdec.log(s_bdec);
+		if (IsDebugBuild) s_bdec++;

-	g_BP.BP += bdec.FB;//skip FB bits
-	g_decoder.coding_type = I_TYPE;
-	g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-	g_decoder.q_scale_type	= ipuRegs->ctrl.QST;
-	g_decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
-	g_decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
-	g_decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
+		g_BP.BP += bdec.FB;//skip FB bits
+		decoder.coding_type = I_TYPE;
+		decoder.mpeg1 = ipuRegs->ctrl.MP1;
+		decoder.q_scale_type	= ipuRegs->ctrl.QST;
+		decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
+		decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
+		decoder.intra_dc_precision = ipuRegs->ctrl.IDP;

-	//from BDEC value
-	/* JayteeMaster: the quantizer (linear/non linear) depends on the q_scale_type */
-	g_decoder.quantizer_scale = g_decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
-	g_decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
-	g_decoder.dcr = bdec.DCR;
-	g_decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
+		//from BDEC value
+		decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
+		decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
+		decoder.dcr = bdec.DCR;
+		decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;

-	memzero(mb8);
-	memzero(mb16);
+		memzero(mb8);
+		memzero(mb16);
+	}

-	s_routine = so_create(mpeg2_slice, &s_RoutineDone, s_tempstack, sizeof(s_tempstack));
-	pxAssert(s_routine != NULL);
-	so_call(s_routine);
-
-	if (s_RoutineDone) s_routine = NULL;
-	return s_RoutineDone;
+	return mpeg2_slice();
 }

 static BOOL __fastcall ipuVDEC(u32 val)
@ -451,34 +440,34 @@ static BOOL __fastcall ipuVDEC(u32 val)
 	{
 		case 0:
 			ipuRegs->cmd.DATA = 0;
-			if (!getBits32((u8*)&g_decoder.bitstream_buf, 0)) return FALSE;
+			if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return FALSE;

-			g_decoder.bitstream_bits = -16;
-			BigEndian(g_decoder.bitstream_buf, g_decoder.bitstream_buf);
+			decoder.bitstream_bits = -16;
+			BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);

 			switch ((val >> 26) & 3)
 			{
 				case 0://Macroblock Address Increment
-					g_decoder.mpeg1 = ipuRegs->ctrl.MP1;
-					ipuRegs->cmd.DATA = get_macroblock_address_increment(&g_decoder);
+					decoder.mpeg1 = ipuRegs->ctrl.MP1;
+					ipuRegs->cmd.DATA = get_macroblock_address_increment();
 					break;

-				case 1://Macroblock Type	//known issues: no error detected
-					g_decoder.frame_pred_frame_dct = 1;//prevent DCT_TYPE_INTERLACED
-					g_decoder.coding_type = ipuRegs->ctrl.PCT;
-					ipuRegs->cmd.DATA = get_macroblock_modes(&g_decoder);
+				case 1://Macroblock Type
+					decoder.frame_pred_frame_dct = 1;
+					decoder.coding_type = ipuRegs->ctrl.PCT;
+					ipuRegs->cmd.DATA = get_macroblock_modes();
 					break;

-				case 2://Motion Code		//known issues: no error detected
-					ipuRegs->cmd.DATA = get_motion_delta(&g_decoder, 0);
+				case 2://Motion Code
+					ipuRegs->cmd.DATA = get_motion_delta(0);
 					break;

 				case 3://DMVector
-					ipuRegs->cmd.DATA = get_dmv(&g_decoder);
+					ipuRegs->cmd.DATA = get_dmv();
 					break;
 			}

-			g_BP.BP += (g_decoder.bitstream_bits + 16);
+			g_BP.BP += (int)decoder.bitstream_bits + 16;

 			if ((int)g_BP.BP < 0)
 			{
@ -486,9 +475,7 @@ static BOOL __fastcall ipuVDEC(u32 val)
 				ReorderBitstream();
 			}

-			FillInternalBuffer(&g_BP.BP, 1, 0);
-
-			ipuRegs->cmd.DATA = (ipuRegs->cmd.DATA & 0xFFFF) | ((g_decoder.bitstream_bits + 16) << 16);
+			ipuRegs->cmd.DATA = (ipuRegs->cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16);
 			ipuRegs->ctrl.ECD = (ipuRegs->cmd.DATA == 0);

 		case 1:
@ -529,7 +516,10 @@ static BOOL ipuSETIQ(u32 val)

 	if ((val >> 27) & 1)
 	{
-		ipu_cmd.pos[0] += getBits((u8*)niq + ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1); // 8*8*8
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
+		{
+			if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+		}

 		IPU_LOG("Read non-intra quantization matrix from IPU FIFO.");
 		for (i = 0; i < 8; i++)
@ -541,7 +531,10 @@ static BOOL ipuSETIQ(u32 val)
 	}
 	else
 	{
-		ipu_cmd.pos[0] += getBits((u8*)iq + 8 * ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1);
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
+		{
+			if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
+		}

 		IPU_LOG("Read intra quantization matrix from IPU FIFO.");
 		for (i = 0; i < 8; i++)
@ -552,40 +545,40 @@ static BOOL ipuSETIQ(u32 val)
 		}
 	}

-	return ipu_cmd.pos[0] == 64;
+	return TRUE;
 }

 static BOOL ipuSETVQ(u32 val)
 {
-	ipu_cmd.pos[0] += getBits((u8*)vqclut + ipu_cmd.pos[0], 256 - 8 * ipu_cmd.pos[0], 1); // 16*2*8
-
-	if (ipu_cmd.pos[0] == 32)
+	for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)
 	{
-		IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
-		IPU_LOG(
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
-		    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
-		    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
-		    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
-		    vqclut[2] >> 10, (vqclut[2] >> 5) & 0x1F, vqclut[2] & 0x1F,
-		    vqclut[3] >> 10, (vqclut[3] >> 5) & 0x1F, vqclut[3] & 0x1F,
-		    vqclut[4] >> 10, (vqclut[4] >> 5) & 0x1F, vqclut[4] & 0x1F,
-		    vqclut[5] >> 10, (vqclut[5] >> 5) & 0x1F, vqclut[5] & 0x1F,
-		    vqclut[6] >> 10, (vqclut[6] >> 5) & 0x1F, vqclut[6] & 0x1F,
-		    vqclut[7] >> 10, (vqclut[7] >> 5) & 0x1F, vqclut[7] & 0x1F,
-		    vqclut[8] >> 10, (vqclut[8] >> 5) & 0x1F, vqclut[8] & 0x1F,
-		    vqclut[9] >> 10, (vqclut[9] >> 5) & 0x1F, vqclut[9] & 0x1F,
-		    vqclut[10] >> 10, (vqclut[10] >> 5) & 0x1F, vqclut[10] & 0x1F,
-		    vqclut[11] >> 10, (vqclut[11] >> 5) & 0x1F, vqclut[11] & 0x1F,
-		    vqclut[12] >> 10, (vqclut[12] >> 5) & 0x1F, vqclut[12] & 0x1F,
-		    vqclut[13] >> 10, (vqclut[13] >> 5) & 0x1F, vqclut[13] & 0x1F,
-		    vqclut[14] >> 10, (vqclut[14] >> 5) & 0x1F, vqclut[14] & 0x1F,
-		    vqclut[15] >> 10, (vqclut[15] >> 5) & 0x1F, vqclut[15] & 0x1F);
+		if (!getBits64((u8*)vqclut + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 	}

-	return ipu_cmd.pos[0] == 32;
+	IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
+	IPU_LOG(
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d"
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d "
+	    "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d",
+	    vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F,
+	    vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F,
+	    vqclut[2] >> 10, (vqclut[2] >> 5) & 0x1F, vqclut[2] & 0x1F,
+	    vqclut[3] >> 10, (vqclut[3] >> 5) & 0x1F, vqclut[3] & 0x1F,
+	    vqclut[4] >> 10, (vqclut[4] >> 5) & 0x1F, vqclut[4] & 0x1F,
+	    vqclut[5] >> 10, (vqclut[5] >> 5) & 0x1F, vqclut[5] & 0x1F,
+	    vqclut[6] >> 10, (vqclut[6] >> 5) & 0x1F, vqclut[6] & 0x1F,
+	    vqclut[7] >> 10, (vqclut[7] >> 5) & 0x1F, vqclut[7] & 0x1F,
+	    vqclut[8] >> 10, (vqclut[8] >> 5) & 0x1F, vqclut[8] & 0x1F,
+	    vqclut[9] >> 10, (vqclut[9] >> 5) & 0x1F, vqclut[9] & 0x1F,
+	    vqclut[10] >> 10, (vqclut[10] >> 5) & 0x1F, vqclut[10] & 0x1F,
+	    vqclut[11] >> 10, (vqclut[11] >> 5) & 0x1F, vqclut[11] & 0x1F,
+	    vqclut[12] >> 10, (vqclut[12] >> 5) & 0x1F, vqclut[12] & 0x1F,
+	    vqclut[13] >> 10, (vqclut[13] >> 5) & 0x1F, vqclut[13] & 0x1F,
+	    vqclut[14] >> 10, (vqclut[14] >> 5) & 0x1F, vqclut[14] & 0x1F,
+	    vqclut[15] >> 10, (vqclut[15] >> 5) & 0x1F, vqclut[15] & 0x1F);
+
+	return TRUE;
 }

 // IPU Transfers are split into 8Qwords so we need to send ALL the data
@ -596,17 +589,14 @@ static BOOL __fastcall ipuCSC(u32 val)

 	for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
 	{
-
-		if (ipu_cmd.pos[0] < 3072 / 8)
+		for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
 		{
-			ipu_cmd.pos[0] += getBits((u8*) & mb8 + ipu_cmd.pos[0], 3072 - 8 * ipu_cmd.pos[0], 1);
-
-			if (ipu_cmd.pos[0] < 3072 / 8) return FALSE;
-
-			ipu_csc(&mb8, &rgb32, 0);
-			if (csc.OFM) ipu_dither(&rgb32, &rgb16, csc.DTE);
+			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}

+		ipu_csc(&mb8, &rgb32, 0);
+		if (csc.OFM) ipu_dither(&rgb32, &rgb16, csc.DTE);
+		
 		if (csc.OFM)
 		{
 			while (ipu_cmd.pos[1] < 32)
@ -641,18 +631,16 @@ static BOOL ipuPACK(u32 val)

 	for (;ipu_cmd.index < (int)csc.MBC; ipu_cmd.index++)
 	{
-		if (ipu_cmd.pos[0] < 512)
+		for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
 		{
-			ipu_cmd.pos[0] += getBits((u8*) & mb8 + ipu_cmd.pos[0], 512 - 8 * ipu_cmd.pos[0], 1);
-
-			if (ipu_cmd.pos[0] < 64) return FALSE;
-
-			ipu_csc(&mb8, &rgb32, 0);
-			ipu_dither(&rgb32, &rgb16, csc.DTE);
-
-			if (csc.OFM) ipu_vq(&rgb16, indx4);
+			if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
 		}

+		ipu_csc(&mb8, &rgb32, 0);
+		ipu_dither(&rgb32, &rgb16, csc.DTE);
+
+		if (csc.OFM) ipu_vq(&rgb16, indx4);
+		
 		if (csc.OFM)
 		{
 			ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
@ -696,7 +684,7 @@ void IPUCMD_WRITE(u32 val)
 	ipuRegs->ctrl.ECD = 0;
 	ipuRegs->ctrl.SCD = 0; //clear ECD/SCD
 	ipuRegs->cmd.DATA = val;
-	ipu_cmd.pos[0] = 0;
+	ipu_cmd.clear();

 	switch (ipuRegs->cmd.CMD)
 	{
@ -759,29 +747,27 @@ void IPUCMD_WRITE(u32 val)
 			break;

 		case SCE_IPU_IDEC:
-			if (ipuIDEC(val))
+			if (ipuIDEC(val, false))
 			{
 				// idec done, ipu0 done too
 				if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
 				return;
 			}
+
 			ipuRegs->topbusy = 0x80000000;
-			// have to resort to the thread
-			ipu_cmd.current = val >> 28;
-			ipuRegs->ctrl.BUSY = 1;
-			return;
+			break;

 		case SCE_IPU_BDEC:
-			if (ipuBDEC(val))
+			if (ipuBDEC(val, false))
 			{
 				if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
 				if (ipuRegs->ctrl.SCD || ipuRegs->ctrl.ECD) hwIntcIrq(INTC_IPU);
 				return;
 			}
-			ipuRegs->topbusy = 0x80000000;
-			ipu_cmd.current = val >> 28;
-			ipuRegs->ctrl.BUSY = 1;
-			return;
+			else
+			{
+				ipuRegs->topbusy = 0x80000000;
+			}
 	}

 	// have to resort to the thread
@ -850,8 +836,7 @@ void IPUWorker()
 			break;

 		case SCE_IPU_IDEC:
-			so_call(s_routine);
-			if (!s_RoutineDone)
+			if (!ipuIDEC(ipuRegs->cmd.DATA, true))
 			{
 				if(ipu1dma->chcr.STR == false) hwIntcIrq(INTC_IPU);
 				return;
@ -865,12 +850,10 @@ void IPUWorker()

 			// CHECK!: IPU0dma remains when IDEC is done, so we need to clear it
 			if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
-			s_routine = NULL;
 			break;

 		case SCE_IPU_BDEC:
-			so_call(s_routine);
-			if (!s_RoutineDone)
+			if (!ipuBDEC(ipuRegs->cmd.DATA, true))
 			{
 				if(ipu1dma->chcr.STR == false) hwIntcIrq(INTC_IPU);
 				return;
@ -882,7 +865,6 @@ void IPUWorker()
 			ipu_cmd.current = 0xffffffff;

 			if (ipu0dma->qwc > 0 && ipu0dma->chcr.STR) IPU_INT0_FROM();
-			s_routine = NULL;
 			if (ipuRegs->ctrl.SCD || ipuRegs->ctrl.ECD) hwIntcIrq(INTC_IPU);
 			return;

@ -946,7 +928,7 @@ u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
 		g_BP.FP = 1;
 	}
 	
-	if ((g_BP.FP < 2) && (*(int*)pointer + size) >= 128)
+	if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128))
 	{
 		if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1;
 	}
@ -967,6 +949,83 @@ u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size)
 	return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0;
 }

+// whenever reading fractions of bytes. The low bits always come from the next byte
+// while the high bits come from the current byte
+u8 __fastcall getBits128(u8 *address, u32 advance)
+{
+	u64 mask2;
+	u128 mask;
+	u32 shift;
+	u8* readpos;
+
+	// Check if the current BP has exceeded or reached the limit of 128
+	if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0;
+
+	readpos = readbits + (int)g_BP.BP / 8;
+
+	if (g_BP.BP & 7)
+	{
+		shift = g_BP.BP & 7;
+		mask2 = 0xff >> shift;
+		mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);
+		mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56);		
+
+		u128 notMask;
+		u128 data = *(u128*)(readpos + 1);
+		notMask.lo = ~mask.lo & data.lo;
+		notMask.hi = ~mask.hi & data.hi;
+		notMask.lo >>= 8 - shift;
+		notMask.lo |= (notMask.hi & (0xFFFFFFFFFFFFFFFF >> (64 - shift))) << (64 - shift);
+		notMask.hi >>= 8 - shift;
+
+		mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift));
+		mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift;
+		
+		notMask.lo |= mask.lo;
+		notMask.hi |= mask.hi;
+		*(u128*)address = notMask;
+	}
+	else
+	{
+		*(u128*)address = *(u128*)readpos;
+	}
+
+	if (advance) g_BP.BP += 128;
+
+	return 1;
+}
+
+// whenever reading fractions of bytes. The low bits always come from the next byte
+// while the high bits come from the current byte
+u8 __fastcall getBits64(u8 *address, u32 advance)
+{
+	register u64 mask = 0;
+	int shift = 0;
+	u8* readpos;
+
+	// Check if the current BP has exceeded or reached the limit of 128
+	if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0;
+
+	readpos = readbits + (int)g_BP.BP / 8;
+
+	if (g_BP.BP & 7)
+	{
+		shift = g_BP.BP & 7;
+		mask = (0xff >> shift);
+		mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56);
+
+		*(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift);
+	}
+	else
+	{
+		*(u64*)address = *(u64*)readpos;
+	}
+
+	if (advance) g_BP.BP += 64;
+
+	return 1;
+}
+
 // whenever reading fractions of bytes. The low bits always come from the next byte
 // while the high bits come from the current byte
 u8 __fastcall getBits32(u8 *address, u32 advance)
@ -1053,102 +1112,6 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
 	return 1;
 }

-int __fastcall getBits(u8 *address, u32 size, u32 advance)
-{
-	register u32 mask = 0, shift = 0, howmuch;
-	u8* oldbits, *oldaddr = address;
-	u32 pointer = 0, temp;
-
-	// Check if the current BP has exceeded or reached the limit of 128
-	if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8) return 0;
-
-	oldbits = readbits;
-	// Backup the current BP in case of VDEC/FDEC
-	pointer = g_BP.BP;
-
-	if (pointer & 7)
-	{
-		address--;
-		while (size)
-		{
-			if (shift == 0)
-			{
-				*++address = 0;
-				shift = 8;
-			}
-
-			temp = shift; // Lets not pass a register to min.
-			howmuch = min(min(8 - (pointer & 7), 128 - pointer), min(size, temp));
-
-			if (FillInternalBuffer(&pointer, advance, 8) < 8)
-			{
-				if (advance) g_BP.BP = pointer;
-				return address - oldaddr;
-			}
-
-			mask = ((0xFF >> (pointer & 7)) << (8 - howmuch - (pointer & 7))) & 0xFF;
-			mask &= readbits[((pointer) >> 3)];
-			mask >>= 8 - howmuch - (pointer & 7);
-			pointer += howmuch;
-			size -= howmuch;
-			shift -= howmuch;
-			*address |= mask << shift;
-		}
-		++address;
-	}
-	else
-	{
-		u8* readmem;
-		while (size)
-		{
-			if (FillInternalBuffer(&pointer, advance, 8) < 8)
-			{
-				if (advance) g_BP.BP = pointer;
-				return address -oldaddr;
-			}
-
-			howmuch = min(128 - pointer, size);
-			size -= howmuch;
-
-			readmem = readbits + (pointer >> 3);
-			pointer += howmuch;
-			howmuch >>= 3;
-
-			while (howmuch >= 4)
-			{
-				*(u32*)address = *(u32*)readmem;
-				howmuch -= 4;
-				address += 4;
-				readmem += 4;
-			}
-
-			switch (howmuch)
-			{
-				case 3:
-					address[2] = readmem[2];
-				case 2:
-					address[1] = readmem[1];
-				case 1:
-					address[0] = readmem[0];
-				case 0:
-					break;
-
-					jNO_DEFAULT
-			}
-
-			address += howmuch;
-		}
-	}
-
-	// If not advance then reset the Reading buffer value
-	if (advance)
-		g_BP.BP = pointer;
-	else
-		readbits = oldbits; // restore the last pointer
-
-	return address - oldaddr;
-}
-
 ///////////////////// CORE FUNCTIONS /////////////////
 void Skl_YUV_To_RGB32_MMX(u8 *RGB, const int Dst_BpS, const u8 *Y, const u8 *U, const u8 *V,
                          const int Src_BpS, const int Width, const int Height);
@ -1244,7 +1207,7 @@ static __forceinline void ipuDmacSrcChain()
 		{
 			case TAG_REFE: // refe
 				//if(IPU1Status.InProgress == false) ipu1dma->tadr += 16;
-				if(IPU1Status.DMAFinished == false) IPU1Status.DMAFinished = true;
+				IPU1Status.DMAFinished = true;
 				break;
 			case TAG_CNT: // cnt
 				// Set the taddr to the next tag
@ -1264,7 +1227,7 @@ static __forceinline void ipuDmacSrcChain()

 			case TAG_END: // end
 				ipu1dma->tadr = ipu1dma->madr;
-				if(IPU1Status.DMAFinished == false) IPU1Status.DMAFinished = true;
+				IPU1Status.DMAFinished = true;
 				break;
 		}
 }
@ -1300,7 +1263,6 @@ static __forceinline int IPU1chain() {

 	if (ipu1dma->qwc > 0 && IPU1Status.InProgress == true)
 	{
-
 		int qwc = ipu1dma->qwc;
 		u32 *pMem;

@ -1308,7 +1270,8 @@ static __forceinline int IPU1chain() {

 		if (pMem == NULL)
 		{
-			Console.Error("ipu1dma NULL!"); return totalqwc;
+			Console.Error("ipu1dma NULL!");
+			return totalqwc;
 		}

 		//Write our data to the fifo
@ -1484,7 +1447,6 @@ int IPU1dma()
 	}
 	else 
 	{
-		IPU_LOG("Here");
 		cpuRegs.eCycle[4] = 0x9999;//IPU_INT_TO(2048);
 	}

@ -1601,7 +1563,6 @@ __forceinline void dmaIPU1() // toIPU

 		IPU1Status.DMAMode = DMA_MODE_CHAIN;
 		IPU1dma();
-		//if (ipuRegs->ctrl.BUSY) IPUWorker();
 	}
 	else //Normal Mode
 	{
@ -1623,7 +1584,6 @@ __forceinline void dmaIPU1() // toIPU
 			IPU1Status.DMAFinished = true;
 			IPU1Status.DMAMode = DMA_MODE_NORMAL;
 			IPU1dma();
-			//if (ipuRegs->ctrl.BUSY) IPUWorker();
 		}
 	}
 }
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@ -17,7 +17,6 @@
 #define __IPU_H__

 #include "mpeg2lib/Mpeg.h"
-#include "coroutine.h"
 #include "IPU_Fifo.h"

 #ifdef _MSC_VER
@ -327,7 +326,7 @@ struct IPUregisters {
 struct tIPU_cmd
 {
 	int index;
-	int pos[2];
+	int pos[6];
 	int current;
 	void clear()
 	{
@ -342,12 +341,13 @@ struct tIPU_cmd
 	}
 };

-//extern tIPU_cmd ipu_cmd;
+extern tIPU_cmd ipu_cmd;
 extern tIPU_BP g_BP;
 extern int coded_block_pattern;
 extern int g_nIPU0Data; // or 0x80000000 whenever transferring
 extern u8* g_pIPU0Pointer;
-
+extern IPUStatus IPU1Status;
+extern tIPU_DMA g_nDMATransfer;
 // The IPU can only do one task at once and never uses other buffers so these
 // should be made available to functions in other modules to save registers.
 extern __aligned16 macroblock_rgb32	rgb32;
@ -376,10 +376,11 @@ extern int IPU0dma();
 extern int IPU1dma();

 extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size);
+extern u8 __fastcall getBits128(u8 *address, u32 advance);
+extern u8 __fastcall getBits64(u8 *address, u32 advance);
 extern u8 __fastcall getBits32(u8 *address, u32 advance);
 extern u8 __fastcall getBits16(u8 *address, u32 advance);
 extern u8 __fastcall getBits8(u8 *address, u32 advance);
-extern int __fastcall getBits(u8 *address, u32 size, u32 advance);


 #endif
--- a/pcsx2/IPU/IPU_Fifo.cpp
+++ b/pcsx2/IPU/IPU_Fifo.cpp
@ -13,7 +13,6 @@
 *  If not, see <http://www.gnu.org/licenses/>.
 */

-
 #include "PrecompiledHeader.h"
 #include "Common.h"
 #include "IPU_Fifo.h"
@ -106,20 +105,18 @@ int IPU_Fifo_Output::write(const u32 *value, int size)

 	ipuRegs->ctrl.OFC += firsttrans;
 	IPU0dma();
-	//Console.WriteLn("Written %d qwords, %d", firsttrans,ipuRegs->ctrl.OFC);

 	return firsttrans;
 }

 int IPU_Fifo_Input::read(void *value)
 {
-	// wait until enough data
-	if (g_BP.IFC < 8)
+	// wait until enough data to ensure proper streaming.
+	if (g_BP.IFC < 4)
 	{
 		// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
 		if(cpuRegs.eCycle[4] == 0x9999)
 		{
-			//DevCon.Warning("Setting ECycle");
 			CPU_INT( DMAC_TO_IPU, 4 );
 		}
 		
--- a/pcsx2/IPU/acoroutine.S
+++ b/pcsx2/IPU/acoroutine.S
@ -1,78 +0,0 @@
-.intel_syntax noprefix
-
-.extern g_pCurrentRoutine
-
-.globl so_call
-so_call:
-		mov eax, dword ptr [esp+4]
-        test dword ptr [eax+24], 1
-        jnz RestoreRegs
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov dword ptr [eax+24], 1
-        jmp CallFn
-RestoreRegs:
-        // have to load and save at the same time
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-CallFn:
-        mov [g_pCurrentRoutine], eax
-        mov ecx, esp
-        mov esp, [eax+4]
-        mov [eax+4], ecx
-
-        jmp dword ptr [eax]
-
-.globl so_resume
-so_resume:
-		mov eax, [g_pCurrentRoutine]
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-        // put the return address in pcalladdr
-        mov ecx, [esp]
-        mov [eax], ecx
-        add esp, 4 // remove the return address
-
-        // swap stack pointers
-        mov ecx, [eax+4]
-        mov [eax+4], esp
-        mov esp, ecx
-        ret
-
-.globl so_exit
-so_exit:
-		mov eax, [g_pCurrentRoutine]
-        mov esp, [eax+4]
-        mov ebx, [eax+8]
-        mov esi, [eax+12]
-        mov edi, [eax+16]
-        mov ebp, [eax+20]
-        ret
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
--- a/pcsx2/IPU/acoroutine.asm
+++ b/pcsx2/IPU/acoroutine.asm
@ -1,140 +0,0 @@
-; Pcsx2 - Pc Ps2 Emulator
-; Copyright (C) 2002-2008  Pcsx2 Team
-;
-; This program is free software; you can redistribute it and/or modify
-; it under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-
-; This program is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with this program; if not, write to the Free Software
-; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
-
-;; x86-64 coroutine fucntions
-extern g_pCurrentRoutine:ptr
-
-.code
-
-so_call proc public
-        test dword ptr [rcx+88], 1
-        jnz so_call_RestoreRegs
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov dword ptr [rcx+88], 1
-        jmp so_call_CallFn
-so_call_RestoreRegs:
-		;; have to load and save at the same time
-        ;; rbp, rbx, r12
-        mov rax, [rcx+24]
-        mov r8, [rcx+16]
-        mov rdx, [rcx+32]
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov rbp, rax
-        mov rbx, r8
-        mov r12, rdx
-        ;; r13, r14, r15
-        mov rax, [rcx+40]
-        mov r8, [rcx+48]
-        mov rdx, [rcx+56]
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov r13, rax
-        mov r14, r8
-        mov r15, rdx
-
-        ;; rsi, rdi
-        mov rax, [rcx+64]
-        mov rdx, [rcx+72]
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov rsi, rax
-        mov rdi, rdx
-        
-so_call_CallFn:
-        mov [g_pCurrentRoutine], rcx
-
-		;; swap the stack
-        mov rax, [rcx+8]
-        mov [rcx+8], rsp
-        mov rsp, rax
-        mov rax, [rcx+0]
-        mov rcx, [rcx+80]
-
-        jmp rax
-
-so_call endp
-
-; so_resume
-so_resume proc public
-        ;; rbp, rbx, r12
-        mov rcx, [g_pCurrentRoutine]
-        mov rax, [rcx+24]
-        mov r8, [rcx+16]
-        mov rdx, [rcx+32]
-        mov [rcx+24], rbp
-        mov [rcx+16], rbx
-        mov [rcx+32], r12
-        mov rbp, rax
-        mov rbx, r8
-        mov r12, rdx
-        ;; r13, r14, r15
-        mov rax, [rcx+40]
-        mov r8, [rcx+48]
-        mov rdx, [rcx+56]
-        mov [rcx+40], r13
-        mov [rcx+48], r14
-        mov [rcx+56], r15
-        mov r13, rax
-        mov r14, r8
-        mov r15, rdx
-        ;; rsi, rdi
-        mov rax, [rcx+64]
-        mov rdx, [rcx+72]
-        mov [rcx+64], rsi
-        mov [rcx+72], rdi
-        mov rsi, rax
-        mov rdi, rdx
-
-		;; put the return address in pcalladdr
-        mov rax, [rsp]
-        mov [rcx], rax
-        add rsp, 8 ;; remove the return address
-
-		;; swap stack pointers
-        mov rax, [rcx+8]
-        mov [rcx+8], rsp
-        mov rsp, rax
-
-        ret
-
-so_resume endp
-
-so_exit proc public
-        mov rcx, [g_pCurrentRoutine]
-        mov rsp, [rcx+8]
-        mov rbp, [rcx+24]
-        mov rbx, [rcx+16]
-        mov r12, [rcx+32]
-        mov r13, [rcx+40]
-        mov r14, [rcx+48]
-        mov r15, [rcx+56]
-        mov rsi, [rcx+64]
-        mov rdi, [rcx+72]
-        ret
-so_exit endp
-
-end
--- a/pcsx2/IPU/coroutine.cpp
+++ b/pcsx2/IPU/coroutine.cpp
@ -1,153 +0,0 @@
-/*  PCSX2 - PS2 Emulator for PCs
- *  Copyright (C) 2002-2010  PCSX2 Dev Team
- *
- *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
- *  of the GNU Lesser General Public License as published by the Free Software Found-
- *  ation, either version 3 of the License, or (at your option) any later version.
- *
- *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- *  PURPOSE.  See the GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along with PCSX2.
- *  If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#include "PrecompiledHeader.h"
-
-#include "coroutine.h"
-
-struct coroutine {
-	void* pcalladdr;
-	void *pcurstack;
-
-    uptr storeebx, storeesi, storeedi, storeebp;
-
-    s32 restore; // if nonzero, restore the registers
-    s32 alloc;
-	//struct s_coroutine *caller;
-	//struct s_coroutine *restarget;
-
-};
-
-#define CO_STK_ALIGN 256
-#define CO_STK_COROSIZE ((sizeof(coroutine) + CO_STK_ALIGN - 1) & ~(CO_STK_ALIGN - 1))
-#define CO_MIN_SIZE (4 * 1024)
-
-coroutine* g_pCurrentRoutine;
-
-coroutine_t so_create(void (*func)(void *), void *data, void *stack, int size)
-{
-	void* endstack;
-	int alloc = 0; // r = CO_STK_COROSIZE;
-	coroutine *co;
-
-	if ((size &= ~(sizeof(s32) - 1)) < CO_MIN_SIZE) return NULL;
-	if (!stack) {
-		size = (size + sizeof(coroutine) + CO_STK_ALIGN - 1) & ~(CO_STK_ALIGN - 1);
-		stack = malloc(size);
-		if (!stack) return NULL;
-		alloc = size;
-	}
-	endstack = (char*)stack + size - 64;
-	co = (coroutine*)stack;
-	stack = (char *) stack + CO_STK_COROSIZE;
-	*(void**)endstack = NULL;
-	*(void**)((char*)endstack+sizeof(void*)) = data;
-	co->alloc = alloc;
-	co->pcalladdr = (void*)func;
-	co->pcurstack = endstack;
-	return co;
-}
-
-void so_delete(coroutine_t coro)
-{
-	coroutine *co = (coroutine *) coro;
-	pxAssert( co != NULL );
-	if (co->alloc) free(co);
-}
-
-// see acoroutines.S and acoroutines.asm for other asm implementations
-#if defined(_MSC_VER)
-
-__declspec(naked) void so_call(coroutine_t coro)
-{
-    __asm {
-        mov eax, dword ptr [esp+4]
-        test dword ptr [eax+24], 1
-        jnz RestoreRegs
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov dword ptr [eax+24], 1
-        jmp CallFn
-RestoreRegs:
-        // have to load and save at the same time
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-CallFn:
-        mov [g_pCurrentRoutine], eax
-        mov ecx, esp
-        mov esp, [eax+4]
-        mov [eax+4], ecx
-
-        jmp dword ptr [eax]
-    }
-}
-
-__declspec(naked) void so_resume(void)
-{
-    __asm {
-        mov eax, [g_pCurrentRoutine]
-        mov ecx, [eax+8]
-        mov edx, [eax+12]
-        mov [eax+8], ebx
-        mov [eax+12], esi
-        mov ebx, ecx
-        mov esi, edx
-        mov ecx, [eax+16]
-        mov edx, [eax+20]
-        mov [eax+16], edi
-        mov [eax+20], ebp
-        mov edi, ecx
-        mov ebp, edx
-
-        // put the return address in pcalladdr
-        mov ecx, [esp]
-        mov [eax], ecx
-        add esp, 4 // remove the return address
-
-        // swap stack pointers
-        mov ecx, [eax+4]
-        mov [eax+4], esp
-        mov esp, ecx
-        ret
-    }
-}
-
-__declspec(naked) void so_exit(void)
-{
-    __asm {
-        mov eax, [g_pCurrentRoutine]
-        mov esp, [eax+4]
-        mov ebx, [eax+8]
-        mov esi, [eax+12]
-        mov edi, [eax+16]
-        mov ebp, [eax+20]
-        ret
-    }
-}
-#endif
--- a/pcsx2/IPU/coroutine.h
+++ b/pcsx2/IPU/coroutine.h
@ -1,27 +0,0 @@
-/*  PCSX2 - PS2 Emulator for PCs
- *  Copyright (C) 2002-2010  PCSX2 Dev Team
- *
- *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
- *  of the GNU Lesser General Public License as published by the Free Software Found-
- *  ation, either version 3 of the License, or (at your option) any later version.
- *
- *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- *  PURPOSE.  See the GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along with PCSX2.
- *  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef PCSX2_COROUTINE_LIB
-#define PCSX2_COROUTINE_LIB
-
-// low level coroutine library
-typedef void *coroutine_t;
-
-coroutine_t so_create(void (*func)(void *), void *data, void *stack, int size);
-void so_delete(coroutine_t coro);
-
-#include "NakedAsm.h"
-
-#endif
--- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@ -99,7 +99,6 @@ struct decoder_t {
 	/* bit parsing stuff */
 	u32 bitstream_buf;		/* current 32 bit working set */
 	int bitstream_bits;			/* used bits in working set */
-	u8 * bitstream_ptr;			/* buffer with stream data; 128 bits buffer */

 	struct macroblock_8		*mb8;
 	struct macroblock_16	*mb16;
@ -173,13 +172,13 @@ extern void (__fastcall *mpeg2_idct_add) (int last, s16 * block, s16* dest, int
 #define IDEC	0
 #define BDEC	1

-void mpeg2sliceIDEC(void* pdone);
-void mpeg2_slice(void* pdone);
-int get_macroblock_address_increment(decoder_t * const decoder);
-int get_macroblock_modes (decoder_t * const decoder);
+bool mpeg2sliceIDEC();
+bool mpeg2_slice();
+int get_macroblock_address_increment();
+int get_macroblock_modes();

-extern int get_motion_delta (decoder_t * const decoder, const int f_code);
-extern int get_dmv (decoder_t * const decoder);
+extern int get_motion_delta(const int f_code);
+extern int get_dmv();

 extern int non_linear_quantizer_scale[];
 extern decoder_t g_decoder;
@ -189,7 +188,7 @@ void __fastcall ipu_dither(const macroblock_rgb32* rgb32, macroblock_rgb16 *rgb1
 void __fastcall ipu_vq(macroblock_rgb16 *rgb16, u8* indx4);
 void __fastcall ipu_copy(const macroblock_8 *mb8, macroblock_16 *mb16);

-int slice (decoder_t * const decoder, u8 * buffer);
+int slice (u8 * buffer);
 /* idct.c */
 void mpeg2_idct_init ();

@ -199,4 +198,10 @@ void mpeg2_idct_init ();
 #define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
 #endif

+#ifdef _MSC_VER
+#define BigEndian64(out, in) out = _byteswap_uint64(in)
+#else
+#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
+#endif
+
 #endif//__MPEG_H__
--- a/pcsx2/IPU/mpeg2lib/Vlc.h
+++ b/pcsx2/IPU/mpeg2lib/Vlc.h
@ -25,55 +25,70 @@
 #ifndef __VLC_H__
 #define __VLC_H__

-#include "IPU/coroutine.h"
-
 static u8 data[2];
-static u8 dword[4];
+//static u8 word[4];
+//static u8 dword[8];
+//static u8 qword[16];
 extern tIPU_BP g_BP;
-extern decoder_t g_decoder;
+extern decoder_t decoder;
 extern void ReorderBitstream();

-static __forceinline void GETWORD(u32 * bit_buf,int bits)
+static __forceinline int GETWORD()
 {
-	while(!getBits16(data,1))
+	if (decoder.bitstream_bits > 0)
 	{
-		so_resume();
+		if(!getBits16(data,1))
+		{
+			return 0;
+		}
+		
+		/*u32 data;
+		BigEndian(data, *(u32*)word);
+		decoder.bitstream_buf |=  (u64)data << decoder.bitstream_bits;
+		decoder.bitstream_bits -= 32;*/
+		decoder.bitstream_buf |= ((u32)(((u16)data[0] << 8) | data[1])) << decoder.bitstream_bits;
+		decoder.bitstream_bits -= 16;
 	}
-	*bit_buf |= ((data[0] << 8) | data[1]) << (bits);
+
+	return 1;
 }

-static __forceinline void bitstream_init (decoder_t * decoder){
-    decoder->bitstream_bits = -16;
+static __forceinline int bitstream_init ()
+{
+	if (!getBits32((u8*)&decoder.bitstream_buf, 1))
+	{
+		return 0;
+	}

-	while( !getBits32(dword, 1) )
-		so_resume();
+	decoder.bitstream_bits = -16;
+	BigEndian(decoder.bitstream_buf, decoder.bitstream_buf);
+	/*decoder.bitstream_buf = *(u64*)dword;
+	BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/

-	decoder->bitstream_buf = (dword[0] << 24) | (dword[1] << 16) |
-							 (dword[2] <<  8) |dword[3];
+	return 1;
 }

-/* make sure that there are at least 16 valid bits in bit_buf */
-#define NEEDBITS(bit_buf,bits,bit_ptr)		\
-do {						\
-    if (bits > 0) {			\
-	GETWORD(&bit_buf,bits);	\
-	bits -= 16;	\
-    }						\
-} while (0)
-
 /* remove num valid bits from bit_buf */
-#define DUMPBITS(bit_buf,bits,num)	\
-do {					\
-	/*IPU_LOG("DUMPBITS %d\n",num);*/	\
-    bit_buf <<= (num);			\
-    bits += (num);			\
-} while (0)
+static __forceinline void DUMPBITS(int num)
+{
+	decoder.bitstream_buf <<= num;
+    decoder.bitstream_bits += num;
+}

 /* take num bits from the high part of bit_buf and zero extend them */
-#define UBITS(bit_buf,num) (((u32)(bit_buf)) >> (32 - (num)))
+#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num)))

 /* take num bits from the high part of bit_buf and sign extend them */
-#define SBITS(bit_buf,num) (((s32)(bit_buf)) >> (32 - (num)))
+#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num)))
+
+/* Get bits from bitstream */
+static __forceinline u32 GETBITS(int num)
+{
+	u16 retVal = UBITS(num);
+	DUMPBITS(num);
+
+	return retVal;
+}

 struct MBtab {
    u8 modes;
@ -443,4 +458,247 @@ static const MBAtab MBA_11 [] = {
    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
 };
+
+// New
+
+
+/* Table B-1, macroblock_address_increment, codes 00010 ... 011xx */
+static MBAtab MBAtab1[16] =
+{ {0,0}, {0,0}, {7,5}, {6,5}, {5,4}, {5,4}, {4,4}, {4,4},
+  {3,3}, {3,3}, {3,3}, {3,3}, {2,3}, {2,3}, {2,3}, {2,3}
+};
+
+/* Table B-1, macroblock_address_increment, codes 00000011000 ... 0000111xxxx */
+static MBAtab MBAtab2[104] =
+{
+  {33,11}, {32,11}, {31,11}, {30,11}, {29,11}, {28,11}, {27,11}, {26,11},
+  {25,11}, {24,11}, {23,11}, {22,11}, {21,10}, {21,10}, {20,10}, {20,10},
+  {19,10}, {19,10}, {18,10}, {18,10}, {17,10}, {17,10}, {16,10}, {16,10},
+  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},  {15,8},
+  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},  {14,8},
+  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},  {13,8},
+  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},  {12,8},
+  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},  {11,8},
+  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},  {10,8},
+  {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},
+  {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},   {9,7},
+  {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},
+  {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7},   {8,7}
+};
+
+/* Table B-12, dct_dc_size_luminance, codes 00xxx ... 11110 */
+static const DCtab DClumtab0[32] =
+{ {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+  {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+  {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+  {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}, {0, 0}
+};
+
+/* Table B-12, dct_dc_size_luminance, codes 111110xxx ... 111111111 */
+static const DCtab DClumtab1[16] =
+{ {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6},
+  {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10,9}, {11,9}
+};
+
+/* Table B-13, dct_dc_size_chrominance, codes 00xxx ... 11110 */
+static const DCtab DCchromtab0[32] =
+{ {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+  {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+  {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+  {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}, {0, 0}
+};
+
+/* Table B-13, dct_dc_size_chrominance, codes 111110xxxx ... 1111111111 */
+static const DCtab DCchromtab1[32] =
+{ {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6},
+  {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6},
+  {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7}, {7, 7},
+  {8, 8}, {8, 8}, {8, 8}, {8, 8}, {9, 9}, {9, 9}, {10,10}, {11,10}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0100 ... 1xxx (used for first (DC) coefficient)
+ */
+static const DCTtab DCTtabfirst[12] =
+{
+  {0,2,4}, {2,1,4}, {1,1,3}, {1,1,3},
+  {0,1,1}, {0,1,1}, {0,1,1}, {0,1,1},
+  {0,1,1}, {0,1,1}, {0,1,1}, {0,1,1}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0100 ... 1xxx (used for all other coefficients)
+ */
+static const DCTtab DCTtabnext[12] =
+{
+  {0,2,4},  {2,1,4},  {1,1,3},  {1,1,3},
+  {64,0,2}, {64,0,2}, {64,0,2}, {64,0,2}, /* EOB */
+  {0,1,2},  {0,1,2},  {0,1,2},  {0,1,2}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 000001xx ... 00111xxx
+ */
+static const DCTtab DCTtab0[60] =
+{
+  {65,0,6}, {65,0,6}, {65,0,6}, {65,0,6}, /* Escape */
+  {2,2,7}, {2,2,7}, {9,1,7}, {9,1,7},
+  {0,4,7}, {0,4,7}, {8,1,7}, {8,1,7},
+  {7,1,6}, {7,1,6}, {7,1,6}, {7,1,6},
+  {6,1,6}, {6,1,6}, {6,1,6}, {6,1,6},
+  {1,2,6}, {1,2,6}, {1,2,6}, {1,2,6},
+  {5,1,6}, {5,1,6}, {5,1,6}, {5,1,6},
+  {13,1,8}, {0,6,8}, {12,1,8}, {11,1,8},
+  {3,2,8}, {1,3,8}, {0,5,8}, {10,1,8},
+  {0,3,5}, {0,3,5}, {0,3,5}, {0,3,5},
+  {0,3,5}, {0,3,5}, {0,3,5}, {0,3,5},
+  {4,1,5}, {4,1,5}, {4,1,5}, {4,1,5},
+  {4,1,5}, {4,1,5}, {4,1,5}, {4,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5}
+};
+
+/* Table B-15, DCT coefficients table one,
+ * codes 000001xx ... 11111111
+*/
+static const DCTtab DCTtab0a[252] =
+{
+  {65,0,6}, {65,0,6}, {65,0,6}, {65,0,6}, /* Escape */
+  {7,1,7}, {7,1,7}, {8,1,7}, {8,1,7},
+  {6,1,7}, {6,1,7}, {2,2,7}, {2,2,7},
+  {0,7,6}, {0,7,6}, {0,7,6}, {0,7,6},
+  {0,6,6}, {0,6,6}, {0,6,6}, {0,6,6},
+  {4,1,6}, {4,1,6}, {4,1,6}, {4,1,6},
+  {5,1,6}, {5,1,6}, {5,1,6}, {5,1,6},
+  {1,5,8}, {11,1,8}, {0,11,8}, {0,10,8},
+  {13,1,8}, {12,1,8}, {3,2,8}, {1,4,8},
+  {2,1,5}, {2,1,5}, {2,1,5}, {2,1,5},
+  {2,1,5}, {2,1,5}, {2,1,5}, {2,1,5},
+  {1,2,5}, {1,2,5}, {1,2,5}, {1,2,5},
+  {1,2,5}, {1,2,5}, {1,2,5}, {1,2,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {3,1,5}, {3,1,5}, {3,1,5}, {3,1,5},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {1,1,3}, {1,1,3}, {1,1,3}, {1,1,3},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4}, /* EOB */
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {64,0,4}, {64,0,4}, {64,0,4}, {64,0,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,3,4}, {0,3,4}, {0,3,4}, {0,3,4},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,1,2}, {0,1,2}, {0,1,2}, {0,1,2},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,2,3}, {0,2,3}, {0,2,3}, {0,2,3},
+  {0,4,5}, {0,4,5}, {0,4,5}, {0,4,5},
+  {0,4,5}, {0,4,5}, {0,4,5}, {0,4,5},
+  {0,5,5}, {0,5,5}, {0,5,5}, {0,5,5},
+  {0,5,5}, {0,5,5}, {0,5,5}, {0,5,5},
+  {9,1,7}, {9,1,7}, {1,3,7}, {1,3,7},
+  {10,1,7}, {10,1,7}, {0,8,7}, {0,8,7},
+  {0,9,7}, {0,9,7}, {0,12,8}, {0,13,8},
+  {2,3,8}, {4,2,8}, {0,14,8}, {0,15,8}
+};
+
+/* Table B-14, DCT coefficients table zero,
+ * codes 0000001000 ... 0000001111
+ */
+static const DCTtab DCTtab1[8] =
+{
+  {16,1,10}, {5,2,10}, {0,7,10}, {2,3,10},
+  {1,4,10}, {15,1,10}, {14,1,10}, {4,2,10}
+};
+
+/* Table B-15, DCT coefficients table one,
+ * codes 000000100x ... 000000111x
+ */
+static const DCTtab DCTtab1a[8] =
+{
+  {5,2,9}, {5,2,9}, {14,1,9}, {14,1,9},
+  {2,4,10}, {16,1,10}, {15,1,9}, {15,1,9}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 000000010000 ... 000000011111
+ */
+static const DCTtab DCTtab2[16] =
+{
+  {0,11,12}, {8,2,12}, {4,3,12}, {0,10,12},
+  {2,4,12}, {7,2,12}, {21,1,12}, {20,1,12},
+  {0,9,12}, {19,1,12}, {18,1,12}, {1,5,12},
+  {3,3,12}, {0,8,12}, {6,2,12}, {17,1,12}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 0000000010000 ... 0000000011111
+ */
+static const DCTtab DCTtab3[16] =
+{
+  {10,2,13}, {9,2,13}, {5,3,13}, {3,4,13},
+  {2,5,13}, {1,7,13}, {1,6,13}, {0,15,13},
+  {0,14,13}, {0,13,13}, {0,12,13}, {26,1,13},
+  {25,1,13}, {24,1,13}, {23,1,13}, {22,1,13}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 00000000010000 ... 00000000011111
+ */
+static const DCTtab DCTtab4[16] =
+{
+  {0,31,14}, {0,30,14}, {0,29,14}, {0,28,14},
+  {0,27,14}, {0,26,14}, {0,25,14}, {0,24,14},
+  {0,23,14}, {0,22,14}, {0,21,14}, {0,20,14},
+  {0,19,14}, {0,18,14}, {0,17,14}, {0,16,14}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 000000000010000 ... 000000000011111
+ */
+static const DCTtab DCTtab5[16] =
+{
+  {0,40,15}, {0,39,15}, {0,38,15}, {0,37,15},
+  {0,36,15}, {0,35,15}, {0,34,15}, {0,33,15},
+  {0,32,15}, {1,14,15}, {1,13,15}, {1,12,15},
+  {1,11,15}, {1,10,15}, {1,9,15}, {1,8,15}
+};
+
+/* Table B-14/15, DCT coefficients table zero / one,
+ * codes 0000000000010000 ... 0000000000011111
+ */
+static const DCTtab DCTtab6[16] =
+{
+  {1,18,16}, {1,17,16}, {1,16,16}, {1,15,16},
+  {6,3,16}, {16,2,16}, {15,2,16}, {14,2,16},
+  {13,2,16}, {12,2,16}, {11,2,16}, {31,1,16},
+  {30,1,16}, {29,1,16}, {28,1,16}, {27,1,16}
+};
+
 #endif//__VLC_H__
--- a/pcsx2/NakedAsm.h
+++ b/pcsx2/NakedAsm.h
@ -17,17 +17,6 @@
 #ifndef NAKED_ASM_H
 #define NAKED_ASM_H

-#include "IPU/coroutine.h"
-
-// Common to Windows and Linux
-extern "C"
-{
-	// acoroutine.S
-	void so_call(coroutine_t coro);
-	void so_resume(void);
-	void so_exit(void);
-}
-
 #ifdef __LINUX__

 extern "C"
--- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
+++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@ -1254,14 +1254,6 @@
 				<Filter
 					Name="IPU"
 					>
-					<File
-						RelativePath="..\..\IPU\coroutine.cpp"
-						>
-					</File>
-					<File
-						RelativePath="..\..\IPU\coroutine.h"
-						>
-					</File>
 					<File
 						RelativePath="..\..\Ipu\IPU.cpp"
 						>
@ -1270,7 +1262,7 @@
 							>
 							<Tool
 								Name="VCCLCompilerTool"
-								UsePrecompiledHeader="0"
+								UsePrecompiledHeader="2"
 							/>
 						</FileConfiguration>
 						<FileConfiguration
@ -1302,7 +1294,7 @@
 							>
 							<Tool
 								Name="VCCLCompilerTool"
-								UsePrecompiledHeader="0"
+								UsePrecompiledHeader="2"
 							/>
 						</FileConfiguration>
 						<FileConfiguration