IPU Bug/Feature fix: Pseudonym has coded a new yuv2rgb decoder that conforms to the IPU spec (which differs slightly from the MPEG spec). It improves color hue/saturation on many videos, and is a bit faster too.

Dynarec: Removed RET2(). git-svn-id: http://pcsx2.googlecode.com/svn/trunk@683 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-03-05 02:21:07 +00:00 · 2009-03-05 02:21:07 +00:00 · fc84ade01d
parent 7fd0f67f93
commit fc84ade01d
10 changed files with 296 additions and 541 deletions
--- a/pcsx2/IPU/IPU.cpp
+++ b/pcsx2/IPU/IPU.cpp
@ -96,9 +96,6 @@ int IPU1dma();
 //char convert_data_buffer[sizeof(convert_rgb_t)];
 char convert_data_buffer[0x1C];

-convert_init_t convert_init={convert_data_buffer, sizeof(convert_data_buffer)};
-convert_t *convert;
-
 // Quantization matrix
 static u8 niq[64],			//non-intraquant matrix
 		iq[64];			//intraquant matrix
@ -216,8 +213,7 @@ void SaveState::ipuFreeze() {

 		if (!mpeg2_inited){
 			mpeg2_idct_init();
-			convert=convert_rgb (CONVERT_RGB, 32);
-			convert(16, 16, 0, NULL, &convert_init);
+			yuv2rgb_init();
 			memzero_obj(mb8.Y);
 			memzero_obj(mb8.Cb);
 			memzero_obj(mb8.Cr);
@ -314,8 +310,7 @@ void ipuSoftReset()
 {
 	if (!mpeg2_inited){
        mpeg2_idct_init();
-		convert=convert_rgb (CONVERT_RGB, 32);
-		convert(16, 16, 0, NULL, &convert_init);
+		yuv2rgb_init();
 		memzero_obj(mb8.Y);
 		memzero_obj(mb8.Cb);
 		memzero_obj(mb8.Cr);
@ -1274,8 +1269,7 @@ void __fastcall ipu_csc(macroblock_8 *mb8, macroblock_rgb32 *rgb32, int sgn){
 	int i;
 	u8* p = (u8*)rgb32;

-	convert_init.start(convert_init.id, (u8*)rgb32, CONVERT_FRAME);
-	convert_init.copy(convert_init.id, (u8*)mb8->Y, (u8*)mb8->Cr, (u8*)mb8->Cb, 0);
+	yuv2rgb_sse2();

 	if( s_thresh[0] > 0 ) {
 		for(i = 0; i < 64*4; i++, p += 4) {
--- a/pcsx2/IPU/IPU.h
+++ b/pcsx2/IPU/IPU.h
@ -19,6 +19,8 @@
 #ifndef __IPU_H__
 #define __IPU_H__

+#include "mpeg2lib/Mpeg.h"
+
 // IPU_INLINE_IRQS
 // Scheduling ints into the future is a purist approach to emulation, and
 // is mostly cosmetic since the emulator itself performs all actions instantly
@ -222,6 +224,10 @@ extern int coded_block_pattern;
 extern int g_nIPU0Data; // or 0x80000000 whenever transferring
 extern u8* g_pIPU0Pointer;

+// The IPU can only do one task at once and never uses other buffers so these
+// should be made available to functions in other modules to save registers.
+PCSX2_ALIGNED16(extern macroblock_rgb32 rgb32);
+PCSX2_ALIGNED16(extern macroblock_8 mb8);

 void dmaIPU0();
 void dmaIPU1();
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@ -64,12 +64,10 @@ struct macroblock_16{
 	short Cr[8][8];				//2
 };

-struct rgb32{
-	unsigned char r, g, b, a;
-};
-
 struct macroblock_rgb32{
-	struct rgb32	c[16][16];
+	struct {
+		unsigned char r, g, b, a;
+	} c[16][16];
 };

 struct rgb16{
--- a/pcsx2/IPU/yuv2rgb.cpp
+++ b/pcsx2/IPU/yuv2rgb.cpp
@ -1,514 +1,308 @@
-/*
- * yuv2rgb.c
- * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
- * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- * Modified by Florin for PCSX2 emu
+/*  Pcsx2 - Pc Ps2 Emulator
+ *  Copyright (C) 2002-2009  Pcsx2 Team
 *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- * See http://libmpeg2.sourceforge.net/ for updates.
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
 *  
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
 *  
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

+// IPU-correct yuv conversions by Pseudonym
+// SSE2 Implementation by Pseudonym
+
 #include "PrecompiledHeader.h"

 #include "System.h"
-#include "mpeg2lib/Mpeg.h"
+#include "IPU.h"
 #include "yuv2rgb.h"

-//#include "convert_internal.h" //START
-struct convert_rgb_t {
-    u8 * rgb_ptr;
-    int width;
-    int uv_stride, uv_stride_frame;
-    int rgb_stride, rgb_stride_frame;
-    void (__fastcall * yuv2rgb) (u8 *, u8 *, u8 *, u8 *,
-		      void *, void *, int);
-};
+// Everything below is bit accurate to the IPU specification (except maybe rounding).
+// Know the specification before you touch it.

-typedef void __fastcall yuv2rgb_copy (void * id, u8 * const * src,
-			   unsigned int v_offset);
+PCSX2_ALIGNED16(u16 C_bias)[8] = {0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000};
+PCSX2_ALIGNED16(u8 Y_bias)[16] = {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
+#define SSE_COEFFICIENTS(name, x) \
+	PCSX2_ALIGNED16(u16 name)[8] = {x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2};
+SSE_COEFFICIENTS(Y_coefficients, 0x95);    // 1.1640625
+SSE_COEFFICIENTS(RCr_coefficients, 0xcc);  // 1.59375
+SSE_COEFFICIENTS(GCr_coefficients, (-0x68));  // -0.8125
+SSE_COEFFICIENTS(GCb_coefficients, (-0x32));  // -0.390625
+SSE_COEFFICIENTS(BCb_coefficients, 0x102); // 2.015625
+PCSX2_ALIGNED16(u16 Y_mask)[8] = {0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00};
+// Specifying round off instead of round down as everywhere else
+// implies that this is right
+PCSX2_ALIGNED16(u16 round_1bit)[8] = {1,1,1,1,1,1,1,1};
+PCSX2_ALIGNED16(u16 yuv2rgb_temp)[3][8];

-yuv2rgb_copy __fastcall * yuv2rgb_init_mmxext (int bpp, int mode);
-yuv2rgb_copy __fastcall * yuv2rgb_init_mmx (int bpp, int mode);
-yuv2rgb_copy __fastcall * yuv2rgb_init_mlib (int bpp, int mode);
-//#include "convert_internal.h" //END
-
-static u32 matrix_coefficients = 6;
-
-const s32 Inverse_Table_6_9[8][4] = {
-    {117504, 138453, 13954, 34903}, /*0 no sequence_display_extension */
-    {117504, 138453, 13954, 34903}, /*1 ITU-R Rec. 709 (1990) */
-    {104597, 132201, 25675, 53279}, /*2 unspecified */
-    {104597, 132201, 25675, 53279}, /*3 reserved */
-    {104448, 132798, 24759, 53109}, /*4 FCC */
-    {104597, 132201, 25675, 53279}, /*5 ITU-R Rec. 624-4 System B, G */
-    {104597, 132201, 25675, 53279}, /*6 SMPTE 170M */
-    {117579, 136230, 16907, 35559}  /*7 SMPTE 240M (1987) */
-};
-
-typedef void __fastcall yuv2rgb_c_internal (u8 *, u8 *, u8 *, u8 *,
-				 void *, void *, int);
-
-void * table_rV[256];
-void * table_gU[256];
-int table_gV[256];
-void * table_bU[256];
-
-#define _RGB(type,i)						\
-	U = pu[i];						\
-	V = pv[i];						\
-	r = (type *) table_rV[V];				\
-	g = (type *) (((u8 *)table_gU[U]) + table_gV[V]);	\
-	b = (type *) table_bU[U];
-
-#define DST(py,dst,i)				\
-	Y = py[2*i];				\
-	dst[2*i] = r[Y] + g[Y] + b[Y];		\
-	Y = py[2*i+1];				\
-	dst[2*i+1] = r[Y] + g[Y] + b[Y];
-
-#define DSTRGB(py,dst,i)						\
-	Y = py[2*i];							\
-	dst[6*i] = r[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = b[Y];		\
-	Y = py[2*i+1];							\
-	dst[6*i+3] = r[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = b[Y];
-
-#define DSTBGR(py,dst,i)						\
-	Y = py[2*i];							\
-	dst[6*i] = b[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = r[Y];		\
-	Y = py[2*i+1];							\
-	dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y];
-
-static void __fastcall yuv2rgb_c_32 (u8 * py_1, u8 * py_2,
-			  u8 * pu, u8 * pv,
-			  void * _dst_1, void * _dst_2, int width)
+// This could potentially be improved for SSE4
+void yuv2rgb_sse2(void)
 {
-    int U, V, Y;
-    u32 * r, * g, * b;
-    u32 * dst_1, * dst_2;
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+	__asm {
+		mov eax, 1
+		mov esi, 0
+		mov edi, 0

-    width >>= 3;
-    dst_1 = (u32 *) _dst_1;
-    dst_2 = (u32 *) _dst_2;
+		align 16
+tworows:
+		movq xmm3, qword ptr [mb8+256+esi]
+		movq xmm1, qword ptr [mb8+320+esi]
+		pxor xmm2, xmm2
+		pxor xmm0, xmm0
+		// could skip the movq but punpck requires 128-bit alignment
+		// for some reason, so two versions would be needed,
+		// bloating the function (further)
+		punpcklbw xmm2, xmm3
+		punpcklbw xmm0, xmm1
+		// unfortunately I don't think this will matter despite being
+		// technically potentially a little faster, but this is
+		// equivalent to an add or sub
+		pxor xmm2, xmmword ptr [C_bias] // xmm2 <-- 8 x (Cb - 128) << 8
+		pxor xmm0, xmmword ptr [C_bias] // xmm0 <-- 8 x (Cr - 128) << 8

-    do {
-		_RGB (u32, 0);
-		DST (py_1, dst_1, 0);
-		DST (py_2, dst_2, 0);
+		movaps xmm1, xmm0
+		movaps xmm3, xmm2
+		pmulhw xmm1, xmmword ptr [GCr_coefficients]
+		pmulhw xmm3, xmmword ptr [GCb_coefficients]
+		pmulhw xmm0, xmmword ptr [RCr_coefficients]
+		pmulhw xmm2, xmmword ptr [BCb_coefficients]
+		paddsw xmm1, xmm3
+		// store for the next line; looking at the code above
+		// compared to the code below, I have to wonder whether
+		// this was worth the hassle
+		movaps xmmword ptr [yuv2rgb_temp], xmm0
+		movaps xmmword ptr [yuv2rgb_temp+16], xmm1
+		movaps xmmword ptr [yuv2rgb_temp+32], xmm2
+		jmp ihatemsvc

-		_RGB (u32, 1);
-		DST (py_2, dst_2, 1);
-		DST (py_1, dst_1, 1);
+		align 16
+onerow:
+		movaps xmm0, xmmword ptr [yuv2rgb_temp]
+		movaps xmm1, xmmword ptr [yuv2rgb_temp+16]
+		movaps xmm2, xmmword ptr [yuv2rgb_temp+32]

-		_RGB (u32, 2);
-		DST (py_1, dst_1, 2);
-		DST (py_2, dst_2, 2);
+// If masm directives worked properly in inline asm, I'd be using them,
+// but I'm not inclined to write ~70 line #defines to simulate them.
+// Maybe the function's faster like this anyway because it's smaller?
+// I'd have to write a 70 line #define to benchmark it.

-		_RGB (u32, 3);
-		DST (py_2, dst_2, 3);
-		DST (py_1, dst_1, 3);
+ihatemsvc:
+		movaps xmm3, xmm0
+		movaps xmm4, xmm1
+		movaps xmm5, xmm2

-		pu += 4;
-		pv += 4;
-		py_1 += 8;
-		py_2 += 8;
-		dst_1 += 8;
-		dst_2 += 8;
-    } while (--width);
-}
+		movaps xmm6, xmmword ptr [mb8+edi]
+		psubusb xmm6, xmmword ptr [Y_bias]
+		movaps xmm7, xmm6
+		psllw xmm6, 8                    // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
+		pand xmm7, xmmword ptr [Y_mask]  // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15

-/* This is very near from the yuv2rgb_c_32 code */
-static void __fastcall yuv2rgb_c_24_rgb (u8 * py_1, u8 * py_2,
-			      u8 * pu, u8 * pv,
-			      void * _dst_1, void * _dst_2, int width)
-{
-    int U, V, Y;
-    u8 * r, * g, * b;
-    u8 * dst_1, * dst_2;
+		pmulhuw xmm6, xmmword ptr [Y_coefficients]
+		pmulhuw xmm7, xmmword ptr [Y_coefficients]

-    width >>= 3;
-    dst_1 = (u8 *) _dst_1;
-    dst_2 = (u8 *) _dst_2;
+		paddsw xmm0, xmm6
+		paddsw xmm3, xmm7
+		paddsw xmm1, xmm6
+		paddsw xmm4, xmm7
+		paddsw xmm2, xmm6
+		paddsw xmm5, xmm7

-    do {
-		_RGB (u8, 0);
-		DSTRGB (py_1, dst_1, 0);
-		DSTRGB (py_2, dst_2, 0);
+		// round
+		movaps xmm6, xmmword ptr [round_1bit]
+		paddw xmm0, xmm6
+		paddw xmm1, xmm6
+		paddw xmm2, xmm6
+		paddw xmm3, xmm6
+		paddw xmm4, xmm6
+		paddw xmm5, xmm6
+		psraw xmm0, 1
+		psraw xmm1, 1
+		psraw xmm2, 1
+		psraw xmm3, 1
+		psraw xmm4, 1
+		psraw xmm5, 1

-		_RGB (u8, 1);
-		DSTRGB (py_2, dst_2, 1);
-		DSTRGB (py_1, dst_1, 1);
+		// combine even and odd bytes
+		packuswb xmm0, xmm3
+		packuswb xmm1, xmm4
+		packuswb xmm2, xmm5
+		movhlps xmm3, xmm0
+		movhlps xmm4, xmm1
+		movhlps xmm5, xmm2
+		punpcklbw xmm0, xmm3 // Red bytes, back in order 
+		punpcklbw xmm1, xmm4 // Green ""
+		punpcklbw xmm2, xmm5 // Blue ""
+		movaps xmm3, xmm0
+		movaps xmm4, xmm1
+		movaps xmm5, xmm2

-		_RGB (u8, 2);
-		DSTRGB (py_1, dst_1, 2);
-		DSTRGB (py_2, dst_2, 2);
+		// Create RGBA (we could generate A here, but we don't) quads
+		punpcklbw xmm0, xmm1
+		punpcklbw xmm2, xmm7
+		movaps xmm1, xmm0
+		punpcklwd xmm0, xmm2
+		punpckhwd xmm1, xmm2

-		_RGB (u8, 3);
-		DSTRGB (py_2, dst_2, 3);
-		DSTRGB (py_1, dst_1, 3);
+		punpckhbw xmm3, xmm4
+		punpckhbw xmm5, xmm7
+		movaps xmm4, xmm3
+		punpcklwd xmm3, xmm5
+		punpckhwd xmm4, xmm5

-		pu += 4;
-		pv += 4;
-		py_1 += 8;
-		py_2 += 8;
-		dst_1 += 24;
-		dst_2 += 24;
-    } while (--width);
-}
+		// at last
+		movaps xmmword ptr [rgb32+edi*4+0], xmm0
+		movaps xmmword ptr [rgb32+edi*4+16], xmm1
+		movaps xmmword ptr [rgb32+edi*4+32], xmm3
+		movaps xmmword ptr [rgb32+edi*4+48], xmm4

-/* only trivial mods from yuv2rgb_c_24_rgb */
-static void __fastcall yuv2rgb_c_24_bgr (u8 * py_1, u8 * py_2,
-			      u8 * pu, u8 * pv,
-			      void * _dst_1, void * _dst_2, int width)
-{
-    int U, V, Y;
-    u8 * r, * g, * b;
-    u8 * dst_1, * dst_2;
+		add edi, 16

-    width >>= 3;
-    dst_1 = (u8 *) _dst_1;
-    dst_2 = (u8 *) _dst_2;
+		neg eax
+		jl onerow // run twice

-    do {
-		_RGB (u8, 0);
-		DSTBGR (py_1, dst_1, 0);
-		DSTBGR (py_2, dst_2, 0);
+		add esi, 8
+		cmp esi, 64
+		jne tworows
+	}
+#elif defined(__GNUC__)
+	asm(
+		".intel_syntax noprefix\n"
+		"mov eax, 1\n"
+		"mov esi, 0\n"
+		"mov edi, 0\n"

-		_RGB (u8, 1);
-		DSTBGR (py_2, dst_2, 1);
-		DSTBGR (py_1, dst_1, 1);
+		".align 16\n"
+"tworows:\n"
+		"movq xmm3, qword ptr [mb8+256+esi]\n"
+		"movq xmm1, qword ptr [mb8+320+esi]\n"
+		"pxor xmm2, xmm2\n"
+		"pxor xmm0, xmm0\n"
+		// could skip the movq but punpck requires 128-bit alignment
+		// for some reason, so two versions would be needed,
+		// bloating the function (further)
+		"punpcklbw xmm2, xmm3\n"
+		"punpcklbw xmm0, xmm1\n"
+		// unfortunately I don't think this will matter despite being
+		// technically potentially a little faster, but this is
+		// equivalent to an add or sub
+		"pxor xmm2, xmmword ptr [C_bias]\n" // xmm2 <-- 8 x (Cb - 128) << 8
+		"pxor xmm0, xmmword ptr [C_bias]\n" // xmm0 <-- 8 x (Cr - 128) << 8

-		_RGB (u8, 2);
-		DSTBGR (py_1, dst_1, 2);
-		DSTBGR (py_2, dst_2, 2);
+		"movaps xmm1, xmm0\n"
+		"movaps xmm3, xmm2\n"
+		"pmulhw xmm1, xmmword ptr [GCr_coefficients]\n"
+		"pmulhw xmm3, xmmword ptr [GCb_coefficients]\n"
+		"pmulhw xmm0, xmmword ptr [RCr_coefficients]\n"
+		"pmulhw xmm2, xmmword ptr [BCb_coefficients]\n"
+		"paddsw xmm1, xmm3\n"
+		// store for the next line; looking at the code above
+		// compared to the code below, I have to wonder whether
+		// this was worth the hassle
+		"movaps xmmword ptr [yuv2rgb_temp], xmm0\n"
+		"movaps xmmword ptr [yuv2rgb_temp+16], xmm1\n"
+		"movaps xmmword ptr [yuv2rgb_temp+32], xmm2\n"
+		"jmp ihategcctoo\n"

-		_RGB (u8, 3);
-		DSTBGR (py_2, dst_2, 3);
-		DSTBGR (py_1, dst_1, 3);
+		".align 16\n"
+"onerow:\n"
+		"movaps xmm0, xmmword ptr [yuv2rgb_temp]\n"
+		"movaps xmm1, xmmword ptr [yuv2rgb_temp+16]\n"
+		"movaps xmm2, xmmword ptr [yuv2rgb_temp+32]\n"

-		pu += 4;
-		pv += 4;
-		py_1 += 8;
-		py_2 += 8;
-		dst_1 += 24;
-		dst_2 += 24;
-    } while (--width);
-}
+"ihategcctoo:\n"
+		"movaps xmm3, xmm0\n"
+		"movaps xmm4, xmm1\n"
+		"movaps xmm5, xmm2\n"

-/* This is exactly the same code as yuv2rgb_c_32 except for the types of */
-/* r, g, b, dst_1, dst_2 */
-static void __fastcall yuv2rgb_c_16 (u8 * py_1, u8 * py_2,
-			  u8 * pu, u8 * pv,
-			  void * _dst_1, void * _dst_2, int width)
-{
-    int U, V, Y;
-    u16 * r, * g, * b;
-    u16 * dst_1, * dst_2;
+		"movaps xmm6, xmmword ptr [mb8+edi]\n"
+		"psubusb xmm6, xmmword ptr [Y_bias]\n"
+		"movaps xmm7, xmm6\n"
+		"psllw xmm6, 8\n"                   // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
+		"pand xmm7, xmmword ptr [Y_mask]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15

-    width >>= 3;
-    dst_1 = (u16 *) _dst_1;
-    dst_2 = (u16 *) _dst_2;
+		"pmulhuw xmm6, xmmword ptr [Y_coefficients]\n"
+		"pmulhuw xmm7, xmmword ptr [Y_coefficients]\n"

-    do {
-		_RGB (u16, 0);
-		DST (py_1, dst_1, 0);
-		DST (py_2, dst_2, 0);
+		"paddsw xmm0, xmm6\n"
+		"paddsw xmm3, xmm7\n"
+		"paddsw xmm1, xmm6\n"
+		"paddsw xmm4, xmm7\n"
+		"paddsw xmm2, xmm6\n"
+		"paddsw xmm5, xmm7\n"

-		_RGB (u16, 1);
-		DST (py_2, dst_2, 1);
-		DST (py_1, dst_1, 1);
+		// round
+		"movaps xmm6, xmmword ptr [round_1bit]\n"
+		"paddw xmm0, xmm6\n"
+		"paddw xmm1, xmm6\n"
+		"paddw xmm2, xmm6\n"
+		"paddw xmm3, xmm6\n"
+		"paddw xmm4, xmm6\n"
+		"paddw xmm5, xmm6\n"
+		"psraw xmm0, 1\n"
+		"psraw xmm1, 1\n"
+		"psraw xmm2, 1\n"
+		"psraw xmm3, 1\n"
+		"psraw xmm4, 1\n"
+		"psraw xmm5, 1\n"

-		_RGB (u16, 2);
-		DST (py_1, dst_1, 2);
-		DST (py_2, dst_2, 2);
+		// combine even and odd bytes
+		"packuswb xmm0, xmm3\n"
+		"packuswb xmm1, xmm4\n"
+		"packuswb xmm2, xmm5\n"
+		"movhlps xmm3, xmm0\n"
+		"movhlps xmm4, xmm1\n"
+		"movhlps xmm5, xmm2\n"
+		"punpcklbw xmm0, xmm3\n" // Red bytes, back in order
+		"punpcklbw xmm1, xmm4\n" // Green ""
+		"punpcklbw xmm2, xmm5\n" // Blue ""
+		"movaps xmm3, xmm0\n"
+		"movaps xmm4, xmm1\n"
+		"movaps xmm5, xmm2\n"

-		_RGB (u16, 3);
-		DST (py_2, dst_2, 3);
-		DST (py_1, dst_1, 3);
+		// Create RGBA (we could generate A here, but we don't) quads
+		"punpcklbw xmm0, xmm1\n"
+		"punpcklbw xmm2, xmm7\n"
+		"movaps xmm1, xmm0\n"
+		"punpcklwd xmm0, xmm2\n"
+		"punpckhwd xmm1, xmm2\n"

-		pu += 4;
-		pv += 4;
-		py_1 += 8;
-		py_2 += 8;
-		dst_1 += 8;
-		dst_2 += 8;
-    } while (--width);
-}
+		"punpckhbw xmm3, xmm4\n"
+		"punpckhbw xmm5, xmm7\n"
+		"movaps xmm4, xmm3\n"
+		"punpcklwd xmm3, xmm5\n"
+		"punpckhwd xmm4, xmm5\n"

-static int div_round (int dividend, int divisor)
-{
-    if (dividend > 0)
-		return (dividend + (divisor>>1)) / divisor;
-    else
-		return -((-dividend + (divisor>>1)) / divisor);
-}
+		// at last
+		"movaps xmmword ptr [rgb32+edi*4+0], xmm0\n"
+		"movaps xmmword ptr [rgb32+edi*4+16], xmm1\n"
+		"movaps xmmword ptr [rgb32+edi*4+32], xmm3\n"
+		"movaps xmmword ptr [rgb32+edi*4+48], xmm4\n"

-static yuv2rgb_c_internal __fastcall * yuv2rgb_c_init (int order, int bpp)
-{
-    int i;
-    u8 table_Y[1024];
-    u32 * table_32 = 0;
-    u16 * table_16 = 0;
-    u8 * table_8 = 0;
-    int entry_size = 0;
-    void * table_r = 0;
-    void * table_g = 0;
-    void * table_b = 0;
-    yuv2rgb_c_internal * yuv2rgb;
+		"add edi, 16\n"

-    int crv = Inverse_Table_6_9[matrix_coefficients][0];
-    int cbu = Inverse_Table_6_9[matrix_coefficients][1];
-    int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
-    int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
+		"neg eax\n"
+		"jl onerow\n" // run twice

-    for (i = 0; i < 1024; i++)
-	{
-		int j;
-
-		j = (76309 * (i - 384 - 16) + 32768) >> 16;
-		j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
-		table_Y[i] = j;
-    }
-
-    switch (bpp)
-	{
-    case 32:
-		yuv2rgb = yuv2rgb_c_32;
-
-		table_32 = (u32 *) malloc ((197 + 2*682 + 256 + 132) *
-						sizeof (u32));
-
-		entry_size = sizeof (u32);
-		table_r = table_32 + 197;
-		table_b = table_32 + 197 + 685;
-		table_g = table_32 + 197 + 2*682;
-
-		for (i = -197; i < 256+197; i++)
-			((u32 *) table_r)[i] =
-			table_Y[i+384] << ((order == CONVERT_RGB) ? 16 : 0);
-		for (i = -132; i < 256+132; i++)
-			((u32 *) table_g)[i] = table_Y[i+384] << 8;
-		for (i = -232; i < 256+232; i++)
-			((u32 *) table_b)[i] =
-			table_Y[i+384] << ((order == CONVERT_RGB) ? 0 : 16);
-	break;
-
-    case 24:
-		yuv2rgb = (order == CONVERT_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;
-
-		table_8 = (u8 *) malloc ((256 + 2*232) * sizeof (u8));
-
-		entry_size = sizeof (u8);
-		table_r = table_g = table_b = table_8 + 232;
-
-		for (i = -232; i < 256+232; i++)
-			((u8 * )table_b)[i] = table_Y[i+384];
-	break;
-
-    case 15:
-    case 16:
-		yuv2rgb = yuv2rgb_c_16;
-
-		table_16 = (u16 *) malloc ((197 + 2*682 + 256 + 132) *
-						sizeof (u16));
-
-		entry_size = sizeof (u16);
-		table_r = table_16 + 197;
-		table_b = table_16 + 197 + 685;
-		table_g = table_16 + 197 + 2*682;
-
-		for (i = -197; i < 256+197; i++) {
-			int j = table_Y[i+384] >> 3;
-
-			if (order == CONVERT_RGB)
-			j <<= ((bpp==16) ? 11 : 10);
-
-			((u16 *)table_r)[i] = j;
-		}
-		for (i = -132; i < 256+132; i++) {
-			int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
-
-			((u16 *)table_g)[i] = j << 5;
-		}
-		for (i = -232; i < 256+232; i++) {
-			int j = table_Y[i+384] >> 3;
-
-			if (order == CONVERT_RGB)
-			j <<= ((bpp==16) ? 11 : 10);
-
-			((u16 *)table_b)[i] = j;
-		}
-	break;
-
-#ifdef PCSX2_DEVBUILD
-    default:
-		DevCon::Error( "IPU Panic!  %ibpp not supported by yuv2rgb", params bpp );
+		"add esi, 8\n"
+		"cmp esi, 64\n"
+		"jne tworows\n"
+		".att_syntax\n"
+	);
 #else
-		jNO_DEFAULT
+#error Unsupported compiler
 #endif
-	}
-
-    for (i = 0; i < 256; i++) {
-		table_rV[i] = (((u8 *)table_r) +
-			entry_size * div_round (crv * (i-128), 76309));
-		table_gU[i] = (((u8 *)table_g) +
-			entry_size * div_round (cgu * (i-128), 76309));
-		table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
-		table_bU[i] = (((u8 *)table_b) +
-			entry_size * div_round (cbu * (i-128), 76309));
-    }
-
-    return yuv2rgb;
 }

-static void __fastcall convert_yuv2rgb_c (void * _id, u8 * Y, u8 * Cr, u8 * Cb,
-			       unsigned int v_offset)
+void yuv2rgb_init(void)
 {
-    convert_rgb_t * id = (convert_rgb_t *) _id;
-    u8 * dst;
-    u8 * py;
-    u8 * pu;
-    u8 * pv;
-    int loop;
-
-    dst = id->rgb_ptr + id->rgb_stride * v_offset;
-    py = Y; pu = Cr; pv = Cb;
-
-    loop = 8;
-    do {
-		id->yuv2rgb (py, py + (id->uv_stride << 1), pu, pv,
-		     dst, dst + id->rgb_stride, id->width);
-		py += id->uv_stride << 2;
-		pu += id->uv_stride;
-		pv += id->uv_stride;
-		dst += 2 * id->rgb_stride;
-    } while (--loop);
-}
-
-static void __fastcall convert_start (void * _id, u8 * dest, int flags)
-{
-    convert_rgb_t * id = (convert_rgb_t *) _id;
-    id->rgb_ptr = dest;
-    switch (flags) {
-    case CONVERT_BOTTOM_FIELD:
-		id->rgb_ptr += id->rgb_stride_frame;
-		/* break thru */
-    case CONVERT_TOP_FIELD:
-		id->uv_stride = id->uv_stride_frame << 1;
-		id->rgb_stride = id->rgb_stride_frame << 1;
-	break;
-    default:
-		id->uv_stride = id->uv_stride_frame;
-		id->rgb_stride = id->rgb_stride_frame;
-    }
-}
-
-static void __fastcall convert_internal (int order, int bpp, int width, int height,
-			      u32 accel, void * arg, convert_init_t * result)
-{
-    convert_rgb_t * id = (convert_rgb_t *) result->id;
-
-    if (!id) {
-		result->id_size = sizeof (convert_rgb_t);
-    } else {
-		id->width = width;
-		id->uv_stride_frame = width >> 1;
-		id->rgb_stride_frame = ((bpp + 7) >> 3) * width;
-
-		result->buf_size[0] = id->rgb_stride_frame * height;
-		result->buf_size[1] = result->buf_size[2] = 0;
-		result->start = convert_start;
-
-		result->copy = NULL;
-	#ifdef ARCH_X86
-		if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMXEXT)) {
-			result->copy = yuv2rgb_init_mmxext (order, bpp);
-		}
-		if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMX)) {
-			result->copy = yuv2rgb_init_mmx (order, bpp);
-		}
-	#endif
-	#ifdef LIBVO_MLIB
-		if ((result->copy == NULL) && (accel & MPEG2_ACCEL_MLIB)) {
-			result->copy = yuv2rgb_init_mlib (order, bpp);
-		}
-	#endif
-		if (result->copy == NULL) {
-			result->copy = convert_yuv2rgb_c;
-			id->yuv2rgb = yuv2rgb_c_init (order, bpp);
-		}
-    }
-}
-
-void __fastcall convert_rgb32 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_RGB, 32, width, height, accel, arg, result);
-}
-
-void __fastcall convert_rgb24 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_RGB, 24, width, height, accel, arg, result);
-}
-
-void __fastcall convert_rgb16 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_RGB, 16, width, height, accel, arg, result);
-}
-
-void __fastcall convert_rgb15 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_RGB, 15, width, height, accel, arg, result);
-}
-
-void __fastcall convert_bgr32 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_BGR, 32, width, height, accel, arg, result);
-}
-
-void __fastcall convert_bgr24 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_BGR, 24, width, height, accel, arg, result);
-}
-
-void __fastcall convert_bgr16 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_BGR, 16, width, height, accel, arg, result);
-}
-
-void __fastcall convert_bgr15 (int width, int height, u32 accel, void * arg,
-		    convert_init_t * result)
-{
-    convert_internal (CONVERT_BGR, 15, width, height, accel, arg, result);
-}
-
-__forceinline convert_t* convert_rgb (int order, int bpp)
-{
-    if (order == CONVERT_RGB || order == CONVERT_BGR)
-	switch (bpp) {
-	case 32: return (order == CONVERT_RGB) ? convert_rgb32 : convert_bgr32;
-	case 24: return (order == CONVERT_RGB) ? convert_rgb24 : convert_bgr24;
-	case 16: return (order == CONVERT_RGB) ? convert_rgb16 : convert_bgr16;
-	case 15: return (order == CONVERT_RGB) ? convert_rgb15 : convert_bgr15;
-	}
-    return NULL;
+	/* For later reimplementation of C version */
 }
--- a/pcsx2/IPU/yuv2rgb.h
+++ b/pcsx2/IPU/yuv2rgb.h
@ -1,57 +1,22 @@
-/*
- * yuv2rgb.h
- * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
- * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- * Modified by Florin for PCSX2 emu
+/*  Pcsx2 - Pc Ps2 Emulator
+ *  Copyright (C) 2002-2009  Pcsx2 Team
 *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- * See http://libmpeg2.sourceforge.net/ for updates.
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
 *  
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
 *  
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

-#ifndef YUV2RGB_H
-#define YUV2RGB_H
+#pragma once

-#define CONVERT_FRAME 0
-#define CONVERT_TOP_FIELD 1
-#define CONVERT_BOTTOM_FIELD 2
-#define CONVERT_BOTH_FIELDS 3
-
-struct convert_init_t {
-    void * id;
-    int id_size;
-    int buf_size[3];
-    void (__fastcall* start) (void * id, u8 * dest, int flags);
-    void (__fastcall* copy) (void * id, u8 * Y, u8 * Cr, u8 * Cb, unsigned int v_offset);
-};
-
-typedef void __fastcall convert_t (int width, int height, u32 accel, void * arg,
-			convert_init_t * result);
-
-convert_t convert_rgb32;
-convert_t convert_rgb24;
-convert_t convert_rgb16;
-convert_t convert_rgb15;
-convert_t convert_bgr32;
-convert_t convert_bgr24;
-convert_t convert_bgr16;
-convert_t convert_bgr15;
-
-#define CONVERT_RGB 0
-#define CONVERT_BGR 1
-extern convert_t* convert_rgb (int order, int bpp);
-
-#endif /* YUV2RGB_H */
+void yuv2rgb_sse2(void);
+void yuv2rgb_init(void);
--- a/pcsx2/x86/iR3000A.cpp
+++ b/pcsx2/x86/iR3000A.cpp
@ -993,7 +993,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)

 	j8Ptr[2] = JG8( 0 );	// jump if psxCycleEE > 0

-	RET2();		// returns control to the EE
+	RET();		// returns control to the EE

 	// Continue onward with branching here:
 	x86SetJ8( j8Ptr[2] );
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@ -993,7 +993,7 @@ void CheckForBIOSEnd()
 	x86SetJ8( j8Ptr[1] );

 	// bios end
-	RET2();
+	RET();

 	x86SetJ8( j8Ptr[2] );
 }
@ -1250,7 +1250,7 @@ static void iBranchTest(u32 newpc, bool noDispatch)
 		JS32((uptr)DispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
 	}

-	RET2();
+	RET();
 }

 static void checkcodefn()
--- a/pcsx2/x86/ix86/ix86.inl
+++ b/pcsx2/x86/ix86/ix86.inl
@ -3207,8 +3207,7 @@ emitterT void ePUSHFD( void ) { write8<I>( 0x9C ); }
 /* popfd */
 emitterT void ePOPFD( void ) { write8<I>( 0x9D ); }

-emitterT void eRET( void ) { write8<I>( 0xC3 ); }
-emitterT void eRET2( void ) { write16<I>( 0xc3f3 ); }
+emitterT void eRET( void ) { /*write8<I>( 0xf3 );  /*<-- K8 opt?*/ write8<I>( 0xC3 ); }

 emitterT void eCBW( void ) { write16<I>( 0x9866 );  }
 emitterT void eCWD( void )  { write8<I>( 0x98 ); }
--- a/pcsx2/x86/ix86/ix86_macros.h
+++ b/pcsx2/x86/ix86/ix86_macros.h
@ -394,7 +394,6 @@
 #define PUSHFD ePUSHFD<_EmitterId_>
 #define POPFD ePOPFD<_EmitterId_>
 #define RET eRET<_EmitterId_>
-#define RET2 eRET2<_EmitterId_>
 #define CBW eCBW<_EmitterId_>
 #define CWDE eCWDE<_EmitterId_>
 #define CWD eCWD<_EmitterId_>
--- a/pcsx2/x86/ix86/ix86_sse.inl
+++ b/pcsx2/x86/ix86/ix86_sse.inl
@ -276,7 +276,7 @@ emitterT void eSSE_MOVUPSRmtoROffset( x86SSERegType to, x86IntRegType from, int
 }

 // movups r32 to [r32+offset]
-emitterT void eSSE_MOVUPSRtoRmOffset( x86SSERegType to, x86IntRegType from, int offset )
+emitterT void eSSE_MOVUPSRtoRmOffset( x86IntRegType to, x86SSERegType from, int offset )
 {
    RexRB(0, from, to);
 	write16<I>( 0x110f );
@ -955,7 +955,7 @@ emitterT void eSSE2_PXOR_M128_to_XMM( x86SSERegType to, uptr from )				{ SSEMtoR

 emitterT void eSSE2_MOVDQA_M128_to_XMM(x86SSERegType to, uptr from)				{ if( AlwaysUseMovaps ) eSSE_MOVAPS_M128_to_XMM<I>( to, from ); else SSEMtoR66(0x6F0F); }
 emitterT void eSSE2_MOVDQA_XMM_to_M128( uptr to, x86SSERegType from )			{ if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_M128<I>( to, from ); else SSERtoM66(0x7F0F); } 
-emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from)	{ if (to != from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM<I>( to, from ); else SSERtoR66(0x6F0F); } }
+emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from)	{ if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM<I>( to, from ); else if( to != from ) SSERtoR66(0x6F0F); }

 emitterT void eSSE2_MOVDQU_M128_to_XMM(x86SSERegType to, uptr from)
 {