diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp index 99a1a9ef94..96edda1524 100644 --- a/pcsx2/IPU/IPU.cpp +++ b/pcsx2/IPU/IPU.cpp @@ -96,9 +96,6 @@ int IPU1dma(); //char convert_data_buffer[sizeof(convert_rgb_t)]; char convert_data_buffer[0x1C]; -convert_init_t convert_init={convert_data_buffer, sizeof(convert_data_buffer)}; -convert_t *convert; - // Quantization matrix static u8 niq[64], //non-intraquant matrix iq[64]; //intraquant matrix @@ -216,8 +213,7 @@ void SaveState::ipuFreeze() { if (!mpeg2_inited){ mpeg2_idct_init(); - convert=convert_rgb (CONVERT_RGB, 32); - convert(16, 16, 0, NULL, &convert_init); + yuv2rgb_init(); memzero_obj(mb8.Y); memzero_obj(mb8.Cb); memzero_obj(mb8.Cr); @@ -314,8 +310,7 @@ void ipuSoftReset() { if (!mpeg2_inited){ mpeg2_idct_init(); - convert=convert_rgb (CONVERT_RGB, 32); - convert(16, 16, 0, NULL, &convert_init); + yuv2rgb_init(); memzero_obj(mb8.Y); memzero_obj(mb8.Cb); memzero_obj(mb8.Cr); @@ -1274,8 +1269,7 @@ void __fastcall ipu_csc(macroblock_8 *mb8, macroblock_rgb32 *rgb32, int sgn){ int i; u8* p = (u8*)rgb32; - convert_init.start(convert_init.id, (u8*)rgb32, CONVERT_FRAME); - convert_init.copy(convert_init.id, (u8*)mb8->Y, (u8*)mb8->Cr, (u8*)mb8->Cb, 0); + yuv2rgb_sse2(); if( s_thresh[0] > 0 ) { for(i = 0; i < 64*4; i++, p += 4) { diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h index e2d5f85b1a..9915000a0d 100644 --- a/pcsx2/IPU/IPU.h +++ b/pcsx2/IPU/IPU.h @@ -19,6 +19,8 @@ #ifndef __IPU_H__ #define __IPU_H__ +#include "mpeg2lib/Mpeg.h" + // IPU_INLINE_IRQS // Scheduling ints into the future is a purist approach to emulation, and // is mostly cosmetic since the emulator itself performs all actions instantly @@ -222,6 +224,10 @@ extern int coded_block_pattern; extern int g_nIPU0Data; // or 0x80000000 whenever transferring extern u8* g_pIPU0Pointer; +// The IPU can only do one task at once and never uses other buffers so these +// should be made available to functions in other modules to save registers. +PCSX2_ALIGNED16(extern macroblock_rgb32 rgb32); +PCSX2_ALIGNED16(extern macroblock_8 mb8); void dmaIPU0(); void dmaIPU1(); diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h index 400662cb2a..42de91e4c9 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.h +++ b/pcsx2/IPU/mpeg2lib/Mpeg.h @@ -64,12 +64,10 @@ struct macroblock_16{ short Cr[8][8]; //2 }; -struct rgb32{ - unsigned char r, g, b, a; -}; - struct macroblock_rgb32{ - struct rgb32 c[16][16]; + struct { + unsigned char r, g, b, a; + } c[16][16]; }; struct rgb16{ diff --git a/pcsx2/IPU/yuv2rgb.cpp b/pcsx2/IPU/yuv2rgb.cpp index d76997a809..7cd4d2b6f9 100644 --- a/pcsx2/IPU/yuv2rgb.cpp +++ b/pcsx2/IPU/yuv2rgb.cpp @@ -1,514 +1,308 @@ -/* - * yuv2rgb.c - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * Modified by Florin for PCSX2 emu +/* Pcsx2 - Pc Ps2 Emulator + * Copyright (C) 2002-2009 Pcsx2 Team * - * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. - * See http://libmpeg2.sourceforge.net/ for updates. - * - * mpeg2dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpeg2dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ +// IPU-correct yuv conversions by Pseudonym +// SSE2 Implementation by Pseudonym + #include "PrecompiledHeader.h" #include "System.h" -#include "mpeg2lib/Mpeg.h" +#include "IPU.h" #include "yuv2rgb.h" -//#include "convert_internal.h" //START -struct convert_rgb_t { - u8 * rgb_ptr; - int width; - int uv_stride, uv_stride_frame; - int rgb_stride, rgb_stride_frame; - void (__fastcall * yuv2rgb) (u8 *, u8 *, u8 *, u8 *, - void *, void *, int); -}; +// Everything below is bit accurate to the IPU specification (except maybe rounding). +// Know the specification before you touch it. -typedef void __fastcall yuv2rgb_copy (void * id, u8 * const * src, - unsigned int v_offset); +PCSX2_ALIGNED16(u16 C_bias)[8] = {0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}; +PCSX2_ALIGNED16(u8 Y_bias)[16] = {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}; +#define SSE_COEFFICIENTS(name, x) \ + PCSX2_ALIGNED16(u16 name)[8] = {x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2,x<<2}; +SSE_COEFFICIENTS(Y_coefficients, 0x95); // 1.1640625 +SSE_COEFFICIENTS(RCr_coefficients, 0xcc); // 1.59375 +SSE_COEFFICIENTS(GCr_coefficients, (-0x68)); // -0.8125 +SSE_COEFFICIENTS(GCb_coefficients, (-0x32)); // -0.390625 +SSE_COEFFICIENTS(BCb_coefficients, 0x102); // 2.015625 +PCSX2_ALIGNED16(u16 Y_mask)[8] = {0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00}; +// Specifying round off instead of round down as everywhere else +// implies that this is right +PCSX2_ALIGNED16(u16 round_1bit)[8] = {1,1,1,1,1,1,1,1}; +PCSX2_ALIGNED16(u16 yuv2rgb_temp)[3][8]; -yuv2rgb_copy __fastcall * yuv2rgb_init_mmxext (int bpp, int mode); -yuv2rgb_copy __fastcall * yuv2rgb_init_mmx (int bpp, int mode); -yuv2rgb_copy __fastcall * yuv2rgb_init_mlib (int bpp, int mode); -//#include "convert_internal.h" //END - -static u32 matrix_coefficients = 6; - -const s32 Inverse_Table_6_9[8][4] = { - {117504, 138453, 13954, 34903}, /*0 no sequence_display_extension */ - {117504, 138453, 13954, 34903}, /*1 ITU-R Rec. 709 (1990) */ - {104597, 132201, 25675, 53279}, /*2 unspecified */ - {104597, 132201, 25675, 53279}, /*3 reserved */ - {104448, 132798, 24759, 53109}, /*4 FCC */ - {104597, 132201, 25675, 53279}, /*5 ITU-R Rec. 624-4 System B, G */ - {104597, 132201, 25675, 53279}, /*6 SMPTE 170M */ - {117579, 136230, 16907, 35559} /*7 SMPTE 240M (1987) */ -}; - -typedef void __fastcall yuv2rgb_c_internal (u8 *, u8 *, u8 *, u8 *, - void *, void *, int); - -void * table_rV[256]; -void * table_gU[256]; -int table_gV[256]; -void * table_bU[256]; - -#define _RGB(type,i) \ - U = pu[i]; \ - V = pv[i]; \ - r = (type *) table_rV[V]; \ - g = (type *) (((u8 *)table_gU[U]) + table_gV[V]); \ - b = (type *) table_bU[U]; - -#define DST(py,dst,i) \ - Y = py[2*i]; \ - dst[2*i] = r[Y] + g[Y] + b[Y]; \ - Y = py[2*i+1]; \ - dst[2*i+1] = r[Y] + g[Y] + b[Y]; - -#define DSTRGB(py,dst,i) \ - Y = py[2*i]; \ - dst[6*i] = r[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = b[Y]; \ - Y = py[2*i+1]; \ - dst[6*i+3] = r[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = b[Y]; - -#define DSTBGR(py,dst,i) \ - Y = py[2*i]; \ - dst[6*i] = b[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = r[Y]; \ - Y = py[2*i+1]; \ - dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y]; - -static void __fastcall yuv2rgb_c_32 (u8 * py_1, u8 * py_2, - u8 * pu, u8 * pv, - void * _dst_1, void * _dst_2, int width) +// This could potentially be improved for SSE4 +void yuv2rgb_sse2(void) { - int U, V, Y; - u32 * r, * g, * b; - u32 * dst_1, * dst_2; +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) + __asm { + mov eax, 1 + mov esi, 0 + mov edi, 0 - width >>= 3; - dst_1 = (u32 *) _dst_1; - dst_2 = (u32 *) _dst_2; + align 16 +tworows: + movq xmm3, qword ptr [mb8+256+esi] + movq xmm1, qword ptr [mb8+320+esi] + pxor xmm2, xmm2 + pxor xmm0, xmm0 + // could skip the movq but punpck requires 128-bit alignment + // for some reason, so two versions would be needed, + // bloating the function (further) + punpcklbw xmm2, xmm3 + punpcklbw xmm0, xmm1 + // unfortunately I don't think this will matter despite being + // technically potentially a little faster, but this is + // equivalent to an add or sub + pxor xmm2, xmmword ptr [C_bias] // xmm2 <-- 8 x (Cb - 128) << 8 + pxor xmm0, xmmword ptr [C_bias] // xmm0 <-- 8 x (Cr - 128) << 8 - do { - _RGB (u32, 0); - DST (py_1, dst_1, 0); - DST (py_2, dst_2, 0); + movaps xmm1, xmm0 + movaps xmm3, xmm2 + pmulhw xmm1, xmmword ptr [GCr_coefficients] + pmulhw xmm3, xmmword ptr [GCb_coefficients] + pmulhw xmm0, xmmword ptr [RCr_coefficients] + pmulhw xmm2, xmmword ptr [BCb_coefficients] + paddsw xmm1, xmm3 + // store for the next line; looking at the code above + // compared to the code below, I have to wonder whether + // this was worth the hassle + movaps xmmword ptr [yuv2rgb_temp], xmm0 + movaps xmmword ptr [yuv2rgb_temp+16], xmm1 + movaps xmmword ptr [yuv2rgb_temp+32], xmm2 + jmp ihatemsvc - _RGB (u32, 1); - DST (py_2, dst_2, 1); - DST (py_1, dst_1, 1); + align 16 +onerow: + movaps xmm0, xmmword ptr [yuv2rgb_temp] + movaps xmm1, xmmword ptr [yuv2rgb_temp+16] + movaps xmm2, xmmword ptr [yuv2rgb_temp+32] - _RGB (u32, 2); - DST (py_1, dst_1, 2); - DST (py_2, dst_2, 2); +// If masm directives worked properly in inline asm, I'd be using them, +// but I'm not inclined to write ~70 line #defines to simulate them. +// Maybe the function's faster like this anyway because it's smaller? +// I'd have to write a 70 line #define to benchmark it. - _RGB (u32, 3); - DST (py_2, dst_2, 3); - DST (py_1, dst_1, 3); +ihatemsvc: + movaps xmm3, xmm0 + movaps xmm4, xmm1 + movaps xmm5, xmm2 - pu += 4; - pv += 4; - py_1 += 8; - py_2 += 8; - dst_1 += 8; - dst_2 += 8; - } while (--width); -} + movaps xmm6, xmmword ptr [mb8+edi] + psubusb xmm6, xmmword ptr [Y_bias] + movaps xmm7, xmm6 + psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14 + pand xmm7, xmmword ptr [Y_mask] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15 -/* This is very near from the yuv2rgb_c_32 code */ -static void __fastcall yuv2rgb_c_24_rgb (u8 * py_1, u8 * py_2, - u8 * pu, u8 * pv, - void * _dst_1, void * _dst_2, int width) -{ - int U, V, Y; - u8 * r, * g, * b; - u8 * dst_1, * dst_2; + pmulhuw xmm6, xmmword ptr [Y_coefficients] + pmulhuw xmm7, xmmword ptr [Y_coefficients] - width >>= 3; - dst_1 = (u8 *) _dst_1; - dst_2 = (u8 *) _dst_2; + paddsw xmm0, xmm6 + paddsw xmm3, xmm7 + paddsw xmm1, xmm6 + paddsw xmm4, xmm7 + paddsw xmm2, xmm6 + paddsw xmm5, xmm7 - do { - _RGB (u8, 0); - DSTRGB (py_1, dst_1, 0); - DSTRGB (py_2, dst_2, 0); + // round + movaps xmm6, xmmword ptr [round_1bit] + paddw xmm0, xmm6 + paddw xmm1, xmm6 + paddw xmm2, xmm6 + paddw xmm3, xmm6 + paddw xmm4, xmm6 + paddw xmm5, xmm6 + psraw xmm0, 1 + psraw xmm1, 1 + psraw xmm2, 1 + psraw xmm3, 1 + psraw xmm4, 1 + psraw xmm5, 1 - _RGB (u8, 1); - DSTRGB (py_2, dst_2, 1); - DSTRGB (py_1, dst_1, 1); + // combine even and odd bytes + packuswb xmm0, xmm3 + packuswb xmm1, xmm4 + packuswb xmm2, xmm5 + movhlps xmm3, xmm0 + movhlps xmm4, xmm1 + movhlps xmm5, xmm2 + punpcklbw xmm0, xmm3 // Red bytes, back in order + punpcklbw xmm1, xmm4 // Green "" + punpcklbw xmm2, xmm5 // Blue "" + movaps xmm3, xmm0 + movaps xmm4, xmm1 + movaps xmm5, xmm2 - _RGB (u8, 2); - DSTRGB (py_1, dst_1, 2); - DSTRGB (py_2, dst_2, 2); + // Create RGBA (we could generate A here, but we don't) quads + punpcklbw xmm0, xmm1 + punpcklbw xmm2, xmm7 + movaps xmm1, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm1, xmm2 - _RGB (u8, 3); - DSTRGB (py_2, dst_2, 3); - DSTRGB (py_1, dst_1, 3); + punpckhbw xmm3, xmm4 + punpckhbw xmm5, xmm7 + movaps xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 - pu += 4; - pv += 4; - py_1 += 8; - py_2 += 8; - dst_1 += 24; - dst_2 += 24; - } while (--width); -} + // at last + movaps xmmword ptr [rgb32+edi*4+0], xmm0 + movaps xmmword ptr [rgb32+edi*4+16], xmm1 + movaps xmmword ptr [rgb32+edi*4+32], xmm3 + movaps xmmword ptr [rgb32+edi*4+48], xmm4 -/* only trivial mods from yuv2rgb_c_24_rgb */ -static void __fastcall yuv2rgb_c_24_bgr (u8 * py_1, u8 * py_2, - u8 * pu, u8 * pv, - void * _dst_1, void * _dst_2, int width) -{ - int U, V, Y; - u8 * r, * g, * b; - u8 * dst_1, * dst_2; + add edi, 16 - width >>= 3; - dst_1 = (u8 *) _dst_1; - dst_2 = (u8 *) _dst_2; + neg eax + jl onerow // run twice - do { - _RGB (u8, 0); - DSTBGR (py_1, dst_1, 0); - DSTBGR (py_2, dst_2, 0); + add esi, 8 + cmp esi, 64 + jne tworows + } +#elif defined(__GNUC__) + asm( + ".intel_syntax noprefix\n" + "mov eax, 1\n" + "mov esi, 0\n" + "mov edi, 0\n" - _RGB (u8, 1); - DSTBGR (py_2, dst_2, 1); - DSTBGR (py_1, dst_1, 1); + ".align 16\n" +"tworows:\n" + "movq xmm3, qword ptr [mb8+256+esi]\n" + "movq xmm1, qword ptr [mb8+320+esi]\n" + "pxor xmm2, xmm2\n" + "pxor xmm0, xmm0\n" + // could skip the movq but punpck requires 128-bit alignment + // for some reason, so two versions would be needed, + // bloating the function (further) + "punpcklbw xmm2, xmm3\n" + "punpcklbw xmm0, xmm1\n" + // unfortunately I don't think this will matter despite being + // technically potentially a little faster, but this is + // equivalent to an add or sub + "pxor xmm2, xmmword ptr [C_bias]\n" // xmm2 <-- 8 x (Cb - 128) << 8 + "pxor xmm0, xmmword ptr [C_bias]\n" // xmm0 <-- 8 x (Cr - 128) << 8 - _RGB (u8, 2); - DSTBGR (py_1, dst_1, 2); - DSTBGR (py_2, dst_2, 2); + "movaps xmm1, xmm0\n" + "movaps xmm3, xmm2\n" + "pmulhw xmm1, xmmword ptr [GCr_coefficients]\n" + "pmulhw xmm3, xmmword ptr [GCb_coefficients]\n" + "pmulhw xmm0, xmmword ptr [RCr_coefficients]\n" + "pmulhw xmm2, xmmword ptr [BCb_coefficients]\n" + "paddsw xmm1, xmm3\n" + // store for the next line; looking at the code above + // compared to the code below, I have to wonder whether + // this was worth the hassle + "movaps xmmword ptr [yuv2rgb_temp], xmm0\n" + "movaps xmmword ptr [yuv2rgb_temp+16], xmm1\n" + "movaps xmmword ptr [yuv2rgb_temp+32], xmm2\n" + "jmp ihategcctoo\n" - _RGB (u8, 3); - DSTBGR (py_2, dst_2, 3); - DSTBGR (py_1, dst_1, 3); + ".align 16\n" +"onerow:\n" + "movaps xmm0, xmmword ptr [yuv2rgb_temp]\n" + "movaps xmm1, xmmword ptr [yuv2rgb_temp+16]\n" + "movaps xmm2, xmmword ptr [yuv2rgb_temp+32]\n" - pu += 4; - pv += 4; - py_1 += 8; - py_2 += 8; - dst_1 += 24; - dst_2 += 24; - } while (--width); -} +"ihategcctoo:\n" + "movaps xmm3, xmm0\n" + "movaps xmm4, xmm1\n" + "movaps xmm5, xmm2\n" -/* This is exactly the same code as yuv2rgb_c_32 except for the types of */ -/* r, g, b, dst_1, dst_2 */ -static void __fastcall yuv2rgb_c_16 (u8 * py_1, u8 * py_2, - u8 * pu, u8 * pv, - void * _dst_1, void * _dst_2, int width) -{ - int U, V, Y; - u16 * r, * g, * b; - u16 * dst_1, * dst_2; + "movaps xmm6, xmmword ptr [mb8+edi]\n" + "psubusb xmm6, xmmword ptr [Y_bias]\n" + "movaps xmm7, xmm6\n" + "psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14 + "pand xmm7, xmmword ptr [Y_mask]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15 - width >>= 3; - dst_1 = (u16 *) _dst_1; - dst_2 = (u16 *) _dst_2; + "pmulhuw xmm6, xmmword ptr [Y_coefficients]\n" + "pmulhuw xmm7, xmmword ptr [Y_coefficients]\n" - do { - _RGB (u16, 0); - DST (py_1, dst_1, 0); - DST (py_2, dst_2, 0); + "paddsw xmm0, xmm6\n" + "paddsw xmm3, xmm7\n" + "paddsw xmm1, xmm6\n" + "paddsw xmm4, xmm7\n" + "paddsw xmm2, xmm6\n" + "paddsw xmm5, xmm7\n" - _RGB (u16, 1); - DST (py_2, dst_2, 1); - DST (py_1, dst_1, 1); + // round + "movaps xmm6, xmmword ptr [round_1bit]\n" + "paddw xmm0, xmm6\n" + "paddw xmm1, xmm6\n" + "paddw xmm2, xmm6\n" + "paddw xmm3, xmm6\n" + "paddw xmm4, xmm6\n" + "paddw xmm5, xmm6\n" + "psraw xmm0, 1\n" + "psraw xmm1, 1\n" + "psraw xmm2, 1\n" + "psraw xmm3, 1\n" + "psraw xmm4, 1\n" + "psraw xmm5, 1\n" - _RGB (u16, 2); - DST (py_1, dst_1, 2); - DST (py_2, dst_2, 2); + // combine even and odd bytes + "packuswb xmm0, xmm3\n" + "packuswb xmm1, xmm4\n" + "packuswb xmm2, xmm5\n" + "movhlps xmm3, xmm0\n" + "movhlps xmm4, xmm1\n" + "movhlps xmm5, xmm2\n" + "punpcklbw xmm0, xmm3\n" // Red bytes, back in order + "punpcklbw xmm1, xmm4\n" // Green "" + "punpcklbw xmm2, xmm5\n" // Blue "" + "movaps xmm3, xmm0\n" + "movaps xmm4, xmm1\n" + "movaps xmm5, xmm2\n" - _RGB (u16, 3); - DST (py_2, dst_2, 3); - DST (py_1, dst_1, 3); + // Create RGBA (we could generate A here, but we don't) quads + "punpcklbw xmm0, xmm1\n" + "punpcklbw xmm2, xmm7\n" + "movaps xmm1, xmm0\n" + "punpcklwd xmm0, xmm2\n" + "punpckhwd xmm1, xmm2\n" - pu += 4; - pv += 4; - py_1 += 8; - py_2 += 8; - dst_1 += 8; - dst_2 += 8; - } while (--width); -} + "punpckhbw xmm3, xmm4\n" + "punpckhbw xmm5, xmm7\n" + "movaps xmm4, xmm3\n" + "punpcklwd xmm3, xmm5\n" + "punpckhwd xmm4, xmm5\n" -static int div_round (int dividend, int divisor) -{ - if (dividend > 0) - return (dividend + (divisor>>1)) / divisor; - else - return -((-dividend + (divisor>>1)) / divisor); -} + // at last + "movaps xmmword ptr [rgb32+edi*4+0], xmm0\n" + "movaps xmmword ptr [rgb32+edi*4+16], xmm1\n" + "movaps xmmword ptr [rgb32+edi*4+32], xmm3\n" + "movaps xmmword ptr [rgb32+edi*4+48], xmm4\n" -static yuv2rgb_c_internal __fastcall * yuv2rgb_c_init (int order, int bpp) -{ - int i; - u8 table_Y[1024]; - u32 * table_32 = 0; - u16 * table_16 = 0; - u8 * table_8 = 0; - int entry_size = 0; - void * table_r = 0; - void * table_g = 0; - void * table_b = 0; - yuv2rgb_c_internal * yuv2rgb; + "add edi, 16\n" - int crv = Inverse_Table_6_9[matrix_coefficients][0]; - int cbu = Inverse_Table_6_9[matrix_coefficients][1]; - int cgu = -Inverse_Table_6_9[matrix_coefficients][2]; - int cgv = -Inverse_Table_6_9[matrix_coefficients][3]; + "neg eax\n" + "jl onerow\n" // run twice - for (i = 0; i < 1024; i++) - { - int j; - - j = (76309 * (i - 384 - 16) + 32768) >> 16; - j = (j < 0) ? 0 : ((j > 255) ? 255 : j); - table_Y[i] = j; - } - - switch (bpp) - { - case 32: - yuv2rgb = yuv2rgb_c_32; - - table_32 = (u32 *) malloc ((197 + 2*682 + 256 + 132) * - sizeof (u32)); - - entry_size = sizeof (u32); - table_r = table_32 + 197; - table_b = table_32 + 197 + 685; - table_g = table_32 + 197 + 2*682; - - for (i = -197; i < 256+197; i++) - ((u32 *) table_r)[i] = - table_Y[i+384] << ((order == CONVERT_RGB) ? 16 : 0); - for (i = -132; i < 256+132; i++) - ((u32 *) table_g)[i] = table_Y[i+384] << 8; - for (i = -232; i < 256+232; i++) - ((u32 *) table_b)[i] = - table_Y[i+384] << ((order == CONVERT_RGB) ? 0 : 16); - break; - - case 24: - yuv2rgb = (order == CONVERT_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr; - - table_8 = (u8 *) malloc ((256 + 2*232) * sizeof (u8)); - - entry_size = sizeof (u8); - table_r = table_g = table_b = table_8 + 232; - - for (i = -232; i < 256+232; i++) - ((u8 * )table_b)[i] = table_Y[i+384]; - break; - - case 15: - case 16: - yuv2rgb = yuv2rgb_c_16; - - table_16 = (u16 *) malloc ((197 + 2*682 + 256 + 132) * - sizeof (u16)); - - entry_size = sizeof (u16); - table_r = table_16 + 197; - table_b = table_16 + 197 + 685; - table_g = table_16 + 197 + 2*682; - - for (i = -197; i < 256+197; i++) { - int j = table_Y[i+384] >> 3; - - if (order == CONVERT_RGB) - j <<= ((bpp==16) ? 11 : 10); - - ((u16 *)table_r)[i] = j; - } - for (i = -132; i < 256+132; i++) { - int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3); - - ((u16 *)table_g)[i] = j << 5; - } - for (i = -232; i < 256+232; i++) { - int j = table_Y[i+384] >> 3; - - if (order == CONVERT_RGB) - j <<= ((bpp==16) ? 11 : 10); - - ((u16 *)table_b)[i] = j; - } - break; - -#ifdef PCSX2_DEVBUILD - default: - DevCon::Error( "IPU Panic! %ibpp not supported by yuv2rgb", params bpp ); + "add esi, 8\n" + "cmp esi, 64\n" + "jne tworows\n" + ".att_syntax\n" + ); #else - jNO_DEFAULT +#error Unsupported compiler #endif - } - - for (i = 0; i < 256; i++) { - table_rV[i] = (((u8 *)table_r) + - entry_size * div_round (crv * (i-128), 76309)); - table_gU[i] = (((u8 *)table_g) + - entry_size * div_round (cgu * (i-128), 76309)); - table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); - table_bU[i] = (((u8 *)table_b) + - entry_size * div_round (cbu * (i-128), 76309)); - } - - return yuv2rgb; } -static void __fastcall convert_yuv2rgb_c (void * _id, u8 * Y, u8 * Cr, u8 * Cb, - unsigned int v_offset) +void yuv2rgb_init(void) { - convert_rgb_t * id = (convert_rgb_t *) _id; - u8 * dst; - u8 * py; - u8 * pu; - u8 * pv; - int loop; - - dst = id->rgb_ptr + id->rgb_stride * v_offset; - py = Y; pu = Cr; pv = Cb; - - loop = 8; - do { - id->yuv2rgb (py, py + (id->uv_stride << 1), pu, pv, - dst, dst + id->rgb_stride, id->width); - py += id->uv_stride << 2; - pu += id->uv_stride; - pv += id->uv_stride; - dst += 2 * id->rgb_stride; - } while (--loop); -} - -static void __fastcall convert_start (void * _id, u8 * dest, int flags) -{ - convert_rgb_t * id = (convert_rgb_t *) _id; - id->rgb_ptr = dest; - switch (flags) { - case CONVERT_BOTTOM_FIELD: - id->rgb_ptr += id->rgb_stride_frame; - /* break thru */ - case CONVERT_TOP_FIELD: - id->uv_stride = id->uv_stride_frame << 1; - id->rgb_stride = id->rgb_stride_frame << 1; - break; - default: - id->uv_stride = id->uv_stride_frame; - id->rgb_stride = id->rgb_stride_frame; - } -} - -static void __fastcall convert_internal (int order, int bpp, int width, int height, - u32 accel, void * arg, convert_init_t * result) -{ - convert_rgb_t * id = (convert_rgb_t *) result->id; - - if (!id) { - result->id_size = sizeof (convert_rgb_t); - } else { - id->width = width; - id->uv_stride_frame = width >> 1; - id->rgb_stride_frame = ((bpp + 7) >> 3) * width; - - result->buf_size[0] = id->rgb_stride_frame * height; - result->buf_size[1] = result->buf_size[2] = 0; - result->start = convert_start; - - result->copy = NULL; - #ifdef ARCH_X86 - if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMXEXT)) { - result->copy = yuv2rgb_init_mmxext (order, bpp); - } - if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMX)) { - result->copy = yuv2rgb_init_mmx (order, bpp); - } - #endif - #ifdef LIBVO_MLIB - if ((result->copy == NULL) && (accel & MPEG2_ACCEL_MLIB)) { - result->copy = yuv2rgb_init_mlib (order, bpp); - } - #endif - if (result->copy == NULL) { - result->copy = convert_yuv2rgb_c; - id->yuv2rgb = yuv2rgb_c_init (order, bpp); - } - } -} - -void __fastcall convert_rgb32 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_RGB, 32, width, height, accel, arg, result); -} - -void __fastcall convert_rgb24 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_RGB, 24, width, height, accel, arg, result); -} - -void __fastcall convert_rgb16 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_RGB, 16, width, height, accel, arg, result); -} - -void __fastcall convert_rgb15 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_RGB, 15, width, height, accel, arg, result); -} - -void __fastcall convert_bgr32 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_BGR, 32, width, height, accel, arg, result); -} - -void __fastcall convert_bgr24 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_BGR, 24, width, height, accel, arg, result); -} - -void __fastcall convert_bgr16 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_BGR, 16, width, height, accel, arg, result); -} - -void __fastcall convert_bgr15 (int width, int height, u32 accel, void * arg, - convert_init_t * result) -{ - convert_internal (CONVERT_BGR, 15, width, height, accel, arg, result); -} - -__forceinline convert_t* convert_rgb (int order, int bpp) -{ - if (order == CONVERT_RGB || order == CONVERT_BGR) - switch (bpp) { - case 32: return (order == CONVERT_RGB) ? convert_rgb32 : convert_bgr32; - case 24: return (order == CONVERT_RGB) ? convert_rgb24 : convert_bgr24; - case 16: return (order == CONVERT_RGB) ? convert_rgb16 : convert_bgr16; - case 15: return (order == CONVERT_RGB) ? convert_rgb15 : convert_bgr15; - } - return NULL; + /* For later reimplementation of C version */ } diff --git a/pcsx2/IPU/yuv2rgb.h b/pcsx2/IPU/yuv2rgb.h index 9d802a7d52..438175b383 100644 --- a/pcsx2/IPU/yuv2rgb.h +++ b/pcsx2/IPU/yuv2rgb.h @@ -1,57 +1,22 @@ -/* - * yuv2rgb.h - * Copyright (C) 2000-2002 Michel Lespinasse - * Copyright (C) 1999-2000 Aaron Holtzman - * Modified by Florin for PCSX2 emu +/* Pcsx2 - Pc Ps2 Emulator + * Copyright (C) 2002-2009 Pcsx2 Team * - * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. - * See http://libmpeg2.sourceforge.net/ for updates. - * - * mpeg2dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpeg2dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ -#ifndef YUV2RGB_H -#define YUV2RGB_H +#pragma once -#define CONVERT_FRAME 0 -#define CONVERT_TOP_FIELD 1 -#define CONVERT_BOTTOM_FIELD 2 -#define CONVERT_BOTH_FIELDS 3 - -struct convert_init_t { - void * id; - int id_size; - int buf_size[3]; - void (__fastcall* start) (void * id, u8 * dest, int flags); - void (__fastcall* copy) (void * id, u8 * Y, u8 * Cr, u8 * Cb, unsigned int v_offset); -}; - -typedef void __fastcall convert_t (int width, int height, u32 accel, void * arg, - convert_init_t * result); - -convert_t convert_rgb32; -convert_t convert_rgb24; -convert_t convert_rgb16; -convert_t convert_rgb15; -convert_t convert_bgr32; -convert_t convert_bgr24; -convert_t convert_bgr16; -convert_t convert_bgr15; - -#define CONVERT_RGB 0 -#define CONVERT_BGR 1 -extern convert_t* convert_rgb (int order, int bpp); - -#endif /* YUV2RGB_H */ +void yuv2rgb_sse2(void); +void yuv2rgb_init(void); diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index eca0384526..99b4903d42 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -993,7 +993,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch) j8Ptr[2] = JG8( 0 ); // jump if psxCycleEE > 0 - RET2(); // returns control to the EE + RET(); // returns control to the EE // Continue onward with branching here: x86SetJ8( j8Ptr[2] ); diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index f50291d81d..2059c8d261 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -993,7 +993,7 @@ void CheckForBIOSEnd() x86SetJ8( j8Ptr[1] ); // bios end - RET2(); + RET(); x86SetJ8( j8Ptr[2] ); } @@ -1250,7 +1250,7 @@ static void iBranchTest(u32 newpc, bool noDispatch) JS32((uptr)DispatcherReg - ( (uptr)x86Ptr[0] + 6 )); } - RET2(); + RET(); } static void checkcodefn() diff --git a/pcsx2/x86/ix86/ix86.inl b/pcsx2/x86/ix86/ix86.inl index 8e087c155e..a6458ad2f2 100644 --- a/pcsx2/x86/ix86/ix86.inl +++ b/pcsx2/x86/ix86/ix86.inl @@ -3207,8 +3207,7 @@ emitterT void ePUSHFD( void ) { write8( 0x9C ); } /* popfd */ emitterT void ePOPFD( void ) { write8( 0x9D ); } -emitterT void eRET( void ) { write8( 0xC3 ); } -emitterT void eRET2( void ) { write16( 0xc3f3 ); } +emitterT void eRET( void ) { /*write8( 0xf3 ); /*<-- K8 opt?*/ write8( 0xC3 ); } emitterT void eCBW( void ) { write16( 0x9866 ); } emitterT void eCWD( void ) { write8( 0x98 ); } diff --git a/pcsx2/x86/ix86/ix86_macros.h b/pcsx2/x86/ix86/ix86_macros.h index 48e0931095..8265141040 100644 --- a/pcsx2/x86/ix86/ix86_macros.h +++ b/pcsx2/x86/ix86/ix86_macros.h @@ -394,7 +394,6 @@ #define PUSHFD ePUSHFD<_EmitterId_> #define POPFD ePOPFD<_EmitterId_> #define RET eRET<_EmitterId_> -#define RET2 eRET2<_EmitterId_> #define CBW eCBW<_EmitterId_> #define CWDE eCWDE<_EmitterId_> #define CWD eCWD<_EmitterId_> diff --git a/pcsx2/x86/ix86/ix86_sse.inl b/pcsx2/x86/ix86/ix86_sse.inl index 4f27e03d27..87c2c6d764 100644 --- a/pcsx2/x86/ix86/ix86_sse.inl +++ b/pcsx2/x86/ix86/ix86_sse.inl @@ -276,7 +276,7 @@ emitterT void eSSE_MOVUPSRmtoROffset( x86SSERegType to, x86IntRegType from, int } // movups r32 to [r32+offset] -emitterT void eSSE_MOVUPSRtoRmOffset( x86SSERegType to, x86IntRegType from, int offset ) +emitterT void eSSE_MOVUPSRtoRmOffset( x86IntRegType to, x86SSERegType from, int offset ) { RexRB(0, from, to); write16( 0x110f ); @@ -955,7 +955,7 @@ emitterT void eSSE2_PXOR_M128_to_XMM( x86SSERegType to, uptr from ) { SSEMtoR emitterT void eSSE2_MOVDQA_M128_to_XMM(x86SSERegType to, uptr from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_M128_to_XMM( to, from ); else SSEMtoR66(0x6F0F); } emitterT void eSSE2_MOVDQA_XMM_to_M128( uptr to, x86SSERegType from ) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_M128( to, from ); else SSERtoM66(0x7F0F); } -emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from) { if (to != from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM( to, from ); else SSERtoR66(0x6F0F); } } +emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM( to, from ); else if( to != from ) SSERtoR66(0x6F0F); } emitterT void eSSE2_MOVDQU_M128_to_XMM(x86SSERegType to, uptr from) {