IPU Bug/Feature fix: Pseudonym has written a new yuv2rgb decoder that conforms to the IPU spec (which differs slightly from the MPEG spec). It improves color hue/saturation in many videos, and is a bit faster too.

Dynarec: Removed RET2().

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@683 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-03-05 02:21:07 +00:00
parent 7fd0f67f93
commit fc84ade01d
10 changed files with 296 additions and 541 deletions

View File

@ -96,9 +96,6 @@ int IPU1dma();
//char convert_data_buffer[sizeof(convert_rgb_t)];
char convert_data_buffer[0x1C];
convert_init_t convert_init={convert_data_buffer, sizeof(convert_data_buffer)};
convert_t *convert;
// Quantization matrix
static u8 niq[64], //non-intraquant matrix
iq[64]; //intraquant matrix
@ -216,8 +213,7 @@ void SaveState::ipuFreeze() {
if (!mpeg2_inited){
mpeg2_idct_init();
convert=convert_rgb (CONVERT_RGB, 32);
convert(16, 16, 0, NULL, &convert_init);
yuv2rgb_init();
memzero_obj(mb8.Y);
memzero_obj(mb8.Cb);
memzero_obj(mb8.Cr);
@ -314,8 +310,7 @@ void ipuSoftReset()
{
if (!mpeg2_inited){
mpeg2_idct_init();
convert=convert_rgb (CONVERT_RGB, 32);
convert(16, 16, 0, NULL, &convert_init);
yuv2rgb_init();
memzero_obj(mb8.Y);
memzero_obj(mb8.Cb);
memzero_obj(mb8.Cr);
@ -1274,8 +1269,7 @@ void __fastcall ipu_csc(macroblock_8 *mb8, macroblock_rgb32 *rgb32, int sgn){
int i;
u8* p = (u8*)rgb32;
convert_init.start(convert_init.id, (u8*)rgb32, CONVERT_FRAME);
convert_init.copy(convert_init.id, (u8*)mb8->Y, (u8*)mb8->Cr, (u8*)mb8->Cb, 0);
yuv2rgb_sse2();
if( s_thresh[0] > 0 ) {
for(i = 0; i < 64*4; i++, p += 4) {

View File

@ -19,6 +19,8 @@
#ifndef __IPU_H__
#define __IPU_H__
#include "mpeg2lib/Mpeg.h"
// IPU_INLINE_IRQS
// Scheduling ints into the future is a purist approach to emulation, and
// is mostly cosmetic since the emulator itself performs all actions instantly
@ -222,6 +224,10 @@ extern int coded_block_pattern;
extern int g_nIPU0Data; // or 0x80000000 whenever transferring
extern u8* g_pIPU0Pointer;
// The IPU can only do one task at once and never uses other buffers so these
// should be made available to functions in other modules to save registers.
PCSX2_ALIGNED16(extern macroblock_rgb32 rgb32);
PCSX2_ALIGNED16(extern macroblock_8 mb8);
void dmaIPU0();
void dmaIPU1();

View File

@ -64,12 +64,10 @@ struct macroblock_16{
short Cr[8][8]; //2
};
struct rgb32{
unsigned char r, g, b, a;
};
struct macroblock_rgb32{
struct rgb32 c[16][16];
struct {
unsigned char r, g, b, a;
} c[16][16];
};
struct rgb16{

View File

@ -1,514 +1,308 @@
/*
* yuv2rgb.c
* Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* Modified by Florin for PCSX2 emu
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
* See http://libmpeg2.sourceforge.net/ for updates.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
// IPU-correct yuv conversions by Pseudonym
// SSE2 Implementation by Pseudonym
#include "PrecompiledHeader.h"
#include "System.h"
#include "mpeg2lib/Mpeg.h"
#include "IPU.h"
#include "yuv2rgb.h"
//#include "convert_internal.h" //START
// Per-converter state used by the C yuv2rgb path. It is filled in by
// convert_internal() (frame-level values) and convert_start() (per-picture
// values) and read by convert_yuv2rgb_c().
struct convert_rgb_t {
// Destination pointer for the current picture; set by convert_start().
u8 * rgb_ptr;
// Picture width as passed to convert_internal().
int width;
// uv_stride_frame is width >> 1; uv_stride is the value in effect for the
// current picture (doubled by convert_start() for field pictures).
int uv_stride, uv_stride_frame;
// rgb_stride_frame is ((bpp + 7) >> 3) * width; rgb_stride is the value in
// effect for the current picture (doubled for field pictures).
int rgb_stride, rgb_stride_frame;
// Conversion routine selected by yuv2rgb_c_init(). convert_yuv2rgb_c() calls
// it with two luma pointers, two chroma pointers, two destination pointers
// and the width.
void (__fastcall * yuv2rgb) (u8 *, u8 *, u8 *, u8 *,
void *, void *, int);
};
// Everything below is bit accurate to the IPU specification (except maybe rounding).
// Know the specification before you touch it.
typedef void __fastcall yuv2rgb_copy (void * id, u8 * const * src,
unsigned int v_offset);
PCSX2_ALIGNED16(u16 C_bias)[8] = {0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000};
PCSX2_ALIGNED16(u8 Y_bias)[16] = {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
// Defines `name` as a 16-byte aligned array of eight identical u16 lanes, each
// holding the coefficient `x` scaled by 4 (the same value the former `x<<2`
// produced for the constants used in this file).
//  - `x` is parenthesized so that expression arguments are scaled as a whole.
//  - Multiplication is used instead of a left shift because shifting a negative
//    constant (e.g. the green coefficients) is not well defined.
//  - The explicit u16 cast makes the intended two's-complement wrap of negative
//    coefficients visible and avoids a narrowing conversion in the initializer.
// The trailing semicolon is kept so existing uses continue to compile unchanged.
#define SSE_COEFFICIENTS(name, x) \
PCSX2_ALIGNED16(u16 name)[8] = {(u16)((x)*4),(u16)((x)*4),(u16)((x)*4),(u16)((x)*4),(u16)((x)*4),(u16)((x)*4),(u16)((x)*4),(u16)((x)*4)};
SSE_COEFFICIENTS(Y_coefficients, 0x95); // 1.1640625
SSE_COEFFICIENTS(RCr_coefficients, 0xcc); // 1.59375
SSE_COEFFICIENTS(GCr_coefficients, (-0x68)); // -0.8125
SSE_COEFFICIENTS(GCb_coefficients, (-0x32)); // -0.390625
SSE_COEFFICIENTS(BCb_coefficients, 0x102); // 2.015625
PCSX2_ALIGNED16(u16 Y_mask)[8] = {0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00,0xff00};
// Specifying round off instead of round down as everywhere else
// implies that this is right
PCSX2_ALIGNED16(u16 round_1bit)[8] = {1,1,1,1,1,1,1,1};
PCSX2_ALIGNED16(u16 yuv2rgb_temp)[3][8];
yuv2rgb_copy __fastcall * yuv2rgb_init_mmxext (int bpp, int mode);
yuv2rgb_copy __fastcall * yuv2rgb_init_mmx (int bpp, int mode);
yuv2rgb_copy __fastcall * yuv2rgb_init_mlib (int bpp, int mode);
//#include "convert_internal.h" //END
static u32 matrix_coefficients = 6;
const s32 Inverse_Table_6_9[8][4] = {
{117504, 138453, 13954, 34903}, /*0 no sequence_display_extension */
{117504, 138453, 13954, 34903}, /*1 ITU-R Rec. 709 (1990) */
{104597, 132201, 25675, 53279}, /*2 unspecified */
{104597, 132201, 25675, 53279}, /*3 reserved */
{104448, 132798, 24759, 53109}, /*4 FCC */
{104597, 132201, 25675, 53279}, /*5 ITU-R Rec. 624-4 System B, G */
{104597, 132201, 25675, 53279}, /*6 SMPTE 170M */
{117579, 136230, 16907, 35559} /*7 SMPTE 240M (1987) */
};
typedef void __fastcall yuv2rgb_c_internal (u8 *, u8 *, u8 *, u8 *,
void *, void *, int);
void * table_rV[256];
void * table_gU[256];
int table_gV[256];
void * table_bU[256];
#define _RGB(type,i) \
U = pu[i]; \
V = pv[i]; \
r = (type *) table_rV[V]; \
g = (type *) (((u8 *)table_gU[U]) + table_gV[V]); \
b = (type *) table_bU[U];
#define DST(py,dst,i) \
Y = py[2*i]; \
dst[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py[2*i+1]; \
dst[2*i+1] = r[Y] + g[Y] + b[Y];
#define DSTRGB(py,dst,i) \
Y = py[2*i]; \
dst[6*i] = r[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = b[Y]; \
Y = py[2*i+1]; \
dst[6*i+3] = r[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = b[Y];
#define DSTBGR(py,dst,i) \
Y = py[2*i]; \
dst[6*i] = b[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = r[Y]; \
Y = py[2*i+1]; \
dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y];
static void __fastcall yuv2rgb_c_32 (u8 * py_1, u8 * py_2,
u8 * pu, u8 * pv,
void * _dst_1, void * _dst_2, int width)
// This could potentially be improved for SSE4
void yuv2rgb_sse2(void)
{
int U, V, Y;
u32 * r, * g, * b;
u32 * dst_1, * dst_2;
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
__asm {
mov eax, 1
mov esi, 0
mov edi, 0
width >>= 3;
dst_1 = (u32 *) _dst_1;
dst_2 = (u32 *) _dst_2;
align 16
tworows:
movq xmm3, qword ptr [mb8+256+esi]
movq xmm1, qword ptr [mb8+320+esi]
pxor xmm2, xmm2
pxor xmm0, xmm0
// could skip the movq but punpck requires 128-bit alignment
// for some reason, so two versions would be needed,
// bloating the function (further)
punpcklbw xmm2, xmm3
punpcklbw xmm0, xmm1
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
pxor xmm2, xmmword ptr [C_bias] // xmm2 <-- 8 x (Cb - 128) << 8
pxor xmm0, xmmword ptr [C_bias] // xmm0 <-- 8 x (Cr - 128) << 8
do {
_RGB (u32, 0);
DST (py_1, dst_1, 0);
DST (py_2, dst_2, 0);
movaps xmm1, xmm0
movaps xmm3, xmm2
pmulhw xmm1, xmmword ptr [GCr_coefficients]
pmulhw xmm3, xmmword ptr [GCb_coefficients]
pmulhw xmm0, xmmword ptr [RCr_coefficients]
pmulhw xmm2, xmmword ptr [BCb_coefficients]
paddsw xmm1, xmm3
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
movaps xmmword ptr [yuv2rgb_temp], xmm0
movaps xmmword ptr [yuv2rgb_temp+16], xmm1
movaps xmmword ptr [yuv2rgb_temp+32], xmm2
jmp ihatemsvc
_RGB (u32, 1);
DST (py_2, dst_2, 1);
DST (py_1, dst_1, 1);
align 16
onerow:
movaps xmm0, xmmword ptr [yuv2rgb_temp]
movaps xmm1, xmmword ptr [yuv2rgb_temp+16]
movaps xmm2, xmmword ptr [yuv2rgb_temp+32]
_RGB (u32, 2);
DST (py_1, dst_1, 2);
DST (py_2, dst_2, 2);
// If masm directives worked properly in inline asm, I'd be using them,
// but I'm not inclined to write ~70 line #defines to simulate them.
// Maybe the function's faster like this anyway because it's smaller?
// I'd have to write a 70 line #define to benchmark it.
_RGB (u32, 3);
DST (py_2, dst_2, 3);
DST (py_1, dst_1, 3);
ihatemsvc:
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
pu += 4;
pv += 4;
py_1 += 8;
py_2 += 8;
dst_1 += 8;
dst_2 += 8;
} while (--width);
}
movaps xmm6, xmmword ptr [mb8+edi]
psubusb xmm6, xmmword ptr [Y_bias]
movaps xmm7, xmm6
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
pand xmm7, xmmword ptr [Y_mask] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
/* This is very near from the yuv2rgb_c_32 code */
static void __fastcall yuv2rgb_c_24_rgb (u8 * py_1, u8 * py_2,
u8 * pu, u8 * pv,
void * _dst_1, void * _dst_2, int width)
{
int U, V, Y;
u8 * r, * g, * b;
u8 * dst_1, * dst_2;
pmulhuw xmm6, xmmword ptr [Y_coefficients]
pmulhuw xmm7, xmmword ptr [Y_coefficients]
width >>= 3;
dst_1 = (u8 *) _dst_1;
dst_2 = (u8 *) _dst_2;
paddsw xmm0, xmm6
paddsw xmm3, xmm7
paddsw xmm1, xmm6
paddsw xmm4, xmm7
paddsw xmm2, xmm6
paddsw xmm5, xmm7
do {
_RGB (u8, 0);
DSTRGB (py_1, dst_1, 0);
DSTRGB (py_2, dst_2, 0);
// round
movaps xmm6, xmmword ptr [round_1bit]
paddw xmm0, xmm6
paddw xmm1, xmm6
paddw xmm2, xmm6
paddw xmm3, xmm6
paddw xmm4, xmm6
paddw xmm5, xmm6
psraw xmm0, 1
psraw xmm1, 1
psraw xmm2, 1
psraw xmm3, 1
psraw xmm4, 1
psraw xmm5, 1
_RGB (u8, 1);
DSTRGB (py_2, dst_2, 1);
DSTRGB (py_1, dst_1, 1);
// combine even and odd bytes
packuswb xmm0, xmm3
packuswb xmm1, xmm4
packuswb xmm2, xmm5
movhlps xmm3, xmm0
movhlps xmm4, xmm1
movhlps xmm5, xmm2
punpcklbw xmm0, xmm3 // Red bytes, back in order
punpcklbw xmm1, xmm4 // Green ""
punpcklbw xmm2, xmm5 // Blue ""
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
_RGB (u8, 2);
DSTRGB (py_1, dst_1, 2);
DSTRGB (py_2, dst_2, 2);
// Create RGBA (we could generate A here, but we don't) quads
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm7
movaps xmm1, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm1, xmm2
_RGB (u8, 3);
DSTRGB (py_2, dst_2, 3);
DSTRGB (py_1, dst_1, 3);
punpckhbw xmm3, xmm4
punpckhbw xmm5, xmm7
movaps xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
pu += 4;
pv += 4;
py_1 += 8;
py_2 += 8;
dst_1 += 24;
dst_2 += 24;
} while (--width);
}
// at last
movaps xmmword ptr [rgb32+edi*4+0], xmm0
movaps xmmword ptr [rgb32+edi*4+16], xmm1
movaps xmmword ptr [rgb32+edi*4+32], xmm3
movaps xmmword ptr [rgb32+edi*4+48], xmm4
/* only trivial mods from yuv2rgb_c_24_rgb */
static void __fastcall yuv2rgb_c_24_bgr (u8 * py_1, u8 * py_2,
u8 * pu, u8 * pv,
void * _dst_1, void * _dst_2, int width)
{
int U, V, Y;
u8 * r, * g, * b;
u8 * dst_1, * dst_2;
add edi, 16
width >>= 3;
dst_1 = (u8 *) _dst_1;
dst_2 = (u8 *) _dst_2;
neg eax
jl onerow // run twice
do {
_RGB (u8, 0);
DSTBGR (py_1, dst_1, 0);
DSTBGR (py_2, dst_2, 0);
add esi, 8
cmp esi, 64
jne tworows
}
#elif defined(__GNUC__)
asm(
".intel_syntax noprefix\n"
"mov eax, 1\n"
"mov esi, 0\n"
"mov edi, 0\n"
_RGB (u8, 1);
DSTBGR (py_2, dst_2, 1);
DSTBGR (py_1, dst_1, 1);
".align 16\n"
"tworows:\n"
"movq xmm3, qword ptr [mb8+256+esi]\n"
"movq xmm1, qword ptr [mb8+320+esi]\n"
"pxor xmm2, xmm2\n"
"pxor xmm0, xmm0\n"
// could skip the movq but punpck requires 128-bit alignment
// for some reason, so two versions would be needed,
// bloating the function (further)
"punpcklbw xmm2, xmm3\n"
"punpcklbw xmm0, xmm1\n"
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
"pxor xmm2, xmmword ptr [C_bias]\n" // xmm2 <-- 8 x (Cb - 128) << 8
"pxor xmm0, xmmword ptr [C_bias]\n" // xmm0 <-- 8 x (Cr - 128) << 8
_RGB (u8, 2);
DSTBGR (py_1, dst_1, 2);
DSTBGR (py_2, dst_2, 2);
"movaps xmm1, xmm0\n"
"movaps xmm3, xmm2\n"
"pmulhw xmm1, xmmword ptr [GCr_coefficients]\n"
"pmulhw xmm3, xmmword ptr [GCb_coefficients]\n"
"pmulhw xmm0, xmmword ptr [RCr_coefficients]\n"
"pmulhw xmm2, xmmword ptr [BCb_coefficients]\n"
"paddsw xmm1, xmm3\n"
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
"movaps xmmword ptr [yuv2rgb_temp], xmm0\n"
"movaps xmmword ptr [yuv2rgb_temp+16], xmm1\n"
"movaps xmmword ptr [yuv2rgb_temp+32], xmm2\n"
"jmp ihategcctoo\n"
_RGB (u8, 3);
DSTBGR (py_2, dst_2, 3);
DSTBGR (py_1, dst_1, 3);
".align 16\n"
"onerow:\n"
"movaps xmm0, xmmword ptr [yuv2rgb_temp]\n"
"movaps xmm1, xmmword ptr [yuv2rgb_temp+16]\n"
"movaps xmm2, xmmword ptr [yuv2rgb_temp+32]\n"
pu += 4;
pv += 4;
py_1 += 8;
py_2 += 8;
dst_1 += 24;
dst_2 += 24;
} while (--width);
}
"ihategcctoo:\n"
"movaps xmm3, xmm0\n"
"movaps xmm4, xmm1\n"
"movaps xmm5, xmm2\n"
/* This is exactly the same code as yuv2rgb_c_32 except for the types of */
/* r, g, b, dst_1, dst_2 */
static void __fastcall yuv2rgb_c_16 (u8 * py_1, u8 * py_2,
u8 * pu, u8 * pv,
void * _dst_1, void * _dst_2, int width)
{
int U, V, Y;
u16 * r, * g, * b;
u16 * dst_1, * dst_2;
"movaps xmm6, xmmword ptr [mb8+edi]\n"
"psubusb xmm6, xmmword ptr [Y_bias]\n"
"movaps xmm7, xmm6\n"
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
"pand xmm7, xmmword ptr [Y_mask]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
width >>= 3;
dst_1 = (u16 *) _dst_1;
dst_2 = (u16 *) _dst_2;
"pmulhuw xmm6, xmmword ptr [Y_coefficients]\n"
"pmulhuw xmm7, xmmword ptr [Y_coefficients]\n"
do {
_RGB (u16, 0);
DST (py_1, dst_1, 0);
DST (py_2, dst_2, 0);
"paddsw xmm0, xmm6\n"
"paddsw xmm3, xmm7\n"
"paddsw xmm1, xmm6\n"
"paddsw xmm4, xmm7\n"
"paddsw xmm2, xmm6\n"
"paddsw xmm5, xmm7\n"
_RGB (u16, 1);
DST (py_2, dst_2, 1);
DST (py_1, dst_1, 1);
// round
"movaps xmm6, xmmword ptr [round_1bit]\n"
"paddw xmm0, xmm6\n"
"paddw xmm1, xmm6\n"
"paddw xmm2, xmm6\n"
"paddw xmm3, xmm6\n"
"paddw xmm4, xmm6\n"
"paddw xmm5, xmm6\n"
"psraw xmm0, 1\n"
"psraw xmm1, 1\n"
"psraw xmm2, 1\n"
"psraw xmm3, 1\n"
"psraw xmm4, 1\n"
"psraw xmm5, 1\n"
_RGB (u16, 2);
DST (py_1, dst_1, 2);
DST (py_2, dst_2, 2);
// combine even and odd bytes
"packuswb xmm0, xmm3\n"
"packuswb xmm1, xmm4\n"
"packuswb xmm2, xmm5\n"
"movhlps xmm3, xmm0\n"
"movhlps xmm4, xmm1\n"
"movhlps xmm5, xmm2\n"
"punpcklbw xmm0, xmm3\n" // Red bytes, back in order
"punpcklbw xmm1, xmm4\n" // Green ""
"punpcklbw xmm2, xmm5\n" // Blue ""
"movaps xmm3, xmm0\n"
"movaps xmm4, xmm1\n"
"movaps xmm5, xmm2\n"
_RGB (u16, 3);
DST (py_2, dst_2, 3);
DST (py_1, dst_1, 3);
// Create RGBA (we could generate A here, but we don't) quads
"punpcklbw xmm0, xmm1\n"
"punpcklbw xmm2, xmm7\n"
"movaps xmm1, xmm0\n"
"punpcklwd xmm0, xmm2\n"
"punpckhwd xmm1, xmm2\n"
pu += 4;
pv += 4;
py_1 += 8;
py_2 += 8;
dst_1 += 8;
dst_2 += 8;
} while (--width);
}
"punpckhbw xmm3, xmm4\n"
"punpckhbw xmm5, xmm7\n"
"movaps xmm4, xmm3\n"
"punpcklwd xmm3, xmm5\n"
"punpckhwd xmm4, xmm5\n"
// Integer division rounded to the nearest value. The magnitude of `dividend`
// is biased by half of `divisor` before the truncating division, and the sign
// of `dividend` is re-applied afterwards, so results are symmetric around zero
// (e.g. 7/2 -> 4 and -7/2 -> -4). A `dividend` of zero takes the "negative"
// path and yields 0 for the positive divisors used by the callers in this file.
static int div_round (int dividend, int divisor)
{
	const int half = divisor >> 1;
	const int positive = (dividend > 0);
	const int magnitude = positive ? dividend : -dividend;
	const int quotient = (magnitude + half) / divisor;

	return positive ? quotient : -quotient;
}
// at last
"movaps xmmword ptr [rgb32+edi*4+0], xmm0\n"
"movaps xmmword ptr [rgb32+edi*4+16], xmm1\n"
"movaps xmmword ptr [rgb32+edi*4+32], xmm3\n"
"movaps xmmword ptr [rgb32+edi*4+48], xmm4\n"
static yuv2rgb_c_internal __fastcall * yuv2rgb_c_init (int order, int bpp)
{
int i;
u8 table_Y[1024];
u32 * table_32 = 0;
u16 * table_16 = 0;
u8 * table_8 = 0;
int entry_size = 0;
void * table_r = 0;
void * table_g = 0;
void * table_b = 0;
yuv2rgb_c_internal * yuv2rgb;
"add edi, 16\n"
int crv = Inverse_Table_6_9[matrix_coefficients][0];
int cbu = Inverse_Table_6_9[matrix_coefficients][1];
int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
"neg eax\n"
"jl onerow\n" // run twice
for (i = 0; i < 1024; i++)
{
int j;
j = (76309 * (i - 384 - 16) + 32768) >> 16;
j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
table_Y[i] = j;
}
switch (bpp)
{
case 32:
yuv2rgb = yuv2rgb_c_32;
table_32 = (u32 *) malloc ((197 + 2*682 + 256 + 132) *
sizeof (u32));
entry_size = sizeof (u32);
table_r = table_32 + 197;
table_b = table_32 + 197 + 685;
table_g = table_32 + 197 + 2*682;
for (i = -197; i < 256+197; i++)
((u32 *) table_r)[i] =
table_Y[i+384] << ((order == CONVERT_RGB) ? 16 : 0);
for (i = -132; i < 256+132; i++)
((u32 *) table_g)[i] = table_Y[i+384] << 8;
for (i = -232; i < 256+232; i++)
((u32 *) table_b)[i] =
table_Y[i+384] << ((order == CONVERT_RGB) ? 0 : 16);
break;
case 24:
yuv2rgb = (order == CONVERT_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;
table_8 = (u8 *) malloc ((256 + 2*232) * sizeof (u8));
entry_size = sizeof (u8);
table_r = table_g = table_b = table_8 + 232;
for (i = -232; i < 256+232; i++)
((u8 * )table_b)[i] = table_Y[i+384];
break;
case 15:
case 16:
yuv2rgb = yuv2rgb_c_16;
table_16 = (u16 *) malloc ((197 + 2*682 + 256 + 132) *
sizeof (u16));
entry_size = sizeof (u16);
table_r = table_16 + 197;
table_b = table_16 + 197 + 685;
table_g = table_16 + 197 + 2*682;
for (i = -197; i < 256+197; i++) {
int j = table_Y[i+384] >> 3;
if (order == CONVERT_RGB)
j <<= ((bpp==16) ? 11 : 10);
((u16 *)table_r)[i] = j;
}
for (i = -132; i < 256+132; i++) {
int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
((u16 *)table_g)[i] = j << 5;
}
for (i = -232; i < 256+232; i++) {
int j = table_Y[i+384] >> 3;
if (order == CONVERT_RGB)
j <<= ((bpp==16) ? 11 : 10);
((u16 *)table_b)[i] = j;
}
break;
#ifdef PCSX2_DEVBUILD
default:
DevCon::Error( "IPU Panic! %ibpp not supported by yuv2rgb", params bpp );
"add esi, 8\n"
"cmp esi, 64\n"
"jne tworows\n"
".att_syntax\n"
);
#else
jNO_DEFAULT
#error Unsupported compiler
#endif
}
for (i = 0; i < 256; i++) {
table_rV[i] = (((u8 *)table_r) +
entry_size * div_round (crv * (i-128), 76309));
table_gU[i] = (((u8 *)table_g) +
entry_size * div_round (cgu * (i-128), 76309));
table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
table_bU[i] = (((u8 *)table_b) +
entry_size * div_round (cbu * (i-128), 76309));
}
return yuv2rgb;
}
static void __fastcall convert_yuv2rgb_c (void * _id, u8 * Y, u8 * Cr, u8 * Cb,
unsigned int v_offset)
void yuv2rgb_init(void)
{
convert_rgb_t * id = (convert_rgb_t *) _id;
u8 * dst;
u8 * py;
u8 * pu;
u8 * pv;
int loop;
dst = id->rgb_ptr + id->rgb_stride * v_offset;
py = Y; pu = Cr; pv = Cb;
loop = 8;
do {
id->yuv2rgb (py, py + (id->uv_stride << 1), pu, pv,
dst, dst + id->rgb_stride, id->width);
py += id->uv_stride << 2;
pu += id->uv_stride;
pv += id->uv_stride;
dst += 2 * id->rgb_stride;
} while (--loop);
}
// Prepares the converter state for a new destination picture.
//   _id   - opaque converter state; treated as a convert_rgb_t.
//   dest  - start of the destination RGB buffer.
//   flags - CONVERT_TOP_FIELD / CONVERT_BOTTOM_FIELD select field output;
//           any other value is handled as a full frame.
// For field output both strides are doubled so consecutive rows of the field
// skip the other field's rows; the bottom field additionally starts one frame
// row into the buffer.
static void __fastcall convert_start (void * _id, u8 * dest, int flags)
{
	convert_rgb_t * const state = (convert_rgb_t *) _id;
	const int bottom_field = (flags == CONVERT_BOTTOM_FIELD);
	const int field_picture = bottom_field || (flags == CONVERT_TOP_FIELD);

	state->rgb_ptr = bottom_field ? dest + state->rgb_stride_frame : dest;

	if (field_picture) {
		state->uv_stride = state->uv_stride_frame << 1;
		state->rgb_stride = state->rgb_stride_frame << 1;
	} else {
		state->uv_stride = state->uv_stride_frame;
		state->rgb_stride = state->rgb_stride_frame;
	}
}
// Shared implementation behind the convert_rgbNN / convert_bgrNN entry points.
//   order  - CONVERT_RGB or CONVERT_BGR, forwarded to the routine initializers.
//   bpp    - bits per output pixel; rounded up to whole bytes for the stride.
//   width, height - picture dimensions.
//   accel  - bitmask of MPEG2_ACCEL_* flags used to pick an accelerated copy.
//   arg    - not used by this function.
//   result - in/out descriptor (see below).
// Two modes, selected by result->id:
//   * id == NULL: only reports the required state size in result->id_size.
//   * id != NULL: initializes the convert_rgb_t behind id and fills in the
//     buffer sizes and the start/copy callbacks of result.
static void __fastcall convert_internal (int order, int bpp, int width, int height,
u32 accel, void * arg, convert_init_t * result)
{
convert_rgb_t * id = (convert_rgb_t *) result->id;
if (!id) {
// Size query: the caller has not supplied state storage yet.
result->id_size = sizeof (convert_rgb_t);
} else {
id->width = width;
// Chroma rows are half as wide as luma rows.
id->uv_stride_frame = width >> 1;
// Bytes per output row: bpp rounded up to whole bytes, times width.
id->rgb_stride_frame = ((bpp + 7) >> 3) * width;
// Only the first buffer is used; it holds the whole RGB picture.
result->buf_size[0] = id->rgb_stride_frame * height;
result->buf_size[1] = result->buf_size[2] = 0;
result->start = convert_start;
// The copy routine is chosen below: the first initializer that returns
// non-NULL wins, with the plain C routine as the final fallback.
result->copy = NULL;
#ifdef ARCH_X86
// NOTE(review): the prototypes of the yuv2rgb_init_* functions in this file
// name their parameters (bpp, mode) while they are called with (order, bpp)
// -- confirm the intended argument order.
if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMXEXT)) {
result->copy = yuv2rgb_init_mmxext (order, bpp);
}
if ((result->copy == NULL) && (accel & MPEG2_ACCEL_X86_MMX)) {
result->copy = yuv2rgb_init_mmx (order, bpp);
}
#endif
#ifdef LIBVO_MLIB
if ((result->copy == NULL) && (accel & MPEG2_ACCEL_MLIB)) {
result->copy = yuv2rgb_init_mlib (order, bpp);
}
#endif
if (result->copy == NULL) {
// C fallback: convert_yuv2rgb_c drives the per-row routine that
// yuv2rgb_c_init selects for this order/bpp.
result->copy = convert_yuv2rgb_c;
id->yuv2rgb = yuv2rgb_c_init (order, bpp);
}
}
}
// Converter initializer for 32 bpp output in RGB order: forwards all arguments
// to convert_internal() with order CONVERT_RGB and bpp 32.
void __fastcall convert_rgb32 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_RGB, 32, width, height, accel, arg, result);
}
// Converter initializer for 24 bpp output in RGB order: forwards all arguments
// to convert_internal() with order CONVERT_RGB and bpp 24.
void __fastcall convert_rgb24 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_RGB, 24, width, height, accel, arg, result);
}
// Converter initializer for 16 bpp output in RGB order: forwards all arguments
// to convert_internal() with order CONVERT_RGB and bpp 16.
void __fastcall convert_rgb16 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_RGB, 16, width, height, accel, arg, result);
}
// Converter initializer for 15 bpp output in RGB order: forwards all arguments
// to convert_internal() with order CONVERT_RGB and bpp 15.
void __fastcall convert_rgb15 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_RGB, 15, width, height, accel, arg, result);
}
// Converter initializer for 32 bpp output in BGR order: forwards all arguments
// to convert_internal() with order CONVERT_BGR and bpp 32.
void __fastcall convert_bgr32 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_BGR, 32, width, height, accel, arg, result);
}
// Converter initializer for 24 bpp output in BGR order: forwards all arguments
// to convert_internal() with order CONVERT_BGR and bpp 24.
void __fastcall convert_bgr24 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_BGR, 24, width, height, accel, arg, result);
}
// Converter initializer for 16 bpp output in BGR order: forwards all arguments
// to convert_internal() with order CONVERT_BGR and bpp 16.
void __fastcall convert_bgr16 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_BGR, 16, width, height, accel, arg, result);
}
// Converter initializer for 15 bpp output in BGR order: forwards all arguments
// to convert_internal() with order CONVERT_BGR and bpp 15.
void __fastcall convert_bgr15 (int width, int height, u32 accel, void * arg,
convert_init_t * result)
{
convert_internal (CONVERT_BGR, 15, width, height, accel, arg, result);
}
// Looks up the converter initializer for a channel order and pixel depth.
//   order - CONVERT_RGB or CONVERT_BGR.
//   bpp   - 32, 24, 16 or 15.
// Returns the matching convert_rgbNN / convert_bgrNN function, or NULL when
// either argument is not one of the supported values.
__forceinline convert_t* convert_rgb (int order, int bpp)
{
	const int want_rgb = (order == CONVERT_RGB);

	if (!want_rgb && order != CONVERT_BGR)
		return NULL;

	if (bpp == 32)
		return want_rgb ? convert_rgb32 : convert_bgr32;
	if (bpp == 24)
		return want_rgb ? convert_rgb24 : convert_bgr24;
	if (bpp == 16)
		return want_rgb ? convert_rgb16 : convert_bgr16;
	if (bpp == 15)
		return want_rgb ? convert_rgb15 : convert_bgr15;

	return NULL;
/* For later reimplementation of C version */
}

View File

@ -1,57 +1,22 @@
/*
* yuv2rgb.h
* Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
* Modified by Florin for PCSX2 emu
/* Pcsx2 - Pc Ps2 Emulator
* Copyright (C) 2002-2009 Pcsx2 Team
*
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
* See http://libmpeg2.sourceforge.net/ for updates.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef YUV2RGB_H
#define YUV2RGB_H
#pragma once
#define CONVERT_FRAME 0
#define CONVERT_TOP_FIELD 1
#define CONVERT_BOTTOM_FIELD 2
#define CONVERT_BOTH_FIELDS 3
// Descriptor exchanged between a caller and a convert_t initializer
// (convert_rgb32 and friends).
struct convert_init_t {
// Caller-provided storage for the converter's private state. When NULL, the
// initializer only reports the required size in id_size.
void * id;
// Size in bytes of the storage behind id (or, after a size query, the size
// the converter requires).
int id_size;
// Output buffer sizes in bytes. convert_internal() sets entry 0 to the size
// of the whole RGB picture and entries 1 and 2 to zero.
int buf_size[3];
// Called once per destination picture; flags is one of the CONVERT_* picture
// kinds (frame / top field / bottom field).
void (__fastcall* start) (void * id, u8 * dest, int flags);
// Converts one block of planar Y/Cr/Cb data into the destination set up by
// start(); v_offset is the destination row offset.
void (__fastcall* copy) (void * id, u8 * Y, u8 * Cr, u8 * Cb, unsigned int v_offset);
};
typedef void __fastcall convert_t (int width, int height, u32 accel, void * arg,
convert_init_t * result);
convert_t convert_rgb32;
convert_t convert_rgb24;
convert_t convert_rgb16;
convert_t convert_rgb15;
convert_t convert_bgr32;
convert_t convert_bgr24;
convert_t convert_bgr16;
convert_t convert_bgr15;
#define CONVERT_RGB 0
#define CONVERT_BGR 1
extern convert_t* convert_rgb (int order, int bpp);
#endif /* YUV2RGB_H */
void yuv2rgb_sse2(void);
void yuv2rgb_init(void);

View File

@ -993,7 +993,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)
j8Ptr[2] = JG8( 0 ); // jump if psxCycleEE > 0
RET2(); // returns control to the EE
RET(); // returns control to the EE
// Continue onward with branching here:
x86SetJ8( j8Ptr[2] );

View File

@ -993,7 +993,7 @@ void CheckForBIOSEnd()
x86SetJ8( j8Ptr[1] );
// bios end
RET2();
RET();
x86SetJ8( j8Ptr[2] );
}
@ -1250,7 +1250,7 @@ static void iBranchTest(u32 newpc, bool noDispatch)
JS32((uptr)DispatcherReg - ( (uptr)x86Ptr[0] + 6 ));
}
RET2();
RET();
}
static void checkcodefn()

View File

@ -3207,8 +3207,7 @@ emitterT void ePUSHFD( void ) { write8<I>( 0x9C ); }
/* popfd */
emitterT void ePOPFD( void ) { write8<I>( 0x9D ); }
emitterT void eRET( void ) { write8<I>( 0xC3 ); }
emitterT void eRET2( void ) { write16<I>( 0xc3f3 ); }
// Emits a near return (single byte 0xC3). The disabled write of 0xF3 would
// prepend a REP prefix, giving the two-byte `rep ret` form.
// NOTE(review): presumably the "K8 opt" refers to AMD's recommendation to use
// `rep ret` for returns that are branch targets -- confirm before re-enabling.
emitterT void eRET( void ) { /* write8<I>( 0xf3 ); <-- K8 opt? */ write8<I>( 0xC3 ); }
// Emits the 16-bit value 0x9866. Assuming write16 stores little-endian, that is
// the byte pair 66 98: operand-size prefix followed by opcode 0x98 (CBW).
emitterT void eCBW( void ) { write16<I>( 0x9866 ); }
// Emits the single byte 0x98.
// NOTE(review): 0x98 is the CBW/CWDE opcode; CWD/CDQ is encoded as 0x99 (with a
// 0x66 prefix for the 16-bit CWD form). The emitted instruction therefore does
// not match this function's name -- verify what callers expect before changing.
emitterT void eCWD( void ) { write8<I>( 0x98 ); }

View File

@ -394,7 +394,6 @@
#define PUSHFD ePUSHFD<_EmitterId_>
#define POPFD ePOPFD<_EmitterId_>
#define RET eRET<_EmitterId_>
#define RET2 eRET2<_EmitterId_>
#define CBW eCBW<_EmitterId_>
#define CWDE eCWDE<_EmitterId_>
#define CWD eCWD<_EmitterId_>

View File

@ -276,7 +276,7 @@ emitterT void eSSE_MOVUPSRmtoROffset( x86SSERegType to, x86IntRegType from, int
}
// movups r32 to [r32+offset]
emitterT void eSSE_MOVUPSRtoRmOffset( x86SSERegType to, x86IntRegType from, int offset )
emitterT void eSSE_MOVUPSRtoRmOffset( x86IntRegType to, x86SSERegType from, int offset )
{
RexRB(0, from, to);
write16<I>( 0x110f );
@ -955,7 +955,7 @@ emitterT void eSSE2_PXOR_M128_to_XMM( x86SSERegType to, uptr from ) { SSEMtoR
// Emits an aligned 128-bit load from memory address `from` into XMM register
// `to`. When AlwaysUseMovaps is set the MOVAPS emitter is used instead;
// otherwise the SSEMtoR66 helper is invoked with 0x6F0F (presumably the
// 66-prefixed 0F 6F MOVDQA load encoding -- the helper is defined elsewhere).
emitterT void eSSE2_MOVDQA_M128_to_XMM(x86SSERegType to, uptr from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_M128_to_XMM<I>( to, from ); else SSEMtoR66(0x6F0F); }
// Emits an aligned 128-bit store of XMM register `from` to memory address `to`.
// When AlwaysUseMovaps is set the MOVAPS emitter is used instead; otherwise the
// SSERtoM66 helper is invoked with 0x7F0F (presumably the 66-prefixed 0F 7F
// MOVDQA store encoding -- the helper is defined elsewhere).
emitterT void eSSE2_MOVDQA_XMM_to_M128( uptr to, x86SSERegType from ) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_M128<I>( to, from ); else SSERtoM66(0x7F0F); }
emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from) { if (to != from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM<I>( to, from ); else SSERtoR66(0x6F0F); } }
// Emits a register-to-register 128-bit move from `from` to `to`.
// When AlwaysUseMovaps is set, the call is delegated unconditionally to the
// MOVAPS emitter (any self-move elimination is then up to that emitter).
// Otherwise the MOVDQA form is emitted via SSERtoR66(0x6F0F), and only when the
// two registers differ, so a self-move produces no code on that path.
emitterT void eSSE2_MOVDQA_XMM_to_XMM( x86SSERegType to, x86SSERegType from) { if( AlwaysUseMovaps ) eSSE_MOVAPS_XMM_to_XMM<I>( to, from ); else if( to != from ) SSERtoR66(0x6F0F); }
emitterT void eSSE2_MOVDQU_M128_to_XMM(x86SSERegType to, uptr from)
{