From 4bb830827da2881bf36e77d4fa364a5b90c7012d Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Mon, 26 Jul 2010 18:14:56 +0000 Subject: [PATCH] IPU: Various minor header file, table, and inline function tweakings/cleanups. Note that I unified several tables into structs and applied __aligned16 to them. I'm not just being silly: this seems to have a noticeable positive effect on framerates (~3-4% here). git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3573 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/IPU/IPU.cpp | 48 ++-- pcsx2/IPU/IPU.h | 6 - pcsx2/IPU/IPU_Fifo.cpp | 3 +- pcsx2/IPU/mpeg2lib/Mpeg.cpp | 160 ++++++------ pcsx2/IPU/mpeg2lib/Mpeg.h | 87 +++++-- pcsx2/IPU/mpeg2lib/Vlc.h | 480 ++++++++++++++++++------------------ pcsx2/IPU/yuv2rgb.cpp | 1 + 7 files changed, 419 insertions(+), 366 deletions(-) diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp index c5e7098fbc..1b216a3dcd 100644 --- a/pcsx2/IPU/IPU.cpp +++ b/pcsx2/IPU/IPU.cpp @@ -24,6 +24,8 @@ #include "IPU.h" #include "yuv2rgb.h" +#include "mpeg2lib/Mpeg.h" + #include "Vif.h" #include "Gif.h" #include "Vif_Dma.h" @@ -50,7 +52,7 @@ u8* g_pIPU0Pointer = NULL; void ReorderBitstream(); // the BP doesn't advance and returns -1 if there is no data to be read -tIPU_BP g_BP; +__aligned16 tIPU_BP g_BP; void IPUWorker(); @@ -65,6 +67,7 @@ static u8 iq[64]; //intraquant matrix u16 vqclut[16]; //clut conversion table static u8 s_thresh[2]; //thresholds for color conversions int coded_block_pattern = 0; + __aligned16 macroblock_8 mb8; __aligned16 macroblock_16 mb16; __aligned16 macroblock_rgb32 rgb32; @@ -73,8 +76,7 @@ __aligned16 macroblock_rgb16 rgb16; u8 indx4[16*16/2]; bool mpeg2_inited = false; //mpeg2_idct_init() must be called only once u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'}; -decoder_t decoder; //static, only to place it in bss -decoder_t tempdec; +__aligned16 decoder_t decoder; //static, only to place it in bss extern "C" { @@ -96,10 +98,6 @@ void init_g_decoder() decoder.intra_quantizer_matrix = (u8*)iq; decoder.non_intra_quantizer_matrix = (u8*)niq; decoder.picture_structure = FRAME_PICTURE; //default: progressive...my guess:P - decoder.mb8 = &mb8; - decoder.mb16 = &mb16; - decoder.rgb32 = &rgb32; - decoder.rgb16 = &rgb16; decoder.stride = 16; } @@ -428,8 +426,8 @@ static __forceinline BOOL ipuBDEC(u32 val, bool resume) decoder.dcr = bdec.DCR; decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN; - memzero(mb8); - memzero(mb16); + memzero_sse_a(mb8); + memzero_sse_a(mb16); } return mpeg2_slice(); @@ -595,8 +593,8 @@ static BOOL __fastcall ipuCSC(u32 val) if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; } - ipu_csc(&mb8, &rgb32, 0); - if (csc.OFM) ipu_dither(&rgb32, &rgb16, csc.DTE); + ipu_csc(mb8, rgb32, 0); + if (csc.OFM) ipu_dither(rgb32, rgb16, csc.DTE); if (csc.OFM) { @@ -637,10 +635,10 @@ static BOOL ipuPACK(u32 val) if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; } - ipu_csc(&mb8, &rgb32, 0); - ipu_dither(&rgb32, &rgb16, csc.DTE); + ipu_csc(mb8, rgb32, 0); + ipu_dither(rgb32, rgb16, csc.DTE); - if (csc.OFM) ipu_vq(&rgb16, indx4); + if (csc.OFM) ipu_vq(rgb16, indx4); if (csc.OFM) { @@ -1117,10 +1115,10 @@ u8 __fastcall getBits8(u8 *address, u32 advance) void Skl_YUV_To_RGB32_MMX(u8 *RGB, const int Dst_BpS, const u8 *Y, const u8 *U, const u8 *V, const int Src_BpS, const int Width, const int Height); -void __fastcall ipu_csc(macroblock_8 *mb8, macroblock_rgb32 *rgb32, int sgn) +__forceinline void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn) { int i; - u8* p = (u8*)rgb32; + u8* p = (u8*)&rgb32; yuv2rgb(); @@ -1151,30 +1149,30 @@ void __fastcall ipu_csc(macroblock_8 *mb8, macroblock_rgb32 *rgb32, int sgn) } } -void __fastcall ipu_dither(const macroblock_rgb32* rgb32, macroblock_rgb16 *rgb16, int dte) +__forceinline void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte) { int i, j; for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) { - rgb16->c[i][j].r = rgb32->c[i][j].r >> 3; - rgb16->c[i][j].g = rgb32->c[i][j].g >> 3; - rgb16->c[i][j].b = rgb32->c[i][j].b >> 3; - rgb16->c[i][j].a = rgb32->c[i][j].a == 0x40; + rgb16.c[i][j].r = rgb32.c[i][j].r >> 3; + rgb16.c[i][j].g = rgb32.c[i][j].g >> 3; + rgb16.c[i][j].b = rgb32.c[i][j].b >> 3; + rgb16.c[i][j].a = rgb32.c[i][j].a == 0x40; } } } -void __fastcall ipu_vq(macroblock_rgb16 *rgb16, u8* indx4) +__forceinline void ipu_vq(macroblock_rgb16& rgb16, u8* indx4) { Console.Error("IPU: VQ not implemented"); } -void __fastcall ipu_copy(const macroblock_8 *mb8, macroblock_16 *mb16) +__forceinline void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16) { - const u8 *s = (const u8*)mb8; - s16 *d = (s16*)mb16; + const u8 *s = (const u8*)&mb8; + s16 *d = (s16*)&mb16; int i; for (i = 0; i < 256; i++) *d++ = *s++; //Y bias - 16 for (i = 0; i < 64; i++) *d++ = *s++; //Cr bias - 128 diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h index bb23f05e33..d753e7c32d 100644 --- a/pcsx2/IPU/IPU.h +++ b/pcsx2/IPU/IPU.h @@ -16,7 +16,6 @@ #ifndef __IPU_H__ #define __IPU_H__ -#include "mpeg2lib/Mpeg.h" #include "IPU_Fifo.h" #ifdef _MSC_VER @@ -342,16 +341,11 @@ struct tIPU_cmd }; extern tIPU_cmd ipu_cmd; -extern tIPU_BP g_BP; extern int coded_block_pattern; extern int g_nIPU0Data; // or 0x80000000 whenever transferring extern u8* g_pIPU0Pointer; extern IPUStatus IPU1Status; extern tIPU_DMA g_nDMATransfer; -// The IPU can only do one task at once and never uses other buffers so these -// should be made available to functions in other modules to save registers. -extern __aligned16 macroblock_rgb32 rgb32; -extern __aligned16 macroblock_8 mb8; extern int ipuInit(); extern void ipuReset(); diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp index 2435903aa3..b6a3b08127 100644 --- a/pcsx2/IPU/IPU_Fifo.cpp +++ b/pcsx2/IPU/IPU_Fifo.cpp @@ -15,8 +15,9 @@ #include "PrecompiledHeader.h" #include "Common.h" -#include "IPU_Fifo.h" #include "IPU.h" +#include "mpeg2lib/Mpeg.h" + IPU_Fifo ipu_fifo; diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp index 7b7a278fa9..2325b37752 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp +++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp @@ -243,9 +243,9 @@ int get_macroblock_address_increment() u16 code = UBITS(16); if (code >= 4096) - mba = MBA_5 + (UBITS(5) - 2); + mba = MBA.mba5 + (UBITS(5) - 2); else if (code >= 768) - mba = MBA_11 + (UBITS(11) - 24); + mba = MBA.mba11 + (UBITS(11) - 24); else switch (UBITS(11)) { @@ -277,16 +277,16 @@ static __forceinline int get_luma_dc_dct_diff() if (code < 31) { - size = DClumtab0[code].size; - DUMPBITS(DClumtab0[code].len); + size = DCtable.lum0[code].size; + DUMPBITS(DCtable.lum0[code].len); // 5 bits max } else { code = UBITS(9) - 0x1f0; - size = DClumtab1[code].size; - DUMPBITS(DClumtab1[code].len); + size = DCtable.lum1[code].size; + DUMPBITS(DCtable.lum1[code].len); // 9 bits max } @@ -313,14 +313,14 @@ static __forceinline int get_chroma_dc_dct_diff() if (code<31) { - size = DCchromtab0[code].size; - DUMPBITS(DCchromtab0[code].len); + size = DCtable.chrom0[code].size; + DUMPBITS(DCtable.chrom0[code].len); } else { code = UBITS(10) - 0x3e0; - size = DCchromtab1[code].size; - DUMPBITS(DCchromtab1[code].len); + size = DCtable.chrom1[code].size; + DUMPBITS(DCtable.chrom1[code].len); } if (size==0) @@ -371,49 +371,55 @@ static __forceinline bool get_intra_block() if (code >= 16384 && (!decoder.intra_vlc_format || decoder.mpeg1)) { - tab = &DCTtabnext[(code >> 12) - 4]; + tab = &DCT.next[(code >> 12) - 4]; } else if (code >= 1024) { - if (decoder.intra_vlc_format && !decoder.mpeg1) - { - tab = &DCTtab0a[(code >> 8) - 4]; - } - else - { - tab = &DCTtab0[(code >> 8) - 4]; - } + if (decoder.intra_vlc_format && !decoder.mpeg1) + { + tab = &DCT.tab0a[(code >> 8) - 4]; + } + else + { + tab = &DCT.tab0[(code >> 8) - 4]; + } } else if (code >= 512) { - if (decoder.intra_vlc_format && !decoder.mpeg1) - { - tab = &DCTtab1a[(code >> 6) - 8]; - } - else - { - tab = &DCTtab1[(code >> 6) - 8]; - } + if (decoder.intra_vlc_format && !decoder.mpeg1) + { + tab = &DCT.tab1a[(code >> 6) - 8]; + } + else + { + tab = &DCT.tab1[(code >> 6) - 8]; + } } + + // [TODO] Optimization: Following codes can all be done by a single "expedited" lookup + // that should use a single unrolled DCT table instead of five separate tables used + // here. Multiple conditional statements are very slow, while modern CPU data caches + // have lots of room to spare. + else if (code >= 256) { - tab = &DCTtab2[(code >> 4) - 16]; + tab = &DCT.tab2[(code >> 4) - 16]; } else if (code >= 128) { - tab = &DCTtab3[(code >> 3) - 16]; + tab = &DCT.tab3[(code >> 3) - 16]; } else if (code >= 64) { - tab = &DCTtab4[(code >> 2) - 16]; + tab = &DCT.tab4[(code >> 2) - 16]; } else if (code >= 32) { - tab = &DCTtab5[(code >> 1) - 16]; + tab = &DCT.tab5[(code >> 1) - 16]; } else if (code >= 16) { - tab = &DCTtab6[code - 16]; + tab = &DCT.tab6[code - 16]; } else { @@ -519,40 +525,46 @@ static __forceinline bool get_non_intra_block(int * last) { if (i==0) { - tab = &DCTtabfirst[(code >> 12) - 4]; + tab = &DCT.first[(code >> 12) - 4]; } else { - tab = &DCTtabnext[(code >> 12)- 4]; + tab = &DCT.next[(code >> 12)- 4]; } } else if (code >= 1024) { - tab = &DCTtab0[(code >> 8) - 4]; + tab = &DCT.tab0[(code >> 8) - 4]; } else if (code >= 512) { - tab = &DCTtab1[(code >> 6) - 8]; + tab = &DCT.tab1[(code >> 6) - 8]; } + + // [TODO] Optimization: Following codes can all be done by a single "expedited" lookup + // that should use a single unrolled DCT table instead of five separate tables used + // here. Multiple conditional statements are very slow, while modern CPU data caches + // have lots of room to spare. + else if (code >= 256) { - tab = &DCTtab2[(code >> 4) - 16]; + tab = &DCT.tab2[(code >> 4) - 16]; } else if (code >= 128) { - tab = &DCTtab3[(code >> 3) - 16]; + tab = &DCT.tab3[(code >> 3) - 16]; } else if (code >= 64) { - tab = &DCTtab4[(code >> 2) - 16]; + tab = &DCT.tab4[(code >> 2) - 16]; } else if (code >= 32) { - tab = &DCTtab5[(code >> 1) - 16]; + tab = &DCT.tab5[(code >> 1) - 16]; } else if (code >= 16) { - tab = &DCTtab6[code - 16]; + tab = &DCT.tab6[code - 16]; } else { @@ -625,7 +637,7 @@ static __forceinline bool get_non_intra_block(int * last) return true; } -static bool __fastcall slice_intra_DCT(const int cc, u8 * const dest, const int stride, const bool skip) +static __forceinline bool slice_intra_DCT(const int cc, u8 * const dest, const int stride, const bool skip) { if (!skip || ipu_cmd.pos[3]) { @@ -655,13 +667,13 @@ static bool __fastcall slice_intra_DCT(const int cc, u8 * const dest, const int return true; } -static bool __fastcall slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip) +static __forceinline bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip) { int last; if (!skip) { - memzero(decoder.DCTblock); + memzero_sse_a(decoder.DCTblock); } if (!get_non_intra_block(&last)) @@ -735,8 +747,8 @@ bool mpeg2sliceIDEC() } decoder.coded_block_pattern = 0x3F;//all 6 blocks - memzero(*decoder.mb8); - memzero(*decoder.rgb32); + memzero_sse_a(mb8); + memzero_sse_a(rgb32); case 1: ipu_cmd.pos[1] = 1; @@ -756,37 +768,37 @@ bool mpeg2sliceIDEC() { case 0: case 1: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y, DCT_stride, ipu_cmd.pos[2] == 1)) + if (!slice_intra_DCT(0, (u8*)mb8.Y, DCT_stride, ipu_cmd.pos[2] == 1)) { ipu_cmd.pos[2] = 1; return false; } case 2: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + 8, DCT_stride, ipu_cmd.pos[2] == 2)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + 8, DCT_stride, ipu_cmd.pos[2] == 2)) { ipu_cmd.pos[2] = 2; return false; } case 3: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset, DCT_stride, ipu_cmd.pos[2] == 3)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + DCT_offset, DCT_stride, ipu_cmd.pos[2] == 3)) { ipu_cmd.pos[2] = 3; return false; } case 4: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[2] == 4)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[2] == 4)) { ipu_cmd.pos[2] = 4; return false; } case 5: - if (!slice_intra_DCT(1, (u8*)decoder.mb8->Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5)) + if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5)) { ipu_cmd.pos[2] = 5; return false; } case 6: - if (!slice_intra_DCT(2, (u8*)decoder.mb8->Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6)) + if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6)) { ipu_cmd.pos[2] = 6; return false; @@ -794,19 +806,19 @@ bool mpeg2sliceIDEC() } // Send The MacroBlock via DmaIpuFrom - ipu_csc(decoder.mb8, decoder.rgb32, decoder.sgn); + ipu_csc(mb8, rgb32, decoder.sgn); if (decoder.ofm == 0) { g_nIPU0Data = 64; - g_pIPU0Pointer = (u8*)decoder.rgb32; + g_pIPU0Pointer = (u8*)&rgb32; } else { - ipu_dither(decoder.rgb32, decoder.rgb16, decoder.dte); + ipu_dither(rgb32, rgb16, decoder.dte); g_nIPU0Data = 32; - g_pIPU0Pointer = (u8*)decoder.rgb16; + g_pIPU0Pointer = (u8*)&rgb16; } case 2: @@ -841,12 +853,12 @@ bool mpeg2sliceIDEC() code = UBITS(16); if (code >= 0x1000) { - mba = MBA_5 + (UBITS(5) - 2); + mba = MBA.mba5 + (UBITS(5) - 2); break; } else if (code >= 0x0300) { - mba = MBA_11 + (UBITS(11) - 24); + mba = MBA.mba11 + (UBITS(11) - 24); break; } else switch (UBITS(11)) @@ -942,8 +954,8 @@ bool mpeg2_slice() ipuRegs->ctrl.ECD = 0; ipuRegs->top = 0; - memzero(*decoder.mb8); - memzero(*decoder.mb16); + memzero_sse_a(mb8); + memzero_sse_a(mb16); case 1: if (!bitstream_init()) { @@ -972,37 +984,37 @@ bool mpeg2_slice() case 0: decoder.coded_block_pattern = 0x3F; case 1: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y, DCT_stride, ipu_cmd.pos[1] == 1)) + if (!slice_intra_DCT(0, (u8*)mb8.Y, DCT_stride, ipu_cmd.pos[1] == 1)) { ipu_cmd.pos[1] = 1; return false; } case 2: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + 8, DCT_stride, ipu_cmd.pos[1] == 2)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + 8, DCT_stride, ipu_cmd.pos[1] == 2)) { ipu_cmd.pos[1] = 2; return false; } case 3: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3)) { ipu_cmd.pos[1] = 3; return false; } case 4: - if (!slice_intra_DCT(0, (u8*)decoder.mb8->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4)) + if (!slice_intra_DCT(0, (u8*)mb8.Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4)) { ipu_cmd.pos[1] = 4; return false; } case 5: - if (!slice_intra_DCT(1, (u8*)decoder.mb8->Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) + if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) { ipu_cmd.pos[1] = 5; return false; } case 6: - if (!slice_intra_DCT(2, (u8*)decoder.mb8->Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) + if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) { ipu_cmd.pos[1] = 6; return false; @@ -1010,7 +1022,7 @@ bool mpeg2_slice() break; } - ipu_copy(decoder.mb8, decoder.mb16); + ipu_copy(mb8, mb16); } else { @@ -1023,7 +1035,7 @@ bool mpeg2_slice() case 1: if (decoder.coded_block_pattern & 0x20) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Y, DCT_stride, ipu_cmd.pos[1] == 1)) + if (!slice_non_intra_DCT((s16*)mb16.Y, DCT_stride, ipu_cmd.pos[1] == 1)) { ipu_cmd.pos[1] = 1; return false; @@ -1032,7 +1044,7 @@ bool mpeg2_slice() case 2: if (decoder.coded_block_pattern & 0x10) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + 8, DCT_stride, ipu_cmd.pos[1] == 2)) + if (!slice_non_intra_DCT((s16*)mb16.Y + 8, DCT_stride, ipu_cmd.pos[1] == 2)) { ipu_cmd.pos[1] = 2; return false; @@ -1041,7 +1053,7 @@ bool mpeg2_slice() case 3: if (decoder.coded_block_pattern & 0x08) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3)) + if (!slice_non_intra_DCT((s16*)mb16.Y + DCT_offset, DCT_stride, ipu_cmd.pos[1] == 3)) { ipu_cmd.pos[1] = 3; return false; @@ -1050,7 +1062,7 @@ bool mpeg2_slice() case 4: if (decoder.coded_block_pattern & 0x04) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4)) + if (!slice_non_intra_DCT((s16*)mb16.Y + DCT_offset + 8, DCT_stride, ipu_cmd.pos[1] == 4)) { ipu_cmd.pos[1] = 4; return false; @@ -1059,7 +1071,7 @@ bool mpeg2_slice() case 5: if (decoder.coded_block_pattern & 0x2) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) + if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) { ipu_cmd.pos[1] = 5; return false; @@ -1068,7 +1080,7 @@ bool mpeg2_slice() case 6: if (decoder.coded_block_pattern & 0x1) { - if (!slice_non_intra_DCT((s16*)decoder.mb16->Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) + if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) { ipu_cmd.pos[1] = 6; return false; @@ -1098,7 +1110,7 @@ bool mpeg2_slice() decoder.mbc = 1; g_nIPU0Data = 48; - g_pIPU0Pointer = (u8*)decoder.mb16; + g_pIPU0Pointer = (u8*)&mb16; case 3: while (g_nIPU0Data > 0) diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h index 2860e4f53b..9c26c1696c 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.h +++ b/pcsx2/IPU/mpeg2lib/Mpeg.h @@ -22,8 +22,50 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ -#ifndef __MPEG_H__ -#define __MPEG_H__ +#pragma once + +#include + +template< typename T > +__noinline void memzero_sse_a( T& dest ) +{ +#define MZFqwc (sizeof(dest)/16) + + C_ASSERT( (sizeof(dest) & 0xf) == 0 ); + + __m128 zeroreg = _mm_setzero_ps(); + + float (*destxmm)[4] = (float(*)[4])&dest; + +#define StoreDestIdx(idx) case idx: _mm_store_ps(&destxmm[idx][0], zeroreg) + + switch( MZFqwc & 0x07 ) + { + StoreDestIdx(0x07); + StoreDestIdx(0x06); + StoreDestIdx(0x05); + StoreDestIdx(0x04); + StoreDestIdx(0x03); + StoreDestIdx(0x02); + StoreDestIdx(0x01); + } + + destxmm += (MZFqwc & 0x07); + for( uint i=0; i