diff --git a/pcsx2/Dmac.h b/pcsx2/Dmac.h index 75c1496cc9..f3432c7013 100644 --- a/pcsx2/Dmac.h +++ b/pcsx2/Dmac.h @@ -161,8 +161,8 @@ union tDMA_SADR { union tDMA_MADR { struct { - u32 ADDR : 31; // Transfer memory address - u32 SPR : 1; // Memory/SPR Address + u32 ADDR : 31; // Transfer memory address + u32 SPR : 1; // Memory/SPR Address }; u32 _u32; @@ -175,8 +175,8 @@ union tDMA_MADR { union tDMA_TADR { struct { - u32 ADDR : 31; // Next Tag address - u32 SPR : 1; // Memory/SPR Address + u32 ADDR : 31; // Next Tag address + u32 SPR : 1; // Memory/SPR Address }; u32 _u32; @@ -190,8 +190,8 @@ union tDMA_TADR { // The Address Stack Register union tDMA_ASR { struct { - u32 ADDR : 31; // Tag memory address - u32 SPR : 1; // Memory/SPR Address + u32 ADDR : 31; // Tag memory address + u32 SPR : 1; // Memory/SPR Address }; u32 _u32; diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index 28122713c4..79ca234fc2 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -94,27 +94,7 @@ void __fastcall ReadFIFO_page_6(u32 mem, u64 *out) out[1] = psHu64(GIF_FIFO + 8); } -void __fastcall ReadFIFO_page_7(u32 mem, u64 *out) -{ - pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) ); - - // All addresses in this page map to 0x7000 and 0x7010: - mem &= 0x10; - - if( mem == 0 ) // IPUout_FIFO - { - if( g_nIPU0Data > 0 ) - { - out[0] = *(u64*)(g_pIPU0Pointer); - out[1] = *(u64*)(g_pIPU0Pointer+8); - ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31; - g_nIPU0Data--; - g_pIPU0Pointer += 16; - } - } - else // IPUin_FIFO - ipu_fifo.out.readsingle((void*)out); -} +// ReadFIFO_page_7 is contained in IPU_Fifo.cpp ////////////////////////////////////////////////////////////////////////// // WriteFIFO Pages diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp index 9160f98ec9..fdf9bc6779 100644 --- a/pcsx2/IPU/IPU.cpp +++ b/pcsx2/IPU/IPU.cpp @@ -43,12 +43,6 @@ tIPU_DMA g_nDMATransfer(0); tIPU_cmd ipu_cmd; IPUStatus IPU1Status; -// FIXME - g_nIPU0Data and Pointer are not saved in the savestate, which 
breaks savestates for some -// FMVs at random (if they get saved during the half frame of a 30fps rate). The fix is complicated -// since coroutine is such a pita. (air) -int g_nIPU0Data = 0; // data left to transfer -u8* g_pIPU0Pointer = NULL; - void ReorderBitstream(); // the BP doesn't advance and returns -1 if there is no data to be read @@ -59,72 +53,36 @@ void IPUWorker(); // Color conversion stuff, the memory layout is a total hack // convert_data_buffer is a pointer to the internal rgb struct (the first param in convert_init_t) //char convert_data_buffer[sizeof(convert_rgb_t)]; -char convert_data_buffer[0x1C]; +//char convert_data_buffer[0x1C]; // unused? +//u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'}; // unused? // Quantization matrix -// Pointers outside of IPU.cpp point to niq & iq. As such, all hell breaks loose under gcc if you make them static. -u8 niq[64]; //non-intraquant matrix -u8 iq[64]; //intraquant matrix -u16 vqclut[16]; //clut conversion table -static u8 s_thresh[2]; //thresholds for color conversions +static u16 vqclut[16]; //clut conversion table +static u8 s_thresh[2]; //thresholds for color conversions int coded_block_pattern = 0; -__aligned16 macroblock_8 mb8; -__aligned16 macroblock_16 mb16; -__aligned16 macroblock_rgb32 rgb32; -__aligned16 macroblock_rgb16 rgb16; u8 indx4[16*16/2]; -bool mpeg2_inited = false; //mpeg2_idct_init() must be called only once -u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'}; -__aligned16 decoder_t decoder; //static, only to place it in bss - -extern "C" -{ - extern u8 mpeg2_scan_norm[64]; - extern u8 mpeg2_scan_alt[64]; -} +__aligned16 decoder_t decoder; __aligned16 u8 _readbits[80]; //local buffer (ring buffer) -u8* readbits = _readbits; // always can decrement by one 1qw +u8* readbits = _readbits; // always can decrement by one 1qw __forceinline void IPUProcessInterrupt() { if (ipuRegs->ctrl.BUSY && g_BP.IFC) IPUWorker(); } -void init_g_decoder() -{ - //other stuff - 
decoder.intra_quantizer_matrix = (u8*)iq; - decoder.non_intra_quantizer_matrix = (u8*)niq; - decoder.picture_structure = FRAME_PICTURE; //default: progressive...my guess:P - decoder.stride = 16; -} - -void mpeg2_init() -{ - if (!mpeg2_inited) - { - mpeg2_idct_init(); - yuv2rgb_init(); - memzero(mb8.Y); - memzero(mb8.Cb); - memzero(mb8.Cr); - memzero(mb16.Y); - memzero(mb16.Cb); - memzero(mb16.Cr); - mpeg2_inited = true; - } -} - ///////////////////////////////////////////////////////// // Register accesses (run on EE thread) int ipuInit() { memzero(*ipuRegs); memzero(g_BP); - init_g_decoder(); + memzero(decoder); + + decoder.picture_structure = FRAME_PICTURE; //default: progressive...my guess:P + g_nDMATransfer.reset(); IPU1Status.InProgress = false; IPU1Status.DMAMode = DMA_MODE_NORMAL; @@ -149,18 +107,16 @@ void ReportIPU() Console.WriteLn(ipu_fifo.in.desc()); Console.WriteLn(ipu_fifo.out.desc()); Console.WriteLn(g_BP.desc()); - Console.WriteLn("niq = 0x%x, iq = 0x%x.", niq, iq); Console.WriteLn("vqclut = 0x%x.", vqclut); Console.WriteLn("s_thresh = 0x%x.", s_thresh); Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern); - Console.WriteLn("g_decoder = 0x%x.", decoder); - Console.WriteLn("mpeg2: scan_norm = 0x%x, alt = 0x%x.", mpeg2_scan_norm, mpeg2_scan_alt); + Console.WriteLn("g_decoder = 0x%x.", &decoder); + Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan); Console.WriteLn(ipu_cmd.desc()); Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.", _readbits, readbits - _readbits); Console.Newline(); } -// fixme - ipuFreeze looks fairly broken. Should probably take a closer look at some point. void SaveStateBase::ipuFreeze() { @@ -168,24 +124,15 @@ void SaveStateBase::ipuFreeze() //ReportIPU(); FreezeTag("IPU"); - // old versions saved the IPU regs, but they're already saved as part of HW! 
- //FreezeMem(ipuRegs, sizeof(IPUregisters)); - Freeze(g_nDMATransfer); Freeze(ipu_fifo); Freeze(g_BP); - Freeze(niq); - Freeze(iq); Freeze(vqclut); Freeze(s_thresh); Freeze(coded_block_pattern); Freeze(decoder); - Freeze(mpeg2_scan_norm); - Freeze(mpeg2_scan_alt); - Freeze(ipu_cmd); - Freeze(_readbits); int temp = readbits - _readbits; @@ -194,16 +141,9 @@ void SaveStateBase::ipuFreeze() if (IsLoading()) { readbits = _readbits; - init_g_decoder(); - mpeg2_init(); } } -bool ipuCanFreeze() -{ - return (ipu_cmd.current == -1); -} - __forceinline u32 ipuRead32(u32 mem) { // Note: It's assumed that mem's input value is always in the 0x10002000 page @@ -223,7 +163,7 @@ __forceinline u32 ipuRead32(u32 mem) if (!ipuRegs->ctrl.BUSY) IPU_LOG("Ipu read32: IPU_CTRL=0x%08X %x", ipuRegs->ctrl._u32, cpuRegs.pc); - return ipuRegs->ctrl._u32; + return ipuRegs->ctrl._u32; ipucase(IPU_BP): // IPU_BP ipuRegs->ipubp = g_BP.BP & 0x7f; @@ -231,7 +171,8 @@ __forceinline u32 ipuRead32(u32 mem) ipuRegs->ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16; IPU_LOG("Ipu read32: IPU_BP=0x%08X", ipuRegs->ipubp); - return ipuRegs->ipubp; + return ipuRegs->ipubp; + default: IPU_LOG("Ipu read32: Addr=0x%x Value = 0x%08X", mem, *(u32*)(((u8*)ipuRegs) + mem)); } @@ -277,7 +218,6 @@ __forceinline u64 ipuRead64(u32 mem) void ipuSoftReset() { - mpeg2_init(); ipu_fifo.clear(); coded_block_pattern = 0; @@ -381,16 +321,16 @@ static BOOL ipuIDEC(u32 val, bool resume) g_BP.BP += idec.FB;//skip FB bits //from IPU_CTRL ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;) - decoder.coding_type = ipuRegs->ctrl.PCT; - decoder.mpeg1 = ipuRegs->ctrl.MP1; - decoder.q_scale_type = ipuRegs->ctrl.QST; - decoder.intra_vlc_format = ipuRegs->ctrl.IVF; - decoder.scan = ipuRegs->ctrl.AS ? 
mpeg2_scan_alt : mpeg2_scan_norm; - decoder.intra_dc_precision = ipuRegs->ctrl.IDP; + decoder.coding_type = ipuRegs->ctrl.PCT; + decoder.mpeg1 = ipuRegs->ctrl.MP1; + decoder.q_scale_type = ipuRegs->ctrl.QST; + decoder.intra_vlc_format = ipuRegs->ctrl.IVF; + decoder.scantype = ipuRegs->ctrl.AS; + decoder.intra_dc_precision = ipuRegs->ctrl.IDP; //from IDEC value - decoder.quantizer_scale = idec.QSC; - decoder.frame_pred_frame_dct = !idec.DTD; + decoder.quantizer_scale = idec.QSC; + decoder.frame_pred_frame_dct= !idec.DTD; decoder.sgn = idec.SGN; decoder.dte = idec.DTE; decoder.ofm = idec.OFM; @@ -414,21 +354,21 @@ static __forceinline BOOL ipuBDEC(u32 val, bool resume) if (IsDebugBuild) s_bdec++; g_BP.BP += bdec.FB;//skip FB bits - decoder.coding_type = I_TYPE; - decoder.mpeg1 = ipuRegs->ctrl.MP1; - decoder.q_scale_type = ipuRegs->ctrl.QST; - decoder.intra_vlc_format = ipuRegs->ctrl.IVF; - decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm; - decoder.intra_dc_precision = ipuRegs->ctrl.IDP; + decoder.coding_type = I_TYPE; + decoder.mpeg1 = ipuRegs->ctrl.MP1; + decoder.q_scale_type = ipuRegs->ctrl.QST; + decoder.intra_vlc_format = ipuRegs->ctrl.IVF; + decoder.scantype = ipuRegs->ctrl.AS; + decoder.intra_dc_precision = ipuRegs->ctrl.IDP; //from BDEC value - decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1; - decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0; - decoder.dcr = bdec.DCR; - decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN; + decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1; + decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0; + decoder.dcr = bdec.DCR; + decoder.macroblock_modes |= bdec.MBI ? 
MACROBLOCK_INTRA : MACROBLOCK_PATTERN; - memzero_sse_a(mb8); - memzero_sse_a(mb16); + memzero_sse_a(decoder.mb8); + memzero_sse_a(decoder.mb16); } return mpeg2_slice(); @@ -516,6 +456,8 @@ static BOOL ipuSETIQ(u32 val) if ((val >> 27) & 1) { + u8 (&niq)[64] = decoder.niq; + for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++) { if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE; @@ -531,6 +473,8 @@ static BOOL ipuSETIQ(u32 val) } else { + u8 (&iq)[64] = decoder.iq; + for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++) { if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE; @@ -552,7 +496,7 @@ static BOOL ipuSETVQ(u32 val) { for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++) { - if (!getBits64((u8*)vqclut + 8 * ipu_cmd.pos[0], 1)) return FALSE; + if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return FALSE; } IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO."); @@ -591,17 +535,17 @@ static BOOL __fastcall ipuCSC(u32 val) { for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++) { - if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; + if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; } - ipu_csc(mb8, rgb32, 0); - if (csc.OFM) ipu_dither(rgb32, rgb16, csc.DTE); + ipu_csc(decoder.mb8, decoder.rgb32, 0); + if (csc.OFM) ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE); if (csc.OFM) { while (ipu_cmd.pos[1] < 32) { - ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]); + ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]); if (ipu_cmd.pos[1] <= 0) return FALSE; } @@ -610,7 +554,7 @@ static BOOL __fastcall ipuCSC(u32 val) { while (ipu_cmd.pos[1] < 64) { - ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]); + ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]); if (ipu_cmd.pos[1] <= 0) return FALSE; } @@ -633,17 +577,17 @@ static BOOL 
ipuPACK(u32 val) { for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++) { - if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; + if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE; } - ipu_csc(mb8, rgb32, 0); - ipu_dither(rgb32, rgb16, csc.DTE); + ipu_csc(decoder.mb8, decoder.rgb32, 0); + ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE); - if (csc.OFM) ipu_vq(rgb16, indx4); + if (csc.OFM) ipu_vq(decoder.rgb16, indx4); if (csc.OFM) { - ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]); + ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]); if (ipu_cmd.pos[1] < 32) return FALSE; } diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h index cc3be3772f..a8f9ed30bd 100644 --- a/pcsx2/IPU/IPU.h +++ b/pcsx2/IPU/IPU.h @@ -342,8 +342,6 @@ struct tIPU_cmd extern tIPU_cmd ipu_cmd; extern int coded_block_pattern; -extern int g_nIPU0Data; // or 0x80000000 whenever transferring -extern u8* g_pIPU0Pointer; extern IPUStatus IPU1Status; extern tIPU_DMA g_nDMATransfer; diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp index b6a3b08127..e1467724e2 100644 --- a/pcsx2/IPU/IPU_Fifo.cpp +++ b/pcsx2/IPU/IPU_Fifo.cpp @@ -19,7 +19,7 @@ #include "mpeg2lib/Mpeg.h" -IPU_Fifo ipu_fifo; +__aligned16 IPU_Fifo ipu_fifo; void IPU_Fifo::init() { @@ -167,3 +167,32 @@ void IPU_Fifo_Output::readsingle(void *value) _readsingle(value); } } + +__forceinline bool decoder_t::ReadIpuData(u128* out) +{ + if(decoder.ipu0_data == 0) return false; + _mm_store_ps((float*)out, _mm_load_ps((float*)GetIpuDataPtr())); + + --ipu0_data; + ++ipu0_idx; + + return true; +} + +void __fastcall ReadFIFO_page_7(u32 mem, u64 *out) +{ + pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) ); + + // All addresses in this page map to 0x7000 and 0x7010: + mem &= 0x10; + + if (mem == 0) // IPUout_FIFO + { + if (decoder.ReadIpuData((u128*)out)) + { + ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31; + 
} + } + else // IPUin_FIFO + ipu_fifo.out.readsingle((void*)out); +} diff --git a/pcsx2/IPU/IPU_Fifo.h b/pcsx2/IPU/IPU_Fifo.h index 6ea658cfd7..10a1e940d3 100644 --- a/pcsx2/IPU/IPU_Fifo.h +++ b/pcsx2/IPU/IPU_Fifo.h @@ -16,12 +16,14 @@ #ifndef IPU_FIFO_H_INCLUDED #define IPU_FIFO_H_INCLUDED -class IPU_Fifo_Input -{ - public: +// Important! All FIFO containers in this header should be 'struct' type, not class type. +// They are saved into the savestate as-is, and keeping them as struct ensures that the +// layout of their contents is reliable. - int readpos, writepos; +struct IPU_Fifo_Input +{ __aligned16 u32 data[32]; + int readpos, writepos; int write(u32* pMem, int size); int read(void *value); @@ -29,12 +31,10 @@ class IPU_Fifo_Input wxString desc() const; }; -class IPU_Fifo_Output +struct IPU_Fifo_Output { - public: - - int readpos, writepos; __aligned16 u32 data[32]; + int readpos, writepos; // returns number of qw read int write(const u32 * value, int size); @@ -42,20 +42,19 @@ class IPU_Fifo_Output void readsingle(void *value); void clear(); wxString desc() const; - private: + void _readsingle(void *value); }; -class IPU_Fifo +struct IPU_Fifo { - public: - IPU_Fifo_Input in; - IPU_Fifo_Output out; + __aligned16 IPU_Fifo_Input in; + __aligned16 IPU_Fifo_Output out; void init(); void clear(); }; -extern IPU_Fifo ipu_fifo; +extern __aligned16 IPU_Fifo ipu_fifo; #endif // IPU_FIFO_H_INCLUDED diff --git a/pcsx2/IPU/mpeg2lib/Idct.cpp b/pcsx2/IPU/mpeg2lib/Idct.cpp index cb2012adc1..69daaa89b6 100644 --- a/pcsx2/IPU/mpeg2lib/Idct.cpp +++ b/pcsx2/IPU/mpeg2lib/Idct.cpp @@ -22,10 +22,15 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */ +// [TODO] : There are modern SSE versions of idct (idct_mmx.c) in the mpeg2 libs that we +// should probably upgrade to. They use their own raw-style intrinsics and not the intel +// compiler-integrated ones. 
+ #include "PrecompiledHeader.h" #include "Common.h" #include "IPU/IPU.h" +#include "Mpeg.h" #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ @@ -36,19 +41,14 @@ #define clp(val,res) res = (val < 0) ? 0 : ((val > 255) ? 255 : val); #define clp2(val,res) res = (val < -255) ? -255 : ((val > 255) ? 255 : val); -/* idct main entry point */ -void (__fastcall *mpeg2_idct_copy) (s16 * block, u8 * dest, int stride); -/* JayteeMaster: changed dest to 16 bit signed */ -void (__fastcall *mpeg2_idct_add) (int last, s16 * block, - /*u8*/s16 * dest, int stride); - /* * In legal streams, the IDCT output should be between -384 and +384. * In corrupted streams, it is possible to force the IDCT output to go * to +-3826 - this is the worst case for a column IDCT where the * column inputs are 16-bit values. */ -static u8 clip_lut[1024]; +static __aligned16 u8 clip_lut[1024]; + #define CLIP(i) ((clip_lut+384)[(i)]) #if 0 @@ -75,13 +75,13 @@ static __forceinline void idct_row (s16 * const block) /* shortcut */ if (!(block[1] | ((s32 *)block)[1] | ((s32 *)block)[2] | ((s32 *)block)[3])) { - u32 tmp = (u16) (block[0] << 3); - tmp |= tmp << 16; - ((s32 *)block)[0] = tmp; - ((s32 *)block)[1] = tmp; - ((s32 *)block)[2] = tmp; - ((s32 *)block)[3] = tmp; - return; + u32 tmp = (u16) (block[0] << 3); + tmp |= tmp << 16; + ((s32 *)block)[0] = tmp; + ((s32 *)block)[1] = tmp; + ((s32 *)block)[2] = tmp; + ((s32 *)block)[3] = tmp; + return; } d0 = (block[0] << 11) + 128; @@ -160,122 +160,97 @@ static __forceinline void idct_col (s16 * const block) block[8*7] = (a0 - b0) >> 17; } -static void __fastcall mpeg2_idct_copy_c (s16 * block, u8 * dest, - const int stride) +__releaseinline void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride) { int i; for (i = 0; i < 8; i++) - idct_row (block + 8 * i); + idct_row (block + 8 * i); for (i = 0; i < 8; i++) - idct_col (block + i); + idct_col (block + i); + + __m128 zero = _mm_setzero_ps(); do { - dest[0] 
= CLIP (block[0]); - dest[1] = CLIP (block[1]); - dest[2] = CLIP (block[2]); - dest[3] = CLIP (block[3]); - dest[4] = CLIP (block[4]); - dest[5] = CLIP (block[5]); - dest[6] = CLIP (block[6]); - dest[7] = CLIP (block[7]); + dest[0] = CLIP (block[0]); + dest[1] = CLIP (block[1]); + dest[2] = CLIP (block[2]); + dest[3] = CLIP (block[3]); + dest[4] = CLIP (block[4]); + dest[5] = CLIP (block[5]); + dest[6] = CLIP (block[6]); + dest[7] = CLIP (block[7]); - block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0; - block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0; + _mm_store_ps((float*)block, zero); - dest += stride; - block += 8; + dest += stride; + block += 8; } while (--i); } -/* JayteeMaster: changed dest to 16 bit signed */ -static void __fastcall mpeg2_idct_add_c (const int last, s16 * block, - /*u8*/s16 * dest, const int stride) + +// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]). +__releaseinline void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride) { - int i; + // on the IPU, stride is always assured to be multiples of QWC (bottom 3 bits are 0). 
- if (last != 129 || (block[0] & 7) == 4) { - for (i = 0; i < 8; i++) - idct_row (block + 8 * i); - for (i = 0; i < 8; i++) - idct_col (block + i); - do { - dest[0] = block[0]; - dest[1] = block[1]; - dest[2] = block[2]; - dest[3] = block[3]; - dest[4] = block[4]; - dest[5] = block[5]; - dest[6] = block[6]; - dest[7] = block[7]; + if (last != 129 || (block[0] & 7) == 4) + { + int i; + for (i = 0; i < 8; i++) + idct_row (block + 8 * i); + for (i = 0; i < 8; i++) + idct_col (block + i); - block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0; - block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0; + __m128 zero = _mm_setzero_ps(); + do { + _mm_store_ps((float*)dest, _mm_load_ps((float*)block)); + _mm_store_ps((float*)block, zero); - dest += stride; - block += 8; - } while (--i); - } else { - int DC; + dest += stride; + block += 8; + } while (--i); - DC = (block[0] + 4) >> 3; - block[0] = block[63] = 0; - i = 8; - do { - dest[0] = DC; - dest[1] = DC; - dest[2] = DC; - dest[3] = DC; - dest[4] = DC; - dest[5] = DC; - dest[6] = DC; - dest[7] = DC; - dest += stride; - } while (--i); + } + else + { + int DC = (block[0] + 4) >> 3; + s16 dcf[2] = { DC, DC }; + block[0] = block[63] = 0; + + __m128 dc128 = _mm_set_ps1(*(float*)dcf); + + for(int i=0; i<8; ++i) + _mm_store_ps((float*)(dest+(stride*i)), dc128); } } -extern "C" +mpeg2_scan_pack::mpeg2_scan_pack() { -u8 mpeg2_scan_norm[64] = { - /* Zig-Zag scan pattern */ - 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, - 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, - 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, - 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 -}; + static const u8 mpeg2_scan_norm[64] = { + /* Zig-Zag scan pattern */ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 
47, 55, 62, 63 + }; -u8 mpeg2_scan_alt[64] = { - /* Alternate scan pattern */ - 0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49, - 41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43, - 51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45, - 53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63 -}; -}; + static const u8 mpeg2_scan_alt[64] = { + /* Alternate scan pattern */ + 0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49, + 41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43, + 51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45, + 53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63 + }; -// The MMX verson wasn't being used and it was only available as a .obj, -// so I removed it (gigaherz). -///* idct_mmx.c */ -//void mpeg2_idct_copy_mmxext (s16 * block, u8 * dest, int stride); -//void mpeg2_idct_add_mmxext (int last, s16 * block, -// s16 * dest, int stride); -//void mpeg2_idct_copy_mmx (s16 * block, u8 * dest, int stride); -//void mpeg2_idct_add_mmx (int last, s16 * block, -// s16 * dest, int stride); -//void mpeg2_idct_mmx_init (void); - -void mpeg2_idct_init() -{ - int i, j; - - mpeg2_idct_copy = mpeg2_idct_copy_c; - mpeg2_idct_add = mpeg2_idct_add_c; - for (i = -384; i < 640; i++) + for (int i = -384; i < 640; i++) clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 
255 : i); - for (i = 0; i < 64; i++) { - j = mpeg2_scan_norm[i]; - mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); + + for (int i = 0; i < 64; i++) { + int j = mpeg2_scan_norm[i]; + norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); j = mpeg2_scan_alt[i]; - mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); + alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2); } } + +const __aligned16 mpeg2_scan_pack mpeg2_scan; diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp index bf9c68ccfb..9fbc48c155 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp +++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp @@ -33,7 +33,7 @@ #include "Mpeg.h" #include "Vlc.h" -int non_linear_quantizer_scale [] = +const int non_linear_quantizer_scale [] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, @@ -341,8 +341,8 @@ static __forceinline bool get_intra_block() int i; int j; int val; - const u8 * scan = decoder.scan; - const u8 * quant_matrix = decoder.intra_quantizer_matrix; + const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm; + const u8 (&quant_matrix)[64] = decoder.iq; int quantizer_scale = decoder.quantizer_scale; s16 * dest = decoder.DCTblock; u16 code; @@ -493,8 +493,8 @@ static __forceinline bool get_non_intra_block(int * last) int i; int j; int val; - const u8 * scan = decoder.scan; - const u8 * quant_matrix = decoder.non_intra_quantizer_matrix; + const u8 * scan = decoder.scantype ? 
mpeg2_scan.alt : mpeg2_scan.norm; + const u8 (&quant_matrix)[64] = decoder.niq; int quantizer_scale = decoder.quantizer_scale; s16 * dest = decoder.DCTblock; u16 code; @@ -699,7 +699,6 @@ void __forceinline finishmpeg2sliceIDEC() bool mpeg2sliceIDEC() { - u32 read; u16 code; u8 bit8; @@ -725,6 +724,10 @@ bool mpeg2sliceIDEC() ipu_cmd.pos[0] = 2; while (1) { + macroblock_8& mb8 = decoder.mb8; + macroblock_rgb16& rgb16 = decoder.rgb16; + macroblock_rgb32& rgb32 = decoder.rgb32; + int DCT_offset, DCT_stride; const MBAtab * mba; @@ -747,13 +750,13 @@ bool mpeg2sliceIDEC() if (decoder.macroblock_modes & DCT_TYPE_INTERLACED) { - DCT_offset = decoder.stride; - DCT_stride = decoder.stride * 2; + DCT_offset = decoder_stride; + DCT_stride = decoder_stride * 2; } else { - DCT_offset = decoder.stride * 8; - DCT_stride = decoder.stride; + DCT_offset = decoder_stride * 8; + DCT_stride = decoder_stride; } switch (ipu_cmd.pos[2]) @@ -784,13 +787,13 @@ bool mpeg2sliceIDEC() return false; } case 5: - if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5)) + if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[2] == 5)) { ipu_cmd.pos[2] = 5; return false; } case 6: - if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6)) + if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[2] == 6)) { ipu_cmd.pos[2] = 6; return false; @@ -801,22 +804,17 @@ bool mpeg2sliceIDEC() ipu_csc(mb8, rgb32, decoder.sgn); if (decoder.ofm == 0) - { - g_nIPU0Data = 64; - g_pIPU0Pointer = (u8*)&rgb32; - } + decoder.SetOutputTo(rgb32); else { ipu_dither(rgb32, rgb16, decoder.dte); - - g_nIPU0Data = 32; - g_pIPU0Pointer = (u8*)&rgb16; + decoder.SetOutputTo(rgb16); } case 2: - while (g_nIPU0Data > 0) + while (decoder.ipu0_data > 0) { - read = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data); + uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); if (read == 0) { @@ -825,9 +823,7 @@ bool 
mpeg2sliceIDEC() } else { - g_pIPU0Pointer += read * 16; - g_nIPU0Data -= read; - + decoder.AdvanceIpuDataBy(read); } } @@ -932,7 +928,9 @@ bool mpeg2_slice() { int DCT_offset, DCT_stride; u8 bit8; - u32 size; + + macroblock_8& mb8 = decoder.mb8; + macroblock_16& mb16 = decoder.mb16; switch (ipu_cmd.pos[0]) { @@ -960,13 +958,13 @@ bool mpeg2_slice() if (decoder.macroblock_modes & DCT_TYPE_INTERLACED) { - DCT_offset = decoder.stride; - DCT_stride = decoder.stride * 2; + DCT_offset = decoder_stride; + DCT_stride = decoder_stride * 2; } else { - DCT_offset = decoder.stride * 8; - DCT_stride = decoder.stride; + DCT_offset = decoder_stride * 8; + DCT_stride = decoder_stride; } if (decoder.macroblock_modes & MACROBLOCK_INTRA) @@ -1000,13 +998,13 @@ bool mpeg2_slice() return false; } case 5: - if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) + if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5)) { ipu_cmd.pos[1] = 5; return false; } case 6: - if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) + if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6)) { ipu_cmd.pos[1] = 6; return false; @@ -1063,7 +1061,7 @@ bool mpeg2_slice() case 5: if (decoder.coded_block_pattern & 0x2) { - if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5)) + if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5)) { ipu_cmd.pos[1] = 5; return false; @@ -1072,7 +1070,7 @@ bool mpeg2_slice() case 6: if (decoder.coded_block_pattern & 0x1) { - if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6)) + if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6)) { ipu_cmd.pos[1] = 6; return false; @@ -1083,8 +1081,7 @@ bool mpeg2_slice() } } - //Send The MacroBlock via DmaIpuFrom - size = 0; // Reset + // Send The MacroBlock via DmaIpuFrom ipuRegs->ctrl.SCD = 0; coded_block_pattern = 
decoder.coded_block_pattern; g_BP.BP += (int)decoder.bitstream_bits - 16; @@ -1101,13 +1098,12 @@ bool mpeg2_slice() } decoder.mbc = 1; - g_nIPU0Data = 48; - g_pIPU0Pointer = (u8*)&mb16; + decoder.SetOutputTo(mb16); case 3: - while (g_nIPU0Data > 0) + while (decoder.ipu0_data > 0) { - size = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data); + uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); if (size == 0) { @@ -1116,8 +1112,7 @@ bool mpeg2_slice() } else { - g_pIPU0Pointer += size * 16; - g_nIPU0Data -= size; + decoder.AdvanceIpuDataBy(size); } } diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h index 9c26c1696c..13431eb396 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.h +++ b/pcsx2/IPU/mpeg2lib/Mpeg.h @@ -66,6 +66,8 @@ __noinline void memzero_sse_a( T& dest ) #undef MZFqwc }; +// the IPU is fixed to 16 byte strides (128-bit / QWC resolution): +static const uint decoder_stride = 16; enum macroblock_modes { @@ -106,25 +108,25 @@ enum picture_coding_type }; struct macroblock_8{ - unsigned char Y[16][16]; //0 - unsigned char Cb[8][8]; //1 - unsigned char Cr[8][8]; //2 + u8 Y[16][16]; //0 + u8 Cb[8][8]; //1 + u8 Cr[8][8]; //2 }; struct macroblock_16{ - short Y[16][16]; //0 - short Cb[8][8]; //1 - short Cr[8][8]; //2 + s16 Y[16][16]; //0 + s16 Cb[8][8]; //1 + s16 Cr[8][8]; //2 }; struct macroblock_rgb32{ struct { - unsigned char r, g, b, a; + u8 r, g, b, a; } c[16][16]; }; struct rgb16_t{ - unsigned short r:5, g:5, b:5, a:1; + u16 r:5, g:5, b:5, a:1; }; struct macroblock_rgb16{ @@ -138,24 +140,26 @@ struct decoder_t { /* DCT coefficients - should be kept aligned ! 
*/ s16 DCTblock[64]; + u8 niq[64]; //non-intraquant matrix (sequence header) + u8 iq[64]; //intraquant matrix (sequence header) + + macroblock_8 mb8; + macroblock_16 mb16; + macroblock_rgb32 rgb32; + macroblock_rgb16 rgb16; + + uint ipu0_data; + uint ipu0_idx; + /* bit parsing stuff */ u32 bitstream_buf; /* current 32 bit working set */ int bitstream_bits; /* used bits in working set */ - int stride; - - /* predictor for DC coefficients in intra blocks */ - s16 dc_dct_pred[3]; - int quantizer_scale; /* remove */ int dmv_offset; /* remove */ /* now non-slice-specific information */ - /* sequence header stuff */ - u8 *intra_quantizer_matrix; - u8 *non_intra_quantizer_matrix; - /* picture header stuff */ /* what type of picture this is (I, P, B, D) */ @@ -163,6 +167,9 @@ struct decoder_t { /* picture coding extension stuff */ + /* predictor for DC coefficients in intra blocks */ + s16 dc_dct_pred[3]; + /* quantization factor for intra dc coefficients */ int intra_dc_precision; /* top/bottom/both fields */ @@ -195,16 +202,47 @@ struct decoder_t { /* stuff derived from bitstream */ - /* pointer to the zigzag scan we're supposed to be using */ - const u8 * scan; + /* the zigzag scan we're supposed to be using, true for alt, false for normal */ + bool scantype; int second_field; int mpeg1; + + template< typename T > + void SetOutputTo( T& obj ) + { + uint mb_offset = ((uptr)&obj - (uptr)&mb8); + pxAssume( (mb_offset & 15) == 0 ); + ipu0_idx = mb_offset / 16; + ipu0_data = sizeof(obj)/16; + } + + u128* GetIpuDataPtr() + { + return ((u128*)&mb8) + ipu0_idx; + } + + void AdvanceIpuDataBy(uint amt) + { + pxAssumeDev(ipu0_data>=amt, "IPU FIFO Overflow on advance!" 
); + ipu0_idx += amt; + ipu0_data -= amt; + } + + bool ReadIpuData(u128* out); }; -extern void (__fastcall *mpeg2_idct_copy) (s16 * block, u8* dest, int stride); -extern void (__fastcall *mpeg2_idct_add) (int last, s16 * block, s16* dest, int stride); +struct mpeg2_scan_pack +{ + u8 norm[64]; + u8 alt[64]; + + mpeg2_scan_pack(); +}; + +extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride); +extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride); #define IDEC 0 #define BDEC 1 @@ -217,16 +255,12 @@ extern int get_macroblock_modes(); extern int get_motion_delta(const int f_code); extern int get_dmv(); -extern int non_linear_quantizer_scale[]; - extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn); extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte); extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4); extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16); extern int slice (u8 * buffer); -/* idct.c */ -extern void mpeg2_idct_init (); #ifdef _MSC_VER #define BigEndian(out, in) out = _byteswap_ulong(in) @@ -240,13 +274,12 @@ extern void mpeg2_idct_init (); #define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap... 
#endif +extern __aligned16 const mpeg2_scan_pack mpeg2_scan; +extern const int non_linear_quantizer_scale[]; + // The IPU can only do one task at once and never uses other buffers so all mpeg state variables // are made available to mpeg/vlc modules as globals here: extern __aligned16 tIPU_BP g_BP; extern __aligned16 decoder_t decoder; -extern __aligned16 macroblock_8 mb8; -extern __aligned16 macroblock_16 mb16; -extern __aligned16 macroblock_rgb32 rgb32; -extern __aligned16 macroblock_rgb16 rgb16; diff --git a/pcsx2/IPU/yuv2rgb.cpp b/pcsx2/IPU/yuv2rgb.cpp index 4c86e436c1..6b0dcb216d 100644 --- a/pcsx2/IPU/yuv2rgb.cpp +++ b/pcsx2/IPU/yuv2rgb.cpp @@ -39,6 +39,9 @@ // conforming implementation for reference, do not optimise void yuv2rgb_reference(void) { + const macroblock_8& mb8 = decoder.mb8; + macroblock_rgb32& rgb32 = decoder.rgb32; + for (int y = 0; y < 16; y++) for (int x = 0; x < 16; x++) { @@ -124,8 +127,8 @@ __releaseinline void yuv2rgb_sse2(void) align 16 tworows: - movq xmm3, qword ptr [mb8+256+esi] - movq xmm1, qword ptr [mb8+320+esi] + movq xmm3, qword ptr [decoder.mb8+256+esi] + movq xmm1, qword ptr [decoder.mb8+320+esi] pxor xmm2, xmm2 pxor xmm0, xmm0 // could skip the movq but punpck requires 128-bit alignment @@ -170,7 +173,7 @@ ihatemsvc: movaps xmm4, xmm1 movaps xmm5, xmm2 - movaps xmm6, xmmword ptr [mb8+edi] + movaps xmm6, xmmword ptr [decoder.mb8+edi] psubusb xmm6, xmmword ptr [edx+Y_BIAS] movaps xmm7, xmm6 psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14 @@ -235,10 +238,10 @@ ihatemsvc: punpckhwd xmm4, xmm5 // at last - movaps xmmword ptr [rgb32+edi*4+0], xmm0 - movaps xmmword ptr [rgb32+edi*4+16], xmm1 - movaps xmmword ptr [rgb32+edi*4+32], xmm3 - movaps xmmword ptr [rgb32+edi*4+48], xmm4 + movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0 + movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1 + movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3 + movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4 add edi, 16 @@ -255,6 +258,8 @@ 
ihatemsvc: // offset to the middle of the sse2 table, so that we can use 1-byte address displacement // to access all fields: static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64; + static const macroblock_8* mb8 = &decoder.mb8; /* base address of the source macroblock, passed to the asm as a register operand */ + static macroblock_rgb32* rgb32 = &decoder.rgb32; /* base address of the RGB32 output block */ __asm__ __volatile__ ( ".intel_syntax noprefix\n" @@ -262,15 +267,10 @@ ihatemsvc: "xor esi, esi\n" "xor edi, edi\n" - // Use ecx and edx as base pointers, to allow for Mod/RM form on memOps. - // This saves 2-3 bytes per instruction where these are used. :) - //"mov ecx, offset %c[yuv2rgb_temp]\n" - //"mov edx, offset %c[sse2_tables]+64\n" - ".align 16\n" "tworows:\n" - "movq xmm3, qword ptr [mb8+256+esi]\n" - "movq xmm1, qword ptr [mb8+320+esi]\n" + "movq xmm3, qword ptr [%[mb8]+256+esi]\n" + "movq xmm1, qword ptr [%[mb8]+320+esi]\n" "pxor xmm2, xmm2\n" "pxor xmm0, xmm0\n" // could skip the movq but punpck requires 128-bit alignment @@ -310,7 +310,7 @@ ihatemsvc: "movaps xmm4, xmm1\n" "movaps xmm5, xmm2\n" - "movaps xmm6, xmmword ptr [mb8+edi]\n" + "movaps xmm6, xmmword ptr [%[mb8]+edi]\n" "psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n" "movaps xmm7, xmm6\n" "psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14 @@ -375,10 +375,10 @@ ihatemsvc: "punpckhwd xmm4, xmm5\n" // at last - "movaps xmmword ptr [rgb32+edi*4+0], xmm0\n" - "movaps xmmword ptr [rgb32+edi*4+16], xmm1\n" - "movaps xmmword ptr [rgb32+edi*4+32], xmm3\n" - "movaps xmmword ptr [rgb32+edi*4+48], xmm4\n" + "movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n" + "movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n" + "movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n" + "movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n" "add edi, 16\n" @@ -393,15 +393,11 @@ ihatemsvc: :[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK), [ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF), [GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF), - 
[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset) - : "eax", "ebx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" + [yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset), + [mb8]"r"(mb8), [rgb32]"r"(rgb32) + : "eax", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); #else # error Unsupported compiler #endif } - -void yuv2rgb_init(void) -{ - /* For later reimplementation of C version */ -} diff --git a/pcsx2/IPU/yuv2rgb.h b/pcsx2/IPU/yuv2rgb.h index 37d1049028..8d1efb2e14 100644 --- a/pcsx2/IPU/yuv2rgb.h +++ b/pcsx2/IPU/yuv2rgb.h @@ -17,6 +17,5 @@ #define yuv2rgb yuv2rgb_sse2 -extern void yuv2rgb_reference(void); -extern void yuv2rgb_sse2(void); -extern void yuv2rgb_init(void); +extern void yuv2rgb_reference(); +extern void yuv2rgb_sse2(); diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h index 3ff9d18b43..5d2783a545 100644 --- a/pcsx2/SaveState.h +++ b/pcsx2/SaveState.h @@ -24,7 +24,7 @@ // the lower 16 bit value. IF the change is breaking of all compatibility with old // states, increment the upper 16 bit value, and clear the lower 16 bits to 0. -static const u32 g_SaveVersion = 0x8b470000; +static const u32 g_SaveVersion = 0x8b480000; // this function is meant to be used in the place of GSfreeze, and provides a safe layer // between the GS saving function and the MTGS's needs. :)