From 1c75440a6c4ea0414c7fdae00254495462b74e5d Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Wed, 15 Sep 2010 17:11:24 +0000 Subject: [PATCH] IPU optimizations -- use SSE for FIFO reads/writes, and streamlined IPUdma0 /IPUdma1 feeds a bit. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3771 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/IPU/IPU.cpp | 10 +-- pcsx2/IPU/IPU_Fifo.cpp | 148 ++++++++++++++------------------ pcsx2/IPU/IPU_Fifo.h | 7 +- pcsx2/IPU/IPUdma.cpp | 5 +- pcsx2/IPU/mpeg2lib/Mpeg.cpp | 164 +++++++++++++++++++----------------- pcsx2/IPU/mpeg2lib/Mpeg.h | 4 +- pcsx2/IPU/mpeg2lib/Vlc.h | 25 +++--- 7 files changed, 174 insertions(+), 189 deletions(-) diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp index 165e2a0b40..55f988dcc3 100644 --- a/pcsx2/IPU/IPU.cpp +++ b/pcsx2/IPU/IPU.cpp @@ -924,11 +924,11 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance) mask = mask | (mask << 8); *(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift); - } + } else { *(u16*)address = *(u16*)readpos; - } + } if (advance) g_BP.BP += 16; @@ -947,14 +947,14 @@ u8 __fastcall getBits8(u8 *address, u32 advance) readpos = readbits + (int)g_BP.BP / 8; if (uint shift = (g_BP.BP & 7)) - { + { mask = (0xff >> shift); *(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift); - } + } else { *(u8*)address = *(u8*)readpos; - } + } if (advance) g_BP.BP += 8; diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp index 2c2902cf6f..3a4af9192e 100644 --- a/pcsx2/IPU/IPU_Fifo.cpp +++ b/pcsx2/IPU/IPU_Fifo.cpp @@ -75,10 +75,7 @@ int IPU_Fifo_Input::write(u32* pMem, int size) while (transsize-- > 0) { - for (int i = 0; i <= 3; i++) - { - data[writepos + i] = pMem[i]; - } + CopyQWC(&data[writepos], pMem); writepos = (writepos + 4) & 31; pMem += 4; } @@ -86,118 +83,97 @@ int IPU_Fifo_Input::write(u32* pMem, int size) return firsttrans; } -int IPU_Fifo_Output::write(const u32 *value, int size) -{ - 
int transsize, firsttrans; - - if ((int)ipuRegs.ctrl.OFC >= 8) IPU0dma(); - - transsize = min(size, 8 - (int)ipuRegs.ctrl.OFC); - firsttrans = transsize; - - while (transsize-- > 0) - { - for (int i = 0; i <= 3; i++) - { - data[writepos + i] = ((u32*)value)[i]; - } - writepos = (writepos + 4) & 31; - value += 4; - } - - ipuRegs.ctrl.OFC += firsttrans; - IPU0dma(); - - return firsttrans; -} - int IPU_Fifo_Input::read(void *value) { // wait until enough data to ensure proper streaming. - if (g_BP.IFC < 4) + if (g_BP.IFC < 1) { // IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO if(cpuRegs.eCycle[4] == 0x9999) - { - CPU_INT( DMAC_TO_IPU, 4 ); - } - + IPU1dma(); + if (g_BP.IFC == 0) return 0; - pxAssert(g_BP.IFC > 0); } - // transfer 1 qword, split into two transfers - for (int i = 0; i <= 3; i++) - { - ((u32*)value)[i] = data[readpos + i]; - data[readpos + i] = 0; - } + CopyQWC(value, &data[readpos]); readpos = (readpos + 4) & 31; g_BP.IFC--; return 1; } -void IPU_Fifo_Output::_readsingle(void *value) +int IPU_Fifo_Output::write(const u32 *value, uint size) { - // transfer 1 qword, split into two transfers - for (int i = 0; i <= 3; i++) + pxAssumeMsg(size>0, "Invalid size==0 when calling IPU_Fifo_Output::write"); + + uint origsize = size; + do { + IPU0dma(); + + uint transsize = min(size, 8 - ipuRegs.ctrl.OFC); + if(!transsize) break; + + ipuRegs.ctrl.OFC = transsize; + size -= transsize; + while (transsize > 0) + { + CopyQWC(&data[writepos], value); + writepos = (writepos + 4) & 31; + value += 4; + --transsize; + } + } while(true); + + return origsize - size; + +#if 0 + if (ipuRegs.ctrl.OFC >= 8) IPU0dma(); + + uint transsize = min(size, 8 - ipuRegs.ctrl.OFC); + uint firsttrans = transsize; + + while (transsize > 0) { - ((u32*)value)[i] = data[readpos + i]; - data[readpos + i] = 0; + CopyQWC(&data[writepos], value); + writepos = (writepos + 4) & 31; + value += 4; + --transsize; } - readpos = (readpos + 4) & 31; + + 
ipuRegs.ctrl.OFC += firsttrans; + IPU0dma(); + + return firsttrans; +#endif } -void IPU_Fifo_Output::read(void *value, int size) +void IPU_Fifo_Output::read(void *value, uint size) { + pxAssume(ipuRegs.ctrl.OFC >= size); ipuRegs.ctrl.OFC -= size; + + // Zeroing the read data is not needed, since the ringbuffer design will never read back + // the zero'd data anyway. --air + + //__m128 zeroreg = _mm_setzero_ps(); while (size > 0) { - _readsingle(value); - value = (u32*)value + 4; - size--; + CopyQWC(value, &data[readpos]); + //_mm_store_ps((float*)&data[readpos], zeroreg); + + readpos = (readpos + 4) & 31; + value = (u128*)value + 1; + --size; } } -void IPU_Fifo_Output::readsingle(void *value) -{ - if (ipuRegs.ctrl.OFC > 0) - { - ipuRegs.ctrl.OFC--; - _readsingle(value); - } -} - -__fi bool decoder_t::ReadIpuData(u128* out) -{ - if(ipu0_data == 0) - { - IPU_LOG( "ReadFIFO/IPUout -> (fifo empty/no data available)" ); - return false; - } - - CopyQWC(out, GetIpuDataPtr()); - - --ipu0_data; - ++ipu0_idx; - - IPU_LOG( "ReadFIFO/IPUout -> %ls", out->ToString().c_str() ); - - return true; -} - void __fastcall ReadFIFO_IPUout(mem128_t* out) { - // FIXME! When ReadIpuData() doesn't succeed (returns false), the EE should probably stall - // until a value becomes available. This isn't exactly easy to do since the virtualized EE - // in PCSX2 *has* to be running in order for the IPU DMA to upload new input data to allow - // IPUout's FIFO to fill. Thus if we implement an EE stall, PCSX2 deadlocks. Grr. --air + if (!pxAssertDev( ipuRegs.ctrl.OFC > 0, "Attempted read from IPUout's FIFO, but the FIFO is empty!" )) return; + ipu_fifo.out.read(out, 1); - if (decoder.ReadIpuData(out)) - { - ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31; - } + // Games should always check the fifo before reading from it -- so if the FIFO has no data + // it's either some glitchy game or a bug in pcsx2. 
} void __fastcall WriteFIFO_IPUin(const mem128_t* value) diff --git a/pcsx2/IPU/IPU_Fifo.h b/pcsx2/IPU/IPU_Fifo.h index 10a1e940d3..69d2eab597 100644 --- a/pcsx2/IPU/IPU_Fifo.h +++ b/pcsx2/IPU/IPU_Fifo.h @@ -37,13 +37,10 @@ struct IPU_Fifo_Output int readpos, writepos; // returns number of qw read - int write(const u32 * value, int size); - void read(void *value,int size); - void readsingle(void *value); + int write(const u32 * value, uint size); + void read(void *value, uint size); void clear(); wxString desc() const; - - void _readsingle(void *value); }; struct IPU_Fifo diff --git a/pcsx2/IPU/IPUdma.cpp b/pcsx2/IPU/IPUdma.cpp index 5a5949e0b0..70ca857171 100644 --- a/pcsx2/IPU/IPUdma.cpp +++ b/pcsx2/IPU/IPUdma.cpp @@ -312,8 +312,9 @@ int IPU1dma() int IPU0dma() { + if(!ipuRegs.ctrl.OFC) return 0; + int readsize; - static int totalsize = 0; tDMA_TAG* pMem; if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0)) @@ -329,7 +330,6 @@ int IPU0dma() pMem = dmaGetAddr(ipu0dma.madr, true); readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC); - totalsize+=readsize; ipu_fifo.out.read(pMem, readsize); ipu0dma.madr += readsize << 4; @@ -363,7 +363,6 @@ int IPU0dma() //This broke vids in Digital Devil Saga //Note that interrupting based on totalsize is just guessing.. IPU_INT_FROM( readsize * BIAS ); - totalsize = 0; } return readsize; diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp index 27edd38f89..f222ab68c3 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp +++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp @@ -338,9 +338,6 @@ do { \ static __fi bool get_intra_block() { - int i; - int j; - int val; const u8 * scan = decoder.scantype ? 
mpeg2_scan.alt : mpeg2_scan.norm; const u8 (&quant_matrix)[64] = decoder.iq; int quantizer_scale = decoder.quantizer_scale; @@ -348,7 +345,7 @@ static __fi bool get_intra_block() u16 code; /* decode AC coefficients */ - for (i=1 + ipu_cmd.pos[4]; ; i++) + for (int i=1 + ipu_cmd.pos[4]; ; i++) { switch (ipu_cmd.pos[5]) { @@ -427,60 +424,64 @@ static __fi bool get_intra_block() return true; } - i+= tab->run == 65 ? GETBITS(6) : tab->run; + i += (tab->run == 65) ? GETBITS(6) : tab->run; if (i >= 64) { ipu_cmd.pos[4] = 0; return true; } + case 1: - if (!GETWORD()) - { - ipu_cmd.pos[4] = i - 1; - ipu_cmd.pos[5] = 1; - return false; + { + if (!GETWORD()) + { + ipu_cmd.pos[4] = i - 1; + ipu_cmd.pos[5] = 1; + return false; + } + + uint j = scan[i]; + int val; + + if (tab->run==65) /* escape */ + { + if(!decoder.mpeg1) + { + val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4; + DUMPBITS(12); + } + else + { + val = SBITS(8); + DUMPBITS(8); + + if (!(val & 0x7f)) + { + val = GETBITS(8) + 2 * val; + } + + val = (val * quantizer_scale * quant_matrix[i]) >> 4; + val = (val + ~ (((s32)val) >> 31)) | 1; + } + } + else + { + val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4; + if(decoder.mpeg1) + { + /* oddification */ + val = (val - 1) | 1; + } + + /* if (bitstream_get (1)) val = -val; */ + val = (val ^ SBITS(1)) - SBITS(1); + DUMPBITS(1); + } + + SATURATE(val); + dest[j] = val; + ipu_cmd.pos[5] = 0; } - - j = scan[i]; - - if (tab->run==65) /* escape */ - { - if(!decoder.mpeg1) - { - val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4; - DUMPBITS(12); - } - else - { - val = SBITS(8); - DUMPBITS(8); - - if (!(val & 0x7f)) - { - val = GETBITS(8) + 2 * val; - } - - val = (val * quantizer_scale * quant_matrix[i]) >> 4; - val = (val + ~ (((s32)val) >> 31)) | 1; - } - } - else - { - val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4; - if(decoder.mpeg1) - { - /* oddification */ - val = (val - 1) | 1; - } - - /* if (bitstream_get (1)) val = -val; */ - 
val = (val ^ SBITS(1)) - SBITS(1); - DUMPBITS(1); - } - - SATURATE(val); - dest[j] = val; - ipu_cmd.pos[5] = 0; } } @@ -798,6 +799,9 @@ bool mpeg2sliceIDEC() ipu_cmd.pos[2] = 6; return false; } + break; + + jNO_DEFAULT; } // Send The MacroBlock via DmaIpuFrom @@ -812,23 +816,23 @@ bool mpeg2sliceIDEC() } case 2: - while (decoder.ipu0_data > 0) - { - uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); + { + pxAssume(decoder.ipu0_data > 0); - if (read == 0) - { - ipu_cmd.pos[1] = 2; - return false; - } - else - { - decoder.AdvanceIpuDataBy(read); - } + uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); + decoder.AdvanceIpuDataBy(read); + + if (decoder.ipu0_data != 0) + { + // IPU FIFO filled up -- Will have to finish transferring later. + ipu_cmd.pos[1] = 2; + return false; } decoder.mbc++; mbaCount = 0; + } + case 3: while (1) { @@ -886,6 +890,8 @@ bool mpeg2sliceIDEC() } break; + + jNO_DEFAULT; } ipu_cmd.pos[1] = 0; @@ -919,6 +925,8 @@ finish_idec: BigEndian(ipuRegs.top, ipuRegs.top); break; + + jNO_DEFAULT; } return true; @@ -1010,6 +1018,8 @@ bool mpeg2_slice() return false; } break; + + jNO_DEFAULT; } ipu_copy(mb8, mb16); @@ -1077,6 +1087,8 @@ bool mpeg2_slice() } } break; + + jNO_DEFAULT; } } } @@ -1101,21 +1113,23 @@ bool mpeg2_slice() decoder.SetOutputTo(mb16); case 3: - while (decoder.ipu0_data > 0) - { - uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); + { + pxAssume(decoder.ipu0_data > 0); - if (size == 0) - { - ipu_cmd.pos[0] = 3; - return false; - } - else - { - decoder.AdvanceIpuDataBy(size); - } + uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data); + decoder.AdvanceIpuDataBy(read); + + if (decoder.ipu0_data != 0) + { + // IPU FIFO filled up -- Will have to finish transferring later. 
+ ipu_cmd.pos[0] = 3; + return false; } + decoder.mbc++; + mbaCount = 0; + } + case 4: bit8 = 1; if (!getBits8((u8*)&bit8, 0)) diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h index 13431eb396..877f4cdd46 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.h +++ b/pcsx2/IPU/mpeg2lib/Mpeg.h @@ -148,7 +148,7 @@ struct decoder_t { macroblock_rgb32 rgb32; macroblock_rgb16 rgb16; - uint ipu0_data; + uint ipu0_data; // amount of data in the output macroblock (in QWC) uint ipu0_idx; /* bit parsing stuff */ @@ -230,7 +230,7 @@ struct decoder_t { ipu0_data -= amt; } - bool ReadIpuData(u128* out); + __fi bool ReadIpuData(u128* out); }; struct mpeg2_scan_pack diff --git a/pcsx2/IPU/mpeg2lib/Vlc.h b/pcsx2/IPU/mpeg2lib/Vlc.h index 0b30d1b8bb..86f9f7ad5e 100644 --- a/pcsx2/IPU/mpeg2lib/Vlc.h +++ b/pcsx2/IPU/mpeg2lib/Vlc.h @@ -36,22 +36,21 @@ static __fi int GETWORD() { - static u8 data[2]; + if (decoder.bitstream_bits <= 0) return 1; - if (decoder.bitstream_bits > 0) + static u8 data[2]; + + if(!getBits16(data,1)) { - if(!getBits16(data,1)) - { - return 0; - } - - /*u32 data; - BigEndian(data, *(u32*)word); - decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits; - decoder.bitstream_bits -= 32;*/ - decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits; - decoder.bitstream_bits -= 16; + return 0; } + + /*u32 data; + BigEndian(data, *(u32*)word); + decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits; + decoder.bitstream_bits -= 32;*/ + decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits; + decoder.bitstream_bits -= 16; return 1; }