diff --git a/pcsx2/IPU/IPU.cpp b/pcsx2/IPU/IPU.cpp index 55f988dcc3..5e19cd9124 100644 --- a/pcsx2/IPU/IPU.cpp +++ b/pcsx2/IPU/IPU.cpp @@ -55,9 +55,6 @@ int coded_block_pattern = 0; u8 indx4[16*16/2]; __aligned16 decoder_t decoder; -__aligned16 u8 _readbits[80]; //local buffer (ring buffer) -u8* readbits = _readbits; // always can decrement by one 1qw - __fi void IPUProcessInterrupt() { if (ipuRegs.ctrl.BUSY && g_BP.IFC) IPUWorker(); @@ -96,8 +93,6 @@ void ReportIPU() Console.WriteLn("g_decoder = 0x%x.", &decoder); Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan); Console.WriteLn(ipu_cmd.desc()); - Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.", - _readbits, readbits - _readbits); Console.Newline(); } @@ -114,15 +109,6 @@ void SaveStateBase::ipuFreeze() Freeze(coded_block_pattern); Freeze(decoder); Freeze(ipu_cmd); - Freeze(_readbits); - - int temp = readbits - _readbits; - Freeze(temp); - - if (IsLoading()) - { - readbits = _readbits; - } } void tIPU_CMD_IDEC::log() const @@ -213,21 +199,27 @@ __fi u32 ipuRead32(u32 mem) switch (mem) { ipucase(IPU_CTRL): // IPU_CTRL + { ipuRegs.ctrl.IFC = g_BP.IFC; ipuRegs.ctrl.CBP = coded_block_pattern; if (!ipuRegs.ctrl.BUSY) IPU_LOG("read32: IPU_CTRL=0x%08X", ipuRegs.ctrl._u32); - return ipuRegs.ctrl._u32; + return ipuRegs.ctrl._u32; + } ipucase(IPU_BP): // IPU_BP + { + pxAssume(g_BP.FP <= 2); + ipuRegs.ipubp = g_BP.BP & 0x7f; ipuRegs.ipubp |= g_BP.IFC << 8; - ipuRegs.ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16; + ipuRegs.ipubp |= g_BP.FP << 16; IPU_LOG("read32: IPU_BP=0x%08X", ipuRegs.ipubp); - return ipuRegs.ipubp; + return ipuRegs.ipubp; + } default: IPU_LOG("read32: Addr=0x%08X Value = 0x%08X", mem, psHu32(IPU_CMD + mem)); @@ -283,9 +275,7 @@ void ipuSoftReset() ipu_cmd.clear(); ipuRegs.cmd.BUSY = 0; - g_BP.BP = 0; - g_BP.FP = 0; - //g_BP.bufferhasnew = 0; + memzero(g_BP); } __fi bool ipuWrite32(u32 mem, u32 value) @@ -354,12 +344,11 @@ static void ipuBCLR(u32 val) { ipu_fifo.in.clear(); + memzero(g_BP); g_BP.BP = val & 0x7F; - g_BP.FP = 0; - //g_BP.bufferhasnew = 0; + ipuRegs.ctrl.BUSY = 0; ipuRegs.cmd.BUSY = 0; - memzero(_readbits); IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X", g_BP.BP); } @@ -370,7 +359,7 @@ static bool ipuIDEC(u32 val, bool resume) if (!resume) { idec.log(); - g_BP.BP += idec.FB;//skip FB bits + g_BP.Advance(idec.FB); //from IPU_CTRL ipuRegs.ctrl.PCT = I_TYPE; //Intra DECoding;) @@ -407,7 +396,7 @@ static __fi bool ipuBDEC(u32 val, bool resume) bdec.log(s_bdec); if (IsDebugBuild) s_bdec++; - g_BP.BP += bdec.FB;//skip FB bits + g_BP.Advance(bdec.FB); decoder.coding_type = I_TYPE; decoder.mpeg1 = ipuRegs.ctrl.MP1; decoder.q_scale_type = ipuRegs.ctrl.QST; @@ -433,11 +422,7 @@ static bool __fastcall ipuVDEC(u32 val) switch (ipu_cmd.pos[0]) { case 0: - ipuRegs.cmd.DATA = 0; - if (!getBits32((u8*)&decoder.bitstream_buf, 0)) return false; - - decoder.bitstream_bits = -16; - BigEndian(decoder.bitstream_buf, decoder.bitstream_buf); + if (!bitstream_init()) return false; switch ((val >> 26) & 3) { @@ -459,17 +444,14 @@ static bool __fastcall ipuVDEC(u32 val) case 3://DMVector ipuRegs.cmd.DATA = get_dmv(); break; + + jNO_DEFAULT } - g_BP.BP += (int)decoder.bitstream_bits + 16; + ipuRegs.cmd.DATA &= 0xFFFF; + ipuRegs.cmd.DATA |= 0x10000; - if ((int)g_BP.BP < 0) - { - g_BP.BP += 128; - ReorderBitstream(); - } - - ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16); + //ipuRegs.cmd.DATA = (ipuRegs.cmd.DATA & 0xFFFF) | ((decoder.bitstream_bits + 16) << 16); ipuRegs.ctrl.ECD = (ipuRegs.cmd.DATA == 0); case 1: @@ -479,14 +461,14 @@ static bool __fastcall ipuVDEC(u32 val) return false; } - BigEndian(ipuRegs.top, ipuRegs.top); + ipuRegs.top = BigEndian(ipuRegs.top); IPU_LOG("VDEC command data 0x%x(0x%x). Skip 0x%X bits/Table=%d (%s), pct %d", ipuRegs.cmd.DATA, ipuRegs.cmd.DATA >> 16, val & 0x3f, (val >> 26) & 3, (val >> 26) & 1 ? ((val >> 26) & 2 ? "DMV" : "MBT") : (((val >> 26) & 2 ? "MC" : "MBAI")), ipuRegs.ctrl.PCT); return true; - jNO_DEFAULT + jNO_DEFAULT } return false; @@ -496,7 +478,7 @@ static __fi bool ipuFDEC(u32 val) { if (!getBits32((u8*)&ipuRegs.cmd.DATA, 0)) return false; - BigEndian(ipuRegs.cmd.DATA, ipuRegs.cmd.DATA); + ipuRegs.cmd.DATA = BigEndian(ipuRegs.cmd.DATA); ipuRegs.top = ipuRegs.cmd.DATA; IPU_LOG("FDEC read: 0x%08x", ipuRegs.top); @@ -553,11 +535,10 @@ static bool ipuSETVQ(u32 val) if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return false; } - IPU_LOG("SETVQ command.\nRead VQCLUT table from FIFO."); - IPU_LOG( - "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d " - "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d" - "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d " + IPU_LOG("SETVQ command. Read VQCLUT table from FIFO.\n" + "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n" + "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n" + "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d\n" "%02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d %02d:%02d:%02d", vqclut[0] >> 10, (vqclut[0] >> 5) & 0x1F, vqclut[0] & 0x1F, vqclut[1] >> 10, (vqclut[1] >> 5) & 0x1F, vqclut[1] & 0x1F, @@ -723,148 +704,48 @@ __fi void ipu_vq(macroblock_rgb16& rgb16, u8* indx4) Console.Error("IPU: VQ not implemented"); } -__fi void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16) -{ - const u8 *s = (const u8*)&mb8; - s16 *d = (s16*)&mb16; - int i; - for (i = 0; i < 256; i++) *d++ = *s++; //Y bias - 16 - for (i = 0; i < 64; i++) *d++ = *s++; //Cr bias - 128 - for (i = 0; i < 64; i++) *d++ = *s++; //Cb bias - 128 -} - // -------------------------------------------------------------------------------------- // Buffer reader // -------------------------------------------------------------------------------------- -// move the readbits queue -__fi void inc_readbits() +__ri u32 UBITS(uint bits) { - readbits += 16; - if (readbits >= _readbits + 64) - { - // move back - *(u64*)(_readbits) = *(u64*)(_readbits + 64); - *(u64*)(_readbits + 8) = *(u64*)(_readbits + 72); - readbits = _readbits; - } + uint readpos8 = g_BP.BP/8; + + uint result = BigEndian(*(u32*)( (u8*)g_BP.internal_qwc + readpos8 )); + uint bp7 = (g_BP.BP & 7); + result <<= bp7; + result >>= (32 - bits); + + return result; } -// returns the pointer of readbits moved by 1 qword -__fi u8* next_readbits() +__ri s32 SBITS(uint bits) { - return readbits + 16; -} + // Read an unaligned 32 bit value and then shift the bits up and then back down. -// returns the pointer of readbits moved by 1 qword -u8* prev_readbits() -{ - if (readbits < _readbits + 16) return _readbits + 48 - (readbits - _readbits); + uint readpos8 = g_BP.BP/8; - return readbits - 16; -} + int result = BigEndian(*(s32*)( (s8*)g_BP.internal_qwc + readpos8 )); + uint bp7 = (g_BP.BP & 7); + result <<= bp7; + result >>= (32 - bits); -void ReorderBitstream() -{ - readbits = prev_readbits(); - g_BP.FP = 2; -} - -// IPU has a 2qword internal buffer whose status is pointed by FP. -// If FP is 1, there's 1 qword in buffer. Second qword is only loaded -// incase there are less than 32bits available in the first qword. -// \return Number of bits available (clamps at 16 bits) -u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size) -{ - if (g_BP.FP == 0) - { - if (ipu_fifo.in.read(next_readbits()) == 0) return 0; - - inc_readbits(); - g_BP.FP = 1; - } - - if ((g_BP.FP < 2) && ((*(int*)pointer + size) >= 128)) - { - if (ipu_fifo.in.read(next_readbits())) g_BP.FP += 1; - } - - if (*(int*)pointer >= 128) - { - pxAssert(g_BP.FP >= 1); - - if (g_BP.FP > 1) inc_readbits(); - - if (advance) - { - g_BP.FP--; - *pointer &= 127; - } - } - - return (g_BP.FP >= 1) ? g_BP.FP * 128 - (*(int*)pointer) : 0; + return result; } // whenever reading fractions of bytes. The low bits always come from the next byte // while the high bits come from the current byte -u8 __fastcall getBits128(u8 *address, u32 advance) +u8 getBits64(u8 *address, bool advance) { - u64 mask2; - u128 mask; - u8* readpos; + if (!g_BP.FillBuffer(64)) return 0; - // Check if the current BP has exceeded or reached the limit of 128 - if (FillInternalBuffer(&g_BP.BP, 1, 128) < 128) return 0; - - readpos = readbits + (int)g_BP.BP / 8; + const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8]; if (uint shift = (g_BP.BP & 7)) { - mask2 = 0xff >> shift; - mask.lo = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56); - mask.hi = mask2 | (mask2 << 8) | (mask2 << 16) | (mask2 << 24) | (mask2 << 32) | (mask2 << 40) | (mask2 << 48) | (mask2 << 56); - - u128 notMask; - u128 data = *(u128*)(readpos + 1); - notMask.lo = ~mask.lo & data.lo; - notMask.hi = ~mask.hi & data.hi; - notMask.lo >>= 8 - shift; - notMask.lo |= (notMask.hi & (ULLONG_MAX >> (64 - shift))) << (64 - shift); - notMask.hi >>= 8 - shift; - - mask.hi = (((*(u128*)readpos).hi & mask.hi) << shift) | (((*(u128*)readpos).lo & mask.lo) >> (64 - shift)); - mask.lo = ((*(u128*)readpos).lo & mask.lo) << shift; - - notMask.lo |= mask.lo; - notMask.hi |= mask.hi; - *(u128*)address = notMask; - } - else - { - *(u128*)address = *(u128*)readpos; - } - - if (advance) g_BP.BP += 128; - - return 1; -} - -// whenever reading fractions of bytes. The low bits always come from the next byte -// while the high bits come from the current byte -u8 __fastcall getBits64(u8 *address, u32 advance) -{ - register u64 mask = 0; - u8* readpos; - - // Check if the current BP has exceeded or reached the limit of 128 - if (FillInternalBuffer(&g_BP.BP, 1, 64) < 64) return 0; - - readpos = readbits + (int)g_BP.BP / 8; - - if (uint shift = (g_BP.BP & 7)) - { - mask = (0xff >> shift); + u64 mask = (0xff >> shift); mask = mask | (mask << 8) | (mask << 16) | (mask << 24) | (mask << 32) | (mask << 40) | (mask << 48) | (mask << 56); *(u64*)address = ((~mask & *(u64*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u64*)readpos) << shift); @@ -874,55 +755,47 @@ u8 __fastcall getBits64(u8 *address, u32 advance) *(u64*)address = *(u64*)readpos; } - if (advance) g_BP.BP += 64; + if (advance) g_BP.Advance(64); return 1; } // whenever reading fractions of bytes. The low bits always come from the next byte // while the high bits come from the current byte -u8 __fastcall getBits32(u8 *address, u32 advance) +__fi u8 getBits32(u8 *address, bool advance) { - u32 mask; - u8* readpos; + if (!g_BP.FillBuffer(32)) return 0; - // Check if the current BP has exceeded or reached the limit of 128 - if (FillInternalBuffer(&g_BP.BP, 1, 32) < 32) return 0; - - readpos = readbits + (int)g_BP.BP / 8; - - if (uint shift = (g_BP.BP & 7)) + const u8* readpos = &g_BP.internal_qwc->_u8[g_BP.BP/8]; + + if(uint shift = (g_BP.BP & 7)) { - mask = (0xff >> shift); + u32 mask = (0xff >> shift); mask = mask | (mask << 8) | (mask << 16) | (mask << 24); *(u32*)address = ((~mask & *(u32*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u32*)readpos) << shift); } else { + // Bit position-aligned -- no masking/shifting necessary *(u32*)address = *(u32*)readpos; } - if (advance) g_BP.BP += 32; + if (advance) g_BP.Advance(32); return 1; } -__fi u8 __fastcall getBits16(u8 *address, u32 advance) +__fi u8 getBits16(u8 *address, bool advance) { - u32 mask; - u8* readpos; + if (!g_BP.FillBuffer(16)) return 0; - // Check if the current BP has exceeded or reached the limit of 128 - if (FillInternalBuffer(&g_BP.BP, 1, 16) < 16) return 0; - - readpos = readbits + (int)g_BP.BP / 8; + const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8]; if (uint shift = (g_BP.BP & 7)) { - mask = (0xff >> shift); + uint mask = (0xff >> shift); mask = mask | (mask << 8); - *(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift); } else @@ -930,25 +803,20 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance) *(u16*)address = *(u16*)readpos; } - if (advance) g_BP.BP += 16; + if (advance) g_BP.Advance(16); return 1; } -u8 __fastcall getBits8(u8 *address, u32 advance) +u8 getBits8(u8 *address, bool advance) { - u32 mask; - u8* readpos; + if (!g_BP.FillBuffer(8)) return 0; - // Check if the current BP has exceeded or reached the limit of 128 - if (FillInternalBuffer(&g_BP.BP, 1, 8) < 8) - return 0; - - readpos = readbits + (int)g_BP.BP / 8; + const u8* readpos = &g_BP.internal_qwc[0]._u8[g_BP.BP/8]; if (uint shift = (g_BP.BP & 7)) { - mask = (0xff >> shift); + uint mask = (0xff >> shift); *(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift); } else @@ -956,7 +824,7 @@ u8 __fastcall getBits8(u8 *address, u32 advance) *(u8*)address = *(u8*)readpos; } - if (advance) g_BP.BP += 8; + if (advance) g_BP.Advance(8); return 1; } @@ -983,7 +851,7 @@ void IPUCMD_WRITE(u32 val) case SCE_IPU_VDEC: - g_BP.BP += val & 0x3F; + g_BP.Advance(val & 0x3F); // check if enough data in queue if (ipuVDEC(val)) return; @@ -993,9 +861,11 @@ void IPUCMD_WRITE(u32 val) break; case SCE_IPU_FDEC: - IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, FP %d, CHCR 0x%x", - val & 0x3f, g_BP.IFC, (int)g_BP.BP, g_BP.FP, ipu1dma.chcr._u32); - g_BP.BP += val & 0x3F; + IPU_LOG("FDEC command. Skip 0x%X bits, FIFO 0x%X qwords, BP 0x%X, CHCR 0x%x", + val & 0x3f, g_BP.IFC, (int)g_BP.BP, ipu1dma.chcr._u32); + + g_BP.Advance(val & 0x3F); + if (ipuFDEC(val)) return; ipuRegs.cmd.BUSY = 0x80000000; ipuRegs.topbusy = 0x80000000; @@ -1009,7 +879,7 @@ void IPUCMD_WRITE(u32 val) case SCE_IPU_SETIQ: IPU_LOG("SETIQ command."); if (val & 0x3f) IPU_LOG("Skip %d bits.", val & 0x3f); - g_BP.BP += val & 0x3F; + g_BP.Advance(val & 0x3F); if (ipuSETIQ(val)) return; break; diff --git a/pcsx2/IPU/IPU.h b/pcsx2/IPU/IPU.h index e33c211b3e..a719e07474 100644 --- a/pcsx2/IPU/IPU.h +++ b/pcsx2/IPU/IPU.h @@ -67,11 +67,66 @@ union tIPU_CTRL { void reset() { _u32 = 0; } }; -struct tIPU_BP { - u32 BP; // Bit stream point - u16 IFC; // Input FIFO counter - u8 FP; // FIFO point - u8 bufferhasnew; // Always 0. +__aligned16 struct tIPU_BP { + __aligned16 u128 internal_qwc[2]; + + u32 BP; // Bit stream point (0 to 128*2) + u32 IFC; // Input FIFO counter (8QWC) (0 to 8) + u32 FP; // internal FIFO (2QWC) fill status (0 to 2) + + __fi void Align() + { + BP = (BP + 7) & ~7; + Advance(0); + } + + __fi void Advance(uint bits) + { + BP += bits; + pxAssume( BP <= 256 ); + + if (BP > 127) + { + BP -= 128; + + if (FP == 2) + { + // when BP is over 128 it means we're reading data from the second quadword. Shift that one + // to the front and load the new quadword into the second QWC (its a manualized ringbuffer!) + + CopyQWC(&internal_qwc[0], &internal_qwc[1]); + FP = 1; + } + else + { + // if FP == 1 then the buffer has been completely drained. + // if FP == 0 then an already-drained buffer is being advanced. + // In either case we just assign FP to 0. + + FP = 0; + } + } + } + + __fi bool FillBuffer(u32 bits) + { + while (FP < 2) + { + if (ipu_fifo.in.read(&internal_qwc[FP]) == 0) + { + // Here we *try* to fill the entire internal QWC buffer; however that may not necessarily + // be possible -- so if the fill fails we'll only return 0 if we don't have enough + // remaining bits in the FIFO to fill the request. + + return ((FP!=0) && (BP + bits) <= 128); + } + + ++FP; + } + + return true; + } + wxString desc() const { return wxsFormat(L"Ipu BP: bp = 0x%x, IFC = 0x%x, FP = 0x%x.", BP, IFC, FP); @@ -217,10 +272,9 @@ extern void IPUCMD_WRITE(u32 val); extern void ipuSoftReset(); extern void IPUProcessInterrupt(); -extern u16 __fastcall FillInternalBuffer(u32 * pointer, u32 advance, u32 size); -extern u8 __fastcall getBits128(u8 *address, u32 advance); -extern u8 __fastcall getBits64(u8 *address, u32 advance); -extern u8 __fastcall getBits32(u8 *address, u32 advance); -extern u8 __fastcall getBits16(u8 *address, u32 advance); -extern u8 __fastcall getBits8(u8 *address, u32 advance); +extern u8 getBits128(u8 *address, bool advance); +extern u8 getBits64(u8 *address, bool advance); +extern u8 getBits32(u8 *address, bool advance); +extern u8 getBits16(u8 *address, bool advance); +extern u8 getBits8(u8 *address, bool advance); diff --git a/pcsx2/IPU/IPU_Fifo.cpp b/pcsx2/IPU/IPU_Fifo.cpp index 4b749ae339..25b0aad6f5 100644 --- a/pcsx2/IPU/IPU_Fifo.cpp +++ b/pcsx2/IPU/IPU_Fifo.cpp @@ -85,14 +85,14 @@ int IPU_Fifo_Input::write(u32* pMem, int size) int IPU_Fifo_Input::read(void *value) { // wait until enough data to ensure proper streaming. - if (g_BP.IFC < 4) + if (g_BP.IFC < 3) { // IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO if(cpuRegs.eCycle[4] == 0x9999) { CPU_INT( DMAC_TO_IPU, 32 ); } - + if (g_BP.IFC == 0) return 0; pxAssert(g_BP.IFC > 0); } diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.cpp b/pcsx2/IPU/mpeg2lib/Mpeg.cpp index f222ab68c3..bdef4b2d49 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.cpp +++ b/pcsx2/IPU/mpeg2lib/Mpeg.cpp @@ -47,10 +47,14 @@ const int non_linear_quantizer_scale [] = into 1st slot is copied to the 2nd slot. Which will later be copied back to the 1st slot when 128bits have been read. */ -extern void ReorderBitstream(); const DCTtab * tab; int mbaCount = 0; +int bitstream_init () +{ + return g_BP.FillBuffer(32); +} + int get_macroblock_modes() { int macroblock_modes; @@ -221,9 +225,7 @@ int __fi get_motion_delta(const int f_code) int __fi get_dmv() { - const DMVtab * tab; - - tab = DMV_2 + UBITS(2); + const DMVtab* tab = DMV_2 + UBITS(2); DUMPBITS(tab->len); return tab->dmv; } @@ -239,22 +241,21 @@ int get_macroblock_address_increment() else if (code >= 768) mba = MBA.mba11 + (UBITS(11) - 24); else switch (UBITS(11)) - { + { + case 8: /* macroblock_escape */ + DUMPBITS(11); + return 0x23; - case 8: /* macroblock_escape */ + case 15: /* macroblock_stuffing (MPEG1 only) */ + if (decoder.mpeg1) + { DUMPBITS(11); - return 0x23; + return 0x22; + } - case 15: /* macroblock_stuffing (MPEG1 only) */ - if (decoder.mpeg1) - { - DUMPBITS(11); - return 0x22; - } - - default: - return 0;//error - } + default: + return 0;//error + } DUMPBITS(mba->len); @@ -336,7 +337,7 @@ do { \ val = (((s32)val) >> 31) ^ 2047; \ } while (0) -static __fi bool get_intra_block() +static bool get_intra_block() { const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm; const u8 (&quant_matrix)[64] = decoder.iq; @@ -474,7 +475,8 @@ static __fi bool get_intra_block() } /* if (bitstream_get (1)) val = -val; */ - val = (val ^ SBITS(1)) - SBITS(1); + int bit1 = SBITS(1); + val = (val ^ bit1) - bit1; DUMPBITS(1); } @@ -489,7 +491,7 @@ static __fi bool get_intra_block() return true; } -static __fi bool get_non_intra_block(int * last) +static bool get_non_intra_block(int * last) { int i; int j; @@ -615,8 +617,9 @@ static __fi bool get_non_intra_block(int * last) } else { + int bit1 = SBITS(1); val = ((2 * tab->level + 1) * quantizer_scale * quant_matrix[i]) >> 5; - val = (val ^ SBITS(1)) - SBITS(1); + val = (val ^ bit1) - bit1; DUMPBITS(1); } @@ -683,25 +686,11 @@ void __fi finishmpeg2sliceIDEC() { ipuRegs.ctrl.SCD = 0; coded_block_pattern = decoder.coded_block_pattern; - - g_BP.BP += decoder.bitstream_bits - 16; - - if ((int)g_BP.BP < 0) - { - g_BP.BP = 128 + (int)g_BP.BP; - - // After BP is positioned correctly, we need to reload the old buffer - // so that reading may continue properly - ReorderBitstream(); - } - - FillInternalBuffer(&g_BP.BP, 1, 0); } bool mpeg2sliceIDEC() { u16 code; - u8 bit8; switch (ipu_cmd.pos[0]) { @@ -855,18 +844,18 @@ bool mpeg2sliceIDEC() } else switch (UBITS(11)) { - case 8: /* macroblock_escape */ - mbaCount += 33; - /* pass through */ + case 8: /* macroblock_escape */ + mbaCount += 33; + /* pass through */ - case 15: /* macroblock_stuffing (MPEG1 only) */ - DUMPBITS(11); - continue; + case 15: /* macroblock_stuffing (MPEG1 only) */ + DUMPBITS(11); + continue; - default: /* end of slice/frame, or error? */ - { - goto finish_idec; - } + default: /* end of slice/frame, or error? */ + { + goto finish_idec; + } } } @@ -897,12 +886,13 @@ bool mpeg2sliceIDEC() ipu_cmd.pos[1] = 0; ipu_cmd.pos[2] = 0; } - + finish_idec: finishmpeg2sliceIDEC(); case 3: - bit8 = 1; + { + u8 bit8; if (!getBits8((u8*)&bit8, 0)) { ipu_cmd.pos[0] = 3; @@ -911,10 +901,10 @@ finish_idec: if (bit8 == 0) { - if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7); - + g_BP.Align(); ipuRegs.ctrl.SCD = 1; } + } case 4: if (!getBits32((u8*)&ipuRegs.top, 0)) @@ -923,7 +913,7 @@ finish_idec: return false; } - BigEndian(ipuRegs.top, ipuRegs.top); + ipuRegs.top = BigEndian(ipuRegs.top); break; jNO_DEFAULT; @@ -935,7 +925,6 @@ finish_idec: bool mpeg2_slice() { int DCT_offset, DCT_stride; - u8 bit8; macroblock_8& mb8 = decoder.mb8; macroblock_16& mb16 = decoder.mb16; @@ -1022,7 +1011,31 @@ bool mpeg2_slice() jNO_DEFAULT; } - ipu_copy(mb8, mb16); + // Copy macroblock8 to macroblock16 - without sign extension. + // Manually inlined due to MSVC refusing to inline the SSE-optimized version. + { + const u8 *s = (const u8*)&mb8; + u16 *d = (u16*)&mb16; + + //Y bias - 16 * 16 + //Cr bias - 8 * 8 + //Cb bias - 8 * 8 + + __m128i zeroreg = _mm_setzero_si128(); + + for (uint i = 0; i < (256+64+64) / 32; ++i) + { + //*d++ = *s++; + __m128i woot1 = _mm_load_si128((__m128i*)s); + __m128i woot2 = _mm_load_si128((__m128i*)s+1); + _mm_store_si128((__m128i*)d, _mm_unpacklo_epi8(woot1, zeroreg)); + _mm_store_si128((__m128i*)d+1, _mm_unpackhi_epi8(woot1, zeroreg)); + _mm_store_si128((__m128i*)d+2, _mm_unpacklo_epi8(woot2, zeroreg)); + _mm_store_si128((__m128i*)d+3, _mm_unpackhi_epi8(woot2, zeroreg)); + s += 32; + d += 32; + } + } } else { @@ -1096,18 +1109,6 @@ bool mpeg2_slice() // Send The MacroBlock via DmaIpuFrom ipuRegs.ctrl.SCD = 0; coded_block_pattern = decoder.coded_block_pattern; - g_BP.BP += (int)decoder.bitstream_bits - 16; - - // BP goes from 0 to 128, so negative values mean to read old buffer - // so we minus from 128 to get the correct BP - if ((int)g_BP.BP < 0) - { - g_BP.BP = 128 + (int)g_BP.BP; - - // After BP is positioned correctly, we need to reload the old buffer - // so that reading may continue properly - ReorderBitstream(); - } decoder.mbc = 1; decoder.SetOutputTo(mb16); @@ -1131,7 +1132,8 @@ bool mpeg2_slice() } case 4: - bit8 = 1; + { + u8 bit8; if (!getBits8((u8*)&bit8, 0)) { ipu_cmd.pos[0] = 4; @@ -1140,11 +1142,11 @@ bool mpeg2_slice() if (bit8 == 0) { - if (g_BP.BP & 7) g_BP.BP += 8 - (g_BP.BP & 7); - + g_BP.Align(); ipuRegs.ctrl.SCD = 1; } - + } + case 5: if (!getBits32((u8*)&ipuRegs.top, 0)) { @@ -1152,8 +1154,7 @@ bool mpeg2_slice() return false; } - BigEndian(ipuRegs.top, ipuRegs.top); - decoder.bitstream_bits = 0; + ipuRegs.top = BigEndian(ipuRegs.top); break; } diff --git a/pcsx2/IPU/mpeg2lib/Mpeg.h b/pcsx2/IPU/mpeg2lib/Mpeg.h index 877f4cdd46..5ea46631e7 100644 --- a/pcsx2/IPU/mpeg2lib/Mpeg.h +++ b/pcsx2/IPU/mpeg2lib/Mpeg.h @@ -152,8 +152,8 @@ struct decoder_t { uint ipu0_idx; /* bit parsing stuff */ - u32 bitstream_buf; /* current 32 bit working set */ - int bitstream_bits; /* used bits in working set */ + //u32 bitstream_buf; /* current 32 bit working set */ + //int bitstream_bits; /* used bits in working set */ int quantizer_scale; /* remove */ int dmv_offset; /* remove */ @@ -241,6 +241,10 @@ struct mpeg2_scan_pack mpeg2_scan_pack(); }; +extern int bitstream_init (); +extern u32 UBITS(uint bits); +extern s32 SBITS(uint bits); + extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride); extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride); @@ -258,20 +262,19 @@ extern int get_dmv(); extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn); extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte); extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4); -extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16); extern int slice (u8 * buffer); #ifdef _MSC_VER -#define BigEndian(out, in) out = _byteswap_ulong(in) +#define BigEndian(in) _byteswap_ulong(in) #else -#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap... +#define BigEndian(in) __builtin_bswap32(in) // or we could use the asm function bswap... #endif #ifdef _MSC_VER -#define BigEndian64(out, in) out = _byteswap_uint64(in) +#define BigEndian64(in) _byteswap_uint64(in) #else -#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap... +#define BigEndian64(in) __builtin_bswap64(in) // or we could use the asm function bswap... #endif extern __aligned16 const mpeg2_scan_pack mpeg2_scan; diff --git a/pcsx2/IPU/mpeg2lib/Vlc.h b/pcsx2/IPU/mpeg2lib/Vlc.h index 86f9f7ad5e..cac61dd40c 100644 --- a/pcsx2/IPU/mpeg2lib/Vlc.h +++ b/pcsx2/IPU/mpeg2lib/Vlc.h @@ -30,64 +30,24 @@ #ifndef __VLC_H__ #define __VLC_H__ -//static u8 word[4]; -//static u8 dword[8]; -//static u8 qword[16]; - static __fi int GETWORD() { - if (decoder.bitstream_bits <= 0) return 1; - - static u8 data[2]; - - if(!getBits16(data,1)) - { - return 0; - } - - /*u32 data; - BigEndian(data, *(u32*)word); - decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits; - decoder.bitstream_bits -= 32;*/ - decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits; - decoder.bitstream_bits -= 16; - - return 1; + return g_BP.FillBuffer(16); } -static __fi int bitstream_init () +// Removes bits from the bitstream. This is done independently of UBITS/SBITS because a +// lot of mpeg streams have to read ahead and rewind bits and re-read them at different +// bit depths or sign'age. +static __fi void DUMPBITS(uint num) { - if (!getBits32((u8*)&decoder.bitstream_buf, 1)) - { - return 0; - } - - decoder.bitstream_bits = -16; - BigEndian(decoder.bitstream_buf, decoder.bitstream_buf); - /*decoder.bitstream_buf = *(u64*)dword; - BigEndian64(decoder.bitstream_buf, decoder.bitstream_buf);*/ - - return 1; + g_BP.Advance(num); + //pxAssume(g_BP.FP != 0); } -/* remove num valid bits from bit_buf */ -static __fi void DUMPBITS(int num) +static __fi u32 GETBITS(uint num) { - decoder.bitstream_buf <<= num; - decoder.bitstream_bits += num; -} - -/* take num bits from the high part of bit_buf and zero extend them */ -#define UBITS(num) (((u32)decoder.bitstream_buf) >> (32 - (num))) - -/* take num bits from the high part of bit_buf and sign extend them */ -#define SBITS(num) (((s32)decoder.bitstream_buf) >> (32 - (num))) - -/* Get bits from bitstream */ -static __fi u32 GETBITS(int num) -{ - u16 retVal = UBITS(num); - DUMPBITS(num); + uint retVal = UBITS(num); + g_BP.Advance(num); return retVal; } diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h index bf506a3d58..a981837907 100644 --- a/pcsx2/SaveState.h +++ b/pcsx2/SaveState.h @@ -24,7 +24,7 @@ // the lower 16 bit value. IF the change is breaking of all compatibility with old // states, increment the upper 16 bit value, and clear the lower 16 bits to 0. -static const u32 g_SaveVersion = 0x8b4a0000; +static const u32 g_SaveVersion = 0x8b4b0000; // this function is meant to be used in the place of GSfreeze, and provides a safe layer // between the GS saving function and the MTGS's needs. :)