diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index e36a5b1541..721e6d51ca 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -257,12 +257,14 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) { vu1Thread.WriteMicroMem(addr, (u8*)data, size*4); return; } - if (memcmp_mmx(VUx.Micro + addr, data, size*4)) { + //The compare is pretty much a waste of time, likelyhood is that the program isnt there, thats why its copying it. + //Faster without. + //if (memcmp_mmx(VUx.Micro + addr, data, size*4)) { // Clear VU memory before writing! if (!idx) CpuVU0->Clear(addr, size*4); else CpuVU1->Clear(addr, size*4); - memcpy_fast(VUx.Micro + addr, data, size*4); - } + memcpy_aligned(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast + //} } vifOp(vifCode_MPG) { @@ -381,14 +383,6 @@ vifOp(vifCode_Nop) { pass1 { GetVifX.cmd = 0; GetVifX.pass = 0; - /*if(idx && vif1ch.chcr.STR == true) - { - //Some games use a huge stream of NOPS to wait for a GIF packet to start, alas the way PCSX2 works it never starts - //So the mask can go on before the packet continues, causing desync. - - if(((data[1] >> 24) & 0x7f) == 0x6) //Look in to the future and see if we have a mask path 3 command (NFSU) - GetVifX.vifstalled = true; //Stall if we do to get the timing right. - }*/ } pass3 { VifCodeLog("Nop"); } return 1; diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index d11472d12a..52c58923d0 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -22,34 +22,12 @@ // VifCode Transfer Interpreter (Vif0/Vif1) //------------------------------------------------------------------ -// Doesn't stall if the next vifCode is the Mark command -_vifT bool runMark(u32* &data) { - if (((vifXRegs.code >> 24) & 0x7f) == 0x7) { - //DevCon.WriteLn("Vif%d: Running Mark with I-bit", idx); - return 1; // No Stall? - } - return 1; // Stall -} - -// Returns 1 if i-bit && finished vifcode && i-bit not masked -_vifT bool analyzeIbit(u32* &data, int iBit) { - vifStruct& vifX = GetVifX; - if (iBit && !vifX.cmd && !vifXRegs.err.MII) { - //DevCon.WriteLn("Vif I-Bit IRQ"); - vifX.irq++; - - if(CHECK_VIF1STALLHACK) return 0; - else return 1; - } - return 0; -} - // Interprets packet _vifT void vifTransferLoop(u32* &data) { vifStruct& vifX = GetVifX; u32& pSize = vifX.vifpacketsize; - int iBit = vifX.cmd >> 7; + int ret = 0; vifXRegs.stat.VPS |= VPS_TRANSFERRING; @@ -58,11 +36,19 @@ _vifT void vifTransferLoop(u32* &data) { while (pSize > 0 && !vifX.vifstalled) { if(!vifX.cmd) { // Get new VifCode - + + if(!vifXRegs.err.MII) + { + if(vifX.irq && !CHECK_VIF1STALLHACK) + break; + + vifX.irq = data[0] >> 31; + } + vifXRegs.code = data[0]; vifX.cmd = data[0] >> 24; - iBit = data[0] >> 31; - + + //VIF_LOG("New VifCMD %x tagsize %x", vifX.cmd, vifX.tag.size); if (IsDevBuild && SysTrace.EE.VIFcode.IsActive()) { // Pass 2 means "log it" @@ -73,10 +59,7 @@ _vifT void vifTransferLoop(u32* &data) { ret = vifCmdHandler[idx][vifX.cmd & 0x7f](vifX.pass, data); data += ret; pSize -= ret; - if (analyzeIbit(data, iBit)) break; } - - if (pSize) vifX.vifstalled = true; } _vifT static __fi bool vifTransfer(u32 *data, int size, bool TTE) { diff --git a/pcsx2/x86/newVif_Dynarec.cpp b/pcsx2/x86/newVif_Dynarec.cpp index 925f24f69f..08f91083e5 100644 --- a/pcsx2/x86/newVif_Dynarec.cpp +++ b/pcsx2/x86/newVif_Dynarec.cpp @@ -75,12 +75,14 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const { const int idx = v.idx; const vifStruct& vif = MTVU_VifX; - u32 m0 = vB.mask; - u32 m1 = m0 & 0xaaaaaaaa; - u32 m2 =(~m1>>1) & m0; - u32 m3 = (m1>>1) & ~m0; - if((m2&&doMask)||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); } + //This could have ended up copying the row when there was no row to write.1810080 + u32 m0 = vB.mask; //The actual mask example 0x03020100 + u32 m3 = ((m0 & 0xaaaaaaaa)>>1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge) + u32 m2 = (m0 & 0x55555555) & (~m0>>1); // 0x1000100 & 0xFE7EFF7F = 0x00000100 Just the row + + if((m2&&doMask)||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); MSKPATH3_LOG("Moving row");} if (m3&&doMask) { + MSKPATH3_LOG("Merging Cols"); xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]); if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1); if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2); @@ -92,33 +94,37 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const { void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { pxAssertDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking."); - xRegisterSSE t = regX == xmm0 ? xmm1 : xmm0; // Get Temp Reg + int cc = aMin(vCL, 3); - u32 m0 = (vB.mask >> (cc * 8)) & 0xff; - u32 m1 = m0 & 0xaa; - u32 m2 =(~m1>>1) & m0; - u32 m3 = (m1>>1) & ~m0; - u32 m4 = (m1>>1) & m0; + u32 m0 = (vB.mask >> (cc * 8)) & 0xff; //The actual mask example 0xE4 (protect, col, row, clear) + u32 m3 = ((m0 & 0xaa)>>1) & ~m0; //all the upper bits (cols shifted right) cancelling out any write protects 0x10 + u32 m2 = (m0 & 0x55) & (~m0>>1); // all the lower bits (rows)cancelling out any write protects 0x04 + u32 m4 = (m0 & ~((m3<<1) | m2)) & 0x55; // = 0xC0 & 0x55 = 0x40 (for merge mask) + makeMergeMask(m2); makeMergeMask(m3); makeMergeMask(m4); - if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect - if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge MaskRow - if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge MaskCol - if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect + + if (doMask&&m2) { mergeVectors(regX, xmmRow, xmmTemp, m2); } // Merge MaskRow + if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), xmmTemp, m3); } // Merge MaskCol + if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); + mergeVectors(regX, xmmTemp, xmmTemp, m4); } // Merge Write Protect if (doMode) { - u32 m5 = (~m1>>1) & ~m0; + u32 m5 = ~(m2|m3|m4) & 0xf; + if (!doMask) m5 = 0xf; - else makeMergeMask(m5); - if (m5 < 0xf) { + + if (m5 < 0xf) + { xPXOR(xmmTemp, xmmTemp); - mergeVectors(xmmTemp, xmmRow, t, m5); + mergeVectors(xmmTemp, xmmRow, xmmTemp, m5); xPADD.D(regX, xmmTemp); - if (doMode==2) mergeVectors(xmmRow, regX, t, m5); + if (doMode==2) mergeVectors(xmmRow, regX, xmmTemp, m5); } - else if (m5 == 0xf) { + else + { xPADD.D(regX, xmmRow); - if (doMode==2) xMOVAPS(xmmRow, regX); + if (doMode==2){ xMOVAPS(xmmRow, regX); } } } xMOVAPS(ptr32[dstIndirect], regX); @@ -127,6 +133,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { void VifUnpackSSE_Dynarec::writeBackRow() const { const int idx = v.idx; xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow); + DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]"); // ToDo: Do we need to write back to vifregs.rX too!? :/ } @@ -143,9 +150,39 @@ static void ShiftDisplacementWindow( xAddressVoid& addr, const xRegister32& modR addImm += 0xf0; addr -= 0xf0; } - if(addImm) xADD(modReg, addImm); + if(addImm) { xADD(modReg, addImm); } } +void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp ) +{ + + switch( upknum ) + { + case 0: + case 1: + case 2: UnpkNoOfIterations = 4; if(PostOp == true) { UnpkLoopIteration++; UnpkLoopIteration = UnpkLoopIteration % UnpkNoOfIterations; } break; + + case 4: + case 5: + case 6: UnpkNoOfIterations = 2; if(PostOp == true) { UnpkLoopIteration++; UnpkLoopIteration = UnpkLoopIteration % UnpkNoOfIterations; } break; + + case 8: break; + case 9: break; + case 10: break; + + case 12: break; + case 13: break; + case 14: break; + case 15: break; + + case 3: + case 7: + case 11: + pxFailRel( wxsFormat( L"Vpu/Vif - Invalid Unpack! [%d]", upknum ) ); + break; + } + +} void VifUnpackSSE_Dynarec::CompileRoutine() { const int upkNum = vB.upkType & 0xf; const u8& vift = nVifT[upkNum]; @@ -155,29 +192,32 @@ void VifUnpackSSE_Dynarec::CompileRoutine() { uint vNum = vB.num ? vB.num : 256; doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature. + MSKPATH3_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum); pxAssume(vCL == 0); - + UnpkLoopIteration = 0; // Value passed determines # of col regs we need to load SetMasks(isFill ? blockSize : cycleSize); while (vNum) { - ShiftDisplacementWindow( srcIndirect, edx ); + ShiftDisplacementWindow( dstIndirect, ecx ); + if(UnpkNoOfIterations == 0) + ShiftDisplacementWindow( srcIndirect, edx ); //Don't need to do this otherwise as we arent reading the source. + + if (vCL < cycleSize) { + ModUnpack(upkNum, false); xUnpack(upkNum); xMovDest(); + ModUnpack(upkNum, true); + dstIndirect += 16; srcIndirect += vift; - if( IsUnmaskedOp() ) { - ++destReg; - ++workReg; - } - vNum--; if (++vCL == blockSize) vCL = 0; } @@ -189,11 +229,6 @@ void VifUnpackSSE_Dynarec::CompileRoutine() { dstIndirect += 16; - if( IsUnmaskedOp() ) { - ++destReg; - ++workReg; - } - vNum--; if (++vCL == blockSize) vCL = 0; } @@ -256,7 +291,7 @@ _vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill) ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); } else { - //DevCon.WriteLn("Running Interpreter Block"); + DevCon.WriteLn("Running Interpreter Block"); _nVifUnpack(idx, data, vifRegs.mode, isFill); } return true; diff --git a/pcsx2/x86/newVif_Unpack.cpp b/pcsx2/x86/newVif_Unpack.cpp index bf5dc26094..2bb5474487 100644 --- a/pcsx2/x86/newVif_Unpack.cpp +++ b/pcsx2/x86/newVif_Unpack.cpp @@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) { if (ret == vif.tag.size) { // Full Transfer if (v.bSize) { // Last transfer was partial - memcpy_fast(&v.buffer[v.bSize], data, size); + memcpy_aligned(&v.buffer[v.bSize], data, size); v.bSize += size; size = v.bSize; data = v.buffer; @@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) { v.bSize = 0; } else { // Partial Transfer - memcpy_fast(&v.buffer[v.bSize], data, size); + memcpy_aligned(&v.buffer[v.bSize], data, size); v.bSize += size; vif.tag.size -= ret; diff --git a/pcsx2/x86/newVif_UnpackSSE.cpp b/pcsx2/x86/newVif_UnpackSSE.cpp index d0806d42b2..deb7d9405b 100644 --- a/pcsx2/x86/newVif_UnpackSSE.cpp +++ b/pcsx2/x86/newVif_UnpackSSE.cpp @@ -25,7 +25,6 @@ //static __pagealigned u8 nVifUpkExec[__pagesize*4]; static RecompiledCodeReserve* nVifUpkExec = NULL; - // Merges xmm vectors without modifying source reg void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) { if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15) @@ -33,7 +32,7 @@ void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xy mVUmergeRegs(dest, src, xyzw); } else { - xMOVAPS(temp, src); + if(temp != src) xMOVAPS(temp, src); //Sometimes we don't care if the source is modified and is temp reg. mVUmergeRegs(dest, temp, xyzw); } } @@ -61,7 +60,7 @@ void VifUnpackSSE_Base::xShiftR(const xRegisterSSE& regX, int n) const { void VifUnpackSSE_Base::xPMOVXX8(const xRegisterSSE& regX) const { if (usn) xPMOVZX.BD(regX, ptr32[srcIndirect]); - else xPMOVSX.BD(regX, ptr32[srcIndirect]); + else xPMOVSX.BD(regX, ptr32[srcIndirect]); } void VifUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const { @@ -70,37 +69,87 @@ void VifUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const { } void VifUnpackSSE_Base::xUPK_S_32() const { - xMOV32 (workReg, ptr32[srcIndirect]); - xPSHUF.D (destReg, workReg, _v0); + + switch(UnpkLoopIteration) + { + case 0: + xMOV128 (workReg, ptr32[srcIndirect]); + xPSHUF.D (destReg, workReg, _v0); + break; + case 1: + xPSHUF.D (destReg, workReg, _v1); + break; + case 2: + xPSHUF.D (destReg, workReg, _v2); + break; + case 3: + xPSHUF.D (destReg, workReg, _v3); + break; + } + } void VifUnpackSSE_Base::xUPK_S_16() const { - if (x86caps.hasStreamingSIMD4Extensions) + + if (!x86caps.hasStreamingSIMD4Extensions) { - xPMOVXX16 (workReg); + xMOV16 (workReg, ptr32[srcIndirect]); + xPUNPCK.LWD(workReg, workReg); + xShiftR (workReg, 16); + + xPSHUF.D (destReg, workReg, _v0); + return; } - else - { - xMOV16 (workReg, ptr32[srcIndirect]); - xPUNPCK.LWD(workReg, workReg); - xShiftR (workReg, 16); + + switch(UnpkLoopIteration) + { + case 0: + xPMOVXX16 (workReg); + xPSHUF.D (destReg, workReg, _v0); + break; + case 1: + xPSHUF.D (destReg, workReg, _v1); + break; + case 2: + xPSHUF.D (destReg, workReg, _v2); + break; + case 3: + xPSHUF.D (destReg, workReg, _v3); + break; } - xPSHUF.D (destReg, workReg, _v0); + } void VifUnpackSSE_Base::xUPK_S_8() const { - if (x86caps.hasStreamingSIMD4Extensions) - { - xPMOVXX8 (workReg); - } - else + + if (!x86caps.hasStreamingSIMD4Extensions) { xMOV8 (workReg, ptr32[srcIndirect]); xPUNPCK.LBW(workReg, workReg); xPUNPCK.LWD(workReg, workReg); xShiftR (workReg, 24); + + xPSHUF.D (destReg, workReg, _v0); + return; } - xPSHUF.D (destReg, workReg, _v0); + + switch(UnpkLoopIteration) + { + case 0: + xPMOVXX8 (workReg); + xPSHUF.D (destReg, workReg, _v0); + break; + case 1: + xPSHUF.D (destReg, workReg, _v1); + break; + case 2: + xPSHUF.D (destReg, workReg, _v2); + break; + case 3: + xPSHUF.D (destReg, workReg, _v3); + break; + } + } // The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate". @@ -109,44 +158,75 @@ void VifUnpackSSE_Base::xUPK_S_8() const { // I have commented after each shuffle to show what data is going where - Ref void VifUnpackSSE_Base::xUPK_V2_32() const { - xMOV64 (destReg, ptr32[srcIndirect]); - xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0 + + if(UnpkLoopIteration == 0) + { + xMOV128 (workReg, ptr32[srcIndirect]); + xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0 + } + else + { + xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2 + } + } void VifUnpackSSE_Base::xUPK_V2_16() const { - if (x86caps.hasStreamingSIMD4Extensions) + + if(UnpkLoopIteration == 0 || !x86caps.hasStreamingSIMD4Extensions) { - xPMOVXX16 (destReg); + if (x86caps.hasStreamingSIMD4Extensions) + { + xPMOVXX16 (workReg); + + } + else + { + xMOV32 (workReg, ptr32[srcIndirect]); + xPUNPCK.LWD(workReg, workReg); + xShiftR (workReg, 16); + } + xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0 } - else + else { - xMOV32 (destReg, ptr32[srcIndirect]); - xPUNPCK.LWD(destReg, destReg); - xShiftR (destReg, 16); + xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2 } - xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0 + + } void VifUnpackSSE_Base::xUPK_V2_8() const { - if (x86caps.hasStreamingSIMD4Extensions) + + if(UnpkLoopIteration == 0 || !x86caps.hasStreamingSIMD4Extensions) { - xPMOVXX8 (destReg); + if (x86caps.hasStreamingSIMD4Extensions) + { + xPMOVXX8 (workReg); + } + else + { + xMOV16 (workReg, ptr32[srcIndirect]); + xPUNPCK.LBW(workReg, workReg); + xPUNPCK.LWD(workReg, workReg); + xShiftR (workReg, 24); + } + xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0 } - else + else { - xMOV16 (destReg, ptr32[srcIndirect]); - xPUNPCK.LBW(destReg, destReg); - xPUNPCK.LWD(destReg, destReg); - xShiftR (destReg, 24); + xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2 } - xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0 + } void VifUnpackSSE_Base::xUPK_V3_32() const { + xMOV128 (destReg, ptr128[srcIndirect]); } void VifUnpackSSE_Base::xUPK_V3_16() const { + if (x86caps.hasStreamingSIMD4Extensions) { xPMOVXX16 (destReg); @@ -160,6 +240,7 @@ void VifUnpackSSE_Base::xUPK_V3_16() const { } void VifUnpackSSE_Base::xUPK_V3_8() const { + if (x86caps.hasStreamingSIMD4Extensions) { xPMOVXX8 (destReg); @@ -174,10 +255,12 @@ void VifUnpackSSE_Base::xUPK_V3_8() const { } void VifUnpackSSE_Base::xUPK_V4_32() const { + xMOV128 (destReg, ptr32[srcIndirect]); } void VifUnpackSSE_Base::xUPK_V4_16() const { + if (x86caps.hasStreamingSIMD4Extensions) { xPMOVXX16 (destReg); @@ -191,6 +274,7 @@ void VifUnpackSSE_Base::xUPK_V4_16() const { } void VifUnpackSSE_Base::xUPK_V4_8() const { + if (x86caps.hasStreamingSIMD4Extensions) { xPMOVXX8 (destReg); @@ -205,6 +289,7 @@ void VifUnpackSSE_Base::xUPK_V4_8() const { } void VifUnpackSSE_Base::xUPK_V4_5() const { + xMOV16 (workReg, ptr32[srcIndirect]); xPSHUF.D (workReg, workReg, _v0); xPSLL.D (workReg, 3); // ABG|R5.000 diff --git a/pcsx2/x86/newVif_UnpackSSE.h b/pcsx2/x86/newVif_UnpackSSE.h index 231a080b35..6c43b5b34b 100644 --- a/pcsx2/x86/newVif_UnpackSSE.h +++ b/pcsx2/x86/newVif_UnpackSSE.h @@ -34,12 +34,15 @@ class VifUnpackSSE_Base public: bool usn; // unsigned flag bool doMask; // masking write enable flag + int UnpkLoopIteration; + int UnpkNoOfIterations; + protected: xAddressVoid dstIndirect; xAddressVoid srcIndirect; xRegisterSSE workReg; - xRegisterSSE destReg; + xRegisterSSE destReg; public: VifUnpackSSE_Base(); @@ -105,7 +108,7 @@ class VifUnpackSSE_Dynarec : public VifUnpackSSE_Base public: bool isFill; int doMode; // two bit value representing... something! - + protected: const nVifStruct& v; // vif0 or vif1 const nVifBlock& vB; // some pre-collected data from VifStruct @@ -126,7 +129,9 @@ public: virtual bool IsUnmaskedOp() const{ return !doMode && !doMask; } + void ModUnpack( int upknum, bool PostOp ); void CompileRoutine(); + protected: virtual void doMaskWrite(const xRegisterSSE& regX) const;