diff --git a/pcsx2/Vif1Dma.cpp b/pcsx2/Vif1Dma.cpp index 01231ca75e..8c7df26cb3 100644 --- a/pcsx2/Vif1Dma.cpp +++ b/pcsx2/Vif1Dma.cpp @@ -313,15 +313,25 @@ static int __fastcall Vif1TransDirectHL(u32 *data) return ret; } - +#ifdef newVif1 + extern void initNewVif(int idx); + extern int nVifUnpack(int idx, u32 *data); + static int testVif = 0; +#endif static int __fastcall Vif1TransUnpack(u32 *data) { +#ifdef newVif1 + if (!testVif) { initNewVif(1); testVif = 1; } + //int temp = nVifUnpack(1, data); + //if (temp >= 0) return temp; + return nVifUnpack(1, data); +#endif XMMRegisters::Freeze(); if (vif1.vifpacketsize < vif1.tag.size) { int ret = vif1.tag.size; - /* size is less that the total size, transfer is 'in pieces' */ + // size is less that the total size, transfer is 'in pieces' if (vif1Regs->offset != 0 || vif1.cl != 0) { vif1.tag.size -= vif1.vifpacketsize - VIFalign<1>(data, &vif1.tag, vif1.vifpacketsize); diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp index 2ed529de1a..a27a88f0bb 100644 --- a/pcsx2/VifDma.cpp +++ b/pcsx2/VifDma.cpp @@ -460,11 +460,13 @@ template u32 VIFalign(u32 *data, vifCode *v, u32 size) } return size>>2; } - +#include "newVif.h" +#ifndef newVif template void VIFunpack<0>(u32 *data, vifCode *v, u32 size); template void VIFunpack<1>(u32 *data, vifCode *v, u32 size); template void VIFunpack(u32 *data, vifCode *v, u32 size) { + //DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); u32 *dest; u32 unpackType; UNPACKFUNCTYPE func; @@ -786,6 +788,7 @@ template void VIFunpack(u32 *data, vifCode *v, u32 size) } } } +#endif // #ifndef newVif template void vuExecMicro<0>(u32 addr); template void vuExecMicro<1>(u32 addr); diff --git a/pcsx2/VifDma_internal.h b/pcsx2/VifDma_internal.h index 774e7eb748..7ce8556ebf 100644 --- a/pcsx2/VifDma_internal.h +++ b/pcsx2/VifDma_internal.h @@ -60,4 +60,7 @@ static __forceinline u32 vif_size(u8 num) return (num == 0) ? 
0x1000 : 0x4000; } +//#define newVif // Enable 'newVif' Code (if the below macros are not defined, it will use old non-sse code) +//#define newVif1 // Use New Code for Vif1 Unpacks (needs newVif defined) +//#define newVif0 // Use New Code for Vif0 Unpacks (not implemented) #endif diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj index 92e16e5956..2461eab428 100644 --- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj +++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj @@ -836,6 +836,30 @@ > + + + + + + + + + + + + . + */ + +#pragma once + +#ifdef newVif +#include "x86emitter/x86emitter.h" +using namespace x86Emitter; +extern void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0); +extern void _nVifUnpack(int idx, u8 *data, u32 size); + +struct instBlock { u8 data[16*64]; }; +static __pagealigned instBlock nVifUpk[2][2][4][3][16]; // [USN][Masking][curCycle][CyclesToWrite-1][Unpack Type] +static __aligned16 u32 nVifMask[3][4][4] = {0}; // [MaskNumber][CycleNumber][Vector] +typedef u32 (__fastcall *nVifCall)(void*, void*); +#define nVifUnpackF(dest, src, usn, doMask, curCycle, cycles, unpackType) { \ + (((nVifCall)((void*)&nVifUpk[usn][doMask][curCycle][cycles][unpackType]))(dest, src)); \ +} + +#define _v0 0 +#define _v1 0x55 +#define _v2 0xaa +#define _v3 0xff +#define aMax(x, y) (((x) > (y) ? (x) : (y))) +#define aMin(x, y) (((x) < (y) ? 
(x) : (y))) +#define _f __forceinline + +#define xShiftR(regX, n) { \ + if (usn) { xPSRL.D(regX, n); } \ + else { xPSRA.D(regX, n); } \ +} + +u32 nVifT[16] = { + 4, // S-32 + 2, // S-16 + 1, // S-8 + 0, // ---- + 8, // V2-32 + 4, // V2-16 + 2, // V2-8 + 0, // ---- + 12,// V3-32 + 6, // V3-16 + 3, // V3-8 + 0, // ---- + 16,// V4-32 + 8, // V4-16 + 4, // V4-8 + 2, // V4-5 +}; + +#include "newVif_BlockBuffer.h" +#include "newVif_OldUnpack.inl" +#include "newVif_UnpackGen.inl" +#include "newVif_Unpack.inl" + +#endif diff --git a/pcsx2/x86/newVif_BlockBuffer.h b/pcsx2/x86/newVif_BlockBuffer.h new file mode 100644 index 0000000000..806565cff4 --- /dev/null +++ b/pcsx2/x86/newVif_BlockBuffer.h @@ -0,0 +1,40 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
#pragma once

// Growable, append-only byte buffer.
// Owns a heap array obtained with new[]; append() copies caller data to the
// end of the buffer and enlarges the allocation on demand.
class BlockBuffer {
private:
	u32 mSize;  // Bytes currently stored
	u32 mSizeT; // Bytes currently allocated (capacity)
	u8* mData;  // Backing storage (new[] / delete[])

	// The buffer owns mData, so a member-wise copy would free it twice.
	// Declared but intentionally not defined (pre-C++11 "non-copyable" idiom).
	BlockBuffer(const BlockBuffer&);
	BlockBuffer& operator=(const BlockBuffer&);

	// Re-allocates the storage to newSize bytes, keeping the stored bytes.
	// newSize must be >= mSize (append() guarantees this).
	void grow(u32 newSize) {
		u8* temp = new u8[newSize];
		memcpy(temp, mData, mSize);	// only the bytes in use need to survive
		delete[] mData;				// allocated with new[], so delete[] is required
		mData  = temp;
		mSizeT = newSize;			// record the new capacity so later appends see it
	}
public:
	// tSize: initial capacity in bytes.
	BlockBuffer(u32 tSize) { mSizeT = tSize; mSize = 0; mData = new u8[mSizeT]; }
	virtual ~BlockBuffer() { delete[] mData; }

	// Copies 'size' bytes from 'addr' to the end of the buffer.
	void append(void *addr, u32 size) {
		if (size == 0) return;		// nothing to copy; also avoids memcpy on a null source
		if (mSize + size > mSizeT) grow(mSize*2 + size);
		memcpy(&mData[mSize], addr, size);
		mSize += size;
	}
	// Forgets the stored bytes; the allocation is kept for reuse.
	void clear()	{ mSize = 0; }
	// Number of bytes currently stored.
	u32  getSize()	{ return mSize; }
	// Pointer to the first stored byte (invalidated by an append() that grows).
	u8*  getBlock()	{ return mData; }
};

// diff --git a/pcsx2/x86/newVif_OldUnpack.inl b/pcsx2/x86/newVif_OldUnpack.inl
// new file mode 100644
// index 0000000000..3f19e93a43

/*  PCSX2 - PS2 Emulator for PCs
 *  Copyright (C) 2002-2009  PCSX2 Dev Team
 *
 *  PCSX2 is free software: you can redistribute it and/or modify it under the terms
 *  of the GNU Lesser General Public License as published by the Free Software Found-
 *  ation, either version 3 of the License, or (at your option) any later version.
 *
 *  PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *  PURPOSE.  See the GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along with PCSX2.
 *  If not, see <http://www.gnu.org/licenses/>.
 */

// Old Vif Unpack Code
// Only here for testing/reference
// If newVif is defined and newVif1 isn't, vif1 will use this code
// same goes for vif0...
+template void VIFunpack<0>(u32 *data, vifCode *v, u32 size); +template void VIFunpack<1>(u32 *data, vifCode *v, u32 size); +template void VIFunpack(u32 *data, vifCode *v, u32 size) { + //if (!VIFdmanum) DevCon.WriteLn("vif#%d, size = %d [%x]", VIFdmanum, size, data); + UNPACKFUNCTYPE func; + const VIFUnpackFuncTable *ft; + VURegs * VU; + u8 *cdata = (u8*)data; + u32 tempsize = 0; + const u32 memlimit = vif_size(VIFdmanum); + + if (VIFdmanum == 0) { + VU = &VU0; + vifRegs = vif0Regs; + vifMaskRegs = g_vif0Masks; + vif = &vif0; + vifRow = g_vifmask.Row0; + } + else { + VU = &VU1; + vifRegs = vif1Regs; + vifMaskRegs = g_vif1Masks; + vif = &vif1; + vifRow = g_vifmask.Row1; + } + + u32 *dest = (u32*)(VU->Mem + v->addr); + u32 unpackType = v->cmd & 0xf; + + ft = &VIFfuncTable[ unpackType ]; + func = vif->usn ? ft->funcU : ft->funcS; + size <<= 2; + + if (vifRegs->cycle.cl >= vifRegs->cycle.wl) { // skipping write + if (v->addr >= memlimit) { + DevCon.Warning("Overflown at the start"); + v->addr &= (memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + + size = min(size, (int)vifRegs->num * ft->gsize); //size will always be the same or smaller + + tempsize = v->addr + ((((vifRegs->num-1) / vifRegs->cycle.wl) * + (vifRegs->cycle.cl - vifRegs->cycle.wl)) * 16) + (vifRegs->num * 16); + + //Sanity Check (memory overflow) + if (tempsize > memlimit) { + if (((vifRegs->cycle.cl != vifRegs->cycle.wl) && + ((memlimit + (vifRegs->cycle.cl - vifRegs->cycle.wl) * 16) == tempsize))) { + //It's a red herring, so ignore it! SSE unpacks will be much quicker. + DevCon.WriteLn("what!!!!!!!!!"); + //tempsize = 0; + tempsize = size; + size = 0; + } + else { + DevCon.Warning("VIF%x Unpack ending %x > %x", VIFdmanum, tempsize, VIFdmanum ? 
0x4000 : 0x1000); + tempsize = size; + size = 0; + } + } + else { + tempsize = size; + size = 0; + } + if (tempsize) { + int incdest = ((vifRegs->cycle.cl - vifRegs->cycle.wl) << 2) + 4; + size = 0; + int addrstart = v->addr; + //if((tempsize >> 2) != v->size) DevCon.Warning("split when size != tagsize"); + + VIFUNPACK_LOG("sorting tempsize :p, size %d, vifnum %d, addr %x", tempsize, vifRegs->num, v->addr); + + while ((tempsize >= ft->gsize) && (vifRegs->num > 0)) { + if(v->addr >= memlimit) { + DevCon.Warning("Mem limit overflow"); + v->addr &= (memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + + func(dest, (u32*)cdata, ft->qsize); + cdata += ft->gsize; + tempsize -= ft->gsize; + + vifRegs->num--; + vif->cl++; + + if (vif->cl == vifRegs->cycle.wl) { + dest += incdest; + v->addr +=(incdest * 4); + vif->cl = 0; + } + else { + dest += 4; + v->addr += 16; + } + } + if (v->addr >= memlimit) { + v->addr &=(memlimit - 1); + dest = (u32*)(VU->Mem + v->addr); + } + v->addr = addrstart; + if(tempsize > 0) size = tempsize; + } + + if (size >= ft->dsize && vifRegs->num > 0) { //Else write what we do have + DevCon.Warning("huh!!!!!!!!!!!!!!!!!!!!!!"); + VIF_LOG("warning, end with size = %d", size); + // unpack one qword + //v->addr += (size / ft->dsize) * 4; + func(dest, (u32*)cdata, size / ft->dsize); + size = 0; + VIFUNPACK_LOG("leftover done, size %d, vifnum %d, addr %x", size, vifRegs->num, v->addr); + } + } + else { // filling write + if(vifRegs->cycle.cl > 0) // Quicker and avoids zero division :P + if((u32)(((size / ft->gsize) / vifRegs->cycle.cl) * vifRegs->cycle.wl) < vifRegs->num) + DevCon.Warning("Filling write warning! 
%x < %x and CL = %x WL = %x", (size / ft->gsize), vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl); + + DevCon.Warning("filling write %d cl %d, wl %d mask %x mode %x unpacktype %x addr %x", vifRegs->num, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mask, vifRegs->mode, unpackType, vif->tag.addr); + while (vifRegs->num > 0) { + if (vif->cl == vifRegs->cycle.wl) { + vif->cl = 0; + } + // unpack one qword + if (vif->cl < vifRegs->cycle.cl) { + if(size < ft->gsize) { DevCon.WriteLn("Out of Filling write data!"); break; } + func(dest, (u32*)cdata, ft->qsize); + cdata += ft->gsize; + size -= ft->gsize; + vif->cl++; + vifRegs->num--; + if (vif->cl == vifRegs->cycle.wl) { + vif->cl = 0; + } + } + else { + func(dest, (u32*)cdata, ft->qsize); + v->addr += 16; + vifRegs->num--; + vif->cl++; + } + dest += 4; + if (vifRegs->num == 0) break; + } + } +} diff --git a/pcsx2/x86/newVif_Unpack.inl b/pcsx2/x86/newVif_Unpack.inl new file mode 100644 index 0000000000..f46373c70d --- /dev/null +++ b/pcsx2/x86/newVif_Unpack.inl @@ -0,0 +1,190 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +// newVif! 
- author: cottonvibes(@gmail.com) + +#pragma once + +struct nVifStruct { + u32 idx; // VIF0 or VIF1 + vifStruct* vif; // Vif Struct ptr + VIFregisters* vifRegs; // Vif Regs ptr + VURegs* VU; // VU Regs ptr + u8* vuMemEnd; // End of VU Memory + u32 vuMemLimit; // Use for fast AND + BlockBuffer* vifBlock; // Block Buffer +}; +nVifStruct nVif[2]; + +void initNewVif(int idx) { + nVif[idx].idx = idx; + nVif[idx].VU = idx ? &VU1 : &VU0; + nVif[idx].vif = idx ? &vif1 : &vif0; + nVif[idx].vifRegs = idx ? vif1Regs : vif0Regs; + nVif[idx].vifBlock = new BlockBuffer(0x2000); // 8kb Block Buffer + nVif[idx].vuMemEnd = idx ? ((u8*)(VU1.Mem + 0x4000)) : ((u8*)(VU0.Mem + 0x1000)); + nVif[idx].vuMemLimit= idx ? 0x3ff0 : 0xff0; + memset_8<0xcc,sizeof(nVifUpk)>(nVifUpk); + for (int a = 0; a < 2; a++) { + for (int b = 0; b < 2; b++) { + for (int c = 0; c < 4; c++) { + for (int d = 0; d < 3; d++) { + nVifGen(a, b, c, d); //nVifUpk[2][2][4][3][16]; + }}}} +} + +int nVifUnpack(int idx, u32 *data) { + XMMRegisters::Freeze(); + BlockBuffer* vB = nVif[idx].vifBlock; + int ret = aMin(vif1.vifpacketsize, vif1.tag.size); + vif1.tag.size -= ret; + _nVifUnpack(idx, (u8*)data, ret<<2); + if (vif1.tag.size <= 0) vif1.tag.size = 0; + if (vif1.tag.size <= 0) vif1.cmd = 0; + XMMRegisters::Thaw(); + return ret; +} + +_f u8* setVUptr(int idx, int offset) { + return (u8*)(nVif[idx].VU->Mem + (offset & nVif[idx].vuMemLimit)); +} + +_f void incVUptr(int idx, u8* &ptr, int amount) { + ptr += amount; + int diff = ptr - nVif[idx].vuMemEnd; + if (diff >= 0) { + ptr = nVif[idx].VU->Mem + diff; + } + if ((uptr)ptr & 0xf) DevCon.WriteLn("unaligned wtf :("); +} + +_f void setMasks(VIFregisters* v) { + for (int i = 0; i < 16; i++) { + int m = (v->mask >> (i*2)) & 3; + switch (m) { + case 0: // Data + nVifMask[0][i/4][i%4] = 0xffffffff; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = 0; + break; + case 1: // Row + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = 
((u32*)&v->r0)[(i%4)*4]; + break; + case 2: // Col + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0; + nVifMask[2][i/4][i%4] = ((u32*)&v->c0)[(i/4)*4]; + break; + case 3: // Write Protect + nVifMask[0][i/4][i%4] = 0; + nVifMask[1][i/4][i%4] = 0xffffffff; + nVifMask[2][i/4][i%4] = 0; + break; + } + } +} + +_f void _nVifUnpack(int idx, u8 *data, u32 size) { + /*if (nVif[idx].vifRegs->cycle.cl >= nVif[idx].vifRegs->cycle.wl) { // skipping write + if (!idx) VIFunpack<0>((u32*)data, &vif0.tag, size>>2); + else VIFunpack<1>((u32*)data, &vif1.tag, size>>2); + return; + } + else*/ { // filling write + vif = nVif[idx].vif; + vifRegs = nVif[idx].vifRegs; + int isFill = !!(vifRegs->cycle.cl < vifRegs->cycle.wl); + int usn = !!(vif->usn); + int doMask = !!(vif->tag.cmd & 0x10); + int upkNum = vif->tag.cmd & 0xf; + int doMode = !!(vifRegs->mode); + if (doMask) setMasks(vifRegs); + + //if (isFill) + //DevCon.WriteLn("%s Write! [num = %d][%s]", (isFill?"Filling":"Skipping"), vifRegs->num, (vifRegs->num%3 ? "bad!" : "ok")); + //DevCon.WriteLn("%s Write! [mask = %08x][type = %02d][num = %d]", (isFill?"Filling":"Skipping"), vifRegs->mask, upkNum, vifRegs->num); + + u8* dest = setVUptr(idx, vif->tag.addr); + const VIFUnpackFuncTable* ft = &VIFfuncTable[vif->tag.cmd & 0xf]; + UNPACKFUNCTYPE func = vif->usn ? ft->funcU : ft->funcS; + int cycleSize = isFill ? vifRegs->cycle.cl : vifRegs->cycle.wl; + int blockSize = isFill ? 
vifRegs->cycle.wl : vifRegs->cycle.cl; + //vif->cl = 0; + while (vifRegs->num > 0) { + if (vif->cl >= blockSize) { + vif->cl = 0; + } + if (vif->cl < cycleSize) { + if (size <= 0) { DevCon.WriteLn("_nVifUnpack: Out of Data!"); break; } + if (doMode /*|| doMask*/) { + //if (doMask) + //DevCon.WriteLn("Non SSE; unpackNum = %d", upkNum); + func((u32*)dest, (u32*)data, ft->qsize); + data += ft->gsize; + size -= ft->gsize; + vifRegs->num--; + } + else if (1) { + //DevCon.WriteLn("SSE Unpack!"); + nVifUnpackF(dest, data, usn, doMask, aMin(vif->cl, 4), 0, upkNum); + data += nVifT[upkNum]; + size -= nVifT[upkNum]; + vifRegs->num--; + } + else { + //DevCon.WriteLn("SSE Unpack!"); + int c = aMin((cycleSize - vif->cl), 3); + int t = nVifT[upkNum]; + size -= t * c; + //if (c>1) { DevCon.WriteLn("C > 1!"); } + if (c<0||c>3) { DevCon.WriteLn("C wtf!"); } + if (size < 0) { DevCon.WriteLn("Size Shit"); size+=t*c;c=1;size-=t*c;} + nVifUnpackF(dest, data, usn, doMask, aMin(vif->cl, 4), c-1, upkNum); + data += t * c; + vifRegs->num -= c; + } + } + else if (isFill) { + func((u32*)dest, (u32*)data, ft->qsize); + vifRegs->num--; + } + incVUptr(idx, dest, 16); + vif->cl = (vif->cl+1) % blockSize; + } + } +} + +//int nVifUnpack(int idx, u32 *data) { +// XMMRegisters::Freeze(); +// BlockBuffer* vB = nVif[idx].vifBlock; +// int ret = aMin(vif1.vifpacketsize, vif1.tag.size); +// //vB->append(data, ret<<2); +// vif1.tag.size -= ret; +// //DevCon.WriteLn("2 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); +// //if (vif1.tag.size <= 0) { +// //DevCon.WriteLn("3 [0x%x][%d][%d]", vif1.tag.addr, vB->getSize(), vif1.tag.size<<2); +// //VIFunpack<1>(vB->getBlock(), &vif1.tag, vB->getSize()>>2); +// //_nVifUnpack(idx, vB->getBlock(), vB->getSize()); +// _nVifUnpack(idx, (u8*)data, ret<<2); +// if (vif1.tag.size <= 0) vif1.tag.size = 0; +// if (vif1.tag.size <= 0) vif1.cmd = 0; +// //vB->clear(); +// //} +// //else { vif1.tag.size+=ret; ret = -1; vB->clear(); } +// 
XMMRegisters::Thaw(); +// return ret; +//} diff --git a/pcsx2/x86/newVif_UnpackGen.inl b/pcsx2/x86/newVif_UnpackGen.inl new file mode 100644 index 0000000000..08c44c004c --- /dev/null +++ b/pcsx2/x86/newVif_UnpackGen.inl @@ -0,0 +1,223 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2009 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +#pragma once + +#define xMaskWrite(regX, x) { \ + if (x==0) xMOVAPS(xmm7, ptr32[ecx]); \ + if (x==1) xMOVAPS(xmm7, ptr32[ecx+0x10]); \ + if (x==2) xMOVAPS(xmm7, ptr32[ecx+0x20]); \ + int offX = aMin(curCycle+x, 4); \ + xPAND(regX, ptr32[nVifMask[0][offX]]); \ + xPAND(xmm7, ptr32[nVifMask[1][offX]]); \ + xPOR (regX, ptr32[nVifMask[2][offX]]); \ + xPOR (regX, xmm7); \ + if (x==0) xMOVAPS(ptr32[ecx], regX); \ + if (x==1) xMOVAPS(ptr32[ecx+0x10], regX); \ + if (x==2) xMOVAPS(ptr32[ecx+0x20], regX); \ +} + +#define xMovDest(reg0, reg1, reg2) { \ + if (mask==0) { \ + if (cycles>=0) { xMOVAPS (ptr32[ecx], reg0); } \ + if (cycles>=1) { xMOVAPS (ptr32[ecx+0x10], reg1); } \ + if (cycles>=2) { xMOVAPS (ptr32[ecx+0x20], reg2); } \ + } \ + else { \ + if (cycles>=0) { xMaskWrite(reg0, 0); } \ + if (cycles>=1) { xMaskWrite(reg1, 1); } \ + if (cycles>=2) { xMaskWrite(reg2, 2); } \ + } \ +} + +// xmm2 gets result +void convertRGB() { + xPSLL.D (xmm1, 3); // ABG|R5.000 + xMOVAPS (xmm2, xmm1);// R5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // ABG + xPSLL.D (xmm1, 
3); // AB|G5.000 + xMOVAPS (xmm3, xmm1);// G5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // AB + xPSLL.D (xmm1, 3); // A|B5.000 + xMOVAPS (xmm4, xmm1);// B5.000 (garbage upper bits) + xPSRL.D (xmm1, 8); // A + xPSLL.D (xmm1, 7); // A.0000000 + + xPSHUF.D (xmm1, xmm1, _v0); // A|A|A|A + xPSHUF.D (xmm3, xmm3, _v0); // G|G|G|G + xPSHUF.D (xmm4, xmm4, _v0); // B|B|B|B + mVUmergeRegs(XMM2, XMM1, 0x3); // A|x|x|R + mVUmergeRegs(XMM2, XMM3, 0x4); // A|x|G|R + mVUmergeRegs(XMM2, XMM4, 0x2); // A|B|G|R + + xPSLL.D (xmm2, 24); // can optimize to + xPSRL.D (xmm2, 24); // single AND... +} + +// ecx = dest, edx = src +void nVifGen(int usn, int mask, int curCycle, int cycles) { + HostSys::MemProtect(nVifUpk, sizeof(nVifUpk), Protect_ReadWrite, false); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x0]); // S-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x1]); // S-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x2]); // S-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=0) xPSHUF.D (xmm1, xmm0, _v0); + if (cycles>=1) xPSHUF.D (xmm2, xmm0, _v1); + if (cycles>=2) xPSHUF.D (xmm3, xmm0, _v2); + if (cycles>=0) xMovDest (xmm1, xmm2, xmm3); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x3]); // ---- + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x4]); // V2-32 + if (cycles>=0) 
xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x10]); + if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x5]); // V2-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x6]); // V2-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=2) xPSHUF.D (xmm2, xmm0, _v2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=1) xPSHUF.D (xmm1, xmm0, 0xe); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x7]); // ---- + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x8]); // V3-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+12]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+24]); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0x9]); // V3-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+6]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+12]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=1) xShiftR (xmm1, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xa]); // V3-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if 
(cycles>=1) xMOVUPS (xmm1, ptr32[edx+3]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+6]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=1) xShiftR (xmm1, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xb]); // ---- + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xc]); // V4-32 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xd]); // V4-16 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+0x10]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+0x20]); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 16); + if (cycles>=1) xShiftR (xmm1, 16); + if (cycles>=2) xShiftR (xmm2, 16); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xe]); // V4-8 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=1) xMOVUPS (xmm1, ptr32[edx+4]); + if (cycles>=2) xMOVUPS (xmm2, ptr32[edx+8]); + if (cycles>=0) xPUNPCK.LBW(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LBW(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LBW(xmm2, xmm2); + if (cycles>=0) xPUNPCK.LWD(xmm0, xmm0); + if (cycles>=1) xPUNPCK.LWD(xmm1, xmm1); + if (cycles>=2) xPUNPCK.LWD(xmm2, xmm2); + if (cycles>=0) xShiftR (xmm0, 24); + if (cycles>=1) xShiftR (xmm1, 24); + if (cycles>=2) xShiftR (xmm2, 24); + if (cycles>=0) xMovDest (xmm0, xmm1, xmm2); + xRET(); + + // A | B5 | G5 | R5 + // ..0.. 
A 0000000 | ..0.. B 000 | ..0.. G 000 | ..0.. R 000 + xSetPtr(&nVifUpk[usn][mask][curCycle][cycles][0xf]); // V4-5 + if (cycles>=0) xMOVUPS (xmm0, ptr32[edx]); + if (cycles>=0) xMOVAPS (xmm1, xmm0); + if (cycles>=0) convertRGB(); + if (cycles>=0) xMOVAPS (ptr32[ecx], xmm2); + if (cycles>=1) xMOVAPS (xmm1, xmm0); + if (cycles>=1) xPSRL.D (xmm1, 16); + if (cycles>=1) convertRGB(); + if (cycles>=1) xMOVAPS (ptr32[ecx+0x10], xmm2); + if (cycles>=2) xPSHUF.D (xmm1, xmm0, _v1); + if (cycles>=2) convertRGB(); + if (cycles>=2) xMOVAPS (ptr32[ecx+0x20], xmm2); + xRET(); + HostSys::MemProtect(nVifUpk, sizeof(nVifUpk), Protect_ReadOnly, true); +}