mirror of https://github.com/PCSX2/pcsx2.git
VIF: Some optimizations for the VIF Rec, some small clean-up/optimizations for VIF itself.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5368 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
68a833f4e7
commit
f19c0b7ca9
|
@ -257,12 +257,14 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
|
||||||
vu1Thread.WriteMicroMem(addr, (u8*)data, size*4);
|
vu1Thread.WriteMicroMem(addr, (u8*)data, size*4);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (memcmp_mmx(VUx.Micro + addr, data, size*4)) {
|
//The compare is pretty much a waste of time; the likelihood is that the program isn't there, which is why it's being copied.
|
||||||
|
//Faster without.
|
||||||
|
//if (memcmp_mmx(VUx.Micro + addr, data, size*4)) {
|
||||||
// Clear VU memory before writing!
|
// Clear VU memory before writing!
|
||||||
if (!idx) CpuVU0->Clear(addr, size*4);
|
if (!idx) CpuVU0->Clear(addr, size*4);
|
||||||
else CpuVU1->Clear(addr, size*4);
|
else CpuVU1->Clear(addr, size*4);
|
||||||
memcpy_fast(VUx.Micro + addr, data, size*4);
|
memcpy_aligned(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
vifOp(vifCode_MPG) {
|
vifOp(vifCode_MPG) {
|
||||||
|
@ -381,14 +383,6 @@ vifOp(vifCode_Nop) {
|
||||||
pass1 {
|
pass1 {
|
||||||
GetVifX.cmd = 0;
|
GetVifX.cmd = 0;
|
||||||
GetVifX.pass = 0;
|
GetVifX.pass = 0;
|
||||||
/*if(idx && vif1ch.chcr.STR == true)
|
|
||||||
{
|
|
||||||
//Some games use a huge stream of NOPS to wait for a GIF packet to start, alas the way PCSX2 works it never starts
|
|
||||||
//So the mask can go on before the packet continues, causing desync.
|
|
||||||
|
|
||||||
if(((data[1] >> 24) & 0x7f) == 0x6) //Look in to the future and see if we have a mask path 3 command (NFSU)
|
|
||||||
GetVifX.vifstalled = true; //Stall if we do to get the timing right.
|
|
||||||
}*/
|
|
||||||
}
|
}
|
||||||
pass3 { VifCodeLog("Nop"); }
|
pass3 { VifCodeLog("Nop"); }
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -22,34 +22,12 @@
|
||||||
// VifCode Transfer Interpreter (Vif0/Vif1)
|
// VifCode Transfer Interpreter (Vif0/Vif1)
|
||||||
//------------------------------------------------------------------
|
//------------------------------------------------------------------
|
||||||
|
|
||||||
// Doesn't stall if the next vifCode is the Mark command
|
|
||||||
_vifT bool runMark(u32* &data) {
|
|
||||||
if (((vifXRegs.code >> 24) & 0x7f) == 0x7) {
|
|
||||||
//DevCon.WriteLn("Vif%d: Running Mark with I-bit", idx);
|
|
||||||
return 1; // No Stall?
|
|
||||||
}
|
|
||||||
return 1; // Stall
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns 1 if i-bit && finished vifcode && i-bit not masked
|
|
||||||
_vifT bool analyzeIbit(u32* &data, int iBit) {
|
|
||||||
vifStruct& vifX = GetVifX;
|
|
||||||
if (iBit && !vifX.cmd && !vifXRegs.err.MII) {
|
|
||||||
//DevCon.WriteLn("Vif I-Bit IRQ");
|
|
||||||
vifX.irq++;
|
|
||||||
|
|
||||||
if(CHECK_VIF1STALLHACK) return 0;
|
|
||||||
else return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Interprets packet
|
// Interprets packet
|
||||||
_vifT void vifTransferLoop(u32* &data) {
|
_vifT void vifTransferLoop(u32* &data) {
|
||||||
vifStruct& vifX = GetVifX;
|
vifStruct& vifX = GetVifX;
|
||||||
|
|
||||||
u32& pSize = vifX.vifpacketsize;
|
u32& pSize = vifX.vifpacketsize;
|
||||||
int iBit = vifX.cmd >> 7;
|
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
vifXRegs.stat.VPS |= VPS_TRANSFERRING;
|
vifXRegs.stat.VPS |= VPS_TRANSFERRING;
|
||||||
|
@ -59,9 +37,17 @@ _vifT void vifTransferLoop(u32* &data) {
|
||||||
|
|
||||||
if(!vifX.cmd) { // Get new VifCode
|
if(!vifX.cmd) { // Get new VifCode
|
||||||
|
|
||||||
|
if(!vifXRegs.err.MII)
|
||||||
|
{
|
||||||
|
if(vifX.irq && !CHECK_VIF1STALLHACK)
|
||||||
|
break;
|
||||||
|
|
||||||
|
vifX.irq = data[0] >> 31;
|
||||||
|
}
|
||||||
|
|
||||||
vifXRegs.code = data[0];
|
vifXRegs.code = data[0];
|
||||||
vifX.cmd = data[0] >> 24;
|
vifX.cmd = data[0] >> 24;
|
||||||
iBit = data[0] >> 31;
|
|
||||||
|
|
||||||
//VIF_LOG("New VifCMD %x tagsize %x", vifX.cmd, vifX.tag.size);
|
//VIF_LOG("New VifCMD %x tagsize %x", vifX.cmd, vifX.tag.size);
|
||||||
if (IsDevBuild && SysTrace.EE.VIFcode.IsActive()) {
|
if (IsDevBuild && SysTrace.EE.VIFcode.IsActive()) {
|
||||||
|
@ -73,10 +59,7 @@ _vifT void vifTransferLoop(u32* &data) {
|
||||||
ret = vifCmdHandler[idx][vifX.cmd & 0x7f](vifX.pass, data);
|
ret = vifCmdHandler[idx][vifX.cmd & 0x7f](vifX.pass, data);
|
||||||
data += ret;
|
data += ret;
|
||||||
pSize -= ret;
|
pSize -= ret;
|
||||||
if (analyzeIbit<idx>(data, iBit)) break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pSize) vifX.vifstalled = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_vifT static __fi bool vifTransfer(u32 *data, int size, bool TTE) {
|
_vifT static __fi bool vifTransfer(u32 *data, int size, bool TTE) {
|
||||||
|
|
|
@ -75,12 +75,14 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
|
||||||
const int idx = v.idx;
|
const int idx = v.idx;
|
||||||
const vifStruct& vif = MTVU_VifX;
|
const vifStruct& vif = MTVU_VifX;
|
||||||
|
|
||||||
u32 m0 = vB.mask;
|
//This could have ended up copying the row when there was no row to write.1810080
|
||||||
u32 m1 = m0 & 0xaaaaaaaa;
|
u32 m0 = vB.mask; //The actual mask example 0x03020100
|
||||||
u32 m2 =(~m1>>1) & m0;
|
u32 m3 = ((m0 & 0xaaaaaaaa)>>1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge)
|
||||||
u32 m3 = (m1>>1) & ~m0;
|
u32 m2 = (m0 & 0x55555555) & (~m0>>1); // 0x1000100 & 0xFE7EFF7F = 0x00000100 Just the row
|
||||||
if((m2&&doMask)||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); }
|
|
||||||
|
if((m2&&doMask)||doMode) { xMOVAPS(xmmRow, ptr128[&vif.MaskRow]); MSKPATH3_LOG("Moving row");}
|
||||||
if (m3&&doMask) {
|
if (m3&&doMask) {
|
||||||
|
MSKPATH3_LOG("Merging Cols");
|
||||||
xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
|
xMOVAPS(xmmCol0, ptr128[&vif.MaskCol]);
|
||||||
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
|
if ((cS>=2) && (m3&0x0000ff00)) xPSHUF.D(xmmCol1, xmmCol0, _v1);
|
||||||
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
|
if ((cS>=3) && (m3&0x00ff0000)) xPSHUF.D(xmmCol2, xmmCol0, _v2);
|
||||||
|
@ -92,33 +94,37 @@ __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const {
|
||||||
|
|
||||||
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
||||||
pxAssertDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
|
pxAssertDev(regX.Id <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking.");
|
||||||
xRegisterSSE t = regX == xmm0 ? xmm1 : xmm0; // Get Temp Reg
|
|
||||||
int cc = aMin(vCL, 3);
|
int cc = aMin(vCL, 3);
|
||||||
u32 m0 = (vB.mask >> (cc * 8)) & 0xff;
|
u32 m0 = (vB.mask >> (cc * 8)) & 0xff; //The actual mask example 0xE4 (protect, col, row, clear)
|
||||||
u32 m1 = m0 & 0xaa;
|
u32 m3 = ((m0 & 0xaa)>>1) & ~m0; //all the upper bits (cols shifted right) cancelling out any write protects 0x10
|
||||||
u32 m2 =(~m1>>1) & m0;
|
u32 m2 = (m0 & 0x55) & (~m0>>1); // all the lower bits (rows)cancelling out any write protects 0x04
|
||||||
u32 m3 = (m1>>1) & ~m0;
|
u32 m4 = (m0 & ~((m3<<1) | m2)) & 0x55; // = 0xC0 & 0x55 = 0x40 (for merge mask)
|
||||||
u32 m4 = (m1>>1) & m0;
|
|
||||||
makeMergeMask(m2);
|
makeMergeMask(m2);
|
||||||
makeMergeMask(m3);
|
makeMergeMask(m3);
|
||||||
makeMergeMask(m4);
|
makeMergeMask(m4);
|
||||||
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]); } // Load Write Protect
|
|
||||||
if (doMask&&m2) { mergeVectors(regX, xmmRow, t, m2); } // Merge MaskRow
|
if (doMask&&m2) { mergeVectors(regX, xmmRow, xmmTemp, m2); } // Merge MaskRow
|
||||||
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), t, m3); } // Merge MaskCol
|
if (doMask&&m3) { mergeVectors(regX, xRegisterSSE(xmmCol0.Id+cc), xmmTemp, m3); } // Merge MaskCol
|
||||||
if (doMask&&m4) { mergeVectors(regX, xmmTemp, t, m4); } // Merge Write Protect
|
if (doMask&&m4) { xMOVAPS(xmmTemp, ptr[dstIndirect]);
|
||||||
|
mergeVectors(regX, xmmTemp, xmmTemp, m4); } // Merge Write Protect
|
||||||
if (doMode) {
|
if (doMode) {
|
||||||
u32 m5 = (~m1>>1) & ~m0;
|
u32 m5 = ~(m2|m3|m4) & 0xf;
|
||||||
|
|
||||||
if (!doMask) m5 = 0xf;
|
if (!doMask) m5 = 0xf;
|
||||||
else makeMergeMask(m5);
|
|
||||||
if (m5 < 0xf) {
|
if (m5 < 0xf)
|
||||||
|
{
|
||||||
xPXOR(xmmTemp, xmmTemp);
|
xPXOR(xmmTemp, xmmTemp);
|
||||||
mergeVectors(xmmTemp, xmmRow, t, m5);
|
mergeVectors(xmmTemp, xmmRow, xmmTemp, m5);
|
||||||
xPADD.D(regX, xmmTemp);
|
xPADD.D(regX, xmmTemp);
|
||||||
if (doMode==2) mergeVectors(xmmRow, regX, t, m5);
|
if (doMode==2) mergeVectors(xmmRow, regX, xmmTemp, m5);
|
||||||
}
|
}
|
||||||
else if (m5 == 0xf) {
|
else
|
||||||
|
{
|
||||||
xPADD.D(regX, xmmRow);
|
xPADD.D(regX, xmmRow);
|
||||||
if (doMode==2) xMOVAPS(xmmRow, regX);
|
if (doMode==2){ xMOVAPS(xmmRow, regX); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
xMOVAPS(ptr32[dstIndirect], regX);
|
xMOVAPS(ptr32[dstIndirect], regX);
|
||||||
|
@ -127,6 +133,7 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const {
|
||||||
void VifUnpackSSE_Dynarec::writeBackRow() const {
|
void VifUnpackSSE_Dynarec::writeBackRow() const {
|
||||||
const int idx = v.idx;
|
const int idx = v.idx;
|
||||||
xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow);
|
xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow);
|
||||||
|
|
||||||
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
|
DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]");
|
||||||
// ToDo: Do we need to write back to vifregs.rX too!? :/
|
// ToDo: Do we need to write back to vifregs.rX too!? :/
|
||||||
}
|
}
|
||||||
|
@ -143,9 +150,39 @@ static void ShiftDisplacementWindow( xAddressVoid& addr, const xRegister32& modR
|
||||||
addImm += 0xf0;
|
addImm += 0xf0;
|
||||||
addr -= 0xf0;
|
addr -= 0xf0;
|
||||||
}
|
}
|
||||||
if(addImm) xADD(modReg, addImm);
|
if(addImm) { xADD(modReg, addImm); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VifUnpackSSE_Dynarec::ModUnpack( int upknum, bool PostOp )
|
||||||
|
{
|
||||||
|
|
||||||
|
switch( upknum )
|
||||||
|
{
|
||||||
|
case 0:
|
||||||
|
case 1:
|
||||||
|
case 2: UnpkNoOfIterations = 4; if(PostOp == true) { UnpkLoopIteration++; UnpkLoopIteration = UnpkLoopIteration % UnpkNoOfIterations; } break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
case 5:
|
||||||
|
case 6: UnpkNoOfIterations = 2; if(PostOp == true) { UnpkLoopIteration++; UnpkLoopIteration = UnpkLoopIteration % UnpkNoOfIterations; } break;
|
||||||
|
|
||||||
|
case 8: break;
|
||||||
|
case 9: break;
|
||||||
|
case 10: break;
|
||||||
|
|
||||||
|
case 12: break;
|
||||||
|
case 13: break;
|
||||||
|
case 14: break;
|
||||||
|
case 15: break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
case 7:
|
||||||
|
case 11:
|
||||||
|
pxFailRel( wxsFormat( L"Vpu/Vif - Invalid Unpack! [%d]", upknum ) );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||||
const int upkNum = vB.upkType & 0xf;
|
const int upkNum = vB.upkType & 0xf;
|
||||||
const u8& vift = nVifT[upkNum];
|
const u8& vift = nVifT[upkNum];
|
||||||
|
@ -155,29 +192,32 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||||
|
|
||||||
uint vNum = vB.num ? vB.num : 256;
|
uint vNum = vB.num ? vB.num : 256;
|
||||||
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
|
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
|
||||||
|
MSKPATH3_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum);
|
||||||
|
|
||||||
pxAssume(vCL == 0);
|
pxAssume(vCL == 0);
|
||||||
|
UnpkLoopIteration = 0;
|
||||||
// Value passed determines # of col regs we need to load
|
// Value passed determines # of col regs we need to load
|
||||||
SetMasks(isFill ? blockSize : cycleSize);
|
SetMasks(isFill ? blockSize : cycleSize);
|
||||||
|
|
||||||
while (vNum) {
|
while (vNum) {
|
||||||
|
|
||||||
ShiftDisplacementWindow( srcIndirect, edx );
|
|
||||||
ShiftDisplacementWindow( dstIndirect, ecx );
|
ShiftDisplacementWindow( dstIndirect, ecx );
|
||||||
|
|
||||||
|
if(UnpkNoOfIterations == 0)
|
||||||
|
ShiftDisplacementWindow( srcIndirect, edx ); //Don't need to do this otherwise, as we aren't reading the source.
|
||||||
|
|
||||||
|
|
||||||
if (vCL < cycleSize) {
|
if (vCL < cycleSize) {
|
||||||
|
ModUnpack(upkNum, false);
|
||||||
xUnpack(upkNum);
|
xUnpack(upkNum);
|
||||||
xMovDest();
|
xMovDest();
|
||||||
|
ModUnpack(upkNum, true);
|
||||||
|
|
||||||
|
|
||||||
dstIndirect += 16;
|
dstIndirect += 16;
|
||||||
srcIndirect += vift;
|
srcIndirect += vift;
|
||||||
|
|
||||||
if( IsUnmaskedOp() ) {
|
|
||||||
++destReg;
|
|
||||||
++workReg;
|
|
||||||
}
|
|
||||||
|
|
||||||
vNum--;
|
vNum--;
|
||||||
if (++vCL == blockSize) vCL = 0;
|
if (++vCL == blockSize) vCL = 0;
|
||||||
}
|
}
|
||||||
|
@ -189,11 +229,6 @@ void VifUnpackSSE_Dynarec::CompileRoutine() {
|
||||||
|
|
||||||
dstIndirect += 16;
|
dstIndirect += 16;
|
||||||
|
|
||||||
if( IsUnmaskedOp() ) {
|
|
||||||
++destReg;
|
|
||||||
++workReg;
|
|
||||||
}
|
|
||||||
|
|
||||||
vNum--;
|
vNum--;
|
||||||
if (++vCL == blockSize) vCL = 0;
|
if (++vCL == blockSize) vCL = 0;
|
||||||
}
|
}
|
||||||
|
@ -256,7 +291,7 @@ _vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill)
|
||||||
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
//DevCon.WriteLn("Running Interpreter Block");
|
DevCon.WriteLn("Running Interpreter Block");
|
||||||
_nVifUnpack(idx, data, vifRegs.mode, isFill);
|
_nVifUnpack(idx, data, vifRegs.mode, isFill);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) {
|
||||||
|
|
||||||
if (ret == vif.tag.size) { // Full Transfer
|
if (ret == vif.tag.size) { // Full Transfer
|
||||||
if (v.bSize) { // Last transfer was partial
|
if (v.bSize) { // Last transfer was partial
|
||||||
memcpy_fast(&v.buffer[v.bSize], data, size);
|
memcpy_aligned(&v.buffer[v.bSize], data, size);
|
||||||
v.bSize += size;
|
v.bSize += size;
|
||||||
size = v.bSize;
|
size = v.bSize;
|
||||||
data = v.buffer;
|
data = v.buffer;
|
||||||
|
@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) {
|
||||||
v.bSize = 0;
|
v.bSize = 0;
|
||||||
}
|
}
|
||||||
else { // Partial Transfer
|
else { // Partial Transfer
|
||||||
memcpy_fast(&v.buffer[v.bSize], data, size);
|
memcpy_aligned(&v.buffer[v.bSize], data, size);
|
||||||
v.bSize += size;
|
v.bSize += size;
|
||||||
vif.tag.size -= ret;
|
vif.tag.size -= ret;
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,6 @@
|
||||||
//static __pagealigned u8 nVifUpkExec[__pagesize*4];
|
//static __pagealigned u8 nVifUpkExec[__pagesize*4];
|
||||||
static RecompiledCodeReserve* nVifUpkExec = NULL;
|
static RecompiledCodeReserve* nVifUpkExec = NULL;
|
||||||
|
|
||||||
|
|
||||||
// Merges xmm vectors without modifying source reg
|
// Merges xmm vectors without modifying source reg
|
||||||
void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
|
void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xyzw) {
|
||||||
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|
if (x86caps.hasStreamingSIMD4Extensions || (xyzw==15)
|
||||||
|
@ -33,7 +32,7 @@ void mergeVectors(xRegisterSSE dest, xRegisterSSE src, xRegisterSSE temp, int xy
|
||||||
mVUmergeRegs(dest, src, xyzw);
|
mVUmergeRegs(dest, src, xyzw);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
xMOVAPS(temp, src);
|
if(temp != src) xMOVAPS(temp, src); //Sometimes we don't care if the source is modified and is temp reg.
|
||||||
mVUmergeRegs(dest, temp, xyzw);
|
mVUmergeRegs(dest, temp, xyzw);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -70,37 +69,87 @@ void VifUnpackSSE_Base::xPMOVXX16(const xRegisterSSE& regX) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_S_32() const {
|
void VifUnpackSSE_Base::xUPK_S_32() const {
|
||||||
xMOV32 (workReg, ptr32[srcIndirect]);
|
|
||||||
xPSHUF.D (destReg, workReg, _v0);
|
switch(UnpkLoopIteration)
|
||||||
|
{
|
||||||
|
case 0:
|
||||||
|
xMOV128 (workReg, ptr32[srcIndirect]);
|
||||||
|
xPSHUF.D (destReg, workReg, _v0);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
xPSHUF.D (destReg, workReg, _v1);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
xPSHUF.D (destReg, workReg, _v2);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
xPSHUF.D (destReg, workReg, _v3);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_S_16() const {
|
void VifUnpackSSE_Base::xUPK_S_16() const {
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
|
||||||
|
if (!x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX16 (workReg);
|
xMOV16 (workReg, ptr32[srcIndirect]);
|
||||||
|
xPUNPCK.LWD(workReg, workReg);
|
||||||
|
xShiftR (workReg, 16);
|
||||||
|
|
||||||
|
xPSHUF.D (destReg, workReg, _v0);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
switch(UnpkLoopIteration)
|
||||||
{
|
{
|
||||||
xMOV16 (workReg, ptr32[srcIndirect]);
|
case 0:
|
||||||
xPUNPCK.LWD(workReg, workReg);
|
xPMOVXX16 (workReg);
|
||||||
xShiftR (workReg, 16);
|
xPSHUF.D (destReg, workReg, _v0);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
xPSHUF.D (destReg, workReg, _v1);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
xPSHUF.D (destReg, workReg, _v2);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
xPSHUF.D (destReg, workReg, _v3);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
xPSHUF.D (destReg, workReg, _v0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_S_8() const {
|
void VifUnpackSSE_Base::xUPK_S_8() const {
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
|
||||||
{
|
if (!x86caps.hasStreamingSIMD4Extensions)
|
||||||
xPMOVXX8 (workReg);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
xMOV8 (workReg, ptr32[srcIndirect]);
|
xMOV8 (workReg, ptr32[srcIndirect]);
|
||||||
xPUNPCK.LBW(workReg, workReg);
|
xPUNPCK.LBW(workReg, workReg);
|
||||||
xPUNPCK.LWD(workReg, workReg);
|
xPUNPCK.LWD(workReg, workReg);
|
||||||
xShiftR (workReg, 24);
|
xShiftR (workReg, 24);
|
||||||
|
|
||||||
|
xPSHUF.D (destReg, workReg, _v0);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
xPSHUF.D (destReg, workReg, _v0);
|
|
||||||
|
switch(UnpkLoopIteration)
|
||||||
|
{
|
||||||
|
case 0:
|
||||||
|
xPMOVXX8 (workReg);
|
||||||
|
xPSHUF.D (destReg, workReg, _v0);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
xPSHUF.D (destReg, workReg, _v1);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
xPSHUF.D (destReg, workReg, _v2);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
xPSHUF.D (destReg, workReg, _v3);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate".
|
// The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate".
|
||||||
|
@ -109,44 +158,75 @@ void VifUnpackSSE_Base::xUPK_S_8() const {
|
||||||
// I have commented after each shuffle to show what data is going where - Ref
|
// I have commented after each shuffle to show what data is going where - Ref
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V2_32() const {
|
void VifUnpackSSE_Base::xUPK_V2_32() const {
|
||||||
xMOV64 (destReg, ptr32[srcIndirect]);
|
|
||||||
xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0
|
if(UnpkLoopIteration == 0)
|
||||||
|
{
|
||||||
|
xMOV128 (workReg, ptr32[srcIndirect]);
|
||||||
|
xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V2_16() const {
|
void VifUnpackSSE_Base::xUPK_V2_16() const {
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
|
||||||
|
if(UnpkLoopIteration == 0 || !x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX16 (destReg);
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
|
{
|
||||||
|
xPMOVXX16 (workReg);
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
xMOV32 (workReg, ptr32[srcIndirect]);
|
||||||
|
xPUNPCK.LWD(workReg, workReg);
|
||||||
|
xShiftR (workReg, 16);
|
||||||
|
}
|
||||||
|
xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
xMOV32 (destReg, ptr32[srcIndirect]);
|
xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2
|
||||||
xPUNPCK.LWD(destReg, destReg);
|
|
||||||
xShiftR (destReg, 16);
|
|
||||||
}
|
}
|
||||||
xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V2_8() const {
|
void VifUnpackSSE_Base::xUPK_V2_8() const {
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
|
||||||
|
if(UnpkLoopIteration == 0 || !x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX8 (destReg);
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
|
{
|
||||||
|
xPMOVXX8 (workReg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
xMOV16 (workReg, ptr32[srcIndirect]);
|
||||||
|
xPUNPCK.LBW(workReg, workReg);
|
||||||
|
xPUNPCK.LWD(workReg, workReg);
|
||||||
|
xShiftR (workReg, 24);
|
||||||
|
}
|
||||||
|
xPSHUF.D (destReg, workReg, 0x44); //v1v0v1v0
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
xMOV16 (destReg, ptr32[srcIndirect]);
|
xPSHUF.D (destReg, workReg, 0xEE); //v3v2v3v2
|
||||||
xPUNPCK.LBW(destReg, destReg);
|
|
||||||
xPUNPCK.LWD(destReg, destReg);
|
|
||||||
xShiftR (destReg, 24);
|
|
||||||
}
|
}
|
||||||
xPSHUF.D (destReg, destReg, 0x44); //v1v0v1v0
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V3_32() const {
|
void VifUnpackSSE_Base::xUPK_V3_32() const {
|
||||||
|
|
||||||
xMOV128 (destReg, ptr128[srcIndirect]);
|
xMOV128 (destReg, ptr128[srcIndirect]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V3_16() const {
|
void VifUnpackSSE_Base::xUPK_V3_16() const {
|
||||||
|
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX16 (destReg);
|
xPMOVXX16 (destReg);
|
||||||
|
@ -160,6 +240,7 @@ void VifUnpackSSE_Base::xUPK_V3_16() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V3_8() const {
|
void VifUnpackSSE_Base::xUPK_V3_8() const {
|
||||||
|
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX8 (destReg);
|
xPMOVXX8 (destReg);
|
||||||
|
@ -174,10 +255,12 @@ void VifUnpackSSE_Base::xUPK_V3_8() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V4_32() const {
|
void VifUnpackSSE_Base::xUPK_V4_32() const {
|
||||||
|
|
||||||
xMOV128 (destReg, ptr32[srcIndirect]);
|
xMOV128 (destReg, ptr32[srcIndirect]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V4_16() const {
|
void VifUnpackSSE_Base::xUPK_V4_16() const {
|
||||||
|
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX16 (destReg);
|
xPMOVXX16 (destReg);
|
||||||
|
@ -191,6 +274,7 @@ void VifUnpackSSE_Base::xUPK_V4_16() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V4_8() const {
|
void VifUnpackSSE_Base::xUPK_V4_8() const {
|
||||||
|
|
||||||
if (x86caps.hasStreamingSIMD4Extensions)
|
if (x86caps.hasStreamingSIMD4Extensions)
|
||||||
{
|
{
|
||||||
xPMOVXX8 (destReg);
|
xPMOVXX8 (destReg);
|
||||||
|
@ -205,6 +289,7 @@ void VifUnpackSSE_Base::xUPK_V4_8() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VifUnpackSSE_Base::xUPK_V4_5() const {
|
void VifUnpackSSE_Base::xUPK_V4_5() const {
|
||||||
|
|
||||||
xMOV16 (workReg, ptr32[srcIndirect]);
|
xMOV16 (workReg, ptr32[srcIndirect]);
|
||||||
xPSHUF.D (workReg, workReg, _v0);
|
xPSHUF.D (workReg, workReg, _v0);
|
||||||
xPSLL.D (workReg, 3); // ABG|R5.000
|
xPSLL.D (workReg, 3); // ABG|R5.000
|
||||||
|
|
|
@ -34,6 +34,9 @@ class VifUnpackSSE_Base
|
||||||
public:
|
public:
|
||||||
bool usn; // unsigned flag
|
bool usn; // unsigned flag
|
||||||
bool doMask; // masking write enable flag
|
bool doMask; // masking write enable flag
|
||||||
|
int UnpkLoopIteration;
|
||||||
|
int UnpkNoOfIterations;
|
||||||
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
xAddressVoid dstIndirect;
|
xAddressVoid dstIndirect;
|
||||||
|
@ -126,8 +129,10 @@ public:
|
||||||
|
|
||||||
virtual bool IsUnmaskedOp() const{ return !doMode && !doMask; }
|
virtual bool IsUnmaskedOp() const{ return !doMode && !doMask; }
|
||||||
|
|
||||||
|
void ModUnpack( int upknum, bool PostOp );
|
||||||
void CompileRoutine();
|
void CompileRoutine();
|
||||||
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void doMaskWrite(const xRegisterSSE& regX) const;
|
virtual void doMaskWrite(const xRegisterSSE& regX) const;
|
||||||
void SetMasks(int cS) const;
|
void SetMasks(int cS) const;
|
||||||
|
|
Loading…
Reference in New Issue