x86/microVU: Pack VF cycles into bitfields

The VF cycle count never goes above 4, so each per-component counter fits in
a 4-bit field. Across 32 registers that saves 64 bytes.

Also gets rid of blockhasmbit: it was never used, and save states are being
invalidated by this change anyway.

[SAVEVERSION+] VU struct changes.
This commit is contained in:
Stenzek 2023-04-27 15:02:36 +10:00 committed by refractionpcsx2
parent d3e527f2a4
commit cd9b6c7ac3
9 changed files with 56 additions and 106 deletions

View File

@ -36,7 +36,7 @@ enum class FreezeAction
// [SAVEVERSION+]
// This informs the auto updater that the users savestates will be invalidated.
static const u32 g_SaveVersion = (0x9A35 << 16) | 0x0000;
static const u32 g_SaveVersion = (0x9A36 << 16) | 0x0000;
// the freezing data between submodules and core

View File

@ -157,7 +157,6 @@ struct alignas(16) VURegs
u32 ebit;
u32 pending_q;
u32 pending_p;
u32 blockhasmbit;
alignas(16) u32 micro_macflags[4];
alignas(16) u32 micro_clipflags[4];

View File

@ -54,7 +54,6 @@ static void _vu0Exec(VURegs* VU)
if (ptr[1] & 0x20000000 && VU == &VU0) // M flag
{
VU->flags |= VUFLAG_MFLAGSET;
VU0.blockhasmbit = true;
// Console.WriteLn("fixme: M flag set");
}
if (ptr[1] & 0x10000000) // D flag
@ -185,8 +184,6 @@ static void _vu0Exec(VURegs* VU)
{
VU->VI[REG_TPC].UL = VU->branchpc;
VU->blockhasmbit = false;
if (VU->takedelaybranch)
{
DevCon.Warning("VU0 - Branch/Jump in Delay Slot");
@ -205,8 +202,6 @@ static void _vu0Exec(VURegs* VU)
_vuFlushAll(VU);
VU0.VI[REG_VPU_STAT].UL &= ~0x1; /* E flag */
vif0Regs.stat.VEW = false;
VU->blockhasmbit = false;
}
}

View File

@ -109,7 +109,6 @@ void SaveStateBase::vuMicroFreeze()
Freeze(VU0.ebit);
Freeze(VU0.pending_q);
Freeze(VU0.pending_p);
Freeze(VU0.blockhasmbit);
Freeze(VU0.micro_macflags);
Freeze(VU0.micro_clipflags);
Freeze(VU0.micro_statusflags);
@ -149,7 +148,6 @@ void SaveStateBase::vuMicroFreeze()
Freeze(VU1.ebit);
Freeze(VU1.pending_q);
Freeze(VU1.pending_p);
Freeze(VU1.blockhasmbit);
Freeze(VU1.micro_macflags);
Freeze(VU1.micro_clipflags);
Freeze(VU1.micro_statusflags);

View File

@ -164,7 +164,7 @@ public:
{
u32 viCRC = 0, vfCRC = 0, crc = 0, z = sizeof(microRegInfo) / 4;
for (u32 j = 0; j < 4; j++) viCRC -= ((u32*)linkI->block.pState.VI)[j];
for (u32 j = 0; j < 32; j++) vfCRC -= linkI->block.pState.VF[j].reg;
// Debug checksum over the VF cycle counters: pack each register's four
// components into one word (x | y<<8 | z<<16 | w<<24) and subtract it.
// The last term must be .w; using .x twice would leave w out of the CRC.
for (u32 j = 0; j < 32; j++)
{
	const auto& vf = linkI->block.pState.VF[j];
	vfCRC -= vf.x + (vf.y << 8) + (vf.z << 16) + (vf.w << 24);
}
for (u32 j = 0; j < z; j++) crc -= ((u32*)&linkI->block.pState)[j];
DevCon.WriteLn(Color_Green,
"[%04x][Block #%d][crc=%08x][q=%02d][p=%02d][xgkick=%d][vi15=%04x][vi15v=%d][viBackup=%02d]"

View File

@ -314,9 +314,9 @@ __ri void eBitWarning(mV)
//------------------------------------------------------------------
// Cycles / Pipeline State / Early Exit from Execution
//------------------------------------------------------------------
__fi void optimizeReg(u8& rState) { rState = (rState == 1) ? 0 : rState; }
__fi void calcCycles(u8& reg, u8 x) { reg = ((reg > x) ? (reg - x) : 0); }
__fi void tCycles(u8& dest, u8& src) { dest = std::max(dest, src); }
// Returns 0 when rState is exactly 1; any other value is returned unchanged.
// The result is returned by value so callers can assign it back to the field.
__fi u8 optimizeReg(u8 rState)
{
	if (rState == 1)
		return 0;
	return rState;
}
// Returns reg reduced by x, clamped at zero (0 whenever reg <= x).
// The result is returned by value so callers can assign it back to the field.
__fi u8 calcCycles(u8 reg, u8 x)
{
	if (reg <= x)
		return 0;
	return reg - x;
}
// Returns the larger of dest and src.
// The result is returned by value so callers can assign it back to the field.
__fi u8 tCycles(u8 dest, u8 src)
{
	return (dest < src) ? src : dest;
}
// Flips the lowest bit of mVU.p.
__fi void incP(mV)
{
	mVU.p = mVU.p ^ 1;
}
// Flips the lowest bit of mVU.q.
__fi void incQ(mV)
{
	mVU.q = mVU.q ^ 1;
}
@ -328,17 +328,17 @@ void mVUoptimizePipeState(mV)
{
for (int i = 0; i < 32; i++)
{
optimizeReg(mVUregs.VF[i].x);
optimizeReg(mVUregs.VF[i].y);
optimizeReg(mVUregs.VF[i].z);
optimizeReg(mVUregs.VF[i].w);
mVUregs.VF[i].x = optimizeReg(mVUregs.VF[i].x);
mVUregs.VF[i].y = optimizeReg(mVUregs.VF[i].y);
mVUregs.VF[i].z = optimizeReg(mVUregs.VF[i].z);
mVUregs.VF[i].w = optimizeReg(mVUregs.VF[i].w);
}
for (int i = 0; i < 16; i++)
{
optimizeReg(mVUregs.VI[i]);
mVUregs.VI[i] = optimizeReg(mVUregs.VI[i]);
}
if (mVUregs.q) { optimizeReg(mVUregs.q); if (!mVUregs.q) { incQ(mVU); } }
if (mVUregs.p) { optimizeReg(mVUregs.p); if (!mVUregs.p) { incP(mVU); } }
if (mVUregs.q) { mVUregs.q = optimizeReg(mVUregs.q); if (!mVUregs.q) { incQ(mVU); } }
if (mVUregs.p) { mVUregs.p = optimizeReg(mVUregs.p); if (!mVUregs.p) { incP(mVU); } }
mVUregs.r = 0; // There are no stalls on the R-reg, so its Safe to discard info
}
@ -348,21 +348,21 @@ void mVUincCycles(mV, int x)
// VF[0] is a constant value (0.0 0.0 0.0 1.0)
for (int z = 31; z > 0; z--)
{
calcCycles(mVUregs.VF[z].x, x);
calcCycles(mVUregs.VF[z].y, x);
calcCycles(mVUregs.VF[z].z, x);
calcCycles(mVUregs.VF[z].w, x);
mVUregs.VF[z].x = calcCycles(mVUregs.VF[z].x, x);
mVUregs.VF[z].y = calcCycles(mVUregs.VF[z].y, x);
mVUregs.VF[z].z = calcCycles(mVUregs.VF[z].z, x);
mVUregs.VF[z].w = calcCycles(mVUregs.VF[z].w, x);
}
// VI[0] is a constant value (0)
for (int z = 15; z > 0; z--)
{
calcCycles(mVUregs.VI[z], x);
mVUregs.VI[z] = calcCycles(mVUregs.VI[z], x);
}
if (mVUregs.q)
{
if (mVUregs.q > 4)
{
calcCycles(mVUregs.q, x);
mVUregs.q = calcCycles(mVUregs.q, x);
if (mVUregs.q <= 4)
{
mVUinfo.doDivFlag = 1;
@ -370,27 +370,27 @@ void mVUincCycles(mV, int x)
}
else
{
calcCycles(mVUregs.q, x);
mVUregs.q = calcCycles(mVUregs.q, x);
}
if (!mVUregs.q)
incQ(mVU);
}
if (mVUregs.p)
{
calcCycles(mVUregs.p, x);
mVUregs.p = calcCycles(mVUregs.p, x);
if (!mVUregs.p || mVUregsTemp.p)
incP(mVU);
}
if (mVUregs.xgkick)
{
calcCycles(mVUregs.xgkick, x);
mVUregs.xgkick = calcCycles(mVUregs.xgkick, x);
if (!mVUregs.xgkick)
{
mVUinfo.doXGKICK = 1;
mVUinfo.XGKICKPC = xPC;
}
}
calcCycles(mVUregs.r, x);
mVUregs.r = calcCycles(mVUregs.r, x);
}
// Helps check if upper/lower ops read/write to same regs...
@ -430,21 +430,21 @@ void mVUsetCycles(mV)
cmpVFregs(mVUlow.VF_write, mVUup.VF_read[1], mVUinfo.backupVF);
}
tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].x, mVUregsTemp.VF[0].x);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].y, mVUregsTemp.VF[0].y);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].z, mVUregsTemp.VF[0].z);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].w, mVUregsTemp.VF[0].w);
mVUregs.VF[mVUregsTemp.VFreg[0]].x = tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].x, mVUregsTemp.VF[0].x);
mVUregs.VF[mVUregsTemp.VFreg[0]].y = tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].y, mVUregsTemp.VF[0].y);
mVUregs.VF[mVUregsTemp.VFreg[0]].z = tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].z, mVUregsTemp.VF[0].z);
mVUregs.VF[mVUregsTemp.VFreg[0]].w = tCycles(mVUregs.VF[mVUregsTemp.VFreg[0]].w, mVUregsTemp.VF[0].w);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].x, mVUregsTemp.VF[1].x);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].y, mVUregsTemp.VF[1].y);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].z, mVUregsTemp.VF[1].z);
tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].w, mVUregsTemp.VF[1].w);
mVUregs.VF[mVUregsTemp.VFreg[1]].x = tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].x, mVUregsTemp.VF[1].x);
mVUregs.VF[mVUregsTemp.VFreg[1]].y = tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].y, mVUregsTemp.VF[1].y);
mVUregs.VF[mVUregsTemp.VFreg[1]].z = tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].z, mVUregsTemp.VF[1].z);
mVUregs.VF[mVUregsTemp.VFreg[1]].w = tCycles(mVUregs.VF[mVUregsTemp.VFreg[1]].w, mVUregsTemp.VF[1].w);
tCycles(mVUregs.VI[mVUregsTemp.VIreg], mVUregsTemp.VI);
tCycles(mVUregs.q, mVUregsTemp.q);
tCycles(mVUregs.p, mVUregsTemp.p);
tCycles(mVUregs.r, mVUregsTemp.r);
tCycles(mVUregs.xgkick, mVUregsTemp.xgkick);
mVUregs.VI[mVUregsTemp.VIreg] = tCycles(mVUregs.VI[mVUregsTemp.VIreg], mVUregsTemp.VI);
mVUregs.q = tCycles(mVUregs.q, mVUregsTemp.q);
mVUregs.p = tCycles(mVUregs.p, mVUregsTemp.p);
mVUregs.r = tCycles(mVUregs.r, mVUregsTemp.r);
mVUregs.xgkick = tCycles(mVUregs.xgkick, mVUregsTemp.xgkick);
}
// Prints Start/End PC of blocks executed, for debugging...
@ -556,7 +556,6 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr)
mVUregs.blockType = 0;
mVUregs.viBackUp = 0;
mVUregs.flagInfo = 0;
mVUregs.mbitinblock = false;
mVUsFlagHack = CHECK_VU_FLAGHACK;
mVUinitConstValues(mVU);
}
@ -727,7 +726,6 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState)
if ((curI & _Mbit_) && isVU0)
{
mVUregs.mbitinblock = true;
if (xPC > 0)
{
incPC(-2);
@ -850,7 +848,6 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState)
// Fix up vi15 const info for propagation through blocks
mVUregs.vi15 = (doConstProp && mVUconstReg[15].isValid) ? (u16)mVUconstReg[15].regValue : 0;
mVUregs.vi15v = (doConstProp && mVUconstReg[15].isValid) ? 1 : 0;
xMOV(ptr32[&mVU.regs().blockhasmbit], mVUregs.mbitinblock);
mVUsetFlags(mVU, mFC); // Sets Up Flag instances
mVUoptimizePipeState(mVU); // Optimize the End Pipeline State for nicer Block Linking
mVUdebugPrintBlocks(mVU, false); // Prints Start/End PC of blocks executed, for debugging...

View File

@ -232,14 +232,10 @@ void mvuGenerateCopyPipelineState(mV)
xVMOVAPS(ymm0, ptr[rax]);
xVMOVAPS(ymm1, ptr[rax + 32u]);
xVMOVAPS(ymm2, ptr[rax + 64u]);
xVMOVAPS(ymm3, ptr[rax + 96u]);
xVMOVAPS(ymm4, ptr[rax + 128u]);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 96u], ymm3);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 128u], ymm4);
xVZEROUPPER();
}
@ -251,10 +247,6 @@ void mvuGenerateCopyPipelineState(mV)
xMOVAPS(xmm3, ptr[rax + 48u]);
xMOVAPS(xmm4, ptr[rax + 64u]);
xMOVAPS(xmm5, ptr[rax + 80u]);
xMOVAPS(xmm6, ptr[rax + 96u]);
xMOVAPS(xmm7, ptr[rax + 112u]);
xMOVAPS(xmm8, ptr[rax + 128u]);
xMOVAPS(xmm9, ptr[rax + 144u]);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], xmm0);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 16u], xmm1);
@ -262,10 +254,6 @@ void mvuGenerateCopyPipelineState(mV)
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 48u], xmm3);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], xmm4);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 80u], xmm5);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 96u], xmm6);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 112u], xmm7);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 128u], xmm8);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 144u], xmm9);
}
xRET();

View File

@ -17,16 +17,12 @@
#include "microVU.h"
#include <array>
union regInfo
struct regCycleInfo
{
u32 reg;
struct
{
u8 x;
u8 y;
u8 z;
u8 w;
};
u8 x : 4;
u8 y : 4;
u8 z : 4;
u8 w : 4;
};
// microRegInfo is carefully ordered for faster compares. The "important" information is
@ -57,24 +53,24 @@ union alignas(16) microRegInfo
};
u32 xgkickcycles;
u8 mbitinblock;
u8 unused;
u8 vi15v; // 'vi15' constant is valid
u16 vi15; // Constant Prop Info for vi15
struct
{
u8 VI[16];
regInfo VF[32];
regCycleInfo VF[32];
};
};
u128 full128[160 / sizeof(u128)];
u64 full64[160 / sizeof(u64)];
u32 full32[160 / sizeof(u32)];
u128 full128[96 / sizeof(u128)];
u64 full64[96 / sizeof(u64)];
u32 full32[96 / sizeof(u32)];
};
// Note: mVUcustomSearch needs to be updated if this is changed
static_assert(sizeof(microRegInfo) == 160, "microRegInfo was not 160 bytes");
static_assert(sizeof(microRegInfo) == 96, "microRegInfo was not 96 bytes");
struct microProgram;
struct microJumpCache
@ -94,14 +90,14 @@ struct alignas(16) microBlock
struct microTempRegInfo
{
regInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction
u8 VFreg[2]; // Index of the VF reg
u8 VI; // Holds cycle info for Id
u8 VIreg; // Index of the VI reg
u8 q; // Holds cycle info for Q reg
u8 p; // Holds cycle info for P reg
u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
u8 xgkick; // Holds the cycle info for XGkick
regCycleInfo VF[2]; // Holds cycle info for Fd, VF[0] = Upper Instruction, VF[1] = Lower Instruction
u8 VFreg[2]; // Index of the VF reg
u8 VI; // Holds cycle info for Id
u8 VIreg; // Index of the VI reg
u8 q; // Holds cycle info for Q reg
u8 p; // Holds cycle info for P reg
u8 r; // Holds cycle info for R reg (Will never cause stalls, but useful to know if R is modified)
u8 xgkick; // Holds the cycle info for XGkick
};
struct microVFreg

View File

@ -644,22 +644,8 @@ void mVUcustomSearch()
xMOVAPS (xmm2, ptr32[arg1reg + 0x50]);
xPCMP.EQD(xmm2, ptr32[arg2reg + 0x50]);
xPAND (xmm1, xmm2);
xPAND (xmm0, xmm1);
xMOVAPS (xmm2, ptr32[arg1reg + 0x60]);
xPCMP.EQD(xmm2, ptr32[arg2reg + 0x60]);
xMOVAPS (xmm3, ptr32[arg1reg + 0x70]);
xPCMP.EQD(xmm3, ptr32[arg2reg + 0x70]);
xPAND (xmm2, xmm3);
xMOVAPS (xmm3, ptr32[arg1reg + 0x80]);
xPCMP.EQD(xmm3, ptr32[arg2reg + 0x80]);
xMOVAPS (xmm4, ptr32[arg1reg + 0x90]);
xPCMP.EQD(xmm4, ptr32[arg2reg + 0x90]);
xPAND (xmm3, xmm4);
xPAND (xmm0, xmm1);
xPAND (xmm2, xmm3);
xPAND (xmm0, xmm2);
xMOVMSKPS(eax, xmm0);
xXOR(eax, 0xf);
@ -675,20 +661,11 @@ void mVUcustomSearch()
xForwardJNZ8 exitPoint;
xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
xVMOVUPS(ymm2, ptr[arg1reg + 0x60]);
xVPCMP.EQD(ymm2, ymm2, ptr[arg2reg + 0x60]);
xVPAND(ymm0, ymm0, ymm1);
xVMOVUPS(ymm3, ptr[arg1reg + 0x80]);
xVPCMP.EQD(ymm3, ymm3, ptr[arg2reg + 0x80]);
xVPAND(ymm2, ymm2, ymm3);
xVPAND(ymm0, ymm0, ymm2);
xVPMOVMSKB(eax, ymm0);
xNOT(eax);