VU: Improve VU0/EE sync, Implement better M-Bit Handling, Fix VU program handling on VIF

This commit is contained in:
kozarovv 2020-08-09 08:30:24 +02:00 committed by refractionpcsx2
parent 0354e5e710
commit df79a17baa
23 changed files with 404 additions and 133 deletions

View File

@ -29,13 +29,13 @@ using namespace R5900::Interpreter;
void VCALLMS() {
vu0Finish();
vu0ExecMicro(((cpuRegs.code >> 6) & 0x7FFF));
vif0Regs.stat.VEW = false;
//vif0Regs.stat.VEW = false;
}
void VCALLMSR() {
vu0Finish();
vu0ExecMicro(VU0.VI[REG_CMSAR0].US[0]);
vif0Regs.stat.VEW = false;
//vif0Regs.stat.VEW = false;
}
void BC2F()

View File

@ -59,6 +59,7 @@ enum GamefixId
Fix_GoemonTlbMiss,
Fix_ScarfaceIbit,
Fix_CrashTagTeamIbit,
Fix_VU0Kickstart,
GamefixId_COUNT
};
@ -361,7 +362,8 @@ struct Pcsx2Config
FMVinSoftwareHack : 1, // Toggle in and out of software rendering when an FMV runs.
GoemonTlbHack : 1, // Gomeon tlb miss hack. The game need to access unmapped virtual address. Instead to handle it as exception, tlb are preloaded at startup
ScarfaceIbit : 1, // Scarface I bit hack. Needed to stop constant VU recompilation
CrashTagTeamRacingIbit : 1; // Crash Tag Team Racing I bit hack. Needed to stop constant VU recompilation
CrashTagTeamRacingIbit : 1, // Crash Tag Team Racing I bit hack. Needed to stop constant VU recompilation
VU0KickstartHack : 1; // Speed up VU0 at start of program to avoid some VU1 sync issues
BITFIELD_END
GamefixOptions();

View File

@ -267,7 +267,8 @@ const wxChar *const tbl_GamefixNames[] =
L"FMVinSoftware",
L"GoemonTlb",
L"ScarfaceIbit",
L"CrashTagTeamRacingIbit"
L"CrashTagTeamRacingIbit",
L"VU0Kickstart"
};
const __fi wxChar* EnumToString( GamefixId id )
@ -330,7 +331,8 @@ void Pcsx2Config::GamefixOptions::Set( GamefixId id, bool enabled )
case Fix_FMVinSoftware: FMVinSoftwareHack = enabled; break;
case Fix_GoemonTlbMiss: GoemonTlbHack = enabled; break;
case Fix_ScarfaceIbit: ScarfaceIbit = enabled; break;
case Fix_CrashTagTeamIbit: CrashTagTeamRacingIbit = enabled; break;
case Fix_CrashTagTeamIbit: CrashTagTeamRacingIbit = enabled; break;
case Fix_VU0Kickstart: VU0KickstartHack = enabled; break;
jNO_DEFAULT;
}
}
@ -356,7 +358,8 @@ bool Pcsx2Config::GamefixOptions::Get( GamefixId id ) const
case Fix_FMVinSoftware: return FMVinSoftwareHack;
case Fix_GoemonTlbMiss: return GoemonTlbHack;
case Fix_ScarfaceIbit: return ScarfaceIbit;
case Fix_CrashTagTeamIbit: return CrashTagTeamRacingIbit;
case Fix_CrashTagTeamIbit: return CrashTagTeamRacingIbit;
case Fix_VU0Kickstart: return VU0KickstartHack;
jNO_DEFAULT;
}
return false; // unreachable, but we still need to suppress warnings >_<
@ -382,7 +385,8 @@ void Pcsx2Config::GamefixOptions::LoadSave( IniInterface& ini )
IniBitBool( FMVinSoftwareHack );
IniBitBool( GoemonTlbHack );
IniBitBool( ScarfaceIbit );
IniBitBool( CrashTagTeamRacingIbit );
IniBitBool( CrashTagTeamRacingIbit );
IniBitBool( VU0KickstartHack );
}

View File

@ -141,7 +141,12 @@ struct __aligned16 VURegs {
u32 branchpc;
u32 delaybranchpc;
bool takedelaybranch;
u32 pending_q;
u32 pending_p;
__aligned16 u32 micro_macflags[4];
__aligned16 u32 micro_clipflags[4];
__aligned16 u32 micro_statusflags[4];
// MAC/Status flags -- these are used by interpreters but are kind of hacky
// and shouldn't be relied on for any useful/valid info. Would like to move them out of
// this struct eventually.

View File

@ -58,17 +58,27 @@ __fi void _vu0run(bool breakOnMbit, bool addCycles) {
if (!(VU0.VI[REG_VPU_STAT].UL & 1)) return;
int startcycle = VU0.cycle;
u32 runCycles = breakOnMbit ? vu0RunCycles : 0x7fffffff;
VU0.flags &= ~VUFLAG_MFLAGSET;
//VU0 is ahead of the EE and M-Bit is already encountered, so no need to wait for it, just catch up the EE
if ((VU0.flags & VUFLAG_MFLAGSET) && breakOnMbit && VU0.cycle >= cpuRegs.cycle)
{
cpuRegs.cycle = VU0.cycle;
return;
}
u32 startcycle = VU0.cycle;
u32 runCycles = 0x7fffffff;
do { // Run VU until it finishes or M-Bit
CpuVU0->Execute(runCycles);
} while ((VU0.VI[REG_VPU_STAT].UL & 1) // E-bit Termination
&& (!breakOnMbit || !(VU0.flags & VUFLAG_MFLAGSET))); // M-bit Break
&& (!breakOnMbit || !(VU0.flags & VUFLAG_MFLAGSET) || VU0.cycle < cpuRegs.cycle)); // M-bit Break
// Add cycles if called from EE's COP2
if (addCycles) cpuRegs.cycle += (VU0.cycle-startcycle)*2;
if (addCycles)
{
cpuRegs.cycle += (VU0.cycle - startcycle);
VU0.cycle = cpuRegs.cycle;
}
}
void _vu0WaitMicro() { _vu0run(1, 1); } // Runs VU0 Micro Until E-bit or M-Bit End
@ -101,7 +111,7 @@ namespace OpcodeImpl
void QMFC2() {
if (cpuRegs.code & 1) {
_vu0WaitMicro();
_vu0FinishMicro();
}
if (_Rt_ == 0) return;
cpuRegs.GPR.r[_Rt_].UD[0] = VU0.VF[_Fs_].UD[0];
@ -119,7 +129,7 @@ void QMTC2() {
void CFC2() {
if (cpuRegs.code & 1) {
_vu0WaitMicro();
_vu0FinishMicro();
}
if (_Rt_ == 0) return;

View File

@ -44,7 +44,7 @@ void __fastcall vu0ExecMicro(u32 addr) {
VU0.VI[REG_VPU_STAT].UL &= ~0xFF;
VU0.VI[REG_VPU_STAT].UL |= 0x01;
VU0.cycle = cpuRegs.cycle;
if ((s32)addr != -1) VU0.VI[REG_TPC].UL = addr;
_vuExecMicroDebug(VU0);
CpuVU0->ExecuteBlock(1);

View File

@ -157,12 +157,12 @@ static void _vu0Exec(VURegs* VU)
if(VU->takedelaybranch)
{
VU->branch = 2;
DevCon.Warning("VU0 - Branch/Jump in Delay Slot");
VU->branch = 1;
DevCon.Warning("VU0 - Branch/Jump in Delay Slot");
VU->branchpc = VU->delaybranchpc;
VU->delaybranchpc = 0;
VU->takedelaybranch = false;
}
}
}
}
@ -206,8 +206,9 @@ void InterpVU0::Step()
void InterpVU0::Execute(u32 cycles)
{
VU0.VI[REG_TPC].UL <<= 3;
for (int i = (int)cycles; i > 0 ; i--) {
if (!(VU0.VI[REG_VPU_STAT].UL & 0x1)) {
VU0.flags &= ~VUFLAG_MFLAGSET;
for (int i = (int)cycles; i > 0; i--) {
if (!(VU0.VI[REG_VPU_STAT].UL & 0x1) || (VU0.flags & VUFLAG_MFLAGSET)) {
if (VU0.branch || VU0.ebit) {
vu0Exec(&VU0); // run branch delay slot?
}
@ -217,4 +218,3 @@ void InterpVU0::Execute(u32 cycles)
}
VU0.VI[REG_TPC].UL >>= 3;
}

View File

@ -57,10 +57,9 @@ void __fastcall vu1ExecMicro(u32 addr)
vu1Finish();
VUM_LOG("vu1ExecMicro %x (count=%d)", addr, count++);
VU1.cycle = cpuRegs.cycle;
VU0.VI[REG_VPU_STAT].UL &= ~0xFF00;
VU0.VI[REG_VPU_STAT].UL |= 0x0100;
if ((s32)addr != -1) VU1.VI[REG_TPC].UL = addr;
_vuExecMicroDebug(VU1);

View File

@ -157,7 +157,7 @@ static void _vu1Exec(VURegs* VU)
if(VU->takedelaybranch)
{
VU->branch = 2;
VU->branch = 1;
//DevCon.Warning("VU1 - Branch/Jump in Delay Slot");
VU->branchpc = VU->delaybranchpc;
VU->delaybranchpc = 0;

View File

@ -25,27 +25,29 @@
void BaseVUmicroCPU::ExecuteBlock(bool startUp) {
const u32& stat = VU0.VI[REG_VPU_STAT].UL;
const int test = m_Idx ? 0x100 : 1;
const int s = 1024*8; // Kick Start Cycles (Silver Surfer needs this amount)
const int c = 1024*1; // Continue Cycles
const int s = EmuConfig.Gamefixes.VU0KickstartHack ? 2048 : 0; // Kick Start Cycles (Silver Surfer, POP:SOT, Lotus needs this amount)
if (!(stat & test)) return;
if (startUp) { // Start Executing a microprogram
if (startUp && s) { // Start Executing a microprogram
Execute(s); // Kick start VU
// Let VUs run behind EE instead of ahead
if (stat & test) {
cpuSetNextEventDelta((s+c)*2);
m_lastEEcycles = cpuRegs.cycle + (s*2);
cpuSetNextEventDelta(s);
if (m_Idx)
VU1.cycle = cpuRegs.cycle;
else
VU0.cycle = cpuRegs.cycle;
}
}
else { // Continue Executing (VU roughly half the mhz of EE)
s32 delta = (s32)(u32)(cpuRegs.cycle - m_lastEEcycles) & ~1;
if (delta > 0) { // Enough time has passed
delta >>= 1; // Divide by 2 (unsigned)
else { // Continue Executing
u32 cycle = m_Idx ? VU1.cycle : VU0.cycle;
s32 delta = (s32)(u32)(cpuRegs.cycle - cycle);
if (delta > 0) { // Enough time has passed
Execute(delta); // Execute the time since the last call
if (stat & test) {
cpuSetNextEventDelta(c*2);
m_lastEEcycles = cpuRegs.cycle;
}
if (stat & test)
cpuSetNextEventDelta(delta);
}
else cpuSetNextEventDelta(-delta); // Haven't caught-up from kick start
}
@ -55,10 +57,10 @@ void BaseVUmicroCPU::ExecuteBlock(bool startUp) {
// EE data to VU0's registers. We want to run VU0 Micro right after this
// to ensure that the register is used at the correct time.
// This fixes spinning/hanging in some games like Ratchet and Clank's Intro.
void __fastcall BaseVUmicroCPU::ExecuteBlockJIT(BaseVUmicroCPU* cpu) {
void BaseVUmicroCPU::ExecuteBlockJIT(BaseVUmicroCPU* cpu) {
const u32& stat = VU0.VI[REG_VPU_STAT].UL;
const int test = cpu->m_Idx ? 0x100 : 1;
const int c = 128; // VU Execution Cycles
if (stat & test) { // VU is running
#ifdef PCSX2_DEVBUILD
static int warn = 5;
@ -67,10 +69,17 @@ void __fastcall BaseVUmicroCPU::ExecuteBlockJIT(BaseVUmicroCPU* cpu) {
warn--;
}
#endif
cpu->Execute(c); // Execute VU
if (stat & test) {
cpu->m_lastEEcycles+=(c*2);
cpuSetNextEventDelta(c*2);
u32 cycle = cpu->m_Idx ? VU1.cycle : VU0.cycle;
s32 delta = (s32)(u32)(cpuRegs.cycle - cycle);
if (delta > 0) { // Enough time has passed
cpu->Execute(delta); // Execute the time since the last call
if (stat & test) {
cpuSetNextEventDelta(delta);
}
}
else {
cpuSetNextEventDelta(-delta); // Haven't caught-up from kick start
}
}
}

View File

@ -262,6 +262,7 @@ extern BaseVUmicroCPU* CpuVU1;
extern void vu0ResetRegs();
extern void __fastcall vu0ExecMicro(u32 addr);
extern void vu0Exec(VURegs* VU);
extern void _vu0FinishMicro();
extern void vu0Finish();
extern void iDumpVU0Registers();

View File

@ -174,10 +174,16 @@ __fi void vif0Interrupt()
if (!(vif0ch.chcr.STR)) Console.WriteLn("vif0 running when CHCR == %x", vif0ch.chcr._u32);
if(vif0.waitforvu)
{
//CPU_INT(DMAC_VIF0, 16);
return;
}
if (vif0.irq && vif0.vifstalled.enabled && vif0.vifstalled.value == VIF_IRQ_STALL)
{
vif0Regs.stat.INT = true;
//Yakuza watches VIF_STAT so lets do this here.
if (((vif0Regs.code >> 24) & 0x7f) != 0x7) {
vif0Regs.stat.VIS = true;
@ -193,7 +199,7 @@ __fi void vif0Interrupt()
// One game doesn't like vif stalling at end, can't remember what. Spiderman isn't keen on it tho
//vif0ch.chcr.STR = false;
vif0Regs.stat.FQC = std::min((u16)0x8, vif0ch.qwc);
if(vif0ch.qwc > 0 || !vif0.done)
if (vif0ch.qwc > 0 || !vif0.done)
{
VIF_LOG("VIF0 Stalled");
return;
@ -201,13 +207,6 @@ __fi void vif0Interrupt()
}
}
if(vif0.waitforvu)
{
//DevCon.Warning("Waiting on VU0");
//CPU_INT(DMAC_VIF0, 16);
return;
}
vif0.vifstalled.enabled = false;
//Must go after the Stall, incase it's still in progress, GTC africa likes to see it still transferring.

View File

@ -36,7 +36,7 @@ vifOp(vifCode_Null);
__ri void vifExecQueue(int idx)
{
if (!GetVifX.queued_program)
if (!GetVifX.queued_program || (VU0.VI[REG_VPU_STAT].UL & 1 << (idx * 8)))
return;
GetVifX.queued_program = false;
@ -59,6 +59,8 @@ __ri void vifExecQueue(int idx)
}
static __fi void vifFlush(int idx) {
vifExecQueue(idx);
if (!idx) vif0FLUSH();
else vif1FLUSH();
@ -119,6 +121,7 @@ void ExecuteVU(int idx)
vifX.cmd = 0;
vifX.pass = 0;
}
vifExecQueue(idx);
}
//------------------------------------------------------------------

View File

@ -49,7 +49,7 @@ _vifT void vifTransferLoop(u32* &data) {
vifX.cmd = data[0] >> 24;
//VIF_LOG("New VifCMD %x tagsize %x", vifX.cmd, vifX.tag.size);
VIF_LOG("New VifCMD %x tagsize %x irq %d", vifX.cmd, vifX.tag.size, vifX.irq);
if (IsDevBuild && SysTrace.EE.VIFcode.IsActive()) {
// Pass 2 means "log it"
vifCmdHandler[idx][vifX.cmd & 0x7f](2, data);

View File

@ -104,9 +104,13 @@ Panels::GameFixesPanel::GameFixesPanel( wxWindow* parent )
_("VU I bit Hack avoid constant recompilation (Scarface The World Is Yours)"),
wxEmptyString
},
{
{
_("VU I bit Hack avoid constant recompilation (Crash Tag Team Racing)"),
wxEmptyString
wxEmptyString
},
{
_("VU0 Kickstart to avoid sync problems with VU1"),
wxEmptyString
}
};

View File

@ -74,6 +74,7 @@ void SetBranchImm( u32 imm );
void iFlushCall(int flushtype);
void recBranchCall( void (*func)() );
void recCall( void (*func)() );
u32 scaleblockcycles_clear();
namespace R5900{
namespace Dynarec {

View File

@ -1027,6 +1027,31 @@ static u32 scaleblockcycles()
return scaled;
}
u32 scaleblockcycles_clear()
{
u32 scaled = scaleblockcycles_calculation();
#if 0 // Enable this to get some runtime statistics about the scaling result in practice
static u32 scaled_overall = 0, unscaled_overall = 0;
if (g_resetEeScalingStats)
{
scaled_overall = unscaled_overall = 0;
g_resetEeScalingStats = false;
}
u32 unscaled = DEFAULT_SCALED_BLOCKS();
if (!unscaled) unscaled = 1;
scaled_overall += scaled;
unscaled_overall += unscaled;
float ratio = static_cast<float>(unscaled_overall) / scaled_overall;
DevCon.WriteLn(L"Unscaled overall: %d, scaled overall: %d, relative EE clock speed: %d %%",
unscaled_overall, scaled_overall, static_cast<int>(100 * ratio));
#endif
s_nBlockCycles &= 0x7;
return scaled;
}
// Generates dynarec code for Event tests followed by a block dispatch (branch).
// Parameters:

View File

@ -573,6 +573,14 @@ void recSWC1()
void recLQC2()
{
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
iFlushCall(FLUSH_EVERYTHING);
if (_Rt_)
xLEA(arg2reg, ptr[&VU0.VF[_Ft_].UD[0]]);
else
@ -602,6 +610,14 @@ void recLQC2()
void recSQC2()
{
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
iFlushCall(FLUSH_EVERYTHING);
xLEA(arg2reg, ptr[&VU0.VF[_Ft_].UD[0]]);
if (GPR_IS_CONST1(_Rs_))
@ -628,4 +644,4 @@ void recSQC2()
} } } // end namespace R5900::Dynarec::OpcodeImpl
using namespace R5900::Dynarec;
using namespace R5900::Dynarec::OpcodeImpl;
using namespace R5900::Dynarec::OpcodeImpl;

View File

@ -351,8 +351,11 @@ void recMicroVU1::Reset() {
void recMicroVU0::Execute(u32 cycles) {
pxAssert(m_Reserved); // please allocate me first! :|
VU0.flags &= ~VUFLAG_MFLAGSET;
if(!(VU0.VI[REG_VPU_STAT].UL & 1)) return;
VU0.VI[REG_TPC].UL <<= 3;
// Sometimes games spin on vu0, so be careful with this value
// woody hangs if too high on sVU (untested on mVU)
// Edit: Need to test this again, if anyone ever has a "Woody" game :p

View File

@ -70,12 +70,36 @@ void mVUDTendProgram(mV, microFlagCycles* mFC, int isEbit) {
xMOVSS(ptr32[&mVU.regs().VI[REG_P].UL], xmmPQ);
}
// Save Flag Instances
xMOV(ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
// Save MAC, Status and CLIP Flag Instances
xMOV(ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
mVUallocMFLAGa(mVU, gprT1, fMac);
mVUallocCFLAGa(mVU, gprT2, fClip);
xMOV(ptr32[&mVU.regs().VI[REG_MAC_FLAG].UL], gprT1);
xMOV(ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL], gprT2);
xMOV(ptr32[&mVU.regs().VI[REG_MAC_FLAG].UL], gprT1);
xMOV(ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL], gprT2);
if (!isEbit) { // Backup flag instances
xMOVAPS(xmmT1, ptr128[mVU.macFlag]);
xMOVAPS(ptr128[&mVU.regs().micro_macflags], xmmT1);
xMOVAPS(xmmT1, ptr128[mVU.clipFlag]);
xMOVAPS(ptr128[&mVU.regs().micro_clipflags], xmmT1);
xMOV(ptr32[&mVU.regs().micro_statusflags[0]], gprF0);
xMOV(ptr32[&mVU.regs().micro_statusflags[1]], gprF1);
xMOV(ptr32[&mVU.regs().micro_statusflags[2]], gprF2);
xMOV(ptr32[&mVU.regs().micro_statusflags[3]], gprF3);
} else { // Flush flag instances
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_clipflags], xmmT1);
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_MAC_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_macflags], xmmT1);
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_statusflags], xmmT1);
}
if (isEbit || isVU1) { // Clear 'is busy' Flags
if (!mVU.index || !THREAD_VU1) {
@ -98,7 +122,12 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
int fClip = getLastFlagInst(mVUpBlock->pState, mFC->xClip, 2, isEbit);
int qInst = 0;
int pInst = 0;
mVU.regAlloc->flushAll();
microBlock stateBackup;
memcpy(&stateBackup, &mVUregs, sizeof(mVUregs)); //backup the state, it's about to get screwed with.
if(!isEbit)
mVU.regAlloc->TDwritebackAll(); //Writing back ok, invalidating early kills the rec, so don't do it :P
else
mVU.regAlloc->flushAll();
if (isEbit) {
memzero(mVUinfo);
@ -124,20 +153,55 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
}
// Save P/Q Regs
if (qInst) { xPSHUF.D(xmmPQ, xmmPQ, 0xe5); }
if (qInst) { xPSHUF.D(xmmPQ, xmmPQ, 0xe1); }
xMOVSS(ptr32[&mVU.regs().VI[REG_Q].UL], xmmPQ);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
xMOVSS(ptr32[&mVU.regs().pending_q], xmmPQ);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
if (isVU1) {
xPSHUF.D(xmmPQ, xmmPQ, pInst ? 3 : 2);
xPSHUF.D(xmmPQ, xmmPQ, pInst ? 0x1b : 0x1e);
xMOVSS(ptr32[&mVU.regs().VI[REG_P].UL], xmmPQ);
xPSHUF.D(xmmPQ, xmmPQ, pInst ? 0x1b : 0x4b);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
xMOVSS(ptr32[&mVU.regs().pending_p], xmmPQ);
xPSHUF.D(xmmPQ, xmmPQ, 0x1b);
}
// Save Flag Instances
// Save MAC, Status and CLIP Flag Instances
xMOV(ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL], getFlagReg(fStatus));
mVUallocMFLAGa(mVU, gprT1, fMac);
mVUallocCFLAGa(mVU, gprT2, fClip);
xMOV(ptr32[&mVU.regs().VI[REG_MAC_FLAG].UL], gprT1);
xMOV(ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL], gprT2);
if (!isEbit) { // Backup flag instances
xMOVAPS(xmmT1, ptr128[mVU.macFlag]);
xMOVAPS(ptr128[&mVU.regs().micro_macflags], xmmT1);
xMOVAPS(xmmT1, ptr128[mVU.clipFlag]);
xMOVAPS(ptr128[&mVU.regs().micro_clipflags], xmmT1);
xMOV(ptr32[&mVU.regs().micro_statusflags[0]], gprF0);
xMOV(ptr32[&mVU.regs().micro_statusflags[1]], gprF1);
xMOV(ptr32[&mVU.regs().micro_statusflags[2]], gprF2);
xMOV(ptr32[&mVU.regs().micro_statusflags[3]], gprF3);
}
else { // Flush flag instances
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_clipflags], xmmT1);
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_MAC_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_macflags], xmmT1);
xMOVDZX(xmmT1, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS(ptr128[&mVU.regs().micro_statusflags], xmmT1);
}
if (isEbit || isVU1) { // Clear 'is busy' Flags
if (!mVU.index || !THREAD_VU1) {
xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
@ -149,6 +213,7 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
xMOV(ptr32[&mVU.regs().VI[REG_TPC].UL], xPC);
xJMP(mVU.exitFunct);
}
memcpy(&mVUregs, &stateBackup, sizeof(mVUregs)); //Restore the state for the rest of the recompile
}
// Recompiles Code for Proper Flags and Q/P regs on Block Linkings

View File

@ -356,7 +356,7 @@ void mVUdebugPrintBlocks(microVU& mVU, bool isEndPC) {
// vu0 is allowed to exit early, so are dev builds (for inf loops)
__fi bool doEarlyExit(microVU& mVU) {
return IsDevBuild || !isVU1;
return true;// IsDevBuild || !isVU1;
}
// Saves Pipeline State for resuming from early exits
@ -368,27 +368,32 @@ __fi void mVUsavePipelineState(microVU& mVU) {
}
// Test cycles to see if we need to exit-early...
void mVUtestCycles(microVU& mVU) {
void mVUtestCycles(microVU& mVU, microFlagCycles& mFC) {
iPC = mVUstartPC;
if (doEarlyExit(mVU)) {
xCMP(ptr32[&mVU.cycles], 0);
xForwardJG32 skip;
xMOV(eax, ptr32[&mVU.cycles]);
if (!EmuConfig.Gamefixes.VU0KickstartHack)
xSUB(eax, mVUcycles); // Running behind, make sure we have time to run the block
else
xSUB(eax, 1); // Running ahead, make sure cycles left are above 0
xCMP(eax, 0);
xForwardJGE32 skip;
mVUsavePipelineState(mVU);
if (isVU0) {
// TEST32ItoM((uptr)&mVU.regs().flags, VUFLAG_MFLAGSET);
// xFowardJZ32 vu0jmp;
// mVUbackupRegs(mVU, true);
// xFastCall(mVUwarning0, mVU.prog.cur->idx, xPC); // VU0 is allowed early exit for COP2 Interlock Simulation
// mVUrestoreRegs(mVU, true);
mVUsavePipelineState(mVU);
mVUendProgram(mVU, NULL, 0);
mVUendProgram(mVU, &mFC, 0);
// vu0jmp.SetTarget();
}
else {
mVUbackupRegs(mVU, true);
/*mVUbackupRegs(mVU, true);
xFastCall(mVUwarning1, mVU.prog.cur->idx, xPC);
mVUrestoreRegs(mVU, true);
mVUsavePipelineState(mVU);
mVUendProgram(mVU, NULL, 0);
mVUsavePipelineState(mVU);*/
mVUendProgram(mVU, &mFC, 0);
}
skip.SetTarget();
}
@ -401,7 +406,7 @@ void mVUtestCycles(microVU& mVU) {
// This gets run at the start of every loop of mVU's first pass
__fi void startLoop(mV) {
if (curI & _Mbit_) { DevCon.WriteLn (Color_Green, "microVU%d: M-bit set! PC = %x", getIndex, xPC); }
if (curI & _Mbit_ && isVU0) { DevCon.WriteLn (Color_Green, "microVU%d: M-bit set! PC = %x", getIndex, xPC); }
if (curI & _Dbit_) { DevCon.WriteLn (Color_Green, "microVU%d: D-bit set! PC = %x", getIndex, xPC); }
if (curI & _Tbit_) { DevCon.WriteLn (Color_Green, "microVU%d: T-bit set! PC = %x", getIndex, xPC); }
memzero(mVUinfo);
@ -475,8 +480,8 @@ void* mVUcompileSingleInstruction(microVU& mVU, u32 startPC, uptr pState, microF
mVUsetCycles(mVU);
mVUinfo.readQ = mVU.q;
mVUinfo.writeQ = !mVU.q;
mVUinfo.readP = mVU.p;
mVUinfo.writeP = !mVU.p;
mVUinfo.readP = mVU.p && isVU1;
mVUinfo.writeP = !mVU.p && isVU1;
mVUcount++;
mVUsetFlagInfo(mVU);
incPC(1);
@ -485,7 +490,8 @@ void* mVUcompileSingleInstruction(microVU& mVU, u32 startPC, uptr pState, microF
mVUsetFlags(mVU, mFC); // Sets Up Flag instances
mVUoptimizePipeState(mVU); // Optimize the End Pipeline State for nicer Block Linking
mVUdebugPrintBlocks(mVU, false); // Prints Start/End PC of blocks executed, for debugging...
mVUtestCycles(mVU); // Update VU Cycles and Exit Early if Necessary
mVUtestCycles(mVU, mFC); // Update VU Cycles and Exit Early if Necessary
// Second Pass
iPC = startPC / 4;
@ -534,37 +540,52 @@ void mVUSaveFlags(microVU& mVU,microFlagCycles &mFC, microFlagCycles &mFCBackup)
memcpy(&mFCBackup, &mFC, sizeof(microFlagCycles));
mVUsetFlags(mVU, mFCBackup); // Sets Up Flag instances
}
void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
void* mVUcompile(microVU& mVU, u32 startPC, uptr pState)
{
microFlagCycles mFC;
u8* thisPtr = x86Ptr;
const u32 endCount = (((microRegInfo*)pState)->blockType) ? 1 : (mVU.microMemSize / 8);
u8* thisPtr = x86Ptr;
const u32 endCount = (((microRegInfo*)pState)->blockType) ? 1 : (mVU.microMemSize / 8);
// First Pass
iPC = startPC / 4;
mVUsetupRange(mVU, startPC, 1); // Setup Program Bounds/Range
mVU.regAlloc->reset(); // Reset regAlloc
mVU.regAlloc->reset(); // Reset regAlloc
mVUinitFirstPass(mVU, pState, thisPtr);
mVUbranch = 0;
for(int branch = 0; mVUcount < endCount;) {
for (int branch = 0; mVUcount < endCount;) {
incPC(1);
startLoop(mVU);
mVUincCycles(mVU, 1);
mVUopU(mVU, 0);
mVUcheckBadOp(mVU);
if (curI & _Ebit_) { eBitPass1(mVU, branch); }
if (curI & _Mbit_) { mVUup.mBit = true; }
if (curI & _Ibit_) { mVUlow.isNOP = true; mVUup.iBit = true; }
else { incPC(-1); mVUopL(mVU, 0); incPC(1); }
if (curI & _Dbit_) { mVUup.dBit = true; }
if (curI & _Tbit_) { mVUup.tBit = true; }
if (curI & _Ebit_) {
eBitPass1(mVU, branch);
}
if ((curI & _Mbit_) && isVU0) {
mVUup.mBit = true;
}
if (curI & _Ibit_) {
mVUlow.isNOP = true;
mVUup.iBit = true;
}
else {
incPC(-1);
mVUopL(mVU, 0);
incPC(1);
}
if (curI & _Dbit_) {
mVUup.dBit = true;
}
if (curI & _Tbit_) {
mVUup.tBit = true;
}
mVUsetCycles(mVU);
mVUinfo.readQ = mVU.q;
mVUinfo.readQ = mVU.q;
mVUinfo.writeQ = !mVU.q;
mVUinfo.readP = mVU.p;
mVUinfo.writeP = !mVU.p;
mVUinfo.readP = mVU.p && isVU1;
mVUinfo.writeP = !mVU.p && isVU1;
mVUcount++;
if (branch >= 2) {
@ -588,6 +609,9 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
mVUbranch = 0;
}
if (mVUup.mBit && !branch && !mVUup.eBit)
break;
if (mVUinfo.isEOB)
break;
@ -595,24 +619,30 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
}
// Fix up vi15 const info for propagation through blocks
mVUregs.vi15 = (doConstProp && mVUconstReg[15].isValid) ? (u16)mVUconstReg[15].regValue : 0;
mVUregs.vi15 = (doConstProp && mVUconstReg[15].isValid) ? (u16)mVUconstReg[15].regValue : 0;
mVUregs.vi15v = (doConstProp && mVUconstReg[15].isValid) ? 1 : 0;
mVUsetFlags(mVU, mFC); // Sets Up Flag instances
mVUoptimizePipeState(mVU); // Optimize the End Pipeline State for nicer Block Linking
mVUdebugPrintBlocks(mVU, false); // Prints Start/End PC of blocks executed, for debugging...
mVUtestCycles(mVU); // Update VU Cycles and Exit Early if Necessary
mVUtestCycles(mVU, mFC); // Update VU Cycles and Exit Early if Necessary
// Second Pass
iPC = mVUstartPC;
setCode();
mVUbranch = 0;
u32 x = 0;
for( ; x < endCount; x++) {
if (mVUinfo.isEOB) { handleBadOp(mVU, x); x = 0xffff; } // handleBadOp currently just prints a warning
if (mVUup.mBit) { xOR(ptr32[&mVU.regs().flags], VUFLAG_MFLAGSET); }
for (; x < endCount; x++) {
if (mVUinfo.isEOB) {
handleBadOp(mVU, x);
x = 0xffff;
} // handleBadOp currently just prints a warning
if (mVUup.mBit) {
xOR(ptr32[&mVU.regs().flags], VUFLAG_MFLAGSET);
}
mVUexecuteInstruction(mVU);
if(!mVUinfo.isBdelay && !mVUlow.branch) //T/D Bit on branch is handled after the branch, branch delay slots are executed.
if (!mVUinfo.isBdelay && !mVUlow.branch) //T/D Bit on branch is handled after the branch, branch delay slots are executed.
{
if (mVUup.tBit) {
mVUDoTBit(mVU, &mFC);
@ -620,6 +650,13 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
else if (mVUup.dBit && doDBitHandling) {
mVUDoDBit(mVU, &mFC);
}
else if (mVUup.mBit && !mVUup.eBit && !mVUinfo.isEOB) {
mVUsetupRange(mVU, xPC, false);
incPC(2);
mVUendProgram(mVU, &mFC, 0);
incPC(-2);
goto perf_and_return;
}
}
if (mVUinfo.doXGKICK) {
@ -640,22 +677,41 @@ void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
incPC(-3); // Go back to branch opcode
switch (mVUlow.branch) {
case 1: case 2: normBranch(mVU, mFC); goto perf_and_return; // B/BAL
case 9: case 10: normJump (mVU, mFC); goto perf_and_return; // JR/JALR
case 3: condBranch(mVU, mFC, Jcc_Equal); goto perf_and_return; // IBEQ
case 4: condBranch(mVU, mFC, Jcc_GreaterOrEqual); goto perf_and_return; // IBGEZ
case 5: condBranch(mVU, mFC, Jcc_Greater); goto perf_and_return; // IBGTZ
case 6: condBranch(mVU, mFC, Jcc_LessOrEqual); goto perf_and_return; // IBLEQ
case 7: condBranch(mVU, mFC, Jcc_Less); goto perf_and_return; // IBLTZ
case 8: condBranch(mVU, mFC, Jcc_NotEqual); goto perf_and_return; // IBNEQ
case 1: // B/BAL
case 2:
normBranch(mVU, mFC);
goto perf_and_return;
case 9: // JR/JALR
case 10:
normJump(mVU, mFC);
goto perf_and_return;
case 3: // IBEQ
condBranch(mVU, mFC, Jcc_Equal);
goto perf_and_return;
case 4: // IBGEZ
condBranch(mVU, mFC, Jcc_GreaterOrEqual);
goto perf_and_return;
case 5: // IBGTZ
condBranch(mVU, mFC, Jcc_Greater);
goto perf_and_return;
case 6: // IBLEQ
condBranch(mVU, mFC, Jcc_LessOrEqual);
goto perf_and_return;
case 7: // IBLTZ
condBranch(mVU, mFC, Jcc_Less);
goto perf_and_return;
case 8: // IBNEQ
condBranch(mVU, mFC, Jcc_NotEqual);
goto perf_and_return;
}
}
}
if ((x == endCount) && (x!=1)) { Console.Error("microVU%d: Possible infinite compiling loop!", mVU.index); }
if ((x == endCount) && (x != 1)) {
Console.Error("microVU%d: Possible infinite compiling loop!", mVU.index);
}
// E-bit End
mVUsetupRange(mVU, xPC-8, false);
mVUsetupRange(mVU, xPC - 8, false);
mVUendProgram(mVU, &mFC, 1);
perf_and_return:

View File

@ -34,22 +34,35 @@ void mVUdispatcherAB(mV) {
xLDMXCSR(g_sseVUMXCSR);
// Load Regs
xMOV(gprF0, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]);
xMOV(gprF1, gprF0);
xMOV(gprF2, gprF0);
xMOV(gprF3, gprF0);
xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_MAC_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS (ptr128[mVU.macFlag], xmmT1);
xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_CLIP_FLAG].UL]);
xSHUF.PS(xmmT1, xmmT1, 0);
xMOVAPS (ptr128[mVU.clipFlag], xmmT1);
xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_P].UL]);
xMOVAPS (xmmPQ, ptr128[&mVU.regs().VI[REG_Q].UL]);
xMOVDZX (xmmT2, ptr32[&mVU.regs().pending_q]);
xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
//Load in other Q instance
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
xMOVSS(xmmPQ, xmmT2);
xPSHUF.D(xmmPQ, xmmPQ, 0xe1);
if (isVU1)
{
//Load in other P instance
xMOVDZX(xmmT2, ptr32[&mVU.regs().pending_p]);
xPSHUF.D(xmmPQ, xmmPQ, 0x1B);
xMOVSS(xmmPQ, xmmT2);
xPSHUF.D(xmmPQ, xmmPQ, 0x1B);
}
xMOVAPS(xmmT1, ptr128[&mVU.regs().micro_macflags]);
xMOVAPS(ptr128[mVU.macFlag], xmmT1);
xMOVAPS(xmmT1, ptr128[&mVU.regs().micro_clipflags]);
xMOVAPS(ptr128[mVU.clipFlag], xmmT1);
xMOV(gprF0, ptr32[&mVU.regs().micro_statusflags[0]]);
xMOV(gprF1, ptr32[&mVU.regs().micro_statusflags[1]]);
xMOV(gprF2, ptr32[&mVU.regs().micro_statusflags[2]]);
xMOV(gprF3, ptr32[&mVU.regs().micro_statusflags[3]]);
// Jump to Recompiled Code Block
xJMP(rax);

View File

@ -247,8 +247,14 @@ void recBC2TL() { _setupBranchTest(JZ32, true); }
//------------------------------------------------------------------
void COP2_Interlock(bool mBitSync) {
if (cpuRegs.code & 1) {
iFlushCall(FLUSH_EVERYTHING | FLUSH_PC);
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
if (mBitSync) xFastCall((void*)_vu0WaitMicro);
else xFastCall((void*)_vu0FinishMicro);
}
@ -268,6 +274,14 @@ static void recCFC2() {
COP2_Interlock(false);
if (!_Rt_) return;
if (!(cpuRegs.code & 1) && !EmuConfig.Gamefixes.VU0KickstartHack) {
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
}
iFlushCall(FLUSH_EVERYTHING);
if (_Rd_ == REG_STATUS_FLAG) { // Normalize Status Flag
@ -331,6 +345,14 @@ static void recCTC2() {
printCOP2("CTC2");
COP2_Interlock(1);
if (!_Rd_) return;
if (!(cpuRegs.code & 1) && !EmuConfig.Gamefixes.VU0KickstartHack) {
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
}
iFlushCall(FLUSH_EVERYTHING);
switch(_Rd_) {
@ -342,12 +364,25 @@ static void recCTC2() {
xMOV(ptr32[&vu0Regs.VI[REG_R].UL], eax);
break;
case REG_STATUS_FLAG:
{
if (_Rt_) { // Denormalizes flag into eax (gprT1)
mVUallocSFLAGd(&cpuRegs.GPR.r[_Rt_].UL[0]);
xMOV(ptr32[&vu0Regs.VI[_Rd_].UL], eax);
}
else xMOV(ptr32[&vu0Regs.VI[_Rd_].UL], 0);
__aligned16 u32 sticky_flags[4] = { 0xFC0,0xFC0,0xFC0,0xFC0 };
__aligned16 u32 status_flags[4] = { 0x3F,0x3F,0x3F,0x3F };
//Need to update the sticky flags for microVU
xMOVDZX(xmmT1, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
xSHUF.PS(xmmT1, xmmT1, 0);
xAND.PS(xmmT1, ptr128[&sticky_flags]);
xMOVAPS(xmmT2, ptr128[&VU0.micro_statusflags]);
xAND.PS(xmmT1, ptr128[&status_flags]);
xOR.PS(xmmT1, xmmT2);
xMOVAPS(ptr128[&VU0.micro_statusflags], xmmT1);
break;
}
case REG_CMSAR1: // Execute VU1 Micro SubRoutine
if (_Rt_) {
xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
@ -357,8 +392,8 @@ static void recCTC2() {
xFastCall((void*)vif1VUFinish);
break;
case REG_FBRST:
if (!_Rt_) {
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], 0);
if (!_Rt_) {
xMOV(ptr32[&vu0Regs.VI[REG_FBRST].UL], 0);
return;
}
else xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]);
@ -373,8 +408,6 @@ static void recCTC2() {
// Executing vu0 block here fixes the intro of Ratchet and Clank
// sVU's COP2 has a comment that "Donald Duck" needs this too...
if (_Rd_) _eeMoveGPRtoM((uptr)&vu0Regs.VI[_Rd_].UL, _Rt_);
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
break;
}
}
@ -384,6 +417,15 @@ static void recQMFC2() {
printCOP2("QMFC2");
COP2_Interlock(false);
if (!_Rt_) return;
if (!(cpuRegs.code & 1) && !EmuConfig.Gamefixes.VU0KickstartHack) {
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
}
iFlushCall(FLUSH_EVERYTHING);
// FixMe: For some reason this line is needed or else games break:
@ -398,6 +440,14 @@ static void recQMTC2() {
printCOP2("QMTC2");
COP2_Interlock(true);
if (!_Rd_) return;
if (!(cpuRegs.code & 1) && !EmuConfig.Gamefixes.VU0KickstartHack) {
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xLoadFarAddr(arg1reg, CpuVU0);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg);
}
iFlushCall(FLUSH_EVERYTHING);
xMOVAPS(xmmT1, ptr128[&cpuRegs.GPR.r[_Rt_]]);
@ -468,5 +518,11 @@ namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl { void recCOP2() { recCOP2t[_Rs_](); }}}}
void recCOP2_BC2 () { recCOP2_BC2t[_Rt_](); }
void recCOP2_SPEC1() { recCOP2SPECIAL1t[_Funct_](); }
void recCOP2_SPEC2() { recCOP2SPECIAL2t[(cpuRegs.code&3)|((cpuRegs.code>>4)&0x7c)](); }
void recCOP2_SPEC1() {
iFlushCall(FLUSH_EVERYTHING);
xMOV(eax, ptr[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr[&cpuRegs.cycle], eax); // update cycles
xFastCall((void*)_vu0FinishMicro); recCOP2SPECIAL1t[_Funct_]();
}
void recCOP2_SPEC2() { recCOP2SPECIAL2t[(cpuRegs.code&3)|((cpuRegs.code>>4)&0x7c)](); }