diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h index 0d50f3ac51..346af479f2 100644 --- a/Source/Core/VideoCommon/BPMemory.h +++ b/Source/Core/VideoCommon/BPMemory.h @@ -1085,5 +1085,6 @@ struct BPMemory extern BPMemory bpmem; void LoadBPReg(u32 value0); +void LoadBPRegPreprocess(u32 value0); void GetBPRegInfo(const u8* data, std::string* name, std::string* desc); diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp index 308badae6e..152d15a7d5 100644 --- a/Source/Core/VideoCommon/BPStructs.cpp +++ b/Source/Core/VideoCommon/BPStructs.cpp @@ -173,7 +173,8 @@ static void BPWritten(const BPCmd& bp) switch (bp.newvalue & 0xFF) { case 0x02: - PixelEngine::SetFinish(); // may generate interrupt + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetFinish(); // may generate interrupt DEBUG_LOG(VIDEO, "GXSetDrawDone SetPEFinish (value: 0x%02X)", (bp.newvalue & 0xFFFF)); return; @@ -183,11 +184,13 @@ static void BPWritten(const BPCmd& bp) } return; case BPMEM_PE_TOKEN_ID: // Pixel Engine Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), false); DEBUG_LOG(VIDEO, "SetPEToken 0x%04x", (bp.newvalue & 0xFFFF)); return; case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID - PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); + if (!g_use_deterministic_gpu_thread) + PixelEngine::SetToken(static_cast(bp.newvalue & 0xFFFF), true); DEBUG_LOG(VIDEO, "SetPEToken + INT 0x%04x", (bp.newvalue & 0xFFFF)); return; @@ -685,6 +688,26 @@ void LoadBPReg(u32 value0) BPWritten(bp); } +void LoadBPRegPreprocess(u32 value0) +{ + int regNum = value0 >> 24; + // masking could hypothetically be a problem + u32 newval = value0 & 0xffffff; + switch (regNum) + { + case BPMEM_SETDRAWDONE: + if ((newval & 0xff) == 0x02) + PixelEngine::SetFinish(); + break; + case BPMEM_PE_TOKEN_ID: + PixelEngine::SetToken(newval & 0xffff, false); + break; + case BPMEM_PE_TOKEN_INT_ID: // Pixel Engine Interrupt Token ID + PixelEngine::SetToken(newval & 0xffff, true); + break; + } +} + void GetBPRegInfo(const u8* data, std::string* name, std::string* desc) { const char* no_yes[2] = { "No", "Yes" }; diff --git a/Source/Core/VideoCommon/BPStructs.h b/Source/Core/VideoCommon/BPStructs.h index 2a99443346..a1dc48b821 100644 --- a/Source/Core/VideoCommon/BPStructs.h +++ b/Source/Core/VideoCommon/BPStructs.h @@ -7,5 +7,4 @@ #include "VideoCommon/BPMemory.h" void BPInit(); -void LoadBPReg(u32 value0); void BPReload(); diff --git a/Source/Core/VideoCommon/CommandProcessor.cpp b/Source/Core/VideoCommon/CommandProcessor.cpp index aa9f8c4f28..6f8997cc58 100644 --- a/Source/Core/VideoCommon/CommandProcessor.cpp +++ b/Source/Core/VideoCommon/CommandProcessor.cpp @@ -77,7 +77,7 @@ void DoState(PointerWrap &p) p.Do(interruptFinishWaiting); } -UNUSED static inline void WriteLow(volatile u32& _reg, u16 lowbits) +static inline void WriteLow(volatile u32& _reg, u16 lowbits) { Common::AtomicStore(_reg, (_reg & 0xFFFF0000) | lowbits); } @@ -159,9 +159,8 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { FIFO_WRITE_POINTER_LO, MMIO::Utils::LowPart(&fifo.CPWritePointer), false, true }, { FIFO_WRITE_POINTER_HI, MMIO::Utils::HighPart(&fifo.CPWritePointer) }, // FIFO_READ_POINTER has different code for single/dual core. - { FIFO_BP_LO, MMIO::Utils::LowPart(&fifo.CPBreakpoint), false, true }, - { FIFO_BP_HI, MMIO::Utils::HighPart(&fifo.CPBreakpoint) }, }; + for (auto& mapped_var : directly_mapped_vars) { u16 wmask = mapped_var.writes_align_to_32_bytes ? 0xFFE0 : 0xFFFF; @@ -173,6 +172,19 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) ); } + mmio->Register(base | FIFO_BP_LO, + MMIO::DirectRead(MMIO::Utils::LowPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteLow(fifo.CPBreakpoint, val & 0xffe0); + }) + ); + mmio->Register(base | FIFO_BP_HI, + MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPBreakpoint)), + MMIO::ComplexWrite([](u32, u16 val) { + WriteHigh(fifo.CPBreakpoint, val); + }) + ); + // Timing and metrics MMIOs are stubbed with fixed values. struct { u32 addr; @@ -216,8 +228,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPCtrlReg tmp(val); m_CPCtrlReg.Hex = tmp.Hex; SetCpControlRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -227,8 +238,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) UCPClearReg tmp(val); m_CPClearReg.Hex = tmp.Hex; SetCpClearRegister(); - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); @@ -260,6 +270,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) : MMIO::DirectRead(MMIO::Utils::HighPart(&fifo.CPReadWriteDistance)), MMIO::ComplexWrite([](u32, u16 val) { WriteHigh(fifo.CPReadWriteDistance, val); + SyncGPU(SYNC_GPU_OTHER); if (fifo.CPReadWriteDistance == 0) { GPFifo::ResetGatherPipe(); @@ -269,8 +280,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { ResetVideoBuffer(); } - if (!IsOnThread()) - RunGpu(); + RunGpu(); }) ); mmio->Register(base | FIFO_READ_POINTER_LO, @@ -298,11 +308,7 @@ void STACKALIGN GatherPipeBursted() // if we aren't linked, we don't care about gather pipe data if (!m_CPCtrlReg.GPLinkEnable) { - if (!IsOnThread()) - { - RunGpu(); - } - else + if (IsOnThread() && !g_use_deterministic_gpu_thread) { // In multibuffer mode is not allowed write in the same FIFO attached to the GPU. // Fix Pokemon XD in DC mode. @@ -313,6 +319,10 @@ void STACKALIGN GatherPipeBursted() ProcessFifoAllDistance(); } } + else + { + RunGpu(); + } return; } @@ -327,8 +337,7 @@ void STACKALIGN GatherPipeBursted() Common::AtomicAdd(fifo.CPReadWriteDistance, GATHER_PIPE_SIZE); - if (!IsOnThread()) - RunGpu(); + RunGpu(); _assert_msg_(COMMANDPROCESSOR, fifo.CPReadWriteDistance <= fifo.CPEnd - fifo.CPBase, "FIFO is overflowed by GatherPipe !\nCPU thread is too fast!"); @@ -358,7 +367,8 @@ void UpdateInterrupts(u64 userdata) void UpdateInterruptsFromVideoBackend(u64 userdata) { - CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); + if (!g_use_deterministic_gpu_thread) + CoreTiming::ScheduleEvent_Threadsafe(0, et_UpdateInterrupts, userdata); } void SetCPStatusFromGPU() diff --git a/Source/Core/VideoCommon/CommandProcessor.h b/Source/Core/VideoCommon/CommandProcessor.h index b29816e8fe..0dad1578af 100644 --- a/Source/Core/VideoCommon/CommandProcessor.h +++ b/Source/Core/VideoCommon/CommandProcessor.h @@ -16,6 +16,7 @@ namespace CommandProcessor { extern SCPFifoStruct fifo; //This one is shared between gfx thread and emulator thread. + extern volatile bool isPossibleWaitingSetDrawDone; //This one is used for sync gfx thread and emulator thread. extern volatile bool interruptSet; extern volatile bool interruptWaiting; diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h index 7f317b177c..fcb89d36e4 100644 --- a/Source/Core/VideoCommon/DataReader.h +++ b/Source/Core/VideoCommon/DataReader.h @@ -25,9 +25,9 @@ __forceinline void DataSkip() } template -__forceinline T DataPeek(int _uOffset) +__forceinline T DataPeek(int _uOffset, u8** bufp = &g_video_buffer_read_ptr) { - auto const result = Common::FromBigEndian(*reinterpret_cast(g_video_buffer_read_ptr + _uOffset)); + auto const result = Common::FromBigEndian(*reinterpret_cast(*bufp + _uOffset)); return result; } @@ -48,10 +48,10 @@ __forceinline u32 DataPeek32(int _uOffset) } template -__forceinline T DataRead() +__forceinline T DataRead(u8** bufp = &g_video_buffer_read_ptr) { - auto const result = DataPeek(0); - DataSkip(); + auto const result = DataPeek(0, bufp); + *bufp += sizeof(T); return result; } diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp index b6412c796c..87764ec85c 100644 --- a/Source/Core/VideoCommon/Fifo.cpp +++ b/Source/Core/VideoCommon/Fifo.cpp @@ -25,19 +25,46 @@ bool g_bSkipCurrentFrame = false; static volatile bool GpuRunningState = false; static volatile bool EmuRunningState = false; static std::mutex m_csHWVidOccupied; -// STATE_TO_SAVE -static u8* s_video_buffer; -static u8* s_video_buffer_write_ptr; -// Note: during display list execution, temporarily points to the list instead -// of inside s_video_buffer. +// Most of this array is unlikely to be faulted in... +static u8 s_fifo_aux_data[FIFO_SIZE]; +static u8* s_fifo_aux_write_ptr; +static u8* s_fifo_aux_read_ptr; + +bool g_use_deterministic_gpu_thread = true; // XXX + +// STATE_TO_SAVE +static std::mutex s_video_buffer_lock; +static std::condition_variable s_video_buffer_cond; +static u8* s_video_buffer; u8* g_video_buffer_read_ptr; +static std::atomic s_video_buffer_write_ptr; +static std::atomic s_video_buffer_seen_ptr; +u8* g_video_buffer_pp_read_ptr; +// The read_ptr is always owned by the GPU thread. In normal mode, so is the +// write_ptr, despite it being atomic. In g_use_deterministic_gpu_thread mode, +// things get a bit more complicated: +// - The seen_ptr is written by the GPU thread, and points to what it's already +// processed as much of as possible - in the case of a partial command which +// caused it to stop, not the same as the read ptr. It's written by the GPU, +// under the lock, and updating the cond. +// - The write_ptr is written by the CPU thread after it copies data from the +// FIFO. Maybe someday it will be under the lock. For now, because RunGpuLoop +// polls, it's just atomic. +// - The pp_read_ptr is the CPU preprocessing version of the read_ptr. void Fifo_DoState(PointerWrap &p) { p.DoArray(s_video_buffer, FIFO_SIZE); - p.DoPointer(s_video_buffer_write_ptr, s_video_buffer); + u8* write_ptr = s_video_buffer_write_ptr; + p.DoPointer(write_ptr, s_video_buffer); + s_video_buffer_write_ptr = write_ptr; p.DoPointer(g_video_buffer_read_ptr, s_video_buffer); + if (p.mode == PointerWrap::MODE_READ && g_use_deterministic_gpu_thread) + { + // We're good and paused, right? + s_video_buffer_seen_ptr = g_video_buffer_pp_read_ptr = g_video_buffer_read_ptr; + } p.Do(g_bSkipCurrentFrame); } @@ -45,6 +72,7 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) { if (doLock) { + SyncGPU(SYNC_GPU_OTHER); EmulatorState(false); if (!Core::IsGPUThread()) m_csHWVidOccupied.lock(); @@ -63,7 +91,7 @@ void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock) void Fifo_Init() { s_video_buffer = (u8*)AllocateMemoryPages(FIFO_SIZE); - s_video_buffer_write_ptr = s_video_buffer; + ResetVideoBuffer(); GpuRunningState = false; Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); } @@ -73,6 +101,12 @@ void Fifo_Shutdown() if (GpuRunningState) PanicAlert("Fifo shutting down while active"); FreeMemoryPages(s_video_buffer, FIFO_SIZE); s_video_buffer = nullptr; + s_video_buffer_write_ptr = nullptr; + g_video_buffer_pp_read_ptr = nullptr; + g_video_buffer_read_ptr = nullptr; + s_video_buffer_seen_ptr = nullptr; + s_fifo_aux_write_ptr = nullptr; + s_fifo_aux_read_ptr = nullptr; } u8* GetVideoBufferStartPtr() @@ -108,6 +142,66 @@ void EmulatorState(bool running) EmuRunningState = running; } +void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr) +{ + if (g_use_deterministic_gpu_thread && GpuRunningState) + { + std::unique_lock lk(s_video_buffer_lock); + u8* write_ptr = s_video_buffer_write_ptr; + s_video_buffer_cond.wait(lk, [&]() { + return !GpuRunningState || s_video_buffer_seen_ptr == write_ptr; + }); + if (!GpuRunningState) + return; + + // Opportunistically reset FIFOs so we don't wrap around. + if (may_move_read_ptr && s_fifo_aux_write_ptr != s_fifo_aux_read_ptr) + PanicAlert("aux fifo not synced (%p, %p)", s_fifo_aux_write_ptr, s_fifo_aux_read_ptr); + + memmove(s_fifo_aux_data, s_fifo_aux_read_ptr, s_fifo_aux_write_ptr - s_fifo_aux_read_ptr); + s_fifo_aux_write_ptr -= (s_fifo_aux_read_ptr - s_fifo_aux_data); + s_fifo_aux_read_ptr = s_fifo_aux_data; + + if (may_move_read_ptr) + { + // what's left over in the buffer + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + + memmove(s_video_buffer, g_video_buffer_pp_read_ptr, size); + // This change always decreases the pointers. We write seen_ptr + // after write_ptr here, and read it before in RunGpuLoop, so + // 'write_ptr > seen_ptr' there cannot become spuriously true. + s_video_buffer_write_ptr = write_ptr = s_video_buffer + size; + g_video_buffer_pp_read_ptr = s_video_buffer; + g_video_buffer_read_ptr = s_video_buffer; + s_video_buffer_seen_ptr = write_ptr; + } + } +} + +void PushFifoAuxBuffer(void* ptr, size_t size) +{ + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + SyncGPU(SYNC_GPU_AUX_SPACE, /* may_move_read_ptr */ false); + if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr)) + { + // That will sync us up to the last 32 bytes, so this short region + // of FIFO would have to point to a 2MB display list or something. + PanicAlert("absurdly large aux buffer"); + return; + } + } + memcpy(s_fifo_aux_write_ptr, ptr, size); + s_fifo_aux_write_ptr += size; +} + +void* PopFifoAuxBuffer(size_t size) +{ + void* ret = s_fifo_aux_read_ptr; + s_fifo_aux_read_ptr += size; + return ret; +} // Description: RunGpuLoop() sends data through this function. static void ReadDataFromFifo(u8* _uData, u32 len) @@ -129,10 +223,42 @@ static void ReadDataFromFifo(u8* _uData, u32 len) s_video_buffer_write_ptr += len; } +// The deterministic_gpu_thread version. +static void ReadDataFromFifoOnCPU(u8* _uData, u32 len) +{ + u8 *write_ptr = s_video_buffer_write_ptr; + if (len > (s_video_buffer + FIFO_SIZE - write_ptr)) + { + // We can't wrap around while the GPU is working on the data. + // This should be very rare due to the reset in SyncGPU. + SyncGPU(SYNC_GPU_WRAPAROUND); + if (g_video_buffer_pp_read_ptr != g_video_buffer_read_ptr) + { + PanicAlert("desynced read pointers"); + return; + } + write_ptr = s_video_buffer_write_ptr; + size_t size = write_ptr - g_video_buffer_pp_read_ptr; + if (len > FIFO_SIZE - size) + { + PanicAlert("FIFO out of bounds (existing %lu + new %lu > %lu)", (unsigned long) size, (unsigned long) len, (unsigned long) FIFO_SIZE); + return; + } + } + memcpy(write_ptr, _uData, len); + OpcodeDecoder_Preprocess(write_ptr + len); + // This would have to be locked if the GPU thread didn't spin. + s_video_buffer_write_ptr = write_ptr + len; +} + void ResetVideoBuffer() { g_video_buffer_read_ptr = s_video_buffer; s_video_buffer_write_ptr = s_video_buffer; + s_video_buffer_seen_ptr = s_video_buffer; + g_video_buffer_pp_read_ptr = s_video_buffer; + s_fifo_aux_write_ptr = s_fifo_aux_data; + s_fifo_aux_read_ptr = s_fifo_aux_data; } @@ -150,53 +276,75 @@ void RunGpuLoop() g_video_backend->PeekMessages(); VideoFifo_CheckAsyncRequest(); - - CommandProcessor::SetCPStatusFromGPU(); - - Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - - // check if we are able to run this buffer - while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + if (g_use_deterministic_gpu_thread) { - fifo.isGpuReadingData = true; - CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; - - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + // All the fifo/CP stuff is on the CPU. We just need to run the opcode decoder. + u8* seen_ptr = s_video_buffer_seen_ptr; + u8* write_ptr = s_video_buffer_write_ptr; + // See comment in SyncGPU + if (write_ptr > seen_ptr) { - u32 readPtr = fifo.CPReadPointer; - u8 *uData = Memory::GetPointer(readPtr); + OpcodeDecoder_Run(write_ptr); - if (readPtr == fifo.CPEnd) - readPtr = fifo.CPBase; - else - readPtr += 32; - - _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , - "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); - - ReadDataFromFifo(uData, 32); - - cyclesExecuted = OpcodeDecoder_Run(GetVideoBufferEndPtr()); - - if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) - Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); - - Common::AtomicStore(fifo.CPReadPointer, readPtr); - Common::AtomicAdd(fifo.CPReadWriteDistance, -32); - if ((GetVideoBufferEndPtr() - g_video_buffer_read_ptr) == 0) - Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + { + std::lock_guard vblk(s_video_buffer_lock); + s_video_buffer_seen_ptr = write_ptr; + s_video_buffer_cond.notify_all(); + } } - + } + else + { CommandProcessor::SetCPStatusFromGPU(); - // This call is pretty important in DualCore mode and must be called in the FIFO Loop. - // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false - // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. - VideoFifo_CheckAsyncRequest(); - CommandProcessor::isPossibleWaitingSetDrawDone = false; - } + Common::AtomicStore(CommandProcessor::VITicks, CommandProcessor::m_cpClockOrigin); - fifo.isGpuReadingData = false; + // check if we are able to run this buffer + while (GpuRunningState && EmuRunningState && !CommandProcessor::interruptWaiting && fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint()) + { + fifo.isGpuReadingData = true; + CommandProcessor::isPossibleWaitingSetDrawDone = fifo.bFF_GPLinkEnable ? true : false; + + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || Common::AtomicLoad(CommandProcessor::VITicks) > CommandProcessor::m_cpClockOrigin) + { + u32 readPtr = fifo.CPReadPointer; + u8 *uData = Memory::GetPointer(readPtr); + + if (readPtr == fifo.CPEnd) + readPtr = fifo.CPBase; + else + readPtr += 32; + + _assert_msg_(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0 , + "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce instability in the game. Please report it.", fifo.CPReadWriteDistance - 32); + + ReadDataFromFifo(uData, 32); + + u8* write_ptr = s_video_buffer_write_ptr; + + cyclesExecuted = OpcodeDecoder_Run(write_ptr); + + + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) >= cyclesExecuted) + Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted); + + Common::AtomicStore(fifo.CPReadPointer, readPtr); + Common::AtomicAdd(fifo.CPReadWriteDistance, -32); + if ((write_ptr - g_video_buffer_read_ptr) == 0) + Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer); + } + + CommandProcessor::SetCPStatusFromGPU(); + + // This call is pretty important in DualCore mode and must be called in the FIFO Loop. + // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false + // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down. + VideoFifo_CheckAsyncRequest(); + CommandProcessor::isPossibleWaitingSetDrawDone = false; + } + + fifo.isGpuReadingData = false; + } if (EmuRunningState) { @@ -219,6 +367,8 @@ void RunGpuLoop() } } } + // wake up SyncGPU if we were interrupted + s_video_buffer_cond.notify_all(); } @@ -230,16 +380,27 @@ bool AtBreakpoint() void RunGpu() { + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread && + !g_use_deterministic_gpu_thread) + return; + SCPFifoStruct &fifo = CommandProcessor::fifo; while (fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint() ) { u8 *uData = Memory::GetPointer(fifo.CPReadPointer); - FPURoundMode::SaveSIMDState(); - FPURoundMode::LoadDefaultSIMDState(); - ReadDataFromFifo(uData, 32); - OpcodeDecoder_Run(GetVideoBufferEndPtr()); - FPURoundMode::LoadSIMDState(); + if (g_use_deterministic_gpu_thread) + { + ReadDataFromFifoOnCPU(uData, 32); + } + else + { + FPURoundMode::SaveSIMDState(); + FPURoundMode::LoadDefaultSIMDState(); + ReadDataFromFifo(uData, 32); + OpcodeDecoder_Run(s_video_buffer_write_ptr); + FPURoundMode::LoadSIMDState(); + } //DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base"); diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h index 66399680c1..175d6b6e4a 100644 --- a/Source/Core/VideoCommon/Fifo.h +++ b/Source/Core/VideoCommon/Fifo.h @@ -13,6 +13,11 @@ class PointerWrap; extern bool g_bSkipCurrentFrame; +// This could be in SCoreStartupParameter, but it depends on multiple settings +// and can change at runtime. +extern bool g_use_deterministic_gpu_thread; +extern std::atomic g_video_buffer_write_ptr_xthread; +extern u8* g_video_buffer_pp_read_ptr; void Fifo_Init(); void Fifo_Shutdown(); @@ -23,6 +28,22 @@ u8* GetVideoBufferEndPtr(); void Fifo_DoState(PointerWrap &f); void Fifo_PauseAndLock(bool doLock, bool unpauseOnUnlock); +// Used for diagnostics. +enum SyncGPUReason { + SYNC_GPU_NONE, + SYNC_GPU_OTHER, + SYNC_GPU_WRAPAROUND, + SYNC_GPU_EFB_POKE, + SYNC_GPU_PERFQUERY, + SYNC_GPU_SWAP, + SYNC_GPU_AUX_SPACE, +}; +// In g_use_deterministic_gpu_thread mode, waits for the GPU to be done with pending work. +void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr = true); + +void PushFifoAuxBuffer(void* ptr, size_t size); +void* PopFifoAuxBuffer(size_t size); + void RunGpu(); void RunGpuLoop(); void ExitGpuLoop(); diff --git a/Source/Core/VideoCommon/MainBase.cpp b/Source/Core/VideoCommon/MainBase.cpp index 99fac60eae..082c81c2ea 100644 --- a/Source/Core/VideoCommon/MainBase.cpp +++ b/Source/Core/VideoCommon/MainBase.cpp @@ -118,6 +118,7 @@ void VideoBackendHardware::Video_EndField() { if (s_BackendInitialized) { + SyncGPU(SYNC_GPU_SWAP); s_swapRequested.Set(); } } @@ -153,6 +154,8 @@ u32 VideoBackendHardware::Video_AccessEFB(EFBAccessType type, u32 x, u32 y, u32 { if (s_BackendInitialized && g_ActiveConfig.bEFBAccessEnable) { + SyncGPU(SYNC_GPU_EFB_POKE); + s_accessEFBArgs.type = type; s_accessEFBArgs.x = x; s_accessEFBArgs.y = y; @@ -194,6 +197,8 @@ u32 VideoBackendHardware::Video_GetQueryResult(PerfQueryType type) return 0; } + SyncGPU(SYNC_GPU_PERFQUERY); + // TODO: Is this check sane? if (!g_perf_query->IsFlushed()) { diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp index fe70bcf492..1bb5fae940 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp @@ -24,6 +24,7 @@ #include "VideoCommon/DataReader.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/OpcodeDecoding.h" +#include "VideoCommon/PixelEngine.h" #include "VideoCommon/Statistics.h" #include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoCommon.h" @@ -36,7 +37,12 @@ bool g_bRecordFifoData = false; static u32 InterpretDisplayList(u32 address, u32 size) { u8* old_pVideoData = g_video_buffer_read_ptr; - u8* startAddress = Memory::GetPointer(address); + u8* startAddress; + + if (g_use_deterministic_gpu_thread) + startAddress = (u8*) PopFifoAuxBuffer(size); + else + startAddress = Memory::GetPointer(address); u32 cycles = 0; @@ -62,11 +68,29 @@ static u32 InterpretDisplayList(u32 address, u32 size) return cycles; } +static void InterpretDisplayListPreprocess(u32 address, u32 size) +{ + u8* old_read_ptr = g_video_buffer_pp_read_ptr; + u8* startAddress = Memory::GetPointer(address); + + PushFifoAuxBuffer(startAddress, size); + + if (startAddress != nullptr) + { + g_video_buffer_pp_read_ptr = startAddress; + + u8 *end = startAddress + size; + OpcodeDecoder_Preprocess(end); + } + + g_video_buffer_pp_read_ptr = old_read_ptr; +} + static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) { // TODO(Omega): Maybe dump FIFO to file on this error std::string temp = StringFromFormat( - "GFX FIFO: Unknown Opcode (0x%x @ %p).\n" + "GFX FIFO: Unknown Opcode (0x%x @ %p, preprocessing=%s).\n" "This means one of the following:\n" "* The emulated GPU got desynced, disabling dual core can help\n" "* Command stream corrupted by some spurious memory bug\n" @@ -74,7 +98,8 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) "* Some other sort of bug\n\n" "Dolphin will now likely crash or hang. Enjoy." , cmd_byte, - buffer); + buffer, + preprocess ? "yes" : "no"); Host_SysMessage(temp.c_str()); INFO_LOG(VIDEO, "%s", temp.c_str()); { @@ -104,14 +129,16 @@ static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess) } } +template static u32 Decode(u8* end) { - u8 *opcodeStart = g_video_buffer_read_ptr; - if (g_video_buffer_read_ptr == end) + u8 *opcodeStart = *bufp; + if (*bufp == end) return 0; - u8 cmd_byte = DataReadU8(); + u8 cmd_byte = DataRead(bufp); u32 cycles; + int refarray; switch (cmd_byte) { case GX_NOP: @@ -120,64 +147,72 @@ static u32 Decode(u8* end) case GX_LOAD_CP_REG: //0x08 { - if (end - g_video_buffer_read_ptr < 1 + 4) + if (end - *bufp < 1 + 4) return 0; cycles = 12; - u8 sub_cmd = DataReadU8(); - u32 value = DataReadU32(); - LoadCPReg(sub_cmd, value); - INCSTAT(stats.thisFrame.numCPLoads); + u8 sub_cmd = DataRead(bufp); + u32 value = DataRead(bufp); + LoadCPReg(sub_cmd, value, is_preprocess); + if (!is_preprocess) + INCSTAT(stats.thisFrame.numCPLoads); } break; case GX_LOAD_XF_REG: { - if (end - g_video_buffer_read_ptr < 4) + if (end - *bufp < 4) return 0; - u32 Cmd2 = DataReadU32(); + u32 Cmd2 = DataRead(bufp); int transfer_size = ((Cmd2 >> 16) & 15) + 1; - if ((size_t) (end - g_video_buffer_read_ptr) < transfer_size * sizeof(u32)) + if ((size_t) (end - *bufp) < transfer_size * sizeof(u32)) return 0; cycles = 18 + 6 * transfer_size; - u32 xf_address = Cmd2 & 0xFFFF; - LoadXFReg(transfer_size, xf_address); + if (!is_preprocess) + { + u32 xf_address = Cmd2 & 0xFFFF; + LoadXFReg(transfer_size, xf_address); - INCSTAT(stats.thisFrame.numXFLoads); + INCSTAT(stats.thisFrame.numXFLoads); + } + else + { + *bufp += transfer_size * sizeof(u32); + } } break; case GX_LOAD_INDX_A: //used for position matrices - if (end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xC); - break; + refarray = 0xC; + goto load_indx; case GX_LOAD_INDX_B: //used for normal matrices - if (end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xD); - break; + refarray = 0xD; + goto load_indx; case GX_LOAD_INDX_C: //used for postmatrices - if (end - g_video_buffer_read_ptr < 4) - return 0; - cycles = 6; - LoadIndexedXF(DataReadU32(), 0xE); - break; + refarray = 0xE; + goto load_indx; case GX_LOAD_INDX_D: //used for lights - if (end - g_video_buffer_read_ptr < 4) + refarray = 0xF; + goto load_indx; + load_indx: + if (end - *bufp < 4) return 0; cycles = 6; - LoadIndexedXF(DataReadU32(), 0xF); + if (is_preprocess) + PreprocessIndexedXF(DataRead(bufp), refarray); + else + LoadIndexedXF(DataRead(bufp), refarray); break; case GX_CMD_CALL_DL: { - if (end - g_video_buffer_read_ptr < 8) + if (end - *bufp < 8) return 0; - u32 address = DataReadU32(); - u32 count = DataReadU32(); - cycles = 6 + InterpretDisplayList(address, count); + u32 address = DataRead(bufp); + u32 count = DataRead(bufp); + if (is_preprocess) + InterpretDisplayListPreprocess(address, count); + else + cycles = 6 + InterpretDisplayList(address, count); } break; @@ -195,12 +230,19 @@ static u32 Decode(u8* end) // In skipped_frame case: We have to let BP writes through because they set // tokens and stuff. TODO: Call a much simplified LoadBPReg instead. { - if (end - g_video_buffer_read_ptr < 4) + if (end - *bufp < 4) return 0; cycles = 12; - u32 bp_cmd = DataReadU32(); - LoadBPReg(bp_cmd); - INCSTAT(stats.thisFrame.numBPLoads); + u32 bp_cmd = DataRead(bufp); + if (is_preprocess) + { + LoadBPRegPreprocess(bp_cmd); + } + else + { + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + } } break; @@ -210,33 +252,43 @@ static u32 Decode(u8* end) { cycles = 1600; // load vertices - if (end - g_video_buffer_read_ptr < 2) + if (end - *bufp < 2) return 0; - u16 numVertices = DataReadU16(); + u16 num_vertices = DataRead(bufp); - if (!VertexLoaderManager::RunVertices( - cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) - (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, - numVertices, - end - g_video_buffer_read_ptr, - g_bSkipCurrentFrame)) + if (is_preprocess) { - return 0; + size_t size = num_vertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK, is_preprocess); + if ((size_t) (end - *bufp) < size) + return 0; + *bufp += size; + } + else + { + if (!VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + num_vertices, + end - *bufp, + g_bSkipCurrentFrame)) + return 0; } } else { - UnknownOpcode(cmd_byte, opcodeStart, false); + UnknownOpcode(cmd_byte, opcodeStart, is_preprocess); cycles = 1; } break; } // Display lists get added directly into the FIFO stream - if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) - FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_video_buffer_read_ptr - opcodeStart)); + if (!is_preprocess && g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL) + FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(*bufp - opcodeStart)); - return cycles; + // In is_preprocess mode, we don't actually care about cycles, at least for + // now... make sure the compiler realizes that. + return is_preprocess ? 1 : cycles; } void OpcodeDecoder_Init() @@ -255,7 +307,7 @@ u32 OpcodeDecoder_Run(u8* end) while (true) { u8* old = g_video_buffer_read_ptr; - u32 cycles = Decode(end); + u32 cycles = Decode(end); if (cycles == 0) { g_video_buffer_read_ptr = old; @@ -265,3 +317,17 @@ u32 OpcodeDecoder_Run(u8* end) } return totalCycles; } + +void OpcodeDecoder_Preprocess(u8 *end) +{ + while (true) + { + u8* old = g_video_buffer_pp_read_ptr; + u32 cycles = Decode(end); + if (cycles == 0) + { + g_video_buffer_pp_read_ptr = old; + break; + } + } +} diff --git a/Source/Core/VideoCommon/OpcodeDecoding.h b/Source/Core/VideoCommon/OpcodeDecoding.h index 1702969825..e5b1b23e89 100644 --- a/Source/Core/VideoCommon/OpcodeDecoding.h +++ b/Source/Core/VideoCommon/OpcodeDecoding.h @@ -39,3 +39,4 @@ extern bool g_bRecordFifoData; void OpcodeDecoder_Init(); void OpcodeDecoder_Shutdown(); u32 OpcodeDecoder_Run(u8* end); +void OpcodeDecoder_Preprocess(u8* write_ptr); diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h index c9d4d35216..33077aa69b 100644 --- a/Source/Core/VideoCommon/XFMemory.h +++ b/Source/Core/VideoCommon/XFMemory.h @@ -275,3 +275,4 @@ extern XFMemory xfmem; void LoadXFReg(u32 transferSize, u32 address); void LoadIndexedXF(u32 val, int array); +void PreprocessIndexedXF(u32 val, int refarray); diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index a0941a0133..0552aa0986 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ b/Source/Core/VideoCommon/XFStructs.cpp @@ -6,6 +6,7 @@ #include "Core/HW/Memmap.h" #include "VideoCommon/CPMemory.h" #include "VideoCommon/DataReader.h" +#include "VideoCommon/Fifo.h" #include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexShaderManager.h" @@ -252,7 +253,15 @@ void LoadIndexedXF(u32 val, int refarray) //load stuff from array to address in xf mem u32* currData = (u32*)(&xfmem) + address; - u32* newData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); + u32* newData; + if (g_use_deterministic_gpu_thread) + { + newData = (u32*)PopFifoAuxBuffer(size * sizeof(u32)); + } + else + { + newData = (u32*)Memory::GetPointer(g_main_cp_state.array_bases[refarray] + g_main_cp_state.array_strides[refarray] * index); + } bool changed = false; for (int i = 0; i < size; ++i) { @@ -269,3 +278,14 @@ void LoadIndexedXF(u32 val, int refarray) currData[i] = Common::swap32(newData[i]); } } + +void PreprocessIndexedXF(u32 val, int refarray) +{ + int index = val >> 16; + int size = ((val >> 12) & 0xF) + 1; + + u32* new_data = (u32*)Memory::GetPointer(g_preprocess_cp_state.array_bases[refarray] + g_preprocess_cp_state.array_strides[refarray] * index); + + size_t buf_size = size * sizeof(u32); + PushFifoAuxBuffer(new_data, buf_size); +}