From 417bf0c3bc479cc43d705459493ca5947fa43c0d Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Wed, 10 Apr 2024 20:00:09 +1000
Subject: [PATCH] DMA: Better enforce CPU runtime during linked list and get rid of the hack for Newman Haas.

---
 data/resources/gamedb.yaml |   2 +-
 src/core/dma.cpp           |  44 ++++--------
 src/core/gpu.cpp           |  35 +++-------
 src/core/gpu.h             |   4 +-
 src/core/gpu_commands.cpp  | 137 ++++++++++++++++++-------------------
 src/core/timing_event.cpp  |   7 ++
 src/core/timing_event.h    |   1 +
 7 files changed, 105 insertions(+), 125 deletions(-)

diff --git a/data/resources/gamedb.yaml b/data/resources/gamedb.yaml
index c89868a9c..b1267792c 100644
--- a/data/resources/gamedb.yaml
+++ b/data/resources/gamedb.yaml
@@ -90470,7 +90470,7 @@ SLPS-02376:
     - DigitalController
   settings:
     dmaMaxSliceTicks: 100
-    dmaHaltTicks: 150
+    dmaHaltTicks: 200
   codes:
     - SLPS-02376
     - SLPS-02356
diff --git a/src/core/dma.cpp b/src/core/dma.cpp
index 75e375b77..a965d80e9 100644
--- a/src/core/dma.cpp
+++ b/src/core/dma.cpp
@@ -47,7 +47,7 @@ static constexpr PhysicalMemoryAddress LINKED_LIST_TERMINATOR = UINT32_C(0x00FFF
 static constexpr TickCount LINKED_LIST_HEADER_READ_TICKS = 10;
 static constexpr TickCount LINKED_LIST_BLOCK_SETUP_TICKS = 5;
 
-static constexpr TickCount HALT_TICKS_WHEN_TRANSMITTING_PAD = 100;
+static constexpr TickCount SLICE_SIZE_WHEN_TRANSMITTING_PAD = 10;
 
 struct ChannelState
 {
@@ -194,7 +194,7 @@ static TickCount TransferDeviceToMemory(u32 address, u32 increment, u32 word_cou
 template<Channel channel>
 static TickCount TransferMemoryToDevice(u32 address, u32 increment, u32 word_count);
 
-
+static TickCount GetMaxSliceTicks();
 
 // configuration
 static TickCount s_max_slice_ticks = 1000;
@@ -543,6 +543,17 @@ ALWAYS_INLINE_RELEASE void DMA::CompleteTransfer(Channel channel, ChannelState&
   }
 }
 
+TickCount DMA::GetMaxSliceTicks()
+{
+  const TickCount max = Pad::IsTransmitting() ? SLICE_SIZE_WHEN_TRANSMITTING_PAD : s_max_slice_ticks;
+  if (!TimingEvents::IsRunningEvents())
+    return max;
+
+  const u32 current_ticks = TimingEvents::GetGlobalTickCounter();
+  const u32 max_ticks = TimingEvents::GetEventRunTickCounter() + static_cast<u32>(max);
+  return std::clamp(static_cast<TickCount>(max_ticks - current_ticks), 0, max);
+}
+
 template<DMA::Channel channel>
 bool DMA::TransferChannel()
 {
@@ -586,35 +597,13 @@ bool DMA::TransferChannel()
     return true;
   }
 
-  if constexpr (channel == Channel::GPU)
-  {
-    // Plenty of games seem to suffer from this issue where they have a linked list DMA going while polling the
-    // controller. Having a large slice size causes the serial transfer to complete before the silly busy wait
-    // in the BIOS poll routine returns, resulting in it thinking that the controller is disconnected. Some games
-    // are very sensitive to this (e.g. Newman Haas Racing), to the point that even using a slice size of 1 is
-    // insufficient for avoiding the race, probably due to the linked list layout.
-    //
-    // Therefore, without major refactoring to ensure the CPU runs every DMA block, and the associated performance
-    // penalty, we just halt the DMA until the serial transfers have completed. To reduce the chances of this
-    // significantly affecting timing, we add accumulate the ticks that have been "lost", and allow them to be
-    // "used up" when the transfer does happen.
-    //
-    if (Pad::IsTransmitting())
-    {
-      Log_DebugFmt("DMA transfer while transmitting pad - {} ticks are buffered", -s_halt_ticks_remaining);
-      if (!s_unhalt_event->IsActive())
-        s_unhalt_event->SetIntervalAndSchedule(HALT_TICKS_WHEN_TRANSMITTING_PAD);
-      return false;
-    }
-  }
-
   Log_DebugFmt("DMA[{}]: Copying linked list starting at 0x{:08X} to device", channel, current_address);
 
   // Prove to the compiler that nothing's going to modify these.
   const u8* const ram_ptr = Bus::g_ram;
   const u32 mask = Bus::g_ram_mask;
 
-  const TickCount slice_ticks = s_max_slice_ticks + -s_halt_ticks_remaining;
+  const TickCount slice_ticks = GetMaxSliceTicks();
   TickCount remaining_ticks = slice_ticks;
   while (cs.request && remaining_ticks > 0)
   {
@@ -658,9 +647,6 @@ bool DMA::TransferChannel()
   cs.base_address = current_address;
   if (cs.request)
   {
-    // don't actually delay the transfer for the buffered ticks, this variable is dual-purposed.
-    s_halt_ticks_remaining = std::max(s_halt_ticks_remaining, 0);
-
     // stall the transfer for a bit if we ran for too long
     HaltTransfer(s_halt_ticks);
    return false;
@@ -681,7 +667,7 @@ bool DMA::TransferChannel()
   const u32 block_size = cs.block_control.request.GetBlockSize();
   u32 blocks_remaining = cs.block_control.request.GetBlockCount();
 
-  TickCount ticks_remaining = s_max_slice_ticks;
+  TickCount ticks_remaining = GetMaxSliceTicks();
 
   if (copy_to_device)
   {
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 2698454ee..9fc0cc810 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -467,7 +467,6 @@ void GPU::WriteRegister(u32 offset, u32 value)
     case 0x00:
       m_fifo.Push(value);
       ExecuteCommands();
-      UpdateCommandTickEvent();
      return;
 
     case 0x04:
@@ -495,16 +494,7 @@ void GPU::DMARead(u32* words, u32 word_count)
 
 void GPU::EndDMAWrite()
 {
-  m_fifo_pushed = true;
-  if (!m_syncing)
-  {
-    ExecuteCommands();
-    UpdateCommandTickEvent();
-  }
-  else
-  {
-    UpdateDMARequest();
-  }
+  ExecuteCommands();
 }
 
 /**
@@ -1029,26 +1019,24 @@ void GPU::CRTCTickEvent(TickCount ticks)
 void GPU::CommandTickEvent(TickCount ticks)
 {
   m_pending_command_ticks -= SystemTicksToGPUTicks(ticks);
 
-  m_command_tick_event->Deactivate();
-  // we can be syncing if this came from a DMA write. recursively executing commands would be bad.
-  if (!m_syncing)
-    ExecuteCommands();
-
-  UpdateGPUIdle();
-
-  if (m_pending_command_ticks <= 0)
-    m_pending_command_ticks = 0;
-  else
-    m_command_tick_event->SetIntervalAndSchedule(GPUTicksToSystemTicks(m_pending_command_ticks));
+  m_executing_commands = true;
+  ExecuteCommands();
+  UpdateCommandTickEvent();
+  m_executing_commands = false;
 }
 
 void GPU::UpdateCommandTickEvent()
 {
   if (m_pending_command_ticks <= 0)
+  {
+    m_pending_command_ticks = 0;
     m_command_tick_event->Deactivate();
-  else if (!m_command_tick_event->IsActive())
+  }
+  else
+  {
     m_command_tick_event->SetIntervalAndSchedule(GPUTicksToSystemTicks(m_pending_command_ticks));
+  }
 }
 
 void GPU::ConvertScreenCoordinatesToDisplayCoordinates(float window_x, float window_y, float* display_x,
@@ -1121,7 +1109,6 @@ u32 GPU::ReadGPUREAD()
 
       // end of transfer, catch up on any commands which were written (unlikely)
       ExecuteCommands();
-      UpdateCommandTickEvent();
       break;
     }
   }
diff --git a/src/core/gpu.h b/src/core/gpu.h
index bbfe4e6eb..9ecda46ff 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -307,6 +307,7 @@ protected:
   void WriteGP1(u32 value);
   void EndCommand();
   void ExecuteCommands();
+  void TryExecuteCommands();
   void HandleGetGPUInfoCommand(u32 value);
 
   // Rendering in the backend
@@ -542,8 +543,7 @@ protected:
   u32 m_GPUREAD_latch = 0;
 
   /// True if currently executing/syncing.
-  bool m_syncing = false;
-  bool m_fifo_pushed = false;
+  bool m_executing_commands = false;
 
   struct VRAMTransfer
   {
diff --git a/src/core/gpu_commands.cpp b/src/core/gpu_commands.cpp
index 6ad306b7e..9d444f598 100644
--- a/src/core/gpu_commands.cpp
+++ b/src/core/gpu_commands.cpp
@@ -25,94 +25,93 @@ static constexpr u32 ReplaceZero(u32 value, u32 value_for_zero)
   return value == 0 ? value_for_zero : value;
 }
 
-void GPU::ExecuteCommands()
+void GPU::TryExecuteCommands()
 {
-  m_syncing = true;
-
-  for (;;)
+  while (m_pending_command_ticks <= m_max_run_ahead && !m_fifo.IsEmpty())
   {
-    if (m_pending_command_ticks <= m_max_run_ahead && !m_fifo.IsEmpty())
+    switch (m_blitter_state)
     {
-      switch (m_blitter_state)
+      case BlitterState::Idle:
       {
-        case BlitterState::Idle:
+        const u32 command = FifoPeek(0) >> 24;
+        if ((this->*s_GP0_command_handler_table[command])())
+          continue;
+        else
+          return;
+      }
+
+      case BlitterState::WritingVRAM:
+      {
+        DebugAssert(m_blit_remaining_words > 0);
+        const u32 words_to_copy = std::min(m_blit_remaining_words, m_fifo.GetSize());
+        m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
+        for (u32 i = 0; i < words_to_copy; i++)
+          m_blit_buffer.push_back(FifoPop());
+        m_blit_remaining_words -= words_to_copy;
+
+        Log_DebugPrintf("VRAM write burst of %u words, %u words remaining", words_to_copy, m_blit_remaining_words);
+        if (m_blit_remaining_words == 0)
+          FinishVRAMWrite();
+
+        continue;
+      }
+
+      case BlitterState::ReadingVRAM:
+      {
+        return;
+      }
+      break;
+
+      case BlitterState::DrawingPolyLine:
+      {
+        const u32 words_per_vertex = m_render_command.shading_enable ? 2 : 1;
+        u32 terminator_index =
+          m_render_command.shading_enable ? ((static_cast<u32>(m_blit_buffer.size()) & 1u) ^ 1u) : 0u;
+        for (; terminator_index < m_fifo.GetSize(); terminator_index += words_per_vertex)
         {
-          const u32 command = FifoPeek(0) >> 24;
-          if ((this->*s_GP0_command_handler_table[command])())
-            continue;
-          else
-            goto batch_done;
+          // polyline must have at least two vertices, and the terminator is (word & 0xf000f000) == 0x50005000.
+          // terminator is on the first word for the vertex
+          if ((FifoPeek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
+            break;
         }
 
-        case BlitterState::WritingVRAM:
+        const bool found_terminator = (terminator_index < m_fifo.GetSize());
+        const u32 words_to_copy = std::min(terminator_index, m_fifo.GetSize());
+        if (words_to_copy > 0)
         {
-          DebugAssert(m_blit_remaining_words > 0);
-          const u32 words_to_copy = std::min(m_blit_remaining_words, m_fifo.GetSize());
           m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
           for (u32 i = 0; i < words_to_copy; i++)
             m_blit_buffer.push_back(FifoPop());
-          m_blit_remaining_words -= words_to_copy;
-
-          Log_DebugPrintf("VRAM write burst of %u words, %u words remaining", words_to_copy, m_blit_remaining_words);
-          if (m_blit_remaining_words == 0)
-            FinishVRAMWrite();
+        }
 
+        Log_DebugPrintf("Added %u words to polyline", words_to_copy);
+        if (found_terminator)
+        {
+          // drop terminator
+          m_fifo.RemoveOne();
+          Log_DebugPrintf("Drawing poly-line with %u vertices", GetPolyLineVertexCount());
+          DispatchRenderCommand();
+          m_blit_buffer.clear();
+          EndCommand();
           continue;
         }
-
-        case BlitterState::ReadingVRAM:
-        {
-          goto batch_done;
-        }
-        break;
-
-        case BlitterState::DrawingPolyLine:
-        {
-          const u32 words_per_vertex = m_render_command.shading_enable ? 2 : 1;
-          u32 terminator_index =
-            m_render_command.shading_enable ? ((static_cast<u32>(m_blit_buffer.size()) & 1u) ^ 1u) : 0u;
-          for (; terminator_index < m_fifo.GetSize(); terminator_index += words_per_vertex)
-          {
-            // polyline must have at least two vertices, and the terminator is (word & 0xf000f000) == 0x50005000.
-            // terminator is on the first word for the vertex
-            if ((FifoPeek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
-              break;
-          }
-
-          const bool found_terminator = (terminator_index < m_fifo.GetSize());
-          const u32 words_to_copy = std::min(terminator_index, m_fifo.GetSize());
-          if (words_to_copy > 0)
-          {
-            m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
-            for (u32 i = 0; i < words_to_copy; i++)
-              m_blit_buffer.push_back(FifoPop());
-          }
-
-          Log_DebugPrintf("Added %u words to polyline", words_to_copy);
-          if (found_terminator)
-          {
-            // drop terminator
-            m_fifo.RemoveOne();
-            Log_DebugPrintf("Drawing poly-line with %u vertices", GetPolyLineVertexCount());
-            DispatchRenderCommand();
-            m_blit_buffer.clear();
-            EndCommand();
-            continue;
-          }
-        }
-        break;
       }
-    }
-
-  batch_done:
-    m_fifo_pushed = false;
-    UpdateDMARequest();
-    if (!m_fifo_pushed)
       break;
+    }
   }
+}
 
+void GPU::ExecuteCommands()
+{
+  const bool was_executing_from_event = std::exchange(m_executing_commands, true);
+
+  TryExecuteCommands();
+  UpdateDMARequest();
   UpdateGPUIdle();
-  m_syncing = false;
+
+  m_executing_commands = was_executing_from_event;
+  if (!was_executing_from_event)
+    UpdateCommandTickEvent();
 }
 
 void GPU::EndCommand()
diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp
index 845708e2c..556b54a07 100644
--- a/src/core/timing_event.cpp
+++ b/src/core/timing_event.cpp
@@ -17,6 +17,7 @@ static TimingEvent* s_active_events_tail;
 static TimingEvent* s_current_event = nullptr;
 static u32 s_active_event_count = 0;
 static u32 s_global_tick_counter = 0;
+static u32 s_event_run_tick_counter = 0;
 static bool s_frame_done = false;
 
 u32 GetGlobalTickCounter()
 {
   return s_global_tick_counter;
 }
 
+u32 GetEventRunTickCounter()
+{
+  return s_event_run_tick_counter;
+}
+
 void Initialize()
 {
   Reset();
@@ -293,6 +299,7 @@ void RunEvents()
     if (pending_ticks >= s_active_events_head->GetDowncount())
     {
       CPU::ResetPendingTicks();
+      s_event_run_tick_counter = s_global_tick_counter + static_cast<u32>(pending_ticks);
 
       do
       {
diff --git a/src/core/timing_event.h b/src/core/timing_event.h
index 08c2698da..01733ba51 100644
--- a/src/core/timing_event.h
+++ b/src/core/timing_event.h
@@ -81,6 +81,7 @@ public:
 namespace TimingEvents {
 
 u32 GetGlobalTickCounter();
+u32 GetEventRunTickCounter();
 
 void Initialize();
 void Reset();
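
Note on the slice clamping (illustrative, not part of the patch): the removed Newman Haas hack stalled GPU linked-list DMA outright while the pad was transmitting. The replacement, DMA::GetMaxSliceTicks(), instead shrinks the DMA slice to SLICE_SIZE_WHEN_TRANSMITTING_PAD ticks during a pad transfer, and while timing events are being serviced it also caps the slice so a burst cannot extend more than the configured budget past the tick recorded in s_event_run_tick_counter. The sketch below reproduces just that arithmetic with plain integers; the function and parameter names are stand-ins, not the real TimingEvents/Pad interfaces.

  #include <algorithm>
  #include <cstdint>

  using TickCount = int32_t;

  // Stand-ins for s_max_slice_ticks and SLICE_SIZE_WHEN_TRANSMITTING_PAD.
  constexpr TickCount kMaxSliceTicks = 1000;
  constexpr TickCount kPadSliceTicks = 10;

  // Returns how many ticks the next DMA burst may consume.
  TickCount GetMaxSliceTicksSketch(bool pad_transmitting, bool running_events,
                                   uint32_t global_tick_counter, uint32_t event_run_tick_counter)
  {
    const TickCount max = pad_transmitting ? kPadSliceTicks : kMaxSliceTicks;
    if (!running_events)
      return max;

    // The burst may extend to event_run_tick_counter + max; subtract whatever the
    // global counter has already consumed of that window. Unsigned wraparound plus
    // the clamp keeps the result within [0, max] even when the window is exhausted.
    const uint32_t max_ticks = event_run_tick_counter + static_cast<uint32_t>(max);
    return std::clamp(static_cast<TickCount>(max_ticks - global_tick_counter), 0, max);
  }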
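
Note on the re-entrancy guard (illustrative, not part of the patch): the single m_executing_commands flag replaces m_syncing and m_fifo_pushed. GPU::ExecuteCommands() saves the previous value with std::exchange, runs TryExecuteCommands(), restores the flag, and only the outermost caller reschedules the command tick event, so a DMA write that lands while CommandTickEvent() is already draining the FIFO does not recursively re-arm the event. Below is a minimal, generic sketch of that pattern with hypothetical DrainQueue/Reschedule stand-ins.

  #include <utility>

  struct CommandProcessor
  {
    bool executing = false;

    void DrainQueue() { /* may end up calling Execute() again re-entrantly */ }
    void Reschedule() { /* e.g. re-arm a timing event for the remaining work */ }

    void Execute()
    {
      // Remember whether an outer Execute() is already on the call stack.
      const bool was_executing = std::exchange(executing, true);

      DrainQueue();

      executing = was_executing;
      if (!was_executing)
        Reschedule(); // only the outermost call performs the follow-up work
    }
  };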
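
Note on the poly-line terminator scan (illustrative, not part of the patch): in the DrawingPolyLine state, TryExecuteCommands() walks the FIFO in vertex-sized strides (two words per vertex when shading is enabled) looking for the terminator word, which satisfies (word & 0xF000F000) == 0x50005000. The stand-alone helper below shows the same scan over a plain array; the FIFO handling and vertex-count bookkeeping of the real code are omitted.

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Returns the index of the terminator word, or words.size() if none was found.
  // stride is 1 for flat poly-lines and 2 for shaded ones; start aligns the scan
  // with the first word of the next vertex.
  size_t FindPolyLineTerminator(const std::vector<uint32_t>& words, size_t start, size_t stride)
  {
    for (size_t i = start; i < words.size(); i += stride)
    {
      if ((words[i] & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
        return i;
    }
    return words.size();
  }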