From 417bf0c3bc479cc43d705459493ca5947fa43c0d Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Wed, 10 Apr 2024 20:00:09 +1000
Subject: [PATCH] DMA: Better enforce CPU runtime during linked list and get rid of the hack for Newman Haas.

---
 data/resources/gamedb.yaml |   2 +-
 src/core/dma.cpp           |  44 ++++--------
 src/core/gpu.cpp           |  35 +++-------
 src/core/gpu.h             |   4 +-
 src/core/gpu_commands.cpp  | 137 ++++++++++++++++++-------------------
 src/core/timing_event.cpp  |   7 ++
 src/core/timing_event.h    |   1 +
 7 files changed, 105 insertions(+), 125 deletions(-)

diff --git a/data/resources/gamedb.yaml b/data/resources/gamedb.yaml
index c89868a9c..b1267792c 100644
--- a/data/resources/gamedb.yaml
+++ b/data/resources/gamedb.yaml
@@ -90470,7 +90470,7 @@ SLPS-02376:
     - DigitalController
   settings:
     dmaMaxSliceTicks: 100
-    dmaHaltTicks: 150
+    dmaHaltTicks: 200
   codes:
     - SLPS-02376
     - SLPS-02356
diff --git a/src/core/dma.cpp b/src/core/dma.cpp
index 75e375b77..a965d80e9 100644
--- a/src/core/dma.cpp
+++ b/src/core/dma.cpp
@@ -47,7 +47,7 @@ static constexpr PhysicalMemoryAddress LINKED_LIST_TERMINATOR = UINT32_C(0x00FFF
 static constexpr TickCount LINKED_LIST_HEADER_READ_TICKS = 10;
 static constexpr TickCount LINKED_LIST_BLOCK_SETUP_TICKS = 5;
 
-static constexpr TickCount HALT_TICKS_WHEN_TRANSMITTING_PAD = 100;
+static constexpr TickCount SLICE_SIZE_WHEN_TRANSMITTING_PAD = 10;
 
 struct ChannelState
 {
@@ -194,7 +194,7 @@ static TickCount TransferDeviceToMemory(u32 address, u32 increment, u32 word_cou
 template<Channel channel>
 static TickCount TransferMemoryToDevice(u32 address, u32 increment, u32 word_count);
 
-
+static TickCount GetMaxSliceTicks();
 
 // configuration
 static TickCount s_max_slice_ticks = 1000;
@@ -543,6 +543,17 @@ ALWAYS_INLINE_RELEASE void DMA::CompleteTransfer(Channel channel, ChannelState&
   }
 }
 
+TickCount DMA::GetMaxSliceTicks()
+{
+  const TickCount max = Pad::IsTransmitting() ? SLICE_SIZE_WHEN_TRANSMITTING_PAD : s_max_slice_ticks;
+  if (!TimingEvents::IsRunningEvents())
+    return max;
+
+  const u32 current_ticks = TimingEvents::GetGlobalTickCounter();
+  const u32 max_ticks = TimingEvents::GetEventRunTickCounter() + static_cast<u32>(max);
+  return std::clamp(static_cast<TickCount>(max_ticks - current_ticks), 0, max);
+}
+
 template<DMA::Channel channel>
 bool DMA::TransferChannel()
 {
@@ -586,35 +597,13 @@ bool DMA::TransferChannel()
     return true;
   }
 
-  if constexpr (channel == Channel::GPU)
-  {
-    // Plenty of games seem to suffer from this issue where they have a linked list DMA going while polling the
-    // controller. Having a large slice size causes the serial transfer to complete before the silly busy wait
-    // in the BIOS poll routine returns, resulting in it thinking that the controller is disconnected. Some games
-    // are very sensitive to this (e.g. Newman Haas Racing), to the point that even using a slice size of 1 is
-    // insufficient for avoiding the race, probably due to the linked list layout.
-    //
-    // Therefore, without major refactoring to ensure the CPU runs every DMA block, and the associated performance
-    // penalty, we just halt the DMA until the serial transfers have completed. To reduce the chances of this
-    // significantly affecting timing, we add accumulate the ticks that have been "lost", and allow them to be
-    // "used up" when the transfer does happen.
-    //
-    if (Pad::IsTransmitting())
-    {
-      Log_DebugFmt("DMA transfer while transmitting pad - {} ticks are buffered", -s_halt_ticks_remaining);
-      if (!s_unhalt_event->IsActive())
-        s_unhalt_event->SetIntervalAndSchedule(HALT_TICKS_WHEN_TRANSMITTING_PAD);
-      return false;
-    }
-  }
-
   Log_DebugFmt("DMA[{}]: Copying linked list starting at 0x{:08X} to device", channel, current_address);
 
   // Prove to the compiler that nothing's going to modify these.
   const u8* const ram_ptr = Bus::g_ram;
   const u32 mask = Bus::g_ram_mask;
 
-  const TickCount slice_ticks = s_max_slice_ticks + -s_halt_ticks_remaining;
+  const TickCount slice_ticks = GetMaxSliceTicks();
   TickCount remaining_ticks = slice_ticks;
   while (cs.request && remaining_ticks > 0)
   {
@@ -658,9 +647,6 @@ bool DMA::TransferChannel()
   cs.base_address = current_address;
   if (cs.request)
   {
-    // don't actually delay the transfer for the buffered ticks, this variable is dual-purposed.
-    s_halt_ticks_remaining = std::max(s_halt_ticks_remaining, 0);
-
     // stall the transfer for a bit if we ran for too long
     HaltTransfer(s_halt_ticks);
    return false;
@@ -681,7 +667,7 @@ bool DMA::TransferChannel()
   const u32 block_size = cs.block_control.request.GetBlockSize();
   u32 blocks_remaining = cs.block_control.request.GetBlockCount();
 
-  TickCount ticks_remaining = s_max_slice_ticks;
+  TickCount ticks_remaining = GetMaxSliceTicks();
 
   if (copy_to_device)
   {
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 2698454ee..9fc0cc810 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -467,7 +467,6 @@ void GPU::WriteRegister(u32 offset, u32 value)
     case 0x00:
       m_fifo.Push(value);
       ExecuteCommands();
-      UpdateCommandTickEvent();
      return;
 
     case 0x04:
@@ -495,16 +494,7 @@ void GPU::DMARead(u32* words, u32 word_count)
 
 void GPU::EndDMAWrite()
 {
-  m_fifo_pushed = true;
-  if (!m_syncing)
-  {
-    ExecuteCommands();
-    UpdateCommandTickEvent();
-  }
-  else
-  {
-    UpdateDMARequest();
-  }
+  ExecuteCommands();
 }
 
 /**
@@ -1029,26 +1019,24 @@ void GPU::CRTCTickEvent(TickCount ticks)
 void GPU::CommandTickEvent(TickCount ticks)
 {
   m_pending_command_ticks -= SystemTicksToGPUTicks(ticks);
 
-  m_command_tick_event->Deactivate();
-  // we can be syncing if this came from a DMA write. recursively executing commands would be bad.
-  if (!m_syncing)
-    ExecuteCommands();
-
-  UpdateGPUIdle();
-
-  if (m_pending_command_ticks <= 0)
-    m_pending_command_ticks = 0;
-  else
-    m_command_tick_event->SetIntervalAndSchedule(GPUTicksToSystemTicks(m_pending_command_ticks));
+  m_executing_commands = true;
+  ExecuteCommands();
+  UpdateCommandTickEvent();
+  m_executing_commands = false;
 }
 
 void GPU::UpdateCommandTickEvent()
 {
   if (m_pending_command_ticks <= 0)
+  {
+    m_pending_command_ticks = 0;
     m_command_tick_event->Deactivate();
-  else if (!m_command_tick_event->IsActive())
+  }
+  else
+  {
     m_command_tick_event->SetIntervalAndSchedule(GPUTicksToSystemTicks(m_pending_command_ticks));
+  }
 }
 
 void GPU::ConvertScreenCoordinatesToDisplayCoordinates(float window_x, float window_y, float* display_x,
@@ -1121,7 +1109,6 @@ u32 GPU::ReadGPUREAD()
 
       // end of transfer, catch up on any commands which were written (unlikely)
       ExecuteCommands();
-      UpdateCommandTickEvent();
       break;
     }
   }
diff --git a/src/core/gpu.h b/src/core/gpu.h
index bbfe4e6eb..9ecda46ff 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -307,6 +307,7 @@ protected:
   void WriteGP1(u32 value);
   void EndCommand();
   void ExecuteCommands();
+  void TryExecuteCommands();
   void HandleGetGPUInfoCommand(u32 value);
 
   // Rendering in the backend
@@ -542,8 +543,7 @@ protected:
   u32 m_GPUREAD_latch = 0;
 
   /// True if currently executing/syncing.
-  bool m_syncing = false;
-  bool m_fifo_pushed = false;
+  bool m_executing_commands = false;
 
   struct VRAMTransfer
   {
diff --git a/src/core/gpu_commands.cpp b/src/core/gpu_commands.cpp
index 6ad306b7e..9d444f598 100644
--- a/src/core/gpu_commands.cpp
+++ b/src/core/gpu_commands.cpp
@@ -25,94 +25,93 @@ static constexpr u32 ReplaceZero(u32 value, u32 value_for_zero)
   return value == 0 ? value_for_zero : value;
 }
 
-void GPU::ExecuteCommands()
+void GPU::TryExecuteCommands()
 {
-  m_syncing = true;
-
-  for (;;)
+  while (m_pending_command_ticks <= m_max_run_ahead && !m_fifo.IsEmpty())
   {
-    if (m_pending_command_ticks <= m_max_run_ahead && !m_fifo.IsEmpty())
+    switch (m_blitter_state)
     {
-      switch (m_blitter_state)
+      case BlitterState::Idle:
       {
-        case BlitterState::Idle:
+        const u32 command = FifoPeek(0) >> 24;
+        if ((this->*s_GP0_command_handler_table[command])())
+          continue;
+        else
+          return;
+      }
+
+      case BlitterState::WritingVRAM:
+      {
+        DebugAssert(m_blit_remaining_words > 0);
+        const u32 words_to_copy = std::min(m_blit_remaining_words, m_fifo.GetSize());
+        m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
+        for (u32 i = 0; i < words_to_copy; i++)
+          m_blit_buffer.push_back(FifoPop());
+        m_blit_remaining_words -= words_to_copy;
+
+        Log_DebugPrintf("VRAM write burst of %u words, %u words remaining", words_to_copy, m_blit_remaining_words);
+        if (m_blit_remaining_words == 0)
+          FinishVRAMWrite();
+
+        continue;
+      }
+
+      case BlitterState::ReadingVRAM:
+      {
+        return;
+      }
+      break;
+
+      case BlitterState::DrawingPolyLine:
+      {
+        const u32 words_per_vertex = m_render_command.shading_enable ? 2 : 1;
+        u32 terminator_index =
+          m_render_command.shading_enable ? ((static_cast<u32>(m_blit_buffer.size()) & 1u) ^ 1u) : 0u;
+        for (; terminator_index < m_fifo.GetSize(); terminator_index += words_per_vertex)
         {
-          const u32 command = FifoPeek(0) >> 24;
-          if ((this->*s_GP0_command_handler_table[command])())
-            continue;
-          else
-            goto batch_done;
+          // polyline must have at least two vertices, and the terminator is (word & 0xf000f000) == 0x50005000.
+          // terminator is on the first word for the vertex
+          if ((FifoPeek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
+            break;
         }
 
-        case BlitterState::WritingVRAM:
+        const bool found_terminator = (terminator_index < m_fifo.GetSize());
+        const u32 words_to_copy = std::min(terminator_index, m_fifo.GetSize());
+        if (words_to_copy > 0)
         {
-          DebugAssert(m_blit_remaining_words > 0);
-          const u32 words_to_copy = std::min(m_blit_remaining_words, m_fifo.GetSize());
           m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
           for (u32 i = 0; i < words_to_copy; i++)
             m_blit_buffer.push_back(FifoPop());
-          m_blit_remaining_words -= words_to_copy;
-
-          Log_DebugPrintf("VRAM write burst of %u words, %u words remaining", words_to_copy, m_blit_remaining_words);
-          if (m_blit_remaining_words == 0)
-            FinishVRAMWrite();
+        }
 
+        Log_DebugPrintf("Added %u words to polyline", words_to_copy);
+        if (found_terminator)
+        {
+          // drop terminator
+          m_fifo.RemoveOne();
+          Log_DebugPrintf("Drawing poly-line with %u vertices", GetPolyLineVertexCount());
+          DispatchRenderCommand();
+          m_blit_buffer.clear();
+          EndCommand();
           continue;
         }
-
-        case BlitterState::ReadingVRAM:
-        {
-          goto batch_done;
-        }
-        break;
-
-        case BlitterState::DrawingPolyLine:
-        {
-          const u32 words_per_vertex = m_render_command.shading_enable ? 2 : 1;
-          u32 terminator_index =
-            m_render_command.shading_enable ? ((static_cast<u32>(m_blit_buffer.size()) & 1u) ^ 1u) : 0u;
-          for (; terminator_index < m_fifo.GetSize(); terminator_index += words_per_vertex)
-          {
-            // polyline must have at least two vertices, and the terminator is (word & 0xf000f000) == 0x50005000.
-            // terminator is on the first word for the vertex
-            if ((FifoPeek(terminator_index) & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
-              break;
-          }
-
-          const bool found_terminator = (terminator_index < m_fifo.GetSize());
-          const u32 words_to_copy = std::min(terminator_index, m_fifo.GetSize());
-          if (words_to_copy > 0)
-          {
-            m_blit_buffer.reserve(m_blit_buffer.size() + words_to_copy);
-            for (u32 i = 0; i < words_to_copy; i++)
-              m_blit_buffer.push_back(FifoPop());
-          }
-
-          Log_DebugPrintf("Added %u words to polyline", words_to_copy);
-          if (found_terminator)
-          {
-            // drop terminator
-            m_fifo.RemoveOne();
-            Log_DebugPrintf("Drawing poly-line with %u vertices", GetPolyLineVertexCount());
-            DispatchRenderCommand();
-            m_blit_buffer.clear();
-            EndCommand();
-            continue;
-          }
-        }
-        break;
       }
-    }
-
-  batch_done:
-    m_fifo_pushed = false;
-    UpdateDMARequest();
-    if (!m_fifo_pushed)
       break;
+    }
   }
+}
 
+void GPU::ExecuteCommands()
+{
+  const bool was_executing_from_event = std::exchange(m_executing_commands, true);
+
+  TryExecuteCommands();
+  UpdateDMARequest();
   UpdateGPUIdle();
-  m_syncing = false;
+
+  m_executing_commands = was_executing_from_event;
+  if (!was_executing_from_event)
+    UpdateCommandTickEvent();
 }
 
 void GPU::EndCommand()
diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp
index 845708e2c..556b54a07 100644
--- a/src/core/timing_event.cpp
+++ b/src/core/timing_event.cpp
@@ -17,6 +17,7 @@ static TimingEvent* s_active_events_tail;
 static TimingEvent* s_current_event = nullptr;
 static u32 s_active_event_count = 0;
 static u32 s_global_tick_counter = 0;
+static u32 s_event_run_tick_counter = 0;
 static bool s_frame_done = false;
 
 u32 GetGlobalTickCounter()
 {
   return s_global_tick_counter;
 }
 
+u32 GetEventRunTickCounter()
+{
+  return s_event_run_tick_counter;
+}
+
 void Initialize()
 {
   Reset();
@@ -293,6 +299,7 @@ void RunEvents()
     if (pending_ticks >= s_active_events_head->GetDowncount())
     {
       CPU::ResetPendingTicks();
+      s_event_run_tick_counter = s_global_tick_counter + static_cast<u32>(pending_ticks);
 
       do
       {
diff --git a/src/core/timing_event.h b/src/core/timing_event.h
index 08c2698da..01733ba51 100644
--- a/src/core/timing_event.h
+++ b/src/core/timing_event.h
@@ -81,6 +81,7 @@ public:
 namespace TimingEvents {
 
 u32 GetGlobalTickCounter();
+u32 GetEventRunTickCounter();
 
 void Initialize();
 void Reset();
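
Note on the slice clamping (illustrative, not part of the patch): the removed Newman Haas hack stalled GPU linked-list DMA outright while the pad was transmitting. The replacement, DMA::GetMaxSliceTicks(), instead shrinks the DMA slice to SLICE_SIZE_WHEN_TRANSMITTING_PAD ticks during a pad transfer, and while timing events are being serviced it also caps the slice so a burst cannot extend more than the configured budget past the tick recorded in s_event_run_tick_counter. The sketch below reproduces just that arithmetic with plain integers; the function and parameter names are stand-ins, not the real TimingEvents/Pad interfaces.

  #include <algorithm>
  #include <cstdint>

  using TickCount = int32_t;

  // Stand-ins for s_max_slice_ticks and SLICE_SIZE_WHEN_TRANSMITTING_PAD.
  constexpr TickCount kMaxSliceTicks = 1000;
  constexpr TickCount kPadSliceTicks = 10;

  // Returns how many ticks the next DMA burst may consume.
  TickCount GetMaxSliceTicksSketch(bool pad_transmitting, bool running_events,
                                   uint32_t global_tick_counter, uint32_t event_run_tick_counter)
  {
    const TickCount max = pad_transmitting ? kPadSliceTicks : kMaxSliceTicks;
    if (!running_events)
      return max;

    // The burst may extend to event_run_tick_counter + max; subtract whatever the
    // global counter has already consumed of that window. Unsigned wraparound plus
    // the clamp keeps the result within [0, max] even when the window is exhausted.
    const uint32_t max_ticks = event_run_tick_counter + static_cast<uint32_t>(max);
    return std::clamp(static_cast<TickCount>(max_ticks - global_tick_counter), 0, max);
  }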
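
Note on the re-entrancy guard (illustrative, not part of the patch): the single m_executing_commands flag replaces m_syncing and m_fifo_pushed. GPU::ExecuteCommands() saves the previous value with std::exchange, runs TryExecuteCommands(), restores the flag, and only the outermost caller reschedules the command tick event, so a DMA write that lands while CommandTickEvent() is already draining the FIFO does not recursively re-arm the event. Below is a minimal, generic sketch of that pattern with hypothetical DrainQueue/Reschedule stand-ins.

  #include <utility>

  struct CommandProcessor
  {
    bool executing = false;

    void DrainQueue() { /* may end up calling Execute() again re-entrantly */ }
    void Reschedule() { /* e.g. re-arm a timing event for the remaining work */ }

    void Execute()
    {
      // Remember whether an outer Execute() is already on the call stack.
      const bool was_executing = std::exchange(executing, true);

      DrainQueue();

      executing = was_executing;
      if (!was_executing)
        Reschedule(); // only the outermost call performs the follow-up work
    }
  };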
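
Note on the poly-line terminator scan (illustrative, not part of the patch): in the DrawingPolyLine state, TryExecuteCommands() walks the FIFO in vertex-sized strides (two words per vertex when shading is enabled) looking for the terminator word, which satisfies (word & 0xF000F000) == 0x50005000. The stand-alone helper below shows the same scan over a plain array; the FIFO handling and vertex-count bookkeeping of the real code are omitted.

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Returns the index of the terminator word, or words.size() if none was found.
  // stride is 1 for flat poly-lines and 2 for shaded ones; start aligns the scan
  // with the first word of the next vertex.
  size_t FindPolyLineTerminator(const std::vector<uint32_t>& words, size_t start, size_t stride)
  {
    for (size_t i = start; i < words.size(); i += stride)
    {
      if ((words[i] & UINT32_C(0xF000F000)) == UINT32_C(0x50005000))
        return i;
    }
    return words.size();
  }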