diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp
index 4ef11a1283..0f44d16023 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -376,6 +376,22 @@ static void CheckCPConfiguration(int vtx_attr_group)
   }
 }
 
+static bool CanSplit(OpcodeDecoder::Primitive primitive)
+{
+  // Splitting is currently only implemented for the easy cases (individual lines/points/triangles)
+  switch (primitive)
+  {
+  case OpcodeDecoder::Primitive::GX_DRAW_QUADS:
+  case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2:
+  case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES:
+  case OpcodeDecoder::Primitive::GX_DRAW_LINES:
+  case OpcodeDecoder::Primitive::GX_DRAW_POINTS:
+    return true;
+  default:
+    return false;
+  }
+}
+
 template <bool IsPreprocess>
 int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, const u8* src)
 {
@@ -414,9 +430,9 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
 
     // CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data
     // Therefore it's only useful to check if culling could remove a flush
-    const bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
-                              primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES &&
-                              !g_vertex_manager->HasSendableVertices();
+    bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
+                        primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES &&
+                        !g_vertex_manager->HasSendableVertices();
 
     // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
     // They still need to go through vertex loading, because we need to calculate a zfreeze
@@ -425,24 +441,35 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
                           primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
 
     const int stride = loader->m_native_vtx_decl.stride;
-    DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, count, stride,
-                                                                cullall || can_cpu_cull);
-
-    count = loader->RunVertices(src, dst.GetPointer(), count);
-
-    if (can_cpu_cull && !cullall)
+    do
     {
-      if (!g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), count))
+      const int max_vertices = 16380;  // Max is 16383, but 16380 is divisible by both 4 and 3
+      const int run = CanSplit(primitive) && count > max_vertices ? max_vertices : count;
+      count -= run;
+      DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, run, stride,
+                                                                  cullall || can_cpu_cull);
+
+      const int num_loaded = loader->RunVertices(src, dst.GetPointer(), run);
+      src += loader->m_vertex_size * max_vertices;
+
+      if (can_cpu_cull && !cullall)
       {
-        DataReader new_dst = g_vertex_manager->DisableCullAll(stride);
-        memmove(new_dst.GetPointer(), dst.GetPointer(), count * stride);
+        const bool all_culled =
+            g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), num_loaded);
+        if (!all_culled)
+        {
+          DataReader new_dst = g_vertex_manager->DisableCullAll(stride);
+          memmove(new_dst.GetPointer(), dst.GetPointer(), num_loaded * stride);
+          can_cpu_cull = false;
+        }
       }
-    }
 
-    g_vertex_manager->AddIndices(primitive, count);
-    g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);
+      g_vertex_manager->AddIndices(primitive, num_loaded);
+      g_vertex_manager->FlushData(num_loaded, stride);
+
+      ADDSTAT(g_stats.this_frame.num_prims, num_loaded);
+    } while (count);
 
-    ADDSTAT(g_stats.this_frame.num_prims, count);
     INCSTAT(g_stats.this_frame.num_primitive_joins);
   }
   return size;