Merge pull request #11067 from K0bin/cache-vertex-size

OpcodeDecoding: Cache vertex sizes
2022-09-18 22:38:06 -04:00 · 2022-09-18 22:38:06 -04:00 · 6f4f5b0b7b
parent d19994b4bd 2db74e7f21
commit 6f4f5b0b7b
7 changed files with 139 additions and 82 deletions
--- a/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
+++ b/Source/Core/Core/FifoPlayer/FifoPlayer.cpp
@ -55,6 +55,11 @@ public:

  OPCODE_CALLBACK(CPState& GetCPState()) { return m_cpmem; }

+  OPCODE_CALLBACK(u32 GetVertexSize(u8 vat))
+  {
+    return VertexLoaderBase::GetVertexSize(GetCPState().vtx_desc, GetCPState().vtx_attr[vat]);
+  }
+
  bool m_start_of_primitives = false;
  bool m_end_of_primitives = false;
  bool m_efb_copy = false;
--- a/Source/Core/Core/FifoPlayer/FifoRecorder.cpp
+++ b/Source/Core/Core/FifoPlayer/FifoRecorder.cpp
@ -45,6 +45,11 @@ public:

  OPCODE_CALLBACK(CPState& GetCPState()) { return m_cpmem; }

+  OPCODE_CALLBACK(u32 GetVertexSize(u8 vat))
+  {
+    return VertexLoaderBase::GetVertexSize(GetCPState().vtx_desc, GetCPState().vtx_attr[vat]);
+  }
+
 private:
  void ProcessVertexComponent(CPArray array_index, VertexComponentFormat array_type,
                              u32 component_offset, u32 vertex_size, u16 num_vertices,
--- a/Source/Core/DolphinQt/FIFO/FIFOAnalyzer.cpp
+++ b/Source/Core/DolphinQt/FIFO/FIFOAnalyzer.cpp
@ -316,6 +316,11 @@ public:

  OPCODE_CALLBACK(CPState& GetCPState()) { return m_cpmem; }

+  OPCODE_CALLBACK(u32 GetVertexSize(u8 vat))
+  {
+    return VertexLoaderBase::GetVertexSize(GetCPState().vtx_desc, GetCPState().vtx_attr[vat]);
+  }
+
  QString text;
  CPState m_cpmem;
 };
@ -731,6 +736,11 @@ public:

  OPCODE_CALLBACK(CPState& GetCPState()) { return m_cpmem; }

+  OPCODE_CALLBACK(u32 GetVertexSize(u8 vat))
+  {
+    return VertexLoaderBase::GetVertexSize(GetCPState().vtx_desc, GetCPState().vtx_attr[vat]);
+  }
+
  QString text;
  CPState m_cpmem;
 };
--- a/Source/Core/VideoCommon/OpcodeDecoding.cpp
+++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp
@ -122,7 +122,7 @@ public:
    // HACK
    DataReader src{const_cast<u8*>(vertex_data), const_cast<u8*>(vertex_data) + size};
    const u32 bytes =
-        VertexLoaderManager::RunVertices(vat, primitive, num_vertices, src, is_preprocess);
+        VertexLoaderManager::RunVertices<is_preprocess>(vat, primitive, num_vertices, src);

    ASSERT(bytes == size);

@ -228,6 +228,12 @@ public:
      return g_main_cp_state;
  }

+  OPCODE_CALLBACK(u32 GetVertexSize(u8 vat))
+  {
+    VertexLoaderBase* loader = VertexLoaderManager::RefreshLoader<is_preprocess>(vat);
+    return loader->m_vertex_size;
+  }
+
  u32 m_cycles = 0;
  bool m_in_display_list = false;
 };
--- a/Source/Core/VideoCommon/OpcodeDecoding.h
+++ b/Source/Core/VideoCommon/OpcodeDecoding.h
@ -110,6 +110,8 @@ public:

  // Get the current CP state.  Needed for vertex decoding; will also be mutated for CP commands.
  virtual CPState& GetCPState() = 0;
+
+  virtual u32 GetVertexSize(u8 vat) = 0;
 #endif
 };

@ -229,8 +231,7 @@ static DOLPHIN_FORCE_INLINE u32 RunCommand(const u8* data, u32 available, T& cal
          (cmdbyte & OpcodeDecoder::GX_PRIMITIVE_MASK) >> OpcodeDecoder::GX_PRIMITIVE_SHIFT);
      const u8 vat = cmdbyte & OpcodeDecoder::GX_VAT_MASK;

-      const u32 vertex_size = VertexLoaderBase::GetVertexSize(callback.GetCPState().vtx_desc,
-                                                              callback.GetCPState().vtx_attr[vat]);
+      const u32 vertex_size = callback.GetVertexSize(vat);
      const u16 num_vertices = Common::swap16(&data[1]);

      if (available < 3 + num_vertices * vertex_size)
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@ -54,7 +54,6 @@ Common::EnumMap<u8*, CPArray::TexCoord7> cached_arraybases;
 BitSet8 g_main_vat_dirty;
 BitSet8 g_preprocess_vat_dirty;
 bool g_bases_dirty;  // Main only
-u8 g_current_vat;    // Main only
 std::array<VertexLoaderBase*, CP_NUM_VAT_REG> g_main_vertex_loaders;
 std::array<VertexLoaderBase*, CP_NUM_VAT_REG> g_preprocess_vertex_loaders;

@ -78,7 +77,7 @@ void Clear()
 void UpdateVertexArrayPointers()
 {
  // Anything to update?
-  if (!g_bases_dirty)
+  if (!g_bases_dirty) [[likely]]
    return;

  // Some games such as Burnout 2 can put invalid addresses into
@ -198,59 +197,50 @@ NativeVertexFormat* GetUberVertexFormat(const PortableVertexDeclaration& decl)
  return GetOrCreateMatchingFormat(new_decl);
 }

-static VertexLoaderBase* RefreshLoader(int vtx_attr_group, bool preprocess = false)
+namespace detail
 {
-  CPState* state = preprocess ? &g_preprocess_cp_state : &g_main_cp_state;
-  BitSet8& attr_dirty = preprocess ? g_preprocess_vat_dirty : g_main_vat_dirty;
-  auto& vertex_loaders = preprocess ? g_main_vertex_loaders : g_preprocess_vertex_loaders;
-  g_current_vat = vtx_attr_group;
+template <bool IsPreprocess>
+VertexLoaderBase* GetOrCreateLoader(int vtx_attr_group)
+{
+  constexpr CPState* state = IsPreprocess ? &g_preprocess_cp_state : &g_main_cp_state;
+  constexpr BitSet8& attr_dirty = IsPreprocess ? g_preprocess_vat_dirty : g_main_vat_dirty;
+  constexpr auto& vertex_loaders =
+      IsPreprocess ? g_preprocess_vertex_loaders : g_main_vertex_loaders;

  VertexLoaderBase* loader;
-  if (attr_dirty[vtx_attr_group])
-  {
-    // We are not allowed to create a native vertex format on preprocessing as this is on the wrong
-    // thread
-    bool check_for_native_format = !preprocess;

-    VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]);
-    std::lock_guard<std::mutex> lk(s_vertex_loader_map_lock);
-    VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid);
-    if (iter != s_vertex_loader_map.end())
-    {
-      loader = iter->second.get();
-      check_for_native_format &= !loader->m_native_vertex_format;
-    }
-    else
-    {
-      s_vertex_loader_map[uid] =
-          VertexLoaderBase::CreateVertexLoader(state->vtx_desc, state->vtx_attr[vtx_attr_group]);
-      loader = s_vertex_loader_map[uid].get();
-      INCSTAT(g_stats.num_vertex_loaders);
-    }
-    if (check_for_native_format)
-    {
-      // search for a cached native vertex format
-      const PortableVertexDeclaration& format = loader->m_native_vtx_decl;
-      std::unique_ptr<NativeVertexFormat>& native = s_native_vertex_map[format];
-      if (!native)
-        native = g_renderer->CreateNativeVertexFormat(format);
-      loader->m_native_vertex_format = native.get();
-    }
-    vertex_loaders[vtx_attr_group] = loader;
-    attr_dirty[vtx_attr_group] = false;
+  // We are not allowed to create a native vertex format on preprocessing as this is on the wrong
+  // thread
+  bool check_for_native_format = !IsPreprocess;
+
+  VertexLoaderUID uid(state->vtx_desc, state->vtx_attr[vtx_attr_group]);
+  std::lock_guard<std::mutex> lk(s_vertex_loader_map_lock);
+  VertexLoaderMap::iterator iter = s_vertex_loader_map.find(uid);
+  if (iter != s_vertex_loader_map.end())
+  {
+    loader = iter->second.get();
+    check_for_native_format &= !loader->m_native_vertex_format;
  }
  else
  {
-    loader = vertex_loaders[vtx_attr_group];
+    auto [it, added] = s_vertex_loader_map.try_emplace(
+        uid,
+        VertexLoaderBase::CreateVertexLoader(state->vtx_desc, state->vtx_attr[vtx_attr_group]));
+    loader = it->second.get();
+    INCSTAT(g_stats.num_vertex_loaders);
  }
-
-  // Lookup pointers for any vertex arrays.
-  if (!preprocess)
-    UpdateVertexArrayPointers();
-
+  if (check_for_native_format)
+  {
+    // search for a cached native vertex format
+    loader->m_native_vertex_format = GetOrCreateMatchingFormat(loader->m_native_vtx_decl);
+  }
+  vertex_loaders[vtx_attr_group] = loader;
+  attr_dirty[vtx_attr_group] = false;
  return loader;
 }

+}  // namespace detail
+
 static void CheckCPConfiguration(int vtx_attr_group)
 {
  // Validate that the XF input configuration matches the CP configuration
@ -335,53 +325,61 @@ static void CheckCPConfiguration(int vtx_attr_group)
  }
 }

-int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, DataReader src,
-                bool is_preprocess)
+template <bool IsPreprocess>
+int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, DataReader src)
 {
  if (count == 0)
    return 0;
  ASSERT(count > 0);

-  VertexLoaderBase* loader = RefreshLoader(vtx_attr_group, is_preprocess);
+  VertexLoaderBase* loader = RefreshLoader<IsPreprocess>(vtx_attr_group);

  int size = count * loader->m_vertex_size;
  if ((int)src.size() < size)
    return -1;

-  if (is_preprocess)
-    return size;
-
-  CheckCPConfiguration(vtx_attr_group);
-
-  // If the native vertex format changed, force a flush.
-  if (loader->m_native_vertex_format != s_current_vtx_fmt ||
-      loader->m_native_components != g_current_components)
+  if constexpr (!IsPreprocess)
  {
-    g_vertex_manager->Flush();
+    // Doing early return for the opposite case would be cleaner
+    // but triggers a false unreachable code warning in MSVC debug builds.
+
+    CheckCPConfiguration(vtx_attr_group);
+
+    // If the native vertex format changed, force a flush.
+    if (loader->m_native_vertex_format != s_current_vtx_fmt ||
+        loader->m_native_components != g_current_components)
+    {
+      g_vertex_manager->Flush();
+    }
+    s_current_vtx_fmt = loader->m_native_vertex_format;
+    g_current_components = loader->m_native_components;
+    VertexShaderManager::SetVertexFormat(loader->m_native_components);
+
+    // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
+    // They still need to go through vertex loading, because we need to calculate a zfreeze
+    // reference slope.
+    bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
+                    primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
+
+    DataReader dst = g_vertex_manager->PrepareForAdditionalData(
+        primitive, count, loader->m_native_vtx_decl.stride, cullall);
+
+    count = loader->RunVertices(src, dst, count);
+
+    g_vertex_manager->AddIndices(primitive, count);
+    g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);
+
+    ADDSTAT(g_stats.this_frame.num_prims, count);
+    INCSTAT(g_stats.this_frame.num_primitive_joins);
  }
-  s_current_vtx_fmt = loader->m_native_vertex_format;
-  g_current_components = loader->m_native_components;
-  VertexShaderManager::SetVertexFormat(loader->m_native_components);
-
-  // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
-  // They still need to go through vertex loading, because we need to calculate a zfreeze refrence
-  // slope.
-  bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
-                  primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
-
-  DataReader dst = g_vertex_manager->PrepareForAdditionalData(
-      primitive, count, loader->m_native_vtx_decl.stride, cullall);
-
-  count = loader->RunVertices(src, dst, count);
-
-  g_vertex_manager->AddIndices(primitive, count);
-  g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);
-
-  ADDSTAT(g_stats.this_frame.num_prims, count);
-  INCSTAT(g_stats.this_frame.num_primitive_joins);
  return size;
 }

+template int RunVertices<false>(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count,
+                                DataReader src);
+template int RunVertices<true>(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count,
+                               DataReader src);
+
 NativeVertexFormat* GetCurrentVertexFormat()
 {
  return s_current_vtx_fmt;
--- a/Source/Core/VideoCommon/VertexLoaderManager.h
+++ b/Source/Core/VideoCommon/VertexLoaderManager.h
@ -42,8 +42,16 @@ NativeVertexFormat* GetOrCreateMatchingFormat(const PortableVertexDeclaration& d
 NativeVertexFormat* GetUberVertexFormat(const PortableVertexDeclaration& decl);

 // Returns -1 if buf_size is insufficient, else the amount of bytes consumed
-int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, DataReader src,
-                bool is_preprocess);
+template <bool IsPreprocess = false>
+int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, DataReader src);
+
+namespace detail
+{
+// This will look for an existing loader in the global hashmap or create a new one if there is none.
+// It should not be used directly because RefreshLoader() has another cache for fast lookups.
+template <bool IsPreprocess = false>
+VertexLoaderBase* GetOrCreateLoader(int vtx_attr_group);
+}  // namespace detail

 NativeVertexFormat* GetCurrentVertexFormat();

@ -66,7 +74,31 @@ extern u32 g_current_components;
 extern BitSet8 g_main_vat_dirty;
 extern BitSet8 g_preprocess_vat_dirty;
 extern bool g_bases_dirty;  // Main only
-extern u8 g_current_vat;    // Main only
 extern std::array<VertexLoaderBase*, CP_NUM_VAT_REG> g_main_vertex_loaders;
 extern std::array<VertexLoaderBase*, CP_NUM_VAT_REG> g_preprocess_vertex_loaders;
+
+template <bool IsPreprocess = false>
+VertexLoaderBase* RefreshLoader(int vtx_attr_group)
+{
+  constexpr const BitSet8& attr_dirty = IsPreprocess ? g_preprocess_vat_dirty : g_main_vat_dirty;
+  constexpr const auto& vertex_loaders =
+      IsPreprocess ? g_preprocess_vertex_loaders : g_main_vertex_loaders;
+
+  VertexLoaderBase* loader;
+  if (!attr_dirty[vtx_attr_group]) [[likely]]
+  {
+    loader = vertex_loaders[vtx_attr_group];
+  }
+  else [[unlikely]]
+  {
+    loader = detail::GetOrCreateLoader<IsPreprocess>(vtx_attr_group);
+  }
+
+  // Lookup pointers for any vertex arrays.
+  if constexpr (!IsPreprocess)
+    UpdateVertexArrayPointers();
+
+  return loader;
+}
+
 }  // namespace VertexLoaderManager