Merge pull request #11673 from K0bin/vertex-loader-micro-opt

Vertex Loader Microoptimization
2023-03-30 11:05:02 +02:00 · 2023-03-30 11:05:02 +02:00 · 50a45bd614
parent 806ea59d77 93fce0e4b6
commit 50a45bd614
4 changed files with 64 additions and 59 deletions
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@ -258,11 +258,6 @@ VertexLoaderBase* GetOrCreateLoader(int vtx_attr_group)

 static void CheckCPConfiguration(int vtx_attr_group)
 {
-  if (!g_needs_cp_xf_consistency_check) [[likely]]
-    return;
-
-  g_needs_cp_xf_consistency_check = false;
-
  // Validate that the XF input configuration matches the CP configuration
  u32 num_cp_colors = std::count_if(
      g_main_cp_state.vtx_desc.low.Color.begin(), g_main_cp_state.vtx_desc.low.Color.end(),
@ -359,20 +354,25 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
    // Doing early return for the opposite case would be cleaner
    // but triggers a false unreachable code warning in MSVC debug builds.

+    if (g_needs_cp_xf_consistency_check) [[unlikely]]
+    {
      CheckCPConfiguration(vtx_attr_group);
+      g_needs_cp_xf_consistency_check = false;
+    }

    // If the native vertex format changed, force a flush.
    if (loader->m_native_vertex_format != s_current_vtx_fmt ||
        loader->m_native_components != g_current_components) [[unlikely]]
    {
      g_vertex_manager->Flush();
-    }
+
      s_current_vtx_fmt = loader->m_native_vertex_format;
      g_current_components = loader->m_native_components;
      auto& system = Core::System::GetInstance();
      auto& vertex_shader_manager = system.GetVertexShaderManager();
      vertex_shader_manager.SetVertexFormat(loader->m_native_components,
                                            loader->m_native_vertex_format->GetVertexDeclaration());
+    }

    // CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data
    // Therefore it's only useful to check if culling could remove a flush
--- a/Source/Core/VideoCommon/VertexManagerBase.cpp
+++ b/Source/Core/VideoCommon/VertexManagerBase.cpp
@ -140,7 +140,7 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
  PrimitiveType new_primitive_type = g_ActiveConfig.backend_info.bSupportsPrimitiveRestart ?
                                         primitive_from_gx_pr[primitive] :
                                         primitive_from_gx[primitive];
-  if (m_current_primitive_type != new_primitive_type)
+  if (m_current_primitive_type != new_primitive_type) [[unlikely]]
  {
    Flush();

@ -149,9 +149,11 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
    SetRasterizationStateChanged();
  }

+  u32 remaining_indices = GetRemainingIndices(primitive);
+  u32 remaining_index_generator_indices = m_index_generator.GetRemainingIndices(primitive);
+
  // Check for size in buffer, if the buffer gets full, call Flush()
-  if (!m_is_flushed && (count > m_index_generator.GetRemainingIndices(primitive) ||
-                        count > GetRemainingIndices(primitive) ||
+  if (!m_is_flushed && (count > remaining_index_generator_indices || count > remaining_indices ||
                        needed_vertex_bytes > GetRemainingSize())) [[unlikely]]
  {
    Flush();
@ -160,7 +162,7 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
  m_cull_all = cullall;

  // need to alloc new buffer
-  if (m_is_flushed)
+  if (m_is_flushed) [[unlikely]]
  {
    if (cullall)
    {
@ -174,6 +176,8 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
      ResetBuffer(stride);
    }

+    remaining_index_generator_indices = m_index_generator.GetRemainingIndices(primitive);
+    remaining_indices = GetRemainingIndices(primitive);
    m_is_flushed = false;
  }

@ -181,14 +185,14 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
  // won't have enough space in a few rare cases, such as vertex shader line/point expansion with a
  // ton of lines in one draw command, in which case we will either need to add support for
  // splitting a single draw command into multiple draws or using bigger indices.
-  ASSERT_MSG(VIDEO, count <= m_index_generator.GetRemainingIndices(primitive),
+  ASSERT_MSG(VIDEO, count <= remaining_index_generator_indices,
             "VertexManager: Too few remaining index values ({} > {}). "
             "32-bit indices or primitive breaking needed.",
-             count, m_index_generator.GetRemainingIndices(primitive));
-  ASSERT_MSG(VIDEO, count <= GetRemainingIndices(primitive),
+             count, remaining_index_generator_indices);
+  ASSERT_MSG(VIDEO, count <= remaining_indices,
             "VertexManager: Buffer not large enough for all indices! ({} > {}) "
             "Increase MAXIBUFFERSIZE or we need primitive breaking after all.",
-             count, GetRemainingIndices(primitive));
+             count, remaining_indices);
  ASSERT_MSG(VIDEO, needed_vertex_bytes <= GetRemainingSize(),
             "VertexManager: Buffer not large enough for all vertices! ({} > {}) "
             "Increase MAXVBUFFERSIZE or we need primitive breaking after all.",
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@ -621,44 +621,6 @@ void VertexShaderManager::SetMaterialColorChanged(int index)
  m_materials_changed[index] = true;
 }

-static void UpdateValue(bool* dirty, u32* old_value, u32 new_value)
-{
-  if (*old_value == new_value)
-    return;
-  *old_value = new_value;
-  *dirty = true;
-}
-
-static void UpdateOffset(bool* dirty, bool include_components, u32* old_value,
-                         const AttributeFormat& attribute)
-{
-  if (!attribute.enable)
-    return;
-  u32 new_value = attribute.offset / 4;  // GPU uses uint offsets
-  if (include_components)
-    new_value |= attribute.components << 16;
-  UpdateValue(dirty, old_value, new_value);
-}
-
-template <size_t N>
-static void UpdateOffsets(bool* dirty, bool include_components, std::array<u32, N>* old_value,
-                          const std::array<AttributeFormat, N>& attribute)
-{
-  for (size_t i = 0; i < N; i++)
-    UpdateOffset(dirty, include_components, &(*old_value)[i], attribute[i]);
-}
-
-void VertexShaderManager::SetVertexFormat(u32 components, const PortableVertexDeclaration& format)
-{
-  UpdateValue(&dirty, &constants.components, components);
-  UpdateValue(&dirty, &constants.vertex_stride, format.stride / 4);
-  UpdateOffset(&dirty, true, &constants.vertex_offset_position, format.position);
-  UpdateOffset(&dirty, false, &constants.vertex_offset_posmtx, format.posmtx);
-  UpdateOffsets(&dirty, true, &constants.vertex_offset_texcoords, format.texcoords);
-  UpdateOffsets(&dirty, false, &constants.vertex_offset_colors, format.colors);
-  UpdateOffsets(&dirty, false, &constants.vertex_offset_normals, format.normals);
-}
-
 void VertexShaderManager::SetTexMatrixInfoChanged(int index)
 {
  // TODO: Should we track this with more precision, like which indices changed?
--- a/Source/Core/VideoCommon/VertexShaderManager.h
+++ b/Source/Core/VideoCommon/VertexShaderManager.h
@ -11,6 +11,7 @@
 #include "Common/CommonTypes.h"
 #include "Common/Matrix.h"
 #include "VideoCommon/ConstantManager.h"
+#include "VideoCommon/NativeVertexFormat.h"

 class PointerWrap;
 struct PortableVertexDeclaration;
@ -34,7 +35,6 @@ public:
  void SetProjectionChanged();
  void SetMaterialColorChanged(int index);

-  void SetVertexFormat(u32 components, const PortableVertexDeclaration& format);
  void SetTexMatrixInfoChanged(int index);
  void SetLightingConfigChanged();

@ -49,6 +49,45 @@ public:
  VertexShaderConstants constants{};
  bool dirty = false;

+  static DOLPHIN_FORCE_INLINE void UpdateValue(bool* dirty, u32* old_value, u32 new_value)
+  {
+    if (*old_value == new_value)
+      return;
+    *old_value = new_value;
+    *dirty = true;
+  }
+
+  static DOLPHIN_FORCE_INLINE void UpdateOffset(bool* dirty, bool include_components,
+                                                u32* old_value, const AttributeFormat& attribute)
+  {
+    if (!attribute.enable)
+      return;
+    u32 new_value = attribute.offset / 4;  // GPU uses uint offsets
+    if (include_components)
+      new_value |= attribute.components << 16;
+    UpdateValue(dirty, old_value, new_value);
+  }
+
+  template <size_t N>
+  static DOLPHIN_FORCE_INLINE void UpdateOffsets(bool* dirty, bool include_components,
+                                                 std::array<u32, N>* old_value,
+                                                 const std::array<AttributeFormat, N>& attribute)
+  {
+    for (size_t i = 0; i < N; i++)
+      UpdateOffset(dirty, include_components, &(*old_value)[i], attribute[i]);
+  }
+
+  DOLPHIN_FORCE_INLINE void SetVertexFormat(u32 components, const PortableVertexDeclaration& format)
+  {
+    UpdateValue(&dirty, &constants.components, components);
+    UpdateValue(&dirty, &constants.vertex_stride, format.stride / 4);
+    UpdateOffset(&dirty, true, &constants.vertex_offset_position, format.position);
+    UpdateOffset(&dirty, false, &constants.vertex_offset_posmtx, format.posmtx);
+    UpdateOffsets(&dirty, true, &constants.vertex_offset_texcoords, format.texcoords);
+    UpdateOffsets(&dirty, false, &constants.vertex_offset_colors, format.colors);
+    UpdateOffsets(&dirty, false, &constants.vertex_offset_normals, format.normals);
+  }
+
 private:
  alignas(16) std::array<float, 16> m_projection_matrix;