GS: Add new heap for SW renderer allocations

TellowKrinkle 2021-05-02 23:18:48 -05:00 committed by tellowkrinkle
parent df02d784d6
commit 342170b077
5 changed files with 546 additions and 0 deletions


@@ -618,6 +618,7 @@ set(pcsx2GSSources
GS/GSLzma.cpp
GS/GSPerfMon.cpp
GS/GSPng.cpp
GS/GSRingHeap.cpp
GS/GSState.cpp
GS/GSTables.cpp
GS/GSUtil.cpp
@@ -677,6 +678,7 @@ set(pcsx2GSHeaders
GS/GSLzma.h
GS/GSPerfMon.h
GS/GSPng.h
GS/GSRingHeap.h
GS/GSState.h
GS/GSTables.h
GS/GSThread_CXX11.h

pcsx2/GS/GSRingHeap.cpp Normal file (+256)

@@ -0,0 +1,256 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSRingHeap.h"
#include "GS.h"
#include "GSExtra.h"
namespace
{
/// Align `value` to `align` bytes
template <size_t align>
size_t alignTo(size_t value)
{
return ((value + (align - 1)) / align) * align;
}
/// Align to a power of 2 using a mask that's equal to that value - 1
size_t alignUsingMask(size_t align_mask, size_t value)
{
return (value + align_mask) & ~align_mask;
}
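// Worked example: alignTo<64>(100) == ((100 + 63) / 64) * 64 == 128, and
// alignUsingMask(63, 100) == (100 + 63) & ~63 == 128. Both round up to the
// next multiple; the mask form requires a power-of-two alignment.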
} // namespace
/// GSRingHeap operates as a ring buffer, with usage counters for each quadrant
/// If a new quadrant needs to be used but is still in use by existing allocations,
/// the buffer is orphaned and replaced with a new, larger buffer
struct GSRingHeap::Buffer
{
friend class GSRingHeap;
static const size_t BEGINNING_OFFSET;
static constexpr size_t USAGE_ARR_SIZE = sizeof(uint64_t) / sizeof(size_t);
static constexpr size_t USAGE_ARR_ELEMS_PER_ENTRY = sizeof(size_t) / sizeof(uint16_t);
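// On a 64-bit target these work out to USAGE_ARR_SIZE == 1 and
// USAGE_ARR_ELEMS_PER_ENTRY == 4: all four quadrant counters share a single
// atomic word. On 32-bit they split across two size_t entries.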
/// Refcount, main heap holds onto buffer with a +1
/// Each allocation adds `sizeof(allocation)`, to allow for detection of buffers being used inefficiently
/// (e.g. if a buffer is orphaned with very little usage that means allocations aren't being freed in a similar order to being made)
/// Buffer is freed when the main heap drops it (-1) and every allocation is freed, causing this to reach 0
std::atomic<size_t> m_amt_allocated;
/// Holds 4x 16-bit usage counters, indicating how many allocations have been made from the nth quadrant of memory
/// Merged into `size_t` chunks so that they can be operated on with fewer atomic operations
std::atomic<size_t> m_usage[USAGE_ARR_SIZE];
/// Size of whole buffer (including header)
/// Should be kept to at least 2x the largest allocation
size_t m_size;
/// Offset of new allocations
size_t m_write_loc;
/// Amount to rshift buffer offset to get which quadrant it's in (`log2(m_size/4)`)
int m_quadrant_shift;
/// Increment usage counts (use when allocating)
void beginUse(uint64_t usage)
{
for (size_t i = 0; i < USAGE_ARR_SIZE; i++)
{
size_t piece = static_cast<size_t>(usage >> (i * (64 / USAGE_ARR_SIZE)));
size_t prev = m_usage[i].fetch_add(piece, std::memory_order_relaxed);
for (size_t j = 0; j < USAGE_ARR_ELEMS_PER_ENTRY; j++)
{
[[maybe_unused]] uint16_t section = prev >> (j * 16);
assert(section != UINT16_MAX && "Usage count overflow");
}
}
}
/// Decrement usage counts (use when freeing)
void endUse(uint64_t usage)
{
for (size_t i = 0; i < USAGE_ARR_SIZE; i++)
{
size_t piece = static_cast<size_t>(usage >> (i * (64 / USAGE_ARR_SIZE)));
m_usage[i].fetch_sub(piece, std::memory_order_release);
}
}
/// Check if the given quadrant is still in use
bool isStillInUse(uint32_t quadrant)
{
int arridx = (quadrant / USAGE_ARR_ELEMS_PER_ENTRY) % USAGE_ARR_SIZE;
int shift = (quadrant % USAGE_ARR_ELEMS_PER_ENTRY) * 16;
return ((m_usage[arridx].load(std::memory_order_acquire) >> shift) & 0xFFFF) != 0;
}
uint32_t quadrant(size_t off)
{
return static_cast<uint32_t>(off >> m_quadrant_shift);
}
/// Calculate a usage mask from an offset + size
uint64_t usageMask(size_t begin_off, size_t size)
{
// We guarantee size <= two quadrants
// Therefore we only need to check beginning, middle, and end
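// Worked example: with 16 KiB quadrants (m_quadrant_shift == 14), a
// 0x200-byte allocation at offset 0x3F00 begins in quadrant 0 and ends in
// quadrant 1, so the mask is (1ull << 0) | (1ull << 16) == 0x0001'0001.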
uint64_t mask = 0;
mask |= 1ull << (quadrant(begin_off) * 16);
size_t mid_off = begin_off + size / 2;
mask |= 1ull << (quadrant(mid_off) * 16);
size_t end_off = begin_off + size - 1;
mask |= 1ull << (quadrant(end_off) * 16);
return mask;
}
/// Decrement the main amt_allocated refcount
void decref(size_t amt)
{
if (unlikely(m_amt_allocated.fetch_sub(amt, std::memory_order_release) == amt))
{
std::atomic_thread_fence(std::memory_order_acquire);
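// Release on the fetch_sub plus this acquire fence is the standard
// refcount teardown pattern: every thread's writes to the buffer
// happen-before the vmfree below.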
vmfree(this, m_size);
}
}
/// Free an allocation
void free(void* allocation, size_t size)
{
const char* base = reinterpret_cast<const char*>(this);
size_t begin_off = static_cast<const char*>(allocation) - base;
endUse(usageMask(begin_off, size));
decref(size);
}
/// Allocate a value of `size` bytes with `prefix_size` bytes before it (for allocation tracking) and alignment specified by `align_mask`
void* alloc(size_t size, size_t align_mask, size_t prefix_size)
{
uint32_t prev_quadrant = quadrant(m_write_loc - 1);
size_t base_off = alignUsingMask(align_mask, m_write_loc + prefix_size);
uint64_t usage_mask = 1ull << (quadrant(base_off - prefix_size) * 16);
uint32_t new_quadrant = quadrant(base_off + size - 1);
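// Fast path: the allocation stays in the quadrant we were already writing
// to. Otherwise every newly-touched quadrant (wrapping back to quadrant 0
// if we would run off the end) must have drained to zero outstanding uses,
// or we fail and let the caller orphan the buffer.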
if (prev_quadrant != new_quadrant)
{
uint32_t cur_quadrant = prev_quadrant + 1;
if (new_quadrant >= 4)
{
cur_quadrant = 0;
usage_mask = 0;
base_off = alignUsingMask(align_mask, BEGINNING_OFFSET + prefix_size);
new_quadrant = quadrant(base_off + size - 1);
}
do
{
usage_mask |= 1ull << (cur_quadrant * 16);
if (unlikely(isStillInUse(cur_quadrant)))
return nullptr;
} while (++cur_quadrant <= new_quadrant);
}
m_write_loc = base_off + size;
beginUse(usage_mask);
m_amt_allocated.fetch_add(size + prefix_size, std::memory_order_relaxed);
return reinterpret_cast<char*>(this) + base_off - prefix_size;
}
static Buffer* make(int quadrant_shift)
{
size_t size = 4ull << quadrant_shift;
Buffer* buffer = reinterpret_cast<Buffer*>(vmalloc(size, false));
buffer->m_size = size;
buffer->m_quadrant_shift = quadrant_shift;
buffer->m_amt_allocated.store(1, std::memory_order_relaxed);
for (std::atomic<size_t>& usage : buffer->m_usage)
usage.store(0, std::memory_order_relaxed);
buffer->m_write_loc = BEGINNING_OFFSET;
return buffer;
}
};
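// Allocations begin at the first 64-byte (cache line) boundary past the
// Buffer header, keeping the header's cache lines out of the data area.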
const size_t GSRingHeap::Buffer::BEGINNING_OFFSET = alignTo<64>(sizeof(Buffer));
constexpr size_t GSRingHeap::MIN_ALIGN;
GSRingHeap::GSRingHeap()
{
m_current_buffer = Buffer::make(14); // Start with 64k buffer
}
GSRingHeap::~GSRingHeap() noexcept
{
orphanBuffer();
}
void GSRingHeap::orphanBuffer() noexcept
{
m_current_buffer->decref(1);
}
void* GSRingHeap::alloc_internal(size_t size, size_t align_mask, size_t prefix_size)
{
prefix_size += sizeof(Buffer*); // Add space for a pointer to the buffer
size_t total_size = size + prefix_size;
if (likely(total_size <= (m_current_buffer->m_size / 2)))
{
if (void* ptr = m_current_buffer->alloc(size, align_mask, prefix_size))
{
Buffer** bptr = static_cast<Buffer**>(ptr);
*bptr = m_current_buffer;
return bptr + 1;
}
else if (IsDevBuild)
{
size_t total = m_current_buffer->m_size;
size_t mb = 1024 * 1024;
if (total >= mb)
{
size_t used = m_current_buffer->m_amt_allocated.load(std::memory_order_relaxed) - 1;
if (used * 4 < total)
{
fprintf(stderr, "GSRingHeap: Orphaning %dmb buffer with low usage of %d%%, check that allocations are actually being deallocated approximately in order\n", total / mb, static_cast<int>((used * 100) / total));
}
}
}
}
// Couldn't allocate, orphan buffer and make a new one
int shift = m_current_buffer->m_quadrant_shift;
do
{
shift++;
} while (total_size > (2ull << shift));
if (shift > 24 && total_size <= (2ull << (shift - 1)))
{
// If this needs to be >64 mb, we're doing something wrong
fprintf(stderr, "GSRingHeap: Refusing to grow to %umb\n", 4u << (shift - 20));
shift--;
}
Buffer* new_buffer = Buffer::make(shift);
orphanBuffer();
m_current_buffer = new_buffer;
void* ptr = m_current_buffer->alloc(size, align_mask, prefix_size);
assert(ptr && "Fresh buffer failed to allocate!");
Buffer** bptr = static_cast<Buffer**>(ptr);
*bptr = m_current_buffer;
return bptr + 1;
}
void GSRingHeap::free_internal(void* ptr, size_t size) noexcept
{
size += sizeof(Buffer*);
Buffer** bptr = static_cast<Buffer**>(ptr) - 1;
(*bptr)->free(bptr, size);
}
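A minimal usage sketch of the intended flow, not part of the commit itself; `Packet` and the hand-off queue are placeholders for whatever the SW renderer actually uses:

// Hypothetical payload type, for illustration only.
struct Packet { uint32_t vertex_count; };

GSRingHeap heap; // starts life as a 64k ring (Buffer::make(14))

// Producer (GS thread): allocate from the ring and fill in place.
Packet* p = static_cast<Packet*>(heap.alloc(sizeof(Packet), alignof(Packet)));
p->vertex_count = 3;
// ... hand `p` off to worker threads ...

// Consumer (any thread), once finished with the packet. Frees should follow
// allocation order approximately, or quadrants stay marked in-use and the
// heap keeps orphaning itself into larger buffers.
GSRingHeap::free(p);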

pcsx2/GS/GSRingHeap.h Normal file (+280)

@@ -0,0 +1,280 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>
/// A ring buffer pretending to be a heap (screams if you don't actually use it like a ring buffer)
/// Meant for one producer thread creating data and sharing it with multiple consumer threads
/// Expectations:
/// - One thread allocates and writes to allocations
/// - Other threads read from allocations (once shared, no one writes)
/// - Any thread can free
/// - Frees are done in approximately the same order as allocations (but not exactly the same order)
class GSRingHeap
{
struct Buffer;
Buffer* m_current_buffer;
void orphanBuffer() noexcept;
/// Allocate a value of `size` bytes with `prefix_size` bytes before it (for allocation tracking) and alignment specified by `align_mask`
void* alloc_internal(size_t size, size_t align_mask, size_t prefix_size);
/// Free a value of size `size` (equal to prefix_size + size when allocated)
static void free_internal(void* ptr, size_t size) noexcept;
static constexpr size_t MIN_ALIGN = std::max(alignof(size_t), alignof(void*));
static size_t getAlignMask(size_t align)
{
return std::max(MIN_ALIGN, align) - 1;
}
public:
GSRingHeap(GSRingHeap&&) = delete;
GSRingHeap();
~GSRingHeap() noexcept;
/// Allocate a piece of memory with the given size and alignment
void* alloc(size_t size, size_t align)
{
size_t alloc_size = size + sizeof(size_t);
void* ptr = alloc_internal(size, getAlignMask(align), sizeof(size_t));
size_t* header = static_cast<size_t*>(ptr);
*header = alloc_size;
return static_cast<void*>(header + 1);
}
/// Allocate and initialize a T*
template <typename T, typename... Args>
T* make(Args&&... args)
{
void* ptr = alloc(sizeof(T), alignof(T));
new (ptr) T(std::forward<Args>(args)...);
return static_cast<T*>(ptr);
}
/// Allocate and default-initialize `count` `T`s
template <typename T>
T* make_array(size_t count)
{
void* ptr = alloc(sizeof(T) * count, alignof(T));
new (ptr) T[count]();
return static_cast<T*>(ptr);
}
/// Free a pointer allocated with `alloc`
static void free(void* ptr)
{
size_t* header = static_cast<size_t*>(ptr) - 1;
free_internal(static_cast<void*>(header), *header);
}
/// Deinitialize and free a pointer created with `make`
template <typename T>
static void destroy(T* ptr)
{
ptr->~T();
free(ptr);
}
/// Deinitialize and free an array allocated with `make_array`
template <typename T>
static void destroy_array(T* ptr)
{
size_t* header = const_cast<size_t*>(reinterpret_cast<const size_t*>(ptr)) - 1;
size_t size = (*header - sizeof(size_t)) / sizeof(T);
for (size_t i = 0; i < size; i++)
ptr[i].~T();
free(ptr);
}
/// Like `std::shared_ptr` but holds a pointer on this allocator
template <typename T>
class SharedPtr
{
friend class GSRingHeap;
template <typename>
friend class SharedPtr;
struct alignas(MIN_ALIGN) AllocationHeader
{
uint32_t size;
std::atomic<uint32_t> refcnt;
};
T* m_ptr;
SharedPtr(T* ptr)
: m_ptr(ptr)
{
}
AllocationHeader* getHeader() const
{
return const_cast<AllocationHeader*>(reinterpret_cast<const AllocationHeader*>(m_ptr)) - 1;
}
public:
SharedPtr()
: m_ptr(nullptr)
{
}
SharedPtr(std::nullptr_t)
: m_ptr(nullptr)
{
}
SharedPtr(const SharedPtr& other)
: m_ptr(other.m_ptr)
{
if (m_ptr)
getHeader()->refcnt.fetch_add(1, std::memory_order_relaxed);
}
SharedPtr(SharedPtr&& other)
: m_ptr(other.m_ptr)
{
other.m_ptr = nullptr;
}
SharedPtr& operator=(const SharedPtr& other)
{
if (this != &other)
{
this->~SharedPtr();
new (this) SharedPtr(other);
}
return *this;
}
SharedPtr& operator=(SharedPtr&& other)
{
if (this != &other)
{
this->~SharedPtr();
new (this) SharedPtr(std::move(other));
}
return *this;
}
~SharedPtr()
{
if (!m_ptr)
return;
AllocationHeader* header = getHeader();
// (See top) Expectation: Once shared, no one writes
// Therefore we don't need acquire/release semantics here
if (header->refcnt.fetch_sub(1, std::memory_order_relaxed) == 1)
{
m_ptr->~T();
free_internal(static_cast<void*>(header), header->size);
}
}
T& operator*() const { return *m_ptr; }
T* operator->() const { return m_ptr; }
T* get() const { return m_ptr; }
/// static_cast the pointer to another type
template <typename Other>
SharedPtr<Other> cast() const&
{
getHeader()->refcnt.fetch_add(1, std::memory_order_relaxed);
return SharedPtr<Other>(static_cast<Other*>(m_ptr));
}
/// static_cast the pointer to another type
template <typename Other>
SharedPtr<Other> cast() &&
{
SharedPtr<Other> other(static_cast<Other*>(m_ptr));
m_ptr = nullptr;
return other;
}
};
/// Allocate and initialize a `T` managed by a reference-counted `SharedPtr`
template <typename T, typename... Args>
SharedPtr<T> make_shared(Args&&... args)
{
using Header = typename SharedPtr<T>::AllocationHeader;
size_t alloc_size = sizeof(T) + sizeof(Header);
static_assert(alignof(Header) <= MIN_ALIGN, "Header alignment too high");
void* ptr = alloc_internal(sizeof(T), getAlignMask(alignof(T)), sizeof(Header));
Header* header = static_cast<Header*>(ptr);
assert(alloc_size <= UINT32_MAX && "Allocation overflow");
header->size = static_cast<uint32_t>(alloc_size);
header->refcnt.store(1, std::memory_order_relaxed);
T* tptr = reinterpret_cast<T*>(header + 1);
new (tptr) T(std::forward<Args>(args)...);
return SharedPtr<T>(tptr);
}
template <typename T>
struct Deleter
{
void operator()(T* t)
{
if (t)
destroy(t);
}
};
template <typename T>
struct Deleter<T[]>
{
void operator()(T* t)
{
if (t)
destroy_array(t);
}
};
template <typename T>
using UniquePtr = std::unique_ptr<T, Deleter<T>>;
template <typename T>
struct _unique_if
{
typedef UniquePtr<T> _unique_single;
};
template <typename T>
struct _unique_if<T[]>
{
typedef UniquePtr<T[]> _unique_array_unknown_bound;
};
template <typename T, size_t N>
struct _unique_if<T[N]>
{
typedef void _unique_array_known_bound;
};
template <typename T, typename... Args>
typename _unique_if<T>::_unique_single make_unique(Args&&... args)
{
return UniquePtr<T>(make<T>(std::forward<Args>(args)...));
}
template <typename T>
typename _unique_if<T>::_unique_array_unknown_bound make_unique(size_t count)
{
typedef typename std::remove_extent<T>::type Base;
return UniquePtr<T>(make_array<Base>(count));
}
template <class T, class... _Args>
typename _unique_if<T>::_unique_array_known_bound make_unique(_Args&&...) = delete;
};
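The smart-pointer helpers wrap the same allocator; a short sketch, assuming a hypothetical `Job` type:

struct Job { int id; };

GSRingHeap heap;

// Reference-counted sharing: the last SharedPtr destroyed runs ~Job() and
// returns the block to the ring (relaxed refcounting, per the class notes).
GSRingHeap::SharedPtr<Job> a = heap.make_shared<Job>();
GSRingHeap::SharedPtr<Job> b = a;

// Single ownership via std::unique_ptr with GSRingHeap::Deleter.
GSRingHeap::UniquePtr<Job> u = heap.make_unique<Job>();
GSRingHeap::UniquePtr<int[]> xs = heap.make_unique<int[]>(16);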


@@ -480,6 +480,7 @@
<ClCompile Include="GS\GSPerfMon.cpp" />
<ClCompile Include="GS\Renderers\Common\GSOsdManager.cpp" />
<ClCompile Include="GS\GSPng.cpp" />
<ClCompile Include="GS\GSRingHeap.cpp" />
<ClCompile Include="GS\Renderers\SW\GSRasterizer.cpp" />
<ClCompile Include="GS\Renderers\Common\GSRenderer.cpp" />
<ClCompile Include="GS\Renderers\DX11\GSRendererDX11.cpp" />
@@ -843,6 +844,7 @@
<ClInclude Include="GS\GSPerfMon.h" />
<ClInclude Include="GS\Renderers\Common\GSOsdManager.h" />
<ClInclude Include="GS\GSPng.h" />
<ClInclude Include="GS\GSRingHeap.h" />
<ClInclude Include="GS\Renderers\SW\GSRasterizer.h" />
<ClInclude Include="GS\Renderers\Common\GSRenderer.h" />
<ClInclude Include="GS\Renderers\DX11\GSRendererDX11.h" />


@@ -1463,6 +1463,9 @@
<ClCompile Include="GS\GSPng.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
<ClCompile Include="GS\GSRingHeap.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
<ClCompile Include="GS\GSLzma.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
@@ -2556,6 +2559,9 @@
<ClInclude Include="GS\GSPng.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GSRingHeap.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GSThread_CXX11.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>