GS: Add new heap for SW renderer allocations

TellowKrinkle 2021-05-02 23:18:48 -05:00 committed by tellowkrinkle
parent df02d784d6
commit 342170b077
5 changed files with 546 additions and 0 deletions


@@ -618,6 +618,7 @@ set(pcsx2GSSources
GS/GSLzma.cpp
GS/GSPerfMon.cpp
GS/GSPng.cpp
GS/GSRingHeap.cpp
GS/GSState.cpp
GS/GSTables.cpp
GS/GSUtil.cpp
@@ -677,6 +678,7 @@ set(pcsx2GSHeaders
GS/GSLzma.h
GS/GSPerfMon.h
GS/GSPng.h
GS/GSRingHeap.h
GS/GSState.h
GS/GSTables.h
GS/GSThread_CXX11.h

pcsx2/GS/GSRingHeap.cpp Normal file (+256)

@@ -0,0 +1,256 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSRingHeap.h"
#include "GS.h"
#include "GSExtra.h"
namespace
{
/// Align `value` to `align` bytes
template <size_t align>
size_t alignTo(size_t value)
{
return ((value + (align - 1)) / align) * align;
}
/// Align to a power of 2 using a mask that's equal to that value - 1
size_t alignUsingMask(size_t align_mask, size_t value)
{
return (value + align_mask) & ~align_mask;
}
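// Worked example: alignTo<64>(100) == ((100 + 63) / 64) * 64 == 128, and
// alignUsingMask(63, 100) == (100 + 63) & ~63 == 128. Both round up to the
// next multiple; the mask form requires a power-of-two alignment.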
} // namespace
/// GSRingHeap operates as a ring buffer, with usage counters for each quadrant
/// If a new quadrant needs to be used but is still in use by existing allocations,
/// the buffer is orphaned and replaced with a new, larger buffer
struct GSRingHeap::Buffer
{
friend class GSRingHeap;
static const size_t BEGINNING_OFFSET;
static constexpr size_t USAGE_ARR_SIZE = sizeof(uint64_t) / sizeof(size_t);
static constexpr size_t USAGE_ARR_ELEMS_PER_ENTRY = sizeof(size_t) / sizeof(uint16_t);
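// On a 64-bit target these work out to USAGE_ARR_SIZE == 1 and
// USAGE_ARR_ELEMS_PER_ENTRY == 4: all four quadrant counters share a single
// atomic word. On 32-bit they split across two size_t entries.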
/// Refcount, main heap holds onto buffer with a +1
/// Each allocation adds `sizeof(allocation)`, to allow for detection of buffers being used inefficiently
/// (e.g. if a buffer is orphaned with very little usage that means allocations aren't being freed in a similar order to being made)
/// Buffer is freed when the main heap drops it (-1) and every allocation is freed, causing this to reach 0
std::atomic<size_t> m_amt_allocated;
/// Holds 4x 16-bit usage counters, indicating how many allocations have been made from the nth quadrant of memory
/// Merged into `size_t` chunks so that they can be operated on with fewer atomic operations
std::atomic<size_t> m_usage[USAGE_ARR_SIZE];
/// Size of whole buffer (including header)
/// Should be kept to at least 2x the largest allocation
size_t m_size;
/// Offset of new allocations
size_t m_write_loc;
/// Amount to rshift buffer offset to get which quadrant it's in (`log2(m_size/4)`)
int m_quadrant_shift;
/// Increment usage counts (use when allocating)
void beginUse(uint64_t usage)
{
for (size_t i = 0; i < USAGE_ARR_SIZE; i++)
{
size_t piece = static_cast<size_t>(usage >> (i * (64 / USAGE_ARR_SIZE)));
size_t prev = m_usage[i].fetch_add(piece, std::memory_order_relaxed);
for (size_t j = 0; j < USAGE_ARR_ELEMS_PER_ENTRY; j++)
{
[[maybe_unused]] uint16_t section = prev >> (j * 16);
assert(section != UINT16_MAX && "Usage count overflow");
}
}
}
/// Decrement usage counts (use when freeing)
void endUse(uint64_t usage)
{
for (size_t i = 0; i < USAGE_ARR_SIZE; i++)
{
size_t piece = static_cast<size_t>(usage >> (i * (64 / USAGE_ARR_SIZE)));
m_usage[i].fetch_sub(piece, std::memory_order_release);
}
}
/// Check if the given quadrant is still in use
bool isStillInUse(uint32_t quadrant)
{
int arridx = (quadrant / USAGE_ARR_ELEMS_PER_ENTRY) % USAGE_ARR_SIZE;
int shift = (quadrant % USAGE_ARR_ELEMS_PER_ENTRY) * 16;
return ((m_usage[arridx].load(std::memory_order_acquire) >> shift) & 0xFFFF) != 0;
}
uint32_t quadrant(size_t off)
{
return static_cast<uint32_t>(off >> m_quadrant_shift);
}
/// Calculate a usage mask from an offset + size
uint64_t usageMask(size_t begin_off, size_t size)
{
// We guarantee size <= two quadrants
// Therefore we only need to check beginning, middle, and end
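// Worked example: with 16 KiB quadrants (m_quadrant_shift == 14), a
// 0x200-byte allocation at offset 0x3F00 begins in quadrant 0 and ends in
// quadrant 1, so the mask is (1ull << 0) | (1ull << 16) == 0x0001'0001.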
uint64_t mask = 0;
mask |= 1ull << (quadrant(begin_off) * 16);
size_t mid_off = begin_off + size / 2;
mask |= 1ull << (quadrant(mid_off) * 16);
size_t end_off = begin_off + size - 1;
mask |= 1ull << (quadrant(end_off) * 16);
return mask;
}
/// Decrement the main amt_allocated refcount
void decref(size_t amt)
{
if (unlikely(m_amt_allocated.fetch_sub(amt, std::memory_order_release) == amt))
{
std::atomic_thread_fence(std::memory_order_acquire);
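// Release on the fetch_sub plus this acquire fence is the standard
// refcount teardown pattern: every thread's writes to the buffer
// happen-before the vmfree below.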
vmfree(this, m_size);
}
}
/// Free an allocation
void free(void* allocation, size_t size)
{
const char* base = reinterpret_cast<const char*>(this);
size_t begin_off = static_cast<const char*>(allocation) - base;
endUse(usageMask(begin_off, size));
decref(size);
}
/// Allocate a value of `size` bytes with `prefix_size` bytes before it (for allocation tracking) and alignment specified by `align_mask`
void* alloc(size_t size, size_t align_mask, size_t prefix_size)
{
uint32_t prev_quadrant = quadrant(m_write_loc - 1);
size_t base_off = alignUsingMask(align_mask, m_write_loc + prefix_size);
uint64_t usage_mask = 1ull << (quadrant(base_off - prefix_size) * 16);
uint32_t new_quadrant = quadrant(base_off + size - 1);
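// Fast path: the allocation stays in the quadrant we were already writing
// to. Otherwise every newly-touched quadrant (wrapping back to quadrant 0
// if we would run off the end) must have drained to zero outstanding uses,
// or we fail and let the caller orphan the buffer.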
if (prev_quadrant != new_quadrant)
{
uint32_t cur_quadrant = prev_quadrant + 1;
if (new_quadrant >= 4)
{
cur_quadrant = 0;
usage_mask = 0;
base_off = alignUsingMask(align_mask, BEGINNING_OFFSET + prefix_size);
new_quadrant = quadrant(base_off + size - 1);
}
do
{
usage_mask |= 1ull << (cur_quadrant * 16);
if (unlikely(isStillInUse(cur_quadrant)))
return nullptr;
} while (++cur_quadrant <= new_quadrant);
}
m_write_loc = base_off + size;
beginUse(usage_mask);
m_amt_allocated.fetch_add(size + prefix_size, std::memory_order_relaxed);
return reinterpret_cast<char*>(this) + base_off - prefix_size;
}
static Buffer* make(int quadrant_shift)
{
size_t size = 4ull << quadrant_shift;
Buffer* buffer = reinterpret_cast<Buffer*>(vmalloc(size, false));
buffer->m_size = size;
buffer->m_quadrant_shift = quadrant_shift;
buffer->m_amt_allocated.store(1, std::memory_order_relaxed);
for (std::atomic<size_t>& usage : buffer->m_usage)
usage.store(0, std::memory_order_relaxed);
buffer->m_write_loc = BEGINNING_OFFSET;
return buffer;
}
};
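// Allocations begin at the first 64-byte (cache line) boundary past the
// Buffer header, keeping the header's cache lines out of the data area.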
const size_t GSRingHeap::Buffer::BEGINNING_OFFSET = alignTo<64>(sizeof(Buffer));
constexpr size_t GSRingHeap::MIN_ALIGN;
GSRingHeap::GSRingHeap()
{
m_current_buffer = Buffer::make(14); // Start with 64k buffer
}
GSRingHeap::~GSRingHeap() noexcept
{
orphanBuffer();
}
void GSRingHeap::orphanBuffer() noexcept
{
m_current_buffer->decref(1);
}
void* GSRingHeap::alloc_internal(size_t size, size_t align_mask, size_t prefix_size)
{
prefix_size += sizeof(Buffer*); // Add space for a pointer to the buffer
size_t total_size = size + prefix_size;
if (likely(total_size <= (m_current_buffer->m_size / 2)))
{
if (void* ptr = m_current_buffer->alloc(size, align_mask, prefix_size))
{
Buffer** bptr = static_cast<Buffer**>(ptr);
*bptr = m_current_buffer;
return bptr + 1;
}
else if (IsDevBuild)
{
size_t total = m_current_buffer->m_size;
size_t mb = 1024 * 1024;
if (total >= mb)
{
size_t used = m_current_buffer->m_amt_allocated.load(std::memory_order_relaxed) - 1;
if (used * 4 < total)
{
fprintf(stderr, "GSRingHeap: Orphaning %dmb buffer with low usage of %d%%, check that allocations are actually being deallocated approximately in order\n", total / mb, static_cast<int>((used * 100) / total));
}
}
}
}
// Couldn't allocate, orphan buffer and make a new one
int shift = m_current_buffer->m_quadrant_shift;
do
{
shift++;
} while (total_size > (2ull << shift));
if (shift > 24 && total_size <= (2ull << (shift - 1)))
{
// If this needs to be >64 mb, we're doing something wrong
fprintf(stderr, "GSRingHeap: Refusing to grow to %umb\n", 4u << (shift - 20));
shift--;
}
Buffer* new_buffer = Buffer::make(shift);
orphanBuffer();
m_current_buffer = new_buffer;
void* ptr = m_current_buffer->alloc(size, align_mask, prefix_size);
assert(ptr && "Fresh buffer failed to allocate!");
Buffer** bptr = static_cast<Buffer**>(ptr);
*bptr = m_current_buffer;
return bptr + 1;
}
void GSRingHeap::free_internal(void* ptr, size_t size) noexcept
{
size += sizeof(Buffer*);
Buffer** bptr = static_cast<Buffer**>(ptr) - 1;
(*bptr)->free(bptr, size);
}
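A minimal usage sketch of the intended flow, not part of the commit itself; `Packet` and the hand-off queue are placeholders for whatever the SW renderer actually uses:

// Hypothetical payload type, for illustration only.
struct Packet { uint32_t vertex_count; };

GSRingHeap heap; // starts life as a 64k ring (Buffer::make(14))

// Producer (GS thread): allocate from the ring and fill in place.
Packet* p = static_cast<Packet*>(heap.alloc(sizeof(Packet), alignof(Packet)));
p->vertex_count = 3;
// ... hand `p` off to worker threads ...

// Consumer (any thread), once finished with the packet. Frees should follow
// allocation order approximately, or quadrants stay marked in-use and the
// heap keeps orphaning itself into larger buffers.
GSRingHeap::free(p);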

pcsx2/GS/GSRingHeap.h Normal file (+280)

@@ -0,0 +1,280 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>
/// A ring buffer pretending to be a heap (screams if you don't actually use it like a ring buffer)
/// Meant for one producer thread creating data and sharing it with multiple consumer threads
/// Expectations:
/// - One thread allocates and writes to allocations
/// - Other threads read from allocations (once shared, no one writes)
/// - Any thread can free
/// - Frees are done in approximately the same order as allocations (but not exactly the same order)
class GSRingHeap
{
struct Buffer;
Buffer* m_current_buffer;
void orphanBuffer() noexcept;
/// Allocate a value of `size` bytes with `prefix_size` bytes before it (for allocation tracking) and alignment specified by `align_mask`
void* alloc_internal(size_t size, size_t align_mask, size_t prefix_size);
/// Free a value of size `size` (equal to prefix_size + size when allocated)
static void free_internal(void* ptr, size_t size) noexcept;
static constexpr size_t MIN_ALIGN = std::max(alignof(size_t), alignof(void*));
static size_t getAlignMask(size_t align)
{
return std::max(MIN_ALIGN, align) - 1;
}
public:
GSRingHeap(GSRingHeap&&) = delete;
GSRingHeap();
~GSRingHeap() noexcept;
/// Allocate a piece of memory with the given size and alignment
void* alloc(size_t size, size_t align)
{
size_t alloc_size = size + sizeof(size_t);
void* ptr = alloc_internal(size, getAlignMask(align), sizeof(size_t));
size_t* header = static_cast<size_t*>(ptr);
*header = alloc_size;
return static_cast<void*>(header + 1);
}
/// Allocate and initialize a T*
template <typename T, typename... Args>
T* make(Args&&... args)
{
void* ptr = alloc(sizeof(T), alignof(T));
new (ptr) T(std::forward<Args>(args)...);
return static_cast<T*>(ptr);
}
/// Allocate and default-initialize `count` `T`s
template <typename T>
T* make_array(size_t count)
{
void* ptr = alloc(sizeof(T) * count, alignof(T));
new (ptr) T[count]();
return static_cast<T*>(ptr);
}
/// Free a pointer allocated with `alloc`
static void free(void* ptr)
{
size_t* header = static_cast<size_t*>(ptr) - 1;
free_internal(static_cast<void*>(header), *header);
}
/// Deinitialize and free a pointer created with `make`
template <typename T>
static void destroy(T* ptr)
{
ptr->~T();
free(ptr);
}
/// Deinitialize and free an array allocated with `make_array`
template <typename T>
static void destroy_array(T* ptr)
{
size_t* header = const_cast<size_t*>(reinterpret_cast<const size_t*>(ptr)) - 1;
size_t size = (*header - sizeof(size_t)) / sizeof(T);
for (size_t i = 0; i < size; i++)
ptr[i].~T();
free(ptr);
}
/// Like `std::shared_ptr` but holds a pointer on this allocator
template <typename T>
class SharedPtr
{
friend class GSRingHeap;
template <typename>
friend class SharedPtr;
struct alignas(MIN_ALIGN) AllocationHeader
{
uint32_t size;
std::atomic<uint32_t> refcnt;
};
T* m_ptr;
SharedPtr(T* ptr)
: m_ptr(ptr)
{
}
AllocationHeader* getHeader() const
{
return const_cast<AllocationHeader*>(reinterpret_cast<const AllocationHeader*>(m_ptr)) - 1;
}
public:
SharedPtr()
: m_ptr(nullptr)
{
}
SharedPtr(std::nullptr_t)
: m_ptr(nullptr)
{
}
SharedPtr(const SharedPtr& other)
: m_ptr(other.m_ptr)
{
if (m_ptr)
getHeader()->refcnt.fetch_add(1, std::memory_order_relaxed);
}
SharedPtr(SharedPtr&& other)
: m_ptr(other.m_ptr)
{
other.m_ptr = nullptr;
}
SharedPtr& operator=(const SharedPtr& other)
{
if (this != &other)
{
this->~SharedPtr();
new (this) SharedPtr(other);
}
return *this;
}
SharedPtr& operator=(SharedPtr&& other)
{
if (this != &other)
{
this->~SharedPtr();
new (this) SharedPtr(std::move(other));
}
return *this;
}
~SharedPtr()
{
if (!m_ptr)
return;
AllocationHeader* header = getHeader();
// (See top) Expectation: Once shared, no one writes
// Therefore we don't need acquire/release semantics here
if (header->refcnt.fetch_sub(1, std::memory_order_relaxed) == 1)
{
m_ptr->~T();
free_internal(static_cast<void*>(header), header->size);
}
}
T& operator*() const { return *m_ptr; }
T* operator->() const { return m_ptr; }
T* get() const { return m_ptr; }
/// static_cast the pointer to another type
template <typename Other>
SharedPtr<Other> cast() const&
{
getHeader()->refcnt.fetch_add(1, std::memory_order_relaxed);
return SharedPtr<Other>(static_cast<Other*>(m_ptr));
}
/// static_cast the pointer to another type
template <typename Other>
SharedPtr<Other> cast() &&
{
SharedPtr<Other> other(static_cast<Other*>(m_ptr));
m_ptr = nullptr;
return other;
}
};
/// Allocate and initialize a `T` managed by a reference-counted `SharedPtr`
template <typename T, typename... Args>
SharedPtr<T> make_shared(Args&&... args)
{
using Header = typename SharedPtr<T>::AllocationHeader;
size_t alloc_size = sizeof(T) + sizeof(Header);
static_assert(alignof(Header) <= MIN_ALIGN, "Header alignment too high");
void* ptr = alloc_internal(sizeof(T), getAlignMask(alignof(T)), sizeof(Header));
Header* header = static_cast<Header*>(ptr);
assert(alloc_size <= UINT32_MAX && "Allocation overflow");
header->size = static_cast<uint32_t>(alloc_size);
header->refcnt.store(1, std::memory_order_relaxed);
T* tptr = reinterpret_cast<T*>(header + 1);
new (tptr) T(std::forward<Args>(args)...);
return SharedPtr<T>(tptr);
}
template <typename T>
struct Deleter
{
void operator()(T* t)
{
if (t)
destroy(t);
}
};
template <typename T>
struct Deleter<T[]>
{
void operator()(T* t)
{
if (t)
destroy_array(t);
}
};
template <typename T>
using UniquePtr = std::unique_ptr<T, Deleter<T>>;
template <typename T>
struct _unique_if
{
typedef UniquePtr<T> _unique_single;
};
template <typename T>
struct _unique_if<T[]>
{
typedef UniquePtr<T[]> _unique_array_unknown_bound;
};
template <typename T, size_t N>
struct _unique_if<T[N]>
{
typedef void _unique_array_known_bound;
};
template <typename T, typename... Args>
typename _unique_if<T>::_unique_single make_unique(Args&&... args)
{
return UniquePtr<T>(make<T>(std::forward<Args>(args)...));
}
template <typename T>
typename _unique_if<T>::_unique_array_unknown_bound make_unique(size_t count)
{
typedef typename std::remove_extent<T>::type Base;
return UniquePtr<T>(make_array<Base>(count));
}
template <class T, class... _Args>
typename _unique_if<T>::_unique_array_known_bound make_unique(_Args&&...) = delete;
};
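The smart-pointer helpers wrap the same allocator; a short sketch, assuming a hypothetical `Job` type:

struct Job { int id; };

GSRingHeap heap;

// Reference-counted sharing: the last SharedPtr destroyed runs ~Job() and
// returns the block to the ring (relaxed refcounting, per the class notes).
GSRingHeap::SharedPtr<Job> a = heap.make_shared<Job>();
GSRingHeap::SharedPtr<Job> b = a;

// Single ownership via std::unique_ptr with GSRingHeap::Deleter.
GSRingHeap::UniquePtr<Job> u = heap.make_unique<Job>();
GSRingHeap::UniquePtr<int[]> xs = heap.make_unique<int[]>(16);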


@@ -480,6 +480,7 @@
<ClCompile Include="GS\GSPerfMon.cpp" />
<ClCompile Include="GS\Renderers\Common\GSOsdManager.cpp" />
<ClCompile Include="GS\GSPng.cpp" />
<ClCompile Include="GS\GSRingHeap.cpp" />
<ClCompile Include="GS\Renderers\SW\GSRasterizer.cpp" />
<ClCompile Include="GS\Renderers\Common\GSRenderer.cpp" />
<ClCompile Include="GS\Renderers\DX11\GSRendererDX11.cpp" />
@@ -843,6 +844,7 @@
<ClInclude Include="GS\GSPerfMon.h" />
<ClInclude Include="GS\Renderers\Common\GSOsdManager.h" />
<ClInclude Include="GS\GSPng.h" />
<ClInclude Include="GS\GSRingHeap.h" />
<ClInclude Include="GS\Renderers\SW\GSRasterizer.h" />
<ClInclude Include="GS\Renderers\Common\GSRenderer.h" />
<ClInclude Include="GS\Renderers\DX11\GSRendererDX11.h" />


@@ -1463,6 +1463,9 @@
<ClCompile Include="GS\GSPng.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
<ClCompile Include="GS\GSRingHeap.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
<ClCompile Include="GS\GSLzma.cpp">
<Filter>System\Ps2\GS</Filter>
</ClCompile>
@@ -2556,6 +2559,9 @@
<ClInclude Include="GS\GSPng.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GSRingHeap.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GSThread_CXX11.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>