// Copyright 2013 Dolphin Emulator Project // Licensed under GPLv2+ // Refer to the license.txt file included. #include "VideoBackends/OGL/StreamBuffer.h" #include "Common/Align.h" #include "Common/GL/GLUtil.h" #include "Common/MathUtil.h" #include "Common/MemoryUtil.h" #include "VideoBackends/OGL/Render.h" #include "VideoCommon/DriverDetails.h" #include "VideoCommon/OnScreenDisplay.h" namespace OGL { // moved out of constructor, so m_buffer is allowed to be const static u32 GenBuffer() { u32 id; glGenBuffers(1, &id); return id; } StreamBuffer::StreamBuffer(u32 type, u32 size) : m_buffer(GenBuffer()), m_buffertype(type), m_size(MathUtil::NextPowerOf2(size)), m_bit_per_slot(IntLog2(MathUtil::NextPowerOf2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; m_free_iterator = 0; } StreamBuffer::~StreamBuffer() { glDeleteBuffers(1, &m_buffer); } /* Shared synchronization code for ring buffers * * The next three functions are to create/delete/use the OpenGL synchronization. * ARB_sync (OpenGL 3.2) is used and required. * * To reduce overhead, the complete buffer is splitted up into SYNC_POINTS chunks. * For each of this chunks, there is a fence which checks if this chunk is still in use. * * As our API allows to alloc more memory then it has to use, we have to catch how much is already * written. * * m_iterator - writing position * m_free_iterator - last position checked if free * m_used_iterator - last position known to be written * * So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set * m_free_iterator to m_iterator afterwards). * * We also assume that this buffer is accessed by the GPU between the Unmap and Map function, * so we may create the fences on the start of mapping. * Some here, new fences for the chunks between m_used_iterator and m_iterator (also update * m_used_iterator). * * As ring buffers have an ugly behavior on rollover, have fun to read this code ;) */ void StreamBuffer::CreateFences() { for (int i = 0; i < SYNC_POINTS; i++) { m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } } void StreamBuffer::DeleteFences() { for (int i = Slot(m_free_iterator) + 1; i < SYNC_POINTS; i++) { glDeleteSync(m_fences[i]); } for (int i = 0; i < Slot(m_iterator); i++) { glDeleteSync(m_fences[i]); } } void StreamBuffer::AllocMemory(u32 size) { // insert waiting slots for used memory for (int i = Slot(m_used_iterator); i < Slot(m_iterator); i++) { m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } m_used_iterator = m_iterator; // wait for new slots to end of buffer for (int i = Slot(m_free_iterator) + 1; i <= Slot(m_iterator + size) && i < SYNC_POINTS; i++) { glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); glDeleteSync(m_fences[i]); } // If we allocate a large amount of memory (A), commit a smaller amount, then allocate memory // smaller than allocation A, we will have already waited for these fences in A, but not used // the space. In this case, don't set m_free_iterator to a position before that which we know // is safe to use, which would result in waiting on the same fence(s) next time. if ((m_iterator + size) > m_free_iterator) m_free_iterator = m_iterator + size; // if buffer is full if (m_iterator + size >= m_size) { // insert waiting slots in unused space at the end of the buffer for (int i = Slot(m_used_iterator); i < SYNC_POINTS; i++) { m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } // move to the start m_used_iterator = m_iterator = 0; // offset 0 is always aligned // wait for space at the start for (int i = 0; i <= Slot(m_iterator + size); i++) { glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); glDeleteSync(m_fences[i]); } m_free_iterator = m_iterator + size; } } /* The usual way to stream data to the GPU. * Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping * Just do unsync appends until the buffer is full. * When it's full, orphan (alloc a new buffer and free the old one) * * As reallocation is an overhead, this method isn't as fast as it is known to be. */ class MapAndOrphan : public StreamBuffer { public: MapAndOrphan(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); } ~MapAndOrphan() {} std::pair Map(u32 size) override { if (m_iterator + size >= m_size) { glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); m_iterator = 0; } u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); return std::make_pair(pointer, m_iterator); } void Unmap(u32 used_size) override { glFlushMappedBufferRange(m_buffertype, 0, used_size); glUnmapBuffer(m_buffertype); m_iterator += used_size; } }; /* A modified streaming way without reallocation * This one fixes the reallocation overhead of the MapAndOrphan one. * So it alloc a ring buffer on initialization. * But with this limited resource, we have to care about the CPU-GPU distance. * Else this fifo may overflow. * So we had traded orphan vs syncing. */ class MapAndSync : public StreamBuffer { public: MapAndSync(u32 type, u32 size) : StreamBuffer(type, size) { CreateFences(); glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); } ~MapAndSync() { DeleteFences(); } std::pair Map(u32 size) override { AllocMemory(size); u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); return std::make_pair(pointer, m_iterator); } void Unmap(u32 used_size) override { glFlushMappedBufferRange(m_buffertype, 0, used_size); glUnmapBuffer(m_buffertype); m_iterator += used_size; } }; /* Streaming fifo without mapping overhead. * This one usually requires ARB_buffer_storage (OpenGL 4.4). * And is usually not available on OpenGL3 GPUs. * * ARB_buffer_storage allows us to render from a mapped buffer. * So we map it persistently in the initialization. * * Unsync mapping sounds like an easy task, but it isn't for threaded drivers. * So every mapping on current close-source driver _will_ end in * at least a round trip time between two threads. * * As persistently mapped buffer can't use orphaning, we also have to sync. */ class BufferStorage : public StreamBuffer { public: BufferStorage(u32 type, u32 size, bool _coherent = false) : StreamBuffer(type, size), coherent(_coherent) { CreateFences(); glBindBuffer(m_buffertype, m_buffer); // PERSISTANT_BIT to make sure that the buffer can be used while mapped // COHERENT_BIT is set so we don't have to use a MemoryBarrier on write // CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then // server side glBufferStorage(m_buffertype, m_size, nullptr, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0)); m_pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT)); } ~BufferStorage() { DeleteFences(); glUnmapBuffer(m_buffertype); glBindBuffer(m_buffertype, 0); } std::pair Map(u32 size) override { AllocMemory(size); return std::make_pair(m_pointer + m_iterator, m_iterator); } void Unmap(u32 used_size) override { if (!coherent) glFlushMappedBufferRange(m_buffertype, m_iterator, used_size); m_iterator += used_size; } u8* m_pointer; const bool coherent; }; /* --- AMD only --- * Another streaming fifo without mapping overhead. * As we can't orphan without mapping, we have to sync. * * This one uses AMD_pinned_memory which is available on all AMD GPUs. * OpenGL 4.4 drivers should use BufferStorage. */ class PinnedMemory : public StreamBuffer { public: PinnedMemory(u32 type, u32 size) : StreamBuffer(type, size) { CreateFences(); m_pointer = static_cast(Common::AllocateAlignedMemory( Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY)); glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer); glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY); glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0); glBindBuffer(m_buffertype, m_buffer); } ~PinnedMemory() { DeleteFences(); glBindBuffer(m_buffertype, 0); glFinish(); // ogl pipeline must be flushed, else this buffer can be in use Common::FreeAlignedMemory(m_pointer); m_pointer = nullptr; } std::pair Map(u32 size) override { AllocMemory(size); return std::make_pair(m_pointer + m_iterator, m_iterator); } void Unmap(u32 used_size) override { m_iterator += used_size; } u8* m_pointer; static const u32 ALIGN_PINNED_MEMORY = 4096; }; /* Fifo based on the glBufferSubData call. * As everything must be copied before glBufferSubData returns, * an additional memcpy in the driver will be done. * So this is a huge overhead, only use it if required. */ class BufferSubData : public StreamBuffer { public: BufferSubData(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, size, nullptr, GL_STATIC_DRAW); m_pointer = new u8[m_size]; } ~BufferSubData() { delete[] m_pointer; } std::pair Map(u32 size) override { return std::make_pair(m_pointer, 0); } void Unmap(u32 used_size) override { glBufferSubData(m_buffertype, 0, used_size, m_pointer); } u8* m_pointer; }; /* Fifo based on the glBufferData call. * Some trashy drivers stall in BufferSubData. * So here we use glBufferData, which realloc this buffer every time. * This may avoid stalls, but it is a bigger overhead than BufferSubData. */ class BufferData : public StreamBuffer { public: BufferData(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); m_pointer = new u8[m_size]; } ~BufferData() { delete[] m_pointer; } std::pair Map(u32 size) override { return std::make_pair(m_pointer, 0); } void Unmap(u32 used_size) override { glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW); } u8* m_pointer; }; // Chooses the best streaming method based on the supported extensions and known issues std::unique_ptr StreamBuffer::Create(u32 type, u32 size) { // without basevertex support, only streaming methods whith uploads everything to zero works fine: if (!g_ogl_config.bSupportsGLBaseVertex) { if (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM)) return std::make_unique(type, size); // BufferData is by far the worst way, only use it if needed return std::make_unique(type, size); } // Prefer the syncing buffers over the orphaning one if (g_ogl_config.bSupportsGLSync) { // pinned memory is much faster than buffer storage on AMD cards if (g_ogl_config.bSupportsGLPinnedMemory && !(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_PINNED_MEMORY))) return std::make_unique(type, size); // buffer storage works well in most situations bool coherent = DriverDetails::HasBug(DriverDetails::BUG_BROKEN_EXPLICIT_FLUSH); if (g_ogl_config.bSupportsGLBufferStorage && !(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STORAGE) && type == GL_ARRAY_BUFFER) && !(DriverDetails::HasBug(DriverDetails::BUG_INTEL_BROKEN_BUFFER_STORAGE) && type == GL_ELEMENT_ARRAY_BUFFER)) return std::make_unique(type, size, coherent); // don't fall back to MapAnd* for Nvidia drivers if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_UNSYNC_MAPPING)) { if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM)) return std::make_unique(type, size); else return std::make_unique(type, size); } // mapping fallback if (g_ogl_config.bSupportsGLSync) return std::make_unique(type, size); } // default fallback, should work everywhere, but isn't the best way to do this job return std::make_unique(type, size); } }