Merge branch 'StreamBuffers'
This branch drops our temporary buffer in VertexLoaderBase. Instead, every backend must now provide a buffer into which vertices and indices are converted. D3D just uses a temporary buffer as before. OGL maps the GPU-based buffers and streams into them directly, which avoids an unneeded memcpy on the OGL backend.
This commit is contained in:
commit
bfd0b7275e
|
@ -77,6 +77,12 @@ void VertexManager::DestroyDeviceObjects()
|
|||
|
||||
// Sizes the local vertex and index staging vectors, points the shared
// base/current/end write pointers at the vertex storage, and then creates
// the device objects.
VertexManager::VertexManager()
{
	// Vertex staging storage: the write cursor starts at its first byte and
	// the end pointer sits one past its last byte.
	LocalVBuffer.resize(MAXVBUFFERSIZE);
	u8* const vertex_storage = &LocalVBuffer[0];
	s_pBaseBufferPointer = vertex_storage;
	s_pCurBufferPointer = vertex_storage;
	s_pEndBufferPointer = vertex_storage + LocalVBuffer.size();

	// Index staging storage.
	LocalIBuffer.resize(MAXIBUFFERSIZE);

	CreateDeviceObjects();
}
|
||||
|
||||
|
@ -222,4 +228,10 @@ void VertexManager::vFlush()
|
|||
g_renderer->RestoreState();
|
||||
}
|
||||
|
||||
// Rewinds the vertex write cursor to the base of the vertex buffer and
// restarts index generation on the buffer returned by GetIndexBuffer().
// The `stride` argument is not used by this implementation.
void VertexManager::ResetBuffer(u32 stride)
{
	s_pCurBufferPointer = s_pBaseBufferPointer;

	u16* const index_storage = GetIndexBuffer();
	IndexGenerator::Start(index_storage);
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -22,6 +22,10 @@ public:
|
|||
void CreateDeviceObjects();
|
||||
void DestroyDeviceObjects();
|
||||
|
||||
protected:
|
||||
virtual void ResetBuffer(u32 stride);
|
||||
u16* GetIndexBuffer() { return &LocalIBuffer[0]; }
|
||||
|
||||
private:
|
||||
|
||||
void PrepareDrawBuffers();
|
||||
|
@ -41,6 +45,9 @@ private:
|
|||
|
||||
LineGeometryShader m_lineShader;
|
||||
PointGeometryShader m_pointShader;
|
||||
|
||||
std::vector<u8> LocalVBuffer;
|
||||
std::vector<u16> LocalIBuffer;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -193,29 +193,19 @@ void ProgramShaderCache::UploadConstants()
|
|||
{
|
||||
if(PixelShaderManager::dirty || VertexShaderManager::dirty)
|
||||
{
|
||||
s_buffer->Alloc(s_ubo_buffer_size);
|
||||
if (DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
|
||||
{
|
||||
// This is just a hack to support our BUFFERDATA upload method
|
||||
// as it's broken to uploaded in a splited way
|
||||
static u8 *tmpbuffer = new u8[s_ubo_buffer_size];
|
||||
memcpy(tmpbuffer, &PixelShaderManager::constants, sizeof(PixelShaderConstants));
|
||||
memcpy(tmpbuffer+ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), &VertexShaderManager::constants, sizeof(VertexShaderConstants));
|
||||
size_t offset = s_buffer->Upload(tmpbuffer, s_ubo_buffer_size);
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 1,
|
||||
s_buffer->getBuffer(), offset, sizeof(PixelShaderConstants));
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 2,
|
||||
s_buffer->getBuffer(), offset+ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), sizeof(VertexShaderConstants));
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t offset = s_buffer->Upload((u8*)&PixelShaderManager::constants, ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align));
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 1,
|
||||
s_buffer->getBuffer(), offset, sizeof(PixelShaderConstants));
|
||||
offset = s_buffer->Upload((u8*)&VertexShaderManager::constants, ROUND_UP(sizeof(VertexShaderConstants), s_ubo_align));
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 2,
|
||||
s_buffer->getBuffer(), offset, sizeof(VertexShaderConstants));
|
||||
}
|
||||
auto buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align);
|
||||
|
||||
memcpy(buffer.first,
|
||||
&PixelShaderManager::constants, sizeof(PixelShaderConstants));
|
||||
|
||||
memcpy(buffer.first + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
|
||||
&VertexShaderManager::constants, sizeof(VertexShaderConstants));
|
||||
|
||||
s_buffer->Unmap(s_ubo_buffer_size);
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->m_buffer, buffer.second,
|
||||
sizeof(PixelShaderConstants));
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->m_buffer, buffer.second + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align),
|
||||
sizeof(VertexShaderConstants));
|
||||
|
||||
PixelShaderManager::dirty = false;
|
||||
VertexShaderManager::dirty = false;
|
||||
|
@ -481,7 +471,7 @@ void ProgramShaderCache::Init(void)
|
|||
// We multiply by *4*4 because we need to get down to basic machine units.
|
||||
// So multiply by four to get how many floats we have from vec4s
|
||||
// Then once more to get bytes
|
||||
s_buffer = new StreamBuffer(GL_UNIFORM_BUFFER, UBO_LENGTH);
|
||||
s_buffer = StreamBuffer::Create(GL_UNIFORM_BUFFER, UBO_LENGTH);
|
||||
}
|
||||
|
||||
// Read our shader cache, only if supported
|
||||
|
|
|
@ -1653,7 +1653,7 @@ void Renderer::RestoreAPIState()
|
|||
|
||||
VertexManager *vm = (OGL::VertexManager*)g_vertex_manager;
|
||||
glBindBuffer(GL_ARRAY_BUFFER, vm->m_vertex_buffers);
|
||||
vm->m_last_vao = 0;
|
||||
glBindVertexArray(vm->m_last_vao);
|
||||
|
||||
TextureCache::SetStage();
|
||||
}
|
||||
|
|
|
@ -13,227 +13,60 @@
|
|||
namespace OGL
|
||||
{
|
||||
|
||||
static const u32 SYNC_POINTS = 16;
|
||||
static const u32 ALIGN_PINNED_MEMORY = 4096;
|
||||
// moved out of constructor, so m_buffer is allowed to be const
|
||||
// Generates one OpenGL buffer object name and returns it.
// The name starts out as 0 so that, should glGenBuffers not write a result
// (e.g. the call raises a GL error), the caller receives a defined value
// rather than an indeterminate one. glGenBuffers itself only hands out
// non-zero names.
static u32 genBuffer()
{
	u32 id = 0;
	glGenBuffers(1, &id);
	return id;
}
|
||||
|
||||
StreamBuffer::StreamBuffer(u32 type, size_t size)
|
||||
: m_buffertype(type), m_size(size)
|
||||
{
|
||||
glGenBuffers(1, &m_buffer);
|
||||
|
||||
bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation");
|
||||
|
||||
if (g_ogl_config.bSupportsGLBufferStorage &&
|
||||
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER))
|
||||
m_uploadtype = BUFFERSTORAGE;
|
||||
else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
|
||||
m_uploadtype = BUFFERSUBDATA;
|
||||
else if(!g_ogl_config.bSupportsGLBaseVertex)
|
||||
m_uploadtype = BUFFERDATA;
|
||||
else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory &&
|
||||
!(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
|
||||
m_uploadtype = PINNED_MEMORY;
|
||||
else if(nvidia)
|
||||
m_uploadtype = BUFFERSUBDATA;
|
||||
else if(g_ogl_config.bSupportsGLSync)
|
||||
m_uploadtype = MAP_AND_SYNC;
|
||||
else
|
||||
m_uploadtype = MAP_AND_ORPHAN;
|
||||
|
||||
Init();
|
||||
}
|
||||
|
||||
StreamBuffer::~StreamBuffer()
|
||||
{
|
||||
Shutdown();
|
||||
glDeleteBuffers(1, &m_buffer);
|
||||
}
|
||||
|
||||
#define SLOT(x) ((x)*SYNC_POINTS/m_size)
|
||||
|
||||
void StreamBuffer::Alloc ( size_t size, u32 stride )
|
||||
{
|
||||
size_t m_iterator_aligned = m_iterator;
|
||||
if(m_iterator_aligned && stride) {
|
||||
m_iterator_aligned--;
|
||||
m_iterator_aligned = m_iterator_aligned - (m_iterator_aligned % stride) + stride;
|
||||
}
|
||||
size_t iter_end = m_iterator_aligned + size;
|
||||
|
||||
switch(m_uploadtype) {
|
||||
case MAP_AND_ORPHAN:
|
||||
if(iter_end >= m_size) {
|
||||
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
|
||||
m_iterator_aligned = 0;
|
||||
}
|
||||
break;
|
||||
case MAP_AND_SYNC:
|
||||
case PINNED_MEMORY:
|
||||
case BUFFERSTORAGE:
|
||||
// insert waiting slots for used memory
|
||||
for (size_t i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++)
|
||||
{
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
}
|
||||
m_used_iterator = m_iterator;
|
||||
|
||||
// wait for new slots to end of buffer
|
||||
for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(iter_end) && i < SYNC_POINTS; i++)
|
||||
{
|
||||
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
|
||||
glDeleteSync(fences[i]);
|
||||
}
|
||||
m_free_iterator = iter_end;
|
||||
|
||||
// if buffer is full
|
||||
if (iter_end >= m_size) {
|
||||
|
||||
// insert waiting slots in unused space at the end of the buffer
|
||||
for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
|
||||
{
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
}
|
||||
|
||||
// move to the start
|
||||
m_used_iterator = m_iterator_aligned = m_iterator = 0; // offset 0 is always aligned
|
||||
iter_end = size;
|
||||
|
||||
// wait for space at the start
|
||||
for (u32 i = 0; i <= SLOT(iter_end); i++)
|
||||
{
|
||||
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
|
||||
glDeleteSync(fences[i]);
|
||||
}
|
||||
m_free_iterator = iter_end;
|
||||
}
|
||||
|
||||
break;
|
||||
case BUFFERSUBDATA:
|
||||
case BUFFERDATA:
|
||||
m_iterator_aligned = 0;
|
||||
break;
|
||||
}
|
||||
m_iterator = m_iterator_aligned;
|
||||
}
|
||||
|
||||
size_t StreamBuffer::Upload ( u8* data, size_t size )
|
||||
{
|
||||
switch(m_uploadtype) {
|
||||
case MAP_AND_SYNC:
|
||||
case MAP_AND_ORPHAN:
|
||||
pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
|
||||
if(pointer) {
|
||||
memcpy(pointer, data, size);
|
||||
glUnmapBuffer(m_buffertype);
|
||||
} else {
|
||||
ERROR_LOG(VIDEO, "Buffer mapping failed");
|
||||
}
|
||||
break;
|
||||
case PINNED_MEMORY:
|
||||
case BUFFERSTORAGE:
|
||||
if (pointer)
|
||||
memcpy(pointer + m_iterator, data, size);
|
||||
break;
|
||||
case BUFFERSUBDATA:
|
||||
glBufferSubData(m_buffertype, m_iterator, size, data);
|
||||
break;
|
||||
case BUFFERDATA:
|
||||
glBufferData(m_buffertype, size, data, GL_STREAM_DRAW);
|
||||
break;
|
||||
}
|
||||
size_t ret = m_iterator;
|
||||
m_iterator += size;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void StreamBuffer::Init()
|
||||
: m_buffer(genBuffer()), m_buffertype(type), m_size(size)
|
||||
{
|
||||
m_iterator = 0;
|
||||
m_used_iterator = 0;
|
||||
m_free_iterator = 0;
|
||||
|
||||
switch(m_uploadtype) {
|
||||
case MAP_AND_SYNC:
|
||||
fences = new GLsync[SYNC_POINTS];
|
||||
for(u32 i=0; i<SYNC_POINTS; i++)
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
|
||||
case MAP_AND_ORPHAN:
|
||||
case BUFFERSUBDATA:
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
|
||||
break;
|
||||
case PINNED_MEMORY:
|
||||
glGetError(); // errors before this allocation should be ignored
|
||||
fences = new GLsync[SYNC_POINTS];
|
||||
for(u32 i=0; i<SYNC_POINTS; i++)
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
|
||||
pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY );
|
||||
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
|
||||
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), pointer, GL_STREAM_COPY);
|
||||
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
|
||||
// on error, switch to another backend. some old catalyst seems to have broken pinned memory support
|
||||
if(glGetError() != GL_NO_ERROR) {
|
||||
ERROR_LOG(VIDEO, "Pinned memory detected, but not working. Please report this: %s, %s, %s", g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, g_ogl_config.gl_version);
|
||||
Shutdown();
|
||||
m_uploadtype = MAP_AND_SYNC;
|
||||
Init();
|
||||
}
|
||||
break;
|
||||
|
||||
case BUFFERSTORAGE:
|
||||
glGetError(); // errors before this allocation should be ignored
|
||||
fences = new GLsync[SYNC_POINTS];
|
||||
for (u32 i = 0; i<SYNC_POINTS; i++)
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
|
||||
// PERSISTANT_BIT to make sure that the buffer can be used while mapped
|
||||
// COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
|
||||
// CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then server side
|
||||
glBufferStorage(m_buffertype, m_size, NULL,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_CLIENT_STORAGE_BIT);
|
||||
pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
|
||||
if(!pointer)
|
||||
ERROR_LOG(VIDEO, "Buffer allocation failed");
|
||||
break;
|
||||
|
||||
case BUFFERDATA:
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
break;
|
||||
}
|
||||
fences = nullptr;
|
||||
}
|
||||
|
||||
void StreamBuffer::Shutdown()
|
||||
|
||||
StreamBuffer::~StreamBuffer()
|
||||
{
|
||||
switch(m_uploadtype) {
|
||||
case MAP_AND_SYNC:
|
||||
DeleteFences();
|
||||
break;
|
||||
case MAP_AND_ORPHAN:
|
||||
case BUFFERSUBDATA:
|
||||
case BUFFERDATA:
|
||||
break;
|
||||
case PINNED_MEMORY:
|
||||
DeleteFences();
|
||||
glBindBuffer(m_buffertype, 0);
|
||||
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
|
||||
FreeAlignedMemory(pointer);
|
||||
break;
|
||||
case BUFFERSTORAGE:
|
||||
DeleteFences();
|
||||
glUnmapBuffer(m_buffertype);
|
||||
glBindBuffer(m_buffertype, 0);
|
||||
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
|
||||
break;
|
||||
}
|
||||
glDeleteBuffers(1, &m_buffer);
|
||||
}
|
||||
|
||||
/* Shared synchronisation code for ring buffers
|
||||
*
|
||||
* The next three functions are to create/delete/use the OpenGL synchronisation.
|
||||
* ARB_sync (OpenGL 3.2) is used and required.
|
||||
*
|
||||
* To reduce overhead, the complete buffer is split up into SYNC_POINTS chunks.
|
||||
* For each of these chunks, there is a fence which checks if the chunk is still in use.
|
||||
*
|
||||
* As our API allows allocating more memory than ends up being used, we have to track how much has already been written.
|
||||
*
|
||||
* m_iterator - writing position
|
||||
* m_free_iterator - last position checked if free
|
||||
* m_used_iterator - last position known to be written
|
||||
*
|
||||
* So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set m_free_iterator to m_iterator afterwards).
|
||||
*
|
||||
* We also assume that this buffer is accessed by the gpu between the Unmap and Map function,
|
||||
* so we may create the fences on the start of mapping.
|
||||
* So here we create new fences for the chunks between m_used_iterator and m_iterator (and also update m_used_iterator).
|
||||
*
|
||||
* As ring buffers have an ugly behavior on rollover, have fun reading this code ;)
|
||||
*/
|
||||
|
||||
#define SLOT(x) ((x)*SYNC_POINTS/m_size)
|
||||
static const u32 SYNC_POINTS = 16;
|
||||
void StreamBuffer::CreateFences()
|
||||
{
|
||||
fences = new GLsync[SYNC_POINTS];
|
||||
for(u32 i=0; i<SYNC_POINTS; i++)
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
}
|
||||
void StreamBuffer::DeleteFences()
|
||||
{
|
||||
for (size_t i = SLOT(m_free_iterator) + 1; i < SYNC_POINTS; i++)
|
||||
|
@ -246,5 +79,291 @@ void StreamBuffer::DeleteFences()
|
|||
}
|
||||
delete [] fences;
|
||||
}
|
||||
void StreamBuffer::AllocMemory(size_t size)
|
||||
{
|
||||
// insert waiting slots for used memory
|
||||
for (size_t i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++)
|
||||
{
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
}
|
||||
m_used_iterator = m_iterator;
|
||||
|
||||
// wait for new slots to end of buffer
|
||||
for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++)
|
||||
{
|
||||
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
|
||||
glDeleteSync(fences[i]);
|
||||
}
|
||||
m_free_iterator = m_iterator + size;
|
||||
|
||||
// if buffer is full
|
||||
if (m_iterator + size >= m_size) {
|
||||
|
||||
// insert waiting slots in unused space at the end of the buffer
|
||||
for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++)
|
||||
{
|
||||
fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
}
|
||||
|
||||
// move to the start
|
||||
m_used_iterator = m_iterator = 0; // offset 0 is always aligned
|
||||
|
||||
// wait for space at the start
|
||||
for (u32 i = 0; i <= SLOT(m_iterator + size); i++)
|
||||
{
|
||||
glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
|
||||
glDeleteSync(fences[i]);
|
||||
}
|
||||
m_free_iterator = m_iterator + size;
|
||||
}
|
||||
}
|
||||
#undef SLOT
|
||||
|
||||
// Rounds m_iterator up to a multiple of `stride` (it is left unchanged when
// it already is one). Does nothing when m_iterator is 0 or stride is 0.
void StreamBuffer::Align(u32 stride)
{
	if (m_iterator == 0 || stride == 0)
		return;

	// Step back one byte, drop the remainder, then add one full stride.
	m_iterator -= 1;
	const size_t remainder = m_iterator % stride;
	m_iterator = m_iterator - remainder + stride;
}
|
||||
|
||||
/* The usual way to stream data to the gpu.
|
||||
* Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping
|
||||
* Just do unsync appends until the buffer is full.
|
||||
* When it's full, orphan (alloc a new buffer and free the old one)
|
||||
*
|
||||
* As reallocation is an overhead, this method isn't as fast as it is known to be.
|
||||
*/
|
||||
class MapAndOrphan : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
MapAndOrphan(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
|
||||
}
|
||||
|
||||
~MapAndOrphan() {
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
Align(stride);
|
||||
if(m_iterator + size >= m_size) {
|
||||
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
|
||||
m_iterator = 0;
|
||||
}
|
||||
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
|
||||
return std::make_pair(pointer, m_iterator);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
glFlushMappedBufferRange(m_buffertype, 0, used_size);
|
||||
glUnmapBuffer(m_buffertype);
|
||||
m_iterator += used_size;
|
||||
}
|
||||
};
|
||||
|
||||
/* A modified streaming way without reallocation
|
||||
* This one fixes the reallocation overhead of the MapAndOrphan one.
|
||||
* So it alloc a ring buffer on initialization.
|
||||
* But with this limited resource, we have to care about the cpu-gpu distance.
|
||||
* Else this fifo may overflow.
|
||||
* So we have traded orphaning for syncing.
|
||||
*/
|
||||
class MapAndSync : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
MapAndSync(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
CreateFences();
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW);
|
||||
}
|
||||
|
||||
~MapAndSync() {
|
||||
DeleteFences();
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
Align(stride);
|
||||
AllocMemory(size);
|
||||
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
|
||||
return std::make_pair(pointer, m_iterator);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
glFlushMappedBufferRange(m_buffertype, 0, used_size);
|
||||
glUnmapBuffer(m_buffertype);
|
||||
m_iterator += used_size;
|
||||
}
|
||||
};
|
||||
|
||||
/* Streaming fifo without mapping overhead.
|
||||
* This one usually requires ARB_buffer_storage (OpenGL 4.4).
|
||||
* And is usually not available on OpenGL3 gpus.
|
||||
*
|
||||
* ARB_buffer_storage allows us to render from a mapped buffer.
|
||||
* So we map it persistently in the initialization.
|
||||
*
|
||||
* Unsync mapping sounds like an easy task, but it isn't for threaded drivers.
|
||||
* So every mapping on current closed-source drivers _will_ end in
|
||||
* at least a round trip time between two threads.
|
||||
*
|
||||
* As persistently mapped buffer can't use orphaning, we also have to sync.
|
||||
*/
|
||||
class BufferStorage : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
BufferStorage(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
CreateFences();
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
|
||||
// PERSISTANT_BIT to make sure that the buffer can be used while mapped
|
||||
// COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
|
||||
// CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then server side
|
||||
glBufferStorage(m_buffertype, m_size, NULL,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_CLIENT_STORAGE_BIT);
|
||||
m_pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size,
|
||||
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);
|
||||
}
|
||||
|
||||
~BufferStorage() {
|
||||
DeleteFences();
|
||||
glUnmapBuffer(m_buffertype);
|
||||
glBindBuffer(m_buffertype, 0);
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
Align(stride);
|
||||
AllocMemory(size);
|
||||
return std::make_pair(m_pointer + m_iterator, m_iterator);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
m_iterator += used_size;
|
||||
}
|
||||
|
||||
u8* m_pointer;
|
||||
};
|
||||
|
||||
/* --- AMD only ---
|
||||
* Another streaming fifo without mapping overhead.
|
||||
* As we can't orphan without mapping, we have to sync.
|
||||
*
|
||||
* This one uses AMD_pinned_memory which is available on all AMD gpus.
|
||||
* OpenGL 4.4 drivers should use BufferStorage.
|
||||
*/
|
||||
class PinnedMemory : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
PinnedMemory(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
CreateFences();
|
||||
m_pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY );
|
||||
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
|
||||
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY);
|
||||
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
}
|
||||
|
||||
~PinnedMemory() {
|
||||
DeleteFences();
|
||||
glBindBuffer(m_buffertype, 0);
|
||||
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
|
||||
FreeAlignedMemory(m_pointer);
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
Align(stride);
|
||||
AllocMemory(size);
|
||||
return std::make_pair(m_pointer + m_iterator, m_iterator);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
m_iterator += used_size;
|
||||
}
|
||||
|
||||
u8* m_pointer;
|
||||
static const u32 ALIGN_PINNED_MEMORY = 4096;
|
||||
};
|
||||
|
||||
/* Fifo based on the glBufferSubData call.
|
||||
* As everything must be copied before glBufferSubData returns,
|
||||
* an additional memcpy in the driver will be done.
|
||||
* So this is a huge overhead, only use it if required.
|
||||
*/
|
||||
class BufferSubData : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
BufferSubData(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
glBufferData(m_buffertype, size, 0, GL_STATIC_DRAW);
|
||||
m_pointer = new u8[m_size];
|
||||
}
|
||||
|
||||
~BufferSubData() {
|
||||
delete [] m_pointer;
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
return std::make_pair(m_pointer, 0);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
glBufferSubData(m_buffertype, 0, used_size, m_pointer);
|
||||
}
|
||||
|
||||
u8* m_pointer;
|
||||
};
|
||||
|
||||
/* Fifo based on the glBufferData call.
|
||||
* Some trashy drivers stall in BufferSubData.
|
||||
* So here we use glBufferData, which realloc this buffer every time.
|
||||
* This may avoid stalls, but it is a bigger overhead than BufferSubData.
|
||||
*/
|
||||
class BufferData : public StreamBuffer
|
||||
{
|
||||
public:
|
||||
BufferData(u32 type, size_t size) : StreamBuffer(type, size) {
|
||||
glBindBuffer(m_buffertype, m_buffer);
|
||||
m_pointer = new u8[m_size];
|
||||
}
|
||||
|
||||
~BufferData() {
|
||||
delete [] m_pointer;
|
||||
}
|
||||
|
||||
std::pair<u8*, size_t> Map(size_t size, u32 stride) {
|
||||
return std::make_pair(m_pointer, 0);
|
||||
}
|
||||
|
||||
void Unmap(size_t used_size) {
|
||||
glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW);
|
||||
}
|
||||
|
||||
u8* m_pointer;
|
||||
};
|
||||
|
||||
// choose best streaming library based on the supported extensions and known issues
|
||||
// Factory: picks the streaming implementation from the detected GL features
// and the known driver bugs, and returns a new instance of it. The checks
// are ordered; the first one that matches wins.
StreamBuffer* StreamBuffer::Create(u32 type, size_t size)
{
	const bool is_nvidia = strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation") == 0;

	// Buffer storage, unless it is flagged as broken for array buffers.
	if (g_ogl_config.bSupportsGLBufferStorage &&
	    (!DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) || type != GL_ARRAY_BUFFER))
	{
		return new BufferStorage(type, size);
	}

	// Without base vertex support only the two copy-based uploads are used;
	// BUG_BROKENBUFFERSTREAM selects glBufferData over glBufferSubData.
	if (!g_ogl_config.bSupportsGLBaseVertex)
	{
		if (DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
			return new BufferData(type, size);
		return new BufferSubData(type, size);
	}

	// Pinned memory, unless it is flagged as broken for element array buffers.
	if (g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory &&
	    (!DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) || type != GL_ELEMENT_ARRAY_BUFFER))
	{
		return new PinnedMemory(type, size);
	}

	if (is_nvidia)
		return new BufferSubData(type, size);

	if (g_ogl_config.bSupportsGLSync)
		return new MapAndSync(type, size);

	return new MapAndOrphan(type, size);
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#ifndef STREAMBUFFER_H
|
||||
#define STREAMBUFFER_H
|
||||
|
||||
#include <utility>
|
||||
#include "VideoCommon.h"
|
||||
#include "FramebufferManager.h"
|
||||
#include "GLUtil.h"
|
||||
|
@ -17,39 +18,41 @@
|
|||
|
||||
namespace OGL
|
||||
{
|
||||
enum StreamType {
|
||||
MAP_AND_ORPHAN = (1 << 1),
|
||||
MAP_AND_SYNC = (1 << 2),
|
||||
PINNED_MEMORY = (1 << 3),
|
||||
BUFFERSUBDATA = (1 << 4),
|
||||
BUFFERDATA = (1 << 5),
|
||||
BUFFERSTORAGE = (1 << 6),
|
||||
};
|
||||
|
||||
class StreamBuffer {
|
||||
|
||||
public:
|
||||
static StreamBuffer* Create(u32 type, size_t size);
|
||||
virtual ~StreamBuffer();
|
||||
|
||||
/* This mapping function will return a pair of:
|
||||
* - the pointer to the mapped buffer
|
||||
* - the offset into the real gpu buffer (always multiple of stride)
|
||||
* On mapping, the maximum of size for allocation has to be set.
|
||||
* The size really pushed into this fifo only has to be known on Unmapping.
|
||||
* Mapping invalidates the current buffer content,
|
||||
* so it isn't allowed to access the old content any more.
|
||||
*/
|
||||
virtual std::pair<u8*, size_t> Map(size_t size, u32 stride = 0) = 0;
|
||||
virtual void Unmap(size_t used_size) = 0;
|
||||
|
||||
const u32 m_buffer;
|
||||
|
||||
protected:
|
||||
StreamBuffer(u32 type, size_t size);
|
||||
~StreamBuffer();
|
||||
|
||||
void Alloc(size_t size, u32 stride = 0);
|
||||
size_t Upload(u8 *data, size_t size);
|
||||
|
||||
u32 getBuffer() { return m_buffer; }
|
||||
|
||||
private:
|
||||
void Init();
|
||||
void Shutdown();
|
||||
void CreateFences();
|
||||
void DeleteFences();
|
||||
void AllocMemory(size_t size);
|
||||
void Align(u32 stride);
|
||||
|
||||
StreamType m_uploadtype;
|
||||
u32 m_buffer;
|
||||
u32 m_buffertype;
|
||||
size_t m_size;
|
||||
u8 *pointer;
|
||||
const u32 m_buffertype;
|
||||
const size_t m_size;
|
||||
|
||||
size_t m_iterator;
|
||||
size_t m_used_iterator;
|
||||
size_t m_free_iterator;
|
||||
|
||||
private:
|
||||
GLsync *fences;
|
||||
};
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ namespace OGL
|
|||
{
|
||||
// These are the initially requested sizes of the buffers, expressed in bytes
|
||||
const u32 MAX_IBUFFER_SIZE = 2*1024*1024;
|
||||
const u32 MAX_VBUFFER_SIZE = 16*1024*1024;
|
||||
const u32 MAX_VBUFFER_SIZE = 32*1024*1024;
|
||||
|
||||
static StreamBuffer *s_vertexBuffer;
|
||||
static StreamBuffer *s_indexBuffer;
|
||||
|
@ -58,11 +58,11 @@ VertexManager::~VertexManager()
|
|||
|
||||
void VertexManager::CreateDeviceObjects()
|
||||
{
|
||||
s_vertexBuffer = new StreamBuffer(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE);
|
||||
m_vertex_buffers = s_vertexBuffer->getBuffer();
|
||||
s_vertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE);
|
||||
m_vertex_buffers = s_vertexBuffer->m_buffer;
|
||||
|
||||
s_indexBuffer = new StreamBuffer(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE);
|
||||
m_index_buffers = s_indexBuffer->getBuffer();
|
||||
s_indexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE);
|
||||
m_index_buffers = s_indexBuffer->m_buffer;
|
||||
|
||||
m_CurrentVertexFmt = NULL;
|
||||
m_last_vao = 0;
|
||||
|
@ -85,17 +85,25 @@ void VertexManager::PrepareDrawBuffers(u32 stride)
|
|||
u32 vertex_data_size = IndexGenerator::GetNumVerts() * stride;
|
||||
u32 index_data_size = IndexGenerator::GetIndexLen() * sizeof(u16);
|
||||
|
||||
s_vertexBuffer->Alloc(vertex_data_size, stride);
|
||||
size_t offset = s_vertexBuffer->Upload(GetVertexBuffer(), vertex_data_size);
|
||||
s_baseVertex = offset / stride;
|
||||
|
||||
s_indexBuffer->Alloc(index_data_size);
|
||||
s_index_offset = s_indexBuffer->Upload((u8*)GetIndexBuffer(), index_data_size);
|
||||
s_vertexBuffer->Unmap(vertex_data_size);
|
||||
s_indexBuffer->Unmap(index_data_size);
|
||||
|
||||
ADDSTAT(stats.thisFrame.bytesVertexStreamed, vertex_data_size);
|
||||
ADDSTAT(stats.thisFrame.bytesIndexStreamed, index_data_size);
|
||||
}
|
||||
|
||||
// Maps fresh regions of the vertex and index stream buffers and points the
// shared write pointers and the index generator at them. The offsets
// returned by the mappings are stored as s_baseVertex (vertex offset divided
// by `stride`) and s_index_offset.
void VertexManager::ResetBuffer(u32 stride)
{
	// Vertex stream: MAXVBUFFERSIZE bytes, aligned to the vertex stride.
	const auto vertex_map = s_vertexBuffer->Map(MAXVBUFFERSIZE, stride);
	s_pBaseBufferPointer = vertex_map.first;
	s_pCurBufferPointer = vertex_map.first;
	s_pEndBufferPointer = vertex_map.first + MAXVBUFFERSIZE;
	s_baseVertex = vertex_map.second / stride;

	// Index stream: room for MAXIBUFFERSIZE 16-bit indices.
	const auto index_map = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16));
	IndexGenerator::Start(reinterpret_cast<u16*>(index_map.first));
	s_index_offset = index_map.second;
}
|
||||
|
||||
void VertexManager::Draw(u32 stride)
|
||||
{
|
||||
u32 index_size = IndexGenerator::GetIndexLen();
|
||||
|
@ -234,4 +242,5 @@ void VertexManager::vFlush()
|
|||
GL_REPORT_ERRORD();
|
||||
}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -40,6 +40,8 @@ public:
|
|||
GLuint m_vertex_buffers;
|
||||
GLuint m_index_buffers;
|
||||
GLuint m_last_vao;
|
||||
protected:
|
||||
virtual void ResetBuffer(u32 stride);
|
||||
private:
|
||||
void Draw(u32 stride);
|
||||
void vFlush() override;
|
||||
|
|
|
@ -24,6 +24,8 @@ u8 *VertexManager::s_pEndBufferPointer;
|
|||
|
||||
PrimitiveType VertexManager::current_primitive_type;
|
||||
|
||||
bool VertexManager::IsFlushed;
|
||||
|
||||
static const PrimitiveType primitive_from_gx[8] = {
|
||||
PRIMITIVE_TRIANGLES, // GX_DRAW_QUADS
|
||||
PRIMITIVE_TRIANGLES, // GX_DRAW_NONE
|
||||
|
@ -37,25 +39,13 @@ static const PrimitiveType primitive_from_gx[8] = {
|
|||
|
||||
VertexManager::VertexManager()
|
||||
{
|
||||
LocalVBuffer.resize(MAXVBUFFERSIZE);
|
||||
s_pCurBufferPointer = s_pBaseBufferPointer = &LocalVBuffer[0];
|
||||
s_pEndBufferPointer = s_pBaseBufferPointer + LocalVBuffer.size();
|
||||
|
||||
LocalIBuffer.resize(MAXIBUFFERSIZE);
|
||||
|
||||
ResetBuffer();
|
||||
IsFlushed = true;
|
||||
}
|
||||
|
||||
VertexManager::~VertexManager()
|
||||
{
|
||||
}
|
||||
|
||||
void VertexManager::ResetBuffer()
|
||||
{
|
||||
s_pCurBufferPointer = s_pBaseBufferPointer;
|
||||
IndexGenerator::Start(GetIndexBuffer());
|
||||
}
|
||||
|
||||
u32 VertexManager::GetRemainingSize()
|
||||
{
|
||||
return (u32)(s_pEndBufferPointer - s_pCurBufferPointer);
|
||||
|
@ -84,11 +74,13 @@ void VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 strid
|
|||
ERROR_LOG(VIDEO, "VertexManager: Buffer not large enough for all vertices! "
|
||||
"Increase MAXVBUFFERSIZE or we need primitive breaking after all.");
|
||||
}
|
||||
}
|
||||
|
||||
bool VertexManager::IsFlushed() const
|
||||
{
|
||||
return s_pBaseBufferPointer == s_pCurBufferPointer;
|
||||
// need to alloc new buffer
|
||||
if(IsFlushed)
|
||||
{
|
||||
g_vertex_manager->ResetBuffer(stride);
|
||||
IsFlushed = false;
|
||||
}
|
||||
}
|
||||
|
||||
u32 VertexManager::GetRemainingIndices(int primitive)
|
||||
|
@ -160,8 +152,7 @@ void VertexManager::AddVertices(int primitive, u32 numVertices)
|
|||
|
||||
void VertexManager::Flush()
|
||||
{
|
||||
if (g_vertex_manager->IsFlushed())
|
||||
return;
|
||||
if (IsFlushed) return;
|
||||
|
||||
// loading a state will invalidate BP, so check for it
|
||||
g_video_backend->CheckInvalidState();
|
||||
|
@ -238,24 +229,10 @@ void VertexManager::Flush()
|
|||
// TODO: need to merge more stuff into VideoCommon
|
||||
g_vertex_manager->vFlush();
|
||||
|
||||
g_vertex_manager->ResetBuffer();
|
||||
IsFlushed = true;
|
||||
}
|
||||
|
||||
void VertexManager::DoState(PointerWrap& p)
|
||||
{
|
||||
g_vertex_manager->vDoState(p);
|
||||
}
|
||||
|
||||
void VertexManager::DoStateShared(PointerWrap& p)
|
||||
{
|
||||
// It seems we half-assume to be flushed here
|
||||
// We update s_pCurBufferPointer yet don't worry about IndexGenerator's outdated pointers
|
||||
// and maybe other things are overlooked
|
||||
|
||||
p.Do(LocalVBuffer);
|
||||
p.Do(LocalIBuffer);
|
||||
|
||||
s_pBaseBufferPointer = &LocalVBuffer[0];
|
||||
s_pEndBufferPointer = s_pBaseBufferPointer + LocalVBuffer.size();
|
||||
p.DoPointer(s_pCurBufferPointer, s_pBaseBufferPointer);
|
||||
}
|
||||
|
|
|
@ -51,25 +51,18 @@ public:
|
|||
virtual void DestroyDeviceObjects(){};
|
||||
|
||||
protected:
|
||||
u16* GetIndexBuffer() { return &LocalIBuffer[0]; }
|
||||
u8* GetVertexBuffer() { return &s_pBaseBufferPointer[0]; }
|
||||
|
||||
virtual void vDoState(PointerWrap& p) { DoStateShared(p); }
|
||||
void DoStateShared(PointerWrap& p);
|
||||
virtual void vDoState(PointerWrap& p) { }
|
||||
|
||||
static PrimitiveType current_primitive_type;
|
||||
|
||||
private:
|
||||
bool IsFlushed() const;
|
||||
virtual void ResetBuffer(u32 stride) = 0;
|
||||
|
||||
void ResetBuffer();
|
||||
private:
|
||||
static bool IsFlushed;
|
||||
|
||||
//virtual void Draw(u32 stride, bool alphapass) = 0;
|
||||
// temp
|
||||
virtual void vFlush() = 0;
|
||||
|
||||
std::vector<u8> LocalVBuffer;
|
||||
std::vector<u16> LocalIBuffer;
|
||||
};
|
||||
|
||||
extern VertexManager *g_vertex_manager;
|
||||
|
|
Loading…
Reference in New Issue