gsdx-ogl: improve speed of vertex streaming

Not yet enabled by default because I'm afraid of data corruption, but feel free to test it.

To enable it, set the option:
ogl_vertex_storage = 1

Performance note (warm cache + GS replay on colin3):
60 fps -> 76 fps
This commit is contained in:
Gregory Hainaut 2015-04-20 09:25:58 +02:00
parent 62489f42f1
commit ce98276322
2 changed files with 40 additions and 67 deletions

View File

@ -415,6 +415,9 @@ void GSDeviceOGL::BeforeDraw()
#ifdef _DEBUG #ifdef _DEBUG
ASSERT(gl_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE); ASSERT(gl_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);
#endif #endif
// Ensure VBOs are uploaded
if (GLLoader::found_GL_ARB_buffer_storage)
Barrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
//#ifdef ENABLE_OGL_STENCIL_DEBUG //#ifdef ENABLE_OGL_STENCIL_DEBUG
// if (m_date.t) // if (m_date.t)

View File

@ -42,7 +42,6 @@ class GSBufferOGL {
size_t m_limit; size_t m_limit;
const GLenum m_target; const GLenum m_target;
GLuint m_buffer_name; GLuint m_buffer_name;
const bool m_sub_data_config;
uint8* m_buffer_ptr; uint8* m_buffer_ptr;
const bool m_buffer_storage; const bool m_buffer_storage;
@ -53,7 +52,6 @@ class GSBufferOGL {
, m_count(0) , m_count(0)
, m_limit(0) , m_limit(0)
, m_target(target) , m_target(target)
, m_sub_data_config(theApp.GetConfig("ogl_vertex_subdata", 1) != 0)
, m_buffer_storage((theApp.GetConfig("ogl_vertex_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage) , m_buffer_storage((theApp.GetConfig("ogl_vertex_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage)
{ {
gl_GenBuffers(1, &m_buffer_name); gl_GenBuffers(1, &m_buffer_name);
@ -65,8 +63,14 @@ class GSBufferOGL {
#ifndef ENABLE_GLES #ifndef ENABLE_GLES
bind(); bind();
// FIXME do I need the dynamic // FIXME do I need the dynamic
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; const GLbitfield map_flags = GL_MAP_WRITE_BIT
const GLbitfield create_flags = map_flags | GL_DYNAMIC_STORAGE_BIT; | GL_MAP_PERSISTENT_BIT
// | GL_MAP_COHERENT_BIT (see barrier in GSDeviceOGL::BeforeDraw)
| GL_MAP_INVALIDATE_RANGE_BIT
;
const GLbitfield create_flags = map_flags
// | GL_CLIENT_STORAGE_BIT
;
gl_BufferStorage(m_target, m_stride*m_limit, NULL, create_flags ); gl_BufferStorage(m_target, m_stride*m_limit, NULL, create_flags );
m_buffer_ptr = (uint8*) gl_MapBufferRange(m_target, 0, m_stride*m_limit, map_flags); m_buffer_ptr = (uint8*) gl_MapBufferRange(m_target, 0, m_stride*m_limit, map_flags);
@ -124,19 +128,35 @@ class GSBufferOGL {
void map_upload(const void* src, uint32 count) void map_upload(const void* src, uint32 count)
{ {
void* dst; void* dst;
if (Map(&dst, count)) {
#if 0 m_count = count;
// FIXME which one to use. Note dst doesn't have any aligment guarantee
// because it depends of the offset // Get the pointer of the buffer
if (m_target == GL_ARRAY_BUFFER) { {
GSVector4i::storent(dst, src, m_count * m_stride); // It would need some protection of the data. For the moment finger cross!
} else { if (m_count > m_limit) {
memcpy(dst, src, m_stride*m_count); fprintf(stderr, "Buffer (%x) too small! Please report it upstream\n", m_target);
ASSERT(0);
} else if (m_count > (m_limit - m_start) ) {
//fprintf(stderr, "Wrap buffer (%x)\n", m_target);
// Wrap at startup
m_start = 0;
} }
#endif
memcpy(dst, src, m_stride*m_count); dst = m_buffer_ptr + m_start*m_stride;
Unmap();
} }
#if 0
// FIXME which one to use. Note dst doesn't have any aligment guarantee
// because it depends of the offset
if (m_target == GL_ARRAY_BUFFER) {
GSVector4i::storent(dst, src, m_count * m_stride);
} else {
memcpy(dst, src, m_stride*m_count);
}
#else
memcpy(dst, src, m_stride*m_count);
#endif
} }
#ifdef ENABLE_GLES #ifdef ENABLE_GLES
@ -157,61 +177,11 @@ class GSBufferOGL {
#ifdef ENABLE_OGL_DEBUG_MEM_BW #ifdef ENABLE_OGL_DEBUG_MEM_BW
g_vertex_upload_byte += count*m_stride; g_vertex_upload_byte += count*m_stride;
#endif #endif
if (m_sub_data_config && !m_buffer_storage) {
subdata_upload(src, count);
} else {
map_upload(src, count);
}
}
bool Map(void** pointer, uint32 count ) {
m_count = count;
if (m_buffer_storage) { if (m_buffer_storage) {
// It would need some protection of the data. For the moment finger cross! map_upload(src, count);
if (m_count > m_limit) {
fprintf(stderr, "Buffer (%x) too small! Please report it upstream\n", m_target);
ASSERT(0);
} else if (m_count > (m_limit - m_start) ) {
//fprintf(stderr, "Wrap buffer (%x)\n", m_target);
// Wrap at startup
m_start = 0;
}
*pointer = m_buffer_ptr + m_start*m_stride;
} else { } else {
// Note: For an explanation of the map flag subdata_upload(src, count);
// see http://www.opengl.org/wiki/Buffer_Object_Streaming
uint32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
// Current GPU buffer is really too small need to allocate a new one
if (m_count > m_limit) {
allocate(std::max<int>(m_count * 3 / 2, m_limit));
} else if (m_count > (m_limit - m_start) ) {
// Not enough left free room. Just go back at the beginning
m_start = 0;
// Tell the driver that it can orphan previous buffer and restart from a scratch buffer.
// Technically the buffer will not be accessible by the application anymore but the
// GL will effectively remove it when draws call are finised.
map_flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
} else {
// Tell the driver that it doesn't need to contain any valid buffer data, and that you promise to write the entire range you map
map_flags |= GL_MAP_INVALIDATE_RANGE_BIT;
}
// Upload the data to the buffer
*pointer = (uint8*) gl_MapBufferRange(m_target, m_stride*m_start, m_stride*m_count, map_flags);
} }
return true;
}
void Unmap() {
if (!m_buffer_storage) gl_UnmapBuffer(m_target);
} }
void EndScene() void EndScene()