gsdx-ogl: reduce pbo complexity

Copy the full line (the whole pitch) into the PBO. The DMA transfer is then driven by GL_UNPACK_ROW_LENGTH.
- the amount of data memcpy'd into the PBO increases by a factor of 2
+ a single memcpy is faster and can use SSE

Enable the GL_ARB_buffer_storage extension:
* GL_CLIENT_STORAGE_BIT is required (TexSubImage is responsible for copying the data into GPU memory)
* The extension is now enabled by default
This commit is contained in:
Gregory Hainaut 2014-11-01 20:17:08 +01:00
parent e62af05496
commit 47f40ed79a
2 changed files with 41 additions and 67 deletions

View File

@ -158,10 +158,10 @@ namespace GLLoader {
bool found_GL_ARB_separate_shader_objects = false; // Issue with Mesa and Catalyst... bool found_GL_ARB_separate_shader_objects = false; // Issue with Mesa and Catalyst...
bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
bool found_GL_ARB_buffer_storage = false;
// Note: except Apple, all drivers support explicit uniform location // Note: except Apple, all drivers support explicit uniform location
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture
// GL4 hardware // GL4 hardware
bool found_GL_ARB_buffer_storage = false;
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
bool found_GL_ARB_gpu_shader5 = false; bool found_GL_ARB_gpu_shader5 = false;
bool found_GL_ARB_shader_image_load_store = false; // GLES3.1 bool found_GL_ARB_shader_image_load_store = false; // GLES3.1

View File

@ -33,51 +33,36 @@ namespace PboPool {
GLuint m_pool[PBO_POOL_SIZE]; GLuint m_pool[PBO_POOL_SIZE];
uint32 m_offset[PBO_POOL_SIZE]; uint32 m_offset[PBO_POOL_SIZE];
uint32 m_initial_offset[PBO_POOL_SIZE]; // work around silly driver
char* m_map[PBO_POOL_SIZE]; char* m_map[PBO_POOL_SIZE];
uint32 m_current_pbo = 0; uint32 m_current_pbo = 0;
uint32 m_size; uint32 m_size;
const uint32 m_pbo_size = (640*480*16) << 2; const uint32 m_pbo_size = 4*1024*1024;
bool m_buffer_storage = false;
#ifndef ENABLE_GLES #ifndef ENABLE_GLES
// Option for buffer storage // Option for buffer storage
// Note there is a barrier (but maybe coherent is faster) // Note there is a barrier (but maybe coherent is faster)
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; // XXX: actually does I really need coherent and barrier???
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
// the value visible to the server
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT /*| GL_MAP_COHERENT_BIT*/;
// FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write? // FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write?
const GLbitfield create_flags = map_flags | GL_DYNAMIC_STORAGE_BIT; const GLbitfield create_flags = map_flags /*| GL_DYNAMIC_STORAGE_BIT*/ | GL_CLIENT_STORAGE_BIT;
#endif #endif
// Normally driver must aligned the map....
void* align_map(void* ptr) {
void* aligned_map = (char*)(((uptr)ptr + 63) & ~0x3F);
m_initial_offset[m_current_pbo] = (uptr)aligned_map-(uptr)ptr;
if (m_initial_offset[m_current_pbo])
fprintf(stderr, "Buggy driver detected!!! Buffer alignment is not 64B as the spec request it\n");
return aligned_map;
}
void Init() { void Init() {
gl_GenBuffers(countof(m_pool), m_pool); gl_GenBuffers(countof(m_pool), m_pool);
m_buffer_storage = (theApp.GetConfig("ogl_texture_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage;
for (size_t i = 0; i < countof(m_pool); i++) { for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo(); BindPbo();
// Note the +64 gives additional room to realign the buffer (buggy driver....) if (GLLoader::found_GL_ARB_buffer_storage) {
if (m_buffer_storage) {
#ifndef ENABLE_GLES #ifndef ENABLE_GLES
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, create_flags); gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, create_flags);
m_map[m_current_pbo] = (char*)align_map(gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size+64, map_flags)); m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, map_flags);
// Workaround silly driver. (would be 0 otherwise)
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
#endif #endif
} else { } else {
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, GL_STREAM_COPY); gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_COPY);
m_map[m_current_pbo] = NULL; m_map[m_current_pbo] = NULL;
m_offset[m_current_pbo] = 0;
} }
NextPbo(); NextPbo();
@ -89,19 +74,27 @@ namespace PboPool {
char* map; char* map;
m_size = size; m_size = size;
if (m_size >= m_pbo_size) { if (m_size > m_pbo_size) {
fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size); fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
} }
if (!m_buffer_storage) { if (GLLoader::found_GL_ARB_buffer_storage) {
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
NextPbo();
}
// Note: texsubimage will access currently bound buffer
// Pbo ready let's get a pointer
BindPbo();
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
} else {
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT; GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) { if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
NextPbo(); NextPbo();
// Mark current pbo free
m_offset[m_current_pbo] = 0;
flags &= ~GL_MAP_INVALIDATE_RANGE_BIT; flags &= ~GL_MAP_INVALIDATE_RANGE_BIT;
flags |= GL_MAP_INVALIDATE_BUFFER_BIT; flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
} }
@ -111,20 +104,6 @@ namespace PboPool {
// Be sure the map is aligned // Be sure the map is aligned
map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags); map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
} else {
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
NextPbo();
// Mark current pbo free. Will be 0 if driver aligned properly the buffer
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
}
// Note: texsubimage will access currently bound buffer
// Pbo ready let's get a pointer
BindPbo();
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
} }
return map; return map;
@ -132,19 +111,16 @@ namespace PboPool {
// Used to unmap the buffer when context was detached. // Used to unmap the buffer when context was detached.
void UnmapAll() { void UnmapAll() {
if (m_map[m_current_pbo] == NULL) return;
for (size_t i = 0; i < countof(m_pool); i++) { for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo(); m_map[i] = NULL;
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER); m_offset[m_current_pbo] = 0;
m_map[m_current_pbo] = NULL;
NextPbo();
} }
UnbindPbo();
} }
void Unmap() { void Unmap() {
if (m_buffer_storage) { if (GLLoader::found_GL_ARB_buffer_storage) {
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
// the value visible to the server
//gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); //gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
} else { } else {
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER); gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
@ -156,7 +132,7 @@ namespace PboPool {
} }
void Destroy() { void Destroy() {
if (m_buffer_storage) if (GLLoader::found_GL_ARB_buffer_storage)
UnmapAll(); UnmapAll();
gl_DeleteBuffers(countof(m_pool), m_pool); gl_DeleteBuffers(countof(m_pool), m_pool);
} }
@ -167,6 +143,8 @@ namespace PboPool {
void NextPbo() { void NextPbo() {
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1); m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
// Mark new PBO as free
m_offset[m_current_pbo] = 0;
} }
void UnbindPbo() { void UnbindPbo() {
@ -338,27 +316,23 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
#if 1 #if 1
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
uint32 line_size = r.width() << m_int_shift;
char* src = (char*)data; char* src = (char*)data;
char* map = PboPool::Map(r.height() * line_size); char* map = PboPool::Map(r.height() * pitch);
for (uint32 h = r.height(); h > 0; h--) {
if ((uptr)map & 0x3F) {
memcpy(map, src, line_size);
} else {
GSVector4i::storent(map, src, line_size);
}
src += pitch;
map += line_size;
#ifdef ENABLE_OGL_DEBUG_MEM_BW #ifdef ENABLE_OGL_DEBUG_MEM_BW
g_texture_upload_byte += line_size; // Note: pitch is the line size that will be copied into the PBO
// pitch >> m_int_shift is the line size that will be actually dma-ed into the GPU
g_texture_upload_byte += pitch * r.height();
#endif #endif
}
memcpy(map, src, pitch*r.height());
PboPool::Unmap(); PboPool::Unmap();
glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift);
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset()); glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
// Normally only affect TexSubImage call. (i.e. only the previous line)
//glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
// FIXME OGL4: investigate, only 1 unpack buffer always bound // FIXME OGL4: investigate, only 1 unpack buffer always bound
PboPool::UnbindPbo(); PboPool::UnbindPbo();