gsdx-ogl: texture management

Improve arb_buffer_storage implementation
Try harder to align the texture buffer

Strangely arb_buffer_storage is 3 times slower on my PC (nvidia)

Tester are welcome! Open the ini file
"ogl_texture_storage = 1" <= enable the extension
"ogl_texture_storage = 0" <= disable the extension

Note: you need an opengl 4.4 driver or one that support arb_buffer_storage (i.e. not catalyst)
This commit is contained in:
Gregory Hainaut 2014-03-23 15:42:05 +01:00
parent 41091f8ebf
commit 403518e852
2 changed files with 54 additions and 35 deletions

View File

@ -29,52 +29,63 @@ namespace PboPool {
GLuint m_pool[PBO_POOL_SIZE];
uint32 m_offset[PBO_POOL_SIZE];
uint32 m_initial_offset[PBO_POOL_SIZE]; // work around silly driver
char* m_map[PBO_POOL_SIZE];
uint32 m_current_pbo = 0;
uint32 m_size;
const uint32 m_pbo_size = (640*480*16) << 2;
bool m_buffer_storage = false;
// Option for buffer storage
// Note there is a barrier (but maybe coherent is faster)
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
// FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write?
const GLbitfield create_flags = map_flags | GL_DYNAMIC_STORAGE_BIT;
// Normally driver must aligned the map....
void* align_map(void* ptr) {
void* aligned_map = (char*)(((uptr)ptr + 63) & ~0x3F);
m_initial_offset[m_current_pbo] = (uptr)aligned_map-(uptr)ptr;
if (m_initial_offset[m_current_pbo])
fprintf(stderr, "Buggy driver detected!!! Buffer alignment is not 64B as the spec request it\n");
return aligned_map;
}
void Init() {
gl_GenBuffers(countof(m_pool), m_pool);
m_buffer_storage = (theApp.GetConfig("ogl_texture_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage;
for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo();
if (GLLoader::found_GL_ARB_buffer_storage) {
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT | GL_CLIENT_STORAGE_BIT);
// Note the +64 gives additional room to realign the buffer (buggy driver....)
if (m_buffer_storage) {
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, create_flags);
m_map[m_current_pbo] = (char*)align_map(gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size+64, map_flags));
// Workaround silly driver. (would be 0 otherwise)
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
} else {
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_COPY);
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, GL_STREAM_COPY);
m_map[m_current_pbo] = NULL;
m_offset[m_current_pbo] = 0;
}
m_offset[m_current_pbo] = 0;
m_map[m_current_pbo] = NULL;
NextPbo();
}
UnbindPbo();
}
void MapAll() {
if (m_map[m_current_pbo] != NULL) return;
// FIXME I'm not sure it is allowed to map another buffer after we get a pointer
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT;
for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo();
m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, flags);
NextPbo();
}
UnbindPbo();
}
char* Map(uint32 size) {
char* map;
m_size = size;
if (m_size >= m_pbo_size) {
fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
}
if (!GLLoader::found_GL_ARB_buffer_storage) {
if (!m_buffer_storage) {
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
@ -90,26 +101,28 @@ namespace PboPool {
// Pbo ready let's get a pointer
BindPbo();
return (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
} else {
MapAll();
// Be sure the map is aligned
map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
} else {
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
NextPbo();
// Mark current pbo free
m_offset[m_current_pbo] = 0;
// Mark current pbo free. Will be 0 if driver aligned properly the buffer
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
}
// Note: it still need it because texsubimage will access currently bound buffer
// Note: texsubimage will access currently bound buffer
// Pbo ready let's get a pointer
BindPbo();
return m_map[m_current_pbo] + m_offset[m_current_pbo];
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
}
return map;
}
// FIXME: unmap buffer when the context is dettached (not sure it is required actually)
// Used to unmap the buffer when context was detached.
void UnmapAll() {
if (m_map[m_current_pbo] == NULL) return;
@ -123,8 +136,8 @@ namespace PboPool {
}
void Unmap() {
if (GLLoader::found_GL_ARB_buffer_storage) {
gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
if (m_buffer_storage) {
//gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
} else {
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
}
@ -135,6 +148,8 @@ namespace PboPool {
}
void Destroy() {
if (m_buffer_storage)
UnmapAll();
gl_DeleteBuffers(countof(m_pool), m_pool);
}
@ -152,7 +167,7 @@ namespace PboPool {
void EndTransfer() {
// Note: keep offset aligned for SSE/AVX
m_offset[m_current_pbo] = (m_offset[m_current_pbo] + m_size + 31) & ~0x1F;
m_offset[m_current_pbo] = (m_offset[m_current_pbo] + m_size + 63) & ~0x3F;
}
}
@ -319,11 +334,11 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
char* map = PboPool::Map(r.height() * line_size);
for (uint32 h = r.height(); h > 0; h--) {
// avoid a crash if map is not aligned
if ((uint32)map & 0x1F)
if ((uptr)map & 0x3F) {
memcpy(map, src, line_size);
else
} else {
GSVector4i::storent(map, src, line_size);
}
src += pitch;
map += line_size;
}
@ -333,7 +348,6 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
// FIXME OGL4: investigate, only 1 unpack buffer always bound
//if (!GLLoader::found_GL_ARB_buffer_storage)
PboPool::UnbindPbo();
PboPool::EndTransfer();

View File

@ -60,6 +60,11 @@ typedef unsigned int uint32;
typedef signed int int32;
typedef unsigned long long uint64;
typedef signed long long int64;
#ifdef __x86_64__
typedef uint64 uptr;
#else
typedef uint32 uptr;
#endif
// stdc