mirror of https://github.com/PCSX2/pcsx2.git
gsdx-ogl: reduce pbo complexity
Copy the full line into the pbo. Dma will only take GL_UNPACK_ROW_LENGTH - increase memcpy size by 2 in the pbo + single memcpy will be faster and can use sse Enable buffer_storage extension: * GL_CLIENT_STORAGE_BIT was required (it is the duty of TexSubImage to copy data into the GPU mem) * Enable the extension by default
This commit is contained in:
parent
e62af05496
commit
47f40ed79a
|
@ -158,10 +158,10 @@ namespace GLLoader {
|
|||
bool found_GL_ARB_separate_shader_objects = false; // Issue with Mesa and Catalyst...
|
||||
bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default
|
||||
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
|
||||
bool found_GL_ARB_buffer_storage = false;
|
||||
// Note: except Apple, all drivers support explicit uniform location
|
||||
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture
|
||||
// GL4 hardware
|
||||
bool found_GL_ARB_buffer_storage = false;
|
||||
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
|
||||
bool found_GL_ARB_gpu_shader5 = false;
|
||||
bool found_GL_ARB_shader_image_load_store = false; // GLES3.1
|
||||
|
|
|
@ -33,51 +33,36 @@ namespace PboPool {
|
|||
|
||||
GLuint m_pool[PBO_POOL_SIZE];
|
||||
uint32 m_offset[PBO_POOL_SIZE];
|
||||
uint32 m_initial_offset[PBO_POOL_SIZE]; // work around silly driver
|
||||
char* m_map[PBO_POOL_SIZE];
|
||||
uint32 m_current_pbo = 0;
|
||||
uint32 m_size;
|
||||
const uint32 m_pbo_size = (640*480*16) << 2;
|
||||
bool m_buffer_storage = false;
|
||||
const uint32 m_pbo_size = 4*1024*1024;
|
||||
|
||||
#ifndef ENABLE_GLES
|
||||
// Option for buffer storage
|
||||
// Note there is a barrier (but maybe coherent is faster)
|
||||
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
|
||||
// XXX: actually does I really need coherent and barrier???
|
||||
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
|
||||
// the value visible to the server
|
||||
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT /*| GL_MAP_COHERENT_BIT*/;
|
||||
// FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write?
|
||||
const GLbitfield create_flags = map_flags | GL_DYNAMIC_STORAGE_BIT;
|
||||
const GLbitfield create_flags = map_flags /*| GL_DYNAMIC_STORAGE_BIT*/ | GL_CLIENT_STORAGE_BIT;
|
||||
#endif
|
||||
|
||||
// Normally driver must aligned the map....
|
||||
void* align_map(void* ptr) {
|
||||
void* aligned_map = (char*)(((uptr)ptr + 63) & ~0x3F);
|
||||
|
||||
m_initial_offset[m_current_pbo] = (uptr)aligned_map-(uptr)ptr;
|
||||
if (m_initial_offset[m_current_pbo])
|
||||
fprintf(stderr, "Buggy driver detected!!! Buffer alignment is not 64B as the spec request it\n");
|
||||
|
||||
return aligned_map;
|
||||
}
|
||||
|
||||
void Init() {
|
||||
gl_GenBuffers(countof(m_pool), m_pool);
|
||||
m_buffer_storage = (theApp.GetConfig("ogl_texture_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage;
|
||||
|
||||
for (size_t i = 0; i < countof(m_pool); i++) {
|
||||
BindPbo();
|
||||
|
||||
// Note the +64 gives additional room to realign the buffer (buggy driver....)
|
||||
if (m_buffer_storage) {
|
||||
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||
#ifndef ENABLE_GLES
|
||||
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, create_flags);
|
||||
m_map[m_current_pbo] = (char*)align_map(gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size+64, map_flags));
|
||||
// Workaround silly driver. (would be 0 otherwise)
|
||||
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
|
||||
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, create_flags);
|
||||
m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, map_flags);
|
||||
#endif
|
||||
} else {
|
||||
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, GL_STREAM_COPY);
|
||||
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_COPY);
|
||||
m_map[m_current_pbo] = NULL;
|
||||
m_offset[m_current_pbo] = 0;
|
||||
}
|
||||
|
||||
NextPbo();
|
||||
|
@ -89,19 +74,27 @@ namespace PboPool {
|
|||
char* map;
|
||||
m_size = size;
|
||||
|
||||
if (m_size >= m_pbo_size) {
|
||||
if (m_size > m_pbo_size) {
|
||||
fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
|
||||
}
|
||||
|
||||
if (!m_buffer_storage) {
|
||||
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
||||
NextPbo();
|
||||
}
|
||||
|
||||
// Note: texsubimage will access currently bound buffer
|
||||
// Pbo ready let's get a pointer
|
||||
BindPbo();
|
||||
|
||||
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
|
||||
|
||||
} else {
|
||||
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
|
||||
|
||||
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
||||
NextPbo();
|
||||
|
||||
// Mark current pbo free
|
||||
m_offset[m_current_pbo] = 0;
|
||||
|
||||
flags &= ~GL_MAP_INVALIDATE_RANGE_BIT;
|
||||
flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
|
||||
}
|
||||
|
@ -111,20 +104,6 @@ namespace PboPool {
|
|||
|
||||
// Be sure the map is aligned
|
||||
map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
|
||||
|
||||
} else {
|
||||
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
||||
NextPbo();
|
||||
|
||||
// Mark current pbo free. Will be 0 if driver aligned properly the buffer
|
||||
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
|
||||
}
|
||||
|
||||
// Note: texsubimage will access currently bound buffer
|
||||
// Pbo ready let's get a pointer
|
||||
BindPbo();
|
||||
|
||||
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
|
||||
}
|
||||
|
||||
return map;
|
||||
|
@ -132,19 +111,16 @@ namespace PboPool {
|
|||
|
||||
// Used to unmap the buffer when context was detached.
|
||||
void UnmapAll() {
|
||||
if (m_map[m_current_pbo] == NULL) return;
|
||||
|
||||
for (size_t i = 0; i < countof(m_pool); i++) {
|
||||
BindPbo();
|
||||
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
|
||||
m_map[m_current_pbo] = NULL;
|
||||
NextPbo();
|
||||
m_map[i] = NULL;
|
||||
m_offset[m_current_pbo] = 0;
|
||||
}
|
||||
UnbindPbo();
|
||||
}
|
||||
|
||||
void Unmap() {
|
||||
if (m_buffer_storage) {
|
||||
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
|
||||
// the value visible to the server
|
||||
//gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
|
||||
} else {
|
||||
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
|
||||
|
@ -156,7 +132,7 @@ namespace PboPool {
|
|||
}
|
||||
|
||||
void Destroy() {
|
||||
if (m_buffer_storage)
|
||||
if (GLLoader::found_GL_ARB_buffer_storage)
|
||||
UnmapAll();
|
||||
gl_DeleteBuffers(countof(m_pool), m_pool);
|
||||
}
|
||||
|
@ -167,6 +143,8 @@ namespace PboPool {
|
|||
|
||||
void NextPbo() {
|
||||
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
|
||||
// Mark new PBO as free
|
||||
m_offset[m_current_pbo] = 0;
|
||||
}
|
||||
|
||||
void UnbindPbo() {
|
||||
|
@ -338,27 +316,23 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
|
|||
#if 1
|
||||
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
|
||||
|
||||
uint32 line_size = r.width() << m_int_shift;
|
||||
char* src = (char*)data;
|
||||
char* map = PboPool::Map(r.height() * line_size);
|
||||
|
||||
for (uint32 h = r.height(); h > 0; h--) {
|
||||
if ((uptr)map & 0x3F) {
|
||||
memcpy(map, src, line_size);
|
||||
} else {
|
||||
GSVector4i::storent(map, src, line_size);
|
||||
}
|
||||
src += pitch;
|
||||
map += line_size;
|
||||
char* map = PboPool::Map(r.height() * pitch);
|
||||
|
||||
#ifdef ENABLE_OGL_DEBUG_MEM_BW
|
||||
g_texture_upload_byte += line_size;
|
||||
// Note: pitch is the line size that will be copied into the PBO
|
||||
// pitch >> m_int_shift is the line size that will be actually dma-ed into the GPU
|
||||
g_texture_upload_byte += pitch * r.height();
|
||||
#endif
|
||||
}
|
||||
|
||||
memcpy(map, src, pitch*r.height());
|
||||
|
||||
PboPool::Unmap();
|
||||
|
||||
glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift);
|
||||
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
|
||||
// Normally only affect TexSubImage call. (i.e. only the previous line)
|
||||
//glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
|
||||
|
||||
// FIXME OGL4: investigate, only 1 unpack buffer always bound
|
||||
PboPool::UnbindPbo();
|
||||
|
|
Loading…
Reference in New Issue