mirror of https://github.com/PCSX2/pcsx2.git
gsdx-ogl: reduce pbo complexity
Copy the full line into the PBO. DMA will only take GL_UNPACK_ROW_LENGTH.
- increases the memcpy size by 2 in the PBO
+ a single memcpy will be faster and can use SSE

Enable the buffer_storage extension:
* GL_CLIENT_STORAGE_BIT was required (it is the duty of TexSubImage to copy data into the GPU memory)
* Enable the extension by default
This commit is contained in:
parent
e62af05496
commit
47f40ed79a
|
@ -158,10 +158,10 @@ namespace GLLoader {
|
||||||
bool found_GL_ARB_separate_shader_objects = false; // Issue with Mesa and Catalyst...
|
bool found_GL_ARB_separate_shader_objects = false; // Issue with Mesa and Catalyst...
|
||||||
bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default
|
bool found_geometry_shader = true; // we require GL3.3 so geometry must be supported by default
|
||||||
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
|
bool found_GL_ARB_clear_texture = false; // Don't know if GL3 GPU can support it
|
||||||
bool found_GL_ARB_buffer_storage = false;
|
|
||||||
// Note: except Apple, all drivers support explicit uniform location
|
// Note: except Apple, all drivers support explicit uniform location
|
||||||
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture
|
bool found_GL_ARB_explicit_uniform_location = false; // need by subroutine and bindless texture
|
||||||
// GL4 hardware
|
// GL4 hardware
|
||||||
|
bool found_GL_ARB_buffer_storage = false;
|
||||||
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
|
bool found_GL_ARB_copy_image = false; // Not sure actually maybe GL3 GPU can do it
|
||||||
bool found_GL_ARB_gpu_shader5 = false;
|
bool found_GL_ARB_gpu_shader5 = false;
|
||||||
bool found_GL_ARB_shader_image_load_store = false; // GLES3.1
|
bool found_GL_ARB_shader_image_load_store = false; // GLES3.1
|
||||||
|
|
|
@ -33,51 +33,36 @@ namespace PboPool {
|
||||||
|
|
||||||
GLuint m_pool[PBO_POOL_SIZE];
|
GLuint m_pool[PBO_POOL_SIZE];
|
||||||
uint32 m_offset[PBO_POOL_SIZE];
|
uint32 m_offset[PBO_POOL_SIZE];
|
||||||
uint32 m_initial_offset[PBO_POOL_SIZE]; // work around silly driver
|
|
||||||
char* m_map[PBO_POOL_SIZE];
|
char* m_map[PBO_POOL_SIZE];
|
||||||
uint32 m_current_pbo = 0;
|
uint32 m_current_pbo = 0;
|
||||||
uint32 m_size;
|
uint32 m_size;
|
||||||
const uint32 m_pbo_size = (640*480*16) << 2;
|
const uint32 m_pbo_size = 4*1024*1024;
|
||||||
bool m_buffer_storage = false;
|
|
||||||
|
|
||||||
#ifndef ENABLE_GLES
|
#ifndef ENABLE_GLES
|
||||||
// Option for buffer storage
|
// Option for buffer storage
|
||||||
// Note there is a barrier (but maybe coherent is faster)
|
// Note there is a barrier (but maybe coherent is faster)
|
||||||
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
|
// XXX: actually does I really need coherent and barrier???
|
||||||
|
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
|
||||||
|
// the value visible to the server
|
||||||
|
const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT /*| GL_MAP_COHERENT_BIT*/;
|
||||||
// FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write?
|
// FIXME do I need GL_DYNAMIC_STORAGE_BIT to allow write?
|
||||||
const GLbitfield create_flags = map_flags | GL_DYNAMIC_STORAGE_BIT;
|
const GLbitfield create_flags = map_flags /*| GL_DYNAMIC_STORAGE_BIT*/ | GL_CLIENT_STORAGE_BIT;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Normally driver must aligned the map....
|
|
||||||
void* align_map(void* ptr) {
|
|
||||||
void* aligned_map = (char*)(((uptr)ptr + 63) & ~0x3F);
|
|
||||||
|
|
||||||
m_initial_offset[m_current_pbo] = (uptr)aligned_map-(uptr)ptr;
|
|
||||||
if (m_initial_offset[m_current_pbo])
|
|
||||||
fprintf(stderr, "Buggy driver detected!!! Buffer alignment is not 64B as the spec request it\n");
|
|
||||||
|
|
||||||
return aligned_map;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Init() {
|
void Init() {
|
||||||
gl_GenBuffers(countof(m_pool), m_pool);
|
gl_GenBuffers(countof(m_pool), m_pool);
|
||||||
m_buffer_storage = (theApp.GetConfig("ogl_texture_storage", 0) == 1) && GLLoader::found_GL_ARB_buffer_storage;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < countof(m_pool); i++) {
|
for (size_t i = 0; i < countof(m_pool); i++) {
|
||||||
BindPbo();
|
BindPbo();
|
||||||
|
|
||||||
// Note the +64 gives additional room to realign the buffer (buggy driver....)
|
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||||
if (m_buffer_storage) {
|
|
||||||
#ifndef ENABLE_GLES
|
#ifndef ENABLE_GLES
|
||||||
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, create_flags);
|
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, create_flags);
|
||||||
m_map[m_current_pbo] = (char*)align_map(gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size+64, map_flags));
|
m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, map_flags);
|
||||||
// Workaround silly driver. (would be 0 otherwise)
|
|
||||||
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
|
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size+64, NULL, GL_STREAM_COPY);
|
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_COPY);
|
||||||
m_map[m_current_pbo] = NULL;
|
m_map[m_current_pbo] = NULL;
|
||||||
m_offset[m_current_pbo] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NextPbo();
|
NextPbo();
|
||||||
|
@ -89,19 +74,27 @@ namespace PboPool {
|
||||||
char* map;
|
char* map;
|
||||||
m_size = size;
|
m_size = size;
|
||||||
|
|
||||||
if (m_size >= m_pbo_size) {
|
if (m_size > m_pbo_size) {
|
||||||
fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
|
fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!m_buffer_storage) {
|
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||||
|
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
||||||
|
NextPbo();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: texsubimage will access currently bound buffer
|
||||||
|
// Pbo ready let's get a pointer
|
||||||
|
BindPbo();
|
||||||
|
|
||||||
|
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
|
||||||
|
|
||||||
|
} else {
|
||||||
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
|
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT;
|
||||||
|
|
||||||
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
||||||
NextPbo();
|
NextPbo();
|
||||||
|
|
||||||
// Mark current pbo free
|
|
||||||
m_offset[m_current_pbo] = 0;
|
|
||||||
|
|
||||||
flags &= ~GL_MAP_INVALIDATE_RANGE_BIT;
|
flags &= ~GL_MAP_INVALIDATE_RANGE_BIT;
|
||||||
flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
|
flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
|
||||||
}
|
}
|
||||||
|
@ -111,20 +104,6 @@ namespace PboPool {
|
||||||
|
|
||||||
// Be sure the map is aligned
|
// Be sure the map is aligned
|
||||||
map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
|
map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags);
|
||||||
|
|
||||||
} else {
|
|
||||||
if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
|
|
||||||
NextPbo();
|
|
||||||
|
|
||||||
// Mark current pbo free. Will be 0 if driver aligned properly the buffer
|
|
||||||
m_offset[m_current_pbo] = m_initial_offset[m_current_pbo];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: texsubimage will access currently bound buffer
|
|
||||||
// Pbo ready let's get a pointer
|
|
||||||
BindPbo();
|
|
||||||
|
|
||||||
map = m_map[m_current_pbo] + m_offset[m_current_pbo];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return map;
|
return map;
|
||||||
|
@ -132,19 +111,16 @@ namespace PboPool {
|
||||||
|
|
||||||
// Used to unmap the buffer when context was detached.
|
// Used to unmap the buffer when context was detached.
|
||||||
void UnmapAll() {
|
void UnmapAll() {
|
||||||
if (m_map[m_current_pbo] == NULL) return;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < countof(m_pool); i++) {
|
for (size_t i = 0; i < countof(m_pool); i++) {
|
||||||
BindPbo();
|
m_map[i] = NULL;
|
||||||
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
|
m_offset[m_current_pbo] = 0;
|
||||||
m_map[m_current_pbo] = NULL;
|
|
||||||
NextPbo();
|
|
||||||
}
|
}
|
||||||
UnbindPbo();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Unmap() {
|
void Unmap() {
|
||||||
if (m_buffer_storage) {
|
if (GLLoader::found_GL_ARB_buffer_storage) {
|
||||||
|
// As far as I understand glTexSubImage2D is a client-server transfer so no need to make
|
||||||
|
// the value visible to the server
|
||||||
//gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
|
//gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
|
||||||
} else {
|
} else {
|
||||||
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
|
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
|
||||||
|
@ -156,7 +132,7 @@ namespace PboPool {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Destroy() {
|
void Destroy() {
|
||||||
if (m_buffer_storage)
|
if (GLLoader::found_GL_ARB_buffer_storage)
|
||||||
UnmapAll();
|
UnmapAll();
|
||||||
gl_DeleteBuffers(countof(m_pool), m_pool);
|
gl_DeleteBuffers(countof(m_pool), m_pool);
|
||||||
}
|
}
|
||||||
|
@ -167,6 +143,8 @@ namespace PboPool {
|
||||||
|
|
||||||
void NextPbo() {
|
void NextPbo() {
|
||||||
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
|
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
|
||||||
|
// Mark new PBO as free
|
||||||
|
m_offset[m_current_pbo] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void UnbindPbo() {
|
void UnbindPbo() {
|
||||||
|
@ -338,27 +316,23 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
|
||||||
#if 1
|
#if 1
|
||||||
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
|
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
|
||||||
|
|
||||||
uint32 line_size = r.width() << m_int_shift;
|
|
||||||
char* src = (char*)data;
|
char* src = (char*)data;
|
||||||
char* map = PboPool::Map(r.height() * line_size);
|
char* map = PboPool::Map(r.height() * pitch);
|
||||||
|
|
||||||
for (uint32 h = r.height(); h > 0; h--) {
|
|
||||||
if ((uptr)map & 0x3F) {
|
|
||||||
memcpy(map, src, line_size);
|
|
||||||
} else {
|
|
||||||
GSVector4i::storent(map, src, line_size);
|
|
||||||
}
|
|
||||||
src += pitch;
|
|
||||||
map += line_size;
|
|
||||||
|
|
||||||
#ifdef ENABLE_OGL_DEBUG_MEM_BW
|
#ifdef ENABLE_OGL_DEBUG_MEM_BW
|
||||||
g_texture_upload_byte += line_size;
|
// Note: pitch is the line size that will be copied into the PBO
|
||||||
|
// pitch >> m_int_shift is the line size that will be actually dma-ed into the GPU
|
||||||
|
g_texture_upload_byte += pitch * r.height();
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
|
memcpy(map, src, pitch*r.height());
|
||||||
|
|
||||||
PboPool::Unmap();
|
PboPool::Unmap();
|
||||||
|
|
||||||
|
glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch >> m_int_shift);
|
||||||
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
|
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
|
||||||
|
// Normally only affect TexSubImage call. (i.e. only the previous line)
|
||||||
|
//glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
|
||||||
|
|
||||||
// FIXME OGL4: investigate, only 1 unpack buffer always bound
|
// FIXME OGL4: investigate, only 1 unpack buffer always bound
|
||||||
PboPool::UnbindPbo();
|
PboPool::UnbindPbo();
|
||||||
|
|
Loading…
Reference in New Issue