From 690432de30818ab26ab0348cb9e1ddacc77216e4 Mon Sep 17 00:00:00 2001 From: "gregory.hainaut" Date: Sat, 17 Aug 2013 09:05:41 +0000 Subject: [PATCH] gsdx ogl: * redo most of the texture upload (PBO): colin3 benchmark: 32 fps now (vs 26 fps 2 weeks ago) * use the cross vendor vsync extension on linux (previous wasn't supported by nvidia) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5721 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GLLoader.cpp | 4 + plugins/GSdx/GLLoader.h | 4 + plugins/GSdx/GSTextureOGL.cpp | 155 ++++++++++++++++++++++++++++---- plugins/GSdx/GSTextureOGL.h | 14 ++- plugins/GSdx/GSVertexArrayOGL.h | 10 ++- plugins/GSdx/GSWnd.cpp | 4 + plugins/GSdx/GSWndOGL.cpp | 7 +- plugins/GSdx/GSWndOGL.h | 2 +- 8 files changed, 172 insertions(+), 28 deletions(-) diff --git a/plugins/GSdx/GLLoader.cpp b/plugins/GSdx/GLLoader.cpp index 92e84f3de4..2a8f760f89 100644 --- a/plugins/GSdx/GLLoader.cpp +++ b/plugins/GSdx/GLLoader.cpp @@ -81,6 +81,10 @@ PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages = NULL; PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL; PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL; PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL; +PFNGLFENCESYNCPROC gl_FenceSync = NULL; +PFNGLDELETESYNCPROC gl_DeleteSync = NULL; +PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync = NULL; +PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange = NULL; // GL4.0 PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL; // GL4.1 diff --git a/plugins/GSdx/GLLoader.h b/plugins/GSdx/GLLoader.h index f34c78058c..7928937c81 100644 --- a/plugins/GSdx/GLLoader.h +++ b/plugins/GSdx/GLLoader.h @@ -145,6 +145,10 @@ extern PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages; extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer; extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer; extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData; +extern PFNGLFENCESYNCPROC gl_FenceSync; +extern PFNGLDELETESYNCPROC gl_DeleteSync; +extern 
PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync; +extern PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange; // GL4.0 extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv; // GL4.1 diff --git a/plugins/GSdx/GSTextureOGL.cpp b/plugins/GSdx/GSTextureOGL.cpp index 149ee88e15..baa2ed1b9e 100644 --- a/plugins/GSdx/GSTextureOGL.cpp +++ b/plugins/GSdx/GSTextureOGL.cpp @@ -24,35 +24,144 @@ #include "GSTextureOGL.h" #include "GLState.h" +// Flush need bind/unbind +// Barrier might sync much more +#define BARRIER_INSTEAD_FLUSH + namespace PboPool { - GLuint pool[8]; - uint32 current_pbo = 0; + GLuint m_pool[PBO_POOL_SIZE]; + uint32 m_offset[PBO_POOL_SIZE]; + char* m_map[PBO_POOL_SIZE]; + uint32 m_current_pbo = 0; + uint32 m_size; + const uint32 m_pbo_size = (640*480*16) << 2; void Init() { - gl_GenBuffers(countof(pool), pool); + gl_GenBuffers(countof(m_pool), m_pool); - GLuint size = (640*480*16) << 2; - - for (size_t i = 0; i < countof(pool); i++) { + for (size_t i = 0; i < countof(m_pool); i++) { BindPbo(); - gl_BufferData(GL_PIXEL_UNPACK_BUFFER, size, NULL, GL_STREAM_DRAW); + + if (GLLoader::found_GL_ARB_buffer_storage) { + gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT | GL_CLIENT_STORAGE_BIT); + } else { + gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_DRAW); + m_offset[m_current_pbo] = 0; + m_map[m_current_pbo] = NULL; + } + + NextPbo(); } UnbindPbo(); } + void MapAll() { + if (m_map[m_current_pbo] != NULL) return; + + // FIXME I'm not sure it is allowed to map another buffer after we get a pointer +#ifdef BARRIER_INSTEAD_FLUSH + GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT; +#else + GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT; +#endif + for (size_t i = 0; i < countof(m_pool); i++) { + 
BindPbo(); + m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, flags); + NextPbo(); + } + UnbindPbo(); + } + + char* Map(uint32 size) { + m_size = size; + + if (m_size >= m_pbo_size) { + fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size); + } + + if (!GLLoader::found_GL_ARB_buffer_storage) { + GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT; + + if (m_offset[m_current_pbo] + m_size >= m_pbo_size) { + NextPbo(); + + // Mark current pbo free + m_offset[m_current_pbo] = 0; + + flags &= ~GL_MAP_INVALIDATE_RANGE_BIT; + flags |= GL_MAP_INVALIDATE_BUFFER_BIT; + } + + // Pbo ready let's get a pointer + BindPbo(); + + return (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, flags); + } else { + MapAll(); + + if (m_offset[m_current_pbo] + m_size >= m_pbo_size) { + NextPbo(); + + // Mark current pbo free + m_offset[m_current_pbo] = 0; + } + + return m_map[m_current_pbo] + m_offset[m_current_pbo]; + } + } + + void UnmapAll() { + if (m_map[m_current_pbo] == NULL) return; + + for (size_t i = 0; i < countof(m_pool); i++) { + BindPbo(); + gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + m_map[m_current_pbo] = NULL; /* assignment, not comparison: clear the stale pointer so MapAll() maps the pool again */ + NextPbo(); + } + UnbindPbo(); + } + + void Unmap() { + if (GLLoader::found_GL_ARB_buffer_storage) { + // GL4.4 do a glMemoryBarrier? or glFlushMappedBufferRange? 
+#ifdef BARRIER_INSTEAD_FLUSH + gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); +#else + BindPbo(); + gl_FlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size); + UnbindPbo(); +#endif + } else { + gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + } + } + + uint32 Offset() { + return m_offset[m_current_pbo]; + } + void Destroy() { - gl_DeleteBuffers(countof(pool), pool); + gl_DeleteBuffers(countof(m_pool), m_pool); } void BindPbo() { - gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, pool[current_pbo]); - current_pbo = (current_pbo + 1) & (countof(pool)-1); + gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, m_pool[m_current_pbo]); + } + + void NextPbo() { + m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1); } void UnbindPbo() { gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); } + + void EndTransfer() { + // Note: keep offset aligned for SSE/AVX + m_offset[m_current_pbo] += (m_size + 64) & ~0x3F; + } } // FIXME: check if it possible to always use those setup by default @@ -210,27 +319,35 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch) EnableUnit(); - PboPool::BindPbo(); - + // Note: FGLRX crashes with the default path. It is happy with PBO. 
However not sure PBO are big enough for + // big upscale + // Note: with latest improvement, Pbo could be faster +#if 1 glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); - char* map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, (pitch * r.height()) << m_int_shift, GL_MAP_WRITE_BIT); - char* src = (char*)data; uint32 line_size = r.width() << m_int_shift; + char* src = (char*)data; + char* map = PboPool::Map(r.height() * line_size); + for (uint32 h = r.height(); h > 0; h--) { - memcpy(map, src, line_size); + GSVector4i::storent(map, src, line_size); + //memcpy(map, src, line_size); src += pitch; map += line_size; } - gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)0); + PboPool::Unmap(); - PboPool::UnbindPbo(); + glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset()); + + if (!GLLoader::found_GL_ARB_buffer_storage) + PboPool::UnbindPbo(); + + PboPool::EndTransfer(); return true; -#if 0 +#else // pitch is in byte wherease GL_UNPACK_ROW_LENGTH is in pixel glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); diff --git a/plugins/GSdx/GSTextureOGL.h b/plugins/GSdx/GSTextureOGL.h index 6b656286c0..37b1deb450 100644 --- a/plugins/GSdx/GSTextureOGL.h +++ b/plugins/GSdx/GSTextureOGL.h @@ -23,12 +23,20 @@ #include "GSTexture.h" -namespace PboPool { - extern GLuint pool[8]; - extern uint32 current_pbo; +// FIXME find the optimal number of PBO +#define PBO_POOL_SIZE 4 +namespace PboPool { void BindPbo(); void UnbindPbo(); + void NextPbo(); + + char* Map(uint32 size); + void MapAll(); + void Unmap(); + void UnmapAll(); + uint32 Offset(); + void EndTransfer(); void Init(); void Destroy(); diff --git a/plugins/GSdx/GSVertexArrayOGL.h b/plugins/GSdx/GSVertexArrayOGL.h index 6356b30f47..e823d2f543 100644 --- a/plugins/GSdx/GSVertexArrayOGL.h +++ b/plugins/GSdx/GSVertexArrayOGL.h @@ -50,7 +50,9 @@ class 
GSBufferOGL { { gl_GenBuffers(1, &m_buffer); // Opengl works best with 1-4MB buffer. + // Warning m_limit is the number of objects (not the size in bytes) m_limit = 2 * 1024 * 1024 / m_stride; + //m_limit = 512 * 1024 * m_stride; } ~GSBufferOGL() { gl_DeleteBuffers(1, &m_buffer); } @@ -75,9 +77,12 @@ class GSBufferOGL { // Current GPU buffer is really too small need to allocate a new one if (m_count > m_limit) { + //fprintf(stderr, "Allocate a new buffer\n %d", m_stride); allocate(std::max(m_count * 3 / 2, m_limit)); } else if (m_count > (m_limit - m_start) ) { + //fprintf(stderr, "Orphan the buffer %d\n", m_stride); + // Not enough left free room. Just go back at the beginning m_start = 0; // Orphan the buffer to avoid synchronization @@ -91,8 +96,9 @@ class GSBufferOGL { { void* dst; if (Map(&dst, count)) { - // FIXME which one to use - // GSVector4i::storent(dst, src, m_count * m_stride); + // FIXME which one to use. Note dst doesn't have any alignment guarantee + // because it depends on the offset + //GSVector4i::storent(dst, src, m_count * m_stride); memcpy(dst, src, m_stride*m_count); Unmap(); } diff --git a/plugins/GSdx/GSWnd.cpp b/plugins/GSdx/GSWnd.cpp index e94c8536f1..fa98a1626f 100644 --- a/plugins/GSdx/GSWnd.cpp +++ b/plugins/GSdx/GSWnd.cpp @@ -82,6 +82,10 @@ void GSWndGL::PopulateGlFunction() *(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer"); *(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer"); *(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData"); + *(void**)&(gl_FenceSync) = GetProcAddress("glFenceSync"); + *(void**)&(gl_DeleteSync) = GetProcAddress("glDeleteSync"); + *(void**)&(gl_ClientWaitSync) = GetProcAddress("glClientWaitSync"); + *(void**)&(gl_FlushMappedBufferRange) = GetProcAddress("glFlushMappedBufferRange"); // GL4.0 *(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv"); // GL4.1 diff --git a/plugins/GSdx/GSWndOGL.cpp b/plugins/GSdx/GSWndOGL.cpp 
index 99297e3922..3a21bd83e4 100644 --- a/plugins/GSdx/GSWndOGL.cpp +++ b/plugins/GSdx/GSWndOGL.cpp @@ -144,8 +144,7 @@ bool GSWndOGL::Attach(void* handle, bool managed) CheckContext(); - m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalMESA"); - //PFNGLXSWAPINTERVALMESAPROC m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapInterval"); + m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT"); PopulateGlFunction(); @@ -192,6 +191,8 @@ bool GSWndOGL::Create(const string& title, int w, int h) CheckContext(); + m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT"); + PopulateGlFunction(); return true; @@ -258,7 +259,7 @@ void GSWndOGL::SetVSync(bool enable) // m_swapinterval uses an integer as parameter // 0 -> disable vsync // n -> wait n frame - if (m_swapinterval) m_swapinterval((int)enable); + if (m_swapinterval) m_swapinterval(m_NativeDisplay, m_NativeWindow, (int)enable); } void GSWndOGL::Flip() diff --git a/plugins/GSdx/GSWndOGL.h b/plugins/GSdx/GSWndOGL.h index 024b17cd0e..d25a007408 100644 --- a/plugins/GSdx/GSWndOGL.h +++ b/plugins/GSdx/GSWndOGL.h @@ -31,7 +31,7 @@ class GSWndOGL : public GSWndGL Display* m_NativeDisplay; GLXContext m_context; - PFNGLXSWAPINTERVALMESAPROC m_swapinterval; + PFNGLXSWAPINTERVALEXTPROC m_swapinterval; void CreateContext(int major, int minor); void CheckContext();