gsdx ogl:

* redo most of the texture upload (PBO): colin3 benchmark: 32 fps now (vs 26 fps 2 weeks ago)
* use the cross vendor vsync extension on linux (previous wasn't supported by nvidia)


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5721 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2013-08-17 09:05:41 +00:00
parent 07605941ef
commit 690432de30
8 changed files with 172 additions and 28 deletions

View File

@ -81,6 +81,10 @@ PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages = NULL;
PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL; PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL;
PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL; PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL;
PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL; PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL;
PFNGLFENCESYNCPROC gl_FenceSync = NULL;
PFNGLDELETESYNCPROC gl_DeleteSync = NULL;
PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync = NULL;
PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange = NULL;
// GL4.0 // GL4.0
PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL; PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL;
// GL4.1 // GL4.1

View File

@ -145,6 +145,10 @@ extern PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages;
extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer; extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer;
extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer; extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer;
extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData; extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData;
extern PFNGLFENCESYNCPROC gl_FenceSync;
extern PFNGLDELETESYNCPROC gl_DeleteSync;
extern PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync;
extern PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange;
// GL4.0 // GL4.0
extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv; extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv;
// GL4.1 // GL4.1

View File

@ -24,35 +24,144 @@
#include "GSTextureOGL.h" #include "GSTextureOGL.h"
#include "GLState.h" #include "GLState.h"
// An explicit flush needs the PBO to be bound/unbound around the call,
// whereas a memory barrier needs no bind but might synchronize much more.
#define BARRIER_INSTEAD_FLUSH
namespace PboPool { namespace PboPool {
GLuint pool[8]; GLuint m_pool[PBO_POOL_SIZE];
uint32 current_pbo = 0; uint32 m_offset[PBO_POOL_SIZE];
char* m_map[PBO_POOL_SIZE];
uint32 m_current_pbo = 0;
uint32 m_size;
const uint32 m_pbo_size = (640*480*16) << 2;
void Init() { void Init() {
gl_GenBuffers(countof(pool), pool); gl_GenBuffers(countof(m_pool), m_pool);
GLuint size = (640*480*16) << 2; for (size_t i = 0; i < countof(m_pool); i++) {
for (size_t i = 0; i < countof(pool); i++) {
BindPbo(); BindPbo();
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, size, NULL, GL_STREAM_DRAW);
if (GLLoader::found_GL_ARB_buffer_storage) {
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT | GL_CLIENT_STORAGE_BIT);
} else {
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_DRAW);
m_offset[m_current_pbo] = 0;
m_map[m_current_pbo] = NULL;
}
NextPbo();
} }
UnbindPbo(); UnbindPbo();
} }
void MapAll() {
if (m_map[m_current_pbo] != NULL) return;
// FIXME I'm not sure it is allowed to map another buffer after we get a pointer
#ifdef BARRIER_INSTEAD_FLUSH
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT;
#else
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT;
#endif
for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo();
m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, flags);
NextPbo();
}
UnbindPbo();
}
// Reserve `size` bytes in the PBO pool and return a CPU pointer to write them.
//
// The requested size is stored in m_size. When the current PBO cannot hold the
// request, the pool moves to the next PBO and restarts at offset 0.
//
// With GL_ARB_buffer_storage the pointer comes from the mappings recorded by
// MapAll(); otherwise the current PBO is bound and the range is mapped here
// (the PBO is left bound on return).
char* Map(uint32 size) {
	m_size = size;

	// Only reported: the request is still processed afterwards
	if (m_size >= m_pbo_size) {
		fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
	}

	if (GLLoader::found_GL_ARB_buffer_storage) {
		MapAll();

		const bool pbo_full = (m_offset[m_current_pbo] + m_size >= m_pbo_size);
		if (pbo_full) {
			NextPbo();
			// Mark current pbo free
			m_offset[m_current_pbo] = 0;
		}

		char* base = m_map[m_current_pbo];
		return base + m_offset[m_current_pbo];
	}

	GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
	const bool pbo_full = (m_offset[m_current_pbo] + m_size >= m_pbo_size);
	if (pbo_full) {
		NextPbo();
		// Mark current pbo free
		m_offset[m_current_pbo] = 0;
		// Starting over in a new PBO: its whole previous content can be dropped
		map_flags |= GL_MAP_INVALIDATE_BUFFER_BIT;
	} else {
		map_flags |= GL_MAP_INVALIDATE_RANGE_BIT;
	}

	// Pbo ready let's get a pointer
	BindPbo();
	void* ptr = gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, map_flags);
	return (char*)ptr;
}
// Unmap every PBO of the pool and forget the pointers recorded in m_map.
// Does nothing when the current PBO has no pointer recorded.
void UnmapAll() {
	if (m_map[m_current_pbo] == NULL) return;

	for (size_t i = 0; i < countof(m_pool); i++) {
		BindPbo();
		gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
		// Clear the stale pointer (this used to be a no-op `==` comparison), so
		// that MapAll() maps the buffers again instead of returning early and
		// Map() never hands out a pointer to an unmapped buffer.
		m_map[m_current_pbo] = NULL;
		NextPbo();
	}

	UnbindPbo();
}
// Finish the CPU write started by Map().
// Without GL_ARB_buffer_storage the buffer bound to GL_PIXEL_UNPACK_BUFFER is
// unmapped. With it, the mapping is kept and the write is published through a
// memory barrier or an explicit flush of the written range, depending on
// BARRIER_INSTEAD_FLUSH.
void Unmap() {
	if (!GLLoader::found_GL_ARB_buffer_storage) {
		gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
		return;
	}

	// GL4.4 do a glMemoryBarrier? or glFlushMappedBufferRange?
#ifndef BARRIER_INSTEAD_FLUSH
	// The flush works on the bound buffer, hence the bind/unbind pair
	BindPbo();
	gl_FlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size);
	UnbindPbo();
#else
	gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
#endif
}
// Byte offset recorded for the current PBO, i.e. where the data of the
// transfer in progress starts inside that buffer.
uint32 Offset() {
	const uint32 current_offset = m_offset[m_current_pbo];
	return current_offset;
}
void Destroy() { void Destroy() {
gl_DeleteBuffers(countof(pool), pool); gl_DeleteBuffers(countof(m_pool), m_pool);
} }
void BindPbo() { void BindPbo() {
gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, pool[current_pbo]); gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, m_pool[m_current_pbo]);
current_pbo = (current_pbo + 1) & (countof(pool)-1); }
void NextPbo() {
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
} }
void UnbindPbo() { void UnbindPbo() {
gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
} }
void EndTransfer() {
// Note: keep offset aligned for SSE/AVX
m_offset[m_current_pbo] += (m_size + 64) & ~0x3F;
}
} }
// FIXME: check if it possible to always use those setup by default // FIXME: check if it possible to always use those setup by default
@ -210,27 +319,35 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
EnableUnit(); EnableUnit();
PboPool::BindPbo(); // Note: FGLRX crashes with the default path. It is happy with PBO. However not sure PBO are big enough for
// big upscale
// Note: with latest improvement, Pbo could be faster
#if 1
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
char* map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, (pitch * r.height()) << m_int_shift, GL_MAP_WRITE_BIT);
char* src = (char*)data;
uint32 line_size = r.width() << m_int_shift; uint32 line_size = r.width() << m_int_shift;
char* src = (char*)data;
char* map = PboPool::Map(r.height() * line_size);
for (uint32 h = r.height(); h > 0; h--) { for (uint32 h = r.height(); h > 0; h--) {
memcpy(map, src, line_size); GSVector4i::storent(map, src, line_size);
//memcpy(map, src, line_size);
src += pitch; src += pitch;
map += line_size; map += line_size;
} }
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)0); PboPool::Unmap();
PboPool::UnbindPbo(); glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
if (!GLLoader::found_GL_ARB_buffer_storage)
PboPool::UnbindPbo();
PboPool::EndTransfer();
return true; return true;
#if 0 #else
// pitch is in byte whereas GL_UNPACK_ROW_LENGTH is in pixel // pitch is in byte whereas GL_UNPACK_ROW_LENGTH is in pixel
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment); glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);

View File

@ -23,12 +23,20 @@
#include "GSTexture.h" #include "GSTexture.h"
namespace PboPool { // FIXME find the optimal number of PBO
extern GLuint pool[8]; #define PBO_POOL_SIZE 4
extern uint32 current_pbo;
namespace PboPool {
void BindPbo(); void BindPbo();
void UnbindPbo(); void UnbindPbo();
void NextPbo();
char* Map(uint32 size);
void MapAll();
void Unmap();
void UnmapAll();
uint32 Offset();
void EndTransfer();
void Init(); void Init();
void Destroy(); void Destroy();

View File

@ -50,7 +50,9 @@ class GSBufferOGL {
{ {
gl_GenBuffers(1, &m_buffer); gl_GenBuffers(1, &m_buffer);
// Opengl works best with 1-4MB buffer. // Opengl works best with 1-4MB buffer.
// Warning m_limit is the number of object (not the size in Bytes)
m_limit = 2 * 1024 * 1024 / m_stride; m_limit = 2 * 1024 * 1024 / m_stride;
//m_limit = 512 * 1024 * m_stride;
} }
~GSBufferOGL() { gl_DeleteBuffers(1, &m_buffer); } ~GSBufferOGL() { gl_DeleteBuffers(1, &m_buffer); }
@ -75,9 +77,12 @@ class GSBufferOGL {
// Current GPU buffer is really too small need to allocate a new one // Current GPU buffer is really too small need to allocate a new one
if (m_count > m_limit) { if (m_count > m_limit) {
//fprintf(stderr, "Allocate a new buffer\n %d", m_stride);
allocate(std::max<int>(m_count * 3 / 2, m_limit)); allocate(std::max<int>(m_count * 3 / 2, m_limit));
} else if (m_count > (m_limit - m_start) ) { } else if (m_count > (m_limit - m_start) ) {
//fprintf(stderr, "Orphan the buffer %d\n", m_stride);
// Not enough left free room. Just go back at the beginning // Not enough left free room. Just go back at the beginning
m_start = 0; m_start = 0;
// Orphan the buffer to avoid synchronization // Orphan the buffer to avoid synchronization
@ -91,8 +96,9 @@ class GSBufferOGL {
{ {
void* dst; void* dst;
if (Map(&dst, count)) { if (Map(&dst, count)) {
// FIXME which one to use // FIXME which one to use. Note dst doesn't have any alignment guarantee
// GSVector4i::storent(dst, src, m_count * m_stride); // because it depends on the offset
//GSVector4i::storent(dst, src, m_count * m_stride);
memcpy(dst, src, m_stride*m_count); memcpy(dst, src, m_stride*m_count);
Unmap(); Unmap();
} }

View File

@ -82,6 +82,10 @@ void GSWndGL::PopulateGlFunction()
*(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer"); *(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer");
*(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer"); *(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer");
*(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData"); *(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData");
*(void**)&(gl_FenceSync) = GetProcAddress("glFenceSync");
*(void**)&(gl_DeleteSync) = GetProcAddress("glDeleteSync");
*(void**)&(gl_ClientWaitSync) = GetProcAddress("glClientWaitSync");
*(void**)&(gl_FlushMappedBufferRange) = GetProcAddress("glFlushMappedBufferRange");
// GL4.0 // GL4.0
*(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv"); *(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv");
// GL4.1 // GL4.1

View File

@ -144,8 +144,7 @@ bool GSWndOGL::Attach(void* handle, bool managed)
CheckContext(); CheckContext();
m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalMESA"); m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT");
//PFNGLXSWAPINTERVALMESAPROC m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapInterval");
PopulateGlFunction(); PopulateGlFunction();
@ -192,6 +191,8 @@ bool GSWndOGL::Create(const string& title, int w, int h)
CheckContext(); CheckContext();
m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT");
PopulateGlFunction(); PopulateGlFunction();
return true; return true;
@ -258,7 +259,7 @@ void GSWndOGL::SetVSync(bool enable)
// m_swapinterval uses an integer as parameter // m_swapinterval uses an integer as parameter
// 0 -> disable vsync // 0 -> disable vsync
// n -> wait n frame // n -> wait n frame
if (m_swapinterval) m_swapinterval((int)enable); if (m_swapinterval) m_swapinterval(m_NativeDisplay, m_NativeWindow, (int)enable);
} }
void GSWndOGL::Flip() void GSWndOGL::Flip()

View File

@ -31,7 +31,7 @@ class GSWndOGL : public GSWndGL
Display* m_NativeDisplay; Display* m_NativeDisplay;
GLXContext m_context; GLXContext m_context;
PFNGLXSWAPINTERVALMESAPROC m_swapinterval; PFNGLXSWAPINTERVALEXTPROC m_swapinterval;
void CreateContext(int major, int minor); void CreateContext(int major, int minor);
void CheckContext(); void CheckContext();