gsdx ogl:

* rework most of the texture upload path (PBO): the colin3 benchmark now runs at 32 fps (vs 26 fps 2 weeks ago)
* use the cross-vendor vsync extension on Linux (the previous one wasn't supported by NVIDIA)


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5721 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2013-08-17 09:05:41 +00:00
parent 07605941ef
commit 690432de30
8 changed files with 172 additions and 28 deletions

View File

@ -81,6 +81,10 @@ PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages = NULL;
PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer = NULL;
PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer = NULL;
PFNGLBUFFERSUBDATAPROC gl_BufferSubData = NULL;
PFNGLFENCESYNCPROC gl_FenceSync = NULL;
PFNGLDELETESYNCPROC gl_DeleteSync = NULL;
PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync = NULL;
PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange = NULL;
// GL4.0
PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv = NULL;
// GL4.1

View File

@ -145,6 +145,10 @@ extern PFNGLUSEPROGRAMSTAGESPROC gl_UseProgramStages;
extern PFNGLVERTEXATTRIBIPOINTERPROC gl_VertexAttribIPointer;
extern PFNGLVERTEXATTRIBPOINTERPROC gl_VertexAttribPointer;
extern PFNGLBUFFERSUBDATAPROC gl_BufferSubData;
extern PFNGLFENCESYNCPROC gl_FenceSync;
extern PFNGLDELETESYNCPROC gl_DeleteSync;
extern PFNGLCLIENTWAITSYNCPROC gl_ClientWaitSync;
extern PFNGLFLUSHMAPPEDBUFFERRANGEPROC gl_FlushMappedBufferRange;
// GL4.0
extern PFNGLUNIFORMSUBROUTINESUIVPROC gl_UniformSubroutinesuiv;
// GL4.1

View File

@ -24,35 +24,144 @@
#include "GSTextureOGL.h"
#include "GLState.h"
// A flush needs a bind/unbind of the buffer
// A barrier might synchronize much more
#define BARRIER_INSTEAD_FLUSH
namespace PboPool {
GLuint pool[8];
uint32 current_pbo = 0;
GLuint m_pool[PBO_POOL_SIZE];
uint32 m_offset[PBO_POOL_SIZE];
char* m_map[PBO_POOL_SIZE];
uint32 m_current_pbo = 0;
uint32 m_size;
const uint32 m_pbo_size = (640*480*16) << 2;
// Generate the pool's buffer objects, then bind each one in turn and
// allocate its data store.
void Init() {
gl_GenBuffers(countof(pool), pool);
gl_GenBuffers(countof(m_pool), m_pool);
GLuint size = (640*480*16) << 2;
for (size_t i = 0; i < countof(pool); i++) {
for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo();
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, size, NULL, GL_STREAM_DRAW);
if (GLLoader::found_GL_ARB_buffer_storage) {
// Storage created with the write + persistent map bits, so the buffer
// can later be mapped with GL_MAP_PERSISTENT_BIT (see MapAll)
gl_BufferStorage(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT | GL_CLIENT_STORAGE_BIT);
} else {
// Classic data store; the per-PBO write offset and mapping pointer
// are reset on this path only
gl_BufferData(GL_PIXEL_UNPACK_BUFFER, m_pbo_size, NULL, GL_STREAM_DRAW);
m_offset[m_current_pbo] = 0;
m_map[m_current_pbo] = NULL;
}
// Move on to the next buffer of the pool
NextPbo();
}
// Leave no PBO bound to the unpack target
UnbindPbo();
}
void MapAll() {
if (m_map[m_current_pbo] != NULL) return;
// FIXME I'm not sure it is allowed to map another buffer after we get a pointer
#ifdef BARRIER_INSTEAD_FLUSH
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT;
#else
GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT;
#endif
for (size_t i = 0; i < countof(m_pool); i++) {
BindPbo();
m_map[m_current_pbo] = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, m_pbo_size, flags);
NextPbo();
}
UnbindPbo();
}
// Reserve `size` bytes in the current PBO and return a CPU pointer to them.
//
// The requested size is remembered in m_size (used by Unmap/EndTransfer).
// When the request does not fit after the current offset, the pool moves to
// the next PBO and restarts at offset 0. A request that is not smaller than
// a whole PBO is only reported on stderr; the mapping is still attempted.
//
// With GL_ARB_buffer_storage the pointer comes from the mappings kept in
// m_map; otherwise the current PBO is bound and the range is mapped here.
char* Map(uint32 size) {
	m_size = size;

	if (m_size >= m_pbo_size) {
		fprintf(stderr, "BUG: PBO too small %d but need %d\n", m_pbo_size, m_size);
	}

	if (GLLoader::found_GL_ARB_buffer_storage) {
		// Make sure the pool mappings exist before handing out an address
		MapAll();

		if (m_offset[m_current_pbo] + m_size >= m_pbo_size) {
			NextPbo();
			// Mark current pbo free
			m_offset[m_current_pbo] = 0;
		}

		return m_map[m_current_pbo] + m_offset[m_current_pbo];
	}

	// No buffer storage: map only the needed range of the current PBO.
	const bool start_new_pbo = (m_offset[m_current_pbo] + m_size >= m_pbo_size);
	if (start_new_pbo) {
		NextPbo();
		// Mark current pbo free
		m_offset[m_current_pbo] = 0;
	}

	// A freshly selected PBO is invalidated as a whole, otherwise only the
	// mapped range is invalidated.
	const GLbitfield map_flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT
		| (start_new_pbo ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_INVALIDATE_RANGE_BIT);

	// Pbo ready let's get a pointer
	BindPbo();
	return (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size, map_flags);
}
// Unmap every PBO of the pool and forget the pointers obtained by MapAll(),
// so that a later MapAll() maps the buffers again instead of returning early.
void UnmapAll() {
	// MapAll() maps all the pool entries together, so checking the current
	// entry is enough to know whether anything is mapped.
	if (m_map[m_current_pbo] == NULL) return;

	for (size_t i = 0; i < countof(m_pool); i++) {
		BindPbo();
		gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
		// Must be an assignment: the previous `==` was a no-op comparison that
		// left a stale pointer behind, which Map() would then hand out.
		m_map[m_current_pbo] = NULL;
		NextPbo();
	}

	UnbindPbo();
}
// Finish the CPU write started by Map().
// Without GL_ARB_buffer_storage the buffer bound to GL_PIXEL_UNPACK_BUFFER is
// unmapped. With it, the mapping is kept and the write is either followed by
// a client-mapped-buffer memory barrier or flushed explicitly, depending on
// BARRIER_INSTEAD_FLUSH.
void Unmap() {
	if (!GLLoader::found_GL_ARB_buffer_storage) {
		gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
		return;
	}

	// GL4.4 do a glMemoryBarrier? or glFlushMappedBufferRange?
#ifdef BARRIER_INSTEAD_FLUSH
	gl_MemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
#else
	// The flush works on the bound buffer, hence the bind/unbind pair
	BindPbo();
	gl_FlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, m_offset[m_current_pbo], m_size);
	UnbindPbo();
#endif
}
// Write offset currently recorded for the active PBO.
uint32 Offset() {
	const uint32 current_offset = m_offset[m_current_pbo];
	return current_offset;
}
// Delete the buffer objects of the pool.
void Destroy() {
gl_DeleteBuffers(countof(pool), pool);
gl_DeleteBuffers(countof(m_pool), m_pool);
}
// Bind a buffer of the pool to the GL_PIXEL_UNPACK_BUFFER target.
void BindPbo() {
gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, pool[current_pbo]);
current_pbo = (current_pbo + 1) & (countof(pool)-1);
gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, m_pool[m_current_pbo]);
}
void NextPbo() {
m_current_pbo = (m_current_pbo + 1) & (countof(m_pool)-1);
}
void UnbindPbo() {
gl_BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
}
void EndTransfer() {
// Note: keep offset aligned for SSE/AVX
m_offset[m_current_pbo] += (m_size + 64) & ~0x3F;
}
}
// FIXME: check if it possible to always use those setup by default
@ -210,27 +319,35 @@ bool GSTextureOGL::Update(const GSVector4i& r, const void* data, int pitch)
EnableUnit();
PboPool::BindPbo();
// Note: FGLRX crashes with the default path. It is happy with PBO. However not sure PBO are big enough for
// big upscale
// Note: with latest improvement, Pbo could be faster
#if 1
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);
char* map = (char*)gl_MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, (pitch * r.height()) << m_int_shift, GL_MAP_WRITE_BIT);
char* src = (char*)data;
uint32 line_size = r.width() << m_int_shift;
char* src = (char*)data;
char* map = PboPool::Map(r.height() * line_size);
for (uint32 h = r.height(); h > 0; h--) {
memcpy(map, src, line_size);
GSVector4i::storent(map, src, line_size);
//memcpy(map, src, line_size);
src += pitch;
map += line_size;
}
gl_UnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)0);
PboPool::Unmap();
PboPool::UnbindPbo();
glTexSubImage2D(GL_TEXTURE_2D, 0, r.x, r.y, r.width(), r.height(), m_int_format, m_int_type, (const void*)PboPool::Offset());
if (!GLLoader::found_GL_ARB_buffer_storage)
PboPool::UnbindPbo();
PboPool::EndTransfer();
return true;
#if 0
#else
// pitch is in bytes whereas GL_UNPACK_ROW_LENGTH is in pixels
glPixelStorei(GL_UNPACK_ALIGNMENT, m_int_alignment);

View File

@ -23,12 +23,20 @@
#include "GSTexture.h"
namespace PboPool {
extern GLuint pool[8];
extern uint32 current_pbo;
// FIXME find the optimal number of PBO
#define PBO_POOL_SIZE 4
namespace PboPool {
void BindPbo();
void UnbindPbo();
void NextPbo();
char* Map(uint32 size);
void MapAll();
void Unmap();
void UnmapAll();
uint32 Offset();
void EndTransfer();
void Init();
void Destroy();

View File

@ -50,7 +50,9 @@ class GSBufferOGL {
{
gl_GenBuffers(1, &m_buffer);
// OpenGL works best with 1-4MB buffers.
// Warning: m_limit is a number of objects (not a size in bytes)
m_limit = 2 * 1024 * 1024 / m_stride;
//m_limit = 512 * 1024 * m_stride;
}
~GSBufferOGL() { gl_DeleteBuffers(1, &m_buffer); }
@ -75,9 +77,12 @@ class GSBufferOGL {
// Current GPU buffer is really too small need to allocate a new one
if (m_count > m_limit) {
//fprintf(stderr, "Allocate a new buffer\n %d", m_stride);
allocate(std::max<int>(m_count * 3 / 2, m_limit));
} else if (m_count > (m_limit - m_start) ) {
//fprintf(stderr, "Orphan the buffer %d\n", m_stride);
// Not enough left free room. Just go back at the beginning
m_start = 0;
// Orphan the buffer to avoid synchronization
@ -91,8 +96,9 @@ class GSBufferOGL {
{
void* dst;
if (Map(&dst, count)) {
// FIXME which one to use
// GSVector4i::storent(dst, src, m_count * m_stride);
// FIXME which one to use. Note dst doesn't have any alignment guarantee
// because it depends on the offset
//GSVector4i::storent(dst, src, m_count * m_stride);
memcpy(dst, src, m_stride*m_count);
Unmap();
}

View File

@ -82,6 +82,10 @@ void GSWndGL::PopulateGlFunction()
*(void**)&(gl_VertexAttribIPointer) = GetProcAddress("glVertexAttribIPointer");
*(void**)&(gl_VertexAttribPointer) = GetProcAddress("glVertexAttribPointer");
*(void**)&(gl_BufferSubData) = GetProcAddress("glBufferSubData");
*(void**)&(gl_FenceSync) = GetProcAddress("glFenceSync");
*(void**)&(gl_DeleteSync) = GetProcAddress("glDeleteSync");
*(void**)&(gl_ClientWaitSync) = GetProcAddress("glClientWaitSync");
*(void**)&(gl_FlushMappedBufferRange) = GetProcAddress("glFlushMappedBufferRange");
// GL4.0
*(void**)&(gl_UniformSubroutinesuiv) = GetProcAddress("glUniformSubroutinesuiv");
// GL4.1

View File

@ -144,8 +144,7 @@ bool GSWndOGL::Attach(void* handle, bool managed)
CheckContext();
m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalMESA");
//PFNGLXSWAPINTERVALMESAPROC m_swapinterval = (PFNGLXSWAPINTERVALMESAPROC)glXGetProcAddress((const GLubyte*) "glXSwapInterval");
m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT");
PopulateGlFunction();
@ -192,6 +191,8 @@ bool GSWndOGL::Create(const string& title, int w, int h)
CheckContext();
m_swapinterval = (PFNGLXSWAPINTERVALEXTPROC)glXGetProcAddress((const GLubyte*) "glXSwapIntervalEXT");
PopulateGlFunction();
return true;
@ -258,7 +259,7 @@ void GSWndOGL::SetVSync(bool enable)
// m_swapinterval uses an integer as parameter
// 0 -> disable vsync
// n -> wait n frame
if (m_swapinterval) m_swapinterval((int)enable);
if (m_swapinterval) m_swapinterval(m_NativeDisplay, m_NativeWindow, (int)enable);
}
void GSWndOGL::Flip()

View File

@ -31,7 +31,7 @@ class GSWndOGL : public GSWndGL
Display* m_NativeDisplay;
GLXContext m_context;
PFNGLXSWAPINTERVALMESAPROC m_swapinterval;
PFNGLXSWAPINTERVALEXTPROC m_swapinterval;
void CreateContext(int major, int minor);
void CheckContext();