From be3059f5471f4e805e67bbc40e7e96e711fa649b Mon Sep 17 00:00:00 2001 From: rogerman Date: Fri, 21 Dec 2012 03:19:05 +0000 Subject: [PATCH] OpenGL Renderer: - Add PBO support. - Pixel reads are now multithreaded, giving a significant performance boost. (Note: This optimization does not work on GPUs lacking PBO support and running on Windows. Someone please research...) --- desmume/src/OGLRender.cpp | 237 ++++++++++++++++++++++++++++---------- desmume/src/gfx3d.cpp | 9 +- desmume/src/rasterize.cpp | 1 + desmume/src/render3D.cpp | 1 + desmume/src/render3D.h | 3 + 5 files changed, 188 insertions(+), 63 deletions(-) diff --git a/desmume/src/OGLRender.cpp b/desmume/src/OGLRender.cpp index 87d91a3a8..d1fd89285 100644 --- a/desmume/src/OGLRender.cpp +++ b/desmume/src/OGLRender.cpp @@ -75,6 +75,7 @@ static void ENDGL() { #include "shaders.h" #include "texcache.h" +#include "utils/task.h" static DS_ALIGN(16) u8 GPU_screen3D [256*192*4]; @@ -82,7 +83,12 @@ static const GLenum map3d_cull[4] = {GL_FRONT_AND_BACK, GL_FRONT, GL_BACK, 0}; static const GLint texEnv[4] = { GL_MODULATE, GL_DECAL, GL_MODULATE, GL_MODULATE }; static const GLenum depthFunc[2] = { GL_LESS, GL_EQUAL }; -/// Polygon Info +// Multithreading States +static bool enableMultithreading = false; +static bool isReadPixelsWorking = false; +static Task oglReadPixelsTask; + +// Polygon Info static PolygonAttributes currentPolyAttr; static PolygonTexParams currentTexParams; static GLenum depthFuncMode = 0; @@ -95,6 +101,7 @@ static u32 stencilStateSet = -1; // OpenGL Feature Support static char *extString = NULL; static bool isVBOSupported = false; +static bool isPBOSupported = false; static bool isFBOSupported = false; static bool isShaderSupported = false; @@ -111,6 +118,10 @@ static u32 oglClearImageScrollOld = 0; static GLuint vboVertexID; static GLuint vboTexCoordID; +// PBO +static GLuint pboRenderDataID[2]; +static u8 *pboRenderBuffer[2]; + // Shader states static GLuint vertexShaderID; static GLuint 
fragmentShaderID; @@ -174,11 +185,13 @@ OGLEXT(PFNGLUNIFORM1IPROC,glUniform1i) OGLEXT(PFNGLUNIFORM1IVPROC,glUniform1iv) OGLEXT(PFNGLUNIFORM1FPROC,glUniform1f) OGLEXT(PFNGLUNIFORM2FPROC,glUniform2f) -// VBO +// VBO and PBO OGLEXT(PFNGLGENBUFFERSPROC,glGenBuffersARB) OGLEXT(PFNGLDELETEBUFFERSPROC,glDeleteBuffersARB) OGLEXT(PFNGLBINDBUFFERPROC,glBindBufferARB) OGLEXT(PFNGLBUFFERDATAPROC,glBufferDataARB) +OGLEXT(PFNGLMAPBUFFERPROC,glMapBufferARB) +OGLEXT(PFNGLUNMAPBUFFERPROC,glUnmapBufferARB) // FBO OGLEXT(PFNGLGENFRAMEBUFFERSEXTPROC,glGenFramebuffersEXT); OGLEXT(PFNGLBINDFRAMEBUFFEREXTPROC,glBindFramebufferEXT); @@ -193,6 +206,64 @@ OGLEXT(PFNGLBLITFRAMEBUFFEREXTPROC,glBlitFramebufferEXT); OGLEXT(PFNGLACTIVETEXTUREPROC,glActiveTexture) #endif +static void* execReadPixelsTask(void *arg) +{ + u8 *pixBuffer = NULL; + + if (isPBOSupported) + { + unsigned int *bufferIndex = (unsigned int *)arg; + + if(!BEGINGL()) return 0; + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, pboRenderDataID[*bufferIndex]); + + pboRenderBuffer[*bufferIndex] = (u8 *)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY_ARB); + if (pboRenderBuffer[*bufferIndex] != NULL) + { + glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB); + } + + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0); + ENDGL(); + + pixBuffer = pboRenderBuffer[*bufferIndex]; + } + else + { + if(!BEGINGL()) return 0; + glReadPixels(0, 0, 256, 192, GL_BGRA_EXT, GL_UNSIGNED_BYTE, GPU_screen3D); + ENDGL(); + + pixBuffer = GPU_screen3D; + } + + //convert the pixels to a different format which is more convenient + //is it safe to modify the screen buffer? 
if not, we could make a temp copy + for(int i=0,y=191;y>=0;y--) + { + u8* dst = gfx3d_convertedScreen + (y<<(8+2)); + + for(int x=0;x<256;x++,i++) + { + u32 &u32screen3D = ((u32*)pixBuffer)[i]; + u32screen3D>>=2; + u32screen3D &= 0x3F3F3F3F; + + const int t = i<<2; + const u8 a = pixBuffer[t+3] >> 1; + const u8 r = pixBuffer[t+2]; + const u8 g = pixBuffer[t+1]; + const u8 b = pixBuffer[t+0]; + + *dst++ = r; + *dst++ = g; + *dst++ = b; + *dst++ = a; + } + } + + return 0; +} //opengl state caching: //This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. @@ -507,11 +578,13 @@ static char OGLInit(void) INITOGLEXT(PFNGLGETPROGRAMIVPROC,glGetProgramiv) INITOGLEXT(PFNGLGETPROGRAMINFOLOGPROC,glGetProgramInfoLog) INITOGLEXT(PFNGLVALIDATEPROGRAMPROC,glValidateProgram) - // VBO + // VBO and PBO INITOGLEXT(PFNGLGENBUFFERSPROC,glGenBuffersARB) INITOGLEXT(PFNGLDELETEBUFFERSPROC,glDeleteBuffersARB) INITOGLEXT(PFNGLBINDBUFFERPROC,glBindBufferARB) INITOGLEXT(PFNGLBUFFERDATAPROC,glBufferDataARB) + INITOGLEXT(PFNGLMAPBUFFERPROC,glMapBufferARB) + INITOGLEXT(PFNGLUNMAPBUFFERPROC,glUnmapBufferARB) // FBO INITOGLEXT(PFNGLGENFRAMEBUFFERSEXTPROC,glGenFramebuffersEXT); INITOGLEXT(PFNGLBINDFRAMEBUFFEREXTPROC,glBindFramebufferEXT); @@ -577,6 +650,21 @@ static char OGLInit(void) glGenBuffersARB(1, &vboVertexID); glGenBuffersARB(1, &vboTexCoordID); } + + // PBO + isPBOSupported = (strstr(extString, "GL_ARB_pixel_buffer_object") == NULL)?false:true; + if (isPBOSupported) + { + glGenBuffersARB(2, pboRenderDataID); + for (unsigned int i = 0; i < 2; i++) + { + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, pboRenderDataID[i]); + glBufferDataARB(GL_PIXEL_PACK_BUFFER_ARB, 256 * 192 * sizeof(u32), NULL, GL_STREAM_READ_ARB); + pboRenderBuffer[i] = NULL; + } + + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0); + } if(isShaderSupported) { @@ -659,6 +747,33 @@ static char OGLInit(void) ENDGL(); + // Set up multithreading + isReadPixelsWorking = false; + + if 
(CommonSettings.num_cores > 1) + { +#ifdef _WINDOWS + if (!isPBOSupported) + { + // Don't know why this doesn't work on Windows when the GPU + // lacks PBO support. Someone please research. + enableMultithreading = false; + } + else + { + enableMultithreading = true; + oglReadPixelsTask.start(false); + } +#else + enableMultithreading = true; + oglReadPixelsTask.start(false); +#endif + } + else + { + enableMultithreading = false; + } + // Maintain our own vertex index buffer for vertex batching and primitive // conversions. Such conversions are necessary since OpenGL deprecates // primitives like GL_QUADS and GL_QUAD_STRIP in later versions. @@ -714,6 +829,13 @@ static void OGLClose() glDeleteBuffersARB(1, &vboTexCoordID); } + if (isPBOSupported) + { + glDeleteBuffersARB(2, pboRenderDataID); + pboRenderBuffer[0] = NULL; + pboRenderBuffer[1] = NULL; + } + // FBO if (isFBOSupported) { @@ -747,6 +869,13 @@ static void OGLClose() } ENDGL(); + + if (enableMultithreading) + { + oglReadPixelsTask.finish(); + oglReadPixelsTask.shutdown(); + isReadPixelsWorking = false; + } } static void texDeleteCallback(TexCacheItem* item) @@ -968,71 +1097,42 @@ static void Control() } } - static void GL_ReadFramebuffer() { - if(!BEGINGL()) return; - glReadPixels(0, 0, 256, 192, GL_BGRA_EXT, GL_UNSIGNED_BYTE, GPU_screen3D); - ENDGL(); - - //convert the pixels to a different format which is more convenient - //is it safe to modify the screen buffer? 
if not, we could make a temp copy - for(int i=0,y=191;y>=0;y--) + static unsigned int bufferIndex = 0; + + bufferIndex = (bufferIndex + 1) % 2; + + if (isPBOSupported) { - u8* dst = gfx3d_convertedScreen + (y<<(8+2)); - - for(int x=0;x<256;x++,i++) - { - u32 &u32screen3D = ((u32*)GPU_screen3D)[i]; - u32screen3D>>=2; - u32screen3D &= 0x3F3F3F3F; - - const int t = i<<2; - const u8 a = GPU_screen3D[t+3] >> 1; - const u8 r = GPU_screen3D[t+2]; - const u8 g = GPU_screen3D[t+1]; - const u8 b = GPU_screen3D[t+0]; - *dst++ = r; - *dst++ = g; - *dst++ = b; - *dst++ = a; - } + if(!BEGINGL()) return; + + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, pboRenderDataID[bufferIndex]); + glReadPixels(0, 0, 256, 192, GL_BGRA_EXT, GL_UNSIGNED_BYTE, 0); + glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0); + + ENDGL(); } - -#if 0 - //convert the pixels to a different format which is more convenient - //is it safe to modify the screen buffer? if not, we could make a temp copy - for(int i=0,y=191;y>=0;y--) + + // If multithreading is enabled, call glReadPixels() on a separate thread + // (or glMapBuffer()/glUnmapBuffer() if PBOs are supported). This is a big + // deal, since these functions can cause the thread to block. If 3D rendering + // is happening on the same thread as the core emulation, (which is the most + // likely scenario), this can make the thread stall. + // + // We can get away with doing this since 3D rendering begins on H-Start, + // but the emulation doesn't actually need the rendered data until H-Blank. + // So in between that time, we can let these functions block the other thread + // and then only block this thread for the remaining time difference. 
+ if (enableMultithreading) { - u16* dst = gfx3d_convertedScreen + (y<<8); - u8* dstAlpha = gfx3d_convertedAlpha + (y<<8); - - //I dont know much about this kind of stuff, but this seems to help - //for some reason I couldnt make the intrinsics work - //u8* u8screen3D = (u8*)&((u32*)GPU_screen3D)[i]; - /*#define PREFETCH32(X,Y) __asm { prefetchnta [u8screen3D+32*0x##X##Y] } - #define PREFETCH128(X) PREFETCH32(X,0) PREFETCH32(X,1) PREFETCH32(X,2) PREFETCH32(X,3) \ - PREFETCH32(X,4) PREFETCH32(X,5) PREFETCH32(X,6) PREFETCH32(X,7) \ - PREFETCH32(X,8) PREFETCH32(X,9) PREFETCH32(X,A) PREFETCH32(X,B) \ - PREFETCH32(X,C) PREFETCH32(X,D) PREFETCH32(X,E) PREFETCH32(X,F) - PREFETCH128(0); PREFETCH128(1);*/ - - for(int x=0;x<256;x++,i++) - { - u32 &u32screen3D = ((u32*)GPU_screen3D)[i]; - u32screen3D>>=3; - u32screen3D &= 0x1F1F1F1F; - - const int t = i<<2; - const u8 a = GPU_screen3D[t+3]; - const u8 r = GPU_screen3D[t+2]; - const u8 g = GPU_screen3D[t+1]; - const u8 b = GPU_screen3D[t+0]; - dst[x] = R5G5B5TORGB15(r,g,b) | alpha_lookup[a]; - dstAlpha[x] = a; - } + isReadPixelsWorking = true; + oglReadPixelsTask.execute(execReadPixelsTask, &bufferIndex); + } + else + { + execReadPixelsTask(&bufferIndex); } -#endif } // TODO: optimize @@ -1372,6 +1472,18 @@ static void OGLVramReconfigureSignal() TexCache_Invalidate(); } +static u8* OGLGetLineData(u8 lineNumber) +{ + // If OpenGL is still reading back pixels on a separate thread, wait for it to finish. 
+ if (isReadPixelsWorking) + { + oglReadPixelsTask.finish(); + isReadPixelsWorking = false; + } + + return ( gfx3d_convertedScreen + (lineNumber << (8+2)) ); +} + GPU3DInterface gpu3Dgl = { "OpenGL", OGLInit, @@ -1379,4 +1491,5 @@ GPU3DInterface gpu3Dgl = { OGLClose, OGLRender, OGLVramReconfigureSignal, + OGLGetLineData }; diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index 6d304af03..57204702e 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -2291,7 +2291,14 @@ void gfx3d_glGetLightColor(unsigned int index, unsigned int* dest) void gfx3d_GetLineData(int line, u8** dst) { - *dst = gfx3d_convertedScreen+((line)<<(8+2)); + if (gpu3D->NDS_3D_GetLineData == NULL) + { + *dst = gfx3d_convertedScreen+((line)<<(8+2)); + } + else + { + *dst = gpu3D->NDS_3D_GetLineData(line); + } } void gfx3d_GetLineData15bpp(int line, u16** dst) diff --git a/desmume/src/rasterize.cpp b/desmume/src/rasterize.cpp index 114ee57ac..3858cc96d 100644 --- a/desmume/src/rasterize.cpp +++ b/desmume/src/rasterize.cpp @@ -1650,5 +1650,6 @@ GPU3DInterface gpu3DRasterize = { SoftRastClose, SoftRastRender, SoftRastVramReconfigureSignal, + NULL }; diff --git a/desmume/src/render3D.cpp b/desmume/src/render3D.cpp index f6a78347a..180469162 100644 --- a/desmume/src/render3D.cpp +++ b/desmume/src/render3D.cpp @@ -30,6 +30,7 @@ GPU3DInterface gpu3DNull = { NDS_nullFunc1, //NDS_3D_Close NDS_nullFunc1, //NDS_3D_Render NDS_nullFunc1, //NDS_3D_VramReconfigureSignal + 0 }; GPU3DInterface *gpu3D = &gpu3DNull; diff --git a/desmume/src/render3D.h b/desmume/src/render3D.h index 040dcc052..6c64b7242 100644 --- a/desmume/src/render3D.h +++ b/desmume/src/render3D.h @@ -43,6 +43,9 @@ typedef struct Render3DInterface //called when the emulator reconfigures its vram. you may need to invalidate your texture cache. 
void (CALL_CONVENTION* NDS_3D_VramReconfigureSignal) (); + + //called when the emulator requests rendered graphics data + u8* (CALL_CONVENTION* NDS_3D_GetLineData) (u8 lineNumber); } GPU3DInterface;