From 95aeedec19db25f7de3bd37cc7281f6f70e0ab58 Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 26 Nov 2013 20:05:49 +0100 Subject: [PATCH] OpenGL: readback efb2ram with different strides at once This is done with a pixel buffer object. We still have to stall the GPU, but we only do it once per efb2ram call. As the CPU can't access the VRAM, it has to queue a memcpy for the GPU and wait for the GPU to finish this copy. We did this for every cache line, which is just stupid. Now we copy the complete texture into a PBO and read it back at once, so we don't have to wait for lots of round-trip times. --- .../OGL/Src/TextureConverter.cpp | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp index f653dfbf01..290dc510cf 100644 --- a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp +++ b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp @@ -44,6 +44,8 @@ static GLuint s_encode_VBO = 0; static GLuint s_encode_VAO = 0; static TargetRectangle s_cached_sourceRc; +static GLuint s_PBO = 0; // for readback with different strides + static const char *VProgram = "ATTRIN vec2 rawpos;\n" "ATTRIN vec2 tex0;\n" @@ -186,6 +188,8 @@ void Init() glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glGenBuffers(1, &s_PBO); + CreatePrograms(); } @@ -196,6 +200,7 @@ void Shutdown() glDeleteFramebuffers(1, &s_texConvFrameBuffer); glDeleteBuffers(1, &s_encode_VBO ); glDeleteVertexArrays(1, &s_encode_VAO ); + glDeleteBuffers(1, &s_PBO); s_rgbToYuyvProgram.Destroy(); s_yuyvToRgbProgram.Destroy(); @@ -206,6 +211,7 @@ void Shutdown() s_srcTexture = 0; s_dstTexture = 0; s_texConvFrameBuffer = 0; + s_PBO = 0; } void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc, @@ -267,25 +273,37 @@ void 
EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc, // TODO: make this less slow. int writeStride = bpmem.copyMipMapStrideChannels * 32; + int dstSize = dstWidth*dstHeight*4; + int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel + int readLoops = dstHeight / readHeight; - if (writeStride != readStride && toTexture) + if (writeStride != readStride && readLoops > 1 && toTexture) { // writing to a texture of a different size + // also copy more then one block line, so the different strides matters + // copy into one pbo first, map this buffer, and then memcpy into gc memory + // in this way, we only have one vram->ram transfer, but maybe a bigger + // cpu overhead because of the pbo + glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO); + glBufferData(GL_PIXEL_PACK_BUFFER, dstSize, NULL, GL_STREAM_READ); + glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0); + u8* pbo = (u8*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, dstSize, GL_MAP_READ_BIT); - int readHeight = readStride / dstWidth; - readHeight /= 4; // 4 bytes per pixel - - int readStart = 0; - int readLoops = dstHeight / readHeight; + //int readStart = 0; for (int i = 0; i < readLoops; i++) { - glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr); - readStart += readHeight; + memcpy(destAddr, pbo, readStride); + pbo += readStride; destAddr += writeStride; } + + glUnmapBuffer(GL_PIXEL_PACK_BUFFER); + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); } else + { glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr); + } GL_REPORT_ERRORD();