OpenGL: readback efb2ram with different strides at once

This is done with a pixel buffer object (PBO). We still have to stall the GPU, but we now only do it once per efb2ram call. Because the CPU can't access the VRAM, it has to queue a memcpy for the GPU and wait for the GPU to finish this copy. Previously we did this for every cache line, which was needlessly wasteful. Now we copy the complete texture into a PBO and read it back all at once, so we no longer have to wait for many round-trip times.
2013-11-26 20:05:49 +01:00 · 2013-11-26 20:05:49 +01:00 · 95aeedec19
parent db9c586356
commit 95aeedec19
1 changed files with 26 additions and 8 deletions
--- a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
@ -44,6 +44,8 @@ static GLuint s_encode_VBO = 0;
 static GLuint s_encode_VAO = 0;
 static TargetRectangle s_cached_sourceRc;

+static GLuint s_PBO = 0; // for readback with different strides
+
 static const char *VProgram =
 	"ATTRIN vec2 rawpos;\n"
 	"ATTRIN vec2 tex0;\n"
@ -186,6 +188,8 @@ void Init()
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
 	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);

+	glGenBuffers(1, &s_PBO);
+
 	CreatePrograms();
 }

@ -196,6 +200,7 @@ void Shutdown()
 	glDeleteFramebuffers(1, &s_texConvFrameBuffer);
 	glDeleteBuffers(1, &s_encode_VBO );
 	glDeleteVertexArrays(1, &s_encode_VAO );
+	glDeleteBuffers(1, &s_PBO);

 	s_rgbToYuyvProgram.Destroy();
 	s_yuyvToRgbProgram.Destroy();
@ -206,6 +211,7 @@ void Shutdown()
 	s_srcTexture = 0;
 	s_dstTexture = 0;
 	s_texConvFrameBuffer = 0;
+	s_PBO = 0;
 }

 void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
@ -267,25 +273,37 @@ void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
 	// TODO: make this less slow.

 	int writeStride = bpmem.copyMipMapStrideChannels * 32;
+	int dstSize = dstWidth*dstHeight*4;
+	int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel
+	int readLoops = dstHeight / readHeight;

-	if (writeStride != readStride && toTexture)
+	if (writeStride != readStride && readLoops > 1 && toTexture)
 	{
 		// writing to a texture of a different size
+		// also copy more than one block line, so the different strides matter
+		// copy into one pbo first, map this buffer, and then memcpy into gc memory
+		// in this way, we only have one vram->ram transfer, but maybe a bigger
+		// cpu overhead because of the pbo
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO);
+		glBufferData(GL_PIXEL_PACK_BUFFER, dstSize, NULL, GL_STREAM_READ);
+		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0);
+		u8* pbo = (u8*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, dstSize, GL_MAP_READ_BIT);

-		int readHeight = readStride / dstWidth;
-		readHeight /= 4; // 4 bytes per pixel
-
-		int readStart = 0;
-		int readLoops = dstHeight / readHeight;
+		//int readStart = 0;
 		for (int i = 0; i < readLoops; i++)
 		{
-			glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
-			readStart += readHeight;
+			memcpy(destAddr, pbo, readStride);
+			pbo += readStride;
 			destAddr += writeStride;
 		}
+
+		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 	}
 	else
+	{
 		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
+	}

 	GL_REPORT_ERRORD();