From 95aeedec19db25f7de3bd37cc7281f6f70e0ab58 Mon Sep 17 00:00:00 2001
From: degasus <wickmarkus@web.de>
Date: Tue, 26 Nov 2013 20:05:49 +0100
Subject: [PATCH] OpenGL: readback efb2ram with different strides at once

This is done with a pixel buffer object (PBO). We still have to stall the GPU,
but we only do it once per efb2ram call.
As the CPU can't access VRAM directly, it has to queue a memcpy for the GPU and
wait for the GPU to finish this copy. Previously we did this for every cache
line, which is needlessly slow. Now we copy the complete texture into a PBO
and read it back all at once, so we no longer wait for many round trips.
---
 .../OGL/Src/TextureConverter.cpp              | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
index f653dfbf01..290dc510cf 100644
--- a/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
+++ b/Source/Core/VideoBackends/OGL/Src/TextureConverter.cpp
@@ -44,6 +44,8 @@ static GLuint s_encode_VBO = 0;
 static GLuint s_encode_VAO = 0;
 static TargetRectangle s_cached_sourceRc;
 
+static GLuint s_PBO = 0; // for readback with different strides
+
 static const char *VProgram =
 	"ATTRIN vec2 rawpos;\n"
 	"ATTRIN vec2 tex0;\n"
@@ -186,6 +188,8 @@ void Init()
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
 	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
 
+	glGenBuffers(1, &s_PBO);
+
 	CreatePrograms();
 }
 
@@ -196,6 +200,7 @@ void Shutdown()
 	glDeleteFramebuffers(1, &s_texConvFrameBuffer);
 	glDeleteBuffers(1, &s_encode_VBO );
 	glDeleteVertexArrays(1, &s_encode_VAO );
+	glDeleteBuffers(1, &s_PBO);
 
 	s_rgbToYuyvProgram.Destroy();
 	s_yuyvToRgbProgram.Destroy();
@@ -206,6 +211,7 @@ void Shutdown()
 	s_srcTexture = 0;
 	s_dstTexture = 0;
 	s_texConvFrameBuffer = 0;
+	s_PBO = 0;
 }
 
 void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
@@ -267,25 +273,37 @@ void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
 	// TODO: make this less slow.
 
 	int writeStride = bpmem.copyMipMapStrideChannels * 32;
+	int dstSize = dstWidth*dstHeight*4;
+	int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel
+	int readLoops = dstHeight / readHeight;
 
-	if (writeStride != readStride && toTexture)
+	if (writeStride != readStride && readLoops > 1 && toTexture)
 	{
 		// writing to a texture of a different size
+		// and copying more than one block line, so the different strides matter:
+		// read into one PBO first, map this buffer, and then memcpy into gc memory.
+		// This way we only have one VRAM->RAM transfer, but maybe a bigger
+		// CPU overhead because of the PBO.
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO);
+		glBufferData(GL_PIXEL_PACK_BUFFER, dstSize, NULL, GL_STREAM_READ);
+		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0);
+		u8* pbo = (u8*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, dstSize, GL_MAP_READ_BIT);
 
-		int readHeight = readStride / dstWidth;
-		readHeight /= 4; // 4 bytes per pixel
-
-		int readStart = 0;
-		int readLoops = dstHeight / readHeight;
+		//int readStart = 0;
 		for (int i = 0; i < readLoops; i++)
 		{
-			glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
-			readStart += readHeight;
+			memcpy(destAddr, pbo, readStride);
+			pbo += readStride;
 			destAddr += writeStride;
 		}
+
+		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 	}
 	else
+	{
 		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
+	}
 
 	GL_REPORT_ERRORD();