OpenGL: readback efb2ram with different strides at once

This is done with a pixel buffer object. We still have to stall the GPU, but
we only do it once per efb2ram call.
Because the CPU can't access VRAM directly, it has to queue a memcpy on the GPU
and wait for the GPU to finish this copy. We previously did this for every cache
line, which is needlessly slow. Now we copy the complete texture into a PBO and
read it back all at once, so we no longer have to wait for many round trips.
This commit is contained in:
degasus 2013-11-26 20:05:49 +01:00
parent db9c586356
commit 95aeedec19
1 changed files with 26 additions and 8 deletions

View File

@ -44,6 +44,8 @@ static GLuint s_encode_VBO = 0;
static GLuint s_encode_VAO = 0;
static TargetRectangle s_cached_sourceRc;
static GLuint s_PBO = 0; // for readback with different strides
static const char *VProgram =
"ATTRIN vec2 rawpos;\n"
"ATTRIN vec2 tex0;\n"
@ -186,6 +188,8 @@ void Init()
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glGenBuffers(1, &s_PBO);
CreatePrograms();
}
@ -196,6 +200,7 @@ void Shutdown()
glDeleteFramebuffers(1, &s_texConvFrameBuffer);
glDeleteBuffers(1, &s_encode_VBO );
glDeleteVertexArrays(1, &s_encode_VAO );
glDeleteBuffers(1, &s_PBO);
s_rgbToYuyvProgram.Destroy();
s_yuyvToRgbProgram.Destroy();
@ -206,6 +211,7 @@ void Shutdown()
s_srcTexture = 0;
s_dstTexture = 0;
s_texConvFrameBuffer = 0;
s_PBO = 0;
}
void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
@ -267,25 +273,37 @@ void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
// TODO: make this less slow.
int writeStride = bpmem.copyMipMapStrideChannels * 32;
int dstSize = dstWidth*dstHeight*4;
int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel
int readLoops = dstHeight / readHeight;
if (writeStride != readStride && toTexture)
if (writeStride != readStride && readLoops > 1 && toTexture)
{
// writing to a texture of a different size
// also copy more than one block line, so the different strides matter
// copy into one pbo first, map this buffer, and then memcpy into gc memory
// this way, we only have one vram->ram transfer, but maybe a bigger
// cpu overhead because of the pbo
glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO);
glBufferData(GL_PIXEL_PACK_BUFFER, dstSize, NULL, GL_STREAM_READ);
glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0);
u8* pbo = (u8*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, dstSize, GL_MAP_READ_BIT);
int readHeight = readStride / dstWidth;
readHeight /= 4; // 4 bytes per pixel
int readStart = 0;
int readLoops = dstHeight / readHeight;
//int readStart = 0;
for (int i = 0; i < readLoops; i++)
{
glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
readStart += readHeight;
memcpy(destAddr, pbo, readStride);
pbo += readStride;
destAddr += writeStride;
}
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}
else
{
glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
}
GL_REPORT_ERRORD();