OGL: implement Bounding Box on systems w/o SSBO

This commit should have zero performance effect if SSBOs are supported.

If they aren't (e.g. on all Macs), this commit alters FramebufferManager
to attach a new stencil buffer and VertexManager to draw to it when
bounding box is active. `BBoxRead` gets the pixel data from the buffer
and dumbly loops through it to find the bounding box.

This patch can run Paper Mario: The Thousand-Year Door at almost full
speed (50–60 FPS) without Dual-Core enabled for all common bounding
box-using actions I tested (going through pipes, Plane Mode, Paper
Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.)
on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz
DDR3, and Intel Iris 1536 MB).

A few more demanding scenes (e.g. the self-building bridge on the way
to Petalburg) slow to ~15% of the speed they reach without this patch
(though they don't run quite at full speed even on master). The slowdown
is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`.

Other implementation ideas:

- Use a stencil buffer that's separate from the depth buffer. This would
  require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on
  macOS.

- Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower
  on my computer, presumably because it has to transfer the entire
  combined depth-stencil buffer instead of only the stencil data.
  Getting only stencil data from `glGetTexImage` requires
  ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on
  macOS.

- Don't use a PBO, and use `glReadPixels` synchronously. This has no
  visible performance effect on my computer, and is theoretically
  slower.
This commit is contained in:
Michael Maltese 2017-03-05 15:34:30 -08:00
parent 56fe938366
commit ba6e917b49
7 changed files with 184 additions and 36 deletions

View File

@ -2,20 +2,45 @@
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <cstring>
#include "Common/GL/GLUtil.h"
#include "VideoBackends/OGL/BoundingBox.h"
#include "VideoBackends/OGL/FramebufferManager.h"
#include "VideoCommon/DriverDetails.h"
#include "VideoCommon/VideoConfig.h"
static GLuint s_bbox_buffer_id;
static GLuint s_pbo;
static std::array<int, 4> s_stencil_bounds;
static bool s_stencil_updated;
static bool s_stencil_cleared;
static int s_target_width;
static int s_target_height;
namespace OGL
{
void BoundingBox::Init()
// Records the new render-target dimensions and (re)allocates the pixel-pack buffer
// with one byte per target pixel. Does nothing when fragment stores and atomics are
// supported, because the pixel-pack buffer is only used by the stencil fallback.
void BoundingBox::SetTargetSizeChanged(int target_width, int target_height)
{
  const bool uses_stencil_fallback =
      !g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics;
  if (uses_stencil_fallback)
  {
    // The cached-bounds "dirty" flag is reset together with the dimensions.
    s_stencil_updated = false;
    s_target_width = target_width;
    s_target_height = target_height;

    // Passing nullptr only reserves storage; no pixel data is uploaded here.
    const int pixel_count = s_target_width * s_target_height;
    glBindBuffer(GL_PIXEL_PACK_BUFFER, s_pbo);
    glBufferData(GL_PIXEL_PACK_BUFFER, pixel_count, nullptr, GL_STREAM_READ);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
  }
}
void BoundingBox::Init(int target_width, int target_height)
{
if (g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics)
{
@ -25,6 +50,12 @@ void BoundingBox::Init()
glBufferData(GL_SHADER_STORAGE_BUFFER, 4 * sizeof(s32), initial_values, GL_DYNAMIC_DRAW);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, s_bbox_buffer_id);
}
else
{
s_stencil_bounds = {{0, 0, 0, 0}};
glGenBuffers(1, &s_pbo);
SetTargetSizeChanged(target_width, target_height);
}
}
void BoundingBox::Shutdown()
@ -33,40 +64,107 @@ void BoundingBox::Shutdown()
{
glDeleteBuffers(1, &s_bbox_buffer_id);
}
else
{
glDeleteBuffers(1, &s_pbo);
}
}
void BoundingBox::Set(int index, int value)
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int), &value);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
if (g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics)
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int), &value);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
else
{
s_stencil_bounds[index] = value;
if (!s_stencil_cleared)
{
// Assumes that the EFB framebuffer is currently bound
glClearStencil(0);
glClear(GL_STENCIL_BUFFER_BIT);
s_stencil_updated = false;
s_stencil_cleared = true;
}
}
}
int BoundingBox::Get(int index)
{
int data = 0;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
if (!DriverDetails::HasBug(DriverDetails::BUG_SLOW_GETBUFFERSUBDATA))
if (g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics)
{
// Using glMapBufferRange to read back the contents of the SSBO is extremely slow
// on nVidia drivers. This is more noticeable at higher internal resolutions.
// Using glGetBufferSubData instead does not seem to exhibit this slowdown.
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int), &data);
int data = 0;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, s_bbox_buffer_id);
if (!DriverDetails::HasBug(DriverDetails::BUG_SLOW_GETBUFFERSUBDATA))
{
// Using glMapBufferRange to read back the contents of the SSBO is extremely slow
// on nVidia drivers. This is more noticeable at higher internal resolutions.
// Using glGetBufferSubData instead does not seem to exhibit this slowdown.
glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int), &data);
}
else
{
// Using glMapBufferRange is faster on AMD cards by a measurable margin.
void* ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int),
GL_MAP_READ_BIT);
if (ptr)
{
memcpy(&data, ptr, sizeof(int));
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
}
}
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
return data;
}
else
{
// Using glMapBufferRange is faster on AMD cards by a measurable margin.
void* ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, index * sizeof(int), sizeof(int),
GL_MAP_READ_BIT);
if (ptr)
if (s_stencil_updated)
{
memcpy(&data, ptr, sizeof(int));
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
}
}
s_stencil_updated = false;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
return data;
FramebufferManager::ResolveEFBStencilTexture();
glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferManager::GetResolvedFramebuffer());
glBindBuffer(GL_PIXEL_PACK_BUFFER, s_pbo);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
glReadPixels(0, 0, s_target_width, s_target_height, GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, 0);
glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferManager::GetEFBFramebuffer());
// Eke every bit of performance out of the compiler that we can
std::array<int, 4> bounds = s_stencil_bounds;
u8* data = static_cast<u8*>(glMapBufferRange(
GL_PIXEL_PACK_BUFFER, 0, s_target_height * s_target_width, GL_MAP_READ_BIT));
for (int row = 0; row < s_target_height; row++)
{
for (int col = 0; col < s_target_width; col++)
{
if (data[row * s_target_width + col] == 0)
continue;
bounds[0] = std::min(bounds[0], col);
bounds[1] = std::max(bounds[1], col);
bounds[2] = std::min(bounds[2], row);
bounds[3] = std::max(bounds[3], row);
}
}
s_stencil_bounds = bounds;
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}
return s_stencil_bounds[index];
}
}
// Notes that something was drawn to the stencil buffer: it is no longer in its cleared
// state, and the cached bounds have to be recomputed the next time they are read.
void BoundingBox::StencilWasUpdated()
{
  s_stencil_cleared = false;
  s_stencil_updated = true;
}
};

View File

@ -9,9 +9,16 @@ namespace OGL
// OpenGL bounding box emulation. Uses a shader storage buffer when fragment stores and
// atomics are supported; otherwise falls back to reading the bounds out of the stencil buffer.
class BoundingBox
{
public:
static void Init();
// Sets up bounding box state. On the stencil fallback path this creates the pixel-pack
// buffer and sizes it for a target_width x target_height render target.
static void Init(int target_width, int target_height);
// Deletes the GL buffer that Init created.
static void Shutdown();
// Call when the render target is resized so the stencil readback buffer matches the new
// dimensions. Does nothing when SSBOs are supported.
static void SetTargetSizeChanged(int target_width, int target_height);
// When SSBO isn't available, the bounding box is calculated directly from the
// stencil buffer. When the stencil buffer is changed, this function needs to
// be called to invalidate the cached bounding box data.
static void StencilWasUpdated();
// Stores `value` as bound number `index`.
static void Set(int index, int value);
// Returns bound number `index`. On the stencil fallback path, indices 0/1 are the
// min/max column and 2/3 the min/max row of the non-zero stencil pixels.
static int Get(int index);
};

View File

@ -152,12 +152,13 @@ FramebufferManager::FramebufferManager(int targetWidth, int targetHeight, int ms
m_resolvedColorTexture = CreateTexture(resolvedType, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE);
m_resolvedDepthTexture =
CreateTexture(resolvedType, GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT);
CreateTexture(resolvedType, GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
// Bind resolved textures to resolved framebuffer.
glGenFramebuffers(m_EFBLayers, m_resolvedFramebuffer.data());
BindLayeredTexture(m_resolvedColorTexture, m_resolvedFramebuffer, GL_COLOR_ATTACHMENT0, resolvedType);
BindLayeredTexture(m_resolvedDepthTexture, m_resolvedFramebuffer, GL_DEPTH_ATTACHMENT, resolvedType);
BindLayeredTexture(m_resolvedDepthTexture, m_resolvedFramebuffer, GL_STENCIL_ATTACHMENT, resolvedType);
}
m_efbColor = CreateTexture(m_textureType, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE);
@ -172,13 +173,15 @@ FramebufferManager::FramebufferManager(int targetWidth, int targetHeight, int ms
glGenFramebuffers(m_EFBLayers, m_efbFramebuffer.data());
BindLayeredTexture(m_efbColor, m_efbFramebuffer, GL_COLOR_ATTACHMENT0, m_textureType);
BindLayeredTexture(m_efbDepth, m_efbFramebuffer, GL_DEPTH_ATTACHMENT, m_textureType);
BindLayeredTexture(m_efbDepth, m_efbFramebuffer, GL_STENCIL_ATTACHMENT, m_textureType);
// EFB framebuffer is currently bound, make sure to clear it before use.
glViewport(0, 0, m_targetWidth, m_targetHeight);
glScissor(0, 0, m_targetWidth, m_targetHeight);
glClearColor(0.f, 0.f, 0.f, 0.f);
glClearDepthf(1.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glClearStencil(0);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
// reinterpret pixel format
const char* vs = m_EFBLayers > 1 ? "void main(void) {\n"
@ -478,6 +481,24 @@ GLuint FramebufferManager::GetEFBDepthTexture(const EFBRectangle& sourceRc)
}
}
void FramebufferManager::ResolveEFBStencilTexture()
{
if (m_msaaSamples <= 1)
return;
// Resolve.
for (unsigned int i = 0; i < m_EFBLayers; i++)
{
glBindFramebuffer(GL_READ_FRAMEBUFFER, m_efbFramebuffer[i]);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_resolvedFramebuffer[i]);
glBlitFramebuffer(0, 0, m_targetWidth, m_targetHeight, 0, 0, m_targetWidth, m_targetHeight,
GL_STENCIL_BUFFER_BIT, GL_NEAREST);
}
// Return to EFB.
glBindFramebuffer(GL_FRAMEBUFFER, m_efbFramebuffer[0]);
}
void FramebufferManager::CopyToRealXFB(u32 xfbAddr, u32 fbStride, u32 fbHeight,
const EFBRectangle& sourceRc, float Gamma)
{
@ -493,6 +514,13 @@ void FramebufferManager::CopyToRealXFB(u32 xfbAddr, u32 fbStride, u32 fbHeight,
sourceRc.GetWidth(), fbStride, fbHeight);
}
// Returns layer 0 of the resolved framebuffer when more than one MSAA sample is in use;
// otherwise returns layer 0 of the EFB framebuffer.
GLuint FramebufferManager::GetResolvedFramebuffer()
{
  return m_msaaSamples > 1 ? m_resolvedFramebuffer[0] : m_efbFramebuffer[0];
}
void FramebufferManager::SetFramebuffer(GLuint fb)
{
glBindFramebuffer(GL_FRAMEBUFFER, fb != 0 ? fb : GetEFBFramebuffer());

View File

@ -70,6 +70,7 @@ public:
// the EFB to a resolved texture first.
static GLuint GetEFBColorTexture(const EFBRectangle& sourceRc);
static GLuint GetEFBDepthTexture(const EFBRectangle& sourceRc);
static void ResolveEFBStencilTexture();
static GLuint GetEFBFramebuffer(unsigned int layer = 0)
{
@ -77,7 +78,7 @@ public:
}
static GLuint GetXFBFramebuffer() { return m_xfbFramebuffer; }
// Resolved framebuffer is only used in MSAA mode.
static GLuint GetResolvedFramebuffer() { return m_resolvedFramebuffer[0]; }
static GLuint GetResolvedFramebuffer();
static void SetFramebuffer(GLuint fb);
static void FramebufferTexture(GLenum target, GLenum attachment, GLenum textarget, GLuint texture,
GLint level);

View File

@ -411,7 +411,8 @@ Renderer::Renderer()
g_Config.backend_info.bSupportsPrimitiveRestart =
!DriverDetails::HasBug(DriverDetails::BUG_PRIMITIVE_RESTART) &&
((GLExtensions::Version() >= 310) || GLExtensions::Supports("GL_NV_primitive_restart"));
g_Config.backend_info.bSupportsBBox = g_Config.backend_info.bSupportsFragmentStoresAndAtomics =
g_Config.backend_info.bSupportsBBox = true;
g_Config.backend_info.bSupportsFragmentStoresAndAtomics =
GLExtensions::Supports("GL_ARB_shader_storage_buffer_object");
g_Config.backend_info.bSupportsGSInstancing = GLExtensions::Supports("GL_ARB_gpu_shader5");
g_Config.backend_info.bSupportsSSAA = GLExtensions::Supports("GL_ARB_gpu_shader5") &&
@ -497,7 +498,6 @@ Renderer::Renderer()
g_Config.backend_info.bSupportsGSInstancing =
g_Config.backend_info.bSupportsGeometryShaders && g_ogl_config.SupportedESPointSize > 0;
g_Config.backend_info.bSupportsSSAA = g_ogl_config.bSupportsAEP;
g_Config.backend_info.bSupportsBBox = true;
g_Config.backend_info.bSupportsFragmentStoresAndAtomics = true;
g_ogl_config.bSupportsMSAA = true;
g_ogl_config.bSupports2DTextureStorage = true;
@ -519,7 +519,6 @@ Renderer::Renderer()
g_Config.backend_info.bSupportsGSInstancing = g_ogl_config.SupportedESPointSize > 0;
g_Config.backend_info.bSupportsPaletteConversion = true;
g_Config.backend_info.bSupportsSSAA = true;
g_Config.backend_info.bSupportsBBox = true;
g_Config.backend_info.bSupportsFragmentStoresAndAtomics = true;
g_ogl_config.bSupportsCopySubImage = true;
g_ogl_config.bSupportsGLBaseVertex = true;
@ -657,10 +656,13 @@ Renderer::Renderer()
// options while running
g_Config.bRunning = true;
glStencilFunc(GL_ALWAYS, 0, 0);
glBlendFunc(GL_ONE, GL_ONE);
// The stencil is used for bounding box emulation when SSBOs are not available
glDisable(GL_STENCIL_TEST);
glStencilFunc(GL_ALWAYS, 1, 0xFF);
glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE);
glViewport(0, 0, GetTargetWidth(), GetTargetHeight()); // Reset The Current Viewport
// Reset The Current Viewport
glViewport(0, 0, GetTargetWidth(), GetTargetHeight());
if (g_ActiveConfig.backend_info.bSupportsClipControl)
glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
@ -677,10 +679,9 @@ Renderer::Renderer()
glPixelStorei(GL_UNPACK_ALIGNMENT, 4); // 4-byte pixel alignment
glDisable(GL_STENCIL_TEST);
glEnable(GL_SCISSOR_TEST);
glScissor(0, 0, GetTargetWidth(), GetTargetHeight());
glBlendFunc(GL_ONE, GL_ONE);
glBlendColor(0, 0, 0, 0.5f);
glClearDepthf(1.0f);
@ -1364,6 +1365,7 @@ void Renderer::SwapImpl(u32 xfbAddr, u32 fbWidth, u32 fbStride, u32 fbHeight,
g_framebuffer_manager.reset();
g_framebuffer_manager =
std::make_unique<FramebufferManager>(m_target_width, m_target_height, s_MSAASamples);
BoundingBox::SetTargetSizeChanged(m_target_width, m_target_height);
}
}

View File

@ -14,9 +14,11 @@
#include "Common/GL/GLExtensions/GLExtensions.h"
#include "Common/StringUtil.h"
#include "VideoBackends/OGL/BoundingBox.h"
#include "VideoBackends/OGL/ProgramShaderCache.h"
#include "VideoBackends/OGL/Render.h"
#include "VideoBackends/OGL/StreamBuffer.h"
#include "VideoCommon/BoundingBox.h"
#include "VideoCommon/IndexGenerator.h"
#include "VideoCommon/Statistics.h"
@ -156,8 +158,19 @@ void VertexManager::vFlush()
// setup the pointers
nativeVertexFmt->SetupVertexPointers();
if (!g_Config.backend_info.bSupportsFragmentStoresAndAtomics && ::BoundingBox::active)
{
glEnable(GL_STENCIL_TEST);
}
Draw(stride);
if (!g_Config.backend_info.bSupportsFragmentStoresAndAtomics && ::BoundingBox::active)
{
OGL::BoundingBox::StencilWasUpdated();
glDisable(GL_STENCIL_TEST);
}
#if defined(_DEBUG) || defined(DEBUGFAST)
if (g_ActiveConfig.iLog & CONF_SAVESHADERS)
{
@ -177,7 +190,6 @@ void VertexManager::vFlush()
}
#endif
g_Config.iSaveTargetId++;
ClearEFBCache();
}

View File

@ -212,7 +212,7 @@ void VideoBackend::Video_Prepare()
g_sampler_cache = std::make_unique<SamplerCache>();
static_cast<Renderer*>(g_renderer.get())->Init();
TextureConverter::Init();
BoundingBox::Init();
BoundingBox::Init(g_renderer->GetTargetWidth(), g_renderer->GetTargetHeight());
}
void VideoBackend::Shutdown()