OGL: Support subgroup reduction operations via GL_NV_shader_thread_shuffle

2019-03-22 20:39:11 +10:00 · 2019-03-22 20:39:11 +10:00 · 86da282570
parent 95c7b5c635
commit 86da282570
3 changed files with 31 additions and 2 deletions
--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
@@ -719,6 +719,29 @@ void ProgramShaderCache::CreateHeader()
    break;
  }

+  std::string shader_shuffle_string;
+  if (g_ogl_config.bSupportsShaderThreadShuffleNV)
+  {
+    shader_shuffle_string = R"(
+#extension GL_NV_shader_thread_group : enable
+#extension GL_NV_shader_thread_shuffle : enable
+#define SUPPORTS_SUBGROUP_REDUCTION 1
+
+// The xor shuffle below produces incorrect results if all threads in a warp are not active.
+#define CAN_USE_SUBGROUP_REDUCTION (ballotThreadNV(true) == 0xFFFFFFFFu)
+
+#define IS_HELPER_INVOCATION gl_HelperThreadNV
+#define IS_FIRST_ACTIVE_INVOCATION (gl_ThreadInWarpNV == findLSB(ballotThreadNV(!gl_HelperThreadNV)))
+#define SUBGROUP_REDUCTION(func, value) value = func(value, shuffleXorNV(value, 16, 32)); \
+                                        value = func(value, shuffleXorNV(value, 8, 32)); \
+                                        value = func(value, shuffleXorNV(value, 4, 32)); \
+                                        value = func(value, shuffleXorNV(value, 2, 32)); \
+                                        value = func(value, shuffleXorNV(value, 1, 32));
+#define SUBGROUP_MIN(value) SUBGROUP_REDUCTION(min, value)
+#define SUBGROUP_MAX(value) SUBGROUP_REDUCTION(max, value)
+)";
+  }
+
  s_glsl_header = StringFromFormat(
      "%s\n"
      "%s\n"  // ubo
@@ -737,6 +760,7 @@ void ProgramShaderCache::CreateHeader()
      "%s\n"  // ES dual source blend
      "%s\n"  // shader image load store
      "%s\n"  // shader framebuffer fetch
+      "%s\n"  // shader thread shuffle

      // Precision defines for GLSL ES
      "%s\n"
@@ -815,8 +839,9 @@ void ProgramShaderCache::CreateHeader()
              ((!is_glsles && v < Glsl430) || (is_glsles && v < GlslEs310)) ?
          "#extension GL_ARB_shader_image_load_store : enable" :
          "",
-      framebuffer_fetch_string.c_str(), is_glsles ? "precision highp float;" : "",
-      is_glsles ? "precision highp int;" : "", is_glsles ? "precision highp sampler2DArray;" : "",
+      framebuffer_fetch_string.c_str(), shader_shuffle_string.c_str(),
+      is_glsles ? "precision highp float;" : "", is_glsles ? "precision highp int;" : "",
+      is_glsles ? "precision highp sampler2DArray;" : "",
      (is_glsles && g_ActiveConfig.backend_info.bSupportsPaletteConversion) ?
          "precision highp usamplerBuffer;" :
          "",
--- a/Source/Core/VideoBackends/OGL/Render.cpp
+++ b/Source/Core/VideoBackends/OGL/Render.cpp
@@ -661,6 +661,9 @@ Renderer::Renderer(std::unique_ptr<GLContext> main_gl_context, float backbuffer_
  if (g_ogl_config.max_samples < 1 || !g_ogl_config.bSupportsMSAA)
    g_ogl_config.max_samples = 1;

+  g_ogl_config.bSupportsShaderThreadShuffleNV =
+      GLExtensions::Supports("GL_NV_shader_thread_shuffle");
+
  // We require texel buffers, image load store, and compute shaders to enable GPU texture decoding.
  // If the driver doesn't expose the extensions, but supports GL4.3/GLES3.1, it will still be
  // enabled in the version check below.
--- a/Source/Core/VideoBackends/OGL/Render.h
+++ b/Source/Core/VideoBackends/OGL/Render.h
@@ -70,6 +70,7 @@ struct VideoConfig
  bool bSupportsBitfield;
  bool bSupportsTextureSubImage;
  EsFbFetchType SupportedFramebufferFetch;
+  bool bSupportsShaderThreadShuffleNV;

  const char* gl_vendor;
  const char* gl_renderer;