From 4b2aa948e662aa037ad39f5ee81a64256a4d61aa Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 31 Jan 2023 18:10:48 +0100 Subject: [PATCH] VideoBackend/OGL: Prefer KHR_subgroup over NV_shader_thread. While the NV extension is totally fine, the KHR extension should be able to support more hardware. For NVIDIA, the hardware either supports both or neither; it just needs a driver from the last two years. For AMD, the drivers from late 2022-12 seem to bring support for the KHR extension. For Intel, the KHR extension has also been supported for some years. --- .../Common/GL/GLExtensions/GLExtensions.h | 1 + .../GL/GLExtensions/KHR_shader_subgroup.h | 19 +++++++++++++ Source/Core/VideoBackends/Metal/MTLUtil.mm | 1 - Source/Core/VideoBackends/OGL/OGLConfig.cpp | 28 +++++++++++++++++-- Source/Core/VideoBackends/OGL/OGLConfig.h | 7 +++-- .../VideoBackends/OGL/ProgramShaderCache.cpp | 27 ++++++++---------- .../VideoBackends/Vulkan/ShaderCompiler.cpp | 3 +- Source/Core/VideoCommon/PixelShaderGen.cpp | 13 ++++----- 8 files changed, 66 insertions(+), 33 deletions(-) create mode 100644 Source/Core/Common/GL/GLExtensions/KHR_shader_subgroup.h diff --git a/Source/Core/Common/GL/GLExtensions/GLExtensions.h b/Source/Core/Common/GL/GLExtensions/GLExtensions.h index 98a60ba9cd..f03812cea0 100644 --- a/Source/Core/Common/GL/GLExtensions/GLExtensions.h +++ b/Source/Core/Common/GL/GLExtensions/GLExtensions.h @@ -37,6 +37,7 @@ #include "Common/GL/GLExtensions/EXT_texture_filter_anisotropic.h" #include "Common/GL/GLExtensions/HP_occlusion_test.h" #include "Common/GL/GLExtensions/KHR_debug.h" +#include "Common/GL/GLExtensions/KHR_shader_subgroup.h" #include "Common/GL/GLExtensions/NV_depth_buffer_float.h" #include "Common/GL/GLExtensions/NV_occlusion_query_samples.h" #include "Common/GL/GLExtensions/NV_primitive_restart.h" diff --git a/Source/Core/Common/GL/GLExtensions/KHR_shader_subgroup.h b/Source/Core/Common/GL/GLExtensions/KHR_shader_subgroup.h new file mode 100644 index 0000000000..0738bc3a20 --- 
/dev/null +++ b/Source/Core/Common/GL/GLExtensions/KHR_shader_subgroup.h @@ -0,0 +1,19 @@ +/* +** Copyright (c) 2013-2015 The Khronos Group Inc. +** SPDX-License-Identifier: MIT +*/ + +#include "Common/GL/GLExtensions/gl_common.h" + +#define GL_SUBGROUP_SIZE_KHR 0x9532 +#define GL_SUBGROUP_SUPPORTED_STAGES_KHR 0x9533 +#define GL_SUBGROUP_SUPPORTED_FEATURES_KHR 0x9534 +#define GL_SUBGROUP_QUAD_ALL_STAGES_KHR 0x9535 +#define GL_SUBGROUP_FEATURE_BASIC_BIT_KHR 0x00000001 +#define GL_SUBGROUP_FEATURE_VOTE_BIT_KHR 0x00000002 +#define GL_SUBGROUP_FEATURE_ARITHMETIC_BIT_KHR 0x00000004 +#define GL_SUBGROUP_FEATURE_BALLOT_BIT_KHR 0x00000008 +#define GL_SUBGROUP_FEATURE_SHUFFLE_BIT_KHR 0x00000010 +#define GL_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT_KHR 0x00000020 +#define GL_SUBGROUP_FEATURE_CLUSTERED_BIT_KHR 0x00000040 +#define GL_SUBGROUP_FEATURE_QUAD_BIT_KHR 0x00000080 diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.mm b/Source/Core/VideoBackends/Metal/MTLUtil.mm index 1baf532943..452ddfa13d 100644 --- a/Source/Core/VideoBackends/Metal/MTLUtil.mm +++ b/Source/Core/VideoBackends/Metal/MTLUtil.mm @@ -386,7 +386,6 @@ static const std::string_view SUBGROUP_HELPER_HEADER = R"( #extension GL_KHR_shader_subgroup_ballot : enable #define SUPPORTS_SUBGROUP_REDUCTION 1 -#define CAN_USE_SUBGROUP_REDUCTION true #define IS_HELPER_INVOCATION gl_HelperInvocation #define IS_FIRST_ACTIVE_INVOCATION (subgroupElect()) #define SUBGROUP_MIN(value) value = subgroupMin(value) diff --git a/Source/Core/VideoBackends/OGL/OGLConfig.cpp b/Source/Core/VideoBackends/OGL/OGLConfig.cpp index bafc28841d..10722e0526 100644 --- a/Source/Core/VideoBackends/OGL/OGLConfig.cpp +++ b/Source/Core/VideoBackends/OGL/OGLConfig.cpp @@ -489,7 +489,14 @@ bool PopulateConfig(GLContext* m_main_gl_context) else if (GLExtensions::Version() >= 430) { // TODO: We should really parse the GL_SHADING_LANGUAGE_VERSION token. 
- g_ogl_config.eSupportedGLSLVersion = Glsl430; + if (GLExtensions::Version() >= 450) + { + g_ogl_config.eSupportedGLSLVersion = Glsl450; + } + else + { + g_ogl_config.eSupportedGLSLVersion = Glsl430; + } g_ogl_config.bSupportsTextureStorage = true; g_ogl_config.bSupportsImageLoadStore = true; g_Config.backend_info.bSupportsSSAA = true; @@ -531,8 +538,23 @@ bool PopulateConfig(GLContext* m_main_gl_context) if (g_ogl_config.max_samples < 1 || !g_ogl_config.bSupportsMSAA) g_ogl_config.max_samples = 1; - g_ogl_config.bSupportsShaderThreadShuffleNV = - GLExtensions::Supports("GL_NV_shader_thread_shuffle"); + const bool bSupportsIsHelperInvocation = g_ogl_config.bIsES ? + g_ogl_config.eSupportedGLSLVersion >= GlslEs320 : + g_ogl_config.eSupportedGLSLVersion >= Glsl450; + g_ogl_config.bSupportsKHRShaderSubgroup = + GLExtensions::Supports("GL_KHR_shader_subgroup") && bSupportsIsHelperInvocation; + if (g_ogl_config.bSupportsKHRShaderSubgroup) + { + // Check for the features: basic + arithmetic + ballot + GLint supported_features = 0; + glGetIntegerv(GL_SUBGROUP_SUPPORTED_FEATURES_KHR, &supported_features); + if (~supported_features & + (GL_SUBGROUP_FEATURE_BASIC_BIT_KHR | GL_SUBGROUP_FEATURE_ARITHMETIC_BIT_KHR | + GL_SUBGROUP_FEATURE_BALLOT_BIT_KHR)) + { + g_ogl_config.bSupportsKHRShaderSubgroup = false; + } + } // We require texel buffers, image load store, and compute shaders to enable GPU texture decoding. 
// If the driver doesn't expose the extensions, but supports GL4.3/GLES3.1, it will still be diff --git a/Source/Core/VideoBackends/OGL/OGLConfig.h b/Source/Core/VideoBackends/OGL/OGLConfig.h index 2aece68896..570e1954eb 100644 --- a/Source/Core/VideoBackends/OGL/OGLConfig.h +++ b/Source/Core/VideoBackends/OGL/OGLConfig.h @@ -15,8 +15,9 @@ enum GlslVersion Glsl140, Glsl150, Glsl330, - Glsl400, // and above - Glsl430, + Glsl400, // and above + Glsl430, // 430 - 440 + Glsl450, // 450 - xxx GlslEs300, // GLES 3.0 GlslEs310, // GLES 3.1 GlslEs320, // GLES 3.2 @@ -61,7 +62,7 @@ struct VideoConfig bool bSupportsBitfield; bool bSupportsTextureSubImage; EsFbFetchType SupportedFramebufferFetch; - bool bSupportsShaderThreadShuffleNV; + bool bSupportsKHRShaderSubgroup; // basic + arithmetic + ballot const char* gl_vendor; const char* gl_renderer; diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 289e99cae0..eff1497e7f 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -78,6 +78,8 @@ static std::string GetGLSLVersionString() return "#version 400"; case Glsl430: return "#version 430"; + case Glsl450: + return "#version 450"; default: // Shouldn't ever hit this return "#version ERROR"; @@ -720,25 +722,18 @@ void ProgramShaderCache::CreateHeader() } std::string shader_shuffle_string; - if (g_ogl_config.bSupportsShaderThreadShuffleNV) + if (g_ogl_config.bSupportsKHRShaderSubgroup) { shader_shuffle_string = R"( -#extension GL_NV_shader_thread_group : enable -#extension GL_NV_shader_thread_shuffle : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#extension GL_KHR_shader_subgroup_ballot : enable + #define SUPPORTS_SUBGROUP_REDUCTION 1 - -// The xor shuffle below produces incorrect results if all threads in a warp are not active. 
-#define CAN_USE_SUBGROUP_REDUCTION (ballotThreadNV(true) == 0xFFFFFFFFu) - -#define IS_HELPER_INVOCATION gl_HelperThreadNV -#define IS_FIRST_ACTIVE_INVOCATION (gl_ThreadInWarpNV == findLSB(ballotThreadNV(!gl_HelperThreadNV))) -#define SUBGROUP_REDUCTION(func, value) value = func(value, shuffleXorNV(value, 16, 32)); \ - value = func(value, shuffleXorNV(value, 8, 32)); \ - value = func(value, shuffleXorNV(value, 4, 32)); \ - value = func(value, shuffleXorNV(value, 2, 32)); \ - value = func(value, shuffleXorNV(value, 1, 32)); -#define SUBGROUP_MIN(value) SUBGROUP_REDUCTION(min, value) -#define SUBGROUP_MAX(value) SUBGROUP_REDUCTION(max, value) +#define IS_HELPER_INVOCATION gl_HelperInvocation +#define IS_FIRST_ACTIVE_INVOCATION (subgroupElect()) +#define SUBGROUP_MIN(value) value = subgroupMin(value) +#define SUBGROUP_MAX(value) value = subgroupMax(value) )"; } diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp index 56d20caf43..49b007f43b 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp +++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp @@ -81,9 +81,8 @@ static const char SUBGROUP_HELPER_HEADER[] = R"( #extension GL_KHR_shader_subgroup_ballot : enable #define SUPPORTS_SUBGROUP_REDUCTION 1 - #define CAN_USE_SUBGROUP_REDUCTION true #define IS_HELPER_INVOCATION gl_HelperInvocation - #define IS_FIRST_ACTIVE_INVOCATION (gl_SubgroupInvocationID == subgroupBallotFindLSB(subgroupBallot(!gl_HelperInvocation))) + #define IS_FIRST_ACTIVE_INVOCATION (subgroupElect()) #define SUBGROUP_MIN(value) value = subgroupMin(value) #define SUBGROUP_MAX(value) value = subgroupMax(value) )"; diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index df6f5dfab8..4dcd681185 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -457,15 +457,12 @@ void UpdateBoundingBox(float2 rawpos) {{ int2 pos_br = pos 
| 1; // round up to odd #ifdef SUPPORTS_SUBGROUP_REDUCTION - if (CAN_USE_SUBGROUP_REDUCTION) {{ - int2 min_pos = IS_HELPER_INVOCATION ? int2(2147483647, 2147483647) : pos_tl; - int2 max_pos = IS_HELPER_INVOCATION ? int2(-2147483648, -2147483648) : pos_br; - SUBGROUP_MIN(min_pos); - SUBGROUP_MAX(max_pos); + if (!IS_HELPER_INVOCATION) + {{ + SUBGROUP_MIN(pos_tl); + SUBGROUP_MAX(pos_br); if (IS_FIRST_ACTIVE_INVOCATION) - UpdateBoundingBoxBuffer(min_pos, max_pos); - }} else {{ - UpdateBoundingBoxBuffer(pos_tl, pos_br); + UpdateBoundingBoxBuffer(pos_tl, pos_br); }} #else UpdateBoundingBoxBuffer(pos_tl, pos_br);