Merge pull request #10215 from OatmealDome/shader-logic-ops

VideoCommon: Support shader logic ops on Metal (Apple GPUs) and OpenGL ES
This commit is contained in:
JMC47 2021-12-22 16:39:54 -05:00 committed by GitHub
commit b1f79d9ecf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 255 additions and 46 deletions

View File

@ -8,6 +8,8 @@ ExternalProject_Add(MoltenVK
CONFIGURE_COMMAND ${CMAKE_CURRENT_LIST_DIR}/configure.sh <LOG_DIR> <SOURCE_DIR> ${MOLTENVK_VERSION}
PATCH_COMMAND ${CMAKE_CURRENT_LIST_DIR}/patch.sh ${CMAKE_SOURCE_DIR}/Externals/MoltenVK/patches/ ${MOLTENVK_VERSION}
BUILD_COMMAND make -C <SOURCE_DIR> macos
BUILD_IN_SOURCE ON
BUILD_BYPRODUCTS <SOURCE_DIR>/Package/Release/MoltenVK/dylib/macOS/libMoltenVK.dylib

13
Externals/MoltenVK/patch.sh vendored Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Applies all patches in the "patches" folder to the cloned MoltenVK git repository.
#
# Usage: patch.sh <patches folder> <MoltenVK version>
#
#   <patches folder>    Directory whose *.patch files are applied.
#   <MoltenVK version>  Git revision the repository is hard-reset to before patching.
#
# The git commands below are invoked without -C, so they act on the repository that
# contains the current working directory.
#

set -e

# Fail early with a usage message instead of letting git run with missing arguments.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <patches folder> <MoltenVK version>" >&2
  exit 1
fi

# Reset the git repository first to ensure that it's in the base state.
git reset --hard "$2"

# The folder is quoted so that a path containing whitespace is not word-split; the glob
# is left outside the quotes so it still expands to every patch file in the folder.
git apply "$1"/*.patch

View File

@ -0,0 +1,24 @@
From 4ca33b7a9b149c6fbcc1c88ce08fc49f21294f6d Mon Sep 17 00:00:00 2001
From: OatmealDome <julian@oatmealdome.me>
Date: Sat, 31 Jul 2021 19:18:35 -0400
Subject: [PATCH] SPIRVToMSLConverter: Enable use_framebuffer_fetch_subpasses
---
.../MoltenVKShaderConverter/SPIRVToMSLConverter.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp
index 17c79394..97e98004 100644
--- a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp
+++ b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp
@@ -92,6 +92,7 @@ MVK_PUBLIC_SYMBOL SPIRVToMSLConversionOptions::SPIRVToMSLConversionOptions() {
#endif
mslOptions.pad_fragment_output_components = true;
+ mslOptions.use_framebuffer_fetch_subpasses = true;
}
MVK_PUBLIC_SYMBOL bool mvk::MSLShaderInput::matches(const mvk::MSLShaderInput& other) const {
--
2.30.1 (Apple Git-130)

View File

@ -692,7 +692,6 @@ void ProgramShaderCache::CreateHeader()
{
case EsFbFetchType::FbFetchExt:
framebuffer_fetch_string = "#extension GL_EXT_shader_framebuffer_fetch: enable\n"
"#define FB_FETCH_VALUE real_ocol0\n"
"#define FRAGMENT_INOUT inout";
break;
case EsFbFetchType::FbFetchArm:

View File

@ -50,9 +50,13 @@ static const char SHADER_HEADER[] = R"(
#define SAMPLER_BINDING(x) layout(set = 1, binding = x)
#define TEXEL_BUFFER_BINDING(x) layout(set = 1, binding = (x + 8))
#define SSBO_BINDING(x) layout(set = 2, binding = x)
#define INPUT_ATTACHMENT_BINDING(x, y, z) layout(set = x, binding = y, input_attachment_index = z)
#define VARYING_LOCATION(x) layout(location = x)
#define FORCE_EARLY_Z layout(early_fragment_tests) in
// Metal framebuffer fetch helpers.
#define FB_FETCH_VALUE subpassLoad(in_ocol0)
// hlsl to glsl function translation
#define API_VULKAN 1
#define float2 vec2

View File

@ -286,7 +286,7 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config)
config->backend_info.bSupportsBPTCTextures = false; // Dependent on features.
config->backend_info.bSupportsLogicOp = false; // Dependent on features.
config->backend_info.bSupportsLargePoints = false; // Dependent on features.
config->backend_info.bSupportsFramebufferFetch = false; // No support.
config->backend_info.bSupportsFramebufferFetch = false; // Dependent on OS and features.
config->backend_info.bSupportsCoarseDerivatives = true; // Assumed support.
config->backend_info.bSupportsTextureQueryLevels = true; // Assumed support.
}
@ -340,6 +340,15 @@ void VulkanContext::PopulateBackendInfoFeatures(VideoConfig* config, VkPhysicalD
properties.limits.pointSizeRange[0] <= 1.0f &&
properties.limits.pointSizeRange[1] >= 16;
std::string device_name = properties.deviceName;
u32 vendor_id = properties.vendorID;
// Only Apple family GPUs support framebuffer fetch.
if (vendor_id == 0x106B || device_name.find("Apple") != std::string::npos)
{
config->backend_info.bSupportsFramebufferFetch = true;
}
// Our usage of primitive restart appears to be broken on AMD's binary drivers.
// Seems to be fine on GCN Gen 1-2, unconfirmed on GCN Gen 3, causes driver resets on GCN Gen 4.
if (DriverDetails::HasBug(DriverDetails::BUG_PRIMITIVE_RESTART))

View File

@ -15,6 +15,7 @@ using int4 = std::array<s32, 4>;
enum class DstBlendFactor : u32;
enum class SrcBlendFactor : u32;
enum class ZTexOp : u32;
enum class LogicOp : u32;
struct PixelShaderConstants
{
@ -54,6 +55,9 @@ struct PixelShaderConstants
DstBlendFactor blend_dst_factor_alpha;
u32 blend_subtract;
u32 blend_subtract_alpha;
// For shader_framebuffer_fetch logic ops:
u32 logic_op_enable; // bool
LogicOp logic_op_mode;
};
struct VertexShaderConstants

View File

@ -19,7 +19,7 @@ namespace VideoCommon
// As pipelines encompass both shader UIDs and render states, changes to either of these should
// also increment the pipeline UID version. Incrementing the UID version will cause all UID
// caches to be invalidated.
constexpr u32 GX_PIPELINE_UID_VERSION = 3; // Last changed in PR 9532
constexpr u32 GX_PIPELINE_UID_VERSION = 4; // Last changed in PR 10215
struct GXPipelineUid
{

View File

@ -332,6 +332,9 @@ PixelShaderUid GetPixelShaderUid()
uid_data->blend_subtract_alpha = state.subtractAlpha;
}
uid_data->logic_op_enable = state.logicopenable;
uid_data->logic_op_mode = u32(state.logicmode.Value());
return out;
}
@ -424,6 +427,8 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
"\tuint blend_dst_factor_alpha;\n"
"\tbool blend_subtract;\n"
"\tbool blend_subtract_alpha;\n"
"\tbool logic_op_enable;\n"
"\tuint logic_op_mode;\n"
"}};\n\n");
out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n"
"#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n"
@ -838,6 +843,7 @@ static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBia
static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type,
bool per_pixel_depth, bool use_dual_source);
static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data);
static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data);
static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
bool use_dual_source);
static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data);
@ -926,40 +932,58 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
uid_data->useDstAlpha);
const bool use_shader_blend =
!use_dual_source && (uid_data->useDstAlpha && host_config.backend_shader_framebuffer_fetch);
const bool use_shader_logic_op =
!host_config.backend_logic_op && host_config.backend_shader_framebuffer_fetch;
if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
{
if (use_dual_source)
bool use_framebuffer_fetch = use_shader_blend || use_shader_logic_op;
#ifdef __APPLE__
// Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK)
// if we want to use it.
if (api_type == APIType::Vulkan)
{
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"
"FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n");
}
else
if (use_dual_source)
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n"
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n");
}
}
else if (use_shader_blend)
{
// QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
// intermediate value with multiple reads & modifications, so pull out the "real" output value
// and use a temporary for calculations, then set the output value once at the end of the
// shader
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
else if (use_shader_blend)
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n");
// Metal doesn't support a single unified variable for both input and output, so we declare
// the output separately. The input will be defined later below.
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 real_ocol0;\n");
}
else
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n");
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
}
if (use_framebuffer_fetch)
{
// Subpass inputs will be converted to framebuffer fetch by SPIRV-Cross.
out.Write("INPUT_ATTACHMENT_BINDING(0, 0, 0) uniform subpassInput in_ocol0;\n");
}
}
else
#endif
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
bool has_broken_decoration =
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION);
out.Write("{} {} vec4 {};\n",
has_broken_decoration ? "FRAGMENT_OUTPUT_LOCATION(0)" :
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0)",
use_framebuffer_fetch ? "FRAGMENT_INOUT" : "out",
use_shader_blend ? "real_ocol0" : "ocol0");
if (use_dual_source)
{
out.Write("{} out vec4 ocol1;\n", has_broken_decoration ?
"FRAGMENT_OUTPUT_LOCATION(1)" :
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1)");
}
}
if (uid_data->per_pixel_depth)
@ -1005,11 +1029,28 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
out.Write("void main()\n{{\n");
out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
if (use_framebuffer_fetch)
{
// Store off a copy of the initial framebuffer value.
//
// If FB_FETCH_VALUE isn't defined (i.e. there is no special keyword for fetching from the
// framebuffer), we read from real_ocol0 or ocol0, depending on whether shader blending is enabled.
out.Write("#ifdef FB_FETCH_VALUE\n"
"\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n"
"#else\n"
"\tfloat4 initial_ocol0 = {};\n"
"#endif\n",
use_shader_blend ? "real_ocol0" : "ocol0");
}
if (use_shader_blend)
{
// Store off a copy of the initial fb value for blending
out.Write("\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n"
"\tfloat4 ocol0;\n"
// QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
// intermediate value with multiple reads & modifications, so we pull out the "real" output
// value above and use a temporary for calculations, then set the output value once at the
// end of the shader if we are using shader blending.
out.Write("\tfloat4 ocol0;\n"
"\tfloat4 ocol1;\n");
}
}
@ -1260,6 +1301,9 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
WriteFog(out, uid_data);
if (use_shader_logic_op)
WriteLogicOp(out, uid_data);
// Write the color and alpha values to the framebuffer
// If using shader blend, we still use the separate alpha
WriteColor(out, api_type, uid_data, use_dual_source || use_shader_blend);
@ -1876,6 +1920,34 @@ static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data)
out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n");
}
// Writes the shader statements that apply the selected logic op to `prev`, combining it
// with the framebuffer value captured in `initial_ocol0`.
//
// Nothing is written when uid_data->logic_op_enable is zero.
static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data)
{
  if (!uid_data->logic_op_enable)
    return;

  // Shader expression for each logic op, indexed by uid_data->logic_op_mode.
  // The UID field is 4 bits wide, so every possible value has an entry.
  static constexpr std::array<const char*, 16> logic_op_mode{
      "int4(0, 0, 0, 0)",          // CLEAR
      "prev & fb_value",           // AND
      "prev & ~fb_value",          // AND_REVERSE
      "prev",                      // COPY
      "~prev & fb_value",          // AND_INVERTED
      "fb_value",                  // NOOP
      "prev ^ fb_value",           // XOR
      "prev | fb_value",           // OR
      "~(prev | fb_value)",        // NOR
      "~(prev ^ fb_value)",        // EQUIV
      "~fb_value",                 // INVERT
      "prev | ~fb_value",          // OR_REVERSE
      "~prev",                     // COPY_INVERTED
      "~prev | fb_value",          // OR_INVERTED
      "~(prev & fb_value)",        // NAND
      "int4(255, 255, 255, 255)",  // SET
  };

  // Convert the fetched framebuffer colour to integer channel values so it can be
  // combined bitwise with prev.
  out.Write("\tint4 fb_value = iround(initial_ocol0 * 255.0);\n");

  // The operands are signed ints holding 8-bit channel values (CLEAR is 0, SET is 255).
  // Bitwise NOT on such a value sets every upper bit and yields a negative number, so
  // the result is masked back to the low 8 bits before it is stored in prev.
  out.Write("\tprev = ({}) & 0xff;\n", logic_op_mode[uid_data->logic_op_mode]);
}
static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data,
bool use_dual_source)
{

View File

@ -59,6 +59,8 @@ struct pixel_shader_uid_data
DstBlendFactor blend_dst_factor_alpha : 3; // Only used with shader_framebuffer_fetch blend
u32 blend_subtract : 1; // Only used with shader_framebuffer_fetch blend
u32 blend_subtract_alpha : 1; // Only used with shader_framebuffer_fetch blend
u32 logic_op_enable : 1; // Only used with shader_framebuffer_fetch logic ops
u32 logic_op_mode : 4; // Only used with shader_framebuffer_fetch logic ops
u32 texMtxInfo_n_projection : 8; // 8x1 bit
u32 tevindref_bi0 : 3;

View File

@ -510,6 +510,16 @@ void PixelShaderManager::SetBlendModeChanged()
constants.blend_subtract_alpha = state.subtractAlpha;
dirty = true;
}
if (constants.logic_op_enable != state.logicopenable)
{
constants.logic_op_enable = state.logicopenable;
dirty = true;
}
if (constants.logic_op_mode != state.logicmode)
{
constants.logic_op_mode = state.logicmode;
dirty = true;
}
s_bDestAlphaDirty = true;
}

View File

@ -585,7 +585,9 @@ AbstractPipelineConfig ShaderCache::GetGXPipelineConfig(
config.blending_state = blending_state;
config.framebuffer_state = g_framebuffer_manager->GetEFBFramebufferState();
if (config.blending_state.logicopenable && !g_ActiveConfig.backend_info.bSupportsLogicOp)
// We can use framebuffer fetch to emulate logic ops in the fragment shader.
if (config.blending_state.logicopenable && !g_ActiveConfig.backend_info.bSupportsLogicOp &&
!g_ActiveConfig.backend_info.bSupportsFramebufferFetch)
{
WARN_LOG_FMT(VIDEO,
"Approximating logic op with blending, this will produce incorrect rendering.");

View File

@ -55,6 +55,9 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
const bool stereo = host_config.stereo;
const bool use_dual_source = host_config.backend_dual_source_blend;
const bool use_shader_blend = !use_dual_source && host_config.backend_shader_framebuffer_fetch;
const bool use_shader_logic_op =
!host_config.backend_logic_op && host_config.backend_shader_framebuffer_fetch;
const bool use_framebuffer_fetch = use_shader_blend || use_shader_logic_op;
const bool early_depth = uid_data->early_depth != 0;
const bool per_pixel_depth = uid_data->per_pixel_depth != 0;
const bool bounding_box = host_config.bounding_box;
@ -71,37 +74,51 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
// Shader inputs/outputs in GLSL (HLSL is in main).
if (api_type == APIType::OpenGL || api_type == APIType::Vulkan)
{
if (use_dual_source)
#ifdef __APPLE__
// Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK)
// if we want to use it.
if (api_type == APIType::Vulkan)
{
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"
"FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n");
}
else
if (use_dual_source)
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n"
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n");
}
}
else if (use_shader_blend)
{
// QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
// intermediate value with multiple reads & modifications, so pull out the "real" output value
// and use a temporary for calculations, then set the output value once at the end of the
// shader
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION))
else if (use_shader_blend)
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n");
// Metal doesn't support a single unified variable for both input and output, so we declare
// the output separately. The input will be defined later below.
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 real_ocol0;\n");
}
else
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n");
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
}
if (use_framebuffer_fetch)
{
// Subpass inputs will be converted to framebuffer fetch by SPIRV-Cross.
out.Write("INPUT_ATTACHMENT_BINDING(0, 0, 0) uniform subpassInput in_ocol0;\n");
}
}
else
#endif
{
out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
bool has_broken_decoration =
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION);
out.Write("{} {} vec4 {};\n",
has_broken_decoration ? "FRAGMENT_OUTPUT_LOCATION(0)" :
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0)",
use_framebuffer_fetch ? "FRAGMENT_INOUT" : "out",
use_shader_blend ? "real_ocol0" : "ocol0");
if (use_dual_source)
{
out.Write("{} out vec4 ocol1;\n", has_broken_decoration ?
"FRAGMENT_OUTPUT_LOCATION(1)" :
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1)");
}
}
if (per_pixel_depth)
@ -511,11 +528,28 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
out.Write("void main()\n{{\n");
out.Write(" float4 rawpos = gl_FragCoord;\n");
if (use_framebuffer_fetch)
{
// Store off a copy of the initial framebuffer value.
//
// If FB_FETCH_VALUE isn't defined (i.e. there is no special keyword for fetching from the
// framebuffer), we read from real_ocol0 or ocol0, depending on whether shader blending is enabled.
out.Write("#ifdef FB_FETCH_VALUE\n"
" float4 initial_ocol0 = FB_FETCH_VALUE;\n"
"#else\n"
" float4 initial_ocol0 = {};\n"
"#endif\n",
use_shader_blend ? "real_ocol0" : "ocol0");
}
if (use_shader_blend)
{
// Store off a copy of the initial fb value for blending
out.Write(" float4 initial_ocol0 = FB_FETCH_VALUE;\n"
" float4 ocol0;\n"
// QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an
// intermediate value with multiple reads & modifications, so we pull out the "real" output
// value above and use a temporary for calculations, then set the output value once at the
// end of the shader if we are using shader blending.
out.Write(" float4 ocol0;\n"
" float4 ocol1;\n");
}
}
@ -1075,6 +1109,40 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
" }}\n"
"\n");
if (use_shader_logic_op)
{
static constexpr std::array<const char*, 16> logic_op_mode{
"int4(0, 0, 0, 0)", // CLEAR
"TevResult & fb_value", // AND
"TevResult & ~fb_value", // AND_REVERSE
"TevResult", // COPY
"~TevResult & fb_value", // AND_INVERTED
"fb_value", // NOOP
"TevResult ^ fb_value", // XOR
"TevResult | fb_value", // OR
"~(TevResult | fb_value)", // NOR
"~(TevResult ^ fb_value)", // EQUIV
"~fb_value", // INVERT
"TevResult | ~fb_value", // OR_REVERSE
"~TevResult", // COPY_INVERTED
"~TevResult | fb_value", // OR_INVERTED
"~(TevResult & fb_value)", // NAND
"int4(255, 255, 255, 255)", // SET
};
out.Write(" // Logic Ops\n"
" if (logic_op_enable) {{\n"
" int4 fb_value = iround(initial_ocol0 * 255.0);"
" switch (logic_op_mode) {{\n");
for (size_t i = 0; i < logic_op_mode.size(); i++)
{
out.Write(" case {}u: TevResult = {}; break;\n", i, logic_op_mode[i]);
}
out.Write(" }}\n"
" }}\n");
}
// D3D requires that the shader outputs be uint when writing to a uint render target for logic op.
if (api_type == APIType::D3D && uid_data->uint_output)
{