diff --git a/Externals/MoltenVK/CMakeLists.txt b/Externals/MoltenVK/CMakeLists.txt index 5202e95970..fdc53313bf 100644 --- a/Externals/MoltenVK/CMakeLists.txt +++ b/Externals/MoltenVK/CMakeLists.txt @@ -8,6 +8,8 @@ ExternalProject_Add(MoltenVK CONFIGURE_COMMAND ${CMAKE_CURRENT_LIST_DIR}/configure.sh ${MOLTENVK_VERSION} + PATCH_COMMAND ${CMAKE_CURRENT_LIST_DIR}/patch.sh ${CMAKE_SOURCE_DIR}/Externals/MoltenVK/patches/ ${MOLTENVK_VERSION} + BUILD_COMMAND make -C macos BUILD_IN_SOURCE ON BUILD_BYPRODUCTS /Package/Release/MoltenVK/dylib/macOS/libMoltenVK.dylib diff --git a/Externals/MoltenVK/patch.sh b/Externals/MoltenVK/patch.sh new file mode 100755 index 0000000000..2a40d7eda9 --- /dev/null +++ b/Externals/MoltenVK/patch.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Applies all patches in the "patches" folder to the cloned MoltenVK git repository. +# +# Usage: patch.sh <patches folder> <MoltenVK version> +# + +set -e + +# Reset the git repository first to ensure that it's in the base state. +git reset --hard "$2" + +git apply "$1"/*.patch diff --git a/Externals/MoltenVK/patches/0001-SPIRVToMSLConverter-Enable-use_framebuffer_fetch_sub.patch b/Externals/MoltenVK/patches/0001-SPIRVToMSLConverter-Enable-use_framebuffer_fetch_sub.patch new file mode 100644 index 0000000000..4cd0c3d8f7 --- /dev/null +++ b/Externals/MoltenVK/patches/0001-SPIRVToMSLConverter-Enable-use_framebuffer_fetch_sub.patch @@ -0,0 +1,24 @@ +From 4ca33b7a9b149c6fbcc1c88ce08fc49f21294f6d Mon Sep 17 00:00:00 2001 +From: OatmealDome +Date: Sat, 31 Jul 2021 19:18:35 -0400 +Subject: [PATCH] SPIRVToMSLConverter: Enable use_framebuffer_fetch_subpasses + +--- + .../MoltenVKShaderConverter/SPIRVToMSLConverter.cpp | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp +index 17c79394..97e98004 100644 +--- a/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp ++++ 
b/MoltenVKShaderConverter/MoltenVKShaderConverter/SPIRVToMSLConverter.cpp +@@ -92,6 +92,7 @@ MVK_PUBLIC_SYMBOL SPIRVToMSLConversionOptions::SPIRVToMSLConversionOptions() { + #endif + + mslOptions.pad_fragment_output_components = true; ++ mslOptions.use_framebuffer_fetch_subpasses = true; + } + + MVK_PUBLIC_SYMBOL bool mvk::MSLShaderInput::matches(const mvk::MSLShaderInput& other) const { +-- +2.30.1 (Apple Git-130) + diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 492d4b956c..1413813cda 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -692,7 +692,6 @@ void ProgramShaderCache::CreateHeader() { case EsFbFetchType::FbFetchExt: framebuffer_fetch_string = "#extension GL_EXT_shader_framebuffer_fetch: enable\n" - "#define FB_FETCH_VALUE real_ocol0\n" "#define FRAGMENT_INOUT inout"; break; case EsFbFetchType::FbFetchArm: diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp index c28b3cced2..ec38d58009 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp +++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp @@ -50,9 +50,13 @@ static const char SHADER_HEADER[] = R"( #define SAMPLER_BINDING(x) layout(set = 1, binding = x) #define TEXEL_BUFFER_BINDING(x) layout(set = 1, binding = (x + 8)) #define SSBO_BINDING(x) layout(set = 2, binding = x) + #define INPUT_ATTACHMENT_BINDING(x, y, z) layout(set = x, binding = y, input_attachment_index = z) #define VARYING_LOCATION(x) layout(location = x) #define FORCE_EARLY_Z layout(early_fragment_tests) in + // Metal framebuffer fetch helpers. 
+ #define FB_FETCH_VALUE subpassLoad(in_ocol0) + // hlsl to glsl function translation #define API_VULKAN 1 #define float2 vec2 diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp index d8a2ab0a47..23cb06e846 100644 --- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp +++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp @@ -286,7 +286,7 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config) config->backend_info.bSupportsBPTCTextures = false; // Dependent on features. config->backend_info.bSupportsLogicOp = false; // Dependent on features. config->backend_info.bSupportsLargePoints = false; // Dependent on features. - config->backend_info.bSupportsFramebufferFetch = false; // No support. + config->backend_info.bSupportsFramebufferFetch = false; // Dependent on OS and features. config->backend_info.bSupportsCoarseDerivatives = true; // Assumed support. config->backend_info.bSupportsTextureQueryLevels = true; // Assumed support. } @@ -340,6 +340,15 @@ void VulkanContext::PopulateBackendInfoFeatures(VideoConfig* config, VkPhysicalD properties.limits.pointSizeRange[0] <= 1.0f && properties.limits.pointSizeRange[1] >= 16; + std::string device_name = properties.deviceName; + u32 vendor_id = properties.vendorID; + + // Only Apple family GPUs support framebuffer fetch. + if (vendor_id == 0x106B || device_name.find("Apple") != std::string::npos) + { + config->backend_info.bSupportsFramebufferFetch = true; + } + // Our usage of primitive restart appears to be broken on AMD's binary drivers. // Seems to be fine on GCN Gen 1-2, unconfirmed on GCN Gen 3, causes driver resets on GCN Gen 4. 
if (DriverDetails::HasBug(DriverDetails::BUG_PRIMITIVE_RESTART)) diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index 7144342503..5335af963a 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -15,6 +15,7 @@ using int4 = std::array; enum class DstBlendFactor : u32; enum class SrcBlendFactor : u32; enum class ZTexOp : u32; +enum class LogicOp : u32; struct PixelShaderConstants { @@ -54,6 +55,9 @@ struct PixelShaderConstants DstBlendFactor blend_dst_factor_alpha; u32 blend_subtract; u32 blend_subtract_alpha; + // For shader_framebuffer_fetch logic ops: + u32 logic_op_enable; // bool + LogicOp logic_op_mode; }; struct VertexShaderConstants diff --git a/Source/Core/VideoCommon/GXPipelineTypes.h b/Source/Core/VideoCommon/GXPipelineTypes.h index 231bddefc1..ca3fb89701 100644 --- a/Source/Core/VideoCommon/GXPipelineTypes.h +++ b/Source/Core/VideoCommon/GXPipelineTypes.h @@ -19,7 +19,7 @@ namespace VideoCommon // As pipelines encompass both shader UIDs and render states, changes to either of these should // also increment the pipeline UID version. Incrementing the UID version will cause all UID // caches to be invalidated. 
-constexpr u32 GX_PIPELINE_UID_VERSION = 3; // Last changed in PR 9532 +constexpr u32 GX_PIPELINE_UID_VERSION = 4; // Last changed in PR 10215 struct GXPipelineUid { diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index 6c12a9607a..c8f3bf5b15 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -332,6 +332,9 @@ PixelShaderUid GetPixelShaderUid() uid_data->blend_subtract_alpha = state.subtractAlpha; } + uid_data->logic_op_enable = state.logicopenable; + uid_data->logic_op_mode = u32(state.logicmode.Value()); + return out; } @@ -424,6 +427,8 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, "\tuint blend_dst_factor_alpha;\n" "\tbool blend_subtract;\n" "\tbool blend_subtract_alpha;\n" + "\tbool logic_op_enable;\n" + "\tuint logic_op_mode;\n" "}};\n\n"); out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n" "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n" @@ -838,6 +843,7 @@ static void WriteTevRegular(ShaderCode& out, std::string_view components, TevBia static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_data, APIType api_type, bool per_pixel_depth, bool use_dual_source); static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data); +static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data); static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data, bool use_dual_source); static void WriteBlend(ShaderCode& out, const pixel_shader_uid_data* uid_data); @@ -926,40 +932,58 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos uid_data->useDstAlpha); const bool use_shader_blend = !use_dual_source && (uid_data->useDstAlpha && host_config.backend_shader_framebuffer_fetch); + const bool use_shader_logic_op = + !host_config.backend_logic_op && host_config.backend_shader_framebuffer_fetch; if (api_type == 
APIType::OpenGL || api_type == APIType::Vulkan) { - if (use_dual_source) + bool use_framebuffer_fetch = use_shader_blend || use_shader_logic_op; + +#ifdef __APPLE__ + // Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK) + // if we want to use it. + if (api_type == APIType::Vulkan) { - if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) - { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n" - "FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n"); - } - else + if (use_dual_source) { out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n" "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n"); } - } - else if (use_shader_blend) - { - // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an - // intermediate value with multiple reads & modifications, so pull out the "real" output value - // and use a temporary for calculations, then set the output value once at the end of the - // shader - if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) + else if (use_shader_blend) { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n"); + // Metal doesn't support a single unified variable for both input and output, so we declare + // the output separately. The input will be defined later below. + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 real_ocol0;\n"); } else { - out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + } + + if (use_framebuffer_fetch) + { + // Subpass inputs will be converted to framebuffer fetch by SPIRV-Cross. 
+ out.Write("INPUT_ATTACHMENT_BINDING(0, 0, 0) uniform subpassInput in_ocol0;\n"); } } else +#endif { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + bool has_broken_decoration = + DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION); + + out.Write("{} {} vec4 {};\n", + has_broken_decoration ? "FRAGMENT_OUTPUT_LOCATION(0)" : + "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0)", + use_framebuffer_fetch ? "FRAGMENT_INOUT" : "out", + use_shader_blend ? "real_ocol0" : "ocol0"); + + if (use_dual_source) + { + out.Write("{} out vec4 ocol1;\n", has_broken_decoration ? + "FRAGMENT_OUTPUT_LOCATION(1)" : + "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1)"); + } } if (uid_data->per_pixel_depth) @@ -1005,11 +1029,28 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos out.Write("void main()\n{{\n"); out.Write("\tfloat4 rawpos = gl_FragCoord;\n"); + + if (use_framebuffer_fetch) + { + // Store off a copy of the initial framebuffer value. + // + // If FB_FETCH_VALUE isn't defined (i.e. no special keyword for fetching from the + // framebuffer), we read from real_ocol0 or ocol0, depending if shader blending is enabled. + out.Write("#ifdef FB_FETCH_VALUE\n" + "\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n" + "#else\n" + "\tfloat4 initial_ocol0 = {};\n" + "#endif\n", + use_shader_blend ? "real_ocol0" : "ocol0"); + } + if (use_shader_blend) { - // Store off a copy of the initial fb value for blending - out.Write("\tfloat4 initial_ocol0 = FB_FETCH_VALUE;\n" - "\tfloat4 ocol0;\n" + // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an + // intermediate value with multiple reads & modifications, so we pull out the "real" output + // value above and use a temporary for calculations, then set the output value once at the + // end of the shader if we are using shader blending. 
+ out.Write("\tfloat4 ocol0;\n" "\tfloat4 ocol1;\n"); } } @@ -1260,6 +1301,9 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos WriteFog(out, uid_data); + if (use_shader_logic_op) + WriteLogicOp(out, uid_data); + // Write the color and alpha values to the framebuffer // If using shader blend, we still use the separate alpha WriteColor(out, api_type, uid_data, use_dual_source || use_shader_blend); @@ -1876,6 +1920,34 @@ static void WriteFog(ShaderCode& out, const pixel_shader_uid_data* uid_data) out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n"); } +static void WriteLogicOp(ShaderCode& out, const pixel_shader_uid_data* uid_data) +{ + if (uid_data->logic_op_enable) + { + static constexpr std::array logic_op_mode{ + "int4(0, 0, 0, 0)", // CLEAR + "prev & fb_value", // AND + "prev & ~fb_value", // AND_REVERSE + "prev", // COPY + "~prev & fb_value", // AND_INVERTED + "fb_value", // NOOP + "prev ^ fb_value", // XOR + "prev | fb_value", // OR + "~(prev | fb_value)", // NOR + "~(prev ^ fb_value)", // EQUIV + "~fb_value", // INVERT + "prev | ~fb_value", // OR_REVERSE + "~prev", // COPY_INVERTED + "~prev | fb_value", // OR_INVERTED + "~(prev & fb_value)", // NAND + "int4(255, 255, 255, 255)", // SET + }; + // Mask to 8 bits: '~' on a signed int leaves the 0..255 range (NOTE(review): confirm vs. WriteColor). + out.Write("\tint4 fb_value = iround(initial_ocol0 * 255.0);\n"); + out.Write("\tprev = ({}) & 255;\n", logic_op_mode[uid_data->logic_op_mode]); + } +} + static void WriteColor(ShaderCode& out, APIType api_type, const pixel_shader_uid_data* uid_data, bool use_dual_source) { diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index adf996044c..6a7a638ac3 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -59,6 +59,8 @@ struct pixel_shader_uid_data DstBlendFactor blend_dst_factor_alpha : 3; // Only used with shader_framebuffer_fetch blend u32 blend_subtract : 1; // Only used with shader_framebuffer_fetch blend u32 
blend_subtract_alpha : 1; // Only used with shader_framebuffer_fetch blend + u32 logic_op_enable : 1; // Only used with shader_framebuffer_fetch logic ops + u32 logic_op_mode : 4; // Only used with shader_framebuffer_fetch logic ops u32 texMtxInfo_n_projection : 8; // 8x1 bit u32 tevindref_bi0 : 3; diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index f63722c9c1..675bcceca5 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -510,6 +510,16 @@ void PixelShaderManager::SetBlendModeChanged() constants.blend_subtract_alpha = state.subtractAlpha; dirty = true; } + if (constants.logic_op_enable != state.logicopenable) + { + constants.logic_op_enable = state.logicopenable; + dirty = true; + } + if (constants.logic_op_mode != state.logicmode) + { + constants.logic_op_mode = state.logicmode; + dirty = true; + } s_bDestAlphaDirty = true; } diff --git a/Source/Core/VideoCommon/ShaderCache.cpp b/Source/Core/VideoCommon/ShaderCache.cpp index b72dd238ff..0a9dc0b856 100644 --- a/Source/Core/VideoCommon/ShaderCache.cpp +++ b/Source/Core/VideoCommon/ShaderCache.cpp @@ -585,7 +585,9 @@ AbstractPipelineConfig ShaderCache::GetGXPipelineConfig( config.blending_state = blending_state; config.framebuffer_state = g_framebuffer_manager->GetEFBFramebufferState(); - if (config.blending_state.logicopenable && !g_ActiveConfig.backend_info.bSupportsLogicOp) + // We can use framebuffer fetch to emulate logic ops in the fragment shader. 
+ if (config.blending_state.logicopenable && !g_ActiveConfig.backend_info.bSupportsLogicOp && + !g_ActiveConfig.backend_info.bSupportsFramebufferFetch) { WARN_LOG_FMT(VIDEO, "Approximating logic op with blending, this will produce incorrect rendering."); diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp index c7719ec377..178de469a3 100644 --- a/Source/Core/VideoCommon/UberShaderPixel.cpp +++ b/Source/Core/VideoCommon/UberShaderPixel.cpp @@ -55,6 +55,9 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, const bool stereo = host_config.stereo; const bool use_dual_source = host_config.backend_dual_source_blend; const bool use_shader_blend = !use_dual_source && host_config.backend_shader_framebuffer_fetch; + const bool use_shader_logic_op = + !host_config.backend_logic_op && host_config.backend_shader_framebuffer_fetch; + const bool use_framebuffer_fetch = use_shader_blend || use_shader_logic_op; const bool early_depth = uid_data->early_depth != 0; const bool per_pixel_depth = uid_data->per_pixel_depth != 0; const bool bounding_box = host_config.bounding_box; @@ -71,37 +74,51 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, // Shader inputs/outputs in GLSL (HLSL is in main). if (api_type == APIType::OpenGL || api_type == APIType::Vulkan) { - if (use_dual_source) +#ifdef __APPLE__ + // Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK) + // if we want to use it. 
+ if (api_type == APIType::Vulkan) { - if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) - { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n" - "FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n"); - } - else + if (use_dual_source) { out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n" "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n"); } - } - else if (use_shader_blend) - { - // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an - // intermediate value with multiple reads & modifications, so pull out the "real" output value - // and use a temporary for calculations, then set the output value once at the end of the - // shader - if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) + else if (use_shader_blend) { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) FRAGMENT_INOUT vec4 real_ocol0;\n"); + // Metal doesn't support a single unified variable for both input and output, so we declare + // the output separately. The input will be defined later below. + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 real_ocol0;\n"); } else { - out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) FRAGMENT_INOUT vec4 real_ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + } + + if (use_framebuffer_fetch) + { + // Subpass inputs will be converted to framebuffer fetch by SPIRV-Cross. + out.Write("INPUT_ATTACHMENT_BINDING(0, 0, 0) uniform subpassInput in_ocol0;\n"); } } else +#endif { - out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + bool has_broken_decoration = + DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION); + + out.Write("{} {} vec4 {};\n", + has_broken_decoration ? "FRAGMENT_OUTPUT_LOCATION(0)" : + "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0)", + use_framebuffer_fetch ? "FRAGMENT_INOUT" : "out", + use_shader_blend ? 
"real_ocol0" : "ocol0"); + + if (use_dual_source) + { + out.Write("{} out vec4 ocol1;\n", has_broken_decoration ? + "FRAGMENT_OUTPUT_LOCATION(1)" : + "FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1)"); + } } if (per_pixel_depth) @@ -511,11 +528,28 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, out.Write("void main()\n{{\n"); out.Write(" float4 rawpos = gl_FragCoord;\n"); + + if (use_framebuffer_fetch) + { + // Store off a copy of the initial framebuffer value. + // + // If FB_FETCH_VALUE isn't defined (i.e. no special keyword for fetching from the + // framebuffer), we read from real_ocol0 or ocol0, depending if shader blending is enabled. + out.Write("#ifdef FB_FETCH_VALUE\n" + " float4 initial_ocol0 = FB_FETCH_VALUE;\n" + "#else\n" + " float4 initial_ocol0 = {};\n" + "#endif\n", + use_shader_blend ? "real_ocol0" : "ocol0"); + } + if (use_shader_blend) { - // Store off a copy of the initial fb value for blending - out.Write(" float4 initial_ocol0 = FB_FETCH_VALUE;\n" - " float4 ocol0;\n" + // QComm's Adreno driver doesn't seem to like using the framebuffer_fetch value as an + // intermediate value with multiple reads & modifications, so we pull out the "real" output + // value above and use a temporary for calculations, then set the output value once at the + // end of the shader if we are using shader blending. 
+ out.Write(" float4 ocol0;\n" " float4 ocol1;\n"); } } @@ -1075,6 +1109,40 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config, " }}\n" "\n"); + if (use_shader_logic_op) + { + static constexpr std::array logic_op_mode{ + "int4(0, 0, 0, 0)", // CLEAR + "TevResult & fb_value", // AND + "TevResult & ~fb_value", // AND_REVERSE + "TevResult", // COPY + "~TevResult & fb_value", // AND_INVERTED + "fb_value", // NOOP + "TevResult ^ fb_value", // XOR + "TevResult | fb_value", // OR + "~(TevResult | fb_value)", // NOR + "~(TevResult ^ fb_value)", // EQUIV + "~fb_value", // INVERT + "TevResult | ~fb_value", // OR_REVERSE + "~TevResult", // COPY_INVERTED + "~TevResult | fb_value", // OR_INVERTED + "~(TevResult & fb_value)", // NAND + "int4(255, 255, 255, 255)", // SET + }; + // Each case masks to 8 bits, since '~' on a signed int leaves the 0..255 range. + out.Write(" // Logic Ops\n" + " if (logic_op_enable) {{\n" + " int4 fb_value = iround(initial_ocol0 * 255.0);\n" + " switch (logic_op_mode) {{\n"); + for (size_t i = 0; i < logic_op_mode.size(); i++) + { + out.Write(" case {}u: TevResult = ({}) & 255; break;\n", i, logic_op_mode[i]); + } + + out.Write(" }}\n" + " }}\n"); + } + // D3D requires that the shader outputs be uint when writing to a uint render target for logic op. if (api_type == APIType::D3D && uid_data->uint_output) {