Merge pull request #10747 from tellowkrinkle/LateUIDFixup

Add a post-cache shader UID fixup pass
This commit is contained in:
JMC47 2022-07-17 00:43:16 -04:00 committed by GitHub
commit 70b0b03c3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 319 additions and 229 deletions

View File

@ -1168,19 +1168,8 @@ void Renderer::ApplyBlendingState(const BlendingState state)
if (m_current_blend_state == state)
return;
bool useDualSource =
state.usedualsrc && g_ActiveConfig.backend_info.bSupportsDualSourceBlend &&
(!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) || state.dstalpha);
// Only use shader blend if we need to and we don't support dual-source blending directly
bool useShaderBlend = !useDualSource && state.usedualsrc && state.dstalpha &&
g_ActiveConfig.backend_info.bSupportsFramebufferFetch;
bool useDualSource = state.usedualsrc;
if (useShaderBlend)
{
glDisable(GL_BLEND);
}
else
{
const GLenum src_factors[8] = {GL_ZERO,
GL_ONE,
GL_DST_COLOR,
@ -1216,7 +1205,6 @@ void Renderer::ApplyBlendingState(const BlendingState state)
dst_factors[u32(state.dstfactor.Value())],
src_factors[u32(state.srcfactoralpha.Value())],
dst_factors[u32(state.dstfactoralpha.Value())]);
}
const GLenum logic_op_codes[16] = {
GL_CLEAR, GL_AND, GL_AND_REVERSE, GL_COPY, GL_AND_INVERTED, GL_NOOP,

View File

@ -153,7 +153,7 @@ static void Draw(s32 x, s32 y, s32 xi, s32 yi)
s32 z = (s32)std::clamp<float>(ZSlope.GetValue(x, y), 0.0f, 16777215.0f);
if (bpmem.UseEarlyDepthTest())
if (bpmem.GetEmulatedZ() == EmulatedZ::Early)
{
// TODO: Test if perf regs are incremented even if test is disabled
EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_INPUT_ZCOMPLOC);

View File

@ -840,7 +840,7 @@ void Tev::Draw()
output[BLU_C] = (output[BLU_C] * invFog + fogInt * bpmem.fog.color.b) >> 8;
}
if (bpmem.UseLateDepthTest())
if (bpmem.GetEmulatedZ() == EmulatedZ::Late)
{
// TODO: Check against hw if these values get incremented even if depth testing is disabled
EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_INPUT);

View File

@ -137,19 +137,8 @@ GetVulkanAttachmentBlendState(const BlendingState& state, AbstractPipelineUsage
{
VkPipelineColorBlendAttachmentState vk_state = {};
bool use_dual_source =
state.usedualsrc && g_ActiveConfig.backend_info.bSupportsDualSourceBlend &&
(!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) || state.dstalpha);
bool use_shader_blend = !use_dual_source && state.usedualsrc && state.dstalpha &&
g_ActiveConfig.backend_info.bSupportsFramebufferFetch;
bool use_dual_source = state.usedualsrc;
if (use_shader_blend || (usage == AbstractPipelineUsage::GX &&
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z)))
{
vk_state.blendEnable = VK_FALSE;
}
else
{
vk_state.blendEnable = static_cast<VkBool32>(state.blendenable);
vk_state.colorBlendOp = state.subtract ? VK_BLEND_OP_REVERSE_SUBTRACT : VK_BLEND_OP_ADD;
vk_state.alphaBlendOp = state.subtractAlpha ? VK_BLEND_OP_REVERSE_SUBTRACT : VK_BLEND_OP_ADD;
@ -191,7 +180,6 @@ GetVulkanAttachmentBlendState(const BlendingState& state, AbstractPipelineUsage
vk_state.dstColorBlendFactor = dst_factors[u32(state.dstfactor.Value())];
vk_state.dstAlphaBlendFactor = dst_factors[u32(state.dstfactoralpha.Value())];
}
}
if (state.colorupdate)
{

View File

@ -371,13 +371,6 @@ void VulkanContext::PopulateBackendInfoFeatures(VideoConfig* config, VkPhysicalD
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_REVERSED_DEPTH_RANGE))
config->backend_info.bSupportsReversedDepthRange = false;
// Calling discard when early depth test is enabled can break on some Apple Silicon GPU drivers.
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z))
{
// We will use shader blending, so disable hardware dual source blending.
config->backend_info.bSupportsDualSourceBlend = false;
}
// Dynamic sampler indexing locks up Intel GPUs on MoltenVK/Metal
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DYNAMIC_SAMPLER_INDEXING))
config->backend_info.bSupportsDynamicSamplerIndexing = false;

View File

@ -2338,6 +2338,16 @@ struct BPCmd
int newvalue;
};
enum class EmulatedZ : u32
{
Disabled = 0,
Early = 1,
Late = 2,
ForcedEarly = 3,
EarlyWithFBFetch = 4,
EarlyWithZComplocHack = 5,
};
struct BPMemory
{
GenMode genMode;
@ -2405,8 +2415,15 @@ struct BPMemory
u32 bpMask; // 0xFE
u32 unknown18; // ff
bool UseEarlyDepthTest() const { return zcontrol.early_ztest && zmode.testenable; }
bool UseLateDepthTest() const { return !zcontrol.early_ztest && zmode.testenable; }
EmulatedZ GetEmulatedZ() const
{
if (!zmode.testenable)
return EmulatedZ::Disabled;
if (zcontrol.early_ztest)
return EmulatedZ::Early;
else
return EmulatedZ::Late;
}
};
#pragma pack()

View File

@ -237,7 +237,8 @@ enum Bug
// crash. Sometimes this happens in the kernel mode part of the driver, resulting in a BSOD.
// These shaders are also particularly problematic on macOS's Intel drivers. On OpenGL, they can
// cause depth issues. On Metal, they can cause the driver to not write a primitive to the depth
// buffer whenever a fragment is discarded. Disable dual-source blending support on these drivers.
// buffer if dual source blending is output in the shader but not subsequently used in blending.
// Compile separate shaders for DSB on vs off for these drivers.
BUG_BROKEN_DUAL_SOURCE_BLENDING,
// BUG: ImgTec GLSL shader compiler fails when negating the input to a bitwise operation

View File

@ -19,7 +19,7 @@ namespace VideoCommon
// As pipelines encompass both shader UIDs and render states, changes to either of these should
// also increment the pipeline UID version. Incrementing the UID version will cause all UID
// caches to be invalidated.
constexpr u32 GX_PIPELINE_UID_VERSION = 4; // Last changed in PR 10215
constexpr u32 GX_PIPELINE_UID_VERSION = 5; // Last changed in PR 10747
struct GXPipelineUid
{

View File

@ -167,9 +167,6 @@ constexpr Common::EnumMap<const char*, TevOutput::Color2> tev_a_output_table{
"c2.a",
};
// FIXME: Some of the video card's capabilities (BBox support, EarlyZ support, dstAlpha support)
// leak into this UID; This is really unhelpful if these UIDs ever move from one machine to
// another.
PixelShaderUid GetPixelShaderUid()
{
PixelShaderUid out;
@ -189,20 +186,25 @@ PixelShaderUid GetPixelShaderUid()
u32 numStages = uid_data->genMode_numtevstages + 1;
const bool forced_early_z =
bpmem.UseEarlyDepthTest() &&
uid_data->Pretest = bpmem.alpha_test.TestResult();
uid_data->ztest = bpmem.GetEmulatedZ();
if (uid_data->ztest == EmulatedZ::Early &&
(g_ActiveConfig.bFastDepthCalc ||
bpmem.alpha_test.TestResult() == AlphaTestResult::Undetermined)
// We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
// This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
&& !(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
&& !bpmem.genMode.zfreeze)
{
uid_data->ztest = EmulatedZ::ForcedEarly;
}
const bool forced_early_z = uid_data->ztest == EmulatedZ::ForcedEarly;
const bool per_pixel_depth =
(bpmem.ztex2.op != ZTexOp::Disabled && bpmem.UseLateDepthTest()) ||
(bpmem.ztex2.op != ZTexOp::Disabled && uid_data->ztest == EmulatedZ::Late) ||
(!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) ||
(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
uid_data->per_pixel_depth = per_pixel_depth;
uid_data->forced_early_z = forced_early_z;
if (g_ActiveConfig.bEnablePixelLighting)
{
@ -285,59 +287,24 @@ PixelShaderUid GetPixelShaderUid()
sizeof(*uid_data) :
MY_STRUCT_OFFSET(*uid_data, stagehash[numStages]);
uid_data->Pretest = bpmem.alpha_test.TestResult();
uid_data->late_ztest = bpmem.UseLateDepthTest();
// NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
// (in this case we need to write a depth value if depth test passes regardless of the alpha
// testing result)
if (uid_data->Pretest == AlphaTestResult::Undetermined ||
(uid_data->Pretest == AlphaTestResult::Fail && uid_data->late_ztest))
(uid_data->Pretest == AlphaTestResult::Fail && uid_data->ztest == EmulatedZ::Late))
{
uid_data->alpha_test_comp0 = bpmem.alpha_test.comp0;
uid_data->alpha_test_comp1 = bpmem.alpha_test.comp1;
uid_data->alpha_test_logic = bpmem.alpha_test.logic;
// ZCOMPLOC HACK:
// The only way to emulate alpha test + early-z is to force early-z in the shader.
// As this isn't available on all drivers and as we can't emulate this feature otherwise,
// we are only able to choose which one we want to respect more.
// Tests seem to have proven that writing depth even when the alpha test fails is more
// important that a reliable alpha test, so we just force the alpha test to always succeed.
// At least this seems to be less buggy.
uid_data->alpha_test_use_zcomploc_hack =
bpmem.UseEarlyDepthTest() && bpmem.zmode.updateenable &&
!g_ActiveConfig.backend_info.bSupportsEarlyZ && !bpmem.genMode.zfreeze;
}
uid_data->zfreeze = bpmem.genMode.zfreeze;
uid_data->ztex_op = bpmem.ztex2.op;
uid_data->early_ztest = bpmem.UseEarlyDepthTest();
uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
uid_data->fog_proj = bpmem.fog.c_proj_fsel.proj;
uid_data->fog_RangeBaseEnabled = bpmem.fogRange.Base.Enabled;
BlendingState state = {};
state.Generate(bpmem);
if (((state.usedualsrc && state.dstalpha) ||
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z)) &&
g_ActiveConfig.backend_info.bSupportsFramebufferFetch &&
!g_ActiveConfig.backend_info.bSupportsDualSourceBlend)
{
uid_data->blend_enable = state.blendenable;
uid_data->blend_src_factor = state.srcfactor;
uid_data->blend_src_factor_alpha = state.srcfactoralpha;
uid_data->blend_dst_factor = state.dstfactor;
uid_data->blend_dst_factor_alpha = state.dstfactoralpha;
uid_data->blend_subtract = state.subtract;
uid_data->blend_subtract_alpha = state.subtractAlpha;
}
uid_data->logic_op_enable = state.logicopenable;
uid_data->logic_op_mode = u32(state.logicmode.Value());
return out;
}
@ -798,7 +765,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
out.Write("\n#define sampleTextureWrapper(texmap, uv, layer) "
"sampleTexture(texmap, samp[texmap], uv, layer)\n");
if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
if (uid_data->ztest == EmulatedZ::ForcedEarly)
{
// Zcomploc (aka early_ztest) is a way to control whether depth test is done before
// or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
@ -837,28 +804,15 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
out.Write("FORCE_EARLY_Z; \n");
}
// Only use dual-source blending when required on drivers that don't support it very well.
const bool use_dual_source =
host_config.backend_dual_source_blend &&
(!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) ||
uid_data->useDstAlpha);
const bool use_shader_blend =
!use_dual_source &&
(uid_data->useDstAlpha ||
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z)) &&
host_config.backend_shader_framebuffer_fetch;
const bool use_shader_logic_op = !host_config.backend_logic_op && uid_data->logic_op_enable &&
host_config.backend_shader_framebuffer_fetch;
const bool use_framebuffer_fetch =
use_shader_blend || use_shader_logic_op ||
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z);
const bool use_framebuffer_fetch = uid_data->blend_enable || uid_data->logic_op_enable ||
uid_data->ztest == EmulatedZ::EarlyWithFBFetch;
#ifdef __APPLE__
// Framebuffer fetch is only supported by Metal, so ensure that we're running Vulkan (MoltenVK)
// if we want to use it.
if (api_type == APIType::Vulkan)
{
if (use_dual_source)
if (!uid_data->no_dual_src)
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 {};\n"
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n",
@ -891,7 +845,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
uid_data->uint_output ? "uvec4" : "vec4",
use_framebuffer_fetch ? "real_ocol0" : "ocol0");
if (use_dual_source)
if (!uid_data->no_dual_src)
{
out.Write("{} out {} ocol1;\n",
has_broken_decoration ? "FRAGMENT_OUTPUT_LOCATION(1)" :
@ -960,7 +914,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
out.Write("\tfloat4 ocol0;\n");
}
if (use_shader_blend)
if (uid_data->blend_enable)
{
out.Write("\tfloat4 ocol1;\n");
}
@ -1086,10 +1040,10 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
// (in this case we need to write a depth value if depth test passes regardless of the alpha
// testing result)
if (uid_data->Pretest == AlphaTestResult::Undetermined ||
(uid_data->Pretest == AlphaTestResult::Fail && uid_data->late_ztest))
(uid_data->Pretest == AlphaTestResult::Fail && uid_data->ztest == EmulatedZ::Late))
{
WriteAlphaTest(out, uid_data, api_type, uid_data->per_pixel_depth,
use_dual_source || use_shader_blend);
!uid_data->no_dual_src || uid_data->blend_enable);
}
// This situation is important for Mario Kart Wii's menus (they will render incorrectly if the
@ -1144,7 +1098,10 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
const bool skip_ztexture = !uid_data->per_pixel_depth && uid_data->fog_fsel == FogType::Off;
// Note: z-textures are not written to depth buffer if early depth test is used
if (uid_data->per_pixel_depth && uid_data->early_ztest)
const bool early_ztest = uid_data->ztest == EmulatedZ::Early ||
uid_data->ztest == EmulatedZ::EarlyWithFBFetch ||
uid_data->ztest == EmulatedZ::EarlyWithZComplocHack;
if (uid_data->per_pixel_depth && early_ztest)
{
if (!host_config.backend_reversed_depth_range)
out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
@ -1165,7 +1122,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
}
if (uid_data->per_pixel_depth && uid_data->late_ztest)
if (uid_data->per_pixel_depth && uid_data->ztest == EmulatedZ::Late)
{
if (!host_config.backend_reversed_depth_range)
out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
@ -1184,14 +1141,14 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
WriteFog(out, uid_data);
if (use_shader_logic_op)
if (uid_data->logic_op_enable)
WriteLogicOp(out, uid_data);
// Write the color and alpha values to the framebuffer
// If using shader blend, we still use the separate alpha
WriteColor(out, api_type, uid_data, use_dual_source || use_shader_blend);
WriteColor(out, api_type, uid_data, !uid_data->no_dual_src || uid_data->blend_enable);
if (use_shader_blend)
if (uid_data->blend_enable)
WriteBlend(out, uid_data);
else if (use_framebuffer_fetch)
out.Write("\treal_ocol0 = ocol0;\n");
@ -1728,11 +1685,10 @@ static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_dat
}
// ZCOMPLOC HACK:
if (!uid_data->alpha_test_use_zcomploc_hack)
if (uid_data->ztest != EmulatedZ::EarlyWithZComplocHack)
{
#ifdef __APPLE__
if (uid_data->forced_early_z &&
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z))
if (uid_data->ztest == EmulatedZ::EarlyWithFBFetch)
{
// Instead of using discard, fetch the framebuffer's color value and use it as the output
// for this fragment.

View File

@ -12,6 +12,7 @@ enum class AlphaTestOp : u32;
enum class AlphaTestResult;
enum class CompareMode : u32;
enum class DstBlendFactor : u32;
enum class EmulatedZ : u32;
enum class FogProjection : u32;
enum class FogType : u32;
enum class KonstSel : u32;
@ -28,6 +29,7 @@ struct pixel_shader_uid_data
u32 NumValues() const { return num_values; }
u32 pad0 : 4;
u32 useDstAlpha : 1;
u32 no_dual_src : 1;
AlphaTestResult Pretest : 2;
u32 nIndirectStagesUsed : 4;
u32 genMode_numtexgens : 4;
@ -36,16 +38,13 @@ struct pixel_shader_uid_data
CompareMode alpha_test_comp0 : 3;
CompareMode alpha_test_comp1 : 3;
AlphaTestOp alpha_test_logic : 2;
u32 alpha_test_use_zcomploc_hack : 1;
FogProjection fog_proj : 1;
FogType fog_fsel : 3;
u32 fog_RangeBaseEnabled : 1;
ZTexOp ztex_op : 2;
u32 per_pixel_depth : 1;
u32 forced_early_z : 1;
u32 early_ztest : 1;
u32 late_ztest : 1;
EmulatedZ ztest : 3;
u32 bounding_box : 1;
u32 zfreeze : 1;
u32 numColorChans : 2;

View File

@ -448,7 +448,7 @@ void PixelShaderManager::SetGenModeChanged()
void PixelShaderManager::SetZModeControl()
{
u32 late_ztest = bpmem.UseLateDepthTest();
u32 late_ztest = bpmem.GetEmulatedZ() == EmulatedZ::Late;
u32 rgba6_format =
(bpmem.zcontrol.pixel_format == PixelFormat::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor) ?
1 :

View File

@ -25,6 +25,34 @@ void DepthState::Generate(const BPMemory& bp)
func = bp.zmode.func.Value();
}
static bool IsDualSrc(SrcBlendFactor factor)
{
return factor == SrcBlendFactor::SrcAlpha || factor == SrcBlendFactor::InvSrcAlpha;
}
static bool IsDualSrc(DstBlendFactor factor)
{
switch (factor)
{
case DstBlendFactor::SrcClr:
case DstBlendFactor::SrcAlpha:
case DstBlendFactor::InvSrcClr:
case DstBlendFactor::InvSrcAlpha:
return true;
default:
return false;
}
}
bool BlendingState::RequiresDualSrc() const
{
bool requires_dual_src = false;
requires_dual_src |= IsDualSrc(srcfactor) || IsDualSrc(srcfactoralpha);
requires_dual_src |= IsDualSrc(dstfactor) || IsDualSrc(dstfactoralpha);
requires_dual_src &= blendenable && usedualsrc;
return requires_dual_src;
}
// If the framebuffer format has no alpha channel, it is assumed to
// ONE on blending. As the backends may emulate this framebuffer
// configuration with an alpha channel, we just drop all references
@ -92,12 +120,12 @@ void BlendingState::Generate(const BPMemory& bp)
// Start with everything disabled.
hex = 0;
bool target_has_alpha = bp.zcontrol.pixel_format == PixelFormat::RGBA6_Z24;
bool alpha_test_may_succeed = bp.alpha_test.TestResult() != AlphaTestResult::Fail;
const bool target_has_alpha = bp.zcontrol.pixel_format == PixelFormat::RGBA6_Z24;
const bool alpha_test_may_succeed = bp.alpha_test.TestResult() != AlphaTestResult::Fail;
colorupdate = bp.blendmode.colorupdate && alpha_test_may_succeed;
alphaupdate = bp.blendmode.alphaupdate && target_has_alpha && alpha_test_may_succeed;
dstalpha = bp.dstalpha.enable && alphaupdate;
const bool dstalpha = bp.dstalpha.enable && alphaupdate;
usedualsrc = true;
// The subtract bit has the highest priority

View File

@ -130,7 +130,6 @@ union BlendingState
BitField<0, 1, u32> blendenable;
BitField<1, 1, u32> logicopenable;
BitField<2, 1, u32> dstalpha;
BitField<3, 1, u32> colorupdate;
BitField<4, 1, u32> alphaupdate;
BitField<5, 1, u32> subtract;
@ -142,6 +141,8 @@ union BlendingState
BitField<17, 3, SrcBlendFactor> srcfactoralpha;
BitField<20, 4, LogicOp> logicmode;
bool RequiresDualSrc() const;
u32 hex;
};

View File

@ -10,6 +10,7 @@
#include "Common/MsgHandler.h"
#include "Core/ConfigManager.h"
#include "VideoCommon/DriverDetails.h"
#include "VideoCommon/FramebufferManager.h"
#include "VideoCommon/FramebufferShaderGen.h"
#include "VideoCommon/RenderBase.h"
@ -612,8 +613,95 @@ AbstractPipelineConfig ShaderCache::GetGXPipelineConfig(
return config;
}
std::optional<AbstractPipelineConfig> ShaderCache::GetGXPipelineConfig(const GXPipelineUid& config)
/// Edits the UID based on driver bugs and other special configurations
static GXPipelineUid ApplyDriverBugs(const GXPipelineUid& in)
{
GXPipelineUid out;
memcpy(&out, &in, sizeof(out)); // copy padding
pixel_shader_uid_data* ps = out.ps_uid.GetUidData();
BlendingState& blend = out.blending_state;
if (ps->ztest == EmulatedZ::ForcedEarly && !out.depth_state.updateenable)
{
// No need to force early depth test if you're not writing z
ps->ztest = EmulatedZ::Early;
}
const bool benefits_from_ps_dual_source_off =
(!g_ActiveConfig.backend_info.bSupportsDualSourceBlend &&
g_ActiveConfig.backend_info.bSupportsFramebufferFetch) ||
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING);
if (benefits_from_ps_dual_source_off && !blend.RequiresDualSrc())
{
// Only use dual-source blending when required on drivers that don't support it very well.
ps->no_dual_src = true;
blend.usedualsrc = false;
}
if (g_ActiveConfig.backend_info.bSupportsFramebufferFetch)
{
bool fbfetch_blend = false;
if ((DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z) ||
!g_ActiveConfig.backend_info.bSupportsEarlyZ) &&
ps->ztest == EmulatedZ::ForcedEarly)
{
ps->ztest = EmulatedZ::EarlyWithFBFetch;
fbfetch_blend |= static_cast<bool>(out.blending_state.blendenable);
ps->no_dual_src = true;
}
fbfetch_blend |= blend.logicopenable && !g_ActiveConfig.backend_info.bSupportsLogicOp;
fbfetch_blend |= blend.usedualsrc && !g_ActiveConfig.backend_info.bSupportsDualSourceBlend;
if (fbfetch_blend)
{
ps->no_dual_src = true;
if (blend.logicopenable)
{
ps->logic_op_enable = true;
ps->logic_op_mode = static_cast<u32>(blend.logicmode.Value());
blend.logicopenable = false;
}
if (blend.blendenable)
{
ps->blend_enable = true;
ps->blend_src_factor = blend.srcfactor;
ps->blend_src_factor_alpha = blend.srcfactoralpha;
ps->blend_dst_factor = blend.dstfactor;
ps->blend_dst_factor_alpha = blend.dstfactoralpha;
ps->blend_subtract = blend.subtract;
ps->blend_subtract_alpha = blend.subtractAlpha;
blend.blendenable = false;
}
}
}
// force dual src off if we can't support it
if (!g_ActiveConfig.backend_info.bSupportsDualSourceBlend)
{
ps->no_dual_src = true;
blend.usedualsrc = false;
}
if (ps->ztest == EmulatedZ::ForcedEarly && !g_ActiveConfig.backend_info.bSupportsEarlyZ)
{
// These things should be false
ASSERT(!ps->zfreeze);
// ZCOMPLOC HACK:
// The only way to emulate alpha test + early-z is to force early-z in the shader.
// As this isn't available on all drivers and as we can't emulate this feature otherwise,
// we are only able to choose which one we want to respect more.
// Tests seem to have proven that writing depth even when the alpha test fails is more
// important that a reliable alpha test, so we just force the alpha test to always succeed.
// At least this seems to be less buggy.
ps->ztest = EmulatedZ::EarlyWithZComplocHack;
}
return out;
}
std::optional<AbstractPipelineConfig>
ShaderCache::GetGXPipelineConfig(const GXPipelineUid& config_in)
{
GXPipelineUid config = ApplyDriverBugs(config_in);
const AbstractShader* vs;
auto vs_iter = m_vs_cache.shader_map.find(config.vs_uid);
if (vs_iter != m_vs_cache.shader_map.end() && !vs_iter->second.pending)
@ -650,9 +738,33 @@ std::optional<AbstractPipelineConfig> ShaderCache::GetGXPipelineConfig(const GXP
config.depth_state, config.blending_state);
}
std::optional<AbstractPipelineConfig>
ShaderCache::GetGXPipelineConfig(const GXUberPipelineUid& config)
/// Edits the UID based on driver bugs and other special configurations
static GXUberPipelineUid ApplyDriverBugs(const GXUberPipelineUid& in)
{
GXUberPipelineUid out;
memcpy(&out, &in, sizeof(out)); // Copy padding
if (g_ActiveConfig.backend_info.bSupportsFramebufferFetch)
{
// Always blend in shader
out.blending_state.hex = 0;
out.blending_state.colorupdate = in.blending_state.colorupdate.Value();
out.blending_state.alphaupdate = in.blending_state.alphaupdate.Value();
out.ps_uid.GetUidData()->no_dual_src = true;
}
else if (!g_ActiveConfig.backend_info.bSupportsDualSourceBlend ||
(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING) &&
!out.blending_state.RequiresDualSrc()))
{
out.blending_state.usedualsrc = false;
out.ps_uid.GetUidData()->no_dual_src = true;
}
return out;
}
std::optional<AbstractPipelineConfig>
ShaderCache::GetGXPipelineConfig(const GXUberPipelineUid& config_in)
{
GXUberPipelineUid config = ApplyDriverBugs(config_in);
const AbstractShader* vs;
auto vs_iter = m_uber_vs_cache.shader_map.find(config.vs_uid);
if (vs_iter != m_uber_vs_cache.shader_map.end() && !vs_iter->second.pending)
@ -981,12 +1093,14 @@ void ShaderCache::QueuePipelineCompile(const GXPipelineUid& uid, u32 priority)
{
stages_ready = true;
auto vs_it = shader_cache->m_vs_cache.shader_map.find(uid.vs_uid);
GXPipelineUid actual_uid = ApplyDriverBugs(uid);
auto vs_it = shader_cache->m_vs_cache.shader_map.find(actual_uid.vs_uid);
stages_ready &= vs_it != shader_cache->m_vs_cache.shader_map.end() && !vs_it->second.pending;
if (vs_it == shader_cache->m_vs_cache.shader_map.end())
shader_cache->QueueVertexShaderCompile(uid.vs_uid, priority);
shader_cache->QueueVertexShaderCompile(actual_uid.vs_uid, priority);
PixelShaderUid ps_uid = uid.ps_uid;
PixelShaderUid ps_uid = actual_uid.ps_uid;
ClearUnusedPixelShaderUidBits(shader_cache->m_api_type, shader_cache->m_host_config, &ps_uid);
auto ps_it = shader_cache->m_ps_cache.shader_map.find(ps_uid);
@ -1051,13 +1165,15 @@ void ShaderCache::QueueUberPipelineCompile(const GXUberPipelineUid& uid, u32 pri
{
stages_ready = true;
auto vs_it = shader_cache->m_uber_vs_cache.shader_map.find(uid.vs_uid);
GXUberPipelineUid actual_uid = ApplyDriverBugs(uid);
auto vs_it = shader_cache->m_uber_vs_cache.shader_map.find(actual_uid.vs_uid);
stages_ready &=
vs_it != shader_cache->m_uber_vs_cache.shader_map.end() && !vs_it->second.pending;
if (vs_it == shader_cache->m_uber_vs_cache.shader_map.end())
shader_cache->QueueVertexUberShaderCompile(uid.vs_uid, priority);
shader_cache->QueueVertexUberShaderCompile(actual_uid.vs_uid, priority);
UberShader::PixelShaderUid ps_uid = uid.ps_uid;
UberShader::PixelShaderUid ps_uid = actual_uid.ps_uid;
UberShader::ClearUnusedPixelShaderUidBits(shader_cache->m_api_type,
shader_cache->m_host_config, &ps_uid);

View File

@ -3,6 +3,8 @@
#include "VideoCommon/UberShaderPixel.h"
#include "Common/Assert.h"
#include "VideoCommon/BPMemory.h"
#include "VideoCommon/DriverDetails.h"
#include "VideoCommon/NativeVertexFormat.h"
@ -21,12 +23,12 @@ PixelShaderUid GetPixelShaderUid()
pixel_ubershader_uid_data* const uid_data = out.GetUidData();
uid_data->num_texgens = xfmem.numTexGen.numTexGens;
uid_data->early_depth = bpmem.UseEarlyDepthTest() &&
uid_data->early_depth = bpmem.GetEmulatedZ() == EmulatedZ::Early &&
(g_ActiveConfig.bFastDepthCalc ||
bpmem.alpha_test.TestResult() == AlphaTestResult::Undetermined) &&
!(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
uid_data->per_pixel_depth =
(bpmem.ztex2.op != ZTexOp::Disabled && bpmem.UseLateDepthTest()) ||
(bpmem.ztex2.op != ZTexOp::Disabled && bpmem.GetEmulatedZ() == EmulatedZ::Late) ||
(!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !uid_data->early_depth) ||
(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
uid_data->uint_output = bpmem.blendmode.UseLogicOp();
@ -39,6 +41,13 @@ void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& hos
{
pixel_ubershader_uid_data* const uid_data = uid->GetUidData();
// With fbfetch, ubershaders always blend using that and don't use dual src
if (host_config.backend_shader_framebuffer_fetch || !host_config.backend_dual_source_blend)
uid_data->no_dual_src = 1;
// Dual source is always enabled in the shader if this bug is not present
else if (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DUAL_SOURCE_BLENDING))
uid_data->no_dual_src = 0;
// OpenGL and Vulkan convert implicitly normalized color outputs to their uint representation.
// Therefore, it is not necessary to use a uint output on these backends. We also disable the
// uint output when logic op is not supported (i.e. driver/device does not support D3D11.1).
@ -53,19 +62,17 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
const bool msaa = host_config.msaa;
const bool ssaa = host_config.ssaa;
const bool stereo = host_config.stereo;
const bool use_dual_source = host_config.backend_dual_source_blend;
const bool use_shader_blend = !use_dual_source && host_config.backend_shader_framebuffer_fetch;
const bool use_shader_logic_op =
!host_config.backend_logic_op && host_config.backend_shader_framebuffer_fetch;
const bool use_framebuffer_fetch =
use_shader_blend || use_shader_logic_op ||
DriverDetails::HasBug(DriverDetails::BUG_BROKEN_DISCARD_WITH_EARLY_Z);
const bool use_framebuffer_fetch = host_config.backend_shader_framebuffer_fetch;
const bool use_dual_source = host_config.backend_dual_source_blend && !uid_data->no_dual_src;
const bool early_depth = uid_data->early_depth != 0;
const bool per_pixel_depth = uid_data->per_pixel_depth != 0;
const bool bounding_box = host_config.bounding_box;
const u32 numTexgen = uid_data->num_texgens;
ShaderCode out;
ASSERT_MSG(VIDEO, !(use_dual_source && use_framebuffer_fetch),
"If you're using framebuffer fetch, you shouldn't need dual source blend!");
out.Write("// {}\n", *uid_data);
WriteBitfieldExtractHeader(out, api_type, host_config);
WritePixelShaderCommonHeader(out, api_type, host_config, bounding_box);
@ -79,9 +86,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
{
if (use_dual_source)
{
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 {};\n"
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n",
use_framebuffer_fetch ? "real_ocol0" : "ocol0");
out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n"
"FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n");
}
else
{
@ -520,12 +526,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
// intermediate value with multiple reads & modifications, so we pull out the "real" output
// value above and use a temporary for calculations, then set the output value once at the
// end of the shader.
out.Write(" float4 ocol0;\n");
}
if (use_shader_blend)
{
out.Write(" float4 ocol1;\n");
out.Write(" float4 ocol0;\n"
" float4 ocol1;\n");
}
if (host_config.backend_geometry_shaders && stereo)
@ -943,8 +945,8 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
{
// Instead of using discard, fetch the framebuffer's color value and use it as the output
// for this fragment.
out.Write(" #define discard_fragment {{ {} = float4(initial_ocol0.xyz, 1.0); return; }}\n",
use_shader_blend ? "real_ocol0" : "ocol0");
out.Write(
" #define discard_fragment {{ real_ocol0 = float4(initial_ocol0.xyz, 1.0); return; }}\n");
}
else
{
@ -1055,7 +1057,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
" }}\n"
"\n");
if (use_shader_logic_op)
if (use_framebuffer_fetch)
{
static constexpr std::array<const char*, 16> logic_op_mode{
"int4(0, 0, 0, 0)", // CLEAR
@ -1113,7 +1115,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
" ocol0.a = float(TevResult.a >> 2) / 63.0;\n"
" \n");
if (use_dual_source || use_shader_blend)
if (use_dual_source || use_framebuffer_fetch)
{
out.Write(" // Dest alpha override (dual source blending)\n"
" // Colors will be blended against the alpha from ocol1 and\n"
@ -1129,7 +1131,7 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
" }}\n");
}
if (use_shader_blend)
if (use_framebuffer_fetch)
{
using Common::EnumMap;
@ -1208,10 +1210,6 @@ ShaderCode GenPixelShader(APIType api_type, const ShaderHostConfig& host_config,
" real_ocol0 = ocol0;\n"
" }}\n");
}
else if (use_framebuffer_fetch)
{
out.Write(" real_ocol0 = ocol0;\n");
}
out.Write("}}\n"
"\n"
@ -1274,10 +1272,14 @@ void EnumeratePixelShaderUids(const std::function<void(const PixelShaderUid&)>&
for (u32 uint_output = 0; uint_output < 2; uint_output++)
{
puid->uint_output = uint_output;
for (u32 no_dual_src = 0; no_dual_src < 2; no_dual_src++)
{
puid->no_dual_src = no_dual_src;
callback(uid);
}
}
}
}
}
}
} // namespace UberShader

View File

@ -18,6 +18,7 @@ struct pixel_ubershader_uid_data
u32 early_depth : 1;
u32 per_pixel_depth : 1;
u32 uint_output : 1;
u32 no_dual_src : 1;
u32 NumValues() const { return sizeof(pixel_ubershader_uid_data); }
};
@ -42,9 +43,9 @@ struct fmt::formatter<UberShader::pixel_ubershader_uid_data>
template <typename FormatContext>
auto format(const UberShader::pixel_ubershader_uid_data& uid, FormatContext& ctx) const
{
return fmt::format_to(ctx.out(), "Pixel UberShader for {} texgens{}{}{}", uid.num_texgens,
uid.early_depth ? ", early-depth" : "",
uid.per_pixel_depth ? ", per-pixel depth" : "",
uid.uint_output ? ", uint output" : "");
return fmt::format_to(
ctx.out(), "Pixel UberShader for {} texgens{}{}{}{}", uid.num_texgens,
uid.early_depth ? ", early-depth" : "", uid.per_pixel_depth ? ", per-pixel depth" : "",
uid.uint_output ? ", uint output" : "", uid.no_dual_src ? ", no dual-source blending" : "");
}
};