From 8818cd0285ddaa3773526757bb07c045b28fae7e Mon Sep 17 00:00:00 2001 From: Stenzek Date: Mon, 15 Jan 2024 02:35:50 +1000 Subject: [PATCH] GS/HW: Further improve no_rt heuristics Reduces copies by almost 500 in Crash and Burn, few hundred drawcall reductions in other games. --- pcsx2/GS/Renderers/HW/GSRendererHW.cpp | 65 +++++++++++++++++++------- pcsx2/GS/Renderers/HW/GSRendererHW.h | 6 ++- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index 7afc5a92fa..ed21e47910 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -1210,7 +1210,38 @@ void GSRendererHW::FinishSplitClear() m_split_clear_color = 0; } -bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) const +bool GSRendererHW::IsRTWritten() +{ + const u32 written_bits = (~m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk); + const GIFRegALPHA ALPHA = m_context->ALPHA; + return ( + // A not masked + (written_bits & 0xFF000000u) != 0) || + ( + // RGB not entirely masked + ((written_bits & 0x00FFFFFFu) != 0) && + // RGB written through no-blending, or blend result being non-zero + (!PRIM->ABE || // not blending + ALPHA.D != 1 || // additive to Cs + (ALPHA.A != ALPHA.B && // left side is not zero + (ALPHA.C == 1 || // multiply by Ad + (ALPHA.C == 2 && ALPHA.FIX != 0) || // multiply by non-zero FIX + (ALPHA.C == 0 && GetAlphaMinMax().max != 0))))); +} + +bool GSRendererHW::IsUsingCsInBlend() +{ + const GIFRegALPHA ALPHA = m_context->ALPHA; + const bool blend_zero = (ALPHA.A == ALPHA.B || (ALPHA.C == 2 && ALPHA.FIX == 0) || (ALPHA.C == 0 && GetAlphaMinMax().max == 0)); + return (PRIM->ABE && ((ALPHA.IsUsingCs() && !blend_zero) || m_context->ALPHA.D == 0)); +} + +bool GSRendererHW::IsUsingAsInBlend() +{ + return (PRIM->ABE && m_context->ALPHA.IsUsingAs() && GetAlphaMinMax().max != 0); +} + +bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) { const bool is_frame = 
(m_cached_ctx.FRAME.Block() == tbp); const bool is_z = (m_cached_ctx.ZBUF.Block() == tbp); @@ -1222,15 +1253,16 @@ bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) const const u32 fm_mask = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk; const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8)); - const bool no_rt = (m_context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1)) - || (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk); + const bool no_rt = (!IsRTWritten() && !m_cached_ctx.TEST.DATE); const bool no_ds = ( - // Depth is always pass/fail (no read) and write are discarded. - (zm != 0 && m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) || - // Depth test will always pass - (zm != 0 && m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z) || - // Depth will be written through the RT - (!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE)); + // Depth is always pass/fail (no read) and write are discarded. + (zm != 0 && m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) || + // Depth test will always pass + (zm != 0 && m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z) || + // Depth will be written through the RT + (!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE)) || + // No color or Z being written. + (no_rt && zm != 0); // Relying a lot on the optimizer here... I don't like it. return (is_frame && !no_rt) || (is_z && !no_ds); @@ -1857,16 +1889,17 @@ void GSRendererHW::Draw() // 3/ 50cents really draws (0,0,0,128) color and a (0) 24 bits depth // Note: FF DoC has both buffer at same location but disable the depth test (write?) 
with ZTE = 0 const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8)); - bool no_rt = (context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1)) - || (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk); + bool no_rt = (!IsRTWritten() && !m_cached_ctx.TEST.DATE); const bool all_depth_tests_pass = // Depth is always pass/fail (no read) and write are discarded. (!m_cached_ctx.TEST.ZTE || m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) || // Depth test will always pass (m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z); bool no_ds = (zm != 0 && all_depth_tests_pass) || - // Depth will be written through the RT - (!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE); + // Depth will be written through the RT + (!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE) || + // No color or Z being written. + (no_rt && zm != 0); // No Z test if no z buffer. 
if (no_ds || all_depth_tests_pass) @@ -2259,10 +2292,10 @@ void GSRendererHW::Draw() tgt = nullptr; } - const bool possible_shuffle = ((shuffle_target && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) || (m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 && ((m_cached_ctx.TEX0.PSM & 0x6) || m_cached_ctx.FRAME.PSM != m_cached_ctx.TEX0.PSM))) || IsPossibleChannelShuffle(); + const bool possible_shuffle = !no_rt && (((shuffle_target && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) || (m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 && ((m_cached_ctx.TEX0.PSM & 0x6) || m_cached_ctx.FRAME.PSM != m_cached_ctx.TEX0.PSM))) || IsPossibleChannelShuffle()); const bool need_aem_color = GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].trbpp <= 24 && GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].pal == 0 && m_context->ALPHA.C == 0 && m_env.TEXA.AEM; - const bool req_color = (!PRIM->ABE || (PRIM->ABE && (m_context->ALPHA.IsUsingCs() || need_aem_color))) && (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0x00FFFFFF)) != (fm_mask & 0x00FFFFFF)); - const bool alpha_used = m_context->TEX0.TCC && ((PRIM->ABE && m_context->ALPHA.IsUsingAs()) || (m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > ATST_ALWAYS) || (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0xFF000000)) != (fm_mask & 0xFF000000))); + const bool req_color = (!PRIM->ABE || (PRIM->ABE && (IsUsingCsInBlend() || need_aem_color))) && (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0x00FFFFFF)) != (fm_mask & 0x00FFFFFF)); + const bool alpha_used = m_context->TEX0.TCC && ((PRIM->ABE && IsUsingAsInBlend()) || (m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > ATST_ALWAYS) || (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0xFF000000)) != (fm_mask & 0xFF000000))); const bool req_alpha = (GSUtil::GetChannelMask(m_context->TEX0.PSM) & 0x8) && alpha_used; // TODO: Be able to send an alpha of 1.0 (blended with vertex alpha maybe?) 
so we can avoid sending the texture, since we don't always need it. diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.h b/pcsx2/GS/Renderers/HW/GSRendererHW.h index f55cf38936..7c0175df62 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.h +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.h @@ -111,6 +111,10 @@ private: bool ContinueSplitClear(); void FinishSplitClear(); + bool IsRTWritten(); + bool IsUsingCsInBlend(); + bool IsUsingAsInBlend(); + GSVector4i m_r = {}; // We modify some of the context registers to optimize away unnecessary operations. @@ -215,7 +219,7 @@ public: bool TestChannelShuffle(GSTextureCache::Target* src); /// Returns true if the specified texture address matches the frame or Z buffer. - bool IsTBPFrameOrZ(u32 tbp) const; + bool IsTBPFrameOrZ(u32 tbp); /// Offsets the current draw, used for RT-in-RT. Offsets are relative to the *current* FBP, not the new FBP. void OffsetDraw(s32 fbp_offset, s32 zbp_offset, s32 xoffset, s32 yoffset);