GS/HW: Further improve no_rt heuristics

Reduces copies by almost 500 in Crash and Burn, few hundred drawcall
reductions in other games.
This commit is contained in:
Stenzek 2024-01-15 02:35:50 +10:00 committed by Connor McLaughlin
parent 23b72d08d2
commit 8818cd0285
2 changed files with 54 additions and 17 deletions

View File

@ -1210,7 +1210,38 @@ void GSRendererHW::FinishSplitClear()
m_split_clear_color = 0;
}
bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) const
bool GSRendererHW::IsRTWritten()
{
const u32 written_bits = (~m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
const GIFRegALPHA ALPHA = m_context->ALPHA;
return (
// A not masked
(written_bits & 0xFF000000u) != 0) ||
(
// RGB not entirely masked
((written_bits & 0x00FFFFFFu) != 0) &&
// RGB written through no-blending, or blend result being non-zero
(!PRIM->ABE || // not blending
ALPHA.D != 1 || // additive to Cs
(ALPHA.A != ALPHA.B && // left side is not zero
(ALPHA.C == 1 || // multiply by Ad
(ALPHA.C == 2 && ALPHA.FIX != 0) || // multiply by 0
(ALPHA.C == 0 && GetAlphaMinMax().max != 0)))));
}
bool GSRendererHW::IsUsingCsInBlend()
{
const GIFRegALPHA ALPHA = m_context->ALPHA;
const bool blend_zero = (ALPHA.A == ALPHA.B || (ALPHA.C == 2 && ALPHA.FIX == 0) || (ALPHA.C == 0 && GetAlphaMinMax().max == 0));
return (PRIM->ABE && ((ALPHA.IsUsingCs() && !blend_zero) || m_context->ALPHA.D == 0));
}
bool GSRendererHW::IsUsingAsInBlend()
{
return (PRIM->ABE && m_context->ALPHA.IsUsingAs() && GetAlphaMinMax().max != 0);
}
bool GSRendererHW::IsTBPFrameOrZ(u32 tbp)
{
const bool is_frame = (m_cached_ctx.FRAME.Block() == tbp);
const bool is_z = (m_cached_ctx.ZBUF.Block() == tbp);
@ -1222,15 +1253,16 @@ bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) const
const u32 fm_mask = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8));
const bool no_rt = (m_context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1))
|| (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
const bool no_rt = (!IsRTWritten() && !m_cached_ctx.TEST.DATE);
const bool no_ds = (
// Depth is always pass/fail (no read) and write are discarded.
(zm != 0 && m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) ||
// Depth test will always pass
(zm != 0 && m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z) ||
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE));
// Depth is always pass/fail (no read) and write are discarded.
(zm != 0 && m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) ||
// Depth test will always pass
(zm != 0 && m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z) ||
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE)) ||
// No color or Z being written.
(no_rt && zm != 0);
// Relying a lot on the optimizer here... I don't like it.
return (is_frame && !no_rt) || (is_z && !no_ds);
@ -1857,16 +1889,17 @@ void GSRendererHW::Draw()
// 3/ 50cents really draws (0,0,0,128) color and a (0) 24 bits depth
// Note: FF DoC has both buffer at same location but disable the depth test (write?) with ZTE = 0
const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8));
bool no_rt = (context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1))
|| (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
bool no_rt = (!IsRTWritten() && !m_cached_ctx.TEST.DATE);
const bool all_depth_tests_pass =
// Depth is always pass/fail (no read) and write are discarded.
(!m_cached_ctx.TEST.ZTE || m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) ||
// Depth test will always pass
(m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z);
bool no_ds = (zm != 0 && all_depth_tests_pass) ||
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE);
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE) ||
// No color or Z being written.
(no_rt && zm != 0);
// No Z test if no z buffer.
if (no_ds || all_depth_tests_pass)
@ -2259,10 +2292,10 @@ void GSRendererHW::Draw()
tgt = nullptr;
}
const bool possible_shuffle = ((shuffle_target && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) || (m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 && ((m_cached_ctx.TEX0.PSM & 0x6) || m_cached_ctx.FRAME.PSM != m_cached_ctx.TEX0.PSM))) || IsPossibleChannelShuffle();
const bool possible_shuffle = !no_rt && (((shuffle_target && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) || (m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 && ((m_cached_ctx.TEX0.PSM & 0x6) || m_cached_ctx.FRAME.PSM != m_cached_ctx.TEX0.PSM))) || IsPossibleChannelShuffle());
const bool need_aem_color = GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].trbpp <= 24 && GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].pal == 0 && m_context->ALPHA.C == 0 && m_env.TEXA.AEM;
const bool req_color = (!PRIM->ABE || (PRIM->ABE && (m_context->ALPHA.IsUsingCs() || need_aem_color))) && (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0x00FFFFFF)) != (fm_mask & 0x00FFFFFF));
const bool alpha_used = m_context->TEX0.TCC && ((PRIM->ABE && m_context->ALPHA.IsUsingAs()) || (m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > ATST_ALWAYS) || (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0xFF000000)) != (fm_mask & 0xFF000000)));
const bool req_color = (!PRIM->ABE || (PRIM->ABE && (IsUsingCsInBlend() || need_aem_color))) && (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0x00FFFFFF)) != (fm_mask & 0x00FFFFFF));
const bool alpha_used = m_context->TEX0.TCC && ((PRIM->ABE && IsUsingAsInBlend()) || (m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > ATST_ALWAYS) || (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0xFF000000)) != (fm_mask & 0xFF000000)));
const bool req_alpha = (GSUtil::GetChannelMask(m_context->TEX0.PSM) & 0x8) && alpha_used;
// TODO: Be able to send an alpha of 1.0 (blended with vertex alpha maybe?) so we can avoid sending the texture, since we don't always need it.

View File

@ -111,6 +111,10 @@ private:
bool ContinueSplitClear();
void FinishSplitClear();
bool IsRTWritten();
bool IsUsingCsInBlend();
bool IsUsingAsInBlend();
GSVector4i m_r = {};
// We modify some of the context registers to optimize away unnecessary operations.
@ -215,7 +219,7 @@ public:
bool TestChannelShuffle(GSTextureCache::Target* src);
/// Returns true if the specified texture address matches the frame or Z buffer.
bool IsTBPFrameOrZ(u32 tbp) const;
bool IsTBPFrameOrZ(u32 tbp);
/// Offsets the current draw, used for RT-in-RT. Offsets are relative to the *current* FBP, not the new FBP.
void OffsetDraw(s32 fbp_offset, s32 zbp_offset, s32 xoffset, s32 yoffset);