From 9084ef35b47ff816cc046b0559ec17d966b6d00a Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Mon, 16 May 2022 16:57:06 -0500 Subject: [PATCH] GS:SW: Restore zequal for optimization --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 27 +++++++++++++++---- .../SW/GSDrawScanlineCodeGenerator.all.cpp | 20 +++++++++++--- pcsx2/GS/Renderers/SW/GSRendererSW.cpp | 1 + pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 1 + 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index a0df154aa7..02fd0203ce 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -98,6 +98,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) sel.fb = m_global.sel.fb; sel.zb = m_global.sel.zb; sel.zoverflow = m_global.sel.zoverflow; + sel.zequal = m_global.sel.zequal; sel.notest = m_global.sel.notest; m_sp = m_sp_map[sel]; @@ -161,7 +162,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons } } - if (has_z) + if (has_z && !sel.zequal) { const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z); const VectorF dzf(static_cast<float>(dscan.p.F64[1])); @@ -357,8 +358,20 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.zb) { VectorF zbase = VectorF::broadcast64(&scan.p.z); - z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0])); - z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2])); + if (sel.zequal) + { +#if _M_SSE >= 0x501 + z0 = GSVector8::cast(GSVector8i::broadcast32(zbase.extract<0>().f64toi32())); +#else + z0 = GSVector4::cast(zbase.f64toi32()); + z0 = z0.upld(z0); +#endif + } + else + { + z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0])); + z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2])); + } } } @@ -449,7 +462,11 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.prim != GS_SPRITE_CLASS) { - if 
(sel.zoverflow) + if (sel.zequal) + { + zs = VectorI::cast(z0); + } + else if (sel.zoverflow) { // SSE only has double to int32 conversion, no double to uint32 // Work around this by subtracting 0x80000000 before converting, then adding it back after @@ -1492,7 +1509,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.prim != GS_SPRITE_CLASS) { - if (sel.zb) + if (sel.zb && !sel.zequal) { #if _M_SSE >= 0x501 GSVector8 add = GSVector8::broadcast64(&local.d8.p.z); diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index e47d93ee80..848ba98c47 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -720,7 +720,17 @@ void GSDrawScanlineCodeGenerator2::Init() paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]); } - if (m_sel.zb) + if (m_sel.zb && m_sel.zequal) + { + Xmm zx(_z.getIdx()); + cvttsd2si(rax, ptr[a3 + offsetof(GSVertexSW, p.z)]); + movd(zx, eax); + if (hasAVX2) + vpbroadcastd(_z, zx); + else + pshufd(_z, _z, _MM_SHUFFLE(0, 0, 0, 0)); + } + else if (m_sel.zb) { // z = vp.zzzz() + m_local.d[skip].z; broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z @@ -908,7 +918,7 @@ void GSDrawScanlineCodeGenerator2::Step() { // z += m_local.d4.z; - if (m_sel.zb) + if (m_sel.zb && !m_sel.zequal) { broadcastsd(xym7, _rip_local_d_p(z)); addpd(_z, xym7); @@ -1054,7 +1064,11 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) if (m_sel.prim != GS_SPRITE_CLASS) { - if (m_sel.zoverflow) + if (m_sel.zequal) + { + movdqa(xym0, _z); + } + else if (m_sel.zoverflow) { // GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(); // GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(); diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp index 957baaec15..9b08a5d9c4 100644 --- 
a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp @@ -1288,6 +1288,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; + gd.sel.zequal = !!m_vt.m_eq.z; gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U; gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max; } diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index f9250e48ff..a6696a6473 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -64,6 +64,7 @@ union GSScanlineSelector u32 mmin : 2; // 54 u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction + u32 zequal : 1; // 56 u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program };