GS:SW: Restore zequal for optimization

This commit is contained in:
TellowKrinkle 2022-05-16 16:57:06 -05:00 committed by tellowkrinkle
parent e58b1054ea
commit 9084ef35b4
4 changed files with 41 additions and 8 deletions

View File

@ -98,6 +98,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.fb = m_global.sel.fb; sel.fb = m_global.sel.fb;
sel.zb = m_global.sel.zb; sel.zb = m_global.sel.zb;
sel.zoverflow = m_global.sel.zoverflow; sel.zoverflow = m_global.sel.zoverflow;
sel.zequal = m_global.sel.zequal;
sel.notest = m_global.sel.notest; sel.notest = m_global.sel.notest;
m_sp = m_sp_map[sel]; m_sp = m_sp_map[sel];
@ -161,7 +162,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
} }
} }
if (has_z) if (has_z && !sel.zequal)
{ {
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z); const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
const VectorF dzf(static_cast<float>(dscan.p.F64[1])); const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
@ -357,8 +358,20 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.zb) if (sel.zb)
{ {
VectorF zbase = VectorF::broadcast64(&scan.p.z); VectorF zbase = VectorF::broadcast64(&scan.p.z);
z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0])); if (sel.zequal)
z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2])); {
#if _M_SSE >= 0x501
z0 = GSVector8::cast(GSVector8i::broadcast32(zbase.extract<0>().f64toi32()));
#else
z0 = GSVector4::cast(zbase.f64toi32());
z0 = z0.upld(z0);
#endif
}
else
{
z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0]));
z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2]));
}
} }
} }
@ -449,7 +462,11 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.prim != GS_SPRITE_CLASS) if (sel.prim != GS_SPRITE_CLASS)
{ {
if (sel.zoverflow) if (sel.zequal)
{
zs = VectorI::cast(z0);
}
else if (sel.zoverflow)
{ {
// SSE only has double to int32 conversion, no double to uint32 // SSE only has double to int32 conversion, no double to uint32
// Work around this by subtracting 0x80000000 before converting, then adding it back after // Work around this by subtracting 0x80000000 before converting, then adding it back after
@ -1492,7 +1509,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.prim != GS_SPRITE_CLASS) if (sel.prim != GS_SPRITE_CLASS)
{ {
if (sel.zb) if (sel.zb && !sel.zequal)
{ {
#if _M_SSE >= 0x501 #if _M_SSE >= 0x501
GSVector8 add = GSVector8::broadcast64(&local.d8.p.z); GSVector8 add = GSVector8::broadcast64(&local.d8.p.z);

View File

@ -720,7 +720,17 @@ void GSDrawScanlineCodeGenerator2::Init()
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]); paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
} }
if (m_sel.zb) if (m_sel.zb && m_sel.zequal)
{
Xmm zx(_z.getIdx());
cvttsd2si(rax, ptr[a3 + offsetof(GSVertexSW, p.z)]);
movd(zx, eax);
if (hasAVX2)
vpbroadcastd(_z, zx);
else
pshufd(_z, _z, _MM_SHUFFLE(0, 0, 0, 0));
}
else if (m_sel.zb)
{ {
// z = vp.zzzz() + m_local.d[skip].z; // z = vp.zzzz() + m_local.d[skip].z;
broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
@ -908,7 +918,7 @@ void GSDrawScanlineCodeGenerator2::Step()
{ {
// z += m_local.d4.z; // z += m_local.d4.z;
if (m_sel.zb) if (m_sel.zb && !m_sel.zequal)
{ {
broadcastsd(xym7, _rip_local_d_p(z)); broadcastsd(xym7, _rip_local_d_p(z));
addpd(_z, xym7); addpd(_z, xym7);
@ -1054,7 +1064,11 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
if (m_sel.prim != GS_SPRITE_CLASS) if (m_sel.prim != GS_SPRITE_CLASS)
{ {
if (m_sel.zoverflow) if (m_sel.zequal)
{
movdqa(xym0, _z);
}
else if (m_sel.zoverflow)
{ {
// GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(); // GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32();
// GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(); // GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32();

View File

@ -1288,6 +1288,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
gd.sel.zequal = !!m_vt.m_eq.z;
gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U; gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max; gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
} }

View File

@ -64,6 +64,7 @@ union GSScanlineSelector
u32 mmin : 2; // 54 u32 mmin : 2; // 54
u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
u32 zequal : 1; // 56
u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
}; };