mirror of https://github.com/PCSX2/pcsx2.git
GS:SW: Restore zequal for optimization
This commit is contained in:
parent
e58b1054ea
commit
9084ef35b4
|
@ -98,6 +98,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
|
|||
sel.fb = m_global.sel.fb;
|
||||
sel.zb = m_global.sel.zb;
|
||||
sel.zoverflow = m_global.sel.zoverflow;
|
||||
sel.zequal = m_global.sel.zequal;
|
||||
sel.notest = m_global.sel.notest;
|
||||
|
||||
m_sp = m_sp_map[sel];
|
||||
|
@ -161,7 +162,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
}
|
||||
}
|
||||
|
||||
if (has_z)
|
||||
if (has_z && !sel.zequal)
|
||||
{
|
||||
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
|
||||
const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
|
||||
|
@ -357,10 +358,22 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
if (sel.zb)
|
||||
{
|
||||
VectorF zbase = VectorF::broadcast64(&scan.p.z);
|
||||
if (sel.zequal)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
z0 = GSVector8::cast(GSVector8i::broadcast32(zbase.extract<0>().f64toi32()));
|
||||
#else
|
||||
z0 = GSVector4::cast(zbase.f64toi32());
|
||||
z0 = z0.upld(z0);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
z0 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[0]));
|
||||
z1 = zbase.add64(VectorF::f32to64(&local.d[skip].z.F32[vlen/2]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sel.fb)
|
||||
{
|
||||
|
@ -449,7 +462,11 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
|
||||
if (sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
if (sel.zoverflow)
|
||||
if (sel.zequal)
|
||||
{
|
||||
zs = VectorI::cast(z0);
|
||||
}
|
||||
else if (sel.zoverflow)
|
||||
{
|
||||
// SSE only has double to int32 conversion, no double to uint32
|
||||
// Work around this by subtracting 0x80000000 before converting, then adding it back after
|
||||
|
@ -1492,7 +1509,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
|
||||
if (sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
if (sel.zb)
|
||||
if (sel.zb && !sel.zequal)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
GSVector8 add = GSVector8::broadcast64(&local.d8.p.z);
|
||||
|
|
|
@ -720,7 +720,17 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
|
||||
}
|
||||
|
||||
if (m_sel.zb)
|
||||
if (m_sel.zb && m_sel.zequal)
|
||||
{
|
||||
Xmm zx(_z.getIdx());
|
||||
cvttsd2si(rax, ptr[a3 + offsetof(GSVertexSW, p.z)]);
|
||||
movd(zx, eax);
|
||||
if (hasAVX2)
|
||||
vpbroadcastd(_z, zx);
|
||||
else
|
||||
pshufd(_z, _z, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
}
|
||||
else if (m_sel.zb)
|
||||
{
|
||||
// z = vp.zzzz() + m_local.d[skip].z;
|
||||
broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
|
||||
|
@ -908,7 +918,7 @@ void GSDrawScanlineCodeGenerator2::Step()
|
|||
{
|
||||
// z += m_local.d4.z;
|
||||
|
||||
if (m_sel.zb)
|
||||
if (m_sel.zb && !m_sel.zequal)
|
||||
{
|
||||
broadcastsd(xym7, _rip_local_d_p(z));
|
||||
addpd(_z, xym7);
|
||||
|
@ -1054,7 +1064,11 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
|
|||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
if (m_sel.zoverflow)
|
||||
if (m_sel.zequal)
|
||||
{
|
||||
movdqa(xym0, _z);
|
||||
}
|
||||
else if (m_sel.zoverflow)
|
||||
{
|
||||
// GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32();
|
||||
// GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32();
|
||||
|
|
|
@ -1288,6 +1288,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
|||
|
||||
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
|
||||
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
|
||||
gd.sel.zequal = !!m_vt.m_eq.z;
|
||||
gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
|
||||
gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
|
||||
}
|
||||
|
|
|
@ -64,6 +64,7 @@ union GSScanlineSelector
|
|||
u32 mmin : 2; // 54
|
||||
u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
|
||||
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
|
||||
u32 zequal : 1; // 56
|
||||
u32 breakpoint : 1; // Inserts a trap to stop the program; helpful for making the debugger stop on a program
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue