GS: Use 32-bit vertex positions for culling

More accurate, stops it passing through vertices which are off-screen and coordinates overflowed. Differences versus current have been manually verified to be correct.
2023-06-18 01:40:04 +10:00 · 2023-06-18 01:40:04 +10:00 · 2046a9b414
parent c77d8b3709
commit 2046a9b414
5 changed files with 103 additions and 121 deletions
--- a/pcsx2/GS/GSDrawingContext.cpp
+++ b/pcsx2/GS/GSDrawingContext.cpp
@ -79,6 +79,44 @@ static int extend(int uv, int size)
 	return size;
 }

+GSDrawingContext::GSDrawingContext()
+{
+	std::memset(&offset, 0, sizeof(offset));
+
+	Reset();
+}
+
+void GSDrawingContext::Reset()
+{
+	std::memset(&XYOFFSET, 0, sizeof(XYOFFSET));
+	std::memset(&TEX0, 0, sizeof(TEX0));
+	std::memset(&TEX1, 0, sizeof(TEX1));
+	std::memset(&CLAMP, 0, sizeof(CLAMP));
+	std::memset(&MIPTBP1, 0, sizeof(MIPTBP1));
+	std::memset(&MIPTBP2, 0, sizeof(MIPTBP2));
+	std::memset(&SCISSOR, 0, sizeof(SCISSOR));
+	std::memset(&ALPHA, 0, sizeof(ALPHA));
+	std::memset(&TEST, 0, sizeof(TEST));
+	std::memset(&FBA, 0, sizeof(FBA));
+	std::memset(&FRAME, 0, sizeof(FRAME));
+	std::memset(&ZBUF, 0, sizeof(ZBUF));
+}
+
+void GSDrawingContext::UpdateScissor()
+{
+	// Scissor registers are inclusive of the upper bounds.
+	const GSVector4i rscissor = GSVector4i(static_cast<int>(SCISSOR.SCAX0), static_cast<int>(SCISSOR.SCAY0),
+		static_cast<int>(SCISSOR.SCAX1), static_cast<int>(SCISSOR.SCAY1));
+	scissor.in = rscissor + GSVector4i::cxpr(0, 0, 1, 1);
+
+	// Fixed-point scissor min/max, used for rejecting primitives which are entirely outside.
+	scissor.cull = rscissor.sll32(4);
+
+	// Offset applied to vertices for culling, zw is for native resolution culling
+	// We want to round subpixels down, because at least one pixel gets filled per scanline.
+	scissor.xyof = GSVector4i::loadl(&XYOFFSET.U64).xyxy().sub32(GSVector4i::cxpr(0, 0, 15, 15));
+}
+
 GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap) const
 {
 	if (mipmap)
--- a/pcsx2/GS/GSDrawingContext.h
+++ b/pcsx2/GS/GSDrawingContext.h
@ -39,8 +39,8 @@ public:
 	struct
 	{
 		GSVector4i in;
-		GSVector4i ex;
-		GSVector4i ofxy;
+		GSVector4i cull;
+		GSVector4i xyof;
 	} scissor;

 	struct
@ -50,50 +50,11 @@ public:
 		GSPixelOffset4* fzb4;
 	} offset;

-	GSDrawingContext()
-	{
-		memset(&offset, 0, sizeof(offset));
+	GSDrawingContext();

-		Reset();
-	}
+	void Reset();

-	void Reset()
-	{
-		memset(&XYOFFSET, 0, sizeof(XYOFFSET));
-		memset(&TEX0, 0, sizeof(TEX0));
-		memset(&TEX1, 0, sizeof(TEX1));
-		memset(&CLAMP, 0, sizeof(CLAMP));
-		memset(&MIPTBP1, 0, sizeof(MIPTBP1));
-		memset(&MIPTBP2, 0, sizeof(MIPTBP2));
-		memset(&SCISSOR, 0, sizeof(SCISSOR));
-		memset(&ALPHA, 0, sizeof(ALPHA));
-		memset(&TEST, 0, sizeof(TEST));
-		memset(&FBA, 0, sizeof(FBA));
-		memset(&FRAME, 0, sizeof(FRAME));
-		memset(&ZBUF, 0, sizeof(ZBUF));
-	}
-
-	void UpdateScissor()
-	{
-		ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
-
-		scissor.ex.U16[0] = (u16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
-		scissor.ex.U16[1] = (u16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
-		scissor.ex.U16[2] = (u16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
-		scissor.ex.U16[3] = (u16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
-
-		scissor.in = GSVector4i(
-			(int)SCISSOR.SCAX0,
-			(int)SCISSOR.SCAY0,
-			(int)SCISSOR.SCAX1 + 1,
-			(int)SCISSOR.SCAY1 + 1);
-
-		scissor.ofxy = GSVector4i(
-			0x8000,
-			0x8000,
-			(int)XYOFFSET.OFX - 15,
-			(int)XYOFFSET.OFY - 15);
-	}
+	void UpdateScissor();

 	GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap = false) const;

--- a/pcsx2/GS/GSState.cpp
+++ b/pcsx2/GS/GSState.cpp
@ -174,7 +174,7 @@ void GSState::Reset(bool hardware_reset)
 		// after reset (otherwise it'd only ever render 1x1).
 		//
 		if (!hardware_reset && GSConfig.UseHardwareRenderer())
-			m_env.CTXT[i].scissor.ex = GSVector4i::xffffffff();
+			m_env.CTXT[i].scissor.cull = GSVector4i::xffffffff();

 		m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
 		m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
@ -1684,8 +1684,9 @@ void GSState::FlushPrim()
 				{
 					GSVector4i* RESTRICT vert_ptr = (GSVector4i*)&m_vertex.buff[i];
 					GSVector4i v = vert_ptr[1];
-					v = v.xxxx().u16to32().sub32(m_ofxy);
-					GSVector4i::storel(&m_vertex.xy[i & 3], v.blend16<0xf0>(v.sra32(4)).ps32());
+					v = v.xxxx().u16to32().sub32(m_xyof);
+					v = v.blend32<12>(v.sra32(4));
+					m_vertex.xy[i & 3] = v;
 					m_vertex.xy_tail = unused;
 				}
 			}
@ -2664,8 +2665,9 @@ void GSState::UpdateContext()

 void GSState::UpdateScissor()
 {
-	m_scissor = m_context->scissor.ex;
-	m_ofxy = m_context->scissor.ofxy;
+	m_scissor_cull_min = m_context->scissor.cull.xyxy();
+	m_scissor_cull_max = m_context->scissor.cull.zwzw();
+	m_xyof = m_context->scissor.xyof;
 	m_scissor_invalid = !m_context->scissor.in.gt32(m_context->scissor.in.zwzw()).allfalse();
 }

@ -3165,9 +3167,18 @@ __forceinline void GSState::VertexKick(u32 skip)
 	tailptr[0] = new_v0;
 	tailptr[1] = new_v1;

-	const GSVector4i xy = new_v1.xxxx().u16to32().sub32(m_ofxy);
+	// We maintain the X/Y coordinates for the last 4 vertices, as well as the head for triangle fans, so we can compute
+	// the min/max, and cull degenerate triangles, which saves draws in some cases. Why 4? Mod 4 is cheaper than Mod 3.
+	// These vertices are a full vector containing <X_Fixed_Point, Y_Fixed_Point, X_Integer, Y_Integer>. We use the
+	// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
+	// applied, then we split it into the fixed/integer portions.
+	const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
+	const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32(4));
+	m_vertex.xy[xy_tail & 3] = xy;

-	GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
+	// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.
+	if (prim == GS_TRIANGLEFAN && tail == head)
+		m_vertex.xyhead = xy;

 	m_vertex.tail = ++tail;
 	m_vertex.xy_tail = ++xy_tail;
@ -3177,44 +3188,40 @@ __forceinline void GSState::VertexKick(u32 skip)
 	if (m < n)
 		return;

+
 	// Skip draws when scissor is out of range (i.e. bottom-right is less than top-left), since everything will get clipped.
 	skip |= static_cast<u32>(m_scissor_invalid);

-	if (skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
+	GSVector4i pmin, pmax;
+	if (skip == 0)
 	{
-		GSVector4i pmin, pmax;
-
-		const GSVector4i v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
-		const GSVector4i v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
-		const GSVector4i v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
-		const GSVector4i v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
+		const GSVector4i v0 = m_vertex.xy[(xy_tail - 1) & 3];
+		const GSVector4i v1 = m_vertex.xy[(xy_tail - 2) & 3];
+		const GSVector4i v2 = (prim == GS_TRIANGLEFAN) ? m_vertex.xyhead : m_vertex.xy[(xy_tail - 3) & 3];

 		switch (prim)
 		{
 			case GS_POINTLIST:
-				pmin = v2;
-				pmax = v2;
+				pmin = v0;
+				pmax = v0;
 				break;
 			case GS_LINELIST:
 			case GS_LINESTRIP:
 			case GS_SPRITE:
-				pmin = v2.min_i16(v1);
-				pmax = v2.max_i16(v1);
+				pmin = v0.min_i32(v1);
+				pmax = v0.max_i32(v1);
 				break;
 			case GS_TRIANGLELIST:
 			case GS_TRIANGLESTRIP:
-				pmin = v2.min_i16(v1.min_i16(v0));
-				pmax = v2.max_i16(v1.max_i16(v0));
-				break;
 			case GS_TRIANGLEFAN:
-				pmin = v2.min_i16(v1.min_i16(v3));
-				pmax = v2.max_i16(v1.max_i16(v3));
+				pmin = v0.min_i32(v1.min_i32(v2));
+				pmax = v0.max_i32(v1.max_i32(v2));
 				break;
 			default:
 				break;
 		}

-		GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl());
+		GSVector4i test = pmax.lt32(m_scissor_cull_min) | pmin.gt32(m_scissor_cull_max);

 		switch (prim)
 		{
@ -3222,10 +3229,14 @@ __forceinline void GSState::VertexKick(u32 skip)
 			case GS_TRIANGLESTRIP:
 			case GS_TRIANGLEFAN:
 			case GS_SPRITE:
-				// Discard degenerate triangles. For native resolution, we can ignore the subpixel bits,
-				// because at the boundaries, they're irrelevant.
-				test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
-				break;
+			{
+				// Discard degenerate triangles which don't cover at least one pixel. Since the vertices are in native
+				// resolution space, we can use the integer locations. When upscaling, we can't, because a primitive which
+				// does not span a single pixel at 1x may span multiple pixels at higher resolutions.
+				const GSVector4i degen_test = pmin.eq32(pmax);
+				test |= m_nativeres ? degen_test.zwzw() : degen_test;
+			}
+			break;
 			default:
 				break;
 		}
@ -3234,18 +3245,15 @@ __forceinline void GSState::VertexKick(u32 skip)
 		{
 			case GS_TRIANGLELIST:
 			case GS_TRIANGLESTRIP:
-				// TODO: any way to do a 16-bit integer cross product?
-				// cross product is zero most of the time because either of the vertices are the same
-				test = (test | v0 == v1) | (v1 == v2 | v0 == v2);
-				break;
 			case GS_TRIANGLEFAN:
-				test = (test | v3 == v1) | (v1 == v2 | v3 == v2);
+				test = (test | v0.eq64(v1)) | (v1.eq64(v2) | v0.eq64(v2));
 				break;
 			default:
 				break;
 		}

-		skip |= test.mask() & 15;
+		// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
+		skip |= test.mask() & 0xff;
 	}

 	if (skip != 0)
@ -3365,46 +3373,14 @@ __forceinline void GSState::VertexKick(u32 skip)
 			__assume(0);
 	}

-	{
-		const GSVector4i voffset(GSVector4i::loadl(&m_context->XYOFFSET));
-
-		auto get_vertex = [&](u32 i) {
-			GSVector4i v(GSVector4i::loadl(&m_vertex.buff[m_index.buff[(m_index.tail - n) + (i)]].XYZ));
-			v = v.upl16(); // 16->32
-			v = v.sub32(voffset); // -= (OFX, OFY)
-			v = v.sra32(4); // >> 4
-			return v;
-		};
-
-		const GSVector4i xy0(get_vertex(0));
-		GSVector4i min, max;
-		if (m_vertex.tail == n)
-		{
-			min = xy0;
-			max = xy0;
-		}
-		else
-		{
-			min = temp_draw_rect.min_i32(xy0);
-			max = temp_draw_rect.zwzw().max_i32(xy0);
-		}
-
-		if constexpr (n > 1)
-		{
-			const GSVector4i xy1(get_vertex(1));
-			min = min.min_i32(xy1);
-			max = max.max_i32(xy1);
-
-			if constexpr (n > 2)
-			{
-				const GSVector4i xy2(get_vertex(2));
-				min = min.min_i32(xy2);
-				max = max.max_i32(xy2);
-			}
-		}
-
-		temp_draw_rect = min.upl64(max).rintersect(m_context->scissor.in);
-	}
+	// Update rectangle for the current draw. We can use the re-integer coordinates from min/max here.
+	const GSVector4i draw_min = pmin.zwzw();
+	const GSVector4i draw_max = pmax;
+	if (m_vertex.tail != n)
+		temp_draw_rect = temp_draw_rect.min_i32(draw_min).blend32<12>(temp_draw_rect.max_i32(draw_max));
+	else
+		temp_draw_rect = draw_min.blend32<12>(draw_max);
+	temp_draw_rect = temp_draw_rect.rintersect(m_context->scissor.in);

 	CLUTAutoFlush(prim);

--- a/pcsx2/GS/GSState.h
+++ b/pcsx2/GS/GSState.h
@ -141,15 +141,17 @@ private:
 protected:
 	GSVertex m_v = {};
 	float m_q = 1.0f;
-	GSVector4i m_scissor = {};
-	GSVector4i m_ofxy = {};
+	GSVector4i m_scissor_cull_min = {};
+	GSVector4i m_scissor_cull_max = {};
+	GSVector4i m_xyof = {};

 	struct
 	{
 		GSVertex* buff;
 		u32 head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
 		u32 xy_tail;
-		u64 xy[4];
+		GSVector4i xy[4];
+		GSVector4i xyhead;
 	} m_vertex = {};

 	struct
--- a/pcsx2/GS/GSVector4i.h
+++ b/pcsx2/GS/GSVector4i.h
@ -969,6 +969,11 @@ public:
 		return GSVector4i(_mm_cmpeq_epi32(m, v.m));
 	}

+	__forceinline GSVector4i eq64(const GSVector4i& v) const
+	{
+		return GSVector4i(_mm_cmpeq_epi64(m, v.m));
+	}
+
 	__forceinline GSVector4i neq8(const GSVector4i& v) const
 	{
 		return ~eq8(v);