From 435344f099a7fa7b12e5906bde37d4b9f3ad9395 Mon Sep 17 00:00:00 2001
From: TJnotJT <tjnotjt.github@gmail.com>
Date: Sun, 2 Feb 2025 00:24:47 -0500
Subject: [PATCH] GS: Handle huge/infinite/nan ST coords in GS vertex input.

---
 pcsx2/GS/GSState.cpp                          | 324 +++++++++++++++---
 pcsx2/GS/GSState.h                            |   3 +
 pcsx2/GS/GSVector4.h                          |  15 +
 pcsx2/GS/GSVector4_arm64.h                    |  15 +
 pcsx2/GS/Renderers/Common/GSVertexTrace.cpp   |   9 +-
 .../GS/Renderers/Common/GSVertexTraceFMM.cpp  |  15 +-
 6 files changed, 324 insertions(+), 57 deletions(-)
diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp
index c664204864..a3c62bf723 100644
--- a/pcsx2/GS/GSState.cpp
+++ b/pcsx2/GS/GSState.cpp
@@ -1595,13 +1595,6 @@ inline bool GSState::TestDrawChanged()
 	return false;
 }
 
-u32 GSState::CalcMask(int exp, int max_exp)
-{
-	const int amount = 9 + (max_exp - exp);
-
-	return (1 << std::min(amount, 23)) - 1;
-}
-
 void GSState::FlushPrim()
 {
 	if (m_index.tail > 0)
@@ -1675,50 +1668,19 @@ void GSState::FlushPrim()
 		}
 #endif
 
+
 		m_vt.Update(m_vertex.buff, m_index.buff, m_vertex.tail, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
-
-		// Texel coordinate rounding
-		// Helps Manhunt (lights shining through objects).
-		// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
-		// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
-		if (PRIM->TME && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
+		
+		// Fix huge or nan ST coordinates
+		if (PRIM->TME && !PRIM->FST)
 		{
-			if (!PRIM->FST) // STQ's
-			{
-				const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
-				// ST's have the lowest 9 bits (or greater depending on exponent difference) rounding down (from hardware tests).
-				for (int i = m_index.tail - 1; i >= 0; i--)
-				{
-					GSVertex* v = &m_vertex.buff[m_index.buff[i]];
+			FixHugeSTCoords();	
+		}
 
-					// Only Q on the second vertex is valid
-					if (!(i & 1) && is_sprite)
-						v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;
-
-					int T = std::bit_cast<int>(v->ST.T);
-					int Q = std::bit_cast<int>(v->RGBAQ.Q);
-					int S = std::bit_cast<int>(v->ST.S);
-					const int expS = (S >> 23) & 0xff;
-					const int expT = (T >> 23) & 0xff;
-					const int expQ = (Q >> 23) & 0xff;
-					int max_exp = std::max(expS, expQ);
-
-					u32 mask = CalcMask(expS, max_exp);
-					S &= ~mask;
-					v->ST.S = std::bit_cast<float>(S);
-					max_exp = std::max(expT, expQ);
-					mask = CalcMask(expT, max_exp);
-					T &= ~mask;
-					v->ST.T = std::bit_cast<float>(T);
-					Q &= ~0xff;
-
-					if (!is_sprite || (i & 1))
-						v->RGBAQ.Q = std::bit_cast<float>(Q);
-
-					m_vt.m_min.t.x = std::min(m_vt.m_min.t.x, (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW));
-					m_vt.m_min.t.y = std::min(m_vt.m_min.t.y, (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH));
-				}
-			}
+		// Round fractional parts of ST coords
+		if (PRIM->TME && !PRIM->FST && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
+		{
+			RoundSTCoords();
 		}
 
 		// Skip draw if Z test is enabled, but set to fail all pixels.
@@ -3831,8 +3793,8 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
 
 	u8 uses_border = 0;
 
-	if (m_vt.m_max.t.x >= FLT_MAX || m_vt.m_min.t.x <= -FLT_MAX ||
-		m_vt.m_max.t.y >= FLT_MAX || m_vt.m_min.t.y <= -FLT_MAX)
+	if (m_vt.m_max.t.x >= 2047.0f || m_vt.m_min.t.x <= -2047.0f ||
+		m_vt.m_max.t.y >= 2047.0f || m_vt.m_min.t.y <= -2047.0f)
 	{
 		// If any of the min/max values are +-FLT_MAX we can't rely on them
 		// so just assume full texture.
@@ -4009,6 +3971,268 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
 	return { vr, uses_border };
 }
 
+// ST coordinate rounding
+// Helps Manhunt (lights shining through objects).
+// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
+// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
+void GSState::RoundSTCoords()
+{
+	const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
+	
+	// ST's have the lowest 9 bits (or greater depending on exponent difference) rounded down (from hardware tests).
+	// This gives the bitmask for the lower 9 (or more) bits.
+	auto LowerBitsMask = [](int exp, int max_exp)
+	{
+		const int amount = 9 + (max_exp - exp);
+		return (1 << std::min(amount, 23)) - 1;
+	};
+
+	for (int i = m_index.tail - 1; i >= 0; i--)
+	{
+		GSVertex* v = &m_vertex.buff[m_index.buff[i]];
+
+		// Only Q on the second vertex is valid
+		if (!(i & 1) && is_sprite)
+			v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;
+
+		int S = std::bit_cast<int>(v->ST.S);
+		int T = std::bit_cast<int>(v->ST.T);
+		int Q = std::bit_cast<int>(v->RGBAQ.Q);
+		
+		const int expS = (S >> 23) & 0xff;
+		const int expT = (T >> 23) & 0xff;
+		const int expQ = (Q >> 23) & 0xff;
+		
+		S &= ~LowerBitsMask(expS, std::max(expS, expQ));
+		T &= ~LowerBitsMask(expT, std::max(expT, expQ));
+		Q &= ~0xff; // Q gets truncated less than ST by hardware tests
+
+		v->ST.S = std::bit_cast<float>(S);
+		v->ST.T = std::bit_cast<float>(T);
+
+		if (!is_sprite || (i & 1))
+			v->RGBAQ.Q = std::bit_cast<float>(Q);
+
+		const float U = (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW);
+		const float V = (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH);
+		const float Qf = std::bit_cast<float>(Q);
+
+		const GSVector4 uvq(U, V, Qf, Qf);
+
+		// Do min/max with only those values that are not NaN
+		m_vt.m_min.t = m_vt.m_min.t.blend32(m_vt.m_min.t.min(uvq), uvq.notnan());
+		m_vt.m_max.t = m_vt.m_max.t.blend32(m_vt.m_max.t.max(uvq), uvq.notnan());
+	}
+
+	// Clamp the min/max UV values to the min/max valid UV values.
+	m_vt.m_min.t = m_vt.m_min.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_min.t);
+	m_vt.m_max.t = m_vt.m_max.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_max.t);
+}
+
+// Handle the huge ST coords in by culling primitives with NaN coords and
+// replacing the primitives with huge coords with a new one that has the huge coordinate replaced with +/- 2047.
+// This is based on hardware test that show that seem to show that ST coordinate get clamped to +/- 2047
+// (perhaps before applying repeat or region repeat).
+// Note that the huge texture coords may be a symptom of floating point issues upstream in the EE and
+// it would be better to have them fixed there; this is a bandaid.
+void GSState::FixHugeSTCoords()
+{
+	bool sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_SPRITE_CLASS;
+	switch (GSUtil::GetClassVertexCount(GSUtil::GetPrimClass(PRIM->PRIM)))
+	{
+		case 1:
+			if (sprite)
+				FixHugeSTCoordsImpl<1, true>();
+			else
+				FixHugeSTCoordsImpl<1, false>();
+			break;
+		case 2:
+			if (sprite)
+				FixHugeSTCoordsImpl<2, true>();
+			else
+				FixHugeSTCoordsImpl<2, false>();
+			break;
+		case 3:
+			if (sprite)
+				FixHugeSTCoordsImpl<3, true>();
+			else
+				FixHugeSTCoordsImpl<3, false>();
+			break;
+		default:
+			pxFail("Impossible");
+	}
+}
+
+template <u32 n, bool sprite> void GSState::FixHugeSTCoordsImpl()
+{
+	GSVertex* const vertex = m_vertex.buff;
+	u16* const index = m_index.buff;
+
+	u32 new_index_tail = 0;
+
+	constexpr float huge = 1e10f; // arbitrary large value
+
+	const float tex_width = 1 << m_context->TEX0.TW;
+	const float tex_height = 1 << m_context->TEX0.TH;
+
+	bool new_prims = false; // Did we generate new primitives?
+
+	for (u32 i = 0; i < m_index.tail; i += n)
+	{
+		bool nan_s = false;
+		bool nan_t = false;
+		bool huge_s_pos = false;
+		bool huge_s_neg = false;
+		bool huge_t_pos = false;
+		bool huge_t_neg = false;
+
+		if (sprite)
+		{
+			// Sprites behave as if both Qs are same as the second one
+			const float s0 = vertex[index[i + 0]].ST.S / vertex[index[i + 1]].RGBAQ.Q;
+			const float t0 = vertex[index[i + 0]].ST.T / vertex[index[i + 1]].RGBAQ.Q;
+			const float s1 = vertex[index[i + 1]].ST.S / vertex[index[i + 1]].RGBAQ.Q;
+			const float t1 = vertex[index[i + 1]].ST.T / vertex[index[i + 1]].RGBAQ.Q;
+			nan_s = std::isnan(s0) || std::isnan(s1);
+			nan_t = std::isnan(t0) || std::isnan(t1);
+			huge_s_pos = s0 > huge || s1 > huge;
+			huge_s_neg = s0 < -huge || s1 < -huge;
+			huge_t_pos = t0 > huge || t1 > huge;
+			huge_t_neg = t0 < -huge || t1 < -huge;
+		}
+		else
+		{
+			for (u32 j = 0; j < n; j++)
+			{
+				const float s = vertex[index[i + j]].ST.S / vertex[index[i + j]].RGBAQ.Q;
+				const float t = vertex[index[i + j]].ST.T / vertex[index[i + j]].RGBAQ.Q;
+				nan_s |= std::isnan(s);
+				nan_t |= std::isnan(t);
+				huge_s_pos |= s > huge;
+				huge_t_pos |= t > huge;
+				huge_s_neg |= s < -huge;
+				huge_t_neg |= t < -huge;
+			}
+		}
+
+		// ambiguous = true would probably result in NaN in the SW rasterizer or something undefined in HW.
+		// PS2 does not have NaN so there is no really accurate way to emulate this.
+		// huge = true and ambiguous = false seems to have well-defined behavior on the PS2:
+		// it clamps huge values to +/-2047 in UV coordinates space. We try to approximate this by
+		// giving ST the values that would result in exactly +/-2047 across the primitive.
+		const bool ambiguous = nan_s || nan_t || (huge_s_pos && huge_s_neg) || (huge_s_pos && huge_s_neg);
+		const bool huge = huge_s_pos || huge_t_pos || huge_s_neg || huge_t_neg;
+
+		if (ambiguous)
+		{
+			// Cull the primitive by not saving the indices
+			continue;
+		}
+
+		if (huge)
+		{
+			// Add new vertices to replace the primitive with another primitive with clamped values.
+			new_prims = true;
+
+			if (sprite)
+			{
+				// Handle sprite separately since it uses the second Q for both vertices
+				GSVertex v_new0 = vertex[index[i + 0]];
+				GSVertex v_new1 = vertex[index[i + 1]];
+
+				// Try to set values so that we get constant UV +/-2047 across the entire triangle after interpolation
+				// Sprites behave as if both Qs are same as the second one
+				if (huge_s_pos)
+				{
+					v_new1.ST.S = v_new0.ST.S = 2047.0f * v_new1.RGBAQ.Q / tex_width;
+				}
+				else if (huge_s_neg)
+				{
+					v_new1.ST.S = v_new0.ST.S = -2047.0f * v_new1.RGBAQ.Q / tex_width;
+				}
+
+				if (huge_t_pos)
+				{
+					v_new1.ST.T = v_new0.ST.T = 2047.0f * v_new1.RGBAQ.Q / tex_height;
+				}
+				else if (huge_t_neg)
+				{
+					v_new1.ST.T = v_new0.ST.T = -2047.0f * v_new1.RGBAQ.Q / tex_height;
+				}
+
+				// Copy old values to tail of vertex buffer.
+				// The vertex buffer is allocated so that there is always at least room for 3 new vertices at the end.
+				vertex[m_vertex.tail + 0] = v_new0;
+				vertex[m_vertex.tail + 1] = v_new1;
+
+				// Make new indices point to new vertices
+				index[new_index_tail + 0] = m_vertex.tail + 0;
+				index[new_index_tail + 1] = m_vertex.tail + 1;
+			}
+			else
+			{
+				// Copy old values to tail of vertex buffer.
+				// The vertex buffer is allocated so that there is always at least room for 3 new vertices at the end.
+				for (u32 j = 0; j < n; j++)
+					vertex[m_vertex.tail + j] = vertex[index[i + j]];
+
+				// Try to set values so that we get constant UV +/-2047 across the entire primitive after interpolation
+				if (huge_s_pos)
+				{
+					for (u32 j = 0; j < n; j++)
+						vertex[m_vertex.tail + j].ST.S = 2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_width;
+				}
+				else if (huge_s_neg)
+				{
+					for (u32 j = 0; j < n; j++)
+						vertex[m_vertex.tail + j].ST.S = -2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_width;
+				}
+
+				if (huge_t_pos)
+				{
+					for (int j = 0; j < n; j++)
+						vertex[m_vertex.tail + j].ST.T = 2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_height;
+				}
+				else if (huge_t_neg)
+				{
+					for (u32 j = 0; j < n; j++)
+						vertex[m_vertex.tail + j].ST.T = -2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_height;
+				}
+
+				// Make new indices point to new vertices
+				for (u32 j = 0; j < n; j++)
+				{
+					index[new_index_tail + j] = m_vertex.tail + j;
+				}
+			}
+
+			// Advance tail since we pushed new vertices
+			m_vertex.tail += n;
+
+			if (m_vertex.tail >= m_vertex.maxcount)
+			{
+				GrowVertexBuffer();
+			}
+		}
+		else if (new_index_tail < i) // If new_index_tail == i, don't update indices since no primitives have been culled
+		{
+			// Keep the same primitive so shift indices down
+			for (u32 j = 0; j < n; j++)
+				index[new_index_tail + j] = index[i + j];
+		}
+
+		new_index_tail += n;
+	}
+
+	m_index.tail = new_index_tail;
+
+	if (new_prims)
+	{
+		// We indexed new primitives at the end of the buffer so update head and next also
+		m_vertex.head = m_vertex.next = m_vertex.tail;
+	}
+}
+
 void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
 {
 	if (m_vt.m_alpha.valid && tex_alpha_min == 0 && tex_alpha_max == 255)
diff --git a/pcsx2/GS/GSState.h b/pcsx2/GS/GSState.h
index 75416d86d5..2bd95eacd5 100644
--- a/pcsx2/GS/GSState.h
+++ b/pcsx2/GS/GSState.h
@@ -190,6 +190,9 @@ protected:
 	bool IsCoverageAlpha();
 	void CalcAlphaMinMax(const int tex_min, const int tex_max);
 	void CorrectATEAlphaMinMax(const u32 atst, const int aref);
+	void RoundSTCoords();
+	void FixHugeSTCoords();
+	template <u32 n, bool sprite> void FixHugeSTCoordsImpl();
 
 public:
 	struct GSUploadQueue
diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
index c74b965ea4..2fe8caa94b 100644
--- a/pcsx2/GS/GSVector4.h
+++ b/pcsx2/GS/GSVector4.h
@@ -267,6 +267,16 @@ public:
 		return round<Round_PosInf>();
 	}
 
+	__forceinline GSVector4 notnan() const
+	{
+		return GSVector4(_mm_cmpord_ps(m, m));
+	}
+
+	__forceinline GSVector4 isnan() const
+	{
+		return GSVector4(_mm_cmpunord_ps(m, m));
+	}
+
 	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 
 #define LOG_POLY0(x, c0) GSVector4(c0)
@@ -656,6 +666,11 @@ public:
 		return neg();
 	}
 
+	__forceinline GSVector4 operator~() const
+	{
+		return cast(~GSVector4i::cast(*this));
+	}
+
 	__forceinline void operator+=(const GSVector4& v)
 	{
 		m = _mm_add_ps(m, v);
diff --git a/pcsx2/GS/GSVector4_arm64.h b/pcsx2/GS/GSVector4_arm64.h
index c78b0de3f8..177c3a5121 100644
--- a/pcsx2/GS/GSVector4_arm64.h
+++ b/pcsx2/GS/GSVector4_arm64.h
@@ -241,6 +241,16 @@ public:
 		return GSVector4(vrndpq_f32(v4s));
 	}
 
+	__forceinline GSVector4 notnan() const
+	{
+		return *this == *this;
+	}
+	
+	__forceinline GSVector4 isnan() const
+	{
+		return *this != *this;
+	}
+
 	// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 
 #define LOG_POLY0(x, c0) GSVector4(c0)
@@ -560,6 +570,11 @@ public:
 		return neg();
 	}
 
+	__forceinline GSVector4 operator~() const
+	{
+		return cast(~GSVector4i::cast(*this));
+	}
+
 	__forceinline void operator+=(const GSVector4& v)
 	{
 		v4s = vaddq_f32(v4s, v.v4s);
diff --git a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
index c431c86d3f..fe06ea3c11 100644
--- a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
+++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp
@@ -20,11 +20,12 @@ void GSVertexTrace::Update(const void* vertex, const u16* index, int v_count, in
 
 	m_primclass = primclass;
 
-	u32 iip = m_state->PRIM->IIP;
-	u32 tme = m_state->PRIM->TME;
-	u32 fst = m_state->PRIM->FST;
-	u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
+	const u32 iip = m_state->PRIM->IIP;
+	const u32 tme = m_state->PRIM->TME;
+	const u32 fst = m_state->PRIM->FST;
+	const u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
 
+	// Call the correct function to find the min/max values
 	m_fmm[color][fst][tme][iip][primclass](*this, vertex, index, i_count);
 
 	// Potential float overflow detected. Better uses the slower division instead
diff --git a/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp b/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp
index 66bdb9565d..90b9d5dd77 100644
--- a/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp
+++ b/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp
@@ -138,9 +138,12 @@ void GSVertexTraceFMM::FindMinMax(GSVertexTrace& vt, const void* vertex, const u
 
 				stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? stq1 : stq0);
 				stq1 = st.zwww(stq1);
-
-				tmin = tmin.min(stq0.min(stq1));
-				tmax = tmax.max(stq0.max(stq1));
+				
+				// Only update entries that are not NaN
+				tmin = tmin.blend32(tmin.min(stq0), stq0.notnan());
+				tmin = tmin.blend32(tmin.min(stq1), stq1.notnan());
+				tmax = tmax.blend32(tmax.max(stq0), stq0.notnan());
+				tmax = tmax.blend32(tmax.max(stq1), stq1.notnan());
 			}
 			else
 			{
@@ -246,6 +249,12 @@ void GSVertexTraceFMM::FindMinMax(GSVertexTrace& vt, const void* vertex, const u
 
 		vt.m_min.t = tmin * s;
 		vt.m_max.t = tmax * s;
+
+		// Clamp the min/max UV values to the min/max valid UV values.
+		// This is needed in certain cases where buggy GS input results
+		// in huge floating points values for ST.
+		vt.m_min.t = vt.m_min.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(vt.m_min.t);
+		vt.m_max.t = vt.m_max.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(vt.m_max.t);
 	}
 	else
 	{