GS: Handle huge/infinite/nan ST coords in GS vertex input.

This commit is contained in:
TJnotJT 2025-02-02 00:24:47 -05:00
parent df7646fd34
commit 435344f099
6 changed files with 324 additions and 57 deletions

View File

@ -1595,13 +1595,6 @@ inline bool GSState::TestDrawChanged()
return false;
}
// Builds a mask covering the low bits of a float's bit pattern that should be dropped:
// 9 bits plus the gap between max_exp and exp, capped at 23 bits.
// exp:     biased exponent of the value being masked.
// max_exp: the larger exponent it is compared against.
u32 GSState::CalcMask(int exp, int max_exp)
{
	const int exp_gap = max_exp - exp;
	const int bit_count = std::min(9 + exp_gap, 23);
	return (1 << bit_count) - 1;
}
void GSState::FlushPrim()
{
if (m_index.tail > 0)
@ -1675,50 +1668,19 @@ void GSState::FlushPrim()
}
#endif
m_vt.Update(m_vertex.buff, m_index.buff, m_vertex.tail, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
// Texel coordinate rounding
// Helps Manhunt (lights shining through objects).
// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
if (PRIM->TME && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
// Fix huge or nan ST coordinates
if (PRIM->TME && !PRIM->FST)
{
if (!PRIM->FST) // STQ's
{
const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
// ST's have the lowest 9 bits (or greater depending on exponent difference) rounding down (from hardware tests).
for (int i = m_index.tail - 1; i >= 0; i--)
{
GSVertex* v = &m_vertex.buff[m_index.buff[i]];
FixHugeSTCoords();
}
// Only Q on the second vertex is valid
if (!(i & 1) && is_sprite)
v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;
int T = std::bit_cast<int>(v->ST.T);
int Q = std::bit_cast<int>(v->RGBAQ.Q);
int S = std::bit_cast<int>(v->ST.S);
const int expS = (S >> 23) & 0xff;
const int expT = (T >> 23) & 0xff;
const int expQ = (Q >> 23) & 0xff;
int max_exp = std::max(expS, expQ);
u32 mask = CalcMask(expS, max_exp);
S &= ~mask;
v->ST.S = std::bit_cast<float>(S);
max_exp = std::max(expT, expQ);
mask = CalcMask(expT, max_exp);
T &= ~mask;
v->ST.T = std::bit_cast<float>(T);
Q &= ~0xff;
if (!is_sprite || (i & 1))
v->RGBAQ.Q = std::bit_cast<float>(Q);
m_vt.m_min.t.x = std::min(m_vt.m_min.t.x, (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW));
m_vt.m_min.t.y = std::min(m_vt.m_min.t.y, (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH));
}
}
// Round fractional parts of ST coords
if (PRIM->TME && !PRIM->FST && (GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS || m_vt.m_eq.z))
{
RoundSTCoords();
}
// Skip draw if Z test is enabled, but set to fail all pixels.
@ -3831,8 +3793,8 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
u8 uses_border = 0;
if (m_vt.m_max.t.x >= FLT_MAX || m_vt.m_min.t.x <= -FLT_MAX ||
m_vt.m_max.t.y >= FLT_MAX || m_vt.m_min.t.y <= -FLT_MAX)
if (m_vt.m_max.t.x >= 2047.0f || m_vt.m_min.t.x <= -2047.0f ||
m_vt.m_max.t.y >= 2047.0f || m_vt.m_min.t.y <= -2047.0f)
{
// If any of the min/max values reach +-2047 (the limit the UV range is clamped to)
// we can't rely on them, so just assume full texture.
@ -4009,6 +3971,268 @@ GSState::TextureMinMaxResult GSState::GetTextureMinMax(GIFRegTEX0 TEX0, GIFRegCL
return { vr, uses_border };
}
// ST coordinate rounding
// Helps Manhunt (lights shining through objects).
// Can help with some alignment issues when upscaling too, and is for both Software and Hardware renderers.
// Sometimes hardware doesn't get affected, likely due to the difference in how GPU's handle textures (Persona minimap).
void GSState::RoundSTCoords()
{
const bool is_sprite = GSUtil::GetPrimClass(PRIM->PRIM) == GS_PRIM_CLASS::GS_SPRITE_CLASS;
// ST's have the lowest 9 bits (or greater depending on exponent difference) rounded down (from hardware tests).
// This gives the bitmask for the lower 9 (or more) bits.
auto LowerBitsMask = [](int exp, int max_exp)
{
const int amount = 9 + (max_exp - exp);
return (1 << std::min(amount, 23)) - 1;
};
for (int i = m_index.tail - 1; i >= 0; i--)
{
GSVertex* v = &m_vertex.buff[m_index.buff[i]];
// Only Q on the second vertex is valid
if (!(i & 1) && is_sprite)
v->RGBAQ.Q = m_vertex.buff[m_index.buff[i + 1]].RGBAQ.Q;
int S = std::bit_cast<int>(v->ST.S);
int T = std::bit_cast<int>(v->ST.T);
int Q = std::bit_cast<int>(v->RGBAQ.Q);
const int expS = (S >> 23) & 0xff;
const int expT = (T >> 23) & 0xff;
const int expQ = (Q >> 23) & 0xff;
S &= ~LowerBitsMask(expS, std::max(expS, expQ));
T &= ~LowerBitsMask(expT, std::max(expT, expQ));
Q &= ~0xff; // Q gets truncated less than ST by hardware tests
v->ST.S = std::bit_cast<float>(S);
v->ST.T = std::bit_cast<float>(T);
if (!is_sprite || (i & 1))
v->RGBAQ.Q = std::bit_cast<float>(Q);
const float U = (v->ST.S / v->RGBAQ.Q) * (1 << m_context->TEX0.TW);
const float V = (v->ST.T / v->RGBAQ.Q) * (1 << m_context->TEX0.TH);
const float Qf = std::bit_cast<float>(Q);
const GSVector4 uvq(U, V, Qf, Qf);
// Do min/max with only those values that are not NaN
m_vt.m_min.t = m_vt.m_min.t.blend32(m_vt.m_min.t.min(uvq), uvq.notnan());
m_vt.m_max.t = m_vt.m_max.t.blend32(m_vt.m_max.t.max(uvq), uvq.notnan());
}
// Clamp the min/max UV values to the min/max valid UV values.
m_vt.m_min.t = m_vt.m_min.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_min.t);
m_vt.m_max.t = m_vt.m_max.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(m_vt.m_max.t);
}
// Handles huge ST coords by culling primitives with NaN coords and replacing
// primitives with huge coords by new ones whose huge coordinate is set to +/- 2047.
// This is based on hardware tests that seem to show that ST coordinates get clamped to +/- 2047
// (perhaps before applying repeat or region repeat).
// Note that the huge texture coords may be a symptom of floating point issues upstream in the EE and
// it would be better to have them fixed there; this is a bandaid.
//
// Dispatches to the FixHugeSTCoordsImpl instantiation matching the current
// primitive class (vertices per primitive, and whether it is a sprite).
void GSState::FixHugeSTCoords()
{
	const auto prim_class = GSUtil::GetPrimClass(PRIM->PRIM);
	const bool is_sprite = prim_class == GS_SPRITE_CLASS;
	const auto verts_per_prim = GSUtil::GetClassVertexCount(prim_class);

	switch (verts_per_prim)
	{
		case 1:
			is_sprite ? FixHugeSTCoordsImpl<1, true>() : FixHugeSTCoordsImpl<1, false>();
			break;
		case 2:
			is_sprite ? FixHugeSTCoordsImpl<2, true>() : FixHugeSTCoordsImpl<2, false>();
			break;
		case 3:
			is_sprite ? FixHugeSTCoordsImpl<3, true>() : FixHugeSTCoordsImpl<3, false>();
			break;
		default:
			pxFail("Impossible");
	}
}
template <u32 n, bool sprite> void GSState::FixHugeSTCoordsImpl()
{
GSVertex* const vertex = m_vertex.buff;
u16* const index = m_index.buff;
u32 new_index_tail = 0;
constexpr float huge = 1e10f; // arbitrary large value
const float tex_width = 1 << m_context->TEX0.TW;
const float tex_height = 1 << m_context->TEX0.TH;
bool new_prims = false; // Did we generate new primitives?
for (u32 i = 0; i < m_index.tail; i += n)
{
bool nan_s = false;
bool nan_t = false;
bool huge_s_pos = false;
bool huge_s_neg = false;
bool huge_t_pos = false;
bool huge_t_neg = false;
if (sprite)
{
// Sprites behave as if both Qs are same as the second one
const float s0 = vertex[index[i + 0]].ST.S / vertex[index[i + 1]].RGBAQ.Q;
const float t0 = vertex[index[i + 0]].ST.T / vertex[index[i + 1]].RGBAQ.Q;
const float s1 = vertex[index[i + 1]].ST.S / vertex[index[i + 1]].RGBAQ.Q;
const float t1 = vertex[index[i + 1]].ST.T / vertex[index[i + 1]].RGBAQ.Q;
nan_s = std::isnan(s0) || std::isnan(s1);
nan_t = std::isnan(t0) || std::isnan(t1);
huge_s_pos = s0 > huge || s1 > huge;
huge_s_neg = s0 < -huge || s1 < -huge;
huge_t_pos = t0 > huge || t1 > huge;
huge_t_neg = t0 < -huge || t1 < -huge;
}
else
{
for (u32 j = 0; j < n; j++)
{
const float s = vertex[index[i + j]].ST.S / vertex[index[i + j]].RGBAQ.Q;
const float t = vertex[index[i + j]].ST.T / vertex[index[i + j]].RGBAQ.Q;
nan_s |= std::isnan(s);
nan_t |= std::isnan(t);
huge_s_pos |= s > huge;
huge_t_pos |= t > huge;
huge_s_neg |= s < -huge;
huge_t_neg |= t < -huge;
}
}
// ambiguous = true would probably result in NaN in the SW rasterizer or something undefined in HW.
// PS2 does not have NaN so there is no really accurate way to emulate this.
// huge = true and ambiguous = false seems to have well-defined behavior on the PS2:
// it clamps huge values to +/-2047 in UV coordinates space. We try to approximate this by
// giving ST the values that would result in exactly +/-2047 across the primitive.
const bool ambiguous = nan_s || nan_t || (huge_s_pos && huge_s_neg) || (huge_s_pos && huge_s_neg);
const bool huge = huge_s_pos || huge_t_pos || huge_s_neg || huge_t_neg;
if (ambiguous)
{
// Cull the primitive by not saving the indices
continue;
}
if (huge)
{
// Add new vertices to replace the primitive with another primitive with clamped values.
new_prims = true;
if (sprite)
{
// Handle sprite separately since it uses the second Q for both vertices
GSVertex v_new0 = vertex[index[i + 0]];
GSVertex v_new1 = vertex[index[i + 1]];
// Try to set values so that we get constant UV +/-2047 across the entire triangle after interpolation
// Sprites behave as if both Qs are same as the second one
if (huge_s_pos)
{
v_new1.ST.S = v_new0.ST.S = 2047.0f * v_new1.RGBAQ.Q / tex_width;
}
else if (huge_s_neg)
{
v_new1.ST.S = v_new0.ST.S = -2047.0f * v_new1.RGBAQ.Q / tex_width;
}
if (huge_t_pos)
{
v_new1.ST.T = v_new0.ST.T = 2047.0f * v_new1.RGBAQ.Q / tex_height;
}
else if (huge_t_neg)
{
v_new1.ST.T = v_new0.ST.T = -2047.0f * v_new1.RGBAQ.Q / tex_height;
}
// Copy old values to tail of vertex buffer.
// The vertex buffer is allocated so that there is always at least room for 3 new vertices at the end.
vertex[m_vertex.tail + 0] = v_new0;
vertex[m_vertex.tail + 1] = v_new1;
// Make new indices point to new vertices
index[new_index_tail + 0] = m_vertex.tail + 0;
index[new_index_tail + 1] = m_vertex.tail + 1;
}
else
{
// Copy old values to tail of vertex buffer.
// The vertex buffer is allocated so that there is always at least room for 3 new vertices at the end.
for (u32 j = 0; j < n; j++)
vertex[m_vertex.tail + j] = vertex[index[i + j]];
// Try to set values so that we get constant UV +/-2047 across the entire primitive after interpolation
if (huge_s_pos)
{
for (u32 j = 0; j < n; j++)
vertex[m_vertex.tail + j].ST.S = 2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_width;
}
else if (huge_s_neg)
{
for (u32 j = 0; j < n; j++)
vertex[m_vertex.tail + j].ST.S = -2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_width;
}
if (huge_t_pos)
{
for (int j = 0; j < n; j++)
vertex[m_vertex.tail + j].ST.T = 2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_height;
}
else if (huge_t_neg)
{
for (u32 j = 0; j < n; j++)
vertex[m_vertex.tail + j].ST.T = -2047.0f * vertex[m_vertex.tail + j].RGBAQ.Q / tex_height;
}
// Make new indices point to new vertices
for (u32 j = 0; j < n; j++)
{
index[new_index_tail + j] = m_vertex.tail + j;
}
}
// Advance tail since we pushed new vertices
m_vertex.tail += n;
if (m_vertex.tail >= m_vertex.maxcount)
{
GrowVertexBuffer();
}
}
else if (new_index_tail < i) // If new_index_tail == i, don't update indices since no primitives have been culled
{
// Keep the same primitive so shift indices down
for (u32 j = 0; j < n; j++)
index[new_index_tail + j] = index[i + j];
}
new_index_tail += n;
}
m_index.tail = new_index_tail;
if (new_prims)
{
// We indexed new primitives at the end of the buffer so update head and next also
m_vertex.head = m_vertex.next = m_vertex.tail;
}
}
void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
{
if (m_vt.m_alpha.valid && tex_alpha_min == 0 && tex_alpha_max == 255)

View File

@ -190,6 +190,9 @@ protected:
bool IsCoverageAlpha();
void CalcAlphaMinMax(const int tex_min, const int tex_max);
void CorrectATEAlphaMinMax(const u32 atst, const int aref);
void RoundSTCoords();
void FixHugeSTCoords();
template <u32 n, bool sprite> void FixHugeSTCoordsImpl();
public:
struct GSUploadQueue

View File

@ -267,6 +267,16 @@ public:
return round<Round_PosInf>();
}
// Per-lane mask from an ordered self-comparison: a lane is set when it is not NaN.
__forceinline GSVector4 notnan() const
{
	const __m128 ordered = _mm_cmpord_ps(m, m);
	return GSVector4(ordered);
}
// Per-lane mask from an unordered self-comparison: a lane is set when it is NaN.
__forceinline GSVector4 isnan() const
{
	const __m128 unordered = _mm_cmpunord_ps(m, m);
	return GSVector4(unordered);
}
// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
#define LOG_POLY0(x, c0) GSVector4(c0)
@ -656,6 +666,11 @@ public:
return neg();
}
// Bitwise NOT: reinterprets the vector as GSVector4i, inverts it, and casts the result back.
__forceinline GSVector4 operator~() const
{
	const auto inverted = ~GSVector4i::cast(*this);
	return cast(inverted);
}
__forceinline void operator+=(const GSVector4& v)
{
m = _mm_add_ps(m, v);

View File

@ -241,6 +241,16 @@ public:
return GSVector4(vrndpq_f32(v4s));
}
// Per-lane mask of non-NaN lanes, obtained by comparing the vector with itself for equality
// (a NaN lane never compares equal to itself).
__forceinline GSVector4 notnan() const
{
	const GSVector4& self = *this;
	return self == self;
}
// Per-lane mask of NaN lanes, obtained by comparing the vector with itself for inequality
// (only a NaN lane compares unequal to itself).
__forceinline GSVector4 isnan() const
{
	const GSVector4& self = *this;
	return self != self;
}
// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
#define LOG_POLY0(x, c0) GSVector4(c0)
@ -560,6 +570,11 @@ public:
return neg();
}
// Bitwise NOT: reinterprets the vector as GSVector4i, inverts it, and casts the result back.
__forceinline GSVector4 operator~() const
{
	const auto inverted = ~GSVector4i::cast(*this);
	return cast(inverted);
}
__forceinline void operator+=(const GSVector4& v)
{
v4s = vaddq_f32(v4s, v.v4s);

View File

@ -20,11 +20,12 @@ void GSVertexTrace::Update(const void* vertex, const u16* index, int v_count, in
m_primclass = primclass;
u32 iip = m_state->PRIM->IIP;
u32 tme = m_state->PRIM->TME;
u32 fst = m_state->PRIM->FST;
u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
const u32 iip = m_state->PRIM->IIP;
const u32 tme = m_state->PRIM->TME;
const u32 fst = m_state->PRIM->FST;
const u32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
// Call the correct function to find the min/max values
m_fmm[color][fst][tme][iip][primclass](*this, vertex, index, i_count);
// Potential float overflow detected. Better uses the slower division instead

View File

@ -138,9 +138,12 @@ void GSVertexTraceFMM::FindMinMax(GSVertexTrace& vt, const void* vertex, const u
stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? stq1 : stq0);
stq1 = st.zwww(stq1);
tmin = tmin.min(stq0.min(stq1));
tmax = tmax.max(stq0.max(stq1));
// Only update entries that are not NaN
tmin = tmin.blend32(tmin.min(stq0), stq0.notnan());
tmin = tmin.blend32(tmin.min(stq1), stq1.notnan());
tmax = tmax.blend32(tmax.max(stq0), stq0.notnan());
tmax = tmax.blend32(tmax.max(stq1), stq1.notnan());
}
else
{
@ -246,6 +249,12 @@ void GSVertexTraceFMM::FindMinMax(GSVertexTrace& vt, const void* vertex, const u
vt.m_min.t = tmin * s;
vt.m_max.t = tmax * s;
// Clamp the min/max UV values to the min/max valid UV values.
// This is needed in certain cases where buggy GS input results
// in huge floating points values for ST.
vt.m_min.t = vt.m_min.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(vt.m_min.t);
vt.m_max.t = vt.m_max.t.min(GSVector4(2047.0f)).max(GSVector4(-2047.0f)).xyzw(vt.m_max.t);
}
else
{