mirror of https://github.com/PCSX2/pcsx2.git
GS: Reduce repeated code in GSVertexTrace::FindMinMax
Why repeat things when you can make the compiler repeat them for you
This commit is contained in:
parent
2e1d147135
commit
5d33165fa5
|
@ -181,290 +181,126 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
|
|||
|
||||
const GSVertex* RESTRICT v = (GSVertex*)vertex;
|
||||
|
||||
for (int i = 0; i < count; i += n)
|
||||
// Process 2 vertices at a time for increased efficiency
|
||||
auto processVertices = [&](const GSVertex& v0, const GSVertex& v1, bool finalVertex)
|
||||
{
|
||||
if (primclass == GS_POINT_CLASS)
|
||||
if (color)
|
||||
{
|
||||
GSVector4i c(v[index[i]].m[0]);
|
||||
|
||||
if (color)
|
||||
GSVector4i c0 = GSVector4i::load(v0.RGBAQ.u32[0]);
|
||||
GSVector4i c1 = GSVector4i::load(v1.RGBAQ.u32[0]);
|
||||
if (iip || finalVertex)
|
||||
{
|
||||
cmin = cmin.min_u8(c);
|
||||
cmax = cmax.max_u8(c);
|
||||
cmin = cmin.min_u8(c0.min_u8(c1));
|
||||
cmax = cmax.max_u8(c0.max_u8(c1));
|
||||
}
|
||||
|
||||
if (tme)
|
||||
else if (n == 2)
|
||||
{
|
||||
if (!fst)
|
||||
{
|
||||
GSVector4 stq = GSVector4::cast(c);
|
||||
|
||||
GSVector4 q = stq.wwww();
|
||||
|
||||
if (accurate_stq)
|
||||
stq = (stq.xyww() / q).noopt().xyww(q);
|
||||
else
|
||||
stq = (stq.xyww() * q.rcpnr()).noopt().xyww(q);
|
||||
|
||||
tmin = tmin.min(stq);
|
||||
tmax = tmax.max(stq);
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4i uv(v[index[i]].m[1]);
|
||||
|
||||
GSVector4 st = GSVector4(uv.uph16()).xyxy();
|
||||
|
||||
tmin = tmin.min(st);
|
||||
tmax = tmax.max(st);
|
||||
}
|
||||
// For even n, we process v1 and v2 of the same prim
|
||||
// (For odd n, we process one vertex from each of two prims)
|
||||
cmin = cmin.min_u8(c1);
|
||||
cmax = cmax.max_u8(c1);
|
||||
}
|
||||
|
||||
GSVector4i xyzf(v[index[i]].m[1]);
|
||||
|
||||
GSVector4i xy = xyzf.upl16();
|
||||
GSVector4i z = xyzf.yyyy();
|
||||
|
||||
GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf));
|
||||
|
||||
pmin = pmin.min_u32(p);
|
||||
pmax = pmax.max_u32(p);
|
||||
}
|
||||
else if (primclass == GS_LINE_CLASS)
|
||||
|
||||
if (tme)
|
||||
{
|
||||
GSVector4i c0(v[index[i + 0]].m[0]);
|
||||
GSVector4i c1(v[index[i + 1]].m[0]);
|
||||
|
||||
if (color)
|
||||
if (!fst)
|
||||
{
|
||||
if (iip)
|
||||
{
|
||||
cmin = cmin.min_u8(c0.min_u8(c1));
|
||||
cmax = cmax.max_u8(c0.max_u8(c1));
|
||||
}
|
||||
else
|
||||
{
|
||||
cmin = cmin.min_u8(c1);
|
||||
cmax = cmax.max_u8(c1);
|
||||
}
|
||||
}
|
||||
GSVector4 stq0 = GSVector4::cast(GSVector4i(v0.m[0]));
|
||||
GSVector4 stq1 = GSVector4::cast(GSVector4i(v1.m[0]));
|
||||
|
||||
if (tme)
|
||||
GSVector4 st, q;
|
||||
// Sprites always have indices == vertices, so we don't have to look at the index table here
|
||||
if (primclass == GS_SPRITE_CLASS)
|
||||
q = stq1.wwww();
|
||||
else
|
||||
q = stq0.wwww(stq1);
|
||||
|
||||
// Note: If in the future this is changed in a way that causes parts of calculations to go unused,
|
||||
// make sure to remove the z (rgba) field as it's often denormal.
|
||||
// Then, use GSVector4::noopt() to prevent clang from optimizing out your "useless" shuffle
|
||||
// e.g. stq = (stq.xyww() / stq.wwww()).noopt().xyww(stq);
|
||||
if (accurate_stq)
|
||||
st = stq0.xyxy(stq1) / q;
|
||||
else
|
||||
st = stq0.xyxy(stq1) * q.rcpnr();
|
||||
|
||||
stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? stq1 : stq0);
|
||||
stq1 = st.zwww(stq1);
|
||||
|
||||
tmin = tmin.min(stq0.min(stq1));
|
||||
tmax = tmax.max(stq0.max(stq1));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!fst)
|
||||
{
|
||||
GSVector4 stq0 = GSVector4::cast(c0);
|
||||
GSVector4 stq1 = GSVector4::cast(c1);
|
||||
GSVector4i uv0(v0.m[1]);
|
||||
GSVector4i uv1(v1.m[1]);
|
||||
|
||||
GSVector4 q = stq0.wwww(stq1);
|
||||
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
|
||||
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
|
||||
|
||||
if (accurate_stq)
|
||||
{
|
||||
GSVector4 st = stq0.xyxy(stq1) / q;
|
||||
|
||||
stq0 = st.xyww(stq1);
|
||||
stq1 = st.zwww(stq1);
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4 st = stq0.xyxy(stq1) * q.rcpnr();
|
||||
|
||||
stq0 = st.xyww(stq0);
|
||||
stq1 = st.zwww(stq1);
|
||||
}
|
||||
|
||||
tmin = tmin.min(stq0.min(stq1));
|
||||
tmax = tmax.max(stq0.max(stq1));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4i uv0(v[index[i + 0]].m[1]);
|
||||
GSVector4i uv1(v[index[i + 1]].m[1]);
|
||||
|
||||
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
|
||||
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
|
||||
|
||||
tmin = tmin.min(st0.min(st1));
|
||||
tmax = tmax.max(st0.max(st1));
|
||||
}
|
||||
tmin = tmin.min(st0.min(st1));
|
||||
tmax = tmax.max(st0.max(st1));
|
||||
}
|
||||
|
||||
GSVector4i xyzf0(v[index[i + 0]].m[1]);
|
||||
GSVector4i xyzf1(v[index[i + 1]].m[1]);
|
||||
|
||||
GSVector4i xy0 = xyzf0.upl16();
|
||||
GSVector4i z0 = xyzf0.yyyy();
|
||||
GSVector4i xy1 = xyzf1.upl16();
|
||||
GSVector4i z1 = xyzf1.yyyy();
|
||||
|
||||
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
|
||||
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
|
||||
|
||||
pmin = pmin.min_u32(p0.min_u32(p1));
|
||||
pmax = pmax.max_u32(p0.max_u32(p1));
|
||||
}
|
||||
else if (primclass == GS_TRIANGLE_CLASS)
|
||||
|
||||
GSVector4i xyzf0(v0.m[1]);
|
||||
GSVector4i xyzf1(v1.m[1]);
|
||||
|
||||
GSVector4i xy0 = xyzf0.upl16();
|
||||
GSVector4i z0 = xyzf0.yyyy();
|
||||
GSVector4i xy1 = xyzf1.upl16();
|
||||
GSVector4i z1 = xyzf1.yyyy();
|
||||
|
||||
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(primclass == GS_SPRITE_CLASS ? xyzf1 : xyzf0));
|
||||
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
|
||||
|
||||
pmin = pmin.min_u32(p0.min_u32(p1));
|
||||
pmax = pmax.max_u32(p0.max_u32(p1));
|
||||
};
|
||||
|
||||
if (n == 2)
|
||||
{
|
||||
for (int i = 0; i < count; i += 2)
|
||||
{
|
||||
GSVector4i c0(v[index[i + 0]].m[0]);
|
||||
GSVector4i c1(v[index[i + 1]].m[0]);
|
||||
GSVector4i c2(v[index[i + 2]].m[0]);
|
||||
|
||||
if (color)
|
||||
{
|
||||
if (iip)
|
||||
{
|
||||
cmin = cmin.min_u8(c2).min_u8(c0.min_u8(c1));
|
||||
cmax = cmax.max_u8(c2).max_u8(c0.max_u8(c1));
|
||||
}
|
||||
else
|
||||
{
|
||||
cmin = cmin.min_u8(c2);
|
||||
cmax = cmax.max_u8(c2);
|
||||
}
|
||||
}
|
||||
|
||||
if (tme)
|
||||
{
|
||||
if (!fst)
|
||||
{
|
||||
GSVector4 stq0 = GSVector4::cast(c0);
|
||||
GSVector4 stq1 = GSVector4::cast(c1);
|
||||
GSVector4 stq2 = GSVector4::cast(c2);
|
||||
|
||||
if (accurate_stq)
|
||||
{
|
||||
GSVector4 st01 = stq0.xyxy(stq1) / stq0.wwww(stq1);
|
||||
|
||||
stq0 = st01.xyww(stq0);
|
||||
stq1 = st01.zwww(stq1);
|
||||
stq2 = (stq2.xyww() / stq2.wwww()).noopt().xyww(stq2);
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr();
|
||||
GSVector4 st01 = stq0.xyxy(stq1) * q.xxyy();
|
||||
|
||||
stq0 = st01.xyww(stq0);
|
||||
stq1 = st01.zwww(stq1);
|
||||
stq2 = (stq2.xyww() * q.zzzz()).noopt().xyww(stq2);
|
||||
}
|
||||
|
||||
tmin = tmin.min(stq2).min(stq0.min(stq1));
|
||||
tmax = tmax.max(stq2).max(stq0.max(stq1));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4i uv0(v[index[i + 0]].m[1]);
|
||||
GSVector4i uv1(v[index[i + 1]].m[1]);
|
||||
GSVector4i uv2(v[index[i + 2]].m[1]);
|
||||
|
||||
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
|
||||
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
|
||||
GSVector4 st2 = GSVector4(uv2.uph16()).xyxy();
|
||||
|
||||
tmin = tmin.min(st2).min(st0.min(st1));
|
||||
tmax = tmax.max(st2).max(st0.max(st1));
|
||||
}
|
||||
}
|
||||
|
||||
GSVector4i xyzf0(v[index[i + 0]].m[1]);
|
||||
GSVector4i xyzf1(v[index[i + 1]].m[1]);
|
||||
GSVector4i xyzf2(v[index[i + 2]].m[1]);
|
||||
|
||||
GSVector4i xy0 = xyzf0.upl16();
|
||||
GSVector4i z0 = xyzf0.yyyy();
|
||||
GSVector4i xy1 = xyzf1.upl16();
|
||||
GSVector4i z1 = xyzf1.yyyy();
|
||||
GSVector4i xy2 = xyzf2.upl16();
|
||||
GSVector4i z2 = xyzf2.yyyy();
|
||||
|
||||
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
|
||||
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
|
||||
GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2));
|
||||
|
||||
pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1));
|
||||
pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1));
|
||||
}
|
||||
else if (primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
GSVector4i c0(v[index[i + 0]].m[0]);
|
||||
GSVector4i c1(v[index[i + 1]].m[0]);
|
||||
|
||||
if (color)
|
||||
{
|
||||
if (iip)
|
||||
{
|
||||
cmin = cmin.min_u8(c0.min_u8(c1));
|
||||
cmax = cmax.max_u8(c0.max_u8(c1));
|
||||
}
|
||||
else
|
||||
{
|
||||
cmin = cmin.min_u8(c1);
|
||||
cmax = cmax.max_u8(c1);
|
||||
}
|
||||
}
|
||||
|
||||
if (tme)
|
||||
{
|
||||
if (!fst)
|
||||
{
|
||||
GSVector4 stq0 = GSVector4::cast(c0);
|
||||
GSVector4 stq1 = GSVector4::cast(c1);
|
||||
|
||||
if (accurate_stq)
|
||||
{
|
||||
GSVector4 st = stq0.xyxy(stq1) / stq1.wwww();
|
||||
|
||||
stq0 = st.xyww(stq1);
|
||||
stq1 = st.zwww(stq1);
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4 st = stq0.xyxy(stq1) * stq1.wwww().rcpnr();
|
||||
|
||||
stq0 = st.xyww(stq1);
|
||||
stq1 = st.zwww(stq1);
|
||||
}
|
||||
|
||||
tmin = tmin.min(stq0.min(stq1));
|
||||
tmax = tmax.max(stq0.max(stq1));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector4i uv0(v[index[i + 0]].m[1]);
|
||||
GSVector4i uv1(v[index[i + 1]].m[1]);
|
||||
|
||||
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
|
||||
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
|
||||
|
||||
tmin = tmin.min(st0.min(st1));
|
||||
tmax = tmax.max(st0.max(st1));
|
||||
}
|
||||
}
|
||||
|
||||
GSVector4i xyzf0(v[index[i + 0]].m[1]);
|
||||
GSVector4i xyzf1(v[index[i + 1]].m[1]);
|
||||
|
||||
GSVector4i xy0 = xyzf0.upl16();
|
||||
GSVector4i z0 = xyzf0.yyyy();
|
||||
GSVector4i xy1 = xyzf1.upl16();
|
||||
GSVector4i z1 = xyzf1.yyyy();
|
||||
|
||||
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1));
|
||||
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
|
||||
|
||||
pmin = pmin.min_u32(p0.min_u32(p1));
|
||||
pmax = pmax.max_u32(p0.max_u32(p1));
|
||||
processVertices(v[index[i + 0]], v[index[i + 1]], false);
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME/WARNING. A division by 2 is done on the depth. I suspect to avoid
|
||||
// negative values. However, it means that we lose the LSB. m_eq.z could
|
||||
// be true if depth isn't constant but close enough. It also implies that
|
||||
// pmin.z & 1 == 0 and pmax.z & 1 == 0
|
||||
|
||||
pmin = pmin.blend16<0x30>(pmin.srl32(1));
|
||||
pmax = pmax.blend16<0x30>(pmax.srl32(1));
|
||||
else if (iip || n == 1) // iip means final and non-final vertexes are treated the same
|
||||
{
|
||||
int i = 0;
|
||||
for (; i < (count - 1); i += 2) // 2x loop unroll
|
||||
{
|
||||
processVertices(v[index[i + 0]], v[index[i + 1]], true);
|
||||
}
|
||||
if (count & 1)
|
||||
{
|
||||
// Compiler optimizations go!
|
||||
// (And if they don't, it's only one vertex out of many)
|
||||
processVertices(v[index[i]], v[index[i]], true);
|
||||
}
|
||||
}
|
||||
else if (n == 3)
|
||||
{
|
||||
int i = 0;
|
||||
for (; i < (count - 3); i += 6)
|
||||
{
|
||||
processVertices(v[index[i + 0]], v[index[i + 3]], false);
|
||||
processVertices(v[index[i + 1]], v[index[i + 4]], false);
|
||||
processVertices(v[index[i + 2]], v[index[i + 5]], true);
|
||||
}
|
||||
if (count & 1)
|
||||
{
|
||||
processVertices(v[index[i + 0]], v[index[i + 1]], false);
|
||||
// Compiler optimizations go!
|
||||
// (And if they don't, it's only one vertex out of many)
|
||||
processVertices(v[index[i + 2]], v[index[i + 2]], true);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pxAssertRel(0, "Bad n value");
|
||||
}
|
||||
|
||||
GSVector4 o(context->XYOFFSET);
|
||||
GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f);
|
||||
|
@ -472,6 +308,10 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
|
|||
m_min.p = (GSVector4(pmin) - o) * s;
|
||||
m_max.p = (GSVector4(pmax) - o) * s;
|
||||
|
||||
// Fix signed int conversion
|
||||
m_min.p = m_min.p.insert32<0, 2>(GSVector4::load((float)(uint32)pmin.extract32<2>()));
|
||||
m_max.p = m_max.p.insert32<0, 2>(GSVector4::load((float)(uint32)pmax.extract32<2>()));
|
||||
|
||||
if (tme)
|
||||
{
|
||||
if (fst)
|
||||
|
@ -494,8 +334,8 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun
|
|||
|
||||
if (color)
|
||||
{
|
||||
m_min.c = cmin.zzzz().u8to32();
|
||||
m_max.c = cmax.zzzz().u8to32();
|
||||
m_min.c = cmin.u8to32();
|
||||
m_max.c = cmax.u8to32();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue