GS: Use 32-bit vertex positions for culling

More accurate; stops it from passing through vertices which are off-screen
and whose coordinates have overflowed.

Differences versus current have been manually verified to be correct.
This commit is contained in:
Stenzek 2023-06-18 01:40:04 +10:00 committed by Connor McLaughlin
parent c77d8b3709
commit 2046a9b414
5 changed files with 103 additions and 121 deletions

View File

@ -79,6 +79,44 @@ static int extend(int uv, int size)
return size;
}
// Default constructor: zero-fills the `offset` member and then calls Reset().
GSDrawingContext::GSDrawingContext()
{
// `offset` is cleared here and only here; Reset() does not touch it.
std::memset(&offset, 0, sizeof(offset));
// Zero-fills the register members (XYOFFSET, TEX0, ..., ZBUF).
Reset();
}
// Zero-fills every register member of this drawing context.
// The `offset` and `scissor` members are not modified here.
void GSDrawingContext::Reset()
{
	// Byte-wise clear of a single register, sized by the register's own type.
	const auto zero_fill = [](auto& reg) { std::memset(&reg, 0, sizeof(reg)); };

	zero_fill(XYOFFSET);
	zero_fill(TEX0);
	zero_fill(TEX1);
	zero_fill(CLAMP);
	zero_fill(MIPTBP1);
	zero_fill(MIPTBP2);
	zero_fill(SCISSOR);
	zero_fill(ALPHA);
	zero_fill(TEST);
	zero_fill(FBA);
	zero_fill(FRAME);
	zero_fill(ZBUF);
}
void GSDrawingContext::UpdateScissor()
{
// Scissor registers are inclusive of the upper bounds.
const GSVector4i rscissor = GSVector4i(static_cast<int>(SCISSOR.SCAX0), static_cast<int>(SCISSOR.SCAY0),
static_cast<int>(SCISSOR.SCAX1), static_cast<int>(SCISSOR.SCAY1));
scissor.in = rscissor + GSVector4i::cxpr(0, 0, 1, 1);
// Fixed-point scissor min/max, used for rejecting primitives which are entirely outside.
scissor.cull = rscissor.sll32(4);
// Offset applied to vertices for culling, zw is for native resolution culling
// We want to round subpixels down, because at least one pixel gets filled per scanline.
scissor.xyof = GSVector4i::loadl(&XYOFFSET.U64).xyxy().sub32(GSVector4i::cxpr(0, 0, 15, 15));
}
GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap) const
{
if (mipmap)

View File

@ -39,8 +39,8 @@ public:
struct
{
GSVector4i in;
GSVector4i ex;
GSVector4i ofxy;
GSVector4i cull;
GSVector4i xyof;
} scissor;
struct
@ -50,50 +50,11 @@ public:
GSPixelOffset4* fzb4;
} offset;
GSDrawingContext()
{
memset(&offset, 0, sizeof(offset));
GSDrawingContext();
Reset();
}
void Reset();
void Reset()
{
memset(&XYOFFSET, 0, sizeof(XYOFFSET));
memset(&TEX0, 0, sizeof(TEX0));
memset(&TEX1, 0, sizeof(TEX1));
memset(&CLAMP, 0, sizeof(CLAMP));
memset(&MIPTBP1, 0, sizeof(MIPTBP1));
memset(&MIPTBP2, 0, sizeof(MIPTBP2));
memset(&SCISSOR, 0, sizeof(SCISSOR));
memset(&ALPHA, 0, sizeof(ALPHA));
memset(&TEST, 0, sizeof(TEST));
memset(&FBA, 0, sizeof(FBA));
memset(&FRAME, 0, sizeof(FRAME));
memset(&ZBUF, 0, sizeof(ZBUF));
}
void UpdateScissor()
{
ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
scissor.ex.U16[0] = (u16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
scissor.ex.U16[1] = (u16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
scissor.ex.U16[2] = (u16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
scissor.ex.U16[3] = (u16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
scissor.in = GSVector4i(
(int)SCISSOR.SCAX0,
(int)SCISSOR.SCAY0,
(int)SCISSOR.SCAX1 + 1,
(int)SCISSOR.SCAY1 + 1);
scissor.ofxy = GSVector4i(
0x8000,
0x8000,
(int)XYOFFSET.OFX - 15,
(int)XYOFFSET.OFY - 15);
}
void UpdateScissor();
GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap = false) const;

View File

@ -174,7 +174,7 @@ void GSState::Reset(bool hardware_reset)
// after reset (otherwise it'd only ever render 1x1).
//
if (!hardware_reset && GSConfig.UseHardwareRenderer())
m_env.CTXT[i].scissor.ex = GSVector4i::xffffffff();
m_env.CTXT[i].scissor.cull = GSVector4i::xffffffff();
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
@ -1684,8 +1684,9 @@ void GSState::FlushPrim()
{
GSVector4i* RESTRICT vert_ptr = (GSVector4i*)&m_vertex.buff[i];
GSVector4i v = vert_ptr[1];
v = v.xxxx().u16to32().sub32(m_ofxy);
GSVector4i::storel(&m_vertex.xy[i & 3], v.blend16<0xf0>(v.sra32(4)).ps32());
v = v.xxxx().u16to32().sub32(m_xyof);
v = v.blend32<12>(v.sra32(4));
m_vertex.xy[i & 3] = v;
m_vertex.xy_tail = unused;
}
}
@ -2664,8 +2665,9 @@ void GSState::UpdateContext()
void GSState::UpdateScissor()
{
m_scissor = m_context->scissor.ex;
m_ofxy = m_context->scissor.ofxy;
m_scissor_cull_min = m_context->scissor.cull.xyxy();
m_scissor_cull_max = m_context->scissor.cull.zwzw();
m_xyof = m_context->scissor.xyof;
m_scissor_invalid = !m_context->scissor.in.gt32(m_context->scissor.in.zwzw()).allfalse();
}
@ -3165,9 +3167,18 @@ __forceinline void GSState::VertexKick(u32 skip)
tailptr[0] = new_v0;
tailptr[1] = new_v1;
const GSVector4i xy = new_v1.xxxx().u16to32().sub32(m_ofxy);
// We maintain the X/Y coordinates for the last 4 vertices, as well as the head for triangle fans, so we can compute
// the min/max, and cull degenerate triangles, which saves draws in some cases. Why 4? Mod 4 is cheaper than Mod 3.
// These vertices are a full vector containing <X_Fixed_Point, Y_Fixed_Point, X_Integer, Y_Integer>. We use the
// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
// applied, then we split it into the fixed/integer portions.
const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32(4));
m_vertex.xy[xy_tail & 3] = xy;
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.
if (prim == GS_TRIANGLEFAN && tail == head)
m_vertex.xyhead = xy;
m_vertex.tail = ++tail;
m_vertex.xy_tail = ++xy_tail;
@ -3177,44 +3188,40 @@ __forceinline void GSState::VertexKick(u32 skip)
if (m < n)
return;
// Skip draws when scissor is out of range (i.e. bottom-right is less than top-left), since everything will get clipped.
skip |= static_cast<u32>(m_scissor_invalid);
if (skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
GSVector4i pmin, pmax;
if (skip == 0)
{
GSVector4i pmin, pmax;
const GSVector4i v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
const GSVector4i v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
const GSVector4i v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
const GSVector4i v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
const GSVector4i v0 = m_vertex.xy[(xy_tail - 1) & 3];
const GSVector4i v1 = m_vertex.xy[(xy_tail - 2) & 3];
const GSVector4i v2 = (prim == GS_TRIANGLEFAN) ? m_vertex.xyhead : m_vertex.xy[(xy_tail - 3) & 3];
switch (prim)
{
case GS_POINTLIST:
pmin = v2;
pmax = v2;
pmin = v0;
pmax = v0;
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin = v2.min_i16(v1);
pmax = v2.max_i16(v1);
pmin = v0.min_i32(v1);
pmax = v0.max_i32(v1);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
pmin = v2.min_i16(v1.min_i16(v0));
pmax = v2.max_i16(v1.max_i16(v0));
break;
case GS_TRIANGLEFAN:
pmin = v2.min_i16(v1.min_i16(v3));
pmax = v2.max_i16(v1.max_i16(v3));
pmin = v0.min_i32(v1.min_i32(v2));
pmax = v0.max_i32(v1.max_i32(v2));
break;
default:
break;
}
GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl());
GSVector4i test = pmax.lt32(m_scissor_cull_min) | pmin.gt32(m_scissor_cull_max);
switch (prim)
{
@ -3222,10 +3229,14 @@ __forceinline void GSState::VertexKick(u32 skip)
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
// Discard degenerate triangles. For native resolution, we can ignore the subpixel bits,
// because at the boundaries, they're irrelevant.
test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
break;
{
// Discard degenerate triangles which don't cover at least one pixel. Since the vertices are in native
// resolution space, we can use the integer locations. When upscaling, we can't, because a primitive which
// does not span a single pixel at 1x may span multiple pixels at higher resolutions.
const GSVector4i degen_test = pmin.eq32(pmax);
test |= m_nativeres ? degen_test.zwzw() : degen_test;
}
break;
default:
break;
}
@ -3234,18 +3245,15 @@ __forceinline void GSState::VertexKick(u32 skip)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
// TODO: any way to do a 16-bit integer cross product?
// cross product is zero most of the time because either of the vertices are the same
test = (test | v0 == v1) | (v1 == v2 | v0 == v2);
break;
case GS_TRIANGLEFAN:
test = (test | v3 == v1) | (v1 == v2 | v3 == v2);
test = (test | v0.eq64(v1)) | (v1.eq64(v2) | v0.eq64(v2));
break;
default:
break;
}
skip |= test.mask() & 15;
// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
skip |= test.mask() & 0xff;
}
if (skip != 0)
@ -3365,46 +3373,14 @@ __forceinline void GSState::VertexKick(u32 skip)
__assume(0);
}
{
const GSVector4i voffset(GSVector4i::loadl(&m_context->XYOFFSET));
auto get_vertex = [&](u32 i) {
GSVector4i v(GSVector4i::loadl(&m_vertex.buff[m_index.buff[(m_index.tail - n) + (i)]].XYZ));
v = v.upl16(); // 16->32
v = v.sub32(voffset); // -= (OFX, OFY)
v = v.sra32(4); // >> 4
return v;
};
const GSVector4i xy0(get_vertex(0));
GSVector4i min, max;
if (m_vertex.tail == n)
{
min = xy0;
max = xy0;
}
else
{
min = temp_draw_rect.min_i32(xy0);
max = temp_draw_rect.zwzw().max_i32(xy0);
}
if constexpr (n > 1)
{
const GSVector4i xy1(get_vertex(1));
min = min.min_i32(xy1);
max = max.max_i32(xy1);
if constexpr (n > 2)
{
const GSVector4i xy2(get_vertex(2));
min = min.min_i32(xy2);
max = max.max_i32(xy2);
}
}
temp_draw_rect = min.upl64(max).rintersect(m_context->scissor.in);
}
// Update rectangle for the current draw. We can use the re-integer coordinates from min/max here.
const GSVector4i draw_min = pmin.zwzw();
const GSVector4i draw_max = pmax;
if (m_vertex.tail != n)
temp_draw_rect = temp_draw_rect.min_i32(draw_min).blend32<12>(temp_draw_rect.max_i32(draw_max));
else
temp_draw_rect = draw_min.blend32<12>(draw_max);
temp_draw_rect = temp_draw_rect.rintersect(m_context->scissor.in);
CLUTAutoFlush(prim);

View File

@ -141,15 +141,17 @@ private:
protected:
GSVertex m_v = {};
float m_q = 1.0f;
GSVector4i m_scissor = {};
GSVector4i m_ofxy = {};
GSVector4i m_scissor_cull_min = {};
GSVector4i m_scissor_cull_max = {};
GSVector4i m_xyof = {};
struct
{
GSVertex* buff;
u32 head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
u32 xy_tail;
u64 xy[4];
GSVector4i xy[4];
GSVector4i xyhead;
} m_vertex = {};
struct

View File

@ -969,6 +969,11 @@ public:
return GSVector4i(_mm_cmpeq_epi32(m, v.m));
}
// Compares this vector with `v` as two 64-bit lanes via _mm_cmpeq_epi64.
// Each lane of the result is all ones where the lanes are equal and zero otherwise.
__forceinline GSVector4i eq64(const GSVector4i& v) const
{
	const __m128i lane_mask = _mm_cmpeq_epi64(m, v.m);
	return GSVector4i(lane_mask);
}
__forceinline GSVector4i neq8(const GSVector4i& v) const
{
return ~eq8(v);