mirror of https://github.com/PCSX2/pcsx2.git
GS: Use 32-bit vertex positions for culling
More accurate, stops it passing through vertices which are off-screen and coordinates overflowed. Differences versus current have been manually verified to be correct.
This commit is contained in:
parent
c77d8b3709
commit
2046a9b414
|
@ -79,6 +79,44 @@ static int extend(int uv, int size)
|
|||
return size;
|
||||
}
|
||||
|
||||
GSDrawingContext::GSDrawingContext()
|
||||
{
|
||||
std::memset(&offset, 0, sizeof(offset));
|
||||
|
||||
Reset();
|
||||
}
|
||||
|
||||
void GSDrawingContext::Reset()
|
||||
{
|
||||
std::memset(&XYOFFSET, 0, sizeof(XYOFFSET));
|
||||
std::memset(&TEX0, 0, sizeof(TEX0));
|
||||
std::memset(&TEX1, 0, sizeof(TEX1));
|
||||
std::memset(&CLAMP, 0, sizeof(CLAMP));
|
||||
std::memset(&MIPTBP1, 0, sizeof(MIPTBP1));
|
||||
std::memset(&MIPTBP2, 0, sizeof(MIPTBP2));
|
||||
std::memset(&SCISSOR, 0, sizeof(SCISSOR));
|
||||
std::memset(&ALPHA, 0, sizeof(ALPHA));
|
||||
std::memset(&TEST, 0, sizeof(TEST));
|
||||
std::memset(&FBA, 0, sizeof(FBA));
|
||||
std::memset(&FRAME, 0, sizeof(FRAME));
|
||||
std::memset(&ZBUF, 0, sizeof(ZBUF));
|
||||
}
|
||||
|
||||
void GSDrawingContext::UpdateScissor()
|
||||
{
|
||||
// Scissor registers are inclusive of the upper bounds.
|
||||
const GSVector4i rscissor = GSVector4i(static_cast<int>(SCISSOR.SCAX0), static_cast<int>(SCISSOR.SCAY0),
|
||||
static_cast<int>(SCISSOR.SCAX1), static_cast<int>(SCISSOR.SCAY1));
|
||||
scissor.in = rscissor + GSVector4i::cxpr(0, 0, 1, 1);
|
||||
|
||||
// Fixed-point scissor min/max, used for rejecting primitives which are entirely outside.
|
||||
scissor.cull = rscissor.sll32(4);
|
||||
|
||||
// Offset applied to vertices for culling, zw is for native resolution culling
|
||||
// We want to round subpixels down, because at least one pixel gets filled per scanline.
|
||||
scissor.xyof = GSVector4i::loadl(&XYOFFSET.U64).xyxy().sub32(GSVector4i::cxpr(0, 0, 15, 15));
|
||||
}
|
||||
|
||||
GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap) const
|
||||
{
|
||||
if (mipmap)
|
||||
|
|
|
@ -39,8 +39,8 @@ public:
|
|||
struct
|
||||
{
|
||||
GSVector4i in;
|
||||
GSVector4i ex;
|
||||
GSVector4i ofxy;
|
||||
GSVector4i cull;
|
||||
GSVector4i xyof;
|
||||
} scissor;
|
||||
|
||||
struct
|
||||
|
@ -50,50 +50,11 @@ public:
|
|||
GSPixelOffset4* fzb4;
|
||||
} offset;
|
||||
|
||||
GSDrawingContext()
|
||||
{
|
||||
memset(&offset, 0, sizeof(offset));
|
||||
GSDrawingContext();
|
||||
|
||||
Reset();
|
||||
}
|
||||
void Reset();
|
||||
|
||||
void Reset()
|
||||
{
|
||||
memset(&XYOFFSET, 0, sizeof(XYOFFSET));
|
||||
memset(&TEX0, 0, sizeof(TEX0));
|
||||
memset(&TEX1, 0, sizeof(TEX1));
|
||||
memset(&CLAMP, 0, sizeof(CLAMP));
|
||||
memset(&MIPTBP1, 0, sizeof(MIPTBP1));
|
||||
memset(&MIPTBP2, 0, sizeof(MIPTBP2));
|
||||
memset(&SCISSOR, 0, sizeof(SCISSOR));
|
||||
memset(&ALPHA, 0, sizeof(ALPHA));
|
||||
memset(&TEST, 0, sizeof(TEST));
|
||||
memset(&FBA, 0, sizeof(FBA));
|
||||
memset(&FRAME, 0, sizeof(FRAME));
|
||||
memset(&ZBUF, 0, sizeof(ZBUF));
|
||||
}
|
||||
|
||||
void UpdateScissor()
|
||||
{
|
||||
ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
|
||||
|
||||
scissor.ex.U16[0] = (u16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
|
||||
scissor.ex.U16[1] = (u16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
|
||||
scissor.ex.U16[2] = (u16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
|
||||
scissor.ex.U16[3] = (u16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
|
||||
|
||||
scissor.in = GSVector4i(
|
||||
(int)SCISSOR.SCAX0,
|
||||
(int)SCISSOR.SCAY0,
|
||||
(int)SCISSOR.SCAX1 + 1,
|
||||
(int)SCISSOR.SCAY1 + 1);
|
||||
|
||||
scissor.ofxy = GSVector4i(
|
||||
0x8000,
|
||||
0x8000,
|
||||
(int)XYOFFSET.OFX - 15,
|
||||
(int)XYOFFSET.OFY - 15);
|
||||
}
|
||||
void UpdateScissor();
|
||||
|
||||
GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap = false) const;
|
||||
|
||||
|
|
|
@ -174,7 +174,7 @@ void GSState::Reset(bool hardware_reset)
|
|||
// after reset (otherwise it'd only ever render 1x1).
|
||||
//
|
||||
if (!hardware_reset && GSConfig.UseHardwareRenderer())
|
||||
m_env.CTXT[i].scissor.ex = GSVector4i::xffffffff();
|
||||
m_env.CTXT[i].scissor.cull = GSVector4i::xffffffff();
|
||||
|
||||
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
|
||||
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
|
||||
|
@ -1684,8 +1684,9 @@ void GSState::FlushPrim()
|
|||
{
|
||||
GSVector4i* RESTRICT vert_ptr = (GSVector4i*)&m_vertex.buff[i];
|
||||
GSVector4i v = vert_ptr[1];
|
||||
v = v.xxxx().u16to32().sub32(m_ofxy);
|
||||
GSVector4i::storel(&m_vertex.xy[i & 3], v.blend16<0xf0>(v.sra32(4)).ps32());
|
||||
v = v.xxxx().u16to32().sub32(m_xyof);
|
||||
v = v.blend32<12>(v.sra32(4));
|
||||
m_vertex.xy[i & 3] = v;
|
||||
m_vertex.xy_tail = unused;
|
||||
}
|
||||
}
|
||||
|
@ -2664,8 +2665,9 @@ void GSState::UpdateContext()
|
|||
|
||||
void GSState::UpdateScissor()
|
||||
{
|
||||
m_scissor = m_context->scissor.ex;
|
||||
m_ofxy = m_context->scissor.ofxy;
|
||||
m_scissor_cull_min = m_context->scissor.cull.xyxy();
|
||||
m_scissor_cull_max = m_context->scissor.cull.zwzw();
|
||||
m_xyof = m_context->scissor.xyof;
|
||||
m_scissor_invalid = !m_context->scissor.in.gt32(m_context->scissor.in.zwzw()).allfalse();
|
||||
}
|
||||
|
||||
|
@ -3165,9 +3167,18 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
tailptr[0] = new_v0;
|
||||
tailptr[1] = new_v1;
|
||||
|
||||
const GSVector4i xy = new_v1.xxxx().u16to32().sub32(m_ofxy);
|
||||
// We maintain the X/Y coordinates for the last 4 vertices, as well as the head for triangle fans, so we can compute
|
||||
// the min/max, and cull degenerate triangles, which saves draws in some cases. Why 4? Mod 4 is cheaper than Mod 3.
|
||||
// These vertices are a full vector containing <X_Fixed_Point, Y_Fixed_Point, X_Integer, Y_Integer>. We use the
|
||||
// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
|
||||
// applied, then we split it into the fixed/integer portions.
|
||||
const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
|
||||
const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32(4));
|
||||
m_vertex.xy[xy_tail & 3] = xy;
|
||||
|
||||
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
|
||||
// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.
|
||||
if (prim == GS_TRIANGLEFAN && tail == head)
|
||||
m_vertex.xyhead = xy;
|
||||
|
||||
m_vertex.tail = ++tail;
|
||||
m_vertex.xy_tail = ++xy_tail;
|
||||
|
@ -3177,44 +3188,40 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
if (m < n)
|
||||
return;
|
||||
|
||||
|
||||
// Skip draws when scissor is out of range (i.e. bottom-right is less than top-left), since everything will get clipped.
|
||||
skip |= static_cast<u32>(m_scissor_invalid);
|
||||
|
||||
if (skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
|
||||
GSVector4i pmin, pmax;
|
||||
if (skip == 0)
|
||||
{
|
||||
GSVector4i pmin, pmax;
|
||||
|
||||
const GSVector4i v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
|
||||
const GSVector4i v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
|
||||
const GSVector4i v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
|
||||
const GSVector4i v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
|
||||
const GSVector4i v0 = m_vertex.xy[(xy_tail - 1) & 3];
|
||||
const GSVector4i v1 = m_vertex.xy[(xy_tail - 2) & 3];
|
||||
const GSVector4i v2 = (prim == GS_TRIANGLEFAN) ? m_vertex.xyhead : m_vertex.xy[(xy_tail - 3) & 3];
|
||||
|
||||
switch (prim)
|
||||
{
|
||||
case GS_POINTLIST:
|
||||
pmin = v2;
|
||||
pmax = v2;
|
||||
pmin = v0;
|
||||
pmax = v0;
|
||||
break;
|
||||
case GS_LINELIST:
|
||||
case GS_LINESTRIP:
|
||||
case GS_SPRITE:
|
||||
pmin = v2.min_i16(v1);
|
||||
pmax = v2.max_i16(v1);
|
||||
pmin = v0.min_i32(v1);
|
||||
pmax = v0.max_i32(v1);
|
||||
break;
|
||||
case GS_TRIANGLELIST:
|
||||
case GS_TRIANGLESTRIP:
|
||||
pmin = v2.min_i16(v1.min_i16(v0));
|
||||
pmax = v2.max_i16(v1.max_i16(v0));
|
||||
break;
|
||||
case GS_TRIANGLEFAN:
|
||||
pmin = v2.min_i16(v1.min_i16(v3));
|
||||
pmax = v2.max_i16(v1.max_i16(v3));
|
||||
pmin = v0.min_i32(v1.min_i32(v2));
|
||||
pmax = v0.max_i32(v1.max_i32(v2));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl());
|
||||
GSVector4i test = pmax.lt32(m_scissor_cull_min) | pmin.gt32(m_scissor_cull_max);
|
||||
|
||||
switch (prim)
|
||||
{
|
||||
|
@ -3222,10 +3229,14 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
case GS_TRIANGLESTRIP:
|
||||
case GS_TRIANGLEFAN:
|
||||
case GS_SPRITE:
|
||||
// Discard degenerate triangles. For native resolution, we can ignore the subpixel bits,
|
||||
// because at the boundaries, they're irrelevant.
|
||||
test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
|
||||
break;
|
||||
{
|
||||
// Discard degenerate triangles which don't cover at least one pixel. Since the vertices are in native
|
||||
// resolution space, we can use the integer locations. When upscaling, we can't, because a primitive which
|
||||
// does not span a single pixel at 1x may span multiple pixels at higher resolutions.
|
||||
const GSVector4i degen_test = pmin.eq32(pmax);
|
||||
test |= m_nativeres ? degen_test.zwzw() : degen_test;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -3234,18 +3245,15 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
{
|
||||
case GS_TRIANGLELIST:
|
||||
case GS_TRIANGLESTRIP:
|
||||
// TODO: any way to do a 16-bit integer cross product?
|
||||
// cross product is zero most of the time because either of the vertices are the same
|
||||
test = (test | v0 == v1) | (v1 == v2 | v0 == v2);
|
||||
break;
|
||||
case GS_TRIANGLEFAN:
|
||||
test = (test | v3 == v1) | (v1 == v2 | v3 == v2);
|
||||
test = (test | v0.eq64(v1)) | (v1.eq64(v2) | v0.eq64(v2));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
skip |= test.mask() & 15;
|
||||
// We only care about the xy passing the skip test. zw is the offset coordinates for native culling.
|
||||
skip |= test.mask() & 0xff;
|
||||
}
|
||||
|
||||
if (skip != 0)
|
||||
|
@ -3365,46 +3373,14 @@ __forceinline void GSState::VertexKick(u32 skip)
|
|||
__assume(0);
|
||||
}
|
||||
|
||||
{
|
||||
const GSVector4i voffset(GSVector4i::loadl(&m_context->XYOFFSET));
|
||||
|
||||
auto get_vertex = [&](u32 i) {
|
||||
GSVector4i v(GSVector4i::loadl(&m_vertex.buff[m_index.buff[(m_index.tail - n) + (i)]].XYZ));
|
||||
v = v.upl16(); // 16->32
|
||||
v = v.sub32(voffset); // -= (OFX, OFY)
|
||||
v = v.sra32(4); // >> 4
|
||||
return v;
|
||||
};
|
||||
|
||||
const GSVector4i xy0(get_vertex(0));
|
||||
GSVector4i min, max;
|
||||
if (m_vertex.tail == n)
|
||||
{
|
||||
min = xy0;
|
||||
max = xy0;
|
||||
}
|
||||
else
|
||||
{
|
||||
min = temp_draw_rect.min_i32(xy0);
|
||||
max = temp_draw_rect.zwzw().max_i32(xy0);
|
||||
}
|
||||
|
||||
if constexpr (n > 1)
|
||||
{
|
||||
const GSVector4i xy1(get_vertex(1));
|
||||
min = min.min_i32(xy1);
|
||||
max = max.max_i32(xy1);
|
||||
|
||||
if constexpr (n > 2)
|
||||
{
|
||||
const GSVector4i xy2(get_vertex(2));
|
||||
min = min.min_i32(xy2);
|
||||
max = max.max_i32(xy2);
|
||||
}
|
||||
}
|
||||
|
||||
temp_draw_rect = min.upl64(max).rintersect(m_context->scissor.in);
|
||||
}
|
||||
// Update rectangle for the current draw. We can use the re-integer coordinates from min/max here.
|
||||
const GSVector4i draw_min = pmin.zwzw();
|
||||
const GSVector4i draw_max = pmax;
|
||||
if (m_vertex.tail != n)
|
||||
temp_draw_rect = temp_draw_rect.min_i32(draw_min).blend32<12>(temp_draw_rect.max_i32(draw_max));
|
||||
else
|
||||
temp_draw_rect = draw_min.blend32<12>(draw_max);
|
||||
temp_draw_rect = temp_draw_rect.rintersect(m_context->scissor.in);
|
||||
|
||||
CLUTAutoFlush(prim);
|
||||
|
||||
|
|
|
@ -141,15 +141,17 @@ private:
|
|||
protected:
|
||||
GSVertex m_v = {};
|
||||
float m_q = 1.0f;
|
||||
GSVector4i m_scissor = {};
|
||||
GSVector4i m_ofxy = {};
|
||||
GSVector4i m_scissor_cull_min = {};
|
||||
GSVector4i m_scissor_cull_max = {};
|
||||
GSVector4i m_xyof = {};
|
||||
|
||||
struct
|
||||
{
|
||||
GSVertex* buff;
|
||||
u32 head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
|
||||
u32 xy_tail;
|
||||
u64 xy[4];
|
||||
GSVector4i xy[4];
|
||||
GSVector4i xyhead;
|
||||
} m_vertex = {};
|
||||
|
||||
struct
|
||||
|
|
|
@ -969,6 +969,11 @@ public:
|
|||
return GSVector4i(_mm_cmpeq_epi32(m, v.m));
|
||||
}
|
||||
|
||||
__forceinline GSVector4i eq64(const GSVector4i& v) const
|
||||
{
|
||||
return GSVector4i(_mm_cmpeq_epi64(m, v.m));
|
||||
}
|
||||
|
||||
__forceinline GSVector4i neq8(const GSVector4i& v) const
|
||||
{
|
||||
return ~eq8(v);
|
||||
|
|
Loading…
Reference in New Issue