GS SW: Handle flat prims without float conversion

This commit is contained in:
refractionpcsx2 2021-11-16 21:24:23 +00:00
parent 125b9ea1e1
commit 14e8d840ff
6 changed files with 116 additions and 39 deletions

View File

@ -98,6 +98,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.fb = m_global.sel.fb;
sel.zb = m_global.sel.zb;
sel.zoverflow = m_global.sel.zoverflow;
sel.zequal = m_global.sel.zequal;
sel.notest = m_global.sel.notest;
m_sp = m_sp_map[sel];
@ -143,13 +144,20 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const u32* index, const
if (has_z)
{
m_local.d8.p.z = dp8.extract32<2>();
GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
for (int i = 0; i < 8; i++)
if (sel.zequal)
{
m_local.d[i].z = dz * shift[1 + i];
m_local.p.z = vertex[index[1]].t.U32[3];
}
{
m_local.d8.p.z = dp8.extract32<2>();
const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
for (int i = 0; i < 8; i++)
{
m_local.d[i].z = dz * shift[1 + i];
}
}
}
}
@ -545,9 +553,14 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if (sel.prim != GS_SPRITE_CLASS)
{
// Need to handle when the float converts incorrectly
GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
if (sel.zoverflow)
if (sel.zequal)
{
zs = GSVector8i::broadcast32(&m_local.p.z);
}
else if (sel.zoverflow)
{
zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001());
}
@ -2787,7 +2800,7 @@ void GSDrawScanline::WritePixel(const T& src, int addr, int i, u32 psm)
*(u32*)dst = (src.U32[i] & 0xffffff) | (*(u32*)dst & 0xff000000);
break;
case 2:
*(u16*)dst = src.u16[i * 2];
*(u16*)dst = src.U16[i * 2];
break;
}
}

View File

@ -784,19 +784,24 @@ void GSDrawScanlineCodeGenerator2::Init()
if (m_sel.zb)
{
// z = vp.zzzz() + m_local.d[skip].z;
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
if (is64)
if (!m_sel.zequal)
{
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
// z = vp.zzzz() + m_local.d[skip].z;
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
if (is64)
{
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
}
else
{
movaps(ptr[&m_local.temp.z], z);
movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
movaps(ptr[&m_local.temp.zo], xym2);
addps(z, xym2);
}
}
else
{
movaps(ptr[&m_local.temp.z], z);
movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
movaps(ptr[&m_local.temp.zo], xym2);
addps(z, xym2);
}
pbroadcastdLocal(z, _rip_local(p.z));
}
}
}
@ -995,7 +1000,11 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.zb)
{
if (is32)
if (m_sel.zequal)
{
pbroadcastdLocal(z, _rip_local(p.z));
}
else if (is32)
{
broadcastssLocal(z, _rip_local_d_p(z));
addps(z, _rip_local(temp.zo));
@ -1185,9 +1194,18 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
if (m_sel.prim != GS_SPRITE_CLASS)
{
if (m_sel.zoverflow)
if (m_sel.zequal)
{
ONLY64(movdqa(xym0, _z));
}
else if (m_sel.zoverflow)
{
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
z /= 2;
zs = GSVector8i(z, true);
zs = zs.min_u32(GSVector8i::x7fffffff());
zs = zs.sll32(1) | 1;*/
auto m_half = loadAddress(rax, &GSVector4::m_half);
@ -1195,6 +1213,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
vbroadcastss(temp1, ptr[m_half]);
else
movaps(temp1, ptr[m_half]);
mulps(temp1, z);
cvttps2dq(temp1, temp1);
pslld(temp1, 1);

View File

@ -407,6 +407,7 @@ public:
AFORWARD(3, pinsrd, ARGS_XOI)
AFORWARD(2, pmaxsw, ARGS_XO)
AFORWARD(2, pminsd, ARGS_XO)
AFORWARD(2, pminud, ARGS_XO)
AFORWARD(2, pminsw, ARGS_XO)
SFORWARD(2, pmovsxbd, ARGS_XO)
SFORWARD(2, pmovmskb, const Reg32e&, const Xmm&)

View File

@ -313,7 +313,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
}
}
if (primclass == GS_SPRITE_CLASS)
if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z)
{
xyzuvf = xyzuvf.min_u32(z_max);
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
@ -1341,6 +1341,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
gd.sel.zequal = !!m_vt.m_eq.z;
gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
}

View File

@ -64,7 +64,7 @@ union GSScanlineSelector
u32 mmin : 2; // 54
u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
u32 zequal : 1; // 56
u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
};

View File

@ -187,21 +187,43 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
if (m_sel.zequal)
{
// m_local.d[i].z = dz * m_shift[i];
u32 offset = 0;
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
movdqa(_rip_local(p.z), xmm0);
}
else
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
}
}
}
}
@ -257,13 +279,34 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
if (m_en.z)
{
// m_local.d8.p.z = dp8.extract32<2>();
if (m_sel.zequal)
{
u32 offset = 0;
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
extractps(_rip_local_d_p(z), xmm1, 2);
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
// GSVector8 dz = GSVector8(dscan.p).zzzz();
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
mov(_rip_local(p.z), t1.cvt32());
}
else
{
// m_local.d8.p.z = dp8.extract32<2>();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
extractps(_rip_local_d_p(z), xmm1, 2);
// GSVector8 dz = GSVector8(dscan.p).zzzz();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
}
}
if (m_en.f)