From 14e8d840fff619bffc963cb518ada58bb56e7722 Mon Sep 17 00:00:00 2001 From: refractionpcsx2 Date: Tue, 16 Nov 2021 21:24:23 +0000 Subject: [PATCH] GS SW: Handle flat prims without float conversion --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 29 +++++-- .../SW/GSDrawScanlineCodeGenerator.all.cpp | 43 ++++++++--- pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h | 1 + pcsx2/GS/Renderers/SW/GSRendererSW.cpp | 3 +- pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 2 +- .../SW/GSSetupPrimCodeGenerator.all.cpp | 77 +++++++++++++++---- 6 files changed, 116 insertions(+), 39 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 5ef0082f27..f5799d7078 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -98,6 +98,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) sel.fb = m_global.sel.fb; sel.zb = m_global.sel.zb; sel.zoverflow = m_global.sel.zoverflow; + sel.zequal = m_global.sel.zequal; sel.notest = m_global.sel.notest; m_sp = m_sp_map[sel]; @@ -143,13 +144,20 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const u32* index, const if (has_z) { - m_local.d8.p.z = dp8.extract32<2>(); - - GSVector8 dz = GSVector8::broadcast32(&dscan.p.z); - - for (int i = 0; i < 8; i++) + if (sel.zequal) { - m_local.d[i].z = dz * shift[1 + i]; + m_local.p.z = vertex[index[1]].t.U32[3]; + } + + { + m_local.d8.p.z = dp8.extract32<2>(); + + const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z); + + for (int i = 0; i < 8; i++) + { + m_local.d[i].z = dz * shift[1 + i]; + } } } } @@ -545,9 +553,14 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if (sel.prim != GS_SPRITE_CLASS) { + // Need to handle when the float converts incorrectly GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; - if (sel.zoverflow) + if (sel.zequal) + { + zs = GSVector8i::broadcast32(&m_local.p.z); + } + else if (sel.zoverflow) { zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); } @@ -2787,7 +2800,7 @@ void GSDrawScanline::WritePixel(const T& src, int addr, int i, u32 psm) *(u32*)dst = (src.U32[i] & 0xffffff) | (*(u32*)dst & 0xff000000); break; case 2: - *(u16*)dst = src.u16[i * 2]; + *(u16*)dst = src.U16[i * 2]; break; } } diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index b022d10529..65b380872e 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -784,19 +784,24 @@ void GSDrawScanlineCodeGenerator2::Init() if (m_sel.zb) { - // z = vp.zzzz() + m_local.d[skip].z; - shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2)); - if (is64) + if (!m_sel.zequal) { - addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + // z = vp.zzzz() + m_local.d[skip].z; + shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2)); + if (is64) + { + addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + } + else + { + movaps(ptr[&m_local.temp.z], z); + movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + movaps(ptr[&m_local.temp.zo], xym2); + addps(z, xym2); + } } else - { - movaps(ptr[&m_local.temp.z], z); - movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); - movaps(ptr[&m_local.temp.zo], xym2); - addps(z, xym2); - } + pbroadcastdLocal(z, _rip_local(p.z)); } } } @@ -995,7 +1000,11 @@ void GSDrawScanlineCodeGenerator2::Step() if (m_sel.zb) { - if (is32) + if (m_sel.zequal) + { + pbroadcastdLocal(z, _rip_local(p.z)); + } + else if (is32) { broadcastssLocal(z, _rip_local_d_p(z)); addps(z, _rip_local(temp.zo)); @@ -1185,9 +1194,18 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) if (m_sel.prim != GS_SPRITE_CLASS) { - if (m_sel.zoverflow) + if (m_sel.zequal) + { + ONLY64(movdqa(xym0, _z)); + } + else if (m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + /*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; + z /= 2; + zs = GSVector8i(z, true); + zs = zs.min_u32(GSVector8i::x7fffffff()); + zs = zs.sll32(1) | 1;*/ auto m_half = loadAddress(rax, &GSVector4::m_half); @@ -1195,6 +1213,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) vbroadcastss(temp1, ptr[m_half]); else movaps(temp1, ptr[m_half]); + mulps(temp1, z); cvttps2dq(temp1, temp1); pslld(temp1, 1); diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h index e5f2f2bde8..74a6f35748 100644 --- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h @@ -407,6 +407,7 @@ public: AFORWARD(3, pinsrd, ARGS_XOI) AFORWARD(2, pmaxsw, ARGS_XO) AFORWARD(2, pminsd, ARGS_XO) + AFORWARD(2, pminud, ARGS_XO) AFORWARD(2, pminsw, ARGS_XO) SFORWARD(2, pmovsxbd, ARGS_XO) SFORWARD(2, pmovmskb, const Reg32e&, const Xmm&) diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp index be91157975..4a4ecbc904 100644 --- a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp @@ -313,7 +313,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* } } - if (primclass == GS_SPRITE_CLASS) + if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z) { xyzuvf = xyzuvf.min_u32(z_max); t = t.insert32<1, 3>(GSVector4::cast(xyzuvf)); @@ -1341,6 +1341,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; + gd.sel.zequal = !!m_vt.m_eq.z; gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U; gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max; } diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index 4c041e2751..38f2d0093f 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -64,7 +64,7 @@ union GSScanlineSelector u32 mmin : 2; // 54 u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction - + u32 zequal : 1; // 56 u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program }; diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index 0f67d560bd..eeef8aa39a 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -187,21 +187,43 @@ void GSSetupPrimCodeGenerator2::Depth_XMM() if (m_en.z) { - // GSVector4 dz = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - // m_local.d4.z = dz * 4.0f; - - THREEARG(mulps, xmm1, xmm0, xmm3); - movdqa(_rip_local_d_p(z), xmm1); - - for (int i = 0; i < (m_sel.notest ? 1 : 4); i++) + if (m_sel.zequal) { - // m_local.d[i].z = dz * m_shift[i]; + u32 offset = 0; + if (m_sel.prim != GS_POINT_CLASS) + offset = sizeof(u32) * 1; - THREEARG(mulps, xmm1, xmm0, XYm(4 + i)); - movdqa(_rip_local(d[i].z), xmm1); + if (is32) + mov(_index, ptr[rsp + _32_index]); + mov(eax, ptr[_index + offset]); + shl(eax, 6); // * sizeof(GSVertexSW) + if (is64) + add(rax, _64_vertex); + else + add(rax, ptr[rsp + _32_vertex]); + + movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); + pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + movdqa(_rip_local(p.z), xmm0); + } + else + { + // GSVector4 dz = p.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + // m_local.d4.z = dz * 4.0f; + + THREEARG(mulps, xmm1, xmm0, xmm3); + movdqa(_rip_local_d_p(z), xmm1); + + for (int i = 0; i < (m_sel.notest ? 1 : 4); i++) + { + // m_local.d[i].z = dz * m_shift[i]; + + THREEARG(mulps, xmm1, xmm0, XYm(4 + i)); + movdqa(_rip_local(d[i].z), xmm1); + } } } } @@ -257,13 +279,34 @@ void GSSetupPrimCodeGenerator2::Depth_YMM() if (m_en.z) { - // m_local.d8.p.z = dp8.extract32<2>(); + if (m_sel.zequal) + { + u32 offset = 0; + if (m_sel.prim != GS_POINT_CLASS) + offset = sizeof(u32) * 1; - extractps(_rip_local_d_p(z), xmm1, 2); + if (is32) + mov(_index, ptr[rsp + _32_index]); + mov(eax, ptr[_index + offset]); + shl(eax, 6); // * sizeof(GSVertexSW) + if (is64) + add(rax, _64_vertex); + else + add(rax, ptr[rsp + _32_vertex]); - // GSVector8 dz = GSVector8(dscan.p).zzzz(); + mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]); + mov(_rip_local(p.z), t1.cvt32()); + } + else + { + // m_local.d8.p.z = dp8.extract32<2>(); - vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); + extractps(_rip_local_d_p(z), xmm1, 2); + + // GSVector8 dz = GSVector8(dscan.p).zzzz(); + + vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); + } } if (m_en.f)