GSdx-sw: Add Z clamping to GSdx SW mode (#3433)

* Add Z clamping to GSdx SW mode

* Fix spacing

* Only clamp when the maximum vertex Z is greater than the Z format's maximum value

* Fix Z format switches

* Get rid of needless shuffle

* Whoops, missed a case

* Replace the Z format switches with a shift

* Disable triangle clamping for SSE2

* Implement clamping on GS Raster Interpreter

* Added SSE2 Triangle Z clamping by KrossX
This commit is contained in:
refractionpcsx2 2020-06-05 20:56:16 +01:00 committed by GitHub
parent d08e49f2df
commit ca903b6b14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 125 additions and 12 deletions

View File

@ -587,6 +587,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
zdo -= GSVector8i::x80000000();
}
if (sel.zclamp)
zso = zso.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8));
switch(sel.ztst)
{
case ZTST_GEQUAL: test |= zso < zdo; break;
@ -1219,6 +1222,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
zs = zs.blend8(zd, zm);
}
if (sel.zclamp)
zs = zs.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8));
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
if(sel.notest)
@ -1696,6 +1702,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
zdo -= GSVector4i::x80000000();
}
if (sel.zclamp)
{
const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8);
zso.u32[0] = std::min(z_max, zso.u32[0]);
zso.u32[1] = std::min(z_max, zso.u32[1]);
zso.u32[2] = std::min(z_max, zso.u32[2]);
zso.u32[3] = std::min(z_max, zso.u32[3]);
}
switch(sel.ztst)
{
case ZTST_GEQUAL: test |= zso < zdo; break;
@ -2333,6 +2349,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
zs = zs.blend8(zd, zm);
}
if (sel.zclamp)
{
const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8);
zs.u32[0] = std::min(z_max, zs.u32[0]);
zs.u32[1] = std::min(z_max, zs.u32[1]);
zs.u32[2] = std::min(z_max, zs.u32[2]);
zs.u32[3] = std::min(z_max, zs.u32[3]);
}
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
if(sel.notest)

View File

@ -635,6 +635,14 @@ void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
vcvttps2dq(xmm0, xmm0);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
vpcmpeqd(temp1, temp1);
vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
vpminsd(xmm0, temp1);
}
if(m_sel.zwrite)
{
vmovdqa(ptr[&m_local.temp.zs], xmm0);
@ -2378,6 +2386,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
vpcmpeqd(xmm7, xmm7);
vpsrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
vpminsd(xmm1, xmm7);
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);

View File

@ -637,6 +637,14 @@ void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2)
vcvttps2dq(ymm0, ymm0);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
vpcmpeqd(temp1, temp1);
vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
vpminsd(ymm0, temp1);
}
if(m_sel.zwrite)
{
vmovdqa(ptr[&m_local.temp.zs], ymm0);
@ -2374,6 +2382,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
vpbroadcastd(ymm1, ptr[&m_local.p.z]);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
vpcmpeqd(ymm7, ymm7);
vpsrld(ymm7, (uint8)((m_sel.zpsm & 0x3) * 8));
vpminsd(ymm1, ymm7);
}
if(m_sel.ztest && m_sel.zpsm < 2)
{
// zs = zs.blend8(zd, zm);

View File

@ -640,6 +640,26 @@ void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
cvttps2dq(xmm0, xmm0);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
#if _M_SSE >= 0x401
pcmpeqd(temp1, temp1);
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
pminsd(xmm0, temp1);
#else
pcmpeqd(temp1, temp1);
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
pcmpgtd(temp1, xmm0);
pand(xmm0, temp1);
pcmpeqd(temp2, temp2);
pxor(temp1, temp2);
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
por(xmm0, temp1);
#endif
}
if(m_sel.zwrite)
{
movdqa(ptr[&m_local.temp.zs], xmm0);
@ -2412,6 +2432,25 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
blend8(xmm1, xmm7);
}
// Clamp Z to ZPSM_FMT_MAX
if (m_sel.zclamp)
{
#if _M_SSE >= 0x401
pcmpeqd(xmm7, xmm7);
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
pminsd(xmm1, xmm7);
#else
static GSVector4i all_1s = GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
pcmpeqd(xmm7, xmm7);
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
pcmpgtd(xmm7, xmm1);
pand(xmm1, xmm7);
pxor(xmm7, ptr[&all_1s]);
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
por(xmm1, xmm7);
#endif
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1);

View File

@ -287,6 +287,16 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
GSVector4i off = (GSVector4i)m_context->XYOFFSET;
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
#if _M_SSE >= 0x401
GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
#else
uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
#endif
for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
{
GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
@ -351,10 +361,12 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
{
#if _M_SSE >= 0x401
xyzuvf = xyzuvf.min_u32(z_max);
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
#else
z = std::min(z, z_max);
t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
#endif
@ -1326,9 +1338,12 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
if(zwrite || ztest)
{
uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[context->ZBUF.PSM].fmt * 8);
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
gd.sel.zoverflow = (uint32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
gd.sel.zclamp = (uint32)GSVector4i(m_vt.m_max.p).z > z_max;
}
#if _M_SSE >= 0x501

View File

@ -55,19 +55,20 @@ union GSScanlineSelector
uint32 zwrite:1; // 35
uint32 ztest:1; // 36
uint32 zoverflow:1; // 37 (z max >= 0x80000000)
uint32 wms:2; // 38
uint32 wmt:2; // 40
uint32 datm:1; // 42
uint32 colclamp:1; // 43
uint32 fba:1; // 44
uint32 dthe:1; // 45
uint32 prim:2; // 46
uint32 zclamp:1; // 38
uint32 wms:2; // 39
uint32 wmt:2; // 41
uint32 datm:1; // 43
uint32 colclamp:1; // 44
uint32 fba:1; // 45
uint32 dthe:1; // 46
uint32 prim:2; // 47
uint32 edge:1; // 48
uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 lcm:1; // 52
uint32 mmin:2; // 53
uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
uint32 edge:1; // 49
uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 lcm:1; // 53
uint32 mmin:2; // 54
uint32 notest:1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program