mirror of https://github.com/PCSX2/pcsx2.git
GSdx-sw: Add Z clamping to GSdx SW mode (#3433)
* Add Z clamping to GSdx SW mode
* Fix spacing
* Only clamp when max vertex z is greater than zfmt max
* Fix Z format switches
* Get rid of needless shuffle
* Whoops, missed a case
* Replace switches with a shift
* Disable triangle clamping for SSE2
* Implement clamping on GS Raster Interpreter
* Added SSE2 Triangle Z clamping by KrossX
This commit is contained in:
parent
d08e49f2df
commit
ca903b6b14
|
@ -587,6 +587,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
zdo -= GSVector8i::x80000000();
|
zdo -= GSVector8i::x80000000();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sel.zclamp)
|
||||||
|
zso = zso.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8));
|
||||||
|
|
||||||
switch(sel.ztst)
|
switch(sel.ztst)
|
||||||
{
|
{
|
||||||
case ZTST_GEQUAL: test |= zso < zdo; break;
|
case ZTST_GEQUAL: test |= zso < zdo; break;
|
||||||
|
@ -1219,6 +1222,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
zs = zs.blend8(zd, zm);
|
zs = zs.blend8(zd, zm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sel.zclamp)
|
||||||
|
zs = zs.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8));
|
||||||
|
|
||||||
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
|
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
|
||||||
|
|
||||||
if(sel.notest)
|
if(sel.notest)
|
||||||
|
@ -1696,6 +1702,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
zdo -= GSVector4i::x80000000();
|
zdo -= GSVector4i::x80000000();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sel.zclamp)
|
||||||
|
{
|
||||||
|
const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8);
|
||||||
|
|
||||||
|
zso.u32[0] = std::min(z_max, zso.u32[0]);
|
||||||
|
zso.u32[1] = std::min(z_max, zso.u32[1]);
|
||||||
|
zso.u32[2] = std::min(z_max, zso.u32[2]);
|
||||||
|
zso.u32[3] = std::min(z_max, zso.u32[3]);
|
||||||
|
}
|
||||||
|
|
||||||
switch(sel.ztst)
|
switch(sel.ztst)
|
||||||
{
|
{
|
||||||
case ZTST_GEQUAL: test |= zso < zdo; break;
|
case ZTST_GEQUAL: test |= zso < zdo; break;
|
||||||
|
@ -2333,6 +2349,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
zs = zs.blend8(zd, zm);
|
zs = zs.blend8(zd, zm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sel.zclamp)
|
||||||
|
{
|
||||||
|
const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8);
|
||||||
|
|
||||||
|
zs.u32[0] = std::min(z_max, zs.u32[0]);
|
||||||
|
zs.u32[1] = std::min(z_max, zs.u32[1]);
|
||||||
|
zs.u32[2] = std::min(z_max, zs.u32[2]);
|
||||||
|
zs.u32[3] = std::min(z_max, zs.u32[3]);
|
||||||
|
}
|
||||||
|
|
||||||
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
|
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
|
||||||
|
|
||||||
if(sel.notest)
|
if(sel.notest)
|
||||||
|
|
|
@ -635,6 +635,14 @@ void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
|
||||||
vcvttps2dq(xmm0, xmm0);
|
vcvttps2dq(xmm0, xmm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
vpcmpeqd(temp1, temp1);
|
||||||
|
vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
vpminsd(xmm0, temp1);
|
||||||
|
}
|
||||||
|
|
||||||
if(m_sel.zwrite)
|
if(m_sel.zwrite)
|
||||||
{
|
{
|
||||||
vmovdqa(ptr[&m_local.temp.zs], xmm0);
|
vmovdqa(ptr[&m_local.temp.zs], xmm0);
|
||||||
|
@ -2378,6 +2386,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
|
||||||
vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
|
vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
vpcmpeqd(xmm7, xmm7);
|
||||||
|
vpsrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
vpminsd(xmm1, xmm7);
|
||||||
|
}
|
||||||
|
|
||||||
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
|
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
|
||||||
|
|
||||||
WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
|
WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
|
||||||
|
|
|
@ -637,6 +637,14 @@ void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2)
|
||||||
vcvttps2dq(ymm0, ymm0);
|
vcvttps2dq(ymm0, ymm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
vpcmpeqd(temp1, temp1);
|
||||||
|
vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
vpminsd(ymm0, temp1);
|
||||||
|
}
|
||||||
|
|
||||||
if(m_sel.zwrite)
|
if(m_sel.zwrite)
|
||||||
{
|
{
|
||||||
vmovdqa(ptr[&m_local.temp.zs], ymm0);
|
vmovdqa(ptr[&m_local.temp.zs], ymm0);
|
||||||
|
@ -2374,6 +2382,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
|
||||||
vpbroadcastd(ymm1, ptr[&m_local.p.z]);
|
vpbroadcastd(ymm1, ptr[&m_local.p.z]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
vpcmpeqd(ymm7, ymm7);
|
||||||
|
vpsrld(ymm7, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
vpminsd(ymm1, ymm7);
|
||||||
|
}
|
||||||
|
|
||||||
if(m_sel.ztest && m_sel.zpsm < 2)
|
if(m_sel.ztest && m_sel.zpsm < 2)
|
||||||
{
|
{
|
||||||
// zs = zs.blend8(zd, zm);
|
// zs = zs.blend8(zd, zm);
|
||||||
|
|
|
@ -640,6 +640,26 @@ void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
|
||||||
cvttps2dq(xmm0, xmm0);
|
cvttps2dq(xmm0, xmm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
#if _M_SSE >= 0x401
|
||||||
|
pcmpeqd(temp1, temp1);
|
||||||
|
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
pminsd(xmm0, temp1);
|
||||||
|
#else
|
||||||
|
pcmpeqd(temp1, temp1);
|
||||||
|
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
pcmpgtd(temp1, xmm0);
|
||||||
|
pand(xmm0, temp1);
|
||||||
|
pcmpeqd(temp2, temp2);
|
||||||
|
pxor(temp1, temp2);
|
||||||
|
psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
por(xmm0, temp1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
if(m_sel.zwrite)
|
if(m_sel.zwrite)
|
||||||
{
|
{
|
||||||
movdqa(ptr[&m_local.temp.zs], xmm0);
|
movdqa(ptr[&m_local.temp.zs], xmm0);
|
||||||
|
@ -2412,6 +2432,25 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
|
||||||
blend8(xmm1, xmm7);
|
blend8(xmm1, xmm7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clamp Z to ZPSM_FMT_MAX
|
||||||
|
if (m_sel.zclamp)
|
||||||
|
{
|
||||||
|
#if _M_SSE >= 0x401
|
||||||
|
pcmpeqd(xmm7, xmm7);
|
||||||
|
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
pminsd(xmm1, xmm7);
|
||||||
|
#else
|
||||||
|
static GSVector4i all_1s = GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||||
|
pcmpeqd(xmm7, xmm7);
|
||||||
|
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
pcmpgtd(xmm7, xmm1);
|
||||||
|
pand(xmm1, xmm7);
|
||||||
|
pxor(xmm7, ptr[&all_1s]);
|
||||||
|
psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8));
|
||||||
|
por(xmm1, xmm7);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
|
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
|
||||||
|
|
||||||
WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
|
WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
|
||||||
|
|
|
@ -287,6 +287,16 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
||||||
GSVector4i off = (GSVector4i)m_context->XYOFFSET;
|
GSVector4i off = (GSVector4i)m_context->XYOFFSET;
|
||||||
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
|
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x401
|
||||||
|
|
||||||
|
GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
|
for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
|
||||||
{
|
{
|
||||||
GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
|
GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
|
||||||
|
@ -351,10 +361,12 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x401
|
#if _M_SSE >= 0x401
|
||||||
|
|
||||||
|
xyzuvf = xyzuvf.min_u32(z_max);
|
||||||
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
|
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
z = std::min(z, z_max);
|
||||||
t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
|
t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1326,9 +1338,12 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
||||||
|
|
||||||
if(zwrite || ztest)
|
if(zwrite || ztest)
|
||||||
{
|
{
|
||||||
|
uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[context->ZBUF.PSM].fmt * 8);
|
||||||
|
|
||||||
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
|
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
|
||||||
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
|
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
|
||||||
gd.sel.zoverflow = (uint32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
|
gd.sel.zoverflow = (uint32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
|
||||||
|
gd.sel.zclamp = (uint32)GSVector4i(m_vt.m_max.p).z > z_max;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
|
|
|
@ -55,19 +55,20 @@ union GSScanlineSelector
|
||||||
uint32 zwrite:1; // 35
|
uint32 zwrite:1; // 35
|
||||||
uint32 ztest:1; // 36
|
uint32 ztest:1; // 36
|
||||||
uint32 zoverflow:1; // 37 (z max >= 0x80000000)
|
uint32 zoverflow:1; // 37 (z max >= 0x80000000)
|
||||||
uint32 wms:2; // 38
|
uint32 zclamp:1; // 38
|
||||||
uint32 wmt:2; // 40
|
uint32 wms:2; // 39
|
||||||
uint32 datm:1; // 42
|
uint32 wmt:2; // 41
|
||||||
uint32 colclamp:1; // 43
|
uint32 datm:1; // 43
|
||||||
uint32 fba:1; // 44
|
uint32 colclamp:1; // 44
|
||||||
uint32 dthe:1; // 45
|
uint32 fba:1; // 45
|
||||||
uint32 prim:2; // 46
|
uint32 dthe:1; // 46
|
||||||
|
uint32 prim:2; // 47
|
||||||
|
|
||||||
uint32 edge:1; // 48
|
uint32 edge:1; // 49
|
||||||
uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
|
uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
|
||||||
uint32 lcm:1; // 52
|
uint32 lcm:1; // 53
|
||||||
uint32 mmin:2; // 53
|
uint32 mmin:2; // 54
|
||||||
uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
|
uint32 notest:1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
|
||||||
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
|
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
|
||||||
|
|
||||||
uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program
|
uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program
|
||||||
|
|
Loading…
Reference in New Issue