diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanline.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanline.cpp index 11ffae3aa5..ec0f578075 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanline.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanline.cpp @@ -587,6 +587,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS zdo -= GSVector8i::x80000000(); } + if (sel.zclamp) + zso = zso.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8)); + switch(sel.ztst) { case ZTST_GEQUAL: test |= zso < zdo; break; @@ -1219,6 +1222,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS zs = zs.blend8(zd, zm); } + if (sel.zclamp) + zs = zs.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8)); + bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; if(sel.notest) @@ -1696,6 +1702,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS zdo -= GSVector4i::x80000000(); } + if (sel.zclamp) + { + const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8); + + zso.u32[0] = std::min(z_max, zso.u32[0]); + zso.u32[1] = std::min(z_max, zso.u32[1]); + zso.u32[2] = std::min(z_max, zso.u32[2]); + zso.u32[3] = std::min(z_max, zso.u32[3]); + } + switch(sel.ztst) { case ZTST_GEQUAL: test |= zso < zdo; break; @@ -2333,6 +2349,16 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS zs = zs.blend8(zd, zm); } + if (sel.zclamp) + { + const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8); + + zs.u32[0] = std::min(z_max, zs.u32[0]); + zs.u32[1] = std::min(z_max, zs.u32[1]); + zs.u32[2] = std::min(z_max, zs.u32[2]); + zs.u32[3] = std::min(z_max, zs.u32[3]); + } + bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; if(sel.notest) diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp index aef6a8736e..1814776868 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -635,6 +635,14 @@ void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) vcvttps2dq(xmm0, xmm0); } + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { + vpcmpeqd(temp1, temp1); + vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); + vpminsd(xmm0, temp1); + } + if(m_sel.zwrite) { vmovdqa(ptr[&m_local.temp.zs], xmm0); @@ -2378,6 +2386,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); } + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { + vpcmpeqd(xmm7, xmm7); + vpsrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); + vpminsd(xmm1, xmm7); + } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1); diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp index 7d24272faa..1797036c09 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp @@ -637,6 +637,14 @@ void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2) vcvttps2dq(ymm0, ymm0); } + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { + vpcmpeqd(temp1, temp1); + vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); + vpminsd(ymm0, temp1); + } + if(m_sel.zwrite) { vmovdqa(ptr[&m_local.temp.zs], ymm0); @@ -2374,6 +2382,14 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() vpbroadcastd(ymm1, ptr[&m_local.p.z]); } + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { + vpcmpeqd(ymm7, ymm7); + vpsrld(ymm7, (uint8)((m_sel.zpsm & 0x3) * 8)); + vpminsd(ymm1, ymm7); + } + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp index 02d64d112e..78711e48a8 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp @@ -640,6 +640,26 @@ void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) cvttps2dq(xmm0, xmm0); } + + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { +#if _M_SSE >= 0x401 + pcmpeqd(temp1, temp1); + psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); + pminsd(xmm0, temp1); +#else + pcmpeqd(temp1, temp1); + psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); + pcmpgtd(temp1, xmm0); + pand(xmm0, temp1); + pcmpeqd(temp2, temp2); + pxor(temp1, temp2); + psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); + por(xmm0, temp1); +#endif + } + if(m_sel.zwrite) { movdqa(ptr[&m_local.temp.zs], xmm0); @@ -2412,6 +2432,25 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() blend8(xmm1, xmm7); } + // Clamp Z to ZPSM_FMT_MAX + if (m_sel.zclamp) + { +#if _M_SSE >= 0x401 + pcmpeqd(xmm7, xmm7); + psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); + pminsd(xmm1, xmm7); +#else + static GSVector4i all_1s = GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + pcmpeqd(xmm7, xmm7); + psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); + pcmpgtd(xmm7, xmm1); + pand(xmm1, xmm7); + pxor(xmm7, ptr[&all_1s]); + psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); + por(xmm1, xmm7); +#endif + } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1); diff --git a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp index 07daac3c9e..d7cd5d936f 100644 --- a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp +++ b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp @@ -287,6 +287,16 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* GSVector4i off = (GSVector4i)m_context->XYOFFSET; GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0); + #if _M_SSE >= 0x401 + + GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8); + + #else + + uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8); + + #endif + for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++) { GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q @@ -351,10 +361,12 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* { #if _M_SSE >= 0x401 + xyzuvf = xyzuvf.min_u32(z_max); t = t.insert32<1, 3>(GSVector4::cast(xyzuvf)); #else + z = std::min(z, z_max); t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z))); #endif @@ -1326,9 +1338,12 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) if(zwrite || ztest) { + uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[context->ZBUF.PSM].fmt * 8); + gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; gd.sel.zoverflow = (uint32)GSVector4i(m_vt.m_max.p).z == 0x80000000U; + gd.sel.zclamp = (uint32)GSVector4i(m_vt.m_max.p).z > z_max; } #if _M_SSE >= 0x501 diff --git a/plugins/GSdx/Renderers/SW/GSScanlineEnvironment.h b/plugins/GSdx/Renderers/SW/GSScanlineEnvironment.h index f034ea565c..5c68bc7365 100644 --- a/plugins/GSdx/Renderers/SW/GSScanlineEnvironment.h +++ b/plugins/GSdx/Renderers/SW/GSScanlineEnvironment.h @@ -55,19 +55,20 @@ union GSScanlineSelector uint32 zwrite:1; // 35 uint32 ztest:1; // 36 uint32 zoverflow:1; // 37 (z max >= 0x80000000) - uint32 wms:2; // 38 - uint32 wmt:2; // 40 - uint32 datm:1; // 42 - uint32 colclamp:1; // 43 - uint32 fba:1; // 44 - uint32 dthe:1; // 45 - uint32 prim:2; // 46 + uint32 zclamp:1; // 38 + uint32 wms:2; // 39 + uint32 wmt:2; // 41 + uint32 datm:1; // 43 + uint32 colclamp:1; // 44 + uint32 fba:1; // 45 + uint32 dthe:1; // 46 + uint32 prim:2; // 47 - uint32 edge:1; // 48 - uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) - uint32 lcm:1; // 52 - uint32 mmin:2; // 53 - uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) + uint32 edge:1; // 49 + uint32 tw:3; // 50 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) + uint32 lcm:1; // 53 + uint32 mmin:2; // 54 + uint32 notest:1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) // TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction uint32 breakpoint:1; // Insert a trap to stop the program, helpful to stop debugger on a program