diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index ff9ab080aa..b546facfaf 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -166,6 +166,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data, Functions* f) m_env.t.min = m_env.t.min.xxxxlh(); m_env.t.max = m_env.t.max.xxxxlh(); m_env.t.mask = m_env.t.mask.xxzz(); + m_env.t.invmask = ~m_env.t.mask; } // diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index cbe6fda70d..1b35ab577c 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -164,7 +164,7 @@ L("@@"); // esi = fzbr // edi = fzbc // ebp = za - //xmm2 = fd + // xmm2 = fd // xmm3 = fm // xmm4 = zm // xmm5 = rb @@ -768,17 +768,17 @@ void GSDrawScanlineCodeGenerator::SampleTexture() pcmpeqd(xmm1, xmm1); psrlw(xmm1, 15); paddw(xmm3, xmm1); - } - // uv0 = Wrap(uv0); - - Wrap(xmm2, xmm1); - - if(m_env.sel.ltf) - { + // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); - Wrap(xmm3, xmm1); + Wrap(xmm2, xmm3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(xmm2); } // xmm2 = uv0 @@ -983,6 +983,118 @@ void GSDrawScanlineCodeGenerator::SampleTexture() } } +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + if(m_env.sel.wms == m_env.sel.wmt) + { + if(m_env.sel.wms) + { + pmaxsw(uv, xmmword[&m_env.t.min]); + pminsw(uv, xmmword[&m_env.t.max]); + } + else + { + pand(uv, xmmword[&m_env.t.min]); + por(uv, xmmword[&m_env.t.max]); + } + } + else + { + movdqa(xmm1, uv); + + movdqa(xmm4, xmmword[&m_env.t.min]); + movdqa(xmm5, xmmword[&m_env.t.max]); + + // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); + + pmaxsw(uv, xmm4); + pminsw(uv, xmm5); + + // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; + + pand(xmm1, xmm4); + por(xmm1, xmm5); + + // clamp.blend8(repeat, m_env.t.mask); + + movdqa(xmm0, xmmword[&m_env.t.mask]); + blend8(uv, xmm1); + 
} +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + if(m_env.sel.wms == m_env.sel.wmt) + { + movdqa(xmm4, xmmword[&m_env.t.min]); + movdqa(xmm5, xmmword[&m_env.t.max]); + + if(m_env.sel.wms) + { + pmaxsw(uv0, xmm4); + pminsw(uv0, xmm5); + pmaxsw(uv1, xmm4); + pminsw(uv1, xmm5); + } + else + { + pand(uv0, xmm4); + por(uv0, xmm5); + pand(uv1, xmm4); + por(uv1, xmm5); + } + } + else + { + movdqa(xmm1, uv0); + movdqa(xmm6, uv1); + + movdqa(xmm4, xmmword[&m_env.t.min]); + movdqa(xmm5, xmmword[&m_env.t.max]); + + // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); + + pmaxsw(uv0, xmm4); + pminsw(uv0, xmm5); + pmaxsw(uv1, xmm4); + pminsw(uv1, xmm5); + + // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; + + pand(xmm1, xmm4); + por(xmm1, xmm5); + pand(xmm6, xmm4); + por(xmm6, xmm5); + + // clamp.blend8(repeat, m_env.t.mask); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, xmmword[&m_env.t.mask]); + + pblendvb(uv0, xmm1); + pblendvb(uv1, xmm6); + } + else + { + movdqa(xmm0, xmmword[&m_env.t.invmask]); + movdqa(xmm4, xmm0); + + pand(uv0, xmm0); + pandn(xmm0, xmm1); + por(uv0, xmm0); + + pand(uv1, xmm4); + pandn(xmm4, xmm6); + por(uv1, xmm4); + } + } +} + void GSDrawScanlineCodeGenerator::AlphaTFX() { if(!m_env.sel.fb) @@ -1288,9 +1400,32 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() // test |= ((fd [<< 16]) ^ m_env.datm).sra32(31); movdqa(xmm1, xmm2); - if(m_env.sel.fpsm == 2) pslld(xmm1, 16); - pxor(xmm1, xmmword[&m_env.datm]); - psrad(xmm1, 31); + + if(m_env.sel.datm) + { + if(m_env.sel.fpsm == 2) + { + pxor(xmm0, xmm0); + psrld(xmm1, 15); + pcmpeqd(xmm1, xmm0); + } + else + { + pcmpeqd(xmm0, xmm0); + pxor(xmm1, xmm0); + psrad(xmm1, 31); + } + } + else + { + if(m_env.sel.fpsm == 2) + { + pslld(xmm1, 16); + } + + psrad(xmm1, 31); + } + por(xmm7, xmm1); alltrue(xmm7, eax, "step"); @@ -1826,42 +1961,6 @@ void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& 
addr, con else pinsrd(dst, src, i); } -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv, const Xmm& temp) -{ - if(m_env.sel.wms == m_env.sel.wmt) - { - if(m_env.sel.wms) - { - pmaxsw(uv, xmmword[&m_env.t.min]); - pminsw(uv, xmmword[&m_env.t.max]); - } - else - { - pand(uv, xmmword[&m_env.t.min]); - por(uv, xmmword[&m_env.t.max]); - } - } - else - { - movdqa(temp, uv); - - // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); - - pmaxsw(uv, xmmword[&m_env.t.min]); - pminsw(uv, xmmword[&m_env.t.max]); - - // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; - - pand(temp, xmmword[&m_env.t.min]); - por(temp, xmmword[&m_env.t.max]); - - // clamp.blend8(repeat, m_env.t.mask); - - movdqa(xmm0, xmmword[&m_env.t.mask]); - blend8(uv, temp); - } -} - template<int shift> void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f) { @@ -1871,7 +1970,7 @@ void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f) } else { - pslld(a, shift + 1); + psllw(a, shift + 1); pmulhw(a, f); } } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 0bf01fd506..96b47250a8 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -43,6 +43,8 @@ class GSDrawScanlineCodeGenerator : public CodeGenerator void Step(); void TestZ(const Xmm& temp1, const Xmm& temp2); void SampleTexture(); + void Wrap(const Xmm& uv0); + void Wrap(const Xmm& uv0, const Xmm& uv1); void AlphaTFX(); void TestAlpha(); void ColorTFX(); @@ -57,8 +59,7 @@ class GSDrawScanlineCodeGenerator : public CodeGenerator void WritePixel(const Xmm& src, const Xmm& temp, const Reg32& addr, uint8 i, int psm); void ReadTexel(const Xmm& dst, const Xmm& addr, const Reg32& base, const Xmm& temp1, const Xmm& temp2); void ReadTexel(const Xmm& dst, const Xmm& addr, const Reg32& base, uint8 i); - void Wrap(const Xmm& uv, const Xmm& temp); - + template<int shift> void modulate16(const Xmm& a, const Operand& f); 
template<int shift> void lerp16(const Xmm& a, const Xmm& b, const Xmm& f); void mix16(const Xmm& a, const Xmm& b, const Xmm& temp); diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h index 521e0fce7b..da9cbf6cc3 100644 --- a/plugins/GSdx/GSRendererSW.h +++ b/plugins/GSdx/GSRendererSW.h @@ -446,6 +446,7 @@ protected: if(context->FRAME.PSM != PSM_PSMCT24) { p.sel.date = context->TEST.DATE; + p.sel.datm = context->TEST.DATM; } if(PRIM->ABE) diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index b88e7d1eb2..7bf7c1201c 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -55,9 +55,10 @@ union GSScanlineSelector DWORD ztest:1; // 35 DWORD wms:1; // 36 (0: repeat, 1: clamp) DWORD wmt:1; // 37 - DWORD colclamp:1; // 38 - DWORD fba:1; // 39 - DWORD dthe:1; // 40 + DWORD datm:1; // 38 + DWORD colclamp:1; // 39 + DWORD fba:1; // 40 + DWORD dthe:1; // 41 }; struct @@ -128,7 +129,7 @@ __declspec(align(16)) struct GSScanlineEnvironment GSVector4i* dimx; GSVector4i fm, zm; - struct {GSVector4i min, max, mask;} t; // [u] x 4 [v] x 4 + struct {GSVector4i min, max, mask, invmask;} t; // [u] x 4 [v] x 4 GSVector4i datm; GSVector4i colclamp; GSVector4i fba; diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index a160a36d0a..eebf06e3ac 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -327,7 +327,15 @@ public: GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const { + #if _M_SSE >= 0x401 + return GSVector4i(_mm_blendv_epi8(m, a, mask)); + + #else + + return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a))); + + #endif } #if _M_SSE >= 0x401 @@ -2310,7 +2318,15 @@ public: GSVector4 blend8(const GSVector4& a, const GSVector4& mask) const { + #if _M_SSE >= 0x401 + return GSVector4(_mm_blendv_ps(m, a, mask)); + + #else + + return GSVector4(_mm_or_ps(_mm_andnot_ps(mask, m), _mm_and_ps(mask, a))); + + #endif } GSVector4 upl(const 
GSVector4& a) const diff --git a/plugins/GSdx/sse.h b/plugins/GSdx/sse.h index b8af513a10..d261bfbff6 100644 --- a/plugins/GSdx/sse.h +++ b/plugins/GSdx/sse.h @@ -103,11 +103,6 @@ #else - // not an equal replacement for sse4's blend but for our needs it is ok - - #define _mm_blendv_ps(a, b, mask) _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b)) - #define _mm_blendv_epi8(a, b, mask) _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b)) - __forceinline __m128 _mm_round_ps(__m128 x) { __m128 t = _mm_or_ps(_mm_and_ps(ps_80000000, x), ps_4b000000);