GSdx: When mipmapping is on, LOD is calculated per pixel; it isn't used for anything, but it's there. I cannot really measure any significant slowdown, but the rest of the fun is yet to come.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4428 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-03-14 03:32:28 +00:00
parent 5e9930a9dc
commit 1d759c852d
18 changed files with 405 additions and 138 deletions

View File

@ -34,6 +34,14 @@ const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
GSVector4i::zero(),
};
// Constants for the log2(Q) approximation that SampleTexture() emits when
// mipmapping is enabled and LCM is off:
//   [0], [1], [2] - polynomial coefficients c0, c1, c2 in
//                   log2(Q) = ((c0 * m + c1) * m + c2) * (m - 1.0f) + e
//                   (m = mantissa of Q rebuilt as a float, e = unbiased exponent)
//   [3]           - 1.0f; OR-ed into the extracted mantissa bits to form m and
//                   subtracted again to obtain (m - 1.0f)
const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
{
GSVector4(0.204446009836232697516f),
GSVector4(-1.04913055217340124191f),
GSVector4(2.28330284476918490682f),
GSVector4(1.0f),
};
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)

View File

@ -31,6 +31,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void operator = (const GSDrawScanlineCodeGenerator&);
static const GSVector4i m_test[8];
static const GSVector4 m_log2_coef[4];
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;

View File

@ -256,7 +256,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// edx = &m_local.d[skip]
shl(rdx, 4);
shl(rdx, 3);
lea(rdx, ptr[rdx + r11 + offsetof(GSScanlineLocalData, d)]);
}
@ -317,17 +317,17 @@ void GSDrawScanlineCodeGenerator::Init()
vcvttps2dq(xmm0, xmm0);
// si = vti.xxxx() + m_local.d[skip].si;
// ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti;
// s = vti.xxxx() + m_local.d[skip].s;
// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(xmm10, ptr[rdx + 16 * 7]);
vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
if(!m_sel.sprite)
{
vpaddd(xmm11, ptr[rdx + 16 * 8]);
vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
}
else
{
@ -338,6 +338,12 @@ void GSDrawScanlineCodeGenerator::Init()
vpsrlw(xmm6, 1);
}
}
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]);
}
}
else
{
@ -349,9 +355,9 @@ void GSDrawScanlineCodeGenerator::Init()
vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm10, ptr[rdx + 16 * 1]);
vaddps(xmm11, ptr[rdx + 16 * 2]);
vaddps(xmm12, ptr[rdx + 16 * 3]);
vaddps(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
vaddps(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]);
}
}
@ -374,8 +380,8 @@ void GSDrawScanlineCodeGenerator::Init()
vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vpaddw(xmm13, ptr[rdx + 16 * 4]);
vpaddw(xmm14, ptr[rdx + 16 * 5]);
vpaddw(xmm13, ptr[rdx + offsetof(GSScanlineLocalData::skip, rb)]);
vpaddw(xmm14, ptr[rdx + offsetof(GSScanlineLocalData::skip, ga)]);
}
else
{
@ -430,7 +436,7 @@ void GSDrawScanlineCodeGenerator::Step()
// si += st.xxxx();
// if(!sprite) ti += st.yyyy();
vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.st)]);
vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]);
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpaddd(xmm10, xmm1);
@ -440,6 +446,12 @@ void GSDrawScanlineCodeGenerator::Step()
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(xmm11, xmm1);
}
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm12, xmm3);
}
}
else
{
@ -617,6 +629,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// ebx = tex
if(m_sel.mipmap && !m_sel.lcm)
{
}
if(!m_sel.fst)
{
vrcpps(xmm0, xmm12);

View File

@ -52,8 +52,9 @@ L("loop");
// esi = fzbr
// edi = fzbc
// xmm0 = z/zi
// xmm2 = u (tme)
// xmm3 = v (tme)
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
// xmm4 = q (tme)
// xmm5 = rb (!tme)
// xmm6 = ga (!tme)
// xmm7 = test
@ -66,8 +67,9 @@ L("loop");
// esi = fzbr
// edi = fzbc
// - xmm0
// xmm2 = u (tme)
// xmm3 = v (tme)
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
// xmm4 = q (tme)
// xmm5 = rb (!tme)
// xmm6 = ga (!tme)
// xmm7 = test
@ -284,7 +286,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// edx = &m_local.d[skip]
shl(edx, 4);
shl(edx, 3);
lea(edx, ptr[edx + (size_t)m_local.d]);
// ebx = &v
@ -305,7 +307,7 @@ void GSDrawScanlineCodeGenerator::Init()
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpaddw(xmm1, ptr[edx + 16 * 6]);
vpaddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
vmovdqa(ptr[&m_local.temp.f], xmm1);
}
@ -315,7 +317,7 @@ void GSDrawScanlineCodeGenerator::Init()
// z = vp.zzzz() + m_local.d[skip].z;
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm0, ptr[edx]);
vaddps(xmm0, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
vmovaps(ptr[&m_local.temp.z], xmm0);
}
@ -351,34 +353,41 @@ void GSDrawScanlineCodeGenerator::Init()
{
// GSVector4i vti(vt);
vcvttps2dq(xmm4, xmm4);
vcvttps2dq(xmm6, xmm4);
// si = vti.xxxx() + m_local.d[skip].si;
// ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti;
// s = vti.xxxx() + m_local.d[skip].s;
// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
vpshufd(xmm2, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(xmm2, ptr[edx + 16 * 7]);
vpaddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
if(!m_sel.sprite)
{
vpaddd(xmm3, ptr[edx + 16 * 8]);
vpaddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
}
else
{
if(m_sel.ltf)
{
vpshuflw(xmm4, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm4, 1);
vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm6, 1);
vmovdqa(ptr[&m_local.temp.vf], xmm4);
vmovdqa(ptr[&m_local.temp.vf], xmm6);
}
}
vmovdqa(ptr[&m_local.temp.s], xmm2);
vmovdqa(ptr[&m_local.temp.t], xmm3);
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
vmovaps(ptr[&m_local.temp.q], xmm4);
}
}
else
{
@ -390,17 +399,13 @@ void GSDrawScanlineCodeGenerator::Init()
vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm2, ptr[edx + 16 * 1]);
vaddps(xmm3, ptr[edx + 16 * 2]);
vaddps(xmm4, ptr[edx + 16 * 3]);
vaddps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
vaddps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
vaddps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
vmovaps(ptr[&m_local.temp.s], xmm2);
vmovaps(ptr[&m_local.temp.t], xmm3);
vmovaps(ptr[&m_local.temp.q], xmm4);
vrcpps(xmm4, xmm4);
vmulps(xmm2, xmm4);
vmulps(xmm3, xmm4);
}
}
@ -423,8 +428,8 @@ void GSDrawScanlineCodeGenerator::Init()
vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
vpaddw(xmm5, ptr[edx + 16 * 4]);
vpaddw(xmm6, ptr[edx + 16 * 5]);
vpaddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
vpaddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
vmovdqa(ptr[&m_local.temp.rb], xmm5);
vmovdqa(ptr[&m_local.temp.ga], xmm6);
@ -485,12 +490,12 @@ void GSDrawScanlineCodeGenerator::Step()
{
if(m_sel.fst)
{
// GSVector4i st = m_local.d4.st;
// GSVector4i stq = m_local.d4.stq;
// si += st.xxxx();
// if(!sprite) ti += st.yyyy();
// s += stq.xxxx();
// if(!sprite) t += stq.yyyy();
vmovdqa(xmm4, ptr[&m_local.d4.st]);
vmovdqa(xmm4, ptr[&m_local.d4.stq]);
vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
vpaddd(xmm2, ptr[&m_local.temp.s]);
@ -506,6 +511,13 @@ void GSDrawScanlineCodeGenerator::Step()
{
vmovdqa(xmm3, ptr[&m_local.temp.t]);
}
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm4, ptr[&m_local.temp.q]);
vmovaps(ptr[&m_local.temp.q], xmm4);
}
}
else
{
@ -528,10 +540,6 @@ void GSDrawScanlineCodeGenerator::Step()
vmovaps(ptr[&m_local.temp.s], xmm2);
vmovaps(ptr[&m_local.temp.t], xmm3);
vmovaps(ptr[&m_local.temp.q], xmm4);
vrcpps(xmm4, xmm4);
vmulps(xmm2, xmm4);
vmulps(xmm3, xmm4);
}
}
@ -648,16 +656,14 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
// GSVector4i o = GSVector4i::x80000000();
vpcmpeqd(xmm4, xmm4);
vpslld(xmm4, 31);
vpcmpeqd(temp1, temp1);
vpslld(temp1, 31);
// GSVector4i zso = zs - o;
vpsubd(xmm0, xmm4);
// GSVector4i zdo = zd - o;
vpsubd(xmm1, xmm4);
vpsubd(xmm0, temp1);
vpsubd(xmm1, temp1);
}
switch(m_sel.ztst)
@ -671,8 +677,8 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
// test |= zso <= zdo; // ~(zso > zdo)
vpcmpgtd(xmm0, xmm1);
vpcmpeqd(xmm4, xmm4);
vpxor(xmm0, xmm4);
vpcmpeqd(temp1, temp1);
vpxor(xmm0, temp1);
vpor(xmm7, xmm0);
break;
}
@ -694,11 +700,66 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
if(!m_sel.fst)
{
// TODO: move these into Init/Step too?
vrcpps(xmm0, xmm4);
vmulps(xmm2, xmm0);
vmulps(xmm3, xmm0);
vcvttps2dq(xmm2, xmm2);
vcvttps2dq(xmm3, xmm3);
}
if(m_sel.mipmap)
{
// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (but can we round it?)
if(!m_sel.lcm)
{
// lod = -log2(Q) * (1 << L) + K
vpcmpeqd(xmm1, xmm1);
vpsrld(xmm1, xmm1, 25);
vpslld(xmm0, xmm4, 1);
vpsrld(xmm0, xmm0, 24);
vpsubd(xmm0, xmm1);
vcvtdq2ps(xmm0, xmm0);
// xmm0 = (float)(exp(e) - 127)
vpslld(xmm4, xmm4, 9);
vpsrld(xmm4, xmm4, 9);
vorps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
// xmm4 = mant(q) | 1.0f
vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
vmulps(xmm5, xmm4);
vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
vmulps(xmm4, xmm5);
vaddps(xmm4, xmm0);
// xmm4 = log2(Q) = (((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0)
vmulps(xmm4, ptr[&m_local.gd->l]);
vaddps(xmm4, ptr[&m_local.gd->k]);
// xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000
vcvtps2dq(xmm4, xmm4);
vmovdqa(ptr[&m_local.temp.lod], xmm4);
}
else
{
// lod = K (=> use m_local->gd.k later when lod is needed)
}
}
// TODO: if(m_sel.mipmap) ...
if(!m_sel.fst)
{
if(m_sel.ltf)
{
// u -= 0x8000;

View File

@ -49,8 +49,9 @@ L("loop");
// esi = fzbr
// edi = fzbc
// xmm0 = z/zi
// xmm2 = u (tme)
// xmm3 = v (tme)
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
// xmm4 = q (tme)
// xmm5 = rb (!tme)
// xmm6 = ga (!tme)
// xmm7 = test
@ -63,8 +64,9 @@ L("loop");
// esi = fzbr
// edi = fzbc
// - xmm0
// xmm2 = u (tme)
// xmm3 = v (tme)
// xmm2 = s/u (tme)
// xmm3 = t/v (tme)
// xmm4 = q (tme)
// xmm5 = rb (!tme)
// xmm6 = ga (!tme)
// xmm7 = test
@ -281,7 +283,7 @@ void GSDrawScanlineCodeGenerator::Init()
{
// edx = &m_local.d[skip]
shl(edx, 4);
shl(edx, 3);
lea(edx, ptr[edx + (size_t)m_local.d]);
// ebx = &v
@ -302,7 +304,7 @@ void GSDrawScanlineCodeGenerator::Init()
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
paddw(xmm1, ptr[edx + 16 * 6]);
paddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
movdqa(ptr[&m_local.temp.f], xmm1);
}
@ -312,7 +314,7 @@ void GSDrawScanlineCodeGenerator::Init()
// z = vp.zzzz() + m_local.d[skip].z;
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
addps(xmm0, ptr[edx]);
addps(xmm0, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
movaps(ptr[&m_local.temp.z], xmm0);
}
@ -348,34 +350,41 @@ void GSDrawScanlineCodeGenerator::Init()
{
// GSVector4i vti(vt);
cvttps2dq(xmm4, xmm4);
cvttps2dq(xmm6, xmm4);
// si = vti.xxxx() + m_local.d[skip].si;
// ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti;
// s = vti.xxxx() + m_local.d[skip].s;
// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
pshufd(xmm2, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
paddd(xmm2, ptr[edx + 16 * 7]);
paddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
if(!m_sel.sprite)
{
paddd(xmm3, ptr[edx + 16 * 8]);
paddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
}
else
{
if(m_sel.ltf)
{
movdqa(xmm4, xmm3);
pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm4, 1);
movdqa(ptr[&m_local.temp.vf], xmm4);
movdqa(xmm6, xmm3);
pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm6, 1);
movdqa(ptr[&m_local.temp.vf], xmm6);
}
}
movdqa(ptr[&m_local.temp.s], xmm2);
movdqa(ptr[&m_local.temp.t], xmm3);
if(m_sel.mipmap && !m_sel.lcm)
{
shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
addps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
movaps(ptr[&m_local.temp.q], xmm4);
}
}
else
{
@ -390,17 +399,13 @@ void GSDrawScanlineCodeGenerator::Init()
shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
addps(xmm2, ptr[edx + 16 * 1]);
addps(xmm3, ptr[edx + 16 * 2]);
addps(xmm4, ptr[edx + 16 * 3]);
addps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
addps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
addps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
movaps(ptr[&m_local.temp.s], xmm2);
movaps(ptr[&m_local.temp.t], xmm3);
movaps(ptr[&m_local.temp.q], xmm4);
rcpps(xmm4, xmm4);
mulps(xmm2, xmm4);
mulps(xmm3, xmm4);
}
}
@ -423,8 +428,8 @@ void GSDrawScanlineCodeGenerator::Init()
pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
paddw(xmm5, ptr[edx + 16 * 4]);
paddw(xmm6, ptr[edx + 16 * 5]);
paddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
paddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
movdqa(ptr[&m_local.temp.rb], xmm5);
movdqa(ptr[&m_local.temp.ga], xmm6);
@ -485,12 +490,12 @@ void GSDrawScanlineCodeGenerator::Step()
{
if(m_sel.fst)
{
// GSVector4i st = m_local.d4.st;
// GSVector4i st = m_local.d4.stq;
// si += st.xxxx();
// if(!sprite) ti += st.yyyy();
movdqa(xmm4, ptr[&m_local.d4.st]);
movdqa(xmm4, ptr[&m_local.d4.stq]);
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
paddd(xmm2, ptr[&m_local.temp.s]);
@ -506,6 +511,13 @@ void GSDrawScanlineCodeGenerator::Step()
{
movdqa(xmm3, ptr[&m_local.temp.t]);
}
if(m_sel.mipmap && !m_sel.lcm)
{
shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
addps(xmm4, ptr[&m_local.temp.q]);
movaps(ptr[&m_local.temp.q], xmm4);
}
}
else
{
@ -515,9 +527,9 @@ void GSDrawScanlineCodeGenerator::Step()
// t += stq.yyyy();
// q += stq.zzzz();
movaps(xmm2, ptr[&m_local.d4.stq]);
movaps(xmm3, xmm2);
movaps(xmm4, xmm2);
movaps(xmm4, ptr[&m_local.d4.stq]);
movaps(xmm2, xmm4);
movaps(xmm3, xmm4);
shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
@ -530,10 +542,6 @@ void GSDrawScanlineCodeGenerator::Step()
movaps(ptr[&m_local.temp.s], xmm2);
movaps(ptr[&m_local.temp.t], xmm3);
movaps(ptr[&m_local.temp.q], xmm4);
rcpps(xmm4, xmm4);
mulps(xmm2, xmm4);
mulps(xmm3, xmm4);
}
}
@ -650,16 +658,14 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
// GSVector4i o = GSVector4i::x80000000();
pcmpeqd(xmm4, xmm4);
pslld(xmm4, 31);
pcmpeqd(temp1, temp1);
pslld(temp1, 31);
// GSVector4i zso = zs - o;
psubd(xmm0, xmm4);
// GSVector4i zdo = zd - o;
psubd(xmm1, xmm4);
psubd(xmm0, temp1);
psubd(xmm1, temp1);
}
switch(m_sel.ztst)
@ -673,8 +679,8 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
// test |= zso <= zdo; // ~(zso > zdo)
pcmpgtd(xmm0, xmm1);
pcmpeqd(xmm4, xmm4);
pxor(xmm0, xmm4);
pcmpeqd(temp1, temp1);
pxor(xmm0, temp1);
por(xmm7, xmm0);
break;
}
@ -696,11 +702,66 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
if(!m_sel.fst)
{
// TODO: move these into Init/Step too?
rcpps(xmm4, xmm4);
mulps(xmm2, xmm4);
mulps(xmm3, xmm4);
cvttps2dq(xmm2, xmm2);
cvttps2dq(xmm3, xmm3);
}
if(m_sel.mipmap)
{
// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (but can we round it?)
if(!m_sel.lcm)
{
// lod = -log2(Q) * (1 << L) + K
vpcmpeqd(xmm1, xmm1);
vpsrld(xmm1, xmm1, 25);
vpslld(xmm0, xmm4, 1);
vpsrld(xmm0, xmm0, 24);
vpsubd(xmm0, xmm1);
vcvtdq2ps(xmm0, xmm0);
// xmm0 = (float)(exp(e) - 127)
vpslld(xmm4, xmm4, 9);
vpsrld(xmm4, xmm4, 9);
vorps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
// xmm4 = mant(q) | 1.0f
vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
vmulps(xmm5, xmm4);
vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
vmulps(xmm4, xmm5);
vaddps(xmm4, xmm0);
// xmm4 = log2(Q) = (((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0)
vmulps(xmm4, ptr[&m_local.gd->l]);
vaddps(xmm4, ptr[&m_local.gd->k]);
// xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000
vcvtps2dq(xmm4, xmm4);
vmovdqa(ptr[&m_local.temp.lod], xmm4);
}
else
{
// lod = K (=> use m_local->gd.k later when lod is needed)
}
}
// TODO: if(m_sel.mipmap) ...
if(!m_sel.fst)
{
if(m_sel.ltf)
{
// u -= 0x8000;

View File

@ -375,6 +375,14 @@ void GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
{
gd.sel.mipmap = 1; // TODO: pass mmin here and store mxl to m_global for clamping the lod
gd.sel.lcm = context->TEX1.LCM;
gd.l = GSVector4((float)(-0x10000 << context->TEX1.L));
gd.k = GSVector4((float)(0x1000 * context->TEX1.K));
// the rest is fake, should be removed later
int level = (int)(m_vt.m_lod.x + 0.5f);
level = std::min<int>(level, context->TEX1.MXL);
@ -691,21 +699,17 @@ void GSRendererSW::VertexKick(bool skip)
if(tme)
{
float q;
if(fst)
{
v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
q = 1.0f;
}
else
{
v.t = GSVector4(m_v.ST.S, m_v.ST.T);
v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH);
q = m_v.RGBAQ.Q;
}
v.t = v.t.xyxy(GSVector4::load(q));
v.t = v.t.xyxy(GSVector4::load(m_v.RGBAQ.Q));
}
GSVertexSW& dst = m_vl.AddTail();

View File

@ -65,6 +65,8 @@ union GSScanlineSelector
uint32 edge:1; // 47
uint32 tw:3; // 48 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 mipmap:1; // 49
uint32 lcm:1; // 50
};
struct
@ -127,18 +129,20 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
GSVector4i aref;
GSVector4i afix;
GSVector4i frb, fga;
GSVector4 k; // TEX1.K * 0x10000
GSVector4 l; // TEX1.L * -0x10000
};
__aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own
{
const GSScanlineGlobalData* gd;
struct {GSVector4 z, s, t, q; GSVector4i rb, ga, f, si, ti, _pad[7];} d[4];
struct {GSVector4 z, stq; GSVector4i c, f, st;} d4;
struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4];
struct step {GSVector4 z, stq; GSVector4i c, f;} d4;
struct {GSVector4i rb, ga;} c;
struct {GSVector4i z, f;} p;
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
struct {GSVector4i z, f, s, t, q, rb, ga, zs, zd, uf, vf, cov;} temp;
struct {GSVector4i z, f, s, t, q, rb, ga, zs, zd, uf, vf, cov, lod;} temp;
};

View File

@ -188,8 +188,14 @@ void GSSetupPrimCodeGenerator::Texture()
{
// m_local.d4.st = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
vmovhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1);
}
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
vmovq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
else
{
@ -198,7 +204,7 @@ void GSSetupPrimCodeGenerator::Texture()
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
@ -212,16 +218,16 @@ void GSSetupPrimCodeGenerator::Texture()
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm))
{
// m_local.d[i].si/ti = GSVector4i(v);
// m_local.d[i].s/t = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
}
}
else

View File

@ -193,8 +193,14 @@ void GSSetupPrimCodeGenerator::Texture()
{
// m_local.d4.st = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
movhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1);
}
cvttps2dq(xmm1, xmm1);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
movq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
else
{
@ -203,7 +209,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
@ -219,9 +225,9 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
if(m_sel.fst)
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm))
{
// m_local.d[i].si/ti = GSVector4i(v);
// m_local.d[i].s/t = GSVector4i(v);
cvttps2dq(xmm2, xmm2);

View File

@ -172,8 +172,14 @@ void GSSetupPrimCodeGenerator::Texture()
{
// m_local.d4.st = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
vmovhps(ptr[&m_local.d4.stq.z], xmm1);
}
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_local.d4.st], xmm1);
vmovq(ptr[&m_local.d4.stq], xmm1);
}
else
{
@ -182,7 +188,7 @@ void GSSetupPrimCodeGenerator::Texture()
vmovaps(ptr[&m_local.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
@ -196,16 +202,16 @@ void GSSetupPrimCodeGenerator::Texture()
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm))
{
// m_local.d[i].si/ti = GSVector4i(v);
// m_local.d[i].s/t = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
case 0: vmovdqa(ptr[&m_local.d[i].s], xmm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].t], xmm2); break;
}
}
else

View File

@ -178,8 +178,14 @@ void GSSetupPrimCodeGenerator::Texture()
{
// m_local.d4.st = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
movhps(ptr[&m_local.d4.stq.z], xmm1);
}
cvttps2dq(xmm1, xmm1);
movdqa(ptr[&m_local.d4.st], xmm1);
movq(ptr[&m_local.d4.stq], xmm1);
}
else
{
@ -188,7 +194,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(ptr[&m_local.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
@ -204,16 +210,16 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
if(m_sel.fst)
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm))
{
// m_local.d[i].si/ti = GSVector4i(v);
// m_local.d[i].s/t = GSVector4i(v);
cvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
case 0: movdqa(ptr[&m_local.d[i].s], xmm2); break;
case 1: movdqa(ptr[&m_local.d[i].t], xmm2); break;
}
}
else

View File

@ -24,7 +24,8 @@
const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_half(0.5f, 0.5f, 0.5f, 0.5f);
const GSVector4 GSVector4::m_half(0.5f);
const GSVector4 GSVector4::m_one(1.0f);
const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));

View File

@ -1,3 +1,23 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
@ -2309,6 +2329,7 @@ public:
static const GSVector4 m_ps0123;
static const GSVector4 m_ps4567;
static const GSVector4 m_half;
static const GSVector4 m_one;
static const GSVector4 m_x3f800000;
static const GSVector4 m_x4b000000;
@ -2479,14 +2500,12 @@ public:
{
// NOTE: sign bit ignored, safe to pass negative numbers
GSVector4i exp = GSVector4i::xff000000() >> 1;
GSVector4i mant = GSVector4i::x007fffff();
GSVector4 one(1.0f);
GSVector4i i = GSVector4i::cast(*this);
GSVector4 e = GSVector4(((i & exp) >> 23) - GSVector4i::x0000007f());
GSVector4 m = GSVector4::cast(i & mant) | one;
GSVector4 e = GSVector4(((i << 1) >> 24) - GSVector4i::x0000007f());
GSVector4 m = GSVector4::cast((i << 9) >> 9) | one;
GSVector4 p;

View File

@ -70,7 +70,7 @@ void GSVertexTrace::UpdateLOD()
{
// LOD = log2(1/|Q|) * (1 << L) + K
GSVector4::storel(&m_lod, m_max.t.uph(m_min.t).log2(2).neg() * (float)(1 << TEX1.L) + K);
GSVector4::storel(&m_lod, m_max.t.uph(m_min.t).log2(3).neg() * (float)(1 << TEX1.L) + K);
if(m_lod.x > m_lod.y) {float tmp = m_lod.x; m_lod.x = m_lod.y; m_lod.y = tmp;}
}

View File

@ -612,7 +612,11 @@
<ClCompile Include="GSTextureSW.cpp" />
<ClCompile Include="GSThread.cpp" />
<ClCompile Include="GSUtil.cpp" />
<ClCompile Include="GSVector.cpp" />
<ClCompile Include="GSVector.cpp">
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">AssemblyAndSourceCode</AssemblerOutput>
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">AssemblyAndSourceCode</AssemblerOutput>
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">AssemblyAndSourceCode</AssemblerOutput>
</ClCompile>
<ClCompile Include="GSVertexList.cpp" />
<ClCompile Include="GSVertexSW.cpp" />
<ClCompile Include="GSVertexTrace.cpp" />

View File

@ -1,3 +1,24 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
//{{NO_DEPENDENCIES}}
// Microsoft Visual C++ generated include file.
// Used by GSdx.rc

View File

@ -1,3 +1,24 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
// stdafx.cpp : source file that includes just the standard includes
// GSdx.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information

View File

@ -1,3 +1,24 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently