pcsx2/plugins/GSdx_legacy/GPUSetupPrimCodeGenerator.cpp

229 lines
5.4 KiB
C++

/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
// TODO: x64
#include "stdafx.h"
#include "GPUSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
using namespace Xbyak;
static const int _args = 0;
static const int _vertex = _args + 4;
static const int _index = _args + 8;
static const int _dscan = _args + 12;
GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GPUScanlineLocalData*)param)
{
m_sel.key = key;
Generate();
}
void GPUSetupPrimCodeGenerator::Generate()
{
if(m_sel.tme && !m_sel.twin)
{
pcmpeqd(xmm0, xmm0);
if(m_sel.sprite)
{
// t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001();
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
cvttps2dq(xmm1, ptr[ecx + offsetof(GSVertexSW, t)]);
psrld(xmm1, 8);
psrld(xmm0, 31);
psubd(xmm1, xmm0);
// t = t.ps32(t);
// t = t.upl16(t);
packssdw(xmm1, xmm1);
punpcklwd(xmm1, xmm1);
// m_local.twin[2].u = t.xxxx();
// m_local.twin[2].v = t.yyyy();
pshufd(xmm2, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm3, xmm1, _MM_SHUFFLE(1, 1, 1, 1));
movdqa(ptr[&m_local.twin[2].u], xmm2);
movdqa(ptr[&m_local.twin[2].v], xmm3);
}
else
{
// TODO: not really needed
// m_local.twin[2].u = GSVector4i::x00ff();
// m_local.twin[2].v = GSVector4i::x00ff();
psrlw(xmm0, 8);
movdqa(ptr[&m_local.twin[2].u], xmm0);
movdqa(ptr[&m_local.twin[2].v], xmm0);
}
}
if(m_sel.tme || m_sel.iip && m_sel.tfx != 3)
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < 3; i++)
{
movaps(Xmm(5 + i), ptr[&m_shift[i]]);
}
// GSVector4 dt = dscan.t;
// GSVector4 dc = dscan.c;
movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]);
movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]);
// GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f));
movaps(xmm1, xmm3);
mulps(xmm1, xmm5);
cvttps2dq(xmm1, xmm1);
movaps(xmm2, xmm4);
mulps(xmm2, xmm5);
cvttps2dq(xmm2, xmm2);
packssdw(xmm1, xmm2);
if(m_sel.tme)
{
// m_local.d8.st = dtc8.upl16(dtc8);
movdqa(xmm0, xmm1);
punpcklwd(xmm0, xmm0);
movdqa(ptr[&m_local.d8.st], xmm0);
}
if(m_sel.iip && m_sel.tfx != 3)
{
// m_local.d8.c = dtc8.uph16(dtc8);
punpckhwd(xmm1, xmm1);
movdqa(ptr[&m_local.d8.c], xmm1);
}
// xmm3 = dt
// xmm4 = dc
// xmm6 = ps0123
// xmm7 = ps4567
// xmm0, xmm1, xmm2, xmm5 = free
if(m_sel.tme)
{
// GSVector4 dtx = dt.xxxx();
// GSVector4 dty = dt.yyyy();
movaps(xmm0, xmm3);
shufps(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
// m_local.d.s = GSVector4i(dtx * ps0123).ps32(GSVector4i(dtx * ps4567));
movaps(xmm1, xmm3);
mulps(xmm3, xmm6);
mulps(xmm1, xmm7);
cvttps2dq(xmm3, xmm3);
cvttps2dq(xmm1, xmm1);
packssdw(xmm3, xmm1);
movdqa(ptr[&m_local.d.s], xmm3);
// m_local.d.t = GSVector4i(dty * ps0123).ps32(GSVector4i(dty * ps4567));
movaps(xmm1, xmm0);
mulps(xmm0, xmm6);
mulps(xmm1, xmm7);
cvttps2dq(xmm0, xmm0);
cvttps2dq(xmm1, xmm1);
packssdw(xmm0, xmm1);
movdqa(ptr[&m_local.d.t], xmm0);
}
// xmm4 = dc
// xmm6 = ps0123
// xmm7 = ps4567
// xmm0, xmm1, zmm2, xmm3, xmm5 = free
if(m_sel.iip && m_sel.tfx != 3)
{
// GSVector4 dcx = dc.xxxx();
// GSVector4 dcy = dc.yyyy();
// GSVector4 dcz = dc.zzzz();
movaps(xmm0, xmm4);
movaps(xmm1, xmm4);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d.r = GSVector4i(dcx * ps0123).ps32(GSVector4i(dcx * ps4567));
movaps(xmm2, xmm4);
mulps(xmm4, xmm6);
mulps(xmm2, xmm7);
cvttps2dq(xmm4, xmm4);
cvttps2dq(xmm2, xmm2);
packssdw(xmm4, xmm2);
movdqa(ptr[&m_local.d.r], xmm4);
// m_local.d.g = GSVector4i(dcy * ps0123).ps32(GSVector4i(dcy * ps4567));
movaps(xmm2, xmm0);
mulps(xmm0, xmm6);
mulps(xmm2, xmm7);
cvttps2dq(xmm0, xmm0);
cvttps2dq(xmm2, xmm2);
packssdw(xmm0, xmm2);
movdqa(ptr[&m_local.d.g], xmm0);
// m_local.d.b = GSVector4i(dcz * ps0123).ps32(GSVector4i(dcz * ps4567));
movaps(xmm2, xmm1);
mulps(xmm1, xmm6);
mulps(xmm2, xmm7);
cvttps2dq(xmm1, xmm1);
cvttps2dq(xmm2, xmm2);
packssdw(xmm1, xmm2);
movdqa(ptr[&m_local.d.b], xmm1);
}
}
ret();
}
const GSVector4 GPUSetupPrimCodeGenerator::m_shift[3] =
{
GSVector4(8.0f, 8.0f, 8.0f, 8.0f),
GSVector4(0.0f, 1.0f, 2.0f, 3.0f),
GSVector4(4.0f, 5.0f, 6.0f, 7.0f),
};