/* * Copyright (C) 2007-2009 Gabest * http://www.gabest.org * * This Program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This Program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. * http://www.gnu.org/copyleft/gpl.html * */ // TODO: x64 #include "stdafx.h" #include "GPUSetupPrimCodeGenerator.h" #include "GSVertexSW.h" using namespace Xbyak; static const int _args = 0; static const int _vertex = _args + 4; static const int _index = _args + 8; static const int _dscan = _args + 12; GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) , m_local(*(GPUScanlineLocalData*)param) { m_sel.key = key; Generate(); } void GPUSetupPrimCodeGenerator::Generate() { if(m_sel.tme && !m_sel.twin) { pcmpeqd(xmm0, xmm0); if(m_sel.sprite) { // t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001(); mov(ecx, ptr[esp + _index]); mov(ecx, ptr[ecx + sizeof(uint32) * 1]); shl(ecx, 6); // * sizeof(GSVertexSW) add(ecx, ptr[esp + _vertex]); cvttps2dq(xmm1, ptr[ecx + offsetof(GSVertexSW, t)]); psrld(xmm1, 8); psrld(xmm0, 31); psubd(xmm1, xmm0); // t = t.ps32(t); // t = t.upl16(t); packssdw(xmm1, xmm1); punpcklwd(xmm1, xmm1); // m_local.twin[2].u = t.xxxx(); // m_local.twin[2].v = t.yyyy(); pshufd(xmm2, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm1, _MM_SHUFFLE(1, 1, 1, 1)); movdqa(ptr[&m_local.twin[2].u], xmm2); movdqa(ptr[&m_local.twin[2].v], xmm3); } else { // TODO: not really needed // m_local.twin[2].u = GSVector4i::x00ff(); // m_local.twin[2].v = GSVector4i::x00ff(); psrlw(xmm0, 8); movdqa(ptr[&m_local.twin[2].u], xmm0); movdqa(ptr[&m_local.twin[2].v], xmm0); } } if(m_sel.tme || m_sel.iip && m_sel.tfx != 3) { mov(edx, dword[esp + _dscan]); for(int i = 0; i < 3; i++) { movaps(Xmm(5 + i), ptr[&m_shift[i]]); } // GSVector4 dt = dscan.t; // GSVector4 dc = dscan.c; movaps(xmm4, ptr[edx + offsetof(GSVertexSW, c)]); movaps(xmm3, ptr[edx + offsetof(GSVertexSW, t)]); // GSVector4i dtc8 = GSVector4i(dt * 8.0f).ps32(GSVector4i(dc * 8.0f)); movaps(xmm1, xmm3); mulps(xmm1, xmm5); cvttps2dq(xmm1, xmm1); movaps(xmm2, xmm4); mulps(xmm2, xmm5); cvttps2dq(xmm2, xmm2); packssdw(xmm1, xmm2); if(m_sel.tme) { // m_local.d8.st = dtc8.upl16(dtc8); movdqa(xmm0, xmm1); punpcklwd(xmm0, xmm0); movdqa(ptr[&m_local.d8.st], xmm0); } if(m_sel.iip && m_sel.tfx != 3) { // m_local.d8.c = dtc8.uph16(dtc8); punpckhwd(xmm1, xmm1); movdqa(ptr[&m_local.d8.c], xmm1); } // xmm3 = dt // xmm4 = dc // xmm6 = ps0123 // xmm7 = ps4567 // xmm0, xmm1, xmm2, xmm5 = free if(m_sel.tme) { // GSVector4 dtx = dt.xxxx(); // GSVector4 dty = dt.yyyy(); movaps(xmm0, xmm3); shufps(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); // m_local.d.s = GSVector4i(dtx * ps0123).ps32(GSVector4i(dtx * ps4567)); movaps(xmm1, xmm3); mulps(xmm3, xmm6); mulps(xmm1, xmm7); cvttps2dq(xmm3, xmm3); cvttps2dq(xmm1, xmm1); packssdw(xmm3, xmm1); movdqa(ptr[&m_local.d.s], xmm3); // m_local.d.t = GSVector4i(dty * ps0123).ps32(GSVector4i(dty * ps4567)); movaps(xmm1, xmm0); mulps(xmm0, xmm6); mulps(xmm1, xmm7); cvttps2dq(xmm0, xmm0); cvttps2dq(xmm1, xmm1); packssdw(xmm0, xmm1); movdqa(ptr[&m_local.d.t], xmm0); } // xmm4 = dc // xmm6 = ps0123 // xmm7 = ps4567 // xmm0, xmm1, zmm2, xmm3, xmm5 = free if(m_sel.iip && m_sel.tfx != 3) { // GSVector4 dcx = dc.xxxx(); // GSVector4 dcy = dc.yyyy(); // GSVector4 dcz = dc.zzzz(); movaps(xmm0, xmm4); movaps(xmm1, xmm4); shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); // m_local.d.r = GSVector4i(dcx * ps0123).ps32(GSVector4i(dcx * ps4567)); movaps(xmm2, xmm4); mulps(xmm4, xmm6); mulps(xmm2, xmm7); cvttps2dq(xmm4, xmm4); cvttps2dq(xmm2, xmm2); packssdw(xmm4, xmm2); movdqa(ptr[&m_local.d.r], xmm4); // m_local.d.g = GSVector4i(dcy * ps0123).ps32(GSVector4i(dcy * ps4567)); movaps(xmm2, xmm0); mulps(xmm0, xmm6); mulps(xmm2, xmm7); cvttps2dq(xmm0, xmm0); cvttps2dq(xmm2, xmm2); packssdw(xmm0, xmm2); movdqa(ptr[&m_local.d.g], xmm0); // m_local.d.b = GSVector4i(dcz * ps0123).ps32(GSVector4i(dcz * ps4567)); movaps(xmm2, xmm1); mulps(xmm1, xmm6); mulps(xmm2, xmm7); cvttps2dq(xmm1, xmm1); cvttps2dq(xmm2, xmm2); packssdw(xmm1, xmm2); movdqa(ptr[&m_local.d.b], xmm1); } } ret(); } const GSVector4 GPUSetupPrimCodeGenerator::m_shift[3] = { GSVector4(8.0f, 8.0f, 8.0f, 8.0f), GSVector4(0.0f, 1.0f, 2.0f, 3.0f), GSVector4(4.0f, 5.0f, 6.0f, 7.0f), };