mirror of https://github.com/PCSX2/pcsx2.git
GSdx: fixing the vs2008 project
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4382 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
2e6951d102
commit
f9da2669a7
File diff suppressed because it is too large
Load Diff
|
@ -1,123 +1,123 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#error TODO
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#error TODO
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,349 +1,349 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)&m_shift[0]);
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
vbroadcastss(xmm1, ptr[r9]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)&m_shift[0]);
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
vbroadcastss(xmm1, ptr[r9]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,363 +1,363 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
movss(xmm1, ptr[r9]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
cvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
movss(xmm1, ptr[r9]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
cvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,333 +1,333 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,349 +1,349 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
movaps(xmm1, ptr[&GSVector4::m_half]);
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
movaps(xmm0, ptr[ecx]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
movaps(xmm1, ptr[&GSVector4::m_half]);
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
movaps(xmm0, ptr[ecx]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,496 +1,496 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Generates a min/max scan over GSVertexSW vertices (c at +0, p at +16, t at +32).
	//
	// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color
	//
	// NOTE(review): from the way the registers are used below, ecx looks like the
	// vertex count, rdx the vertex pointer and r8 / r9 the min / max outputs -
	// confirm against the caller.

	const uint32 primclass = key & 3;
	const bool iip = ((key >> 2) & 1) != 0;
	const bool tme = ((key >> 3) & 1) != 0;
	const bool fst = ((key >> 4) & 1) != 0;
	const bool color = ((key >> 5) & 1) != 0;

	// number of vertices consumed per loop iteration

	int n = 1;

	if(primclass == GS_LINE_CLASS || primclass == GS_SPRITE_CLASS)
	{
		n = 2;
	}
	else if(primclass == GS_TRIANGLE_CLASS)
	{
		n = 3;
	}

	const bool sprite = primclass == GS_SPRITE_CLASS;

	// xmm6 / xmm7 are used as the t accumulators, keep their old values on the stack

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = FLT_MAX;
		// max.c = -FLT_MAX;

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && sprite)
	{
		// sprites divide both vertices by the third component of the second vertex's t

		vmovaps(xmm1, ptr[rdx + sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int j = 0; j < n; j++)
	{
		const size_t vtx = j * sizeof(GSVertexSW);

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j + 1 == n))
		{
			// min.c = min.c.minv(v[i + j].c);
			// max.c = max.c.maxv(v[i + j].c);

			vmovaps(xmm0, ptr[rdx + vtx]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		vmovaps(xmm0, ptr[rdx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(!tme)
		{
			continue;
		}

		// min.t = min.t.minv(v[i + j].t);
		// max.t = max.t.maxv(v[i + j].t);

		vmovaps(xmm0, ptr[rdx + vtx + 32]);

		if(!fst)
		{
			if(!sprite)
			{
				// divisor = this vertex's t.zzzz()

				vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
			}

			// divide, then keep the divided low two lanes and take the high two from the divisor

			vdivps(xmm0, xmm1);
			vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
		}

		vminps(xmm6, xmm0);
		vmaxps(xmm7, xmm0);
	}

	add(rdx, n * sizeof(GSVertexSW));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// colors are converted to integers and shifted right by 7 before being stored

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[r8], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[r9], xmm3);
	}

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// restore xmm6 / xmm7

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Generates a min/max scan over GSVertexHW9 vertices (t and packed color at +0, p at +16).
	//
	// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color
	//
	// NOTE(review): from the way the registers are used below, ecx looks like the
	// vertex count, rdx the vertex pointer and r8 / r9 the min / max outputs -
	// confirm against the caller.

	const uint32 primclass = key & 3;
	const bool iip = ((key >> 2) & 1) != 0;
	const bool tme = ((key >> 3) & 1) != 0;
	const bool fst = ((key >> 4) & 1) != 0;
	const bool color = ((key >> 5) & 1) != 0;

	// number of vertices consumed per loop iteration

	int n = 1;

	if(primclass == GS_LINE_CLASS)
	{
		n = 2;
	}
	else if(primclass == GS_TRIANGLE_CLASS)
	{
		n = 3;
	}
	else if(primclass == GS_SPRITE_CLASS)
	{
		n = 6;
	}

	const bool sprite = primclass == GS_SPRITE_CLASS;
	const bool divide = tme && !fst;

	// xmm6 / xmm7 are used as the t accumulators, keep their old values on the stack

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(divide && sprite)
	{
		// sprites divide all six vertices by p.wwww() of the last one

		vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int j = 0; j < n; j++)
	{
		const size_t vtx = j * sizeof(GSVertexHW9);

		// without iip only the last vertex of the primitive contributes its color

		const bool use_color = color && (iip || j + 1 == n);

		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		vmovaps(xmm0, ptr[rdx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(divide && !sprite)
		{
			// divisor = this vertex's p.wwww()

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		// a single load serves both the color and the texture coordinate update

		if(use_color || tme)
		{
			vmovaps(xmm0, ptr[rdx + vtx]);
		}

		if(use_color)
		{
			// min.c = min.c.min_u8(v[i + j].c);
			// max.c = max.c.max_u8(v[i + j].c);

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral

			if(!fst)
			{
				// t /= p.wwww();

				vdivps(xmm0, xmm1);
			}

			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	add(rdx, n * sizeof(GSVertexHW9));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin;
	// m_max.p = pmax;

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin.xyww(pmin);
		// m_max.t = tmax.xyww(pmax);

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// restore xmm6 / xmm7

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Generates a min/max scan over GSVertexHW11 vertices (t and packed color at +0,
	// integer position data at +16).
	//
	// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color
	//
	// NOTE(review): from the way the registers are used below, ecx looks like the
	// vertex count, rdx the vertex pointer and r8 / r9 the min / max outputs -
	// confirm against the caller.

	const uint32 primclass = key & 3;
	const bool iip = ((key >> 2) & 1) != 0;
	const bool tme = ((key >> 3) & 1) != 0;
	const bool fst = ((key >> 4) & 1) != 0;
	const bool color = ((key >> 5) & 1) != 0;

	// number of vertices consumed per loop iteration

	int n = 1;

	if(primclass == GS_LINE_CLASS || primclass == GS_SPRITE_CLASS)
	{
		n = 2;
	}
	else if(primclass == GS_TRIANGLE_CLASS)
	{
		n = 3;
	}

	// xmm6 / xmm7 are used as the t accumulators, keep their old values on the stack

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	for(int j = 0; j < n; j++)
	{
		const size_t vtx = j * sizeof(GSVertexHW11);

		// without iip only the last vertex of the primitive contributes its color

		const bool use_color = color && (iip || j + 1 == n);

		// a single load serves both the color and the texture coordinate update

		if(use_color || tme)
		{
			vmovaps(xmm0, ptr[rdx + vtx]);
		}

		if(use_color)
		{
			// byte-wise min / max of the packed color

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			if(!fst)
			{
				// keep an untouched copy to extract the divisor from

				vmovaps(xmm1, xmm0);
			}

			vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral

			if(!fst)
			{
				// divide by the fourth component, then put it back into the upper lanes

				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
			}

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// The position data is integral: the low qword is widened from 16-bit words,
		// the upper lanes come from the low qword of the raw dwords shifted right
		// by one, and the result is converted to float before the min / max.

		vmovdqa(xmm0, ptr[rdx + vtx + 16]);
		vpmovzxwd(xmm1, xmm0);

		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	add(rdx, n * sizeof(GSVertexHW11));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin.xyww();
	// m_max.p = pmax.xyww();

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin;
		// m_max.t = tmax;

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// restore xmm6 / xmm7

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Generates a min/max scan over GSVertexSW vertices (c at +0, p at +16, t at +32).
	//
	// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color
	//
	// NOTE(review): from the way the registers are used below, ecx looks like the
	// vertex count, rdx the vertex pointer and r8 / r9 the min / max outputs -
	// confirm against the caller.

	const uint32 primclass = key & 3;
	const bool iip = ((key >> 2) & 1) != 0;
	const bool tme = ((key >> 3) & 1) != 0;
	const bool fst = ((key >> 4) & 1) != 0;
	const bool color = ((key >> 5) & 1) != 0;

	// number of vertices consumed per loop iteration

	int n = 1;

	if(primclass == GS_LINE_CLASS || primclass == GS_SPRITE_CLASS)
	{
		n = 2;
	}
	else if(primclass == GS_TRIANGLE_CLASS)
	{
		n = 3;
	}

	const bool sprite = primclass == GS_SPRITE_CLASS;

	// xmm6 / xmm7 are used as the t accumulators, keep their old values on the stack

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = FLT_MAX;
		// max.c = -FLT_MAX;

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && sprite)
	{
		// sprites divide both vertices by the third component of the second vertex's t

		vmovaps(xmm1, ptr[rdx + sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int j = 0; j < n; j++)
	{
		const size_t vtx = j * sizeof(GSVertexSW);

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j + 1 == n))
		{
			// min.c = min.c.minv(v[i + j].c);
			// max.c = max.c.maxv(v[i + j].c);

			vmovaps(xmm0, ptr[rdx + vtx]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		vmovaps(xmm0, ptr[rdx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(!tme)
		{
			continue;
		}

		// min.t = min.t.minv(v[i + j].t);
		// max.t = max.t.maxv(v[i + j].t);

		vmovaps(xmm0, ptr[rdx + vtx + 32]);

		if(!fst)
		{
			if(!sprite)
			{
				// divisor = this vertex's t.zzzz()

				vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
			}

			// divide, then keep the divided low two lanes and take the high two from the divisor

			vdivps(xmm0, xmm1);
			vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
		}

		vminps(xmm6, xmm0);
		vmaxps(xmm7, xmm0);
	}

	add(rdx, n * sizeof(GSVertexSW));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// colors are converted to integers and shifted right by 7 before being stored

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[r8], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[r9], xmm3);
	}

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// restore xmm6 / xmm7

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x64 AVX routine that computes the min/max of GSVertexHW9 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, rdx = current vertex,
	//   r8 / r9 = where the min / max results are stored,
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_LINE_CLASS) verts = 2;
	else if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_SPRITE_CLASS) verts = 6;

	const int last = verts - 1;
	const bool divideTex = tmeOn && !fstOn; // texcoords get divided by p.w
	const bool isSprite = prim == GS_SPRITE_CLASS;

	// Prologue: reserve stack space and spill xmm6/xmm7 so they can be restored before returning.

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	if(divideTex && isSprite)
	{
		// Sprites use p.wwww of the sixth vertex as the divisor for every vertex.

		vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW9);
		const bool useColor = colorOn && (iipOn || i == last);

		// Position (offset 16): pmin = min(pmin, p), pmax = max(pmax, p).

		vmovaps(xmm0, ptr[rdx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(divideTex && !isSprite)
		{
			// Divisor for this vertex: xmm1 = p.wwww.

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[rdx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			// Copy the low two floats over the high half: the high part is integral
			// and would otherwise cause FP assists.

			vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0));

			if(!fstOn)
			{
				// t /= p.wwww

				vdivps(xmm0, xmm1);
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(rdx, verts * sizeof(GSVertexHW9));
	sub(ecx, verts);

	jg("loop");

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// Position results go to offset 16.

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32: x, y from the texcoord accumulator, w, w from the position one.

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// Epilogue: restore xmm6/xmm7 and return.

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x64 AVX routine that computes the min/max of GSVertexHW11 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, rdx = current vertex,
	//   r8 / r9 = where the min / max results are stored,
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS) verts = 2;

	const int last = verts - 1;

	// Prologue: reserve stack space and spill xmm6/xmm7 so they can be restored before returning.

	enter(32, true);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW11);
		const bool useColor = colorOn && (iipOn || i == last);

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[rdx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			if(fstOn)
			{
				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));
			}
			else
			{
				// Keep an untouched copy so q can be broadcast and restored afterwards.

				vmovaps(xmm1, xmm0);

				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));

				// Divide by the broadcast fourth component, then put q back into the upper half.

				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0));
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// Position (offset 16) is stored as integers: zero-extend the low four words into xmm1,
		// shift the dwords of xmm0 right by one, merge the two low qwords and convert to float.

		vmovdqa(xmm0, ptr[rdx + vtx + 16]);
		vpmovzxwd(xmm1, xmm0);

		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		// pmin = min(pmin, p), pmax = max(pmax, p).

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(rdx, verts * sizeof(GSVertexHW11));
	sub(ecx, verts);

	jg("loop");

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// Position results go to offset 16 as pmin.xyww() / pmax.xyww().

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32 unchanged.

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	// Epilogue: restore xmm6/xmm7 and return.

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
|
@ -1,484 +1,484 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
// Byte offsets, relative to esp, of the stack arguments read by the generated x86 routines.
// Each line also names the register the x64 generator uses for the same value.
// NOTE(review): the +4 on the first argument presumably skips the return address - confirm against the caller.
static const int _args = 0;
|
||||
static const int _count = _args + 4; // vertex count, loaded into ecx (rcx on x64)
|
||||
static const int _v = _args + 8; // vertex pointer, loaded into edx (rdx on x64)
|
||||
static const int _min = _args + 12; // min result pointer, loaded into eax (r8 on x64)
|
||||
static const int _max = _args + 16; // max result pointer, loaded into edx after the loop (r9 on x64)
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexSW vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS) verts = 2;

	const int last = verts - 1;
	const bool isSprite = prim == GS_SPRITE_CLASS;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color is float here, so its accumulators start from the same limits.

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same limits as well.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	if(tmeOn && !fstOn && isSprite)
	{
		// Sprites use component 2 of the second vertex's texcoord as the divisor for every vertex.

		vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexSW);

		if(colorOn && (iipOn || i == last))
		{
			// Color (offset 0): cmin = min(cmin, c), cmax = max(cmax, c).

			vmovaps(xmm0, ptr[edx + vtx]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// Position (offset 16): pmin = min(pmin, p), pmax = max(pmax, p).

		vmovaps(xmm0, ptr[edx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(tmeOn)
		{
			// Texcoord (offset 32).

			vmovaps(xmm0, ptr[edx + vtx + 32]);

			if(!fstOn)
			{
				if(!isSprite)
				{
					// Divisor for this vertex: component 2 of its own texcoord, broadcast.

					vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
				}

				// Divide, then take the upper two components back from the divisor register.

				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexSW));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Truncate the color accumulators to integers, shift right by 7, store at offset 0.

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[eax], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16.

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32.

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexHW9 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_LINE_CLASS) verts = 2;
	else if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_SPRITE_CLASS) verts = 6;

	const int last = verts - 1;
	const bool divideTex = tmeOn && !fstOn; // texcoords get divided by p.w
	const bool isSprite = prim == GS_SPRITE_CLASS;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	if(divideTex && isSprite)
	{
		// Sprites use p.wwww of the sixth vertex as the divisor for every vertex.

		vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW9);
		const bool useColor = colorOn && (iipOn || i == last);

		// Position (offset 16): pmin = min(pmin, p), pmax = max(pmax, p).

		vmovaps(xmm0, ptr[edx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(divideTex && !isSprite)
		{
			// Divisor for this vertex: xmm1 = p.wwww.

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[edx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			// Copy the low two floats over the high half: the high part is integral
			// and would otherwise cause FP assists.

			vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0));

			if(!fstOn)
			{
				// t /= p.wwww

				vdivps(xmm0, xmm1);
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexHW9));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16.

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32: x, y from the texcoord accumulator, w, w from the position one.

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexHW11 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS) verts = 2;

	const int last = verts - 1;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW11);
		const bool useColor = colorOn && (iipOn || i == last);

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[edx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			if(fstOn)
			{
				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));
			}
			else
			{
				// Keep an untouched copy so q can be broadcast and restored afterwards.

				vmovaps(xmm1, xmm0);

				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));

				// Divide by the broadcast fourth component, then put q back into the upper half.

				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0));
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// Position (offset 16) is stored as integers: zero-extend the low four words into xmm1,
		// shift the dwords of xmm0 right by one, merge the two low qwords and convert to float.

		vmovdqa(xmm0, ptr[edx + vtx + 16]);
		vpmovzxwd(xmm1, xmm0);

		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		// pmin = min(pmin, p), pmax = max(pmax, p).

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexHW11));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16 as pmin.xyww() / pmax.xyww().

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32 unchanged.

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
#endif
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
// Byte offsets, relative to esp, of the stack arguments read by the generated x86 routines.
// Each line also names the register the x64 generator uses for the same value.
// NOTE(review): the +4 on the first argument presumably skips the return address - confirm against the caller.
static const int _args = 0;
|
||||
static const int _count = _args + 4; // vertex count, loaded into ecx (rcx on x64)
|
||||
static const int _v = _args + 8; // vertex pointer, loaded into edx (rdx on x64)
|
||||
static const int _min = _args + 12; // min result pointer, loaded into eax (r8 on x64)
|
||||
static const int _max = _args + 16; // max result pointer, loaded into edx after the loop (r9 on x64)
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexSW vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS) verts = 2;

	const int last = verts - 1;
	const bool isSprite = prim == GS_SPRITE_CLASS;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color is float here, so its accumulators start from the same limits.

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same limits as well.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	if(tmeOn && !fstOn && isSprite)
	{
		// Sprites use component 2 of the second vertex's texcoord as the divisor for every vertex.

		vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexSW);

		if(colorOn && (iipOn || i == last))
		{
			// Color (offset 0): cmin = min(cmin, c), cmax = max(cmax, c).

			vmovaps(xmm0, ptr[edx + vtx]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// Position (offset 16): pmin = min(pmin, p), pmax = max(pmax, p).

		vmovaps(xmm0, ptr[edx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(tmeOn)
		{
			// Texcoord (offset 32).

			vmovaps(xmm0, ptr[edx + vtx + 32]);

			if(!fstOn)
			{
				if(!isSprite)
				{
					// Divisor for this vertex: component 2 of its own texcoord, broadcast.

					vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
				}

				// Divide, then take the upper two components back from the divisor register.

				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexSW));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Truncate the color accumulators to integers, shift right by 7, store at offset 0.

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[eax], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16.

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32.

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexHW9 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_LINE_CLASS) verts = 2;
	else if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_SPRITE_CLASS) verts = 6;

	const int last = verts - 1;
	const bool divideTex = tmeOn && !fstOn; // texcoords get divided by p.w
	const bool isSprite = prim == GS_SPRITE_CLASS;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	if(divideTex && isSprite)
	{
		// Sprites use p.wwww of the sixth vertex as the divisor for every vertex.

		vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW9);
		const bool useColor = colorOn && (iipOn || i == last);

		// Position (offset 16): pmin = min(pmin, p), pmax = max(pmax, p).

		vmovaps(xmm0, ptr[edx + vtx + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(divideTex && !isSprite)
		{
			// Divisor for this vertex: xmm1 = p.wwww.

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[edx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			// Copy the low two floats over the high half: the high part is integral
			// and would otherwise cause FP assists.

			vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0));

			if(!fstOn)
			{
				// t /= p.wwww

				vdivps(xmm0, xmm1);
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexHW9));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16.

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32: x, y from the texcoord accumulator, w, w from the position one.

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// Emits the x86 AVX routine that computes the min/max of GSVertexHW11 vertices.
	// The routine is specialized by "key":
	//   bits 0-1 = primitive class, bit 2 = iip, bit 3 = tme, bit 4 = fst, bit 5 = color.
	// Registers used by the emitted code:
	//   ecx = remaining vertex count, edx = current vertex (both loaded from the stack arguments),
	//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.

	const uint32 prim = (key >> 0) & 3;
	const bool iipOn = ((key >> 2) & 1) != 0;
	const bool tmeOn = ((key >> 3) & 1) != 0;
	const bool fstOn = ((key >> 4) & 1) != 0;
	const bool colorOn = ((key >> 5) & 1) != 0;

	// Number of vertices handled by one pass of the emitted loop.
	int verts = 1;

	if(prim == GS_TRIANGLE_CLASS) verts = 3;
	else if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS) verts = 2;

	const int last = verts - 1;

	// Position accumulators: min starts at FLT_MAX, max at -FLT_MAX (both read from s_minmax).

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(colorOn)
	{
		// Color accumulators: min starts at 0xffffffff, max at 0.

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tmeOn)
	{
		// Texcoord accumulators start from the same FLT_MAX / -FLT_MAX values.

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// Fetch the vertex pointer and vertex count from the stack arguments.

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	// Loop head: one pass per primitive.

	align(16);

	L("loop");

	for(int i = 0; i < verts; ++i)
	{
		const size_t vtx = i * sizeof(GSVertexHW11);
		const bool useColor = colorOn && (iipOn || i == last);

		if(useColor || tmeOn)
		{
			// The first 16 bytes of the vertex feed both the color and the texcoord paths.

			vmovaps(xmm0, ptr[edx + vtx]);
		}

		if(useColor)
		{
			// cmin = min_u8(cmin, c), cmax = max_u8(cmax, c).

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tmeOn)
		{
			if(fstOn)
			{
				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));
			}
			else
			{
				// Keep an untouched copy so q can be broadcast and restored afterwards.

				vmovaps(xmm1, xmm0);

				// Replace the third dword (integral) with a copy of the fourth to avoid FP assists.

				vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0));

				// Divide by the broadcast fourth component, then put q back into the upper half.

				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0));
			}

			// tmin = min(tmin, t), tmax = max(tmax, t).

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// Position (offset 16) is stored as integers: zero-extend the low four words into xmm1,
		// shift the dwords of xmm0 right by one, merge the two low qwords and convert to float.

		vmovdqa(xmm0, ptr[edx + vtx + 16]);
		vpmovzxwd(xmm1, xmm0);

		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		// pmin = min(pmin, p), pmax = max(pmax, p).

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	// Advance to the next primitive; keep looping while the remaining count is positive.

	add(edx, verts * sizeof(GSVertexHW11));
	sub(ecx, verts);

	jg("loop");

	// Fetch the result pointers: eax = min, edx = max.

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(colorOn)
	{
		// Broadcast dword 2 of each color accumulator, widen its bytes to dwords, store at offset 0.

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// Position results go to offset 16 as pmin.xyww() / pmax.xyww().

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(tmeOn)
	{
		// Texcoord results go to offset 32 unchanged.

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -892,6 +892,110 @@
|
|||
RelativePath=".\GSDrawScanlineCodeGenerator.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSDrawScanlineCodeGenerator.x64.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSDrawScanlineCodeGenerator.x86.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSDump.cpp"
|
||||
>
|
||||
|
@ -956,6 +1060,110 @@
|
|||
RelativePath=".\GSSetupPrimCodeGenerator.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSSetupPrimCodeGenerator.x64.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSSetupPrimCodeGenerator.x86.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSState.cpp"
|
||||
>
|
||||
|
@ -1032,6 +1240,110 @@
|
|||
RelativePath=".\GSVertexTrace.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSVertexTrace.x64.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|Win32"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSVertexTrace.x86.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE2|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSSE3|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release SSE4|x64"
|
||||
ExcludedFromBuild="true"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\GSWnd.cpp"
|
||||
>
|
||||
|
|
Loading…
Reference in New Issue