mirror of https://github.com/PCSX2/pcsx2.git
GSdx: the x64 ABI on windows is not so nice after all.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4380 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
525175ba70
commit
a96a345077
|
@ -33,7 +33,7 @@ GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 k
|
|||
, m_local(*(GPUScanlineLocalData*)param)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
//#error TODO
|
||||
#endif
|
||||
|
||||
m_sel.key = key;
|
||||
|
|
|
@ -32,7 +32,7 @@ GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, vo
|
|||
, m_local(*(GPUScanlineLocalData*)param)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
//#error TODO
|
||||
#endif
|
||||
|
||||
m_sel.key = key;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -55,14 +55,21 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
|||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
|
||||
#if defined(_M_AMD64) || defined(_WIN64)
|
||||
void ReadPixel(const Xmm& dst, const Reg64& addr);
|
||||
void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm);
|
||||
#else
|
||||
void ReadPixel(const Xmm& dst, const Reg32& addr);
|
||||
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm);
|
||||
#endif
|
||||
|
||||
void ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2);
|
||||
void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i);
|
||||
|
||||
template<int shift> void modulate16(const Xmm& a, const Operand& f);
|
||||
template<int shift> void lerp16(const Xmm& a, const Xmm& b, const Xmm& f);
|
||||
void modulate16(const Xmm& a, const Operand& f, int shift);
|
||||
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);
|
||||
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
|
||||
void clamp16(const Xmm& a, const Xmm& temp);
|
||||
void alltrue();
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#error TODO
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -22,3 +22,36 @@
|
|||
#include "stdafx.h"
|
||||
#include "GSFunctionMap.h"
|
||||
|
||||
void GSCodeGenerator::enter(uint32 size, bool align)
|
||||
{
|
||||
#ifdef _M_AMD64
|
||||
|
||||
push(r15);
|
||||
mov(r15, rsp);
|
||||
if(size > 0) sub(rsp, size);
|
||||
if(align) and(rsp, 0xfffffffffffffff0);
|
||||
|
||||
#else
|
||||
|
||||
push(ebp);
|
||||
mov(ebp, esp);
|
||||
if(size > 0) sub(esp, size);
|
||||
if(align) and(esp, 0xfffffff0);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSCodeGenerator::leave()
|
||||
{
|
||||
#ifdef _M_AMD64
|
||||
|
||||
mov(rsp, r15);
|
||||
pop(r15);
|
||||
|
||||
#else
|
||||
|
||||
mov(esp, ebp);
|
||||
pop(ebp);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -145,7 +145,7 @@ public:
|
|||
int64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
|
||||
int64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
|
||||
|
||||
printf("[%016llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n",
|
||||
printf("[%014llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n",
|
||||
(uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ',
|
||||
(float)(tpf * 10000 / 50000000) / 100,
|
||||
(float)(tpf * 10000 / ttpf) / 100,
|
||||
|
@ -161,6 +161,9 @@ class GSCodeGenerator : public Xbyak::CodeGenerator
|
|||
protected:
|
||||
Xbyak::util::Cpu m_cpu;
|
||||
|
||||
void enter(uint32 size, bool align);
|
||||
void leave();
|
||||
|
||||
public:
|
||||
GSCodeGenerator(void* code, size_t maxsize)
|
||||
: Xbyak::CodeGenerator(maxsize, code)
|
||||
|
|
|
@ -144,7 +144,7 @@ void GSRendererSW::Draw()
|
|||
|
||||
if(s_save && s_n >= s_saven && PRIM->TME)
|
||||
{
|
||||
s = format("c:\\temp1\\_%05d_f%ll_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
|
||||
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
|
||||
|
||||
m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
|
||||
}
|
||||
|
|
|
@ -19,639 +19,9 @@
|
|||
*
|
||||
*/
|
||||
|
||||
// TODO: x64
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
, m_local(*(GSScanlineLocalData*)param)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
#endif
|
||||
|
||||
m_sel.key = key;
|
||||
|
||||
m_en.z = m_sel.zb ? 1 : 0;
|
||||
m_en.f = m_sel.fb && m_sel.fge ? 1 : 0;
|
||||
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
|
||||
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
|
||||
|
||||
Generate();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
else
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
static const float half = 0.5f;
|
||||
|
||||
vbroadcastss(xmm1, dword[&half]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
static const float half = 0.5f;
|
||||
|
||||
movss(xmm1, dword[&half]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
movaps(xmm0, ptr[ecx]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
|
||||
{
|
||||
GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
|
||||
|
@ -660,3 +30,17 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
|
|||
GSVector4(-2.0f, -1.0f, 0.0f, 1.0f),
|
||||
GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
|
||||
};
|
||||
|
||||
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
, m_local(*(GSScanlineLocalData*)param)
|
||||
{
|
||||
m_sel.key = key;
|
||||
|
||||
m_en.z = m_sel.zb ? 1 : 0;
|
||||
m_en.f = m_sel.fb && m_sel.fge ? 1 : 0;
|
||||
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
|
||||
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
|
||||
|
||||
Generate();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,349 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)&m_shift[0]);
|
||||
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
vbroadcastss(xmm1, ptr[r9]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,363 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
mov(r8, (size_t)&m_local);
|
||||
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[rcx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
mov(r9, (size_t)&GSVector4::m_half);
|
||||
|
||||
movss(xmm1, ptr[r9]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[rdx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
|
||||
case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
|
||||
case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
|
||||
case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
cvttps2dq(xmm0, ptr[rcx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
|
||||
vmulps(xmm1, xmm0);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpslld(xmm1, 1);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpsrld(xmm2, 31);
|
||||
vpand(xmm0, xmm2);
|
||||
|
||||
vpor(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,349 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(!m_sel.sprite)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertices[0].p;
|
||||
|
||||
movaps(xmm0, ptr[ecx + 16]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector4 z = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
if(m_sel.zoverflow)
|
||||
{
|
||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
|
||||
movaps(xmm1, ptr[&GSVector4::m_half]);
|
||||
mulps(xmm1, xmm0);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pslld(xmm1, 1);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
psrld(xmm2, 31);
|
||||
pand(xmm0, xmm2);
|
||||
|
||||
por(xmm0, xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.p.z = GSVector4i(z);
|
||||
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
}
|
||||
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + 32]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d4.st = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
movdqa(ptr[&m_local.d4.st], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].si/ti = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < 4; i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
||||
|
||||
movaps(xmm0, ptr[ecx]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
|
||||
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
|
||||
const GSVector4 GSVector4::m_half(0.5f, 0.5f, 0.5f, 0.5f);
|
||||
const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
|
||||
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
|
||||
|
||||
|
|
|
@ -2271,6 +2271,7 @@ public:
|
|||
|
||||
static const GSVector4 m_ps0123;
|
||||
static const GSVector4 m_ps4567;
|
||||
static const GSVector4 m_half;
|
||||
|
||||
static const GSVector4 m_x3f800000;
|
||||
static const GSVector4 m_x4b000000;
|
||||
|
|
|
@ -24,8 +24,7 @@
|
|||
#include "GSUtil.h"
|
||||
#include "GSState.h"
|
||||
|
||||
static const float s_fmin = -FLT_MAX;
|
||||
static const float s_fmax = FLT_MAX;
|
||||
const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX);
|
||||
|
||||
GSVertexTrace::GSVertexTrace(const GSState* state)
|
||||
: m_state(state)
|
||||
|
@ -51,7 +50,7 @@ uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass)
|
|||
|
||||
void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass)
|
||||
{
|
||||
m_map_sw[Hash(primclass)](v, count, m_min, m_max);
|
||||
m_map_sw[Hash(primclass)](count, v, m_min, m_max);
|
||||
|
||||
m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
|
||||
|
||||
|
@ -60,7 +59,7 @@ void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primcla
|
|||
|
||||
void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass)
|
||||
{
|
||||
m_map_hw9[Hash(primclass)](v, count, m_min, m_max);
|
||||
m_map_hw9[Hash(primclass)](count, v, m_min, m_max);
|
||||
|
||||
const GSDrawingContext* context = m_state->m_context;
|
||||
|
||||
|
@ -92,7 +91,7 @@ void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primcl
|
|||
|
||||
void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass)
|
||||
{
|
||||
m_map_hw11[Hash(primclass)](v, count, m_min, m_max);
|
||||
m_map_hw11[Hash(primclass)](count, v, m_min, m_max);
|
||||
|
||||
const GSDrawingContext* context = m_state->m_context;
|
||||
|
||||
|
@ -121,939 +120,3 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
|
|||
|
||||
m_alpha.valid = false;
|
||||
}
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _v = _args + 4;
|
||||
static const int _count = _args + 8;
|
||||
static const int _min = _args + 12;
|
||||
static const int _max = _args + 16;
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
#endif
|
||||
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_fmax]);
|
||||
vbroadcastss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm2, xmm4);
|
||||
vmovaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
vminps(xmm2, xmm0);
|
||||
vmaxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, xmm0);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexSW));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpsrld(xmm2, 7);
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
|
||||
vcvttps2dq(xmm3, xmm3);
|
||||
vpsrld(xmm3, 7);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_fmax]);
|
||||
movss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
movaps(xmm2, xmm4);
|
||||
movaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
minps(xmm2, xmm0);
|
||||
maxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexSW));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
psrld(xmm2, 7);
|
||||
movaps(ptr[eax], xmm2);
|
||||
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
psrld(xmm3, 7);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
#endif
|
||||
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 6;
|
||||
break;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_fmax]);
|
||||
vbroadcastss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpxor(xmm0, xmm0);
|
||||
|
||||
vpunpckhbw(xmm2, xmm0);
|
||||
vpunpcklwd(xmm2, xmm0);
|
||||
|
||||
vpunpckhbw(xmm3, xmm0);
|
||||
vpunpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_fmax]);
|
||||
movss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
#if _M_AMD64
|
||||
#error TODO
|
||||
#endif
|
||||
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if(m_cpu.has(util::Cpu::tAVX))
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_fmax]);
|
||||
vbroadcastss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
vmovaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
vpmovzxwd(xmm1, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpunpcklwd(xmm1, xmm0, xmm0);
|
||||
vpsrld(xmm1, 16);
|
||||
}
|
||||
|
||||
vpsrld(xmm0, 1);
|
||||
vpunpcklqdq(xmm1, xmm0);
|
||||
vcvtdq2ps(xmm1, xmm1);
|
||||
|
||||
vminps(xmm4, xmm1);
|
||||
vmaxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpxor(xmm0, xmm0);
|
||||
|
||||
vpunpckhbw(xmm2, xmm0);
|
||||
vpunpcklwd(xmm2, xmm0);
|
||||
|
||||
vpunpckhbw(xmm3, xmm0);
|
||||
vpunpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_fmax]);
|
||||
movss(xmm5, ptr[&s_fmin]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pmovzxwd(xmm1, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(xmm1, xmm0);
|
||||
punpcklwd(xmm1, xmm1);
|
||||
psrld(xmm1, 16);
|
||||
}
|
||||
|
||||
psrld(xmm0, 1);
|
||||
punpcklqdq(xmm1, xmm0);
|
||||
cvtdq2ps(xmm1, xmm1);
|
||||
|
||||
minps(xmm4, xmm1);
|
||||
maxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ __aligned(class, 32) GSVertexTrace
|
|||
struct Vertex {GSVector4i c; GSVector4 p, t;};
|
||||
struct VertexAlpha {int min, max; bool valid;};
|
||||
|
||||
typedef void (*VertexTracePtr)(const void* v, int count, Vertex& min, Vertex& max);
|
||||
typedef void (*VertexTracePtr)(int count, const void* v, Vertex& min, Vertex& max);
|
||||
|
||||
class CGSW : public GSCodeGenerator
|
||||
{
|
||||
|
@ -62,6 +62,8 @@ __aligned(class, 32) GSVertexTrace
|
|||
|
||||
const GSState* m_state;
|
||||
|
||||
static const GSVector4 s_minmax;
|
||||
|
||||
public:
|
||||
GS_PRIM_CLASS m_primclass;
|
||||
Vertex m_min, m_max; // t.xy * 0x10000
|
||||
|
|
|
@ -0,0 +1,496 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
vbroadcastss(xmm4, ptr[rax + 0]);
|
||||
vbroadcastss(xmm5, ptr[rax + 4]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm2, xmm4);
|
||||
vmovaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
vminps(xmm2, xmm0);
|
||||
vmaxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexSW));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpsrld(xmm2, 7);
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
|
||||
vcvttps2dq(xmm3, xmm3);
|
||||
vpsrld(xmm3, 7);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 6;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
vbroadcastss(xmm4, ptr[rax + 0]);
|
||||
vbroadcastss(xmm5, ptr[rax + 4]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
vbroadcastss(xmm4, ptr[rax + 0]);
|
||||
vbroadcastss(xmm5, ptr[rax + 4]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
vmovaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);
|
||||
vpmovzxwd(xmm1, xmm0);
|
||||
|
||||
vpsrld(xmm0, 1);
|
||||
vpunpcklqdq(xmm1, xmm0);
|
||||
vcvtdq2ps(xmm1, xmm1);
|
||||
|
||||
vminps(xmm4, xmm1);
|
||||
vmaxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[r8], xmm2);
|
||||
vmovaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[r8 + 16], xmm4);
|
||||
vmovaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
vmovaps(ptr[r8 + 32], xmm6);
|
||||
vmovaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,543 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
movdqa(ptr[rsp + 0], xmm6);
|
||||
movdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
movss(xmm4, ptr[rax + 0]);
|
||||
movss(xmm5, ptr[rax + 4]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
movaps(xmm2, xmm4);
|
||||
movaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
minps(xmm2, xmm0);
|
||||
maxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexSW));
|
||||
sub(rcx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
psrld(xmm2, 7);
|
||||
movaps(ptr[r8], xmm2);
|
||||
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
psrld(xmm3, 7);
|
||||
movaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
movdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 6;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
movdqa(ptr[rsp + 0], xmm6);
|
||||
movdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
movss(xmm4, ptr[rax + 0]);
|
||||
movss(xmm5, ptr[rax + 16]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[r8], xmm2);
|
||||
movaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
movdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
enter(32, true);
|
||||
|
||||
movdqa(ptr[rsp + 0], xmm6);
|
||||
movdqa(ptr[rsp + 16], xmm7);
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
mov(rax, (size_t)&s_minmax);
|
||||
|
||||
movss(xmm4, ptr[rax + 0]);
|
||||
movss(xmm5, ptr[rax + 16]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pmovzxwd(xmm1, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(xmm1, xmm0);
|
||||
punpcklwd(xmm1, xmm1);
|
||||
psrld(xmm1, 16);
|
||||
}
|
||||
|
||||
psrld(xmm0, 1);
|
||||
punpcklqdq(xmm1, xmm0);
|
||||
cvtdq2ps(xmm1, xmm1);
|
||||
|
||||
minps(xmm4, xmm1);
|
||||
maxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(rdx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[r8], xmm2);
|
||||
movaps(ptr[r9], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[r8 + 16], xmm4);
|
||||
movaps(ptr[r9 + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
movaps(ptr[r8 + 32], xmm6);
|
||||
movaps(ptr[r9 + 32], xmm7);
|
||||
}
|
||||
|
||||
movdqa(xmm6, ptr[rsp + 0]);
|
||||
movdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
leave();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,484 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _count = _args + 4; // rcx
|
||||
static const int _v = _args + 8; // rdx
|
||||
static const int _min = _args + 12; // r8
|
||||
static const int _max = _args + 16; // r9
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_minmax.x]);
|
||||
vbroadcastss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm2, xmm4);
|
||||
vmovaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
vminps(xmm2, xmm0);
|
||||
vmaxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexSW));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpsrld(xmm2, 7);
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
|
||||
vcvttps2dq(xmm3, xmm3);
|
||||
vpsrld(xmm3, 7);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 6;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_minmax.x]);
|
||||
vbroadcastss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
vminps(xmm4, xmm0);
|
||||
vmaxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
vdivps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
vbroadcastss(xmm4, ptr[&s_minmax.x]);
|
||||
vbroadcastss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
vpcmpeqd(xmm2, xmm2);
|
||||
vpxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
vmovaps(xmm6, xmm4);
|
||||
vmovaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
vpminub(xmm2, xmm0);
|
||||
vpmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
vmovaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vdivps(xmm0, xmm1);
|
||||
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
vminps(xmm6, xmm0);
|
||||
vmaxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
|
||||
vpmovzxwd(xmm1, xmm0);
|
||||
|
||||
vpsrld(xmm0, 1);
|
||||
vpunpcklqdq(xmm1, xmm0);
|
||||
vcvtdq2ps(xmm1, xmm1);
|
||||
|
||||
vminps(xmm4, xmm1);
|
||||
vmaxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm2, xmm2);
|
||||
|
||||
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpmovzxbd(xmm3, xmm3);
|
||||
|
||||
vmovaps(ptr[eax], xmm2);
|
||||
vmovaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
vmovaps(ptr[eax + 16], xmm4);
|
||||
vmovaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
vmovaps(ptr[eax + 32], xmm6);
|
||||
vmovaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,531 @@
|
|||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSVertexTrace.h"
|
||||
|
||||
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _count = _args + 4; // rcx
|
||||
static const int _v = _args + 8; // rdx
|
||||
static const int _min = _args + 12; // r8
|
||||
static const int _max = _args + 16; // r9
|
||||
|
||||
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_minmax.x]);
|
||||
movss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = FLT_MAX;
|
||||
// max.c = -FLT_MAX;
|
||||
|
||||
movaps(xmm2, xmm4);
|
||||
movaps(xmm3, xmm5);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.minv(v[i + j].c);
|
||||
// max.c = max.c.maxv(v[i + j].c);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
|
||||
|
||||
minps(xmm2, xmm0);
|
||||
maxps(xmm3, xmm0);
|
||||
}
|
||||
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
if(primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexSW));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
psrld(xmm2, 7);
|
||||
movaps(ptr[eax], xmm2);
|
||||
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
psrld(xmm3, 7);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 6;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_minmax.x]);
|
||||
movss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
if(tme && !fst && primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
// min.p = min.p.minv(v[i + j].p);
|
||||
// max.p = max.p.maxv(v[i + j].p);
|
||||
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
|
||||
|
||||
minps(xmm4, xmm0);
|
||||
maxps(xmm5, xmm0);
|
||||
|
||||
if(tme && !fst && primclass != GS_SPRITE_CLASS)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
// min.c = min.c.min_u8(v[i + j].c);
|
||||
// max.c = max.c.min_u8(v[i + j].c);
|
||||
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
// t /= p.wwww();
|
||||
|
||||
divps(xmm0, xmm1);
|
||||
}
|
||||
|
||||
// min.t = min.t.minv(v[i + j].t);
|
||||
// max.t = max.t.maxv(v[i + j].t);
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW9));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin;
|
||||
// m_max.p = pmax;
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin.xyww(pmin);
|
||||
// m_max.t = tmax.xyww(pmax);
|
||||
|
||||
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
{
|
||||
uint32 primclass = (key >> 0) & 3;
|
||||
uint32 iip = (key >> 2) & 1;
|
||||
uint32 tme = (key >> 3) & 1;
|
||||
uint32 fst = (key >> 4) & 1;
|
||||
uint32 color = (key >> 5) & 1;
|
||||
|
||||
int n = 1;
|
||||
|
||||
switch(primclass)
|
||||
{
|
||||
case GS_POINT_CLASS:
|
||||
n = 1;
|
||||
break;
|
||||
case GS_LINE_CLASS:
|
||||
case GS_SPRITE_CLASS:
|
||||
n = 2;
|
||||
break;
|
||||
case GS_TRIANGLE_CLASS:
|
||||
n = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
// min.p = FLT_MAX;
|
||||
// max.p = -FLT_MAX;
|
||||
|
||||
movss(xmm4, ptr[&s_minmax.x]);
|
||||
movss(xmm5, ptr[&s_minmax.y]);
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
||||
if(color)
|
||||
{
|
||||
// min.c = 0xffffffff;
|
||||
// max.c = 0;
|
||||
|
||||
pcmpeqd(xmm2, xmm2);
|
||||
pxor(xmm3, xmm3);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// min.t = FLT_MAX;
|
||||
// max.t = -FLT_MAX;
|
||||
|
||||
movaps(xmm6, xmm4);
|
||||
movaps(xmm7, xmm5);
|
||||
}
|
||||
|
||||
// for(int i = 0; i < count; i += step) {
|
||||
|
||||
mov(edx, dword[esp + _v]);
|
||||
mov(ecx, dword[esp + _count]);
|
||||
|
||||
align(16);
|
||||
|
||||
L("loop");
|
||||
|
||||
for(int j = 0; j < n; j++)
|
||||
{
|
||||
if(color && (iip || j == n - 1) || tme)
|
||||
{
|
||||
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
|
||||
}
|
||||
|
||||
if(color && (iip || j == n - 1))
|
||||
{
|
||||
pminub(xmm2, xmm0);
|
||||
pmaxub(xmm3, xmm0);
|
||||
}
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(!fst)
|
||||
{
|
||||
movaps(xmm1, xmm0);
|
||||
}
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
|
||||
|
||||
if(!fst)
|
||||
{
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
divps(xmm0, xmm1);
|
||||
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
|
||||
}
|
||||
|
||||
minps(xmm6, xmm0);
|
||||
maxps(xmm7, xmm0);
|
||||
}
|
||||
|
||||
movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pmovzxwd(xmm1, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(xmm1, xmm0);
|
||||
punpcklwd(xmm1, xmm1);
|
||||
psrld(xmm1, 16);
|
||||
}
|
||||
|
||||
psrld(xmm0, 1);
|
||||
punpcklqdq(xmm1, xmm0);
|
||||
cvtdq2ps(xmm1, xmm1);
|
||||
|
||||
minps(xmm4, xmm1);
|
||||
maxps(xmm5, xmm1);
|
||||
}
|
||||
|
||||
add(edx, n * sizeof(GSVertexHW11));
|
||||
sub(ecx, n);
|
||||
|
||||
jg("loop");
|
||||
|
||||
// }
|
||||
|
||||
mov(eax, dword[esp + _min]);
|
||||
mov(edx, dword[esp + _max]);
|
||||
|
||||
if(color)
|
||||
{
|
||||
// m_min.c = cmin.zzzz().u8to32();
|
||||
// m_max.c = cmax.zzzz().u8to32();
|
||||
|
||||
if(m_cpu.has(util::Cpu::tSSE41))
|
||||
{
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm2, xmm2);
|
||||
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pmovzxbd(xmm3, xmm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
pxor(xmm0, xmm0);
|
||||
|
||||
punpckhbw(xmm2, xmm0);
|
||||
punpcklwd(xmm2, xmm0);
|
||||
|
||||
punpckhbw(xmm3, xmm0);
|
||||
punpcklwd(xmm3, xmm0);
|
||||
}
|
||||
|
||||
movaps(ptr[eax], xmm2);
|
||||
movaps(ptr[edx], xmm3);
|
||||
}
|
||||
|
||||
// m_min.p = pmin.xyww();
|
||||
// m_max.p = pmax.xyww();
|
||||
|
||||
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
|
||||
|
||||
movaps(ptr[eax + 16], xmm4);
|
||||
movaps(ptr[edx + 16], xmm5);
|
||||
|
||||
if(tme)
|
||||
{
|
||||
// m_min.t = tmin;
|
||||
// m_max.t = tmax;
|
||||
|
||||
movaps(ptr[eax + 32], xmm6);
|
||||
movaps(ptr[edx + 32], xmm7);
|
||||
}
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
#endif
|
|
@ -466,12 +466,71 @@
|
|||
<ClCompile Include="GSDirtyRect.cpp" />
|
||||
<ClCompile Include="GSDrawScanline.cpp" />
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.cpp" />
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDump.cpp" />
|
||||
<ClCompile Include="GSdx.cpp" />
|
||||
<ClCompile Include="GSFunctionMap.cpp" />
|
||||
<ClCompile Include="GSLocalMemory.cpp" />
|
||||
<ClCompile Include="GSPerfMon.cpp" />
|
||||
<ClCompile Include="GSRasterizer.cpp" />
|
||||
<ClCompile Include="GSRasterizer.cpp">
|
||||
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">All</AssemblerOutput>
|
||||
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">AssemblyAndSourceCode</AssemblerOutput>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSRenderer.cpp" />
|
||||
<ClCompile Include="GSRendererDX.cpp" />
|
||||
<ClCompile Include="GSRendererDX11.cpp" />
|
||||
|
@ -482,6 +541,62 @@
|
|||
<ClCompile Include="GSSetting.cpp" />
|
||||
<ClCompile Include="GSSettingsDlg.cpp" />
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.cpp" />
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSState.cpp" />
|
||||
<ClCompile Include="GSTables.cpp" />
|
||||
<ClCompile Include="GSTexture.cpp" />
|
||||
|
@ -501,6 +616,62 @@
|
|||
<ClCompile Include="GSVertexList.cpp" />
|
||||
<ClCompile Include="GSVertexSW.cpp" />
|
||||
<ClCompile Include="GSVertexTrace.cpp" />
|
||||
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x64.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x86.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSWnd.cpp" />
|
||||
<ClCompile Include="stdafx.cpp">
|
||||
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">Create</PrecompiledHeader>
|
||||
|
|
|
@ -288,6 +288,42 @@
|
|||
<ClCompile Include="GSDeviceSDL.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x64.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSVertexTrace.x86.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="GS.h">
|
||||
|
|
|
@ -1040,6 +1040,13 @@ private:
|
|||
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
|
||||
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
|
||||
}
|
||||
#ifdef XBYAK64
|
||||
void opMovsxd(const Reg& reg, const Operand& op)
|
||||
{
|
||||
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
|
||||
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x63);
|
||||
}
|
||||
#endif
|
||||
void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
|
||||
{
|
||||
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
|
||||
|
|
|
@ -1021,4 +1021,5 @@ void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx())
|
|||
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); }
|
||||
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); }
|
||||
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); }
|
||||
void movsxd(const Reg64& reg, const Operand& op) { opMovsxd(reg, op); }
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue