GSdx: the x64 ABI on windows is not so nice after all.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4380 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-02-28 11:08:52 +00:00
parent 525175ba70
commit a96a345077
28 changed files with 9896 additions and 5369 deletions

View File

@ -33,7 +33,7 @@ GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 k
, m_local(*(GPUScanlineLocalData*)param) , m_local(*(GPUScanlineLocalData*)param)
{ {
#if _M_AMD64 #if _M_AMD64
#error TODO //#error TODO
#endif #endif
m_sel.key = key; m_sel.key = key;

View File

@ -32,7 +32,7 @@ GPUSetupPrimCodeGenerator::GPUSetupPrimCodeGenerator(void* param, uint32 key, vo
, m_local(*(GPUScanlineLocalData*)param) , m_local(*(GPUScanlineLocalData*)param)
{ {
#if _M_AMD64 #if _M_AMD64
#error TODO //#error TODO
#endif #endif
m_sel.key = key; m_sel.key = key;

File diff suppressed because it is too large Load Diff

View File

@ -55,14 +55,21 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
void AlphaBlend(); void AlphaBlend();
void WriteFrame(); void WriteFrame();
#if defined(_M_AMD64) || defined(_WIN64)
void ReadPixel(const Xmm& dst, const Reg64& addr);
void WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm);
#else
void ReadPixel(const Xmm& dst, const Reg32& addr); void ReadPixel(const Xmm& dst, const Reg32& addr);
void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz); void WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm); void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm);
#endif
void ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2); void ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2);
void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i); void ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i);
template<int shift> void modulate16(const Xmm& a, const Operand& f); void modulate16(const Xmm& a, const Operand& f, int shift);
template<int shift> void lerp16(const Xmm& a, const Xmm& b, const Xmm& f); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift);
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp); void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
void clamp16(const Xmm& a, const Xmm& temp); void clamp16(const Xmm& a, const Xmm& temp);
void alltrue(); void alltrue();

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,123 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#error TODO
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
// Placeholder definitions for the configuration selected by the surrounding #if
// (_M_SSE < 0x500 on x64). Every stage below has an empty body, i.e. no code is emitted yet.
void GSDrawScanlineCodeGenerator::Generate()
{
}
void GSDrawScanlineCodeGenerator::Init()
{
}
void GSDrawScanlineCodeGenerator::Step()
{
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
}
void GSDrawScanlineCodeGenerator::SampleTexture()
{
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
{
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
{
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
{
}
void GSDrawScanlineCodeGenerator::ReadMask()
{
}
void GSDrawScanlineCodeGenerator::TestAlpha()
{
}
void GSDrawScanlineCodeGenerator::ColorTFX()
{
}
void GSDrawScanlineCodeGenerator::Fog()
{
}
void GSDrawScanlineCodeGenerator::ReadFrame()
{
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
{
}
void GSDrawScanlineCodeGenerator::WriteMask()
{
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
{
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
}
void GSDrawScanlineCodeGenerator::WriteFrame()
{
}
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
{
}
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
}
static const int s_offsets[4] = {0, 2, 8, 10};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
{
}
// Stub: the four-register texel read emits nothing yet in this configuration.
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2)
{
}
// Stub: the single-texel (index i) read emits nothing yet in this configuration.
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
{
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -22,3 +22,36 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSFunctionMap.h" #include "GSFunctionMap.h"
// Emits a function prologue into the generated code.
//
// The frame register (r15 on x64, ebp on x86) is pushed and then set to the current stack
// pointer so that leave() can restore it. After that, `size` bytes are reserved on the stack
// (skipped when size is 0) and, if `align` is set, the stack pointer is rounded down to a
// 16-byte boundary.
void GSCodeGenerator::enter(uint32 size, bool align)
{
#if defined(_M_AMD64)

	push(r15);
	mov(r15, rsp);

	if(size != 0)
	{
		sub(rsp, size);
	}

	if(align)
	{
		and(rsp, 0xfffffffffffffff0);
	}

#else

	push(ebp);
	mov(ebp, esp);

	if(size != 0)
	{
		sub(esp, size);
	}

	if(align)
	{
		and(esp, 0xfffffff0);
	}

#endif
}
// Emits the epilogue matching enter(): the stack pointer is reloaded from the frame register
// (ebp on x86, r15 on x64), which discards any reservation/alignment done by enter(), and the
// frame register's saved value is popped back.
void GSCodeGenerator::leave()
{
#ifndef _M_AMD64

	mov(esp, ebp);
	pop(ebp);

#else

	mov(rsp, r15);
	pop(r15);

#endif
}

View File

@ -145,7 +145,7 @@ public:
int64 tpf = p->frames > 0 ? p->ticks / p->frames : 0; int64 tpf = p->frames > 0 ? p->ticks / p->frames : 0;
int64 ppf = p->frames > 0 ? p->pixels / p->frames : 0; int64 ppf = p->frames > 0 ? p->pixels / p->frames : 0;
printf("[%016llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n", printf("[%014llx]%c %6.2f%% | %5.2f%% | f %4lld | p %10lld | tpp %4lld | tpf %9lld | ppf %7lld\n",
(uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ', (uint64)key, m_map.find(key) == m_map.end() ? '*' : ' ',
(float)(tpf * 10000 / 50000000) / 100, (float)(tpf * 10000 / 50000000) / 100,
(float)(tpf * 10000 / ttpf) / 100, (float)(tpf * 10000 / ttpf) / 100,
@ -161,6 +161,9 @@ class GSCodeGenerator : public Xbyak::CodeGenerator
protected: protected:
Xbyak::util::Cpu m_cpu; Xbyak::util::Cpu m_cpu;
void enter(uint32 size, bool align);
void leave();
public: public:
GSCodeGenerator(void* code, size_t maxsize) GSCodeGenerator(void* code, size_t maxsize)
: Xbyak::CodeGenerator(maxsize, code) : Xbyak::CodeGenerator(maxsize, code)

View File

@ -144,7 +144,7 @@ void GSRendererSW::Draw()
if(s_save && s_n >= s_saven && PRIM->TME) if(s_save && s_n >= s_saven && PRIM->TME)
{ {
s = format("c:\\temp1\\_%05d_f%ll_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
} }

View File

@ -19,639 +19,9 @@
* *
*/ */
// TODO: x64
#include "stdafx.h" #include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.h"
using namespace Xbyak;
// Binds the generator to the caller-supplied local data (param) and code buffer, derives the
// m_en enable flags from the selector key, then emits the setup code through Generate().
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)
{
// The emitters in this file address their inputs through ecx/edx, so there is no 64-bit path;
// the build is stopped deliberately when targeting AMD64.
#if _M_AMD64
#error TODO
#endif
m_sel.key = key;
// Each flag is normalized to 0/1; f, t and c are only enabled when m_sel.fb is set.
m_en.z = m_sel.zb ? 1 : 0;
m_en.f = m_sel.fb && m_sel.fge ? 1 : 0;
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
Generate();
}
void GSSetupPrimCodeGenerator::Generate()
{
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
{
for(int i = 0; i < 5; i++)
{
if(m_cpu.has(util::Cpu::tAVX))
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
else
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
}
}
Depth();
Texture();
Color();
ret();
}
// Emits the code that fills the depth (z) and fog (f) fields of m_local.
// Nothing is emitted when neither m_en.z nor m_en.f is set.
// - non-sprite: reads dscan.p from [edx + 16] and stores the deltas d4.* and d[0..3].*,
//   using the shift constants held in xmm3 (d4) and xmm4..xmm7 (d[i]).
// - sprite: reads vertices[0].p from [ecx + 16] and stores the constants p.f / p.z.
// The AVX and SSE branches emit the same computation with different encodings.
void GSSetupPrimCodeGenerator::Depth()
{
if(!m_en.z && !m_en.f)
{
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
if(!m_sel.sprite)
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[edx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
vmovaps(xmm0, ptr[ecx + 16]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.p.f], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
vbroadcastss(xmm1, dword[&half]);
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
// xmm2 = all ones >> 31 = 0x00000001 in every lane
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[&m_local.p.z], xmm0);
}
}
}
else
{
if(!m_sel.sprite)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[edx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, ptr[ecx + 16]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_local.p.f], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
// no broadcast instruction on this path: load the scalar and splat it with shufps
movss(xmm1, dword[&half]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
// xmm2 = all ones >> 31 = 0x00000001 in every lane
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[&m_local.p.z], xmm0);
}
}
}
}
// Emits the code that fills the texture-coordinate deltas of m_local from dscan.t ([edx + 32]).
// Nothing is emitted when m_en.t is not set.
// With m_sel.fst the products are truncated to integers (d4.st, d[i].si/ti, two components);
// otherwise they are stored as floats (d4.stq, d[i].s/t/q, three components).
// Relies on xmm3 holding the "* 4.0f" constant and xmm4..xmm7 the per-pixel shifts.
void GSSetupPrimCodeGenerator::Texture()
{
if(!m_en.t)
{
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
// GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[edx + 32]);
vmulps(xmm1, xmm0, xmm3);
if(m_sel.fst)
{
// m_local.d4.st = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_local.d4.st], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
vmovaps(ptr[&m_local.d4.stq], xmm1);
}
// j selects the component to broadcast: 0 = s, 1 = t, 2 = q (q only when !fst)
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
{
// m_local.d[i].si/ti = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
}
}
}
}
}
else
{
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[edx + 32]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
if(m_sel.fst)
{
// m_local.d4.st = GSVector4i(t * 4.0f);
cvttps2dq(xmm1, xmm1);
movdqa(ptr[&m_local.d4.st], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
movaps(ptr[&m_local.d4.stq], xmm1);
}
// j selects the component to broadcast: 0 = s, 1 = t, 2 = q (q only when !fst)
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
if(m_sel.fst)
{
// m_local.d[i].si/ti = GSVector4i(v);
cvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
}
}
}
}
}
}
// Emits the code that fills the color fields of m_local. Nothing is emitted when m_en.c is not set.
// - m_sel.iip set: reads dscan.c from [edx] and stores the deltas d4.c and d[0..3].rb / .ga.
// - otherwise: reads vertices[0].c from [ecx] and stores the constants c.rb / c.ga.
// NOTE: the iip path overwrites xmm3 (the "* 4.0f" constant) once d4.c has been stored, so
// nothing emitted after this function may rely on xmm3.
void GSSetupPrimCodeGenerator::Color()
{
if(!m_en.c)
{
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(ptr[&m_local.d4.c], xmm1);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
vcvttps2dq(xmm0, ptr[ecx]);
// c = c.upl16(c.zwxy());
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
vpsrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.c.rb], xmm1);
vmovdqa(ptr[&m_local.c.ga], xmm2);
}
}
else
{
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[&m_local.d4.c], xmm2);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_local.d[i].rb], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_local.d[i].ga], xmm2);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
movaps(xmm0, ptr[ecx]);
cvttps2dq(xmm0, xmm0);
// c = c.upl16(c.zwxy());
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
psrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_local.c.rb], xmm1);
movdqa(ptr[&m_local.c.ga], xmm2);
}
}
}
const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] = const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
{ {
GSVector4(4.0f, 4.0f, 4.0f, 4.0f), GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
@ -660,3 +30,17 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
GSVector4(-2.0f, -1.0f, 0.0f, 1.0f), GSVector4(-2.0f, -1.0f, 0.0f, 1.0f),
GSVector4(-3.0f, -2.0f, -1.0f, 0.0f), GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
}; };
// Binds the generator to the caller-supplied local data (param) and code buffer, derives the
// m_en enable flags from the selector key, then emits the setup code through Generate().
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
	, m_local(*(GSScanlineLocalData*)param)
{
	m_sel.key = key;

	// f, t and c are only enabled when m_sel.fb is set.
	const bool fb = m_sel.fb ? true : false;
	const bool decalTcc = m_sel.tfx == TFX_DECAL && m_sel.tcc;

	m_en.z = m_sel.zb ? 1 : 0;
	m_en.f = (fb && m_sel.fge) ? 1 : 0;
	m_en.t = (fb && m_sel.tfx != TFX_NONE) ? 1 : 0;
	m_en.c = (fb && !decalTcc) ? 1 : 0;

	Generate();
}

View File

@ -0,0 +1,349 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Emits the whole setup routine for the AVX x64 build: prologue, xmm6/xmm7 save, optional
// preload of the m_shift table, the depth/texture/color sections, restore and return.
void GSSetupPrimCodeGenerator::Generate()
{
	// 32 bytes of 16-byte aligned scratch: one slot each for xmm6 and xmm7.
	enter(32, true);

	// xmm6/xmm7 receive m_shift[3..4] below; they are non-volatile in the Windows x64
	// calling convention, so the incoming values are kept on the stack.
	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// r8 is the base register for every m_local store emitted by Depth/Texture/Color.
	mov(r8, (size_t)&m_local);

	// The shift constants are only consumed by the paths that compute per-step deltas.
	const bool needShift = ((m_en.z || m_en.f) && !m_sel.sprite) || m_en.t || (m_en.c && m_sel.iip);

	if(needShift)
	{
		// xmm3..xmm7 <- m_shift[0..4]
		mov(rax, (size_t)&m_shift[0]);

		for(int n = 0; n < 5; n++)
		{
			vmovaps(Xmm(3 + n), ptr[rax + n * 16]);
		}
	}

	Depth();

	Texture();

	Color();

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
// Emits the code that fills the depth (z) and fog (f) fields of m_local (AVX, x64).
// Nothing is emitted when neither m_en.z nor m_en.f is set.
// - non-sprite: reads dscan.p from [rdx + 16] and stores the deltas d4.* and d[0..3].*,
//   using the shift constants held in xmm3 (d4) and xmm4..xmm7 (d[i]).
// - sprite: reads vertices[0].p from [rcx + 16] and stores the constants p.f / p.z.
// All stores go through r8, which Generate() loads with &m_local.
void GSSetupPrimCodeGenerator::Depth()
{
if(!m_en.z && !m_en.f)
{
return;
}
if(!m_sel.sprite)
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[rdx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
vmovaps(xmm0, ptr[rcx + 16]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
// the constant's address is materialized in r9 and then dereferenced
mov(r9, (size_t)&GSVector4::m_half);
vbroadcastss(xmm1, ptr[r9]);
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
// xmm2 = all ones >> 31 = 0x00000001 in every lane
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
}
}
}
// Emits the code that fills the texture-coordinate deltas of m_local from dscan.t
// ([rdx + 32]) (AVX, x64). Nothing is emitted when m_en.t is not set.
// With m_sel.fst the products are truncated to integers (d4.st, d[i].si/ti, two components);
// otherwise they are stored as floats (d4.stq, d[i].s/t/q, three components).
// Relies on r8 = &m_local, xmm3 = the "* 4.0f" constant and xmm4..xmm7 = the per-pixel shifts.
void GSSetupPrimCodeGenerator::Texture()
{
if(!m_en.t)
{
return;
}
// GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[rdx + 32]);
vmulps(xmm1, xmm0, xmm3);
if(m_sel.fst)
{
// m_local.d4.st = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
// j selects the component to broadcast: 0 = s, 1 = t, 2 = q (q only when !fst)
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
{
// m_local.d[i].si/ti = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
}
}
}
}
}
// Emits the code that fills the color fields of m_local (AVX, x64). Nothing is emitted when
// m_en.c is not set.
// - m_sel.iip set: reads dscan.c from [rdx] and stores the deltas d4.c and d[0..3].rb / .ga.
// - otherwise: reads vertices[0].c from [rcx] and stores the constants c.rb / c.ga.
// All stores go through r8 (&m_local). NOTE: the iip path overwrites xmm3 (the "* 4.0f"
// constant) once d4.c has been stored, so nothing emitted afterwards may rely on xmm3.
void GSSetupPrimCodeGenerator::Color()
{
if(!m_en.c)
{
return;
}
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
vcvttps2dq(xmm0, ptr[rcx]);
// c = c.upl16(c.zwxy());
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
vpsrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
}
}
#endif

View File

@ -0,0 +1,363 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate()
{
enter(32, true);
vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7);
mov(r8, (size_t)&m_local);
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
{
for(int i = 0; i < 5; i++)
{
movaps(Xmm(3 + i), ptr[rax + i * 16]);
}
}
Depth();
Texture();
Color();
vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]);
leave();
ret();
}
// Emits the code that fills the depth (z) and fog (f) fields of m_local (SSE, x64).
// Nothing is emitted when neither m_en.z nor m_en.f is set.
// - non-sprite: reads dscan.p from [rdx + 16] and stores the deltas d4.* and d[0..3].*,
//   using the shift constants held in xmm3 (d4) and xmm4..xmm7 (d[i]).
// - sprite: reads vertices[0].p from [rcx + 16] and stores the constants p.f / p.z.
// All stores are addressed relative to r8 (set to &m_local by Generate()).
void GSSetupPrimCodeGenerator::Depth()
{
if(!m_en.z && !m_en.f)
{
return;
}
if(!m_sel.sprite)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[rdx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, ptr[rcx + 16]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
// the constant's address is materialized in r9; the scalar is then splatted with shufps
mov(r9, (size_t)&GSVector4::m_half);
movss(xmm1, ptr[r9]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
// xmm2 = all ones >> 31 = 0x00000001 in every lane
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
}
}
}
// Emits the code that fills the texture-coordinate deltas of m_local from dscan.t
// ([rdx + 32]) (SSE, x64). Nothing is emitted when m_en.t is not set.
// With m_sel.fst the products are truncated to integers (d4.st, d[i].si/ti, two components);
// otherwise they are stored as floats (d4.stq, d[i].s/t/q, three components).
// Uses xmm3 as the "* 4.0f" constant and xmm4..xmm7 as the per-pixel shifts; stores are
// addressed relative to r8.
void GSSetupPrimCodeGenerator::Texture()
{
if(!m_en.t)
{
return;
}
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[rdx + 32]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
if(m_sel.fst)
{
// m_local.d4.st = GSVector4i(t * 4.0f);
cvttps2dq(xmm1, xmm1);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
// j selects the component to broadcast: 0 = s, 1 = t, 2 = q (q only when !fst)
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
if(m_sel.fst)
{
// m_local.d[i].si/ti = GSVector4i(v);
cvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break;
case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch(j)
{
case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break;
case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break;
case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break;
}
}
}
}
}
// Emits the code that fills the color fields of m_local (SSE, x64). Nothing is emitted when
// m_en.c is not set.
// - m_sel.iip set: reads dscan.c from [rdx] and stores the deltas d4.c and d[0..3].rb / .ga.
// - otherwise: reads vertices[0].c from [rcx] and stores the constants c.rb / c.ga.
// Stores are addressed relative to r8. NOTE: the iip path reuses xmm3 (the "* 4.0f" constant)
// as a temporary once d4.c has been stored, so nothing emitted afterwards may rely on xmm3.
void GSSetupPrimCodeGenerator::Color()
{
if(!m_en.c)
{
return;
}
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
cvttps2dq(xmm0, ptr[rcx]);
// c = c.upl16(c.zwxy());
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
psrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
}
}
#endif

View File

@ -0,0 +1,333 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate()
{
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
{
for(int i = 0; i < 5; i++)
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
}
Depth();
Texture();
Color();
ret();
}
void GSSetupPrimCodeGenerator::Depth()
{
	// Emits the depth (z) and fog (f) setup. Nothing is generated when both
	// channels are disabled.

	if(!(m_en.z || m_en.f))
	{
		return;
	}

	if(m_sel.sprite)
	{
		// Sprites: z and f are constants taken from the vertex read
		// through ecx.

		// GSVector4 p = vertices[0].p;

		vmovaps(xmm0, ptr[ecx + 16]);

		if(m_en.f)
		{
			// m_local.p.f = GSVector4i(p).zzzzh().zzzz();

			vcvttps2dq(xmm1, xmm0);
			vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			vmovdqa(ptr[&m_local.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(!m_sel.zoverflow)
			{
				// m_local.p.z = GSVector4i(z);

				vcvttps2dq(xmm0, xmm0);
			}
			else
			{
				// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
				vmulps(xmm1, xmm0);
				vcvttps2dq(xmm1, xmm1);
				vpslld(xmm1, 1);

				vcvttps2dq(xmm0, xmm0);

				// xmm2 = 0x00000001 in every dword (all ones >> 31)

				vpcmpeqd(xmm2, xmm2);
				vpsrld(xmm2, 31);
				vpand(xmm0, xmm2);

				vpor(xmm0, xmm1);
			}

			vmovdqa(ptr[&m_local.p.z], xmm0);
		}

		return;
	}

	// Non-sprite primitives: per-pixel steps derived from the delta read
	// through edx.

	// GSVector4 p = dscan.p;

	vmovaps(xmm0, ptr[edx + 16]);

	if(m_en.f)
	{
		// GSVector4 df = p.wwww();

		vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));

		// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();

		vmulps(xmm2, xmm1, xmm3);
		vcvttps2dq(xmm2, xmm2);
		vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
		vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
		vmovdqa(ptr[&m_local.d4.f], xmm2);

		for(int step = 0; step < 4; step++)
		{
			// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

			vmulps(xmm2, xmm1, Xmm(4 + step));
			vcvttps2dq(xmm2, xmm2);
			vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			vmovdqa(ptr[&m_local.d[step].f], xmm2);
		}
	}

	if(m_en.z)
	{
		// GSVector4 dz = p.zzzz();

		vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

		// m_local.d4.z = dz * 4.0f;

		vmulps(xmm1, xmm0, xmm3);
		vmovdqa(ptr[&m_local.d4.z], xmm1);

		for(int step = 0; step < 4; step++)
		{
			// m_local.d[i].z = dz * m_shift[i];

			vmulps(xmm1, xmm0, Xmm(4 + step));
			vmovdqa(ptr[&m_local.d[step].z], xmm1);
		}
	}
}
void GSSetupPrimCodeGenerator::Texture()
{
	// Emits the texture coordinate setup. Nothing is generated when
	// texturing is disabled.

	if(!m_en.t)
	{
		return;
	}

	// GSVector4 t = dscan.t;

	vmovaps(xmm0, ptr[edx + 32]);

	vmulps(xmm1, xmm0, xmm3);

	if(m_sel.fst)
	{
		// m_local.d4.st = GSVector4i(t * 4.0f);

		vcvttps2dq(xmm1, xmm1);
		vmovdqa(ptr[&m_local.d4.st], xmm1);
	}
	else
	{
		// m_local.d4.stq = t * 4.0f;

		vmovaps(ptr[&m_local.d4.stq], xmm1);
	}

	// Two components (s, t) are processed for integer coordinates,
	// three (s, t, q) otherwise.

	const int components = m_sel.fst ? 2 : 3;

	for(int comp = 0; comp < components; comp++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));

		for(int step = 0; step < 4; step++)
		{
			// GSVector4 v = ds/dt * m_shift[i];

			vmulps(xmm2, xmm1, Xmm(4 + step));

			if(m_sel.fst)
			{
				// m_local.d[i].si/ti = GSVector4i(v);

				vcvttps2dq(xmm2, xmm2);

				if(comp == 0)
				{
					vmovdqa(ptr[&m_local.d[step].si], xmm2);
				}
				else if(comp == 1)
				{
					vmovdqa(ptr[&m_local.d[step].ti], xmm2);
				}
			}
			else
			{
				// m_local.d[i].s/t/q = v;

				if(comp == 0)
				{
					vmovaps(ptr[&m_local.d[step].s], xmm2);
				}
				else if(comp == 1)
				{
					vmovaps(ptr[&m_local.d[step].t], xmm2);
				}
				else if(comp == 2)
				{
					vmovaps(ptr[&m_local.d[step].q], xmm2);
				}
			}
		}
	}
}
void GSSetupPrimCodeGenerator::Color()
{
	// Emits the colour part of the setup routine. Nothing is generated when
	// the colour channel is disabled in m_en.

	if(!m_en.c)
	{
		return;
	}

	if(!m_sel.iip)
	{
		// Flat shading: a single colour is read through ecx.

		// GSVector4i c = GSVector4i(vertices[0].c);

		vcvttps2dq(xmm0, ptr[ecx]);

		// c = c.upl16(c.zwxy());

		vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
		vpunpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_sel.tfx == TFX_NONE)
		{
			vpsrlw(xmm0, 7);
		}

		// m_local.c.rb = c.xxxx();
		// m_local.c.ga = c.zzzz();

		vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

		vmovdqa(ptr[&m_local.c.rb], xmm1);
		vmovdqa(ptr[&m_local.c.ga], xmm2);

		return;
	}

	// Gouraud shading: the colour delta is read through edx.

	// GSVector4 c = dscan.c;

	vmovaps(xmm0, ptr[edx]);

	// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

	vmulps(xmm1, xmm0, xmm3);
	vcvttps2dq(xmm1, xmm1);
	vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
	vpackssdw(xmm1, xmm1);
	vmovdqa(ptr[&m_local.d4.c], xmm1);

	// xmm3 is free from here on and is reused for the second channel

	// GSVector4 dr = c.xxxx();
	// GSVector4 db = c.zzzz();

	vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
	vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

	for(int lane = 0; lane < 4; lane++)
	{
		// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

		vmulps(xmm0, xmm2, Xmm(4 + lane));
		vcvttps2dq(xmm0, xmm0);
		vpackssdw(xmm0, xmm0);

		// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

		vmulps(xmm1, xmm3, Xmm(4 + lane));
		vcvttps2dq(xmm1, xmm1);
		vpackssdw(xmm1, xmm1);

		// m_local.d[i].rb = r.upl16(b);

		vpunpcklwd(xmm0, xmm1);
		vmovdqa(ptr[&m_local.d[lane].rb], xmm0);
	}

	// GSVector4 c = dscan.c;
	// (xmm0 was used as a temporary in the loop above, so reload)

	vmovaps(xmm0, ptr[edx]);

	// GSVector4 dg = c.yyyy();
	// GSVector4 da = c.wwww();

	vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
	vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));

	for(int lane = 0; lane < 4; lane++)
	{
		// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

		vmulps(xmm0, xmm2, Xmm(4 + lane));
		vcvttps2dq(xmm0, xmm0);
		vpackssdw(xmm0, xmm0);

		// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

		vmulps(xmm1, xmm3, Xmm(4 + lane));
		vcvttps2dq(xmm1, xmm1);
		vpackssdw(xmm1, xmm1);

		// m_local.d[i].ga = g.upl16(a);

		vpunpcklwd(xmm0, xmm1);
		vmovdqa(ptr[&m_local.d[lane].ga], xmm0);
	}
}
#endif

View File

@ -0,0 +1,349 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSSetupPrimCodeGenerator.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate()
{
if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip)
{
for(int i = 0; i < 5; i++)
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
}
Depth();
Texture();
Color();
ret();
}
void GSSetupPrimCodeGenerator::Depth()
{
if(!m_en.z && !m_en.f)
{
return;
}
if(!m_sel.sprite)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[edx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, ptr[ecx + 16]);
if(m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_local.p.f], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
movaps(xmm1, ptr[&GSVector4::m_half]);
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[&m_local.p.z], xmm0);
}
}
}
void GSSetupPrimCodeGenerator::Texture()
{
	// Emits the texture coordinate setup. Nothing is generated when
	// texturing is disabled.

	if(!m_en.t)
	{
		return;
	}

	// GSVector4 t = dscan.t;

	movaps(xmm0, ptr[edx + 32]);

	movaps(xmm1, xmm0);
	mulps(xmm1, xmm3);

	if(m_sel.fst)
	{
		// m_local.d4.st = GSVector4i(t * 4.0f);

		cvttps2dq(xmm1, xmm1);
		movdqa(ptr[&m_local.d4.st], xmm1);
	}
	else
	{
		// m_local.d4.stq = t * 4.0f;

		movaps(ptr[&m_local.d4.stq], xmm1);
	}

	// Two components (s, t) are processed for integer coordinates,
	// three (s, t, q) otherwise.

	const int components = m_sel.fst ? 2 : 3;

	for(int comp = 0; comp < components; comp++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		movaps(xmm1, xmm0);
		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));

		for(int step = 0; step < 4; step++)
		{
			// GSVector4 v = ds/dt * m_shift[i];

			movaps(xmm2, xmm1);
			mulps(xmm2, Xmm(4 + step));

			if(m_sel.fst)
			{
				// m_local.d[i].si/ti = GSVector4i(v);

				cvttps2dq(xmm2, xmm2);

				if(comp == 0)
				{
					movdqa(ptr[&m_local.d[step].si], xmm2);
				}
				else if(comp == 1)
				{
					movdqa(ptr[&m_local.d[step].ti], xmm2);
				}
			}
			else
			{
				// m_local.d[i].s/t/q = v;

				if(comp == 0)
				{
					movaps(ptr[&m_local.d[step].s], xmm2);
				}
				else if(comp == 1)
				{
					movaps(ptr[&m_local.d[step].t], xmm2);
				}
				else if(comp == 2)
				{
					movaps(ptr[&m_local.d[step].q], xmm2);
				}
			}
		}
	}
}
void GSSetupPrimCodeGenerator::Color()
{
	// Emits the colour part of the setup routine. Nothing is generated when
	// the colour channel is disabled in m_en.

	if(!m_en.c)
	{
		return;
	}

	if(!m_sel.iip)
	{
		// Flat shading: a single colour is read through ecx.

		// GSVector4i c = GSVector4i(vertices[0].c);

		movaps(xmm0, ptr[ecx]);
		cvttps2dq(xmm0, xmm0);

		// c = c.upl16(c.zwxy());

		pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
		punpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_sel.tfx == TFX_NONE)
		{
			psrlw(xmm0, 7);
		}

		// m_local.c.rb = c.xxxx();
		// m_local.c.ga = c.zzzz();

		pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

		movdqa(ptr[&m_local.c.rb], xmm1);
		movdqa(ptr[&m_local.c.ga], xmm2);

		return;
	}

	// Gouraud shading: the colour delta is read through edx.

	// GSVector4 c = dscan.c;

	movaps(xmm0, ptr[edx]);
	movaps(xmm1, xmm0);

	// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

	movaps(xmm2, xmm0);
	mulps(xmm2, xmm3);
	cvttps2dq(xmm2, xmm2);
	pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
	packssdw(xmm2, xmm2);
	movdqa(ptr[&m_local.d4.c], xmm2);

	// xmm3 is free from here on and is reused as a temporary

	// GSVector4 dr = c.xxxx();
	// GSVector4 db = c.zzzz();

	shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

	for(int lane = 0; lane < 4; lane++)
	{
		// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, Xmm(4 + lane));
		cvttps2dq(xmm2, xmm2);
		packssdw(xmm2, xmm2);

		// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

		movaps(xmm3, xmm1);
		mulps(xmm3, Xmm(4 + lane));
		cvttps2dq(xmm3, xmm3);
		packssdw(xmm3, xmm3);

		// m_local.d[i].rb = r.upl16(b);

		punpcklwd(xmm2, xmm3);
		movdqa(ptr[&m_local.d[lane].rb], xmm2);
	}

	// GSVector4 c = dscan.c;
	// (both copies were overwritten by the broadcasts above, so reload)

	movaps(xmm0, ptr[edx]);
	movaps(xmm1, xmm0);

	// GSVector4 dg = c.yyyy();
	// GSVector4 da = c.wwww();

	shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
	shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

	for(int lane = 0; lane < 4; lane++)
	{
		// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, Xmm(4 + lane));
		cvttps2dq(xmm2, xmm2);
		packssdw(xmm2, xmm2);

		// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

		movaps(xmm3, xmm1);
		mulps(xmm3, Xmm(4 + lane));
		cvttps2dq(xmm3, xmm3);
		packssdw(xmm3, xmm3);

		// m_local.d[i].ga = g.upl16(a);

		punpcklwd(xmm2, xmm3);
		movdqa(ptr[&m_local.d[lane].ga], xmm2);
	}
}
#endif

View File

@ -24,6 +24,7 @@
const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f); const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f); const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_half(0.5f, 0.5f, 0.5f, 0.5f);
const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000))); const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));

View File

@ -2271,6 +2271,7 @@ public:
static const GSVector4 m_ps0123; static const GSVector4 m_ps0123;
static const GSVector4 m_ps4567; static const GSVector4 m_ps4567;
static const GSVector4 m_half;
static const GSVector4 m_x3f800000; static const GSVector4 m_x3f800000;
static const GSVector4 m_x4b000000; static const GSVector4 m_x4b000000;

View File

@ -24,8 +24,7 @@
#include "GSUtil.h" #include "GSUtil.h"
#include "GSState.h" #include "GSState.h"
static const float s_fmin = -FLT_MAX; const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX);
static const float s_fmax = FLT_MAX;
GSVertexTrace::GSVertexTrace(const GSState* state) GSVertexTrace::GSVertexTrace(const GSState* state)
: m_state(state) : m_state(state)
@ -51,7 +50,7 @@ uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass)
void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass) void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primclass)
{ {
m_map_sw[Hash(primclass)](v, count, m_min, m_max); m_map_sw[Hash(primclass)](count, v, m_min, m_max);
m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20); m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
@ -60,7 +59,7 @@ void GSVertexTrace::Update(const GSVertexSW* v, int count, GS_PRIM_CLASS primcla
void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass) void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primclass)
{ {
m_map_hw9[Hash(primclass)](v, count, m_min, m_max); m_map_hw9[Hash(primclass)](count, v, m_min, m_max);
const GSDrawingContext* context = m_state->m_context; const GSDrawingContext* context = m_state->m_context;
@ -92,7 +91,7 @@ void GSVertexTrace::Update(const GSVertexHW9* v, int count, GS_PRIM_CLASS primcl
void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass) void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primclass)
{ {
m_map_hw11[Hash(primclass)](v, count, m_min, m_max); m_map_hw11[Hash(primclass)](count, v, m_min, m_max);
const GSDrawingContext* context = m_state->m_context; const GSDrawingContext* context = m_state->m_context;
@ -121,939 +120,3 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
m_alpha.valid = false; m_alpha.valid = false;
} }
// Bring the Xbyak assembler names into scope for the code generators below.
using namespace Xbyak;
// Byte offsets, relative to esp on entry, from which the generated routines
// load their arguments (used as dword[esp + _v], dword[esp + _count], ...).
// NOTE(review): offset 0 is presumably the return address of a stack-based
// 32-bit calling convention - confirm.
static const int _args = 0;
static const int _v = _args + 4; // pointer to the vertex array
static const int _count = _args + 8; // loop counter, decremented by n per iteration
static const int _min = _args + 12; // pointer to the output minimum
static const int _max = _args + 16; // pointer to the output maximum
// Generates a routine that scans an array of GSVertexSW and writes the
// per-component minimum and maximum of colour (offset 0), position
// (offset 16) and texture coordinate (offset 32) to the min/max outputs.
//
// key selects what is emitted:
//   bits 0-1  primclass  (determines n, the vertices consumed per iteration)
//   bit  2    iip        (when clear, only the last vertex's colour is used)
//   bit  3    tme        (track texture coordinates)
//   bit  4    fst        (when clear, t.xy is divided by t.z first)
//   bit  5    color      (track colour)
//
// param is not used by this generator. An AVX encoding is emitted when the
// CPU reports AVX, otherwise the equivalent SSE encoding. The emitted loop
// tests its counter after the body, so the body always runs at least once.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
#if _M_AMD64
#error TODO
#endif
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices processed per loop iteration
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
//
// Register use in both paths: xmm2/xmm3 = colour min/max,
// xmm4/xmm5 = position min/max, xmm6/xmm7 = texture min/max,
// xmm0 = current value, xmm1 = divisor for the texture coordinates.
if(m_cpu.has(util::Cpu::tAVX))
{
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_fmax]);
vbroadcastss(xmm5, ptr[&s_fmin]);
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
vmovaps(xmm2, xmm4);
vmovaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: the divisor is the z lane of the second vertex's t,
// broadcast once and shared by both vertices
vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
for(int j = 0; j < n; j++)
{
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
vminps(xmm2, xmm0);
vmaxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex is divided by its own z lane
vmovaps(xmm1, xmm0);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// low two lanes keep the quotient, high two lanes take the divisor
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexSW));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// colour results are truncated to integers and shifted right by 7
vcvttps2dq(xmm2, xmm2);
vpsrld(xmm2, 7);
vmovaps(ptr[eax], xmm2);
vcvttps2dq(xmm3, xmm3);
vpsrld(xmm3, 7);
vmovaps(ptr[edx], xmm3);
}
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
if(tme)
{
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
}
}
else
{
// SSE encoding of the same routine
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
movss(xmm4, ptr[&s_fmax]);
movss(xmm5, ptr[&s_fmin]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
movaps(xmm2, xmm4);
movaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: the divisor is the z lane of the second vertex's t,
// broadcast once and shared by both vertices
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
for(int j = 0; j < n; j++)
{
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex is divided by its own z lane
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// low two lanes keep the quotient, high two lanes take the divisor
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexSW));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// colour results are truncated to integers and shifted right by 7
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(ptr[eax], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(ptr[edx], xmm3);
}
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
}
ret();
}
// Generates a routine that scans an array of GSVertexHW9 and writes the
// minimum and maximum of colour, position and texture coordinate to the
// min/max outputs.
//
// Each vertex is read as two 16-byte loads: offset 16 is the position;
// offset 0 carries the texture xy in its low two lanes and the packed 8-bit
// colour in its third dword (see the zzzz().u8to32() step at the end).
//
// key selects what is emitted:
//   bits 0-1  primclass  (determines n, the vertices consumed per iteration)
//   bit  2    iip        (when clear, only the last vertex's colour is used)
//   bit  3    tme        (track texture coordinates)
//   bit  4    fst        (when clear, t is divided by the position's w lane)
//   bit  5    color      (track colour)
//
// param is not used by this generator. An AVX encoding is emitted when the
// CPU reports AVX, otherwise the equivalent SSE encoding. The emitted loop
// tests its counter after the body, so the body always runs at least once.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
#if _M_AMD64
#error TODO
#endif
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices processed per loop iteration
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
case GS_SPRITE_CLASS:
n = 6;
break;
}
//
// Register use in both paths: xmm2/xmm3 = colour min/max (unsigned bytes),
// xmm4/xmm5 = position min/max, xmm6/xmm7 = texture min/max,
// xmm0 = current value, xmm1 = divisor for the texture coordinates.
if(m_cpu.has(util::Cpu::tAVX))
{
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_fmax]);
vbroadcastss(xmm5, ptr[&s_fmin]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: the divisor is the w lane of the sixth vertex's position,
// broadcast once and shared by all six vertices
vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
for(int j = 0; j < n; j++)
{
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex is divided by its own p.w
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
}
if(color && (iip || j == n - 1) || tme)
{
// one load serves both the colour and the texture update below
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
}
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
vdivps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexHW9));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
}
else
{
// without SSE4.1: widen bytes 8..11 to dwords by unpacking against zero
vpxor(xmm0, xmm0);
vpunpckhbw(xmm2, xmm0);
vpunpcklwd(xmm2, xmm0);
vpunpckhbw(xmm3, xmm0);
vpunpcklwd(xmm3, xmm0);
}
vmovaps(ptr[eax], xmm2);
vmovaps(ptr[edx], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
}
}
else
{
// SSE encoding of the same routine
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
movss(xmm4, ptr[&s_fmax]);
movss(xmm5, ptr[&s_fmin]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: the divisor is the w lane of the sixth vertex's position,
// broadcast once and shared by all six vertices
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
for(int j = 0; j < n; j++)
{
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex is divided by its own p.w
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
if(color && (iip || j == n - 1) || tme)
{
// one load serves both the colour and the texture update below
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
}
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
divps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexHW9));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// without SSE4.1: widen bytes 8..11 to dwords by unpacking against zero
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
}
ret();
}
// Generates a routine that scans an array of GSVertexHW11 and writes the
// minimum and maximum of colour, position and texture coordinate to the
// min/max outputs.
//
// Each vertex is read as two 16-byte loads: offset 0 carries the texture
// coordinates in lanes 0, 1 and 3 and the packed 8-bit colour in its third
// dword; offset 16 holds integer position data that is widened and converted
// to float before the min/max update.
//
// key selects what is emitted:
//   bits 0-1  primclass  (determines n, the vertices consumed per iteration)
//   bit  2    iip        (when clear, only the last vertex's colour is used)
//   bit  3    tme        (track texture coordinates)
//   bit  4    fst        (when clear, t.xy is divided by the w lane)
//   bit  5    color      (track colour)
//
// param is not used by this generator. An AVX encoding is emitted when the
// CPU reports AVX, otherwise the equivalent SSE encoding. The emitted loop
// tests its counter after the body, so the body always runs at least once.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
#if _M_AMD64
#error TODO
#endif
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices processed per loop iteration
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
//
// Register use in both paths: xmm2/xmm3 = colour min/max (unsigned bytes),
// xmm4/xmm5 = position min/max, xmm6/xmm7 = texture min/max,
// xmm0/xmm1 = temporaries.
if(m_cpu.has(util::Cpu::tAVX))
{
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_fmax]);
vbroadcastss(xmm5, ptr[&s_fmin]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
for(int j = 0; j < n; j++)
{
if(color && (iip || j == n - 1) || tme)
{
// one load serves both the colour and the texture update below
vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
}
if(color && (iip || j == n - 1))
{
// unsigned byte-wise min/max over the whole register; the colour
// dword is extracted after the loop
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// keep a copy so the w lane can be broadcast as the divisor
vmovaps(xmm1, xmm0);
}
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
// integer position data
vmovdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
// xmm1 = low four 16-bit words zero-extended to dwords
if(m_cpu.has(util::Cpu::tSSE41))
{
vpmovzxwd(xmm1, xmm0);
}
else
{
vpunpcklwd(xmm1, xmm0, xmm0);
vpsrld(xmm1, 16);
}
// xmm1 = { word0, word1, dword0 >> 1, dword1 >> 1 }, then converted to float
// NOTE(review): the >> 1 presumably keeps an unsigned 32-bit value inside
// the signed range expected by the int-to-float conversion - confirm.
vpsrld(xmm0, 1);
vpunpcklqdq(xmm1, xmm0);
vcvtdq2ps(xmm1, xmm1);
vminps(xmm4, xmm1);
vmaxps(xmm5, xmm1);
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexHW11));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
}
else
{
// without SSE4.1: widen bytes 8..11 to dwords by unpacking against zero
vpxor(xmm0, xmm0);
vpunpckhbw(xmm2, xmm0);
vpunpcklwd(xmm2, xmm0);
vpunpckhbw(xmm3, xmm0);
vpunpcklwd(xmm3, xmm0);
}
vmovaps(ptr[eax], xmm2);
vmovaps(ptr[edx], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + 16], xmm4);
vmovaps(ptr[edx + 16], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
vmovaps(ptr[eax + 32], xmm6);
vmovaps(ptr[edx + 32], xmm7);
}
}
else
{
// SSE encoding of the same routine
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
movss(xmm4, ptr[&s_fmax]);
movss(xmm5, ptr[&s_fmin]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
// edx = vertex pointer, ecx = remaining count (both read from the stack)
mov(edx, dword[esp + _v]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
for(int j = 0; j < n; j++)
{
if(color && (iip || j == n - 1) || tme)
{
// one load serves both the colour and the texture update below
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
}
if(color && (iip || j == n - 1))
{
// unsigned byte-wise min/max over the whole register; the colour
// dword is extracted after the loop
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// keep a copy so the w lane can be broadcast as the divisor
movaps(xmm1, xmm0);
}
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
// integer position data
movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
// xmm1 = low four 16-bit words zero-extended to dwords
if(m_cpu.has(util::Cpu::tSSE41))
{
pmovzxwd(xmm1, xmm0);
}
else
{
movdqa(xmm1, xmm0);
punpcklwd(xmm1, xmm1);
psrld(xmm1, 16);
}
// xmm1 = { word0, word1, dword0 >> 1, dword1 >> 1 }, then converted to float
// NOTE(review): the >> 1 presumably keeps an unsigned 32-bit value inside
// the signed range expected by the int-to-float conversion - confirm.
psrld(xmm0, 1);
punpcklqdq(xmm1, xmm0);
cvtdq2ps(xmm1, xmm1);
minps(xmm4, xmm1);
maxps(xmm5, xmm1);
}
// advance to the next primitive; loop while the counter stays positive
add(edx, n * sizeof(GSVertexHW11));
sub(ecx, n);
jg("loop");
// }
// eax = min output, edx = max output
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// without SSE4.1: widen bytes 8..11 to dwords by unpacking against zero
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
}
ret();
}

View File

@ -34,7 +34,7 @@ __aligned(class, 32) GSVertexTrace
struct Vertex {GSVector4i c; GSVector4 p, t;}; struct Vertex {GSVector4i c; GSVector4 p, t;};
struct VertexAlpha {int min, max; bool valid;}; struct VertexAlpha {int min, max; bool valid;};
typedef void (*VertexTracePtr)(const void* v, int count, Vertex& min, Vertex& max); typedef void (*VertexTracePtr)(int count, const void* v, Vertex& min, Vertex& max);
class CGSW : public GSCodeGenerator class CGSW : public GSCodeGenerator
{ {
@ -62,6 +62,8 @@ __aligned(class, 32) GSVertexTrace
const GSState* m_state; const GSState* m_state;
static const GSVector4 s_minmax;
public: public:
GS_PRIM_CLASS m_primclass; GS_PRIM_CLASS m_primclass;
Vertex m_min, m_max; // t.xy * 0x10000 Vertex m_min, m_max; // t.xy * 0x10000

View File

@ -0,0 +1,496 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Generates the x64 (Windows ABI) AVX vertex-trace routine for GSVertexSW input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
	case GS_SPRITE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned vmovdqa stores below.

	enter(32, 0);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = FLT_MAX;
		// max.c = -FLT_MAX;

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && primclass == GS_SPRITE_CLASS)
	{
		// sprites divide both vertices by the third component of the second vertex's t

		vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int j = 0; j < n; j++)
	{
		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.minv(v[i + j].c);
			// max.c = max.c.maxv(v[i + j].c);

			vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(tme)
		{
			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);

			if(!fst)
			{
				if(primclass != GS_SPRITE_CLASS)
				{
					vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
				}

				// low two lanes become t / divisor, high two lanes are taken from the divisor

				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
			}

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	add(rdx, n * sizeof(GSVertexSW));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// the color range is stored as truncated integers shifted right by 7

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[r8], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[r9], xmm3);
	}

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
// Generates the x64 (Windows ABI) AVX vertex-trace routine for GSVertexHW9 input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	case GS_SPRITE_CLASS:
		n = 6;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned vmovdqa stores below.

	enter(32, 0);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && primclass == GS_SPRITE_CLASS)
	{
		// sprites divide all six vertices by p.w of the last one

		vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int j = 0; j < n; j++)
	{
		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(tme && !fst && primclass != GS_SPRITE_CLASS)
		{
			// xmm1 = p.wwww(), must be taken before xmm0 is reloaded below

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		// the first 16 bytes of the vertex feed both the color and the texture coordinate trace

		if(color && (iip || j == n - 1) || tme)
		{
			vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
		}

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.min_u8(v[i + j].c);
			// max.c = max.c.max_u8(v[i + j].c);

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral

			if(!fst)
			{
				// t /= p.wwww();

				vdivps(xmm0, xmm1);
			}

			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}
	}

	add(rdx, n * sizeof(GSVertexHW9));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin;
	// m_max.p = pmax;

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin.xyww(pmin);
		// m_max.t = tmax.xyww(pmax);

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
// Generates the x64 (Windows ABI) AVX vertex-trace routine for GSVertexHW11 input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
	case GS_SPRITE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned vmovdqa stores below.

	enter(32, 0);

	vmovdqa(ptr[rsp + 0], xmm6);
	vmovdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	vbroadcastss(xmm4, ptr[rax + 0]);
	vbroadcastss(xmm5, ptr[rax + 4]);

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	for(int j = 0; j < n; j++)
	{
		// the first 16 bytes of the vertex feed both the color and the texture coordinate trace

		if(color && (iip || j == n - 1) || tme)
		{
			vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
		}

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.min_u8(v[i + j].c);
			// max.c = max.c.max_u8(v[i + j].c);

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			if(!fst)
			{
				// keep an unshuffled copy, q is extracted from it below

				vmovaps(xmm1, xmm0);
			}

			vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral

			if(!fst)
			{
				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
			}

			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// position: low qword of xmm1 = the first four words zero extended to dwords,
		// high qword of xmm1 = the first two dwords shifted right by one; then to float

		vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);
		vpmovzxwd(xmm1, xmm0);
		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		// min.p = min.p.minv(p);
		// max.p = max.p.maxv(p);

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	add(rdx, n * sizeof(GSVertexHW11));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[r8], xmm2);
		vmovaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin.xyww();
	// m_max.p = pmax.xyww();

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[r8 + 16], xmm4);
	vmovaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin;
		// m_max.t = tmax;

		vmovaps(ptr[r8 + 32], xmm6);
		vmovaps(ptr[r9 + 32], xmm7);
	}

	vmovdqa(xmm6, ptr[rsp + 0]);
	vmovdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
#endif

View File

@ -0,0 +1,543 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Generates the x64 (Windows ABI) SSE vertex-trace routine for GSVertexSW input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
	case GS_SPRITE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned movdqa stores below.

	enter(32, 0);

	movdqa(ptr[rsp + 0], xmm6);
	movdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	movss(xmm4, ptr[rax + 0]);
	movss(xmm5, ptr[rax + 4]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(color)
	{
		// min.c = FLT_MAX;
		// max.c = -FLT_MAX;

		movaps(xmm2, xmm4);
		movaps(xmm3, xmm5);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && primclass == GS_SPRITE_CLASS)
	{
		// sprites divide both vertices by the third component of the second vertex's t

		movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]);
		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int j = 0; j < n; j++)
	{
		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.minv(v[i + j].c);
			// max.c = max.c.maxv(v[i + j].c);

			movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]);

			minps(xmm2, xmm0);
			maxps(xmm3, xmm0);
		}

		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]);

		minps(xmm4, xmm0);
		maxps(xmm5, xmm0);

		if(tme)
		{
			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]);

			if(!fst)
			{
				if(primclass != GS_SPRITE_CLASS)
				{
					movaps(xmm1, xmm0);
					shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
				}

				// low two lanes become t / divisor, high two lanes are taken from the divisor

				divps(xmm0, xmm1);
				shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
			}

			minps(xmm6, xmm0);
			maxps(xmm7, xmm0);
		}
	}

	add(rdx, n * sizeof(GSVertexSW));

	// count is a 32 bit int, only ecx is defined by the calling convention;
	// testing the full rcx could pick up garbage from the upper half

	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// the color range is stored as truncated integers shifted right by 7

		cvttps2dq(xmm2, xmm2);
		psrld(xmm2, 7);
		movaps(ptr[r8], xmm2);

		cvttps2dq(xmm3, xmm3);
		psrld(xmm3, 7);
		movaps(ptr[r9], xmm3);
	}

	movaps(ptr[r8 + 16], xmm4);
	movaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		movaps(ptr[r8 + 32], xmm6);
		movaps(ptr[r9 + 32], xmm7);
	}

	movdqa(xmm6, ptr[rsp + 0]);
	movdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
// Generates the x64 (Windows ABI) SSE vertex-trace routine for GSVertexHW9 input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	case GS_SPRITE_CLASS:
		n = 6;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned movdqa stores below.

	enter(32, 0);

	movdqa(ptr[rsp + 0], xmm6);
	movdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	// s_minmax is a single GSVector4: x at +0, y at +4 (+16 would be past its end)

	movss(xmm4, ptr[rax + 0]);
	movss(xmm5, ptr[rax + 4]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		pcmpeqd(xmm2, xmm2);
		pxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	if(tme && !fst && primclass == GS_SPRITE_CLASS)
	{
		// sprites divide all six vertices by p.w of the last one

		movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]);
		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int j = 0; j < n; j++)
	{
		// min.p = min.p.minv(v[i + j].p);
		// max.p = max.p.maxv(v[i + j].p);

		movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]);

		minps(xmm4, xmm0);
		maxps(xmm5, xmm0);

		if(tme && !fst && primclass != GS_SPRITE_CLASS)
		{
			// xmm1 = p.wwww(), must be taken before xmm0 is reloaded below

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
		}

		// the first 16 bytes of the vertex feed both the color and the texture coordinate trace

		if(color && (iip || j == n - 1) || tme)
		{
			movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]);
		}

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.min_u8(v[i + j].c);
			// max.c = max.c.max_u8(v[i + j].c);

			pminub(xmm2, xmm0);
			pmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral

			if(!fst)
			{
				// t /= p.wwww();

				divps(xmm0, xmm1);
			}

			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			minps(xmm6, xmm0);
			maxps(xmm7, xmm0);
		}
	}

	add(rdx, n * sizeof(GSVertexHW9));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm2, xmm2);

			pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm3, xmm3);
		}
		else
		{
			// SSE2 fallback: unpack bytes 8-11 against zero, first to words then to dwords

			pxor(xmm0, xmm0);

			punpckhbw(xmm2, xmm0);
			punpcklwd(xmm2, xmm0);

			punpckhbw(xmm3, xmm0);
			punpcklwd(xmm3, xmm0);
		}

		movaps(ptr[r8], xmm2);
		movaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin;
	// m_max.p = pmax;

	movaps(ptr[r8 + 16], xmm4);
	movaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin.xyww(pmin);
		// m_max.t = tmax.xyww(pmax);

		shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		movaps(ptr[r8 + 32], xmm6);
		movaps(ptr[r9 + 32], xmm7);
	}

	movdqa(xmm6, ptr[rsp + 0]);
	movdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
// Generates the x64 (Windows ABI) SSE vertex-trace routine for GSVertexHW11 input.
//
// The generated function has the VertexTracePtr signature
//   void f(int count, const void* v, Vertex& min, Vertex& max)
// and therefore finds count in ecx, v in rdx, min in r8 and max in r9.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	uint32 primclass = (key >> 0) & 3;
	uint32 iip = (key >> 2) & 1;
	uint32 tme = (key >> 3) & 1;
	uint32 fst = (key >> 4) & 1;
	uint32 color = (key >> 5) & 1;

	// n = number of vertices consumed by one pass of the generated loop

	int n = 1;

	switch(primclass)
	{
	case GS_POINT_CLASS:
		n = 1;
		break;
	case GS_LINE_CLASS:
	case GS_SPRITE_CLASS:
		n = 2;
		break;
	case GS_TRIANGLE_CLASS:
		n = 3;
		break;
	}

	// xmm6/xmm7 must be preserved for the caller, spill them into a 32 byte frame.
	// The nesting level has to be 0: rsp is 8 mod 16 at entry, "push rbp" aligns it
	// and the 32 byte allocation keeps it aligned. A non-zero level pushes one more
	// qword, which leaves rsp misaligned for the aligned movdqa stores below.

	enter(32, 0);

	movdqa(ptr[rsp + 0], xmm6);
	movdqa(ptr[rsp + 16], xmm7);

	// min.p = FLT_MAX;
	// max.p = -FLT_MAX;

	mov(rax, (size_t)&s_minmax);

	// s_minmax is a single GSVector4: x at +0, y at +4 (+16 would be past its end)

	movss(xmm4, ptr[rax + 0]);
	movss(xmm5, ptr[rax + 4]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(color)
	{
		// min.c = 0xffffffff;
		// max.c = 0;

		pcmpeqd(xmm2, xmm2);
		pxor(xmm3, xmm3);
	}

	if(tme)
	{
		// min.t = FLT_MAX;
		// max.t = -FLT_MAX;

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// for(int i = 0; i < count; i += step) {

	align(16);

	L("loop");

	for(int j = 0; j < n; j++)
	{
		// the first 16 bytes of the vertex feed both the color and the texture coordinate trace

		if(color && (iip || j == n - 1) || tme)
		{
			movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
		}

		// without iip only the last vertex of the primitive contributes its color

		if(color && (iip || j == n - 1))
		{
			// min.c = min.c.min_u8(v[i + j].c);
			// max.c = max.c.max_u8(v[i + j].c);

			pminub(xmm2, xmm0);
			pmaxub(xmm3, xmm0);
		}

		if(tme)
		{
			if(!fst)
			{
				// keep an unshuffled copy, q is extracted from it below

				movaps(xmm1, xmm0);
			}

			shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral

			if(!fst)
			{
				shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				divps(xmm0, xmm1);
				shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
			}

			// min.t = min.t.minv(v[i + j].t);
			// max.t = max.t.maxv(v[i + j].t);

			minps(xmm6, xmm0);
			maxps(xmm7, xmm0);
		}

		// position: low qword of xmm1 = the first four words zero extended to dwords,
		// high qword of xmm1 = the first two dwords shifted right by one; then to float

		movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pmovzxwd(xmm1, xmm0);
		}
		else
		{
			// SSE2 fallback: duplicate each low word into a dword, then drop the copy

			movdqa(xmm1, xmm0);
			punpcklwd(xmm1, xmm1);
			psrld(xmm1, 16);
		}

		psrld(xmm0, 1);
		punpcklqdq(xmm1, xmm0);
		cvtdq2ps(xmm1, xmm1);

		// min.p = min.p.minv(p);
		// max.p = max.p.maxv(p);

		minps(xmm4, xmm1);
		maxps(xmm5, xmm1);
	}

	add(rdx, n * sizeof(GSVertexHW11));
	sub(ecx, n);

	jg("loop");

	// }

	if(color)
	{
		// m_min.c = cmin.zzzz().u8to32();
		// m_max.c = cmax.zzzz().u8to32();

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm2, xmm2);

			pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm3, xmm3);
		}
		else
		{
			// SSE2 fallback: unpack bytes 8-11 against zero, first to words then to dwords

			pxor(xmm0, xmm0);

			punpckhbw(xmm2, xmm0);
			punpcklwd(xmm2, xmm0);

			punpckhbw(xmm3, xmm0);
			punpcklwd(xmm3, xmm0);
		}

		movaps(ptr[r8], xmm2);
		movaps(ptr[r9], xmm3);
	}

	// m_min.p = pmin.xyww();
	// m_max.p = pmax.xyww();

	shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	movaps(ptr[r8 + 16], xmm4);
	movaps(ptr[r9 + 16], xmm5);

	if(tme)
	{
		// m_min.t = tmin;
		// m_max.t = tmax;

		movaps(ptr[r8 + 32], xmm6);
		movaps(ptr[r9 + 32], xmm7);
	}

	movdqa(xmm6, ptr[rsp + 0]);
	movdqa(xmm7, ptr[rsp + 16]);

	leave();

	ret();
}
#endif

View File

@ -0,0 +1,484 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Byte offsets, relative to esp on entry to the generated routine, of its four
// stack arguments; the generators below read them with dword[esp + _xxx].
// The trailing register names are where the x64 generators find the same argument.
// NOTE(review): offset 0 is presumably the return address slot - confirm.
static const int _args = 0;
static const int _count = _args + 4; // rcx
static const int _v = _args + 8; // rdx
static const int _min = _args + 12; // r8
static const int _max = _args + 16; // r9
// 32-bit AVX generator of the vertex-trace routine for GSVertexSW input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// component-wise minimum and maximum of color (xmm2/xmm3), position (xmm4/xmm5)
// and texture coordinates (xmm6/xmm7).
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool isSprite = prim == GS_SPRITE_CLASS;
	const bool divideByQ = hasTex && !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS || isSprite)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}

	// position range starts at [s_minmax.x, s_minmax.y] in every lane

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(hasColor)
	{
		// color range starts from the same values

		vmovaps(xmm2, xmm4);
		vmovaps(xmm3, xmm5);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the same values

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	if(divideByQ && isSprite)
	{
		// a sprite uses the third t component of its second vertex as the divisor for both

		vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexSW);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		if(traceColor)
		{
			// color lives at offset 0

			vmovaps(xmm0, ptr[edx + vofs]);

			vminps(xmm2, xmm0);
			vmaxps(xmm3, xmm0);
		}

		// position lives at offset 16

		vmovaps(xmm0, ptr[edx + vofs + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(!hasTex)
		{
			continue;
		}

		// texture coordinates live at offset 32

		vmovaps(xmm0, ptr[edx + vofs + 32]);

		if(divideByQ)
		{
			if(!isSprite)
			{
				// per-vertex divisor: broadcast the third component

				vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
			}

			// keep the quotient in the low half, take the high half from the divisor

			vdivps(xmm0, xmm1);
			vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
		}

		vminps(xmm6, xmm0);
		vmaxps(xmm7, xmm0);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexSW));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// colors are written as truncated integers shifted right by 7

		vcvttps2dq(xmm2, xmm2);
		vpsrld(xmm2, 7);
		vmovaps(ptr[eax], xmm2);

		vcvttps2dq(xmm3, xmm3);
		vpsrld(xmm3, 7);
		vmovaps(ptr[edx], xmm3);
	}

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
// 32-bit AVX generator of the vertex-trace routine for GSVertexHW9 input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// byte-wise color range (xmm2/xmm3), the position range (xmm4/xmm5) and the
// texture coordinate range (xmm6/xmm7).
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool isSprite = prim == GS_SPRITE_CLASS;
	const bool divideByW = hasTex && !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}
	else if(isSprite)
	{
		vertsPerPrim = 6;
	}

	// position range starts at [s_minmax.x, s_minmax.y] in every lane

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(hasColor)
	{
		// color range starts at [all ones, all zeros]

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the position seeds

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	if(divideByW && isSprite)
	{
		// a sprite uses p.w of its sixth vertex as the divisor for all of them

		vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
		vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexHW9);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		// position lives at offset 16

		vmovaps(xmm0, ptr[edx + vofs + 16]);

		vminps(xmm4, xmm0);
		vmaxps(xmm5, xmm0);

		if(divideByW && !isSprite)
		{
			// per-vertex divisor p.wwww(), grabbed before xmm0 gets overwritten

			vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		if(traceColor || hasTex)
		{
			// offset 0 carries both the texture coordinates and the color

			vmovaps(xmm0, ptr[edx + vofs]);
		}

		if(traceColor)
		{
			// unsigned byte min / max

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(!hasTex)
		{
			continue;
		}

		vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral

		if(divideByW)
		{
			// t /= p.wwww();

			vdivps(xmm0, xmm1);
		}

		vminps(xmm6, xmm0);
		vmaxps(xmm7, xmm0);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexHW9));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// widen the four bytes of the third dword to four dwords

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// position is stored as is

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		// t.xy is completed with p.ww before it is stored

		vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
// 32-bit AVX generator of the vertex-trace routine for GSVertexHW11 input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// byte-wise color range (xmm2/xmm3), the position range (xmm4/xmm5) and the
// texture coordinate range (xmm6/xmm7).
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool divideByQ = !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}

	// position range starts at [s_minmax.x, s_minmax.y] in every lane

	vbroadcastss(xmm4, ptr[&s_minmax.x]);
	vbroadcastss(xmm5, ptr[&s_minmax.y]);

	if(hasColor)
	{
		// color range starts at [all ones, all zeros]

		vpcmpeqd(xmm2, xmm2);
		vpxor(xmm3, xmm3);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the position seeds

		vmovaps(xmm6, xmm4);
		vmovaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexHW11);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		if(traceColor || hasTex)
		{
			// offset 0 carries both the texture coordinates and the color

			vmovaps(xmm0, ptr[edx + vofs]);
		}

		if(traceColor)
		{
			// unsigned byte min / max

			vpminub(xmm2, xmm0);
			vpmaxub(xmm3, xmm0);
		}

		if(hasTex)
		{
			if(divideByQ)
			{
				// q is taken from this copy once xmm0 has been shuffled

				vmovaps(xmm1, xmm0);
			}

			vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral

			if(divideByQ)
			{
				vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				vdivps(xmm0, xmm1);
				vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
			}

			vminps(xmm6, xmm0);
			vmaxps(xmm7, xmm0);
		}

		// position at offset 16 is integer data: xmm1.lo = first four words widened
		// to dwords, xmm1.hi = first two dwords shifted right by one; then to float

		vmovdqa(xmm0, ptr[edx + vofs + 16]);
		vpmovzxwd(xmm1, xmm0);
		vpsrld(xmm0, 1);
		vpunpcklqdq(xmm1, xmm0);
		vcvtdq2ps(xmm1, xmm1);

		vminps(xmm4, xmm1);
		vmaxps(xmm5, xmm1);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexHW11));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// widen the four bytes of the third dword to four dwords

		vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm2, xmm2);

		vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
		vpmovzxbd(xmm3, xmm3);

		vmovaps(ptr[eax], xmm2);
		vmovaps(ptr[edx], xmm3);
	}

	// position is stored as xyww

	vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	vmovaps(ptr[eax + 16], xmm4);
	vmovaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		// texture coordinates are stored as accumulated

		vmovaps(ptr[eax + 32], xmm6);
		vmovaps(ptr[edx + 32], xmm7);
	}

	ret();
}
#endif

View File

@ -0,0 +1,531 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Byte offsets, relative to esp on entry to the generated routine, of its four
// stack arguments; the generators below read them with dword[esp + _xxx].
// The trailing register names are where the x64 generators find the same argument.
// NOTE(review): offset 0 is presumably the return address slot - confirm.
static const int _args = 0;
static const int _count = _args + 4; // rcx
static const int _v = _args + 8; // rdx
static const int _min = _args + 12; // r8
static const int _max = _args + 16; // r9
// 32-bit SSE generator of the vertex-trace routine for GSVertexSW input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// component-wise minimum and maximum of color (xmm2/xmm3), position (xmm4/xmm5)
// and texture coordinates (xmm6/xmm7).
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool isSprite = prim == GS_SPRITE_CLASS;
	const bool divideByQ = hasTex && !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS || isSprite)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}

	// position range starts at [s_minmax.x, s_minmax.y], splatted to every lane

	movss(xmm4, ptr[&s_minmax.x]);
	movss(xmm5, ptr[&s_minmax.y]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(hasColor)
	{
		// color range starts from the same values

		movaps(xmm2, xmm4);
		movaps(xmm3, xmm5);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the same values

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	if(divideByQ && isSprite)
	{
		// a sprite uses the third t component of its second vertex as the divisor for both

		movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
	}

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexSW);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		if(traceColor)
		{
			// color lives at offset 0

			movaps(xmm0, ptr[edx + vofs]);

			minps(xmm2, xmm0);
			maxps(xmm3, xmm0);
		}

		// position lives at offset 16

		movaps(xmm0, ptr[edx + vofs + 16]);

		minps(xmm4, xmm0);
		maxps(xmm5, xmm0);

		if(!hasTex)
		{
			continue;
		}

		// texture coordinates live at offset 32

		movaps(xmm0, ptr[edx + vofs + 32]);

		if(divideByQ)
		{
			if(!isSprite)
			{
				// per-vertex divisor: broadcast the third component

				movaps(xmm1, xmm0);
				shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			}

			// keep the quotient in the low half, take the high half from the divisor

			divps(xmm0, xmm1);
			shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
		}

		minps(xmm6, xmm0);
		maxps(xmm7, xmm0);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexSW));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// colors are written as truncated integers shifted right by 7

		cvttps2dq(xmm2, xmm2);
		psrld(xmm2, 7);
		movaps(ptr[eax], xmm2);

		cvttps2dq(xmm3, xmm3);
		psrld(xmm3, 7);
		movaps(ptr[edx], xmm3);
	}

	movaps(ptr[eax + 16], xmm4);
	movaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		movaps(ptr[eax + 32], xmm6);
		movaps(ptr[edx + 32], xmm7);
	}

	ret();
}
// 32-bit SSE generator of the vertex-trace routine for GSVertexHW9 input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// byte-wise color range (xmm2/xmm3), the position range (xmm4/xmm5) and the
// texture coordinate range (xmm6/xmm7). SSE4.1 instructions are emitted only
// when m_cpu reports support for them.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool isSprite = prim == GS_SPRITE_CLASS;
	const bool divideByW = hasTex && !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}
	else if(isSprite)
	{
		vertsPerPrim = 6;
	}

	// position range starts at [s_minmax.x, s_minmax.y], splatted to every lane

	movss(xmm4, ptr[&s_minmax.x]);
	movss(xmm5, ptr[&s_minmax.y]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(hasColor)
	{
		// color range starts at [all ones, all zeros]

		pcmpeqd(xmm2, xmm2);
		pxor(xmm3, xmm3);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the position seeds

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	if(divideByW && isSprite)
	{
		// a sprite uses p.w of its sixth vertex as the divisor for all of them

		movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
	}

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexHW9);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		// position lives at offset 16

		movaps(xmm0, ptr[edx + vofs + 16]);

		minps(xmm4, xmm0);
		maxps(xmm5, xmm0);

		if(divideByW && !isSprite)
		{
			// per-vertex divisor p.wwww(), grabbed before xmm0 gets overwritten

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
		}

		if(traceColor || hasTex)
		{
			// offset 0 carries both the texture coordinates and the color

			movaps(xmm0, ptr[edx + vofs]);
		}

		if(traceColor)
		{
			// unsigned byte min / max

			pminub(xmm2, xmm0);
			pmaxub(xmm3, xmm0);
		}

		if(!hasTex)
		{
			continue;
		}

		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral

		if(divideByW)
		{
			// t /= p.wwww();

			divps(xmm0, xmm1);
		}

		minps(xmm6, xmm0);
		maxps(xmm7, xmm0);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexHW9));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// widen the four bytes of the third dword to four dwords

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm2, xmm2);

			pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm3, xmm3);
		}
		else
		{
			// without SSE4.1: interleave with zero, bytes to words, then words to dwords

			pxor(xmm0, xmm0);

			punpckhbw(xmm2, xmm0);
			punpcklwd(xmm2, xmm0);

			punpckhbw(xmm3, xmm0);
			punpcklwd(xmm3, xmm0);
		}

		movaps(ptr[eax], xmm2);
		movaps(ptr[edx], xmm3);
	}

	// position is stored as is

	movaps(ptr[eax + 16], xmm4);
	movaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		// t.xy is completed with p.ww before it is stored

		shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
		shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

		movaps(ptr[eax + 32], xmm6);
		movaps(ptr[edx + 32], xmm7);
	}

	ret();
}
// 32-bit SSE generator of the vertex-trace routine for GSVertexHW11 input.
//
// The emitted routine reads its arguments (count, v, min, max) from the stack
// at esp + _count / _v / _min / _max and accumulates, over all vertices, the
// byte-wise color range (xmm2/xmm3), the position range (xmm4/xmm5) and the
// texture coordinate range (xmm6/xmm7). SSE4.1 instructions are emitted only
// when m_cpu reports support for them.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// param is not used by this generator.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
{
	// decode the selector

	const uint32 prim = (key >> 0) & 3;
	const bool hasIip = ((key >> 2) & 1) != 0;
	const bool hasTex = ((key >> 3) & 1) != 0;
	const bool isFst = ((key >> 4) & 1) != 0;
	const bool hasColor = ((key >> 5) & 1) != 0;

	const bool divideByQ = !isFst;

	// vertices handled per pass of the emitted loop (points and anything else: 1)

	int vertsPerPrim = 1;

	if(prim == GS_LINE_CLASS || prim == GS_SPRITE_CLASS)
	{
		vertsPerPrim = 2;
	}
	else if(prim == GS_TRIANGLE_CLASS)
	{
		vertsPerPrim = 3;
	}

	// position range starts at [s_minmax.x, s_minmax.y], splatted to every lane

	movss(xmm4, ptr[&s_minmax.x]);
	movss(xmm5, ptr[&s_minmax.y]);

	shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));

	if(hasColor)
	{
		// color range starts at [all ones, all zeros]

		pcmpeqd(xmm2, xmm2);
		pxor(xmm3, xmm3);
	}

	if(hasTex)
	{
		// texture coordinate range starts from the position seeds

		movaps(xmm6, xmm4);
		movaps(xmm7, xmm5);
	}

	// edx = v, ecx = count

	mov(edx, dword[esp + _v]);
	mov(ecx, dword[esp + _count]);

	align(16);

	L("loop");

	for(int k = 0; k < vertsPerPrim; k++)
	{
		const size_t vofs = k * sizeof(GSVertexHW11);

		// only the last vertex supplies the color unless iip is set

		const bool traceColor = hasColor && (hasIip || k == vertsPerPrim - 1);

		if(traceColor || hasTex)
		{
			// offset 0 carries both the texture coordinates and the color

			movaps(xmm0, ptr[edx + vofs]);
		}

		if(traceColor)
		{
			// unsigned byte min / max

			pminub(xmm2, xmm0);
			pmaxub(xmm3, xmm0);
		}

		if(hasTex)
		{
			if(divideByQ)
			{
				// q is taken from this copy once xmm0 has been shuffled

				movaps(xmm1, xmm0);
			}

			shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral

			if(divideByQ)
			{
				shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
				divps(xmm0, xmm1);
				shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
			}

			minps(xmm6, xmm0);
			maxps(xmm7, xmm0);
		}

		// position at offset 16 is integer data: xmm1.lo = first four words widened
		// to dwords, xmm1.hi = first two dwords shifted right by one; then to float

		movdqa(xmm0, ptr[edx + vofs + 16]);

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pmovzxwd(xmm1, xmm0);
		}
		else
		{
			// without SSE4.1: double every low word into a dword, then shift the copy out

			movdqa(xmm1, xmm0);
			punpcklwd(xmm1, xmm1);
			psrld(xmm1, 16);
		}

		psrld(xmm0, 1);
		punpcklqdq(xmm1, xmm0);
		cvtdq2ps(xmm1, xmm1);

		minps(xmm4, xmm1);
		maxps(xmm5, xmm1);
	}

	// advance to the next primitive, loop while vertices remain

	add(edx, vertsPerPrim * sizeof(GSVertexHW11));
	sub(ecx, vertsPerPrim);

	jg("loop");

	// eax = min, edx = max

	mov(eax, dword[esp + _min]);
	mov(edx, dword[esp + _max]);

	if(hasColor)
	{
		// widen the four bytes of the third dword to four dwords

		if(m_cpu.has(util::Cpu::tSSE41))
		{
			pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm2, xmm2);

			pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
			pmovzxbd(xmm3, xmm3);
		}
		else
		{
			// without SSE4.1: interleave with zero, bytes to words, then words to dwords

			pxor(xmm0, xmm0);

			punpckhbw(xmm2, xmm0);
			punpcklwd(xmm2, xmm0);

			punpckhbw(xmm3, xmm0);
			punpcklwd(xmm3, xmm0);
		}

		movaps(ptr[eax], xmm2);
		movaps(ptr[edx], xmm3);
	}

	// position is stored as xyww

	shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
	shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));

	movaps(ptr[eax + 16], xmm4);
	movaps(ptr[edx + 16], xmm5);

	if(hasTex)
	{
		// texture coordinates are stored as accumulated

		movaps(ptr[eax + 32], xmm6);
		movaps(ptr[edx + 32], xmm7);
	}

	ret();
}
#endif

View File

@ -466,12 +466,71 @@
<ClCompile Include="GSDirtyRect.cpp" /> <ClCompile Include="GSDirtyRect.cpp" />
<ClCompile Include="GSDrawScanline.cpp" /> <ClCompile Include="GSDrawScanline.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.cpp" /> <ClCompile Include="GSDrawScanlineCodeGenerator.cpp" />
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSDump.cpp" /> <ClCompile Include="GSDump.cpp" />
<ClCompile Include="GSdx.cpp" /> <ClCompile Include="GSdx.cpp" />
<ClCompile Include="GSFunctionMap.cpp" /> <ClCompile Include="GSFunctionMap.cpp" />
<ClCompile Include="GSLocalMemory.cpp" /> <ClCompile Include="GSLocalMemory.cpp" />
<ClCompile Include="GSPerfMon.cpp" /> <ClCompile Include="GSPerfMon.cpp" />
<ClCompile Include="GSRasterizer.cpp" /> <ClCompile Include="GSRasterizer.cpp">
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">All</AssemblerOutput>
<AssemblerOutput Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">AssemblyAndSourceCode</AssemblerOutput>
</ClCompile>
<ClCompile Include="GSRenderer.cpp" /> <ClCompile Include="GSRenderer.cpp" />
<ClCompile Include="GSRendererDX.cpp" /> <ClCompile Include="GSRendererDX.cpp" />
<ClCompile Include="GSRendererDX11.cpp" /> <ClCompile Include="GSRendererDX11.cpp" />
@ -482,6 +541,62 @@
<ClCompile Include="GSSetting.cpp" /> <ClCompile Include="GSSetting.cpp" />
<ClCompile Include="GSSettingsDlg.cpp" /> <ClCompile Include="GSSettingsDlg.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.cpp" /> <ClCompile Include="GSSetupPrimCodeGenerator.cpp" />
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSState.cpp" /> <ClCompile Include="GSState.cpp" />
<ClCompile Include="GSTables.cpp" /> <ClCompile Include="GSTables.cpp" />
<ClCompile Include="GSTexture.cpp" /> <ClCompile Include="GSTexture.cpp" />
@ -501,6 +616,62 @@
<ClCompile Include="GSVertexList.cpp" /> <ClCompile Include="GSVertexList.cpp" />
<ClCompile Include="GSVertexSW.cpp" /> <ClCompile Include="GSVertexSW.cpp" />
<ClCompile Include="GSVertexTrace.cpp" /> <ClCompile Include="GSVertexTrace.cpp" />
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x64.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSWnd.cpp" /> <ClCompile Include="GSWnd.cpp" />
<ClCompile Include="stdafx.cpp"> <ClCompile Include="stdafx.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">Create</PrecompiledHeader> <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">Create</PrecompiledHeader>

View File

@ -288,6 +288,42 @@
<ClCompile Include="GSDeviceSDL.cpp"> <ClCompile Include="GSDeviceSDL.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x64.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x64.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x64.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="GS.h"> <ClInclude Include="GS.h">

View File

@ -1040,6 +1040,13 @@ private:
bool cond = reg.isREG() && (reg.getBit() > op.getBit()); bool cond = reg.isREG() && (reg.getBit() > op.getBit());
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w); opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
} }
#ifdef XBYAK64
// Encodes an instruction with opcode 0x63 (MOVSXD) for destination `reg` and
// source `op`. An encoding is only selected when `reg` is a general register
// that is strictly wider than `op`; the register form is used for a register
// source and the memory form for a memory source. Any other combination is
// passed to opModRM with both form flags false.
void opMovsxd(const Reg& reg, const Operand& op)
{
	const bool dstIsWider = reg.isREG() && (reg.getBit() > op.getBit());
	const bool useRegForm = dstIsWider && op.isREG();
	const bool useMemForm = dstIsWider && op.isMEM();

	opModRM(reg, op, useRegForm, useMemForm, 0x63);
}
#endif
void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext) void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
{ {
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP; if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;

View File

@ -1021,4 +1021,5 @@ void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx())
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); } void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); }
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); } void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); }
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); } void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); }
// Public mnemonic: forwards the 64-bit destination register and the source
// operand unchanged to the opMovsxd encoder.
void movsxd(const Reg64& reg, const Operand& op)
{
	opMovsxd(reg, op);
}
#endif #endif