pcsx2/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator...

3056 lines
57 KiB
C++
Raw Normal View History

/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h"
#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// Offsets that Init() adds to esp to reach stack-passed arguments:
// _top is used to load `top`, _v to load the vertex pointer.
// NOTE(review): the values look inherited from a 32-bit stack layout --
// confirm they are still meaningful for the x64 calling conventions.
static const int _args = 16;
static const int _top = _args + 4;
static const int _v = _args + 8;
// Ease the reading of the code
// Register aliases named after the pointer they are meant to hold.
// Generate() loads &m_local into _m_local, m_local.gd into _m_local__gd and
// gd->clut into _m_local__gd__clut; the vm/tex aliases are not loaded in the
// part of the file visible here (presumably set up by callers/helpers).
#define _m_local r12
#define _m_local__gd r13
#define _m_local__gd__vm a1
#define _m_local__gd__clut r11
#define _m_local__gd__tex a3
// More pretty name
// Aliases for the upper ymm registers (ymm8-ymm15) by the value they carry.
#define _z ymm8
#define _f ymm9
#define _s ymm10
#define _t ymm11
#define _q ymm12
#define _f_rb ymm13
#define _f_ga ymm14
#define _test ymm15
// Extra bonus
// Aliases for lower ymm registers used by the later pipeline stages.
#define _rb ymm2
#define _ga ymm3
#define _fm ymm4
#define _zm ymm5
#define _fd ymm6
// Memory operand for a GSScanlineLocalData field: rip-relative when m_rip is
// set, otherwise addressed through the _m_local base register.
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[_m_local + offsetof(GSScanlineLocalData, field)])
// Same as _rip_local but for a GSScanlineGlobalData field reached through
// m_local.gd / the _m_local__gd base register.
#define _rip_global(field) (m_rip ? ptr[rip + &m_local.gd->field] : ptr[_m_local__gd + offsetof(GSScanlineGlobalData, field)])
#ifdef _WIN64
#else
// Non-Windows only: negative rsp offsets used as spill slots below the stack
// pointer (Generate() relies on a red zone being available instead of
// reserving stack). The rbx/r12/r13 slots hold callee-saved registers; the
// zs/zd slots receive 32-byte ymm spills in TestZ(). _rz_top and _rz_cov are
// not referenced in the part of the file visible here.
static const int _rz_rbx = -8 * 1;
static const int _rz_r12 = -8 * 2;
static const int _rz_r13 = -8 * 3;
//static const int _rz_r14 = -8 * 4;
//static const int _rz_r15 = -8 * 5;
static const int _rz_top = -8 * 4;
static const int _rz_zs = -8 * 8;
static const int _rz_zd = -8 * 12;
static const int _rz_cov = -8 * 16;
#endif
// Emits the complete scanline routine: prologue (callee-saved register
// spill), Init(), the "loop" body that runs every pipeline stage in order
// (TestZ .. WriteFrame), the "step" block that advances by 8 pixels and jumps
// back, and the "exit" epilogue that restores the saved registers.
//
// The register-allocation comments between the stage calls describe which
// registers are live at that point of the pipeline.
void GSDrawScanlineCodeGenerator::Generate()
{
// NOTE(review): this emits a single `ret` and then leaves the generator, so
// none of the code below is currently reached or emitted. Presumably the x64
// AVX2 path is still work in progress -- confirm before removing.
ret();
return;
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
bool need_clut = need_tex && m_sel.tlu;
// rip-relative addressing (see _rip_local/_rip_global) is only enabled when
// every address checked here is below 0x80000000.
// NOTE(review): the third check tests the address of the `gd` member itself
// (which lies inside m_local), not the address gd points to, although
// _rip_global addresses through m_local.gd -- confirm this is intended.
m_rip = (size_t)getCurr() < 0x80000000;
m_rip &= (size_t)&m_local < 0x80000000;
m_rip &= (size_t)&m_local.gd < 0x80000000;
#ifdef _WIN64
// Windows prologue: save the general-purpose registers used below, then
// reserve 8 + 10 * 16 bytes and spill xmm6..xmm15 into 16-byte slots.
// (The extra 8 bytes presumably keep rsp 16-byte aligned for vmovdqa.)
push(rbx);
push(rsi);
push(rdi);
push(rbp);
push(r12);
push(r13);
sub(rsp, 8 + 10 * 16);
for(int i = 6; i < 16; i++)
{
vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i));
}
#else
// No reservation on the stack as a red zone is available
push(rbp);
mov(ptr[rsp + _rz_rbx], rbx);
// r12/r13 only serve as base registers when rip-relative addressing is off,
// so they are only saved in that case.
if (!m_rip)
{
mov(ptr[rsp + _rz_r12], r12);
mov(ptr[rsp + _rz_r13], r13);
}
#endif
// r10 = address of the first entry of the test-mask table.
mov(r10, (size_t)g_const->m_test_256b[0]);
if (!m_rip)
{
// Load the base registers used by _rip_local / _rip_global.
mov(_m_local, (size_t)&m_local);
mov(_m_local__gd, _rip_local(gd));
}
if(need_clut)
mov(_m_local__gd__clut, _rip_global(clut));
//db(0xcc);
Init();
// The loop head is only aligned when the loop is actually iterated
// (edge mode emits no jump back to "loop", see below).
if(!m_sel.edge)
{
align(16);
}
L("loop");
// ecx = steps
// esi = fzbr
// edi = fzbc
// ymm0 = z/zi
// ymm2 = s/u (tme)
// ymm3 = t/v (tme)
// ymm4 = q (tme)
// ymm5 = rb (!tme)
// ymm6 = ga (!tme)
// ymm7 = test
TestZ(ymm5, ymm6);
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// - ymm0
// ymm2 = s/u (tme)
// ymm3 = t/v (tme)
// ymm4 = q (tme)
// ymm5 = rb (!tme)
// ymm6 = ga (!tme)
// ymm7 = test
// Mipmapping (mmin != 0) uses the LOD-aware sampler.
if(m_sel.mmin)
{
SampleTextureLOD();
}
else
{
SampleTexture();
}
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// - ymm2
// - ymm3
// - ymm4
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
AlphaTFX();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
ReadMask();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
TestAlpha();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
ColorTFX();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
Fog();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
ReadFrame();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = fd
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
TestDestAlpha();
// ecx = steps
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = fd
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
// ymm7 = test
WriteMask();
// ebx = fa
// ecx = steps
// edx = fzm
// esi = fzbr
// edi = fzbc
// ebp = za
// ymm2 = fd
// ymm3 = fm
// ymm4 = zm
// ymm5 = rb
// ymm6 = ga
WriteZBuf();
// ebx = fa
// ecx = steps
// edx = fzm
// esi = fzbr
// edi = fzbc
// - ebp
// ymm2 = fd
// ymm3 = fm
// - ymm4
// ymm5 = rb
// ymm6 = ga
AlphaBlend();
// ebx = fa
// ecx = steps
// edx = fzm
// esi = fzbr
// edi = fzbc
// ymm2 = fd
// ymm3 = fm
// ymm5 = rb
// ymm6 = ga
WriteFrame();
L("step");
// if(steps <= 0) break;
// In edge mode nothing is emitted here, so execution falls straight through
// to "exit" after a single pass.
if(!m_sel.edge)
{
test(ecx, ecx);
jle("exit", T_NEAR);
Step();
jmp("loop", T_NEAR);
}
L("exit");
#ifdef _WIN64
// Windows epilogue: mirror of the prologue (xmm6..xmm15, then the pushed
// registers in reverse order).
for(int i = 6; i < 16; i++)
{
vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]);
}
add(rsp, 8 + 10 * 16);
pop(r13);
pop(r12);
pop(rbp);
pop(rdi);
pop(rsi);
pop(rbx);
#else
// Non-Windows epilogue: reload the registers parked in the red zone.
mov(rbx, ptr[rsp + _rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _rz_r12]);
mov(r13, ptr[rsp + _rz_r13]);
}
pop(rbp);
#endif
// NOTE(review): no ret() is emitted after the epilogue; once the early
// `ret(); return;` stub above is removed the routine would need one.
}
// Emits the per-scanline setup that runs once before "loop":
//  - skip / steps / initial test mask (unless m_sel.notest),
//  - the frame/z buffer row and column offset pointers (esi / edi),
//  - the initial interpolants (f, z, s/t/q, rb/ga), which are also stored in
//    m_local.temp so Step() can advance them.
//
// NOTE(review): this block mixes the a0/a1 argument aliases with hard-coded
// ecx/edx/esp and absolute `ptr[&m_local...]` operands instead of
// _rip_local/_rip_global. a0/a1 are defined outside this view; if they are
// not rcx/rdx, the ecx/edx uses below read different registers -- confirm.
void GSDrawScanlineCodeGenerator::Init()
{
if(!m_sel.notest)
{
// int skip = left & 7;
mov(ebx, a1.cvt32());
and(a1.cvt32(), 7);
// int steps = pixels + skip - 8;
lea(a0, ptr[a0 + a1 - 8]);
// left -= skip;
sub(ebx, a1.cvt32());
// GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))];
// eax = min(steps, 0): (steps >> 31) is all ones only for negative steps.
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
// Each table row is 8 bytes (hence index * 8); vpmovsxbd sign-extends the
// 8 bytes into the 8 dword lanes of the mask.
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[0]]);
vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)g_const->m_test_256b[15]]);
vpor(ymm7, ymm0);
// edx = skip * 32; it is scaled by 8 again below when forming &m_local.d[skip].
shl(edx, 5);
}
else
{
mov(ebx, edx); // left
xor(edx, edx); // skip
lea(ecx, ptr[ecx - 8]); // steps
}
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
mov(esi, ptr[esp + _top]);
lea(esi, ptr[esi * 8]);
add(esi, ptr[&m_local.gd->fzbr]);
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
lea(edi, ptr[ebx * 2]);
add(edi, ptr[&m_local.gd->fzbc]);
// edx/ebx are only repurposed when one of the interpolant sections below
// actually reads m_local.d[skip] or the vertex.
if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
{
// edx = &m_local.d[skip]
lea(edx, ptr[edx * 8 + (size_t)m_local.d]);
// ebx = &v
mov(ebx, ptr[esp + _v]);
}
if(m_sel.prim != GS_SPRITE_CLASS)
{
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
{
vbroadcastf128(ymm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p
if(m_sel.fwrite && m_sel.fge)
{
// f = GSVector8i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
vcvttps2dq(ymm1, ymm0);
vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2));
vpaddw(ymm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]);
vmovdqa(ptr[&m_local.temp.f], ymm1);
}
if(m_sel.zb)
{
// z = vp.zzzz() + m_local.d[skip].z;
// The base z and the running offset zo are stored separately; Step()
// advances zo and re-adds z.
vshufps(ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovaps(ptr[&m_local.temp.z], ymm0);
vmovaps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]);
vmovaps(ptr[&m_local.temp.zo], ymm2);
vaddps(ymm0, ymm2);
}
}
}
else
{
// Sprites: z is taken as one broadcast value from m_local.p.z.
if(m_sel.ztest)
{
vpbroadcastd(ymm0, ptr[&m_local.p.z]);
}
}
if(m_sel.fb)
{
if(m_sel.edge || m_sel.tfx != TFX_NONE)
{
vbroadcastf128(ymm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t
}
if(m_sel.edge)
{
// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
vpshufhw(ymm3, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(ymm3, ymm3, _MM_SHUFFLE(3, 3, 3, 3));
vpsrlw(ymm3, 9);
vmovdqa(ptr[&m_local.temp.cov], ymm3);
}
if(m_sel.tfx != TFX_NONE)
{
if(m_sel.fst)
{
// Integer texture coordinates (fst).
// GSVector4i vti(vt);
vcvttps2dq(ymm6, ymm4);
// s = vti.xxxx() + m_local.d[skip].s;
// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
vpshufd(ymm2, ymm6, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm3, ymm6, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
{
vpaddd(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
}
else
{
// Sprite without mipmapping: t is not advanced here, so the vertical
// blend factor is computed once and stored in temp.vf
// (SampleTexture() skips its own vf computation for sprites).
if(m_sel.ltf)
{
vpshuflw(ymm6, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm6, ymm6, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm6, 12);
vmovdqa(ptr[&m_local.temp.vf], ymm6);
}
}
vmovdqa(ptr[&m_local.temp.s], ymm2);
vmovdqa(ptr[&m_local.temp.t], ymm3);
}
else
{
// Floating-point s/t/q (perspective-correct path).
// s = vt.xxxx() + m_local.d[skip].s;
// t = vt.yyyy() + m_local.d[skip].t;
// q = vt.zzzz() + m_local.d[skip].q;
vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]);
vaddps(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]);
vaddps(ymm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]);
vmovaps(ptr[&m_local.temp.s], ymm2);
vmovaps(ptr[&m_local.temp.t], ymm3);
vmovaps(ptr[&m_local.temp.q], ymm4);
}
}
// No vertex colour is set up for TFX_DECAL with tcc.
if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
{
if(m_sel.iip)
{
// Interpolated (iip) colour: start from the vertex colour.
// GSVector4i vc = GSVector4i(v.c);
vbroadcastf128(ymm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c
vcvttps2dq(ymm6, ymm6);
// vc = vc.upl16(vc.zwxy());
vpshufd(ymm5, ymm6, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(ymm6, ymm5);
// rb = vc.xxxx().add16(m_local.d[skip].rb);
// ga = vc.zzzz().add16(m_local.d[skip].ga);
vpshufd(ymm5, ymm6, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm6, ymm6, _MM_SHUFFLE(2, 2, 2, 2));
vpaddw(ymm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]);
vpaddw(ymm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]);
vmovdqa(ptr[&m_local.temp.rb], ymm5);
vmovdqa(ptr[&m_local.temp.ga], ymm6);
}
else
{
// Constant colour is only preloaded when no texture stage will overwrite
// ymm5/ymm6.
if(m_sel.tfx == TFX_NONE)
{
vmovdqa(ymm5, ptr[&m_local.c.rb]);
vmovdqa(ymm6, ptr[&m_local.c.ga]);
}
}
}
}
}
// Emits the code that advances the scanline state by one group of 8 pixels:
// decrements the remaining step count, moves the column offset pointer,
// advances every active interpolant stored in m_local.temp (z, f, s/t/q,
// rb/ga) and reloads the coverage test mask for the new step count.
void GSDrawScanlineCodeGenerator::Step()
{
	// Selector-derived conditions, all resolved at code-generation time.
	const bool is_sprite = (m_sel.prim == GS_SPRITE_CLASS);
	const bool textured = (m_sel.tfx != TFX_NONE);
	const bool decal_tcc = (m_sel.tfx == TFX_DECAL && m_sel.tcc);

	sub(a0, 8); // steps -= 8;
	add(t0, 16); // fza_offset += 2;

	if(is_sprite)
	{
		// Sprites reload z from m_local.p.z on every step.
		if(m_sel.ztest)
		{
			vpbroadcastd(ymm0, ptr[&m_local.p.z]);
		}
	}
	else
	{
		if(m_sel.zb)
		{
			// zo += GSVector8::broadcast32(&m_local.d8.p.z); ymm0 = zo + z
			vbroadcastss(ymm0, ptr[&m_local.d8.p.z]);
			vaddps(ymm0, ptr[&m_local.temp.zo]);
			vmovaps(ptr[&m_local.temp.zo], ymm0);
			vaddps(ymm0, ptr[&m_local.temp.z]);
		}

		if(m_sel.fwrite && m_sel.fge)
		{
			// f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
			vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]);
			vpaddw(ymm1, ptr[&m_local.temp.f]);
			vmovdqa(ptr[&m_local.temp.f], ymm1);
		}
	}

	if(m_sel.fb)
	{
		if(textured && m_sel.fst)
		{
			// Integer texture coordinates.
			// stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
			vbroadcasti128(ymm4, ptr[&m_local.d8.stq]);

			// s += stq.xxxx();
			vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
			vpaddd(ymm2, ptr[&m_local.temp.s]);
			vmovdqa(ptr[&m_local.temp.s], ymm2);

			if(!is_sprite || m_sel.mmin)
			{
				// t += stq.yyyy();
				vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
				vpaddd(ymm3, ptr[&m_local.temp.t]);
				vmovdqa(ptr[&m_local.temp.t], ymm3);
			}
			else
			{
				// t is not advanced in this case; just reload it.
				vmovdqa(ymm3, ptr[&m_local.temp.t]);
			}
		}
		else if(textured)
		{
			// Floating-point s/t/q.
			// stq = GSVector8(m_local.d8.stq);
			vbroadcastf128(ymm4, ptr[&m_local.d8.stq]);

			// s += stq.xxxx(); t += stq.yyyy(); q += stq.zzzz();
			vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
			vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
			vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2));
			vaddps(ymm2, ptr[&m_local.temp.s]);
			vaddps(ymm3, ptr[&m_local.temp.t]);
			vaddps(ymm4, ptr[&m_local.temp.q]);
			vmovaps(ptr[&m_local.temp.s], ymm2);
			vmovaps(ptr[&m_local.temp.t], ymm3);
			vmovaps(ptr[&m_local.temp.q], ymm4);
		}

		if(!decal_tcc)
		{
			if(m_sel.iip)
			{
				// c = GSVector8i::broadcast64(&m_local.d8.c);
				vpbroadcastq(ymm7, ptr[&m_local.d8.c]);

				// rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero());
				// ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero());
				vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0));
				vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1));
				vpaddw(ymm5, ptr[&m_local.temp.rb]);
				vpaddw(ymm6, ptr[&m_local.temp.ga]);

				// FIXME: color may underflow and roll over at the end of the line, if decreasing
				vpxor(ymm7, ymm7);
				vpmaxsw(ymm5, ymm7);
				vpmaxsw(ymm6, ymm7);

				vmovdqa(ptr[&m_local.temp.rb], ymm5);
				vmovdqa(ptr[&m_local.temp.ga], ymm6);
			}
			else if(!textured)
			{
				// Flat colour without texturing: reload the constant colour.
				vmovdqa(ymm5, ptr[&m_local.c.rb]);
				vmovdqa(ymm6, ptr[&m_local.c.ga]);
			}
		}
	}

	if(!m_sel.notest)
	{
		// test = m_test[15 + (steps & (steps >> 31))];
		mov(edx, ecx);
		sar(edx, 31);
		and(edx, ecx);
		vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[15]]);
	}
}
// Emits the depth test stage.
//
// Computes the z-buffer address (ebp), converts the interpolated z to its
// integer form zs (ymm0), optionally reads the destination depth zd (ymm1),
// and ORs the failing lanes into the test mask (ymm7). Nothing is emitted
// when m_sel.zb is not set.
//
// temp1: scratch register handed to ReadPixel() and reused for constants.
// temp2: not referenced in this function body.
void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2)
{
if(!m_sel.zb)
{
return;
}
// int za = fza_base.y + fza_offset->y;
mov(ebp, ptr[esi + 4]);
add(ebp, ptr[edi + 4]);
// Keep the address inside the addressable range (mask with HALF_VM_SIZE - 1).
and(ebp, HALF_VM_SIZE - 1);
// GSVector8i zs = zi;
if(m_sel.prim != GS_SPRITE_CLASS)
{
if(m_sel.zoverflow)
{
// zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001());
// NOTE(review): this branch reads z from _z (ymm8) while the else branch
// below converts ymm0 in place, and Init()/Step() leave z in ymm0 --
// confirm which register is meant to carry z.
vbroadcastss(ymm0, ptr[&GSVector8::m_half]);
vmulps(ymm0, _z);
vcvttps2dq(ymm0, ymm0);
vpslld(ymm0, 1);
vcvttps2dq(ymm1, _z);
// ymm2 = 0x00000001 in every lane (all ones >> 31).
vpcmpeqd(ymm2, ymm2);
vpsrld(ymm2, 31);
vpand(ymm1, ymm2);
vpor(ymm0, ymm1);
}
else
{
// zs = GSVector8i(z);
vcvttps2dq(ymm0, ymm0);
}
// Keep zs around when z writes are enabled.
if(m_sel.zwrite)
{
#ifdef _WIN64
vmovdqa(ptr[&m_local.temp.zs], ymm0);
#else
vmovdqa(ptr[rsp + _rz_zs], ymm0);
#endif
}
}
if(m_sel.ztest)
{
// ymm1 = zd, read from the address in rbp.
ReadPixel(ymm1, temp1, rbp);
// The unmasked destination value is kept for z writes when zpsm < 2.
if(m_sel.zwrite && m_sel.zpsm < 2)
{
#ifdef _WIN64
vmovdqa(_rip_local(temp.zd), ymm1);
#else
vmovdqa(ptr[rsp + _rz_zd], ymm1);
#endif
}
// zd &= 0xffffffff >> m_sel.zpsm * 8;
if(m_sel.zpsm)
{
vpslld(ymm1, (uint8)(m_sel.zpsm * 8));
vpsrld(ymm1, (uint8)(m_sel.zpsm * 8));
}
// vpcmpgtd is a signed compare; subtracting 0x80000000 from both operands
// lets it order values that use the full unsigned 32-bit range.
if(m_sel.zoverflow || m_sel.zpsm == 0)
{
// GSVector8i o = GSVector8i::x80000000();
vpcmpeqd(temp1, temp1);
vpslld(temp1, 31);
// GSVector8i zso = zs - o;
// GSVector8i zdo = zd - o;
vpsubd(ymm0, temp1);
vpsubd(ymm1, temp1);
}
// Only ZTST_GEQUAL and ZTST_GREATER emit a comparison; any other ztst value
// leaves the test mask unchanged.
switch(m_sel.ztst)
{
case ZTST_GEQUAL:
// test |= zso < zdo; // ~(zso >= zdo)
vpcmpgtd(ymm1, ymm0);
vpor(ymm7, ymm1);
break;
case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
// test |= zso <= zdo; // ~(zso > zdo)
vpcmpgtd(ymm0, ymm1);
vpcmpeqd(temp1, temp1);
vpxor(ymm0, temp1);
vpor(ymm7, ymm0);
break;
}
// alltrue() is defined outside this view; presumably it emits the early-out
// to "step" when every lane of the test mask is set -- confirm.
alltrue(ymm7);
}
}
// Emits the texture fetch for the non-mipmapped path.
//
// Steps: optional perspective divide (when !fst), bilinear fraction setup
// (when ltf), packing u/v to 16-bit texel coordinates, wrap/clamp, address
// generation (y << (tw + 3)) + x, the ReadTexel() fetch and, for ltf, the
// bilinear blend of the four texels.
//
// Inputs are expected in ymm2 (s/u), ymm3 (t/v) and ymm4 (q); per the
// register notes in Generate() the result is expected in ymm5 (rb) and
// ymm6 (ga). Nothing is emitted when there is no frame buffer or no texture
// function.
void GSDrawScanlineCodeGenerator::SampleTexture()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
return;
}
mov(ebx, ptr[&m_local.gd->tex[0]]);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
// ebx = tex
// edx = clut
if(!m_sel.fst)
{
// Perspective divide: u = s * (1 / q), v = t * (1 / q), using the
// approximate reciprocal (vrcpps), then truncate to integers.
vrcpps(ymm0, ymm4);
vmulps(ymm2, ymm0);
vmulps(ymm3, ymm0);
vcvttps2dq(ymm2, ymm2);
vcvttps2dq(ymm3, ymm3);
if(m_sel.ltf)
{
// u -= 0x8000;
// v -= 0x8000;
mov(eax, 0x8000);
vmovd(xmm4, eax);
vpbroadcastd(ymm4, xmm4);
vpsubd(ymm2, ymm4);
vpsubd(ymm3, ymm4);
}
}
// ymm2 = u
// ymm3 = v
if(m_sel.ltf)
{
// GSVector8i uf = u.xxzzlh().srl16(12);
// (the low 16 bits of u are duplicated into both words of each lane and
// shifted down so that only the top 4 fraction bits remain)
vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.uf], ymm0);
// For sprites temp.vf is not recomputed here (Init() stores it once).
if(m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector8i vf = v.xxzzlh().srl16(12);
vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.vf], ymm0);
}
}
// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
vpsrad(ymm2, 16);
vpsrad(ymm3, 16);
vpackssdw(ymm2, ymm3);
if(m_sel.ltf)
{
// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
vpcmpeqd(ymm1, ymm1);
vpsrlw(ymm1, 15);
vpaddw(ymm3, ymm2, ymm1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
Wrap(ymm2, ymm3);
}
else
{
// uv0 = Wrap(uv0);
Wrap(ymm2);
}
// ymm2 = uv0
// ymm3 = uv1 (ltf)
// ymm0, ymm1, ymm4, ymm5, ymm6 = free
// ymm7 = used
// GSVector8i y0 = uv0.uph16() << tw;
// GSVector8i x0 = uv0.upl16();
// Unpacking against zero widens the 16-bit coordinates to 32 bits.
vpxor(ymm0, ymm0);
vpunpcklwd(ymm4, ymm2, ymm0);
vpunpckhwd(ymm2, ymm2, ymm0);
vpslld(ymm2, (uint8)(m_sel.tw + 3));
// ymm0 = 0
// ymm2 = y0
// ymm3 = uv1 (ltf)
// ymm4 = x0
// ymm1, ymm5, ymm6 = free
// ymm7 = used
if(m_sel.ltf)
{
// GSVector8i y1 = uv1.uph16() << tw;
// GSVector8i x1 = uv1.upl16();
vpunpcklwd(ymm6, ymm3, ymm0);
vpunpckhwd(ymm3, ymm3, ymm0);
vpslld(ymm3, (uint8)(m_sel.tw + 3));
// ymm2 = y0
// ymm3 = y1
// ymm4 = x0
// ymm6 = x1
// ymm0, ymm1, ymm5 = free
// ymm7 = used
// GSVector8i addr00 = y0 + x0;
// GSVector8i addr01 = y0 + x1;
// GSVector8i addr10 = y1 + x0;
// GSVector8i addr11 = y1 + x1;
vpaddd(ymm5, ymm2, ymm4);
vpaddd(ymm2, ymm2, ymm6);
vpaddd(ymm0, ymm3, ymm4);
vpaddd(ymm3, ymm3, ymm6);
// ymm5 = addr00
// ymm2 = addr01
// ymm0 = addr10
// ymm3 = addr11
// ymm1, ymm4, ymm6 = free
// ymm7 = used
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
// ymm6 = c00
// ymm4 = c01
// ymm1 = c10
// ymm5 = c11
// ymm0, ymm2, ymm3 = free
// ymm7 = used
vmovdqa(ymm0, ptr[&m_local.temp.uf]);
// The shift-left/shift-right by 8 pair keeps the low byte of every 16-bit
// word (rb); the plain shift-right by 8 keeps the high byte (ga).
// GSVector8i rb00 = c00 & mask;
// GSVector8i ga00 = (c00 >> 8) & mask;
vpsllw(ymm2, ymm6, 8);
vpsrlw(ymm2, 8);
vpsrlw(ymm6, 8);
// GSVector8i rb01 = c01 & mask;
// GSVector8i ga01 = (c01 >> 8) & mask;
vpsllw(ymm3, ymm4, 8);
vpsrlw(ymm3, 8);
vpsrlw(ymm4, 8);
// ymm0 = uf
// ymm2 = rb00
// ymm3 = rb01
// ymm6 = ga00
// ymm4 = ga01
// ymm1 = c10
// ymm5 = c11
// ymm7 = used
// rb00 = rb00.lerp16_4(rb01, uf);
// ga00 = ga00.lerp16_4(ga01, uf);
lerp16_4(ymm3, ymm2, ymm0);
lerp16_4(ymm4, ymm6, ymm0);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = c10
// ymm5 = c11
// ymm2, ymm6 = free
// ymm7 = used
// GSVector8i rb10 = c10 & mask;
// GSVector8i ga10 = (c10 >> 8) & mask;
vpsrlw(ymm2, ymm1, 8);
vpsllw(ymm1, 8);
vpsrlw(ymm1, 8);
// GSVector8i rb11 = c11 & mask;
// GSVector8i ga11 = (c11 >> 8) & mask;
vpsrlw(ymm6, ymm5, 8);
vpsllw(ymm5, 8);
vpsrlw(ymm5, 8);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = rb10
// ymm5 = rb11
// ymm2 = ga10
// ymm6 = ga11
// ymm7 = used
// rb10 = rb10.lerp16_4(rb11, uf);
// ga10 = ga10.lerp16_4(ga11, uf);
lerp16_4(ymm5, ymm1, ymm0);
lerp16_4(ymm6, ymm2, ymm0);
// ymm3 = rb00
// ymm4 = ga00
// ymm5 = rb10
// ymm6 = ga10
// ymm0, ymm1, ymm2 = free
// ymm7 = used
// rb00 = rb00.lerp16_4(rb10, vf);
// ga00 = ga00.lerp16_4(ga10, vf);
vmovdqa(ymm0, ptr[&m_local.temp.vf]);
lerp16_4(ymm5, ymm3, ymm0);
lerp16_4(ymm6, ymm4, ymm0);
}
else
{
// Nearest sampling: a single texel per pixel.
// GSVector8i addr00 = y0 + x0;
vpaddd(ymm5, ymm2, ymm4);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
// GSVector8i mask = GSVector8i::x00ff();
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
vpsllw(ymm5, ymm6, 8);
vpsrlw(ymm5, 8);
vpsrlw(ymm6, 8);
}
}
// Emits the wrap/clamp of one packed 16-bit texel coordinate vector (uv is
// modified in place) according to the wms/wmt selector bits, using the
// t.min / t.max / t.mask constants of the global data.
//
// Scratch registers: ymm0 always; ymm1, ymm4 and ymm5 additionally when the
// two axes use different modes.
void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv)
{
	// ymm0, ymm1, ymm4, ymm5, ymm6 = free

	const bool clamp_s = (((m_sel.wms + 1) >> 1) & 1) != 0;
	const bool clamp_t = (((m_sel.wmt + 1) >> 1) & 1) != 0;
	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;

	if(clamp_s != clamp_t)
	{
		// Mixed modes: compute both variants and select per byte with t.mask.
		vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
		vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);

		// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
		vpand(ymm1, uv, ymm4);

		if(use_region)
		{
			vpor(ymm1, ymm5);
		}

		// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
		vpmaxsw(uv, ymm4);
		vpminsw(uv, ymm5);

		// clamp.blend8(repeat, m_local.gd->t.mask);
		vpblendvb(uv, ymm1, ymm0);

		return;
	}

	if(clamp_s)
	{
		// Both axes clamp: the lower bound is t.min for region clamp, else zero.
		if(use_region)
		{
			vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]);
		}
		else
		{
			vpxor(ymm0, ymm0);
		}

		vpmaxsw(uv, ymm0);

		vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]);
		vpminsw(uv, ymm0);

		return;
	}

	// Both axes repeat: mask with t.min and, for region repeat, OR in t.max.
	vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]);
	vpand(uv, ymm0);

	if(use_region)
	{
		vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]);
		vpor(uv, ymm0);
	}
}
void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv0, const Ymm& uv1)
{
// ymm0, ymm1, ymm4, ymm5, ymm6 = free
int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
if(wms_clamp == wmt_clamp)
{
if(wms_clamp)
{
if(region)
{
vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
vpmaxsw(uv0, ymm4);
vpmaxsw(uv1, ymm4);
}
else
{
vpxor(ymm0, ymm0);
vpmaxsw(uv0, ymm0);
vpmaxsw(uv1, ymm0);
}
vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
vpminsw(uv0, ymm5);
vpminsw(uv1, ymm5);
}
else
{
vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
vpand(uv0, ymm4);
vpand(uv1, ymm4);
if(region)
{
vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
vpor(uv0, ymm5);
vpor(uv1, ymm5);
}
}
}
else
{
vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]);
vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
// uv0
// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
vpand(ymm1, uv0, ymm4);
if(region)
{
vpor(ymm1, ymm5);
}
// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
vpmaxsw(uv0, ymm4);
vpminsw(uv0, ymm5);
// clamp.blend8(repeat, m_local.gd->t.mask);
vpblendvb(uv0, ymm1, ymm0);
// uv1
// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
vpand(ymm1, uv1, ymm4);
if(region)
{
vpor(ymm1, ymm5);
}
// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
vpmaxsw(uv1, ymm4);
vpminsw(uv1, ymm5);
// clamp.blend8(repeat, m_local.gd->t.mask);
vpblendvb(uv1, ymm1, ymm0);
}
}
// Emits the texture fetch for the mipmapped path (m_sel.mmin != 0).
//
// Steps:
//  1. optional perspective divide (when !fst);
//  2. level of detail: computed per pixel from q with a polynomial log2
//     approximation when !lcm, otherwise taken from gd->lod; u/v and the
//     wrap limits are shifted down by the integer lod;
//  3. a first sample (ReadTexel(..., 0)) with the same wrap / address /
//     bilinear sequence as SampleTexture(), using WrapLOD();
//  4. unless mmin == 1 (round-off mode), a second sample one level further
//     down (ReadTexel(..., 1)) that is blended with the first one using the
//     fractional lod.
//
// Inputs are expected in ymm2 (s/u), ymm3 (t/v) and ymm4 (q); per the
// register notes in Generate() the result is expected in ymm5 (rb) and
// ymm6 (ga). Nothing is emitted when there is no frame buffer or no texture
// function.
//
// NOTE(review): push(ebp)/pop(ebp) use a 32-bit register, which has no
// encoding in 64-bit mode, and the absolute `ptr[&m_local...]` operands are
// not routed through _rip_local/_rip_global -- this looks like code that is
// still to be ported to x64; confirm.
void GSDrawScanlineCodeGenerator::SampleTextureLOD()
{
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
{
return;
}
// ebp is repurposed as the base of the gd->tex array for the duration of
// this stage and restored by the pop at the end.
push(ebp);
mov(ebp, (size_t)m_local.gd->tex);
if(m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
if(!m_sel.fst)
{
// Perspective divide with the approximate reciprocal of q.
vrcpps(ymm0, ymm4);
vmulps(ymm2, ymm0);
vmulps(ymm3, ymm0);
vcvttps2dq(ymm2, ymm2);
vcvttps2dq(ymm3, ymm3);
}
// ymm2 = u
// ymm3 = v
// ymm4 = q
// ymm0 = ymm1 = ymm5 = ymm6 = free
// TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?)
if(!m_sel.lcm)
{
// lod = -log2(Q) * (1 << L) + K
// ymm1 = 127 in every lane (all ones >> 25); ymm0 = biased exponent of q
// (sign shifted out, then the top 8 bits kept).
vpcmpeqd(ymm1, ymm1);
vpsrld(ymm1, ymm1, 25);
vpslld(ymm0, ymm4, 1);
vpsrld(ymm0, ymm0, 24);
vpsubd(ymm0, ymm1);
vcvtdq2ps(ymm0, ymm0);
// ymm0 = (float)(exp(q) - 127)
// Keep only the 23 mantissa bits of q and OR in m_log2_coef_256b[3].
vpslld(ymm4, ymm4, 9);
vpsrld(ymm4, ymm4, 9);
vorps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
// ymm4 = mant(q) | 1.0f
// Same polynomial in both branches; the FMA variant fuses the
// multiply-add pairs.
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[g_const->m_log2_coef_256b[0]]); // c0
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[1]]); // c0 * ymm4 + c1
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); // ymm4 - 1.0f
vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0
}
else
{
vmulps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[0]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[1]]);
vmulps(ymm5, ymm4);
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[2]]);
vmulps(ymm4, ymm5);
vaddps(ymm4, ymm0);
}
// ymm4 = log2(Q) = ((((c0 * ymm4) + c1) * ymm4) + c2) * (ymm4 - 1.0f) + ymm0
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[&m_local.gd->l]);
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]);
}
else
{
vmulps(ymm4, ptr[&m_local.gd->l]);
vaddps(ymm4, ptr[&m_local.gd->k]);
}
// ymm4 = (-log2(Q) * (1 << L) + K) * 0x10000
// Clamp the lod to [0, gd->mxl] before converting it to an integer.
vxorps(ymm0, ymm0);
vminps(ymm4, ptr[&m_local.gd->mxl]);
vmaxps(ymm4, ymm0);
vcvtps2dq(ymm4, ymm4);
if(m_sel.mmin == 1) // round-off mode
{
// Add 0x8000 so the shift by 16 below rounds instead of truncating.
mov(eax, 0x8000);
vmovd(xmm0, eax);
vpbroadcastd(ymm0, xmm0);
vpaddd(ymm4, ymm0);
}
// ymm0 = integer part of the lod (upper 16 bits of ymm4).
vpsrld(ymm0, ymm4, 16);
vmovdqa(ptr[&m_local.temp.lod.i], ymm0);
/*
vpslld(ymm5, ymm0, 6);
vpslld(ymm6, ymm4, 16);
vpsrld(ymm6, ymm6, 24);
return;
*/
if(m_sel.mmin == 2) // trilinear mode
{
// Fractional part of the lod, duplicated into both words of each lane.
vpshuflw(ymm1, ymm4, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.temp.lod.f], ymm1);
}
// shift u/v/minmax by (int)lod
vpsravd(ymm2, ymm2, ymm0);
vpsravd(ymm3, ymm3, ymm0);
vmovdqa(ptr[&m_local.temp.uv[0]], ymm2);
vmovdqa(ptr[&m_local.temp.uv[1]], ymm3);
// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
// The 16-bit limits are widened to 32 bits, shifted per lane by the lod and
// packed back to 16 bits.
vpxor(ymm1, ymm1);
vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]);
vpunpcklwd(ymm5, ymm4, ymm1); // minu
vpunpckhwd(ymm6, ymm4, ymm1); // minv
vpsrlvd(ymm5, ymm5, ymm0);
vpsrlvd(ymm6, ymm6, ymm0);
vpackusdw(ymm5, ymm6);
vbroadcasti128(ymm4, ptr[&m_local.gd->t.max]);
vpunpcklwd(ymm6, ymm4, ymm1); // maxu
vpunpckhwd(ymm4, ymm4, ymm1); // maxv
vpsrlvd(ymm6, ymm6, ymm0);
vpsrlvd(ymm4, ymm4, ymm0);
vpackusdw(ymm6, ymm4);
vmovdqa(ptr[&m_local.temp.uv_minmax[0]], ymm5);
vmovdqa(ptr[&m_local.temp.uv_minmax[1]], ymm6);
}
else
{
// lod = K
// Constant lod: one shift count for all lanes. temp.uv_minmax is only read
// here, so it is presumably filled in before the routine runs -- confirm.
vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]);
vpsrad(ymm2, xmm0);
vpsrad(ymm3, xmm0);
vmovdqa(ptr[&m_local.temp.uv[0]], ymm2);
vmovdqa(ptr[&m_local.temp.uv[1]], ymm3);
vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]);
vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]);
}
// ymm2 = m_local.temp.uv[0] = u (level m)
// ymm3 = m_local.temp.uv[1] = v (level m)
// ymm5 = minuv
// ymm6 = maxuv
if(m_sel.ltf)
{
// u -= 0x8000;
// v -= 0x8000;
mov(eax, 0x8000);
vmovd(xmm4, eax);
vpbroadcastd(ymm4, xmm4);
vpsubd(ymm2, ymm4);
vpsubd(ymm3, ymm4);
// GSVector8i uf = u.xxzzlh().srl16(12);
vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.uf], ymm0);
// GSVector8i vf = v.xxzzlh().srl16(12);
vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.vf], ymm0);
}
// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
vpsrad(ymm2, 16);
vpsrad(ymm3, 16);
vpackssdw(ymm2, ymm3);
if(m_sel.ltf)
{
// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
vpcmpeqd(ymm1, ymm1);
vpsrlw(ymm1, 15);
vpaddw(ymm3, ymm2, ymm1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(ymm2, ymm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(ymm2);
}
// ymm2 = uv0
// ymm3 = uv1 (ltf)
// ymm0, ymm1, ymm4, ymm5, ymm6 = free
// ymm7 = used
// GSVector8i x0 = uv0.upl16();
// GSVector8i y0 = uv0.uph16() << tw;
vpxor(ymm0, ymm0);
vpunpcklwd(ymm4, ymm2, ymm0);
vpunpckhwd(ymm2, ymm2, ymm0);
vpslld(ymm2, (uint8)(m_sel.tw + 3));
// ymm0 = 0
// ymm2 = y0
// ymm3 = uv1 (ltf)
// ymm4 = x0
// ymm1, ymm5, ymm6 = free
// ymm7 = used
if(m_sel.ltf)
{
// GSVector8i x1 = uv1.upl16();
// GSVector8i y1 = uv1.uph16() << tw;
vpunpcklwd(ymm6, ymm3, ymm0);
vpunpckhwd(ymm3, ymm3, ymm0);
vpslld(ymm3, (uint8)(m_sel.tw + 3));
// ymm2 = y0
// ymm3 = y1
// ymm4 = x0
// ymm6 = x1
// ymm0, ymm1, ymm5 = free
// ymm7 = used
// GSVector8i addr00 = y0 + x0;
// GSVector8i addr01 = y0 + x1;
// GSVector8i addr10 = y1 + x0;
// GSVector8i addr11 = y1 + x1;
vpaddd(ymm5, ymm2, ymm4);
vpaddd(ymm2, ymm2, ymm6);
vpaddd(ymm0, ymm3, ymm4);
vpaddd(ymm3, ymm3, ymm6);
// ymm5 = addr00
// ymm2 = addr01
// ymm0 = addr10
// ymm3 = addr11
// ymm1, ymm4, ymm6 = free
// ymm7 = used
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 0);
// ymm6 = c00
// ymm4 = c01
// ymm1 = c10
// ymm5 = c11
// ymm0, ymm2, ymm3 = free
// ymm7 = used
vmovdqa(ymm0, ptr[&m_local.temp.uf]);
// GSVector8i rb00 = c00 & mask;
// GSVector8i ga00 = (c00 >> 8) & mask;
vpsllw(ymm2, ymm6, 8);
vpsrlw(ymm2, 8);
vpsrlw(ymm6, 8);
// GSVector8i rb01 = c01 & mask;
// GSVector8i ga01 = (c01 >> 8) & mask;
vpsllw(ymm3, ymm4, 8);
vpsrlw(ymm3, 8);
vpsrlw(ymm4, 8);
// ymm0 = uf
// ymm2 = rb00
// ymm3 = rb01
// ymm6 = ga00
// ymm4 = ga01
// ymm1 = c10
// ymm5 = c11
// ymm7 = used
// rb00 = rb00.lerp16_4(rb01, uf);
// ga00 = ga00.lerp16_4(ga01, uf);
lerp16_4(ymm3, ymm2, ymm0);
lerp16_4(ymm4, ymm6, ymm0);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = c10
// ymm5 = c11
// ymm2, ymm6 = free
// ymm7 = used
// GSVector8i rb10 = c10 & mask;
// GSVector8i ga10 = (c10 >> 8) & mask;
vpsrlw(ymm2, ymm1, 8);
vpsllw(ymm1, 8);
vpsrlw(ymm1, 8);
// GSVector8i rb11 = c11 & mask;
// GSVector8i ga11 = (c11 >> 8) & mask;
vpsrlw(ymm6, ymm5, 8);
vpsllw(ymm5, 8);
vpsrlw(ymm5, 8);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = rb10
// ymm5 = rb11
// ymm2 = ga10
// ymm6 = ga11
// ymm7 = used
// rb10 = rb10.lerp16_4(rb11, uf);
// ga10 = ga10.lerp16_4(ga11, uf);
lerp16_4(ymm5, ymm1, ymm0);
lerp16_4(ymm6, ymm2, ymm0);
// ymm3 = rb00
// ymm4 = ga00
// ymm5 = rb10
// ymm6 = ga10
// ymm0, ymm1, ymm2 = free
// ymm7 = used
// rb00 = rb00.lerp16_4(rb10, vf);
// ga00 = ga00.lerp16_4(ga10, vf);
vmovdqa(ymm0, ptr[&m_local.temp.vf]);
lerp16_4(ymm5, ymm3, ymm0);
lerp16_4(ymm6, ymm4, ymm0);
}
else
{
// GSVector8i addr00 = y0 + x0;
vpaddd(ymm5, ymm2, ymm4);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 0);
// GSVector8i mask = GSVector8i::x00ff();
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
vpsllw(ymm5, ymm6, 8);
vpsrlw(ymm5, 8);
vpsrlw(ymm6, 8);
}
if(m_sel.mmin != 1) // !round-off mode
{
// Second sample, one mip level further down: park the first result in
// temp.trb/tga, halve the saved u/v and wrap limits, then repeat the same
// wrap / address / fetch sequence with ReadTexel(..., 1).
vmovdqa(ptr[&m_local.temp.trb], ymm5);
vmovdqa(ptr[&m_local.temp.tga], ymm6);
vmovdqa(ymm2, ptr[&m_local.temp.uv[0]]);
vmovdqa(ymm3, ptr[&m_local.temp.uv[1]]);
vpsrad(ymm2, 1);
vpsrad(ymm3, 1);
vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]);
vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]);
vpsrlw(ymm5, 1);
vpsrlw(ymm6, 1);
if(m_sel.ltf)
{
// u -= 0x8000;
// v -= 0x8000;
mov(eax, 0x8000);
vmovd(xmm4, eax);
vpbroadcastd(ymm4, xmm4);
vpsubd(ymm2, ymm4);
vpsubd(ymm3, ymm4);
// GSVector8i uf = u.xxzzlh().srl16(12);
vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.uf], ymm0);
// GSVector8i vf = v.xxzzlh().srl16(12);
vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(ymm0, 12);
vmovdqa(ptr[&m_local.temp.vf], ymm0);
}
// GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16));
vpsrad(ymm2, 16);
vpsrad(ymm3, 16);
vpackssdw(ymm2, ymm3);
if(m_sel.ltf)
{
// GSVector8i uv1 = uv0.add16(GSVector8i::x0001());
vpcmpeqd(ymm1, ymm1);
vpsrlw(ymm1, 15);
vpaddw(ymm3, ymm2, ymm1);
// uv0 = Wrap(uv0);
// uv1 = Wrap(uv1);
WrapLOD(ymm2, ymm3);
}
else
{
// uv0 = Wrap(uv0);
WrapLOD(ymm2);
}
// ymm2 = uv0
// ymm3 = uv1 (ltf)
// ymm0, ymm1, ymm4, ymm5, ymm6 = free
// ymm7 = used
// GSVector8i x0 = uv0.upl16();
// GSVector8i y0 = uv0.uph16() << tw;
vpxor(ymm0, ymm0);
vpunpcklwd(ymm4, ymm2, ymm0);
vpunpckhwd(ymm2, ymm2, ymm0);
vpslld(ymm2, (uint8)(m_sel.tw + 3));
// ymm0 = 0
// ymm2 = y0
// ymm3 = uv1 (ltf)
// ymm4 = x0
// ymm1, ymm5, ymm6 = free
// ymm7 = used
if(m_sel.ltf)
{
// GSVector8i x1 = uv1.upl16();
// GSVector8i y1 = uv1.uph16() << tw;
vpunpcklwd(ymm6, ymm3, ymm0);
vpunpckhwd(ymm3, ymm3, ymm0);
vpslld(ymm3, (uint8)(m_sel.tw + 3));
// ymm2 = y0
// ymm3 = y1
// ymm4 = x0
// ymm6 = x1
// ymm0, ymm1, ymm5 = free
// ymm7 = used
// GSVector8i addr00 = y0 + x0;
// GSVector8i addr01 = y0 + x1;
// GSVector8i addr10 = y1 + x0;
// GSVector8i addr11 = y1 + x1;
vpaddd(ymm5, ymm2, ymm4);
vpaddd(ymm2, ymm2, ymm6);
vpaddd(ymm0, ymm3, ymm4);
vpaddd(ymm3, ymm3, ymm6);
// ymm5 = addr00
// ymm2 = addr01
// ymm0 = addr10
// ymm3 = addr11
// ymm1, ymm4, ymm6 = free
// ymm7 = used
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(4, 1);
// ymm6 = c00
// ymm4 = c01
// ymm1 = c10
// ymm5 = c11
// ymm0, ymm2, ymm3 = free
// ymm7 = used
vmovdqa(ymm0, ptr[&m_local.temp.uf]);
// GSVector8i rb00 = c00 & mask;
// GSVector8i ga00 = (c00 >> 8) & mask;
vpsllw(ymm2, ymm6, 8);
vpsrlw(ymm2, 8);
vpsrlw(ymm6, 8);
// GSVector8i rb01 = c01 & mask;
// GSVector8i ga01 = (c01 >> 8) & mask;
vpsllw(ymm3, ymm4, 8);
vpsrlw(ymm3, 8);
vpsrlw(ymm4, 8);
// ymm0 = uf
// ymm2 = rb00
// ymm3 = rb01
// ymm6 = ga00
// ymm4 = ga01
// ymm1 = c10
// ymm5 = c11
// ymm7 = used
// rb00 = rb00.lerp16_4(rb01, uf);
// ga00 = ga00.lerp16_4(ga01, uf);
lerp16_4(ymm3, ymm2, ymm0);
lerp16_4(ymm4, ymm6, ymm0);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = c10
// ymm5 = c11
// ymm2, ymm6 = free
// ymm7 = used
// GSVector8i rb10 = c10 & mask;
// GSVector8i ga10 = (c10 >> 8) & mask;
vpsrlw(ymm2, ymm1, 8);
vpsllw(ymm1, 8);
vpsrlw(ymm1, 8);
// GSVector8i rb11 = c11 & mask;
// GSVector8i ga11 = (c11 >> 8) & mask;
vpsrlw(ymm6, ymm5, 8);
vpsllw(ymm5, 8);
vpsrlw(ymm5, 8);
// ymm0 = uf
// ymm3 = rb00
// ymm4 = ga00
// ymm1 = rb10
// ymm5 = rb11
// ymm2 = ga10
// ymm6 = ga11
// ymm7 = used
// rb10 = rb10.lerp16_4(rb11, uf);
// ga10 = ga10.lerp16_4(ga11, uf);
lerp16_4(ymm5, ymm1, ymm0);
lerp16_4(ymm6, ymm2, ymm0);
// ymm3 = rb00
// ymm4 = ga00
// ymm5 = rb10
// ymm6 = ga10
// ymm0, ymm1, ymm2 = free
// ymm7 = used
// rb00 = rb00.lerp16_4(rb10, vf);
// ga00 = ga00.lerp16_4(ga10, vf);
vmovdqa(ymm0, ptr[&m_local.temp.vf]);
lerp16_4(ymm5, ymm3, ymm0);
lerp16_4(ymm6, ymm4, ymm0);
}
else
{
// GSVector8i addr00 = y0 + x0;
vpaddd(ymm5, ymm2, ymm4);
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(1, 1);
// GSVector8i mask = GSVector8i::x00ff();
// c[0] = c00 & mask;
// c[1] = (c00 >> 8) & mask;
vpsllw(ymm5, ymm6, 8);
vpsrlw(ymm5, 8);
vpsrlw(ymm6, 8);
}
// Blend the two mip levels with the fractional lod (the constant one from
// gd when lcm is set, otherwise the per-pixel value saved above), shifted
// right by one bit first.
vmovdqa(ymm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]);
vpsrlw(ymm0, ymm0, 1);
vmovdqa(ymm2, ptr[&m_local.temp.trb]);
vmovdqa(ymm3, ptr[&m_local.temp.tga]);
lerp16(ymm5, ymm2, ymm0, 0);
lerp16(ymm6, ymm3, ymm0, 0);
}
pop(ebp);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv)
{
	// Emits the wrap sequence for a single register of packed 16-bit
	// texture coordinates (uv is updated in place).
	//
	// Registers expected by the emitted code:
	//   ymm5 = minuv
	//   ymm6 = maxuv
	//   ymm0, ymm1, ymm4 = free

	const bool s_clamps = (((m_sel.wms + 1) >> 1) & 1) != 0;
	const bool t_clamps = (((m_sel.wmt + 1) >> 1) & 1) != 0;
	const bool use_region = (((m_sel.wms | m_sel.wmt) >> 1) & 1) != 0;

	if(s_clamps != t_clamps)
	{
		// One axis clamps and the other repeats: compute both results and
		// pick per byte with the precomputed t.mask.

		vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);

		// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;

		vpand(ymm1, uv, ymm5);

		if(use_region)
		{
			vpor(ymm1, ymm6);
		}

		// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);

		vpmaxsw(uv, ymm5);
		vpminsw(uv, ymm6);

		// clamp.blend8(repeat, m_local.gd->t.mask);

		vpblendvb(uv, ymm1, ymm0);

		return;
	}

	if(!s_clamps)
	{
		// Both axes repeat: and with min, optionally or with max.

		vpand(uv, ymm5);

		if(use_region)
		{
			vpor(uv, ymm6);
		}

		return;
	}

	// Both axes clamp: the lower bound is min for region modes, zero otherwise.

	if(use_region)
	{
		vpmaxsw(uv, ymm5);
	}
	else
	{
		vpxor(ymm0, ymm0);
		vpmaxsw(uv, ymm0);
	}

	vpminsw(uv, ymm6);
}
void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv0, const Ymm& uv1)
{
// ymm5 = minuv
// ymm6 = maxuv
// ymm0, ymm1, ymm4 = free
int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
if(wms_clamp == wmt_clamp)
{
if(wms_clamp)
{
if(region)
{
vpmaxsw(uv0, ymm5);
vpmaxsw(uv1, ymm5);
}
else
{
vpxor(ymm0, ymm0);
vpmaxsw(uv0, ymm0);
vpmaxsw(uv1, ymm0);
}
vpminsw(uv0, ymm6);
vpminsw(uv1, ymm6);
}
else
{
vpand(uv0, ymm5);
vpand(uv1, ymm5);
if(region)
{
vpor(uv0, ymm6);
vpor(uv1, ymm6);
}
}
}
else
{
vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]);
// uv0
// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
vpand(ymm1, uv0, ymm5);
if(region)
{
vpor(ymm1, ymm6);
}
// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
vpmaxsw(uv0, ymm5);
vpminsw(uv0, ymm6);
// clamp.blend8(repeat, m_local.gd->t.mask);
vpblendvb(uv0, ymm1, ymm0);
// uv1
// GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
vpand(ymm1, uv1, ymm5);
if(region)
{
vpor(ymm1, ymm6);
}
// GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
vpmaxsw(uv1, ymm5);
vpminsw(uv1, ymm6);
// clamp.blend8(repeat, m_local.gd->t.mask);
vpblendvb(uv1, ymm1, ymm0);
}
}
// Emits the green/alpha half of the texture function selected by m_sel.tfx,
// followed by the optional AA1 coverage-to-alpha step. Emits nothing when
// m_sel.fb is clear.
//
// Register usage in the emitted code, as visible below:
//   ymm6 = gat (texture ga), updated in place with the result
//   ymm4 = ga (m_local.temp.ga when iip, else m_local.c.ga), destroyed
//   ymm3 = extra register handed to clamp16/mix16 (presumably scratch)
//   ymm2 = unshifted copy of ga, written only on the HIGHLIGHT paths;
//          ColorTFX reads ymm2 for TFX_HIGHLIGHT/TFX_HIGHLIGHT2
//   ymm0, ymm1 = scratch for the AA1 step
void GSDrawScanlineCodeGenerator::AlphaTFX()
{
if(!m_sel.fb)
{
return;
}
switch(m_sel.tfx)
{
case TFX_MODULATE:
// GSVector8i ga = iip ? gaf : m_local.c.ga;
vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
// gat = gat.modulate16<1>(ga).clamp8();
modulate16(ymm6, ymm4, 1);
clamp16(ymm6, ymm3);
// if(!tcc) gat = gat.mix16(ga.srl16(7));
if(!m_sel.tcc)
{
vpsrlw(ymm4, 7);
mix16(ymm6, ymm4, ymm3);
}
break;
case TFX_DECAL:
// if(!tcc) gat = gat.mix16(ga.srl16(7));
if(!m_sel.tcc)
{
// GSVector8i ga = iip ? gaf : m_local.c.ga;
vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
vpsrlw(ymm4, 7);
mix16(ymm6, ymm4, ymm3);
}
break;
case TFX_HIGHLIGHT:
// GSVector8i ga = iip ? gaf : m_local.c.ga;
vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
// keep the unshifted ga in ymm2 for ColorTFX
vmovdqa(ymm2, ymm4);
// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
vpsrlw(ymm4, 7);
if(m_sel.tcc)
{
vpaddusb(ymm4, ymm6);
}
mix16(ymm6, ymm4, ymm3);
break;
case TFX_HIGHLIGHT2:
// if(!tcc) gat = gat.mix16(ga.srl16(7));
// (with tcc set nothing is emitted here and ColorTFX loads ga into ymm2 itself)
if(!m_sel.tcc)
{
// GSVector8i ga = iip ? gaf : m_local.c.ga;
vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
// keep the unshifted ga in ymm2 for ColorTFX
vmovdqa(ymm2, ymm4);
vpsrlw(ymm4, 7);
mix16(ymm6, ymm4, ymm3);
}
break;
case TFX_NONE:
// gat = iip ? ga.srl16(7) : ga;
if(m_sel.iip)
{
vpsrlw(ymm6, 7);
}
break;
}
if(m_sel.aa1)
{
// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
// FIXME: bios config screen cubes
if(!m_sel.abe)
{
// a = cov
if(m_sel.edge)
{
vmovdqa(ymm0, ptr[&m_local.temp.cov]);
}
else
{
// all ones -> 0x8000 per word -> 0x0080 per word
vpcmpeqd(ymm0, ymm0);
vpsllw(ymm0, 15);
vpsrlw(ymm0, 8);
}
mix16(ymm6, ymm0, ymm1);
}
else
{
// a = a == 0x80 ? cov : a
// ymm0 = 0x0080 in every 16-bit word
vpcmpeqd(ymm0, ymm0);
vpsllw(ymm0, 15);
vpsrlw(ymm0, 8);
if(m_sel.edge)
{
vmovdqa(ymm1, ptr[&m_local.temp.cov]);
}
else
{
vmovdqa(ymm1, ymm0);
}
// per-word compare against 0x0080, then clear the low word of every dword
// so only the high word (the one the pseudo-code above calls "a") can be replaced
vpcmpeqw(ymm0, ymm6);
vpsrld(ymm0, 16);
vpslld(ymm0, 16);
vpblendvb(ymm6, ymm1, ymm0);
}
}
}
void GSDrawScanlineCodeGenerator::ReadMask()
{
	// Loads the write masks that the later test/write stages or into:
	//   ymm3 = fm broadcast to every dword (only when the frame is written)
	//   ymm4 = zm broadcast to every dword (only when z is written)

	const bool need_fm = m_sel.fwrite != 0;
	const bool need_zm = m_sel.zwrite != 0;

	if(need_fm) vpbroadcastd(ymm3, ptr[&m_local.gd->fm]);

	if(need_zm) vpbroadcastd(ymm4, ptr[&m_local.gd->zm]);
}
void GSDrawScanlineCodeGenerator::TestAlpha()
{
	// Emits the alpha test. First a per-pixel "failed" mask t is built in
	// ymm1 from the alpha in the high word of ymm6 (ga >> 16) and
	// m_local.gd->aref; then t is folded into the test/fm/zm registers
	// according to m_sel.afail.
	//
	//   ymm7 = test, ymm3 = fm, ymm4 = zm, ymm0 = scratch

	if(m_sel.atst == ATST_ALWAYS)
	{
		// nothing can fail, no code needed
		return;
	}

	if(m_sel.atst == ATST_NEVER)
	{
		// t = GSVector8i::xffffffff();

		vpcmpeqd(ymm1, ymm1);
	}
	else if(m_sel.atst == ATST_LESS || m_sel.atst == ATST_LEQUAL)
	{
		// t = (ga >> 16) > m_local.gd->aref;
		// NOTE(review): both modes share one strict compare; presumably aref
		// is pre-adjusted by whoever fills m_local.gd - confirm.

		vpsrld(ymm1, ymm6, 16);
		vbroadcasti128(ymm0, ptr[&m_local.gd->aref]);
		vpcmpgtd(ymm1, ymm0);
	}
	else if(m_sel.atst == ATST_GEQUAL || m_sel.atst == ATST_GREATER)
	{
		// t = (ga >> 16) < m_local.gd->aref;

		vpsrld(ymm0, ymm6, 16);
		vbroadcasti128(ymm1, ptr[&m_local.gd->aref]);
		vpcmpgtd(ymm1, ymm0);
	}
	else if(m_sel.atst == ATST_EQUAL || m_sel.atst == ATST_NOTEQUAL)
	{
		// NOTEQUAL: t = (ga >> 16) == m_local.gd->aref;

		vpsrld(ymm1, ymm6, 16);
		vbroadcasti128(ymm0, ptr[&m_local.gd->aref]);
		vpcmpeqd(ymm1, ymm0);

		if(m_sel.atst == ATST_EQUAL)
		{
			// EQUAL: t = (ga >> 16) != m_local.gd->aref; (invert the compare)

			vpcmpeqd(ymm0, ymm0);
			vpxor(ymm1, ymm0);
		}
	}

	if(m_sel.afail == AFAIL_KEEP)
	{
		// test |= t;

		vpor(ymm7, ymm1);

		alltrue(ymm7);

		return;
	}

	if(m_sel.afail == AFAIL_ZB_ONLY)
	{
		// fm |= t;

		vpor(ymm3, ymm1);

		return;
	}

	if(m_sel.afail == AFAIL_FB_ONLY || m_sel.afail == AFAIL_RGB_ONLY)
	{
		// zm |= t;

		vpor(ymm4, ymm1);
	}

	if(m_sel.afail == AFAIL_RGB_ONLY)
	{
		// fm |= t & GSVector8i::xff000000();

		vpsrld(ymm1, 24);
		vpslld(ymm1, 24);
		vpor(ymm3, ymm1);
	}
}
void GSDrawScanlineCodeGenerator::ColorTFX()
{
	// Emits the red/blue half of the texture function (and, for the
	// highlight modes, the final ga pass). Emits nothing unless the frame
	// is written.
	//
	//   ymm5 = rbt, ymm6 = gat, both updated in place
	//   ymm2 = ga left behind by AlphaTFX for the highlight modes
	//   ymm0, ymm1 = scratch

	if(!m_sel.fwrite)
	{
		return;
	}

	const bool highlight = m_sel.tfx == TFX_HIGHLIGHT || m_sel.tfx == TFX_HIGHLIGHT2;

	if(m_sel.tfx == TFX_MODULATE)
	{
		// GSVector8i rb = iip ? rbf : m_local.c.rb;

		// rbt = rbt.modulate16<1>(rb).clamp8();

		modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);

		clamp16(ymm5, ymm1);
	}
	else if(highlight)
	{
		if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
		{
			// AlphaTFX emitted nothing for this combination, so ga has to be
			// fetched here.

			// GSVector8i ga = iip ? gaf : m_local.c.ga;

			vmovdqa(ymm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]);
		}

		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);

		vmovdqa(ymm1, ymm6);

		modulate16(ymm6, ymm2, 1);

		// af: copy words 1 and 3 of each 64-bit half over words 0 and 2, then >> 7

		vpshuflw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1));
		vpshufhw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1));
		vpsrlw(ymm2, 7);

		vpaddw(ymm6, ymm2);

		clamp16(ymm6, ymm0);

		mix16(ymm6, ymm1, ymm0);

		// GSVector8i rb = iip ? rbf : m_local.c.rb;

		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();

		modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1);

		vpaddw(ymm5, ymm2);

		clamp16(ymm5, ymm0);
	}
	else if(m_sel.tfx == TFX_NONE)
	{
		// rbt = iip ? rb.srl16(7) : rb;

		if(m_sel.iip)
		{
			vpsrlw(ymm5, 7);
		}
	}

	// TFX_DECAL: rbt is used as is
}
void GSDrawScanlineCodeGenerator::Fog()
{
	// Emits fogging of the colour in ymm5 (rb) / ymm6 (ga). Only emitted
	// when the frame is written and fogging is enabled.
	//
	//   ymm0 = f, ymm1 = saved ga, ymm2 = fog colour (frb, then fga)

	if(!(m_sel.fwrite && m_sel.fge))
	{
		return;
	}

	// rb = m_local.gd->frb.lerp16<0>(rb, f);
	// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);

	if(m_sel.prim == GS_SPRITE_CLASS)
	{
		// sprites: one 16-bit f value replicated to every word
		vpbroadcastw(ymm0, ptr[&m_local.p.f]);
	}
	else
	{
		vmovdqa(ymm0, ptr[&m_local.temp.f]);
	}

	vmovdqa(ymm1, ymm6);

	vpbroadcastd(ymm2, ptr[&m_local.gd->frb]);
	lerp16(ymm5, ymm2, ymm0, 0);

	vpbroadcastd(ymm2, ptr[&m_local.gd->fga]);
	lerp16(ymm6, ymm2, ymm0, 0);

	mix16(ymm6, ymm1, ymm0);
}
void GSDrawScanlineCodeGenerator::ReadFrame()
{
	// Computes the frame buffer address into ebx and, when m_sel.rfb is
	// set, loads the destination pixels into ymm2 (ymm0 is the temporary).
	//
	// NOTE(review): the address is formed through 32-bit esi/edi while this
	// file is only built for 64-bit targets - confirm whether this routine
	// still awaits the x64 port.

	if(!m_sel.fb)
	{
		return;
	}

	// int fa = fza_base.x + fza_offset->x;

	mov(ebx, ptr[esi]);
	add(ebx, ptr[edi]);
	and(ebx, HALF_VM_SIZE - 1);

	if(m_sel.rfb)
	{
		ReadPixel(ymm2, ymm0, rbx);
	}
}
void GSDrawScanlineCodeGenerator::TestDestAlpha()
{
	// Emits the destination alpha test: builds a per-pixel fail mask in
	// ymm1 from the destination pixels in ymm2 and ors it into the test
	// register ymm7. Only emitted for frame formats 0 and 2 with date set.

	const bool tested_format = m_sel.fpsm == 0 || m_sel.fpsm == 2;

	if(!m_sel.date || !tested_format)
	{
		return;
	}

	// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);

	if(m_sel.fpsm == 2)
	{
		// fpsm 2: the tested bit is bit 15, moved up to the sign position first

		if(m_sel.datm)
		{
			vpxor(ymm0, ymm0);
		}

		//vpsrld(ymm1, ymm2, 15);
		vpslld(ymm1, ymm2, 16);
		vpsrad(ymm1, 31);

		if(m_sel.datm)
		{
			// invert: fail where the bit was clear
			vpcmpeqd(ymm1, ymm0);
		}
	}
	else if(m_sel.datm)
	{
		// fpsm 0, datm set: fail where bit 31 is clear

		vpcmpeqd(ymm0, ymm0);
		vpxor(ymm1, ymm2, ymm0);
		vpsrad(ymm1, 31);
	}
	else
	{
		// fpsm 0, datm clear: fail where bit 31 is set

		vpsrad(ymm1, ymm2, 31);
	}

	vpor(ymm7, ymm1);

	alltrue(ymm7);
}
void GSDrawScanlineCodeGenerator::WriteMask()
{
	// Folds the test result (ymm7) into fm (ymm3) / zm (ymm4) and leaves in
	// edx a bit mask that is non-zero for every pixel that still has
	// something to write. Skipped entirely when m_sel.notest is set.
	//
	// Bit layout follows from vpackssdw/vpmovmskb working per 128-bit lane:
	// bits 0-7 and 16-23 come from the first packed operand, bits 8-15 and
	// 24-31 from the second, two bits per pixel.

	if(m_sel.notest)
	{
		return;
	}

	const bool frame = m_sel.fwrite != 0;
	const bool depth = m_sel.zwrite != 0;

	// fm |= test;
	// zm |= test;

	if(frame)
	{
		vpor(ymm3, ymm7);
	}

	if(depth)
	{
		vpor(ymm4, ymm7);
	}

	// int fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask();

	vpcmpeqd(ymm1, ymm1);

	if(frame && depth)
	{
		vpcmpeqd(ymm0, ymm1, ymm4);
		vpcmpeqd(ymm1, ymm3);
		vpackssdw(ymm1, ymm0);
	}
	else if(frame || depth)
	{
		// only one buffer is written: pack its compare result with itself
		vpcmpeqd(ymm1, frame ? ymm3 : ymm4);
		vpackssdw(ymm1, ymm1);
	}

	vpmovmskb(edx, ymm1);

	not(edx);
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
{
	// Emits the depth write: loads zs into ymm1, optionally merges the old
	// depth under zm (ymm4), then stores through WritePixel using rbp as
	// the address register and edx as the per-pixel mask (fz = 1).

	if(!m_sel.zwrite)
	{
		return;
	}

	if(m_sel.prim == GS_SPRITE_CLASS)
	{
		// sprites: one z value for every pixel
		vpbroadcastd(ymm1, ptr[&m_local.p.z]);
	}
	else
	{
		vmovdqa(ymm1, ptr[&m_local.temp.zs]);
	}

	const bool ztest_on = m_sel.ztest != 0;
	const bool zpsm_lt2 = m_sel.zpsm < 2;

	if(ztest_on && zpsm_lt2)
	{
		// zs = zs.blend8(zd, zm);

		vpblendvb(ymm1, ptr[&m_local.temp.zd], ymm4);
	}

	bool fast;

	if(ztest_on)
	{
		fast = zpsm_lt2;
	}
	else
	{
		fast = m_sel.zpsm == 0 && m_sel.notest;
	}

	WritePixel(ymm1, ymm0, rbp, edx, fast, m_sel.zpsm, 1);
}
// Emits the alpha blending stage. Per the pseudo-code comments below, each
// channel pair is computed as
//   out = (c[aba] - c[abb]).modulate16<1>(a) + c[abd]
// where selector value 0 picks the source colour, 1 the destination colour
// and 2 zero, and a comes from source alpha (abc == 0), destination alpha
// (abc == 1) or m_local.gd->afix (abc == 2). When aba == abb the subtraction
// term vanishes and only c[abd] is produced.
//
// Register usage in the emitted code:
//   in:  ymm5/ymm6 = source rb/ga, ymm2 = fd (destination pixels)
//   out: ymm5/ymm6 = blended rb/ga
//   ymm0/ymm1 = destination rb/ga, ymm4 = saved source, ymm7 = a
//   ymm2, ymm3 are left untouched
// Emits nothing when the frame is not written or when both abe and aa1 are off.
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
if(!m_sel.fwrite)
{
return;
}
if(m_sel.abe == 0 && m_sel.aa1 == 0)
{
return;
}
// The destination only has to be unpacked when some selector actually picks it
// (value 1): aba/abb/abc matter only if aba != abb, abd always matters.
if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
{
switch(m_sel.fpsm)
{
case 0:
case 1:
// c[2] = fd & mask;
// c[3] = (fd >> 8) & mask;
vpsllw(ymm0, ymm2, 8);
vpsrlw(ymm0, 8);
vpsrlw(ymm1, ymm2, 8);
break;
case 2:
// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
// ymm7 is a running mask register, shifted from one field to the next
vpcmpeqd(ymm7, ymm7);
vpsrld(ymm7, 27); // 0x0000001f
vpand(ymm0, ymm2, ymm7);
vpslld(ymm0, 3);
vpslld(ymm7, 10); // 0x00007c00
vpand(ymm4, ymm2, ymm7);
vpslld(ymm4, 9);
vpor(ymm0, ymm4);
vpsrld(ymm7, 5); // 0x000003e0
vpand(ymm1, ymm2, ymm7);
vpsrld(ymm1, 2);
// word shift: 0x03e0 << 10 truncated to 16 bits leaves only bit 15
vpsllw(ymm7, 10); // 0x00008000
vpand(ymm4, ymm2, ymm7);
vpslld(ymm4, 8);
vpor(ymm1, ymm4);
break;
}
}
// ymm5, ymm6 = src rb, ga
// ymm0, ymm1 = dst rb, ga
// ymm2, ymm3 = used
// ymm4, ymm7 = free
// Keep a copy of source rb when it is needed after ymm5 gets overwritten:
// for the pabe select, or as the abb/abd operand.
if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
{
vmovdqa(ymm4, ymm5);
}
if(m_sel.aba != m_sel.abb)
{
// rb = c[aba * 2 + 0];
switch(m_sel.aba)
{
case 0: break;
case 1: vmovdqa(ymm5, ymm0); break;
case 2: vpxor(ymm5, ymm5); break;
}
// rb = rb.sub16(c[abb * 2 + 0]);
switch(m_sel.abb)
{
case 0: vpsubw(ymm5, ymm4); break;
case 1: vpsubw(ymm5, ymm0); break;
case 2: break;
}
// NOTE(review): the multiply is skipped for fpsm == 1 with destination alpha
// selected; presumably that format has no stored alpha - confirm.
if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
{
// GSVector8i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
switch(m_sel.abc)
{
case 0:
case 1:
// copy words 1 and 3 of each 64-bit half over words 0 and 2, from src (abc == 0) or dst (abc == 1) ga
vpshuflw(ymm7, m_sel.abc ? ymm1 : ymm6, _MM_SHUFFLE(3, 3, 1, 1));
vpshufhw(ymm7, ymm7, _MM_SHUFFLE(3, 3, 1, 1));
vpsllw(ymm7, 7);
break;
case 2:
vpbroadcastw(ymm7, ptr[&m_local.gd->afix]);
break;
}
// rb = rb.modulate16<1>(a);
modulate16(ymm5, ymm7, 1);
}
// rb = rb.add16(c[abd * 2 + 0]);
switch(m_sel.abd)
{
case 0: vpaddw(ymm5, ymm4); break;
case 1: vpaddw(ymm5, ymm0); break;
case 2: break;
}
}
else
{
// rb = c[abd * 2 + 0];
switch(m_sel.abd)
{
case 0: break;
case 1: vmovdqa(ymm5, ymm0); break;
case 2: vpxor(ymm5, ymm5); break;
}
}
if(m_sel.pabe)
{
// mask = (c[1] << 8).sra32(31);
vpslld(ymm0, ymm6, 8);
vpsrad(ymm0, 31);
// rb = c[0].blend8(rb, mask);
vpblendvb(ymm5, ymm4, ymm5, ymm0);
}
// ymm6 = src ga
// ymm1 = dst ga
// ymm5 = rb
// ymm7 = a
// ymm2, ymm3 = used
// ymm0, ymm4 = free
// Same sequence again for the ga pair; ymm7 still holds a from the rb pass.
vmovdqa(ymm4, ymm6);
if(m_sel.aba != m_sel.abb)
{
// ga = c[aba * 2 + 1];
switch(m_sel.aba)
{
case 0: break;
case 1: vmovdqa(ymm6, ymm1); break;
case 2: vpxor(ymm6, ymm6); break;
}
// ga = ga.sub16(c[abb * 2 + 1]);
switch(m_sel.abb)
{
case 0: vpsubw(ymm6, ymm4); break;
case 1: vpsubw(ymm6, ymm1); break;
case 2: break;
}
if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
{
// ga = ga.modulate16<1>(a);
modulate16(ymm6, ymm7, 1);
}
// ga = ga.add16(c[abd * 2 + 1]);
switch(m_sel.abd)
{
case 0: vpaddw(ymm6, ymm4); break;
case 1: vpaddw(ymm6, ymm1); break;
case 2: break;
}
}
else
{
// ga = c[abd * 2 + 1];
switch(m_sel.abd)
{
case 0: break;
case 1: vmovdqa(ymm6, ymm1); break;
case 2: vpxor(ymm6, ymm6); break;
}
}
// ymm4 = src ga
// ymm5 = rb
// ymm6 = ga
// ymm2, ymm3 = used
// ymm0, ymm1, ymm7 = free
if(m_sel.pabe)
{
// ymm0 still holds the pabe mask computed before the ga pass
vpsrld(ymm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
// ga = c[1].blend8(ga, mask).mix16(c[1]);
vpblendvb(ymm6, ymm4, ymm6, ymm0);
}
else
{
if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
{
// ymm4 = source ga, ymm7 = extra register for mix16 (presumably scratch)
mix16(ymm6, ymm4, ymm7);
}
}
}
// Emits the final colour store: optional dithering, colour wrap when
// colclamp is off, repacking of rb (ymm5) / ga (ymm6) into pixels, optional
// conversion for fpsm 2, merge with the destination under fm, and the
// actual write through WritePixel.
//
// Register usage in the emitted code:
//   in:  ymm5/ymm6 = rb/ga, ymm2 = fd, ymm3 = fm, rbx = address, edx = mask
//   ymm4, ymm6, ymm7, eax, ebp = scratch; ymm0 = WritePixel temporary
// Emits nothing when the frame is not written.
void GSDrawScanlineCodeGenerator::WriteFrame()
{
if(!m_sel.fwrite)
{
return;
}
if(m_sel.fpsm == 2 && m_sel.dthe)
{
// eax = (dword at [esp + _top] & 3) << 5: byte offset of one 32-byte row in dimx;
// the first GSVector4i of the row is added to rb, the second to ga.
// NOTE(review): uses 32-bit esp/ebp addressing in a file built only for
// 64-bit targets - confirm this path has been ported.
mov(eax, ptr[esp + _top]);
and(eax, 3);
shl(eax, 5);
mov(ebp, ptr[&m_local.gd->dimx]);
vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 0]);
vpaddw(ymm5, ymm7);
vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 1]);
vpaddw(ymm6, ymm7);
}
if(m_sel.colclamp == 0)
{
// c[0] &= 0x00ff00ff;
// c[1] &= 0x00ff00ff;
// all ones shifted right by 8 per word gives 0x00ff in every word
vpcmpeqd(ymm7, ymm7);
vpsrlw(ymm7, 8);
vpand(ymm5, ymm7);
vpand(ymm6, ymm7);
}
// GSVector8i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
vpunpckhwd(ymm7, ymm5, ymm6);
vpunpcklwd(ymm5, ymm6);
vpackuswb(ymm5, ymm7);
if(m_sel.fba && m_sel.fpsm != 1)
{
// fs |= 0x80000000;
vpcmpeqd(ymm7, ymm7);
vpslld(ymm7, 31);
vpor(ymm5, ymm7);
}
if(m_sel.fpsm == 2)
{
// GSVector8i rb = fs & 0x00f800f8;
// GSVector8i ga = fs & 0x8000f800;
// ymm6/ymm7 are reused as mask registers from here on
mov(eax, 0x00f800f8);
vmovd(xmm6, eax);
vpbroadcastd(ymm6, xmm6);
mov(eax, 0x8000f800);
vmovd(xmm7, eax);
vpbroadcastd(ymm7, xmm7);
vpand(ymm4, ymm5, ymm6);
vpand(ymm5, ymm7);
// ymm4 = rb, ymm5 = ga
// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
vpsrld(ymm6, ymm4, 9);
vpsrld(ymm4, 3);
vpsrld(ymm7, ymm5, 16);
vpsrld(ymm5, 6);
vpor(ymm5, ymm4);
vpor(ymm7, ymm6);
vpor(ymm5, ymm7);
}
if(m_sel.rfb)
{
// fs = fs.blend(fd, fm);
blend(ymm5, ymm2, ymm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
}
// The qword store path is taken when the destination was read and merged above and
// fpsm < 2, or when nothing was read, fpsm is 0 and no test can mask pixels.
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(ymm5, ymm0, rbx, edx, fast, m_sel.fpsm, 0);
}
void GSDrawScanlineCodeGenerator::ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr)
{
vmovq(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm]);
vmovhps(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]);
vmovq(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]);
vmovhps(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2]);
vinserti128(dst, dst, Xmm(temp.getIdx()), 1);
/*
vmovdqu(dst, ptr[addr * 2 + (size_t)m_local.gd->vm]);
vmovdqu(temp, ptr[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]);
vpunpcklqdq(dst, dst, temp);
vpermq(dst, dst, _MM_SHUFFLE(3, 1, 2, 0));
*/
}
void GSDrawScanlineCodeGenerator::WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz)
{
Xmm src1 = Xmm(src.getIdx());
Xmm src2 = Xmm(temp.getIdx());
vextracti128(src2, src, 1);
if(m_sel.notest)
{
if(fast)
{
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1);
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1);
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2);
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2);
}
else
{
WritePixel(src1, addr, 0, 0, psm);
WritePixel(src1, addr, 1, 1, psm);
WritePixel(src1, addr, 2, 2, psm);
WritePixel(src1, addr, 3, 3, psm);
WritePixel(src2, addr, 4, 0, psm);
WritePixel(src2, addr, 5, 1, psm);
WritePixel(src2, addr, 6, 2, psm);
WritePixel(src2, addr, 7, 3, psm);
}
}
else
{
// cascade tests?
if(fast)
{
test(mask, 0x0000000f << (fz * 8));
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1);
L("@@");
test(mask, 0x000000f0 << (fz * 8));
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1);
L("@@");
test(mask, 0x000f0000 << (fz * 8));
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2);
L("@@");
test(mask, 0x00f00000 << (fz * 8));
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2);
L("@@");
// vmaskmovps?
}
else
{
test(mask, 0x00000003 << (fz * 8));
je("@f");
WritePixel(src1, addr, 0, 0, psm);
L("@@");
test(mask, 0x0000000c << (fz * 8));
je("@f");
WritePixel(src1, addr, 1, 1, psm);
L("@@");
test(mask, 0x00000030 << (fz * 8));
je("@f");
WritePixel(src1, addr, 2, 2, psm);
L("@@");
test(mask, 0x000000c0 << (fz * 8));
je("@f");
WritePixel(src1, addr, 3, 3, psm);
L("@@");
test(mask, 0x00030000 << (fz * 8));
je("@f");
WritePixel(src2, addr, 4, 0, psm);
L("@@");
test(mask, 0x000c0000 << (fz * 8));
je("@f");
WritePixel(src2, addr, 5, 1, psm);
L("@@");
test(mask, 0x00300000 << (fz * 8));
je("@f");
WritePixel(src2, addr, 6, 2, psm);
L("@@");
test(mask, 0x00c00000 << (fz * 8));
je("@f");
WritePixel(src2, addr, 7, 3, psm);
L("@@");
}
}
}
// Displacement table for the single pixel writer, indexed by pixel number 0..7.
// WritePixel(const Xmm&, ...) doubles the entry and adds it to addr * 2 + vm, so the
// pairs (0, 2), (8, 10), (16, 18), (24, 26) fall inside the four qwords that the
// fast path stores at byte displacements 0, 8 * 2, 16 * 2 and 24 * 2.
static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26};
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm)
{
	// Emits the store of one pixel.
	//
	//   src   register holding the pixel in 32-bit element j
	//   addr  address register, combined as addr * 2 + vm
	//   i     pixel number, selects the displacement from s_offsets
	//   psm   0: store the whole dword
	//         1: replace only the low 24 bits of the destination dword
	//         2: store the low 16 bits
	//
	// eax is clobbered for psm 1 and 2.

	Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2];

	if(psm == 0)
	{
		if(j != 0) vpextrd(dst, src, j);
		else vmovd(dst, src);

		return;
	}

	if(psm == 1)
	{
		if(j != 0) vpextrd(eax, src, j);
		else vmovd(eax, src);

		// dst ^= (src ^ dst) & 0xffffff: low 24 bits from src, top byte kept

		xor(eax, dst);
		and(eax, 0xffffff);
		xor(dst, eax);

		return;
	}

	if(psm == 2)
	{
		// word j * 2 is the low half of dword j

		if(j != 0) vpextrw(eax, src, j * 2);
		else vmovd(eax, src);

		mov(dst, ax);
	}
}
void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{
// in
// ymm5 = addr00
// ymm2 = addr01
// ymm0 = addr10
// ymm3 = addr11
// ebx = m_local.tex[0] (!m_sel.mmin)
// ebp = m_local.tex (m_sel.mmin)
// edx = m_local.clut (m_sel.tlu)
// out
// ymm6 = c00
// ymm4 = c01
// ymm1 = c10
// ymm5 = c11
ASSERT(pixels == 1 || pixels == 4);
mip_offset *= sizeof(void*);
const GSVector8i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i;
if(m_sel.mmin && !m_sel.lcm)
{
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
const int t[] = {1, 4, 5, 1, 2, 5, 0, 2};
for(int i = 0; i < pixels; i++)
{
Ymm src = Ymm(r[i * 2 + 0]);
Ymm dst = Ymm(r[i * 2 + 1]);
Ymm t1 = Ymm(t[i * 2 + 0]);
Ymm t2 = Ymm(t[i * 2 + 1]);
vextracti128(Xmm(t1.getIdx()), src, 1);
for(uint8 j = 0; j < 4; j++)
{
mov(ebx, ptr[&lod_i->u32[j + 0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(dst, src, j);
mov(ebx, ptr[&lod_i->u32[j + 4]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
ReadTexel(t2, t1, j);
}
vinserti128(dst, dst, Xmm(t2.getIdx()), 1);
}
}
else
{
const int r[] = {5, 6, 2, 4, 0, 1, 3, 5};
const int t[] = {1, 4, 5, 1, 2, 5, 0, 2};
if(m_sel.mmin && m_sel.lcm)
{
mov(ebx, ptr[&lod_i->u32[0]]);
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
}
for(int i = 0; i < pixels; i++)
{
Ymm src = Ymm(r[i * 2 + 0]);
Ymm dst = Ymm(r[i * 2 + 1]);
Ymm t1 = Ymm(t[i * 2 + 0]);
Ymm t2 = Ymm(t[i * 2 + 1]);
if(!m_sel.tlu)
{
vpcmpeqd(t1, t1);
vpgatherdd(dst, ptr[ebx + src * 4], t1);
}
else
{
vextracti128(Xmm(t1.getIdx()), src, 1);
for(uint8 j = 0; j < 4; j++)
{
ReadTexel(dst, src, j);
ReadTexel(t2, t1, j);
}
vinserti128(dst, dst, Xmm(t2.getIdx()), 1);
/*
vpcmpeqd(t1, t1);
vpgatherdd(t2, ptr[ebx + src * 1], t1); // either this 1x scale, or the latency of two dependendent gathers are too slow
vpslld(t2, 24);
vpsrld(t2, 24);
vpcmpeqd(t1, t1);
vpgatherdd(dst, ptr[edx + t2 * 4], t1);
*/
}
}
}
}
void GSDrawScanlineCodeGenerator::ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i)
{
ASSERT(i < 4);
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
if(i == 0) vmovd(eax, Xmm(addr.getIdx()));
else vpextrd(eax, Xmm(addr.getIdx()), i);
if(m_sel.tlu) movzx(eax, byte[ebx + eax]);
if(i == 0) vmovd(Xmm(dst.getIdx()), src);
else vpinsrd(Xmm(dst.getIdx()), src, i);
}
#endif