GS: Remove 32bit code from SW renderer

This commit is contained in:
refractionpcsx2 2022-03-19 22:16:44 +00:00
parent ccd86a242c
commit f5afbfd4f5
5 changed files with 175 additions and 479 deletions

View File

@ -30,22 +30,8 @@ using namespace Xbyak;
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
#define _64_m_local__gd__tex r14
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_global(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
/// Executes the given code only if targeting 32-bit
#define ONLY32(code) if (is32) (code)
/// Executes the given code only if targeting 64-bit
#define ONLY64(code) if (is64) (code)
/// Combines temporary with either dst64 on 64-bit or src32 on 32-bit
/// Follow up with an ONLY32 save back to src32
#define REG_64_MEM_32(operation, dst64, temporary, src32) \
if (is32) \
operation(temporary, src32); \
else \
operation(dst64, temporary)
#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
/// On AVX, does a v-prefixed separate destination operation
/// On SSE, moves src1 into dst using movdqa, then does the operation
@ -67,14 +53,7 @@ using namespace Xbyak;
#define MOVE_IF_64(operation, dst, src64, ...) \
do \
{ \
if (is64) \
{ \
THREEARG(operation, dst, src64, __VA_ARGS__); \
} \
else \
{ \
operation(dst, __VA_ARGS__); \
} \
THREEARG(operation, dst, src64, __VA_ARGS__); \
} while (0)
#define USING_XMM DRAW_SCANLINE_USING_XMM
@ -105,21 +84,21 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
, m_local(*(GSScanlineLocalData*)param)
, m_rip(false)
#ifdef _WIN32
, a0(rcx) , a1(rdx)
, a2(r8) , a3(is64 ? r9 : rbx)
, t0(rdi) , t1(rsi)
, t2(is64 ? r8 : rbp), t3(r9)
, a0(rcx), a1(rdx)
, a2(r8) , a3(r9)
, t0(rdi), t1(rsi)
, t2(r8) , t3(r9)
#else
, a0(is64 ? rdi : rcx), a1(is64 ? rsi : rdx)
, a2(is64 ? rdx : r8), a3(is64 ? rcx : rbx)
, t0(is64 ? r8 : rdi), t1(is64 ? r9 : rsi)
, t2(is64 ? rcx : rbp), t3(is64 ? rsi : r8)
, a0(rdi), a1(rsi)
, a2(rdx), a3(rcx)
, t0(r8) , t1(r9)
, t2(rcx), t3(rsi)
#endif
, _g_const(chooseLocal(&*g_const, _64_g_const))
, _m_local(chooseLocal(&m_local, _64_m_local))
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(is64 ? xym15 : xym7)
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
{
m_sel.key = key;
@ -132,8 +111,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr)
{
if (is64)
mov(reg, (size_t)addr);
mov(reg, (size_t)addr);
return choose3264((size_t)addr, reg);
}
@ -352,51 +330,41 @@ void GSDrawScanlineCodeGenerator2::Generate()
m_rip &= (size_t)&m_local < 0x80000000;
m_rip &= (size_t)&m_local.gd < 0x80000000;
if (is32)
{
push(rbx);
push(rsi);
push(rdi);
push(rbp);
}
else
{
push(rbp);
mov(rbp, rsp); // Stack traces look much nicer this way
push(rbp);
mov(rbp, rsp); // Stack traces look much nicer this way
#ifdef _WIN32
push(rbx);
push(rsi);
push(rdi);
push(r12);
push(r13);
push(r14);
push(rbx);
push(rsi);
push(rdi);
push(r12);
push(r13);
push(r14);
sub(rsp, _64_win_stack_size);
sub(rsp, _64_win_stack_size);
for (int i = 0; i < 10; i++)
{
movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6));
}
#else
mov(ptr[rsp + _64_rz_rbx], rbx);
if (!m_rip)
{
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
}
mov(ptr[rsp + _64_rz_r14], r14);
mov(ptr[rsp + _64_rz_r15], r15);
#endif
mov(_64_g_const, (size_t)&*g_const);
if (!m_rip)
{
mov(_64_m_local, (size_t)&m_local);
mov(_64_m_local__gd, _rip_local(gd));
}
if (need_clut)
mov(_64_m_local__gd__clut, _rip_global(clut));
for (int i = 0; i < 10; i++)
{
movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6));
}
#else
mov(ptr[rsp + _64_rz_rbx], rbx);
if (!m_rip)
{
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
}
mov(ptr[rsp + _64_rz_r14], r14);
mov(ptr[rsp + _64_rz_r15], r15);
#endif
mov(_64_g_const, (size_t)&*g_const);
if (!m_rip)
{
mov(_64_m_local, (size_t)&m_local);
mov(_64_m_local__gd, _rip_local(gd));
}
if (need_clut)
mov(_64_m_local__gd__clut, _rip_global(clut));
Init();
@ -615,47 +583,33 @@ L("step");
L("exit");
if (is32)
{
pop(ebp);
pop(edi);
pop(esi);
pop(ebx);
ret(8);
}
else
{
#ifdef _WIN32
for (int i = 0; i < 10; i++)
{
movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]);
}
add(rsp, _64_win_stack_size);
pop(r14);
pop(r13);
pop(r12);
pop(rdi);
pop(rsi);
pop(rbx);
#else
mov(rbx, ptr[rsp + _64_rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
}
mov(r14, ptr[rsp + _64_rz_r14]);
mov(r15, ptr[rsp + _64_rz_r15]);
#endif
pop(rbp);
if (isYmm)
vzeroupper();
ret();
for (int i = 0; i < 10; i++)
{
movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]);
}
add(rsp, _64_win_stack_size);
pop(r14);
pop(r13);
pop(r12);
pop(rdi);
pop(rsi);
pop(rbx);
#else
mov(rbx, ptr[rsp + _64_rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
}
mov(r14, ptr[rsp + _64_rz_r14]);
mov(r15, ptr[rsp + _64_rz_r15]);
#endif
pop(rbp);
if (isYmm)
vzeroupper();
ret();
}
/// Inputs: a0=pixels, a1=left, a2[x64]=top, a3[x64]=v
@ -683,7 +637,7 @@ void GSDrawScanlineCodeGenerator2::Init()
and(eax, a0.cvt32());
if (isXmm)
shl(eax, 4); // * sizeof(m_test[0])
ONLY64(cdqe());
cdqe();
if (isXmm)
{
@ -713,43 +667,19 @@ void GSDrawScanlineCodeGenerator2::Init()
// rbx = left
// Free: rax, t0, t1
if (is64)
{
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
mov(rax, _rip_global(fzbr));
lea(t1, ptr[rax + a2 * 8]);
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
mov(rax, _rip_global(fzbr));
lea(t1, ptr[rax + a2 * 8]);
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
mov(rax, _rip_global(fzbc));
lea(t0, ptr[rax + rbx * 2]);
}
else
{
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
mov(t1, ptr[rsp + _top]);
lea(t1, ptr[t1 * 8]);
add(t1, ptr[&m_local.gd->fzbr]);
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
lea(t0, ptr[rbx * 2]);
add(t0, ptr[(size_t)&m_local.gd->fzbc]);
}
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
mov(rax, _rip_global(fzbc));
lea(t0, ptr[rax + rbx * 2]);
if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
{
// a1 = &m_local.d[skip] // note a1 was (skip << 4)
if (is64)
{
lea(rax, _rip_local(d));
lea(a1, ptr[rax + a1 * 8]);
}
else
{
lea(a1, ptr[(size_t)m_local.d + a1 * 8]);
// a3 starts on the stack in x86, we want it in a register
mov(a3, ptr[rsp + _v]);
}
lea(rax, _rip_local(d));
lea(a1, ptr[rax + a1 * 8]);
}
// a0 = steps (rcx | rdi)
@ -760,8 +690,8 @@ void GSDrawScanlineCodeGenerator2::Init()
// t1 = fza_base (rsi | r9 )
// Free: rax
const XYm& f = is64 ? _f : xym1;
const XYm& z = is64 ? _z : xym0;
const XYm& f = _f;
const XYm& z = _z;
if (m_sel.prim != GS_SPRITE_CLASS)
{
@ -777,9 +707,6 @@ void GSDrawScanlineCodeGenerator2::Init()
pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
if (is32) // _f is shared on x86
movdqa(ptr[&m_local.temp.f], f);
}
if (m_sel.zb)
@ -788,17 +715,7 @@ void GSDrawScanlineCodeGenerator2::Init()
{
// z = vp.zzzz() + m_local.d[skip].z;
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
if (is64)
{
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
}
else
{
movaps(ptr[&m_local.temp.z], z);
movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
movaps(ptr[&m_local.temp.zo], xym2);
addps(z, xym2);
}
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
}
else
pbroadcastdLocal(z, _rip_local(p.z));
@ -812,7 +729,7 @@ void GSDrawScanlineCodeGenerator2::Init()
pbroadcastdLocal(z, _rip_local(p.z));
}
if (m_sel.fwrite && m_sel.fge && is64)
if (m_sel.fwrite && m_sel.fge)
pbroadcastwLocal(_f, _rip_local(p.f));
}
@ -840,8 +757,8 @@ void GSDrawScanlineCodeGenerator2::Init()
{
// a1 = &m_local.d[skip]
const XYm& s = is64 ? _s : xym2;
const XYm& t = is64 ? _t : xym3;
const XYm& s = _s;
const XYm& t = _t;
if (m_sel.fst)
{
@ -863,19 +780,16 @@ void GSDrawScanlineCodeGenerator2::Init()
}
else if (m_sel.ltf)
{
XYm vf = is64 ? xym7 : xym6;
XYm vf = xym7;
pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12);
movdqa(_rip_local(temp.vf), vf);
}
ONLY32(movdqa(_rip_local(temp.s), s));
ONLY32(movdqa(_rip_local(temp.t), t));
}
else
{
const XYm& q = is64 ? _q : vt;
const XYm& q = _q;
// s = vt.xxxx() + m_local.d[skip].s;
// t = vt.yyyy() + m_local.d[skip].t;
@ -891,7 +805,7 @@ void GSDrawScanlineCodeGenerator2::Init()
{
movaps(s, vt);
movaps(t, vt);
ONLY64(movaps(q, vt));
movaps(q, vt);
shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0));
shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1));
@ -901,20 +815,13 @@ void GSDrawScanlineCodeGenerator2::Init()
addps(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]);
addps(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]);
addps(q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]);
if (is32)
{
movaps(ptr[&m_local.temp.s], s);
movaps(ptr[&m_local.temp.t], t);
movaps(ptr[&m_local.temp.q], q);
}
}
}
if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
{
const XYm& f_rb = is64 ? _f_rb : xym5;
const XYm& f_ga = is64 ? _f_ga : xym6;
const XYm& f_rb = _f_rb;
const XYm& f_ga = _f_ga;
if (m_sel.iip)
{
// GSVector4i vc = GSVector4i(v.c);
@ -942,38 +849,32 @@ void GSDrawScanlineCodeGenerator2::Init()
paddw(f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]);
paddw(f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]);
ONLY32(movdqa(ptr[&m_local.temp.rb], f_rb));
ONLY32(movdqa(ptr[&m_local.temp.ga], f_ga));
}
else if (is64 || m_sel.tfx == TFX_NONE)
else
{
movdqa(f_rb, _rip_local(c.rb));
movdqa(f_ga, _rip_local(c.ga));
}
ONLY64(movdqa(_rb, _f_rb));
ONLY64(movdqa(_ga, _f_ga));
movdqa(_rb, _f_rb);
movdqa(_ga, _f_ga);
}
}
if (is64)
if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe)
{
if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe)
{
// On linux, a2 is edx which will be used for fzm
// In all case, it will require a mov in dthe code, so let's keep the value on the stack
mov(ptr[rsp + _top], a2);
}
// On linux, a2 is edx which will be used for fzm
// In all case, it will require a mov in dthe code, so let's keep the value on the stack
mov(ptr[rsp + _top], a2);
}
mov(_64_m_local__gd__vm, _rip_global(vm));
if (m_sel.fb && m_sel.tfx != TFX_NONE)
{
if (use_lod)
lea(_64_m_local__gd__tex, _rip_global(tex));
else
mov(_64_m_local__gd__tex, _rip_global(tex));
}
mov(_64_m_local__gd__vm, _rip_global(vm));
if (m_sel.fb && m_sel.tfx != TFX_NONE)
{
if (use_lod)
lea(_64_m_local__gd__tex, _rip_global(tex));
else
mov(_64_m_local__gd__tex, _rip_global(tex));
}
}
@ -991,8 +892,8 @@ void GSDrawScanlineCodeGenerator2::Step()
add(t0, vecsize / 2);
const XYm& z = is64 ? _z : xym0;
const XYm& f = is64 ? _f : xym1;
const XYm& z =_z;
const XYm& f =_f;
if (m_sel.prim != GS_SPRITE_CLASS)
{
@ -1004,13 +905,6 @@ void GSDrawScanlineCodeGenerator2::Step()
{
pbroadcastdLocal(z, _rip_local(p.z));
}
else if (is32)
{
broadcastssLocal(z, _rip_local_d_p(z));
addps(z, _rip_local(temp.zo));
movaps(_rip_local(temp.zo), z);
addps(z, _rip_local(temp.z));
}
else
{
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
@ -1021,23 +915,7 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.fwrite && m_sel.fge)
{
if (is32)
{
pbroadcastwLocal(f, _rip_local_d_p(f));
paddw(f, _rip_local(temp.f));
movdqa(_rip_local(temp.f), f);
}
else
{
BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f));
}
}
}
else
{
if (is32 && m_sel.ztest)
{
pbroadcastdLocal(z, _rip_local(p.z));
BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f));
}
}
@ -1047,7 +925,7 @@ void GSDrawScanlineCodeGenerator2::Step()
{
if (m_sel.fst)
{
const XYm& stq = is64 ? xym0 : xym4;
const XYm& stq = xym0;
// GSVector4i stq = m_local.d4.stq;
// s += stq.xxxx();
@ -1055,28 +933,22 @@ void GSDrawScanlineCodeGenerator2::Step()
broadcasti128(stq, _rip_local_d(stq));
XYm s = is64 ? xym1 : xym2;
XYm s = xym1;
pshufd(s, stq, _MM_SHUFFLE(0, 0, 0, 0));
REG_64_MEM_32(paddd, _s, s, _rip_local(temp.s));
ONLY32(movdqa(_rip_local(temp.s), s));
paddd(_s, s);
XYm t = is64 ? xym1 : xym3;
XYm t = xym1;
if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
{
pshufd(t, stq, _MM_SHUFFLE(1, 1, 1, 1));
REG_64_MEM_32(paddd, _t, t, _rip_local(temp.t));
ONLY32(movdqa(_rip_local(temp.t), t));
}
else
{
ONLY32(movdqa(t, _rip_local(temp.t)));
paddd(_t, t);
}
}
else
{
const XYm& s = xym2;
const XYm& t = xym3;
const XYm& q = is64 ? xym1 : xym4;
const XYm& q = xym1;
// GSVector4 stq = m_local.d4.stq;
// s += stq.xxxx();
@ -1102,13 +974,9 @@ void GSDrawScanlineCodeGenerator2::Step()
shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
}
REG_64_MEM_32(addps, _s, s, _rip_local(temp.s));
REG_64_MEM_32(addps, _t, t, _rip_local(temp.t));
REG_64_MEM_32(addps, _q, q, _rip_local(temp.q));
ONLY32(movaps(_rip_local(temp.s), s));
ONLY32(movaps(_rip_local(temp.t), t));
ONLY32(movaps(_rip_local(temp.q), q));
addps(_s, s);
addps(_t, t);
addps(_q, q);
}
}
@ -1116,7 +984,7 @@ void GSDrawScanlineCodeGenerator2::Step()
{
if (m_sel.iip)
{
XYm c = is64 ? xym0 : xym7;
XYm c = xym0;
// GSVector4i c = m_local.d4.c;
// rb = rb.add16(c.xxxx());
@ -1127,29 +995,18 @@ void GSDrawScanlineCodeGenerator2::Step()
pshufd(_rb, c, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(_ga, c, _MM_SHUFFLE(1, 1, 1, 1));
REG_64_MEM_32(paddw, _f_rb, _rb, _rip_local(temp.rb));
REG_64_MEM_32(paddw, _f_ga, _ga, _rip_local(temp.ga));
paddw(_f_rb, _rb);
paddw(_f_ga, _ga);
// FIXME: color may underflow and roll over at the end of the line, if decreasing
pxor(c, c);
pmaxsw(is64 ? _f_rb : _rb, c);
pmaxsw(is64 ? _f_ga : _ga, c);
ONLY32(movdqa(_rip_local(temp.rb), _rb));
ONLY32(movdqa(_rip_local(temp.ga), _ga));
}
else
{
if (m_sel.tfx == TFX_NONE)
{
ONLY32(movdqa(_rb, ptr[&m_local.c.rb]));
ONLY32(movdqa(_ga, ptr[&m_local.c.ga]));
}
pmaxsw(_f_rb, c);
pmaxsw(_f_ga, c);
}
ONLY64(movdqa(_rb, _f_rb));
ONLY64(movdqa(_ga, _f_ga));
movdqa(_rb, _f_rb);
movdqa(_ga, _f_ga);
}
}
@ -1162,7 +1019,7 @@ void GSDrawScanlineCodeGenerator2::Step()
and(eax, a0.cvt32());
if (isXmm)
shl(eax, 4);
ONLY64(cdqe());
cdqe();
#if USING_XMM
movdqa(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]);
@ -1182,7 +1039,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
return;
}
const XYm& z = is64 ? _z : xym0;
const XYm& z = _z;
// int za = fza_base.y + fza_offset->y;
@ -1196,7 +1053,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
{
if (m_sel.zequal)
{
ONLY64(movdqa(xym0, _z));
movdqa(xym0, _z);
}
else if (m_sel.zoverflow)
{
@ -1247,7 +1104,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
}
else
{
ONLY64(movdqa(xym0, _z));
movdqa(xym0, _z);
}
if (m_sel.ztest)
@ -1312,22 +1169,11 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
return;
}
if (is32)
{
mov(ebx, ptr[&m_local.gd->tex[0]]);
if (m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
}
const bool needsMoreRegs = isYmm;
if (!m_sel.fst)
{
rcpps(xym0, is64 ? _q : xym4);
rcpps(xym0, _q);
MOVE_IF_64(mulps, xym2, _s, xym0);
MOVE_IF_64(mulps, xym3, _t, xym0);
@ -1349,21 +1195,19 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
}
else
{
ONLY64(movdqa(xym2, _s));
ONLY64(movdqa(xym3, _t));
movdqa(xym2, _s);
movdqa(xym3, _t);
}
if (m_sel.ltf)
{
const XYm& vf = is64 ? xym7 : xym0;
const XYm& vf = xym7;
// GSVector4i uf = u.xxzzlh().srl16(12);
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
if (m_sel.prim != GS_SPRITE_CLASS)
{
@ -1372,10 +1216,10 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12);
if (is32 || needsMoreRegs)
if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf);
}
else if (is64 && !needsMoreRegs)
else if (!needsMoreRegs)
{
movdqa(vf, _rip_local(temp.vf));
}
@ -1484,8 +1328,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// c10 = addr10.gather32_32((const u32/u8*)tex[, clut]);
// c11 = addr11.gather32_32((const u32/u8*)tex[, clut]);
const XYm& tmp1 = is64 ? xym7 : xym4; // OK to destroy if needsMoreRegs
const XYm& tmp2 = is64 ? xym4 : xym7;
const XYm& tmp1 = xym7; // OK to destroy if needsMoreRegs
const XYm& tmp2 = xym4;
// d0 d1 d2s0 d3s1 s2 s3
ReadTexel4(xym5, xym6, xym0, xym2, xym1, xym3, tmp1, tmp2, mip_offset);
@ -1496,9 +1340,6 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// xym6 = c00
// xym7 = used[x86] vf[x64&&!needsMoreRegs]
if (is32 && needsMoreRegs)
movdqa(xym4, _rip_local(temp.uf));
// GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask;
@ -1565,8 +1406,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// rb00 = rb00.lerp16_4(rb10, vf);
// ga00 = ga00.lerp16_4(ga10, vf);
XYm vf = is64 ? xym7 : xym2;
if (needsMoreRegs || is32)
XYm vf = xym7;
if (needsMoreRegs)
movdqa(vf, _rip_local(temp.vf));
lerp16_4(xym5, xym0, vf);
@ -1729,22 +1570,9 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
return;
}
if (is32)
{
push(t2);
mov(t2, (size_t)m_local.gd->tex);
if (m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
}
const bool needsMoreRegs = isYmm;
if (is64)
movdqa(xym4, _q);
movdqa(xym4, _q);
if (!m_sel.fst)
{
@ -1758,8 +1586,8 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
}
else
{
ONLY64(movdqa(xym2, _s));
ONLY64(movdqa(xym3, _t));
movdqa(xym2, _s);
movdqa(xym3, _t);
}
// xym2 = u
@ -1974,7 +1802,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
if (m_sel.ltf)
{
const XYm& vf = is64 ? xym7 : xym0;
const XYm& vf = xym7;
// u -= 0x8000;
// v -= 0x8000;
@ -1989,15 +1817,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
// GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12);
if (is32 || needsMoreRegs)
if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf);
}
@ -2058,7 +1884,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
if (m_sel.ltf)
{
const XYm& vf = is64 ? xym7 : xym0;
const XYm& vf = xym7;
// u -= 0x8000;
// v -= 0x8000;
@ -2073,15 +1899,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
// GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12);
if (is32 || needsMoreRegs)
if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf);
}
@ -2131,9 +1955,6 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
lerp16(xym5, xym2, xym0, 0);
lerp16(xym6, xym3, xym0, 0);
}
if (is32)
pop(t2);
}
void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv)
@ -2264,9 +2085,9 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
return;
}
const XYm& f_ga = is64 ? _f_ga : xym4;
const XYm& tmpga = is64 ? xym1 : f_ga;
const XYm& tmp = is64 ? xym0 : xym3;
const XYm& f_ga = _f_ga;
const XYm& tmpga = xym1;
const XYm& tmp = xym0;
Address _32_gaptr = m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga);
switch (m_sel.tfx)
@ -2274,9 +2095,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
case TFX_MODULATE:
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
// gat = gat.modulate16<1>(ga).clamp8();
modulate16(_ga, f_ga, 1);
@ -2301,8 +2119,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
{
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
MOVE_IF_64(psrlw, tmpga, f_ga, 7);
mix16(_ga, tmpga, tmp);
@ -2313,10 +2129,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
case TFX_HIGHLIGHT:
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
ONLY32(movdqa(xym2, f_ga)); // WHY
// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
MOVE_IF_64(psrlw, tmpga, f_ga, 7);
@ -2338,9 +2150,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
{
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
ONLY32(movdqa(xym2, f_ga));
MOVE_IF_64(psrlw, tmpga, f_ga, 7);
mix16(_ga, tmpga, tmp);
@ -2505,16 +2314,13 @@ void GSDrawScanlineCodeGenerator2::ColorTFX()
return;
}
const XYm& f_ga = is64 ? _f_ga : xym2;
const XYm& tmpga = is64 ? xym2 : f_ga;
const XYm& f_ga = _f_ga;
const XYm& tmpga = xym2;
auto modulate16_1_rb = [&]
{
// GSVector4i rb = iip ? rbf : m_local.c.rb;
if (is64)
modulate16(_rb, _f_rb, 1);
else
modulate16(_rb, m_sel.iip ? _rip_local(temp.rb) : _rip_local(c.rb), 1);
modulate16(_rb, _f_rb, 1);
};
switch (m_sel.tfx)
@ -2537,14 +2343,6 @@ void GSDrawScanlineCodeGenerator2::ColorTFX()
case TFX_HIGHLIGHT:
case TFX_HIGHLIGHT2:
if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
{
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga)));
}
// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
movdqa(xym1, _ga);
@ -2593,21 +2391,12 @@ void GSDrawScanlineCodeGenerator2::Fog()
return;
}
const XYm& f = is64 ? _f : xym0;
const XYm& tmp = is64 ? xym0 : xym2;
const XYm& f = _f;
const XYm& tmp = xym0;
// rb = m_local.gd->frb.lerp16<0>(rb, f);
// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
if (m_sel.prim != GS_SPRITE_CLASS)
{
ONLY32(movdqa(f, _rip_local(temp.f)));
}
else
{
ONLY32(pbroadcastwLocal(f, _rip_local(p.f)));
}
movdqa(xym1, _ga);
pbroadcastdLocal(tmp, _rip_global(frb));
@ -3060,7 +2849,7 @@ void GSDrawScanlineCodeGenerator2::WriteFrame()
}
const XYm& tmp = is64 ? xym15 : xym7;
const XYm& tmp = xym15;
if (m_sel.fpsm == 2 && m_sel.dthe)
{
@ -3379,7 +3168,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(
void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset)
{
AddressReg texIn = is64 ? _64_m_local__gd__tex : t2;
AddressReg texIn = _64_m_local__gd__tex;
Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]);
mov(ebx, lod_addr);
mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]);
@ -3397,7 +3186,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplYmm(
const Ymm t1[] = { d1, d2s0, d3s1, s2 };
const Ymm t2[] = { tmp, tmp, tmp, tmp };
bool texInRBX = is32;
bool texInRBX = false;
if (use_lod && m_sel.lcm)
{
ReadTexelImplLoadTexLOD(0, mip_offset);
@ -3489,7 +3278,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplSSE4(
else
{
bool preserve = false;
bool texInRBX = is32;
bool texInRBX = false;
if (use_lod && m_sel.lcm)
{
@ -3511,7 +3300,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(const Xmm& dst, const Xmm& addr
{
ASSERT(i < 4);
AddressReg clut = is64 ? _64_m_local__gd__clut : rdx;
AddressReg clut = _64_m_local__gd__clut;
AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex;
Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4];

View File

@ -41,7 +41,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
constexpr static int wordsize = is64 ? 8 : 4;
constexpr static int wordsize = 8;
constexpr static int vecsize = isXmm ? 16 : 32;
constexpr static int vecsizelog = isXmm ? 4 : 5;
constexpr static int vecints = vecsize / 4;
@ -67,8 +67,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
constexpr static int _64_rz_r15 = -8 * 5;
constexpr static int _64_top = -8 * 6;
#endif
constexpr static int _top = is64 ? _64_top : _32_args + 4;
constexpr static int _v = is64 ? _invalid : _32_args + 8;
constexpr static int _top = _64_top;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;

View File

@ -100,30 +100,6 @@ public:
};
private:
/// Make sure the register is okay to use
void validateRegister(const Operand& op)
{
if (is64)
return;
if (op.isREG() && (op.isExtIdx() || op.isExt8bit()))
throw Error(Error::ERR_64_BIT_REG_IN_32);
if (op.isMEM())
{
auto e = static_cast<const Address&>(op).getRegExp();
validateRegister(e.getIndex());
validateRegister(e.getBase());
}
}
/// For easier macro-ing
void validateRegister(int imm)
{
}
void require64()
{
if (!is64)
throw Error(Error::ERR_64_INSTR_IN_32);
}
void requireAVX()
{
if (!hasAVX)
@ -133,9 +109,6 @@ private:
public:
Xbyak::CodeGenerator& actual;
#if defined(_M_X86_64)
constexpr static bool is32 = false;
constexpr static bool is64 = true;
using AddressReg = Xbyak::Reg64;
using RipType = Xbyak::RegRip;
@ -144,18 +117,6 @@ public:
template <typename T32, typename T64>
static T64 choose3264(T32 t32, T64 t64) { return t64; }
#else
constexpr static bool is32 = true;
constexpr static bool is64 = false;
using AddressReg = Xbyak::Reg32;
using RipType = int;
template <typename T32, typename T64>
struct Choose3264 { using type = T32; };
template <typename T32, typename T64>
static T32 choose3264(T32 t32, T64 t64) { return t32; }
#endif
const bool hasAVX, hasAVX2, hasFMA;
@ -238,34 +199,24 @@ public:
#define FORWARD1(category, name, type) \
void name(type a) \
{ \
validateRegister(a); \
ACTUAL_FORWARD_##category(name, a) \
}
#define FORWARD2(category, name, type1, type2) \
void name(type1 a, type2 b) \
{ \
validateRegister(a); \
validateRegister(b); \
ACTUAL_FORWARD_##category(name, a, b) \
}
#define FORWARD3(category, name, type1, type2, type3) \
void name(type1 a, type2 b, type3 c) \
{ \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
ACTUAL_FORWARD_##category(name, a, b, c) \
}
#define FORWARD4(category, name, type1, type2, type3, type4) \
void name(type1 a, type2 b, type3 c, type4 d) \
{ \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
validateRegister(d); \
ACTUAL_FORWARD_##category(name, a, b, c, d) \
}
@ -282,8 +233,6 @@ public:
#define FORWARD_SSE_XMM0(name) \
void name(const Xmm& a, const Operand& b) \
{ \
validateRegister(a); \
validateRegister(b); \
if (hasAVX) \
actual.v##name(a, b, Xmm(0)); \
else \
@ -326,19 +275,12 @@ public:
#define ARGS_XOI const Xmm&, const Operand&, u8
#define ARGS_XXO const Xmm&, const Xmm&, const Operand&
// For instructions that are ifdef'd out without XBYAK64
#ifdef XBYAK64
#define REQUIRE64(action) require64(); action
#else
#define REQUIRE64(action) require64()
#endif
const u8 *getCurr() { return actual.getCurr(); }
void align(int x = 16) { return actual.align(x); }
void db(int code) { actual.db(code); }
void L(const std::string& label) { actual.L(label); }
void cdqe() { REQUIRE64(actual.cdqe()); }
void cdqe() { actual.cdqe(); }
void ret(int imm = 0) { actual.ret(imm); }
void vzeroupper() { requireAVX(); actual.vzeroupper(); }
void vzeroall() { requireAVX(); actual.vzeroall(); }
@ -458,7 +400,6 @@ public:
FORWARD(3, AVX2, vpsravd, ARGS_XXO)
FORWARD(3, AVX2, vpsrlvd, ARGS_XXO)
#undef REQUIRE64
#undef ARGS_OI
#undef ARGS_OO
#undef ARGS_XI

View File

@ -19,7 +19,7 @@
using namespace Xbyak;
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _64_m_local _64_t0
@ -53,15 +53,15 @@ GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base,
, m_rip(false), many_regs(false)
// On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap
#ifdef _WIN32
, _64_vertex(is64 ? rcx : r8)
, _index(is64 ? rdx : rcx)
, _dscan(is64 ? r8 : rdx)
, _64_t0(r9), t1(is64 ? r10 : rcx)
, _64_vertex(rcx)
, _index(rdx)
, _dscan(r8)
, _64_t0(r9), t1(r10)
#else
, _64_vertex(is64 ? rdi : r8)
, _index(is64 ? rsi : rcx)
, _64_vertex(rdi)
, _index(rsi)
, _dscan(rdx)
, _64_t0(is64 ? rcx : r8), t1(is64 ? r8 : rcx)
, _64_t0(rcx), t1(r8)
#endif
, _m_local(chooseLocal(&m_local, _64_m_local))
{
@ -88,7 +88,7 @@ void GSSetupPrimCodeGenerator2::Generate()
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip;
many_regs = is64 && isYmm && !m_sel.notest && needs_shift;
many_regs = isYmm && !m_sel.notest && needs_shift;
#ifdef _WIN64
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
@ -102,13 +102,11 @@ void GSSetupPrimCodeGenerator2::Generate()
}
#endif
if (is64 && !m_rip)
if (!m_rip)
mov(_64_m_local, (size_t)&m_local);
if (needs_shift)
{
if (is32)
mov(_dscan, ptr[rsp + _32_dscan]);
if (isXmm)
mov(rax, (size_t)g_const->m_shift_128b);
@ -193,14 +191,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
add(rax, _64_vertex);
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
@ -231,14 +224,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
{
// GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
add(rax, _64_vertex);
if (m_en.f)
{
@ -285,14 +273,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
add(rax, _64_vertex);
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
mov(_rip_local(p.z), t1.cvt32());
@ -354,14 +337,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
{
// GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
add(rax, _64_vertex);
if (m_en.f)
{
@ -564,14 +542,9 @@ void GSSetupPrimCodeGenerator2::Color()
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
add(rax, _64_vertex);
}
if (isXmm)

View File

@ -46,12 +46,6 @@ class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator
constexpr static int dsize = isXmm ? 4 : 8;
constexpr static int _32_args = 0;
constexpr static int _invalid = 0xaaaaaaaa;
constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4;
constexpr static int _32_index = is64 ? _invalid : _32_args + 8;
constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;