GS: Remove 32bit code from SW renderer

This commit is contained in:
refractionpcsx2 2022-03-19 22:16:44 +00:00
parent ccd86a242c
commit f5afbfd4f5
5 changed files with 175 additions and 479 deletions

View File

@ -30,22 +30,8 @@ using namespace Xbyak;
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0] // If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
#define _64_m_local__gd__tex r14 #define _64_m_local__gd__tex r14
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) #define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_global(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) #define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
/// Executes the given code only if targeting 32-bit
#define ONLY32(code) if (is32) (code)
/// Executes the given code only if targeting 64-bit
#define ONLY64(code) if (is64) (code)
/// Combines temporary with either dst64 on 64-bit or src32 on 32-bit
/// Follow up with an ONLY32 save back to src32
#define REG_64_MEM_32(operation, dst64, temporary, src32) \
if (is32) \
operation(temporary, src32); \
else \
operation(dst64, temporary)
/// On AVX, does a v-prefixed separate destination operation /// On AVX, does a v-prefixed separate destination operation
/// On SSE, moves src1 into dst using movdqa, then does the operation /// On SSE, moves src1 into dst using movdqa, then does the operation
@ -67,14 +53,7 @@ using namespace Xbyak;
#define MOVE_IF_64(operation, dst, src64, ...) \ #define MOVE_IF_64(operation, dst, src64, ...) \
do \ do \
{ \ { \
if (is64) \ THREEARG(operation, dst, src64, __VA_ARGS__); \
{ \
THREEARG(operation, dst, src64, __VA_ARGS__); \
} \
else \
{ \
operation(dst, __VA_ARGS__); \
} \
} while (0) } while (0)
#define USING_XMM DRAW_SCANLINE_USING_XMM #define USING_XMM DRAW_SCANLINE_USING_XMM
@ -105,21 +84,21 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
, m_local(*(GSScanlineLocalData*)param) , m_local(*(GSScanlineLocalData*)param)
, m_rip(false) , m_rip(false)
#ifdef _WIN32 #ifdef _WIN32
, a0(rcx) , a1(rdx) , a0(rcx), a1(rdx)
, a2(r8) , a3(is64 ? r9 : rbx) , a2(r8) , a3(r9)
, t0(rdi) , t1(rsi) , t0(rdi), t1(rsi)
, t2(is64 ? r8 : rbp), t3(r9) , t2(r8) , t3(r9)
#else #else
, a0(is64 ? rdi : rcx), a1(is64 ? rsi : rdx) , a0(rdi), a1(rsi)
, a2(is64 ? rdx : r8), a3(is64 ? rcx : rbx) , a2(rdx), a3(rcx)
, t0(is64 ? r8 : rdi), t1(is64 ? r9 : rsi) , t0(r8) , t1(r9)
, t2(is64 ? rcx : rbp), t3(is64 ? rsi : r8) , t2(rcx), t3(rsi)
#endif #endif
, _g_const(chooseLocal(&*g_const, _64_g_const)) , _g_const(chooseLocal(&*g_const, _64_g_const))
, _m_local(chooseLocal(&m_local, _64_m_local)) , _m_local(chooseLocal(&m_local, _64_m_local))
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(is64 ? xym15 : xym7) , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
{ {
m_sel.key = key; m_sel.key = key;
@ -132,8 +111,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr) GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr)
{ {
if (is64) mov(reg, (size_t)addr);
mov(reg, (size_t)addr);
return choose3264((size_t)addr, reg); return choose3264((size_t)addr, reg);
} }
@ -352,51 +330,41 @@ void GSDrawScanlineCodeGenerator2::Generate()
m_rip &= (size_t)&m_local < 0x80000000; m_rip &= (size_t)&m_local < 0x80000000;
m_rip &= (size_t)&m_local.gd < 0x80000000; m_rip &= (size_t)&m_local.gd < 0x80000000;
if (is32) push(rbp);
{ mov(rbp, rsp); // Stack traces look much nicer this way
push(rbx);
push(rsi);
push(rdi);
push(rbp);
}
else
{
push(rbp);
mov(rbp, rsp); // Stack traces look much nicer this way
#ifdef _WIN32 #ifdef _WIN32
push(rbx); push(rbx);
push(rsi); push(rsi);
push(rdi); push(rdi);
push(r12); push(r12);
push(r13); push(r13);
push(r14); push(r14);
sub(rsp, _64_win_stack_size); sub(rsp, _64_win_stack_size);
for (int i = 0; i < 10; i++) for (int i = 0; i < 10; i++)
{ {
movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6)); movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6));
}
#else
mov(ptr[rsp + _64_rz_rbx], rbx);
if (!m_rip)
{
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
}
mov(ptr[rsp + _64_rz_r14], r14);
mov(ptr[rsp + _64_rz_r15], r15);
#endif
mov(_64_g_const, (size_t)&*g_const);
if (!m_rip)
{
mov(_64_m_local, (size_t)&m_local);
mov(_64_m_local__gd, _rip_local(gd));
}
if (need_clut)
mov(_64_m_local__gd__clut, _rip_global(clut));
} }
#else
mov(ptr[rsp + _64_rz_rbx], rbx);
if (!m_rip)
{
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
}
mov(ptr[rsp + _64_rz_r14], r14);
mov(ptr[rsp + _64_rz_r15], r15);
#endif
mov(_64_g_const, (size_t)&*g_const);
if (!m_rip)
{
mov(_64_m_local, (size_t)&m_local);
mov(_64_m_local__gd, _rip_local(gd));
}
if (need_clut)
mov(_64_m_local__gd__clut, _rip_global(clut));
Init(); Init();
@ -615,47 +583,33 @@ L("step");
L("exit"); L("exit");
if (is32)
{
pop(ebp);
pop(edi);
pop(esi);
pop(ebx);
ret(8);
}
else
{
#ifdef _WIN32 #ifdef _WIN32
for (int i = 0; i < 10; i++) for (int i = 0; i < 10; i++)
{ {
movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]); movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]);
}
add(rsp, _64_win_stack_size);
pop(r14);
pop(r13);
pop(r12);
pop(rdi);
pop(rsi);
pop(rbx);
#else
mov(rbx, ptr[rsp + _64_rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
}
mov(r14, ptr[rsp + _64_rz_r14]);
mov(r15, ptr[rsp + _64_rz_r15]);
#endif
pop(rbp);
if (isYmm)
vzeroupper();
ret();
} }
add(rsp, _64_win_stack_size);
pop(r14);
pop(r13);
pop(r12);
pop(rdi);
pop(rsi);
pop(rbx);
#else
mov(rbx, ptr[rsp + _64_rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
}
mov(r14, ptr[rsp + _64_rz_r14]);
mov(r15, ptr[rsp + _64_rz_r15]);
#endif
pop(rbp);
if (isYmm)
vzeroupper();
ret();
} }
/// Inputs: a0=pixels, a1=left, a2[x64]=top, a3[x64]=v /// Inputs: a0=pixels, a1=left, a2[x64]=top, a3[x64]=v
@ -683,7 +637,7 @@ void GSDrawScanlineCodeGenerator2::Init()
and(eax, a0.cvt32()); and(eax, a0.cvt32());
if (isXmm) if (isXmm)
shl(eax, 4); // * sizeof(m_test[0]) shl(eax, 4); // * sizeof(m_test[0])
ONLY64(cdqe()); cdqe();
if (isXmm) if (isXmm)
{ {
@ -713,43 +667,19 @@ void GSDrawScanlineCodeGenerator2::Init()
// rbx = left // rbx = left
// Free: rax, t0, t1 // Free: rax, t0, t1
if (is64) // GSVector2i* fza_base = &m_local.gd->fzbr[top];
{ mov(rax, _rip_global(fzbr));
// GSVector2i* fza_base = &m_local.gd->fzbr[top]; lea(t1, ptr[rax + a2 * 8]);
mov(rax, _rip_global(fzbr));
lea(t1, ptr[rax + a2 * 8]);
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
mov(rax, _rip_global(fzbc)); mov(rax, _rip_global(fzbc));
lea(t0, ptr[rax + rbx * 2]); lea(t0, ptr[rax + rbx * 2]);
}
else
{
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
mov(t1, ptr[rsp + _top]);
lea(t1, ptr[t1 * 8]);
add(t1, ptr[&m_local.gd->fzbr]);
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
lea(t0, ptr[rbx * 2]);
add(t0, ptr[(size_t)&m_local.gd->fzbc]);
}
if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
{ {
// a1 = &m_local.d[skip] // note a1 was (skip << 4) // a1 = &m_local.d[skip] // note a1 was (skip << 4)
lea(rax, _rip_local(d));
if (is64) lea(a1, ptr[rax + a1 * 8]);
{
lea(rax, _rip_local(d));
lea(a1, ptr[rax + a1 * 8]);
}
else
{
lea(a1, ptr[(size_t)m_local.d + a1 * 8]);
// a3 starts on the stack in x86, we want it in a register
mov(a3, ptr[rsp + _v]);
}
} }
// a0 = steps (rcx | rdi) // a0 = steps (rcx | rdi)
@ -760,8 +690,8 @@ void GSDrawScanlineCodeGenerator2::Init()
// t1 = fza_base (rsi | r9 ) // t1 = fza_base (rsi | r9 )
// Free: rax // Free: rax
const XYm& f = is64 ? _f : xym1; const XYm& f = _f;
const XYm& z = is64 ? _z : xym0; const XYm& z = _z;
if (m_sel.prim != GS_SPRITE_CLASS) if (m_sel.prim != GS_SPRITE_CLASS)
{ {
@ -777,9 +707,6 @@ void GSDrawScanlineCodeGenerator2::Init()
pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2)); pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]); paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
if (is32) // _f is shared on x86
movdqa(ptr[&m_local.temp.f], f);
} }
if (m_sel.zb) if (m_sel.zb)
@ -788,17 +715,7 @@ void GSDrawScanlineCodeGenerator2::Init()
{ {
// z = vp.zzzz() + m_local.d[skip].z; // z = vp.zzzz() + m_local.d[skip].z;
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2)); shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
if (is64) addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
{
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
}
else
{
movaps(ptr[&m_local.temp.z], z);
movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
movaps(ptr[&m_local.temp.zo], xym2);
addps(z, xym2);
}
} }
else else
pbroadcastdLocal(z, _rip_local(p.z)); pbroadcastdLocal(z, _rip_local(p.z));
@ -812,7 +729,7 @@ void GSDrawScanlineCodeGenerator2::Init()
pbroadcastdLocal(z, _rip_local(p.z)); pbroadcastdLocal(z, _rip_local(p.z));
} }
if (m_sel.fwrite && m_sel.fge && is64) if (m_sel.fwrite && m_sel.fge)
pbroadcastwLocal(_f, _rip_local(p.f)); pbroadcastwLocal(_f, _rip_local(p.f));
} }
@ -840,8 +757,8 @@ void GSDrawScanlineCodeGenerator2::Init()
{ {
// a1 = &m_local.d[skip] // a1 = &m_local.d[skip]
const XYm& s = is64 ? _s : xym2; const XYm& s = _s;
const XYm& t = is64 ? _t : xym3; const XYm& t = _t;
if (m_sel.fst) if (m_sel.fst)
{ {
@ -863,19 +780,16 @@ void GSDrawScanlineCodeGenerator2::Init()
} }
else if (m_sel.ltf) else if (m_sel.ltf)
{ {
XYm vf = is64 ? xym7 : xym6; XYm vf = xym7;
pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12); psrlw(vf, 12);
movdqa(_rip_local(temp.vf), vf); movdqa(_rip_local(temp.vf), vf);
} }
ONLY32(movdqa(_rip_local(temp.s), s));
ONLY32(movdqa(_rip_local(temp.t), t));
} }
else else
{ {
const XYm& q = is64 ? _q : vt; const XYm& q = _q;
// s = vt.xxxx() + m_local.d[skip].s; // s = vt.xxxx() + m_local.d[skip].s;
// t = vt.yyyy() + m_local.d[skip].t; // t = vt.yyyy() + m_local.d[skip].t;
@ -891,7 +805,7 @@ void GSDrawScanlineCodeGenerator2::Init()
{ {
movaps(s, vt); movaps(s, vt);
movaps(t, vt); movaps(t, vt);
ONLY64(movaps(q, vt)); movaps(q, vt);
shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0)); shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0));
shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1)); shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1));
@ -901,20 +815,13 @@ void GSDrawScanlineCodeGenerator2::Init()
addps(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); addps(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]);
addps(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); addps(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]);
addps(q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); addps(q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]);
if (is32)
{
movaps(ptr[&m_local.temp.s], s);
movaps(ptr[&m_local.temp.t], t);
movaps(ptr[&m_local.temp.q], q);
}
} }
} }
if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
{ {
const XYm& f_rb = is64 ? _f_rb : xym5; const XYm& f_rb = _f_rb;
const XYm& f_ga = is64 ? _f_ga : xym6; const XYm& f_ga = _f_ga;
if (m_sel.iip) if (m_sel.iip)
{ {
// GSVector4i vc = GSVector4i(v.c); // GSVector4i vc = GSVector4i(v.c);
@ -942,38 +849,32 @@ void GSDrawScanlineCodeGenerator2::Init()
paddw(f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); paddw(f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]);
paddw(f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); paddw(f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]);
ONLY32(movdqa(ptr[&m_local.temp.rb], f_rb));
ONLY32(movdqa(ptr[&m_local.temp.ga], f_ga));
} }
else if (is64 || m_sel.tfx == TFX_NONE) else
{ {
movdqa(f_rb, _rip_local(c.rb)); movdqa(f_rb, _rip_local(c.rb));
movdqa(f_ga, _rip_local(c.ga)); movdqa(f_ga, _rip_local(c.ga));
} }
ONLY64(movdqa(_rb, _f_rb)); movdqa(_rb, _f_rb);
ONLY64(movdqa(_ga, _f_ga)); movdqa(_ga, _f_ga);
} }
} }
if (is64) if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe)
{ {
if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) // On linux, a2 is edx which will be used for fzm
{ // In all case, it will require a mov in dthe code, so let's keep the value on the stack
// On linux, a2 is edx which will be used for fzm mov(ptr[rsp + _top], a2);
// In all case, it will require a mov in dthe code, so let's keep the value on the stack }
mov(ptr[rsp + _top], a2);
}
mov(_64_m_local__gd__vm, _rip_global(vm)); mov(_64_m_local__gd__vm, _rip_global(vm));
if (m_sel.fb && m_sel.tfx != TFX_NONE) if (m_sel.fb && m_sel.tfx != TFX_NONE)
{ {
if (use_lod) if (use_lod)
lea(_64_m_local__gd__tex, _rip_global(tex)); lea(_64_m_local__gd__tex, _rip_global(tex));
else else
mov(_64_m_local__gd__tex, _rip_global(tex)); mov(_64_m_local__gd__tex, _rip_global(tex));
}
} }
} }
@ -991,8 +892,8 @@ void GSDrawScanlineCodeGenerator2::Step()
add(t0, vecsize / 2); add(t0, vecsize / 2);
const XYm& z = is64 ? _z : xym0; const XYm& z =_z;
const XYm& f = is64 ? _f : xym1; const XYm& f =_f;
if (m_sel.prim != GS_SPRITE_CLASS) if (m_sel.prim != GS_SPRITE_CLASS)
{ {
@ -1004,13 +905,6 @@ void GSDrawScanlineCodeGenerator2::Step()
{ {
pbroadcastdLocal(z, _rip_local(p.z)); pbroadcastdLocal(z, _rip_local(p.z));
} }
else if (is32)
{
broadcastssLocal(z, _rip_local_d_p(z));
addps(z, _rip_local(temp.zo));
movaps(_rip_local(temp.zo), z);
addps(z, _rip_local(temp.z));
}
else else
{ {
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z)); BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
@ -1021,23 +915,7 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.fwrite && m_sel.fge) if (m_sel.fwrite && m_sel.fge)
{ {
if (is32) BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f));
{
pbroadcastwLocal(f, _rip_local_d_p(f));
paddw(f, _rip_local(temp.f));
movdqa(_rip_local(temp.f), f);
}
else
{
BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f));
}
}
}
else
{
if (is32 && m_sel.ztest)
{
pbroadcastdLocal(z, _rip_local(p.z));
} }
} }
@ -1047,7 +925,7 @@ void GSDrawScanlineCodeGenerator2::Step()
{ {
if (m_sel.fst) if (m_sel.fst)
{ {
const XYm& stq = is64 ? xym0 : xym4; const XYm& stq = xym0;
// GSVector4i stq = m_local.d4.stq; // GSVector4i stq = m_local.d4.stq;
// s += stq.xxxx(); // s += stq.xxxx();
@ -1055,28 +933,22 @@ void GSDrawScanlineCodeGenerator2::Step()
broadcasti128(stq, _rip_local_d(stq)); broadcasti128(stq, _rip_local_d(stq));
XYm s = is64 ? xym1 : xym2; XYm s = xym1;
pshufd(s, stq, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(s, stq, _MM_SHUFFLE(0, 0, 0, 0));
REG_64_MEM_32(paddd, _s, s, _rip_local(temp.s)); paddd(_s, s);
ONLY32(movdqa(_rip_local(temp.s), s));
XYm t = is64 ? xym1 : xym3; XYm t = xym1;
if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
{ {
pshufd(t, stq, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(t, stq, _MM_SHUFFLE(1, 1, 1, 1));
REG_64_MEM_32(paddd, _t, t, _rip_local(temp.t)); paddd(_t, t);
ONLY32(movdqa(_rip_local(temp.t), t));
}
else
{
ONLY32(movdqa(t, _rip_local(temp.t)));
} }
} }
else else
{ {
const XYm& s = xym2; const XYm& s = xym2;
const XYm& t = xym3; const XYm& t = xym3;
const XYm& q = is64 ? xym1 : xym4; const XYm& q = xym1;
// GSVector4 stq = m_local.d4.stq; // GSVector4 stq = m_local.d4.stq;
// s += stq.xxxx(); // s += stq.xxxx();
@ -1102,13 +974,9 @@ void GSDrawScanlineCodeGenerator2::Step()
shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2)); shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2));
} }
REG_64_MEM_32(addps, _s, s, _rip_local(temp.s)); addps(_s, s);
REG_64_MEM_32(addps, _t, t, _rip_local(temp.t)); addps(_t, t);
REG_64_MEM_32(addps, _q, q, _rip_local(temp.q)); addps(_q, q);
ONLY32(movaps(_rip_local(temp.s), s));
ONLY32(movaps(_rip_local(temp.t), t));
ONLY32(movaps(_rip_local(temp.q), q));
} }
} }
@ -1116,7 +984,7 @@ void GSDrawScanlineCodeGenerator2::Step()
{ {
if (m_sel.iip) if (m_sel.iip)
{ {
XYm c = is64 ? xym0 : xym7; XYm c = xym0;
// GSVector4i c = m_local.d4.c; // GSVector4i c = m_local.d4.c;
// rb = rb.add16(c.xxxx()); // rb = rb.add16(c.xxxx());
@ -1127,29 +995,18 @@ void GSDrawScanlineCodeGenerator2::Step()
pshufd(_rb, c, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(_rb, c, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(_ga, c, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(_ga, c, _MM_SHUFFLE(1, 1, 1, 1));
REG_64_MEM_32(paddw, _f_rb, _rb, _rip_local(temp.rb)); paddw(_f_rb, _rb);
REG_64_MEM_32(paddw, _f_ga, _ga, _rip_local(temp.ga)); paddw(_f_ga, _ga);
// FIXME: color may underflow and roll over at the end of the line, if decreasing // FIXME: color may underflow and roll over at the end of the line, if decreasing
pxor(c, c); pxor(c, c);
pmaxsw(is64 ? _f_rb : _rb, c); pmaxsw(_f_rb, c);
pmaxsw(is64 ? _f_ga : _ga, c); pmaxsw(_f_ga, c);
ONLY32(movdqa(_rip_local(temp.rb), _rb));
ONLY32(movdqa(_rip_local(temp.ga), _ga));
}
else
{
if (m_sel.tfx == TFX_NONE)
{
ONLY32(movdqa(_rb, ptr[&m_local.c.rb]));
ONLY32(movdqa(_ga, ptr[&m_local.c.ga]));
}
} }
ONLY64(movdqa(_rb, _f_rb)); movdqa(_rb, _f_rb);
ONLY64(movdqa(_ga, _f_ga)); movdqa(_ga, _f_ga);
} }
} }
@ -1162,7 +1019,7 @@ void GSDrawScanlineCodeGenerator2::Step()
and(eax, a0.cvt32()); and(eax, a0.cvt32());
if (isXmm) if (isXmm)
shl(eax, 4); shl(eax, 4);
ONLY64(cdqe()); cdqe();
#if USING_XMM #if USING_XMM
movdqa(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]); movdqa(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]);
@ -1182,7 +1039,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
return; return;
} }
const XYm& z = is64 ? _z : xym0; const XYm& z = _z;
// int za = fza_base.y + fza_offset->y; // int za = fza_base.y + fza_offset->y;
@ -1196,7 +1053,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
{ {
if (m_sel.zequal) if (m_sel.zequal)
{ {
ONLY64(movdqa(xym0, _z)); movdqa(xym0, _z);
} }
else if (m_sel.zoverflow) else if (m_sel.zoverflow)
{ {
@ -1247,7 +1104,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
} }
else else
{ {
ONLY64(movdqa(xym0, _z)); movdqa(xym0, _z);
} }
if (m_sel.ztest) if (m_sel.ztest)
@ -1312,22 +1169,11 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
return; return;
} }
if (is32)
{
mov(ebx, ptr[&m_local.gd->tex[0]]);
if (m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
}
const bool needsMoreRegs = isYmm; const bool needsMoreRegs = isYmm;
if (!m_sel.fst) if (!m_sel.fst)
{ {
rcpps(xym0, is64 ? _q : xym4); rcpps(xym0, _q);
MOVE_IF_64(mulps, xym2, _s, xym0); MOVE_IF_64(mulps, xym2, _s, xym0);
MOVE_IF_64(mulps, xym3, _t, xym0); MOVE_IF_64(mulps, xym3, _t, xym0);
@ -1349,21 +1195,19 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
} }
else else
{ {
ONLY64(movdqa(xym2, _s)); movdqa(xym2, _s);
ONLY64(movdqa(xym3, _t)); movdqa(xym3, _t);
} }
if (m_sel.ltf) if (m_sel.ltf)
{ {
const XYm& vf = is64 ? xym7 : xym0; const XYm& vf = xym7;
// GSVector4i uf = u.xxzzlh().srl16(12); // GSVector4i uf = u.xxzzlh().srl16(12);
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12); psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
if (m_sel.prim != GS_SPRITE_CLASS) if (m_sel.prim != GS_SPRITE_CLASS)
{ {
@ -1372,10 +1216,10 @@ void GSDrawScanlineCodeGenerator2::SampleTexture()
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12); psrlw(vf, 12);
if (is32 || needsMoreRegs) if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf); movdqa(_rip_local(temp.vf), vf);
} }
else if (is64 && !needsMoreRegs) else if (!needsMoreRegs)
{ {
movdqa(vf, _rip_local(temp.vf)); movdqa(vf, _rip_local(temp.vf));
} }
@ -1484,8 +1328,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// c10 = addr10.gather32_32((const u32/u8*)tex[, clut]); // c10 = addr10.gather32_32((const u32/u8*)tex[, clut]);
// c11 = addr11.gather32_32((const u32/u8*)tex[, clut]); // c11 = addr11.gather32_32((const u32/u8*)tex[, clut]);
const XYm& tmp1 = is64 ? xym7 : xym4; // OK to destroy if needsMoreRegs const XYm& tmp1 = xym7; // OK to destroy if needsMoreRegs
const XYm& tmp2 = is64 ? xym4 : xym7; const XYm& tmp2 = xym4;
// d0 d1 d2s0 d3s1 s2 s3 // d0 d1 d2s0 d3s1 s2 s3
ReadTexel4(xym5, xym6, xym0, xym2, xym1, xym3, tmp1, tmp2, mip_offset); ReadTexel4(xym5, xym6, xym0, xym2, xym1, xym3, tmp1, tmp2, mip_offset);
@ -1496,9 +1340,6 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// xym6 = c00 // xym6 = c00
// xym7 = used[x86] vf[x64&&!needsMoreRegs] // xym7 = used[x86] vf[x64&&!needsMoreRegs]
if (is32 && needsMoreRegs)
movdqa(xym4, _rip_local(temp.uf));
// GSVector4i rb00 = c00 & mask; // GSVector4i rb00 = c00 & mask;
// GSVector4i ga00 = (c00 >> 8) & mask; // GSVector4i ga00 = (c00 >> 8) & mask;
@ -1565,8 +1406,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset)
// rb00 = rb00.lerp16_4(rb10, vf); // rb00 = rb00.lerp16_4(rb10, vf);
// ga00 = ga00.lerp16_4(ga10, vf); // ga00 = ga00.lerp16_4(ga10, vf);
XYm vf = is64 ? xym7 : xym2; XYm vf = xym7;
if (needsMoreRegs || is32) if (needsMoreRegs)
movdqa(vf, _rip_local(temp.vf)); movdqa(vf, _rip_local(temp.vf));
lerp16_4(xym5, xym0, vf); lerp16_4(xym5, xym0, vf);
@ -1729,22 +1570,9 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
return; return;
} }
if (is32)
{
push(t2);
mov(t2, (size_t)m_local.gd->tex);
if (m_sel.tlu)
{
mov(edx, ptr[&m_local.gd->clut]);
}
}
const bool needsMoreRegs = isYmm; const bool needsMoreRegs = isYmm;
if (is64) movdqa(xym4, _q);
movdqa(xym4, _q);
if (!m_sel.fst) if (!m_sel.fst)
{ {
@ -1758,8 +1586,8 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
} }
else else
{ {
ONLY64(movdqa(xym2, _s)); movdqa(xym2, _s);
ONLY64(movdqa(xym3, _t)); movdqa(xym3, _t);
} }
// xym2 = u // xym2 = u
@ -1974,7 +1802,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
if (m_sel.ltf) if (m_sel.ltf)
{ {
const XYm& vf = is64 ? xym7 : xym0; const XYm& vf = xym7;
// u -= 0x8000; // u -= 0x8000;
// v -= 0x8000; // v -= 0x8000;
@ -1989,15 +1817,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12); psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
// GSVector4i vf = v.xxzzlh().srl16(1); // GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12); psrlw(vf, 12);
if (is32 || needsMoreRegs) if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf); movdqa(_rip_local(temp.vf), vf);
} }
@ -2058,7 +1884,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
if (m_sel.ltf) if (m_sel.ltf)
{ {
const XYm& vf = is64 ? xym7 : xym0; const XYm& vf = xym7;
// u -= 0x8000; // u -= 0x8000;
// v -= 0x8000; // v -= 0x8000;
@ -2073,15 +1899,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xym4, 12); psrlw(xym4, 12);
if (is32 && needsMoreRegs)
movdqa(_rip_local(temp.uf), xym4);
// GSVector4i vf = v.xxzzlh().srl16(1); // GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(vf, 12); psrlw(vf, 12);
if (is32 || needsMoreRegs) if (needsMoreRegs)
movdqa(_rip_local(temp.vf), vf); movdqa(_rip_local(temp.vf), vf);
} }
@ -2131,9 +1955,6 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD()
lerp16(xym5, xym2, xym0, 0); lerp16(xym5, xym2, xym0, 0);
lerp16(xym6, xym3, xym0, 0); lerp16(xym6, xym3, xym0, 0);
} }
if (is32)
pop(t2);
} }
void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv) void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv)
@ -2264,9 +2085,9 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
return; return;
} }
const XYm& f_ga = is64 ? _f_ga : xym4; const XYm& f_ga = _f_ga;
const XYm& tmpga = is64 ? xym1 : f_ga; const XYm& tmpga = xym1;
const XYm& tmp = is64 ? xym0 : xym3; const XYm& tmp = xym0;
Address _32_gaptr = m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga); Address _32_gaptr = m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga);
switch (m_sel.tfx) switch (m_sel.tfx)
@ -2274,9 +2095,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
case TFX_MODULATE: case TFX_MODULATE:
// GSVector4i ga = iip ? gaf : m_local.c.ga; // GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
// gat = gat.modulate16<1>(ga).clamp8(); // gat = gat.modulate16<1>(ga).clamp8();
modulate16(_ga, f_ga, 1); modulate16(_ga, f_ga, 1);
@ -2301,8 +2119,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
{ {
// GSVector4i ga = iip ? gaf : m_local.c.ga; // GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
MOVE_IF_64(psrlw, tmpga, f_ga, 7); MOVE_IF_64(psrlw, tmpga, f_ga, 7);
mix16(_ga, tmpga, tmp); mix16(_ga, tmpga, tmp);
@ -2313,10 +2129,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
case TFX_HIGHLIGHT: case TFX_HIGHLIGHT:
// GSVector4i ga = iip ? gaf : m_local.c.ga; // GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
ONLY32(movdqa(xym2, f_ga)); // WHY
// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
MOVE_IF_64(psrlw, tmpga, f_ga, 7); MOVE_IF_64(psrlw, tmpga, f_ga, 7);
@ -2338,9 +2150,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX()
{ {
// GSVector4i ga = iip ? gaf : m_local.c.ga; // GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, _32_gaptr));
ONLY32(movdqa(xym2, f_ga));
MOVE_IF_64(psrlw, tmpga, f_ga, 7); MOVE_IF_64(psrlw, tmpga, f_ga, 7);
mix16(_ga, tmpga, tmp); mix16(_ga, tmpga, tmp);
@ -2505,16 +2314,13 @@ void GSDrawScanlineCodeGenerator2::ColorTFX()
return; return;
} }
const XYm& f_ga = is64 ? _f_ga : xym2; const XYm& f_ga = _f_ga;
const XYm& tmpga = is64 ? xym2 : f_ga; const XYm& tmpga = xym2;
auto modulate16_1_rb = [&] auto modulate16_1_rb = [&]
{ {
// GSVector4i rb = iip ? rbf : m_local.c.rb; // GSVector4i rb = iip ? rbf : m_local.c.rb;
if (is64) modulate16(_rb, _f_rb, 1);
modulate16(_rb, _f_rb, 1);
else
modulate16(_rb, m_sel.iip ? _rip_local(temp.rb) : _rip_local(c.rb), 1);
}; };
switch (m_sel.tfx) switch (m_sel.tfx)
@ -2537,14 +2343,6 @@ void GSDrawScanlineCodeGenerator2::ColorTFX()
case TFX_HIGHLIGHT: case TFX_HIGHLIGHT:
case TFX_HIGHLIGHT2: case TFX_HIGHLIGHT2:
if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
{
// GSVector4i ga = iip ? gaf : m_local.c.ga;
ONLY32(movdqa(f_ga, m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga)));
}
// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
movdqa(xym1, _ga); movdqa(xym1, _ga);
@ -2593,21 +2391,12 @@ void GSDrawScanlineCodeGenerator2::Fog()
return; return;
} }
const XYm& f = is64 ? _f : xym0; const XYm& f = _f;
const XYm& tmp = is64 ? xym0 : xym2; const XYm& tmp = xym0;
// rb = m_local.gd->frb.lerp16<0>(rb, f); // rb = m_local.gd->frb.lerp16<0>(rb, f);
// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
if (m_sel.prim != GS_SPRITE_CLASS)
{
ONLY32(movdqa(f, _rip_local(temp.f)));
}
else
{
ONLY32(pbroadcastwLocal(f, _rip_local(p.f)));
}
movdqa(xym1, _ga); movdqa(xym1, _ga);
pbroadcastdLocal(tmp, _rip_global(frb)); pbroadcastdLocal(tmp, _rip_global(frb));
@ -3060,7 +2849,7 @@ void GSDrawScanlineCodeGenerator2::WriteFrame()
} }
const XYm& tmp = is64 ? xym15 : xym7; const XYm& tmp = xym15;
if (m_sel.fpsm == 2 && m_sel.dthe) if (m_sel.fpsm == 2 && m_sel.dthe)
{ {
@ -3379,7 +3168,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(
void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset) void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset)
{ {
AddressReg texIn = is64 ? _64_m_local__gd__tex : t2; AddressReg texIn = _64_m_local__gd__tex;
Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]); Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]);
mov(ebx, lod_addr); mov(ebx, lod_addr);
mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]); mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]);
@ -3397,7 +3186,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplYmm(
const Ymm t1[] = { d1, d2s0, d3s1, s2 }; const Ymm t1[] = { d1, d2s0, d3s1, s2 };
const Ymm t2[] = { tmp, tmp, tmp, tmp }; const Ymm t2[] = { tmp, tmp, tmp, tmp };
bool texInRBX = is32; bool texInRBX = false;
if (use_lod && m_sel.lcm) if (use_lod && m_sel.lcm)
{ {
ReadTexelImplLoadTexLOD(0, mip_offset); ReadTexelImplLoadTexLOD(0, mip_offset);
@ -3489,7 +3278,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplSSE4(
else else
{ {
bool preserve = false; bool preserve = false;
bool texInRBX = is32; bool texInRBX = false;
if (use_lod && m_sel.lcm) if (use_lod && m_sel.lcm)
{ {
@ -3511,7 +3300,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(const Xmm& dst, const Xmm& addr
{ {
ASSERT(i < 4); ASSERT(i < 4);
AddressReg clut = is64 ? _64_m_local__gd__clut : rdx; AddressReg clut = _64_m_local__gd__clut;
AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex; AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex;
Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4]; Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4];

View File

@ -41,7 +41,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value; constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value; constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
constexpr static int wordsize = is64 ? 8 : 4; constexpr static int wordsize = 8;
constexpr static int vecsize = isXmm ? 16 : 32; constexpr static int vecsize = isXmm ? 16 : 32;
constexpr static int vecsizelog = isXmm ? 4 : 5; constexpr static int vecsizelog = isXmm ? 4 : 5;
constexpr static int vecints = vecsize / 4; constexpr static int vecints = vecsize / 4;
@ -67,8 +67,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
constexpr static int _64_rz_r15 = -8 * 5; constexpr static int _64_rz_r15 = -8 * 5;
constexpr static int _64_top = -8 * 6; constexpr static int _64_top = -8 * 6;
#endif #endif
constexpr static int _top = is64 ? _64_top : _32_args + 4; constexpr static int _top = _64_top;
constexpr static int _v = is64 ? _invalid : _32_args + 8;
GSScanlineSelector m_sel; GSScanlineSelector m_sel;
GSScanlineLocalData& m_local; GSScanlineLocalData& m_local;

View File

@ -100,30 +100,6 @@ public:
}; };
private: private:
/// Make sure the register is okay to use.
/// When `is64` is true every operand is accepted and this returns immediately.
/// Otherwise a register operand for which isExtIdx() or isExt8bit() holds is rejected,
/// and a memory operand has the index and base of its address expression checked in turn.
/// Throws Error(Error::ERR_64_BIT_REG_IN_32) when a register is rejected.
void validateRegister(const Operand& op)
{
if (is64)
return;
// NOTE(review): presumably isExtIdx()/isExt8bit() identify registers that can only be
// encoded in 64-bit mode, as the error code name suggests; confirm against Xbyak.
if (op.isREG() && (op.isExtIdx() || op.isExt8bit()))
throw Error(Error::ERR_64_BIT_REG_IN_32);
if (op.isMEM())
{
// A memory operand is validated through the registers of its address expression.
auto e = static_cast<const Address&>(op).getRegExp();
validateRegister(e.getIndex());
validateRegister(e.getBase());
}
}
/// For easier macro-ing: a no-op overload taking an integer, so that code which calls
/// validateRegister on each of its arguments also compiles when an argument is an
/// immediate value rather than an Operand. The value is ignored.
void validateRegister(int imm)
{
}
/// Guard for instructions that only exist when targeting 64-bit.
/// Does nothing when `is64` is true; otherwise throws Error(Error::ERR_64_INSTR_IN_32).
void require64()
{
if (!is64)
throw Error(Error::ERR_64_INSTR_IN_32);
}
void requireAVX() void requireAVX()
{ {
if (!hasAVX) if (!hasAVX)
@ -133,9 +109,6 @@ private:
public: public:
Xbyak::CodeGenerator& actual; Xbyak::CodeGenerator& actual;
#if defined(_M_X86_64)
constexpr static bool is32 = false;
constexpr static bool is64 = true;
using AddressReg = Xbyak::Reg64; using AddressReg = Xbyak::Reg64;
using RipType = Xbyak::RegRip; using RipType = Xbyak::RegRip;
@ -144,18 +117,6 @@ public:
template <typename T32, typename T64> template <typename T32, typename T64>
static T64 choose3264(T32 t32, T64 t64) { return t64; } static T64 choose3264(T32 t32, T64 t64) { return t64; }
#else
constexpr static bool is32 = true;
constexpr static bool is64 = false;
using AddressReg = Xbyak::Reg32;
using RipType = int;
template <typename T32, typename T64>
struct Choose3264 { using type = T32; };
template <typename T32, typename T64>
static T32 choose3264(T32 t32, T64 t64) { return t32; }
#endif
const bool hasAVX, hasAVX2, hasFMA; const bool hasAVX, hasAVX2, hasFMA;
@ -238,34 +199,24 @@ public:
#define FORWARD1(category, name, type) \ #define FORWARD1(category, name, type) \
void name(type a) \ void name(type a) \
{ \ { \
validateRegister(a); \
ACTUAL_FORWARD_##category(name, a) \ ACTUAL_FORWARD_##category(name, a) \
} }
#define FORWARD2(category, name, type1, type2) \ #define FORWARD2(category, name, type1, type2) \
void name(type1 a, type2 b) \ void name(type1 a, type2 b) \
{ \ { \
validateRegister(a); \
validateRegister(b); \
ACTUAL_FORWARD_##category(name, a, b) \ ACTUAL_FORWARD_##category(name, a, b) \
} }
#define FORWARD3(category, name, type1, type2, type3) \ #define FORWARD3(category, name, type1, type2, type3) \
void name(type1 a, type2 b, type3 c) \ void name(type1 a, type2 b, type3 c) \
{ \ { \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
ACTUAL_FORWARD_##category(name, a, b, c) \ ACTUAL_FORWARD_##category(name, a, b, c) \
} }
#define FORWARD4(category, name, type1, type2, type3, type4) \ #define FORWARD4(category, name, type1, type2, type3, type4) \
void name(type1 a, type2 b, type3 c, type4 d) \ void name(type1 a, type2 b, type3 c, type4 d) \
{ \ { \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
validateRegister(d); \
ACTUAL_FORWARD_##category(name, a, b, c, d) \ ACTUAL_FORWARD_##category(name, a, b, c, d) \
} }
@ -282,8 +233,6 @@ public:
#define FORWARD_SSE_XMM0(name) \ #define FORWARD_SSE_XMM0(name) \
void name(const Xmm& a, const Operand& b) \ void name(const Xmm& a, const Operand& b) \
{ \ { \
validateRegister(a); \
validateRegister(b); \
if (hasAVX) \ if (hasAVX) \
actual.v##name(a, b, Xmm(0)); \ actual.v##name(a, b, Xmm(0)); \
else \ else \
@ -326,19 +275,12 @@ public:
#define ARGS_XOI const Xmm&, const Operand&, u8 #define ARGS_XOI const Xmm&, const Operand&, u8
#define ARGS_XXO const Xmm&, const Xmm&, const Operand& #define ARGS_XXO const Xmm&, const Xmm&, const Operand&
// For instructions that are ifdef'd out without XBYAK64
#ifdef XBYAK64
#define REQUIRE64(action) require64(); action
#else
#define REQUIRE64(action) require64()
#endif
const u8 *getCurr() { return actual.getCurr(); } const u8 *getCurr() { return actual.getCurr(); }
void align(int x = 16) { return actual.align(x); } void align(int x = 16) { return actual.align(x); }
void db(int code) { actual.db(code); } void db(int code) { actual.db(code); }
void L(const std::string& label) { actual.L(label); } void L(const std::string& label) { actual.L(label); }
void cdqe() { REQUIRE64(actual.cdqe()); } void cdqe() { actual.cdqe(); }
void ret(int imm = 0) { actual.ret(imm); } void ret(int imm = 0) { actual.ret(imm); }
void vzeroupper() { requireAVX(); actual.vzeroupper(); } void vzeroupper() { requireAVX(); actual.vzeroupper(); }
void vzeroall() { requireAVX(); actual.vzeroall(); } void vzeroall() { requireAVX(); actual.vzeroall(); }
@ -458,7 +400,6 @@ public:
FORWARD(3, AVX2, vpsravd, ARGS_XXO) FORWARD(3, AVX2, vpsravd, ARGS_XXO)
FORWARD(3, AVX2, vpsrlvd, ARGS_XXO) FORWARD(3, AVX2, vpsrlvd, ARGS_XXO)
#undef REQUIRE64
#undef ARGS_OI #undef ARGS_OI
#undef ARGS_OO #undef ARGS_OO
#undef ARGS_XI #undef ARGS_XI

View File

@ -19,7 +19,7 @@
using namespace Xbyak; using namespace Xbyak;
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) #define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _64_m_local _64_t0 #define _64_m_local _64_t0
@ -53,15 +53,15 @@ GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base,
, m_rip(false), many_regs(false) , m_rip(false), many_regs(false)
// On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap // On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap
#ifdef _WIN32 #ifdef _WIN32
, _64_vertex(is64 ? rcx : r8) , _64_vertex(rcx)
, _index(is64 ? rdx : rcx) , _index(rdx)
, _dscan(is64 ? r8 : rdx) , _dscan(r8)
, _64_t0(r9), t1(is64 ? r10 : rcx) , _64_t0(r9), t1(r10)
#else #else
, _64_vertex(is64 ? rdi : r8) , _64_vertex(rdi)
, _index(is64 ? rsi : rcx) , _index(rsi)
, _dscan(rdx) , _dscan(rdx)
, _64_t0(is64 ? rcx : r8), t1(is64 ? r8 : rcx) , _64_t0(rcx), t1(r8)
#endif #endif
, _m_local(chooseLocal(&m_local, _64_m_local)) , _m_local(chooseLocal(&m_local, _64_m_local))
{ {
@ -88,7 +88,7 @@ void GSSetupPrimCodeGenerator2::Generate()
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000; m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip; bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip;
many_regs = is64 && isYmm && !m_sel.notest && needs_shift; many_regs = isYmm && !m_sel.notest && needs_shift;
#ifdef _WIN64 #ifdef _WIN64
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2; int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
@ -102,13 +102,11 @@ void GSSetupPrimCodeGenerator2::Generate()
} }
#endif #endif
if (is64 && !m_rip) if (!m_rip)
mov(_64_m_local, (size_t)&m_local); mov(_64_m_local, (size_t)&m_local);
if (needs_shift) if (needs_shift)
{ {
if (is32)
mov(_dscan, ptr[rsp + _32_dscan]);
if (isXmm) if (isXmm)
mov(rax, (size_t)g_const->m_shift_128b); mov(rax, (size_t)g_const->m_shift_128b);
@ -193,14 +191,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_sel.prim != GS_POINT_CLASS) if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1; offset = sizeof(u32) * 1;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]); mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW) shl(eax, 6); // * sizeof(GSVertexSW)
if (is64) add(rax, _64_vertex);
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
@ -231,14 +224,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
{ {
// GSVector4 p = vertex[index[1]].p; // GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * 1]); mov(eax, ptr[_index + sizeof(u32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW) shl(eax, 6); // * sizeof(GSVertexSW)
if (is64) add(rax, _64_vertex);
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
if (m_en.f) if (m_en.f)
{ {
@ -285,14 +273,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
if (m_sel.prim != GS_POINT_CLASS) if (m_sel.prim != GS_POINT_CLASS)
offset = sizeof(u32) * 1; offset = sizeof(u32) * 1;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + offset]); mov(eax, ptr[_index + offset]);
shl(eax, 6); // * sizeof(GSVertexSW) shl(eax, 6); // * sizeof(GSVertexSW)
if (is64) add(rax, _64_vertex);
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]); mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
mov(_rip_local(p.z), t1.cvt32()); mov(_rip_local(p.z), t1.cvt32());
@ -354,14 +337,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
{ {
// GSVector4 p = vertex[index[1]].p; // GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * 1]); mov(eax, ptr[_index + sizeof(u32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW) shl(eax, 6); // * sizeof(GSVertexSW)
if (is64) add(rax, _64_vertex);
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
if (m_en.f) if (m_en.f)
{ {
@ -564,14 +542,9 @@ void GSSetupPrimCodeGenerator2::Color()
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{ {
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(u32) * last]); mov(eax, ptr[_index + sizeof(u32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW) shl(eax, 6); // * sizeof(GSVertexSW)
if (is64) add(rax, _64_vertex);
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
} }
if (isXmm) if (isXmm)

View File

@ -46,12 +46,6 @@ class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator
constexpr static int dsize = isXmm ? 4 : 8; constexpr static int dsize = isXmm ? 4 : 8;
constexpr static int _32_args = 0;
constexpr static int _invalid = 0xaaaaaaaa;
constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4;
constexpr static int _32_index = is64 ? _invalid : _32_args + 8;
constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12;
GSScanlineSelector m_sel; GSScanlineSelector m_sel;
GSScanlineLocalData& m_local; GSScanlineLocalData& m_local;
bool m_rip; bool m_rip;