diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index 65f0a1dc12..5d95924a62 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -30,22 +30,8 @@ using namespace Xbyak; // If use_lod, m_local.gd->tex, else m_local.gd->tex[0] #define _64_m_local__gd__tex r14 -#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) -#define _rip_global(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) - -/// Executes the given code only if targeting 32-bit -#define ONLY32(code) if (is32) (code) - -/// Executes the given code only if targeting 64-bit -#define ONLY64(code) if (is64) (code) - -/// Combines temporary with either dst64 on 64-bit or src32 on 32-bit -/// Follow up with an ONLY32 save back to src32 -#define REG_64_MEM_32(operation, dst64, temporary, src32) \ - if (is32) \ - operation(temporary, src32); \ - else \ - operation(dst64, temporary) +#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) +#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) /// On AVX, does a v-prefixed separate destination operation /// On SSE, moves src1 into dst using movdqa, then does the operation @@ -67,14 +53,7 @@ using namespace Xbyak; #define MOVE_IF_64(operation, dst, src64, ...) \ do \ { \ - if (is64) \ - { \ - THREEARG(operation, dst, src64, __VA_ARGS__); \ - } \ - else \ - { \ - operation(dst, __VA_ARGS__); \ - } \ + THREEARG(operation, dst, src64, __VA_ARGS__); \ } while (0) #define USING_XMM DRAW_SCANLINE_USING_XMM @@ -105,21 +84,21 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* , m_local(*(GSScanlineLocalData*)param) , m_rip(false) #ifdef _WIN32 - , a0(rcx) , a1(rdx) - , a2(r8) , a3(is64 ? r9 : rbx) - , t0(rdi) , t1(rsi) - , t2(is64 ? r8 : rbp), t3(r9) + , a0(rcx), a1(rdx) + , a2(r8) , a3(r9) + , t0(rdi), t1(rsi) + , t2(r8) , t3(r9) #else - , a0(is64 ? rdi : rcx), a1(is64 ? rsi : rdx) - , a2(is64 ? rdx : r8), a3(is64 ? rcx : rbx) - , t0(is64 ? r8 : rdi), t1(is64 ? r9 : rsi) - , t2(is64 ? rcx : rbp), t3(is64 ? rsi : r8) + , a0(rdi), a1(rsi) + , a2(rdx), a3(rcx) + , t0(r8) , t1(r9) + , t2(rcx), t3(rsi) #endif , _g_const(chooseLocal(&*g_const, _64_g_const)) , _m_local(chooseLocal(&m_local, _64_m_local)) , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) - , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(is64 ? xym15 : xym7) + , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15) , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) { m_sel.key = key; @@ -132,8 +111,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr) { - if (is64) - mov(reg, (size_t)addr); + mov(reg, (size_t)addr); return choose3264((size_t)addr, reg); } @@ -352,51 +330,41 @@ void GSDrawScanlineCodeGenerator2::Generate() m_rip &= (size_t)&m_local < 0x80000000; m_rip &= (size_t)&m_local.gd < 0x80000000; - if (is32) - { - push(rbx); - push(rsi); - push(rdi); - push(rbp); - } - else - { - push(rbp); - mov(rbp, rsp); // Stack traces look much nicer this way + push(rbp); + mov(rbp, rsp); // Stack traces look much nicer this way #ifdef _WIN32 - push(rbx); - push(rsi); - push(rdi); - push(r12); - push(r13); - push(r14); + push(rbx); + push(rsi); + push(rdi); + push(r12); + push(r13); + push(r14); - sub(rsp, _64_win_stack_size); + sub(rsp, _64_win_stack_size); - for (int i = 0; i < 10; i++) - { - movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6)); - } -#else - mov(ptr[rsp + _64_rz_rbx], rbx); - if (!m_rip) - { - mov(ptr[rsp + _64_rz_r12], r12); - mov(ptr[rsp + _64_rz_r13], r13); - } - mov(ptr[rsp + _64_rz_r14], r14); - mov(ptr[rsp + _64_rz_r15], r15); -#endif - mov(_64_g_const, (size_t)&*g_const); - if (!m_rip) - { - mov(_64_m_local, (size_t)&m_local); - mov(_64_m_local__gd, _rip_local(gd)); - } - - if (need_clut) - mov(_64_m_local__gd__clut, _rip_global(clut)); + for (int i = 0; i < 10; i++) + { + movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6)); } +#else + mov(ptr[rsp + _64_rz_rbx], rbx); + if (!m_rip) + { + mov(ptr[rsp + _64_rz_r12], r12); + mov(ptr[rsp + _64_rz_r13], r13); + } + mov(ptr[rsp + _64_rz_r14], r14); + mov(ptr[rsp + _64_rz_r15], r15); +#endif + mov(_64_g_const, (size_t)&*g_const); + if (!m_rip) + { + mov(_64_m_local, (size_t)&m_local); + mov(_64_m_local__gd, _rip_local(gd)); + } + + if (need_clut) + mov(_64_m_local__gd__clut, _rip_global(clut)); Init(); @@ -615,47 +583,33 @@ L("step"); L("exit"); - - - if (is32) - { - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); - } - else - { #ifdef _WIN32 - for (int i = 0; i < 10; i++) - { - movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]); - } - add(rsp, _64_win_stack_size); - - pop(r14); - pop(r13); - pop(r12); - pop(rdi); - pop(rsi); - pop(rbx); -#else - mov(rbx, ptr[rsp + _64_rz_rbx]); - if (!m_rip) - { - mov(r12, ptr[rsp + _64_rz_r12]); - mov(r13, ptr[rsp + _64_rz_r13]); - } - mov(r14, ptr[rsp + _64_rz_r14]); - mov(r15, ptr[rsp + _64_rz_r15]); -#endif - pop(rbp); - if (isYmm) - vzeroupper(); - ret(); + for (int i = 0; i < 10; i++) + { + movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]); } + add(rsp, _64_win_stack_size); + + pop(r14); + pop(r13); + pop(r12); + pop(rdi); + pop(rsi); + pop(rbx); +#else + mov(rbx, ptr[rsp + _64_rz_rbx]); + if (!m_rip) + { + mov(r12, ptr[rsp + _64_rz_r12]); + mov(r13, ptr[rsp + _64_rz_r13]); + } + mov(r14, ptr[rsp + _64_rz_r14]); + mov(r15, ptr[rsp + _64_rz_r15]); +#endif + pop(rbp); + if (isYmm) + vzeroupper(); + ret(); } /// Inputs: a0=pixels, a1=left, a2[x64]=top, a3[x64]=v @@ -683,7 +637,7 @@ void GSDrawScanlineCodeGenerator2::Init() and(eax, a0.cvt32()); if (isXmm) shl(eax, 4); // * sizeof(m_test[0]) - ONLY64(cdqe()); + cdqe(); if (isXmm) { @@ -713,43 +667,19 @@ void GSDrawScanlineCodeGenerator2::Init() // rbx = left // Free: rax, t0, t1 - if (is64) - { - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - mov(rax, _rip_global(fzbr)); - lea(t1, ptr[rax + a2 * 8]); + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + mov(rax, _rip_global(fzbr)); + lea(t1, ptr[rax + a2 * 8]); - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - mov(rax, _rip_global(fzbc)); - lea(t0, ptr[rax + rbx * 2]); - } - else - { - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - mov(t1, ptr[rsp + _top]); - lea(t1, ptr[t1 * 8]); - add(t1, ptr[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - lea(t0, ptr[rbx * 2]); - add(t0, ptr[(size_t)&m_local.gd->fzbc]); - } + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + mov(rax, _rip_global(fzbc)); + lea(t0, ptr[rax + rbx * 2]); if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { // a1 = &m_local.d[skip] // note a1 was (skip << 4) - - if (is64) - { - lea(rax, _rip_local(d)); - lea(a1, ptr[rax + a1 * 8]); - } - else - { - lea(a1, ptr[(size_t)m_local.d + a1 * 8]); - // a3 starts on the stack in x86, we want it in a register - mov(a3, ptr[rsp + _v]); - } + lea(rax, _rip_local(d)); + lea(a1, ptr[rax + a1 * 8]); } // a0 = steps (rcx | rdi) @@ -760,8 +690,8 @@ void GSDrawScanlineCodeGenerator2::Init() // t1 = fza_base (rsi | r9 ) // Free: rax - const XYm& f = is64 ? _f : xym1; - const XYm& z = is64 ? _z : xym0; + const XYm& f = _f; + const XYm& z = _z; if (m_sel.prim != GS_SPRITE_CLASS) { @@ -777,9 +707,6 @@ void GSDrawScanlineCodeGenerator2::Init() pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2)); paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]); - - if (is32) // _f is shared on x86 - movdqa(ptr[&m_local.temp.f], f); } if (m_sel.zb) @@ -788,17 +715,7 @@ void GSDrawScanlineCodeGenerator2::Init() { // z = vp.zzzz() + m_local.d[skip].z; shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2)); - if (is64) - { - addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); - } - else - { - movaps(ptr[&m_local.temp.z], z); - movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); - movaps(ptr[&m_local.temp.zo], xym2); - addps(z, xym2); - } + addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); } else pbroadcastdLocal(z, _rip_local(p.z)); @@ -812,7 +729,7 @@ void GSDrawScanlineCodeGenerator2::Init() pbroadcastdLocal(z, _rip_local(p.z)); } - if (m_sel.fwrite && m_sel.fge && is64) + if (m_sel.fwrite && m_sel.fge) pbroadcastwLocal(_f, _rip_local(p.f)); } @@ -840,8 +757,8 @@ void GSDrawScanlineCodeGenerator2::Init() { // a1 = &m_local.d[skip] - const XYm& s = is64 ? _s : xym2; - const XYm& t = is64 ? _t : xym3; + const XYm& s = _s; + const XYm& t = _t; if (m_sel.fst) { @@ -863,19 +780,16 @@ void GSDrawScanlineCodeGenerator2::Init() } else if (m_sel.ltf) { - XYm vf = is64 ? xym7 : xym6; + XYm vf = xym7; pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(vf, 12); movdqa(_rip_local(temp.vf), vf); } - - ONLY32(movdqa(_rip_local(temp.s), s)); - ONLY32(movdqa(_rip_local(temp.t), t)); } else { - const XYm& q = is64 ? _q : vt; + const XYm& q = _q; // s = vt.xxxx() + m_local.d[skip].s; // t = vt.yyyy() + m_local.d[skip].t; @@ -891,7 +805,7 @@ void GSDrawScanlineCodeGenerator2::Init() { movaps(s, vt); movaps(t, vt); - ONLY64(movaps(q, vt)); + movaps(q, vt); shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0)); shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1)); @@ -901,20 +815,13 @@ void GSDrawScanlineCodeGenerator2::Init() addps(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); addps(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); addps(q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); - - if (is32) - { - movaps(ptr[&m_local.temp.s], s); - movaps(ptr[&m_local.temp.t], t); - movaps(ptr[&m_local.temp.q], q); - } } } if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) { - const XYm& f_rb = is64 ? _f_rb : xym5; - const XYm& f_ga = is64 ? _f_ga : xym6; + const XYm& f_rb = _f_rb; + const XYm& f_ga = _f_ga; if (m_sel.iip) { // GSVector4i vc = GSVector4i(v.c); @@ -942,38 +849,32 @@ void GSDrawScanlineCodeGenerator2::Init() paddw(f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); paddw(f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); - - ONLY32(movdqa(ptr[&m_local.temp.rb], f_rb)); - ONLY32(movdqa(ptr[&m_local.temp.ga], f_ga)); } - else if (is64 || m_sel.tfx == TFX_NONE) + else { movdqa(f_rb, _rip_local(c.rb)); movdqa(f_ga, _rip_local(c.ga)); } - ONLY64(movdqa(_rb, _f_rb)); - ONLY64(movdqa(_ga, _f_ga)); + movdqa(_rb, _f_rb); + movdqa(_ga, _f_ga); } } - if (is64) + if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) { - if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) - { - // On linux, a2 is edx which will be used for fzm - // In all case, it will require a mov in dthe code, so let's keep the value on the stack - mov(ptr[rsp + _top], a2); - } + // On linux, a2 is edx which will be used for fzm + // In all case, it will require a mov in dthe code, so let's keep the value on the stack + mov(ptr[rsp + _top], a2); + } - mov(_64_m_local__gd__vm, _rip_global(vm)); - if (m_sel.fb && m_sel.tfx != TFX_NONE) - { - if (use_lod) - lea(_64_m_local__gd__tex, _rip_global(tex)); - else - mov(_64_m_local__gd__tex, _rip_global(tex)); - } + mov(_64_m_local__gd__vm, _rip_global(vm)); + if (m_sel.fb && m_sel.tfx != TFX_NONE) + { + if (use_lod) + lea(_64_m_local__gd__tex, _rip_global(tex)); + else + mov(_64_m_local__gd__tex, _rip_global(tex)); } } @@ -991,8 +892,8 @@ void GSDrawScanlineCodeGenerator2::Step() add(t0, vecsize / 2); - const XYm& z = is64 ? _z : xym0; - const XYm& f = is64 ? _f : xym1; + const XYm& z =_z; + const XYm& f =_f; if (m_sel.prim != GS_SPRITE_CLASS) { @@ -1004,13 +905,6 @@ void GSDrawScanlineCodeGenerator2::Step() { pbroadcastdLocal(z, _rip_local(p.z)); } - else if (is32) - { - broadcastssLocal(z, _rip_local_d_p(z)); - addps(z, _rip_local(temp.zo)); - movaps(_rip_local(temp.zo), z); - addps(z, _rip_local(temp.z)); - } else { BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z)); @@ -1021,23 +915,7 @@ void GSDrawScanlineCodeGenerator2::Step() if (m_sel.fwrite && m_sel.fge) { - if (is32) - { - pbroadcastwLocal(f, _rip_local_d_p(f)); - paddw(f, _rip_local(temp.f)); - movdqa(_rip_local(temp.f), f); - } - else - { - BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f)); - } - } - } - else - { - if (is32 && m_sel.ztest) - { - pbroadcastdLocal(z, _rip_local(p.z)); + BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f)); } } @@ -1047,7 +925,7 @@ void GSDrawScanlineCodeGenerator2::Step() { if (m_sel.fst) { - const XYm& stq = is64 ? xym0 : xym4; + const XYm& stq = xym0; // GSVector4i stq = m_local.d4.stq; // s += stq.xxxx(); @@ -1055,28 +933,22 @@ void GSDrawScanlineCodeGenerator2::Step() broadcasti128(stq, _rip_local_d(stq)); - XYm s = is64 ? xym1 : xym2; + XYm s = xym1; pshufd(s, stq, _MM_SHUFFLE(0, 0, 0, 0)); - REG_64_MEM_32(paddd, _s, s, _rip_local(temp.s)); - ONLY32(movdqa(_rip_local(temp.s), s)); + paddd(_s, s); - XYm t = is64 ? xym1 : xym3; + XYm t = xym1; if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { pshufd(t, stq, _MM_SHUFFLE(1, 1, 1, 1)); - REG_64_MEM_32(paddd, _t, t, _rip_local(temp.t)); - ONLY32(movdqa(_rip_local(temp.t), t)); - } - else - { - ONLY32(movdqa(t, _rip_local(temp.t))); + paddd(_t, t); } } else { const XYm& s = xym2; const XYm& t = xym3; - const XYm& q = is64 ? xym1 : xym4; + const XYm& q = xym1; // GSVector4 stq = m_local.d4.stq; // s += stq.xxxx(); @@ -1102,13 +974,9 @@ void GSDrawScanlineCodeGenerator2::Step() shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2)); } - REG_64_MEM_32(addps, _s, s, _rip_local(temp.s)); - REG_64_MEM_32(addps, _t, t, _rip_local(temp.t)); - REG_64_MEM_32(addps, _q, q, _rip_local(temp.q)); - - ONLY32(movaps(_rip_local(temp.s), s)); - ONLY32(movaps(_rip_local(temp.t), t)); - ONLY32(movaps(_rip_local(temp.q), q)); + addps(_s, s); + addps(_t, t); + addps(_q, q); } } @@ -1116,7 +984,7 @@ void GSDrawScanlineCodeGenerator2::Step() { if (m_sel.iip) { - XYm c = is64 ? xym0 : xym7; + XYm c = xym0; // GSVector4i c = m_local.d4.c; // rb = rb.add16(c.xxxx()); @@ -1127,29 +995,18 @@ void GSDrawScanlineCodeGenerator2::Step() pshufd(_rb, c, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(_ga, c, _MM_SHUFFLE(1, 1, 1, 1)); - REG_64_MEM_32(paddw, _f_rb, _rb, _rip_local(temp.rb)); - REG_64_MEM_32(paddw, _f_ga, _ga, _rip_local(temp.ga)); + paddw(_f_rb, _rb); + paddw(_f_ga, _ga); // FIXME: color may underflow and roll over at the end of the line, if decreasing pxor(c, c); - pmaxsw(is64 ? _f_rb : _rb, c); - pmaxsw(is64 ? _f_ga : _ga, c); - - ONLY32(movdqa(_rip_local(temp.rb), _rb)); - ONLY32(movdqa(_rip_local(temp.ga), _ga)); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - ONLY32(movdqa(_rb, ptr[&m_local.c.rb])); - ONLY32(movdqa(_ga, ptr[&m_local.c.ga])); - } + pmaxsw(_f_rb, c); + pmaxsw(_f_ga, c); } - ONLY64(movdqa(_rb, _f_rb)); - ONLY64(movdqa(_ga, _f_ga)); + movdqa(_rb, _f_rb); + movdqa(_ga, _f_ga); } } @@ -1162,7 +1019,7 @@ void GSDrawScanlineCodeGenerator2::Step() and(eax, a0.cvt32()); if (isXmm) shl(eax, 4); - ONLY64(cdqe()); + cdqe(); #if USING_XMM movdqa(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]); @@ -1182,7 +1039,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) return; } - const XYm& z = is64 ? _z : xym0; + const XYm& z = _z; // int za = fza_base.y + fza_offset->y; @@ -1196,7 +1053,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) { if (m_sel.zequal) { - ONLY64(movdqa(xym0, _z)); + movdqa(xym0, _z); } else if (m_sel.zoverflow) { @@ -1247,7 +1104,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) } else { - ONLY64(movdqa(xym0, _z)); + movdqa(xym0, _z); } if (m_sel.ztest) @@ -1312,22 +1169,11 @@ void GSDrawScanlineCodeGenerator2::SampleTexture() return; } - - if (is32) - { - mov(ebx, ptr[&m_local.gd->tex[0]]); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - } - const bool needsMoreRegs = isYmm; if (!m_sel.fst) { - rcpps(xym0, is64 ? _q : xym4); + rcpps(xym0, _q); MOVE_IF_64(mulps, xym2, _s, xym0); MOVE_IF_64(mulps, xym3, _t, xym0); @@ -1349,21 +1195,19 @@ void GSDrawScanlineCodeGenerator2::SampleTexture() } else { - ONLY64(movdqa(xym2, _s)); - ONLY64(movdqa(xym3, _t)); + movdqa(xym2, _s); + movdqa(xym3, _t); } if (m_sel.ltf) { - const XYm& vf = is64 ? xym7 : xym0; + const XYm& vf = xym7; // GSVector4i uf = u.xxzzlh().srl16(12); pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xym4, 12); - if (is32 && needsMoreRegs) - movdqa(_rip_local(temp.uf), xym4); if (m_sel.prim != GS_SPRITE_CLASS) { @@ -1372,10 +1216,10 @@ void GSDrawScanlineCodeGenerator2::SampleTexture() pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(vf, 12); - if (is32 || needsMoreRegs) + if (needsMoreRegs) movdqa(_rip_local(temp.vf), vf); } - else if (is64 && !needsMoreRegs) + else if (!needsMoreRegs) { movdqa(vf, _rip_local(temp.vf)); } @@ -1484,8 +1328,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset) // c10 = addr10.gather32_32((const u32/u8*)tex[, clut]); // c11 = addr11.gather32_32((const u32/u8*)tex[, clut]); - const XYm& tmp1 = is64 ? xym7 : xym4; // OK to destroy if needsMoreRegs - const XYm& tmp2 = is64 ? xym4 : xym7; + const XYm& tmp1 = xym7; // OK to destroy if needsMoreRegs + const XYm& tmp2 = xym4; // d0 d1 d2s0 d3s1 s2 s3 ReadTexel4(xym5, xym6, xym0, xym2, xym1, xym3, tmp1, tmp2, mip_offset); @@ -1496,9 +1340,6 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset) // xym6 = c00 // xym7 = used[x86] vf[x64&&!needsMoreRegs] - if (is32 && needsMoreRegs) - movdqa(xym4, _rip_local(temp.uf)); - // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; @@ -1565,8 +1406,8 @@ void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset) // rb00 = rb00.lerp16_4(rb10, vf); // ga00 = ga00.lerp16_4(ga10, vf); - XYm vf = is64 ? xym7 : xym2; - if (needsMoreRegs || is32) + XYm vf = xym7; + if (needsMoreRegs) movdqa(vf, _rip_local(temp.vf)); lerp16_4(xym5, xym0, vf); @@ -1729,22 +1570,9 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() return; } - if (is32) - { - push(t2); - - mov(t2, (size_t)m_local.gd->tex); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - } - const bool needsMoreRegs = isYmm; - if (is64) - movdqa(xym4, _q); + movdqa(xym4, _q); if (!m_sel.fst) { @@ -1758,8 +1586,8 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() } else { - ONLY64(movdqa(xym2, _s)); - ONLY64(movdqa(xym3, _t)); + movdqa(xym2, _s); + movdqa(xym3, _t); } // xym2 = u @@ -1974,7 +1802,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() if (m_sel.ltf) { - const XYm& vf = is64 ? xym7 : xym0; + const XYm& vf = xym7; // u -= 0x8000; // v -= 0x8000; @@ -1989,15 +1817,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xym4, 12); - if (is32 && needsMoreRegs) - movdqa(_rip_local(temp.uf), xym4); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(vf, 12); - if (is32 || needsMoreRegs) + if (needsMoreRegs) movdqa(_rip_local(temp.vf), vf); } @@ -2058,7 +1884,7 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() if (m_sel.ltf) { - const XYm& vf = is64 ? xym7 : xym0; + const XYm& vf = xym7; // u -= 0x8000; // v -= 0x8000; @@ -2073,15 +1899,13 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xym4, 12); - if (is32 && needsMoreRegs) - movdqa(_rip_local(temp.uf), xym4); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(vf, 12); - if (is32 || needsMoreRegs) + if (needsMoreRegs) movdqa(_rip_local(temp.vf), vf); } @@ -2131,9 +1955,6 @@ void GSDrawScanlineCodeGenerator2::SampleTextureLOD() lerp16(xym5, xym2, xym0, 0); lerp16(xym6, xym3, xym0, 0); } - - if (is32) - pop(t2); } void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv) @@ -2264,9 +2085,9 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX() return; } - const XYm& f_ga = is64 ? _f_ga : xym4; - const XYm& tmpga = is64 ? xym1 : f_ga; - const XYm& tmp = is64 ? xym0 : xym3; + const XYm& f_ga = _f_ga; + const XYm& tmpga = xym1; + const XYm& tmp = xym0; Address _32_gaptr = m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga); switch (m_sel.tfx) @@ -2274,9 +2095,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX() case TFX_MODULATE: // GSVector4i ga = iip ? gaf : m_local.c.ga; - - ONLY32(movdqa(f_ga, _32_gaptr)); - // gat = gat.modulate16<1>(ga).clamp8(); modulate16(_ga, f_ga, 1); @@ -2301,8 +2119,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX() { // GSVector4i ga = iip ? gaf : m_local.c.ga; - ONLY32(movdqa(f_ga, _32_gaptr)); - MOVE_IF_64(psrlw, tmpga, f_ga, 7); mix16(_ga, tmpga, tmp); @@ -2313,10 +2129,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX() case TFX_HIGHLIGHT: // GSVector4i ga = iip ? gaf : m_local.c.ga; - - ONLY32(movdqa(f_ga, _32_gaptr)); - ONLY32(movdqa(xym2, f_ga)); // WHY - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); MOVE_IF_64(psrlw, tmpga, f_ga, 7); @@ -2338,9 +2150,6 @@ void GSDrawScanlineCodeGenerator2::AlphaTFX() { // GSVector4i ga = iip ? gaf : m_local.c.ga; - ONLY32(movdqa(f_ga, _32_gaptr)); - ONLY32(movdqa(xym2, f_ga)); - MOVE_IF_64(psrlw, tmpga, f_ga, 7); mix16(_ga, tmpga, tmp); @@ -2505,16 +2314,13 @@ void GSDrawScanlineCodeGenerator2::ColorTFX() return; } - const XYm& f_ga = is64 ? _f_ga : xym2; - const XYm& tmpga = is64 ? xym2 : f_ga; + const XYm& f_ga = _f_ga; + const XYm& tmpga = xym2; auto modulate16_1_rb = [&] { // GSVector4i rb = iip ? rbf : m_local.c.rb; - if (is64) - modulate16(_rb, _f_rb, 1); - else - modulate16(_rb, m_sel.iip ? _rip_local(temp.rb) : _rip_local(c.rb), 1); + modulate16(_rb, _f_rb, 1); }; switch (m_sel.tfx) @@ -2537,14 +2343,6 @@ void GSDrawScanlineCodeGenerator2::ColorTFX() case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: - - if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - ONLY32(movdqa(f_ga, m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga))); - } - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); movdqa(xym1, _ga); @@ -2593,21 +2391,12 @@ void GSDrawScanlineCodeGenerator2::Fog() return; } - const XYm& f = is64 ? _f : xym0; - const XYm& tmp = is64 ? xym0 : xym2; + const XYm& f = _f; + const XYm& tmp = xym0; // rb = m_local.gd->frb.lerp16<0>(rb, f); // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - if (m_sel.prim != GS_SPRITE_CLASS) - { - ONLY32(movdqa(f, _rip_local(temp.f))); - } - else - { - ONLY32(pbroadcastwLocal(f, _rip_local(p.f))); - } - movdqa(xym1, _ga); pbroadcastdLocal(tmp, _rip_global(frb)); @@ -3060,7 +2849,7 @@ void GSDrawScanlineCodeGenerator2::WriteFrame() } - const XYm& tmp = is64 ? xym15 : xym7; + const XYm& tmp = xym15; if (m_sel.fpsm == 2 && m_sel.dthe) { @@ -3379,7 +3168,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl( void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset) { - AddressReg texIn = is64 ? _64_m_local__gd__tex : t2; + AddressReg texIn = _64_m_local__gd__tex; Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]); mov(ebx, lod_addr); mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]); @@ -3397,7 +3186,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplYmm( const Ymm t1[] = { d1, d2s0, d3s1, s2 }; const Ymm t2[] = { tmp, tmp, tmp, tmp }; - bool texInRBX = is32; + bool texInRBX = false; if (use_lod && m_sel.lcm) { ReadTexelImplLoadTexLOD(0, mip_offset); @@ -3489,7 +3278,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplSSE4( else { bool preserve = false; - bool texInRBX = is32; + bool texInRBX = false; if (use_lod && m_sel.lcm) { @@ -3511,7 +3300,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(const Xmm& dst, const Xmm& addr { ASSERT(i < 4); - AddressReg clut = is64 ? _64_m_local__gd__clut : rdx; + AddressReg clut = _64_m_local__gd__clut; AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex; Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4]; diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h index 0e9a6a1860..f6bf1fce57 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h @@ -41,7 +41,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator constexpr static bool isXmm = std::is_same::value; constexpr static bool isYmm = std::is_same::value; - constexpr static int wordsize = is64 ? 8 : 4; + constexpr static int wordsize = 8; constexpr static int vecsize = isXmm ? 16 : 32; constexpr static int vecsizelog = isXmm ? 4 : 5; constexpr static int vecints = vecsize / 4; @@ -67,8 +67,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator constexpr static int _64_rz_r15 = -8 * 5; constexpr static int _64_top = -8 * 6; #endif - constexpr static int _top = is64 ? _64_top : _32_args + 4; - constexpr static int _v = is64 ? _invalid : _32_args + 8; + constexpr static int _top = _64_top; GSScanlineSelector m_sel; GSScanlineLocalData& m_local; diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h index 74a6f35748..297b341511 100644 --- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h @@ -100,30 +100,6 @@ public: }; private: - /// Make sure the register is okay to use - void validateRegister(const Operand& op) - { - if (is64) - return; - if (op.isREG() && (op.isExtIdx() || op.isExt8bit())) - throw Error(Error::ERR_64_BIT_REG_IN_32); - if (op.isMEM()) - { - auto e = static_cast(op).getRegExp(); - validateRegister(e.getIndex()); - validateRegister(e.getBase()); - } - } - /// For easier macro-ing - void validateRegister(int imm) - { - } - - void require64() - { - if (!is64) - throw Error(Error::ERR_64_INSTR_IN_32); - } void requireAVX() { if (!hasAVX) @@ -133,9 +109,6 @@ private: public: Xbyak::CodeGenerator& actual; -#if defined(_M_X86_64) - constexpr static bool is32 = false; - constexpr static bool is64 = true; using AddressReg = Xbyak::Reg64; using RipType = Xbyak::RegRip; @@ -144,18 +117,6 @@ public: template static T64 choose3264(T32 t32, T64 t64) { return t64; } -#else - constexpr static bool is32 = true; - constexpr static bool is64 = false; - using AddressReg = Xbyak::Reg32; - using RipType = int; - - template - struct Choose3264 { using type = T32; }; - - template - static T32 choose3264(T32 t32, T64 t64) { return t32; } -#endif const bool hasAVX, hasAVX2, hasFMA; @@ -238,34 +199,24 @@ public: #define FORWARD1(category, name, type) \ void name(type a) \ { \ - validateRegister(a); \ ACTUAL_FORWARD_##category(name, a) \ } #define FORWARD2(category, name, type1, type2) \ void name(type1 a, type2 b) \ { \ - validateRegister(a); \ - validateRegister(b); \ ACTUAL_FORWARD_##category(name, a, b) \ } #define FORWARD3(category, name, type1, type2, type3) \ void name(type1 a, type2 b, type3 c) \ { \ - validateRegister(a); \ - validateRegister(b); \ - validateRegister(c); \ ACTUAL_FORWARD_##category(name, a, b, c) \ } #define FORWARD4(category, name, type1, type2, type3, type4) \ void name(type1 a, type2 b, type3 c, type4 d) \ { \ - validateRegister(a); \ - validateRegister(b); \ - validateRegister(c); \ - validateRegister(d); \ ACTUAL_FORWARD_##category(name, a, b, c, d) \ } @@ -282,8 +233,6 @@ public: #define FORWARD_SSE_XMM0(name) \ void name(const Xmm& a, const Operand& b) \ { \ - validateRegister(a); \ - validateRegister(b); \ if (hasAVX) \ actual.v##name(a, b, Xmm(0)); \ else \ @@ -326,19 +275,12 @@ public: #define ARGS_XOI const Xmm&, const Operand&, u8 #define ARGS_XXO const Xmm&, const Xmm&, const Operand& -// For instructions that are ifdef'd out without XBYAK64 -#ifdef XBYAK64 - #define REQUIRE64(action) require64(); action -#else - #define REQUIRE64(action) require64() -#endif - const u8 *getCurr() { return actual.getCurr(); } void align(int x = 16) { return actual.align(x); } void db(int code) { actual.db(code); } void L(const std::string& label) { actual.L(label); } - void cdqe() { REQUIRE64(actual.cdqe()); } + void cdqe() { actual.cdqe(); } void ret(int imm = 0) { actual.ret(imm); } void vzeroupper() { requireAVX(); actual.vzeroupper(); } void vzeroall() { requireAVX(); actual.vzeroall(); } @@ -458,7 +400,6 @@ public: FORWARD(3, AVX2, vpsravd, ARGS_XXO) FORWARD(3, AVX2, vpsrlvd, ARGS_XXO) -#undef REQUIRE64 #undef ARGS_OI #undef ARGS_OO #undef ARGS_XI diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index eeef8aa39a..9593f04970 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -19,7 +19,7 @@ using namespace Xbyak; -#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) +#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) #define _64_m_local _64_t0 @@ -53,15 +53,15 @@ GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, , m_rip(false), many_regs(false) // On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap #ifdef _WIN32 - , _64_vertex(is64 ? rcx : r8) - , _index(is64 ? rdx : rcx) - , _dscan(is64 ? r8 : rdx) - , _64_t0(r9), t1(is64 ? r10 : rcx) + , _64_vertex(rcx) + , _index(rdx) + , _dscan(r8) + , _64_t0(r9), t1(r10) #else - , _64_vertex(is64 ? rdi : r8) - , _index(is64 ? rsi : rcx) + , _64_vertex(rdi) + , _index(rsi) , _dscan(rdx) - , _64_t0(is64 ? rcx : r8), t1(is64 ? r8 : rcx) + , _64_t0(rcx), t1(r8) #endif , _m_local(chooseLocal(&m_local, _64_m_local)) { @@ -88,7 +88,7 @@ void GSSetupPrimCodeGenerator2::Generate() m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000; bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip; - many_regs = is64 && isYmm && !m_sel.notest && needs_shift; + many_regs = isYmm && !m_sel.notest && needs_shift; #ifdef _WIN64 int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2; @@ -102,13 +102,11 @@ void GSSetupPrimCodeGenerator2::Generate() } #endif - if (is64 && !m_rip) + if (!m_rip) mov(_64_m_local, (size_t)&m_local); if (needs_shift) { - if (is32) - mov(_dscan, ptr[rsp + _32_dscan]); if (isXmm) mov(rax, (size_t)g_const->m_shift_128b); @@ -193,14 +191,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM() if (m_sel.prim != GS_POINT_CLASS) offset = sizeof(u32) * 1; - if (is32) - mov(_index, ptr[rsp + _32_index]); mov(eax, ptr[_index + offset]); shl(eax, 6); // * sizeof(GSVertexSW) - if (is64) - add(rax, _64_vertex); - else - add(rax, ptr[rsp + _32_vertex]); + add(rax, _64_vertex); movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); @@ -231,14 +224,9 @@ void GSSetupPrimCodeGenerator2::Depth_XMM() { // GSVector4 p = vertex[index[1]].p; - if (is32) - mov(_index, ptr[rsp + _32_index]); mov(eax, ptr[_index + sizeof(u32) * 1]); shl(eax, 6); // * sizeof(GSVertexSW) - if (is64) - add(rax, _64_vertex); - else - add(rax, ptr[rsp + _32_vertex]); + add(rax, _64_vertex); if (m_en.f) { @@ -285,14 +273,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM() if (m_sel.prim != GS_POINT_CLASS) offset = sizeof(u32) * 1; - if (is32) - mov(_index, ptr[rsp + _32_index]); mov(eax, ptr[_index + offset]); shl(eax, 6); // * sizeof(GSVertexSW) - if (is64) - add(rax, _64_vertex); - else - add(rax, ptr[rsp + _32_vertex]); + add(rax, _64_vertex); mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]); mov(_rip_local(p.z), t1.cvt32()); @@ -354,14 +337,9 @@ void GSSetupPrimCodeGenerator2::Depth_YMM() { // GSVector4 p = vertex[index[1]].p; - if (is32) - mov(_index, ptr[rsp + _32_index]); mov(eax, ptr[_index + sizeof(u32) * 1]); shl(eax, 6); // * sizeof(GSVertexSW) - if (is64) - add(rax, _64_vertex); - else - add(rax, ptr[rsp + _32_vertex]); + add(rax, _64_vertex); if (m_en.f) { @@ -564,14 +542,9 @@ void GSSetupPrimCodeGenerator2::Color() if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() { - if (is32) - mov(_index, ptr[rsp + _32_index]); mov(eax, ptr[_index + sizeof(u32) * last]); shl(eax, 6); // * sizeof(GSVertexSW) - if (is64) - add(rax, _64_vertex); - else - add(rax, ptr[rsp + _32_vertex]); + add(rax, _64_vertex); } if (isXmm) diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h index f21ba1fc73..342626807d 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h @@ -46,12 +46,6 @@ class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator constexpr static int dsize = isXmm ? 4 : 8; - constexpr static int _32_args = 0; - constexpr static int _invalid = 0xaaaaaaaa; - constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4; - constexpr static int _32_index = is64 ? _invalid : _32_args + 8; - constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12; - GSScanlineSelector m_sel; GSScanlineLocalData& m_local; bool m_rip;