diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index 3e9cda2898..6a5b1f9ff9 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -639,6 +639,7 @@ set(pcsx2GSSources GS/Renderers/HW/GSTextureCache.cpp GS/Renderers/SW/GSDrawScanline.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp + GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp @@ -708,6 +709,7 @@ set(pcsx2GSHeaders GS/Renderers/HW/GSTextureCache.h GS/Renderers/HW/GSVertexHW.h GS/Renderers/SW/GSDrawScanlineCodeGenerator.h + GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h GS/Renderers/SW/GSDrawScanline.h GS/Renderers/SW/GSNewCodeGenerator.h GS/Renderers/SW/GSRasterizer.h diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp new file mode 100644 index 0000000000..2b647ba92a --- /dev/null +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -0,0 +1,3508 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2021 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "PrecompiledHeader.h" +#include "GSDrawScanlineCodeGenerator.all.h" +#include "GS/Renderers/Common/GSFunctionMap.h" +#include "GSVertexSW.h" + +using namespace Xbyak; + +// Ease the reading of the code +// Note, there are versions without the _64 prefix that can be used as source (but not destination) operands on both 32 and 64 bit +#define _64_g_const r10 +#define _64_m_local r12 +#define _64_m_local__gd r13 +#define _64_m_local__gd__vm t3 +#define _64_m_local__gd__clut r11 +// If use_lod, m_local.gd->tex, else m_local.gd->tex[0] +#define _64_m_local__gd__tex r14 + +#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) +#define _rip_global(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) + +/// Executes the given code only if targeting 32-bit (do/while wrapper makes the expansion one statement, so a following else cannot bind to the inner if) +#define ONLY32(code) do { if (is32) { (code); } } while (0) + +/// Executes the given code only if targeting 64-bit (do/while wrapper makes the expansion one statement, so a following else cannot bind to the inner if) +#define ONLY64(code) do { if (is64) { (code); } } while (0) + +/// Combines temporary with either dst64 on 64-bit or src32 on 32-bit +/// Follow up with an ONLY32 save back to src32 (expansion is a single do/while statement, like THREEARG) +#define REG_64_MEM_32(operation, dst64, temporary, src32) \ + do { \ + if (is32) operation(temporary, src32); \ + else operation(dst64, temporary); \ + } while (0) + +/// On AVX, does a v-prefixed separate destination operation +/// On SSE, moves src1 into dst using movdqa, then does the operation +#define THREEARG(operation, dst, src1, ...) \ + do \ + { \ + if (hasAVX) \ + { \ + v##operation(dst, src1, __VA_ARGS__); \ + } \ + else \ + { \ + movdqa(dst, src1); \ + operation(dst, __VA_ARGS__); \ + } \ + } while (0) + +/// On x64, does a 3-operand move, on x86 uses a two-operand SSE-style +#define MOVE_IF_64(operation, dst, src64, ...) 
\ + do \ + { \ + if (is64) \ + { \ + THREEARG(operation, dst, src64, __VA_ARGS__); \ + } \ + else \ + { \ + operation(dst, __VA_ARGS__); \ + } \ + } while (0) + +#define USING_XMM DRAW_SCANLINE_USING_XMM +#define USING_YMM DRAW_SCANLINE_USING_YMM + +#if _M_SSE >= 0x501 + /// On AVX2, uses the given broadcast to load into the temp register, then applies the given op + /// Otherwise, applies the given op directly + #define BROADCAST_AND_OP(broadcast, op, dst, tmpReg, src) \ + do \ + { \ + broadcast(tmpReg, src); \ + op(dst, tmpReg); \ + } while (0) + #define _rip_local_d(x) _rip_local(d8.x) + #define _rip_local_d_p(x) _rip_local_d(p.x) +#else + /// On AVX2, uses the given broadcast to load into the temp register, then applies the given op + /// Otherwise, applies the given op directly + #define BROADCAST_AND_OP(broadcast, op, dst, tmpReg, src) \ + op(dst, src) + #define _rip_local_d(x) _rip_local(d4.x) + #define _rip_local_d_p(x) _rip_local_d(x) +#endif + +GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key) + : _parent(base, cpu) + , m_local(*(GSScanlineLocalData*)param) + , m_rip(false) +#ifdef _WIN32 + , a0(rcx) , a1(rdx) + , a2(r8) , a3(is64 ? r9 : rbx) + , t0(rdi) , t1(rsi) + , t2(is64 ? r8 : rbp), t3(r9) +#else + , a0(is64 ? rdi : rcx), a1(is64 ? rsi : rdx) + , a2(is64 ? rdx : r8), a3(is64 ? rcx : rbx) + , t0(is64 ? r8 : rdi), t1(is64 ? r9 : rsi) + , t2(is64 ? rcx : rbp), t3(is64 ? rsi : r8) +#endif + , _g_const(chooseLocal(&*g_const, _64_g_const)) + , _m_local(chooseLocal(&m_local, _64_m_local)) + , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) + , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) + , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(is64 ? 
xym15 : xym7) + , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) +{ + m_sel.key = key; + use_lod = m_sel.mmin; + if (isYmm) + ASSERT(hasAVX2); +} + +// MARK: - Helpers + +GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr) +{ + if (is64) + mov(reg, (size_t)addr); + return choose3264((size_t)addr, reg); +} + +void GSDrawScanlineCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vbroadcastf128(reg, mem); +#else + movaps(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::broadcasti128(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vbroadcasti128(reg, mem); +#else + movdqa(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::broadcastssLocal(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vbroadcastss(reg, mem); +#else + movaps(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::pbroadcastqLocal(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vpbroadcastq(reg, mem); +#else + movdqa(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::pbroadcastdLocal(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vpbroadcastd(reg, mem); +#else + movdqa(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::pbroadcastwLocal(const XYm& reg, const Address& mem) +{ +#if USING_YMM + vpbroadcastw(reg, mem); +#else + movdqa(reg, mem); +#endif +} + +void GSDrawScanlineCodeGenerator2::broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr) +{ + movd(Xmm(vec.getIdx()), gpr); +#if USING_YMM + vpbroadcastd(vec, Xmm(vec.getIdx())); +#else + pshufd(vec, vec, _MM_SHUFFLE(0, 0, 0, 0)); +#endif +} + +void GSDrawScanlineCodeGenerator2::modulate16(const XYm& a, const Operand& f, uint8 shift) +{ + if (shift == 0) + { + pmulhrsw(a, f); + } + else + { + psllw(a, shift + 1); + pmulhw(a, f); + } +} + +void GSDrawScanlineCodeGenerator2::lerp16(const XYm& a, const XYm& b, const XYm& f, uint8 shift) +{ + 
psubw(a, b); + modulate16(a, f, shift); + paddw(a, b); +} + +void GSDrawScanlineCodeGenerator2::lerp16_4(const XYm& a, const XYm& b, const XYm& f) +{ + psubw(a, b); + pmullw(a, f); + psraw(a, 4); + paddw(a, b); +} + +void GSDrawScanlineCodeGenerator2::mix16(const XYm& a, const XYm& b, const XYm& temp) +{ + pblendw(a, b, 0xaa); +} + +void GSDrawScanlineCodeGenerator2::clamp16(const XYm& a, const XYm& temp) +{ + if (isXmm) + { + packuswb(a, a); + pmovzxbw(a, a); + } + else + { + packuswb(a, a); + pxor(temp, temp); + punpcklbw(a, temp); + } +} + +void GSDrawScanlineCodeGenerator2::alltrue(const XYm& test) +{ + uint32 mask = test.isYMM() ? 0xffffffff : 0xffff; + pmovmskb(eax, test); + cmp(eax, mask); + je("step", GSCodeGenerator::T_NEAR); +} + +void GSDrawScanlineCodeGenerator2::blend(const XYm& a, const XYm& b, const XYm& mask) +{ + pand(b, mask); + pandn(mask, a); + if (hasAVX) + { + vpor(a, b, mask); + } + else + { + por(b, mask); + movdqa(a, b); + } +} + +void GSDrawScanlineCodeGenerator2::blendr(const XYm& b, const XYm& a, const XYm& mask) +{ + pand(b, mask); + pandn(mask, a); + por(b, mask); +} + +void GSDrawScanlineCodeGenerator2::blend8(const XYm& a, const XYm& b) +{ + pblendvb(a, b /*, xym0 */); +} + +void GSDrawScanlineCodeGenerator2::blend8r(const XYm& b, const XYm& a) +{ + if (hasAVX) + { + vpblendvb(b, a, b, xym0); + } + else + { + pblendvb(a, b); + movdqa(b, a); + } +} + +void GSDrawScanlineCodeGenerator2::split16_2x8(const XYm& l, const XYm& h, const XYm& src) +{ + // l = src & 0xFF; (1 left shift + 1 right shift) + // h = (src >> 8) & 0xFF; (1 right shift) + + if (hasAVX) + { + if (src == h) + { + vpsllw(l, src, 8); + psrlw(h, 8); + } + else if (src == l) + { + vpsrlw(h, src, 8); + psllw(l, 8); + } + else + { + vpsllw(l, src, 8); + vpsrlw(h, src, 8); + } + psrlw(l, 8); + } + else + { + if (src == h) + { + movdqa(l, src); + } + else if (src == l) + { + movdqa(h, src); + } + else + { + movdqa(l, src); + movdqa(h, src); + } + psllw(l, 8); + psrlw(l, 8); + 
psrlw(h, 8); + } +} + +// MARK: - Main Implementation + +void GSDrawScanlineCodeGenerator2::Generate() +{ + bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; + bool need_clut = need_tex && m_sel.tlu; + m_rip = (size_t)getCurr() < 0x80000000; // rip-relative addressing is only used when code and data all sit below 2GB + m_rip &= (size_t)&m_local < 0x80000000; + m_rip &= (size_t)m_local.gd < 0x80000000; // _rip_global addresses fields of *m_local.gd, so test the pointee, not the pointer member + + if (is32) + { + push(rbx); + push(rsi); + push(rdi); + push(rbp); + } + else + { + push(rbp); + mov(rbp, rsp); // Stack traces look much nicer this way +#ifdef _WIN32 + push(rbx); + push(rsi); + push(rdi); + push(r12); + push(r13); + push(r14); + + sub(rsp, _64_win_stack_size); + + for (int i = 0; i < 10; i++) + { + movdqa(ptr[rsp + _64_win_xmm_start + 16 * i], Xmm(i + 6)); + } +#else + mov(ptr[rsp + _64_rz_rbx], rbx); + if (!m_rip) + { + mov(ptr[rsp + _64_rz_r12], r12); + mov(ptr[rsp + _64_rz_r13], r13); + } + mov(ptr[rsp + _64_rz_r14], r14); + mov(ptr[rsp + _64_rz_r15], r15); +#endif + mov(_64_g_const, (size_t)&*g_const); + if (!m_rip) + { + mov(_64_m_local, (size_t)&m_local); + mov(_64_m_local__gd, _rip_local(gd)); + } + + if (need_clut) + mov(_64_m_local__gd__clut, _rip_global(clut)); + } + + Init(); + + if (!m_sel.edge) + { + align(16); + } + +L("loop"); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // xym0 = z/zi | + // xym2 = s/u (tme) | free + // xym3 = t/v (tme) | free + // xym4 = q (tme) | free + // xym5 = rb (!tme) + // xym6 = ga (!tme) + // xym7 = test | free + // xym15 = | test + + bool tme = m_sel.tfx != TFX_NONE; + + TestZ(tme ? xym5 : xym2, tme ? 
xym6 : xym3); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = s/u (tme) | free + // xym3 = t/v (tme) | free + // xym4 = q (tme) | free + // xym5 = rb (!tme) + // xym6 = ga (!tme) + // xym7 = test | free + // xym15 = | test + + if (use_lod) + { + SampleTextureLOD(); + } + else + { + SampleTexture(); + } + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = free + // xym3 = free + // xym4 = free + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + AlphaTFX(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) | free + // xym3 = free | free + // xym4 = free | free + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + ReadMask(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) | free + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + TestAlpha(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) | free + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + ColorTFX(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = free + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + Fog(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // xym2 = free + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + ReadFrame(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // ebx = fa + // xym2 = fd + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + TestDestAlpha(); + + // a0 = steps + // 
t1 = fza_base + // t0 = fza_offset + // t2 = za + // ebx = fa + // xym2 = fd + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + // xym7 = test | free + // xym15 = | test + + WriteMask(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // t2 = za + // edx = fzm + // ebx = fa + // xym2 = fd + // xym3 = fm + // xym4 = zm + // xym5 = rb + // xym6 = ga + + WriteZBuf(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // edx = fzm + // ebx = fa + // xym2 = fd + // xym3 = fm + // xym4 = free + // xym5 = rb + // xym6 = ga + + AlphaBlend(); + + // a0 = steps + // t1 = fza_base + // t0 = fza_offset + // edx = fzm + // ebx = fa + // xym2 = fd + // xym3 = fm + // xym4 = free + // xym5 = rb + // xym6 = ga + + WriteFrame(); + +L("step"); + + // if(steps <= 0) break; + + if (!m_sel.edge) + { + test(a0.cvt32(), a0.cvt32()); + + jle("exit", CodeGenerator::T_NEAR); + + Step(); + + jmp("loop", CodeGenerator::T_NEAR); + } + +L("exit"); + + + + if (is32) + { + pop(ebp); + pop(edi); + pop(esi); + pop(ebx); + + ret(8); + } + else + { +#ifdef _WIN32 + for (int i = 0; i < 10; i++) + { + movdqa(Xmm(i + 6), ptr[rsp + _64_win_xmm_start + 16 * i]); + } + add(rsp, _64_win_stack_size); + + pop(r14); + pop(r13); + pop(r12); + pop(rdi); + pop(rsi); + pop(rbx); +#else + mov(rbx, ptr[rsp + _64_rz_rbx]); + if (!m_rip) + { + mov(r12, ptr[rsp + _64_rz_r12]); + mov(r13, ptr[rsp + _64_rz_r13]); + } + mov(r14, ptr[rsp + _64_rz_r14]); + mov(r15, ptr[rsp + _64_rz_r15]); +#endif + pop(rbp); + if (isYmm) + vzeroupper(); + ret(); + } +} + +/// Inputs: a0=pixels, a1=left, a2[x64]=top, a3[x64]=v +void GSDrawScanlineCodeGenerator2::Init() +{ + if (!m_sel.notest) + { + // int skip = left & 3; + + mov(ebx, a1.cvt32()); + and(a1.cvt32(), vecints - 1); + + // left -= skip; + + sub(ebx, a1.cvt32()); + + // int steps = pixels + skip - 4; + + lea(a0.cvt32(), ptr[a0 + a1 - vecints]); + + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + + mov(eax, a0.cvt32()); + 
sar(eax, 31); // GH: 31 to extract the sign of the register + and(eax, a0.cvt32()); + if (isXmm) + shl(eax, 4); // * sizeof(m_test[0]) + ONLY64(cdqe()); + + if (isXmm) + { + shl(a1.cvt32(), 4); // * sizeof(m_test[0]) + movdqa(_test, ptr[a1 + _g_const + offsetof(GSScanlineConstantData, m_test_128b[0])]); + por(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]); + } + else + { + pmovsxbd(_test, ptr[a1 * 8 + _g_const + offsetof(GSScanlineConstantData, m_test_256b[0])]); + pmovsxbd(xym0, ptr[rax * 8 + _g_const + offsetof(GSScanlineConstantData, m_test_256b[15])]); + por(_test, xym0); + shl(a1.cvt32(), 5); // * sizeof(m_test[0]) + } + } + else + { + mov(ebx, a1.cvt32()); // left + xor(a1.cvt32(), a1.cvt32()); // skip + lea(a0.cvt32(), ptr[a0 - vecints]); // steps + } + + // a0 = steps + // a1 = skip + // a2[x64] = top + // a3[x64] = v + // rbx = left + // Free: rax, t0, t1 + + if (is64) + { + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + mov(rax, _rip_global(fzbr)); + lea(t1, ptr[rax + a2 * 8]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + mov(rax, _rip_global(fzbc)); + lea(t0, ptr[rax + rbx * 2]); + } + else + { + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + mov(t1, ptr[rsp + _top]); + lea(t1, ptr[t1 * 8]); + add(t1, ptr[&m_local.gd->fzbr]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + lea(t0, ptr[rbx * 2]); + add(t0, ptr[(size_t)&m_local.gd->fzbc]); + } + + if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) + { + // a1 = &m_local.d[skip] // note a1 was (skip << 4) + + if (is64) + { + lea(rax, _rip_local(d)); + lea(a1, ptr[rax + a1 * 8]); + } + else + { + lea(a1, ptr[(size_t)m_local.d + a1 * 8]); + // a3 starts on the stack in x86, we want it in a register + mov(a3, ptr[rsp + _v]); + } + } + + // a0 = steps (rcx | rdi) + // a1 = skip (rdx | rsi) + // a2[x64] = top (r8 | rdx) + // a3 = v (rbx | rcx) + 
// t0 = fza_offset (rdi | r8 ) + // t1 = fza_base (rsi | r9 ) + // Free: rax + + const XYm& f = is64 ? _f : xym1; + const XYm& z = is64 ? _z : xym0; + + if (m_sel.prim != GS_SPRITE_CLASS) + { + if (m_sel.fwrite && m_sel.fge || m_sel.zb) + { + broadcastf128(z, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p + + if (m_sel.fwrite && m_sel.fge) + { + // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); + + cvttps2dq(f, z); + pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2)); + paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]); + + if (is32) // _f is shared on x86 + movdqa(ptr[&m_local.temp.f], f); + } + + if (m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2)); + if (is64) + { + addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + } + else + { + movaps(ptr[&m_local.temp.z], z); + movaps(xym2, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]); + movaps(ptr[&m_local.temp.zo], xym2); + addps(z, xym2); + } + } + } + } + else + { + if (m_sel.ztest) + { + pbroadcastdLocal(z, _rip_local(p.z)); + } + + if (m_sel.fwrite && m_sel.fge && is64) + pbroadcastdLocal(_f, _rip_local(p.f)); + } + + const XYm& vt = xym4; + + if (m_sel.fb) + { + if (m_sel.edge || m_sel.tfx != TFX_NONE) + { + broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t + } + + if (m_sel.edge) + { + // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + + pshufhw(xym3, vt, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(xym3, xym3, _MM_SHUFFLE(3, 3, 3, 3)); + psrlw(xym3, 9); + + movdqa(_rip_local(temp.cov), xym3); + } + + if (m_sel.tfx != TFX_NONE) + { + // a1 = &m_local.d[skip] + + const XYm& s = is64 ? _s : xym2; + const XYm& t = is64 ? 
_t : xym3; + + if (m_sel.fst) + { + // GSVector4i vti(vt); + + cvttps2dq(xym6, vt); + + // s = vti.xxxx() + m_local.d[skip].s; + // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; + + pshufd(s, xym6, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(t, xym6, _MM_SHUFFLE(1, 1, 1, 1)); + + paddd(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); + + if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + { + paddd(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); + } + else if (m_sel.ltf) + { + XYm vf = is64 ? xym7 : xym6; + pshuflw(vf, t, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(vf, 12); + movdqa(_rip_local(temp.vf), vf); + } + + ONLY32(movdqa(_rip_local(temp.s), s)); + ONLY32(movdqa(_rip_local(temp.t), t)); + } + else + { + const XYm& q = is64 ? _q : vt; + + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + if (hasAVX) + { + vshufps(s, vt, vt, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(t, vt, vt, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(q, vt, vt, _MM_SHUFFLE(2, 2, 2, 2)); + } + else + { + movaps(s, vt); + movaps(t, vt); + ONLY64(movaps(q, vt)); + + shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2)); + } + + addps(s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); + addps(t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); + addps(q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); + + if (is32) + { + movaps(ptr[&m_local.temp.s], s); + movaps(ptr[&m_local.temp.t], t); + movaps(ptr[&m_local.temp.q], q); + } + } + } + + if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + const XYm& f_rb = is64 ? _f_rb : xym5; + const XYm& f_ga = is64 ? 
_f_ga : xym6; + if (m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + if (isXmm) + { + cvttps2dq(xym6, ptr[a3 + offsetof(GSVertexSW, c)]); // v.c + } + else + { + vbroadcastf128(ymm6, ptr[a3 + offsetof(GSVertexSW, c)]); + cvttps2dq(ymm6, ymm6); + } + + // vc = vc.upl16(vc.zwxy()); + + pshufd(xym5, xym6, _MM_SHUFFLE(1, 0, 3, 2)); + punpcklwd(xym6, xym5); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + pshufd(f_rb, xym6, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(f_ga, xym6, _MM_SHUFFLE(2, 2, 2, 2)); + + paddw(f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); + paddw(f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); + + ONLY32(movdqa(ptr[&m_local.temp.rb], f_rb)); + ONLY32(movdqa(ptr[&m_local.temp.ga], f_ga)); + } + else if (is64 || m_sel.tfx == TFX_NONE) + { + movdqa(f_rb, _rip_local(c.rb)); + movdqa(f_ga, _rip_local(c.ga)); + } + + ONLY64(movdqa(_rb, _f_rb)); + ONLY64(movdqa(_ga, _f_ga)); + } + } + + if (is64) + { + if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) + { + // On linux, a2 is edx which will be used for fzm + // In all case, it will require a mov in dthe code, so let's keep the value on the stack + mov(ptr[rsp + _top], a2); + } + + mov(_64_m_local__gd__vm, _rip_global(vm)); + if (m_sel.fb && m_sel.tfx != TFX_NONE) + { + if (use_lod) + lea(_64_m_local__gd__tex, _rip_global(tex)); + else + mov(_64_m_local__gd__tex, _rip_global(tex)); + } + } +} + +/// Inputs: a0=steps, t0=fza_offset +/// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test +/// Destroys[x86]: all +/// Destroys[x64]: xym0, xym1, xym2, xym3 +void GSDrawScanlineCodeGenerator2::Step() +{ + // steps -= 4; + + sub(a0.cvt32(), vecints); + + // fza_offset++; + + add(t0, vecsize / 2); + + const XYm& z = is64 ? _z : xym0; + const XYm& f = is64 ? 
_f : xym1; + + if (m_sel.prim != GS_SPRITE_CLASS) + { + // z += m_local.d4.z; + + if (m_sel.zb) + { + if (is32) + { + broadcastssLocal(z, _rip_local_d_p(z)); + addps(z, _rip_local(temp.zo)); + movaps(_rip_local(temp.zo), z); + addps(z, _rip_local(temp.z)); + } + else + { + BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z)); + } + } + + // f = f.add16(m_local.d4.f); + + if (m_sel.fwrite && m_sel.fge) + { + if (is32) + { + pbroadcastwLocal(f, _rip_local_d_p(f)); + paddw(f, _rip_local(temp.f)); + movdqa(_rip_local(temp.f), f); + } + else + { + BROADCAST_AND_OP(vpbroadcastw, paddw, f, xym0, _rip_local_d_p(f)); + } + } + } + else + { + if (is32 && m_sel.ztest) + { + pbroadcastdLocal(z, _rip_local(p.z)); + } + } + + if (m_sel.fb) + { + if (m_sel.tfx != TFX_NONE) + { + if (m_sel.fst) + { + const XYm& stq = is64 ? xym0 : xym4; + // GSVector4i stq = m_local.d4.stq; + + // s += stq.xxxx(); + // if(!sprite) t += st.yyyy(); + + broadcasti128(stq, _rip_local_d(stq)); + + XYm s = is64 ? xym1 : xym2; + pshufd(s, stq, _MM_SHUFFLE(0, 0, 0, 0)); + REG_64_MEM_32(paddd, _s, s, _rip_local(temp.s)); + ONLY32(movdqa(_rip_local(temp.s), s)); + + XYm t = is64 ? xym1 : xym3; + if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + { + pshufd(t, stq, _MM_SHUFFLE(1, 1, 1, 1)); + REG_64_MEM_32(paddd, _t, t, _rip_local(temp.t)); + ONLY32(movdqa(_rip_local(temp.t), t)); + } + else + { + ONLY32(movdqa(t, _rip_local(temp.t))); + } + } + else + { + const XYm& s = xym2; + const XYm& t = xym3; + const XYm& q = is64 ? 
xym1 : xym4; + // GSVector4 stq = m_local.d4.stq; + + // s += stq.xxxx(); + // t += stq.yyyy(); + // q += stq.zzzz(); + + if (hasAVX) + { + broadcastf128(q, _rip_local_d(stq)); + + vshufps(s, q, q, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(t, q, q, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(q, q, q, _MM_SHUFFLE(2, 2, 2, 2)); + } + else + { + movaps(q, _rip_local_d(stq)); + movaps(s, q); + movaps(t, q); + + shufps(s, s, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(t, t, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(q, q, _MM_SHUFFLE(2, 2, 2, 2)); + } + + REG_64_MEM_32(addps, _s, s, _rip_local(temp.s)); + REG_64_MEM_32(addps, _t, t, _rip_local(temp.t)); + REG_64_MEM_32(addps, _q, q, _rip_local(temp.q)); + + ONLY32(movaps(_rip_local(temp.s), s)); + ONLY32(movaps(_rip_local(temp.t), t)); + ONLY32(movaps(_rip_local(temp.q), q)); + } + } + + if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if (m_sel.iip) + { + XYm c = is64 ? xym0 : xym7; + // GSVector4i c = m_local.d4.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + pbroadcastqLocal(c, _rip_local_d(c)); + + pshufd(_rb, c, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(_ga, c, _MM_SHUFFLE(1, 1, 1, 1)); + + REG_64_MEM_32(paddw, _f_rb, _rb, _rip_local(temp.rb)); + REG_64_MEM_32(paddw, _f_ga, _ga, _rip_local(temp.ga)); + + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + pxor(c, c); + pmaxsw(is64 ? _f_rb : _rb, c); + pmaxsw(is64 ? 
_f_ga : _ga, c); + + ONLY32(movdqa(_rip_local(temp.rb), _rb)); + ONLY32(movdqa(_rip_local(temp.ga), _ga)); + } + else + { + if (m_sel.tfx == TFX_NONE) + { + ONLY32(movdqa(_rb, ptr[&m_local.c.rb])); + ONLY32(movdqa(_ga, ptr[&m_local.c.ga])); + } + } + + ONLY64(movdqa(_rb, _f_rb)); + ONLY64(movdqa(_ga, _f_ga)); + } + } + + if (!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; + + mov(eax, a0.cvt32()); + sar(eax, 31); // GH: 31 to extract the sign of the register + and(eax, a0.cvt32()); + if (isXmm) + shl(eax, 4); + ONLY64(cdqe()); + +#if USING_XMM + movdqa(_test, ptr[rax + _g_const + offsetof(GSScanlineConstantData, m_test_128b[7])]); +#else + pmovsxbd(_test, ptr[rax * 8 + _g_const + offsetof(GSScanlineConstantData, m_test_256b[15])]); +#endif + } +} + +/// Inputs: xym0[x86]=z, t1=fza_base, t0=fza_offset, _test +/// Outputs: t2=za +/// Destroys: rax, xym0, temp1, temp2 +void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2) +{ + if (!m_sel.zb) + { + return; + } + + const XYm& z = is64 ? 
_z : xym0; + + // int za = fza_base.y + fza_offset->y; + + mov(t2.cvt32(), dword[t1 + 4]); + add(t2.cvt32(), dword[t0 + 4]); + and(t2.cvt32(), HALF_VM_SIZE - 1); + + // GSVector4i zs = zi; + + if (m_sel.prim != GS_SPRITE_CLASS) + { + if (m_sel.zoverflow) + { + // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + auto m_half = loadAddress(rax, &GSVector4::m_half); + + if (hasAVX) + vbroadcastss(temp1, ptr[m_half]); + else + movaps(temp1, ptr[m_half]); + mulps(temp1, z); + cvttps2dq(temp1, temp1); + pslld(temp1, 1); + + cvttps2dq(xym0, z); + pcmpeqd(temp2, temp2); + psrld(temp2, 31); + pand(xym0, temp2); + + por(xym0, temp1); + } + else + { + // zs = GSVector4i(z); + + cvttps2dq(xym0, z); + } + + if (m_sel.zclamp) + { + const uint8 amt = (uint8)((m_sel.zpsm & 0x3) * 8); + pcmpeqd(temp1, temp1); + psrld(temp1, amt); + pminsd(xym0, temp1); + } + + if (m_sel.zwrite) + { + movdqa(_rip_local(temp.zs), xym0); + } + } + else + { + ONLY64(movdqa(xym0, _z)); + } + + if (m_sel.ztest) + { + ReadPixel(temp2, temp1, t2); + + if (m_sel.zwrite && m_sel.zpsm < 2) + { + movdqa(_rip_local(temp.zd), temp2); + } + + // zd &= 0xffffffff >> m_sel.zpsm * 8; (shift left then right by the same amount clears the top zpsm * 8 bits) + + if (m_sel.zpsm) + { + pslld(temp2, static_cast<uint8>(m_sel.zpsm * 8)); + psrld(temp2, static_cast<uint8>(m_sel.zpsm * 8)); + } + + if (m_sel.zoverflow || m_sel.zpsm == 0) + { + // GSVector4i o = GSVector4i::x80000000(); + + pcmpeqd(temp1, temp1); + pslld(temp1, 31); + + // GSVector4i zso = zs - o; + // GSVector4i zdo = zd - o; + + psubd(xym0, temp1); + psubd(temp2, temp1); + } + + switch (m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + pcmpgtd(temp2, xym0); + por(_test, temp2); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + pcmpgtd(xym0, temp2); + pcmpeqd(temp1, temp1); + pxor(xym0, temp1); + por(_test, xym0); + break; + } + + alltrue(_test); + } +} + +/// 
Input[x86]: xym4=q, xym2=s, xym3=t +/// Output: _rb, _ga +/// Destroys everything except xym7[x86] +void GSDrawScanlineCodeGenerator2::SampleTexture() +{ + if (!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + + if (is32) + { + mov(ebx, ptr[&m_local.gd->tex[0]]); + + if (m_sel.tlu) + { + mov(edx, ptr[&m_local.gd->clut]); + } + } + + const bool needsMoreRegs = isYmm; + + if (!m_sel.fst) + { + rcpps(xym0, is64 ? _q : xym4); + + MOVE_IF_64(mulps, xym2, _s, xym0); + MOVE_IF_64(mulps, xym3, _t, xym0); + + cvttps2dq(xym2, xym2); + cvttps2dq(xym3, xym3); + + if (m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + broadcastGPRToVec(xym1, eax); + + psubd(xym2, xym1); + psubd(xym3, xym1); + } + } + else + { + ONLY64(movdqa(xym2, _s)); + ONLY64(movdqa(xym3, _t)); + } + + if (m_sel.ltf) + { + const XYm& vf = is64 ? xym7 : xym0; + + // GSVector4i uf = u.xxzzlh().srl16(12); + + pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xym4, 12); + if (is32 && needsMoreRegs) + movdqa(_rip_local(temp.uf), xym4); + + if (m_sel.prim != GS_SPRITE_CLASS) + { + // GSVector4i vf = v.xxzzlh().srl16(12); + + pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(vf, 12); + if (is32 || needsMoreRegs) + movdqa(_rip_local(temp.vf), vf); + } + else if (is64 && !needsMoreRegs) + { + movdqa(vf, _rip_local(temp.vf)); + } + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + psrad(xym2, 16); + psrad(xym3, 16); + packssdw(xym2, xym3); + + if (m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + pcmpeqd(xym0, xym0); + psrlw(xym0, 15); + THREEARG(paddw, xym3, xym2, xym0); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(xym2, xym3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(xym2); + } + + // xym2 = uv0 + // xym3 = uv1 + // xym4 = uf[x64||!needsMoreRegs] + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym0, xym1, xym5, xym6 + + 
SampleTexture_TexelReadHelper(0); + + // xym5 = rb (xym5[x86], xym2[x64]) + // xym6 = ga (xym6[x86], xym3[x64]) +} + +/// Input[x86]: xym2=uv0, xym3=uv1 (ltf), xym4=uf (!needsMoreRegs) +/// Input[x64]: xym2=uv0, xym3=uv1 (ltf), xym4=uf, xym7=vf (!needsMoreRegs) +/// Output: _rb, _ga +/// Destroys all registers except outputs, xmm4 and xmm7 +void GSDrawScanlineCodeGenerator2::SampleTexture_TexelReadHelper(int mip_offset) +{ + const bool needsMoreRegs = isYmm; + + // GSVector4i x0 = uv0.upl16(); + // GSVector4i y0 = uv0.uph16() << tw; (the emitted shift amount is m_sel.tw + 3) + + pxor(xym0, xym0); + + THREEARG(punpcklwd, xym5, xym2, xym0); + punpckhwd(xym2, xym0); + pslld(xym2, static_cast<uint8>(m_sel.tw + 3)); + + // xym0 = 0 + // xym2 = y0 + // xym3 = uv1 (ltf) + // xym4 = uf[x64||!needsMoreRegs] + // xym5 = x0 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym1, xym6 + + if (m_sel.ltf) + { + // GSVector4i x1 = uv1.upl16(); + // GSVector4i y1 = uv1.uph16() << tw; (the emitted shift amount is m_sel.tw + 3) + + THREEARG(punpcklwd, xym1, xym3, xym0); + punpckhwd(xym3, xym0); + pslld(xym3, static_cast<uint8>(m_sel.tw + 3)); + + // xym1 = x1 + // xym2 = y0 + // xym3 = y1 + // xym4 = uf[x64||!needsMoreRegs] + // xym5 = x0 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym0, xym6 + + // GSVector4i addr00 = y0 + x0; + // GSVector4i addr01 = y0 + x1; + // GSVector4i addr10 = y1 + x0; + // GSVector4i addr11 = y1 + x1; + + THREEARG(paddd, xym0, xym3, xym1); // addr11 + paddd(xym1, xym2); // addr01 + paddd(xym2, xym5); // addr00 + paddd(xym3, xym5); // addr10 + + // xym0 = addr11 + // xym1 = addr01 + // xym2 = addr00 + // xym3 = addr10 + // xym4 = uf[x64||!needsMoreRegs] + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym4, xym5 + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + const XYm& tmp1 = is64 ? 
xym7 : xym4; // OK to destroy if needsMoreRegs + const XYm& tmp2 = is64 ? xym4 : xym7; + // d0 d1 d2s0 d3s1 s2 s3 + ReadTexel4(xym5, xym6, xym0, xym2, xym1, xym3, tmp1, tmp2, mip_offset); + + // xym0 = c01 + // xym2 = c10 + // xym4 = uf[x64||!needsMoreRegs] + // xym5 = c11 + // xym6 = c00 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + + if (is32 && needsMoreRegs) + movdqa(xym4, _rip_local(temp.uf)); + + // GSVector4i rb00 = c00 & mask; + // GSVector4i ga00 = (c00 >> 8) & mask; + + split16_2x8(xym3, xym6, xym6); + + // GSVector4i rb01 = c01 & mask; + // GSVector4i ga01 = (c01 >> 8) & mask; + + split16_2x8(xym0, xym1, xym0); + + // xym0 = rb01 + // xym1 = ga01 + // xym2 = c10 + // xym3 = rb00 + // xym4 = uf + // xym5 = c11 + // xym6 = ga00 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); + + lerp16_4(xym0, xym3, xym4); + lerp16_4(xym1, xym6, xym4); + + // xym0 = rb00 + // xym1 = ga00 + // xym2 = c10 + // xym4 = uf + // xym5 = c11 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + + // GSVector4i rb10 = c10 & mask; + // GSVector4i ga10 = (c10 >> 8) & mask; + + split16_2x8(xym2, xym3, xym2); + + // GSVector4i rb11 = c11 & mask; + // GSVector4i ga11 = (c11 >> 8) & mask; + + split16_2x8(xym5, xym6, xym5); + + // xym0 = rb00 + // xym1 = ga00 + // xym2 = rb10 + // xym3 = ga10 + // xym4 = uf + // xym5 = rb11 + // xym6 = ga11 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); + + lerp16_4(xym5, xym2, xym4); + lerp16_4(xym6, xym3, xym4); + + // xym0 = rb00 + // xym1 = ga00 + // xym5 = rb10 + // xym6 = ga10 + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); + + XYm vf = is64 ? 
xym7 : xym2; + if (needsMoreRegs || is32) + movdqa(vf, _rip_local(temp.vf)); + + lerp16_4(xym5, xym0, vf); + lerp16_4(xym6, xym1, vf); + } + else + { + // GSVector4i addr00 = y0 + x0; + + paddd(xym2, xym5); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel1(xym5, xym2, xym0, xym1, mip_offset); + + // GSVector4i mask = GSVector4i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + split16_2x8(xym5, xym6, xym5); + } +} + +void GSDrawScanlineCodeGenerator2::Wrap(const XYm& uv) +{ + // Registers free from SampleTexture + const XYm& mask = xym0; + const XYm& min = xym1; + const XYm& max = xym5; + const XYm& tmp = xym6; + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + BROADCAST_AND_OP(vbroadcasti128, pmaxsw, uv, min, _rip_global(t.min)); + } + else + { + pxor(tmp, tmp); + pmaxsw(uv, tmp); + } + + BROADCAST_AND_OP(vbroadcasti128, pminsw, uv, max, _rip_global(t.max)); + } + else + { + BROADCAST_AND_OP(vbroadcasti128, pand, uv, min, _rip_global(t.min)); + + if (region) + { + BROADCAST_AND_OP(vbroadcasti128, por, uv, max, _rip_global(t.max)); + } + } + } + else + { + broadcasti128(min, _rip_global(t.min)); + broadcasti128(max, _rip_global(t.max)); + broadcasti128(mask, _rip_global(t.mask)); + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + THREEARG(pand, tmp, uv, min); + if (region) + por(tmp, max); + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + pmaxsw(uv, min); + pminsw(uv, max); + // clamp.blend8(repeat, m_local.gd->t.mask); + blend8(uv, tmp /*, xym0==mask */); + } +} + +/// Destroys[x86]: xym0, xym1, xym2, xym3, xym4[!sse41] +/// Destroys[x64]: xym0, xym1, xym5, xym6, xym7[!sse41] +void GSDrawScanlineCodeGenerator2::Wrap(const XYm& uv0, const XYm& uv1) +{ + // Registers free from SampleTexture + 
const XYm& mask = xym0; + const XYm& min = xym1; + const XYm& max = xym5; + const XYm& tmp = xym6; + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + broadcasti128(min, _rip_global(t.min)); + pmaxsw(uv0, min); + pmaxsw(uv1, min); + } + else + { + pxor(tmp, tmp); + pmaxsw(uv0, tmp); + pmaxsw(uv1, tmp); + } + + broadcasti128(max, _rip_global(t.max)); + pminsw(uv0, max); + pminsw(uv1, max); + } + else + { + broadcasti128(min, _rip_global(t.min)); + pand(uv0, min); + pand(uv1, min); + + if (region) + { + broadcasti128(max, _rip_global(t.max)); + por(uv0, max); + por(uv1, max); + } + } + } + else + { + broadcasti128(min, _rip_global(t.min)); + broadcasti128(max, _rip_global(t.max)); + broadcasti128(mask, _rip_global(t.mask)); + + for (const XYm& uv : {uv0, uv1}) + { + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + THREEARG(pand, tmp, uv, min); + if (region) + por(tmp, max); + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + pmaxsw(uv, min); + pminsw(uv, max); + // clamp.blend8(repeat, m_local.gd->t.mask); + // NOTE(review): Wrap(const XYm& uv) emits blend8 at this step; confirm the bare pblendvb is intended here + pblendvb(uv, tmp /*, xym0==mask */); + } + } +} + +/// Input[x86]: xym4=q, xym2=s, xym3=t +/// Output: _rb, _ga +/// Destroys everything except xym7[x86] +void GSDrawScanlineCodeGenerator2::SampleTextureLOD() +{ + if (!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + if (is32) + { + push(t2); + + mov(t2, (size_t)m_local.gd->tex); + + if (m_sel.tlu) + { + mov(edx, ptr[&m_local.gd->clut]); + } + } + + const bool needsMoreRegs = isYmm; + + if (is64) + movdqa(xym4, _q); + + if (!m_sel.fst) + { + // u = s * rcp(q), v = t * rcp(q), then truncate both to integers + rcpps(xym0, xym4); + + MOVE_IF_64(mulps, xym2, _s, xym0); + MOVE_IF_64(mulps, xym3, _t, xym0); + + cvttps2dq(xym2, xym2); + cvttps2dq(xym3, xym3); + } + + // xym2 = u + // xym3 = v + // xym4 = q + // xym0 = xym1 = xym5 = xym6 = free + + // TODO: if the
fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) + + if (!m_sel.lcm) + { + // lod = -log2(Q) * (1 << L) + K + + // xym1 = 0x7f in every lane (all-ones >> 25); xym0 = (q << 1) >> 24, i.e. the exponent bits of q + pcmpeqd(xym1, xym1); + psrld(xym1, 25); + THREEARG(pslld, xym0, xym4, 1); + psrld(xym0, 24); + psubd(xym0, xym1); + cvtdq2ps(xym0, xym0); + + // xym0 = (float)(exp(q) - 127) + + // shift out the sign and exponent bits of q, leaving only the 23 mantissa bits + pslld(xym4, 9); + psrld(xym4, 9); + + auto log2_coeff = [this](int i) -> Address + { + if (isXmm) + return ptr[_g_const + OFFSETOF(GSScanlineConstantData, m_log2_coef_128b[i])]; + else + return ptr[_g_const + OFFSETOF(GSScanlineConstantData, m_log2_coef_256b[i])]; + }; + + orps(xym4, log2_coeff(3)); + + // xym4 = mant(q) | 1.0f + + if (hasFMA) + { + movaps(xym5, log2_coeff(0)); // c0 + vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1 + vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2 + subps(xym4, log2_coeff(3)); // xym4 - 1.0f + vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0 + } + else + { + THREEARG(mulps, xym5, xym4, log2_coeff(0)); + addps(xym5, log2_coeff(1)); + mulps(xym5, xym4); + subps(xym4, log2_coeff(3)); + addps(xym5, log2_coeff(2)); + mulps(xym4, xym5); + addps(xym4, xym0); + } + + // xym4 = log2(Q) = ((((c0 * xym4) + c1) * xym4) + c2) * (xym4 - 1.0f) + xym0 + + if (hasFMA) + { + movaps(xym5, _rip_global(l)); + vfmadd213ps(xym4, xym5, _rip_global(k)); + } + else + { + mulps(xym4, _rip_global(l)); + addps(xym4, _rip_global(k)); + } + + // xym4 = (-log2(Q) * (1 << L) + K) * 0x10000 + + xorps(xym0, xym0); + minps(xym4, _rip_global(mxl)); + maxps(xym4, xym0); + cvtps2dq(xym4, xym4); + + if (m_sel.mmin == 1) // round-off mode + { + mov(eax, 0x8000); + broadcastGPRToVec(xym0, eax); + paddd(xym4, xym0); + } + + THREEARG(psrld, xym0, xym4, 16); + + movdqa(_rip_local(temp.lod.i), xym0); + /* + vpslld(xym5, xym0, 6); + vpslld(xym6, xym4, 16); + vpsrld(xym6, xym6, 24); + return; + */ + if (m_sel.mmin == 2) // trilinear mode + { +
pshuflw(xym1, xym4, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xym1, xym1, _MM_SHUFFLE(2, 2, 0, 0)); + movdqa(_rip_local(temp.lod.f), xym1); + } + + // shift u/v/minmax by (int)lod + + if (hasAVX2) + { + vpsravd(xym2, xym2, xym0); + vpsravd(xym3, xym3, xym0); + + movdqa(_rip_local(temp.uv[0]), xym2); + movdqa(_rip_local(temp.uv[1]), xym3); + + // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] + + pxor(xym1, xym1); + + broadcasti128(xym4, _rip_global(t.min)); + vpunpcklwd(xym5, xym4, xym1); // minu + vpunpckhwd(xym6, xym4, xym1); // minv + vpsrlvd(xym5, xym5, xym0); + vpsrlvd(xym6, xym6, xym0); + packusdw(xym5, xym6); + + broadcasti128(xym4, _rip_global(t.max)); + vpunpcklwd(xym6, xym4, xym1); // maxu + vpunpckhwd(xym4, xym4, xym1); // maxv + vpsrlvd(xym6, xym6, xym0); + vpsrlvd(xym4, xym4, xym0); + packusdw(xym6, xym4); + + movdqa(_rip_local(temp.uv_minmax[0]), xym5); + movdqa(_rip_local(temp.uv_minmax[1]), xym6); + } + else + { + // no AVX2: psrad/psrlw take a single shift count, so u/v and t.minmax are shifted once per temp.lod.i.u32[n]; + // the unpacks around the shifts presumably pick out the lanes that match each count + movq(xym4, _rip_global(t.minmax)); + + THREEARG(punpckhdq, xym6, xym2, xym3); + punpckldq(xym2, xym3); + movdqa(xym5, xym2); + movdqa(xym3, xym6); + + movd(xym0, _rip_local(temp.lod.i.u32[0])); + psrad(xym2, xym0); + THREEARG(psrlw, xym1, xym4, xym0); + movq(_rip_local(temp.uv_minmax[0].u32[0]), xym1); + + movd(xym0, _rip_local(temp.lod.i.u32[1])); + psrad(xym5, xym0); + THREEARG(psrlw, xym1, xym4, xym0); + movq(_rip_local(temp.uv_minmax[1].u32[0]), xym1); + + movd(xym0, _rip_local(temp.lod.i.u32[2])); + psrad(xym3, xym0); + THREEARG(psrlw, xym1, xym4, xym0); + movq(_rip_local(temp.uv_minmax[0].u32[2]), xym1); + + movd(xym0, _rip_local(temp.lod.i.u32[3])); + psrad(xym6, xym0); + THREEARG(psrlw, xym1, xym4, xym0); + movq(_rip_local(temp.uv_minmax[1].u32[2]), xym1); + + punpckldq(xym2, xym3); + punpckhdq(xym5, xym6); + THREEARG(punpckhdq, xym3, xym2, xym5); + punpckldq(xym2, xym5); + + movdqa(_rip_local(temp.uv[0]), xym2); + movdqa(_rip_local(temp.uv[1]), xym3); + + movdqa(xym5, _rip_local(temp.uv_minmax[0])); + movdqa(xym6,
_rip_local(temp.uv_minmax[1])); + + if (hasAVX) + { + vpunpcklwd(xym0, xym5, xym6); + vpunpckhwd(xym1, xym5, xym6); + vpunpckldq(xym5, xym0, xym1); + vpunpckhdq(xym6, xym0, xym1); + } + else + { + movdqa(xym0, xym5); + punpcklwd(xym5, xym6); + punpckhwd(xym0, xym6); + movdqa(xym6, xym5); + punpckldq(xym5, xym0); + punpckhdq(xym6, xym0); + } + + movdqa(_rip_local(temp.uv_minmax[0]), xym5); + movdqa(_rip_local(temp.uv_minmax[1]), xym6); + } + } + else + { + // lod = K + // every lane is shifted by the same global lod.i.u32[0]; temp.uv_minmax is loaded as-is (presumably filled in elsewhere for this case - confirm) + + movd(Xmm(xym0.getIdx()), _rip_global(lod.i.u32[0])); + + psrad(xym2, Xmm(xym0.getIdx())); + psrad(xym3, Xmm(xym0.getIdx())); + + movdqa(_rip_local(temp.uv[0]), xym2); + movdqa(_rip_local(temp.uv[1]), xym3); + + movdqa(xym5, _rip_local(temp.uv_minmax[0])); + movdqa(xym6, _rip_local(temp.uv_minmax[1])); + } + + // xym2 = m_local.temp.uv[0] = u (level m) + // xym3 = m_local.temp.uv[1] = v (level m) + // xym5 = minuv + // xym6 = maxuv + + if (m_sel.ltf) + { + const XYm& vf = is64 ? xym7 : xym0; + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + broadcastGPRToVec(xym4, eax); + + psubd(xym2, xym4); + psubd(xym3, xym4); + + // GSVector4i uf = u.xxzzlh().srl16(1); + // NOTE(review): the code below emits psrlw by 12, not the srl16(1) of the reference line - confirm which is current + + pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xym4, 12); + if (is32 && needsMoreRegs) + movdqa(_rip_local(temp.uf), xym4); + + // GSVector4i vf = v.xxzzlh().srl16(1); + + pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(vf, 12); + if (is32 || needsMoreRegs) + movdqa(_rip_local(temp.vf), vf); + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + psrad(xym2, 16); + psrad(xym3, 16); + packssdw(xym2, xym3); + + if (m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + // xym1 = 0x0001 in every 16-bit lane (all-ones >> 15) + pcmpeqd(xym1, xym1); + psrlw(xym1, 15); + THREEARG(paddw, xym3, xym2, xym1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + WrapLOD(xym2, xym3); + } + else + { + // uv0 = Wrap(uv0); + + WrapLOD(xym2); + } + + // xym2 = uv0 + // xym3 = uv1
(ltf) + // xym4 = uf[x64||!needsMoreRegs] + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym0, xym1, xym5, xym6 + + SampleTexture_TexelReadHelper(0); + + // xym5: rb + // xym6: ga + + + if (m_sel.mmin != 1) // !round-off mode + { + // keep the first sample in temp.trb / temp.tga, then repeat the read with uv and uv_minmax shifted right by one more bit + movdqa(_rip_local(temp.trb), xym5); + movdqa(_rip_local(temp.tga), xym6); + + movdqa(xym2, _rip_local(temp.uv[0])); + movdqa(xym3, _rip_local(temp.uv[1])); + + psrad(xym2, 1); + psrad(xym3, 1); + + movdqa(xym5, _rip_local(temp.uv_minmax[0])); + movdqa(xym6, _rip_local(temp.uv_minmax[1])); + + psrlw(xym5, 1); + psrlw(xym6, 1); + + if (m_sel.ltf) + { + const XYm& vf = is64 ? xym7 : xym0; + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + broadcastGPRToVec(xym4, eax); + + psubd(xym2, xym4); + psubd(xym3, xym4); + + // GSVector4i uf = u.xxzzlh().srl16(1); + + pshuflw(xym4, xym2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xym4, xym4, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xym4, 12); + if (is32 && needsMoreRegs) + movdqa(_rip_local(temp.uf), xym4); + + // GSVector4i vf = v.xxzzlh().srl16(1); + + pshuflw(vf, xym3, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(vf, vf, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(vf, 12); + if (is32 || needsMoreRegs) + movdqa(_rip_local(temp.vf), vf); + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + psrad(xym2, 16); + psrad(xym3, 16); + packssdw(xym2, xym3); + + if (m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + // xym1 = 0x0001 in every 16-bit lane (all-ones >> 15) + pcmpeqd(xym1, xym1); + psrlw(xym1, 15); + THREEARG(paddw, xym3, xym2, xym1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + WrapLOD(xym2, xym3); + } + else + { + // uv0 = Wrap(uv0); + + WrapLOD(xym2); + } + + // xym2 = uv0 + // xym3 = uv1 (ltf) + // xym4 = uf[x64||!needsMoreRegs] + // xym7 = used[x86] vf[x64&&!needsMoreRegs] + // Free: xym0, xym1, xym5, xym6 + + SampleTexture_TexelReadHelper(1); + + // xym5: rb + // xym6: ga + + movdqa(xym0, m_sel.lcm ?
_rip_global(lod.f) : _rip_local(temp.lod.f)); + psrlw(xym0, 1); + + movdqa(xym2, _rip_local(temp.trb)); + movdqa(xym3, _rip_local(temp.tga)); + + lerp16(xym5, xym2, xym0, 0); + lerp16(xym6, xym3, xym0, 0); + } + + if (is32) + pop(t2); +} + +void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv) +{ + // Registers free from SampleTexture + // min/max are not loaded here: xym5/xym6 are used as they are on entry (SampleTextureLOD loads them from temp.uv_minmax before calling) + const XYm& mask = xym0; + const XYm& tmp = xym1; + const XYm& min = xym5; + const XYm& max = xym6; + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + pmaxsw(uv, min); + } + else + { + pxor(tmp, tmp); + pmaxsw(uv, tmp); + } + + pminsw(uv, max); + } + else + { + pand(uv, min); + + if (region) + { + por(uv, max); + } + } + } + else + { + broadcasti128(mask, _rip_global(t.mask)); + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + THREEARG(pand, tmp, uv, min); + if (region) + por(tmp, max); + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + pmaxsw(uv, min); + pminsw(uv, max); + // clamp.blend8(repeat, m_local.gd->t.mask); + blend8(uv, tmp /*, xym0==mask */); + } +} + +/// Two-coordinate form of WrapLOD(const XYm& uv): wraps uv0 and uv1 in place +/// min/max are taken from xym5/xym6 as left by the caller; only t.mask is loaded here (into xym0) and xym1 is used as scratch +void GSDrawScanlineCodeGenerator2::WrapLOD(const XYm& uv0, const XYm& uv1) +{ + // Registers free from SampleTexture + const XYm& mask = xym0; + const XYm& tmp = xym1; + const XYm& min = xym5; + const XYm& max = xym6; + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if (wms_clamp == wmt_clamp) + { + if (wms_clamp) + { + if (region) + { + pmaxsw(uv0, min); + pmaxsw(uv1, min); + } + else + { + pxor(tmp, tmp); + pmaxsw(uv0, tmp); + pmaxsw(uv1, tmp); + } + + pminsw(uv0, max); + pminsw(uv1, max); + } + else + { + pand(uv0, min); + pand(uv1, min); + + if (region) + { + por(uv0, max); + por(uv1, max); + } + } + } + else + { + broadcasti128(mask,
_rip_global(t.mask)); + + for (const XYm& uv : {uv0, uv1}) + { + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + THREEARG(pand, tmp, uv, min); + if (region) + por(tmp, max); + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + pmaxsw(uv, min); + pminsw(uv, max); + // clamp.blend8(repeat, m_local.gd->t.mask); + // NOTE(review): WrapLOD(const XYm& uv) emits blend8 at this step; confirm the bare pblendvb is intended here + pblendvb(uv, tmp /*, xym0==mask */); + } + } +} + +/// Input: _ga +/// Output: xym2[x86]=gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) +/// Destroys: xym0, xym1, xym3[x86], xym4[x86] +void GSDrawScanlineCodeGenerator2::AlphaTFX() +{ + if (!m_sel.fb) + { + return; + } + + const XYm& f_ga = is64 ? _f_ga : xym4; + const XYm& tmpga = is64 ? xym1 : f_ga; + const XYm& tmp = is64 ? xym0 : xym3; + Address _32_gaptr = m_sel.iip ? _rip_local(temp.ga) : _rip_local(c.ga); + + switch (m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + ONLY32(movdqa(f_ga, _32_gaptr)); + + // gat = gat.modulate16<1>(ga).clamp8(); + + modulate16(_ga, f_ga, 1); + + clamp16(_ga, tmp); + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if (!m_sel.tcc) + { + MOVE_IF_64(psrlw, tmpga, f_ga, 7); + + mix16(_ga, tmpga, tmp); + } + + break; + + case TFX_DECAL: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + if (!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + ONLY32(movdqa(f_ga, _32_gaptr)); + + MOVE_IF_64(psrlw, tmpga, f_ga, 7); + + mix16(_ga, tmpga, tmp); + } + + break; + + case TFX_HIGHLIGHT: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + ONLY32(movdqa(f_ga, _32_gaptr)); + ONLY32(movdqa(xym2, f_ga)); // x86: tmpga aliases f_ga, so the shift below presumably modifies it in place; keep an unshifted gaf in xym2 (the output named in the header) + + // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); + + MOVE_IF_64(psrlw, tmpga, f_ga, 7); + + if (m_sel.tcc) + { + paddusb(tmpga, _ga); + } + + mix16(_ga, tmpga, tmp); + + break; + + case TFX_HIGHLIGHT2: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if (!m_sel.tcc) + { + // GSVector4i ga = iip ?
gaf : m_local.c.ga; + + ONLY32(movdqa(f_ga, _32_gaptr)); + ONLY32(movdqa(xym2, f_ga)); + + MOVE_IF_64(psrlw, tmpga, f_ga, 7); + + mix16(_ga, tmpga, tmp); + } + + break; + + case TFX_NONE: + + // gat = iip ? ga.srl16(7) : ga; + + if (m_sel.iip) + { + MOVE_IF_64(psrlw, _ga, f_ga, 7); + } + + break; + } + + if (m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if (!m_sel.abe) + { + // a = cov + + if (m_sel.edge) + { + movdqa(xym0, _rip_local(temp.cov)); + } + else + { + // xym0 = 0x0080 in every 16-bit lane (all-ones << 15 >> 8) + pcmpeqd(xym0, xym0); + psllw(xym0, 15); + psrlw(xym0, 8); + } + + mix16(_ga, xym0, xym1); + } + else + { + // a = a == 0x80 ? cov : a + + // xym0 = 0x0080 in every 16-bit lane (all-ones << 15 >> 8) + pcmpeqd(xym0, xym0); + psllw(xym0, 15); + psrlw(xym0, 8); + + if (m_sel.edge) + { + movdqa(xym1, _rip_local(temp.cov)); + } + else + { + movdqa(xym1, xym0); + } + + // compare per 16-bit word, then keep the result only in the upper word of each dword + pcmpeqw(xym0, _ga); + psrld(xym0, 16); + pslld(xym0, 16); + + blend8(_ga, xym1 /*, xym0 */); + } + } +} + +/// Output: _fm, _zm +void GSDrawScanlineCodeGenerator2::ReadMask() +{ + if (m_sel.fwrite) + { + pbroadcastdLocal(_fm, _rip_global(fm)); + } + + if (m_sel.zwrite) + { + pbroadcastdLocal(_zm, _rip_global(zm)); + } +} + +/// Input: _ga, _fm, _zm +/// Destroys: xym0, xym1 +void GSDrawScanlineCodeGenerator2::TestAlpha() +{ + switch (m_sel.atst) + { + case ATST_NEVER: + // t = GSVector4i::xffffffff(); + pcmpeqd(xym1, xym1); + break; + + case ATST_ALWAYS: + return; + + case ATST_LESS: + case ATST_LEQUAL: + // t = (ga >> 16) > m_local.gd->aref; + THREEARG(psrld, xym1, _ga, 16); + BROADCAST_AND_OP(vbroadcasti128, pcmpgtd, xym1, xym0, _rip_global(aref)); + break; + + case ATST_EQUAL: + // t = (ga >> 16) != m_local.gd->aref; + THREEARG(psrld, xym1, _ga, 16); + BROADCAST_AND_OP(vbroadcasti128, pcmpeqd, xym1, xym0, _rip_global(aref)); + pcmpeqd(xym0, xym0); + pxor(xym1, xym0); + break; + + case ATST_GEQUAL: + case ATST_GREATER: + // t = (ga >> 16) < m_local.gd->aref; + THREEARG(psrld, xym0, _ga, 16); + broadcasti128(xym1,
_rip_global(aref)); + pcmpgtd(xym1, xym0); + break; + + case ATST_NOTEQUAL: + // t = (ga >> 16) == m_local.gd->aref; + THREEARG(psrld, xym1, _ga, 16); + BROADCAST_AND_OP(vbroadcasti128, pcmpeqd, xym1, xym0, _rip_global(aref)); + break; + } + + // xym1 = t, the mask produced by the switch above (ATST_ALWAYS has already returned); fold it in according to afail + switch (m_sel.afail) + { + case AFAIL_KEEP: + // test |= t; + por(_test, xym1); + alltrue(_test); + break; + + case AFAIL_FB_ONLY: + // zm |= t; + por(_zm, xym1); + break; + + case AFAIL_ZB_ONLY: + // fm |= t; + por(_fm, xym1); + break; + + case AFAIL_RGB_ONLY: + // zm |= t; + por(_zm, xym1); + // fm |= t & GSVector4i::xff000000(); + psrld(xym1, 24); + pslld(xym1, 24); + por(_fm, xym1); + break; + } +} + +/// Input: xym2[x86]=gaf, _rb, _ga +/// Destroys: xym0, xym1, xym2 +void GSDrawScanlineCodeGenerator2::ColorTFX() +{ + if (!m_sel.fwrite) + { + return; + } + + const XYm& f_ga = is64 ? _f_ga : xym2; + const XYm& tmpga = is64 ? xym2 : f_ga; + + auto modulate16_1_rb = [&] + { + // GSVector4i rb = iip ? rbf : m_local.c.rb; + if (is64) + modulate16(_rb, _f_rb, 1); + else + modulate16(_rb, m_sel.iip ? _rip_local(temp.rb) : _rip_local(c.rb), 1); + }; + + switch (m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).clamp8(); + + modulate16_1_rb(); + + clamp16(_rb, xym0); + + break; + + case TFX_DECAL: + + break; + + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + + if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) + { + // this is the one highlight case in which AlphaTFX's header does not list xym2=gaf as an output, so x86 reloads it + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + ONLY32(movdqa(f_ga, m_sel.iip ?
_rip_local(temp.ga) : _rip_local(c.ga))); + } + + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + movdqa(xym1, _ga); + + modulate16(_ga, f_ga, 1); + + // tmpga = af of the reference line: words 1 and 3 of each 64-bit half of f_ga duplicated over their pair, then >> 7 + pshuflw(tmpga, f_ga, _MM_SHUFFLE(3, 3, 1, 1)); + pshufhw(tmpga, tmpga, _MM_SHUFFLE(3, 3, 1, 1)); + psrlw(tmpga, 7); + + paddw(_ga, tmpga); + + clamp16(_ga, xym0); + + mix16(_ga, xym1, xym0); + + // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); + + modulate16_1_rb(); + + paddw(_rb, tmpga); + + clamp16(_rb, xym0); + + break; + + case TFX_NONE: + + // rbt = iip ? rb.srl16(7) : rb; + + if (m_sel.iip) + { + MOVE_IF_64(psrlw, _rb, _f_rb, 7); + } + + break; + } +} + +/// Input: _rb, _ga +/// Destroys: xym0, xym1, xym2[x86] +void GSDrawScanlineCodeGenerator2::Fog() +{ + if (!m_sel.fwrite || !m_sel.fge) + { + return; + } + + const XYm& f = is64 ? _f : xym0; + const XYm& tmp = is64 ? xym0 : xym2; + + // rb = m_local.gd->frb.lerp16<0>(rb, f); + // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); + + if (m_sel.prim != GS_SPRITE_CLASS) + { + ONLY32(movdqa(f, _rip_local(temp.f))); + } + else + { + ONLY32(pbroadcastwLocal(f, _rip_local(p.f))); + } + + // xym1 keeps the pre-fog ga for the final mix16 + movdqa(xym1, _ga); + + pbroadcastdLocal(tmp, _rip_global(frb)); + lerp16(_rb, tmp, f, 0); + + pbroadcastdLocal(tmp, _rip_global(fga)); + lerp16(_ga, tmp, f, 0); + + mix16(_ga, xym1, xym0); +} + +/// Outputs: _fd, rbx=fa +/// rbx=fa is computed whenever m_sel.fb is set; _fd is only loaded when m_sel.rfb is also set +void GSDrawScanlineCodeGenerator2::ReadFrame() +{ + if (!m_sel.fb) + { + return; + } + + // fa = (dword[t1] + dword[t0]) & (HALF_VM_SIZE - 1) + mov(ebx, dword[t1]); + add(ebx, dword[t0]); + and(ebx, HALF_VM_SIZE - 1); + + if (!m_sel.rfb) + { + return; + } + + ReadPixel(_fd, xym0, rbx); +} + +/// Input: _fd, _test +/// Destroys: xym0, xym1 +void GSDrawScanlineCodeGenerator2::TestDestAlpha() +{ + if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) + { + return; + } + + // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); + + if (m_sel.datm) + { + if (m_sel.fpsm == 2) + { + pxor(xym0, xym0); + //vpsrld(xym1, _fd, 15); + THREEARG(pslld, xym1, _fd, 16); + psrad(xym1, 31); + pcmpeqd(xym1, xym0); + } +
else + { + pcmpeqd(xym0, xym0); + THREEARG(pxor, xym1, _fd, xym0); + psrad(xym1, 31); + } + } + else + { + if (m_sel.fpsm == 2) + { + THREEARG(pslld, xym1, _fd, 16); + psrad(xym1, 31); + } + else + { + THREEARG(psrad, xym1, _fd, 31); + } + } + + por(_test, xym1); + + alltrue(_test); +} + +/// Input: _fm, _zm, _test +/// Output: edx=fzm +/// Destroys: xym0, xym1 +/// Emits nothing (edx is left untouched) when m_sel.notest is set +void GSDrawScanlineCodeGenerator2::WriteMask() +{ + if (m_sel.notest) + { + return; + } + + // fm |= test; + // zm |= test; + + if (m_sel.fwrite) + { + por(_fm, _test); + } + + if (m_sel.zwrite) + { + por(_zm, _test); + } + + // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + + pcmpeqd(xym1, xym1); + + if (m_sel.fwrite && m_sel.zwrite) + { + // packssdw puts the fm compare in the low half and the zm compare in the high half of each 128-bit lane + THREEARG(pcmpeqd, xym0, xym1, _zm); + pcmpeqd(xym1, _fm); + packssdw(xym1, xym0); + } + else if (m_sel.fwrite) + { + pcmpeqd(xym1, _fm); + packssdw(xym1, xym1); + } + else if (m_sel.zwrite) + { + pcmpeqd(xym1, _zm); + packssdw(xym1, xym1); + } + + pmovmskb(edx, xym1); + + not(edx); +} + +/// Inputs: t2=za, edx=fzm, _zm +/// Destroys: xym0, xym1, xym7 +void GSDrawScanlineCodeGenerator2::WriteZBuf() +{ + if (!m_sel.zwrite) + { + return; + } + + if (m_sel.prim != GS_SPRITE_CLASS) + movdqa(xym1, _rip_local(temp.zs)); + else + pbroadcastdLocal(xym1, _rip_local(p.z)); + + if (m_sel.ztest && m_sel.zpsm < 2) + { + // zs = zs.blend8(zd, zm); + + if (hasAVX) + { + vpblendvb(xym1, xym1, _rip_local(temp.zd), _zm); + } + else + { + movdqa(xym0, _zm); + movdqa(xym7, _rip_local(temp.zd)); + blend8(xym1, xym7 /*, xym0 */); + } + } + + // Clamp Z to ZPSM_FMT_MAX + // xym7 = 0xffffffff >> ((zpsm & 3) * 8) in every lane + // NOTE(review): pminsd is a signed min, and the limit is all-ones (-1 as signed) when (zpsm & 3) == 0 - confirm zclamp is never set in that case, or that zs cannot have bit 31 set + if (m_sel.zclamp) + { + const uint8 amt = (uint8)((m_sel.zpsm & 0x3) * 8); + pcmpeqd(xym7, xym7); + psrld(xym7, amt); + pminsd(xym1, xym7); + } + + bool fast = m_sel.ztest ?
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + +#if USING_XMM + WritePixel(xym1, t2, dh, fast, m_sel.zpsm, 1); +#else + WritePixel(xym1, t2, edx, fast, m_sel.zpsm, 1); +#endif +} + +/// Input: _fd, _rb, _ga +/// Destroys: xym0, xym1, xym4, xym7[x86], xym15[x64] +void GSDrawScanlineCodeGenerator2::AlphaBlend() +{ + if (!m_sel.fwrite) + { + return; + } + + if (m_sel.abe == 0 && m_sel.aa1 == 0) + { + return; + } + + const XYm& _dst_rb = xym0; + const XYm& _dst_ga = xym1; + const XYm& tmp1 = _test; + const XYm& tmp2 = xym4; + // tmp1 aliases _test: whatever _test held is overwritten as soon as tmp1 is written below + + // the destination colour is only unpacked from _fd when one of the blend selectors (aba/abb/abc with aba != abb, or abd) equals 1 + if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) + { + switch (m_sel.fpsm) + { + case 0: + case 1: + + // c[2] = fd & mask; + // c[3] = (fd >> 8) & mask; + + split16_2x8(_dst_rb, _dst_ga, _fd); + + break; + + case 2: + + // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + + pcmpeqd(tmp1, tmp1); + + psrld(tmp1, 27); // 0x0000001f + THREEARG(pand, _dst_rb, _fd, tmp1); + pslld(_dst_rb, 3); + + pslld(tmp1, 10); // 0x00007c00 + THREEARG(pand, tmp2, _fd, tmp1); + pslld(tmp2, 9); + + por(_dst_rb, tmp2); + + psrld(tmp1, 5); // 0x000003e0 + THREEARG(pand, _dst_ga, _fd, tmp1); + psrld(_dst_ga, 2); + + psllw(tmp1, 10); // 0x00008000 + THREEARG(pand, tmp2, _fd, tmp1); + pslld(tmp2, 8); + + por(_dst_ga, tmp2); + + break; + } + } + + // rb, ga = src rb, ga + // xym0, xym1 = dst rb, ga + // tmp1, tmp2 = free + + if (m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) + { + movdqa(tmp2, _rb); + } + + if (m_sel.aba != m_sel.abb) + { + // rb = c[aba * 2 + 0]; + + switch (m_sel.aba) + { + case 0: + break; + case 1: + movdqa(_rb, _dst_rb); + break; + case 2: + pxor(_rb, _rb); + break; + } + + // rb = rb.sub16(c[abb * 2 + 0]); + + switch (m_sel.abb) + { + case 0: + psubw(_rb, tmp2); + break; + case 1: + psubw(_rb, _dst_rb); + break; + case 2: + break; + } + + if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // GSVector4i
a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; + + switch (m_sel.abc) + { + case 0: + case 1: + pshuflw(tmp1, m_sel.abc ? _dst_ga : _ga, _MM_SHUFFLE(3, 3, 1, 1)); + pshufhw(tmp1, tmp1, _MM_SHUFFLE(3, 3, 1, 1)); + psllw(tmp1, 7); + break; + case 2: + pbroadcastwLocal(tmp1, _rip_global(afix)); + break; + } + + // rb = rb.modulate16<1>(a); + + modulate16(_rb, tmp1, 1); + } + + // rb = rb.add16(c[abd * 2 + 0]); + + switch (m_sel.abd) + { + case 0: + paddw(_rb, tmp2); + break; + case 1: + paddw(_rb, _dst_rb); + break; + case 2: + break; + } + } + else + { + // rb = c[abd * 2 + 0]; + + switch (m_sel.abd) + { + case 0: + break; + case 1: + movdqa(_rb, _dst_rb); + break; + case 2: + pxor(_rb, _rb); + break; + } + } + + if (m_sel.pabe) + { + // mask = (c[1] << 8).sra32(31); + // the mask is built in xym0, which held the destination rb (_dst_rb) up to this point + + THREEARG(pslld, xym0, _ga, 8); + psrad(xym0, 31); + + // rb = c[0].blend8(rb, mask); + + blend8r(_rb, tmp2 /*, xym0 */); + } + + // xym0 = pabe mask (>=sse41) + // ga = src ga + // xym1 = dst ga + // rb = rb + // tmp1 = a + // tmp2 = free + + movdqa(tmp2, _ga); + + if (m_sel.aba != m_sel.abb) + { + // ga = c[aba * 2 + 1]; + + switch (m_sel.aba) + { + case 0: + break; + case 1: + movdqa(_ga, _dst_ga); + break; + case 2: + pxor(_ga, _ga); + break; + } + + // ga = ga.sub16(c[abb * 2 + 1]); + + switch (m_sel.abb) + { + case 0: + psubw(_ga, tmp2); + break; + case 1: + psubw(_ga, _dst_ga); + break; + case 2: + break; + } + + if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // ga = ga.modulate16<1>(a); + + modulate16(_ga, tmp1, 1); + } + + // ga = ga.add16(c[abd * 2 + 1]); + + switch (m_sel.abd) + { + case 0: + paddw(_ga, tmp2); + break; + case 1: + paddw(_ga, _dst_ga); + break; + case 2: + break; + } + } + else + { + // ga = c[abd * 2 + 1]; + + switch (m_sel.abd) + { + case 0: + break; + case 1: + movdqa(_ga, _dst_ga); + break; + case 2: + pxor(_ga, _ga); + break; + } + } + + // xym0 = pabe mask (>=sse41) + // tmp2 = src ga + // rb = rb + // ga = ga + // xym1, tmp1 = free + + if
(m_sel.pabe) + { + psrld(xym0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) + + // ga = c[1].blend8(ga, mask).mix16(c[1]); + + blend8r(_ga, tmp2 /*, xym0 */); + } + else + { + if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx + { + mix16(_ga, tmp2, tmp1); + } + } +} + +/// Input: rbx=fa, rdx=fzm, _fd, _fm +/// Destroys: rax, xym0, xym1, xym5, xym6, xym7[x86], xym15[x64] +/// NOTE(review): rb/ga are read from xym5/xym6 directly rather than through _rb/_ga - presumably those aliases name the same registers; confirm +void GSDrawScanlineCodeGenerator2::WriteFrame() +{ + if (!m_sel.fwrite) + { + return; + } + + + const XYm& tmp = is64 ? xym15 : xym7; + + if (m_sel.fpsm == 2 && m_sel.dthe) + { + // y = (top & 3) << 5 + + mov(eax, ptr[rsp + _top]); + and(eax, 3); + shl(eax, 5); + + // rb = rb.add16(m_global.dimx[0 + y]); + // ga = ga.add16(m_global.dimx[1 + y]); + + add(rax, _rip_global(dimx)); + + BROADCAST_AND_OP(vbroadcasti128, paddw, xym5, tmp, ptr[rax + sizeof(GSVector4i) * 0]); + BROADCAST_AND_OP(vbroadcasti128, paddw, xym6, tmp, ptr[rax + sizeof(GSVector4i) * 1]); + } + + if (m_sel.colclamp == 0) + { + // c[0] &= 0x00ff00ff; + // c[1] &= 0x00ff00ff; + + pcmpeqd(tmp, tmp); + psrlw(tmp, 8); + pand(xym5, tmp); + pand(xym6, tmp); + } + + // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); + + THREEARG(punpckhwd, tmp, xym5, xym6); + punpcklwd(xym5, xym6); + packuswb(xym5, tmp); + + if (m_sel.fba && m_sel.fpsm != 1) + { + // fs |= 0x80000000; + + pcmpeqd(tmp, tmp); + pslld(tmp, 31); + por(xym5, tmp); + } + + // xym5 = fs + // _fm = fm + // _fd = fd + + if (m_sel.fpsm == 2) + { + // GSVector4i rb = fs & 0x00f800f8; + // GSVector4i ga = fs & 0x8000f800; + + mov(eax, 0x00f800f8); + broadcastGPRToVec(xym0, eax); + + mov(eax, 0x8000f800); + broadcastGPRToVec(xym1, eax); + + pand(xym0, xym5); + pand(xym1, xym5); + + // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + + THREEARG(psrld, xym5, xym0, 9); + psrld(xym0, 3); + THREEARG(psrld, xym6, xym1, 16); + psrld(xym1, 6); + + por(xym0, xym1); + por(xym5, xym6); + por(xym5, xym0); + } + + if (m_sel.rfb) + { + // fs =
fs.blend(fd, fm); + + blend(xym5, _fd, _fm); // TODO: could be skipped in certain cases, depending on fpsm and fm + } + + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; + +#if USING_XMM + WritePixel(xym5, rbx, dl, fast, m_sel.fpsm, 0); +#else + WritePixel(xym5, rbx, edx, fast, m_sel.fpsm, 0); +#endif +} + +/// Loads dst from vm + addr * 2: one 8-byte half at +0 and one at +16 bytes +/// The YMM build also loads the halves at +32 and +48 bytes into tmp and inserts them as the upper 128 bits of dst +/// Destroys: tmp[isYmm] +void GSDrawScanlineCodeGenerator2::ReadPixel(const XYm& dst, const XYm& tmp, const AddressReg& addr) +{ + RegExp base = _m_local__gd__vm + addr * 2; +#if USING_XMM + movq(dst, qword[base]); + movhps(dst, qword[base + 8 * 2]); +#else + Xmm dstXmm = Xmm(dst.getIdx()); + Xmm tmpXmm = Xmm(tmp.getIdx()); + movq(dstXmm, qword[base]); + movhps(dstXmm, qword[base + 8 * 2]); + movq(tmpXmm, qword[base + 16 * 2]); + movhps(tmpXmm, qword[base + 24 * 2]); + vinserti128(dst, dst, tmpXmm, 1); +#endif +} + +/// Stores src_ to vm + addr * 2; unless m_sel.notest is set, every store is guarded by a test of the matching bits of mask +/// fast uses 8-byte movq/movhps stores, otherwise each 32-bit element goes through the per-pixel WritePixel overload +/// XMM builds take an 8-bit mask register and use shift 0; YMM builds take a 32-bit mask and shift the tested bits by fz * 8 +/// On YMM builds the upper 128 bits of src_ are extracted over the xmm register with the same index, so src_ is destroyed +#if USING_XMM +void GSDrawScanlineCodeGenerator2::WritePixel(const XYm& src_, const AddressReg& addr, const Reg8& mask, bool fast, int psm, int fz) +#else +void GSDrawScanlineCodeGenerator2::WritePixel(const XYm& src_, const AddressReg& addr, const Reg32& mask, bool fast, int psm, int fz) +#endif +{ +#if USING_XMM + const Xmm& src = src_; + int shift = 0; +#else + Xmm src = Xmm(src_.getIdx()); + int shift = fz * 8; +#endif + RegExp base = _m_local__gd__vm + addr * 2; + + if (m_sel.notest) + { + if (fast) + { + movq(qword[base], src); + movhps(qword[base + 8 * 2], src); +#if USING_YMM + vextracti128(src, src_, 1); + movq(qword[base + 16 * 2], src); + movhps(qword[base + 24 * 2], src); +#endif + } + else + { + WritePixel(src, addr, 0, 0, psm); + WritePixel(src, addr, 1, 1, psm); + WritePixel(src, addr, 2, 2, psm); + WritePixel(src, addr, 3, 3, psm); +#if USING_YMM + vextracti128(src, src_, 1); + WritePixel(src, addr, 4, 0, psm); + WritePixel(src, addr, 5, 1, psm); + WritePixel(src, addr, 6, 2, psm); + WritePixel(src, addr, 7, 3, psm); +#endif + } + } + else + { + if (fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr
+ 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); + + test(mask, 0x0000000f << shift); + je("@f"); + movq(qword[base], src); + L("@@"); + + test(mask, 0x000000f0 << shift); + je("@f"); + movhps(qword[base + 8 * 2], src); + L("@@"); + +#if USING_YMM + vextracti128(src, src_, 1); + + test(mask, 0x000f0000 << shift); + je("@f"); + movq(qword[base + 16 * 2], src); + L("@@"); + + test(mask, 0x00f00000 << shift); + je("@f"); + movhps(qword[base + 24 * 2], src); + L("@@"); +#endif + // vmaskmovps? + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + + test(mask, 0x00000003 << shift); + je("@f"); + WritePixel(src, addr, 0, 0, psm); + L("@@"); + + test(mask, 0x0000000c << shift); + je("@f"); + WritePixel(src, addr, 1, 1, psm); + L("@@"); + + test(mask, 0x00000030 << shift); + je("@f"); + WritePixel(src, addr, 2, 2, psm); + L("@@"); + + test(mask, 0x000000c0 << shift); + je("@f"); + WritePixel(src, addr, 3, 3, psm); + L("@@"); + +#if USING_YMM + vextracti128(src, src_, 1); + + test(mask, 0x00030000 << shift); + je("@f"); + WritePixel(src, addr, 4, 0, psm); + L("@@"); + + test(mask, 0x000c0000 << shift); + je("@f"); + WritePixel(src, addr, 5, 1, psm); + L("@@"); + + test(mask, 0x00300000 << shift); + je("@f"); + WritePixel(src, addr, 6, 2, psm); + L("@@"); + + test(mask, 0x00c00000 << shift); + je("@f"); + WritePixel(src, addr, 7, 3, psm); + L("@@"); +#endif + } + } +} + +/// Writes 32-bit element j of src to vm + addr * 2 + s_offsets[i] * 2 +/// psm 0 stores all 32 bits, psm 1 replaces only the low 24 bits of the destination dword (xor / and 0xffffff / xor through eax), psm 2 stores the low 16 bits +/// Destroys: eax (psm 1 and 2) +void GSDrawScanlineCodeGenerator2::WritePixel(const Xmm& src, const AddressReg& addr, uint8 i, uint8 j, int psm) +{ + constexpr int s_offsets[8] = {0, 2, 8, 10, 16, 18, 24, 26}; + + Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; + + switch (psm) + { + case 0: + if (j == 0) + movd(dst, src); + else + pextrd(dst,
src, j); + break; + case 1: + if (j == 0) + movd(eax, src); + else + pextrd(eax, src, j); + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + if (j == 0) + movd(eax, src); + else + pextrw(eax, src, j * 2); + mov(dst, ax); + break; + } +} + +/// Input: +/// rbx = m_local.tex[0] (x86 && !use_lod) +/// t2 = m_local.tex (x86 && use_lod) +/// rdx = m_local.clut (x86 && m_sel.tlu) +/// Destroys: rax, src, tmp1, tmp2 +/// Destroys rbx (!use_lod) +void GSDrawScanlineCodeGenerator2::ReadTexel1(const XYm& dst, const XYm& src, const XYm& tmp1, const XYm& tmp2, int mip_offset) +{ + // Forwards to ReadTexelImpl with the operands this variant does not need set to the invalid register `no`; the literal 1 is presumably the texel count + const XYm no(-1); // Hopefully this will assert if we accidentally use it + ReadTexelImpl(dst, tmp1, src, no, no, no, tmp2, no, 1, mip_offset); +} + +/// Will process addr## to c## from s registers to d registers +/// Destroys contents of s registers +/// Destroys tmp1 if . + */ + +#pragma once + +#include "GSScanlineEnvironment.h" +#include "GSNewCodeGenerator.h" + +#undef _t // Conflict with wx, hopefully no one needs this + +#if _M_SSE >= 0x501 + #define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Ymm + #define DRAW_SCANLINE_USING_XMM 0 + #define DRAW_SCANLINE_USING_YMM 1 +#else + #define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Xmm + #define DRAW_SCANLINE_USING_XMM 1 + #define DRAW_SCANLINE_USING_YMM 0 +#endif + +class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator +{ + using _parent = GSNewCodeGenerator; + using XYm = DRAW_SCANLINE_VECTOR_REGISTER; + + /// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach + /// On x86-32 the same values are just raw 32-bit addresses + // NOTE(review): the template argument lists of Choose3264 and std::is_same below appear to be missing from this copy (presumably std::is_same<XYm, Xbyak::Xmm> and std::is_same<XYm, Xbyak::Ymm>) - confirm against the original header + using LocalAddr = Choose3264::type; + + constexpr static bool isXmm = std::is_same::value; + constexpr static bool isYmm = std::is_same::value; + constexpr static int wordsize = is64 ? 8 : 4; + constexpr static int vecsize = isXmm ? 16 : 32; + constexpr static int vecsizelog = isXmm ?
/// Code generator built on top of GSNewCodeGenerator.
/// The public interface is just the constructor and Generate(); everything else (register aliases,
/// stack-slot offsets and the per-stage emit helpers) is private.
/// The vector register type used throughout (XYm) is fixed at compile time through
/// DRAW_SCANLINE_VECTOR_REGISTER: Xbyak::Ymm when _M_SSE >= 0x501, Xbyak::Xmm otherwise.
class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
{
	using _parent = GSNewCodeGenerator;
	using XYm = DRAW_SCANLINE_VECTOR_REGISTER;

	/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
	/// On x86-32 the same values are just raw 32-bit addresses
	// NOTE(review): the template argument list of Choose3264 appears to be missing in this copy
	// (chooseLocal below suggests a 32-bit size_t / 64-bit AddressReg pair) — confirm against the original header
	using LocalAddr = Choose3264::type;

	// NOTE(review): std::is_same takes two type arguments; they appear to have been lost in this copy
	// (presumably XYm compared against Xbyak::Xmm and Xbyak::Ymm respectively) — confirm against the original header
	constexpr static bool isXmm = std::is_same::value;
	constexpr static bool isYmm = std::is_same::value;
	/// Size of a general-purpose register / pointer in bytes
	constexpr static int wordsize = is64 ? 8 : 4;
	/// Size of one XYm register in bytes
	constexpr static int vecsize = isXmm ? 16 : 32;
	/// log2(vecsize)
	constexpr static int vecsizelog = isXmm ? 4 : 5;
	/// Number of 32-bit elements in one XYm register
	constexpr static int vecints = vecsize / 4;


// MARK: - Constants

	constexpr static int _32_args = 16;
	/// Placeholder for offsets that have no meaning on the current target (see _v)
	constexpr static int _invalid = 0xaaaaaaaa;
#ifdef _WIN32
	constexpr static int _64_top = 8 * 0;
	// XMM registers will be saved to `rsp + _64_win_xmm_start + id - 6`
	// Which will put xmm6 after the temporaries, then xmm7, etc
	constexpr static int _64_win_xmm_start = 8 * 2;
	// Windows has no redzone and also has 10 xmm registers to save
	constexpr static int _64_win_stack_size = _64_win_xmm_start + 16 * 10;
#else
	// System-V has a redzone so stick everything there
	constexpr static int _64_rz_rbx = -8 * 1;
	constexpr static int _64_rz_r12 = -8 * 2;
	constexpr static int _64_rz_r13 = -8 * 3;
	constexpr static int _64_rz_r14 = -8 * 4;
	constexpr static int _64_rz_r15 = -8 * 5;
	constexpr static int _64_top = -8 * 6;
#endif
	// Target-independent names for the offsets above: the 64-bit value on x86-64, an offset past _32_args on x86-32.
	// _v has no 64-bit counterpart, so it resolves to _invalid there.
	constexpr static int _top = is64 ? _64_top : _32_args + 4;
	constexpr static int _v = is64 ? _invalid : _32_args + 8;

	GSScanlineSelector m_sel;
	GSScanlineLocalData& m_local;
	bool m_rip;
	bool use_lod;

	/// Vector registers 0-15 typed as XYm (Xmm or Ymm depending on the build)
	const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
	/// Note: a2 and t3 are only available on x86-64
	/// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+
	const AddressReg a0, a1, a2, a3, t0, t1, t2, t3;
	const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm;
	/// Available on both x86 and x64, not always valid
	const XYm _rb, _ga, _fm, _zm, _fd, _test;
	/// Always valid if needed, x64 only
	const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;

	/// Returns the first arg on 32-bit, second on 64-bit
	static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
	{
		return choose3264((size_t)addr32, reg64);
	}

public:
	/// @param base  Xbyak code generator this object is constructed on top of
	/// @param cpu   CPU information for the target
	/// @param param Untyped pointer; the caller visible in this patch passes the address of its m_local
	/// @param key   64-bit key; the caller visible in this patch passes m_sel.key
	GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
	/// Entry point invoked right after construction by the caller visible in this patch
	void Generate();

private:
	/// Loads the given address into the given register if needed, and returns something that can be used in a `ptr[]`
	LocalAddr loadAddress(AddressReg reg, const void* addr);
	/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
	void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be
	void broadcasti128(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast a floating-point variable stored in GSScanlineLocalData to the whole register
	/// On YMM registers this will be a broadcast from a 32-bit value
	/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
	void broadcastssLocal(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast a qword variable stored in GSScanlineLocalData to the whole register
	/// On YMM registers this will be a broadcast from a 64-bit value
	/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
	void pbroadcastqLocal(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast a dword variable stored in GSScanlineLocalData to the whole register
	/// On YMM registers this will be a broadcast from a 32-bit value
	/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
	void pbroadcastdLocal(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast a word variable stored in GSScanlineLocalData to the whole register
	/// On YMM registers this will be a broadcast from a 16-bit value
	/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
	void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
	/// Broadcast a 32-bit GPR to a vector register
	void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
	void modulate16(const XYm& a, const Xbyak::Operand& f, uint8 shift);
	void lerp16(const XYm& a, const XYm& b, const XYm& f, uint8 shift);
	void lerp16_4(const XYm& a, const XYm& b, const XYm& f);
	void mix16(const XYm& a, const XYm& b, const XYm& temp);
	void clamp16(const XYm& a, const XYm& temp);
	void alltrue(const XYm& test);
	void blend(const XYm& a, const XYm& b, const XYm& mask);
	void blendr(const XYm& b, const XYm& a, const XYm& mask);
	void blend8(const XYm& a, const XYm& b);
	void blend8r(const XYm& b, const XYm& a);
	void split16_2x8(const XYm& l, const XYm& h, const XYm& src);

	// Emit helpers, one per named stage
	void Init();
	void Step();
	void TestZ(const XYm& temp1, const XYm& temp2);
	void SampleTexture();
	void SampleTexture_TexelReadHelper(int mip_offset);
	void Wrap(const XYm& uv);
	void Wrap(const XYm& uv0, const XYm& uv1);
	void SampleTextureLOD();
	void WrapLOD(const XYm& uv);
	void WrapLOD(const XYm& uv0, const XYm& uv1);
	void AlphaTFX();
	void ReadMask();
	void TestAlpha();
	void ColorTFX();
	void Fog();
	void ReadFrame();
	void TestDestAlpha();
	void WriteMask();
	void WriteZBuf();
	void AlphaBlend();
	void WriteFrame();
	void ReadPixel(const XYm& dst, const XYm& tmp, const AddressReg& addr);
	// The write mask is passed in an 8-bit register on XMM builds and a 32-bit register on YMM builds
#if DRAW_SCANLINE_USING_XMM
	void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg8& mask, bool fast, int psm, int fz);
#else
	void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
#endif
	/// Single-pixel store: writes lane `j` of `src` using entry `i` of the pixel offset table
	void WritePixel(const Xmm& src, const AddressReg& addr, uint8 i, uint8 j, int psm);
	/// Forwards to the ten-argument ReadTexelImpl with pixels = 1
	void ReadTexel1(const XYm& dst, const XYm& src, const XYm& tmp1, const XYm& tmp2, int mip_offset);
	void ReadTexel4(
		const XYm& d0, const XYm& d1,
		const XYm& d2s0, const XYm& d3s1,
		const XYm& s2, const XYm& s3,
		const XYm& tmp1, const XYm& tmp2,
		int mip_offset);
	void ReadTexelImpl(
		const XYm& d0, const XYm& d1,
		const XYm& d2s0, const XYm& d3s1,
		const XYm& s2, const XYm& s3,
		const XYm& tmp1, const XYm& tmp2,
		int pixels, int mip_offset);
	void ReadTexelImplLoadTexLOD(int lod, int mip_offset);
	void ReadTexelImplYmm(
		const Ymm& d0, const Ymm& d1,
		const Ymm& d2s0, const Ymm& d3s1,
		const Ymm& s2, const Ymm& s3,
		const Ymm& tmp,
		int pixels, int mip_offset);
	void ReadTexelImplSSE4(
		const Xmm& d0, const Xmm& d1,
		const Xmm& d2s0, const Xmm& d3s1,
		const Xmm& s2, const Xmm& s3,
		int pixels, int mip_offset);
	void ReadTexelImpl(const Xmm& dst, const Xmm& addr, uint8 i, bool texInA3, bool preserveDst);
};
System\Ps2\GS\Renderers\Software