From 9d767838d6566464c7b147877f9fb141bb326cfe Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Sat, 28 Aug 2021 00:41:26 -0500 Subject: [PATCH] GS: Remove old DrawScanline code generators --- pcsx2/CMakeLists.txt | 7 - pcsx2/GS/GS_codegen.h | 38 - .../SW/GSDrawScanlineCodeGenerator.cpp | 225 -- .../SW/GSDrawScanlineCodeGenerator.h | 105 - .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 2129 ----------- .../GSDrawScanlineCodeGenerator.x64.avx2.cpp | 3103 ----------------- .../SW/GSDrawScanlineCodeGenerator.x64.cpp | 118 - .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 2936 ---------------- .../GSDrawScanlineCodeGenerator.x86.avx2.cpp | 3020 ---------------- .../SW/GSDrawScanlineCodeGenerator.x86.cpp | 2953 ---------------- pcsx2/pcsx2.vcxproj | 7 - pcsx2/pcsx2.vcxproj.filters | 21 - 12 files changed, 14662 deletions(-) delete mode 100644 pcsx2/GS/GS_codegen.h delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp delete mode 100644 pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index 6a5b1f9ff9..351d56c136 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -640,12 +640,6 @@ set(pcsx2GSSources GS/Renderers/SW/GSDrawScanline.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp - GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp - 
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp GS/Renderers/SW/GSNewCodeGenerator.cpp GS/Renderers/SW/GSRasterizer.cpp GS/Renderers/SW/GSRendererSW.cpp @@ -676,7 +670,6 @@ set(pcsx2GSHeaders GS/GSDrawingEnvironment.h GS/GSDump.h GS/GS_types.h - GS/GS_codegen.h GS/GS.h GS/GSLocalMemory.h GS/GSLzma.h diff --git a/pcsx2/GS/GS_codegen.h b/pcsx2/GS/GS_codegen.h deleted file mode 100644 index 7ca1b4b847..0000000000 --- a/pcsx2/GS/GS_codegen.h +++ /dev/null @@ -1,38 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see <http://www.gnu.org/licenses/>. 
- */ - -#pragma once - -using namespace Xbyak; - -#ifdef _M_AMD64 -// Yeah let use mips naming ;) - #ifdef _WIN64 - #define a0 rcx - #define a1 rdx - #define a2 r8 - #define a3 r9 - #define t0 rdi - #define t1 rsi - #else - #define a0 rdi - #define a1 rsi - #define a2 rdx - #define a3 rcx - #define t0 r8 - #define t1 r9 - #endif -#endif - diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp index 2d01f96c2f..c9f23ff64c 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp @@ -17,16 +17,6 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.all.h" -#if _M_SSE >= 0x501 -#else -void GSDrawScanlineCodeGenerator::Generate() -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - Generate_AVX(); - else - Generate_SSE(); -} -#endif GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) @@ -40,218 +30,3 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key GSDrawScanlineCodeGenerator2(this, CPUInfo(m_cpu), (void*)&m_local, m_sel.key).Generate(); } - -void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, uint8 shift) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - if (shift == 0) - { - vpmulhrsw(a, f); - } - else - { - vpsllw(a, shift + 1); - vpmulhw(a, f); - } - } - else - { - if (shift == 0 && m_cpu.has(Xbyak::util::Cpu::tSSSE3)) - { - pmulhrsw(a, f); - } - else - { - psllw(a, shift + 1); - pmulhw(a, f); - } - } -} - -void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpsubw(a, b); - modulate16(a, f, shift); - vpaddw(a, b); - } - else - { - psubw(a, b); - modulate16(a, f, shift); - paddw(a, b); - } -} - -void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const 
Xmm& f) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpsubw(a, b); - vpmullw(a, f); - vpsraw(a, 4); - vpaddw(a, b); - } - else - { - psubw(a, b); - pmullw(a, f); - psraw(a, 4); - paddw(a, b); - } -} - -void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpblendw(a, b, 0xaa); - } - else - { - pblendw(a, b, 0xaa); - } -} - -void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpackuswb(a, a); - -#if _M_SSE >= 0x501 - // Greg: why ? - if (m_cpu.has(Xbyak::util::Cpu::tAVX2)) - { - ASSERT(a.isYMM()); - vpermq(Ymm(a.getIdx()), Ymm(a.getIdx()), _MM_SHUFFLE(3, 1, 2, 0)); // this sucks - } -#endif - - vpmovzxbw(a, a); - } - else - { - packuswb(a, a); - pmovzxbw(a, a); - } -} - -void GSDrawScanlineCodeGenerator::alltrue(const Xmm& test) -{ - uint32 mask = test.isYMM() ? 0xffffffff : 0xffff; - - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpmovmskb(eax, test); - cmp(eax, mask); - je("step", T_NEAR); - } - else - { - pmovmskb(eax, test); - cmp(eax, mask); - je("step", T_NEAR); - } -} - -void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpand(b, mask); - vpandn(mask, a); - vpor(a, b, mask); - } - else - { - pand(b, mask); - pandn(mask, a); - por(b, mask); - movdqa(a, b); - } -} - -void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpand(b, mask); - vpandn(mask, a); - vpor(b, mask); - } - else - { - pand(b, mask); - pandn(mask, a); - por(b, mask); - } -} - -void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - vpblendvb(a, a, b, xmm0); - else - pblendvb(a, b); -} - -void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) -{ - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - vpblendvb(b, 
a, b, xmm0); - } - else - { - pblendvb(a, b); - movdqa(b, a); - } -} - -void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) -{ - // l = src & 0xFF; (1 left shift + 1 right shift) - // h = (src >> 8) & 0xFF; (1 right shift) - - if (m_cpu.has(Xbyak::util::Cpu::tAVX)) - { - if (src == h) - { - vpsllw(l, src, 8); - vpsrlw(h, 8); - } - else if (src == l) - { - vpsrlw(h, src, 8); - vpsllw(l, 8); - } - else - { - vpsllw(l, src, 8); - vpsrlw(h, src, 8); - } - vpsrlw(l, 8); - } - else - { - if (src == h) - { - movdqa(l, src); - } - else if (src == l) - { - movdqa(h, src); - } - else - { - movdqa(l, src); - movdqa(h, src); - } - psllw(l, 8); - psrlw(l, 8); - psrlw(h, 8); - } -} diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h index 253bcb678a..70eda1b31a 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h @@ -27,117 +27,12 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator { - typedef Xbyak::Ymm Ymm; - typedef Xbyak::Xmm Xmm; - typedef Xbyak::Reg8 Reg8; - typedef Xbyak::Operand Operand; - void operator=(const GSDrawScanlineCodeGenerator&); GSScanlineSelector m_sel; GSScanlineLocalData& m_local; bool m_rip; - void Generate(); - -#if _M_SSE >= 0x501 - - void Init(); - void Step(); - void TestZ(const Ymm& temp1, const Ymm& temp2); - void SampleTexture(); - void Wrap(const Ymm& uv0); - void Wrap(const Ymm& uv0, const Ymm& uv1); - void SampleTextureLOD(); - void WrapLOD(const Ymm& uv0); - void WrapLOD(const Ymm& uv0, const Ymm& uv1); - void AlphaTFX(); - void ReadMask(); - void TestAlpha(); - void ColorTFX(); - void Fog(); - void ReadFrame(); - void TestDestAlpha(); - void WriteMask(); - void WriteZBuf(); - void AlphaBlend(); - void WriteFrame(); - void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr); - void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const 
Xbyak::Reg32& mask, bool fast, int psm, int fz); - void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm); - void ReadTexel(int pixels, int mip_offset = 0); - void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); - -#else - - void Generate_SSE(); - void Init_SSE(); - void Step_SSE(); - void TestZ_SSE(const Xmm& temp1, const Xmm& temp2); - void SampleTexture_SSE(); - void Wrap_SSE(const Xmm& uv0); - void Wrap_SSE(const Xmm& uv0, const Xmm& uv1); - void SampleTextureLOD_SSE(); - void WrapLOD_SSE(const Xmm& uv0); - void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1); - void AlphaTFX_SSE(); - void ReadMask_SSE(); - void TestAlpha_SSE(); - void ColorTFX_SSE(); - void Fog_SSE(); - void ReadFrame_SSE(); - void TestDestAlpha_SSE(); - void WriteMask_SSE(); - void WriteZBuf_SSE(); - void AlphaBlend_SSE(); - void WriteFrame_SSE(); - void ReadPixel_SSE(const Xmm& dst, const RegLong& addr); - void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); - void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm); - void ReadTexel_SSE(int pixels, int mip_offset = 0); - void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i); - - void Generate_AVX(); - void Init_AVX(); - void Step_AVX(); - void TestZ_AVX(const Xmm& temp1, const Xmm& temp2); - void SampleTexture_AVX(); - void Wrap_AVX(const Xmm& uv0); - void Wrap_AVX(const Xmm& uv0, const Xmm& uv1); - void SampleTextureLOD_AVX(); - void WrapLOD_AVX(const Xmm& uv0); - void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1); - void AlphaTFX_AVX(); - void ReadMask_AVX(); - void TestAlpha_AVX(); - void ColorTFX_AVX(); - void Fog_AVX(); - void ReadFrame_AVX(); - void TestDestAlpha_AVX(); - void WriteMask_AVX(); - void WriteZBuf_AVX(); - void AlphaBlend_AVX(); - void WriteFrame_AVX(); - void ReadPixel_AVX(const Xmm& dst, const RegLong& addr); - void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz); - void 
WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm); - void ReadTexel_AVX(int pixels, int mip_offset = 0); - void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i); - -#endif - - void modulate16(const Xmm& a, const Operand& f, uint8 shift); - void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift); - void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f); - void mix16(const Xmm& a, const Xmm& b, const Xmm& temp); - void clamp16(const Xmm& a, const Xmm& temp); - void alltrue(const Xmm& test); - void blend(const Xmm& a, const Xmm& b, const Xmm& mask); - void blendr(const Xmm& b, const Xmm& a, const Xmm& mask); - void blend8(const Xmm& a, const Xmm& b); - void blend8r(const Xmm& b, const Xmm& a); - void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src); - public: GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); }; diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp deleted file mode 100644 index eb3be1a385..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ /dev/null @@ -1,2129 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" -#include "GSVertexSW.h" -#include "GS/GS_codegen.h" - -#undef _t - -#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) - -// Ease the reading of the code -#define _m_local r12 -#define _m_local__gd r13 -#define _m_local__gd__vm a1 -#define _m_local__gd__clut r11 -#define _m_local__gd__tex a3 -// More pretty name -#define _z xmm8 -#define _f xmm9 -#define _s xmm10 -#define _t xmm11 -#define _q xmm12 -#define _f_rb xmm13 -#define _f_ga xmm14 -#define _test xmm15 -// Extra bonus -#define _rb xmm2 -#define _ga xmm3 -#define _fm xmm4 -#define _zm xmm5 -#define _fd xmm6 - -#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[_m_local + offsetof(GSScanlineLocalData, field)]) -#define _rip_global(field) (m_rip ? ptr[rip + &m_local.gd->field] : ptr[_m_local__gd + offsetof(GSScanlineGlobalData, field)]) - -#ifdef _WIN64 -#else -static const int _rz_rbx = -8 * 1; -static const int _rz_r12 = -8 * 2; -static const int _rz_r13 = -8 * 3; -//static const int _rz_r14 = -8 * 4; -//static const int _rz_r15 = -8 * 5; -static const int _rz_top = -8 * 6; -static const int _rz_zs = -8 * 8; -static const int _rz_zd = -8 * 10; -static const int _rz_cov = -8 * 12; -#endif - -void GSDrawScanlineCodeGenerator::Generate_AVX() -{ - bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; - bool need_clut = need_tex && m_sel.tlu; - m_rip = (size_t)getCurr() < 0x80000000; - m_rip &= (size_t)&m_local < 0x80000000; - m_rip &= (size_t)&m_local.gd < 0x80000000; - -#ifdef _WIN64 - push(rbx); - push(rsi); - push(rdi); - push(rbp); - push(r12); - push(r13); - - sub(rsp, 8 + 10 * 16); - - for (int i = 6; i < 16; i++) - { - vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); - } -#else - // No reservation on the stack as a red zone is available - push(rbp); - mov(ptr[rsp + _rz_rbx], rbx); - if (!m_rip) - { - mov(ptr[rsp + _rz_r12], r12); - mov(ptr[rsp + _rz_r13], r13); - } -#endif - - mov(r10, 
(size_t)g_const->m_test_128b[0]); - if (!m_rip) - { - mov(_m_local, (size_t)&m_local); - mov(_m_local__gd, _rip_local(gd)); - } - - if (need_clut) - mov(_m_local__gd__clut, _rip_global(clut)); - - Init_AVX(); - - // a0 = steps - // t1 = fza_base - // t0 = fza_offset - // r10 = &m_test[0] - // _m_local = &m_local - // _m_local__gd = m_local->gd - // _m_local__gd__vm = m_local->gd.vm - // xmm7 = vf (sprite && ltf) - // xmm8 = z - // xmm9 = f - // xmm10 = s - // xmm11 = t - // xmm12 = q - // xmm13 = rb - // xmm14 = ga - // xmm15 = test - - if (!m_sel.edge) - { - align(16); - } - -L("loop"); - - TestZ_AVX(xmm5, xmm6); - - // ebp = za - - // FIXME not yet done - if (m_sel.mmin && 0) - { - SampleTextureLOD_AVX(); - } - else - { - SampleTexture_AVX(); - } - - // ebp = za - // xmm2 = rb - // xmm3 = ga - - AlphaTFX_AVX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - - ReadMask_AVX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - TestAlpha_AVX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - ColorTFX_AVX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - Fog_AVX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - ReadFrame_AVX(); - - // ebx = fa - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - TestDestAlpha_AVX(); - - // ebx = fa - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - WriteMask_AVX(); - - // ebx = fa - // edx = fzm - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - WriteZBuf_AVX(); - - // ebx = fa - // edx = fzm - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm6 = fd - - AlphaBlend_AVX(); - - // ebx = fa - // edx = fzm - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm6 = fd - - WriteFrame_AVX(); - -L("step"); - - // if(steps <= 0) break; - - if (!m_sel.edge) - { - test(a0.cvt32(), a0.cvt32()); - - 
jle("exit", T_NEAR); - - Step_AVX(); - - jmp("loop", T_NEAR); - } - -L("exit"); - -#ifdef _WIN64 - for (int i = 6; i < 16; i++) - { - vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); - } - - add(rsp, 8 + 10 * 16); - - pop(r13); - pop(r12); - pop(rbp); - pop(rdi); - pop(rsi); - pop(rbx); -#else - mov(rbx, ptr[rsp + _rz_rbx]); - if (!m_rip) - { - mov(r12, ptr[rsp + _rz_r12]); - mov(r13, ptr[rsp + _rz_r13]); - } - pop(rbp); -#endif - - ret(); -} - -void GSDrawScanlineCodeGenerator::Init_AVX() -{ - if (!m_sel.notest) - { - // int skip = left & 3; - - mov(ebx, a1.cvt32()); - and(a1.cvt32(), 3); - - // left -= skip; - - sub(ebx, a1.cvt32()); - - // int steps = pixels + skip - 4; - - lea(a0.cvt32(), ptr[a0 + a1 - 4]); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(a1.cvt32(), 4); // * sizeof(m_test[0]) - - vmovdqa(_test, ptr[a1 + r10]); - - mov(eax, a0.cvt32()); - sar(eax, 31); // GH: 31 to extract the sign of the register - and(eax, a0.cvt32()); - shl(eax, 4); // * sizeof(m_test[0]) - cdqe(); - - vpor(_test, ptr[rax + r10 + 7 * 16]); - } - else - { - mov(ebx, a1.cvt32()); // left - xor(a1.cvt32(), a1.cvt32()); // skip - lea(a0.cvt32(), ptr[a0 - 4]); // steps - } - - // a0 = steps - // a1 = skip - // rbx = left - - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(rax, _rip_global(fzbr)); - lea(t1, ptr[rax + a2 * 8]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - mov(rax, _rip_global(fzbc)); - lea(t0, ptr[rax + rbx * 2]); - - if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // a1 = &m_local.d[skip] // note a1 was (skip << 4) - - // FIXME - //lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]); - lea(rax, _rip_local(d)); - lea(a1, ptr[rax + a1 * 8]); - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, p)]); // 
v.p - - if (m_sel.fwrite && m_sel.fge) - { - // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(_f, xmm0); - vpshufhw(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(_f, _f, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(_f, ptr[a1 + 16 * 6]); - } - - if (m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(_z, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(_z, ptr[a1]); - } - } - } - else - { - if (m_sel.ztest) - { - vmovdqa(_z, _rip_local(p.z)); - } - - if (m_sel.fwrite && m_sel.fge) - vmovdqa(_f, _rip_local(p.f)); - } - - if (m_sel.fb) - { - if (m_sel.edge || m_sel.tfx != TFX_NONE) - { - vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t - } - - if (m_sel.edge) - { - // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); - - vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(xmm1, 9); - -#ifdef _WIN64 - vmovdqa(_rip_local(temp.cov), xmm1); -#else - vmovdqa(ptr[rsp + _rz_cov], xmm1); -#endif - } - - if (m_sel.tfx != TFX_NONE) - { - // a1 = &m_local.d[skip] - - if (m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(xmm0, xmm0); - - // s = vti.xxxx() + m_local.d[skip].s; - // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - - vpshufd(_s, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(_t, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpaddd(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); - } - else if (m_sel.ltf) - { - vpshuflw(xmm7, _t, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm7, 12); - } - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(_s, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(_t, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(_q, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(_s, ptr[a1 + 
offsetof(GSScanlineLocalData::skip, s)]); - vaddps(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]); - vaddps(_q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vcvttps2dq(xmm0, ptr[a3 + offsetof(GSVertexSW, c)]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm0, xmm1); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(_f_rb, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(_f_ga, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(_f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(_f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]); - } - else - { - vmovdqa(_f_rb, _rip_local(c.rb)); - vmovdqa(_f_ga, _rip_local(c.ga)); - } - - vmovdqa(_rb, _f_rb); - vmovdqa(_ga, _f_ga); - } - } - - if (m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe) - { - // On linux, a2 is edx which will be used for fzm - // In all case, it will require a mov in dthe code, so let's keep the value on the stack -#ifdef _WIN64 - ASSERT(0); -#else - mov(ptr[rsp + _rz_top], a2); -#endif - } - - mov(_m_local__gd__vm, _rip_global(vm)); - if (m_sel.fb && m_sel.tfx != TFX_NONE) - mov(_m_local__gd__tex, _rip_global(tex)); -} - -void GSDrawScanlineCodeGenerator::Step_AVX() -{ - // steps -= 4; - - sub(a0.cvt32(), 4); - - // fza_offset++; - - add(t0, 8); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // z += m_local.d4.z; - - if (m_sel.zb) - { - vaddps(_z, _rip_local(d4.z)); - } - - // f = f.add16(m_local.d4.f); - - if (m_sel.fwrite && m_sel.fge) - { - vpaddw(_f, _rip_local(d4.f)); - } - } - else - { - if (m_sel.ztest) - { - } - } - - if (m_sel.fb) - { - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i st = m_local.d4.st; - - // si += st.xxxx(); - // if(!sprite) ti += st.yyyy(); - - vmovdqa(xmm0, _rip_local(d4.stq)); - - vpshufd(xmm1, xmm0, 
_MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(_s, xmm1); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(_t, xmm1); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vmovaps(xmm0, _rip_local(d4.stq)); - - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(_s, xmm1); - vaddps(_t, xmm2); - vaddps(_q, xmm3); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - vmovdqa(xmm0, _rip_local(d4.c)); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(_f_rb, xmm1); - vpaddw(_f_ga, xmm2); - - // FIXME: color may underflow and roll over at the end of the line, if decreasing - - vpxor(xmm0, xmm0); - vpmaxsw(_f_rb, xmm0); - vpmaxsw(_f_ga, xmm0); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - } - } - - vmovdqa(_rb, _f_rb); - vmovdqa(_ga, _f_ga); - } - } - - if (!m_sel.notest) - { - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(eax, a0.cvt32()); - sar(eax, 31); // GH: 31 to extract the sign of the register - and(eax, a0.cvt32()); - shl(eax, 4); - cdqe(); - - vmovdqa(_test, ptr[rax + r10 + 7 * 16]); - } -} - -void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) -{ - if (!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, dword[t1 + 4]); - add(ebp, dword[t0 + 4]); - and(ebp, HALF_VM_SIZE - 1); - - // GSVector4i zs = zi; - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(rax, (size_t)&GSVector4::m_half); - - vbroadcastss(xmm0, ptr[rax]); - vmulps(xmm0, _z); - vcvttps2dq(xmm0, 
xmm0); - vpslld(xmm0, 1); - - vcvttps2dq(xmm1, _z); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm1, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // zs = GSVector4i(z); - - vcvttps2dq(xmm0, _z); - } - - if (m_sel.zwrite) - { -#ifdef _WIN64 - vmovdqa(_rip_local(temp.zs), xmm0); -#else - vmovdqa(ptr[rsp + _rz_zs], xmm0); -#endif - } - } - else - { - movdqa(xmm0, _z); - } - - if (m_sel.ztest) - { - ReadPixel_AVX(xmm1, rbp); - - if (m_sel.zwrite && m_sel.zpsm < 2) - { -#ifdef _WIN64 - vmovdqa(_rip_local(temp.zd), xmm1); -#else - vmovdqa(ptr[rsp + _rz_zd], xmm1); -#endif - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if (m_sel.zpsm) - { - vpslld(xmm1, static_cast(m_sel.zpsm * 8)); - vpsrld(xmm1, static_cast(m_sel.zpsm * 8)); - } - - if (m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - vpcmpeqd(xmm2, xmm2); - vpslld(xmm2, 31); - - // GSVector4i zso = zs - o; - // GSVector4i zdo = zd - o; - - vpsubd(xmm0, xmm2); - vpsubd(xmm1, xmm2); - } - - switch (m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(xmm1, xmm0); - vpor(_test, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(xmm0, xmm1); - vpcmpeqd(xmm2, xmm2); - vpxor(xmm0, xmm2); - vpor(_test, xmm0); - break; - } - - alltrue(_test); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture_AVX() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - if (!m_sel.fst) - { - vrcpps(xmm0, _q); - - vmulps(xmm4, _s, xmm0); - vmulps(xmm5, _t, xmm0); - - vcvttps2dq(xmm4, xmm4); - vcvttps2dq(xmm5, xmm5); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm0, eax); - vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm4, xmm0); - vpsubd(xmm5, xmm0); - } - } - else - { - vmovdqa(xmm4, _s); - vmovdqa(xmm5, _t); - } - - if (m_sel.ltf) - { - // GSVector4i 
uf = u.xxzzlh().srl16(12); - - vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 12); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // GSVector4i vf = v.xxzzlh().srl16(12); - - vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm7, 12); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm4, 16); - vpsrad(xmm5, 16); - vpackssdw(xmm4, xmm5); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm0, xmm0); - vpsrlw(xmm0, 15); - vpaddw(xmm5, xmm4, xmm0); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap_AVX(xmm4, xmm5); - } - else - { - // uv0 = Wrap(uv0); - - Wrap_AVX(xmm4); - } - - // xmm4 = uv0 - // xmm5 = uv1 (ltf) - // xmm6 = uf - // xmm7 = vf - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm2, xmm4, xmm0); - vpunpckhwd(xmm3, xmm4, xmm0); - vpslld(xmm3, static_cast(m_sel.tw + 3)); - - // xmm0 = 0 - // xmm2 = x0 - // xmm3 = y0 - // xmm5 = uv1 (ltf) - // xmm6 = uf - // xmm7 = vf - - if (m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - vpunpcklwd(xmm4, xmm5, xmm0); - vpunpckhwd(xmm5, xmm5, xmm0); - vpslld(xmm5, static_cast(m_sel.tw + 3)); - - // xmm2 = x0 - // xmm3 = y0 - // xmm4 = x1 - // xmm5 = y1 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm0, xmm3, xmm2); - vpaddd(xmm1, xmm3, xmm4); - vpaddd(xmm2, xmm5, xmm2); - vpaddd(xmm3, xmm5, xmm4); - - // xmm0 = addr00 - // xmm1 = addr01 - // xmm2 = addr10 - // xmm3 = addr11 - // xmm6 = uf - // xmm7 = vf - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - 
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(4, 0); - - // xmm0 = c10 - // xmm1 = c11 - // xmm4 = c00 - // xmm5 = c01 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm3, xmm4); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm4, xmm5, xmm5); - - // xmm0 = c10 - // xmm1 = c11 - // xmm2 = rb00 - // xmm3 = ga00 - // xmm4 = rb01 - // xmm5 = ga01 - // xmm6 = uf - // xmm7 = vf - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(xmm4, xmm2, xmm6); - lerp16_4(xmm5, xmm3, xmm6); - - // xmm0 = c10 - // xmm1 = c11 - // xmm4 = rb00 - // xmm5 = ga00 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm2, xmm3, xmm0); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm0, xmm1, xmm1); - - // xmm0 = rb11 - // xmm1 = ga11 - // xmm2 = rb10 - // xmm3 = ga10 - // xmm4 = rb00 - // xmm5 = ga00 - // xmm6 = uf - // xmm7 = vf - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(xmm0, xmm2, xmm6); - lerp16_4(xmm1, xmm3, xmm6); - - // xmm0 = rb10 - // xmm1 = ga10 - // xmm4 = rb00 - // xmm5 = ga00 - // xmm7 = vf - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - lerp16_4(xmm0, xmm4, xmm7); - lerp16_4(xmm1, xmm5, xmm7); - - // FIXME not ideal (but allow different source in ReadTexel and less register dependency) - vmovdqa(xmm2, xmm0); - vmovdqa(xmm3, xmm1); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm0, xmm3, xmm2); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(1, 0); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(_rb, _ga, xmm4); - } - - // xmm2 = rb - // xmm3 = ga -} - -void 
GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) -{ - // xmm0, xmm1, xmm2, xmm3 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv, _rip_global(t.min)); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv, xmm0); - } - - vpminsw(uv, _rip_global(t.max)); - } - else - { - vpand(uv, _rip_global(t.min)); - - if (region) - { - vpor(uv, _rip_global(t.max)); - } - } - } - else - { - vmovdqa(xmm2, _rip_global(t.min)); - vmovdqa(xmm3, _rip_global(t.max)); - vmovdqa(xmm0, _rip_global(t.mask)); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv, xmm2); - - if (region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, xmm2); - vpminsw(uv, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm2, xmm3 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vmovdqa(xmm0, _rip_global(t.min)); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - - vmovdqa(xmm0, _rip_global(t.max)); - vpminsw(uv0, xmm0); - vpminsw(uv1, xmm0); - } - else - { - vmovdqa(xmm0, _rip_global(t.min)); - vpand(uv0, xmm0); - vpand(uv1, xmm0); - - if (region) - { - vmovdqa(xmm0, _rip_global(t.max)); - vpor(uv0, xmm0); - vpor(uv1, xmm0); - } - } - } - else - { - vmovdqa(xmm2, _rip_global(t.min)); - vmovdqa(xmm3, _rip_global(t.max)); - vmovdqa(xmm0, _rip_global(t.mask)); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) 
| m_local.gd->t.max; - - vpand(xmm1, uv0, xmm2); - - if (region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, xmm2); - vpminsw(uv0, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, xmm1, xmm0); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv1, xmm2); - - if (region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, xmm2); - vpminsw(uv1, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() -{ -} - -void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) -{ -} - -void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) -{ -} - -void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() -{ - if (!m_sel.fb) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(_ga, _f_ga, 1); - - clamp16(_ga, xmm0); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(xmm1, _f_ga, 7); - - mix16(_ga, xmm1, xmm0); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(xmm1, _f_ga, 7); - - mix16(_ga, xmm1, xmm0); - } - - break; - - case TFX_HIGHLIGHT: - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(xmm1, _f_ga, 7); - - if (m_sel.tcc) - { - vpaddusb(xmm1, _ga); - } - - mix16(_ga, xmm1, xmm0); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(xmm1, _f_ga, 7); - - mix16(_ga, xmm1, xmm0); - } - - break; - - case TFX_NONE: - - // gat = iip ? 
ga.srl16(7) : ga; - - if (m_sel.iip) - { - vpsrlw(_ga, _f_ga, 7); - } - - break; - } - - if (m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if (!m_sel.abe) - { - // a = cov - - if (m_sel.edge) - { -#ifdef _WIN64 - vmovdqa(xmm0, _rip_local(temp.cov)); -#else - vmovdqa(xmm0, ptr[rsp + _rz_cov]); -#endif - } - else - { - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - } - - mix16(_ga, xmm0, xmm1); - } - else - { - // a = a == 0x80 ? cov : a - - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - - if (m_sel.edge) - { -#ifdef _WIN64 - vmovdqa(xmm1, _rip_local(temp.cov)); -#else - vmovdqa(xmm1, ptr[rsp + _rz_cov]); -#endif - } - else - { - vmovdqa(xmm1, xmm0); - } - - vpcmpeqw(xmm0, _ga); - vpsrld(xmm0, 16); - vpslld(xmm0, 16); - - vpblendvb(_ga, xmm1, xmm0); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask_AVX() -{ - if (m_sel.fwrite) - { - vmovdqa(_fm, _rip_global(fm)); - } - - if (m_sel.zwrite) - { - vmovdqa(_zm, _rip_global(zm)); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha_AVX() -{ - switch (m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - vpcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, _ga, 16); - vpcmpgtd(xmm1, _rip_global(aref)); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, _ga, 16); - vpcmpeqd(xmm1, _rip_global(aref)); - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, _ga, 16); - vmovdqa(xmm1, _rip_global(aref)); - vpcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(xmm1, _ga, 16); - vpcmpeqd(xmm1, _rip_global(aref)); - break; - } - - switch (m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(_test, 
xmm1); - alltrue(_test); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(_zm, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(_fm, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(_zm, xmm1); - // fm |= t & GSVector4i::xff000000(); - vpsrld(xmm1, 24); - vpslld(xmm1, 24); - vpor(_fm, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(_rb, _f_rb, 1); - - clamp16(_rb, xmm0); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(xmm1, _ga); - - modulate16(_ga, _f_ga, 1); - - vpshuflw(xmm6, _f_ga, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(xmm6, 7); - - vpaddw(_ga, xmm6); - - clamp16(_ga, xmm0); - - mix16(_ga, xmm1, xmm0); - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(_rb, _f_rb, 1); - - vpaddw(_rb, xmm6); - - clamp16(_rb, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? 
rb.srl16(7) : rb; - - if (m_sel.iip) - { - vpsrlw(_rb, _f_rb, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog_AVX() -{ - if (!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - vmovdqa(xmm6, _ga); - - vmovdqa(xmm0, _rip_global(frb)); - vmovdqa(xmm1, _rip_global(fga)); - - lerp16(_rb, xmm0, _f, 0); - lerp16(_ga, xmm1, _f, 0); - - mix16(_ga, xmm6, _f); -} - -void GSDrawScanlineCodeGenerator::ReadFrame_AVX() -{ - if (!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, dword[t1]); - add(ebx, dword[t0]); - and(ebx, HALF_VM_SIZE - 1); - - if (!m_sel.rfb) - { - return; - } - - ReadPixel_AVX(_fd, rbx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() -{ - if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if (m_sel.datm) - { - if (m_sel.fpsm == 2) - { - vpxor(xmm0, xmm0); - //vpsrld(xmm1, _fd, 15); - vpslld(xmm1, _fd, 16); - vpsrad(xmm1, 31); - vpcmpeqd(xmm1, xmm0); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, _fd, xmm0); - vpsrad(xmm1, 31); - } - } - else - { - if (m_sel.fpsm == 2) - { - vpslld(xmm1, _fd, 16); - vpsrad(xmm1, 31); - } - else - { - vpsrad(xmm1, _fd, 31); - } - } - - vpor(_test, xmm1); - - alltrue(_test); -} - -void GSDrawScanlineCodeGenerator::WriteMask_AVX() -{ - if (m_sel.notest) - { - return; - } - - // fm |= test; - // zm |= test; - - if (m_sel.fwrite) - { - vpor(_fm, _test); - } - - if (m_sel.zwrite) - { - vpor(_zm, _test); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - vpcmpeqd(xmm1, xmm1); - - if (m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(xmm0, xmm1, _zm); - vpcmpeqd(xmm1, _fm); - vpackssdw(xmm1, xmm0); - } - else if (m_sel.fwrite) - { - vpcmpeqd(xmm1, _fm); - vpackssdw(xmm1, xmm1); - } - else if (m_sel.zwrite) - { - vpcmpeqd(xmm1, _zm); - 
vpackssdw(xmm1, xmm1); - } - - vpmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() -{ - if (!m_sel.zwrite) - { - return; - } - - if (m_sel.prim != GS_SPRITE_CLASS) -#ifdef _WIN64 - vmovdqa(xmm1, _rip_local(temp.zs)); -#else - vmovdqa(xmm1, ptr[rsp + _rz_zs]); -#endif - else - vmovdqa(xmm1, _rip_local(p.z)); - - if (m_sel.ztest && m_sel.zpsm < 2) - { - // zs = zs.blend8(zd, zm); - -#ifdef _WIN64 - vpblendvb(xmm1, _rip_local(temp.zd), _zm); -#else - vpblendvb(xmm1, ptr[rsp + _rz_zd], _zm); -#endif - } - - bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - - WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - const Xmm& _dst_rb = xmm0; - const Xmm& _dst_ga = xmm1; - - if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch (m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - split16_2x8(_dst_rb, _dst_ga, _fd); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(xmm15, xmm15); - - vpsrld(xmm15, 27); // 0x0000001f - vpand(_dst_rb, _fd, xmm15); - vpslld(_dst_rb, 3); - - vpslld(xmm15, 10); // 0x00007c00 - vpand(xmm5, _fd, xmm15); - vpslld(xmm5, 9); - - vpor(_dst_rb, xmm5); - - vpsrld(xmm15, 5); // 0x000003e0 - vpand(_dst_ga, _fd, xmm15); - vpsrld(_dst_ga, 2); - - vpsllw(xmm15, 10); // 0x00008000 - vpand(xmm5, _fd, xmm15); - vpslld(xmm5, 8); - - vpor(_dst_ga, xmm5); - - break; - } - } - - // xmm2, xmm3 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm5, xmm15 = free - - if (m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(xmm5, _rb); - } - - if (m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - 
switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(_rb, _dst_rb); - break; - case 2: - vpxor(_rb, _rb); - break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch (m_sel.abb) - { - case 0: - vpsubw(_rb, xmm5); - break; - case 1: - vpsubw(_rb, _dst_rb); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch (m_sel.abc) - { - case 0: - case 1: - vpshuflw(xmm15, m_sel.abc ? _dst_ga : _ga, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(xmm15, 7); - break; - case 2: - vmovdqa(xmm15, _rip_global(afix)); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(_rb, xmm15, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch (m_sel.abd) - { - case 0: - vpaddw(_rb, xmm5); - break; - case 1: - vpaddw(_rb, _dst_rb); - break; - case 2: - break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(_rb, _dst_rb); - break; - case 2: - vpxor(_rb, _rb); - break; - } - } - - if (m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(xmm0, _ga, 8); - vpsrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(_rb, xmm5, _rb, xmm0); - } - - // xmm0 = pabe mask - // xmm3 = src ga - // xmm1 = dst ga - // xmm2 = rb - // xmm15 = a - // xmm5 = free - - vmovdqa(xmm5, _ga); - - if (m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(_ga, _dst_ga); - break; - case 2: - vpxor(_ga, _ga); - break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch (m_sel.abb) - { - case 0: - vpsubw(_ga, xmm5); - break; - case 1: - vpsubw(_ga, _dst_ga); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(_ga, xmm15, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch (m_sel.abd) - { - case 0: - vpaddw(_ga, xmm5); - 
break; - case 1: - vpaddw(_ga, _dst_ga); - break; - case 2: - break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(_ga, _dst_ga); - break; - case 2: - vpxor(_ga, _ga); - break; - } - } - - // xmm0 = pabe mask - // xmm5 = src ga - // xmm2 = rb - // xmm3 = ga - // xmm1, xmm15 = free - - if (m_sel.pabe) - { - vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(_ga, xmm5, _ga, xmm0); - } - else - { - if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(_ga, xmm5, xmm15); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.fpsm == 2 && m_sel.dthe) - { - // y = (top & 3) << 5 - -#ifdef _WIN64 - ASSERT(0); -#else - mov(eax, ptr[rsp + _rz_top]); -#endif - and(eax, 3); - shl(eax, 5); - - // rb = rb.add16(m_global.dimx[0 + y]); - // ga = ga.add16(m_global.dimx[1 + y]); - - add(rax, _rip_global(dimx)); - - vpaddw(xmm2, ptr[rax + sizeof(GSVector4i) * 0]); - vpaddw(xmm3, ptr[rax + sizeof(GSVector4i) * 1]); - } - - if (m_sel.colclamp == 0) - { - // c[0] &= 0x00ff00ff; - // c[1] &= 0x00ff00ff; - - vpcmpeqd(xmm15, xmm15); - vpsrlw(xmm15, 8); - vpand(xmm2, xmm15); - vpand(xmm3, xmm15); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(xmm15, xmm2, xmm3); - vpunpcklwd(xmm2, xmm3); - vpackuswb(xmm2, xmm15); - - if (m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(xmm15, xmm15); - vpslld(xmm15, 31); - vpor(xmm2, xmm15); - } - - // xmm2 = fs - // xmm4 = fm - // xmm6 = fd - - if (m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm0, eax); - vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - vmovd(xmm1, eax); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); - - vpand(xmm0, xmm2); - 
vpand(xmm1, xmm2); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(xmm2, xmm0, 9); - vpsrld(xmm0, 3); - vpsrld(xmm3, xmm1, 16); - vpsrld(xmm1, 6); - - vpor(xmm0, xmm1); - vpor(xmm2, xmm3); - vpor(xmm2, xmm0); - } - - if (m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm2, _fd, _fm); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - - WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr) -{ - vmovq(dst, qword[_m_local__gd__vm + addr * 2]); - vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if (m_sel.notest) - { - if (fast) - { - vmovq(qword[_m_local__gd__vm + addr * 2], src); - vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); - } - else - { - WritePixel_AVX(src, addr, 0, psm); - WritePixel_AVX(src, addr, 1, psm); - WritePixel_AVX(src, addr, 2, psm); - WritePixel_AVX(src, addr, 3, psm); - } - } - else - { - if (fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[_m_local__gd__vm + addr * 2], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src); - L("@@"); - - // vmaskmovps? 
- } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel_AVX(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel_AVX(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel_AVX(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel_AVX(src, addr, 3, psm); - L("@@"); - } - } -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm) -{ - Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2]; - - switch (psm) - { - case 0: - if (i == 0) - vmovd(dst, src); - else - vpextrd(dst, src, i); - break; - case 1: - if (i == 0) - vmovd(eax, src); - else - vpextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - vpextrw(eax, src, i * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) -{ - const int in[] = {0, 1, 2, 3}; - const int out[] = {4, 5, 0, 1}; - - for (int i = 0; i < pixels; i++) - { - for (uint8 j = 0; j < 4; j++) - { - ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) -{ - const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4]; - - // Extract address offset - if (i == 0) - vmovd(eax, addr); - else - vpextrd(eax, addr, i); - - // If clut, load the value as a byte index - if (m_sel.tlu) - movzx(eax, byte[_m_local__gd__tex + rax]); - - if (i == 0) - vmovd(dst, src); - else - vpinsrd(dst, src, i); -} - -// Gather example (AVX2). 
Not faster on Haswell but potentially better on recent CPU -// Worst case reduce Icache. -// -// Current limitation requires 1 extra free register for the mask. -// And palette need zero masking. -// It is not possible to use same source/destination so linear interpolation must be updated -#if 0 -void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) -{ - const int in[] = {0, 1, 2, 3}; - const int out[] = {4, 5, 0, 1}; - const int mask[] = {5, 0, 1, 2}; - - if (m_sel.tlu) { - for(int i = 0; i < pixels; i++) { - // FIXME can't use same dst and add register - Gather4Texel(Xmm(in[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); - // FIXME need a memory and could be faster - vpslld(Xmm(in[i]), 24); - vpsrld(Xmm(in[i]), 24); - Gather4Texel(Xmm(out[i]), _m_local__gd__clut, Xmm(in[i]), Xmm(mask[i])); - } - } else { - for(int i = 0; i < pixels; i++) { - Gather4Texel(Xmm(out[i]), _m_local__gd__tex, Xmm(in[i]), Xmm(mask[i])); - } - } -} - -static void Gather4Texel(const Xmm& dst, const Reg64& base, const Xmm& addr, const Xmm& Mask) -{ - //void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) - vpcmpeqd(Mask, Mask); - vpgatherdd(dst, ptr[base + addr * 4], Mask); -} - -#endif - -#endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp deleted file mode 100644 index 27ab2d8ce0..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp +++ /dev/null @@ -1,3103 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. 
- * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" -#include "GSVertexSW.h" -#include "GS/GS_codegen.h" - -#undef _t - -#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -// Ease the reading of the code -#define _m_local r12 -#define _m_local__gd r13 -#define _m_local__gd__vm a1 -#define _m_local__gd__clut r11 -#define _m_local__gd__tex a3 -// More pretty name -#define _z ymm8 -#define _f ymm9 -#define _s ymm10 -#define _t ymm11 -#define _q ymm12 -#define _f_rb ymm13 -#define _f_ga ymm14 -#define _test ymm15 -// Extra bonus -#define _rb ymm2 -#define _ga ymm3 -#define _fm ymm4 -#define _zm ymm5 -#define _fd ymm6 - -#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[_m_local + offsetof(GSScanlineLocalData, field)]) -#define _rip_global(field) (m_rip ? 
ptr[rip + &m_local.gd->field] : ptr[_m_local__gd + offsetof(GSScanlineGlobalData, field)]) - -#ifdef _WIN64 -#else -static const int _rz_rbx = -8 * 1; -static const int _rz_r12 = -8 * 2; -static const int _rz_r13 = -8 * 3; -//static const int _rz_r14 = -8 * 4; -//static const int _rz_r15 = -8 * 5; -static const int _rz_top = -8 * 4; -static const int _rz_zs = -8 * 8; -static const int _rz_zd = -8 * 12; -static const int _rz_cov = -8 * 16; -#endif - -void GSDrawScanlineCodeGenerator::Generate() -{ - ret(); - return; - - bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; - bool need_clut = need_tex && m_sel.tlu; - m_rip = (size_t)getCurr() < 0x80000000; - m_rip &= (size_t)&m_local < 0x80000000; - m_rip &= (size_t)&m_local.gd < 0x80000000; - -#ifdef _WIN64 - push(rbx); - push(rsi); - push(rdi); - push(rbp); - push(r12); - push(r13); - - sub(rsp, 8 + 10 * 16); - - for (int i = 6; i < 16; i++) - { - vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); - } -#else - // No reservation on the stack as a red zone is available - push(rbp); - mov(ptr[rsp + _rz_rbx], rbx); - if (!m_rip) - { - mov(ptr[rsp + _rz_r12], r12); - mov(ptr[rsp + _rz_r13], r13); - } -#endif - - mov(r10, (size_t)g_const->m_test_256b[0]); - if (!m_rip) - { - mov(_m_local, (size_t)&m_local); - mov(_m_local__gd, _rip_local(gd)); - } - - if (need_clut) - mov(_m_local__gd__clut, _rip_global(clut)); - - //db(0xcc); - - Init(); - - if (!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ymm0 = z/zi - // ymm2 = s/u (tme) - // ymm3 = t/v (tme) - // ymm4 = q (tme) - // ymm5 = rb (!tme) - // ymm6 = ga (!tme) - // ymm7 = test - - TestZ(ymm5, ymm6); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - ymm0 - // ymm2 = s/u (tme) - // ymm3 = t/v (tme) - // ymm4 = q (tme) - // ymm5 = rb (!tme) - // ymm6 = ga (!tme) - // ymm7 = test - - if (m_sel.mmin) - { - SampleTextureLOD(); - } - else - { - SampleTexture(); - } - - // ecx = steps - // esi = fzbr - // edi = 
fzbc - // ebp = za - // - ymm2 - // - ymm3 - // - ymm4 - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - AlphaTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ReadMask(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - TestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ColorTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - Fog(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ReadFrame(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - TestDestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - WriteMask(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - - WriteZBuf(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // - ebp - // ymm2 = fd - // ymm3 = fm - // - ymm4 - // ymm5 = rb - // ymm6 = ga - - AlphaBlend(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ymm2 = fd - // ymm3 = fm - // ymm5 = rb - // ymm6 = ga - - WriteFrame(); - -L("step"); - - // if(steps <= 0) break; - - if (!m_sel.edge) - { - 
test(ecx, ecx); - - jle("exit", T_NEAR); - - Step(); - - jmp("loop", T_NEAR); - } - -L("exit"); - -#ifdef _WIN64 - for (int i = 6; i < 16; i++) - { - vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); - } - - add(rsp, 8 + 10 * 16); - - pop(r13); - pop(r12); - pop(rbp); - pop(rdi); - pop(rsi); - pop(rbx); -#else - mov(rbx, ptr[rsp + _rz_rbx]); - if (!m_rip) - { - mov(r12, ptr[rsp + _rz_r12]); - mov(r13, ptr[rsp + _rz_r13]); - } - pop(rbp); -#endif -} - -void GSDrawScanlineCodeGenerator::Init() -{ - if (!m_sel.notest) - { - // int skip = left & 7; - - mov(ebx, a1.cvt32()); - and(a1.cvt32(), 7); - - // int steps = pixels + skip - 8; - - lea(a0, ptr[a0 + a1 - 8]); - - // left -= skip; - - sub(ebx, a1.cvt32()); - - // GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))]; - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - - vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[0]]); - vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)g_const->m_test_256b[15]]); - vpor(ymm7, ymm0); - - shl(edx, 5); - } - else - { - mov(ebx, edx); // left - xor(edx, edx); // skip - lea(ecx, ptr[ecx - 8]); // steps - } - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(esi, ptr[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, ptr[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, ptr[&m_local.gd->fzbc]); - - if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - lea(edx, ptr[edx * 8 + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, ptr[esp + _v]); - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vbroadcastf128(ymm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p - - if (m_sel.fwrite && m_sel.fge) - { - // f = GSVector8i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(ymm1, ymm0); - vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 
2)); - vpshufd(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(ymm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]); - - vmovdqa(ptr[&m_local.temp.f], ymm1); - } - - if (m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); - vmovaps(ptr[&m_local.temp.z], ymm0); - vmovaps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]); - vmovaps(ptr[&m_local.temp.zo], ymm2); - vaddps(ymm0, ymm2); - } - } - } - else - { - if (m_sel.ztest) - { - vpbroadcastd(ymm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.edge || m_sel.tfx != TFX_NONE) - { - vbroadcastf128(ymm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t - } - - if (m_sel.edge) - { - // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); - - vpshufhw(ymm3, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(ymm3, ymm3, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(ymm3, 9); - - vmovdqa(ptr[&m_local.temp.cov], ymm3); - } - - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(ymm6, ymm4); - - // s = vti.xxxx() + m_local.d[skip].s; - // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - - vpshufd(ymm2, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm3, ymm6, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpaddd(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - } - else - { - if (m_sel.ltf) - { - vpshuflw(ymm6, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm6, ymm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm6, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm6); - } - } - - vmovdqa(ptr[&m_local.temp.s], ymm2); - vmovdqa(ptr[&m_local.temp.t], ymm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(ymm4, ymm4, ymm4, 
_MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - vaddps(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - vaddps(ymm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]); - - vmovaps(ptr[&m_local.temp.s], ymm2); - vmovaps(ptr[&m_local.temp.t], ymm3); - vmovaps(ptr[&m_local.temp.q], ymm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vbroadcastf128(ymm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c - vcvttps2dq(ymm6, ymm6); - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(ymm5, ymm6, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(ymm6, ymm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(ymm5, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm6, ymm6, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(ymm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(ymm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]); - - vmovdqa(ptr[&m_local.temp.rb], ymm5); - vmovdqa(ptr[&m_local.temp.ga], ymm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - vmovdqa(ymm5, ptr[&m_local.c.rb]); - vmovdqa(ymm6, ptr[&m_local.c.ga]); - } - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step() -{ - // steps -= 8; - - sub(a0, 8); - - // fza_offset += 2; - - add(t0, 16); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // zo += GSVector8::broadcast32(&m_local.d8.p.z); - - if (m_sel.zb) - { - vbroadcastss(ymm0, ptr[&m_local.d8.p.z]); - vaddps(ymm0, ptr[&m_local.temp.zo]); - vmovaps(ptr[&m_local.temp.zo], ymm0); - vaddps(ymm0, ptr[&m_local.temp.z]); - } - - // f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f)); - - if (m_sel.fwrite && m_sel.fge) - { - vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]); - vpaddw(ymm1, ptr[&m_local.temp.f]); - vmovdqa(ptr[&m_local.temp.f], ymm1); - } - } - else - { - if (m_sel.ztest) - { - vpbroadcastd(ymm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.tfx != TFX_NONE) - { - 
if (m_sel.fst) - { - // GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq)); - - vbroadcasti128(ymm4, ptr[&m_local.d8.stq]); - - // s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); - - vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(ymm2, ptr[&m_local.temp.s]); - vmovdqa(ptr[&m_local.temp.s], ymm2); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - // t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy()); - - vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(ymm3, ptr[&m_local.temp.t]); - vmovdqa(ptr[&m_local.temp.t], ymm3); - } - else - { - vmovdqa(ymm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector8 stq(m_local.d8.stq); - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vbroadcastf128(ymm4, ptr[&m_local.d8.stq]); - - vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(ymm2, ptr[&m_local.temp.s]); - vaddps(ymm3, ptr[&m_local.temp.t]); - vaddps(ymm4, ptr[&m_local.temp.q]); - - vmovaps(ptr[&m_local.temp.s], ymm2); - vmovaps(ptr[&m_local.temp.t], ymm3); - vmovaps(ptr[&m_local.temp.q], ymm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c); - - vpbroadcastq(ymm7, ptr[&m_local.d8.c]); - - // rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero()); - // ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero()); - - vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(ymm5, ptr[&m_local.temp.rb]); - vpaddw(ymm6, ptr[&m_local.temp.ga]); - - // FIXME: color may underflow and roll over at the end of the line, if decreasing - - vpxor(ymm7, ymm7); - vpmaxsw(ymm5, ymm7); - vpmaxsw(ymm6, ymm7); - - vmovdqa(ptr[&m_local.temp.rb], ymm5); - vmovdqa(ptr[&m_local.temp.ga], ymm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - vmovdqa(ymm5, ptr[&m_local.c.rb]); - 
vmovdqa(ymm6, ptr[&m_local.c.ga]); - } - } - } - } - - if (!m_sel.notest) - { - // test = m_test[15 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - - vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[15]]); - } -} - -void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2) -{ - if (!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, ptr[esi + 4]); - add(ebp, ptr[edi + 4]); - and(ebp, HALF_VM_SIZE - 1); - - // GSVector8i zs = zi; - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.zoverflow) - { - // zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); - - vbroadcastss(ymm0, ptr[&GSVector8::m_half]); - vmulps(ymm0, _z); - vcvttps2dq(ymm0, ymm0); - vpslld(ymm0, 1); - - vcvttps2dq(ymm1, _z); - vpcmpeqd(ymm2, ymm2); - vpsrld(ymm2, 31); - vpand(ymm1, ymm2); - - vpor(ymm0, ymm1); - } - else - { - // zs = GSVector8i(z); - - vcvttps2dq(ymm0, ymm0); - } - - if (m_sel.zwrite) - { -#ifdef _WIN64 - vmovdqa(ptr[&m_local.temp.zs], ymm0); -#else - vmovdqa(ptr[rsp + _rz_zs], ymm0); -#endif - } - } - - if (m_sel.ztest) - { - ReadPixel(ymm1, temp1, rbp); - - if (m_sel.zwrite && m_sel.zpsm < 2) - { -#ifdef _WIN64 - vmovdqa(_rip_local(temp.zd), ymm1); -#else - vmovdqa(ptr[rsp + _rz_zd], ymm1); -#endif - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if (m_sel.zpsm) - { - vpslld(ymm1, (uint8)(m_sel.zpsm * 8)); - vpsrld(ymm1, (uint8)(m_sel.zpsm * 8)); - } - - if (m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector8i o = GSVector8i::x80000000(); - - vpcmpeqd(temp1, temp1); - vpslld(temp1, 31); - - // GSVector8i zso = zs - o; - // GSVector8i zdo = zd - o; - - vpsubd(ymm0, temp1); - vpsubd(ymm1, temp1); - } - - switch (m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(ymm1, ymm0); - vpor(ymm7, ymm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // 
test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(ymm0, ymm1); - vpcmpeqd(temp1, temp1); - vpxor(ymm0, temp1); - vpor(ymm7, ymm0); - break; - } - - alltrue(ymm7); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, ptr[&m_local.gd->tex[0]]); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - // ebx = tex - // edx = clut - - if (!m_sel.fst) - { - vrcpps(ymm0, ymm4); - - vmulps(ymm2, ymm0); - vmulps(ymm3, ymm0); - - vcvttps2dq(ymm2, ymm2); - vcvttps2dq(ymm3, ymm3); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - } - } - - // ymm2 = u - // ymm3 = v - - if (m_sel.ltf) - { - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i y0 = uv0.uph16() << tw; - // GSVector8i x0 = uv0.upl16(); - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, ymm2, ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = 
uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i y1 = uv1.uph16() << tw; - // GSVector8i x1 = uv1.upl16(); - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 0); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = c10 & mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 
8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 0); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv) -{ - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); - vpmaxsw(uv, ymm0); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv, ymm0); - } - - vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); - vpminsw(uv, ymm0); - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); - vpand(uv, ymm0); - - if (region) - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); - vpor(uv, ymm0); - } - } - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // GSVector8i repeat = (t & m_local.gd->t.min) | 
m_local.gd->t.max; - - vpand(ymm1, uv, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, ymm4); - vpminsw(uv, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv0, const Ymm& uv1) -{ - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpmaxsw(uv0, ymm4); - vpmaxsw(uv1, ymm4); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv0, ymm0); - vpmaxsw(uv1, ymm0); - } - - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vpminsw(uv0, ymm5); - vpminsw(uv1, ymm5); - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpand(uv0, ymm4); - vpand(uv1, ymm4); - - if (region) - { - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vpor(uv0, ymm5); - vpor(uv1, ymm5); - } - } - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv0, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, ymm4); - vpminsw(uv0, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, ymm1, ymm0); - - // uv1 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv1, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, ymm4); - vpminsw(uv1, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, ymm1, 
ymm0); - } -} - -void GSDrawScanlineCodeGenerator::SampleTextureLOD() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - push(ebp); - - mov(ebp, (size_t)m_local.gd->tex); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - if (!m_sel.fst) - { - vrcpps(ymm0, ymm4); - - vmulps(ymm2, ymm0); - vmulps(ymm3, ymm0); - - vcvttps2dq(ymm2, ymm2); - vcvttps2dq(ymm3, ymm3); - } - - // ymm2 = u - // ymm3 = v - // ymm4 = q - // ymm0 = ymm1 = ymm5 = ymm6 = free - - // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) - - if (!m_sel.lcm) - { - // lod = -log2(Q) * (1 << L) + K - - vpcmpeqd(ymm1, ymm1); - vpsrld(ymm1, ymm1, 25); - vpslld(ymm0, ymm4, 1); - vpsrld(ymm0, ymm0, 24); - vpsubd(ymm0, ymm1); - vcvtdq2ps(ymm0, ymm0); - - // ymm0 = (float)(exp(q) - 127) - - vpslld(ymm4, ymm4, 9); - vpsrld(ymm4, ymm4, 9); - vorps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); - - // ymm4 = mant(q) | 1.0f - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(ymm5, ptr[g_const->m_log2_coef_256b[0]]); // c0 - vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[1]]); // c0 * ymm4 + c1 - vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[2]]); // (c0 * ymm4 + c1) * ymm4 + c2 - vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); // ymm4 - 1.0f - vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0 - } - else - { - vmulps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[0]]); - vaddps(ymm5, ptr[g_const->m_log2_coef_256b[1]]); - vmulps(ymm5, ymm4); - vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); - vaddps(ymm5, ptr[g_const->m_log2_coef_256b[2]]); - vmulps(ymm4, ymm5); - vaddps(ymm4, ymm0); - } - - // ymm4 = log2(Q) = ((((c0 * ymm4) + c1) * ymm4) + c2) * (ymm4 - 1.0f) + ymm0 - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(ymm5, ptr[&m_local.gd->l]); - vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]); - } - else - { - vmulps(ymm4, ptr[&m_local.gd->l]); - 
vaddps(ymm4, ptr[&m_local.gd->k]); - } - - // ymm4 = (-log2(Q) * (1 << L) + K) * 0x10000 - - vxorps(ymm0, ymm0); - vminps(ymm4, ptr[&m_local.gd->mxl]); - vmaxps(ymm4, ymm0); - vcvtps2dq(ymm4, ymm4); - - if (m_sel.mmin == 1) // round-off mode - { - mov(eax, 0x8000); - vmovd(xmm0, eax); - vpbroadcastd(ymm0, xmm0); - vpaddd(ymm4, ymm0); - } - - vpsrld(ymm0, ymm4, 16); - - vmovdqa(ptr[&m_local.temp.lod.i], ymm0); -/* -vpslld(ymm5, ymm0, 6); -vpslld(ymm6, ymm4, 16); -vpsrld(ymm6, ymm6, 24); -return; -*/ - if (m_sel.mmin == 2) // trilinear mode - { - vpshuflw(ymm1, ymm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.temp.lod.f], ymm1); - } - - // shift u/v/minmax by (int)lod - - vpsravd(ymm2, ymm2, ymm0); - vpsravd(ymm3, ymm3, ymm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); - vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); - - // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] - - vpxor(ymm1, ymm1); - - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpunpcklwd(ymm5, ymm4, ymm1); // minu - vpunpckhwd(ymm6, ymm4, ymm1); // minv - vpsrlvd(ymm5, ymm5, ymm0); - vpsrlvd(ymm6, ymm6, ymm0); - vpackusdw(ymm5, ymm6); - - vbroadcasti128(ymm4, ptr[&m_local.gd->t.max]); - vpunpcklwd(ymm6, ymm4, ymm1); // maxu - vpunpckhwd(ymm4, ymm4, ymm1); // maxv - vpsrlvd(ymm6, ymm6, ymm0); - vpsrlvd(ymm4, ymm4, ymm0); - vpackusdw(ymm6, ymm4); - - vmovdqa(ptr[&m_local.temp.uv_minmax[0]], ymm5); - vmovdqa(ptr[&m_local.temp.uv_minmax[1]], ymm6); - } - else - { - // lod = K - - vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]); - - vpsrad(ymm2, xmm0); - vpsrad(ymm3, xmm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); - vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); - - vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); - } - - // ymm2 = m_local.temp.uv[0] = u (level m) - // ymm3 = m_local.temp.uv[1] = v (level m) - // ymm5 = minuv - // ymm6 = maxuv - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - 
mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i x0 = uv0.upl16(); - // GSVector8i y0 = uv0.uph16() << tw; - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, ymm2, ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i x1 = uv1.upl16(); - // GSVector8i y1 = uv1.uph16() << tw; - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, 
ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 0); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = c10 & mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, 
ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 0); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } - - if (m_sel.mmin != 1) // !round-off mode - { - vmovdqa(ptr[&m_local.temp.trb], ymm5); - vmovdqa(ptr[&m_local.temp.tga], ymm6); - - vmovdqa(ymm2, ptr[&m_local.temp.uv[0]]); - vmovdqa(ymm3, ptr[&m_local.temp.uv[1]]); - - vpsrad(ymm2, 1); - vpsrad(ymm3, 1); - - vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); - - vpsrlw(ymm5, 1); - vpsrlw(ymm6, 1); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i x0 = uv0.upl16(); - // GSVector8i y0 = uv0.uph16() << tw; - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, ymm2, 
ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i x1 = uv1.upl16(); - // GSVector8i y1 = uv1.uph16() << tw; - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 1); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = c10 & 
mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 1); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } - - vmovdqa(ymm0, ptr[m_sel.lcm ? 
&m_local.gd->lod.f : &m_local.temp.lod.f]); - vpsrlw(ymm0, ymm0, 1); - - vmovdqa(ymm2, ptr[&m_local.temp.trb]); - vmovdqa(ymm3, ptr[&m_local.temp.tga]); - - lerp16(ymm5, ymm2, ymm0, 0); - lerp16(ymm6, ymm3, ymm0, 0); - } - - pop(ebp); -} - -void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv) -{ - // ymm5 = minuv - // ymm6 = maxuv - // ymm0, ymm1, ymm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv, ymm5); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv, ymm0); - } - - vpminsw(uv, ymm6); - } - else - { - vpand(uv, ymm5); - - if (region) - { - vpor(uv, ymm6); - } - } - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, ymm5); - vpminsw(uv, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv0, const Ymm& uv1) -{ - // ymm5 = minuv - // ymm6 = maxuv - // ymm0, ymm1, ymm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv0, ymm5); - vpmaxsw(uv1, ymm5); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv0, ymm0); - vpmaxsw(uv1, ymm0); - } - - vpminsw(uv0, ymm6); - vpminsw(uv1, ymm6); - } - else - { - vpand(uv0, ymm5); - vpand(uv1, ymm5); - - if (region) - { - vpor(uv0, ymm6); - vpor(uv1, ymm6); - } - } - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - 
- vpand(ymm1, uv0, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, ymm5); - vpminsw(uv0, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, ymm1, ymm0); - - // uv1 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv1, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, ymm5); - vpminsw(uv1, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ - if (!m_sel.fb) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector8i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(ymm6, ymm4, 1); - - clamp16(ymm6, ymm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - vmovdqa(ymm2, ymm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(ymm4, 7); - - if (m_sel.tcc) - { - vpaddusb(ymm4, ymm6); - } - - mix16(ymm6, ymm4, ymm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? 
&m_local.temp.ga : &m_local.c.ga]); - vmovdqa(ymm2, ymm4); - - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if (m_sel.iip) - { - vpsrlw(ymm6, 7); - } - - break; - } - - if (m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if (!m_sel.abe) - { - // a = cov - - if (m_sel.edge) - { - vmovdqa(ymm0, ptr[&m_local.temp.cov]); - } - else - { - vpcmpeqd(ymm0, ymm0); - vpsllw(ymm0, 15); - vpsrlw(ymm0, 8); - } - - mix16(ymm6, ymm0, ymm1); - } - else - { - // a = a == 0x80 ? cov : a - - vpcmpeqd(ymm0, ymm0); - vpsllw(ymm0, 15); - vpsrlw(ymm0, 8); - - if (m_sel.edge) - { - vmovdqa(ymm1, ptr[&m_local.temp.cov]); - } - else - { - vmovdqa(ymm1, ymm0); - } - - vpcmpeqw(ymm0, ymm6); - vpsrld(ymm0, 16); - vpslld(ymm0, 16); - - vpblendvb(ymm6, ymm1, ymm0); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ - if (m_sel.fwrite) - { - vpbroadcastd(ymm3, ptr[&m_local.gd->fm]); - } - - if (m_sel.zwrite) - { - vpbroadcastd(ymm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ - switch (m_sel.atst) - { - case ATST_NEVER: - // t = GSVector8i::xffffffff(); - vpcmpeqd(ymm1, ymm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); - vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpgtd(ymm1, ymm0); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); - vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpeqd(ymm1, ymm0); - vpcmpeqd(ymm0, ymm0); - vpxor(ymm1, ymm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(ymm0, ymm6, 16); - vbroadcasti128(ymm1, ptr[&m_local.gd->aref]); - vpcmpgtd(ymm1, ymm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); 
- vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpeqd(ymm1, ymm0); - break; - } - - switch (m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(ymm7, ymm1); - alltrue(ymm7); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(ymm4, ymm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(ymm3, ymm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(ymm4, ymm1); - // fm |= t & GSVector8i::xff000000(); - vpsrld(ymm1, 24); - vpslld(ymm1, 24); - vpor(ymm3, ymm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ - if (!m_sel.fwrite) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector8i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(ymm5, ymm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector8i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(ymm1, ymm6); - - modulate16(ymm6, ymm2, 1); - - vpshuflw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(ymm2, 7); - - vpaddw(ymm6, ymm2); - - clamp16(ymm6, ymm0); - - mix16(ymm6, ymm1, ymm0); - - // GSVector8i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - vpaddw(ymm5, ymm2); - - clamp16(ymm5, ymm0); - - break; - - case TFX_NONE: - - // rbt = iip ? 
rb.srl16(7) : rb; - - if (m_sel.iip) - { - vpsrlw(ymm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog() -{ - if (!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - vmovdqa(ymm0, ptr[&m_local.temp.f]); - } - else - { - vpbroadcastw(ymm0, ptr[&m_local.p.f]); - } - - vmovdqa(ymm1, ymm6); - - vpbroadcastd(ymm2, ptr[&m_local.gd->frb]); - lerp16(ymm5, ymm2, ymm0, 0); - - vpbroadcastd(ymm2, ptr[&m_local.gd->fga]); - lerp16(ymm6, ymm2, ymm0, 0); - mix16(ymm6, ymm1, ymm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ - if (!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, ptr[esi]); - add(ebx, ptr[edi]); - and(ebx, HALF_VM_SIZE - 1); - - if (!m_sel.rfb) - { - return; - } - - ReadPixel(ymm2, ymm0, rbx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ - if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if (m_sel.datm) - { - if (m_sel.fpsm == 2) - { - vpxor(ymm0, ymm0); - //vpsrld(ymm1, ymm2, 15); - vpslld(ymm1, ymm2, 16); - vpsrad(ymm1, 31); - vpcmpeqd(ymm1, ymm0); - } - else - { - vpcmpeqd(ymm0, ymm0); - vpxor(ymm1, ymm2, ymm0); - vpsrad(ymm1, 31); - } - } - else - { - if (m_sel.fpsm == 2) - { - vpslld(ymm1, ymm2, 16); - vpsrad(ymm1, 31); - } - else - { - vpsrad(ymm1, ymm2, 31); - } - } - - vpor(ymm7, ymm1); - - alltrue(ymm7); -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ - if (m_sel.notest) - { - return; - } - - // fm |= test; - // zm |= test; - - if (m_sel.fwrite) - { - vpor(ymm3, ymm7); - } - - if (m_sel.zwrite) - { - vpor(ymm4, ymm7); - } - - // int fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask(); - - vpcmpeqd(ymm1, ymm1); - - if (m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(ymm0, ymm1, ymm4); - vpcmpeqd(ymm1, ymm3); - 
vpackssdw(ymm1, ymm0); - } - else if (m_sel.fwrite) - { - vpcmpeqd(ymm1, ymm3); - vpackssdw(ymm1, ymm1); - } - else if (m_sel.zwrite) - { - vpcmpeqd(ymm1, ymm4); - vpackssdw(ymm1, ymm1); - } - - vpmovmskb(edx, ymm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ - if (!m_sel.zwrite) - { - return; - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - vmovdqa(ymm1, ptr[&m_local.temp.zs]); - } - else - { - vpbroadcastd(ymm1, ptr[&m_local.p.z]); - } - - if (m_sel.ztest && m_sel.zpsm < 2) - { - // zs = zs.blend8(zd, zm); - - vpblendvb(ymm1, ptr[&m_local.temp.zd], ymm4); - } - - bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - - WritePixel(ymm1, ymm0, rbp, edx, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch (m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - vpsllw(ymm0, ymm2, 8); - vpsrlw(ymm0, 8); - vpsrlw(ymm1, ymm2, 8); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(ymm7, ymm7); - - vpsrld(ymm7, 27); // 0x0000001f - vpand(ymm0, ymm2, ymm7); - vpslld(ymm0, 3); - - vpslld(ymm7, 10); // 0x00007c00 - vpand(ymm4, ymm2, ymm7); - vpslld(ymm4, 9); - - vpor(ymm0, ymm4); - - vpsrld(ymm7, 5); // 0x000003e0 - vpand(ymm1, ymm2, ymm7); - vpsrld(ymm1, 2); - - vpsllw(ymm7, 10); // 0x00008000 - vpand(ymm4, ymm2, ymm7); - vpslld(ymm4, 8); - - vpor(ymm1, ymm4); - - break; - } - } - - // ymm5, ymm6 = src rb, ga - // ymm0, ymm1 = dst rb, ga - // ymm2, ymm3 = used - // ymm4, ymm7 = free - - if (m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(ymm4, ymm5); - } - - if (m_sel.aba != m_sel.abb) - { - // rb = 
c[aba * 2 + 0]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(ymm5, ymm0); - break; - case 2: - vpxor(ymm5, ymm5); - break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch (m_sel.abb) - { - case 0: - vpsubw(ymm5, ymm4); - break; - case 1: - vpsubw(ymm5, ymm0); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch (m_sel.abc) - { - case 0: - case 1: - vpshuflw(ymm7, m_sel.abc ? ymm1 : ymm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(ymm7, ymm7, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(ymm7, 7); - break; - case 2: - vpbroadcastw(ymm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(ymm5, ymm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch (m_sel.abd) - { - case 0: - vpaddw(ymm5, ymm4); - break; - case 1: - vpaddw(ymm5, ymm0); - break; - case 2: - break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(ymm5, ymm0); - break; - case 2: - vpxor(ymm5, ymm5); - break; - } - } - - if (m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(ymm0, ymm6, 8); - vpsrad(ymm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(ymm5, ymm4, ymm5, ymm0); - } - - // ymm6 = src ga - // ymm1 = dst ga - // ymm5 = rb - // ymm7 = a - // ymm2, ymm3 = used - // ymm0, ymm4 = free - - vmovdqa(ymm4, ymm6); - - if (m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(ymm6, ymm1); - break; - case 2: - vpxor(ymm6, ymm6); - break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch (m_sel.abb) - { - case 0: - vpsubw(ymm6, ymm4); - break; - case 1: - vpsubw(ymm6, ymm1); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(ymm6, ymm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch (m_sel.abd) - { - 
case 0: - vpaddw(ymm6, ymm4); - break; - case 1: - vpaddw(ymm6, ymm1); - break; - case 2: - break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(ymm6, ymm1); - break; - case 2: - vpxor(ymm6, ymm6); - break; - } - } - - // ymm4 = src ga - // ymm5 = rb - // ymm6 = ga - // ymm2, ymm3 = used - // ymm0, ymm1, ymm7 = free - - if (m_sel.pabe) - { - vpsrld(ymm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(ymm6, ymm4, ymm6, ymm0); - } - else - { - if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(ymm6, ymm4, ymm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, ptr[esp + _top]); - and(eax, 3); - shl(eax, 5); - mov(ebp, ptr[&m_local.gd->dimx]); - vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 0]); - vpaddw(ymm5, ymm7); - vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 1]); - vpaddw(ymm6, ymm7); - } - - if (m_sel.colclamp == 0) - { - // c[0] &= 0x00ff00ff; - // c[1] &= 0x00ff00ff; - - vpcmpeqd(ymm7, ymm7); - vpsrlw(ymm7, 8); - vpand(ymm5, ymm7); - vpand(ymm6, ymm7); - } - - // GSVector8i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(ymm7, ymm5, ymm6); - vpunpcklwd(ymm5, ymm6); - vpackuswb(ymm5, ymm7); - - if (m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(ymm7, ymm7); - vpslld(ymm7, 31); - vpor(ymm5, ymm7); - } - - if (m_sel.fpsm == 2) - { - // GSVector8i rb = fs & 0x00f800f8; - // GSVector8i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm6, eax); - vpbroadcastd(ymm6, xmm6); - - mov(eax, 0x8000f800); - vmovd(xmm7, eax); - vpbroadcastd(ymm7, xmm7); - - vpand(ymm4, ymm5, ymm6); - vpand(ymm5, ymm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(ymm6, ymm4, 9); - vpsrld(ymm4, 3); - vpsrld(ymm7, ymm5, 
16); - vpsrld(ymm5, 6); - - vpor(ymm5, ymm4); - vpor(ymm7, ymm6); - vpor(ymm5, ymm7); - } - - if (m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(ymm5, ymm2, ymm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - - WritePixel(ymm5, ymm0, rbx, edx, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr) -{ - vmovq(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm]); - vmovhps(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); - vmovq(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); - vmovhps(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2]); - vinserti128(dst, dst, Xmm(temp.getIdx()), 1); - /* - vmovdqu(dst, ptr[addr * 2 + (size_t)m_local.gd->vm]); - vmovdqu(temp, ptr[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); - vpunpcklqdq(dst, dst, temp); - vpermq(dst, dst, _MM_SHUFFLE(3, 1, 2, 0)); -*/ -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Reg32& mask, bool fast, int psm, int fz) -{ - Xmm src1 = Xmm(src.getIdx()); - Xmm src2 = Xmm(temp.getIdx()); - - vextracti128(src2, src, 1); - - if (m_sel.notest) - { - if (fast) - { - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); - } - else - { - WritePixel(src1, addr, 0, 0, psm); - WritePixel(src1, addr, 1, 1, psm); - WritePixel(src1, addr, 2, 2, psm); - WritePixel(src1, addr, 3, 3, psm); - WritePixel(src2, addr, 4, 0, psm); - WritePixel(src2, addr, 5, 1, psm); - WritePixel(src2, addr, 6, 2, psm); - WritePixel(src2, addr, 7, 3, psm); - } - } - else - { - // cascade tests? 
- - if (fast) - { - test(mask, 0x0000000f << (fz * 8)); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); - L("@@"); - - test(mask, 0x000000f0 << (fz * 8)); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); - L("@@"); - - test(mask, 0x000f0000 << (fz * 8)); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); - L("@@"); - - test(mask, 0x00f00000 << (fz * 8)); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); - L("@@"); - - // vmaskmovps? - } - else - { - test(mask, 0x00000003 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 0, 0, psm); - L("@@"); - - test(mask, 0x0000000c << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 1, 1, psm); - L("@@"); - - test(mask, 0x00000030 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 2, 2, psm); - L("@@"); - - test(mask, 0x000000c0 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 3, 3, psm); - L("@@"); - - test(mask, 0x00030000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 4, 0, psm); - L("@@"); - - test(mask, 0x000c0000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 5, 1, psm); - L("@@"); - - test(mask, 0x00300000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 6, 2, psm); - L("@@"); - - test(mask, 0x00c00000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 7, 3, psm); - L("@@"); - } - } -} - -static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm) -{ - Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - switch (psm) - { - case 0: - if (j == 0) - vmovd(dst, src); - else - vpextrd(dst, src, j); - break; - case 1: - if (j == 0) - vmovd(eax, src); - else - vpextrd(eax, src, j); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - if (j == 0) - vmovd(eax, src); - else - vpextrw(eax, src, j * 2); - mov(dst, ax); - break; - } -} - -void 
GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) -{ - // in - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ebx = m_local.tex[0] (!m_sel.mmin) - // ebp = m_local.tex (m_sel.mmin) - // edx = m_local.clut (m_sel.tlu) - - // out - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - - ASSERT(pixels == 1 || pixels == 4); - - mip_offset *= sizeof(void*); - - const GSVector8i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i; - - if (m_sel.mmin && !m_sel.lcm) - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; - - for (int i = 0; i < pixels; i++) - { - Ymm src = Ymm(r[i * 2 + 0]); - Ymm dst = Ymm(r[i * 2 + 1]); - Ymm t1 = Ymm(t[i * 2 + 0]); - Ymm t2 = Ymm(t[i * 2 + 1]); - - vextracti128(Xmm(t1.getIdx()), src, 1); - - for (uint8 j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j + 0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(dst, src, j); - - mov(ebx, ptr[&lod_i->u32[j + 4]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(t2, t1, j); - } - - vinserti128(dst, dst, Xmm(t2.getIdx()), 1); - } - } - else - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; - - if (m_sel.mmin && m_sel.lcm) - { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - } - - for (int i = 0; i < pixels; i++) - { - Ymm src = Ymm(r[i * 2 + 0]); - Ymm dst = Ymm(r[i * 2 + 1]); - Ymm t1 = Ymm(t[i * 2 + 0]); - Ymm t2 = Ymm(t[i * 2 + 1]); - - if (!m_sel.tlu) - { - vpcmpeqd(t1, t1); - vpgatherdd(dst, ptr[ebx + src * 4], t1); - } - else - { - vextracti128(Xmm(t1.getIdx()), src, 1); - - for (uint8 j = 0; j < 4; j++) - { - ReadTexel(dst, src, j); - ReadTexel(t2, t1, j); - } - - vinserti128(dst, dst, Xmm(t2.getIdx()), 1); - /* - vpcmpeqd(t1, t1); - vpgatherdd(t2, ptr[ebx + src * 1], t1); // either this 1x scale, or the latency of two dependendent gathers are too slow - 
vpslld(t2, 24); - vpsrld(t2, 24); - vpcmpeqd(t1, t1); - vpgatherdd(dst, ptr[edx + t2 * 4], t1); - */ - } - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i) -{ - ASSERT(i < 4); - - const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - - if (i == 0) - vmovd(eax, Xmm(addr.getIdx())); - else - vpextrd(eax, Xmm(addr.getIdx()), i); - - if (m_sel.tlu) - movzx(eax, byte[ebx + eax]); - - if (i == 0) - vmovd(Xmm(dst.getIdx()), src); - else - vpinsrd(Xmm(dst.getIdx()), src, i); -} - - -#endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp deleted file mode 100644 index c50205d324..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . 
- */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" - -#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64)) - -// It is useless to port the code to SSEx, better use the faster 32 bits version instead -void GSDrawScanlineCodeGenerator::Generate_SSE() -{ - // Avoid a crash if someone want to use it - ret(); -} - -void GSDrawScanlineCodeGenerator::Init_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::Step_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) -{ -} - -void GSDrawScanlineCodeGenerator::SampleTexture_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) -{ -} - -void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) -{ -} - -void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::ReadMask_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::TestAlpha_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::ColorTFX_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::Fog_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::ReadFrame_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::WriteMask_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::WriteFrame_SSE() -{ -} - -void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const RegLong& addr) -{ -} - -void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz) -{ -} - -//static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm) -{ -} - -void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) -{ -} - -void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) -{ -} - -#endif 
diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp deleted file mode 100644 index 33c468f912..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ /dev/null @@ -1,2936 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" -#include "GSVertexSW.h" -#include "GS/GS_codegen.h" - -#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -void GSDrawScanlineCodeGenerator::Generate_AVX() -{ - push(ebx); - push(esi); - push(edi); - push(ebp); - - Init_AVX(); - - if (!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // xmm0 = z/zi - // xmm2 = s/u (tme) - // xmm3 = t/v (tme) - // xmm4 = q (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - bool tme = m_sel.tfx != TFX_NONE; - - TestZ_AVX(tme ? xmm5 : xmm2, tme ? 
xmm6 : xmm3); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm0 - // xmm2 = s/u (tme) - // xmm3 = t/v (tme) - // xmm4 = q (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - if (m_sel.mmin) - { - SampleTextureLOD_AVX(); - } - else - { - SampleTexture_AVX(); - } - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm2 - // - xmm3 - // - xmm4 - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - AlphaTFX_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadMask_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestAlpha_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ColorTFX_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - Fog_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadFrame_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestDestAlpha_AVX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - WriteMask_AVX(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - - WriteZBuf_AVX(); - - // ebx = fa - // 
ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // - ebp - // xmm2 = fd - // xmm3 = fm - // - xmm4 - // xmm5 = rb - // xmm6 = ga - - AlphaBlend_AVX(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // xmm2 = fd - // xmm3 = fm - // xmm5 = rb - // xmm6 = ga - - WriteFrame_AVX(); - -L("step"); - - // if(steps <= 0) break; - - if (!m_sel.edge) - { - test(ecx, ecx); - - jle("exit", T_NEAR); - - Step_AVX(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - // vzeroupper(); - - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); -} - -void GSDrawScanlineCodeGenerator::Init_AVX() -{ - if (!m_sel.notest) - { - // int skip = left & 3; - - mov(ebx, edx); - and(edx, 3); - - // int steps = pixels + skip - 4; - - lea(ecx, ptr[ecx + edx - 4]); - - // left -= skip; - - sub(ebx, edx); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(edx, 4); - - vmovdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[0]]); - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); - - vpor(xmm7, ptr[eax + (size_t)g_const->m_test_128b[7]]); - } - else - { - mov(ebx, edx); // left - xor(edx, edx); // skip - lea(ecx, ptr[ecx - 4]); // steps - } - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(esi, ptr[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, ptr[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, ptr[&m_local.gd->fzbc]); - - if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - lea(edx, ptr[edx * 8 + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, ptr[esp + _v]); - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vmovaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p - - if (m_sel.fwrite && m_sel.fge) - { - // f = 
GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(xmm1, xmm0); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]); - - vmovdqa(ptr[&m_local.temp.f], xmm1); - } - - if (m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vmovaps(ptr[&m_local.temp.z], xmm0); - vmovaps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]); - vmovaps(ptr[&m_local.temp.zo], xmm2); - vaddps(xmm0, xmm2); - } - } - } - else - { - if (m_sel.ztest) - { - vmovdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.edge || m_sel.tfx != TFX_NONE) - { - vmovaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t - } - - if (m_sel.edge) - { - // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); - - vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(xmm3, 9); - - vmovdqa(ptr[&m_local.temp.cov], xmm3); - } - - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(xmm6, xmm4); - - // s = vti.xxxx() + m_local.d[skip].s; - // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - - vpshufd(xmm2, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpaddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - } - else - { - if (m_sel.ltf) - { - vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 12); - vmovdqa(ptr[&m_local.temp.vf], xmm6); - } - } - - vmovdqa(ptr[&m_local.temp.s], xmm2); - vmovdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(xmm2, xmm4, 
xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - vaddps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - vaddps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]); - - vmovaps(ptr[&m_local.temp.s], xmm2); - vmovaps(ptr[&m_local.temp.t], xmm3); - vmovaps(ptr[&m_local.temp.q], xmm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vcvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm6, xmm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]); - - vmovdqa(ptr[&m_local.temp.rb], xmm5); - vmovdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - vmovdqa(xmm5, ptr[&m_local.c.rb]); - vmovdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step_AVX() -{ - // steps -= 4; - - sub(ecx, 4); - - // fza_offset++; - - add(edi, 8); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // z += m_local.d4.z; - - if (m_sel.zb) - { - vmovaps(xmm0, ptr[&m_local.temp.zo]); - vaddps(xmm0, ptr[&m_local.d4.z]); - vmovaps(ptr[&m_local.temp.zo], xmm0); - vaddps(xmm0, ptr[&m_local.temp.z]); - } - - // f = f.add16(m_local.d4.f); - - if (m_sel.fwrite && m_sel.fge) - { - vmovdqa(xmm1, ptr[&m_local.temp.f]); - vpaddw(xmm1, ptr[&m_local.d4.f]); - vmovdqa(ptr[&m_local.temp.f], xmm1); - } - } - else - { - if (m_sel.ztest) - { - vmovdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.tfx != 
TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i stq = m_local.d4.stq; - - // s += stq.xxxx(); - // if(!sprite) t += stq.yyyy(); - - vmovdqa(xmm4, ptr[&m_local.d4.stq]); - - vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm2, ptr[&m_local.temp.s]); - vmovdqa(ptr[&m_local.temp.s], xmm2); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm3, ptr[&m_local.temp.t]); - vmovdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - vmovdqa(xmm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vmovaps(xmm4, ptr[&m_local.d4.stq]); - - vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm2, ptr[&m_local.temp.s]); - vaddps(xmm3, ptr[&m_local.temp.t]); - vaddps(xmm4, ptr[&m_local.temp.q]); - - vmovaps(ptr[&m_local.temp.s], xmm2); - vmovaps(ptr[&m_local.temp.t], xmm3); - vmovaps(ptr[&m_local.temp.q], xmm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - vmovdqa(xmm7, ptr[&m_local.d4.c]); - - vpshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(xmm5, ptr[&m_local.temp.rb]); - vpaddw(xmm6, ptr[&m_local.temp.ga]); - - // FIXME: color may underflow and roll over at the end of the line, if decreasing - - vpxor(xmm7, xmm7); - vpmaxsw(xmm5, xmm7); - vpmaxsw(xmm6, xmm7); - - vmovdqa(ptr[&m_local.temp.rb], xmm5); - vmovdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - vmovdqa(xmm5, ptr[&m_local.c.rb]); - vmovdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } - - if (!m_sel.notest) - { - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - 
shl(edx, 4); - - vmovdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[7]]); - } -} - -void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2) -{ - if (!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, ptr[esi + 4]); - add(ebp, ptr[edi + 4]); - and(ebp, HALF_VM_SIZE - 1); - - // GSVector4i zs = zi; - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - vbroadcastss(temp1, ptr[&GSVector4::m_half]); - vmulps(temp1, xmm0); - vcvttps2dq(temp1, temp1); - vpslld(temp1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(temp2, temp2); - vpsrld(temp2, 31); - vpand(xmm0, temp2); - - vpor(xmm0, temp1); - } - else - { - // zs = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - vpcmpeqd(temp1, temp1); - vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); - vpminsd(xmm0, temp1); - } - - if (m_sel.zwrite) - { - vmovdqa(ptr[&m_local.temp.zs], xmm0); - } - } - - if (m_sel.ztest) - { - ReadPixel_AVX(xmm1, ebp); - - if (m_sel.zwrite && m_sel.zpsm < 2) - { - vmovdqa(ptr[&m_local.temp.zd], xmm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if (m_sel.zpsm) - { - vpslld(xmm1, static_cast(m_sel.zpsm * 8)); - vpsrld(xmm1, static_cast(m_sel.zpsm * 8)); - } - - if (m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - vpcmpeqd(temp1, temp1); - vpslld(temp1, 31); - - // GSVector4i zso = zs - o; - // GSVector4i zdo = zd - o; - - vpsubd(xmm0, temp1); - vpsubd(xmm1, temp1); - } - - switch (m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(xmm1, xmm0); - vpor(xmm7, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(xmm0, xmm1); - vpcmpeqd(temp1, temp1); - vpxor(xmm0, temp1); - vpor(xmm7, 
xmm0); - break; - } - - alltrue(xmm7); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture_AVX() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, ptr[&m_local.gd->tex[0]]); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - // ebx = tex - // edx = clut - - if (!m_sel.fst) - { - vrcpps(xmm0, xmm4); - - vmulps(xmm2, xmm0); - vmulps(xmm3, xmm0); - - vcvttps2dq(xmm2, xmm2); - vcvttps2dq(xmm3, xmm3); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm2, xmm4); - vpsubd(xmm3, xmm4); - } - } - - // xmm2 = u - // xmm3 = v - - if (m_sel.ltf) - { - // GSVector4i uf = u.xxzzlh().srl16(12); - - vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - vmovdqa(ptr[&m_local.temp.uf], xmm0); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // GSVector4i vf = v.xxzzlh().srl16(12); - - vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - vmovdqa(ptr[&m_local.temp.vf], xmm0); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm2, 16); - vpsrad(xmm3, 16); - vpackssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm1, xmm1); - vpsrlw(xmm1, 15); - vpaddw(xmm3, xmm2, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap_AVX(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap_AVX(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i y0 = uv0.uph16() << tw; - // GSVector4i x0 = uv0.upl16(); - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm4, xmm2, xmm0); - vpunpckhwd(xmm2, xmm2, xmm0); - vpslld(xmm2, static_cast(m_sel.tw + 3)); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if 
(m_sel.ltf) - { - // GSVector4i y1 = uv1.uph16() << tw; - // GSVector4i x1 = uv1.upl16(); - - vpunpcklwd(xmm6, xmm3, xmm0); - vpunpckhwd(xmm3, xmm3, xmm0); - vpslld(xmm3, static_cast(m_sel.tw + 3)); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm5, xmm2, xmm4); - vpaddd(xmm2, xmm2, xmm6); - vpaddd(xmm0, xmm3, xmm4); - vpaddd(xmm3, xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(4, 0); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - vmovdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // 
xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm5, xmm2, xmm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(1, 0); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } -} - -void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv, ptr[&m_local.gd->t.min]); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv, xmm0); - } - - vpminsw(uv, ptr[&m_local.gd->t.max]); - } - else - { - vpand(uv, ptr[&m_local.gd->t.min]); - - if (region) - { - vpor(uv, ptr[&m_local.gd->t.max]); - } - } - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv, xmm4); - - if (region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, xmm4); - vpminsw(uv, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& 
uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vpmaxsw(uv0, xmm4); - vpmaxsw(uv1, xmm4); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vpminsw(uv0, xmm5); - vpminsw(uv1, xmm5); - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vpand(uv0, xmm4); - vpand(uv1, xmm4); - - if (region) - { - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vpor(uv0, xmm5); - vpor(uv1, xmm5); - } - } - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv0, xmm4); - - if (region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, xmm4); - vpminsw(uv0, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, xmm1, xmm0); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv1, xmm4); - - if (region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, xmm4); - vpminsw(uv1, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - push(ebp); - - mov(ebp, (size_t)m_local.gd->tex); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - if (!m_sel.fst) - { - vrcpps(xmm0, xmm4); - - vmulps(xmm2, xmm0); - vmulps(xmm3, xmm0); - - vcvttps2dq(xmm2, xmm2); - vcvttps2dq(xmm3, xmm3); 
- } - - // xmm2 = u - // xmm3 = v - // xmm4 = q - // xmm0 = xmm1 = xmm5 = xmm6 = free - - // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) - - if (!m_sel.lcm) - { - // lod = -log2(Q) * (1 << L) + K - - vpcmpeqd(xmm1, xmm1); - vpsrld(xmm1, xmm1, 25); - vpslld(xmm0, xmm4, 1); - vpsrld(xmm0, xmm0, 24); - vpsubd(xmm0, xmm1); - vcvtdq2ps(xmm0, xmm0); - - // xmm0 = (float)(exp(q) - 127) - - vpslld(xmm4, xmm4, 9); - vpsrld(xmm4, xmm4, 9); - vorps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); - - // xmm4 = mant(q) | 1.0f - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(xmm5, ptr[g_const->m_log2_coef_128b[0]]); // c0 - vfmadd213ps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[1]]); // c0 * xmm4 + c1 - vfmadd213ps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[2]]); // (c0 * xmm4 + c1) * xmm4 + c2 - vsubps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); // xmm4 - 1.0f - vfmadd213ps(xmm4, xmm5, xmm0); // ((c0 * xmm4 + c1) * xmm4 + c2) * (xmm4 - 1.0f) + xmm0 - } - else - { - vmulps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[0]]); - vaddps(xmm5, ptr[g_const->m_log2_coef_128b[1]]); - vmulps(xmm5, xmm4); - vsubps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); - vaddps(xmm5, ptr[g_const->m_log2_coef_128b[2]]); - vmulps(xmm4, xmm5); - vaddps(xmm4, xmm0); - } - - // xmm4 = log2(Q) = ((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0 - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(xmm5, ptr[&m_local.gd->l]); - vfmadd213ps(xmm4, xmm5, ptr[&m_local.gd->k]); - } - else - { - vmulps(xmm4, ptr[&m_local.gd->l]); - vaddps(xmm4, ptr[&m_local.gd->k]); - } - - // xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000 - - vxorps(xmm0, xmm0); - vminps(xmm4, ptr[&m_local.gd->mxl]); - vmaxps(xmm4, xmm0); - vcvtps2dq(xmm4, xmm4); - - if (m_sel.mmin == 1) // round-off mode - { - mov(eax, 0x8000); - vmovd(xmm0, eax); - vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm4, xmm0); - } - - vpsrld(xmm0, xmm4, 16); - - 
vmovdqa(ptr[&m_local.temp.lod.i], xmm0); -/* -vpslld(xmm5, xmm0, 6); -vpslld(xmm6, xmm4, 16); -vpsrld(xmm6, xmm6, 24); -return; -*/ - if (m_sel.mmin == 2) // trilinear mode - { - vpshuflw(xmm1, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.temp.lod.f], xmm1); - } - - // shift u/v/minmax by (int)lod - - if (m_cpu.has(util::Cpu::tAVX2)) - { - vpsravd(xmm2, xmm2, xmm0); - vpsravd(xmm3, xmm3, xmm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); - vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); - - // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] - - vpxor(xmm1, xmm1); - - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vpunpcklwd(xmm5, xmm4, xmm1); // minu - vpunpckhwd(xmm6, xmm4, xmm1); // minv - vpsrlvd(xmm5, xmm5, xmm0); - vpsrlvd(xmm6, xmm6, xmm0); - vpackusdw(xmm5, xmm6); - - vmovdqa(xmm4, ptr[&m_local.gd->t.max]); - vpunpcklwd(xmm6, xmm4, xmm1); // maxu - vpunpckhwd(xmm4, xmm4, xmm1); // maxv - vpsrlvd(xmm6, xmm6, xmm0); - vpsrlvd(xmm4, xmm4, xmm0); - vpackusdw(xmm6, xmm4); - - vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); - vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); - } - else - { - vmovq(xmm4, ptr[&m_local.gd->t.minmax]); - - vpunpckldq(xmm5, xmm2, xmm3); - vpunpckhdq(xmm6, xmm2, xmm3); - vmovdqa(xmm2, xmm5); - vmovdqa(xmm3, xmm6); - - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); - vpsrad(xmm2, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); - - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); - vpsrad(xmm5, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); - - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); - vpsrad(xmm3, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); - - vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); - vpsrad(xmm6, xmm0); - vpsrlw(xmm1, xmm4, xmm0); - vmovq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); - - vpunpckldq(xmm2, xmm3); - vpunpckhdq(xmm5, xmm6); - 
vpunpckhdq(xmm3, xmm2, xmm5); - vpunpckldq(xmm2, xmm5); - - vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); - vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); - - vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - - vpunpcklwd(xmm0, xmm5, xmm6); - vpunpckhwd(xmm1, xmm5, xmm6); - vpunpckldq(xmm5, xmm0, xmm1); - vpunpckhdq(xmm6, xmm0, xmm1); - - vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); - vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); - } - } - else - { - // lod = K - - vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]); - - vpsrad(xmm2, xmm0); - vpsrad(xmm3, xmm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); - vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); - - vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - } - - // xmm2 = m_local.temp.uv[0] = u (level m) - // xmm3 = m_local.temp.uv[1] = v (level m) - // xmm5 = minuv - // xmm6 = maxuv - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm2, xmm4); - vpsubd(xmm3, xmm4); - - // GSVector4i uf = u.xxzzlh().srl16(1); - - vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - vmovdqa(ptr[&m_local.temp.uf], xmm0); - - // GSVector4i vf = v.xxzzlh().srl16(1); - - vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - vmovdqa(ptr[&m_local.temp.vf], xmm0); - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm2, 16); - vpsrad(xmm3, 16); - vpackssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm1, xmm1); - vpsrlw(xmm1, 15); - vpaddw(xmm3, xmm2, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD_AVX(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD_AVX(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, 
xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm4, xmm2, xmm0); - vpunpckhwd(xmm2, xmm2, xmm0); - vpslld(xmm2, static_cast(m_sel.tw + 3)); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if (m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - vpunpcklwd(xmm6, xmm3, xmm0); - vpunpckhwd(xmm3, xmm3, xmm0); - vpslld(xmm3, static_cast(m_sel.tw + 3)); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm5, xmm2, xmm4); - vpaddd(xmm2, xmm2, xmm6); - vpaddd(xmm0, xmm3, xmm4); - vpaddd(xmm3, xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(4, 0); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - vmovdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); 
- - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm5, xmm2, xmm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(1, 0); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } - - if (m_sel.mmin != 1) // !round-off mode - { - vmovdqa(ptr[&m_local.temp.trb], xmm5); - vmovdqa(ptr[&m_local.temp.tga], xmm6); - - vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]); - vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]); - - vpsrad(xmm2, 1); - vpsrad(xmm3, 1); - - vmovdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - - vpsrlw(xmm5, 1); - vpsrlw(xmm6, 1); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm2, xmm4); - vpsubd(xmm3, xmm4); - - // GSVector4i uf = u.xxzzlh().srl16(1); - - vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - 
vmovdqa(ptr[&m_local.temp.uf], xmm0); - - // GSVector4i vf = v.xxzzlh().srl16(1); - - vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 12); - vmovdqa(ptr[&m_local.temp.vf], xmm0); - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm2, 16); - vpsrad(xmm3, 16); - vpackssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm1, xmm1); - vpsrlw(xmm1, 15); - vpaddw(xmm3, xmm2, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD_AVX(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD_AVX(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm4, xmm2, xmm0); - vpunpckhwd(xmm2, xmm2, xmm0); - vpslld(xmm2, static_cast(m_sel.tw + 3)); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if (m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - vpunpcklwd(xmm6, xmm3, xmm0); - vpunpckhwd(xmm3, xmm3, xmm0); - vpslld(xmm3, static_cast(m_sel.tw + 3)); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm5, xmm2, xmm4); - vpaddd(xmm2, xmm2, xmm6); - vpaddd(xmm0, xmm3, xmm4); - vpaddd(xmm3, xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = 
addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(4, 1); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - vmovdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm5, xmm2, xmm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_AVX(1, 1); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } - - vmovdqa(xmm0, ptr[m_sel.lcm 
? &m_local.gd->lod.f : &m_local.temp.lod.f]); - vpsrlw(xmm0, xmm0, 1); - - vmovdqa(xmm2, ptr[&m_local.temp.trb]); - vmovdqa(xmm3, ptr[&m_local.temp.tga]); - - lerp16(xmm5, xmm2, xmm0, 0); - lerp16(xmm6, xmm3, xmm0, 0); - } - - pop(ebp); -} - -void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv) -{ - // xmm5 = minuv - // xmm6 = maxuv - // xmm0, xmm1, xmm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv, xmm5); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv, xmm0); - } - - vpminsw(uv, xmm6); - } - else - { - vpand(uv, xmm5); - - if (region) - { - vpor(uv, xmm6); - } - } - } - else - { - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv, xmm5); - - if (region) - { - vpor(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, xmm5); - vpminsw(uv, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1) -{ - // xmm5 = minuv - // xmm6 = maxuv - // xmm0, xmm1, xmm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv0, xmm5); - vpmaxsw(uv1, xmm5); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - - vpminsw(uv0, xmm6); - vpminsw(uv1, xmm6); - } - else - { - vpand(uv0, xmm5); - vpand(uv1, xmm5); - - if (region) - { - vpor(uv0, xmm6); - vpor(uv1, xmm6); - } - } - } - else - { - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - 
vpand(xmm1, uv0, xmm5); - - if (region) - { - vpor(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, xmm5); - vpminsw(uv0, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, xmm1, xmm0); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv1, xmm5); - - if (region) - { - vpor(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, xmm5); - vpminsw(uv1, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX_AVX() -{ - if (!m_sel.fb) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(xmm6, xmm4, 1); - - clamp16(xmm6, xmm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - vmovdqa(xmm2, xmm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(xmm4, 7); - - if (m_sel.tcc) - { - vpaddusb(xmm4, xmm6); - } - - mix16(xmm6, xmm4, xmm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? 
&m_local.temp.ga : &m_local.c.ga]); - vmovdqa(xmm2, xmm4); - - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if (m_sel.iip) - { - vpsrlw(xmm6, 7); - } - - break; - } - - if (m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if (!m_sel.abe) - { - // a = cov - - if (m_sel.edge) - { - vmovdqa(xmm0, ptr[&m_local.temp.cov]); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - } - - mix16(xmm6, xmm0, xmm1); - } - else - { - // a = a == 0x80 ? cov : a - - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - - if (m_sel.edge) - { - vmovdqa(xmm1, ptr[&m_local.temp.cov]); - } - else - { - vmovdqa(xmm1, xmm0); - } - - vpcmpeqw(xmm0, xmm6); - vpsrld(xmm0, 16); - vpslld(xmm0, 16); - - vpblendvb(xmm6, xmm1, xmm0); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask_AVX() -{ - if (m_sel.fwrite) - { - vmovdqa(xmm3, ptr[&m_local.gd->fm]); - } - - if (m_sel.zwrite) - { - vmovdqa(xmm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha_AVX() -{ - switch (m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - vpcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpgtd(xmm1, ptr[&m_local.gd->aref]); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, xmm6, 16); - vmovdqa(xmm1, ptr[&m_local.gd->aref]); - vpcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); - break; - } - - switch 
(m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(xmm7, xmm1); - alltrue(xmm7); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(xmm4, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(xmm3, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(xmm4, xmm1); - // fm |= t & GSVector4i::xff000000(); - vpsrld(xmm1, 24); - vpslld(xmm1, 24); - vpor(xmm3, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(xmm5, xmm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(xmm1, xmm6); - - modulate16(xmm6, xmm2, 1); - - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(xmm2, 7); - - vpaddw(xmm6, xmm2); - - clamp16(xmm6, xmm0); - - mix16(xmm6, xmm1, xmm0); - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - vpaddw(xmm5, xmm2); - - clamp16(xmm5, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? rb.srl16(7) : rb; - - if (m_sel.iip) - { - vpsrlw(xmm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog_AVX() -{ - if (!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - vmovdqa(xmm0, ptr[m_sel.prim != GS_SPRITE_CLASS ? 
&m_local.temp.f : &m_local.p.f]); - vmovdqa(xmm1, xmm6); - - vmovdqa(xmm2, ptr[&m_local.gd->frb]); - lerp16(xmm5, xmm2, xmm0, 0); - - vmovdqa(xmm2, ptr[&m_local.gd->fga]); - lerp16(xmm6, xmm2, xmm0, 0); - mix16(xmm6, xmm1, xmm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame_AVX() -{ - if (!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, ptr[esi]); - add(ebx, ptr[edi]); - and(ebx, HALF_VM_SIZE - 1); - - if (!m_sel.rfb) - { - return; - } - - ReadPixel_AVX(xmm2, ebx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX() -{ - if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if (m_sel.datm) - { - if (m_sel.fpsm == 2) - { - vpxor(xmm0, xmm0); - //vpsrld(xmm1, xmm2, 15); - vpslld(xmm1, xmm2, 16); - vpsrad(xmm1, 31); - vpcmpeqd(xmm1, xmm0); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm2, xmm0); - vpsrad(xmm1, 31); - } - } - else - { - if (m_sel.fpsm == 2) - { - vpslld(xmm1, xmm2, 16); - vpsrad(xmm1, 31); - } - else - { - vpsrad(xmm1, xmm2, 31); - } - } - - vpor(xmm7, xmm1); - - alltrue(xmm7); -} - -void GSDrawScanlineCodeGenerator::WriteMask_AVX() -{ - if (m_sel.notest) - { - return; - } - - // fm |= test; - // zm |= test; - - if (m_sel.fwrite) - { - vpor(xmm3, xmm7); - } - - if (m_sel.zwrite) - { - vpor(xmm4, xmm7); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - vpcmpeqd(xmm1, xmm1); - - if (m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(xmm0, xmm1, xmm4); - vpcmpeqd(xmm1, xmm3); - vpackssdw(xmm1, xmm0); - } - else if (m_sel.fwrite) - { - vpcmpeqd(xmm1, xmm3); - vpackssdw(xmm1, xmm1); - } - else if (m_sel.zwrite) - { - vpcmpeqd(xmm1, xmm4); - vpackssdw(xmm1, xmm1); - } - - vpmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf_AVX() -{ - if (!m_sel.zwrite) - { - return; - } - - vmovdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? 
&m_local.temp.zs : &m_local.p.z]); - - if (m_sel.ztest && m_sel.zpsm < 2) - { - // zs = zs.blend8(zd, zm); - - vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); - } - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - vpcmpeqd(xmm7, xmm7); - vpsrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); - vpminsd(xmm1, xmm7); - } - - bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - - WritePixel_AVX(xmm1, ebp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch (m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - split16_2x8(xmm0, xmm1, xmm2); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(xmm7, xmm7); - - vpsrld(xmm7, 27); // 0x0000001f - vpand(xmm0, xmm2, xmm7); - vpslld(xmm0, 3); - - vpslld(xmm7, 10); // 0x00007c00 - vpand(xmm4, xmm2, xmm7); - vpslld(xmm4, 9); - - vpor(xmm0, xmm4); - - vpsrld(xmm7, 5); // 0x000003e0 - vpand(xmm1, xmm2, xmm7); - vpsrld(xmm1, 2); - - vpsllw(xmm7, 10); // 0x00008000 - vpand(xmm4, xmm2, xmm7); - vpslld(xmm4, 8); - - vpor(xmm1, xmm4); - - break; - } - } - - // xmm5, xmm6 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm2, xmm3 = used - // xmm4, xmm7 = free - - if (m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(xmm4, xmm5); - } - - if (m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(xmm5, xmm0); - break; - case 2: - vpxor(xmm5, xmm5); - break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch (m_sel.abb) - { - case 0: - vpsubw(xmm5, xmm4); - break; - case 1: - vpsubw(xmm5, xmm0); - break; - case 2: - break; - 
} - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch (m_sel.abc) - { - case 0: - case 1: - vpshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(xmm7, 7); - break; - case 2: - vmovdqa(xmm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(xmm5, xmm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch (m_sel.abd) - { - case 0: - vpaddw(xmm5, xmm4); - break; - case 1: - vpaddw(xmm5, xmm0); - break; - case 2: - break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(xmm5, xmm0); - break; - case 2: - vpxor(xmm5, xmm5); - break; - } - } - - if (m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(xmm0, xmm6, 8); - vpsrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(xmm5, xmm4, xmm5, xmm0); - } - - // xmm6 = src ga - // xmm1 = dst ga - // xmm5 = rb - // xmm7 = a - // xmm2, xmm3 = used - // xmm0, xmm4 = free - - vmovdqa(xmm4, xmm6); - - if (m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(xmm6, xmm1); - break; - case 2: - vpxor(xmm6, xmm6); - break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch (m_sel.abb) - { - case 0: - vpsubw(xmm6, xmm4); - break; - case 1: - vpsubw(xmm6, xmm1); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(xmm6, xmm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch (m_sel.abd) - { - case 0: - vpaddw(xmm6, xmm4); - break; - case 1: - vpaddw(xmm6, xmm1); - break; - case 2: - break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(xmm6, xmm1); - break; - case 2: - vpxor(xmm6, xmm6); - break; - } - } - - // xmm4 = src ga - // xmm5 = rb - // 
xmm6 = ga - // xmm2, xmm3 = used - // xmm0, xmm1, xmm7 = free - - if (m_sel.pabe) - { - vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(xmm6, xmm4, xmm6, xmm0); - } - else - { - if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(xmm6, xmm4, xmm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame_AVX() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, ptr[esp + _top]); - and(eax, 3); - shl(eax, 5); - mov(ebp, ptr[&m_local.gd->dimx]); - vpaddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]); - vpaddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]); - } - - if (m_sel.colclamp == 0) - { - // c[0] &= 0x00ff00ff; - // c[1] &= 0x00ff00ff; - - vpcmpeqd(xmm7, xmm7); - vpsrlw(xmm7, 8); - vpand(xmm5, xmm7); - vpand(xmm6, xmm7); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(xmm7, xmm5, xmm6); - vpunpcklwd(xmm5, xmm6); - vpackuswb(xmm5, xmm7); - - if (m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(xmm7, xmm7); - vpslld(xmm7, 31); - vpor(xmm5, xmm7); - } - - if (m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm6, eax); - vpshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - vmovd(xmm7, eax); - vpshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - - vpand(xmm4, xmm5, xmm6); - vpand(xmm5, xmm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(xmm6, xmm4, 9); - vpsrld(xmm4, 3); - vpsrld(xmm7, xmm5, 16); - vpsrld(xmm5, 6); - - vpor(xmm5, xmm4); - vpor(xmm7, xmm6); - vpor(xmm5, xmm7); - } - - if (m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb ? 
m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - - WritePixel_AVX(xmm5, ebx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg32& addr) -{ - vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); - vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if (m_sel.notest) - { - if (fast) - { - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - } - else - { - WritePixel_AVX(src, addr, 0, psm); - WritePixel_AVX(src, addr, 1, psm); - WritePixel_AVX(src, addr, 2, psm); - WritePixel_AVX(src, addr, 3, psm); - } - } - else - { - if (fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - - // vmaskmovps? 
- } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel_AVX(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel_AVX(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel_AVX(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel_AVX(src, addr, 3, psm); - L("@@"); - } - } -} - -static const int s_offsets[] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg32& addr, uint8 i, int psm) -{ - Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - switch (psm) - { - case 0: - if (i == 0) - vmovd(dst, src); - else - vpextrd(dst, src, i); - break; - case 1: - if (i == 0) - vmovd(eax, src); - else - vpextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - if (i == 0) - vmovd(eax, src); - else - vpextrw(eax, src, i * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset) -{ - // in - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // ebx = m_local.tex[0] (!m_sel.mmin) - // ebp = m_local.tex (m_sel.mmin) - // edx = m_local.clut (m_sel.tlu) - - // out - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - - ASSERT(pixels == 1 || pixels == 4); - - mip_offset *= sizeof(void*); - - const GSVector4i* lod_i = m_sel.lcm ? 
&m_local.gd->lod.i : &m_local.temp.lod.i; - - if (m_sel.mmin && !m_sel.lcm) - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - - if (pixels == 4) - { - vmovdqa(ptr[&m_local.temp.test], xmm7); - } - - for (uint8 j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - for (int i = 0; i < pixels; i++) - { - ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } - } - - if (pixels == 4) - { - vmovdqa(xmm5, xmm7); - vmovdqa(xmm7, ptr[&m_local.temp.test]); - } - } - else - { - if (m_sel.mmin && m_sel.lcm) - { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - } - - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - - for (int i = 0; i < pixels; i++) - { - for (uint8 j = 0; j < 4; j++) - { - ReadTexel_AVX(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i) -{ - ASSERT(i < 4); - - const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - - if (i == 0) - vmovd(eax, addr); - else - vpextrd(eax, addr, i); - - if (m_sel.tlu) - movzx(eax, byte[ebx + eax]); - - if (i == 0) - vmovd(dst, src); - else - vpinsrd(dst, src, i); -} - -#endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp deleted file mode 100644 index 7c14c55d8f..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp +++ /dev/null @@ -1,3020 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. 
- * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . - */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" -#include "GSVertexSW.h" -#include "GS/GS_codegen.h" -#include "GS/GSVector.h" - -#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -void GSDrawScanlineCodeGenerator::Generate() -{ - //ret(8); - - push(ebx); - push(esi); - push(edi); - push(ebp); - - //db(0xcc); - - Init(); - - if (!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ymm0 = z/zi - // ymm2 = s/u (tme) - // ymm3 = t/v (tme) - // ymm4 = q (tme) - // ymm5 = rb (!tme) - // ymm6 = ga (!tme) - // ymm7 = test - - bool tme = m_sel.tfx != TFX_NONE; - - TestZ(tme ? ymm5 : ymm2, tme ? 
ymm6 : ymm3); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - ymm0 - // ymm2 = s/u (tme) - // ymm3 = t/v (tme) - // ymm4 = q (tme) - // ymm5 = rb (!tme) - // ymm6 = ga (!tme) - // ymm7 = test - - if (m_sel.mmin) - { - SampleTextureLOD(); - } - else - { - SampleTexture(); - } - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - ymm2 - // - ymm3 - // - ymm4 - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - AlphaTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ReadMask(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - TestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ColorTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - Fog(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - ReadFrame(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - TestDestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - // ymm7 = test - - WriteMask(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // ymm2 = fd - // ymm3 = fm - // ymm4 = zm - // ymm5 = rb - // ymm6 = ga - - WriteZBuf(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - 
// edi = fzbc - // - ebp - // ymm2 = fd - // ymm3 = fm - // - ymm4 - // ymm5 = rb - // ymm6 = ga - - AlphaBlend(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ymm2 = fd - // ymm3 = fm - // ymm5 = rb - // ymm6 = ga - - WriteFrame(); - -L("step"); - - // if(steps <= 0) break; - - if (!m_sel.edge) - { - test(ecx, ecx); - - jle("exit", T_NEAR); - - Step(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); -} - -void GSDrawScanlineCodeGenerator::Init() -{ - if (!m_sel.notest) - { - // int skip = left & 7; - - mov(ebx, edx); - and(edx, 7); - - // int steps = pixels + skip - 8; - - lea(ecx, ptr[ecx + edx - 8]); - - // left -= skip; - - sub(ebx, edx); - - // GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))]; - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - - vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[0]]); - vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)g_const->m_test_256b[15]]); - vpor(ymm7, ymm0); - - shl(edx, 5); - } - else - { - mov(ebx, edx); // left - xor(edx, edx); // skip - lea(ecx, ptr[ecx - 8]); // steps - } - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(esi, ptr[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, ptr[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, ptr[&m_local.gd->fzbc]); - - if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - lea(edx, ptr[edx * 8 + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, ptr[esp + _v]); - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vbroadcastf128(ymm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p - - if (m_sel.fwrite && m_sel.fge) - { - // f = GSVector8i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(ymm1, ymm0); - 
vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(ymm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]); - - vmovdqa(ptr[&m_local.temp.f], ymm1); - } - - if (m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); - vmovaps(ptr[&m_local.temp.z], ymm0); - vmovaps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]); - vmovaps(ptr[&m_local.temp.zo], ymm2); - vaddps(ymm0, ymm2); - } - } - } - else - { - if (m_sel.ztest) - { - vpbroadcastd(ymm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.edge || m_sel.tfx != TFX_NONE) - { - vbroadcastf128(ymm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t - } - - if (m_sel.edge) - { - // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); - - vpshufhw(ymm3, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(ymm3, ymm3, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(ymm3, 9); - - vmovdqa(ptr[&m_local.temp.cov], ymm3); - } - - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(ymm6, ymm4); - - // s = vti.xxxx() + m_local.d[skip].s; - // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - - vpshufd(ymm2, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm3, ymm6, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - vpaddd(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - } - else - { - if (m_sel.ltf) - { - vpshuflw(ymm6, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm6, ymm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm6, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm6); - } - } - - vmovdqa(ptr[&m_local.temp.s], ymm2); - vmovdqa(ptr[&m_local.temp.t], ymm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 
1, 1, 1)); - vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - vaddps(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - vaddps(ymm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]); - - vmovaps(ptr[&m_local.temp.s], ymm2); - vmovaps(ptr[&m_local.temp.t], ymm3); - vmovaps(ptr[&m_local.temp.q], ymm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vbroadcastf128(ymm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c - vcvttps2dq(ymm6, ymm6); - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(ymm5, ymm6, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(ymm6, ymm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(ymm5, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm6, ymm6, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(ymm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]); - vpaddw(ymm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]); - - vmovdqa(ptr[&m_local.temp.rb], ymm5); - vmovdqa(ptr[&m_local.temp.ga], ymm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - vmovdqa(ymm5, ptr[&m_local.c.rb]); - vmovdqa(ymm6, ptr[&m_local.c.ga]); - } - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step() -{ - // steps -= 8; - - sub(ecx, 8); - - // fza_offset += 2; - - add(edi, 16); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // zo += GSVector8::broadcast32(&m_local.d8.p.z); - - if (m_sel.zb) - { - vbroadcastss(ymm0, ptr[&m_local.d8.p.z]); - vaddps(ymm0, ptr[&m_local.temp.zo]); - vmovaps(ptr[&m_local.temp.zo], ymm0); - vaddps(ymm0, ptr[&m_local.temp.z]); - } - - // f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f)); - - if (m_sel.fwrite && m_sel.fge) - { - vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]); - vpaddw(ymm1, ptr[&m_local.temp.f]); - vmovdqa(ptr[&m_local.temp.f], ymm1); - } - } - else - { - if (m_sel.ztest) - { - vpbroadcastd(ymm0, ptr[&m_local.p.z]); - } - } - - if 
(m_sel.fb) - { - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq)); - - vbroadcasti128(ymm4, ptr[&m_local.d8.stq]); - - // s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); - - vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(ymm2, ptr[&m_local.temp.s]); - vmovdqa(ptr[&m_local.temp.s], ymm2); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - // t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy()); - - vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(ymm3, ptr[&m_local.temp.t]); - vmovdqa(ptr[&m_local.temp.t], ymm3); - } - else - { - vmovdqa(ymm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector8 stq(m_local.d8.stq); - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vbroadcastf128(ymm4, ptr[&m_local.d8.stq]); - - vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(ymm2, ptr[&m_local.temp.s]); - vaddps(ymm3, ptr[&m_local.temp.t]); - vaddps(ymm4, ptr[&m_local.temp.q]); - - vmovaps(ptr[&m_local.temp.s], ymm2); - vmovaps(ptr[&m_local.temp.t], ymm3); - vmovaps(ptr[&m_local.temp.q], ymm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c); - - vpbroadcastq(ymm7, ptr[&m_local.d8.c]); - - // rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero()); - // ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero()); - - vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(ymm5, ptr[&m_local.temp.rb]); - vpaddw(ymm6, ptr[&m_local.temp.ga]); - - // FIXME: color may underflow and roll over at the end of the line, if decreasing - - vpxor(ymm7, ymm7); - vpmaxsw(ymm5, ymm7); - vpmaxsw(ymm6, ymm7); - - vmovdqa(ptr[&m_local.temp.rb], ymm5); - vmovdqa(ptr[&m_local.temp.ga], ymm6); - } - else - { - if (m_sel.tfx == 
TFX_NONE) - { - vmovdqa(ymm5, ptr[&m_local.c.rb]); - vmovdqa(ymm6, ptr[&m_local.c.ga]); - } - } - } - } - - if (!m_sel.notest) - { - // test = m_test[15 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - - vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[15]]); - } -} - -void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2) -{ - if (!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, ptr[esi + 4]); - add(ebp, ptr[edi + 4]); - and(ebp, HALF_VM_SIZE - 1); - - // GSVector8i zs = zi; - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.zoverflow) - { - // zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); - - vbroadcastss(temp1, ptr[&GSVector8::m_half]); - vmulps(temp1, ymm0); - vcvttps2dq(temp1, temp1); - vpslld(temp1, 1); - - vcvttps2dq(ymm0, ymm0); - vpcmpeqd(temp2, temp2); - vpsrld(temp2, 31); - vpand(ymm0, temp2); - - vpor(ymm0, temp1); - } - else - { - // zs = GSVector8i(z); - - vcvttps2dq(ymm0, ymm0); - } - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - vpcmpeqd(temp1, temp1); - vpsrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); - vpminsd(ymm0, temp1); - } - - if (m_sel.zwrite) - { - vmovdqa(ptr[&m_local.temp.zs], ymm0); - } - } - - if (m_sel.ztest) - { - ReadPixel(ymm1, temp1, ebp); - - if (m_sel.zwrite && m_sel.zpsm < 2) - { - vmovdqa(ptr[&m_local.temp.zd], ymm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if (m_sel.zpsm) - { - vpslld(ymm1, (uint8)(m_sel.zpsm * 8)); - vpsrld(ymm1, (uint8)(m_sel.zpsm * 8)); - } - - if (m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector8i o = GSVector8i::x80000000(); - - vpcmpeqd(temp1, temp1); - vpslld(temp1, 31); - - // GSVector8i zso = zs - o; - // GSVector8i zdo = zd - o; - - vpsubd(ymm0, temp1); - vpsubd(ymm1, temp1); - } - - switch (m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(ymm1, ymm0); - vpor(ymm7, ymm1); - break; - - case ZTST_GREATER: 
// TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(ymm0, ymm1); - vpcmpeqd(temp1, temp1); - vpxor(ymm0, temp1); - vpor(ymm7, ymm0); - break; - } - - alltrue(ymm7); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, ptr[&m_local.gd->tex[0]]); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - // ebx = tex - // edx = clut - - if (!m_sel.fst) - { - vrcpps(ymm0, ymm4); - - vmulps(ymm2, ymm0); - vmulps(ymm3, ymm0); - - vcvttps2dq(ymm2, ymm2); - vcvttps2dq(ymm3, ymm3); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - } - } - - // ymm2 = u - // ymm3 = v - - if (m_sel.ltf) - { - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i y0 = uv0.uph16() << tw; - // GSVector8i x0 = uv0.upl16(); - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, 
ymm2, ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i y1 = uv1.uph16() << tw; - // GSVector8i x1 = uv1.upl16(); - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 0); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = 
c10 & mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 0); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv) -{ - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); - vpmaxsw(uv, ymm0); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv, ymm0); - } - - vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); - vpminsw(uv, ymm0); - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); - vpand(uv, ymm0); - - if (region) - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); - vpor(uv, ymm0); - } - } - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - 
vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, ymm4); - vpminsw(uv, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv0, const Ymm& uv1) -{ - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpmaxsw(uv0, ymm4); - vpmaxsw(uv1, ymm4); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv0, ymm0); - vpmaxsw(uv1, ymm0); - } - - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vpminsw(uv0, ymm5); - vpminsw(uv1, ymm5); - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpand(uv0, ymm4); - vpand(uv1, ymm4); - - if (region) - { - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vpor(uv0, ymm5); - vpor(uv1, ymm5); - } - } - } - else - { - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv0, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, ymm4); - vpminsw(uv0, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, ymm1, ymm0); - - // uv1 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv1, ymm4); - - if (region) - { - vpor(ymm1, ymm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, 
ymm4); - vpminsw(uv1, ymm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::SampleTextureLOD() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - push(ebp); - - mov(ebp, (size_t)m_local.gd->tex); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - if (!m_sel.fst) - { - vrcpps(ymm0, ymm4); - - vmulps(ymm2, ymm0); - vmulps(ymm3, ymm0); - - vcvttps2dq(ymm2, ymm2); - vcvttps2dq(ymm3, ymm3); - } - - // ymm2 = u - // ymm3 = v - // ymm4 = q - // ymm0 = ymm1 = ymm5 = ymm6 = free - - // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) - - if (!m_sel.lcm) - { - // lod = -log2(Q) * (1 << L) + K - - vpcmpeqd(ymm1, ymm1); - vpsrld(ymm1, ymm1, 25); - vpslld(ymm0, ymm4, 1); - vpsrld(ymm0, ymm0, 24); - vpsubd(ymm0, ymm1); - vcvtdq2ps(ymm0, ymm0); - - // ymm0 = (float)(exp(q) - 127) - - vpslld(ymm4, ymm4, 9); - vpsrld(ymm4, ymm4, 9); - vorps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); - - // ymm4 = mant(q) | 1.0f - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(ymm5, ptr[g_const->m_log2_coef_256b[0]]); // c0 - vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[1]]); // c0 * ymm4 + c1 - vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[2]]); // (c0 * ymm4 + c1) * ymm4 + c2 - vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); // ymm4 - 1.0f - vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0 - } - else - { - vmulps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[0]]); - vaddps(ymm5, ptr[g_const->m_log2_coef_256b[1]]); - vmulps(ymm5, ymm4); - vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); - vaddps(ymm5, ptr[g_const->m_log2_coef_256b[2]]); - vmulps(ymm4, ymm5); - vaddps(ymm4, ymm0); - } - - // ymm4 = log2(Q) = ((((c0 * ymm4) + c1) * ymm4) + c2) * (ymm4 - 1.0f) + ymm0 - - if (m_cpu.has(util::Cpu::tFMA)) - { - vmovaps(ymm5, ptr[&m_local.gd->l]); - 
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]); - } - else - { - vmulps(ymm4, ptr[&m_local.gd->l]); - vaddps(ymm4, ptr[&m_local.gd->k]); - } - - // ymm4 = (-log2(Q) * (1 << L) + K) * 0x10000 - - vxorps(ymm0, ymm0); - vminps(ymm4, ptr[&m_local.gd->mxl]); - vmaxps(ymm4, ymm0); - vcvtps2dq(ymm4, ymm4); - - if (m_sel.mmin == 1) // round-off mode - { - mov(eax, 0x8000); - vmovd(xmm0, eax); - vpbroadcastd(ymm0, xmm0); - vpaddd(ymm4, ymm0); - } - - vpsrld(ymm0, ymm4, 16); - - vmovdqa(ptr[&m_local.temp.lod.i], ymm0); - /* -vpslld(ymm5, ymm0, 6); -vpslld(ymm6, ymm4, 16); -vpsrld(ymm6, ymm6, 24); -return; -*/ - if (m_sel.mmin == 2) // trilinear mode - { - vpshuflw(ymm1, ymm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.temp.lod.f], ymm1); - } - - // shift u/v/minmax by (int)lod - - vpsravd(ymm2, ymm2, ymm0); - vpsravd(ymm3, ymm3, ymm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); - vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); - - // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] - - vpxor(ymm1, ymm1); - - vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); - vpunpcklwd(ymm5, ymm4, ymm1); // minu - vpunpckhwd(ymm6, ymm4, ymm1); // minv - vpsrlvd(ymm5, ymm5, ymm0); - vpsrlvd(ymm6, ymm6, ymm0); - vpackusdw(ymm5, ymm6); - - vbroadcasti128(ymm4, ptr[&m_local.gd->t.max]); - vpunpcklwd(ymm6, ymm4, ymm1); // maxu - vpunpckhwd(ymm4, ymm4, ymm1); // maxv - vpsrlvd(ymm6, ymm6, ymm0); - vpsrlvd(ymm4, ymm4, ymm0); - vpackusdw(ymm6, ymm4); - - vmovdqa(ptr[&m_local.temp.uv_minmax[0]], ymm5); - vmovdqa(ptr[&m_local.temp.uv_minmax[1]], ymm6); - } - else - { - // lod = K - - vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]); - - vpsrad(ymm2, xmm0); - vpsrad(ymm3, xmm0); - - vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); - vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); - - vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); - } - - // ymm2 = m_local.temp.uv[0] = u (level m) - // ymm3 = m_local.temp.uv[1] = v (level m) 
- // ymm5 = minuv - // ymm6 = maxuv - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i x0 = uv0.upl16(); - // GSVector8i y0 = uv0.uph16() << tw; - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, ymm2, ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i x1 = uv1.upl16(); - // GSVector8i y1 = uv1.uph16() << tw; - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, 
ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 0); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = c10 & mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, 
vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 0); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } - - if (m_sel.mmin != 1) // !round-off mode - { - vmovdqa(ptr[&m_local.temp.trb], ymm5); - vmovdqa(ptr[&m_local.temp.tga], ymm6); - - vmovdqa(ymm2, ptr[&m_local.temp.uv[0]]); - vmovdqa(ymm3, ptr[&m_local.temp.uv[1]]); - - vpsrad(ymm2, 1); - vpsrad(ymm3, 1); - - vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); - vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); - - vpsrlw(ymm5, 1); - vpsrlw(ymm6, 1); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpbroadcastd(ymm4, xmm4); - - vpsubd(ymm2, ymm4); - vpsubd(ymm3, ymm4); - - // GSVector8i uf = u.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.uf], ymm0); - - // GSVector8i vf = v.xxzzlh().srl16(1); - - vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(ymm0, 12); - vmovdqa(ptr[&m_local.temp.vf], ymm0); - } - - // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(ymm2, 16); - vpsrad(ymm3, 16); - vpackssdw(ymm2, ymm3); - - if (m_sel.ltf) - { - // GSVector8i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(ymm1, ymm1); - vpsrlw(ymm1, 15); - vpaddw(ymm3, ymm2, ymm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD(ymm2, ymm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD(ymm2); - } - - // ymm2 = uv0 - // ymm3 = uv1 (ltf) - // ymm0, ymm1, ymm4, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i x0 = uv0.upl16(); - // GSVector8i y0 = 
uv0.uph16() << tw; - - vpxor(ymm0, ymm0); - - vpunpcklwd(ymm4, ymm2, ymm0); - vpunpckhwd(ymm2, ymm2, ymm0); - vpslld(ymm2, (uint8)(m_sel.tw + 3)); - - // ymm0 = 0 - // ymm2 = y0 - // ymm3 = uv1 (ltf) - // ymm4 = x0 - // ymm1, ymm5, ymm6 = free - // ymm7 = used - - if (m_sel.ltf) - { - // GSVector8i x1 = uv1.upl16(); - // GSVector8i y1 = uv1.uph16() << tw; - - vpunpcklwd(ymm6, ymm3, ymm0); - vpunpckhwd(ymm3, ymm3, ymm0); - vpslld(ymm3, (uint8)(m_sel.tw + 3)); - - // ymm2 = y0 - // ymm3 = y1 - // ymm4 = x0 - // ymm6 = x1 - // ymm0, ymm5, ymm6 = free - // ymm7 = used - - // GSVector8i addr00 = y0 + x0; - // GSVector8i addr01 = y0 + x1; - // GSVector8i addr10 = y1 + x0; - // GSVector8i addr11 = y1 + x1; - - vpaddd(ymm5, ymm2, ymm4); - vpaddd(ymm2, ymm2, ymm6); - vpaddd(ymm0, ymm3, ymm4); - vpaddd(ymm3, ymm3, ymm6); - - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ymm1, ymm4, ymm6 = free - // ymm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(4, 1); - - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - // ymm0, ymm2, ymm3 = free - // ymm7 = used - - vmovdqa(ymm0, ptr[&m_local.temp.uf]); - - // GSVector8i rb00 = c00 & mask; - // GSVector8i ga00 = (c00 >> 8) & mask; - - vpsllw(ymm2, ymm6, 8); - vpsrlw(ymm2, 8); - vpsrlw(ymm6, 8); - - // GSVector8i rb01 = c01 & mask; - // GSVector8i ga01 = (c01 >> 8) & mask; - - vpsllw(ymm3, ymm4, 8); - vpsrlw(ymm3, 8); - vpsrlw(ymm4, 8); - - // ymm0 = uf - // ymm2 = rb00 - // ymm3 = rb01 - // ymm6 = ga00 - // ymm4 = ga01 - // ymm1 = c10 - // ymm5 = c11 - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb01, uf); - // ga00 = ga00.lerp16_4(ga01, uf); - - lerp16_4(ymm3, ymm2, ymm0); - lerp16_4(ymm4, ymm6, ymm0); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // 
ymm1 = c10 - // ymm5 = c11 - // ymm2, ymm6 = free - // ymm7 = used - - // GSVector8i rb10 = c10 & mask; - // GSVector8i ga10 = (c10 >> 8) & mask; - - vpsrlw(ymm2, ymm1, 8); - vpsllw(ymm1, 8); - vpsrlw(ymm1, 8); - - // GSVector8i rb11 = c11 & mask; - // GSVector8i ga11 = (c11 >> 8) & mask; - - vpsrlw(ymm6, ymm5, 8); - vpsllw(ymm5, 8); - vpsrlw(ymm5, 8); - - // ymm0 = uf - // ymm3 = rb00 - // ymm4 = ga00 - // ymm1 = rb10 - // ymm5 = rb11 - // ymm2 = ga10 - // ymm6 = ga11 - // ymm7 = used - - // rb10 = rb10.lerp16_4(rb11, uf); - // ga10 = ga10.lerp16_4(ga11, uf); - - lerp16_4(ymm5, ymm1, ymm0); - lerp16_4(ymm6, ymm2, ymm0); - - // ymm3 = rb00 - // ymm4 = ga00 - // ymm5 = rb10 - // ymm6 = ga10 - // ymm0, ymm1, ymm2 = free - // ymm7 = used - - // rb00 = rb00.lerp16_4(rb10, vf); - // ga00 = ga00.lerp16_4(ga10, vf); - - vmovdqa(ymm0, ptr[&m_local.temp.vf]); - - lerp16_4(ymm5, ymm3, ymm0); - lerp16_4(ymm6, ymm4, ymm0); - } - else - { - // GSVector8i addr00 = y0 + x0; - - vpaddd(ymm5, ymm2, ymm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(1, 1); - - // GSVector8i mask = GSVector8i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsllw(ymm5, ymm6, 8); - vpsrlw(ymm5, 8); - vpsrlw(ymm6, 8); - } - - vmovdqa(ymm0, ptr[m_sel.lcm ? 
&m_local.gd->lod.f : &m_local.temp.lod.f]); - vpsrlw(ymm0, ymm0, 1); - - vmovdqa(ymm2, ptr[&m_local.temp.trb]); - vmovdqa(ymm3, ptr[&m_local.temp.tga]); - - lerp16(ymm5, ymm2, ymm0, 0); - lerp16(ymm6, ymm3, ymm0, 0); - } - - pop(ebp); -} - -void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv) -{ - // ymm5 = minuv - // ymm6 = maxuv - // ymm0, ymm1, ymm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv, ymm5); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv, ymm0); - } - - vpminsw(uv, ymm6); - } - else - { - vpand(uv, ymm5); - - if (region) - { - vpor(uv, ymm6); - } - } - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, ymm5); - vpminsw(uv, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv0, const Ymm& uv1) -{ - // ymm5 = minuv - // ymm6 = maxuv - // ymm0, ymm1, ymm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - vpmaxsw(uv0, ymm5); - vpmaxsw(uv1, ymm5); - } - else - { - vpxor(ymm0, ymm0); - vpmaxsw(uv0, ymm0); - vpmaxsw(uv1, ymm0); - } - - vpminsw(uv0, ymm6); - vpminsw(uv1, ymm6); - } - else - { - vpand(uv0, ymm5); - vpand(uv1, ymm5); - - if (region) - { - vpor(uv0, ymm6); - vpor(uv1, ymm6); - } - } - } - else - { - vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - 
- vpand(ymm1, uv0, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, ymm5); - vpminsw(uv0, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, ymm1, ymm0); - - // uv1 - - // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(ymm1, uv1, ymm5); - - if (region) - { - vpor(ymm1, ymm6); - } - - // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, ymm5); - vpminsw(uv1, ymm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, ymm1, ymm0); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ - if (!m_sel.fb) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector8i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(ymm6, ymm4, 1); - - clamp16(ymm6, ymm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - vmovdqa(ymm2, ymm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(ymm4, 7); - - if (m_sel.tcc) - { - vpaddusb(ymm4, ymm6); - } - - mix16(ymm6, ymm4, ymm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm4, ptr[m_sel.iip ? 
&m_local.temp.ga : &m_local.c.ga]); - vmovdqa(ymm2, ymm4); - - vpsrlw(ymm4, 7); - - mix16(ymm6, ymm4, ymm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if (m_sel.iip) - { - vpsrlw(ymm6, 7); - } - - break; - } - - if (m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if (!m_sel.abe) - { - // a = cov - - if (m_sel.edge) - { - vmovdqa(ymm0, ptr[&m_local.temp.cov]); - } - else - { - vpcmpeqd(ymm0, ymm0); - vpsllw(ymm0, 15); - vpsrlw(ymm0, 8); - } - - mix16(ymm6, ymm0, ymm1); - } - else - { - // a = a == 0x80 ? cov : a - - vpcmpeqd(ymm0, ymm0); - vpsllw(ymm0, 15); - vpsrlw(ymm0, 8); - - if (m_sel.edge) - { - vmovdqa(ymm1, ptr[&m_local.temp.cov]); - } - else - { - vmovdqa(ymm1, ymm0); - } - - vpcmpeqw(ymm0, ymm6); - vpsrld(ymm0, 16); - vpslld(ymm0, 16); - - vpblendvb(ymm6, ymm1, ymm0); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ - if (m_sel.fwrite) - { - vpbroadcastd(ymm3, ptr[&m_local.gd->fm]); - } - - if (m_sel.zwrite) - { - vpbroadcastd(ymm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ - switch (m_sel.atst) - { - case ATST_NEVER: - // t = GSVector8i::xffffffff(); - vpcmpeqd(ymm1, ymm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); - vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpgtd(ymm1, ymm0); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); - vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpeqd(ymm1, ymm0); - vpcmpeqd(ymm0, ymm0); - vpxor(ymm1, ymm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(ymm0, ymm6, 16); - vbroadcasti128(ymm1, ptr[&m_local.gd->aref]); - vpcmpgtd(ymm1, ymm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(ymm1, ymm6, 16); 
- vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); - vpcmpeqd(ymm1, ymm0); - break; - } - - switch (m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(ymm7, ymm1); - alltrue(ymm7); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(ymm4, ymm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(ymm3, ymm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(ymm4, ymm1); - // fm |= t & GSVector8i::xff000000(); - vpsrld(ymm1, 24); - vpslld(ymm1, 24); - vpor(ymm3, ymm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ - if (!m_sel.fwrite) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector8i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(ymm5, ymm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector8i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(ymm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(ymm1, ymm6); - - modulate16(ymm6, ymm2, 1); - - vpshuflw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(ymm2, 7); - - vpaddw(ymm6, ymm2); - - clamp16(ymm6, ymm0); - - mix16(ymm6, ymm1, ymm0); - - // GSVector8i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - vpaddw(ymm5, ymm2); - - clamp16(ymm5, ymm0); - - break; - - case TFX_NONE: - - // rbt = iip ? 
rb.srl16(7) : rb; - - if (m_sel.iip) - { - vpsrlw(ymm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog() -{ - if (!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - vmovdqa(ymm0, ptr[&m_local.temp.f]); - } - else - { - vpbroadcastw(ymm0, ptr[&m_local.p.f]); - } - - vmovdqa(ymm1, ymm6); - - vpbroadcastd(ymm2, ptr[&m_local.gd->frb]); - lerp16(ymm5, ymm2, ymm0, 0); - - vpbroadcastd(ymm2, ptr[&m_local.gd->fga]); - lerp16(ymm6, ymm2, ymm0, 0); - mix16(ymm6, ymm1, ymm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ - if (!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, ptr[esi]); - add(ebx, ptr[edi]); - and(ebx, HALF_VM_SIZE - 1); - - if (!m_sel.rfb) - { - return; - } - - ReadPixel(ymm2, ymm0, ebx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ - if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if (m_sel.datm) - { - if (m_sel.fpsm == 2) - { - vpxor(ymm0, ymm0); - //vpsrld(ymm1, ymm2, 15); - vpslld(ymm1, ymm2, 16); - vpsrad(ymm1, 31); - vpcmpeqd(ymm1, ymm0); - } - else - { - vpcmpeqd(ymm0, ymm0); - vpxor(ymm1, ymm2, ymm0); - vpsrad(ymm1, 31); - } - } - else - { - if (m_sel.fpsm == 2) - { - vpslld(ymm1, ymm2, 16); - vpsrad(ymm1, 31); - } - else - { - vpsrad(ymm1, ymm2, 31); - } - } - - vpor(ymm7, ymm1); - - alltrue(ymm7); -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ - if (m_sel.notest) - { - return; - } - - // fm |= test; - // zm |= test; - - if (m_sel.fwrite) - { - vpor(ymm3, ymm7); - } - - if (m_sel.zwrite) - { - vpor(ymm4, ymm7); - } - - // int fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask(); - - vpcmpeqd(ymm1, ymm1); - - if (m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(ymm0, ymm1, ymm4); - vpcmpeqd(ymm1, ymm3); - 
vpackssdw(ymm1, ymm0); - } - else if (m_sel.fwrite) - { - vpcmpeqd(ymm1, ymm3); - vpackssdw(ymm1, ymm1); - } - else if (m_sel.zwrite) - { - vpcmpeqd(ymm1, ymm4); - vpackssdw(ymm1, ymm1); - } - - vpmovmskb(edx, ymm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ - if (!m_sel.zwrite) - { - return; - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - vmovdqa(ymm1, ptr[&m_local.temp.zs]); - } - else - { - vpbroadcastd(ymm1, ptr[&m_local.p.z]); - } - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - vpcmpeqd(ymm7, ymm7); - vpsrld(ymm7, (uint8)((m_sel.zpsm & 0x3) * 8)); - vpminsd(ymm1, ymm7); - } - - if (m_sel.ztest && m_sel.zpsm < 2) - { - // zs = zs.blend8(zd, zm); - - vpblendvb(ymm1, ptr[&m_local.temp.zd], ymm4); - } - - bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - - WritePixel(ymm1, ymm0, ebp, edx, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch (m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - vpsllw(ymm0, ymm2, 8); - vpsrlw(ymm0, 8); - vpsrlw(ymm1, ymm2, 8); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(ymm7, ymm7); - - vpsrld(ymm7, 27); // 0x0000001f - vpand(ymm0, ymm2, ymm7); - vpslld(ymm0, 3); - - vpslld(ymm7, 10); // 0x00007c00 - vpand(ymm4, ymm2, ymm7); - vpslld(ymm4, 9); - - vpor(ymm0, ymm4); - - vpsrld(ymm7, 5); // 0x000003e0 - vpand(ymm1, ymm2, ymm7); - vpsrld(ymm1, 2); - - vpsllw(ymm7, 10); // 0x00008000 - vpand(ymm4, ymm2, ymm7); - vpslld(ymm4, 8); - - vpor(ymm1, ymm4); - - break; - } - } - - // ymm5, ymm6 = src rb, ga - // ymm0, ymm1 = dst rb, ga - // ymm2, ymm3 = used - // ymm4, ymm7 = free - - if 
(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(ymm4, ymm5); - } - - if (m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(ymm5, ymm0); - break; - case 2: - vpxor(ymm5, ymm5); - break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch (m_sel.abb) - { - case 0: - vpsubw(ymm5, ymm4); - break; - case 1: - vpsubw(ymm5, ymm0); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch (m_sel.abc) - { - case 0: - case 1: - vpshuflw(ymm7, m_sel.abc ? ymm1 : ymm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(ymm7, ymm7, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(ymm7, 7); - break; - case 2: - vpbroadcastw(ymm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(ymm5, ymm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch (m_sel.abd) - { - case 0: - vpaddw(ymm5, ymm4); - break; - case 1: - vpaddw(ymm5, ymm0); - break; - case 2: - break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(ymm5, ymm0); - break; - case 2: - vpxor(ymm5, ymm5); - break; - } - } - - if (m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(ymm0, ymm6, 8); - vpsrad(ymm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(ymm5, ymm4, ymm5, ymm0); - } - - // ymm6 = src ga - // ymm1 = dst ga - // ymm5 = rb - // ymm7 = a - // ymm2, ymm3 = used - // ymm0, ymm4 = free - - vmovdqa(ymm4, ymm6); - - if (m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - vmovdqa(ymm6, ymm1); - break; - case 2: - vpxor(ymm6, ymm6); - break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch (m_sel.abb) - { - case 0: - vpsubw(ymm6, ymm4); - break; - case 1: - vpsubw(ymm6, ymm1); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 
&& m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(ymm6, ymm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch (m_sel.abd) - { - case 0: - vpaddw(ymm6, ymm4); - break; - case 1: - vpaddw(ymm6, ymm1); - break; - case 2: - break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - vmovdqa(ymm6, ymm1); - break; - case 2: - vpxor(ymm6, ymm6); - break; - } - } - - // ymm4 = src ga - // ymm5 = rb - // ymm6 = ga - // ymm2, ymm3 = used - // ymm0, ymm1, ymm7 = free - - if (m_sel.pabe) - { - vpsrld(ymm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(ymm6, ymm4, ymm6, ymm0); - } - else - { - if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(ymm6, ymm4, ymm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, ptr[esp + _top]); - and(eax, 3); - shl(eax, 5); - mov(ebp, ptr[&m_local.gd->dimx]); - vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 0]); - vpaddw(ymm5, ymm7); - vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 1]); - vpaddw(ymm6, ymm7); - } - - if (m_sel.colclamp == 0) - { - // c[0] &= 0x00ff00ff; - // c[1] &= 0x00ff00ff; - - vpcmpeqd(ymm7, ymm7); - vpsrlw(ymm7, 8); - vpand(ymm5, ymm7); - vpand(ymm6, ymm7); - } - - // GSVector8i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(ymm7, ymm5, ymm6); - vpunpcklwd(ymm5, ymm6); - vpackuswb(ymm5, ymm7); - - if (m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(ymm7, ymm7); - vpslld(ymm7, 31); - vpor(ymm5, ymm7); - } - - if (m_sel.fpsm == 2) - { - // GSVector8i rb = fs & 0x00f800f8; - // GSVector8i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm6, eax); - vpbroadcastd(ymm6, xmm6); - - mov(eax, 0x8000f800); - vmovd(xmm7, eax); - vpbroadcastd(ymm7, xmm7); - - vpand(ymm4, ymm5, 
ymm6); - vpand(ymm5, ymm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(ymm6, ymm4, 9); - vpsrld(ymm4, 3); - vpsrld(ymm7, ymm5, 16); - vpsrld(ymm5, 6); - - vpor(ymm5, ymm4); - vpor(ymm7, ymm6); - vpor(ymm5, ymm7); - } - - if (m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(ymm5, ymm2, ymm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - - WritePixel(ymm5, ymm0, ebx, edx, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr) -{ - vmovq(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm]); - vmovhps(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); - vmovq(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); - vmovhps(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2]); - vinserti128(dst, dst, Xmm(temp.getIdx()), 1); - /* - vmovdqu(dst, ptr[addr * 2 + (size_t)m_local.gd->vm]); - vmovdqu(temp, ptr[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); - vpunpcklqdq(dst, dst, temp); - vpermq(dst, dst, _MM_SHUFFLE(3, 1, 2, 0)); -*/ -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz) -{ - Xmm src1 = Xmm(src.getIdx()); - Xmm src2 = Xmm(temp.getIdx()); - - vextracti128(src2, src, 1); - - if (m_sel.notest) - { - if (fast) - { - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); - } - else - { - WritePixel(src1, addr, 0, 0, psm); - WritePixel(src1, addr, 1, 1, psm); - WritePixel(src1, addr, 2, 2, psm); - WritePixel(src1, addr, 3, 3, psm); - WritePixel(src2, addr, 4, 0, psm); - WritePixel(src2, addr, 5, 1, psm); - 
WritePixel(src2, addr, 6, 2, psm); - WritePixel(src2, addr, 7, 3, psm); - } - } - else - { - // cascade tests? - - if (fast) - { - test(mask, 0x0000000f << (fz * 8)); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); - L("@@"); - - test(mask, 0x000000f0 << (fz * 8)); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); - L("@@"); - - test(mask, 0x000f0000 << (fz * 8)); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); - L("@@"); - - test(mask, 0x00f00000 << (fz * 8)); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); - L("@@"); - - // vmaskmovps? - } - else - { - test(mask, 0x00000003 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 0, 0, psm); - L("@@"); - - test(mask, 0x0000000c << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 1, 1, psm); - L("@@"); - - test(mask, 0x00000030 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 2, 2, psm); - L("@@"); - - test(mask, 0x000000c0 << (fz * 8)); - je("@f"); - WritePixel(src1, addr, 3, 3, psm); - L("@@"); - - test(mask, 0x00030000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 4, 0, psm); - L("@@"); - - test(mask, 0x000c0000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 5, 1, psm); - L("@@"); - - test(mask, 0x00300000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 6, 2, psm); - L("@@"); - - test(mask, 0x00c00000 << (fz * 8)); - je("@f"); - WritePixel(src2, addr, 7, 3, psm); - L("@@"); - } - } -} - -static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm) -{ - Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - switch (psm) - { - case 0: - if (j == 0) - vmovd(dst, src); - else - vpextrd(dst, src, j); - break; - case 1: - if (j == 0) - vmovd(eax, src); - else - vpextrd(eax, src, j); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - if (j == 0) 
- vmovd(eax, src); - else - vpextrw(eax, src, j * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) -{ - // in - // ymm5 = addr00 - // ymm2 = addr01 - // ymm0 = addr10 - // ymm3 = addr11 - // ebx = m_local.tex[0] (!m_sel.mmin) - // ebp = m_local.tex (m_sel.mmin) - // edx = m_local.clut (m_sel.tlu) - - // out - // ymm6 = c00 - // ymm4 = c01 - // ymm1 = c10 - // ymm5 = c11 - - ASSERT(pixels == 1 || pixels == 4); - - mip_offset *= sizeof(void*); - - const GSVector8i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i; - - if (m_sel.mmin && !m_sel.lcm) - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; - - for (int i = 0; i < pixels; i++) - { - Ymm src = Ymm(r[i * 2 + 0]); - Ymm dst = Ymm(r[i * 2 + 1]); - Ymm t1 = Ymm(t[i * 2 + 0]); - Ymm t2 = Ymm(t[i * 2 + 1]); - - vextracti128(Xmm(t1.getIdx()), src, 1); - - for (uint8 j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j + 0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(dst, src, j); - - mov(ebx, ptr[&lod_i->u32[j + 4]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel(t2, t1, j); - } - - vinserti128(dst, dst, Xmm(t2.getIdx()), 1); - } - } - else - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; - - if (m_sel.mmin && m_sel.lcm) - { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - } - - for (int i = 0; i < pixels; i++) - { - Ymm src = Ymm(r[i * 2 + 0]); - Ymm dst = Ymm(r[i * 2 + 1]); - Ymm t1 = Ymm(t[i * 2 + 0]); - Ymm t2 = Ymm(t[i * 2 + 1]); - - if (!m_sel.tlu) - { - vpcmpeqd(t1, t1); - vpgatherdd(dst, ptr[ebx + src * 4], t1); - } - else - { - vextracti128(Xmm(t1.getIdx()), src, 1); - - for (uint8 j = 0; j < 4; j++) - { - ReadTexel(dst, src, j); - ReadTexel(t2, t1, j); - } - - vinserti128(dst, dst, Xmm(t2.getIdx()), 1); - /* - vpcmpeqd(t1, t1); - vpgatherdd(t2, ptr[ebx + src 
* 1], t1); // either this 1x scale, or the latency of two dependendent gathers are too slow - vpslld(t2, 24); - vpsrld(t2, 24); - vpcmpeqd(t1, t1); - vpgatherdd(dst, ptr[edx + t2 * 4], t1); - */ - } - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i) -{ - ASSERT(i < 4); - - const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - - if (i == 0) - vmovd(eax, Xmm(addr.getIdx())); - else - vpextrd(eax, Xmm(addr.getIdx()), i); - - if (m_sel.tlu) - movzx(eax, byte[ebx + eax]); - - if (i == 0) - vmovd(Xmm(dst.getIdx()), src); - else - vpinsrd(Xmm(dst.getIdx()), src, i); -} - - -#endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp deleted file mode 100644 index fa0cccd76a..0000000000 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp +++ /dev/null @@ -1,2953 +0,0 @@ -/* PCSX2 - PS2 Emulator for PCs - * Copyright (C) 2002-2021 PCSX2 Dev Team - * - * PCSX2 is free software: you can redistribute it and/or modify it under the terms - * of the GNU Lesser General Public License as published by the Free Software Found- - * ation, either version 3 of the License, or (at your option) any later version. - * - * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with PCSX2. - * If not, see . 
- */ - -#include "PrecompiledHeader.h" -#include "GSDrawScanlineCodeGenerator.h" -#include "GSVertexSW.h" -#include "GS/GS_codegen.h" - -#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -void GSDrawScanlineCodeGenerator::Generate_SSE() -{ - push(ebx); - push(esi); - push(edi); - push(ebp); - - Init_SSE(); - - if (!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // xmm0 = z/zi - // xmm2 = s/u (tme) - // xmm3 = t/v (tme) - // xmm4 = q (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - bool tme = m_sel.tfx != TFX_NONE; - - TestZ_SSE(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm0 - // xmm2 = s/u (tme) - // xmm3 = t/v (tme) - // xmm4 = q (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - if (m_sel.mmin) - { - SampleTextureLOD_SSE(); - } - else - { - SampleTexture_SSE(); - } - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm2 - // - xmm3 - // - xmm4 - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - AlphaTFX_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadMask_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestAlpha_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ColorTFX_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test 
- - Fog_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadFrame_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestDestAlpha_SSE(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - WriteMask_SSE(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - - WriteZBuf_SSE(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // - ebp - // xmm2 = fd - // xmm3 = fm - // - xmm4 - // xmm5 = rb - // xmm6 = ga - - AlphaBlend_SSE(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // xmm2 = fd - // xmm3 = fm - // xmm5 = rb - // xmm6 = ga - - WriteFrame_SSE(); - -L("step"); - - // if(steps <= 0) break; - - if (!m_sel.edge) - { - test(ecx, ecx); - - jle("exit", T_NEAR); - - Step_SSE(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - // vzeroupper(); - - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); -} - -void GSDrawScanlineCodeGenerator::Init_SSE() -{ - if (!m_sel.notest) - { - // int skip = left & 3; - - mov(ebx, edx); - and(edx, 3); - - // int steps = pixels + skip - 4; - - lea(ecx, ptr[ecx + edx - 4]); - - // left -= skip; - - sub(ebx, edx); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(edx, 4); - - movdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[0]]); - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); - - por(xmm7, ptr[eax + (size_t)g_const->m_test_128b[7]]); - } - else - { - mov(ebx, edx); // left - xor(edx, edx); // skip - lea(ecx, ptr[ecx - 4]); // steps - } - - // GSVector2i* fza_base = 
&m_local.gd->fzbr[top]; - - mov(esi, ptr[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, ptr[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, ptr[&m_local.gd->fzbc]); - - if (m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - lea(edx, ptr[edx * 8 + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, ptr[esp + _v]); - } - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.fwrite && m_sel.fge || m_sel.zb) - { - movaps(xmm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p - - if (m_sel.fwrite && m_sel.fge) - { - // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - cvttps2dq(xmm1, xmm0); - pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - paddw(xmm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]); - - movdqa(ptr[&m_local.temp.f], xmm1); - } - - if (m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - movaps(ptr[&m_local.temp.z], xmm0); - movaps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]); - movaps(ptr[&m_local.temp.zo], xmm2); - addps(xmm0, xmm2); - } - } - } - else - { - if (m_sel.ztest) - { - movdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.edge || m_sel.tfx != TFX_NONE) - { - movaps(xmm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t - } - - if (m_sel.edge) - { - // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); - - pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); - psrlw(xmm3, 9); - - movdqa(ptr[&m_local.temp.cov], xmm3); - } - - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i vti(vt); - - cvttps2dq(xmm6, xmm4); - - // s = vti.xxxx() + m_local.d[skip].s; - // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; - - pshufd(xmm2, xmm6, 
_MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm3, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); - - paddd(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - paddd(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - } - else - { - if (m_sel.ltf) - { - pshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm6, 12); - movdqa(ptr[&m_local.temp.vf], xmm6); - } - } - - movdqa(ptr[&m_local.temp.s], xmm2); - movdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm4); - - shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - addps(xmm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); - addps(xmm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); - addps(xmm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]); - - movaps(ptr[&m_local.temp.s], xmm2); - movaps(ptr[&m_local.temp.t], xmm3); - movaps(ptr[&m_local.temp.q], xmm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - cvttps2dq(xmm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); - punpcklwd(xmm6, xmm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); - - paddw(xmm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]); - paddw(xmm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]); - - movdqa(ptr[&m_local.temp.rb], xmm5); - movdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - movdqa(xmm5, ptr[&m_local.c.rb]); - movdqa(xmm6, ptr[&m_local.c.ga]); - 
} - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step_SSE() -{ - // steps -= 4; - - sub(ecx, 4); - - // fza_offset++; - - add(edi, 8); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // z += m_local.d4.z; - - if (m_sel.zb) - { - movaps(xmm0, ptr[&m_local.temp.zo]); - addps(xmm0, ptr[&m_local.d4.z]); - movaps(ptr[&m_local.temp.zo], xmm0); - addps(xmm0, ptr[&m_local.temp.z]); - } - - // f = f.add16(m_local.d4.f); - - if (m_sel.fwrite && m_sel.fge) - { - movdqa(xmm1, ptr[&m_local.temp.f]); - paddw(xmm1, ptr[&m_local.d4.f]); - movdqa(ptr[&m_local.temp.f], xmm1); - } - } - else - { - if (m_sel.ztest) - { - movdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if (m_sel.fb) - { - if (m_sel.tfx != TFX_NONE) - { - if (m_sel.fst) - { - // GSVector4i stq = m_local.d4.stq; - - // s += stq.xxxx(); - // if(!sprite) t += stq.yyyy(); - - movdqa(xmm4, ptr[&m_local.d4.stq]); - - pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - paddd(xmm2, ptr[&m_local.temp.s]); - movdqa(ptr[&m_local.temp.s], xmm2); - - if (m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) - { - pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - paddd(xmm3, ptr[&m_local.temp.t]); - movdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - movdqa(xmm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - movaps(xmm4, ptr[&m_local.d4.stq]); - movaps(xmm2, xmm4); - movaps(xmm3, xmm4); - - shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - addps(xmm2, ptr[&m_local.temp.s]); - addps(xmm3, ptr[&m_local.temp.t]); - addps(xmm4, ptr[&m_local.temp.q]); - - movaps(ptr[&m_local.temp.s], xmm2); - movaps(ptr[&m_local.temp.t], xmm3); - movaps(ptr[&m_local.temp.q], xmm4); - } - } - - if (!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if (m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - movdqa(xmm7, 
ptr[&m_local.d4.c]); - - pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); - - paddw(xmm5, ptr[&m_local.temp.rb]); - paddw(xmm6, ptr[&m_local.temp.ga]); - - // FIXME: color may underflow and roll over at the end of the line, if decreasing - - pxor(xmm7, xmm7); - pmaxsw(xmm5, xmm7); - pmaxsw(xmm6, xmm7); - - movdqa(ptr[&m_local.temp.rb], xmm5); - movdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if (m_sel.tfx == TFX_NONE) - { - movdqa(xmm5, ptr[&m_local.c.rb]); - movdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } - - if (!m_sel.notest) - { - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); - - movdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[7]]); - } -} - -void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) -{ - if (!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, ptr[esi + 4]); - add(ebp, ptr[edi + 4]); - and(ebp, HALF_VM_SIZE - 1); - - // GSVector4i zs = zi; - - if (m_sel.prim != GS_SPRITE_CLASS) - { - if (m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - movaps(temp1, ptr[&GSVector4::m_half]); - mulps(temp1, xmm0); - cvttps2dq(temp1, temp1); - pslld(temp1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(temp2, temp2); - psrld(temp2, 31); - pand(xmm0, temp2); - - por(xmm0, temp1); - } - else - { - // zs = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - pcmpeqd(temp1, temp1); - psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); - pminsd(xmm0, temp1); - } - - if (m_sel.zwrite) - { - movdqa(ptr[&m_local.temp.zs], xmm0); - } - } - - if (m_sel.ztest) - { - ReadPixel_SSE(xmm1, ebp); - - if (m_sel.zwrite && m_sel.zpsm < 2) - { - movdqa(ptr[&m_local.temp.zd], xmm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if (m_sel.zpsm) - { - pslld(xmm1, m_sel.zpsm * 8); - psrld(xmm1, m_sel.zpsm * 
8); - } - - if (m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - pcmpeqd(temp1, temp1); - pslld(temp1, 31); - - // GSVector4i zso = zs - o; - // GSVector4i zdo = zd - o; - - psubd(xmm0, temp1); - psubd(xmm1, temp1); - } - - switch (m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - pcmpgtd(xmm1, xmm0); - por(xmm7, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - pcmpgtd(xmm0, xmm1); - pcmpeqd(temp1, temp1); - pxor(xmm0, temp1); - por(xmm7, xmm0); - break; - } - - alltrue(xmm7); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture_SSE() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, ptr[&m_local.gd->tex[0]]); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - // ebx = tex - // edx = clut - - if (!m_sel.fst) - { - rcpps(xmm4, xmm4); - - mulps(xmm2, xmm4); - mulps(xmm3, xmm4); - - cvttps2dq(xmm2, xmm2); - cvttps2dq(xmm3, xmm3); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - movd(xmm4, eax); - pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - psubd(xmm2, xmm4); - psubd(xmm3, xmm4); - } - } - - // xmm2 = u - // xmm3 = v - - if (m_sel.ltf) - { - // GSVector4i uf = u.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.uf], xmm0); - - if (m_sel.prim != GS_SPRITE_CLASS) - { - // GSVector4i vf = v.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.vf], xmm0); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - psrad(xmm2, 16); - psrad(xmm3, 16); - packssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - movdqa(xmm3, xmm2); - 
pcmpeqd(xmm1, xmm1); - psrlw(xmm1, 15); - paddw(xmm3, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap_SSE(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap_SSE(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i y0 = uv0.uph16() << tw; - // GSVector4i x0 = uv0.upl16(); - - pxor(xmm0, xmm0); - - movdqa(xmm4, xmm2); - punpckhwd(xmm2, xmm0); - punpcklwd(xmm4, xmm0); - pslld(xmm2, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if (m_sel.ltf) - { - // GSVector4i y1 = uv1.uph16() << tw; - // GSVector4i x1 = uv1.upl16(); - - movdqa(xmm6, xmm3); - punpckhwd(xmm3, xmm0); - punpcklwd(xmm6, xmm0); - pslld(xmm3, m_sel.tw + 3); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - movdqa(xmm5, xmm2); - paddd(xmm5, xmm4); - paddd(xmm2, xmm6); - - movdqa(xmm0, xmm3); - paddd(xmm0, xmm4); - paddd(xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(4, 0); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - movdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // 
xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp_4(rb01, uf); - // ga00 = ga00.lerp_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp_4(rb11, uf); - // ga10 = ga10.lerp_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp_4(rb10, vf); - // ga00 = ga00.lerp_4(ga10, vf); - - movdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - paddd(xmm2, xmm4); - movdqa(xmm5, xmm2); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(1, 0); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } -} - -void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - pmaxsw(uv, ptr[&m_local.gd->t.min]); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv, xmm0); - } - - pminsw(uv, ptr[&m_local.gd->t.max]); - } - else - { - pand(uv, 
ptr[&m_local.gd->t.min]); - - if (region) - { - por(uv, ptr[&m_local.gd->t.max]); - } - } - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - movdqa(xmm5, ptr[&m_local.gd->t.max]); - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv); - - pand(xmm1, xmm4); - - if (region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv, xmm4); - pminsw(uv, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - blend8(uv, xmm1); - } -} - -void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - pmaxsw(uv0, xmm4); - pmaxsw(uv1, xmm4); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv0, xmm0); - pmaxsw(uv1, xmm0); - } - - movdqa(xmm5, ptr[&m_local.gd->t.max]); - pminsw(uv0, xmm5); - pminsw(uv1, xmm5); - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - pand(uv0, xmm4); - pand(uv1, xmm4); - - if (region) - { - movdqa(xmm5, ptr[&m_local.gd->t.max]); - por(uv0, xmm5); - por(uv1, xmm5); - } - } - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - movdqa(xmm5, ptr[&m_local.gd->t.max]); - - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv0); - - pand(xmm1, xmm4); - - if (region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv0, xmm4); - pminsw(uv0, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - pblendvb(uv0, xmm1); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv1); - - pand(xmm1, 
xmm4); - - if (region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv1, xmm4); - pminsw(uv1, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - pblendvb(uv1, xmm1); - } -} - -void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE() -{ - if (!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - push(ebp); - - mov(ebp, (size_t)m_local.gd->tex); - - if (m_sel.tlu) - { - mov(edx, ptr[&m_local.gd->clut]); - } - - if (!m_sel.fst) - { - rcpps(xmm0, xmm4); - - mulps(xmm2, xmm0); - mulps(xmm3, xmm0); - - cvttps2dq(xmm2, xmm2); - cvttps2dq(xmm3, xmm3); - } - - // xmm2 = u - // xmm3 = v - // xmm4 = q - // xmm0 = xmm1 = xmm5 = xmm6 = free - - // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) - - if (!m_sel.lcm) - { - // store u/v - - movdqa(xmm0, xmm2); - punpckldq(xmm2, xmm3); - movdqa(ptr[&m_local.temp.uv[0]], xmm2); - punpckhdq(xmm0, xmm3); - movdqa(ptr[&m_local.temp.uv[1]], xmm0); - - // lod = -log2(Q) * (1 << L) + K - - movdqa(xmm0, xmm4); - pcmpeqd(xmm1, xmm1); - psrld(xmm1, 25); - pslld(xmm0, 1); - psrld(xmm0, 24); - psubd(xmm0, xmm1); - cvtdq2ps(xmm0, xmm0); - - // xmm0 = (float)(exp(q) - 127) - - pslld(xmm4, 9); - psrld(xmm4, 9); - orps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); - - // xmm4 = mant(q) | 1.0f - - movdqa(xmm5, xmm4); - mulps(xmm5, ptr[g_const->m_log2_coef_128b[0]]); - addps(xmm5, ptr[g_const->m_log2_coef_128b[1]]); - mulps(xmm5, xmm4); - subps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); - addps(xmm5, ptr[g_const->m_log2_coef_128b[2]]); - mulps(xmm4, xmm5); - addps(xmm4, xmm0); - - // xmm4 = log2(Q) = ((((c0 * xmm4) + c1) * xmm4) + c2) * (xmm4 - 1.0f) + xmm0 - - mulps(xmm4, ptr[&m_local.gd->l]); - addps(xmm4, ptr[&m_local.gd->k]); - - // xmm4 = (-log2(Q) * (1 << L) + K) * 0x10000 - - xorps(xmm0, xmm0); - minps(xmm4, ptr[&m_local.gd->mxl]); - maxps(xmm4, xmm0); - cvtps2dq(xmm4, xmm4); - - 
if (m_sel.mmin == 1) // round-off mode - { - mov(eax, 0x8000); - movd(xmm0, eax); - pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - paddd(xmm4, xmm0); - } - - movdqa(xmm0, xmm4); - psrld(xmm4, 16); - movdqa(ptr[&m_local.temp.lod.i], xmm4); - - if (m_sel.mmin == 2) // trilinear mode - { - pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[&m_local.temp.lod.f], xmm0); - } - - // shift u/v by (int)lod - - movq(xmm4, ptr[&m_local.gd->t.minmax]); - - movdqa(xmm2, ptr[&m_local.temp.uv[0]]); - movdqa(xmm5, xmm2); - movdqa(xmm3, ptr[&m_local.temp.uv[1]]); - movdqa(xmm6, xmm3); - - movd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); - psrad(xmm2, xmm0); - movdqa(xmm1, xmm4); - psrlw(xmm1, xmm0); - movq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); - - movd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); - psrad(xmm5, xmm0); - movdqa(xmm1, xmm4); - psrlw(xmm1, xmm0); - movq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); - - movd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); - psrad(xmm3, xmm0); - movdqa(xmm1, xmm4); - psrlw(xmm1, xmm0); - movq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); - - movd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); - psrad(xmm6, xmm0); - movdqa(xmm1, xmm4); - psrlw(xmm1, xmm0); - movq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); - - punpckldq(xmm2, xmm3); - punpckhdq(xmm5, xmm6); - movdqa(xmm3, xmm2); - punpckldq(xmm2, xmm5); - punpckhdq(xmm3, xmm5); - - movdqa(ptr[&m_local.temp.uv[0]], xmm2); - movdqa(ptr[&m_local.temp.uv[1]], xmm3); - - movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - - movdqa(xmm0, xmm5); - punpcklwd(xmm5, xmm6); - punpckhwd(xmm0, xmm6); - movdqa(xmm6, xmm5); - punpckldq(xmm5, xmm0); - punpckhdq(xmm6, xmm0); - - movdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); - movdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); - } - else - { - // lod = K - - movd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]); - - psrad(xmm2, xmm0); - psrad(xmm3, xmm0); - - 
movdqa(ptr[&m_local.temp.uv[0]], xmm2); - movdqa(ptr[&m_local.temp.uv[1]], xmm3); - - movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - } - - // xmm2 = m_local.temp.uv[0] = u (level m) - // xmm3 = m_local.temp.uv[1] = v (level m) - // xmm5 = minuv - // xmm6 = maxuv - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - movd(xmm4, eax); - pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - psubd(xmm2, xmm4); - psubd(xmm3, xmm4); - - // GSVector4i uf = u.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.uf], xmm0); - - // GSVector4i vf = v.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.vf], xmm0); - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - psrad(xmm2, 16); - psrad(xmm3, 16); - packssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - movdqa(xmm3, xmm2); - pcmpeqd(xmm1, xmm1); - psrlw(xmm1, 15); - paddw(xmm3, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD_SSE(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - WrapLOD_SSE(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - pxor(xmm0, xmm0); - - movdqa(xmm4, xmm2); - punpckhwd(xmm2, xmm0); - punpcklwd(xmm4, xmm0); - pslld(xmm2, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if (m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - movdqa(xmm6, xmm3); - punpcklwd(xmm6, xmm0); - punpckhwd(xmm3, xmm0); - pslld(xmm3, m_sel.tw + 3); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // 
xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - movdqa(xmm5, xmm2); - paddd(xmm5, xmm4); - paddd(xmm2, xmm6); - - movdqa(xmm0, xmm3); - paddd(xmm0, xmm4); - paddd(xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(4, 0); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - movdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp_4(rb01, uf); - // ga00 = ga00.lerp_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp_4(rb11, uf); - // ga10 = ga10.lerp_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, 
xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp_4(rb10, vf); - // ga00 = ga00.lerp_4(ga10, vf); - - movdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - paddd(xmm2, xmm4); - movdqa(xmm5, xmm2); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(1, 0); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } - - if (m_sel.mmin != 1) // !round-off mode - { - movdqa(ptr[&m_local.temp.trb], xmm5); - movdqa(ptr[&m_local.temp.tga], xmm6); - - movdqa(xmm2, ptr[&m_local.temp.uv[0]]); - movdqa(xmm3, ptr[&m_local.temp.uv[1]]); - - psrad(xmm2, 1); - psrad(xmm3, 1); - - movdqa(xmm5, ptr[&m_local.temp.uv_minmax[0]]); - movdqa(xmm6, ptr[&m_local.temp.uv_minmax[1]]); - - psrlw(xmm5, 1); - psrlw(xmm6, 1); - - if (m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - movd(xmm4, eax); - pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - psubd(xmm2, xmm4); - psubd(xmm3, xmm4); - - // GSVector4i uf = u.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.uf], xmm0); - - // GSVector4i vf = v.xxzzlh().srl16(1); - - pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 12); - movdqa(ptr[&m_local.temp.vf], xmm0); - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - psrad(xmm2, 16); - psrad(xmm3, 16); - packssdw(xmm2, xmm3); - - if (m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - movdqa(xmm3, xmm2); - pcmpeqd(xmm1, xmm1); - psrlw(xmm1, 15); - paddw(xmm3, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - WrapLOD_SSE(xmm2, xmm3); - } - else - { - // uv0 = 
Wrap(uv0); - - WrapLOD_SSE(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - pxor(xmm0, xmm0); - - movdqa(xmm4, xmm2); - punpckhwd(xmm2, xmm0); - punpcklwd(xmm4, xmm0); - pslld(xmm2, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if (m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - movdqa(xmm6, xmm3); - punpckhwd(xmm3, xmm0); - punpcklwd(xmm6, xmm0); - pslld(xmm3, m_sel.tw + 3); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - movdqa(xmm5, xmm2); - paddd(xmm5, xmm4); - paddd(xmm2, xmm6); - - movdqa(xmm0, xmm3); - paddd(xmm0, xmm4); - paddd(xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(4, 1); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - movdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - split16_2x8(xmm2, xmm6, xmm6); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - split16_2x8(xmm3, xmm4, xmm4); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp_4(rb01, uf); - // 
ga00 = ga00.lerp_4(ga01, uf); - - lerp16_4(xmm3, xmm2, xmm0); - lerp16_4(xmm4, xmm6, xmm0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - split16_2x8(xmm1, xmm2, xmm1); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm5); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp_4(rb11, uf); - // ga10 = ga10.lerp_4(ga11, uf); - - lerp16_4(xmm5, xmm1, xmm0); - lerp16_4(xmm6, xmm2, xmm0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp_4(rb10, vf); - // ga00 = ga00.lerp_4(ga10, vf); - - movdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16_4(xmm5, xmm3, xmm0); - lerp16_4(xmm6, xmm4, xmm0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - paddd(xmm2, xmm4); - movdqa(xmm5, xmm2); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel_SSE(1, 1); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - split16_2x8(xmm5, xmm6, xmm6); - } - - movdqa(xmm0, ptr[m_sel.lcm ? 
&m_local.gd->lod.f : &m_local.temp.lod.f]); - psrlw(xmm0, 1); - - movdqa(xmm2, ptr[&m_local.temp.trb]); - movdqa(xmm3, ptr[&m_local.temp.tga]); - - lerp16(xmm5, xmm2, xmm0, 0); - lerp16(xmm6, xmm3, xmm0, 0); - } - - pop(ebp); -} - -void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv) -{ - // xmm5 = minuv - // xmm6 = maxuv - // xmm0, xmm1, xmm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - pmaxsw(uv, xmm5); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv, xmm0); - } - - pminsw(uv, xmm6); - } - else - { - pand(uv, xmm5); - - if (region) - { - por(uv, xmm6); - } - } - } - else - { - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv); - - pand(xmm1, xmm5); - - if (region) - { - por(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv, xmm5); - pminsw(uv, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - blend8(uv, xmm1); - } -} - -void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) -{ - // xmm5 = minuv - // xmm6 = maxuv - // xmm0, xmm1, xmm4 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if (wms_clamp == wmt_clamp) - { - if (wms_clamp) - { - if (region) - { - pmaxsw(uv0, xmm5); - pmaxsw(uv1, xmm5); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv0, xmm0); - pmaxsw(uv1, xmm0); - } - - pminsw(uv0, xmm6); - pminsw(uv1, xmm6); - } - else - { - pand(uv0, xmm5); - pand(uv1, xmm5); - - if (region) - { - por(uv0, xmm6); - por(uv1, xmm6); - } - } - } - else - { - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv0); - - 
pand(xmm1, xmm5); - - if (region) - { - por(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv0, xmm5); - pminsw(uv0, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - pblendvb(uv0, xmm1); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv1); - - pand(xmm1, xmm5); - - if (region) - { - por(xmm1, xmm6); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv1, xmm5); - pminsw(uv1, xmm6); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - pblendvb(uv1, xmm1); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX_SSE() -{ - if (!m_sel.fb) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(xmm6, xmm4, 1); - - clamp16(xmm6, xmm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - movdqa(xmm2, xmm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - psrlw(xmm4, 7); - - if (m_sel.tcc) - { - paddusb(xmm4, xmm6); - } - - mix16(xmm6, xmm4, xmm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if (!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? 
&m_local.temp.ga : &m_local.c.ga]); - movdqa(xmm2, xmm4); - - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if (m_sel.iip) - { - psrlw(xmm6, 7); - } - - break; - } - - if (m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if (!m_sel.abe) - { - // a = cov - - if (m_sel.edge) - { - movdqa(xmm0, ptr[&m_local.temp.cov]); - } - else - { - pcmpeqd(xmm0, xmm0); - psllw(xmm0, 15); - psrlw(xmm0, 8); - } - - mix16(xmm6, xmm0, xmm1); - } - else - { - // a = a == 0x80 ? cov : a - - pcmpeqd(xmm0, xmm0); - psllw(xmm0, 15); - psrlw(xmm0, 8); - - if (m_sel.edge) - { - movdqa(xmm1, ptr[&m_local.temp.cov]); - } - else - { - movdqa(xmm1, xmm0); - } - - pcmpeqw(xmm0, xmm6); - psrld(xmm0, 16); - pslld(xmm0, 16); - - blend8(xmm6, xmm1); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask_SSE() -{ - if (m_sel.fwrite) - { - movdqa(xmm3, ptr[&m_local.gd->fm]); - } - - if (m_sel.zwrite) - { - movdqa(xmm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha_SSE() -{ - switch (m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - pcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpgtd(xmm1, ptr[&m_local.gd->aref]); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpeqd(xmm1, ptr[&m_local.gd->aref]); - pcmpeqd(xmm0, xmm0); - pxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - movdqa(xmm0, xmm6); - psrld(xmm0, 16); - movdqa(xmm1, ptr[&m_local.gd->aref]); - pcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpeqd(xmm1, ptr[&m_local.gd->aref]); - 
break; - } - - switch (m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - por(xmm7, xmm1); - alltrue(xmm7); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - por(xmm4, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - por(xmm3, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - por(xmm4, xmm1); - // fm |= t & GSVector4i::xff000000(); - psrld(xmm1, 24); - pslld(xmm1, 24); - por(xmm3, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX_SSE() -{ - if (!m_sel.fwrite) - { - return; - } - - switch (m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(xmm5, xmm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if (m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - movdqa(xmm1, xmm6); - - modulate16(xmm6, xmm2, 1); - - pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - psrlw(xmm2, 7); - - paddw(xmm6, xmm2); - - clamp16(xmm6, xmm0); - - mix16(xmm6, xmm1, xmm0); - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - paddw(xmm5, xmm2); - - clamp16(xmm5, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? rb.srl16(7) : rb; - - if (m_sel.iip) - { - psrlw(xmm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog_SSE() -{ - if (!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - movdqa(xmm0, ptr[m_sel.prim != GS_SPRITE_CLASS ? 
&m_local.temp.f : &m_local.p.f]); - movdqa(xmm1, xmm6); - - movdqa(xmm2, ptr[&m_local.gd->frb]); - lerp16(xmm5, xmm2, xmm0, 0); - - movdqa(xmm2, ptr[&m_local.gd->fga]); - lerp16(xmm6, xmm2, xmm0, 0); - mix16(xmm6, xmm1, xmm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame_SSE() -{ - if (!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, ptr[esi]); - add(ebx, ptr[edi]); - and(ebx, HALF_VM_SIZE - 1); - - if (!m_sel.rfb) - { - return; - } - - ReadPixel_SSE(xmm2, ebx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE() -{ - if (!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - movdqa(xmm1, xmm2); - - if (m_sel.datm) - { - if (m_sel.fpsm == 2) - { - pxor(xmm0, xmm0); - // psrld(xmm1, 15); - pslld(xmm1, 16); - psrld(xmm1, 31); - pcmpeqd(xmm1, xmm0); - } - else - { - pcmpeqd(xmm0, xmm0); - pxor(xmm1, xmm0); - psrad(xmm1, 31); - } - } - else - { - if (m_sel.fpsm == 2) - { - pslld(xmm1, 16); - } - - psrad(xmm1, 31); - } - - por(xmm7, xmm1); - - alltrue(xmm7); -} - -void GSDrawScanlineCodeGenerator::WriteMask_SSE() -{ - if (m_sel.notest) - { - return; - } - - // fm |= test; - // zm |= test; - - if (m_sel.fwrite) - { - por(xmm3, xmm7); - } - - if (m_sel.zwrite) - { - por(xmm4, xmm7); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - pcmpeqd(xmm1, xmm1); - - if (m_sel.fwrite && m_sel.zwrite) - { - movdqa(xmm0, xmm1); - pcmpeqd(xmm1, xmm3); - pcmpeqd(xmm0, xmm4); - packssdw(xmm1, xmm0); - } - else if (m_sel.fwrite) - { - pcmpeqd(xmm1, xmm3); - packssdw(xmm1, xmm1); - } - else if (m_sel.zwrite) - { - pcmpeqd(xmm1, xmm4); - packssdw(xmm1, xmm1); - } - - pmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() -{ - if (!m_sel.zwrite) - { - return; - } - - movdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? 
&m_local.temp.zs : &m_local.p.z]); - - if (m_sel.ztest && m_sel.zpsm < 2) - { - // zs = zs.blend8(zd, zm); - - movdqa(xmm0, xmm4); - movdqa(xmm7, ptr[&m_local.temp.zd]); - blend8(xmm1, xmm7); - } - - // Clamp Z to ZPSM_FMT_MAX - if (m_sel.zclamp) - { - pcmpeqd(xmm7, xmm7); - psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); - pminsd(xmm1, xmm7); - } - - bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; - - WritePixel_SSE(xmm1, ebp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if ((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch (m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - split16_2x8(xmm0, xmm1, xmm2); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - movdqa(xmm0, xmm2); - movdqa(xmm1, xmm2); - movdqa(xmm4, xmm2); - - pcmpeqd(xmm7, xmm7); - psrld(xmm7, 27); // 0x0000001f - pand(xmm0, xmm7); - pslld(xmm0, 3); - - pslld(xmm7, 10); // 0x00007c00 - pand(xmm4, xmm7); - pslld(xmm4, 9); - - por(xmm0, xmm4); - - movdqa(xmm4, xmm1); - - psrld(xmm7, 5); // 0x000003e0 - pand(xmm1, xmm7); - psrld(xmm1, 2); - - psllw(xmm7, 10); // 0x00008000 - pand(xmm4, xmm7); - pslld(xmm4, 8); - - por(xmm1, xmm4); - - break; - } - } - - // xmm5, xmm6 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm2, xmm3 = used - // xmm4, xmm7 = free - - if (m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - movdqa(xmm4, xmm5); - } - - if (m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - movdqa(xmm5, xmm0); - break; - case 2: - pxor(xmm5, xmm5); - break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch (m_sel.abb) - { - case 0: - psubw(xmm5, 
xmm4); - break; - case 1: - psubw(xmm5, xmm0); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch (m_sel.abc) - { - case 0: - case 1: - pshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1)); - pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); - psllw(xmm7, 7); - break; - case 2: - movdqa(xmm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(xmm5, xmm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch (m_sel.abd) - { - case 0: - paddw(xmm5, xmm4); - break; - case 1: - paddw(xmm5, xmm0); - break; - case 2: - break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - movdqa(xmm5, xmm0); - break; - case 2: - pxor(xmm5, xmm5); - break; - } - } - - if (m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - movdqa(xmm0, xmm6); - pslld(xmm0, 8); - psrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - blend8r(xmm5, xmm4); - } - - // xmm6 = src ga - // xmm1 = dst ga - // xmm5 = rb - // xmm7 = a - // xmm2, xmm3 = used - // xmm0, xmm4 = free - - movdqa(xmm4, xmm6); - - if (m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch (m_sel.aba) - { - case 0: - break; - case 1: - movdqa(xmm6, xmm1); - break; - case 2: - pxor(xmm6, xmm6); - break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch (m_sel.abb) - { - case 0: - psubw(xmm6, xmm4); - break; - case 1: - psubw(xmm6, xmm1); - break; - case 2: - break; - } - - if (!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(xmm6, xmm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch (m_sel.abd) - { - case 0: - paddw(xmm6, xmm4); - break; - case 1: - paddw(xmm6, xmm1); - break; - case 2: - break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch (m_sel.abd) - { - case 0: - break; - case 1: - movdqa(xmm6, xmm1); - break; - case 2: - pxor(xmm6, 
xmm6); - break; - } - } - - // xmm4 = src ga - // xmm5 = rb - // xmm6 = ga - // xmm2, xmm3 = used - // xmm0, xmm1, xmm7 = free - - if (m_sel.pabe) - { - psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - blend8r(xmm6, xmm4); - } - else - { - if (m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(xmm6, xmm4, xmm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame_SSE() -{ - if (!m_sel.fwrite) - { - return; - } - - if (m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, ptr[esp + _top]); - and(eax, 3); - shl(eax, 5); - mov(ebp, ptr[&m_local.gd->dimx]); - paddw(xmm5, ptr[ebp + eax + sizeof(GSVector4i) * 0]); - paddw(xmm6, ptr[ebp + eax + sizeof(GSVector4i) * 1]); - } - - if (m_sel.colclamp == 0) - { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; - - pcmpeqd(xmm7, xmm7); - psrlw(xmm7, 8); - pand(xmm5, xmm7); - pand(xmm6, xmm7); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - movdqa(xmm7, xmm5); - punpcklwd(xmm5, xmm6); - punpckhwd(xmm7, xmm6); - packuswb(xmm5, xmm7); - - if (m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - pcmpeqd(xmm7, xmm7); - pslld(xmm7, 31); - por(xmm5, xmm7); - } - - if (m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - movd(xmm6, eax); - pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - movd(xmm7, eax); - pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - - movdqa(xmm4, xmm5); - pand(xmm4, xmm6); - pand(xmm5, xmm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - movdqa(xmm6, xmm4); - movdqa(xmm7, xmm5); - - psrld(xmm4, 3); - psrld(xmm6, 9); - psrld(xmm5, 6); - psrld(xmm7, 16); - - por(xmm5, xmm4); - por(xmm7, xmm6); - por(xmm5, xmm7); - } - - if (m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - 
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; - - WritePixel_SSE(xmm5, ebx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const Reg32& addr) -{ - movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); - movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if (m_sel.notest) - { - if (fast) - { - movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - } - else - { - WritePixel_SSE(src, addr, 0, psm); - WritePixel_SSE(src, addr, 1, psm); - WritePixel_SSE(src, addr, 2, psm); - WritePixel_SSE(src, addr, 3, psm); - } - } - else - { - if (fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel_SSE(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel_SSE(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel_SSE(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel_SSE(src, addr, 3, psm); - L("@@"); - } - } -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& addr, uint8 i, int psm) -{ - Address dst = 
ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - switch (psm) - { - case 0: - if (i == 0) - movd(dst, src); - else - { - pextrd(dst, src, i); - } - break; - case 1: - if (i == 0) - movd(eax, src); - else - { - pextrd(eax, src, i); - } - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - if (i == 0) - movd(eax, src); - else - pextrw(eax, src, i * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) -{ - // in - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // ebx = m_local.tex[0] (!m_sel.mmin) - // ebp = m_local.tex (m_sel.mmin) - // edx = m_local.clut (m_sel.tlu) - - // out - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - - ASSERT(pixels == 1 || pixels == 4); - - mip_offset *= sizeof(void*); - - const GSVector4i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i; - - if (m_sel.mmin && !m_sel.lcm) - { - const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; - - if (pixels == 4) - { - movdqa(ptr[&m_local.temp.test], xmm7); - } - - for (uint8 j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - for (int i = 0; i < pixels; i++) - { - ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } - } - - if (pixels == 4) - { - movdqa(xmm5, xmm7); - movdqa(xmm7, ptr[&m_local.temp.test]); - } - } - else - { - if (m_sel.mmin && m_sel.lcm) - { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - } - - const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - - for (int i = 0; i < pixels; i++) - { - for (uint8 j = 0; j < 4; j++) - { - ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i) -{ - const Address& src = m_sel.tlu ? 
ptr[edx + eax * 4] : ptr[ebx + eax * 4]; - - ASSERT(i == 0 || m_cpu.has(util::Cpu::tSSE41)); - - if (i == 0) - movd(eax, addr); - else - pextrd(eax, addr, i); - - if (m_sel.tlu) - movzx(eax, byte[ebx + eax]); - - if (i == 0) - movd(dst, src); - else - pinsrd(dst, src, i); -} - -#endif diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 68df3ad887..a19003ec4f 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -467,12 +467,6 @@ - - - - - - @@ -812,7 +806,6 @@ - diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index 8c2cea4496..c2fa268ccb 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -1520,24 +1520,6 @@ System\Ps2\GS\Renderers\Software - - System\Ps2\GS\Renderers\Software - - - System\Ps2\GS\Renderers\Software - - - System\Ps2\GS\Renderers\Software - - - System\Ps2\GS\Renderers\Software - - - System\Ps2\GS\Renderers\Software - - - System\Ps2\GS\Renderers\Software - System\Ps2\GS\Renderers\Software @@ -2496,9 +2478,6 @@ System\Ps2\GS - - System\Ps2\GS - System\Ps2\GS