From f9da2669a7e0b4e1bd28c90cdedb712293dccbf3 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Wed, 2 Mar 2011 08:32:30 +0000 Subject: [PATCH] GSdx: fixing the vs2008 project git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4382 96395faa-99c1-11dd-bbfe-3dabce05a288 --- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 3594 +++++++------- .../GSdx/GSDrawScanlineCodeGenerator.x64.cpp | 244 +- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 3940 ++++++++-------- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 4182 ++++++++--------- .../GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp | 696 +-- plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp | 724 +-- .../GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp | 664 +-- plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp | 696 +-- plugins/GSdx/GSVertexTrace.x64.avx.cpp | 990 ++-- plugins/GSdx/GSVertexTrace.x64.cpp | 1086 ++--- plugins/GSdx/GSVertexTrace.x86.avx.cpp | 968 ++-- plugins/GSdx/GSVertexTrace.x86.cpp | 1060 ++--- plugins/GSdx/GSdx_vs2008.vcproj | 312 ++ 13 files changed, 9734 insertions(+), 9422 deletions(-) diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index 24888b577a..a12271a3eb 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -1,1798 +1,1798 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSDrawScanlineCodeGenerator.h" - -#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -#error TODO: this is still bogus somewhere - -void GSDrawScanlineCodeGenerator::Generate() -{ - // TODO: on linux/mac rsi, rdi, xmm6-xmm15 are all caller saved - - push(rbx); - push(rsi); - push(rdi); - push(rbp); - push(r12); - push(r13); - - enter(10 * 16, true); - - for(int i = 6; i < 16; i++) - { - vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); - } - - movsxd(rcx, ecx); // right - movsxd(rdx, edx); // left - movsxd(r8, r8d); // top - - mov(r10, (size_t)&m_test[0]); - mov(r11, (size_t)&m_local); - mov(r12, (size_t)m_local.gd); - mov(r13, (size_t)m_local.gd->vm); - - Init(); - - // rcx = steps - // rsi = fza_base - // rdi = fza_offset - // r10 = &m_test[0] - // r11 = &m_local - // r12 = m_local->gd - // r13 = m_local->gd.vm - // xmm7 = vf (sprite && ltf) - // xmm8 = z - // xmm9 = f - // xmm10 = s - // xmm11 = t - // xmm12 = q - // xmm13 = rb - // xmm14 = ga - // xmm15 = test - - if(!m_sel.edge) - { - align(16); - } - -L("loop"); - - TestZ(xmm5, xmm6); - - // ebp = za - - SampleTexture(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - - AlphaTFX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - - ReadMask(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - TestAlpha(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - ColorTFX(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - Fog(); - - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - - ReadFrame(); - - // ebx = fa - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - TestDestAlpha(); - - // ebx = fa - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - WriteMask(); - - // ebx = fa - // edx = fzm - // ebp = za - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm5 = zm - // xmm6 = fd - - WriteZBuf(); - - // ebx = fa - // edx = fzm - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm6 = fd - - AlphaBlend(); - - // ebx = fa - // edx = fzm - // xmm2 = rb - // xmm3 = ga - // xmm4 = fm - // xmm6 = fd - - WriteFrame(); - -L("step"); - - // if(steps <= 0) break; - - if(!m_sel.edge) - { - test(rcx, rcx); - - jle("exit", T_NEAR); - - Step(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - for(int i = 6; i < 16; i++) - { - vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); - } - - leave(); - - pop(r13); - pop(r12); - pop(rbp); - pop(rdi); - pop(rsi); - pop(rbx); - - ret(); -} - -void GSDrawScanlineCodeGenerator::Init() -{ - // int skip = left & 3; - - mov(rbx, rdx); - and(rdx, 3); - - // left -= skip; - - sub(rbx, rdx); - - // int steps = right - left - 4; - - sub(rcx, rbx); - sub(rcx, 4); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(rdx, 4); - - vmovdqa(xmm15, ptr[rdx + r10]); - - mov(rax, rcx); - sar(rax, 63); - and(rax, rcx); - add(rax, 7); - shl(rax, 4); - - vpor(xmm15, ptr[rax + r10]); - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(rax, (size_t)m_local.gd->fzbr); - lea(rsi, ptr[rax + r8 * 8]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - mov(rax, (size_t)m_local.gd->fzbc); - lea(rdi, ptr[rax + rbx * 2]); - - if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - shl(rdx, 4); - lea(rdx, ptr[rdx + r11 + offsetof(GSScanlineLocalData, d)]); - } - - if(!m_sel.sprite) - { - if(m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vmovaps(xmm0, ptr[r9 + 16]); // v.p - - if(m_sel.fwrite && m_sel.fge) - { - // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(xmm9, xmm0); - vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm9, ptr[rdx + 16 * 6]); - } - - if(m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm8, ptr[rdx]); - } - } - } - else - { - if(m_sel.ztest) - { - vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]); - } - } - - if(m_sel.fb) - { - if(m_sel.edge || m_sel.tfx != TFX_NONE) - { - vmovaps(xmm0, ptr[r9 + 32]); // v.t - } - - if(m_sel.edge) - { - vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(xmm1, 9); - - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1); - } - - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(xmm0, xmm0); - - // si = vti.xxxx() + m_local.d[skip].si; - // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; - - vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(xmm10, ptr[rdx + 16 * 7]); - - if(!m_sel.sprite) - { - vpaddd(xmm11, ptr[rdx + 16 * 8]); - } - else - { - if(m_sel.ltf) - { - vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); - } - } - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm10, ptr[rdx + 16 * 1]); - vaddps(xmm11, ptr[rdx + 16 * 2]); - vaddps(xmm12, ptr[rdx + 16 * 3]); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vcvttps2dq(xmm0, ptr[r9]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm0, xmm1); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(xmm13, ptr[rdx + 16 * 4]); - vpaddw(xmm14, ptr[rdx + 16 * 5]); - } - else - { - vmovdqa(xmm13, ptr[&m_local.c.rb]); - vmovdqa(xmm14, ptr[&m_local.c.ga]); - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step() -{ - // steps -= 4; - - sub(rcx, 4); - - // fza_offset++; - - add(rdi, 8); - - if(!m_sel.sprite) - { - // z += m_local.d4.z; - - if(m_sel.zb) - { - vaddps(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, d4.z)]); - } - - // f = f.add16(m_local.d4.f); - - if(m_sel.fwrite && m_sel.fge) - { - vpaddw(xmm9, ptr[r11 + offsetof(GSScanlineLocalData, d4.f)]); - } - } - else - { - if(m_sel.ztest) - { - } - } - - if(m_sel.fb) - { - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i st = m_local.d4.st; - - // si += st.xxxx(); - // if(!sprite) ti += st.yyyy(); - - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.st)]); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm10, xmm1); - - if(!m_sel.sprite) - { - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm11, xmm1); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vmovaps(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); - - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm10, xmm1); - vaddps(xmm11, xmm2); - vaddps(xmm12, xmm3); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.c)]); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(xmm13, xmm1); - vpaddw(xmm14, xmm2); - } - else - { - if(m_sel.tfx == TFX_NONE) - { - } - } - } - } - - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(rdx, rcx); - sar(rdx, 63); - and(rdx, rcx); - add(rdx, 7); - shl(rdx, 4); - - vmovdqa(xmm15, ptr[rdx + r10]); -} - -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) -{ - if(!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - movsxd(rbp, dword[rsi + 4]); - movsxd(rax, dword[rdi + 4]); - add(rbp, rax); - - // GSVector4i zs = zi; - - if(!m_sel.sprite) - { - if(m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(rax, (size_t)&GSVector4::m_half); - - vbroadcastss(xmm0, ptr[rax]); - vmulps(xmm0, xmm8); - vcvttps2dq(xmm0, xmm0); - vpslld(xmm0, 1); - - vcvttps2dq(xmm1, xmm8); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm1, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // zs = GSVector4i(z); - - vcvttps2dq(xmm0, xmm8); - } - - if(m_sel.zwrite) - { - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0); - } - } - - if(m_sel.ztest) - { - ReadPixel(xmm1, rbp); - - if(m_sel.zwrite && m_sel.zpsm < 2) - { - vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if(m_sel.zpsm) - { - vpslld(xmm1, m_sel.zpsm * 8); - vpsrld(xmm1, m_sel.zpsm * 8); - } - - if(m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - vpcmpeqd(xmm2, xmm2); - vpslld(xmm2, 31); - - // GSVector4i zso = zs - o; - - vpsubd(xmm0, xmm2); - - // GSVector4i zdo = zd - o; - - vpsubd(xmm1, xmm2); - } - - switch(m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(xmm1, xmm0); - vpor(xmm15, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(xmm0, xmm1); - vpcmpeqd(xmm2, xmm2); - vpxor(xmm0, xmm2); - vpor(xmm15, xmm0); - break; - } - - alltrue(); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ - if(!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); - - // ebx = tex - - if(!m_sel.fst) - { - vrcpps(xmm0, xmm12); - - vmulps(xmm4, xmm10, xmm0); - vmulps(xmm5, xmm11, xmm0); - - vcvttps2dq(xmm4, xmm4); - vcvttps2dq(xmm5, xmm5); - - if(m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm0, eax); - vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm4, xmm0); - vpsubd(xmm5, xmm0); - } - } - else - { - vmovdqa(xmm4, xmm10); - vmovdqa(xmm5, xmm11); - } - - if(m_sel.ltf) - { - // GSVector4i uf = u.xxzzlh().srl16(1); - - vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); - - if(!m_sel.sprite) - { - // GSVector4i vf = v.xxzzlh().srl16(1); - - vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm7, 1); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm4, 16); - vpsrad(xmm5, 16); - vpackssdw(xmm4, xmm5); - - if(m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm0, xmm0); - vpsrlw(xmm0, 15); - vpaddw(xmm5, xmm4, xmm0); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap(xmm4, xmm5); - } - else - { - // uv0 = Wrap(uv0); - - Wrap(xmm4); - } - - // xmm4 = uv0 - // xmm5 = uv1 (ltf) - // xmm6 = uf - // xmm7 = vf - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm2, xmm4, xmm0); - vpunpckhwd(xmm3, xmm4, xmm0); - vpslld(xmm3, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = x0 - // xmm3 = y0 - // xmm5 = uv1 (ltf) - // xmm6 = uf - // xmm7 = vf - - if(m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - vpunpcklwd(xmm4, xmm5, xmm0); - vpunpckhwd(xmm5, xmm5, xmm0); - vpslld(xmm5, m_sel.tw + 3); - - // xmm2 = x0 - // xmm3 = y0 - // xmm4 = x1 - // xmm5 = y1 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm0, xmm3, xmm2); - vpaddd(xmm1, xmm3, xmm4); - vpaddd(xmm2, xmm5, xmm2); - vpaddd(xmm3, xmm5, xmm4); - - // xmm0 = addr00 - // xmm1 = addr01 - // xmm2 = addr10 - // xmm3 = addr11 - // xmm6 = uf - // xmm7 = vf - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm0, xmm0, xmm4, xmm5); - ReadTexel(xmm1, xmm1, xmm4, xmm5); - ReadTexel(xmm2, xmm2, xmm4, xmm5); - ReadTexel(xmm3, xmm3, xmm4, xmm5); - - // xmm0 = c00 - // xmm1 = c01 - // xmm2 = c10 - // xmm3 = c11 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - vpsllw(xmm4, xmm0, 8); - vpsrlw(xmm4, 8); - vpsrlw(xmm5, xmm0, 8); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - vpsllw(xmm0, xmm1, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, 8); - - // xmm0 = rb01 - // xmm1 = ga01 - // xmm2 = c10 - // xmm3 = c11 - // xmm4 = rb00 - // xmm5 = ga00 - // xmm6 = uf - // xmm7 = vf - - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); - - lerp16(xmm0, xmm4, xmm6, 0); - lerp16(xmm1, xmm5, xmm6, 0); - - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = c10 - // xmm3 = c11 - // xmm6 = uf - // xmm7 = vf - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - vpsrlw(xmm5, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm4, xmm2, 8); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - vpsrlw(xmm2, xmm3, 8); - vpsllw(xmm3, 8); - vpsrlw(xmm3, 8); - - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb11 - // xmm3 = ga11 - // xmm4 = rb10 - // xmm5 = ga10 - // xmm6 = uf - // xmm7 = vf - - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); - - lerp16(xmm2, xmm4, xmm6, 0); - lerp16(xmm3, xmm5, xmm6, 0); - - // xmm0 = rb00 - // xmm1 = ga00 - // xmm2 = rb10 - // xmm3 = ga10 - // xmm7 = vf - - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); - - lerp16(xmm2, xmm0, xmm7, 0); - lerp16(xmm3, xmm1, xmm7, 0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm3, xmm2); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm2, xmm3, xmm0, xmm1); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsrlw(xmm3, xmm2, 8); - vpsllw(xmm2, 8); - vpsrlw(xmm2, 8); - } - - // xmm2 = rb - // xmm3 = ga -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) -{ - // xmm0, xmm1, xmm2, xmm3 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv, xmm0); - } - - vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - } - else - { - vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - - if(region) - { - vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - } - } - } - else - { - vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv, xmm2); - - if(region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, xmm2); - vpminsw(uv, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm2, xmm3 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vpminsw(uv0, xmm0); - vpminsw(uv1, xmm0); - } - else - { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vpand(uv0, xmm0); - vpand(uv1, xmm0); - - if(region) - { - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vpor(uv0, xmm0); - vpor(uv1, xmm0); - } - } - } - else - { - vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); - vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv0, xmm2); - - if(region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, xmm2); - vpminsw(uv0, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, xmm1, xmm0); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv1, xmm2); - - if(region) - { - vpor(xmm1, xmm3); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, xmm2); - vpminsw(uv1, xmm3); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ - if(!m_sel.fb) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(xmm3, xmm14, 1); - clamp16(xmm3, xmm0); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - vpsrlw(xmm1, xmm14, 7); - mix16(xmm3, xmm1, xmm0); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - vpsrlw(xmm1, xmm14, 7); - mix16(xmm3, xmm1, xmm0); - } - - break; - - case TFX_HIGHLIGHT: - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(xmm1, xmm14, 7); - if(m_sel.tcc) vpaddusb(xmm1, xmm3); - mix16(xmm3, xmm1, xmm0); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - vpsrlw(xmm1, xmm14, 7); - mix16(xmm3, xmm1, xmm0); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if(m_sel.iip) - { - vpsrlw(xmm3, xmm14, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ - if(m_sel.fwrite) - { - vmovdqa(xmm4, ptr[r12 + offsetof(GSScanlineGlobalData, fm)]); - } - - if(m_sel.zwrite) - { - vmovdqa(xmm5, ptr[r12 + offsetof(GSScanlineGlobalData, zm)]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ - switch(m_sel.afail) - { - case AFAIL_FB_ONLY: - if(!m_sel.zwrite) return; - break; - - case AFAIL_ZB_ONLY: - if(!m_sel.fwrite) return; - break; - - case AFAIL_RGB_ONLY: - if(!m_sel.zwrite && m_sel.fpsm == 1) return; - break; - } - - switch(m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - vpcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpgtd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, xmm3, 16); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); - vpcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(xmm1, xmm3, 16); - vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); - break; - } - - switch(m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(xmm15, xmm1); - alltrue(); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(xmm5, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(xmm4, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(xmm5, xmm1); - // fm |= t & GSVector4i::xff000000(); - vpsrld(xmm1, 24); - vpslld(xmm1, 24); - vpor(xmm4, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ - if(!m_sel.fwrite) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(xmm2, xmm13, 1); - clamp16(xmm2, xmm0); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(xmm6, 7); - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(xmm1, xmm3); - modulate16(xmm3, xmm14, 1); - vpaddw(xmm3, xmm6); - clamp16(xmm3, xmm0); - mix16(xmm3, xmm1, xmm0); - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(xmm2, xmm13, 1); - vpaddw(xmm2, xmm6); - clamp16(xmm2, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? rb.srl16(7) : rb; - - if(m_sel.iip) - { - vpsrlw(xmm2, xmm13, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog() -{ - if(!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]); - vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]); - - vmovdqa(xmm6, xmm3); - - lerp16(xmm2, xmm0, xmm9, 0); - lerp16(xmm3, xmm1, xmm9, 0); - - mix16(xmm3, xmm6, xmm9); -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ - if(!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, dword[rsi]); - add(ebx, dword[rdi]); - movsxd(rbx, ebx); - - if(!m_sel.rfb) - { - return; - } - - ReadPixel(xmm6, rbx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ - if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if(m_sel.datm) - { - if(m_sel.fpsm == 2) - { - vpxor(xmm0, xmm0); - vpsrld(xmm1, xmm6, 15); - vpcmpeqd(xmm1, xmm0); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm6, xmm0); - vpsrad(xmm1, 31); - } - } - else - { - if(m_sel.fpsm == 2) - { - vpslld(xmm1, xmm6, 16); - vpsrad(xmm1, 31); - } - else - { - vpsrad(xmm1, xmm6, 31); - } - } - - vpor(xmm15, xmm1); - - alltrue(); -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ - // fm |= test; - // zm |= test; - - if(m_sel.fwrite) - { - vpor(xmm4, xmm15); - } - - if(m_sel.zwrite) - { - vpor(xmm5, xmm15); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - vpcmpeqd(xmm1, xmm1); - - if(m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(xmm0, xmm1, xmm5); - vpcmpeqd(xmm1, xmm4); - vpackssdw(xmm1, xmm0); - } - else if(m_sel.fwrite) - { - vpcmpeqd(xmm1, xmm4); - vpackssdw(xmm1, xmm1); - } - else if(m_sel.zwrite) - { - vpcmpeqd(xmm1, xmm5); - vpackssdw(xmm1, xmm1); - } - - vpmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ - if(!m_sel.zwrite) - { - return; - } - - bool fast = m_sel.ztest && m_sel.zpsm < 2; - - vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]); - - if(fast) - { - // zs = zs.blend8(zd, zm); - - vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm4); - } - - WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch(m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - vpsllw(xmm0, xmm6, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm6, 8); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(xmm15, xmm15); - - vpsrld(xmm15, 27); // 0x0000001f - vpand(xmm0, xmm6, xmm15); - vpslld(xmm0, 3); - - vpslld(xmm15, 10); // 0x00007c00 - vpand(xmm5, xmm6, xmm15); - vpslld(xmm5, 9); - - vpor(xmm0, xmm1); - - vpsrld(xmm15, 5); // 0x000003e0 - vpand(xmm1, xmm6, xmm15); - vpsrld(xmm1, 2); - - vpsllw(xmm15, 10); // 0x00008000 - vpand(xmm5, xmm6, xmm15); - vpslld(xmm5, 8); - - vpor(xmm1, xmm5); - - break; - } - } - - // xmm2, xmm3 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm5, xmm15 = free - - if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(xmm5, xmm2); - } - - if(m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch(m_sel.aba) - { - case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch(m_sel.abb) - { - case 0: vpsubw(xmm2, xmm5); break; - case 1: vpsubw(xmm2, xmm0); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch(m_sel.abc) - { - case 0: - case 1: - vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(xmm15, 7); - break; - case 2: - vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(xmm2, xmm15, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch(m_sel.abd) - { - case 0: vpaddw(xmm2, xmm5); break; - case 1: vpaddw(xmm2, xmm0); break; - case 2: break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch(m_sel.abd) - { - case 0: break; - case 1: vmovdqa(xmm2, xmm0); break; - case 2: vpxor(xmm2, xmm2); break; - } - } - - if(m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(xmm0, xmm3, 8); - vpsrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(xmm2, xmm5, xmm2, xmm0); - } - - // xmm0 = pabe mask - // xmm3 = src ga - // xmm1 = dst ga - // xmm2 = rb - // xmm15 = a - // xmm5 = free - - vmovdqa(xmm5, xmm3); - - if(m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch(m_sel.aba) - { - case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch(m_sel.abb) - { - case 0: vpsubw(xmm3, xmm5); break; - case 1: vpsubw(xmm3, xmm1); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(xmm3, xmm15, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch(m_sel.abd) - { - case 0: vpaddw(xmm3, xmm5); break; - case 1: vpaddw(xmm3, xmm1); break; - case 2: break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch(m_sel.abd) - { - case 0: break; - case 1: vmovdqa(xmm3, xmm1); break; - case 2: vpxor(xmm3, xmm3); break; - } - } - - // xmm0 = pabe mask - // xmm5 = src ga - // xmm2 = rb - // xmm3 = ga - // xmm1, xmm15 = free - - if(m_sel.pabe) - { - vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(xmm3, xmm5, xmm3, xmm0); - } - else - { - if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(xmm3, xmm5, xmm15); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.colclamp == 0) - { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; - - vpcmpeqd(xmm15, xmm15); - vpsrlw(xmm15, 8); - vpand(xmm2, xmm15); - vpand(xmm3, xmm15); - } - - if(m_sel.fpsm == 2 && m_sel.dthe) - { - mov(rax, r8); - and(rax, 3); - shl(rax, 5); - vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[0])]); - vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[1])]); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(xmm15, xmm2, xmm3); - vpunpcklwd(xmm2, xmm3); - vpackuswb(xmm2, xmm15); - - if(m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(xmm15, xmm15); - vpslld(xmm15, 31); - vpor(xmm2, xmm15); - } - - // xmm2 = fs - // xmm4 = fm - // xmm6 = fd - - if(m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm0, eax); - vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - vmovd(xmm1, eax); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); - - vpand(xmm0, xmm2); - vpand(xmm1, xmm2); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(xmm2, xmm0, 9); - vpsrld(xmm0, 3); - vpsrld(xmm3, xmm1, 16); - vpsrld(xmm1, 6); - - vpor(xmm0, xmm1); - vpor(xmm2, xmm3); - vpor(xmm2, xmm0); - } - - if(m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb && m_sel.fpsm < 2; - - WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) -{ - vmovq(dst, qword[r13 + addr * 2]); - vmovhps(dst, qword[r13 + addr * 2 + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if(fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[r13 + addr * 2], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[r13 + addr * 2 + 8 * 2], src); - L("@@"); - - // vmaskmovps? - } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); - } -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) -{ - Address dst = ptr[r13 + addr * 2 + s_offsets[i] * 2]; - - switch(psm) - { - case 0: - if(i == 0) vmovd(dst, src); - else vpextrd(dst, src, i); - break; - case 1: - if(i == 0) vmovd(eax, src); - else vpextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - vpextrw(eax, src, i * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) -{ - ReadTexel(dst, addr, 0); - ReadTexel(dst, addr, 1); - ReadTexel(dst, addr, 2); - ReadTexel(dst, addr, 3); -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) -{ - const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; - - vpextrd(eax, addr, i); - - movsxd(rax, eax); - - if(m_sel.tlu) movzx(rax, byte[rbx + rax]); - - vpinsrd(dst, src, i); -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSDrawScanlineCodeGenerator.h" + +#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +#error TODO: this is still bogus somewhere + +void GSDrawScanlineCodeGenerator::Generate() +{ + // TODO: on linux/mac rsi, rdi, xmm6-xmm15 are all caller saved + + push(rbx); + push(rsi); + push(rdi); + push(rbp); + push(r12); + push(r13); + + enter(10 * 16, true); + + for(int i = 6; i < 16; i++) + { + vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); + } + + movsxd(rcx, ecx); // right + movsxd(rdx, edx); // left + movsxd(r8, r8d); // top + + mov(r10, (size_t)&m_test[0]); + mov(r11, (size_t)&m_local); + mov(r12, (size_t)m_local.gd); + mov(r13, (size_t)m_local.gd->vm); + + Init(); + + // rcx = steps + // rsi = fza_base + // rdi = fza_offset + // r10 = &m_test[0] + // r11 = &m_local + // r12 = m_local->gd + // r13 = m_local->gd.vm + // xmm7 = vf (sprite && ltf) + // xmm8 = z + // xmm9 = f + // xmm10 = s + // xmm11 = t + // xmm12 = q + // xmm13 = rb + // xmm14 = ga + // xmm15 = test + + if(!m_sel.edge) + { + align(16); + } + +L("loop"); + + TestZ(xmm5, xmm6); + + // ebp = za + + SampleTexture(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + + AlphaTFX(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + + ReadMask(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + + TestAlpha(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + + ColorTFX(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + + Fog(); + + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + + ReadFrame(); + + // ebx = fa + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + // xmm6 = fd + + TestDestAlpha(); + + // ebx = fa + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + // xmm6 = fd + + WriteMask(); + + // ebx = fa + // edx = fzm + // ebp = za + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm5 = zm + // xmm6 = fd + + WriteZBuf(); + + // ebx = fa + // edx = fzm + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm6 = fd + + AlphaBlend(); + + // ebx = fa + // edx = fzm + // xmm2 = rb + // xmm3 = ga + // xmm4 = fm + // xmm6 = fd + + WriteFrame(); + +L("step"); + + // if(steps <= 0) break; + + if(!m_sel.edge) + { + test(rcx, rcx); + + jle("exit", T_NEAR); + + Step(); + + jmp("loop", T_NEAR); + } + +L("exit"); + + for(int i = 6; i < 16; i++) + { + vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); + } + + leave(); + + pop(r13); + pop(r12); + pop(rbp); + pop(rdi); + pop(rsi); + pop(rbx); + + ret(); +} + +void GSDrawScanlineCodeGenerator::Init() +{ + // int skip = left & 3; + + mov(rbx, rdx); + and(rdx, 3); + + // left -= skip; + + sub(rbx, rdx); + + // int steps = right - left - 4; + + sub(rcx, rbx); + sub(rcx, 4); + + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + + shl(rdx, 4); + + vmovdqa(xmm15, ptr[rdx + r10]); + + mov(rax, rcx); + sar(rax, 63); + and(rax, rcx); + add(rax, 7); + shl(rax, 4); + + vpor(xmm15, ptr[rax + r10]); + + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + + mov(rax, (size_t)m_local.gd->fzbr); + lea(rsi, ptr[rax + r8 * 8]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + + mov(rax, (size_t)m_local.gd->fzbc); + lea(rdi, ptr[rax + rbx * 2]); + + if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) + { + // edx = &m_local.d[skip] + + shl(rdx, 4); + lea(rdx, ptr[rdx + r11 + offsetof(GSScanlineLocalData, d)]); + } + + if(!m_sel.sprite) + { + if(m_sel.fwrite && m_sel.fge || m_sel.zb) + { + vmovaps(xmm0, ptr[r9 + 16]); // v.p + + if(m_sel.fwrite && m_sel.fge) + { + // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); + + vcvttps2dq(xmm9, xmm0); + vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2)); + vpaddw(xmm9, ptr[rdx + 16 * 6]); + } + + if(m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + + vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vaddps(xmm8, ptr[rdx]); + } + } + } + else + { + if(m_sel.ztest) + { + vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]); + } + } + + if(m_sel.fb) + { + if(m_sel.edge || m_sel.tfx != TFX_NONE) + { + vmovaps(xmm0, ptr[r9 + 32]); // v.t + } + + if(m_sel.edge) + { + vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + vpsrlw(xmm1, 9); + + vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1); + } + + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i vti(vt); + + vcvttps2dq(xmm0, xmm0); + + // si = vti.xxxx() + m_local.d[skip].si; + // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; + + vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddd(xmm10, ptr[rdx + 16 * 7]); + + if(!m_sel.sprite) + { + vpaddd(xmm11, ptr[rdx + 16 * 8]); + } + else + { + if(m_sel.ltf) + { + vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm6, 1); + } + } + } + else + { + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(xmm10, ptr[rdx + 16 * 1]); + vaddps(xmm11, ptr[rdx + 16 * 2]); + vaddps(xmm12, ptr[rdx + 16 * 3]); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + vcvttps2dq(xmm0, ptr[r9]); // v.c + + // vc = vc.upl16(vc.zwxy()); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(xmm0, xmm1); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vpaddw(xmm13, ptr[rdx + 16 * 4]); + vpaddw(xmm14, ptr[rdx + 16 * 5]); + } + else + { + vmovdqa(xmm13, ptr[&m_local.c.rb]); + vmovdqa(xmm14, ptr[&m_local.c.ga]); + } + } + } +} + +void GSDrawScanlineCodeGenerator::Step() +{ + // steps -= 4; + + sub(rcx, 4); + + // fza_offset++; + + add(rdi, 8); + + if(!m_sel.sprite) + { + // z += m_local.d4.z; + + if(m_sel.zb) + { + vaddps(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, d4.z)]); + } + + // f = f.add16(m_local.d4.f); + + if(m_sel.fwrite && m_sel.fge) + { + vpaddw(xmm9, ptr[r11 + offsetof(GSScanlineLocalData, d4.f)]); + } + } + else + { + if(m_sel.ztest) + { + } + } + + if(m_sel.fb) + { + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i st = m_local.d4.st; + + // si += st.xxxx(); + // if(!sprite) ti += st.yyyy(); + + vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.st)]); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpaddd(xmm10, xmm1); + + if(!m_sel.sprite) + { + vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vpaddd(xmm11, xmm1); + } + } + else + { + // GSVector4 stq = m_local.d4.stq; + + // s += stq.xxxx(); + // t += stq.yyyy(); + // q += stq.zzzz(); + + vmovaps(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.stq)]); + + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(xmm10, xmm1); + vaddps(xmm11, xmm2); + vaddps(xmm12, xmm3); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i c = m_local.d4.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + vmovdqa(xmm0, ptr[r11 + offsetof(GSScanlineLocalData, d4.c)]); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddw(xmm13, xmm1); + vpaddw(xmm14, xmm2); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + } + } + } + } + + // test = m_test[7 + (steps & (steps >> 31))]; + + mov(rdx, rcx); + sar(rdx, 63); + and(rdx, rcx); + add(rdx, 7); + shl(rdx, 4); + + vmovdqa(xmm15, ptr[rdx + r10]); +} + +void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +{ + if(!m_sel.zb) + { + return; + } + + // int za = fza_base.y + fza_offset->y; + + movsxd(rbp, dword[rsi + 4]); + movsxd(rax, dword[rdi + 4]); + add(rbp, rax); + + // GSVector4i zs = zi; + + if(!m_sel.sprite) + { + if(m_sel.zoverflow) + { + // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + mov(rax, (size_t)&GSVector4::m_half); + + vbroadcastss(xmm0, ptr[rax]); + vmulps(xmm0, xmm8); + vcvttps2dq(xmm0, xmm0); + vpslld(xmm0, 1); + + vcvttps2dq(xmm1, xmm8); + vpcmpeqd(xmm2, xmm2); + vpsrld(xmm2, 31); + vpand(xmm1, xmm2); + + vpor(xmm0, xmm1); + } + else + { + // zs = GSVector4i(z); + + vcvttps2dq(xmm0, xmm8); + } + + if(m_sel.zwrite) + { + vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0); + } + } + + if(m_sel.ztest) + { + ReadPixel(xmm1, rbp); + + if(m_sel.zwrite && m_sel.zpsm < 2) + { + vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1); + } + + // zd &= 0xffffffff >> m_sel.zpsm * 8; + + if(m_sel.zpsm) + { + vpslld(xmm1, m_sel.zpsm * 8); + vpsrld(xmm1, m_sel.zpsm * 8); + } + + if(m_sel.zoverflow || m_sel.zpsm == 0) + { + // GSVector4i o = GSVector4i::x80000000(); + + vpcmpeqd(xmm2, xmm2); + vpslld(xmm2, 31); + + // GSVector4i zso = zs - o; + + vpsubd(xmm0, xmm2); + + // GSVector4i zdo = zd - o; + + vpsubd(xmm1, xmm2); + } + + switch(m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + vpcmpgtd(xmm1, xmm0); + vpor(xmm15, xmm1); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + vpcmpgtd(xmm0, xmm1); + vpcmpeqd(xmm2, xmm2); + vpxor(xmm0, xmm2); + vpor(xmm15, xmm0); + break; + } + + alltrue(); + } +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ + if(!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); + + // ebx = tex + + if(!m_sel.fst) + { + vrcpps(xmm0, xmm12); + + vmulps(xmm4, xmm10, xmm0); + vmulps(xmm5, xmm11, xmm0); + + vcvttps2dq(xmm4, xmm4); + vcvttps2dq(xmm5, xmm5); + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + vmovd(xmm0, eax); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + + vpsubd(xmm4, xmm0); + vpsubd(xmm5, xmm0); + } + } + else + { + vmovdqa(xmm4, xmm10); + vmovdqa(xmm5, xmm11); + } + + if(m_sel.ltf) + { + // GSVector4i uf = u.xxzzlh().srl16(1); + + vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm6, 1); + + if(!m_sel.sprite) + { + // GSVector4i vf = v.xxzzlh().srl16(1); + + vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm7, 1); + } + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + vpsrad(xmm4, 16); + vpsrad(xmm5, 16); + vpackssdw(xmm4, xmm5); + + if(m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + vpcmpeqd(xmm0, xmm0); + vpsrlw(xmm0, 15); + vpaddw(xmm5, xmm4, xmm0); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(xmm4, xmm5); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(xmm4); + } + + // xmm4 = uv0 + // xmm5 = uv1 (ltf) + // xmm6 = uf + // xmm7 = vf + + // GSVector4i x0 = uv0.upl16(); + // GSVector4i y0 = uv0.uph16() << tw; + + vpxor(xmm0, xmm0); + + vpunpcklwd(xmm2, xmm4, xmm0); + vpunpckhwd(xmm3, xmm4, xmm0); + vpslld(xmm3, m_sel.tw + 3); + + // xmm0 = 0 + // xmm2 = x0 + // xmm3 = y0 + // xmm5 = uv1 (ltf) + // xmm6 = uf + // xmm7 = vf + + if(m_sel.ltf) + { + // GSVector4i x1 = uv1.upl16(); + // GSVector4i y1 = uv1.uph16() << tw; + + vpunpcklwd(xmm4, xmm5, xmm0); + vpunpckhwd(xmm5, xmm5, xmm0); + vpslld(xmm5, m_sel.tw + 3); + + // xmm2 = x0 + // xmm3 = y0 + // xmm4 = x1 + // xmm5 = y1 + // xmm6 = uf + // xmm7 = vf + + // GSVector4i addr00 = y0 + x0; + // GSVector4i addr01 = y0 + x1; + // GSVector4i addr10 = y1 + x0; + // GSVector4i addr11 = y1 + x1; + + vpaddd(xmm0, xmm3, xmm2); + vpaddd(xmm1, xmm3, xmm4); + vpaddd(xmm2, xmm5, xmm2); + vpaddd(xmm3, xmm5, xmm4); + + // xmm0 = addr00 + // xmm1 = addr01 + // xmm2 = addr10 + // xmm3 = addr11 + // xmm6 = uf + // xmm7 = vf + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm0, xmm0, xmm4, xmm5); + ReadTexel(xmm1, xmm1, xmm4, xmm5); + ReadTexel(xmm2, xmm2, xmm4, xmm5); + ReadTexel(xmm3, xmm3, xmm4, xmm5); + + // xmm0 = c00 + // xmm1 = c01 + // xmm2 = c10 + // xmm3 = c11 + // xmm6 = uf + // xmm7 = vf + + // GSVector4i rb00 = c00 & mask; + // GSVector4i ga00 = (c00 >> 8) & mask; + + vpsllw(xmm4, xmm0, 8); + vpsrlw(xmm4, 8); + vpsrlw(xmm5, xmm0, 8); + + // GSVector4i rb01 = c01 & mask; + // GSVector4i ga01 = (c01 >> 8) & mask; + + vpsllw(xmm0, xmm1, 8); + vpsrlw(xmm0, 8); + vpsrlw(xmm1, 8); + + // xmm0 = rb01 + // xmm1 = ga01 + // xmm2 = c10 + // xmm3 = c11 + // xmm4 = rb00 + // xmm5 = ga00 + // xmm6 = uf + // xmm7 = vf + + // rb00 = rb00.lerp16<0>(rb01, uf); + // ga00 = ga00.lerp16<0>(ga01, uf); + + lerp16(xmm0, xmm4, xmm6, 0); + lerp16(xmm1, xmm5, xmm6, 0); + + // xmm0 = rb00 + // xmm1 = ga00 + // xmm2 = c10 + // xmm3 = c11 + // xmm6 = uf + // xmm7 = vf + + // GSVector4i rb10 = c10 & mask; + // GSVector4i ga10 = (c10 >> 8) & mask; + + vpsrlw(xmm5, xmm2, 8); + vpsllw(xmm2, 8); + vpsrlw(xmm4, xmm2, 8); + + // GSVector4i rb11 = c11 & mask; + // GSVector4i ga11 = (c11 >> 8) & mask; + + vpsrlw(xmm2, xmm3, 8); + vpsllw(xmm3, 8); + vpsrlw(xmm3, 8); + + // xmm0 = rb00 + // xmm1 = ga00 + // xmm2 = rb11 + // xmm3 = ga11 + // xmm4 = rb10 + // xmm5 = ga10 + // xmm6 = uf + // xmm7 = vf + + // rb10 = rb10.lerp16<0>(rb11, uf); + // ga10 = ga10.lerp16<0>(ga11, uf); + + lerp16(xmm2, xmm4, xmm6, 0); + lerp16(xmm3, xmm5, xmm6, 0); + + // xmm0 = rb00 + // xmm1 = ga00 + // xmm2 = rb10 + // xmm3 = ga10 + // xmm7 = vf + + // rb00 = rb00.lerp16<0>(rb10, vf); + // ga00 = ga00.lerp16<0>(ga10, vf); + + lerp16(xmm2, xmm0, xmm7, 0); + lerp16(xmm3, xmm1, xmm7, 0); + } + else + { + // GSVector4i addr00 = y0 + x0; + + vpaddd(xmm3, xmm2); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm2, xmm3, xmm0, xmm1); + + // GSVector4i mask = GSVector4i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + vpsrlw(xmm3, xmm2, 8); + vpsllw(xmm2, 8); + vpsrlw(xmm2, 8); + } + + // xmm2 = rb + // xmm3 = ga +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +{ + // xmm0, xmm1, xmm2, xmm3 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + } + else + { + vpxor(xmm0, xmm0); + vpmaxsw(uv, xmm0); + } + + vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + } + else + { + vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + + if(region) + { + vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + } + } + } + else + { + vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv, xmm2); + + if(region) + { + vpor(xmm1, xmm3); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv, xmm2); + vpminsw(uv, xmm3); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv, xmm1, xmm0); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +{ + // xmm0, xmm1, xmm2, xmm3 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpmaxsw(uv0, xmm0); + vpmaxsw(uv1, xmm0); + } + else + { + vpxor(xmm0, xmm0); + vpmaxsw(uv0, xmm0); + vpmaxsw(uv1, xmm0); + } + + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpminsw(uv0, xmm0); + vpminsw(uv1, xmm0); + } + else + { + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vpand(uv0, xmm0); + vpand(uv1, xmm0); + + if(region) + { + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vpor(uv0, xmm0); + vpor(uv1, xmm0); + } + } + } + else + { + vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); + vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); + + // uv0 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv0, xmm2); + + if(region) + { + vpor(xmm1, xmm3); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv0, xmm2); + vpminsw(uv0, xmm3); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv0, xmm1, xmm0); + + // uv1 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv1, xmm2); + + if(region) + { + vpor(xmm1, xmm3); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv1, xmm2); + vpminsw(uv1, xmm3); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv1, xmm1, xmm0); + } +} + +void GSDrawScanlineCodeGenerator::AlphaTFX() +{ + if(!m_sel.fb) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // gat = gat.modulate16<1>(ga).clamp8(); + + modulate16(xmm3, xmm14, 1); + clamp16(xmm3, xmm0); + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); + } + + break; + + case TFX_DECAL: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); + } + + break; + + case TFX_HIGHLIGHT: + + // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); + + vpsrlw(xmm1, xmm14, 7); + if(m_sel.tcc) vpaddusb(xmm1, xmm3); + mix16(xmm3, xmm1, xmm0); + + break; + + case TFX_HIGHLIGHT2: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); + } + + break; + + case TFX_NONE: + + // gat = iip ? ga.srl16(7) : ga; + + if(m_sel.iip) + { + vpsrlw(xmm3, xmm14, 7); + } + + break; + } +} + +void GSDrawScanlineCodeGenerator::ReadMask() +{ + if(m_sel.fwrite) + { + vmovdqa(xmm4, ptr[r12 + offsetof(GSScanlineGlobalData, fm)]); + } + + if(m_sel.zwrite) + { + vmovdqa(xmm5, ptr[r12 + offsetof(GSScanlineGlobalData, zm)]); + } +} + +void GSDrawScanlineCodeGenerator::TestAlpha() +{ + switch(m_sel.afail) + { + case AFAIL_FB_ONLY: + if(!m_sel.zwrite) return; + break; + + case AFAIL_ZB_ONLY: + if(!m_sel.fwrite) return; + break; + + case AFAIL_RGB_ONLY: + if(!m_sel.zwrite && m_sel.fpsm == 1) return; + break; + } + + switch(m_sel.atst) + { + case ATST_NEVER: + // t = GSVector4i::xffffffff(); + vpcmpeqd(xmm1, xmm1); + break; + + case ATST_ALWAYS: + return; + + case ATST_LESS: + case ATST_LEQUAL: + // t = (ga >> 16) > m_local.gd->aref; + vpsrld(xmm1, xmm3, 16); + vpcmpgtd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + break; + + case ATST_EQUAL: + // t = (ga >> 16) != m_local.gd->aref; + vpsrld(xmm1, xmm3, 16); + vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpcmpeqd(xmm0, xmm0); + vpxor(xmm1, xmm0); + break; + + case ATST_GEQUAL: + case ATST_GREATER: + // t = (ga >> 16) < m_local.gd->aref; + vpsrld(xmm0, xmm3, 16); + vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + vpcmpgtd(xmm1, xmm0); + break; + + case ATST_NOTEQUAL: + // t = (ga >> 16) == m_local.gd->aref; + vpsrld(xmm1, xmm3, 16); + vpcmpeqd(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, aref)]); + break; + } + + switch(m_sel.afail) + { + case AFAIL_KEEP: + // test |= t; + vpor(xmm15, xmm1); + alltrue(); + break; + + case AFAIL_FB_ONLY: + // zm |= t; + vpor(xmm5, xmm1); + break; + + case AFAIL_ZB_ONLY: + // fm |= t; + vpor(xmm4, xmm1); + break; + + case AFAIL_RGB_ONLY: + // zm |= t; + vpor(xmm5, xmm1); + // fm |= t & GSVector4i::xff000000(); + vpsrld(xmm1, 24); + vpslld(xmm1, 24); + vpor(xmm4, xmm1); + break; + } +} + +void GSDrawScanlineCodeGenerator::ColorTFX() +{ + if(!m_sel.fwrite) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // rbt = rbt.modulate16<1>(rb).clamp8(); + + modulate16(xmm2, xmm13, 1); + clamp16(xmm2, xmm0); + + break; + + case TFX_DECAL: + + break; + + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + + vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); + vpsrlw(xmm6, 7); + + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + vmovdqa(xmm1, xmm3); + modulate16(xmm3, xmm14, 1); + vpaddw(xmm3, xmm6); + clamp16(xmm3, xmm0); + mix16(xmm3, xmm1, xmm0); + + // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); + + modulate16(xmm2, xmm13, 1); + vpaddw(xmm2, xmm6); + clamp16(xmm2, xmm0); + + break; + + case TFX_NONE: + + // rbt = iip ? rb.srl16(7) : rb; + + if(m_sel.iip) + { + vpsrlw(xmm2, xmm13, 7); + } + + break; + } +} + +void GSDrawScanlineCodeGenerator::Fog() +{ + if(!m_sel.fwrite || !m_sel.fge) + { + return; + } + + // rb = m_local.gd->frb.lerp16<0>(rb, f); + // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); + + vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]); + vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]); + + vmovdqa(xmm6, xmm3); + + lerp16(xmm2, xmm0, xmm9, 0); + lerp16(xmm3, xmm1, xmm9, 0); + + mix16(xmm3, xmm6, xmm9); +} + +void GSDrawScanlineCodeGenerator::ReadFrame() +{ + if(!m_sel.fb) + { + return; + } + + // int fa = fza_base.x + fza_offset->x; + + mov(ebx, dword[rsi]); + add(ebx, dword[rdi]); + movsxd(rbx, ebx); + + if(!m_sel.rfb) + { + return; + } + + ReadPixel(xmm6, rbx); +} + +void GSDrawScanlineCodeGenerator::TestDestAlpha() +{ + if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) + { + return; + } + + // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); + + if(m_sel.datm) + { + if(m_sel.fpsm == 2) + { + vpxor(xmm0, xmm0); + vpsrld(xmm1, xmm6, 15); + vpcmpeqd(xmm1, xmm0); + } + else + { + vpcmpeqd(xmm0, xmm0); + vpxor(xmm1, xmm6, xmm0); + vpsrad(xmm1, 31); + } + } + else + { + if(m_sel.fpsm == 2) + { + vpslld(xmm1, xmm6, 16); + vpsrad(xmm1, 31); + } + else + { + vpsrad(xmm1, xmm6, 31); + } + } + + vpor(xmm15, xmm1); + + alltrue(); +} + +void GSDrawScanlineCodeGenerator::WriteMask() +{ + // fm |= test; + // zm |= test; + + if(m_sel.fwrite) + { + vpor(xmm4, xmm15); + } + + if(m_sel.zwrite) + { + vpor(xmm5, xmm15); + } + + // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + + vpcmpeqd(xmm1, xmm1); + + if(m_sel.fwrite && m_sel.zwrite) + { + vpcmpeqd(xmm0, xmm1, xmm5); + vpcmpeqd(xmm1, xmm4); + vpackssdw(xmm1, xmm0); + } + else if(m_sel.fwrite) + { + vpcmpeqd(xmm1, xmm4); + vpackssdw(xmm1, xmm1); + } + else if(m_sel.zwrite) + { + vpcmpeqd(xmm1, xmm5); + vpackssdw(xmm1, xmm1); + } + + vpmovmskb(edx, xmm1); + + not(edx); +} + +void GSDrawScanlineCodeGenerator::WriteZBuf() +{ + if(!m_sel.zwrite) + { + return; + } + + bool fast = m_sel.ztest && m_sel.zpsm < 2; + + vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]); + + if(fast) + { + // zs = zs.blend8(zd, zm); + + vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm4); + } + + WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1); +} + +void GSDrawScanlineCodeGenerator::AlphaBlend() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.abe == 0 && m_sel.aa1 == 0) + { + return; + } + + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) + { + switch(m_sel.fpsm) + { + case 0: + case 1: + + // c[2] = fd & mask; + // c[3] = (fd >> 8) & mask; + + vpsllw(xmm0, xmm6, 8); + vpsrlw(xmm0, 8); + vpsrlw(xmm1, xmm6, 8); + + break; + + case 2: + + // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + + vpcmpeqd(xmm15, xmm15); + + vpsrld(xmm15, 27); // 0x0000001f + vpand(xmm0, xmm6, xmm15); + vpslld(xmm0, 3); + + vpslld(xmm15, 10); // 0x00007c00 + vpand(xmm5, xmm6, xmm15); + vpslld(xmm5, 9); + + vpor(xmm0, xmm1); + + vpsrld(xmm15, 5); // 0x000003e0 + vpand(xmm1, xmm6, xmm15); + vpsrld(xmm1, 2); + + vpsllw(xmm15, 10); // 0x00008000 + vpand(xmm5, xmm6, xmm15); + vpslld(xmm5, 8); + + vpor(xmm1, xmm5); + + break; + } + } + + // xmm2, xmm3 = src rb, ga + // xmm0, xmm1 = dst rb, ga + // xmm5, xmm15 = free + + if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) + { + vmovdqa(xmm5, xmm2); + } + + if(m_sel.aba != m_sel.abb) + { + // rb = c[aba * 2 + 0]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(xmm2, xmm0); break; + case 2: vpxor(xmm2, xmm2); break; + } + + // rb = rb.sub16(c[abb * 2 + 0]); + + switch(m_sel.abb) + { + case 0: vpsubw(xmm2, xmm5); break; + case 1: vpsubw(xmm2, xmm0); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; + + switch(m_sel.abc) + { + case 0: + case 1: + vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); + vpsllw(xmm15, 7); + break; + case 2: + vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); + break; + } + + // rb = rb.modulate16<1>(a); + + modulate16(xmm2, xmm15, 1); + } + + // rb = rb.add16(c[abd * 2 + 0]); + + switch(m_sel.abd) + { + case 0: vpaddw(xmm2, xmm5); break; + case 1: vpaddw(xmm2, xmm0); break; + case 2: break; + } + } + else + { + // rb = c[abd * 2 + 0]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(xmm2, xmm0); break; + case 2: vpxor(xmm2, xmm2); break; + } + } + + if(m_sel.pabe) + { + // mask = (c[1] << 8).sra32(31); + + vpslld(xmm0, xmm3, 8); + vpsrad(xmm0, 31); + + // rb = c[0].blend8(rb, mask); + + vpblendvb(xmm2, xmm5, xmm2, xmm0); + } + + // xmm0 = pabe mask + // xmm3 = src ga + // xmm1 = dst ga + // xmm2 = rb + // xmm15 = a + // xmm5 = free + + vmovdqa(xmm5, xmm3); + + if(m_sel.aba != m_sel.abb) + { + // ga = c[aba * 2 + 1]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(xmm3, xmm1); break; + case 2: vpxor(xmm3, xmm3); break; + } + + // ga = ga.sub16(c[abeb * 2 + 1]); + + switch(m_sel.abb) + { + case 0: vpsubw(xmm3, xmm5); break; + case 1: vpsubw(xmm3, xmm1); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // ga = ga.modulate16<1>(a); + + modulate16(xmm3, xmm15, 1); + } + + // ga = ga.add16(c[abd * 2 + 1]); + + switch(m_sel.abd) + { + case 0: vpaddw(xmm3, xmm5); break; + case 1: vpaddw(xmm3, xmm1); break; + case 2: break; + } + } + else + { + // ga = c[abd * 2 + 1]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(xmm3, xmm1); break; + case 2: vpxor(xmm3, xmm3); break; + } + } + + // xmm0 = pabe mask + // xmm5 = src ga + // xmm2 = rb + // xmm3 = ga + // xmm1, xmm15 = free + + if(m_sel.pabe) + { + vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) + + // ga = c[1].blend8(ga, mask).mix16(c[1]); + + vpblendvb(xmm3, xmm5, xmm3, xmm0); + } + else + { + if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx + { + mix16(xmm3, xmm5, xmm15); + } + } +} + +void GSDrawScanlineCodeGenerator::WriteFrame() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.colclamp == 0) + { + // c[0] &= 0x000000ff; + // c[1] &= 0x000000ff; + + vpcmpeqd(xmm15, xmm15); + vpsrlw(xmm15, 8); + vpand(xmm2, xmm15); + vpand(xmm3, xmm15); + } + + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(rax, r8); + and(rax, 3); + shl(rax, 5); + vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[0])]); + vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[1])]); + } + + // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); + + vpunpckhwd(xmm15, xmm2, xmm3); + vpunpcklwd(xmm2, xmm3); + vpackuswb(xmm2, xmm15); + + if(m_sel.fba && m_sel.fpsm != 1) + { + // fs |= 0x80000000; + + vpcmpeqd(xmm15, xmm15); + vpslld(xmm15, 31); + vpor(xmm2, xmm15); + } + + // xmm2 = fs + // xmm4 = fm + // xmm6 = fd + + if(m_sel.fpsm == 2) + { + // GSVector4i rb = fs & 0x00f800f8; + // GSVector4i ga = fs & 0x8000f800; + + mov(eax, 0x00f800f8); + vmovd(xmm0, eax); + vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + + mov(eax, 0x8000f800); + vmovd(xmm1, eax); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); + + vpand(xmm0, xmm2); + vpand(xmm1, xmm2); + + // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + + vpsrld(xmm2, xmm0, 9); + vpsrld(xmm0, 3); + vpsrld(xmm3, xmm1, 16); + vpsrld(xmm1, 6); + + vpor(xmm0, xmm1); + vpor(xmm2, xmm3); + vpor(xmm2, xmm0); + } + + if(m_sel.rfb) + { + // fs = fs.blend(fd, fm); + + blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm + } + + bool fast = m_sel.rfb && m_sel.fpsm < 2; + + WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); +} + +void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg64& addr) +{ + vmovq(dst, qword[r13 + addr * 2]); + vmovhps(dst, qword[r13 + addr * 2 + 8 * 2]); +} + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz) +{ + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); + + test(mask, 0x0f); + je("@f"); + vmovq(qword[r13 + addr * 2], src); + L("@@"); + + test(mask, 0xf0); + je("@f"); + vmovhps(qword[r13 + addr * 2 + 8 * 2], src); + L("@@"); + + // vmaskmovps? + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } +} + +static const int s_offsets[4] = {0, 2, 8, 10}; + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, uint8 i, int psm) +{ + Address dst = ptr[r13 + addr * 2 + s_offsets[i] * 2]; + + switch(psm) + { + case 0: + if(i == 0) vmovd(dst, src); + else vpextrd(dst, src, i); + break; + case 1: + if(i == 0) vmovd(eax, src); + else vpextrd(eax, src, i); + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + vpextrw(eax, src, i * 2); + mov(dst, ax); + break; + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) +{ + ReadTexel(dst, addr, 0); + ReadTexel(dst, addr, 1); + ReadTexel(dst, addr, 2); + ReadTexel(dst, addr, 3); +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +{ + const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; + + vpextrd(eax, addr, i); + + movsxd(rax, eax); + + if(m_sel.tlu) movzx(rax, byte[rbx + rax]); + + vpinsrd(dst, src, i); +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp index a7427545ee..d65d33c1c1 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.cpp @@ -1,123 +1,123 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#error TODO - -#include "stdafx.h" -#include "GSDrawScanlineCodeGenerator.h" - -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -void GSDrawScanlineCodeGenerator::Generate() -{ -} - -void GSDrawScanlineCodeGenerator::Init() -{ -} - -void GSDrawScanlineCodeGenerator::Step() -{ -} - -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) -{ -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) -{ -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) -{ -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ -} - -void GSDrawScanlineCodeGenerator::Fog() -{ -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) -{ -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) -{ -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) -{ -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) -{ -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) -{ -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#error TODO + +#include "stdafx.h" +#include "GSDrawScanlineCodeGenerator.h" + +#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +void GSDrawScanlineCodeGenerator::Generate() +{ +} + +void GSDrawScanlineCodeGenerator::Init() +{ +} + +void GSDrawScanlineCodeGenerator::Step() +{ +} + +void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +{ +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +{ +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +{ +} + +void GSDrawScanlineCodeGenerator::AlphaTFX() +{ +} + +void GSDrawScanlineCodeGenerator::ReadMask() +{ +} + +void GSDrawScanlineCodeGenerator::TestAlpha() +{ +} + +void GSDrawScanlineCodeGenerator::ColorTFX() +{ +} + +void GSDrawScanlineCodeGenerator::Fog() +{ +} + +void GSDrawScanlineCodeGenerator::ReadFrame() +{ +} + +void GSDrawScanlineCodeGenerator::TestDestAlpha() +{ +} + +void GSDrawScanlineCodeGenerator::WriteMask() +{ +} + +void GSDrawScanlineCodeGenerator::WriteZBuf() +{ +} + +void GSDrawScanlineCodeGenerator::AlphaBlend() +{ +} + +void GSDrawScanlineCodeGenerator::WriteFrame() +{ +} + +void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +{ +} + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +{ +} + +static const int s_offsets[4] = {0, 2, 8, 10}; + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +{ +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) +{ +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +{ +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 03094c9af9..085e212901 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -1,1971 +1,1971 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix) -// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4) - -#include "stdafx.h" -#include "GSDrawScanlineCodeGenerator.h" - -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -void GSDrawScanlineCodeGenerator::Generate() -{ - push(ebx); - push(esi); - push(edi); - push(ebp); - - Init(); - - if(!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // xmm0 = z/zi - // xmm2 = u (tme) - // xmm3 = v (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - bool tme = m_sel.tfx != TFX_NONE; - - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // - xmm0 - // xmm2 = u (tme) - // xmm3 = v (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - SampleTexture(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm2 - // - xmm3 - // - xmm4 - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - AlphaTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadMask(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ColorTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - Fog(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadFrame(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestDestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - WriteMask(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - - WriteZBuf(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // - ebp - // xmm2 = fd - // xmm3 = fm - // - xmm4 - // xmm5 = rb - // xmm6 = ga - - AlphaBlend(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // xmm2 = fd - // xmm3 = fm - // xmm5 = rb - // xmm6 = ga - - WriteFrame(); - -L("step"); - - // if(steps <= 0) break; - - if(!m_sel.edge) - { - test(ecx, ecx); - - jle("exit", T_NEAR); - - Step(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - // vzeroupper(); - - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); -} - -void GSDrawScanlineCodeGenerator::Init() -{ - // int skip = left & 3; - - mov(ebx, edx); - and(edx, 3); - - // left -= skip; - - sub(ebx, edx); - - // int steps = right - left - 4; - - sub(ecx, ebx); - sub(ecx, 4); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(edx, 4); - - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); - - vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(esi, dword[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, dword[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, dword[&m_local.gd->fzbc]); - - if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - shl(edx, 4); - lea(edx, ptr[edx + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, dword[esp + _v]); - } - - if(!m_sel.sprite) - { - if(m_sel.fwrite && m_sel.fge || m_sel.zb) - { - vmovaps(xmm0, ptr[ebx + 16]); // v.p - - if(m_sel.fwrite && m_sel.fge) - { - // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - vcvttps2dq(xmm1, xmm0); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpaddw(xmm1, ptr[edx + 16 * 6]); - - vmovdqa(ptr[&m_local.temp.f], xmm1); - } - - if(m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm0, ptr[edx]); - - vmovaps(ptr[&m_local.temp.z], xmm0); - } - } - } - else - { - if(m_sel.ztest) - { - vmovdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if(m_sel.fb) - { - if(m_sel.edge || m_sel.tfx != TFX_NONE) - { - vmovaps(xmm4, ptr[ebx + 32]); // v.t - } - - if(m_sel.edge) - { - vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); - vpsrlw(xmm3, 9); - - vmovdqa(ptr[&m_local.temp.cov], xmm3); - } - - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i vti(vt); - - vcvttps2dq(xmm4, xmm4); - - // si = vti.xxxx() + m_local.d[skip].si; - // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; - - vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddd(xmm2, ptr[edx + 16 * 7]); - - if(!m_sel.sprite) - { - vpaddd(xmm3, ptr[edx + 16 * 8]); - } - else - { - if(m_sel.ltf) - { - vpshuflw(xmm4, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm4, 1); - - vmovdqa(ptr[&m_local.temp.vf], xmm4); - } - } - - vmovdqa(ptr[&m_local.temp.s], xmm2); - vmovdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm2, ptr[edx + 16 * 1]); - vaddps(xmm3, ptr[edx + 16 * 2]); - vaddps(xmm4, ptr[edx + 16 * 3]); - - vmovaps(ptr[&m_local.temp.s], xmm2); - vmovaps(ptr[&m_local.temp.t], xmm3); - vmovaps(ptr[&m_local.temp.q], xmm4); - - vrcpps(xmm4, xmm4); - vmulps(xmm2, xmm4); - vmulps(xmm3, xmm4); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - vcvttps2dq(xmm6, ptr[ebx]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - vpshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm6, xmm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); - - vpaddw(xmm5, ptr[edx + 16 * 4]); - vpaddw(xmm6, ptr[edx + 16 * 5]); - - vmovdqa(ptr[&m_local.temp.rb], xmm5); - vmovdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if(m_sel.tfx == TFX_NONE) - { - vmovdqa(xmm5, ptr[&m_local.c.rb]); - vmovdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step() -{ - // steps -= 4; - - sub(ecx, 4); - - // fza_offset++; - - add(edi, 8); - - if(!m_sel.sprite) - { - // z += m_local.d4.z; - - if(m_sel.zb) - { - vmovaps(xmm0, ptr[&m_local.temp.z]); - vaddps(xmm0, ptr[&m_local.d4.z]); - vmovaps(ptr[&m_local.temp.z], xmm0); - } - - // f = f.add16(m_local.d4.f); - - if(m_sel.fwrite && m_sel.fge) - { - vmovdqa(xmm1, ptr[&m_local.temp.f]); - vpaddw(xmm1, ptr[&m_local.d4.f]); - vmovdqa(ptr[&m_local.temp.f], xmm1); - } - } - else - { - if(m_sel.ztest) - { - vmovdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if(m_sel.fb) - { - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i st = m_local.d4.st; - - // si += st.xxxx(); - // if(!sprite) ti += st.yyyy(); - - vmovdqa(xmm4, ptr[&m_local.d4.st]); - - vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vpaddd(xmm2, ptr[&m_local.temp.s]); - vmovdqa(ptr[&m_local.temp.s], xmm2); - - if(!m_sel.sprite) - { - vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vpaddd(xmm3, ptr[&m_local.temp.t]); - vmovdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - vmovdqa(xmm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - vmovaps(xmm4, ptr[&m_local.d4.stq]); - - vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - vaddps(xmm2, ptr[&m_local.temp.s]); - vaddps(xmm3, ptr[&m_local.temp.t]); - vaddps(xmm4, ptr[&m_local.temp.q]); - - vmovaps(ptr[&m_local.temp.s], xmm2); - vmovaps(ptr[&m_local.temp.t], xmm3); - vmovaps(ptr[&m_local.temp.q], xmm4); - - vrcpps(xmm4, xmm4); - vmulps(xmm2, xmm4); - vmulps(xmm3, xmm4); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - vmovdqa(xmm7, ptr[&m_local.d4.c]); - - vpshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); - - vpaddw(xmm5, ptr[&m_local.temp.rb]); - vpaddw(xmm6, ptr[&m_local.temp.ga]); - - vmovdqa(ptr[&m_local.temp.rb], xmm5); - vmovdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if(m_sel.tfx == TFX_NONE) - { - vmovdqa(xmm5, ptr[&m_local.c.rb]); - vmovdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } - - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); - - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); -} - -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) -{ - if(!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, dword[esi + 4]); - add(ebp, dword[edi + 4]); - - // GSVector4i zs = zi; - - if(!m_sel.sprite) - { - if(m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - vbroadcastss(temp1, ptr[&GSVector4::m_half]); - vmulps(temp1, xmm0); - vcvttps2dq(temp1, temp1); - vpslld(temp1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(temp2, temp2); - vpsrld(temp2, 31); - vpand(xmm0, temp2); - - vpor(xmm0, temp1); - } - else - { - // zs = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - if(m_sel.zwrite) - { - vmovdqa(ptr[&m_local.temp.zs], xmm0); - } - } - - if(m_sel.ztest) - { - ReadPixel(xmm1, ebp); - - if(m_sel.zwrite && m_sel.zpsm < 2) - { - vmovdqa(ptr[&m_local.temp.zd], xmm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if(m_sel.zpsm) - { - vpslld(xmm1, m_sel.zpsm * 8); - vpsrld(xmm1, m_sel.zpsm * 8); - } - - if(m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - vpcmpeqd(xmm4, xmm4); - vpslld(xmm4, 31); - - // GSVector4i zso = zs - o; - - vpsubd(xmm0, xmm4); - - // GSVector4i zdo = zd - o; - - vpsubd(xmm1, xmm4); - } - - switch(m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - vpcmpgtd(xmm1, xmm0); - vpor(xmm7, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - vpcmpgtd(xmm0, xmm1); - vpcmpeqd(xmm4, xmm4); - vpxor(xmm0, xmm4); - vpor(xmm7, xmm0); - break; - } - - alltrue(); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ - if(!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, dword[&m_local.gd->tex]); - - // ebx = tex - - if(!m_sel.fst) - { - // TODO: move these into Init/Step too? - - vcvttps2dq(xmm2, xmm2); - vcvttps2dq(xmm3, xmm3); - - if(m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - vmovd(xmm4, eax); - vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - vpsubd(xmm2, xmm4); - vpsubd(xmm3, xmm4); - } - } - - // xmm2 = u - // xmm3 = v - - if(m_sel.ltf) - { - // GSVector4i uf = u.xxzzlh().srl16(1); - - vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); - vmovdqa(ptr[&m_local.temp.uf], xmm0); - - if(!m_sel.sprite) - { - // GSVector4i vf = v.xxzzlh().srl16(1); - - vpshuflw(xmm1, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm1, 1); - vmovdqa(ptr[&m_local.temp.vf], xmm1); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - vpsrad(xmm2, 16); - vpsrad(xmm3, 16); - vpackssdw(xmm2, xmm3); - - if(m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - vpcmpeqd(xmm1, xmm1); - vpsrlw(xmm1, 15); - vpaddw(xmm3, xmm2, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i x0 = uv0.upl16(); - // GSVector4i y0 = uv0.uph16() << tw; - - vpxor(xmm0, xmm0); - - vpunpcklwd(xmm4, xmm2, xmm0); - vpunpckhwd(xmm2, xmm2, xmm0); - vpslld(xmm2, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if(m_sel.ltf) - { - // GSVector4i x1 = uv1.upl16(); - // GSVector4i y1 = uv1.uph16() << tw; - - vpunpcklwd(xmm6, xmm3, xmm0); - vpunpckhwd(xmm3, xmm3, xmm0); - vpslld(xmm3, m_sel.tw + 3); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - vpaddd(xmm5, xmm2, xmm4); - vpaddd(xmm2, xmm2, xmm6); - vpaddd(xmm0, xmm3, xmm4); - vpaddd(xmm3, xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm6, xmm5, xmm1, xmm4); - - // xmm2, xmm5, xmm1 = free - - ReadTexel(xmm4, xmm2, xmm5, xmm1); - - // xmm0, xmm2, xmm5 = free - - ReadTexel(xmm1, xmm0, xmm2, xmm5); - - // xmm3, xmm0, xmm2 = free - - ReadTexel(xmm5, xmm3, xmm0, xmm2); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - vmovdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - vpsllw(xmm2, xmm6, 8); - vpsrlw(xmm2, 8); - vpsrlw(xmm6, 8); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - vpsllw(xmm3, xmm4, 8); - vpsrlw(xmm3, 8); - vpsrlw(xmm4, 8); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); - - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - vpsrlw(xmm2, xmm1, 8); - vpsllw(xmm1, 8); - vpsrlw(xmm1, 8); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); - - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); - - vmovdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - vpaddd(xmm2, xmm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm5, xmm2, xmm0, xmm1); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - vpsrlw(xmm6, xmm5, 8); - vpsllw(xmm5, 8); - vpsrlw(xmm5, 8); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - vpmaxsw(uv, ptr[&m_local.gd->t.min]); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv, xmm0); - } - - vpminsw(uv, ptr[&m_local.gd->t.max]); - } - else - { - vpand(uv, ptr[&m_local.gd->t.min]); - - if(region) - { - vpor(uv, ptr[&m_local.gd->t.max]); - } - } - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv, xmm4); - - if(region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv, xmm4); - vpminsw(uv, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vpmaxsw(uv0, xmm4); - vpmaxsw(uv1, xmm4); - } - else - { - vpxor(xmm0, xmm0); - vpmaxsw(uv0, xmm0); - vpmaxsw(uv1, xmm0); - } - - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vpminsw(uv0, xmm5); - vpminsw(uv1, xmm5); - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vpand(uv0, xmm4); - vpand(uv1, xmm4); - - if(region) - { - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vpor(uv0, xmm5); - vpor(uv1, xmm5); - } - } - } - else - { - vmovdqa(xmm4, ptr[&m_local.gd->t.min]); - vmovdqa(xmm5, ptr[&m_local.gd->t.max]); - vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv0, xmm4); - - if(region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv0, xmm4); - vpminsw(uv0, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv0, xmm1, xmm0); - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - vpand(xmm1, uv1, xmm4); - - if(region) - { - vpor(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - vpmaxsw(uv1, xmm4); - vpminsw(uv1, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - vpblendvb(uv1, xmm1, xmm0); - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ - if(!m_sel.fb) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(xmm6, xmm4, 1); - - clamp16(xmm6, xmm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - vmovdqa(xmm2, xmm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - vpsrlw(xmm4, 7); - - if(m_sel.tcc) - { - vpaddusb(xmm4, xmm6); - } - - mix16(xmm6, xmm4, xmm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - vmovdqa(xmm2, xmm4); - - vpsrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if(m_sel.iip) - { - vpsrlw(xmm6, 7); - } - - break; - } - - if(m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if(!m_sel.abe) - { - // a = cov - - if(m_sel.edge) - { - vmovdqa(xmm0, ptr[&m_local.temp.cov]); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - } - - mix16(xmm6, xmm0, xmm1); - } - else - { - // a = a == 0x80 ? cov : a - - vpcmpeqd(xmm0, xmm0); - vpsllw(xmm0, 15); - vpsrlw(xmm0, 8); - - if(m_sel.edge) - { - vmovdqa(xmm1, ptr[&m_local.temp.cov]); - } - else - { - vmovdqa(xmm1, xmm0); - } - - vpcmpeqw(xmm0, xmm6); - vpsrld(xmm0, 16); - vpslld(xmm0, 16); - - vpblendvb(xmm6, xmm1, xmm0); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ - if(m_sel.fwrite) - { - vmovdqa(xmm3, ptr[&m_local.gd->fm]); - } - - if(m_sel.zwrite) - { - vmovdqa(xmm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ - switch(m_sel.afail) - { - case AFAIL_FB_ONLY: - if(!m_sel.zwrite) return; - break; - - case AFAIL_ZB_ONLY: - if(!m_sel.fwrite) return; - break; - - case AFAIL_RGB_ONLY: - if(!m_sel.zwrite && m_sel.fpsm == 1) return; - break; - } - - switch(m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - vpcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpgtd(xmm1, ptr[&m_local.gd->aref]); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - vpsrld(xmm0, xmm6, 16); - vmovdqa(xmm1, ptr[&m_local.gd->aref]); - vpcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - vpsrld(xmm1, xmm6, 16); - vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); - break; - } - - switch(m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - vpor(xmm7, xmm1); - alltrue(); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - vpor(xmm4, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - vpor(xmm3, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - vpor(xmm4, xmm1); - // fm |= t & GSVector4i::xff000000(); - vpsrld(xmm1, 24); - vpslld(xmm1, 24); - vpor(xmm3, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ - if(!m_sel.fwrite) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(xmm5, xmm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - vmovdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(xmm1, xmm6); - - modulate16(xmm6, xmm2, 1); - - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - vpsrlw(xmm2, 7); - - vpaddw(xmm6, xmm2); - - clamp16(xmm6, xmm0); - - mix16(xmm6, xmm1, xmm0); - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - vpaddw(xmm5, xmm2); - - clamp16(xmm5, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? rb.srl16(7) : rb; - - if(m_sel.iip) - { - vpsrlw(xmm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog() -{ - if(!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - vmovdqa(xmm0, ptr[!m_sel.sprite ? &m_local.temp.f : &m_local.p.f]); - vmovdqa(xmm1, xmm6); - - vmovdqa(xmm2, ptr[&m_local.gd->frb]); - lerp16(xmm5, xmm2, xmm0, 0); - - vmovdqa(xmm2, ptr[&m_local.gd->fga]); - lerp16(xmm6, xmm2, xmm0, 0); - mix16(xmm6, xmm1, xmm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ - if(!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, dword[esi]); - add(ebx, dword[edi]); - - if(!m_sel.rfb) - { - return; - } - - ReadPixel(xmm2, ebx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ - if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - if(m_sel.datm) - { - if(m_sel.fpsm == 2) - { - vpxor(xmm0, xmm0); - vpsrld(xmm1, xmm2, 15); - vpcmpeqd(xmm1, xmm0); - } - else - { - vpcmpeqd(xmm0, xmm0); - vpxor(xmm1, xmm2, xmm0); - vpsrad(xmm1, 31); - } - } - else - { - if(m_sel.fpsm == 2) - { - vpslld(xmm1, xmm2, 16); - vpsrad(xmm1, 31); - } - else - { - vpsrad(xmm1, xmm2, 31); - } - } - - vpor(xmm7, xmm1); - - alltrue(); -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ - // fm |= test; - // zm |= test; - - if(m_sel.fwrite) - { - vpor(xmm3, xmm7); - } - - if(m_sel.zwrite) - { - vpor(xmm4, xmm7); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - vpcmpeqd(xmm1, xmm1); - - if(m_sel.fwrite && m_sel.zwrite) - { - vpcmpeqd(xmm0, xmm1, xmm4); - vpcmpeqd(xmm1, xmm3); - vpackssdw(xmm1, xmm0); - } - else if(m_sel.fwrite) - { - vpcmpeqd(xmm1, xmm3); - vpackssdw(xmm1, xmm1); - } - else if(m_sel.zwrite) - { - vpcmpeqd(xmm1, xmm4); - vpackssdw(xmm1, xmm1); - } - - vpmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ - if(!m_sel.zwrite) - { - return; - } - - bool fast = m_sel.ztest && m_sel.zpsm < 2; - - vmovdqa(xmm1, ptr[!m_sel.sprite ? &m_local.temp.zs : &m_local.p.z]); - - if(fast) - { - // zs = zs.blend8(zd, zm); - - vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); - } - - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch(m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - vpsllw(xmm0, xmm2, 8); - vpsrlw(xmm0, 8); - vpsrlw(xmm1, xmm2, 8); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - vpcmpeqd(xmm7, xmm7); - - vpsrld(xmm7, 27); // 0x0000001f - vpand(xmm0, xmm2, xmm7); - vpslld(xmm0, 3); - - vpslld(xmm7, 10); // 0x00007c00 - vpand(xmm4, xmm2, xmm7); - vpslld(xmm4, 9); - - vpor(xmm0, xmm4); - - vpsrld(xmm7, 5); // 0x000003e0 - vpand(xmm1, xmm2, xmm7); - vpsrld(xmm1, 2); - - vpsllw(xmm7, 10); // 0x00008000 - vpand(xmm4, xmm2, xmm7); - vpslld(xmm4, 8); - - vpor(xmm1, xmm4); - - break; - } - } - - // xmm5, xmm6 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm2, xmm3 = used - // xmm4, xmm7 = free - - if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - vmovdqa(xmm4, xmm5); - } - - if(m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch(m_sel.aba) - { - case 0: break; - case 1: vmovdqa(xmm5, xmm0); break; - case 2: vpxor(xmm5, xmm5); break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch(m_sel.abb) - { - case 0: vpsubw(xmm5, xmm4); break; - case 1: vpsubw(xmm5, xmm0); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch(m_sel.abc) - { - case 0: - case 1: - vpshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1)); - vpshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); - vpsllw(xmm7, 7); - break; - case 2: - vmovdqa(xmm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(xmm5, xmm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch(m_sel.abd) - { - case 0: vpaddw(xmm5, xmm4); break; - case 1: vpaddw(xmm5, xmm0); break; - case 2: break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch(m_sel.abd) - { - case 0: break; - case 1: vmovdqa(xmm5, xmm0); break; - case 2: vpxor(xmm5, xmm5); break; - } - } - - if(m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - vpslld(xmm0, xmm6, 8); - vpsrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - vpblendvb(xmm5, xmm4, xmm5, xmm0); - } - - // xmm6 = src ga - // xmm1 = dst ga - // xmm5 = rb - // xmm7 = a - // xmm2, xmm3 = used - // xmm0, xmm4 = free - - vmovdqa(xmm4, xmm6); - - if(m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch(m_sel.aba) - { - case 0: break; - case 1: vmovdqa(xmm6, xmm1); break; - case 2: vpxor(xmm6, xmm6); break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch(m_sel.abb) - { - case 0: vpsubw(xmm6, xmm4); break; - case 1: vpsubw(xmm6, xmm1); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(xmm6, xmm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch(m_sel.abd) - { - case 0: vpaddw(xmm6, xmm4); break; - case 1: vpaddw(xmm6, xmm1); break; - case 2: break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch(m_sel.abd) - { - case 0: break; - case 1: vmovdqa(xmm6, xmm1); break; - case 2: vpxor(xmm6, xmm6); break; - } - } - - // xmm4 = src ga - // xmm5 = rb - // xmm6 = ga - // xmm2, xmm3 = used - // xmm0, xmm1, xmm7 = free - - if(m_sel.pabe) - { - vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - vpblendvb(xmm6, xmm4, xmm6, xmm0); - } - else - { - if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(xmm6, xmm4, xmm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.colclamp == 0) - { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; - - vpcmpeqd(xmm7, xmm7); - vpsrlw(xmm7, 8); - vpand(xmm5, xmm7); - vpand(xmm6, xmm7); - } - - if(m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, dword[esp + _top]); - and(eax, 3); - shl(eax, 5); - vpaddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); - vpaddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - vpunpckhwd(xmm7, xmm5, xmm6); - vpunpcklwd(xmm5, xmm6); - vpackuswb(xmm5, xmm7); - - if(m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - vpcmpeqd(xmm7, xmm7); - vpslld(xmm7, 31); - vpor(xmm5, xmm7); - } - - if(m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - vmovd(xmm6, eax); - vpshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - vmovd(xmm7, eax); - vpshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - - vpand(xmm4, xmm5, xmm6); - vpand(xmm5, xmm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - vpsrld(xmm6, xmm4, 9); - vpsrld(xmm4, 3); - vpsrld(xmm7, xmm5, 16); - vpsrld(xmm5, 6); - - vpor(xmm5, xmm4); - vpor(xmm7, xmm6); - vpor(xmm5, xmm7); - } - - if(m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb && m_sel.fpsm < 2; - - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) -{ - vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); - vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if(fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - - // vmaskmovps? - } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); - } -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) -{ - Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - switch(psm) - { - case 0: - if(i == 0) vmovd(dst, src); - else vpextrd(dst, src, i); - break; - case 1: - if(i == 0) vmovd(eax, src); - else vpextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - vpextrw(eax, src, i * 2); - mov(dst, ax); - break; - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) -{ - ReadTexel(dst, addr, 0); - ReadTexel(dst, addr, 1); - ReadTexel(dst, addr, 2); - ReadTexel(dst, addr, 3); -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) -{ - const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4]; - - if(i == 0) vmovd(eax, addr); - else vpextrd(eax, addr, i); - - if(m_sel.tlu) movzx(eax, byte[ebx + eax]); - - if(i == 0) vmovd(dst, src); - else vpinsrd(dst, src, i); -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix) +// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4) + +#include "stdafx.h" +#include "GSDrawScanlineCodeGenerator.h" + +#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +static const int _args = 16; +static const int _top = _args + 4; +static const int _v = _args + 8; + +void GSDrawScanlineCodeGenerator::Generate() +{ + push(ebx); + push(esi); + push(edi); + push(ebp); + + Init(); + + if(!m_sel.edge) + { + align(16); + } + +L("loop"); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // xmm0 = z/zi + // xmm2 = u (tme) + // xmm3 = v (tme) + // xmm5 = rb (!tme) + // xmm6 = ga (!tme) + // xmm7 = test + + bool tme = m_sel.tfx != TFX_NONE; + + TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // - xmm0 + // xmm2 = u (tme) + // xmm3 = v (tme) + // xmm5 = rb (!tme) + // xmm6 = ga (!tme) + // xmm7 = test + + SampleTexture(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // - xmm2 + // - xmm3 + // - xmm4 + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + AlphaTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ReadMask(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + TestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ColorTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + Fog(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ReadFrame(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + TestDestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + WriteMask(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + + WriteZBuf(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // - ebp + // xmm2 = fd + // xmm3 = fm + // - xmm4 + // xmm5 = rb + // xmm6 = ga + + AlphaBlend(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // xmm2 = fd + // xmm3 = fm + // xmm5 = rb + // xmm6 = ga + + WriteFrame(); + +L("step"); + + // if(steps <= 0) break; + + if(!m_sel.edge) + { + test(ecx, ecx); + + jle("exit", T_NEAR); + + Step(); + + jmp("loop", T_NEAR); + } + +L("exit"); + + // vzeroupper(); + + pop(ebp); + pop(edi); + pop(esi); + pop(ebx); + + ret(8); +} + +void GSDrawScanlineCodeGenerator::Init() +{ + // int skip = left & 3; + + mov(ebx, edx); + and(edx, 3); + + // left -= skip; + + sub(ebx, edx); + + // int steps = right - left - 4; + + sub(ecx, ebx); + sub(ecx, 4); + + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + + shl(edx, 4); + + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); + + vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); + + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + + mov(esi, dword[esp + _top]); + lea(esi, ptr[esi * 8]); + add(esi, dword[&m_local.gd->fzbr]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + + lea(edi, ptr[ebx * 2]); + add(edi, dword[&m_local.gd->fzbc]); + + if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) + { + // edx = &m_local.d[skip] + + shl(edx, 4); + lea(edx, ptr[edx + (size_t)m_local.d]); + + // ebx = &v + + mov(ebx, dword[esp + _v]); + } + + if(!m_sel.sprite) + { + if(m_sel.fwrite && m_sel.fge || m_sel.zb) + { + vmovaps(xmm0, ptr[ebx + 16]); // v.p + + if(m_sel.fwrite && m_sel.fge) + { + // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); + + vcvttps2dq(xmm1, xmm0); + vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpaddw(xmm1, ptr[edx + 16 * 6]); + + vmovdqa(ptr[&m_local.temp.f], xmm1); + } + + if(m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + + vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + vaddps(xmm0, ptr[edx]); + + vmovaps(ptr[&m_local.temp.z], xmm0); + } + } + } + else + { + if(m_sel.ztest) + { + vmovdqa(xmm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.edge || m_sel.tfx != TFX_NONE) + { + vmovaps(xmm4, ptr[ebx + 32]); // v.t + } + + if(m_sel.edge) + { + vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); + vpsrlw(xmm3, 9); + + vmovdqa(ptr[&m_local.temp.cov], xmm3); + } + + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i vti(vt); + + vcvttps2dq(xmm4, xmm4); + + // si = vti.xxxx() + m_local.d[skip].si; + // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; + + vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddd(xmm2, ptr[edx + 16 * 7]); + + if(!m_sel.sprite) + { + vpaddd(xmm3, ptr[edx + 16 * 8]); + } + else + { + if(m_sel.ltf) + { + vpshuflw(xmm4, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm4, 1); + + vmovdqa(ptr[&m_local.temp.vf], xmm4); + } + } + + vmovdqa(ptr[&m_local.temp.s], xmm2); + vmovdqa(ptr[&m_local.temp.t], xmm3); + } + else + { + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(xmm2, ptr[edx + 16 * 1]); + vaddps(xmm3, ptr[edx + 16 * 2]); + vaddps(xmm4, ptr[edx + 16 * 3]); + + vmovaps(ptr[&m_local.temp.s], xmm2); + vmovaps(ptr[&m_local.temp.t], xmm3); + vmovaps(ptr[&m_local.temp.q], xmm4); + + vrcpps(xmm4, xmm4); + vmulps(xmm2, xmm4); + vmulps(xmm3, xmm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + vcvttps2dq(xmm6, ptr[ebx]); // v.c + + // vc = vc.upl16(vc.zwxy()); + + vpshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(xmm6, xmm5); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + vpshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); + + vpaddw(xmm5, ptr[edx + 16 * 4]); + vpaddw(xmm6, ptr[edx + 16 * 5]); + + vmovdqa(ptr[&m_local.temp.rb], xmm5); + vmovdqa(ptr[&m_local.temp.ga], xmm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + vmovdqa(xmm5, ptr[&m_local.c.rb]); + vmovdqa(xmm6, ptr[&m_local.c.ga]); + } + } + } + } +} + +void GSDrawScanlineCodeGenerator::Step() +{ + // steps -= 4; + + sub(ecx, 4); + + // fza_offset++; + + add(edi, 8); + + if(!m_sel.sprite) + { + // z += m_local.d4.z; + + if(m_sel.zb) + { + vmovaps(xmm0, ptr[&m_local.temp.z]); + vaddps(xmm0, ptr[&m_local.d4.z]); + vmovaps(ptr[&m_local.temp.z], xmm0); + } + + // f = f.add16(m_local.d4.f); + + if(m_sel.fwrite && m_sel.fge) + { + vmovdqa(xmm1, ptr[&m_local.temp.f]); + vpaddw(xmm1, ptr[&m_local.d4.f]); + vmovdqa(ptr[&m_local.temp.f], xmm1); + } + } + else + { + if(m_sel.ztest) + { + vmovdqa(xmm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i st = m_local.d4.st; + + // si += st.xxxx(); + // if(!sprite) ti += st.yyyy(); + + vmovdqa(xmm4, ptr[&m_local.d4.st]); + + vpshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + vpaddd(xmm2, ptr[&m_local.temp.s]); + vmovdqa(ptr[&m_local.temp.s], xmm2); + + if(!m_sel.sprite) + { + vpshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + vpaddd(xmm3, ptr[&m_local.temp.t]); + vmovdqa(ptr[&m_local.temp.t], xmm3); + } + else + { + vmovdqa(xmm3, ptr[&m_local.temp.t]); + } + } + else + { + // GSVector4 stq = m_local.d4.stq; + + // s += stq.xxxx(); + // t += stq.yyyy(); + // q += stq.zzzz(); + + vmovaps(xmm4, ptr[&m_local.d4.stq]); + + vshufps(xmm2, xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm3, xmm4, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm4, xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(xmm2, ptr[&m_local.temp.s]); + vaddps(xmm3, ptr[&m_local.temp.t]); + vaddps(xmm4, ptr[&m_local.temp.q]); + + vmovaps(ptr[&m_local.temp.s], xmm2); + vmovaps(ptr[&m_local.temp.t], xmm3); + vmovaps(ptr[&m_local.temp.q], xmm4); + + vrcpps(xmm4, xmm4); + vmulps(xmm2, xmm4); + vmulps(xmm3, xmm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i c = m_local.d4.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + vmovdqa(xmm7, ptr[&m_local.d4.c]); + + vpshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddw(xmm5, ptr[&m_local.temp.rb]); + vpaddw(xmm6, ptr[&m_local.temp.ga]); + + vmovdqa(ptr[&m_local.temp.rb], xmm5); + vmovdqa(ptr[&m_local.temp.ga], xmm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + vmovdqa(xmm5, ptr[&m_local.c.rb]); + vmovdqa(xmm6, ptr[&m_local.c.ga]); + } + } + } + } + + // test = m_test[7 + (steps & (steps >> 31))]; + + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); + + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); +} + +void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +{ + if(!m_sel.zb) + { + return; + } + + // int za = fza_base.y + fza_offset->y; + + mov(ebp, dword[esi + 4]); + add(ebp, dword[edi + 4]); + + // GSVector4i zs = zi; + + if(!m_sel.sprite) + { + if(m_sel.zoverflow) + { + // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + vbroadcastss(temp1, ptr[&GSVector4::m_half]); + vmulps(temp1, xmm0); + vcvttps2dq(temp1, temp1); + vpslld(temp1, 1); + + vcvttps2dq(xmm0, xmm0); + vpcmpeqd(temp2, temp2); + vpsrld(temp2, 31); + vpand(xmm0, temp2); + + vpor(xmm0, temp1); + } + else + { + // zs = GSVector4i(z); + + vcvttps2dq(xmm0, xmm0); + } + + if(m_sel.zwrite) + { + vmovdqa(ptr[&m_local.temp.zs], xmm0); + } + } + + if(m_sel.ztest) + { + ReadPixel(xmm1, ebp); + + if(m_sel.zwrite && m_sel.zpsm < 2) + { + vmovdqa(ptr[&m_local.temp.zd], xmm1); + } + + // zd &= 0xffffffff >> m_sel.zpsm * 8; + + if(m_sel.zpsm) + { + vpslld(xmm1, m_sel.zpsm * 8); + vpsrld(xmm1, m_sel.zpsm * 8); + } + + if(m_sel.zoverflow || m_sel.zpsm == 0) + { + // GSVector4i o = GSVector4i::x80000000(); + + vpcmpeqd(xmm4, xmm4); + vpslld(xmm4, 31); + + // GSVector4i zso = zs - o; + + vpsubd(xmm0, xmm4); + + // GSVector4i zdo = zd - o; + + vpsubd(xmm1, xmm4); + } + + switch(m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + vpcmpgtd(xmm1, xmm0); + vpor(xmm7, xmm1); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + vpcmpgtd(xmm0, xmm1); + vpcmpeqd(xmm4, xmm4); + vpxor(xmm0, xmm4); + vpor(xmm7, xmm0); + break; + } + + alltrue(); + } +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ + if(!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + mov(ebx, dword[&m_local.gd->tex]); + + // ebx = tex + + if(!m_sel.fst) + { + // TODO: move these into Init/Step too? + + vcvttps2dq(xmm2, xmm2); + vcvttps2dq(xmm3, xmm3); + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + vmovd(xmm4, eax); + vpshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + + vpsubd(xmm2, xmm4); + vpsubd(xmm3, xmm4); + } + } + + // xmm2 = u + // xmm3 = v + + if(m_sel.ltf) + { + // GSVector4i uf = u.xxzzlh().srl16(1); + + vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm0, 1); + vmovdqa(ptr[&m_local.temp.uf], xmm0); + + if(!m_sel.sprite) + { + // GSVector4i vf = v.xxzzlh().srl16(1); + + vpshuflw(xmm1, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(xmm1, 1); + vmovdqa(ptr[&m_local.temp.vf], xmm1); + } + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + vpsrad(xmm2, 16); + vpsrad(xmm3, 16); + vpackssdw(xmm2, xmm3); + + if(m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + vpcmpeqd(xmm1, xmm1); + vpsrlw(xmm1, 15); + vpaddw(xmm3, xmm2, xmm1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(xmm2, xmm3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(xmm2); + } + + // xmm2 = uv0 + // xmm3 = uv1 (ltf) + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + // xmm7 = used + + // GSVector4i x0 = uv0.upl16(); + // GSVector4i y0 = uv0.uph16() << tw; + + vpxor(xmm0, xmm0); + + vpunpcklwd(xmm4, xmm2, xmm0); + vpunpckhwd(xmm2, xmm2, xmm0); + vpslld(xmm2, m_sel.tw + 3); + + // xmm0 = 0 + // xmm2 = y0 + // xmm3 = uv1 (ltf) + // xmm4 = x0 + // xmm1, xmm5, xmm6 = free + // xmm7 = used + + if(m_sel.ltf) + { + // GSVector4i x1 = uv1.upl16(); + // GSVector4i y1 = uv1.uph16() << tw; + + vpunpcklwd(xmm6, xmm3, xmm0); + vpunpckhwd(xmm3, xmm3, xmm0); + vpslld(xmm3, m_sel.tw + 3); + + // xmm2 = y0 + // xmm3 = y1 + // xmm4 = x0 + // xmm6 = x1 + // xmm0, xmm5, xmm6 = free + // xmm7 = used + + // GSVector4i addr00 = y0 + x0; + // GSVector4i addr01 = y0 + x1; + // GSVector4i addr10 = y1 + x0; + // GSVector4i addr11 = y1 + x1; + + vpaddd(xmm5, xmm2, xmm4); + vpaddd(xmm2, xmm2, xmm6); + vpaddd(xmm0, xmm3, xmm4); + vpaddd(xmm3, xmm3, xmm6); + + // xmm5 = addr00 + // xmm2 = addr01 + // xmm0 = addr10 + // xmm3 = addr11 + // xmm1, xmm4, xmm6 = free + // xmm7 = used + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm6, xmm5, xmm1, xmm4); + + // xmm2, xmm5, xmm1 = free + + ReadTexel(xmm4, xmm2, xmm5, xmm1); + + // xmm0, xmm2, xmm5 = free + + ReadTexel(xmm1, xmm0, xmm2, xmm5); + + // xmm3, xmm0, xmm2 = free + + ReadTexel(xmm5, xmm3, xmm0, xmm2); + + // xmm6 = c00 + // xmm4 = c01 + // xmm1 = c10 + // xmm5 = c11 + // xmm0, xmm2, xmm3 = free + // xmm7 = used + + vmovdqa(xmm0, ptr[&m_local.temp.uf]); + + // GSVector4i rb00 = c00 & mask; + // GSVector4i ga00 = (c00 >> 8) & mask; + + vpsllw(xmm2, xmm6, 8); + vpsrlw(xmm2, 8); + vpsrlw(xmm6, 8); + + // GSVector4i rb01 = c01 & mask; + // GSVector4i ga01 = (c01 >> 8) & mask; + + vpsllw(xmm3, xmm4, 8); + vpsrlw(xmm3, 8); + vpsrlw(xmm4, 8); + + // xmm0 = uf + // xmm2 = rb00 + // xmm3 = rb01 + // xmm6 = ga00 + // xmm4 = ga01 + // xmm1 = c10 + // xmm5 = c11 + // xmm7 = used + + // rb00 = rb00.lerp16<0>(rb01, uf); + // ga00 = ga00.lerp16<0>(ga01, uf); + + lerp16(xmm3, xmm2, xmm0, 0); + lerp16(xmm4, xmm6, xmm0, 0); + + // xmm0 = uf + // xmm3 = rb00 + // xmm4 = ga00 + // xmm1 = c10 + // xmm5 = c11 + // xmm2, xmm6 = free + // xmm7 = used + + // GSVector4i rb10 = c10 & mask; + // GSVector4i ga10 = (c10 >> 8) & mask; + + vpsrlw(xmm2, xmm1, 8); + vpsllw(xmm1, 8); + vpsrlw(xmm1, 8); + + // GSVector4i rb11 = c11 & mask; + // GSVector4i ga11 = (c11 >> 8) & mask; + + vpsrlw(xmm6, xmm5, 8); + vpsllw(xmm5, 8); + vpsrlw(xmm5, 8); + + // xmm0 = uf + // xmm3 = rb00 + // xmm4 = ga00 + // xmm1 = rb10 + // xmm5 = rb11 + // xmm2 = ga10 + // xmm6 = ga11 + // xmm7 = used + + // rb10 = rb10.lerp16<0>(rb11, uf); + // ga10 = ga10.lerp16<0>(ga11, uf); + + lerp16(xmm5, xmm1, xmm0, 0); + lerp16(xmm6, xmm2, xmm0, 0); + + // xmm3 = rb00 + // xmm4 = ga00 + // xmm5 = rb10 + // xmm6 = ga10 + // xmm0, xmm1, xmm2 = free + // xmm7 = used + + // rb00 = rb00.lerp16<0>(rb10, vf); + // ga00 = ga00.lerp16<0>(ga10, vf); + + vmovdqa(xmm0, ptr[&m_local.temp.vf]); + + lerp16(xmm5, xmm3, xmm0, 0); + lerp16(xmm6, xmm4, xmm0, 0); + } + else + { + // GSVector4i addr00 = y0 + x0; + + vpaddd(xmm2, xmm4); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm5, xmm2, xmm0, xmm1); + + // GSVector4i mask = GSVector4i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + vpsrlw(xmm6, xmm5, 8); + vpsllw(xmm5, 8); + vpsrlw(xmm5, 8); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vpmaxsw(uv, ptr[&m_local.gd->t.min]); + } + else + { + vpxor(xmm0, xmm0); + vpmaxsw(uv, xmm0); + } + + vpminsw(uv, ptr[&m_local.gd->t.max]); + } + else + { + vpand(uv, ptr[&m_local.gd->t.min]); + + if(region) + { + vpor(uv, ptr[&m_local.gd->t.max]); + } + } + } + else + { + vmovdqa(xmm4, ptr[&m_local.gd->t.min]); + vmovdqa(xmm5, ptr[&m_local.gd->t.max]); + vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv, xmm4); + + if(region) + { + vpor(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv, xmm4); + vpminsw(uv, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv, xmm1, xmm0); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vmovdqa(xmm4, ptr[&m_local.gd->t.min]); + vpmaxsw(uv0, xmm4); + vpmaxsw(uv1, xmm4); + } + else + { + vpxor(xmm0, xmm0); + vpmaxsw(uv0, xmm0); + vpmaxsw(uv1, xmm0); + } + + vmovdqa(xmm5, ptr[&m_local.gd->t.max]); + vpminsw(uv0, xmm5); + vpminsw(uv1, xmm5); + } + else + { + vmovdqa(xmm4, ptr[&m_local.gd->t.min]); + vpand(uv0, xmm4); + vpand(uv1, xmm4); + + if(region) + { + vmovdqa(xmm5, ptr[&m_local.gd->t.max]); + vpor(uv0, xmm5); + vpor(uv1, xmm5); + } + } + } + else + { + vmovdqa(xmm4, ptr[&m_local.gd->t.min]); + vmovdqa(xmm5, ptr[&m_local.gd->t.max]); + vmovdqa(xmm0, ptr[&m_local.gd->t.mask]); + + // uv0 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv0, xmm4); + + if(region) + { + vpor(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv0, xmm4); + vpminsw(uv0, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv0, xmm1, xmm0); + + // uv1 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(xmm1, uv1, xmm4); + + if(region) + { + vpor(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv1, xmm4); + vpminsw(uv1, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv1, xmm1, xmm0); + } +} + +void GSDrawScanlineCodeGenerator::AlphaTFX() +{ + if(!m_sel.fb) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + // gat = gat.modulate16<1>(ga).clamp8(); + + modulate16(xmm6, xmm4, 1); + + clamp16(xmm6, xmm3); + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + vpsrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_DECAL: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + vpsrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_HIGHLIGHT: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + vmovdqa(xmm2, xmm4); + + // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); + + vpsrlw(xmm4, 7); + + if(m_sel.tcc) + { + vpaddusb(xmm4, xmm6); + } + + mix16(xmm6, xmm4, xmm3); + + break; + + case TFX_HIGHLIGHT2: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + vmovdqa(xmm2, xmm4); + + vpsrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_NONE: + + // gat = iip ? ga.srl16(7) : ga; + + if(m_sel.iip) + { + vpsrlw(xmm6, 7); + } + + break; + } + + if(m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if(!m_sel.abe) + { + // a = cov + + if(m_sel.edge) + { + vmovdqa(xmm0, ptr[&m_local.temp.cov]); + } + else + { + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + } + + mix16(xmm6, xmm0, xmm1); + } + else + { + // a = a == 0x80 ? cov : a + + vpcmpeqd(xmm0, xmm0); + vpsllw(xmm0, 15); + vpsrlw(xmm0, 8); + + if(m_sel.edge) + { + vmovdqa(xmm1, ptr[&m_local.temp.cov]); + } + else + { + vmovdqa(xmm1, xmm0); + } + + vpcmpeqw(xmm0, xmm6); + vpsrld(xmm0, 16); + vpslld(xmm0, 16); + + vpblendvb(xmm6, xmm1, xmm0); + } + } +} + +void GSDrawScanlineCodeGenerator::ReadMask() +{ + if(m_sel.fwrite) + { + vmovdqa(xmm3, ptr[&m_local.gd->fm]); + } + + if(m_sel.zwrite) + { + vmovdqa(xmm4, ptr[&m_local.gd->zm]); + } +} + +void GSDrawScanlineCodeGenerator::TestAlpha() +{ + switch(m_sel.afail) + { + case AFAIL_FB_ONLY: + if(!m_sel.zwrite) return; + break; + + case AFAIL_ZB_ONLY: + if(!m_sel.fwrite) return; + break; + + case AFAIL_RGB_ONLY: + if(!m_sel.zwrite && m_sel.fpsm == 1) return; + break; + } + + switch(m_sel.atst) + { + case ATST_NEVER: + // t = GSVector4i::xffffffff(); + vpcmpeqd(xmm1, xmm1); + break; + + case ATST_ALWAYS: + return; + + case ATST_LESS: + case ATST_LEQUAL: + // t = (ga >> 16) > m_local.gd->aref; + vpsrld(xmm1, xmm6, 16); + vpcmpgtd(xmm1, ptr[&m_local.gd->aref]); + break; + + case ATST_EQUAL: + // t = (ga >> 16) != m_local.gd->aref; + vpsrld(xmm1, xmm6, 16); + vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); + vpcmpeqd(xmm0, xmm0); + vpxor(xmm1, xmm0); + break; + + case ATST_GEQUAL: + case ATST_GREATER: + // t = (ga >> 16) < m_local.gd->aref; + vpsrld(xmm0, xmm6, 16); + vmovdqa(xmm1, ptr[&m_local.gd->aref]); + vpcmpgtd(xmm1, xmm0); + break; + + case ATST_NOTEQUAL: + // t = (ga >> 16) == m_local.gd->aref; + vpsrld(xmm1, xmm6, 16); + vpcmpeqd(xmm1, ptr[&m_local.gd->aref]); + break; + } + + switch(m_sel.afail) + { + case AFAIL_KEEP: + // test |= t; + vpor(xmm7, xmm1); + alltrue(); + break; + + case AFAIL_FB_ONLY: + // zm |= t; + vpor(xmm4, xmm1); + break; + + case AFAIL_ZB_ONLY: + // fm |= t; + vpor(xmm3, xmm1); + break; + + case AFAIL_RGB_ONLY: + // zm |= t; + vpor(xmm4, xmm1); + // fm |= t & GSVector4i::xff000000(); + vpsrld(xmm1, 24); + vpslld(xmm1, 24); + vpor(xmm3, xmm1); + break; + } +} + +void GSDrawScanlineCodeGenerator::ColorTFX() +{ + if(!m_sel.fwrite) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).clamp8(); + + modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + clamp16(xmm5, xmm1); + + break; + + case TFX_DECAL: + + break; + + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + + if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + } + + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + vmovdqa(xmm1, xmm6); + + modulate16(xmm6, xmm2, 1); + + vpshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); + vpsrlw(xmm2, 7); + + vpaddw(xmm6, xmm2); + + clamp16(xmm6, xmm0); + + mix16(xmm6, xmm1, xmm0); + + // GSVector4i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); + + modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + vpaddw(xmm5, xmm2); + + clamp16(xmm5, xmm0); + + break; + + case TFX_NONE: + + // rbt = iip ? rb.srl16(7) : rb; + + if(m_sel.iip) + { + vpsrlw(xmm5, 7); + } + + break; + } +} + +void GSDrawScanlineCodeGenerator::Fog() +{ + if(!m_sel.fwrite || !m_sel.fge) + { + return; + } + + // rb = m_local.gd->frb.lerp16<0>(rb, f); + // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); + + vmovdqa(xmm0, ptr[!m_sel.sprite ? &m_local.temp.f : &m_local.p.f]); + vmovdqa(xmm1, xmm6); + + vmovdqa(xmm2, ptr[&m_local.gd->frb]); + lerp16(xmm5, xmm2, xmm0, 0); + + vmovdqa(xmm2, ptr[&m_local.gd->fga]); + lerp16(xmm6, xmm2, xmm0, 0); + mix16(xmm6, xmm1, xmm0); +} + +void GSDrawScanlineCodeGenerator::ReadFrame() +{ + if(!m_sel.fb) + { + return; + } + + // int fa = fza_base.x + fza_offset->x; + + mov(ebx, dword[esi]); + add(ebx, dword[edi]); + + if(!m_sel.rfb) + { + return; + } + + ReadPixel(xmm2, ebx); +} + +void GSDrawScanlineCodeGenerator::TestDestAlpha() +{ + if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) + { + return; + } + + // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); + + if(m_sel.datm) + { + if(m_sel.fpsm == 2) + { + vpxor(xmm0, xmm0); + vpsrld(xmm1, xmm2, 15); + vpcmpeqd(xmm1, xmm0); + } + else + { + vpcmpeqd(xmm0, xmm0); + vpxor(xmm1, xmm2, xmm0); + vpsrad(xmm1, 31); + } + } + else + { + if(m_sel.fpsm == 2) + { + vpslld(xmm1, xmm2, 16); + vpsrad(xmm1, 31); + } + else + { + vpsrad(xmm1, xmm2, 31); + } + } + + vpor(xmm7, xmm1); + + alltrue(); +} + +void GSDrawScanlineCodeGenerator::WriteMask() +{ + // fm |= test; + // zm |= test; + + if(m_sel.fwrite) + { + vpor(xmm3, xmm7); + } + + if(m_sel.zwrite) + { + vpor(xmm4, xmm7); + } + + // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + + vpcmpeqd(xmm1, xmm1); + + if(m_sel.fwrite && m_sel.zwrite) + { + vpcmpeqd(xmm0, xmm1, xmm4); + vpcmpeqd(xmm1, xmm3); + vpackssdw(xmm1, xmm0); + } + else if(m_sel.fwrite) + { + vpcmpeqd(xmm1, xmm3); + vpackssdw(xmm1, xmm1); + } + else if(m_sel.zwrite) + { + vpcmpeqd(xmm1, xmm4); + vpackssdw(xmm1, xmm1); + } + + vpmovmskb(edx, xmm1); + + not(edx); +} + +void GSDrawScanlineCodeGenerator::WriteZBuf() +{ + if(!m_sel.zwrite) + { + return; + } + + bool fast = m_sel.ztest && m_sel.zpsm < 2; + + vmovdqa(xmm1, ptr[!m_sel.sprite ? &m_local.temp.zs : &m_local.p.z]); + + if(fast) + { + // zs = zs.blend8(zd, zm); + + vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); + } + + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); +} + +void GSDrawScanlineCodeGenerator::AlphaBlend() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.abe == 0 && m_sel.aa1 == 0) + { + return; + } + + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) + { + switch(m_sel.fpsm) + { + case 0: + case 1: + + // c[2] = fd & mask; + // c[3] = (fd >> 8) & mask; + + vpsllw(xmm0, xmm2, 8); + vpsrlw(xmm0, 8); + vpsrlw(xmm1, xmm2, 8); + + break; + + case 2: + + // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + + vpcmpeqd(xmm7, xmm7); + + vpsrld(xmm7, 27); // 0x0000001f + vpand(xmm0, xmm2, xmm7); + vpslld(xmm0, 3); + + vpslld(xmm7, 10); // 0x00007c00 + vpand(xmm4, xmm2, xmm7); + vpslld(xmm4, 9); + + vpor(xmm0, xmm4); + + vpsrld(xmm7, 5); // 0x000003e0 + vpand(xmm1, xmm2, xmm7); + vpsrld(xmm1, 2); + + vpsllw(xmm7, 10); // 0x00008000 + vpand(xmm4, xmm2, xmm7); + vpslld(xmm4, 8); + + vpor(xmm1, xmm4); + + break; + } + } + + // xmm5, xmm6 = src rb, ga + // xmm0, xmm1 = dst rb, ga + // xmm2, xmm3 = used + // xmm4, xmm7 = free + + if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) + { + vmovdqa(xmm4, xmm5); + } + + if(m_sel.aba != m_sel.abb) + { + // rb = c[aba * 2 + 0]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(xmm5, xmm0); break; + case 2: vpxor(xmm5, xmm5); break; + } + + // rb = rb.sub16(c[abb * 2 + 0]); + + switch(m_sel.abb) + { + case 0: vpsubw(xmm5, xmm4); break; + case 1: vpsubw(xmm5, xmm0); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; + + switch(m_sel.abc) + { + case 0: + case 1: + vpshuflw(xmm7, m_sel.abc ? xmm1 : xmm6, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); + vpsllw(xmm7, 7); + break; + case 2: + vmovdqa(xmm7, ptr[&m_local.gd->afix]); + break; + } + + // rb = rb.modulate16<1>(a); + + modulate16(xmm5, xmm7, 1); + } + + // rb = rb.add16(c[abd * 2 + 0]); + + switch(m_sel.abd) + { + case 0: vpaddw(xmm5, xmm4); break; + case 1: vpaddw(xmm5, xmm0); break; + case 2: break; + } + } + else + { + // rb = c[abd * 2 + 0]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(xmm5, xmm0); break; + case 2: vpxor(xmm5, xmm5); break; + } + } + + if(m_sel.pabe) + { + // mask = (c[1] << 8).sra32(31); + + vpslld(xmm0, xmm6, 8); + vpsrad(xmm0, 31); + + // rb = c[0].blend8(rb, mask); + + vpblendvb(xmm5, xmm4, xmm5, xmm0); + } + + // xmm6 = src ga + // xmm1 = dst ga + // xmm5 = rb + // xmm7 = a + // xmm2, xmm3 = used + // xmm0, xmm4 = free + + vmovdqa(xmm4, xmm6); + + if(m_sel.aba != m_sel.abb) + { + // ga = c[aba * 2 + 1]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(xmm6, xmm1); break; + case 2: vpxor(xmm6, xmm6); break; + } + + // ga = ga.sub16(c[abeb * 2 + 1]); + + switch(m_sel.abb) + { + case 0: vpsubw(xmm6, xmm4); break; + case 1: vpsubw(xmm6, xmm1); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // ga = ga.modulate16<1>(a); + + modulate16(xmm6, xmm7, 1); + } + + // ga = ga.add16(c[abd * 2 + 1]); + + switch(m_sel.abd) + { + case 0: vpaddw(xmm6, xmm4); break; + case 1: vpaddw(xmm6, xmm1); break; + case 2: break; + } + } + else + { + // ga = c[abd * 2 + 1]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(xmm6, xmm1); break; + case 2: vpxor(xmm6, xmm6); break; + } + } + + // xmm4 = src ga + // xmm5 = rb + // xmm6 = ga + // xmm2, xmm3 = used + // xmm0, xmm1, xmm7 = free + + if(m_sel.pabe) + { + vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) + + // ga = c[1].blend8(ga, mask).mix16(c[1]); + + vpblendvb(xmm6, xmm4, xmm6, xmm0); + } + else + { + if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx + { + mix16(xmm6, xmm4, xmm7); + } + } +} + +void GSDrawScanlineCodeGenerator::WriteFrame() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.colclamp == 0) + { + // c[0] &= 0x000000ff; + // c[1] &= 0x000000ff; + + vpcmpeqd(xmm7, xmm7); + vpsrlw(xmm7, 8); + vpand(xmm5, xmm7); + vpand(xmm6, xmm7); + } + + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(eax, dword[esp + _top]); + and(eax, 3); + shl(eax, 5); + vpaddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); + vpaddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]); + } + + // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); + + vpunpckhwd(xmm7, xmm5, xmm6); + vpunpcklwd(xmm5, xmm6); + vpackuswb(xmm5, xmm7); + + if(m_sel.fba && m_sel.fpsm != 1) + { + // fs |= 0x80000000; + + vpcmpeqd(xmm7, xmm7); + vpslld(xmm7, 31); + vpor(xmm5, xmm7); + } + + if(m_sel.fpsm == 2) + { + // GSVector4i rb = fs & 0x00f800f8; + // GSVector4i ga = fs & 0x8000f800; + + mov(eax, 0x00f800f8); + vmovd(xmm6, eax); + vpshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); + + mov(eax, 0x8000f800); + vmovd(xmm7, eax); + vpshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); + + vpand(xmm4, xmm5, xmm6); + vpand(xmm5, xmm7); + + // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + + vpsrld(xmm6, xmm4, 9); + vpsrld(xmm4, 3); + vpsrld(xmm7, xmm5, 16); + vpsrld(xmm5, 6); + + vpor(xmm5, xmm4); + vpor(xmm7, xmm6); + vpor(xmm5, xmm7); + } + + if(m_sel.rfb) + { + // fs = fs.blend(fd, fm); + + blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm + } + + bool fast = m_sel.rfb && m_sel.fpsm < 2; + + WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); +} + +void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +{ + vmovq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); + vmovhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); +} + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +{ + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); + + test(mask, 0x0f); + je("@f"); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); + + test(mask, 0xf0); + je("@f"); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); + + // vmaskmovps? + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } +} + +static const int s_offsets[4] = {0, 2, 8, 10}; + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +{ + Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; + + switch(psm) + { + case 0: + if(i == 0) vmovd(dst, src); + else vpextrd(dst, src, i); + break; + case 1: + if(i == 0) vmovd(eax, src); + else vpextrd(eax, src, i); + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + vpextrw(eax, src, i * 2); + mov(dst, ax); + break; + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) +{ + ReadTexel(dst, addr, 0); + ReadTexel(dst, addr, 1); + ReadTexel(dst, addr, 2); + ReadTexel(dst, addr, 3); +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +{ + const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4]; + + if(i == 0) vmovd(eax, addr); + else vpextrd(eax, addr, i); + + if(m_sel.tlu) movzx(eax, byte[ebx + eax]); + + if(i == 0) vmovd(dst, src); + else vpinsrd(dst, src, i); +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index e06b4bd622..9315dfb314 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -1,2092 +1,2092 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSDrawScanlineCodeGenerator.h" - -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -static const int _args = 16; -static const int _top = _args + 4; -static const int _v = _args + 8; - -void GSDrawScanlineCodeGenerator::Generate() -{ - push(ebx); - push(esi); - push(edi); - push(ebp); - - Init(); - - if(!m_sel.edge) - { - align(16); - } - -L("loop"); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // xmm0 = z/zi - // xmm2 = u (tme) - // xmm3 = v (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - bool tme = m_sel.tfx != TFX_NONE; - - TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // - xmm0 - // xmm2 = u (tme) - // xmm3 = v (tme) - // xmm5 = rb (!tme) - // xmm6 = ga (!tme) - // xmm7 = test - - SampleTexture(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // - xmm2 - // - xmm3 - // - xmm4 - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - AlphaTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadMask(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ColorTFX(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - Fog(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - ReadFrame(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - TestDestAlpha(); - - // ecx = steps - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - // xmm7 = test - - WriteMask(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // ebp = za - // xmm2 = fd - // xmm3 = fm - // xmm4 = zm - // xmm5 = rb - // xmm6 = ga - - WriteZBuf(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // - ebp - // xmm2 = fd - // xmm3 = fm - // - xmm4 - // xmm5 = rb - // xmm6 = ga - - AlphaBlend(); - - // ebx = fa - // ecx = steps - // edx = fzm - // esi = fzbr - // edi = fzbc - // xmm2 = fd - // xmm3 = fm - // xmm5 = rb - // xmm6 = ga - - WriteFrame(); - -L("step"); - - // if(steps <= 0) break; - - if(!m_sel.edge) - { - test(ecx, ecx); - - jle("exit", T_NEAR); - - Step(); - - jmp("loop", T_NEAR); - } - -L("exit"); - - // vzeroupper(); - - pop(ebp); - pop(edi); - pop(esi); - pop(ebx); - - ret(8); -} - -void GSDrawScanlineCodeGenerator::Init() -{ - // int skip = left & 3; - - mov(ebx, edx); - and(edx, 3); - - // left -= skip; - - sub(ebx, edx); - - // int steps = right - left - 4; - - sub(ecx, ebx); - sub(ecx, 4); - - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - shl(edx, 4); - - movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); - - por(xmm7, ptr[eax + (size_t)&m_test[7]]); - - // GSVector2i* fza_base = &m_local.gd->fzbr[top]; - - mov(esi, dword[esp + _top]); - lea(esi, ptr[esi * 8]); - add(esi, dword[&m_local.gd->fzbr]); - - // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; - - lea(edi, ptr[ebx * 2]); - add(edi, dword[&m_local.gd->fzbc]); - - if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) - { - // edx = &m_local.d[skip] - - shl(edx, 4); - lea(edx, ptr[edx + (size_t)m_local.d]); - - // ebx = &v - - mov(ebx, dword[esp + _v]); - } - - if(!m_sel.sprite) - { - if(m_sel.fwrite && m_sel.fge || m_sel.zb) - { - movaps(xmm0, ptr[ebx + 16]); // v.p - - if(m_sel.fwrite && m_sel.fge) - { - // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); - - cvttps2dq(xmm1, xmm0); - pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - paddw(xmm1, ptr[edx + 16 * 6]); - - movdqa(ptr[&m_local.temp.f], xmm1); - } - - if(m_sel.zb) - { - // z = vp.zzzz() + m_local.d[skip].z; - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - addps(xmm0, ptr[edx]); - - movaps(ptr[&m_local.temp.z], xmm0); - } - } - } - else - { - if(m_sel.ztest) - { - movdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if(m_sel.fb) - { - if(m_sel.edge || m_sel.tfx != TFX_NONE) - { - movaps(xmm4, ptr[ebx + 32]); // v.t - } - - if(m_sel.edge) - { - pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); - psrlw(xmm3, 9); - - movdqa(ptr[&m_local.temp.cov], xmm3); - } - - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i vti(vt); - - cvttps2dq(xmm4, xmm4); - - // si = vti.xxxx() + m_local.d[skip].si; - // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; - - pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - - paddd(xmm2, ptr[edx + 16 * 7]); - - if(!m_sel.sprite) - { - paddd(xmm3, ptr[edx + 16 * 8]); - } - else - { - if(m_sel.ltf) - { - movdqa(xmm4, xmm3); - pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm4, 1); - movdqa(ptr[&m_local.temp.vf], xmm4); - } - } - - movdqa(ptr[&m_local.temp.s], xmm2); - movdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - // s = vt.xxxx() + m_local.d[skip].s; - // t = vt.yyyy() + m_local.d[skip].t; - // q = vt.zzzz() + m_local.d[skip].q; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm4); - - shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - addps(xmm2, ptr[edx + 16 * 1]); - addps(xmm3, ptr[edx + 16 * 2]); - addps(xmm4, ptr[edx + 16 * 3]); - - movaps(ptr[&m_local.temp.s], xmm2); - movaps(ptr[&m_local.temp.t], xmm3); - movaps(ptr[&m_local.temp.q], xmm4); - - rcpps(xmm4, xmm4); - mulps(xmm2, xmm4); - mulps(xmm3, xmm4); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i vc = GSVector4i(v.c); - - cvttps2dq(xmm6, ptr[ebx]); // v.c - - // vc = vc.upl16(vc.zwxy()); - - pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); - punpcklwd(xmm6, xmm5); - - // rb = vc.xxxx().add16(m_local.d[skip].rb); - // ga = vc.zzzz().add16(m_local.d[skip].ga); - - pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); - - paddw(xmm5, ptr[edx + 16 * 4]); - paddw(xmm6, ptr[edx + 16 * 5]); - - movdqa(ptr[&m_local.temp.rb], xmm5); - movdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if(m_sel.tfx == TFX_NONE) - { - movdqa(xmm5, ptr[&m_local.c.rb]); - movdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } -} - -void GSDrawScanlineCodeGenerator::Step() -{ - // steps -= 4; - - sub(ecx, 4); - - // fza_offset++; - - add(edi, 8); - - if(!m_sel.sprite) - { - // z += m_local.d4.z; - - if(m_sel.zb) - { - movaps(xmm0, ptr[&m_local.temp.z]); - addps(xmm0, ptr[&m_local.d4.z]); - movaps(ptr[&m_local.temp.z], xmm0); - } - - // f = f.add16(m_local.d4.f); - - if(m_sel.fwrite && m_sel.fge) - { - movdqa(xmm1, ptr[&m_local.temp.f]); - paddw(xmm1, ptr[&m_local.d4.f]); - movdqa(ptr[&m_local.temp.f], xmm1); - } - } - else - { - if(m_sel.ztest) - { - movdqa(xmm0, ptr[&m_local.p.z]); - } - } - - if(m_sel.fb) - { - if(m_sel.tfx != TFX_NONE) - { - if(m_sel.fst) - { - // GSVector4i st = m_local.d4.st; - - // si += st.xxxx(); - // if(!sprite) ti += st.yyyy(); - - movdqa(xmm4, ptr[&m_local.d4.st]); - - pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - paddd(xmm2, ptr[&m_local.temp.s]); - movdqa(ptr[&m_local.temp.s], xmm2); - - if(!m_sel.sprite) - { - pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); - paddd(xmm3, ptr[&m_local.temp.t]); - movdqa(ptr[&m_local.temp.t], xmm3); - } - else - { - movdqa(xmm3, ptr[&m_local.temp.t]); - } - } - else - { - // GSVector4 stq = m_local.d4.stq; - - // s += stq.xxxx(); - // t += stq.yyyy(); - // q += stq.zzzz(); - - movaps(xmm2, ptr[&m_local.d4.stq]); - movaps(xmm3, xmm2); - movaps(xmm4, xmm2); - - shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); - - addps(xmm2, ptr[&m_local.temp.s]); - addps(xmm3, ptr[&m_local.temp.t]); - addps(xmm4, ptr[&m_local.temp.q]); - - movaps(ptr[&m_local.temp.s], xmm2); - movaps(ptr[&m_local.temp.t], xmm3); - movaps(ptr[&m_local.temp.q], xmm4); - - rcpps(xmm4, xmm4); - mulps(xmm2, xmm4); - mulps(xmm3, xmm4); - } - } - - if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) - { - if(m_sel.iip) - { - // GSVector4i c = m_local.d4.c; - - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); - - movdqa(xmm7, ptr[&m_local.d4.c]); - - pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); - - paddw(xmm5, ptr[&m_local.temp.rb]); - paddw(xmm6, ptr[&m_local.temp.ga]); - - movdqa(ptr[&m_local.temp.rb], xmm5); - movdqa(ptr[&m_local.temp.ga], xmm6); - } - else - { - if(m_sel.tfx == TFX_NONE) - { - movdqa(xmm5, ptr[&m_local.c.rb]); - movdqa(xmm6, ptr[&m_local.c.ga]); - } - } - } - } - - // test = m_test[7 + (steps & (steps >> 31))]; - - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); - - movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); -} - -void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) -{ - if(!m_sel.zb) - { - return; - } - - // int za = fza_base.y + fza_offset->y; - - mov(ebp, dword[esi + 4]); - add(ebp, dword[edi + 4]); - - // GSVector4i zs = zi; - - if(!m_sel.sprite) - { - if(m_sel.zoverflow) - { - // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - movaps(temp1, ptr[&GSVector4::m_half]); - mulps(temp1, xmm0); - cvttps2dq(temp1, temp1); - pslld(temp1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(temp2, temp2); - psrld(temp2, 31); - pand(xmm0, temp2); - - por(xmm0, temp1); - } - else - { - // zs = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - if(m_sel.zwrite) - { - movdqa(ptr[&m_local.temp.zs], xmm0); - } - } - - if(m_sel.ztest) - { - ReadPixel(xmm1, ebp); - - if(m_sel.zwrite && m_sel.zpsm < 2) - { - movdqa(ptr[&m_local.temp.zd], xmm1); - } - - // zd &= 0xffffffff >> m_sel.zpsm * 8; - - if(m_sel.zpsm) - { - pslld(xmm1, m_sel.zpsm * 8); - psrld(xmm1, m_sel.zpsm * 8); - } - - if(m_sel.zoverflow || m_sel.zpsm == 0) - { - // GSVector4i o = GSVector4i::x80000000(); - - pcmpeqd(xmm4, xmm4); - pslld(xmm4, 31); - - // GSVector4i zso = zs - o; - - psubd(xmm0, xmm4); - - // GSVector4i zdo = zd - o; - - psubd(xmm1, xmm4); - } - - switch(m_sel.ztst) - { - case ZTST_GEQUAL: - // test |= zso < zdo; // ~(zso >= zdo) - pcmpgtd(xmm1, xmm0); - por(xmm7, xmm1); - break; - - case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL - // test |= zso <= zdo; // ~(zso > zdo) - pcmpgtd(xmm0, xmm1); - pcmpeqd(xmm4, xmm4); - pxor(xmm0, xmm4); - por(xmm7, xmm0); - break; - } - - alltrue(); - } -} - -void GSDrawScanlineCodeGenerator::SampleTexture() -{ - if(!m_sel.fb || m_sel.tfx == TFX_NONE) - { - return; - } - - mov(ebx, dword[&m_local.gd->tex]); - - // ebx = tex - - if(!m_sel.fst) - { - // TODO: move these into Init/Step too? - - cvttps2dq(xmm2, xmm2); - cvttps2dq(xmm3, xmm3); - - if(m_sel.ltf) - { - // u -= 0x8000; - // v -= 0x8000; - - mov(eax, 0x8000); - movd(xmm4, eax); - pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - - psubd(xmm2, xmm4); - psubd(xmm3, xmm4); - } - } - - // xmm2 = u - // xmm3 = v - - if(m_sel.ltf) - { - // GSVector4i uf = u.xxzzlh().srl16(1); - - movdqa(xmm0, xmm2); - pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); - movdqa(ptr[&m_local.temp.uf], xmm0); - - if(!m_sel.sprite) - { - // GSVector4i vf = v.xxzzlh().srl16(1); - - movdqa(xmm1, xmm3); - pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm1, 1); - movdqa(ptr[&m_local.temp.vf], xmm1); - } - } - - // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - - psrad(xmm2, 16); - psrad(xmm3, 16); - packssdw(xmm2, xmm3); - - if(m_sel.ltf) - { - // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); - - movdqa(xmm3, xmm2); - pcmpeqd(xmm1, xmm1); - psrlw(xmm1, 15); - paddw(xmm3, xmm1); - - // uv0 = Wrap(uv0); - // uv1 = Wrap(uv1); - - Wrap(xmm2, xmm3); - } - else - { - // uv0 = Wrap(uv0); - - Wrap(xmm2); - } - - // xmm2 = uv0 - // xmm3 = uv1 (ltf) - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i y0 = uv0.uph16() << tw; - // GSVector4i x0 = uv0.upl16(); - - pxor(xmm0, xmm0); - - movdqa(xmm4, xmm2); - punpckhwd(xmm2, xmm0); - punpcklwd(xmm4, xmm0); - pslld(xmm2, m_sel.tw + 3); - - // xmm0 = 0 - // xmm2 = y0 - // xmm3 = uv1 (ltf) - // xmm4 = x0 - // xmm1, xmm5, xmm6 = free - // xmm7 = used - - if(m_sel.ltf) - { - // GSVector4i y1 = uv1.uph16() << tw; - // GSVector4i x1 = uv1.upl16(); - - movdqa(xmm6, xmm3); - punpckhwd(xmm3, xmm0); - punpcklwd(xmm6, xmm0); - pslld(xmm3, m_sel.tw + 3); - - // xmm2 = y0 - // xmm3 = y1 - // xmm4 = x0 - // xmm6 = x1 - // xmm0, xmm5, xmm6 = free - // xmm7 = used - - // GSVector4i addr00 = y0 + x0; - // GSVector4i addr01 = y0 + x1; - // GSVector4i addr10 = y1 + x0; - // GSVector4i addr11 = y1 + x1; - - movdqa(xmm5, xmm2); - paddd(xmm5, xmm4); - paddd(xmm2, xmm6); - - movdqa(xmm0, xmm3); - paddd(xmm0, xmm4); - paddd(xmm3, xmm6); - - // xmm5 = addr00 - // xmm2 = addr01 - // xmm0 = addr10 - // xmm3 = addr11 - // xmm1, xmm4, xmm6 = free - // xmm7 = used - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); - // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); - // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm6, xmm5, xmm1, xmm4); - - // xmm2, xmm5, xmm1 = free - - ReadTexel(xmm4, xmm2, xmm5, xmm1); - - // xmm0, xmm2, xmm5 = free - - ReadTexel(xmm1, xmm0, xmm2, xmm5); - - // xmm3, xmm0, xmm2 = free - - ReadTexel(xmm5, xmm3, xmm0, xmm2); - - // xmm6 = c00 - // xmm4 = c01 - // xmm1 = c10 - // xmm5 = c11 - // xmm0, xmm2, xmm3 = free - // xmm7 = used - - movdqa(xmm0, ptr[&m_local.temp.uf]); - - // GSVector4i rb00 = c00 & mask; - // GSVector4i ga00 = (c00 >> 8) & mask; - - movdqa(xmm2, xmm6); - psllw(xmm2, 8); - psrlw(xmm2, 8); - psrlw(xmm6, 8); - - // GSVector4i rb01 = c01 & mask; - // GSVector4i ga01 = (c01 >> 8) & mask; - - movdqa(xmm3, xmm4); - psllw(xmm3, 8); - psrlw(xmm3, 8); - psrlw(xmm4, 8); - - // xmm0 = uf - // xmm2 = rb00 - // xmm3 = rb01 - // xmm6 = ga00 - // xmm4 = ga01 - // xmm1 = c10 - // xmm5 = c11 - // xmm7 = used - - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); - - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = c10 - // xmm5 = c11 - // xmm2, xmm6 = free - // xmm7 = used - - // GSVector4i rb10 = c10 & mask; - // GSVector4i ga10 = (c10 >> 8) & mask; - - movdqa(xmm2, xmm1); - psllw(xmm1, 8); - psrlw(xmm1, 8); - psrlw(xmm2, 8); - - // GSVector4i rb11 = c11 & mask; - // GSVector4i ga11 = (c11 >> 8) & mask; - - movdqa(xmm6, xmm5); - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); - - // xmm0 = uf - // xmm3 = rb00 - // xmm4 = ga00 - // xmm1 = rb10 - // xmm5 = rb11 - // xmm2 = ga10 - // xmm6 = ga11 - // xmm7 = used - - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); - - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); - - // xmm3 = rb00 - // xmm4 = ga00 - // xmm5 = rb10 - // xmm6 = ga10 - // xmm0, xmm1, xmm2 = free - // xmm7 = used - - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); - - movdqa(xmm0, ptr[&m_local.temp.vf]); - - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); - } - else - { - // GSVector4i addr00 = y0 + x0; - - paddd(xmm2, xmm4); - - // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - - ReadTexel(xmm5, xmm2, xmm0, xmm1); - - // GSVector4i mask = GSVector4i::x00ff(); - - // c[0] = c00 & mask; - // c[1] = (c00 >> 8) & mask; - - movdqa(xmm6, xmm5); - - psllw(xmm5, 8); - psrlw(xmm5, 8); - psrlw(xmm6, 8); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - pmaxsw(uv, ptr[&m_local.gd->t.min]); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv, xmm0); - } - - pminsw(uv, ptr[&m_local.gd->t.max]); - } - else - { - pand(uv, ptr[&m_local.gd->t.min]); - - if(region) - { - por(uv, ptr[&m_local.gd->t.max]); - } - } - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - movdqa(xmm5, ptr[&m_local.gd->t.max]); - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv); - - pand(xmm1, xmm4); - - if(region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv, xmm4); - pminsw(uv, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - blend8(uv, xmm1); - } -} - -void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) -{ - // xmm0, xmm1, xmm4, xmm5, xmm6 = free - - int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; - int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; - - int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; - - if(wms_clamp == wmt_clamp) - { - if(wms_clamp) - { - if(region) - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - pmaxsw(uv0, xmm4); - pmaxsw(uv1, xmm4); - } - else - { - pxor(xmm0, xmm0); - pmaxsw(uv0, xmm0); - pmaxsw(uv1, xmm0); - } - - movdqa(xmm5, ptr[&m_local.gd->t.max]); - pminsw(uv0, xmm5); - pminsw(uv1, xmm5); - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - pand(uv0, xmm4); - pand(uv1, xmm4); - - if(region) - { - movdqa(xmm5, ptr[&m_local.gd->t.max]); - por(uv0, xmm5); - por(uv1, xmm5); - } - } - } - else - { - movdqa(xmm4, ptr[&m_local.gd->t.min]); - movdqa(xmm5, ptr[&m_local.gd->t.max]); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - } - else - { - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm6, xmm0); - } - - // uv0 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv0); - - pand(xmm1, xmm4); - - if(region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv0, xmm4); - pminsw(uv0, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pblendvb(uv0, xmm1); - } - else - { - blendr(uv0, xmm1, xmm0); - } - - // uv1 - - // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; - - movdqa(xmm1, uv1); - - pand(xmm1, xmm4); - - if(region) - { - por(xmm1, xmm5); - } - - // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); - - pmaxsw(uv1, xmm4); - pminsw(uv1, xmm5); - - // clamp.blend8(repeat, m_local.gd->t.mask); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pblendvb(uv1, xmm1); - } - else - { - blendr(uv1, xmm1, xmm6); - } - } -} - -void GSDrawScanlineCodeGenerator::AlphaTFX() -{ - if(!m_sel.fb) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - // gat = gat.modulate16<1>(ga).clamp8(); - - modulate16(xmm6, xmm4, 1); - - clamp16(xmm6, xmm3); - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_DECAL: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_HIGHLIGHT: - - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - movdqa(xmm2, xmm4); - - // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); - - psrlw(xmm4, 7); - - if(m_sel.tcc) - { - paddusb(xmm4, xmm6); - } - - mix16(xmm6, xmm4, xmm3); - - break; - - case TFX_HIGHLIGHT2: - - // if(!tcc) gat = gat.mix16(ga.srl16(7)); - - if(!m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - movdqa(xmm2, xmm4); - - psrlw(xmm4, 7); - - mix16(xmm6, xmm4, xmm3); - } - - break; - - case TFX_NONE: - - // gat = iip ? ga.srl16(7) : ga; - - if(m_sel.iip) - { - psrlw(xmm6, 7); - } - - break; - } - - if(m_sel.aa1) - { - // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha - - // FIXME: bios config screen cubes - - if(!m_sel.abe) - { - // a = cov - - if(m_sel.edge) - { - movdqa(xmm0, ptr[&m_local.temp.cov]); - } - else - { - pcmpeqd(xmm0, xmm0); - psllw(xmm0, 15); - psrlw(xmm0, 8); - } - - mix16(xmm6, xmm0, xmm1); - } - else - { - // a = a == 0x80 ? cov : a - - pcmpeqd(xmm0, xmm0); - psllw(xmm0, 15); - psrlw(xmm0, 8); - - if(m_sel.edge) - { - movdqa(xmm1, ptr[&m_local.temp.cov]); - } - else - { - movdqa(xmm1, xmm0); - } - - pcmpeqw(xmm0, xmm6); - psrld(xmm0, 16); - pslld(xmm0, 16); - - blend8(xmm6, xmm1); - } - } -} - -void GSDrawScanlineCodeGenerator::ReadMask() -{ - if(m_sel.fwrite) - { - movdqa(xmm3, ptr[&m_local.gd->fm]); - } - - if(m_sel.zwrite) - { - movdqa(xmm4, ptr[&m_local.gd->zm]); - } -} - -void GSDrawScanlineCodeGenerator::TestAlpha() -{ - switch(m_sel.afail) - { - case AFAIL_FB_ONLY: - if(!m_sel.zwrite) return; - break; - - case AFAIL_ZB_ONLY: - if(!m_sel.fwrite) return; - break; - - case AFAIL_RGB_ONLY: - if(!m_sel.zwrite && m_sel.fpsm == 1) return; - break; - } - - switch(m_sel.atst) - { - case ATST_NEVER: - // t = GSVector4i::xffffffff(); - pcmpeqd(xmm1, xmm1); - break; - - case ATST_ALWAYS: - return; - - case ATST_LESS: - case ATST_LEQUAL: - // t = (ga >> 16) > m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpgtd(xmm1, ptr[&m_local.gd->aref]); - break; - - case ATST_EQUAL: - // t = (ga >> 16) != m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpeqd(xmm1, ptr[&m_local.gd->aref]); - pcmpeqd(xmm0, xmm0); - pxor(xmm1, xmm0); - break; - - case ATST_GEQUAL: - case ATST_GREATER: - // t = (ga >> 16) < m_local.gd->aref; - movdqa(xmm0, xmm6); - psrld(xmm0, 16); - movdqa(xmm1, ptr[&m_local.gd->aref]); - pcmpgtd(xmm1, xmm0); - break; - - case ATST_NOTEQUAL: - // t = (ga >> 16) == m_local.gd->aref; - movdqa(xmm1, xmm6); - psrld(xmm1, 16); - pcmpeqd(xmm1, ptr[&m_local.gd->aref]); - break; - } - - switch(m_sel.afail) - { - case AFAIL_KEEP: - // test |= t; - por(xmm7, xmm1); - alltrue(); - break; - - case AFAIL_FB_ONLY: - // zm |= t; - por(xmm4, xmm1); - break; - - case AFAIL_ZB_ONLY: - // fm |= t; - por(xmm3, xmm1); - break; - - case AFAIL_RGB_ONLY: - // zm |= t; - por(xmm4, xmm1); - // fm |= t & GSVector4i::xff000000(); - psrld(xmm1, 24); - pslld(xmm1, 24); - por(xmm3, xmm1); - break; - } -} - -void GSDrawScanlineCodeGenerator::ColorTFX() -{ - if(!m_sel.fwrite) - { - return; - } - - switch(m_sel.tfx) - { - case TFX_MODULATE: - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - clamp16(xmm5, xmm1); - - break; - - case TFX_DECAL: - - break; - - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - - if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) - { - // GSVector4i ga = iip ? gaf : m_local.c.ga; - - movdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); - } - - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - movdqa(xmm1, xmm6); - - modulate16(xmm6, xmm2, 1); - - pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); - psrlw(xmm2, 7); - - paddw(xmm6, xmm2); - - clamp16(xmm6, xmm0); - - mix16(xmm6, xmm1, xmm0); - - // GSVector4i rb = iip ? rbf : m_local.c.rb; - - // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); - - modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); - - paddw(xmm5, xmm2); - - clamp16(xmm5, xmm0); - - break; - - case TFX_NONE: - - // rbt = iip ? rb.srl16(7) : rb; - - if(m_sel.iip) - { - psrlw(xmm5, 7); - } - - break; - } -} - -void GSDrawScanlineCodeGenerator::Fog() -{ - if(!m_sel.fwrite || !m_sel.fge) - { - return; - } - - // rb = m_local.gd->frb.lerp16<0>(rb, f); - // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); - - movdqa(xmm0, ptr[!m_sel.sprite ? &m_local.temp.f : &m_local.p.f]); - movdqa(xmm1, xmm6); - - movdqa(xmm2, ptr[&m_local.gd->frb]); - lerp16(xmm5, xmm2, xmm0, 0); - - movdqa(xmm2, ptr[&m_local.gd->fga]); - lerp16(xmm6, xmm2, xmm0, 0); - mix16(xmm6, xmm1, xmm0); -} - -void GSDrawScanlineCodeGenerator::ReadFrame() -{ - if(!m_sel.fb) - { - return; - } - - // int fa = fza_base.x + fza_offset->x; - - mov(ebx, dword[esi]); - add(ebx, dword[edi]); - - if(!m_sel.rfb) - { - return; - } - - ReadPixel(xmm2, ebx); -} - -void GSDrawScanlineCodeGenerator::TestDestAlpha() -{ - if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) - { - return; - } - - // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); - - movdqa(xmm1, xmm2); - - if(m_sel.datm) - { - if(m_sel.fpsm == 2) - { - pxor(xmm0, xmm0); - psrld(xmm1, 15); - pcmpeqd(xmm1, xmm0); - } - else - { - pcmpeqd(xmm0, xmm0); - pxor(xmm1, xmm0); - psrad(xmm1, 31); - } - } - else - { - if(m_sel.fpsm == 2) - { - pslld(xmm1, 16); - } - - psrad(xmm1, 31); - } - - por(xmm7, xmm1); - - alltrue(); -} - -void GSDrawScanlineCodeGenerator::WriteMask() -{ - // fm |= test; - // zm |= test; - - if(m_sel.fwrite) - { - por(xmm3, xmm7); - } - - if(m_sel.zwrite) - { - por(xmm4, xmm7); - } - - // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - - pcmpeqd(xmm1, xmm1); - - if(m_sel.fwrite && m_sel.zwrite) - { - movdqa(xmm0, xmm1); - pcmpeqd(xmm1, xmm3); - pcmpeqd(xmm0, xmm4); - packssdw(xmm1, xmm0); - } - else if(m_sel.fwrite) - { - pcmpeqd(xmm1, xmm3); - packssdw(xmm1, xmm1); - } - else if(m_sel.zwrite) - { - pcmpeqd(xmm1, xmm4); - packssdw(xmm1, xmm1); - } - - pmovmskb(edx, xmm1); - - not(edx); -} - -void GSDrawScanlineCodeGenerator::WriteZBuf() -{ - if(!m_sel.zwrite) - { - return; - } - - bool fast = m_sel.ztest && m_sel.zpsm < 2; - - movdqa(xmm1, ptr[!m_sel.sprite ? &m_local.temp.zs : &m_local.p.z]); - - if(fast) - { - // zs = zs.blend8(zd, zm); - - movdqa(xmm0, xmm4); - movdqa(xmm7, ptr[&m_local.temp.zd]); - blend8(xmm1, xmm7); - } - - WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); -} - -void GSDrawScanlineCodeGenerator::AlphaBlend() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.abe == 0 && m_sel.aa1 == 0) - { - return; - } - - if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) - { - switch(m_sel.fpsm) - { - case 0: - case 1: - - // c[2] = fd & mask; - // c[3] = (fd >> 8) & mask; - - movdqa(xmm0, xmm2); - movdqa(xmm1, xmm2); - - psllw(xmm0, 8); - psrlw(xmm0, 8); - psrlw(xmm1, 8); - - break; - - case 2: - - // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - - movdqa(xmm0, xmm2); - movdqa(xmm1, xmm2); - movdqa(xmm4, xmm2); - - pcmpeqd(xmm7, xmm7); - psrld(xmm7, 27); // 0x0000001f - pand(xmm0, xmm7); - pslld(xmm0, 3); - - pslld(xmm7, 10); // 0x00007c00 - pand(xmm4, xmm7); - pslld(xmm4, 9); - - por(xmm0, xmm4); - - movdqa(xmm4, xmm1); - - psrld(xmm7, 5); // 0x000003e0 - pand(xmm1, xmm7); - psrld(xmm1, 2); - - psllw(xmm7, 10); // 0x00008000 - pand(xmm4, xmm7); - pslld(xmm4, 8); - - por(xmm1, xmm4); - - break; - } - } - - // xmm5, xmm6 = src rb, ga - // xmm0, xmm1 = dst rb, ga - // xmm2, xmm3 = used - // xmm4, xmm7 = free - - if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) - { - movdqa(xmm4, xmm5); - } - - if(m_sel.aba != m_sel.abb) - { - // rb = c[aba * 2 + 0]; - - switch(m_sel.aba) - { - case 0: break; - case 1: movdqa(xmm5, xmm0); break; - case 2: pxor(xmm5, xmm5); break; - } - - // rb = rb.sub16(c[abb * 2 + 0]); - - switch(m_sel.abb) - { - case 0: psubw(xmm5, xmm4); break; - case 1: psubw(xmm5, xmm0); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; - - switch(m_sel.abc) - { - case 0: - case 1: - movdqa(xmm7, m_sel.abc ? xmm1 : xmm6); - pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); - pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); - psllw(xmm7, 7); - break; - case 2: - movdqa(xmm7, ptr[&m_local.gd->afix]); - break; - } - - // rb = rb.modulate16<1>(a); - - modulate16(xmm5, xmm7, 1); - } - - // rb = rb.add16(c[abd * 2 + 0]); - - switch(m_sel.abd) - { - case 0: paddw(xmm5, xmm4); break; - case 1: paddw(xmm5, xmm0); break; - case 2: break; - } - } - else - { - // rb = c[abd * 2 + 0]; - - switch(m_sel.abd) - { - case 0: break; - case 1: movdqa(xmm5, xmm0); break; - case 2: pxor(xmm5, xmm5); break; - } - } - - if(m_sel.pabe) - { - // mask = (c[1] << 8).sra32(31); - - movdqa(xmm0, xmm6); - pslld(xmm0, 8); - psrad(xmm0, 31); - - // rb = c[0].blend8(rb, mask); - - blend8r(xmm5, xmm4); - } - - // xmm6 = src ga - // xmm1 = dst ga - // xmm5 = rb - // xmm7 = a - // xmm2, xmm3 = used - // xmm0, xmm4 = free - - movdqa(xmm4, xmm6); - - if(m_sel.aba != m_sel.abb) - { - // ga = c[aba * 2 + 1]; - - switch(m_sel.aba) - { - case 0: break; - case 1: movdqa(xmm6, xmm1); break; - case 2: pxor(xmm6, xmm6); break; - } - - // ga = ga.sub16(c[abeb * 2 + 1]); - - switch(m_sel.abb) - { - case 0: psubw(xmm6, xmm4); break; - case 1: psubw(xmm6, xmm1); break; - case 2: break; - } - - if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) - { - // ga = ga.modulate16<1>(a); - - modulate16(xmm6, xmm7, 1); - } - - // ga = ga.add16(c[abd * 2 + 1]); - - switch(m_sel.abd) - { - case 0: paddw(xmm6, xmm4); break; - case 1: paddw(xmm6, xmm1); break; - case 2: break; - } - } - else - { - // ga = c[abd * 2 + 1]; - - switch(m_sel.abd) - { - case 0: break; - case 1: movdqa(xmm6, xmm1); break; - case 2: pxor(xmm6, xmm6); break; - } - } - - // xmm4 = src ga - // xmm5 = rb - // xmm6 = ga - // xmm2, xmm3 = used - // xmm0, xmm1, xmm7 = free - - if(m_sel.pabe) - { - if(!m_cpu.has(util::Cpu::tSSE41)) - { - // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) - - movdqa(xmm0, xmm4); - pslld(xmm0, 8); - psrad(xmm0, 31); - } - - psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) - - // ga = c[1].blend8(ga, mask).mix16(c[1]); - - blend8r(xmm6, xmm4); - } - else - { - if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx - { - mix16(xmm6, xmm4, xmm7); - } - } -} - -void GSDrawScanlineCodeGenerator::WriteFrame() -{ - if(!m_sel.fwrite) - { - return; - } - - if(m_sel.colclamp == 0) - { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; - - pcmpeqd(xmm7, xmm7); - psrlw(xmm7, 8); - pand(xmm5, xmm7); - pand(xmm6, xmm7); - } - - if(m_sel.fpsm == 2 && m_sel.dthe) - { - mov(eax, dword[esp + _top]); - and(eax, 3); - shl(eax, 5); - paddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); - paddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]); - } - - // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); - - movdqa(xmm7, xmm5); - punpcklwd(xmm5, xmm6); - punpckhwd(xmm7, xmm6); - packuswb(xmm5, xmm7); - - if(m_sel.fba && m_sel.fpsm != 1) - { - // fs |= 0x80000000; - - pcmpeqd(xmm7, xmm7); - pslld(xmm7, 31); - por(xmm5, xmm7); - } - - if(m_sel.fpsm == 2) - { - // GSVector4i rb = fs & 0x00f800f8; - // GSVector4i ga = fs & 0x8000f800; - - mov(eax, 0x00f800f8); - movd(xmm6, eax); - pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); - - mov(eax, 0x8000f800); - movd(xmm7, eax); - pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); - - movdqa(xmm4, xmm5); - pand(xmm4, xmm6); - pand(xmm5, xmm7); - - // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - - movdqa(xmm6, xmm4); - movdqa(xmm7, xmm5); - - psrld(xmm4, 3); - psrld(xmm6, 9); - psrld(xmm5, 6); - psrld(xmm7, 16); - - por(xmm5, xmm4); - por(xmm7, xmm6); - por(xmm5, xmm7); - } - - if(m_sel.rfb) - { - // fs = fs.blend(fd, fm); - - blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm - } - - bool fast = m_sel.rfb && m_sel.fpsm < 2; - - WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); -} - -void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) -{ - movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); - movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); -} - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) -{ - if(fast) - { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - } - else - { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); - - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); - - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); - - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); - } -} - -static const int s_offsets[4] = {0, 2, 8, 10}; - -void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) -{ - Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; - - if(m_cpu.has(util::Cpu::tSSE41)) - { - switch(psm) - { - case 0: - if(i == 0) movd(dst, src); - else pextrd(dst, src, i); - break; - case 1: - if(i == 0) movd(eax, src); - else pextrd(eax, src, i); - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - pextrw(eax, src, i * 2); - mov(dst, ax); - break; - } - } - else - { - switch(psm) - { - case 0: - if(i == 0) movd(dst, src); - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);} - break; - case 1: - if(i == 0) movd(eax, src); - else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);} - xor(eax, dst); - and(eax, 0xffffff); - xor(dst, eax); - break; - case 2: - pextrw(eax, src, i * 2); - mov(dst, ax); - break; - } - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) -{ - if(m_cpu.has(util::Cpu::tSSE41)) - { - ReadTexel(dst, addr, 0); - ReadTexel(dst, addr, 1); - ReadTexel(dst, addr, 2); - ReadTexel(dst, addr, 3); - } - else - { - ReadTexel(dst, addr, 0); - psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - punpckldq(dst, temp1); - - ReadTexel(temp1, addr, 0); - psrldq(addr, 4); - ReadTexel(temp2, addr, 0); - // psrldq(addr, 4); - punpckldq(temp1, temp2); - - punpcklqdq(dst, temp1); - } -} - -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) -{ - const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4]; - - if(!m_cpu.has(util::Cpu::tSSE41) && i > 0) - { - ASSERT(0); - } - - if(i == 0) movd(eax, addr); - else pextrd(eax, addr, i); - - if(m_sel.tlu) movzx(eax, byte[ebx + eax]); - - if(i == 0) movd(dst, src); - else pinsrd(dst, src, i); -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSDrawScanlineCodeGenerator.h" + +#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +static const int _args = 16; +static const int _top = _args + 4; +static const int _v = _args + 8; + +void GSDrawScanlineCodeGenerator::Generate() +{ + push(ebx); + push(esi); + push(edi); + push(ebp); + + Init(); + + if(!m_sel.edge) + { + align(16); + } + +L("loop"); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // xmm0 = z/zi + // xmm2 = u (tme) + // xmm3 = v (tme) + // xmm5 = rb (!tme) + // xmm6 = ga (!tme) + // xmm7 = test + + bool tme = m_sel.tfx != TFX_NONE; + + TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // - xmm0 + // xmm2 = u (tme) + // xmm3 = v (tme) + // xmm5 = rb (!tme) + // xmm6 = ga (!tme) + // xmm7 = test + + SampleTexture(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // - xmm2 + // - xmm3 + // - xmm4 + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + AlphaTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ReadMask(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + TestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ColorTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + Fog(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + ReadFrame(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + TestDestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + // xmm7 = test + + WriteMask(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // ebp = za + // xmm2 = fd + // xmm3 = fm + // xmm4 = zm + // xmm5 = rb + // xmm6 = ga + + WriteZBuf(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // - ebp + // xmm2 = fd + // xmm3 = fm + // - xmm4 + // xmm5 = rb + // xmm6 = ga + + AlphaBlend(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // xmm2 = fd + // xmm3 = fm + // xmm5 = rb + // xmm6 = ga + + WriteFrame(); + +L("step"); + + // if(steps <= 0) break; + + if(!m_sel.edge) + { + test(ecx, ecx); + + jle("exit", T_NEAR); + + Step(); + + jmp("loop", T_NEAR); + } + +L("exit"); + + // vzeroupper(); + + pop(ebp); + pop(edi); + pop(esi); + pop(ebx); + + ret(8); +} + +void GSDrawScanlineCodeGenerator::Init() +{ + // int skip = left & 3; + + mov(ebx, edx); + and(edx, 3); + + // left -= skip; + + sub(ebx, edx); + + // int steps = right - left - 4; + + sub(ecx, ebx); + sub(ecx, 4); + + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + + shl(edx, 4); + + movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); + + por(xmm7, ptr[eax + (size_t)&m_test[7]]); + + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + + mov(esi, dword[esp + _top]); + lea(esi, ptr[esi * 8]); + add(esi, dword[&m_local.gd->fzbr]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + + lea(edi, ptr[ebx * 2]); + add(edi, dword[&m_local.gd->fzbc]); + + if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) + { + // edx = &m_local.d[skip] + + shl(edx, 4); + lea(edx, ptr[edx + (size_t)m_local.d]); + + // ebx = &v + + mov(ebx, dword[esp + _v]); + } + + if(!m_sel.sprite) + { + if(m_sel.fwrite && m_sel.fge || m_sel.zb) + { + movaps(xmm0, ptr[ebx + 16]); // v.p + + if(m_sel.fwrite && m_sel.fge) + { + // f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); + + cvttps2dq(xmm1, xmm0); + pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + paddw(xmm1, ptr[edx + 16 * 6]); + + movdqa(ptr[&m_local.temp.f], xmm1); + } + + if(m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + addps(xmm0, ptr[edx]); + + movaps(ptr[&m_local.temp.z], xmm0); + } + } + } + else + { + if(m_sel.ztest) + { + movdqa(xmm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.edge || m_sel.tfx != TFX_NONE) + { + movaps(xmm4, ptr[ebx + 32]); // v.t + } + + if(m_sel.edge) + { + pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); + psrlw(xmm3, 9); + + movdqa(ptr[&m_local.temp.cov], xmm3); + } + + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i vti(vt); + + cvttps2dq(xmm4, xmm4); + + // si = vti.xxxx() + m_local.d[skip].si; + // ti = vti.yyyy(); if(!sprite) ti += m_local.d[skip].ti; + + pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + + paddd(xmm2, ptr[edx + 16 * 7]); + + if(!m_sel.sprite) + { + paddd(xmm3, ptr[edx + 16 * 8]); + } + else + { + if(m_sel.ltf) + { + movdqa(xmm4, xmm3); + pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xmm4, 1); + movdqa(ptr[&m_local.temp.vf], xmm4); + } + } + + movdqa(ptr[&m_local.temp.s], xmm2); + movdqa(ptr[&m_local.temp.t], xmm3); + } + else + { + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + movaps(xmm2, xmm4); + movaps(xmm3, xmm4); + + shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + + addps(xmm2, ptr[edx + 16 * 1]); + addps(xmm3, ptr[edx + 16 * 2]); + addps(xmm4, ptr[edx + 16 * 3]); + + movaps(ptr[&m_local.temp.s], xmm2); + movaps(ptr[&m_local.temp.t], xmm3); + movaps(ptr[&m_local.temp.q], xmm4); + + rcpps(xmm4, xmm4); + mulps(xmm2, xmm4); + mulps(xmm3, xmm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + cvttps2dq(xmm6, ptr[ebx]); // v.c + + // vc = vc.upl16(vc.zwxy()); + + pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); + punpcklwd(xmm6, xmm5); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); + + paddw(xmm5, ptr[edx + 16 * 4]); + paddw(xmm6, ptr[edx + 16 * 5]); + + movdqa(ptr[&m_local.temp.rb], xmm5); + movdqa(ptr[&m_local.temp.ga], xmm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + movdqa(xmm5, ptr[&m_local.c.rb]); + movdqa(xmm6, ptr[&m_local.c.ga]); + } + } + } + } +} + +void GSDrawScanlineCodeGenerator::Step() +{ + // steps -= 4; + + sub(ecx, 4); + + // fza_offset++; + + add(edi, 8); + + if(!m_sel.sprite) + { + // z += m_local.d4.z; + + if(m_sel.zb) + { + movaps(xmm0, ptr[&m_local.temp.z]); + addps(xmm0, ptr[&m_local.d4.z]); + movaps(ptr[&m_local.temp.z], xmm0); + } + + // f = f.add16(m_local.d4.f); + + if(m_sel.fwrite && m_sel.fge) + { + movdqa(xmm1, ptr[&m_local.temp.f]); + paddw(xmm1, ptr[&m_local.d4.f]); + movdqa(ptr[&m_local.temp.f], xmm1); + } + } + else + { + if(m_sel.ztest) + { + movdqa(xmm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i st = m_local.d4.st; + + // si += st.xxxx(); + // if(!sprite) ti += st.yyyy(); + + movdqa(xmm4, ptr[&m_local.d4.st]); + + pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + paddd(xmm2, ptr[&m_local.temp.s]); + movdqa(ptr[&m_local.temp.s], xmm2); + + if(!m_sel.sprite) + { + pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); + paddd(xmm3, ptr[&m_local.temp.t]); + movdqa(ptr[&m_local.temp.t], xmm3); + } + else + { + movdqa(xmm3, ptr[&m_local.temp.t]); + } + } + else + { + // GSVector4 stq = m_local.d4.stq; + + // s += stq.xxxx(); + // t += stq.yyyy(); + // q += stq.zzzz(); + + movaps(xmm2, ptr[&m_local.d4.stq]); + movaps(xmm3, xmm2); + movaps(xmm4, xmm2); + + shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); + + addps(xmm2, ptr[&m_local.temp.s]); + addps(xmm3, ptr[&m_local.temp.t]); + addps(xmm4, ptr[&m_local.temp.q]); + + movaps(ptr[&m_local.temp.s], xmm2); + movaps(ptr[&m_local.temp.t], xmm3); + movaps(ptr[&m_local.temp.q], xmm4); + + rcpps(xmm4, xmm4); + mulps(xmm2, xmm4); + mulps(xmm3, xmm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i c = m_local.d4.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + movdqa(xmm7, ptr[&m_local.d4.c]); + + pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); + + paddw(xmm5, ptr[&m_local.temp.rb]); + paddw(xmm6, ptr[&m_local.temp.ga]); + + movdqa(ptr[&m_local.temp.rb], xmm5); + movdqa(ptr[&m_local.temp.ga], xmm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + movdqa(xmm5, ptr[&m_local.c.rb]); + movdqa(xmm6, ptr[&m_local.c.ga]); + } + } + } + } + + // test = m_test[7 + (steps & (steps >> 31))]; + + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); + + movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); +} + +void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) +{ + if(!m_sel.zb) + { + return; + } + + // int za = fza_base.y + fza_offset->y; + + mov(ebp, dword[esi + 4]); + add(ebp, dword[edi + 4]); + + // GSVector4i zs = zi; + + if(!m_sel.sprite) + { + if(m_sel.zoverflow) + { + // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + movaps(temp1, ptr[&GSVector4::m_half]); + mulps(temp1, xmm0); + cvttps2dq(temp1, temp1); + pslld(temp1, 1); + + cvttps2dq(xmm0, xmm0); + pcmpeqd(temp2, temp2); + psrld(temp2, 31); + pand(xmm0, temp2); + + por(xmm0, temp1); + } + else + { + // zs = GSVector4i(z); + + cvttps2dq(xmm0, xmm0); + } + + if(m_sel.zwrite) + { + movdqa(ptr[&m_local.temp.zs], xmm0); + } + } + + if(m_sel.ztest) + { + ReadPixel(xmm1, ebp); + + if(m_sel.zwrite && m_sel.zpsm < 2) + { + movdqa(ptr[&m_local.temp.zd], xmm1); + } + + // zd &= 0xffffffff >> m_sel.zpsm * 8; + + if(m_sel.zpsm) + { + pslld(xmm1, m_sel.zpsm * 8); + psrld(xmm1, m_sel.zpsm * 8); + } + + if(m_sel.zoverflow || m_sel.zpsm == 0) + { + // GSVector4i o = GSVector4i::x80000000(); + + pcmpeqd(xmm4, xmm4); + pslld(xmm4, 31); + + // GSVector4i zso = zs - o; + + psubd(xmm0, xmm4); + + // GSVector4i zdo = zd - o; + + psubd(xmm1, xmm4); + } + + switch(m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + pcmpgtd(xmm1, xmm0); + por(xmm7, xmm1); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + pcmpgtd(xmm0, xmm1); + pcmpeqd(xmm4, xmm4); + pxor(xmm0, xmm4); + por(xmm7, xmm0); + break; + } + + alltrue(); + } +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ + if(!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + mov(ebx, dword[&m_local.gd->tex]); + + // ebx = tex + + if(!m_sel.fst) + { + // TODO: move these into Init/Step too? + + cvttps2dq(xmm2, xmm2); + cvttps2dq(xmm3, xmm3); + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + movd(xmm4, eax); + pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + + psubd(xmm2, xmm4); + psubd(xmm3, xmm4); + } + } + + // xmm2 = u + // xmm3 = v + + if(m_sel.ltf) + { + // GSVector4i uf = u.xxzzlh().srl16(1); + + movdqa(xmm0, xmm2); + pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xmm0, 1); + movdqa(ptr[&m_local.temp.uf], xmm0); + + if(!m_sel.sprite) + { + // GSVector4i vf = v.xxzzlh().srl16(1); + + movdqa(xmm1, xmm3); + pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); + psrlw(xmm1, 1); + movdqa(ptr[&m_local.temp.vf], xmm1); + } + } + + // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + + psrad(xmm2, 16); + psrad(xmm3, 16); + packssdw(xmm2, xmm3); + + if(m_sel.ltf) + { + // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); + + movdqa(xmm3, xmm2); + pcmpeqd(xmm1, xmm1); + psrlw(xmm1, 15); + paddw(xmm3, xmm1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(xmm2, xmm3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(xmm2); + } + + // xmm2 = uv0 + // xmm3 = uv1 (ltf) + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + // xmm7 = used + + // GSVector4i y0 = uv0.uph16() << tw; + // GSVector4i x0 = uv0.upl16(); + + pxor(xmm0, xmm0); + + movdqa(xmm4, xmm2); + punpckhwd(xmm2, xmm0); + punpcklwd(xmm4, xmm0); + pslld(xmm2, m_sel.tw + 3); + + // xmm0 = 0 + // xmm2 = y0 + // xmm3 = uv1 (ltf) + // xmm4 = x0 + // xmm1, xmm5, xmm6 = free + // xmm7 = used + + if(m_sel.ltf) + { + // GSVector4i y1 = uv1.uph16() << tw; + // GSVector4i x1 = uv1.upl16(); + + movdqa(xmm6, xmm3); + punpckhwd(xmm3, xmm0); + punpcklwd(xmm6, xmm0); + pslld(xmm3, m_sel.tw + 3); + + // xmm2 = y0 + // xmm3 = y1 + // xmm4 = x0 + // xmm6 = x1 + // xmm0, xmm5, xmm6 = free + // xmm7 = used + + // GSVector4i addr00 = y0 + x0; + // GSVector4i addr01 = y0 + x1; + // GSVector4i addr10 = y1 + x0; + // GSVector4i addr11 = y1 + x1; + + movdqa(xmm5, xmm2); + paddd(xmm5, xmm4); + paddd(xmm2, xmm6); + + movdqa(xmm0, xmm3); + paddd(xmm0, xmm4); + paddd(xmm3, xmm6); + + // xmm5 = addr00 + // xmm2 = addr01 + // xmm0 = addr10 + // xmm3 = addr11 + // xmm1, xmm4, xmm6 = free + // xmm7 = used + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm6, xmm5, xmm1, xmm4); + + // xmm2, xmm5, xmm1 = free + + ReadTexel(xmm4, xmm2, xmm5, xmm1); + + // xmm0, xmm2, xmm5 = free + + ReadTexel(xmm1, xmm0, xmm2, xmm5); + + // xmm3, xmm0, xmm2 = free + + ReadTexel(xmm5, xmm3, xmm0, xmm2); + + // xmm6 = c00 + // xmm4 = c01 + // xmm1 = c10 + // xmm5 = c11 + // xmm0, xmm2, xmm3 = free + // xmm7 = used + + movdqa(xmm0, ptr[&m_local.temp.uf]); + + // GSVector4i rb00 = c00 & mask; + // GSVector4i ga00 = (c00 >> 8) & mask; + + movdqa(xmm2, xmm6); + psllw(xmm2, 8); + psrlw(xmm2, 8); + psrlw(xmm6, 8); + + // GSVector4i rb01 = c01 & mask; + // GSVector4i ga01 = (c01 >> 8) & mask; + + movdqa(xmm3, xmm4); + psllw(xmm3, 8); + psrlw(xmm3, 8); + psrlw(xmm4, 8); + + // xmm0 = uf + // xmm2 = rb00 + // xmm3 = rb01 + // xmm6 = ga00 + // xmm4 = ga01 + // xmm1 = c10 + // xmm5 = c11 + // xmm7 = used + + // rb00 = rb00.lerp16<0>(rb01, uf); + // ga00 = ga00.lerp16<0>(ga01, uf); + + lerp16(xmm3, xmm2, xmm0, 0); + lerp16(xmm4, xmm6, xmm0, 0); + + // xmm0 = uf + // xmm3 = rb00 + // xmm4 = ga00 + // xmm1 = c10 + // xmm5 = c11 + // xmm2, xmm6 = free + // xmm7 = used + + // GSVector4i rb10 = c10 & mask; + // GSVector4i ga10 = (c10 >> 8) & mask; + + movdqa(xmm2, xmm1); + psllw(xmm1, 8); + psrlw(xmm1, 8); + psrlw(xmm2, 8); + + // GSVector4i rb11 = c11 & mask; + // GSVector4i ga11 = (c11 >> 8) & mask; + + movdqa(xmm6, xmm5); + psllw(xmm5, 8); + psrlw(xmm5, 8); + psrlw(xmm6, 8); + + // xmm0 = uf + // xmm3 = rb00 + // xmm4 = ga00 + // xmm1 = rb10 + // xmm5 = rb11 + // xmm2 = ga10 + // xmm6 = ga11 + // xmm7 = used + + // rb10 = rb10.lerp16<0>(rb11, uf); + // ga10 = ga10.lerp16<0>(ga11, uf); + + lerp16(xmm5, xmm1, xmm0, 0); + lerp16(xmm6, xmm2, xmm0, 0); + + // xmm3 = rb00 + // xmm4 = ga00 + // xmm5 = rb10 + // xmm6 = ga10 + // xmm0, xmm1, xmm2 = free + // xmm7 = used + + // rb00 = rb00.lerp16<0>(rb10, vf); + // ga00 = ga00.lerp16<0>(ga10, vf); + + movdqa(xmm0, ptr[&m_local.temp.vf]); + + lerp16(xmm5, xmm3, xmm0, 0); + lerp16(xmm6, xmm4, xmm0, 0); + } + else + { + // GSVector4i addr00 = y0 + x0; + + paddd(xmm2, xmm4); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(xmm5, xmm2, xmm0, xmm1); + + // GSVector4i mask = GSVector4i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + movdqa(xmm6, xmm5); + + psllw(xmm5, 8); + psrlw(xmm5, 8); + psrlw(xmm6, 8); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + pmaxsw(uv, ptr[&m_local.gd->t.min]); + } + else + { + pxor(xmm0, xmm0); + pmaxsw(uv, xmm0); + } + + pminsw(uv, ptr[&m_local.gd->t.max]); + } + else + { + pand(uv, ptr[&m_local.gd->t.min]); + + if(region) + { + por(uv, ptr[&m_local.gd->t.max]); + } + } + } + else + { + movdqa(xmm4, ptr[&m_local.gd->t.min]); + movdqa(xmm5, ptr[&m_local.gd->t.max]); + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + movdqa(xmm1, uv); + + pand(xmm1, xmm4); + + if(region) + { + por(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + pmaxsw(uv, xmm4); + pminsw(uv, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + blend8(uv, xmm1); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) +{ + // xmm0, xmm1, xmm4, xmm5, xmm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + movdqa(xmm4, ptr[&m_local.gd->t.min]); + pmaxsw(uv0, xmm4); + pmaxsw(uv1, xmm4); + } + else + { + pxor(xmm0, xmm0); + pmaxsw(uv0, xmm0); + pmaxsw(uv1, xmm0); + } + + movdqa(xmm5, ptr[&m_local.gd->t.max]); + pminsw(uv0, xmm5); + pminsw(uv1, xmm5); + } + else + { + movdqa(xmm4, ptr[&m_local.gd->t.min]); + pand(uv0, xmm4); + pand(uv1, xmm4); + + if(region) + { + movdqa(xmm5, ptr[&m_local.gd->t.max]); + por(uv0, xmm5); + por(uv1, xmm5); + } + } + } + else + { + movdqa(xmm4, ptr[&m_local.gd->t.min]); + movdqa(xmm5, ptr[&m_local.gd->t.max]); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + movdqa(xmm0, ptr[&m_local.gd->t.mask]); + } + else + { + movdqa(xmm0, ptr[&m_local.gd->t.invmask]); + movdqa(xmm6, xmm0); + } + + // uv0 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + movdqa(xmm1, uv0); + + pand(xmm1, xmm4); + + if(region) + { + por(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + pmaxsw(uv0, xmm4); + pminsw(uv0, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pblendvb(uv0, xmm1); + } + else + { + blendr(uv0, xmm1, xmm0); + } + + // uv1 + + // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + movdqa(xmm1, uv1); + + pand(xmm1, xmm4); + + if(region) + { + por(xmm1, xmm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + pmaxsw(uv1, xmm4); + pminsw(uv1, xmm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pblendvb(uv1, xmm1); + } + else + { + blendr(uv1, xmm1, xmm6); + } + } +} + +void GSDrawScanlineCodeGenerator::AlphaTFX() +{ + if(!m_sel.fb) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + // gat = gat.modulate16<1>(ga).clamp8(); + + modulate16(xmm6, xmm4, 1); + + clamp16(xmm6, xmm3); + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + psrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_DECAL: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + psrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_HIGHLIGHT: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + movdqa(xmm2, xmm4); + + // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); + + psrlw(xmm4, 7); + + if(m_sel.tcc) + { + paddusb(xmm4, xmm6); + } + + mix16(xmm6, xmm4, xmm3); + + break; + + case TFX_HIGHLIGHT2: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + movdqa(xmm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + movdqa(xmm2, xmm4); + + psrlw(xmm4, 7); + + mix16(xmm6, xmm4, xmm3); + } + + break; + + case TFX_NONE: + + // gat = iip ? ga.srl16(7) : ga; + + if(m_sel.iip) + { + psrlw(xmm6, 7); + } + + break; + } + + if(m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if(!m_sel.abe) + { + // a = cov + + if(m_sel.edge) + { + movdqa(xmm0, ptr[&m_local.temp.cov]); + } + else + { + pcmpeqd(xmm0, xmm0); + psllw(xmm0, 15); + psrlw(xmm0, 8); + } + + mix16(xmm6, xmm0, xmm1); + } + else + { + // a = a == 0x80 ? cov : a + + pcmpeqd(xmm0, xmm0); + psllw(xmm0, 15); + psrlw(xmm0, 8); + + if(m_sel.edge) + { + movdqa(xmm1, ptr[&m_local.temp.cov]); + } + else + { + movdqa(xmm1, xmm0); + } + + pcmpeqw(xmm0, xmm6); + psrld(xmm0, 16); + pslld(xmm0, 16); + + blend8(xmm6, xmm1); + } + } +} + +void GSDrawScanlineCodeGenerator::ReadMask() +{ + if(m_sel.fwrite) + { + movdqa(xmm3, ptr[&m_local.gd->fm]); + } + + if(m_sel.zwrite) + { + movdqa(xmm4, ptr[&m_local.gd->zm]); + } +} + +void GSDrawScanlineCodeGenerator::TestAlpha() +{ + switch(m_sel.afail) + { + case AFAIL_FB_ONLY: + if(!m_sel.zwrite) return; + break; + + case AFAIL_ZB_ONLY: + if(!m_sel.fwrite) return; + break; + + case AFAIL_RGB_ONLY: + if(!m_sel.zwrite && m_sel.fpsm == 1) return; + break; + } + + switch(m_sel.atst) + { + case ATST_NEVER: + // t = GSVector4i::xffffffff(); + pcmpeqd(xmm1, xmm1); + break; + + case ATST_ALWAYS: + return; + + case ATST_LESS: + case ATST_LEQUAL: + // t = (ga >> 16) > m_local.gd->aref; + movdqa(xmm1, xmm6); + psrld(xmm1, 16); + pcmpgtd(xmm1, ptr[&m_local.gd->aref]); + break; + + case ATST_EQUAL: + // t = (ga >> 16) != m_local.gd->aref; + movdqa(xmm1, xmm6); + psrld(xmm1, 16); + pcmpeqd(xmm1, ptr[&m_local.gd->aref]); + pcmpeqd(xmm0, xmm0); + pxor(xmm1, xmm0); + break; + + case ATST_GEQUAL: + case ATST_GREATER: + // t = (ga >> 16) < m_local.gd->aref; + movdqa(xmm0, xmm6); + psrld(xmm0, 16); + movdqa(xmm1, ptr[&m_local.gd->aref]); + pcmpgtd(xmm1, xmm0); + break; + + case ATST_NOTEQUAL: + // t = (ga >> 16) == m_local.gd->aref; + movdqa(xmm1, xmm6); + psrld(xmm1, 16); + pcmpeqd(xmm1, ptr[&m_local.gd->aref]); + break; + } + + switch(m_sel.afail) + { + case AFAIL_KEEP: + // test |= t; + por(xmm7, xmm1); + alltrue(); + break; + + case AFAIL_FB_ONLY: + // zm |= t; + por(xmm4, xmm1); + break; + + case AFAIL_ZB_ONLY: + // fm |= t; + por(xmm3, xmm1); + break; + + case AFAIL_RGB_ONLY: + // zm |= t; + por(xmm4, xmm1); + // fm |= t & GSVector4i::xff000000(); + psrld(xmm1, 24); + pslld(xmm1, 24); + por(xmm3, xmm1); + break; + } +} + +void GSDrawScanlineCodeGenerator::ColorTFX() +{ + if(!m_sel.fwrite) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector4i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).clamp8(); + + modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + clamp16(xmm5, xmm1); + + break; + + case TFX_DECAL: + + break; + + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + + if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + movdqa(xmm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + } + + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + movdqa(xmm1, xmm6); + + modulate16(xmm6, xmm2, 1); + + pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); + pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); + psrlw(xmm2, 7); + + paddw(xmm6, xmm2); + + clamp16(xmm6, xmm0); + + mix16(xmm6, xmm1, xmm0); + + // GSVector4i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); + + modulate16(xmm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + paddw(xmm5, xmm2); + + clamp16(xmm5, xmm0); + + break; + + case TFX_NONE: + + // rbt = iip ? rb.srl16(7) : rb; + + if(m_sel.iip) + { + psrlw(xmm5, 7); + } + + break; + } +} + +void GSDrawScanlineCodeGenerator::Fog() +{ + if(!m_sel.fwrite || !m_sel.fge) + { + return; + } + + // rb = m_local.gd->frb.lerp16<0>(rb, f); + // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); + + movdqa(xmm0, ptr[!m_sel.sprite ? &m_local.temp.f : &m_local.p.f]); + movdqa(xmm1, xmm6); + + movdqa(xmm2, ptr[&m_local.gd->frb]); + lerp16(xmm5, xmm2, xmm0, 0); + + movdqa(xmm2, ptr[&m_local.gd->fga]); + lerp16(xmm6, xmm2, xmm0, 0); + mix16(xmm6, xmm1, xmm0); +} + +void GSDrawScanlineCodeGenerator::ReadFrame() +{ + if(!m_sel.fb) + { + return; + } + + // int fa = fza_base.x + fza_offset->x; + + mov(ebx, dword[esi]); + add(ebx, dword[edi]); + + if(!m_sel.rfb) + { + return; + } + + ReadPixel(xmm2, ebx); +} + +void GSDrawScanlineCodeGenerator::TestDestAlpha() +{ + if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) + { + return; + } + + // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); + + movdqa(xmm1, xmm2); + + if(m_sel.datm) + { + if(m_sel.fpsm == 2) + { + pxor(xmm0, xmm0); + psrld(xmm1, 15); + pcmpeqd(xmm1, xmm0); + } + else + { + pcmpeqd(xmm0, xmm0); + pxor(xmm1, xmm0); + psrad(xmm1, 31); + } + } + else + { + if(m_sel.fpsm == 2) + { + pslld(xmm1, 16); + } + + psrad(xmm1, 31); + } + + por(xmm7, xmm1); + + alltrue(); +} + +void GSDrawScanlineCodeGenerator::WriteMask() +{ + // fm |= test; + // zm |= test; + + if(m_sel.fwrite) + { + por(xmm3, xmm7); + } + + if(m_sel.zwrite) + { + por(xmm4, xmm7); + } + + // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + + pcmpeqd(xmm1, xmm1); + + if(m_sel.fwrite && m_sel.zwrite) + { + movdqa(xmm0, xmm1); + pcmpeqd(xmm1, xmm3); + pcmpeqd(xmm0, xmm4); + packssdw(xmm1, xmm0); + } + else if(m_sel.fwrite) + { + pcmpeqd(xmm1, xmm3); + packssdw(xmm1, xmm1); + } + else if(m_sel.zwrite) + { + pcmpeqd(xmm1, xmm4); + packssdw(xmm1, xmm1); + } + + pmovmskb(edx, xmm1); + + not(edx); +} + +void GSDrawScanlineCodeGenerator::WriteZBuf() +{ + if(!m_sel.zwrite) + { + return; + } + + bool fast = m_sel.ztest && m_sel.zpsm < 2; + + movdqa(xmm1, ptr[!m_sel.sprite ? &m_local.temp.zs : &m_local.p.z]); + + if(fast) + { + // zs = zs.blend8(zd, zm); + + movdqa(xmm0, xmm4); + movdqa(xmm7, ptr[&m_local.temp.zd]); + blend8(xmm1, xmm7); + } + + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); +} + +void GSDrawScanlineCodeGenerator::AlphaBlend() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.abe == 0 && m_sel.aa1 == 0) + { + return; + } + + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) + { + switch(m_sel.fpsm) + { + case 0: + case 1: + + // c[2] = fd & mask; + // c[3] = (fd >> 8) & mask; + + movdqa(xmm0, xmm2); + movdqa(xmm1, xmm2); + + psllw(xmm0, 8); + psrlw(xmm0, 8); + psrlw(xmm1, 8); + + break; + + case 2: + + // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + + movdqa(xmm0, xmm2); + movdqa(xmm1, xmm2); + movdqa(xmm4, xmm2); + + pcmpeqd(xmm7, xmm7); + psrld(xmm7, 27); // 0x0000001f + pand(xmm0, xmm7); + pslld(xmm0, 3); + + pslld(xmm7, 10); // 0x00007c00 + pand(xmm4, xmm7); + pslld(xmm4, 9); + + por(xmm0, xmm4); + + movdqa(xmm4, xmm1); + + psrld(xmm7, 5); // 0x000003e0 + pand(xmm1, xmm7); + psrld(xmm1, 2); + + psllw(xmm7, 10); // 0x00008000 + pand(xmm4, xmm7); + pslld(xmm4, 8); + + por(xmm1, xmm4); + + break; + } + } + + // xmm5, xmm6 = src rb, ga + // xmm0, xmm1 = dst rb, ga + // xmm2, xmm3 = used + // xmm4, xmm7 = free + + if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) + { + movdqa(xmm4, xmm5); + } + + if(m_sel.aba != m_sel.abb) + { + // rb = c[aba * 2 + 0]; + + switch(m_sel.aba) + { + case 0: break; + case 1: movdqa(xmm5, xmm0); break; + case 2: pxor(xmm5, xmm5); break; + } + + // rb = rb.sub16(c[abb * 2 + 0]); + + switch(m_sel.abb) + { + case 0: psubw(xmm5, xmm4); break; + case 1: psubw(xmm5, xmm0); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; + + switch(m_sel.abc) + { + case 0: + case 1: + movdqa(xmm7, m_sel.abc ? xmm1 : xmm6); + pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); + pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); + psllw(xmm7, 7); + break; + case 2: + movdqa(xmm7, ptr[&m_local.gd->afix]); + break; + } + + // rb = rb.modulate16<1>(a); + + modulate16(xmm5, xmm7, 1); + } + + // rb = rb.add16(c[abd * 2 + 0]); + + switch(m_sel.abd) + { + case 0: paddw(xmm5, xmm4); break; + case 1: paddw(xmm5, xmm0); break; + case 2: break; + } + } + else + { + // rb = c[abd * 2 + 0]; + + switch(m_sel.abd) + { + case 0: break; + case 1: movdqa(xmm5, xmm0); break; + case 2: pxor(xmm5, xmm5); break; + } + } + + if(m_sel.pabe) + { + // mask = (c[1] << 8).sra32(31); + + movdqa(xmm0, xmm6); + pslld(xmm0, 8); + psrad(xmm0, 31); + + // rb = c[0].blend8(rb, mask); + + blend8r(xmm5, xmm4); + } + + // xmm6 = src ga + // xmm1 = dst ga + // xmm5 = rb + // xmm7 = a + // xmm2, xmm3 = used + // xmm0, xmm4 = free + + movdqa(xmm4, xmm6); + + if(m_sel.aba != m_sel.abb) + { + // ga = c[aba * 2 + 1]; + + switch(m_sel.aba) + { + case 0: break; + case 1: movdqa(xmm6, xmm1); break; + case 2: pxor(xmm6, xmm6); break; + } + + // ga = ga.sub16(c[abeb * 2 + 1]); + + switch(m_sel.abb) + { + case 0: psubw(xmm6, xmm4); break; + case 1: psubw(xmm6, xmm1); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // ga = ga.modulate16<1>(a); + + modulate16(xmm6, xmm7, 1); + } + + // ga = ga.add16(c[abd * 2 + 1]); + + switch(m_sel.abd) + { + case 0: paddw(xmm6, xmm4); break; + case 1: paddw(xmm6, xmm1); break; + case 2: break; + } + } + else + { + // ga = c[abd * 2 + 1]; + + switch(m_sel.abd) + { + case 0: break; + case 1: movdqa(xmm6, xmm1); break; + case 2: pxor(xmm6, xmm6); break; + } + } + + // xmm4 = src ga + // xmm5 = rb + // xmm6 = ga + // xmm2, xmm3 = used + // xmm0, xmm1, xmm7 = free + + if(m_sel.pabe) + { + if(!m_cpu.has(util::Cpu::tSSE41)) + { + // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) + + movdqa(xmm0, xmm4); + pslld(xmm0, 8); + psrad(xmm0, 31); + } + + psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) + + // ga = c[1].blend8(ga, mask).mix16(c[1]); + + blend8r(xmm6, xmm4); + } + else + { + if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx + { + mix16(xmm6, xmm4, xmm7); + } + } +} + +void GSDrawScanlineCodeGenerator::WriteFrame() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.colclamp == 0) + { + // c[0] &= 0x000000ff; + // c[1] &= 0x000000ff; + + pcmpeqd(xmm7, xmm7); + psrlw(xmm7, 8); + pand(xmm5, xmm7); + pand(xmm6, xmm7); + } + + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(eax, dword[esp + _top]); + and(eax, 3); + shl(eax, 5); + paddw(xmm5, ptr[eax + (size_t)&m_local.gd->dimx[0]]); + paddw(xmm6, ptr[eax + (size_t)&m_local.gd->dimx[1]]); + } + + // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); + + movdqa(xmm7, xmm5); + punpcklwd(xmm5, xmm6); + punpckhwd(xmm7, xmm6); + packuswb(xmm5, xmm7); + + if(m_sel.fba && m_sel.fpsm != 1) + { + // fs |= 0x80000000; + + pcmpeqd(xmm7, xmm7); + pslld(xmm7, 31); + por(xmm5, xmm7); + } + + if(m_sel.fpsm == 2) + { + // GSVector4i rb = fs & 0x00f800f8; + // GSVector4i ga = fs & 0x8000f800; + + mov(eax, 0x00f800f8); + movd(xmm6, eax); + pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); + + mov(eax, 0x8000f800); + movd(xmm7, eax); + pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); + + movdqa(xmm4, xmm5); + pand(xmm4, xmm6); + pand(xmm5, xmm7); + + // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + + movdqa(xmm6, xmm4); + movdqa(xmm7, xmm5); + + psrld(xmm4, 3); + psrld(xmm6, 9); + psrld(xmm5, 6); + psrld(xmm7, 16); + + por(xmm5, xmm4); + por(xmm7, xmm6); + por(xmm5, xmm7); + } + + if(m_sel.rfb) + { + // fs = fs.blend(fd, fm); + + blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm + } + + bool fast = m_sel.rfb && m_sel.fpsm < 2; + + WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); +} + +void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) +{ + movq(dst, qword[addr * 2 + (size_t)m_local.gd->vm]); + movhps(dst, qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); +} + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) +{ + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); + + test(mask, 0x0f); + je("@f"); + movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); + + test(mask, 0xf0); + je("@f"); + movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } +} + +static const int s_offsets[4] = {0, 2, 8, 10}; + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) +{ + Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; + + if(m_cpu.has(util::Cpu::tSSE41)) + { + switch(psm) + { + case 0: + if(i == 0) movd(dst, src); + else pextrd(dst, src, i); + break; + case 1: + if(i == 0) movd(eax, src); + else pextrd(eax, src, i); + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + pextrw(eax, src, i * 2); + mov(dst, ax); + break; + } + } + else + { + switch(psm) + { + case 0: + if(i == 0) movd(dst, src); + else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(dst, xmm0);} + break; + case 1: + if(i == 0) movd(eax, src); + else {pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); movd(eax, xmm0);} + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + pextrw(eax, src, i * 2); + mov(dst, ax); + break; + } + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) +{ + if(m_cpu.has(util::Cpu::tSSE41)) + { + ReadTexel(dst, addr, 0); + ReadTexel(dst, addr, 1); + ReadTexel(dst, addr, 2); + ReadTexel(dst, addr, 3); + } + else + { + ReadTexel(dst, addr, 0); + psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) + ReadTexel(temp1, addr, 0); + psrldq(addr, 4); + punpckldq(dst, temp1); + + ReadTexel(temp1, addr, 0); + psrldq(addr, 4); + ReadTexel(temp2, addr, 0); + // psrldq(addr, 4); + punpckldq(temp1, temp2); + + punpcklqdq(dst, temp1); + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) +{ + const Address& src = m_sel.tlu ? ptr[eax * 4 + (size_t)m_local.gd->clut] : ptr[ebx + eax * 4]; + + if(!m_cpu.has(util::Cpu::tSSE41) && i > 0) + { + ASSERT(0); + } + + if(i == 0) movd(eax, addr); + else pextrd(eax, addr, i); + + if(m_sel.tlu) movzx(eax, byte[ebx + eax]); + + if(i == 0) movd(dst, src); + else pinsrd(dst, src, i); +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index 6160537635..afc28b246c 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -1,349 +1,349 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSSetupPrimCodeGenerator.h" - -#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -void GSSetupPrimCodeGenerator::Generate() -{ - enter(32, true); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - mov(r8, (size_t)&m_local); - - if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) - { - mov(rax, (size_t)&m_shift[0]); - - for(int i = 0; i < 5; i++) - { - vmovaps(Xmm(3 + i), ptr[rax + i * 16]); - } - } - - Depth(); - - Texture(); - - Color(); - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -void GSSetupPrimCodeGenerator::Depth() -{ - if(!m_en.z && !m_en.f) - { - return; - } - - if(!m_sel.sprite) - { - // GSVector4 p = dscan.p; - - vmovaps(xmm0, ptr[rdx + 16]); - - if(m_en.f) - { - // GSVector4 df = p.wwww(); - - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); - - vmulps(xmm2, xmm1, xmm3); - vcvttps2dq(xmm2, xmm2); - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); - - vmulps(xmm2, xmm1, Xmm(4 + i)); - vcvttps2dq(xmm2, xmm2); - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2); - } - } - - if(m_en.z) - { - // GSVector4 dz = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - // m_local.d4.z = dz * 4.0f; - - vmulps(xmm1, xmm0, xmm3); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].z = dz * m_shift[i]; - - vmulps(xmm1, xmm0, Xmm(4 + i)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1); - } - } - } - else - { - // GSVector4 p = vertices[0].p; - - vmovaps(xmm0, ptr[rcx + 16]); - - if(m_en.f) - { - // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); - - vcvttps2dq(xmm1, xmm0); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); - } - - if(m_en.z) - { - // GSVector4 z = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - vbroadcastss(xmm1, ptr[r9]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); - } - } -} - -void GSSetupPrimCodeGenerator::Texture() -{ - if(!m_en.t) - { - return; - } - - // GSVector4 t = dscan.t; - - vmovaps(xmm0, ptr[rdx + 32]); - - vmulps(xmm1, xmm0, xmm3); - - if(m_sel.fst) - { - // m_local.d4.st = GSVector4i(t * 4.0f); - - vcvttps2dq(xmm1, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1); - } - else - { - // m_local.d4.stq = t * 4.0f; - - vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); - } - - for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) - { - // GSVector4 ds = t.xxxx(); - // GSVector4 dt = t.yyyy(); - // GSVector4 dq = t.zzzz(); - - vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - - for(int i = 0; i < 4; i++) - { - // GSVector4 v = ds/dt * m_shift[i]; - - vmulps(xmm2, xmm1, Xmm(4 + i)); - - if(m_sel.fst) - { - // m_local.d[i].si/ti = GSVector4i(v); - - vcvttps2dq(xmm2, xmm2); - - switch(j) - { - case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break; - case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break; - } - } - else - { - // m_local.d[i].s/t/q = v; - - switch(j) - { - case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break; - case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break; - case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break; - } - } - } - } -} - -void GSSetupPrimCodeGenerator::Color() -{ - if(!m_en.c) - { - return; - } - - if(m_sel.iip) - { - // GSVector4 c = dscan.c; - - vmovaps(xmm0, ptr[rdx]); - - // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); - - vmulps(xmm1, xmm0, xmm3); - vcvttps2dq(xmm1, xmm1); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); - vpackssdw(xmm1, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1); - - // xmm3 is not needed anymore - - // GSVector4 dr = c.xxxx(); - // GSVector4 db = c.zzzz(); - - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); - - vmulps(xmm0, xmm2, Xmm(4 + i)); - vcvttps2dq(xmm0, xmm0); - vpackssdw(xmm0, xmm0); - - // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); - - vmulps(xmm1, xmm3, Xmm(4 + i)); - vcvttps2dq(xmm1, xmm1); - vpackssdw(xmm1, xmm1); - - // m_local.d[i].rb = r.upl16(b); - - vpunpcklwd(xmm0, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0); - } - - // GSVector4 c = dscan.c; - - vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it - - // GSVector4 dg = c.yyyy(); - // GSVector4 da = c.wwww(); - - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); - - vmulps(xmm0, xmm2, Xmm(4 + i)); - vcvttps2dq(xmm0, xmm0); - vpackssdw(xmm0, xmm0); - - // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); - - vmulps(xmm1, xmm3, Xmm(4 + i)); - vcvttps2dq(xmm1, xmm1); - vpackssdw(xmm1, xmm1); - - // m_local.d[i].ga = g.upl16(a); - - vpunpcklwd(xmm0, xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0); - } - } - else - { - // GSVector4i c = GSVector4i(vertices[0].c); - - vcvttps2dq(xmm0, ptr[rcx]); - - // c = c.upl16(c.zwxy()); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm0, xmm1); - - // if(!tme) c = c.srl16(7); - - if(m_sel.tfx == TFX_NONE) - { - vpsrlw(xmm0, 7); - } - - // m_local.c.rb = c.xxxx(); - // m_local.c.ga = c.zzzz(); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); - } -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSSetupPrimCodeGenerator.h" + +#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +void GSSetupPrimCodeGenerator::Generate() +{ + enter(32, true); + + vmovdqa(ptr[rsp + 0], xmm6); + vmovdqa(ptr[rsp + 16], xmm7); + + mov(r8, (size_t)&m_local); + + if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) + { + mov(rax, (size_t)&m_shift[0]); + + for(int i = 0; i < 5; i++) + { + vmovaps(Xmm(3 + i), ptr[rax + i * 16]); + } + } + + Depth(); + + Texture(); + + Color(); + + vmovdqa(xmm6, ptr[rsp + 0]); + vmovdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if(!m_en.z && !m_en.f) + { + return; + } + + if(!m_sel.sprite) + { + // GSVector4 p = dscan.p; + + vmovaps(xmm0, ptr[rdx + 16]); + + if(m_en.f) + { + // GSVector4 df = p.wwww(); + + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + + // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); + + vmulps(xmm2, xmm1, xmm3); + vcvttps2dq(xmm2, xmm2); + vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); + + vmulps(xmm2, xmm1, Xmm(4 + i)); + vcvttps2dq(xmm2, xmm2); + vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2); + } + } + + if(m_en.z) + { + // GSVector4 dz = p.zzzz(); + + vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + // m_local.d4.z = dz * 4.0f; + + vmulps(xmm1, xmm0, xmm3); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].z = dz * m_shift[i]; + + vmulps(xmm1, xmm0, Xmm(4 + i)); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1); + } + } + } + else + { + // GSVector4 p = vertices[0].p; + + vmovaps(xmm0, ptr[rcx + 16]); + + if(m_en.f) + { + // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); + + vcvttps2dq(xmm1, xmm0); + vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + } + + if(m_en.z) + { + // GSVector4 z = p.zzzz(); + + vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + if(m_sel.zoverflow) + { + // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + mov(r9, (size_t)&GSVector4::m_half); + + vbroadcastss(xmm1, ptr[r9]); + vmulps(xmm1, xmm0); + vcvttps2dq(xmm1, xmm1); + vpslld(xmm1, 1); + + vcvttps2dq(xmm0, xmm0); + vpcmpeqd(xmm2, xmm2); + vpsrld(xmm2, 31); + vpand(xmm0, xmm2); + + vpor(xmm0, xmm1); + } + else + { + // m_local.p.z = GSVector4i(z); + + vcvttps2dq(xmm0, xmm0); + } + + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + } + } +} + +void GSSetupPrimCodeGenerator::Texture() +{ + if(!m_en.t) + { + return; + } + + // GSVector4 t = dscan.t; + + vmovaps(xmm0, ptr[rdx + 32]); + + vmulps(xmm1, xmm0, xmm3); + + if(m_sel.fst) + { + // m_local.d4.st = GSVector4i(t * 4.0f); + + vcvttps2dq(xmm1, xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1); + } + else + { + // m_local.d4.stq = t * 4.0f; + + vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + } + + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) + { + // GSVector4 ds = t.xxxx(); + // GSVector4 dt = t.yyyy(); + // GSVector4 dq = t.zzzz(); + + vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); + + for(int i = 0; i < 4; i++) + { + // GSVector4 v = ds/dt * m_shift[i]; + + vmulps(xmm2, xmm1, Xmm(4 + i)); + + if(m_sel.fst) + { + // m_local.d[i].si/ti = GSVector4i(v); + + vcvttps2dq(xmm2, xmm2); + + switch(j) + { + case 0: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break; + case 1: vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break; + } + } + else + { + // m_local.d[i].s/t/q = v; + + switch(j) + { + case 0: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break; + case 1: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break; + case 2: vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break; + } + } + } + } +} + +void GSSetupPrimCodeGenerator::Color() +{ + if(!m_en.c) + { + return; + } + + if(m_sel.iip) + { + // GSVector4 c = dscan.c; + + vmovaps(xmm0, ptr[rdx]); + + // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); + + vmulps(xmm1, xmm0, xmm3); + vcvttps2dq(xmm1, xmm1); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); + vpackssdw(xmm1, xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1); + + // xmm3 is not needed anymore + + // GSVector4 dr = c.xxxx(); + // GSVector4 db = c.zzzz(); + + vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); + + vmulps(xmm0, xmm2, Xmm(4 + i)); + vcvttps2dq(xmm0, xmm0); + vpackssdw(xmm0, xmm0); + + // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); + + vmulps(xmm1, xmm3, Xmm(4 + i)); + vcvttps2dq(xmm1, xmm1); + vpackssdw(xmm1, xmm1); + + // m_local.d[i].rb = r.upl16(b); + + vpunpcklwd(xmm0, xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm0); + } + + // GSVector4 c = dscan.c; + + vmovaps(xmm0, ptr[rdx]); // not enough regs, have to reload it + + // GSVector4 dg = c.yyyy(); + // GSVector4 da = c.wwww(); + + vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); + + vmulps(xmm0, xmm2, Xmm(4 + i)); + vcvttps2dq(xmm0, xmm0); + vpackssdw(xmm0, xmm0); + + // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); + + vmulps(xmm1, xmm3, Xmm(4 + i)); + vcvttps2dq(xmm1, xmm1); + vpackssdw(xmm1, xmm1); + + // m_local.d[i].ga = g.upl16(a); + + vpunpcklwd(xmm0, xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm0); + } + } + else + { + // GSVector4i c = GSVector4i(vertices[0].c); + + vcvttps2dq(xmm0, ptr[rcx]); + + // c = c.upl16(c.zwxy()); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(xmm0, xmm1); + + // if(!tme) c = c.srl16(7); + + if(m_sel.tfx == TFX_NONE) + { + vpsrlw(xmm0, 7); + } + + // m_local.c.rb = c.xxxx(); + // m_local.c.ga = c.zzzz(); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + } +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index 79c3245a3e..c0adc5607f 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -1,363 +1,363 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSSetupPrimCodeGenerator.h" - -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -void GSSetupPrimCodeGenerator::Generate() -{ - enter(32, true); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - mov(r8, (size_t)&m_local); - - if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) - { - for(int i = 0; i < 5; i++) - { - movaps(Xmm(3 + i), ptr[rax + i * 16]); - } - } - - Depth(); - - Texture(); - - Color(); - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -void GSSetupPrimCodeGenerator::Depth() -{ - if(!m_en.z && !m_en.f) - { - return; - } - - if(!m_sel.sprite) - { - // GSVector4 p = dscan.p; - - movaps(xmm0, ptr[rdx + 16]); - - if(m_en.f) - { - // GSVector4 df = p.wwww(); - - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - - // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); - - movaps(xmm2, xmm1); - mulps(xmm2, xmm3); - cvttps2dq(xmm2, xmm2); - pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); - - movaps(xmm2, xmm1); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2); - } - } - - if(m_en.z) - { - // GSVector4 dz = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - // m_local.d4.z = dz * 4.0f; - - movaps(xmm1, xmm0); - mulps(xmm1, xmm3); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].z = dz * m_shift[i]; - - movaps(xmm1, xmm0); - mulps(xmm1, Xmm(4 + i)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1); - } - } - } - else - { - // GSVector4 p = vertices[0].p; - - movaps(xmm0, ptr[rcx + 16]); - - if(m_en.f) - { - // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); - - cvttps2dq(xmm1, xmm0); - pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); - } - - if(m_en.z) - { - // GSVector4 z = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - mov(r9, (size_t)&GSVector4::m_half); - - movss(xmm1, ptr[r9]); - shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); - } - } -} - -void GSSetupPrimCodeGenerator::Texture() -{ - if(!m_en.t) - { - return; - } - - // GSVector4 t = dscan.t; - - movaps(xmm0, ptr[rdx + 32]); - - movaps(xmm1, xmm0); - mulps(xmm1, xmm3); - - if(m_sel.fst) - { - // m_local.d4.st = GSVector4i(t * 4.0f); - - cvttps2dq(xmm1, xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1); - } - else - { - // m_local.d4.stq = t * 4.0f; - - movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); - } - - for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) - { - // GSVector4 ds = t.xxxx(); - // GSVector4 dt = t.yyyy(); - // GSVector4 dq = t.zzzz(); - - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - - for(int i = 0; i < 4; i++) - { - // GSVector4 v = ds/dt * m_shift[i]; - - movaps(xmm2, xmm1); - mulps(xmm2, Xmm(4 + i)); - - if(m_sel.fst) - { - // m_local.d[i].si/ti = GSVector4i(v); - - cvttps2dq(xmm2, xmm2); - - switch(j) - { - case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break; - case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break; - } - } - else - { - // m_local.d[i].s/t/q = v; - - switch(j) - { - case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break; - case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break; - case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break; - } - } - } - } -} - -void GSSetupPrimCodeGenerator::Color() -{ - if(!m_en.c) - { - return; - } - - if(m_sel.iip) - { - // GSVector4 c = dscan.c; - - movaps(xmm0, ptr[rdx]); - movaps(xmm1, xmm0); - - // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, xmm3); - cvttps2dq(xmm2, xmm2); - pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); - packssdw(xmm2, xmm2); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2); - - // xmm3 is not needed anymore - - // GSVector4 dr = c.xxxx(); - // GSVector4 db = c.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - packssdw(xmm2, xmm2); - - // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); - - movaps(xmm3, xmm1); - mulps(xmm3, Xmm(4 + i)); - cvttps2dq(xmm3, xmm3); - packssdw(xmm3, xmm3); - - // m_local.d[i].rb = r.upl16(b); - - punpcklwd(xmm2, xmm3); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2); - } - - // GSVector4 c = dscan.c; - - movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it - movaps(xmm1, xmm0); - - // GSVector4 dg = c.yyyy(); - // GSVector4 da = c.wwww(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - packssdw(xmm2, xmm2); - - // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); - - movaps(xmm3, xmm1); - mulps(xmm3, Xmm(4 + i)); - cvttps2dq(xmm3, xmm3); - packssdw(xmm3, xmm3); - - // m_local.d[i].ga = g.upl16(a); - - punpcklwd(xmm2, xmm3); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2); - } - } - else - { - // GSVector4i c = GSVector4i(vertices[0].c); - - cvttps2dq(xmm0, ptr[rcx]); - - // c = c.upl16(c.zwxy()); - - pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - punpcklwd(xmm0, xmm1); - - // if(!tme) c = c.srl16(7); - - if(m_sel.tfx == TFX_NONE) - { - psrlw(xmm0, 7); - } - - // m_local.c.rb = c.xxxx(); - // m_local.c.ga = c.zzzz(); - - pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); - movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); - } -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSSetupPrimCodeGenerator.h" + +#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +void GSSetupPrimCodeGenerator::Generate() +{ + enter(32, true); + + vmovdqa(ptr[rsp + 0], xmm6); + vmovdqa(ptr[rsp + 16], xmm7); + + mov(r8, (size_t)&m_local); + + if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) + { + for(int i = 0; i < 5; i++) + { + movaps(Xmm(3 + i), ptr[rax + i * 16]); + } + } + + Depth(); + + Texture(); + + Color(); + + vmovdqa(xmm6, ptr[rsp + 0]); + vmovdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if(!m_en.z && !m_en.f) + { + return; + } + + if(!m_sel.sprite) + { + // GSVector4 p = dscan.p; + + movaps(xmm0, ptr[rdx + 16]); + + if(m_en.f) + { + // GSVector4 df = p.wwww(); + + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + + // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); + + movaps(xmm2, xmm1); + mulps(xmm2, xmm3); + cvttps2dq(xmm2, xmm2); + pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); + + movaps(xmm2, xmm1); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].f)], xmm2); + } + } + + if(m_en.z) + { + // GSVector4 dz = p.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + // m_local.d4.z = dz * 4.0f; + + movaps(xmm1, xmm0); + mulps(xmm1, xmm3); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].z = dz * m_shift[i]; + + movaps(xmm1, xmm0); + mulps(xmm1, Xmm(4 + i)); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].z)], xmm1); + } + } + } + else + { + // GSVector4 p = vertices[0].p; + + movaps(xmm0, ptr[rcx + 16]); + + if(m_en.f) + { + // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); + + cvttps2dq(xmm1, xmm0); + pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1); + } + + if(m_en.z) + { + // GSVector4 z = p.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + if(m_sel.zoverflow) + { + // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + mov(r9, (size_t)&GSVector4::m_half); + + movss(xmm1, ptr[r9]); + shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); + mulps(xmm1, xmm0); + cvttps2dq(xmm1, xmm1); + pslld(xmm1, 1); + + cvttps2dq(xmm0, xmm0); + pcmpeqd(xmm2, xmm2); + psrld(xmm2, 31); + pand(xmm0, xmm2); + + por(xmm0, xmm1); + } + else + { + // m_local.p.z = GSVector4i(z); + + cvttps2dq(xmm0, xmm0); + } + + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0); + } + } +} + +void GSSetupPrimCodeGenerator::Texture() +{ + if(!m_en.t) + { + return; + } + + // GSVector4 t = dscan.t; + + movaps(xmm0, ptr[rdx + 32]); + + movaps(xmm1, xmm0); + mulps(xmm1, xmm3); + + if(m_sel.fst) + { + // m_local.d4.st = GSVector4i(t * 4.0f); + + cvttps2dq(xmm1, xmm1); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.st)], xmm1); + } + else + { + // m_local.d4.stq = t * 4.0f; + + movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + } + + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) + { + // GSVector4 ds = t.xxxx(); + // GSVector4 dt = t.yyyy(); + // GSVector4 dq = t.zzzz(); + + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); + + for(int i = 0; i < 4; i++) + { + // GSVector4 v = ds/dt * m_shift[i]; + + movaps(xmm2, xmm1); + mulps(xmm2, Xmm(4 + i)); + + if(m_sel.fst) + { + // m_local.d[i].si/ti = GSVector4i(v); + + cvttps2dq(xmm2, xmm2); + + switch(j) + { + case 0: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].si)], xmm2); break; + case 1: movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ti)], xmm2); break; + } + } + else + { + // m_local.d[i].s/t/q = v; + + switch(j) + { + case 0: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].s)], xmm2); break; + case 1: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].t)], xmm2); break; + case 2: movaps(ptr[r8 + offsetof(GSScanlineLocalData, d[i].q)], xmm2); break; + } + } + } + } +} + +void GSSetupPrimCodeGenerator::Color() +{ + if(!m_en.c) + { + return; + } + + if(m_sel.iip) + { + // GSVector4 c = dscan.c; + + movaps(xmm0, ptr[rdx]); + movaps(xmm1, xmm0); + + // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, xmm3); + cvttps2dq(xmm2, xmm2); + pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); + packssdw(xmm2, xmm2); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2); + + // xmm3 is not needed anymore + + // GSVector4 dr = c.xxxx(); + // GSVector4 db = c.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + packssdw(xmm2, xmm2); + + // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); + + movaps(xmm3, xmm1); + mulps(xmm3, Xmm(4 + i)); + cvttps2dq(xmm3, xmm3); + packssdw(xmm3, xmm3); + + // m_local.d[i].rb = r.upl16(b); + + punpcklwd(xmm2, xmm3); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].rb)], xmm2); + } + + // GSVector4 c = dscan.c; + + movaps(xmm0, ptr[rdx]); // not enough regs, have to reload it + movaps(xmm1, xmm0); + + // GSVector4 dg = c.yyyy(); + // GSVector4 da = c.wwww(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + packssdw(xmm2, xmm2); + + // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); + + movaps(xmm3, xmm1); + mulps(xmm3, Xmm(4 + i)); + cvttps2dq(xmm3, xmm3); + packssdw(xmm3, xmm3); + + // m_local.d[i].ga = g.upl16(a); + + punpcklwd(xmm2, xmm3); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d[i].ga)], xmm2); + } + } + else + { + // GSVector4i c = GSVector4i(vertices[0].c); + + cvttps2dq(xmm0, ptr[rcx]); + + // c = c.upl16(c.zwxy()); + + pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); + punpcklwd(xmm0, xmm1); + + // if(!tme) c = c.srl16(7); + + if(m_sel.tfx == TFX_NONE) + { + psrlw(xmm0, 7); + } + + // m_local.c.rb = c.xxxx(); + // m_local.c.ga = c.zzzz(); + + pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2); + } +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index 09caa03fb8..46ac4df8af 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -1,333 +1,333 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSSetupPrimCodeGenerator.h" - -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -void GSSetupPrimCodeGenerator::Generate() -{ - if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) - { - for(int i = 0; i < 5; i++) - { - vmovaps(Xmm(3 + i), ptr[&m_shift[i]]); - } - } - - Depth(); - - Texture(); - - Color(); - - ret(); -} - -void GSSetupPrimCodeGenerator::Depth() -{ - if(!m_en.z && !m_en.f) - { - return; - } - - if(!m_sel.sprite) - { - // GSVector4 p = dscan.p; - - vmovaps(xmm0, ptr[edx + 16]); - - if(m_en.f) - { - // GSVector4 df = p.wwww(); - - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); - - vmulps(xmm2, xmm1, xmm3); - vcvttps2dq(xmm2, xmm2); - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.d4.f], xmm2); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); - - vmulps(xmm2, xmm1, Xmm(4 + i)); - vcvttps2dq(xmm2, xmm2); - vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.d[i].f], xmm2); - } - } - - if(m_en.z) - { - // GSVector4 dz = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - // m_local.d4.z = dz * 4.0f; - - vmulps(xmm1, xmm0, xmm3); - vmovdqa(ptr[&m_local.d4.z], xmm1); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].z = dz * m_shift[i]; - - vmulps(xmm1, xmm0, Xmm(4 + i)); - vmovdqa(ptr[&m_local.d[i].z], xmm1); - } - } - } - else - { - // GSVector4 p = vertices[0].p; - - vmovaps(xmm0, ptr[ecx + 16]); - - if(m_en.f) - { - // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); - - vcvttps2dq(xmm1, xmm0); - vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - vmovdqa(ptr[&m_local.p.f], xmm1); - } - - if(m_en.z) - { - // GSVector4 z = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - vbroadcastss(xmm1, ptr[&GSVector4::m_half]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } - - vmovdqa(ptr[&m_local.p.z], xmm0); - } - } -} - -void GSSetupPrimCodeGenerator::Texture() -{ - if(!m_en.t) - { - return; - } - - // GSVector4 t = dscan.t; - - vmovaps(xmm0, ptr[edx + 32]); - - vmulps(xmm1, xmm0, xmm3); - - if(m_sel.fst) - { - // m_local.d4.st = GSVector4i(t * 4.0f); - - vcvttps2dq(xmm1, xmm1); - vmovdqa(ptr[&m_local.d4.st], xmm1); - } - else - { - // m_local.d4.stq = t * 4.0f; - - vmovaps(ptr[&m_local.d4.stq], xmm1); - } - - for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) - { - // GSVector4 ds = t.xxxx(); - // GSVector4 dt = t.yyyy(); - // GSVector4 dq = t.zzzz(); - - vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - - for(int i = 0; i < 4; i++) - { - // GSVector4 v = ds/dt * m_shift[i]; - - vmulps(xmm2, xmm1, Xmm(4 + i)); - - if(m_sel.fst) - { - // m_local.d[i].si/ti = GSVector4i(v); - - vcvttps2dq(xmm2, xmm2); - - switch(j) - { - case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break; - case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break; - } - } - else - { - // m_local.d[i].s/t/q = v; - - switch(j) - { - case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break; - case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break; - case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break; - } - } - } - } -} - -void GSSetupPrimCodeGenerator::Color() -{ - if(!m_en.c) - { - return; - } - - if(m_sel.iip) - { - // GSVector4 c = dscan.c; - - vmovaps(xmm0, ptr[edx]); - - // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); - - vmulps(xmm1, xmm0, xmm3); - vcvttps2dq(xmm1, xmm1); - vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); - vpackssdw(xmm1, xmm1); - vmovdqa(ptr[&m_local.d4.c], xmm1); - - // xmm3 is not needed anymore - - // GSVector4 dr = c.xxxx(); - // GSVector4 db = c.zzzz(); - - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); - - vmulps(xmm0, xmm2, Xmm(4 + i)); - vcvttps2dq(xmm0, xmm0); - vpackssdw(xmm0, xmm0); - - // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); - - vmulps(xmm1, xmm3, Xmm(4 + i)); - vcvttps2dq(xmm1, xmm1); - vpackssdw(xmm1, xmm1); - - // m_local.d[i].rb = r.upl16(b); - - vpunpcklwd(xmm0, xmm1); - vmovdqa(ptr[&m_local.d[i].rb], xmm0); - } - - // GSVector4 c = dscan.c; - - vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it - - // GSVector4 dg = c.yyyy(); - // GSVector4 da = c.wwww(); - - vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); - - vmulps(xmm0, xmm2, Xmm(4 + i)); - vcvttps2dq(xmm0, xmm0); - vpackssdw(xmm0, xmm0); - - // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); - - vmulps(xmm1, xmm3, Xmm(4 + i)); - vcvttps2dq(xmm1, xmm1); - vpackssdw(xmm1, xmm1); - - // m_local.d[i].ga = g.upl16(a); - - vpunpcklwd(xmm0, xmm1); - vmovdqa(ptr[&m_local.d[i].ga], xmm0); - } - } - else - { - // GSVector4i c = GSVector4i(vertices[0].c); - - vcvttps2dq(xmm0, ptr[ecx]); - - // c = c.upl16(c.zwxy()); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - vpunpcklwd(xmm0, xmm1); - - // if(!tme) c = c.srl16(7); - - if(m_sel.tfx == TFX_NONE) - { - vpsrlw(xmm0, 7); - } - - // m_local.c.rb = c.xxxx(); - // m_local.c.ga = c.zzzz(); - - vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - vmovdqa(ptr[&m_local.c.rb], xmm1); - vmovdqa(ptr[&m_local.c.ga], xmm2); - } -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSSetupPrimCodeGenerator.h" + +#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +void GSSetupPrimCodeGenerator::Generate() +{ + if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) + { + for(int i = 0; i < 5; i++) + { + vmovaps(Xmm(3 + i), ptr[&m_shift[i]]); + } + } + + Depth(); + + Texture(); + + Color(); + + ret(); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if(!m_en.z && !m_en.f) + { + return; + } + + if(!m_sel.sprite) + { + // GSVector4 p = dscan.p; + + vmovaps(xmm0, ptr[edx + 16]); + + if(m_en.f) + { + // GSVector4 df = p.wwww(); + + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + + // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); + + vmulps(xmm2, xmm1, xmm3); + vcvttps2dq(xmm2, xmm2); + vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.d4.f], xmm2); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); + + vmulps(xmm2, xmm1, Xmm(4 + i)); + vcvttps2dq(xmm2, xmm2); + vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.d[i].f], xmm2); + } + } + + if(m_en.z) + { + // GSVector4 dz = p.zzzz(); + + vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + // m_local.d4.z = dz * 4.0f; + + vmulps(xmm1, xmm0, xmm3); + vmovdqa(ptr[&m_local.d4.z], xmm1); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].z = dz * m_shift[i]; + + vmulps(xmm1, xmm0, Xmm(4 + i)); + vmovdqa(ptr[&m_local.d[i].z], xmm1); + } + } + } + else + { + // GSVector4 p = vertices[0].p; + + vmovaps(xmm0, ptr[ecx + 16]); + + if(m_en.f) + { + // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); + + vcvttps2dq(xmm1, xmm0); + vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + vmovdqa(ptr[&m_local.p.f], xmm1); + } + + if(m_en.z) + { + // GSVector4 z = p.zzzz(); + + vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + if(m_sel.zoverflow) + { + // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + vbroadcastss(xmm1, ptr[&GSVector4::m_half]); + vmulps(xmm1, xmm0); + vcvttps2dq(xmm1, xmm1); + vpslld(xmm1, 1); + + vcvttps2dq(xmm0, xmm0); + vpcmpeqd(xmm2, xmm2); + vpsrld(xmm2, 31); + vpand(xmm0, xmm2); + + vpor(xmm0, xmm1); + } + else + { + // m_local.p.z = GSVector4i(z); + + vcvttps2dq(xmm0, xmm0); + } + + vmovdqa(ptr[&m_local.p.z], xmm0); + } + } +} + +void GSSetupPrimCodeGenerator::Texture() +{ + if(!m_en.t) + { + return; + } + + // GSVector4 t = dscan.t; + + vmovaps(xmm0, ptr[edx + 32]); + + vmulps(xmm1, xmm0, xmm3); + + if(m_sel.fst) + { + // m_local.d4.st = GSVector4i(t * 4.0f); + + vcvttps2dq(xmm1, xmm1); + vmovdqa(ptr[&m_local.d4.st], xmm1); + } + else + { + // m_local.d4.stq = t * 4.0f; + + vmovaps(ptr[&m_local.d4.stq], xmm1); + } + + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) + { + // GSVector4 ds = t.xxxx(); + // GSVector4 dt = t.yyyy(); + // GSVector4 dq = t.zzzz(); + + vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); + + for(int i = 0; i < 4; i++) + { + // GSVector4 v = ds/dt * m_shift[i]; + + vmulps(xmm2, xmm1, Xmm(4 + i)); + + if(m_sel.fst) + { + // m_local.d[i].si/ti = GSVector4i(v); + + vcvttps2dq(xmm2, xmm2); + + switch(j) + { + case 0: vmovdqa(ptr[&m_local.d[i].si], xmm2); break; + case 1: vmovdqa(ptr[&m_local.d[i].ti], xmm2); break; + } + } + else + { + // m_local.d[i].s/t/q = v; + + switch(j) + { + case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break; + case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break; + case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break; + } + } + } + } +} + +void GSSetupPrimCodeGenerator::Color() +{ + if(!m_en.c) + { + return; + } + + if(m_sel.iip) + { + // GSVector4 c = dscan.c; + + vmovaps(xmm0, ptr[edx]); + + // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); + + vmulps(xmm1, xmm0, xmm3); + vcvttps2dq(xmm1, xmm1); + vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); + vpackssdw(xmm1, xmm1); + vmovdqa(ptr[&m_local.d4.c], xmm1); + + // xmm3 is not needed anymore + + // GSVector4 dr = c.xxxx(); + // GSVector4 db = c.zzzz(); + + vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); + + vmulps(xmm0, xmm2, Xmm(4 + i)); + vcvttps2dq(xmm0, xmm0); + vpackssdw(xmm0, xmm0); + + // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); + + vmulps(xmm1, xmm3, Xmm(4 + i)); + vcvttps2dq(xmm1, xmm1); + vpackssdw(xmm1, xmm1); + + // m_local.d[i].rb = r.upl16(b); + + vpunpcklwd(xmm0, xmm1); + vmovdqa(ptr[&m_local.d[i].rb], xmm0); + } + + // GSVector4 c = dscan.c; + + vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it + + // GSVector4 dg = c.yyyy(); + // GSVector4 da = c.wwww(); + + vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); + + vmulps(xmm0, xmm2, Xmm(4 + i)); + vcvttps2dq(xmm0, xmm0); + vpackssdw(xmm0, xmm0); + + // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); + + vmulps(xmm1, xmm3, Xmm(4 + i)); + vcvttps2dq(xmm1, xmm1); + vpackssdw(xmm1, xmm1); + + // m_local.d[i].ga = g.upl16(a); + + vpunpcklwd(xmm0, xmm1); + vmovdqa(ptr[&m_local.d[i].ga], xmm0); + } + } + else + { + // GSVector4i c = GSVector4i(vertices[0].c); + + vcvttps2dq(xmm0, ptr[ecx]); + + // c = c.upl16(c.zwxy()); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(xmm0, xmm1); + + // if(!tme) c = c.srl16(7); + + if(m_sel.tfx == TFX_NONE) + { + vpsrlw(xmm0, 7); + } + + // m_local.c.rb = c.xxxx(); + // m_local.c.ga = c.zzzz(); + + vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vmovdqa(ptr[&m_local.c.rb], xmm1); + vmovdqa(ptr[&m_local.c.ga], xmm2); + } +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 38c91baf8d..28354c4086 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -1,349 +1,349 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSSetupPrimCodeGenerator.h" - -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -void GSSetupPrimCodeGenerator::Generate() -{ - if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) - { - for(int i = 0; i < 5; i++) - { - movaps(Xmm(3 + i), ptr[&m_shift[i]]); - } - } - - Depth(); - - Texture(); - - Color(); - - ret(); -} - -void GSSetupPrimCodeGenerator::Depth() -{ - if(!m_en.z && !m_en.f) - { - return; - } - - if(!m_sel.sprite) - { - // GSVector4 p = dscan.p; - - movaps(xmm0, ptr[edx + 16]); - - if(m_en.f) - { - // GSVector4 df = p.wwww(); - - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - - // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); - - movaps(xmm2, xmm1); - mulps(xmm2, xmm3); - cvttps2dq(xmm2, xmm2); - pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[&m_local.d4.f], xmm2); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); - - movaps(xmm2, xmm1); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); - movdqa(ptr[&m_local.d[i].f], xmm2); - } - } - - if(m_en.z) - { - // GSVector4 dz = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - // m_local.d4.z = dz * 4.0f; - - movaps(xmm1, xmm0); - mulps(xmm1, xmm3); - movdqa(ptr[&m_local.d4.z], xmm1); - - for(int i = 0; i < 4; i++) - { - // m_local.d[i].z = dz * m_shift[i]; - - movaps(xmm1, xmm0); - mulps(xmm1, Xmm(4 + i)); - movdqa(ptr[&m_local.d[i].z], xmm1); - } - } - } - else - { - // GSVector4 p = vertices[0].p; - - movaps(xmm0, ptr[ecx + 16]); - - if(m_en.f) - { - // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); - - cvttps2dq(xmm1, xmm0); - pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - movdqa(ptr[&m_local.p.f], xmm1); - } - - if(m_en.z) - { - // GSVector4 z = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - movaps(xmm1, ptr[&GSVector4::m_half]); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - - movdqa(ptr[&m_local.p.z], xmm0); - } - } -} - -void GSSetupPrimCodeGenerator::Texture() -{ - if(!m_en.t) - { - return; - } - - // GSVector4 t = dscan.t; - - movaps(xmm0, ptr[edx + 32]); - - movaps(xmm1, xmm0); - mulps(xmm1, xmm3); - - if(m_sel.fst) - { - // m_local.d4.st = GSVector4i(t * 4.0f); - - cvttps2dq(xmm1, xmm1); - movdqa(ptr[&m_local.d4.st], xmm1); - } - else - { - // m_local.d4.stq = t * 4.0f; - - movaps(ptr[&m_local.d4.stq], xmm1); - } - - for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) - { - // GSVector4 ds = t.xxxx(); - // GSVector4 dt = t.yyyy(); - // GSVector4 dq = t.zzzz(); - - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - - for(int i = 0; i < 4; i++) - { - // GSVector4 v = ds/dt * m_shift[i]; - - movaps(xmm2, xmm1); - mulps(xmm2, Xmm(4 + i)); - - if(m_sel.fst) - { - // m_local.d[i].si/ti = GSVector4i(v); - - cvttps2dq(xmm2, xmm2); - - switch(j) - { - case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break; - case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break; - } - } - else - { - // m_local.d[i].s/t/q = v; - - switch(j) - { - case 0: movaps(ptr[&m_local.d[i].s], xmm2); break; - case 1: movaps(ptr[&m_local.d[i].t], xmm2); break; - case 2: movaps(ptr[&m_local.d[i].q], xmm2); break; - } - } - } - } -} - -void GSSetupPrimCodeGenerator::Color() -{ - if(!m_en.c) - { - return; - } - - if(m_sel.iip) - { - // GSVector4 c = dscan.c; - - movaps(xmm0, ptr[edx]); - movaps(xmm1, xmm0); - - // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, xmm3); - cvttps2dq(xmm2, xmm2); - pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); - packssdw(xmm2, xmm2); - movdqa(ptr[&m_local.d4.c], xmm2); - - // xmm3 is not needed anymore - - // GSVector4 dr = c.xxxx(); - // GSVector4 db = c.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - packssdw(xmm2, xmm2); - - // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); - - movaps(xmm3, xmm1); - mulps(xmm3, Xmm(4 + i)); - cvttps2dq(xmm3, xmm3); - packssdw(xmm3, xmm3); - - // m_local.d[i].rb = r.upl16(b); - - punpcklwd(xmm2, xmm3); - movdqa(ptr[&m_local.d[i].rb], xmm2); - } - - // GSVector4 c = dscan.c; - - movaps(xmm0, ptr[edx]); // not enough regs, have to reload it - movaps(xmm1, xmm0); - - // GSVector4 dg = c.yyyy(); - // GSVector4 da = c.wwww(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - - for(int i = 0; i < 4; i++) - { - // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); - - movaps(xmm2, xmm0); - mulps(xmm2, Xmm(4 + i)); - cvttps2dq(xmm2, xmm2); - packssdw(xmm2, xmm2); - - // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); - - movaps(xmm3, xmm1); - mulps(xmm3, Xmm(4 + i)); - cvttps2dq(xmm3, xmm3); - packssdw(xmm3, xmm3); - - // m_local.d[i].ga = g.upl16(a); - - punpcklwd(xmm2, xmm3); - movdqa(ptr[&m_local.d[i].ga], xmm2); - } - } - else - { - // GSVector4i c = GSVector4i(vertices[0].c); - - movaps(xmm0, ptr[ecx]); - cvttps2dq(xmm0, xmm0); - - // c = c.upl16(c.zwxy()); - - pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); - punpcklwd(xmm0, xmm1); - - // if(!tme) c = c.srl16(7); - - if(m_sel.tfx == TFX_NONE) - { - psrlw(xmm0, 7); - } - - // m_local.c.rb = c.xxxx(); - // m_local.c.ga = c.zzzz(); - - pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); - pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - movdqa(ptr[&m_local.c.rb], xmm1); - movdqa(ptr[&m_local.c.ga], xmm2); - } -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSSetupPrimCodeGenerator.h" + +#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +void GSSetupPrimCodeGenerator::Generate() +{ + if((m_en.z || m_en.f) && !m_sel.sprite || m_en.t || m_en.c && m_sel.iip) + { + for(int i = 0; i < 5; i++) + { + movaps(Xmm(3 + i), ptr[&m_shift[i]]); + } + } + + Depth(); + + Texture(); + + Color(); + + ret(); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if(!m_en.z && !m_en.f) + { + return; + } + + if(!m_sel.sprite) + { + // GSVector4 p = dscan.p; + + movaps(xmm0, ptr[edx + 16]); + + if(m_en.f) + { + // GSVector4 df = p.wwww(); + + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + + // m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh(); + + movaps(xmm2, xmm1); + mulps(xmm2, xmm3); + cvttps2dq(xmm2, xmm2); + pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + movdqa(ptr[&m_local.d4.f], xmm2); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); + + movaps(xmm2, xmm1); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); + movdqa(ptr[&m_local.d[i].f], xmm2); + } + } + + if(m_en.z) + { + // GSVector4 dz = p.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + // m_local.d4.z = dz * 4.0f; + + movaps(xmm1, xmm0); + mulps(xmm1, xmm3); + movdqa(ptr[&m_local.d4.z], xmm1); + + for(int i = 0; i < 4; i++) + { + // m_local.d[i].z = dz * m_shift[i]; + + movaps(xmm1, xmm0); + mulps(xmm1, Xmm(4 + i)); + movdqa(ptr[&m_local.d[i].z], xmm1); + } + } + } + else + { + // GSVector4 p = vertices[0].p; + + movaps(xmm0, ptr[ecx + 16]); + + if(m_en.f) + { + // m_local.p.f = GSVector4i(p).zzzzh().zzzz(); + + cvttps2dq(xmm1, xmm0); + pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + movdqa(ptr[&m_local.p.f], xmm1); + } + + if(m_en.z) + { + // GSVector4 z = p.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + if(m_sel.zoverflow) + { + // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + + movaps(xmm1, ptr[&GSVector4::m_half]); + mulps(xmm1, xmm0); + cvttps2dq(xmm1, xmm1); + pslld(xmm1, 1); + + cvttps2dq(xmm0, xmm0); + pcmpeqd(xmm2, xmm2); + psrld(xmm2, 31); + pand(xmm0, xmm2); + + por(xmm0, xmm1); + } + else + { + // m_local.p.z = GSVector4i(z); + + cvttps2dq(xmm0, xmm0); + } + + movdqa(ptr[&m_local.p.z], xmm0); + } + } +} + +void GSSetupPrimCodeGenerator::Texture() +{ + if(!m_en.t) + { + return; + } + + // GSVector4 t = dscan.t; + + movaps(xmm0, ptr[edx + 32]); + + movaps(xmm1, xmm0); + mulps(xmm1, xmm3); + + if(m_sel.fst) + { + // m_local.d4.st = GSVector4i(t * 4.0f); + + cvttps2dq(xmm1, xmm1); + movdqa(ptr[&m_local.d4.st], xmm1); + } + else + { + // m_local.d4.stq = t * 4.0f; + + movaps(ptr[&m_local.d4.stq], xmm1); + } + + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) + { + // GSVector4 ds = t.xxxx(); + // GSVector4 dt = t.yyyy(); + // GSVector4 dq = t.zzzz(); + + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); + + for(int i = 0; i < 4; i++) + { + // GSVector4 v = ds/dt * m_shift[i]; + + movaps(xmm2, xmm1); + mulps(xmm2, Xmm(4 + i)); + + if(m_sel.fst) + { + // m_local.d[i].si/ti = GSVector4i(v); + + cvttps2dq(xmm2, xmm2); + + switch(j) + { + case 0: movdqa(ptr[&m_local.d[i].si], xmm2); break; + case 1: movdqa(ptr[&m_local.d[i].ti], xmm2); break; + } + } + else + { + // m_local.d[i].s/t/q = v; + + switch(j) + { + case 0: movaps(ptr[&m_local.d[i].s], xmm2); break; + case 1: movaps(ptr[&m_local.d[i].t], xmm2); break; + case 2: movaps(ptr[&m_local.d[i].q], xmm2); break; + } + } + } + } +} + +void GSSetupPrimCodeGenerator::Color() +{ + if(!m_en.c) + { + return; + } + + if(m_sel.iip) + { + // GSVector4 c = dscan.c; + + movaps(xmm0, ptr[edx]); + movaps(xmm1, xmm0); + + // m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, xmm3); + cvttps2dq(xmm2, xmm2); + pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); + packssdw(xmm2, xmm2); + movdqa(ptr[&m_local.d4.c], xmm2); + + // xmm3 is not needed anymore + + // GSVector4 dr = c.xxxx(); + // GSVector4 db = c.zzzz(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + packssdw(xmm2, xmm2); + + // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); + + movaps(xmm3, xmm1); + mulps(xmm3, Xmm(4 + i)); + cvttps2dq(xmm3, xmm3); + packssdw(xmm3, xmm3); + + // m_local.d[i].rb = r.upl16(b); + + punpcklwd(xmm2, xmm3); + movdqa(ptr[&m_local.d[i].rb], xmm2); + } + + // GSVector4 c = dscan.c; + + movaps(xmm0, ptr[edx]); // not enough regs, have to reload it + movaps(xmm1, xmm0); + + // GSVector4 dg = c.yyyy(); + // GSVector4 da = c.wwww(); + + shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + + for(int i = 0; i < 4; i++) + { + // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); + + movaps(xmm2, xmm0); + mulps(xmm2, Xmm(4 + i)); + cvttps2dq(xmm2, xmm2); + packssdw(xmm2, xmm2); + + // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); + + movaps(xmm3, xmm1); + mulps(xmm3, Xmm(4 + i)); + cvttps2dq(xmm3, xmm3); + packssdw(xmm3, xmm3); + + // m_local.d[i].ga = g.upl16(a); + + punpcklwd(xmm2, xmm3); + movdqa(ptr[&m_local.d[i].ga], xmm2); + } + } + else + { + // GSVector4i c = GSVector4i(vertices[0].c); + + movaps(xmm0, ptr[ecx]); + cvttps2dq(xmm0, xmm0); + + // c = c.upl16(c.zwxy()); + + pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2)); + punpcklwd(xmm0, xmm1); + + // if(!tme) c = c.srl16(7); + + if(m_sel.tfx == TFX_NONE) + { + psrlw(xmm0, 7); + } + + // m_local.c.rb = c.xxxx(); + // m_local.c.ga = c.zzzz(); + + pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); + pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + + movdqa(ptr[&m_local.c.rb], xmm1); + movdqa(ptr[&m_local.c.ga], xmm2); + } +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSVertexTrace.x64.avx.cpp b/plugins/GSdx/GSVertexTrace.x64.avx.cpp index e4140e0fdc..b3ba691c37 100644 --- a/plugins/GSdx/GSVertexTrace.x64.avx.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.avx.cpp @@ -1,496 +1,496 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - enter(32, true); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - vmovaps(xmm2, xmm4); - vmovaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); - - vminps(xmm2, xmm0); - vmaxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - } - - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexSW)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - vcvttps2dq(xmm2, xmm2); - vpsrld(xmm2, 7); - vmovaps(ptr[r8], xmm2); - - vcvttps2dq(xmm3, xmm3); - vpsrld(xmm3, 7); - vmovaps(ptr[r9], xmm3); - } - - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); - - if(tme) - { - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - enter(32, true); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - vdivps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[r8], xmm2); - vmovaps(ptr[r9], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - enter(32, true); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - vmovaps(xmm1, xmm0); - } - - vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - - vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); - vpmovzxwd(xmm1, xmm0); - - vpsrld(xmm0, 1); - vpunpcklqdq(xmm1, xmm0); - vcvtdq2ps(xmm1, xmm1); - - vminps(xmm4, xmm1); - vmaxps(xmm5, xmm1); - } - - add(rdx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[r8], xmm2); - vmovaps(ptr[r9], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[r8 + 16], xmm4); - vmovaps(ptr[r9 + 16], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - vmovaps(ptr[r8 + 32], xmm6); - vmovaps(ptr[r9 + 32], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSVertexTrace.h" + +#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + enter(32, true); + + vmovdqa(ptr[rsp + 0], xmm6); + vmovdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + vbroadcastss(xmm4, ptr[rax + 0]); + vbroadcastss(xmm5, ptr[rax + 4]); + + if(color) + { + // min.c = FLT_MAX; + // max.c = -FLT_MAX; + + vmovaps(xmm2, xmm4); + vmovaps(xmm3, xmm5); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); + vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1)) + { + // min.c = min.c.minv(v[i + j].c); + // max.c = max.c.maxv(v[i + j].c); + + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); + + vminps(xmm2, xmm0); + vmaxps(xmm3, xmm0); + } + + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); + + vminps(xmm4, xmm0); + vmaxps(xmm5, xmm0); + + if(tme) + { + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); + + if(!fst) + { + if(primclass != GS_SPRITE_CLASS) + { + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + } + + vdivps(xmm0, xmm1); + vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); + } + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + } + + add(rdx, n * sizeof(GSVertexSW)); + sub(ecx, n); + + jg("loop"); + + // } + + if(color) + { + vcvttps2dq(xmm2, xmm2); + vpsrld(xmm2, 7); + vmovaps(ptr[r8], xmm2); + + vcvttps2dq(xmm3, xmm3); + vpsrld(xmm3, 7); + vmovaps(ptr[r9], xmm3); + } + + vmovaps(ptr[r8 + 16], xmm4); + vmovaps(ptr[r9 + 16], xmm5); + + if(tme) + { + vmovaps(ptr[r8 + 32], xmm6); + vmovaps(ptr[r9 + 32], xmm7); + } + + vmovdqa(xmm6, ptr[rsp + 0]); + vmovdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + case GS_SPRITE_CLASS: + n = 6; + break; + } + + enter(32, true); + + vmovdqa(ptr[rsp + 0], xmm6); + vmovdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + vbroadcastss(xmm4, ptr[rax + 0]); + vbroadcastss(xmm5, ptr[rax + 4]); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + vpcmpeqd(xmm2, xmm2); + vpxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); + vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + for(int j = 0; j < n; j++) + { + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); + + vminps(xmm4, xmm0); + vmaxps(xmm5, xmm0); + + if(tme && !fst && primclass != GS_SPRITE_CLASS) + { + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + } + + if(color && (iip || j == n - 1) || tme) + { + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); + } + + if(color && (iip || j == n - 1)) + { + // min.c = min.c.min_u8(v[i + j].c); + // max.c = max.c.min_u8(v[i + j].c); + + vpminub(xmm2, xmm0); + vpmaxub(xmm3, xmm0); + } + + if(tme) + { + vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral + + if(!fst) + { + // t /= p.wwww(); + + vdivps(xmm0, xmm1); + } + + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + } + + add(rdx, n * sizeof(GSVertexHW9)); + sub(ecx, n); + + jg("loop"); + + // } + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm2, xmm2); + + vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm3, xmm3); + + vmovaps(ptr[r8], xmm2); + vmovaps(ptr[r9], xmm3); + } + + // m_min.p = pmin; + // m_max.p = pmax; + + vmovaps(ptr[r8 + 16], xmm4); + vmovaps(ptr[r9 + 16], xmm5); + + if(tme) + { + // m_min.t = tmin.xyww(pmin); + // m_max.t = tmax.xyww(pmax); + + vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + vmovaps(ptr[r8 + 32], xmm6); + vmovaps(ptr[r9 + 32], xmm7); + } + + vmovdqa(xmm6, ptr[rsp + 0]); + vmovdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + enter(32, true); + + vmovdqa(ptr[rsp + 0], xmm6); + vmovdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + vbroadcastss(xmm4, ptr[rax + 0]); + vbroadcastss(xmm5, ptr[rax + 4]); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + vpcmpeqd(xmm2, xmm2); + vpxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1) || tme) + { + vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); + } + + if(color && (iip || j == n - 1)) + { + vpminub(xmm2, xmm0); + vpmaxub(xmm3, xmm0); + } + + if(tme) + { + if(!fst) + { + vmovaps(xmm1, xmm0); + } + + vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral + + if(!fst) + { + vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + vdivps(xmm0, xmm1); + vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q + } + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + + vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); + vpmovzxwd(xmm1, xmm0); + + vpsrld(xmm0, 1); + vpunpcklqdq(xmm1, xmm0); + vcvtdq2ps(xmm1, xmm1); + + vminps(xmm4, xmm1); + vmaxps(xmm5, xmm1); + } + + add(rdx, n * sizeof(GSVertexHW11)); + sub(ecx, n); + + jg("loop"); + + // } + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm2, xmm2); + + vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm3, xmm3); + + vmovaps(ptr[r8], xmm2); + vmovaps(ptr[r9], xmm3); + } + + // m_min.p = pmin.xyww(); + // m_max.p = pmax.xyww(); + + vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + vmovaps(ptr[r8 + 16], xmm4); + vmovaps(ptr[r9 + 16], xmm5); + + if(tme) + { + // m_min.t = tmin; + // m_max.t = tmax; + + vmovaps(ptr[r8 + 32], xmm6); + vmovaps(ptr[r9 + 32], xmm7); + } + + vmovdqa(xmm6, ptr[rsp + 0]); + vmovdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSVertexTrace.x64.cpp b/plugins/GSdx/GSVertexTrace.x64.cpp index 4be6ca68aa..d584b4b3c0 100644 --- a/plugins/GSdx/GSVertexTrace.x64.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.cpp @@ -1,543 +1,543 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - enter(32, true); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 4]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); - - minps(xmm2, xmm0); - maxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexSW)); - sub(rcx, n); - - jg("loop"); - - // } - - if(color) - { - cvttps2dq(xmm2, xmm2); - psrld(xmm2, 7); - movaps(ptr[r8], xmm2); - - cvttps2dq(xmm3, xmm3); - psrld(xmm3, 7); - movaps(ptr[r9], xmm3); - } - - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); - - if(tme) - { - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - enter(32, true); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 16]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - divps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[r8], xmm2); - movaps(ptr[r9], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - enter(32, true); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 16]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - movaps(xmm1, xmm0); - } - - shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - - movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pmovzxwd(xmm1, xmm0); - } - else - { - movdqa(xmm1, xmm0); - punpcklwd(xmm1, xmm1); - psrld(xmm1, 16); - } - - psrld(xmm0, 1); - punpcklqdq(xmm1, xmm0); - cvtdq2ps(xmm1, xmm1); - - minps(xmm4, xmm1); - maxps(xmm5, xmm1); - } - - add(rdx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[r8], xmm2); - movaps(ptr[r9], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[r8 + 16], xmm4); - movaps(ptr[r9 + 16], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - movaps(ptr[r8 + 32], xmm6); - movaps(ptr[r9 + 32], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - leave(); - - ret(); -} - -#endif +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSVertexTrace.h" + +#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + enter(32, true); + + movdqa(ptr[rsp + 0], xmm6); + movdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + movss(xmm4, ptr[rax + 0]); + movss(xmm5, ptr[rax + 4]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = FLT_MAX; + // max.c = -FLT_MAX; + + movaps(xmm2, xmm4); + movaps(xmm3, xmm5); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + 32]); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1)) + { + // min.c = min.c.minv(v[i + j].c); + // max.c = max.c.maxv(v[i + j].c); + + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW)]); + + minps(xmm2, xmm0); + maxps(xmm3, xmm0); + } + + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 16]); + + minps(xmm4, xmm0); + maxps(xmm5, xmm0); + + if(tme) + { + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + 32]); + + if(!fst) + { + if(primclass != GS_SPRITE_CLASS) + { + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + divps(xmm0, xmm1); + shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); + } + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + } + + add(rdx, n * sizeof(GSVertexSW)); + sub(rcx, n); + + jg("loop"); + + // } + + if(color) + { + cvttps2dq(xmm2, xmm2); + psrld(xmm2, 7); + movaps(ptr[r8], xmm2); + + cvttps2dq(xmm3, xmm3); + psrld(xmm3, 7); + movaps(ptr[r9], xmm3); + } + + movaps(ptr[r8 + 16], xmm4); + movaps(ptr[r9 + 16], xmm5); + + if(tme) + { + movaps(ptr[r8 + 32], xmm6); + movaps(ptr[r9 + 32], xmm7); + } + + movdqa(xmm6, ptr[rsp + 0]); + movdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + case GS_SPRITE_CLASS: + n = 6; + break; + } + + enter(32, true); + + movdqa(ptr[rsp + 0], xmm6); + movdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + movss(xmm4, ptr[rax + 0]); + movss(xmm5, ptr[rax + 16]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + pcmpeqd(xmm2, xmm2); + pxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + 16]); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + for(int j = 0; j < n; j++) + { + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + 16]); + + minps(xmm4, xmm0); + maxps(xmm5, xmm0); + + if(tme && !fst && primclass != GS_SPRITE_CLASS) + { + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + if(color && (iip || j == n - 1) || tme) + { + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9)]); + } + + if(color && (iip || j == n - 1)) + { + // min.c = min.c.min_u8(v[i + j].c); + // max.c = max.c.min_u8(v[i + j].c); + + pminub(xmm2, xmm0); + pmaxub(xmm3, xmm0); + } + + if(tme) + { + shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral + + if(!fst) + { + // t /= p.wwww(); + + divps(xmm0, xmm1); + } + + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + } + + add(rdx, n * sizeof(GSVertexHW9)); + sub(ecx, n); + + jg("loop"); + + // } + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm2, xmm2); + + pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm3, xmm3); + } + else + { + pxor(xmm0, xmm0); + + punpckhbw(xmm2, xmm0); + punpcklwd(xmm2, xmm0); + + punpckhbw(xmm3, xmm0); + punpcklwd(xmm3, xmm0); + } + + movaps(ptr[r8], xmm2); + movaps(ptr[r9], xmm3); + } + + // m_min.p = pmin; + // m_max.p = pmax; + + movaps(ptr[r8 + 16], xmm4); + movaps(ptr[r9 + 16], xmm5); + + if(tme) + { + // m_min.t = tmin.xyww(pmin); + // m_max.t = tmax.xyww(pmax); + + shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + movaps(ptr[r8 + 32], xmm6); + movaps(ptr[r9 + 32], xmm7); + } + + movdqa(xmm6, ptr[rsp + 0]); + movdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + enter(32, true); + + movdqa(ptr[rsp + 0], xmm6); + movdqa(ptr[rsp + 16], xmm7); + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + mov(rax, (size_t)&s_minmax); + + movss(xmm4, ptr[rax + 0]); + movss(xmm5, ptr[rax + 16]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + pcmpeqd(xmm2, xmm2); + pxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + align(16); + + L("loop"); + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1) || tme) + { + movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); + } + + if(color && (iip || j == n - 1)) + { + pminub(xmm2, xmm0); + pmaxub(xmm3, xmm0); + } + + if(tme) + { + if(!fst) + { + movaps(xmm1, xmm0); + } + + shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral + + if(!fst) + { + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + divps(xmm0, xmm1); + shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q + } + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + + movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pmovzxwd(xmm1, xmm0); + } + else + { + movdqa(xmm1, xmm0); + punpcklwd(xmm1, xmm1); + psrld(xmm1, 16); + } + + psrld(xmm0, 1); + punpcklqdq(xmm1, xmm0); + cvtdq2ps(xmm1, xmm1); + + minps(xmm4, xmm1); + maxps(xmm5, xmm1); + } + + add(rdx, n * sizeof(GSVertexHW11)); + sub(ecx, n); + + jg("loop"); + + // } + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm2, xmm2); + + pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm3, xmm3); + } + else + { + pxor(xmm0, xmm0); + + punpckhbw(xmm2, xmm0); + punpcklwd(xmm2, xmm0); + + punpckhbw(xmm3, xmm0); + punpcklwd(xmm3, xmm0); + } + + movaps(ptr[r8], xmm2); + movaps(ptr[r9], xmm3); + } + + // m_min.p = pmin.xyww(); + // m_max.p = pmax.xyww(); + + shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + movaps(ptr[r8 + 16], xmm4); + movaps(ptr[r9 + 16], xmm5); + + if(tme) + { + // m_min.t = tmin; + // m_max.t = tmax; + + movaps(ptr[r8 + 32], xmm6); + movaps(ptr[r9 + 32], xmm7); + } + + movdqa(xmm6, ptr[rsp + 0]); + movdqa(xmm7, ptr[rsp + 16]); + + leave(); + + ret(); +} + +#endif diff --git a/plugins/GSdx/GSVertexTrace.x86.avx.cpp b/plugins/GSdx/GSVertexTrace.x86.avx.cpp index 8f9084c66b..c1323d276d 100644 --- a/plugins/GSdx/GSVertexTrace.x86.avx.cpp +++ b/plugins/GSdx/GSVertexTrace.x86.avx.cpp @@ -1,484 +1,484 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -static const int _args = 0; -static const int _count = _args + 4; // rcx -static const int _v = _args + 8; // rdx -static const int _min = _args + 12; // r8 -static const int _max = _args + 16; // r9 - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - vmovaps(xmm2, xmm4); - vmovaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); - - vminps(xmm2, xmm0); - vmaxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - } - - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(edx, n * sizeof(GSVertexSW)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - vcvttps2dq(xmm2, xmm2); - vpsrld(xmm2, 7); - vmovaps(ptr[eax], xmm2); - - vcvttps2dq(xmm3, xmm3); - vpsrld(xmm3, 7); - vmovaps(ptr[edx], xmm3); - } - - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); - - if(tme) - { - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); - } - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - vdivps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(edx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[eax], xmm2); - vmovaps(ptr[edx], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); - } - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - vmovaps(xmm1, xmm0); - } - - vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - - vmovdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]); - vpmovzxwd(xmm1, xmm0); - - vpsrld(xmm0, 1); - vpunpcklqdq(xmm1, xmm0); - vcvtdq2ps(xmm1, xmm1); - - vminps(xmm4, xmm1); - vmaxps(xmm5, xmm1); - } - - add(edx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[eax], xmm2); - vmovaps(ptr[edx], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[eax + 16], xmm4); - vmovaps(ptr[edx + 16], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - vmovaps(ptr[eax + 32], xmm6); - vmovaps(ptr[edx + 32], xmm7); - } - - ret(); -} - -#endif +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSVertexTrace.h" + +#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +static const int _args = 0; +static const int _count = _args + 4; // rcx +static const int _v = _args + 8; // rdx +static const int _min = _args + 12; // r8 +static const int _max = _args + 16; // r9 + +GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + vbroadcastss(xmm4, ptr[&s_minmax.x]); + vbroadcastss(xmm5, ptr[&s_minmax.y]); + + if(color) + { + // min.c = FLT_MAX; + // max.c = -FLT_MAX; + + vmovaps(xmm2, xmm4); + vmovaps(xmm3, xmm5); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + vmovaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); + vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1)) + { + // min.c = min.c.minv(v[i + j].c); + // max.c = max.c.maxv(v[i + j].c); + + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); + + vminps(xmm2, xmm0); + vmaxps(xmm3, xmm0); + } + + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); + + vminps(xmm4, xmm0); + vmaxps(xmm5, xmm0); + + if(tme) + { + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); + + if(!fst) + { + if(primclass != GS_SPRITE_CLASS) + { + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); + } + + vdivps(xmm0, xmm1); + vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); + } + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + } + + add(edx, n * sizeof(GSVertexSW)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + vcvttps2dq(xmm2, xmm2); + vpsrld(xmm2, 7); + vmovaps(ptr[eax], xmm2); + + vcvttps2dq(xmm3, xmm3); + vpsrld(xmm3, 7); + vmovaps(ptr[edx], xmm3); + } + + vmovaps(ptr[eax + 16], xmm4); + vmovaps(ptr[edx + 16], xmm5); + + if(tme) + { + vmovaps(ptr[eax + 32], xmm6); + vmovaps(ptr[edx + 32], xmm7); + } + + ret(); +} + +GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + case GS_SPRITE_CLASS: + n = 6; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + vbroadcastss(xmm4, ptr[&s_minmax.x]); + vbroadcastss(xmm5, ptr[&s_minmax.y]); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + vpcmpeqd(xmm2, xmm2); + vpxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + vmovaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); + vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + for(int j = 0; j < n; j++) + { + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); + + vminps(xmm4, xmm0); + vmaxps(xmm5, xmm0); + + if(tme && !fst && primclass != GS_SPRITE_CLASS) + { + vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + } + + if(color && (iip || j == n - 1) || tme) + { + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); + } + + if(color && (iip || j == n - 1)) + { + // min.c = min.c.min_u8(v[i + j].c); + // max.c = max.c.min_u8(v[i + j].c); + + vpminub(xmm2, xmm0); + vpmaxub(xmm3, xmm0); + } + + if(tme) + { + vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral + + if(!fst) + { + // t /= p.wwww(); + + vdivps(xmm0, xmm1); + } + + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + } + + add(edx, n * sizeof(GSVertexHW9)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm2, xmm2); + + vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm3, xmm3); + + vmovaps(ptr[eax], xmm2); + vmovaps(ptr[edx], xmm3); + } + + // m_min.p = pmin; + // m_max.p = pmax; + + vmovaps(ptr[eax + 16], xmm4); + vmovaps(ptr[edx + 16], xmm5); + + if(tme) + { + // m_min.t = tmin.xyww(pmin); + // m_max.t = tmax.xyww(pmax); + + vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + vmovaps(ptr[eax + 32], xmm6); + vmovaps(ptr[edx + 32], xmm7); + } + + ret(); +} + +GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + vbroadcastss(xmm4, ptr[&s_minmax.x]); + vbroadcastss(xmm5, ptr[&s_minmax.y]); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + vpcmpeqd(xmm2, xmm2); + vpxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + vmovaps(xmm6, xmm4); + vmovaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1) || tme) + { + vmovaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]); + } + + if(color && (iip || j == n - 1)) + { + vpminub(xmm2, xmm0); + vpmaxub(xmm3, xmm0); + } + + if(tme) + { + if(!fst) + { + vmovaps(xmm1, xmm0); + } + + vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral + + if(!fst) + { + vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + vdivps(xmm0, xmm1); + vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q + } + + vminps(xmm6, xmm0); + vmaxps(xmm7, xmm0); + } + + vmovdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]); + vpmovzxwd(xmm1, xmm0); + + vpsrld(xmm0, 1); + vpunpcklqdq(xmm1, xmm0); + vcvtdq2ps(xmm1, xmm1); + + vminps(xmm4, xmm1); + vmaxps(xmm5, xmm1); + } + + add(edx, n * sizeof(GSVertexHW11)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm2, xmm2); + + vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + vpmovzxbd(xmm3, xmm3); + + vmovaps(ptr[eax], xmm2); + vmovaps(ptr[edx], xmm3); + } + + // m_min.p = pmin.xyww(); + // m_max.p = pmax.xyww(); + + vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + vmovaps(ptr[eax + 16], xmm4); + vmovaps(ptr[edx + 16], xmm5); + + if(tme) + { + // m_min.t = tmin; + // m_max.t = tmax; + + vmovaps(ptr[eax + 32], xmm6); + vmovaps(ptr[edx + 32], xmm7); + } + + ret(); +} + +#endif diff --git a/plugins/GSdx/GSVertexTrace.x86.cpp b/plugins/GSdx/GSVertexTrace.x86.cpp index 8ffaef3e48..2d0a4ed3f5 100644 --- a/plugins/GSdx/GSVertexTrace.x86.cpp +++ b/plugins/GSdx/GSVertexTrace.x86.cpp @@ -1,531 +1,531 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -static const int _args = 0; -static const int _count = _args + 4; // rcx -static const int _v = _args + 8; // rdx -static const int _min = _args + 12; // r8 -static const int _max = _args + 16; // r9 - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); - - minps(xmm2, xmm0); - maxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(edx, n * sizeof(GSVertexSW)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - cvttps2dq(xmm2, xmm2); - psrld(xmm2, 7); - movaps(ptr[eax], xmm2); - - cvttps2dq(xmm3, xmm3); - psrld(xmm3, 7); - movaps(ptr[edx], xmm3); - } - - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); - - if(tme) - { - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); - } - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - divps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(edx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[eax], xmm2); - movaps(ptr[edx], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); - } - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _v]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - movaps(xmm1, xmm0); - } - - shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - - movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pmovzxwd(xmm1, xmm0); - } - else - { - movdqa(xmm1, xmm0); - punpcklwd(xmm1, xmm1); - psrld(xmm1, 16); - } - - psrld(xmm0, 1); - punpcklqdq(xmm1, xmm0); - cvtdq2ps(xmm1, xmm1); - - minps(xmm4, xmm1); - maxps(xmm5, xmm1); - } - - add(edx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[eax], xmm2); - movaps(ptr[edx], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[eax + 16], xmm4); - movaps(ptr[edx + 16], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - movaps(ptr[eax + 32], xmm6); - movaps(ptr[edx + 32], xmm7); - } - - ret(); -} - +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSVertexTrace.h" + +#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +static const int _args = 0; +static const int _count = _args + 4; // rcx +static const int _v = _args + 8; // rdx +static const int _min = _args + 12; // r8 +static const int _max = _args + 16; // r9 + +GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + movss(xmm4, ptr[&s_minmax.x]); + movss(xmm5, ptr[&s_minmax.y]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = FLT_MAX; + // max.c = -FLT_MAX; + + movaps(xmm2, xmm4); + movaps(xmm3, xmm5); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1)) + { + // min.c = min.c.minv(v[i + j].c); + // max.c = max.c.maxv(v[i + j].c); + + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]); + + minps(xmm2, xmm0); + maxps(xmm3, xmm0); + } + + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]); + + minps(xmm4, xmm0); + maxps(xmm5, xmm0); + + if(tme) + { + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]); + + if(!fst) + { + if(primclass != GS_SPRITE_CLASS) + { + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); + } + + divps(xmm0, xmm1); + shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); + } + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + } + + add(edx, n * sizeof(GSVertexSW)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + cvttps2dq(xmm2, xmm2); + psrld(xmm2, 7); + movaps(ptr[eax], xmm2); + + cvttps2dq(xmm3, xmm3); + psrld(xmm3, 7); + movaps(ptr[edx], xmm3); + } + + movaps(ptr[eax + 16], xmm4); + movaps(ptr[edx + 16], xmm5); + + if(tme) + { + movaps(ptr[eax + 32], xmm6); + movaps(ptr[edx + 32], xmm7); + } + + ret(); +} + +GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + case GS_SPRITE_CLASS: + n = 6; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + movss(xmm4, ptr[&s_minmax.x]); + movss(xmm5, ptr[&s_minmax.y]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + pcmpeqd(xmm2, xmm2); + pxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + if(tme && !fst && primclass == GS_SPRITE_CLASS) + { + movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + for(int j = 0; j < n; j++) + { + // min.p = min.p.minv(v[i + j].p); + // max.p = max.p.maxv(v[i + j].p); + + movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]); + + minps(xmm4, xmm0); + maxps(xmm5, xmm0); + + if(tme && !fst && primclass != GS_SPRITE_CLASS) + { + movaps(xmm1, xmm0); + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + } + + if(color && (iip || j == n - 1) || tme) + { + movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]); + } + + if(color && (iip || j == n - 1)) + { + // min.c = min.c.min_u8(v[i + j].c); + // max.c = max.c.min_u8(v[i + j].c); + + pminub(xmm2, xmm0); + pmaxub(xmm3, xmm0); + } + + if(tme) + { + shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral + + if(!fst) + { + // t /= p.wwww(); + + divps(xmm0, xmm1); + } + + // min.t = min.t.minv(v[i + j].t); + // max.t = max.t.maxv(v[i + j].t); + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + } + + add(edx, n * sizeof(GSVertexHW9)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm2, xmm2); + + pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm3, xmm3); + } + else + { + pxor(xmm0, xmm0); + + punpckhbw(xmm2, xmm0); + punpcklwd(xmm2, xmm0); + + punpckhbw(xmm3, xmm0); + punpcklwd(xmm3, xmm0); + } + + movaps(ptr[eax], xmm2); + movaps(ptr[edx], xmm3); + } + + // m_min.p = pmin; + // m_max.p = pmax; + + movaps(ptr[eax + 16], xmm4); + movaps(ptr[edx + 16], xmm5); + + if(tme) + { + // m_min.t = tmin.xyww(pmin); + // m_max.t = tmax.xyww(pmax); + + shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + movaps(ptr[eax + 32], xmm6); + movaps(ptr[edx + 32], xmm7); + } + + ret(); +} + +GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) + : GSCodeGenerator(code, maxsize) +{ + uint32 primclass = (key >> 0) & 3; + uint32 iip = (key >> 2) & 1; + uint32 tme = (key >> 3) & 1; + uint32 fst = (key >> 4) & 1; + uint32 color = (key >> 5) & 1; + + int n = 1; + + switch(primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + // min.p = FLT_MAX; + // max.p = -FLT_MAX; + + movss(xmm4, ptr[&s_minmax.x]); + movss(xmm5, ptr[&s_minmax.y]); + + shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); + + if(color) + { + // min.c = 0xffffffff; + // max.c = 0; + + pcmpeqd(xmm2, xmm2); + pxor(xmm3, xmm3); + } + + if(tme) + { + // min.t = FLT_MAX; + // max.t = -FLT_MAX; + + movaps(xmm6, xmm4); + movaps(xmm7, xmm5); + } + + // for(int i = 0; i < count; i += step) { + + mov(edx, dword[esp + _v]); + mov(ecx, dword[esp + _count]); + + align(16); + + L("loop"); + + for(int j = 0; j < n; j++) + { + if(color && (iip || j == n - 1) || tme) + { + movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]); + } + + if(color && (iip || j == n - 1)) + { + pminub(xmm2, xmm0); + pmaxub(xmm3, xmm0); + } + + if(tme) + { + if(!fst) + { + movaps(xmm1, xmm0); + } + + shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral + + if(!fst) + { + shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); + divps(xmm0, xmm1); + shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q + } + + minps(xmm6, xmm0); + maxps(xmm7, xmm0); + } + + movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pmovzxwd(xmm1, xmm0); + } + else + { + movdqa(xmm1, xmm0); + punpcklwd(xmm1, xmm1); + psrld(xmm1, 16); + } + + psrld(xmm0, 1); + punpcklqdq(xmm1, xmm0); + cvtdq2ps(xmm1, xmm1); + + minps(xmm4, xmm1); + maxps(xmm5, xmm1); + } + + add(edx, n * sizeof(GSVertexHW11)); + sub(ecx, n); + + jg("loop"); + + // } + + mov(eax, dword[esp + _min]); + mov(edx, dword[esp + _max]); + + if(color) + { + // m_min.c = cmin.zzzz().u8to32(); + // m_max.c = cmax.zzzz().u8to32(); + + if(m_cpu.has(util::Cpu::tSSE41)) + { + pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm2, xmm2); + + pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); + pmovzxbd(xmm3, xmm3); + } + else + { + pxor(xmm0, xmm0); + + punpckhbw(xmm2, xmm0); + punpcklwd(xmm2, xmm0); + + punpckhbw(xmm3, xmm0); + punpcklwd(xmm3, xmm0); + } + + movaps(ptr[eax], xmm2); + movaps(ptr[edx], xmm3); + } + + // m_min.p = pmin.xyww(); + // m_max.p = pmax.xyww(); + + shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); + shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); + + movaps(ptr[eax + 16], xmm4); + movaps(ptr[edx + 16], xmm5); + + if(tme) + { + // m_min.t = tmin; + // m_max.t = tmax; + + movaps(ptr[eax + 32], xmm6); + movaps(ptr[edx + 32], xmm7); + } + + ret(); +} + #endif \ No newline at end of file diff --git a/plugins/GSdx/GSdx_vs2008.vcproj b/plugins/GSdx/GSdx_vs2008.vcproj index a4b1f394ac..8a4b1a71dc 100644 --- a/plugins/GSdx/GSdx_vs2008.vcproj +++ b/plugins/GSdx/GSdx_vs2008.vcproj @@ -892,6 +892,110 @@ RelativePath=".\GSDrawScanlineCodeGenerator.cpp" > + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -956,6 +1060,110 @@ RelativePath=".\GSSetupPrimCodeGenerator.cpp" > + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1032,6 +1240,110 @@ RelativePath=".\GSVertexTrace.cpp" > + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +