/* * Copyright (C) 2007-2009 Gabest * http://www.gabest.org * * This Program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This Program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. * http://www.gnu.org/copyleft/gpl.html * */ // TODO: x64 #include "stdafx.h" #include "GPUDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" static const int _args = 8; static const int _top = _args + 4; static const int _v = _args + 8; GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) , m_local(*(GPUScanlineLocalData*)param) { #if _M_AMD64 //#error TODO #endif m_sel.key = key; Generate(); } void GPUDrawScanlineCodeGenerator::Generate() { push(esi); push(edi); Init(); align(16); L("loop"); // GSVector4i test = m_test[7 + (steps & (steps >> 31))]; mov(edx, ecx); sar(edx, 31); and(edx, ecx); shl(edx, 4); movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); // movdqu(xmm1, ptr[edi]); movq(xmm1, qword[edi]); movhps(xmm1, qword[edi + 8]); // ecx = steps // esi = tex (tme) // edi = fb // xmm1 = fd // xmm2 = s // xmm3 = t // xmm4 = r // xmm5 = g // xmm6 = b // xmm7 = test TestMask(); SampleTexture(); // xmm1 = fd // xmm3 = a // xmm4 = r // xmm5 = g // xmm6 = b // xmm7 = test // xmm0, xmm2 = free ColorTFX(); AlphaBlend(); Dither(); WriteFrame(); L("step"); // if(steps <= 0) break; test(ecx, ecx); jle("exit", T_NEAR); Step(); jmp("loop", T_NEAR); L("exit"); pop(edi); pop(esi); ret(8); } void GPUDrawScanlineCodeGenerator::Init() { mov(eax, dword[esp + _top]); // uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left; mov(edi, eax); shl(edi, 10 + m_sel.scalex); add(edi, edx); lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]); // int steps = pixels - 8; sub(ecx, 8); if(m_sel.dtd) { // dither = GSVector4i::load(&m_dither[top & 3][left & 3]); and(eax, 3); shl(eax, 5); and(edx, 3); shl(edx, 1); movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]); movdqa(ptr[&m_local.temp.dither], xmm0); } mov(edx, dword[esp + _v]); if(m_sel.tme) { mov(esi, dword[&m_local.gd->tex]); // GSVector4i vt = GSVector4i(v.t).xxzzl(); cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); // s = vt.xxxx().add16(m_local.d.s); // t = vt.yyyy().add16(m_local.d.t); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm2, ptr[&m_local.d.s]); if(!m_sel.sprite) { paddw(xmm3, ptr[&m_local.d.t]); } else { if(m_sel.ltf) { movdqa(xmm0, xmm3); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.vf], xmm0); } } movdqa(ptr[&m_local.temp.s], xmm2); movdqa(ptr[&m_local.temp.t], xmm3); } if(m_sel.tfx != 3) // != decal { // GSVector4i vc = GSVector4i(v.c).xxzzlh(); cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]); pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); // r = vc.xxxx(); // g = vc.yyyy(); // b = vc.zzzz(); pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); if(m_sel.iip) { // r = r.add16(m_local.d.r); // g = g.add16(m_local.d.g); // b = b.add16(m_local.d.b); paddw(xmm4, ptr[&m_local.d.r]); paddw(xmm5, ptr[&m_local.d.g]); paddw(xmm6, ptr[&m_local.d.b]); } movdqa(ptr[&m_local.temp.r], xmm4); movdqa(ptr[&m_local.temp.g], xmm5); movdqa(ptr[&m_local.temp.b], xmm6); } } void GPUDrawScanlineCodeGenerator::Step() { // steps -= 8; sub(ecx, 8); // fb += 8; add(edi, 8 * sizeof(uint16)); if(m_sel.tme) { // GSVector4i st = m_local.d8.st; movdqa(xmm4, ptr[&m_local.d8.st]); // s = s.add16(st.xxxx()); // t = t.add16(st.yyyy()); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); paddw(xmm2, ptr[&m_local.temp.s]); movdqa(ptr[&m_local.temp.s], xmm2); // TODO: if(!sprite) ... else reload t pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm3, ptr[&m_local.temp.t]); movdqa(ptr[&m_local.temp.t], xmm3); } if(m_sel.tfx != 3) // != decal { if(m_sel.iip) { // GSVector4i c = m_local.d8.c; // r = r.add16(c.xxxx()); // g = g.add16(c.yyyy()); // b = b.add16(c.zzzz()); movdqa(xmm6, ptr[&m_local.d8.c]); pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm4, ptr[&m_local.temp.r]); paddw(xmm5, ptr[&m_local.temp.g]); paddw(xmm6, ptr[&m_local.temp.b]); movdqa(ptr[&m_local.temp.r], xmm4); movdqa(ptr[&m_local.temp.g], xmm5); movdqa(ptr[&m_local.temp.b], xmm6); } else { movdqa(xmm4, ptr[&m_local.temp.r]); movdqa(xmm5, ptr[&m_local.temp.g]); movdqa(xmm6, ptr[&m_local.temp.b]); } } } void GPUDrawScanlineCodeGenerator::TestMask() { if(!m_sel.me) { return; } // test |= fd.sra16(15); movdqa(xmm0, xmm1); psraw(xmm0, 15); por(xmm7, xmm0); alltrue(); } void GPUDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.tme) { return; } if(m_sel.tlu) { mov(edx, ptr[&m_local.gd->clut]); } // xmm2 = s // xmm3 = t // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used if(m_sel.ltf) { // GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f // GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f mov(eax, 0x00200020); movd(xmm0, eax); pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); psubw(xmm2, xmm0); psubw(xmm3, xmm0); // GSVector4i uf = (u & GSVector4i::x00ff()) << 7; // GSVector4i vf = (v & GSVector4i::x00ff()) << 7; movdqa(xmm0, xmm2); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.uf], xmm0); if(!m_sel.sprite) { movdqa(xmm0, xmm3); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.vf], xmm0); } } // GSVector4i u0 = s.srl16(8); // GSVector4i v0 = t.srl16(8); psrlw(xmm2, 8); psrlw(xmm3, 8); // xmm2 = u // xmm3 = v // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used if(m_sel.ltf) { // GSVector4i u1 = u0.add16(GSVector4i::x0001()); // GSVector4i v1 = v0.add16(GSVector4i::x0001()); movdqa(xmm4, xmm2); movdqa(xmm5, xmm3); pcmpeqd(xmm0, xmm0); psrlw(xmm0, 15); paddw(xmm4, xmm0); paddw(xmm5, xmm0); if(m_sel.twin) { // u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u); // v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v); // u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u); // v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v); movdqa(xmm0, ptr[&m_local.twin[0].u]); movdqa(xmm6, ptr[&m_local.twin[1].u]); pand(xmm2, xmm0); paddw(xmm2, xmm6); pand(xmm4, xmm0); paddw(xmm4, xmm6); movdqa(xmm0, ptr[&m_local.twin[0].v]); movdqa(xmm6, ptr[&m_local.twin[1].v]); pand(xmm3, xmm0); paddw(xmm3, xmm6); pand(xmm5, xmm0); paddw(xmm5, xmm6); } else { // u0 = u0.min_i16(m_local.twin[2].u); // v0 = v0.min_i16(m_local.twin[2].v); // u1 = u1.min_i16(m_local.twin[2].u); // v1 = v1.min_i16(m_local.twin[2].v); // TODO: if(!sprite) clamp16 else: movdqa(xmm0, ptr[&m_local.twin[2].u]); movdqa(xmm6, ptr[&m_local.twin[2].v]); pminsw(xmm2, xmm0); pminsw(xmm3, xmm6); pminsw(xmm4, xmm0); pminsw(xmm5, xmm6); } // xmm2 = u0 // xmm3 = v0 // xmm4 = u1 // xmm5 = v1 // xmm7 = test // xmm0, xmm6 = free // xmm1 = used // GSVector4i addr00 = v0.sll16(8) | u0; // GSVector4i addr01 = v0.sll16(8) | u1; // GSVector4i addr10 = v1.sll16(8) | u0; // GSVector4i addr11 = v1.sll16(8) | u1; psllw(xmm3, 8); movdqa(xmm0, xmm3); por(xmm3, xmm2); por(xmm0, xmm4); psllw(xmm5, 8); movdqa(xmm6, xmm5); por(xmm5, xmm2); por(xmm6, xmm4); // xmm3 = addr00 // xmm0 = addr01 // xmm5 = addr10 // xmm6 = addr11 // xmm7 = test // xmm2, xmm4 = free // xmm1 = used ReadTexel(xmm2, xmm3); ReadTexel(xmm4, xmm0); ReadTexel(xmm3, xmm5); ReadTexel(xmm5, xmm6); // xmm2 = c00 // xmm4 = c01 // xmm3 = c10 // xmm5 = c11 // xmm7 = test // xmm0, xmm6 = free // xmm1 = used // spill (TODO) movdqa(ptr[&m_local.temp.fd], xmm1); movdqa(ptr[&m_local.temp.test], xmm7); // xmm2 = c00 // xmm4 = c01 // xmm3 = c10 // xmm5 = c11 // xmm0, xmm1, xmm6, xmm7 = free movdqa(xmm1, xmm2); psllw(xmm1, 11); psrlw(xmm1, 8); movdqa(xmm0, xmm4); psllw(xmm0, 11); psrlw(xmm0, 8); lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]); movdqa(xmm6, xmm2); psllw(xmm6, 6); psrlw(xmm6, 11); psllw(xmm6, 3); movdqa(xmm1, xmm4); psllw(xmm1, 6); psrlw(xmm1, 11); psllw(xmm1, 3); lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]); movdqa(xmm7, xmm2); psllw(xmm7, 1); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm6, xmm4); psllw(xmm6, 1); psrlw(xmm6, 11); psllw(xmm6, 3); lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]); psraw(xmm2, 15); psrlw(xmm2, 8); psraw(xmm4, 15); psrlw(xmm4, 8); lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]); // xmm0 = r00 // xmm1 = g00 // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm2, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 11); psrlw(xmm7, 8); movdqa(xmm2, xmm5); psllw(xmm2, 11); psrlw(xmm2, 8); lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]); // xmm2 = r // xmm1 = g00 // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm0, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 6); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm0, xmm5); psllw(xmm0, 6); psrlw(xmm0, 11); psllw(xmm0, 3); lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm1, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 1); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm1, xmm5); psllw(xmm1, 1); psrlw(xmm1, 11); psllw(xmm1, 3); lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm1 = b // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm6, xmm7 = free psraw(xmm3, 15); psrlw(xmm3, 8); psraw(xmm5, 15); psrlw(xmm5, 8); lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]); lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm1 = b // xmm5 = a // xmm3, xmm4, xmm6, xmm7 = free // TODO movdqa(xmm3, xmm5); // a movdqa(xmm4, xmm2); // r movdqa(xmm6, xmm1); // b movdqa(xmm5, xmm0); // g // reload test movdqa(xmm7, ptr[&m_local.temp.test]); // xmm4 = r // xmm5 = g // xmm6 = b // xmm3 = a // xmm7 = test // xmm0, xmm1, xmm2 = free // test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect) movdqa(xmm1, xmm3); por(xmm1, xmm4); movdqa(xmm2, xmm5); por(xmm2, xmm6); por(xmm1, xmm2); pxor(xmm0, xmm0); pcmpeqw(xmm1, xmm0); por(xmm7, xmm1); // a = a.gt16(GSVector4i::zero()); pcmpgtw(xmm3, xmm0); // reload fd movdqa(xmm1, ptr[&m_local.temp.fd]); } else { if(m_sel.twin) { // u = (u & m_local.twin[0].u).add16(m_local.twin[1].u); // v = (v & m_local.twin[0].v).add16(m_local.twin[1].v); pand(xmm2, ptr[&m_local.twin[0].u]); paddw(xmm2, ptr[&m_local.twin[1].u]); pand(xmm3, ptr[&m_local.twin[0].v]); paddw(xmm3, ptr[&m_local.twin[1].v]); } else { // u = u.min_i16(m_local.twin[2].u); // v = v.min_i16(m_local.twin[2].v); // TODO: if(!sprite) clamp16 else: pminsw(xmm2, ptr[&m_local.twin[2].u]); pminsw(xmm3, ptr[&m_local.twin[2].v]); } // xmm2 = u // xmm3 = v // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used // GSVector4i addr = v.sll16(8) | u; psllw(xmm3, 8); por(xmm3, xmm2); // xmm3 = addr // xmm7 = test // xmm0, xmm2, xmm4, xmm5, xmm6 = free // xmm1 = used ReadTexel(xmm6, xmm3); // xmm3 = c00 // xmm7 = test // xmm0, xmm2, xmm4, xmm5, xmm6 = free // xmm1 = used // test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels pxor(xmm0, xmm0); pcmpeqw(xmm0, xmm6); por(xmm7, xmm0); // c[0] = (c00 << 3) & 0x00f800f8; // c[1] = (c00 >> 2) & 0x00f800f8; // c[2] = (c00 >> 7) & 0x00f800f8; // c[3] = c00.sra16(15); movdqa(xmm3, xmm6); psraw(xmm3, 15); // a pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 3); // 0x00f8 movdqa(xmm4, xmm6); psllw(xmm4, 3); pand(xmm4, xmm0); // r movdqa(xmm5, xmm6); psrlw(xmm5, 2); pand(xmm5, xmm0); // g psrlw(xmm6, 7); pand(xmm6, xmm0); // b } } void GPUDrawScanlineCodeGenerator::ColorTFX() { switch(m_sel.tfx) { case 0: // none (tfx = 0) case 1: // none (tfx = tge) // c[0] = r.srl16(7); // c[1] = g.srl16(7); // c[2] = b.srl16(7); psrlw(xmm4, 7); psrlw(xmm5, 7); psrlw(xmm6, 7); break; case 2: // modulate (tfx = tme | tge) // c[0] = c[0].modulate16<1>(r).clamp8(); // c[1] = c[1].modulate16<1>(g).clamp8(); // c[2] = c[2].modulate16<1>(b).clamp8(); pcmpeqd(xmm0, xmm0); psrlw(xmm0, 8); modulate16<1>(xmm4, ptr[&m_local.temp.r]); pminsw(xmm4, xmm0); modulate16<1>(xmm5, ptr[&m_local.temp.g]); pminsw(xmm5, xmm0); modulate16<1>(xmm6, ptr[&m_local.temp.b]); pminsw(xmm6, xmm0); break; case 3: // decal (tfx = tme) break; } } void GPUDrawScanlineCodeGenerator::AlphaBlend() { if(!m_sel.abe) { return; } // xmm1 = fd // xmm3 = a // xmm4 = r // xmm5 = g // xmm6 = b // xmm7 = test // xmm0, xmm2 = free // GSVector4i r = (fd & 0x001f001f) << 3; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); // 0x001f movdqa(xmm2, xmm1); pand(xmm2, xmm0); psllw(xmm2, 3); switch(m_sel.abr) { case 0: // r = r.avg8(c[0]); pavgb(xmm2, xmm4); break; case 1: // r = r.addus8(c[0]); paddusb(xmm2, xmm4); break; case 2: // r = r.subus8(c[0]); psubusb(xmm2, xmm4); break; case 3: // r = r.addus8(c[0].srl16(2)); movdqa(xmm0, xmm4); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm4, xmm2); } else { movdqa(xmm4, xmm2); } // GSVector4i g = (d & 0x03e003e0) >> 2; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 5); // 0x03e0 movdqa(xmm2, xmm1); pand(xmm2, xmm0); psrlw(xmm2, 2); switch(m_sel.abr) { case 0: // g = g.avg8(c[2]); pavgb(xmm2, xmm5); break; case 1: // g = g.addus8(c[2]); paddusb(xmm2, xmm5); break; case 2: // g = g.subus8(c[2]); psubusb(xmm2, xmm5); break; case 3: // g = g.addus8(c[2].srl16(2)); movdqa(xmm0, xmm5); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm5, xmm2); } else { movdqa(xmm5, xmm2); } // GSVector4i b = (d & 0x7c007c00) >> 7; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 10); // 0x7c00 movdqa(xmm2, xmm1); pand(xmm2, xmm0); psrlw(xmm2, 7); switch(m_sel.abr) { case 0: // b = b.avg8(c[2]); pavgb(xmm2, xmm6); break; case 1: // b = b.addus8(c[2]); paddusb(xmm2, xmm6); break; case 2: // b = b.subus8(c[2]); psubusb(xmm2, xmm6); break; case 3: // b = b.addus8(c[2].srl16(2)); movdqa(xmm0, xmm6); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm6, xmm2); } else { movdqa(xmm6, xmm2); } } void GPUDrawScanlineCodeGenerator::Dither() { if(!m_sel.dtd) { return; } // c[0] = c[0].addus8(dither); // c[1] = c[1].addus8(dither); // c[2] = c[2].addus8(dither); movdqa(xmm0, ptr[&m_local.temp.dither]); paddusb(xmm4, xmm0); paddusb(xmm5, xmm0); paddusb(xmm6, xmm0); } void GPUDrawScanlineCodeGenerator::WriteFrame() { // GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0); pcmpeqd(xmm0, xmm0); if(m_sel.md || m_sel.tme) { movdqa(xmm2, xmm0); psllw(xmm2, 15); } psrlw(xmm0, 11); psllw(xmm0, 3); // xmm0 = 0x00f8 // xmm2 = 0x8000 (md) // GSVector4i r = (c[0] & 0x00f800f8) >> 3; pand(xmm4, xmm0); psrlw(xmm4, 3); // GSVector4i g = (c[1] & 0x00f800f8) << 2; pand(xmm5, xmm0); psllw(xmm5, 2); por(xmm4, xmm5); // GSVector4i b = (c[2] & 0x00f800f8) << 7; pand(xmm6, xmm0); psllw(xmm6, 7); por(xmm4, xmm6); if(m_sel.md) { // GSVector4i a = GSVector4i(0x80008000); por(xmm4, xmm2); } else if(m_sel.tme) { // GSVector4i a = (c[3] << 8) & 0x80008000; psllw(xmm3, 8); pand(xmm3, xmm2); por(xmm4, xmm3); } // fs = fs.blend8(fd, test); movdqa(xmm0, xmm7); blend8(xmm4, xmm1); // GSVector4i::store(fb, fs); // movdqu(ptr[edi], xmm4); movq(qword[edi], xmm4); movhps(qword[edi + 8], xmm4); } void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr) { for(int i = 0; i < 8; i++) { pextrw(eax, addr, (uint8)i); if(m_sel.tlu) movzx(eax, byte[esi + eax]); const Address& src = m_sel.tlu ? ptr[edx + eax * 2] : ptr[esi + eax * 2]; if(i == 0) movd(dst, src); else pinsrw(dst, src, (uint8)i); } } template void GPUDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f) { if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) { pmulhrsw(a, f); } else { psllw(a, shift + 1); pmulhw(a, f); } } template void GPUDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Operand& f) { psubw(a, b); modulate16(a, f); paddw(a, b); } void GPUDrawScanlineCodeGenerator::alltrue() { pmovmskb(eax, xmm7); cmp(eax, 0xffff); je("step", T_NEAR); } void GPUDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) { if(m_cpu.has(util::Cpu::tSSE41)) { pblendvb(a, b); } else { blend(a, b, xmm0); } } void GPUDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask) { pand(b, mask); pandn(mask, a); por(b, mask); movdqa(a, b); } const GSVector4i GPUDrawScanlineCodeGenerator::m_test[8] = { GSVector4i(0xffff0000, 0xffffffff, 0xffffffff, 0xffffffff), GSVector4i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff), GSVector4i(0x00000000, 0xffff0000, 0xffffffff, 0xffffffff), GSVector4i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff), GSVector4i(0x00000000, 0x00000000, 0xffff0000, 0xffffffff), GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffffffff), GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffff0000), GSVector4i::zero(), }; __aligned(const uint16, 32) GPUDrawScanlineCodeGenerator::m_dither[4][16] = { {7, 0, 6, 1, 7, 0, 6, 1, 7, 0, 6, 1, 7, 0, 6, 1}, {2, 5, 3, 4, 2, 5, 3, 4, 2, 5, 3, 4, 2, 5, 3, 4}, {1, 6, 0, 7, 1, 6, 0, 7, 1, 6, 0, 7, 1, 6, 0, 7}, {4, 3, 5, 2, 4, 3, 5, 2, 4, 3, 5, 2, 4, 3, 5, 2}, };