From d20bd4f86af7ad6133ad30d7410309d31fd0fad8 Mon Sep 17 00:00:00 2001 From: "gabest11@gmail.com" Date: Thu, 20 Jun 2013 05:07:52 +0000 Subject: [PATCH] GSdx: The sw renderer now uses avx2, not much faster though, +10% maybe, if the game is not EE limited. I'm not sure if haswell has that much better sse execution (load/store units doubled for example), or the avx2 code is not fully optimized yet. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5677 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GSBlock.h | 42 +- plugins/GSdx/GSDrawScanline.cpp | 1306 +++++++- plugins/GSdx/GSDrawScanline.h | 4 +- plugins/GSdx/GSDrawScanlineCodeGenerator.cpp | 111 + plugins/GSdx/GSDrawScanlineCodeGenerator.h | 57 + .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 44 +- .../GSDrawScanlineCodeGenerator.x86.avx2.cpp | 2968 +++++++++++++++++ plugins/GSdx/GSFunctionMap.h | 4 +- plugins/GSdx/GSRasterizer.cpp | 6 +- plugins/GSdx/GSRendererSW.cpp | 40 +- plugins/GSdx/GSScanlineEnvironment.h | 52 +- plugins/GSdx/GSSetupPrimCodeGenerator.cpp | 19 + plugins/GSdx/GSSetupPrimCodeGenerator.h | 4 + .../GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp | 2 +- .../GSSetupPrimCodeGenerator.x86.avx2.cpp | 353 ++ plugins/GSdx/GSVector.cpp | 1 + plugins/GSdx/GSVector.h | 27 +- plugins/GSdx/GSdx_vs11.vcxproj | 15 +- plugins/GSdx/GSdx_vs11.vcxproj.filters | 8 +- plugins/GSdx/xbyak/xbyak_mnemonic.h | 46 +- 20 files changed, 5021 insertions(+), 88 deletions(-) create mode 100644 plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp create mode 100644 plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h index a478c92baa..63cb348fe5 100644 --- a/plugins/GSdx/GSBlock.h +++ b/plugins/GSdx/GSBlock.h @@ -1789,23 +1789,39 @@ public: GSVector8i TA0(TEXA.TA0 << 24); GSVector8i mask = GSVector8i::x00ffffff(); - for(int i = 0; i < 4; i++, dst += dstpitch * 2) - { - GSVector8i v0 = s[i * 2 + 0]; - GSVector8i v1 = s[i * 2 + 1]; + GSVector8i v0, v1, v2, v3; - GSVector8i::sw128(v0, v1); - GSVector8i::sw64(v0, v1); + v0 = s[0] & mask; + v1 = s[1] & mask; + v2 = s[2] & mask; + v3 = s[3] & mask; - v0 &= mask; - v1 &= mask; + GSVector8i::sw128(v0, v1); + GSVector8i::sw64(v0, v1); + GSVector8i::sw128(v2, v3); + GSVector8i::sw64(v2, v3); - GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0]; - GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1]; + *(GSVector8i*)&dst[dstpitch * 0] = Expand24to32(v0, TA0); + *(GSVector8i*)&dst[dstpitch * 1] = Expand24to32(v1, TA0); + *(GSVector8i*)&dst[dstpitch * 2] = Expand24to32(v2, TA0); + *(GSVector8i*)&dst[dstpitch * 3] = Expand24to32(v3, TA0); - d0[0] = Expand24to32(v0, TA0); - d1[0] = Expand24to32(v1, TA0); - } + v0 = s[4] & mask; + v1 = s[5] & mask; + v2 = s[6] & mask; + v3 = s[7] & mask; + + GSVector8i::sw128(v0, v1); + GSVector8i::sw64(v0, v1); + GSVector8i::sw128(v2, v3); + GSVector8i::sw64(v2, v3); + + dst += dstpitch * 4; + + *(GSVector8i*)&dst[dstpitch * 0] = Expand24to32(v0, TA0); + *(GSVector8i*)&dst[dstpitch * 1] = Expand24to32(v1, TA0); + *(GSVector8i*)&dst[dstpitch * 2] = Expand24to32(v2, TA0); + *(GSVector8i*)&dst[dstpitch * 3] = Expand24to32(v3, TA0); #else diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 01ff9d3dfe..c2bc515e45 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -42,7 +42,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) if(m_global.sel.mmin && m_global.sel.lcm) { - GSVector4i v = m_global.t.minmax.srl16(m_global.lod.i.x); + GSVector4i v = m_global.t.minmax.srl16(m_global.lod.i.extract32<0>());//.x); v = v.upl16(v); @@ -107,13 +107,165 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co { GSScanlineSelector sel = m_global.sel; - const GSVector4* shift = GSSetupPrimCodeGenerator::m_shift; - bool has_z = sel.zb != 0; bool has_f = sel.fb && sel.fge; bool has_t = sel.fb && sel.tfx != TFX_NONE; bool has_c = sel.fb && !(sel.tfx == TFX_DECAL && sel.tcc); + #if _M_SSE >= 0x501 + + const GSVector8* shift = GSSetupPrimCodeGenerator::m_shift; + + if(has_z || has_f) + { + if(sel.prim != GS_SPRITE_CLASS) + { + if(has_f) + { + GSVector8 df = GSVector8::broadcast32(dscan.p.wwww()); + + m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh(); + + for(int i = 0; i < 8; i++) + { + m_local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh(); + } + } + + if(has_z) + { + GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz()); + + m_local.d8.z = dz * shift[0]; + + for(int i = 0; i < 8; i++) + { + m_local.d[i].z = dz * shift[1 + i]; + } + } + } + else + { + if(has_f) + { + m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>(); + } + + if(has_z) + { + m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w + } + } + } + + if(has_t) + { + GSVector8 dt(dscan.t); + + GSVector8 dt8 = dt * shift[0]; + + if(sel.fst) + { + m_local.d8.stq = GSVector8::cast(GSVector8i(dt8)); + } + else + { + m_local.d8.stq = dt8; + } + + for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++) + { + GSVector8 dstq; + + switch(j) + { + case 0: dstq = dt.xxxx(); break; + case 1: dstq = dt.yyyy(); break; + case 2: dstq = dt.zzzz(); break; + } + + for(int i = 0; i < 8; i++) + { + GSVector8 v = dstq * shift[1 + i]; + + if(sel.fst) + { + switch(j) + { + case 0: m_local.d[i].s = GSVector8::cast(GSVector8i(v)); break; + case 1: m_local.d[i].t = GSVector8::cast(GSVector8i(v)); break; + } + } + else + { + switch(j) + { + case 0: m_local.d[i].s = v; break; + case 1: m_local.d[i].t = v; break; + case 2: m_local.d[i].q = v; break; + } + } + } + } + } + + if(has_c) + { + if(sel.iip) + { + GSVector8 dc(dscan.c); + + m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32(); + + GSVector8 dr = dc.xxxx(); + GSVector8 db = dc.zzzz(); + + for(int i = 0; i < 8; i++) + { + GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32(); + GSVector8i b = GSVector8i(db * shift[1 + i]).ps32(); + + m_local.d[i].rb = r.upl16(b); + } + + GSVector8 dg = dc.yyyy(); + GSVector8 da = dc.wwww(); + + for(int i = 0; i < 8; i++) + { + GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32(); + GSVector8i a = GSVector8i(da * shift[1 + i]).ps32(); + + m_local.d[i].ga = g.upl16(a); + } + } + else + { + int last = 0; + + switch(sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c)); + + c = c.upl16(c.zwxy()); + + if(sel.tfx == TFX_NONE) c = c.srl16(7); + + m_local.c.rb = c.xxxx(); + m_local.c.ga = c.zzzz(); + } + } + + #else + + const GSVector4* shift = GSSetupPrimCodeGenerator::m_shift; + if(has_z || has_f) { if(sel.prim != GS_SPRITE_CLASS) @@ -255,12 +407,1129 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co m_local.c.ga = c.zzzz(); } } + + #endif } void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) { GSScanlineSelector sel = m_global.sel; + #if _M_SSE >= 0x501 + + GSVector8i test; + GSVector8 zo; + GSVector8i f; + GSVector8 s, t, q; + GSVector8i uf, vf; + GSVector8i rbf, gaf; + GSVector8i cov; + + // Init + + int skip, steps; + + if(!sel.notest) + { + skip = left & 7; + steps = pixels + skip - 8; + left -= skip; + test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]; + } + else + { + skip = 0; + steps = pixels - 8; + } + + ASSERT((left & 7) == 0); + + const GSVector2i* fza_base = &m_global.fzbr[top]; + const GSVector2i* fza_offset = &m_global.fzbc[left >> 2]; + + if(sel.prim != GS_SPRITE_CLASS) + { + if(sel.fwrite && sel.fge) + { + f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(m_local.d[skip].f); + } + + if(sel.zb) + { + zo = m_local.d[skip].z; + } + } + + if(sel.fb) + { + if(sel.edge) + { + cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9); + } + + if(sel.tfx != TFX_NONE) + { + if(sel.fst) + { + GSVector4i vt(scan.t); + + GSVector8i u = GSVector8i::broadcast32(vt.xxxx()) + GSVector8i::cast(m_local.d[skip].s); + GSVector8i v = GSVector8i::broadcast32(vt.yyyy()); + + if(sel.prim != GS_SPRITE_CLASS || sel.mmin) + { + v += GSVector8i::cast(m_local.d[skip].t); + } + else if(sel.ltf) + { + vf = v.xxzzlh().srl16(12); + } + + s = GSVector8::cast(u); + t = GSVector8::cast(v); + } + else + { + s = GSVector8::broadcast32(scan.t.xxxx()) + m_local.d[skip].s; + t = GSVector8::broadcast32(scan.t.yyyy()) + m_local.d[skip].t; + q = GSVector8::broadcast32(scan.t.zzzz()) + m_local.d[skip].q; + } + } + + if(!(sel.tfx == TFX_DECAL && sel.tcc)) + { + if(sel.iip) + { + GSVector4i c(scan.c); + + c = c.upl16(c.zwxy()); + + rbf = GSVector8i::broadcast32(c.xxxx()).add16(m_local.d[skip].rb); + gaf = GSVector8i::broadcast32(c.zzzz()).add16(m_local.d[skip].ga); + } + else + { + rbf = m_local.c.rb; + gaf = m_local.c.ga; + } + } + } + + while(1) + { + do + { + int fa = 0, za = 0; + GSVector8i fd, zs, zd; + GSVector8i fm, zm; + GSVector8i rb, ga; + + // TestZ + + if(sel.zb) + { + za = fza_base->y + fza_offset->y; + + if(sel.prim != GS_SPRITE_CLASS) + { + GSVector8 z = GSVector8::broadcast32(scan.p.zzzz()) + zo; + + if(sel.zoverflow) + { + zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); + } + else + { + zs = GSVector8i(z); + } + } + else + { + zs = GSVector8i::broadcast32(GSVector4i::load(m_local.p.z)); + } + + if(sel.ztest) + { + zd = GSVector8i::load( + (uint8*)m_global.vm + za * 2, (uint8*)m_global.vm + za * 2 + 16, + (uint8*)m_global.vm + za * 2 + 32, (uint8*)m_global.vm + za * 2 + 48); + + switch(sel.zpsm) + { + case 1: zd = zd.sll32(8).srl32(8); break; + case 2: zd = zd.sll32(16).srl32(16); break; + default: break; + } + + GSVector8i zso = zs; + GSVector8i zdo = zd; + + if(sel.zoverflow || sel.zpsm == 0) + { + zso -= GSVector8i::x80000000(); + zdo -= GSVector8i::x80000000(); + } + + switch(sel.ztst) + { + case ZTST_GEQUAL: test |= zso < zdo; break; + case ZTST_GREATER: test |= zso <= zdo; break; + } + + if(test.alltrue()) continue; + } + } + + // SampleTexture + + if(sel.fb && sel.tfx != TFX_NONE) + { + GSVector8i u, v, uv[2]; + GSVector8i lodi, lodf; + GSVector8i minuv, maxuv; + GSVector8i addr00, addr01, addr10, addr11; + GSVector8i c00, c01, c10, c11; + + if(sel.mmin) + { + if(!sel.fst) + { + GSVector8 qrcp = q.rcp(); + + u = GSVector8i(s * qrcp); + v = GSVector8i(t * qrcp); + } + else + { + u = GSVector8i::cast(s); + v = GSVector8i::cast(t); + } + + if(!sel.lcm) + { + GSVector8 tmp = q.log2(3) * m_global.l + m_global.k; // (-log2(Q) * (1 << L) + K) * 0x10000 + + GSVector8i lod = GSVector8i(tmp.sat(GSVector8::zero(), m_global.mxl), false); + + if(sel.mmin == 1) // round-off mode + { + lod += 0x8000; + } + + lodi = lod.srl32(16); + + if(sel.mmin == 2) // trilinear mode + { + lodf = lod.xxzzlh(); + } + + // shift u/v by (int)lod + + u = u.srav32(lodi); + v = v.srav32(lodi); + + uv[0] = u.srav32(lodi); + uv[1] = v.srav32(lodi); + + GSVector8i tmin = GSVector8i::broadcast128(m_global.t.min); + GSVector8i tminu = tmin.upl16().srlv32(lodi); + GSVector8i tminv = tmin.uph16().srlv32(lodi); + + GSVector8i tmax = GSVector8i::broadcast128(m_global.t.max); + GSVector8i tmaxu = tmax.upl16().srlv32(lodi); + GSVector8i tmaxv = tmax.uph16().srlv32(lodi); + + minuv = tminu.pu32(tminv); + maxuv = tmaxu.pu32(tmaxv); + } + else + { + lodi = m_global.lod.i; + + u = u.srav32(lodi); + v = v.srav32(lodi); + + uv[0] = u; + uv[1] = v; + + minuv = m_local.temp.uv_minmax[0]; + maxuv = m_local.temp.uv_minmax[1]; + } + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + + uf = u.xxzzlh().srl16(12); + vf = v.xxzzlh().srl16(12); + } + + GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector8i uv1 = uv0; + + { + GSVector8i repeat = (uv0 & minuv) | maxuv; + GSVector8i clamp = uv0.sat_i16(minuv, maxuv); + + uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector8i::x0001()); + + GSVector8i repeat = (uv1 & minuv) | maxuv; + GSVector8i clamp = uv1.sat_i16(minuv, maxuv); + + uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + GSVector8i y0 = uv0.uph16() << (sel.tw + 3); + GSVector8i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector8i y1 = uv1.uph16() << (sel.tw + 3); + GSVector8i x1 = uv1.upl16(); + + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; + + if(sel.tlu) + { + for(int i = 0; i < 8; i++) + { + const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = m_global.clut[tex[addr00.u32[i]]]; + c01.u32[i] = m_global.clut[tex[addr01.u32[i]]]; + c10.u32[i] = m_global.clut[tex[addr10.u32[i]]]; + c11.u32[i] = m_global.clut[tex[addr11.u32[i]]]; + } + } + else + { + for(int i = 0; i < 8; i++) + { + const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = tex[addr00.u32[i]]; + c01.u32[i] = tex[addr01.u32[i]]; + c10.u32[i] = tex[addr10.u32[i]]; + c11.u32[i] = tex[addr11.u32[i]]; + } + } + + GSVector8i rb00 = c00.sll16(8).srl16(8); + GSVector8i ga00 = c00.srl16(8); + GSVector8i rb01 = c01.sll16(8).srl16(8); + GSVector8i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); + + GSVector8i rb10 = c10.sll16(8).srl16(8); + GSVector8i ga10 = c10.srl16(8); + GSVector8i rb11 = c11.sll16(8).srl16(8); + GSVector8i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); + + rb = rb00.lerp16_4(rb10, vf); + ga = ga00.lerp16_4(ga10, vf); + } + else + { + addr00 = y0 + x0; + + if(sel.tlu) + { + for(int i = 0; i < 8; i++) + { + c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]]; + } + } + else + { + for(int i = 0; i < 8; i++) + { + c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]; + } + } + + rb = c00.sll16(8).srl16(8); + ga = c00.srl16(8); + } + + if(sel.mmin != 1) // !round-off mode + { + GSVector8i rb2, ga2; + + lodi += GSVector8i::x00000001(); + + u = uv[0].sra32(1); + v = uv[1].sra32(1); + + minuv = minuv.srl16(1); + maxuv = maxuv.srl16(1); + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + + uf = u.xxzzlh().srl16(12); + vf = v.xxzzlh().srl16(12); + } + + GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector8i uv1 = uv0; + + { + GSVector8i repeat = (uv0 & minuv) | maxuv; + GSVector8i clamp = uv0.sat_i16(minuv, maxuv); + + uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector8i::x0001()); + + GSVector8i repeat = (uv1 & minuv) | maxuv; + GSVector8i clamp = uv1.sat_i16(minuv, maxuv); + + uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + GSVector8i y0 = uv0.uph16() << (sel.tw + 3); + GSVector8i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector8i y1 = uv1.uph16() << (sel.tw + 3); + GSVector8i x1 = uv1.upl16(); + + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; + + if(sel.tlu) + { + for(int i = 0; i < 8; i++) + { + const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = m_global.clut[tex[addr00.u32[i]]]; + c01.u32[i] = m_global.clut[tex[addr01.u32[i]]]; + c10.u32[i] = m_global.clut[tex[addr10.u32[i]]]; + c11.u32[i] = m_global.clut[tex[addr11.u32[i]]]; + } + } + else + { + for(int i = 0; i < 8; i++) + { + const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = tex[addr00.u32[i]]; + c01.u32[i] = tex[addr01.u32[i]]; + c10.u32[i] = tex[addr10.u32[i]]; + c11.u32[i] = tex[addr11.u32[i]]; + } + } + + GSVector8i rb00 = c00.sll16(8).srl16(8); + GSVector8i ga00 = c00.srl16(8); + GSVector8i rb01 = c01.sll16(8).srl16(8); + GSVector8i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); + + GSVector8i rb10 = c10.sll16(8).srl16(8); + GSVector8i ga10 = c10.srl16(8); + GSVector8i rb11 = c11.sll16(8).srl16(8); + GSVector8i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); + + rb2 = rb00.lerp16_4(rb10, vf); + ga2 = ga00.lerp16_4(ga10, vf); + } + else + { + addr00 = y0 + x0; + + if(sel.tlu) + { + for(int i = 0; i < 8; i++) + { + c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]]; + } + } + else + { + for(int i = 0; i < 8; i++) + { + c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]; + } + } + + rb2 = c00.sll16(8).srl16(8); + ga2 = c00.srl16(8); + } + + if(sel.lcm) lodf = m_global.lod.f; + + lodf = lodf.srl16(1); + + rb = rb.lerp16<0>(rb2, lodf); + ga = ga.lerp16<0>(ga2, lodf); + } + } + else + { + if(!sel.fst) + { + GSVector8 qrcp = q.rcp(); + + u = GSVector8i(s * qrcp); + v = GSVector8i(t * qrcp); + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + } + } + else + { + u = GSVector8i::cast(s); + v = GSVector8i::cast(t); + } + + if(sel.ltf) + { + uf = u.xxzzlh().srl16(12); + + if(sel.prim != GS_SPRITE_CLASS) + { + vf = v.xxzzlh().srl16(12); + } + } + + GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector8i uv1 = uv0; + + GSVector8i tmin = GSVector8i::broadcast128(m_global.t.min); + GSVector8i tmax = GSVector8i::broadcast128(m_global.t.max); + + { + GSVector8i repeat = (uv0 & tmin) | tmax; + GSVector8i clamp = uv0.sat_i16(tmin, tmax); + + uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector8i::x0001()); + + GSVector8i repeat = (uv1 & tmin) | tmax; + GSVector8i clamp = uv1.sat_i16(tmin, tmax); + + uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(m_global.t.mask)); + } + + GSVector8i y0 = uv0.uph16() << (sel.tw + 3); + GSVector8i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector8i y1 = uv1.uph16() << (sel.tw + 3); + GSVector8i x1 = uv1.upl16(); + + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; + + if(sel.tlu) + { + const uint8* tex = (const uint8*)m_global.tex[0]; + + c00 = addr00.gather32_32(tex, m_global.clut); + c01 = addr01.gather32_32(tex, m_global.clut); + c10 = addr10.gather32_32(tex, m_global.clut); + c11 = addr11.gather32_32(tex, m_global.clut); + } + else + { + const uint32* tex = (const uint32*)m_global.tex[0]; + + c00 = addr00.gather32_32(tex); + c01 = addr01.gather32_32(tex); + c10 = addr10.gather32_32(tex); + c11 = addr11.gather32_32(tex); + } + + GSVector8i rb00 = c00.sll16(8).srl16(8); + GSVector8i ga00 = c00.srl16(8); + GSVector8i rb01 = c01.sll16(8).srl16(8); + GSVector8i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); + + GSVector8i rb10 = c10.sll16(8).srl16(8); + GSVector8i ga10 = c10.srl16(8); + GSVector8i rb11 = c11.sll16(8).srl16(8); + GSVector8i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); + + rb = rb00.lerp16_4(rb10, vf); + ga = ga00.lerp16_4(ga10, vf); + } + else + { + addr00 = y0 + x0; + + if(sel.tlu) + { + c00 = addr00.gather32_32((const uint8*)m_global.tex[0], m_global.clut); + } + else + { + c00 = addr00.gather32_32((const uint32*)m_global.tex[0]); + } + + rb = c00.sll16(8).srl16(8); + ga = c00.srl16(8); + } + } + } + + // AlphaTFX + + if(sel.fb) + { + switch(sel.tfx) + { + case TFX_MODULATE: + ga = ga.modulate16<1>(gaf).clamp8(); + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_DECAL: + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_HIGHLIGHT: + ga = ga.mix16(!sel.tcc ? gaf.srl16(7) : ga.addus8(gaf.srl16(7))); + break; + case TFX_HIGHLIGHT2: + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_NONE: + ga = sel.iip ? gaf.srl16(7) : gaf; + break; + } + + if(sel.aa1) + { + GSVector8i x00800080(0x00800080); + + GSVector8i a = sel.edge ? cov : x00800080; + + if(!sel.abe) + { + ga = ga.mix16(a); + } + else + { + ga = ga.blend8(a, ga.eq16(x00800080).srl32(16).sll32(16)); + } + } + } + + // ReadMask + + if(sel.fwrite) + { + fm = m_global.fm; + } + + if(sel.zwrite) + { + zm = m_global.zm; + } + + // TestAlpha + + if(!TestAlpha(test, fm, zm, ga)) continue; + + // ColorTFX + + if(sel.fwrite) + { + GSVector8i af; + + switch(sel.tfx) + { + case TFX_MODULATE: + rb = rb.modulate16<1>(rbf).clamp8(); + break; + case TFX_DECAL: + break; + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + af = gaf.yywwlh().srl16(7); + rb = rb.modulate16<1>(rbf).add16(af).clamp8(); + ga = ga.modulate16<1>(gaf).add16(af).clamp8().mix16(ga); + break; + case TFX_NONE: + rb = sel.iip ? rbf.srl16(7) : rbf; + break; + } + } + + // Fog + + if(sel.fwrite && sel.fge) + { + GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(GSVector4i::load(m_local.p.f)); + + GSVector8i frb((int)m_global.frb); + GSVector8i fga((int)m_global.fga); + + rb = frb.lerp16<0>(rb, fog); + ga = fga.lerp16<0>(ga, fog).mix16(ga); + + /* + fog = fog.srl16(7); + + GSVector8i ifog = GSVector4i::x00ff().sub16(fog); + + rb = rb.mul16l(fog).add16(frb.mul16l(ifog)).srl16(8); + ga = ga.mul16l(fog).add16(fga.mul16l(ifog)).srl16(8).mix16(ga); + */ + } + + // ReadFrame + + if(sel.fb) + { + fa = fza_base->x + fza_offset->x; + + if(sel.rfb) + { + fd = GSVector8i::load( + (uint8*)m_global.vm + fa * 2, (uint8*)m_global.vm + fa * 2 + 16, + (uint8*)m_global.vm + fa * 2 + 32, (uint8*)m_global.vm + fa * 2 + 48); + } + } + + // TestDestAlpha + + if(sel.date && (sel.fpsm == 0 || sel.fpsm == 2)) + { + if(sel.datm) + { + if(sel.fpsm == 2) + { + test |= fd.srl32(15) == GSVector8i::zero(); + } + else + { + test |= (~fd).sra32(31); + } + } + else + { + if(sel.fpsm == 2) + { + test |= fd.sll32(16).sra32(31); + } + else + { + test |= fd.sra32(31); + } + } + + if(test.alltrue()) continue; + } + + // WriteMask + + int fzm = 0; + + if(!sel.notest) + { + if(sel.fwrite) + { + fm |= test; + } + + if(sel.zwrite) + { + zm |= test; + } + + if(sel.fwrite && sel.zwrite) + { + fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask(); + } + else if(sel.fwrite) + { + fzm = ~(fm == GSVector8i::xffffffff()).ps32().mask(); + } + else if(sel.zwrite) + { + fzm = ~(zm == GSVector8i::xffffffff()).ps32().mask(); + } + } + + // WriteZBuf + + if(sel.zwrite) + { + if(sel.ztest && sel.zpsm < 2) + { + zs = zs.blend8(zd, zm); + } + + bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; + + if(sel.notest) + { + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + za * 2, zs.extract<0>()); + GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs.extract<0>()); + GSVector4i::storel((uint8*)m_global.vm + za * 2 + 32, zs.extract<1>()); + GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 48, zs.extract<1>()); + } + else + { + WritePixel(zs, za, 0, sel.zpsm); + WritePixel(zs, za, 1, sel.zpsm); + WritePixel(zs, za, 2, sel.zpsm); + WritePixel(zs, za, 3, sel.zpsm); + WritePixel(zs, za, 4, sel.zpsm); + WritePixel(zs, za, 5, sel.zpsm); + WritePixel(zs, za, 6, sel.zpsm); + WritePixel(zs, za, 7, sel.zpsm); + } + } + else + { + if(fast) + { + if(fzm & 0x00000f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs.extract<0>()); + if(fzm & 0x0000f000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs.extract<0>()); + if(fzm & 0x0f000000) GSVector4i::storel((uint8*)m_global.vm + za * 2 + 32, zs.extract<1>()); + if(fzm & 0xf0000000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 48, zs.extract<1>()); + } + else + { + if(fzm & 0x00000300) WritePixel(zs, za, 0, sel.zpsm); + if(fzm & 0x00000c00) WritePixel(zs, za, 1, sel.zpsm); + if(fzm & 0x00003000) WritePixel(zs, za, 2, sel.zpsm); + if(fzm & 0x0000c000) WritePixel(zs, za, 3, sel.zpsm); + if(fzm & 0x03000000) WritePixel(zs, za, 4, sel.zpsm); + if(fzm & 0x0c000000) WritePixel(zs, za, 5, sel.zpsm); + if(fzm & 0x30000000) WritePixel(zs, za, 6, sel.zpsm); + if(fzm & 0xc0000000) WritePixel(zs, za, 7, sel.zpsm); + } + } + } + + // AlphaBlend + + if(sel.fwrite && (sel.abe || sel.aa1)) + { + GSVector8i rbs = rb, gas = ga, rbd, gad, a, mask; + + if(sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1) + { + switch(sel.fpsm) + { + case 0: + case 1: + rbd = fd.sll16(8).srl16(8); + gad = fd.srl16(8); + break; + case 2: + rbd = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + gad = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + break; + } + } + + if(sel.aba != sel.abb) + { + switch(sel.aba) + { + case 0: break; + case 1: rb = rbd; break; + case 2: rb = GSVector8i::zero(); break; + } + + switch(sel.abb) + { + case 0: rb = rb.sub16(rbs); break; + case 1: rb = rb.sub16(rbd); break; + case 2: break; + } + + if(!(sel.fpsm == 1 && sel.abc == 1)) + { + switch(sel.abc) + { + case 0: a = gas.yywwlh().sll16(7); break; + case 1: a = gad.yywwlh().sll16(7); break; + case 2: a = m_global.afix; break; + } + + rb = rb.modulate16<1>(a); + } + + switch(sel.abd) + { + case 0: rb = rb.add16(rbs); break; + case 1: rb = rb.add16(rbd); break; + case 2: break; + } + } + else + { + switch(sel.abd) + { + case 0: break; + case 1: rb = rbd; break; + case 2: rb = GSVector8i::zero(); break; + } + } + + if(sel.pabe) + { + mask = (gas << 8).sra32(31); + + rb = rbs.blend8(rb, mask); + } + + if(sel.aba != sel.abb) + { + switch(sel.aba) + { + case 0: break; + case 1: ga = gad; break; + case 2: ga = GSVector8i::zero(); break; + } + + switch(sel.abb) + { + case 0: ga = ga.sub16(gas); break; + case 1: ga = ga.sub16(gad); break; + case 2: break; + } + + if(!(sel.fpsm == 1 && sel.abc == 1)) + { + ga = ga.modulate16<1>(a); + } + + switch(sel.abd) + { + case 0: ga = ga.add16(gas); break; + case 1: ga = ga.add16(gad); break; + case 2: break; + } + } + else + { + switch(sel.abd) + { + case 0: break; + case 1: ga = gad; break; + case 2: ga = GSVector8i::zero(); break; + } + } + + if(sel.pabe) + { + ga = gas.blend8(ga, mask >> 16); + } + else + { + if(sel.fpsm != 1) + { + ga = ga.mix16(gas); + } + } + } + + // WriteFrame + + if(sel.fwrite) + { + if(sel.fpsm == 2 && sel.dthe) + { + int y = (top & 3) << 1; + + rb = rb.add16(GSVector8i::broadcast128(m_global.dimx[0 + y])); + ga = ga.add16(GSVector8i::broadcast128(m_global.dimx[1 + y])); + } + + if(sel.colclamp == 0) + { + rb &= GSVector8i::x00ff(); + ga &= GSVector8i::x00ff(); + } + + GSVector8i fs = rb.upl16(ga).pu16(rb.uph16(ga)); + + if(sel.fba && sel.fpsm != 1) + { + fs |= GSVector8i::x80000000(); + } + + if(sel.fpsm == 2) + { + GSVector8i rb = fs & 0x00f800f8; + GSVector8i ga = fs & 0x8000f800; + + fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + } + + if(sel.rfb) + { + fs = fs.blend(fd, fm); + } + + bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest; + + if(sel.notest) + { + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs.extract<0>()); + GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs.extract<0>()); + GSVector4i::storel((uint8*)m_global.vm + fa * 2 + 32, fs.extract<1>()); + GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 48, fs.extract<1>()); + } + else + { + WritePixel(fs, fa, 0, sel.fpsm); + WritePixel(fs, fa, 1, sel.fpsm); + WritePixel(fs, fa, 2, sel.fpsm); + WritePixel(fs, fa, 3, sel.fpsm); + WritePixel(fs, fa, 4, sel.fpsm); + WritePixel(fs, fa, 5, sel.fpsm); + WritePixel(fs, fa, 6, sel.fpsm); + WritePixel(fs, fa, 7, sel.fpsm); + } + } + else + { + if(fast) + { + if(fzm & 0x0000000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs.extract<0>()); + if(fzm & 0x000000f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs.extract<0>()); + if(fzm & 0x000f0000) GSVector4i::storel((uint8*)m_global.vm + fa * 2 + 32, fs.extract<1>()); + if(fzm & 0x00f00000) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 48, fs.extract<1>()); + } + else + { + if(fzm & 0x00000003) WritePixel(fs, fa, 0, sel.fpsm); + if(fzm & 0x0000000c) WritePixel(fs, fa, 1, sel.fpsm); + if(fzm & 0x00000030) WritePixel(fs, fa, 2, sel.fpsm); + if(fzm & 0x000000c0) WritePixel(fs, fa, 3, sel.fpsm); + if(fzm & 0x00030000) WritePixel(fs, fa, 4, sel.fpsm); + if(fzm & 0x000c0000) WritePixel(fs, fa, 5, sel.fpsm); + if(fzm & 0x00300000) WritePixel(fs, fa, 6, sel.fpsm); + if(fzm & 0x00c00000) WritePixel(fs, fa, 7, sel.fpsm); + } + } + } + } + while(0); + + if(sel.edge) break; + + if(steps <= 0) break; + + // Step + + steps -= 8; + + fza_offset += 2; + + if(sel.prim != GS_SPRITE_CLASS) + { + if(sel.zb) + { + zo += m_local.d8.z; + } + + if(sel.fwrite && sel.fge) + { + f = f.add16(m_local.d8.f); + } + } + + if(sel.fb) + { + if(sel.tfx != TFX_NONE) + { + if(sel.fst) + { + GSVector8i stq = GSVector8i::cast(m_local.d8.stq); + + s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); + + if(sel.prim != GS_SPRITE_CLASS || sel.mmin) + { + t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy()); + } + } + else + { + GSVector8 stq = m_local.d8.stq; + + s += stq.xxxx(); + t += stq.yyyy(); + q += stq.zzzz(); + } + } + } + + if(!(sel.tfx == TFX_DECAL && sel.tcc)) + { + if(sel.iip) + { + GSVector8i c = m_local.d8.c; + + rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero()); + gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero()); + } + } + + if(!sel.notest) + { + test = GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]; + } + } + + #else + GSVector4i test; GSVector4 zo; GSVector4i f; @@ -286,6 +1555,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS steps = pixels - 4; } + ASSERT((left & 3) == 0); + const GSVector2i* fza_base = &m_global.fzbr[top]; const GSVector2i* fza_offset = &m_global.fzbc[left >> 2]; @@ -1348,6 +2619,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; } } + + #endif } void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) @@ -1364,7 +2637,8 @@ void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& s m_global.sel.edge = edge; } -bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga) +template +bool GSDrawScanline::TestAlpha(T& test, T& fm, T& zm, const T& ga) { GSScanlineSelector sel = m_global.sel; @@ -1383,7 +2657,7 @@ bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, break; } - GSVector4i t; + T t; switch(sel.atst) { @@ -1396,20 +2670,20 @@ bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, case ATST_LESS: case ATST_LEQUAL: - t = (ga >> 16) > m_global.aref; + t = (ga >> 16) > T(m_global.aref); break; case ATST_EQUAL: - t = (ga >> 16) != m_global.aref; + t = (ga >> 16) != T(m_global.aref); break; case ATST_GEQUAL: case ATST_GREATER: - t = (ga >> 16) < m_global.aref; + t = (ga >> 16) < T(m_global.aref); break; case ATST_NOTEQUAL: - t = (ga >> 16) == m_global.aref; + t = (ga >> 16) == T(m_global.aref); break; default: @@ -1433,7 +2707,7 @@ bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, case AFAIL_RGB_ONLY: zm |= t; - fm |= t & GSVector4i::xff000000(); + fm |= t & T::xff000000(); break; default: @@ -1443,9 +2717,9 @@ bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, return true; } -static const int s_offsets[4] = {0, 2, 8, 10}; +static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26}; // columnTable16[0] -void GSDrawScanline::WritePixel(const GSVector4i& src, int addr, int i, uint32 psm) +template void GSDrawScanline::WritePixel(const T& src, int addr, int i, uint32 psm) { uint8* dst = (uint8*)m_global.vm + addr * 2 + s_offsets[i] * 2; @@ -1474,7 +2748,11 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) uint32 m; + #if _M_SSE >= 0x501 + m = m_global.zm; + #else m = m_global.zm.u32[0]; + #endif if(m != 0xffffffff) { @@ -1507,7 +2785,11 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) } } + #if _M_SSE >= 0x501 + m = m_global.fm; + #else m = m_global.fm.u32[0]; + #endif if(m != 0xffffffff) { diff --git a/plugins/GSdx/GSDrawScanline.h b/plugins/GSdx/GSDrawScanline.h index 1d0d221ef8..3e0e3b0464 100644 --- a/plugins/GSdx/GSDrawScanline.h +++ b/plugins/GSdx/GSDrawScanline.h @@ -81,8 +81,8 @@ public: bool IsEdge() const {return m_global.sel.aa1;} bool IsRect() const {return m_global.sel.IsSolidRect();} - bool TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga); - void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm); + template bool TestAlpha(T& test, T& fm, T& zm, const T& ga); + template void WritePixel(const T& src, int addr, int i, uint32 psm); #endif diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 2cee7ec503..9de1c01107 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -22,6 +22,38 @@ #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" +#if _M_SSE >= 0x501 + +const GSVector8i GSDrawScanlineCodeGenerator::m_test[16] = +{ + GSVector8i::zero(), + GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000), + GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000), + GSVector8i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff), + GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff), + GSVector8i::zero(), +}; + +const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] = +{ + GSVector8(0.204446009836232697516f), + GSVector8(-1.04913055217340124191f), + GSVector8(2.28330284476918490682f), + GSVector8(1.0f), +}; + +#else + const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] = { GSVector4i::zero(), @@ -42,6 +74,8 @@ const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] = GSVector4(1.0f), }; +#endif + GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) , m_local(*(GSScanlineLocalData*)param) @@ -51,6 +85,81 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key Generate(); } +#if _M_SSE >= 0x501 + +void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift) +{ + if(shift == 0) + { + vpmulhrsw(a, f); + } + else + { + vpsllw(a, (uint8)(shift + 1)); + vpmulhw(a, f); + } +} + +void GSDrawScanlineCodeGenerator::lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift) +{ + vpsubw(a, b); + modulate16(a, f, shift); + vpaddw(a, b); +} + +void GSDrawScanlineCodeGenerator::lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f) +{ + vpsubw(a, b); + vpmullw(a, f); + vpsraw(a, 4); + vpaddw(a, b); +} + +void GSDrawScanlineCodeGenerator::mix16(const Ymm& a, const Ymm& b, const Ymm& temp) +{ + vpblendw(a, b, 0xaa); +} + +void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp) +{ + vpackuswb(a, a); + vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks + vpmovzxbw(a, a); +} + +void GSDrawScanlineCodeGenerator::alltrue() +{ + vpmovmskb(eax, ymm7); + cmp(eax, 0xffffffff); + je("step", T_NEAR); +} + +void GSDrawScanlineCodeGenerator::blend(const Ymm& a, const Ymm& b, const Ymm& mask) +{ + vpand(b, mask); + vpandn(mask, a); + vpor(a, b, mask); +} + +void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask) +{ + vpand(b, mask); + vpandn(mask, a); + vpor(b, mask); +} + +void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b) +{ + vpblendvb(a, a, b, xmm0); +} + +void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a) +{ + vpblendvb(b, a, b, xmm0); +} + +#else + void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) { #if _M_SSE >= 0x500 @@ -244,3 +353,5 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) #endif } + +#endif \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 2552b33d88..808f6d1f84 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -35,6 +35,55 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void Generate(); + #if _M_SSE >= 0x501 + + void Init(); + void Step(); + void TestZ(const Ymm& temp1, const Ymm& temp2); + void SampleTexture(); + void Wrap(const Ymm& uv0); + void Wrap(const Ymm& uv0, const Ymm& uv1); + void SampleTextureLOD(); + void WrapLOD(const Ymm& uv0); + void WrapLOD(const Ymm& uv0, const Ymm& uv1); + void AlphaTFX(); + void ReadMask(); + void TestAlpha(); + void ColorTFX(); + void Fog(); + void ReadFrame(); + void TestDestAlpha(); + void WriteMask(); + void WriteZBuf(); + void AlphaBlend(); + void WriteFrame(); + + #if defined(_M_AMD64) || defined(_WIN64) + void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr); + void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz); + void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm); + #else + void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr); + void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz); + void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm); + #endif + + void ReadTexel(int pixels, int mip_offset = 0); + void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i); + + void modulate16(const Ymm& a, const Operand& f, int shift); + void lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift); + void lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f); + void mix16(const Ymm& a, const Ymm& b, const Ymm& temp); + void clamp16(const Ymm& a, const Ymm& temp); + void alltrue(); + void blend(const Ymm& a, const Ymm& b, const Ymm& mask); + void blendr(const Ymm& b, const Ymm& a, const Ymm& mask); + void blend8(const Ymm& a, const Ymm& b); + void blend8r(const Ymm& b, const Ymm& a); + + #else + void Init(); void Step(); void TestZ(const Xmm& temp1, const Xmm& temp2); @@ -80,9 +129,17 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void blend8(const Xmm& a, const Xmm& b); void blend8r(const Xmm& b, const Xmm& a); + #endif + public: GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); + #if _M_SSE >= 0x501 + static const GSVector8i m_test[16]; + static const GSVector8 m_log2_coef[4]; + #else static const GSVector4i m_test[8]; static const GSVector4 m_log2_coef[4]; + #endif + }; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 9db0ea64d0..3924291cfe 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -23,7 +23,7 @@ #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) static const int _args = 16; static const int _top = _args + 4; @@ -1236,24 +1236,21 @@ return; // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] - vmovq(xmm4, ptr[&m_local.gd->t.minmax]); // x x x x maxv maxu minv minu - vpunpcklwd(xmm4, xmm4); // maxv maxv maxu maxu minv minv minu minu - vpxor(xmm1, xmm1); - - vpunpckldq(xmm6, xmm4, xmm4); // minv minv minv minv minu minu minu minu - vpunpcklwd(xmm5, xmm6, xmm1); // 0 minu 0 minu 0 minu 0 minu + + vmovdqa(xmm4, ptr[&m_local.gd->t.min]); + vpunpcklwd(xmm5, xmm4, xmm1); // minu + vpunpckhwd(xmm6, xmm4, xmm1); // minv vpsrlvd(xmm5, xmm5, xmm0); - vpunpckhwd(xmm6, xmm6, xmm1); // 0 minv 0 minv 0 minv 0 minv vpsrlvd(xmm6, xmm6, xmm0); - vpackusdw(xmm5, xmm6); // xmm5 = minv minv minv minv minu minu minu minu - - vpunpckhdq(xmm4, xmm4); // maxv maxv maxv maxv maxu maxu maxu maxu - vpunpcklwd(xmm6, xmm4, xmm1); // 0 maxu 0 maxu 0 maxu 0 maxu + vpackusdw(xmm5, xmm6); + + vmovdqa(xmm4, ptr[&m_local.gd->t.max]); + vpunpcklwd(xmm6, xmm4, xmm1); // maxu + vpunpckhwd(xmm4, xmm4, xmm1); // maxv vpsrlvd(xmm6, xmm6, xmm0); - vpunpckhwd(xmm4, xmm1); // 0 maxv 0 maxv 0 maxv 0 maxv vpsrlvd(xmm4, xmm4, xmm0); - vpackusdw(xmm6, xmm4); // xmm6 = maxv maxv maxv maxv maxu maxu maxu maxu + vpackusdw(xmm6, xmm4); vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5); vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6); @@ -2807,7 +2804,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, } } -static const int s_offsets[4] = {0, 2, 8, 10}; +static const int s_offsets[] = {0, 2, 8, 10}; void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm) { @@ -2865,7 +2862,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) vmovdqa(ptr[&m_local.temp.test], xmm7); } - for(int j = 0; j < 4; j++) + for(uint8 j = 0; j < 4; j++) { mov(ebx, ptr[&lod_i->u32[j]]); mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); @@ -2895,18 +2892,9 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) for(int i = 0; i < pixels; i++) { - if(m_cpu.has(util::Cpu::tAVX2) && !m_sel.tlu) // vpgatherdd seems to be dead slow for byte aligned offsets, not using it for palette lookups + for(uint8 j = 0; j < 4; j++) { - Xmm mask = Xmm(t[i]); - vpcmpeqd(mask, mask); - vpgatherdd(Xmm(r[i * 2 + 1]), ptr[ebx + Xmm(r[i * 2 + 0]) * 4], mask); - } - else - { - for(int j = 0; j < 4; j++) - { - ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } + ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } } @@ -2914,6 +2902,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) { + ASSERT(i < 4); + const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; if(i == 0) vmovd(eax, addr); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp new file mode 100644 index 0000000000..02ef50003b --- /dev/null +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp @@ -0,0 +1,2968 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSDrawScanlineCodeGenerator.h" +#include "GSVertexSW.h" + +#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) + +static const int _args = 16; +static const int _top = _args + 4; +static const int _v = _args + 8; + +void GSDrawScanlineCodeGenerator::Generate() +{ +//ret(8); + + push(ebx); + push(esi); + push(edi); + push(ebp); + + //db(0xcc); + + Init(); + + if(!m_sel.edge) + { + align(16); + } + +L("loop"); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ymm0 = z/zi + // ymm2 = s/u (tme) + // ymm3 = t/v (tme) + // ymm4 = q (tme) + // ymm5 = rb (!tme) + // ymm6 = ga (!tme) + // ymm7 = test + + bool tme = m_sel.tfx != TFX_NONE; + + TestZ(tme ? ymm5 : ymm2, tme ? ymm6 : ymm3); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // - ymm0 + // ymm2 = s/u (tme) + // ymm3 = t/v (tme) + // ymm4 = q (tme) + // ymm5 = rb (!tme) + // ymm6 = ga (!tme) + // ymm7 = test + + if(m_sel.mmin) + { + SampleTextureLOD(); + } + else + { + SampleTexture(); + } + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // - ymm2 + // - ymm3 + // - ymm4 + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + AlphaTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + ReadMask(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + TestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + ColorTFX(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + Fog(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + ReadFrame(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = fd + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + TestDestAlpha(); + + // ecx = steps + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = fd + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + // ymm7 = test + + WriteMask(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // ebp = za + // ymm2 = fd + // ymm3 = fm + // ymm4 = zm + // ymm5 = rb + // ymm6 = ga + + WriteZBuf(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // - ebp + // ymm2 = fd + // ymm3 = fm + // - ymm4 + // ymm5 = rb + // ymm6 = ga + + AlphaBlend(); + + // ebx = fa + // ecx = steps + // edx = fzm + // esi = fzbr + // edi = fzbc + // ymm2 = fd + // ymm3 = fm + // ymm5 = rb + // ymm6 = ga + + WriteFrame(); + +L("step"); + + // if(steps <= 0) break; + + if(!m_sel.edge) + { + test(ecx, ecx); + + jle("exit", T_NEAR); + + Step(); + + jmp("loop", T_NEAR); + } + +L("exit"); + + pop(ebp); + pop(edi); + pop(esi); + pop(ebx); + + ret(8); +} + +void GSDrawScanlineCodeGenerator::Init() +{ + if(!m_sel.notest) + { + // int skip = left & 7; + + mov(ebx, edx); + and(edx, 7); + + // int steps = pixels + skip - 8; + + lea(ecx, ptr[ecx + edx - 8]); + + // left -= skip; + + sub(ebx, edx); + + // GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))]; + + shl(edx, 5); + + vmovdqa(ymm7, ptr[edx + (size_t)&m_test[0]]); + + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 5); + + vpor(ymm7, ptr[eax + (size_t)&m_test[15]]); + } + else + { + mov(ebx, edx); // left + xor(edx, edx); // skip + lea(ecx, ptr[ecx - 8]); // steps + } + + // GSVector2i* fza_base = &m_local.gd->fzbr[top]; + + mov(esi, ptr[esp + _top]); + lea(esi, ptr[esi * 8]); + add(esi, ptr[&m_local.gd->fzbr]); + + // GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2]; + + lea(edi, ptr[ebx * 2]); + add(edi, ptr[&m_local.gd->fzbc]); + + if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) + { + // edx = &m_local.d[skip] + + lea(edx, ptr[edx * 8 + (size_t)m_local.d]); + + // ebx = &v + + mov(ebx, ptr[esp + _v]); + } + + if(m_sel.prim != GS_SPRITE_CLASS) + { + if(m_sel.fwrite && m_sel.fge || m_sel.zb) + { + vbroadcastf128(ymm0, ptr[ebx + offsetof(GSVertexSW, p)]); // v.p + + if(m_sel.fwrite && m_sel.fge) + { + // f = GSVector8i(vp).zzzzh().zzzz().add16(m_local.d[skip].f); + + vcvttps2dq(ymm1, ymm0); + vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(ymm1, ymm1, _MM_SHUFFLE(2, 2, 2, 2)); + vpaddw(ymm1, ptr[edx + offsetof(GSScanlineLocalData::skip, f)]); + + vmovdqa(ptr[&m_local.temp.f], ymm1); + } + + if(m_sel.zb) + { + // z = vp.zzzz() + m_local.d[skip].z; + + vshufps(ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); + vmovaps(ptr[&m_local.temp.z], ymm0); + vmovaps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, z)]); + vmovaps(ptr[&m_local.temp.zo], ymm2); + vaddps(ymm0, ymm2); + } + } + } + else + { + if(m_sel.ztest) + { + vpbroadcastd(ymm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.edge || m_sel.tfx != TFX_NONE) + { + vbroadcastf128(ymm4, ptr[ebx + offsetof(GSVertexSW, t)]); // v.t + } + + if(m_sel.edge) + { + // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + + vpshufhw(ymm3, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); + vpshufd(ymm3, ymm3, _MM_SHUFFLE(3, 3, 3, 3)); + vpsrlw(ymm3, 9); + + vmovdqa(ptr[&m_local.temp.cov], ymm3); + } + + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector4i vti(vt); + + vcvttps2dq(ymm6, ymm4); + + // s = vti.xxxx() + m_local.d[skip].s; + // t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t; + + vpshufd(ymm2, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(ymm3, ymm6, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddd(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); + + if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + { + vpaddd(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); + } + else + { + if(m_sel.ltf) + { + vpshuflw(ymm6, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm6, ymm6, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm6, 12); + vmovdqa(ptr[&m_local.temp.vf], ymm6); + } + } + + vmovdqa(ptr[&m_local.temp.s], ymm2); + vmovdqa(ptr[&m_local.temp.t], ymm3); + } + else + { + // s = vt.xxxx() + m_local.d[skip].s; + // t = vt.yyyy() + m_local.d[skip].t; + // q = vt.zzzz() + m_local.d[skip].q; + + vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(ymm2, ptr[edx + offsetof(GSScanlineLocalData::skip, s)]); + vaddps(ymm3, ptr[edx + offsetof(GSScanlineLocalData::skip, t)]); + vaddps(ymm4, ptr[edx + offsetof(GSScanlineLocalData::skip, q)]); + + vmovaps(ptr[&m_local.temp.s], ymm2); + vmovaps(ptr[&m_local.temp.t], ymm3); + vmovaps(ptr[&m_local.temp.q], ymm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector4i vc = GSVector4i(v.c); + + vbroadcastf128(ymm6, ptr[ebx + offsetof(GSVertexSW, c)]); // v.c + vcvttps2dq(ymm6, ymm6); + + // vc = vc.upl16(vc.zwxy()); + + vpshufd(ymm5, ymm6, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(ymm6, ymm5); + + // rb = vc.xxxx().add16(m_local.d[skip].rb); + // ga = vc.zzzz().add16(m_local.d[skip].ga); + + vpshufd(ymm5, ymm6, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(ymm6, ymm6, _MM_SHUFFLE(2, 2, 2, 2)); + + vpaddw(ymm5, ptr[edx + offsetof(GSScanlineLocalData::skip, rb)]); + vpaddw(ymm6, ptr[edx + offsetof(GSScanlineLocalData::skip, ga)]); + + vmovdqa(ptr[&m_local.temp.rb], ymm5); + vmovdqa(ptr[&m_local.temp.ga], ymm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + vmovdqa(ymm5, ptr[&m_local.c.rb]); + vmovdqa(ymm6, ptr[&m_local.c.ga]); + } + } + } + } +} + +void GSDrawScanlineCodeGenerator::Step() +{ + // steps -= 8; + + sub(ecx, 8); + + // fza_offset += 2; + + add(edi, 16); + + if(m_sel.prim != GS_SPRITE_CLASS) + { + // z += m_local.d8.z; + + if(m_sel.zb) + { + vmovaps(ymm0, ptr[&m_local.temp.zo]); + vaddps(ymm0, ptr[&m_local.d8.z]); + vmovaps(ptr[&m_local.temp.zo], ymm0); + vaddps(ymm0, ptr[&m_local.temp.z]); + } + + // f = f.add16(m_local.d4.f); + + if(m_sel.fwrite && m_sel.fge) + { + vmovdqa(ymm1, ptr[&m_local.temp.f]); + vpaddw(ymm1, ptr[&m_local.d8.f]); + vmovdqa(ptr[&m_local.temp.f], ymm1); + } + } + else + { + if(m_sel.ztest) + { + vpbroadcastd(ymm0, ptr[&m_local.p.z]); + } + } + + if(m_sel.fb) + { + if(m_sel.tfx != TFX_NONE) + { + if(m_sel.fst) + { + // GSVector8i stq = m_local.d8.stq; + + // s += stq.xxxx(); + // if(!sprite) t += stq.yyyy(); + + vmovdqa(ymm4, ptr[&m_local.d8.stq]); + + vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); + vpaddd(ymm2, ptr[&m_local.temp.s]); + vmovdqa(ptr[&m_local.temp.s], ymm2); + + if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) + { + vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); + vpaddd(ymm3, ptr[&m_local.temp.t]); + vmovdqa(ptr[&m_local.temp.t], ymm3); + } + else + { + vmovdqa(ymm3, ptr[&m_local.temp.t]); + } + } + else + { + // GSVector8 stq = m_local.d8.stq; + + // s += stq.xxxx(); + // t += stq.yyyy(); + // q += stq.zzzz(); + + vmovaps(ymm4, ptr[&m_local.d8.stq]); + + vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(ymm4, ymm4, ymm4, _MM_SHUFFLE(2, 2, 2, 2)); + + vaddps(ymm2, ptr[&m_local.temp.s]); + vaddps(ymm3, ptr[&m_local.temp.t]); + vaddps(ymm4, ptr[&m_local.temp.q]); + + vmovaps(ptr[&m_local.temp.s], ymm2); + vmovaps(ptr[&m_local.temp.t], ymm3); + vmovaps(ptr[&m_local.temp.q], ymm4); + } + } + + if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) + { + if(m_sel.iip) + { + // GSVector8i c = m_local.d8.c; + + // rb = rb.add16(c.xxxx()); + // ga = ga.add16(c.yyyy()); + + vmovdqa(ymm7, ptr[&m_local.d8.c]); + + vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1)); + + vpaddw(ymm5, ptr[&m_local.temp.rb]); + vpaddw(ymm6, ptr[&m_local.temp.ga]); + + // FIXME: color may underflow and roll over at the end of the line, if decreasing + + vpxor(ymm7, ymm7); + vpmaxsw(ymm5, ymm7); + vpmaxsw(ymm6, ymm7); + + vmovdqa(ptr[&m_local.temp.rb], ymm5); + vmovdqa(ptr[&m_local.temp.ga], ymm6); + } + else + { + if(m_sel.tfx == TFX_NONE) + { + vmovdqa(ymm5, ptr[&m_local.c.rb]); + vmovdqa(ymm6, ptr[&m_local.c.ga]); + } + } + } + } + + if(!m_sel.notest) + { + // test = m_test[15 + (steps & (steps >> 31))]; + + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 5); + + vmovdqa(ymm7, ptr[edx + (size_t)&m_test[15]]); + } +} + +void GSDrawScanlineCodeGenerator::TestZ(const Ymm& temp1, const Ymm& temp2) +{ + if(!m_sel.zb) + { + return; + } + + // int za = fza_base.y + fza_offset->y; + + mov(ebp, ptr[esi + 4]); + add(ebp, ptr[edi + 4]); + + // GSVector8i zs = zi; + + if(m_sel.prim != GS_SPRITE_CLASS) + { + if(m_sel.zoverflow) + { + // zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); + + vbroadcastss(temp1, ptr[&GSVector8::m_half]); + vmulps(temp1, ymm0); + vcvttps2dq(temp1, temp1); + vpslld(temp1, 1); + + vcvttps2dq(ymm0, ymm0); + vpcmpeqd(temp2, temp2); + vpsrld(temp2, 31); + vpand(ymm0, temp2); + + vpor(ymm0, temp1); + } + else + { + // zs = GSVector8i(z); + + vcvttps2dq(ymm0, ymm0); + } + + if(m_sel.zwrite) + { + vmovdqa(ptr[&m_local.temp.zs], ymm0); + } + } + + if(m_sel.ztest) + { + ReadPixel(ymm1, temp1, ebp); + + if(m_sel.zwrite && m_sel.zpsm < 2) + { + vmovdqa(ptr[&m_local.temp.zd], ymm1); + } + + // zd &= 0xffffffff >> m_sel.zpsm * 8; + + if(m_sel.zpsm) + { + vpslld(ymm1, (uint8)(m_sel.zpsm * 8)); + vpsrld(ymm1, (uint8)(m_sel.zpsm * 8)); + } + + if(m_sel.zoverflow || m_sel.zpsm == 0) + { + // GSVector8i o = GSVector8i::x80000000(); + + vpcmpeqd(temp1, temp1); + vpslld(temp1, 31); + + // GSVector8i zso = zs - o; + // GSVector8i zdo = zd - o; + + vpsubd(ymm0, temp1); + vpsubd(ymm1, temp1); + } + + switch(m_sel.ztst) + { + case ZTST_GEQUAL: + // test |= zso < zdo; // ~(zso >= zdo) + vpcmpgtd(ymm1, ymm0); + vpor(ymm7, ymm1); + break; + + case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL + // test |= zso <= zdo; // ~(zso > zdo) + vpcmpgtd(ymm0, ymm1); + vpcmpeqd(temp1, temp1); + vpxor(ymm0, temp1); + vpor(ymm7, ymm0); + break; + } + + alltrue(); + } +} + +void GSDrawScanlineCodeGenerator::SampleTexture() +{ + if(!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + mov(ebx, ptr[&m_local.gd->tex[0]]); + + if(m_sel.tlu) + { + mov(edx, ptr[&m_local.gd->clut]); + } + + // ebx = tex + // edx = clut + + if(!m_sel.fst) + { + vrcpps(ymm0, ymm4); + + vmulps(ymm2, ymm0); + vmulps(ymm3, ymm0); + + vcvttps2dq(ymm2, ymm2); + vcvttps2dq(ymm3, ymm3); + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + vmovd(xmm4, eax); + vpbroadcastd(ymm4, xmm4); + + vpsubd(ymm2, ymm4); + vpsubd(ymm3, ymm4); + } + } + + // ymm2 = u + // ymm3 = v + + if(m_sel.ltf) + { + // GSVector8i uf = u.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.uf], ymm0); + + if(m_sel.prim != GS_SPRITE_CLASS) + { + // GSVector8i vf = v.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.vf], ymm0); + } + } + + // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + + vpsrad(ymm2, 16); + vpsrad(ymm3, 16); + vpackssdw(ymm2, ymm3); + + if(m_sel.ltf) + { + // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); + + vpcmpeqd(ymm1, ymm1); + vpsrlw(ymm1, 15); + vpaddw(ymm3, ymm2, ymm1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + Wrap(ymm2, ymm3); + } + else + { + // uv0 = Wrap(uv0); + + Wrap(ymm2); + } + + // ymm2 = uv0 + // ymm3 = uv1 (ltf) + // ymm0, ymm1, ymm4, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i y0 = uv0.uph16() << tw; + // GSVector8i x0 = uv0.upl16(); + + vpxor(ymm0, ymm0); + + vpunpcklwd(ymm4, ymm2, ymm0); + vpunpckhwd(ymm2, ymm2, ymm0); + vpslld(ymm2, (uint8)(m_sel.tw + 3)); + + // ymm0 = 0 + // ymm2 = y0 + // ymm3 = uv1 (ltf) + // ymm4 = x0 + // ymm1, ymm5, ymm6 = free + // ymm7 = used + + if(m_sel.ltf) + { + // GSVector8i y1 = uv1.uph16() << tw; + // GSVector8i x1 = uv1.upl16(); + + vpunpcklwd(ymm6, ymm3, ymm0); + vpunpckhwd(ymm3, ymm3, ymm0); + vpslld(ymm3, (uint8)(m_sel.tw + 3)); + + // ymm2 = y0 + // ymm3 = y1 + // ymm4 = x0 + // ymm6 = x1 + // ymm0, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i addr00 = y0 + x0; + // GSVector8i addr01 = y0 + x1; + // GSVector8i addr10 = y1 + x0; + // GSVector8i addr11 = y1 + x1; + + vpaddd(ymm5, ymm2, ymm4); + vpaddd(ymm2, ymm2, ymm6); + vpaddd(ymm0, ymm3, ymm4); + vpaddd(ymm3, ymm3, ymm6); + + // ymm5 = addr00 + // ymm2 = addr01 + // ymm0 = addr10 + // ymm3 = addr11 + // ymm1, ymm4, ymm6 = free + // ymm7 = used + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(4, 0); + + // ymm6 = c00 + // ymm4 = c01 + // ymm1 = c10 + // ymm5 = c11 + // ymm0, ymm2, ymm3 = free + // ymm7 = used + + vmovdqa(ymm0, ptr[&m_local.temp.uf]); + + // GSVector8i rb00 = c00 & mask; + // GSVector8i ga00 = (c00 >> 8) & mask; + + vpsllw(ymm2, ymm6, 8); + vpsrlw(ymm2, 8); + vpsrlw(ymm6, 8); + + // GSVector8i rb01 = c01 & mask; + // GSVector8i ga01 = (c01 >> 8) & mask; + + vpsllw(ymm3, ymm4, 8); + vpsrlw(ymm3, 8); + vpsrlw(ymm4, 8); + + // ymm0 = uf + // ymm2 = rb00 + // ymm3 = rb01 + // ymm6 = ga00 + // ymm4 = ga01 + // ymm1 = c10 + // ymm5 = c11 + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); + + lerp16_4(ymm3, ymm2, ymm0); + lerp16_4(ymm4, ymm6, ymm0); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = c10 + // ymm5 = c11 + // ymm2, ymm6 = free + // ymm7 = used + + // GSVector8i rb10 = c10 & mask; + // GSVector8i ga10 = (c10 >> 8) & mask; + + vpsrlw(ymm2, ymm1, 8); + vpsllw(ymm1, 8); + vpsrlw(ymm1, 8); + + // GSVector8i rb11 = c11 & mask; + // GSVector8i ga11 = (c11 >> 8) & mask; + + vpsrlw(ymm6, ymm5, 8); + vpsllw(ymm5, 8); + vpsrlw(ymm5, 8); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = rb10 + // ymm5 = rb11 + // ymm2 = ga10 + // ymm6 = ga11 + // ymm7 = used + + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); + + lerp16_4(ymm5, ymm1, ymm0); + lerp16_4(ymm6, ymm2, ymm0); + + // ymm3 = rb00 + // ymm4 = ga00 + // ymm5 = rb10 + // ymm6 = ga10 + // ymm0, ymm1, ymm2 = free + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); + + vmovdqa(ymm0, ptr[&m_local.temp.vf]); + + lerp16_4(ymm5, ymm3, ymm0); + lerp16_4(ymm6, ymm4, ymm0); + } + else + { + // GSVector8i addr00 = y0 + x0; + + vpaddd(ymm5, ymm2, ymm4); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(1, 0); + + // GSVector8i mask = GSVector8i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + vpsllw(ymm5, ymm6, 8); + vpsrlw(ymm5, 8); + vpsrlw(ymm6, 8); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv) +{ + // ymm0, ymm1, ymm4, ymm5, ymm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); + vpmaxsw(uv, ymm0); + } + else + { + vpxor(ymm0, ymm0); + vpmaxsw(uv, ymm0); + } + + vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); + vpminsw(uv, ymm0); + } + else + { + vbroadcasti128(ymm0, ptr[&m_local.gd->t.min]); + vpand(uv, ymm0); + + if(region) + { + vbroadcasti128(ymm0, ptr[&m_local.gd->t.max]); + vpor(uv, ymm0); + } + } + } + else + { + vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); + vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); + vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv, ymm4); + + if(region) + { + vpor(ymm1, ymm5); + } + + // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv, ymm4); + vpminsw(uv, ymm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv, ymm1, ymm0); + } +} + +void GSDrawScanlineCodeGenerator::Wrap(const Ymm& uv0, const Ymm& uv1) +{ + // ymm0, ymm1, ymm4, ymm5, ymm6 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); + vpmaxsw(uv0, ymm4); + vpmaxsw(uv1, ymm4); + } + else + { + vpxor(ymm0, ymm0); + vpmaxsw(uv0, ymm0); + vpmaxsw(uv1, ymm0); + } + + vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); + vpminsw(uv0, ymm5); + vpminsw(uv1, ymm5); + } + else + { + vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); + vpand(uv0, ymm4); + vpand(uv1, ymm4); + + if(region) + { + vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); + vpor(uv0, ymm5); + vpor(uv1, ymm5); + } + } + } + else + { + vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); + vbroadcasti128(ymm5, ptr[&m_local.gd->t.max]); + vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); + + // uv0 + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv0, ymm4); + + if(region) + { + vpor(ymm1, ymm5); + } + + // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv0, ymm4); + vpminsw(uv0, ymm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv0, ymm1, ymm0); + + // uv1 + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv1, ymm4); + + if(region) + { + vpor(ymm1, ymm5); + } + + // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv1, ymm4); + vpminsw(uv1, ymm5); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv1, ymm1, ymm0); + } +} + +void GSDrawScanlineCodeGenerator::SampleTextureLOD() +{ + if(!m_sel.fb || m_sel.tfx == TFX_NONE) + { + return; + } + + push(ebp); + + mov(ebp, (size_t)m_local.gd->tex); + + if(m_sel.tlu) + { + mov(edx, ptr[&m_local.gd->clut]); + } + + if(!m_sel.fst) + { + vrcpps(ymm0, ymm4); + + vmulps(ymm2, ymm0); + vmulps(ymm3, ymm0); + + vcvttps2dq(ymm2, ymm2); + vcvttps2dq(ymm3, ymm3); + } + + // ymm2 = u + // ymm3 = v + // ymm4 = q + // ymm0 = ymm1 = ymm5 = ymm6 = free + + // TODO: if the fractional part is not needed in round-off mode then there is a faster integer log2 (just take the exp) (but can we round it?) + + if(!m_sel.lcm) + { + // lod = -log2(Q) * (1 << L) + K + + vpcmpeqd(ymm1, ymm1); + vpsrld(ymm1, ymm1, 25); + vpslld(ymm0, ymm4, 1); + vpsrld(ymm0, ymm0, 24); + vpsubd(ymm0, ymm1); + vcvtdq2ps(ymm0, ymm0); + + // ymm0 = (float)(exp(q) - 127) + + vpslld(ymm4, ymm4, 9); + vpsrld(ymm4, ymm4, 9); + vorps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); + + // ymm4 = mant(q) | 1.0f + + if(m_cpu.has(util::Cpu::tFMA)) + { + vmovaps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0 + vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * ymm4 + c1 + vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * ymm4 + c1) * ymm4 + c2 + vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // ymm4 - 1.0f + vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0 + } + else + { + vmulps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); + vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); + vmulps(ymm5, ymm4); + vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); + vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); + vmulps(ymm4, ymm5); + vaddps(ymm4, ymm0); + } + + // ymm4 = log2(Q) = ((((c0 * ymm4) + c1) * ymm4) + c2) * (ymm4 - 1.0f) + ymm0 + + if(m_cpu.has(util::Cpu::tFMA)) + { + vmovaps(ymm5, ptr[&m_local.gd->l]); + vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]); + } + else + { + vmulps(ymm4, ptr[&m_local.gd->l]); + vaddps(ymm4, ptr[&m_local.gd->k]); + } + + // ymm4 = (-log2(Q) * (1 << L) + K) * 0x10000 + + vxorps(ymm0, ymm0); + vminps(ymm4, ptr[&m_local.gd->mxl]); + vmaxps(ymm4, ymm0); + vcvtps2dq(ymm4, ymm4); + + if(m_sel.mmin == 1) // round-off mode + { + mov(eax, 0x8000); + vmovd(xmm0, eax); + vpbroadcastd(ymm0, xmm0); + vpaddd(ymm4, ymm0); + } + + vpsrld(ymm0, ymm4, 16); + + vmovdqa(ptr[&m_local.temp.lod.i], ymm0); +/* +vpslld(ymm5, ymm0, 6); +vpslld(ymm6, ymm4, 16); +vpsrld(ymm6, ymm6, 24); +return; +*/ + if(m_sel.mmin == 2) // trilinear mode + { + vpshuflw(ymm1, ymm4, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm1, ymm1, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.temp.lod.f], ymm1); + } + + // shift u/v/minmax by (int)lod + + vpsravd(ymm2, ymm2, ymm0); + vpsravd(ymm3, ymm3, ymm0); + + vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); + vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); + + // m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1] + + vpxor(ymm1, ymm1); + + vbroadcasti128(ymm4, ptr[&m_local.gd->t.min]); + vpunpcklwd(ymm5, ymm4, ymm1); // minu + vpunpckhwd(ymm6, ymm4, ymm1); // minv + vpsrlvd(ymm5, ymm5, ymm0); + vpsrlvd(ymm6, ymm6, ymm0); + vpackusdw(ymm5, ymm6); + + vbroadcasti128(ymm4, ptr[&m_local.gd->t.max]); + vpunpcklwd(ymm6, ymm4, ymm1); // maxu + vpunpckhwd(ymm4, ymm4, ymm1); // maxv + vpsrlvd(ymm6, ymm6, ymm0); + vpsrlvd(ymm4, ymm4, ymm0); + vpackusdw(ymm6, ymm4); + + vmovdqa(ptr[&m_local.temp.uv_minmax[0]], ymm5); + vmovdqa(ptr[&m_local.temp.uv_minmax[1]], ymm6); + } + else + { + // lod = K + + vmovd(xmm0, ptr[&m_local.gd->lod.i.u32[0]]); + + vpsrad(ymm2, xmm0); + vpsrad(ymm3, xmm0); + + vmovdqa(ptr[&m_local.temp.uv[0]], ymm2); + vmovdqa(ptr[&m_local.temp.uv[1]], ymm3); + + vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); + vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); + } + + // ymm2 = m_local.temp.uv[0] = u (level m) + // ymm3 = m_local.temp.uv[1] = v (level m) + // ymm5 = minuv + // ymm6 = maxuv + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + vmovd(xmm4, eax); + vpbroadcastd(ymm4, xmm4); + + vpsubd(ymm2, ymm4); + vpsubd(ymm3, ymm4); + + // GSVector8i uf = u.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.uf], ymm0); + + // GSVector8i vf = v.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.vf], ymm0); + } + + // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + + vpsrad(ymm2, 16); + vpsrad(ymm3, 16); + vpackssdw(ymm2, ymm3); + + if(m_sel.ltf) + { + // GSVector8i uv1 = uv0.add16(GSVector8i::x0001()); + + vpcmpeqd(ymm1, ymm1); + vpsrlw(ymm1, 15); + vpaddw(ymm3, ymm2, ymm1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + WrapLOD(ymm2, ymm3); + } + else + { + // uv0 = Wrap(uv0); + + WrapLOD(ymm2); + } + + // ymm2 = uv0 + // ymm3 = uv1 (ltf) + // ymm0, ymm1, ymm4, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i x0 = uv0.upl16(); + // GSVector8i y0 = uv0.uph16() << tw; + + vpxor(ymm0, ymm0); + + vpunpcklwd(ymm4, ymm2, ymm0); + vpunpckhwd(ymm2, ymm2, ymm0); + vpslld(ymm2, (uint8)(m_sel.tw + 3)); + + // ymm0 = 0 + // ymm2 = y0 + // ymm3 = uv1 (ltf) + // ymm4 = x0 + // ymm1, ymm5, ymm6 = free + // ymm7 = used + + if(m_sel.ltf) + { + // GSVector8i x1 = uv1.upl16(); + // GSVector8i y1 = uv1.uph16() << tw; + + vpunpcklwd(ymm6, ymm3, ymm0); + vpunpckhwd(ymm3, ymm3, ymm0); + vpslld(ymm3, (uint8)(m_sel.tw + 3)); + + // ymm2 = y0 + // ymm3 = y1 + // ymm4 = x0 + // ymm6 = x1 + // ymm0, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i addr00 = y0 + x0; + // GSVector8i addr01 = y0 + x1; + // GSVector8i addr10 = y1 + x0; + // GSVector8i addr11 = y1 + x1; + + vpaddd(ymm5, ymm2, ymm4); + vpaddd(ymm2, ymm2, ymm6); + vpaddd(ymm0, ymm3, ymm4); + vpaddd(ymm3, ymm3, ymm6); + + // ymm5 = addr00 + // ymm2 = addr01 + // ymm0 = addr10 + // ymm3 = addr11 + // ymm1, ymm4, ymm6 = free + // ymm7 = used + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(4, 0); + + // ymm6 = c00 + // ymm4 = c01 + // ymm1 = c10 + // ymm5 = c11 + // ymm0, ymm2, ymm3 = free + // ymm7 = used + + vmovdqa(ymm0, ptr[&m_local.temp.uf]); + + // GSVector8i rb00 = c00 & mask; + // GSVector8i ga00 = (c00 >> 8) & mask; + + vpsllw(ymm2, ymm6, 8); + vpsrlw(ymm2, 8); + vpsrlw(ymm6, 8); + + // GSVector8i rb01 = c01 & mask; + // GSVector8i ga01 = (c01 >> 8) & mask; + + vpsllw(ymm3, ymm4, 8); + vpsrlw(ymm3, 8); + vpsrlw(ymm4, 8); + + // ymm0 = uf + // ymm2 = rb00 + // ymm3 = rb01 + // ymm6 = ga00 + // ymm4 = ga01 + // ymm1 = c10 + // ymm5 = c11 + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); + + lerp16_4(ymm3, ymm2, ymm0); + lerp16_4(ymm4, ymm6, ymm0); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = c10 + // ymm5 = c11 + // ymm2, ymm6 = free + // ymm7 = used + + // GSVector8i rb10 = c10 & mask; + // GSVector8i ga10 = (c10 >> 8) & mask; + + vpsrlw(ymm2, ymm1, 8); + vpsllw(ymm1, 8); + vpsrlw(ymm1, 8); + + // GSVector8i rb11 = c11 & mask; + // GSVector8i ga11 = (c11 >> 8) & mask; + + vpsrlw(ymm6, ymm5, 8); + vpsllw(ymm5, 8); + vpsrlw(ymm5, 8); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = rb10 + // ymm5 = rb11 + // ymm2 = ga10 + // ymm6 = ga11 + // ymm7 = used + + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); + + lerp16_4(ymm5, ymm1, ymm0); + lerp16_4(ymm6, ymm2, ymm0); + + // ymm3 = rb00 + // ymm4 = ga00 + // ymm5 = rb10 + // ymm6 = ga10 + // ymm0, ymm1, ymm2 = free + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); + + vmovdqa(ymm0, ptr[&m_local.temp.vf]); + + lerp16_4(ymm5, ymm3, ymm0); + lerp16_4(ymm6, ymm4, ymm0); + } + else + { + // GSVector8i addr00 = y0 + x0; + + vpaddd(ymm5, ymm2, ymm4); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(1, 0); + + // GSVector8i mask = GSVector8i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + vpsllw(ymm5, ymm6, 8); + vpsrlw(ymm5, 8); + vpsrlw(ymm6, 8); + } + + if(m_sel.mmin != 1) // !round-off mode + { + vmovdqa(ptr[&m_local.temp.trb], ymm5); + vmovdqa(ptr[&m_local.temp.tga], ymm6); + + vmovdqa(ymm2, ptr[&m_local.temp.uv[0]]); + vmovdqa(ymm3, ptr[&m_local.temp.uv[1]]); + + vpsrad(ymm2, 1); + vpsrad(ymm3, 1); + + vmovdqa(ymm5, ptr[&m_local.temp.uv_minmax[0]]); + vmovdqa(ymm6, ptr[&m_local.temp.uv_minmax[1]]); + + vpsrlw(ymm5, 1); + vpsrlw(ymm6, 1); + + if(m_sel.ltf) + { + // u -= 0x8000; + // v -= 0x8000; + + mov(eax, 0x8000); + vmovd(xmm4, eax); + vpbroadcastd(ymm4, xmm4); + + vpsubd(ymm2, ymm4); + vpsubd(ymm3, ymm4); + + // GSVector8i uf = u.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm2, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.uf], ymm0); + + // GSVector8i vf = v.xxzzlh().srl16(1); + + vpshuflw(ymm0, ymm3, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpsrlw(ymm0, 12); + vmovdqa(ptr[&m_local.temp.vf], ymm0); + } + + // GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); + + vpsrad(ymm2, 16); + vpsrad(ymm3, 16); + vpackssdw(ymm2, ymm3); + + if(m_sel.ltf) + { + // GSVector8i uv1 = uv0.add16(GSVector4i::x0001()); + + vpcmpeqd(ymm1, ymm1); + vpsrlw(ymm1, 15); + vpaddw(ymm3, ymm2, ymm1); + + // uv0 = Wrap(uv0); + // uv1 = Wrap(uv1); + + WrapLOD(ymm2, ymm3); + } + else + { + // uv0 = Wrap(uv0); + + WrapLOD(ymm2); + } + + // ymm2 = uv0 + // ymm3 = uv1 (ltf) + // ymm0, ymm1, ymm4, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i x0 = uv0.upl16(); + // GSVector8i y0 = uv0.uph16() << tw; + + vpxor(ymm0, ymm0); + + vpunpcklwd(ymm4, ymm2, ymm0); + vpunpckhwd(ymm2, ymm2, ymm0); + vpslld(ymm2, (uint8)(m_sel.tw + 3)); + + // ymm0 = 0 + // ymm2 = y0 + // ymm3 = uv1 (ltf) + // ymm4 = x0 + // ymm1, ymm5, ymm6 = free + // ymm7 = used + + if(m_sel.ltf) + { + // GSVector8i x1 = uv1.upl16(); + // GSVector8i y1 = uv1.uph16() << tw; + + vpunpcklwd(ymm6, ymm3, ymm0); + vpunpckhwd(ymm3, ymm3, ymm0); + vpslld(ymm3, (uint8)(m_sel.tw + 3)); + + // ymm2 = y0 + // ymm3 = y1 + // ymm4 = x0 + // ymm6 = x1 + // ymm0, ymm5, ymm6 = free + // ymm7 = used + + // GSVector8i addr00 = y0 + x0; + // GSVector8i addr01 = y0 + x1; + // GSVector8i addr10 = y1 + x0; + // GSVector8i addr11 = y1 + x1; + + vpaddd(ymm5, ymm2, ymm4); + vpaddd(ymm2, ymm2, ymm6); + vpaddd(ymm0, ymm3, ymm4); + vpaddd(ymm3, ymm3, ymm6); + + // ymm5 = addr00 + // ymm2 = addr01 + // ymm0 = addr10 + // ymm3 = addr11 + // ymm1, ymm4, ymm6 = free + // ymm7 = used + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); + // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); + // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(4, 1); + + // ymm6 = c00 + // ymm4 = c01 + // ymm1 = c10 + // ymm5 = c11 + // ymm0, ymm2, ymm3 = free + // ymm7 = used + + vmovdqa(ymm0, ptr[&m_local.temp.uf]); + + // GSVector8i rb00 = c00 & mask; + // GSVector8i ga00 = (c00 >> 8) & mask; + + vpsllw(ymm2, ymm6, 8); + vpsrlw(ymm2, 8); + vpsrlw(ymm6, 8); + + // GSVector8i rb01 = c01 & mask; + // GSVector8i ga01 = (c01 >> 8) & mask; + + vpsllw(ymm3, ymm4, 8); + vpsrlw(ymm3, 8); + vpsrlw(ymm4, 8); + + // ymm0 = uf + // ymm2 = rb00 + // ymm3 = rb01 + // ymm6 = ga00 + // ymm4 = ga01 + // ymm1 = c10 + // ymm5 = c11 + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); + + lerp16_4(ymm3, ymm2, ymm0); + lerp16_4(ymm4, ymm6, ymm0); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = c10 + // ymm5 = c11 + // ymm2, ymm6 = free + // ymm7 = used + + // GSVector8i rb10 = c10 & mask; + // GSVector8i ga10 = (c10 >> 8) & mask; + + vpsrlw(ymm2, ymm1, 8); + vpsllw(ymm1, 8); + vpsrlw(ymm1, 8); + + // GSVector8i rb11 = c11 & mask; + // GSVector8i ga11 = (c11 >> 8) & mask; + + vpsrlw(ymm6, ymm5, 8); + vpsllw(ymm5, 8); + vpsrlw(ymm5, 8); + + // ymm0 = uf + // ymm3 = rb00 + // ymm4 = ga00 + // ymm1 = rb10 + // ymm5 = rb11 + // ymm2 = ga10 + // ymm6 = ga11 + // ymm7 = used + + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); + + lerp16_4(ymm5, ymm1, ymm0); + lerp16_4(ymm6, ymm2, ymm0); + + // ymm3 = rb00 + // ymm4 = ga00 + // ymm5 = rb10 + // ymm6 = ga10 + // ymm0, ymm1, ymm2 = free + // ymm7 = used + + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); + + vmovdqa(ymm0, ptr[&m_local.temp.vf]); + + lerp16_4(ymm5, ymm3, ymm0); + lerp16_4(ymm6, ymm4, ymm0); + } + else + { + // GSVector8i addr00 = y0 + x0; + + vpaddd(ymm5, ymm2, ymm4); + + // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); + + ReadTexel(1, 1); + + // GSVector8i mask = GSVector8i::x00ff(); + + // c[0] = c00 & mask; + // c[1] = (c00 >> 8) & mask; + + vpsllw(ymm5, ymm6, 8); + vpsrlw(ymm5, 8); + vpsrlw(ymm6, 8); + } + + vmovdqa(ymm0, ptr[m_sel.lcm ? &m_local.gd->lod.f : &m_local.temp.lod.f]); + vpsrlw(ymm0, ymm0, 1); + + vmovdqa(ymm2, ptr[&m_local.temp.trb]); + vmovdqa(ymm3, ptr[&m_local.temp.tga]); + + lerp16(ymm5, ymm2, ymm0, 0); + lerp16(ymm6, ymm3, ymm0, 0); + } + + pop(ebp); +} + +void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv) +{ + // ymm5 = minuv + // ymm6 = maxuv + // ymm0, ymm1, ymm4 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vpmaxsw(uv, ymm5); + } + else + { + vpxor(ymm0, ymm0); + vpmaxsw(uv, ymm0); + } + + vpminsw(uv, ymm6); + } + else + { + vpand(uv, ymm5); + + if(region) + { + vpor(uv, ymm6); + } + } + } + else + { + vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv, ymm5); + + if(region) + { + vpor(ymm1, ymm6); + } + + // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv, ymm5); + vpminsw(uv, ymm6); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv, ymm1, ymm0); + } +} + +void GSDrawScanlineCodeGenerator::WrapLOD(const Ymm& uv0, const Ymm& uv1) +{ + // ymm5 = minuv + // ymm6 = maxuv + // ymm0, ymm1, ymm4 = free + + int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; + int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; + + int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; + + if(wms_clamp == wmt_clamp) + { + if(wms_clamp) + { + if(region) + { + vpmaxsw(uv0, ymm5); + vpmaxsw(uv1, ymm5); + } + else + { + vpxor(ymm0, ymm0); + vpmaxsw(uv0, ymm0); + vpmaxsw(uv1, ymm0); + } + + vpminsw(uv0, ymm6); + vpminsw(uv1, ymm6); + } + else + { + vpand(uv0, ymm5); + vpand(uv1, ymm5); + + if(region) + { + vpor(uv0, ymm6); + vpor(uv1, ymm6); + } + } + } + else + { + vbroadcasti128(ymm0, ptr[&m_local.gd->t.mask]); + + // uv0 + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv0, ymm5); + + if(region) + { + vpor(ymm1, ymm6); + } + + // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv0, ymm5); + vpminsw(uv0, ymm6); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv0, ymm1, ymm0); + + // uv1 + + // GSVector8i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; + + vpand(ymm1, uv1, ymm5); + + if(region) + { + vpor(ymm1, ymm6); + } + + // GSVector8i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); + + vpmaxsw(uv1, ymm5); + vpminsw(uv1, ymm6); + + // clamp.blend8(repeat, m_local.gd->t.mask); + + vpblendvb(uv1, ymm1, ymm0); + } +} + +void GSDrawScanlineCodeGenerator::AlphaTFX() +{ + if(!m_sel.fb) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector8i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + // gat = gat.modulate16<1>(ga).clamp8(); + + modulate16(ymm6, ymm4, 1); + + clamp16(ymm6, ymm3); + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + vpsrlw(ymm4, 7); + + mix16(ymm6, ymm4, ymm3); + } + + break; + + case TFX_DECAL: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + + vpsrlw(ymm4, 7); + + mix16(ymm6, ymm4, ymm3); + } + + break; + + case TFX_HIGHLIGHT: + + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + vmovdqa(ymm2, ymm4); + + // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); + + vpsrlw(ymm4, 7); + + if(m_sel.tcc) + { + vpaddusb(ymm4, ymm6); + } + + mix16(ymm6, ymm4, ymm3); + + break; + + case TFX_HIGHLIGHT2: + + // if(!tcc) gat = gat.mix16(ga.srl16(7)); + + if(!m_sel.tcc) + { + // GSVector4i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(ymm4, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + vmovdqa(ymm2, ymm4); + + vpsrlw(ymm4, 7); + + mix16(ymm6, ymm4, ymm3); + } + + break; + + case TFX_NONE: + + // gat = iip ? ga.srl16(7) : ga; + + if(m_sel.iip) + { + vpsrlw(ymm6, 7); + } + + break; + } + + if(m_sel.aa1) + { + // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha + + // FIXME: bios config screen cubes + + if(!m_sel.abe) + { + // a = cov + + if(m_sel.edge) + { + vmovdqa(ymm0, ptr[&m_local.temp.cov]); + } + else + { + vpcmpeqd(ymm0, ymm0); + vpsllw(ymm0, 15); + vpsrlw(ymm0, 8); + } + + mix16(ymm6, ymm0, ymm1); + } + else + { + // a = a == 0x80 ? cov : a + + vpcmpeqd(ymm0, ymm0); + vpsllw(ymm0, 15); + vpsrlw(ymm0, 8); + + if(m_sel.edge) + { + vmovdqa(ymm1, ptr[&m_local.temp.cov]); + } + else + { + vmovdqa(ymm1, ymm0); + } + + vpcmpeqw(ymm0, ymm6); + vpsrld(ymm0, 16); + vpslld(ymm0, 16); + + vpblendvb(ymm6, ymm1, ymm0); + } + } +} + +void GSDrawScanlineCodeGenerator::ReadMask() +{ + if(m_sel.fwrite) + { + vpbroadcastd(ymm3, ptr[&m_local.gd->fm]); + } + + if(m_sel.zwrite) + { + vpbroadcastd(ymm4, ptr[&m_local.gd->zm]); + } +} + +void GSDrawScanlineCodeGenerator::TestAlpha() +{ + switch(m_sel.afail) + { + case AFAIL_FB_ONLY: + if(!m_sel.zwrite) return; + break; + + case AFAIL_ZB_ONLY: + if(!m_sel.fwrite) return; + break; + + case AFAIL_RGB_ONLY: + if(!m_sel.zwrite && m_sel.fpsm == 1) return; + break; + } + + switch(m_sel.atst) + { + case ATST_NEVER: + // t = GSVector8i::xffffffff(); + vpcmpeqd(ymm1, ymm1); + break; + + case ATST_ALWAYS: + return; + + case ATST_LESS: + case ATST_LEQUAL: + // t = (ga >> 16) > m_local.gd->aref; + vpsrld(ymm1, ymm6, 16); + vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); + vpcmpgtd(ymm1, ymm0); + break; + + case ATST_EQUAL: + // t = (ga >> 16) != m_local.gd->aref; + vpsrld(ymm1, ymm6, 16); + vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); + vpcmpeqd(ymm1, ymm0); + vpcmpeqd(ymm0, ymm0); + vpxor(ymm1, ymm0); + break; + + case ATST_GEQUAL: + case ATST_GREATER: + // t = (ga >> 16) < m_local.gd->aref; + vpsrld(ymm0, ymm6, 16); + vbroadcasti128(ymm1, ptr[&m_local.gd->aref]); + vpcmpgtd(ymm1, ymm0); + break; + + case ATST_NOTEQUAL: + // t = (ga >> 16) == m_local.gd->aref; + vpsrld(ymm1, ymm6, 16); + vbroadcasti128(ymm0, ptr[&m_local.gd->aref]); + vpcmpeqd(ymm1, ymm0); + break; + } + + switch(m_sel.afail) + { + case AFAIL_KEEP: + // test |= t; + vpor(ymm7, ymm1); + alltrue(); + break; + + case AFAIL_FB_ONLY: + // zm |= t; + vpor(ymm4, ymm1); + break; + + case AFAIL_ZB_ONLY: + // fm |= t; + vpor(ymm3, ymm1); + break; + + case AFAIL_RGB_ONLY: + // zm |= t; + vpor(ymm4, ymm1); + // fm |= t & GSVector8i::xff000000(); + vpsrld(ymm1, 24); + vpslld(ymm1, 24); + vpor(ymm3, ymm1); + break; + } +} + +void GSDrawScanlineCodeGenerator::ColorTFX() +{ + if(!m_sel.fwrite) + { + return; + } + + switch(m_sel.tfx) + { + case TFX_MODULATE: + + // GSVector8i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).clamp8(); + + modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + clamp16(ymm5, ymm1); + + break; + + case TFX_DECAL: + + break; + + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + + if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) + { + // GSVector8i ga = iip ? gaf : m_local.c.ga; + + vmovdqa(ymm2, ptr[m_sel.iip ? &m_local.temp.ga : &m_local.c.ga]); + } + + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + vmovdqa(ymm1, ymm6); + + modulate16(ymm6, ymm2, 1); + + vpshuflw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(ymm2, ymm2, _MM_SHUFFLE(3, 3, 1, 1)); + vpsrlw(ymm2, 7); + + vpaddw(ymm6, ymm2); + + clamp16(ymm6, ymm0); + + mix16(ymm6, ymm1, ymm0); + + // GSVector8i rb = iip ? rbf : m_local.c.rb; + + // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); + + modulate16(ymm5, ptr[m_sel.iip ? &m_local.temp.rb : &m_local.c.rb], 1); + + vpaddw(ymm5, ymm2); + + clamp16(ymm5, ymm0); + + break; + + case TFX_NONE: + + // rbt = iip ? rb.srl16(7) : rb; + + if(m_sel.iip) + { + vpsrlw(ymm5, 7); + } + + break; + } +} + +void GSDrawScanlineCodeGenerator::Fog() +{ + if(!m_sel.fwrite || !m_sel.fge) + { + return; + } + + // rb = m_local.gd->frb.lerp16<0>(rb, f); + // ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga); + + if(m_sel.prim != GS_SPRITE_CLASS) + { + vmovdqa(ymm0, ptr[&m_local.temp.f]); + } + else + { + vpbroadcastw(ymm0, ptr[&m_local.p.f]); + } + + vmovdqa(ymm1, ymm6); + + vpbroadcastd(ymm2, ptr[&m_local.gd->frb]); + lerp16(ymm5, ymm2, ymm0, 0); + + vpbroadcastd(ymm2, ptr[&m_local.gd->fga]); + lerp16(ymm6, ymm2, ymm0, 0); + mix16(ymm6, ymm1, ymm0); +} + +void GSDrawScanlineCodeGenerator::ReadFrame() +{ + if(!m_sel.fb) + { + return; + } + + // int fa = fza_base.x + fza_offset->x; + + mov(ebx, ptr[esi]); + add(ebx, ptr[edi]); + + if(!m_sel.rfb) + { + return; + } + + ReadPixel(ymm2, ymm0, ebx); +} + +void GSDrawScanlineCodeGenerator::TestDestAlpha() +{ + if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2) + { + return; + } + + // test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31); + + if(m_sel.datm) + { + if(m_sel.fpsm == 2) + { + vpxor(ymm0, ymm0); + vpsrld(ymm1, ymm2, 15); + vpcmpeqd(ymm1, ymm0); + } + else + { + vpcmpeqd(ymm0, ymm0); + vpxor(ymm1, ymm2, ymm0); + vpsrad(ymm1, 31); + } + } + else + { + if(m_sel.fpsm == 2) + { + vpslld(ymm1, ymm2, 16); + vpsrad(ymm1, 31); + } + else + { + vpsrad(ymm1, ymm2, 31); + } + } + + vpor(ymm7, ymm1); + + alltrue(); +} + +void GSDrawScanlineCodeGenerator::WriteMask() +{ + if(m_sel.notest) + { + return; + } + + // fm |= test; + // zm |= test; + + if(m_sel.fwrite) + { + vpor(ymm3, ymm7); + } + + if(m_sel.zwrite) + { + vpor(ymm4, ymm7); + } + + // int fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask(); + + vpcmpeqd(ymm1, ymm1); + + if(m_sel.fwrite && m_sel.zwrite) + { + vpcmpeqd(ymm0, ymm1, ymm4); + vpcmpeqd(ymm1, ymm3); + vpackssdw(ymm1, ymm0); + } + else if(m_sel.fwrite) + { + vpcmpeqd(ymm1, ymm3); + vpackssdw(ymm1, ymm1); + } + else if(m_sel.zwrite) + { + vpcmpeqd(ymm1, ymm4); + vpackssdw(ymm1, ymm1); + } + + vpmovmskb(edx, ymm1); + + not(edx); +} + +void GSDrawScanlineCodeGenerator::WriteZBuf() +{ + if(!m_sel.zwrite) + { + return; + } + + if(m_sel.prim != GS_SPRITE_CLASS) + { + vmovdqa(ymm1, ptr[&m_local.temp.zs]); + } + else + { + vpbroadcastd(ymm1, ptr[&m_local.p.z]); + } + + if(m_sel.ztest && m_sel.zpsm < 2) + { + // zs = zs.blend8(zd, zm); + + vpblendvb(ymm1, ptr[&m_local.temp.zd], ymm4); + } + + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + + WritePixel(ymm1, ymm0, ebp, edx, fast, m_sel.zpsm, 1); +} + +void GSDrawScanlineCodeGenerator::AlphaBlend() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.abe == 0 && m_sel.aa1 == 0) + { + return; + } + + if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) + { + switch(m_sel.fpsm) + { + case 0: + case 1: + + // c[2] = fd & mask; + // c[3] = (fd >> 8) & mask; + + vpsllw(ymm0, ymm2, 8); + vpsrlw(ymm0, 8); + vpsrlw(ymm1, ymm2, 8); + + break; + + case 2: + + // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + + vpcmpeqd(ymm7, ymm7); + + vpsrld(ymm7, 27); // 0x0000001f + vpand(ymm0, ymm2, ymm7); + vpslld(ymm0, 3); + + vpslld(ymm7, 10); // 0x00007c00 + vpand(ymm4, ymm2, ymm7); + vpslld(ymm4, 9); + + vpor(ymm0, ymm4); + + vpsrld(ymm7, 5); // 0x000003e0 + vpand(ymm1, ymm2, ymm7); + vpsrld(ymm1, 2); + + vpsllw(ymm7, 10); // 0x00008000 + vpand(ymm4, ymm2, ymm7); + vpslld(ymm4, 8); + + vpor(ymm1, ymm4); + + break; + } + } + + // ymm5, ymm6 = src rb, ga + // ymm0, ymm1 = dst rb, ga + // ymm2, ymm3 = used + // ymm4, ymm7 = free + + if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) + { + vmovdqa(ymm4, ymm5); + } + + if(m_sel.aba != m_sel.abb) + { + // rb = c[aba * 2 + 0]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(ymm5, ymm0); break; + case 2: vpxor(ymm5, ymm5); break; + } + + // rb = rb.sub16(c[abb * 2 + 0]); + + switch(m_sel.abb) + { + case 0: vpsubw(ymm5, ymm4); break; + case 1: vpsubw(ymm5, ymm0); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; + + switch(m_sel.abc) + { + case 0: + case 1: + vpshuflw(ymm7, m_sel.abc ? ymm1 : ymm6, _MM_SHUFFLE(3, 3, 1, 1)); + vpshufhw(ymm7, ymm7, _MM_SHUFFLE(3, 3, 1, 1)); + vpsllw(ymm7, 7); + break; + case 2: + vpbroadcastw(ymm7, ptr[&m_local.gd->afix]); + break; + } + + // rb = rb.modulate16<1>(a); + + modulate16(ymm5, ymm7, 1); + } + + // rb = rb.add16(c[abd * 2 + 0]); + + switch(m_sel.abd) + { + case 0: vpaddw(ymm5, ymm4); break; + case 1: vpaddw(ymm5, ymm0); break; + case 2: break; + } + } + else + { + // rb = c[abd * 2 + 0]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(ymm5, ymm0); break; + case 2: vpxor(ymm5, ymm5); break; + } + } + + if(m_sel.pabe) + { + // mask = (c[1] << 8).sra32(31); + + vpslld(ymm0, ymm6, 8); + vpsrad(ymm0, 31); + + // rb = c[0].blend8(rb, mask); + + vpblendvb(ymm5, ymm4, ymm5, ymm0); + } + + // ymm6 = src ga + // ymm1 = dst ga + // ymm5 = rb + // ymm7 = a + // ymm2, ymm3 = used + // ymm0, ymm4 = free + + vmovdqa(ymm4, ymm6); + + if(m_sel.aba != m_sel.abb) + { + // ga = c[aba * 2 + 1]; + + switch(m_sel.aba) + { + case 0: break; + case 1: vmovdqa(ymm6, ymm1); break; + case 2: vpxor(ymm6, ymm6); break; + } + + // ga = ga.sub16(c[abeb * 2 + 1]); + + switch(m_sel.abb) + { + case 0: vpsubw(ymm6, ymm4); break; + case 1: vpsubw(ymm6, ymm1); break; + case 2: break; + } + + if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) + { + // ga = ga.modulate16<1>(a); + + modulate16(ymm6, ymm7, 1); + } + + // ga = ga.add16(c[abd * 2 + 1]); + + switch(m_sel.abd) + { + case 0: vpaddw(ymm6, ymm4); break; + case 1: vpaddw(ymm6, ymm1); break; + case 2: break; + } + } + else + { + // ga = c[abd * 2 + 1]; + + switch(m_sel.abd) + { + case 0: break; + case 1: vmovdqa(ymm6, ymm1); break; + case 2: vpxor(ymm6, ymm6); break; + } + } + + // ymm4 = src ga + // ymm5 = rb + // ymm6 = ga + // ymm2, ymm3 = used + // ymm0, ymm1, ymm7 = free + + if(m_sel.pabe) + { + vpsrld(ymm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) + + // ga = c[1].blend8(ga, mask).mix16(c[1]); + + vpblendvb(ymm6, ymm4, ymm6, ymm0); + } + else + { + if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx + { + mix16(ymm6, ymm4, ymm7); + } + } +} + +void GSDrawScanlineCodeGenerator::WriteFrame() +{ + if(!m_sel.fwrite) + { + return; + } + + if(m_sel.fpsm == 2 && m_sel.dthe) + { + mov(eax, ptr[esp + _top]); + and(eax, 3); + shl(eax, 5); + mov(ebp, ptr[&m_local.gd->dimx]); + vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 0]); + vpaddw(ymm5, ymm7); + vbroadcasti128(ymm7, ptr[ebp + eax + sizeof(GSVector4i) * 1]); + vpaddw(ymm6, ymm7); + } + + if(m_sel.colclamp == 0) + { + // c[0] &= 0x00ff00ff; + // c[1] &= 0x00ff00ff; + + vpcmpeqd(ymm7, ymm7); + vpsrlw(ymm7, 8); + vpand(ymm5, ymm7); + vpand(ymm6, ymm7); + } + + // GSVector8i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); + + vpunpckhwd(ymm7, ymm5, ymm6); + vpunpcklwd(ymm5, ymm6); + vpackuswb(ymm5, ymm7); + + if(m_sel.fba && m_sel.fpsm != 1) + { + // fs |= 0x80000000; + + vpcmpeqd(ymm7, ymm7); + vpslld(ymm7, 31); + vpor(ymm5, ymm7); + } + + if(m_sel.fpsm == 2) + { + // GSVector8i rb = fs & 0x00f800f8; + // GSVector8i ga = fs & 0x8000f800; + + mov(eax, 0x00f800f8); + vmovd(xmm6, eax); + vpbroadcastd(ymm6, xmm6); + + mov(eax, 0x8000f800); + vmovd(xmm7, eax); + vpbroadcastd(ymm7, xmm7); + + vpand(ymm4, ymm5, ymm6); + vpand(ymm5, ymm7); + + // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + + vpsrld(ymm6, ymm4, 9); + vpsrld(ymm4, 3); + vpsrld(ymm7, ymm5, 16); + vpsrld(ymm5, 6); + + vpor(ymm5, ymm4); + vpor(ymm7, ymm6); + vpor(ymm5, ymm7); + } + + if(m_sel.rfb) + { + // fs = fs.blend(fd, fm); + + blend(ymm5, ymm2, ymm3); // TODO: could be skipped in certain cases, depending on fpsm and fm + } + + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; + + WritePixel(ymm5, ymm0, ebx, edx, fast, m_sel.fpsm, 0); +} + +void GSDrawScanlineCodeGenerator::ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr) +{ + vmovq(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm]); + vmovhps(Xmm(dst.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2]); + vmovq(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); + vmovhps(Xmm(temp.getIdx()), qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2]); + vinserti128(dst, dst, temp, 1); +/* + vmovdqu(dst, ptr[addr * 2 + (size_t)m_local.gd->vm]); + vmovdqu(temp, ptr[addr * 2 + (size_t)m_local.gd->vm + 16 * 2]); + vpunpcklqdq(dst, dst, temp); + vpermq(dst, dst, _MM_SHUFFLE(3, 1, 2, 0)); +*/ +} + +void GSDrawScanlineCodeGenerator::WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz) +{ + Xmm src1 = Xmm(src.getIdx()); + Xmm src2 = Xmm(temp.getIdx()); + + vextracti128(src2, src, 1); + + if(m_sel.notest) + { + if(fast) + { + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); + } + else + { + WritePixel(src1, addr, 0, 0, psm); + WritePixel(src1, addr, 1, 1, psm); + WritePixel(src1, addr, 2, 2, psm); + WritePixel(src1, addr, 3, 3, psm); + WritePixel(src2, addr, 4, 0, psm); + WritePixel(src2, addr, 5, 1, psm); + WritePixel(src2, addr, 6, 2, psm); + WritePixel(src2, addr, 7, 3, psm); + } + } + else + { + // cascade tests? + + if(fast) + { + test(mask, 0x0000000f << (fz * 8)); + je("@f"); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src1); + L("@@"); + + test(mask, 0x000000f0 << (fz * 8)); + je("@f"); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src1); + L("@@"); + + test(mask, 0x000f0000 << (fz * 8)); + je("@f"); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm + 16 * 2], src2); + L("@@"); + + test(mask, 0x00f00000 << (fz * 8)); + je("@f"); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 24 * 2], src2); + L("@@"); + + // vmaskmovps? + } + else + { + test(mask, 0x00000003 << (fz * 8)); + je("@f"); + WritePixel(src1, addr, 0, 0, psm); + L("@@"); + + test(mask, 0x0000000c << (fz * 8)); + je("@f"); + WritePixel(src1, addr, 1, 1, psm); + L("@@"); + + test(mask, 0x00000030 << (fz * 8)); + je("@f"); + WritePixel(src1, addr, 2, 2, psm); + L("@@"); + + test(mask, 0x000000c0 << (fz * 8)); + je("@f"); + WritePixel(src1, addr, 3, 3, psm); + L("@@"); + + test(mask, 0x00030000 << (fz * 8)); + je("@f"); + WritePixel(src2, addr, 4, 0, psm); + L("@@"); + + test(mask, 0x000c0000 << (fz * 8)); + je("@f"); + WritePixel(src2, addr, 5, 1, psm); + L("@@"); + + test(mask, 0x00300000 << (fz * 8)); + je("@f"); + WritePixel(src2, addr, 6, 2, psm); + L("@@"); + + test(mask, 0x00c00000 << (fz * 8)); + je("@f"); + WritePixel(src2, addr, 7, 3, psm); + L("@@"); + } + } +} + +static const int s_offsets[] = {0, 2, 8, 10, 16, 18, 24, 26}; + +void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm) +{ + Address dst = ptr[addr * 2 + (size_t)m_local.gd->vm + s_offsets[i] * 2]; + + switch(psm) + { + case 0: + if(j == 0) vmovd(dst, src); + else vpextrd(dst, src, j); + break; + case 1: + if(j == 0) vmovd(eax, src); + else vpextrd(eax, src, j); + xor(eax, dst); + and(eax, 0xffffff); + xor(dst, eax); + break; + case 2: + vpextrw(eax, src, j * 2); + mov(dst, ax); + break; + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) +{ + // in + // ymm5 = addr00 + // ymm2 = addr01 + // ymm0 = addr10 + // ymm3 = addr11 + // ebx = m_local.tex[0] (!m_sel.mmin) + // ebp = m_local.tex (m_sel.mmin) + // edx = m_local.clut (m_sel.tlu) + + // out + // ymm6 = c00 + // ymm4 = c01 + // ymm1 = c10 + // ymm5 = c11 + + ASSERT(pixels == 1 || pixels == 4); + + mip_offset *= sizeof(void*); + + const GSVector8i* lod_i = m_sel.lcm ? &m_local.gd->lod.i : &m_local.temp.lod.i; + + if(m_sel.mmin && !m_sel.lcm) + { + const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; + const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; + + for(int i = 0; i < pixels; i++) + { + Ymm src = Ymm(r[i * 2 + 0]); + Ymm dst = Ymm(r[i * 2 + 1]); + Ymm t1 = Ymm(t[i * 2 + 0]); + Ymm t2 = Ymm(t[i * 2 + 1]); + + vextracti128(Xmm(t1.getIdx()), src, 1); + + for(uint8 j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j + 0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(dst, src, j); + + mov(ebx, ptr[&lod_i->u32[j + 4]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + + ReadTexel(t2, t1, j); + } + + vinserti128(dst, dst, t2, 1); + } + } + else + { + const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; + const int t[] = {1, 4, 5, 1, 2, 5, 0, 2}; + + if(m_sel.mmin && m_sel.lcm) + { + mov(ebx, ptr[&lod_i->u32[0]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); + } + + for(int i = 0; i < pixels; i++) + { + Ymm src = Ymm(r[i * 2 + 0]); + Ymm dst = Ymm(r[i * 2 + 1]); + Ymm t1 = Ymm(t[i * 2 + 0]); + Ymm t2 = Ymm(t[i * 2 + 1]); + + if(!m_sel.tlu) + { + vpcmpeqd(t1, t1); + vpgatherdd(dst, ptr[ebx + src * 4], t1); + } + else + { + vextracti128(Xmm(t1.getIdx()), src, 1); + + for(uint8 j = 0; j < 4; j++) + { + ReadTexel(dst, src, j); + ReadTexel(t2, t1, j); + } + + vinserti128(dst, dst, t2, 1); + /* + vpcmpeqd(t1, t1); + vpgatherdd(t2, ptr[ebx + src * 1], t1); // either this 1x scale, or the latency of two dependendent gathers are too slow + vpslld(t2, 24); + vpsrld(t2, 24); + vpcmpeqd(t1, t1); + vpgatherdd(dst, ptr[edx + t2 * 4], t1); + */ + } + } + } +} + +void GSDrawScanlineCodeGenerator::ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i) +{ + ASSERT(i < 4); + + const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4]; + + if(i == 0) vmovd(eax, Xmm(addr.getIdx())); + else vpextrd(eax, Xmm(addr.getIdx()), i); + + if(m_sel.tlu) movzx(eax, byte[ebx + eax]); + + if(i == 0) vmovd(Xmm(dst.getIdx()), src); + else vpinsrd(Xmm(dst.getIdx()), src, i); +} + + +#endif \ No newline at end of file diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index 4dfa92df6c..68cd72fa29 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -209,7 +209,7 @@ public: iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml); /* - name = format("c:/temp/%s_%016llx.bin", m_name.c_str(), (uint64)key); + name = format("c:/temp1/%s_%016llx.bin", m_name.c_str(), (uint64)key); if(FILE* fp = fopen(name.c_str(), "wb")) { @@ -218,7 +218,7 @@ public: fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp); fwrite(cg->getCode(), cg->getSize(), 1, fp); - + fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp); fputc(0x0F, fp); fputc(0x0B, fp); diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 44b9a51ef6..ba11a90a37 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -208,6 +208,10 @@ void GSRasterizer::Draw(GSRasterizerData* data) __assume(0); } + #if _M_SSE >= 0x501 + _mm256_zeroupper(); + #endif + data->pixels = m_pixels; uint64 ticks = __rdtsc() - data->start; @@ -917,7 +921,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon) { for(int i = 0; i < threads; i++, row++) { - m_scanline[row] = i; + m_scanline[row] = (uint8)i; } } } diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index e606dc7228..ee7e31b931 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -400,7 +400,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* dst->t = t; - #if _M_SSE >= 0x501 + #if 0 //_M_SSE >= 0x501 dst->_pad = GSVector4::zero(); @@ -1342,8 +1342,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) { gd.sel.fge = 1; - gd.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff); - gd.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff); + gd.frb = env.FOGCOL.u32[0] & 0x00ff00ff; + gd.fga = (env.FOGCOL.u32[0] >> 8) & 0x00ff00ff; } if(context->FRAME.PSM != PSM_PSMCT24) @@ -1403,6 +1403,34 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000; } + #if _M_SSE >= 0x501 + + gd.fm = fm; + gd.zm = zm; + + if(gd.sel.fpsm == 1) + { + gd.fm |= 0xff000000; + } + else if(gd.sel.fpsm == 2) + { + uint32 rb = gd.fm & 0x00f800f8; + uint32 ga = gd.fm & 0x8000f800; + + gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; + } + + if(gd.sel.zpsm == 1) + { + gd.zm |= 0xff000000; + } + else if(gd.sel.zpsm == 2) + { + gd.zm |= 0xffff0000; + } + + #else + gd.fm = GSVector4i(fm); gd.zm = GSVector4i(zm); @@ -1427,6 +1455,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zm |= GSVector4i::xffff0000(); } + #endif + if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only { gd.sel.notest = 1; @@ -1435,7 +1465,11 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) for(int i = 0, j = m_vertex.tail; i < j; i++) { + #if _M_SSE >= 0x501 + if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8 + #else if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4 + #endif { gd.sel.notest = 0; diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index bcecc5a635..4265a8fe1a 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -116,7 +116,7 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like void* vm; const void* tex[7]; uint32* clut; - GSVector4i* dimx; + GSVector4i* dimx; const int* fbr; const int* zbr; @@ -125,19 +125,63 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like const GSVector2i* fzbr; const GSVector2i* fzbc; - GSVector4i fm, zm; - struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4 GSVector4i aref; GSVector4i afix; + struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4 + + #if _M_SSE >= 0x501 + + uint32 fm, zm; + uint32 frb, fga; + GSVector8 mxl; + GSVector8 k; // TEX1.K * 0x10000 + GSVector8 l; // TEX1.L * -0x10000 + struct {GSVector8i i, f;} lod; // lcm == 1 + + #else + + GSVector4i fm, zm; GSVector4i frb, fga; GSVector4 mxl; GSVector4 k; // TEX1.K * 0x10000 GSVector4 l; // TEX1.L * -0x10000 struct {GSVector4i i, f;} lod; // lcm == 1 + + #endif }; __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own { + #if _M_SSE >= 0x501 + + struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8]; + struct step {GSVector8 z, stq; GSVector8i c, f;} d8; + struct {GSVector8i rb, ga;} c; + struct {uint32 z, f;} p; + + // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) + + struct + { + GSVector8 z, zo; + GSVector8i f; + GSVector8 s, t, q; + GSVector8i rb, ga; + GSVector8i zs, zd; + GSVector8i uf, vf; + GSVector8i cov; + + // mipmapping + + struct {GSVector8i i, f;} lod; + GSVector8i uv[2]; + GSVector8i uv_minmax[2]; + GSVector8i trb, tga; + GSVector8i test; + } temp; + + #else + struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4]; struct step {GSVector4 z, stq; GSVector4i c, f;} d4; struct {GSVector4i rb, ga;} c; @@ -164,6 +208,8 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has GSVector4i test; } temp; + #endif + // const GSScanlineGlobalData* gd; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp index 6beb59773a..37e253ee9f 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp @@ -22,6 +22,23 @@ #include "stdafx.h" #include "GSSetupPrimCodeGenerator.h" +#if _M_SSE >= 0x501 + +const GSVector8 GSSetupPrimCodeGenerator::m_shift[9] = +{ + GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f), + GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f), + GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f), + GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f), + GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f), + GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f), + GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f), + GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f), + GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f), +}; + +#else + const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] = { GSVector4(4.0f, 4.0f, 4.0f, 4.0f), @@ -31,6 +48,8 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] = GSVector4(-3.0f, -2.0f, -1.0f, 0.0f), }; +#endif + GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) , m_local(*(GSScanlineLocalData*)param) diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.h b/plugins/GSdx/GSSetupPrimCodeGenerator.h index 5bf9eacc2b..746d7996aa 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.h +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.h @@ -42,5 +42,9 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator public: GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); + #if _M_SSE >= 0x501 + static const GSVector8 m_shift[9]; + #else static const GSVector4 m_shift[5]; + #endif }; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index 91910d89f1..21a7d47c97 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -23,7 +23,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSVertexSW.h" -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) +#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) using namespace Xbyak; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp new file mode 100644 index 0000000000..f8d46ea42a --- /dev/null +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSSetupPrimCodeGenerator.h" +#include "GSVertexSW.h" + +#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64)) + +using namespace Xbyak; + +static const int _args = 0; +static const int _vertex = _args + 4; +static const int _index = _args + 8; +static const int _dscan = _args + 12; + +void GSSetupPrimCodeGenerator::Generate() +{ + if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) + { + mov(edx, dword[esp + _dscan]); + + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) + { + vmovaps(Ymm(3 + i), ptr[&m_shift[i]]); + } + } + + Depth(); + + Texture(); + + Color(); + + ret(); +} + +void GSSetupPrimCodeGenerator::Depth() +{ + if(!m_en.z && !m_en.f) + { + return; + } + + if(m_sel.prim != GS_SPRITE_CLASS) + { + // GSVector4 p = dscan.p; + + if(m_en.f) + { + // GSVector8 df = GSVector8::broadcast32(dscan.p.wwww()); + + vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]); + } + + if(m_en.z) + { + // GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz()); + + vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]); + } + + if(m_en.f) + { + // m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh(); + + vmulps(ymm0, ymm1, ymm3); + vcvttps2dq(ymm0, ymm0); + vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.d8.f], ymm0); + } + + if(m_en.z) + { + // m_local.d8.z = dz * shift[0]; + + vmulps(ymm0, ymm2, ymm3); + vmovaps(ptr[&m_local.d8.z], ymm0); + } + + for(int i = 0; i < (m_sel.notest ? 1 : 8); i++) + { + if(m_en.f) + { + // m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh(); + + if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i)); + else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]); + vcvttps2dq(ymm0, ymm0); + vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); + vmovdqa(ptr[&m_local.d[i].f], ymm0); + } + + if(m_en.z) + { + // m_local.d[i].z = dz * shift[1 + i]; + + if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i)); + else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]); + vmovaps(ptr[&m_local.d[i].z], ymm0); + } + } + } + else + { + // GSVector4 p = vertex[index[1]].p; + + mov(ecx, ptr[esp + _index]); + mov(ecx, ptr[ecx + sizeof(uint32) * 1]); + shl(ecx, 6); // * sizeof(GSVertexSW) + add(ecx, ptr[esp + _vertex]); + + if(m_en.f) + { + // m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>(); + + vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]); + vcvttps2dq(xmm0, xmm0); + vpextrd(ptr[&m_local.p.f], xmm0, 3); + } + + if(m_en.z) + { + // m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w + + mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]); + mov(ptr[&m_local.p.z], eax); + } + } +} + +void GSSetupPrimCodeGenerator::Texture() +{ + if(!m_en.t) + { + return; + } + + // GSVector8 dt(dscan.t); + + vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]); + + // GSVector8 dt8 = dt * shift[0]; + + vmulps(ymm1, ymm0, ymm3); + + if(m_sel.fst) + { + // m_local.d8.stq = GSVector8::cast(GSVector8i(dt8)); + + vcvttps2dq(ymm1, ymm1); + + vmovdqa(ptr[&m_local.d8.stq], ymm1); + } + else + { + // m_local.d8.stq = dt8; + + vmovaps(ptr[&m_local.d8.stq], ymm1); + } + + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) + { + // GSVector8 dstq = dt.xxxx/yyyy/zzzz(); + + vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j)); + + for(int i = 0; i < (m_sel.notest ? 1 : 8); i++) + { + // GSVector8 v = dstq * shift[1 + i]; + + if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i)); + else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]); + + if(m_sel.fst) + { + // m_local.d[i].s/t = GSVector8::cast(GSVector8i(v)); + + vcvttps2dq(ymm2, ymm2); + + switch(j) + { + case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break; + case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break; + } + } + else + { + // m_local.d[i].s/t/q = v; + + switch(j) + { + case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break; + case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break; + case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break; + } + } + } + } +} + +void GSSetupPrimCodeGenerator::Color() +{ + if(!m_en.c) + { + return; + } + + if(m_sel.iip) + { + // GSVector8 dc(dscan.c); + + vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); + + // m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32(); + + vmulps(ymm1, ymm0, ymm3); + vcvttps2dq(ymm1, ymm1); + vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0)); + vpackssdw(ymm1, ymm1); + vmovdqa(ptr[&m_local.d8.c], ymm1); + + // ymm3 is not needed anymore + + // GSVector8 dr = dc.xxxx(); + // GSVector8 db = dc.zzzz(); + + vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0)); + vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); + + for(int i = 0; i < (m_sel.notest ? 1 : 8); i++) + { + // GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32(); + + if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i)); + else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]); + vcvttps2dq(ymm0, ymm0); + vpackssdw(ymm0, ymm0); + + // GSVector4i b = GSVector8i(db * shift[1 + i]).ps32(); + + if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i)); + else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]); + vcvttps2dq(ymm1, ymm1); + vpackssdw(ymm1, ymm1); + + // m_local.d[i].rb = r.upl16(b); + + vpunpcklwd(ymm0, ymm1); + vmovdqa(ptr[&m_local.d[i].rb], ymm0); + } + + // GSVector8 dc(dscan.c); + + vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it + + // GSVector8 dg = dc.yyyy(); + // GSVector8 da = dc.wwww(); + + vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1)); + vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3)); + + for(int i = 0; i < (m_sel.notest ? 1 : 8); i++) + { + // GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32(); + + if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i)); + else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]); + vcvttps2dq(ymm0, ymm0); + vpackssdw(ymm0, ymm0); + + // GSVector8i a = GSVector8i(da * shift[1 + i]).ps32(); + + if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i)); + else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]); + vcvttps2dq(ymm1, ymm1); + vpackssdw(ymm1, ymm1); + + // m_local.d[i].ga = g.upl16(a); + + vpunpcklwd(ymm0, ymm1); + vmovdqa(ptr[&m_local.d[i].ga], ymm0); + } + } + else + { + // GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c)); + + int last = 0; + + switch(m_sel.prim) + { + case GS_POINT_CLASS: last = 0; break; + case GS_LINE_CLASS: last = 1; break; + case GS_TRIANGLE_CLASS: last = 2; break; + case GS_SPRITE_CLASS: last = 1; break; + } + + if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth() + { + mov(ecx, ptr[esp + _index]); + mov(ecx, ptr[ecx + sizeof(uint32) * last]); + shl(ecx, 6); // * sizeof(GSVertexSW) + add(ecx, ptr[esp + _vertex]); + } + + vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]); + vcvttps2dq(ymm0, ymm0); + + // c = c.upl16(c.zwxy()); + + vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2)); + vpunpcklwd(ymm0, ymm1); + + // if(!tme) c = c.srl16(7); + + if(m_sel.tfx == TFX_NONE) + { + vpsrlw(ymm0, 7); + } + + // m_local.c.rb = c.xxxx(); + // m_local.c.ga = c.zzzz(); + + vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0)); + vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); + + vmovdqa(ptr[&m_local.c.rb], ymm1); + vmovdqa(ptr[&m_local.c.ga], ymm2); + } +} + +#endif \ No newline at end of file diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp index 0be126e05d..6cf849a795 100644 --- a/plugins/GSdx/GSVector.cpp +++ b/plugins/GSdx/GSVector.cpp @@ -75,6 +75,7 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000 #if _M_SSE >= 0x500 +const GSVector8 GSVector8::m_half(0.5f); const GSVector8 GSVector8::m_one(1.0f); const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index dd8f338981..890491e182 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -4119,6 +4119,15 @@ public: // TODO: extract/insert + template __forceinline int extract32() const + { + GSVector4i v = extract(); + + if((i & 3) == 0) return GSVector4i::store(v); + + return v.extract32(); + } + template __forceinline GSVector4i extract() const { if(i == 0) return GSVector4i(_mm256_castsi256_si128(m)); @@ -4141,7 +4150,6 @@ public: GSVector4i a0 = extract<0>(); GSVector4i a1 = extract<1>(); - v0 = GSVector4i::load((int)ptr[a0.extract32<0>()]); v0 = v0.insert32<1>((int)ptr[a0.extract32<1>()]); v0 = v0.insert32<2>((int)ptr[a0.extract32<2>()]); @@ -4191,14 +4199,14 @@ public: return cast(v0).insert<1>(v1); } - template<> __forceinline GSVector8i gather32_32(const uint32* ptr1, const uint8* ptr2) const + template<> __forceinline GSVector8i gather32_32(const uint8* ptr1, const uint32* ptr2) const { - return gather32_32(ptr2).gather32_32(ptr1); + return gather32_32(ptr1).gather32_32(ptr2); } template<> __forceinline GSVector8i gather32_32(const uint32* ptr1, const uint32* ptr2) const { - return gather32_32(ptr2).gather32_32(ptr1); + return gather32_32(ptr1).gather32_32(ptr2); } template __forceinline void gather32_32(const T* RESTRICT ptr, GSVector8i* RESTRICT dst) const @@ -4731,6 +4739,16 @@ public: return GSVector8i(_mm256_broadcastq_epi64(v.m)); } + __forceinline static GSVector8i broadcast128(const GSVector4i& v) + { + // this one only has m128 source op, it will be saved to a temp on stack if the compiler is not smart enough and use the address of v directly (<= vs2012u3rc2) + + return GSVector8i(_mm256_broadcastsi128_si256(v)); // fastest + //return GSVector8i(v); // almost as fast as broadcast + //return cast(v).insert<1>(v); // slow + //return cast(v).aa(); // slowest + } + __forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());} __forceinline static GSVector8i xffffffff() {return zero() == zero();} @@ -4958,6 +4976,7 @@ public: __m128 m0, m1; }; + static const GSVector8 m_half; static const GSVector8 m_one; static const GSVector8 m_x7fffffff; static const GSVector8 m_x80000000; diff --git a/plugins/GSdx/GSdx_vs11.vcxproj b/plugins/GSdx/GSdx_vs11.vcxproj index 988ebb8919..5e11c30087 100644 --- a/plugins/GSdx/GSdx_vs11.vcxproj +++ b/plugins/GSdx/GSdx_vs11.vcxproj @@ -646,6 +646,18 @@ true true true + true + true + + + true + true + true + true + true + true + true + true true @@ -737,6 +749,7 @@ true true + true true @@ -2054,4 +2067,4 @@ - + \ No newline at end of file diff --git a/plugins/GSdx/GSdx_vs11.vcxproj.filters b/plugins/GSdx/GSdx_vs11.vcxproj.filters index 7712fb7c28..3bef440ea7 100644 --- a/plugins/GSdx/GSdx_vs11.vcxproj.filters +++ b/plugins/GSdx/GSdx_vs11.vcxproj.filters @@ -336,6 +336,12 @@ Source Files + + Source Files + + + Source Files + @@ -728,4 +734,4 @@ Resource Files - + \ No newline at end of file diff --git a/plugins/GSdx/xbyak/xbyak_mnemonic.h b/plugins/GSdx/xbyak/xbyak_mnemonic.h index bcd81b7629..c8a785a3f7 100644 --- a/plugins/GSdx/xbyak/xbyak_mnemonic.h +++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h @@ -789,22 +789,22 @@ void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(x void vpsignw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x09, true, -1); } void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, true, -1); } void vpsignd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0A, true, -1); } -void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); } -void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, false, -1); } -void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); } -void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, false, -1); } -void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); } -void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, false, -1); } -void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); } -void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, false, -1); } -void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); } -void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, false, -1); } -void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); } -void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, false, -1); } -void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); } -void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, false, -1); } -void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); } -void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, false, -1); } +void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, true, -1); } +void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, true, -1); } +void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, true, -1); } +void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, true, -1); } +void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, true, -1); } +void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, true, -1); } +void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, true, -1); } +void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, true, -1); } +void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, true, -1); } +void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, true, -1); } +void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, true, -1); } +void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, true, -1); } +void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, true, -1); } +void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, true, -1); } +void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, true, -1); } +void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, true, -1); } void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, true, -1); } void vpsubb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF8, true, -1); } void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, true, -1); } @@ -1345,8 +1345,8 @@ void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); } void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); } void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); } -void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); } -void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); } +void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); } +void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); } void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); } void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); } void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); } @@ -1410,3 +1410,13 @@ void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1 void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0, 2); } void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1, 0); } void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1, 1); } + +// mods + +void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x78, true, 0); } +void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x79, true, 0); } +void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x58, true, 0); } +void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x59, true, 0); } + +// supportYMM = true +// vpblendvb, vpsllw-vpsrlq