From c2638131b4bec6a6fc980bca820ef498a22542ca Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Sat, 19 Mar 2022 03:09:11 -0500 Subject: [PATCH] GS:SW: Merge SSE and AVX implementations of CDrawScanline --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 1688 +++++----------------- 1 file changed, 346 insertions(+), 1342 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index b22bc67e7a..4accabe8f1 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -317,16 +317,18 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global) { GSScanlineSelector sel = global.sel; + constexpr int vlen = sizeof(VectorF) / sizeof(float); -#if _M_SSE >= 0x501 - - GSVector8i test; - GSVector8 zo; - GSVector8i f; - GSVector8 s, t, q; - GSVector8i uf, vf; - GSVector8i rbf, gaf; - GSVector8i cov; +#if _M_SSE < 0x501 + const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b; +#endif + VectorI test; + VectorF zo; + VectorI f; + VectorF s, t, q; + VectorI uf, vf; + VectorI rbf, gaf; + VectorI cov; // Init @@ -334,18 +336,22 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (!sel.notest) { - skip = left & 7; - steps = pixels + skip - 8; + skip = left & (vlen - 1); + steps = pixels + skip - vlen; left -= skip; +#if _M_SSE >= 0x501 test = GSVector8i::i8to32(g_const->m_test_256b[skip]) | GSVector8i::i8to32(g_const->m_test_256b[15 + (steps & (steps >> 31))]); +#else + test = const_test[skip] | const_test[7 + (steps & (steps >> 31))]; +#endif } else { skip = 0; - steps = pixels - 8; + steps = pixels - vlen; } - ASSERT((left & 7) == 0); + ASSERT((left & (vlen - 1)) == 0); const GSVector2i* fza_base = &global.fzbr[top]; const GSVector2i* fza_offset = &global.fzbc[left >> 2]; @@ -354,7 +360,11 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (sel.fwrite && sel.fge) { +#if _M_SSE >= 0x501 f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(local.d[skip].f); +#else + f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f); +#endif } if (sel.zb) @@ -367,35 +377,45 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (sel.edge) { +#if _M_SSE >= 0x501 cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9); +#else + cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9); +#endif } if (sel.tfx != TFX_NONE) { if (sel.fst) { - GSVector4i vt(scan.t); + VectorI vt = VectorI::broadcast128(GSVector4i(scan.t)); - GSVector8i u = GSVector8i::broadcast32(vt.xxxx()) + GSVector8i::cast(local.d[skip].s); - GSVector8i v = GSVector8i::broadcast32(vt.yyyy()); + VectorI u = vt.xxxx() + VectorI::cast(local.d[skip].s); + VectorI v = vt.yyyy(); if (sel.prim != GS_SPRITE_CLASS || sel.mmin) { - v += GSVector8i::cast(local.d[skip].t); + v += VectorI::cast(local.d[skip].t); } else if (sel.ltf) { vf = v.xxzzlh().srl16(12); } - s = GSVector8::cast(u); - t = GSVector8::cast(v); + s = VectorF::cast(u); + t = VectorF::cast(v); } else { +#if _M_SSE >= 0x501 s = GSVector8::broadcast32(&scan.t.x) + local.d[skip].s; t = GSVector8::broadcast32(&scan.t.y) + local.d[skip].t; q = GSVector8::broadcast32(&scan.t.z) + local.d[skip].q; +#else + s = scan.t.xxxx() + local.d[skip].s; + t = scan.t.yyyy() + local.d[skip].t; + q = scan.t.zzzz() + local.d[skip].q; +#endif } } @@ -407,8 +427,13 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex c = c.upl16(c.zwxy()); +#if _M_SSE >= 0x501 rbf = GSVector8i::broadcast32(&c.x).add16(local.d[skip].rb); gaf = GSVector8i::broadcast32(&c.z).add16(local.d[skip].ga); +#else + rbf = c.xxxx().add16(local.d[skip].rb); + gaf = c.zzzz().add16(local.d[skip].ga); +#endif } else { @@ -423,9 +448,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex do { int fa = 0, za = 0; - GSVector8i fd, zs, zd; - GSVector8i fm, zm; - GSVector8i rb, ga; + VectorI fd, zs, zd; + VectorI fm, zm; + VectorI rb, ga; // TestZ @@ -436,31 +461,39 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.prim != GS_SPRITE_CLASS) { // Need to handle when the float converts incorrectly +#if _M_SSE >= 0x501 GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; +#else + GSVector4 z = scan.p.zzzz() + zo; +#endif if (sel.zequal) { - zs = GSVector8i::broadcast32(&local.p.z); + zs = local.p.z; } else if (sel.zoverflow) { - zs = (GSVector8i(z * 0.5f) << 1) | (GSVector8i(z) & GSVector8i::x00000001()); + zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001()); } else { - zs = GSVector8i(z); + zs = VectorI(z); } } else { - zs = GSVector8i::broadcast32(&local.p.z); + zs = local.p.z; } if (sel.ztest) { +#if _M_SSE >= 0x501 zd = GSVector8i::load( - (u8*)global.vm + za * 2, (u8*)global.vm + za * 2 + 16, + (u8*)global.vm + za * 2 , (u8*)global.vm + za * 2 + 16, (u8*)global.vm + za * 2 + 32, (u8*)global.vm + za * 2 + 48); +#else + zd = GSVector4i::load((u8*)global.vm + za * 2, (u8*)global.vm + za * 2 + 16); +#endif switch (sel.zpsm) { @@ -469,17 +502,17 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex default: break; } - GSVector8i zso = zs; - GSVector8i zdo = zd; + VectorI zso = zs; + VectorI zdo = zd; if (sel.zoverflow || sel.zpsm == 0) { - zso -= GSVector8i::x80000000(); - zdo -= GSVector8i::x80000000(); + zso -= VectorI::x80000000(); + zdo -= VectorI::x80000000(); } if (sel.zclamp) - zso = zso.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8)); + zso = zso.min_u32(VectorI::xffffffff().srl32(sel.zpsm * 8)); switch (sel.ztst) { @@ -496,30 +529,30 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.fb && sel.tfx != TFX_NONE) { - GSVector8i u, v, uv[2]; - GSVector8i lodi, lodf; - GSVector8i minuv, maxuv; - GSVector8i addr00, addr01, addr10, addr11; - GSVector8i c00, c01, c10, c11; + VectorI u, v, uv[2]; + VectorI lodi, lodf; + VectorI minuv, maxuv; + VectorI addr00, addr01, addr10, addr11; + VectorI c00, c01, c10, c11; if (sel.mmin) { if (!sel.fst) { - u = GSVector8i(s / q); - v = GSVector8i(t / q); + u = VectorI(s / q); + v = VectorI(t / q); } else { - u = GSVector8i::cast(s); - v = GSVector8i::cast(t); + u = VectorI::cast(s); + v = VectorI::cast(t); } if (!sel.lcm) { - GSVector8 tmp = q.log2(3) * global.l + global.k; // (-log2(Q) * (1 << L) + K) * 0x10000 + VectorF tmp = q.log2(3) * global.l + global.k; // (-log2(Q) * (1 << L) + K) * 0x10000 - GSVector8i lod = GSVector8i(tmp.sat(GSVector8::zero(), global.mxl), false); + VectorI lod = VectorI(tmp.sat(VectorF::zero(), global.mxl), false); if (sel.mmin == 1) // round-off mode { @@ -535,6 +568,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex // shift u/v by (int)lod +#if _M_SSE >= 0x501 u = u.srav32(lodi); v = v.srav32(lodi); @@ -551,13 +585,49 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex minuv = tminu.pu32(tminv); maxuv = tmaxu.pu32(tmaxv); +#else + GSVector4i aabb = u.upl32(v); + GSVector4i ccdd = u.uph32(v); + + GSVector4i aaxx = aabb.sra32(lodi.x); + GSVector4i xxbb = aabb.sra32(lodi.y); + GSVector4i ccxx = ccdd.sra32(lodi.z); + GSVector4i xxdd = ccdd.sra32(lodi.w); + + GSVector4i acac = aaxx.upl32(ccxx); + GSVector4i bdbd = xxbb.uph32(xxdd); + + u = acac.upl32(bdbd); + v = acac.uph32(bdbd); + + uv[0] = u; + uv[1] = v; + + GSVector4i minmax = global.t.minmax; + + GSVector4i v0 = minmax.srl16(lodi.x); + GSVector4i v1 = minmax.srl16(lodi.y); + GSVector4i v2 = minmax.srl16(lodi.z); + GSVector4i v3 = minmax.srl16(lodi.w); + + v0 = v0.upl16(v1); + v2 = v2.upl16(v3); + + minuv = v0.upl32(v2); + maxuv = v0.uph32(v2); +#endif } else { lodi = global.lod.i; +#if _M_SSE >= 0x501 u = u.srav32(lodi); v = v.srav32(lodi); +#else + u = u.sra32(lodi.x); + v = v.sra32(lodi.x); +#endif uv[0] = u; uv[1] = v; @@ -575,33 +645,33 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex vf = v.xxzzlh().srl16(12); } - GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector8i uv1 = uv0; + VectorI uv0 = u.sra32(16).ps32(v.sra32(16)); + VectorI uv1 = uv0; { - GSVector8i repeat = (uv0 & minuv) | maxuv; - GSVector8i clamp = uv0.sat_i16(minuv, maxuv); + VectorI repeat = (uv0 & minuv) | maxuv; + VectorI clamp = uv0.sat_i16(minuv, maxuv); - uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv0 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } if (sel.ltf) { - uv1 = uv1.add16(GSVector8i::x0001()); + uv1 = uv1.add16(VectorI::x0001()); - GSVector8i repeat = (uv1 & minuv) | maxuv; - GSVector8i clamp = uv1.sat_i16(minuv, maxuv); + VectorI repeat = (uv1 & minuv) | maxuv; + VectorI clamp = uv1.sat_i16(minuv, maxuv); - uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv1 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } - GSVector8i y0 = uv0.uph16() << (sel.tw + 3); - GSVector8i x0 = uv0.upl16(); + VectorI y0 = uv0.uph16() << (sel.tw + 3); + VectorI x0 = uv0.upl16(); if (sel.ltf) { - GSVector8i y1 = uv1.uph16() << (sel.tw + 3); - GSVector8i x1 = uv1.upl16(); + VectorI y1 = uv1.uph16() << (sel.tw + 3); + VectorI x1 = uv1.upl16(); addr00 = y0 + x0; addr01 = y0 + x1; @@ -610,7 +680,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.tlu) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { const u8* tex = (const u8*)global.tex[lodi.U32[i]]; @@ -622,7 +692,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } else { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { const u32* tex = (const u32*)global.tex[lodi.U32[i]]; @@ -633,18 +703,18 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } } - GSVector8i rb00 = c00.sll16(8).srl16(8); - GSVector8i ga00 = c00.srl16(8); - GSVector8i rb01 = c01.sll16(8).srl16(8); - GSVector8i ga01 = c01.srl16(8); + VectorI rb00 = c00.sll16(8).srl16(8); + VectorI ga00 = c00.srl16(8); + VectorI rb01 = c01.sll16(8).srl16(8); + VectorI ga01 = c01.srl16(8); rb00 = rb00.lerp16_4(rb01, uf); ga00 = ga00.lerp16_4(ga01, uf); - GSVector8i rb10 = c10.sll16(8).srl16(8); - GSVector8i ga10 = c10.srl16(8); - GSVector8i rb11 = c11.sll16(8).srl16(8); - GSVector8i ga11 = c11.srl16(8); + VectorI rb10 = c10.sll16(8).srl16(8); + VectorI ga10 = c10.srl16(8); + VectorI rb11 = c11.sll16(8).srl16(8); + VectorI ga11 = c11.srl16(8); rb10 = rb10.lerp16_4(rb11, uf); ga10 = ga10.lerp16_4(ga11, uf); @@ -658,14 +728,14 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.tlu) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { c00.U32[i] = global.clut[((const u8*)global.tex[lodi.U32[i]])[addr00.U32[i]]]; } } else { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { c00.U32[i] = ((const u32*)global.tex[lodi.U32[i]])[addr00.U32[i]]; } @@ -677,9 +747,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.mmin != 1) // !round-off mode { - GSVector8i rb2, ga2; + VectorI rb2, ga2; - lodi += GSVector8i::x00000001(); + lodi += VectorI::x00000001(); u = uv[0].sra32(1); v = uv[1].sra32(1); @@ -696,33 +766,33 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex vf = v.xxzzlh().srl16(12); } - GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector8i uv1 = uv0; + VectorI uv0 = u.sra32(16).ps32(v.sra32(16)); + VectorI uv1 = uv0; { - GSVector8i repeat = (uv0 & minuv) | maxuv; - GSVector8i clamp = uv0.sat_i16(minuv, maxuv); + VectorI repeat = (uv0 & minuv) | maxuv; + VectorI clamp = uv0.sat_i16(minuv, maxuv); - uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv0 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } if (sel.ltf) { - uv1 = uv1.add16(GSVector8i::x0001()); + uv1 = uv1.add16(VectorI::x0001()); - GSVector8i repeat = (uv1 & minuv) | maxuv; - GSVector8i clamp = uv1.sat_i16(minuv, maxuv); + VectorI repeat = (uv1 & minuv) | maxuv; + VectorI clamp = uv1.sat_i16(minuv, maxuv); - uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv1 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } - GSVector8i y0 = uv0.uph16() << (sel.tw + 3); - GSVector8i x0 = uv0.upl16(); + VectorI y0 = uv0.uph16() << (sel.tw + 3); + VectorI x0 = uv0.upl16(); if (sel.ltf) { - GSVector8i y1 = uv1.uph16() << (sel.tw + 3); - GSVector8i x1 = uv1.upl16(); + VectorI y1 = uv1.uph16() << (sel.tw + 3); + VectorI x1 = uv1.upl16(); addr00 = y0 + x0; addr01 = y0 + x1; @@ -731,7 +801,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.tlu) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { const u8* tex = (const u8*)global.tex[lodi.U32[i]]; @@ -743,7 +813,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } else { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { const u32* tex = (const u32*)global.tex[lodi.U32[i]]; @@ -754,18 +824,18 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } } - GSVector8i rb00 = c00.sll16(8).srl16(8); - GSVector8i ga00 = c00.srl16(8); - GSVector8i rb01 = c01.sll16(8).srl16(8); - GSVector8i ga01 = c01.srl16(8); + VectorI rb00 = c00.sll16(8).srl16(8); + VectorI ga00 = c00.srl16(8); + VectorI rb01 = c01.sll16(8).srl16(8); + VectorI ga01 = c01.srl16(8); rb00 = rb00.lerp16_4(rb01, uf); ga00 = ga00.lerp16_4(ga01, uf); - GSVector8i rb10 = c10.sll16(8).srl16(8); - GSVector8i ga10 = c10.srl16(8); - GSVector8i rb11 = c11.sll16(8).srl16(8); - GSVector8i ga11 = c11.srl16(8); + VectorI rb10 = c10.sll16(8).srl16(8); + VectorI ga10 = c10.srl16(8); + VectorI rb11 = c11.sll16(8).srl16(8); + VectorI ga11 = c11.srl16(8); rb10 = rb10.lerp16_4(rb11, uf); ga10 = ga10.lerp16_4(ga11, uf); @@ -779,14 +849,14 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.tlu) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { c00.U32[i] = global.clut[((const u8*)global.tex[lodi.U32[i]])[addr00.U32[i]]]; } } else { - for (int i = 0; i < 8; i++) + for (int i = 0; i < vlen; i++) { c00.U32[i] = ((const u32*)global.tex[lodi.U32[i]])[addr00.U32[i]]; } @@ -809,8 +879,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (!sel.fst) { - u = GSVector8i(s / q); - v = GSVector8i(t / q); + u = VectorI(s / q); + v = VectorI(t / q); if (sel.ltf) { @@ -820,8 +890,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } else { - u = GSVector8i::cast(s); - v = GSVector8i::cast(t); + u = VectorI::cast(s); + v = VectorI::cast(t); } if (sel.ltf) @@ -834,36 +904,36 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } } - GSVector8i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector8i uv1 = uv0; + VectorI uv0 = u.sra32(16).ps32(v.sra32(16)); + VectorI uv1 = uv0; - GSVector8i tmin = GSVector8i::broadcast128(global.t.min); - GSVector8i tmax = GSVector8i::broadcast128(global.t.max); + VectorI tmin = VectorI::broadcast128(global.t.min); + VectorI tmax = VectorI::broadcast128(global.t.max); { - GSVector8i repeat = (uv0 & tmin) | tmax; - GSVector8i clamp = uv0.sat_i16(tmin, tmax); + VectorI repeat = (uv0 & tmin) | tmax; + VectorI clamp = uv0.sat_i16(tmin, tmax); - uv0 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv0 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } if (sel.ltf) { - uv1 = uv1.add16(GSVector8i::x0001()); + uv1 = uv1.add16(VectorI::x0001()); - GSVector8i repeat = (uv1 & tmin) | tmax; - GSVector8i clamp = uv1.sat_i16(tmin, tmax); + VectorI repeat = (uv1 & tmin) | tmax; + VectorI clamp = uv1.sat_i16(tmin, tmax); - uv1 = clamp.blend8(repeat, GSVector8i::broadcast128(global.t.mask)); + uv1 = clamp.blend8(repeat, VectorI::broadcast128(global.t.mask)); } - GSVector8i y0 = uv0.uph16() << (sel.tw + 3); - GSVector8i x0 = uv0.upl16(); + VectorI y0 = uv0.uph16() << (sel.tw + 3); + VectorI x0 = uv0.upl16(); if (sel.ltf) { - GSVector8i y1 = uv1.uph16() << (sel.tw + 3); - GSVector8i x1 = uv1.upl16(); + VectorI y1 = uv1.uph16() << (sel.tw + 3); + VectorI x1 = uv1.upl16(); addr00 = y0 + x0; addr01 = y0 + x1; @@ -889,18 +959,18 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex c11 = addr11.gather32_32(tex); } - GSVector8i rb00 = c00.sll16(8).srl16(8); - GSVector8i ga00 = c00.srl16(8); - GSVector8i rb01 = c01.sll16(8).srl16(8); - GSVector8i ga01 = c01.srl16(8); + VectorI rb00 = c00.sll16(8).srl16(8); + VectorI ga00 = c00.srl16(8); + VectorI rb01 = c01.sll16(8).srl16(8); + VectorI ga01 = c01.srl16(8); rb00 = rb00.lerp16_4(rb01, uf); ga00 = ga00.lerp16_4(ga01, uf); - GSVector8i rb10 = c10.sll16(8).srl16(8); - GSVector8i ga10 = c10.srl16(8); - GSVector8i rb11 = c11.sll16(8).srl16(8); - GSVector8i ga11 = c11.srl16(8); + VectorI rb10 = c10.sll16(8).srl16(8); + VectorI ga10 = c10.srl16(8); + VectorI rb11 = c11.sll16(8).srl16(8); + VectorI ga11 = c11.srl16(8); rb10 = rb10.lerp16_4(rb11, uf); ga10 = ga10.lerp16_4(ga11, uf); @@ -956,9 +1026,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.aa1) { - GSVector8i x00800080(0x00800080); + VectorI x00800080(0x00800080); - GSVector8i a = sel.edge ? cov : x00800080; + VectorI a = sel.edge ? cov : x00800080; if (!sel.abe) { @@ -992,7 +1062,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.fwrite) { - GSVector8i af; + VectorI af; switch (sel.tfx) { @@ -1017,10 +1087,17 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.fwrite && sel.fge) { +#if _M_SSE >= 0x501 GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(&local.p.f); GSVector8i frb((int)global.frb); GSVector8i fga((int)global.fga); +#else + GSVector4i fog = sel.prim != GS_SPRITE_CLASS ? f : local.p.f; + + GSVector4i frb = global.frb; + GSVector4i fga = global.fga; +#endif rb = frb.lerp16<0>(rb, fog); ga = fga.lerp16<0>(ga, fog).mix16(ga); @@ -1028,7 +1105,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex /* fog = fog.srl16(7); - GSVector8i ifog = GSVector4i::x00ff().sub16(fog); + VectorI ifog = VectorI::x00ff().sub16(fog); rb = rb.mul16l(fog).add16(frb.mul16l(ifog)).srl16(8); ga = ga.mul16l(fog).add16(fga.mul16l(ifog)).srl16(8).mix16(ga); @@ -1043,9 +1120,13 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.rfb) { +#if _M_SSE >= 0x501 fd = GSVector8i::load( - (u8*)global.vm + fa * 2, (u8*)global.vm + fa * 2 + 16, + (u8*)global.vm + fa * 2 , (u8*)global.vm + fa * 2 + 16, (u8*)global.vm + fa * 2 + 32, (u8*)global.vm + fa * 2 + 48); +#else + fd = GSVector4i::load((u8*)global.vm + fa * 2, (u8*)global.vm + fa * 2 + 16); +#endif } } @@ -1057,8 +1138,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (sel.fpsm == 2) { - // test |= fd.srl32(15) == GSVector8i::zero(); - test |= fd.sll32(16).sra32(31) == GSVector8i::zero(); + // test |= fd.srl32(15) == VectorI::zero(); + test |= fd.sll32(16).sra32(31) == VectorI::zero(); } else { @@ -1069,7 +1150,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (sel.fpsm == 2) { - test |= fd.sll32(16).sra32(31); // == GSVector8i::xffffffff(); + test |= fd.sll32(16).sra32(31); // == VectorI::xffffffff(); } else { @@ -1099,15 +1180,15 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.fwrite && sel.zwrite) { - fzm = ~(fm == GSVector8i::xffffffff()).ps32(zm == GSVector8i::xffffffff()).mask(); + fzm = ~(fm == VectorI::xffffffff()).ps32(zm == VectorI::xffffffff()).mask(); } else if (sel.fwrite) { - fzm = ~(fm == GSVector8i::xffffffff()).ps32().mask(); + fzm = ~(fm == VectorI::xffffffff()).ps32().mask(); } else if (sel.zwrite) { - fzm = ~(zm == GSVector8i::xffffffff()).ps32().mask(); + fzm = ~(zm == VectorI::xffffffff()).ps32().mask(); } } @@ -1121,7 +1202,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex } if (sel.zclamp) - zs = zs.min_u32(GSVector8i::xffffffff().srl32(sel.zpsm * 8)); + zs = zs.min_u32(VectorI::xffffffff().srl32(sel.zpsm * 8)); bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; @@ -1129,10 +1210,15 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (fast) { - GSVector4i::storel((u8*)global.vm + za * 2, zs.extract<0>()); +#if _M_SSE >= 0x501 + GSVector4i::storel((u8*)global.vm + za * 2 , zs.extract<0>()); GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs.extract<0>()); GSVector4i::storel((u8*)global.vm + za * 2 + 32, zs.extract<1>()); GSVector4i::storeh((u8*)global.vm + za * 2 + 48, zs.extract<1>()); +#else + GSVector4i::storel((u8*)global.vm + za * 2 , zs); + GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs); +#endif } else { @@ -1140,20 +1226,27 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex WritePixel(zs, za, 1, sel.zpsm, global); WritePixel(zs, za, 2, sel.zpsm, global); WritePixel(zs, za, 3, sel.zpsm, global); +#if _M_SSE >= 0x501 WritePixel(zs, za, 4, sel.zpsm, global); WritePixel(zs, za, 5, sel.zpsm, global); WritePixel(zs, za, 6, sel.zpsm, global); WritePixel(zs, za, 7, sel.zpsm, global); +#endif } } else { if (fast) { - if (fzm & 0x00000f00) GSVector4i::storel((u8*)global.vm + za * 2, zs.extract<0>()); +#if _M_SSE >= 0x501 + if (fzm & 0x00000f00) GSVector4i::storel((u8*)global.vm + za * 2 , zs.extract<0>()); if (fzm & 0x0000f000) GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs.extract<0>()); if (fzm & 0x0f000000) GSVector4i::storel((u8*)global.vm + za * 2 + 32, zs.extract<1>()); if (fzm & 0xf0000000) GSVector4i::storeh((u8*)global.vm + za * 2 + 48, zs.extract<1>()); +#else + if (fzm & 0x0f00) GSVector4i::storel((u8*)global.vm + za * 2 , zs); + if (fzm & 0xf000) GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs); +#endif } else { @@ -1161,10 +1254,12 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (fzm & 0x00000c00) WritePixel(zs, za, 1, sel.zpsm, global); if (fzm & 0x00003000) WritePixel(zs, za, 2, sel.zpsm, global); if (fzm & 0x0000c000) WritePixel(zs, za, 3, sel.zpsm, global); +#if _M_SSE >= 0x501 if (fzm & 0x03000000) WritePixel(zs, za, 4, sel.zpsm, global); if (fzm & 0x0c000000) WritePixel(zs, za, 5, sel.zpsm, global); if (fzm & 0x30000000) WritePixel(zs, za, 6, sel.zpsm, global); if (fzm & 0xc0000000) WritePixel(zs, za, 7, sel.zpsm, global); +#endif } } } @@ -1173,7 +1268,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (sel.fwrite && (sel.abe || sel.aa1)) { - GSVector8i rbs = rb, gas = ga, rbd, gad, a, mask; + VectorI rbs = rb, gas = ga, rbd, gad, a, mask; if (sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1) { @@ -1197,7 +1292,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { case 0: break; case 1: rb = rbd; break; - case 2: rb = GSVector8i::zero(); break; + case 2: rb = VectorI::zero(); break; } switch(sel.abb) @@ -1232,7 +1327,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { case 0: break; case 1: rb = rbd; break; - case 2: rb = GSVector8i::zero(); break; + case 2: rb = VectorI::zero(); break; } } @@ -1249,7 +1344,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { case 0: break; case 1: ga = gad; break; - case 2: ga = GSVector8i::zero(); break; + case 2: ga = VectorI::zero(); break; } switch(sel.abb) @@ -1277,7 +1372,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { case 0: break; case 1: ga = gad; break; - case 2: ga = GSVector8i::zero(); break; + case 2: ga = VectorI::zero(); break; } } @@ -1302,27 +1397,27 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { int y = (top & 3) << 1; - rb = rb.add16(GSVector8i::broadcast128(global.dimx[0 + y])); - ga = ga.add16(GSVector8i::broadcast128(global.dimx[1 + y])); + rb = rb.add16(VectorI::broadcast128(global.dimx[0 + y])); + ga = ga.add16(VectorI::broadcast128(global.dimx[1 + y])); } if (sel.colclamp == 0) { - rb &= GSVector8i::x00ff(); - ga &= GSVector8i::x00ff(); + rb &= VectorI::x00ff(); + ga &= VectorI::x00ff(); } - GSVector8i fs = rb.upl16(ga).pu16(rb.uph16(ga)); + VectorI fs = rb.upl16(ga).pu16(rb.uph16(ga)); if (sel.fba && sel.fpsm != 1) { - fs |= GSVector8i::x80000000(); + fs |= VectorI::x80000000(); } if (sel.fpsm == 2) { - GSVector8i rb = fs & 0x00f800f8; - GSVector8i ga = fs & 0x8000f800; + VectorI rb = fs & 0x00f800f8; + VectorI ga = fs & 0x8000f800; fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); } @@ -1338,10 +1433,15 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex { if (fast) { - GSVector4i::storel((u8*)global.vm + fa * 2, fs.extract<0>()); +#if _M_SSE >= 0x501 + GSVector4i::storel((u8*)global.vm + fa * 2 , fs.extract<0>()); GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs.extract<0>()); GSVector4i::storel((u8*)global.vm + fa * 2 + 32, fs.extract<1>()); GSVector4i::storeh((u8*)global.vm + fa * 2 + 48, fs.extract<1>()); +#else + GSVector4i::storel((u8*)global.vm + fa * 2 , fs); + GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs); +#endif } else { @@ -1349,20 +1449,27 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex WritePixel(fs, fa, 1, sel.fpsm, global); WritePixel(fs, fa, 2, sel.fpsm, global); WritePixel(fs, fa, 3, sel.fpsm, global); +#if _M_SSE >= 0x501 WritePixel(fs, fa, 4, sel.fpsm, global); WritePixel(fs, fa, 5, sel.fpsm, global); WritePixel(fs, fa, 6, sel.fpsm, global); WritePixel(fs, fa, 7, sel.fpsm, global); +#endif } } else { if (fast) { - if (fzm & 0x0000000f) GSVector4i::storel((u8*)global.vm + fa * 2, fs.extract<0>()); +#if _M_SSE >= 0x501 + if (fzm & 0x0000000f) GSVector4i::storel((u8*)global.vm + fa * 2 , fs.extract<0>()); if (fzm & 0x000000f0) GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs.extract<0>()); if (fzm & 0x000f0000) GSVector4i::storel((u8*)global.vm + fa * 2 + 32, fs.extract<1>()); if (fzm & 0x00f00000) GSVector4i::storeh((u8*)global.vm + fa * 2 + 48, fs.extract<1>()); +#else + if (fzm & 0x000f) GSVector4i::storel((u8*)global.vm + fa * 2 , fs); + if (fzm & 0x00f0) GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs); +#endif } else { @@ -1370,1202 +1477,99 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex if (fzm & 0x0000000c) WritePixel(fs, fa, 1, sel.fpsm, global); if (fzm & 0x00000030) WritePixel(fs, fa, 2, sel.fpsm, global); if (fzm & 0x000000c0) WritePixel(fs, fa, 3, sel.fpsm, global); +#if _M_SSE >= 0x501 if (fzm & 0x00030000) WritePixel(fs, fa, 4, sel.fpsm, global); if (fzm & 0x000c0000) WritePixel(fs, fa, 5, sel.fpsm, global); if (fzm & 0x00300000) WritePixel(fs, fa, 6, sel.fpsm, global); if (fzm & 0x00c00000) WritePixel(fs, fa, 7, sel.fpsm, global); - } - } - } - } while (0); - - if (sel.edge) - break; - - if (steps <= 0) - break; - - // Step - - steps -= 8; - - fza_offset += 2; - - if (sel.prim != GS_SPRITE_CLASS) - { - if (sel.zb) - { - zo += GSVector8::broadcast32(&local.d8.p.z); - } - - if (sel.fwrite && sel.fge) - { - f = f.add16(GSVector8i::broadcast16(&local.d8.p.f)); - } - } - - if (sel.fb) - { - if (sel.tfx != TFX_NONE) - { - if (sel.fst) - { - GSVector8i stq = GSVector8i::cast(GSVector8(local.d8.stq)); - - s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); - - if (sel.prim != GS_SPRITE_CLASS || sel.mmin) - { - t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy()); - } - } - else - { - GSVector8 stq(local.d8.stq); - - s += stq.xxxx(); - t += stq.yyyy(); - q += stq.zzzz(); - } - } - } - - if (!(sel.tfx == TFX_DECAL && sel.tcc)) - { - if (sel.iip) - { - GSVector8i c = GSVector8i::broadcast64(&local.d8.c); - - rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero()); - gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero()); - } - } - - if (!sel.notest) - { - test = GSVector8i::i8to32(g_const->m_test_256b[15 + (steps & (steps >> 31))]); - } - } - -#else - - const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b; - GSVector4i test; - GSVector4 zo; - GSVector4i f; - GSVector4 s, t, q; - GSVector4i uf, vf; - GSVector4i rbf, gaf; - GSVector4i cov; - - // Init - - int skip, steps; - - if (!sel.notest) - { - skip = left & 3; - steps = pixels + skip - 4; - left -= skip; - test = const_test[skip] | const_test[7 + (steps & (steps >> 31))]; - } - else - { - skip = 0; - steps = pixels - 4; - } - - ASSERT((left & 3) == 0); - - const GSVector2i* fza_base = &global.fzbr[top]; - const GSVector2i* fza_offset = &global.fzbc[left >> 2]; - - if (sel.prim != GS_SPRITE_CLASS) - { - if (sel.fwrite && sel.fge) - { - f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f); - } - - if (sel.zb) - { - zo = local.d[skip].z; - } - } - - if (sel.fb) - { - if (sel.edge) - { - cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9); - } - - if (sel.tfx != TFX_NONE) - { - if (sel.fst) - { - GSVector4i vt(scan.t); - - GSVector4i u = vt.xxxx() + GSVector4i::cast(local.d[skip].s); - GSVector4i v = vt.yyyy(); - - if (sel.prim != GS_SPRITE_CLASS || sel.mmin) - { - v += GSVector4i::cast(local.d[skip].t); - } - else if (sel.ltf) - { - vf = v.xxzzlh().srl16(12); - } - - s = GSVector4::cast(u); - t = GSVector4::cast(v); - } - else - { - s = scan.t.xxxx() + local.d[skip].s; - t = scan.t.yyyy() + local.d[skip].t; - q = scan.t.zzzz() + local.d[skip].q; - } - } - - if (!(sel.tfx == TFX_DECAL && sel.tcc)) - { - if (sel.iip) - { - GSVector4i c(scan.c); - - c = c.upl16(c.zwxy()); - - rbf = c.xxxx().add16(local.d[skip].rb); - gaf = c.zzzz().add16(local.d[skip].ga); - } - else - { - rbf = local.c.rb; - gaf = local.c.ga; - } - } - } - - while (1) - { - do - { - int fa = 0, za = 0; - GSVector4i fd, zs, zd; - GSVector4i fm, zm; - GSVector4i rb, ga; - - // TestZ - - if (sel.zb) - { - za = (fza_base->y + fza_offset->y) % HALF_VM_SIZE; - - if (sel.prim != GS_SPRITE_CLASS) - { - GSVector4 z = scan.p.zzzz() + zo; - - if (sel.zoverflow) - { - zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - } - else - { - zs = GSVector4i(z); - } - } - else - { - zs = local.p.z; - } - - if (sel.ztest) - { - zd = GSVector4i::load((u8*)global.vm + za * 2, (u8*)global.vm + za * 2 + 16); - - switch (sel.zpsm) - { - case 1: zd = zd.sll32( 8).srl32( 8); break; - case 2: zd = zd.sll32(16).srl32(16); break; - default: break; - } - - GSVector4i zso = zs; - GSVector4i zdo = zd; - - if (sel.zoverflow || sel.zpsm == 0) - { - zso -= GSVector4i::x80000000(); - zdo -= GSVector4i::x80000000(); - } - - if (sel.zclamp) - { - const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8); - - zso.U32[0] = std::min(z_max, zso.U32[0]); - zso.U32[1] = std::min(z_max, zso.U32[1]); - zso.U32[2] = std::min(z_max, zso.U32[2]); - zso.U32[3] = std::min(z_max, zso.U32[3]); - } - - switch (sel.ztst) - { - case ZTST_GEQUAL: test |= zso < zdo; break; - case ZTST_GREATER: test |= zso <= zdo; break; - } - - if (test.alltrue()) - continue; - } - } - - // SampleTexture - - if (sel.fb && sel.tfx != TFX_NONE) - { - GSVector4i u, v, uv[2]; - GSVector4i lodi, lodf; - GSVector4i minuv, maxuv; - GSVector4i addr00, addr01, addr10, addr11; - GSVector4i c00, c01, c10, c11; - - if (sel.mmin) - { - if (!sel.fst) - { - u = GSVector4i(s / q); - v = GSVector4i(t / q); - } - else - { - u = GSVector4i::cast(s); - v = GSVector4i::cast(t); - } - - if (!sel.lcm) - { - GSVector4 tmp = q.log2(3) * global.l + global.k; // (-log2(Q) * (1 << L) + K) * 0x10000 - - GSVector4i lod = GSVector4i(tmp.sat(GSVector4::zero(), global.mxl), false); - - if (sel.mmin == 1) // round-off mode - { - lod += 0x8000; - } - - lodi = lod.srl32(16); - - if (sel.mmin == 2) // trilinear mode - { - lodf = lod.xxzzlh(); - } - - // shift u/v by (int)lod - - GSVector4i aabb = u.upl32(v); - GSVector4i ccdd = u.uph32(v); - - GSVector4i aaxx = aabb.sra32(lodi.x); - GSVector4i xxbb = aabb.sra32(lodi.y); - GSVector4i ccxx = ccdd.sra32(lodi.z); - GSVector4i xxdd = ccdd.sra32(lodi.w); - - GSVector4i acac = aaxx.upl32(ccxx); - GSVector4i bdbd = xxbb.uph32(xxdd); - - u = acac.upl32(bdbd); - v = acac.uph32(bdbd); - - uv[0] = u; - uv[1] = v; - - GSVector4i minmax = global.t.minmax; - - GSVector4i v0 = minmax.srl16(lodi.x); - GSVector4i v1 = minmax.srl16(lodi.y); - GSVector4i v2 = minmax.srl16(lodi.z); - GSVector4i v3 = minmax.srl16(lodi.w); - - v0 = v0.upl16(v1); - v2 = v2.upl16(v3); - - minuv = v0.upl32(v2); - maxuv = v0.uph32(v2); - } - else - { - lodi = global.lod.i; - - u = u.sra32(lodi.x); - v = v.sra32(lodi.x); - - uv[0] = u; - uv[1] = v; - - minuv = local.temp.uv_minmax[0]; - maxuv = local.temp.uv_minmax[1]; - } - - if (sel.ltf) - { - u -= 0x8000; - v -= 0x8000; - - uf = u.xxzzlh().srl16(12); - vf = v.xxzzlh().srl16(12); - } - - GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector4i uv1 = uv0; - - { - GSVector4i repeat = (uv0 & minuv) | maxuv; - GSVector4i clamp = uv0.sat_i16(minuv, maxuv); - - uv0 = clamp.blend8(repeat, global.t.mask); - } - - if (sel.ltf) - { - uv1 = uv1.add16(GSVector4i::x0001()); - - GSVector4i repeat = (uv1 & minuv) | maxuv; - GSVector4i clamp = uv1.sat_i16(minuv, maxuv); - - uv1 = clamp.blend8(repeat, global.t.mask); - } - - GSVector4i y0 = uv0.uph16() << (sel.tw + 3); - GSVector4i x0 = uv0.upl16(); - - if (sel.ltf) - { - GSVector4i y1 = uv1.uph16() << (sel.tw + 3); - GSVector4i x1 = uv1.upl16(); - - addr00 = y0 + x0; - addr01 = y0 + x1; - addr10 = y1 + x0; - addr11 = y1 + x1; - - if (sel.tlu) - { - for (int i = 0; i < 4; i++) - { - const u8* tex = (const u8*)global.tex[lodi.U32[i]]; - - c00.U32[i] = global.clut[tex[addr00.U32[i]]]; - c01.U32[i] = global.clut[tex[addr01.U32[i]]]; - c10.U32[i] = global.clut[tex[addr10.U32[i]]]; - c11.U32[i] = global.clut[tex[addr11.U32[i]]]; - } - } - else - { - for (int i = 0; i < 4; i++) - { - const u32* tex = (const u32*)global.tex[lodi.U32[i]]; - - c00.U32[i] = tex[addr00.U32[i]]; - c01.U32[i] = tex[addr01.U32[i]]; - c10.U32[i] = tex[addr10.U32[i]]; - c11.U32[i] = tex[addr11.U32[i]]; - } - } - - GSVector4i rb00 = c00.sll16(8).srl16(8); - GSVector4i ga00 = c00.srl16(8); - GSVector4i rb01 = c01.sll16(8).srl16(8); - GSVector4i ga01 = c01.srl16(8); - - rb00 = rb00.lerp16_4(rb01, uf); - ga00 = ga00.lerp16_4(ga01, uf); - - GSVector4i rb10 = c10.sll16(8).srl16(8); - GSVector4i ga10 = c10.srl16(8); - GSVector4i rb11 = c11.sll16(8).srl16(8); - GSVector4i ga11 = c11.srl16(8); - - rb10 = rb10.lerp16_4(rb11, uf); - ga10 = ga10.lerp16_4(ga11, uf); - - rb = rb00.lerp16_4(rb10, vf); - ga = ga00.lerp16_4(ga10, vf); - } - else - { - addr00 = y0 + x0; - - if (sel.tlu) - { - for (int i = 0; i < 4; i++) - { - c00.U32[i] = global.clut[((const u8*)global.tex[lodi.U32[i]])[addr00.U32[i]]]; - } - } - else - { - for (int i = 0; i < 4; i++) - { - c00.U32[i] = ((const u32*)global.tex[lodi.U32[i]])[addr00.U32[i]]; - } - } - - rb = c00.sll16(8).srl16(8); - ga = c00.srl16(8); - } - - if (sel.mmin != 1) // !round-off mode - { - GSVector4i rb2, ga2; - - lodi += GSVector4i::x00000001(); - - u = uv[0].sra32(1); - v = uv[1].sra32(1); - - minuv = minuv.srl16(1); - maxuv = maxuv.srl16(1); - - if (sel.ltf) - { - u -= 0x8000; - v -= 0x8000; - - uf = u.xxzzlh().srl16(12); - vf = v.xxzzlh().srl16(12); - } - - GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector4i uv1 = uv0; - - { - GSVector4i repeat = (uv0 & minuv) | maxuv; - GSVector4i clamp = uv0.sat_i16(minuv, maxuv); - - uv0 = clamp.blend8(repeat, global.t.mask); - } - - if (sel.ltf) - { - uv1 = uv1.add16(GSVector4i::x0001()); - - GSVector4i repeat = (uv1 & minuv) | maxuv; - GSVector4i clamp = uv1.sat_i16(minuv, maxuv); - - uv1 = clamp.blend8(repeat, global.t.mask); - } - - GSVector4i y0 = uv0.uph16() << (sel.tw + 3); - GSVector4i x0 = uv0.upl16(); - - if (sel.ltf) - { - GSVector4i y1 = uv1.uph16() << (sel.tw + 3); - GSVector4i x1 = uv1.upl16(); - - addr00 = y0 + x0; - addr01 = y0 + x1; - addr10 = y1 + x0; - addr11 = y1 + x1; - - if (sel.tlu) - { - for (int i = 0; i < 4; i++) - { - const u8* tex = (const u8*)global.tex[lodi.U32[i]]; - - c00.U32[i] = global.clut[tex[addr00.U32[i]]]; - c01.U32[i] = global.clut[tex[addr01.U32[i]]]; - c10.U32[i] = global.clut[tex[addr10.U32[i]]]; - c11.U32[i] = global.clut[tex[addr11.U32[i]]]; - } - } - else - { - for (int i = 0; i < 4; i++) - { - const u32* tex = (const u32*)global.tex[lodi.U32[i]]; - - c00.U32[i] = tex[addr00.U32[i]]; - c01.U32[i] = tex[addr01.U32[i]]; - c10.U32[i] = tex[addr10.U32[i]]; - c11.U32[i] = tex[addr11.U32[i]]; - } - } - - GSVector4i rb00 = c00.sll16(8).srl16(8); - GSVector4i ga00 = c00.srl16(8); - GSVector4i rb01 = c01.sll16(8).srl16(8); - GSVector4i ga01 = c01.srl16(8); - - rb00 = rb00.lerp16_4(rb01, uf); - ga00 = ga00.lerp16_4(ga01, uf); - - GSVector4i rb10 = c10.sll16(8).srl16(8); - GSVector4i ga10 = c10.srl16(8); - GSVector4i rb11 = c11.sll16(8).srl16(8); - GSVector4i ga11 = c11.srl16(8); - - rb10 = rb10.lerp16_4(rb11, uf); - ga10 = ga10.lerp16_4(ga11, uf); - - rb2 = rb00.lerp16_4(rb10, vf); - ga2 = ga00.lerp16_4(ga10, vf); - } - else - { - addr00 = y0 + x0; - - if (sel.tlu) - { - for (int i = 0; i < 4; i++) - { - c00.U32[i] = global.clut[((const u8*)global.tex[lodi.U32[i]])[addr00.U32[i]]]; - } - } - else - { - for (int i = 0; i < 4; i++) - { - c00.U32[i] = ((const u32*)global.tex[lodi.U32[i]])[addr00.U32[i]]; - } - } - - rb2 = c00.sll16(8).srl16(8); - ga2 = c00.srl16(8); - } - - if (sel.lcm) - lodf = global.lod.f; - - lodf = lodf.srl16(1); - - rb = rb.lerp16<0>(rb2, lodf); - ga = ga.lerp16<0>(ga2, lodf); - } - } - else - { - if (!sel.fst) - { - u = GSVector4i(s / q); - v = GSVector4i(t / q); - - if (sel.ltf) - { - u -= 0x8000; - v -= 0x8000; - } - } - else - { - u = GSVector4i::cast(s); - v = GSVector4i::cast(t); - } - - if (sel.ltf) - { - uf = u.xxzzlh().srl16(12); - - if (sel.prim != GS_SPRITE_CLASS) - { - vf = v.xxzzlh().srl16(12); - } - } - - GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); - GSVector4i uv1 = uv0; - - { - GSVector4i repeat = (uv0 & global.t.min) | global.t.max; - GSVector4i clamp = uv0.sat_i16(global.t.min, global.t.max); - - uv0 = clamp.blend8(repeat, global.t.mask); - } - - if (sel.ltf) - { - uv1 = uv1.add16(GSVector4i::x0001()); - - GSVector4i repeat = (uv1 & global.t.min) | global.t.max; - GSVector4i clamp = uv1.sat_i16(global.t.min, global.t.max); - - uv1 = clamp.blend8(repeat, global.t.mask); - } - - GSVector4i y0 = uv0.uph16() << (sel.tw + 3); - GSVector4i x0 = uv0.upl16(); - - if (sel.ltf) - { - GSVector4i y1 = uv1.uph16() << (sel.tw + 3); - GSVector4i x1 = uv1.upl16(); - - addr00 = y0 + x0; - addr01 = y0 + x1; - addr10 = y1 + x0; - addr11 = y1 + x1; - - if (sel.tlu) - { - const u8* tex = (const u8*)global.tex[0]; - - c00 = addr00.gather32_32(tex, global.clut); - c01 = addr01.gather32_32(tex, global.clut); - c10 = addr10.gather32_32(tex, global.clut); - c11 = addr11.gather32_32(tex, global.clut); - } - else - { - const u32* tex = (const u32*)global.tex[0]; - - c00 = addr00.gather32_32(tex); - c01 = addr01.gather32_32(tex); - c10 = addr10.gather32_32(tex); - c11 = addr11.gather32_32(tex); - } - - GSVector4i rb00 = c00.sll16(8).srl16(8); - GSVector4i ga00 = c00.srl16(8); - GSVector4i rb01 = c01.sll16(8).srl16(8); - GSVector4i ga01 = c01.srl16(8); - - rb00 = rb00.lerp16_4(rb01, uf); - ga00 = ga00.lerp16_4(ga01, uf); - - GSVector4i rb10 = c10.sll16(8).srl16(8); - GSVector4i ga10 = c10.srl16(8); - GSVector4i rb11 = c11.sll16(8).srl16(8); - GSVector4i ga11 = c11.srl16(8); - - rb10 = rb10.lerp16_4(rb11, uf); - ga10 = ga10.lerp16_4(ga11, uf); - - rb = rb00.lerp16_4(rb10, vf); - ga = ga00.lerp16_4(ga10, vf); - } - else - { - addr00 = y0 + x0; - - if (sel.tlu) - { - c00 = addr00.gather32_32((const u8*)global.tex[0], global.clut); - } - else - { - c00 = addr00.gather32_32((const u32*)global.tex[0]); - } - - rb = c00.sll16(8).srl16(8); - ga = c00.srl16(8); - } - } - } - - // AlphaTFX - - if (sel.fb) - { - switch (sel.tfx) - { - case TFX_MODULATE: - ga = ga.modulate16<1>(gaf).clamp8(); - if (!sel.tcc) - ga = ga.mix16(gaf.srl16(7)); - break; - case TFX_DECAL: - if (!sel.tcc) - ga = ga.mix16(gaf.srl16(7)); - break; - case TFX_HIGHLIGHT: - ga = ga.mix16(!sel.tcc ? gaf.srl16(7) : ga.addus8(gaf.srl16(7))); - break; - case TFX_HIGHLIGHT2: - if (!sel.tcc) - ga = ga.mix16(gaf.srl16(7)); - break; - case TFX_NONE: - ga = sel.iip ? gaf.srl16(7) : gaf; - break; - } - - if (sel.aa1) - { - GSVector4i x00800080(0x00800080); - - GSVector4i a = sel.edge ? cov : x00800080; - - if (!sel.abe) - { - ga = ga.mix16(a); - } - else - { - ga = ga.blend8(a, ga.eq16(x00800080).srl32(16).sll32(16)); - } - } - } - - // ReadMask - - if (sel.fwrite) - { - fm = global.fm; - } - - if (sel.zwrite) - { - zm = global.zm; - } - - // TestAlpha - - if (!TestAlpha(test, fm, zm, ga, global)) - continue; - - // ColorTFX - - if (sel.fwrite) - { - GSVector4i af; - - switch (sel.tfx) - { - case TFX_MODULATE: - rb = rb.modulate16<1>(rbf).clamp8(); - break; - case TFX_DECAL: - break; - case TFX_HIGHLIGHT: - case TFX_HIGHLIGHT2: - af = gaf.yywwlh().srl16(7); - rb = rb.modulate16<1>(rbf).add16(af).clamp8(); - ga = ga.modulate16<1>(gaf).add16(af).clamp8().mix16(ga); - break; - case TFX_NONE: - rb = sel.iip ? rbf.srl16(7) : rbf; - break; - } - } - - // Fog - - if (sel.fwrite && sel.fge) - { - GSVector4i fog = sel.prim != GS_SPRITE_CLASS ? f : local.p.f; - - rb = global.frb.lerp16<0>(rb, fog); - ga = global.fga.lerp16<0>(ga, fog).mix16(ga); - - /* - fog = fog.srl16(7); - - GSVector4i ifog = GSVector4i::x00ff().sub16(fog); - - rb = rb.mul16l(fog).add16(global.frb.mul16l(ifog)).srl16(8); - ga = ga.mul16l(fog).add16(global.fga.mul16l(ifog)).srl16(8).mix16(ga); - */ - } - - // ReadFrame - - if (sel.fb) - { - fa = (fza_base->x + fza_offset->x) % HALF_VM_SIZE; - - if (sel.rfb) - { - fd = GSVector4i::load((u8*)global.vm + fa * 2, (u8*)global.vm + fa * 2 + 16); - } - } - - // TestDestAlpha - - if (sel.date && (sel.fpsm == 0 || sel.fpsm == 2)) - { - if (sel.datm) - { - if (sel.fpsm == 2) - { - // test |= fd.srl32(15) == GSVector4i::zero(); - test |= fd.sll32(16).sra32(31) == GSVector4i::zero(); - } - else - { - test |= (~fd).sra32(31); - } - } - else - { - if (sel.fpsm == 2) - { - test |= fd.sll32(16).sra32(31); // == GSVector4i::xffffffff(); - } - else - { - test |= fd.sra32(31); - } - } - - if (test.alltrue()) - continue; - } - - // WriteMask - - int fzm = 0; - - if (!sel.notest) - { - if (sel.fwrite) - { - fm |= test; - } - - if (sel.zwrite) - { - zm |= test; - } - - if (sel.fwrite && sel.zwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - } - else if (sel.fwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); - } - else if (sel.zwrite) - { - fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); - } - } - - // WriteZBuf - - if (sel.zwrite) - { - if (sel.ztest && sel.zpsm < 2) - { - zs = zs.blend8(zd, zm); - } - - if (sel.zclamp) - { - const unsigned int z_max = 0xffffffff >> (sel.zpsm * 8); - - zs.U32[0] = std::min(z_max, zs.U32[0]); - zs.U32[1] = std::min(z_max, zs.U32[1]); - zs.U32[2] = std::min(z_max, zs.U32[2]); - zs.U32[3] = std::min(z_max, zs.U32[3]); - } - - bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; - - if (sel.notest) - { - if (fast) - { - GSVector4i::storel((u8*)global.vm + za * 2, zs); - GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs); - } - else - { - WritePixel(zs, za, 0, sel.zpsm, global); - WritePixel(zs, za, 1, sel.zpsm, global); - WritePixel(zs, za, 2, sel.zpsm, global); - WritePixel(zs, za, 3, sel.zpsm, global); - } - } - else - { - if (fast) - { - if (fzm & 0x0f00) GSVector4i::storel((u8*)global.vm + za * 2, zs); - if (fzm & 0xf000) GSVector4i::storeh((u8*)global.vm + za * 2 + 16, zs); - } - else - { - if (fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm, global); - if (fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm, global); - if (fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm, global); - if (fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm, global); - } - } - } - - // AlphaBlend - - if (sel.fwrite && (sel.abe || sel.aa1)) - { - GSVector4i rbs = rb, gas = ga, rbd, gad, a, mask; - - if (sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1) - { - switch (sel.fpsm) - { - case 0: - case 1: - rbd = fd.sll16(8).srl16(8); - gad = fd.srl16(8); - break; - case 2: - rbd = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); - gad = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); - break; - } - } - - if (sel.aba != sel.abb) - { - switch (sel.aba) - { - case 0: break; - case 1: rb = rbd; break; - case 2: rb = GSVector4i::zero(); break; - } - - switch (sel.abb) - { - case 0: rb = rb.sub16(rbs); break; - case 1: rb = rb.sub16(rbd); break; - case 2: break; - } - - if (!(sel.fpsm == 1 && sel.abc == 1)) - { - switch(sel.abc) - { - case 0: a = gas.yywwlh().sll16(7); break; - case 1: a = gad.yywwlh().sll16(7); break; - case 2: a = global.afix; break; - } - - rb = rb.modulate16<1>(a); - } - - switch (sel.abd) - { - case 0: rb = rb.add16(rbs); break; - case 1: rb = rb.add16(rbd); break; - case 2: break; - } - } - else - { - switch (sel.abd) - { - case 0: break; - case 1: rb = rbd; break; - case 2: rb = GSVector4i::zero(); break; - } - } - - if (sel.pabe) - { - mask = (gas << 8).sra32(31); - - rb = rbs.blend8(rb, mask); - } - - if (sel.aba != sel.abb) - { - switch (sel.aba) - { - case 0: break; - case 1: ga = gad; break; - case 2: ga = GSVector4i::zero(); break; - } - - switch (sel.abb) - { - case 0: ga = ga.sub16(gas); break; - case 1: ga = ga.sub16(gad); break; - case 2: break; - } - - if (!(sel.fpsm == 1 && sel.abc == 1)) - { - ga = ga.modulate16<1>(a); - } - - switch (sel.abd) - { - case 0: ga = ga.add16(gas); break; - case 1: ga = ga.add16(gad); break; - case 2: break; - } - } - else - { - switch (sel.abd) - { - case 0: break; - case 1: ga = gad; break; - case 2: ga = GSVector4i::zero(); break; - } - } - - if (sel.pabe) - { - ga = gas.blend8(ga, mask >> 16); - } - else - { - if (sel.fpsm != 1) - { - ga = ga.mix16(gas); - } - } - } - - // WriteFrame - - if (sel.fwrite) - { - if (sel.fpsm == 2 && sel.dthe) - { - int y = (top & 3) << 1; - - rb = rb.add16(global.dimx[0 + y]); - ga = ga.add16(global.dimx[1 + y]); - } - - if (sel.colclamp == 0) - { - rb &= GSVector4i::x00ff(); - ga &= GSVector4i::x00ff(); - } - - GSVector4i fs = rb.upl16(ga).pu16(rb.uph16(ga)); - - if (sel.fba && sel.fpsm != 1) - { - fs |= GSVector4i::x80000000(); - } - - if (sel.fpsm == 2) - { - GSVector4i rb = fs & 0x00f800f8; - GSVector4i ga = fs & 0x8000f800; - - fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); - } - - if (sel.rfb) - { - fs = fs.blend(fd, fm); - } - - bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest; - - if (sel.notest) - { - if (fast) - { - GSVector4i::storel((u8*)global.vm + fa * 2, fs); - GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs); - } - else - { - WritePixel(fs, fa, 0, sel.fpsm, global); - WritePixel(fs, fa, 1, sel.fpsm, global); - WritePixel(fs, fa, 2, sel.fpsm, global); - WritePixel(fs, fa, 3, sel.fpsm, global); - } - } - else - { - if (fast) - { - if (fzm & 0x000f) GSVector4i::storel((u8*)global.vm + fa * 2, fs); - if (fzm & 0x00f0) GSVector4i::storeh((u8*)global.vm + fa * 2 + 16, fs); - } - else - { - if (fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm, global); - if (fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm, global); - if (fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm, global); - if (fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm, global); - } - } - } - } while (0); - - if (sel.edge) - break; - - if (steps <= 0) - break; - - // Step - - steps -= 4; - - fza_offset++; - - if (sel.prim != GS_SPRITE_CLASS) - { - if (sel.zb) - { - zo += local.d4.z; - } - - if (sel.fwrite && sel.fge) - { - f = f.add16(local.d4.f); - } - } - - if (sel.fb) - { - if (sel.tfx != TFX_NONE) - { - if (sel.fst) - { - GSVector4i stq = GSVector4i::cast(local.d4.stq); - - s = GSVector4::cast(GSVector4i::cast(s) + stq.xxxx()); - - if (sel.prim != GS_SPRITE_CLASS || sel.mmin) - { - t = GSVector4::cast(GSVector4i::cast(t) + stq.yyyy()); - } - } - else - { - GSVector4 stq = local.d4.stq; - - s += stq.xxxx(); - t += stq.yyyy(); - q += stq.zzzz(); - } - } - } - - if (!(sel.tfx == TFX_DECAL && sel.tcc)) - { - if (sel.iip) - { - GSVector4i c = local.d4.c; - - rbf = rbf.add16(c.xxxx()).max_i16(GSVector4i::zero()); - gaf = gaf.add16(c.yyyy()).max_i16(GSVector4i::zero()); - } - } - - if (!sel.notest) - { - test = const_test[7 + (steps & (steps >> 31))]; - } - } - #endif + } + } + } + } while (0); + + if (sel.edge) + break; + + if (steps <= 0) + break; + + // Step + + steps -= vlen; + + fza_offset += vlen / 4; + + if (sel.prim != GS_SPRITE_CLASS) + { + if (sel.zb) + { +#if _M_SSE >= 0x501 + zo += GSVector8::broadcast32(&local.d8.p.z); +#else + zo += local.d4.z; +#endif + } + + if (sel.fwrite && sel.fge) + { +#if _M_SSE >= 0x501 + f = f.add16(GSVector8i::broadcast16(&local.d8.p.f)); +#else + f = f.add16(local.d4.f); +#endif + } + } + + if (sel.fb) + { + if (sel.tfx != TFX_NONE) + { + if (sel.fst) + { + VectorI stq = VectorI::cast(VectorF(LOCAL_STEP.stq)); + + s = VectorF::cast(VectorI::cast(s) + stq.xxxx()); + + if (sel.prim != GS_SPRITE_CLASS || sel.mmin) + { + t = VectorF::cast(VectorI::cast(t) + stq.yyyy()); + } + } + else + { + VectorF stq(LOCAL_STEP.stq); + + s += stq.xxxx(); + t += stq.yyyy(); + q += stq.zzzz(); + } + } + } + + if (!(sel.tfx == TFX_DECAL && sel.tcc)) + { + if (sel.iip) + { +#if _M_SSE >= 0x501 + GSVector8i c = GSVector8i::broadcast64(&local.d8.c); +#else + GSVector4i c = local.d4.c; +#endif + rbf = rbf.add16(c.xxxx()).max_i16(VectorI::zero()); + gaf = gaf.add16(c.yyyy()).max_i16(VectorI::zero()); + } + } + + if (!sel.notest) + { +#if _M_SSE >= 0x501 + test = GSVector8i::i8to32(g_const->m_test_256b[15 + (steps & (steps >> 31))]); +#else + test = const_test[7 + (steps & (steps >> 31))]; +#endif + } + } } #ifndef ENABLE_JIT_RASTERIZER