From 9d546770552ee5994cfb1c3e81c05d958b853a08 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Fri, 25 Nov 2011 23:48:59 +0000 Subject: [PATCH] GSdx: re-implemented the drawing pipeline in c++, just for reference and easier debugging. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4972 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GPUDrawScanline.cpp | 29 + plugins/GSdx/GPUDrawScanline.h | 11 +- plugins/GSdx/GSDrawScanline.cpp | 1293 +++++++++++++++++ plugins/GSdx/GSDrawScanline.h | 20 +- plugins/GSdx/GSDrawScanlineCodeGenerator.h | 6 +- .../GSDrawScanlineCodeGenerator.x64.avx.cpp | 4 +- .../GSDrawScanlineCodeGenerator.x86.avx.cpp | 25 +- .../GSdx/GSDrawScanlineCodeGenerator.x86.cpp | 21 +- plugins/GSdx/GSRasterizer.cpp | 51 +- plugins/GSdx/GSRasterizer.h | 14 + plugins/GSdx/GSScanlineEnvironment.h | 5 +- plugins/GSdx/GSSetupPrimCodeGenerator.h | 4 +- plugins/GSdx/GSVector.h | 58 +- 13 files changed, 1469 insertions(+), 72 deletions(-) diff --git a/plugins/GSdx/GPUDrawScanline.cpp b/plugins/GSdx/GPUDrawScanline.cpp index fb233109cb..9fd5ac2ac2 100644 --- a/plugins/GSdx/GPUDrawScanline.cpp +++ b/plugins/GSdx/GPUDrawScanline.cpp @@ -80,3 +80,32 @@ void GPUDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame) { m_ds_map.UpdateStats(stats, frame); } + +void GPUDrawScanline::PrintStats() +{ + m_ds_map.PrintStats(); +} + +#ifndef JIT_DRAW + +void GPUDrawScanline::SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) +{ + // TODO +} + +void GPUDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) +{ + // TODO +} + +void GPUDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) +{ + // TODO +} + +void GPUDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) +{ + // TODO +} + +#endif \ No newline at end of file diff --git a/plugins/GSdx/GPUDrawScanline.h b/plugins/GSdx/GPUDrawScanline.h index 7c21a04cc8..5dc1db5fab 100644 --- a/plugins/GSdx/GPUDrawScanline.h +++ b/plugins/GSdx/GPUDrawScanline.h @@ -43,5 +43,14 @@ public: void BeginDraw(const void* param); void EndDraw(const GSRasterizerStats& stats, uint64 frame); - void PrintStats() {m_ds_map.PrintStats();} + void PrintStats(); + +#ifndef JIT_DRAW + + void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan); + void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan); + void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan); + void DrawRect(const GSVector4i& r, const GSVertexSW& v); + +#endif }; diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 8eb7830111..ddc4d6e9b1 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -100,6 +100,1299 @@ void GSDrawScanline::EndDraw(const GSRasterizerStats& stats, uint64 frame) m_ds_map.UpdateStats(stats, frame); } +void GSDrawScanline::PrintStats() +{ + m_ds_map.PrintStats(); +} + +#ifndef JIT_DRAW + +void GSDrawScanline::SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) +{ + GSScanlineSelector sel = m_global.sel; + + const GSVector4* shift = GSSetupPrimCodeGenerator::m_shift; + + bool has_z = sel.zb != 0; + bool has_f = sel.fb && sel.fge; + bool has_t = sel.fb && sel.tfx != TFX_NONE; + bool has_c = sel.fb && !(sel.tfx == TFX_DECAL && sel.tcc); + + if(has_z || has_f) + { + if(!sel.sprite) + { + if(has_f) + { + GSVector4 df = dscan.p.wwww(); + + m_local.d4.f = GSVector4i(df * shift[0]).xxzzlh(); + + for(int i = 0; i < 4; i++) + { + m_local.d[i].f = GSVector4i(df * shift[1 + i]).xxzzlh(); + } + } + + if(has_z) + { + GSVector4 dz = dscan.p.zzzz(); + + m_local.d4.z = dz * shift[0]; + + for(int i = 0; i < 4; i++) + { + m_local.d[i].z = dz * shift[1 + i]; + } + } + } + else + { + if(has_f) + { + m_local.p.f = GSVector4i(vertices[0].p).zzzzh().zzzz(); + } + + if(has_z) + { + m_local.p.z = vertices[0].t.u32[3]; // uint32 z is bypassed in t.w + } + } + } + + if(has_t) + { + GSVector4 t = dscan.t; + + if(sel.fst) + { + m_local.d4.stq = GSVector4::cast(GSVector4i(t * shift[0])); + } + else + { + m_local.d4.stq = t * shift[0]; + } + + for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++) + { + GSVector4 dstq; + + switch(j) + { + case 0: dstq = t.xxxx(); break; + case 1: dstq = t.yyyy(); break; + case 2: dstq = t.zzzz(); break; + } + + for(int i = 0; i < 4; i++) + { + GSVector4 v = dstq * shift[1 + i]; + + if(sel.fst) + { + switch(j) + { + case 0: m_local.d[i].s = GSVector4::cast(GSVector4i(v)); break; + case 1: m_local.d[i].t = GSVector4::cast(GSVector4i(v)); break; + } + } + else + { + switch(j) + { + case 0: m_local.d[i].s = v; break; + case 1: m_local.d[i].t = v; break; + case 2: m_local.d[i].q = v; break; + } + } + } + } + } + + if(has_c) + { + if(sel.iip) + { + m_local.d4.c = GSVector4i(dscan.c * shift[0]).xzyw().ps32(); + + GSVector4 dr = dscan.c.xxxx(); + GSVector4 db = dscan.c.zzzz(); + + for(int i = 0; i < 4; i++) + { + GSVector4i r = GSVector4i(dr * shift[1 + i]).ps32(); + GSVector4i b = GSVector4i(db * shift[1 + i]).ps32(); + + m_local.d[i].rb = r.upl16(b); + } + + GSVector4 dg = dscan.c.yyyy(); + GSVector4 da = dscan.c.wwww(); + + for(int i = 0; i < 4; i++) + { + GSVector4i g = GSVector4i(dg * shift[1 + i]).ps32(); + GSVector4i a = GSVector4i(da * shift[1 + i]).ps32(); + + m_local.d[i].ga = g.upl16(a); + } + } + else + { + GSVector4i c = GSVector4i(vertices[0].c); + + c = c.upl16(c.zwxy()); + + if(sel.tfx == TFX_NONE) c = c.srl16(7); + + m_local.c.rb = c.xxxx(); + m_local.c.ga = c.zzzz(); + } + } +} + +void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) +{ + GSScanlineSelector sel = m_global.sel; + + GSVector4i test; + GSVector4 z; + GSVector4i f; + GSVector4 s, t, q; + GSVector4i uf, vf; + GSVector4i rbf, gaf; + GSVector4i cov; + + // Init + + int skip = left & 3; + + left -= skip; + + int steps = pixels + skip - 4; + + const GSVector2i* fza_base = &m_global.fzbr[top]; + const GSVector2i* fza_offset = &m_global.fzbc[left >> 2]; + + test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + + if(!sel.sprite) + { + if(sel.fwrite && sel.fge) + { + f = GSVector4i(scan.p).zzzzh().zzzz().add16(m_local.d[skip].f); + } + + if(sel.zb) + { + z = scan.p.zzzz() + m_local.d[skip].z; + } + } + + if(sel.fb) + { + if(sel.edge) + { + cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9); + } + + if(sel.tfx != TFX_NONE) + { + if(sel.fst) + { + GSVector4i vt(scan.t); + + GSVector4i u = vt.xxxx() + GSVector4i::cast(m_local.d[skip].s); + GSVector4i v = vt.yyyy(); + + if(!sel.sprite || sel.mmin) + { + v += GSVector4i::cast(m_local.d[skip].t); + } + else if(sel.ltf) + { + vf = v.xxzzlh().srl16(1); + } + + s = GSVector4::cast(u); + t = GSVector4::cast(v); + } + else + { + s = scan.t.xxxx() + m_local.d[skip].s; + t = scan.t.yyyy() + m_local.d[skip].t; + q = scan.t.zzzz() + m_local.d[skip].q; + } + } + + if(!(sel.tfx == TFX_DECAL && sel.tcc)) + { + if(sel.iip) + { + GSVector4i c(scan.c); + + c = c.upl16(c.zwxy()); + + rbf = c.xxxx().add16(m_local.d[skip].rb); + gaf = c.zzzz().add16(m_local.d[skip].ga); + } + else + { + rbf = m_local.c.rb; + gaf = m_local.c.ga; + } + } + } + + while(1) + { + do + { + int fa = 0, za = 0; + GSVector4i fd, zs, zd; + GSVector4i fm, zm; + GSVector4i rb, ga; + + // TestZ + + if(sel.zb) + { + za = fza_base->y + fza_offset->y; + + if(!sel.sprite) + { + if(sel.zoverflow) + { + zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); + } + else + { + zs = GSVector4i(z); + } + } + else + { + zs = m_local.p.z; + } + + if(sel.ztest) + { + zd = GSVector4i::load((uint8*)m_global.vm + za * 2, (uint8*)m_global.vm + za * 2 + 16); + + switch(sel.zpsm) + { + case 1: zd = zd.sll32(8).srl32(8); break; + case 2: zd = zd.sll32(16).srl32(16); break; + default: break; + } + + GSVector4i zso = zs; + GSVector4i zdo = zd; + + if(sel.zoverflow || sel.zpsm == 0) + { + zso -= GSVector4i::x80000000(); + zdo -= GSVector4i::x80000000(); + } + + switch(sel.ztst) + { + case ZTST_GEQUAL: test |= zso < zdo; break; + case ZTST_GREATER: test |= zso <= zdo; break; + } + + if(test.alltrue()) continue; + } + } + + // SampleTexture + + if(sel.fb && sel.tfx != TFX_NONE) + { + GSVector4i u, v, uv[2]; + GSVector4i lodi, lodf; + GSVector4i minuv, maxuv; + + if(sel.mmin) + { + if(!sel.fst) + { + GSVector4 qrcp = q.rcp(); + + u = GSVector4i(s * qrcp); + v = GSVector4i(t * qrcp); + } + else + { + u = GSVector4i::cast(s); + v = GSVector4i::cast(t); + } + + if(!sel.lcm) + { + GSVector4 tmp = q.log2(3) * m_global.l + m_global.k; // (-log2(Q) * (1 << L) + K) * 0x10000 + + GSVector4i lod = GSVector4i(tmp.sat(GSVector4::zero(), m_global.mxl), false); + + if(sel.mmin == 1) // round-off mode + { + lod += 0x8000; + } + + lodi = lod.srl32(16); + + if(sel.mmin == 2) // trilinear mode + { + lodf = lod.xxzzlh(); + } + + // shift u/v by (int)lod + + GSVector4i aabb = u.upl32(v); + GSVector4i ccdd = u.uph32(v); + + GSVector4i aaxx = aabb.sra32(lodi.x); + GSVector4i xxbb = aabb.sra32(lodi.y); + GSVector4i ccxx = ccdd.sra32(lodi.z); + GSVector4i xxdd = ccdd.sra32(lodi.w); + + GSVector4i acac = aaxx.upl32(ccxx); + GSVector4i bdbd = xxbb.uph32(xxdd); + + u = acac.upl32(bdbd); + v = acac.uph32(bdbd); + + uv[0] = u; + uv[1] = v; + + GSVector4i minmax = m_global.t.minmax; + + GSVector4i v0 = minmax.srl16(lodi.x); + GSVector4i v1 = minmax.srl16(lodi.y); + GSVector4i v2 = minmax.srl16(lodi.z); + GSVector4i v3 = minmax.srl16(lodi.w); + + v0 = v0.upl16(v1); + v2 = v2.upl16(v3); + + minuv = v0.upl32(v2); + maxuv = v0.uph32(v2); + } + else + { + lodi = m_global.lod.i; + + u = u.sra32(lodi.x); + v = v.sra32(lodi.x); + + uv[0] = u; + uv[1] = v; + + minuv = m_local.temp.uv_minmax[0]; + maxuv = m_local.temp.uv_minmax[1]; + } + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + + uf = u.xxzzlh().srl16(1); + vf = v.xxzzlh().srl16(1); + } + + GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector4i uv1 = uv0; + + { + GSVector4i repeat = (uv0 & minuv) | maxuv; + GSVector4i clamp = uv0.sat_i16(minuv, maxuv); + + uv0 = clamp.blend8(repeat, m_global.t.mask); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector4i::x0001()); + + GSVector4i repeat = (uv1 & minuv) | maxuv; + GSVector4i clamp = uv1.sat_i16(minuv, maxuv); + + uv1 = clamp.blend8(repeat, m_global.t.mask); + } + + GSVector4i y0 = uv0.uph16() << (sel.tw + 3); + GSVector4i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector4i y1 = uv1.uph16() << (sel.tw + 3); + GSVector4i x1 = uv1.upl16(); + + GSVector4i addr00 = y0 + x0; + GSVector4i addr01 = y0 + x1; + GSVector4i addr10 = y1 + x0; + GSVector4i addr11 = y1 + x1; + + GSVector4i c00, c01, c10, c11; + + if(sel.tlu) + { + for(int i = 0; i < 4; i++) + { + const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = m_global.clut[tex[addr00.u32[i]]]; + c01.u32[i] = m_global.clut[tex[addr01.u32[i]]]; + c10.u32[i] = m_global.clut[tex[addr10.u32[i]]]; + c11.u32[i] = m_global.clut[tex[addr11.u32[i]]]; + } + } + else + { + for(int i = 0; i < 4; i++) + { + const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = tex[addr00.u32[i]]; + c01.u32[i] = tex[addr01.u32[i]]; + c10.u32[i] = tex[addr10.u32[i]]; + c11.u32[i] = tex[addr11.u32[i]]; + } + } + + GSVector4i rb00 = c00.sll16(8).srl16(8); + GSVector4i ga00 = c00.srl16(8); + GSVector4i rb01 = c01.sll16(8).srl16(8); + GSVector4i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16<0>(rb01, uf); + ga00 = ga00.lerp16<0>(ga01, uf); + + GSVector4i rb10 = c10.sll16(8).srl16(8); + GSVector4i ga10 = c10.srl16(8); + GSVector4i rb11 = c11.sll16(8).srl16(8); + GSVector4i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16<0>(rb11, uf); + ga10 = ga10.lerp16<0>(ga11, uf); + + rb = rb00.lerp16<0>(rb10, vf); + ga = ga00.lerp16<0>(ga10, vf); + } + else + { + GSVector4i addr00 = y0 + x0; + + GSVector4i c00; + + if(sel.tlu) + { + for(int i = 0; i < 4; i++) + { + c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]]; + } + } + else + { + for(int i = 0; i < 4; i++) + { + c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]; + } + } + + rb = c00.sll16(8).srl16(8); + ga = c00.srl16(8); + } + + if(sel.mmin != 1) // !round-off mode + { + GSVector4i rb2, ga2; + + lodi += GSVector4i::x00000001(); + + u = uv[0].sra32(1); + v = uv[1].sra32(1); + + minuv = minuv.srl16(1); + maxuv = maxuv.srl16(1); + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + + uf = u.xxzzlh().srl16(1); + vf = v.xxzzlh().srl16(1); + } + + GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector4i uv1 = uv0; + + { + GSVector4i repeat = (uv0 & minuv) | maxuv; + GSVector4i clamp = uv0.sat_i16(minuv, maxuv); + + uv0 = clamp.blend8(repeat, m_global.t.mask); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector4i::x0001()); + + GSVector4i repeat = (uv1 & minuv) | maxuv; + GSVector4i clamp = uv1.sat_i16(minuv, maxuv); + + uv1 = clamp.blend8(repeat, m_global.t.mask); + } + + GSVector4i y0 = uv0.uph16() << (sel.tw + 3); + GSVector4i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector4i y1 = uv1.uph16() << (sel.tw + 3); + GSVector4i x1 = uv1.upl16(); + + GSVector4i addr00 = y0 + x0; + GSVector4i addr01 = y0 + x1; + GSVector4i addr10 = y1 + x0; + GSVector4i addr11 = y1 + x1; + + GSVector4i c00, c01, c10, c11; + + if(sel.tlu) + { + for(int i = 0; i < 4; i++) + { + const uint8* tex = (const uint8*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = m_global.clut[tex[addr00.u32[i]]]; + c01.u32[i] = m_global.clut[tex[addr01.u32[i]]]; + c10.u32[i] = m_global.clut[tex[addr10.u32[i]]]; + c11.u32[i] = m_global.clut[tex[addr11.u32[i]]]; + } + } + else + { + for(int i = 0; i < 4; i++) + { + const uint32* tex = (const uint32*)m_global.tex[lodi.u32[i]]; + + c00.u32[i] = tex[addr00.u32[i]]; + c01.u32[i] = tex[addr01.u32[i]]; + c10.u32[i] = tex[addr10.u32[i]]; + c11.u32[i] = tex[addr11.u32[i]]; + } + } + + GSVector4i rb00 = c00.sll16(8).srl16(8); + GSVector4i ga00 = c00.srl16(8); + GSVector4i rb01 = c01.sll16(8).srl16(8); + GSVector4i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16<0>(rb01, uf); + ga00 = ga00.lerp16<0>(ga01, uf); + + GSVector4i rb10 = c10.sll16(8).srl16(8); + GSVector4i ga10 = c10.srl16(8); + GSVector4i rb11 = c11.sll16(8).srl16(8); + GSVector4i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16<0>(rb11, uf); + ga10 = ga10.lerp16<0>(ga11, uf); + + rb2 = rb00.lerp16<0>(rb10, vf); + ga2 = ga00.lerp16<0>(ga10, vf); + } + else + { + GSVector4i addr00 = y0 + x0; + + GSVector4i c00; + + if(sel.tlu) + { + for(int i = 0; i < 4; i++) + { + c00.u32[i] = m_global.clut[((const uint8*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]]; + } + } + else + { + for(int i = 0; i < 4; i++) + { + c00.u32[i] = ((const uint32*)m_global.tex[lodi.u32[i]])[addr00.u32[i]]; + } + } + + rb2 = c00.sll16(8).srl16(8); + ga2 = c00.srl16(8); + } + + if(sel.lcm) lodf = m_global.lod.f; + + lodf = lodf.srl16(1); + + rb = rb.lerp16<0>(rb2, lodf); + ga = ga.lerp16<0>(ga2, lodf); + } + } + else + { + if(!sel.fst) + { + GSVector4 qrcp = q.rcp(); + + u = GSVector4i(s * qrcp); + v = GSVector4i(t * qrcp); + + if(sel.ltf) + { + u -= 0x8000; + v -= 0x8000; + } + } + else + { + u = GSVector4i::cast(s); + v = GSVector4i::cast(t); + } + + if(sel.ltf) + { + uf = u.xxzzlh().srl16(1); + + if(!sel.sprite) + { + vf = v.xxzzlh().srl16(1); + } + } + + GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); + GSVector4i uv1 = uv0; + + { + GSVector4i repeat = (uv0 & m_global.t.min) | m_global.t.max; + GSVector4i clamp = uv0.sat_i16(m_global.t.min, m_global.t.max); + + uv0 = clamp.blend8(repeat, m_global.t.mask); + } + + if(sel.ltf) + { + uv1 = uv1.add16(GSVector4i::x0001()); + + GSVector4i repeat = (uv1 & m_global.t.min) | m_global.t.max; + GSVector4i clamp = uv1.sat_i16(m_global.t.min, m_global.t.max); + + uv1 = clamp.blend8(repeat, m_global.t.mask); + } + + GSVector4i y0 = uv0.uph16() << (sel.tw + 3); + GSVector4i x0 = uv0.upl16(); + + if(sel.ltf) + { + GSVector4i y1 = uv1.uph16() << (sel.tw + 3); + GSVector4i x1 = uv1.upl16(); + + GSVector4i addr00 = y0 + x0; + GSVector4i addr01 = y0 + x1; + GSVector4i addr10 = y1 + x0; + GSVector4i addr11 = y1 + x1; + + GSVector4i c00, c01, c10, c11; + + if(sel.tlu) + { + const uint8* tex = (const uint8*)m_global.tex[0]; + + c00 = addr00.gather32_32(tex, m_global.clut); + c01 = addr01.gather32_32(tex, m_global.clut); + c10 = addr10.gather32_32(tex, m_global.clut); + c11 = addr11.gather32_32(tex, m_global.clut); + } + else + { + const uint32* tex = (const uint32*)m_global.tex[0]; + + c00 = addr00.gather32_32(tex); + c01 = addr01.gather32_32(tex); + c10 = addr10.gather32_32(tex); + c11 = addr11.gather32_32(tex); + } + + GSVector4i rb00 = c00.sll16(8).srl16(8); + GSVector4i ga00 = c00.srl16(8); + GSVector4i rb01 = c01.sll16(8).srl16(8); + GSVector4i ga01 = c01.srl16(8); + + rb00 = rb00.lerp16<0>(rb01, uf); + ga00 = ga00.lerp16<0>(ga01, uf); + + GSVector4i rb10 = c10.sll16(8).srl16(8); + GSVector4i ga10 = c10.srl16(8); + GSVector4i rb11 = c11.sll16(8).srl16(8); + GSVector4i ga11 = c11.srl16(8); + + rb10 = rb10.lerp16<0>(rb11, uf); + ga10 = ga10.lerp16<0>(ga11, uf); + + rb = rb00.lerp16<0>(rb10, vf); + ga = ga00.lerp16<0>(ga10, vf); + } + else + { + GSVector4i addr00 = y0 + x0; + + GSVector4i c00; + + if(sel.tlu) + { + c00 = addr00.gather32_32((const uint8*)m_global.tex[0], m_global.clut); + } + else + { + c00 = addr00.gather32_32((const uint32*)m_global.tex[0]); + } + + rb = c00.sll16(8).srl16(8); + ga = c00.srl16(8); + } + } + } + + // AlphaTFX + + if(sel.fb) + { + switch(sel.tfx) + { + case TFX_MODULATE: + ga = ga.modulate16<1>(gaf).clamp8(); + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_DECAL: + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_HIGHLIGHT: + ga = ga.mix16(!sel.tcc ? gaf.srl16(7) : ga.addus8(gaf.srl16(7))); + break; + case TFX_HIGHLIGHT2: + if(!sel.tcc) ga = ga.mix16(gaf.srl16(7)); + break; + case TFX_NONE: + ga = sel.iip ? gaf.srl16(7) : gaf; + break; + } + + if(sel.aa1) + { + GSVector4i x00800080(0x00800080); + + GSVector4i a = sel.edge ? cov : x00800080; + + if(!sel.abe) + { + ga = ga.mix16(a); + } + else + { + ga = ga.blend8(a, ga.eq16(x00800080).srl32(16).sll32(16)); + } + } + } + + // ReadMask + + if(sel.fwrite) + { + fm = m_global.fm; + } + + if(sel.zwrite) + { + zm = m_global.zm; + } + + // TestAlpha + + if(!TestAlpha(test, fm, zm, ga)) continue; + + // ColorTFX + + if(sel.fwrite) + { + GSVector4i af; + + switch(sel.tfx) + { + case TFX_MODULATE: + rb = rb.modulate16<1>(rbf).clamp8(); + break; + case TFX_DECAL: + break; + case TFX_HIGHLIGHT: + case TFX_HIGHLIGHT2: + af = gaf.yywwlh().srl16(7); + rb = rb.modulate16<1>(rbf).add16(af).clamp8(); + ga = ga.modulate16<1>(gaf).add16(af).clamp8().mix16(ga); + break; + case TFX_NONE: + rb = sel.iip ? rbf.srl16(7) : rbf; + break; + } + } + + // Fog + + if(sel.fwrite && sel.fge) + { + GSVector4i fog = !sel.sprite ? f : m_local.p.f; + + rb = m_global.frb.lerp16<0>(rb, fog); + ga = m_global.fga.lerp16<0>(ga, fog).mix16(ga); + } + + // ReadFrame + + if(sel.fb) + { + fa = fza_base->x + fza_offset->x; + + if(sel.rfb) + { + fd = GSVector4i::load((uint8*)m_global.vm + fa * 2, (uint8*)m_global.vm + fa * 2 + 16); + } + } + + // TestDestAlpha + + if(sel.date && (sel.fpsm == 0 || sel.fpsm == 2)) + { + if(sel.datm) + { + if(sel.fpsm == 2) + { + test |= fd.srl32(15) == GSVector4i::zero(); + } + else + { + test |= (~fd).sra32(31); + } + } + else + { + if(sel.fpsm == 2) + { + test |= fd.sll32(16).sra32(31); + } + else + { + test |= fd.sra32(31); + } + } + + if(test.alltrue()) continue; + } + + // WriteMask + + int fzm = 0; + + if(sel.fwrite) + { + fm |= test; + } + + if(sel.zwrite) + { + zm |= test; + } + + if(sel.fwrite && sel.zwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + } + else if(sel.fwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); + } + else if(sel.zwrite) + { + fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); + } + + // WriteZBuf + + if(sel.zwrite) + { + if(sel.ztest && sel.zpsm < 2) + { + zs = zs.blend8(zd, zm); + + if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); + if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + } + else + { + if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm); + if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm); + if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm); + if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm); + } + } + + // AlphaBlend + + if(sel.fwrite && (sel.abe || sel.aa1)) + { + GSVector4i rbs = rb, gas = ga, rbd, gad, a, mask; + + if(sel.aba != sel.abb && (sel.aba == 1 || sel.abb == 1 || sel.abc == 1) || sel.abd == 1) + { + switch(sel.fpsm) + { + case 0: + case 1: + rbd = fd.sll16(8).srl16(8); + gad = fd.srl16(8); + break; + case 2: + rbd = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); + gad = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); + break; + } + } + + if(sel.aba != sel.abb) + { + switch(sel.aba) + { + case 0: break; + case 1: rb = rbd; break; + case 2: rb = GSVector4i::zero(); break; + } + + switch(sel.abb) + { + case 0: rb = rb.sub16(rbs); break; + case 1: rb = rb.sub16(rbd); break; + case 2: break; + } + + if(!(sel.fpsm == 1 && sel.abc == 1)) + { + switch(sel.abc) + { + case 0: a = gas.yywwlh().sll16(7); break; + case 1: a = gad.yywwlh().sll16(7); break; + case 2: a = m_global.afix; break; + } + + rb = rb.modulate16<1>(a); + } + + switch(sel.abd) + { + case 0: rb = rb.add16(rbs); break; + case 1: rb = rb.add16(rbd); break; + case 2: break; + } + } + else + { + switch(sel.abd) + { + case 0: break; + case 1: rb = rbd; break; + case 2: rb = GSVector4i::zero(); break; + } + } + + if(sel.pabe) + { + mask = (gas << 8).sra32(31); + + rb = rbs.blend8(rb, mask); + } + + if(sel.aba != sel.abb) + { + switch(sel.aba) + { + case 0: break; + case 1: ga = gad; break; + case 2: ga = GSVector4i::zero(); break; + } + + switch(sel.abb) + { + case 0: ga = ga.sub16(gas); break; + case 1: ga = ga.sub16(gad); break; + case 2: break; + } + + if(!(sel.fpsm == 1 && sel.abc == 1)) + { + ga = ga.modulate16<1>(a); + } + + switch(sel.abd) + { + case 0: ga = ga.add16(gas); break; + case 1: ga = ga.add16(gad); break; + case 2: break; + } + } + + if(sel.pabe) + { + ga = gas.blend8(ga, mask >> 16); + } + else + { + if(sel.fpsm != 1) + { + ga = ga.mix16(gas); + } + } + } + + // WriteFrame + + if(sel.fwrite) + { + if(sel.colclamp == 0) + { + rb &= GSVector4i::x00ff(); + ga &= GSVector4i::x00ff(); + } + + if(sel.fpsm == 2 && sel.dthe) + { + int y = (top & 3) << 1; + + rb = rb.add16(m_global.dimx[0 + y]); + ga = ga.add16(m_global.dimx[1 + y]); + } + + GSVector4i fs = rb.upl16(ga).pu16(rb.uph16(ga)); + + if(sel.fba && sel.fpsm != 1) + { + fs |= GSVector4i::x80000000(); + } + + if(sel.fpsm == 2) + { + GSVector4i rb = fs & 0x00f800f8; + GSVector4i ga = fs & 0x8000f800; + + fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); + } + + if(sel.rfb) + { + fs = fs.blend(fd, fm); + } + + if(sel.rfb && sel.fpsm < 2) + { + if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); + if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + } + else + { + if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm); + if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm); + if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm); + if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm); + } + } + } + while(0); + + if(sel.edge) break; + + if(steps <= 0) break; + + // Step + + steps -= 4; + + fza_offset++; + + if(!sel.sprite) + { + if(sel.zb) + { + z += m_local.d4.z; + } + + if(sel.fwrite && sel.fge) + { + f = f.add16(m_local.d4.f); + } + } + + if(sel.fb) + { + if(sel.tfx != TFX_NONE) + { + if(sel.fst) + { + GSVector4i stq = GSVector4i::cast(m_local.d4.stq); + + s = GSVector4::cast(GSVector4i::cast(s) + stq.xxxx()); + + if(!sel.sprite || sel.mmin) + { + t = GSVector4::cast(GSVector4i::cast(t) + stq.yyyy()); + } + } + else + { + GSVector4 stq = m_local.d4.stq; + + s += stq.xxxx(); + t += stq.yyyy(); + q += stq.zzzz(); + } + } + } + + if(!(sel.tfx == TFX_DECAL && sel.tcc)) + { + if(sel.iip) + { + GSVector4i c = m_local.d4.c; + + rbf = rbf.add16(c.xxxx()).max_i16(GSVector4i::zero()); + gaf = gaf.add16(c.yyyy()).max_i16(GSVector4i::zero()); + } + } + + test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + } +} + +void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) +{ + uint32 zwrite = m_global.sel.zwrite; + uint32 edge = m_global.sel.edge; + + m_global.sel.zwrite = 0; + m_global.sel.edge = 1; + + DrawScanline(pixels, left, top, scan); + + m_global.sel.zwrite = zwrite; + m_global.sel.edge = edge; +} + +bool GSDrawScanline::TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga) +{ + GSScanlineSelector sel = m_global.sel; + + switch(sel.afail) + { + case AFAIL_FB_ONLY: + if(!sel.zwrite) return true; + break; + + case AFAIL_ZB_ONLY: + if(!sel.fwrite) return true; + break; + + case AFAIL_RGB_ONLY: + if(!sel.zwrite && sel.fpsm == 1) return true; + break; + } + + GSVector4i t; + + switch(sel.atst) + { + case ATST_NEVER: + t = GSVector4i::xffffffff(); + break; + + case ATST_ALWAYS: + return true; + + case ATST_LESS: + case ATST_LEQUAL: + t = (ga >> 16) > m_global.aref; + break; + + case ATST_EQUAL: + t = (ga >> 16) != m_global.aref; + break; + + case ATST_GEQUAL: + case ATST_GREATER: + t = (ga >> 16) < m_global.aref; + break; + + case ATST_NOTEQUAL: + t = (ga >> 16) == m_global.aref; + break; + + default: + __assume(0); + } + + switch(sel.afail) + { + case AFAIL_KEEP: + test |= t; + if(test.alltrue()) return false; + break; + + case AFAIL_FB_ONLY: + zm |= t; + break; + + case AFAIL_ZB_ONLY: + fm |= t; + break; + + case AFAIL_RGB_ONLY: + zm |= t; + fm |= t & GSVector4i::xff000000(); + break; + + default: + __assume(0); + } + + return true; +} + +static const int s_offsets[4] = {0, 2, 8, 10}; + +void GSDrawScanline::WritePixel(const GSVector4i& src, int addr, int i, uint32 psm) +{ + uint8* dst = (uint8*)m_global.vm + addr * 2 + s_offsets[i] * 2; + + switch(psm) + { + case 0: + *(uint32*)dst = src.u32[i]; + break; + case 1: + *(uint32*)dst = (src.u32[i] & 0xffffff) | (*(uint32*)dst & 0xff000000); + break; + case 2: + *(uint16*)dst = src.u16[i * 2]; + break; + } +} + +#endif + void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) { ASSERT(r.y >= 0); diff --git a/plugins/GSdx/GSDrawScanline.h b/plugins/GSdx/GSDrawScanline.h index e3953c0ed6..20fb4c3072 100644 --- a/plugins/GSdx/GSDrawScanline.h +++ b/plugins/GSdx/GSDrawScanline.h @@ -35,8 +35,6 @@ class GSDrawScanline : public IDrawScanline GSCodeGeneratorFunctionMap m_sp_map; GSCodeGeneratorFunctionMap m_ds_map; - void DrawRect(const GSVector4i& r, const GSVertexSW& v); - template void DrawRectT(const int* RESTRICT row, const int* RESTRICT col, const GSVector4i& r, uint32 c, uint32 m); @@ -54,5 +52,21 @@ public: void BeginDraw(const void* param); void EndDraw(const GSRasterizerStats& stats, uint64 frame); - void PrintStats() {m_ds_map.PrintStats();} + void PrintStats(); + + void DrawRect(const GSVector4i& r, const GSVertexSW& v); + +#ifndef JIT_DRAW + + void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan); + void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan); + void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan); + + bool IsEdge() const {return m_global.sel.aa1;} + bool IsRect() const {return m_global.sel.IsSolidRect();} + + bool TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga); + void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm); + +#endif }; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 8cb5b894c6..4b5b6c746d 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -30,9 +30,6 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator { void operator = (const GSDrawScanlineCodeGenerator&); - static const GSVector4i m_test[8]; - static const GSVector4 m_log2_coef[4]; - GSScanlineSelector m_sel; GSScanlineLocalData& m_local; @@ -84,4 +81,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator public: GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); + + static const GSVector4i m_test[8]; + static const GSVector4 m_log2_coef[4]; }; diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index 36797c7f5e..235a13f355 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -1648,8 +1648,8 @@ void GSDrawScanlineCodeGenerator::WriteFrame() if(m_sel.colclamp == 0) { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; + // c[0] &= 0x00ff00ff; + // c[1] &= 0x00ff00ff; vpcmpeqd(xmm15, xmm15); vpsrlw(xmm15, 8); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index 8a8dbb9d66..1d0596ea89 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -343,6 +343,8 @@ void GSDrawScanlineCodeGenerator::Init() if(m_sel.edge) { + // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + vpshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); vpsrlw(xmm3, 9); @@ -1184,34 +1186,35 @@ return; vmovq(xmm4, ptr[&m_local.gd->t.minmax]); - vmovq(xmm2, ptr[&m_local.temp.uv[0].u32[0]]); + vmovdqa(xmm2, ptr[&m_local.temp.uv[0]]); + vmovdqa(xmm5, xmm2); + vmovdqa(xmm3, ptr[&m_local.temp.uv[1]]); + vmovdqa(xmm6, xmm3); + vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); vpsrad(xmm2, xmm0); vpsrlw(xmm1, xmm4, xmm0); vmovq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); - vmovq(xmm3, ptr[&m_local.temp.uv[0].u32[2]]); vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); - vpsrad(xmm3, xmm0); + vpsrad(xmm5, xmm0); vpsrlw(xmm1, xmm4, xmm0); vmovq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); - vmovq(xmm5, ptr[&m_local.temp.uv[1].u32[0]]); vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); - vpsrad(xmm5, xmm0); + vpsrad(xmm3, xmm0); vpsrlw(xmm1, xmm4, xmm0); vmovq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); - vmovq(xmm6, ptr[&m_local.temp.uv[1].u32[2]]); vmovd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); vpsrad(xmm6, xmm0); vpsrlw(xmm1, xmm4, xmm0); vmovq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); vpunpckldq(xmm2, xmm3); - vpunpckldq(xmm5, xmm6); - vpunpckhqdq(xmm3, xmm2, xmm5); - vpunpcklqdq(xmm2, xmm5); + vpunpckhdq(xmm5, xmm6); + vpunpckhdq(xmm3, xmm2, xmm5); + vpunpckldq(xmm2, xmm5); vmovdqa(ptr[&m_local.temp.uv[0]], xmm2); vmovdqa(ptr[&m_local.temp.uv[1]], xmm3); @@ -2573,8 +2576,8 @@ void GSDrawScanlineCodeGenerator::WriteFrame() if(m_sel.colclamp == 0) { - // c[0] &= 0x000000ff; - // c[1] &= 0x000000ff; + // c[0] &= 0x00ff00ff; + // c[1] &= 0x00ff00ff; vpcmpeqd(xmm7, xmm7); vpsrlw(xmm7, 8); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index 5eac262f19..a081767a58 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -343,6 +343,8 @@ void GSDrawScanlineCodeGenerator::Init() if(m_sel.edge) { + // m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9); + pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); psrlw(xmm3, 9); @@ -1232,28 +1234,29 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() movq(xmm4, ptr[&m_local.gd->t.minmax]); - movq(xmm2, ptr[&m_local.temp.uv[0].u32[0]]); + movdqa(xmm2, ptr[&m_local.temp.uv[0]]); + movdqa(xmm5, xmm2); + movdqa(xmm3, ptr[&m_local.temp.uv[1]]); + movdqa(xmm6, xmm3); + movd(xmm0, ptr[&m_local.temp.lod.i.u32[0]]); psrad(xmm2, xmm0); movdqa(xmm1, xmm4); psrlw(xmm1, xmm0); movq(ptr[&m_local.temp.uv_minmax[0].u32[0]], xmm1); - movq(xmm3, ptr[&m_local.temp.uv[0].u32[2]]); movd(xmm0, ptr[&m_local.temp.lod.i.u32[1]]); - psrad(xmm3, xmm0); + psrad(xmm5, xmm0); movdqa(xmm1, xmm4); psrlw(xmm1, xmm0); movq(ptr[&m_local.temp.uv_minmax[1].u32[0]], xmm1); - movq(xmm5, ptr[&m_local.temp.uv[1].u32[0]]); movd(xmm0, ptr[&m_local.temp.lod.i.u32[2]]); - psrad(xmm5, xmm0); + psrad(xmm3, xmm0); movdqa(xmm1, xmm4); psrlw(xmm1, xmm0); movq(ptr[&m_local.temp.uv_minmax[0].u32[2]], xmm1); - movq(xmm6, ptr[&m_local.temp.uv[1].u32[2]]); movd(xmm0, ptr[&m_local.temp.lod.i.u32[3]]); psrad(xmm6, xmm0); movdqa(xmm1, xmm4); @@ -1261,10 +1264,10 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() movq(ptr[&m_local.temp.uv_minmax[1].u32[2]], xmm1); punpckldq(xmm2, xmm3); - punpckldq(xmm5, xmm6); + punpckhdq(xmm5, xmm6); movdqa(xmm3, xmm2); - punpcklqdq(xmm2, xmm5); - punpckhqdq(xmm3, xmm5); + punpckldq(xmm2, xmm5); + punpckhdq(xmm3, xmm5); movdqa(ptr[&m_local.temp.uv[0]], xmm2); movdqa(ptr[&m_local.temp.uv[1]], xmm3); diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index fcd66893af..5dd257377e 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -277,6 +277,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) int i = (y0011 == y1221).mask() & 7; + // if(i == 0) => y0 < y1 < y2 + // if(i == 1) => y0 == y1 < y2 + // if(i == 4) => y0 < y1 == y2 + if(i == 7) return; // y0 == y1 == y2 GSVector4 tbf = y0011.xzxz(y1221).ceil(); @@ -338,14 +342,25 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) dscan.c = _r.ywyw(_g).hsub(_b.ywyw(_a)); // dy0 * r1 - dy1 * r0, dy0 * g1 - dy1 * g0, dy0 * b1 - dy1 * b0, dy0 * a1 - dy1 * a0 dedge.c = _r.zxzx(_g).hsub(_b.zxzx(_a)); // dx1 * r0 - dx0 * r1, dx1 * g0 - dx0 * g1, dx1 * b0 - dx0 * b1, dx1 * a0 - dx0 * a1 - GSVector4 x0; - - switch(i) + if(i & 1) { - case 0: // y0 < y1 < y2 - case 4: // y0 < y1 == y2 + if(tb.y < tb.w) + { + edge = v[1 - j]; - x0 = v[0].p.xxxx(); + GSVector4 dy = tbmax.xxxx() - edge.p.yyyy(); + + edge.p = edge.p.insert<0, 1>(v[j].p); + dedge.p = ddx[2 - (j << 1)].yzzw(dedge.p); + + edge += dedge * dy; + + DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, v[1 - j].p.xxxx()); + } + } + else + { + GSVector4 x0 = v[0].p.xxxx(); if(tb.x < tb.z) { @@ -374,30 +389,6 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices) DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v[1].p.xxxx()); } - - break; - - case 1: // y0 == y1 < y2 - - if(tb.y < tb.w) - { - edge = v[1 - j]; - - GSVector4 dy = tbmax.xxxx() - edge.p.yyyy(); - - edge.p = edge.p.insert<0, 1>(v[j].p); - dedge.p = ddx[2 - (j << 1)].yzzw(dedge.p); - - edge += dedge * dy; - - DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, v[1 - j].p.xxxx()); - } - - break; - - default: - - __assume(0); } Flush(v, dscan); diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 3976c9991b..10703d9aca 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -27,6 +27,9 @@ #include "GSThread.h" #include "GSAlignedClass.h" +// +#define JIT_DRAW + __aligned(class, 32) GSRasterizerData { public: @@ -62,11 +65,22 @@ public: virtual void EndDraw(const GSRasterizerStats& stats, uint64 frame) = 0; virtual void PrintStats() = 0; +#ifdef JIT_DRAW + __forceinline void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) {m_sp(vertices, dscan);} __forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) {m_ds(pixels, left, top, scan);} __forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) {m_de(pixels, left, top, scan);} __forceinline void DrawRect(const GSVector4i& r, const GSVertexSW& v) {(this->*m_dr)(r, v);} +#else + + virtual void SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) = 0; + virtual void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) = 0; + virtual void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) = 0; + virtual void DrawRect(const GSVector4i& r, const GSVertexSW& v) = 0; + +#endif + __forceinline bool IsEdge() const {return m_de != NULL;} __forceinline bool IsRect() const {return m_dr != NULL;} }; diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index 46cfb9d008..ee210cf3c9 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -146,8 +146,9 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has struct { - GSVector4i z, f; - GSVector4i s, t, q; + GSVector4 z; + GSVector4i f; + GSVector4 s, t, q; GSVector4i rb, ga; GSVector4i zs, zd; GSVector4i uf, vf; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.h b/plugins/GSdx/GSSetupPrimCodeGenerator.h index ba43dda5b7..82ddf5c9c7 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.h +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.h @@ -28,8 +28,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator { void operator = (const GSSetupPrimCodeGenerator&); - static const GSVector4 m_shift[5]; - GSScanlineSelector m_sel; GSScanlineLocalData& m_local; @@ -43,4 +41,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator public: GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize); + + static const GSVector4 m_shift[5]; }; diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 691bee02f9..e9f3166fb6 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -151,7 +151,7 @@ public: this->m = m; } - __forceinline explicit GSVector4i(const GSVector4& v); + __forceinline explicit GSVector4i(const GSVector4& v, bool truncate = true); __forceinline void operator = (const GSVector4i& v) { @@ -796,41 +796,81 @@ public: return GSVector4i(_mm_srai_epi16(m, i)); } + __forceinline GSVector4i sra16(__m128i i) const + { + return GSVector4i(_mm_sra_epi16(m, i)); + } + __forceinline GSVector4i sra32(int i) const { return GSVector4i(_mm_srai_epi32(m, i)); } + __forceinline GSVector4i sra32(__m128i i) const + { + return GSVector4i(_mm_sra_epi32(m, i)); + } + __forceinline GSVector4i sll16(int i) const { return GSVector4i(_mm_slli_epi16(m, i)); } + __forceinline GSVector4i sll16(__m128i i) const + { + return GSVector4i(_mm_sll_epi16(m, i)); + } + __forceinline GSVector4i sll32(int i) const { return GSVector4i(_mm_slli_epi32(m, i)); } + __forceinline GSVector4i sll32(__m128i i) const + { + return GSVector4i(_mm_sll_epi32(m, i)); + } + __forceinline GSVector4i sll64(int i) const { return GSVector4i(_mm_slli_epi64(m, i)); } + __forceinline GSVector4i sll64(__m128i i) const + { + return GSVector4i(_mm_sll_epi64(m, i)); + } + __forceinline GSVector4i srl16(int i) const { return GSVector4i(_mm_srli_epi16(m, i)); } + __forceinline GSVector4i srl16(__m128i i) const + { + return GSVector4i(_mm_srl_epi16(m, i)); + } + __forceinline GSVector4i srl32(int i) const { return GSVector4i(_mm_srli_epi32(m, i)); } + __forceinline GSVector4i srl32(__m128i i) const + { + return GSVector4i(_mm_srl_epi32(m, i)); + } + __forceinline GSVector4i srl64(int i) const { return GSVector4i(_mm_srli_epi64(m, i)); } + __forceinline GSVector4i srl64(__m128i i) const + { + return GSVector4i(_mm_srl_epi64(m, i)); + } + __forceinline GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); @@ -3109,9 +3149,9 @@ public: VECTOR4_SHUFFLE_1(w, 3) }; -__forceinline GSVector4i::GSVector4i(const GSVector4& v) +__forceinline GSVector4i::GSVector4i(const GSVector4& v, bool truncate) { - m = _mm_cvttps_epi32(v); + m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); } __forceinline GSVector4::GSVector4(const GSVector4i& v) @@ -3158,7 +3198,7 @@ public: __forceinline GSVector8i() {} - __forceinline explicit GSVector8i(const GSVector8& v); + __forceinline explicit GSVector8i(const GSVector8& v, bool truncate = true); static GSVector8i cast(const GSVector8& v); @@ -4158,9 +4198,9 @@ public: #if _M_SSE >= 0x500 -__forceinline GSVector8i::GSVector8i(const GSVector8& v) +__forceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate) { - m = _mm256_cvttps_epi32(v); + m = truncate ? _mm256_cvttps_epi32(v) : _mm256_cvtps_epi32(v); } __forceinline GSVector8::GSVector8(const GSVector8i& v) @@ -4180,10 +4220,10 @@ __forceinline GSVector8 GSVector8::cast(const GSVector8i& v) #else -__forceinline GSVector8i::GSVector8i(const GSVector8& v) +__forceinline GSVector8i::GSVector8i(const GSVector8& v, bool truncate) { - m[0] = _mm_cvttps_epi32(v.m[0]); - m[1] = _mm_cvttps_epi32(v.m[1]); + m[0] = truncate ? _mm_cvttps_epi32(v.m[0]) : _mm_cvtps_epi32(v.m[0]); + m[1] = truncate ? _mm_cvttps_epi32(v.m[1]) : _mm_cvtps_epi32(v.m[1]); } __forceinline GSVector8::GSVector8(const GSVector8i& v)