diff --git a/plugins/GSdx/GPUDrawScanline.cpp b/plugins/GSdx/GPUDrawScanline.cpp index 9fd5ac2ac2..0bc5896f5e 100644 --- a/plugins/GSdx/GPUDrawScanline.cpp +++ b/plugins/GSdx/GPUDrawScanline.cpp @@ -90,17 +90,406 @@ void GPUDrawScanline::PrintStats() void GPUDrawScanline::SetupPrim(const GSVertexSW* vertices, const GSVertexSW& dscan) { - // TODO + GPUScanlineSelector sel = m_global.sel; + + const GSVector4* shift = GPUSetupPrimCodeGenerator::m_shift; + + if(sel.tme && !sel.twin) + { + if(sel.sprite) + { + GSVector4i t = (GSVector4i(vertices[1].t) >> 8) - GSVector4i::x00000001(); + + t = t.ps32(t); + t = t.upl16(t); + + m_local.twin[2].u = t.xxxx(); + m_local.twin[2].v = t.yyyy(); + } + else + { + // TODO: not really needed + + m_local.twin[2].u = GSVector4i::x00ff(); + m_local.twin[2].v = GSVector4i::x00ff(); + } + } + + if(sel.tme || sel.iip && sel.tfx != 3) + { + GSVector4 dt = dscan.t; + GSVector4 dc = dscan.c; + + GSVector4i dtc8 = GSVector4i(dt * shift[0]).ps32(GSVector4i(dc * shift[0])); + + if(sel.tme) + { + m_local.d8.st = dtc8.upl16(dtc8); + } + + if(sel.iip && sel.tfx != 3) + { + m_local.d8.c = dtc8.uph16(dtc8); + } + + if(sel.tme) + { + GSVector4 dtx = dt.xxxx(); + GSVector4 dty = dt.yyyy(); + + m_local.d.s = GSVector4i(dtx * shift[1]).ps32(GSVector4i(dtx * shift[2])); + m_local.d.t = GSVector4i(dty * shift[1]).ps32(GSVector4i(dty * shift[2])); + } + + if(sel.iip && sel.tfx != 3) + { + GSVector4 dcx = dc.xxxx(); + GSVector4 dcy = dc.yyyy(); + GSVector4 dcz = dc.zzzz(); + + m_local.d.r = GSVector4i(dcx * shift[1]).ps32(GSVector4i(dcx * shift[2])); + m_local.d.g = GSVector4i(dcy * shift[1]).ps32(GSVector4i(dcy * shift[2])); + m_local.d.b = GSVector4i(dcz * shift[1]).ps32(GSVector4i(dcz * shift[2])); + } + } } void GPUDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) { - // TODO + // TODO: not tested yet, probably bogus + + GPUScanlineSelector sel = m_global.sel; + + GSVector4i s, t; + GSVector4i uf, vf; + GSVector4i rf, gf, bf; + GSVector4i dither; + + // Init + + uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left; + + int steps = pixels - 8; + + if(sel.dtd) + { + dither = GSVector4i::load(&GPUDrawScanlineCodeGenerator::m_dither[top & 3][left & 3]); + } + + if(sel.tme) + { + GSVector4i vt = GSVector4i(scan.t).xxzzl(); + + s = vt.xxxx().add16(m_local.d.s); + t = vt.yyyy(); + + if(!sel.sprite) + { + t = t.add16(m_local.d.t); + } + else + { + if(sel.ltf) + { + vf = t.sll16(1).srl16(1); + } + } + } + + if(sel.tfx != 3) + { + GSVector4i vc = GSVector4i(scan.c).xxzzlh(); + + rf = vc.xxxx(); + gf = vc.yyyy(); + bf = vc.zzzz(); + + if(sel.iip) + { + rf = rf.add16(m_local.d.r); + gf = gf.add16(m_local.d.g); + bf = bf.add16(m_local.d.b); + } + } + + while(1) + { + do + { + GSVector4i test = GPUDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + + GSVector4i fd = GSVector4i::load(fb, fb + 8); + + GSVector4i r, g, b, a; + + // TestMask + + if(sel.me) + { + test |= fd.sra16(15); + + if(test.alltrue()) continue; + } + + // SampleTexture + + if(sel.tme) + { + GSVector4i u0, v0, u1, v1; + GSVector4i addr00, addr01, addr10, addr11; + GSVector4i c00, c01, c10, c11; + + if(sel.ltf) + { + u0 = s.sub16(GSVector4i(0x00200020)); // - 0.125f + v0 = t.sub16(GSVector4i(0x00200020)); // - 0.125f + + uf = u0.sll16(8).srl16(1); + vf = v0.sll16(8).srl16(1);; + } + else + { + u0 = s; + v0 = t; + } + + u0 = u0.srl16(8); + v0 = v0.srl16(8); + + if(sel.ltf) + { + u1 = u0.add16(GSVector4i::x0001()); + v1 = v0.add16(GSVector4i::x0001()); + + if(sel.twin) + { + u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u); + v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v); + u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u); + v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v); + } + else + { + u0 = u0.min_i16(m_local.twin[2].u); + v0 = v0.min_i16(m_local.twin[2].v); + u1 = u1.min_i16(m_local.twin[2].u); + v1 = v1.min_i16(m_local.twin[2].v); + } + + addr00 = v0.sll16(8) | u0; + addr01 = v0.sll16(8) | u1; + addr10 = v1.sll16(8) | u0; + addr11 = v1.sll16(8) | u1; + + // TODO + + if(sel.tlu) + { + c00 = addr00.gather16_16((const uint16*)m_global.vm, m_global.clut); + c01 = addr01.gather16_16((const uint16*)m_global.vm, m_global.clut); + c10 = addr10.gather16_16((const uint16*)m_global.vm, m_global.clut); + c11 = addr11.gather16_16((const uint16*)m_global.vm, m_global.clut); + } + else + { + c00 = addr00.gather16_16((const uint16*)m_global.vm); + c01 = addr01.gather16_16((const uint16*)m_global.vm); + c10 = addr10.gather16_16((const uint16*)m_global.vm); + c11 = addr11.gather16_16((const uint16*)m_global.vm); + } + + GSVector4i r00 = c00.sll16(11).srl16(8); + GSVector4i r01 = c01.sll16(11).srl16(8); + GSVector4i r10 = c10.sll16(11).srl16(8); + GSVector4i r11 = c11.sll16(11).srl16(8); + + r00 = r00.lerp16<0>(r01, uf); + r10 = r10.lerp16<0>(r11, uf); + + GSVector4i g00 = c00.sll16(6).srl16(11).sll16(3); + GSVector4i g01 = c01.sll16(6).srl16(11).sll16(3); + GSVector4i g10 = c10.sll16(6).srl16(11).sll16(3); + GSVector4i g11 = c11.sll16(6).srl16(11).sll16(3); + + g00 = g00.lerp16<0>(g01, uf); + g10 = g10.lerp16<0>(g11, uf); + + GSVector4i b00 = c00.sll16(1).srl16(11).sll16(3); + GSVector4i b01 = c01.sll16(1).srl16(11).sll16(3); + GSVector4i b10 = c10.sll16(1).srl16(11).sll16(3); + GSVector4i b11 = c11.sll16(1).srl16(11).sll16(3); + + b00 = b00.lerp16<0>(b01, uf); + b10 = b10.lerp16<0>(b11, uf); + + GSVector4i a00 = c00.sra16(15).sll16(8); + GSVector4i a01 = c01.sra16(15).sll16(8); + GSVector4i a10 = c10.sra16(15).sll16(8); + GSVector4i a11 = c11.sra16(15).sll16(8); + + a00 = a00.lerp16<0>(a01, uf); + a10 = a10.lerp16<0>(a11, uf); + + r = r00.lerp16<0>(r10, vf); + g = g00.lerp16<0>(g10, vf); + b = b00.lerp16<0>(b10, vf); + a = a00.lerp16<0>(a10, vf); + + test |= (r | g | b | a).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect) + + a = a.gt16(GSVector4i::zero()); + } + else + { + if(sel.twin) + { + u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u); + v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v); + } + else + { + u0 = u0.min_i16(m_local.twin[2].u); + v0 = v0.min_i16(m_local.twin[2].v); + } + + addr00 = v0.sll16(8) | u0; + + // TODO + + if(sel.tlu) + { + c00 = addr00.gather16_16((const uint16*)m_global.vm, m_global.clut); + } + else + { + c00 = addr00.gather16_16((const uint16*)m_global.vm); + } + + r = (c00 << 3) & 0x00f800f8; + g = (c00 >> 2) & 0x00f800f8; + b = (c00 >> 7) & 0x00f800f8; + a = c00.sra16(15); + + test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels + } + } + + // ColorTFX + + switch(sel.tfx) + { + case 0: // none (tfx = 0) + case 1: // none (tfx = tge) + r = rf.srl16(7); + g = gf.srl16(7); + b = bf.srl16(7); + break; + case 2: // modulate (tfx = tme | tge) + r = r.modulate16<1>(rf).clamp8(); + g = g.modulate16<1>(gf).clamp8(); + b = b.modulate16<1>(bf).clamp8(); + break; + case 3: // decal (tfx = tme) + break; + default: + __assume(0); + } + + // AlphaBlend + + if(sel.abe) + { + GSVector4i rs = r; + GSVector4i gs = g; + GSVector4i bs = b; + GSVector4i rd = (fd & 0x001f001f) << 3; + GSVector4i gd = (fd & 0x03e003e0) >> 2; + GSVector4i bd = (fd & 0x7c007c00) >> 7; + + switch(sel.abr) + { + case 0: + r = rd.avg8(rs); + g = gd.avg8(gs); + b = bd.avg8(bs); + break; + case 1: + r = rd.addus8(rs); + g = gd.addus8(gs); + b = bd.addus8(bs); + break; + case 2: + r = rd.subus8(rs); + g = gd.subus8(gs); + b = bd.subus8(bs); + break; + case 3: + r = rd.addus8(rs.srl16(2)); + g = gd.addus8(gs.srl16(2)); + b = bd.addus8(bs.srl16(2)); + break; + default: + __assume(0); + } + + if(sel.tme) + { + r = rs.blend8(rd, a); + g = gs.blend8(gd, a); + b = bs.blend8(bd, a); + } + } + + // Dither + + if(sel.dtd) + { + r = r.addus8(dither); + g = g.addus8(dither); + b = b.addus8(dither); + } + + // WriteFrame + + GSVector4i fs = r | g | b | (sel.md ? GSVector4i(0x80008000) : sel.tme ? a : GSVector4i::zero()); + + fs = fs.blend8(fd, test); + + GSVector4i::store(fb, fb + 8, fs); + } + while(0); + + if(steps <= 0) break; + + steps -= 8; + + fb += 8; + + if(sel.tme) + { + GSVector4i st = m_local.d8.st; + + s = s.add16(st.xxxx()); + t = t.add16(st.yyyy()); + } + + if(sel.tfx != 3) // != decal + { + if(sel.iip) + { + GSVector4i c = m_local.d8.c; + + rf = rf.add16(c.xxxx()); + gf = gf.add16(c.yyyy()); + bf = bf.add16(c.zzzz()); + } + } + } } void GPUDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) { - // TODO + ASSERT(0); } void GPUDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) diff --git a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp index 1e2dbfd629..3378804fac 100644 --- a/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GPUDrawScanlineCodeGenerator.cpp @@ -121,7 +121,7 @@ void GPUDrawScanlineCodeGenerator::Init() { mov(eax, dword[esp + _top]); - // uint16* fb = &m_local.vm[(top << (10 + m_sel.scalex)) + left]; + // uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left; mov(edi, eax); shl(edi, 10 + m_sel.scalex); @@ -134,7 +134,7 @@ void GPUDrawScanlineCodeGenerator::Init() if(m_sel.dtd) { - // dither = GSVector4i::load(&s_dither[top & 3][left & 3]); + // dither = GSVector4i::load(&m_dither[top & 3][left & 3]); and(eax, 3); shl(eax, 5); @@ -741,7 +741,7 @@ void GPUDrawScanlineCodeGenerator::AlphaBlend() // xmm7 = test // xmm0, xmm2 = free - // GSVector4i r = (d & 0x001f001f) << 3; + // GSVector4i r = (fd & 0x001f001f) << 3; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); // 0x001f diff --git a/plugins/GSdx/GPUDrawScanlineCodeGenerator.h b/plugins/GSdx/GPUDrawScanlineCodeGenerator.h index 8705d5057d..22b5b8eb49 100644 --- a/plugins/GSdx/GPUDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GPUDrawScanlineCodeGenerator.h @@ -30,9 +30,6 @@ class GPUDrawScanlineCodeGenerator : public GSCodeGenerator { void operator = (const GPUDrawScanlineCodeGenerator&); - static const GSVector4i m_test[8]; - static const uint16 m_dither[4][16]; - GPUScanlineSelector m_sel; GPUScanlineLocalData& m_local; @@ -57,4 +54,7 @@ class GPUDrawScanlineCodeGenerator : public GSCodeGenerator public: GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize); + + static const GSVector4i m_test[8]; + static __aligned(const uint16, 32) m_dither[4][16]; }; \ No newline at end of file diff --git a/plugins/GSdx/GPUSetupPrimCodeGenerator.h b/plugins/GSdx/GPUSetupPrimCodeGenerator.h index 1dacffaab8..b4bb4bab14 100644 --- a/plugins/GSdx/GPUSetupPrimCodeGenerator.h +++ b/plugins/GSdx/GPUSetupPrimCodeGenerator.h @@ -28,8 +28,6 @@ class GPUSetupPrimCodeGenerator : public GSCodeGenerator { void operator = (const GPUSetupPrimCodeGenerator&); - static const GSVector4 m_shift[3]; - GPUScanlineSelector m_sel; GPUScanlineLocalData& m_local; @@ -37,4 +35,6 @@ class GPUSetupPrimCodeGenerator : public GSCodeGenerator public: GPUSetupPrimCodeGenerator(void* param, uint32 key, void* code, size_t maxsize); + + static const GSVector4 m_shift[3]; }; \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index ddc4d6e9b1..a48e441d30 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -412,6 +412,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i u, v, uv[2]; GSVector4i lodi, lodf; GSVector4i minuv, maxuv; + GSVector4i addr00, addr01, addr10, addr11; + GSVector4i c00, c01, c10, c11; if(sel.mmin) { @@ -529,12 +531,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i y1 = uv1.uph16() << (sel.tw + 3); GSVector4i x1 = uv1.upl16(); - GSVector4i addr00 = y0 + x0; - GSVector4i addr01 = y0 + x1; - GSVector4i addr10 = y1 + x0; - GSVector4i addr11 = y1 + x1; - - GSVector4i c00, c01, c10, c11; + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; if(sel.tlu) { @@ -582,9 +582,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - GSVector4i addr00 = y0 + x0; - - GSVector4i c00; + addr00 = y0 + x0; if(sel.tlu) { @@ -654,12 +652,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i y1 = uv1.uph16() << (sel.tw + 3); GSVector4i x1 = uv1.upl16(); - GSVector4i addr00 = y0 + x0; - GSVector4i addr01 = y0 + x1; - GSVector4i addr10 = y1 + x0; - GSVector4i addr11 = y1 + x1; - - GSVector4i c00, c01, c10, c11; + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; if(sel.tlu) { @@ -707,9 +703,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - GSVector4i addr00 = y0 + x0; - - GSVector4i c00; + addr00 = y0 + x0; if(sel.tlu) { @@ -797,12 +791,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i y1 = uv1.uph16() << (sel.tw + 3); GSVector4i x1 = uv1.upl16(); - GSVector4i addr00 = y0 + x0; - GSVector4i addr01 = y0 + x1; - GSVector4i addr10 = y1 + x0; - GSVector4i addr11 = y1 + x1; - - GSVector4i c00, c01, c10, c11; + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; if(sel.tlu) { @@ -844,9 +836,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - GSVector4i addr00 = y0 + x0; - - GSVector4i c00; + addr00 = y0 + x0; if(sel.tlu) { diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index c11bc40f53..cbc9869a53 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -552,7 +552,9 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) for(int i = 0, j = m_count; i < j; i++) { - v[i].t *= w; + GSVector4 t = v[i].t; + + v[i].t = (t * w).xyzw(t); } } } @@ -562,10 +564,13 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) for(int i = 0, j = m_count; i < j; i += 2) { - GSVector4 w = v[i + 1].t.zzzz().rcpnr(); + GSVector4 t0 = v[i + 0].t; + GSVector4 t1 = v[i + 1].t; - v[i + 0].t *= w; - v[i + 1].t *= w; + GSVector4 w = t1.zzzz().rcpnr(); + + v[i + 0].t = (t0 * w).xyzw(t0); + v[i + 1].t = (t1 * w).xyzw(t1); } } } @@ -582,7 +587,9 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) for(int i = 0, j = m_count; i < j; i++) { - v[i].t -= half; + GSVector4 t = v[i].t; + + v[i].t = (t - half).xyzw(t); } } }