diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h index 0a835f04f3..ef5e4885a3 100644 --- a/plugins/GSdx/GSBlock.h +++ b/plugins/GSdx/GSBlock.h @@ -884,7 +884,7 @@ public: } } - static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs + template static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs { const GSVector4i* s = (const GSVector4i*)src; @@ -895,44 +895,36 @@ public: GSVector4i bm = m_xxbx; GSVector4i l, h; - if(TEXA.AEM) + for(int i = 0; i < 8; i++, dst += dstpitch) { - for(int i = 0; i < 8; i++, dst += dstpitch) + GSVector4i v0 = s[i * 2 + 0]; + + l = v0.upl16(v0); + h = v0.uph16(v0); + + if(AEM) { - GSVector4i v0 = s[i * 2 + 0]; - - l = v0.upl16(v0); - h = v0.uph16(v0); - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); + } + else + { + ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); + ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); + } - GSVector4i v1 = s[i * 2 + 1]; + GSVector4i v1 = s[i * 2 + 1]; - l = v1.upl16(v1); - h = v1.uph16(v1); + l = v1.upl16(v1); + h = v1.uph16(v1); + if(AEM) + { ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); ((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); } - } - else - { - for(int i = 0; i < 8; i++, dst += dstpitch) + else { - GSVector4i v0 = s[i * 2 + 0]; - - l = v0.upl16(v0); - h = v0.uph16(v0); - - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); - ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); - - GSVector4i v1 = s[i * 2 + 1]; - - l = v1.upl16(v1); - h = v1.uph16(v1); - ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); ((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); } @@ -1432,6 +1424,56 @@ public: } } } + template __forceinline static GSVector4i Expand16to32(const GSVector4i& c, const GSVector4i& TA0, const GSVector4i& TA1) + { + return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? TA0.blend8(TA1, c.sra16(15)).andnot(c == GSVector4i::zero()) : TA0.blend(TA1, c.sra16(15))); + } + + template __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) + { + #if 0 // not faster + + const GSVector4i* s = (const GSVector4i*)src; + + GSVector4i TA0(TEXA.TA0 << 24); + GSVector4i TA1(TEXA.TA1 << 24); + + for(int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector4i v0 = s[i * 4 + 0]; + GSVector4i v1 = s[i * 4 + 1]; + GSVector4i v2 = s[i * 4 + 2]; + GSVector4i v3 = s[i * 4 + 3]; + + GSVector4i::sw16(v0, v1, v2, v3); + GSVector4i::sw32(v0, v1, v2, v3); + GSVector4i::sw16(v0, v2, v1, v3); + + GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; + + d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); + d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); + d0[2] = Expand16to32(v1.upl16(v1), TA0, TA1); + d0[3] = Expand16to32(v1.uph16(v1), TA0, TA1); + + GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; + + d1[0] = Expand16to32(v2.upl16(v2), TA0, TA1); + d1[1] = Expand16to32(v2.uph16(v2), TA0, TA1); + d1[2] = Expand16to32(v3.upl16(v3), TA0, TA1); + d1[3] = Expand16to32(v3.uph16(v3), TA0, TA1); + } + + #else + + __aligned(uint16, 32) block[16 * 8]; + + ReadBlock16(src, (uint8*)block, sizeof(block) / 8); + + ExpandBlock16(block, dst, dstpitch, TEXA); + + #endif + } __forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index a151d1038b..af79af75f0 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -91,6 +91,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) sel.fb = m_global.sel.fb; sel.zb = m_global.sel.zb; sel.zoverflow = m_global.sel.zoverflow; + sel.notest = m_global.sel.notest; m_sp = m_sp_map[sel]; } @@ -272,17 +273,24 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS // Init - int skip = left & 3; + int skip, steps; - left -= skip; - - int steps = pixels + skip - 4; + if(!sel.notest) + { + skip = left & 3; + steps = pixels + skip - 4; + left -= skip; + test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + } + else + { + skip = 0; + steps = pixels - 4; + } const GSVector2i* fza_base = &m_global.fzbr[top]; const GSVector2i* fza_offset = &m_global.fzbc[left >> 2]; - test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; - if(sel.prim != GS_SPRITE_CLASS) { if(sel.fwrite && sel.fge) @@ -1000,27 +1008,30 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS int fzm = 0; - if(sel.fwrite) + if(!sel.notest) { - fm |= test; - } + if(sel.fwrite) + { + fm |= test; + } - if(sel.zwrite) - { - zm |= test; - } + if(sel.zwrite) + { + zm |= test; + } - if(sel.fwrite && sel.zwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - } - else if(sel.fwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); - } - else if(sel.zwrite) - { - fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); + if(sel.fwrite && sel.zwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + } + else if(sel.fwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); + } + else if(sel.zwrite) + { + fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); + } } // WriteZBuf @@ -1030,16 +1041,39 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.ztest && sel.zpsm < 2) { zs = zs.blend8(zd, zm); + } - if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); - if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; + + if(sel.notest) + { + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); + GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + } + else + { + WritePixel(zs, za, 0, sel.zpsm); + WritePixel(zs, za, 1, sel.zpsm); + WritePixel(zs, za, 2, sel.zpsm); + WritePixel(zs, za, 3, sel.zpsm); + } } else { - if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm); - if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm); - if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm); - if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm); + if(fast) + { + if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); + if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + } + else + { + if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm); + if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm); + if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm); + if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm); + } } } @@ -1197,17 +1231,37 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS fs = fs.blend(fd, fm); } - if(sel.rfb && sel.fpsm < 2) + bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest; + + if(sel.notest) { - if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); - if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); + GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + } + else + { + WritePixel(fs, fa, 0, sel.fpsm); + WritePixel(fs, fa, 1, sel.fpsm); + WritePixel(fs, fa, 2, sel.fpsm); + WritePixel(fs, fa, 3, sel.fpsm); + } } else { - if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm); - if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm); - if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm); - if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm); + if(fast) + { + if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); + if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + } + else + { + if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm); + if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm); + if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm); + if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm); + } } } } @@ -1273,7 +1327,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } } - test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + if(!sel.notest) + { + test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + } } } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index d66da34b2a..c0d938f10c 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -250,31 +250,40 @@ L("exit"); void GSDrawScanlineCodeGenerator::Init() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(ebx, edx); - and(edx, 3); + mov(ebx, edx); + and(edx, 3); - // left -= skip; + // int steps = pixels + skip - 4; - sub(ebx, edx); + lea(ecx, ptr[ecx + edx - 4]); - // int steps = pixels + skip - 4; + // left -= skip; - lea(ecx, ptr[ecx + edx - 4]); + sub(ebx, edx); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(edx, 4); + shl(edx, 4); - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); - vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); + vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); + } + else + { + mov(ebx, edx); // left + xor(edx, edx); // skip + lea(ecx, ptr[ecx - 4]); // steps + } // GSVector2i* fza_base = &m_local.gd->fzbr[top]; @@ -574,14 +583,17 @@ void GSDrawScanlineCodeGenerator::Step() } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + } } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -2309,6 +2321,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::WriteMask() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; @@ -2355,17 +2372,17 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; - vmovdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]); - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } @@ -2671,7 +2688,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } @@ -2684,49 +2701,67 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - - // vmaskmovps? + if(fast) + { + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + } + else + { + WritePixel(src, addr, 0, psm); + WritePixel(src, addr, 1, psm); + WritePixel(src, addr, 2, psm); + WritePixel(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + // vmaskmovps? + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } } } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index a806d383f4..b37dc11638 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -250,31 +250,40 @@ L("exit"); void GSDrawScanlineCodeGenerator::Init() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(ebx, edx); - and(edx, 3); + mov(ebx, edx); + and(edx, 3); - // left -= skip; + // int steps = pixels + skip - 4; - sub(ebx, edx); + lea(ecx, ptr[ecx + edx - 4]); - // int steps = pixels + skip - 4; + // left -= skip; - lea(ecx, ptr[ecx + edx - 4]); + sub(ebx, edx); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(edx, 4); + shl(edx, 4); - movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); - por(xmm7, ptr[eax + (size_t)&m_test[7]]); + por(xmm7, ptr[eax + (size_t)&m_test[7]]); + } + else + { + mov(ebx, edx); // left + xor(edx, edx); // skip + lea(ecx, ptr[ecx - 4]); // steps + } // GSVector2i* fza_base = &m_local.gd->fzbr[top]; @@ -579,14 +588,17 @@ void GSDrawScanlineCodeGenerator::Step() } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); - movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + } } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -2422,6 +2434,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::WriteMask() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; @@ -2469,11 +2486,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; - movdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]); - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); @@ -2482,6 +2497,8 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() blend8(xmm1, xmm7); } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } @@ -2811,7 +2828,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } @@ -2824,47 +2841,65 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); + if(fast) + { + movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + } + else + { + WritePixel(src, addr, 0, psm); + WritePixel(src, addr, 1, psm); + WritePixel(src, addr, 2, psm); + WritePixel(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } } } diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index c74a609364..dfdd11274a 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -342,55 +342,55 @@ GSLocalMemory::GSLocalMemory() m_psm[PSM_PSMCT24].rtx = &GSLocalMemory::ReadTexture24; m_psm[PSM_PSMCT16].rtx = &GSLocalMemory::ReadTexture16; - m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16S; + m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8; m_psm[PSM_PSMT4].rtx = &GSLocalMemory::ReadTexture4; m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8H; m_psm[PSM_PSMT4HL].rtx = &GSLocalMemory::ReadTexture4HL; m_psm[PSM_PSMT4HH].rtx = &GSLocalMemory::ReadTexture4HH; - m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32Z; - m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24Z; - m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16Z; - m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16SZ; + m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32; + m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24; + m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16; + m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMCT24].rtxP = &GSLocalMemory::ReadTexture24; m_psm[PSM_PSMCT16].rtxP = &GSLocalMemory::ReadTexture16; - m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16S; + m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMT8].rtxP = &GSLocalMemory::ReadTexture8P; m_psm[PSM_PSMT4].rtxP = &GSLocalMemory::ReadTexture4P; m_psm[PSM_PSMT8H].rtxP = &GSLocalMemory::ReadTexture8HP; m_psm[PSM_PSMT4HL].rtxP = &GSLocalMemory::ReadTexture4HLP; m_psm[PSM_PSMT4HH].rtxP = &GSLocalMemory::ReadTexture4HHP; - m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32Z; - m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24Z; - m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16Z; - m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16SZ; + m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32; + m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24; + m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16; + m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMCT24].rtxb = &GSLocalMemory::ReadTextureBlock24; m_psm[PSM_PSMCT16].rtxb = &GSLocalMemory::ReadTextureBlock16; - m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16S; + m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8; m_psm[PSM_PSMT4].rtxb = &GSLocalMemory::ReadTextureBlock4; m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8H; m_psm[PSM_PSMT4HL].rtxb = &GSLocalMemory::ReadTextureBlock4HL; m_psm[PSM_PSMT4HH].rtxb = &GSLocalMemory::ReadTextureBlock4HH; - m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32Z; - m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24Z; - m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16Z; - m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16SZ; + m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32; + m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24; + m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16; + m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMCT24].rtxbP = &GSLocalMemory::ReadTextureBlock24; m_psm[PSM_PSMCT16].rtxbP = &GSLocalMemory::ReadTextureBlock16; - m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16S; + m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMT8].rtxbP = &GSLocalMemory::ReadTextureBlock8P; m_psm[PSM_PSMT4].rtxbP = &GSLocalMemory::ReadTextureBlock4P; m_psm[PSM_PSMT8H].rtxbP = &GSLocalMemory::ReadTextureBlock8HP; m_psm[PSM_PSMT4HL].rtxbP = &GSLocalMemory::ReadTextureBlock4HLP; m_psm[PSM_PSMT4HH].rtxbP = &GSLocalMemory::ReadTextureBlock4HHP; - m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32Z; - m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24Z; - m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16Z; - m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16SZ; + m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32; + m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24; + m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16; + m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16; m_psm[PSM_PSMT8].bpp = 8; @@ -1606,28 +1606,22 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) { - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) + if(TEXA.AEM) { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + FOREACH_BLOCK_START(r, 16, 8, 32) + { + ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + } + FOREACH_BLOCK_END } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) + else { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + FOREACH_BLOCK_START(r, 16, 8, 32) + { + ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + } + FOREACH_BLOCK_END } - FOREACH_BLOCK_END } void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) @@ -1685,61 +1679,6 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_END } -void GSLocalMemory::ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadBlock32(src, dst, dstpitch); - } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - if(TEXA.AEM) - { - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END - } - else - { - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END - } -} - -void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) - { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) - { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END -} - /////////////////// void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1765,20 +1704,16 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { - __aligned(uint16, 32) block[16 * 8]; + ALIGN_STACK(32); - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - -void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + if(TEXA.AEM) + { + ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + } + else + { + ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + } } void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1816,45 +1751,6 @@ void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, con ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); } -void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - ALIGN_STACK(32); - - ReadBlock32(BlockPtr(bp), dst, dstpitch); -} - -void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - ALIGN_STACK(32); - - if(TEXA.AEM) - { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); - } - else - { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); - } -} - -void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - -void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - /////////////////// void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index 383cca4e5f..8cda1b73b2 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -875,32 +875,22 @@ public: void ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; // pal ? 8 : 32 diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 7252cfaa89..d1644195d8 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -30,6 +30,8 @@ #define THREAD_HEIGHT 4 +int GSRasterizerData::s_counter = 0; + GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon) : m_ds(ds) , m_id(id) @@ -124,6 +126,8 @@ void GSRasterizer::Draw(GSRasterizerData* data) if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return; + data->start = __rdtsc(); + m_ds->BeginDraw(data); const GSVertexSW* vertex = data->vertex; @@ -140,8 +144,6 @@ void GSRasterizer::Draw(GSRasterizerData* data) m_fscissor_x = GSVector4(data->scissor).xzxz(); m_fscissor_y = GSVector4(data->scissor).ywyw(); - uint64 start = __rdtsc(); - switch(data->primclass) { case GS_POINT_CLASS: @@ -206,7 +208,9 @@ void GSRasterizer::Draw(GSRasterizerData* data) __assume(0); } - uint64 ticks = __rdtsc() - start; + data->pixels = m_pixels; + + uint64 ticks = __rdtsc() - data->start; m_ds->EndDraw(data->frame, ticks, m_pixels); } diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 2dc611e7a8..45ab368d1c 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -30,6 +30,8 @@ __aligned(class, 32) GSRasterizerData : public GSAlignedClass<32> { + static int s_counter; + public: GSVector4i scissor; GSVector4i bbox; @@ -40,6 +42,9 @@ public: uint32* index; int index_count; uint64 frame; + uint64 start; + int pixels; + int counter; GSRasterizerData() : scissor(GSVector4i::zero()) @@ -51,7 +56,10 @@ public: , index(NULL) , index_count(0) , frame(0) + , start(0) + , pixels(0) { + counter = s_counter++; } virtual ~GSRasterizerData() diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 480e1de678..4e9da1b068 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -357,6 +357,28 @@ void GSRendererSW::Draw() if(!GetScanlineGlobalData(sd)) return; + if(0) if(LOG) + { + int n = GSUtil::GetVertexCount(PRIM->PRIM); + + for(int i = 0, j = 0; i < m_index.tail; i += n, j++) + { + for(int k = 0; k < n; k++) + { + GSVertex* v = &m_vertex.buff[m_index.buff[i + k]]; + GSVertex* vn = &m_vertex.buff[m_index.buff[i + n - 1]]; + + fprintf(s_fp, "%d:%d %f %f %f %f\n", + j, k, + (float)(v->XYZ.X - context->XYOFFSET.OFX) / 16, + (float)(v->XYZ.Y - context->XYOFFSET.OFY) / 16, + PRIM->FST ? (float)(v->U) / 16 : v->ST.S / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q), + PRIM->FST ? (float)(v->V) / 16 : v->ST.T / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q) + ); + } + } + } + GSVector4i scissor = GSVector4i(context->scissor.in); GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); GSVector4i r = bbox.rintersect(scissor); @@ -404,11 +426,17 @@ void GSRendererSW::Draw() // - if(LOG) {fprintf(s_fp, "queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d | %d %d %d\n", - m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, - PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, - PRIM->PRIM, sd->vertex_count, sd->index_count); fflush(s_fp);} + if(LOG) + { + fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n", + sd->counter, + m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, + PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA, + PRIM->PRIM, sd->vertex_count, sd->index_count); + + fflush(s_fp); + } if(s_dump) { @@ -581,7 +609,7 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) { - if(LOG) {fprintf(s_fp, "r %05x %d %d, %d %d %d %d\n", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} if(!m_rl->IsSynced()) { @@ -814,8 +842,6 @@ bool GSRendererSW::CheckSourcePages(SharedData* sd) if(m_fzb_pages[*p]) // currently being drawn to? => sync { - if(LOG) fprintf(s_fp, "r=8 %05x\n", *p << 5); - return true; } } @@ -864,7 +890,10 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) if(PRIM->TME) { - m_mem.m_clut.Read32(context->TEX0, env.TEXA); + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + } } if(context->TEST.ATE) @@ -1305,6 +1334,23 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zm |= GSVector4i::xffff0000(); } + if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) + { + gd.sel.notest = 1; + + uint32 ofx = context->XYOFFSET.OFX; + + for(int i = 0, j = m_vertex.tail; i < j; i++) + { + if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4 + { + gd.sel.notest = 0; + + break; + } + } + } + return true; } @@ -1329,6 +1375,14 @@ GSRendererSW::SharedData::~SharedData() if(global.clut) _aligned_free(global.clut); if(global.dimx) _aligned_free(global.dimx); + + if(LOG) {fprintf(s_fp, "[%d] done t=%lld p=%d | %d %d %d | %08x_%08x\n", + counter, + __rdtsc() - start, pixels, + primclass, vertex_count, index_count, + global.sel.hi, global.sel.lo + ); + fflush(s_fp);} } void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm) @@ -1421,7 +1475,7 @@ void GSRendererSW::SharedData::UpdateSource() if(m_parent->s_save && m_parent->s_n >= m_parent->s_saven) { - s = format("c:\\temp1\\_%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n, frame, i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM); + s = format("c:\\temp1\\_%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n - 2, frame, i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM); m_tex[i].t->Save(s); } diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index 75b82c8c27..a8d9637c40 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -67,8 +67,9 @@ union GSScanlineSelector uint32 edge:1; // 48 uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) - uint32 lcm:1; // 50 - uint32 mmin:2; // 51 + uint32 lcm:1; // 52 + uint32 mmin:2; // 53 + uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) }; struct diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 23cb97e7fd..905b8b5f11 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -315,7 +315,7 @@ void GSSettingsDlg::UpdateControls() EnableWindow(GetDlgItem(m_hWnd, IDC_NATIVERES), hw); EnableWindow(GetDlgItem(m_hWnd, IDC_FILTER), hw); EnableWindow(GetDlgItem(m_hWnd, IDC_PALTEX), hw); - EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw && GSDevice9::GetMaxDepth(m_lastValidMsaa) < 32); + EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw); EnableWindow(GetDlgItem(m_hWnd, IDC_FBA), dx9 && hw); //EnableWindow(GetDlgItem(m_hWnd, IDC_AA1), sw); // Let uers set software params regardless of renderer used //EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS_EDIT), sw); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index b473094d3b..86df61d412 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate() { mov(edx, dword[esp + _dscan]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { vmovaps(Xmm(3 + i), ptr[&m_shift[i]]); } @@ -80,7 +80,7 @@ void GSSetupPrimCodeGenerator::Depth() vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vmovdqa(ptr[&m_local.d4.f], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -103,7 +103,7 @@ void GSSetupPrimCodeGenerator::Depth() vmulps(xmm1, xmm0, xmm3); vmovdqa(ptr[&m_local.d4.z], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -139,36 +139,6 @@ void GSSetupPrimCodeGenerator::Depth() vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]); vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - -/* - // GSVector4 z = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - vbroadcastss(xmm1, ptr[&GSVector4::m_half]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } -*/ - vmovdqa(ptr[&m_local.p.z], xmm0); } } @@ -210,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture() vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -272,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -302,7 +272,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 01d79b21fc..070ccbf109 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate() { mov(edx, dword[esp + _dscan]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { movaps(Xmm(3 + i), ptr[&m_shift[i]]); } @@ -82,7 +82,7 @@ void GSSetupPrimCodeGenerator::Depth() pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(ptr[&m_local.d4.f], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -107,7 +107,7 @@ void GSSetupPrimCodeGenerator::Depth() mulps(xmm1, xmm3); movdqa(ptr[&m_local.d4.z], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -144,36 +144,6 @@ void GSSetupPrimCodeGenerator::Depth() movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]); pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - /* - // GSVector4 z = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - movaps(xmm1, ptr[&GSVector4::m_half]); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - */ - movdqa(ptr[&m_local.p.z], xmm0); } } @@ -217,7 +187,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(xmm1, xmm0); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -282,7 +252,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -315,7 +285,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index ae5b16fea3..3562a94f55 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -37,6 +37,7 @@ GSState::GSState() , m_frameskip(0) , m_vt(this) , m_q(1.0f) + , m_texflush(true) { m_nativeres = !!theApp.GetConfig("nativeres", 0); @@ -200,6 +201,8 @@ void GSState::Reset() m_vertex.tail = 0; m_vertex.next = 0; m_index.tail = 0; + + m_texflush = true; } void GSState::ResetHandlers() @@ -992,7 +995,7 @@ void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r) void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r) { - // TRACE(_T("TEXFLUSH\n")); + m_texflush = true; } template void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r) diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index 38ae4a1044..8e4431d7be 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -143,6 +143,7 @@ protected: float m_q; GSVector4 m_scissor; uint32 m_ofxy; + bool m_texflush; struct { diff --git a/plugins/GSdx/GSThread.cpp b/plugins/GSdx/GSThread.cpp index 3f0eda6921..995772bee1 100644 --- a/plugins/GSdx/GSThread.cpp +++ b/plugins/GSdx/GSThread.cpp @@ -28,10 +28,13 @@ InitializeConditionVariablePtr pInitializeConditionVariable; WakeConditionVariablePtr pWakeConditionVariable; WakeAllConditionVariablePtr pWakeAllConditionVariable; SleepConditionVariableSRWPtr pSleepConditionVariableSRW; -InitializeSRWLockPtr pInitializeSRWLock;; +InitializeSRWLockPtr pInitializeSRWLock; AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive; ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +AcquireSRWLockSharedPtr pAcquireSRWLockShared; +TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared; +ReleaseSRWLockSharedPtr pReleaseSRWLockShared; class InitCondVar { @@ -50,6 +53,9 @@ public: pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive"); pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive"); pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive"); + pAcquireSRWLockShared = (AcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "AcquireSRWLockShared"); + pTryAcquireSRWLockShared = (TryAcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockShared"); + pReleaseSRWLockShared = (ReleaseSRWLockSharedPtr)GetProcAddress(m_kernel32, "ReleaseSRWLockShared"); } virtual ~InitCondVar() diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index d53faf04e6..4a9e7223dc 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -23,25 +23,54 @@ #include "GSdx.h" +class IGSThread +{ +protected: + virtual void ThreadProc() = 0; +}; + +class IGSLock +{ +public: + virtual void Lock() = 0; + virtual bool TryLock() = 0; + virtual void Unlock() = 0; +}; + +class IGSEvent +{ +public: + virtual void Set() = 0; + virtual bool Wait(IGSLock* l) = 0; +}; + #ifdef _WINDOWS typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); typedef void (WINAPI * WakeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); -typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags); +typedef BOOL (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags); typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock); typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock); -typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef void (WINAPI * AcquireSRWLockSharedPtr)(SRWLOCK* SRWLock); +typedef BOOLEAN (WINAPI * TryAcquireSRWLockSharedPtr)(SRWLOCK* SRWLock); +typedef void (WINAPI * ReleaseSRWLockSharedPtr)(SRWLOCK* SRWLock); extern InitializeConditionVariablePtr pInitializeConditionVariable; extern WakeConditionVariablePtr pWakeConditionVariable; extern WakeAllConditionVariablePtr pWakeAllConditionVariable; extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW; -extern InitializeSRWLockPtr pInitializeSRWLock;; +extern InitializeSRWLockPtr pInitializeSRWLock; extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; -extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive; +extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +extern AcquireSRWLockSharedPtr pAcquireSRWLockShared; +extern TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared; +extern ReleaseSRWLockSharedPtr pReleaseSRWLockShared; -class GSThread +class GSThread : public IGSThread { DWORD m_ThreadId; HANDLE m_hThread; @@ -49,8 +78,6 @@ class GSThread static DWORD WINAPI StaticThreadProc(void* lpParam); protected: - virtual void ThreadProc() = 0; - void CreateThread(); void CloseThread(); @@ -59,7 +86,7 @@ public: virtual ~GSThread(); }; -class GSCritSec +class GSCritSec : public IGSLock { CRITICAL_SECTION m_cs; @@ -67,26 +94,25 @@ public: GSCritSec() {InitializeCriticalSection(&m_cs);} ~GSCritSec() {DeleteCriticalSection(&m_cs);} - void Lock() {EnterCriticalSection(&m_cs);} - bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;} - void Unlock() {LeaveCriticalSection(&m_cs);} + void Lock() {EnterCriticalSection(&m_cs);} + bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;} + void Unlock() {LeaveCriticalSection(&m_cs);} }; -class GSEvent +class GSEvent : public IGSEvent { protected: HANDLE m_hEvent; public: - GSEvent(bool manual = false, bool initial = false) {m_hEvent = CreateEvent(NULL, manual, initial, NULL);} + GSEvent() {m_hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);} ~GSEvent() {CloseHandle(m_hEvent);} void Set() {SetEvent(m_hEvent);} - void Reset() {ResetEvent(m_hEvent);} - bool Wait() {return WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0;} + bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0; if(l) l->Lock(); return b;} }; -class GSCondVarLock +class GSCondVarLock : public IGSLock { SRWLOCK m_lock; @@ -94,12 +120,13 @@ public: GSCondVarLock() {pInitializeSRWLock(&m_lock);} void Lock() {pAcquireSRWLockExclusive(&m_lock);} - bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);} - + bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} + void Unlock() {pReleaseSRWLockExclusive(&m_lock);} + operator SRWLOCK* () {return &m_lock;} }; -class GSCondVar +class GSCondVar : public IGSEvent { CONDITION_VARIABLE m_cv; @@ -107,7 +134,7 @@ public: GSCondVar() {pInitializeConditionVariable(&m_cv);} void Set() {pWakeConditionVariable(&m_cv);} - void Wait(GSCondVarLock& lock) {pSleepConditionVariableSRW(&m_cv, lock, INFINITE, 0);} + bool Wait(IGSLock* l) {return pSleepConditionVariableSRW(&m_cv, *(GSCondVarLock*)l, INFINITE, 0) != 0;} operator CONDITION_VARIABLE* () {return &m_cv;} }; @@ -117,7 +144,7 @@ public: #include #include -class GSThread +class GSThread : public IGSThread { pthread_attr_t m_thread_attr; pthread_t m_thread; @@ -125,8 +152,6 @@ class GSThread static void* StaticThreadProc(void* param); protected: - virtual void ThreadProc() = 0; - void CreateThread(); void CloseThread(); @@ -135,16 +160,16 @@ public: virtual ~GSThread(); }; -class GSCritSec +class GSCritSec : public IGSLock { pthread_mutexattr_t m_mutex_attr; pthread_mutex_t m_mutex; public: - GSCritSec() + GSCritSec(bool recursive = true) { pthread_mutexattr_init(&m_mutex_attr); - pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutexattr_settype(&m_mutex_attr, recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL); pthread_mutex_init(&m_mutex, &m_mutex_attr); } @@ -159,7 +184,7 @@ public: void Unlock() {pthread_mutex_unlock(&m_mutex);} }; -class GSEvent +class GSEvent : public IGSEvent { protected: sem_t m_sem; @@ -169,36 +194,18 @@ public: ~GSEvent() {sem_destroy(&m_sem);} void Set() {sem_post(&m_sem);} - bool Wait() {return sem_wait(&m_sem) == 0;} + bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = sem_wait(&m_sem) == 0; if(l) l->Lock(); return b;} }; -// Note except the mutex attribute the code is same as GSCritSec object -class GSCondVarLock +class GSCondVarLock : public GSCritSec { - pthread_mutexattr_t m_mutex_attr; - pthread_mutex_t m_mutex; - public: - GSCondVarLock() + GSCondVarLock() : GSCritSec(false) { - pthread_mutexattr_init(&m_mutex_attr); - pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_NORMAL); - pthread_mutex_init(&m_mutex, &m_mutex_attr); } - virtual ~GSCondVarLock() - { - pthread_mutex_destroy(&m_mutex); - pthread_mutexattr_destroy(&m_mutex_attr); - } - - void Lock() {pthread_mutex_lock(&m_mutex);} - bool TryLock() {return pthread_mutex_trylock(&m_mutex) == 0;} - void Unlock() {pthread_mutex_unlock(&m_mutex);} - - operator pthread_mutex_t* () {return &m_mutex;} }; -class GSCondVar +class GSCondVar : public IGSEvent { pthread_cond_t m_cv; pthread_condattr_t m_cv_attr; @@ -209,6 +216,7 @@ public: pthread_condattr_init(&m_cv_attr); pthread_cond_init(&m_cv, &m_cv_attr); } + virtual ~GSCondVar() { pthread_condattr_destroy(&m_cv_attr); @@ -216,7 +224,7 @@ public: } void Set() {pthread_cond_signal(&m_cv);} - void Wait(GSCondVarLock& lock) {pthread_cond_wait(&m_cv, lock);} + bool Wait(IGSLock* l) {pthread_cond_wait(&m_cv, *(GSCondVarLock*)l) == 0;} operator pthread_cond_t* () {return &m_cv;} }; @@ -225,32 +233,11 @@ public: class GSAutoLock { -protected: - GSCritSec* m_cs; + IGSLock* m_lock; public: - GSAutoLock(GSCritSec* cs) {m_cs = cs; m_cs->Lock();} - ~GSAutoLock() {m_cs->Unlock();} -}; - -class GSEventSpin -{ -protected: - volatile long m_sync; - volatile bool m_manual; - -public: - GSEventSpin(bool manual = false, bool initial = false) {m_sync = initial ? 1 : 0; m_manual = manual;} - ~GSEventSpin() {} - - void Set() {_interlockedbittestandset(&m_sync, 0);} - void Reset() {_interlockedbittestandreset(&m_sync, 0);} - bool Wait() - { - if(m_manual) while(!m_sync) _mm_pause(); - else while(!_interlockedbittestandreset(&m_sync, 0)) _mm_pause(); - return true; - } + GSAutoLock(IGSLock* l) {(m_lock = l)->Lock();} + ~GSAutoLock() {m_lock->Unlock();} }; template class GSJobQueue : private GSThread @@ -259,70 +246,36 @@ protected: queue m_queue; volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent) volatile bool m_exit; - struct {GSCritSec lock; GSEvent notempty;} m_ev; - struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv; + IGSEvent* m_notempty; + IGSEvent* m_empty; + IGSLock* m_lock; void ThreadProc() { - if(m_cv.available) + m_lock->Lock(); + + while(true) { - m_cv.lock.Lock(); - - while(true) + while(m_queue.empty()) { - while(m_queue.empty()) - { - m_cv.notempty.Wait(m_cv.lock); + m_notempty->Wait(m_lock); - if(m_exit) {m_cv.lock.Unlock(); return;} - } - - T& item = m_queue.front(); - - m_cv.lock.Unlock(); - - Process(item); - - m_cv.lock.Lock(); - - m_queue.pop(); - - m_count--; - - if(m_queue.empty()) - { - m_cv.empty.Set(); - } + if(m_exit) {m_lock->Unlock(); return;} } - } - else - { - m_ev.lock.Lock(); - while(true) + T& item = m_queue.front(); + + m_lock->Unlock(); + + Process(item); + + m_lock->Lock(); + + m_queue.pop(); + + if(--m_count == 0) { - while(m_queue.empty()) - { - m_ev.lock.Unlock(); - - m_ev.notempty.Wait(); - - if(m_exit) {return;} - - m_ev.lock.Lock(); - } - - T& item = m_queue.front(); - - m_ev.lock.Unlock(); - - Process(item); - - m_ev.lock.Lock(); - - m_queue.pop(); - - m_count--; + m_empty->Set(); } } } @@ -332,17 +285,30 @@ public: : m_count(0) , m_exit(false) { - m_cv.available = !!theApp.GetConfig("condvar", 1); + bool condvar = !!theApp.GetConfig("condvar", 1); #ifdef _WINDOWS if(pInitializeConditionVariable == NULL) { - m_cv.available = false; + condvar = false; } #endif + if(condvar) + { + m_notempty = new GSCondVar(); + m_empty = new GSCondVar(); + m_lock = new GSCondVarLock(); + } + else + { + m_notempty = new GSEvent(); + m_empty = new GSEvent(); + m_lock = new GSCritSec(); + } + CreateThread(); } @@ -350,14 +316,13 @@ public: { m_exit = true; - if(m_cv.available) - { - m_cv.notempty.Set(); - } - else - { - m_ev.notempty.Set(); - } + m_notempty->Set(); + + CloseThread(); + + delete m_notempty; + delete m_empty; + delete m_lock; } bool IsEmpty() const @@ -369,51 +334,32 @@ public: void Push(const T& item) { - if(m_cv.available) - { - m_cv.lock.Lock(); + m_lock->Lock(); - m_queue.push(item); + m_queue.push(item); - m_count++; - - m_cv.lock.Unlock(); - - m_cv.notempty.Set(); - } - else + if(m_count++ == 0) { - GSAutoLock l(&m_ev.lock); - - m_queue.push(item); - - m_count++; - - m_ev.notempty.Set(); + m_notempty->Set(); } + + m_lock->Unlock(); } void Wait() { - if(m_cv.available) + if(m_count > 0) { - if(m_count > 0) + m_lock->Lock(); + + while(m_count != 0) { - m_cv.lock.Lock(); - - while(!m_queue.empty()) - { - m_cv.empty.Wait(m_cv.lock); - } - - ASSERT(m_count == 0); - - m_cv.lock.Unlock(); + m_empty->Wait(m_lock); } - } - else - { - while(m_count > 0) _mm_pause(); + + ASSERT(m_queue.empty()); + + m_lock->Unlock(); } }