GSdx: this should fix the XP/Wine crash when extrathreads > 0, and adds a sprite drawing shortcut; hopefully it won't break anything.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5089 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2012-02-08 16:57:14 +00:00
parent 4593b6ac52
commit 19be605150
17 changed files with 620 additions and 602 deletions

View File

@ -884,7 +884,7 @@ public:
}
}
static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs
template<bool AEM> static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs
{
const GSVector4i* s = (const GSVector4i*)src;
@ -895,44 +895,36 @@ public:
GSVector4i bm = m_xxbx;
GSVector4i l, h;
if(TEXA.AEM)
for(int i = 0; i < 8; i++, dst += dstpitch)
{
for(int i = 0; i < 8; i++, dst += dstpitch)
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
if(AEM)
{
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero());
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero());
}
else
{
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
}
GSVector4i v1 = s[i * 2 + 1];
GSVector4i v1 = s[i * 2 + 1];
l = v1.upl16(v1);
h = v1.uph16(v1);
l = v1.upl16(v1);
h = v1.uph16(v1);
if(AEM)
{
((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero());
((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero());
}
}
else
{
for(int i = 0; i < 8; i++, dst += dstpitch)
else
{
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
GSVector4i v1 = s[i * 2 + 1];
l = v1.upl16(v1);
h = v1.uph16(v1);
((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
}
@ -1432,6 +1424,56 @@ public:
}
}
}
template<bool AEM> __forceinline static GSVector4i Expand16to32(const GSVector4i& c, const GSVector4i& TA0, const GSVector4i& TA1)
{
// Builds the result by OR-ing four independently computed parts of `c`:
// three masked-and-shifted fields plus an alpha part chosen from TA0/TA1.
// NOTE(review): the mask names (m_rxxx / m_xgxx / m_xxbx) suggest the R, G and B
// fields of a 16-bit pixel - confirm against the mask definitions.

const GSVector4i part0 = (c & m_rxxx) << 3;
const GSVector4i part1 = (c & m_xgxx) << 6;
const GSVector4i part2 = (c & m_xxbx) << 9;

// Alpha part: TA0 or TA1, selected per element by c.sra16(15).
GSVector4i alpha;

if(AEM)
{
// AEM variant: additionally and-not the selection with the (c == 0) compare mask,
// presumably so that all-zero pixels end up with zero alpha - TODO confirm andnot operand order.
alpha = TA0.blend8(TA1, c.sra16(15)).andnot(c == GSVector4i::zero());
}
else
{
alpha = TA0.blend(TA1, c.sra16(15));
}

return part0 | part1 | part2 | alpha;
}
// Reads one 16-bit block from `src` and writes its 32-bit expansion to `dst`
// (rows are `dstpitch` bytes apart). AEM is forwarded as the template argument
// of the expansion routine; TEXA supplies the TA0/TA1 values used by it.
template<bool AEM> __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// Disabled variant: shuffles the source vectors in registers (sw16/sw32/sw16) and
// expands them directly, two destination rows per iteration. Kept for reference only.
#if 0 // not faster
const GSVector4i* s = (const GSVector4i*)src;
// TA0/TA1 are shifted into bits 24..31 of each 32-bit element.
GSVector4i TA0(TEXA.TA0 << 24);
GSVector4i TA1(TEXA.TA1 << 24);
for(int i = 0; i < 4; i++, dst += dstpitch * 2)
{
GSVector4i v0 = s[i * 4 + 0];
GSVector4i v1 = s[i * 4 + 1];
GSVector4i v2 = s[i * 4 + 2];
GSVector4i v3 = s[i * 4 + 3];
GSVector4i::sw16(v0, v1, v2, v3);
GSVector4i::sw32(v0, v1, v2, v3);
GSVector4i::sw16(v0, v2, v1, v3);
// First destination row of this pair.
GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
d0[2] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
d0[3] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
// Second destination row of this pair.
GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
d1[0] = Expand16to32<AEM>(v2.upl16(v2), TA0, TA1);
d1[1] = Expand16to32<AEM>(v2.uph16(v2), TA0, TA1);
d1[2] = Expand16to32<AEM>(v3.upl16(v3), TA0, TA1);
d1[3] = Expand16to32<AEM>(v3.uph16(v3), TA0, TA1);
}
#else
// Active variant: read the block into a local 16x8 uint16 staging buffer
// (pitch = sizeof(block) / 8 bytes), then expand from there into dst.
__aligned(uint16, 32) block[16 * 8];
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16<AEM>(block, dst, dstpitch, TEXA);
#endif
}
__forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
{

View File

@ -91,6 +91,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.fb = m_global.sel.fb;
sel.zb = m_global.sel.zb;
sel.zoverflow = m_global.sel.zoverflow;
sel.notest = m_global.sel.notest;
m_sp = m_sp_map[sel];
}
@ -272,17 +273,24 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
// Init
int skip = left & 3;
int skip, steps;
left -= skip;
int steps = pixels + skip - 4;
if(!sel.notest)
{
skip = left & 3;
steps = pixels + skip - 4;
left -= skip;
test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
}
else
{
skip = 0;
steps = pixels - 4;
}
const GSVector2i* fza_base = &m_global.fzbr[top];
const GSVector2i* fza_offset = &m_global.fzbc[left >> 2];
test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
if(sel.prim != GS_SPRITE_CLASS)
{
if(sel.fwrite && sel.fge)
@ -1000,27 +1008,30 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
int fzm = 0;
if(sel.fwrite)
if(!sel.notest)
{
fm |= test;
}
if(sel.fwrite)
{
fm |= test;
}
if(sel.zwrite)
{
zm |= test;
}
if(sel.zwrite)
{
zm |= test;
}
if(sel.fwrite && sel.zwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
}
else if(sel.fwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask();
}
else if(sel.zwrite)
{
fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask();
if(sel.fwrite && sel.zwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
}
else if(sel.fwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask();
}
else if(sel.zwrite)
{
fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask();
}
}
// WriteZBuf
@ -1030,16 +1041,39 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(sel.ztest && sel.zpsm < 2)
{
zs = zs.blend8(zd, zm);
}
if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
if(sel.notest)
{
if(fast)
{
GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
}
else
{
WritePixel(zs, za, 0, sel.zpsm);
WritePixel(zs, za, 1, sel.zpsm);
WritePixel(zs, za, 2, sel.zpsm);
WritePixel(zs, za, 3, sel.zpsm);
}
}
else
{
if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm);
if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm);
if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm);
if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm);
if(fast)
{
if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
}
else
{
if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm);
if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm);
if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm);
if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm);
}
}
}
@ -1197,17 +1231,37 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
fs = fs.blend(fd, fm);
}
if(sel.rfb && sel.fpsm < 2)
bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest;
if(sel.notest)
{
if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
if(fast)
{
GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
}
else
{
WritePixel(fs, fa, 0, sel.fpsm);
WritePixel(fs, fa, 1, sel.fpsm);
WritePixel(fs, fa, 2, sel.fpsm);
WritePixel(fs, fa, 3, sel.fpsm);
}
}
else
{
if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm);
if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm);
if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm);
if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm);
if(fast)
{
if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
}
else
{
if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm);
if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm);
if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm);
if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm);
}
}
}
}
@ -1273,7 +1327,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
}
test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
if(!sel.notest)
{
test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
}
}
}

View File

@ -250,31 +250,40 @@ L("exit");
void GSDrawScanlineCodeGenerator::Init()
{
// int skip = left & 3;
if(!m_sel.notest)
{
// int skip = left & 3;
mov(ebx, edx);
and(edx, 3);
mov(ebx, edx);
and(edx, 3);
// left -= skip;
// int steps = pixels + skip - 4;
sub(ebx, edx);
lea(ecx, ptr[ecx + edx - 4]);
// int steps = pixels + skip - 4;
// left -= skip;
lea(ecx, ptr[ecx + edx - 4]);
sub(ebx, edx);
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
shl(edx, 4);
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
}
else
{
mov(ebx, edx); // left
xor(edx, edx); // skip
lea(ecx, ptr[ecx - 4]); // steps
}
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
@ -574,14 +583,17 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
// test = m_test[7 + (steps & (steps >> 31))];
if(!m_sel.notest)
{
// test = m_test[7 + (steps & (steps >> 31))];
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@ -2309,6 +2321,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::WriteMask()
{
if(m_sel.notest)
{
return;
}
// fm |= test;
// zm |= test;
@ -2355,17 +2372,17 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
return;
}
bool fast = m_sel.ztest && m_sel.zpsm < 2;
vmovdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]);
if(fast)
if(m_sel.ztest && m_sel.zpsm < 2)
{
// zs = zs.blend8(zd, zm);
vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
@ -2671,7 +2688,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
}
bool fast = m_sel.rfb && m_sel.fpsm < 2;
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
@ -2684,49 +2701,67 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(fast)
if(m_sel.notest)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x0f);
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0xf0);
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
// vmaskmovps?
if(fast)
{
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
}
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
if(fast)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0f);
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0xf0);
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
// vmaskmovps?
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
}
}
}

View File

@ -250,31 +250,40 @@ L("exit");
void GSDrawScanlineCodeGenerator::Init()
{
// int skip = left & 3;
if(!m_sel.notest)
{
// int skip = left & 3;
mov(ebx, edx);
and(edx, 3);
mov(ebx, edx);
and(edx, 3);
// left -= skip;
// int steps = pixels + skip - 4;
sub(ebx, edx);
lea(ecx, ptr[ecx + edx - 4]);
// int steps = pixels + skip - 4;
// left -= skip;
lea(ecx, ptr[ecx + edx - 4]);
sub(ebx, edx);
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
shl(edx, 4);
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
por(xmm7, ptr[eax + (size_t)&m_test[7]]);
por(xmm7, ptr[eax + (size_t)&m_test[7]]);
}
else
{
mov(ebx, edx); // left
xor(edx, edx); // skip
lea(ecx, ptr[ecx - 4]); // steps
}
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
@ -579,14 +588,17 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
// test = m_test[7 + (steps & (steps >> 31))];
if(!m_sel.notest)
{
// test = m_test[7 + (steps & (steps >> 31))];
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@ -2422,6 +2434,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::WriteMask()
{
if(m_sel.notest)
{
return;
}
// fm |= test;
// zm |= test;
@ -2469,11 +2486,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
return;
}
bool fast = m_sel.ztest && m_sel.zpsm < 2;
movdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]);
if(fast)
if(m_sel.ztest && m_sel.zpsm < 2)
{
// zs = zs.blend8(zd, zm);
@ -2482,6 +2497,8 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
blend8(xmm1, xmm7);
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
@ -2811,7 +2828,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
}
bool fast = m_sel.rfb && m_sel.fpsm < 2;
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
@ -2824,47 +2841,65 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(fast)
if(m_sel.notest)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x0f);
je("@f");
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0xf0);
je("@f");
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
if(fast)
{
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
}
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
if(fast)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0f);
je("@f");
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0xf0);
je("@f");
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
}
}
}

View File

@ -342,55 +342,55 @@ GSLocalMemory::GSLocalMemory()
m_psm[PSM_PSMCT24].rtx = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMCT16].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16S;
m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8;
m_psm[PSM_PSMT4].rtx = &GSLocalMemory::ReadTexture4;
m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8H;
m_psm[PSM_PSMT4HL].rtx = &GSLocalMemory::ReadTexture4HL;
m_psm[PSM_PSMT4HH].rtx = &GSLocalMemory::ReadTexture4HH;
m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32Z;
m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24Z;
m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16Z;
m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16SZ;
m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32;
m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT24].rtxP = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMCT16].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16S;
m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMT8].rtxP = &GSLocalMemory::ReadTexture8P;
m_psm[PSM_PSMT4].rtxP = &GSLocalMemory::ReadTexture4P;
m_psm[PSM_PSMT8H].rtxP = &GSLocalMemory::ReadTexture8HP;
m_psm[PSM_PSMT4HL].rtxP = &GSLocalMemory::ReadTexture4HLP;
m_psm[PSM_PSMT4HH].rtxP = &GSLocalMemory::ReadTexture4HHP;
m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32Z;
m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24Z;
m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16Z;
m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16SZ;
m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32;
m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT24].rtxb = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMCT16].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16S;
m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8;
m_psm[PSM_PSMT4].rtxb = &GSLocalMemory::ReadTextureBlock4;
m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8H;
m_psm[PSM_PSMT4HL].rtxb = &GSLocalMemory::ReadTextureBlock4HL;
m_psm[PSM_PSMT4HH].rtxb = &GSLocalMemory::ReadTextureBlock4HH;
m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32Z;
m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24Z;
m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16Z;
m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16SZ;
m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32;
m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMCT16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16S;
m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMT8].rtxbP = &GSLocalMemory::ReadTextureBlock8P;
m_psm[PSM_PSMT4].rtxbP = &GSLocalMemory::ReadTextureBlock4P;
m_psm[PSM_PSMT8H].rtxbP = &GSLocalMemory::ReadTextureBlock8HP;
m_psm[PSM_PSMT4HL].rtxbP = &GSLocalMemory::ReadTextureBlock4HLP;
m_psm[PSM_PSMT4HH].rtxbP = &GSLocalMemory::ReadTextureBlock4HHP;
m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32Z;
m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24Z;
m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16Z;
m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16SZ;
m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32;
m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16;
m_psm[PSM_PSMT8].bpp = 8;
@ -1606,28 +1606,22 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
if(TEXA.AEM)
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
else
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
@ -1685,61 +1679,6 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i&
FOREACH_BLOCK_END
}
// Reads the rectangle `r` of a 32-bit Z-format texture into `dst`, block by block.
// TEXA is not referenced in the visible body.
void GSLocalMemory::ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// NOTE(review): `src` is not declared here; presumably FOREACH_BLOCK_START introduces it
// (and advances dst) for each 8x8 block - confirm against the macro definition.
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadBlock32<true>(src, dst, dstpitch);
}
FOREACH_BLOCK_END
}
// Reads the rectangle `r` of a 24-bit Z-format texture into `dst`, block by block,
// expanding each block with ReadAndExpandBlock24.
void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// TEXA.AEM is tested once outside the block loop so that each loop body calls a
// fixed template instantiation (<true> or <false>).
// NOTE(review): `src` presumably comes from FOREACH_BLOCK_START - confirm.
if(TEXA.AEM)
{
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
else
{
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
}
// Reads the rectangle `r` of a 16-bit Z-format texture into `dst`, block by block:
// each block is first read into a local staging buffer and then expanded.
void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// Staging buffer for one 16x8 block of uint16 values, reused for every block.
__aligned(uint16, 32) block[16 * 8];
// NOTE(review): `src` presumably comes from FOREACH_BLOCK_START - confirm.
FOREACH_BLOCK_START(r, 16, 8, 32)
{
// sizeof(block) / 8 is the staging buffer's row pitch in bytes.
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
// Reads the rectangle `r` of a 16S Z-format texture into `dst`, block by block:
// each block is first read into a local staging buffer and then expanded.
// The visible body is identical to ReadTexture16Z.
void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// Staging buffer for one 16x8 block of uint16 values, reused for every block.
__aligned(uint16, 32) block[16 * 8];
// NOTE(review): `src` presumably comes from FOREACH_BLOCK_START - confirm.
FOREACH_BLOCK_START(r, 16, 8, 32)
{
// sizeof(block) / 8 is the staging buffer's row pitch in bytes.
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
///////////////////
void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@ -1765,20 +1704,16 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ALIGN_STACK(32);
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
if(TEXA.AEM)
{
ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
}
else
{
ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
}
}
void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@ -1816,45 +1751,6 @@ void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, con
ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
// Reads the single block at block pointer `bp` into `dst` via ReadBlock32<true>.
// TEXA is not referenced in the visible body.
void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
// NOTE(review): ALIGN_STACK is a project macro not visible here; presumably it forces
// 32-byte stack alignment for the vector code below - keep it as the first statement.
ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
}
// Reads the single block at block pointer `bp` and expands it into `dst`,
// choosing the ReadAndExpandBlock24 instantiation from TEXA.AEM.
void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
// NOTE(review): ALIGN_STACK is a project macro not visible here; presumably it forces
// 32-byte stack alignment - keep it as the first statement.
ALIGN_STACK(32);
// AEM is a template argument, so the runtime flag has to be turned into a branch.
if(TEXA.AEM)
{
ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
}
else
{
ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
}
}
// Reads the single 16-bit block at block pointer `bp` into a local staging buffer,
// then expands it into `dst` (rows `dstpitch` bytes apart) using TEXA.
void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
// One 16x8 block of uint16 values.
__aligned(uint16, 32) staging[16 * 8];

// Row pitch of the staging buffer in bytes (8 rows).
const int stagingPitch = sizeof(staging) / 8;

ReadBlock16<true>(BlockPtr(bp), (uint8*)staging, stagingPitch);

ExpandBlock16(staging, dst, dstpitch, TEXA);
}
// Reads the single 16-bit block at block pointer `bp` into a local staging buffer,
// then expands it into `dst` (rows `dstpitch` bytes apart) using TEXA.
void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
// One 16x8 block of uint16 values.
__aligned(uint16, 32) staging[16 * 8];

// Row pitch of the staging buffer in bytes (8 rows).
const int stagingPitch = sizeof(staging) / 8;

ReadBlock16<true>(BlockPtr(bp), (uint8*)staging, stagingPitch);

ExpandBlock16(staging, dst, dstpitch, TEXA);
}
///////////////////
void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)

View File

@ -875,32 +875,22 @@ public:
void ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
// pal ? 8 : 32

View File

@ -30,6 +30,8 @@
#define THREAD_HEIGHT 4
int GSRasterizerData::s_counter = 0;
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
: m_ds(ds)
, m_id(id)
@ -124,6 +126,8 @@ void GSRasterizer::Draw(GSRasterizerData* data)
if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return;
data->start = __rdtsc();
m_ds->BeginDraw(data);
const GSVertexSW* vertex = data->vertex;
@ -140,8 +144,6 @@ void GSRasterizer::Draw(GSRasterizerData* data)
m_fscissor_x = GSVector4(data->scissor).xzxz();
m_fscissor_y = GSVector4(data->scissor).ywyw();
uint64 start = __rdtsc();
switch(data->primclass)
{
case GS_POINT_CLASS:
@ -206,7 +208,9 @@ void GSRasterizer::Draw(GSRasterizerData* data)
__assume(0);
}
uint64 ticks = __rdtsc() - start;
data->pixels = m_pixels;
uint64 ticks = __rdtsc() - data->start;
m_ds->EndDraw(data->frame, ticks, m_pixels);
}

View File

@ -30,6 +30,8 @@
__aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
{
static int s_counter;
public:
GSVector4i scissor;
GSVector4i bbox;
@ -40,6 +42,9 @@ public:
uint32* index;
int index_count;
uint64 frame;
uint64 start;
int pixels;
int counter;
GSRasterizerData()
: scissor(GSVector4i::zero())
@ -51,7 +56,10 @@ public:
, index(NULL)
, index_count(0)
, frame(0)
, start(0)
, pixels(0)
{
counter = s_counter++;
}
virtual ~GSRasterizerData()

View File

@ -357,6 +357,28 @@ void GSRendererSW::Draw()
if(!GetScanlineGlobalData(sd)) return;
if(0) if(LOG)
{
int n = GSUtil::GetVertexCount(PRIM->PRIM);
for(int i = 0, j = 0; i < m_index.tail; i += n, j++)
{
for(int k = 0; k < n; k++)
{
GSVertex* v = &m_vertex.buff[m_index.buff[i + k]];
GSVertex* vn = &m_vertex.buff[m_index.buff[i + n - 1]];
fprintf(s_fp, "%d:%d %f %f %f %f\n",
j, k,
(float)(v->XYZ.X - context->XYOFFSET.OFX) / 16,
(float)(v->XYZ.Y - context->XYOFFSET.OFY) / 16,
PRIM->FST ? (float)(v->U) / 16 : v->ST.S / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q),
PRIM->FST ? (float)(v->V) / 16 : v->ST.T / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q)
);
}
}
}
GSVector4i scissor = GSVector4i(context->scissor.in);
GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
GSVector4i r = bbox.rintersect(scissor);
@ -404,11 +426,17 @@ void GSRendererSW::Draw()
//
if(LOG) {fprintf(s_fp, "queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d | %d %d %d\n",
m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, sd->vertex_count, sd->index_count); fflush(s_fp);}
if(LOG)
{
fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n",
sd->counter,
m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite,
m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA,
PRIM->PRIM, sd->vertex_count, sd->index_count);
fflush(s_fp);
}
if(s_dump)
{
@ -581,7 +609,7 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
{
if(LOG) {fprintf(s_fp, "r %05x %d %d, %d %d %d %d\n", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);}
if(!m_rl->IsSynced())
{
@ -814,8 +842,6 @@ bool GSRendererSW::CheckSourcePages(SharedData* sd)
if(m_fzb_pages[*p]) // currently being drawn to? => sync
{
if(LOG) fprintf(s_fp, "r=8 %05x\n", *p << 5);
return true;
}
}
@ -864,7 +890,10 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
if(PRIM->TME)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
}
}
if(context->TEST.ATE)
@ -1305,6 +1334,23 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
gd.zm |= GSVector4i::xffff0000();
}
if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor)))
{
gd.sel.notest = 1;
uint32 ofx = context->XYOFFSET.OFX;
for(int i = 0, j = m_vertex.tail; i < j; i++)
{
if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4
{
gd.sel.notest = 0;
break;
}
}
}
return true;
}
@ -1329,6 +1375,14 @@ GSRendererSW::SharedData::~SharedData()
if(global.clut) _aligned_free(global.clut);
if(global.dimx) _aligned_free(global.dimx);
if(LOG) {fprintf(s_fp, "[%d] done t=%lld p=%d | %d %d %d | %08x_%08x\n",
counter,
__rdtsc() - start, pixels,
primclass, vertex_count, index_count,
global.sel.hi, global.sel.lo
);
fflush(s_fp);}
}
void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm)
@ -1421,7 +1475,7 @@ void GSRendererSW::SharedData::UpdateSource()
if(m_parent->s_save && m_parent->s_n >= m_parent->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n, frame, i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM);
s = format("c:\\temp1\\_%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n - 2, frame, i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM);
m_tex[i].t->Save(s);
}

View File

@ -67,8 +67,9 @@ union GSScanlineSelector
uint32 edge:1; // 48
uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 lcm:1; // 50
uint32 mmin:2; // 51
uint32 lcm:1; // 52
uint32 mmin:2; // 53
uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
};
struct

View File

@ -315,7 +315,7 @@ void GSSettingsDlg::UpdateControls()
EnableWindow(GetDlgItem(m_hWnd, IDC_NATIVERES), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_FILTER), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_PALTEX), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw && GSDevice9::GetMaxDepth(m_lastValidMsaa) < 32);
EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_FBA), dx9 && hw);
//EnableWindow(GetDlgItem(m_hWnd, IDC_AA1), sw); // Let users set software params regardless of renderer used
//EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS_EDIT), sw);

View File

@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate()
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < 5; i++)
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
@ -80,7 +80,7 @@ void GSSetupPrimCodeGenerator::Depth()
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -103,7 +103,7 @@ void GSSetupPrimCodeGenerator::Depth()
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
@ -139,36 +139,6 @@ void GSSetupPrimCodeGenerator::Depth()
vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
/*
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
*/
vmovdqa(ptr[&m_local.p.z], xmm0);
}
}
@ -210,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture()
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
@ -272,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color()
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
@ -302,7 +272,7 @@ void GSSetupPrimCodeGenerator::Color()
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

View File

@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate()
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < 5; i++)
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
@ -82,7 +82,7 @@ void GSSetupPrimCodeGenerator::Depth()
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -107,7 +107,7 @@ void GSSetupPrimCodeGenerator::Depth()
mulps(xmm1, xmm3);
movdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
@ -144,36 +144,6 @@ void GSSetupPrimCodeGenerator::Depth()
movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
/*
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
movaps(xmm1, ptr[&GSVector4::m_half]);
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
*/
movdqa(ptr[&m_local.p.z], xmm0);
}
}
@ -217,7 +187,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
@ -282,7 +252,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
@ -315,7 +285,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

View File

@ -37,6 +37,7 @@ GSState::GSState()
, m_frameskip(0)
, m_vt(this)
, m_q(1.0f)
, m_texflush(true)
{
m_nativeres = !!theApp.GetConfig("nativeres", 0);
@ -200,6 +201,8 @@ void GSState::Reset()
m_vertex.tail = 0;
m_vertex.next = 0;
m_index.tail = 0;
m_texflush = true;
}
void GSState::ResetHandlers()
@ -992,7 +995,7 @@ void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r)
void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r)
{
// TRACE(_T("TEXFLUSH\n"));
m_texflush = true;
}
template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r)

View File

@ -143,6 +143,7 @@ protected:
float m_q;
GSVector4 m_scissor;
uint32 m_ofxy;
bool m_texflush;
struct
{

View File

@ -28,10 +28,13 @@ InitializeConditionVariablePtr pInitializeConditionVariable;
WakeConditionVariablePtr pWakeConditionVariable;
WakeAllConditionVariablePtr pWakeAllConditionVariable;
SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
InitializeSRWLockPtr pInitializeSRWLock;;
InitializeSRWLockPtr pInitializeSRWLock;
AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
AcquireSRWLockSharedPtr pAcquireSRWLockShared;
TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared;
ReleaseSRWLockSharedPtr pReleaseSRWLockShared;
class InitCondVar
{
@ -50,6 +53,9 @@ public:
pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive");
pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive");
pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive");
pAcquireSRWLockShared = (AcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "AcquireSRWLockShared");
pTryAcquireSRWLockShared = (TryAcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockShared");
pReleaseSRWLockShared = (ReleaseSRWLockSharedPtr)GetProcAddress(m_kernel32, "ReleaseSRWLockShared");
}
virtual ~InitCondVar()

View File

@ -23,25 +23,54 @@
#include "GSdx.h"
// Interface for objects that provide a thread body. GSThread derives from it
// and leaves ThreadProc() to be implemented further down the hierarchy.
class IGSThread
{
public:
	// Polymorphic base: a virtual destructor keeps deletion through an
	// IGSThread* well-defined and makes sure the derived destructor runs.
	virtual ~IGSThread() {}

protected:
	// Body executed by the thread; must be supplied by the derived class.
	virtual void ThreadProc() = 0;
};
// Abstract mutual-exclusion primitive. GSCritSec and GSCondVarLock implement
// it; GSAutoLock and GSJobQueue only ever see it through an IGSLock*.
class IGSLock
{
public:
	// GSJobQueue owns its lock as an IGSLock* and destroys it with delete,
	// so the destructor has to be virtual for the concrete lock's destructor
	// to run.
	virtual ~IGSLock() {}

	// Acquires the lock.
	virtual void Lock() = 0;
	// Attempts to acquire the lock; returns true if it was acquired.
	virtual bool TryLock() = 0;
	// Releases the lock.
	virtual void Unlock() = 0;
};

// Abstract signalling primitive. GSEvent and GSCondVar implement it.
class IGSEvent
{
public:
	// GSJobQueue owns its events as IGSEvent* and destroys them with delete,
	// so the destructor has to be virtual for the concrete event's destructor
	// to run.
	virtual ~IGSEvent() {}

	// Signals the event.
	virtual void Set() = 0;
	// Waits for the event using lock l; returns true if the underlying wait
	// call reported success. How l is used depends on the implementation
	// (GSEvent unlocks it around the wait, GSCondVar hands it to the
	// condition-variable wait).
	virtual bool Wait(IGSLock* l) = 0;
};
#ifdef _WINDOWS
typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * WakeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
typedef BOOL (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * AcquireSRWLockSharedPtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockSharedPtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * ReleaseSRWLockSharedPtr)(SRWLOCK* SRWLock);
extern InitializeConditionVariablePtr pInitializeConditionVariable;
extern WakeConditionVariablePtr pWakeConditionVariable;
extern WakeAllConditionVariablePtr pWakeAllConditionVariable;
extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
extern InitializeSRWLockPtr pInitializeSRWLock;;
extern InitializeSRWLockPtr pInitializeSRWLock;
extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
extern AcquireSRWLockSharedPtr pAcquireSRWLockShared;
extern TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared;
extern ReleaseSRWLockSharedPtr pReleaseSRWLockShared;
class GSThread
class GSThread : public IGSThread
{
DWORD m_ThreadId;
HANDLE m_hThread;
@ -49,8 +78,6 @@ class GSThread
static DWORD WINAPI StaticThreadProc(void* lpParam);
protected:
virtual void ThreadProc() = 0;
void CreateThread();
void CloseThread();
@ -59,7 +86,7 @@ public:
virtual ~GSThread();
};
class GSCritSec
class GSCritSec : public IGSLock
{
CRITICAL_SECTION m_cs;
@ -67,26 +94,25 @@ public:
GSCritSec() {InitializeCriticalSection(&m_cs);}
~GSCritSec() {DeleteCriticalSection(&m_cs);}
void Lock() {EnterCriticalSection(&m_cs);}
bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;}
void Unlock() {LeaveCriticalSection(&m_cs);}
void Lock() {EnterCriticalSection(&m_cs);}
bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;}
void Unlock() {LeaveCriticalSection(&m_cs);}
};
class GSEvent
class GSEvent : public IGSEvent
{
protected:
HANDLE m_hEvent;
public:
GSEvent(bool manual = false, bool initial = false) {m_hEvent = CreateEvent(NULL, manual, initial, NULL);}
GSEvent() {m_hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);}
~GSEvent() {CloseHandle(m_hEvent);}
void Set() {SetEvent(m_hEvent);}
void Reset() {ResetEvent(m_hEvent);}
bool Wait() {return WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0;}
bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0; if(l) l->Lock(); return b;}
};
class GSCondVarLock
class GSCondVarLock : public IGSLock
{
SRWLOCK m_lock;
@ -94,12 +120,13 @@ public:
GSCondVarLock() {pInitializeSRWLock(&m_lock);}
void Lock() {pAcquireSRWLockExclusive(&m_lock);}
bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;}
void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
operator SRWLOCK* () {return &m_lock;}
};
class GSCondVar
class GSCondVar : public IGSEvent
{
CONDITION_VARIABLE m_cv;
@ -107,7 +134,7 @@ public:
GSCondVar() {pInitializeConditionVariable(&m_cv);}
void Set() {pWakeConditionVariable(&m_cv);}
void Wait(GSCondVarLock& lock) {pSleepConditionVariableSRW(&m_cv, lock, INFINITE, 0);}
bool Wait(IGSLock* l) {return pSleepConditionVariableSRW(&m_cv, *(GSCondVarLock*)l, INFINITE, 0) != 0;}
operator CONDITION_VARIABLE* () {return &m_cv;}
};
@ -117,7 +144,7 @@ public:
#include <pthread.h>
#include <semaphore.h>
class GSThread
class GSThread : public IGSThread
{
pthread_attr_t m_thread_attr;
pthread_t m_thread;
@ -125,8 +152,6 @@ class GSThread
static void* StaticThreadProc(void* param);
protected:
virtual void ThreadProc() = 0;
void CreateThread();
void CloseThread();
@ -135,16 +160,16 @@ public:
virtual ~GSThread();
};
class GSCritSec
class GSCritSec : public IGSLock
{
pthread_mutexattr_t m_mutex_attr;
pthread_mutex_t m_mutex;
public:
GSCritSec()
GSCritSec(bool recursive = true)
{
pthread_mutexattr_init(&m_mutex_attr);
pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutexattr_settype(&m_mutex_attr, recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL);
pthread_mutex_init(&m_mutex, &m_mutex_attr);
}
@ -159,7 +184,7 @@ public:
void Unlock() {pthread_mutex_unlock(&m_mutex);}
};
class GSEvent
class GSEvent : public IGSEvent
{
protected:
sem_t m_sem;
@ -169,36 +194,18 @@ public:
~GSEvent() {sem_destroy(&m_sem);}
void Set() {sem_post(&m_sem);}
bool Wait() {return sem_wait(&m_sem) == 0;}
bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = sem_wait(&m_sem) == 0; if(l) l->Lock(); return b;}
};
// Note except the mutex attribute the code is same as GSCritSec object
class GSCondVarLock
class GSCondVarLock : public GSCritSec
{
pthread_mutexattr_t m_mutex_attr;
pthread_mutex_t m_mutex;
public:
GSCondVarLock()
GSCondVarLock() : GSCritSec(false)
{
pthread_mutexattr_init(&m_mutex_attr);
pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_NORMAL);
pthread_mutex_init(&m_mutex, &m_mutex_attr);
}
virtual ~GSCondVarLock()
{
pthread_mutex_destroy(&m_mutex);
pthread_mutexattr_destroy(&m_mutex_attr);
}
void Lock() {pthread_mutex_lock(&m_mutex);}
bool TryLock() {return pthread_mutex_trylock(&m_mutex) == 0;}
void Unlock() {pthread_mutex_unlock(&m_mutex);}
operator pthread_mutex_t* () {return &m_mutex;}
};
class GSCondVar
class GSCondVar : public IGSEvent
{
pthread_cond_t m_cv;
pthread_condattr_t m_cv_attr;
@ -209,6 +216,7 @@ public:
pthread_condattr_init(&m_cv_attr);
pthread_cond_init(&m_cv, &m_cv_attr);
}
virtual ~GSCondVar()
{
pthread_condattr_destroy(&m_cv_attr);
@ -216,7 +224,7 @@ public:
}
void Set() {pthread_cond_signal(&m_cv);}
void Wait(GSCondVarLock& lock) {pthread_cond_wait(&m_cv, lock);}
// Waits on the condition variable with the mutex behind l and returns true
// when pthread_cond_wait reports success (0). l is cast to GSCondVarLock*
// without any check, so callers must pass a GSCondVarLock.
bool Wait(IGSLock* l) {return pthread_cond_wait(&m_cv, *(GSCondVarLock*)l) == 0;}
operator pthread_cond_t* () {return &m_cv;}
};
@ -225,32 +233,11 @@ public:
class GSAutoLock
{
protected:
GSCritSec* m_cs;
IGSLock* m_lock;
public:
GSAutoLock(GSCritSec* cs) {m_cs = cs; m_cs->Lock();}
~GSAutoLock() {m_cs->Unlock();}
};
class GSEventSpin
{
protected:
volatile long m_sync;
volatile bool m_manual;
public:
GSEventSpin(bool manual = false, bool initial = false) {m_sync = initial ? 1 : 0; m_manual = manual;}
~GSEventSpin() {}
void Set() {_interlockedbittestandset(&m_sync, 0);}
void Reset() {_interlockedbittestandreset(&m_sync, 0);}
bool Wait()
{
if(m_manual) while(!m_sync) _mm_pause();
else while(!_interlockedbittestandreset(&m_sync, 0)) _mm_pause();
return true;
}
GSAutoLock(IGSLock* l) {(m_lock = l)->Lock();}
~GSAutoLock() {m_lock->Unlock();}
};
template<class T> class GSJobQueue : private GSThread
@ -259,70 +246,36 @@ protected:
queue<T> m_queue;
volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
volatile bool m_exit;
struct {GSCritSec lock; GSEvent notempty;} m_ev;
struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv;
IGSEvent* m_notempty;
IGSEvent* m_empty;
IGSLock* m_lock;
void ThreadProc()
{
if(m_cv.available)
m_lock->Lock();
while(true)
{
m_cv.lock.Lock();
while(true)
while(m_queue.empty())
{
while(m_queue.empty())
{
m_cv.notempty.Wait(m_cv.lock);
m_notempty->Wait(m_lock);
if(m_exit) {m_cv.lock.Unlock(); return;}
}
T& item = m_queue.front();
m_cv.lock.Unlock();
Process(item);
m_cv.lock.Lock();
m_queue.pop();
m_count--;
if(m_queue.empty())
{
m_cv.empty.Set();
}
if(m_exit) {m_lock->Unlock(); return;}
}
}
else
{
m_ev.lock.Lock();
while(true)
T& item = m_queue.front();
m_lock->Unlock();
Process(item);
m_lock->Lock();
m_queue.pop();
if(--m_count == 0)
{
while(m_queue.empty())
{
m_ev.lock.Unlock();
m_ev.notempty.Wait();
if(m_exit) {return;}
m_ev.lock.Lock();
}
T& item = m_queue.front();
m_ev.lock.Unlock();
Process(item);
m_ev.lock.Lock();
m_queue.pop();
m_count--;
m_empty->Set();
}
}
}
@ -332,17 +285,30 @@ public:
: m_count(0)
, m_exit(false)
{
m_cv.available = !!theApp.GetConfig("condvar", 1);
bool condvar = !!theApp.GetConfig("condvar", 1);
#ifdef _WINDOWS
if(pInitializeConditionVariable == NULL)
{
m_cv.available = false;
condvar = false;
}
#endif
if(condvar)
{
m_notempty = new GSCondVar();
m_empty = new GSCondVar();
m_lock = new GSCondVarLock();
}
else
{
m_notempty = new GSEvent();
m_empty = new GSEvent();
m_lock = new GSCritSec();
}
CreateThread();
}
@ -350,14 +316,13 @@ public:
{
m_exit = true;
if(m_cv.available)
{
m_cv.notempty.Set();
}
else
{
m_ev.notempty.Set();
}
m_notempty->Set();
CloseThread();
delete m_notempty;
delete m_empty;
delete m_lock;
}
bool IsEmpty() const
@ -369,51 +334,32 @@ public:
void Push(const T& item)
{
if(m_cv.available)
{
m_cv.lock.Lock();
m_lock->Lock();
m_queue.push(item);
m_queue.push(item);
m_count++;
m_cv.lock.Unlock();
m_cv.notempty.Set();
}
else
if(m_count++ == 0)
{
GSAutoLock l(&m_ev.lock);
m_queue.push(item);
m_count++;
m_ev.notempty.Set();
m_notempty->Set();
}
m_lock->Unlock();
}
void Wait()
{
if(m_cv.available)
if(m_count > 0)
{
if(m_count > 0)
m_lock->Lock();
while(m_count != 0)
{
m_cv.lock.Lock();
while(!m_queue.empty())
{
m_cv.empty.Wait(m_cv.lock);
}
ASSERT(m_count == 0);
m_cv.lock.Unlock();
m_empty->Wait(m_lock);
}
}
else
{
while(m_count > 0) _mm_pause();
ASSERT(m_queue.empty());
m_lock->Unlock();
}
}