GSdx: More AVX2 code to read/write different block formats. The GSBenchmark function shows nice improvements, but no games run faster. I am just uploading the changes before messing with the drawing part.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5675 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11@gmail.com 2013-06-17 04:11:10 +00:00
parent f4ce6d6fce
commit 8b9f5b5bc2
20 changed files with 1467 additions and 596 deletions

View File

@ -71,4 +71,6 @@ public:
void DrawRect(const GSVector4i& r, const GSVertexSW& v);
#endif
void PrintStats() {m_ds_map.PrintStats();}
};

View File

@ -120,7 +120,7 @@ protected:
void GrowVertexBuffer()
{
int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 16);
Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);
if(m_vertices != NULL)
{

View File

@ -27,7 +27,7 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
: GPURendererT<GSVertexSW>(dev)
, m_texture(NULL)
{
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 32);
m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads, &m_perfmon);
}
@ -121,7 +121,7 @@ void GPURendererSW::Draw()
data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16);
data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 32);
data->vertex = (GSVertexSW*)data->buff;
data->vertex_count = m_count;

View File

@ -747,7 +747,7 @@ GPUState::Buffer::Buffer()
{
bytes = 0;
maxbytes = 4096;
buff = (uint8*)_aligned_malloc(maxbytes, 16);
buff = (uint8*)_aligned_malloc(maxbytes, 32);
cur = 0;
}
@ -761,9 +761,9 @@ void GPUState::Buffer::Reserve(int size)
if(size > maxbytes)
{
int new_maxbytes = (maxbytes + size + 1023) & ~1023;
uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 16);
uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 32);
if(buff != NULL)
if(buff != NULL)
{
memcpy(new_buff, buff, maxbytes);
_aligned_free(buff);

View File

@ -23,9 +23,7 @@
#include "GSBlock.h"
#if _M_SSE >= 0x501
const GSVector8i GSBlock::m_r16mask(
0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#else
const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#endif
@ -44,7 +42,7 @@ const GSVector4i GSBlock::m_xgxx(0x000003e0);
const GSVector4i GSBlock::m_rxxx(0x0000001f);
#endif
const GSVector4i GSBlock::m_uw8hmask0 = GSVector4i(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
const GSVector4i GSBlock::m_uw8hmask1 = GSVector4i(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
const GSVector4i GSBlock::m_uw8hmask2 = GSVector4i(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
const GSVector4i GSBlock::m_uw8hmask3 = GSVector4i(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
const GSVector4i GSBlock::m_uw8hmask3(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);

File diff suppressed because it is too large Load Diff

View File

@ -177,7 +177,7 @@ public:
: CBaseFilter(NAME("GSSource"), pUnk, this, __uuidof(this), &hr)
, m_output(NULL)
, m_size(w, h)
, m_atpf(10000000i64 / fps)
, m_atpf((REFERENCE_TIME)(10000000.0f / fps))
, m_now(0)
{
m_output = new GSSourceOutputPin(m_size, m_atpf, this, this, hr, colorspace);

View File

@ -85,4 +85,6 @@ public:
void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);
#endif
void PrintStats() {m_ds_map.PrintStats();}
};

View File

@ -44,8 +44,9 @@ public:
struct
{
GSVector4 in;
GSVector4i ex;
GSVector4 ofex;
uint32 ofxy;
GSVector4i ofxy;
} scissor;
struct
@ -83,6 +84,11 @@ public:
void UpdateScissor()
{
scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4);
scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4);
scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4);
scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4);
scissor.ofex = GSVector4(
(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
(int)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY),
@ -95,10 +101,11 @@ public:
(int)SCISSOR.SCAX1 + 1,
(int)SCISSOR.SCAY1 + 1);
uint16 ofx = (uint16)XYOFFSET.OFX - 15;
uint16 ofy = (uint16)XYOFFSET.OFY - 15;
uint16 ofx = (uint16)XYOFFSET.OFX;
uint16 ofy = (uint16)XYOFFSET.OFY;
scissor.ofxy = ((ofy << 16) | ofx); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
scissor.ofxy.u32[0] = (ofy << 16) | ofx;
scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
}
bool DepthRead() const

View File

@ -678,7 +678,7 @@ vector<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
////////////////////
template<int psm, int bsx, int bsy, bool aligned>
template<int psm, int bsx, int bsy, int alignment>
void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
{
uint32 bp = BITBLTBUF.DBP;
@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
{
switch(psm)
{
case PSM_PSMCT32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMCT16: WriteColumn16<aligned>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT16S: WriteColumn16<aligned>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMT8: WriteColumn8<aligned>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
case PSM_PSMT4: WriteColumn4<aligned>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
case PSM_PSMZ32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMZ16: WriteColumn16<aligned>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMZ16S: WriteColumn16<aligned>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMCT16: WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT16S: WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMT8: WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
case PSM_PSMT4: WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
case PSM_PSMZ32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMZ16: WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMZ16S: WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
// TODO
default: __assume(0);
}
@ -707,7 +707,7 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
}
}
template<int psm, int bsx, int bsy, bool aligned>
template<int psm, int bsx, int bsy, int alignment>
void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
{
uint32 bp = BITBLTBUF.DBP;
@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src
{
switch(psm)
{
case PSM_PSMCT32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMCT16: WriteBlock16<aligned>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT16S: WriteBlock16<aligned>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMT8: WriteBlock8<aligned>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
case PSM_PSMT4: WriteBlock4<aligned>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
case PSM_PSMZ32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMZ16: WriteBlock16<aligned>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMZ16S: WriteBlock16<aligned>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMCT16: WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMCT16S: WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMT8: WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
case PSM_PSMT4: WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
case PSM_PSMZ32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
case PSM_PSMZ16: WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
case PSM_PSMZ16S: WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
// TODO
default: __assume(0);
}
@ -803,7 +803,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
case PSM_PSMZ32:
ReadColumn32(y, dst, buff, 32);
memcpy(&buff[32], &src[x * 4], 32);
WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
break;
case PSM_PSMCT16:
case PSM_PSMCT16S:
@ -811,17 +811,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
case PSM_PSMZ16S:
ReadColumn16(y, dst, buff, 32);
memcpy(&buff[32], &src[x * 2], 32);
WriteColumn16<true>(y, dst, buff, 32);
WriteColumn16<32>(y, dst, buff, 32);
break;
case PSM_PSMT8:
ReadColumn8(y, dst, buff, 16);
for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16);
WriteColumn8<true>(y, dst, buff, 16);
WriteColumn8<32>(y, dst, buff, 16);
break;
case PSM_PSMT4:
ReadColumn4(y, dst, buff, 16);
for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16);
WriteColumn4<true>(y, dst, buff, 16);
WriteColumn4<32>(y, dst, buff, 16);
break;
// TODO
default:
@ -841,13 +841,19 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
if(h2 > 0)
{
if(((size_t)&src[l * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
size_t addr = (size_t)&src[l * trbpp >> 3];
if((addr & 31) == 0 && (srcpitch & 31) == 0)
{
WriteImageColumn<psm, bsx, bsy, true>(l, r, y, h2, src, srcpitch, BITBLTBUF);
WriteImageColumn<psm, bsx, bsy, 32>(l, r, y, h2, src, srcpitch, BITBLTBUF);
}
else if((addr & 15) == 0 && (srcpitch & 15) == 0)
{
WriteImageColumn<psm, bsx, bsy, 16>(l, r, y, h2, src, srcpitch, BITBLTBUF);
}
else
{
WriteImageColumn<psm, bsx, bsy, false>(l, r, y, h2, src, srcpitch, BITBLTBUF);
WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
}
src += srcpitch * h2;
@ -884,7 +890,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
case PSM_PSMZ32:
ReadColumn32(y, dst, buff, 32);
memcpy(&buff[0], &src[x * 4], 32);
WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
break;
case PSM_PSMCT16:
case PSM_PSMCT16S:
@ -892,17 +898,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
case PSM_PSMZ16S:
ReadColumn16(y, dst, buff, 32);
memcpy(&buff[0], &src[x * 2], 32);
WriteColumn16<true>(y, dst, buff, 32);
WriteColumn16<32>(y, dst, buff, 32);
break;
case PSM_PSMT8:
ReadColumn8(y, dst, buff, 16);
for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16);
WriteColumn8<true>(y, dst, buff, 16);
WriteColumn8<32>(y, dst, buff, 16);
break;
case PSM_PSMT4:
ReadColumn4(y, dst, buff, 16);
for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16);
WriteColumn4<true>(y, dst, buff, 16);
WriteColumn4<32>(y, dst, buff, 16);
break;
// TODO
default:
@ -982,13 +988,19 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const uint8* src, int len, GIFR
if(h2 > 0)
{
if(((size_t)&s[la * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
size_t addr = (size_t)&s[la * trbpp >> 3];
if((addr & 31) == 0 && (srcpitch & 31) == 0)
{
WriteImageBlock<psm, bsx, bsy, true>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
WriteImageBlock<psm, bsx, bsy, 32>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
}
else if((addr & 15) == 0 && (srcpitch & 15) == 0)
{
WriteImageBlock<psm, bsx, bsy, 16>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
}
else
{
WriteImageBlock<psm, bsx, bsy, false>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
}
s += srcpitch * h2;
@ -1385,6 +1397,8 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
int sx = (int)TRXPOS.SSAX;
int ex = sx + (int)TRXREG.RRW;
// printf("spsm=%d x=%d ex=%d y=%d len=%d\n", BITBLTBUF.SPSM, x, ex, y, len);
switch(BITBLTBUF.SPSM)
{
case PSM_PSMCT32:
@ -1399,12 +1413,21 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
int* RESTRICT offset = psm->rowOffset[y & 7];
uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pd += 4)
for(; len > 0 && x < ex && (x & 7); len--, x++, pd++)
{
pd[0] = ps[offset[x + 0]];
pd[1] = ps[offset[x + 1]];
pd[2] = ps[offset[x + 2]];
pd[3] = ps[offset[x + 3]];
*pd = ps[offset[x]];
}
// aligned to a column
for(int ex8 = ex - 8; len >= 8 && x <= ex8; len -= 8, x += 8, pd += 8)
{
int o = offset[x];
GSVector4i::store<false>(&pd[0], GSVector4i::load(&ps[o + 0], &ps[o + 4]));
GSVector4i::store<false>(&pd[4], GSVector4i::load(&ps[o + 8], &ps[o + 12]));
for(int i = 0; i < 8; i++) ASSERT(pd[i] == ps[offset[x + i]]);
}
for(; len > 0 && x < ex; len--, x++, pd++)

View File

@ -845,10 +845,10 @@ public:
//
template<int psm, int bsx, int bsy, bool aligned>
template<int psm, int bsx, int bsy, int alignment>
void WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
template<int psm, int bsx, int bsy, bool aligned>
template<int psm, int bsx, int bsy, int alignment>
void WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
template<int psm, int bsx, int bsy>

View File

@ -467,7 +467,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
{
edge = vertex[i[1 - m2]];
edge.p = edge.p.insert<0, 1>(vertex[i[m2]].p);
edge.p = edge.p.insert32<0, 1>(vertex[i[m2]].p);
dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, vertex[i[1 - m2]].p);
@ -489,7 +489,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
{
edge = v1;
edge.p = v0.p.xxxx().addm(ddx[m2], dv[0].p.yyyy()).xyzw(edge.p);
edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p);
dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p);
@ -532,7 +532,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
GSVertexSW scan;
scan.p = edge.p.addm(dedge.p, dy);
scan.p = edge.p + dedge.p * dy;
GSVector4 lrf = scan.p.ceil();
GSVector4 l = lrf.max(scissor);
@ -546,16 +546,18 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
if(pixels > 0)
{
scan.t = edge.t.addm(dedge.t, dy);
scan.c = edge.c.addm(dedge.c, dy);
scan.t = edge.t + dedge.t * dy;
scan.c = edge.c + dedge.c * dy;
GSVector4 prestep = (l - p0).xxxx();
scan.p = scan.p.addm(dscan.p, prestep);
scan.t = scan.t.addm(dscan.t, prestep);
scan.c = scan.c.addm(dscan.c, prestep);
scan.p = scan.p + dscan.p * prestep;
scan.t = scan.t + dscan.t * prestep;
scan.c = scan.c + dscan.c * prestep;
AddScanline(e++, pixels, left, top, scan);
//m_pixels += pixels; m_ds->DrawScanline(pixels, left, top, scan);
}
top++;
@ -629,8 +631,8 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index)
GSVertexSW dedge;
GSVertexSW dscan;
dedge.t = GSVector4::zero().insert<1, 1>(dt);
dscan.t = GSVector4::zero().insert<0, 0>(dt);
dedge.t = GSVector4::zero().insert32<1, 1>(dt);
dscan.t = GSVector4::zero().insert32<0, 0>(dt);
GSVector4 prestep = GSVector4(r.left, r.top) - scan.p;
@ -851,9 +853,9 @@ void GSRasterizer::AddScanline(GSVertexSW* e, int pixels, int left, int top, con
{
*e = scan;
e->p.i16[0] = (int16)pixels;
e->p.i16[1] = (int16)left;
e->p.i16[2] = (int16)top;
e->_pad.i32[0] = pixels;
e->_pad.i32[1] = left;
e->_pad.i32[2] = top;
}
void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge)
@ -873,9 +875,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
{
do
{
int pixels = e->p.i16[0];
int left = e->p.i16[1];
int top = e->p.i16[2];
int pixels = e->_pad.i32[0];
int left = e->_pad.i32[1];
int top = e->_pad.i32[2];
m_pixels += pixels;
@ -887,9 +889,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
{
do
{
int pixels = e->p.i16[0];
int left = e->p.i16[1];
int top = e->p.i16[2];
int pixels = e->_pad.i32[0];
int left = e->_pad.i32[1];
int top = e->_pad.i32[2];
m_pixels += pixels;

View File

@ -104,6 +104,8 @@ public:
#endif
virtual void PrintStats() = 0;
__forceinline bool HasEdge() const {return m_de != NULL;}
__forceinline bool IsSolidRect() const {return m_dr != NULL;}
};
@ -117,6 +119,7 @@ public:
virtual void Sync() = 0;
virtual bool IsSynced() const = 0;
virtual int GetPixels(bool reset = true) = 0;
virtual void PrintStats() = 0;
};
__aligned(class, 32) GSRasterizer : public IRasterizer
@ -164,10 +167,10 @@ public:
void Sync() {}
bool IsSynced() const {return true;}
int GetPixels(bool reset);
void PrintStats() {m_ds->PrintStats();}
};
class GSRasterizerList
: public IRasterizer
class GSRasterizerList : public IRasterizer
{
protected:
class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
@ -221,4 +224,5 @@ public:
void Sync();
bool IsSynced() const;
int GetPixels(bool reset);
void PrintStats() {}
};

View File

@ -28,6 +28,10 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#if _M_SSE >= 0x501
const GSVector8 g_pos_scale2(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#endif
GSRendererSW::GSRendererSW(int threads)
: m_fzb(NULL)
{
@ -210,7 +214,7 @@ void GSRendererSW::VSync(int field)
m_tc->IncAge();
// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
// if((m_perfmon.GetFrame() & 255) == 0) m_rl->PrintStats();
}
void GSRendererSW::ResetDevice()
@ -263,18 +267,80 @@ GSTexture* GSRendererSW::GetOutput(int i)
template<uint32 primclass, uint32 tme, uint32 fst>
void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
{
size_t i = m_vertex.next;
#if 0//_M_SSE >= 0x501
// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
GSVector8i o2((GSVector4i)m_context->XYOFFSET);
GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
{
GSVector8i v0 = GSVector8i::load<true>(src[0].m);
GSVector8i v1 = GSVector8i::load<true>(src[1].m);
GSVector8 stcq = GSVector8::cast(v0.ac(v1));
GSVector8i xyzuvf = v0.bd(v1);
//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
GSVector8i xy = xyzuvf.upl16() - o2;
GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * g_pos_scale2;
GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
GSVector8 t = GSVector8::zero();
if(tme)
{
if(fst)
{
t = GSVector8(xyzuvf.uph16() << (16 - 4));
}
else
{
t = stcq.xyww() * tsize2;
}
}
if(primclass == GS_SPRITE_CLASS)
{
t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
}
/*
if(tme || primclass == GS_SPRITE_CLASS)
{
GSVector8::store<true>(&dst[0].p, p.ac(t));
}
else
{
GSVector8::storel(&dst[0].p, p);
}
*/
GSVector8::store<true>(&dst[0].p, p.ac(t));
GSVector8::store<true>(&dst[0].c, c.a_());
/*
if(tme || primclass == GS_SPRITE_CLASS)
{
GSVector8::store<true>(&dst[1].p, p.bd(t));
}
else
{
GSVector8::storeh(&dst[1].p, p);
}
*/
GSVector8::store<true>(&dst[1].p, p.bd(t));
GSVector8::store<true>(&dst[1].c, c.b_());
}
#else
GSVector4i o = (GSVector4i)m_context->XYOFFSET;
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
#if _M_SSE >= 0x501
// TODO: process vertices in pairs, when AVX2 becomes available
#endif
for(; i > 0; i--, src++, dst++)
for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
{
GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
@ -297,7 +363,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
GSVector4 t;
GSVector4 t = GSVector4::zero();
if(tme)
{
@ -323,17 +389,25 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
{
#if _M_SSE >= 0x401
t = t.insert<1, 3>(GSVector4::cast(xyzuvf));
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
#else
t = t.insert<0, 3>(GSVector4::cast(GSVector4i::load(z)));
t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
#endif
}
dst->t = t;
#if _M_SSE >= 0x501
dst->_pad = GSVector4::zero();
#endif
}
#endif
}
void GSRendererSW::Draw()
@ -345,10 +419,10 @@ void GSRendererSW::Draw()
shared_ptr<GSRasterizerData> data(sd);
sd->primclass = m_vt.m_primclass;
sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32);
sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1) + sizeof(uint32) * m_index.tail, 32);
sd->vertex = (GSVertexSW*)sd->buff;
sd->vertex_count = m_vertex.next;
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next);
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
sd->index_count = m_index.tail;
(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
@ -631,6 +705,20 @@ void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
}
}
// Atomically increments the 16-bit counter at *lpAddend.
// The commented-out non-atomic form was deliberately replaced with the
// interlocked intrinsic — presumably because these page reference counts
// are touched from more than one thread (see UsePages/ReleasePages); confirm.
__forceinline void Increment16(volatile short* lpAddend)
{
	// (*lpAddend)++;

	_InterlockedIncrement16(lpAddend);
}
// Atomically decrements the 16-bit counter at *lpAddend.
// Counterpart of Increment16; same rationale — the non-atomic decrement was
// replaced with the interlocked intrinsic for thread-safe page ref-counting.
__forceinline void Decrement16(volatile short* lpAddend)
{
	// (*lpAddend)--;

	_InterlockedDecrement16(lpAddend);
}
void GSRendererSW::UsePages(const uint32* pages, int type)
{
if(type < 2)
@ -639,7 +727,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
{
ASSERT(((short*)&m_fzb_pages[*p])[type] < SHRT_MAX);
_InterlockedIncrement16((short*)&m_fzb_pages[*p] + type);
Increment16((short*)&m_fzb_pages[*p] + type);
}
}
else
@ -648,7 +736,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
{
ASSERT(m_tex_pages[*p] < SHRT_MAX);
_InterlockedIncrement16((short*)&m_tex_pages[*p]); // remember which texture pages are used
Increment16((short*)&m_tex_pages[*p]);
}
}
}
@ -661,7 +749,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
{
ASSERT(((short*)&m_fzb_pages[*p])[type] > 0);
_InterlockedDecrement16((short*)&m_fzb_pages[*p] + type);
Decrement16((short*)&m_fzb_pages[*p] + type);
}
}
else
@ -670,7 +758,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
{
ASSERT(m_tex_pages[*p] > 0);
_InterlockedDecrement16((short*)&m_tex_pages[*p]);
Decrement16((short*)&m_tex_pages[*p]);
}
}
}
@ -1390,23 +1478,29 @@ GSRendererSW::SharedData::~SharedData()
fflush(s_fp);}
}
static TransactionScope::Lock s_lock;
void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm)
{
if(m_using_pages) return;
if(global.sel.fb)
{
m_parent->UsePages(fb_pages, 0);
}
//TransactionScope scope(s_lock);
if(global.sel.zb)
{
m_parent->UsePages(zb_pages, 1);
}
if(global.sel.fb)
{
m_parent->UsePages(fb_pages, 0);
}
for(size_t i = 0; m_tex[i].t != NULL; i++)
{
m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
if(global.sel.zb)
{
m_parent->UsePages(zb_pages, 1);
}
for(size_t i = 0; m_tex[i].t != NULL; i++)
{
m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
}
}
m_fb_pages = fb_pages;
@ -1421,19 +1515,23 @@ void GSRendererSW::SharedData::ReleasePages()
{
if(!m_using_pages) return;
if(global.sel.fb)
{
m_parent->ReleasePages(m_fb_pages, 0);
}
//TransactionScope scope(s_lock);
if(global.sel.zb)
{
m_parent->ReleasePages(m_zb_pages, 1);
}
if(global.sel.fb)
{
m_parent->ReleasePages(m_fb_pages, 0);
}
for(size_t i = 0; m_tex[i].t != NULL; i++)
{
m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
if(global.sel.zb)
{
m_parent->ReleasePages(m_zb_pages, 1);
}
for(size_t i = 0; m_tex[i].t != NULL; i++)
{
m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
}
}
delete [] m_fb_pages;

View File

@ -516,7 +516,7 @@ void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
*/
GSVector4i xy = GSVector4i::loadl(&r->u64[0]);
GSVector4i zf = GSVector4i::loadl(&r->u64[1]);
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
m_v.m[1] = xy.upl32(zf);
@ -567,14 +567,19 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, ui
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
/*
GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
GSVector4i rbga = rg.upl8(ba);
GSVector4i rgba = rbga.upl8(rbga.zzzz());
*/
q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
@ -599,7 +604,12 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uin
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
/*
GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
GSVector4i rbga = rg.upl8(ba);
GSVector4i rgba = rbga.upl8(rbga.zzzz());
*/
q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
@ -719,7 +729,7 @@ void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)
GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
GSVector4i uvf = GSVector4i::loadl(&m_v.UV).upl32(xyzf.srl32(24).srl<4>());
GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
m_v.m[1] = xyz.upl64(uvf);
@ -2258,7 +2268,7 @@ void GSState::UpdateContext()
void GSState::UpdateScissor()
{
m_scissor = m_context->scissor.ofex;
m_scissor = m_context->scissor.ex;
m_ofxy = m_context->scissor.ofxy;
}
@ -2286,8 +2296,8 @@ void GSState::GrowVertexBuffer()
{
int maxcount = std::max<int>(m_vertex.maxcount * 3 / 2, 10000);
GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 16);
uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 16); // worst case is slightly less than vertex number * 3
GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3
if(m_vertex.buff != NULL)
{
@ -2328,7 +2338,13 @@ __forceinline void GSState::VertexKick(uint32 skip)
tailptr[0] = v0;
tailptr[1] = v1;
m_vertex.xy[xy_tail & 3] = GSVector4(v1.upl32(v1.sub16(GSVector4i::load(m_ofxy)).sra16(4)).upl16()); // zw not sign extended, only useful for eq tests
GSVector4i xy = v1.xxxx().sub16(m_ofxy);
#if _M_SSE >= 0x401
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4)));
#else
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy()));
#endif
m_vertex.tail = ++tail;
m_vertex.xy_tail = ++xy_tail;
@ -2356,14 +2372,14 @@ __forceinline void GSState::VertexKick(uint32 skip)
if(skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
{
GSVector4 v0, v1, v2, v3;
GSVector4i v0, v1, v2, v3, pmin, pmax;
v0 = m_vertex.xy[(xy_tail + 1) & 3]; // T-3
v1 = m_vertex.xy[(xy_tail + 2) & 3]; // T-2
v2 = m_vertex.xy[(xy_tail + 3) & 3]; // T-1
v3 = m_vertex.xy[(xy_tail - m) & 3]; // H
v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
GSVector4 pmin, pmax, cross;
GSVector4 cross;
switch(prim)
{
@ -2374,21 +2390,21 @@ __forceinline void GSState::VertexKick(uint32 skip)
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin = v2.min(v1);
pmax = v2.max(v1);
pmin = v2.min_i16(v1);
pmax = v2.max_i16(v1);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
pmin = v2.min(v1.min(v0));
pmax = v2.max(v1.max(v0));
pmin = v2.min_i16(v1.min_i16(v0));
pmax = v2.max_i16(v1.max_i16(v0));
break;
case GS_TRIANGLEFAN:
pmin = v2.min(v1.min(v3));
pmax = v2.max(v1.max(v3));
pmin = v2.min_i16(v1.min_i16(v3));
pmax = v2.max_i16(v1.max_i16(v3));
break;
}
GSVector4 test = pmax < m_scissor | pmin > m_scissor.zwxy();
GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl());
switch(prim)
{
@ -2396,7 +2412,7 @@ __forceinline void GSState::VertexKick(uint32 skip)
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= m_nativeres ? (pmin == pmax).zwzw() : pmin == pmax;
test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
break;
}
@ -2404,16 +2420,19 @@ __forceinline void GSState::VertexKick(uint32 skip)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
cross = (v2 - v1) * (v2 - v0).yxwz();
test |= cross == cross.yxwz();
// TODO: any way to do a 16-bit integer cross product?
cross = GSVector4(v2.xyxyl().i16to32().sub32(v0.upl32(v1).i16to32())); // x20, y20, x21, y21
cross = cross * cross.wzwz(); // x20 * y21, y20 * x21
test |= GSVector4i::cast(cross == cross.yxwz());
break;
case GS_TRIANGLEFAN:
cross = (v2 - v1) * (v2 - v3).yxwz();
test |= cross == cross.yxwz();
cross = GSVector4(v2.xyxyl().i16to32().sub32(v3.upl32(v1).i16to32())); // x23, y23, x21, y21
cross = cross * cross.wzwz(); // x23 * y21, y23 * x21
test |= GSVector4i::cast(cross == cross.yxwz());
break;
}
skip |= test.mask() & 3;
skip |= test.mask() & 15;
}
if(skip != 0)

View File

@ -148,16 +148,16 @@ protected:
GSVertex m_v;
float m_q;
GSVector4 m_scissor;
uint32 m_ofxy;
GSVector4i m_scissor;
GSVector4i m_ofxy;
bool m_texflush;
struct
{
GSVertex* buff;
size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
GSVector4 xy[4];
size_t xy_tail;
uint64 xy[4];
} m_vertex;
struct

View File

@ -367,3 +367,99 @@ public:
virtual void Process(T& item) = 0;
};
// http://software.intel.com/en-us/blogs/2012/11/06/exploring-intel-transactional-synchronization-extensions-with-intel-software
class TransactionScope
{
public:
class Lock
{
volatile long state;
public:
Lock()
: state(0)
{
}
void lock()
{
while(_InterlockedCompareExchange(&state, 1, 0) != 0)
{
do {_mm_pause();} while(state == 1);
}
}
void unlock()
{
_InterlockedExchange(&state, 0);
}
bool isLocked() const
{
return state == 1;
}
};
private:
Lock& fallBackLock;
TransactionScope();
public:
TransactionScope(Lock& fallBackLock_, int max_retries = 3)
: fallBackLock(fallBackLock_)
{
#if _M_SSE >= 0x501
int nretries = 0;
while(1)
{
++nretries;
unsigned status = _xbegin();
if(status == _XBEGIN_STARTED)
{
if(!fallBackLock.isLocked()) return;
_xabort(0xff);
}
if((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff && !(status & _XABORT_NESTED))
{
while(fallBackLock.isLocked()) _mm_pause();
}
else if(!(status & _XABORT_RETRY))
{
break;
}
if(nretries >= max_retries)
{
break;
}
}
#endif
fallBackLock.lock();
}
~TransactionScope()
{
if(fallBackLock.isLocked())
{
fallBackLock.unlock();
}
#if _M_SSE >= 0x501
else
{
_xend();
}
#endif
}
};

View File

@ -78,6 +78,8 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000
const GSVector8 GSVector8::m_one(1.0f);
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));
#endif

View File

@ -159,7 +159,7 @@ public:
__forceinline explicit GSVector4i(int i)
{
m = _mm_set1_epi32(i);
*this = i;
}
__forceinline explicit GSVector4i(__m128i m)
@ -190,7 +190,15 @@ public:
__forceinline void operator = (int i)
{
#if _M_SSE >= 0x501
m = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
#else
m = _mm_set1_epi32(i);
#endif
}
__forceinline void operator = (__m128i m)
@ -790,6 +798,16 @@ public:
return upl32();
}
__forceinline GSVector4i i8to16() const
{
return zero().upl8().sra16(8);
}
__forceinline GSVector4i i16to32() const
{
return zero().upl16().sra32(16);
}
#endif
template<int i> __forceinline GSVector4i srl() const
@ -1245,6 +1263,15 @@ public:
#if _M_SSE >= 0x401
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
{
return GSVector4i(_mm_blend_epi32(m, v.m, i));
}
#endif
#if _M_SSE >= 0x401
template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
{
GSVector4i v;
@ -2446,21 +2473,29 @@ public:
m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v));
}
__forceinline explicit GSVector4(float f)
{
m = _mm_set1_ps(f);
}
__forceinline explicit GSVector4(__m128 m)
{
this->m = m;
}
__forceinline explicit GSVector4(float f)
{
*this = f;
}
__forceinline explicit GSVector4(int i)
{
#if _M_SSE >= 0x501
m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
#else
GSVector4i v((int)i);
*this = GSVector4(v);
#endif
}
__forceinline explicit GSVector4(uint32 u)
@ -2493,7 +2528,15 @@ public:
__forceinline void operator = (float f)
{
#if _M_SSE >= 0x501
m = _mm_broadcastss_ps(_mm_load_ss(&f));
#else
m = _mm_set1_ps(f);
#endif
}
__forceinline void operator = (__m128 m)
@ -2788,6 +2831,15 @@ public:
return GSVector4(_mm_max_ps(m, a));
}
#if _M_SSE >= 0x401
template<int mask> __forceinline GSVector4 blend32(const GSVector4& a) const
{
return GSVector4(_mm_blend_ps(m, a, mask));
}
#endif
__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
{
#if _M_SSE >= 0x401
@ -2855,7 +2907,7 @@ public:
#endif
}
template<int src, int dst> __forceinline GSVector4 insert(const GSVector4& v) const
template<int src, int dst> __forceinline GSVector4 insert32(const GSVector4& v) const
{
// TODO: use blendps when src == dst
@ -2918,7 +2970,7 @@ public:
return *this;
}
template<int i> __forceinline int extract() const
template<int i> __forceinline int extract32() const
{
#if _M_SSE >= 0x401
@ -3273,7 +3325,15 @@ public:
__forceinline GSVector8i(__m128i m0, __m128i m1)
{
this->m = zero().insert<0>(m0).insert<1>(m1);
#if 0 // _MSC_VER >= 1700
this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m0), _mm256_castsi128_si256(m1), 0);
#else
*this = zero().insert<0>(m0).insert<1>(m1);
#endif
}
__forceinline GSVector8i(const GSVector8i& v)
@ -3283,20 +3343,12 @@ public:
__forceinline explicit GSVector8i(int i)
{
#if _M_SSE >= 0x501
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
#else
m = _mm256_set1_epi32(i);
#endif
*this = i;
}
__forceinline explicit GSVector8i(__m128i m)
{
#if _M_SSE >= 0x501
this->m = _mm256_broadcastsi128_si256(m);
#else
this->m = zero().insert<0>(m).insert<1>(m);
#endif
*this = m;
}
__forceinline explicit GSVector8i(__m256i m)
@ -3311,19 +3363,19 @@ public:
__forceinline void operator = (int i)
{
#if _M_SSE >= 0x501
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
#else
m = _mm256_set1_epi32(i);
#endif
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)); // m = _mm256_set1_epi32(i);
}
__forceinline void operator = (__m128i m)
{
#if _M_SSE >= 0x501
this->m = _mm256_broadcastsi128_si256(m);
#if 0 // _MSC_VER >= 1700
this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m), _mm256_castsi128_si256(m), 0);
#else
this->m = zero().insert<0>(m).insert<1>(m);
*this = zero().insert<0>(m).aa();
#endif
}
@ -3609,74 +3661,78 @@ public:
return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256()));
}
__forceinline GSVector8i i8to16() const
// cross lane! from 128-bit to full 256-bit range
__forceinline GSVector8i i8to16c() const
{
return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u8to16() const
__forceinline GSVector8i u8to16c() const
{
return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i i8to32() const
__forceinline GSVector8i i8to32c() const
{
return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u8to32() const
__forceinline GSVector8i u8to32c() const
{
return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i i8to64() const
__forceinline GSVector8i i8to64c() const
{
return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u8to64() const
__forceinline GSVector8i u8to64c() const
{
return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i i16to32() const
__forceinline GSVector8i i16to32c() const
{
return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u16to32() const
__forceinline GSVector8i u16to32c() const
{
return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i i16to64() const
__forceinline GSVector8i i16to64c() const
{
return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u16to64() const
__forceinline GSVector8i u16to64c() const
{
return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i i32to64() const
__forceinline GSVector8i i32to64c() const
{
return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m)));
}
__forceinline GSVector8i u32to64() const
__forceinline GSVector8i u32to64c() const
{
return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m)));
}
//
template<int i> __forceinline GSVector8i srl() const
{
return GSVector8i(_mm256_srli_si156(m, i));
return GSVector8i(_mm256_srli_si256(m, i));
}
template<int i> __forceinline GSVector8i srl(const GSVector8i& v)
{
return GSVector4i(_mm256_alignr_epi8(v.m, m, i));
return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
}
template<int i> __forceinline GSVector8i sll() const
@ -4065,12 +4121,14 @@ public:
template<int i> __forceinline GSVector4i extract() const
{
return GSVector4i(_mm256_extractf128_si256(m, i));
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
return GSVector4i(_mm256_extracti128_si256(m, i));
}
template<int i> __forceinline GSVector8i insert(__m128i m) const
{
return GSVector8i(_mm256_insertf128_si256(this->m, m, i));
return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
}
// TODO: gather
@ -4111,6 +4169,16 @@ public:
*/
}
__forceinline static GSVector8i load(const void* pll, const void* plh, const void* phl, const void* phh)
{
GSVector4i l = GSVector4i::load(pll, plh);
GSVector4i h = GSVector4i::load(phl, phh);
return cast(l).ac(cast(h));
// return GSVector8i(l).insert<1>(h);
}
template<bool aligned> __forceinline static GSVector8i load(const void* p)
{
return GSVector8i(aligned ? _mm256_load_si256((__m256i*)p) : _mm256_loadu_si256((__m256i*)p));
@ -4243,6 +4311,78 @@ public:
b = c.bd(d);
}
__forceinline static void sw4(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
const __m256i epi32_0f0f0f0f = _mm256_set1_epi32(0x0f0f0f0f);
GSVector8i mask(epi32_0f0f0f0f);
GSVector8i e = (b << 4).blend(a, mask);
GSVector8i f = b.blend(a >> 4, mask);
GSVector8i g = (d << 4).blend(c, mask);
GSVector8i h = d.blend(c >> 4, mask);
a = e.upl8(f);
c = e.uph8(f);
b = g.upl8(h);
d = g.uph8(h);
}
__forceinline static void sw8(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
GSVector8i e = a;
GSVector8i f = c;
a = e.upl8(b);
c = e.uph8(b);
b = f.upl8(d);
d = f.uph8(d);
}
__forceinline static void sw16(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
GSVector8i e = a;
GSVector8i f = c;
a = e.upl16(b);
c = e.uph16(b);
b = f.upl16(d);
d = f.uph16(d);
}
__forceinline static void sw32(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
GSVector8i e = a;
GSVector8i f = c;
a = e.upl32(b);
c = e.uph32(b);
b = f.upl32(d);
d = f.uph32(d);
}
__forceinline static void sw64(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
GSVector8i e = a;
GSVector8i f = c;
a = e.upl64(b);
c = e.uph64(b);
b = f.upl64(d);
d = f.uph64(d);
}
__forceinline static void sw128(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
{
GSVector8i e = a;
GSVector8i f = c;
a = e.ac(b);
c = e.bd(b);
b = f.ac(d);
d = f.bd(d);
}
__forceinline void operator += (const GSVector8i& v)
{
m = _mm256_add_epi32(m, v);
@ -4706,6 +4846,8 @@ public:
static const GSVector8 m_one;
static const GSVector8 m_x7fffffff;
static const GSVector8 m_x80000000;
static const GSVector8 m_x4b000000;
static const GSVector8 m_x4f800000;
__forceinline GSVector8()
{
@ -4723,9 +4865,7 @@ public:
__forceinline GSVector8(__m128 m0, __m128 m1)
{
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps (in vs2012 they simply changed it to vmovups, still can't keep the second xmm in a register)
#if _MSC_VER >= 1700
#if 0 // _MSC_VER >= 1700
this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
@ -4743,23 +4883,27 @@ public:
__forceinline explicit GSVector8(float f)
{
m = _mm256_set1_ps(f);
*this = f;
}
__forceinline explicit GSVector8(int i)
{
#if _M_SSE >= 0x501
m = _mm256_cvtepi32_ps(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)));
#else
GSVector4i v((int)i);
*this = GSVector4(v);
#endif
}
__forceinline explicit GSVector8(__m128 m)
{
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
#if _MSC_VER >= 1700
this->m = _mm256_castps128_ps256(m);
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
#else
this->m = zero().insert<0>(m).aa();
#endif
*this = m;
}
__forceinline explicit GSVector8(__m256 m)
@ -4785,17 +4929,22 @@ public:
__forceinline void operator = (float f)
{
#if _M_SSE >= 0x501
m = _mm256_broadcastss_ps(_mm_load_ss(&f));
#else
m = _mm256_set1_ps(f);
#endif
}
__forceinline void operator = (__m128 m)
{
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
#if 0 // _MSC_VER >= 1700
#if _MSC_VER >= 1700
this->m = _mm256_castps128_ps256(m);
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m), _mm256_castps128_ps256(m), 0x20);
#else
@ -5092,7 +5241,70 @@ public:
return _mm256_testz_ps(m, m) != 0;
}
// TODO: 32-bit insert/extract
template<int src, int dst> __forceinline GSVector8 insert32(const GSVector8& v) const
{
// TODO: use blendps when src == dst
ASSERT(src < 4 && dst < 4); // not cross lane like extract32()
switch(dst)
{
case 0:
switch(src)
{
case 0: return yyxx(v).zxzw(*this);
case 1: return yyyy(v).zxzw(*this);
case 2: return yyzz(v).zxzw(*this);
case 3: return yyww(v).zxzw(*this);
default: __assume(0);
}
break;
case 1:
switch(src)
{
case 0: return xxxx(v).xzzw(*this);
case 1: return xxyy(v).xzzw(*this);
case 2: return xxzz(v).xzzw(*this);
case 3: return xxww(v).xzzw(*this);
default: __assume(0);
}
break;
case 2:
switch(src)
{
case 0: return xyzx(wwxx(v));
case 1: return xyzx(wwyy(v));
case 2: return xyzx(wwzz(v));
case 3: return xyzx(wwww(v));
default: __assume(0);
}
break;
case 3:
switch(src)
{
case 0: return xyxz(zzxx(v));
case 1: return xyxz(zzyy(v));
case 2: return xyxz(zzzz(v));
case 3: return xyxz(zzww(v));
default: __assume(0);
}
break;
default:
__assume(0);
}
return *this;
}
template<int i> __forceinline int extract32() const
{
if(i < 4) return extract<0>().extract<i>();
else if(i < 8) return extract<1>().extract<i - 4>();
else ASSERT(0);
return 0;
}
template<int i> __forceinline GSVector8 insert(__m128 m) const
{
@ -5101,6 +5313,8 @@ public:
template<int i> __forceinline GSVector4 extract() const
{
if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
return GSVector4(_mm256_extractf128_ps(m, i));
}
@ -5114,14 +5328,44 @@ public:
return zero() == zero();
}
// TODO: load low, ss
// TODO
__forceinline static GSVector8 loadl(const void* p)
{
return GSVector8(_mm256_castps128_ps256(_mm_load_ps((float*)p)));
}
__forceinline static GSVector8 loadh(const void* p)
{
return zero().insert<1>(_mm_load_ps((float*)p));
}
__forceinline static GSVector8 loadh(const void* p, const GSVector8& v)
{
return GSVector8(_mm256_insertf128_ps(v, _mm_load_ps((float*)p), 1));
}
__forceinline static GSVector8 load(const void* pl, const void* ph)
{
return loadh(ph, loadl(pl));
}
template<bool aligned> __forceinline static GSVector8 load(const void* p)
{
return GSVector8(aligned ? _mm256_load_ps((const float*)p) : _mm256_loadu_ps((const float*)p));
}
// TODO: store low, ss
// TODO
__forceinline static void storel(void* p, const GSVector8& v)
{
_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 0));
}
__forceinline static void storeh(void* p, const GSVector8& v)
{
_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 1));
}
template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
{

View File

@ -402,6 +402,24 @@ struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_fr
return retval;
}
__forceinline long _InterlockedCompareExchange(volatile long* const Destination, const long Exchange, const long Comperand)
{
long retval = Comperand;
__asm__("lock; cmpxchgl %k[Exchange], %[Destination]" : [retval] "+a" (retval) : [Destination] "m" (*Destination), [Exchange] "q" (Exchange): "memory");
return retval;
}
__forceinline long _InterlockedExchange(volatile long* const Target, const long Value)
{
long retval = Value;
__asm__("xchgl %[retval], %[Target]" : [retval] "+r" (retval) : [Target] "m" (*Target) : "memory");
return retval;
}
__forceinline long _InterlockedExchangeAdd(volatile long* const Addend, const long Value)
{
long retval = Value;