mirror of https://github.com/PCSX2/pcsx2.git
GSdx: More avx2 code to read/write different block formats, the GSBenchmark function shows nice improvements, but no games run faster. I just upload the changes before messing with the drawing part.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5675 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
f4ce6d6fce
commit
8b9f5b5bc2
|
@ -71,4 +71,6 @@ public:
|
|||
void DrawRect(const GSVector4i& r, const GSVertexSW& v);
|
||||
|
||||
#endif
|
||||
|
||||
void PrintStats() {m_ds_map.PrintStats();}
|
||||
};
|
||||
|
|
|
@ -120,7 +120,7 @@ protected:
|
|||
void GrowVertexBuffer()
|
||||
{
|
||||
int maxcount = std::max<int>(m_maxcount * 3 / 2, 10000);
|
||||
Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 16);
|
||||
Vertex* vertices = (Vertex*)_aligned_malloc(sizeof(Vertex) * maxcount, 32);
|
||||
|
||||
if(m_vertices != NULL)
|
||||
{
|
||||
|
|
|
@ -27,7 +27,7 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
|
|||
: GPURendererT<GSVertexSW>(dev)
|
||||
, m_texture(NULL)
|
||||
{
|
||||
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
|
||||
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 32);
|
||||
|
||||
m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads, &m_perfmon);
|
||||
}
|
||||
|
@ -121,7 +121,7 @@ void GPURendererSW::Draw()
|
|||
data->scissor.right = min((int)(m_env.DRAREABR.X + 1) << m_scale.x, m_mem.GetWidth());
|
||||
data->scissor.bottom = min((int)(m_env.DRAREABR.Y + 1) << m_scale.y, m_mem.GetHeight());
|
||||
|
||||
data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16);
|
||||
data->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 32);
|
||||
data->vertex = (GSVertexSW*)data->buff;
|
||||
data->vertex_count = m_count;
|
||||
|
||||
|
|
|
@ -747,7 +747,7 @@ GPUState::Buffer::Buffer()
|
|||
{
|
||||
bytes = 0;
|
||||
maxbytes = 4096;
|
||||
buff = (uint8*)_aligned_malloc(maxbytes, 16);
|
||||
buff = (uint8*)_aligned_malloc(maxbytes, 32);
|
||||
cur = 0;
|
||||
}
|
||||
|
||||
|
@ -761,9 +761,9 @@ void GPUState::Buffer::Reserve(int size)
|
|||
if(size > maxbytes)
|
||||
{
|
||||
int new_maxbytes = (maxbytes + size + 1023) & ~1023;
|
||||
uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 16);
|
||||
uint8* new_buff = (uint8*)_aligned_malloc(new_maxbytes, 32);
|
||||
|
||||
if(buff != NULL)
|
||||
if(buff != NULL)
|
||||
{
|
||||
memcpy(new_buff, buff, maxbytes);
|
||||
_aligned_free(buff);
|
||||
|
|
|
@ -23,9 +23,7 @@
|
|||
#include "GSBlock.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
const GSVector8i GSBlock::m_r16mask(
|
||||
0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
|
||||
const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
|
||||
#else
|
||||
const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
|
||||
#endif
|
||||
|
@ -44,7 +42,7 @@ const GSVector4i GSBlock::m_xgxx(0x000003e0);
|
|||
const GSVector4i GSBlock::m_rxxx(0x0000001f);
|
||||
#endif
|
||||
|
||||
const GSVector4i GSBlock::m_uw8hmask0 = GSVector4i(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
|
||||
const GSVector4i GSBlock::m_uw8hmask1 = GSVector4i(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
|
||||
const GSVector4i GSBlock::m_uw8hmask2 = GSVector4i(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
|
||||
const GSVector4i GSBlock::m_uw8hmask3 = GSVector4i(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
|
||||
const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
|
||||
const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
|
||||
const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
|
||||
const GSVector4i GSBlock::m_uw8hmask3(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -177,7 +177,7 @@ public:
|
|||
: CBaseFilter(NAME("GSSource"), pUnk, this, __uuidof(this), &hr)
|
||||
, m_output(NULL)
|
||||
, m_size(w, h)
|
||||
, m_atpf(10000000i64 / fps)
|
||||
, m_atpf((REFERENCE_TIME)(10000000.0f / fps))
|
||||
, m_now(0)
|
||||
{
|
||||
m_output = new GSSourceOutputPin(m_size, m_atpf, this, this, hr, colorspace);
|
||||
|
|
|
@ -85,4 +85,6 @@ public:
|
|||
void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);
|
||||
|
||||
#endif
|
||||
|
||||
void PrintStats() {m_ds_map.PrintStats();}
|
||||
};
|
||||
|
|
|
@ -44,8 +44,9 @@ public:
|
|||
struct
|
||||
{
|
||||
GSVector4 in;
|
||||
GSVector4i ex;
|
||||
GSVector4 ofex;
|
||||
uint32 ofxy;
|
||||
GSVector4i ofxy;
|
||||
} scissor;
|
||||
|
||||
struct
|
||||
|
@ -83,6 +84,11 @@ public:
|
|||
|
||||
void UpdateScissor()
|
||||
{
|
||||
scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4);
|
||||
scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4);
|
||||
scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4);
|
||||
scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4);
|
||||
|
||||
scissor.ofex = GSVector4(
|
||||
(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
|
||||
(int)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY),
|
||||
|
@ -95,10 +101,11 @@ public:
|
|||
(int)SCISSOR.SCAX1 + 1,
|
||||
(int)SCISSOR.SCAY1 + 1);
|
||||
|
||||
uint16 ofx = (uint16)XYOFFSET.OFX - 15;
|
||||
uint16 ofy = (uint16)XYOFFSET.OFY - 15;
|
||||
uint16 ofx = (uint16)XYOFFSET.OFX;
|
||||
uint16 ofy = (uint16)XYOFFSET.OFY;
|
||||
|
||||
scissor.ofxy = ((ofy << 16) | ofx); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
|
||||
scissor.ofxy.u32[0] = (ofy << 16) | ofx;
|
||||
scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
|
||||
}
|
||||
|
||||
bool DepthRead() const
|
||||
|
|
|
@ -678,7 +678,7 @@ vector<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
|
|||
|
||||
////////////////////
|
||||
|
||||
template<int psm, int bsx, int bsy, bool aligned>
|
||||
template<int psm, int bsx, int bsy, int alignment>
|
||||
void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
|
||||
{
|
||||
uint32 bp = BITBLTBUF.DBP;
|
||||
|
@ -692,14 +692,14 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
|
|||
{
|
||||
switch(psm)
|
||||
{
|
||||
case PSM_PSMCT32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMCT16: WriteColumn16<aligned>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT16S: WriteColumn16<aligned>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMT8: WriteColumn8<aligned>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
|
||||
case PSM_PSMT4: WriteColumn4<aligned>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
|
||||
case PSM_PSMZ32: WriteColumn32<aligned, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMZ16: WriteColumn16<aligned>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMZ16S: WriteColumn16<aligned>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMCT16: WriteColumn16<alignment>(y, BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT16S: WriteColumn16<alignment>(y, BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMT8: WriteColumn8<alignment>(y, BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
|
||||
case PSM_PSMT4: WriteColumn4<alignment>(y, BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
|
||||
case PSM_PSMZ32: WriteColumn32<alignment, 0xffffffff>(y, BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMZ16: WriteColumn16<alignment>(y, BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMZ16S: WriteColumn16<alignment>(y, BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
// TODO
|
||||
default: __assume(0);
|
||||
}
|
||||
|
@ -707,7 +707,7 @@ void GSLocalMemory::WriteImageColumn(int l, int r, int y, int h, const uint8* sr
|
|||
}
|
||||
}
|
||||
|
||||
template<int psm, int bsx, int bsy, bool aligned>
|
||||
template<int psm, int bsx, int bsy, int alignment>
|
||||
void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
|
||||
{
|
||||
uint32 bp = BITBLTBUF.DBP;
|
||||
|
@ -719,14 +719,14 @@ void GSLocalMemory::WriteImageBlock(int l, int r, int y, int h, const uint8* src
|
|||
{
|
||||
switch(psm)
|
||||
{
|
||||
case PSM_PSMCT32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMCT16: WriteBlock16<aligned>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT16S: WriteBlock16<aligned>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMT8: WriteBlock8<aligned>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
|
||||
case PSM_PSMT4: WriteBlock4<aligned>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
|
||||
case PSM_PSMZ32: WriteBlock32<aligned, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMZ16: WriteBlock16<aligned>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMZ16S: WriteBlock16<aligned>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMCT16: WriteBlock16<alignment>(BlockPtr16(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMCT16S: WriteBlock16<alignment>(BlockPtr16S(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMT8: WriteBlock8<alignment>(BlockPtr8(x, y, bp, bw), &src[x], srcpitch); break;
|
||||
case PSM_PSMT4: WriteBlock4<alignment>(BlockPtr4(x, y, bp, bw), &src[x >> 1], srcpitch); break;
|
||||
case PSM_PSMZ32: WriteBlock32<alignment, 0xffffffff>(BlockPtr32Z(x, y, bp, bw), &src[x * 4], srcpitch); break;
|
||||
case PSM_PSMZ16: WriteBlock16<alignment>(BlockPtr16Z(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
case PSM_PSMZ16S: WriteBlock16<alignment>(BlockPtr16SZ(x, y, bp, bw), &src[x * 2], srcpitch); break;
|
||||
// TODO
|
||||
default: __assume(0);
|
||||
}
|
||||
|
@ -803,7 +803,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
|
|||
case PSM_PSMZ32:
|
||||
ReadColumn32(y, dst, buff, 32);
|
||||
memcpy(&buff[32], &src[x * 4], 32);
|
||||
WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
|
||||
WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
|
||||
break;
|
||||
case PSM_PSMCT16:
|
||||
case PSM_PSMCT16S:
|
||||
|
@ -811,17 +811,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
|
|||
case PSM_PSMZ16S:
|
||||
ReadColumn16(y, dst, buff, 32);
|
||||
memcpy(&buff[32], &src[x * 2], 32);
|
||||
WriteColumn16<true>(y, dst, buff, 32);
|
||||
WriteColumn16<32>(y, dst, buff, 32);
|
||||
break;
|
||||
case PSM_PSMT8:
|
||||
ReadColumn8(y, dst, buff, 16);
|
||||
for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + x], 16);
|
||||
WriteColumn8<true>(y, dst, buff, 16);
|
||||
WriteColumn8<32>(y, dst, buff, 16);
|
||||
break;
|
||||
case PSM_PSMT4:
|
||||
ReadColumn4(y, dst, buff, 16);
|
||||
for(int i = 0, j = y2; i < h2; i++, j++) memcpy(&buff[j * 16], &src[i * srcpitch + (x >> 1)], 16);
|
||||
WriteColumn4<true>(y, dst, buff, 16);
|
||||
WriteColumn4<32>(y, dst, buff, 16);
|
||||
break;
|
||||
// TODO
|
||||
default:
|
||||
|
@ -841,13 +841,19 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
|
|||
|
||||
if(h2 > 0)
|
||||
{
|
||||
if(((size_t)&src[l * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
|
||||
size_t addr = (size_t)&src[l * trbpp >> 3];
|
||||
|
||||
if((addr & 31) == 0 && (srcpitch & 31) == 0)
|
||||
{
|
||||
WriteImageColumn<psm, bsx, bsy, true>(l, r, y, h2, src, srcpitch, BITBLTBUF);
|
||||
WriteImageColumn<psm, bsx, bsy, 32>(l, r, y, h2, src, srcpitch, BITBLTBUF);
|
||||
}
|
||||
else if((addr & 15) == 0 && (srcpitch & 15) == 0)
|
||||
{
|
||||
WriteImageColumn<psm, bsx, bsy, 16>(l, r, y, h2, src, srcpitch, BITBLTBUF);
|
||||
}
|
||||
else
|
||||
{
|
||||
WriteImageColumn<psm, bsx, bsy, false>(l, r, y, h2, src, srcpitch, BITBLTBUF);
|
||||
WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
|
||||
}
|
||||
|
||||
src += srcpitch * h2;
|
||||
|
@ -884,7 +890,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
|
|||
case PSM_PSMZ32:
|
||||
ReadColumn32(y, dst, buff, 32);
|
||||
memcpy(&buff[0], &src[x * 4], 32);
|
||||
WriteColumn32<true, 0xffffffff>(y, dst, buff, 32);
|
||||
WriteColumn32<32, 0xffffffff>(y, dst, buff, 32);
|
||||
break;
|
||||
case PSM_PSMCT16:
|
||||
case PSM_PSMCT16S:
|
||||
|
@ -892,17 +898,17 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8*
|
|||
case PSM_PSMZ16S:
|
||||
ReadColumn16(y, dst, buff, 32);
|
||||
memcpy(&buff[0], &src[x * 2], 32);
|
||||
WriteColumn16<true>(y, dst, buff, 32);
|
||||
WriteColumn16<32>(y, dst, buff, 32);
|
||||
break;
|
||||
case PSM_PSMT8:
|
||||
ReadColumn8(y, dst, buff, 16);
|
||||
for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + x], 16);
|
||||
WriteColumn8<true>(y, dst, buff, 16);
|
||||
WriteColumn8<32>(y, dst, buff, 16);
|
||||
break;
|
||||
case PSM_PSMT4:
|
||||
ReadColumn4(y, dst, buff, 16);
|
||||
for(int i = 0; i < h; i++) memcpy(&buff[i * 16], &src[i * srcpitch + (x >> 1)], 16);
|
||||
WriteColumn4<true>(y, dst, buff, 16);
|
||||
WriteColumn4<32>(y, dst, buff, 16);
|
||||
break;
|
||||
// TODO
|
||||
default:
|
||||
|
@ -982,13 +988,19 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const uint8* src, int len, GIFR
|
|||
|
||||
if(h2 > 0)
|
||||
{
|
||||
if(((size_t)&s[la * trbpp >> 3] & 15) == 0 && (srcpitch & 15) == 0)
|
||||
size_t addr = (size_t)&s[la * trbpp >> 3];
|
||||
|
||||
if((addr & 31) == 0 && (srcpitch & 31) == 0)
|
||||
{
|
||||
WriteImageBlock<psm, bsx, bsy, true>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
|
||||
WriteImageBlock<psm, bsx, bsy, 32>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
|
||||
}
|
||||
else if((addr & 15) == 0 && (srcpitch & 15) == 0)
|
||||
{
|
||||
WriteImageBlock<psm, bsx, bsy, 16>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
|
||||
}
|
||||
else
|
||||
{
|
||||
WriteImageBlock<psm, bsx, bsy, false>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
|
||||
WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
|
||||
}
|
||||
|
||||
s += srcpitch * h2;
|
||||
|
@ -1385,6 +1397,8 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
|
|||
int sx = (int)TRXPOS.SSAX;
|
||||
int ex = sx + (int)TRXREG.RRW;
|
||||
|
||||
// printf("spsm=%d x=%d ex=%d y=%d len=%d\n", BITBLTBUF.SPSM, x, ex, y, len);
|
||||
|
||||
switch(BITBLTBUF.SPSM)
|
||||
{
|
||||
case PSM_PSMCT32:
|
||||
|
@ -1399,12 +1413,21 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB
|
|||
int* RESTRICT offset = psm->rowOffset[y & 7];
|
||||
uint32* RESTRICT ps = &m_vm32[psm->pa(0, y, bp, bw)];
|
||||
|
||||
for(int ex4 = ex - 4; len >= 4 && x <= ex4; len -= 4, x += 4, pd += 4)
|
||||
for(; len > 0 && x < ex && (x & 7); len--, x++, pd++)
|
||||
{
|
||||
pd[0] = ps[offset[x + 0]];
|
||||
pd[1] = ps[offset[x + 1]];
|
||||
pd[2] = ps[offset[x + 2]];
|
||||
pd[3] = ps[offset[x + 3]];
|
||||
*pd = ps[offset[x]];
|
||||
}
|
||||
|
||||
// aligned to a column
|
||||
|
||||
for(int ex8 = ex - 8; len >= 8 && x <= ex8; len -= 8, x += 8, pd += 8)
|
||||
{
|
||||
int o = offset[x];
|
||||
|
||||
GSVector4i::store<false>(&pd[0], GSVector4i::load(&ps[o + 0], &ps[o + 4]));
|
||||
GSVector4i::store<false>(&pd[4], GSVector4i::load(&ps[o + 8], &ps[o + 12]));
|
||||
|
||||
for(int i = 0; i < 8; i++) ASSERT(pd[i] == ps[offset[x + i]]);
|
||||
}
|
||||
|
||||
for(; len > 0 && x < ex; len--, x++, pd++)
|
||||
|
|
|
@ -845,10 +845,10 @@ public:
|
|||
|
||||
//
|
||||
|
||||
template<int psm, int bsx, int bsy, bool aligned>
|
||||
template<int psm, int bsx, int bsy, int alignment>
|
||||
void WriteImageColumn(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
|
||||
|
||||
template<int psm, int bsx, int bsy, bool aligned>
|
||||
template<int psm, int bsx, int bsy, int alignment>
|
||||
void WriteImageBlock(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF);
|
||||
|
||||
template<int psm, int bsx, int bsy>
|
||||
|
|
|
@ -467,7 +467,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
|
|||
{
|
||||
edge = vertex[i[1 - m2]];
|
||||
|
||||
edge.p = edge.p.insert<0, 1>(vertex[i[m2]].p);
|
||||
edge.p = edge.p.insert32<0, 1>(vertex[i[m2]].p);
|
||||
dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
|
||||
|
||||
DrawTriangleSection(tb.x, tb.w, edge, dedge, dscan, vertex[i[1 - m2]].p);
|
||||
|
@ -489,7 +489,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
|
|||
{
|
||||
edge = v1;
|
||||
|
||||
edge.p = v0.p.xxxx().addm(ddx[m2], dv[0].p.yyyy()).xyzw(edge.p);
|
||||
edge.p = (v0.p.xxxx() + ddx[m2] * dv[0].p.yyyy()).xyzw(edge.p);
|
||||
dedge.p = ddx[2 - (m2 << 1)].yzzw(dedge.p);
|
||||
|
||||
DrawTriangleSection(tb.y, tb.w, edge, dedge, dscan, v1.p);
|
||||
|
@ -532,7 +532,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
|
|||
|
||||
GSVertexSW scan;
|
||||
|
||||
scan.p = edge.p.addm(dedge.p, dy);
|
||||
scan.p = edge.p + dedge.p * dy;
|
||||
|
||||
GSVector4 lrf = scan.p.ceil();
|
||||
GSVector4 l = lrf.max(scissor);
|
||||
|
@ -546,16 +546,18 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
|
|||
|
||||
if(pixels > 0)
|
||||
{
|
||||
scan.t = edge.t.addm(dedge.t, dy);
|
||||
scan.c = edge.c.addm(dedge.c, dy);
|
||||
scan.t = edge.t + dedge.t * dy;
|
||||
scan.c = edge.c + dedge.c * dy;
|
||||
|
||||
GSVector4 prestep = (l - p0).xxxx();
|
||||
|
||||
scan.p = scan.p.addm(dscan.p, prestep);
|
||||
scan.t = scan.t.addm(dscan.t, prestep);
|
||||
scan.c = scan.c.addm(dscan.c, prestep);
|
||||
scan.p = scan.p + dscan.p * prestep;
|
||||
scan.t = scan.t + dscan.t * prestep;
|
||||
scan.c = scan.c + dscan.c * prestep;
|
||||
|
||||
AddScanline(e++, pixels, left, top, scan);
|
||||
|
||||
//m_pixels += pixels; m_ds->DrawScanline(pixels, left, top, scan);
|
||||
}
|
||||
|
||||
top++;
|
||||
|
@ -629,8 +631,8 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertex, const uint32* index)
|
|||
GSVertexSW dedge;
|
||||
GSVertexSW dscan;
|
||||
|
||||
dedge.t = GSVector4::zero().insert<1, 1>(dt);
|
||||
dscan.t = GSVector4::zero().insert<0, 0>(dt);
|
||||
dedge.t = GSVector4::zero().insert32<1, 1>(dt);
|
||||
dscan.t = GSVector4::zero().insert32<0, 0>(dt);
|
||||
|
||||
GSVector4 prestep = GSVector4(r.left, r.top) - scan.p;
|
||||
|
||||
|
@ -851,9 +853,9 @@ void GSRasterizer::AddScanline(GSVertexSW* e, int pixels, int left, int top, con
|
|||
{
|
||||
*e = scan;
|
||||
|
||||
e->p.i16[0] = (int16)pixels;
|
||||
e->p.i16[1] = (int16)left;
|
||||
e->p.i16[2] = (int16)top;
|
||||
e->_pad.i32[0] = pixels;
|
||||
e->_pad.i32[1] = left;
|
||||
e->_pad.i32[2] = top;
|
||||
}
|
||||
|
||||
void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GSVertexSW& dscan, bool edge)
|
||||
|
@ -873,9 +875,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
|
|||
{
|
||||
do
|
||||
{
|
||||
int pixels = e->p.i16[0];
|
||||
int left = e->p.i16[1];
|
||||
int top = e->p.i16[2];
|
||||
int pixels = e->_pad.i32[0];
|
||||
int left = e->_pad.i32[1];
|
||||
int top = e->_pad.i32[2];
|
||||
|
||||
m_pixels += pixels;
|
||||
|
||||
|
@ -887,9 +889,9 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
|
|||
{
|
||||
do
|
||||
{
|
||||
int pixels = e->p.i16[0];
|
||||
int left = e->p.i16[1];
|
||||
int top = e->p.i16[2];
|
||||
int pixels = e->_pad.i32[0];
|
||||
int left = e->_pad.i32[1];
|
||||
int top = e->_pad.i32[2];
|
||||
|
||||
m_pixels += pixels;
|
||||
|
||||
|
|
|
@ -104,6 +104,8 @@ public:
|
|||
|
||||
#endif
|
||||
|
||||
virtual void PrintStats() = 0;
|
||||
|
||||
__forceinline bool HasEdge() const {return m_de != NULL;}
|
||||
__forceinline bool IsSolidRect() const {return m_dr != NULL;}
|
||||
};
|
||||
|
@ -117,6 +119,7 @@ public:
|
|||
virtual void Sync() = 0;
|
||||
virtual bool IsSynced() const = 0;
|
||||
virtual int GetPixels(bool reset = true) = 0;
|
||||
virtual void PrintStats() = 0;
|
||||
};
|
||||
|
||||
__aligned(class, 32) GSRasterizer : public IRasterizer
|
||||
|
@ -164,10 +167,10 @@ public:
|
|||
void Sync() {}
|
||||
bool IsSynced() const {return true;}
|
||||
int GetPixels(bool reset);
|
||||
void PrintStats() {m_ds->PrintStats();}
|
||||
};
|
||||
|
||||
class GSRasterizerList
|
||||
: public IRasterizer
|
||||
class GSRasterizerList : public IRasterizer
|
||||
{
|
||||
protected:
|
||||
class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
|
||||
|
@ -221,4 +224,5 @@ public:
|
|||
void Sync();
|
||||
bool IsSynced() const;
|
||||
int GetPixels(bool reset);
|
||||
void PrintStats() {}
|
||||
};
|
||||
|
|
|
@ -28,6 +28,10 @@ static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
|
|||
|
||||
const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
const GSVector8 g_pos_scale2(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
|
||||
#endif
|
||||
|
||||
GSRendererSW::GSRendererSW(int threads)
|
||||
: m_fzb(NULL)
|
||||
{
|
||||
|
@ -210,7 +214,7 @@ void GSRendererSW::VSync(int field)
|
|||
|
||||
m_tc->IncAge();
|
||||
|
||||
// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
|
||||
// if((m_perfmon.GetFrame() & 255) == 0) m_rl->PrintStats();
|
||||
}
|
||||
|
||||
void GSRendererSW::ResetDevice()
|
||||
|
@ -263,18 +267,80 @@ GSTexture* GSRendererSW::GetOutput(int i)
|
|||
template<uint32 primclass, uint32 tme, uint32 fst>
|
||||
void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count)
|
||||
{
|
||||
size_t i = m_vertex.next;
|
||||
#if 0//_M_SSE >= 0x501
|
||||
|
||||
// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
|
||||
|
||||
GSVector8i o2((GSVector4i)m_context->XYOFFSET);
|
||||
GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
|
||||
|
||||
for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
|
||||
{
|
||||
GSVector8i v0 = GSVector8i::load<true>(src[0].m);
|
||||
GSVector8i v1 = GSVector8i::load<true>(src[1].m);
|
||||
|
||||
GSVector8 stcq = GSVector8::cast(v0.ac(v1));
|
||||
GSVector8i xyzuvf = v0.bd(v1);
|
||||
|
||||
//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
|
||||
//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
|
||||
|
||||
GSVector8i xy = xyzuvf.upl16() - o2;
|
||||
GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
|
||||
|
||||
GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * g_pos_scale2;
|
||||
GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
|
||||
|
||||
GSVector8 t = GSVector8::zero();
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(fst)
|
||||
{
|
||||
t = GSVector8(xyzuvf.uph16() << (16 - 4));
|
||||
}
|
||||
else
|
||||
{
|
||||
t = stcq.xyww() * tsize2;
|
||||
}
|
||||
}
|
||||
|
||||
if(primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
|
||||
}
|
||||
/*
|
||||
if(tme || primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
GSVector8::store<true>(&dst[0].p, p.ac(t));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector8::storel(&dst[0].p, p);
|
||||
}
|
||||
*/
|
||||
GSVector8::store<true>(&dst[0].p, p.ac(t));
|
||||
GSVector8::store<true>(&dst[0].c, c.a_());
|
||||
/*
|
||||
if(tme || primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
GSVector8::store<true>(&dst[1].p, p.bd(t));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector8::storeh(&dst[1].p, p);
|
||||
}
|
||||
*/
|
||||
GSVector8::store<true>(&dst[1].p, p.bd(t));
|
||||
GSVector8::store<true>(&dst[1].c, c.b_());
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
GSVector4i o = (GSVector4i)m_context->XYOFFSET;
|
||||
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
// TODO: process vertices in pairs, when AVX2 becomes available
|
||||
|
||||
#endif
|
||||
|
||||
for(; i > 0; i--, src++, dst++)
|
||||
for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++)
|
||||
{
|
||||
GSVector4 stcq = GSVector4::load<true>(&src->m[0]); // s t rgba q
|
||||
|
||||
|
@ -297,7 +363,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
|
||||
dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
|
||||
|
||||
GSVector4 t;
|
||||
GSVector4 t = GSVector4::zero();
|
||||
|
||||
if(tme)
|
||||
{
|
||||
|
@ -323,17 +389,25 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
{
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
t = t.insert<1, 3>(GSVector4::cast(xyzuvf));
|
||||
|
||||
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
|
||||
|
||||
#else
|
||||
|
||||
t = t.insert<0, 3>(GSVector4::cast(GSVector4i::load(z)));
|
||||
|
||||
t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z)));
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
dst->t = t;
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
dst->_pad = GSVector4::zero();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSRendererSW::Draw()
|
||||
|
@ -345,10 +419,10 @@ void GSRendererSW::Draw()
|
|||
shared_ptr<GSRasterizerData> data(sd);
|
||||
|
||||
sd->primclass = m_vt.m_primclass;
|
||||
sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32);
|
||||
sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1) + sizeof(uint32) * m_index.tail, 32);
|
||||
sd->vertex = (GSVertexSW*)sd->buff;
|
||||
sd->vertex_count = m_vertex.next;
|
||||
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next);
|
||||
sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * ((m_vertex.next + 1) & ~1));
|
||||
sd->index_count = m_index.tail;
|
||||
|
||||
(this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next);
|
||||
|
@ -631,6 +705,20 @@ void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS
|
|||
}
|
||||
}
|
||||
|
||||
__forceinline void Increment16(volatile short* lpAddend)
|
||||
{
|
||||
// (*lpAddend)++;
|
||||
|
||||
_InterlockedIncrement16(lpAddend);
|
||||
}
|
||||
|
||||
__forceinline void Decrement16(volatile short* lpAddend)
|
||||
{
|
||||
// (*lpAddend)--;
|
||||
|
||||
_InterlockedDecrement16(lpAddend);
|
||||
}
|
||||
|
||||
void GSRendererSW::UsePages(const uint32* pages, int type)
|
||||
{
|
||||
if(type < 2)
|
||||
|
@ -639,7 +727,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
|
|||
{
|
||||
ASSERT(((short*)&m_fzb_pages[*p])[type] < SHRT_MAX);
|
||||
|
||||
_InterlockedIncrement16((short*)&m_fzb_pages[*p] + type);
|
||||
Increment16((short*)&m_fzb_pages[*p] + type);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -648,7 +736,7 @@ void GSRendererSW::UsePages(const uint32* pages, int type)
|
|||
{
|
||||
ASSERT(m_tex_pages[*p] < SHRT_MAX);
|
||||
|
||||
_InterlockedIncrement16((short*)&m_tex_pages[*p]); // remember which texture pages are used
|
||||
Increment16((short*)&m_tex_pages[*p]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -661,7 +749,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
|
|||
{
|
||||
ASSERT(((short*)&m_fzb_pages[*p])[type] > 0);
|
||||
|
||||
_InterlockedDecrement16((short*)&m_fzb_pages[*p] + type);
|
||||
Decrement16((short*)&m_fzb_pages[*p] + type);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -670,7 +758,7 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type)
|
|||
{
|
||||
ASSERT(m_tex_pages[*p] > 0);
|
||||
|
||||
_InterlockedDecrement16((short*)&m_tex_pages[*p]);
|
||||
Decrement16((short*)&m_tex_pages[*p]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1390,23 +1478,29 @@ GSRendererSW::SharedData::~SharedData()
|
|||
fflush(s_fp);}
|
||||
}
|
||||
|
||||
static TransactionScope::Lock s_lock;
|
||||
|
||||
void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm)
|
||||
{
|
||||
if(m_using_pages) return;
|
||||
|
||||
if(global.sel.fb)
|
||||
{
|
||||
m_parent->UsePages(fb_pages, 0);
|
||||
}
|
||||
//TransactionScope scope(s_lock);
|
||||
|
||||
if(global.sel.zb)
|
||||
{
|
||||
m_parent->UsePages(zb_pages, 1);
|
||||
}
|
||||
if(global.sel.fb)
|
||||
{
|
||||
m_parent->UsePages(fb_pages, 0);
|
||||
}
|
||||
|
||||
for(size_t i = 0; m_tex[i].t != NULL; i++)
|
||||
{
|
||||
m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
|
||||
if(global.sel.zb)
|
||||
{
|
||||
m_parent->UsePages(zb_pages, 1);
|
||||
}
|
||||
|
||||
for(size_t i = 0; m_tex[i].t != NULL; i++)
|
||||
{
|
||||
m_parent->UsePages(m_tex[i].t->m_pages.n, 2);
|
||||
}
|
||||
}
|
||||
|
||||
m_fb_pages = fb_pages;
|
||||
|
@ -1421,19 +1515,23 @@ void GSRendererSW::SharedData::ReleasePages()
|
|||
{
|
||||
if(!m_using_pages) return;
|
||||
|
||||
if(global.sel.fb)
|
||||
{
|
||||
m_parent->ReleasePages(m_fb_pages, 0);
|
||||
}
|
||||
//TransactionScope scope(s_lock);
|
||||
|
||||
if(global.sel.zb)
|
||||
{
|
||||
m_parent->ReleasePages(m_zb_pages, 1);
|
||||
}
|
||||
if(global.sel.fb)
|
||||
{
|
||||
m_parent->ReleasePages(m_fb_pages, 0);
|
||||
}
|
||||
|
||||
for(size_t i = 0; m_tex[i].t != NULL; i++)
|
||||
{
|
||||
m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
|
||||
if(global.sel.zb)
|
||||
{
|
||||
m_parent->ReleasePages(m_zb_pages, 1);
|
||||
}
|
||||
|
||||
for(size_t i = 0; m_tex[i].t != NULL; i++)
|
||||
{
|
||||
m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2);
|
||||
}
|
||||
}
|
||||
|
||||
delete [] m_fb_pages;
|
||||
|
|
|
@ -516,7 +516,7 @@ void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
|
|||
*/
|
||||
GSVector4i xy = GSVector4i::loadl(&r->u64[0]);
|
||||
GSVector4i zf = GSVector4i::loadl(&r->u64[1]);
|
||||
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
|
||||
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
|
||||
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
|
||||
|
||||
m_v.m[1] = xy.upl32(zf);
|
||||
|
@ -567,14 +567,19 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, ui
|
|||
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
|
||||
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
|
||||
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
|
||||
|
||||
/*
|
||||
GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
|
||||
GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
|
||||
GSVector4i rbga = rg.upl8(ba);
|
||||
GSVector4i rgba = rbga.upl8(rbga.zzzz());
|
||||
*/
|
||||
q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
|
||||
|
||||
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
|
||||
|
||||
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
|
||||
GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
|
||||
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
|
||||
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
|
||||
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
|
||||
|
||||
m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
|
||||
|
@ -599,7 +604,12 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uin
|
|||
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
|
||||
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
|
||||
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
|
||||
|
||||
/*
|
||||
GSVector4i rg = GSVector4i::loadl(&r[1].u64[0]);
|
||||
GSVector4i ba = GSVector4i::loadl(&r[1].u64[1]);
|
||||
GSVector4i rbga = rg.upl8(ba);
|
||||
GSVector4i rgba = rbga.upl8(rbga.zzzz());
|
||||
*/
|
||||
q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ
|
||||
|
||||
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
|
||||
|
@ -719,7 +729,7 @@ void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)
|
|||
|
||||
GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
|
||||
GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
|
||||
GSVector4i uvf = GSVector4i::loadl(&m_v.UV).upl32(xyzf.srl32(24).srl<4>());
|
||||
GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
|
||||
|
||||
m_v.m[1] = xyz.upl64(uvf);
|
||||
|
||||
|
@ -2258,7 +2268,7 @@ void GSState::UpdateContext()
|
|||
|
||||
void GSState::UpdateScissor()
|
||||
{
|
||||
m_scissor = m_context->scissor.ofex;
|
||||
m_scissor = m_context->scissor.ex;
|
||||
m_ofxy = m_context->scissor.ofxy;
|
||||
}
|
||||
|
||||
|
@ -2286,8 +2296,8 @@ void GSState::GrowVertexBuffer()
|
|||
{
|
||||
int maxcount = std::max<int>(m_vertex.maxcount * 3 / 2, 10000);
|
||||
|
||||
GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 16);
|
||||
uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 16); // worst case is slightly less than vertex number * 3
|
||||
GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 32);
|
||||
uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 32); // worst case is slightly less than vertex number * 3
|
||||
|
||||
if(m_vertex.buff != NULL)
|
||||
{
|
||||
|
@ -2328,7 +2338,13 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
|||
tailptr[0] = v0;
|
||||
tailptr[1] = v1;
|
||||
|
||||
m_vertex.xy[xy_tail & 3] = GSVector4(v1.upl32(v1.sub16(GSVector4i::load(m_ofxy)).sra16(4)).upl16()); // zw not sign extended, only useful for eq tests
|
||||
GSVector4i xy = v1.xxxx().sub16(m_ofxy);
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4)));
|
||||
#else
|
||||
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy()));
|
||||
#endif
|
||||
|
||||
m_vertex.tail = ++tail;
|
||||
m_vertex.xy_tail = ++xy_tail;
|
||||
|
@ -2356,14 +2372,14 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
|||
|
||||
if(skip == 0 && (prim != GS_TRIANGLEFAN || m <= 4)) // m_vertex.xy only knows about the last 4 vertices, head could be far behind for fan
|
||||
{
|
||||
GSVector4 v0, v1, v2, v3;
|
||||
GSVector4i v0, v1, v2, v3, pmin, pmax;
|
||||
|
||||
v0 = m_vertex.xy[(xy_tail + 1) & 3]; // T-3
|
||||
v1 = m_vertex.xy[(xy_tail + 2) & 3]; // T-2
|
||||
v2 = m_vertex.xy[(xy_tail + 3) & 3]; // T-1
|
||||
v3 = m_vertex.xy[(xy_tail - m) & 3]; // H
|
||||
v0 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 1) & 3]); // T-3
|
||||
v1 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 2) & 3]); // T-2
|
||||
v2 = GSVector4i::loadl(&m_vertex.xy[(xy_tail + 3) & 3]); // T-1
|
||||
v3 = GSVector4i::loadl(&m_vertex.xy[(xy_tail - m) & 3]); // H
|
||||
|
||||
GSVector4 pmin, pmax, cross;
|
||||
GSVector4 cross;
|
||||
|
||||
switch(prim)
|
||||
{
|
||||
|
@ -2374,21 +2390,21 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
|||
case GS_LINELIST:
|
||||
case GS_LINESTRIP:
|
||||
case GS_SPRITE:
|
||||
pmin = v2.min(v1);
|
||||
pmax = v2.max(v1);
|
||||
pmin = v2.min_i16(v1);
|
||||
pmax = v2.max_i16(v1);
|
||||
break;
|
||||
case GS_TRIANGLELIST:
|
||||
case GS_TRIANGLESTRIP:
|
||||
pmin = v2.min(v1.min(v0));
|
||||
pmax = v2.max(v1.max(v0));
|
||||
pmin = v2.min_i16(v1.min_i16(v0));
|
||||
pmax = v2.max_i16(v1.max_i16(v0));
|
||||
break;
|
||||
case GS_TRIANGLEFAN:
|
||||
pmin = v2.min(v1.min(v3));
|
||||
pmax = v2.max(v1.max(v3));
|
||||
pmin = v2.min_i16(v1.min_i16(v3));
|
||||
pmax = v2.max_i16(v1.max_i16(v3));
|
||||
break;
|
||||
}
|
||||
|
||||
GSVector4 test = pmax < m_scissor | pmin > m_scissor.zwxy();
|
||||
GSVector4i test = pmax.lt16(m_scissor) | pmin.gt16(m_scissor.zwzwl());
|
||||
|
||||
switch(prim)
|
||||
{
|
||||
|
@ -2396,7 +2412,7 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
|||
case GS_TRIANGLESTRIP:
|
||||
case GS_TRIANGLEFAN:
|
||||
case GS_SPRITE:
|
||||
test |= m_nativeres ? (pmin == pmax).zwzw() : pmin == pmax;
|
||||
test |= m_nativeres ? pmin.eq16(pmax).zwzwl() : pmin.eq16(pmax);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -2404,16 +2420,19 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
|||
{
|
||||
case GS_TRIANGLELIST:
|
||||
case GS_TRIANGLESTRIP:
|
||||
cross = (v2 - v1) * (v2 - v0).yxwz();
|
||||
test |= cross == cross.yxwz();
|
||||
// TODO: any way to do a 16-bit integer cross product?
|
||||
cross = GSVector4(v2.xyxyl().i16to32().sub32(v0.upl32(v1).i16to32())); // x20, y20, x21, y21
|
||||
cross = cross * cross.wzwz(); // x20 * y21, y20 * x21
|
||||
test |= GSVector4i::cast(cross == cross.yxwz());
|
||||
break;
|
||||
case GS_TRIANGLEFAN:
|
||||
cross = (v2 - v1) * (v2 - v3).yxwz();
|
||||
test |= cross == cross.yxwz();
|
||||
cross = GSVector4(v2.xyxyl().i16to32().sub32(v3.upl32(v1).i16to32())); // x23, y23, x21, y21
|
||||
cross = cross * cross.wzwz(); // x23 * y21, y23 * x21
|
||||
test |= GSVector4i::cast(cross == cross.yxwz());
|
||||
break;
|
||||
}
|
||||
|
||||
skip |= test.mask() & 3;
|
||||
skip |= test.mask() & 15;
|
||||
}
|
||||
|
||||
if(skip != 0)
|
||||
|
|
|
@ -148,16 +148,16 @@ protected:
|
|||
|
||||
GSVertex m_v;
|
||||
float m_q;
|
||||
GSVector4 m_scissor;
|
||||
uint32 m_ofxy;
|
||||
GSVector4i m_scissor;
|
||||
GSVector4i m_ofxy;
|
||||
bool m_texflush;
|
||||
|
||||
struct
|
||||
{
|
||||
GSVertex* buff;
|
||||
size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
|
||||
GSVector4 xy[4];
|
||||
size_t xy_tail;
|
||||
uint64 xy[4];
|
||||
} m_vertex;
|
||||
|
||||
struct
|
||||
|
|
|
@ -367,3 +367,99 @@ public:
|
|||
|
||||
virtual void Process(T& item) = 0;
|
||||
};
|
||||
|
||||
// http://software.intel.com/en-us/blogs/2012/11/06/exploring-intel-transactional-synchronization-extensions-with-intel-software
|
||||
|
||||
class TransactionScope
|
||||
{
|
||||
public:
|
||||
class Lock
|
||||
{
|
||||
volatile long state;
|
||||
|
||||
public:
|
||||
Lock()
|
||||
: state(0)
|
||||
{
|
||||
}
|
||||
|
||||
void lock()
|
||||
{
|
||||
while(_InterlockedCompareExchange(&state, 1, 0) != 0)
|
||||
{
|
||||
do {_mm_pause();} while(state == 1);
|
||||
}
|
||||
}
|
||||
|
||||
void unlock()
|
||||
{
|
||||
_InterlockedExchange(&state, 0);
|
||||
}
|
||||
|
||||
bool isLocked() const
|
||||
{
|
||||
return state == 1;
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
Lock& fallBackLock;
|
||||
|
||||
TransactionScope();
|
||||
|
||||
public:
|
||||
TransactionScope(Lock& fallBackLock_, int max_retries = 3)
|
||||
: fallBackLock(fallBackLock_)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
int nretries = 0;
|
||||
|
||||
while(1)
|
||||
{
|
||||
++nretries;
|
||||
|
||||
unsigned status = _xbegin();
|
||||
|
||||
if(status == _XBEGIN_STARTED)
|
||||
{
|
||||
if(!fallBackLock.isLocked()) return;
|
||||
|
||||
_xabort(0xff);
|
||||
}
|
||||
|
||||
if((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff && !(status & _XABORT_NESTED))
|
||||
{
|
||||
while(fallBackLock.isLocked()) _mm_pause();
|
||||
}
|
||||
else if(!(status & _XABORT_RETRY))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if(nretries >= max_retries)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
fallBackLock.lock();
|
||||
}
|
||||
|
||||
~TransactionScope()
|
||||
{
|
||||
if(fallBackLock.isLocked())
|
||||
{
|
||||
fallBackLock.unlock();
|
||||
}
|
||||
#if _M_SSE >= 0x501
|
||||
else
|
||||
{
|
||||
_xend();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -78,6 +78,8 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000
|
|||
const GSVector8 GSVector8::m_one(1.0f);
|
||||
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
|
||||
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
|
||||
const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
|
||||
const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -209,4 +211,4 @@ GSVector4i GSVector4i::fit(int preset) const
|
|||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,7 +159,7 @@ public:
|
|||
|
||||
__forceinline explicit GSVector4i(int i)
|
||||
{
|
||||
m = _mm_set1_epi32(i);
|
||||
*this = i;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4i(__m128i m)
|
||||
|
@ -190,7 +190,15 @@ public:
|
|||
|
||||
__forceinline void operator = (int i)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
m = _mm_broadcastd_epi32(_mm_cvtsi32_si128(i));
|
||||
|
||||
#else
|
||||
|
||||
m = _mm_set1_epi32(i);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline void operator = (__m128i m)
|
||||
|
@ -790,6 +798,16 @@ public:
|
|||
return upl32();
|
||||
}
|
||||
|
||||
__forceinline GSVector4i i8to16() const
|
||||
{
|
||||
return zero().upl8().sra16(8);
|
||||
}
|
||||
|
||||
__forceinline GSVector4i i16to32() const
|
||||
{
|
||||
return zero().upl16().sra32(16);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template<int i> __forceinline GSVector4i srl() const
|
||||
|
@ -1245,6 +1263,15 @@ public:
|
|||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
|
||||
{
|
||||
return GSVector4i(_mm_blend_epi32(m, v.m, i));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
|
||||
{
|
||||
GSVector4i v;
|
||||
|
@ -2446,23 +2473,31 @@ public:
|
|||
m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v));
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(float f)
|
||||
{
|
||||
m = _mm_set1_ps(f);
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(__m128 m)
|
||||
{
|
||||
this->m = m;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(float f)
|
||||
{
|
||||
*this = f;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(int i)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
|
||||
|
||||
#else
|
||||
|
||||
GSVector4i v((int)i);
|
||||
|
||||
*this = GSVector4(v);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector4(uint32 u)
|
||||
{
|
||||
GSVector4i v((int)u);
|
||||
|
@ -2493,7 +2528,15 @@ public:
|
|||
|
||||
__forceinline void operator = (float f)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
m = _mm_broadcastss_ps(_mm_load_ss(&f));
|
||||
|
||||
#else
|
||||
|
||||
m = _mm_set1_ps(f);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline void operator = (__m128 m)
|
||||
|
@ -2788,6 +2831,15 @@ public:
|
|||
return GSVector4(_mm_max_ps(m, a));
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
template<int mask> __forceinline GSVector4 blend32(const GSVector4& a) const
|
||||
{
|
||||
return GSVector4(_mm_blend_ps(m, a, mask));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
|
||||
{
|
||||
#if _M_SSE >= 0x401
|
||||
|
@ -2855,7 +2907,7 @@ public:
|
|||
#endif
|
||||
}
|
||||
|
||||
template<int src, int dst> __forceinline GSVector4 insert(const GSVector4& v) const
|
||||
template<int src, int dst> __forceinline GSVector4 insert32(const GSVector4& v) const
|
||||
{
|
||||
// TODO: use blendps when src == dst
|
||||
|
||||
|
@ -2918,7 +2970,7 @@ public:
|
|||
return *this;
|
||||
}
|
||||
|
||||
template<int i> __forceinline int extract() const
|
||||
template<int i> __forceinline int extract32() const
|
||||
{
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
|
@ -3273,7 +3325,15 @@ public:
|
|||
|
||||
__forceinline GSVector8i(__m128i m0, __m128i m1)
|
||||
{
|
||||
this->m = zero().insert<0>(m0).insert<1>(m1);
|
||||
#if 0 // _MSC_VER >= 1700
|
||||
|
||||
this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m0), _mm256_castsi128_si256(m1), 0);
|
||||
|
||||
#else
|
||||
|
||||
*this = zero().insert<0>(m0).insert<1>(m1);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline GSVector8i(const GSVector8i& v)
|
||||
|
@ -3283,20 +3343,12 @@ public:
|
|||
|
||||
__forceinline explicit GSVector8i(int i)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
|
||||
#else
|
||||
m = _mm256_set1_epi32(i);
|
||||
#endif
|
||||
*this = i;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8i(__m128i m)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
this->m = _mm256_broadcastsi128_si256(m);
|
||||
#else
|
||||
this->m = zero().insert<0>(m).insert<1>(m);
|
||||
#endif
|
||||
*this = m;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8i(__m256i m)
|
||||
|
@ -3311,19 +3363,19 @@ public:
|
|||
|
||||
__forceinline void operator = (int i)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i));
|
||||
#else
|
||||
m = _mm256_set1_epi32(i);
|
||||
#endif
|
||||
m = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)); // m = _mm256_set1_epi32(i);
|
||||
}
|
||||
|
||||
__forceinline void operator = (__m128i m)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
this->m = _mm256_broadcastsi128_si256(m);
|
||||
#if 0 // _MSC_VER >= 1700
|
||||
|
||||
this->m = _mm256_permute2x128_si256(_mm256_castsi128_si256(m), _mm256_castsi128_si256(m), 0);
|
||||
|
||||
#else
|
||||
this->m = zero().insert<0>(m).insert<1>(m);
|
||||
|
||||
*this = zero().insert<0>(m).aa();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -3609,74 +3661,78 @@ public:
|
|||
return GSVector8i(_mm256_unpackhi_epi64(m, _mm256_setzero_si256()));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i8to16() const
|
||||
// cross lane! from 128-bit to full 256-bit range
|
||||
|
||||
__forceinline GSVector8i i8to16c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi8_epi16(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u8to16() const
|
||||
__forceinline GSVector8i u8to16c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i8to32() const
|
||||
__forceinline GSVector8i i8to32c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi8_epi32(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u8to32() const
|
||||
__forceinline GSVector8i u8to32c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu8_epi32(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i8to64() const
|
||||
__forceinline GSVector8i i8to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi8_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u8to64() const
|
||||
__forceinline GSVector8i u8to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i16to32() const
|
||||
__forceinline GSVector8i i16to32c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u16to32() const
|
||||
__forceinline GSVector8i u16to32c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i16to64() const
|
||||
__forceinline GSVector8i i16to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi16_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u16to64() const
|
||||
__forceinline GSVector8i u16to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu16_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i i32to64() const
|
||||
__forceinline GSVector8i i32to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
__forceinline GSVector8i u32to64() const
|
||||
__forceinline GSVector8i u32to64c() const
|
||||
{
|
||||
return GSVector8i(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(m)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
template<int i> __forceinline GSVector8i srl() const
|
||||
{
|
||||
return GSVector8i(_mm256_srli_si156(m, i));
|
||||
return GSVector8i(_mm256_srli_si256(m, i));
|
||||
}
|
||||
|
||||
template<int i> __forceinline GSVector8i srl(const GSVector8i& v)
|
||||
{
|
||||
return GSVector4i(_mm256_alignr_epi8(v.m, m, i));
|
||||
return GSVector8i(_mm256_alignr_epi8(v.m, m, i));
|
||||
}
|
||||
|
||||
template<int i> __forceinline GSVector8i sll() const
|
||||
|
@ -4065,12 +4121,14 @@ public:
|
|||
|
||||
template<int i> __forceinline GSVector4i extract() const
|
||||
{
|
||||
return GSVector4i(_mm256_extractf128_si256(m, i));
|
||||
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
|
||||
|
||||
return GSVector4i(_mm256_extracti128_si256(m, i));
|
||||
}
|
||||
|
||||
template<int i> __forceinline GSVector8i insert(__m128i m) const
|
||||
{
|
||||
return GSVector8i(_mm256_insertf128_si256(this->m, m, i));
|
||||
return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
|
||||
}
|
||||
|
||||
// TODO: gather
|
||||
|
@ -4111,6 +4169,16 @@ public:
|
|||
*/
|
||||
}
|
||||
|
||||
__forceinline static GSVector8i load(const void* pll, const void* plh, const void* phl, const void* phh)
|
||||
{
|
||||
GSVector4i l = GSVector4i::load(pll, plh);
|
||||
GSVector4i h = GSVector4i::load(phl, phh);
|
||||
|
||||
return cast(l).ac(cast(h));
|
||||
|
||||
// return GSVector8i(l).insert<1>(h);
|
||||
}
|
||||
|
||||
template<bool aligned> __forceinline static GSVector8i load(const void* p)
|
||||
{
|
||||
return GSVector8i(aligned ? _mm256_load_si256((__m256i*)p) : _mm256_loadu_si256((__m256i*)p));
|
||||
|
@ -4243,6 +4311,78 @@ public:
|
|||
b = c.bd(d);
|
||||
}
|
||||
|
||||
__forceinline static void sw4(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
const __m256i epi32_0f0f0f0f = _mm256_set1_epi32(0x0f0f0f0f);
|
||||
|
||||
GSVector8i mask(epi32_0f0f0f0f);
|
||||
|
||||
GSVector8i e = (b << 4).blend(a, mask);
|
||||
GSVector8i f = b.blend(a >> 4, mask);
|
||||
GSVector8i g = (d << 4).blend(c, mask);
|
||||
GSVector8i h = d.blend(c >> 4, mask);
|
||||
|
||||
a = e.upl8(f);
|
||||
c = e.uph8(f);
|
||||
b = g.upl8(h);
|
||||
d = g.uph8(h);
|
||||
}
|
||||
|
||||
__forceinline static void sw8(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
GSVector8i e = a;
|
||||
GSVector8i f = c;
|
||||
|
||||
a = e.upl8(b);
|
||||
c = e.uph8(b);
|
||||
b = f.upl8(d);
|
||||
d = f.uph8(d);
|
||||
}
|
||||
|
||||
__forceinline static void sw16(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
GSVector8i e = a;
|
||||
GSVector8i f = c;
|
||||
|
||||
a = e.upl16(b);
|
||||
c = e.uph16(b);
|
||||
b = f.upl16(d);
|
||||
d = f.uph16(d);
|
||||
}
|
||||
|
||||
__forceinline static void sw32(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
GSVector8i e = a;
|
||||
GSVector8i f = c;
|
||||
|
||||
a = e.upl32(b);
|
||||
c = e.uph32(b);
|
||||
b = f.upl32(d);
|
||||
d = f.uph32(d);
|
||||
}
|
||||
|
||||
__forceinline static void sw64(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
GSVector8i e = a;
|
||||
GSVector8i f = c;
|
||||
|
||||
a = e.upl64(b);
|
||||
c = e.uph64(b);
|
||||
b = f.upl64(d);
|
||||
d = f.uph64(d);
|
||||
}
|
||||
|
||||
__forceinline static void sw128(GSVector8i& a, GSVector8i& b, GSVector8i& c, GSVector8i& d)
|
||||
{
|
||||
GSVector8i e = a;
|
||||
GSVector8i f = c;
|
||||
|
||||
a = e.ac(b);
|
||||
c = e.bd(b);
|
||||
b = f.ac(d);
|
||||
d = f.bd(d);
|
||||
}
|
||||
|
||||
__forceinline void operator += (const GSVector8i& v)
|
||||
{
|
||||
m = _mm256_add_epi32(m, v);
|
||||
|
@ -4706,6 +4846,8 @@ public:
|
|||
static const GSVector8 m_one;
|
||||
static const GSVector8 m_x7fffffff;
|
||||
static const GSVector8 m_x80000000;
|
||||
static const GSVector8 m_x4b000000;
|
||||
static const GSVector8 m_x4f800000;
|
||||
|
||||
__forceinline GSVector8()
|
||||
{
|
||||
|
@ -4723,9 +4865,7 @@ public:
|
|||
|
||||
__forceinline GSVector8(__m128 m0, __m128 m1)
|
||||
{
|
||||
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps (in vs2012 they simply changed it to vmovups, still can't keep the second xmm in a register)
|
||||
|
||||
#if _MSC_VER >= 1700
|
||||
#if 0 // _MSC_VER >= 1700
|
||||
|
||||
this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m0), _mm256_castps128_ps256(m1), 0x20);
|
||||
|
||||
|
@ -4743,23 +4883,27 @@ public:
|
|||
|
||||
__forceinline explicit GSVector8(float f)
|
||||
{
|
||||
m = _mm256_set1_ps(f);
|
||||
*this = f;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8(int i)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
m = _mm256_cvtepi32_ps(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(i)));
|
||||
|
||||
#else
|
||||
|
||||
GSVector4i v((int)i);
|
||||
|
||||
*this = GSVector4(v);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8(__m128 m)
|
||||
{
|
||||
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
|
||||
|
||||
#if _MSC_VER >= 1700
|
||||
|
||||
this->m = _mm256_castps128_ps256(m);
|
||||
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
|
||||
|
||||
#else
|
||||
|
||||
this->m = zero().insert<0>(m).aa();
|
||||
|
||||
#endif
|
||||
*this = m;
|
||||
}
|
||||
|
||||
__forceinline explicit GSVector8(__m256 m)
|
||||
|
@ -4785,17 +4929,22 @@ public:
|
|||
|
||||
__forceinline void operator = (float f)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
m = _mm256_broadcastss_ps(_mm_load_ss(&f));
|
||||
|
||||
#else
|
||||
|
||||
m = _mm256_set1_ps(f);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline void operator = (__m128 m)
|
||||
{
|
||||
// FIXME: MSVC bug, _mm256_castps128_ps256 may directy reload spilled regs from unaligned memory with vmovaps
|
||||
#if 0 // _MSC_VER >= 1700
|
||||
|
||||
#if _MSC_VER >= 1700
|
||||
|
||||
this->m = _mm256_castps128_ps256(m);
|
||||
this->m = _mm256_permute2f128_ps(this->m, this->m, 0);
|
||||
this->m = _mm256_permute2f128_ps(_mm256_castps128_ps256(m), _mm256_castps128_ps256(m), 0x20);
|
||||
|
||||
#else
|
||||
|
||||
|
@ -5092,7 +5241,70 @@ public:
|
|||
return _mm256_testz_ps(m, m) != 0;
|
||||
}
|
||||
|
||||
// TODO: 32-bit insert/extract
|
||||
|
||||
template<int src, int dst> __forceinline GSVector8 insert32(const GSVector8& v) const
|
||||
{
|
||||
// TODO: use blendps when src == dst
|
||||
|
||||
ASSERT(src < 4 && dst < 4); // not cross lane like extract32()
|
||||
|
||||
switch(dst)
|
||||
{
|
||||
case 0:
|
||||
switch(src)
|
||||
{
|
||||
case 0: return yyxx(v).zxzw(*this);
|
||||
case 1: return yyyy(v).zxzw(*this);
|
||||
case 2: return yyzz(v).zxzw(*this);
|
||||
case 3: return yyww(v).zxzw(*this);
|
||||
default: __assume(0);
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
switch(src)
|
||||
{
|
||||
case 0: return xxxx(v).xzzw(*this);
|
||||
case 1: return xxyy(v).xzzw(*this);
|
||||
case 2: return xxzz(v).xzzw(*this);
|
||||
case 3: return xxww(v).xzzw(*this);
|
||||
default: __assume(0);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
switch(src)
|
||||
{
|
||||
case 0: return xyzx(wwxx(v));
|
||||
case 1: return xyzx(wwyy(v));
|
||||
case 2: return xyzx(wwzz(v));
|
||||
case 3: return xyzx(wwww(v));
|
||||
default: __assume(0);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
switch(src)
|
||||
{
|
||||
case 0: return xyxz(zzxx(v));
|
||||
case 1: return xyxz(zzyy(v));
|
||||
case 2: return xyxz(zzzz(v));
|
||||
case 3: return xyxz(zzww(v));
|
||||
default: __assume(0);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
__assume(0);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<int i> __forceinline int extract32() const
|
||||
{
|
||||
if(i < 4) return extract<0>().extract<i>();
|
||||
else if(i < 8) return extract<1>().extract<i - 4>();
|
||||
else ASSERT(0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<int i> __forceinline GSVector8 insert(__m128 m) const
|
||||
{
|
||||
|
@ -5101,6 +5313,8 @@ public:
|
|||
|
||||
template<int i> __forceinline GSVector4 extract() const
|
||||
{
|
||||
if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
|
||||
|
||||
return GSVector4(_mm256_extractf128_ps(m, i));
|
||||
}
|
||||
|
||||
|
@ -5114,14 +5328,44 @@ public:
|
|||
return zero() == zero();
|
||||
}
|
||||
|
||||
// TODO: load low, ss
|
||||
// TODO
|
||||
|
||||
__forceinline static GSVector8 loadl(const void* p)
|
||||
{
|
||||
return GSVector8(_mm256_castps128_ps256(_mm_load_ps((float*)p)));
|
||||
}
|
||||
|
||||
__forceinline static GSVector8 loadh(const void* p)
|
||||
{
|
||||
return zero().insert<1>(_mm_load_ps((float*)p));
|
||||
}
|
||||
|
||||
__forceinline static GSVector8 loadh(const void* p, const GSVector8& v)
|
||||
{
|
||||
return GSVector8(_mm256_insertf128_ps(v, _mm_load_ps((float*)p), 1));
|
||||
}
|
||||
|
||||
__forceinline static GSVector8 load(const void* pl, const void* ph)
|
||||
{
|
||||
return loadh(ph, loadl(pl));
|
||||
}
|
||||
|
||||
template<bool aligned> __forceinline static GSVector8 load(const void* p)
|
||||
{
|
||||
return GSVector8(aligned ? _mm256_load_ps((const float*)p) : _mm256_loadu_ps((const float*)p));
|
||||
}
|
||||
|
||||
// TODO: store low, ss
|
||||
// TODO
|
||||
|
||||
__forceinline static void storel(void* p, const GSVector8& v)
|
||||
{
|
||||
_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 0));
|
||||
}
|
||||
|
||||
__forceinline static void storeh(void* p, const GSVector8& v)
|
||||
{
|
||||
_mm_store_ps((float*)p, _mm256_extractf128_ps(v.m, 1));
|
||||
}
|
||||
|
||||
template<bool aligned> __forceinline static void store(void* p, const GSVector8& v)
|
||||
{
|
||||
|
|
|
@ -402,6 +402,24 @@ struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_fr
|
|||
return retval;
|
||||
}
|
||||
|
||||
__forceinline long _InterlockedCompareExchange(volatile long* const Destination, const long Exchange, const long Comperand)
|
||||
{
|
||||
long retval = Comperand;
|
||||
|
||||
__asm__("lock; cmpxchgl %k[Exchange], %[Destination]" : [retval] "+a" (retval) : [Destination] "m" (*Destination), [Exchange] "q" (Exchange): "memory");
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
__forceinline long _InterlockedExchange(volatile long* const Target, const long Value)
|
||||
{
|
||||
long retval = Value;
|
||||
|
||||
__asm__("xchgl %[retval], %[Target]" : [retval] "+r" (retval) : [Target] "m" (*Target) : "memory");
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
__forceinline long _InterlockedExchangeAdd(volatile long* const Addend, const long Value)
|
||||
{
|
||||
long retval = Value;
|
||||
|
|
Loading…
Reference in New Issue