mirror of https://github.com/PCSX2/pcsx2.git
GSdx: Found some 16-bit integer overflow in GSState::VertexKick, some triangles could have been removed by the scissor test.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5680 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
37dd5f5bb5
commit
4dd898c754
|
@ -120,11 +120,13 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
|
||||||
{
|
{
|
||||||
if(sel.prim != GS_SPRITE_CLASS)
|
if(sel.prim != GS_SPRITE_CLASS)
|
||||||
{
|
{
|
||||||
|
GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||||
|
|
||||||
if(has_f)
|
if(has_f)
|
||||||
{
|
{
|
||||||
GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
|
m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||||
|
|
||||||
m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
|
GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
|
||||||
|
|
||||||
for(int i = 0; i < 8; i++)
|
for(int i = 0; i < 8; i++)
|
||||||
{
|
{
|
||||||
|
@ -134,9 +136,9 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
|
||||||
|
|
||||||
if(has_z)
|
if(has_z)
|
||||||
{
|
{
|
||||||
GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
|
m_local.d8.p.z = dp8.extract32<2>();
|
||||||
|
|
||||||
m_local.d8.z = dz * shift[0];
|
GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
|
||||||
|
|
||||||
for(int i = 0; i < 8; i++)
|
for(int i = 0; i < 8; i++)
|
||||||
{
|
{
|
||||||
|
@ -160,19 +162,19 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
|
||||||
|
|
||||||
if(has_t)
|
if(has_t)
|
||||||
{
|
{
|
||||||
GSVector8 dt(dscan.t);
|
GSVector4 dt8 = dscan.t * GSVector4::broadcast32(&shift[0]);
|
||||||
|
|
||||||
GSVector8 dt8 = dt * shift[0];
|
|
||||||
|
|
||||||
if(sel.fst)
|
if(sel.fst)
|
||||||
{
|
{
|
||||||
m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
|
m_local.d8.stq = GSVector4::cast(GSVector4i(dt8));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
m_local.d8.stq = dt8;
|
m_local.d8.stq = dt8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GSVector8 dt(dscan.t);
|
||||||
|
|
||||||
for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++)
|
for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++)
|
||||||
{
|
{
|
||||||
GSVector8 dstq;
|
GSVector8 dstq;
|
||||||
|
@ -213,9 +215,11 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
|
||||||
{
|
{
|
||||||
if(sel.iip)
|
if(sel.iip)
|
||||||
{
|
{
|
||||||
GSVector8 dc(dscan.c);
|
GSVector4 dc8 = dscan.c * GSVector4::broadcast32(&shift[0]);
|
||||||
|
|
||||||
m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
|
GSVector4i::storel(&m_local.d8.c, GSVector4i(dc8).xzyw().ps32());
|
||||||
|
|
||||||
|
GSVector8 dc(dscan.c);
|
||||||
|
|
||||||
GSVector8 dr = dc.xxxx();
|
GSVector8 dr = dc.xxxx();
|
||||||
GSVector8 db = dc.zzzz();
|
GSVector8 db = dc.zzzz();
|
||||||
|
@ -490,9 +494,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
s = GSVector8::broadcast32(scan.t.xxxx()) + m_local.d[skip].s;
|
s = GSVector8::broadcast32(&scan.t.x) + m_local.d[skip].s;
|
||||||
t = GSVector8::broadcast32(scan.t.yyyy()) + m_local.d[skip].t;
|
t = GSVector8::broadcast32(&scan.t.y) + m_local.d[skip].t;
|
||||||
q = GSVector8::broadcast32(scan.t.zzzz()) + m_local.d[skip].q;
|
q = GSVector8::broadcast32(&scan.t.z) + m_local.d[skip].q;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -504,8 +508,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
|
|
||||||
c = c.upl16(c.zwxy());
|
c = c.upl16(c.zwxy());
|
||||||
|
|
||||||
rbf = GSVector8i::broadcast32(c.xxxx()).add16(m_local.d[skip].rb);
|
rbf = GSVector8i::broadcast32(&c.x).add16(m_local.d[skip].rb);
|
||||||
gaf = GSVector8i::broadcast32(c.zzzz()).add16(m_local.d[skip].ga);
|
gaf = GSVector8i::broadcast32(&c.z).add16(m_local.d[skip].ga);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -532,7 +536,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
|
|
||||||
if(sel.prim != GS_SPRITE_CLASS)
|
if(sel.prim != GS_SPRITE_CLASS)
|
||||||
{
|
{
|
||||||
GSVector8 z = GSVector8::broadcast32(scan.p.zzzz()) + zo;
|
GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
|
||||||
|
|
||||||
if(sel.zoverflow)
|
if(sel.zoverflow)
|
||||||
{
|
{
|
||||||
|
@ -545,7 +549,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
zs = GSVector8i::broadcast32(GSVector4i::load(m_local.p.z));
|
zs = GSVector8i::broadcast32(&m_local.p.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(sel.ztest)
|
if(sel.ztest)
|
||||||
|
@ -1104,7 +1108,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
|
|
||||||
if(sel.fwrite && sel.fge)
|
if(sel.fwrite && sel.fge)
|
||||||
{
|
{
|
||||||
GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(GSVector4i::load(m_local.p.f));
|
GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(&m_local.p.f);
|
||||||
|
|
||||||
GSVector8i frb((int)m_global.frb);
|
GSVector8i frb((int)m_global.frb);
|
||||||
GSVector8i fga((int)m_global.fga);
|
GSVector8i fga((int)m_global.fga);
|
||||||
|
@ -1476,12 +1480,12 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
{
|
{
|
||||||
if(sel.zb)
|
if(sel.zb)
|
||||||
{
|
{
|
||||||
zo += m_local.d8.z;
|
zo += GSVector8::broadcast32(&m_local.d8.p.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(sel.fwrite && sel.fge)
|
if(sel.fwrite && sel.fge)
|
||||||
{
|
{
|
||||||
f = f.add16(m_local.d8.f);
|
f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1491,7 +1495,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
{
|
{
|
||||||
if(sel.fst)
|
if(sel.fst)
|
||||||
{
|
{
|
||||||
GSVector8i stq = GSVector8i::cast(m_local.d8.stq);
|
GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
|
||||||
|
|
||||||
s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
|
s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
|
||||||
|
|
||||||
|
@ -1502,7 +1506,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
GSVector8 stq = m_local.d8.stq;
|
GSVector8 stq(m_local.d8.stq);
|
||||||
|
|
||||||
s += stq.xxxx();
|
s += stq.xxxx();
|
||||||
t += stq.yyyy();
|
t += stq.yyyy();
|
||||||
|
@ -1515,7 +1519,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
|
||||||
{
|
{
|
||||||
if(sel.iip)
|
if(sel.iip)
|
||||||
{
|
{
|
||||||
GSVector8i c = m_local.d8.c;
|
GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
|
||||||
|
|
||||||
rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero());
|
rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero());
|
||||||
gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero());
|
gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero());
|
||||||
|
|
|
@ -468,22 +468,22 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
|
|
||||||
if(m_sel.prim != GS_SPRITE_CLASS)
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
||||||
{
|
{
|
||||||
// z += m_local.d8.z;
|
// zo += GSVector8::broadcast32(&m_local.d8.p.z);
|
||||||
|
|
||||||
if(m_sel.zb)
|
if(m_sel.zb)
|
||||||
{
|
{
|
||||||
vmovaps(ymm0, ptr[&m_local.temp.zo]);
|
vbroadcastss(ymm0, ptr[&m_local.d8.p.z]);
|
||||||
vaddps(ymm0, ptr[&m_local.d8.z]);
|
vaddps(ymm0, ptr[&m_local.temp.zo]);
|
||||||
vmovaps(ptr[&m_local.temp.zo], ymm0);
|
vmovaps(ptr[&m_local.temp.zo], ymm0);
|
||||||
vaddps(ymm0, ptr[&m_local.temp.z]);
|
vaddps(ymm0, ptr[&m_local.temp.z]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// f = f.add16(m_local.d4.f);
|
// f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
|
||||||
|
|
||||||
if(m_sel.fwrite && m_sel.fge)
|
if(m_sel.fwrite && m_sel.fge)
|
||||||
{
|
{
|
||||||
vmovdqa(ymm1, ptr[&m_local.temp.f]);
|
vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]);
|
||||||
vpaddw(ymm1, ptr[&m_local.d8.f]);
|
vpaddw(ymm1, ptr[&m_local.temp.f]);
|
||||||
vmovdqa(ptr[&m_local.temp.f], ymm1);
|
vmovdqa(ptr[&m_local.temp.f], ymm1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -501,12 +501,11 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
{
|
{
|
||||||
if(m_sel.fst)
|
if(m_sel.fst)
|
||||||
{
|
{
|
||||||
// GSVector8i stq = m_local.d8.stq;
|
// GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
|
||||||
|
|
||||||
// s += stq.xxxx();
|
vbroadcasti128(ymm4, ptr[&m_local.d8.stq]);
|
||||||
// if(!sprite) t += stq.yyyy();
|
|
||||||
|
|
||||||
vmovdqa(ymm4, ptr[&m_local.d8.stq]);
|
// s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
|
||||||
|
|
||||||
vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
|
vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
vpaddd(ymm2, ptr[&m_local.temp.s]);
|
vpaddd(ymm2, ptr[&m_local.temp.s]);
|
||||||
|
@ -514,6 +513,8 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
|
|
||||||
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
|
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
|
||||||
{
|
{
|
||||||
|
// t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy());
|
||||||
|
|
||||||
vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
|
vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
vpaddd(ymm3, ptr[&m_local.temp.t]);
|
vpaddd(ymm3, ptr[&m_local.temp.t]);
|
||||||
vmovdqa(ptr[&m_local.temp.t], ymm3);
|
vmovdqa(ptr[&m_local.temp.t], ymm3);
|
||||||
|
@ -525,13 +526,13 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// GSVector8 stq = m_local.d8.stq;
|
// GSVector8 stq(m_local.d8.stq);
|
||||||
|
|
||||||
// s += stq.xxxx();
|
// s += stq.xxxx();
|
||||||
// t += stq.yyyy();
|
// t += stq.yyyy();
|
||||||
// q += stq.zzzz();
|
// q += stq.zzzz();
|
||||||
|
|
||||||
vmovaps(ymm4, ptr[&m_local.d8.stq]);
|
vbroadcastf128(ymm4, ptr[&m_local.d8.stq]);
|
||||||
|
|
||||||
vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
|
vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
|
vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
|
@ -551,12 +552,12 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
{
|
{
|
||||||
if(m_sel.iip)
|
if(m_sel.iip)
|
||||||
{
|
{
|
||||||
// GSVector8i c = m_local.d8.c;
|
// GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
|
||||||
|
|
||||||
// rb = rb.add16(c.xxxx());
|
vpbroadcastq(ymm7, ptr[&m_local.d8.c]);
|
||||||
// ga = ga.add16(c.yyyy());
|
|
||||||
|
|
||||||
vmovdqa(ymm7, ptr[&m_local.d8.c]);
|
// rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero());
|
||||||
|
// ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero());
|
||||||
|
|
||||||
vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0));
|
vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1));
|
vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
|
|
|
@ -84,10 +84,12 @@ public:
|
||||||
|
|
||||||
void UpdateScissor()
|
void UpdateScissor()
|
||||||
{
|
{
|
||||||
scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4);
|
ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
|
||||||
scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4);
|
|
||||||
scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4);
|
scissor.ex.u16[0] = (uint16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
|
||||||
scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4);
|
scissor.ex.u16[1] = (uint16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
|
||||||
|
scissor.ex.u16[2] = (uint16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
|
||||||
|
scissor.ex.u16[3] = (uint16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
|
||||||
|
|
||||||
scissor.ofex = GSVector4(
|
scissor.ofex = GSVector4(
|
||||||
(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
|
(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
|
||||||
|
@ -101,11 +103,11 @@ public:
|
||||||
(int)SCISSOR.SCAX1 + 1,
|
(int)SCISSOR.SCAX1 + 1,
|
||||||
(int)SCISSOR.SCAY1 + 1);
|
(int)SCISSOR.SCAY1 + 1);
|
||||||
|
|
||||||
uint16 ofx = (uint16)XYOFFSET.OFX;
|
scissor.ofxy = GSVector4i(
|
||||||
uint16 ofy = (uint16)XYOFFSET.OFY;
|
0x8000,
|
||||||
|
0x8000,
|
||||||
scissor.ofxy.u32[0] = (ofy << 16) | ofx;
|
(int)XYOFFSET.OFX - 15,
|
||||||
scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
|
(int)XYOFFSET.OFY - 15);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DepthRead() const
|
bool DepthRead() const
|
||||||
|
|
|
@ -162,7 +162,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap<KEY, VALUE>
|
||||||
hash_map<uint64, VALUE> m_cgmap;
|
hash_map<uint64, VALUE> m_cgmap;
|
||||||
GSCodeBuffer m_cb;
|
GSCodeBuffer m_cb;
|
||||||
|
|
||||||
enum {MAX_SIZE = 4096};
|
enum {MAX_SIZE = 8192};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
GSCodeGeneratorFunctionMap(const char* name, void* param)
|
GSCodeGeneratorFunctionMap(const char* name, void* param)
|
||||||
|
|
|
@ -155,7 +155,7 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
|
|
||||||
struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
|
struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
|
||||||
struct step {GSVector8 z, stq; GSVector8i c, f;} d8;
|
struct step {GSVector4 stq; struct {uint32 rb, ga;} c; struct {uint32 z, f;} p;} d8;
|
||||||
struct {GSVector8i rb, ga;} c;
|
struct {GSVector8i rb, ga;} c;
|
||||||
struct {uint32 z, f;} p;
|
struct {uint32 z, f;} p;
|
||||||
|
|
||||||
|
|
|
@ -62,43 +62,52 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
|
|
||||||
if(m_sel.prim != GS_SPRITE_CLASS)
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
||||||
{
|
{
|
||||||
// GSVector4 p = dscan.p;
|
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||||
|
|
||||||
|
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||||
|
|
||||||
|
vmulps(ymm1, ymm0, ymm3);
|
||||||
|
|
||||||
|
if(m_en.z)
|
||||||
|
{
|
||||||
|
// m_local.d8.p.z = dp8.extract32<2>();
|
||||||
|
|
||||||
|
vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
|
||||||
|
}
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
// GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
|
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||||
|
|
||||||
vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]);
|
vcvtps2dq(ymm2, ymm1);
|
||||||
|
vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(m_en.z)
|
if(m_en.z)
|
||||||
{
|
{
|
||||||
// GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
|
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||||
|
|
||||||
vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]);
|
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
// m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
|
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||||
|
|
||||||
vmulps(ymm0, ymm1, ymm3);
|
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||||
vcvttps2dq(ymm0, ymm0);
|
|
||||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
|
||||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
|
||||||
vmovdqa(ptr[&m_local.d8.f], ymm0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(m_en.z)
|
|
||||||
{
|
|
||||||
// m_local.d8.z = dz * shift[0];
|
|
||||||
|
|
||||||
vmulps(ymm0, ymm2, ymm3);
|
|
||||||
vmovaps(ptr[&m_local.d8.z], ymm0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||||
{
|
{
|
||||||
|
if(m_en.z)
|
||||||
|
{
|
||||||
|
// m_local.d[i].z = dz * shift[1 + i];
|
||||||
|
|
||||||
|
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||||
|
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
|
||||||
|
vmovaps(ptr[&m_local.d[i].z], ymm0);
|
||||||
|
}
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||||
|
@ -110,15 +119,6 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
vmovdqa(ptr[&m_local.d[i].f], ymm0);
|
vmovdqa(ptr[&m_local.d[i].f], ymm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(m_en.z)
|
|
||||||
{
|
|
||||||
// m_local.d[i].z = dz * shift[1 + i];
|
|
||||||
|
|
||||||
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
|
|
||||||
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
|
|
||||||
vmovaps(ptr[&m_local.d[i].z], ymm0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -170,13 +170,13 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
vcvttps2dq(ymm1, ymm1);
|
vcvttps2dq(ymm1, ymm1);
|
||||||
|
|
||||||
vmovdqa(ptr[&m_local.d8.stq], ymm1);
|
vmovdqa(ptr[&m_local.d8.stq], xmm1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// m_local.d8.stq = dt8;
|
// m_local.d8.stq = dt8;
|
||||||
|
|
||||||
vmovaps(ptr[&m_local.d8.stq], ymm1);
|
vmovaps(ptr[&m_local.d8.stq], xmm1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||||
|
@ -238,7 +238,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vcvttps2dq(ymm1, ymm1);
|
vcvttps2dq(ymm1, ymm1);
|
||||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
vpackssdw(ymm1, ymm1);
|
vpackssdw(ymm1, ymm1);
|
||||||
vmovdqa(ptr[&m_local.d8.c], ymm1);
|
vmovq(ptr[&m_local.d8.c], xmm1);
|
||||||
|
|
||||||
// ymm3 is not needed anymore
|
// ymm3 is not needed anymore
|
||||||
|
|
||||||
|
|
|
@ -2338,12 +2338,12 @@ __forceinline void GSState::VertexKick(uint32 skip)
|
||||||
tailptr[0] = v0;
|
tailptr[0] = v0;
|
||||||
tailptr[1] = v1;
|
tailptr[1] = v1;
|
||||||
|
|
||||||
GSVector4i xy = v1.xxxx().sub16(m_ofxy);
|
GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy);
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x401
|
||||||
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4)));
|
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
|
||||||
#else
|
#else
|
||||||
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy()));
|
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
m_vertex.tail = ++tail;
|
m_vertex.tail = ++tail;
|
||||||
|
|
|
@ -524,6 +524,15 @@ public:
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if _M_SSE >= 0x501
|
||||||
|
|
||||||
|
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
|
||||||
|
{
|
||||||
|
return GSVector4i(_mm_blend_epi32(m, v.m, i));
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
|
__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
|
||||||
{
|
{
|
||||||
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
|
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
|
||||||
|
@ -1261,15 +1270,6 @@ public:
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
|
||||||
|
|
||||||
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
|
|
||||||
{
|
|
||||||
return GSVector4i(_mm_blend_epi32(m, v.m, i));
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if _M_SSE >= 0x401
|
#if _M_SSE >= 0x401
|
||||||
|
|
||||||
template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
|
template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
|
||||||
|
@ -3263,6 +3263,21 @@ public:
|
||||||
VECTOR4_SHUFFLE_1(y, 1)
|
VECTOR4_SHUFFLE_1(y, 1)
|
||||||
VECTOR4_SHUFFLE_1(z, 2)
|
VECTOR4_SHUFFLE_1(z, 2)
|
||||||
VECTOR4_SHUFFLE_1(w, 3)
|
VECTOR4_SHUFFLE_1(w, 3)
|
||||||
|
|
||||||
|
__forceinline GSVector4 broadcast32() const
|
||||||
|
{
|
||||||
|
return GSVector4(_mm_broadcastss_ps(m));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector4 broadcast32(const GSVector4& v)
|
||||||
|
{
|
||||||
|
return GSVector4(_mm_broadcastss_ps(v.m));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector4 broadcast32(const void* f)
|
||||||
|
{
|
||||||
|
return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
|
@ -4181,17 +4196,39 @@ public:
|
||||||
|
|
||||||
// TODO: extract/insert
|
// TODO: extract/insert
|
||||||
|
|
||||||
|
template<int i> __forceinline int extract8() const
|
||||||
|
{
|
||||||
|
ASSERT(i < 32);
|
||||||
|
|
||||||
|
GSVector4i v = extract<i / 16>();
|
||||||
|
|
||||||
|
return v.extract8<i & 15>();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int i> __forceinline int extract16() const
|
||||||
|
{
|
||||||
|
ASSERT(i < 16);
|
||||||
|
|
||||||
|
GSVector4i v = extract<i / 8>();
|
||||||
|
|
||||||
|
return v.extract16<i & 8>();
|
||||||
|
}
|
||||||
|
|
||||||
template<int i> __forceinline int extract32() const
|
template<int i> __forceinline int extract32() const
|
||||||
{
|
{
|
||||||
|
ASSERT(i < 8);
|
||||||
|
|
||||||
GSVector4i v = extract<i / 4>();
|
GSVector4i v = extract<i / 4>();
|
||||||
|
|
||||||
if((i & 3) == 0) return GSVector4i::store(v);
|
if((i & 3) == 0) return GSVector4i::store(v);
|
||||||
|
|
||||||
return v.extract32<i>();
|
return v.extract32<i & 3>();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int i> __forceinline GSVector4i extract() const
|
template<int i> __forceinline GSVector4i extract() const
|
||||||
{
|
{
|
||||||
|
ASSERT(i < 2);
|
||||||
|
|
||||||
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
|
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
|
||||||
|
|
||||||
return GSVector4i(_mm256_extracti128_si256(m, i));
|
return GSVector4i(_mm256_extracti128_si256(m, i));
|
||||||
|
@ -4199,6 +4236,8 @@ public:
|
||||||
|
|
||||||
template<int i> __forceinline GSVector8i insert(__m128i m) const
|
template<int i> __forceinline GSVector8i insert(__m128i m) const
|
||||||
{
|
{
|
||||||
|
ASSERT(i < 2);
|
||||||
|
|
||||||
return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
|
return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4811,6 +4850,31 @@ public:
|
||||||
//return cast(v).aa(); // slowest
|
//return cast(v).aa(); // slowest
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8i broadcast8(const void* p)
|
||||||
|
{
|
||||||
|
return GSVector8i(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(*(const int*)p)));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8i broadcast16(const void* p)
|
||||||
|
{
|
||||||
|
return GSVector8i(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(*(const int*)p)));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8i broadcast32(const void* p)
|
||||||
|
{
|
||||||
|
return GSVector8i(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(*(const int*)p)));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8i broadcast64(const void* p)
|
||||||
|
{
|
||||||
|
return GSVector8i(_mm256_broadcastq_epi64(_mm_loadl_epi64((const __m128i*)p)));
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8i broadcast128(const void* p)
|
||||||
|
{
|
||||||
|
return GSVector8i(_mm256_broadcastsi128_si256(*(const __m128i*)p));
|
||||||
|
}
|
||||||
|
|
||||||
__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
|
__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
|
||||||
|
|
||||||
__forceinline static GSVector8i xffffffff() {return zero() == zero();}
|
__forceinline static GSVector8i xffffffff() {return zero() == zero();}
|
||||||
|
@ -5495,20 +5559,22 @@ public:
|
||||||
|
|
||||||
template<int i> __forceinline int extract32() const
|
template<int i> __forceinline int extract32() const
|
||||||
{
|
{
|
||||||
if(i < 4) return extract<0>().extract<i>();
|
ASSERT(i < 8);
|
||||||
else if(i < 8) return extract<1>().extract<i - 4>();
|
|
||||||
else ASSERT(0);
|
|
||||||
|
|
||||||
return 0;
|
return extract<i / 4>().extract32<i & 3>();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int i> __forceinline GSVector8 insert(__m128 m) const
|
template<int i> __forceinline GSVector8 insert(__m128 m) const
|
||||||
{
|
{
|
||||||
|
ASSERT(i < 2);
|
||||||
|
|
||||||
return GSVector8(_mm256_insertf128_ps(this->m, m, i));
|
return GSVector8(_mm256_insertf128_ps(this->m, m, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int i> __forceinline GSVector4 extract() const
|
template<int i> __forceinline GSVector4 extract() const
|
||||||
{
|
{
|
||||||
|
ASSERT(i < 2);
|
||||||
|
|
||||||
if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
|
if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
|
||||||
|
|
||||||
return GSVector4(_mm256_extractf128_ps(m, i));
|
return GSVector4(_mm256_extractf128_ps(m, i));
|
||||||
|
@ -5831,6 +5897,11 @@ public:
|
||||||
return GSVector8(_mm256_broadcastss_ps(v.m));
|
return GSVector8(_mm256_broadcastss_ps(v.m));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline static GSVector8 broadcast32(const void* f)
|
||||||
|
{
|
||||||
|
return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f)));
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
|
// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue