GSdx: Found some 16-bit integer overflow in GSState::VertexKick, some triangles could have been removed by the scissor test.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5680 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11@gmail.com 2013-06-24 01:11:00 +00:00
parent 37dd5f5bb5
commit 4dd898c754
8 changed files with 177 additions and 99 deletions

View File

@ -120,11 +120,13 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
{
if(sel.prim != GS_SPRITE_CLASS)
{
GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
if(has_f)
{
GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
for(int i = 0; i < 8; i++)
{
@ -134,9 +136,9 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
if(has_z)
{
GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
m_local.d8.p.z = dp8.extract32<2>();
m_local.d8.z = dz * shift[0];
GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
for(int i = 0; i < 8; i++)
{
@ -160,19 +162,19 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
if(has_t)
{
GSVector8 dt(dscan.t);
GSVector8 dt8 = dt * shift[0];
GSVector4 dt8 = dscan.t * GSVector4::broadcast32(&shift[0]);
if(sel.fst)
{
m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
m_local.d8.stq = GSVector4::cast(GSVector4i(dt8));
}
else
{
m_local.d8.stq = dt8;
}
GSVector8 dt(dscan.t);
for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++)
{
GSVector8 dstq;
@ -213,9 +215,11 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
{
if(sel.iip)
{
GSVector8 dc(dscan.c);
GSVector4 dc8 = dscan.c * GSVector4::broadcast32(&shift[0]);
m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
GSVector4i::storel(&m_local.d8.c, GSVector4i(dc8).xzyw().ps32());
GSVector8 dc(dscan.c);
GSVector8 dr = dc.xxxx();
GSVector8 db = dc.zzzz();
@ -490,9 +494,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
else
{
s = GSVector8::broadcast32(scan.t.xxxx()) + m_local.d[skip].s;
t = GSVector8::broadcast32(scan.t.yyyy()) + m_local.d[skip].t;
q = GSVector8::broadcast32(scan.t.zzzz()) + m_local.d[skip].q;
s = GSVector8::broadcast32(&scan.t.x) + m_local.d[skip].s;
t = GSVector8::broadcast32(&scan.t.y) + m_local.d[skip].t;
q = GSVector8::broadcast32(&scan.t.z) + m_local.d[skip].q;
}
}
@ -504,8 +508,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
c = c.upl16(c.zwxy());
rbf = GSVector8i::broadcast32(c.xxxx()).add16(m_local.d[skip].rb);
gaf = GSVector8i::broadcast32(c.zzzz()).add16(m_local.d[skip].ga);
rbf = GSVector8i::broadcast32(&c.x).add16(m_local.d[skip].rb);
gaf = GSVector8i::broadcast32(&c.z).add16(m_local.d[skip].ga);
}
else
{
@ -532,7 +536,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(sel.prim != GS_SPRITE_CLASS)
{
GSVector8 z = GSVector8::broadcast32(scan.p.zzzz()) + zo;
GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
if(sel.zoverflow)
{
@ -545,7 +549,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
else
{
zs = GSVector8i::broadcast32(GSVector4i::load(m_local.p.z));
zs = GSVector8i::broadcast32(&m_local.p.z);
}
if(sel.ztest)
@ -1104,7 +1108,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(sel.fwrite && sel.fge)
{
GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(GSVector4i::load(m_local.p.f));
GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(&m_local.p.f);
GSVector8i frb((int)m_global.frb);
GSVector8i fga((int)m_global.fga);
@ -1476,12 +1480,12 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.zb)
{
zo += m_local.d8.z;
zo += GSVector8::broadcast32(&m_local.d8.p.z);
}
if(sel.fwrite && sel.fge)
{
f = f.add16(m_local.d8.f);
f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
}
}
@ -1491,7 +1495,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.fst)
{
GSVector8i stq = GSVector8i::cast(m_local.d8.stq);
GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
@ -1502,7 +1506,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
else
{
GSVector8 stq = m_local.d8.stq;
GSVector8 stq(m_local.d8.stq);
s += stq.xxxx();
t += stq.yyyy();
@ -1515,7 +1519,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.iip)
{
GSVector8i c = m_local.d8.c;
GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero());
gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero());

View File

@ -468,22 +468,22 @@ void GSDrawScanlineCodeGenerator::Step()
if(m_sel.prim != GS_SPRITE_CLASS)
{
// z += m_local.d8.z;
// zo += GSVector8::broadcast32(&m_local.d8.p.z);
if(m_sel.zb)
{
vmovaps(ymm0, ptr[&m_local.temp.zo]);
vaddps(ymm0, ptr[&m_local.d8.z]);
vbroadcastss(ymm0, ptr[&m_local.d8.p.z]);
vaddps(ymm0, ptr[&m_local.temp.zo]);
vmovaps(ptr[&m_local.temp.zo], ymm0);
vaddps(ymm0, ptr[&m_local.temp.z]);
}
// f = f.add16(m_local.d4.f);
// f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f));
if(m_sel.fwrite && m_sel.fge)
{
vmovdqa(ymm1, ptr[&m_local.temp.f]);
vpaddw(ymm1, ptr[&m_local.d8.f]);
vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]);
vpaddw(ymm1, ptr[&m_local.temp.f]);
vmovdqa(ptr[&m_local.temp.f], ymm1);
}
}
@ -501,12 +501,11 @@ void GSDrawScanlineCodeGenerator::Step()
{
if(m_sel.fst)
{
// GSVector8i stq = m_local.d8.stq;
// GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq));
// s += stq.xxxx();
// if(!sprite) t += stq.yyyy();
vbroadcasti128(ymm4, ptr[&m_local.d8.stq]);
vmovdqa(ymm4, ptr[&m_local.d8.stq]);
// s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx());
vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
vpaddd(ymm2, ptr[&m_local.temp.s]);
@ -514,6 +513,8 @@ void GSDrawScanlineCodeGenerator::Step()
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
{
// t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy());
vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(ymm3, ptr[&m_local.temp.t]);
vmovdqa(ptr[&m_local.temp.t], ymm3);
@ -525,13 +526,13 @@ void GSDrawScanlineCodeGenerator::Step()
}
else
{
// GSVector8 stq = m_local.d8.stq;
// GSVector8 stq(m_local.d8.stq);
// s += stq.xxxx();
// t += stq.yyyy();
// q += stq.zzzz();
vmovaps(ymm4, ptr[&m_local.d8.stq]);
vbroadcastf128(ymm4, ptr[&m_local.d8.stq]);
vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1));
@ -551,12 +552,12 @@ void GSDrawScanlineCodeGenerator::Step()
{
if(m_sel.iip)
{
// GSVector8i c = m_local.d8.c;
// GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c);
// rb = rb.add16(c.xxxx());
// ga = ga.add16(c.yyyy());
vpbroadcastq(ymm7, ptr[&m_local.d8.c]);
vmovdqa(ymm7, ptr[&m_local.d8.c]);
// rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero());
// ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero());
vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1));

View File

@ -84,10 +84,12 @@ public:
void UpdateScissor()
{
scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4);
scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4);
scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4);
scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4);
ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800);
scissor.ex.u16[0] = (uint16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000);
scissor.ex.u16[1] = (uint16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000);
scissor.ex.u16[2] = (uint16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000);
scissor.ex.u16[3] = (uint16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000);
scissor.ofex = GSVector4(
(int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX),
@ -101,11 +103,11 @@ public:
(int)SCISSOR.SCAX1 + 1,
(int)SCISSOR.SCAY1 + 1);
uint16 ofx = (uint16)XYOFFSET.OFX;
uint16 ofy = (uint16)XYOFFSET.OFY;
scissor.ofxy.u32[0] = (ofy << 16) | ofx;
scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4
scissor.ofxy = GSVector4i(
0x8000,
0x8000,
(int)XYOFFSET.OFX - 15,
(int)XYOFFSET.OFY - 15);
}
bool DepthRead() const

View File

@ -162,7 +162,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap<KEY, VALUE>
hash_map<uint64, VALUE> m_cgmap;
GSCodeBuffer m_cb;
enum {MAX_SIZE = 4096};
enum {MAX_SIZE = 8192};
public:
GSCodeGeneratorFunctionMap(const char* name, void* param)

View File

@ -155,7 +155,7 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has
#if _M_SSE >= 0x501
struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
struct step {GSVector8 z, stq; GSVector8i c, f;} d8;
struct step {GSVector4 stq; struct {uint32 rb, ga;} c; struct {uint32 z, f;} p;} d8;
struct {GSVector8i rb, ga;} c;
struct {uint32 z, f;} p;

View File

@ -62,43 +62,52 @@ void GSSetupPrimCodeGenerator::Depth()
if(m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
vmulps(ymm1, ymm0, ymm3);
if(m_en.z)
{
// m_local.d8.p.z = dp8.extract32<2>();
vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
}
if(m_en.f)
{
// GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]);
vcvtps2dq(ymm2, ymm1);
vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
}
if(m_en.z)
{
// GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
// GSVector8 dz = GSVector8(dscan.p).zzzz();
vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]);
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
}
if(m_en.f)
{
// m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
// GSVector8 df = GSVector8(dscan.p).wwww();
vmulps(ymm0, ymm1, ymm3);
vcvttps2dq(ymm0, ymm0);
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d8.f], ymm0);
}
if(m_en.z)
{
// m_local.d8.z = dz * shift[0];
vmulps(ymm0, ymm2, ymm3);
vmovaps(ptr[&m_local.d8.z], ymm0);
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
}
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
if(m_en.z)
{
// m_local.d[i].z = dz * shift[1 + i];
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
vmovaps(ptr[&m_local.d[i].z], ymm0);
}
if(m_en.f)
{
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
@ -110,15 +119,6 @@ void GSSetupPrimCodeGenerator::Depth()
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d[i].f], ymm0);
}
if(m_en.z)
{
// m_local.d[i].z = dz * shift[1 + i];
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
vmovaps(ptr[&m_local.d[i].z], ymm0);
}
}
}
else
@ -170,13 +170,13 @@ void GSSetupPrimCodeGenerator::Texture()
vcvttps2dq(ymm1, ymm1);
vmovdqa(ptr[&m_local.d8.stq], ymm1);
vmovdqa(ptr[&m_local.d8.stq], xmm1);
}
else
{
// m_local.d8.stq = dt8;
vmovaps(ptr[&m_local.d8.stq], ymm1);
vmovaps(ptr[&m_local.d8.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -238,7 +238,7 @@ void GSSetupPrimCodeGenerator::Color()
vcvttps2dq(ymm1, ymm1);
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(ymm1, ymm1);
vmovdqa(ptr[&m_local.d8.c], ymm1);
vmovq(ptr[&m_local.d8.c], xmm1);
// ymm3 is not needed anymore

View File

@ -2338,12 +2338,12 @@ __forceinline void GSState::VertexKick(uint32 skip)
tailptr[0] = v0;
tailptr[1] = v1;
GSVector4i xy = v1.xxxx().sub16(m_ofxy);
GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy);
#if _M_SSE >= 0x501
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4)));
#if _M_SSE >= 0x401
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32());
#else
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy()));
GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32());
#endif
m_vertex.tail = ++tail;

View File

@ -524,6 +524,15 @@ public:
#endif
#if _M_SSE >= 0x501
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
{
return GSVector4i(_mm_blend_epi32(m, v.m, i));
}
#endif
__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
{
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));
@ -1261,15 +1270,6 @@ public:
#endif
#if _M_SSE >= 0x501
template<int i> __forceinline GSVector4i blend32(const GSVector4i& v) const
{
return GSVector4i(_mm_blend_epi32(m, v.m, i));
}
#endif
#if _M_SSE >= 0x401
template<int src, class T> __forceinline GSVector4i gather8_4(const T* ptr) const
@ -3263,6 +3263,21 @@ public:
VECTOR4_SHUFFLE_1(y, 1)
VECTOR4_SHUFFLE_1(z, 2)
VECTOR4_SHUFFLE_1(w, 3)
__forceinline GSVector4 broadcast32() const
{
return GSVector4(_mm_broadcastss_ps(m));
}
__forceinline static GSVector4 broadcast32(const GSVector4& v)
{
return GSVector4(_mm_broadcastss_ps(v.m));
}
__forceinline static GSVector4 broadcast32(const void* f)
{
return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
}
};
#if _M_SSE >= 0x501
@ -4181,17 +4196,39 @@ public:
// TODO: extract/insert
template<int i> __forceinline int extract8() const
{
ASSERT(i < 32);
GSVector4i v = extract<i / 16>();
return v.extract8<i & 15>();
}
template<int i> __forceinline int extract16() const
{
ASSERT(i < 16);
GSVector4i v = extract<i / 8>();
return v.extract16<i & 8>();
}
template<int i> __forceinline int extract32() const
{
ASSERT(i < 8);
GSVector4i v = extract<i / 4>();
if((i & 3) == 0) return GSVector4i::store(v);
return v.extract32<i>();
return v.extract32<i & 3>();
}
template<int i> __forceinline GSVector4i extract() const
{
ASSERT(i < 2);
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
return GSVector4i(_mm256_extracti128_si256(m, i));
@ -4199,6 +4236,8 @@ public:
template<int i> __forceinline GSVector8i insert(__m128i m) const
{
ASSERT(i < 2);
return GSVector8i(_mm256_inserti128_si256(this->m, m, i));
}
@ -4811,6 +4850,31 @@ public:
//return cast(v).aa(); // slowest
}
__forceinline static GSVector8i broadcast8(const void* p)
{
return GSVector8i(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(*(const int*)p)));
}
__forceinline static GSVector8i broadcast16(const void* p)
{
return GSVector8i(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(*(const int*)p)));
}
__forceinline static GSVector8i broadcast32(const void* p)
{
return GSVector8i(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(*(const int*)p)));
}
__forceinline static GSVector8i broadcast64(const void* p)
{
return GSVector8i(_mm256_broadcastq_epi64(_mm_loadl_epi64((const __m128i*)p)));
}
__forceinline static GSVector8i broadcast128(const void* p)
{
return GSVector8i(_mm256_broadcastsi128_si256(*(const __m128i*)p));
}
__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
__forceinline static GSVector8i xffffffff() {return zero() == zero();}
@ -5495,20 +5559,22 @@ public:
template<int i> __forceinline int extract32() const
{
if(i < 4) return extract<0>().extract<i>();
else if(i < 8) return extract<1>().extract<i - 4>();
else ASSERT(0);
ASSERT(i < 8);
return 0;
return extract<i / 4>().extract32<i & 3>();
}
template<int i> __forceinline GSVector8 insert(__m128 m) const
{
ASSERT(i < 2);
return GSVector8(_mm256_insertf128_ps(this->m, m, i));
}
template<int i> __forceinline GSVector4 extract() const
{
ASSERT(i < 2);
if(i == 0) return GSVector4(_mm256_castps256_ps128(m));
return GSVector4(_mm256_extractf128_ps(m, i));
@ -5831,6 +5897,11 @@ public:
return GSVector8(_mm256_broadcastss_ps(v.m));
}
__forceinline static GSVector8 broadcast32(const void* f)
{
return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f)));
}
// TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element
#endif