From 4dd898c754d5a265aa8e685028a0e3dce5ab4617 Mon Sep 17 00:00:00 2001 From: "gabest11@gmail.com" Date: Mon, 24 Jun 2013 01:11:00 +0000 Subject: [PATCH] GSdx: Found some 16-bit integer overflow in GSState::VertexKick, some triangles could have been removed by the scissor test. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5680 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GSDrawScanline.cpp | 50 +++++----- .../GSDrawScanlineCodeGenerator.x86.avx2.cpp | 33 ++++--- plugins/GSdx/GSDrawingContext.h | 20 ++-- plugins/GSdx/GSFunctionMap.h | 2 +- plugins/GSdx/GSScanlineEnvironment.h | 2 +- .../GSSetupPrimCodeGenerator.x86.avx2.cpp | 62 ++++++------ plugins/GSdx/GSState.cpp | 8 +- plugins/GSdx/GSVector.h | 99 ++++++++++++++++--- 8 files changed, 177 insertions(+), 99 deletions(-) diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 222ff373d0..568a9c2bc3 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -120,11 +120,13 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co { if(sel.prim != GS_SPRITE_CLASS) { + GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]); + if(has_f) { - GSVector8 df = GSVector8::broadcast32(dscan.p.wwww()); + m_local.d8.p.f = GSVector4i(dp8).extract32<3>(); - m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh(); + GSVector8 df = GSVector8::broadcast32(&dscan.p.w); for(int i = 0; i < 8; i++) { @@ -134,9 +136,9 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co if(has_z) { - GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz()); + m_local.d8.p.z = dp8.extract32<2>(); - m_local.d8.z = dz * shift[0]; + GSVector8 dz = GSVector8::broadcast32(&dscan.p.z); for(int i = 0; i < 8; i++) { @@ -160,19 +162,19 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co if(has_t) { - GSVector8 dt(dscan.t); - - GSVector8 dt8 = dt * shift[0]; + GSVector4 dt8 = dscan.t * GSVector4::broadcast32(&shift[0]); if(sel.fst) { - m_local.d8.stq = GSVector8::cast(GSVector8i(dt8)); + m_local.d8.stq = GSVector4::cast(GSVector4i(dt8)); } else { m_local.d8.stq = dt8; } + GSVector8 dt(dscan.t); + for(int j = 0, k = sel.fst ? 2 : 3; j < k; j++) { GSVector8 dstq; @@ -213,9 +215,11 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co { if(sel.iip) { - GSVector8 dc(dscan.c); + GSVector4 dc8 = dscan.c * GSVector4::broadcast32(&shift[0]); - m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32(); + GSVector4i::storel(&m_local.d8.c, GSVector4i(dc8).xzyw().ps32()); + + GSVector8 dc(dscan.c); GSVector8 dr = dc.xxxx(); GSVector8 db = dc.zzzz(); @@ -490,9 +494,9 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - s = GSVector8::broadcast32(scan.t.xxxx()) + m_local.d[skip].s; - t = GSVector8::broadcast32(scan.t.yyyy()) + m_local.d[skip].t; - q = GSVector8::broadcast32(scan.t.zzzz()) + m_local.d[skip].q; + s = GSVector8::broadcast32(&scan.t.x) + m_local.d[skip].s; + t = GSVector8::broadcast32(&scan.t.y) + m_local.d[skip].t; + q = GSVector8::broadcast32(&scan.t.z) + m_local.d[skip].q; } } @@ -504,8 +508,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS c = c.upl16(c.zwxy()); - rbf = GSVector8i::broadcast32(c.xxxx()).add16(m_local.d[skip].rb); - gaf = GSVector8i::broadcast32(c.zzzz()).add16(m_local.d[skip].ga); + rbf = GSVector8i::broadcast32(&c.x).add16(m_local.d[skip].rb); + gaf = GSVector8i::broadcast32(&c.z).add16(m_local.d[skip].ga); } else { @@ -532,7 +536,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.prim != GS_SPRITE_CLASS) { - GSVector8 z = GSVector8::broadcast32(scan.p.zzzz()) + zo; + GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo; if(sel.zoverflow) { @@ -545,7 +549,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - zs = GSVector8i::broadcast32(GSVector4i::load(m_local.p.z)); + zs = GSVector8i::broadcast32(&m_local.p.z); } if(sel.ztest) @@ -1104,7 +1108,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.fwrite && sel.fge) { - GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(GSVector4i::load(m_local.p.f)); + GSVector8i fog = sel.prim != GS_SPRITE_CLASS ? f : GSVector8i::broadcast16(&m_local.p.f); GSVector8i frb((int)m_global.frb); GSVector8i fga((int)m_global.fga); @@ -1476,12 +1480,12 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS { if(sel.zb) { - zo += m_local.d8.z; + zo += GSVector8::broadcast32(&m_local.d8.p.z); } if(sel.fwrite && sel.fge) { - f = f.add16(m_local.d8.f); + f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f)); } } @@ -1491,7 +1495,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS { if(sel.fst) { - GSVector8i stq = GSVector8i::cast(m_local.d8.stq); + GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq)); s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); @@ -1502,7 +1506,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else { - GSVector8 stq = m_local.d8.stq; + GSVector8 stq(m_local.d8.stq); s += stq.xxxx(); t += stq.yyyy(); @@ -1515,7 +1519,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS { if(sel.iip) { - GSVector8i c = m_local.d8.c; + GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c); rbf = rbf.add16(c.xxxx()).max_i16(GSVector8i::zero()); gaf = gaf.add16(c.yyyy()).max_i16(GSVector8i::zero()); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp index 03a51a777d..f06b853916 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp @@ -468,22 +468,22 @@ void GSDrawScanlineCodeGenerator::Step() if(m_sel.prim != GS_SPRITE_CLASS) { - // z += m_local.d8.z; + // zo += GSVector8::broadcast32(&m_local.d8.p.z); if(m_sel.zb) { - vmovaps(ymm0, ptr[&m_local.temp.zo]); - vaddps(ymm0, ptr[&m_local.d8.z]); + vbroadcastss(ymm0, ptr[&m_local.d8.p.z]); + vaddps(ymm0, ptr[&m_local.temp.zo]); vmovaps(ptr[&m_local.temp.zo], ymm0); vaddps(ymm0, ptr[&m_local.temp.z]); } - // f = f.add16(m_local.d4.f); + // f = f.add16(GSVector8i::broadcast16(&m_local.d8.p.f)); if(m_sel.fwrite && m_sel.fge) { - vmovdqa(ymm1, ptr[&m_local.temp.f]); - vpaddw(ymm1, ptr[&m_local.d8.f]); + vpbroadcastw(ymm1, ptr[&m_local.d8.p.f]); + vpaddw(ymm1, ptr[&m_local.temp.f]); vmovdqa(ptr[&m_local.temp.f], ymm1); } } @@ -501,12 +501,11 @@ void GSDrawScanlineCodeGenerator::Step() { if(m_sel.fst) { - // GSVector8i stq = m_local.d8.stq; + // GSVector8i stq = GSVector8i::cast(GSVector8(m_local.d8.stq)); - // s += stq.xxxx(); - // if(!sprite) t += stq.yyyy(); + vbroadcasti128(ymm4, ptr[&m_local.d8.stq]); - vmovdqa(ymm4, ptr[&m_local.d8.stq]); + // s = GSVector8::cast(GSVector8i::cast(s) + stq.xxxx()); vpshufd(ymm2, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); vpaddd(ymm2, ptr[&m_local.temp.s]); @@ -514,6 +513,8 @@ void GSDrawScanlineCodeGenerator::Step() if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin) { + // t = GSVector8::cast(GSVector8i::cast(t) + stq.yyyy()); + vpshufd(ymm3, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); vpaddd(ymm3, ptr[&m_local.temp.t]); vmovdqa(ptr[&m_local.temp.t], ymm3); @@ -525,13 +526,13 @@ void GSDrawScanlineCodeGenerator::Step() } else { - // GSVector8 stq = m_local.d8.stq; + // GSVector8 stq(m_local.d8.stq); // s += stq.xxxx(); // t += stq.yyyy(); // q += stq.zzzz(); - vmovaps(ymm4, ptr[&m_local.d8.stq]); + vbroadcastf128(ymm4, ptr[&m_local.d8.stq]); vshufps(ymm2, ymm4, ymm4, _MM_SHUFFLE(0, 0, 0, 0)); vshufps(ymm3, ymm4, ymm4, _MM_SHUFFLE(1, 1, 1, 1)); @@ -551,12 +552,12 @@ void GSDrawScanlineCodeGenerator::Step() { if(m_sel.iip) { - // GSVector8i c = m_local.d8.c; + // GSVector8i c = GSVector8i::broadcast64(&m_local.d8.c); - // rb = rb.add16(c.xxxx()); - // ga = ga.add16(c.yyyy()); + vpbroadcastq(ymm7, ptr[&m_local.d8.c]); - vmovdqa(ymm7, ptr[&m_local.d8.c]); + // rb = rb.add16(c.xxxx()).max_i16(GSVector8i::zero()); + // ga = ga.add16(c.yyyy()).max_i16(GSVector8i::zero()); vpshufd(ymm5, ymm7, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(ymm6, ymm7, _MM_SHUFFLE(1, 1, 1, 1)); diff --git a/plugins/GSdx/GSDrawingContext.h b/plugins/GSdx/GSDrawingContext.h index 6efa167c1b..85b6542fbd 100644 --- a/plugins/GSdx/GSDrawingContext.h +++ b/plugins/GSdx/GSDrawingContext.h @@ -84,10 +84,12 @@ public: void UpdateScissor() { - scissor.ex.u16[0] = (uint16)(SCISSOR.SCAX0 << 4); - scissor.ex.u16[1] = (uint16)(SCISSOR.SCAY0 << 4); - scissor.ex.u16[2] = (uint16)(SCISSOR.SCAX1 << 4); - scissor.ex.u16[3] = (uint16)(SCISSOR.SCAY1 << 4); + ASSERT(XYOFFSET.OFX <= 0xf800 && XYOFFSET.OFY <= 0xf800); + + scissor.ex.u16[0] = (uint16)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX - 0x8000); + scissor.ex.u16[1] = (uint16)((SCISSOR.SCAY0 << 4) + XYOFFSET.OFY - 0x8000); + scissor.ex.u16[2] = (uint16)((SCISSOR.SCAX1 << 4) + XYOFFSET.OFX - 0x8000); + scissor.ex.u16[3] = (uint16)((SCISSOR.SCAY1 << 4) + XYOFFSET.OFY - 0x8000); scissor.ofex = GSVector4( (int)((SCISSOR.SCAX0 << 4) + XYOFFSET.OFX), @@ -101,11 +103,11 @@ public: (int)SCISSOR.SCAX1 + 1, (int)SCISSOR.SCAY1 + 1); - uint16 ofx = (uint16)XYOFFSET.OFX; - uint16 ofy = (uint16)XYOFFSET.OFY; - - scissor.ofxy.u32[0] = (ofy << 16) | ofx; - scissor.ofxy.u32[1] = ((ofy - 15) << 16) | (ofx - 15); // ceil(xy) => (xy - offset + 15) >> 4 => (xy - [offset - 15]) >> 4 + scissor.ofxy = GSVector4i( + 0x8000, + 0x8000, + (int)XYOFFSET.OFX - 15, + (int)XYOFFSET.OFY - 15); } bool DepthRead() const diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index 60af124ff4..3e87ac6214 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -162,7 +162,7 @@ class GSCodeGeneratorFunctionMap : public GSFunctionMap hash_map m_cgmap; GSCodeBuffer m_cb; - enum {MAX_SIZE = 4096}; + enum {MAX_SIZE = 8192}; public: GSCodeGeneratorFunctionMap(const char* name, void* param) diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index 4265a8fe1a..cc71026b72 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -155,7 +155,7 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has #if _M_SSE >= 0x501 struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8]; - struct step {GSVector8 z, stq; GSVector8i c, f;} d8; + struct step {GSVector4 stq; struct {uint32 rb, ga;} c; struct {uint32 z, f;} p;} d8; struct {GSVector8i rb, ga;} c; struct {uint32 z, f;} p; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp index f8d46ea42a..cb62337316 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp @@ -62,43 +62,52 @@ void GSSetupPrimCodeGenerator::Depth() if(m_sel.prim != GS_SPRITE_CLASS) { - // GSVector4 p = dscan.p; + // GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]); + vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]); + + vmulps(ymm1, ymm0, ymm3); + + if(m_en.z) + { + // m_local.d8.p.z = dp8.extract32<2>(); + + vextractps(ptr[&m_local.d8.p.z], xmm1, 2); + } + if(m_en.f) { - // GSVector8 df = GSVector8::broadcast32(dscan.p.wwww()); + // m_local.d8.p.f = GSVector4i(dp8).extract32<3>(); - vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]); + vcvtps2dq(ymm2, ymm1); + vpextrd(ptr[&m_local.d8.p.f], xmm2, 3); } if(m_en.z) { - // GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz()); + // GSVector8 dz = GSVector8(dscan.p).zzzz(); - vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]); + vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2)); } if(m_en.f) { - // m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh(); + // GSVector8 df = GSVector8(dscan.p).wwww(); - vmulps(ymm0, ymm1, ymm3); - vcvttps2dq(ymm0, ymm0); - vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); - vmovdqa(ptr[&m_local.d8.f], ymm0); - } - - if(m_en.z) - { - // m_local.d8.z = dz * shift[0]; - - vmulps(ymm0, ymm2, ymm3); - vmovaps(ptr[&m_local.d8.z], ymm0); + vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3)); } for(int i = 0; i < (m_sel.notest ? 1 : 8); i++) { + if(m_en.z) + { + // m_local.d[i].z = dz * shift[1 + i]; + + if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i)); + else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]); + vmovaps(ptr[&m_local.d[i].z], ymm0); + } + if(m_en.f) { // m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh(); @@ -110,15 +119,6 @@ void GSSetupPrimCodeGenerator::Depth() vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); vmovdqa(ptr[&m_local.d[i].f], ymm0); } - - if(m_en.z) - { - // m_local.d[i].z = dz * shift[1 + i]; - - if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i)); - else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]); - vmovaps(ptr[&m_local.d[i].z], ymm0); - } } } else @@ -170,13 +170,13 @@ void GSSetupPrimCodeGenerator::Texture() vcvttps2dq(ymm1, ymm1); - vmovdqa(ptr[&m_local.d8.stq], ymm1); + vmovdqa(ptr[&m_local.d8.stq], xmm1); } else { // m_local.d8.stq = dt8; - vmovaps(ptr[&m_local.d8.stq], ymm1); + vmovaps(ptr[&m_local.d8.stq], xmm1); } for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) @@ -238,7 +238,7 @@ void GSSetupPrimCodeGenerator::Color() vcvttps2dq(ymm1, ymm1); vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0)); vpackssdw(ymm1, ymm1); - vmovdqa(ptr[&m_local.d8.c], ymm1); + vmovq(ptr[&m_local.d8.c], xmm1); // ymm3 is not needed anymore diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index 2c69119d8a..acdc51e477 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -2338,12 +2338,12 @@ __forceinline void GSState::VertexKick(uint32 skip) tailptr[0] = v0; tailptr[1] = v1; - GSVector4i xy = v1.xxxx().sub16(m_ofxy); + GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy); - #if _M_SSE >= 0x501 - GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend32<2>(xy.sra16(4))); + #if _M_SSE >= 0x401 + GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32()); #else - GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl32(xy.sra16(4).yyyy())); + GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32()); #endif m_vertex.tail = ++tail; diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 7cd5eafb6f..be75085bac 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -524,6 +524,15 @@ public: #endif + #if _M_SSE >= 0x501 + + template __forceinline GSVector4i blend32(const GSVector4i& v) const + { + return GSVector4i(_mm_blend_epi32(m, v.m, i)); + } + + #endif + __forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const { return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a))); @@ -1261,15 +1270,6 @@ public: #endif - #if _M_SSE >= 0x501 - - template __forceinline GSVector4i blend32(const GSVector4i& v) const - { - return GSVector4i(_mm_blend_epi32(m, v.m, i)); - } - - #endif - #if _M_SSE >= 0x401 template __forceinline GSVector4i gather8_4(const T* ptr) const @@ -3263,6 +3263,21 @@ public: VECTOR4_SHUFFLE_1(y, 1) VECTOR4_SHUFFLE_1(z, 2) VECTOR4_SHUFFLE_1(w, 3) + + __forceinline GSVector4 broadcast32() const + { + return GSVector4(_mm_broadcastss_ps(m)); + } + + __forceinline static GSVector4 broadcast32(const GSVector4& v) + { + return GSVector4(_mm_broadcastss_ps(v.m)); + } + + __forceinline static GSVector4 broadcast32(const void* f) + { + return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f))); + } }; #if _M_SSE >= 0x501 @@ -4181,17 +4196,39 @@ public: // TODO: extract/insert + template __forceinline int extract8() const + { + ASSERT(i < 32); + + GSVector4i v = extract(); + + return v.extract8(); + } + + template __forceinline int extract16() const + { + ASSERT(i < 16); + + GSVector4i v = extract(); + + return v.extract16(); + } + template __forceinline int extract32() const { + ASSERT(i < 8); + GSVector4i v = extract(); if((i & 3) == 0) return GSVector4i::store(v); - return v.extract32(); + return v.extract32(); } template __forceinline GSVector4i extract() const { + ASSERT(i < 2); + if(i == 0) return GSVector4i(_mm256_castsi256_si128(m)); return GSVector4i(_mm256_extracti128_si256(m, i)); @@ -4199,6 +4236,8 @@ public: template __forceinline GSVector8i insert(__m128i m) const { + ASSERT(i < 2); + return GSVector8i(_mm256_inserti128_si256(this->m, m, i)); } @@ -4811,6 +4850,31 @@ public: //return cast(v).aa(); // slowest } + __forceinline static GSVector8i broadcast8(const void* p) + { + return GSVector8i(_mm256_broadcastb_epi8(_mm_cvtsi32_si128(*(const int*)p))); + } + + __forceinline static GSVector8i broadcast16(const void* p) + { + return GSVector8i(_mm256_broadcastw_epi16(_mm_cvtsi32_si128(*(const int*)p))); + } + + __forceinline static GSVector8i broadcast32(const void* p) + { + return GSVector8i(_mm256_broadcastd_epi32(_mm_cvtsi32_si128(*(const int*)p))); + } + + __forceinline static GSVector8i broadcast64(const void* p) + { + return GSVector8i(_mm256_broadcastq_epi64(_mm_loadl_epi64((const __m128i*)p))); + } + + __forceinline static GSVector8i broadcast128(const void* p) + { + return GSVector8i(_mm256_broadcastsi128_si256(*(const __m128i*)p)); + } + __forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());} __forceinline static GSVector8i xffffffff() {return zero() == zero();} @@ -5495,20 +5559,22 @@ public: template __forceinline int extract32() const { - if(i < 4) return extract<0>().extract(); - else if(i < 8) return extract<1>().extract(); - else ASSERT(0); + ASSERT(i < 8); - return 0; + return extract().extract32(); } template __forceinline GSVector8 insert(__m128 m) const { + ASSERT(i < 2); + return GSVector8(_mm256_insertf128_ps(this->m, m, i)); } template __forceinline GSVector4 extract() const { + ASSERT(i < 2); + if(i == 0) return GSVector4(_mm256_castps256_ps128(m)); return GSVector4(_mm256_extractf128_ps(m, i)); @@ -5831,6 +5897,11 @@ public: return GSVector8(_mm256_broadcastss_ps(v.m)); } + __forceinline static GSVector8 broadcast32(const void* f) + { + return GSVector8(_mm256_broadcastss_ps(_mm_load_ss((const float*)f))); + } + // TODO: v.(x0|y0|z0|w0|x1|y1|z1|w1) // broadcast element #endif