diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h index 1c38fcb529..99b255bcbb 100644 --- a/plugins/GSdx/GSBlock.h +++ b/plugins/GSdx/GSBlock.h @@ -158,8 +158,6 @@ public: { GSVector4i v4((int)mask); - #if _M_SSE >= 0x401 - if(mask == 0xff000000 || mask == 0x00ffffff) { ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, v4); @@ -169,19 +167,11 @@ public: } else { - - #endif - - ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4); - ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4); - ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4); - ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4); - - #if _M_SSE >= 0x401 - + ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend(v0, v4); + ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend(v1, v4); + ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4); + ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4); } - - #endif } #endif @@ -524,40 +514,18 @@ public: GSVector4i::store(&d1[0], v1); GSVector4i::store(&d1[1], v3); - #else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = s[i * 4 + 0]; - GSVector4i v1 = s[i * 4 + 1]; - GSVector4i v2 = s[i * 4 + 2]; - GSVector4i v3 = s[i * 4 + 3]; - - //for(int16 i = 0; i < 8; i++) {v0.i16[i] = i; v1.i16[i] = i + 8; v2.i16[i] = i + 16; v3.i16[i] = i + 24;} - - GSVector4i::sw16(v0, v1, v2, v3); - GSVector4i::sw32(v0, v1, v2, v3); - GSVector4i::sw16(v0, v2, v1, v3); - - GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; - GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - - GSVector4i::store(&d0[0], v0); - GSVector4i::store(&d0[1], v1); - GSVector4i::store(&d1[0], v2); - GSVector4i::store(&d1[1], v3); - #endif } template __forceinline static void ReadColumn8(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) { + //for(int j = 0; j < 64; j++) ((uint8*)src)[j] = (uint8)j; - #if 0//_M_SSE >= 0x501 + #if 0 //_M_SSE >= 0x501 const GSVector8i* s = (const GSVector8i*)src; - + GSVector8i v0 = s[i * 2 + 0]; GSVector8i v1 = s[i * 2 + 1]; @@ -578,7 +546,7 @@ public: // TODO: not sure if this is worth it, not in this form, there should be a shorter path - #elif _M_SSE >= 0x301 + #else const GSVector4i* s = (const GSVector4i*)src; @@ -612,36 +580,6 @@ public: GSVector4i::store(&dst[dstpitch * 2], v1); GSVector4i::store(&dst[dstpitch * 3], v2); - #else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = s[i * 4 + 0]; - GSVector4i v1 = s[i * 4 + 1]; - GSVector4i v2 = s[i * 4 + 2]; - GSVector4i v3 = s[i * 4 + 3]; - - GSVector4i::sw8(v0, v1, v2, v3); - GSVector4i::sw16(v0, v1, v2, v3); - GSVector4i::sw8(v0, v2, v1, v3); - GSVector4i::sw64(v0, v1, v2, v3); - - if((i & 1) == 0) - { - v2 = v2.yxwz(); - v3 = v3.yxwz(); - } - else - { - v0 = v0.yxwz(); - v1 = v1.yxwz(); - } - - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v1); - GSVector4i::store(&dst[dstpitch * 2], v2); - GSVector4i::store(&dst[dstpitch * 3], v3); - #endif } @@ -649,8 +587,6 @@ public: { //printf("ReadColumn4\n"); - #if _M_SSE >= 0x301 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = s[i * 4 + 0].xzyw(); @@ -680,46 +616,6 @@ public: GSVector4i::store(&dst[dstpitch * 1], v1); GSVector4i::store(&dst[dstpitch * 2], v2); GSVector4i::store(&dst[dstpitch * 3], v3); - - #else - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0 = s[i * 4 + 0]; - 
GSVector4i v1 = s[i * 4 + 1]; - GSVector4i v2 = s[i * 4 + 2]; - GSVector4i v3 = s[i * 4 + 3]; - - GSVector4i::sw32(v0, v1, v2, v3); - GSVector4i::sw32(v0, v1, v2, v3); - GSVector4i::sw4(v0, v2, v1, v3); - GSVector4i::sw8(v0, v1, v2, v3); - GSVector4i::sw16(v0, v2, v1, v3); - - v0 = v0.xzyw(); - v1 = v1.xzyw(); - v2 = v2.xzyw(); - v3 = v3.xzyw(); - - GSVector4i::sw64(v0, v1, v2, v3); - - if((i & 1) == 0) - { - v2 = v2.yxwzlh(); - v3 = v3.yxwzlh(); - } - else - { - v0 = v0.yxwzlh(); - v1 = v1.yxwzlh(); - } - - GSVector4i::store(&dst[dstpitch * 0], v0); - GSVector4i::store(&dst[dstpitch * 1], v1); - GSVector4i::store(&dst[dstpitch * 2], v2); - GSVector4i::store(&dst[dstpitch * 3], v3); - - #endif } static void ReadColumn32(int y, const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch) @@ -1238,7 +1134,6 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x401 const GSVector4i* s = (const GSVector4i*)src; @@ -1246,15 +1141,6 @@ public: GSVector4i v1 = (s[j * 2 + 1] >> 24).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); - - #else - - for(int i = 0; i < 8; i++) - { - ((uint16*)dst)[i] = (uint16)pal[src[j * 8 + i] >> 24]; - } - - #endif } } @@ -1273,23 +1159,12 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal); GSVector4i v1 = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); - - #else - - for(int i = 0; i < 8; i++) - { - ((uint16*)dst)[i] = (uint16)pal[(src[j * 8 + i] >> 24) & 0xf]; - } - - #endif } } @@ -1308,23 +1183,12 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0 = (s[j * 2 + 0] >> 28).gather32_32<>(pal); GSVector4i v1 = (s[j * 2 + 1] >> 28).gather32_32<>(pal); ((GSVector4i*)dst)[0] = v0.pu32(v1); - - #else - - for(int i = 0; i < 8; i++) - { - ((uint16*)dst)[i] = (uint16)pal[src[j * 8 + i] >> 28]; - } - - #endif } } @@ -1486,32 +1350,6 @@ public: ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask); } - #else - - GSVector4i v0, v1, v2, v3; - GSVector4i mask = GSVector4i::xff000000(); - - for(int i = 0; i < 4; i++, src += srcpitch * 2) - { - v4 = GSVector4i::loadl(&src[srcpitch * 0]); - v5 = GSVector4i::loadl(&src[srcpitch * 1]); - - v6 = v4.upl16(v5); - - v4 = v6.upl8(v6); - v5 = v6.uph8(v6); - - v0 = v4.upl16(v4); - v1 = v4.uph16(v4); - v2 = v5.upl16(v5); - v3 = v5.uph16(v5); - - ((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask); - ((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask); - ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask); - ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask); - } - #endif } @@ -1608,47 +1446,6 @@ public: ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); } - #else - - GSVector4i v0, v1, v2, v3; - GSVector4i mask = GSVector4i(0x0f000000); - - for(int i = 0; i < 2; i++, src += srcpitch * 4) - { - GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); - - v4 = v.upl8(v >> 4); - v5 = v.uph8(v >> 4); - - v6 = v4.upl16(v5); - v7 = v4.uph16(v5); - - v4 = v6.upl8(v6); - v5 = v6.uph8(v6); - v6 = v7.upl8(v7); - v7 = v7.uph8(v7); - - v0 = v4.upl16(v4); - v1 = v4.uph16(v4); - v2 = v5.upl16(v5); - v3 = v5.uph16(v5); - - 
((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); - - v0 = v6.upl16(v6); - v1 = v6.uph16(v6); - v2 = v7.upl16(v7); - v3 = v7.uph16(v7); - - ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); - } - #endif } @@ -1736,47 +1533,6 @@ public: ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); } - #else - - GSVector4i v0, v1, v2, v3; - GSVector4i mask = GSVector4i::xf0000000(); - - for(int i = 0; i < 2; i++, src += srcpitch * 4) - { - GSVector4i v(*(uint32*)&src[srcpitch * 0], *(uint32*)&src[srcpitch * 2], *(uint32*)&src[srcpitch * 1], *(uint32*)&src[srcpitch * 3]); - - v4 = (v << 4).upl8(v); - v5 = (v << 4).uph8(v); - - v6 = v4.upl16(v5); - v7 = v4.uph16(v5); - - v4 = v6.upl8(v6); - v5 = v6.uph8(v6); - v6 = v7.upl8(v7); - v7 = v7.uph8(v7); - - v0 = v4.upl16(v4); - v1 = v4.uph16(v4); - v2 = v5.upl16(v5); - v3 = v5.uph16(v5); - - ((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask); - - v0 = v6.upl16(v6); - v1 = v6.uph16(v6); - v2 = v7.upl16(v7); - v3 = v7.uph16(v7); - - ((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask); - ((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask); - ((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask); - ((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask); - } - #endif } @@ -1882,39 +1638,6 @@ public: d1[1] = Expand16to32(v1.uph16(v1), TA0, TA1); } - #elif 0 // not faster - - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i TA0(TEXA.TA0 << 24); - GSVector4i TA1(TEXA.TA1 << 24); - - for(int i = 0; i < 4; i++, dst += dstpitch * 2) - { - GSVector4i v0 = s[i * 4 + 0]; - GSVector4i v1 = s[i * 4 + 1]; - GSVector4i v2 = s[i * 4 + 2]; - GSVector4i v3 = s[i * 4 + 3]; - - GSVector4i::sw16(v0, v1, v2, v3); - GSVector4i::sw32(v0, v1, v2, v3); - GSVector4i::sw16(v0, v2, v1, v3); - - GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; - - d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); - d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); - d0[2] = Expand16to32(v1.upl16(v1), TA0, TA1); - d0[3] = Expand16to32(v1.uph16(v1), TA0, TA1); - - GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; - - d1[0] = Expand16to32(v2.upl16(v2), TA0, TA1); - d1[1] = Expand16to32(v2.uph16(v2), TA0, TA1); - d1[2] = Expand16to32(v3.upl16(v3), TA0, TA1); - d1[3] = Expand16to32(v3.uph16(v3), TA0, TA1); - } - #else alignas(32) uint16 block[16 * 8]; @@ -1930,8 +1653,6 @@ public: { //printf("ReadAndExpandBlock8_32\n"); - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; @@ -1973,16 +1694,6 @@ public: v2.gather32_8<>(pal, (GSVector4i*)dst); dst += dstpitch; } - - #else - - alignas(32) uint8 block[16 * 16]; - - ReadBlock8(src, (uint8*)block, 
sizeof(block) / 16); - - ExpandBlock8_32(block, dst, dstpitch, pal); - - #endif } // TODO: ReadAndExpandBlock8_16 @@ -1991,8 +1702,6 @@ public: { //printf("ReadAndExpandBlock4_32\n"); - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; @@ -2050,16 +1759,6 @@ public: v3.gather64_8<>(pal, (GSVector4i*)dst); dst += dstpitch; } - - #else - - alignas(32) uint8 block[(32 / 2) * 16]; - - ReadBlock4(src, (uint8*)block, sizeof(block) / 16); - - ExpandBlock4_32(block, dst, dstpitch, pal); - - #endif } // TODO: ReadAndExpandBlock4_16 @@ -2068,8 +1767,6 @@ public: { //printf("ReadAndExpandBlock8H_32\n"); - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; @@ -2093,16 +1790,6 @@ public: dst += dstpitch; } - - #else - - alignas(32) uint32 block[8 * 8]; - - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock8H_32(block, dst, dstpitch, pal); - - #endif } // TODO: ReadAndExpandBlock8H_16 @@ -2110,9 +1797,6 @@ public: __forceinline static void ReadAndExpandBlock4HL_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { //printf("ReadAndExpandBlock4HL_32\n"); - - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; @@ -2136,16 +1820,6 @@ public: dst += dstpitch; } - - #else - - alignas(32) uint32 block[8 * 8]; - - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock4HL_32(block, dst, dstpitch, pal); - - #endif } // TODO: ReadAndExpandBlock4HL_16 @@ -2154,8 +1828,6 @@ public: { //printf("ReadAndExpandBlock4HH_32\n"); - #if _M_SSE >= 0x401 - const GSVector4i* s = (const GSVector4i*)src; GSVector4i v0, v1, v2, v3; @@ -2179,16 +1851,6 @@ public: dst += dstpitch; } - - #else - - alignas(32) uint32 block[8 * 8]; - - ReadBlock32(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock4HH_32(block, dst, dstpitch, pal); - - #endif } // TODO: ReadAndExpandBlock4HH_16 diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index ccea31e96b..cdaee1f341 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -2734,11 +2734,7 @@ __forceinline void GSState::VertexKick(uint32 skip) GSVector4i xy = v1.xxxx().u16to32().sub32(m_ofxy); -#if _M_SSE >= 0x401 GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.blend16<0xf0>(xy.sra32(4)).ps32()); -#else - GSVector4i::storel(&m_vertex.xy[xy_tail & 3], xy.upl64(xy.sra32(4).zwzw()).ps32()); -#endif m_vertex.tail = ++tail; m_vertex.xy_tail = ++xy_tail; diff --git a/plugins/GSdx/GSUtil.cpp b/plugins/GSdx/GSUtil.cpp index b5f0f06d2c..08f8a352cd 100644 --- a/plugins/GSdx/GSUtil.cpp +++ b/plugins/GSdx/GSUtil.cpp @@ -85,10 +85,6 @@ const char* GSUtil::GetLibName() "AVX", sw_sse #elif _M_SSE >= 0x401 "SSE4.1", sw_sse -#elif _M_SSE >= 0x301 - "SSSE3", sw_sse -#elif _M_SSE >= 0x200 - "SSE2", sw_sse #endif ); @@ -221,13 +217,7 @@ bool GSUtil::CheckSSE() }; ISA checks[] = { - {Xbyak::util::Cpu::tSSE2, "SSE2"}, -#if _M_SSE >= 0x301 - {Xbyak::util::Cpu::tSSSE3, "SSSE3"}, -#endif -#if _M_SSE >= 0x401 {Xbyak::util::Cpu::tSSE41, "SSE41"}, -#endif #if _M_SSE >= 0x500 {Xbyak::util::Cpu::tAVX, "AVX1"}, #endif diff --git a/plugins/GSdx/GSVector4.h b/plugins/GSdx/GSVector4.h index f97a8db817..77422cb351 100644 --- a/plugins/GSdx/GSVector4.h +++ b/plugins/GSdx/GSVector4.h @@ -250,33 +250,7 @@ public: template __forceinline GSVector4 round() const { - #if _M_SSE >= 0x401 - return GSVector4(_mm_round_ps(m, mode)); - - #else - - GSVector4 a = *this; - - GSVector4 b = (a & 
cast(GSVector4i::x80000000())) | m_x4b000000; - - b = a + b - b; - - if((mode & 7) == (Round_NegInf & 7)) - { - return b - ((a < b) & m_one); - } - - if((mode & 7) == (Round_PosInf & 7)) - { - return b + ((a > b) & m_one); - } - - ASSERT((mode & 7) == (Round_NearestInt & 7)); // other modes aren't implemented - - return b; - - #endif } __forceinline GSVector4 floor() const @@ -404,65 +378,29 @@ public: __forceinline GSVector4 hadd() const { - #if _M_SSE >= 0x300 - return GSVector4(_mm_hadd_ps(m, m)); - - #else - - return xzxz() + ywyw(); - - #endif } __forceinline GSVector4 hadd(const GSVector4& v) const { - #if _M_SSE >= 0x300 - return GSVector4(_mm_hadd_ps(m, v.m)); - - #else - - return xzxz(v) + ywyw(v); - - #endif } __forceinline GSVector4 hsub() const { - #if _M_SSE >= 0x300 - return GSVector4(_mm_hsub_ps(m, m)); - - #else - - return xzxz() - ywyw(); - - #endif } __forceinline GSVector4 hsub(const GSVector4& v) const { - #if _M_SSE >= 0x300 - return GSVector4(_mm_hsub_ps(m, v.m)); - - #else - - return xzxz(v) - ywyw(v); - - #endif } - #if _M_SSE >= 0x401 - template __forceinline GSVector4 dp(const GSVector4& v) const { return GSVector4(_mm_dp_ps(m, v.m, i)); } - #endif - __forceinline GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return GSVector4(_mm_min_ps(_mm_max_ps(m, a), b)); @@ -493,26 +431,14 @@ public: return GSVector4(_mm_max_ps(m, a)); } - #if _M_SSE >= 0x401 - template __forceinline GSVector4 blend32(const GSVector4& a) const { return GSVector4(_mm_blend_ps(m, a, mask)); } - #endif - __forceinline GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const { - #if _M_SSE >= 0x401 - return GSVector4(_mm_blendv_ps(m, a, mask)); - - #else - - return GSVector4(_mm_or_ps(_mm_andnot_ps(mask, m), _mm_and_ps(mask, a))); - - #endif } __forceinline GSVector4 upl(const GSVector4& a) const @@ -566,16 +492,10 @@ public: return _mm_testz_ps(m, m) != 0; - #elif _M_SSE >= 0x401 - __m128i a = _mm_castps_si128(m); return _mm_testz_si128(a, a) != 0; - #else - - return mask() == 0; - #endif } @@ -643,7 +563,6 @@ public: } #endif - } #ifdef __linux__ @@ -663,28 +582,12 @@ GSVector.h:2973:15: error: shadows template parm 'int i' template __forceinline int extract32() const { - #if _M_SSE >= 0x401 - return _mm_extract_ps(m, index); - - #else - - return i32[index]; - - #endif } #else template __forceinline int extract32() const { - #if _M_SSE >= 0x401 - return _mm_extract_ps(m, i); - - #else - - return i32[i]; - - #endif } #endif diff --git a/plugins/GSdx/GSVector4i.h b/plugins/GSdx/GSVector4i.h index db376cdd57..e3c6dc5808 100644 --- a/plugins/GSdx/GSVector4i.h +++ b/plugins/GSdx/GSVector4i.h @@ -229,15 +229,7 @@ public: __forceinline GSVector4i runion_ordered(const GSVector4i& a) const { - #if _M_SSE >= 0x401 - return min_i32(a).upl64(max_i32(a).srl<8>()); - - #else - - return GSVector4i(std::min(x, a.x), std::min(y, a.y), std::max(z, a.z), std::max(w, a.w)); - - #endif } __forceinline GSVector4i rintersect(const GSVector4i& a) const @@ -295,8 +287,6 @@ public: return (uint32)store(v); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const { return max_i8(a).min_i8(b); @@ -307,8 +297,6 @@ public: return max_i8(a.xyxy()).min_i8(a.zwzw()); } - #endif - __forceinline GSVector4i sat_i16(const GSVector4i& a, const GSVector4i& b) const { return max_i16(a).min_i16(b); @@ -319,8 +307,6 @@ public: return max_i16(a.xyxy()).min_i16(a.zwzw()); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i sat_i32(const GSVector4i& a, const 
GSVector4i& b) const { return max_i32(a).min_i32(b); @@ -331,34 +317,6 @@ public: return max_i32(a.xyxy()).min_i32(a.zwzw()); } - #else - - __forceinline GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const - { - GSVector4i v; - - v.x = std::min(std::max(x, a.x), b.x); - v.y = std::min(std::max(y, a.y), b.y); - v.z = std::min(std::max(z, a.z), b.z); - v.w = std::min(std::max(w, a.w), b.w); - - return v; - } - - __forceinline GSVector4i sat_i32(const GSVector4i& a) const - { - GSVector4i v; - - v.x = std::min(std::max(x, a.x), a.z); - v.y = std::min(std::max(y, a.y), a.w); - v.z = std::min(std::max(z, a.x), a.z); - v.w = std::min(std::max(w, a.y), a.w); - - return v; - } - - #endif - __forceinline GSVector4i sat_u8(const GSVector4i& a, const GSVector4i& b) const { return max_u8(a).min_u8(b); @@ -369,8 +327,6 @@ public: return max_u8(a.xyxy()).min_u8(a.zwzw()); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const { return max_u16(a).min_u16(b); @@ -381,10 +337,6 @@ public: return max_u16(a.xyxy()).min_u16(a.zwzw()); } - #endif - - #if _M_SSE >= 0x401 - __forceinline GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const { return max_u32(a).min_u32(b); @@ -395,10 +347,6 @@ public: return max_u32(a.xyxy()).min_u32(a.zwzw()); } - #endif - - #if _M_SSE >= 0x401 - __forceinline GSVector4i min_i8(const GSVector4i& a) const { return GSVector4i(_mm_min_epi8(m, a)); @@ -409,8 +357,6 @@ public: return GSVector4i(_mm_max_epi8(m, a)); } - #endif - __forceinline GSVector4i min_i16(const GSVector4i& a) const { return GSVector4i(_mm_min_epi16(m, a)); @@ -421,8 +367,6 @@ public: return GSVector4i(_mm_max_epi16(m, a)); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i min_i32(const GSVector4i& a) const { return GSVector4i(_mm_min_epi32(m, a)); @@ -433,8 +377,6 @@ public: return GSVector4i(_mm_max_epi32(m, a)); } - #endif - __forceinline GSVector4i min_u8(const GSVector4i& a) const { return GSVector4i(_mm_min_epu8(m, a)); @@ -445,8 +387,6 @@ public: return GSVector4i(_mm_max_epu8(m, a)); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i min_u16(const GSVector4i& a) const { return GSVector4i(_mm_min_epu16(m, a)); @@ -467,8 +407,6 @@ public: return GSVector4i(_mm_max_epu32(m, a)); } - #endif - __forceinline static int min_i16(int a, int b) { return store(load(a).min_i16(load(b))); @@ -481,26 +419,14 @@ public: __forceinline GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const { - #if _M_SSE >= 0x401 - return GSVector4i(_mm_blendv_epi8(m, a, mask)); - - #else - - return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a))); - - #endif } - #if _M_SSE >= 0x401 - template __forceinline GSVector4i blend16(const GSVector4i& a) const { return GSVector4i(_mm_blend_epi16(m, a, mask)); } - #endif - #if _M_SSE >= 0x501 template __forceinline GSVector4i blend32(const GSVector4i& v) const @@ -517,26 +443,14 @@ public: __forceinline GSVector4i mix16(const GSVector4i& a) const { - #if _M_SSE >= 0x401 - return blend16<0xaa>(a); - - #else - - return blend8(a, GSVector4i::xffff0000()); - - #endif } - #if _M_SSE >= 0x301 - __forceinline GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } - #endif - __forceinline GSVector4i ps16(const GSVector4i& a) const { return GSVector4i(_mm_packs_epi16(m, a)); @@ -567,8 +481,6 @@ public: return GSVector4i(_mm_packs_epi32(m, m)); } - #if _M_SSE >= 0x401 - __forceinline GSVector4i pu32(const GSVector4i& a) const { return 
GSVector4i(_mm_packus_epi32(m, a)); @@ -579,8 +491,6 @@ public: return GSVector4i(_mm_packus_epi32(m, m)); } - #endif - __forceinline GSVector4i upl8(const GSVector4i& a) const { return GSVector4i(_mm_unpacklo_epi8(m, a)); @@ -685,8 +595,6 @@ public: return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); } - #if _M_SSE >= 0x401 - // WARNING!!! // // MSVC (2008, 2010 ctp) believes that there is a "mem, reg" form of the pmovz/sx* instructions, @@ -752,50 +660,6 @@ public: return GSVector4i(_mm_cvtepu32_epi64(m)); } - #else - - __forceinline GSVector4i u8to16() const - { - return upl8(); - } - - __forceinline GSVector4i u8to32() const - { - return upl8().upl16(); - } - - __forceinline GSVector4i u8to64() const - { - return upl8().upl16().upl32(); - } - - __forceinline GSVector4i u16to32() const - { - return upl16(); - } - - __forceinline GSVector4i u16to64() const - { - return upl16().upl32(); - } - - __forceinline GSVector4i u32to64() const - { - return upl32(); - } - - __forceinline GSVector4i i8to16() const - { - return zero().upl8(*this).sra16(8); - } - - __forceinline GSVector4i i16to32() const - { - return zero().upl16(*this).sra32(16); - } - - #endif - template __forceinline GSVector4i srl() const { return GSVector4i(_mm_srli_si128(m, i)); @@ -803,20 +667,7 @@ public: template __forceinline GSVector4i srl(const GSVector4i& v) { - #if _M_SSE >= 0x301 - return GSVector4i(_mm_alignr_epi8(v.m, m, i)); - - #else - - // The `& 0xF` keeps the compiler happy on cases that won't actually be hit - if(i == 0) return *this; - else if(i < 16) return srl() | v.sll<(16 - i) & 0xF>(); - else if(i == 16) return v; - else if(i < 32) return v.srl<(i - 16) & 0xF>(); - else return zero(); - - #endif } template __forceinline GSVector4i sll() const @@ -1013,15 +864,11 @@ public: return GSVector4i(_mm_mullo_epi16(m, v.m)); } - #if _M_SSE >= 0x301 - __forceinline GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } - #endif - GSVector4i madd(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); @@ -1073,21 +920,11 @@ public: __forceinline bool eq(const GSVector4i& v) const { - #if _M_SSE >= 0x401 - // pxor, ptest, je GSVector4i t = *this ^ v; return _mm_testz_si128(t, t) != 0; - - #else - - // pcmpeqd, pmovmskb, cmp, je - - return eq32(v).alltrue(); - - #endif } __forceinline GSVector4i eq8(const GSVector4i& v) const @@ -1167,37 +1004,17 @@ public: __forceinline bool allfalse() const { - #if _M_SSE >= 0x401 - return _mm_testz_si128(m, m) != 0; - - #else - - return mask() == 0; - - #endif } - #if _M_SSE >= 0x401 - template __forceinline GSVector4i insert8(int a) const { return GSVector4i(_mm_insert_epi8(m, a, i)); } - #endif - template __forceinline int extract8() const { - #if _M_SSE >= 0x401 - return _mm_extract_epi8(m, i); - - #else - - return (int)u8[i]; - - #endif } template __forceinline GSVector4i insert16(int a) const @@ -1210,59 +1027,34 @@ public: return _mm_extract_epi16(m, i); } - #if _M_SSE >= 0x401 - template __forceinline GSVector4i insert32(int a) const { return GSVector4i(_mm_insert_epi32(m, a, i)); } - #endif - template __forceinline int extract32() const { if(i == 0) return GSVector4i::store(*this); - #if _M_SSE >= 0x401 - return _mm_extract_epi32(m, i); - - #else - - return i32[i]; - - #endif } #ifdef _M_AMD64 - #if _M_SSE >= 0x401 - template __forceinline GSVector4i insert64(int64 a) const { return GSVector4i(_mm_insert_epi64(m, a, i)); } - #endif - template __forceinline int64 extract64() const { if(i == 0) return 
GSVector4i::storeq(*this); - #if _M_SSE >= 0x401 - return _mm_extract_epi64(m, i); - - #else - - return i64[i]; - - #endif } #endif - #if _M_SSE >= 0x401 template __forceinline GSVector4i gather8_4(const T* ptr) const { @@ -1340,8 +1132,6 @@ public: return v; } - #endif - template __forceinline GSVector4i gather16_4(const T* ptr) const { GSVector4i v; @@ -1418,8 +1208,6 @@ public: return v; } - #if _M_SSE >= 0x401 - template __forceinline GSVector4i gather32_4(const T* ptr) const { GSVector4i v; @@ -1479,56 +1267,7 @@ public: return v; } - #else - - template __forceinline GSVector4i gather32_4(const T* ptr) const - { - return GSVector4i( - (int)ptr[extract8() & 0xf], - (int)ptr[extract8() >> 4], - (int)ptr[extract8() & 0xf], - (int)ptr[extract8() >> 4]); - } - - template __forceinline GSVector4i gather32_8(const T* ptr) const - { - return GSVector4i( - (int)ptr[extract8()], - (int)ptr[extract8()], - (int)ptr[extract8()], - (int)ptr[extract8()]); - } - - template __forceinline GSVector4i gather32_16(const T* ptr) const - { - return GSVector4i( - (int)ptr[extract16()], - (int)ptr[extract16()], - (int)ptr[extract16()], - (int)ptr[extract16()]); - } - - template __forceinline GSVector4i gather32_32(const T* ptr) const - { - return GSVector4i( - (int)ptr[extract32<0>()], - (int)ptr[extract32<1>()], - (int)ptr[extract32<2>()], - (int)ptr[extract32<3>()]); - } - - template __forceinline GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const - { - return GSVector4i( - (int)ptr2[ptr1[extract32<0>()]], - (int)ptr2[ptr1[extract32<1>()]], - (int)ptr2[ptr1[extract32<2>()]], - (int)ptr2[ptr1[extract32<3>()]]); - } - - #endif - - #if defined(_M_AMD64) && _M_SSE >= 0x401 + #if defined(_M_AMD64) template __forceinline GSVector4i gather64_4(const T* ptr) const { @@ -1620,8 +1359,6 @@ public: #endif - #if _M_SSE >= 0x401 - template __forceinline void gather8_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const { dst[0] = gather8_4<0>(ptr); @@ -1633,8 +1370,6 @@ public: dst[0] = gather8_8<>(ptr); } - #endif - template __forceinline void gather16_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const { dst[0] = gather16_4<0>(ptr); @@ -1742,15 +1477,7 @@ public: __forceinline static GSVector4i loadnt(const void* p) { - #if _M_SSE >= 0x401 - return GSVector4i(_mm_stream_load_si128((__m128i*)p)); - - #else - - return GSVector4i(_mm_load_si128((__m128i*)p)); - - #endif } __forceinline static GSVector4i loadl(const void* p) diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index a65963c9c5..19e9af6a1f 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -17,14 +17,6 @@ Debug x64 - - Debug SSE4 - Win32 - - - Debug SSE4 - x64 - Release AVX2 Win32 @@ -41,14 +33,6 @@ Release x64 - - Release SSE4 - Win32 - - - Release SSE4 - x64 - @@ -67,8 +51,7 @@ - - + diff --git a/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp b/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp index 4df7c0e32b..b093811c7e 100644 --- a/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp +++ b/plugins/GSdx/Renderers/Common/GSVertexTrace.cpp @@ -175,18 +175,9 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun GSVector4i cmin = GSVector4i::xffffffff(); GSVector4i cmax = GSVector4i::zero(); - #if _M_SSE >= 0x401 - GSVector4i pmin = GSVector4i::xffffffff(); GSVector4i pmax = GSVector4i::zero(); - #else - - GSVector4 pmin = s_minmax.xxxx(); - GSVector4 pmax = s_minmax.yyyy(); - - #endif - const GSVertex* RESTRICT v = (GSVertex*)vertex; for(int i = 0; i < count; i += n) @@ -233,21 
+224,10 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun GSVector4i xy = xyzf.upl16(); GSVector4i z = xyzf.yyyy(); - #if _M_SSE >= 0x401 - GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf)); pmin = pmin.min_u32(p); pmax = pmax.max_u32(p); - - #else - - GSVector4 p = GSVector4(xy.upl64(z.srl32(1).upl32(xyzf.wwww()))); - - pmin = pmin.min(p); - pmax = pmax.max(p); - - #endif } else if(primclass == GS_LINE_CLASS) { @@ -314,23 +294,11 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun GSVector4i xy1 = xyzf1.upl16(); GSVector4i z1 = xyzf1.yyyy(); - #if _M_SSE >= 0x401 - GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0)); GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); pmin = pmin.min_u32(p0.min_u32(p1)); pmax = pmax.max_u32(p0.max_u32(p1)); - - #else - - GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww()))); - GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); - - pmin = pmin.min(p0.min(p1)); - pmax = pmax.max(p0.max(p1)); - - #endif } else if(primclass == GS_TRIANGLE_CLASS) { @@ -406,25 +374,12 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun GSVector4i xy2 = xyzf2.upl16(); GSVector4i z2 = xyzf2.yyyy(); - #if _M_SSE >= 0x401 - GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0)); GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2)); pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1)); pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1)); - - #else - - GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww()))); - GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); - GSVector4 p2 = GSVector4(xy2.upl64(z2.srl32(1).upl32(xyzf2.wwww()))); - - pmin = pmin.min(p2).min(p0.min(p1)); - pmax = pmax.max(p2).max(p0.max(p1)); - - #endif } else if(primclass == GS_SPRITE_CLASS) { @@ -491,23 +446,11 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun GSVector4i xy1 = xyzf1.upl16(); GSVector4i z1 = xyzf1.yyyy(); - #if _M_SSE >= 0x401 - GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1)); GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); pmin = pmin.min_u32(p0.min_u32(p1)); pmax = pmax.max_u32(p0.max_u32(p1)); - - #else - - GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf1.wwww()))); - GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); - - pmin = pmin.min(p0.min(p1)); - pmax = pmax.max(p0.max(p1)); - - #endif } } @@ -516,13 +459,9 @@ void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int coun // be true if depth isn't constant but close enough. 
It also imply that // pmin.z & 1 == 0 and pax.z & 1 == 0 - #if _M_SSE >= 0x401 - pmin = pmin.blend16<0x30>(pmin.srl32(1)); pmax = pmax.blend16<0x30>(pmax.srl32(1)); - #endif - GSVector4 o(context->XYOFFSET); GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f); diff --git a/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp b/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp index 91726a8490..842e4e329f 100644 --- a/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp +++ b/plugins/GSdx/Renderers/DX11/GSRendererDX11.cpp @@ -944,12 +944,8 @@ void GSRendererDX11::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sou m_ps_sel.fog = 1; GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]); -#if _M_SSE >= 0x401 // Blend AREF to avoid to load a random value for alpha (dirty cache) ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF); -#else - ps_cb.FogColor_AREF = fc; -#endif } // Warning must be done after EmulateZbuffer diff --git a/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp b/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp index 6173e291d8..57657f6b1a 100644 --- a/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp +++ b/plugins/GSdx/Renderers/OpenGL/GSRendererOGL.cpp @@ -1180,12 +1180,8 @@ void GSRendererOGL::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour m_ps_sel.fog = 1; GSVector4 fc = GSVector4::rgba32(m_env.FOGCOL.u32[0]); -#if _M_SSE >= 0x401 // Blend AREF to avoid to load a random value for alpha (dirty cache) ps_cb.FogColor_AREF = fc.blend32<8>(ps_cb.FogColor_AREF); -#else - ps_cb.FogColor_AREF = fc; -#endif } // Warning must be done after EmulateZbuffer diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp index df6391f10b..1508978081 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.cpp @@ -121,18 +121,7 @@ void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& t } else { - if(m_cpu.has(util::Cpu::tSSE41)) - { - pblendw(a, b, 0xaa); - } - else - { - pcmpeqd(temp, temp); - psrld(temp, 16); - pand(a, temp); - pandn(temp, b); - por(a, temp); - } + pblendw(a, b, 0xaa); } } @@ -154,17 +143,8 @@ void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp) } else { - if(m_cpu.has(util::Cpu::tSSE41)) - { - packuswb(a, a); - pmovzxbw(a, a); - } - else - { - packuswb(a, a); - pxor(temp, temp); - punpcklbw(a, temp); - } + packuswb(a, a); + pmovzxbw(a, a); } } @@ -223,10 +203,8 @@ void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b) { if(m_cpu.has(util::Cpu::tAVX)) vpblendvb(a, a, b, xmm0); - else if(m_cpu.has(util::Cpu::tSSE41)) + else pblendvb(a, b); - else - blend(a, b, xmm0); } void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) @@ -235,15 +213,11 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a) { vpblendvb(b, a, b, xmm0); } - else if(m_cpu.has(util::Cpu::tSSE41)) + else { pblendvb(a, b); movdqa(b, a); } - else - { - blendr(b, a, xmm0); - } } void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src) diff --git a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp index 78711e48a8..ea59e11cf8 100644 --- a/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp @@ -644,20 +644,9 @@ void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2) // 
Clamp Z to ZPSM_FMT_MAX if (m_sel.zclamp) { -#if _M_SSE >= 0x401 pcmpeqd(temp1, temp1); psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); pminsd(xmm0, temp1); -#else - pcmpeqd(temp1, temp1); - psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); - pcmpgtd(temp1, xmm0); - pand(xmm0, temp1); - pcmpeqd(temp2, temp2); - pxor(temp1, temp2); - psrld(temp1, (uint8)((m_sel.zpsm & 0x3) * 8)); - por(xmm0, temp1); -#endif } if(m_sel.zwrite) @@ -1089,15 +1078,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) movdqa(xmm4, ptr[&m_local.gd->t.min]); movdqa(xmm5, ptr[&m_local.gd->t.max]); - if(m_cpu.has(util::Cpu::tSSE41)) - { - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - } - else - { - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm6, xmm0); - } + movdqa(xmm0, ptr[&m_local.gd->t.mask]); // uv0 @@ -1118,11 +1099,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) pminsw(uv0, xmm5); // clamp.blend8(repeat, m_local.gd->t.mask); - - if(m_cpu.has(util::Cpu::tSSE41)) - pblendvb(uv0, xmm1); - else - blendr(uv0, xmm1, xmm0); + pblendvb(uv0, xmm1); // uv1 @@ -1143,11 +1120,7 @@ void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1) pminsw(uv1, xmm5); // clamp.blend8(repeat, m_local.gd->t.mask); - - if(m_cpu.has(util::Cpu::tSSE41)) - pblendvb(uv1, xmm1); - else - blendr(uv1, xmm1, xmm6); + pblendvb(uv1, xmm1); } } @@ -1908,15 +1881,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) } else { - if(m_cpu.has(util::Cpu::tSSE41)) - { - movdqa(xmm0, ptr[&m_local.gd->t.mask]); - } - else - { - movdqa(xmm0, ptr[&m_local.gd->t.invmask]); - movdqa(xmm4, xmm0); - } + movdqa(xmm0, ptr[&m_local.gd->t.mask]); // uv0 @@ -1937,11 +1902,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) pminsw(uv0, xmm6); // clamp.blend8(repeat, m_local.gd->t.mask); - - if(m_cpu.has(util::Cpu::tSSE41)) - pblendvb(uv0, xmm1); - else - blendr(uv0, xmm1, xmm0); + pblendvb(uv0, xmm1); // uv1 @@ -1963,10 +1924,7 @@ void GSDrawScanlineCodeGenerator::WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1) // clamp.blend8(repeat, m_local.gd->t.mask); - if(m_cpu.has(util::Cpu::tSSE41)) - pblendvb(uv1, xmm1); - else - blendr(uv1, xmm1, xmm4); + pblendvb(uv1, xmm1); } } @@ -2435,20 +2393,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_SSE() // Clamp Z to ZPSM_FMT_MAX if (m_sel.zclamp) { -#if _M_SSE >= 0x401 pcmpeqd(xmm7, xmm7); psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); pminsd(xmm1, xmm7); -#else - static GSVector4i all_1s = GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); - pcmpeqd(xmm7, xmm7); - psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); - pcmpgtd(xmm7, xmm1); - pand(xmm1, xmm7); - pxor(xmm7, ptr[&all_1s]); - psrld(xmm7, (uint8)((m_sel.zpsm & 0x3) * 8)); - por(xmm1, xmm7); -#endif } bool fast = m_sel.ztest ? 
m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; @@ -2669,15 +2616,6 @@ void GSDrawScanlineCodeGenerator::AlphaBlend_SSE() if(m_sel.pabe) { - if(!m_cpu.has(util::Cpu::tSSE41)) - { - // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) - movdqa(xmm0, xmm4); - pslld(xmm0, 8); - psrad(xmm0, 31); - - } - psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) // ga = c[1].blend8(ga, mask).mix16(c[1]); @@ -2862,24 +2800,13 @@ void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const Reg32& ad case 0: if(i == 0) movd(dst, src); else { - if(m_cpu.has(util::Cpu::tSSE41)) { - pextrd(dst, src, i); - } else { - pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); - movd(dst, xmm0); - } - + pextrd(dst, src, i); } break; case 1: if(i == 0) movd(eax, src); else { - if(m_cpu.has(util::Cpu::tSSE41)) { - pextrd(eax, src, i); - } else { - pshufd(xmm0, src, _MM_SHUFFLE(i, i, i, i)); - movd(eax, xmm0); - } + pextrd(eax, src, i); } xor(eax, dst); and(eax, 0xffffff); @@ -2918,153 +2845,28 @@ void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) if(m_sel.mmin && !m_sel.lcm) { - if(m_cpu.has(util::Cpu::tSSE41)) + const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; + + if(pixels == 4) { + movdqa(ptr[&m_local.temp.test], xmm7); + } - const int r[] = {5, 6, 2, 4, 0, 1, 3, 7}; + for(uint8 j = 0; j < 4; j++) + { + mov(ebx, ptr[&lod_i->u32[j]]); + mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - if(pixels == 4) + for(int i = 0; i < pixels; i++) { - movdqa(ptr[&m_local.temp.test], xmm7); - } - - for(uint8 j = 0; j < 4; j++) - { - mov(ebx, ptr[&lod_i->u32[j]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - for(int i = 0; i < pixels; i++) - { - ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } - } - - if(pixels == 4) - { - movdqa(xmm5, xmm7); - movdqa(xmm7, ptr[&m_local.temp.test]); + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } } - else + + if(pixels == 4) { - - if(pixels == 4) - { - movdqa(ptr[&m_local.temp.test], xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm6, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel_SSE(xmm4, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel_SSE(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - punpckldq(xmm6, xmm1); - punpckldq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm1, xmm5, 0); - psrldq(xmm5, 4); - ReadTexel_SSE(xmm7, xmm2, 0); - psrldq(xmm2, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm5, xmm5, 0); - ReadTexel_SSE(xmm2, xmm2, 0); - - punpckldq(xmm1, xmm5); - punpckldq(xmm7, xmm2); - - punpcklqdq(xmm6, xmm1); - punpcklqdq(xmm4, xmm7); - - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm1, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel_SSE(xmm5, xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel_SSE(xmm7, xmm3, 0); - psrldq(xmm3, 4); - - punpckldq(xmm1, xmm2); - punpckldq(xmm5, xmm7); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm2, xmm0, 0); - psrldq(xmm0, 4); - ReadTexel_SSE(xmm7, 
xmm3, 0); - psrldq(xmm3, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm0, xmm0, 0); - ReadTexel_SSE(xmm3, xmm3, 0); - - punpckldq(xmm2, xmm0); - punpckldq(xmm7, xmm3); - - punpcklqdq(xmm1, xmm2); - punpcklqdq(xmm5, xmm7); - - movdqa(xmm7, ptr[&m_local.temp.test]); - } - else - { - mov(ebx, ptr[&lod_i->u32[0]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm6, xmm5, 0); - psrldq(xmm5, 4); // shuffle instead? (1 2 3 0 ~ rotation) - - mov(ebx, ptr[&lod_i->u32[1]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm1, xmm5, 0); - psrldq(xmm5, 4); - - punpckldq(xmm6, xmm1); - - mov(ebx, ptr[&lod_i->u32[2]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm1, xmm5, 0); - psrldq(xmm5, 4); - - mov(ebx, ptr[&lod_i->u32[3]]); - mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]); - - ReadTexel_SSE(xmm4, xmm5, 0); - // psrldq(xmm5, 4); - - punpckldq(xmm1, xmm4); - - punpcklqdq(xmm6, xmm1); - } - + movdqa(xmm5, xmm7); + movdqa(xmm7, ptr[&m_local.temp.test]); } } else @@ -3077,41 +2879,12 @@ void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset) const int r[] = {5, 6, 2, 4, 0, 1, 3, 5}; - if(m_cpu.has(util::Cpu::tSSE41)) + for(int i = 0; i < pixels; i++) { - for(int i = 0; i < pixels; i++) + for(uint8 j = 0; j < 4; j++) { - for(uint8 j = 0; j < 4; j++) - { - ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); - } + ReadTexel_SSE(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j); } - - } else { - const int t[] = {1, 4, 1, 5, 2, 5, 2, 0}; - - for(int i = 0; i < pixels; i++) - { - const Xmm& addr = Xmm(r[i * 2 + 0]); - const Xmm& dst = Xmm(r[i * 2 + 1]); - const Xmm& temp1 = Xmm(t[i * 2 + 0]); - const Xmm& temp2 = Xmm(t[i * 2 + 1]); - - ReadTexel_SSE(dst, addr, 0); - psrldq(addr, 4); // shuffle instead? 
(1 2 3 0 ~ rotation) - ReadTexel_SSE(temp1, addr, 0); - psrldq(addr, 4); - punpckldq(dst, temp1); - - ReadTexel_SSE(temp1, addr, 0); - psrldq(addr, 4); - ReadTexel_SSE(temp2, addr, 0); - // psrldq(addr, 4); - punpckldq(temp1, temp2); - - punpcklqdq(dst, temp1); - } - } } } diff --git a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp index fd424400c1..cb98275428 100644 --- a/plugins/GSdx/Renderers/SW/GSRendererSW.cpp +++ b/plugins/GSdx/Renderers/SW/GSRendererSW.cpp @@ -277,37 +277,17 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* GSVector4i off = (GSVector4i)m_context->XYOFFSET; GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0); - - #if _M_SSE >= 0x401 - GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8); - #else - - uint32_t z_max = 0xffffffff >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8); - - #endif - for(int i = (int)m_vertex.next; i > 0; i--, src++, dst++) { GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q - #if _M_SSE >= 0x401 - GSVector4i xyzuvf(src->m[1]); GSVector4i xy = xyzuvf.upl16() - off; GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00()); - #else - - uint32 z = src->XYZ.Z; - - GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - off; - GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later - - #endif - dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale; dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7); @@ -317,15 +297,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* { if(fst) { - #if _M_SSE >= 0x401 - t = GSVector4(xyzuvf.uph16() << (16 - 4)); - - #else - - t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4)); - - #endif } else if(q_div) { @@ -350,17 +322,8 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* if(primclass == GS_SPRITE_CLASS) { - #if _M_SSE >= 0x401 - xyzuvf = xyzuvf.min_u32(z_max); t = t.insert32<1, 3>(GSVector4::cast(xyzuvf)); - - #else - - z = std::min(z, z_max); - t = t.insert32<0, 3>(GSVector4::cast(GSVector4i::load(z))); - - #endif } dst->t = t; diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index cfcb833b55..e2e9fff37f 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -241,60 +241,39 @@ typedef int64 sint64; #define _M_SSE 0x500 #elif defined(__SSE4_1__) #define _M_SSE 0x401 -#elif defined(__SSSE3__) - #define _M_SSE 0x301 -#elif defined(__SSE2__) - #define _M_SSE 0x200 #endif #endif #if !defined(_M_SSE) && (!defined(_WIN32) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2) - #define _M_SSE 0x200 + #define _M_SSE 0x401 #endif -#if _M_SSE >= 0x200 - - #include - #include - - #ifndef _MM_DENORMALS_ARE_ZERO - #define _MM_DENORMALS_ARE_ZERO 0x0040 - #endif - - #define MXCSR (_MM_DENORMALS_ARE_ZERO | _MM_MASK_MASK | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON) - - #define _MM_TRANSPOSE4_SI128(row0, row1, row2, row3) \ - { \ - __m128 tmp0 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0x44); \ - __m128 tmp2 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0xEE); \ - __m128 tmp1 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0x44); \ - __m128 tmp3 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 
0xEE); \ - (row0) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0x88)); \ - (row1) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0xDD)); \ - (row2) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0x88)); \ - (row3) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); \ - } - -#else - -#error TODO: GSVector4 and GSRasterizer needs SSE2 +#include +#include +#ifndef _MM_DENORMALS_ARE_ZERO +#define _MM_DENORMALS_ARE_ZERO 0x0040 #endif -#if _M_SSE >= 0x301 +#define MXCSR (_MM_DENORMALS_ARE_ZERO | _MM_MASK_MASK | _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON) - #include +#define _MM_TRANSPOSE4_SI128(row0, row1, row2, row3) \ +{ \ + __m128 tmp0 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0x44); \ + __m128 tmp2 = _mm_shuffle_ps(_mm_castsi128_ps(row0), _mm_castsi128_ps(row1), 0xEE); \ + __m128 tmp1 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0x44); \ + __m128 tmp3 = _mm_shuffle_ps(_mm_castsi128_ps(row2), _mm_castsi128_ps(row3), 0xEE); \ + (row0) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0x88)); \ + (row1) = _mm_castps_si128(_mm_shuffle_ps(tmp0, tmp1, 0xDD)); \ + (row2) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0x88)); \ + (row3) = _mm_castps_si128(_mm_shuffle_ps(tmp2, tmp3, 0xDD)); \ +} -#endif - -#if _M_SSE >= 0x401 - - #include - -#endif +#include +#include #if _M_SSE >= 0x500