gsdx: enable AVX with GCC

* Use overloaded function instead of specialized template
=> see http://stackoverflow.com/questions/3052579/explicit-specialization-in-non-namespace-scope

* Replace _mm256_slli_si128 with _mm256_slli_si256
  I hope they're equivalent. I didn't find any info on _mm256_slli_si128;
  however, srl uses _mm256_srli_si256
This commit is contained in:
Gregory Hainaut 2014-10-26 14:40:14 +01:00
parent 679fa65b84
commit f25e056914
2 changed files with 15 additions and 6 deletions

View File

@ -3810,7 +3810,8 @@ public:
template<int i> __forceinline GSVector8i sll() const template<int i> __forceinline GSVector8i sll() const
{ {
return GSVector8i(_mm256_slli_si128(m, i)); return GSVector8i(_mm256_slli_si256(m, i));
//return GSVector8i(_mm256_slli_si128(m, i));
} }
__forceinline GSVector8i sra16(int i) const __forceinline GSVector8i sra16(int i) const
@ -4260,17 +4261,17 @@ public:
return cast(v0).insert<1>(v1); return cast(v0).insert<1>(v1);
} }
template<> __forceinline GSVector8i gather32_32<uint8>(const uint8* ptr) const __forceinline GSVector8i gather32_32(const uint8* ptr) const
{ {
return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 1)) & GSVector8i::x000000ff(); return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 1)) & GSVector8i::x000000ff();
} }
template<> __forceinline GSVector8i gather32_32<uint16>(const uint16* ptr) const __forceinline GSVector8i gather32_32(const uint16* ptr) const
{ {
return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 2)) & GSVector8i::x0000ffff(); return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 2)) & GSVector8i::x0000ffff();
} }
template<> __forceinline GSVector8i gather32_32<uint32>(const uint32* ptr) const __forceinline GSVector8i gather32_32(const uint32* ptr) const
{ {
return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 4)); return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 4));
} }
@ -4296,12 +4297,12 @@ public:
return cast(v0).insert<1>(v1); return cast(v0).insert<1>(v1);
} }
template<> __forceinline GSVector8i gather32_32<uint8, uint32>(const uint8* ptr1, const uint32* ptr2) const __forceinline GSVector8i gather32_32(const uint8* ptr1, const uint32* ptr2) const
{ {
return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2); return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2);
} }
template<> __forceinline GSVector8i gather32_32<uint32, uint32>(const uint32* ptr1, const uint32* ptr2) const __forceinline GSVector8i gather32_32(const uint32* ptr1, const uint32* ptr2) const
{ {
return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2); return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2);
} }

View File

@ -263,6 +263,14 @@ struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_fr
#endif #endif
// sse // sse
#ifndef _WINDOWS
// Convert gcc SSE defines into the GSdx (windows) define
#if defined(__AVX2__)
#define _M_SSE 0x501
#elif defined(__AVX__)
#define _M_SSE 0x500
#endif
#endif
#if !defined(_M_SSE) && (!defined(_WINDOWS) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2) #if !defined(_M_SSE) && (!defined(_WINDOWS) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2)