Merge pull request #333 from PCSX2/linux-avx

Support AVX builds on Linux
2014-11-08 14:09:06 +01:00 · 2014-11-08 14:09:06 +01:00 · b7e5e41afe
parent a908d1ab93 f25e056914
commit b7e5e41afe
5 changed files with 32 additions and 8 deletions
--- a/build.sh
+++ b/build.sh
@ -37,6 +37,7 @@ for ARG in "$@"; do
        --wx28        ) flags+=(-DWX28_API=TRUE) ;;
        --wx30        ) flags+=(-DWX28_API=FALSE) ;;
        --64-bit-dont-work ) flags+=(-D64BIT_BUILD_DONT_WORK=TRUE) ;;
+        --no-simd    )  flags+=(-DDISABLE_ADVANCE_SIMD=TRUE) ;;

        *)
            # Unknown option
@ -58,6 +59,7 @@ for ARG in "$@"; do
            echo "--gles          : Replace openGL backend of GSdx by openGLES3"
            echo
            echo "--64-bit-dont-work : Don't use it!"
+            echo "--no-simd       : Only allow sse2"
            exit 1
    esac
 done
--- a/cmake/BuildParameters.cmake
+++ b/cmake/BuildParameters.cmake
@ -64,6 +64,7 @@ option(USE_ASAN "Enable address sanitizer")
 # Select the architecture
 #-------------------------------------------------------------------------------
 option(64BIT_BUILD_DONT_WORK "Enable a x86_64 build instead of cross compiling (WARNING: NOTHING WORK)" OFF)
+option(DISABLE_ADVANCE_SIMD "Disable advance use of SIMD (SSE2+ & AVX)" OFF)

 # Architecture bitness detection
 if(CMAKE_SIZEOF_VOID_P EQUAL 8)
@ -99,7 +100,11 @@ if(_ARCH_64 AND 64BIT_BUILD_DONT_WORK)
    # x86_64 requires -fPIC
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)

-    set(ARCH_FLAG "-m64 -msse -msse2")
+    if (DISABLE_ADVANCE_SIMD)
+        set(ARCH_FLAG "-m64 -msse -msse2")
+    else()
+        set(ARCH_FLAG "-m64 -march=native -fabi-version=6")
+    endif()
    add_definitions(-D_ARCH_64=1 -D_M_X86=1 -D_M_X86_64=1)
    set(_ARCH_64 1)
    set(_M_X86 1)
@ -127,7 +132,13 @@ else()
    #     - Only plugins. No package will link to them.
    set(CMAKE_POSITION_INDEPENDENT_CODE OFF)

-    set(ARCH_FLAG "-m32 -msse -msse2 -march=i686")
+    if (DISABLE_ADVANCE_SIMD)
+        set(ARCH_FLAG "-m32 -msse -msse2 -march=i686")
+    else()
+        # AVX needs ABI (name-mangling) fixes that the default -fabi-version (2) lacks
+        # Note: -fabi-version=6 requires GCC 4.7
+        set(ARCH_FLAG "-m32 -march=native -fabi-version=6")
+    endif()
    add_definitions(-D_ARCH_32=1 -D_M_X86=1 -D_M_X86_32=1)
    set(_ARCH_32 1)
    set(_M_X86 1)
--- a/plugins/GSdx/CMakeLists.txt
+++ b/plugins/GSdx/CMakeLists.txt
@ -92,6 +92,7 @@ set(GSdxSources
    GSDrawScanline.cpp
    GSDrawScanlineCodeGenerator.cpp
    GSDrawScanlineCodeGenerator.x86.avx.cpp
+    GSDrawScanlineCodeGenerator.x86.avx2.cpp
    GSDrawScanlineCodeGenerator.x64.cpp
    GSDrawScanlineCodeGenerator.x86.cpp
    GSDrawScanlineCodeGenerator.x64.avx.cpp
@ -109,6 +110,7 @@ set(GSdxSources
    GSSetting.cpp
    GSSetupPrimCodeGenerator.cpp
    GSSetupPrimCodeGenerator.x86.avx.cpp
+    GSSetupPrimCodeGenerator.x86.avx2.cpp
    GSSetupPrimCodeGenerator.x64.avx.cpp
    GSSetupPrimCodeGenerator.x86.cpp
    GSSetupPrimCodeGenerator.x64.cpp
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
@ -3810,7 +3810,8 @@ public:

 	template<int i> __forceinline GSVector8i sll() const
 	{
-		return GSVector8i(_mm256_slli_si128(m, i));
+		return GSVector8i(_mm256_slli_si256(m, i));
+		//return GSVector8i(_mm256_slli_si128(m, i));
 	}

 	__forceinline GSVector8i sra16(int i) const
@ -4260,17 +4261,17 @@ public:
 		return cast(v0).insert<1>(v1);
 	}

-	template<> __forceinline GSVector8i gather32_32<uint8>(const uint8* ptr) const
+	__forceinline GSVector8i gather32_32(const uint8* ptr) const
 	{
 		return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 1)) & GSVector8i::x000000ff();
 	}

-	template<> __forceinline GSVector8i gather32_32<uint16>(const uint16* ptr) const
+	__forceinline GSVector8i gather32_32(const uint16* ptr) const
 	{
 		return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 2)) & GSVector8i::x0000ffff();
 	}

-	template<> __forceinline GSVector8i gather32_32<uint32>(const uint32* ptr) const
+	__forceinline GSVector8i gather32_32(const uint32* ptr) const
 	{
 		return GSVector8i(_mm256_i32gather_epi32((const int*)ptr, m, 4));
 	}
@ -4296,12 +4297,12 @@ public:
 		return cast(v0).insert<1>(v1);
 	}

-	template<> __forceinline GSVector8i gather32_32<uint8, uint32>(const uint8* ptr1, const uint32* ptr2) const
+	__forceinline GSVector8i gather32_32(const uint8* ptr1, const uint32* ptr2) const
 	{
 		return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2);
 	}

-	template<> __forceinline GSVector8i gather32_32<uint32, uint32>(const uint32* ptr1, const uint32* ptr2) const
+	__forceinline GSVector8i gather32_32(const uint32* ptr1, const uint32* ptr2) const
 	{
 		return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2);
 	}
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@ -263,6 +263,14 @@ struct aligned_free_second {template<class T> void operator()(T& p) {_aligned_fr
 #endif

 // sse
+#ifndef _WINDOWS
+// Convert GCC's SSE/AVX defines (__AVX__, __AVX2__) into the GSdx (Windows) _M_SSE define
+#if defined(__AVX2__)
+	#define _M_SSE 0x501
+#elif defined(__AVX__)
+	#define _M_SSE 0x500
+#endif
+#endif

 #if !defined(_M_SSE) && (!defined(_WINDOWS) || defined(_M_AMD64) || defined(_M_IX86_FP) && _M_IX86_FP >= 2)