From df32564bef41f3763c08823f3e7cb8ef3a0ba8ee Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Mon, 27 Mar 2017 21:39:33 +0200
Subject: [PATCH] gsdx: workaround AVX2 generated code by GCC

See the comment added in the code for the full details.

v2: access the union field directly instead of going through extract32.
This gives us code that is both optimal and correct.
---
 plugins/GSdx/GSDrawScanline.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp
index a3b6158298..d266437fc4 100644
--- a/plugins/GSdx/GSDrawScanline.cpp
+++ b/plugins/GSdx/GSDrawScanline.cpp
@@ -45,7 +45,19 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
 
 	if(m_global.sel.mmin && m_global.sel.lcm)
 	{
+#if defined(__GNUC__) && _M_SSE >= 0x501
+		// GCC 4.9/5/6 don't generate correct AVX2 code for extract32<0>. The status of GCC 7 and later is unknown.
+		// The intrinsic code is _mm_cvtsi128_si32(_mm256_castsi256_si128(m)).
+		// Recent Clang versions seem to provide _mm256_cvtsi256_si32(m) instead. I don't know about GCC.
+		//
+		// The generated code keeps the integer in an XMM register, but bits [63:32] aren't cleared.
+		// So the srl16 shift count will be huge and v will be 0.
+		//
+		int lod_x = m_global.lod.i.x0;
+		GSVector4i v = m_global.t.minmax.srl16(lod_x);
+#else
 		GSVector4i v = m_global.t.minmax.srl16(m_global.lod.i.extract32<0>());//.x);
+#endif
 
 		v = v.upl16(v);