mirror of https://github.com/PCSX2/pcsx2.git
Linux: Added more correct __asm__ qualifiers and conditions; including __volatile__ on a lot of asm code (it should really be the default behavior and non-volatile the specifier, but whatever >_<), and added register clobber specifiers. Might help unbreak some of GCC 4.4's optimization problems, although VIFdma's uber-hack SSE optimization looks like a real problem.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1964 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
a8b55baa38
commit
b71b837efa
|
@ -102,7 +102,7 @@ static __forceinline void memset_8( void *dest )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case 3:
|
case 3:
|
||||||
__asm__
|
__asm__ volatile
|
||||||
(
|
(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"cld\n"
|
"cld\n"
|
||||||
|
@ -113,14 +113,14 @@ static __forceinline void memset_8( void *dest )
|
||||||
"stosd\n"
|
"stosd\n"
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
:
|
:
|
||||||
|
// Input specifiers: D - edi, a -- eax, c ecx
|
||||||
: [dest]"D"(dest), [data32]"a"(data32)
|
: [dest]"D"(dest), [data32]"a"(data32)
|
||||||
// D - edi, a -- eax, c ecx
|
: "memory"
|
||||||
:
|
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case 4:
|
case 4:
|
||||||
__asm__
|
__asm__ volatile
|
||||||
(
|
(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"cld\n"
|
"cld\n"
|
||||||
|
@ -133,13 +133,13 @@ static __forceinline void memset_8( void *dest )
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
:
|
:
|
||||||
: [dest]"D"(dest), [data32]"a"(data32)
|
: [dest]"D"(dest), [data32]"a"(data32)
|
||||||
:
|
: "memory"
|
||||||
|
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case 5:
|
case 5:
|
||||||
__asm__
|
__asm__ volatile
|
||||||
(
|
(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"cld\n"
|
"cld\n"
|
||||||
|
@ -153,13 +153,13 @@ static __forceinline void memset_8( void *dest )
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
:
|
:
|
||||||
: [dest]"D"(dest), [data32]"a"(data32)
|
: [dest]"D"(dest), [data32]"a"(data32)
|
||||||
:
|
: "memory"
|
||||||
|
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
__asm__
|
__asm__ volatile
|
||||||
(
|
(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"cld\n"
|
"cld\n"
|
||||||
|
@ -170,7 +170,7 @@ static __forceinline void memset_8( void *dest )
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
:
|
:
|
||||||
: [remdat]"c"(remdat), [dest]"D"(dest), [data32]"a"(data32)
|
: [remdat]"c"(remdat), [dest]"D"(dest), [data32]"a"(data32)
|
||||||
:
|
: "memory"
|
||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,12 +27,9 @@ __aligned16 x86CPU_INFO x86caps;
|
||||||
static s32 iCpuId( u32 cmd, u32 *regs )
|
static s32 iCpuId( u32 cmd, u32 *regs )
|
||||||
{
|
{
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__asm
|
__asm xor ecx, ecx; // ecx should be zero for CPUID(4)
|
||||||
{
|
|
||||||
xor ecx, ecx; // ecx should be zero for CPUID(4)
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
__asm__ ( "xor %ecx, %ecx" );
|
__asm__ __volatile__ ( "xor %ecx, %ecx" );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__cpuid( (int*)regs, cmd );
|
__cpuid( (int*)regs, cmd );
|
||||||
|
|
|
@ -65,7 +65,7 @@ __forceinline void FreezeMMXRegs(int save)
|
||||||
emms
|
emms
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
__asm__(
|
__asm__ volatile(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"movq [%[g_globalMMXData]+0x00], mm0\n"
|
"movq [%[g_globalMMXData]+0x00], mm0\n"
|
||||||
"movq [%[g_globalMMXData]+0x08], mm1\n"
|
"movq [%[g_globalMMXData]+0x08], mm1\n"
|
||||||
|
@ -76,7 +76,7 @@ __forceinline void FreezeMMXRegs(int save)
|
||||||
"movq [%[g_globalMMXData]+0x30], mm6\n"
|
"movq [%[g_globalMMXData]+0x30], mm6\n"
|
||||||
"movq [%[g_globalMMXData]+0x38], mm7\n"
|
"movq [%[g_globalMMXData]+0x38], mm7\n"
|
||||||
"emms\n"
|
"emms\n"
|
||||||
".att_syntax\n" : : [g_globalMMXData]"r"(g_globalMMXData)
|
".att_syntax\n" : : [g_globalMMXData]"r"(g_globalMMXData) : "memory"
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -105,7 +105,7 @@ __forceinline void FreezeMMXRegs(int save)
|
||||||
emms
|
emms
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
__asm__(
|
__asm__ volatile(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"movq mm0, [%[g_globalMMXData]+0x00]\n"
|
"movq mm0, [%[g_globalMMXData]+0x00]\n"
|
||||||
"movq mm1, [%[g_globalMMXData]+0x08]\n"
|
"movq mm1, [%[g_globalMMXData]+0x08]\n"
|
||||||
|
@ -116,7 +116,7 @@ __forceinline void FreezeMMXRegs(int save)
|
||||||
"movq mm6, [%[g_globalMMXData]+0x30]\n"
|
"movq mm6, [%[g_globalMMXData]+0x30]\n"
|
||||||
"movq mm7, [%[g_globalMMXData]+0x38]\n"
|
"movq mm7, [%[g_globalMMXData]+0x38]\n"
|
||||||
"emms\n"
|
"emms\n"
|
||||||
".att_syntax\n" : : [g_globalMMXData]"r"(g_globalMMXData)
|
".att_syntax\n" : : [g_globalMMXData]"r"(g_globalMMXData) : "memory"
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -154,7 +154,7 @@ __forceinline void FreezeXMMRegs(int save)
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
__asm__(
|
__asm__ volatile(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"movaps [%[g_globalXMMData]+0x00], xmm0\n"
|
"movaps [%[g_globalXMMData]+0x00], xmm0\n"
|
||||||
"movaps [%[g_globalXMMData]+0x10], xmm1\n"
|
"movaps [%[g_globalXMMData]+0x10], xmm1\n"
|
||||||
|
@ -164,7 +164,7 @@ __forceinline void FreezeXMMRegs(int save)
|
||||||
"movaps [%[g_globalXMMData]+0x50], xmm5\n"
|
"movaps [%[g_globalXMMData]+0x50], xmm5\n"
|
||||||
"movaps [%[g_globalXMMData]+0x60], xmm6\n"
|
"movaps [%[g_globalXMMData]+0x60], xmm6\n"
|
||||||
"movaps [%[g_globalXMMData]+0x70], xmm7\n"
|
"movaps [%[g_globalXMMData]+0x70], xmm7\n"
|
||||||
".att_syntax\n" : : [g_globalXMMData]"r"(g_globalXMMData)
|
".att_syntax\n" : : [g_globalXMMData]"r"(g_globalXMMData) : "memory"
|
||||||
);
|
);
|
||||||
|
|
||||||
#endif // _MSC_VER
|
#endif // _MSC_VER
|
||||||
|
@ -196,7 +196,7 @@ __forceinline void FreezeXMMRegs(int save)
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
__asm__(
|
__asm__ volatile(
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"movaps xmm0, [%[g_globalXMMData]+0x00]\n"
|
"movaps xmm0, [%[g_globalXMMData]+0x00]\n"
|
||||||
"movaps xmm1, [%[g_globalXMMData]+0x10]\n"
|
"movaps xmm1, [%[g_globalXMMData]+0x10]\n"
|
||||||
|
@ -206,7 +206,7 @@ __forceinline void FreezeXMMRegs(int save)
|
||||||
"movaps xmm5, [%[g_globalXMMData]+0x50]\n"
|
"movaps xmm5, [%[g_globalXMMData]+0x50]\n"
|
||||||
"movaps xmm6, [%[g_globalXMMData]+0x60]\n"
|
"movaps xmm6, [%[g_globalXMMData]+0x60]\n"
|
||||||
"movaps xmm7, [%[g_globalXMMData]+0x70]\n"
|
"movaps xmm7, [%[g_globalXMMData]+0x70]\n"
|
||||||
".att_syntax\n" : : [g_globalXMMData]"r"(g_globalXMMData)
|
".att_syntax\n" : : [g_globalXMMData]"r"(g_globalXMMData) : "memory"
|
||||||
);
|
);
|
||||||
|
|
||||||
#endif // _MSC_VER
|
#endif // _MSC_VER
|
||||||
|
|
|
@ -55,7 +55,7 @@ enum
|
||||||
BCb_COEFF = 0x40
|
BCb_COEFF = 0x40
|
||||||
};
|
};
|
||||||
|
|
||||||
static volatile const __aligned16 SSE2_Tables sse2_tables =
|
static const __aligned16 SSE2_Tables sse2_tables =
|
||||||
{
|
{
|
||||||
{0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}, // c_bias
|
{0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}, // c_bias
|
||||||
{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}, // y_bias
|
{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}, // y_bias
|
||||||
|
@ -211,8 +211,10 @@ ihatemsvc:
|
||||||
cmp esi, 64
|
cmp esi, 64
|
||||||
jne tworows
|
jne tworows
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__GNUC__)
|
#elif defined(__GNUC__)
|
||||||
__asm__(
|
|
||||||
|
__asm__ __volatile__ (
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"mov eax, 1\n"
|
"mov eax, 1\n"
|
||||||
"xor esi, esi\n"
|
"xor esi, esi\n"
|
||||||
|
@ -220,8 +222,8 @@ ihatemsvc:
|
||||||
|
|
||||||
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
|
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
|
||||||
// This saves 2-3 bytes per instruction where these are used. :)
|
// This saves 2-3 bytes per instruction where these are used. :)
|
||||||
"mov ecx, offset %c[yuv2rgb_temp]\n"
|
//"mov ecx, offset %c[yuv2rgb_temp]\n"
|
||||||
"mov edx, offset %c[sse2_tables]+64\n"
|
//"mov edx, offset %c[sse2_tables]+64\n"
|
||||||
|
|
||||||
".align 16\n"
|
".align 16\n"
|
||||||
"tworows:\n"
|
"tworows:\n"
|
||||||
|
@ -237,29 +239,29 @@ ihatemsvc:
|
||||||
// unfortunately I don't think this will matter despite being
|
// unfortunately I don't think this will matter despite being
|
||||||
// technically potentially a little faster, but this is
|
// technically potentially a little faster, but this is
|
||||||
// equivalent to an add or sub
|
// equivalent to an add or sub
|
||||||
"pxor xmm2, xmmword ptr [edx+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
|
"pxor xmm2, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
|
||||||
"pxor xmm0, xmmword ptr [edx+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
|
"pxor xmm0, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
|
||||||
|
|
||||||
"movaps xmm1, xmm0\n"
|
"movaps xmm1, xmm0\n"
|
||||||
"movaps xmm3, xmm2\n"
|
"movaps xmm3, xmm2\n"
|
||||||
"pmulhw xmm1, xmmword ptr [edx+%c[GCr_COEFF]]\n"
|
"pmulhw xmm1, xmmword ptr [%[sse2_tables]+%c[GCr_COEFF]]\n"
|
||||||
"pmulhw xmm3, xmmword ptr [edx+%c[GCb_COEFF]]\n"
|
"pmulhw xmm3, xmmword ptr [%[sse2_tables]+%c[GCb_COEFF]]\n"
|
||||||
"pmulhw xmm0, xmmword ptr [edx+%c[RCr_COEFF]]\n"
|
"pmulhw xmm0, xmmword ptr [%[sse2_tables]+%c[RCr_COEFF]]\n"
|
||||||
"pmulhw xmm2, xmmword ptr [edx+%c[BCb_COEFF]]\n"
|
"pmulhw xmm2, xmmword ptr [%[sse2_tables]+%c[BCb_COEFF]]\n"
|
||||||
"paddsw xmm1, xmm3\n"
|
"paddsw xmm1, xmm3\n"
|
||||||
// store for the next line; looking at the code above
|
// store for the next line; looking at the code above
|
||||||
// compared to the code below, I have to wonder whether
|
// compared to the code below, I have to wonder whether
|
||||||
// this was worth the hassle
|
// this was worth the hassle
|
||||||
"movaps xmmword ptr [ecx], xmm0\n"
|
"movaps xmmword ptr [%[yuv2rgb_temp]], xmm0\n"
|
||||||
"movaps xmmword ptr [ecx+16], xmm1\n"
|
"movaps xmmword ptr [%[yuv2rgb_temp]+16], xmm1\n"
|
||||||
"movaps xmmword ptr [ecx+32], xmm2\n"
|
"movaps xmmword ptr [%[yuv2rgb_temp]+32], xmm2\n"
|
||||||
"jmp ihategcctoo\n"
|
"jmp ihategcctoo\n"
|
||||||
|
|
||||||
".align 16\n"
|
".align 16\n"
|
||||||
"onerow:\n"
|
"onerow:\n"
|
||||||
"movaps xmm0, xmmword ptr [ecx]\n"
|
"movaps xmm0, xmmword ptr [%[yuv2rgb_temp]]\n"
|
||||||
"movaps xmm1, xmmword ptr [ecx+16]\n"
|
"movaps xmm1, xmmword ptr [%[yuv2rgb_temp]+16]\n"
|
||||||
"movaps xmm2, xmmword ptr [ecx+32]\n"
|
"movaps xmm2, xmmword ptr [%[yuv2rgb_temp]+32]\n"
|
||||||
|
|
||||||
"ihategcctoo:\n"
|
"ihategcctoo:\n"
|
||||||
"movaps xmm3, xmm0\n"
|
"movaps xmm3, xmm0\n"
|
||||||
|
@ -267,13 +269,13 @@ ihatemsvc:
|
||||||
"movaps xmm5, xmm2\n"
|
"movaps xmm5, xmm2\n"
|
||||||
|
|
||||||
"movaps xmm6, xmmword ptr [mb8+edi]\n"
|
"movaps xmm6, xmmword ptr [mb8+edi]\n"
|
||||||
"psubusb xmm6, xmmword ptr [edx+%c[Y_BIAS]]\n"
|
"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
|
||||||
"movaps xmm7, xmm6\n"
|
"movaps xmm7, xmm6\n"
|
||||||
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
||||||
"pand xmm7, xmmword ptr [edx+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
|
"pand xmm7, xmmword ptr [%[sse2_tables]+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
|
||||||
|
|
||||||
"pmulhuw xmm6, xmmword ptr [edx+%c[Y_COEFF]]\n"
|
"pmulhuw xmm6, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
|
||||||
"pmulhuw xmm7, xmmword ptr [edx+%c[Y_COEFF]]\n"
|
"pmulhuw xmm7, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
|
||||||
|
|
||||||
"paddsw xmm0, xmm6\n"
|
"paddsw xmm0, xmm6\n"
|
||||||
"paddsw xmm3, xmm7\n"
|
"paddsw xmm3, xmm7\n"
|
||||||
|
@ -283,7 +285,7 @@ ihatemsvc:
|
||||||
"paddsw xmm5, xmm7\n"
|
"paddsw xmm5, xmm7\n"
|
||||||
|
|
||||||
// round
|
// round
|
||||||
"movaps xmm6, xmmword ptr [edx+%c[ROUND_1BIT]]\n"
|
"movaps xmm6, xmmword ptr [%[sse2_tables]+%c[ROUND_1BIT]]\n"
|
||||||
"paddw xmm0, xmm6\n"
|
"paddw xmm0, xmm6\n"
|
||||||
"paddw xmm1, xmm6\n"
|
"paddw xmm1, xmm6\n"
|
||||||
"paddw xmm2, xmm6\n"
|
"paddw xmm2, xmm6\n"
|
||||||
|
@ -343,11 +345,11 @@ ihatemsvc:
|
||||||
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
|
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
|
||||||
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
|
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
|
||||||
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
|
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
|
||||||
[yuv2rgb_temp]"i"(yuv2rgb_temp), [sse2_tables]"i"(&sse2_tables)
|
[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(&sse2_tables)
|
||||||
:
|
: "eax", "ebx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
#error Unsupported compiler
|
# error Unsupported compiler
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
FreezeXMMRegs(0);
|
FreezeXMMRegs(0);
|
||||||
|
|
|
@ -646,6 +646,11 @@ static void VIFunpack(u32 *data, vifCode *v, unsigned int size, const unsigned i
|
||||||
int writemask;
|
int writemask;
|
||||||
u32 oldcycle = -1;
|
u32 oldcycle = -1;
|
||||||
|
|
||||||
|
// yay evil .. let's just set some XMM registers in the middle of C code
|
||||||
|
// and "hope" they get preserved, in spite of the fact that x86-32 ABI specifies
|
||||||
|
// these as "clobberable" registers (so any printf or something could decide to
|
||||||
|
// clobber them, and has every right to... >_<) --air
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
if (VIFdmanum)
|
if (VIFdmanum)
|
||||||
{
|
{
|
||||||
|
@ -658,6 +663,10 @@ static void VIFunpack(u32 *data, vifCode *v, unsigned int size, const unsigned i
|
||||||
__asm movaps XMM_COL, xmmword ptr [g_vifmask.Col0]
|
__asm movaps XMM_COL, xmmword ptr [g_vifmask.Col0]
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
// I'd add volatile to these, but what's the point? This code already breaks
|
||||||
|
// like 5000 coveted rules of binary interfacing regardless, and is only working by
|
||||||
|
// the miracles and graces of a profound deity (or maybe it doesn't -- linux port
|
||||||
|
// *does* have stability issues, especially in GCC 4.4). --air
|
||||||
if (VIFdmanum)
|
if (VIFdmanum)
|
||||||
{
|
{
|
||||||
__asm__(".intel_syntax noprefix\n"
|
__asm__(".intel_syntax noprefix\n"
|
||||||
|
|
|
@ -46,7 +46,7 @@ void SetCPUState(u32 sseMXCSR, u32 sseVUMXCSR)
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__asm ldmxcsr g_sseMXCSR; // set the new sse control
|
__asm ldmxcsr g_sseMXCSR; // set the new sse control
|
||||||
#else
|
#else
|
||||||
__asm__("ldmxcsr %[g_sseMXCSR]" : : [g_sseMXCSR]"m"(g_sseMXCSR) );
|
__asm__ __volatile__("ldmxcsr %[g_sseMXCSR]" : : [g_sseMXCSR]"m"(g_sseMXCSR) );
|
||||||
#endif
|
#endif
|
||||||
//g_sseVUMXCSR = g_sseMXCSR|0x6000;
|
//g_sseVUMXCSR = g_sseMXCSR|0x6000;
|
||||||
}
|
}
|
||||||
|
|
|
@ -657,22 +657,29 @@ static __forceinline s32 recExecuteBlock( s32 eeCycles )
|
||||||
pop ebx
|
pop ebx
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
__asm__
|
__asm__ __volatile__
|
||||||
(
|
(
|
||||||
|
// We should be able to rely on GAS syntax (the register clobber list) as a
|
||||||
|
// replacement for manual push/pop of unpreserved registers.
|
||||||
|
//
|
||||||
|
// EBP note: As I feared, EBP is "required" for C++ excepion handling in Linux, and trying
|
||||||
|
// to issue a clobber specifier for it causes an error. We really need to find a way to
|
||||||
|
// disable EBP regalloc in iCore. --air
|
||||||
|
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"push ebx\n"
|
//"push ebx\n"
|
||||||
"push esi\n"
|
//"push esi\n"
|
||||||
"push edi\n"
|
//"push edi\n"
|
||||||
"push ebp\n"
|
"push ebp\n"
|
||||||
|
|
||||||
"call iopDispatcherReg\n"
|
"call iopDispatcherReg\n"
|
||||||
|
|
||||||
"pop ebp\n"
|
"pop ebp\n"
|
||||||
"pop edi\n"
|
//"pop edi\n"
|
||||||
"pop esi\n"
|
//"pop esi\n"
|
||||||
"pop ebx\n"
|
//"pop ebx\n"
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
);
|
: : : "eax", "ebx", "ecx", "edx", "esi", "edi", "memory" );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return psxBreak + psxCycleEE;
|
return psxBreak + psxCycleEE;
|
||||||
|
@ -836,7 +843,7 @@ static void checkcodefn()
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__asm mov pctemp, eax;
|
__asm mov pctemp, eax;
|
||||||
#else
|
#else
|
||||||
__asm__("movl %%eax, %[pctemp]" : : [pctemp]"m"(pctemp) );
|
__asm__ __volatile__("movl %%eax, %[pctemp]" : [pctemp]"m="(pctemp) );
|
||||||
#endif
|
#endif
|
||||||
Console.WriteLn("iop code changed! %x", pctemp);
|
Console.WriteLn("iop code changed! %x", pctemp);
|
||||||
}
|
}
|
||||||
|
|
|
@ -104,7 +104,7 @@ static void __forceinline UseOldMaskCode(u32* &vif1masks, u32 &mask)
|
||||||
u8* p0 = (u8*)&s_maskarr[mask&15][0];
|
u8* p0 = (u8*)&s_maskarr[mask&15][0];
|
||||||
u8* p1 = (u8*)&s_maskarr[(mask>>4)&15][0];
|
u8* p1 = (u8*)&s_maskarr[(mask>>4)&15][0];
|
||||||
|
|
||||||
__asm__(".intel_syntax noprefix\n"
|
__asm__ __volatile__(".intel_syntax noprefix\n"
|
||||||
"movaps xmm0, [%0]\n"
|
"movaps xmm0, [%0]\n"
|
||||||
"movaps xmm1, [%1]\n"
|
"movaps xmm1, [%1]\n"
|
||||||
"movaps xmm2, xmm0\n"
|
"movaps xmm2, xmm0\n"
|
||||||
|
@ -121,6 +121,6 @@ static void __forceinline UseOldMaskCode(u32* &vif1masks, u32 &mask)
|
||||||
"movq [%2+40], xmm3\n"
|
"movq [%2+40], xmm3\n"
|
||||||
"movhps [%2+48], xmm2\n"
|
"movhps [%2+48], xmm2\n"
|
||||||
"movhps [%2+56], xmm3\n"
|
"movhps [%2+56], xmm3\n"
|
||||||
".att_syntax\n" : : "r"(p0), "r"(p1), "r"(vif1masks) );
|
".att_syntax\n" : : "r"(p0), "r"(p1), "r"(vif1masks) : "memory" );
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -601,22 +601,29 @@ static void recExecute()
|
||||||
|
|
||||||
#else // _MSC_VER
|
#else // _MSC_VER
|
||||||
|
|
||||||
__asm__
|
__asm__ __volatile__
|
||||||
(
|
(
|
||||||
|
// We should be able to rely on GAS syntax (the register clobber list) as a
|
||||||
|
// replacement for manual push/pop of unpreserved registers.
|
||||||
|
|
||||||
|
// EBP note: As I feared, EBP is "required" for C++ excepion handling in Linux, and trying
|
||||||
|
// to issue a clobber specifier for it causes an error. We really need to find a way to
|
||||||
|
// disable EBP regalloc in iCore. --air
|
||||||
|
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
"push ebx\n"
|
//"push ebx\n"
|
||||||
"push esi\n"
|
//"push esi\n"
|
||||||
"push edi\n"
|
//"push edi\n"
|
||||||
"push ebp\n"
|
//"push ebp\n"
|
||||||
|
|
||||||
"call DispatcherReg\n"
|
"call DispatcherReg\n"
|
||||||
|
|
||||||
"pop ebp\n"
|
//"pop ebp\n"
|
||||||
"pop edi\n"
|
//"pop edi\n"
|
||||||
"pop esi\n"
|
//"pop esi\n"
|
||||||
"pop ebx\n"
|
//"pop ebx\n"
|
||||||
".att_syntax\n"
|
".att_syntax\n"
|
||||||
);
|
: : : "eax", "ebx", "ecx", "edx", "esi", "edi", "memory" );
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
catch( Exception::ForceDispatcherReg& )
|
catch( Exception::ForceDispatcherReg& )
|
||||||
|
@ -679,21 +686,11 @@ void recClear(u32 addr, u32 size)
|
||||||
BASEBLOCKEX* pexblock;
|
BASEBLOCKEX* pexblock;
|
||||||
BASEBLOCK* pblock;
|
BASEBLOCK* pblock;
|
||||||
|
|
||||||
//why the hell?
|
|
||||||
#if 1
|
|
||||||
// necessary since recompiler doesn't call femms/emms
|
// necessary since recompiler doesn't call femms/emms
|
||||||
#ifdef __INTEL_COMPILER
|
#ifdef _MSC_VER
|
||||||
__asm__("emms");
|
asm emms;
|
||||||
#else
|
#else
|
||||||
#ifdef _MSC_VER
|
__asm__ __volatile__("emms");
|
||||||
if (x86caps.has3DNOWInstructionExtensions) __asm femms;
|
|
||||||
else __asm emms;
|
|
||||||
#else
|
|
||||||
if( x86caps.has3DNOWInstructionExtensions )__asm__("femms");
|
|
||||||
else
|
|
||||||
__asm__("emms");
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ((addr) >= maxrecmem || !(recLUT[(addr) >> 16] + (addr & ~0xFFFFUL)))
|
if ((addr) >= maxrecmem || !(recLUT[(addr) >> 16] + (addr & ~0xFFFFUL)))
|
||||||
|
|
|
@ -299,7 +299,7 @@ static s32 __forceinline GetNoiseValues()
|
||||||
#else
|
#else
|
||||||
__asm__ (
|
__asm__ (
|
||||||
".intel_syntax\n"
|
".intel_syntax\n"
|
||||||
"MOV %%eax,%0\n"
|
"MOV %%eax,%1\n"
|
||||||
"ROR %%eax,5\n"
|
"ROR %%eax,5\n"
|
||||||
"XOR %%eax,0x9a\n"
|
"XOR %%eax,0x9a\n"
|
||||||
"MOV %%ebx,%%eax\n"
|
"MOV %%ebx,%%eax\n"
|
||||||
|
@ -308,7 +308,7 @@ static s32 __forceinline GetNoiseValues()
|
||||||
"XOR %%eax,%%ebx\n"
|
"XOR %%eax,%%ebx\n"
|
||||||
"ROR %%eax,3\n"
|
"ROR %%eax,3\n"
|
||||||
"MOV %0,%%eax\n"
|
"MOV %0,%%eax\n"
|
||||||
".att_syntax\n" : :"r"(Seed));
|
".att_syntax\n" : "m="(Seed) : "m"(Seed));
|
||||||
#endif
|
#endif
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
|
@ -207,7 +207,7 @@ static void __forceinline GetNoiseValues(s32& VD)
|
||||||
#else
|
#else
|
||||||
__asm__ (
|
__asm__ (
|
||||||
".intel_syntax\n"
|
".intel_syntax\n"
|
||||||
"MOV %%eax,%0\n"
|
"MOV %%eax,%1\n"
|
||||||
"ROR %%eax,5\n"
|
"ROR %%eax,5\n"
|
||||||
"XOR %%eax,0x9a\n"
|
"XOR %%eax,0x9a\n"
|
||||||
"MOV %%ebx,%%eax\n"
|
"MOV %%ebx,%%eax\n"
|
||||||
|
@ -216,7 +216,7 @@ static void __forceinline GetNoiseValues(s32& VD)
|
||||||
"XOR %%eax,%%ebx\n"
|
"XOR %%eax,%%ebx\n"
|
||||||
"ROR %%eax,3\n"
|
"ROR %%eax,3\n"
|
||||||
"MOV %0,%%eax\n"
|
"MOV %0,%%eax\n"
|
||||||
".att_syntax\n" : :"r"(Seed));
|
".att_syntax\n" : "r="(Seed) :"r"(Seed));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue