Linux: Straighten up or remove a few Windows/Linux differences in the code (experimental), remove some dead code, fix a mistake in the Linux version of memcpy_amd_ (still broken, though), and change optimization levels due to gcc optimization-induced errors.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@718 96395faa-99c1-11dd-bbfe-3dabce05a288
2009-03-08 12:37:35 +00:00 · 2009-03-08 12:37:35 +00:00 · 25df8958b2
parent 804050aaef
commit 25df8958b2
7 changed files with 38 additions and 69 deletions
--- a/pcsx2/IPU/mpeg2lib/Idct.cpp
+++ b/pcsx2/IPU/mpeg2lib/Idct.cpp
@ -264,21 +264,8 @@ void mpeg2_idct_mmx_init (void);
 void mpeg2_idct_init()
 {
 #if !defined(_MSC_VER) || _MSC_VER < 1400 // ignore vc2005 and beyond
 	   int i, j;
 /*	if(hasMultimediaExtensions == 1)
 	{
 		mpeg2_idct_copy = mpeg2_idct_copy_mmx;
 		mpeg2_idct_add = mpeg2_idct_add_mmx;
 		mpeg2_idct_mmx_init ();
 	}else if(hasMultimediaExtensionsExt == 1)
 	{
 		mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
 		mpeg2_idct_add = mpeg2_idct_add_mmxext;
 		mpeg2_idct_mmx_init ();
 	}else*/
 	{
 	mpeg2_idct_copy = mpeg2_idct_copy_c;
 	mpeg2_idct_add = mpeg2_idct_add_c;
 	for (i = -384; i < 640; i++)
@ -290,18 +277,3 @@ void mpeg2_idct_init()
 		mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 	}
 }
 #else //blah vcnet2005 idiocity :D
 	   int i,j;
  		mpeg2_idct_copy = mpeg2_idct_copy_c;
 		mpeg2_idct_add = mpeg2_idct_add_c;
 		for (i = -384; i < 640; i++)
 			clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
 		for (i = 0; i < 64; i++) {
 			j = mpeg2_scan_norm[i];
 			mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 			j = mpeg2_scan_alt[i];
 			mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 		}
 #endif
 }
--- a/pcsx2/IPU/mpeg2lib/Mpeg.h
+++ b/pcsx2/IPU/mpeg2lib/Mpeg.h
@ -187,9 +187,11 @@ void mpeg2_idct_init ();
 #ifdef _MSC_VER
 #define BigEndian(out, in) out = _byteswap_ulong(in)
 #else
-#define BigEndian(out, in) \
+#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
-	out = (((((in) >> 24) & 0xFF) <<  0) + ((((in) >> 16) & 0xFF) <<  8) + \
+// No need to reimplement something already in the compiler. 
-		   ((((in) >>  8) & 0xFF) << 16) + ((((in) >>  0) & 0xFF) << 24));
+//#define BigEndian(out, in) \
 //	out = (((((in) >> 24) & 0xFF) <<  0) + ((((in) >> 16) & 0xFF) <<  8) + \
 //		   ((((in) >>  8) & 0xFF) << 16) + ((((in) >>  0) & 0xFF) << 24));
 #endif
--- a/pcsx2/Patch.h
+++ b/pcsx2/Patch.h
@ -100,12 +100,6 @@ extern PatchTextTable cpuCore[];
 extern IniPatch patch[ MAX_PATCH ];
 extern int patchnumber;
 #ifdef __LINUX__
 // Nasty, currently necessary hack
 extern u32 LinuxsseMXCSR;
 extern u32 LinuxsseVUMXCSR;
 #endif
 void applypatch( int place );
 void inifile_read( const char * name );
 void inifile_command( char * cmd );
--- a/pcsx2/VifDma.cpp
+++ b/pcsx2/VifDma.cpp
@ -25,10 +25,10 @@
 #include "VifDma.h" 
-#ifdef _MSC_VER
+//#ifdef _MSC_VER
 #include <xmmintrin.h>
 #include <emmintrin.h>
-#endif
+//#endif
 using namespace std;			// for min / max
@ -330,9 +330,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 	u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
 #endif
-#ifdef _MSC_VER
+//#ifdef _MSC_VER
 	_mm_prefetch((char*)data, _MM_HINT_NTA);
-#endif
+//#endif
 	if (VIFdmanum == 0) 
 	{
@ -381,9 +381,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 		VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
 	}
-#ifdef _MSC_VER
+//#ifdef _MSC_VER
 	_mm_prefetch((char*)data+128, _MM_HINT_NTA);
-#endif
+//#endif
 	_vifRegs = (VIFregisters*)vifRegs;
 	_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
 	_vif = vif;
--- a/pcsx2/configure.ac
+++ b/pcsx2/configure.ac
@ -36,11 +36,11 @@ DEBUG_FLAGS=" -O0 -g "
 fi
 WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value"
-NORMAL_FLAGS=" -pipe -msse -O3 "
+NORMAL_FLAGS=" -pipe -msse -msse2 -O2 "
 # These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off.
 NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse "
-DEBUG_FLAGS+=" -g -msse ${WARNING_FLAGS} "
+DEBUG_FLAGS+=" -g -msse -msse2 ${WARNING_FLAGS} "
 dnl Check for debug build
 AC_MSG_CHECKING(debug build)
--- a/pcsx2/x86/fast_routines.S
+++ b/pcsx2/x86/fast_routines.S
@ -19,9 +19,9 @@
 #define TINY_BLOCK_COPY 64  
 #define IN_CACHE_COPY 2 * 1024 
-#define UNCACHED_COPY 4 * 1024 /
+#define UNCACHED_COPY 4 * 1024
 #define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch 
-#define CACHEBLOCK 80h 
+#define CACHEBLOCK 80
 // Fast assembly routines for x86-64
 // zerofrog(@gmail.com)
@ -40,7 +40,7 @@
 #define MEMCMP_SRC2 esi
 #define MEMCMP_SIZE ecx
-.globl memcmp_mmx
+.global memcmp_mmx
 memcmp_mmx:
 		// make sure mmx regs are stored
 		// FreezeMMXRegs(1);
@ -225,7 +225,7 @@ memcmp_End:
 #define MEMXOR_SRC2 esi
 #define MEMXOR_SIZE ecx
-.globl memxor_mmx
+.global memxor_mmx
 memxor_mmx:
 		// make sure mmx regs are stored
 		// FreezeMMXRegs(1);
@ -338,7 +338,7 @@ memxor_End:
 	ret
 // void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
-.globl memcpy_amd_
+.global memcpy_amd_
 memcpy_amd_:
 	push    edi  
 	push    esi  
@ -356,10 +356,11 @@ memcpy_amd_:
 	jbe		$memcpy_do_align //  it appears to be slower
 	cmp		eax, 64*1024
 	jbe		$memcpy_align_done
 $memcpy_do_align:
 	mov		eax, 8			// a trick that's faster than rep movsb...
 	sub		eax, edi		// align destination to qword
-	andb	eax, 111		// get the low bits
+	and		eax, 0b111		// get the low bits
 	sub		ecx, eax		// update copy count
 	neg		eax				// set up to jump into the array
 	add		eax, offset $memcpy_align_done
@ -427,7 +428,7 @@ $memcpy_ic_2:
 	mov		eax, ecx		// has valid low 6 bits of the byte count
 $memcpy_ic_3:
 	shr		eax, 2			// dword count
-	andb	eax, 1111		// only look at the "remainder" bits
+	and		eax, 0b1111		// only look at the "remainder" bits
 	neg		eax				// set up to jump into the array
 	add		eax, offset $memcpy_last_few
 	jmp		eax				// jump to array of movsd's
@ -512,7 +513,7 @@ $memcpy_uc_1:				// 64-byte blocks, uncached copy
 $memcpy_last_few:		// dword aligned from before movsd's
 	mov		eax, ecx	// has valid low 2 bits of the byte count
-	andb	eax, 11	// the last few cows must come home
+	and		eax, 0b11	// the last few cows must come home
 	jz		$memcpy_final	// no more, let's leave
 	rep		movsb		// the last 1, 2, or 3 bytes
--- a/pcsx2/x86/iVif.cpp
+++ b/pcsx2/x86/iVif.cpp
@ -22,6 +22,9 @@
 #include "Vif.h"
 #include "VUmicro.h"
 #include <xmmintrin.h>
 #include <emmintrin.h>
 // sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
 extern u32 g_vif1Masks[48], g_vif0Masks[48];
 extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4];
@ -55,10 +58,7 @@ extern u8 s_maskwrite[256];
 extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0};
-#if defined(_MSC_VER)
+//#if defined(_MSC_VER)
 #include <xmmintrin.h>
 #include <emmintrin.h>
 void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 {
@ -95,7 +95,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 }
-#else // gcc
+/*#else // gcc
 // Is this really supposed to be assembly for gcc and C for Windows?
 void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 {
@ -135,4 +135,4 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 	FreezeXMMRegs(0);
 }
-#endif
+#endif*/