Linux: Straighten up or remove a few Windows/Linux differences in the code (experimental), remove some dead code, fix a mistake in the Linux version of memcpy_amd_ (still broken, though), and change optimization levels due to gcc optimization-induced errors.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@718 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
arcum42 2009-03-08 12:37:35 +00:00
parent 804050aaef
commit 25df8958b2
7 changed files with 38 additions and 69 deletions

View File

@ -264,44 +264,16 @@ void mpeg2_idct_mmx_init (void);
void mpeg2_idct_init() void mpeg2_idct_init()
{ {
#if !defined(_MSC_VER) || _MSC_VER < 1400 // ignore vc2005 and beyond int i, j;
int i, j;
/* if(hasMultimediaExtensions == 1) mpeg2_idct_copy = mpeg2_idct_copy_c;
{ mpeg2_idct_add = mpeg2_idct_add_c;
mpeg2_idct_copy = mpeg2_idct_copy_mmx; for (i = -384; i < 640; i++)
mpeg2_idct_add = mpeg2_idct_add_mmx; clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
mpeg2_idct_mmx_init (); for (i = 0; i < 64; i++) {
}else if(hasMultimediaExtensionsExt == 1) j = mpeg2_scan_norm[i];
{ mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
mpeg2_idct_copy = mpeg2_idct_copy_mmxext; j = mpeg2_scan_alt[i];
mpeg2_idct_add = mpeg2_idct_add_mmxext; mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
mpeg2_idct_mmx_init ();
}else*/
{
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
for (i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
for (i = 0; i < 64; i++) {
j = mpeg2_scan_norm[i];
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
} }
#else //blah vcnet2005 idiocity :D
int i,j;
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
for (i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
for (i = 0; i < 64; i++) {
j = mpeg2_scan_norm[i];
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
#endif
} }

View File

@ -187,9 +187,11 @@ void mpeg2_idct_init ();
#ifdef _MSC_VER #ifdef _MSC_VER
#define BigEndian(out, in) out = _byteswap_ulong(in) #define BigEndian(out, in) out = _byteswap_ulong(in)
#else #else
#define BigEndian(out, in) \ #define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \ // No need to reimplement something already in the compiler.
((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24)); //#define BigEndian(out, in) \
// out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \
// ((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24));
#endif #endif

View File

@ -100,12 +100,6 @@ extern PatchTextTable cpuCore[];
extern IniPatch patch[ MAX_PATCH ]; extern IniPatch patch[ MAX_PATCH ];
extern int patchnumber; extern int patchnumber;
#ifdef __LINUX__
// Nasty, currently neccessary hack
extern u32 LinuxsseMXCSR;
extern u32 LinuxsseVUMXCSR;
#endif
void applypatch( int place ); void applypatch( int place );
void inifile_read( const char * name ); void inifile_read( const char * name );
void inifile_command( char * cmd ); void inifile_command( char * cmd );

View File

@ -25,10 +25,10 @@
#include "VifDma.h" #include "VifDma.h"
#ifdef _MSC_VER //#ifdef _MSC_VER
#include <xmmintrin.h> #include <xmmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#endif //#endif
using namespace std; // for min / max using namespace std; // for min / max
@ -330,9 +330,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
u32 memsize = VIFdmanum ? 0x4000 : 0x1000; u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
#endif #endif
#ifdef _MSC_VER //#ifdef _MSC_VER
_mm_prefetch((char*)data, _MM_HINT_NTA); _mm_prefetch((char*)data, _MM_HINT_NTA);
#endif //#endif
if (VIFdmanum == 0) if (VIFdmanum == 0)
{ {
@ -381,9 +381,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask); VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
} }
#ifdef _MSC_VER //#ifdef _MSC_VER
_mm_prefetch((char*)data+128, _MM_HINT_NTA); _mm_prefetch((char*)data+128, _MM_HINT_NTA);
#endif //#endif
_vifRegs = (VIFregisters*)vifRegs; _vifRegs = (VIFregisters*)vifRegs;
_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks; _vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
_vif = vif; _vif = vif;

View File

@ -36,11 +36,11 @@ DEBUG_FLAGS=" -O0 -g "
fi fi
WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value" WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value"
NORMAL_FLAGS=" -pipe -msse -O3 " NORMAL_FLAGS=" -pipe -msse -msse2 -O2 "
# These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off. # These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off.
NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse " NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse "
DEBUG_FLAGS+=" -g -msse ${WARNING_FLAGS} " DEBUG_FLAGS+=" -g -msse -msse2 ${WARNING_FLAGS} "
dnl Check for debug build dnl Check for debug build
AC_MSG_CHECKING(debug build) AC_MSG_CHECKING(debug build)

View File

@ -19,9 +19,9 @@
#define TINY_BLOCK_COPY 64 #define TINY_BLOCK_COPY 64
#define IN_CACHE_COPY 2 * 1024 #define IN_CACHE_COPY 2 * 1024
#define UNCACHED_COPY 4 * 1024 / #define UNCACHED_COPY 4 * 1024
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch #define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h #define CACHEBLOCK 80
// Fast assembly routines for x86-64 // Fast assembly routines for x86-64
// zerofrog(@gmail.com) // zerofrog(@gmail.com)
@ -40,7 +40,7 @@
#define MEMCMP_SRC2 esi #define MEMCMP_SRC2 esi
#define MEMCMP_SIZE ecx #define MEMCMP_SIZE ecx
.globl memcmp_mmx .global memcmp_mmx
memcmp_mmx: memcmp_mmx:
// make sure mmx regs are stored // make sure mmx regs are stored
// FreezeMMXRegs(1); // FreezeMMXRegs(1);
@ -225,7 +225,7 @@ memcmp_End:
#define MEMXOR_SRC2 esi #define MEMXOR_SRC2 esi
#define MEMXOR_SIZE ecx #define MEMXOR_SIZE ecx
.globl memxor_mmx .global memxor_mmx
memxor_mmx: memxor_mmx:
// make sure mmx regs are stored // make sure mmx regs are stored
// FreezeMMXRegs(1); // FreezeMMXRegs(1);
@ -338,7 +338,7 @@ memxor_End:
ret ret
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) // void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
.globl memcpy_amd_ .global memcpy_amd_
memcpy_amd_: memcpy_amd_:
push edi push edi
push esi push esi
@ -356,10 +356,11 @@ memcpy_amd_:
jbe $memcpy_do_align // it appears to be slower jbe $memcpy_do_align // it appears to be slower
cmp eax, 64*1024 cmp eax, 64*1024
jbe $memcpy_align_done jbe $memcpy_align_done
$memcpy_do_align: $memcpy_do_align:
mov eax, 8 // a trick that's faster than rep movsb... mov eax, 8 // a trick that's faster than rep movsb...
sub eax, edi // align destination to qword sub eax, edi // align destination to qword
andb eax, 111 // get the low bits and eax, 0b111 // get the low bits
sub ecx, eax // update copy count sub ecx, eax // update copy count
neg eax // set up to jump into the array neg eax // set up to jump into the array
add eax, offset $memcpy_align_done add eax, offset $memcpy_align_done
@ -427,7 +428,7 @@ $memcpy_ic_2:
mov eax, ecx // has valid low 6 bits of the byte count mov eax, ecx // has valid low 6 bits of the byte count
$memcpy_ic_3: $memcpy_ic_3:
shr eax, 2 // dword count shr eax, 2 // dword count
andb eax, 1111 // only look at the "remainder" bits and eax, 0b1111 // only look at the "remainder" bits
neg eax // set up to jump into the array neg eax // set up to jump into the array
add eax, offset $memcpy_last_few add eax, offset $memcpy_last_few
jmp eax // jump to array of movsd's jmp eax // jump to array of movsd's
@ -512,7 +513,7 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
$memcpy_last_few: // dword aligned from before movsd's $memcpy_last_few: // dword aligned from before movsd's
mov eax, ecx // has valid low 2 bits of the byte count mov eax, ecx // has valid low 2 bits of the byte count
andb eax, 11 // the last few cows must come home and eax, 0b11 // the last few cows must come home
jz $memcpy_final // no more, let's leave jz $memcpy_final // no more, let's leave
rep movsb // the last 1, 2, or 3 bytes rep movsb // the last 1, 2, or 3 bytes

View File

@ -22,6 +22,9 @@
#include "Vif.h" #include "Vif.h"
#include "VUmicro.h" #include "VUmicro.h"
#include <xmmintrin.h>
#include <emmintrin.h>
// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com) // sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
extern u32 g_vif1Masks[48], g_vif0Masks[48]; extern u32 g_vif1Masks[48], g_vif0Masks[48];
extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4]; extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4];
@ -55,10 +58,7 @@ extern u8 s_maskwrite[256];
extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0}; extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0};
#if defined(_MSC_VER) //#if defined(_MSC_VER)
#include <xmmintrin.h>
#include <emmintrin.h>
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask) void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
{ {
@ -95,7 +95,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
} }
#else // gcc /*#else // gcc
// Is this really supposed to be assembly for gcc and C for Windows? // Is this really supposed to be assembly for gcc and C for Windows?
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask) void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
{ {
@ -135,4 +135,4 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
FreezeXMMRegs(0); FreezeXMMRegs(0);
} }
#endif #endif*/