Linux: Straighten out or remove a few Windows/Linux differences in the code (experimental), remove some dead code, fix a mistake in the Linux version of memcpy_amd_ (still broken, though), and change optimization levels due to gcc optimization-induced errors.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@718 96395faa-99c1-11dd-bbfe-3dabce05a288
arcum42 2009-03-08 12:37:35 +00:00
parent 804050aaef
commit 25df8958b2
7 changed files with 38 additions and 69 deletions

View File

@@ -264,44 +264,16 @@ void mpeg2_idct_mmx_init (void);
void mpeg2_idct_init()
{
#if !defined(_MSC_VER) || _MSC_VER < 1400 // ignore vc2005 and beyond
int i, j;
int i, j;
/* if(hasMultimediaExtensions == 1)
{
mpeg2_idct_copy = mpeg2_idct_copy_mmx;
mpeg2_idct_add = mpeg2_idct_add_mmx;
mpeg2_idct_mmx_init ();
}else if(hasMultimediaExtensionsExt == 1)
{
mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
mpeg2_idct_add = mpeg2_idct_add_mmxext;
mpeg2_idct_mmx_init ();
}else*/
{
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
for (i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
for (i = 0; i < 64; i++) {
j = mpeg2_scan_norm[i];
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
for (i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
for (i = 0; i < 64; i++) {
j = mpeg2_scan_norm[i];
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
#else //blah vcnet2005 idiocy :D
int i,j;
mpeg2_idct_copy = mpeg2_idct_copy_c;
mpeg2_idct_add = mpeg2_idct_add_c;
for (i = -384; i < 640; i++)
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
for (i = 0; i < 64; i++) {
j = mpeg2_scan_norm[i];
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
j = mpeg2_scan_alt[i];
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
}
#endif
}
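
The scan-table remap in this hunk is the only non-obvious part of the (otherwise duplicated) setup: it rotates the three row bits and three column bits of each zig-zag index into the coefficient order the C IDCT routines expect. A minimal sketch of the transform, illustrative only and not part of the commit:

    // j = r2 r1 r0 c2 c1 c0  ->  r0 r2 r1 c0 c2 c1
    // 0x36 = 0b110110 keeps r2 r1 and c2 c1; 0x09 = 0b001001 keeps r0 and c0.
    unsigned remap(unsigned j)
    {
        return ((j & 0x36) >> 1) | ((j & 0x09) << 2);
    }
    // e.g. remap(9) == 36: 0b001001 -> 0b100100, r0 and c0 move to the
    // top of their bit triples.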

View File

@@ -187,9 +187,11 @@ void mpeg2_idct_init ();
#ifdef _MSC_VER
#define BigEndian(out, in) out = _byteswap_ulong(in)
#else
#define BigEndian(out, in) \
out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \
((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24));
#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
// No need to reimplement something already in the compiler.
//#define BigEndian(out, in) \
// out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \
// ((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24));
#endif
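
Both forms compute the same 32-bit byte swap. A standalone sketch (not from the commit, assuming GCC/Clang for __builtin_bswap32) comparing the builtin against the removed manual macro:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t bswap_manual(uint32_t in)
    {
        return (((in >> 24) & 0xFF) << 0) | (((in >> 16) & 0xFF) << 8) |
               (((in >> 8) & 0xFF) << 16) | (((in >> 0) & 0xFF) << 24);
    }

    int main(void)
    {
        uint32_t v = 0x12345678;
        // Both print 78563412; on MSVC, _byteswap_ulong(v) matches as well.
        printf("%08x %08x\n", __builtin_bswap32(v), bswap_manual(v));
        return 0;
    }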

View File

@@ -100,12 +100,6 @@ extern PatchTextTable cpuCore[];
extern IniPatch patch[ MAX_PATCH ];
extern int patchnumber;
#ifdef __LINUX__
// Nasty, currently necessary hack
extern u32 LinuxsseMXCSR;
extern u32 LinuxsseVUMXCSR;
#endif
void applypatch( int place );
void inifile_read( const char * name );
void inifile_command( char * cmd );

View File

@@ -25,10 +25,10 @@
#include "VifDma.h"
#ifdef _MSC_VER
//#ifdef _MSC_VER
#include <xmmintrin.h>
#include <emmintrin.h>
#endif
//#endif
using namespace std; // for min / max
@@ -330,9 +330,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
#endif
#ifdef _MSC_VER
//#ifdef _MSC_VER
_mm_prefetch((char*)data, _MM_HINT_NTA);
#endif
//#endif
if (VIFdmanum == 0)
{
@@ -381,9 +381,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
}
#ifdef _MSC_VER
//#ifdef _MSC_VER
_mm_prefetch((char*)data+128, _MM_HINT_NTA);
#endif
//#endif
_vifRegs = (VIFregisters*)vifRegs;
_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
_vif = vif;
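
With the _MSC_VER guards commented out, the prefetches compile on both compilers from the same <xmmintrin.h>. A minimal sketch of the pattern, assuming any SSE-capable GCC or MSVC (prefetch_block is a hypothetical name, not in the source):

    #include <xmmintrin.h> // _mm_prefetch, _MM_HINT_NTA on GCC and MSVC alike

    // Pull the unpack source toward the CPU without polluting the cache
    // hierarchy, mirroring the two calls in VIFunpack above.
    static void prefetch_block(const void* data)
    {
        _mm_prefetch((const char*)data, _MM_HINT_NTA);
        _mm_prefetch((const char*)data + 128, _MM_HINT_NTA);
    }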

View File

@@ -36,11 +36,11 @@ DEBUG_FLAGS=" -O0 -g "
fi
WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value"
NORMAL_FLAGS=" -pipe -msse -O3 "
NORMAL_FLAGS=" -pipe -msse -msse2 -O2 "
# These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off.
NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse "
DEBUG_FLAGS+=" -g -msse ${WARNING_FLAGS} "
DEBUG_FLAGS+=" -g -msse -msse2 ${WARNING_FLAGS} "
dnl Check for debug build
AC_MSG_CHECKING(debug build)

View File

@@ -19,9 +19,9 @@
#define TINY_BLOCK_COPY 64
#define IN_CACHE_COPY 2 * 1024
#define UNCACHED_COPY 4 * 1024 /
#define UNCACHED_COPY 4 * 1024
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h
#define CACHEBLOCK 80
// Fast assembly routines for x86-64
// zerofrog(@gmail.com)
@@ -40,7 +40,7 @@
#define MEMCMP_SRC2 esi
#define MEMCMP_SIZE ecx
.globl memcmp_mmx
.global memcmp_mmx
memcmp_mmx:
// make sure mmx regs are stored
// FreezeMMXRegs(1);
@@ -225,7 +225,7 @@ memcmp_End:
#define MEMXOR_SRC2 esi
#define MEMXOR_SIZE ecx
.globl memxor_mmx
.global memxor_mmx
memxor_mmx:
// make sure mmx regs are stored
// FreezeMMXRegs(1);
@@ -338,7 +338,7 @@ memxor_End:
ret
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
.globl memcpy_amd_
.global memcpy_amd_
memcpy_amd_:
push edi
push esi
@@ -356,10 +356,11 @@ memcpy_amd_:
jbe $memcpy_do_align // it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 // a trick that's faster than rep movsb...
sub eax, edi // align destination to qword
andb eax, 111 // get the low bits
and eax, 0b111 // get the low bits
sub ecx, eax // update copy count
neg eax // set up to jump into the array
add eax, offset $memcpy_align_done
@@ -427,7 +428,7 @@ $memcpy_ic_2:
mov eax, ecx // has valid low 6 bits of the byte count
$memcpy_ic_3:
shr eax, 2 // dword count
andb eax, 1111 // only look at the "remainder" bits
and eax, 0b1111 // only look at the "remainder" bits
neg eax // set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax // jump to array of movsd's
@@ -512,7 +513,7 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
$memcpy_last_few: // dword aligned from before movsd's
mov eax, ecx // has valid low 2 bits of the byte count
andb eax, 11 // the last few cows must come home
and eax, 0b11 // the last few cows must come home
jz $memcpy_final // no more, let's leave
rep movsb // the last 1, 2, or 3 bytes
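
The original `andb eax, 111` lines masked with decimal 111 rather than binary, which is the mistake the commit message refers to; `0b111`, `0b1111`, and `0b11` keep the low 3, 4, and 2 bits respectively. A C sketch of the destination-alignment step (illustrative only; align_dest is a hypothetical helper, not in the source):

    #include <stdint.h>
    #include <string.h>

    // Copy the 0-7 leading bytes needed to reach 8-byte alignment of dest,
    // mirroring the "mov eax, 8 / sub eax, edi / and eax, 0b111" sequence.
    static size_t align_dest(uint8_t** dest, const uint8_t** src, size_t n)
    {
        size_t head = (8 - ((uintptr_t)*dest & 7)) & 7; // the & 0b111 mask
        if (head > n) head = n;
        memcpy(*dest, *src, head);
        *dest += head;
        *src += head;
        return n - head; // bytes left for the bulk qword copy
    }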

View File

@@ -22,6 +22,9 @@
#include "Vif.h"
#include "VUmicro.h"
#include <xmmintrin.h>
#include <emmintrin.h>
// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
extern u32 g_vif1Masks[48], g_vif0Masks[48];
extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4];
@@ -55,10 +58,7 @@ extern u8 s_maskwrite[256];
extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0};
#if defined(_MSC_VER)
#include <xmmintrin.h>
#include <emmintrin.h>
//#if defined(_MSC_VER)
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
{
@@ -95,7 +95,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
}
#else // gcc
/*#else // gcc
// Is this really supposed to be assembly for gcc and C for Windows?
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
{
@@ -135,4 +135,4 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
FreezeXMMRegs(0);
}
#endif
#endif*/
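
Commenting out the gcc assembly branch leaves the intrinsics version of SetNewMask as the single path on both platforms, which answers the question in the comment above. If a guard is still wanted, one possibility (illustrative only; HAVE_SSE2_PATH is a made-up macro) is to key on SSE2 availability rather than on the compiler:

    #if defined(_MSC_VER) || defined(__SSE2__)
    #include <emmintrin.h> // SSE2 intrinsics are available on both compilers
    #define HAVE_SSE2_PATH 1
    #else
    #define HAVE_SSE2_PATH 0 // would fall back to a plain C implementation
    #endif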