mirror of https://github.com/PCSX2/pcsx2.git
Linux: Straighten up or remove a few Windows/Linux differences in the code (experimental), remove some dead code, fix a mistake in the Linux version of memcpy_amd_ (still broken, though), and change optimization levels due to gcc optimization-induced errors.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@718 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
804050aaef
commit
25df8958b2
|
@ -264,21 +264,8 @@ void mpeg2_idct_mmx_init (void);
|
||||||
|
|
||||||
void mpeg2_idct_init()
|
void mpeg2_idct_init()
|
||||||
{
|
{
|
||||||
#if !defined(_MSC_VER) || _MSC_VER < 1400 // ignore vc2005 and beyond
|
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
/* if(hasMultimediaExtensions == 1)
|
|
||||||
{
|
|
||||||
mpeg2_idct_copy = mpeg2_idct_copy_mmx;
|
|
||||||
mpeg2_idct_add = mpeg2_idct_add_mmx;
|
|
||||||
mpeg2_idct_mmx_init ();
|
|
||||||
}else if(hasMultimediaExtensionsExt == 1)
|
|
||||||
{
|
|
||||||
mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
|
|
||||||
mpeg2_idct_add = mpeg2_idct_add_mmxext;
|
|
||||||
mpeg2_idct_mmx_init ();
|
|
||||||
}else*/
|
|
||||||
{
|
|
||||||
mpeg2_idct_copy = mpeg2_idct_copy_c;
|
mpeg2_idct_copy = mpeg2_idct_copy_c;
|
||||||
mpeg2_idct_add = mpeg2_idct_add_c;
|
mpeg2_idct_add = mpeg2_idct_add_c;
|
||||||
for (i = -384; i < 640; i++)
|
for (i = -384; i < 640; i++)
|
||||||
|
@ -290,18 +277,3 @@ void mpeg2_idct_init()
|
||||||
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else //blah vcnet2005 idiocity :D
|
|
||||||
int i,j;
|
|
||||||
mpeg2_idct_copy = mpeg2_idct_copy_c;
|
|
||||||
mpeg2_idct_add = mpeg2_idct_add_c;
|
|
||||||
for (i = -384; i < 640; i++)
|
|
||||||
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
|
|
||||||
for (i = 0; i < 64; i++) {
|
|
||||||
j = mpeg2_scan_norm[i];
|
|
||||||
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
|
||||||
j = mpeg2_scan_alt[i];
|
|
||||||
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
|
@ -187,9 +187,11 @@ void mpeg2_idct_init ();
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define BigEndian(out, in) out = _byteswap_ulong(in)
|
#define BigEndian(out, in) out = _byteswap_ulong(in)
|
||||||
#else
|
#else
|
||||||
#define BigEndian(out, in) \
|
#define BigEndian(out, in) out = __builtin_bswap32(in) // or we could use the asm function bswap...
|
||||||
out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \
|
// No need to reimplement something already in the compiler.
|
||||||
((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24));
|
//#define BigEndian(out, in) \
|
||||||
|
// out = (((((in) >> 24) & 0xFF) << 0) + ((((in) >> 16) & 0xFF) << 8) + \
|
||||||
|
// ((((in) >> 8) & 0xFF) << 16) + ((((in) >> 0) & 0xFF) << 24));
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -100,12 +100,6 @@ extern PatchTextTable cpuCore[];
|
||||||
extern IniPatch patch[ MAX_PATCH ];
|
extern IniPatch patch[ MAX_PATCH ];
|
||||||
extern int patchnumber;
|
extern int patchnumber;
|
||||||
|
|
||||||
#ifdef __LINUX__
|
|
||||||
// Nasty, currently neccessary hack
|
|
||||||
extern u32 LinuxsseMXCSR;
|
|
||||||
extern u32 LinuxsseVUMXCSR;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void applypatch( int place );
|
void applypatch( int place );
|
||||||
void inifile_read( const char * name );
|
void inifile_read( const char * name );
|
||||||
void inifile_command( char * cmd );
|
void inifile_command( char * cmd );
|
||||||
|
|
|
@ -25,10 +25,10 @@
|
||||||
|
|
||||||
#include "VifDma.h"
|
#include "VifDma.h"
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
//#ifdef _MSC_VER
|
||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
//#endif
|
||||||
|
|
||||||
using namespace std; // for min / max
|
using namespace std; // for min / max
|
||||||
|
|
||||||
|
@ -330,9 +330,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
|
||||||
u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
|
u32 memsize = VIFdmanum ? 0x4000 : 0x1000;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
//#ifdef _MSC_VER
|
||||||
_mm_prefetch((char*)data, _MM_HINT_NTA);
|
_mm_prefetch((char*)data, _MM_HINT_NTA);
|
||||||
#endif
|
//#endif
|
||||||
|
|
||||||
if (VIFdmanum == 0)
|
if (VIFdmanum == 0)
|
||||||
{
|
{
|
||||||
|
@ -381,9 +381,9 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
|
||||||
VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
|
VIFUNPACK_LOG("*PCSX2*: Unpack %x with size 0!! v->size = %d cl = %d, wl = %d, mode %d mask %x\n", v->cmd, v->size, vifRegs->cycle.cl, vifRegs->cycle.wl, vifRegs->mode, vifRegs->mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
//#ifdef _MSC_VER
|
||||||
_mm_prefetch((char*)data+128, _MM_HINT_NTA);
|
_mm_prefetch((char*)data+128, _MM_HINT_NTA);
|
||||||
#endif
|
//#endif
|
||||||
_vifRegs = (VIFregisters*)vifRegs;
|
_vifRegs = (VIFregisters*)vifRegs;
|
||||||
_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
|
_vifMaskRegs = VIFdmanum ? g_vif1Masks : g_vif0Masks;
|
||||||
_vif = vif;
|
_vif = vif;
|
||||||
|
|
|
@ -36,11 +36,11 @@ DEBUG_FLAGS=" -O0 -g "
|
||||||
fi
|
fi
|
||||||
|
|
||||||
WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value"
|
WARNING_FLAGS="-Wall -Wno-format -Wno-unused-value"
|
||||||
NORMAL_FLAGS=" -pipe -msse -O3 "
|
NORMAL_FLAGS=" -pipe -msse -msse2 -O2 "
|
||||||
# These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off.
|
# These optimizations seem to cause issues with GCC 4.3.3, so we'll turn them off.
|
||||||
NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse "
|
NORMAL_FLAGS+=" -fno-guess-branch-probability -fno-dse -fno-tree-dse "
|
||||||
|
|
||||||
DEBUG_FLAGS+=" -g -msse ${WARNING_FLAGS} "
|
DEBUG_FLAGS+=" -g -msse -msse2 ${WARNING_FLAGS} "
|
||||||
|
|
||||||
dnl Check for debug build
|
dnl Check for debug build
|
||||||
AC_MSG_CHECKING(debug build)
|
AC_MSG_CHECKING(debug build)
|
||||||
|
|
|
@ -19,9 +19,9 @@
|
||||||
|
|
||||||
#define TINY_BLOCK_COPY 64
|
#define TINY_BLOCK_COPY 64
|
||||||
#define IN_CACHE_COPY 2 * 1024
|
#define IN_CACHE_COPY 2 * 1024
|
||||||
#define UNCACHED_COPY 4 * 1024 /
|
#define UNCACHED_COPY 4 * 1024
|
||||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
||||||
#define CACHEBLOCK 80h
|
#define CACHEBLOCK 80
|
||||||
|
|
||||||
// Fast assembly routines for x86-64
|
// Fast assembly routines for x86-64
|
||||||
// zerofrog(@gmail.com)
|
// zerofrog(@gmail.com)
|
||||||
|
@ -40,7 +40,7 @@
|
||||||
#define MEMCMP_SRC2 esi
|
#define MEMCMP_SRC2 esi
|
||||||
#define MEMCMP_SIZE ecx
|
#define MEMCMP_SIZE ecx
|
||||||
|
|
||||||
.globl memcmp_mmx
|
.global memcmp_mmx
|
||||||
memcmp_mmx:
|
memcmp_mmx:
|
||||||
// make sure mmx regs are stored
|
// make sure mmx regs are stored
|
||||||
// FreezeMMXRegs(1);
|
// FreezeMMXRegs(1);
|
||||||
|
@ -225,7 +225,7 @@ memcmp_End:
|
||||||
#define MEMXOR_SRC2 esi
|
#define MEMXOR_SRC2 esi
|
||||||
#define MEMXOR_SIZE ecx
|
#define MEMXOR_SIZE ecx
|
||||||
|
|
||||||
.globl memxor_mmx
|
.global memxor_mmx
|
||||||
memxor_mmx:
|
memxor_mmx:
|
||||||
// make sure mmx regs are stored
|
// make sure mmx regs are stored
|
||||||
// FreezeMMXRegs(1);
|
// FreezeMMXRegs(1);
|
||||||
|
@ -338,7 +338,7 @@ memxor_End:
|
||||||
ret
|
ret
|
||||||
|
|
||||||
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
|
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
|
||||||
.globl memcpy_amd_
|
.global memcpy_amd_
|
||||||
memcpy_amd_:
|
memcpy_amd_:
|
||||||
push edi
|
push edi
|
||||||
push esi
|
push esi
|
||||||
|
@ -356,10 +356,11 @@ memcpy_amd_:
|
||||||
jbe $memcpy_do_align // it appears to be slower
|
jbe $memcpy_do_align // it appears to be slower
|
||||||
cmp eax, 64*1024
|
cmp eax, 64*1024
|
||||||
jbe $memcpy_align_done
|
jbe $memcpy_align_done
|
||||||
|
|
||||||
$memcpy_do_align:
|
$memcpy_do_align:
|
||||||
mov eax, 8 // a trick that's faster than rep movsb...
|
mov eax, 8 // a trick that's faster than rep movsb...
|
||||||
sub eax, edi // align destination to qword
|
sub eax, edi // align destination to qword
|
||||||
andb eax, 111 // get the low bits
|
and eax, 0b111 // get the low bits
|
||||||
sub ecx, eax // update copy count
|
sub ecx, eax // update copy count
|
||||||
neg eax // set up to jump into the array
|
neg eax // set up to jump into the array
|
||||||
add eax, offset $memcpy_align_done
|
add eax, offset $memcpy_align_done
|
||||||
|
@ -427,7 +428,7 @@ $memcpy_ic_2:
|
||||||
mov eax, ecx // has valid low 6 bits of the byte count
|
mov eax, ecx // has valid low 6 bits of the byte count
|
||||||
$memcpy_ic_3:
|
$memcpy_ic_3:
|
||||||
shr eax, 2 // dword count
|
shr eax, 2 // dword count
|
||||||
andb eax, 1111 // only look at the "remainder" bits
|
and eax, 0b1111 // only look at the "remainder" bits
|
||||||
neg eax // set up to jump into the array
|
neg eax // set up to jump into the array
|
||||||
add eax, offset $memcpy_last_few
|
add eax, offset $memcpy_last_few
|
||||||
jmp eax // jump to array of movsd's
|
jmp eax // jump to array of movsd's
|
||||||
|
@ -512,7 +513,7 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
|
||||||
|
|
||||||
$memcpy_last_few: // dword aligned from before movsd's
|
$memcpy_last_few: // dword aligned from before movsd's
|
||||||
mov eax, ecx // has valid low 2 bits of the byte count
|
mov eax, ecx // has valid low 2 bits of the byte count
|
||||||
andb eax, 11 // the last few cows must come home
|
and eax, 0b11 // the last few cows must come home
|
||||||
jz $memcpy_final // no more, let's leave
|
jz $memcpy_final // no more, let's leave
|
||||||
rep movsb // the last 1, 2, or 3 bytes
|
rep movsb // the last 1, 2, or 3 bytes
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,9 @@
|
||||||
#include "Vif.h"
|
#include "Vif.h"
|
||||||
#include "VUmicro.h"
|
#include "VUmicro.h"
|
||||||
|
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
|
// sse2 highly optimized vif (~200 separate functions are built) zerofrog(@gmail.com)
|
||||||
extern u32 g_vif1Masks[48], g_vif0Masks[48];
|
extern u32 g_vif1Masks[48], g_vif0Masks[48];
|
||||||
extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4];
|
extern u32 g_vif1HasMask3[4], g_vif0HasMask3[4];
|
||||||
|
@ -55,10 +58,7 @@ extern u8 s_maskwrite[256];
|
||||||
|
|
||||||
extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0};
|
extern "C" PCSX2_ALIGNED16(u32 s_TempDecompress[4]) = {0};
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
//#if defined(_MSC_VER)
|
||||||
|
|
||||||
#include <xmmintrin.h>
|
|
||||||
#include <emmintrin.h>
|
|
||||||
|
|
||||||
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
||||||
{
|
{
|
||||||
|
@ -95,7 +95,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#else // gcc
|
/*#else // gcc
|
||||||
// Is this really supposed to be assembly for gcc and C for Windows?
|
// Is this really supposed to be assembly for gcc and C for Windows?
|
||||||
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
||||||
{
|
{
|
||||||
|
@ -135,4 +135,4 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
||||||
FreezeXMMRegs(0);
|
FreezeXMMRegs(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif*/
|
||||||
|
|
Loading…
Reference in New Issue