Remove lots of evil %'s.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@693 96395faa-99c1-11dd-bbfe-3dabce05a288
arcum42 2009-03-05 21:35:26 +00:00
parent 8614bbd0f8
commit 3ae6ff0856
13 changed files with 479 additions and 477 deletions
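
The change is mechanical: every block of Intel-syntax assembly (inline __asm__ statements and the standalone .S files) now uses ".intel_syntax noprefix", so registers can be written bare instead of carrying AT&T-style % prefixes (which have to be doubled to %% inside an extended-asm template). A minimal sketch of the before/after shape, not taken from any one file in this commit:

/* Sketch only: the same no-op register write spelled both ways in GCC
 * extended asm, x86 only. */
static inline void intel_prefix_style(void)
{
    /* Before: ".intel_syntax" without "noprefix" still expects %-prefixed
     * registers, and '%' must be doubled inside the asm template. */
    __asm__ (".intel_syntax\n"
             "mov %%eax, 0\n"
             ".att_syntax\n" : : : "eax");
}

static inline void intel_noprefix_style(void)
{
    /* After: "noprefix" lets register names be written bare. */
    __asm__ (".intel_syntax noprefix\n"
             "mov eax, 0\n"
             ".att_syntax\n" : : : "eax");
}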

View File

@@ -231,7 +231,9 @@ public:
 #ifndef __LINUX__
 __asm mov eax, SendSimplePacket
 #else
-__asm ("mov %eax, SendSimplePacket");
+__asm__ (".intel_syntax noprefix\n"
+"mov eax, SendSimplePacket\n"
+".att_syntax\n");
 #endif
 //return (uptr)&SendSimplePacket;
 }

View File

@@ -1,75 +1,75 @@
-.intel_syntax
+.intel_syntax noprefix
 .extern g_pCurrentRoutine
 .globl so_call
 so_call:
-mov %eax, dword ptr [%esp+4]
-test dword ptr [%eax+24], 1
+mov eax, dword ptr [esp+4]
+test dword ptr [eax+24], 1
 jnz RestoreRegs
-mov [%eax+8], %ebx
-mov [%eax+12], %esi
-mov [%eax+16], %edi
-mov [%eax+20], %ebp
-mov dword ptr [%eax+24], 1
+mov [eax+8], ebx
+mov [eax+12], esi
+mov [eax+16], edi
+mov [eax+20], ebp
+mov dword ptr [eax+24], 1
 jmp CallFn
 RestoreRegs:
 // have to load and save at the same time
-mov %ecx, [%eax+8]
-mov %edx, [%eax+12]
-mov [%eax+8], %ebx
-mov [%eax+12], %esi
-mov %ebx, %ecx
-mov %esi, %edx
-mov %ecx, [%eax+16]
-mov %edx, [%eax+20]
-mov [%eax+16], %edi
-mov [%eax+20], %ebp
-mov %edi, %ecx
-mov %ebp, %edx
+mov ecx, [eax+8]
+mov edx, [eax+12]
+mov [eax+8], ebx
+mov [eax+12], esi
+mov ebx, ecx
+mov esi, edx
+mov ecx, [eax+16]
+mov edx, [eax+20]
+mov [eax+16], edi
+mov [eax+20], ebp
+mov edi, ecx
+mov ebp, edx
 CallFn:
-mov [g_pCurrentRoutine], %eax
-mov %ecx, %esp
-mov %esp, [%eax+4]
-mov [%eax+4], %ecx
-jmp dword ptr [%eax]
+mov [g_pCurrentRoutine], eax
+mov ecx, esp
+mov esp, [eax+4]
+mov [eax+4], ecx
+jmp dword ptr [eax]
 .globl so_resume
 so_resume:
-mov %eax, [g_pCurrentRoutine]
-mov %ecx, [%eax+8]
-mov %edx, [%eax+12]
-mov [%eax+8], %ebx
-mov [%eax+12], %esi
-mov %ebx, %ecx
-mov %esi, %edx
-mov %ecx, [%eax+16]
-mov %edx, [%eax+20]
-mov [%eax+16], %edi
-mov [%eax+20], %ebp
-mov %edi, %ecx
-mov %ebp, %edx
+mov eax, [g_pCurrentRoutine]
+mov ecx, [eax+8]
+mov edx, [eax+12]
+mov [eax+8], ebx
+mov [eax+12], esi
+mov ebx, ecx
+mov esi, edx
+mov ecx, [eax+16]
+mov edx, [eax+20]
+mov [eax+16], edi
+mov [eax+20], ebp
+mov edi, ecx
+mov ebp, edx
 // put the return address in pcalladdr
-mov %ecx, [%esp]
-mov [%eax], %ecx
-add %esp, 4 // remove the return address
+mov ecx, [esp]
+mov [eax], ecx
+add esp, 4 // remove the return address
 // swap stack pointers
-mov %ecx, [%eax+4]
-mov [%eax+4], %esp
-mov %esp, %ecx
+mov ecx, [eax+4]
+mov [eax+4], esp
+mov esp, ecx
 ret
 .globl so_exit
 so_exit:
-mov %eax, [g_pCurrentRoutine]
-mov %esp, [%eax+4]
-mov %ebx, [%eax+8]
-mov %esi, [%eax+12]
-mov %edi, [%eax+16]
-mov %ebp, [%eax+20]
+mov eax, [g_pCurrentRoutine]
+mov esp, [eax+4]
+mov ebx, [eax+8]
+mov esi, [eax+12]
+mov edi, [eax+16]
+mov ebp, [eax+20]
 ret
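
For readers following the offsets above: so_call, so_resume, and so_exit switch between coroutines by swapping the stack pointer and callee-saved registers through a small per-routine block. A hypothetical 32-bit layout consistent with the offsets used (only pcalladdr is named in the comments; the other field names are invented):

#include <stdint.h>

/* Hypothetical 32-bit layout matching the [eax+N] offsets above. */
typedef struct {
    void    *pcalladdr;          /* +0  address to jmp to / resume point   */
    void    *sp;                 /* +4  stack pointer swapped on each call */
    uint32_t ebx, esi, edi, ebp; /* +8..+20 saved callee registers         */
    uint32_t started;            /* +24 set to 1 once the routine has run  */
} coroutine_block;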

View File

@@ -129,15 +129,15 @@ namespace Threading
 if( true ) //isMultiCore )
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"lock xadd [%0], %%eax\n"
+".intel_syntax noprefix\n"
+"lock xadd [%0], eax\n"
 ".att_syntax\n" : : "r"(Addend), "a"(Value) : "memory");
 }
 else
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"xadd [%0], %%eax\n"
+".intel_syntax noprefix\n"
+"xadd [%0], eax\n"
 ".att_syntax\n" : : "r"(Addend), "a"(Value) : "memory");
 }
 }
@@ -175,8 +175,8 @@ namespace Threading
 __forceinline void pcsx2_InterlockedExchange64(volatile s64* Target, s64 Value)
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"lock xchg [%0], %%rax\n"
+".intel_syntax noprefix\n"
+"lock xchg [%0], rax\n"
 ".att_syntax\n" : : "r"(Target), "a"(Value) : "memory"
 );
 return 0;
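
The interlocked helpers above keep their hand-written lock xadd / lock xchg; the commit only changes how the registers are spelled. For comparison only (not what the code does), the same add could be expressed with GCC's __atomic builtins:

#include <stdint.h>

/* Comparison sketch, not part of the commit: builtin equivalent of the
 * lock xadd above, with the old value discarded. */
static inline void interlocked_add_sketch(volatile int32_t *addend, int32_t value)
{
    __atomic_fetch_add(addend, value, __ATOMIC_SEQ_CST);
}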

View File

@@ -107,10 +107,10 @@ static __forceinline void memset_8( void *dest )
 case 3:
 __asm__
 (
-".intel_syntax\n"
+".intel_syntax noprefix\n"
 "cld\n"
-// "mov %edi, %0\n"
-// "mov %eax, %1\n"
+// "mov edi, %0\n"
+// "mov eax, %1\n"
 "stosd\n"
 "stosd\n"
 "stosd\n"
@@ -125,10 +125,10 @@ static __forceinline void memset_8( void *dest )
 case 4:
 __asm__
 (
-".intel_syntax\n"
+".intel_syntax noprefix\n"
 "cld\n"
-// "mov %edi, %0\n"
-// "mov %eax, %1\n"
+// "mov edi, %0\n"
+// "mov eax, %1\n"
 "stosd\n"
 "stosd\n"
 "stosd\n"
@@ -144,10 +144,10 @@ static __forceinline void memset_8( void *dest )
 case 5:
 __asm__
 (
-".intel_syntax\n"
+".intel_syntax noprefix\n"
 "cld\n"
-// "mov %edi, %0\n"
-// "mov %eax, %1\n"
+// "mov edi, %0\n"
+// "mov eax, %1\n"
 "stosd\n"
 "stosd\n"
 "stosd\n"
@@ -164,7 +164,7 @@ static __forceinline void memset_8( void *dest )
 default:
 __asm__
 (
-".intel_syntax\n"
+".intel_syntax noprefix\n"
 "cld\n"
 // "mov ecx, %0\n"
 // "mov edi, %1\n"

View File

@@ -504,15 +504,15 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 }
 #else
 if( VIFdmanum ) {
-__asm__(".intel_syntax\n"
-"movaps %%xmm6, xmmword ptr [%0]\n"
-"movaps %%xmm7, xmmword ptr [%1]\n"
+__asm__(".intel_syntax noprefix\n"
+"movaps xmm6, xmmword ptr [%0]\n"
+"movaps xmm7, xmmword ptr [%1]\n"
 ".att_syntax\n" : :"r"(g_vifRow1), "r"(g_vifCol1) );
 }
 else {
-__asm__(".intel_syntax\n"
-"movaps %%xmm6, xmmword ptr [%0]\n"
-"movaps %%xmm7, xmmword ptr [%1]\n"
+__asm__(".intel_syntax noprefix\n"
+"movaps xmm6, xmmword ptr [%0]\n"
+"movaps xmm7, xmmword ptr [%1]\n"
 ".att_syntax\n" : : "r"(g_vifRow0), "r"(g_vifCol0) );
 }
 #endif

View File

@@ -1,6 +1,6 @@
 // microVU.cpp assembly routines
 // arcum42(@gmail.com)
-.intel_syntax
+.intel_syntax noprefix
 .extern mVUexecuteVU0
 .extern mVUexecuteVU1
@@ -19,14 +19,14 @@ startVU0:
 call mVUexecuteVU0
 // backup cpu state
-push %ebx
-push %ebp
-push %esi
-push %edi
+push ebx
+push ebp
+push esi
+push edi
 ldmxcsr g_sseVUMXCSR
 // Should set xmmZ?
-jmp %eax
+jmp eax
 // Runs VU1 for number of cycles
 // void __fastcall startVU1(u32 startPC, u32 cycles)
@@ -35,14 +35,14 @@ startVU01:
 call mVUexecuteVU1
 // backup cpu state
-push %ebx
-push %ebp
-push %esi
-push %edi
+push ebx
+push ebp
+push esi
+push edi
 ldmxcsr g_sseVUMXCSR
-jmp %eax
+jmp eax
 // Exit point
 // void __fastcall endVU0(u32 startPC, u32 cycles)
@@ -51,10 +51,10 @@ endVU0:
 //call mVUcleanUpVU0
 /*restore cpu state*/
-pop %edi;
-pop %esi;
-pop %ebp;
-pop %ebx;
+pop edi;
+pop esi;
+pop ebp;
+pop ebx;
 ldmxcsr g_sseMXCSR

View File

@@ -1,6 +1,6 @@
 // iVUzerorec.cpp assembly routines
 // zerofrog(@gmail.com)
-.intel_syntax
+.intel_syntax noprefix
 .extern svudispfntemp
 .extern s_TotalVUCycles
@@ -17,35 +17,35 @@
 // SuperVUExecuteProgram(u32 startpc, int vuindex)
 .globl SuperVUExecuteProgram
 SuperVUExecuteProgram:
-mov %eax, [%esp]
+mov eax, [esp]
 mov dword ptr s_TotalVUCycles, 0
-add %esp, 4
-mov dword ptr [s_callstack], %eax
+add esp, 4
+mov dword ptr [s_callstack], eax
 call SuperVUGetProgram
-mov s_vu1ebp, %ebp
-mov s_vu1esi, %esi
-mov s_vuedi, %edi
-mov s_vuebx, %ebx
+mov s_vu1ebp, ebp
+mov s_vu1esi, esi
+mov s_vuedi, edi
+mov s_vuebx, ebx
 #ifdef _DEBUG
-mov s_vu1esp, %esp
+mov s_vu1esp, esp
 #endif
 ldmxcsr g_sseVUMXCSR
 mov dword ptr s_writeQ, 0xffffffff
 mov dword ptr s_writeP, 0xffffffff
-jmp %eax
+jmp eax
 .globl SuperVUEndProgram
 SuperVUEndProgram:
 // restore cpu state
 ldmxcsr g_sseMXCSR
-mov %ebp, s_vu1ebp
-mov %esi, s_vu1esi
-mov %edi, s_vuedi
-mov %ebx, s_vuebx
+mov ebp, s_vu1ebp
+mov esi, s_vu1esi
+mov edi, s_vuedi
+mov ebx, s_vuebx
 #ifdef _DEBUG
-sub s_vu1esp, %esp
+sub s_vu1esp, esp
 #endif
 call SuperVUCleanupProgram
@@ -54,20 +54,20 @@ SuperVUEndProgram:
 .globl svudispfn
 svudispfn:
-mov [g_curdebugvu], %eax
-mov s_saveecx, %ecx
-mov s_saveedx, %edx
-mov s_saveebx, %ebx
-mov s_saveesi, %esi
-mov s_saveedi, %edi
-mov s_saveebp, %ebp
+mov [g_curdebugvu], eax
+mov s_saveecx, ecx
+mov s_saveedx, edx
+mov s_saveebx, ebx
+mov s_saveesi, esi
+mov s_saveedi, edi
+mov s_saveebp, ebp
 call svudispfntemp
-mov %ecx, s_saveecx
-mov %edx, s_saveedx
-mov %ebx, s_saveebx
-mov %esi, s_saveesi
-mov %edi, s_saveedi
-mov %ebp, s_saveebp
+mov ecx, s_saveecx
+mov edx, s_saveedx
+mov ebx, s_saveebx
+mov esi, s_saveesi
+mov edi, s_saveedi
+mov ebp, s_saveebp
 ret

View File

@@ -16,29 +16,29 @@
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
-.intel_syntax
+.intel_syntax noprefix
 .extern _vifRegs
 .extern _vifMaskRegs
 .extern _vifRow
-#define VIF_ESP %esp
-#define VIF_SRC %esi
-#define VIF_INC %ecx
-#define VIF_DST %edi
-#define VIF_SIZE %edx
-#define VIF_TMPADDR %eax
-#define VIF_SAVEEBX %ebx
-#define VIF_SAVEEBXd %ebx
-#define XMM_R0 %xmm0
-#define XMM_R1 %xmm1
-#define XMM_R2 %xmm2
-#define XMM_WRITEMASK %xmm3
-#define XMM_ROWMASK %xmm4
-#define XMM_ROWCOLMASK %xmm5
-#define XMM_ROW %xmm6
-#define XMM_COL %xmm7
+#define VIF_ESP esp
+#define VIF_SRC esi
+#define VIF_INC ecx
+#define VIF_DST edi
+#define VIF_SIZE edx
+#define VIF_TMPADDR eax
+#define VIF_SAVEEBX ebx
+#define VIF_SAVEEBXd ebx
+#define XMM_R0 xmm0
+#define XMM_R1 xmm1
+#define XMM_R2 xmm2
+#define XMM_WRITEMASK xmm3
+#define XMM_ROWMASK xmm4
+#define XMM_ROWCOLMASK xmm5
+#define XMM_ROW xmm6
+#define XMM_COL xmm7
 #define XMM_R3 XMM_COL
@@ -1189,35 +1189,35 @@
 .extern s_TempDecompress
 #define DECOMPRESS_RGBA(OFFSET) \
-mov %bl, %al; \
-shl %bl, 3; \
-mov byte ptr [s_TempDecompress+OFFSET], %bl; \
+mov bl, al; \
+shl bl, 3; \
+mov byte ptr [s_TempDecompress+OFFSET], bl; \
 \
-mov %bx, %ax; \
-shr %bx, 2; \
-and %bx, 0xf8; \
-mov byte ptr [s_TempDecompress+OFFSET+1], %bl; \
+mov bx, ax; \
+shr bx, 2; \
+and bx, 0xf8; \
+mov byte ptr [s_TempDecompress+OFFSET+1], bl; \
 \
-mov %bx, %ax; \
-shr %bx, 7; \
-and %bx, 0xf8; \
-mov byte ptr [s_TempDecompress+OFFSET+2], %bl; \
-mov %bx, %ax; \
-shr %bx, 8; \
-and %bx, 0x80; \
-mov byte ptr [s_TempDecompress+OFFSET+3], %bl; \
+mov bx, ax; \
+shr bx, 7; \
+and bx, 0xf8; \
+mov byte ptr [s_TempDecompress+OFFSET+2], bl; \
+mov bx, ax; \
+shr bx, 8; \
+and bx, 0x80; \
+mov byte ptr [s_TempDecompress+OFFSET+3], bl; \
 #define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) \
-mov %eax, dword ptr [VIF_SRC]; \
+mov eax, dword ptr [VIF_SRC]; \
 DECOMPRESS_RGBA(0); \
 \
-shr %eax, 16; \
+shr eax, 16; \
 DECOMPRESS_RGBA(4); \
 \
-mov %eax, dword ptr [VIF_SRC+4]; \
+mov eax, dword ptr [VIF_SRC+4]; \
 DECOMPRESS_RGBA(8); \
 \
-shr %eax, 16; \
+shr eax, 16; \
 DECOMPRESS_RGBA(12); \
 \
 movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \
@@ -1242,13 +1242,13 @@
 #define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4
 #define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) \
-mov %eax, dword ptr [VIF_SRC]; \
+mov eax, dword ptr [VIF_SRC]; \
 DECOMPRESS_RGBA(0); \
 \
-shr %eax, 16; \
+shr eax, 16; \
 DECOMPRESS_RGBA(4); \
 \
-mov %eax, dword ptr [VIF_SRC]; \
+mov eax, dword ptr [VIF_SRC]; \
 DECOMPRESS_RGBA(8); \
 \
 movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \
@@ -1271,10 +1271,10 @@
 #define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3
 #define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) \
-mov %eax, dword ptr [VIF_SRC]; \
+mov eax, dword ptr [VIF_SRC]; \
 DECOMPRESS_RGBA(0); \
 \
-shr %eax, 16; \
+shr eax, 16; \
 DECOMPRESS_RGBA(4); \
 \
 movq XMM_R0, qword ptr [s_TempDecompress]; \
@@ -1294,7 +1294,7 @@
 #define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2
 #define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) \
-mov %ax, word ptr [VIF_SRC]; \
+mov ax, word ptr [VIF_SRC]; \
 DECOMPRESS_RGBA(0) \
 \
 movd XMM_R0, dword ptr [s_TempDecompress]; \
@@ -1327,20 +1327,20 @@
 // 32 bit versions have the args on the stack
 #define INIT_ARGS() \
-push %edi; \
-push %esi; \
-push %ebx; \
-mov VIF_DST, dword ptr [%esp+4+12]; \
-mov VIF_SRC, dword ptr [%esp+8+12]; \
-mov VIF_SIZE, dword ptr [%esp+12+12]; \
+push edi; \
+push esi; \
+push ebx; \
+mov VIF_DST, dword ptr [esp+4+12]; \
+mov VIF_SRC, dword ptr [esp+8+12]; \
+mov VIF_SIZE, dword ptr [esp+12+12]; \
 #define POP_REGS() \
-pop %ebx; \
-pop %esi; \
-pop %edi; \
-#define INC_STACK(reg) add %esp, 4;
+pop ebx; \
+pop esi; \
+pop edi; \
+#define INC_STACK(reg) add esp, 4;
 // qsize - bytes of compressed size of 1 decompressed xmmword
 // int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize)
@@ -1431,7 +1431,7 @@ name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec: \
 sub VIF_SIZE, qsize; \
 name##_##sign##_##MaskType##_##ModeType##_C1_Done3: \
 SAVE_ROW_REG; \
-mov %eax, VIF_SIZE; \
+mov eax, VIF_SIZE; \
 POP_REGS(); \
 ret; \
 \
@@ -1460,7 +1460,7 @@ name##_##sign##_##MaskType##_##ModeType##_C2_Done3: \
 name##_##sign##_##MaskType##_##ModeType##_C2_Done4: \
 \
 SAVE_ROW_REG; \
-mov %eax, VIF_SIZE; \
+mov eax, VIF_SIZE; \
 POP_REGS(); \
 ret; \
 \
@@ -1497,7 +1497,7 @@ name##_##sign##_##MaskType##_##ModeType##_C3_Done3: \
 UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
 name##_##sign##_##MaskType##_##ModeType##_C3_Done4: \
 SAVE_ROW_REG; \
-mov %eax, VIF_SIZE; \
+mov eax, VIF_SIZE; \
 POP_REGS(); \
 ret; \
 \
@@ -1552,7 +1552,7 @@ name##_##sign##_##MaskType##_##ModeType##_C4_Done: \
 \
 SAVE_ROW_REG; \
 INC_STACK(); \
-mov %eax, VIF_SIZE; \
+mov eax, VIF_SIZE; \
 POP_REGS(); \
 ret; \
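
As an aid to reading DECOMPRESS_RGBA above: it expands one 16-bit 5:5:5:1 pixel into four output bytes. A C rendering of the same shifts and masks (function name invented):

#include <stdint.h>

/* C equivalent of the DECOMPRESS_RGBA shifts above. */
static inline void decompress_rgba_5551(uint16_t px, uint8_t out[4])
{
    out[0] = (uint8_t)(px << 3);          /* R: bits 0-4   -> bits 3-7 */
    out[1] = (uint8_t)((px >> 2) & 0xf8); /* G: bits 5-9   -> bits 3-7 */
    out[2] = (uint8_t)((px >> 7) & 0xf8); /* B: bits 10-14 -> bits 3-7 */
    out[3] = (uint8_t)((px >> 8) & 0x80); /* A: bit 15     -> bit 7    */
}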

View File

@@ -26,7 +26,7 @@
 // Fast assembly routines for x86-64
 // zerofrog(@gmail.com)
 // and added to by arcum42@gmail.com
-.intel_syntax
+.intel_syntax noprefix
 .extern g_EEFreezeRegs
 .extern FreezeMMXRegs_
 .extern _mmx_backup
@@ -36,9 +36,9 @@
 // ~10 times faster than standard memcmp
 // (zerofrog)
 // u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-#define MEMCMP_SRC1 %edx
-#define MEMCMP_SRC2 %esi
-#define MEMCMP_SIZE %ecx
+#define MEMCMP_SRC1 edx
+#define MEMCMP_SRC2 esi
+#define MEMCMP_SIZE ecx
 .globl memcmp_mmx
 memcmp_mmx:
@@ -48,82 +48,82 @@ memcmp_mmx:
 je memcmp_mmx_begin
 push 1
 call FreezeMMXRegs_
-add %esp, 4
+add esp, 4
 memcmp_mmx_begin:
-push %esi
-mov MEMCMP_SRC1, dword ptr [%esp+8]
-mov MEMCMP_SRC2, dword ptr [%esp+12]
-mov MEMCMP_SIZE, dword ptr [%esp+16]
+push esi
+mov MEMCMP_SRC1, dword ptr [esp+8]
+mov MEMCMP_SRC2, dword ptr [esp+12]
+mov MEMCMP_SIZE, dword ptr [esp+16]
 cmp MEMCMP_SIZE, 32
 jl memcmp_Done4
 // custom test first 8 to make sure things are ok
-movq %mm0, [MEMCMP_SRC2]
-movq %mm1, [MEMCMP_SRC2+8]
-pcmpeqd %mm0, [MEMCMP_SRC1]
-pcmpeqd %mm1, [MEMCMP_SRC1+8]
-pand %mm0, %mm1
-movq %mm2, [MEMCMP_SRC2+16]
-pmovmskb %eax, %mm0
-movq %mm3, [MEMCMP_SRC2+24]
+movq mm0, [MEMCMP_SRC2]
+movq mm1, [MEMCMP_SRC2+8]
+pcmpeqd mm0, [MEMCMP_SRC1]
+pcmpeqd mm1, [MEMCMP_SRC1+8]
+pand mm0, mm1
+movq mm2, [MEMCMP_SRC2+16]
+pmovmskb eax, mm0
+movq mm3, [MEMCMP_SRC2+24]
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_NextComp
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_NextComp:
-pcmpeqd %mm2, [MEMCMP_SRC1+16]
-pcmpeqd %mm3, [MEMCMP_SRC1+24]
-pand %mm2, %mm3
-pmovmskb %eax, %mm2
+pcmpeqd mm2, [MEMCMP_SRC1+16]
+pcmpeqd mm3, [MEMCMP_SRC1+24]
+pand mm2, mm3
+pmovmskb eax, mm2
 sub MEMCMP_SIZE, 32
 add MEMCMP_SRC2, 32
 add MEMCMP_SRC1, 32
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_ContinueTest
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 cmp MEMCMP_SIZE, 64
 jl memcmp_Done8
 memcmp_Cmp8:
-movq %mm0, [MEMCMP_SRC2]
-movq %mm1, [MEMCMP_SRC2+8]
-movq %mm2, [MEMCMP_SRC2+16]
-movq %mm3, [MEMCMP_SRC2+24]
-movq %mm4, [MEMCMP_SRC2+32]
-movq %mm5, [MEMCMP_SRC2+40]
-movq %mm6, [MEMCMP_SRC2+48]
-movq %mm7, [MEMCMP_SRC2+56]
-pcmpeqd %mm0, [MEMCMP_SRC1]
-pcmpeqd %mm1, [MEMCMP_SRC1+8]
-pcmpeqd %mm2, [MEMCMP_SRC1+16]
-pcmpeqd %mm3, [MEMCMP_SRC1+24]
-pand %mm0, %mm1
-pcmpeqd %mm4, [MEMCMP_SRC1+32]
-pand %mm0, %mm2
-pcmpeqd %mm5, [MEMCMP_SRC1+40]
-pand %mm0, %mm3
-pcmpeqd %mm6, [MEMCMP_SRC1+48]
-pand %mm0, %mm4
-pcmpeqd %mm7, [MEMCMP_SRC1+56]
-pand %mm0, %mm5
-pand %mm0, %mm6
-pand %mm0, %mm7
-pmovmskb %eax, %mm0
+movq mm0, [MEMCMP_SRC2]
+movq mm1, [MEMCMP_SRC2+8]
+movq mm2, [MEMCMP_SRC2+16]
+movq mm3, [MEMCMP_SRC2+24]
+movq mm4, [MEMCMP_SRC2+32]
+movq mm5, [MEMCMP_SRC2+40]
+movq mm6, [MEMCMP_SRC2+48]
+movq mm7, [MEMCMP_SRC2+56]
+pcmpeqd mm0, [MEMCMP_SRC1]
+pcmpeqd mm1, [MEMCMP_SRC1+8]
+pcmpeqd mm2, [MEMCMP_SRC1+16]
+pcmpeqd mm3, [MEMCMP_SRC1+24]
+pand mm0, mm1
+pcmpeqd mm4, [MEMCMP_SRC1+32]
+pand mm0, mm2
+pcmpeqd mm5, [MEMCMP_SRC1+40]
+pand mm0, mm3
+pcmpeqd mm6, [MEMCMP_SRC1+48]
+pand mm0, mm4
+pcmpeqd mm7, [MEMCMP_SRC1+56]
+pand mm0, mm5
+pand mm0, mm6
+pand mm0, mm7
+pmovmskb eax, mm0
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_Continue
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Continue:
@@ -137,93 +137,93 @@ memcmp_ContinueTest:
 memcmp_Done8:
 test MEMCMP_SIZE, 0x20
 jz memcmp_Done4
-movq %mm0, [MEMCMP_SRC2]
-movq %mm1, [MEMCMP_SRC2+8]
-movq %mm2, [MEMCMP_SRC2+16]
-movq %mm3, [MEMCMP_SRC2+24]
-pcmpeqd %mm0, [MEMCMP_SRC1]
-pcmpeqd %mm1, [MEMCMP_SRC1+8]
-pcmpeqd %mm2, [MEMCMP_SRC1+16]
-pcmpeqd %mm3, [MEMCMP_SRC1+24]
-pand %mm0, %mm1
-pand %mm0, %mm2
-pand %mm0, %mm3
-pmovmskb %eax, %mm0
+movq mm0, [MEMCMP_SRC2]
+movq mm1, [MEMCMP_SRC2+8]
+movq mm2, [MEMCMP_SRC2+16]
+movq mm3, [MEMCMP_SRC2+24]
+pcmpeqd mm0, [MEMCMP_SRC1]
+pcmpeqd mm1, [MEMCMP_SRC1+8]
+pcmpeqd mm2, [MEMCMP_SRC1+16]
+pcmpeqd mm3, [MEMCMP_SRC1+24]
+pand mm0, mm1
+pand mm0, mm2
+pand mm0, mm3
+pmovmskb eax, mm0
 sub MEMCMP_SIZE, 32
 add MEMCMP_SRC2, 32
 add MEMCMP_SRC1, 32
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_Done4
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Done4:
 cmp MEMCMP_SIZE, 24
 jne memcmp_Done2
-movq %mm0, [MEMCMP_SRC2]
-movq %mm1, [MEMCMP_SRC2+8]
-movq %mm2, [MEMCMP_SRC2+16]
-pcmpeqd %mm0, [MEMCMP_SRC1]
-pcmpeqd %mm1, [MEMCMP_SRC1+8]
-pcmpeqd %mm2, [MEMCMP_SRC1+16]
-pand %mm0, %mm1
-pand %mm0, %mm2
-pmovmskb %eax, %mm0
+movq mm0, [MEMCMP_SRC2]
+movq mm1, [MEMCMP_SRC2+8]
+movq mm2, [MEMCMP_SRC2+16]
+pcmpeqd mm0, [MEMCMP_SRC1]
+pcmpeqd mm1, [MEMCMP_SRC1+8]
+pcmpeqd mm2, [MEMCMP_SRC1+16]
+pand mm0, mm1
+pand mm0, mm2
+pmovmskb eax, mm0
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_Done
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Done2:
 cmp MEMCMP_SIZE, 16
 jne memcmp_Done1
-movq %mm0, [MEMCMP_SRC2]
-movq %mm1, [MEMCMP_SRC2+8]
-pcmpeqd %mm0, [MEMCMP_SRC1]
-pcmpeqd %mm1, [MEMCMP_SRC1+8]
-pand %mm0, %mm1
-pmovmskb %eax, %mm0
+movq mm0, [MEMCMP_SRC2]
+movq mm1, [MEMCMP_SRC2+8]
+pcmpeqd mm0, [MEMCMP_SRC1]
+pcmpeqd mm1, [MEMCMP_SRC1+8]
+pand mm0, mm1
+pmovmskb eax, mm0
 // check if eq
-cmp %eax, 0xff
+cmp eax, 0xff
 je memcmp_Done
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Done1:
 cmp MEMCMP_SIZE, 8
 jne memcmp_Done
-mov %eax, [MEMCMP_SRC2]
+mov eax, [MEMCMP_SRC2]
 mov MEMCMP_SRC2, [MEMCMP_SRC2+4]
-cmp %eax, [MEMCMP_SRC1]
+cmp eax, [MEMCMP_SRC1]
 je memcmp_Next
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Next:
 cmp MEMCMP_SRC2, [MEMCMP_SRC1+4]
 je memcmp_Done
-mov %eax, 1
+mov eax, 1
 jmp memcmp_End
 memcmp_Done:
-xor %eax, %eax
+xor eax, eax
 memcmp_End:
 emms
-pop %esi
+pop esi
 ret
 // memxor_mmx
-#define MEMXOR_SRC1 %edx
-#define MEMXOR_SRC2 %esi
-#define MEMXOR_SIZE %ecx
+#define MEMXOR_SRC1 edx
+#define MEMXOR_SRC2 esi
+#define MEMXOR_SIZE ecx
 .globl memxor_mmx
 memxor_mmx:
@@ -233,38 +233,38 @@ memxor_mmx:
 je memxor_mmx_begin
 push 1
 call FreezeMMXRegs_
-add %esp, 4
+add esp, 4
 memxor_mmx_begin:
-push %esi
-mov MEMXOR_SRC1, dword ptr [%esp+8]
-mov MEMXOR_SRC2, dword ptr [%esp+12]
-mov MEMXOR_SIZE, dword ptr [%esp+16]
+push esi
+mov MEMXOR_SRC1, dword ptr [esp+8]
+mov MEMXOR_SRC2, dword ptr [esp+12]
+mov MEMXOR_SIZE, dword ptr [esp+16]
 cmp MEMXOR_SIZE, 64
 jl memxor_Setup4
-movq %mm0, [MEMXOR_SRC2]
-movq %mm1, [MEMXOR_SRC2+8]
-movq %mm2, [MEMXOR_SRC2+16]
-movq %mm3, [MEMXOR_SRC2+24]
-movq %mm4, [MEMXOR_SRC2+32]
-movq %mm5, [MEMXOR_SRC2+40]
-movq %mm6, [MEMXOR_SRC2+48]
-movq %mm7, [MEMXOR_SRC2+56]
+movq mm0, [MEMXOR_SRC2]
+movq mm1, [MEMXOR_SRC2+8]
+movq mm2, [MEMXOR_SRC2+16]
+movq mm3, [MEMXOR_SRC2+24]
+movq mm4, [MEMXOR_SRC2+32]
+movq mm5, [MEMXOR_SRC2+40]
+movq mm6, [MEMXOR_SRC2+48]
+movq mm7, [MEMXOR_SRC2+56]
 sub MEMXOR_SIZE, 64
 add MEMXOR_SRC2, 64
 cmp MEMXOR_SIZE, 64
 jl memxor_End8
 memxor_Cmp8:
-pxor %mm0, [MEMXOR_SRC2]
-pxor %mm1, [MEMXOR_SRC2+8]
-pxor %mm2, [MEMXOR_SRC2+16]
-pxor %mm3, [MEMXOR_SRC2+24]
-pxor %mm4, [MEMXOR_SRC2+32]
-pxor %mm5, [MEMXOR_SRC2+40]
-pxor %mm6, [MEMXOR_SRC2+48]
-pxor %mm7, [MEMXOR_SRC2+56]
+pxor mm0, [MEMXOR_SRC2]
+pxor mm1, [MEMXOR_SRC2+8]
+pxor mm2, [MEMXOR_SRC2+16]
+pxor mm3, [MEMXOR_SRC2+24]
+pxor mm4, [MEMXOR_SRC2+32]
+pxor mm5, [MEMXOR_SRC2+40]
+pxor mm6, [MEMXOR_SRC2+48]
+pxor mm7, [MEMXOR_SRC2+56]
 sub MEMXOR_SIZE, 64
 add MEMXOR_SRC2, 64
@@ -272,17 +272,17 @@ memxor_Cmp8:
 jge memxor_Cmp8
 memxor_End8:
-pxor %mm0, %mm4
-pxor %mm1, %mm5
-pxor %mm2, %mm6
-pxor %mm3, %mm7
+pxor mm0, mm4
+pxor mm1, mm5
+pxor mm2, mm6
+pxor mm3, mm7
 cmp MEMXOR_SIZE, 32
 jl memxor_End4
-pxor %mm0, [MEMXOR_SRC2]
-pxor %mm1, [MEMXOR_SRC2+8]
-pxor %mm2, [MEMXOR_SRC2+16]
-pxor %mm3, [MEMXOR_SRC2+24]
+pxor mm0, [MEMXOR_SRC2]
+pxor mm1, [MEMXOR_SRC2+8]
+pxor mm2, [MEMXOR_SRC2+16]
+pxor mm3, [MEMXOR_SRC2+24]
 sub MEMXOR_SIZE, 32
 add MEMXOR_SRC2, 32
 jmp memxor_End4
@@ -291,21 +291,21 @@ memxor_Setup4:
 cmp MEMXOR_SIZE, 32
 jl memxor_Setup2
-movq %mm0, [MEMXOR_SRC2]
-movq %mm1, [MEMXOR_SRC2+8]
-movq %mm2, [MEMXOR_SRC2+16]
-movq %mm3, [MEMXOR_SRC2+24]
+movq mm0, [MEMXOR_SRC2]
+movq mm1, [MEMXOR_SRC2+8]
+movq mm2, [MEMXOR_SRC2+16]
+movq mm3, [MEMXOR_SRC2+24]
 sub MEMXOR_SIZE, 32
 add MEMXOR_SRC2, 32
 memxor_End4:
-pxor %mm0, %mm2
-pxor %mm1, %mm3
+pxor mm0, mm2
+pxor mm1, mm3
 cmp MEMXOR_SIZE, 16
 jl memxor_End2
-pxor %mm0, [MEMXOR_SRC2]
-pxor %mm1, [MEMXOR_SRC2+8]
+pxor mm0, [MEMXOR_SRC2]
+pxor mm1, [MEMXOR_SRC2+8]
 sub MEMXOR_SIZE, 16
 add MEMXOR_SRC2, 16
 jmp memxor_End2
@@ -314,56 +314,56 @@ memxor_Setup2:
 cmp MEMXOR_SIZE, 16
 jl memxor_Setup1
-movq %mm0, [MEMXOR_SRC2]
-movq %mm1, [MEMXOR_SRC2+8]
+movq mm0, [MEMXOR_SRC2]
+movq mm1, [MEMXOR_SRC2+8]
 sub MEMXOR_SIZE, 16
 add MEMXOR_SRC2, 16
 memxor_End2:
-pxor %mm0, %mm1
+pxor mm0, mm1
 cmp MEMXOR_SIZE, 8
 jl memxor_End1
-pxor %mm0, [MEMXOR_SRC2]
+pxor mm0, [MEMXOR_SRC2]
 memxor_End1:
-movq [MEMXOR_SRC1], %mm0
+movq [MEMXOR_SRC1], mm0
 jmp memxor_End
 memxor_Setup1:
-movq %mm0, [MEMXOR_SRC2]
-movq [MEMXOR_SRC1], %mm0
+movq mm0, [MEMXOR_SRC2]
+movq [MEMXOR_SRC1], mm0
 memxor_End:
 emms
-pop %esi
+pop esi
 ret
 // void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 .globl memcpy_amd_
 memcpy_amd_:
-push %edi
-push %esi
-mov %edi, %ecx // destination
-mov %esi, %edx // source
-mov %ecx, [%esp+12] // number of bytes to copy
-mov %eax, %ecx // keep a copy of count
+push edi
+push esi
+mov edi, ecx // destination
+mov esi, edx // source
+mov ecx, [esp+12] // number of bytes to copy
+mov eax, ecx // keep a copy of count
 cld
-cmp %eax, TINY_BLOCK_COPY
+cmp eax, TINY_BLOCK_COPY
 jb $memcpy_ic_3 // tiny? skip mmx copy
-cmp %eax, 32*1024 // don't align between 32k-64k because
+cmp eax, 32*1024 // don't align between 32k-64k because
 jbe $memcpy_do_align // it appears to be slower
-cmp %eax, 64*1024
+cmp eax, 64*1024
 jbe $memcpy_align_done
 $memcpy_do_align:
-mov %eax, 8 // a trick that's faster than rep movsb...
-sub %eax, %edi // align destination to qword
-andb %eax, 111 // get the low bits
-sub %ecx, %eax // update copy count
-neg %eax // set up to jump into the array
-add %eax, offset $memcpy_align_done
-jmp %eax // jump to array of movsb's
+mov eax, 8 // a trick that's faster than rep movsb...
+sub eax, edi // align destination to qword
+andb eax, 111 // get the low bits
+sub ecx, eax // update copy count
+neg eax // set up to jump into the array
+add eax, offset $memcpy_align_done
+jmp eax // jump to array of movsb's
 .align 4
 movsb
@@ -376,17 +376,17 @@ $memcpy_do_align:
 movsb
 $memcpy_align_done: // destination is dword aligned
-mov %eax, %ecx // number of bytes left to copy
-shr %eax, 6 // get 64-byte block count
+mov eax, ecx // number of bytes left to copy
+shr eax, 6 // get 64-byte block count
 jz $memcpy_ic_2 // finish the last few bytes
-cmp %eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
+cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
 jae $memcpy_uc_test
-movq [_mmx_backup+0x00],%mm0
-movq [_mmx_backup+0x08],%mm1
-movq [_mmx_backup+0x10],%mm2
-movq [_mmx_backup+0x18],%mm3
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
+movq [_mmx_backup+0x18],mm3
 // This is small block copy that uses the MMX registers to copy 8 bytes
 // at a time. It uses the "unrolled loop" optimization, and also uses
@@ -394,49 +394,49 @@ $memcpy_align_done: // destination is dword aligned
 .align 16
 $memcpy_ic_1: // 64-byte block copies, in-cache copy
-prefetchnta [%esi + (200*64/34+192)] // start reading ahead
-movq %mm0, [%esi+0] // read 64 bits
-movq %mm1, [%esi+8]
-movq [%edi+0], %mm0 //write 64 bits
-movq [%edi+8], %mm1 // note: the normal movq writes the
-movq %mm2, [%esi+16] // data to cache; a cache line will be
-movq %mm3, [%esi+24] // allocated as needed, to store the data
-movq [%edi+16], %mm2
-movq [%edi+24], %mm3
-movq %mm0, [%esi+32]
-movq %mm1, [%esi+40]
-movq [%edi+32], %mm0
-movq [%edi+40], %mm1
-movq %mm2, [%esi+48]
-movq %mm3, [%esi+56]
-movq [%edi+48], %mm2
-movq [%edi+56], %mm3
-add %esi, 64 // update source pointer
-add %edi, 64 // update destination pointer
-dec %eax // count down
+prefetchnta [esi + (200*64/34+192)] // start reading ahead
+movq mm0, [esi+0] // read 64 bits
+movq mm1, [esi+8]
+movq [edi+0], mm0 //write 64 bits
+movq [edi+8], mm1 // note: the normal movq writes the
+movq mm2, [esi+16] // data to cache; a cache line will be
+movq mm3, [esi+24] // allocated as needed, to store the data
+movq [edi+16], mm2
+movq [edi+24], mm3
+movq mm0, [esi+32]
+movq mm1, [esi+40]
+movq [edi+32], mm0
+movq [edi+40], mm1
+movq mm2, [esi+48]
+movq mm3, [esi+56]
+movq [edi+48], mm2
+movq [edi+56], mm3
+add esi, 64 // update source pointer
+add edi, 64 // update destination pointer
+dec eax // count down
 jnz $memcpy_ic_1 // last 64-byte block?
-movq %mm0,[_mmx_backup+0x00]
-movq %mm1,[_mmx_backup+0x08]
-movq %mm2,[_mmx_backup+0x10]
-movq %mm3,[_mmx_backup+0x18]
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
+movq mm3,[_mmx_backup+0x18]
 $memcpy_ic_2:
-mov %eax, %ecx // has valid low 6 bits of the byte count
+mov eax, ecx // has valid low 6 bits of the byte count
 $memcpy_ic_3:
-shr %eax, 2 // dword count
-andb %eax, 1111 // only look at the "remainder" bits
-neg %eax // set up to jump into the array
-add %eax, offset $memcpy_last_few
-jmp %eax // jump to array of movsd's
+shr eax, 2 // dword count
+andb eax, 1111 // only look at the "remainder" bits
+neg eax // set up to jump into the array
+add eax, offset $memcpy_last_few
+jmp eax // jump to array of movsd's
 $memcpy_uc_test:
-// cmp %ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
+// cmp ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
 // jae $memcpy_bp_1
 //$memcpy_64_test:
-or %eax, %eax // tail end of block prefetch will jump here
+or eax, eax // tail end of block prefetch will jump here
 jz $memcpy_ic_2 // no more 64-byte blocks left
 // For larger blocks, which will spill beyond the cache, it's faster to
@@ -444,39 +444,39 @@ $memcpy_uc_test:
 // bypasses the cache and writes straight to main memory. This code also
 // uses the software prefetch instruction to pre-read the data.
-movq [_mmx_backup+0x00],%mm0
-movq [_mmx_backup+0x08],%mm1
-movq [_mmx_backup+0x10],%mm2
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
 .align 16
 $memcpy_uc_1: // 64-byte blocks, uncached copy
-prefetchnta [%esi + (200*64/34+192)] // start reading ahead
-movq %mm0,[%esi+0] // read 64 bits
-add %edi,64 // update destination pointer
-movq %mm1,[%esi+8]
-add %esi,64 // update source pointer
-movq %mm2,[%esi-48]
-movntq [%edi-64], %mm0 // write 64 bits, bypassing the cache
-movq %mm0,[%esi-40] // note: movntq also prevents the CPU
-movntq [%edi-56], %mm1 // from READING the destination address
-movq %mm1,[%esi-32] // into the cache, only to be over-written
-movntq [%edi-48], %mm2 // so that also helps performance
-movq %mm2,[%esi-24]
-movntq [%edi-40], %mm0
-movq %mm0,[%esi-16]
-movntq [%edi-32], %mm1
-movq %mm1,[%esi-8]
-movntq [%edi-24], %mm2
-movntq [%edi-16], %mm0
-dec %eax
-movntq [%edi-8], %mm1
+prefetchnta [esi + (200*64/34+192)] // start reading ahead
+movq mm0,[esi+0] // read 64 bits
+add edi,64 // update destination pointer
+movq mm1,[esi+8]
+add esi,64 // update source pointer
+movq mm2,[esi-48]
+movntq [edi-64], mm0 // write 64 bits, bypassing the cache
+movq mm0,[esi-40] // note: movntq also prevents the CPU
+movntq [edi-56], mm1 // from READING the destination address
+movq mm1,[esi-32] // into the cache, only to be over-written
+movntq [edi-48], mm2 // so that also helps performance
+movq mm2,[esi-24]
+movntq [edi-40], mm0
+movq mm0,[esi-16]
+movntq [edi-32], mm1
+movq mm1,[esi-8]
+movntq [edi-24], mm2
+movntq [edi-16],mm0
+dec eax
+movntq [edi-8], mm1
 jnz $memcpy_uc_1 // last 64-byte block?
-movq %mm0,[_mmx_backup+0x00]
-movq %mm1,[_mmx_backup+0x08]
-movq %mm2,[_mmx_backup+0x10]
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
 jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
@@ -511,17 +511,17 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
 movsd
 $memcpy_last_few: // dword aligned from before movsd's
-mov %eax, %ecx // has valid low 2 bits of the byte count
-andb %eax, 11 // the last few cows must come home
+mov eax, ecx // has valid low 2 bits of the byte count
+andb eax, 11 // the last few cows must come home
 jz $memcpy_final // no more, let's leave
 rep movsb // the last 1, 2, or 3 bytes
 $memcpy_final:
 emms // clean up the MMX state
 sfence // flush the write buffer
-//mov %eax, [dest] // ret value = destination pointer
-pop %esi
-pop %edi
+//mov eax, [dest] // ret value = destination pointer
+pop esi
+pop edi
 ret 4
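
The comments above describe the two copy strategies: an in-cache MMX loop for mid-sized blocks, and a prefetchnta + movntq loop that bypasses the cache for large ones. A rough intrinsics sketch of the uncached 64-byte loop (names and structure illustrative; 32-bit x86 with MMX/SSE assumed):

#include <stddef.h>
#include <mmintrin.h>   /* __m64, _mm_empty */
#include <xmmintrin.h>  /* _mm_prefetch, _mm_stream_pi, _mm_sfence */

/* Illustrative only: prefetch ahead, read through MMX registers, write with
 * movntq so the stores bypass the cache, then fence and clear MMX state. */
static void copy64_uncached(void *dst, const void *src, size_t blocks64)
{
    const char *s = (const char *)src;
    char *d = (char *)dst;
    while (blocks64--) {
        _mm_prefetch(s + (200 * 64 / 34 + 192), _MM_HINT_NTA); /* read ahead */
        for (int i = 0; i < 64; i += 8)
            _mm_stream_pi((__m64 *)(d + i), *(const __m64 *)(s + i)); /* movntq */
        s += 64;
        d += 64;
    }
    _mm_sfence();  /* flush the write-combining buffers */
    _mm_empty();   /* emms */
}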

View File

@@ -824,18 +824,18 @@ static s32 recExecuteBlock( s32 eeCycles )
 #else
 __asm__
 (
-".intel_syntax\n"
-"push %ebx\n"
-"push %esi\n"
-"push %edi\n"
-"push %ebp\n"
+".intel_syntax noprefix\n"
+"push ebx\n"
+"push esi\n"
+"push edi\n"
+"push ebp\n"
 "call iopDispatcherReg\n"
-"pop %ebp\n"
-"pop %edi\n"
-"pop %esi\n"
-"pop %ebx\n"
+"pop ebp\n"
+"pop edi\n"
+"pop esi\n"
+"pop ebx\n"
 ".att_syntax\n"
 );
 #endif

View File

@@ -96,7 +96,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 #else // gcc
+// Is this really supposed to be assembly for gcc and C for Windows?
 void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 {
 u32 i;
@@ -112,23 +112,23 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
 u8* p0 = (u8*)&s_maskarr[mask&15][0];
 u8* p1 = (u8*)&s_maskarr[(mask>>4)&15][0];
-__asm__(".intel_syntax\n"
-"movaps %%xmm0, [%0]\n"
-"movaps %%xmm1, [%1]\n"
-"movaps %%xmm2, %%xmm0\n"
-"punpcklwd %%xmm0, %%xmm0\n"
-"punpckhwd %%xmm2, %%xmm2\n"
-"movaps %%xmm3, %%xmm1\n"
-"punpcklwd %%xmm1, %%xmm1\n"
-"punpckhwd %%xmm3, %%xmm3\n"
-"movq [%2], %%xmm0\n"
-"movq [%2+8], %%xmm1\n"
-"movhps [%2+16], %%xmm0\n"
-"movhps [%2+24], %%xmm1\n"
-"movq [%2+32], %%xmm2\n"
-"movq [%2+40], %%xmm3\n"
-"movhps [%2+48], %%xmm2\n"
-"movhps [%2+56], %%xmm3\n"
+__asm__(".intel_syntax noprefix\n"
+"movaps xmm0, [%0]\n"
+"movaps xmm1, [%1]\n"
+"movaps xmm2, xmm0\n"
+"punpcklwd xmm0, xmm0\n"
+"punpckhwd xmm2, xmm2\n"
+"movaps xmm3, xmm1\n"
+"punpcklwd xmm1, xmm1\n"
+"punpckhwd xmm3, xmm3\n"
+"movq [%2], xmm0\n"
+"movq [%2+8], xmm1\n"
+"movhps [%2+16], xmm0\n"
+"movhps [%2+24], xmm1\n"
+"movq [%2+32], xmm2\n"
+"movq [%2+40], xmm3\n"
+"movhps [%2+48], xmm2\n"
+"movhps [%2+56], xmm3\n"
 ".att_syntax\n" : : "r"(p0), "r"(p1), "r"(vif1masks) );
 }
 }

View File

@@ -800,18 +800,18 @@ __forceinline void recExecute()
 g_EEFreezeRegs = true;
 __asm__
 (
-".intel_syntax\n"
-"push %ebx\n"
-"push %esi\n"
-"push %edi\n"
-"push %ebp\n"
+".intel_syntax noprefix\n"
+"push ebx\n"
+"push esi\n"
+"push edi\n"
+"push ebp\n"
 "call DispatcherReg\n"
-"pop %ebp\n"
-"pop %edi\n"
-"pop %esi\n"
-"pop %ebx\n"
+"pop ebp\n"
+"pop edi\n"
+"pop esi\n"
+"pop ebx\n"
 ".att_syntax\n"
 );
 g_EEFreezeRegs = false;
@@ -824,18 +824,18 @@ static void recExecuteBlock()
 g_EEFreezeRegs = true;
 __asm__
 (
-".intel_syntax\n"
-"push %ebx\n"
-"push %esi\n"
-"push %edi\n"
-"push %ebp\n"
+".intel_syntax noprefix\n"
+"push ebx\n"
+"push esi\n"
+"push edi\n"
+"push ebp\n"
 "call DispatcherReg\n"
-"pop %ebp\n"
-"pop %edi\n"
-"pop %esi\n"
-"pop %ebx\n"
+"pop ebp\n"
+"pop edi\n"
+"pop esi\n"
+"pop ebx\n"
 ".att_syntax\n"
 );
 g_EEFreezeRegs = false;

View File

@@ -97,15 +97,15 @@ __forceinline void FreezeMMXRegs_(int save)
 emms
 }
 #else
-__asm__(".intel_syntax\n"
-"movq [%0+0x00], %%mm0\n"
-"movq [%0+0x08], %%mm1\n"
-"movq [%0+0x10], %%mm2\n"
-"movq [%0+0x18], %%mm3\n"
-"movq [%0+0x20], %%mm4\n"
-"movq [%0+0x28], %%mm5\n"
-"movq [%0+0x30], %%mm6\n"
-"movq [%0+0x38], %%mm7\n"
+__asm__(".intel_syntax noprefix\n"
+"movq [%0+0x00], mm0\n"
+"movq [%0+0x08], mm1\n"
+"movq [%0+0x10], mm2\n"
+"movq [%0+0x18], mm3\n"
+"movq [%0+0x20], mm4\n"
+"movq [%0+0x28], mm5\n"
+"movq [%0+0x30], mm6\n"
+"movq [%0+0x38], mm7\n"
 "emms\n"
 ".att_syntax\n" : : "r"(g_globalMMXData) );
 #endif
@@ -134,15 +134,15 @@ __forceinline void FreezeMMXRegs_(int save)
 emms
 }
 #else
-__asm__(".intel_syntax\n"
-"movq %%mm0, [%0+0x00]\n"
-"movq %%mm1, [%0+0x08]\n"
-"movq %%mm2, [%0+0x10]\n"
-"movq %%mm3, [%0+0x18]\n"
-"movq %%mm4, [%0+0x20]\n"
-"movq %%mm5, [%0+0x28]\n"
-"movq %%mm6, [%0+0x30]\n"
-"movq %%mm7, [%0+0x38]\n"
+__asm__(".intel_syntax noprefix\n"
+"movq mm0, [%0+0x00]\n"
+"movq mm1, [%0+0x08]\n"
+"movq mm2, [%0+0x10]\n"
+"movq mm3, [%0+0x18]\n"
+"movq mm4, [%0+0x20]\n"
+"movq mm5, [%0+0x28]\n"
+"movq mm6, [%0+0x30]\n"
+"movq mm7, [%0+0x38]\n"
 "emms\n"
 ".att_syntax\n" : : "r"(g_globalMMXData) );
 #endif
@@ -177,15 +177,15 @@ __forceinline void FreezeXMMRegs_(int save)
 }
 #else
-__asm__(".intel_syntax\n"
-"movaps [%0+0x00], %%xmm0\n"
-"movaps [%0+0x10], %%xmm1\n"
-"movaps [%0+0x20], %%xmm2\n"
-"movaps [%0+0x30], %%xmm3\n"
-"movaps [%0+0x40], %%xmm4\n"
-"movaps [%0+0x50], %%xmm5\n"
-"movaps [%0+0x60], %%xmm6\n"
-"movaps [%0+0x70], %%xmm7\n"
+__asm__(".intel_syntax noprefix\n"
+"movaps [%0+0x00], xmm0\n"
+"movaps [%0+0x10], xmm1\n"
+"movaps [%0+0x20], xmm2\n"
+"movaps [%0+0x30], xmm3\n"
+"movaps [%0+0x40], xmm4\n"
+"movaps [%0+0x50], xmm5\n"
+"movaps [%0+0x60], xmm6\n"
+"movaps [%0+0x70], xmm7\n"
 ".att_syntax\n" : : "r"(g_globalXMMData) );
 #endif // _MSC_VER
@@ -214,15 +214,15 @@ __forceinline void FreezeXMMRegs_(int save)
 }
 #else
-__asm__(".intel_syntax\n"
-"movaps %%xmm0, [%0+0x00]\n"
-"movaps %%xmm1, [%0+0x10]\n"
-"movaps %%xmm2, [%0+0x20]\n"
-"movaps %%xmm3, [%0+0x30]\n"
-"movaps %%xmm4, [%0+0x40]\n"
-"movaps %%xmm5, [%0+0x50]\n"
-"movaps %%xmm6, [%0+0x60]\n"
-"movaps %%xmm7, [%0+0x70]\n"
+__asm__(".intel_syntax noprefix\n"
+"movaps xmm0, [%0+0x00]\n"
+"movaps xmm1, [%0+0x10]\n"
+"movaps xmm2, [%0+0x20]\n"
+"movaps xmm3, [%0+0x30]\n"
+"movaps xmm4, [%0+0x40]\n"
+"movaps xmm5, [%0+0x50]\n"
+"movaps xmm6, [%0+0x60]\n"
+"movaps xmm7, [%0+0x70]\n"
 ".att_syntax\n" : : "r"(g_globalXMMData) );
 #endif // _MSC_VER