mirror of https://github.com/PCSX2/pcsx2.git

commit 3ae6ff0856 (parent 8614bbd0f8)

Remove lots of evil %'s.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@693 96395faa-99c1-11dd-bbfe-3dabce05a288
@@ -231,7 +231,9 @@ public:
 #ifndef __LINUX__
 __asm mov eax, SendSimplePacket
 #else
-__asm ("mov %eax, SendSimplePacket");
+__asm__ (".intel_syntax noprefix\n"
+"mov eax, SendSimplePacket\n"
+".att_syntax\n");
 #endif
 //return (uptr)&SendSimplePacket;
 }
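The pattern applied throughout this commit is the same everywhere: once an inline-assembly block opens with ".intel_syntax noprefix", registers are written bare (eax) rather than with the AT&T '%' prefix ("%%eax" in the C string), while %0, %1, … stay as GCC operand placeholders. A minimal, self-contained sketch of that style follows; it is not code from the commit, and the name add_one is illustrative only, assuming GCC on 32- or 64-bit x86.

#include <stdio.h>

/* Minimal sketch (not part of the commit; add_one is an illustrative
 * name).  Inside ".intel_syntax noprefix" registers are written bare,
 * and the "+a" constraint pins value to eax so no register operand
 * needs to be substituted inside the Intel-syntax block. */
static int add_one(int value)
{
    __asm__(
        ".intel_syntax noprefix\n"
        "add eax, 1\n"      /* old style under plain .intel_syntax: "add %%eax, 1" */
        ".att_syntax\n"     /* restore AT&T syntax for compiler-emitted code */
        : "+a"(value)
        :
        : "cc");
    return value;
}

int main(void)
{
    printf("%d\n", add_one(41));   /* prints 42 */
    return 0;
}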
@ -1,75 +1,75 @@
|
|||
.intel_syntax
|
||||
.intel_syntax noprefix
|
||||
|
||||
.extern g_pCurrentRoutine
|
||||
|
||||
.globl so_call
|
||||
so_call:
|
||||
mov %eax, dword ptr [%esp+4]
|
||||
test dword ptr [%eax+24], 1
|
||||
mov eax, dword ptr [esp+4]
|
||||
test dword ptr [eax+24], 1
|
||||
jnz RestoreRegs
|
||||
mov [%eax+8], %ebx
|
||||
mov [%eax+12], %esi
|
||||
mov [%eax+16], %edi
|
||||
mov [%eax+20], %ebp
|
||||
mov dword ptr [%eax+24], 1
|
||||
mov [eax+8], ebx
|
||||
mov [eax+12], esi
|
||||
mov [eax+16], edi
|
||||
mov [eax+20], ebp
|
||||
mov dword ptr [eax+24], 1
|
||||
jmp CallFn
|
||||
RestoreRegs:
|
||||
// have to load and save at the same time
|
||||
mov %ecx, [%eax+8]
|
||||
mov %edx, [%eax+12]
|
||||
mov [%eax+8], %ebx
|
||||
mov [%eax+12], %esi
|
||||
mov %ebx, %ecx
|
||||
mov %esi, %edx
|
||||
mov %ecx, [%eax+16]
|
||||
mov %edx, [%eax+20]
|
||||
mov [%eax+16], %edi
|
||||
mov [%eax+20], %ebp
|
||||
mov %edi, %ecx
|
||||
mov %ebp, %edx
|
||||
mov ecx, [eax+8]
|
||||
mov edx, [eax+12]
|
||||
mov [eax+8], ebx
|
||||
mov [eax+12], esi
|
||||
mov ebx, ecx
|
||||
mov esi, edx
|
||||
mov ecx, [eax+16]
|
||||
mov edx, [eax+20]
|
||||
mov [eax+16], edi
|
||||
mov [eax+20], ebp
|
||||
mov edi, ecx
|
||||
mov ebp, edx
|
||||
|
||||
CallFn:
|
||||
mov [g_pCurrentRoutine], %eax
|
||||
mov %ecx, %esp
|
||||
mov %esp, [%eax+4]
|
||||
mov [%eax+4], %ecx
|
||||
mov [g_pCurrentRoutine], eax
|
||||
mov ecx, esp
|
||||
mov esp, [eax+4]
|
||||
mov [eax+4], ecx
|
||||
|
||||
jmp dword ptr [%eax]
|
||||
jmp dword ptr [eax]
|
||||
|
||||
.globl so_resume
|
||||
so_resume:
|
||||
mov %eax, [g_pCurrentRoutine]
|
||||
mov %ecx, [%eax+8]
|
||||
mov %edx, [%eax+12]
|
||||
mov [%eax+8], %ebx
|
||||
mov [%eax+12], %esi
|
||||
mov %ebx, %ecx
|
||||
mov %esi, %edx
|
||||
mov %ecx, [%eax+16]
|
||||
mov %edx, [%eax+20]
|
||||
mov [%eax+16], %edi
|
||||
mov [%eax+20], %ebp
|
||||
mov %edi, %ecx
|
||||
mov %ebp, %edx
|
||||
mov eax, [g_pCurrentRoutine]
|
||||
mov ecx, [eax+8]
|
||||
mov edx, [eax+12]
|
||||
mov [eax+8], ebx
|
||||
mov [eax+12], esi
|
||||
mov ebx, ecx
|
||||
mov esi, edx
|
||||
mov ecx, [eax+16]
|
||||
mov edx, [eax+20]
|
||||
mov [eax+16], edi
|
||||
mov [eax+20], ebp
|
||||
mov edi, ecx
|
||||
mov ebp, edx
|
||||
|
||||
// put the return address in pcalladdr
|
||||
mov %ecx, [%esp]
|
||||
mov [%eax], %ecx
|
||||
add %esp, 4 // remove the return address
|
||||
mov ecx, [esp]
|
||||
mov [eax], ecx
|
||||
add esp, 4 // remove the return address
|
||||
|
||||
// swap stack pointers
|
||||
mov %ecx, [%eax+4]
|
||||
mov [%eax+4], %esp
|
||||
mov %esp, %ecx
|
||||
mov ecx, [eax+4]
|
||||
mov [eax+4], esp
|
||||
mov esp, ecx
|
||||
ret
|
||||
|
||||
.globl so_exit
|
||||
so_exit:
|
||||
mov %eax, [g_pCurrentRoutine]
|
||||
mov %esp, [%eax+4]
|
||||
mov %ebx, [%eax+8]
|
||||
mov %esi, [%eax+12]
|
||||
mov %edi, [%eax+16]
|
||||
mov %ebp, [%eax+20]
|
||||
mov eax, [g_pCurrentRoutine]
|
||||
mov esp, [eax+4]
|
||||
mov ebx, [eax+8]
|
||||
mov esi, [eax+12]
|
||||
mov edi, [eax+16]
|
||||
mov ebp, [eax+20]
|
||||
ret
|
||||
|
||||
|
|
|
@@ -129,15 +129,15 @@ namespace Threading
 if( true ) //isMultiCore )
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"lock xadd [%0], %%eax\n"
+".intel_syntax noprefix\n"
+"lock xadd [%0], eax\n"
 ".att_syntax\n" : : "r"(Addend), "a"(Value) : "memory");
 }
 else
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"xadd [%0], %%eax\n"
+".intel_syntax noprefix\n"
+"xadd [%0], eax\n"
 ".att_syntax\n" : : "r"(Addend), "a"(Value) : "memory");
 }
 }
@@ -175,8 +175,8 @@ namespace Threading
 __forceinline void pcsx2_InterlockedExchange64(volatile s64* Target, s64 Value)
 {
 __asm__ __volatile__(
-".intel_syntax\n"
-"lock xchg [%0], %%rax\n"
+".intel_syntax noprefix\n"
+"lock xchg [%0], rax\n"
 ".att_syntax\n" : : "r"(Target), "a"(Value) : "memory"
 );
 return 0;
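For reference, the atomic read-modify-write that the "lock xadd" block above performs can also be expressed with GCC's __sync_fetch_and_add builtin. This is a hedged alternative sketch, not something the commit introduces, and the function name pcsx2_style_add is illustrative only.

/* Sketch: same effect as the interlocked-add asm above, via a GCC builtin. */
static long pcsx2_style_add(volatile long* addend, long value)
{
    return __sync_fetch_and_add(addend, value);   /* on x86 this compiles to lock xadd */
}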
|
||||
|
|
|
@ -107,10 +107,10 @@ static __forceinline void memset_8( void *dest )
|
|||
case 3:
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"cld\n"
|
||||
// "mov %edi, %0\n"
|
||||
// "mov %eax, %1\n"
|
||||
// "mov edi, %0\n"
|
||||
// "mov eax, %1\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
|
@ -125,10 +125,10 @@ static __forceinline void memset_8( void *dest )
|
|||
case 4:
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"cld\n"
|
||||
// "mov %edi, %0\n"
|
||||
// "mov %eax, %1\n"
|
||||
// "mov edi, %0\n"
|
||||
// "mov eax, %1\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
|
@ -144,10 +144,10 @@ static __forceinline void memset_8( void *dest )
|
|||
case 5:
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"cld\n"
|
||||
// "mov %edi, %0\n"
|
||||
// "mov %eax, %1\n"
|
||||
// "mov edi, %0\n"
|
||||
// "mov eax, %1\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
"stosd\n"
|
||||
|
@ -164,7 +164,7 @@ static __forceinline void memset_8( void *dest )
|
|||
default:
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"cld\n"
|
||||
// "mov ecx, %0\n"
|
||||
// "mov edi, %1\n"
|
||||
|
|
|
@ -504,15 +504,15 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
|
|||
}
|
||||
#else
|
||||
if( VIFdmanum ) {
|
||||
__asm__(".intel_syntax\n"
|
||||
"movaps %%xmm6, xmmword ptr [%0]\n"
|
||||
"movaps %%xmm7, xmmword ptr [%1]\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movaps xmm6, xmmword ptr [%0]\n"
|
||||
"movaps xmm7, xmmword ptr [%1]\n"
|
||||
".att_syntax\n" : :"r"(g_vifRow1), "r"(g_vifCol1) );
|
||||
}
|
||||
else {
|
||||
__asm__(".intel_syntax\n"
|
||||
"movaps %%xmm6, xmmword ptr [%0]\n"
|
||||
"movaps %%xmm7, xmmword ptr [%1]\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movaps xmm6, xmmword ptr [%0]\n"
|
||||
"movaps xmm7, xmmword ptr [%1]\n"
|
||||
".att_syntax\n" : : "r"(g_vifRow0), "r"(g_vifCol0) );
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// microVU.cpp assembly routines
|
||||
// arcum42(@gmail.com)
|
||||
.intel_syntax
|
||||
.intel_syntax noprefix
|
||||
|
||||
.extern mVUexecuteVU0
|
||||
.extern mVUexecuteVU1
|
||||
|
@ -19,14 +19,14 @@ startVU0:
|
|||
call mVUexecuteVU0
|
||||
|
||||
// backup cpu state
|
||||
push %ebx
|
||||
push %ebp
|
||||
push %esi
|
||||
push %edi
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
|
||||
ldmxcsr g_sseVUMXCSR
|
||||
// Should set xmmZ?
|
||||
jmp %eax
|
||||
jmp eax
|
||||
|
||||
// Runs VU1 for number of cycles
|
||||
// void __fastcall startVU1(u32 startPC, u32 cycles)
|
||||
|
@ -35,14 +35,14 @@ startVU01:
|
|||
call mVUexecuteVU1
|
||||
|
||||
// backup cpu state
|
||||
push %ebx
|
||||
push %ebp
|
||||
push %esi
|
||||
push %edi
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
|
||||
ldmxcsr g_sseVUMXCSR
|
||||
|
||||
jmp %eax
|
||||
jmp eax
|
||||
|
||||
// Exit point
|
||||
// void __fastcall endVU0(u32 startPC, u32 cycles)
|
||||
|
@ -51,10 +51,10 @@ endVU0:
|
|||
//call mVUcleanUpVU0
|
||||
|
||||
/*restore cpu state*/
|
||||
pop %edi;
|
||||
pop %esi;
|
||||
pop %ebp;
|
||||
pop %ebx;
|
||||
pop edi;
|
||||
pop esi;
|
||||
pop ebp;
|
||||
pop ebx;
|
||||
|
||||
ldmxcsr g_sseMXCSR
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// iVUzerorec.cpp assembly routines
|
||||
// zerofrog(@gmail.com)
|
||||
.intel_syntax
|
||||
.intel_syntax noprefix
|
||||
|
||||
.extern svudispfntemp
|
||||
.extern s_TotalVUCycles
|
||||
|
@ -17,35 +17,35 @@
|
|||
// SuperVUExecuteProgram(u32 startpc, int vuindex)
|
||||
.globl SuperVUExecuteProgram
|
||||
SuperVUExecuteProgram:
|
||||
mov %eax, [%esp]
|
||||
mov eax, [esp]
|
||||
mov dword ptr s_TotalVUCycles, 0
|
||||
add %esp, 4
|
||||
mov dword ptr [s_callstack], %eax
|
||||
add esp, 4
|
||||
mov dword ptr [s_callstack], eax
|
||||
call SuperVUGetProgram
|
||||
mov s_vu1ebp, %ebp
|
||||
mov s_vu1esi, %esi
|
||||
mov s_vuedi, %edi
|
||||
mov s_vuebx, %ebx
|
||||
mov s_vu1ebp, ebp
|
||||
mov s_vu1esi, esi
|
||||
mov s_vuedi, edi
|
||||
mov s_vuebx, ebx
|
||||
#ifdef _DEBUG
|
||||
mov s_vu1esp, %esp
|
||||
mov s_vu1esp, esp
|
||||
#endif
|
||||
|
||||
ldmxcsr g_sseVUMXCSR
|
||||
mov dword ptr s_writeQ, 0xffffffff
|
||||
mov dword ptr s_writeP, 0xffffffff
|
||||
jmp %eax
|
||||
jmp eax
|
||||
|
||||
.globl SuperVUEndProgram
|
||||
SuperVUEndProgram:
|
||||
// restore cpu state
|
||||
ldmxcsr g_sseMXCSR
|
||||
mov %ebp, s_vu1ebp
|
||||
mov %esi, s_vu1esi
|
||||
mov %edi, s_vuedi
|
||||
mov %ebx, s_vuebx
|
||||
mov ebp, s_vu1ebp
|
||||
mov esi, s_vu1esi
|
||||
mov edi, s_vuedi
|
||||
mov ebx, s_vuebx
|
||||
|
||||
#ifdef _DEBUG
|
||||
sub s_vu1esp, %esp
|
||||
sub s_vu1esp, esp
|
||||
#endif
|
||||
|
||||
call SuperVUCleanupProgram
|
||||
|
@ -54,20 +54,20 @@ SuperVUEndProgram:
|
|||
|
||||
.globl svudispfn
|
||||
svudispfn:
|
||||
mov [g_curdebugvu], %eax
|
||||
mov s_saveecx, %ecx
|
||||
mov s_saveedx, %edx
|
||||
mov s_saveebx, %ebx
|
||||
mov s_saveesi, %esi
|
||||
mov s_saveedi, %edi
|
||||
mov s_saveebp, %ebp
|
||||
mov [g_curdebugvu], eax
|
||||
mov s_saveecx, ecx
|
||||
mov s_saveedx, edx
|
||||
mov s_saveebx, ebx
|
||||
mov s_saveesi, esi
|
||||
mov s_saveedi, edi
|
||||
mov s_saveebp, ebp
|
||||
|
||||
call svudispfntemp
|
||||
|
||||
mov %ecx, s_saveecx
|
||||
mov %edx, s_saveedx
|
||||
mov %ebx, s_saveebx
|
||||
mov %esi, s_saveesi
|
||||
mov %edi, s_saveedi
|
||||
mov %ebp, s_saveebp
|
||||
mov ecx, s_saveecx
|
||||
mov edx, s_saveedx
|
||||
mov ebx, s_saveebx
|
||||
mov esi, s_saveesi
|
||||
mov edi, s_saveedi
|
||||
mov ebp, s_saveebp
|
||||
ret
|
pcsx2/x86/aVif.S (112 changed lines)
|
@ -16,29 +16,29 @@
|
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
*/
|
||||
.intel_syntax
|
||||
.intel_syntax noprefix
|
||||
|
||||
.extern _vifRegs
|
||||
.extern _vifMaskRegs
|
||||
.extern _vifRow
|
||||
|
||||
#define VIF_ESP %esp
|
||||
#define VIF_SRC %esi
|
||||
#define VIF_INC %ecx
|
||||
#define VIF_DST %edi
|
||||
#define VIF_SIZE %edx
|
||||
#define VIF_TMPADDR %eax
|
||||
#define VIF_SAVEEBX %ebx
|
||||
#define VIF_SAVEEBXd %ebx
|
||||
#define VIF_ESP esp
|
||||
#define VIF_SRC esi
|
||||
#define VIF_INC ecx
|
||||
#define VIF_DST edi
|
||||
#define VIF_SIZE edx
|
||||
#define VIF_TMPADDR eax
|
||||
#define VIF_SAVEEBX ebx
|
||||
#define VIF_SAVEEBXd ebx
|
||||
|
||||
#define XMM_R0 %xmm0
|
||||
#define XMM_R1 %xmm1
|
||||
#define XMM_R2 %xmm2
|
||||
#define XMM_WRITEMASK %xmm3
|
||||
#define XMM_ROWMASK %xmm4
|
||||
#define XMM_ROWCOLMASK %xmm5
|
||||
#define XMM_ROW %xmm6
|
||||
#define XMM_COL %xmm7
|
||||
#define XMM_R0 xmm0
|
||||
#define XMM_R1 xmm1
|
||||
#define XMM_R2 xmm2
|
||||
#define XMM_WRITEMASK xmm3
|
||||
#define XMM_ROWMASK xmm4
|
||||
#define XMM_ROWCOLMASK xmm5
|
||||
#define XMM_ROW xmm6
|
||||
#define XMM_COL xmm7
|
||||
|
||||
#define XMM_R3 XMM_COL
|
||||
|
||||
|
@ -1189,35 +1189,35 @@
|
|||
.extern s_TempDecompress
|
||||
|
||||
#define DECOMPRESS_RGBA(OFFSET) \
|
||||
mov %bl, %al; \
|
||||
shl %bl, 3; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET], %bl; \
|
||||
mov bl, al; \
|
||||
shl bl, 3; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET], bl; \
|
||||
\
|
||||
mov %bx, %ax; \
|
||||
shr %bx, 2; \
|
||||
and %bx, 0xf8; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+1], %bl; \
|
||||
mov bx, ax; \
|
||||
shr bx, 2; \
|
||||
and bx, 0xf8; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+1], bl; \
|
||||
\
|
||||
mov %bx, %ax; \
|
||||
shr %bx, 7; \
|
||||
and %bx, 0xf8; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+2], %bl; \
|
||||
mov %bx, %ax; \
|
||||
shr %bx, 8; \
|
||||
and %bx, 0x80; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+3], %bl; \
|
||||
mov bx, ax; \
|
||||
shr bx, 7; \
|
||||
and bx, 0xf8; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+2], bl; \
|
||||
mov bx, ax; \
|
||||
shr bx, 8; \
|
||||
and bx, 0x80; \
|
||||
mov byte ptr [s_TempDecompress+OFFSET+3], bl; \
|
||||
|
||||
#define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) \
|
||||
mov %eax, dword ptr [VIF_SRC]; \
|
||||
mov eax, dword ptr [VIF_SRC]; \
|
||||
DECOMPRESS_RGBA(0); \
|
||||
\
|
||||
shr %eax, 16; \
|
||||
shr eax, 16; \
|
||||
DECOMPRESS_RGBA(4); \
|
||||
\
|
||||
mov %eax, dword ptr [VIF_SRC+4]; \
|
||||
mov eax, dword ptr [VIF_SRC+4]; \
|
||||
DECOMPRESS_RGBA(8); \
|
||||
\
|
||||
shr %eax, 16; \
|
||||
shr eax, 16; \
|
||||
DECOMPRESS_RGBA(12); \
|
||||
\
|
||||
movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \
|
||||
|
@ -1242,13 +1242,13 @@
|
|||
#define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4
|
||||
|
||||
#define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) \
|
||||
mov %eax, dword ptr [VIF_SRC]; \
|
||||
mov eax, dword ptr [VIF_SRC]; \
|
||||
DECOMPRESS_RGBA(0); \
|
||||
\
|
||||
shr %eax, 16; \
|
||||
shr eax, 16; \
|
||||
DECOMPRESS_RGBA(4); \
|
||||
\
|
||||
mov %eax, dword ptr [VIF_SRC]; \
|
||||
mov eax, dword ptr [VIF_SRC]; \
|
||||
DECOMPRESS_RGBA(8); \
|
||||
\
|
||||
movdqa XMM_R0, xmmword ptr [s_TempDecompress]; \
|
||||
|
@ -1271,10 +1271,10 @@
|
|||
#define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3
|
||||
|
||||
#define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) \
|
||||
mov %eax, dword ptr [VIF_SRC]; \
|
||||
mov eax, dword ptr [VIF_SRC]; \
|
||||
DECOMPRESS_RGBA(0); \
|
||||
\
|
||||
shr %eax, 16; \
|
||||
shr eax, 16; \
|
||||
DECOMPRESS_RGBA(4); \
|
||||
\
|
||||
movq XMM_R0, qword ptr [s_TempDecompress]; \
|
||||
|
@ -1294,7 +1294,7 @@
|
|||
#define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2
|
||||
|
||||
#define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) \
|
||||
mov %ax, word ptr [VIF_SRC]; \
|
||||
mov ax, word ptr [VIF_SRC]; \
|
||||
DECOMPRESS_RGBA(0) \
|
||||
\
|
||||
movd XMM_R0, dword ptr [s_TempDecompress]; \
|
||||
|
@ -1327,20 +1327,20 @@
|
|||
|
||||
// 32 bit versions have the args on the stack
|
||||
#define INIT_ARGS() \
|
||||
push %edi; \
|
||||
push %esi; \
|
||||
push %ebx; \
|
||||
mov VIF_DST, dword ptr [%esp+4+12]; \
|
||||
mov VIF_SRC, dword ptr [%esp+8+12]; \
|
||||
mov VIF_SIZE, dword ptr [%esp+12+12]; \
|
||||
push edi; \
|
||||
push esi; \
|
||||
push ebx; \
|
||||
mov VIF_DST, dword ptr [esp+4+12]; \
|
||||
mov VIF_SRC, dword ptr [esp+8+12]; \
|
||||
mov VIF_SIZE, dword ptr [esp+12+12]; \
|
||||
|
||||
|
||||
#define POP_REGS() \
|
||||
pop %ebx; \
|
||||
pop %esi; \
|
||||
pop %edi; \
|
||||
pop ebx; \
|
||||
pop esi; \
|
||||
pop edi; \
|
||||
|
||||
#define INC_STACK(reg) add %esp, 4;
|
||||
#define INC_STACK(reg) add esp, 4;
|
||||
|
||||
// qsize - bytes of compressed size of 1 decompressed xmmword
|
||||
// int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize)
|
||||
|
@ -1431,7 +1431,7 @@ name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec: \
|
|||
sub VIF_SIZE, qsize; \
|
||||
name##_##sign##_##MaskType##_##ModeType##_C1_Done3: \
|
||||
SAVE_ROW_REG; \
|
||||
mov %eax, VIF_SIZE; \
|
||||
mov eax, VIF_SIZE; \
|
||||
POP_REGS(); \
|
||||
ret; \
|
||||
\
|
||||
|
@ -1460,7 +1460,7 @@ name##_##sign##_##MaskType##_##ModeType##_C2_Done3: \
|
|||
name##_##sign##_##MaskType##_##ModeType##_C2_Done4: \
|
||||
\
|
||||
SAVE_ROW_REG; \
|
||||
mov %eax, VIF_SIZE; \
|
||||
mov eax, VIF_SIZE; \
|
||||
POP_REGS(); \
|
||||
ret; \
|
||||
\
|
||||
|
@ -1497,7 +1497,7 @@ name##_##sign##_##MaskType##_##ModeType##_C3_Done3: \
|
|||
UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
|
||||
name##_##sign##_##MaskType##_##ModeType##_C3_Done4: \
|
||||
SAVE_ROW_REG; \
|
||||
mov %eax, VIF_SIZE; \
|
||||
mov eax, VIF_SIZE; \
|
||||
POP_REGS(); \
|
||||
ret; \
|
||||
\
|
||||
|
@ -1552,7 +1552,7 @@ name##_##sign##_##MaskType##_##ModeType##_C4_Done: \
|
|||
\
|
||||
SAVE_ROW_REG; \
|
||||
INC_STACK(); \
|
||||
mov %eax, VIF_SIZE; \
|
||||
mov eax, VIF_SIZE; \
|
||||
POP_REGS(); \
|
||||
ret; \
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
// Fast assembly routines for x86-64
|
||||
// zerofrog(@gmail.com)
|
||||
// and added to by arcum42@gmail.com
|
||||
.intel_syntax
|
||||
.intel_syntax noprefix
|
||||
.extern g_EEFreezeRegs
|
||||
.extern FreezeMMXRegs_
|
||||
.extern _mmx_backup
|
||||
|
@ -36,9 +36,9 @@
|
|||
// ~10 times faster than standard memcmp
|
||||
// (zerofrog)
|
||||
// u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
|
||||
#define MEMCMP_SRC1 %edx
|
||||
#define MEMCMP_SRC2 %esi
|
||||
#define MEMCMP_SIZE %ecx
|
||||
#define MEMCMP_SRC1 edx
|
||||
#define MEMCMP_SRC2 esi
|
||||
#define MEMCMP_SIZE ecx
|
||||
|
||||
.globl memcmp_mmx
|
||||
memcmp_mmx:
|
||||
|
@ -48,82 +48,82 @@ memcmp_mmx:
|
|||
je memcmp_mmx_begin
|
||||
push 1
|
||||
call FreezeMMXRegs_
|
||||
add %esp, 4
|
||||
add esp, 4
|
||||
|
||||
memcmp_mmx_begin:
|
||||
push %esi
|
||||
mov MEMCMP_SRC1, dword ptr [%esp+8]
|
||||
mov MEMCMP_SRC2, dword ptr [%esp+12]
|
||||
mov MEMCMP_SIZE, dword ptr [%esp+16]
|
||||
push esi
|
||||
mov MEMCMP_SRC1, dword ptr [esp+8]
|
||||
mov MEMCMP_SRC2, dword ptr [esp+12]
|
||||
mov MEMCMP_SIZE, dword ptr [esp+16]
|
||||
|
||||
cmp MEMCMP_SIZE, 32
|
||||
jl memcmp_Done4
|
||||
|
||||
// custom test first 8 to make sure things are ok
|
||||
movq %mm0, [MEMCMP_SRC2]
|
||||
movq %mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd %mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd %mm1, [MEMCMP_SRC1+8]
|
||||
pand %mm0, %mm1
|
||||
movq %mm2, [MEMCMP_SRC2+16]
|
||||
pmovmskb %eax, %mm0
|
||||
movq %mm3, [MEMCMP_SRC2+24]
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pand mm0, mm1
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
pmovmskb eax, mm0
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_NextComp
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_NextComp:
|
||||
pcmpeqd %mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd %mm3, [MEMCMP_SRC1+24]
|
||||
pand %mm2, %mm3
|
||||
pmovmskb %eax, %mm2
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm2, mm3
|
||||
pmovmskb eax, mm2
|
||||
|
||||
sub MEMCMP_SIZE, 32
|
||||
add MEMCMP_SRC2, 32
|
||||
add MEMCMP_SRC1, 32
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_ContinueTest
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
cmp MEMCMP_SIZE, 64
|
||||
jl memcmp_Done8
|
||||
|
||||
memcmp_Cmp8:
|
||||
movq %mm0, [MEMCMP_SRC2]
|
||||
movq %mm1, [MEMCMP_SRC2+8]
|
||||
movq %mm2, [MEMCMP_SRC2+16]
|
||||
movq %mm3, [MEMCMP_SRC2+24]
|
||||
movq %mm4, [MEMCMP_SRC2+32]
|
||||
movq %mm5, [MEMCMP_SRC2+40]
|
||||
movq %mm6, [MEMCMP_SRC2+48]
|
||||
movq %mm7, [MEMCMP_SRC2+56]
|
||||
pcmpeqd %mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd %mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd %mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd %mm3, [MEMCMP_SRC1+24]
|
||||
pand %mm0, %mm1
|
||||
pcmpeqd %mm4, [MEMCMP_SRC1+32]
|
||||
pand %mm0, %mm2
|
||||
pcmpeqd %mm5, [MEMCMP_SRC1+40]
|
||||
pand %mm0, %mm3
|
||||
pcmpeqd %mm6, [MEMCMP_SRC1+48]
|
||||
pand %mm0, %mm4
|
||||
pcmpeqd %mm7, [MEMCMP_SRC1+56]
|
||||
pand %mm0, %mm5
|
||||
pand %mm0, %mm6
|
||||
pand %mm0, %mm7
|
||||
pmovmskb %eax, %mm0
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
movq mm4, [MEMCMP_SRC2+32]
|
||||
movq mm5, [MEMCMP_SRC2+40]
|
||||
movq mm6, [MEMCMP_SRC2+48]
|
||||
movq mm7, [MEMCMP_SRC2+56]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm0, mm1
|
||||
pcmpeqd mm4, [MEMCMP_SRC1+32]
|
||||
pand mm0, mm2
|
||||
pcmpeqd mm5, [MEMCMP_SRC1+40]
|
||||
pand mm0, mm3
|
||||
pcmpeqd mm6, [MEMCMP_SRC1+48]
|
||||
pand mm0, mm4
|
||||
pcmpeqd mm7, [MEMCMP_SRC1+56]
|
||||
pand mm0, mm5
|
||||
pand mm0, mm6
|
||||
pand mm0, mm7
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_Continue
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Continue:
|
||||
|
@ -137,93 +137,93 @@ memcmp_ContinueTest:
|
|||
memcmp_Done8:
|
||||
test MEMCMP_SIZE, 0x20
|
||||
jz memcmp_Done4
|
||||
movq %mm0, [MEMCMP_SRC2]
|
||||
movq %mm1, [MEMCMP_SRC2+8]
|
||||
movq %mm2, [MEMCMP_SRC2+16]
|
||||
movq %mm3, [MEMCMP_SRC2+24]
|
||||
pcmpeqd %mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd %mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd %mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd %mm3, [MEMCMP_SRC1+24]
|
||||
pand %mm0, %mm1
|
||||
pand %mm0, %mm2
|
||||
pand %mm0, %mm3
|
||||
pmovmskb %eax, %mm0
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pand mm0, mm3
|
||||
pmovmskb eax, mm0
|
||||
sub MEMCMP_SIZE, 32
|
||||
add MEMCMP_SRC2, 32
|
||||
add MEMCMP_SRC1, 32
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done4
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done4:
|
||||
cmp MEMCMP_SIZE, 24
|
||||
jne memcmp_Done2
|
||||
movq %mm0, [MEMCMP_SRC2]
|
||||
movq %mm1, [MEMCMP_SRC2+8]
|
||||
movq %mm2, [MEMCMP_SRC2+16]
|
||||
pcmpeqd %mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd %mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd %mm2, [MEMCMP_SRC1+16]
|
||||
pand %mm0, %mm1
|
||||
pand %mm0, %mm2
|
||||
pmovmskb %eax, %mm0
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done2:
|
||||
cmp MEMCMP_SIZE, 16
|
||||
jne memcmp_Done1
|
||||
|
||||
movq %mm0, [MEMCMP_SRC2]
|
||||
movq %mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd %mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd %mm1, [MEMCMP_SRC1+8]
|
||||
pand %mm0, %mm1
|
||||
pmovmskb %eax, %mm0
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pand mm0, mm1
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp %eax, 0xff
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done1:
|
||||
cmp MEMCMP_SIZE, 8
|
||||
jne memcmp_Done
|
||||
|
||||
mov %eax, [MEMCMP_SRC2]
|
||||
mov eax, [MEMCMP_SRC2]
|
||||
mov MEMCMP_SRC2, [MEMCMP_SRC2+4]
|
||||
cmp %eax, [MEMCMP_SRC1]
|
||||
cmp eax, [MEMCMP_SRC1]
|
||||
je memcmp_Next
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Next:
|
||||
cmp MEMCMP_SRC2, [MEMCMP_SRC1+4]
|
||||
je memcmp_Done
|
||||
mov %eax, 1
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done:
|
||||
xor %eax, %eax
|
||||
xor eax, eax
|
||||
|
||||
memcmp_End:
|
||||
emms
|
||||
pop %esi
|
||||
pop esi
|
||||
ret
|
||||
|
||||
// memxor_mmx
|
||||
#define MEMXOR_SRC1 %edx
|
||||
#define MEMXOR_SRC2 %esi
|
||||
#define MEMXOR_SIZE %ecx
|
||||
#define MEMXOR_SRC1 edx
|
||||
#define MEMXOR_SRC2 esi
|
||||
#define MEMXOR_SIZE ecx
|
||||
|
||||
.globl memxor_mmx
|
||||
memxor_mmx:
|
||||
|
@ -233,38 +233,38 @@ memxor_mmx:
|
|||
je memxor_mmx_begin
|
||||
push 1
|
||||
call FreezeMMXRegs_
|
||||
add %esp, 4
|
||||
add esp, 4
|
||||
|
||||
memxor_mmx_begin:
|
||||
push %esi
|
||||
mov MEMXOR_SRC1, dword ptr [%esp+8]
|
||||
mov MEMXOR_SRC2, dword ptr [%esp+12]
|
||||
mov MEMXOR_SIZE, dword ptr [%esp+16]
|
||||
push esi
|
||||
mov MEMXOR_SRC1, dword ptr [esp+8]
|
||||
mov MEMXOR_SRC2, dword ptr [esp+12]
|
||||
mov MEMXOR_SIZE, dword ptr [esp+16]
|
||||
cmp MEMXOR_SIZE, 64
|
||||
jl memxor_Setup4
|
||||
|
||||
movq %mm0, [MEMXOR_SRC2]
|
||||
movq %mm1, [MEMXOR_SRC2+8]
|
||||
movq %mm2, [MEMXOR_SRC2+16]
|
||||
movq %mm3, [MEMXOR_SRC2+24]
|
||||
movq %mm4, [MEMXOR_SRC2+32]
|
||||
movq %mm5, [MEMXOR_SRC2+40]
|
||||
movq %mm6, [MEMXOR_SRC2+48]
|
||||
movq %mm7, [MEMXOR_SRC2+56]
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
movq mm2, [MEMXOR_SRC2+16]
|
||||
movq mm3, [MEMXOR_SRC2+24]
|
||||
movq mm4, [MEMXOR_SRC2+32]
|
||||
movq mm5, [MEMXOR_SRC2+40]
|
||||
movq mm6, [MEMXOR_SRC2+48]
|
||||
movq mm7, [MEMXOR_SRC2+56]
|
||||
sub MEMXOR_SIZE, 64
|
||||
add MEMXOR_SRC2, 64
|
||||
cmp MEMXOR_SIZE, 64
|
||||
jl memxor_End8
|
||||
|
||||
memxor_Cmp8:
|
||||
pxor %mm0, [MEMXOR_SRC2]
|
||||
pxor %mm1, [MEMXOR_SRC2+8]
|
||||
pxor %mm2, [MEMXOR_SRC2+16]
|
||||
pxor %mm3, [MEMXOR_SRC2+24]
|
||||
pxor %mm4, [MEMXOR_SRC2+32]
|
||||
pxor %mm5, [MEMXOR_SRC2+40]
|
||||
pxor %mm6, [MEMXOR_SRC2+48]
|
||||
pxor %mm7, [MEMXOR_SRC2+56]
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
pxor mm2, [MEMXOR_SRC2+16]
|
||||
pxor mm3, [MEMXOR_SRC2+24]
|
||||
pxor mm4, [MEMXOR_SRC2+32]
|
||||
pxor mm5, [MEMXOR_SRC2+40]
|
||||
pxor mm6, [MEMXOR_SRC2+48]
|
||||
pxor mm7, [MEMXOR_SRC2+56]
|
||||
|
||||
sub MEMXOR_SIZE, 64
|
||||
add MEMXOR_SRC2, 64
|
||||
|
@ -272,17 +272,17 @@ memxor_Cmp8:
|
|||
jge memxor_Cmp8
|
||||
|
||||
memxor_End8:
|
||||
pxor %mm0, %mm4
|
||||
pxor %mm1, %mm5
|
||||
pxor %mm2, %mm6
|
||||
pxor %mm3, %mm7
|
||||
pxor mm0, mm4
|
||||
pxor mm1, mm5
|
||||
pxor mm2, mm6
|
||||
pxor mm3, mm7
|
||||
|
||||
cmp MEMXOR_SIZE, 32
|
||||
jl memxor_End4
|
||||
pxor %mm0, [MEMXOR_SRC2]
|
||||
pxor %mm1, [MEMXOR_SRC2+8]
|
||||
pxor %mm2, [MEMXOR_SRC2+16]
|
||||
pxor %mm3, [MEMXOR_SRC2+24]
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
pxor mm2, [MEMXOR_SRC2+16]
|
||||
pxor mm3, [MEMXOR_SRC2+24]
|
||||
sub MEMXOR_SIZE, 32
|
||||
add MEMXOR_SRC2, 32
|
||||
jmp memxor_End4
|
||||
|
@ -291,21 +291,21 @@ memxor_Setup4:
|
|||
cmp MEMXOR_SIZE, 32
|
||||
jl memxor_Setup2
|
||||
|
||||
movq %mm0, [MEMXOR_SRC2]
|
||||
movq %mm1, [MEMXOR_SRC2+8]
|
||||
movq %mm2, [MEMXOR_SRC2+16]
|
||||
movq %mm3, [MEMXOR_SRC2+24]
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
movq mm2, [MEMXOR_SRC2+16]
|
||||
movq mm3, [MEMXOR_SRC2+24]
|
||||
sub MEMXOR_SIZE, 32
|
||||
add MEMXOR_SRC2, 32
|
||||
|
||||
memxor_End4:
|
||||
pxor %mm0, %mm2
|
||||
pxor %mm1, %mm3
|
||||
pxor mm0, mm2
|
||||
pxor mm1, mm3
|
||||
|
||||
cmp MEMXOR_SIZE, 16
|
||||
jl memxor_End2
|
||||
pxor %mm0, [MEMXOR_SRC2]
|
||||
pxor %mm1, [MEMXOR_SRC2+8]
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
sub MEMXOR_SIZE, 16
|
||||
add MEMXOR_SRC2, 16
|
||||
jmp memxor_End2
|
||||
|
@ -314,56 +314,56 @@ memxor_Setup2:
|
|||
cmp MEMXOR_SIZE, 16
|
||||
jl memxor_Setup1
|
||||
|
||||
movq %mm0, [MEMXOR_SRC2]
|
||||
movq %mm1, [MEMXOR_SRC2+8]
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
sub MEMXOR_SIZE, 16
|
||||
add MEMXOR_SRC2, 16
|
||||
|
||||
memxor_End2:
|
||||
pxor %mm0, %mm1
|
||||
pxor mm0, mm1
|
||||
|
||||
cmp MEMXOR_SIZE, 8
|
||||
jl memxor_End1
|
||||
pxor %mm0, [MEMXOR_SRC2]
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
memxor_End1:
|
||||
movq [MEMXOR_SRC1], %mm0
|
||||
movq [MEMXOR_SRC1], mm0
|
||||
jmp memxor_End
|
||||
|
||||
memxor_Setup1:
|
||||
movq %mm0, [MEMXOR_SRC2]
|
||||
movq [MEMXOR_SRC1], %mm0
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq [MEMXOR_SRC1], mm0
|
||||
memxor_End:
|
||||
emms
|
||||
pop %esi
|
||||
pop esi
|
||||
ret
|
||||
|
||||
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
|
||||
.globl memcpy_amd_
|
||||
memcpy_amd_:
|
||||
push %edi
|
||||
push %esi
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov %edi, %ecx // destination
|
||||
mov %esi, %edx // source
|
||||
mov %ecx, [%esp+12] // number of bytes to copy
|
||||
mov %eax, %ecx // keep a copy of count
|
||||
mov edi, ecx // destination
|
||||
mov esi, edx // source
|
||||
mov ecx, [esp+12] // number of bytes to copy
|
||||
mov eax, ecx // keep a copy of count
|
||||
|
||||
cld
|
||||
cmp %eax, TINY_BLOCK_COPY
|
||||
cmp eax, TINY_BLOCK_COPY
|
||||
jb $memcpy_ic_3 // tiny? skip mmx copy
|
||||
|
||||
cmp %eax, 32*1024 // don't align between 32k-64k because
|
||||
cmp eax, 32*1024 // don't align between 32k-64k because
|
||||
jbe $memcpy_do_align // it appears to be slower
|
||||
cmp %eax, 64*1024
|
||||
cmp eax, 64*1024
|
||||
jbe $memcpy_align_done
|
||||
$memcpy_do_align:
|
||||
mov %eax, 8 // a trick that's faster than rep movsb...
|
||||
sub %eax, %edi // align destination to qword
|
||||
andb %eax, 111 // get the low bits
|
||||
sub %ecx, %eax // update copy count
|
||||
neg %eax // set up to jump into the array
|
||||
add %eax, offset $memcpy_align_done
|
||||
jmp %eax // jump to array of movsb's
|
||||
mov eax, 8 // a trick that's faster than rep movsb...
|
||||
sub eax, edi // align destination to qword
|
||||
andb eax, 111 // get the low bits
|
||||
sub ecx, eax // update copy count
|
||||
neg eax // set up to jump into the array
|
||||
add eax, offset $memcpy_align_done
|
||||
jmp eax // jump to array of movsb's
|
||||
|
||||
.align 4
|
||||
movsb
|
||||
|
@ -376,17 +376,17 @@ $memcpy_do_align:
|
|||
movsb
|
||||
|
||||
$memcpy_align_done: // destination is dword aligned
|
||||
mov %eax, %ecx // number of bytes left to copy
|
||||
shr %eax, 6 // get 64-byte block count
|
||||
mov eax, ecx // number of bytes left to copy
|
||||
shr eax, 6 // get 64-byte block count
|
||||
jz $memcpy_ic_2 // finish the last few bytes
|
||||
|
||||
cmp %eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
|
||||
cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
|
||||
jae $memcpy_uc_test
|
||||
|
||||
movq [_mmx_backup+0x00],%mm0
|
||||
movq [_mmx_backup+0x08],%mm1
|
||||
movq [_mmx_backup+0x10],%mm2
|
||||
movq [_mmx_backup+0x18],%mm3
|
||||
movq [_mmx_backup+0x00],mm0
|
||||
movq [_mmx_backup+0x08],mm1
|
||||
movq [_mmx_backup+0x10],mm2
|
||||
movq [_mmx_backup+0x18],mm3
|
||||
|
||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
||||
|
@ -394,49 +394,49 @@ $memcpy_align_done: // destination is dword aligned
|
|||
.align 16
|
||||
$memcpy_ic_1: // 64-byte block copies, in-cache copy
|
||||
|
||||
prefetchnta [%esi + (200*64/34+192)] // start reading ahead
|
||||
prefetchnta [esi + (200*64/34+192)] // start reading ahead
|
||||
|
||||
movq %mm0, [%esi+0] // read 64 bits
|
||||
movq %mm1, [%esi+8]
|
||||
movq [%edi+0], %mm0 //write 64 bits
|
||||
movq [%edi+8], %mm1 // note: the normal movq writes the
|
||||
movq %mm2, [%esi+16] // data to cache; a cache line will be
|
||||
movq %mm3, [%esi+24] // allocated as needed, to store the data
|
||||
movq [%edi+16], %mm2
|
||||
movq [%edi+24], %mm3
|
||||
movq %mm0, [%esi+32]
|
||||
movq %mm1, [%esi+40]
|
||||
movq [%edi+32], %mm0
|
||||
movq [%edi+40], %mm1
|
||||
movq %mm2, [%esi+48]
|
||||
movq %mm3, [%esi+56]
|
||||
movq [%edi+48], %mm2
|
||||
movq [%edi+56], %mm3
|
||||
movq mm0, [esi+0] // read 64 bits
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0 //write 64 bits
|
||||
movq [edi+8], mm1 // note: the normal movq writes the
|
||||
movq mm2, [esi+16] // data to cache; a cache line will be
|
||||
movq mm3, [esi+24] // allocated as needed, to store the data
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
|
||||
add %esi, 64 // update source pointer
|
||||
add %edi, 64 // update destination pointer
|
||||
dec %eax // count down
|
||||
add esi, 64 // update source pointer
|
||||
add edi, 64 // update destination pointer
|
||||
dec eax // count down
|
||||
jnz $memcpy_ic_1 // last 64-byte block?
|
||||
|
||||
movq %mm0,[_mmx_backup+0x00]
|
||||
movq %mm1,[_mmx_backup+0x08]
|
||||
movq %mm2,[_mmx_backup+0x10]
|
||||
movq %mm3,[_mmx_backup+0x18]
|
||||
movq mm0,[_mmx_backup+0x00]
|
||||
movq mm1,[_mmx_backup+0x08]
|
||||
movq mm2,[_mmx_backup+0x10]
|
||||
movq mm3,[_mmx_backup+0x18]
|
||||
|
||||
$memcpy_ic_2:
|
||||
mov %eax, %ecx // has valid low 6 bits of the byte count
|
||||
mov eax, ecx // has valid low 6 bits of the byte count
|
||||
$memcpy_ic_3:
|
||||
shr %eax, 2 // dword count
|
||||
andb %eax, 1111 // only look at the "remainder" bits
|
||||
neg %eax // set up to jump into the array
|
||||
add %eax, offset $memcpy_last_few
|
||||
jmp %eax // jump to array of movsd's
|
||||
shr eax, 2 // dword count
|
||||
andb eax, 1111 // only look at the "remainder" bits
|
||||
neg eax // set up to jump into the array
|
||||
add eax, offset $memcpy_last_few
|
||||
jmp eax // jump to array of movsd's
|
||||
|
||||
$memcpy_uc_test:
|
||||
// cmp %ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
|
||||
// cmp ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
|
||||
// jae $memcpy_bp_1
|
||||
//$memcpy_64_test:
|
||||
or %eax, %eax // tail end of block prefetch will jump here
|
||||
or eax, eax // tail end of block prefetch will jump here
|
||||
jz $memcpy_ic_2 // no more 64-byte blocks left
|
||||
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
|
@ -444,39 +444,39 @@ $memcpy_uc_test:
|
|||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
|
||||
movq [_mmx_backup+0x00],%mm0
|
||||
movq [_mmx_backup+0x08],%mm1
|
||||
movq [_mmx_backup+0x10],%mm2
|
||||
movq [_mmx_backup+0x00],mm0
|
||||
movq [_mmx_backup+0x08],mm1
|
||||
movq [_mmx_backup+0x10],mm2
|
||||
|
||||
.align 16
|
||||
$memcpy_uc_1: // 64-byte blocks, uncached copy
|
||||
|
||||
prefetchnta [%esi + (200*64/34+192)] // start reading ahead
|
||||
prefetchnta [esi + (200*64/34+192)] // start reading ahead
|
||||
|
||||
movq %mm0,[%esi+0] // read 64 bits
|
||||
add %edi,64 // update destination pointer
|
||||
movq %mm1,[%esi+8]
|
||||
add %esi,64 // update source pointer
|
||||
movq %mm2,[%esi-48]
|
||||
movntq [%edi-64], %mm0 // write 64 bits, bypassing the cache
|
||||
movq %mm0,[%esi-40] // note: movntq also prevents the CPU
|
||||
movntq [%edi-56], %mm1 // from READING the destination address
|
||||
movq %mm1,[%esi-32] // into the cache, only to be over-written
|
||||
movntq [%edi-48], %mm2 // so that also helps performance
|
||||
movq %mm2,[%esi-24]
|
||||
movntq [%edi-40], %mm0
|
||||
movq %mm0,[%esi-16]
|
||||
movntq [%edi-32], %mm1
|
||||
movq %mm1,[%esi-8]
|
||||
movntq [%edi-24], %mm2
|
||||
movntq [%edi-16], %mm0
|
||||
dec %eax
|
||||
movntq [%edi-8], %mm1
|
||||
movq mm0,[esi+0] // read 64 bits
|
||||
add edi,64 // update destination pointer
|
||||
movq mm1,[esi+8]
|
||||
add esi,64 // update source pointer
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0 // write 64 bits, bypassing the cache
|
||||
movq mm0,[esi-40] // note: movntq also prevents the CPU
|
||||
movntq [edi-56], mm1 // from READING the destination address
|
||||
movq mm1,[esi-32] // into the cache, only to be over-written
|
||||
movntq [edi-48], mm2 // so that also helps performance
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16],mm0
|
||||
dec eax
|
||||
movntq [edi-8], mm1
|
||||
jnz $memcpy_uc_1 // last 64-byte block?
|
||||
|
||||
movq %mm0,[_mmx_backup+0x00]
|
||||
movq %mm1,[_mmx_backup+0x08]
|
||||
movq %mm2,[_mmx_backup+0x10]
|
||||
movq mm0,[_mmx_backup+0x00]
|
||||
movq mm1,[_mmx_backup+0x08]
|
||||
movq mm2,[_mmx_backup+0x10]
|
||||
|
||||
jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
|
||||
|
||||
|
@ -511,17 +511,17 @@ $memcpy_uc_1: // 64-byte blocks, uncached copy
|
|||
movsd
|
||||
|
||||
$memcpy_last_few: // dword aligned from before movsd's
|
||||
mov %eax, %ecx // has valid low 2 bits of the byte count
|
||||
andb %eax, 11 // the last few cows must come home
|
||||
mov eax, ecx // has valid low 2 bits of the byte count
|
||||
andb eax, 11 // the last few cows must come home
|
||||
jz $memcpy_final // no more, let's leave
|
||||
rep movsb // the last 1, 2, or 3 bytes
|
||||
|
||||
$memcpy_final:
|
||||
emms // clean up the MMX state
|
||||
sfence // flush the write buffer
|
||||
//mov %eax, [dest] // ret value = destination pointer
|
||||
//mov eax, [dest] // ret value = destination pointer
|
||||
|
||||
pop %esi
|
||||
pop %edi
|
||||
pop esi
|
||||
pop edi
|
||||
|
||||
ret 4
|
|
@ -824,18 +824,18 @@ static s32 recExecuteBlock( s32 eeCycles )
|
|||
#else
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
"push %ebx\n"
|
||||
"push %esi\n"
|
||||
"push %edi\n"
|
||||
"push %ebp\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"push ebx\n"
|
||||
"push esi\n"
|
||||
"push edi\n"
|
||||
"push ebp\n"
|
||||
|
||||
"call iopDispatcherReg\n"
|
||||
|
||||
"pop %ebp\n"
|
||||
"pop %edi\n"
|
||||
"pop %esi\n"
|
||||
"pop %ebx\n"
|
||||
"pop ebp\n"
|
||||
"pop edi\n"
|
||||
"pop esi\n"
|
||||
"pop ebx\n"
|
||||
".att_syntax\n"
|
||||
);
|
||||
#endif
|
||||
|
|
|
@ -96,7 +96,7 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
|||
|
||||
|
||||
#else // gcc
|
||||
|
||||
// Is this really supposed to be assembly for gcc and C for Windows?
|
||||
void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
||||
{
|
||||
u32 i;
|
||||
|
@ -112,23 +112,23 @@ void __fastcall SetNewMask(u32* vif1masks, u32* hasmask, u32 mask, u32 oldmask)
|
|||
u8* p0 = (u8*)&s_maskarr[mask&15][0];
|
||||
u8* p1 = (u8*)&s_maskarr[(mask>>4)&15][0];
|
||||
|
||||
__asm__(".intel_syntax\n"
|
||||
"movaps %%xmm0, [%0]\n"
|
||||
"movaps %%xmm1, [%1]\n"
|
||||
"movaps %%xmm2, %%xmm0\n"
|
||||
"punpcklwd %%xmm0, %%xmm0\n"
|
||||
"punpckhwd %%xmm2, %%xmm2\n"
|
||||
"movaps %%xmm3, %%xmm1\n"
|
||||
"punpcklwd %%xmm1, %%xmm1\n"
|
||||
"punpckhwd %%xmm3, %%xmm3\n"
|
||||
"movq [%2], %%xmm0\n"
|
||||
"movq [%2+8], %%xmm1\n"
|
||||
"movhps [%2+16], %%xmm0\n"
|
||||
"movhps [%2+24], %%xmm1\n"
|
||||
"movq [%2+32], %%xmm2\n"
|
||||
"movq [%2+40], %%xmm3\n"
|
||||
"movhps [%2+48], %%xmm2\n"
|
||||
"movhps [%2+56], %%xmm3\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movaps xmm0, [%0]\n"
|
||||
"movaps xmm1, [%1]\n"
|
||||
"movaps xmm2, xmm0\n"
|
||||
"punpcklwd xmm0, xmm0\n"
|
||||
"punpckhwd xmm2, xmm2\n"
|
||||
"movaps xmm3, xmm1\n"
|
||||
"punpcklwd xmm1, xmm1\n"
|
||||
"punpckhwd xmm3, xmm3\n"
|
||||
"movq [%2], xmm0\n"
|
||||
"movq [%2+8], xmm1\n"
|
||||
"movhps [%2+16], xmm0\n"
|
||||
"movhps [%2+24], xmm1\n"
|
||||
"movq [%2+32], xmm2\n"
|
||||
"movq [%2+40], xmm3\n"
|
||||
"movhps [%2+48], xmm2\n"
|
||||
"movhps [%2+56], xmm3\n"
|
||||
".att_syntax\n" : : "r"(p0), "r"(p1), "r"(vif1masks) );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -800,18 +800,18 @@ __forceinline void recExecute()
|
|||
g_EEFreezeRegs = true;
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
"push %ebx\n"
|
||||
"push %esi\n"
|
||||
"push %edi\n"
|
||||
"push %ebp\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"push ebx\n"
|
||||
"push esi\n"
|
||||
"push edi\n"
|
||||
"push ebp\n"
|
||||
|
||||
"call DispatcherReg\n"
|
||||
|
||||
"pop %ebp\n"
|
||||
"pop %edi\n"
|
||||
"pop %esi\n"
|
||||
"pop %ebx\n"
|
||||
"pop ebp\n"
|
||||
"pop edi\n"
|
||||
"pop esi\n"
|
||||
"pop ebx\n"
|
||||
".att_syntax\n"
|
||||
);
|
||||
g_EEFreezeRegs = false;
|
||||
|
@ -824,18 +824,18 @@ static void recExecuteBlock()
|
|||
g_EEFreezeRegs = true;
|
||||
__asm__
|
||||
(
|
||||
".intel_syntax\n"
|
||||
"push %ebx\n"
|
||||
"push %esi\n"
|
||||
"push %edi\n"
|
||||
"push %ebp\n"
|
||||
".intel_syntax noprefix\n"
|
||||
"push ebx\n"
|
||||
"push esi\n"
|
||||
"push edi\n"
|
||||
"push ebp\n"
|
||||
|
||||
"call DispatcherReg\n"
|
||||
|
||||
"pop %ebp\n"
|
||||
"pop %edi\n"
|
||||
"pop %esi\n"
|
||||
"pop %ebx\n"
|
||||
"pop ebp\n"
|
||||
"pop edi\n"
|
||||
"pop esi\n"
|
||||
"pop ebx\n"
|
||||
".att_syntax\n"
|
||||
);
|
||||
g_EEFreezeRegs = false;
|
||||
|
|
|
@ -97,15 +97,15 @@ __forceinline void FreezeMMXRegs_(int save)
|
|||
emms
|
||||
}
|
||||
#else
|
||||
__asm__(".intel_syntax\n"
|
||||
"movq [%0+0x00], %%mm0\n"
|
||||
"movq [%0+0x08], %%mm1\n"
|
||||
"movq [%0+0x10], %%mm2\n"
|
||||
"movq [%0+0x18], %%mm3\n"
|
||||
"movq [%0+0x20], %%mm4\n"
|
||||
"movq [%0+0x28], %%mm5\n"
|
||||
"movq [%0+0x30], %%mm6\n"
|
||||
"movq [%0+0x38], %%mm7\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movq [%0+0x00], mm0\n"
|
||||
"movq [%0+0x08], mm1\n"
|
||||
"movq [%0+0x10], mm2\n"
|
||||
"movq [%0+0x18], mm3\n"
|
||||
"movq [%0+0x20], mm4\n"
|
||||
"movq [%0+0x28], mm5\n"
|
||||
"movq [%0+0x30], mm6\n"
|
||||
"movq [%0+0x38], mm7\n"
|
||||
"emms\n"
|
||||
".att_syntax\n" : : "r"(g_globalMMXData) );
|
||||
#endif
|
||||
|
@ -134,15 +134,15 @@ __forceinline void FreezeMMXRegs_(int save)
|
|||
emms
|
||||
}
|
||||
#else
|
||||
__asm__(".intel_syntax\n"
|
||||
"movq %%mm0, [%0+0x00]\n"
|
||||
"movq %%mm1, [%0+0x08]\n"
|
||||
"movq %%mm2, [%0+0x10]\n"
|
||||
"movq %%mm3, [%0+0x18]\n"
|
||||
"movq %%mm4, [%0+0x20]\n"
|
||||
"movq %%mm5, [%0+0x28]\n"
|
||||
"movq %%mm6, [%0+0x30]\n"
|
||||
"movq %%mm7, [%0+0x38]\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movq mm0, [%0+0x00]\n"
|
||||
"movq mm1, [%0+0x08]\n"
|
||||
"movq mm2, [%0+0x10]\n"
|
||||
"movq mm3, [%0+0x18]\n"
|
||||
"movq mm4, [%0+0x20]\n"
|
||||
"movq mm5, [%0+0x28]\n"
|
||||
"movq mm6, [%0+0x30]\n"
|
||||
"movq mm7, [%0+0x38]\n"
|
||||
"emms\n"
|
||||
".att_syntax\n" : : "r"(g_globalMMXData) );
|
||||
#endif
|
||||
|
@ -177,15 +177,15 @@ __forceinline void FreezeXMMRegs_(int save)
|
|||
}
|
||||
|
||||
#else
|
||||
__asm__(".intel_syntax\n"
|
||||
"movaps [%0+0x00], %%xmm0\n"
|
||||
"movaps [%0+0x10], %%xmm1\n"
|
||||
"movaps [%0+0x20], %%xmm2\n"
|
||||
"movaps [%0+0x30], %%xmm3\n"
|
||||
"movaps [%0+0x40], %%xmm4\n"
|
||||
"movaps [%0+0x50], %%xmm5\n"
|
||||
"movaps [%0+0x60], %%xmm6\n"
|
||||
"movaps [%0+0x70], %%xmm7\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movaps [%0+0x00], xmm0\n"
|
||||
"movaps [%0+0x10], xmm1\n"
|
||||
"movaps [%0+0x20], xmm2\n"
|
||||
"movaps [%0+0x30], xmm3\n"
|
||||
"movaps [%0+0x40], xmm4\n"
|
||||
"movaps [%0+0x50], xmm5\n"
|
||||
"movaps [%0+0x60], xmm6\n"
|
||||
"movaps [%0+0x70], xmm7\n"
|
||||
".att_syntax\n" : : "r"(g_globalXMMData) );
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
@ -214,15 +214,15 @@ __forceinline void FreezeXMMRegs_(int save)
|
|||
}
|
||||
|
||||
#else
|
||||
__asm__(".intel_syntax\n"
|
||||
"movaps %%xmm0, [%0+0x00]\n"
|
||||
"movaps %%xmm1, [%0+0x10]\n"
|
||||
"movaps %%xmm2, [%0+0x20]\n"
|
||||
"movaps %%xmm3, [%0+0x30]\n"
|
||||
"movaps %%xmm4, [%0+0x40]\n"
|
||||
"movaps %%xmm5, [%0+0x50]\n"
|
||||
"movaps %%xmm6, [%0+0x60]\n"
|
||||
"movaps %%xmm7, [%0+0x70]\n"
|
||||
__asm__(".intel_syntax noprefix\n"
|
||||
"movaps xmm0, [%0+0x00]\n"
|
||||
"movaps xmm1, [%0+0x10]\n"
|
||||
"movaps xmm2, [%0+0x20]\n"
|
||||
"movaps xmm3, [%0+0x30]\n"
|
||||
"movaps xmm4, [%0+0x40]\n"
|
||||
"movaps xmm5, [%0+0x50]\n"
|
||||
"movaps xmm6, [%0+0x60]\n"
|
||||
"movaps xmm7, [%0+0x70]\n"
|
||||
".att_syntax\n" : : "r"(g_globalXMMData) );
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
|