mirror of https://github.com/PCSX2/pcsx2.git
commit
1b6188ee1d
|
@ -146,12 +146,6 @@
|
|||
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<FileType>Document</FileType>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
</CustomBuildStep>
|
||||
<None Include="..\..\include\Utilities\EventSource.inl" />
|
||||
<None Include="..\..\include\Utilities\TlsVariable.inl" />
|
||||
</ItemGroup>
|
||||
|
@ -196,4 +190,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -223,9 +223,4 @@
|
|||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<Filter>Source Files\Linux</Filter>
|
||||
</CustomBuildStep>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -149,13 +149,6 @@
|
|||
<ClCompile Include="..\..\src\Utilities\Semaphore.cpp" />
|
||||
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<FileType>Document</FileType>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
</CustomBuildStep>
|
||||
<None Include="..\..\include\Utilities\EventSource.inl" />
|
||||
<None Include="..\..\include\Utilities\TlsVariable.inl" />
|
||||
</ItemGroup>
|
||||
|
@ -200,4 +193,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -223,9 +223,4 @@
|
|||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<Filter>Source Files\Linux</Filter>
|
||||
</CustomBuildStep>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -150,12 +150,6 @@
|
|||
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<FileType>Document</FileType>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
</CustomBuildStep>
|
||||
<None Include="..\..\include\Utilities\EventSource.inl" />
|
||||
<None Include="..\..\include\Utilities\TlsVariable.inl" />
|
||||
</ItemGroup>
|
||||
|
@ -200,4 +194,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -223,9 +223,4 @@
|
|||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
|
||||
<Filter>Source Files\Linux</Filter>
|
||||
</CustomBuildStep>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -16,23 +16,17 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef __linux__
|
||||
|
||||
# include "lnx_memzero.h"
|
||||
|
||||
extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
|
||||
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
||||
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
||||
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
|
||||
|
||||
#else
|
||||
|
||||
# include "win_memzero.h"
|
||||
#endif
|
||||
|
||||
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
|
||||
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
|
||||
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
||||
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
|
||||
|
||||
// For 32-bit MSVC compiles, memcmp performs much worse than memcmp_mmx and
|
||||
// other implementations. So for this combination only, prefer memcmp_mmx
|
||||
#if defined(_MSC_VER) && !defined(_M_X86_64)
|
||||
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
|
||||
#else
|
||||
#define memcmp_mmx memcmp
|
||||
#endif
|
||||
|
||||
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
|
||||
|
|
|
@ -129,13 +129,8 @@ set(UtilitiesSources
|
|||
wxGuiTools.cpp
|
||||
wxHelpers.cpp
|
||||
x86/MemcpyVibes.cpp
|
||||
# x86/MemcpyFast.cpp
|
||||
)
|
||||
|
||||
# collect .S files
|
||||
set(UtilitiesSSources
|
||||
x86/MemcpyFast.S)
|
||||
|
||||
# variable with all headers of this library
|
||||
set(UtilitiesHeaders
|
||||
../../include/Utilities/Assertions.h
|
||||
|
@ -171,17 +166,13 @@ set(UtilitiesHeaders
|
|||
../../include/Utilities/wxGuiTools.h
|
||||
PrecompiledHeader.h)
|
||||
|
||||
# change language of .S-files to c++
|
||||
set_source_files_properties(${UtilitiesSSources} PROPERTIES LANGUAGE CXX)
|
||||
|
||||
set(UtilitiesFinalSources
|
||||
${UtilitiesSources}
|
||||
${UtilitiesHeaders}
|
||||
${UtilitiesSSources}
|
||||
)
|
||||
|
||||
set(UtilitiesFinalLibs
|
||||
${wxWidgets_LIBRARIES}
|
||||
)
|
||||
|
||||
add_pcsx2_lib(${Output} "${UtilitiesFinalSources}" "${UtilitiesFinalLibs}" "${UtilitiesFinalFlags}")
|
||||
add_pcsx2_lib(${Output} "${UtilitiesFinalSources}" "${UtilitiesFinalLibs}" "${UtilitiesFinalFlags}")
|
||||
|
|
|
@ -1,497 +0,0 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2010 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#define TINY_BLOCK_COPY 64
|
||||
#define IN_CACHE_COPY 2 * 1024
|
||||
#define UNCACHED_COPY 4 * 1024
|
||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
||||
#define CACHEBLOCK 80
|
||||
|
||||
// Fast assembly routines for x86-64
|
||||
// zerofrog(@gmail.com)
|
||||
// and added to by arcum42@gmail.com
|
||||
.intel_syntax noprefix
|
||||
.extern _mmx_backup
|
||||
|
||||
// mmx memcmp implementation, size has to be a multiple of 8
|
||||
// returns 0 is equal, nonzero value if not equal
|
||||
// ~10 times faster than standard memcmp
|
||||
// (zerofrog)
|
||||
// u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
|
||||
#define MEMCMP_SRC1 edx
|
||||
#define MEMCMP_SRC2 esi
|
||||
#define MEMCMP_SIZE ecx
|
||||
|
||||
.global memcmp_mmx
|
||||
memcmp_mmx:
|
||||
// make sure mmx regs are stored
|
||||
// FreezeMMXRegs(1);
|
||||
//cmp dword ptr [g_EEFreezeRegs], 0
|
||||
//je memcmp_mmx_begin
|
||||
//push 1
|
||||
//call FreezeMMXRegs_
|
||||
//add esp, 4
|
||||
|
||||
memcmp_mmx_begin:
|
||||
push esi
|
||||
mov MEMCMP_SRC1, dword ptr [esp+8]
|
||||
mov MEMCMP_SRC2, dword ptr [esp+12]
|
||||
mov MEMCMP_SIZE, dword ptr [esp+16]
|
||||
|
||||
cmp MEMCMP_SIZE, 32
|
||||
jl memcmp_Done4
|
||||
|
||||
// custom test first 8 to make sure things are ok
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pand mm0, mm1
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
pmovmskb eax, mm0
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_NextComp
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_NextComp:
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm2, mm3
|
||||
pmovmskb eax, mm2
|
||||
|
||||
sub MEMCMP_SIZE, 32
|
||||
add MEMCMP_SRC2, 32
|
||||
add MEMCMP_SRC1, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_ContinueTest
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
cmp MEMCMP_SIZE, 64
|
||||
jl memcmp_Done8
|
||||
|
||||
memcmp_Cmp8:
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
movq mm4, [MEMCMP_SRC2+32]
|
||||
movq mm5, [MEMCMP_SRC2+40]
|
||||
movq mm6, [MEMCMP_SRC2+48]
|
||||
movq mm7, [MEMCMP_SRC2+56]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm0, mm1
|
||||
pcmpeqd mm4, [MEMCMP_SRC1+32]
|
||||
pand mm0, mm2
|
||||
pcmpeqd mm5, [MEMCMP_SRC1+40]
|
||||
pand mm0, mm3
|
||||
pcmpeqd mm6, [MEMCMP_SRC1+48]
|
||||
pand mm0, mm4
|
||||
pcmpeqd mm7, [MEMCMP_SRC1+56]
|
||||
pand mm0, mm5
|
||||
pand mm0, mm6
|
||||
pand mm0, mm7
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_Continue
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Continue:
|
||||
sub MEMCMP_SIZE, 64
|
||||
add MEMCMP_SRC2, 64
|
||||
add MEMCMP_SRC1, 64
|
||||
memcmp_ContinueTest:
|
||||
cmp MEMCMP_SIZE, 64
|
||||
jge memcmp_Cmp8
|
||||
|
||||
memcmp_Done8:
|
||||
test MEMCMP_SIZE, 0x20
|
||||
jz memcmp_Done4
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
movq mm3, [MEMCMP_SRC2+24]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pcmpeqd mm3, [MEMCMP_SRC1+24]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pand mm0, mm3
|
||||
pmovmskb eax, mm0
|
||||
sub MEMCMP_SIZE, 32
|
||||
add MEMCMP_SRC2, 32
|
||||
add MEMCMP_SRC1, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done4
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done4:
|
||||
cmp MEMCMP_SIZE, 24
|
||||
jne memcmp_Done2
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
movq mm2, [MEMCMP_SRC2+16]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pcmpeqd mm2, [MEMCMP_SRC1+16]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done2:
|
||||
cmp MEMCMP_SIZE, 16
|
||||
jne memcmp_Done1
|
||||
|
||||
movq mm0, [MEMCMP_SRC2]
|
||||
movq mm1, [MEMCMP_SRC2+8]
|
||||
pcmpeqd mm0, [MEMCMP_SRC1]
|
||||
pcmpeqd mm1, [MEMCMP_SRC1+8]
|
||||
pand mm0, mm1
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je memcmp_Done
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done1:
|
||||
cmp MEMCMP_SIZE, 8
|
||||
jne memcmp_Done
|
||||
|
||||
mov eax, [MEMCMP_SRC2]
|
||||
mov MEMCMP_SRC2, [MEMCMP_SRC2+4]
|
||||
cmp eax, [MEMCMP_SRC1]
|
||||
je memcmp_Next
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Next:
|
||||
cmp MEMCMP_SRC2, [MEMCMP_SRC1+4]
|
||||
je memcmp_Done
|
||||
mov eax, 1
|
||||
jmp memcmp_End
|
||||
|
||||
memcmp_Done:
|
||||
xor eax, eax
|
||||
|
||||
memcmp_End:
|
||||
emms
|
||||
pop esi
|
||||
ret
|
||||
|
||||
// memxor_mmx
|
||||
#define MEMXOR_SRC1 edx
|
||||
#define MEMXOR_SRC2 esi
|
||||
#define MEMXOR_SIZE ecx
|
||||
|
||||
.global memxor_mmx
|
||||
memxor_mmx:
|
||||
|
||||
push esi
|
||||
mov MEMXOR_SRC1, dword ptr [esp+8]
|
||||
mov MEMXOR_SRC2, dword ptr [esp+12]
|
||||
mov MEMXOR_SIZE, dword ptr [esp+16]
|
||||
cmp MEMXOR_SIZE, 64
|
||||
jl memxor_Setup4
|
||||
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
movq mm2, [MEMXOR_SRC2+16]
|
||||
movq mm3, [MEMXOR_SRC2+24]
|
||||
movq mm4, [MEMXOR_SRC2+32]
|
||||
movq mm5, [MEMXOR_SRC2+40]
|
||||
movq mm6, [MEMXOR_SRC2+48]
|
||||
movq mm7, [MEMXOR_SRC2+56]
|
||||
sub MEMXOR_SIZE, 64
|
||||
add MEMXOR_SRC2, 64
|
||||
cmp MEMXOR_SIZE, 64
|
||||
jl memxor_End8
|
||||
|
||||
memxor_Cmp8:
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
pxor mm2, [MEMXOR_SRC2+16]
|
||||
pxor mm3, [MEMXOR_SRC2+24]
|
||||
pxor mm4, [MEMXOR_SRC2+32]
|
||||
pxor mm5, [MEMXOR_SRC2+40]
|
||||
pxor mm6, [MEMXOR_SRC2+48]
|
||||
pxor mm7, [MEMXOR_SRC2+56]
|
||||
|
||||
sub MEMXOR_SIZE, 64
|
||||
add MEMXOR_SRC2, 64
|
||||
cmp MEMXOR_SIZE, 64
|
||||
jge memxor_Cmp8
|
||||
|
||||
memxor_End8:
|
||||
pxor mm0, mm4
|
||||
pxor mm1, mm5
|
||||
pxor mm2, mm6
|
||||
pxor mm3, mm7
|
||||
|
||||
cmp MEMXOR_SIZE, 32
|
||||
jl memxor_End4
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
pxor mm2, [MEMXOR_SRC2+16]
|
||||
pxor mm3, [MEMXOR_SRC2+24]
|
||||
sub MEMXOR_SIZE, 32
|
||||
add MEMXOR_SRC2, 32
|
||||
jmp memxor_End4
|
||||
|
||||
memxor_Setup4:
|
||||
cmp MEMXOR_SIZE, 32
|
||||
jl memxor_Setup2
|
||||
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
movq mm2, [MEMXOR_SRC2+16]
|
||||
movq mm3, [MEMXOR_SRC2+24]
|
||||
sub MEMXOR_SIZE, 32
|
||||
add MEMXOR_SRC2, 32
|
||||
|
||||
memxor_End4:
|
||||
pxor mm0, mm2
|
||||
pxor mm1, mm3
|
||||
|
||||
cmp MEMXOR_SIZE, 16
|
||||
jl memxor_End2
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
pxor mm1, [MEMXOR_SRC2+8]
|
||||
sub MEMXOR_SIZE, 16
|
||||
add MEMXOR_SRC2, 16
|
||||
jmp memxor_End2
|
||||
|
||||
memxor_Setup2:
|
||||
cmp MEMXOR_SIZE, 16
|
||||
jl memxor_Setup1
|
||||
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq mm1, [MEMXOR_SRC2+8]
|
||||
sub MEMXOR_SIZE, 16
|
||||
add MEMXOR_SRC2, 16
|
||||
|
||||
memxor_End2:
|
||||
pxor mm0, mm1
|
||||
|
||||
cmp MEMXOR_SIZE, 8
|
||||
jl memxor_End1
|
||||
pxor mm0, [MEMXOR_SRC2]
|
||||
memxor_End1:
|
||||
movq [MEMXOR_SRC1], mm0
|
||||
jmp memxor_End
|
||||
|
||||
memxor_Setup1:
|
||||
movq mm0, [MEMXOR_SRC2]
|
||||
movq [MEMXOR_SRC1], mm0
|
||||
memxor_End:
|
||||
emms
|
||||
pop esi
|
||||
ret
|
||||
|
||||
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
|
||||
.global memcpy_amd_
|
||||
memcpy_amd_:
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov edi, ecx // destination
|
||||
mov esi, edx // source
|
||||
mov ecx, [esp+12] // number of bytes to copy
|
||||
mov eax, ecx // keep a copy of count
|
||||
|
||||
cld
|
||||
cmp eax, TINY_BLOCK_COPY
|
||||
jb $memcpy_ic_3 // tiny? skip mmx copy
|
||||
|
||||
cmp eax, 32*1024 // don't align between 32k-64k because
|
||||
jbe $memcpy_do_align // it appears to be slower
|
||||
cmp eax, 64*1024
|
||||
jbe $memcpy_align_done
|
||||
|
||||
$memcpy_do_align:
|
||||
mov eax, 8 // a trick that's faster than rep movsb...
|
||||
sub eax, edi // align destination to qword
|
||||
and eax, 0b111 // get the low bits
|
||||
sub ecx, eax // update copy count
|
||||
neg eax // set up to jump into the array
|
||||
add eax, offset $memcpy_align_done
|
||||
jmp eax // jump to array of movsb's
|
||||
|
||||
.align 4
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
|
||||
$memcpy_align_done: // destination is dword aligned
|
||||
mov eax, ecx // number of bytes left to copy
|
||||
shr eax, 6 // get 64-byte block count
|
||||
jz $memcpy_ic_2 // finish the last few bytes
|
||||
|
||||
cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
|
||||
jae $memcpy_uc_test
|
||||
|
||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
.align 16
|
||||
$memcpy_ic_1: // 64-byte block copies, in-cache copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] // start reading ahead
|
||||
|
||||
movq mm0, [esi+0] // read 64 bits
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0 //write 64 bits
|
||||
movq [edi+8], mm1 // note: the normal movq writes the
|
||||
movq mm2, [esi+16] // data to cache; a cache line will be
|
||||
movq mm3, [esi+24] // allocated as needed, to store the data
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
|
||||
add esi, 64 // update source pointer
|
||||
add edi, 64 // update destination pointer
|
||||
dec eax // count down
|
||||
jnz $memcpy_ic_1 // last 64-byte block?
|
||||
|
||||
$memcpy_ic_2:
|
||||
mov eax, ecx // has valid low 6 bits of the byte count
|
||||
$memcpy_ic_3:
|
||||
shr eax, 2 // dword count
|
||||
and eax, 0b1111 // only look at the "remainder" bits
|
||||
neg eax // set up to jump into the array
|
||||
add eax, offset $memcpy_last_few
|
||||
jmp eax // jump to array of movsd's
|
||||
|
||||
$memcpy_uc_test:
|
||||
or eax, eax // tail end of block prefetch will jump here
|
||||
jz $memcpy_ic_2 // no more 64-byte blocks left
|
||||
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
|
||||
.align 16
|
||||
$memcpy_uc_1: // 64-byte blocks, uncached copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] // start reading ahead
|
||||
|
||||
movq mm0,[esi+0] // read 64 bits
|
||||
add edi,64 // update destination pointer
|
||||
movq mm1,[esi+8]
|
||||
add esi,64 // update source pointer
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0 // write 64 bits, bypassing the cache
|
||||
movq mm0,[esi-40] // note: movntq also prevents the CPU
|
||||
movntq [edi-56], mm1 // from READING the destination address
|
||||
movq mm1,[esi-32] // into the cache, only to be over-written
|
||||
movntq [edi-48], mm2 // so that also helps performance
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16],mm0
|
||||
dec eax
|
||||
movntq [edi-8], mm1
|
||||
jnz $memcpy_uc_1 // last 64-byte block?
|
||||
|
||||
jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
|
||||
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
|
||||
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
|
||||
// help keep the code cache footprint of memcpy_fast to a minimum.
|
||||
// <Code removed here>
|
||||
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop". Then it handles the last few bytes.
|
||||
.align 16
|
||||
movsd
|
||||
movsd // perform last 1-15 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd // perform last 1-7 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
|
||||
$memcpy_last_few: // dword aligned from before movsd's
|
||||
and ecx, 0b11 // the last few cows must come home
|
||||
jz $memcpy_final // no more, let's leave
|
||||
rep movsb // the last 1, 2, or 3 bytes
|
||||
|
||||
$memcpy_final:
|
||||
emms // clean up the MMX state
|
||||
sfence // flush the write buffer
|
||||
//mov eax, [dest] // ret value = destination pointer
|
||||
|
||||
pop esi
|
||||
pop edi
|
||||
|
||||
ret 4
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
|
@ -192,8 +192,7 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
|
|||
#else
|
||||
// Note: GCC builds crash with custom search function, because
|
||||
// they're not guaranteeing 16-byte alignment on the structs :(
|
||||
// #define mVUquickSearch(dest, src, size) (!memcmp(dest, src, size))
|
||||
#define mVUquickSearch(dest, src, size) (!memcmp_mmx(dest, src, size))
|
||||
#define mVUquickSearch(dest, src, size) (!memcmp(dest, src, size))
|
||||
#define mVUemitSearch()
|
||||
#endif
|
||||
//------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue