Merge pull request #281 from xsacha/memcmp

Remove usages of memcmp_mmx
Gregory Hainaut 2014-09-19 21:43:19 +02:00
commit 1b6188ee1d
10 changed files with 15 additions and 562 deletions


@@ -146,12 +146,6 @@
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</CustomBuildStep>
<None Include="..\..\include\Utilities\EventSource.inl" />
<None Include="..\..\include\Utilities\TlsVariable.inl" />
</ItemGroup>
@@ -196,4 +190,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>


@@ -223,9 +223,4 @@
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<Filter>Source Files\Linux</Filter>
</CustomBuildStep>
</ItemGroup>
</Project>
</Project>


@@ -149,13 +149,6 @@
<ClCompile Include="..\..\src\Utilities\Semaphore.cpp" />
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</CustomBuildStep>
<None Include="..\..\include\Utilities\EventSource.inl" />
<None Include="..\..\include\Utilities\TlsVariable.inl" />
</ItemGroup>
@@ -200,4 +193,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>


@@ -223,9 +223,4 @@
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<Filter>Source Files\Linux</Filter>
</CustomBuildStep>
</ItemGroup>
</Project>
</Project>


@@ -150,12 +150,6 @@
<ClCompile Include="..\..\src\Utilities\ThreadTools.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<FileType>Document</FileType>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Devel|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
</CustomBuildStep>
<None Include="..\..\include\Utilities\EventSource.inl" />
<None Include="..\..\include\Utilities\TlsVariable.inl" />
</ItemGroup>
@@ -200,4 +194,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>


@@ -223,9 +223,4 @@
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="..\..\src\Utilities\x86\MemcpyFast.S">
<Filter>Source Files\Linux</Filter>
</CustomBuildStep>
</ItemGroup>
</Project>
</Project>


@@ -16,23 +16,17 @@
#pragma once
#ifdef __linux__
# include "lnx_memzero.h"
extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
#else
# include "win_memzero.h"
#endif
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);
// For 32-bit MSVC compiles, memcmp performs much worse than memcmp_mmx and
// other implementations. So for this combination only, prefer memcmp_mmx
#if defined(_MSC_VER) && !defined(_M_X86_64)
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
#else
#define memcmp_mmx memcmp
#endif
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
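For reference, a minimal self-contained sketch (not part of this diff) of how a call site resolves under the selection above: on 32-bit MSVC the hand-written memcmp_mmx is still declared, while every other target rewrites it to the standard memcmp. The function name blocks_equal is purely illustrative.

#include <cstring> // memcmp

// Mirrors the header logic shown above; illustrative only.
#if defined(_MSC_VER) && !defined(_M_X86_64)
extern "C" unsigned char memcmp_mmx(const void* src1, const void* src2, int cmpsize);
#else
#define memcmp_mmx memcmp
#endif

// Hypothetical caller: compiles to a plain memcmp call everywhere except 32-bit MSVC.
static bool blocks_equal(const void* a, const void* b, int bytes)
{
    return memcmp_mmx(a, b, bytes) == 0;
}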


@@ -129,13 +129,8 @@ set(UtilitiesSources
wxGuiTools.cpp
wxHelpers.cpp
x86/MemcpyVibes.cpp
# x86/MemcpyFast.cpp
)
# collect .S files
set(UtilitiesSSources
x86/MemcpyFast.S)
# variable with all headers of this library
set(UtilitiesHeaders
../../include/Utilities/Assertions.h
@@ -171,17 +166,13 @@ set(UtilitiesHeaders
../../include/Utilities/wxGuiTools.h
PrecompiledHeader.h)
# change language of .S-files to c++
set_source_files_properties(${UtilitiesSSources} PROPERTIES LANGUAGE CXX)
set(UtilitiesFinalSources
${UtilitiesSources}
${UtilitiesHeaders}
${UtilitiesSSources}
)
set(UtilitiesFinalLibs
${wxWidgets_LIBRARIES}
)
add_pcsx2_lib(${Output} "${UtilitiesFinalSources}" "${UtilitiesFinalLibs}" "${UtilitiesFinalFlags}")
add_pcsx2_lib(${Output} "${UtilitiesFinalSources}" "${UtilitiesFinalLibs}" "${UtilitiesFinalFlags}")


@@ -1,497 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#define TINY_BLOCK_COPY 64 // copies below this size take the plain movsd/movsb path
#define IN_CACHE_COPY 2 * 1024 // upper limit (bytes) for the in-cache movq copy
#define UNCACHED_COPY 4 * 1024 // upper limit (bytes) for the uncached movq/movntq copy
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80 // block-prefetch chunk size, in 64-byte cache lines
// Fast assembly routines for x86-64
// zerofrog(@gmail.com)
// and added to by arcum42@gmail.com
.intel_syntax noprefix
.extern _mmx_backup
// mmx memcmp implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
// u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
#define MEMCMP_SRC1 edx
#define MEMCMP_SRC2 esi
#define MEMCMP_SIZE ecx
.global memcmp_mmx
memcmp_mmx:
// make sure mmx regs are stored
// FreezeMMXRegs(1);
//cmp dword ptr [g_EEFreezeRegs], 0
//je memcmp_mmx_begin
//push 1
//call FreezeMMXRegs_
//add esp, 4
memcmp_mmx_begin:
push esi
mov MEMCMP_SRC1, dword ptr [esp+8]
mov MEMCMP_SRC2, dword ptr [esp+12]
mov MEMCMP_SIZE, dword ptr [esp+16]
cmp MEMCMP_SIZE, 32
jl memcmp_Done4
// custom test first 8 to make sure things are ok
movq mm0, [MEMCMP_SRC2]
movq mm1, [MEMCMP_SRC2+8]
pcmpeqd mm0, [MEMCMP_SRC1]
pcmpeqd mm1, [MEMCMP_SRC1+8]
pand mm0, mm1
movq mm2, [MEMCMP_SRC2+16]
pmovmskb eax, mm0
movq mm3, [MEMCMP_SRC2+24]
// check if eq
cmp eax, 0xff
je memcmp_NextComp
mov eax, 1
jmp memcmp_End
memcmp_NextComp:
pcmpeqd mm2, [MEMCMP_SRC1+16]
pcmpeqd mm3, [MEMCMP_SRC1+24]
pand mm2, mm3
pmovmskb eax, mm2
sub MEMCMP_SIZE, 32
add MEMCMP_SRC2, 32
add MEMCMP_SRC1, 32
// check if eq
cmp eax, 0xff
je memcmp_ContinueTest
mov eax, 1
jmp memcmp_End
cmp MEMCMP_SIZE, 64
jl memcmp_Done8
memcmp_Cmp8:
movq mm0, [MEMCMP_SRC2]
movq mm1, [MEMCMP_SRC2+8]
movq mm2, [MEMCMP_SRC2+16]
movq mm3, [MEMCMP_SRC2+24]
movq mm4, [MEMCMP_SRC2+32]
movq mm5, [MEMCMP_SRC2+40]
movq mm6, [MEMCMP_SRC2+48]
movq mm7, [MEMCMP_SRC2+56]
pcmpeqd mm0, [MEMCMP_SRC1]
pcmpeqd mm1, [MEMCMP_SRC1+8]
pcmpeqd mm2, [MEMCMP_SRC1+16]
pcmpeqd mm3, [MEMCMP_SRC1+24]
pand mm0, mm1
pcmpeqd mm4, [MEMCMP_SRC1+32]
pand mm0, mm2
pcmpeqd mm5, [MEMCMP_SRC1+40]
pand mm0, mm3
pcmpeqd mm6, [MEMCMP_SRC1+48]
pand mm0, mm4
pcmpeqd mm7, [MEMCMP_SRC1+56]
pand mm0, mm5
pand mm0, mm6
pand mm0, mm7
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
je memcmp_Continue
mov eax, 1
jmp memcmp_End
memcmp_Continue:
sub MEMCMP_SIZE, 64
add MEMCMP_SRC2, 64
add MEMCMP_SRC1, 64
memcmp_ContinueTest:
cmp MEMCMP_SIZE, 64
jge memcmp_Cmp8
memcmp_Done8:
test MEMCMP_SIZE, 0x20
jz memcmp_Done4
movq mm0, [MEMCMP_SRC2]
movq mm1, [MEMCMP_SRC2+8]
movq mm2, [MEMCMP_SRC2+16]
movq mm3, [MEMCMP_SRC2+24]
pcmpeqd mm0, [MEMCMP_SRC1]
pcmpeqd mm1, [MEMCMP_SRC1+8]
pcmpeqd mm2, [MEMCMP_SRC1+16]
pcmpeqd mm3, [MEMCMP_SRC1+24]
pand mm0, mm1
pand mm0, mm2
pand mm0, mm3
pmovmskb eax, mm0
sub MEMCMP_SIZE, 32
add MEMCMP_SRC2, 32
add MEMCMP_SRC1, 32
// check if eq
cmp eax, 0xff
je memcmp_Done4
mov eax, 1
jmp memcmp_End
memcmp_Done4:
cmp MEMCMP_SIZE, 24
jne memcmp_Done2
movq mm0, [MEMCMP_SRC2]
movq mm1, [MEMCMP_SRC2+8]
movq mm2, [MEMCMP_SRC2+16]
pcmpeqd mm0, [MEMCMP_SRC1]
pcmpeqd mm1, [MEMCMP_SRC1+8]
pcmpeqd mm2, [MEMCMP_SRC1+16]
pand mm0, mm1
pand mm0, mm2
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
je memcmp_Done
mov eax, 1
jmp memcmp_End
memcmp_Done2:
cmp MEMCMP_SIZE, 16
jne memcmp_Done1
movq mm0, [MEMCMP_SRC2]
movq mm1, [MEMCMP_SRC2+8]
pcmpeqd mm0, [MEMCMP_SRC1]
pcmpeqd mm1, [MEMCMP_SRC1+8]
pand mm0, mm1
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
je memcmp_Done
mov eax, 1
jmp memcmp_End
memcmp_Done1:
cmp MEMCMP_SIZE, 8
jne memcmp_Done
mov eax, [MEMCMP_SRC2]
mov MEMCMP_SRC2, [MEMCMP_SRC2+4]
cmp eax, [MEMCMP_SRC1]
je memcmp_Next
mov eax, 1
jmp memcmp_End
memcmp_Next:
cmp MEMCMP_SRC2, [MEMCMP_SRC1+4]
je memcmp_Done
mov eax, 1
jmp memcmp_End
memcmp_Done:
xor eax, eax
memcmp_End:
emms
pop esi
ret
// memxor_mmx
#define MEMXOR_SRC1 edx
#define MEMXOR_SRC2 esi
#define MEMXOR_SIZE ecx
.global memxor_mmx
memxor_mmx:
push esi
mov MEMXOR_SRC1, dword ptr [esp+8]
mov MEMXOR_SRC2, dword ptr [esp+12]
mov MEMXOR_SIZE, dword ptr [esp+16]
cmp MEMXOR_SIZE, 64
jl memxor_Setup4
movq mm0, [MEMXOR_SRC2]
movq mm1, [MEMXOR_SRC2+8]
movq mm2, [MEMXOR_SRC2+16]
movq mm3, [MEMXOR_SRC2+24]
movq mm4, [MEMXOR_SRC2+32]
movq mm5, [MEMXOR_SRC2+40]
movq mm6, [MEMXOR_SRC2+48]
movq mm7, [MEMXOR_SRC2+56]
sub MEMXOR_SIZE, 64
add MEMXOR_SRC2, 64
cmp MEMXOR_SIZE, 64
jl memxor_End8
memxor_Cmp8:
pxor mm0, [MEMXOR_SRC2]
pxor mm1, [MEMXOR_SRC2+8]
pxor mm2, [MEMXOR_SRC2+16]
pxor mm3, [MEMXOR_SRC2+24]
pxor mm4, [MEMXOR_SRC2+32]
pxor mm5, [MEMXOR_SRC2+40]
pxor mm6, [MEMXOR_SRC2+48]
pxor mm7, [MEMXOR_SRC2+56]
sub MEMXOR_SIZE, 64
add MEMXOR_SRC2, 64
cmp MEMXOR_SIZE, 64
jge memxor_Cmp8
memxor_End8:
pxor mm0, mm4
pxor mm1, mm5
pxor mm2, mm6
pxor mm3, mm7
cmp MEMXOR_SIZE, 32
jl memxor_End4
pxor mm0, [MEMXOR_SRC2]
pxor mm1, [MEMXOR_SRC2+8]
pxor mm2, [MEMXOR_SRC2+16]
pxor mm3, [MEMXOR_SRC2+24]
sub MEMXOR_SIZE, 32
add MEMXOR_SRC2, 32
jmp memxor_End4
memxor_Setup4:
cmp MEMXOR_SIZE, 32
jl memxor_Setup2
movq mm0, [MEMXOR_SRC2]
movq mm1, [MEMXOR_SRC2+8]
movq mm2, [MEMXOR_SRC2+16]
movq mm3, [MEMXOR_SRC2+24]
sub MEMXOR_SIZE, 32
add MEMXOR_SRC2, 32
memxor_End4:
pxor mm0, mm2
pxor mm1, mm3
cmp MEMXOR_SIZE, 16
jl memxor_End2
pxor mm0, [MEMXOR_SRC2]
pxor mm1, [MEMXOR_SRC2+8]
sub MEMXOR_SIZE, 16
add MEMXOR_SRC2, 16
jmp memxor_End2
memxor_Setup2:
cmp MEMXOR_SIZE, 16
jl memxor_Setup1
movq mm0, [MEMXOR_SRC2]
movq mm1, [MEMXOR_SRC2+8]
sub MEMXOR_SIZE, 16
add MEMXOR_SRC2, 16
memxor_End2:
pxor mm0, mm1
cmp MEMXOR_SIZE, 8
jl memxor_End1
pxor mm0, [MEMXOR_SRC2]
memxor_End1:
movq [MEMXOR_SRC1], mm0
jmp memxor_End
memxor_Setup1:
movq mm0, [MEMXOR_SRC2]
movq [MEMXOR_SRC1], mm0
memxor_End:
emms
pop esi
ret
// void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
.global memcpy_amd_
memcpy_amd_:
push edi
push esi
mov edi, ecx // destination
mov esi, edx // source
mov ecx, [esp+12] // number of bytes to copy
mov eax, ecx // keep a copy of count
cld
cmp eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 // tiny? skip mmx copy
cmp eax, 32*1024 // don't align between 32k-64k because
jbe $memcpy_do_align // it appears to be slower
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov eax, 8 // a trick that's faster than rep movsb...
sub eax, edi // align destination to qword
and eax, 0b111 // get the low bits
sub ecx, eax // update copy count
neg eax // set up to jump into the array
add eax, offset $memcpy_align_done
jmp eax // jump to array of movsb's
.align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: // destination is dword aligned
mov eax, ecx // number of bytes left to copy
shr eax, 6 // get 64-byte block count
jz $memcpy_ic_2 // finish the last few bytes
cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
.align 16
$memcpy_ic_1: // 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] // start reading ahead
movq mm0, [esi+0] // read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 //write 64 bits
movq [edi+8], mm1 // note: the normal movq writes the
movq mm2, [esi+16] // data to cache; a cache line will be
movq mm3, [esi+24] // allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 // update source pointer
add edi, 64 // update destination pointer
dec eax // count down
jnz $memcpy_ic_1 // last 64-byte block?
$memcpy_ic_2:
mov eax, ecx // has valid low 6 bits of the byte count
$memcpy_ic_3:
shr eax, 2 // dword count
and eax, 0b1111 // only look at the "remainder" bits
neg eax // set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax // jump to array of movsd's
$memcpy_uc_test:
or eax, eax // tail end of block prefetch will jump here
jz $memcpy_ic_2 // no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
.align 16
$memcpy_uc_1: // 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] // start reading ahead
movq mm0,[esi+0] // read 64 bits
add edi,64 // update destination pointer
movq mm1,[esi+8]
add esi,64 // update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 // write 64 bits, bypassing the cache
movq mm0,[esi-40] // note: movntq also prevents the CPU
movntq [edi-56], mm1 // from READING the destination address
movq mm1,[esi-32] // into the cache, only to be over-written
movntq [edi-48], mm2 // so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16],mm0
dec eax
movntq [edi-8], mm1
jnz $memcpy_uc_1 // last 64-byte block?
jmp $memcpy_ic_2 // almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
// <Code removed here>
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
.align 16
movsd
movsd // perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd // perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: // dword aligned from before movsd's
and ecx, 0b11 // the last few cows must come home
jz $memcpy_final // no more, let's leave
rep movsb // the last 1, 2, or 3 bytes
$memcpy_final:
emms // clean up the MMX state
sfence // flush the write buffer
//mov eax, [dest] // ret value = destination pointer
pop esi
pop edi
ret 4
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
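The deleted routine above implemented the contract described in its header comment: compare two buffers whose length is a multiple of 8 and return 0 when they are equal, nonzero otherwise, using pcmpeqd/pmovmskb on 64-bit MMX registers. A rough C++ equivalent using SSE2 intrinsics is sketched below; it illustrates that contract only (16-byte steps instead of 8, byte compares instead of dword compares) and is not code from this PR, which simply switches callers to the standard memcmp.

#include <emmintrin.h> // SSE2 intrinsics
#include <cstddef>
#include <cstdint>

// Illustrative only: returns 0 if equal, nonzero otherwise; size assumed to be a multiple of 16.
static uint8_t memcmp_sse2_sketch(const void* src1, const void* src2, size_t size)
{
    const char* a = static_cast<const char*>(src1);
    const char* b = static_cast<const char*>(src2);
    for (size_t i = 0; i < size; i += 16)
    {
        __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + i));
        __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + i));
        __m128i eq = _mm_cmpeq_epi8(va, vb);   // 0xFF in every byte position that matches
        if (_mm_movemask_epi8(eq) != 0xFFFF)   // any mismatched byte ends the scan
            return 1;
    }
    return 0;
}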
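The large-copy path of the deleted memcpy_amd_ used MOVNTQ streaming stores plus software prefetch so that writes bypass the cache. A hedged SSE2 sketch of the same idea follows; it assumes 16-byte-aligned pointers and a byte count that is a multiple of 64, and it is an illustration of the technique rather than the routine PCSX2 actually uses.

#include <emmintrin.h> // SSE2 intrinsics
#include <cstddef>

// Illustrative non-temporal block copy: dest/src 16-byte aligned, bytes a multiple of 64.
static void copy_nontemporal_sketch(void* dest, const void* src, size_t bytes)
{
    const __m128i* s = static_cast<const __m128i*>(src);
    __m128i*       d = static_cast<__m128i*>(dest);
    for (size_t i = 0; i < bytes / 16; i += 4)
    {
        __m128i v0 = _mm_load_si128(s + i + 0); // read a 64-byte block
        __m128i v1 = _mm_load_si128(s + i + 1);
        __m128i v2 = _mm_load_si128(s + i + 2);
        __m128i v3 = _mm_load_si128(s + i + 3);
        _mm_stream_si128(d + i + 0, v0);        // write it with stores that bypass the cache
        _mm_stream_si128(d + i + 1, v1);
        _mm_stream_si128(d + i + 2, v2);
        _mm_stream_si128(d + i + 3, v3);
    }
    _mm_sfence(); // flush the write-combining buffers, as the sfence above did
}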


@@ -192,8 +192,7 @@ typedef u32 (__fastcall *mVUCall)(void*, void*);
#else
// Note: GCC builds crash with custom search function, because
// they're not guaranteeing 16-byte alignment on the structs :(
// #define mVUquickSearch(dest, src, size) (!memcmp(dest, src, size))
#define mVUquickSearch(dest, src, size) (!memcmp_mmx(dest, src, size))
#define mVUquickSearch(dest, src, size) (!memcmp(dest, src, size))
#define mVUemitSearch()
#endif
//------------------------------------------------------------------
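After this change, mVUquickSearch is just a negated standard memcmp, so a block comparison in the recompiler reduces to the library call. A minimal illustrative use follows; same_program and its parameters are hypothetical names, not microVU code.

#include <cstring> // memcmp

#define mVUquickSearch(dest, src, size) (!memcmp(dest, src, size))

// Hypothetical caller: true when the two cached program images are byte-identical.
static bool same_program(const void* cached, const void* incoming, unsigned size)
{
    return mVUquickSearch(cached, incoming, size);
}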