From fb7e2fad1a20fcfc7ad8a3cabaf590dccf807361 Mon Sep 17 00:00:00 2001
From: arcum42 <arcum42@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Sat, 6 Nov 2010 06:41:56 +0000
Subject: [PATCH] GregMiscellaneous: zzogl-pg: Have the Windows version use the
 Utilities version of memcpy_amd like the Linux version does, and get rid of
 the redundant file...

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@4001 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 plugins/zzogl-pg/opengl/Util.h                |  12 +-
 .../opengl/Win32/zerogsogl_2008.vcproj        |   8 +-
 plugins/zzogl-pg/opengl/ZZGl.h                | 256 +++++-----
 plugins/zzogl-pg/opengl/memcpy_amd.cpp        | 480 ------------------
 4 files changed, 133 insertions(+), 623 deletions(-)
 delete mode 100644 plugins/zzogl-pg/opengl/memcpy_amd.cpp
diff --git a/plugins/zzogl-pg/opengl/Util.h b/plugins/zzogl-pg/opengl/Util.h
index b970971578..b808dfd153 100644
--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@@ -91,10 +91,12 @@ static __forceinline void pcsx2_aligned_free(void* pmem)
 
 #endif
 
+#include "Utilities/MemcpyFast.h"
+#define memcpy_amd memcpy_fast
+
 #define max(a,b)			(((a) > (b)) ? (a) : (b))
 #define min(a,b)			(((a) < (b)) ? (a) : (b))
 
-
 typedef struct
 {
 	int x, y, w, h;
@@ -346,14 +348,6 @@ extern u32 g_nGenVars, g_nTexVars, g_nAlphaVars, g_nResolve;
 extern bool g_bSaveTrans, g_bUpdateEffect, g_bSaveTex, g_bSaveResolved;
 #endif
 
-#ifdef __LINUX__
-#include "Utilities/MemcpyFast.h"
-#define memcpy_amd memcpy_fast
-#else
-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
-extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);
-#endif
-
 extern bool g_bDisplayFPS; // should we display FPS on screen?
 
 #endif // UTIL_H_INCLUDED
diff --git a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
index 7f8b3e0d8a..ca34be35a3 100644
--- a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
+++ b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj
@@ -285,10 +285,6 @@
 				RelativePath="..\Mem_Tables.cpp"
 				>
 			</File>
-			<File
-				RelativePath="..\memcpy_amd.cpp"
-				>
-			</File>
 			<File
 				RelativePath="..\NewRegs.cpp"
 				>
@@ -568,11 +564,11 @@
 			</File>
 		</Filter>
 		<File
-			RelativePath=".\ps2hw.dat"
+			RelativePath="..\ps2hw.dat"
 			>
 		</File>
 		<File
-			RelativePath="..\ps2hw.dat"
+			RelativePath=".\ps2hw.dat"
 			>
 		</File>
 	</Files>
diff --git a/plugins/zzogl-pg/opengl/ZZGl.h b/plugins/zzogl-pg/opengl/ZZGl.h
index 29a000ef13..3b7f148ade 100644
--- a/plugins/zzogl-pg/opengl/ZZGl.h
+++ b/plugins/zzogl-pg/opengl/ZZGl.h
@@ -1,133 +1,133 @@
-/*  ZZ Open GL graphics plugin
- *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
- *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
- */
-
+/*  ZZ Open GL graphics plugin
+ *  Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
+ *  Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+ */
+
 #ifndef ZZGL_H_INCLUDED
 #define ZZGL_H_INCLUDED
 
-#include "PS2Etypes.h"
-#include "PS2Edefs.h"
-
-// Need this before gl.h
-#ifdef _WIN32
-
-#include <windows.h>
-#include <GL/gl.h>
-#include <GL/glext.h>
-#include "glprocs.h"
-
-#else
-
-// adding glew support instead of glXGetProcAddress (thanks to scaught)
-#include <GL/glew.h>
-#include <GL/gl.h>
-#include <GL/glext.h>
-#include <GL/glx.h>
-
-inline void* wglGetProcAddress(const char* x)
-{
-	return (void*)glXGetProcAddress((const GLubyte*)x);
-}
-
-#endif
-
-extern u32 s_stencilfunc, s_stencilref, s_stencilmask;
-// Defines
-
-#ifndef GL_DEPTH24_STENCIL8_EXT // allows FBOs to support stencils
-#	define GL_DEPTH_STENCIL_EXT 0x84F9
-#	define GL_UNSIGNED_INT_24_8_EXT 0x84FA
-#	define GL_DEPTH24_STENCIL8_EXT 0x88F0
-#	define GL_TEXTURE_STENCIL_SIZE_EXT 0x88F1
-#endif
-
-#define GL_STENCILFUNC(func, ref, mask) { \
-	s_stencilfunc  = func; \
-	s_stencilref = ref; \
-	s_stencilmask = mask; \
-	glStencilFunc(func, ref, mask); \
-}
-
-#define GL_STENCILFUNC_SET() glStencilFunc(s_stencilfunc, s_stencilref, s_stencilmask)
-
-
-// sets the data stream
-#define SET_STREAM() { \
-	glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)8); \
-	glSecondaryColorPointerEXT(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)12); \
-	glTexCoordPointer(3, GL_FLOAT, sizeof(VertexGPU), (void*)16); \
-	glVertexPointer(4, GL_SHORT, sizeof(VertexGPU), (void*)0); \
-}
-
-
-// global alpha blending settings
-extern GLenum g_internalRGBAFloat16Fmt;
-
-#define SAFE_RELEASE_TEX(x) { if( (x) != 0 ) { glDeleteTextures(1, &(x)); x = 0; } }
-
-// inline for an extremely often used sequence
-// This is turning off all gl functions. Safe to do updates.
-inline void DisableAllgl()
-{
-	glDisable(GL_SCISSOR_TEST);
-	glDisable(GL_BLEND);
-	glDisable(GL_ALPHA_TEST);
-	glDisable(GL_DEPTH_TEST);
-	glDepthMask(0);
-	glDisable(GL_STENCIL_TEST);
-	glColorMask(1, 1, 1, 1);
-}
-
-//--------------------- Dummies
-
-#ifdef _WIN32
-extern void (__stdcall *zgsBlendEquationSeparateEXT)(GLenum, GLenum);
-extern void (__stdcall *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum);
-#else
-extern void (APIENTRY *zgsBlendEquationSeparateEXT)(GLenum, GLenum);
-extern void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum);
-#endif
-
-
-// ------------------------ Types -------------------------
-
-/////////////////////
-// graphics resources
-extern GLenum s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha; // set by zgsBlendFuncSeparateEXT
-
-// GL prototypes
-extern PFNGLISRENDERBUFFEREXTPROC glIsRenderbufferEXT;
-extern PFNGLBINDRENDERBUFFEREXTPROC glBindRenderbufferEXT;
-extern PFNGLDELETERENDERBUFFERSEXTPROC glDeleteRenderbuffersEXT;
-extern PFNGLGENRENDERBUFFERSEXTPROC glGenRenderbuffersEXT;
-extern PFNGLRENDERBUFFERSTORAGEEXTPROC glRenderbufferStorageEXT;
-extern PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glGetRenderbufferParameterivEXT;
-extern PFNGLISFRAMEBUFFEREXTPROC glIsFramebufferEXT;
-extern PFNGLBINDFRAMEBUFFEREXTPROC glBindFramebufferEXT;
-extern PFNGLDELETEFRAMEBUFFERSEXTPROC glDeleteFramebuffersEXT;
-extern PFNGLGENFRAMEBUFFERSEXTPROC glGenFramebuffersEXT;
-extern PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glCheckFramebufferStatusEXT;
-extern PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glFramebufferTexture1DEXT;
-extern PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glFramebufferTexture2DEXT;
-extern PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glFramebufferTexture3DEXT;
-extern PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glFramebufferRenderbufferEXT;
-extern PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glGetFramebufferAttachmentParameterivEXT;
-extern PFNGLGENERATEMIPMAPEXTPROC glGenerateMipmapEXT;
-extern PFNGLDRAWBUFFERSPROC glDrawBuffers;
+#include "PS2Etypes.h"
+#include "PS2Edefs.h"
+
+// Need this before gl.h
+#ifdef _WIN32
+
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include "glprocs.h"
+
+#else
+
+// adding glew support instead of glXGetProcAddress (thanks to scaught)
+#include <GL/glew.h>
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include <GL/glx.h>
+
+inline void* wglGetProcAddress(const char* x)	// non-_WIN32 builds only: same-named stand-in that forwards the lookup to glXGetProcAddress
+{
+	return (void*)glXGetProcAddress((const GLubyte*)x);
+}
+
+#endif
+
+extern u32 s_stencilfunc, s_stencilref, s_stencilmask;
+// Defines
+
+#ifndef GL_DEPTH24_STENCIL8_EXT // allows FBOs to support stencils
+#	define GL_DEPTH_STENCIL_EXT 0x84F9
+#	define GL_UNSIGNED_INT_24_8_EXT 0x84FA
+#	define GL_DEPTH24_STENCIL8_EXT 0x88F0
+#	define GL_TEXTURE_STENCIL_SIZE_EXT 0x88F1
+#endif
+
+#define GL_STENCILFUNC(func, ref, mask) { \
+	s_stencilfunc  = func; \
+	s_stencilref = ref; \
+	s_stencilmask = mask; \
+	glStencilFunc(func, ref, mask); \
+}
+
+#define GL_STENCILFUNC_SET() glStencilFunc(s_stencilfunc, s_stencilref, s_stencilmask)
+
+
+// sets the data stream
+#define SET_STREAM() { \
+	glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)8); \
+	glSecondaryColorPointerEXT(4, GL_UNSIGNED_BYTE, sizeof(VertexGPU), (void*)12); \
+	glTexCoordPointer(3, GL_FLOAT, sizeof(VertexGPU), (void*)16); \
+	glVertexPointer(4, GL_SHORT, sizeof(VertexGPU), (void*)0); \
+}
+
+
+// global alpha blending settings
+extern GLenum g_internalRGBAFloat16Fmt;
+
+#define SAFE_RELEASE_TEX(x) { if( (x) != 0 ) { glDeleteTextures(1, &(x)); x = 0; } }
+
+// Inline helper for a sequence that is used extremely often.
+// Disables the scissor, blend, alpha, depth and stencil tests, turns depth writes off and enables all four color channels, so updates can be done safely.
+inline void DisableAllgl()
+{
+	glDisable(GL_SCISSOR_TEST);
+	glDisable(GL_BLEND);
+	glDisable(GL_ALPHA_TEST);
+	glDisable(GL_DEPTH_TEST);
+	glDepthMask(0);	// depth buffer writes off
+	glDisable(GL_STENCIL_TEST);
+	glColorMask(1, 1, 1, 1);	// color writes on for R, G, B and A
+}
+
+//--------------------- Dummies
+
+#ifdef _WIN32
+extern void (__stdcall *zgsBlendEquationSeparateEXT)(GLenum, GLenum);
+extern void (__stdcall *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum);
+#else
+extern void (APIENTRY *zgsBlendEquationSeparateEXT)(GLenum, GLenum);
+extern void (APIENTRY *zgsBlendFuncSeparateEXT)(GLenum, GLenum, GLenum, GLenum);
+#endif
+
+
+// ------------------------ Types -------------------------
+
+/////////////////////
+// graphics resources
+extern GLenum s_srcrgb, s_dstrgb, s_srcalpha, s_dstalpha; // set by zgsBlendFuncSeparateEXT
+
+// GL prototypes
+extern PFNGLISRENDERBUFFEREXTPROC glIsRenderbufferEXT;
+extern PFNGLBINDRENDERBUFFEREXTPROC glBindRenderbufferEXT;
+extern PFNGLDELETERENDERBUFFERSEXTPROC glDeleteRenderbuffersEXT;
+extern PFNGLGENRENDERBUFFERSEXTPROC glGenRenderbuffersEXT;
+extern PFNGLRENDERBUFFERSTORAGEEXTPROC glRenderbufferStorageEXT;
+extern PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glGetRenderbufferParameterivEXT;
+extern PFNGLISFRAMEBUFFEREXTPROC glIsFramebufferEXT;
+extern PFNGLBINDFRAMEBUFFEREXTPROC glBindFramebufferEXT;
+extern PFNGLDELETEFRAMEBUFFERSEXTPROC glDeleteFramebuffersEXT;
+extern PFNGLGENFRAMEBUFFERSEXTPROC glGenFramebuffersEXT;
+extern PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glCheckFramebufferStatusEXT;
+extern PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glFramebufferTexture1DEXT;
+extern PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glFramebufferTexture2DEXT;
+extern PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glFramebufferTexture3DEXT;
+extern PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glFramebufferRenderbufferEXT;
+extern PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glGetFramebufferAttachmentParameterivEXT;
+extern PFNGLGENERATEMIPMAPEXTPROC glGenerateMipmapEXT;
+extern PFNGLDRAWBUFFERSPROC glDrawBuffers;
 
 #endif // ZZGL_H_INCLUDED
diff --git a/plugins/zzogl-pg/opengl/memcpy_amd.cpp b/plugins/zzogl-pg/opengl/memcpy_amd.cpp
deleted file mode 100644
index 94cb955ed6..0000000000
--- a/plugins/zzogl-pg/opengl/memcpy_amd.cpp
+++ /dev/null
@@ -1,480 +0,0 @@
-/******************************************************************************
-
- Copyright (c) 2001 Advanced Micro Devices, Inc.
-
- LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
- EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
- NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
- PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
- DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
- BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
- INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
- OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
- OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
- NOT APPLY TO YOU.
-
- AMD does not assume any responsibility for any errors which may appear in the
- Materials nor any responsibility to support or update the Materials.  AMD retains
- the right to make changes to its test specifications at any time, without notice.
-
- NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
- further information, software, technical information, know-how, or show-how
- available to you.
-
- So that all may benefit from your experience, please report  any  problems
- or  suggestions about this software to 3dsdk.support@amd.com
-
- AMD Developer Technologies, M/S 585
- Advanced Micro Devices, Inc.
- 5900 E. Ben White Blvd.
- Austin, TX 78741
- 3dsdk.support@amd.com
-******************************************************************************/
-
-#include <assert.h>
-
-/*****************************************************************************
-MEMCPY_AMD.CPP
-******************************************************************************/
-
-// Very optimized memcpy() routine for AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
-// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
-// "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on Athlon/Duron or other recent CPU before calling!
-
-#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".
-
-#define IN_CACHE_COPY 2 * 1024  // upper limit for movq/movq copy w/SW prefetch
-// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
-// also using the "unrolled loop" optimization.   This code uses
-// the software prefetch instruction to get the data into the cache.
-
-#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
-
-#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch
-#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-
-//#include <stddef.h>
-
-// Inline assembly syntax for use with Visual C++
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-#if defined(_MSC_VER) && !defined(__x86_64__)
-
-#include "PS2Etypes.h"
-
-extern "C" {
-void * memcpy_amd(void *dest, const void *src, size_t n)
-{
-    __asm {
-	mov		ecx, [n]		; number of bytes to copy
-	mov		edi, [dest]		; destination
-	mov		esi, [src]		; source
-	mov		ebx, ecx		; keep a copy of count
-
-	cld
-	cmp		ecx, TINY_BLOCK_COPY
-	jb		$memcpy_ic_3	; tiny? skip mmx copy
-
-	cmp		ecx, 32*1024		; don't align between 32k-64k because
-	jbe		$memcpy_do_align	;  it appears to be slower
-	cmp		ecx, 64*1024
-	jbe		$memcpy_align_done
-$memcpy_do_align:
-	mov		ecx, 8			; a trick that's faster than rep movsb...
-	sub		ecx, edi		; align destination to qword
-	and		ecx, 111b		; get the low bits
-	sub		ebx, ecx		; update copy count
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_align_done
-	jmp		ecx				; jump to array of movsb's
-
-align 4
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-
-$memcpy_align_done:			; destination is dword aligned
-	mov		ecx, ebx		; number of bytes left to copy
-	shr		ecx, 6			; get 64-byte block count
-	jz		$memcpy_ic_2	; finish the last few bytes
-
-	cmp		ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
-	jae		$memcpy_uc_test
-
-// This is small block copy that uses the MMX registers to copy 8 bytes
-// at a time.  It uses the "unrolled loop" optimization, and also uses
-// the software prefetch instruction to get the data into the cache.
-align 16
-$memcpy_ic_1:			; 64-byte block copies, in-cache copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0, [esi+0]	; read 64 bits
-	movq	mm1, [esi+8]
-	movq	[edi+0], mm0	; write 64 bits
-	movq	[edi+8], mm1	;    note:  the normal movq writes the
-	movq	mm2, [esi+16]	;    data to cache; a cache line will be
-	movq	mm3, [esi+24]	;    allocated as needed, to store the data
-	movq	[edi+16], mm2
-	movq	[edi+24], mm3
-	movq	mm0, [esi+32]
-	movq	mm1, [esi+40]
-	movq	[edi+32], mm0
-	movq	[edi+40], mm1
-	movq	mm2, [esi+48]
-	movq	mm3, [esi+56]
-	movq	[edi+48], mm2
-	movq	[edi+56], mm3
-
-	add		esi, 64			; update source pointer
-	add		edi, 64			; update destination pointer
-	dec		ecx				; count down
-	jnz		$memcpy_ic_1	; last 64-byte block?
-
-$memcpy_ic_2:
-	mov		ecx, ebx		; has valid low 6 bits of the byte count
-$memcpy_ic_3:
-	shr		ecx, 2			; dword count
-	and		ecx, 1111b		; only look at the "remainder" bits
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_last_few
-	jmp		ecx				; jump to array of movsd's
-
-$memcpy_uc_test:
-	cmp		ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
-	jae		$memcpy_bp_1
-
-$memcpy_64_test:
-	or		ecx, ecx		; tail end of block prefetch will jump here
-	jz		$memcpy_ic_2	; no more 64-byte blocks left
-
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-align 16
-$memcpy_uc_1:				; 64-byte blocks, uncached copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0,[esi+0]		; read 64 bits
-	add		edi,64			; update destination pointer
-	movq	mm1,[esi+8]
-	add		esi,64			; update source pointer
-	movq	mm2,[esi-48]
-	movntq	[edi-64], mm0	; write 64 bits, bypassing the cache
-	movq	mm0,[esi-40]	;    note: movntq also prevents the CPU
-	movntq	[edi-56], mm1	;    from READING the destination address
-	movq	mm1,[esi-32]	;    into the cache, only to be over-written
-	movntq	[edi-48], mm2	;    so that also helps performance
-	movq	mm2,[esi-24]
-	movntq	[edi-40], mm0
-	movq	mm0,[esi-16]
-	movntq	[edi-32], mm1
-	movq	mm1,[esi-8]
-	movntq	[edi-24], mm2
-	movntq	[edi-16], mm0
-	dec		ecx
-	movntq	[edi-8], mm1
-	jnz		$memcpy_uc_1	; last 64-byte block?
-
-	jmp		$memcpy_ic_2		; almost done
-
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-$memcpy_bp_1:			; large blocks, block prefetch copy
-
-	cmp		ecx, CACHEBLOCK			; big enough to run another prefetch loop?
-	jl		$memcpy_64_test			; no, back to regular uncached copy
-
-	mov		eax, CACHEBLOCK / 2		; block prefetch loop, unrolled 2X
-	add		esi, CACHEBLOCK * 64	; move to the top of the block
-align 16
-$memcpy_bp_2:
-	mov		edx, [esi-64]		; grab one address per cache line
-	mov		edx, [esi-128]		; grab one address per cache line
-	sub		esi, 128			; go reverse order to suppress HW prefetcher
-	dec		eax					; count down the cache lines
-	jnz		$memcpy_bp_2		; keep grabbing more lines into cache
-
-	mov		eax, CACHEBLOCK		; now that it's in cache, do the copy
-align 16
-$memcpy_bp_3:
-	movq	mm0, [esi   ]		; read 64 bits
-	movq	mm1, [esi+ 8]
-	movq	mm2, [esi+16]
-	movq	mm3, [esi+24]
-	movq	mm4, [esi+32]
-	movq	mm5, [esi+40]
-	movq	mm6, [esi+48]
-	movq	mm7, [esi+56]
-	add		esi, 64				; update source pointer
-	movntq	[edi   ], mm0		; write 64 bits, bypassing cache
-	movntq	[edi+ 8], mm1		;    note: movntq also prevents the CPU
-	movntq	[edi+16], mm2		;    from READING the destination address
-	movntq	[edi+24], mm3		;    into the cache, only to be over-written,
-	movntq	[edi+32], mm4		;    so that also helps performance
-	movntq	[edi+40], mm5
-	movntq	[edi+48], mm6
-	movntq	[edi+56], mm7
-	add		edi, 64				; update dest pointer
-
-	dec		eax					; count down
-
-	jnz		$memcpy_bp_3		; keep copying
-	sub		ecx, CACHEBLOCK		; update the 64-byte block count
-	jmp		$memcpy_bp_1		; keep processing chunks
-
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".   Then it handles the last few bytes.
-align 4
-	movsd
-	movsd			; perform last 1-15 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd			; perform last 1-7 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-
-$memcpy_last_few:		; dword aligned from before movsd's
-	mov		ecx, ebx	; has valid low 2 bits of the byte count
-	and		ecx, 11b	; the last few cows must come home
-	jz		$memcpy_final	; no more, let's leave
-	rep		movsb		; the last 1, 2, or 3 bytes
-
-$memcpy_final:
-	emms				; clean up the MMX state
-	sfence				; flush the write buffer
-	mov		eax, [dest]	; ret value = destination pointer
-
-    }
-}
-
-// mmx memcpy implementation, size has to be a multiple of 8
-// returns 0 is equal, nonzero value if not equal
-// ~10 times faster than standard memcmp
-// (zerofrog)
-u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-{
-	assert( (cmpsize&7) == 0 );
-
-	__asm {
-push esi
-		mov ecx, cmpsize
-		mov edx, src1
-		mov esi, src2
-
-		cmp ecx, 32
-		jl Done4
-
-		// custom test first 8 to make sure things are ok
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		movq mm2, [esi+16]
-		pmovmskb eax, mm0
-		movq mm3, [esi+24]
-
-		// check if eq
-		cmp eax, 0xff
-		je NextComp
-		mov eax, 1
-		jmp End
-
-NextComp:
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm2, mm3
-		pmovmskb eax, mm2
-
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je ContinueTest
-		mov eax, 1
-		jmp End
-
-		cmp ecx, 64
-		jl Done8
-
-Cmp8:
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		movq mm4, [esi+32]
-		movq mm5, [esi+40]
-		movq mm6, [esi+48]
-		movq mm7, [esi+56]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pcmpeqd mm4, [edx+32]
-		pand mm0, mm2
-		pcmpeqd mm5, [edx+40]
-		pand mm0, mm3
-		pcmpeqd mm6, [edx+48]
-		pand mm0, mm4
-		pcmpeqd mm7, [edx+56]
-		pand mm0, mm5
-		pand mm0, mm6
-		pand mm0, mm7
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		je Continue
-		mov eax, 1
-		jmp End
-
-Continue:
-		sub ecx, 64
-		add esi, 64
-		add edx, 64
-ContinueTest:
-		cmp ecx, 64
-		jge Cmp8
-
-Done8:
-		test ecx, 0x20
-		jz Done4
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pand mm0, mm2
-		pand mm0, mm3
-		pmovmskb eax, mm0
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je Done4
-		mov eax, 1
-		jmp End
-
-Done4:
-		cmp ecx, 24
-		jne Done2
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pand mm0, mm1
-		pand mm0, mm2
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done2:
-		cmp ecx, 16
-		jne Done1
-
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done1:
-		cmp ecx, 8
-		jne Done
-
-		mov eax, [esi]
-		mov esi, [esi+4]
-		cmp eax, [edx]
-		je Next
-		mov eax, 1
-		jmp End
-
-Next:
-		cmp esi, [edx+4]
-		setne al
-		jmp End
-
-Done:
-		xor eax, eax
-
-End:
-		pop esi
-		emms
-	}
-}
-
-}
-#else // _MSC_VER
-// assume gcc or mingw or win x64
-
-#include <memory.h>
-#include <string.h>
-#include "PS2Etypes.h"
-#include "Utilities/MemcpyFast.h"
-//void * memcpy_amd(void *dest, const void *src, size_t n)
-//{
-//	//memcpy(dest, src, n);
-//	memcpy_fast(dest, src, n);
-//	return dest;
-//}
-#define memcpy_amd memcpy_fast
-
-#endif
-