/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef __LINUX__

# include "lnx_memzero.h"

extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);

#if 0
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned.
static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX registers are alignment-safe, fast, and widely available. Let's just
	// stick with them. --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air
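	// One possible way to get that (a sketch only, untested here): bind the operands to
	// the registers the block below already expects via GCC's explicit register
	// variables, so the three setup 'mov's could vanish:
	//
	//   register uptr d asm("ecx") = (uptr)dest;
	//   register uptr s asm("edx") = (uptr)src;
	//   register uptr n asm("eax") = qwc;
	//
	// The asm could then name eax/ecx/edx directly as operands instead of reloading them.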

	// Note: the '%=' suffixes keep the local labels unique in case this __forceinline
	// function is instantiated at more than one call site.
	__asm__
	(
		".intel_syntax noprefix\n"
		"mov ecx, %[dest]\n"
		"mov edx, %[src]\n"
		"mov eax, %[qwc]\n"					// keep a copy of count
		"shr eax, 1\n"
		"jz memcpy_qwc_1%=\n"				// only one 16 byte block to copy?

		"cmp eax, 64\n"						// "IN_CACHE_COPY/32"
		"jb memcpy_qwc_loop1%=\n"			// small copies should be cached (definite speedup --air)

		"memcpy_qwc_loop2%=:\n"				// 32-byte blocks, uncached copy
		"prefetchnta [edx + 568]\n"			// start reading ahead (tested: it helps! --air)

		"movq mm0,[edx+0]\n"				// read 64 bits
		"movq mm1,[edx+8]\n"
		"movq mm2,[edx+16]\n"
		"movntq [ecx+0], mm0\n"				// write 64 bits, bypassing the cache
		"movntq [ecx+8], mm1\n"
		"movq mm3,[edx+24]\n"
		"movntq [ecx+16], mm2\n"
		"movntq [ecx+24], mm3\n"

		"add edx,32\n"						// update source pointer
		"add ecx,32\n"						// update destination pointer
		"sub eax,1\n"
		"jnz memcpy_qwc_loop2%=\n"			// any more 32-byte blocks?
		"sfence\n"							// flush the write buffer
		"jmp memcpy_qwc_1%=\n"

		// 32-byte blocks, cached!
		// This *is* important. Removing this and using exclusively non-temporal stores
		// results in noticeable speed loss!

		"memcpy_qwc_loop1%=:\n"
		"prefetchnta [edx + 568]\n"			// start reading ahead (tested: it helps! --air)

		"movq mm0,[edx+0]\n"				// read 64 bits
		"movq mm1,[edx+8]\n"
		"movq mm2,[edx+16]\n"
		"movq [ecx+0], mm0\n"				// write 64 bits through the cache
		"movq [ecx+8], mm1\n"
		"movq mm3,[edx+24]\n"
		"movq [ecx+16], mm2\n"
		"movq [ecx+24], mm3\n"

		"add edx,32\n"						// update source pointer
		"add ecx,32\n"						// update destination pointer
		"sub eax,1\n"
		"jnz memcpy_qwc_loop1%=\n"			// any more 32-byte blocks?

		"memcpy_qwc_1%=:\n"
		"test %[qwc], 1\n"					// odd quadword left over?
		"jz memcpy_qwc_final%=\n"
		"movq mm0,[edx]\n"
		"movq mm1,[edx+8]\n"
		"movq [ecx], mm0\n"
		"movq [ecx+8], mm1\n"

		"memcpy_qwc_final%=:\n"
		"emms\n"							// clean up the MMX state
		".att_syntax\n"
		: // no outputs; the asm works on scratch copies in eax/ecx/edx
		: [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc)
		: "memory", "eax", "ecx", "edx", "mm0", "mm1", "mm2", "mm3"
	);
}
#endif
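// Since the asm version above is disabled, here is a minimal portable sketch of the
// intended semantics (the helper name is hypothetical and not part of the build):
#if 0
#include <string.h>
static __forceinline void memcpy_qwc_reference(void *dest, const void *src, size_t qwc)
{
	// One QWC is 16 bytes; memcpy imposes no alignment requirement on either pointer.
	memcpy(dest, src, qwc * 16);
}
#endif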

#else

# include "win_memzero.h"

extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);

#endif

// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );

#define memcpy_fast				memcpy_amd_			// Fast memcpy
#define memcpy_aligned(d,s,c)	memcpy_amd_(d,s,c)	// Memcpy with 16-byte aligned addresses
#define memcpy_const			memcpy_amd_			// Memcpy with constant size
#define memcpy_constA			memcpy_amd_			// Memcpy with constant size and 16-byte aligned

#ifndef __LINUX__
#define memcpy_qwc(d,s,c)		memcpy_amd_qwc(d,s,c)
#else
// The Linux build disables memcpy_amd_qwc above, so fall back to the plain fast memcpy
// with the quadword count converted to bytes (1 QWC == 16 bytes). The count is
// parenthesized so that expression arguments expand safely.
#define memcpy_qwc(d,s,c)		memcpy_amd_(d,s,(c)*16)
//#define memcpy_qwc(d,s,c)		memcpy_amd_qwc(d,s,c)
#endif
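
// Usage sketch (hypothetical example, not from the original header): memcpy_qwc
// counts in 128-bit quadwords, so copying four u128s looks like this:
//
//   u128 dst[4], src[4];
//   memcpy_qwc(dst, src, 4);	// 4 QWCs == 64 bytes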