/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef __LINUX__

# include "lnx_memzero.h"

extern "C" void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern "C" u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern "C" void memxor_mmx(void* dst, const void* src1, int cmpsize);

#if 0
// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned.
static __forceinline void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX registers are alignment-safe, fast, and widely available. Let's just
	// stick with them. --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air
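	// One possible way to get that (a sketch only, untested here): bind the operands to
	// the registers the block below already expects via GCC's explicit register
	// variables, so the three setup 'mov's could vanish:
	//
	//   register uptr d asm("ecx") = (uptr)dest;
	//   register uptr s asm("edx") = (uptr)src;
	//   register uptr n asm("eax") = qwc;
	//
	// The asm could then name eax/ecx/edx directly as operands instead of reloading them.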

	// Note: the '%=' suffixes keep the local labels unique in case this __forceinline
	// function is instantiated at more than one call site.
	__asm__
	(
		".intel_syntax noprefix\n"
		"mov ecx, %[dest]\n"
		"mov edx, %[src]\n"
		"mov eax, %[qwc]\n"					// keep a copy of count
		"shr eax, 1\n"
		"jz memcpy_qwc_1%=\n"				// only one 16 byte block to copy?

		"cmp eax, 64\n"						// "IN_CACHE_COPY/32"
		"jb memcpy_qwc_loop1%=\n"			// small copies should be cached (definite speedup --air)

		"memcpy_qwc_loop2%=:\n"				// 32-byte blocks, uncached copy
		"prefetchnta [edx + 568]\n"			// start reading ahead (tested: it helps! --air)

		"movq mm0,[edx+0]\n"				// read 64 bits
		"movq mm1,[edx+8]\n"
		"movq mm2,[edx+16]\n"
		"movntq [ecx+0], mm0\n"				// write 64 bits, bypassing the cache
		"movntq [ecx+8], mm1\n"
		"movq mm3,[edx+24]\n"
		"movntq [ecx+16], mm2\n"
		"movntq [ecx+24], mm3\n"

		"add edx,32\n"						// update source pointer
		"add ecx,32\n"						// update destination pointer
		"sub eax,1\n"
		"jnz memcpy_qwc_loop2%=\n"			// any more 32-byte blocks?
		"sfence\n"							// flush the write buffer
		"jmp memcpy_qwc_1%=\n"

		// 32-byte blocks, cached!
		// This *is* important. Removing this and using exclusively non-temporal stores
		// results in noticeable speed loss!

		"memcpy_qwc_loop1%=:\n"
		"prefetchnta [edx + 568]\n"			// start reading ahead (tested: it helps! --air)

		"movq mm0,[edx+0]\n"				// read 64 bits
		"movq mm1,[edx+8]\n"
		"movq mm2,[edx+16]\n"
		"movq [ecx+0], mm0\n"				// write 64 bits through the cache
		"movq [ecx+8], mm1\n"
		"movq mm3,[edx+24]\n"
		"movq [ecx+16], mm2\n"
		"movq [ecx+24], mm3\n"

		"add edx,32\n"						// update source pointer
		"add ecx,32\n"						// update destination pointer
		"sub eax,1\n"
		"jnz memcpy_qwc_loop1%=\n"			// any more 32-byte blocks?

		"memcpy_qwc_1%=:\n"
		"test %[qwc], 1\n"					// odd quadword left over?
		"jz memcpy_qwc_final%=\n"
		"movq mm0,[edx]\n"
		"movq mm1,[edx+8]\n"
		"movq [ecx], mm0\n"
		"movq [ecx+8], mm1\n"

		"memcpy_qwc_final%=:\n"
		"emms\n"							// clean up the MMX state
		".att_syntax\n"
		: // no outputs; the asm works on scratch copies in eax/ecx/edx
		: [dest]"r"(dest), [src]"r"(src), [qwc]"r"(qwc)
		: "memory", "eax", "ecx", "edx", "mm0", "mm1", "mm2", "mm3"
	);
}
#endif
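// Since the asm version above is disabled, here is a minimal portable sketch of the
// intended semantics (the helper name is hypothetical and not part of the build):
#if 0
#include <string.h>
static __forceinline void memcpy_qwc_reference(void *dest, const void *src, size_t qwc)
{
	// One QWC is 16 bytes; memcpy imposes no alignment requirement on either pointer.
	memcpy(dest, src, qwc * 16);
}
#endif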

#else

# include "win_memzero.h"

extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
extern void memcpy_amd_qwc(void *dest, const void *src, size_t bytes);
extern u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize);
extern void memxor_mmx(void* dst, const void* src1, int cmpsize);

#endif

// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );

#define memcpy_fast				memcpy_amd_			// Fast memcpy
#define memcpy_aligned(d,s,c)	memcpy_amd_(d,s,c)	// Memcpy with 16-byte aligned addresses
#define memcpy_const			memcpy_amd_			// Memcpy with constant size
#define memcpy_constA			memcpy_amd_			// Memcpy with constant size and 16-byte aligned

#ifndef __LINUX__
#define memcpy_qwc(d,s,c)		memcpy_amd_qwc(d,s,c)
#else
// The Linux build disables memcpy_amd_qwc above, so fall back to the plain fast memcpy
// with the quadword count converted to bytes (1 QWC == 16 bytes). The count is
// parenthesized so that expression arguments expand safely.
#define memcpy_qwc(d,s,c)		memcpy_amd_(d,s,(c)*16)
//#define memcpy_qwc(d,s,c)		memcpy_amd_qwc(d,s,c)
#endif
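
// Usage sketch (hypothetical example, not from the original header): memcpy_qwc
// counts in 128-bit quadwords, so copying four u128s looks like this:
//
//   u128 dst[4], src[4];
//   memcpy_qwc(dst, src, 4);	// 4 QWCs == 64 bytes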