pcsx2/common/include/Utilities/win_memzero.h

592 lines
12 KiB
C++

/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2009 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
// These functions are meant for memset operations of constant length only.
// For dynamic length clears, use the C-compiler provided memset instead.
// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize it's asm code better. The compiler
// won't optimize local variables very well because it insists in storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)
// All methods defined in this header use template in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal compile-
// time code for each use of the function.
// Use of CLD (Clear Direction Flag):
// On Windows platforms the ABI declares that the direction flag should be cleared upon
// entry of *any* function. Therefore there is no need to have CLD prior to our use of
// rep strosd here.
// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(aligned(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.
// MSVC Template Issue:
// MSVC treats int template parameters like macro insertions. That is, if you have a
// a template parameter in the form of "func<10-5>()", MSVC inserts 10-5 into the
// templated function, causing order-of-operation problems (sigh). The normal fix would
// be to assign the template parameter to a static const int inside each function, but that
// won't fly with the enums optimization. So in order to fix the problem I define a macro
// that encapsulates the template parameter inside parenthesis for us:
#define MZFbytes (_bytes)
// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
static __forceinline void memzero_ptr( void *dest )
{
if( MZFbytes == 0 ) return;
// This function only works on 32-bit alignments. For anything else we just fall back
// on the compiler-provided implementation of memset...
if( (MZFbytes & 0x3) != 0 )
{
memset( dest, 0, MZFbytes );
return;
}
enum
{
remainder = MZFbytes & 127,
bytes128 = MZFbytes / 128
};
// Initial check -- if the length is not a multiple of 16 then fall back on
// using rep movsd methods. Handling these unaligned clears in a more efficient
// manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
// scenarios).
if( (MZFbytes & 0xf) == 0 )
{
u64 _xmm_backup[2];
if( ((uptr)dest & 0xf) != 0 )
{
// UNALIGNED COPY MODE.
// For unaligned copies we have a threshold of at least 128 vectors. Anything
// less and it's probably better off just falling back on the rep movsd.
if( bytes128 > 128 )
{
__asm
{
movups _xmm_backup,xmm0;
mov ecx,dest
pxor xmm0,xmm0
mov eax,bytes128
align 16
_loop_6:
movups [ecx],xmm0;
movups [ecx+0x10],xmm0;
movups [ecx+0x20],xmm0;
movups [ecx+0x30],xmm0;
movups [ecx+0x40],xmm0;
movups [ecx+0x50],xmm0;
movups [ecx+0x60],xmm0;
movups [ecx+0x70],xmm0;
sub ecx,-128
dec eax;
jnz _loop_6;
}
if( remainder != 0 )
{
// Copy the remainder in reverse (using the decrementing eax as our indexer)
__asm
{
mov eax, remainder
_loop_5:
movups [ecx+eax],xmm0;
sub eax,16;
jnz _loop_5;
}
}
__asm
{
movups xmm0,[_xmm_backup];
}
return;
}
}
else if( bytes128 > 48 )
{
// ALIGNED COPY MODE
// Data is aligned and the size of data is large enough to merit a nice
// fancy chunk of unrolled goodness:
__asm
{
movups _xmm_backup,xmm0;
mov ecx,dest
pxor xmm0,xmm0
mov eax,bytes128
align 16
_loop_8:
movaps [ecx],xmm0;
movaps [ecx+0x10],xmm0;
movaps [ecx+0x20],xmm0;
movaps [ecx+0x30],xmm0;
movaps [ecx+0x40],xmm0;
movaps [ecx+0x50],xmm0;
movaps [ecx+0x60],xmm0;
movaps [ecx+0x70],xmm0;
sub ecx,-128
dec eax;
jnz _loop_8;
}
if( remainder != 0 )
{
// Copy the remainder in reverse (using the decrementing eax as our indexer)
__asm
{
mov eax, remainder
_loop_10:
movaps [ecx+eax],xmm0;
sub eax,16;
jnz _loop_10;
}
}
__asm
{
movups xmm0,[_xmm_backup];
}
return;
}
}
// This function only works on 32-bit alignments.
jASSUME( (MZFbytes & 0x3) == 0 );
jASSUME( ((uptr)dest & 0x3) == 0 );
enum
{
remdat = MZFbytes >> 2
};
// This case statement handles 5 special-case sizes (small blocks)
// in addition to the generic large block that uses rep stosd.
switch( remdat )
{
case 1:
*(u32*)dest = 0;
return;
case 2:
*(u64*)dest = 0;
return;
case 3:
__asm
{
mov edi, dest
xor eax, eax
stosd
stosd
stosd
}
return;
case 4:
__asm
{
mov edi, dest
xor eax, eax
stosd
stosd
stosd
stosd
}
return;
case 5:
__asm
{
mov edi, dest
xor eax, eax
stosd
stosd
stosd
stosd
stosd
}
return;
default:
__asm
{
mov ecx, remdat
mov edi, dest
xor eax, eax
rep stosd
}
return;
}
}
// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
static __forceinline void memset_8( void *dest )
{
if( MZFbytes == 0 ) return;
if( (MZFbytes & 0x3) != 0 )
{
// unaligned data length. No point in doing an optimized inline version (too complicated!)
// So fall back on the compiler implementation:
memset( dest, data, MZFbytes );
return;
}
//u64 _xmm_backup[2];
/*static const size_t remainder = MZFbytes & 127;
static const size_t bytes128 = MZFbytes / 128;
if( bytes128 > 32 )
{
// This function only works on 128-bit alignments.
jASSUME( (MZFbytes & 0xf) == 0 );
jASSUME( ((uptr)dest & 0xf) == 0 );
__asm
{
movups _xmm_backup,xmm0;
mov eax,bytes128
mov ecx,dest
movss xmm0,data
align 16
_loop_8:
movaps [ecx],xmm0;
movaps [ecx+0x10],xmm0;
movaps [ecx+0x20],xmm0;
movaps [ecx+0x30],xmm0;
movaps [ecx+0x40],xmm0;
movaps [ecx+0x50],xmm0;
movaps [ecx+0x60],xmm0;
movaps [ecx+0x70],xmm0;
sub ecx,-128
dec eax;
jnz _loop_8;
}
if( remainder != 0 )
{
// Copy the remainder in reverse (using the decrementing eax as our indexer)
__asm
{
mov eax, remainder
_loop_10:
movaps [ecx+eax],xmm0;
sub eax,16;
jnz _loop_10;
}
}
__asm
{
movups xmm0,[_xmm_backup];
}
}*/
// This function only works on 32-bit alignments of data copied.
jASSUME( (MZFbytes & 0x3) == 0 );
enum
{
remdat = MZFbytes >> 2,
data32 = data + (data<<8) + (data<<16) + (data<<24)
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
case 2:
((u32*)dest)[0] = data32;
((u32*)dest)[1] = data32;
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return;
}
}
template< u16 data, size_t _bytes >
static __forceinline void memset_16( void *dest )
{
if( MZFbytes == 0 ) return;
if( (MZFbytes & 0x1) != 0 )
throw Exception::LogicError( "Invalid parameter passed to memset_16 - data length is not a multiple of 16 or 32 bits." );
if( (MZFbytes & 0x3) != 0 )
{
// Unaligned data length. No point in doing an optimized inline version (too complicated with
// remainders and such).
_memset16_unaligned( dest, data, MZFbytes );
return;
}
//u64 _xmm_backup[2];
// This function only works on 32-bit alignments of data copied.
jASSUME( (MZFbytes & 0x3) == 0 );
enum
{
remdat = MZFbytes >> 2,
data32 = data + (data<<16)
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
case 2:
((u32*)dest)[0] = data32;
((u32*)dest)[1] = data32;
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return
}
}
template< u32 data, size_t MZFbytes >
static __forceinline void memset_32( void *dest )
{
if( MZFbytes == 0 ) return;
if( (MZFbytes & 0x3) != 0 )
throw Exception::LogicError( "Invalid parameter passed to memset_32 - data length is not a multiple of 32 bits." );
//u64 _xmm_backup[2];
// This function only works on 32-bit alignments of data copied.
// If the data length is not a factor of 32 bits, the C++ optimizing compiler will
// probably just generate mysteriously broken code in Release builds. ;)
jASSUME( (MZFbytes & 0x3) == 0 );
enum
{
remdat = MZFbytes>>2,
data32 = data
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
case 2:
((u32*)dest)[0] = data32;
((u32*)dest)[1] = data32;
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return
}
}
// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc. No need to include sizeof() crap, this does it automatically
// for you!
template< typename T >
static __forceinline void memzero_obj( T& object )
{
memzero_ptr<sizeof(T)>( &object );
}
// This method clears an object with the given 8 bit value.
template< u8 data, typename T >
static __forceinline void memset8_obj( T& object )
{
memset_8<data, sizeof(T)>( &object );
}
// This method clears an object with the given 16 bit value.
template< u16 data, typename T >
static __forceinline void memset16_obj( T& object )
{
memset_16<data, sizeof(T)>( &object );
}
// This method clears an object with the given 32 bit value.
template< u32 data, typename T >
static __forceinline void memset32_obj( T& object )
{
memset_32<data, sizeof(T)>( &object );
}
#undef MZFbytes