matrix.h: The AltiVec and NEON versions of stream_copy_fast() now mirror their associated buffer_copy_fast() functions instead of calling memcpy().
- Since stream_copy_fast() and buffer_copy_fast() are intended for small finite-sized buffers only, we shouldn't need a full memcpy() call with these.
This commit is contained in:
parent
7e85253ebb
commit
8a9fec431a
|
@ -616,18 +616,20 @@ static void memset_u32_fast(void *dst, const u32 val)
|
|||
MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
|
||||
}
|
||||
|
||||
// Copies exactly VECLENGTH bytes from src to dst by delegating to memcpy().
// Both pointers are declared __restrict, so the two regions must not overlap.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
	const size_t byteCount = VECLENGTH;
	memcpy(dst, src, byteCount);
}
|
||||
|
||||
// Copies a compile-time-sized buffer from src to dst using NEON multi-register
// loads/stores: each step loads one uint8x16x4_t from src (vld1q_u8_x4) and
// stores it to the same byte offset in dst (vst1q_u8_x4).
//
// VECLENGTH is the buffer size in bytes. The step count is
// VECLENGTH / sizeof(uint8x16x4_t) (integer division), so any remainder that
// does not fill a whole uint8x16x4_t is not copied by this function.
//
// NOTE(review): MACRODO_N presumably repeats the statement N times with X as
// the 0-based repetition index -- confirm against the macro's definition.
template <size_t VECLENGTH>
|
||||
static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
|
||||
{
|
||||
MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
|
||||
}
|
||||
|
||||
template <size_t VECLENGTH>
|
||||
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
|
||||
{
|
||||
// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
|
||||
// so just use buffer_copy_fast() for this function too.
|
||||
buffer_copy_fast<VECLENGTH>(dst, src);
|
||||
}
|
||||
|
||||
template <size_t VECLENGTH>
|
||||
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
|
||||
{
|
||||
|
@ -739,18 +741,20 @@ static void memset_u32_fast(void *dst, const u32 val)
|
|||
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
|
||||
}
|
||||
|
||||
// Copies exactly VECLENGTH bytes from src to dst by delegating to memcpy().
// Both pointers are declared __restrict, so the two regions must not overlap.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
	const size_t byteCount = VECLENGTH;
	memcpy(dst, src, byteCount);
}
|
||||
|
||||
// Copies a compile-time-sized buffer from src to dst using AltiVec vector
// loads/stores: each step loads one vector from src at byte offset
// X * sizeof(v128s8) (vec_ld) and stores it to the same offset in dst (vec_st).
//
// VECLENGTH is the buffer size in bytes. The step count is
// VECLENGTH / sizeof(v128s8) (integer division), so any remainder that does
// not fill a whole vector is not copied by this function.
//
// NOTE(review): MACRODO_N presumably repeats the statement N times with X as
// the 0-based repetition index -- confirm against the macro's definition.
// NOTE(review): vec_ld/vec_st are documented as operating on 16-byte-aligned
// addresses -- confirm that callers only pass suitably aligned buffers.
template <size_t VECLENGTH>
|
||||
static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
|
||||
{
|
||||
MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),src), (X)*sizeof(v128s8), dst) );
|
||||
}
|
||||
|
||||
template <size_t VECLENGTH>
|
||||
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
|
||||
{
|
||||
// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
|
||||
// so just use buffer_copy_fast() for this function too.
|
||||
buffer_copy_fast<VECLENGTH>(dst, src, VECLENGTH);
|
||||
}
|
||||
|
||||
template <class T, size_t VECLENGTH, bool NEEDENDIANSWAP>
|
||||
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const T &c_vec)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue