matrix.h: The AltiVec and NEON versions of stream_copy_fast() now mirror their associated buffer_copy_fast() functions instead of calling memcpy().

- Since stream_copy_fast() and buffer_copy_fast() are intended for small finite-sized buffers only, we shouldn't need a full memcpy() call with these.
This commit is contained in:
rogerman 2022-04-02 23:43:46 -07:00
parent 7e85253ebb
commit 8a9fec431a
1 changed file with 16 additions and 12 deletions

View File

@ -616,18 +616,20 @@ static void memset_u32_fast(void *dst, const u32 val)
MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
}
// Copies exactly VECLENGTH bytes from src to dst by delegating to the
// standard library. dst and src are declared __restrict, so the two
// regions must not overlap.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
	// The byte count is fixed at compile time by the template argument.
	const size_t copyLength = VECLENGTH;
	memcpy(dst, src, copyLength);
}
// Copies data from src to dst in uint8x16x4_t-sized chunks using the NEON
// multi-register load/store intrinsics vld1q_u8_x4() / vst1q_u8_x4().
//
// The chunk count passed to MACRODO_N() is VECLENGTH / sizeof(uint8x16x4_t),
// which is an integer division: VECLENGTH is presumably expected to be a
// multiple of sizeof(uint8x16x4_t), since any remainder would not be covered.
// NOTE(review): MACRODO_N is defined elsewhere; it appears to repeat the given
// statement with X as the repetition index -- confirm against its definition.
template <size_t VECLENGTH>
static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
{
// For each chunk index X: load one uint8x16x4_t from src at byte offset
// X * sizeof(uint8x16x4_t) and store it to dst at the same byte offset.
MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
}
// Streaming-copy entry point for the NEON code path. It forwards directly to
// buffer_copy_fast<VECLENGTH>(), so both functions perform the same copy; the
// length is carried entirely by the VECLENGTH template argument.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
// so just use buffer_copy_fast() for this function too.
buffer_copy_fast<VECLENGTH>(dst, src);
}
template <size_t VECLENGTH>
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
{
@ -739,18 +741,20 @@ static void memset_u32_fast(void *dst, const u32 val)
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
// Copies exactly VECLENGTH bytes from src to dst by delegating to the
// standard library. dst and src are declared __restrict, so the two
// regions must not overlap.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
	// The byte count is fixed at compile time by the template argument.
	const size_t copyLength = VECLENGTH;
	memcpy(dst, src, copyLength);
}
// Copies data from src to dst in v128s8-sized chunks using the AltiVec
// vec_ld() / vec_st() intrinsics, addressing each chunk by byte offset.
//
// The chunk count passed to MACRODO_N() is VECLENGTH / sizeof(v128s8), which
// is an integer division: VECLENGTH is presumably expected to be a multiple of
// sizeof(v128s8), since any remainder would not be covered.
// NOTE(review): MACRODO_N is defined elsewhere; it appears to repeat the given
// statement with X as the repetition index -- confirm against its definition.
// NOTE(review): vec_ld/vec_st are documented as aligned 16-byte accesses, so
// dst and src are presumably required to be 16-byte aligned -- confirm with callers.
template <size_t VECLENGTH>
static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
{
// For each chunk index X: load one vector from src at byte offset
// X * sizeof(v128s8) and store it to dst at the same byte offset.
MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),src), (X)*sizeof(v128s8), dst) );
}
// Streaming-copy entry point for the AltiVec code path. It forwards directly
// to buffer_copy_fast<VECLENGTH>(), so both functions perform the same copy.
template <size_t VECLENGTH>
static void stream_copy_fast(void *__restrict dst, void *__restrict src)
{
	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
	// so just use buffer_copy_fast() for this function too.
	//
	// The copy length is carried by the VECLENGTH template argument alone:
	// buffer_copy_fast() takes only (dst, src), so no size is passed at runtime.
	buffer_copy_fast<VECLENGTH>(dst, src);
}
template <class T, size_t VECLENGTH, bool NEEDENDIANSWAP>
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const T &c_vec)
{