diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 88ee6dfd4..b4795b2f1 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -616,18 +616,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
 }
 
-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
 }
 
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
+	// so just use buffer_copy_fast() for this function too.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
 {
@@ -739,18 +741,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
 }
 
-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),src), (X)*sizeof(v128s8), dst) );
 }
 
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
+	// so just use buffer_copy_fast() for this function too.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const T &c_vec)
 {