matrix.h: The AltiVec and NEON versions of stream_copy_fast() now mirror their associated buffer_copy_fast() functions instead of calling memcpy().

- Since stream_copy_fast() and buffer_copy_fast() are intended for small finite-sized buffers only, we shouldn't need a full memcpy() call with these.
2022-04-02 23:43:46 -07:00 · 2022-04-02 23:43:46 -07:00 · 8a9fec431a
parent 7e85253ebb
commit 8a9fec431a
1 changed files with 16 additions and 12 deletions
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@ -616,18 +616,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
 }

-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
 }

+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do, so there
+	// is no separate streaming path here: forward to buffer_copy_fast() with the same VECLENGTH.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template <size_t VECLENGTH>
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
 {
@ -739,18 +741,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
 }

-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),src), (X)*sizeof(v128s8), dst) );
 }

+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do, so just
+	// forward to buffer_copy_fast(); the length is its VECLENGTH template argument, not a parameter.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template <class T, size_t VECLENGTH, bool NEEDENDIANSWAP>
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const T &c_vec)
 {