diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index 88ee6dfd4..b4795b2f1 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -616,18 +616,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
 }
 
-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
 }
 
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
+	// so this simply forwards to buffer_copy_fast() with the same VECLENGTH.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template <size_t VECLENGTH>
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
 {
@@ -739,18 +741,20 @@ static void memset_u32_fast(void *dst, const u32 val)
 	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
 }
 
-template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
-{
-	memcpy(dst, src, VECLENGTH);
-}
-
 template <size_t VECLENGTH>
 static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),src), (X)*sizeof(v128s8), dst) );
 }
 
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
+	// so just forward to buffer_copy_fast(); the length is the template argument only.
+	buffer_copy_fast<VECLENGTH>(dst, src);
+}
+
 template <class T, size_t VECLENGTH, bool NEEDENDIANSWAP>
 static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const T &c_vec)
 {