Core: Add preliminary support for ARM NEON-A64. The generic memset and buffer_copy functions are now supported.
- NEON support currently assumes the A64 instruction set. If there is enough user demand for A32, and if backporting the NEON code to A32 proves feasible, this may be explored at a later date. For now, we are sticking with A64.

commit 7e85253ebb (parent 03be21608e)
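For context, the core of the new NEON path is a splat-and-store pattern: broadcast a value into a 128-bit quad register, then store it repeatedly. A minimal standalone sketch of the same pattern the new memset_u16 uses (not part of this commit; the buffer name and size are illustrative, and the element count is assumed to be a multiple of 8):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
	uint16_t buf[32];
	const uint16x8_t fill = vdupq_n_u16(0x7FFF); // broadcast the fill value to all 8 lanes
	for (size_t i = 0; i < 32; i += 8)
		vst1q_u16(buf + i, fill);                // one 128-bit store per iteration
	printf("buf[31] = 0x%04X\n", buf[31]);       // prints buf[31] = 0x7FFF
	return 0;
}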
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2021 DeSmuME team
+	Copyright (C) 2007-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -576,6 +576,129 @@ static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
 }
 
+#elif defined(ENABLE_NEON_A64)
+
+static void memset_u16(void *dst, const u16 val, const size_t elementCount)
+{
+	u16 *dst16 = (u16 *)dst;
+
+	const v128u16 val_vec128 = vdupq_n_u16(val);
+	for (size_t i = 0; i < elementCount; i+=(sizeof(v128u16)/sizeof(u16)))
+		vst1q_u16(dst16 + i, val_vec128);
+}
+
+template <size_t ELEMENTCOUNT>
+static void memset_u16_fast(void *dst, const u16 val)
+{
+	u16 *dst16 = (u16 *)dst;
+
+	const v128u16 val_vec128 = vdupq_n_u16(val);
+	const uint16x8x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
+	MACRODO_N( ELEMENTCOUNT / (sizeof(uint16x8x4_t) / sizeof(u16)), vst1q_u16_x4(dst16 + ((X) * (sizeof(uint16x8x4_t)/sizeof(u16))), val_vec128x4) );
+}
+
+static void memset_u32(void *dst, const u32 val, const size_t elementCount)
+{
+	u32 *dst32 = (u32 *)dst;
+
+	const v128u32 val_vec128 = vdupq_n_u32(val);
+	for (size_t i = 0; i < elementCount; i+=(sizeof(v128u32)/sizeof(u32)))
+		vst1q_u32(dst32 + i, val_vec128);
+}
+
+template <size_t ELEMENTCOUNT>
+static void memset_u32_fast(void *dst, const u32 val)
+{
+	u32 *dst32 = (u32 *)dst;
+
+	const v128u32 val_vec128 = vdupq_n_u32(val);
+	const uint32x4x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
+	MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
+}
+
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	memcpy(dst, src, VECLENGTH);
+}
+
+template <size_t VECLENGTH>
+static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
+}
+
+template <size_t VECLENGTH>
+static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
+{
+	MACRODO_N( VECLENGTH / sizeof(v128u8), vst1q_u8((u8 *)dst + ((X) * sizeof(v128u8)), vorrq_u8(vld1q_u8((u8 *)src + ((X) * sizeof(v128u8))), c_vec)) );
+}
+
+static void __buffer_copy_or_constant(void *__restrict dst, const void *__restrict src, const size_t vecLength, const v128u8 &c_vec)
+{
+	switch (vecLength)
+	{
+		case 128: __buffer_copy_or_constant_fast<128>(dst, src, c_vec); break;
+		case 256: __buffer_copy_or_constant_fast<256>(dst, src, c_vec); break;
+		case 512: __buffer_copy_or_constant_fast<512>(dst, src, c_vec); break;
+		case 768: __buffer_copy_or_constant_fast<768>(dst, src, c_vec); break;
+		case 1024: __buffer_copy_or_constant_fast<1024>(dst, src, c_vec); break;
+		case 2048: __buffer_copy_or_constant_fast<2048>(dst, src, c_vec); break;
+		case 2304: __buffer_copy_or_constant_fast<2304>(dst, src, c_vec); break;
+		case 4096: __buffer_copy_or_constant_fast<4096>(dst, src, c_vec); break;
+
+		default:
+		{
+			for (size_t i = 0; i < vecLength; i+=sizeof(v128u8))
+			{
+				vst1q_u8( (u8 *)dst + i, vorrq_u8(vld1q_u8((u8 *)src + i), c_vec) );
+			}
+			break;
+		}
+	}
+}
+
+static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s8 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH>
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
+template <bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s16 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH, bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
+template <bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s32 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH, bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
 #elif defined(ENABLE_ALTIVEC)
 
 static void memset_u16(void *dst, const u16 val, const size_t elementCount)
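The __buffer_copy_or_constant family above ORs a splatted constant into each vector of source data as it copies. A condensed sketch of that load-OR-store pattern, minus DeSmuME's MACRODO_N unrolling (the function name here is hypothetical, and the byte length is assumed to be a multiple of 16):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Copy src to dst while OR-ing a constant byte into every element.
static void copy_or_u8(uint8_t *dst, const uint8_t *src, size_t lenBytes, uint8_t c)
{
	const uint8x16_t c_vec = vdupq_n_u8(c);      // splat the constant byte
	for (size_t i = 0; i < lenBytes; i += 16)    // 16 bytes per uint8x16_t
		vst1q_u8(dst + i, vorrq_u8(vld1q_u8(src + i), c_vec));
}

The _fast variants trade this runtime loop for compile-time expansion: VECLENGTH is a template parameter, so MACRODO_N can unroll into a fixed sequence of 64-byte x4 loads and stores (sizeof(uint8x16x4_t) is 64) with no loop overhead.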
@@ -34,7 +34,7 @@
 // Determine CPU architecture for platforms that don't use the autoconf script
 #if defined(HOST_WINDOWS) || defined(DESMUME_COCOA)
-	#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(__ppc64__)
+	#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__ppc64__)
 		#define HOST_64
 	#else
 		#define HOST_32
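On the _M_ARM64 test being added here: MSVC defines _M_ARM64 when targeting ARM64, while GCC and Clang define __aarch64__, which is why both checks are needed. A tiny standalone probe (not part of this commit) for seeing what a given toolchain reports:

#include <cstdio>

int main()
{
#if defined(__aarch64__)
	puts("__aarch64__ is defined");  // GCC/Clang on ARM64
#endif
#if defined(_M_ARM64)
	puts("_M_ARM64 is defined");     // MSVC on ARM64
#endif
	printf("pointer size: %zu bytes\n", sizeof(void *)); // 8 on any HOST_64 platform
	return 0;
}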
@@ -57,6 +57,14 @@
 	#define ENABLE_ALTIVEC
 #endif
 
+// For now, we're starting off with only NEON-A64, for easier testing and
+// development. If the development for A64 goes well and an A32 backport is
+// discovered to be feasible, then we may explore backporting the NEON code
+// to A32 at a later date.
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && (defined(__aarch64__) || defined(_M_ARM64))
+	#define ENABLE_NEON_A64
+#endif
+
 #ifdef __SSE__
 	#define ENABLE_SSE
 #endif
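To check which way this detection resolves for a particular toolchain, a standalone probe (not part of this commit) can replay the same condition, e.g. when compiling with clang --target=aarch64-linux-gnu:

#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && (defined(__aarch64__) || defined(_M_ARM64))
	#pragma message("ENABLE_NEON_A64 would be defined")
#else
	#pragma message("ENABLE_NEON_A64 would not be defined")
#endif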
@@ -262,6 +270,16 @@ typedef vector unsigned int v128u32;
 typedef vector signed int v128s32;
 #endif
 
+#ifdef ENABLE_NEON_A64
+#include <arm_neon.h>
+typedef uint8x16_t v128u8;
+typedef int8x16_t v128s8;
+typedef uint16x8_t v128u16;
+typedef int16x8_t v128s16;
+typedef uint32x4_t v128u32;
+typedef int32x4_t v128s32;
+#endif
+
 #ifdef ENABLE_SSE2
 #include <emmintrin.h>
 typedef __m128i v128u8;
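These typedefs map the generic v128* names used by the memset/copy code above onto NEON's 128-bit quad-register types, mirroring the AltiVec and SSE2 blocks around them. A quick standalone sanity check (assumes a C++11 compiler for static_assert):

#include <arm_neon.h>

static_assert(sizeof(uint8x16_t) == 16, "v128u8 must be 128-bit");
static_assert(sizeof(uint16x8_t) == 16, "v128u16 must be 128-bit");
static_assert(sizeof(uint32x4_t) == 16, "v128u32 must be 128-bit");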
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2009-2021 DeSmuME team
+	Copyright (C) 2009-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -50,7 +50,7 @@
 #define DESMUME_PLATFORM_STRING " ARM"
 #elif defined(__thumb__)
 #define DESMUME_PLATFORM_STRING " ARM-Thumb"
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #if defined(__APPLE__)
 #define DESMUME_PLATFORM_STRING " ARM64"
 #else
@@ -78,6 +78,8 @@
 #define DESMUME_CPUEXT_PRIMARY_STRING " SSE"
 #elif defined(ENABLE_ALTIVEC)
 #define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec"
+#elif defined(ENABLE_NEON_A64)
+#define DESMUME_CPUEXT_PRIMARY_STRING " NEON-A64"
 #endif
 
 #if defined(ENABLE_AVX512_3)