From 7e85253ebb79767151818b7ae7dd32e08f3531d6 Mon Sep 17 00:00:00 2001
From: rogerman
Date: Sat, 2 Apr 2022 21:33:14 -0700
Subject: [PATCH] Core: Add preliminary support for ARM NEON-A64. The generic
 memset and buffer_copy functions are now supported.

- Note that NEON support assumes the A64 instruction set. If there is enough
  user demand for running the A32 instruction set, and if backporting the
  NEON code to A32 proves feasible, this may be explored at a later date.
  For now, we are sticking with A64.
---
 desmume/src/matrix.h    | 125 +++++++++++++++++++++++++++++++++++++++-
 desmume/src/types.h     |  20 ++++++-
 desmume/src/version.cpp |   6 +-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index d3d0bce47..88ee6dfd4 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -1,6 +1,6 @@
 /*
 	Copyright (C) 2006-2007 shash
-	Copyright (C) 2007-2021 DeSmuME team
+	Copyright (C) 2007-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -576,6 +576,129 @@ static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *_
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
 }
 
+#elif defined(ENABLE_NEON_A64)
+
+static void memset_u16(void *dst, const u16 val, const size_t elementCount)
+{
+	u16 *dst16 = (u16 *)dst;
+
+	const v128u16 val_vec128 = vdupq_n_u16(val);
+	for (size_t i = 0; i < elementCount; i+=(sizeof(v128u16)/sizeof(u16)))
+		vst1q_u16(dst16 + i, val_vec128);
+}
+
+template <size_t ELEMENTCOUNT>
+static void memset_u16_fast(void *dst, const u16 val)
+{
+	u16 *dst16 = (u16 *)dst;
+
+	const v128u16 val_vec128 = vdupq_n_u16(val);
+	const uint16x8x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
+	MACRODO_N( ELEMENTCOUNT / (sizeof(uint16x8x4_t) / sizeof(u16)), vst1q_u16_x4(dst16 + ((X) * (sizeof(uint16x8x4_t)/sizeof(u16))), val_vec128x4) );
+}
+
+static void memset_u32(void *dst, const u32 val, const size_t elementCount)
+{
+	u32 *dst32 = (u32 *)dst;
+
+	const v128u32 val_vec128 = vdupq_n_u32(val);
+	for (size_t i = 0; i < elementCount; i+=(sizeof(v128u32)/sizeof(u32)))
+		vst1q_u32(dst32 + i, val_vec128);
+}
+
+template <size_t ELEMENTCOUNT>
+static void memset_u32_fast(void *dst, const u32 val)
+{
+	u32 *dst32 = (u32 *)dst;
+
+	const v128u32 val_vec128 = vdupq_n_u32(val);
+	const uint32x4x4_t val_vec128x4 = { val_vec128, val_vec128, val_vec128, val_vec128 };
+	MACRODO_N( ELEMENTCOUNT / (sizeof(uint32x4x4_t) / sizeof(u32)), vst1q_u32_x4(dst32 + ((X) * (sizeof(uint32x4x4_t)/sizeof(u32))), val_vec128x4) );
+}
+
+template <size_t VECLENGTH>
+static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	memcpy(dst, src, VECLENGTH);
+}
+
+template <size_t VECLENGTH>
+static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+{
+	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
+}
+
+template <size_t VECLENGTH>
+static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 &c_vec)
+{
+	MACRODO_N( VECLENGTH / sizeof(v128u8), vst1q_u8((u8 *)dst + ((X) * sizeof(v128u8)), vorrq_u8(vld1q_u8((u8 *)src + ((X) * sizeof(v128u8))), c_vec)) );
+}
+
+static void __buffer_copy_or_constant(void *__restrict dst, const void *__restrict src, const size_t vecLength, const v128u8 &c_vec)
+{
+	switch (vecLength)
+	{
+		case 128: __buffer_copy_or_constant_fast<128>(dst, src, c_vec); break;
+		case 256: __buffer_copy_or_constant_fast<256>(dst, src, c_vec); break;
+		case 512: __buffer_copy_or_constant_fast<512>(dst, src, c_vec); break;
+		case 768: __buffer_copy_or_constant_fast<768>(dst, src, c_vec); break;
+		case 1024: __buffer_copy_or_constant_fast<1024>(dst, src, c_vec); break;
+		case 2048: __buffer_copy_or_constant_fast<2048>(dst, src, c_vec); break;
+		case 2304: __buffer_copy_or_constant_fast<2304>(dst, src, c_vec); break;
+		case 4096: __buffer_copy_or_constant_fast<4096>(dst, src, c_vec); break;
+
+		default:
+		{
+			for (size_t i = 0; i < vecLength; i+=sizeof(v128u8))
+			{
+				vst1q_u8( (u8 *)dst + i, vorrq_u8(vld1q_u8((u8 *)src + i), c_vec) );
+			}
+			break;
+		}
+	}
+}
+
+static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s8 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH>
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s8( vdupq_n_s8(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
+template <bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s16 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH, bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s16( vdupq_n_s16(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
+template <bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__restrict src, const size_t vecLength, const s32 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
+	__buffer_copy_or_constant(dst, src, vecLength, c_vec);
+}
+
+template <size_t VECLENGTH, bool NEEDENDIANSWAP>
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+{
+	const v128u8 c_vec = vreinterpretq_u8_s32( vdupq_n_s32(c) );
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
+}
+
 #elif defined(ENABLE_ALTIVEC)
 
 static void memset_u16(void *dst, const u16 val, const size_t elementCount)
diff --git a/desmume/src/types.h b/desmume/src/types.h
index 831fd47c9..0fe31b347 100644
--- a/desmume/src/types.h
+++ b/desmume/src/types.h
@@ -34,7 +34,7 @@
 
 // Determine CPU architecture for platforms that don't use the autoconf script
 #if defined(HOST_WINDOWS) || defined(DESMUME_COCOA)
-	#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(__ppc64__)
+	#if defined(__x86_64__) || defined(__LP64) || defined(__IA64__) || defined(_M_X64) || defined(_WIN64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__ppc64__)
 		#define HOST_64
 	#else
 		#define HOST_32
@@ -57,6 +57,14 @@
 		#define ENABLE_ALTIVEC
 	#endif
 
+	// For now, we are starting off with NEON-A64 only for easier testing and
+	// development. If A64 development goes well and an A32 backport proves
+	// feasible, then we may explore backporting the NEON code to A32 at a
+	// later date.
+	#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && (defined(__aarch64__) || defined(_M_ARM64))
+		#define ENABLE_NEON_A64
+	#endif
+
 	#ifdef __SSE__
 		#define ENABLE_SSE
 	#endif
@@ -262,6 +270,16 @@ typedef vector unsigned int v128u32;
 typedef vector signed int v128s32;
 #endif
 
+#ifdef ENABLE_NEON_A64
+#include <arm_neon.h>
+typedef uint8x16_t v128u8;
+typedef int8x16_t v128s8;
+typedef uint16x8_t v128u16;
+typedef int16x8_t v128s16;
+typedef uint32x4_t v128u32;
+typedef int32x4_t v128s32;
+#endif
+
 #ifdef ENABLE_SSE2
 #include <emmintrin.h>
 typedef __m128i v128u8;
diff --git a/desmume/src/version.cpp b/desmume/src/version.cpp
index 17d320687..2aa506dc1 100755
--- a/desmume/src/version.cpp
+++ b/desmume/src/version.cpp
@@ -1,5 +1,5 @@
 /*
-	Copyright (C) 2009-2021 DeSmuME team
+	Copyright (C) 2009-2022 DeSmuME team
 
 	This file is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
@@ -50,7 +50,7 @@
 	#define DESMUME_PLATFORM_STRING " ARM"
 #elif defined(__thumb__)
 	#define DESMUME_PLATFORM_STRING " ARM-Thumb"
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 	#if defined(__APPLE__)
 		#define DESMUME_PLATFORM_STRING " ARM64"
 	#else
@@ -78,6 +78,8 @@
 	#define DESMUME_CPUEXT_PRIMARY_STRING " SSE"
 #elif defined(ENABLE_ALTIVEC)
 	#define DESMUME_CPUEXT_PRIMARY_STRING " AltiVec"
+#elif defined(ENABLE_NEON_A64)
+	#define DESMUME_CPUEXT_PRIMARY_STRING " NEON-A64"
 #endif
 
 #if defined(ENABLE_AVX512_3)
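
For reference only, not part of the patch above: a minimal standalone sketch of the NEON idiom the new matrix.h helpers rely on. A constant is broadcast into a 128-bit register with vdupq_n_*, then either stored repeatedly (fill) or OR-ed into data as it is copied, 16 bytes per iteration. The function names and the multiple-of-8/16 length assumptions below are hypothetical illustrations, not DeSmuME symbols, and the sketch assumes an AArch64 toolchain where <arm_neon.h> is available.

// Illustrative sketch; mirrors the fill and copy-with-OR patterns used by the
// new NEON-A64 helpers, using plain C++ types instead of DeSmuME's typedefs.
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Fill a u16 buffer with 'val', 8 elements (128 bits) per store.
// Assumes elementCount is a multiple of 8, as the patch's memset_u16 does.
static void fill_u16_neon(uint16_t *dst, uint16_t val, size_t elementCount)
{
	const uint16x8_t v = vdupq_n_u16(val);
	for (size_t i = 0; i < elementCount; i += 8)
		vst1q_u16(dst + i, v);
}

// Copy 'vecLength' bytes from src to dst while OR-ing in a broadcast byte,
// the same pattern as the fallback loop in __buffer_copy_or_constant.
// Assumes vecLength is a multiple of 16.
static void copy_or_constant_u8_neon(uint8_t *dst, const uint8_t *src,
                                     size_t vecLength, uint8_t c)
{
	const uint8x16_t c_vec = vdupq_n_u8(c);
	for (size_t i = 0; i < vecLength; i += 16)
		vst1q_u8(dst + i, vorrq_u8(vld1q_u8(src + i), c_vec));
}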