From c41a006b2ad96e3b2d412556534fd5def25a6f8e Mon Sep 17 00:00:00 2001 From: rogerman Date: Tue, 13 Feb 2018 14:45:17 -0800 Subject: [PATCH] GPU: Add additional basic SIMD-accelerated functions for memset_u16(), memset_u16_fast(), memset_u32(), and memset_u32_fast() for AVX2 and Altivec. --- desmume/src/matrix.h | 120 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 19 deletions(-) diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index 8a1cbc222..1d0b667df 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -114,16 +114,56 @@ FORCEINLINE s32 s32floor(double d) return s32floor((float)d); } -//switched SSE2 functions -//------------- -#ifdef ENABLE_SSE2 +// SIMD Functions +//------------- +#if defined(ENABLE_AVX2) + +static void memset_u16(void *dst, const u16 val, const size_t elementCount) +{ + v256u16 *dst_vec256 = (v256u16 *)dst; + const size_t length_vec256 = elementCount / (sizeof(v256u16) / sizeof(u16)); + + const v256u16 val_vec256 = _mm256_set1_epi16(val); + for (size_t i = 0; i < length_vec256; i++) + _mm256_stream_si256(dst_vec256 + i, val_vec256); +} + +template +static void memset_u16_fast(void *dst, const u16 val) +{ + v256u16 *dst_vec256 = (v256u16 *)dst; + + const v256u16 val_vec256 = _mm256_set1_epi16(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v256u16) / sizeof(u16)), _mm256_store_si256(dst_vec256 + (X), val_vec256)); +} + +static void memset_u32(void *dst, const u32 val, const size_t elementCount) +{ + v256u32 *dst_vec256 = (v256u32 *)dst; + const size_t length_vec256 = elementCount / (sizeof(v256u32) / sizeof(u32)); + + const v256u32 val_vec256 = _mm256_set1_epi32(val); + for (size_t i = 0; i < length_vec256; i++) + _mm256_stream_si256(dst_vec256 + i, val_vec256); +} + +template +static void memset_u32_fast(void *dst, const u32 val) +{ + v256u32 *dst_vec256 = (v256u32 *)dst; + + const v256u32 val_vec256 = _mm256_set1_epi32(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v256u32) / sizeof(u32)), _mm256_store_si256(dst_vec256 + (X), val_vec256)); +} + +#elif defined(ENABLE_SSE2) static void memset_u16(void *dst, const u16 val, const size_t elementCount) { - __m128i *dst_vec128 = (__m128i *)dst; - const __m128i val_vec128 = _mm_set1_epi16(val); - const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val)); - + v128u16 *dst_vec128 = (v128u16 *)dst; + const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16)); + + const v128u16 val_vec128 = _mm_set1_epi16(val); for (size_t i = 0; i < length_vec128; i++) _mm_stream_si128(dst_vec128 + i, val_vec128); } @@ -131,17 +171,18 @@ static void memset_u16(void *dst, const u16 val, const size_t elementCount) template static void memset_u16_fast(void *dst, const u16 val) { - __m128i *dst_vec128 = (__m128i *)dst; - const __m128i val_vec128 = _mm_set1_epi16(val); - MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); + v128u16 *dst_vec128 = (v128u16 *)dst; + + const v128u16 val_vec128 = _mm_set1_epi16(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), _mm_store_si128(dst_vec128 + (X), val_vec128)); } static void memset_u32(void *dst, const u32 val, const size_t elementCount) { - __m128i *dst_vec128 = (__m128i *)dst; - const __m128i val_vec128 = _mm_set1_epi32(val); - const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val)); - + v128u32 *dst_vec128 = (v128u32 *)dst; + const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32)); + + const v128u32 val_vec128 = _mm_set1_epi32(val); for (size_t i = 0; i < length_vec128; i++) _mm_stream_si128(dst_vec128 + i, val_vec128); } @@ -149,12 +190,53 @@ static void memset_u32(void *dst, const u32 val, const size_t elementCount) template static void memset_u32_fast(void *dst, const u32 val) { - __m128i *dst_vec128 = (__m128i *)dst; - const __m128i val_vec128 = _mm_set1_epi32(val); - MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128)); + v128u32 *dst_vec128 = (v128u32 *)dst; + + const v128u32 val_vec128 = _mm_set1_epi32(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), _mm_store_si128(dst_vec128 + (X), val_vec128)); } + +#elif defined(ENABLE_ALTIVEC) + +static void memset_u16(void *dst, const u16 val, const size_t elementCount) +{ + v128u16 *dst_vec128 = (v128u16 *)dst; + const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16)); + + const v128u16 val_vec128 = vec_splat_u16(val); + for (size_t i = 0; i < length_vec128; i++) + vec_st(val_vec128, 0, dst_vec128 + i); +} + +template +static void memset_u16_fast(void *dst, const u16 val) +{ + v128u16 *dst_vec128 = (v128u16 *)dst; + + const v128u16 val_vec128 = vec_splat_u16(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), vec_st(val_vec128, 0, dst_vec128 + (X))); +} + +static void memset_u32(void *dst, const u32 val, const size_t elementCount) +{ + v128u32 *dst_vec128 = (v128u32 *)dst; + const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32)); + + const v128u32 val_vec128 = vec_splat_u32(val); + for (size_t i = 0; i < length_vec128; i++) + vec_st(val_vec128, 0, dst_vec128 + i); +} + +template +static void memset_u32_fast(void *dst, const u32 val) +{ + v128u32 *dst_vec128 = (v128u32 *)dst; + + const v128u32 val_vec128 = vec_splat_u32(val); + MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X))); +} -#else //no sse2 +#else // No SIMD static void memset_u16(void *dst, const u16 val, const size_t elementCount) { @@ -212,7 +294,7 @@ static void memset_u32_fast(void *dst, const u32 val) #endif } -#endif // ENABLE_SSE2 +#endif // SIMD Functions // NOSSE version always used in gfx3d.cpp void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);