GPU: Add additional basic SIMD-accelerated functions for memset_u16(), memset_u16_fast(), memset_u32(), and memset_u32_fast() for AVX2 and Altivec.

This commit is contained in:
rogerman 2018-02-13 14:45:17 -08:00
parent 5fbaa53b46
commit c41a006b2a
1 changed file with 101 additions and 19 deletions

View File

@ -114,16 +114,56 @@ FORCEINLINE s32 s32floor(double d)
return s32floor((float)d);
}
//switched SSE2 functions
//-------------
#ifdef ENABLE_SSE2
// SIMD Functions
//-------------
#if defined(ENABLE_AVX2)
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
v256u16 *dst_vec256 = (v256u16 *)dst;
const size_t length_vec256 = elementCount / (sizeof(v256u16) / sizeof(u16));
const v256u16 val_vec256 = _mm256_set1_epi16(val);
for (size_t i = 0; i < length_vec256; i++)
_mm256_stream_si256(dst_vec256 + i, val_vec256);
}
// Compile-time-sized fill of ELEMENTCOUNT u16 elements using aligned 256-bit
// stores, unrolled via the project's MACRODO_N macro (X is the iteration index).
// NOTE(review): assumes dst is 32-byte aligned and ELEMENTCOUNT is a multiple
// of 16 (u16 lanes per v256u16) — any remainder is not written; confirm at call sites.
template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
v256u16 *dst_vec256 = (v256u16 *)dst;
const v256u16 val_vec256 = _mm256_set1_epi16(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v256u16) / sizeof(u16)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
}
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
v256u32 *dst_vec256 = (v256u32 *)dst;
const size_t length_vec256 = elementCount / (sizeof(v256u32) / sizeof(u32));
const v256u32 val_vec256 = _mm256_set1_epi32(val);
for (size_t i = 0; i < length_vec256; i++)
_mm256_stream_si256(dst_vec256 + i, val_vec256);
}
// Compile-time-sized fill of ELEMENTCOUNT u32 elements using aligned 256-bit
// stores, unrolled via the project's MACRODO_N macro (X is the iteration index).
// NOTE(review): assumes dst is 32-byte aligned and ELEMENTCOUNT is a multiple
// of 8 (u32 lanes per v256u32) — any remainder is not written; confirm at call sites.
template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
v256u32 *dst_vec256 = (v256u32 *)dst;
const v256u32 val_vec256 = _mm256_set1_epi32(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v256u32) / sizeof(u32)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
}
#elif defined(ENABLE_SSE2)
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi16(val);
const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val));
v128u16 *dst_vec128 = (v128u16 *)dst;
const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
const v128u16 val_vec128 = _mm_set1_epi16(val);
for (size_t i = 0; i < length_vec128; i++)
_mm_stream_si128(dst_vec128 + i, val_vec128);
}
@ -131,17 +171,18 @@ static void memset_u16(void *dst, const u16 val, const size_t elementCount)
template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi16(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
v128u16 *dst_vec128 = (v128u16 *)dst;
const v128u16 val_vec128 = _mm_set1_epi16(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi32(val);
const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val));
v128u32 *dst_vec128 = (v128u32 *)dst;
const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
const v128u32 val_vec128 = _mm_set1_epi32(val);
for (size_t i = 0; i < length_vec128; i++)
_mm_stream_si128(dst_vec128 + i, val_vec128);
}
@ -149,12 +190,53 @@ static void memset_u32(void *dst, const u32 val, const size_t elementCount)
template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
__m128i *dst_vec128 = (__m128i *)dst;
const __m128i val_vec128 = _mm_set1_epi32(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
v128u32 *dst_vec128 = (v128u32 *)dst;
const v128u32 val_vec128 = _mm_set1_epi32(val);
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}
#elif defined(ENABLE_ALTIVEC)
// Fill a buffer with a 16-bit value using AltiVec 128-bit stores.
// elementCount is a count of u16 elements; any tail smaller than one 16-byte
// vector is not written (callers presumably pass multiples of 8 — confirm).
// BUG FIX: vec_splat_u16() only accepts a 5-bit compile-time literal
// (-16..15), so passing the runtime 'val' does not compile / is invalid.
// Broadcast via an AltiVec vector literal instead.
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
	const v128u16 val_vec128 = {val, val, val, val, val, val, val, val};
	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
// Compile-time-sized fill of ELEMENTCOUNT u16 elements using aligned AltiVec
// stores, unrolled via the project's MACRODO_N macro (X is the iteration index).
// NOTE(review): vec_st truncates the address to 16-byte alignment — dst must
// be 16-byte aligned; ELEMENTCOUNT is presumably a multiple of 8 — confirm.
// BUG FIX: vec_splat_u16() requires a compile-time literal splat value, so it
// cannot broadcast the runtime 'val'; use a vector literal instead.
template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const v128u16 val_vec128 = {val, val, val, val, val, val, val, val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
// Fill a buffer with a 32-bit value using AltiVec 128-bit stores.
// elementCount is a count of u32 elements; any tail smaller than one 16-byte
// vector is not written (callers presumably pass multiples of 4 — confirm).
// BUG FIX: vec_splat_u32() only accepts a 5-bit compile-time literal
// (-16..15), so passing the runtime 'val' does not compile / is invalid.
// Broadcast via an AltiVec vector literal instead.
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
	const v128u32 val_vec128 = {val, val, val, val};
	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
// Compile-time-sized fill of ELEMENTCOUNT u32 elements using aligned AltiVec
// stores, unrolled via the project's MACRODO_N macro (X is the iteration index).
// NOTE(review): vec_st truncates the address to 16-byte alignment — dst must
// be 16-byte aligned; ELEMENTCOUNT is presumably a multiple of 4 — confirm.
// BUG FIX: vec_splat_u32() requires a compile-time literal splat value, so it
// cannot broadcast the runtime 'val'; use a vector literal instead.
template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const v128u32 val_vec128 = {val, val, val, val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
#else //no sse2
#else // No SIMD
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
@ -212,7 +294,7 @@ static void memset_u32_fast(void *dst, const u32 val)
#endif
}
#endif // ENABLE_SSE2
#endif // SIMD Functions
// NOSSE version always used in gfx3d.cpp
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);