GPU: Add basic SIMD-accelerated memset_u16(), memset_u16_fast(), memset_u32(), and memset_u32_fast() implementations for AVX2 and AltiVec.
This commit is contained in:
parent
5fbaa53b46
commit
c41a006b2a
|
@ -114,16 +114,56 @@ FORCEINLINE s32 s32floor(double d)
|
||||||
return s32floor((float)d);
|
return s32floor((float)d);
|
||||||
}
|
}
|
||||||
|
|
||||||
//switched SSE2 functions
|
// SIMD Functions
|
||||||
//-------------
|
//-------------
|
||||||
#ifdef ENABLE_SSE2
|
#if defined(ENABLE_AVX2)
|
||||||
|
|
||||||
|
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||||
|
{
|
||||||
|
v256u16 *dst_vec256 = (v256u16 *)dst;
|
||||||
|
const size_t length_vec256 = elementCount / (sizeof(v256u16) / sizeof(u16));
|
||||||
|
|
||||||
|
const v256u16 val_vec256 = _mm256_set1_epi16(val);
|
||||||
|
for (size_t i = 0; i < length_vec256; i++)
|
||||||
|
_mm256_stream_si256(dst_vec256 + i, val_vec256);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t ELEMENTCOUNT>
|
||||||
|
static void memset_u16_fast(void *dst, const u16 val)
|
||||||
|
{
|
||||||
|
v256u16 *dst_vec256 = (v256u16 *)dst;
|
||||||
|
|
||||||
|
const v256u16 val_vec256 = _mm256_set1_epi16(val);
|
||||||
|
MACRODO_N(ELEMENTCOUNT / (sizeof(v256u16) / sizeof(u16)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
|
||||||
|
{
|
||||||
|
v256u32 *dst_vec256 = (v256u32 *)dst;
|
||||||
|
const size_t length_vec256 = elementCount / (sizeof(v256u32) / sizeof(u32));
|
||||||
|
|
||||||
|
const v256u32 val_vec256 = _mm256_set1_epi32(val);
|
||||||
|
for (size_t i = 0; i < length_vec256; i++)
|
||||||
|
_mm256_stream_si256(dst_vec256 + i, val_vec256);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t ELEMENTCOUNT>
|
||||||
|
static void memset_u32_fast(void *dst, const u32 val)
|
||||||
|
{
|
||||||
|
v256u32 *dst_vec256 = (v256u32 *)dst;
|
||||||
|
|
||||||
|
const v256u32 val_vec256 = _mm256_set1_epi32(val);
|
||||||
|
MACRODO_N(ELEMENTCOUNT / (sizeof(v256u32) / sizeof(u32)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(ENABLE_SSE2)
|
||||||
|
|
||||||
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||||
{
|
{
|
||||||
__m128i *dst_vec128 = (__m128i *)dst;
|
v128u16 *dst_vec128 = (v128u16 *)dst;
|
||||||
const __m128i val_vec128 = _mm_set1_epi16(val);
|
const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
|
||||||
const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val));
|
|
||||||
|
const v128u16 val_vec128 = _mm_set1_epi16(val);
|
||||||
for (size_t i = 0; i < length_vec128; i++)
|
for (size_t i = 0; i < length_vec128; i++)
|
||||||
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
||||||
}
|
}
|
||||||
|
@ -131,17 +171,18 @@ static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||||
template <size_t ELEMENTCOUNT>
|
template <size_t ELEMENTCOUNT>
|
||||||
static void memset_u16_fast(void *dst, const u16 val)
|
static void memset_u16_fast(void *dst, const u16 val)
|
||||||
{
|
{
|
||||||
__m128i *dst_vec128 = (__m128i *)dst;
|
v128u16 *dst_vec128 = (v128u16 *)dst;
|
||||||
const __m128i val_vec128 = _mm_set1_epi16(val);
|
|
||||||
MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
const v128u16 val_vec128 = _mm_set1_epi16(val);
|
||||||
|
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
|
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
|
||||||
{
|
{
|
||||||
__m128i *dst_vec128 = (__m128i *)dst;
|
v128u32 *dst_vec128 = (v128u32 *)dst;
|
||||||
const __m128i val_vec128 = _mm_set1_epi32(val);
|
const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
|
||||||
const size_t length_vec128 = elementCount / (sizeof(val_vec128) / sizeof(val));
|
|
||||||
|
const v128u32 val_vec128 = _mm_set1_epi32(val);
|
||||||
for (size_t i = 0; i < length_vec128; i++)
|
for (size_t i = 0; i < length_vec128; i++)
|
||||||
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
_mm_stream_si128(dst_vec128 + i, val_vec128);
|
||||||
}
|
}
|
||||||
|
@ -149,12 +190,53 @@ static void memset_u32(void *dst, const u32 val, const size_t elementCount)
|
||||||
template <size_t ELEMENTCOUNT>
|
template <size_t ELEMENTCOUNT>
|
||||||
static void memset_u32_fast(void *dst, const u32 val)
|
static void memset_u32_fast(void *dst, const u32 val)
|
||||||
{
|
{
|
||||||
__m128i *dst_vec128 = (__m128i *)dst;
|
v128u32 *dst_vec128 = (v128u32 *)dst;
|
||||||
const __m128i val_vec128 = _mm_set1_epi32(val);
|
|
||||||
MACRODO_N(ELEMENTCOUNT / (sizeof(val_vec128) / sizeof(val)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
const v128u32 val_vec128 = _mm_set1_epi32(val);
|
||||||
|
MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), _mm_store_si128(dst_vec128 + (X), val_vec128));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined(ENABLE_ALTIVEC)
|
||||||
|
|
||||||
|
// Fill a buffer with a repeated 16-bit value using AltiVec stores.
// NOTE(review): assumes dst is 16-byte aligned and elementCount is a multiple
// of 8 (u16 lanes per 128-bit vector) — any remainder elements are not written.
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
	
	// Fix: vec_splat_u16() accepts only a 5-bit literal constant (-16..15),
	// so a runtime value must be broadcast with a vector initializer instead.
	const v128u16 val_vec128 = {val, val, val, val, val, val, val, val};
	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
|
||||||
|
|
||||||
|
// Fill a fixed-size buffer with a repeated 16-bit value using AltiVec stores,
// fully unrolled at compile time via MACRODO_N.
// NOTE(review): assumes dst is 16-byte aligned and ELEMENTCOUNT is a multiple
// of 8 — any remainder elements are not written.
template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	
	// Fix: vec_splat_u16() accepts only a 5-bit literal constant (-16..15),
	// so a runtime value must be broadcast with a vector initializer instead.
	const v128u16 val_vec128 = {val, val, val, val, val, val, val, val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
|
||||||
|
|
||||||
|
// Fill a buffer with a repeated 32-bit value using AltiVec stores.
// NOTE(review): assumes dst is 16-byte aligned and elementCount is a multiple
// of 4 (u32 lanes per 128-bit vector) — any remainder elements are not written.
static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
	
	// Fix: vec_splat_u32() accepts only a 5-bit literal constant (-16..15),
	// so a runtime value must be broadcast with a vector initializer instead.
	const v128u32 val_vec128 = {val, val, val, val};
	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
|
||||||
|
|
||||||
|
// Fill a fixed-size buffer with a repeated 32-bit value using AltiVec stores,
// fully unrolled at compile time via MACRODO_N.
// NOTE(review): assumes dst is 16-byte aligned and ELEMENTCOUNT is a multiple
// of 4 — any remainder elements are not written.
template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	
	// Fix: vec_splat_u32() accepts only a 5-bit literal constant (-16..15),
	// so a runtime value must be broadcast with a vector initializer instead.
	const v128u32 val_vec128 = {val, val, val, val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
|
||||||
|
|
||||||
#else //no sse2
|
#else // No SIMD
|
||||||
|
|
||||||
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
static void memset_u16(void *dst, const u16 val, const size_t elementCount)
|
||||||
{
|
{
|
||||||
|
@ -212,7 +294,7 @@ static void memset_u32_fast(void *dst, const u32 val)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // ENABLE_SSE2
|
#endif // SIMD Functions
|
||||||
|
|
||||||
// NOSSE version always used in gfx3d.cpp
|
// NOSSE version always used in gfx3d.cpp
|
||||||
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
|
void _NOSSE_MatrixMultVec4x4 (const float *matrix, float *vecPtr);
|
||||||
|
|
Loading…
Reference in New Issue