PPU: more interpreter refactoring (VPERM)

Notably, runtime-built SSSE3 version of VPERM.
Some other instructions are refactored and vectorized.
Aarch64 impl of multiple instructions including VPERM.
Nekotekina 2022-01-16 07:47:42 +03:00
parent 14170afe8a
commit b404d37eb0
2 changed files with 373 additions and 250 deletions
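
For context, VPERM builds each of the 16 result bytes by using the low 5 bits of the corresponding selector byte to index into the 32-byte concatenation of the two source vectors, with big-endian byte numbering. A minimal sketch of how that maps onto SSSE3 pshufb on a little-endian host is given below; it mirrors the classic fallback trick rather than the runtime-emitted code added by this commit, and the function name is purely illustrative.

// Illustrative sketch only, not the code generated by this commit.
// PPC byte numbering is big-endian, so selector s addresses byte (31 - s)
// of the little-endian a:b pair; (~c & 0x1f) performs that reversal.
#include <emmintrin.h>
#include <tmmintrin.h>

static __m128i vperm_ssse3_sketch(__m128i a, __m128i b, __m128i c)
{
	const __m128i index = _mm_andnot_si128(c, _mm_set1_epi8(0x1f)); // 31 - (c & 31)
	const __m128i mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(0xf)); // index >= 16: byte comes from a
	const __m128i sa = _mm_shuffle_epi8(a, index); // pshufb keys off the low 4 bits of each index
	const __m128i sb = _mm_shuffle_epi8(b, index);
	return _mm_or_si128(_mm_and_si128(mask, sa), _mm_andnot_si128(mask, sb));
}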

File diff suppressed because it is too large


@@ -662,6 +662,24 @@ inline auto gv_xorfs(A&& a, B&& b)
FOR_X64(binary_op, 4, kIdMovaps, kIdXorps, kIdVxorps, kIdVxorps, std::forward<A>(a), std::forward<B>(b));
}
inline v128 gv_not32(const v128& a)
{
#if defined(ARCH_X64)
return _mm_xor_si128(a, _mm_set1_epi32(-1));
#elif defined(ARCH_ARM64)
return vmvnq_u32(a);
#endif
}
inline v128 gv_notfs(const v128& a)
{
#if defined(ARCH_X64)
return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1)));
#elif defined(ARCH_ARM64)
return vmvnq_u32(a);
#endif
}
inline v128 gv_shl16(const v128& a, u32 count)
{
if (count >= 16)
@@ -1644,7 +1662,7 @@ inline v128 gv_mul32(const v128& a, const v128& b)
#elif defined(ARCH_X64)
const __m128i lows = _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8);
const __m128i highs = _mm_shuffle_epi32(_mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)), 8);
- return _mm_unpacklo_epi64(lows, highs);
+ return _mm_unpacklo_epi32(lows, highs);
#elif defined(ARCH_ARM64)
return vmulq_s32(a, b);
#endif
@@ -1813,6 +1831,53 @@ inline v128 gv_dots_s16x2(const v128& a, const v128& b, const v128& c)
#endif
}
// Multiply s16 elements 0, 2, 4, 6 to produce s32 results in corresponding lanes
inline v128 gv_mul_even_s16(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
const auto c = _mm_set1_epi32(0x0000ffff);
return _mm_madd_epi16(_mm_and_si128(a, c), _mm_and_si128(b, c));
#else
// TODO
return gv_mul32(gv_sar32(gv_shl32(a, 16), 16), gv_sar32(gv_shl32(b, 16), 16));
#endif
}
// Multiply u16 elements 0, 2, 4, 6 to produce u32 results in corresponding lanes
inline v128 gv_mul_even_u16(const v128& a, const v128& b)
{
#if defined(__SSE4_1__) || defined(ARCH_ARM64)
const auto c = gv_bcst32(0x0000ffff);
return gv_mul32(a & c, b & c);
#elif defined(ARCH_X64)
const auto ml = _mm_mullo_epi16(a, b);
const auto mh = _mm_mulhi_epu16(a, b);
return _mm_or_si128(_mm_and_si128(ml, _mm_set1_epi32(0x0000ffff)), _mm_slli_epi32(mh, 16));
#endif
}
// Multiply s16 elements 1, 3, 5, 7 to produce s32 results in corresponding lanes
inline v128 gv_mul_odds_s16(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
return _mm_madd_epi16(_mm_srli_epi32(a, 16), _mm_srli_epi32(b, 16));
#else
return gv_mul32(gv_sar32(a, 16), gv_sar32(b, 16));
#endif
}
// Multiply u16 elements 1, 3, 5, 7 to produce u32 results in corresponding lanes
inline v128 gv_mul_odds_u16(const v128& a, const v128& b)
{
#if defined(__SSE4_1__) || defined(ARCH_ARM64)
return gv_mul32(gv_shr32(a, 16), gv_shr32(b, 16));
#elif defined(ARCH_X64)
const auto ml = _mm_mullo_epi16(a, b);
const auto mh = _mm_mulhi_epu16(a, b);
return _mm_or_si128(_mm_and_si128(mh, _mm_set1_epi32(0xffff0000)), _mm_srli_epi32(ml, 16));
#endif
}
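
Read against the comments above, each even-lane helper widens halfword lanes 0, 2, 4, 6 and multiplies them into the four word lanes of the result; the odd-lane helpers do the same for lanes 1, 3, 5, 7. A scalar reference for the even variants (hypothetical test code, not part of the commit) could be:

// Hypothetical scalar references matching the stated semantics of
// gv_mul_even_s16 and gv_mul_even_u16: widen lanes 0, 2, 4, 6 and multiply.
#include <array>
#include <cstdint>

static std::array<int32_t, 4> mul_even_s16_ref(const std::array<int16_t, 8>& a, const std::array<int16_t, 8>& b)
{
	std::array<int32_t, 4> r{};
	for (int i = 0; i < 4; i++)
		r[i] = int32_t{a[i * 2]} * int32_t{b[i * 2]};
	return r;
}

static std::array<uint32_t, 4> mul_even_u16_ref(const std::array<uint16_t, 8>& a, const std::array<uint16_t, 8>& b)
{
	std::array<uint32_t, 4> r{};
	for (int i = 0; i < 4; i++)
		r[i] = uint32_t{a[i * 2]} * uint32_t{b[i * 2]};
	return r;
}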
inline v128 gv_cvts32_tofs(const v128& src)
{
#if defined(ARCH_X64)