[x64] Add AVX512 optimizations for `PERMUTE_V128`

Uses the AVX512 `vperm*` instructions to accelerate the `INT8_TYPE` and
`INT16_TYPE` permutation opcodes with a single instruction each.

The `INT8_TYPE` path is accelerated using the `AVX512VBMI` subset of AVX512,
available since Ice Lake (Intel) and Zen 4 (AMD).
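
For illustration only (not part of this diff), the intrinsic-level equivalent
of the emitted sequences looks roughly like the sketch below. The helper names
are invented for the example, the guest-endianness index fixups
(XMMSwapWordMask / `^ 0x1`) are omitted, and it assumes a compiler targeting
AVX512VL together with AVX512VBMI and AVX512BW:

#include <immintrin.h>

// Two-table byte permute (vpermi2b): each index byte picks from the 32-byte
// concatenation of lo (indices 0-15) and hi (indices 16-31).
static __m128i permute_bytes(__m128i idx, __m128i lo, __m128i hi) {
  return _mm_permutex2var_epi8(lo, idx, hi);
}

// Single-table byte permute (zero-masked vpermb), mirroring the path where
// the second table is all zeroes: lanes whose index is >= 16 are cleared.
static __m128i permute_bytes_hi_zero(__m128i idx, __m128i lo) {
  __mmask16 in_range = _mm_cmple_epu8_mask(idx, _mm_set1_epi8(15));
  return _mm_maskz_permutexvar_epi8(in_range, idx, lo);
}

// Two-table halfword permute (vpermi2w): each 16-bit index picks from the
// 16-halfword concatenation of lo (indices 0-7) and hi (indices 8-15).
static __m128i permute_halfwords(__m128i idx, __m128i lo, __m128i hi) {
  return _mm_permutex2var_epi16(lo, idx, hi);
}

Built with e.g. `-mavx512vl -mavx512vbmi -mavx512bw`, each helper compiles to
a single permute instruction, plus the mask compare in the zeroing case.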
Authored by Wunkolo on 2022-09-05 09:26:26 -07:00; committed by Rick Gibbed
parent f207239349
commit 5fde7c6aa5
1 changed file with 82 additions and 1 deletion

@@ -1804,7 +1804,23 @@ struct PERMUTE_V128
} else {
e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512VBMI)) {
Xmm table_lo = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}
Opmask zeroes = e.k1;
// _mm_cmple_epu8_mask
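// Set a mask bit for each index byte that is <= 15; the zero-masked vpermb
// below clears every lane whose index reaches past the 16-byte table.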
e.vpcmpub(zeroes, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15), 2);
e.vpermb(i.dest.reg() | zeroes | e.T_z, e.xmm0, table_lo);
return;
}
e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask));
if (i.src2.is_constant) {
e.LoadConstantXmm(i.dest, i.src2.constant());
e.vpshufb(i.dest, i.dest, e.xmm0);
@@ -1820,6 +1836,39 @@ struct PERMUTE_V128
// General permute.
// Control mask needs to be shuffled.
// TODO(benvanik): do constants here instead of in generated code.
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
kX64EmitAVX512VBMI)) {
Xmm table_idx = e.xmm0;
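// The indices arrive in guest byte order; XOR with XMMSwapWordMask converts
// them to positions in the host vec128 layout (same fixup as the vpshufb path).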
if (i.src1.is_constant) {
e.LoadConstantXmm(table_idx, i.src1.constant());
e.vxorps(table_idx, table_idx, e.GetXmmConstPtr(XMMSwapWordMask));
} else {
e.vxorps(table_idx, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}
Xmm table_lo = e.xmm1;
if (i.src2.value->IsConstantZero()) {
e.vpxor(table_lo, table_lo);
} else if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}
Xmm table_hi = e.xmm2;
if (i.src3.value->IsConstantZero()) {
e.vpxor(table_hi, table_hi);
} else if (i.src3.is_constant) {
e.LoadConstantXmm(table_hi, i.src3.constant());
} else {
table_hi = i.src3;
}
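// vpermi2b: each index byte selects from the 32-byte concatenation of
// table_lo (indices 0-15) and table_hi (indices 16-31), overwriting the
// index register with the result.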
e.vpermi2b(table_idx, table_lo, table_hi);
e.vmovdqu8(i.dest, table_idx);
return;
}
if (i.src1.is_constant) {
e.LoadConstantXmm(e.xmm2, i.src1.constant());
e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
@@ -1827,6 +1876,7 @@ struct PERMUTE_V128
e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
}
e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
Xmm src2_shuf = e.xmm0;
if (i.src2.value->IsConstantZero()) {
e.vpxor(src2_shuf, src2_shuf);
@@ -1853,8 +1903,39 @@ struct PERMUTE_V128
static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
// src1 is an array of indices corresponding to positions within src2 and
// src3.
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW)) {
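// Flip the low bit of each halfword index to map guest lane order onto the
// host layout (mirrors the `^ vec128s(0x1)` in the scalar fallback below).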
e.LoadConstantXmm(e.xmm1, vec128s(0x1));
Xmm table_idx = e.xmm0;
if (i.src1.is_constant) {
e.LoadConstantXmm(table_idx, i.src1.constant());
e.vpxord(table_idx, table_idx, e.xmm1);
} else {
e.vpxord(table_idx, i.src1, e.xmm1);
}
Xmm table_lo = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(table_lo, i.src2.constant());
} else {
table_lo = i.src2;
}
Xmm table_hi = e.xmm2;
if (i.src3.is_constant) {
e.LoadConstantXmm(table_hi, i.src3.constant());
} else {
table_hi = i.src3;
}
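// vpermi2w: each halfword index selects from the 16-halfword concatenation
// of table_lo (indices 0-7) and table_hi (indices 8-15), overwriting the
// index register with the result.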
e.vpermi2w(table_idx, table_lo, table_hi);
e.vmovdqu8(i.dest, table_idx);
return;
}
assert_true(i.src1.is_constant);
vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1);
vec128_t perm_ctrl = vec128b(0);
for (int i = 0; i < 8; i++) {