[x64] Add AVX512 optimizations for `PERMUTE_V128`
Uses the single-instruction AVX512 `vperm*` instructions to accelerate the `INT8_TYPE` and `INT16_TYPE` permutation opcodes. The `INT8_TYPE` path is accelerated using the `AVX512VBMI` subset of AVX512, which has been available since Ice Lake (Intel) and Zen 4 (AMD).
This commit is contained in:
parent
f207239349
commit
5fde7c6aa5
|
@ -1804,7 +1804,23 @@ struct PERMUTE_V128
|
|||
} else {
|
||||
e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
|
||||
}
|
||||
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512VBMI)) {
|
||||
Xmm table_lo = e.xmm1;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(table_lo, i.src2.constant());
|
||||
} else {
|
||||
table_lo = i.src2;
|
||||
}
|
||||
Opmask zeroes = e.k1;
|
||||
// _mm_cmple_epu8_mask
|
||||
e.vpcmpub(zeroes, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15), 2);
|
||||
e.vpermb(i.dest.reg() | zeroes | e.T_z, e.xmm0, table_lo);
|
||||
return;
|
||||
}
|
||||
|
||||
e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask));
|
||||
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(i.dest, i.src2.constant());
|
||||
e.vpshufb(i.dest, i.dest, e.xmm0);
|
||||
|
@ -1820,6 +1836,39 @@ struct PERMUTE_V128
|
|||
// General permute.
|
||||
// Control mask needs to be shuffled.
|
||||
// TODO(benvanik): do constants here instead of in generated code.
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
|
||||
kX64EmitAVX512VBMI)) {
|
||||
Xmm table_idx = e.xmm0;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(table_idx, i.src1.constant());
|
||||
e.vxorps(table_idx, table_idx, e.GetXmmConstPtr(XMMSwapWordMask));
|
||||
} else {
|
||||
e.vxorps(table_idx, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
|
||||
}
|
||||
|
||||
Xmm table_lo = e.xmm1;
|
||||
if (i.src2.value->IsConstantZero()) {
|
||||
e.vpxor(table_lo, table_lo);
|
||||
} else if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(table_lo, i.src2.constant());
|
||||
} else {
|
||||
table_lo = i.src2;
|
||||
}
|
||||
|
||||
Xmm table_hi = e.xmm2;
|
||||
if (i.src3.value->IsConstantZero()) {
|
||||
e.vpxor(table_hi, table_hi);
|
||||
} else if (i.src3.is_constant) {
|
||||
e.LoadConstantXmm(table_hi, i.src3.constant());
|
||||
} else {
|
||||
table_hi = i.src3;
|
||||
}
|
||||
|
||||
e.vpermi2b(table_idx, table_lo, table_hi);
|
||||
e.vmovdqu8(i.dest, table_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(e.xmm2, i.src1.constant());
|
||||
e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
|
||||
|
@ -1827,6 +1876,7 @@ struct PERMUTE_V128
|
|||
e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
|
||||
}
|
||||
e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
|
||||
|
||||
Xmm src2_shuf = e.xmm0;
|
||||
if (i.src2.value->IsConstantZero()) {
|
||||
e.vpxor(src2_shuf, src2_shuf);
|
||||
|
@ -1853,8 +1903,39 @@ struct PERMUTE_V128
|
|||
|
||||
static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
|
||||
// src1 is an array of indices corresponding to positions within src2 and
|
||||
// src3.
|
||||
// src3
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW)) {
|
||||
e.LoadConstantXmm(e.xmm1, vec128s(0x1));
|
||||
|
||||
Xmm table_idx = e.xmm0;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(table_idx, i.src1.constant());
|
||||
e.vpxord(table_idx, table_idx, e.xmm1);
|
||||
} else {
|
||||
e.vpxord(table_idx, i.src1, e.xmm1);
|
||||
}
|
||||
|
||||
Xmm table_lo = e.xmm1;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(table_lo, i.src2.constant());
|
||||
} else {
|
||||
table_lo = i.src2;
|
||||
}
|
||||
|
||||
Xmm table_hi = e.xmm2;
|
||||
if (i.src3.is_constant) {
|
||||
e.LoadConstantXmm(table_hi, i.src3.constant());
|
||||
} else {
|
||||
table_hi = i.src3;
|
||||
}
|
||||
|
||||
e.vpermi2w(table_idx, table_lo, table_hi);
|
||||
e.vmovdqu8(i.dest, table_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
assert_true(i.src1.is_constant);
|
||||
|
||||
vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1);
|
||||
vec128_t perm_ctrl = vec128b(0);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
|
|
Loading…
Reference in New Issue