From 5fde7c6aa5a2cc3489a1a4069ac48dd4c0fcab57 Mon Sep 17 00:00:00 2001
From: Wunkolo
Date: Mon, 5 Sep 2022 09:26:26 -0700
Subject: [PATCH] [x64] Add AVX512 optimizations for `PERMUTE_V128`

Uses the single-instruction AVX512 `vperm*` permutes to accelerate the
`INT8_TYPE` and `INT16_TYPE` permutation opcodes.

The `INT8_TYPE` path uses the `AVX512VBMI` subset of AVX512, available
since Ice Lake (Intel) and Zen 4 (AMD).
---
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 83 ++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index c34c28745..75f162559 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -1804,7 +1804,23 @@ struct PERMUTE_V128
         } else {
           e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
         }
+
+        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512VBMI)) {
+          Xmm table_lo = e.xmm1;
+          if (i.src2.is_constant) {
+            e.LoadConstantXmm(table_lo, i.src2.constant());
+          } else {
+            table_lo = i.src2;
+          }
+          Opmask zeroes = e.k1;
+          // _mm_cmple_epu8_mask
+          e.vpcmpub(zeroes, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15), 2);
+          e.vpermb(i.dest.reg() | zeroes | e.T_z, e.xmm0, table_lo);
+          return;
+        }
+
         e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask));
+
         if (i.src2.is_constant) {
           e.LoadConstantXmm(i.dest, i.src2.constant());
           e.vpshufb(i.dest, i.dest, e.xmm0);
@@ -1820,6 +1836,39 @@ struct PERMUTE_V128
       // General permute.
       // Control mask needs to be shuffled.
       // TODO(benvanik): do constants here instead of in generated code.
+      if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
+                             kX64EmitAVX512VBMI)) {
+        Xmm table_idx = e.xmm0;
+        if (i.src1.is_constant) {
+          e.LoadConstantXmm(table_idx, i.src1.constant());
+          e.vxorps(table_idx, table_idx, e.GetXmmConstPtr(XMMSwapWordMask));
+        } else {
+          e.vxorps(table_idx, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
+        }
+
+        Xmm table_lo = e.xmm1;
+        if (i.src2.value->IsConstantZero()) {
+          e.vpxor(table_lo, table_lo);
+        } else if (i.src2.is_constant) {
+          e.LoadConstantXmm(table_lo, i.src2.constant());
+        } else {
+          table_lo = i.src2;
+        }
+
+        Xmm table_hi = e.xmm2;
+        if (i.src3.value->IsConstantZero()) {
+          e.vpxor(table_hi, table_hi);
+        } else if (i.src3.is_constant) {
+          e.LoadConstantXmm(table_hi, i.src3.constant());
+        } else {
+          table_hi = i.src3;
+        }
+
+        e.vpermi2b(table_idx, table_lo, table_hi);
+        e.vmovdqu8(i.dest, table_idx);
+        return;
+      }
+
       if (i.src1.is_constant) {
         e.LoadConstantXmm(e.xmm2, i.src1.constant());
         e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
@@ -1827,6 +1876,7 @@ struct PERMUTE_V128
         e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
       }
       e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
+
       Xmm src2_shuf = e.xmm0;
       if (i.src2.value->IsConstantZero()) {
         e.vpxor(src2_shuf, src2_shuf);
@@ -1853,8 +1903,39 @@ struct PERMUTE_V128
 
   static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
     // src1 is an array of indices corresponding to positions within src2 and
-    // src3.
+    // src3
+    if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW)) {
+      e.LoadConstantXmm(e.xmm1, vec128s(0x1));
+
+      Xmm table_idx = e.xmm0;
+      if (i.src1.is_constant) {
+        e.LoadConstantXmm(table_idx, i.src1.constant());
+        e.vpxord(table_idx, table_idx, e.xmm1);
+      } else {
+        e.vpxord(table_idx, i.src1, e.xmm1);
+      }
+
+      Xmm table_lo = e.xmm1;
+      if (i.src2.is_constant) {
+        e.LoadConstantXmm(table_lo, i.src2.constant());
+      } else {
+        table_lo = i.src2;
+      }
+
+      Xmm table_hi = e.xmm2;
+      if (i.src3.is_constant) {
+        e.LoadConstantXmm(table_hi, i.src3.constant());
+      } else {
+        table_hi = i.src3;
+      }
+
+      e.vpermi2w(table_idx, table_lo, table_hi);
+      e.vmovdqu8(i.dest, table_idx);
+      return;
+    }
+
     assert_true(i.src1.is_constant);
+
     vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1);
     vec128_t perm_ctrl = vec128b(0);
     for (int i = 0; i < 8; i++) {
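
For reference, the intrinsics-level shape of the emitted fast paths is roughly the sketch below. This is a minimal illustration assuming an AVX512VL + AVX512BW + AVX512VBMI target (Ice Lake / Zen 4 or later, e.g. built with `-mavx512vl -mavx512bw -mavx512vbmi`); the helper names are illustrative and not part of the emitter, and the big-endian index fix-up the JIT performs with `XMMSwapWordMask` / `vec128s(0x1)` is omitted.

#include <immintrin.h>

// General two-table byte permute, as emitted via vpermi2b: bits [3:0] of
// each index byte select a byte, bit [4] selects table_lo (0) or table_hi (1).
static inline __m128i PermuteTwoTables8(__m128i indices, __m128i table_lo,
                                        __m128i table_hi) {
  return _mm_permutex2var_epi8(table_lo, indices, table_hi);
}

// Single-table special case used when src3 is constant zero: permute from
// table_lo and zero every lane whose index points past byte 15. The opmask
// from the unsigned compare plays the role of e.k1 ("zeroes"), and the
// zero-masked vpermb replaces the older vpshufb + vpcmpgtb + vpandn sequence.
static inline __m128i PermuteOrZero8(__m128i indices, __m128i table_lo) {
  const __mmask16 in_range = _mm_cmple_epu8_mask(indices, _mm_set1_epi8(15));
  return _mm_maskz_permutexvar_epi8(in_range, indices, table_lo);
}

// 16-bit variant emitted for INT16_TYPE via vpermi2w: bits [2:0] of each
// index word select a word, bit [3] selects the table.
static inline __m128i PermuteTwoTables16(__m128i indices, __m128i table_lo,
                                         __m128i table_hi) {
  return _mm_permutex2var_epi16(table_lo, indices, table_hi);
}

On hardware without these subsets, the emitter falls through to the existing `vpshufb`-based sequences, which is why each new block is guarded by an `IsFeatureEnabled` check and returns early.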