From fd942a37f15399ba1688b4ef5801f77a9f8a320a Mon Sep 17 00:00:00 2001
From: Max-Tepafray <134654211+Max-Tepafray@users.noreply.github.com>
Date: Thu, 25 May 2023 17:17:26 -0500
Subject: [PATCH] [x64] Add AVX512 optimization for OPCODE_VECTOR_SUB(saturated)

---
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 77 +++++++++++++++++++--
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 15c7ed19e..c34c28745 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -2,7 +2,7 @@
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project                                 *
  ******************************************************************************
- * Copyright 2018 Xenia Developers. All rights reserved.                      *
+ * Copyright 2022 Xenia Developers. All rights reserved.                      *
  * Released under the BSD license - see LICENSE in the root for more details. *
  ******************************************************************************
  */
@@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
                I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
+      if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+        Opmask mask = e.k1;
+        // Mask positive values and unordered values
+        // _CMP_NLT_UQ
+        e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
+
+        // vcvttps2udq will saturate overflowing positive values and unordered
+        // values to UINT_MAX. Mask registers will write zero everywhere
+        // else (negative values)
+        e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
+        return;
+      }
+
       // clamp to min 0
       e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));
 
@@ -547,6 +560,15 @@ struct VECTOR_ADD
         case INT32_TYPE:
           if (saturate) {
             if (is_unsigned) {
+              if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+                e.vpaddd(dest, src1, src2);
+                Opmask saturate = e.k1;
+                // _mm_cmplt_epu32_mask
+                e.vpcmpud(saturate, dest, src1, 0x1);
+                e.vpternlogd(dest | saturate, dest, dest, 0xFF);
+                return;
+              }
+
               // xmm0 is the only temp register that can be used by
               // src1/src2.
               e.vpaddd(e.xmm1, src1, src2);
@@ -562,6 +584,20 @@ struct VECTOR_ADD
             } else {
               e.vpaddd(e.xmm1, src1, src2);
 
+              if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
+                                     kX64EmitAVX512DQ)) {
+                e.vmovdqa32(e.xmm3, src1);
+                e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00100100);
+
+                const Opmask saturate = e.k1;
+                e.vpmovd2m(saturate, e.xmm3);
+
+                e.vpsrad(e.xmm2, e.xmm1, 31);
+                e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
+                e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
+                return;
+              }
+
               // Overflow results if two inputs are the same sign and the
               // result isn't the same sign. if ((s32b)(~(src1 ^ src2) &
               // (src1 ^ res)) < 0) then overflowed
@@ -643,6 +679,19 @@ struct VECTOR_SUB
               // xmm0 is the only temp register that can be used by
               // src1/src2.
               e.vpsubd(e.xmm1, src1, src2);
+              if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+                // If the result is less or equal to the first operand then
+                // we did not underflow
+                Opmask not_underflow = e.k1;
+                // _mm_cmple_epu32_mask
+                e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);
+
+                // Copy over values that did not underflow, write zero
+                // everywhere else
+                e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
+                return;
+              }
+
               // If result is greater than either of the inputs, we've
               // underflowed (only need to check one input)
               // if (res > src1) then underflowed
@@ -654,6 +703,21 @@ struct VECTOR_SUB
             } else {
               e.vpsubd(e.xmm1, src1, src2);
 
+              if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
+                                     kX64EmitAVX512DQ)) {
+                e.vmovdqa32(e.xmm3, src1);
+                e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);
+
+                const Opmask saturate = e.k1;
+                e.vpmovd2m(saturate, e.xmm3);
+
+                e.vpsrad(e.xmm2, e.xmm1, 31);
+                e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
+
+                e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
+                return;
+              }
+
               // We can only overflow if the signs of the operands are
               // opposite. If signs are opposite and result sign isn't the
               // same as src1's sign, we've overflowed. if ((s32b)((src1 ^
@@ -1287,7 +1351,6 @@ static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
 
-// TODO(benvanik): AVX512 has a native variable rotate (rolv).
 struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
@@ -1318,7 +1381,9 @@ struct VECTOR_ROTATE_LEFT_V128
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+          e.vprolvd(i.dest, i.src1, i.src2);
+        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
          Xmm temp = i.dest;
          if (i.dest == i.src1 || i.dest == i.src2) {
            temp = e.xmm2;
@@ -1573,7 +1638,11 @@ EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32);
 struct SPLAT_I8 : Sequence<SPLAT_I8, I<OPCODE_SPLAT, V128Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.src1.is_constant) {
-      // TODO(benvanik): faster constant splats.
+      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        e.pxor(e.xmm0, e.xmm0);
+        e.gf2p8affineqb(i.dest, e.xmm0, i.src1.constant());
+        return;
+      }
       e.mov(e.eax, i.src1.constant());
       e.vmovd(e.xmm0, e.eax);
     } else {
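
Note (reviewer aid, not part of the patch): the sketch below is a minimal, standalone C++ illustration of the unsigned saturating OPCODE_VECTOR_SUB path added above (vpsubd, then vpcmpud into a mask register, then a zero-masked vmovdqa32), written with the equivalent AVX-512 intrinsics that the patch comments reference. The function name, test values, and compiler flags are assumptions for illustration only and do not come from the Xenia code base.

// Mirrors the unsigned saturated VECTOR_SUB sequence emitted above.
// Requires AVX512F + AVX512VL (roughly what kX64EmitAVX512Ortho gates);
// build with e.g. -mavx512f -mavx512vl.
#include <immintrin.h>

#include <cstdint>
#include <cstdio>

static __m128i saturated_sub_epu32(__m128i src1, __m128i src2) {
  // vpsubd: plain wrapping subtract.
  __m128i diff = _mm_sub_epi32(src1, src2);
  // vpcmpud with immediate 0x2 (_mm_cmple_epu32_mask): a lane did not
  // underflow iff (src1 - src2) <= src1 when compared as unsigned.
  __mmask8 not_underflow = _mm_cmple_epu32_mask(diff, src1);
  // vmovdqa32 dest{k}{z}: keep lanes that did not underflow, write zero
  // everywhere else. Zero is exactly the unsigned saturation value.
  return _mm_maskz_mov_epi32(not_underflow, diff);
}

int main() {
  alignas(16) uint32_t a[4] = {10u, 0u, 0x80000000u, 5u};
  alignas(16) uint32_t b[4] = {3u, 1u, 1u, 5u};
  alignas(16) uint32_t out[4];
  __m128i r =
      saturated_sub_epu32(_mm_load_si128(reinterpret_cast<const __m128i*>(a)),
                          _mm_load_si128(reinterpret_cast<const __m128i*>(b)));
  _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
  std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
  // Expected: 7 0 2147483647 0
  return 0;
}

The signed paths in the patch cannot simply zero the overflowed lanes; they instead build INT_MIN/INT_MAX per lane (vpsrad by 31, then vpxord with the sign mask) and select it with vpblendmd under the overflow mask computed via vpternlogd + vpmovd2m.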