From 9fd684594b6e7f0fe18745200482dedb423c327a Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Fri, 9 Sep 2022 14:16:19 -0700 Subject: [PATCH] [x64] Add AVX512 optimization for `OPCODE_VECTOR_CONVERT_F2I`(unsigned) `vcvttps2udq` already saturates overflowing and unordered values to `0xFFFFFFFF`. Using mask registers, zeroes are written to negative values within the same instruction. --- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 09eb2b00e..820e6bf91 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { + Opmask mask = e.k1; + // Mask positive values and unordered values + // _CMP_NLT_UQ + e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15); + + // vcvttps2udq will saturate overflowing positive values and unordered + // values to UINT_MAX. Mask registers will write zero everywhere + // else (negative values) + e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1); + return; + } + // clamp to min 0 e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));