From 9fd684594b6e7f0fe18745200482dedb423c327a Mon Sep 17 00:00:00 2001
From: Wunkolo <Wunkolo@gmail.com>
Date: Fri, 9 Sep 2022 14:16:19 -0700
Subject: [PATCH] [x64] Add AVX512 optimization for
 `OPCODE_VECTOR_CONVERT_F2I`(unsigned)

`vcvttps2udq` already saturates overflowing and unordered values to `0xFFFFFFFF`. Using mask registers, zeroes are written to negative values within the same instruction.
---
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 09eb2b00e..820e6bf91 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
                I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
+      if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+        Opmask mask = e.k1;
+        // Mask positive values and unordered values
+        // _CMP_NLT_UQ
+        e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
+
+        // vcvttps2udq will saturate overflowing positive values and unordered
+        // values to UINT_MAX. Mask registers will write zero everywhere
+        // else (negative values)
+        e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
+        return;
+      }
+
       // clamp to min 0
       e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));