[x64] Add AVX512 optimization for `OPCODE_VECTOR_CONVERT_F2I` (unsigned)

`vcvttps2udq` already saturates overflowing and unordered (NaN) values to `0xFFFFFFFF`. With a zeroing mask register (`{k1}{z}`), lanes holding negative values are written as zero by the same instruction.
2022-09-09 14:16:19 -07:00 · 2022-09-09 14:16:19 -07:00 · 9fd684594b
parent 90fffe1de7
commit 9fd684594b
1 changed files with 13 additions and 0 deletions
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
               I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (i.instr->flags & ARITHMETIC_UNSIGNED) {
+      if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+        Opmask mask = e.k1;
+        // Mask positive values and unordered values
+        // _CMP_NLT_UQ
+        e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
+
+        // vcvttps2udq will saturate overflowing positive values and unordered
+        // values to UINT_MAX. Mask registers will write zero everywhere
+        // else (negative values)
+        e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
+        return;
+      }
+
      // clamp to min 0
      e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));