[x64] Add AVX512 optimizations for `OPCODE_VECTOR_COMPARE_UGT`(Integer)
AVX512 has native unsigned integer comparison instructions, removing the need to XOR the most-significant-bit with a constant in memory to use the signed comparison instructions. These instructions only write to a k-mask register though and need an additional call to `vpmovm2*` to turn the mask-register into a vector-mask register. As of Icelake: `vpcmpu*` is all L3/T1, `vpmovm2d` is L1/T0.33, and `vpmovm2{b,w}` is L3/T0.33. As of Zen4: `vpcmpu*` is all L3/T0.50 and `vpmovm2*` is all L1/T0.25.
This commit is contained in:
parent
121bf93cbe
commit
6ee2e3718f
|
@ -409,6 +409,43 @@ struct VECTOR_COMPARE_UGT_V128
|
|||
: Sequence<VECTOR_COMPARE_UGT_V128,
|
||||
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
|
||||
kX64EmitAVX512DQ) &&
|
||||
(i.instr->flags != FLOAT32_TYPE)) {
|
||||
Xmm src1 = e.xmm0;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantXmm(src1, i.src1.constant());
|
||||
} else {
|
||||
src1 = i.src1;
|
||||
}
|
||||
|
||||
Xmm src2 = e.xmm1;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantXmm(src2, i.src2.constant());
|
||||
} else {
|
||||
src2 = i.src2;
|
||||
}
|
||||
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
e.vpcmpub(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2b(i.dest, e.k1);
|
||||
break;
|
||||
case INT16_TYPE:
|
||||
e.vpcmpuw(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2w(i.dest, e.k1);
|
||||
break;
|
||||
case INT32_TYPE:
|
||||
e.vpcmpud(e.k1, src1, src2, 0x6);
|
||||
e.vpmovm2d(i.dest, e.k1);
|
||||
break;
|
||||
default:
|
||||
assert_always();
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
|
||||
switch (i.instr->flags) {
|
||||
case INT8_TYPE:
|
||||
|
|
Loading…
Reference in New Issue