[x64] Add AVX512 optimizations for `OPCODE_VECTOR_COMPARE_UGT`(Integer)

AVX512 has native unsigned integer comparison instructions, removing
the need to XOR the most-significant bit with a constant in memory in
order to use the signed comparison instructions. These instructions only
write to a k-mask register, though, so an additional `vpmovm2*`
instruction is needed to expand the k-mask into a vector mask.

As of Icelake (L = latency in cycles, T = reciprocal throughput):
`vpcmpu*` are all L3/T1
`vpmovm2d` is L1/T0.33
`vpmovm2{b,w}` are L3/T0.33

As of Zen4:
`vpcmpu*` are all L3/T0.50
`vpmovm2*` are all L1/T0.25
This commit is contained in:
Wunkolo 2023-02-05 17:55:09 -08:00 committed by Rick Gibbed
parent 121bf93cbe
commit 6ee2e3718f
1 changed files with 37 additions and 0 deletions

View File

@ -409,6 +409,43 @@ struct VECTOR_COMPARE_UGT_V128
: Sequence<VECTOR_COMPARE_UGT_V128,
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
kX64EmitAVX512DQ) &&
(i.instr->flags != FLOAT32_TYPE)) {
Xmm src1 = e.xmm0;
if (i.src1.is_constant) {
e.LoadConstantXmm(src1, i.src1.constant());
} else {
src1 = i.src1;
}
Xmm src2 = e.xmm1;
if (i.src2.is_constant) {
e.LoadConstantXmm(src2, i.src2.constant());
} else {
src2 = i.src2;
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpub(e.k1, src1, src2, 0x6);
e.vpmovm2b(i.dest, e.k1);
break;
case INT16_TYPE:
e.vpcmpuw(e.k1, src1, src2, 0x6);
e.vpmovm2w(i.dest, e.k1);
break;
case INT32_TYPE:
e.vpcmpud(e.k1, src1, src2, 0x6);
e.vpmovm2d(i.dest, e.k1);
break;
default:
assert_always();
break;
}
return;
}
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE: