[x64] Add AVX512 optimization for `OPCODE_VECTOR_CONVERT_F2I`(unsigned)

`vcvttps2udq` already saturates overflowing and unordered (NaN) values to `0xFFFFFFFF`. By applying a zeroing mask register to that same instruction, lanes holding negative values are written as zero, so no separate clamp is needed on this path.
This commit is contained in:
Wunkolo 2022-09-09 14:16:19 -07:00 committed by Rick Gibbed
parent 90fffe1de7
commit 9fd684594b
1 changed file with 13 additions and 0 deletions

View File

@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
Opmask mask = e.k1;
// Mask positive values and unordered values
// _CMP_NLT_UQ
e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
// vcvttps2udq will saturate overflowing positive values and unordered
// values to UINT_MAX. Mask registers will write zero everywhere
// else (negative values)
e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
return;
}
// clamp to min 0
e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));