[x64] Add AVX512 optimization for `OPCODE_VECTOR_ADD` (saturated)

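For signed saturation, uses a single `vpternlogd` to test for
overflow/underflow, then utilizes AVX512 mask operations to select either
`0x7FFFFFFF` or `0x80000000`, which is created arithmetically.
For unsigned saturation, `vpcmpud` detects overflow and a masked
`vpternlogd` sets the overflowed lanes to all ones.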
This commit is contained in:
Wunkolo 2022-09-09 15:59:16 -07:00 committed by Rick Gibbed
parent 9fd684594b
commit addd8c94e5
1 changed file with 23 additions and 0 deletions

View File

@ -560,6 +560,15 @@ struct VECTOR_ADD
case INT32_TYPE:
if (saturate) {
if (is_unsigned) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vpaddd(dest, src1, src2);
Opmask saturate = e.k1;
// _mm_cmplt_epu32_mask
e.vpcmpud(saturate, dest, src1, 0x1);
e.vpternlogd(dest | saturate, dest, dest, 0xFF);
return;
}
// xmm0 is the only temp register that can be used by
// src1/src2.
e.vpaddd(e.xmm1, src1, src2);
@ -575,6 +584,20 @@ struct VECTOR_ADD
} else {
e.vpaddd(e.xmm1, src1, src2);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
kX64EmitAVX512DQ)) {
e.vmovdqa32(e.xmm3, src1);
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00100100);
const Opmask saturate = e.k1;
e.vpmovd2m(saturate, e.xmm3);
e.vpsrad(e.xmm2, e.xmm1, 31);
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
return;
}
// Overflow results if two inputs are the same sign and the
// result isn't the same sign. if ((s32b)(~(src1 ^ src2) &
// (src1 ^ res)) < 0) then overflowed