[x64] Add AVX512 optimization for `OPCODE_VECTOR_SUB` (saturated)
Passes the `vsubuws` and `vsubsws` unit tests from https://github.com/xenia-project/xenia/pull/1348
This commit is contained in:
parent
cdb40ddb28
commit
d73088e5ca
|
@ -679,6 +679,19 @@ struct VECTOR_SUB
|
||||||
// src1/src2.
|
// src1/src2.
|
||||||
e.vpsubd(e.xmm1, src1, src2);
|
e.vpsubd(e.xmm1, src1, src2);
|
||||||
|
|
||||||
|
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
|
||||||
|
// If the result is less than or equal to the first operand then
|
||||||
|
// we did not underflow
|
||||||
|
Opmask not_underflow = e.k1;
|
||||||
|
// _mm_cmple_epu32_mask
|
||||||
|
e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);
|
||||||
|
|
||||||
|
// Copy over values that did not underflow, write zero
|
||||||
|
// everywhere else
|
||||||
|
e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// If result is greater than either of the inputs, we've
|
// If result is greater than either of the inputs, we've
|
||||||
// underflowed (only need to check one input)
|
// underflowed (only need to check one input)
|
||||||
// if (res > src1) then underflowed
|
// if (res > src1) then underflowed
|
||||||
|
@ -690,6 +703,21 @@ struct VECTOR_SUB
|
||||||
} else {
|
} else {
|
||||||
e.vpsubd(e.xmm1, src1, src2);
|
e.vpsubd(e.xmm1, src1, src2);
|
||||||
|
|
||||||
|
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
|
||||||
|
kX64EmitAVX512DQ)) {
|
||||||
|
e.vmovdqa32(e.xmm3, src1);
|
||||||
|
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);
|
||||||
|
|
||||||
|
const Opmask saturate = e.k1;
|
||||||
|
e.vpmovd2m(saturate, e.xmm3);
|
||||||
|
|
||||||
|
e.vpsrad(e.xmm2, e.xmm1, 31);
|
||||||
|
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
|
||||||
|
|
||||||
|
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// We can only overflow if the signs of the operands are
|
// We can only overflow if the signs of the operands are
|
||||||
// opposite. If signs are opposite and result sign isn't the
|
// opposite. If signs are opposite and result sign isn't the
|
||||||
// same as src1's sign, we've overflowed. if ((s32b)((src1 ^
|
// same as src1's sign, we've overflowed. if ((s32b)((src1 ^
|
||||||
|
|
Loading…
Reference in New Issue