[x64] Add AVX512 optimization for OPCODE_VECTOR_SUB(saturated)

Max-Tepafray 2023-05-25 17:17:26 -05:00 committed by GitHub
parent 2db8d01a13
commit fd942a37f1
1 changed file with 73 additions and 4 deletions


@@ -2,7 +2,7 @@
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
- * Copyright 2018 Xenia Developers. All rights reserved. *
* Copyright 2022 Xenia Developers. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
@@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
Opmask mask = e.k1;
// Mask positive values and unordered values
// _CMP_NLT_UQ
e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
// vcvttps2udq will saturate overflowing positive values and unordered
// values to UINT_MAX. Mask registers will write zero everywhere
// else (negative values)
e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
return;
}
// clamp to min 0
e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));
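For reference, the new AVX512 path above corresponds roughly to the VL intrinsics below. This is a minimal sketch, not code from the commit; it assumes <immintrin.h>, AVX512F+AVX512VL support, and that XMMZero is the all-zero constant, and the helper name is made up.

#include <immintrin.h>

// Unsigned float -> uint32 conversion with saturation, one 128-bit vector at a time.
static __m128i cvt_f32_to_u32_sat(__m128 x) {
  // Keep lanes that are >= 0 or NaN (_CMP_NLT_UQ == 0x15); vcvttps2udq already
  // saturates too-large and NaN inputs to UINT32_MAX.
  __mmask8 keep = _mm_cmp_ps_mask(x, _mm_setzero_ps(), _CMP_NLT_UQ);
  // Zero-masking writes 0 into the remaining (negative) lanes.
  return _mm_maskz_cvttps_epu32(keep, x);
}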
@@ -547,6 +560,15 @@ struct VECTOR_ADD
case INT32_TYPE:
if (saturate) {
if (is_unsigned) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vpaddd(dest, src1, src2);
Opmask saturate = e.k1;
// _mm_cmplt_epu32_mask
e.vpcmpud(saturate, dest, src1, 0x1);
e.vpternlogd(dest | saturate, dest, dest, 0xFF);
return;
}
// xmm0 is the only temp register that can be used by
// src1/src2.
e.vpaddd(e.xmm1, src1, src2);
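For reference, a rough intrinsics equivalent of the unsigned saturating add above (a sketch, not commit code; assumes <immintrin.h> and AVX512F+AVX512VL, helper name invented):

#include <immintrin.h>

static __m128i add_u32_sat(__m128i a, __m128i b) {
  __m128i sum = _mm_add_epi32(a, b);
  // If the wrapped sum is smaller than an input, that lane overflowed.
  __mmask8 overflowed = _mm_cmplt_epu32_mask(sum, a);
  // vpternlogd with immediate 0xFF writes all ones (UINT32_MAX) into those lanes.
  return _mm_mask_ternarylogic_epi32(sum, overflowed, sum, sum, 0xFF);
}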
@@ -562,6 +584,20 @@ struct VECTOR_ADD
} else {
e.vpaddd(e.xmm1, src1, src2);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
kX64EmitAVX512DQ)) {
e.vmovdqa32(e.xmm3, src1);
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00100100);
const Opmask saturate = e.k1;
e.vpmovd2m(saturate, e.xmm3);
e.vpsrad(e.xmm2, e.xmm1, 31);
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
return;
}
// Overflow results if two inputs are the same sign and the
// result isn't the same sign. if ((s32b)(~(src1 ^ src2) &
// (src1 ^ res)) < 0) then overflowed
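For reference, a sketch of what the signed saturating add path above computes, using AVX512VL+DQ intrinsics (not commit code; assumes XMMSignMaskI32 holds 0x80000000 in every lane, and the helper name is invented):

#include <immintrin.h>
#include <cstdint>

static __m128i add_i32_sat(__m128i a, __m128i b) {
  __m128i sum = _mm_add_epi32(a, b);
  // Ternary-logic immediate 0b00100100 evaluates ~(a ^ b) & (a ^ sum): the sign
  // bit ends up set exactly where the inputs agree in sign but the sum does not.
  __m128i ovf_bits = _mm_ternarylogic_epi32(a, sum, b, 0b00100100);
  __mmask8 overflowed = _mm_movepi32_mask(ovf_bits);  // vpmovd2m: collect sign bits
  // Build the saturation value from the wrapped sum's sign: a negative wrapped
  // sum means positive overflow -> INT32_MAX, otherwise -> INT32_MIN.
  __m128i sat = _mm_xor_si128(_mm_srai_epi32(sum, 31), _mm_set1_epi32(INT32_MIN));
  return _mm_mask_blend_epi32(overflowed, sum, sat);
}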
@@ -643,6 +679,19 @@ struct VECTOR_SUB
// src1/src2.
e.vpsubd(e.xmm1, src1, src2);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
// If the result is less than or equal to the first operand, then
// we did not underflow
Opmask not_underflow = e.k1;
// _mm_cmple_epu32_mask
e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);
// Copy over values that did not underflow, write zero
// everywhere else
e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
return;
}
// If result is greater than either of the inputs, we've
// underflowed (only need to check one input)
// if (res > src1) then underflowed
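For reference, the unsigned saturating subtract above expressed with AVX512F+AVX512VL intrinsics (a sketch, not commit code; helper name invented):

#include <immintrin.h>

static __m128i sub_u32_sat(__m128i a, __m128i b) {
  __m128i diff = _mm_sub_epi32(a, b);
  // No borrow occurred iff the wrapped difference is <= the minuend.
  __mmask8 no_underflow = _mm_cmple_epu32_mask(diff, a);
  // Keep those lanes; zero-masking clamps the underflowed lanes to 0.
  return _mm_maskz_mov_epi32(no_underflow, diff);
}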
@@ -654,6 +703,21 @@ struct VECTOR_SUB
} else {
e.vpsubd(e.xmm1, src1, src2);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
kX64EmitAVX512DQ)) {
e.vmovdqa32(e.xmm3, src1);
e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);
const Opmask saturate = e.k1;
e.vpmovd2m(saturate, e.xmm3);
e.vpsrad(e.xmm2, e.xmm1, 31);
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
return;
}
// We can only overflow if the signs of the operands are
// opposite. If signs are opposite and result sign isn't the
// same as src1's sign, we've overflowed. if ((s32b)((src1 ^
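Only the vpternlogd immediate differs between the signed add and signed subtract paths above, because the overflow predicate for subtraction is (src1 ^ src2) & (src1 ^ res). A quick derivation of the immediate (not from the commit), with bit index = (src1 << 2) | (res << 1) | src2:

//   f(A, B, C) = (A ^ C) & (A ^ B)    // signed-subtract overflow predicate
//   true only at indices 3 (A=0,B=1,C=1) and 4 (A=1,B=0,C=0)
//   -> immediate = 0b00011000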
@@ -1287,7 +1351,6 @@ static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
- // TODO(benvanik): AVX512 has a native variable rotate (rolv).
struct VECTOR_ROTATE_LEFT_V128
: Sequence<VECTOR_ROTATE_LEFT_V128,
I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
@@ -1318,7 +1381,9 @@ struct VECTOR_ROTATE_LEFT_V128
e.vmovaps(i.dest, e.xmm0);
break;
case INT32_TYPE: {
- if (e.IsFeatureEnabled(kX64EmitAVX2)) {
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vprolvd(i.dest, i.src1, i.src2);
} else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
Xmm temp = i.dest;
if (i.dest == i.src1 || i.dest == i.src2) {
temp = e.xmm2;
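For reference, vprolvd is the per-lane variable rotate the old TODO referred to; with AVX512F+AVX512VL it is a single intrinsic (a sketch, not commit code):

#include <immintrin.h>

// Rotate each 32-bit lane of 'value' left by the count in the matching lane of 'amount'.
static __m128i rotl_v32(__m128i value, __m128i amount) {
  return _mm_rolv_epi32(value, amount);
}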
@@ -1573,7 +1638,11 @@ EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32);
struct SPLAT_I8 : Sequence<SPLAT_I8, I<OPCODE_SPLAT, V128Op, I8Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.src1.is_constant) {
- // TODO(benvanik): faster constant splats.
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
e.pxor(e.xmm0, e.xmm0);
e.gf2p8affineqb(i.dest, e.xmm0, i.src1.constant());
return;
}
e.mov(e.eax, i.src1.constant());
e.vmovd(e.xmm0, e.eax);
} else {
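For reference, the GFNI constant splat above works because gf2p8affineqb computes (A*x) ^ imm8 per byte, so zeroing either operand makes the matrix product vanish and leaves just the immediate in every byte. A minimal sketch with the matching intrinsic (not commit code; 0x2A is an arbitrary example byte):

#include <immintrin.h>

// Broadcast the byte 0x2A into all 16 lanes without a GPR round trip (requires GFNI).
static __m128i splat_0x2a() {
  const __m128i zero = _mm_setzero_si128();
  // (matrix * zero vector) ^ 0x2A == 0x2A in every byte.
  return _mm_gf2p8affine_epi64_epi8(zero, zero, 0x2A);
}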