[x64] Add AVX512 optimization for OPCODE_VECTOR_SUB(saturated)
parent 2db8d01a13
commit fd942a37f1
@@ -2,7 +2,7 @@
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project *
  ******************************************************************************
- * Copyright 2018 Xenia Developers. All rights reserved. *
+ * Copyright 2022 Xenia Developers. All rights reserved. *
  * Released under the BSD license - see LICENSE in the root for more details. *
  ******************************************************************************
  */
@@ -83,6 +83,19 @@ struct VECTOR_CONVERT_F2I
                I<OPCODE_VECTOR_CONVERT_F2I, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
+      if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+        Opmask mask = e.k1;
+        // Mask positive values and unordered values
+        // _CMP_NLT_UQ
+        e.vcmpps(mask, i.src1, e.GetXmmConstPtr(XMMZero), 0x15);
+
+        // vcvttps2udq will saturate overflowing positive values and unordered
+        // values to UINT_MAX. Mask registers will write zero everywhere
+        // else (negative values)
+        e.vcvttps2udq(i.dest.reg() | mask | e.T_z, i.src1);
+        return;
+      }
+
       // clamp to min 0
       e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero));
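Note on the new unsigned conversion path: the k-mask selects lanes that are non-negative or NaN (_CMP_NLT_UQ), vcvttps2udq then saturates too-large and NaN lanes to UINT_MAX, and the zero-mask write clears the negative lanes. A rough scalar sketch of that per-lane behavior (illustration only, assuming the documented truncating conversion; not emitter code):

    #include <cmath>
    #include <cstdint>

    // Approximate per-lane model of the masked vcvttps2udq sequence above.
    static uint32_t ConvertF32ToU32Saturate(float x) {
      if (x < 0.0f) {
        return 0;  // negative lanes fail _CMP_NLT_UQ and are zero-masked
      }
      if (std::isnan(x) || x >= 4294967296.0f) {
        return UINT32_MAX;  // vcvttps2udq saturates NaN/overflow to UINT_MAX
      }
      return static_cast<uint32_t>(x);  // truncate toward zero
    }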
@@ -547,6 +560,15 @@ struct VECTOR_ADD
       case INT32_TYPE:
         if (saturate) {
           if (is_unsigned) {
+            if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+              e.vpaddd(dest, src1, src2);
+              Opmask saturate = e.k1;
+              // _mm_cmplt_epu32_mask
+              e.vpcmpud(saturate, dest, src1, 0x1);
+              e.vpternlogd(dest | saturate, dest, dest, 0xFF);
+              return;
+            }
+
             // xmm0 is the only temp register that can be used by
             // src1/src2.
             e.vpaddd(e.xmm1, src1, src2);
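The unsigned saturating add above lets vpaddd wrap, flags wrapped lanes with an unsigned less-than compare, and uses the masked vpternlogd (truth table 0xFF) to force those lanes to all ones. A minimal scalar sketch of the same per-lane logic:

    #include <cstdint>

    // Scalar model of the per-lane behavior of the masked sequence above.
    static uint32_t AddU32Saturate(uint32_t a, uint32_t b) {
      uint32_t sum = a + b;          // wraps modulo 2^32 on overflow
      return (sum < a) ? UINT32_MAX  // wrapped lane -> saturate to all ones
                       : sum;
    }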
@@ -562,6 +584,20 @@ struct VECTOR_ADD
           } else {
             e.vpaddd(e.xmm1, src1, src2);
 
+            if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
+                                   kX64EmitAVX512DQ)) {
+              e.vmovdqa32(e.xmm3, src1);
+              e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00100100);
+
+              const Opmask saturate = e.k1;
+              e.vpmovd2m(saturate, e.xmm3);
+
+              e.vpsrad(e.xmm2, e.xmm1, 31);
+              e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
+              e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
+              return;
+            }
+
             // Overflow results if two inputs are the same sign and the
             // result isn't the same sign. if ((s32b)(~(src1 ^ src2) &
             // (src1 ^ res)) < 0) then overflowed
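The AVX512DQ path folds the existing overflow test (kept in the comment below) into a single vpternlogd, pulls each lane's sign bit into a k-mask with vpmovd2m, and builds the clamp value (INT32_MAX or INT32_MIN, chosen by the wrapped sum's sign) with vpsrad/vpxord before blending. A scalar sketch of the intended per-lane result, for reference:

    #include <cstdint>

    // Scalar model of the AVX512 signed-saturating add path above.
    static int32_t AddS32Saturate(int32_t a, int32_t b) {
      uint32_t ua = static_cast<uint32_t>(a);
      uint32_t ub = static_cast<uint32_t>(b);
      uint32_t sum = ua + ub;
      // Overflow when both inputs share a sign and the wrapped sum's sign
      // differs; the ternary-logic constant evaluates this expression bitwise
      // and vpmovd2m reads out the per-lane sign bit.
      bool overflow = static_cast<int32_t>(~(ua ^ ub) & (ua ^ sum)) < 0;
      if (overflow) {
        // (sum >> 31) ^ 0x80000000: wrapped negative -> clamp to INT32_MAX,
        // wrapped non-negative -> clamp to INT32_MIN.
        return static_cast<int32_t>(sum) < 0 ? INT32_MAX : INT32_MIN;
      }
      return static_cast<int32_t>(sum);
    }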
@@ -643,6 +679,19 @@ struct VECTOR_SUB
             // src1/src2.
             e.vpsubd(e.xmm1, src1, src2);
 
+            if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+              // If the result is less or equal to the first operand then
+              // we did not underflow
+              Opmask not_underflow = e.k1;
+              // _mm_cmple_epu32_mask
+              e.vpcmpud(not_underflow, e.xmm1, src1, 0x2);
+
+              // Copy over values that did not underflow, write zero
+              // everywhere else
+              e.vmovdqa32(dest | not_underflow | e.T_z, e.xmm1);
+              return;
+            }
+
             // If result is greater than either of the inputs, we've
             // underflowed (only need to check one input)
             // if (res > src1) then underflowed
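The unsigned saturating subtract mirrors the add path: vpsubd wraps, the unsigned less-or-equal compare keeps the lanes that did not underflow, and the zero-masked vmovdqa32 clears the rest. In scalar terms, roughly:

    #include <cstdint>

    // Scalar model of the per-lane behavior of the zero-masked sequence above.
    static uint32_t SubU32Saturate(uint32_t a, uint32_t b) {
      uint32_t diff = a - b;     // wraps if b > a
      return (diff <= a) ? diff  // kept by the not_underflow mask
                         : 0;    // underflowed lane -> zero-masked
    }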
@@ -654,6 +703,21 @@ struct VECTOR_SUB
           } else {
             e.vpsubd(e.xmm1, src1, src2);
 
+            if (e.IsFeatureEnabled(kX64EmitAVX512Ortho |
+                                   kX64EmitAVX512DQ)) {
+              e.vmovdqa32(e.xmm3, src1);
+              e.vpternlogd(e.xmm3, e.xmm1, src2, 0b00011000);
+
+              const Opmask saturate = e.k1;
+              e.vpmovd2m(saturate, e.xmm3);
+
+              e.vpsrad(e.xmm2, e.xmm1, 31);
+              e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
+
+              e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
+              return;
+            }
+
             // We can only overflow if the signs of the operands are
             // opposite. If signs are opposite and result sign isn't the
             // same as src1's sign, we've overflowed. if ((s32b)((src1 ^
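The signed subtract path reuses the add-path construction with a different ternary-logic constant: overflow is only possible when the operands' signs differ and the wrapped difference's sign differs from src1's, and the clamp value again comes from the wrapped result's sign. A scalar sketch, for reference:

    #include <cstdint>

    // Scalar model of the AVX512 signed-saturating subtract path above.
    static int32_t SubS32Saturate(int32_t a, int32_t b) {
      uint32_t ua = static_cast<uint32_t>(a);
      uint32_t ub = static_cast<uint32_t>(b);
      uint32_t diff = ua - ub;
      // Equivalent to the overflow test in the comment below; the truth table
      // 0b00011000 evaluates a bitwise-equivalent expression and vpmovd2m
      // collects the per-lane sign bits into the opmask.
      bool overflow = static_cast<int32_t>((ua ^ ub) & (ua ^ diff)) < 0;
      if (overflow) {
        // Wrapped negative -> clamp to INT32_MAX, otherwise to INT32_MIN.
        return static_cast<int32_t>(diff) < 0 ? INT32_MAX : INT32_MIN;
      }
      return static_cast<int32_t>(diff);
    }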
@@ -1287,7 +1351,6 @@ static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
 
-// TODO(benvanik): AVX512 has a native variable rotate (rolv).
 struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
@@ -1318,7 +1381,9 @@ struct VECTOR_ROTATE_LEFT_V128
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+          e.vprolvd(i.dest, i.src1, i.src2);
+        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
           Xmm temp = i.dest;
           if (i.dest == i.src1 || i.dest == i.src2) {
             temp = e.xmm2;
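vprolvd is the per-lane variable rotate the removed TODO pointed at, so the AVX512 path needs no shift/or emulation. Per lane it behaves like a plain 32-bit rotate-left:

    #include <cstdint>

    // Per-lane behavior of vprolvd; rotate counts are taken modulo 32.
    static uint32_t RotateLeft32(uint32_t value, uint32_t amount) {
      amount &= 31;
      return (value << amount) | (value >> ((32 - amount) & 31));
    }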
@@ -1573,7 +1638,11 @@ EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32);
 struct SPLAT_I8 : Sequence<SPLAT_I8, I<OPCODE_SPLAT, V128Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.src1.is_constant) {
       // TODO(benvanik): faster constant splats.
+      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        e.pxor(e.xmm0, e.xmm0);
+        e.gf2p8affineqb(i.dest, e.xmm0, i.src1.constant());
+        return;
+      }
       e.mov(e.eax, i.src1.constant());
       e.vmovd(e.xmm0, e.eax);
     } else {
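The GFNI splat leans on gf2p8affineqb computing, per byte, a GF(2) matrix product XORed with the immediate. Because e.xmm0 is zeroed first, the matrix product contributes nothing and every destination byte collapses to the immediate constant, splatting it without a GPR round trip. A rough per-byte model (the exact bit/byte ordering is approximate here, but the zeroed-operand conclusion does not depend on it):

    #include <cstdint>

    // Approximate per-byte model of gf2p8affineqb(dst, matrix, imm8).
    static uint8_t Gf2p8AffineByte(uint64_t matrix, uint8_t x, uint8_t imm8) {
      uint8_t result = 0;
      for (int bit = 0; bit < 8; ++bit) {
        uint8_t row = static_cast<uint8_t>(matrix >> (8 * (7 - bit)));
        uint8_t masked = static_cast<uint8_t>(row & x);
        // Fold the masked row down to its parity bit.
        masked ^= masked >> 4;
        masked ^= masked >> 2;
        masked ^= masked >> 1;
        result |= static_cast<uint8_t>(((masked ^ (imm8 >> bit)) & 1) << bit);
      }
      return result;  // matrix == 0 => result == imm8 for any x (the splat)
    }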