[x64] NAN up vmsum3fp128/vmsum4fp128.

It seems vmsum3fp128/vmsum4fp128 need to output a NaN on overflow.

Do so. Probably inefficiently.
This commit is contained in:
gibbed 2020-02-24 18:00:22 -06:00 committed by Rick Gibbed
parent 9185cdcc79
commit aa28430786
1 changed files with 50 additions and 12 deletions

View File

@ -2496,6 +2496,48 @@ struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
}; };
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);
struct DOT_PRODUCT_V128 {
static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
// TODO(benvanik): apparently this is very slow
// - find alternative?
Xbyak::Label end;
e.inLocalLabel();
// Grab space to put MXCSR.
// TODO(gibbed): stick this in TLS or
// something?
e.sub(e.rsp, 8);
// Grab MXCSR and mask off the overflow flag,
// because it's sticky.
e.vstmxcsr(e.dword[e.rsp]);
e.mov(e.eax, e.dword[e.rsp]);
e.and_(e.eax, uint32_t(~8));
e.mov(e.dword[e.rsp], e.eax);
e.vldmxcsr(e.dword[e.rsp]);
// Hey we can do the dot product now.
e.vdpps(dest, src1, src2, imm);
// Load MXCSR...
e.vstmxcsr(e.dword[e.rsp]);
// ..free our temporary space and get MXCSR at
// the same time
e.pop(e.rax);
// Did we overflow?
e.test(e.al, 8);
e.jz(end);
// Infinity? HA! Give NAN.
e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));
e.L(end);
e.outLocalLabel();
}
};
// ============================================================================ // ============================================================================
// OPCODE_DOT_PRODUCT_3 // OPCODE_DOT_PRODUCT_3
// ============================================================================ // ============================================================================
@ -2504,11 +2546,9 @@ struct DOT_PRODUCT_3_V128
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> { I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(e, i, EmitCommutativeBinaryXmmOp(
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
// TODO(benvanik): apparently this is very slow DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001);
// - find alternative?
e.vdpps(dest, src1, src2, 0b01110001);
}); });
} }
}; };
@ -2522,11 +2562,9 @@ struct DOT_PRODUCT_4_V128
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> { I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
EmitCommutativeBinaryXmmOp(e, i, EmitCommutativeBinaryXmmOp(
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
// TODO(benvanik): apparently this is very slow DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001);
// - find alternative?
e.vdpps(dest, src1, src2, 0b11110001);
}); });
} }
}; };