[x64] NAN up vmsum3fp128/vmsum4fp128.
It seems vmsum3fp128/vmsum4fp128 need to output a NAN on overflow. Do so. Probably inefficiently.
This commit is contained in:
parent
9185cdcc79
commit
aa28430786
|
@ -2496,6 +2496,48 @@ struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
|
||||||
};
|
};
|
||||||
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);
|
EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128);
|
||||||
|
|
||||||
|
struct DOT_PRODUCT_V128 {
|
||||||
|
static void Emit(X64Emitter& e, Xmm dest, Xmm src1, Xmm src2, uint8_t imm) {
|
||||||
|
// TODO(benvanik): apparently this is very slow
|
||||||
|
// - find alternative?
|
||||||
|
Xbyak::Label end;
|
||||||
|
e.inLocalLabel();
|
||||||
|
|
||||||
|
// Grab space to put MXCSR.
|
||||||
|
// TODO(gibbed): stick this in TLS or
|
||||||
|
// something?
|
||||||
|
e.sub(e.rsp, 8);
|
||||||
|
|
||||||
|
// Grab MXCSR and mask off the overflow flag,
|
||||||
|
// because it's sticky.
|
||||||
|
e.vstmxcsr(e.dword[e.rsp]);
|
||||||
|
e.mov(e.eax, e.dword[e.rsp]);
|
||||||
|
e.and_(e.eax, uint32_t(~8));
|
||||||
|
e.mov(e.dword[e.rsp], e.eax);
|
||||||
|
e.vldmxcsr(e.dword[e.rsp]);
|
||||||
|
|
||||||
|
// Hey we can do the dot product now.
|
||||||
|
e.vdpps(dest, src1, src2, imm);
|
||||||
|
|
||||||
|
// Load MXCSR...
|
||||||
|
e.vstmxcsr(e.dword[e.rsp]);
|
||||||
|
|
||||||
|
// ..free our temporary space and get MXCSR at
|
||||||
|
// the same time
|
||||||
|
e.pop(e.rax);
|
||||||
|
|
||||||
|
// Did we overflow?
|
||||||
|
e.test(e.al, 8);
|
||||||
|
e.jz(end);
|
||||||
|
|
||||||
|
// Infinity? HA! Give NAN.
|
||||||
|
e.vmovdqa(dest, e.GetXmmConstPtr(XMMQNaN));
|
||||||
|
|
||||||
|
e.L(end);
|
||||||
|
e.outLocalLabel();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// OPCODE_DOT_PRODUCT_3
|
// OPCODE_DOT_PRODUCT_3
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
@ -2504,11 +2546,9 @@ struct DOT_PRODUCT_3_V128
|
||||||
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
|
I<OPCODE_DOT_PRODUCT_3, F32Op, V128Op, V128Op>> {
|
||||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||||
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||||
EmitCommutativeBinaryXmmOp(e, i,
|
EmitCommutativeBinaryXmmOp(
|
||||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||||
// TODO(benvanik): apparently this is very slow
|
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b01110001);
|
||||||
// - find alternative?
|
|
||||||
e.vdpps(dest, src1, src2, 0b01110001);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -2522,11 +2562,9 @@ struct DOT_PRODUCT_4_V128
|
||||||
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
|
I<OPCODE_DOT_PRODUCT_4, F32Op, V128Op, V128Op>> {
|
||||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||||
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
// https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx
|
||||||
EmitCommutativeBinaryXmmOp(e, i,
|
EmitCommutativeBinaryXmmOp(
|
||||||
[](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
|
||||||
// TODO(benvanik): apparently this is very slow
|
DOT_PRODUCT_V128::Emit(e, dest, src1, src2, 0b11110001);
|
||||||
// - find alternative?
|
|
||||||
e.vdpps(dest, src1, src2, 0b11110001);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue