implement more accurately inaccurate frsqrte

This commit is contained in:
disjtqz 2023-10-01 15:06:33 -04:00 committed by Radosław Gliński
parent 79465708aa
commit 67f16c4e31
3 changed files with 141 additions and 7 deletions

View File

@ -76,6 +76,8 @@ class X64HelperEmitter : public X64Emitter {
void* EmitScalarVRsqrteHelper();
void* EmitVectorVRsqrteHelper(void* scalar_helper);
void* EmitFrsqrteHelper();
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@ -240,6 +242,7 @@ bool X64Backend::Initialize(Processor* processor) {
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
frsqrtefp_helper = thunk_emitter.EmitFrsqrteHelper();
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@ -1162,6 +1165,140 @@ void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitFrsqrteHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
code_offsets.prolog = getSize();
Xbyak::Label L2, L7, L6, L9, L1, L12, L24, L3, L25, frsqrte_table2, LC1;
bt(GetBackendFlagsPtr(), kX64BackendNonIEEEMode);
vmovq(rax, xmm0);
jc(L24, CodeGenerator::T_NEAR);
L(L2);
mov(rcx, rax);
add(rcx, rcx);
je(L3, CodeGenerator::T_NEAR);
mov(rdx, 0x7ff0000000000000ULL);
vxorpd(xmm1, xmm1, xmm1);
if (IsFeatureEnabled(kX64EmitBMI1)) {
andn(rcx, rax, rdx);
} else {
mov(rcx, rax);
not_(rcx);
and_(rcx, rdx);
}
jne(L6);
cmp(rax, rdx);
je(L1, CodeGenerator::T_NEAR);
mov(r8, rax);
sal(r8, 12);
jne(L7);
vcomisd(xmm0, xmm1);
jb(L12, CodeGenerator::T_NEAR);
L(L7);
mov(rdx, 0x7ff8000000000000ULL);
or_(rax, rdx);
vmovq(xmm1, rax);
vmovapd(xmm0, xmm1);
ret();
L(L6);
vcomisd(xmm1, xmm0);
ja(L12, CodeGenerator::T_NEAR);
mov(rcx, rax);
mov(rdx, 0xfffffffffffffULL);
shr(rcx, 52);
and_(ecx, 2047);
and_(rax, rdx);
je(L9);
test(ecx, ecx);
je(L25, CodeGenerator::T_NEAR);
L(L9);
lea(edx, ptr[0 + rcx * 8]);
shr(rax, 49);
sub(ecx, 1023);
and_(edx, 8);
and_(eax, 7);
shr(ecx, 1);
or_(eax, edx);
mov(edx, 1022);
xor_(eax, 8);
sub(edx, ecx);
lea(rcx, ptr[rip + frsqrte_table2]);
movzx(eax, byte[rax+rcx]);
sal(rdx, 52);
sal(rax, 44);
or_(rax, rdx);
vmovq(xmm1, rax);
L(L1);
vmovapd(xmm0, xmm1);
ret();
L(L12);
vmovsd(xmm1, qword[rip + LC1]);
vmovapd(xmm0, xmm1);
ret();
L(L24);
mov(r8, rax);
sal(r8, 12);
je(L2);
mov(rdx, 0x7ff0000000000000);
test(rax, rdx);
jne(L2);
mov(rdx, 0x8000000000000000ULL);
and_(rax, rdx);
L(L3);
mov(rdx, 0x8000000000000000ULL);
and_(rax, rdx);
mov(rdx, 0x7ff0000000000000ULL);
or_(rax, rdx);
vmovq(xmm1, rax);
vmovapd(xmm0, xmm1);
ret();
L(L25);
if (IsFeatureEnabled(kX64EmitLZCNT)) {
lzcnt(rdx, rax);
} else {
Xbyak::Label end_lzcnt;
bsr(rcx, rax);
mov(rdx, 0x40);
jz(end_lzcnt);
xor_(rcx, 0x3F);
mov(rdx, rcx);
L(end_lzcnt);
}
lea(ecx, ptr[rdx - 11]);
if (IsFeatureEnabled(kX64EmitBMI2)) {
shlx(rax, rax, rcx);
} else {
shl(rax, cl);
}
mov(ecx, 12);
sub(ecx, edx);
jmp(L9, CodeGenerator::T_NEAR);
L(frsqrte_table2);
static constexpr unsigned char table_values[] = {
241u, 216u, 192u, 168u, 152u, 136u, 128u, 112u,
96u, 76u, 60u, 48u, 32u, 24u, 16u, 8u};
db(table_values, sizeof(table_values));
L(LC1);
dd(0);
dd(0x7ff80000);
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();

View File

@ -193,6 +193,7 @@ class X64Backend : public Backend {
void* reserved_store_64_helper = nullptr;
void* vrsqrtefp_vector_helper = nullptr;
void* vrsqrtefp_scalar_helper = nullptr;
void* frsqrtefp_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;

View File

@ -2110,13 +2110,9 @@ struct RSQRT_F64 : Sequence<RSQRT_F64, I<OPCODE_RSQRT, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vrsqrt14sd(i.dest, src1, src1);
} else {
e.vmovapd(e.xmm0, e.GetXmmConstPtr(XMMOnePD));
e.vsqrtsd(e.xmm1, src1, src1);
e.vdivsd(i.dest, e.xmm0, e.xmm1);
}
e.vmovsd(e.xmm0, src1);
e.call(e.backend()->frsqrtefp_helper);
e.vmovsd(i.dest, e.xmm0);
}
};
struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {