Deterministic FREST and FRSQEST

This commit is contained in:
RipleyTom 2023-04-26 03:00:11 +02:00 committed by Ivan
parent db2341c842
commit 5c0113ce59
2 changed files with 27 additions and 11 deletions

View File

@ -8860,12 +8860,16 @@ public:
return;
}
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
if (g_cfg.net.net_active == np_internet_status::enabled)
if (g_cfg.core.spu_approx_xfloat)
{
register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
{
return fsplat<f32[4]>(1.0) / value<f32[4]>(ci->getOperand(0));
const auto a = value<f32[4]>(ci->getOperand(0));
const auto acc_result = fsplat<f32[4]>(1.0) / a;
// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
});
}
else
@ -8873,6 +8877,7 @@ public:
register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));
// Fast but this makes the result vary per cpu
return fre(a);
});
}
@ -8895,12 +8900,16 @@ public:
return;
}
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
if (g_cfg.net.net_active == np_internet_status::enabled)
if (g_cfg.core.spu_approx_xfloat)
{
register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
{
return fsplat<f32[4]>(1.0) / fsqrt(fabs(value<f32[4]>(ci->getOperand(0))));
const auto a = value<f32[4]>(ci->getOperand(0));
const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
});
}
else
@ -8908,6 +8917,7 @@ public:
register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));
// Fast but this makes the result vary per cpu
return frsqe(fabs(a));
});
}
@ -9633,23 +9643,29 @@ public:
return bitcast<f32[4]>((b & 0xff800000u) | (bitcast<u32[4]>(fpcast<f32[4]>(bnew)) & ~0xff800000u)); // Inject old sign and exponent
});
// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
if (g_cfg.net.net_active == np_internet_status::enabled)
if (g_cfg.core.spu_approx_xfloat)
{
register_intrinsic("spu_re", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));
return fsplat<f32[4]>(1.0) / a;
const auto acc_result = fsplat<f32[4]>(1.0) / a;
// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
return acc_result - acc_penalty;
});
register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));
return fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
return acc_result - acc_penalty;
});
}
else
{
// For relaxed use intrinsics, those make the results vary per cpu
register_intrinsic("spu_re", [&](llvm::CallInst* ci)
{
const auto a = value<f32[4]>(ci->getOperand(0));

View File

@ -68,7 +68,7 @@ struct cfg_root : cfg::node
cfg::_enum<tsx_usage> enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT" and "FNMS" instructions
cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT", "FNMS", "FREST" AND "FRSQEST" instructions
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false };