diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index d515acf4dc..86655189e4 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -8860,12 +8860,16 @@ public: return; } - // To avoid divergence in online play don't use divergent intel/amd intrinsics when online - if (g_cfg.net.net_active == np_internet_status::enabled) + if (g_cfg.core.spu_approx_xfloat) { register_intrinsic("spu_frest", [&](llvm::CallInst* ci) { - return fsplat(1.0) / value(ci->getOperand(0)); + const auto a = value(ci->getOperand(0)); + const auto acc_result = fsplat(1.0) / a; + // Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy + const auto acc_penalty = fsplat(0x1p-13f) * acc_result; + // Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed + return bitcast(bitcast(acc_result - acc_penalty) & splat(0xFFFFF800)); }); } else @@ -8873,6 +8877,7 @@ public: register_intrinsic("spu_frest", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); + // Fast but this makes the result vary per cpu return fre(a); }); } @@ -8895,12 +8900,16 @@ public: return; } - // To avoid divergence in online play don't use divergent intel/amd intrinsics when online - if (g_cfg.net.net_active == np_internet_status::enabled) + if (g_cfg.core.spu_approx_xfloat) { register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) { - return fsplat(1.0) / fsqrt(fabs(value(ci->getOperand(0)))); + const auto a = value(ci->getOperand(0)); + const auto acc_result = fsplat(1.0) / fsqrt(fabs(a)); + // Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy + const auto acc_penalty = fsplat(0x1p-13f) * acc_result; + // Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed + return bitcast(bitcast(acc_result - acc_penalty) & splat(0xFFFFF800)); }); } else @@ 
-8908,6 +8917,7 @@ public: register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); + // Fast but this makes the result vary per cpu return frsqe(fabs(a)); }); } @@ -9633,23 +9643,29 @@ public: return bitcast((b & 0xff800000u) | (bitcast(fpcast(bnew)) & ~0xff800000u)); // Inject old sign and exponent }); - // To avoid divergence in online play don't use divergent intel/amd intrinsics when online - if (g_cfg.net.net_active == np_internet_status::enabled) + if (g_cfg.core.spu_approx_xfloat) { register_intrinsic("spu_re", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); - return fsplat(1.0) / a; + const auto acc_result = fsplat(1.0) / a; + // Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy + const auto acc_penalty = fsplat(0x1p-13f) * acc_result; + return acc_result - acc_penalty; }); register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); - return fsplat(1.0) / fsqrt(fabs(a)); + const auto acc_result = fsplat(1.0) / fsqrt(fabs(a)); + // Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy + const auto acc_penalty = fsplat(0x1p-13f) * acc_result; + return acc_result - acc_penalty; }); } else { + // For relaxed use intrinsics, those make the results vary per cpu register_intrinsic("spu_re", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 3a9ffea945..9434b30876 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -68,7 +68,7 @@ struct cfg_root : cfg::node cfg::_enum enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. 
Forcing this on Haswell/Broadwell CPUs should be used carefully cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false }; cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true }; - cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT" and "FNMS" instructions + cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT", "FNMS", "FREST" and "FRSQEST" instructions cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false };