NetPlay/Jit64: Avoid using software FMA

When I added the software FMA path in 2c38d64 and made us use
it when determinism is enabled, I was assuming that either the
performance impact of software FMA wouldn't be too large or CPUs
that were too old to have FMA instructions were too slow to run
Dolphin well anyway. This was wrong. To give an example, the
netplay performance went from 60 FPS to 30 FPS in one case.

This change makes netplay clients negotiate whether FMA should
be used. If all clients use an x64 CPU that supports FMA, or
AArch64, then FMA is enabled, and otherwise FMA is disabled.
In other words, we sacrifice accuracy if needed to avoid massive
slowdown, but not otherwise. When not using netplay, whether to
enable FMA is simply based on whether the host CPU supports it.

The only remaining case where the software FMA path gets used
under normal circumstances is when an input recording is created
on a CPU with FMA support and then played back on a CPU without.
This is not an especially common scenario (though it can happen),
and TASers are generally less picky about performance and more
picky about accuracy than other users anyway.

With this change, FMA desyncs are avoided between AArch64 and
modern x64 CPUs (unlike before 2c38d64), but we do get FMA
desyncs between AArch64 and old x64 CPUs (like before 2c38d64).
This desync can be avoided by adding a non-FMA path to JitArm64 as
an option, which I will wait with for another pull request so that
we can get the performance regression fixed as quickly as possible.

https://bugs.dolphin-emu.org/issues/12542
This commit is contained in:
JosJuice 2021-06-09 20:16:41 +02:00
parent 8f9bb5612a
commit ac28b89fa5
16 changed files with 105 additions and 24 deletions

View File

@ -141,7 +141,9 @@ static const std::map<System, std::string> system_to_name = {
{System::Logger, "Logger"},
{System::Debugger, "Debugger"},
{System::SYSCONF, "SYSCONF"},
{System::DualShockUDPClient, "DualShockUDPClient"}};
{System::DualShockUDPClient, "DualShockUDPClient"},
{System::FreeLook, "FreeLook"},
{System::Session, "Session"}};
const std::string& GetSystemName(System system)
{

View File

@ -33,6 +33,7 @@ enum class System
Debugger,
DualShockUDPClient,
FreeLook,
Session,
};
constexpr std::array<LayerType, 7> SEARCH_ORDER{{

View File

@ -26,6 +26,8 @@ add_library(core
Config/MainSettings.h
Config/NetplaySettings.cpp
Config/NetplaySettings.h
Config/SessionSettings.cpp
Config/SessionSettings.h
Config/SYSCONFSettings.cpp
Config/SYSCONFSettings.h
Config/UISettings.cpp

View File

@ -0,0 +1,14 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "Core/Config/SessionSettings.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
namespace Config
{
const Info<bool> SESSION_USE_FMA{{System::Session, "Core", "UseFMA"}, CPUInfo().bFMA};
} // namespace Config

View File

@ -0,0 +1,13 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
namespace Config
{
extern const Info<bool> SESSION_USE_FMA;
} // namespace Config

View File

@ -94,6 +94,7 @@ const std::map<Config::System, int> system_to_ini = {
{Config::System::Debugger, F_DEBUGGERCONFIG_IDX},
{Config::System::DualShockUDPClient, F_DUALSHOCKUDPCLIENTCONFIG_IDX},
{Config::System::FreeLook, F_FREELOOKCONFIG_IDX},
// Config::System::Session should not be added to this list
};
// INI layer configuration loader
@ -144,6 +145,9 @@ public:
if (location.system == Config::System::SYSCONF)
continue;
if (location.system == Config::System::Session)
continue;
auto ini = inis.find(location.system);
if (ini == inis.end())
{

View File

@ -250,6 +250,9 @@ private:
if (location.section.empty() && location.key.empty())
continue;
if (location.system == Config::System::Session)
continue;
layer->Set(location, value.second);
}
}
@ -272,7 +275,7 @@ void INIGameConfigLayerLoader::Save(Config::Layer* layer)
const Config::Location& location = config.first;
const std::optional<std::string>& value = config.second;
if (!IsSettingSaveable(location))
if (!IsSettingSaveable(location) || location.system == Config::System::Session)
continue;
const auto ini_location = GetINILocationFromConfig(location);

View File

@ -14,6 +14,7 @@
#include "Core/Config/GraphicsSettings.h"
#include "Core/Config/MainSettings.h"
#include "Core/Config/SYSCONFSettings.h"
#include "Core/Config/SessionSettings.h"
#include "Core/ConfigManager.h"
#include "Core/Movie.h"
#include "VideoCommon/VideoConfig.h"
@ -46,6 +47,8 @@ static void LoadFromDTM(Config::Layer* config_layer, Movie::DTMHeader* dtm)
config_layer->Set(Config::GFX_HACK_EFB_EMULATE_FORMAT_CHANGES, dtm->bEFBEmulateFormatChanges);
config_layer->Set(Config::GFX_HACK_IMMEDIATE_XFB, dtm->bImmediateXFB);
config_layer->Set(Config::GFX_HACK_SKIP_XFB_COPY_TO_RAM, dtm->bSkipXFBCopyToRam);
config_layer->Set(Config::SESSION_USE_FMA, dtm->bUseFMA);
}
void SaveToDTM(Movie::DTMHeader* dtm)
@ -70,7 +73,9 @@ void SaveToDTM(Movie::DTMHeader* dtm)
dtm->bImmediateXFB = Config::Get(Config::GFX_HACK_IMMEDIATE_XFB);
dtm->bSkipXFBCopyToRam = Config::Get(Config::GFX_HACK_SKIP_XFB_COPY_TO_RAM);
// This never used the regular config
dtm->bUseFMA = Config::Get(Config::SESSION_USE_FMA);
// Settings which only existed in old Dolphin versions
dtm->bSkipIdle = true;
dtm->bEFBCopyEnable = true;
dtm->bEFBCopyCacheEnable = false;

View File

@ -11,9 +11,11 @@
#include "Common/CommonPaths.h"
#include "Common/Config/Config.h"
#include "Common/FileUtil.h"
#include "Core/Config/GraphicsSettings.h"
#include "Core/Config/MainSettings.h"
#include "Core/Config/SYSCONFSettings.h"
#include "Core/Config/SessionSettings.h"
#include "Core/NetPlayProto.h"
namespace ConfigLoaders
@ -87,6 +89,8 @@ public:
layer->Set(Config::GFX_HACK_EFB_ACCESS_TILE_SIZE, m_settings.m_EFBAccessTileSize);
layer->Set(Config::GFX_HACK_EFB_DEFER_INVALIDATION, m_settings.m_EFBAccessDeferInvalidation);
layer->Set(Config::SESSION_USE_FMA, m_settings.m_UseFMA);
if (m_settings.m_StrictSettingsSync)
{
layer->Set(Config::GFX_HACK_VERTEX_ROUDING, m_settings.m_VertexRounding);

View File

@ -116,7 +116,8 @@ struct DTMHeader
u8 language;
u8 reserved3;
bool bFollowBranch;
std::array<u8, 9> reserved; // Padding for any new config options
bool bUseFMA;
std::array<u8, 8> reserved; // Padding for any new config options
std::array<char, 40> discChange; // Name of iso file to switch to, for two disc games.
std::array<u8, 20> revision; // Git hash
u32 DSPiromHash;

View File

@ -34,8 +34,10 @@
#include "Common/StringUtil.h"
#include "Common/Timer.h"
#include "Common/Version.h"
#include "Core/ActionReplay.h"
#include "Core/Config/NetplaySettings.h"
#include "Core/Config/SessionSettings.h"
#include "Core/ConfigManager.h"
#include "Core/GeckoCode.h"
#include "Core/HW/EXI/EXI_DeviceIPL.h"
@ -54,6 +56,7 @@
#include "Core/Movie.h"
#include "Core/PowerPC/PowerPC.h"
#include "Core/SyncIdentifier.h"
#include "InputCommon/ControllerEmu/ControlGroup/Attachments.h"
#include "InputCommon/GCAdapter.h"
#include "InputCommon/InputConfig.h"
@ -609,10 +612,11 @@ unsigned int NetPlayClient::OnData(sf::Packet& packet)
game_status_packet << static_cast<u32>(result);
Send(game_status_packet);
sf::Packet ipl_status_packet;
ipl_status_packet << static_cast<MessageId>(NP_MSG_IPL_STATUS);
ipl_status_packet << ExpansionInterface::CEXIIPL::HasIPLDump();
Send(ipl_status_packet);
sf::Packet client_capabilities_packet;
client_capabilities_packet << static_cast<MessageId>(NP_MSG_CLIENT_CAPABILITIES);
client_capabilities_packet << ExpansionInterface::CEXIIPL::HasIPLDump();
client_capabilities_packet << Config::Get(Config::SESSION_USE_FMA);
Send(client_capabilities_packet);
}
break;
@ -735,6 +739,7 @@ unsigned int NetPlayClient::OnData(sf::Packet& packet)
packet >> extension;
packet >> m_net_settings.m_GolfMode;
packet >> m_net_settings.m_UseFMA;
m_net_settings.m_IsHosting = m_local_player->IsHost();
m_net_settings.m_HostInputAuthority = m_host_input_authority;

View File

@ -97,6 +97,7 @@ struct NetSettings
bool m_SyncAllWiiSaves;
std::array<int, 4> m_WiimoteExtension;
bool m_GolfMode;
bool m_UseFMA;
// These aren't sent over the network directly
bool m_IsHosting;
@ -156,7 +157,7 @@ enum
NP_MSG_STOP_GAME = 0xA2,
NP_MSG_DISABLE_GAME = 0xA3,
NP_MSG_GAME_STATUS = 0xA4,
NP_MSG_IPL_STATUS = 0xA5,
NP_MSG_CLIENT_CAPABILITIES = 0xA5,
NP_MSG_HOST_INPUT_AUTHORITY = 0xA6,
NP_MSG_POWER_BUTTON = 0xA7,

View File

@ -32,11 +32,13 @@
#include "Common/StringUtil.h"
#include "Common/UPnP.h"
#include "Common/Version.h"
#include "Core/ActionReplay.h"
#include "Core/Config/GraphicsSettings.h"
#include "Core/Config/MainSettings.h"
#include "Core/Config/NetplaySettings.h"
#include "Core/Config/SYSCONFSettings.h"
#include "Core/Config/SessionSettings.h"
#include "Core/ConfigLoaders/GameConfigLoader.h"
#include "Core/ConfigManager.h"
#include "Core/GeckoCode.h"
@ -54,10 +56,13 @@
#include "Core/IOS/Uids.h"
#include "Core/NetPlayClient.h" //for NetPlayUI
#include "Core/SyncIdentifier.h"
#include "DiscIO/Enums.h"
#include "InputCommon/ControllerEmu/ControlGroup/Attachments.h"
#include "InputCommon/GCPadStatus.h"
#include "InputCommon/InputConfig.h"
#include "UICommon/GameFile.h"
#if !defined(_WIN32)
@ -943,12 +948,10 @@ unsigned int NetPlayServer::OnData(sf::Packet& packet, Client& player)
}
break;
case NP_MSG_IPL_STATUS:
case NP_MSG_CLIENT_CAPABILITIES:
{
bool status;
packet >> status;
m_players[player.pid].has_ipl_dump = status;
packet >> m_players[player.pid].has_ipl_dump;
packet >> m_players[player.pid].has_hardware_fma;
}
break;
@ -1306,12 +1309,14 @@ bool NetPlayServer::SetupNetSettings()
settings.m_DeferEFBCopies = Config::Get(Config::GFX_HACK_DEFER_EFB_COPIES);
settings.m_EFBAccessTileSize = Config::Get(Config::GFX_HACK_EFB_ACCESS_TILE_SIZE);
settings.m_EFBAccessDeferInvalidation = Config::Get(Config::GFX_HACK_EFB_DEFER_INVALIDATION);
settings.m_StrictSettingsSync = Config::Get(Config::NETPLAY_STRICT_SETTINGS_SYNC);
settings.m_SyncSaveData = Config::Get(Config::NETPLAY_SYNC_SAVES);
settings.m_SyncCodes = Config::Get(Config::NETPLAY_SYNC_CODES);
settings.m_SyncAllWiiSaves =
Config::Get(Config::NETPLAY_SYNC_ALL_WII_SAVES) && Config::Get(Config::NETPLAY_SYNC_SAVES);
settings.m_GolfMode = Config::Get(Config::NETPLAY_NETWORK_MODE) == "golf";
settings.m_UseFMA = DoAllPlayersHaveHardwareFMA();
// Unload GameINI to restore things to normal
Config::RemoveLayer(Config::LayerType::GlobalGame);
@ -1328,6 +1333,12 @@ bool NetPlayServer::DoAllPlayersHaveIPLDump() const
[](const auto& p) { return p.second.has_ipl_dump; });
}
bool NetPlayServer::DoAllPlayersHaveHardwareFMA() const
{
return std::all_of(m_players.begin(), m_players.end(),
[](const auto& p) { return p.second.has_hardware_fma; });
}
// called from ---GUI--- thread
bool NetPlayServer::RequestStartGame()
{
@ -1490,6 +1501,7 @@ bool NetPlayServer::StartGame()
}
spac << m_settings.m_GolfMode;
spac << m_settings.m_UseFMA;
SendAsyncToClients(std::move(spac));

View File

@ -51,6 +51,7 @@ public:
void SendChatMessage(const std::string& msg);
bool DoAllPlayersHaveIPLDump() const;
bool DoAllPlayersHaveHardwareFMA() const;
bool StartGame();
bool RequestStartGame();
void AbortGameStart();
@ -82,6 +83,7 @@ private:
std::string revision;
SyncIdentifierComparison game_status;
bool has_ipl_dump;
bool has_hardware_fma;
ENetPeer* socket;
u32 ping;

View File

@ -10,7 +10,9 @@
#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
#include "Common/x64Emitter.h"
#include "Core/Config/SessionSettings.h"
#include "Core/ConfigManager.h"
#include "Core/Core.h"
#include "Core/PowerPC/Jit64/Jit.h"
@ -241,10 +243,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
// While we don't know if any games are actually affected (replays seem to work with all the usual
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
// be extra careful and use software FMA on CPUs that don't have hardware FMA.
const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
// We would like to emulate FMA instructions accurately without rounding error if possible, but
// unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA
// instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
// emulate FMA instructions accurately or by a performing a multiply followed by a separate add.
//
// Why have a setting instead of just checking cpu_info.bFMA, you might wonder? Because for
// netplay and TAS, it's important that everyone gets exactly the same results. The setting
// is not user configurable - Dolphin automatically sets it based on what is supported by the
// CPUs of everyone in the netplay room (or when not using netplay, simply the system's CPU).
//
// There is one circumstance where the software FMA path does get used: when an input recording
// is created on a CPU that has FMA instructions and then gets played back on a CPU that doesn't.
// (Or if the user just really wants to override the setting and knows how to do so.)
const bool use_fma = Config::Get(Config::SESSION_USE_FMA);
const bool software_fma = use_fma && !cpu_info.bFMA;
int a = inst.FA;
int b = inst.FB;
@ -272,11 +285,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
else
{
// For cpu_info.bFMA == true:
// For use_fma == true:
// Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b.
Ra = fpr.Use(a, RCMode::Read);
Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd);
@ -357,7 +370,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
break;
}
if (cpu_info.bFMA)
if (use_fma)
{
switch (inst.SUBOP5)
{
@ -395,9 +408,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
else
{
// No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
// do the multiplication and addition/subtraction in two separate operations for performance.
if (inst.SUBOP5 == 30) // nmsub
{
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),

View File

@ -169,6 +169,7 @@
<ClInclude Include="Core\Config\GraphicsSettings.h" />
<ClInclude Include="Core\Config\MainSettings.h" />
<ClInclude Include="Core\Config\NetplaySettings.h" />
<ClInclude Include="Core\Config\SessionSettings.h" />
<ClInclude Include="Core\Config\SYSCONFSettings.h" />
<ClInclude Include="Core\Config\UISettings.h" />
<ClInclude Include="Core\ConfigLoaders\BaseConfigLoader.h" />
@ -742,6 +743,7 @@
<ClCompile Include="Core\Config\GraphicsSettings.cpp" />
<ClCompile Include="Core\Config\MainSettings.cpp" />
<ClCompile Include="Core\Config\NetplaySettings.cpp" />
<ClCompile Include="Core\Config\SessionSettings.cpp" />
<ClCompile Include="Core\Config\SYSCONFSettings.cpp" />
<ClCompile Include="Core\Config\UISettings.cpp" />
<ClCompile Include="Core\ConfigLoaders\BaseConfigLoader.cpp" />