From 73d23eb6e6b154220ec009b22ca52742e4b925ad Mon Sep 17 00:00:00 2001 From: Eladash Date: Thu, 3 Sep 2020 00:58:29 +0300 Subject: [PATCH] SPU: Implement Accurate DMA (#8822) --- rpcs3/Emu/Cell/PPUInterpreter.cpp | 11 ++- rpcs3/Emu/Cell/PPUThread.cpp | 14 +++- rpcs3/Emu/Cell/SPURecompiler.cpp | 5 ++ rpcs3/Emu/Cell/SPUThread.cpp | 117 +++++++++++++++++++++++------- rpcs3/Emu/Memory/vm.cpp | 16 ++-- rpcs3/Emu/Memory/vm_reservation.h | 49 ++++++++----- rpcs3/Emu/RSX/rsx_methods.cpp | 4 +- rpcs3/Emu/system_config.h | 1 + rpcs3/rpcs3qt/emu_settings_type.h | 2 + rpcs3/rpcs3qt/settings_dialog.cpp | 3 + rpcs3/rpcs3qt/settings_dialog.ui | 7 ++ rpcs3/rpcs3qt/tooltips.h | 1 + 12 files changed, 170 insertions(+), 60 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 8cbf05f988..22771bc5ad 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -4433,8 +4433,17 @@ bool ppu_interpreter::ICBI(ppu_thread& ppu, ppu_opcode_t op) bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op) { const u64 addr = op.ra ? 
ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; + const u32 addr0 = vm::cast(addr, HERE) & ~127; - std::memset(vm::base(vm::cast(addr, HERE) & ~127), 0, 128); + if (g_cfg.core.spu_accurate_dma) + { + auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb); + std::memset(vm::base(addr0), 0, 128); + res.release(rtime + 128); + return true; + } + + std::memset(vm::base(addr0), 0, 128); return true; } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 24bbf6dfca..2c5fe9c7e9 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1097,10 +1097,16 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) } }()) { - ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128; + ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb); + + if (ppu.rtime & 127) + { + continue; + } + ppu.rdata = data; - if ((vm::reservation_acquire(addr, sizeof(T)) & -128) == ppu.rtime) [[likely]] + if ((vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb)) == ppu.rtime) [[likely]] { if (count >= 10) [[unlikely]] { @@ -1176,7 +1182,7 @@ const auto ppu_stwcx_tx = build_function_asm(trunc(val).eval(m_ir))) { + if (g_cfg.core.spu_accurate_dma) + { + break; + } + if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); !g_use_rtm) { // TODO: don't require TSX (current implementation is TSX-only) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 64e164e417..6ea2b15f73 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -718,7 +718,7 @@ const auto spu_putlluc_tx = build_function_asm(vm::putlluc_lockb)); c.jc(fail2); build_transaction_enter(c, fall2, x86::r12, 666); @@ -1345,13 +1345,20 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) src = zero_buf; } - if (!g_use_rtm && (!is_get || g_cfg.core.spu_accurate_putlluc)) [[unlikely]] + if ((!g_use_rtm && (!is_get || 
g_cfg.core.spu_accurate_putlluc)) || g_cfg.core.spu_accurate_dma) [[unlikely]] { - if (const u32 size = args.size; ((eal & 127) + size) <= 128 && is_get) + for (u32 size = args.size, size0; is_get; + size -= size0, dst += size0, src += size0) { + size0 = std::min(128 - (eal & 127), std::min(size, 128)); + for (u64 i = 0;; [&]() { - if (++i < 25) [[likely]] + if (state) + { + check_state(); + } + else if (++i < 25) [[likely]] { busy_wait(300); } @@ -1361,14 +1368,15 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) } }()) { - const u64 time0 = vm::reservation_acquire(eal, size); + const u64 time0 = vm::reservation_acquire(eal, size0); - if (time0 & 1) + // Ignore DMA lock bits + if (time0 & (127 & ~vm::dma_lockb)) { continue; } - switch (size) + switch (size0) { case 1: { @@ -1390,11 +1398,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) *reinterpret_cast(dst) = *reinterpret_cast(src); break; } + case 128: + { + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); + break; + } default: { auto _dst = dst; auto _src = src; - auto _size = size; + auto _size = size0; while (_size) { @@ -1409,11 +1422,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) } } - if (time0 != vm::reservation_acquire(eal, size)) + if (time0 != vm::reservation_acquire(eal, size0)) { continue; } + break; + } + + if (size == size0) + { return; } } @@ -1422,38 +1440,85 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) { case 1: { - auto& res = vm::reservation_lock(eal, 1); + auto [res, time0] = vm::reservation_lock(eal, 1, vm::dma_lockb); *reinterpret_cast(dst) = *reinterpret_cast(src); - res.release(res.load() - 1); + res.release(time0 + 128); break; } case 2: { - auto& res = vm::reservation_lock(eal, 2); + auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb); *reinterpret_cast(dst) = *reinterpret_cast(src); - res.release(res.load() - 1); + res.release(time0 + 128); break; } case 4: { - auto& res = vm::reservation_lock(eal, 4); + 
auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb); *reinterpret_cast(dst) = *reinterpret_cast(src); - res.release(res.load() - 1); + res.release(time0 + 128); break; } case 8: { - auto& res = vm::reservation_lock(eal, 8); + auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb); *reinterpret_cast(dst) = *reinterpret_cast(src); - res.release(res.load() - 1); + res.release(time0 + 128); break; } default: { + if (g_cfg.core.spu_accurate_dma) + { + for (u32 size0;; + size -= size0, dst += size0, src += size0) + { + size0 = std::min(128 - (eal & 127), std::min(size, 128)); + + // Lock each cache line exclusively + auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb); + + switch (size0) + { + case 128: + { + mov_rdata(*reinterpret_cast(dst), *reinterpret_cast(src)); + break; + } + default: + { + auto _dst = dst; + auto _src = src; + auto _size = size0; + + while (_size) + { + *reinterpret_cast(_dst) = *reinterpret_cast(_src); + + _dst += 16; + _src += 16; + _size -= 16; + } + + break; + } + } + + res.release(time0 + 128); + + if (size == size0) + { + break; + } + } + + break; + } + if (((eal & 127) + size) <= 128) { // Lock one cache line - auto& res = vm::reservation_lock(eal, 128); + auto [res, time0] = vm::reservation_lock(eal, 128); while (size) { @@ -1464,7 +1529,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) size -= 16; } - res.release(res.load() - 1); + res.release(time0); break; } @@ -1786,7 +1851,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) if (vm::reservation_acquire(addr, 128) & 64) { // Wait for PUTLLC to complete - while (vm::reservation_acquire(addr, 128) & 1) + while (vm::reservation_acquire(addr, 128) & 63) { busy_wait(100); } @@ -1799,7 +1864,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) { cpu_thread::suspend_all cpu_lock(this); - while (vm::reservation_acquire(addr, 128).bts(6)) + while (vm::reservation_acquire(addr, 128).bts(std::countr_zero(vm::putlluc_lockb))) {
busy_wait(100); } @@ -1819,7 +1884,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) else { auto& data = vm::_ref(addr); - auto& res = vm::reservation_lock(addr, 128); + auto [res, time0] = vm::reservation_lock(addr, 128); *reinterpret_cast*>(&data) += 0; @@ -1835,7 +1900,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) // TODO: vm::check_addr vm::writer_lock lock(addr); mov_rdata(super_data, to_write); - res.release(res.load() + 127); + res.release(time0 + 128); } if (render) render->unpause(); @@ -1843,7 +1908,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) else { mov_rdata(data, to_write); - res.release(res.load() + 127); + res.release(time0 + 128); } } @@ -2072,7 +2137,7 @@ bool spu_thread::process_mfc_cmd() if (raddr && raddr != addr) { // Last check for event before we replace the reservation with a new one - if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref(raddr))) + if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(rdata, vm::_ref(raddr))) { ch_event_stat |= SPU_EVENT_LR; } @@ -2197,7 +2262,7 @@ bool spu_thread::process_mfc_cmd() if (raddr) { // Last check for event before we clear the reservation - if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref(raddr))) + if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref(raddr))) { ch_event_stat |= SPU_EVENT_LR; } diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 477d3e90c0..c4484e94a7 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -440,16 +440,20 @@ namespace vm g_mutex.unlock(); } - bool reservation_lock_internal(u32 addr, atomic_t& res) + u64 reservation_lock_internal(u32 addr, atomic_t& res, u64 lock_bits) { for (u64 i = 0;; i++) { - if (!res.bts(0)) [[likely]] + if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) 
[[likely]] { - break; + return rtime; } - if (i < 15) + if (auto cpu = get_current_cpu_thread(); cpu && cpu->state) + { + cpu->check_state(); + } + else if (i < 15) { busy_wait(500); } @@ -458,14 +462,12 @@ namespace vm // TODO: Accurate locking in this case if (!(g_pages[addr / 4096].flags & page_writable)) { - return false; + return -1; } std::this_thread::yield(); } } - - return true; } static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm) diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index 95011bdba3..6320c91e7d 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -6,6 +6,13 @@ namespace vm { + enum reservation_lock_bit : u64 + { + stcx_lockb = 1 << 0, // Exclusive conditional reservation lock + dma_lockb = 1 << 1, // Inexclusive unconditional reservation lock + putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock + }; + // Get reservation status for further atomic update: last update timestamp inline atomic_t& reservation_acquire(u32 addr, u32 size) { @@ -31,28 +38,11 @@ namespace vm return *reinterpret_cast*>(g_reservations + (addr & 0xff80) / 2); } - bool reservation_lock_internal(u32, atomic_t&); + u64 reservation_lock_internal(u32, atomic_t&, u64); - inline atomic_t& reservation_lock(u32 addr, u32 size) + inline bool reservation_trylock(atomic_t& res, u64 rtime, u64 lock_bits = stcx_lockb) { - auto res = &vm::reservation_acquire(addr, size); - - if (res->bts(0)) [[unlikely]] - { - static atomic_t no_lock{}; - - if (!reservation_lock_internal(addr, *res)) - { - res = &no_lock; - } - } - - return *res; - } - - inline bool reservation_trylock(atomic_t& res, u64 rtime) - { - if (res.compare_and_swap_test(rtime, rtime | 1)) [[likely]] + if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]] { return true; } @@ -60,4 +50,23 @@ namespace vm return false; } + inline std::pair&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb) + 
{ + auto res = &vm::reservation_acquire(addr, size); + auto rtime = res->load(); + + if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]] + { + static atomic_t no_lock{}; + + rtime = reservation_lock_internal(addr, *res, lock_bits); + + if (rtime == umax) + { + res = &no_lock; + } + } + + return {*res, rtime}; + } } // namespace vm diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 108579795c..cc0d470280 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -151,14 +151,14 @@ namespace rsx // TODO: Check if possible to write on reservations if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]] { - res = &vm::reservation_lock(addr, 4); + res = &vm::reservation_lock(addr, 4).first; } vm::_ref(addr).val = arg; if (res) { - res->release(*res & -128); + res->release(*res + 127); } vm::reservation_notifier(addr, 4).notify_all(); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 652b9e03d2..89bbca32bb 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -44,6 +44,7 @@ struct cfg_root : cfg::node cfg::_enum spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe }; cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false }; cfg::_bool spu_accurate_putlluc{ this, "Accurate PUTLLUC", false }; + cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false }; cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true}; cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled cfg::_bool spu_cache{ this, "SPU Cache", true }; diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index de99239ae7..1a1dd12b9e 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -22,6 +22,7 @@ enum class emu_settings_type EnableTSX, AccurateGETLLAR, AccuratePUTLLUC, + AccurateSpuDMA, AccurateLLVMdfma, AccurateVectorNaN, 
AccurateRSXAccess, @@ -162,6 +163,7 @@ static const QMap settings_location = { emu_settings_type::EnableTSX, { "Core", "Enable TSX"}}, { emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}}, { emu_settings_type::AccuratePUTLLUC, { "Core", "Accurate PUTLLUC"}}, + { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, { emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}}, { emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}}, { emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index d6488dbec8..3d223922a0 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -1726,6 +1726,9 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->accuratePUTLLUC, emu_settings_type::AccuratePUTLLUC); SubscribeTooltip(ui->accuratePUTLLUC, tooltips.settings.accurate_putlluc); + m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA); + SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma); + m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess); SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 89ff738d3a..3e7197ff1f 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -3417,6 +3417,13 @@ + + + + Accurate SPU DMA + + + diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index b8cdc620b5..1a3a67a1e3 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -78,6 +78,7 @@ public: const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to 
developers.\nNever use this."); const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation."); const QString accurate_putlluc = tr("Accurately processes SPU MFC_PUTLLUC operation."); + const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations."); const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA."); const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)"); const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");