SPU: Implement Accurate DMA (#8822)

Eladash 2020-09-03 00:58:29 +03:00 committed by GitHub
parent ddfa077c3e
commit 73d23eb6e6
12 changed files with 170 additions and 60 deletions


@@ -4433,8 +4433,17 @@ bool ppu_interpreter::ICBI(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op)
{
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
const u32 addr0 = vm::cast(addr, HERE) & ~127;
std::memset(vm::base(vm::cast(addr, HERE) & ~127), 0, 128);
if (g_cfg.core.spu_accurate_dma)
{
auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb);
std::memset(vm::base(addr0), 0, 128);
res.release(rtime + 128);
return true;
}
std::memset(vm::base(addr0), 0, 128);
return true;
}
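The DCBZ hunk above is the simplest instance of the convention the rest of this commit builds on: reservation timestamps move in steps of 128, so the low seven bits of each 128-byte line's reservation word are free to act as lock flags, and releasing with rtime + 128 publishes a new timestamp and clears the lock in one store. A minimal sketch of that arithmetic, assuming constants that mirror the reservation_lock_bit enum added later in this commit (a plain std::atomic stands in for the emulator's atomic_t):

#include <atomic>
#include <cassert>
#include <cstdint>

constexpr std::uint64_t stcx_lockb    = 1ull << 0; // exclusive conditional lock (PUTLLC / stwcx.)
constexpr std::uint64_t dma_lockb     = 1ull << 1; // non-exclusive unconditional lock (DMA)
constexpr std::uint64_t putlluc_lockb = 1ull << 6; // exclusive unconditional lock (PUTLLUC)

int main()
{
    std::atomic<std::uint64_t> res{0};   // one 128-byte line, timestamp 0

    const std::uint64_t rtime = res.load();
    assert((rtime & 127) == 0);          // unlocked: no lock bits set

    res.store(rtime + dma_lockb);        // what reservation_lock(..., dma_lockb) does
    assert((res.load() & 127) != 0);     // observers now see the line as busy

    res.store(rtime + 128);              // release: next timestamp, all lock bits clear
    assert((res.load() & 127) == 0 && res.load() == rtime + 128);
}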


@@ -1097,10 +1097,16 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}())
{
ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128;
ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb);
if (ppu.rtime & 127)
{
continue;
}
ppu.rdata = data;
if ((vm::reservation_acquire(addr, sizeof(T)) & -128) == ppu.rtime) [[likely]]
if ((vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb)) == ppu.rtime) [[likely]]
{
if (count >= 10) [[unlikely]]
{
@@ -1176,7 +1182,7 @@ const auto ppu_stwcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
// Begin transaction
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.and_(x86::rax, -128 | vm::dma_lockb);
c.cmp(x86::rax, args[1]);
c.jne(fail);
c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
@@ -1222,7 +1228,7 @@ const auto ppu_stdcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
// Begin transaction
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.and_(x86::rax, -128 | vm::dma_lockb);
c.cmp(x86::rax, args[1]);
c.jne(fail);
c.cmp(x86::qword_ptr(x86::r11), args[2]);
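Folding vm::dma_lockb into the comparison mask is what makes the PPU reservation paths DMA-aware: a line whose reservation word currently carries the DMA lock bit no longer compares equal to the saved rtime, so the lwarx/stwcx./stdcx. paths retry or fail instead of racing an in-flight SPU transfer. A small self-contained check of that mask arithmetic (reservation_still_valid is a hypothetical helper written for illustration, not an RPCS3 function):

#include <cassert>
#include <cstdint>

constexpr std::uint64_t dma_lockb = 1ull << 1;

bool reservation_still_valid(std::uint64_t current, std::uint64_t saved_rtime)
{
    // Same mask as the interpreter and ASM paths above: -128 clears the low
    // seven bits, and OR-ing dma_lockb back in keeps the DMA flag significant.
    return (current & (-128 | dma_lockb)) == saved_rtime;
}

int main()
{
    const std::uint64_t rtime = 2 * 128;                        // captured at lwarx time
    assert(reservation_still_valid(rtime, rtime));              // untouched line: still valid
    assert(!reservation_still_valid(rtime | dma_lockb, rtime)); // DMA in flight: fail and retry
    assert(!reservation_still_valid(rtime + 128, rtime));       // line rewritten: reservation lost
}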


@@ -5788,6 +5788,11 @@ public:
if (auto ci = llvm::dyn_cast<llvm::ConstantInt>(trunc<u8>(val).eval(m_ir)))
{
if (g_cfg.core.spu_accurate_dma)
{
break;
}
if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); !g_use_rtm)
{
// TODO: don't require TSX (current implementation is TSX-only)


@@ -718,7 +718,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
c.bind(next);
// Try to acquire "PUTLLUC lock"
c.lock().bts(x86::qword_ptr(x86::rbx), 6);
c.lock().bts(x86::qword_ptr(x86::rbx), std::countr_zero<u32>(vm::putlluc_lockb));
c.jc(fail2);
build_transaction_enter(c, fall2, x86::r12, 666);
@@ -1345,13 +1345,20 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
src = zero_buf;
}
if (!g_use_rtm && (!is_get || g_cfg.core.spu_accurate_putlluc)) [[unlikely]]
if ((!g_use_rtm && (!is_get || g_cfg.core.spu_accurate_putlluc)) || g_cfg.core.spu_accurate_dma) [[unlikely]]
{
if (const u32 size = args.size; ((eal & 127) + size) <= 128 && is_get)
for (u32 size = args.size, size0; is_get;
size -= size0, dst += size0, src += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
for (u64 i = 0;; [&]()
{
if (++i < 25) [[likely]]
if (state)
{
check_state();
}
else if (++i < 25) [[likely]]
{
busy_wait(300);
}
@@ -1361,14 +1368,15 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}())
{
const u64 time0 = vm::reservation_acquire(eal, size);
const u64 time0 = vm::reservation_acquire(eal, size0);
if (time0 & 1)
// Ignore DMA lock bits
if (time0 & (127 & ~vm::dma_lockb))
{
continue;
}
switch (size)
switch (size0)
{
case 1:
{
@@ -1390,11 +1398,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size;
auto _size = size0;
while (_size)
{
@@ -1409,11 +1422,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
if (time0 != vm::reservation_acquire(eal, size))
if (time0 != vm::reservation_acquire(eal, size0))
{
continue;
}
break;
}
if (size == size0)
{
return;
}
}
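The accurate GET path above never locks the line itself; it is essentially a seqlock-style read: snapshot the timestamp (tolerating only the non-exclusive DMA bit), copy at most one 128-byte line, then re-read the timestamp and retry if it changed, since every writer bumps the value by 128 on release. A generic sketch of that loop, assuming a bare std::atomic in place of the emulator's reservation array and omitting the real code's chunking, check_state() and busy_wait() backoff:

#include <atomic>
#include <cstdint>
#include <cstring>

constexpr std::uint64_t dma_lockb = 1ull << 1;

void read_line(const std::atomic<std::uint64_t>& res, void* dst, const void* src, std::size_t size)
{
    for (;;)
    {
        const std::uint64_t time0 = res.load();
        if (time0 & (127 & ~dma_lockb))
            continue;                    // an exclusive lock (PUTLLC/PUTLLUC) is held: wait

        std::memcpy(dst, src, size);     // optimistic copy

        if (res.load() == time0)
            return;                      // timestamp unchanged: the copy is consistent
        // otherwise a writer committed in between; retry
    }
}

int main()
{
    std::atomic<std::uint64_t> res{128};
    unsigned char src[16] = {1, 2, 3, 4};
    unsigned char dst[16] = {};
    read_line(res, dst, src, sizeof(src));
    return dst[0] == 1 ? 0 : 1;
}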
@@ -1422,38 +1440,85 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
case 1:
{
auto& res = vm::reservation_lock(eal, 1);
auto [res, time0] = vm::reservation_lock(eal, 1, vm::dma_lockb);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 2:
{
auto& res = vm::reservation_lock(eal, 2);
auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 4:
{
auto& res = vm::reservation_lock(eal, 4);
auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 8:
{
auto& res = vm::reservation_lock(eal, 8);
auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
default:
{
if (g_cfg.core.spu_accurate_dma)
{
for (u32 size0;;
size -= size0, dst += size0, src += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
switch (size0)
{
case 128:
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res.release(time0 + 128);
if (size == size0)
{
break;
}
}
break;
}
if (((eal & 127) + size) <= 128)
{
// Lock one cache line
auto& res = vm::reservation_lock(eal, 128);
auto [res, time0] = vm::reservation_lock(eal, 128);
while (size)
{
@@ -1464,7 +1529,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
res.release(res.load() - 1);
res.release(time0);
break;
}
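Both accurate paths split a transfer with the same computation, size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128)), so no chunk ever crosses a reservation-line boundary and each chunk is covered by a single line lock or timestamp check. A worked example of that chunking with a made-up unaligned transfer (the address and length are purely illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    std::uint32_t eal  = 0x10070; // effective address, not 128-byte aligned
    std::uint32_t size = 304;     // transfer length in bytes

    while (size)
    {
        const std::uint32_t size0 =
            std::min<std::uint32_t>(128 - (eal & 127), std::min<std::uint32_t>(size, 128u));
        std::printf("line 0x%05x: copy %3u bytes\n", eal & ~127u, size0);
        eal  += size0;
        size -= size0;
    }
    // Prints chunks of 16, 128, 128 and 32 bytes: a head piece up to the next
    // 128-byte boundary, whole lines in the middle, and a tail piece.
}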
@@ -1786,7 +1851,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
if (vm::reservation_acquire(addr, 128) & 64)
{
// Wait for PUTLLC to complete
while (vm::reservation_acquire(addr, 128) & 1)
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
@@ -1799,7 +1864,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
cpu_thread::suspend_all cpu_lock(this);
while (vm::reservation_acquire(addr, 128).bts(6))
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
{
busy_wait(100);
}
@@ -1819,7 +1884,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
else
{
auto& data = vm::_ref<decltype(rdata)>(addr);
auto& res = vm::reservation_lock(addr, 128);
auto [res, time0] = vm::reservation_lock(addr, 128);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
@@ -1835,7 +1900,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, to_write);
res.release(res.load() + 127);
res.release(time0 + 128);
}
if (render) render->unpause();
@@ -1843,7 +1908,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
else
{
mov_rdata(data, to_write);
res.release(res.load() + 127);
res.release(time0 + 128);
}
}
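In the do_putlluc hunks above, the hard-coded bit indices are replaced by the named constants: bts(..., std::countr_zero(vm::putlluc_lockb)) still sets bit 6, and waiting on the low six bits (& 63) now also covers a held DMA lock, but not the PUTLLUC bit the waiter is about to set itself. A small check of those bit relationships (constants restated locally so the snippet stands alone):

#include <bit>
#include <cassert>
#include <cstdint>

constexpr std::uint32_t stcx_lockb    = 1 << 0;
constexpr std::uint32_t dma_lockb     = 1 << 1;
constexpr std::uint32_t putlluc_lockb = 1 << 6;

int main()
{
    // bts(..., countr_zero(putlluc_lockb)) is the same bit-test-and-set as the
    // old hard-coded bts(..., 6); the change only names the bit.
    assert(std::countr_zero(putlluc_lockb) == 6);

    // Waiting on "& 63" covers the conditional and DMA locks (bits 0..5) but
    // not the PUTLLUC bit itself, which the waiter is about to set.
    assert((63 & putlluc_lockb) == 0);
    assert((63 & (stcx_lockb | dma_lockb)) != 0);
}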
@@ -2072,7 +2137,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr && raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}
@@ -2197,7 +2262,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}


@@ -440,16 +440,20 @@ namespace vm
g_mutex.unlock();
}
bool reservation_lock_internal(u32 addr, atomic_t<u64>& res)
u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res, u64 lock_bits)
{
for (u64 i = 0;; i++)
{
if (!res.bts(0)) [[likely]]
if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) [[likely]]
{
break;
return rtime;
}
if (i < 15)
if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
{
cpu->check_state();
}
else if (i < 15)
{
busy_wait(500);
}
@@ -458,14 +462,12 @@ namespace vm
// TODO: Accurate locking in this case
if (!(g_pages[addr / 4096].flags & page_writable))
{
return false;
return -1;
}
std::this_thread::yield();
}
}
return true;
}
static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm)


@@ -6,6 +6,13 @@
namespace vm
{
enum reservation_lock_bit : u64
{
stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
dma_lockb = 1 << 1, // Inexclusive unconditional reservation lock
putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
};
// Get reservation status for further atomic update: last update timestamp
inline atomic_t<u64>& reservation_acquire(u32 addr, u32 size)
{
@@ -31,28 +38,11 @@ namespace vm
return *reinterpret_cast<atomic_t<u64>*>(g_reservations + (addr & 0xff80) / 2);
}
bool reservation_lock_internal(u32, atomic_t<u64>&);
u64 reservation_lock_internal(u32, atomic_t<u64>&, u64);
inline atomic_t<u64>& reservation_lock(u32 addr, u32 size)
inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime, u64 lock_bits = stcx_lockb)
{
auto res = &vm::reservation_acquire(addr, size);
if (res->bts(0)) [[unlikely]]
{
static atomic_t<u64> no_lock{};
if (!reservation_lock_internal(addr, *res))
{
res = &no_lock;
}
}
return *res;
}
inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime)
{
if (res.compare_and_swap_test(rtime, rtime | 1)) [[likely]]
if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]]
{
return true;
}
@@ -60,4 +50,23 @@ namespace vm
return false;
}
inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb)
{
auto res = &vm::reservation_acquire(addr, size);
auto rtime = res->load();
if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]]
{
static atomic_t<u64> no_lock{};
rtime = reservation_lock_internal(addr, *res, lock_bits);
if (rtime == umax)
{
res = &no_lock;
}
}
return {*res, rtime};
}
} // namespace vm
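Compared with the old header, reservation_lock now reports the timestamp it locked at, so callers can release with time0 + 128, and it takes the lock bit to set, while reservation_trylock succeeds only if the word still holds the unlocked timestamp the caller read. A self-contained model of that API shape, using std::atomic instead of the emulator's atomic_t and omitting the slow path's page check, check_state() and backoff:

#include <atomic>
#include <cstdint>
#include <utility>

enum : std::uint64_t
{
    stcx_lockb = 1 << 0,
    dma_lockb  = 1 << 1,
};

inline bool reservation_trylock(std::atomic<std::uint64_t>& res, std::uint64_t rtime,
                                std::uint64_t lock_bits = stcx_lockb)
{
    // Succeeds only if the word still holds the unlocked timestamp we read.
    return res.compare_exchange_strong(rtime, rtime + lock_bits);
}

inline std::pair<std::atomic<std::uint64_t>&, std::uint64_t>
reservation_lock(std::atomic<std::uint64_t>& res, std::uint64_t lock_bits = stcx_lockb)
{
    for (;;)
    {
        const std::uint64_t rtime = res.load();
        if (!(rtime & 127) && reservation_trylock(res, rtime, lock_bits))
            return {res, rtime}; // caller later releases with rtime + 128
    }
}

int main()
{
    std::atomic<std::uint64_t> line{0};
    auto [res, rtime] = reservation_lock(line, dma_lockb);
    res.store(rtime + 128); // release, as the DMA paths in this commit do
    return res.load() == 128 ? 0 : 1;
}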


@@ -151,14 +151,14 @@ namespace rsx
// TODO: Check if possible to write on reservations
if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]]
{
res = &vm::reservation_lock(addr, 4);
res = &vm::reservation_lock(addr, 4).first;
}
vm::_ref<RsxSemaphore>(addr).val = arg;
if (res)
{
res->release(*res & -128);
res->release(*res + 127);
}
vm::reservation_notifier(addr, 4).notify_all();
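The behavioural change in this hunk is the release value: the old *res & -128 restored the pre-lock timestamp, while *res + 127, starting from the value reservation_lock(addr, 4) left with bit 0 set, lands on the next multiple of 128, so the semaphore write now reads as a modification to anything holding a reservation on that line. A quick arithmetic check with illustrative values:

#include <cassert>
#include <cstdint>

int main()
{
    const std::uint64_t rtime  = 7 * 128;   // timestamp before the lock
    const std::uint64_t locked = rtime + 1; // reservation_lock(addr, 4) added stcx_lockb

    assert((locked & std::uint64_t(-128)) == rtime); // old release: timestamp unchanged
    assert(locked + 127 == rtime + 128);             // new release: timestamp advanced
}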


@@ -44,6 +44,7 @@ struct cfg_root : cfg::node
cfg::_enum<spu_block_size_type> spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe };
cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false };
cfg::_bool spu_accurate_putlluc{ this, "Accurate PUTLLUC", false };
cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false };
cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true};
cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled
cfg::_bool spu_cache{ this, "SPU Cache", true };


@@ -22,6 +22,7 @@ enum class emu_settings_type
EnableTSX,
AccurateGETLLAR,
AccuratePUTLLUC,
AccurateSpuDMA,
AccurateLLVMdfma,
AccurateVectorNaN,
AccurateRSXAccess,
@@ -162,6 +163,7 @@ static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
{ emu_settings_type::AccuratePUTLLUC, { "Core", "Accurate PUTLLUC"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
{ emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}},
{ emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}},
{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},


@@ -1726,6 +1726,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->accuratePUTLLUC, emu_settings_type::AccuratePUTLLUC);
SubscribeTooltip(ui->accuratePUTLLUC, tooltips.settings.accurate_putlluc);
m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA);
SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma);
m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess);
SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access);


@@ -3417,6 +3417,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="accurateSpuDMA">
<property name="text">
<string>Accurate SPU DMA</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="hookStFunc">
<property name="text">


@@ -78,6 +78,7 @@ public:
const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to developers.\nNever use this.");
const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation.");
const QString accurate_putlluc = tr("Accurately processes SPU MFC_PUTLLUC operation.");
const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations.");
const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA.");
const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)");
const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");