SPU: Use REP MOVSB in do_dma_transfer

- Try to use REP MOVSB when the size of the transfer is above a certain threshold
- The threshold is determined by the ERMS and FSRM CPUID flags (see the detection sketch after this list)
- The threshold values are (roughly) taken from GLIBC
- A threshold of 0xFFFFFFFF indicates that the CPU has neither flag, so the REP MOVSB path is effectively never taken
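
For context, a minimal standalone sketch (not part of this commit) of how the ERMS/FSRM bits and the threshold selection could be probed with GCC/Clang's <cpuid.h>. The helper name rep_movsb_threshold is hypothetical; the 2047/4095/0xFFFFFFFF values simply mirror what utils::get_rep_movsb_threshold() returns in the diff below, and the bit masks match has_erms()/has_fsrm() (CPUID leaf 7: ERMS is EBX bit 9, FSRM is EDX bit 4).

// Sketch (assumed helper, not from the commit): query ERMS/FSRM and pick a rep movsb threshold
#include <cpuid.h>   // GCC/Clang CPUID helpers
#include <cstdint>
#include <cstdio>

static std::uint32_t rep_movsb_threshold()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;

    // CPUID leaf 7, subleaf 0: ERMS is EBX bit 9 (0x200), FSRM is EDX bit 4 (0x10)
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0xFFFFFFFF; // leaf 7 unsupported: treat rep movsb as slow

    if (edx & 0x10)  // FSRM: fast short rep movsb, worth using from ~2 KiB
        return 2047;
    if (ebx & 0x200) // ERMS: enhanced rep movsb, worth using from ~4 KiB
        return 4095;

    return 0xFFFFFFFF;   // neither flag: always take the SIMD copy path
}

int main()
{
    std::printf("rep movsb threshold: 0x%08x\n", static_cast<unsigned>(rep_movsb_threshold()));
    return 0;
}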
Authored by Malcolm Jestadt on 2021-10-29 23:55:47 -04:00; committed by Ivan
parent 1c014299eb
commit 31a5a77ae5
3 changed files with 138 additions and 60 deletions

View File

@@ -90,6 +90,22 @@ static const bool s_tsx_avx = utils::has_avx();
 // For special case
 static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
 
+// Threshold for when rep movsb is expected to outperform simd copies
+// The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad
+static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold();
+
+#ifndef _MSC_VER
+static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size)
+{
+    __asm__ __volatile__
+    (
+        "rep; movsb" :
+        [Dst] "=D" (Dst), [Src] "=S" (Src), [Size] "=c" (Size) :
+        "[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size)
+    );
+}
+#endif
+
 static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
 {
 #if defined(_MSC_VER) || defined(__AVX__)
@@ -2234,32 +2250,41 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
     // Split locking + transfer in two parts (before 64K border, and after it)
     vm::range_lock(range_lock, range_addr, size0);
 
-    // Avoid unaligned stores in mov_rdata_avx
-    if (reinterpret_cast<u64>(dst) & 0x10)
+    if (size > s_rep_movsb_threshold)
     {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size0 -= 16;
+        __movsb(dst, src, size0);
+        dst += size0;
+        src += size0;
     }
-
-    while (size0 >= 128)
+    else
     {
-        mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-        dst += 128;
-        src += 128;
-        size0 -= 128;
-    }
-
-    while (size0)
-    {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size0 -= 16;
+        // Avoid unaligned stores in mov_rdata_avx
+        if (reinterpret_cast<u64>(dst) & 0x10)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size0 -= 16;
+        }
+
+        while (size0 >= 128)
+        {
+            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+            dst += 128;
+            src += 128;
+            size0 -= 128;
+        }
+
+        while (size0)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size0 -= 16;
+        }
     }
 
     range_lock->release(0);
@@ -2268,32 +2293,39 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
     vm::range_lock(range_lock, range_addr, range_end - range_addr);
 
-    // Avoid unaligned stores in mov_rdata_avx
-    if (reinterpret_cast<u64>(dst) & 0x10)
+    if (size > s_rep_movsb_threshold)
     {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size -= 16;
+        __movsb(dst, src, size);
     }
-
-    while (size >= 128)
+    else
     {
-        mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-        dst += 128;
-        src += 128;
-        size -= 128;
-    }
-
-    while (size)
-    {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size -= 16;
+        // Avoid unaligned stores in mov_rdata_avx
+        if (reinterpret_cast<u64>(dst) & 0x10)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size -= 16;
+        }
+
+        while (size >= 128)
+        {
+            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+            dst += 128;
+            src += 128;
+            size -= 128;
+        }
+
+        while (size)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size -= 16;
+        }
     }
 
     range_lock->release(0);
@@ -2338,32 +2370,39 @@ plain_access:
     }
     default:
     {
-        // Avoid unaligned stores in mov_rdata_avx
-        if (reinterpret_cast<u64>(dst) & 0x10)
+        if (size > s_rep_movsb_threshold)
         {
-            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-            dst += 16;
-            src += 16;
-            size -= 16;
+            __movsb(dst, src, size);
         }
-
-        while (size >= 128)
+        else
         {
-            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-            dst += 128;
-            src += 128;
-            size -= 128;
-        }
-
-        while (size)
-        {
-            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-            dst += 16;
-            src += 16;
-            size -= 16;
+            // Avoid unaligned stores in mov_rdata_avx
+            if (reinterpret_cast<u64>(dst) & 0x10)
+            {
+                *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+
+            while (size >= 128)
+            {
+                mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+                dst += 128;
+                src += 128;
+                size -= 128;
+            }
+
+            while (size)
+            {
+                *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
         }
 
         break;

View File

@@ -138,6 +138,38 @@ bool utils::has_fma4()
     return g_value;
 }
 
+bool utils::has_erms()
+{
+    static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x200) == 0x200;
+    return g_value;
+}
+
+bool utils::has_fsrm()
+{
+    static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x10) == 0x10;
+    return g_value;
+}
+
+u32 utils::get_rep_movsb_threshold()
+{
+    static const u32 g_value = []()
+    {
+        u32 thresh_value = 0xFFFFFFFF;
+
+        if (has_fsrm())
+        {
+            thresh_value = 2047;
+        }
+        else if (has_erms())
+        {
+            thresh_value = 4095;
+        }
+
+        return thresh_value;
+    }();
+    return g_value;
+}
+
 std::string utils::get_cpu_brand()
 {
     std::string brand;

View File

@@ -39,6 +39,10 @@ namespace utils
     bool has_fma4();
 
+    bool has_erms();
+
+    bool has_fsrm();
+
     std::string get_cpu_brand();
     std::string get_system_info();
@@ -57,5 +61,8 @@ namespace utils
     u32 get_cpu_model();
 
+    // A threshold of 0xFFFFFFFF means that the rep movsb is expected to be slow on this platform
+    u32 get_rep_movsb_threshold();
+
     extern const u64 main_tid;
 }