SPU: Use REP MOVSB in do_dma_transfer

- Try to use REP MOVSB when the size of the transfer is above a certain threshold
- The threshold is determined by the ERMS and FSRM CPUID flags (see the detection sketch after this list)
- The threshold values are (roughly) taken from GLIBC
- A threshold of 0xFFFFFFFF indicates that the CPU has neither flag, so the REP MOVSB path is effectively never taken
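
For context, a minimal standalone sketch (not part of this commit) of how the ERMS/FSRM bits and the threshold selection could be probed with GCC/Clang's <cpuid.h>. The helper name rep_movsb_threshold is hypothetical; the 2047/4095/0xFFFFFFFF values simply mirror what utils::get_rep_movsb_threshold() returns in the diff below, and the bit masks match has_erms()/has_fsrm() (CPUID leaf 7: ERMS is EBX bit 9, FSRM is EDX bit 4).

// Sketch (assumed helper, not from the commit): query ERMS/FSRM and pick a rep movsb threshold
#include <cpuid.h>   // GCC/Clang CPUID helpers
#include <cstdint>
#include <cstdio>

static std::uint32_t rep_movsb_threshold()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;

    // CPUID leaf 7, subleaf 0: ERMS is EBX bit 9 (0x200), FSRM is EDX bit 4 (0x10)
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0xFFFFFFFF; // leaf 7 unsupported: treat rep movsb as slow

    if (edx & 0x10)  // FSRM: fast short rep movsb, worth using from ~2 KiB
        return 2047;
    if (ebx & 0x200) // ERMS: enhanced rep movsb, worth using from ~4 KiB
        return 4095;

    return 0xFFFFFFFF;   // neither flag: always take the SIMD copy path
}

int main()
{
    std::printf("rep movsb threshold: 0x%08x\n", static_cast<unsigned>(rep_movsb_threshold()));
    return 0;
}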
Authored by Malcolm Jestadt on 2021-10-29 23:55:47 -04:00; committed by Ivan
parent 1c014299eb
commit 31a5a77ae5
3 changed files with 138 additions and 60 deletions

View File

@@ -90,6 +90,22 @@ static const bool s_tsx_avx = utils::has_avx();
 // For special case
 static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
 
+// Threshold for when rep movsb is expected to outperform simd copies
+// The threshold will be 0xFFFFFFFF when the performance of rep movsb is expected to be bad
+static const u32 s_rep_movsb_threshold = utils::get_rep_movsb_threshold();
+
+#ifndef _MSC_VER
+static FORCE_INLINE void __movsb(unsigned char * Dst, const unsigned char * Src, size_t Size)
+{
+    __asm__ __volatile__
+    (
+        "rep; movsb" :
+        [Dst] "=D" (Dst), [Src] "=S" (Src), [Size] "=c" (Size) :
+        "[Dst]" (Dst), "[Src]" (Src), "[Size]" (Size)
+    );
+}
+#endif
+
 static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
 {
 #if defined(_MSC_VER) || defined(__AVX__)
@@ -2234,32 +2250,41 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
     // Split locking + transfer in two parts (before 64K border, and after it)
     vm::range_lock(range_lock, range_addr, size0);
 
-    // Avoid unaligned stores in mov_rdata_avx
-    if (reinterpret_cast<u64>(dst) & 0x10)
+    if (size > s_rep_movsb_threshold)
     {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size0 -= 16;
+        __movsb(dst, src, size0);
+        dst += size0;
+        src += size0;
     }
-
-    while (size0 >= 128)
+    else
     {
-        mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-        dst += 128;
-        src += 128;
-        size0 -= 128;
-    }
-
-    while (size0)
-    {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size0 -= 16;
+        // Avoid unaligned stores in mov_rdata_avx
+        if (reinterpret_cast<u64>(dst) & 0x10)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size0 -= 16;
+        }
+
+        while (size0 >= 128)
+        {
+            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+            dst += 128;
+            src += 128;
+            size0 -= 128;
+        }
+
+        while (size0)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size0 -= 16;
+        }
     }
 
     range_lock->release(0);
@@ -2268,32 +2293,39 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
     vm::range_lock(range_lock, range_addr, range_end - range_addr);
 
-    // Avoid unaligned stores in mov_rdata_avx
-    if (reinterpret_cast<u64>(dst) & 0x10)
+    if (size > s_rep_movsb_threshold)
     {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size -= 16;
+        __movsb(dst, src, size);
     }
-
-    while (size >= 128)
+    else
     {
-        mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-        dst += 128;
-        src += 128;
-        size -= 128;
-    }
-
-    while (size)
-    {
-        *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-        dst += 16;
-        src += 16;
-        size -= 16;
+        // Avoid unaligned stores in mov_rdata_avx
+        if (reinterpret_cast<u64>(dst) & 0x10)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size -= 16;
+        }
+
+        while (size >= 128)
+        {
+            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+            dst += 128;
+            src += 128;
+            size -= 128;
+        }
+
+        while (size)
+        {
+            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+            dst += 16;
+            src += 16;
+            size -= 16;
+        }
     }
 
     range_lock->release(0);
@@ -2338,32 +2370,39 @@ plain_access:
     }
     default:
     {
-        // Avoid unaligned stores in mov_rdata_avx
-        if (reinterpret_cast<u64>(dst) & 0x10)
+        if (size > s_rep_movsb_threshold)
         {
-            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-            dst += 16;
-            src += 16;
-            size -= 16;
+            __movsb(dst, src, size);
         }
-
-        while (size >= 128)
+        else
         {
-            mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-            dst += 128;
-            src += 128;
-            size -= 128;
-        }
-
-        while (size)
-        {
-            *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
-            dst += 16;
-            src += 16;
-            size -= 16;
+            // Avoid unaligned stores in mov_rdata_avx
+            if (reinterpret_cast<u64>(dst) & 0x10)
+            {
+                *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+
+            while (size >= 128)
+            {
+                mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+                dst += 128;
+                src += 128;
+                size -= 128;
+            }
+
+            while (size)
+            {
+                *reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
         }
 
         break;

View File

@@ -138,6 +138,38 @@ bool utils::has_fma4()
     return g_value;
 }
 
+bool utils::has_erms()
+{
+    static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[1] & 0x200) == 0x200;
+    return g_value;
+}
+
+bool utils::has_fsrm()
+{
+    static const bool g_value = get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(7, 0)[3] & 0x10) == 0x10;
+    return g_value;
+}
+
+u32 utils::get_rep_movsb_threshold()
+{
+    static const u32 g_value = []()
+    {
+        u32 thresh_value = 0xFFFFFFFF;
+
+        if (has_fsrm())
+        {
+            thresh_value = 2047;
+        }
+        else if (has_erms())
+        {
+            thresh_value = 4095;
+        }
+
+        return thresh_value;
+    }();
+    return g_value;
+}
+
 std::string utils::get_cpu_brand()
 {
     std::string brand;

View File

@@ -39,6 +39,10 @@ namespace utils
     bool has_fma4();
 
+    bool has_erms();
+
+    bool has_fsrm();
+
     std::string get_cpu_brand();
     std::string get_system_info();
@@ -57,5 +61,8 @@ namespace utils
     u32 get_cpu_model();
 
+    // A threshold of 0xFFFFFFFF means that the rep movsb is expected to be slow on this platform
+    u32 get_rep_movsb_threshold();
+
     extern const u64 main_tid;
 }