PPU: fix LDARX/LWARX in accurate mode (closes #9058)

Fixup after #9048.
Use SSE intrinsics in mov_rdata.
This commit is contained in:
Nekotekina 2020-10-11 17:32:00 +03:00
parent 1885e4345c
commit 5bd5a382c0
2 changed files with 29 additions and 3 deletions

View File

@ -1176,7 +1176,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
continue;
}
const be_t<u64> rdata = data.load();
be_t<u64> rdata;
if (ppu.use_full_rdata)
{
@ -1187,6 +1187,10 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
}
else
{
rdata = data.load();
}
if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]]
{
@ -1212,6 +1216,11 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
// Store only 64 bits of reservation data
std::memcpy(&ppu.rdata[addr & 0x78], &rdata, 8);
}
else
{
// Load relevant 64 bits of reservation data
std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8);
}
return static_cast<T>(rdata << data_off >> size_off);
}

View File

@ -197,8 +197,25 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
return;
}
// TODO: use std::assume_aligned
std::memcpy(reinterpret_cast<v128*>(_dst), reinterpret_cast<const v128*>(_src), 128);
{
const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 0));
const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 16));
const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 32));
const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 48));
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 0), v0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 16), v1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 32), v2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 48), v3);
}
const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 64));
const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 80));
const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 96));
const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 112));
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 64), v0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
}
extern u64 get_timebased_time();