SPU: Implement Accurate DMA (#8822)

Eladash 2020-09-03 00:58:29 +03:00 committed by GitHub
parent ddfa077c3e
commit 73d23eb6e6
12 changed files with 170 additions and 60 deletions


@@ -4433,8 +4433,17 @@ bool ppu_interpreter::ICBI(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op)
{
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
const u32 addr0 = vm::cast(addr, HERE) & ~127;
std::memset(vm::base(vm::cast(addr, HERE) & ~127), 0, 128);
if (g_cfg.core.spu_accurate_dma)
{
auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb);
std::memset(vm::base(addr0), 0, 128);
res.release(rtime + 128);
return true;
}
std::memset(vm::base(addr0), 0, 128);
return true;
}
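The DCBZ hunk above is the simplest instance of the convention the rest of this commit builds on: reservation timestamps move in steps of 128, so the low seven bits of each 128-byte line's reservation word are free to act as lock flags, and releasing with rtime + 128 publishes a new timestamp and clears the lock in one store. A minimal sketch of that arithmetic, assuming constants that mirror the reservation_lock_bit enum added later in this commit (a plain std::atomic stands in for the emulator's atomic_t):

#include <atomic>
#include <cassert>
#include <cstdint>

constexpr std::uint64_t stcx_lockb    = 1ull << 0; // exclusive conditional lock (PUTLLC / stwcx.)
constexpr std::uint64_t dma_lockb     = 1ull << 1; // non-exclusive unconditional lock (DMA)
constexpr std::uint64_t putlluc_lockb = 1ull << 6; // exclusive unconditional lock (PUTLLUC)

int main()
{
    std::atomic<std::uint64_t> res{0};   // one 128-byte line, timestamp 0

    const std::uint64_t rtime = res.load();
    assert((rtime & 127) == 0);          // unlocked: no lock bits set

    res.store(rtime + dma_lockb);        // what reservation_lock(..., dma_lockb) does
    assert((res.load() & 127) != 0);     // observers now see the line as busy

    res.store(rtime + 128);              // release: next timestamp, all lock bits clear
    assert((res.load() & 127) == 0 && res.load() == rtime + 128);
}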


@@ -1097,10 +1097,16 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}())
{
ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128;
ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb);
if (ppu.rtime & 127)
{
continue;
}
ppu.rdata = data;
if ((vm::reservation_acquire(addr, sizeof(T)) & -128) == ppu.rtime) [[likely]]
if ((vm::reservation_acquire(addr, sizeof(T)) & (-128 | vm::dma_lockb)) == ppu.rtime) [[likely]]
{
if (count >= 10) [[unlikely]]
{
@@ -1176,7 +1182,7 @@ const auto ppu_stwcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
// Begin transaction
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.and_(x86::rax, -128 | vm::dma_lockb);
c.cmp(x86::rax, args[1]);
c.jne(fail);
c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
@@ -1222,7 +1228,7 @@ const auto ppu_stdcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rd
// Begin transaction
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128);
c.and_(x86::rax, -128 | vm::dma_lockb);
c.cmp(x86::rax, args[1]);
c.jne(fail);
c.cmp(x86::qword_ptr(x86::r11), args[2]);
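Folding vm::dma_lockb into the comparison mask is what makes the PPU reservation paths DMA-aware: a line whose reservation word currently carries the DMA lock bit no longer compares equal to the saved rtime, so the lwarx/stwcx./stdcx. paths retry or fail instead of racing an in-flight SPU transfer. A small self-contained check of that mask arithmetic (reservation_still_valid is a hypothetical helper written for illustration, not an RPCS3 function):

#include <cassert>
#include <cstdint>

constexpr std::uint64_t dma_lockb = 1ull << 1;

bool reservation_still_valid(std::uint64_t current, std::uint64_t saved_rtime)
{
    // Same mask as the interpreter and ASM paths above: -128 clears the low
    // seven bits, and OR-ing dma_lockb back in keeps the DMA flag significant.
    return (current & (-128 | dma_lockb)) == saved_rtime;
}

int main()
{
    const std::uint64_t rtime = 2 * 128;                        // captured at lwarx time
    assert(reservation_still_valid(rtime, rtime));              // untouched line: still valid
    assert(!reservation_still_valid(rtime | dma_lockb, rtime)); // DMA in flight: fail and retry
    assert(!reservation_still_valid(rtime + 128, rtime));       // line rewritten: reservation lost
}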


@@ -5788,6 +5788,11 @@ public:
if (auto ci = llvm::dyn_cast<llvm::ConstantInt>(trunc<u8>(val).eval(m_ir)))
{
if (g_cfg.core.spu_accurate_dma)
{
break;
}
if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); !g_use_rtm)
{
// TODO: don't require TSX (current implementation is TSX-only)


@@ -718,7 +718,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
c.bind(next);
// Try to acquire "PUTLLUC lock"
c.lock().bts(x86::qword_ptr(x86::rbx), 6);
c.lock().bts(x86::qword_ptr(x86::rbx), std::countr_zero<u32>(vm::putlluc_lockb));
c.jc(fail2);
build_transaction_enter(c, fall2, x86::r12, 666);
@@ -1345,13 +1345,20 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
src = zero_buf;
}
if (!g_use_rtm && (!is_get || g_cfg.core.spu_accurate_putlluc)) [[unlikely]]
if ((!g_use_rtm && (!is_get || g_cfg.core.spu_accurate_putlluc)) || g_cfg.core.spu_accurate_dma) [[unlikely]]
{
if (const u32 size = args.size; ((eal & 127) + size) <= 128 && is_get)
for (u32 size = args.size, size0; is_get;
size -= size0, dst += size0, src += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
for (u64 i = 0;; [&]()
{
if (++i < 25) [[likely]]
if (state)
{
check_state();
}
else if (++i < 25) [[likely]]
{
busy_wait(300);
}
@@ -1361,14 +1368,15 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}())
{
const u64 time0 = vm::reservation_acquire(eal, size);
const u64 time0 = vm::reservation_acquire(eal, size0);
if (time0 & 1)
// Ignore DMA lock bits
if (time0 & (127 & ~vm::dma_lockb))
{
continue;
}
switch (size)
switch (size0)
{
case 1:
{
@@ -1390,11 +1398,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size;
auto _size = size0;
while (_size)
{
@@ -1409,11 +1422,16 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
if (time0 != vm::reservation_acquire(eal, size))
if (time0 != vm::reservation_acquire(eal, size0))
{
continue;
}
break;
}
if (size == size0)
{
return;
}
}
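The accurate GET path above never locks the line itself; it is essentially a seqlock-style read: snapshot the timestamp (tolerating only the non-exclusive DMA bit), copy at most one 128-byte line, then re-read the timestamp and retry if it changed, since every writer bumps the value by 128 on release. A generic sketch of that loop, assuming a bare std::atomic in place of the emulator's reservation array and omitting the real code's chunking, check_state() and busy_wait() backoff:

#include <atomic>
#include <cstdint>
#include <cstring>

constexpr std::uint64_t dma_lockb = 1ull << 1;

void read_line(const std::atomic<std::uint64_t>& res, void* dst, const void* src, std::size_t size)
{
    for (;;)
    {
        const std::uint64_t time0 = res.load();
        if (time0 & (127 & ~dma_lockb))
            continue;                    // an exclusive lock (PUTLLC/PUTLLUC) is held: wait

        std::memcpy(dst, src, size);     // optimistic copy

        if (res.load() == time0)
            return;                      // timestamp unchanged: the copy is consistent
        // otherwise a writer committed in between; retry
    }
}

int main()
{
    std::atomic<std::uint64_t> res{128};
    unsigned char src[16] = {1, 2, 3, 4};
    unsigned char dst[16] = {};
    read_line(res, dst, src, sizeof(src));
    return dst[0] == 1 ? 0 : 1;
}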
@@ -1422,38 +1440,85 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
case 1:
{
auto& res = vm::reservation_lock(eal, 1);
auto [res, time0] = vm::reservation_lock(eal, 1, vm::dma_lockb);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 2:
{
auto& res = vm::reservation_lock(eal, 2);
auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 4:
{
auto& res = vm::reservation_lock(eal, 4);
auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
case 8:
{
auto& res = vm::reservation_lock(eal, 8);
auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res.release(res.load() - 1);
res.release(time0 + 128);
break;
}
default:
{
if (g_cfg.core.spu_accurate_dma)
{
for (u32 size0;;
size -= size0, dst += size0, src += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
switch (size0)
{
case 128:
{
mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res.release(time0 + 128);
if (size == size0)
{
break;
}
}
break;
}
if (((eal & 127) + size) <= 128)
{
// Lock one cache line
auto& res = vm::reservation_lock(eal, 128);
auto [res, time0] = vm::reservation_lock(eal, 128);
while (size)
{
@@ -1464,7 +1529,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
res.release(res.load() - 1);
res.release(time0);
break;
}
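Both accurate paths split a transfer with the same computation, size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128)), so no chunk ever crosses a reservation-line boundary and each chunk is covered by a single line lock or timestamp check. A worked example of that chunking with a made-up unaligned transfer (the address and length are purely illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    std::uint32_t eal  = 0x10070; // effective address, not 128-byte aligned
    std::uint32_t size = 304;     // transfer length in bytes

    while (size)
    {
        const std::uint32_t size0 =
            std::min<std::uint32_t>(128 - (eal & 127), std::min<std::uint32_t>(size, 128u));
        std::printf("line 0x%05x: copy %3u bytes\n", eal & ~127u, size0);
        eal  += size0;
        size -= size0;
    }
    // Prints chunks of 16, 128, 128 and 32 bytes: a head piece up to the next
    // 128-byte boundary, whole lines in the middle, and a tail piece.
}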
@@ -1786,7 +1851,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
if (vm::reservation_acquire(addr, 128) & 64)
{
// Wait for PUTLLC to complete
while (vm::reservation_acquire(addr, 128) & 1)
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
@@ -1799,7 +1864,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
cpu_thread::suspend_all cpu_lock(this);
while (vm::reservation_acquire(addr, 128).bts(6))
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
{
busy_wait(100);
}
@@ -1819,7 +1884,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
else
{
auto& data = vm::_ref<decltype(rdata)>(addr);
auto& res = vm::reservation_lock(addr, 128);
auto [res, time0] = vm::reservation_lock(addr, 128);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
@@ -1835,7 +1900,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, to_write);
res.release(res.load() + 127);
res.release(time0 + 128);
}
if (render) render->unpause();
@@ -1843,7 +1908,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
else
{
mov_rdata(data, to_write);
res.release(res.load() + 127);
res.release(time0 + 128);
}
}
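In the do_putlluc hunks above, the hard-coded bit indices are replaced by the named constants: bts(..., std::countr_zero(vm::putlluc_lockb)) still sets bit 6, and waiting on the low six bits (& 63) now also covers a held DMA lock, but not the PUTLLUC bit the waiter is about to set itself. A small check of those bit relationships (constants restated locally so the snippet stands alone):

#include <bit>
#include <cassert>
#include <cstdint>

constexpr std::uint32_t stcx_lockb    = 1 << 0;
constexpr std::uint32_t dma_lockb     = 1 << 1;
constexpr std::uint32_t putlluc_lockb = 1 << 6;

int main()
{
    // bts(..., countr_zero(putlluc_lockb)) is the same bit-test-and-set as the
    // old hard-coded bts(..., 6); the change only names the bit.
    assert(std::countr_zero(putlluc_lockb) == 6);

    // Waiting on "& 63" covers the conditional and DMA locks (bits 0..5) but
    // not the PUTLLUC bit itself, which the waiter is about to set.
    assert((63 & putlluc_lockb) == 0);
    assert((63 & (stcx_lockb | dma_lockb)) != 0);
}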
@@ -2072,7 +2137,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr && raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}
@@ -2197,7 +2262,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
}


@@ -440,16 +440,20 @@ namespace vm
g_mutex.unlock();
}
bool reservation_lock_internal(u32 addr, atomic_t<u64>& res)
u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res, u64 lock_bits)
{
for (u64 i = 0;; i++)
{
if (!res.bts(0)) [[likely]]
if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) [[likely]]
{
break;
return rtime;
}
if (i < 15)
if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
{
cpu->check_state();
}
else if (i < 15)
{
busy_wait(500);
}
@@ -458,14 +462,12 @@ namespace vm
// TODO: Accurate locking in this case
if (!(g_pages[addr / 4096].flags & page_writable))
{
return false;
return -1;
}
std::this_thread::yield();
}
}
return true;
}
static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm)


@@ -6,6 +6,13 @@
namespace vm
{
enum reservation_lock_bit : u64
{
stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
dma_lockb = 1 << 1, // Inexclusive unconditional reservation lock
putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
};
// Get reservation status for further atomic update: last update timestamp
inline atomic_t<u64>& reservation_acquire(u32 addr, u32 size)
{
@@ -31,28 +38,11 @@ namespace vm
return *reinterpret_cast<atomic_t<u64>*>(g_reservations + (addr & 0xff80) / 2);
}
bool reservation_lock_internal(u32, atomic_t<u64>&);
u64 reservation_lock_internal(u32, atomic_t<u64>&, u64);
inline atomic_t<u64>& reservation_lock(u32 addr, u32 size)
inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime, u64 lock_bits = stcx_lockb)
{
auto res = &vm::reservation_acquire(addr, size);
if (res->bts(0)) [[unlikely]]
{
static atomic_t<u64> no_lock{};
if (!reservation_lock_internal(addr, *res))
{
res = &no_lock;
}
}
return *res;
}
inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime)
{
if (res.compare_and_swap_test(rtime, rtime | 1)) [[likely]]
if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]]
{
return true;
}
@@ -60,4 +50,23 @@ namespace vm
return false;
}
inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb)
{
auto res = &vm::reservation_acquire(addr, size);
auto rtime = res->load();
if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]]
{
static atomic_t<u64> no_lock{};
rtime = reservation_lock_internal(addr, *res, lock_bits);
if (rtime == umax)
{
res = &no_lock;
}
}
return {*res, rtime};
}
} // namespace vm
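Compared with the old header, reservation_lock now reports the timestamp it locked at, so callers can release with time0 + 128, and it takes the lock bit to set, while reservation_trylock succeeds only if the word still holds the unlocked timestamp the caller read. A self-contained model of that API shape, using std::atomic instead of the emulator's atomic_t and omitting the slow path's page check, check_state() and backoff:

#include <atomic>
#include <cstdint>
#include <utility>

enum : std::uint64_t
{
    stcx_lockb = 1 << 0,
    dma_lockb  = 1 << 1,
};

inline bool reservation_trylock(std::atomic<std::uint64_t>& res, std::uint64_t rtime,
                                std::uint64_t lock_bits = stcx_lockb)
{
    // Succeeds only if the word still holds the unlocked timestamp we read.
    return res.compare_exchange_strong(rtime, rtime + lock_bits);
}

inline std::pair<std::atomic<std::uint64_t>&, std::uint64_t>
reservation_lock(std::atomic<std::uint64_t>& res, std::uint64_t lock_bits = stcx_lockb)
{
    for (;;)
    {
        const std::uint64_t rtime = res.load();
        if (!(rtime & 127) && reservation_trylock(res, rtime, lock_bits))
            return {res, rtime}; // caller later releases with rtime + 128
    }
}

int main()
{
    std::atomic<std::uint64_t> line{0};
    auto [res, rtime] = reservation_lock(line, dma_lockb);
    res.store(rtime + 128); // release, as the DMA paths in this commit do
    return res.load() == 128 ? 0 : 1;
}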


@@ -151,14 +151,14 @@ namespace rsx
// TODO: Check if possible to write on reservations
if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]]
{
res = &vm::reservation_lock(addr, 4);
res = &vm::reservation_lock(addr, 4).first;
}
vm::_ref<RsxSemaphore>(addr).val = arg;
if (res)
{
res->release(*res & -128);
res->release(*res + 127);
}
vm::reservation_notifier(addr, 4).notify_all();
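The behavioural change in this hunk is the release value: the old *res & -128 restored the pre-lock timestamp, while *res + 127, starting from the value reservation_lock(addr, 4) left with bit 0 set, lands on the next multiple of 128, so the semaphore write now reads as a modification to anything holding a reservation on that line. A quick arithmetic check with illustrative values:

#include <cassert>
#include <cstdint>

int main()
{
    const std::uint64_t rtime  = 7 * 128;   // timestamp before the lock
    const std::uint64_t locked = rtime + 1; // reservation_lock(addr, 4) added stcx_lockb

    assert((locked & std::uint64_t(-128)) == rtime); // old release: timestamp unchanged
    assert(locked + 127 == rtime + 128);             // new release: timestamp advanced
}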


@@ -44,6 +44,7 @@ struct cfg_root : cfg::node
cfg::_enum<spu_block_size_type> spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe };
cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false };
cfg::_bool spu_accurate_putlluc{ this, "Accurate PUTLLUC", false };
cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false };
cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true};
cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled
cfg::_bool spu_cache{ this, "SPU Cache", true };


@@ -22,6 +22,7 @@ enum class emu_settings_type
EnableTSX,
AccurateGETLLAR,
AccuratePUTLLUC,
AccurateSpuDMA,
AccurateLLVMdfma,
AccurateVectorNaN,
AccurateRSXAccess,
@@ -162,6 +163,7 @@ static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
{ emu_settings_type::AccuratePUTLLUC, { "Core", "Accurate PUTLLUC"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
{ emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}},
{ emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}},
{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},


@@ -1726,6 +1726,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->accuratePUTLLUC, emu_settings_type::AccuratePUTLLUC);
SubscribeTooltip(ui->accuratePUTLLUC, tooltips.settings.accurate_putlluc);
m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA);
SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma);
m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess);
SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access);


@@ -3417,6 +3417,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="accurateSpuDMA">
<property name="text">
<string>Accurate SPU DMA</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="hookStFunc">
<property name="text">


@@ -78,6 +78,7 @@ public:
const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to developers.\nNever use this.");
const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation.");
const QString accurate_putlluc = tr("Accurately processes SPU MFC_PUTLLUC operation.");
const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations.");
const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA.");
const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)");
const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");