TSX: new fallback method (time-based)

Use the timestamp counter to bound transaction retries by elapsed time instead of by attempt count.
Rewrite vm::reservation_op on the same principle.
Rewrite the utils::tx_start transaction helper likewise.
Add two new settings for configuring the fallbacks.
Both limits (first and second) are specified in nanoseconds.
Fix PUTLLC reload logic (prevent reuse of stale data).
Nekotekina 2020-10-31 01:52:24 +03:00
parent 80530e8aef
commit 86fc842c89
12 changed files with 263 additions and 79 deletions
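In outline: instead of retrying a transaction a fixed number of times, it is now retried until a TSC tick budget runs out. A minimal sketch of the pattern (my reconstruction, not code from the commit; assumes the x86 RTM intrinsics and a limit pre-converted from nanoseconds to ticks):

#include <immintrin.h> // _xbegin / _xend / _XBEGIN_STARTED (x86 RTM; -mrtm on GCC/Clang)
#include <x86intrin.h> // __rdtsc (MSVC provides it via <intrin.h> instead)
#include <cstdint>

extern bool g_use_rtm;           // true if TSX is available and enabled
extern uint64_t g_rtm_tx_limit1; // budget, pre-converted from ns to TSC ticks

// Retry the transactional path until it commits or the tick budget runs out.
template <typename F>
bool tx_try_timed(F op)
{
    for (uint64_t stamp0 = __rdtsc(), stamp1 = stamp0;
         g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1;
         stamp1 = __rdtsc())
    {
        if (_xbegin() == _XBEGIN_STARTED)
        {
            op();    // runs transactionally; any conflict aborts back to _xbegin
            _xend(); // commit
            return true;
        }
        // Aborted: re-read the TSC and retry while still under the budget
    }
    return false; // budget exhausted: caller takes the locking fallback
}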


@@ -57,31 +57,68 @@ namespace asmjit
 // Emit xbegin and adjacent loop, return label at xbegin (don't use xabort please)
 template <typename F>
-[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than, F func)
+[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, F func)
 {
 Label fall = c.newLabel();
 Label begin = c.newLabel();
 c.jmp(begin);
 c.bind(fall);
-// First invoked after failure
-func();
-c.add(ctr, 1);
 // Don't repeat on zero status (may indicate syscall or interrupt)
 c.test(x86::eax, x86::eax);
 c.jz(fallback);
+// First invoked after failure (can fallback to proceed, or jump anywhere else)
+func();
-// Other bad statuses are ignored regardless of repeat flag (TODO)
-c.cmp(ctr, less_than);
-c.jae(fallback);
 c.align(kAlignCode, 16);
 c.bind(begin);
 return fall;
+// xbegin should be issued manually, allows to add more check before entering transaction
 }
+// Helper to spill RDX (EDX) register for RDTSC
+inline void build_swap_rdx_with(asmjit::X86Assembler& c, std::array<X86Gp, 4>& args, const asmjit::X86Gp& with)
+{
+#ifdef _WIN32
+c.xchg(args[1], with);
+args[1] = with;
+#else
+c.xchg(args[2], with);
+args[2] = with;
+#endif
+}
+// Get full RDTSC value into chosen register (clobbers rax/rdx or saves only rax with other target)
+inline void build_get_tsc(asmjit::X86Assembler& c, const asmjit::X86Gp& to = asmjit::x86::rax)
+{
+if (&to != &x86::rax && &to != &x86::rdx)
+{
+// Swap to save its contents
+c.xchg(x86::rax, to);
+}
+c.rdtsc();
+c.shl(x86::rdx, 32);
+if (&to == &x86::rax)
+{
+c.or_(x86::rax, x86::rdx);
+}
+else if (&to == &x86::rdx)
+{
+c.or_(x86::rdx, x86::rax);
+}
+else
+{
+// Swap back, maybe there is more effective way to do it
+c.xchg(x86::rax, to);
+c.mov(to.r32(), to.r32());
+c.or_(to.r64(), x86::rdx);
+}
+}
 }
 // Build runtime function with asmjit::X86Assembler
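Two things changed in this helper's contract: it no longer counts attempts (the time check lives in the caller-supplied func), and it no longer leads straight into the transaction, so the caller issues xbegin itself and may insert extra checks first (the pause-flag tests in the call sites below do exactly that). A hypothetical call-site sketch; stamp0/stamp1 stand for whatever spare registers the caller allocated:

// Hypothetical call site: 'c' is the asmjit::X86Assembler, 'fall' the fallback label.
build_get_tsc(c, stamp0); // take the starting timestamp once

Label tx0 = build_transaction_enter(c, fall, [&]()
{
    // Emitted at the retry point, i.e. runs only after an abort:
    // give up once the elapsed ticks exceed the first limit.
    build_get_tsc(c, stamp1);
    c.sub(stamp1, stamp0);
    c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
    c.jae(fall);
});

// Optional pre-transaction checks go here (e.g. the cpu_flag::pause test)...
c.xbegin(tx0); // issued manually under the new contract
// ... transactional body, ending in c.xend()

For reference, build_get_tsc merely merges RDTSC's split result, i.e. it computes to = (u64(edx) << 32) | eax, taking care not to clobber the caller's rax/rdx.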


@@ -2,15 +2,18 @@
 #include "types.h"
+extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
 namespace utils
 {
-// Transaction helper (Max = max attempts) (result = pair of success and op result)
-template <uint Max = 10, typename F, typename R = std::invoke_result_t<F>>
+// Transaction helper (result = pair of success and op result, or just bool)
+template <typename F, typename R = std::invoke_result_t<F>>
 inline auto tx_start(F op)
 {
 uint status = -1;
-for (uint i = 0; i < Max; i++)
+for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
 {
 #ifndef _MSC_VER
 __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
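For callers nothing changes except that the Max template argument is gone; the loop now self-limits on time and on g_use_rtm. A hypothetical use, with invented names (the pair-or-bool return mentioned in the comment depends on whether the lambda returns a value):

// Try the update transactionally; fall back to a mutex if the budget runs out.
if (utils::tx_start([&] { queue_push_unlocked(value); }))
{
    return; // committed inside a transaction
}

std::lock_guard lock(queue_mutex); // fallback path (hypothetical)
queue_push_unlocked(value);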


@@ -73,6 +73,11 @@ public:
 return !!(state & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause));
 }
+bool has_pause_flag() const
+{
+return !!(state & cpu_flag::pause);
+}
 // Check thread type
 u32 id_type() const
 {


@@ -292,7 +292,7 @@ namespace _spurs
 namespace _spurs
 {
 // Add workload
-s32 add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
+s32 add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
 }
 //s32 _cellSpursWorkloadAttributeInitialize(vm::ptr<CellSpursWorkloadAttribute> attr, u32 revision, u32 sdkVersion, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt);
@@ -2295,7 +2295,7 @@ s32 cellSpursWorkloadAttributeSetShutdownCompletionEventHook(vm::ptr<CellSpursWo
 return CELL_OK;
 }
-s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
+s32 _spurs::add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
 {
 if (!spurs || !wid || !pm)
 {
@@ -2420,7 +2420,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
 u32 res_wkl;
 const auto wkl = &spurs->wklInfo(wnum);
-vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 {
 const u32 mask = op.wklMskB & ~(0x80000000u >> wnum);
 res_wkl = 0;
@@ -2456,12 +2456,12 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
 }
 /// Add workload
-s32 cellSpursAddWorkload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
+s32 cellSpursAddWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
 {
 cellSpurs.warning("cellSpursAddWorkload(spurs=*0x%x, wid=*0x%x, pm=*0x%x, size=0x%x, data=0x%llx, priority=*0x%x, minCnt=0x%x, maxCnt=0x%x)",
 spurs, wid, pm, size, data, priority, minCnt, maxCnt);
-return _spurs::add_workload(spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
+return _spurs::add_workload(ppu, spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
 }
 /// Add workload
@@ -2484,7 +2484,7 @@ s32 cellSpursAddWorkloadWithAttribute(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
 return CELL_SPURS_POLICY_MODULE_ERROR_INVAL;
 }
-return _spurs::add_workload(spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
+return _spurs::add_workload(ppu, spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
 }
 /// Request workload shutdown
@@ -2506,7 +2506,7 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
 bool send_event;
 s32 rc, old_state;
-if (!vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+if (!vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 {
 auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
@@ -2663,7 +2663,7 @@ s32 cellSpursRemoveWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid)
 }
 s32 rc;
-vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 {
 auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
@@ -3040,7 +3040,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
 };
 s32 res;
-vm::reservation_op(vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
 {
 if (is_set)
 {
@@ -3189,7 +3189,7 @@ s32 cellSpursEventFlagSet(ppu_thread& ppu, vm::ptr<CellSpursEventFlag> eventFlag
 u16 pendingRecv;
 u16 pendingRecvTaskEvents[16];
-vm::reservation_op(vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
 {
 send = false;
 ppuWaitSlot = 0;
@@ -4081,7 +4081,7 @@ s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32
 int signal;
-vm::reservation_op(vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
 {
 const u32 signalled = op.signalled[taskId / 32];
 const u32 running = op.running[taskId / 32];
@@ -4972,7 +4972,7 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
 u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
 u32 old = 0;
-const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
+const bool ok = vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
 {
 allow_jobchain_run = jg.zero;
 old = jg.ncount0;
@@ -5136,7 +5136,7 @@ s32 cellSpursAddUrgentCommand(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobCha
 s32 result = CELL_OK;
-vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
+vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
 {
 for (auto& cmd : jch.urgentCmds)
 {
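Every change in this file is the same mechanical edit: vm::reservation_op now takes the executing thread as its first argument, so the PPU context is threaded through _spurs::add_workload and its callers. The thread is needed because the rewritten fallback loop polls it (see the vm_reservation.h hunk further down). In outline:

// Before: the reservation retry loop could not observe the executing thread.
vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(...), [&](spurs_wkl_state_op& op) { /* mutate */ });

// After: the thread is passed through so the retry loop can stop when it is paused.
vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(...), [&](spurs_wkl_state_op& op) { /* mutate */ });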


@@ -2074,7 +2074,7 @@ void spursJobchainPopUrgentCommand(spu_thread& spu)
 const auto jc = vm::unsafe_ptr_cast<CellSpursJobChain_x00>(+ctxt->jobChain);
 const bool alterQueue = ctxt->unkFlag0;
-vm::reservation_op(jc, [&](CellSpursJobChain_x00& op)
+vm::reservation_op(spu, jc, [&](CellSpursJobChain_x00& op)
 {
 const auto ls = reinterpret_cast<CellSpursJobChain_x00*>(ctxt->tempAreaJobChain);


@@ -1216,6 +1216,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 {
 ppu.rtime = ppu.last_ftime;
 ppu.raddr = ppu.last_faddr;
+ppu.last_ftime = 0;
 return static_cast<T>(rdata << data_off >> size_off);
 }
@@ -1261,7 +1262,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
 return ppu_load_acquire_reservation<u64>(ppu, addr);
 }
-const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 using namespace asmjit;
@@ -1282,6 +1283,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.push(x86::r13);
 c.push(x86::r12);
 c.push(x86::rbx);
+c.push(x86::r14);
+c.push(x86::r15);
 c.sub(x86::rsp, 40);
 #ifdef _WIN32
 if (!s_tsx_avx)
@@ -1292,6 +1295,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 #endif
 // Prepare registers
+build_swap_rdx_with(c, args, x86::r12);
 c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -1305,7 +1309,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.and_(x86::rbx, -128 / 2);
 c.prefetchw(x86::byte_ptr(x86::rbx));
 c.and_(args[0].r32(), 63);
-c.mov(x86::r12d, 1);
 c.mov(x86::r13, args[1]);
 // Prepare data
@@ -1328,8 +1331,20 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
 }
+// Alloc r14 to stamp0
+const auto stamp0 = x86::r14;
+const auto stamp1 = x86::r15;
+build_get_tsc(c, stamp0);
 // Begin transaction
-Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, []{});
+Label tx0 = build_transaction_enter(c, fall, [&]()
+{
+build_get_tsc(c, stamp1);
+c.sub(stamp1, stamp0);
+c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+c.xor_(x86::eax, x86::eax);
+c.jae(fall);
+});
 c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 c.mov(x86::eax, _XABORT_EXPLICIT);
 c.jc(fall);
@@ -1380,7 +1395,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 // Update reservation
 c.sub(x86::qword_ptr(x86::rbx), -128);
 c.xend();
-c.mov(x86::eax, x86::r12d);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 // XABORT is expensive so finish with xend instead
@@ -1411,6 +1427,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.bind(skip);
 c.xend();
+build_get_tsc(c, stamp1);
 c.mov(x86::eax, _XABORT_EXPLICIT);
 //c.jmp(fall);
@@ -1436,11 +1453,28 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.test(x86::eax, vm::rsrv_unique_lock);
 c.jnz(fail2);
-// Allow only first shared lock to proceed
+// Check if already updated
 c.and_(x86::rax, -128);
 c.cmp(x86::rax, x86::r13);
 c.jne(fail2);
-Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, []{});
+// Exclude some time spent on touching memory: stamp1 contains last success or failure
+c.mov(x86::rax, stamp1);
+c.sub(x86::rax, stamp0);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
+build_get_tsc(c, stamp1);
+c.sub(stamp1, x86::rax);
+Label tx1 = build_transaction_enter(c, fall2, [&]()
+{
+build_get_tsc(c);
+c.sub(x86::rax, stamp1);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
+c.test(x86::qword_ptr(x86::rbx), 127 - 1);
+c.jnz(fall2);
+});
 c.prefetchw(x86::byte_ptr(x86::rbp, 0));
 c.prefetchw(x86::byte_ptr(x86::rbp, 64));
@@ -1448,8 +1482,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 c.jc(fall2);
 c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-c.test(x86::rax, 127 - 1);
-c.jnz(fall2);
 c.and_(x86::rax, -128);
 c.cmp(x86::rax, x86::r13);
 c.jne(fail2);
@@ -1493,7 +1525,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.xend();
 c.lock().add(x86::qword_ptr(x86::rbx), 127);
-c.mov(x86::eax, x86::r12d);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 // XABORT is expensive so try to finish with xend instead
@@ -1523,7 +1556,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.jmp(fail2);
 c.bind(fall2);
-c.mov(x86::eax, -1);
+c.mov(x86::rax, -1);
 c.jmp(_ret);
 c.bind(fail2);
@@ -1550,6 +1583,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
 }
+c.mov(x86::rax, -1);
+c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
 c.xor_(x86::eax, x86::eax);
 //c.jmp(_ret);
@@ -1569,6 +1604,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 }
 c.add(x86::rsp, 40);
+c.pop(x86::r15);
+c.pop(x86::r14);
 c.pop(x86::rbx);
 c.pop(x86::r12);
 c.pop(x86::r13);
@@ -1634,9 +1671,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 {
 if (g_use_rtm) [[likely]]
 {
-switch (u32 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
+switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
 {
-case UINT32_MAX:
+case UINT64_MAX:
 {
 auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
 auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
@@ -1660,6 +1697,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 break;
 }
+ppu.last_ftime = -1;
 [[fallthrough]];
 }
 case 0:
@@ -1669,6 +1707,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 ppu.last_fail++;
 }
+if (ppu.last_ftime != umax)
+{
+ppu.last_faddr = 0;
+return false;
+}
 _m_prefetchw(ppu.rdata);
 _m_prefetchw(ppu.rdata + 64);
 ppu.last_faddr = addr;
@@ -1678,9 +1722,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 }
 default:
 {
-if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 {
-perf_log.warning("STCX: took too long: %u", count);
+perf_log.warning(u8"STCX: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 }
 break;
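Note the unit change in this perf check: the function now returns a TSC tick count rather than an attempt count, so the threshold moves from 60 attempts to 20000 ticks and the log converts ticks to microseconds. With a hypothetical 3 GHz TSC:

const u64 count = 20000;                     // ticks, the new warning threshold
const f64 tsc_hz = 3'000'000'000.;           // assumed 3 GHz invariant TSC
const f64 us = count / (tsc_hz / 1000'000.); // ≈ 6.7 µs before a warning fires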


@@ -371,7 +371,7 @@ namespace spu
 }
 }
-const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 using namespace asmjit;
@@ -415,6 +415,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 #endif
 // Prepare registers
+build_swap_rdx_with(c, args, x86::r12);
 c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -425,7 +426,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.shr(args[0].r32(), 1);
 c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
 c.prefetchw(x86::byte_ptr(x86::rbx));
-c.mov(x86::r12d, 1);
 c.mov(x86::r13, args[1]);
 // Prepare data
@@ -460,10 +460,20 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
 }
+// Alloc args[0] to stamp0
+const auto stamp0 = args[0];
+const auto stamp1 = args[1];
+build_get_tsc(c, stamp0);
 // Begin transaction
-Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, [&]()
+Label tx0 = build_transaction_enter(c, fall, [&]()
 {
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
+build_get_tsc(c, stamp1);
+c.sub(stamp1, stamp0);
+c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+c.xor_(x86::eax, x86::eax);
+c.jae(fall);
 });
 c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 c.mov(x86::eax, _XABORT_EXPLICIT);
@@ -531,7 +541,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.sub(x86::qword_ptr(x86::rbx), -128);
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
-c.mov(x86::eax, x86::r12d);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 // XABORT is expensive so finish with xend instead
@@ -564,6 +575,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.bind(skip);
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
+build_get_tsc(c, stamp1);
 c.mov(x86::eax, _XABORT_EXPLICIT);
 //c.jmp(fall);
@@ -589,13 +601,28 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.test(x86::eax, vm::rsrv_unique_lock);
 c.jnz(fail2);
-// Allow only first shared lock to proceed
+// Check if already updated
 c.and_(x86::rax, -128);
 c.cmp(x86::rax, x86::r13);
 c.jne(fail2);
-Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
+// Exclude some time spent on touching memory: stamp1 contains last success or failure
+c.mov(x86::rax, stamp1);
+c.sub(x86::rax, stamp0);
+build_get_tsc(c, stamp1);
+c.sub(stamp1, x86::rax);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
+Label tx1 = build_transaction_enter(c, fall2, [&]()
 {
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
+build_get_tsc(c);
+c.sub(x86::rax, stamp1);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
+c.test(x86::qword_ptr(x86::rbx), 127 - 1);
+c.jnz(fall2);
 });
 c.prefetchw(x86::byte_ptr(x86::rbp, 0));
 c.prefetchw(x86::byte_ptr(x86::rbp, 64));
@@ -604,8 +631,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 c.jc(fall2);
 c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-c.test(x86::rax, 127 - 1);
-c.jnz(fall2);
 c.and_(x86::rax, -128);
 c.cmp(x86::rax, x86::r13);
 c.jne(fail2);
@@ -666,7 +691,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
 c.lock().add(x86::qword_ptr(x86::rbx), 127);
-c.mov(x86::eax, x86::r12d);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 // XABORT is expensive so try to finish with xend instead
@@ -697,7 +723,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.jmp(fail2);
 c.bind(fall2);
-c.mov(x86::eax, -1);
+c.mov(x86::rax, -1);
 c.jmp(_ret);
 c.bind(fail2);
@@ -724,6 +750,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
 }
+c.mov(x86::rax, -1);
+c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
 c.xor_(x86::eax, x86::eax);
 //c.jmp(_ret);
@@ -763,7 +791,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 c.ret();
 });
-const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
 {
 using namespace asmjit;
@@ -792,6 +820,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 #endif
 // Prepare registers
+build_swap_rdx_with(c, args, x86::r12);
 c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -802,7 +831,6 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 c.shr(args[0].r32(), 1);
 c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
 c.prefetchw(x86::byte_ptr(x86::rbx));
-c.mov(x86::r12d, 1);
 c.mov(x86::r13, args[1]);
 // Prepare data
@@ -825,10 +853,20 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
 }
+// Alloc args[0] to stamp0
+const auto stamp0 = args[0];
+const auto stamp1 = args[1];
+build_get_tsc(c, stamp0);
 // Begin transaction
-Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
+Label tx0 = build_transaction_enter(c, fall, [&]()
 {
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+build_get_tsc(c, stamp1);
+c.sub(stamp1, stamp0);
+c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+c.xor_(x86::eax, x86::eax);
+c.jae(fall);
 });
 c.xbegin(tx0);
 c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
@@ -856,12 +894,15 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 c.sub(x86::qword_ptr(x86::rbx), -128);
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
-c.mov(x86::eax, 1);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 c.bind(skip);
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
+build_get_tsc(c, stamp1);
 c.mov(x86::eax, _XABORT_EXPLICIT);
 //c.jmp(fall);
 c.bind(fall);
@@ -881,12 +922,24 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 // Lock reservation
 c.mov(x86::eax, 1);
 c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
-c.test(x86::eax, vm::rsrv_unique_lock);
+c.test(x86::eax, 127 - 1);
 c.jnz(fall2);
-Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
+// Exclude some time spent on touching memory: stamp1 contains last success or failure
+c.mov(x86::rax, stamp1);
+c.sub(x86::rax, stamp0);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
+build_get_tsc(c, stamp1);
+c.sub(stamp1, x86::rax);
+Label tx1 = build_transaction_enter(c, fall2, [&]()
 {
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+build_get_tsc(c);
+c.sub(x86::rax, stamp1);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+c.jae(fall2);
 });
 c.prefetchw(x86::byte_ptr(x86::rbp, 0));
@@ -922,7 +975,8 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
 c.lock().add(x86::qword_ptr(x86::rbx), 127);
-c.mov(x86::eax, x86::r12d);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 c.jmp(_ret);
 c.bind(fall2);
@@ -952,7 +1006,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 c.ret();
 });
-const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
+const extern auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
 {
 using namespace asmjit;
@@ -979,6 +1033,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 #endif
 // Prepare registers
+build_swap_rdx_with(c, args, x86::r12);
 c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -986,13 +1041,20 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 c.and_(args[0].r32(), 0xff80);
 c.shr(args[0].r32(), 1);
 c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
-c.mov(x86::r12d, 1);
 c.mov(x86::r13, args[1]);
+// Alloc args[0] to stamp0
+const auto stamp0 = args[0];
+build_get_tsc(c, stamp0);
 // Begin transaction
-Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
+Label tx0 = build_transaction_enter(c, fall, [&]()
 {
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
+c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit1));
+c.jae(fall);
 });
 // Check pause flag
@@ -1026,6 +1088,8 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 c.xend();
 c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
+build_get_tsc(c);
+c.sub(x86::rax, stamp0);
 // Store data
 if (s_tsx_avx)
@@ -1047,9 +1111,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
 }
-c.mov(x86::eax, 1);
 c.jmp(_ret);
 c.bind(fall);
 c.xor_(x86::eax, x86::eax);
 //c.jmp(_ret);
@@ -1546,6 +1608,7 @@ void spu_thread::push_snr(u32 number, u32 value)
 const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
 const u32 bitor_bit = (snr_config >> number) & 1;
+// Redundant, g_use_rtm is checked inside tx_start now.
 if (g_use_rtm)
 {
 bool channel_notify = false;
@@ -2422,9 +2485,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 if (g_use_rtm) [[likely]]
 {
-switch (u32 count = spu_putllc_tx(addr, rtime, rdata, to_write))
+switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
 {
-case UINT32_MAX:
+case UINT64_MAX:
 {
 auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
@@ -2451,6 +2514,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 break;
 }
+last_ftime = -1;
 [[fallthrough]];
 }
 case 0:
@@ -2460,6 +2524,12 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 last_fail++;
 }
+if (last_ftime != umax)
+{
+last_faddr = 0;
+return false;
+}
 _m_prefetchw(rdata);
 _m_prefetchw(rdata + 64);
 last_faddr = addr;
@@ -2469,9 +2539,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 }
 default:
 {
-if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 {
-perf_log.warning("PUTLLC: took too long: %u", count);
+perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 }
 break;
@@ -2566,7 +2636,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 if (g_use_rtm) [[likely]]
 {
-const u32 result = spu_putlluc_tx(addr, to_write, cpu);
+const u64 result = spu_putlluc_tx(addr, to_write, cpu);
 if (result == 0)
 {
@@ -2579,9 +2649,9 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 res += 127;
 });
 }
-else if (result > 60 && g_cfg.core.perf_report) [[unlikely]]
+else if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
 {
-perf_log.warning("STORE128: took too long: %u", result);
+perf_log.warning(u8"STORE128: took too long: %.3fµs (%u c)", result / (utils::get_tsc_freq() / 1000'000.), result);
 }
 static_cast<void>(cpu->test_stopped());
@@ -2796,6 +2866,7 @@ bool spu_thread::process_mfc_cmd()
 {
 rtime = last_ftime;
 raddr = last_faddr;
+last_ftime = 0;
 mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
 ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
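These last two hunks (mirrored on the PPU side) are the "prevent reuse of stale data" fix from the commit message. My reading of the invariant, using names from the diff:

// On the failure path: only cache last_faddr when the transaction itself just
// captured the data (the asm stores -1, i.e. umax, into last_ftime in that
// case); any other value means the snapshot is stale, so drop it instead.
if (last_ftime != umax)
{
    last_faddr = 0;
    return false;
}

// On the GETLLAR reuse path: consume the cached snapshot exactly once.
rtime = last_ftime;
raddr = last_faddr;
last_ftime = 0; // invalidate, so a later failure cannot reuse old data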


@@ -550,17 +550,19 @@ namespace vm
 void reservation_op_internal(u32 addr, std::function<bool()> func)
 {
-auto& res = vm::reservation_acquire(addr, 128);
+auto& res = vm::reservation_acquire(addr, 1);
+auto* ptr = vm::get_super_ptr(addr & -128);
-cpu_thread::suspend_all(get_current_cpu_thread(), {&res}, [&]
+cpu_thread::suspend_all(get_current_cpu_thread(), {ptr, ptr + 64, &res}, [&]
 {
 if (func())
 {
-// Success, release all locks if necessary
+// Success, release the lock and progress
 res += 127;
 }
 else
 {
+// Only release the lock on failure
 res -= 1;
 }
 });
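The +1/+127/−1 arithmetic follows from how the reservation word appears to be laid out; judging by it (and by the `test ..., 127 - 1` checks in the assembly above), the low 7 bits act as lock bits and the rest as a version that must advance by 128 per published store. A sketch of that convention, inferred rather than quoted from the tree:

// Inferred reservation-word layout:
//   bits 0..6  - lock bits (shared lockers count upward; rsrv_unique_lock sits here too)
//   bits 7..63 - version, stepped by 128 on every published store
u64 old = res.fetch_add(1); // acquire a shared lock; version bits untouched

if (op_succeeded)           // hypothetical flag for this sketch
    res += 127;             // net +128: release the lock and publish a new version
else
    res -= 1;               // net 0: release only; readers see no change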


@@ -7,6 +7,7 @@
 #include <functional>
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit2;
 namespace vm
 {
@@ -70,8 +71,8 @@ namespace vm
 // TODO: remove and make it external
 void reservation_op_internal(u32 addr, std::function<bool()> func);
-template <bool Ack = false, typename T, typename AT = u32, typename F>
-SAFE_BUFFERS inline auto reservation_op(_ptr_base<T, AT> ptr, F op)
+template <bool Ack = false, typename CPU, typename T, typename AT = u32, typename F>
+SAFE_BUFFERS inline auto reservation_op(CPU& cpu, _ptr_base<T, AT> ptr, F op)
 {
 // Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
 static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
@@ -94,9 +95,10 @@
 {
 // Stage 1: single optimistic transaction attempt
 unsigned status = _XBEGIN_STARTED;
-unsigned count = 0;
 u64 _old = 0;
+auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
 #ifndef _MSC_VER
 __asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
 #else
@@ -157,6 +159,7 @@
 #ifndef _MSC_VER
 __asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
 #endif
+stamp1 = __rdtsc();
 // Touch memory if transaction failed with status 0
 if (!status)
@@ -167,12 +170,17 @@
 // Stage 2: try to lock reservation first
 _old = res.fetch_add(1);
-// Also identify atomic op
-count = 1;
+// Compute stamps excluding memory touch
+stamp2 = __rdtsc() - (stamp1 - stamp0);
-// Start lightened transaction (TODO: tweaking)
-for (; !(_old & rsrv_unique_lock) && count < 60; count++)
+// Start lightened transaction
+for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
 {
+if (cpu.has_pause_flag())
+{
+break;
+}
 #ifndef _MSC_VER
 __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
 #else
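The stamp arithmetic here and in the asm variants implements what the comments call excluding the memory touch: before stage 2 the baseline is shifted forward, so ticks spent touching pages between the two stages are not charged against g_rtm_tx_limit2. A numeric illustration following the asm version (hypothetical tick values):

const u64 stamp0 = 1000;       // entered the transactional path
const u64 fail_at = 1600;      // stage 1 gave up: 600 ticks burned
const u64 after_touch = 2100;  // touching memory cost another 500 ticks

const u64 base = after_touch - (fail_at - stamp0); // 1500: shifted baseline

// A stage-2 check at tsc == 2500 measures 2500 - base = 1000 ticks:
// 600 (stage 1) + 400 (stage 2); the 500-tick touch is not counted.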


@@ -55,7 +55,9 @@ LOG_CHANNEL(sys_log, "SYS");
 stx::manual_fixed_typemap<void> g_fixed_typemap;
-bool g_use_rtm;
+bool g_use_rtm = false;
+u64 g_rtm_tx_limit1 = 0;
+u64 g_rtm_tx_limit2 = 0;
 std::string g_cfg_defaults;
@@ -1019,6 +1021,14 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
 }
 }
+if (g_use_rtm)
+{
+// Update supplementary settings
+const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.;
+g_rtm_tx_limit1 = g_cfg.core.tx_limit1_ns * _1ns;
+g_rtm_tx_limit2 = g_cfg.core.tx_limit2_ns * _1ns;
+}
 // Load patches from different locations
 g_fxo->get<patch_engine>()->append_title_patches(m_title_id);
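The conversion is done once per boot so the hot paths compare raw __rdtsc() deltas against precomputed tick values. With the defaults and a hypothetical 3 GHz TSC:

const f64 _1ns = 3'000'000'000. / 1000'000'000.; // 3 ticks per nanosecond
const u64 limit1 = u64(800 * _1ns);              // 800 ns  -> 2400 ticks
const u64 limit2 = u64(2000 * _1ns);             // 2000 ns -> 6000 ticks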


@@ -240,3 +240,5 @@ private:
 extern Emulator Emu;
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
+extern u64 g_rtm_tx_limit2;


@@ -66,6 +66,8 @@ struct cfg_root : cfg::node
 cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation
 cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" };
 cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff };
+cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds
+cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
 cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100, true }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
 cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy",