Build transactions at runtime

Drop _xbegin family intrinsics due to bad codegen
Implemented `notifier` class, replacing vm::notify
Minor optimization: detach transactions from global mutex on TSX path
Minor optimization: don't acquire vm::passive_lock on PPU on TSX path
This commit is contained in:
Nekotekina 2018-05-14 23:07:36 +03:00
parent fd525ae1cf
commit 367f039523
14 changed files with 529 additions and 339 deletions

View File

@ -7,14 +7,14 @@ asmjit::JitRuntime& asmjit::get_global_runtime()
return g_rt;
}
void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label abort)
void asmjit::build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback)
{
Label fall = c.newLabel();
Label begin = c.newLabel();
c.jmp(begin);
c.bind(fall);
c.test(x86::eax, _XABORT_RETRY);
c.jz(abort);
c.jz(fallback);
c.align(kAlignCode, 16);
c.bind(begin);
c.xbegin(fall);
@ -25,8 +25,6 @@ void asmjit::build_transaction_abort(asmjit::X86Assembler& c, unsigned char code
c.db(0xc6);
c.db(0xf8);
c.db(code);
c.xor_(x86::eax, x86::eax);
c.ret();
}
#ifdef LLVM_AVAILABLE

View File

@ -12,9 +12,9 @@ namespace asmjit
JitRuntime& get_global_runtime();
// Emit xbegin and adjacent loop
void build_transaction_enter(X86Assembler& c, Label abort);
void build_transaction_enter(X86Assembler& c, Label fallback);
// Emit xabort and return zero
// Emit xabort
void build_transaction_abort(X86Assembler& c, unsigned char code);
}

View File

@ -16,7 +16,7 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
LARGE_INTEGER timeout;
timeout.QuadPart = _timeout * -10;
if (HRESULT rc = NtWaitForKeyedEvent(nullptr, &m_value, false, is_inf ? nullptr : &timeout))
if (HRESULT rc = _timeout ? NtWaitForKeyedEvent(nullptr, &m_value, false, is_inf ? nullptr : &timeout) : WAIT_TIMEOUT)
{
verify(HERE), rc == WAIT_TIMEOUT;
@ -32,6 +32,12 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
return true;
#else
if (!_timeout)
{
verify(HERE), m_value--;
return false;
}
timespec timeout;
timeout.tv_sec = _timeout / 1000000;
timeout.tv_nsec = (_timeout % 1000000) * 1000;

View File

@ -9,6 +9,8 @@ class cond_variable
// Internal waiter counter
atomic_t<u32> m_value{0};
friend class notifier;
protected:
// Internal waiting function
bool imp_wait(u32 _old, u64 _timeout) noexcept;
@ -50,3 +52,94 @@ public:
static constexpr u64 max_timeout = u64{UINT32_MAX} / 1000 * 1000000;
};
// Pair of a fake shared mutex (only limited shared locking) and a condition variable.
//
// Layout of m_counter, as used by the methods below:
// - low 7 bits (mask 0x7f): number of shared locks currently held;
// - bits 7 and up (units of 0x80): notifications recorded by notify_all() while shared locks were held.
// NOTE(review): nothing in this class guards the 7-bit holder count against overflow;
// presumably there are never more than 127 simultaneous holders - confirm.
class notifier
{
// Combined holder count / recorded-notification count (see layout above)
atomic_t<u32> m_counter{0};
// Condition variable the waiters block on; its internal waiter counter (m_value) is accessed
// directly below, which is why cond_variable declares this class a friend
cond_variable m_cond;
public:
constexpr notifier() = default;
// Take a shared lock: only bumps the holder count, never blocks
void lock_shared()
{
m_counter++;
}
// Release a shared lock. The last holder to leave delivers the notifications that were
// recorded in the upper bits while the lock was held.
void unlock_shared()
{
// Value after the decrement (assuming atomic_t's prefix -- follows the built-in convention)
const u32 counter = --m_counter;
if (counter & 0x7f)
{
// Other holders remain; leave recorded notifications for the last one
return;
}
if (counter >= 0x80)
{
// No holders left but notifications are recorded: try to claim all of them at once
const u32 _old = m_counter.atomic_op([](u32& value) -> u32
{
if (value & 0x7f)
{
// Somebody took the lock again in the meantime; leave the counter untouched
return 0;
}
// Reset the counter and return how many notifications were recorded
return std::exchange(value, 0) >> 7;
});
// Only wake if something was claimed and the condition variable has registered waiters
// (imp_wake is not visible here; presumably it wakes up to _old waiters - confirm)
if (_old && m_cond.m_value)
{
m_cond.imp_wake(_old);
}
}
}
// Wait for a notification, temporarily giving up the caller's shared lock.
// The decrement below assumes the caller currently holds a shared lock (e.g. via std::shared_lock<notifier>).
// usec_timeout: timeout in microseconds; the default -1 converts to the maximum u64
// (presumably treated as "infinite" by cond_variable - confirm).
// Returns true if a notification had already been recorded, otherwise the result of cond_variable::imp_wait.
explicit_bool_t wait(u64 usec_timeout = -1)
{
// Register as a waiter on the condition variable before releasing the shared lock
const u32 _old = m_cond.m_value.fetch_add(1);
// Drop our shared lock and, if a notification is recorded, consume one unit (0x80) of it.
// NOTE(review): the comparison relies on fetch_op returning the value from before the modification - confirm.
if (0x80 <= m_counter.fetch_op([](u32& value)
{
value--;
if (value >= 0x80)
{
value -= 0x80;
}
}))
{
// Return without waiting
// (zero timeout: presumably only undoes the waiter registration made above without blocking)
m_cond.imp_wait(_old, 0);
// Re-take the shared lock before returning to the caller
m_counter++;
return true;
}
const bool res = m_cond.imp_wait(_old, usec_timeout);
// Re-take the shared lock before returning to the caller
m_counter++;
return res;
}
// Notify all waiters. If shared locks are currently held, one notification per holder is
// recorded in the upper bits of m_counter, to be consumed by wait() or delivered by the
// last unlock_shared().
void notify_all()
{
// Skip the atomic update when nothing is held or recorded
if (m_counter)
{
m_counter.atomic_op([](u32& value)
{
if (const u32 add = value & 0x7f)
{
// Mutex is locked in shared mode
// Record one notification (0x80) for each current holder
value += add << 7;
}
else
{
// Mutex is unlocked
// Discard any stale recorded notifications
value = 0;
}
});
}
// Notify after imaginary "exclusive" lock+unlock
m_cond.notify_all();
}
};

View File

@ -41,28 +41,5 @@ namespace utils
bool has_xop();
// Try to start a hardware transaction, re-issuing XBEGIN for as long as the
// reported abort status carries the retry hint.
// out: optional; receives the raw abort status when the function gives up.
// Returns true once a transaction has started, false on a non-retryable abort.
FORCE_INLINE bool transaction_enter(uint* out = nullptr)
{
	for (;;)
	{
		const uint code = _xbegin();

		if (code == _XBEGIN_STARTED)
		{
			// Now executing transactionally
			return true;
		}

		if (code & _XABORT_RETRY)
		{
			// The abort is flagged as retryable: try again
			continue;
		}

		// Give up and report the status if the caller asked for it
		if (out)
		{
			*out = code;
		}

		return false;
	}
}
std::string get_system_info();
}

View File

@ -122,7 +122,7 @@ if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--allow-multiple-definition")
endif()
add_compile_options(-msse -msse2 -mcx16 -mrtm)
add_compile_options(-msse -msse2 -mcx16)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# This fixes 'some' of the st11range issues. See issue #2516

View File

@ -6,8 +6,6 @@
#include "Emu/System.h"
#include "MFC.h"
const bool s_use_rtm = utils::has_rtm();
template <>
void fmt_class_string<MFC>::format(std::string& out, u64 arg)
{

View File

@ -1,6 +1,7 @@
#include "stdafx.h"
#include "Utilities/VirtualMemory.h"
#include "Utilities/sysinfo.h"
#include "Utilities/JIT.h"
#include "Crypto/sha1.h"
#include "Emu/Memory/Memory.h"
#include "Emu/System.h"
@ -46,7 +47,6 @@
#endif
#include "define_new_memleakdetect.h"
#include "Utilities/JIT.h"
#include "PPUTranslator.h"
#include "Modules/cellMsgDialog.h"
#endif
@ -55,8 +55,6 @@
#include <cfenv>
#include "Utilities/GSL.h"
const bool s_use_rtm = utils::has_rtm();
const bool s_use_ssse3 =
#ifdef _MSC_VER
utils::has_ssse3();
@ -713,7 +711,12 @@ ppu_thread::ppu_thread(const std::string& name, u32 prio, u32 stack)
, m_name(name)
{
// Trigger the scheduler
state += cpu_flag::suspend + cpu_flag::memory;
state += cpu_flag::suspend;
if (!g_use_rtm)
{
state += cpu_flag::memory;
}
}
void ppu_thread::cmd_push(cmd64 cmd)
@ -942,7 +945,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
ppu.raddr = addr;
// Do several attempts
for (uint i = 0; i < 5; i++)
for (uint i = 0; g_use_rtm || i < 5; i++)
{
ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
_mm_lfence();
@ -978,6 +981,57 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
return ppu_load_acquire_reservation<u64>(ppu, addr);
}
// JIT-assembled TSX implementation of the 32-bit conditional store (called from ppu_stwcx).
// Arguments: raddr - guest address; rtime - reservation timestamp the caller observed;
//            rdata - expected current value; value - value to store.
// Returns:   1  - value stored and reservation timestamp updated;
//            -1 - reservation timestamp or memory contents no longer match;
//            otherwise the transaction aborted without the retry hint and the result is the
//            abort status shifted right by 24 (0 for non-explicit aborts), which the caller
//            treats as "try again".
const auto ppu_stwcx_tx = build_function_asm<int(*)(u32 raddr, u64 rtime, u64 rdata, u32 value)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label fail = c.newLabel();

	// raddr is a 32-bit argument, but args[0] is used below as a full 64-bit register.
	// The x64 calling conventions leave the upper half of such an argument undefined,
	// so zero-extend it explicitly before forming addresses from it.
	c.mov(args[0].r32(), args[0].r32());

	// Prepare registers
	// r10 = address of the reservation entry (g_reservations + (raddr >> 7) * 8)
	// r11 = host address of the data (g_base_addr + raddr)
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::r10, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::rax));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.shr(args[0], 7);
	c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3));
	// Byte-swap the expected and new values so they can be compared/stored directly in guest byte order
	c.bswap(args[2].r32());
	c.bswap(args[3].r32());

	// Touch memory (heavyweight)
	c.lock().add(x86::dword_ptr(x86::r11), 0);
	// xadd with zero: atomically reads the current reservation value into rax without changing it
	c.xor_(x86::eax, x86::eax);
	c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax);
	// Early out before starting a transaction if the reservation already changed
	c.cmp(x86::rax, args[1]);
	c.jne(fail);

	// Begin transaction
	build_transaction_enter(c, fall);
	// Re-validate reservation timestamp and data inside the transaction
	c.cmp(x86::qword_ptr(x86::r10), args[1]);
	c.jne(fail);
	c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
	c.jne(fail);
	c.mov(x86::dword_ptr(x86::r11), args[3].r32());
	// New reservation value: timestamp counter shifted left by one (bit 0 stays clear)
	c.rdtsc(); // destroys args[1] or args[2]
	c.shl(x86::rdx, 33);
	c.shl(x86::rax, 1);
	c.or_(x86::rax, x86::rdx);
	c.mov(x86::qword_ptr(x86::r10), x86::rax);
	c.xend();
	c.mov(x86::eax, 1);
	c.ret();

	// Transaction aborted without retry hint: eax holds the abort status,
	// its top byte is the explicit abort code (0xff becomes -1 after the arithmetic shift)
	c.bind(fall);
	c.sar(x86::eax, 24);
	c.ret();

	// Reservation lost. Reached both from inside the transaction (xabort transfers control
	// to the abort handler) and from outside it (xabort does nothing and execution falls through)
	c.bind(fail);
	build_transaction_abort(c, 0xff);
	c.or_(x86::eax, -1);
	c.ret();
});
extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
{
atomic_be_t<u32>& data = vm::_ref<atomic_be_t<u32>>(addr);
@ -988,24 +1042,31 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
return false;
}
if (s_use_rtm && utils::transaction_enter())
if (g_use_rtm)
{
if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading())
// Do several attempts (TODO)
for (u32 i = 0; i < 5; i++)
{
_xabort(0);
const int r = ppu_stwcx_tx(addr, ppu.rtime, ppu.rdata, reg_value);
if (r > 0)
{
vm::reservation_notifier(addr, sizeof(u32)).notify_all();
ppu.raddr = 0;
return true;
}
if (r < 0)
{
// Reservation lost
ppu.raddr = 0;
return false;
}
}
const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u32)) && data.compare_and_swap_test(static_cast<u32>(ppu.rdata), reg_value);
if (result)
{
vm::reservation_update(addr, sizeof(u32));
vm::notify(addr, sizeof(u32));
}
_xend();
// Give up
ppu.raddr = 0;
return result;
return false;
}
vm::writer_lock lock(0);
@ -1015,13 +1076,64 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u32));
vm::notify(addr, sizeof(u32));
vm::reservation_notifier(addr, sizeof(u32)).notify_all();
}
ppu.raddr = 0;
return result;
}
// JIT-assembled TSX implementation of the 64-bit conditional store (called from ppu_stdcx).
// Arguments: raddr - guest address; rtime - reservation timestamp the caller observed;
//            rdata - expected current value; value - value to store.
// Returns:   1  - value stored and reservation timestamp updated;
//            -1 - reservation timestamp or memory contents no longer match;
//            otherwise the transaction aborted without the retry hint and the result is the
//            abort status shifted right by 24 (0 for non-explicit aborts), which the caller
//            treats as "try again".
const auto ppu_stdcx_tx = build_function_asm<int(*)(u32 raddr, u64 rtime, u64 rdata, u64 value)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label fail = c.newLabel();

	// raddr is a 32-bit argument, but args[0] is used below as a full 64-bit register.
	// The x64 calling conventions leave the upper half of such an argument undefined,
	// so zero-extend it explicitly before forming addresses from it.
	c.mov(args[0].r32(), args[0].r32());

	// Prepare registers
	// r10 = address of the reservation entry (g_reservations + (raddr >> 7) * 8)
	// r11 = host address of the data (g_base_addr + raddr)
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::r10, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::rax));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.shr(args[0], 7);
	c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0], 3));
	// Byte-swap the expected and new values so they can be compared/stored directly in guest byte order
	c.bswap(args[2]);
	c.bswap(args[3]);

	// Touch memory (heavyweight)
	c.lock().add(x86::qword_ptr(x86::r11), 0);
	// xadd with zero: atomically reads the current reservation value into rax without changing it
	c.xor_(x86::eax, x86::eax);
	c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax);
	// Early out before starting a transaction if the reservation already changed
	c.cmp(x86::rax, args[1]);
	c.jne(fail);

	// Begin transaction
	build_transaction_enter(c, fall);
	// Re-validate reservation timestamp and data inside the transaction
	c.cmp(x86::qword_ptr(x86::r10), args[1]);
	c.jne(fail);
	c.cmp(x86::qword_ptr(x86::r11), args[2]);
	c.jne(fail);
	c.mov(x86::qword_ptr(x86::r11), args[3]);
	// New reservation value: timestamp counter shifted left by one (bit 0 stays clear)
	c.rdtsc(); // destroys args[1] or args[2]
	c.shl(x86::rdx, 33);
	c.shl(x86::rax, 1);
	c.or_(x86::rax, x86::rdx);
	c.mov(x86::qword_ptr(x86::r10), x86::rax);
	c.xend();
	c.mov(x86::eax, 1);
	c.ret();

	// Transaction aborted without retry hint: eax holds the abort status,
	// its top byte is the explicit abort code (0xff becomes -1 after the arithmetic shift)
	c.bind(fall);
	c.sar(x86::eax, 24);
	c.ret();

	// Reservation lost. Reached both from inside the transaction (xabort transfers control
	// to the abort handler) and from outside it (xabort does nothing and execution falls through)
	c.bind(fail);
	build_transaction_abort(c, 0xff);
	c.or_(x86::eax, -1);
	c.ret();
});
extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
{
atomic_be_t<u64>& data = vm::_ref<atomic_be_t<u64>>(addr);
@ -1032,24 +1144,31 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
if (s_use_rtm && utils::transaction_enter())
if (g_use_rtm)
{
if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading())
// Do several attempts (TODO)
for (u32 i = 0; i < 5; i++)
{
_xabort(0);
const int r = ppu_stdcx_tx(addr, ppu.rtime, ppu.rdata, reg_value);
if (r > 0)
{
vm::reservation_notifier(addr, sizeof(u64)).notify_all();
ppu.raddr = 0;
return true;
}
if (r < 0)
{
// Reservation lost
ppu.raddr = 0;
return false;
}
}
const bool result = ppu.rtime == vm::reservation_acquire(addr, sizeof(u64)) && data.compare_and_swap_test(ppu.rdata, reg_value);
if (result)
{
vm::reservation_update(addr, sizeof(u64));
vm::notify(addr, sizeof(u64));
}
_xend();
// Give up
ppu.raddr = 0;
return result;
return false;
}
vm::writer_lock lock(0);
@ -1059,7 +1178,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u64));
vm::notify(addr, sizeof(u64));
vm::reservation_notifier(addr, sizeof(u64)).notify_all();
}
ppu.raddr = 0;

View File

@ -1,4 +1,5 @@
#include "stdafx.h"
#include "Utilities/JIT.h"
#include "Utilities/lockless.h"
#include "Utilities/sysinfo.h"
#include "Emu/Memory/Memory.h"
@ -22,8 +23,7 @@
#include <cfenv>
#include <atomic>
#include <thread>
const bool s_use_rtm = utils::has_rtm();
#include <shared_mutex>
const bool s_use_ssse3 =
#ifdef _MSC_VER
@ -213,6 +213,175 @@ namespace spu
}
}
// JIT-assembled TSX implementation of the conditional 128-byte store (PUTLLC).
// Arguments: raddr - guest address of the 128-byte line; rtime - reservation timestamp the
//            caller observed; _old - expected 128 bytes; _new - 128 bytes to store.
// Returns:   1  - data stored and reservation timestamp updated;
//            -1 - reservation timestamp or memory contents no longer match;
//            otherwise the transaction aborted without the retry hint and the result is the
//            abort status shifted right by 24 (0 for non-explicit aborts).
const auto spu_putllc_tx = build_function_asm<int(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label fail = c.newLabel();

	// raddr is a 32-bit argument, but args[0] is used below as a full 64-bit register.
	// The x64 calling conventions leave the upper half of such an argument undefined,
	// so zero-extend it explicitly before forming addresses from it.
	c.mov(args[0].r32(), args[0].r32());

	// Prepare registers
	// r10 = address of the reservation entry (g_reservations + raddr / 16)
	// r11 = host address of the data (g_base_addr + raddr)
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::r10, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::rax));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0]));

	// Touch memory (heavyweight)
	c.mov(x86::eax, x86::dword_ptr(args[2]));
	c.mov(x86::eax, x86::dword_ptr(args[3]));
	c.lock().add(x86::qword_ptr(x86::r11), 0);
	// xadd with zero: atomically reads the current reservation value into rax without changing it
	c.xor_(x86::eax, x86::eax);
	c.lock().xadd(x86::qword_ptr(x86::r10), x86::rax);
	// Early out before starting a transaction if the reservation already changed
	c.cmp(x86::rax, args[1]);
	c.jne(fail);

	// Load the expected data ahead of the transaction
	c.vmovups(x86::ymm0, x86::yword_ptr(args[2], 0));
	c.vmovups(x86::ymm1, x86::yword_ptr(args[2], 32));
	c.vmovups(x86::ymm2, x86::yword_ptr(args[2], 64));
	c.vmovups(x86::ymm3, x86::yword_ptr(args[2], 96));
#ifndef _WIN32
	// Preload the new data as well
	// NOTE(review): presumably skipped on Windows because xmm6 and above are callee-saved there - confirm
	c.vmovups(x86::ymm6, x86::yword_ptr(args[3], 0));
	c.vmovups(x86::ymm7, x86::yword_ptr(args[3], 32));
	c.vmovups(x86::ymm8, x86::yword_ptr(args[3], 64));
	c.vmovups(x86::ymm9, x86::yword_ptr(args[3], 96));
#endif

	// Begin transaction
	build_transaction_enter(c, fall);
	c.cmp(x86::qword_ptr(x86::r10), args[1]);
	c.jne(fail);
	// Compare current memory with the expected data: XOR each part, OR the results together,
	// any non-zero bit means a mismatch
	c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::r11, 0));
	c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::r11, 32));
	c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::r11, 64));
	c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::r11, 96));
	c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
	c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
	c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
	c.vptest(x86::ymm0, x86::ymm0);
	c.jnz(fail);
	// Store the new data
#ifdef _WIN32
	c.vmovups(x86::ymm0, x86::yword_ptr(args[3], 0));
	c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0);
	c.vmovups(x86::ymm1, x86::yword_ptr(args[3], 32));
	c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
	c.vmovups(x86::ymm2, x86::yword_ptr(args[3], 64));
	c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
	c.vmovups(x86::ymm3, x86::yword_ptr(args[3], 96));
	c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
#else
	c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm6);
	c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm7);
	c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm8);
	c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm9);
#endif
	// New reservation value: timestamp counter shifted left by one (bit 0 stays clear)
	c.rdtsc(); // destroys args[1] or args[2]
	c.shl(x86::rdx, 33);
	c.shl(x86::rax, 1);
	c.or_(x86::rax, x86::rdx);
	c.mov(x86::qword_ptr(x86::r10), x86::rax);
	c.xend();
	c.vzeroupper();
	c.mov(x86::eax, 1);
	c.ret();

	// Transaction aborted without retry hint: eax holds the abort status,
	// its top byte is the explicit abort code (0xff becomes -1 after the arithmetic shift)
	c.bind(fall);
	c.sar(x86::eax, 24);
	c.ret();

	// Reservation lost. Reached both from inside the transaction (xabort transfers control
	// to the abort handler) and from outside it (xabort does nothing and execution falls through)
	c.bind(fail);
	build_transaction_abort(c, 0xff);
	c.or_(x86::eax, -1);
	c.ret();
});
// JIT-assembled TSX implementation of the 128-byte load with reservation (GETLLAR).
// Arguments: raddr - guest address of the 128-byte line; rdata - destination buffer (128 bytes).
// Returns:   the reservation value read inside the transaction, together with a consistent
//            copy of the data written to rdata; or 1 if the transaction aborted without the
//            retry hint (the caller tests bit 0 of the result to detect failure).
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();

	// raddr is a 32-bit argument, but args[0] is used below as a full 64-bit register.
	// The x64 calling conventions leave the upper half of such an argument undefined,
	// so zero-extend it explicitly before forming addresses from it.
	c.mov(args[0].r32(), args[0].r32());

	// Prepare registers
	// r10 = address of the reservation entry (g_reservations + raddr / 16)
	// r11 = host address of the data (g_base_addr + raddr)
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::r10, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::rax));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0]));

	// Touch memory
	c.mov(x86::rax, x86::qword_ptr(x86::r11));
	c.mov(x86::rax, x86::qword_ptr(x86::r10));

	// Begin transaction
	build_transaction_enter(c, fall);
	// Read the reservation value (return value) and the whole line within one transaction
	c.mov(x86::rax, x86::qword_ptr(x86::r10));
	c.vmovaps(x86::ymm0, x86::yword_ptr(x86::r11, 0));
	c.vmovaps(x86::ymm1, x86::yword_ptr(x86::r11, 32));
	c.vmovaps(x86::ymm2, x86::yword_ptr(x86::r11, 64));
	c.vmovaps(x86::ymm3, x86::yword_ptr(x86::r11, 96));
	c.xend();
	// Copy the snapshot to the caller's buffer after the transaction has ended
	c.vmovups(x86::yword_ptr(args[1], 0), x86::ymm0);
	c.vmovups(x86::yword_ptr(args[1], 32), x86::ymm1);
	c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2);
	c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3);
	c.vzeroupper();
	c.ret();

	// Transaction aborted without retry hint: report failure as an odd value
	c.bind(fall);
	c.mov(x86::eax, 1);
	c.ret();
});
// JIT-assembled TSX implementation of the unconditional 128-byte store (PUTLLUC).
// Arguments: raddr - guest address of the 128-byte line; rdata - 128 bytes to store.
// Returns:   true if the data was stored and the reservation timestamp updated,
//            false if the transaction aborted without the retry hint.
const auto spu_putlluc_tx = build_function_asm<bool(*)(u32 raddr, const void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();

	// raddr is a 32-bit argument, but args[0] is used below as a full 64-bit register.
	// The x64 calling conventions leave the upper half of such an argument undefined,
	// so zero-extend it explicitly before forming addresses from it.
	c.mov(args[0].r32(), args[0].r32());

	// Prepare registers
	// r10 = address of the reservation entry (g_reservations + raddr / 16)
	// r11 = host address of the data (g_base_addr + raddr)
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::r10, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::rax));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0]));

	// Touch memory (heavyweight)
	c.lock().add(x86::qword_ptr(x86::r11), 0);
	c.lock().add(x86::qword_ptr(x86::r10), 0);

	// Prepare data
	c.vmovups(x86::ymm0, x86::yword_ptr(args[1], 0));
	c.vmovups(x86::ymm1, x86::yword_ptr(args[1], 32));
	c.vmovups(x86::ymm2, x86::yword_ptr(args[1], 64));
	c.vmovups(x86::ymm3, x86::yword_ptr(args[1], 96));

	// Begin transaction
	build_transaction_enter(c, fall);
	c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0);
	c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
	c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
	c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
	// New reservation value: timestamp counter shifted left by one (bit 0 stays clear)
	c.rdtsc(); // destroys args[1] or args[2]
	c.shl(x86::rdx, 33);
	c.shl(x86::rax, 1);
	c.or_(x86::rax, x86::rdx);
	c.mov(x86::qword_ptr(x86::r10), x86::rax);
	c.xend();
	c.vzeroupper();
	c.mov(x86::eax, 1);
	c.ret();

	// Transaction aborted without retry hint: report failure
	c.bind(fall);
	c.xor_(x86::eax, x86::eax);
	c.ret();
});
void spu_int_ctrl_t::set(u64 ints)
{
// leave only enabled interrupts
@ -516,10 +685,12 @@ void SPUThread::cpu_task()
// Currently does nothing for SPU threads: the passive-lock call below is left commented out.
void SPUThread::cpu_mem()
{
//vm::passive_lock(*this);
}
// Currently does nothing for SPU threads: setting cpu_flag::memory below is left commented out.
void SPUThread::cpu_unmem()
{
//state.test_and_set(cpu_flag::memory);
}
SPUThread::~SPUThread()
@ -881,42 +1052,17 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args)
vm::reservation_acquire(addr, 128);
// Store unconditionally
if (s_use_rtm && utils::transaction_enter())
while (g_use_rtm)
{
// First transaction attempt
if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading())
if (spu_putlluc_tx(addr, to_write.data()))
{
_xabort(0);
vm::reservation_notifier(addr, 128).notify_all();
tx_success++;
return;
}
data = to_write;
vm::reservation_update(addr, 128);
vm::notify(addr, 128);
_xend();
return;
}
else if (s_use_rtm)
{
vm::writer_lock lock(0);
if (utils::transaction_enter())
{
// Second transaction attempt
data = to_write;
vm::reservation_update(addr, 128);
_xend();
}
else
{
vm::reservation_update(addr, 128, true);
_mm_sfence();
data = to_write;
_mm_sfence();
vm::reservation_update(addr, 128);
}
vm::notify(addr, 128);
return;
busy_wait(300);
tx_failure++;
}
vm::writer_lock lock(0);
@ -925,7 +1071,7 @@ void SPUThread::do_putlluc(const spu_mfc_cmd& args)
data = to_write;
_mm_sfence();
vm::reservation_update(addr, 128);
vm::notify(addr, 128);
vm::reservation_notifier(addr, 128).notify_all();
}
void SPUThread::do_mfc(bool wait)
@ -970,7 +1116,7 @@ void SPUThread::do_mfc(bool wait)
{
if (!test(ch_stall_mask, mask))
{
if (s_use_rtm)
if (g_use_rtm)
{
if (do_list_transfer(args))
{
@ -1002,7 +1148,7 @@ void SPUThread::do_mfc(bool wait)
if (args.size)
{
if (s_use_rtm)
if (g_use_rtm)
{
do_dma_transfer(args);
}
@ -1067,13 +1213,6 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
// Stall infinitely if MFC queue is full
while (UNLIKELY(mfc_size >= 16))
{
do_mfc();
if (mfc_size < 16)
{
break;
}
if (test(state, cpu_flag::stop))
{
return false;
@ -1102,18 +1241,11 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
if (is_polling)
{
vm::waiter waiter;
waiter.owner = this;
waiter.addr = raddr;
waiter.size = 128;
waiter.stamp = rtime;
waiter.data = rdata.data();
waiter.init();
rtime = vm::reservation_acquire(raddr, 128);
_mm_lfence();
while (vm::reservation_acquire(raddr, 128) == waiter.stamp && rdata == data)
while (vm::reservation_acquire(raddr, 128) == rtime && rdata == data)
{
vm::temporary_unlock(*this);
if (test(state, cpu_flag::stop))
{
break;
@ -1123,8 +1255,23 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
}
}
while (g_use_rtm)
{
rtime = spu_getll_tx(raddr, rdata.data());
if (rtime & 1)
{
tx_failure++;
busy_wait(300);
continue;
}
tx_success++;
break;
}
// Do several attempts
for (uint i = 0; i < 5; i++)
for (uint i = 0; !g_use_rtm && i < 5; i++)
{
rtime = vm::reservation_acquire(raddr, 128);
_mm_lfence();
@ -1147,19 +1294,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
busy_wait(300);
}
if (s_use_rtm && utils::transaction_enter())
{
rtime = vm::reservation_acquire(raddr, 128);
if (rtime & 1)
{
_xabort(0);
}
rdata = data;
_xend();
}
else
if (!g_use_rtm)
{
vm::reader_lock lock;
rtime = vm::reservation_acquire(raddr, 128);
@ -1182,63 +1317,25 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
if (raddr == args.eal && rtime == vm::reservation_acquire(raddr, 128))
{
// TODO: vm::check_addr
if (s_use_rtm && utils::transaction_enter())
if (g_use_rtm)
{
// First transaction attempt
if (!vm::g_mutex.is_lockable() || vm::g_mutex.is_reading())
// Do several attempts (TODO)
for (u32 i = 0;; i++)
{
_xabort(0);
}
const int r = spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data());
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
{
data = to_write;
result = true;
vm::reservation_update(raddr, 128);
vm::notify(raddr, 128);
}
_xend();
tx_success++;
}
else if (s_use_rtm)
{
// Second transaction attempt
vm::writer_lock lock(0);
// Touch memory without modifying the value
vm::_ref<atomic_t<u32>>(args.eal) += 0;
// Touch reservation memory area as well
vm::reservation_acquire(raddr, 128) += 0;
if (utils::transaction_enter(&tx_status))
{
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
if (r > 0)
{
data = to_write;
vm::reservation_notifier(raddr, 128).notify_all();
result = true;
vm::reservation_update(raddr, 128);
tx_success++;
break;
}
_xend();
tx_success++;
if (result)
if (r < 0)
{
// First transaction attempt usually fails on vm::notify
vm::notify(raddr, 128);
}
}
else
{
// Workaround MSVC
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
{
vm::reservation_update(raddr, 128);
// Reservation lost
break;
}
// Don't fallback to heavyweight lock, just give up
@ -1248,6 +1345,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
else if (rdata == data)
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
@ -1259,12 +1357,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
result = true;
vm::reservation_update(raddr, 128);
vm::notify(raddr, 128);
tx_success++;
}
else
{
tx_failure++;
vm::reservation_notifier(raddr, 128).notify_all();
}
}
}
@ -1332,7 +1425,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
{
if (LIKELY(args.size))
{
if (s_use_rtm)
if (g_use_rtm)
{
do_dma_transfer(args);
return true;
@ -1377,7 +1470,7 @@ bool SPUThread::process_mfc_cmd(spu_mfc_cmd args)
{
if (LIKELY(do_dma_check(args) && !test(ch_stall_mask, 1u << args.tag)))
{
if (s_use_rtm)
if (g_use_rtm)
{
if (LIKELY(do_list_transfer(args)))
{
@ -1531,14 +1624,7 @@ s64 SPUThread::get_ch_value(u32 ch)
{
for (int i = 0; i < 10 && channel.get_count() == 0; i++)
{
// if (!s_use_rtm && mfc_size && !i)
// {
// do_mfc();
// }
// else
{
busy_wait();
}
busy_wait();
}
u32 out;
@ -1568,14 +1654,7 @@ s64 SPUThread::get_ch_value(u32 ch)
{
for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++)
{
// if (!s_use_rtm && mfc_size && !i)
// {
// do_mfc();
// }
// else
{
busy_wait();
}
busy_wait();
}
u32 out;
@ -1601,11 +1680,6 @@ s64 SPUThread::get_ch_value(u32 ch)
case MFC_RdTagStat:
{
// if (!s_use_rtm && mfc_size)
// {
// do_mfc();
// }
if (ch_tag_stat.get_count())
{
u32 out = ch_tag_stat.get_value();
@ -1676,11 +1750,6 @@ s64 SPUThread::get_ch_value(u32 ch)
case SPU_RdEventStat:
{
// if (!s_use_rtm && mfc_size)
// {
// do_mfc();
// }
u32 res = get_events();
if (res)
@ -1688,19 +1757,31 @@ s64 SPUThread::get_ch_value(u32 ch)
return res;
}
vm::waiter waiter;
const u32 mask1 = ch_event_mask;
if (ch_event_mask & SPU_EVENT_LR)
if (mask1 & SPU_EVENT_LR && raddr)
{
waiter.owner = this;
waiter.addr = raddr;
waiter.size = 128;
waiter.stamp = rtime;
waiter.data = rdata.data();
waiter.init();
if (mask1 != SPU_EVENT_LR)
{
fmt::throw_exception("Not supported: event mask 0x%x" HERE, mask1);
}
std::shared_lock<notifier> pseudo_lock(vm::reservation_notifier(raddr, 128));
while (res = get_events(), !res)
{
if (test(state, cpu_flag::stop + cpu_flag::dbg_global_stop))
{
return -1;
}
pseudo_lock.mutex()->wait(100);
}
return res;
}
while (!(res = get_events(true)))
while (res = get_events(true), !res)
{
if (test(state & cpu_flag::stop))
{
@ -1738,11 +1819,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
case SPU_WrOutIntrMbox:
{
// if (!s_use_rtm && mfc_size)
// {
// do_mfc(false);
// }
if (offset >= RAW_SPU_BASE_ADDR)
{
while (!ch_out_intr_mbox.try_push(value))
@ -1891,11 +1967,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
case SPU_WrOutMbox:
{
// if (!s_use_rtm && mfc_size)
// {
// do_mfc(false);
// }
while (!ch_out_mbox.try_push(value))
{
if (test(state & cpu_flag::stop))
@ -1939,11 +2010,6 @@ bool SPUThread::set_ch_value(u32 ch, u32 value)
break;
}
// if (!s_use_rtm && mfc_size)
// {
// do_mfc(false);
// }
const u32 completed = get_mfc_completed();
if (!value)
@ -2066,11 +2132,6 @@ bool SPUThread::stop_and_signal(u32 code)
{
LOG_TRACE(SPU, "stop_and_signal(code=0x%x)", code);
// if (!s_use_rtm && mfc_size)
// {
// do_mfc();
// }
if (offset >= RAW_SPU_BASE_ADDR)
{
status.atomic_op([code](u32& status)

View File

@ -2,6 +2,7 @@
#include "Memory.h"
#include "Emu/System.h"
#include "Utilities/mutex.h"
#include "Utilities/cond.h"
#include "Utilities/Thread.h"
#include "Utilities/VirtualMemory.h"
#include "Emu/CPU/CPUThread.h"
@ -10,6 +11,8 @@
#include <atomic>
#include <deque>
static_assert(sizeof(notifier) == 8, "Unexpected size of notifier");
namespace vm
{
static u8* memory_reserve_4GiB(std::uintptr_t _addr = 0)
@ -38,12 +41,12 @@ namespace vm
// Reservation stats (compressed x16)
u8* const g_reservations = memory_reserve_4GiB((std::uintptr_t)g_stat_addr);
// Reservation sync variables
u8* const g_reservations2 = g_reservations + 0x10000000;
// Memory locations
std::vector<std::shared_ptr<block_t>> g_locations;
// Registered waiters
std::deque<vm::waiter*> g_waiters;
// Memory mutex core
shared_mutex g_mutex;
@ -239,65 +242,6 @@ namespace vm
// Memory pages
std::array<memory_page, 0x100000000 / 4096> g_pages{};
// Add this waiter to the global list of registered waiters.
void waiter::init()
{
	// The list is only modified under the writer lock
	vm::writer_lock guard(0);

	g_waiters.push_back(this);
}
// Notify the owner if the watched memory differs from the saved copy and the
// reservation timestamp has advanced past the saved stamp.
void waiter::test() const
{
	// Evaluated left to right: data comparison first, then the timestamp, then the owner check
	const bool should_notify =
		std::memcmp(data, vm::base(addr), size) != 0 &&
		stamp < reservation_acquire(addr, size) &&
		owner;

	if (should_notify)
	{
		owner->notify();
	}
}
// Remove this waiter from the global list of registered waiters (if it is there).
waiter::~waiter()
{
	// The list is only modified under the writer lock
	vm::writer_lock guard(0);

	// Locate our own entry
	const auto last = g_waiters.cend();
	const auto pos = std::find(g_waiters.cbegin(), last, this);

	if (pos == last)
	{
		// Not registered: nothing to remove
		return;
	}

	g_waiters.erase(pos);
}
// Re-check every registered waiter whose address lies in the same 128-byte line as addr.
// The size argument is not consulted.
void notify(u32 addr, u32 size)
{
	const u32 line = addr / 128;

	for (auto it = g_waiters.cbegin(), last = g_waiters.cend(); it != last; ++it)
	{
		const waiter* const entry = *it;

		if (entry->addr / 128 != line)
		{
			// Different line: not affected
			continue;
		}

		entry->test();
	}
}
void notify_all()
{
for (const waiter* ptr : g_waiters)
{
ptr->test();
}
}
static void _page_map(u32 addr, u8 flags, utils::shm& shm)
{
const u32 size = shm.size();
@ -539,6 +483,7 @@ namespace vm
if (addr != 0xc0000000 && addr != 0xe0000000)
{
utils::memory_commit(g_reservations + addr / 16, size / 16);
utils::memory_commit(g_reservations2 + addr / 16, size / 16);
}
}

View File

@ -8,6 +8,7 @@
class shared_mutex;
class named_thread;
class cpu_thread;
class notifier;
namespace vm
{
@ -15,6 +16,7 @@ namespace vm
extern u8* const g_exec_addr;
extern u8* const g_stat_addr;
extern u8* const g_reservations;
extern u8* const g_reservations2;
enum memory_location_t : uint
{
@ -41,24 +43,6 @@ namespace vm
page_allocated = (1 << 7),
};
// Describes a thread waiting for a change of a memory region.
// init() registers the object in the global waiter list and the destructor removes it;
// test() notifies the owner once the watched memory is found changed.
struct waiter
{
// Thread to notify from test(); may be null, in which case nobody is notified
named_thread* owner;
// Address of the watched region (passed to vm::base and reservation_acquire in test())
u32 addr;
// Size of the watched region in bytes (number of bytes compared in test())
u32 size;
// Reservation timestamp saved by the waiter; test() only notifies if the current one is greater
u64 stamp;
// Saved copy of the watched memory that test() compares against; not owned by the waiter
const void* data;
waiter() = default;
// Non-copyable: the global list stores pointers to registered waiters
waiter(const waiter&) = delete;
// Register this waiter in the global list
void init();
// Check the watched memory and notify the owner if it changed
void test() const;
// Unregister this waiter from the global list
~waiter();
};
// Address type
enum addr_t : u32 {};
@ -112,14 +96,14 @@ namespace vm
inline void reservation_update(u32 addr, u32 size, bool lsb = false)
{
// Update reservation info with new timestamp
reservation_acquire(addr, size) = (__rdtsc() & -2) | u64{lsb};
reservation_acquire(addr, size) = (__rdtsc() << 1) | u64{lsb};
}
// Check and notify memory changes at address
void notify(u32 addr, u32 size);
// Check and notify memory changes
void notify_all();
// Get reservation sync variable
// There is one 8-byte notifier per 128-byte reservation line. The slot is selected by the
// line index so that every address inside a line maps to the same, properly aligned
// notifier (for 128-byte aligned addresses this equals the former addr / 16 offset).
// The size argument is not consulted.
inline notifier& reservation_notifier(u32 addr, u32 size)
{
	return *reinterpret_cast<notifier*>(g_reservations2 + addr / 128 * 8);
}
// Change memory protection of specified memory region
bool page_protect(u32 addr, u32 size, u8 flags_test = 0, u8 flags_set = 0, u8 flags_clear = 0);

View File

@ -118,16 +118,20 @@ namespace rsx
rsx->sync_point_request = true;
const u32 addr = get_address(method_registers.semaphore_offset_406e(), method_registers.semaphore_context_dma_406e());
if (addr >> 28 == 0x4)
if (g_use_rtm || addr >> 28 == 0x4)
{
// TODO: check no reservation area instead
vm::write32(addr, arg);
return;
}
else
{
vm::reader_lock lock;
vm::write32(addr, arg);
}
vm::reader_lock lock;
vm::write32(addr, arg);
vm::notify(addr, 4);
if (addr >> 28 != 0x4)
{
vm::reservation_notifier(addr, 4).notify_all();
}
}
}
@ -1051,7 +1055,7 @@ namespace rsx
}
LOG_SUCCESS(RSX, "capture successful: %s", filePath.c_str());
frame_capture.reset();
Emu.Pause();
}

View File

@ -23,6 +23,7 @@
#include "Loader/ELF.h"
#include "Utilities/StrUtil.h"
#include "Utilities/sysinfo.h"
#include "../Crypto/unself.h"
#include "../Crypto/unpkg.h"
@ -40,6 +41,8 @@
cfg_root g_cfg;
bool g_use_rtm = utils::has_rtm();
std::string g_cfg_defaults;
extern atomic_t<u32> g_thread_count;

View File

@ -456,3 +456,5 @@ struct cfg_root : cfg::node
};
extern cfg_root g_cfg;
extern bool g_use_rtm;