Fix thread pool entry point and get_cycles()

Fix possible race between thread handle availability.
Don't treat zero thread as invalid one.
Now entry point is full is assembly.
Attempt to fix #9282
Also fix some TLS.
This commit is contained in:
Nekotekina 2020-11-21 07:56:54 +03:00
parent 60cff6f3d4
commit 71f1021648
5 changed files with 109 additions and 110 deletions

View File

@ -88,6 +88,7 @@ thread_local u64 g_tls_fault_rsx = 0;
thread_local u64 g_tls_fault_spu = 0;
thread_local u64 g_tls_wait_time = 0;
thread_local u64 g_tls_wait_fail = 0;
thread_local bool g_tls_access_violation_recovered = false;
extern thread_local std::string(*g_tls_log_prefix)();
template <>
@ -1367,13 +1368,11 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
return true;
}
thread_local bool access_violation_recovered = false;
// Hack: allocate memory in case the emulator is stopping
const auto hack_alloc = [&]()
{
// If failed the value remains true and std::terminate should be called
access_violation_recovered = true;
g_tls_access_violation_recovered = true;
const auto area = vm::reserve_map(vm::any, addr & -0x10000, 0x10000);
@ -1525,7 +1524,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
if (cpu->id_type() != 1)
{
if (!access_violation_recovered)
if (!g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", cpu->dump_all());
vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
@ -1556,14 +1555,14 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
Emu.Pause();
if (cpu && !access_violation_recovered)
if (cpu && !g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", cpu->dump_all());
}
// Note: a thread may access violate more than once after hack_alloc recovery
// Do not log any further access violations in this case.
if (!access_violation_recovered)
if (!g_tls_access_violation_recovered)
{
vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
}
@ -1850,7 +1849,7 @@ static atomic_t<u128, 64> s_thread_bits{0};
static atomic_t<thread_base**> s_thread_pool[128]{};
void thread_base::start(native_entry entry, void(*trampoline)())
void thread_base::start(native_entry entry)
{
for (u128 bits = s_thread_bits.load(); bits; bits &= bits - 1)
{
@ -1869,16 +1868,13 @@ void thread_base::start(native_entry entry, void(*trampoline)())
}
// Send "this" and entry point
m_thread = reinterpret_cast<u64>(trampoline);
const u64 entry_val = reinterpret_cast<u64>(entry);
m_thread = entry_val;
atomic_storage<thread_base*>::release(*tls, this);
s_thread_pool[pos].notify_all();
// Wait for actual "m_thread" in return
while (m_thread == reinterpret_cast<u64>(trampoline))
{
busy_wait(300);
}
m_thread.wait(entry_val);
return;
}
@ -1892,6 +1888,10 @@ void thread_base::start(native_entry entry, void(*trampoline)())
void thread_base::initialize(void (*error_cb)())
{
#ifndef _WIN32
m_thread.release(reinterpret_cast<u64>(pthread_self()));
#endif
// Initialize TLS variables
thread_ctrl::g_tls_this_thread = this;
@ -2012,9 +2012,6 @@ u64 thread_base::finalize(thread_state result_state) noexcept
atomic_wait_engine::set_wait_callback(nullptr);
const u64 _self = m_thread;
m_thread.release(0);
// Return true if need to delete thread object (no)
const bool ok = 0 == (3 & ~m_sync.fetch_op([&](u64& v)
{
@ -2026,48 +2023,55 @@ u64 thread_base::finalize(thread_state result_state) noexcept
m_sync.notify_all(2);
// No detached thread supported atm
return _self;
return m_thread;
}
void thread_base::finalize(u64 _self) noexcept
u64 thread_base::finalize(u64 _self) noexcept
{
g_tls_fault_all = 0;
g_tls_fault_rsx = 0;
g_tls_fault_spu = 0;
g_tls_wait_time = 0;
g_tls_wait_fail = 0;
g_tls_access_violation_recovered = false;
atomic_wait_engine::set_wait_callback(nullptr);
g_tls_log_prefix = []() -> std::string { return {}; };
thread_ctrl::g_tls_this_thread = nullptr;
if (!_self)
{
return;
return 0;
}
// Try to add self to thread pool
const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits)
{
if (~bits) [[likely]]
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (!ok)
{
#ifdef _WIN32
_endthread();
#else
pthread_detach(reinterpret_cast<pthread_t>(_self));
pthread_exit(0);
#endif
return;
}
set_name("..pool");
// Obtain id from atomic op
const u32 pos = utils::ctz128(~bits);
u32 pos = -1;
while (true)
{
const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits)
{
if (~bits) [[likely]]
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (ok) [[likely]]
{
pos = utils::ctz128(~bits);
break;
}
s_thread_bits.wait(bits);
}
const auto tls = &thread_ctrl::g_tls_this_thread;
s_thread_pool[pos] = tls;
@ -2082,30 +2086,47 @@ void thread_base::finalize(u64 _self) noexcept
val &= ~(u128(1) << pos);
});
s_thread_bits.notify_one();
// Restore thread id
const auto _this = atomic_storage<thread_base*>::load(*tls);
const auto entry = _this->m_thread.exchange(_self);
verify(HERE), entry != _self;
_this->m_thread.notify_one();
// Hack return address to avoid tail call
#ifdef _MSC_VER
*static_cast<u64*>(_AddressOfReturnAddress()) = entry;
#else
static_cast<u64*>(__builtin_frame_address(0))[1] = entry;
#endif
//reinterpret_cast<native_entry>(entry)(_this);
// Return new entry
return entry;
}
void (*thread_base::make_trampoline(native_entry entry))()
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
{
return build_function_asm<void(*)()>([&](asmjit::X86Assembler& c, auto& args)
return build_function_asm<native_entry>([&](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
// Revert effect of ret instruction (fix stack alignment)
c.mov(x86::rax, imm_ptr(entry));
c.sub(x86::rsp, 8);
c.jmp(x86::rax);
Label _ret = c.newLabel();
c.push(x86::rbp);
c.sub(x86::rsp, 0x20);
// Call entry point (TODO: support for detached threads missing?)
c.call(imm_ptr(entry));
// Call finalize, return if zero
c.mov(args[0], x86::rax);
c.call(imm_ptr<u64(*)(u64)>(finalize));
c.test(x86::rax, x86::rax);
c.jz(_ret);
// Otherwise, call it as an entry point with first arg = new current thread
c.mov(x86::rbp, x86::rax);
c.call(imm_ptr(thread_ctrl::get_current));
c.mov(args[0], x86::rax);
c.add(x86::rsp, 0x28);
c.jmp(x86::rbp);
c.bind(_ret);
c.add(x86::rsp, 0x28);
c.ret();
});
}
@ -2193,12 +2214,13 @@ thread_base::thread_base(std::string_view name)
thread_base::~thread_base()
{
if (u64 handle = m_thread.exchange(0))
// Only cleanup on errored status
if ((m_sync & 3) == 2)
{
#ifdef _WIN32
CloseHandle(reinterpret_cast<HANDLE>(handle));
CloseHandle(reinterpret_cast<HANDLE>(m_thread.load()));
#else
pthread_detach(reinterpret_cast<pthread_t>(handle));
pthread_join(reinterpret_cast<pthread_t>(m_thread.load()), nullptr);
#endif
}
}
@ -2255,13 +2277,15 @@ u64 thread_base::get_native_id() const
u64 thread_base::get_cycles()
{
u64 cycles;
u64 cycles = 0;
const u64 handle = m_thread;
#ifdef _WIN32
if (QueryThreadCycleTime(reinterpret_cast<HANDLE>(m_thread.load()), &cycles))
if (QueryThreadCycleTime(reinterpret_cast<HANDLE>(handle), &cycles))
{
#elif __APPLE__
mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast<pthread_t>(m_thread.load()));
mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast<pthread_t>(handle));
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
thread_basic_info_data_t info;
kern_return_t ret = thread_info(port, THREAD_BASIC_INFO, reinterpret_cast<thread_info_t>(&info), &count);
@ -2272,7 +2296,7 @@ u64 thread_base::get_cycles()
#else
clockid_t _clock;
struct timespec thread_time;
if (!pthread_getcpuclockid(reinterpret_cast<pthread_t>(m_thread.load()), &_clock) && !clock_gettime(_clock, &thread_time))
if (!pthread_getcpuclockid(reinterpret_cast<pthread_t>(handle), &_clock) && !clock_gettime(_clock, &thread_time))
{
cycles = static_cast<u64>(thread_time.tv_sec) * 1'000'000'000 + thread_time.tv_nsec;
#endif
@ -2317,19 +2341,15 @@ void thread_ctrl::emergency_exit(std::string_view reason)
if (!_self)
{
// Unused, detached thread support remnant
delete _this;
}
thread_base::finalize(0);
#ifdef _WIN32
_endthread();
_endthreadex(0);
#else
if (_self)
{
pthread_detach(reinterpret_cast<pthread_t>(_self));
}
pthread_exit(0);
#endif
}

View File

@ -111,22 +111,22 @@ private:
stx::atomic_cptr<std::string> m_tname;
// Start thread
void start(native_entry, void(*)());
void start(native_entry);
// Called at the thread start
void initialize(void (*error_cb)());
// Called at the thread end, returns true if needs destruction
// Called at the thread end, returns self handle
u64 finalize(thread_state result) noexcept;
// Cleanup after possibly deleting the thread instance
static void finalize(u64 _self) noexcept;
static u64 finalize(u64 _self) noexcept;
// Set name for debugger
static void set_name(std::string);
// Make trampoline with stack fix
static void(*make_trampoline(native_entry))();
// Make entry point
static native_entry make_trampoline(u64(*)(thread_base*));
friend class thread_ctrl;
@ -277,35 +277,12 @@ class named_thread final : public Context, result_storage_t<Context>, thread_bas
using result = result_storage_t<Context>;
using thread = thread_base;
// Type-erased thread entry point
#ifdef _WIN32
static inline uint __stdcall entry_point(void* arg)
#else
static inline void* entry_point(void* arg)
#endif
static u64 entry_point(thread_base* _base)
{
if (auto _this = thread_ctrl::get_current())
{
arg = _this;
}
const auto _this = static_cast<named_thread*>(static_cast<thread*>(arg));
// Perform self-cleanup if necessary
u64 _self = _this->entry_point();
if (!_self)
{
delete _this;
thread::finalize(0);
return 0;
}
thread::finalize(_self);
return 0;
return static_cast<named_thread*>(_base)->entry_point2();
}
u64 entry_point()
u64 entry_point2()
{
thread::initialize([]()
{
@ -330,7 +307,7 @@ class named_thread final : public Context, result_storage_t<Context>, thread_bas
return thread::finalize(thread_state::finished);
}
static inline void(*trampoline)() = thread::make_trampoline(entry_point);
static inline thread::native_entry trampoline = thread::make_trampoline(entry_point);
friend class thread_ctrl;
@ -341,7 +318,7 @@ public:
: Context()
, thread(Context::thread_name)
{
thread::start(&named_thread::entry_point, trampoline);
thread::start(trampoline);
}
// Normal forwarding constructor
@ -350,7 +327,7 @@ public:
: Context(std::forward<Args>(args)...)
, thread(name)
{
thread::start(&named_thread::entry_point, trampoline);
thread::start(trampoline);
}
// Lambda constructor, also the implicit deduction guide candidate
@ -358,7 +335,7 @@ public:
: Context(std::forward<Context>(f))
, thread(name)
{
thread::start(&named_thread::entry_point, trampoline);
thread::start(trampoline);
}
named_thread(const named_thread&) = delete;

View File

@ -469,6 +469,8 @@ void GLGSRender::on_exit()
GSRender::on_exit();
zcull_ctrl.release();
gl::set_primary_context_thread(false);
}
void GLGSRender::clear_surface(u32 arg)

View File

@ -11,16 +11,16 @@ namespace gl
capabilities g_driver_caps;
const fbo screen{};
thread_local bool tls_primary_context_thread = false;
static thread_local bool s_tls_primary_context_thread = false;
void set_primary_context_thread()
void set_primary_context_thread(bool value)
{
tls_primary_context_thread = true;
s_tls_primary_context_thread = value;
}
bool is_primary_context_thread()
{
return tls_primary_context_thread;
return s_tls_primary_context_thread;
}
GLenum draw_mode(rsx::primitive_type in)

View File

@ -55,7 +55,7 @@ namespace gl
bool is_primitive_native(rsx::primitive_type in);
GLenum draw_mode(rsx::primitive_type in);
void set_primary_context_thread();
void set_primary_context_thread(bool = true);
bool is_primary_context_thread();
// Texture helpers