Drastically reduce CPU time wasted by XMADecoderThread spinning; in my tests it went from 13% of all CPU time to about 0.6%

Commented out the lock acquisition in WatchMemoryRange; the lock is always held by the caller
Properly set the lock value and check the IRQL for spinlocks in xboxkrnl_threading
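
For reference, the worker-thread change amounts to replacing a poll/yield/sleep idle loop with a single blocking wait on the work event, so the thread consumes essentially no CPU while there is nothing to decode. A minimal standalone sketch of that pattern, using std::condition_variable rather than Xenia's xe::threading primitives (the names here are illustrative only):

```cpp
// Sketch only: a worker that blocks on a condition variable instead of
// spinning/yielding when idle. WorkQueue and worker_main are not Xenia names.
#include <condition_variable>
#include <mutex>
#include <queue>

struct WorkQueue {
  std::mutex m;
  std::condition_variable cv;
  std::queue<int> items;
  bool shutdown = false;

  void push(int v) {
    {
      std::lock_guard<std::mutex> lk(m);
      items.push(v);
    }
    cv.notify_one();  // analogous to signaling the work event
  }

  void worker_main() {
    for (;;) {
      std::unique_lock<std::mutex> lk(m);
      // Block until there is work or shutdown; no yield/poll loop, so the
      // thread uses essentially zero CPU while idle.
      cv.wait(lk, [this] { return shutdown || !items.empty(); });
      if (shutdown) return;
      int item = items.front();
      items.pop();
      lk.unlock();
      (void)item;  // ... decode the item outside the lock ...
    }
  }
};
```

In this sketch the producer side corresponds to the register write below that signals the work event: push()/notify_one() plays the role of work_event_->Set().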
chss95cs@gmail.com 2022-10-15 03:07:07 -07:00
parent ecf6bfbbdf
commit efbeae660c
6 changed files with 62 additions and 42 deletions

View File

@@ -177,14 +177,7 @@ void XmaDecoder::WorkerThreadMain() {
} else {
idle_loop_count = 0;
}
if (idle_loop_count > 500) {
// Idle for an extended period. Introduce a 20ms wait.
xe::threading::Wait(work_event_.get(), false,
std::chrono::milliseconds(20));
}
xe::threading::MaybeYield();
xe::threading::Wait(work_event_.get(), false);
}
}
@@ -316,7 +309,7 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
}
}
// Signal the decoder thread to start processing.
work_event_->Set();
work_event_->SetBoostPriority();
} else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) {
// Context lock command.
// This requests a lock by flagging the context.

View File

@@ -4357,7 +4357,7 @@ bool D3D12CommandProcessor::UpdateBindings(
uint32_t float_constant_index;
while (xe::bit_scan_forward(float_constant_map_entry,
&float_constant_index)) {
float_constant_map_entry &= ~(1ull << float_constant_index);
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
std::memcpy(float_constants,
&regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
(float_constant_index << 2)]
@@ -4388,7 +4388,7 @@ bool D3D12CommandProcessor::UpdateBindings(
uint32_t float_constant_index;
while (xe::bit_scan_forward(float_constant_map_entry,
&float_constant_index)) {
float_constant_map_entry &= ~(1ull << float_constant_index);
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
std::memcpy(float_constants,
&regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
(float_constant_index << 2)]
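
Both UpdateBindings hunks swap the explicit shift-and-mask for xe::clear_lowest_bit. Because bit_scan_forward yields the index of the lowest set bit, clearing that bit is equivalent to the classic x & (x - 1) trick (what BMI1's BLSR instruction does); a small sketch of the equivalence, with clear_lowest_bit approximated here rather than taken from Xenia's headers:

```cpp
// Sketch: clearing the lowest set bit without recomputing a shift mask.
// The equivalence holds because bit_scan_forward(x) is by definition the
// index of the lowest set bit of x, so ~(1ull << index) removes exactly
// the bit that x & (x - 1) removes.
#include <cassert>
#include <cstdint>

constexpr uint64_t clear_lowest_bit_sketch(uint64_t x) {
  return x & (x - 1);  // BLSR does this in a single instruction
}

int main() {
  uint64_t map = 0b1011000;  // bits 3, 4, 6 set
  // Old form: scan gives index 3, then mask that bit out explicitly.
  uint64_t old_form = map & ~(1ull << 3);
  // New form: strip the lowest set bit directly.
  uint64_t new_form = clear_lowest_bit_sketch(map);
  assert(old_form == new_form && new_form == 0b1010000);
  return 0;
}
```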

View File

@@ -680,9 +680,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
ID3D12Resource* readback_buffer_ = nullptr;
uint32_t readback_buffer_size_ = 0;
std::atomic<bool> pix_capture_requested_ = false;
bool pix_capturing_;
// The current fixed-function drawing state.
D3D12_VIEWPORT ff_viewport_;
D3D12_RECT ff_scissor_;
@@ -776,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
// scratch memexport data
MemExportRange memexport_ranges_[512];
uint32_t memexport_range_count_ = 0;
std::atomic<bool> pix_capture_requested_ = false;
bool pix_capturing_;
};
} // namespace d3d12

View File

@@ -150,8 +150,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
uint32_t bucket_last =
watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
auto global_lock = global_critical_region_.Acquire();
// chrispy: Not required; the global lock is always held by the caller
// auto global_lock = global_critical_region_.Acquire();
// Allocate the range.
WatchRange* range = watch_range_first_free_;
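
Removing the acquire in WatchMemoryRange leans on a precondition: every caller already holds the global critical region. A hedged sketch of one way to keep that precondition machine-checked in debug builds (CheckedRecursiveLock and AssertHeldByCaller are illustrative only, not part of Xenia's global_critical_region API):

```cpp
// Sketch: a lock wrapper whose AssertHeldByCaller() documents and enforces a
// "caller already holds the lock" precondition instead of re-acquiring.
#include <atomic>
#include <cassert>
#include <mutex>
#include <thread>

class CheckedRecursiveLock {
 public:
  void lock() {
    mutex_.lock();
    owner_.store(std::this_thread::get_id());
    ++depth_;
  }
  void unlock() {
    if (--depth_ == 0) owner_.store(std::thread::id());
    mutex_.unlock();
  }
  // Call at the top of functions that require the lock to be held by the
  // caller, rather than taking the lock again.
  void AssertHeldByCaller() const {
    assert(owner_.load() == std::this_thread::get_id());
  }

 private:
  std::recursive_mutex mutex_;
  std::atomic<std::thread::id> owner_{};
  int depth_ = 0;  // only touched while mutex_ is held
};
```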

View File

@@ -957,13 +957,14 @@ static void PrefetchForCAS(const void* value) {
}
}
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
// XELOGD(
// "KfAcquireSpinLock({:08X})",
// lock_ptr);
PrefetchForCAS(lock);
assert_true(*lock != static_cast<uint32_t>(r13));
// Lock.
while (!xe::atomic_cas(0, 1, lock)) {
while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) {
// Spin!
// TODO(benvanik): error on deadlock?
xe::threading::MaybeYield();
@@ -976,34 +977,51 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
return old_irql;
}
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr) {
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_context) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
return xeKeKfAcquireSpinLock(lock);
return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
}
DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking,
kHighFrequency);
void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
// Unlock.
*lock = 0;
if (old_irql >= 2) {
return;
}
// Restore IRQL.
XThread* thread = XThread::GetCurrentThread();
thread->LowerIrql(old_irql);
// Unlock.
xe::atomic_dec(lock);
}
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) {
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
ppc_context_t& ppc_ctx) {
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
xeKeKfReleaseSpinLock(lock, old_irql);
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
*lock_ptr = 0;
if (old_irql >= 2) {
return;
}
// Restore IRQL.
XThread* thread = XThread::GetCurrentThread();
thread->LowerIrql(old_irql);
}
DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
kHighFrequency);
// todo: this is not accurate
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) {
// Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
// must not be our own thread
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock);
while (!xe::atomic_cas(0, 1, lock)) {
while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
#if XE_ARCH_AMD64 == 1
// todo: this is just a nop if they don't have SMT, which is not great
// either...
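
The todo above concerns the spin-wait hint: on x86-64 the conventional choice is _mm_pause inside the retry loop, which mostly benefits the sibling hardware thread on SMT parts and is close to a no-op otherwise. A generic sketch of a CAS spin loop with that hint (not the emulator's actual helper):

```cpp
// Sketch: a spin-acquire loop that issues a pause hint between CAS attempts.
#include <atomic>
#include <cstdint>
#include <thread>
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>  // _mm_pause
#endif

inline void cpu_relax() {
#if defined(__x86_64__) || defined(_M_X64)
  _mm_pause();  // hint to the core that we are busy-waiting
#else
  std::this_thread::yield();
#endif
}

inline void spin_until_acquired(std::atomic<uint32_t>& lock,
                                uint32_t owner_tag) {
  uint32_t expected = 0;
  while (!lock.compare_exchange_weak(expected, owner_tag,
                                     std::memory_order_acquire)) {
    expected = 0;  // compare_exchange_weak rewrote it with the current value
    cpu_relax();
  }
}
```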
@@ -1017,11 +1035,13 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency);
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
lpdword_t lock_ptr, ppc_context_t& ppc_ctx) {
// Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
PrefetchForCAS(lock);
if (!xe::atomic_cas(0, 1, lock)) {
if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
return 0;
}
return 1;
@@ -1029,10 +1049,12 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
kImplemented, kBlocking, kHighFrequency, kSketchy);
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr) {
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
ppc_context_t& ppc_ctx) {
// Unlock.
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
xe::atomic_dec(lock);
*lock_ptr = 0;
}
DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading,
kImplemented, kHighFrequency);
@@ -1261,8 +1283,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
}
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count;
if (!lock_count) {
@@ -1279,8 +1301,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
kImplemented, kBlocking);
dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
pointer_t<X_ERWLOCK> lock_ptr) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
uint32_t result;
if (lock_ptr->lock_count < 0) {
@@ -1296,8 +1319,9 @@ dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
kImplemented);
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = ++lock_ptr->lock_count;
if (!lock_count ||
@@ -1316,8 +1340,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
kBlocking);
dword_result_t ExTryToAcquireReadWriteLockShared_entry(
pointer_t<X_ERWLOCK> lock_ptr) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
uint32_t result;
if (lock_ptr->lock_count < 0 ||
@@ -1335,8 +1360,10 @@ dword_result_t ExTryToAcquireReadWriteLockShared_entry(
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
kImplemented);
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
ppc_context_t& ppc_context) {
auto old_irql =
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
int32_t lock_count = --lock_ptr->lock_count;
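
The common thread in these spinlock hunks is that the lock word is now tagged with r13 (which for these guest threads appears to carry the per-thread PCR pointer) instead of the constant 1, and released with a plain store of 0 rather than an atomic decrement; that makes self-deadlock and wrong-owner release assertable. A reduced sketch of an owner-tagged spinlock, independent of guest registers:

```cpp
// Sketch: a spinlock whose lock word stores an owner tag rather than 1.
// Asserting *lock != my_tag on acquire catches recursive acquisition, and
// asserting *lock == my_tag on release catches releasing someone else's lock.
#include <atomic>
#include <cassert>
#include <cstdint>

struct TaggedSpinLock {
  std::atomic<uint32_t> word{0};

  void Acquire(uint32_t my_tag) {
    assert(word.load(std::memory_order_relaxed) != my_tag &&
           "recursive acquire would deadlock");
    uint32_t expected = 0;
    while (!word.compare_exchange_weak(expected, my_tag,
                                       std::memory_order_acquire)) {
      expected = 0;
    }
  }

  void Release(uint32_t my_tag) {
    assert(word.load(std::memory_order_relaxed) == my_tag &&
           "released by a non-owner");
    // A plain store of 0 suffices; no atomic decrement is needed since the
    // word is either 0 (free) or the owner's tag (held).
    word.store(0, std::memory_order_release);
  }
};
```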

View File

@@ -100,7 +100,7 @@ struct X_KTHREAD {
uint8_t unk_58[0x4]; // 0x58
xe::be<uint32_t> stack_base; // 0x5C
xe::be<uint32_t> stack_limit; // 0x60
uint8_t unk_64[0x4]; // 0x64
xe::be<uint32_t> stack_kernel; // 0x64
xe::be<uint32_t> tls_address; // 0x68
uint8_t unk_6C; // 0x6C
uint8_t unk_6D[0x7]; // 0x6D
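
With unk_64 renamed to stack_kernel, the annotated offsets around it can be pinned at compile time. A simplified sketch of that kind of layout check (the struct below is a stand-in with plain uint32_t fields, not the real X_KTHREAD with its xe::be<> members):

```cpp
// Sketch: compile-time checks that keep the annotated guest offsets honest.
#include <cstddef>
#include <cstdint>

struct GuestThreadLayoutSketch {
  uint8_t unk_00[0x5C];  // 0x00..0x5B, collapsed for the sketch
  uint32_t stack_base;   // 0x5C
  uint32_t stack_limit;  // 0x60
  uint32_t stack_kernel; // 0x64 (was unk_64)
  uint32_t tls_address;  // 0x68
};

static_assert(offsetof(GuestThreadLayoutSketch, stack_kernel) == 0x64,
              "stack_kernel must sit at the documented guest offset");
static_assert(offsetof(GuestThreadLayoutSketch, tls_address) == 0x68,
              "tls_address must sit at the documented guest offset");
```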