Drastically reduce CPU time wasted by XMADecoderThread spinning; went from 13% of all CPU time to about 0.6% in my tests.
Commented out the lock in WatchMemoryRange, since the global lock is always held by the caller. Properly set the lock value / check the IRQL for spinlocks in xboxkrnl_threading.
This commit is contained in:
parent
ecf6bfbbdf
commit
efbeae660c
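The core of the XMA change below: instead of looping over the contexts and calling MaybeYield() (with only a 20 ms timed wait after ~500 idle iterations), the worker now blocks on its work event until a register write signals new work. A minimal sketch of that pattern using standard C++ primitives; the real code uses xe::threading::Event and xe::threading::Wait, and the auto-reset behaviour below is an assumption about how that event behaves:

#include <condition_variable>
#include <mutex>

// Hypothetical stand-in for an auto-reset work event.
class AutoResetEvent {
 public:
  void Set() {
    std::lock_guard<std::mutex> guard(mutex_);
    signaled_ = true;
    cv_.notify_one();
  }
  void Wait() {
    std::unique_lock<std::mutex> guard(mutex_);
    cv_.wait(guard, [this] { return signaled_; });
    signaled_ = false;  // auto-reset: one waiter consumes the signal
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  bool signaled_ = false;
};

// Shape of the worker loop after the change: do any pending work, then sleep
// until the producer signals, instead of spinning with MaybeYield() while idle.
void WorkerLoop(AutoResetEvent& work_event, const bool& running) {
  while (running) {
    // ... decode whatever contexts currently have work queued ...
    work_event.Wait();  // the producer calls work_event.Set() when new work arrives
  }
}

The producer side is the WriteRegister hunk further down, where kicking a context now signals the event via work_event_->SetBoostPriority().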
@@ -177,14 +177,7 @@ void XmaDecoder::WorkerThreadMain() {
     } else {
       idle_loop_count = 0;
     }

-    if (idle_loop_count > 500) {
-      // Idle for an extended period. Introduce a 20ms wait.
-      xe::threading::Wait(work_event_.get(), false,
-                          std::chrono::milliseconds(20));
-    }
-
-    xe::threading::MaybeYield();
+    xe::threading::Wait(work_event_.get(), false);
   }
 }
@@ -316,7 +309,7 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
       }
     }
     // Signal the decoder thread to start processing.
-    work_event_->Set();
+    work_event_->SetBoostPriority();
   } else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) {
     // Context lock command.
     // This requests a lock by flagging the context.
@@ -4357,7 +4357,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry &= ~(1ull << float_constant_index);
+        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
                           (float_constant_index << 2)]
@@ -4388,7 +4388,7 @@ bool D3D12CommandProcessor::UpdateBindings(
       uint32_t float_constant_index;
       while (xe::bit_scan_forward(float_constant_map_entry,
                                   &float_constant_index)) {
-        float_constant_map_entry &= ~(1ull << float_constant_index);
+        float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
         std::memcpy(float_constants,
                     &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
                           (float_constant_index << 2)]
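The two UpdateBindings hunks above swap the manual mask update for xe::clear_lowest_bit. Assuming that helper is the usual v & (v - 1) idiom (a single BLSR instruction on x86-64), it clears exactly the bit that bit_scan_forward just reported without re-materialising 1ull << float_constant_index. A standalone sketch of the equivalence, with clear_lowest_bit as a local stand-in rather than the real xe:: helper:

#include <cassert>
#include <cstdint>

// Stand-in for what xe::clear_lowest_bit presumably does.
inline uint64_t clear_lowest_bit(uint64_t v) { return v & (v - 1); }

int main() {
  uint64_t map = 0b101100;  // lowest set bit is bit 2
  // Clearing the lowest set bit is the same as masking it off by index.
  assert(clear_lowest_bit(map) == (map & ~(1ull << 2)));
  return 0;
}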
@@ -680,9 +680,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
   ID3D12Resource* readback_buffer_ = nullptr;
   uint32_t readback_buffer_size_ = 0;

-  std::atomic<bool> pix_capture_requested_ = false;
-  bool pix_capturing_;
-
   // The current fixed-function drawing state.
   D3D12_VIEWPORT ff_viewport_;
   D3D12_RECT ff_scissor_;
@@ -776,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
   // scratch memexport data
   MemExportRange memexport_ranges_[512];
   uint32_t memexport_range_count_ = 0;
+
+  std::atomic<bool> pix_capture_requested_ = false;
+  bool pix_capturing_;
 };

 }  // namespace d3d12
@@ -150,8 +150,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
       watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
   uint32_t bucket_last =
       watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;

-  auto global_lock = global_critical_region_.Acquire();
+  //chrispy: Not required the global lock is always held by the caller
+  // auto global_lock = global_critical_region_.Acquire();

   // Allocate the range.
   WatchRange* range = watch_range_first_free_;
@@ -957,13 +957,14 @@ static void PrefetchForCAS(const void* value) {
   }
 }

-uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
+uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
   // XELOGD(
   //     "KfAcquireSpinLock({:08X})",
   //     lock_ptr);
   PrefetchForCAS(lock);
+  assert_true(*lock != static_cast<uint32_t>(r13));
   // Lock.
-  while (!xe::atomic_cas(0, 1, lock)) {
+  while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) {
     // Spin!
     // TODO(benvanik): error on deadlock?
     xe::threading::MaybeYield();
@@ -976,34 +977,51 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
   return old_irql;
 }

-dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr) {
+dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
+                                       ppc_context_t& ppc_context) {
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
-  return xeKeKfAcquireSpinLock(lock);
+  return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
 }
 DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking,
                          kHighFrequency);

 void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
+  // Unlock.
+  *lock = 0;
+  if (old_irql >= 2) {
+    return;
+  }
   // Restore IRQL.
   XThread* thread = XThread::GetCurrentThread();
   thread->LowerIrql(old_irql);
-
-  // Unlock.
-  xe::atomic_dec(lock);
 }

-void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) {
+void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
+                             ppc_context_t& ppc_ctx) {
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
-  xeKeKfReleaseSpinLock(lock, old_irql);
+
+  assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
+
+  *lock_ptr = 0;
+  if (old_irql >= 2) {
+    return;
+  }
+  // Restore IRQL.
+  XThread* thread = XThread::GetCurrentThread();
+  thread->LowerIrql(old_irql);
 }
 DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
                          kHighFrequency);
+// todo: this is not accurate
-void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
+void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
+                                         ppc_context_t& ppc_ctx) {
   // Lock.
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
+  // must not be our own thread
+  assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
+
   PrefetchForCAS(lock);
-  while (!xe::atomic_cas(0, 1, lock)) {
+  while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
 #if XE_ARCH_AMD64 == 1
     // todo: this is just a nop if they don't have SMT, which is not great
     // either...
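In the spinlock hunks above, the lock word goes from holding a generic 1 to holding the caller's r13 (on the 360, r13 conventionally points at the guest's per-processor/thread control region, so it identifies the acquirer; treating it as a unique owner tag is the reading assumed here). That is what lets the acquire paths assert the lock is not already held by the current thread and the release paths assert that it is. A condensed host-side sketch of the idea, independent of the HLE entry-point plumbing:

#include <atomic>
#include <cassert>
#include <cstdint>

// owner_tag would be the guest's r13 value (truncated to 32 bits), acting as a
// per-thread identifier.
void AcquireSpinLock(std::atomic<uint32_t>* lock, uint32_t owner_tag) {
  assert(lock->load(std::memory_order_relaxed) != owner_tag);  // no recursive acquire
  uint32_t expected = 0;
  while (!lock->compare_exchange_weak(expected, owner_tag,
                                      std::memory_order_acquire)) {
    expected = 0;  // compare_exchange overwrote it with the current holder's tag
  }
}

void ReleaseSpinLock(std::atomic<uint32_t>* lock, uint32_t owner_tag) {
  assert(lock->load(std::memory_order_relaxed) == owner_tag);  // we must own it
  lock->store(0, std::memory_order_release);
}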
@@ -1017,11 +1035,13 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
 DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
                          kImplemented, kBlocking, kHighFrequency);

-dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
+dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
+    lpdword_t lock_ptr, ppc_context_t& ppc_ctx) {
   // Lock.
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
+  assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
   PrefetchForCAS(lock);
-  if (!xe::atomic_cas(0, 1, lock)) {
+  if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
     return 0;
   }
   return 1;
@@ -1029,10 +1049,12 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
 DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
                          kImplemented, kBlocking, kHighFrequency, kSketchy);

-void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr) {
+void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
+                                           ppc_context_t& ppc_ctx) {
   // Unlock.
+  assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
-  xe::atomic_dec(lock);
+  *lock_ptr = 0;
 }
 DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading,
                          kImplemented, kHighFrequency);
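KeTryToAcquireSpinLockAtRaisedIrql above keeps the same owner tagging but returns failure instead of spinning. In the same host-side style as the previous sketch (TryAcquireSpinLock is illustrative, not the emulator's API):

#include <atomic>
#include <cstdint>

// Returns true if the lock was free and is now tagged with owner_tag.
bool TryAcquireSpinLock(std::atomic<uint32_t>* lock, uint32_t owner_tag) {
  uint32_t expected = 0;
  return lock->compare_exchange_strong(expected, owner_tag,
                                       std::memory_order_acquire);
}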
@@ -1261,8 +1283,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
 }
 DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);

-void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
+void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
+  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count) {
@@ -1279,8 +1301,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
                          kImplemented, kBlocking);

 dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
-    pointer_t<X_ERWLOCK> lock_ptr) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
+    pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   uint32_t result;
   if (lock_ptr->lock_count < 0) {
@@ -1296,8 +1319,9 @@ dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
 DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
                          kImplemented);

-void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
+void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
+                                        ppc_context_t& ppc_context) {
+  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = ++lock_ptr->lock_count;
   if (!lock_count ||
@@ -1316,8 +1340,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
                          kBlocking);

 dword_result_t ExTryToAcquireReadWriteLockShared_entry(
-    pointer_t<X_ERWLOCK> lock_ptr) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
+    pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   uint32_t result;
   if (lock_ptr->lock_count < 0 ||
@@ -1335,8 +1360,10 @@ dword_result_t ExTryToAcquireReadWriteLockShared_entry(
 DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
                          kImplemented);

-void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
-  auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
+void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
+                                  ppc_context_t& ppc_context) {
+  auto old_irql =
+      xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);

   int32_t lock_count = --lock_ptr->lock_count;

@@ -100,7 +100,7 @@ struct X_KTHREAD {
   uint8_t unk_58[0x4];          // 0x58
   xe::be<uint32_t> stack_base;  // 0x5C
   xe::be<uint32_t> stack_limit; // 0x60
-  uint8_t unk_64[0x4];          // 0x64
+  xe::be<uint32_t> stack_kernel; // 0x64
   xe::be<uint32_t> tls_address; // 0x68
   uint8_t unk_6C;               // 0x6C
   uint8_t unk_6D[0x7];          // 0x6D
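The final hunk names the previously unknown field at 0x64 (stack_kernel) without moving anything around it. A standalone check of the offsets implied by the comments; the struct below is a hypothetical plain-integer mirror for illustration, not the real X_KTHREAD, which uses xe::be<> fields:

#include <cstddef>
#include <cstdint>

struct KThreadLayoutSketch {
  uint8_t head[0x5C];     // everything before 0x5C elided
  uint32_t stack_base;    // 0x5C
  uint32_t stack_limit;   // 0x60
  uint32_t stack_kernel;  // 0x64, previously uint8_t unk_64[0x4]
  uint32_t tls_address;   // 0x68
  uint8_t unk_6C;         // 0x6C
  uint8_t unk_6D[0x7];    // 0x6D
};

static_assert(offsetof(KThreadLayoutSketch, stack_kernel) == 0x64,
              "stack_kernel keeps the old unk_64 offset");
static_assert(offsetof(KThreadLayoutSketch, tls_address) == 0x68,
              "renaming the 0x64 field does not shift tls_address");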