Drastically reduce CPU time wasted by XMADecoderThread spinning; it went from 13% of all CPU time to about 0.6% in my tests.
Commented out the lock in WatchMemoryRange, since the lock is always held by the caller. Properly set the value/check the IRQL for spinlocks in xboxkrnl_threading.
This commit is contained in:
parent
ecf6bfbbdf
commit
efbeae660c
|
@ -177,14 +177,7 @@ void XmaDecoder::WorkerThreadMain() {
|
||||||
} else {
|
} else {
|
||||||
idle_loop_count = 0;
|
idle_loop_count = 0;
|
||||||
}
|
}
|
||||||
|
xe::threading::Wait(work_event_.get(), false);
|
||||||
if (idle_loop_count > 500) {
|
|
||||||
// Idle for an extended period. Introduce a 20ms wait.
|
|
||||||
xe::threading::Wait(work_event_.get(), false,
|
|
||||||
std::chrono::milliseconds(20));
|
|
||||||
}
|
|
||||||
|
|
||||||
xe::threading::MaybeYield();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -316,7 +309,7 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Signal the decoder thread to start processing.
|
// Signal the decoder thread to start processing.
|
||||||
work_event_->Set();
|
work_event_->SetBoostPriority();
|
||||||
} else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) {
|
} else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) {
|
||||||
// Context lock command.
|
// Context lock command.
|
||||||
// This requests a lock by flagging the context.
|
// This requests a lock by flagging the context.
|
||||||
|
|
|
@ -4357,7 +4357,7 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
uint32_t float_constant_index;
|
uint32_t float_constant_index;
|
||||||
while (xe::bit_scan_forward(float_constant_map_entry,
|
while (xe::bit_scan_forward(float_constant_map_entry,
|
||||||
&float_constant_index)) {
|
&float_constant_index)) {
|
||||||
float_constant_map_entry &= ~(1ull << float_constant_index);
|
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
|
||||||
std::memcpy(float_constants,
|
std::memcpy(float_constants,
|
||||||
&regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
|
&regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) +
|
||||||
(float_constant_index << 2)]
|
(float_constant_index << 2)]
|
||||||
|
@ -4388,7 +4388,7 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
uint32_t float_constant_index;
|
uint32_t float_constant_index;
|
||||||
while (xe::bit_scan_forward(float_constant_map_entry,
|
while (xe::bit_scan_forward(float_constant_map_entry,
|
||||||
&float_constant_index)) {
|
&float_constant_index)) {
|
||||||
float_constant_map_entry &= ~(1ull << float_constant_index);
|
float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry);
|
||||||
std::memcpy(float_constants,
|
std::memcpy(float_constants,
|
||||||
&regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
|
&regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) +
|
||||||
(float_constant_index << 2)]
|
(float_constant_index << 2)]
|
||||||
|
|
|
@ -680,9 +680,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
ID3D12Resource* readback_buffer_ = nullptr;
|
ID3D12Resource* readback_buffer_ = nullptr;
|
||||||
uint32_t readback_buffer_size_ = 0;
|
uint32_t readback_buffer_size_ = 0;
|
||||||
|
|
||||||
std::atomic<bool> pix_capture_requested_ = false;
|
|
||||||
bool pix_capturing_;
|
|
||||||
|
|
||||||
// The current fixed-function drawing state.
|
// The current fixed-function drawing state.
|
||||||
D3D12_VIEWPORT ff_viewport_;
|
D3D12_VIEWPORT ff_viewport_;
|
||||||
D3D12_RECT ff_scissor_;
|
D3D12_RECT ff_scissor_;
|
||||||
|
@ -776,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
// scratch memexport data
|
// scratch memexport data
|
||||||
MemExportRange memexport_ranges_[512];
|
MemExportRange memexport_ranges_[512];
|
||||||
uint32_t memexport_range_count_ = 0;
|
uint32_t memexport_range_count_ = 0;
|
||||||
|
|
||||||
|
std::atomic<bool> pix_capture_requested_ = false;
|
||||||
|
bool pix_capturing_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace d3d12
|
} // namespace d3d12
|
||||||
|
|
|
@ -150,8 +150,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
|
||||||
watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
|
watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
|
||||||
uint32_t bucket_last =
|
uint32_t bucket_last =
|
||||||
watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
|
watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
|
||||||
|
//chrispy: Not required the global lock is always held by the caller
|
||||||
auto global_lock = global_critical_region_.Acquire();
|
// auto global_lock = global_critical_region_.Acquire();
|
||||||
|
|
||||||
// Allocate the range.
|
// Allocate the range.
|
||||||
WatchRange* range = watch_range_first_free_;
|
WatchRange* range = watch_range_first_free_;
|
||||||
|
|
|
@ -957,13 +957,14 @@ static void PrefetchForCAS(const void* value) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
|
uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) {
|
||||||
// XELOGD(
|
// XELOGD(
|
||||||
// "KfAcquireSpinLock({:08X})",
|
// "KfAcquireSpinLock({:08X})",
|
||||||
// lock_ptr);
|
// lock_ptr);
|
||||||
PrefetchForCAS(lock);
|
PrefetchForCAS(lock);
|
||||||
|
assert_true(*lock != static_cast<uint32_t>(r13));
|
||||||
// Lock.
|
// Lock.
|
||||||
while (!xe::atomic_cas(0, 1, lock)) {
|
while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) {
|
||||||
// Spin!
|
// Spin!
|
||||||
// TODO(benvanik): error on deadlock?
|
// TODO(benvanik): error on deadlock?
|
||||||
xe::threading::MaybeYield();
|
xe::threading::MaybeYield();
|
||||||
|
@ -976,34 +977,51 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) {
|
||||||
return old_irql;
|
return old_irql;
|
||||||
}
|
}
|
||||||
|
|
||||||
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr) {
|
dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr,
|
||||||
|
ppc_context_t& ppc_context) {
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
return xeKeKfAcquireSpinLock(lock);
|
return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]);
|
||||||
}
|
}
|
||||||
DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking,
|
DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking,
|
||||||
kHighFrequency);
|
kHighFrequency);
|
||||||
|
|
||||||
void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
|
void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) {
|
||||||
|
// Unlock.
|
||||||
|
*lock = 0;
|
||||||
|
if (old_irql >= 2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
// Restore IRQL.
|
// Restore IRQL.
|
||||||
XThread* thread = XThread::GetCurrentThread();
|
XThread* thread = XThread::GetCurrentThread();
|
||||||
thread->LowerIrql(old_irql);
|
thread->LowerIrql(old_irql);
|
||||||
|
|
||||||
// Unlock.
|
|
||||||
xe::atomic_dec(lock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) {
|
void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql,
|
||||||
|
ppc_context_t& ppc_ctx) {
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
xeKeKfReleaseSpinLock(lock, old_irql);
|
|
||||||
|
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
|
|
||||||
|
*lock_ptr = 0;
|
||||||
|
if (old_irql >= 2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Restore IRQL.
|
||||||
|
XThread* thread = XThread::GetCurrentThread();
|
||||||
|
thread->LowerIrql(old_irql);
|
||||||
}
|
}
|
||||||
DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
|
DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
|
||||||
kHighFrequency);
|
kHighFrequency);
|
||||||
// todo: this is not accurate
|
// todo: this is not accurate
|
||||||
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr,
|
||||||
|
ppc_context_t& ppc_ctx) {
|
||||||
// Lock.
|
// Lock.
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
|
// must not be our own thread
|
||||||
|
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
|
|
||||||
PrefetchForCAS(lock);
|
PrefetchForCAS(lock);
|
||||||
while (!xe::atomic_cas(0, 1, lock)) {
|
while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
|
||||||
#if XE_ARCH_AMD64 == 1
|
#if XE_ARCH_AMD64 == 1
|
||||||
// todo: this is just a nop if they don't have SMT, which is not great
|
// todo: this is just a nop if they don't have SMT, which is not great
|
||||||
// either...
|
// either...
|
||||||
|
@ -1017,11 +1035,13 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||||
DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
|
DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading,
|
||||||
kImplemented, kBlocking, kHighFrequency);
|
kImplemented, kBlocking, kHighFrequency);
|
||||||
|
|
||||||
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(
|
||||||
|
lpdword_t lock_ptr, ppc_context_t& ppc_ctx) {
|
||||||
// Lock.
|
// Lock.
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
|
assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
PrefetchForCAS(lock);
|
PrefetchForCAS(lock);
|
||||||
if (!xe::atomic_cas(0, 1, lock)) {
|
if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -1029,10 +1049,12 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
|
||||||
DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
|
DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading,
|
||||||
kImplemented, kBlocking, kHighFrequency, kSketchy);
|
kImplemented, kBlocking, kHighFrequency, kSketchy);
|
||||||
|
|
||||||
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr) {
|
void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr,
|
||||||
|
ppc_context_t& ppc_ctx) {
|
||||||
// Unlock.
|
// Unlock.
|
||||||
|
assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13]));
|
||||||
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
|
||||||
xe::atomic_dec(lock);
|
*lock_ptr = 0;
|
||||||
}
|
}
|
||||||
DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading,
|
DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading,
|
||||||
kImplemented, kHighFrequency);
|
kImplemented, kHighFrequency);
|
||||||
|
@ -1261,8 +1283,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
|
||||||
}
|
}
|
||||||
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);
|
DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented);
|
||||||
|
|
||||||
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr) {
|
void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
|
||||||
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
|
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
|
||||||
|
|
||||||
int32_t lock_count = ++lock_ptr->lock_count;
|
int32_t lock_count = ++lock_ptr->lock_count;
|
||||||
if (!lock_count) {
|
if (!lock_count) {
|
||||||
|
@ -1279,8 +1301,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading,
|
||||||
kImplemented, kBlocking);
|
kImplemented, kBlocking);
|
||||||
|
|
||||||
dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
|
dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
|
||||||
pointer_t<X_ERWLOCK> lock_ptr) {
|
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
|
||||||
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
|
auto old_irql =
|
||||||
|
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
|
||||||
|
|
||||||
uint32_t result;
|
uint32_t result;
|
||||||
if (lock_ptr->lock_count < 0) {
|
if (lock_ptr->lock_count < 0) {
|
||||||
|
@ -1296,8 +1319,9 @@ dword_result_t ExTryToAcquireReadWriteLockExclusive_entry(
|
||||||
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
|
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading,
|
||||||
kImplemented);
|
kImplemented);
|
||||||
|
|
||||||
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr) {
|
void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr,
|
||||||
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
|
ppc_context_t& ppc_context) {
|
||||||
|
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
|
||||||
|
|
||||||
int32_t lock_count = ++lock_ptr->lock_count;
|
int32_t lock_count = ++lock_ptr->lock_count;
|
||||||
if (!lock_count ||
|
if (!lock_count ||
|
||||||
|
@ -1316,8 +1340,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented,
|
||||||
kBlocking);
|
kBlocking);
|
||||||
|
|
||||||
dword_result_t ExTryToAcquireReadWriteLockShared_entry(
|
dword_result_t ExTryToAcquireReadWriteLockShared_entry(
|
||||||
pointer_t<X_ERWLOCK> lock_ptr) {
|
pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) {
|
||||||
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
|
auto old_irql =
|
||||||
|
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
|
||||||
|
|
||||||
uint32_t result;
|
uint32_t result;
|
||||||
if (lock_ptr->lock_count < 0 ||
|
if (lock_ptr->lock_count < 0 ||
|
||||||
|
@ -1335,8 +1360,10 @@ dword_result_t ExTryToAcquireReadWriteLockShared_entry(
|
||||||
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
|
DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading,
|
||||||
kImplemented);
|
kImplemented);
|
||||||
|
|
||||||
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) {
|
void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr,
|
||||||
auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock);
|
ppc_context_t& ppc_context) {
|
||||||
|
auto old_irql =
|
||||||
|
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]);
|
||||||
|
|
||||||
int32_t lock_count = --lock_ptr->lock_count;
|
int32_t lock_count = --lock_ptr->lock_count;
|
||||||
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ struct X_KTHREAD {
|
||||||
uint8_t unk_58[0x4]; // 0x58
|
uint8_t unk_58[0x4]; // 0x58
|
||||||
xe::be<uint32_t> stack_base; // 0x5C
|
xe::be<uint32_t> stack_base; // 0x5C
|
||||||
xe::be<uint32_t> stack_limit; // 0x60
|
xe::be<uint32_t> stack_limit; // 0x60
|
||||||
uint8_t unk_64[0x4]; // 0x64
|
xe::be<uint32_t> stack_kernel; // 0x64
|
||||||
xe::be<uint32_t> tls_address; // 0x68
|
xe::be<uint32_t> tls_address; // 0x68
|
||||||
uint8_t unk_6C; // 0x6C
|
uint8_t unk_6C; // 0x6C
|
||||||
uint8_t unk_6D[0x7]; // 0x6D
|
uint8_t unk_6D[0x7]; // 0x6D
|
||||||
|
|
Loading…
Reference in New Issue