diff --git a/.gitignore b/.gitignore index 140103405..34791bda2 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,5 @@ node_modules/.bin/ /third_party/binutils/binutils* /third_party/vasm/ /tools/shader-playground/*.dll +/profile_print_times.py +/profile_times.txt diff --git a/.gitmodules b/.gitmodules index b4e3119b2..6c356ec38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -36,7 +36,7 @@ url = https://github.com/skystrife/cpptoml.git [submodule "third_party/cxxopts"] path = third_party/cxxopts - url = https://github.com/jarro2783/cxxopts.git + url = https://github.com/chrisps/cxxopts.git [submodule "third_party/SDL2"] path = third_party/SDL2 url = https://github.com/libsdl-org/SDL.git diff --git a/src/xenia/apu/xma_decoder.cc b/src/xenia/apu/xma_decoder.cc index 43b82ea73..eac5d3d53 100644 --- a/src/xenia/apu/xma_decoder.cc +++ b/src/xenia/apu/xma_decoder.cc @@ -177,14 +177,7 @@ void XmaDecoder::WorkerThreadMain() { } else { idle_loop_count = 0; } - - if (idle_loop_count > 500) { - // Idle for an extended period. Introduce a 20ms wait. - xe::threading::Wait(work_event_.get(), false, - std::chrono::milliseconds(20)); - } - - xe::threading::MaybeYield(); + xe::threading::Wait(work_event_.get(), false); } } @@ -316,7 +309,7 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) { } } // Signal the decoder thread to start processing. - work_event_->Set(); + work_event_->SetBoostPriority(); } else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) { // Context lock command. // This requests a lock by flagging the context. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 03b8b4abd..3ba47cad4 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -405,9 +405,10 @@ void X64Emitter::EmitProfilerEpilogue() { if (cvars::instrument_call_times) { uint64_t* profiler_entry = backend()->GetProfilerRecordForFunction(current_guest_function_); + mov(ecx, 0x7ffe0014); mov(rdx, qword[rcx]); - mov(rbx, (uintptr_t)profiler_entry); + mov(r10, (uintptr_t)profiler_entry); sub(rdx, qword[rsp + StackLayout::GUEST_PROFILER_START]); // atomic add our time to the profiler entry @@ -416,7 +417,8 @@ void X64Emitter::EmitProfilerEpilogue() { // this a few cycles less intrusive, but its good enough for now // actually... lets just try without atomics lol // lock(); - add(qword[rbx], rdx); + add(qword[r10], rdx); + } #endif } diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index b98ae80d5..7a5935001 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1872,20 +1872,20 @@ Value* HIRBuilder::AndNot(Value* value1, Value* value2) { ASSERT_NON_FLOAT_TYPE(value1); ASSERT_NON_FLOAT_TYPE(value2); ASSERT_TYPES_EQUAL(value1, value2); - - if (value1 == value2) { + // only other type it can be used with is INT64_TYPE (andc) + if (value1->type != VEC128_TYPE) { + return this->And(this->Not(value2), value1); + } else if (value1 == value2) { return LoadZero(value1->type); - } else if (value1->IsConstantZero()) { - return value1; - } else if (value2->IsConstantZero()) { + } else if (value1->IsConstantZero() || value2->IsConstantZero()) { return value1; + } else { + Instr* i = AppendInstr(OPCODE_AND_NOT_info, 0, AllocValue(value1->type)); + i->set_src1(value1); + i->set_src2(value2); + i->src3.value = NULL; + return i->dest; } - - Instr* i = AppendInstr(OPCODE_AND_NOT_info, 0, AllocValue(value1->type)); - i->set_src1(value1); - i->set_src2(value2); - i->src3.value = NULL; - return i->dest; } Value* HIRBuilder::Or(Value* value1, Value* value2) { diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 04ddbfbe1..9f00648b0 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -4355,7 +4355,7 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry &= ~(1ull << float_constant_index); + float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + (float_constant_index << 2)] @@ -4386,7 +4386,7 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry &= ~(1ull << float_constant_index); + float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, ®s[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + (float_constant_index << 2)] @@ -4877,7 +4877,7 @@ bool D3D12CommandProcessor::UpdateBindings_BindfulPath( bool& retflag) { retflag = true; auto& provider = this->GetD3D12Provider(); - size_t texture_count_pixel = textures_pixel->size(); + size_t texture_count_pixel = textures_pixel ? textures_pixel->size() : 0; size_t texture_count_vertex = textures_vertex.size(); // // Bindful descriptors path. diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 9412116ac..53a23add8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -680,9 +680,6 @@ class D3D12CommandProcessor final : public CommandProcessor { ID3D12Resource* readback_buffer_ = nullptr; uint32_t readback_buffer_size_ = 0; - std::atomic pix_capture_requested_ = false; - bool pix_capturing_; - // The current fixed-function drawing state. D3D12_VIEWPORT ff_viewport_; D3D12_RECT ff_scissor_; @@ -776,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor { // scratch memexport data MemExportRange memexport_ranges_[512]; uint32_t memexport_range_count_ = 0; + + std::atomic pix_capture_requested_ = false; + bool pix_capturing_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index c15da8a9b..b891b5f38 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -150,8 +150,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2; uint32_t bucket_last = watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); + //chrispy: Not required the global lock is always held by the caller + // auto global_lock = global_critical_region_.Acquire(); // Allocate the range. WatchRange* range = watch_range_first_free_; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc index 000e75a1a..3a0d88ea4 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc @@ -188,6 +188,12 @@ void RtlInitAnsiString_entry(pointer_t destination, destination->pointer = source.guest_address(); } DECLARE_XBOXKRNL_EXPORT1(RtlInitAnsiString, kNone, kImplemented); +//https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/wdm/nf-wdm-rtlupcaseunicodechar +dword_result_t RtlUpcaseUnicodeChar_entry(dword_t SourceCharacter) { + return std::use_facet>(std::locale()).toupper(SourceCharacter); +} +DECLARE_XBOXKRNL_EXPORT1(RtlUpcaseUnicodeChar, kNone, kImplemented); + // https://msdn.microsoft.com/en-us/library/ff561899 void RtlFreeAnsiString_entry(pointer_t string) { diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 95b26dfb3..8d0283744 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -957,13 +957,14 @@ static void PrefetchForCAS(const void* value) { } } -uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { +uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) { // XELOGD( // "KfAcquireSpinLock({:08X})", // lock_ptr); PrefetchForCAS(lock); + assert_true(*lock != static_cast(r13)); // Lock. - while (!xe::atomic_cas(0, 1, lock)) { + while (!xe::atomic_cas(0, static_cast(r13), lock)) { // Spin! // TODO(benvanik): error on deadlock? xe::threading::MaybeYield(); @@ -976,34 +977,51 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { return old_irql; } -dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr) { +dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_context) { auto lock = reinterpret_cast(lock_ptr.host_address()); - return xeKeKfAcquireSpinLock(lock); + return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]); } DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking, kHighFrequency); void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) { + // Unlock. + *lock = 0; + if (old_irql >= 2) { + return; + } // Restore IRQL. XThread* thread = XThread::GetCurrentThread(); thread->LowerIrql(old_irql); - - // Unlock. - xe::atomic_dec(lock); } -void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) { +void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql, + ppc_context_t& ppc_ctx) { auto lock = reinterpret_cast(lock_ptr.host_address()); - xeKeKfReleaseSpinLock(lock, old_irql); + + assert_true(*lock_ptr == static_cast(ppc_ctx->r[13])); + + *lock_ptr = 0; + if (old_irql >= 2) { + return; + } + // Restore IRQL. + XThread* thread = XThread::GetCurrentThread(); + thread->LowerIrql(old_irql); } DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, kHighFrequency); // todo: this is not accurate -void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { +void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_ctx) { // Lock. auto lock = reinterpret_cast(lock_ptr.host_address()); + // must not be our own thread + assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); + PrefetchForCAS(lock); - while (!xe::atomic_cas(0, 1, lock)) { + while (!xe::atomic_cas(0, static_cast(ppc_ctx->r[13]), lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great // either... @@ -1017,11 +1035,13 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency); -dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { +dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( + lpdword_t lock_ptr, ppc_context_t& ppc_ctx) { // Lock. auto lock = reinterpret_cast(lock_ptr.host_address()); + assert_true(*lock_ptr != static_cast(ppc_ctx->r[13])); PrefetchForCAS(lock); - if (!xe::atomic_cas(0, 1, lock)) { + if (!xe::atomic_cas(0, static_cast(ppc_ctx->r[13]), lock)) { return 0; } return 1; @@ -1029,10 +1049,12 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency, kSketchy); -void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr) { +void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_ctx) { // Unlock. + assert_true(*lock_ptr == static_cast(ppc_ctx->r[13])); auto lock = reinterpret_cast(lock_ptr.host_address()); - xe::atomic_dec(lock); + *lock_ptr = 0; } DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading, kImplemented, kHighFrequency); @@ -1261,8 +1283,8 @@ void ExInitializeReadWriteLock_entry(pointer_t lock_ptr) { } DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); -void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExAcquireReadWriteLockExclusive_entry(pointer_t lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count) { @@ -1279,8 +1301,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( - pointer_t lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); + pointer_t lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); uint32_t result; if (lock_ptr->lock_count < 0) { @@ -1296,8 +1319,9 @@ dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading, kImplemented); -void ExAcquireReadWriteLockShared_entry(pointer_t lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExAcquireReadWriteLockShared_entry(pointer_t lock_ptr, + ppc_context_t& ppc_context) { + auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count || @@ -1316,8 +1340,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockShared_entry( - pointer_t lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); + pointer_t lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); uint32_t result; if (lock_ptr->lock_count < 0 || @@ -1335,8 +1360,10 @@ dword_result_t ExTryToAcquireReadWriteLockShared_entry( DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading, kImplemented); -void ExReleaseReadWriteLock_entry(pointer_t lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExReleaseReadWriteLock_entry(pointer_t lock_ptr, + ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = --lock_ptr->lock_count; diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 9eef807b2..35af2bc12 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -100,7 +100,7 @@ struct X_KTHREAD { uint8_t unk_58[0x4]; // 0x58 xe::be stack_base; // 0x5C xe::be stack_limit; // 0x60 - uint8_t unk_64[0x4]; // 0x64 + xe::be stack_kernel; // 0x64 xe::be tls_address; // 0x68 uint8_t unk_6C; // 0x6C uint8_t unk_6D[0x7]; // 0x6D diff --git a/third_party/cxxopts b/third_party/cxxopts index 2e3c6991d..b2b8cf2f5 160000 --- a/third_party/cxxopts +++ b/third_party/cxxopts @@ -1 +1 @@ -Subproject commit 2e3c6991d33811878ebcc0839d3815850d129b3a +Subproject commit b2b8cf2f50a449720874f43445e23d75b77dcc43