diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc
index 7024db5a8..74847efed 100644
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
@@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
   // Allocate ffmpeg stuff:
   av_packet_ = av_packet_alloc();
   assert_not_null(av_packet_);
-
+  // chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for
+  // every packet; these allocations were showing up hot in RtlSubsegmentInitialize.
+  av_packet_->buf = av_buffer_alloc(128 * 1024);
   // find the XMA2 audio decoder
   av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
   if (!av_codec_) {
@@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard lock(lock_);
-  if (!is_allocated() || !is_enabled()) {
+
+  if (!is_enabled() || !is_allocated()) {
     return false;
   }
+  {
+    std::lock_guard lock(lock_);
+    set_is_enabled(false);
 
-  set_is_enabled(false);
-
-  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
-  XMA_CONTEXT_DATA data(context_ptr);
-  Decode(&data);
-  data.Store(context_ptr);
-  return true;
+    auto context_ptr = memory()->TranslateVirtual(guest_ptr());
+    XMA_CONTEXT_DATA data(context_ptr);
+    Decode(&data);
+    data.Store(context_ptr);
+    return true;
+  }
 }
 
 void XmaContext::Enable() {
diff --git a/src/xenia/apu/xma_context.h b/src/xenia/apu/xma_context.h
index a348c7836..cc0d0f8d5 100644
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -201,8 +201,8 @@ class XmaContext {
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
   xe_mutex lock_;
-  bool is_allocated_ = false;
-  bool is_enabled_ = false;
+  volatile bool is_allocated_ = false;
+  volatile bool is_enabled_ = false;
   // bool is_dirty_ = true;
 
   // ffmpeg structures
diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc
deleted file mode 100644
index 520cbdbc3..000000000
--- a/src/xenia/base/dma.cc
+++ /dev/null
@@ -1,348 +0,0 @@
-#include "dma.h"
-#include "logging.h"
-#include "mutex.h"
-#include "platform_win.h"
-
-XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
-                NtDelayExecutionPointer);
-XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
-XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
-                NtAlertThreadByThreadId);
-
-template <size_t N, typename... Ts>
-static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
-  char buffer[1024];
-  sprintf_s(buffer, fmt, args...);
-  XELOGI("%s", buffer);
-}
-
-//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
-//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
-#define XEDMALOG(...)
static_cast(0) -using xe::swcache::CacheLine; -static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); -#if defined(__clang__) -XE_FORCEINLINE -static void mvdir64b(void* to, const void* from) { - __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory"); -} -#define _movdir64b mvdir64b -#endif -XE_FORCEINLINE -static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, - CacheLine* XE_RESTRICT from) { - uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; - - CacheLine* dest1 = to; - CacheLine* src1 = from; - - CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; - CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; - - CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); - CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); - - CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); - CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) - for (uint32_t i = 0; i < num_lines_for_8k; ++i) { - xe::swcache::CacheLine line0, line1, line2, line3; - - xe::swcache::ReadLine(&line0, src1 + i); - xe::swcache::ReadLine(&line1, src2 + i); - xe::swcache::ReadLine(&line2, src3 + i); - xe::swcache::ReadLine(&line3, src4 + i); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(dest1 + i, &line0); - xe::swcache::WriteLineNT(dest2 + i, &line1); - - xe::swcache::WriteLineNT(dest3 + i, &line2); - xe::swcache::WriteLineNT(dest4 + i, &line3); - } - XE_MSVC_REORDER_BARRIER(); -} -XE_FORCEINLINE -static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, - CacheLine* XE_RESTRICT from) { - uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; - - CacheLine* dest1 = to; - CacheLine* src1 = from; - - CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; - CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; - - CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); - CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); - - CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); - CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); -#pragma loop(no_vector) - for (uint32_t i = 0; i < num_lines_for_8k; ++i) { -#if 0 - xe::swcache::CacheLine line0, line1, line2, line3; - - xe::swcache::ReadLine(&line0, src1 + i); - xe::swcache::ReadLine(&line1, src2 + i); - xe::swcache::ReadLine(&line2, src3 + i); - xe::swcache::ReadLine(&line3, src4 + i); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(dest1 + i, &line0); - xe::swcache::WriteLineNT(dest2 + i, &line1); - - xe::swcache::WriteLineNT(dest3 + i, &line2); - xe::swcache::WriteLineNT(dest4 + i, &line3); -#else - _movdir64b(dest1 + i, src1 + i); - _movdir64b(dest2 + i, src2 + i); - _movdir64b(dest3 + i, src3 + i); - _movdir64b(dest4 + i, src4 + i); -#endif - } - XE_MSVC_REORDER_BARRIER(); -} - -namespace xe::dma { -using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length); -static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; - - while (written_length >= 16384) { - XeCopy16384StreamingAVX(physaddr, rdmapping); - - physaddr += NUM_LINES_FOR_16K; - rdmapping += NUM_LINES_FOR_16K; - - written_length -= 16384; - } - - if (!written_length) { - return; - } - uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; - - uint32_t i = 0; - - for (; i + 1 < num_written_lines; i += 2) { - xe::swcache::CacheLine line0, line1; - - xe::swcache::ReadLine(&line0, rdmapping + i); - - 
xe::swcache::ReadLine(&line1, rdmapping + i + 1); - XE_MSVC_REORDER_BARRIER(); - xe::swcache::WriteLineNT(physaddr + i, &line0); - xe::swcache::WriteLineNT(physaddr + i + 1, &line1); - } - - if (i < num_written_lines) { - xe::swcache::CacheLine line0; - - xe::swcache::ReadLine(&line0, rdmapping + i); - xe::swcache::WriteLineNT(physaddr + i, &line0); - } -} - -static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; - - while (written_length >= 16384) { - XeCopy16384Movdir64M(physaddr, rdmapping); - - physaddr += NUM_LINES_FOR_16K; - rdmapping += NUM_LINES_FOR_16K; - - written_length -= 16384; - } - - if (!written_length) { - return; - } - uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; - - uint32_t i = 0; - - for (; i + 1 < num_written_lines; i += 2) { - _movdir64b(physaddr + i, rdmapping + i); - _movdir64b(physaddr + i + 1, rdmapping + i + 1); - } - - if (i < num_written_lines) { - _movdir64b(physaddr + i, rdmapping + i); - } -} - -XE_COLD -static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length); - -static VastCpyDispatch vastcpy_dispatch = first_vastcpy; - -XE_COLD -static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { - VastCpyDispatch dispatch_to_use = nullptr; - if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { - XELOGI("Selecting MOVDIR64M vastcpy."); - dispatch_to_use = vastcpy_impl_movdir64m; - } else { - XELOGI("Selecting generic AVX vastcpy."); - dispatch_to_use = vastcpy_impl_avx; - } - - vastcpy_dispatch = - dispatch_to_use; // all future calls will go through our selected path - return vastcpy_dispatch(physaddr, rdmapping, written_length); -} - -XE_NOINLINE -void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, - uint32_t written_length) { - return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping, - written_length); -} - -#define MAX_INFLIGHT_DMAJOBS 65536 -#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1) -class XeDMACGeneric : public XeDMAC { - std::unique_ptr thrd_; - XeDMAJob* jobs_ring_; - volatile std::atomic write_ptr_; - - struct alignas(XE_HOST_CACHE_LINE_SIZE) { - volatile std::atomic read_ptr_; - xe_mutex push_into_ring_lock_; - }; - HANDLE gotjob_event; - void WorkerWait(); - - public: - virtual ~XeDMACGeneric() {} - void WorkerThreadMain(); - XeDMACGeneric() { - threading::Thread::CreationParameters crparams; - crparams.create_suspended = true; - crparams.initial_priority = threading::ThreadPriority::kNormal; - crparams.stack_size = 65536; - gotjob_event = CreateEventA(nullptr, false, false, nullptr); - thrd_ = std::move(threading::Thread::Create( - crparams, [this]() { this->WorkerThreadMain(); })); - - jobs_ring_ = (XeDMAJob*)_aligned_malloc( - MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE); - - write_ptr_ = 0; - read_ptr_ = 0; - - thrd_->Resume(); - } - - virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override { - // std::unique_lock pushlock{push_into_ring_lock_}; - HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr); - { - job->dmac_specific_ = (uintptr_t)dmacevent; - - jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job; - write_ptr_++; - SetEvent(gotjob_event); - } - return (DMACJobHandle)dmacevent; - } - virtual void WaitJobDone(DMACJobHandle handle) override { - 
while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) { - // NtAlertThreadByThreadId.invoke(thrd_->system_id()); - // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) == - // WAIT_TIMEOUT) { - // ; - } - //} - - // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false); - CloseHandle((HANDLE)handle); - } - virtual void WaitForIdle() override { - while (write_ptr_ != read_ptr_) { - threading::MaybeYield(); - } - } -}; -void XeDMACGeneric::WorkerWait() { - constexpr unsigned NUM_PAUSE_SPINS = 2048; - constexpr unsigned NUM_YIELD_SPINS = 8; -#if 0 - - for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) { - if (write_ptr_ == read_ptr_) { - _mm_pause(); - } else { - break; - } - } - for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) { - if (write_ptr_ == read_ptr_) { - threading::MaybeYield(); - } else { - break; - } - } - LARGE_INTEGER yield_execution_delay{}; - yield_execution_delay.QuadPart = - -2000; //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds - while (write_ptr_ == read_ptr_) { - NtDelayExecutionPointer.invoke(0, &yield_execution_delay); - } -#else - do { - if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) { - while (write_ptr_ == read_ptr_) { - _mm_pause(); - } - } - - } while (write_ptr_ == read_ptr_); -#endif -} -void XeDMACGeneric::WorkerThreadMain() { - while (true) { - this->WorkerWait(); - - XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS]; - swcache::ReadFence(); - - if (current_job.precall) { - current_job.precall(¤t_job); - } - - size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE; - size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE; - - size_t line_rem = current_job.size - line_rounded; - - vastcpy(current_job.destination, current_job.source, - static_cast(line_rounded)); - - if (line_rem) { - __movsb(current_job.destination + line_rounded, - current_job.source + line_rounded, line_rem); - } - - if (current_job.postcall) { - current_job.postcall(¤t_job); - } - read_ptr_++; - swcache::WriteFence(); - - SetEvent((HANDLE)current_job.dmac_specific_); - } -} - -XeDMAC* CreateDMAC() { return new XeDMACGeneric(); } -} // namespace xe::dma diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h deleted file mode 100644 index 9406a31cd..000000000 --- a/src/xenia/base/dma.h +++ /dev/null @@ -1,47 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2020 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_BASE_DMA_H_ -#define XENIA_BASE_DMA_H_ -#include "memory.h" -#include "threading.h" -namespace xe::dma { -struct XeDMAJob; -using DmaPrecall = void (*)(XeDMAJob* job); -using DmaPostcall = void (*)(XeDMAJob* job); -struct XeDMAJob { - //threading::Event* signal_on_done; - uintptr_t dmac_specific_; - uint8_t* destination; - uint8_t* source; - size_t size; - DmaPrecall precall; - DmaPostcall postcall; - void* userdata1; - void* userdata2; -}; -using DMACJobHandle = uint64_t; - -class XeDMAC { - public: - virtual ~XeDMAC() {} - virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0; - virtual void WaitJobDone(DMACJobHandle handle) = 0; - virtual void WaitForIdle() = 0; -}; - -XeDMAC* CreateDMAC(); -// must be divisible by cache line size -XE_NOINLINE -void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, - uint32_t written_length); - -} // namespace xe::dma - -#endif // XENIA_BASE_DMA_H_ diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index ad2af543a..7b9063084 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -44,8 +44,18 @@ template constexpr bool is_pow2(T value) { return (value & (value - 1)) == 0; } +/* + Use this in place of the shift + and not sequence that is being used currently in bit iteration code. This is more efficient + because it does not introduce a dependency on to the previous bit scanning operation. The shift and not sequence does get translated to a single instruction (the bit test and reset instruction), + but this code can be executed alongside the scan +*/ +template +constexpr T clear_lowest_bit(T value) { + static_assert(std::is_integral_v); -// Rounds up the given value to the given alignment. + return (value - static_cast(1)) & value; +} + // Rounds up the given value to the given alignment. 
template constexpr T align(T value, T alignment) { return (value + alignment - 1) & ~(alignment - 1); diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 5c1763282..26f34318e 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -10,6 +10,7 @@ #include "xenia/base/memory.h" #include "xenia/base/cvar.h" #include "xenia/base/platform.h" +#include "xenia/base/logging.h" #if XE_ARCH_ARM64 #include @@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() { cvars::writable_executable_memory; } +using xe::swcache::CacheLine; + +static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); + +#if defined(__clang__) +XE_FORCEINLINE +static void mvdir64b(void* to, const void* from) { + __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory"); +} +#define _movdir64b mvdir64b +#endif +XE_FORCEINLINE +static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { + xe::swcache::CacheLine line0, line1, line2, line3; + + xe::swcache::ReadLine(&line0, src1 + i); + xe::swcache::ReadLine(&line1, src2 + i); + xe::swcache::ReadLine(&line2, src3 + i); + xe::swcache::ReadLine(&line3, src4 + i); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(dest1 + i, &line0); + xe::swcache::WriteLineNT(dest2 + i, &line1); + + xe::swcache::WriteLineNT(dest3 + i, &line2); + xe::swcache::WriteLineNT(dest4 + i, &line3); + } + XE_MSVC_REORDER_BARRIER(); +} +XE_FORCEINLINE +static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { + _movdir64b(dest1 + i, src1 + i); + _movdir64b(dest2 + i, src2 + i); + _movdir64b(dest3 + i, src3 + i); + _movdir64b(dest4 + i, src4 + i); + } + XE_MSVC_REORDER_BARRIER(); +} +using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); +static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384StreamingAVX(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + xe::swcache::CacheLine line0, line1; + + 
xe::swcache::ReadLine(&line0, rdmapping + i); + + xe::swcache::ReadLine(&line1, rdmapping + i + 1); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(physaddr + i, &line0); + xe::swcache::WriteLineNT(physaddr + i + 1, &line1); + } + + if (i < num_written_lines) { + xe::swcache::CacheLine line0; + + xe::swcache::ReadLine(&line0, rdmapping + i); + xe::swcache::WriteLineNT(physaddr + i, &line0); + } +} + +static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384Movdir64M(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + _movdir64b(physaddr + i, rdmapping + i); + _movdir64b(physaddr + i + 1, rdmapping + i + 1); + } + + if (i < num_written_lines) { + _movdir64b(physaddr + i, rdmapping + i); + } +} + +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); + +static VastCpyDispatch vastcpy_dispatch = first_vastcpy; + +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + VastCpyDispatch dispatch_to_use = nullptr; + if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { + XELOGI("Selecting MOVDIR64M vastcpy."); + dispatch_to_use = vastcpy_impl_movdir64m; + } else { + XELOGI("Selecting generic AVX vastcpy."); + dispatch_to_use = vastcpy_impl_avx; + } + + vastcpy_dispatch = + dispatch_to_use; // all future calls will go through our selected path + return vastcpy_dispatch(physaddr, rdmapping, written_length); +} + +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length) { + return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping, + written_length); +} } // namespace memory // TODO(benvanik): fancy AVX versions. diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index b20b33b97..bd7081418 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -17,9 +17,9 @@ #include #include -#include "xenia/base/assert.h" + #include "xenia/base/byte_order.h" -#include "xenia/base/platform.h" + namespace xe { namespace memory { @@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) { seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2); return hash_combine(seed, vs...); } +// must be divisible by cache line size +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length); } // namespace memory diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h index 92c3a9405..b7fc09896 100644 --- a/src/xenia/base/mutex.h +++ b/src/xenia/base/mutex.h @@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock; // }; class global_critical_region { public: + constexpr global_critical_region() {} static global_mutex_type& mutex(); // Acquires a lock on the global critical section. @@ -144,18 +145,18 @@ class global_critical_region { } // Acquires a lock on the global critical section. 
- inline global_unique_lock_type Acquire() { + static inline global_unique_lock_type Acquire() { return global_unique_lock_type(mutex()); } // Acquires a deferred lock on the global critical section. - inline global_unique_lock_type AcquireDeferred() { + static inline global_unique_lock_type AcquireDeferred() { return global_unique_lock_type(mutex(), std::defer_lock); } // Tries to acquire a lock on the glboal critical section. // Check owns_lock() to see if the lock was successfully acquired. - inline global_unique_lock_type TryAcquire() { + static inline global_unique_lock_type TryAcquire() { return global_unique_lock_type(mutex(), std::try_to_lock); } }; diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index 84d5893b5..e481f4f27 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap() { xenia_assert(this->capacity_ >= 4); ring_size_t next_read_offset = read_offset + 4; -#if 0 - size_t zerotest = next_read_offset - this->capacity_; - // unpredictable branch, use bit arith instead - // todo: it would be faster to use lzcnt, but we need to figure out if all - // machines we support support it - next_read_offset &= -static_cast(!!zerotest); -#else + if (XE_UNLIKELY(next_read_offset == this->capacity_)) { next_read_offset = 0; // todo: maybe prefetch next? or should that happen much earlier? } -#endif this->read_offset_ = next_read_offset; - unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset]; + uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset]; return xe::byte_swap(ring_value); } } // namespace xe diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index 604819950..1be26b373 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -271,7 +271,10 @@ inline std::pair WaitAny( return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable, timeout); } - +struct EventInfo { + uint32_t type; + uint32_t state; +}; // Models a Win32-like event object. // https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx class Event : public WaitHandle { @@ -299,6 +302,8 @@ class Event : public WaitHandle { // the nonsignaled state after releasing the appropriate number of waiting // threads. virtual void Pulse() = 0; + + virtual EventInfo Query() = 0; #if XE_PLATFORM_WIN32 ==1 //SetEvent, but if there is a waiter we immediately transfer execution to it virtual void SetBoostPriority() = 0; diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 01a4eb9be..ed7874458 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore, XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution, NtDelayExecutionPointer); - +XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer); namespace xe { namespace threading { @@ -255,20 +255,20 @@ std::pair WaitMultiple(WaitHandle* wait_handles[], size_t wait_handle_count, bool wait_all, bool is_alertable, std::chrono::milliseconds timeout) { - std::vector handles( - wait_handle_count); // max handles is like 64, so it would make more - // sense to just do a fixed size array here + xenia_assert(wait_handle_count <= 64); + HANDLE handles[64]; + for (size_t i = 0; i < wait_handle_count; ++i) { handles[i] = wait_handles[i]->native_handle(); } DWORD result = WaitForMultipleObjectsEx( - DWORD(handles.size()), handles.data(), wait_all ? 
TRUE : FALSE, + static_cast(wait_handle_count), handles, wait_all ? TRUE : FALSE, DWORD(timeout.count()), is_alertable ? TRUE : FALSE); - if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) { + if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) { return std::pair(WaitResult::kSuccess, result - WAIT_OBJECT_0); } else if (result >= WAIT_ABANDONED_0 && - result < WAIT_ABANDONED_0 + handles.size()) { + result < WAIT_ABANDONED_0 + wait_handle_count) { return std::pair(WaitResult::kAbandoned, result - WAIT_ABANDONED_0); } @@ -293,7 +293,15 @@ class Win32Event : public Win32Handle { void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); } void SetBoostPriority() override { // no previous state for boostpriority - NtSetEventBoostPriorityPointer.invoke(handle_); + // Boost priority is unimplemented under wine probably because it's not used + // anywhere in user mode except by us. Maybe some Windows internals uses it + // see: + // https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228 + if (NtSetEventBoostPriorityPointer) { + NtSetEventBoostPriorityPointer.invoke(handle_); + } else { + NtSetEventPointer.invoke(handle_, nullptr); + } } #else void Set() override { SetEvent(handle_); } @@ -305,6 +313,11 @@ class Win32Event : public Win32Handle { SetEvent(handle_); } #endif + + EventInfo Query() { EventInfo result{}; + NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr); + return result; + } }; std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 57b038613..1b6c40f44 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -39,7 +39,7 @@ #include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/processor.h" -XE_MSVC_OPTIMIZE_SMALL() + DEFINE_bool(use_fast_dot_product, false, "Experimental optimization, much shorter sequence on dot products, " "treating inf as overflow instead of using mcxsr" diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 5e0ea6c1a..d759e97b3 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -9,23 +9,16 @@ #include "xenia/gpu/command_processor.h" -#include #include -#include -#include #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/byte_stream.h" -#include "xenia/base/cvar.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/base/ring_buffer.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/graphics_system.h" #include "xenia/gpu/sampler_info.h" #include "xenia/gpu/texture_info.h" -#include "xenia/gpu/xenos.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" @@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system, write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)), write_ptr_index_(0) { assert_not_null(write_ptr_index_event_); -#if 0 - dmac_ = dma::CreateDMAC(); -#else - dmac_ = nullptr; -#endif } CommandProcessor::~CommandProcessor() = default; @@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); } void CommandProcessor::ReturnFromWait() {} -uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, - uint32_t write_index) { - SCOPE_profile_cpu_f("gpu"); 
-#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 - // If we have a pending trace stream open it now. That way we ensure we get - // all commands. - if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) { - uint32_t title_id = kernel_state_->GetExecutableModule() - ? kernel_state_->GetExecutableModule()->title_id() - : 0; - auto file_name = fmt::format("{:08X}_stream.xtr", title_id); - auto path = trace_stream_path_ / file_name; - trace_writer_.Open(path, title_id); - InitializeTrace(); - } -#endif - // Adjust pointer base. - uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t); - start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF); - uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t); - end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF); - - trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index); - - // Execute commands! - - RingBuffer old_reader = reader_; - new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_), - primary_buffer_size_); - - reader_.set_read_offset(read_index * sizeof(uint32_t)); - reader_.set_write_offset(write_index * sizeof(uint32_t)); - // prefetch the wraparound range - // it likely is already in L3 cache, but in a zen system it may be another - // chiplets l3 - reader_.BeginPrefetchedRead( - GetCurrentRingReadCount()); - do { - if (!ExecutePacket()) { - // This probably should be fatal - but we're going to continue anyways. - XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet."); - assert_always(); - break; - } - } while (reader_.read_count()); - - OnPrimaryBufferEnd(); - - trace_writer_.WritePrimaryBufferEnd(); - - reader_ = old_reader; - return write_index; -} - -void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) { - // Execute commands! 
- RingBuffer old_reader = reader_; - - new (&reader_) - RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)}; - - reader_.set_write_offset(count * sizeof(uint32_t)); - - do { - if (!ExecutePacket()) { - XELOGE("**** ExecutePacket: Failed to execute packet."); - assert_always(); - break; - } - } while (reader_.read_count()); - reader_ = old_reader; -} void CommandProcessor::InitializeTrace() { // Write the initial register values, to be loaded directly into the diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index 4e96391b8..0553a4184 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -19,11 +19,8 @@ #include #include -#include "xenia/base/dma.h" #include "xenia/base/ring_buffer.h" -#include "xenia/base/threading.h" #include "xenia/gpu/register_file.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/xthread.h" @@ -82,7 +79,6 @@ class CommandProcessor { CommandProcessor(GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); virtual ~CommandProcessor(); - dma::XeDMAC* GetDMAC() const { return dmac_; } uint32_t counter() const { return counter_; } void increment_counter() { counter_++; } @@ -135,7 +131,7 @@ class CommandProcessor { void UpdateWritePointer(uint32_t value); - void ExecutePacket(uint32_t ptr, uint32_t count); + bool is_paused() const { return paused_; } void Pause(); @@ -172,7 +168,7 @@ class CommandProcessor { uint32_t num_registers); XE_NOINLINE - virtual void WriteOneRegisterFromRing( + void WriteOneRegisterFromRing( uint32_t base, uint32_t num_times); // repeatedly write a value to one register, presumably a @@ -221,7 +217,7 @@ class CommandProcessor { virtual void PrepareForWait(); virtual void ReturnFromWait(); - uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); + virtual void OnPrimaryBufferEnd() {} #include "pm4_command_processor_declare.h" @@ -300,7 +296,6 @@ class CommandProcessor { reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {}; reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {}; uint32_t gamma_ramp_rw_component_ = 0; - dma::XeDMAC* dmac_ = nullptr; }; } // namespace gpu diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index a24d468ae..584917e45 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // validity is tracked. 
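   // (Assumed rationale for the change below: all valid vertex fetch
   // constants are first gathered into fixed-size arrays (96 slots, scanned
   // one 32-bit bitmap word at a time), and the shared-memory RequestRange
   // calls are then issued as a second pass, so the global critical region
   // can be acquired once per draw rather than once per fetch constant.)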
   const Shader::ConstantRegisterMap& constant_map_vertex =
       vertex_shader->constant_register_map();
-  for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
-       ++i) {
-    uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
-    uint32_t j;
-    while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
-      vfetch_bits_remaining &= ~(uint32_t(1) << j);
-      uint32_t vfetch_index = i * 32 + j;
-      const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
-          XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
-      switch (vfetch_constant.type) {
-        case xenos::FetchConstantType::kVertex:
-          break;
-        case xenos::FetchConstantType::kInvalidVertex:
-          if (cvars::gpu_allow_invalid_fetch_constants) {
+  {
+    uint32_t vfetch_addresses[96];
+    uint32_t vfetch_sizes[96];
+    uint32_t vfetch_current_queued = 0;
+    for (uint32_t i = 0;
+         i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
+      uint32_t vfetch_bits_remaining =
+          constant_map_vertex.vertex_fetch_bitmap[i];
+      uint32_t j;
+      while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
+        vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
+        uint32_t vfetch_index = i * 32 + j;
+        const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
+            XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
+        switch (vfetch_constant.type) {
+          case xenos::FetchConstantType::kVertex:
             break;
-          }
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
-              "This is incorrect behavior, but you can try bypassing this by "
-              "launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-        default:
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
+          case xenos::FetchConstantType::kInvalidVertex:
+            if (cvars::gpu_allow_invalid_fetch_constants) {
+              break;
+            }
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
+                "type! "
+                "This is incorrect behavior, but you can try bypassing this by "
+                "launching Xenia with "
+                "--gpu_allow_invalid_fetch_constants=true.",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+          default:
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) is completely "
+                "invalid!",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+        }
+        vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
+        vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
       }
-      if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
-                                        vfetch_constant.size << 2)) {
-        XELOGE(
-            "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
-            "shared memory",
-            vfetch_constant.address << 2, vfetch_constant.size << 2);
-        return false;
+    }
+
+    if (vfetch_current_queued) {
+      // So far I have never seen vfetch_current_queued > 4; 1 is most common,
+      // 2 happens occasionally, though not many games were tested.
+      // Pre-acquire the critical region so we're not repeatedly re-acquiring
+      // it in RequestRange.
+      auto shared_memory_request_range_hoisted =
+          global_critical_region::Acquire();
+
+      for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
+        if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
+                                          vfetch_sizes[i] << 2)) {
+          XELOGE(
+              "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
+              "shared memory",
+              vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
+          return false;
+        }
+      }
     }
   }
-
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
@@ -3076,24 +3097,6 @@ void D3D12CommandProcessor::InitializeTrace() {
     shared_memory_->InitializeTraceCompleteDownloads();
   }
 }
-static void DmaPrefunc(dma::XeDMAJob* job) {
-  D3D12_RANGE readback_range;
-  readback_range.Begin = 0;
-  readback_range.End = job->size;
-  void* readback_mapping;
-  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
-
-  HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
-  xenia_assert(SUCCEEDED(mapres));
-
-  job->source = (uint8_t*)readback_mapping;
-}
-
-static void DmaPostfunc(dma::XeDMAJob* job) {
-  D3D12_RANGE readback_write_range = {};
-  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
-  readback_buffer->Unmap(0, &readback_write_range);
-}
 
 bool D3D12CommandProcessor::IssueCopy() {
 #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@@ -3102,7 +3105,6 @@
   if (!BeginSubmission(true)) {
     return false;
   }
-
   if (!cvars::d3d12_readback_resolve) {
     uint32_t written_address, written_length;
     return render_target_cache_->Resolve(*memory_, *shared_memory_,
@@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
       readback_buffer, 0, shared_memory_buffer, written_address,
       written_length);
   if (AwaitAllQueueOperationsCompletion()) {
-#if 1
-    D3D12_RANGE readback_range;
-    readback_range.Begin = 0;
-    readback_range.End = written_length;
-    void* readback_mapping;
-    if (SUCCEEDED(
-            readback_buffer->Map(0, &readback_range, &readback_mapping))) {
-      // chrispy: this memcpy needs to be optimized as much as possible
+    D3D12_RANGE readback_range;
+    readback_range.Begin = 0;
+    readback_range.End = written_length;
+    void* readback_mapping;
+    if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                       &readback_mapping))) {
+      // chrispy: this memcpy needs to be optimized as much as possible
 
-      auto physaddr = memory_->TranslatePhysical(written_address);
-      dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
-      // XEDmaCpy(physaddr, readback_mapping, written_length);
-      D3D12_RANGE readback_write_range = {};
-      readback_buffer->Unmap(0, &readback_write_range);
-    }
-
-#else
-    dma::XeDMAJob job{};
-    job.destination = memory_->TranslatePhysical(written_address);
-    job.size = written_length;
-    job.source = nullptr;
-    job.userdata1 = (void*)readback_buffer;
-    job.precall = DmaPrefunc;
-    job.postcall = DmaPostfunc;
-
-    readback_available_ = GetDMAC()->PushDMAJob(&job);
-
-#endif
+      auto physaddr = memory_->TranslatePhysical(written_address);
+      memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
+                      written_length);
+      // XEDmaCpy(physaddr, readback_mapping, written_length);
+      D3D12_RANGE readback_write_range = {};
+      readback_buffer->Unmap(0, &readback_write_range);
+    }
     }
   }
 }
@@ -3833,7 +3822,8 @@ XE_NOINLINE void
D3D12CommandProcessor::UpdateSystemConstantValues_Impl( uint32_t user_clip_plane_index; while (xe::bit_scan_forward(user_clip_planes_remaining, &user_clip_plane_index)) { - user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index); + user_clip_planes_remaining = + xe::clear_lowest_bit(user_clip_planes_remaining); const float* user_clip_plane = ®s[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32; if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane, @@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl( uint32_t textures_remaining = used_texture_mask; uint32_t texture_index; while (xe::bit_scan_forward(textures_remaining, &texture_index)) { - textures_remaining &= ~(uint32_t(1) << texture_index); + textures_remaining = xe::clear_lowest_bit(textures_remaining); uint32_t& texture_signs_uint = system_constants_.texture_swizzled_signs[texture_index >> 2]; uint32_t texture_signs_shift = (texture_index & 3) * 8; @@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } -#if 1 - if (readback_available_) { - GetDMAC()->WaitJobDone(readback_available_); - readback_available_ = 0; - } -#endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index ba2c17a82..9412116ac 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -21,7 +21,6 @@ #include #include "xenia/base/assert.h" -#include "xenia/base/dma.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" @@ -50,8 +49,10 @@ struct MemExportRange { uint32_t size_dwords; }; class D3D12CommandProcessor final : public CommandProcessor { - public: + protected: #include "../pm4_command_processor_declare.h" + + public: explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); ~D3D12CommandProcessor(); @@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor { uint32_t base, uint32_t num_registers); XE_NOINLINE - virtual void WriteOneRegisterFromRing(uint32_t base, - uint32_t num_times) override; + void WriteOneRegisterFromRing(uint32_t base, + uint32_t num_times); XE_FORCEINLINE void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base, @@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor { static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; ID3D12Resource* readback_buffer_ = nullptr; - dma::DMACJobHandle readback_available_ = 0; uint32_t readback_buffer_size_ = 0; std::atomic pix_capture_requested_ = false; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc index 1e6f61685..91bb139e3 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc @@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange( bool D3D12SharedMemory::UploadRanges( const std::pair* upload_page_ranges, - unsigned num_upload_page_ranges) { + uint32_t num_upload_page_ranges) { if (!num_upload_page_ranges) { return true; } CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); command_processor_.SubmitBarriers(); auto& command_list = 
command_processor_.GetDeferredCommandList(); - // for (auto upload_range : upload_page_ranges) { - for (unsigned int i = 0; i < num_upload_page_ranges; ++i) { + for (uint32_t i = 0; i < num_upload_page_ranges; ++i) { auto& upload_range = upload_page_ranges[i]; uint32_t upload_range_start = upload_range.first; uint32_t upload_range_length = upload_range.second; @@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges( uint32_t(upload_buffer_size), false, false); if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) { - dma::vastcpy( + memory::vastcpy( upload_buffer_mapping, memory().TranslatePhysical(upload_range_start << page_size_log2()), static_cast(upload_buffer_size)); diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index 76200ef7f..73cff2e7b 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory { bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations, uint32_t length_allocations) override; - bool UploadRanges(const std::pair* - upload_page_ranges, unsigned num_ranges) override; + bool UploadRanges(const std::pair* upload_page_ranges, + uint32_t num_ranges) override; private: D3D12CommandProcessor& command_processor_; diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 0cfaaaa41..eb0ada4e0 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) { uint32_t textures_remaining = used_texture_mask; uint32_t index; while (xe::bit_scan_forward(textures_remaining, &index)) { - textures_remaining &= ~(uint32_t(1) << index); + textures_remaining = xe::clear_lowest_bit(textures_remaining); const TextureBinding* binding = GetValidTextureBinding(index); if (!binding) { continue; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 5c62c50c3..af977d4d5 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -9,21 +9,14 @@ #include "xenia/gpu/draw_util.h" -#include -#include #include -#include "xenia/base/assert.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/gpu/gpu_flags.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/texture_cache.h" -#include "xenia/gpu/texture_info.h" -#include "xenia/gpu/texture_util.h" -#include "xenia/gpu/xenos.h" #include "xenia/ui/graphics_util.h" // Very prominent in 545407F2. 
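memory::vastcpy, called by UploadRanges above, resolves its implementation once, on first call, through a function pointer. A minimal sketch of that one-time dispatch pattern (names here are hypothetical, and CpuHasMovdir64b() stands in for the amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M check in this diff):

using CopyFn = void (*)(uint8_t* dst, const uint8_t* src, uint32_t len);

static void copy_avx(uint8_t* dst, const uint8_t* src, uint32_t len);
static void copy_movdir64b(uint8_t* dst, const uint8_t* src, uint32_t len);
static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len);

// Starts as the selector; after one call it points at the chosen
// implementation, so steady-state calls pay no feature-check branch.
static CopyFn copy_dispatch = first_copy;

static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len) {
  // Benign if raced: concurrent first calls all store the same value.
  copy_dispatch = CpuHasMovdir64b() ? copy_movdir64b : copy_avx;
  copy_dispatch(dst, src, len);
}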
diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h
index 15c014520..ececbaac9 100644
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@@ -16,7 +16,6 @@
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
-#include "xenia/gpu/registers.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h
index 2da2fb72c..8db802d13 100644
--- a/src/xenia/gpu/pm4_command_processor_declare.h
+++ b/src/xenia/gpu/pm4_command_processor_declare.h
@@ -1,9 +1,14 @@
 void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
-
+virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index,
+                                      uint32_t end_index) XE_RESTRICT;
 virtual bool ExecutePacket();
-XE_NOINLINE
+
+public:
+void ExecutePacket(uint32_t ptr, uint32_t count);
+
+protected:
+XE_NOINLINE
 bool ExecutePacketType0(
     uint32_t packet) XE_RESTRICT;
 XE_NOINLINE
 bool ExecutePacketType1(
     uint32_t packet) XE_RESTRICT;
@@ -103,4 +108,7 @@
 uint32_t GetCurrentRingReadCount();
 XE_NOINLINE
 XE_COLD
-bool ExecutePacketType3_CountOverflow(uint32_t count);
\ No newline at end of file
+bool ExecutePacketType3_CountOverflow(uint32_t count);
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType0_CountOverflow(uint32_t count);
\ No newline at end of file
diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h
index 48795abba..bb6b94051 100644
--- a/src/xenia/gpu/pm4_command_processor_implement.h
+++ b/src/xenia/gpu/pm4_command_processor_implement.h
@@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
   }
 }
 XE_NOINLINE
+XE_COLD
+bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
+  XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
+         COMMAND_PROCESSOR::GetCurrentRingReadCount(),
+         count * sizeof(uint32_t));
+  return false;
+}
+/*
+  Todo: optimize this function; it and ExecutePacketType3 are the most
+  frequently called functions for PM4.
+*/
+XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
   // Type-0 packet.
   // Write count registers in sequence to the registers starting at
   // (base_index << 2).
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
-  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
-    XELOGE(
-        "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
-        COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
-    return false;
-  }
-
-  trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
-  uint32_t base_index = (packet & 0x7FFF);
-  uint32_t write_one_reg = (packet >> 15) & 0x1;
+  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
+      count * sizeof(uint32_t)) {
+    trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
 
-  if (!write_one_reg) {
-    COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
+    uint32_t base_index = (packet & 0x7FFF);
+    uint32_t write_one_reg = (packet >> 15) & 0x1;
+    if (!write_one_reg) {
+      COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
+                                                    count);
+
+    } else {
+      COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    }
+
+    trace_writer_.WritePacketEnd();
+    return true;
   } else {
-    COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
   }
-
-  trace_writer_.WritePacketEnd();
-  return true;
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
@@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
 XE_NOINLINE
 static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
+  /*
+    Todo: subtract the values twice with the sides inverted, build a mask from
+    the sign bits, and use wait_info to select the bits implementing the
+    condition; equal if neither sign bit is set. (Sketch at end of this diff.)
+  */
   bool matched = false;
   switch (wait_info & 0x7) {
     case 0x0:  // Never.
@@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
   return true;
 }
 
+/*
+  Todo: shouldn't this do something?
+*/
+
 bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
     uint32_t packet, uint32_t count) XE_RESTRICT {
   // selective invalidation of state pointers
@@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
   return true;
 }
+
+uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
+                                                 uint32_t write_index) {
+  SCOPE_profile_cpu_f("gpu");
+#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
+  // If we have a pending trace stream open it now. That way we ensure we get
+  // all commands.
+  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
+    uint32_t title_id = kernel_state_->GetExecutableModule()
+                            ? kernel_state_->GetExecutableModule()->title_id()
+                            : 0;
+    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
+    auto path = trace_stream_path_ / file_name;
+    trace_writer_.Open(path, title_id);
+    InitializeTrace();
+  }
+#endif
+  // Adjust pointer base.
+  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
+  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
+  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
+  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
+
+  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
+
+  // Execute commands!
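+  // (Assumed rationale, describing the lines below: reader_ is saved and
+  // re-seated via placement new over the primary buffer's physical mapping;
+  // nested ExecutePacket calls consume from this ring view until it drains,
+  // and the previous reader is restored at the end.)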
+ + RingBuffer old_reader = reader_; + new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_), + primary_buffer_size_); + + reader_.set_read_offset(read_index * sizeof(uint32_t)); + reader_.set_write_offset(write_index * sizeof(uint32_t)); + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader_.BeginPrefetchedRead( + GetCurrentRingReadCount()); + do { + if (!COMMAND_PROCESSOR::ExecutePacket()) { + // This probably should be fatal - but we're going to continue anyways. + XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet."); + assert_always(); + break; + } + } while (reader_.read_count()); + + COMMAND_PROCESSOR::OnPrimaryBufferEnd(); + + trace_writer_.WritePrimaryBufferEnd(); + + reader_ = old_reader; + return write_index; +} + +void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) { + // Execute commands! + RingBuffer old_reader = reader_; + + new (&reader_) + RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)}; + + reader_.set_write_offset(count * sizeof(uint32_t)); + + do { + if (!COMMAND_PROCESSOR::ExecutePacket()) { + XELOGE("**** ExecutePacket: Failed to execute packet."); + assert_always(); + break; + } + } while (reader_.read_count()); + reader_ = old_reader; +} diff --git a/src/xenia/gpu/sampler_info.h b/src/xenia/gpu/sampler_info.h index c36cae07d..5a169aed8 100644 --- a/src/xenia/gpu/sampler_info.h +++ b/src/xenia/gpu/sampler_info.h @@ -11,7 +11,6 @@ #define XENIA_GPU_SAMPLER_INFO_H_ #include "xenia/gpu/shader.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index f07c0ee97..e9c21d801 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -24,7 +24,6 @@ #include "xenia/base/string_buffer.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc index 9e1084397..6eda12f42 100644 --- a/src/xenia/gpu/shader_interpreter.cc +++ b/src/xenia/gpu/shader_interpreter.cc @@ -13,13 +13,6 @@ #include #include -#include "xenia/base/assert.h" -#include "xenia/base/byte_order.h" -#include "xenia/base/math.h" -#include "xenia/gpu/registers.h" -#include "xenia/gpu/trace_writer.h" -#include "xenia/gpu/xenos.h" - namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shader_interpreter.h b/src/xenia/gpu/shader_interpreter.h index dca530221..759f606eb 100644 --- a/src/xenia/gpu/shader_interpreter.h +++ b/src/xenia/gpu/shader_interpreter.h @@ -14,12 +14,9 @@ #include #include -#include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/trace_writer.h" -#include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" #include "xenia/memory.h" namespace xe { diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 4530f57d4..88c7f95f4 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -9,15 +9,9 @@ #include "xenia/gpu/shader_translator.h" -#include #include -#include -#include -#include -#include "xenia/base/assert.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" #include "xenia/gpu/gpu_flags.h" namespace xe { diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index 593907952..0e764fe30 100644 --- a/src/xenia/gpu/shader_translator.h +++ 
b/src/xenia/gpu/shader_translator.h @@ -11,16 +11,7 @@ #define XENIA_GPU_SHADER_TRANSLATOR_H_ #include -#include -#include -#include - -#include "xenia/base/math.h" -#include "xenia/base/string_buffer.h" -#include "xenia/gpu/registers.h" #include "xenia/gpu/shader.h" -#include "xenia/gpu/ucode.h" -#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index 38a8c54e9..c15da8a9b 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -10,15 +10,11 @@ #include "xenia/gpu/shared_memory.h" #include -#include #include "xenia/base/assert.h" #include "xenia/base/bit_range.h" #include "xenia/base/logging.h" -#include "xenia/base/math.h" -#include "xenia/base/memory.h" #include "xenia/base/profiling.h" -#include "xenia/memory.h" namespace xe { namespace gpu { diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 7f8d3a892..e721fe8af 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -10,12 +10,6 @@ #ifndef XENIA_GPU_SHARED_MEMORY_H_ #define XENIA_GPU_SHARED_MEMORY_H_ -#include -#include -#include -#include - -#include "xenia/base/mutex.h" #include "xenia/memory.h" namespace xe { @@ -141,7 +135,7 @@ class SharedMemory { // overall bounds of pages to be uploaded. virtual bool UploadRanges( const std::pair* upload_page_ranges, - unsigned num_upload_ranges) = 0; + uint32_t num_upload_ranges) = 0; const std::vector>& trace_download_ranges() { return trace_download_ranges_; @@ -183,14 +177,10 @@ class SharedMemory { FixedVMemVector)> upload_ranges_; - // GPU-written memory downloading for traces. . - std::vector> trace_download_ranges_; - uint32_t trace_download_page_count_ = 0; - // Mutex between the guest memory subsystem and the command processor, to be // locked when checking or updating validity of pages/ranges and when firing // watches. - xe::global_critical_region global_critical_region_; + static constexpr xe::global_critical_region global_critical_region_{}; // *************************************************************************** // Things below should be fully protected by global_critical_region. @@ -266,6 +256,11 @@ class SharedMemory { uint32_t watch_node_current_pool_allocated_ = 0; WatchRange* watch_range_first_free_ = nullptr; WatchNode* watch_node_first_free_ = nullptr; + + // GPU-written memory downloading for traces. . + std::vector> trace_download_ranges_; + uint32_t trace_download_page_count_ = 0; + // Triggers the watches (global and per-range), removing triggered range // watches. 
diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc
index dfc3264ca..14af42d0d 100644
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -9,21 +9,11 @@
 
 #include "xenia/gpu/texture_cache.h"
 
-#include
-#include
-#include
-
-#include "xenia/base/assert.h"
 #include "xenia/base/clock.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
-#include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
 #include "xenia/gpu/gpu_flags.h"
-#include "xenia/gpu/register_file.h"
-#include "xenia/gpu/texture_info.h"
-#include "xenia/gpu/texture_util.h"
-#include "xenia/gpu/xenos.h"
 
 DEFINE_int32(
     draw_resolution_scale_x, 1,
@@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
   uint32_t bindings_changed = 0;
   uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
   uint32_t index = 0;
+
+  Texture* textures_to_load[64];  // at most 32 fetch constants, each of which
+                                  // may bind an unsigned and a signed texture,
+                                  // so at most 64 entries
+  uint32_t num_textures_to_load = 0;
   while (xe::bit_scan_forward(textures_remaining, &index)) {
     uint32_t index_bit = UINT32_C(1) << index;
-    textures_remaining &= ~index_bit;
+    textures_remaining = xe::clear_lowest_bit(textures_remaining);
     TextureBinding& binding = texture_bindings_[index];
     const auto& fetch = regs.Get(
         XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);
@@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
       binding.texture_signed = nullptr;
     }
     if (load_unsigned_data && binding.texture != nullptr) {
-      LoadTextureData(*binding.texture);
+      textures_to_load[num_textures_to_load++] = binding.texture;
     }
     if (load_signed_data && binding.texture_signed != nullptr) {
-      LoadTextureData(*binding.texture_signed);
+      textures_to_load[num_textures_to_load++] = binding.texture_signed;
     }
   }
+
+  LoadTexturesData(textures_to_load, num_textures_to_load);
+
   if (bindings_changed) {
     UpdateTextureBindingsImpl(bindings_changed);
   }
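The loop above replaces textures_remaining &= ~index_bit with xe::clear_lowest_bit(textures_remaining); both clear the bit that bit_scan_forward just found, but the latter maps to a single BLSR-style operation. A stand-alone sketch of the idiom, with stand-ins for the xe:: helpers (assumed behavior, not the emulator's implementations):

    #include <cstdint>

    // Portable stand-in for xe::bit_scan_forward: index of the lowest set bit.
    static bool bit_scan_forward(uint32_t v, uint32_t* out_index) {
      if (!v) return false;
      uint32_t i = 0;
      while (!(v & (UINT32_C(1) << i))) ++i;
      *out_index = i;
      return true;
    }

    // Stand-in for xe::clear_lowest_bit: v & (v - 1) clears exactly one bit.
    static uint32_t clear_lowest_bit(uint32_t v) { return v & (v - 1); }

    void VisitSetBits(uint32_t mask) {
      uint32_t index;
      while (bit_scan_forward(mask, &index)) {
        // ... handle binding `index` ...
        mask = clear_lowest_bit(mask);  // strictly shrinks mask, so this ends
      }
    }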
@@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
   texture->LogAction("Created");
   return texture;
 }
+void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
+  assert_true(n_textures <= 64);
+  if (n_textures < 2) {
+    if (!n_textures) {
+      return;
+    } else {
+      LoadTextureData(*textures[0]);
+      return;
+    }
+  }
+  uint64_t index_base_outdated = 0;
+  uint64_t index_mips_outdated = 0;
+  uint32_t nkept = 0;
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    for (uint32_t i = 0; i < n_textures; ++i) {
+      Texture* current = textures[i];
+
+      auto base_outdated = current->base_outdated(global_lock);
+      auto mips_outdated = current->mips_outdated(global_lock);
+
+      index_base_outdated |= static_cast(base_outdated) << i;
+      index_mips_outdated |= static_cast(mips_outdated) << i;
+      if (!base_outdated && !mips_outdated) {
+        textures[i] = nullptr;
+      } else {
+        nkept++;
+      }
+    }
+  }
+
+  if (nkept == 0) {
+    return;
+  }
+
+  for (uint32_t i = 0; i < n_textures; ++i) {
+    Texture* p_texture = textures[i];
+    if (!p_texture) {
+      continue;
+    }
+    textures[i] = nullptr;
+    Texture& texture = *p_texture;
+
+    TextureKey texture_key = texture.key();
+    // The implementation may load multiple blocks at once via accesses of up
+    // to 128 bits (R32G32B32A32_UINT), so the size is aligned to that value
+    // to make sure that if the texture is small (especially if it's linear),
+    // the last blocks won't be cut off (hosts may return 0, 0, 0, 0 for the
+    // whole R32G32B32A32_UINT access for the non-16-aligned tail even if
+    // 1...15 bytes are actually provided for it).
+
+    // Request uploading of the texture data to the shared memory.
+    // This is also necessary when resolution scaling is used - the texture
+    // cache relies on shared memory for invalidation of both unscaled and
+    // scaled textures. Plus a texture may be unscaled partially, when only a
+    // portion of its pages is invalidated, in this case we'll need the
+    // texture from the shared memory to load the unscaled parts.
+    // TODO(Triang3l): Load unscaled parts.
+    bool base_resolved = texture.GetBaseResolved();
+    if (index_base_outdated & (1ULL << i)) {
+      if (!shared_memory().RequestRange(
+              texture_key.base_page << 12,
+              xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
+              texture_key.scaled_resolve ? nullptr : &base_resolved)) {
+        continue;
+      }
+    }
+    bool mips_resolved = texture.GetMipsResolved();
+    if (index_mips_outdated & (1ULL << i)) {
+      if (!shared_memory().RequestRange(
+              texture_key.mip_page << 12,
+              xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
+              texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
+        continue;
+      }
+    }
+    if (texture_key.scaled_resolve) {
+      // Make sure all the scaled resolve memory is resident and accessible
+      // from the shader, including any possible padding that hasn't yet been
+      // touched by an actual resolve, but is still included in the texture
+      // size, so the GPU won't be trying to access unmapped memory.
+      if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
+                                              texture.GetGuestBaseSize(), 4)) {
+        continue;
+      }
+      if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
+                                              texture.GetGuestMipsSize(), 4)) {
+        continue;
+      }
+    }
+
+    // Actually load the texture data.
+    if (!LoadTextureDataFromResidentMemoryImpl(
+            texture, (index_base_outdated & (1ULL << i)) != 0,
+            (index_mips_outdated & (1ULL << i)) != 0)) {
+      continue;
+    }
+
+    // Update the source of the texture (resolve vs. CPU or memexport) for
+    // purposes of handling piecewise gamma emulation via sRGB and for
+    // resolution scale in sampling offsets.
+    if (!texture_key.scaled_resolve) {
+      texture.SetBaseResolved(base_resolved);
+      texture.SetMipsResolved(mips_resolved);
+    }
+    // requeue for MakeUpToDateAndWatch below
+    textures[i] = &texture;
+  }
+  {
+    auto crit = global_critical_region_.Acquire();
+
+    for (uint32_t i = 0; i < n_textures; ++i) {
+      auto texture = textures[i];
+      if (!texture) {
+        continue;
+      }
+      // Mark the ranges as uploaded and watch them. This is needed for scaled
+      // resolves as well to detect when the CPU wants to reuse the memory for
+      // a regular texture or a vertex buffer, and thus the scaled resolve
+      // version is not up to date anymore.
+      texture->MakeUpToDateAndWatch(crit);
+
+      texture->LogAction("Loaded");
+    }
+  }
+}
 bool TextureCache::LoadTextureData(Texture& texture) {
   // Check what needs to be uploaded.
   bool base_outdated, mips_outdated;
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 197124e37..ff29fcb38 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -559,6 +559,7 @@ class TextureCache {
     return load_shader_info_[load_shader_index];
   }
   bool LoadTextureData(Texture& texture);
+  void LoadTexturesData(Texture** textures, uint32_t n_textures);
   // Writes the texture data (for base, mips or both - but not neither) from the
   // shared memory or the scaled resolve memory. The shared memory management is
   // done outside this function, the implementation just needs to load the data
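LoadTexturesData restructures N independent lock-check-load-publish sequences into three phases: classify every texture under one acquisition of the global critical region, do the expensive shared-memory work unlocked, then re-acquire once to watch and publish. A generic sketch of that shape (types and method names are illustrative, not the cache's API):

    #include <cstdint>
    #include <mutex>

    struct Item {
      bool dirty = true;
      bool NeedsWork() const { return dirty; }  // call with the lock held
      bool DoSlowWork() { return true; }        // safe without the lock
      void Publish() { dirty = false; }         // call with the lock held
    };

    void LoadBatch(std::mutex& m, Item** items, uint32_t n) {
      uint64_t pending = 0;  // n <= 64, one bit per item
      {
        std::lock_guard<std::mutex> lock(m);  // one acquisition for the batch
        for (uint32_t i = 0; i < n; ++i) {
          if (items[i]->NeedsWork()) pending |= uint64_t(1) << i;
        }
      }
      for (uint32_t i = 0; i < n; ++i) {
        // Work happens unlocked; failures drop out of the publish pass.
        if (!(pending & (uint64_t(1) << i)) || !items[i]->DoSlowWork()) {
          pending &= ~(uint64_t(1) << i);
        }
      }
      std::lock_guard<std::mutex> lock(m);  // second and last acquisition
      for (uint32_t i = 0; i < n; ++i) {
        if (pending & (uint64_t(1) << i)) items[i]->Publish();
      }
    }

The 64-bit masks play the same role as index_base_outdated and index_mips_outdated above: they record, per slot, what was decided while the lock was held.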
diff --git a/src/xenia/gpu/texture_info.cc b/src/xenia/gpu/texture_info.cc
index 67c465b33..7a617af9b 100644
--- a/src/xenia/gpu/texture_info.cc
+++ b/src/xenia/gpu/texture_info.cc
@@ -8,14 +8,7 @@
  */
 
 #include "xenia/gpu/texture_info.h"
-
-#include
-#include
-#include
-
 #include "xenia/base/logging.h"
-#include "xenia/base/math.h"
-#include "xenia/base/memory.h"
 #include "xenia/base/xxhash.h"
 
 namespace xe {
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
index 5c514f61a..567e2cc48 100644
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include "xenia/base/assert.h"
 #include "xenia/gpu/xenos.h"
 
 namespace xe {
diff --git a/src/xenia/gpu/texture_util.cc b/src/xenia/gpu/texture_util.cc
index cbe6c62bd..74f02b3d6 100644
--- a/src/xenia/gpu/texture_util.cc
+++ b/src/xenia/gpu/texture_util.cc
@@ -8,13 +8,6 @@
  */
 
 #include "xenia/gpu/texture_util.h"
-
-#include
-#include
-
-#include "xenia/base/assert.h"
-#include "xenia/base/math.h"
-
 namespace xe {
 namespace gpu {
 namespace texture_util {
diff --git a/src/xenia/gpu/ucode.h b/src/xenia/gpu/ucode.h
index 3b0875e25..edecafe7f 100644
--- a/src/xenia/gpu/ucode.h
+++ b/src/xenia/gpu/ucode.h
@@ -10,11 +10,6 @@
 #ifndef XENIA_GPU_UCODE_H_
 #define XENIA_GPU_UCODE_H_
 
-#include
-
-#include "xenia/base/assert.h"
-#include "xenia/base/math.h"
-#include "xenia/base/platform.h"
 #include "xenia/gpu/xenos.h"
 
 // The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index b1b2eb1cd..f5fd3e1af 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -46,6 +46,9 @@ namespace gpu {
 namespace vulkan {
 
 class VulkanCommandProcessor final : public CommandProcessor {
+ protected:
+#include "../pm4_command_processor_declare.h"
+
  public:
   // Single-descriptor layouts for use within a single frame.
   enum class SingleTransientDescriptorLayout {
@@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
     kStorageBufferCompute,
     kCount,
   };
-#include "../pm4_command_processor_declare.h"
 
   class ScratchBufferAcquisition {
    public:
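Moving #include "../pm4_command_processor_declare.h" under protected: works because inclusion is textual: whatever declarations the header expands to simply take the access specifier active at the point of inclusion. A hypothetical miniature of the mechanism:

    // methods.inc (hypothetical declare header) would contain:
    //   void HandlePm4Packet();
    //   uint32_t pm4_packet_count_;

    class Processor {
     protected:
      // #include "methods.inc" here would expand to exactly the two members
      // below, which therefore become protected rather than public:
      void HandlePm4Packet();
      uint32_t pm4_packet_count_;

     public:
      void Work();
    };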
diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
index eb6403441..33c348988 100644
--- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
+++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc
@@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
 
 bool VulkanSharedMemory::UploadRanges(
     const std::pair* upload_page_ranges,
-    unsigned num_upload_ranges) {
+    uint32_t num_upload_ranges) {
   if (!num_upload_ranges) {
     return true;
   }
diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h
index b83877a38..d8a72cb8c 100644
--- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h
+++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h
@@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
   bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                         uint32_t length_allocations) override;
 
-  bool UploadRanges(const std::pair*
-                        upload_page_ranges, unsigned num_ranges) override;
+  bool UploadRanges(const std::pair* upload_page_ranges,
+                    uint32_t num_ranges) override;
 
  private:
   void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,
diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc
index 997e9a48a..05bb30014 100644
--- a/src/xenia/gpu/xenos.cc
+++ b/src/xenia/gpu/xenos.cc
@@ -9,10 +9,6 @@
 
 #include "xenia/gpu/xenos.h"
 
-#include
-
-#include "xenia/base/math.h"
-
 namespace xe {
 namespace gpu {
 namespace xenos {
diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h
index 8e9fd5c11..6c39daf45 100644
--- a/src/xenia/gpu/xenos.h
+++ b/src/xenia/gpu/xenos.h
@@ -10,13 +10,10 @@
 #ifndef XENIA_GPU_XENOS_H_
 #define XENIA_GPU_XENOS_H_
 
-#include
-#include "xenia/base/assert.h"
-#include "xenia/base/byte_order.h"
-#include "xenia/base/math.h"
 #include "xenia/base/memory.h"
-#include "xenia/base/platform.h"
+#include "xenia/base/math.h"
+
 namespace xe {
 namespace gpu {
diff --git a/src/xenia/kernel/util/object_table.cc b/src/xenia/kernel/util/object_table.cc
index d1250791c..f560cc837 100644
--- a/src/xenia/kernel/util/object_table.cc
+++ b/src/xenia/kernel/util/object_table.cc
@@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {
 
 // Generic lookup
 template <>
-object_ref ObjectTable::LookupObject(X_HANDLE handle) {
-  auto object = ObjectTable::LookupObject(handle, false);
+object_ref ObjectTable::LookupObject(
+    X_HANDLE handle, bool already_locked) {
+  auto object = ObjectTable::LookupObject(handle, already_locked);
   auto result = object_ref(reinterpret_cast(object));
   return result;
 }
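The object-table changes in this file and the ones below all thread an already_locked flag through the lookup path so that batched callers can take the global mutex once instead of once per handle. A compilable sketch of the convention, with a plain recursive mutex and integer handles standing in for the real types:

    #include <cstddef>
    #include <mutex>

    static std::recursive_mutex g_table_mutex;
    static int g_table[16];

    // The real body; callers must hold g_table_mutex.
    static int* LookupLocked(int handle) {
      return (handle >= 0 && handle < 16) ? &g_table[handle] : nullptr;
    }

    int* Lookup(int handle, bool already_locked = false) {
      if (already_locked) {
        return LookupLocked(handle);  // caller holds the mutex already
      }
      std::lock_guard<std::recursive_mutex> lock(g_table_mutex);
      return LookupLocked(handle);
    }

    // Batched caller: one acquisition instead of n.
    void LookupMany(const int* handles, int** out, size_t n) {
      std::lock_guard<std::recursive_mutex> lock(g_table_mutex);
      for (size_t i = 0; i < n; ++i) {
        out[i] = Lookup(handles[i], /*already_locked=*/true);
      }
    }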
diff --git a/src/xenia/kernel/util/object_table.h b/src/xenia/kernel/util/object_table.h
index 6ac67d1b7..7722c28d1 100644
--- a/src/xenia/kernel/util/object_table.h
+++ b/src/xenia/kernel/util/object_table.h
@@ -46,10 +46,9 @@ class ObjectTable {
   // Restores a XObject reference with a handle. Mainly for internal use - do
   // not use.
   X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);
-
   template
-  object_ref LookupObject(X_HANDLE handle) {
-    auto object = LookupObject(handle, false);
+  object_ref LookupObject(X_HANDLE handle, bool already_locked = false) {
+    auto object = LookupObject(handle, already_locked);
     if (T::kObjectType == XObject::Type::Socket) {
       object = LookupObject((handle | 0xF8000000), false);
     }
@@ -110,7 +109,8 @@ class ObjectTable {
 
 // Generic lookup
 template <>
-object_ref ObjectTable::LookupObject(X_HANDLE handle);
+object_ref ObjectTable::LookupObject(
+    X_HANDLE handle, bool already_locked);
 
 }  // namespace util
 }  // namespace kernel
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
index 3e9c39cbd..0cf538ffa 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_audio.cc
@@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);
 
 dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
                                                       lpdword_t driver_ptr) {
+  if (!callback_ptr) {
+    return X_E_INVALIDARG;
+  }
+
   uint32_t callback = callback_ptr[0];
+
+  if (!callback) {
+    return X_E_INVALIDARG;
+  }
   uint32_t callback_arg = callback_ptr[1];
 
   auto audio_system = kernel_state()->emulator()->audio_system();
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
index b5bb6c57b..95b26dfb3 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
@@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
 }
 DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
                          kHighFrequency);
+dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
+  X_STATUS result = X_STATUS_SUCCESS;
+  auto ev = kernel_state()->object_table()->LookupObject(handle);
+  if (ev) {
+    uint32_t type_tmp, state_tmp;
+
+    ev->Query(&type_tmp, &state_tmp);
+
+    out_struc[0] = type_tmp;
+    out_struc[1] = state_tmp;
+
+  } else {
+    result = X_STATUS_INVALID_HANDLE;
+  }
+
+  return result;
+}
+DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
+                         kHighFrequency);
 
 uint32_t xeNtClearEvent(uint32_t handle) {
   X_STATUS result = X_STATUS_SUCCESS;
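For context, the two dwords NtQueryEvent_entry writes appear to mirror the layout of NT's EVENT_BASIC_INFORMATION (EventType, then EventState, both 32-bit). A hypothetical guest-side view of the output buffer, shown only to document the layout the handler implies:

    #include <cstdint>

    // Hypothetical mirror of the buffer NtQueryEvent fills; field meanings
    // follow NT's EVENT_BASIC_INFORMATION.
    struct X_EVENT_BASIC_INFORMATION {
      uint32_t event_type;   // 0 = notification (manual-reset),
                             // 1 = synchronization (auto-reset)
      uint32_t event_state;  // nonzero while the event is signaled
    };

    bool IsEventSignaled(const uint32_t* out_struc) {
      X_EVENT_BASIC_INFORMATION info;
      info.event_type = out_struc[0];
      info.event_state = out_struc[1];
      return info.event_state != 0;
    }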
@@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
     lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
   assert_true(wait_type <= 1);
 
-  std::vector> objects;
-  for (uint32_t n = 0; n < count; n++) {
-    auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
-    auto object_ref =
-        XObject::GetNativeObject(kernel_state(), object_ptr);
-    if (!object_ref) {
-      return X_STATUS_INVALID_PARAMETER;
+  assert_true(count <= 64);
+  object_ref objects[64];
+  {
+    auto crit = global_critical_region::AcquireDirect();
+    for (uint32_t n = 0; n < count; n++) {
+      auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
+      auto object_ref = XObject::GetNativeObject(kernel_state(),
+                                                 object_ptr, -1, true);
+      if (!object_ref) {
+        return X_STATUS_INVALID_PARAMETER;
+      }
+
+      objects[n] = std::move(object_ref);
     }
-
-    objects.push_back(std::move(object_ref));
   }
 
   uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
-  return XObject::WaitMultiple(uint32_t(objects.size()),
-                               reinterpret_cast(objects.data()),
-                               wait_type, wait_reason, processor_mode,
-                               alertable, timeout_ptr ? &timeout : nullptr);
+  return XObject::WaitMultiple(
+      uint32_t(count), reinterpret_cast(&objects[0]), wait_type,
+      wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
 }
 DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
                          kBlocking, kHighFrequency);
@@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be* handles,
                                       uint64_t* timeout_ptr) {
   assert_true(wait_type <= 1);
-  std::vector> objects;
-  for (uint32_t n = 0; n < count; n++) {
-    uint32_t object_handle = handles[n];
-    auto object =
-        kernel_state()->object_table()->LookupObject(object_handle);
-    if (!object) {
-      return X_STATUS_INVALID_PARAMETER;
+  assert_true(count <= 64);
+  object_ref objects[64];
+
+  /*
+     Reserving to squash the constant reallocations: in a benchmark of one
+     particular game over a period of five minutes, roughly 11% of CPU time
+     was spent inside a helper function to Windows' heap allocation function,
+     and 7% of that was traced back to here.
+
+     edit: actually switched to a fixed-size array, as there can never be
+     more than 64 events specified
+  */
+  {
+    auto crit = global_critical_region::AcquireDirect();
+    for (uint32_t n = 0; n < count; n++) {
+      uint32_t object_handle = handles[n];
+      auto object = kernel_state()->object_table()->LookupObject(
+          object_handle, true);
+      if (!object) {
+        return X_STATUS_INVALID_PARAMETER;
+      }
+      objects[n] = std::move(object);
     }
-    objects.push_back(std::move(object));
   }
 
-  return XObject::WaitMultiple(count,
-                               reinterpret_cast(objects.data()),
+  return XObject::WaitMultiple(count, reinterpret_cast(&objects[0]),
                                wait_type, 6, wait_mode, alertable,
                                timeout_ptr);
 }
@@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
     dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
     dword_t alertable, lpqword_t timeout_ptr) {
   uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
+  if (!count || count > 64 || (wait_type != 0 && wait_type != 1)) {
+    return X_STATUS_INVALID_PARAMETER;
+  }
   return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
                                       alertable,
                                       timeout_ptr ? &timeout : nullptr);
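The benchmark comment above is the motivation for the same change made in several places in this diff: a std::vector built and torn down on every call shows up as heap-allocator time in profiles, while a stack array of at most 64 move-assigned refs never allocates. A stand-in sketch of the shape (the ref type only approximates object_ref's move semantics):

    #include <cstdint>
    #include <utility>

    template <typename T>
    struct object_ref_like {  // stand-in for object_ref<T>
      T* ptr = nullptr;
      object_ref_like() = default;
      object_ref_like(object_ref_like&& o) noexcept : ptr(o.ptr) {
        o.ptr = nullptr;
      }
      object_ref_like& operator=(object_ref_like&& o) noexcept {
        ptr = o.ptr;
        o.ptr = nullptr;
        return *this;
      }
    };

    struct XObjectStub {};

    uint32_t GatherWaitObjects(uint32_t count) {
      if (!count || count > 64) return 0xC000000Du;  // X_STATUS_INVALID_PARAMETER
      object_ref_like<XObjectStub> objects[64];  // stack storage, no heap use
      for (uint32_t n = 0; n < count; ++n) {
        object_ref_like<XObjectStub> found;  // ... handle lookup would go here ...
        objects[n] = std::move(found);       // move-assign into the fixed slot
      }
      return 0;  // X_STATUS_SUCCESS
    }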
@@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
                                                       dword_t r6,
                                                       lpqword_t timeout_ptr) {
   X_STATUS result = X_STATUS_SUCCESS;
+  // pre-lock for these two handle lookups
+  global_critical_region::mutex().lock();
 
-  auto signal_object =
-      kernel_state()->object_table()->LookupObject(signal_handle);
+  auto signal_object = kernel_state()->object_table()->LookupObject(
+      signal_handle, true);
   auto wait_object =
-      kernel_state()->object_table()->LookupObject(wait_handle);
+      kernel_state()->object_table()->LookupObject(wait_handle, true);
+  global_critical_region::mutex().unlock();
   if (signal_object && wait_object) {
     uint64_t timeout = timeout_ptr ? static_cast(*timeout_ptr) : 0u;
     result =
diff --git a/src/xenia/kernel/xevent.cc b/src/xenia/kernel/xevent.cc
index 03c947c7e..bf1176af8 100644
--- a/src/xenia/kernel/xevent.cc
+++ b/src/xenia/kernel/xevent.cc
@@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
   event_->Reset();
   return 1;
 }
 
+void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
+  auto [type, state] = event_->Query();
+  *out_type = type;
+  *out_state = state;
+}
 void XEvent::Clear() { event_->Reset(); }
 
 bool XEvent::Save(ByteStream* stream) {
diff --git a/src/xenia/kernel/xevent.h b/src/xenia/kernel/xevent.h
index b5f4accf5..4fd174cd0 100644
--- a/src/xenia/kernel/xevent.h
+++ b/src/xenia/kernel/xevent.h
@@ -36,6 +36,7 @@ class XEvent : public XObject {
   int32_t Set(uint32_t priority_increment, bool wait);
   int32_t Pulse(uint32_t priority_increment, bool wait);
   int32_t Reset();
+  void Query(uint32_t* out_type, uint32_t* out_state);
   void Clear();
 
   bool Save(ByteStream* stream) override;
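XEvent::Query assumes the underlying threading event exposes a Query() returning a (type, state) pair; the structured binding unpacks it before the values are copied out through raw pointers for the kernel ABI. A compilable sketch of that assumed shape:

    #include <cstdint>
    #include <utility>

    struct EventStub {
      uint32_t type = 1;   // hypothetical: 1 = synchronization (auto-reset)
      uint32_t state = 0;  // hypothetical: 1 = signaled
      std::pair<uint32_t, uint32_t> Query() const { return {type, state}; }
    };

    void QueryOut(const EventStub& ev, uint32_t* out_type, uint32_t* out_state) {
      auto [type, state] = ev.Query();  // decomposes the pair by position
      *out_type = type;
      *out_state = state;
    }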
diff --git a/src/xenia/kernel/xobject.cc b/src/xenia/kernel/xobject.cc
index bf189c2af..bd7374b74 100644
--- a/src/xenia/kernel/xobject.cc
+++ b/src/xenia/kernel/xobject.cc
@@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                                uint32_t wait_type, uint32_t wait_reason,
                                uint32_t processor_mode, uint32_t alertable,
                                uint64_t* opt_timeout) {
-  std::vector wait_handles(count);
+  xe::threading::WaitHandle* wait_handles[64];
+
   for (size_t i = 0; i < count; ++i) {
     wait_handles[i] = objects[i]->GetWaitHandle();
     assert_not_null(wait_handles[i]);
@@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                    : std::chrono::milliseconds::max();
 
   if (wait_type) {
-    auto result = xe::threading::WaitAny(std::move(wait_handles),
+    auto result = xe::threading::WaitAny(wait_handles, count,
                                          alertable ? true : false, timeout_ms);
     switch (result.first) {
       case xe::threading::WaitResult::kSuccess:
@@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
       return X_STATUS_UNSUCCESSFUL;
     }
   } else {
-    auto result = xe::threading::WaitAll(std::move(wait_handles),
+    auto result = xe::threading::WaitAll(wait_handles, count,
                                          alertable ? true : false, timeout_ms);
     switch (result) {
       case xe::threading::WaitResult::kSuccess:
@@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
 }
 
 object_ref XObject::GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr,
-                                    int32_t as_type) {
+                                    void* native_ptr, int32_t as_type,
+                                    bool already_locked) {
   assert_not_null(native_ptr);
 
   // Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@@ -373,10 +374,12 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
   // We identify this by setting wait_list_flink to a magic value. When set,
   // wait_list_blink will hold a handle to our object.
 
-  auto global_lock = xe::global_critical_region::AcquireDirect();
+  if (!already_locked) {
+    global_critical_region::mutex().lock();
+  }
 
   auto header = reinterpret_cast(native_ptr);
-
+  XObject* result;
   if (as_type == -1) {
     as_type = header->type;
   }
@@ -385,10 +388,12 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
     // Already initialized.
     // TODO: assert if the type of the object != as_type
     uint32_t handle = header->wait_list_blink;
-    auto object = kernel_state->object_table()->LookupObject(handle);
-
+    result = kernel_state->object_table()
+                 ->LookupObject(handle, true)
+                 .release();
+    goto return_result;
     // TODO(benvanik): assert nothing has been changed in the struct.
-    return object;
+    // return object;
   } else {
     // First use, create new.
     // https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html
@@ -430,14 +435,22 @@ object_ref XObject::GetNativeObject(KernelState* kernel_state,
       case 24:  // ThreadedDpcObject
       default:
        assert_always();
-        return NULL;
+        result = nullptr;
+        goto return_result;
+
+        // return NULL;
     }
 
     // Stash pointer in struct.
     // FIXME: This assumes the object contains a dispatch header (some don't!)
     StashHandle(header, object->handle());
+    result = object;
 
-    return object_ref(object);
+  return_result:
+    if (!already_locked) {
+      global_critical_region::mutex().unlock();
+    }
+    return object_ref(result);
   }
 }
diff --git a/src/xenia/kernel/xobject.h b/src/xenia/kernel/xobject.h
index 67518b35c..df067b55d 100644
--- a/src/xenia/kernel/xobject.h
+++ b/src/xenia/kernel/xobject.h
@@ -192,10 +192,12 @@ class XObject {
 
   static object_ref GetNativeObject(KernelState* kernel_state,
                                     void* native_ptr,
-                                    int32_t as_type = -1);
+                                    int32_t as_type = -1,
+                                    bool already_locked = false);
 
   template
   static object_ref GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr, int32_t as_type = -1);
+                                    void* native_ptr, int32_t as_type = -1,
+                                    bool already_locked = false);
 
  protected:
   bool SaveObject(ByteStream* stream);
@@ -362,9 +364,11 @@ object_ref retain_object(T* ptr) {
 
 template
 object_ref XObject::GetNativeObject(KernelState* kernel_state,
-                                    void* native_ptr, int32_t as_type) {
+                                    void* native_ptr, int32_t as_type,
+                                    bool already_locked) {
   return object_ref(reinterpret_cast(
-      GetNativeObject(kernel_state, native_ptr, as_type).release()));
+      GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
+          .release()));
 }
 
 }  // namespace kernel
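GetNativeObject now pairs a conditional lock() with a single return_result: label so that every path unlocks exactly once. An equivalent structure without the goto is a guard that only engages when the caller does not already hold the mutex, sketched below with a plain recursive mutex standing in for the global critical region:

    #include <mutex>

    static std::recursive_mutex g_global_mutex;

    class ConditionalLock {
     public:
      explicit ConditionalLock(bool already_locked) : engaged_(!already_locked) {
        if (engaged_) g_global_mutex.lock();
      }
      ~ConditionalLock() {
        if (engaged_) g_global_mutex.unlock();  // runs on every return path
      }
      ConditionalLock(const ConditionalLock&) = delete;
      ConditionalLock& operator=(const ConditionalLock&) = delete;

     private:
      bool engaged_;
    };

    int* GetNativeObjectSketch(bool already_locked) {
      ConditionalLock lock(already_locked);
      // Early returns here no longer need a goto to reach the unlock.
      return nullptr;
    }

std::unique_lock with std::defer_lock can express the same thing; the explicit guard just makes the already_locked contract visible at the call site.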