From 24383b9137953a3be669f901b44baa42f0f249d2 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Wed, 31 Jul 2019 00:18:12 +0300
Subject: [PATCH] [Memory/D3D12] Unwatch up to 256 KB ranges

---
 src/xenia/gpu/d3d12/primitive_converter.cc |  9 +--
 src/xenia/gpu/d3d12/primitive_converter.h  |  8 +--
 src/xenia/gpu/d3d12/shared_memory.cc       | 66 +++++++++++++++------
 src/xenia/gpu/d3d12/shared_memory.h        | 18 +++---
 src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc   |  2 +-
 src/xenia/memory.cc                        | 67 ++++++++++++++++++----
 src/xenia/memory.h                         | 20 +++----
 7 files changed, 135 insertions(+), 55 deletions(-)

diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc
index c79799d0f..33e281680 100644
--- a/src/xenia/gpu/d3d12/primitive_converter.cc
+++ b/src/xenia/gpu/d3d12/primitive_converter.cc
@@ -699,8 +699,8 @@ void* PrimitiveConverter::AllocateIndices(
   return mapping + simd_offset;
 }
 
-void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
-                                             uint32_t length) {
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
   uint32_t bit_index_first = physical_address_start >> 23;
   uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
@@ -709,11 +709,12 @@ void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
     bits &= (1ull << (bit_index_last + 1)) - 1;
   }
   memory_regions_invalidated_ |= bits;
+  return std::make_pair(0, UINT32_MAX);
 }
 
-void PrimitiveConverter::MemoryWriteCallbackThunk(
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallbackThunk(
     void* context_ptr, uint32_t physical_address_start, uint32_t length) {
-  reinterpret_cast<PrimitiveConverter*>(context_ptr)
+  return reinterpret_cast<PrimitiveConverter*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
 
diff --git a/src/xenia/gpu/d3d12/primitive_converter.h b/src/xenia/gpu/d3d12/primitive_converter.h
index 79aba99c3..d436d1c60 100644
--- a/src/xenia/gpu/d3d12/primitive_converter.h
+++ b/src/xenia/gpu/d3d12/primitive_converter.h
@@ -89,10 +89,10 @@ class PrimitiveConverter {
       D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
 
   // Callback for invalidating buffers mid-frame.
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   D3D12CommandProcessor* command_processor_;
   RegisterFile* register_file_;
diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc
index 6ad37e2f1..8b74daebe 100644
--- a/src/xenia/gpu/d3d12/shared_memory.cc
+++ b/src/xenia/gpu/d3d12/shared_memory.cc
@@ -50,7 +50,8 @@ SharedMemory::SharedMemory(D3D12CommandProcessor* command_processor,
   uint32_t page_bitmap_length = page_count_ >> 6;
   assert_true(page_bitmap_length != 0);
 
-  valid_pages_.resize(page_bitmap_length);
+  // Two interleaved bit arrays.
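+  // Even elements hold the "valid" bits and odd elements the "written by
+  // GPU" bits for the same 64-page block, so block i maps to elements
+  // (i << 1) and (i << 1) + 1.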
+  valid_and_gpu_written_pages_.resize(page_bitmap_length << 1);
 }
 
 SharedMemory::~SharedMemory() { Shutdown(); }
@@ -124,7 +125,8 @@ bool SharedMemory::Initialize() {
           uint32_t(BufferDescriptorIndex::kRawUAV)),
       buffer_, kBufferSize);
 
-  std::memset(valid_pages_.data(), 0, valid_pages_.size() * sizeof(uint64_t));
+  std::memset(valid_and_gpu_written_pages_.data(), 0,
+              valid_and_gpu_written_pages_.size() * sizeof(uint64_t));
 
   upload_buffer_pool_ =
       std::make_unique<ui::d3d12::UploadBufferPool>(context, 4 * 1024 * 1024);
@@ -381,7 +383,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
     }
     uint32_t upload_buffer_pages = upload_buffer_size >> page_size_log2_;
     // No mutex holding here!
-    MakeRangeValid(upload_range_start, upload_buffer_pages);
+    MakeRangeValid(upload_range_start, upload_buffer_pages, false);
     std::memcpy(
         upload_buffer_mapping,
         memory_->TranslatePhysical(upload_range_start << page_size_log2_),
@@ -447,7 +449,7 @@ void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) {
   // Mark the range as valid (so pages are not reuploaded until modified by the
   // CPU) and watch it so the CPU can reuse it and this will be caught.
   // No mutex holding here!
-  MakeRangeValid(page_first, page_last - page_first + 1);
+  MakeRangeValid(page_first, page_last - page_first + 1, true);
 }
 
 bool SharedMemory::AreTiledResourcesUsed() const {
@@ -462,7 +464,8 @@ bool SharedMemory::AreTiledResourcesUsed() const {
 }
 
 void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
-                                  uint32_t valid_page_count) {
+                                  uint32_t valid_page_count,
+                                  bool written_by_gpu) {
   if (valid_page_first >= page_count_ || valid_page_count == 0) {
     return;
   }
@@ -482,7 +485,12 @@ void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
     if (i == valid_block_last && (valid_page_last & 63) != 63) {
       valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] |= valid_bits;
+    valid_and_gpu_written_pages_[i << 1] |= valid_bits;
+    if (written_by_gpu) {
+      valid_and_gpu_written_pages_[(i << 1) + 1] |= valid_bits;
+    } else {
+      valid_and_gpu_written_pages_[(i << 1) + 1] &= ~valid_bits;
+    }
   }
 }
 
@@ -527,7 +535,7 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
 
   uint32_t range_start = UINT32_MAX;
   for (uint32_t i = request_block_first; i <= request_block_last; ++i) {
-    uint64_t block_valid = valid_pages_[i];
+    uint64_t block_valid = valid_and_gpu_written_pages_[i << 1];
     // Consider pages in the block outside the requested range valid.
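+    // For example, if (request_page_first & 63) == 3, the mask below is
+    // 0b111, so pages 0-2 of the first block are treated as valid and are
+    // never returned as a range to upload.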
     if (i == request_block_first) {
       block_valid |= (1ull << (request_page_first & 63)) - 1;
     }
@@ -569,25 +577,44 @@
   }
 }
 
-void SharedMemory::MemoryWriteCallbackThunk(void* context_ptr,
-                                            uint32_t physical_address_start,
-                                            uint32_t length) {
-  reinterpret_cast<SharedMemory*>(context_ptr)
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallbackThunk(
+    void* context_ptr, uint32_t physical_address_start, uint32_t length) {
+  return reinterpret_cast<SharedMemory*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
 
-void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
-                                       uint32_t length) {
-  if (length == 0) {
-    return;
-  }
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   uint32_t page_first = physical_address_start >> page_size_log2_;
   uint32_t page_last =
       (physical_address_start + length - 1) >> page_size_log2_;
+  assert_true(page_first < page_count_ && page_last < page_count_);
   uint32_t block_first = page_first >> 6;
   uint32_t block_last = page_last >> 6;
 
   auto global_lock = global_critical_region_.Acquire();
 
+  // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
+  // invalidated - only when there is no nearby GPU-written data, which must
+  // not be invalidated since it's not in sync with CPU memory and can't be
+  // reuploaded. It's a lot cheaper to upload some excess data than to catch
+  // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
+  // on an Intel Core i7-3770; with 64 KB unwatching, the CPU game code takes
+  // 3 ms per frame, and with 256 KB, 0.7 ms.
+  if (page_first & 63) {
+    uint64_t gpu_written_start =
+        valid_and_gpu_written_pages_[(block_first << 1) + 1];
+    gpu_written_start &= (1ull << (page_first & 63)) - 1;
+    page_first =
+        (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
+  }
+  if ((page_last & 63) != 63) {
+    uint64_t gpu_written_end =
+        valid_and_gpu_written_pages_[(block_last << 1) + 1];
+    gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
+    page_last = (page_last & ~uint32_t(63)) +
+                (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
+  }
+
   for (uint32_t i = block_first; i <= block_last; ++i) {
     uint64_t invalidate_bits = UINT64_MAX;
     if (i == block_first) {
@@ -596,10 +623,15 @@ void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
     if (i == block_last && (page_last & 63) != 63) {
       invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[i << 1] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[(i << 1) + 1] &= ~invalidate_bits;
   }
 
   FireWatches(page_first, page_last, false);
+
+  return std::make_pair(page_first << page_size_log2_,
+                        (page_last - page_first + 1) << page_size_log2_);
 }
 
 void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state) {
diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h
index 2a6b78c71..de2d44a6b 100644
--- a/src/xenia/gpu/d3d12/shared_memory.h
+++ b/src/xenia/gpu/d3d12/shared_memory.h
@@ -128,7 +128,8 @@ class SharedMemory {
   bool AreTiledResourcesUsed() const;
 
   // Mark the memory range as updated and protect it.
-  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count);
+  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count,
+                      bool written_by_gpu);
 
   D3D12CommandProcessor* command_processor_;
 
@@ -182,14 +183,17 @@ class SharedMemory {
   // Things below should be protected by global_critical_region.
   // ***************************************************************************
 
-  // Bit vector containing whether physical memory system pages are up to date.
-  std::vector<uint64_t> valid_pages_;
+  // Bit vector containing:
+  // - Even block indices - whether physical memory system pages are up to
+  //   date.
+  // - Odd block indices - whether physical memory system pages contain data
+  //   written by the GPU not synchronized with the CPU (subset of valid
+  //   pages).
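+  // With 4 KB pages, that's 2 bits per page - 32 KB of bitmap for the 512 MB
+  // space.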
+  std::vector<uint64_t> valid_and_gpu_written_pages_;
 
   // Memory access callback.
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
 
   struct GlobalWatch {
     GlobalWatchCallback callback;
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc
index b5949eca1..21fb050b6 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc
@@ -173,7 +173,7 @@ dword_result_t NtReadFile(dword_t file_handle, dword_t event_handle,
   // TODO(rick): better checking of physical address
   if (buffer.guest_address() >= 0xA0000000) {
     kernel_memory()->TriggerWatches(buffer.guest_address(), buffer_length,
-                                    true);
+                                    true, true);
   }
 
   // Synchronous.
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index d02bea540..9a416e540 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -433,7 +433,7 @@ bool Memory::AccessViolationCallback(size_t host_address, bool is_write) {
       heap == &heaps_.vE0000000) {
     return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
         virtual_address / system_page_size_ * system_page_size_,
-        system_page_size_, is_write);
+        system_page_size_, is_write, false);
   }
 
   return false;
@@ -461,7 +461,8 @@ void Memory::CancelAccessWatch(uintptr_t watch_handle) {
 }
 
 bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                            bool is_write) {
+                            bool is_write, bool unwatch_exact_range,
+                            bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap == &heaps_.vA0000000 || heap == &heaps_.vC0000000 ||
       heap == &heaps_.vE0000000) {
@@ -469,8 +470,8 @@ bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
     // watches are removed.
     cpu::MMIOHandler::global_handler()->InvalidateRange(virtual_address,
                                                         length);
-    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(virtual_address,
-                                                            length, is_write);
+    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
+        virtual_address, length, is_write, unwatch_exact_range, unprotect);
   }
   return false;
 }
@@ -1460,7 +1461,8 @@ bool PhysicalHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
     // watches are removed.
    cpu::MMIOHandler::global_handler()->InvalidateRange(base_address,
                                                        region_size);
-    TriggerWatches(base_address, region_size, true, !FLAGS_protect_on_release);
+    TriggerWatches(base_address, region_size, true, true,
+                   !FLAGS_protect_on_release);
   }
 
   if (!parent_heap_->Release(parent_base_address, out_region_size)) {
@@ -1478,7 +1480,7 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
   // TODO(Triang3l): Remove InvalidateRange when legacy (old Vulkan renderer)
   // watches are removed.
   cpu::MMIOHandler::global_handler()->InvalidateRange(address, size);
-  TriggerWatches(address, size, true, false);
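+  // The new fourth argument requests an exact-range unwatch - no unprotection
+  // is done here (the protection is being changed explicitly), so there is no
+  // benefit in unwatching more than requested.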
+  TriggerWatches(address, size, true, true, false);
 
   if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect,
                              old_protect)) {
@@ -1574,7 +1576,8 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                                  bool is_write, bool unprotect) {
+                                  bool is_write, bool unwatch_exact_range,
+                                  bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
@@ -1632,8 +1635,11 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
   }
 
   // Trigger callbacks.
-  // TODO(Triang3l): Accumulate the range that is safe to unwatch from the
-  // callbacks.
+  if (!unprotect) {
+    // If not doing anything with protection, no point in unwatching excess
+    // pages.
+    unwatch_exact_range = true;
+  }
   uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
   uint32_t physical_address_start =
       xe::sat_sub(system_page_first * system_page_size_,
@@ -1644,9 +1650,48 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
                   system_address_offset) +
       physical_address_offset;
   uint32_t physical_length = std::min(
       xe::sat_sub(system_page_last * system_page_size_ + system_page_size_,
                   system_address_offset) +
           physical_address_offset - physical_address_start,
       heap_size_ + 1 - (physical_address_start - physical_address_offset));
+  uint32_t unwatch_first = 0;
+  uint32_t unwatch_last = UINT32_MAX;
   for (auto physical_write_watch : memory_->physical_write_watches_) {
-    physical_write_watch->callback(physical_write_watch->callback_context,
-                                   physical_address_start, physical_length);
+    std::pair<uint32_t, uint32_t> callback_unwatch_range =
+        physical_write_watch->callback(physical_write_watch->callback_context,
+                                       physical_address_start,
+                                       physical_length);
+    if (!unwatch_exact_range) {
+      unwatch_first = std::max(unwatch_first, callback_unwatch_range.first);
+      unwatch_last = std::min(
+          unwatch_last,
+          xe::sat_add(
+              callback_unwatch_range.first,
+              std::max(callback_unwatch_range.second, uint32_t(1)) - 1));
+    }
   }
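+  // The intersection of the callback-returned ranges is taken so that no
+  // pages a watcher still relies on are unwatched - e.g. if one callback
+  // returns (0, UINT32_MAX) and another (0x10000, 0x40000), only
+  // 0x10000-0x4FFFF may be unwatched.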
+  if (!unwatch_exact_range) {
+    // Always unwatch at least the requested pages.
+    unwatch_first = std::min(unwatch_first, physical_address_start);
+    unwatch_last =
+        std::max(unwatch_last, physical_address_start + physical_length - 1);
+    // Don't unprotect too much when the exact range doesn't matter (limit the
+    // excess to 4 MB - somewhat arbitrary, but at most 1024 iterations of the
+    // page loop).
+    const uint32_t kMaxUnwatchExcess = 4 * 1024 * 1024;
+    unwatch_first = std::max(unwatch_first,
+                             physical_address_start & ~(kMaxUnwatchExcess - 1));
+    unwatch_last =
+        std::min(unwatch_last, (physical_address_start + physical_length - 1) |
+                                   (kMaxUnwatchExcess - 1));
+    // Convert to heap-relative addresses.
+    unwatch_first = xe::sat_sub(unwatch_first, physical_address_offset);
+    unwatch_last = xe::sat_sub(unwatch_last, physical_address_offset);
+    // Clamp to the heap upper bound.
+    unwatch_first = std::min(unwatch_first, heap_size_);
+    unwatch_last = std::min(unwatch_last, heap_size_);
+    // Convert to system pages and update the range.
+    unwatch_first += system_address_offset;
+    unwatch_last += system_address_offset;
+    assert_true(unwatch_first <= unwatch_last);
+    system_page_first = unwatch_first / system_page_size_;
+    system_page_last = unwatch_last / system_page_size_;
+    block_index_first = system_page_first >> 6;
+    block_index_last = system_page_last >> 6;
+  }
 
   // Unprotect ranges that need unprotection.
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index cf5cd4284..4c485f284 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "xenia/base/memory.h"
@@ -222,7 +223,7 @@ class PhysicalHeap : public BaseHeap {
   void WatchPhysicalWrite(uint32_t physical_address, uint32_t length);
   // Returns true if any page in the range was watched.
   bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
-                      bool unprotect = true);
+                      bool unwatch_exact_range, bool unprotect = true);
 
  protected:
   VirtualHeap* parent_heap_;
@@ -333,9 +334,11 @@ class Memory {
   // Cancels a write watch requested with AddPhysicalAccessWatch.
   void CancelAccessWatch(uintptr_t watch_handle);
 
-  typedef void (*PhysicalWriteWatchCallback)(void* context_ptr,
-                                             uint32_t physical_address_start,
-                                             uint32_t length);
+  // Returns the start and length of the smallest physical memory region
+  // surrounding the watched region that can be safely unwatched; if that
+  // doesn't matter, return (0, UINT32_MAX).
+  typedef std::pair<uint32_t, uint32_t> (*PhysicalWriteWatchCallback)(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   // Physical memory write watching, allowing subsystems to invalidate cached
   // data that depends on memory contents.
@@ -366,12 +369,6 @@ class Memory {
   // same pages, and watches must not be placed on read-only or totally
   // inaccessible pages, there are significant difficulties with synchronizing
   // all the three ranges, but it's generally not needed.
-  //
-  // TODO(Triang3l): Allow the callbacks to unwatch regions larger than one page
-  // (for instance, 64 KB) so there are less access violations. All callbacks
-  // must agree to unwatch larger ranges because in some cases (like regions
-  // near the locations that render targets have been resolved to) it is
-  // necessary to invalidate only a single page and none more.
   void* RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback,
                                    void* callback_context);
 
@@ -391,7 +388,8 @@ class Memory {
   // Forces triggering of watch callbacks for a virtual address range if pages
   // are watched there and unwatching them. Returns whether any page was
   // watched.
-  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write);
+  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
+                      bool unwatch_exact_range, bool unprotect = true);
 
   // Allocates virtual memory from the 'system' heap.
   // System memory is kept separate from game memory but is still accessible