From 0370f8bbd91503b9ae9808ac4e1c271d624cd598 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 3 Aug 2019 19:16:04 +0300 Subject: [PATCH] [Memory] Pass exact_range to watch callbacks --- src/xenia/gpu/d3d12/primitive_converter.cc | 7 ++-- src/xenia/gpu/d3d12/primitive_converter.h | 5 ++- src/xenia/gpu/d3d12/shared_memory.cc | 49 ++++++++++++---------- src/xenia/gpu/d3d12/shared_memory.h | 5 ++- src/xenia/memory.cc | 3 +- src/xenia/memory.h | 3 +- 6 files changed, 40 insertions(+), 32 deletions(-) diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index 0da7e33d0..95f2fc2f6 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -700,7 +700,7 @@ void* PrimitiveConverter::AllocateIndices( } std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback( - uint32_t physical_address_start, uint32_t length) { + uint32_t physical_address_start, uint32_t length, bool exact_range) { // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size. 
uint32_t bit_index_first = physical_address_start >> 23; uint32_t bit_index_last = (physical_address_start + length - 1) >> 23; @@ -713,9 +713,10 @@ std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback( } std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length) { + void* context_ptr, uint32_t physical_address_start, uint32_t length, + bool exact_range) { return reinterpret_cast<PrimitiveConverter*>(context_ptr) - ->MemoryWriteCallback(physical_address_start, length); + ->MemoryWriteCallback(physical_address_start, length, exact_range); } D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer( diff --git a/src/xenia/gpu/d3d12/primitive_converter.h b/src/xenia/gpu/d3d12/primitive_converter.h index d436d1c60..12812d3e1 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.h +++ b/src/xenia/gpu/d3d12/primitive_converter.h @@ -90,9 +90,10 @@ class PrimitiveConverter { // Callback for invalidating buffers mid-frame. std::pair<uint32_t, uint32_t> MemoryWriteCallback( - uint32_t physical_address_start, uint32_t length); + uint32_t physical_address_start, uint32_t length, bool exact_range); static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length); + void* context_ptr, uint32_t physical_address_start, uint32_t length, + bool exact_range); D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index c4c523eff..6f302c952 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -578,13 +578,14 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first, } std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length) { + void* context_ptr, uint32_t physical_address_start, uint32_t length, + bool exact_range) { return reinterpret_cast<SharedMemory*>(context_ptr) - 
->MemoryWriteCallback(physical_address_start, length); + ->MemoryWriteCallback(physical_address_start, length, exact_range); } std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback( - uint32_t physical_address_start, uint32_t length) { + uint32_t physical_address_start, uint32_t length, bool exact_range) { uint32_t page_first = physical_address_start >> page_size_log2_; uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_; assert_true(page_first < page_count_ && page_last < page_count_); @@ -593,26 +594,28 @@ std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback( auto global_lock = global_critical_region_.Acquire(); - // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be - // invalidated - if no GPU-written data nearby that was not intended to be - // invalidated since it's not in sync with CPU memory and can't be reuploaded. - // It's a lot cheaper to upload some excess data than to catch access - // violations - with 4 KB callbacks, the original Doom runs at 4 FPS on - // Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per - // frame, but with 256 KB it's 0.7 ms. - if (page_first & 63) { - uint64_t gpu_written_start = - valid_and_gpu_written_pages_[(block_first << 1) + 1]; - gpu_written_start &= (1ull << (page_first & 63)) - 1; - page_first = - (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); - } - if ((page_last & 63) != 63) { - uint64_t gpu_written_end = - valid_and_gpu_written_pages_[(block_last << 1) + 1]; - gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); - page_last = (page_last & ~uint32_t(63)) + - (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); + if (!exact_range) { + // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be + // invalidated - if no GPU-written data nearby that was not intended to be + // invalidated since it's not in sync with CPU memory and can't be + // reuploaded. 
It's a lot cheaper to upload some excess data than to catch + // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS + // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per + // frame, but with 256 KB it's 0.7 ms. + if (page_first & 63) { + uint64_t gpu_written_start = + valid_and_gpu_written_pages_[(block_first << 1) + 1]; + gpu_written_start &= (1ull << (page_first & 63)) - 1; + page_first = + (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); + } + if ((page_last & 63) != 63) { + uint64_t gpu_written_end = + valid_and_gpu_written_pages_[(block_last << 1) + 1]; + gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); + page_last = (page_last & ~uint32_t(63)) + + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); + } } for (uint32_t i = block_first; i <= block_last; ++i) { diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h index de2d44a6b..dc1869fbd 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/d3d12/shared_memory.h @@ -191,9 +191,10 @@ class SharedMemory { // Memory access callback. 
static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length); + void* context_ptr, uint32_t physical_address_start, uint32_t length, + bool exact_range); std::pair<uint32_t, uint32_t> MemoryWriteCallback( - uint32_t physical_address_start, uint32_t length); + uint32_t physical_address_start, uint32_t length, bool exact_range); struct GlobalWatch { GlobalWatchCallback callback; diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index 6b27db978..8d8da354f 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -1643,7 +1643,8 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, for (auto physical_write_watch : memory_->physical_write_watches_) { std::pair<uint32_t, uint32_t> callback_unwatch_range = physical_write_watch->callback(physical_write_watch->callback_context, - physical_address_start, physical_length); + physical_address_start, physical_length, + unwatch_exact_range); if (!unwatch_exact_range) { unwatch_first = std::max(unwatch_first, callback_unwatch_range.first); unwatch_last = std::min( diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 4c485f284..59b1fb009 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -338,7 +338,8 @@ class Memory { // the watched region that can be safely unwatched, if it doesn't matter, // return (0, UINT32_MAX). typedef std::pair<uint32_t, uint32_t> (*PhysicalWriteWatchCallback)( - void* context_ptr, uint32_t physical_address_start, uint32_t length); + void* context_ptr, uint32_t physical_address_start, uint32_t length, + bool exact_range); // Physical memory write watching, allowing subsystems to invalidate cached // data that depends on memory contents.