[Memory/D3D12] Unwatch up to 256 KB ranges
Parent: b5fb84473d
Commit: 24383b9137
src/xenia/gpu/d3d12/primitive_converter.cc
@@ -699,8 +699,8 @@ void* PrimitiveConverter::AllocateIndices(
   return mapping + simd_offset;
 }
 
-void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
-                                             uint32_t length) {
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
   uint32_t bit_index_first = physical_address_start >> 23;
   uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
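The `// 1 bit = (512 / 64) MB = 8 MB` comment above compresses some arithmetic: the 512 MB physical space is tracked by a single 64-bit mask, so one bit covers 2^23 bytes, which is why both bounds are shifted right by 23. A minimal standalone sketch of the region math, with hypothetical values and plain C++ in place of the emulator's helpers:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t start = 0x12340000;  // hypothetical write at about 291 MB
  uint32_t length = 0x20000;    // 128 KB
  uint32_t bit_first = start >> 23;                // 36
  uint32_t bit_last = (start + length - 1) >> 23;  // still 36: same 8 MB region
  uint64_t bits = ~((1ull << bit_first) - 1);      // clear bits below bit_first
  if (bit_last < 63) {
    bits &= (1ull << (bit_last + 1)) - 1;          // clear bits above bit_last
  }
  std::printf("regions %u..%u, mask %016llx\n", bit_first, bit_last,
              static_cast<unsigned long long>(bits));
  return 0;
}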
@@ -709,11 +709,12 @@ void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
     bits &= (1ull << (bit_index_last + 1)) - 1;
   }
   memory_regions_invalidated_ |= bits;
+  return std::make_pair<uint32_t, uint32_t>(0, UINT32_MAX);
 }
 
-void PrimitiveConverter::MemoryWriteCallbackThunk(
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallbackThunk(
     void* context_ptr, uint32_t physical_address_start, uint32_t length) {
-  reinterpret_cast<PrimitiveConverter*>(context_ptr)
+  return reinterpret_cast<PrimitiveConverter*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
src/xenia/gpu/d3d12/primitive_converter.h
@@ -89,10 +89,10 @@ class PrimitiveConverter {
       D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
 
   // Callback for invalidating buffers mid-frame.
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   D3D12CommandProcessor* command_processor_;
   RegisterFile* register_file_;
src/xenia/gpu/d3d12/shared_memory.cc
@@ -50,7 +50,8 @@ SharedMemory::SharedMemory(D3D12CommandProcessor* command_processor,
   uint32_t page_bitmap_length = page_count_ >> 6;
   assert_true(page_bitmap_length != 0);
 
-  valid_pages_.resize(page_bitmap_length);
+  // Two interleaved bit arrays.
+  valid_and_gpu_written_pages_.resize(page_bitmap_length << 1);
 }
 
 SharedMemory::~SharedMemory() { Shutdown(); }
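The doubled allocation (`page_bitmap_length << 1`) implements the two interleaved bit arrays named in the comment: for each block of 64 pages, the even element holds the "valid" bits and the odd element holds the "written by GPU" bits, so both masks for a block sit next to each other in memory. A sketch of the layout, with hypothetical accessor names for illustration only (this mirrors `valid_and_gpu_written_pages_`, it is not the actual API):

#include <cstdint>
#include <vector>

struct PageBitmaps {
  std::vector<uint64_t> bits;  // 2 interleaved 64-bit masks per 64-page block
  // Assumes page_count is a multiple of 64, as asserted in the constructor.
  explicit PageBitmaps(uint32_t page_count) : bits((page_count >> 6) * 2) {}
  bool IsValid(uint32_t page) const {
    return (bits[(page >> 6) * 2] >> (page & 63)) & 1;
  }
  bool IsGpuWritten(uint32_t page) const {  // subset of the valid pages
    return (bits[(page >> 6) * 2 + 1] >> (page & 63)) & 1;
  }
};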
@@ -124,7 +125,8 @@ bool SharedMemory::Initialize() {
                               uint32_t(BufferDescriptorIndex::kRawUAV)),
       buffer_, kBufferSize);
 
-  std::memset(valid_pages_.data(), 0, valid_pages_.size() * sizeof(uint64_t));
+  std::memset(valid_and_gpu_written_pages_.data(), 0,
+              valid_and_gpu_written_pages_.size() * sizeof(uint64_t));
 
   upload_buffer_pool_ =
       std::make_unique<ui::d3d12::UploadBufferPool>(context, 4 * 1024 * 1024);
@@ -381,7 +383,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
     }
     uint32_t upload_buffer_pages = upload_buffer_size >> page_size_log2_;
     // No mutex holding here!
-    MakeRangeValid(upload_range_start, upload_buffer_pages);
+    MakeRangeValid(upload_range_start, upload_buffer_pages, false);
     std::memcpy(
         upload_buffer_mapping,
         memory_->TranslatePhysical(upload_range_start << page_size_log2_),
@@ -447,7 +449,7 @@ void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) {
   // Mark the range as valid (so pages are not reuploaded until modified by the
   // CPU) and watch it so CPU reuse of the range will be caught.
   // No mutex holding here!
-  MakeRangeValid(page_first, page_last - page_first + 1);
+  MakeRangeValid(page_first, page_last - page_first + 1, true);
 }
 
 bool SharedMemory::AreTiledResourcesUsed() const {
@@ -462,7 +464,8 @@ bool SharedMemory::AreTiledResourcesUsed() const {
 }
 
 void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
-                                  uint32_t valid_page_count) {
+                                  uint32_t valid_page_count,
+                                  bool written_by_gpu) {
   if (valid_page_first >= page_count_ || valid_page_count == 0) {
     return;
   }
@@ -482,7 +485,12 @@ void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
     if (i == valid_block_last && (valid_page_last & 63) != 63) {
       valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] |= valid_bits;
+    valid_and_gpu_written_pages_[i << 1] |= valid_bits;
+    if (written_by_gpu) {
+      valid_and_gpu_written_pages_[(i << 1) + 1] |= valid_bits;
+    } else {
+      valid_and_gpu_written_pages_[(i << 1) + 1] &= ~valid_bits;
+    }
   }
 }
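The loop above writes one mask per 64-page block, and the edge blocks of the range get partial masks. A worked example with hypothetical page numbers: marking pages 100..130 valid touches block 1 (pages 64..127) and block 2 (pages 128..191):

#include <cstdint>

int main() {
  uint32_t page_first = 100, page_last = 130;
  for (uint32_t i = page_first >> 6; i <= (page_last >> 6); ++i) {
    uint64_t valid_bits = UINT64_MAX;
    if (i == (page_first >> 6) && (page_first & 63) != 0) {
      valid_bits &= ~((1ull << (page_first & 63)) - 1);  // drop pages 64..99
    }
    if (i == (page_last >> 6) && (page_last & 63) != 63) {
      valid_bits &= (1ull << ((page_last & 63) + 1)) - 1;  // keep up to 130
    }
    // i == 1: bits 36..63 set (pages 100..127).
    // i == 2: bits 0..2 set (pages 128..130).
    (void)valid_bits;
  }
  return 0;
}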
@@ -527,7 +535,7 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
 
   uint32_t range_start = UINT32_MAX;
   for (uint32_t i = request_block_first; i <= request_block_last; ++i) {
-    uint64_t block_valid = valid_pages_[i];
+    uint64_t block_valid = valid_and_gpu_written_pages_[i << 1];
     // Consider pages in the block outside the requested range valid.
     if (i == request_block_first) {
       block_valid |= (1ull << (request_page_first & 63)) - 1;
@@ -569,25 +577,44 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
   }
 }
 
-void SharedMemory::MemoryWriteCallbackThunk(void* context_ptr,
-                                            uint32_t physical_address_start,
-                                            uint32_t length) {
-  reinterpret_cast<SharedMemory*>(context_ptr)
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallbackThunk(
+    void* context_ptr, uint32_t physical_address_start, uint32_t length) {
+  return reinterpret_cast<SharedMemory*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
 
-void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
-                                       uint32_t length) {
-  if (length == 0) {
-    return;
-  }
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   uint32_t page_first = physical_address_start >> page_size_log2_;
   uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_;
   assert_true(page_first < page_count_ && page_last < page_count_);
   uint32_t block_first = page_first >> 6;
   uint32_t block_last = page_last >> 6;
 
   auto global_lock = global_critical_region_.Acquire();
 
+  // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
+  // invalidated - that is, if there is no GPU-written data nearby that was
+  // not intended to be invalidated (it's not in sync with CPU memory and
+  // can't be reuploaded). It's a lot cheaper to upload some excess data than
+  // to catch access violations - with 4 KB callbacks, the original Doom runs
+  // at 4 FPS on an Intel Core i7-3770; with 64 KB, the CPU game code takes
+  // 3 ms to run per frame, but with 256 KB it's 0.7 ms.
+  if (page_first & 63) {
+    uint64_t gpu_written_start =
+        valid_and_gpu_written_pages_[(block_first << 1) + 1];
+    gpu_written_start &= (1ull << (page_first & 63)) - 1;
+    page_first =
+        (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
+  }
+  if ((page_last & 63) != 63) {
+    uint64_t gpu_written_end =
+        valid_and_gpu_written_pages_[(block_last << 1) + 1];
+    gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
+    page_last = (page_last & ~uint32_t(63)) +
+                (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
+  }
+
   for (uint32_t i = block_first; i <= block_last; ++i) {
     uint64_t invalidate_bits = UINT64_MAX;
     if (i == block_first) {
@@ -596,10 +623,15 @@ void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
     if (i == block_last && (page_last & 63) != 63) {
       invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[i << 1] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[(i << 1) + 1] &= ~invalidate_bits;
   }
 
   FireWatches(page_first, page_last, false);
+
+  return std::make_pair<uint32_t, uint32_t>(page_first << page_size_log2_,
+                                            (page_last - page_first + 1)
+                                                << page_size_log2_);
 }
 
 void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state) {
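The widening logic above is the heart of the commit: when a write lands inside a 64-page (256 KB) block, the whole block is invalidated unless GPU-written pages would be caught in it, and `xe::lzcnt`/`xe::tzcnt` find how far the range can safely grow. The pair returned at the end converts the widened page range back to bytes so the heap can unwatch it in one go. A sketch of the start-side computation, assuming `__builtin_clzll` as a stand-in for `xe::lzcnt` (which, unlike the builtin, is defined to return 64 for a zero input, hence the guard):

#include <cstdint>

// Returns the widened first page to invalidate within page_first's block.
uint32_t WidenFirst(uint32_t page_first, uint64_t gpu_written_bits) {
  // Keep only GPU-written pages below page_first within its 64-page block.
  gpu_written_bits &= (1ull << (page_first & 63)) - 1;
  // 64 - lzcnt is the index just past the highest GPU-written page,
  // or 0 if the block has none below page_first.
  uint32_t past_highest =
      gpu_written_bits ? 64 - uint32_t(__builtin_clzll(gpu_written_bits)) : 0;
  return (page_first & ~uint32_t(63)) + past_highest;
}

// WidenFirst(70, 0) == 64: no GPU data, invalidate from the block start.
// WidenFirst(70, 1ull << 1) == 66: page 65 is GPU-written, stop just above it.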
src/xenia/gpu/d3d12/shared_memory.h
@@ -128,7 +128,8 @@ class SharedMemory {
   bool AreTiledResourcesUsed() const;
 
   // Mark the memory range as updated and protect it.
-  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count);
+  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count,
+                      bool written_by_gpu);
 
   D3D12CommandProcessor* command_processor_;
@@ -182,14 +183,17 @@ class SharedMemory {
   // Things below should be protected by global_critical_region.
   // ***************************************************************************
 
-  // Bit vector containing whether physical memory system pages are up to date.
-  std::vector<uint64_t> valid_pages_;
+  // Bit vector containing:
+  // - Even block indices - whether physical memory system pages are up to date.
+  // - Odd block indices - whether physical memory system pages contain data
+  //   written by the GPU not synchronized with the CPU (subset of valid pages).
+  std::vector<uint64_t> valid_and_gpu_written_pages_;
 
   // Memory access callback.
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
 
   struct GlobalWatch {
     GlobalWatchCallback callback;
src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc
@@ -173,7 +173,7 @@ dword_result_t NtReadFile(dword_t file_handle, dword_t event_handle,
   // TODO(rick): better checking of physical address
   if (buffer.guest_address() >= 0xA0000000) {
     kernel_memory()->TriggerWatches(buffer.guest_address(), buffer_length,
-                                    true);
+                                    true, true);
   }
 
   // Synchronous.
src/xenia/memory.cc
@@ -433,7 +433,7 @@ bool Memory::AccessViolationCallback(size_t host_address, bool is_write) {
       heap == &heaps_.vE0000000) {
     return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
         virtual_address / system_page_size_ * system_page_size_,
-        system_page_size_, is_write);
+        system_page_size_, is_write, false);
   }
 
   return false;
@@ -461,7 +461,8 @@ void Memory::CancelAccessWatch(uintptr_t watch_handle) {
 }
 
 bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                            bool is_write) {
+                            bool is_write, bool unwatch_exact_range,
+                            bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap == &heaps_.vA0000000 || heap == &heaps_.vC0000000 ||
       heap == &heaps_.vE0000000) {
@@ -469,8 +470,8 @@ bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
     // watches are removed.
     cpu::MMIOHandler::global_handler()->InvalidateRange(virtual_address,
                                                         length);
-    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(virtual_address,
-                                                            length, is_write);
+    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
+        virtual_address, length, is_write, unwatch_exact_range, unprotect);
   }
   return false;
 }
@@ -1460,7 +1461,8 @@ bool PhysicalHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
     // watches are removed.
     cpu::MMIOHandler::global_handler()->InvalidateRange(base_address,
                                                         region_size);
-    TriggerWatches(base_address, region_size, true, !FLAGS_protect_on_release);
+    TriggerWatches(base_address, region_size, true, true,
+                   !FLAGS_protect_on_release);
   }
 
   if (!parent_heap_->Release(parent_base_address, out_region_size)) {
@@ -1478,7 +1480,7 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
   // TODO(Triang3l): Remove InvalidateRange when legacy (old Vulkan renderer)
   // watches are removed.
   cpu::MMIOHandler::global_handler()->InvalidateRange(address, size);
-  TriggerWatches(address, size, true, false);
+  TriggerWatches(address, size, true, true, false);
 
   if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect,
                              old_protect)) {
@@ -1574,7 +1576,8 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                                  bool is_write, bool unprotect) {
+                                  bool is_write, bool unwatch_exact_range,
+                                  bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
@@ -1632,8 +1635,11 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
   }
 
   // Trigger callbacks.
-  // TODO(Triang3l): Accumulate the range that is safe to unwatch from the
-  // callbacks.
+  if (!unprotect) {
+    // If not doing anything with protection, no point in unwatching excess
+    // pages.
+    unwatch_exact_range = true;
+  }
   uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
   uint32_t physical_address_start =
       xe::sat_sub(system_page_first * system_page_size_,
@@ -1644,9 +1650,48 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
                   system_address_offset) +
           physical_address_offset - physical_address_start,
       heap_size_ + 1 - (physical_address_start - physical_address_offset));
+  uint32_t unwatch_first = 0;
+  uint32_t unwatch_last = UINT32_MAX;
   for (auto physical_write_watch : memory_->physical_write_watches_) {
-    physical_write_watch->callback(physical_write_watch->callback_context,
-                                   physical_address_start, physical_length);
+    std::pair<uint32_t, uint32_t> callback_unwatch_range =
+        physical_write_watch->callback(physical_write_watch->callback_context,
+                                       physical_address_start,
+                                       physical_length);
+    if (!unwatch_exact_range) {
+      unwatch_first = std::max(unwatch_first, callback_unwatch_range.first);
+      unwatch_last = std::min(
+          unwatch_last,
+          xe::sat_add(
+              callback_unwatch_range.first,
+              std::max(callback_unwatch_range.second, uint32_t(1)) - 1));
+    }
+  }
+  if (!unwatch_exact_range) {
+    // Always unwatch at least the requested pages.
+    unwatch_first = std::min(unwatch_first, physical_address_start);
+    unwatch_last =
+        std::max(unwatch_last, physical_address_start + physical_length - 1);
+    // Don't unprotect too much if not caring much about the region (limit to
+    // 4 MB - somewhat random, but max 1024 iterations of the page loop).
+    const uint32_t kMaxUnwatchExcess = 4 * 1024 * 1024;
+    unwatch_first = std::max(unwatch_first,
+                             physical_address_start & ~(kMaxUnwatchExcess - 1));
+    unwatch_last =
+        std::min(unwatch_last, (physical_address_start + physical_length - 1) |
+                                   (kMaxUnwatchExcess - 1));
+    // Convert to heap-relative addresses.
+    unwatch_first = xe::sat_sub(unwatch_first, physical_address_offset);
+    unwatch_last = xe::sat_sub(unwatch_last, physical_address_offset);
+    // Clamp to the heap upper bound.
+    unwatch_first = std::min(unwatch_first, heap_size_);
+    unwatch_last = std::min(unwatch_last, heap_size_);
+    // Convert to system pages and update the range.
+    unwatch_first += system_address_offset;
+    unwatch_last += system_address_offset;
+    assert_true(unwatch_first <= unwatch_last);
+    system_page_first = unwatch_first / system_page_size_;
+    system_page_last = unwatch_last / system_page_size_;
+    block_index_first = system_page_first >> 6;
+    block_index_last = system_page_last >> 6;
+  }
 
   // Unprotect ranges that need unprotection.
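Each callback returns the region it is happy to have unwatched, and the loop above intersects them (max of the starts, min of the ends) so no subsystem loses a watch it still needs; the result is then clamped to at most 4 MB of excess. A worked example of the power-of-two clamp, with hypothetical addresses:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t kMaxUnwatchExcess = 4 * 1024 * 1024;  // power of two
  uint32_t start = 0x00A23000;  // faulting write at about 10.1 MB
  uint32_t length = 0x1000;
  uint32_t unwatch_first = 0;          // callbacks allowed "anything"
  uint32_t unwatch_last = UINT32_MAX;
  // & ~(k - 1) rounds down and | (k - 1) rounds up to 4 MB boundaries.
  unwatch_first = std::max(unwatch_first, start & ~(kMaxUnwatchExcess - 1));
  unwatch_last =
      std::min(unwatch_last, (start + length - 1) | (kMaxUnwatchExcess - 1));
  // unwatch_first == 8 MB, unwatch_last == 12 MB - 1: the page loop below
  // runs at most 1024 iterations even when everything may be unwatched.
  std::printf("unwatch %08x..%08x\n", unwatch_first, unwatch_last);
  return 0;
}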
src/xenia/memory.h
@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "xenia/base/memory.h"
@@ -222,7 +223,7 @@ class PhysicalHeap : public BaseHeap {
   void WatchPhysicalWrite(uint32_t physical_address, uint32_t length);
   // Returns true if any page in the range was watched.
   bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
-                      bool unprotect = true);
+                      bool unwatch_exact_range, bool unprotect = true);
 
  protected:
  VirtualHeap* parent_heap_;
@@ -333,9 +334,11 @@ class Memory {
   // Cancels a write watch requested with AddPhysicalAccessWatch.
   void CancelAccessWatch(uintptr_t watch_handle);
 
-  typedef void (*PhysicalWriteWatchCallback)(void* context_ptr,
-                                             uint32_t physical_address_start,
-                                             uint32_t length);
+  // Returns the start and length of the smallest physical memory region
+  // surrounding the watched region that can be safely unwatched; if that
+  // doesn't matter, return (0, UINT32_MAX).
+  typedef std::pair<uint32_t, uint32_t> (*PhysicalWriteWatchCallback)(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   // Physical memory write watching, allowing subsystems to invalidate cached
   // data that depends on memory contents.
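Under the new contract, a watcher that caches nothing nearby can hand the whole surrounding region back, while one that must keep some pages watched returns the exact byte range it invalidated. A sketch of a conforming callback (names here are illustrative, not from the codebase):

#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> MyWriteWatchCallback(
    void* context_ptr, uint32_t physical_address_start, uint32_t length) {
  // ... invalidate cached data overlapping
  // [physical_address_start, physical_address_start + length) ...
  // Nothing GPU-written to protect here, so the caller may unwatch any
  // surrounding region it likes.
  return std::make_pair(uint32_t(0), uint32_t(UINT32_MAX));
}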
@@ -366,12 +369,6 @@ class Memory {
   // same pages, and watches must not be placed on read-only or totally
   // inaccessible pages, there are significant difficulties with synchronizing
   // all the three ranges, but it's generally not needed.
-  //
-  // TODO(Triang3l): Allow the callbacks to unwatch regions larger than one page
-  // (for instance, 64 KB) so there are less access violations. All callbacks
-  // must agree to unwatch larger ranges because in some cases (like regions
-  // near the locations that render targets have been resolved to) it is
-  // necessary to invalidate only a single page and none more.
   void* RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback,
                                    void* callback_context);
@@ -391,7 +388,8 @@ class Memory {
   // Forces triggering of watch callbacks for a virtual address range if pages
   // there are watched, and unwatches them. Returns whether any page was
   // watched.
-  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write);
+  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
+                      bool unwatch_exact_range, bool unprotect = true);
 
   // Allocates virtual memory from the 'system' heap.
   // System memory is kept separate from game memory but is still accessible