[Memory/D3D12] Unwatch up to 256 KB ranges

Triang3l 2019-07-31 00:18:12 +03:00
parent b5fb84473d
commit 24383b9137
7 changed files with 135 additions and 55 deletions

View File

@@ -699,8 +699,8 @@ void* PrimitiveConverter::AllocateIndices(
   return mapping + simd_offset;
 }
 
-void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
-                                             uint32_t length) {
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
   uint32_t bit_index_first = physical_address_start >> 23;
   uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
@@ -709,11 +709,12 @@ void PrimitiveConverter::MemoryWriteCallback(uint32_t physical_address_start,
     bits &= (1ull << (bit_index_last + 1)) - 1;
   }
   memory_regions_invalidated_ |= bits;
+  return std::make_pair<uint32_t, uint32_t>(0, UINT32_MAX);
 }
 
-void PrimitiveConverter::MemoryWriteCallbackThunk(
+std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallbackThunk(
     void* context_ptr, uint32_t physical_address_start, uint32_t length) {
-  reinterpret_cast<PrimitiveConverter*>(context_ptr)
+  return reinterpret_cast<PrimitiveConverter*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
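
For reference, PrimitiveConverter tracks invalidation at a very coarse granularity - one bit per 8 MB of the 512 MB physical space, so a single uint64_t covers all of it - and since it holds no GPU-written data, it returns (0, UINT32_MAX), meaning any surrounding range may safely be unwatched. A minimal standalone sketch of this side of the new callback contract (illustrative names, not from the codebase):

#include <cstdint>
#include <utility>

// 1 bit = 8 MB: 512 MB of physical memory mapped onto the 64 bits of a qword.
uint64_t memory_regions_invalidated = 0;

std::pair<uint32_t, uint32_t> OnPhysicalWrite(uint32_t start, uint32_t length) {
  uint32_t bit_first = start >> 23;  // 2^23 bytes = 8 MB per bit.
  uint32_t bit_last = (start + length - 1) >> 23;
  uint64_t bits = ~((1ull << bit_first) - 1);  // Bits at and above bit_first.
  if (bit_last < 63) {
    bits &= (1ull << (bit_last + 1)) - 1;  // Drop bits above bit_last.
  }
  memory_regions_invalidated |= bits;
  // (0, UINT32_MAX) means "unwatch whatever is convenient" - nothing cached
  // here becomes unrecoverable if extra pages are invalidated.
  return std::make_pair(uint32_t(0), UINT32_MAX);
}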

View File

@@ -89,10 +89,10 @@ class PrimitiveConverter {
       D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out);
 
   // Callback for invalidating buffers mid-frame.
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   D3D12CommandProcessor* command_processor_;
   RegisterFile* register_file_;

View File

@@ -50,7 +50,8 @@ SharedMemory::SharedMemory(D3D12CommandProcessor* command_processor,
   uint32_t page_bitmap_length = page_count_ >> 6;
   assert_true(page_bitmap_length != 0);
-  valid_pages_.resize(page_bitmap_length);
+  // Two interleaved bit arrays.
+  valid_and_gpu_written_pages_.resize(page_bitmap_length << 1);
 }
 
 SharedMemory::~SharedMemory() { Shutdown(); }
@@ -124,7 +125,8 @@ bool SharedMemory::Initialize() {
                               uint32_t(BufferDescriptorIndex::kRawUAV)),
       buffer_, kBufferSize);
 
-  std::memset(valid_pages_.data(), 0, valid_pages_.size() * sizeof(uint64_t));
+  std::memset(valid_and_gpu_written_pages_.data(), 0,
+              valid_and_gpu_written_pages_.size() * sizeof(uint64_t));
 
   upload_buffer_pool_ =
       std::make_unique<ui::d3d12::UploadBufferPool>(context, 4 * 1024 * 1024);
@@ -381,7 +383,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
     }
     uint32_t upload_buffer_pages = upload_buffer_size >> page_size_log2_;
     // No mutex holding here!
-    MakeRangeValid(upload_range_start, upload_buffer_pages);
+    MakeRangeValid(upload_range_start, upload_buffer_pages, false);
     std::memcpy(
         upload_buffer_mapping,
         memory_->TranslatePhysical(upload_range_start << page_size_log2_),
@@ -447,7 +449,7 @@ void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) {
   // Mark the range as valid (so pages are not reuploaded until modified by the
   // CPU) and watch it so the CPU can reuse it and this will be caught.
   // No mutex holding here!
-  MakeRangeValid(page_first, page_last - page_first + 1);
+  MakeRangeValid(page_first, page_last - page_first + 1, true);
 }
 
 bool SharedMemory::AreTiledResourcesUsed() const {
@@ -462,7 +464,8 @@ bool SharedMemory::AreTiledResourcesUsed() const {
 }
 
 void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
-                                  uint32_t valid_page_count) {
+                                  uint32_t valid_page_count,
+                                  bool written_by_gpu) {
   if (valid_page_first >= page_count_ || valid_page_count == 0) {
     return;
   }
@@ -482,7 +485,12 @@ void SharedMemory::MakeRangeValid(uint32_t valid_page_first,
     if (i == valid_block_last && (valid_page_last & 63) != 63) {
       valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] |= valid_bits;
+    valid_and_gpu_written_pages_[i << 1] |= valid_bits;
+    if (written_by_gpu) {
+      valid_and_gpu_written_pages_[(i << 1) + 1] |= valid_bits;
+    } else {
+      valid_and_gpu_written_pages_[(i << 1) + 1] &= ~valid_bits;
+    }
   }
 }
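
The two bit arrays are interleaved per 64-page block: qword 2*i holds the "valid" bits for pages 64*i through 64*i+63, and qword 2*i+1 holds the "written by GPU" bits for the same pages (always a subset of the valid bits), so both words of a block stay adjacent in memory. A sketch of reading this layout back (hypothetical helpers, not part of the commit):

#include <cstdint>
#include <vector>

std::vector<uint64_t> valid_and_gpu_written_pages;  // 2 qwords per 64 pages.

bool IsPageValid(uint32_t page) {
  uint64_t valid_bits = valid_and_gpu_written_pages[(page >> 6) << 1];
  return ((valid_bits >> (page & 63)) & 1) != 0;
}

bool IsPageGpuWritten(uint32_t page) {
  uint64_t gpu_bits = valid_and_gpu_written_pages[((page >> 6) << 1) + 1];
  return ((gpu_bits >> (page & 63)) & 1) != 0;
}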
@@ -527,7 +535,7 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
   uint32_t range_start = UINT32_MAX;
   for (uint32_t i = request_block_first; i <= request_block_last; ++i) {
-    uint64_t block_valid = valid_pages_[i];
+    uint64_t block_valid = valid_and_gpu_written_pages_[i << 1];
     // Consider pages in the block outside the requested range valid.
     if (i == request_block_first) {
       block_valid |= (1ull << (request_page_first & 63)) - 1;
@@ -569,25 +577,44 @@
   }
 }
 
-void SharedMemory::MemoryWriteCallbackThunk(void* context_ptr,
-                                            uint32_t physical_address_start,
-                                            uint32_t length) {
-  reinterpret_cast<SharedMemory*>(context_ptr)
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallbackThunk(
+    void* context_ptr, uint32_t physical_address_start, uint32_t length) {
+  return reinterpret_cast<SharedMemory*>(context_ptr)
       ->MemoryWriteCallback(physical_address_start, length);
 }
 
-void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
-                                       uint32_t length) {
-  if (length == 0) {
-    return;
-  }
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback(
+    uint32_t physical_address_start, uint32_t length) {
   uint32_t page_first = physical_address_start >> page_size_log2_;
   uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_;
   assert_true(page_first < page_count_ && page_last < page_count_);
   uint32_t block_first = page_first >> 6;
   uint32_t block_last = page_last >> 6;
 
   auto global_lock = global_critical_region_.Acquire();
 
+  // Check whether a somewhat wider range (up to 256 KB with 4 KB pages) can
+  // be invalidated - that is, whether there is no GPU-written data nearby
+  // that wasn't meant to be invalidated, since such data is not in sync with
+  // CPU memory and can't be reuploaded. It's a lot cheaper to upload some
+  // excess data than to catch access violations - with 4 KB callbacks, the
+  // original Doom runs at 4 FPS on an Intel Core i7-3770; with 64 KB, the
+  // CPU game code takes 3 ms per frame; with 256 KB, 0.7 ms.
+  if (page_first & 63) {
+    uint64_t gpu_written_start =
+        valid_and_gpu_written_pages_[(block_first << 1) + 1];
+    gpu_written_start &= (1ull << (page_first & 63)) - 1;
+    page_first =
+        (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
+  }
+  if ((page_last & 63) != 63) {
+    uint64_t gpu_written_end =
+        valid_and_gpu_written_pages_[(block_last << 1) + 1];
+    gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
+    page_last = (page_last & ~uint32_t(63)) +
+                (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
+  }
+
   for (uint32_t i = block_first; i <= block_last; ++i) {
     uint64_t invalidate_bits = UINT64_MAX;
     if (i == block_first) {
@@ -596,10 +623,15 @@ void SharedMemory::MemoryWriteCallback(uint32_t physical_address_start,
     if (i == block_last && (page_last & 63) != 63) {
       invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1;
     }
-    valid_pages_[i] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[i << 1] &= ~invalidate_bits;
+    valid_and_gpu_written_pages_[(i << 1) + 1] &= ~invalidate_bits;
   }
 
   FireWatches(page_first, page_last, false);
+
+  return std::make_pair<uint32_t, uint32_t>(page_first << page_size_log2_,
+                                            (page_last - page_first + 1)
+                                                << page_size_log2_);
 }
 
 void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state) {
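
The widening logic above is dense, so here is an equivalent standalone sketch, using the C++20 <bit> header in place of the codebase's xe::lzcnt/xe::tzcnt wrappers (and omitting the locking and the invalidation itself), with a worked example:

#include <bit>
#include <cstdint>

// Extends [page_first, page_last] toward the bounds of their 64-page blocks,
// stopping short of GPU-written pages, which must never be invalidated from
// the CPU side. The gpu_written_* arguments are the odd ("written by GPU")
// qwords of the first and last blocks.
void WidenInvalidation(uint32_t& page_first, uint32_t& page_last,
                       uint64_t gpu_written_first_block,
                       uint64_t gpu_written_last_block) {
  if (page_first & 63) {
    // GPU-written bits below page_first within its block.
    uint64_t below =
        gpu_written_first_block & ((1ull << (page_first & 63)) - 1);
    // None set: countl_zero is 64, extending down to the block start.
    // Otherwise stop one page above the highest GPU-written page.
    page_first = (page_first & ~uint32_t(63)) +
                 uint32_t(64 - std::countl_zero(below));
  }
  if ((page_last & 63) != 63) {
    // GPU-written bits above page_last within its block.
    uint64_t above =
        gpu_written_last_block & ~((1ull << ((page_last & 63) + 1)) - 1);
    // None set: countr_zero is 64, extending up to the block end. Otherwise
    // stop one page below the lowest GPU-written page.
    page_last =
        (page_last & ~uint32_t(63)) + uint32_t(std::countr_zero(above)) - 1;
  }
}

// Example: a CPU write hits page 70 (block 1, bit 6), page 66 was written by
// the GPU, and no higher page of the block was. The range becomes 67..127 -
// one 4 KB access violation now invalidates and unwatches up to 256 KB
// instead of a single page.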

View File

@@ -128,7 +128,8 @@ class SharedMemory {
   bool AreTiledResourcesUsed() const;
 
   // Mark the memory range as updated and protect it.
-  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count);
+  void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count,
+                      bool written_by_gpu);
 
   D3D12CommandProcessor* command_processor_;
@@ -182,14 +183,17 @@ class SharedMemory {
   // Things below should be protected by global_critical_region.
   // ***************************************************************************
-  // Bit vector containing whether physical memory system pages are up to date.
-  std::vector<uint64_t> valid_pages_;
+  // Bit vector containing:
+  // - Even block indices - whether physical memory system pages are up to
+  //   date.
+  // - Odd block indices - whether physical memory system pages contain data
+  //   written by the GPU not synchronized with the CPU (a subset of the valid
+  //   pages).
+  std::vector<uint64_t> valid_and_gpu_written_pages_;
   // Memory access callback.
-  static void MemoryWriteCallbackThunk(void* context_ptr,
-                                       uint32_t physical_address_start,
-                                       uint32_t length);
-  void MemoryWriteCallback(uint32_t physical_address_start, uint32_t length);
+  static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
+  std::pair<uint32_t, uint32_t> MemoryWriteCallback(
+      uint32_t physical_address_start, uint32_t length);
 
   struct GlobalWatch {
     GlobalWatchCallback callback;

View File

@@ -173,7 +173,7 @@ dword_result_t NtReadFile(dword_t file_handle, dword_t event_handle,
   // TODO(rick): better checking of physical address
   if (buffer.guest_address() >= 0xA0000000) {
     kernel_memory()->TriggerWatches(buffer.guest_address(), buffer_length,
-                                    true);
+                                    true, true);
   }
 
   // Synchronous.

View File

@@ -433,7 +433,7 @@ bool Memory::AccessViolationCallback(size_t host_address, bool is_write) {
       heap == &heaps_.vE0000000) {
     return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
         virtual_address / system_page_size_ * system_page_size_,
-        system_page_size_, is_write);
+        system_page_size_, is_write, false);
   }
 
   return false;
@@ -461,7 +461,8 @@ void Memory::CancelAccessWatch(uintptr_t watch_handle) {
 }
 
 bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                            bool is_write) {
+                            bool is_write, bool unwatch_exact_range,
+                            bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap == &heaps_.vA0000000 || heap == &heaps_.vC0000000 ||
       heap == &heaps_.vE0000000) {
@@ -469,8 +470,8 @@ bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length,
     // watches are removed.
     cpu::MMIOHandler::global_handler()->InvalidateRange(virtual_address,
                                                         length);
-    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(virtual_address,
-                                                            length, is_write);
+    return static_cast<PhysicalHeap*>(heap)->TriggerWatches(
+        virtual_address, length, is_write, unwatch_exact_range, unprotect);
   }
   return false;
 }
@@ -1460,7 +1461,8 @@ bool PhysicalHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
     // watches are removed.
     cpu::MMIOHandler::global_handler()->InvalidateRange(base_address,
                                                         region_size);
-    TriggerWatches(base_address, region_size, true, !FLAGS_protect_on_release);
+    TriggerWatches(base_address, region_size, true, true,
+                   !FLAGS_protect_on_release);
   }
 
   if (!parent_heap_->Release(parent_base_address, out_region_size)) {
@@ -1478,7 +1480,7 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
   // TODO(Triang3l): Remove InvalidateRange when legacy (old Vulkan renderer)
   // watches are removed.
   cpu::MMIOHandler::global_handler()->InvalidateRange(address, size);
-  TriggerWatches(address, size, true, false);
+  TriggerWatches(address, size, true, true, false);
 
   if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect,
                              old_protect)) {
@@ -1574,7 +1576,8 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
-                                  bool is_write, bool unprotect) {
+                                  bool is_write, bool unwatch_exact_range,
+                                  bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
@@ -1632,8 +1635,11 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
   }
 
   // Trigger callbacks.
-  // TODO(Triang3l): Accumulate the range that is safe to unwatch from the
-  // callbacks.
+  if (!unprotect) {
+    // If not doing anything with protection, there's no point in unwatching
+    // excess pages.
+    unwatch_exact_range = true;
+  }
   uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
   uint32_t physical_address_start =
       xe::sat_sub(system_page_first * system_page_size_,
@@ -1644,9 +1650,48 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
                   system_address_offset) +
           physical_address_offset - physical_address_start,
       heap_size_ + 1 - (physical_address_start - physical_address_offset));
+  uint32_t unwatch_first = 0;
+  uint32_t unwatch_last = UINT32_MAX;
   for (auto physical_write_watch : memory_->physical_write_watches_) {
-    physical_write_watch->callback(physical_write_watch->callback_context,
-                                   physical_address_start, physical_length);
+    std::pair<uint32_t, uint32_t> callback_unwatch_range =
+        physical_write_watch->callback(physical_write_watch->callback_context,
+                                       physical_address_start,
+                                       physical_length);
+    if (!unwatch_exact_range) {
+      unwatch_first = std::max(unwatch_first, callback_unwatch_range.first);
+      unwatch_last = std::min(
+          unwatch_last,
+          xe::sat_add(
+              callback_unwatch_range.first,
+              std::max(callback_unwatch_range.second, uint32_t(1)) - 1));
+    }
+  }
+  if (!unwatch_exact_range) {
+    // Always unwatch at least the requested pages.
+    unwatch_first = std::min(unwatch_first, physical_address_start);
+    unwatch_last =
+        std::max(unwatch_last, physical_address_start + physical_length - 1);
+    // Don't unprotect too much if not caring much about the region (limit to
+    // 4 MB - somewhat arbitrary, but at most 1024 iterations of the page
+    // loop).
+    const uint32_t kMaxUnwatchExcess = 4 * 1024 * 1024;
+    unwatch_first = std::max(
+        unwatch_first, physical_address_start & ~(kMaxUnwatchExcess - 1));
+    unwatch_last =
+        std::min(unwatch_last, (physical_address_start + physical_length - 1) |
+                                   (kMaxUnwatchExcess - 1));
+    // Convert to heap-relative addresses.
+    unwatch_first = xe::sat_sub(unwatch_first, physical_address_offset);
+    unwatch_last = xe::sat_sub(unwatch_last, physical_address_offset);
+    // Clamp to the heap upper bound.
+    unwatch_first = std::min(unwatch_first, heap_size_);
+    unwatch_last = std::min(unwatch_last, heap_size_);
+    // Convert to system pages and update the range.
+    unwatch_first += system_address_offset;
+    unwatch_last += system_address_offset;
+    assert_true(unwatch_first <= unwatch_last);
+    system_page_first = unwatch_first / system_page_size_;
+    system_page_last = unwatch_last / system_page_size_;
+    block_index_first = system_page_first >> 6;
+    block_index_last = system_page_last >> 6;
+  }
 
   // Unprotect ranges that need unprotection.
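
In effect, TriggerWatches now intersects the safe-to-unwatch ranges reported by all callbacks (each callback can only shrink the result), then guarantees that at least the written range itself is covered and caps the excess at 4 MB. A condensed sketch of that policy (standalone, with illustrative names; the saturating arithmetic of the real code is omitted):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Range = std::pair<uint32_t, uint32_t>;  // (start, length) in bytes.

Range ComputeUnwatchRange(Range written, const std::vector<Range>& allowed) {
  // Intersect what every watcher permits, as inclusive bounds.
  uint32_t first = 0, last = UINT32_MAX;
  for (const Range& range : allowed) {
    first = std::max(first, range.first);
    last = std::min(last, range.first + std::max(range.second, 1u) - 1);
  }
  uint32_t written_last = written.first + written.second - 1;
  // Always unwatch at least the pages that were actually written.
  first = std::min(first, written.first);
  last = std::max(last, written_last);
  // Cap the excess at 4 MB around the write - at most 1024 iterations of the
  // later per-page loop.
  const uint32_t kMaxExcess = 4 * 1024 * 1024;
  first = std::max(first, written.first & ~(kMaxExcess - 1));
  last = std::min(last, written_last | (kMaxExcess - 1));
  return Range(first, last - first + 1);
}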

View File

@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "xenia/base/memory.h"
@@ -222,7 +223,7 @@ class PhysicalHeap : public BaseHeap {
   void WatchPhysicalWrite(uint32_t physical_address, uint32_t length);
   // Returns true if any page in the range was watched.
   bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
-                      bool unprotect = true);
+                      bool unwatch_exact_range, bool unprotect = true);
 
  protected:
   VirtualHeap* parent_heap_;
@@ -333,9 +334,11 @@ class Memory {
   // Cancels a write watch requested with AddPhysicalAccessWatch.
   void CancelAccessWatch(uintptr_t watch_handle);
 
-  typedef void (*PhysicalWriteWatchCallback)(void* context_ptr,
-                                             uint32_t physical_address_start,
-                                             uint32_t length);
+  // Returns the start and length of the smallest physical memory region
+  // surrounding the watched region that can be safely unwatched. If that
+  // doesn't matter, return (0, UINT32_MAX).
+  typedef std::pair<uint32_t, uint32_t> (*PhysicalWriteWatchCallback)(
+      void* context_ptr, uint32_t physical_address_start, uint32_t length);
 
   // Physical memory write watching, allowing subsystems to invalidate cached
   // data that depends on memory contents.
@@ -366,12 +369,6 @@ class Memory {
   // same pages, and watches must not be placed on read-only or totally
   // inaccessible pages, there are significant difficulties with synchronizing
   // all the three ranges, but it's generally not needed.
-  //
-  // TODO(Triang3l): Allow the callbacks to unwatch regions larger than one page
-  // (for instance, 64 KB) so there are less access violations. All callbacks
-  // must agree to unwatch larger ranges because in some cases (like regions
-  // near the locations that render targets have been resolved to) it is
-  // necessary to invalidate only a single page and none more.
   void* RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback,
                                    void* callback_context);
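
Under the new contract, a subsystem registers a callback that both invalidates its own caches and reports how far unwatching may safely extend. A hypothetical watcher against this interface (the class and its members are illustrative, not from the codebase):

#include <cstdint>
#include <utility>

class MyCache {
 public:
  explicit MyCache(Memory* memory)
      : watch_handle_(
            memory->RegisterPhysicalWriteWatch(WriteCallbackThunk, this)) {}

 private:
  static std::pair<uint32_t, uint32_t> WriteCallbackThunk(
      void* context_ptr, uint32_t physical_address_start, uint32_t length) {
    return reinterpret_cast<MyCache*>(context_ptr)
        ->OnPhysicalWrite(physical_address_start, length);
  }

  std::pair<uint32_t, uint32_t> OnPhysicalWrite(uint32_t start,
                                                uint32_t length) {
    // Invalidate entries overlapping [start, start + length) here...
    // ...then allow the enclosing 64 KB granules to be unwatched. A cache
    // that kept unsynchronized GPU-written data nearby would have to return
    // a tighter range instead.
    uint32_t unwatch_start = start & ~uint32_t(0xFFFF);
    uint32_t unwatch_end = (start + length - 1) | 0xFFFF;
    return std::make_pair(unwatch_start, unwatch_end - unwatch_start + 1);
  }

  void* watch_handle_;
};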
@@ -391,7 +388,8 @@ class Memory {
   // Forces triggering of watch callbacks for a virtual address range if pages
   // are watched there and unwatching them. Returns whether any page was
   // watched.
-  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write);
+  bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write,
+                      bool unwatch_exact_range, bool unprotect = true);
 
   // Allocates virtual memory from the 'system' heap.
   // System memory is kept separate from game memory but is still accessible