[Memory] Pass exact_range to watch callbacks

This commit is contained in:
Triang3l 2019-08-03 19:16:04 +03:00
parent a0c92e30ce
commit 0370f8bbd9
6 changed files with 40 additions and 32 deletions

View File

@ -700,7 +700,7 @@ void* PrimitiveConverter::AllocateIndices(
}
std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback(
uint32_t physical_address_start, uint32_t length) {
uint32_t physical_address_start, uint32_t length, bool exact_range) {
// 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size.
uint32_t bit_index_first = physical_address_start >> 23;
uint32_t bit_index_last = (physical_address_start + length - 1) >> 23;
@ -713,9 +713,10 @@ std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallback(
}
std::pair<uint32_t, uint32_t> PrimitiveConverter::MemoryWriteCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length) {
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range) {
return reinterpret_cast<PrimitiveConverter*>(context_ptr)
->MemoryWriteCallback(physical_address_start, length);
->MemoryWriteCallback(physical_address_start, length, exact_range);
}
D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer(

View File

@ -90,9 +90,10 @@ class PrimitiveConverter {
// Callback for invalidating buffers mid-frame.
std::pair<uint32_t, uint32_t> MemoryWriteCallback(
uint32_t physical_address_start, uint32_t length);
uint32_t physical_address_start, uint32_t length, bool exact_range);
static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length);
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
D3D12CommandProcessor* command_processor_;
RegisterFile* register_file_;

View File

@ -578,13 +578,14 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
}
std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length) {
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range) {
return reinterpret_cast<SharedMemory*>(context_ptr)
->MemoryWriteCallback(physical_address_start, length);
->MemoryWriteCallback(physical_address_start, length, exact_range);
}
std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback(
uint32_t physical_address_start, uint32_t length) {
uint32_t physical_address_start, uint32_t length, bool exact_range) {
uint32_t page_first = physical_address_start >> page_size_log2_;
uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_;
assert_true(page_first < page_count_ && page_last < page_count_);
@ -593,26 +594,28 @@ std::pair<uint32_t, uint32_t> SharedMemory::MemoryWriteCallback(
auto global_lock = global_critical_region_.Acquire();
// Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
// invalidated. Widening stops at nearby GPU-written data, which must not be
// invalidated because it's not in sync with CPU memory and can't be
// reuploaded.
// It's a lot cheaper to upload some excess data than to catch access
// violations - with 4 KB callbacks, the original Doom runs at 4 FPS on
// Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
// frame, but with 256 KB it's 0.7 ms.
if (page_first & 63) {
uint64_t gpu_written_start =
valid_and_gpu_written_pages_[(block_first << 1) + 1];
gpu_written_start &= (1ull << (page_first & 63)) - 1;
page_first =
(page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
}
if ((page_last & 63) != 63) {
uint64_t gpu_written_end =
valid_and_gpu_written_pages_[(block_last << 1) + 1];
gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
page_last = (page_last & ~uint32_t(63)) +
(std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
if (!exact_range) {
// Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
// invalidated. Widening stops at nearby GPU-written data, which must not be
// invalidated because it's not in sync with CPU memory and can't be
// reuploaded. It's a lot cheaper to upload some excess data than to catch
// access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
// on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
// frame, but with 256 KB it's 0.7 ms.
if (page_first & 63) {
uint64_t gpu_written_start =
valid_and_gpu_written_pages_[(block_first << 1) + 1];
gpu_written_start &= (1ull << (page_first & 63)) - 1;
page_first =
(page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
}
if ((page_last & 63) != 63) {
uint64_t gpu_written_end =
valid_and_gpu_written_pages_[(block_last << 1) + 1];
gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
page_last = (page_last & ~uint32_t(63)) +
(std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
}
}
for (uint32_t i = block_first; i <= block_last; ++i) {

View File

@ -191,9 +191,10 @@ class SharedMemory {
// Memory access callback.
static std::pair<uint32_t, uint32_t> MemoryWriteCallbackThunk(
void* context_ptr, uint32_t physical_address_start, uint32_t length);
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
std::pair<uint32_t, uint32_t> MemoryWriteCallback(
uint32_t physical_address_start, uint32_t length);
uint32_t physical_address_start, uint32_t length, bool exact_range);
struct GlobalWatch {
GlobalWatchCallback callback;

View File

@ -1643,7 +1643,8 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length,
for (auto physical_write_watch : memory_->physical_write_watches_) {
std::pair<uint32_t, uint32_t> callback_unwatch_range =
physical_write_watch->callback(physical_write_watch->callback_context,
physical_address_start, physical_length);
physical_address_start, physical_length,
unwatch_exact_range);
if (!unwatch_exact_range) {
unwatch_first = std::max(unwatch_first, callback_unwatch_range.first);
unwatch_last = std::min(

View File

@ -338,7 +338,8 @@ class Memory {
// the watched region that can be safely unwatched. If it doesn't matter,
// return (0, UINT32_MAX).
typedef std::pair<uint32_t, uint32_t> (*PhysicalWriteWatchCallback)(
void* context_ptr, uint32_t physical_address_start, uint32_t length);
void* context_ptr, uint32_t physical_address_start, uint32_t length,
bool exact_range);
// Physical memory write watching, allowing subsystems to invalidate cached
// data that depends on memory contents.