From 395299711689aa62522ea7ffdafceeddeeb21958 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Thu, 15 Dec 2022 08:35:36 -0800 Subject: [PATCH] Fix issue introduced yesterday where the final fetch constant would never be marked as written Reorganized SystemPageFlags for sharedmemory, each field now goes into its own array, the three arrays are page aligned in a single virtual allocation Refactored sharedmemory a bit, use tzcnt if available when finding ranges (faster on pre-zen4 amd cpus) --- .../gpu/d3d12/d3d12_command_processor.cc | 2 +- src/xenia/gpu/shared_memory.cc | 309 +++++++++++------- src/xenia/gpu/shared_memory.h | 20 +- src/xenia/gpu/texture_cache.h | 2 +- 4 files changed, 217 insertions(+), 116 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 3d31e50de..61a5b88bf 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2063,7 +2063,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( #define DO_A_RANGE(start_range, end_range, cb) \ if constexpr (start_range >= register_lower_bound || \ - end_range > register_lower_bound) { \ + end_range > register_lower_bound) { \ if (current_index < (end_range)) { \ uint32_t ntowrite = get_end_before_qty(end_range); \ cb((start_range), (end_range), current_index, base, ntowrite); \ diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index b891b5f38..a94aa053b 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -26,9 +26,28 @@ SharedMemory::SharedMemory(Memory& memory) : memory_(memory) { SharedMemory::~SharedMemory() { ShutdownCommon(); } void SharedMemory::InitializeCommon() { - system_page_flags_.clear(); - system_page_flags_.resize(((kBufferSize >> page_size_log2_) + 63) / 64); + size_t num_system_page_flags_entries = + ((kBufferSize >> page_size_log2_) + 63) / 64; + num_system_page_flags_ = static_cast(num_system_page_flags_entries); + // in total on windows the page flags take up 2048 entries per fields, with 3 + // fields and 8 bytes per entry thats 49152 bytes. having page alignment for + // them is probably beneficial, we do waste 16384 bytes with this alloc though + + uint64_t* system_page_flags_base = (uint64_t*)memory::AllocFixed( + nullptr, num_system_page_flags_ * 3 * sizeof(uint64_t), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + + system_page_flags_valid_ = system_page_flags_base, + system_page_flags_valid_and_gpu_resolved_ = + system_page_flags_base + (num_system_page_flags_), + system_page_flags_valid_and_gpu_written_ = + system_page_flags_base + (num_system_page_flags_ * 2); + memset(system_page_flags_valid_, 0, 8 * num_system_page_flags_entries); + memset(system_page_flags_valid_and_gpu_resolved_, 0, + 8 * num_system_page_flags_entries); + memset(system_page_flags_valid_and_gpu_written_, 0, + 8 * num_system_page_flags_entries); memory_invalidation_callback_handle_ = memory_.RegisterPhysicalMemoryInvalidationCallback( MemoryInvalidationCallbackThunk, this); @@ -81,6 +100,12 @@ void SharedMemory::ShutdownCommon() { host_gpu_memory_sparse_allocated_.clear(); host_gpu_memory_sparse_allocated_.shrink_to_fit(); host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX; + memory::DeallocFixed(system_page_flags_valid_, 0, + memory::DeallocationType::kRelease); + system_page_flags_valid_ = nullptr; + system_page_flags_valid_and_gpu_resolved_ = nullptr; + system_page_flags_valid_and_gpu_written_ = nullptr; + num_system_page_flags_ = 0; } void SharedMemory::ClearCache() { @@ -105,8 +130,9 @@ void SharedMemory::ClearCache() { void SharedMemory::SetSystemPageBlocksValidWithGpuDataWritten() { auto global_lock = global_critical_region_.Acquire(); - for (SystemPageFlagsBlock& block : system_page_flags_) { - block.valid = block.valid_and_gpu_written; + + for (unsigned i = 0; i < num_system_page_flags_; ++i) { + system_page_flags_valid_[i] = system_page_flags_valid_and_gpu_written_[i]; } } @@ -150,8 +176,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2; uint32_t bucket_last = watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2; - //chrispy: Not required the global lock is always held by the caller - // auto global_lock = global_critical_region_.Acquire(); + // chrispy: Not required the global lock is always held by the caller + // auto global_lock = global_critical_region_.Acquire(); // Allocate the range. WatchRange* range = watch_range_first_free_; @@ -308,17 +334,17 @@ void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, if (i == valid_block_last && (valid_page_last & 63) != 63) { valid_bits &= (uint64_t(1) << ((valid_page_last & 63) + 1)) - 1; } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid |= valid_bits; + // SystemPageFlagsBlock& block = system_page_flags_[i]; + system_page_flags_valid_[i] |= valid_bits; if (written_by_gpu) { - block.valid_and_gpu_written |= valid_bits; + system_page_flags_valid_and_gpu_written_[i] |= valid_bits; } else { - block.valid_and_gpu_written &= ~valid_bits; + system_page_flags_valid_and_gpu_written_[i] &= ~valid_bits; } if (written_by_gpu_resolve) { - block.valid_and_gpu_resolved |= valid_bits; + system_page_flags_valid_and_gpu_resolved_[i] |= valid_bits; } else { - block.valid_and_gpu_resolved &= ~valid_bits; + system_page_flags_valid_and_gpu_resolved_[i] &= ~valid_bits; } } } @@ -384,64 +410,15 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, bool any_data_resolved = false; uint32_t block_first = page_first >> 6; - swcache::PrefetchL1(&system_page_flags_[block_first]); + // swcache::PrefetchL1(&system_page_flags_[block_first]); uint32_t block_last = page_last >> 6; uint32_t range_start = UINT32_MAX; { auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = block_first; i <= block_last; ++i) { - const SystemPageFlagsBlock& block = system_page_flags_[i]; - uint64_t block_valid = block.valid; - uint64_t block_resolved = block.valid_and_gpu_resolved; - // Consider pages in the block outside the requested range valid. - if (i == block_first) { - uint64_t block_before = (uint64_t(1) << (page_first & 63)) - 1; - block_valid |= block_before; - block_resolved &= ~block_before; - } - if (i == block_last && (page_last & 63) != 63) { - uint64_t block_inside = (uint64_t(1) << ((page_last & 63) + 1)) - 1; - block_valid |= ~block_inside; - block_resolved &= block_inside; - } - if (block_resolved) { - any_data_resolved = true; - } - - while (true) { - uint32_t block_page; - if (range_start == UINT32_MAX) { - // Check if need to open a new range. - if (!xe::bit_scan_forward(~block_valid, &block_page)) { - break; - } - range_start = (i << 6) + block_page; - } else { - // Check if need to close the range. - // Ignore the valid pages before the beginning of the range. - uint64_t block_valid_from_start = block_valid; - if (i == (range_start >> 6)) { - block_valid_from_start &= - ~((uint64_t(1) << (range_start & 63)) - 1); - } - if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) { - break; - } - if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) { - xe::FatalError( - "Hit max upload ranges in shared_memory.cc, tell a dev to " - "raise the limit!"); - } - uploads[current_upload_range++] = - std::make_pair(range_start, (i << 6) + block_page - range_start); - // In the next iteration within this block, consider this range valid - // since it has been queued for upload. - block_valid |= (uint64_t(1) << block_page) - 1; - range_start = UINT32_MAX; - } - } - } + TryFindUploadRange(block_first, block_last, page_first, page_last, + any_data_resolved, range_start, current_upload_range, + uploads); } if (range_start != UINT32_MAX) { uploads[current_upload_range++] = @@ -457,6 +434,110 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, return UploadRanges(uploads, current_upload_range); } +template +XE_FORCEINLINE XE_NOALIAS static T mod_shift_left(T value, uint32_t by) { +#if XE_ARCH_AMD64 == 1 + // arch has modular shifts + return value << by; +#else + return value << (by % (sizeof(T) * CHAR_BIT)); +#endif +} +void SharedMemory::TryFindUploadRange(const uint32_t& block_first, + const uint32_t& block_last, + const uint32_t& page_first, + const uint32_t& page_last, + bool& any_data_resolved, + uint32_t& range_start, + unsigned int& current_upload_range, + std::pair* uploads) { + for (uint32_t i = block_first; i <= block_last; ++i) { + // const SystemPageFlagsBlock& block = system_page_flags_[i]; + uint64_t block_valid = system_page_flags_valid_[i]; + uint64_t block_resolved = 0; + + if (any_data_resolved) { + block_resolved = 0; + } else { + block_resolved = system_page_flags_valid_and_gpu_resolved_[i]; + } + if (i == block_first) { + uint64_t block_before = mod_shift_left(uint64_t(1), page_first) - 1; + block_valid |= block_before; + block_resolved &= ~block_before; + } + if (i == block_last && (page_last & 63) != 63) { + uint64_t block_inside = mod_shift_left(uint64_t(1), page_last + 1) - 1; + block_valid |= ~block_inside; + block_resolved &= block_inside; + } + // Consider pages in the block outside the requested range valid. + if (!block_resolved) { + } else { + any_data_resolved = true; + } + TryGetNextUploadRange(range_start, block_valid, i, current_upload_range, + uploads); + } +} + +static bool UploadRange_DoBestScanForward(uint64_t v, uint32_t* out) { +#if XE_ARCH_AMD64 == 1 + if (!v) { + return false; + } + if (amd64::GetFeatureFlags() & amd64::kX64EmitBMI1) { + *out = static_cast(_tzcnt_u64(v)); + } else { + unsigned char bsfres = _BitScanForward64((unsigned long*)out, v); + + XE_MSVC_ASSUME(bsfres == 1); + } + return true; +#else + return xe::bit_scan_forward(v, out); +#endif +} + +void SharedMemory::TryGetNextUploadRange( + uint32_t& range_start, uint64_t& block_valid, const uint32_t& i, + unsigned int& current_upload_range, + std::pair* uploads) { + while (true) { + uint32_t block_page = 0; + if (range_start == UINT32_MAX) { + // Check if need to open a new range. + if (!UploadRange_DoBestScanForward(~block_valid, &block_page)) { + break; + } + range_start = (i << 6) + block_page; + } else { + // Check if need to close the range. + // Ignore the valid pages before the beginning of the range. + uint64_t block_valid_from_start = block_valid; + if (i == (range_start >> 6)) { + block_valid_from_start &= + ~(mod_shift_left(uint64_t(1), range_start) - 1); + } + if (!UploadRange_DoBestScanForward(block_valid_from_start, &block_page)) { + break; + } + if (current_upload_range + 1 < MAX_UPLOAD_RANGES) { + uploads[current_upload_range++] = + std::make_pair(range_start, (i << 6) + block_page - range_start); + // In the next iteration within this block, consider this range valid + // since it has been queued for upload. + block_valid |= (uint64_t(1) << block_page) - 1; + range_start = UINT32_MAX; + } else { + xe::FatalError( + "Hit max upload ranges in shared_memory.cc, tell a dev to " + "raise the limit!"); + } + } + } +} + std::pair SharedMemory::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { @@ -490,14 +571,14 @@ std::pair SharedMemory::MemoryInvalidationCallback( // 0.7 ms. if (page_first & 63) { uint64_t gpu_written_start = - system_page_flags_[block_first].valid_and_gpu_written; + system_page_flags_valid_and_gpu_written_[block_first]; gpu_written_start &= (uint64_t(1) << (page_first & 63)) - 1; page_first = (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); } if ((page_last & 63) != 63) { uint64_t gpu_written_end = - system_page_flags_[block_last].valid_and_gpu_written; + system_page_flags_valid_and_gpu_written_[block_last]; gpu_written_end &= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1); page_last = (page_last & ~uint32_t(63)) + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); @@ -512,10 +593,9 @@ std::pair SharedMemory::MemoryInvalidationCallback( if (i == block_last && (page_last & 63) != 63) { invalidate_bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1; } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid &= ~invalidate_bits; - block.valid_and_gpu_written &= ~invalidate_bits; - block.valid_and_gpu_resolved &= ~invalidate_bits; + system_page_flags_valid_[i] &= ~invalidate_bits; + system_page_flags_valid_and_gpu_resolved_[i] &= ~invalidate_bits; + system_page_flags_valid_and_gpu_written_[i] &= ~invalidate_bits; } FireWatches(page_first, page_last, false); @@ -536,11 +616,11 @@ void SharedMemory::PrepareForTraceDownload() { uint32_t fire_watches_range_start = UINT32_MAX; uint32_t gpu_written_range_start = UINT32_MAX; auto global_lock = global_critical_region_.Acquire(); - for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { - SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; - uint64_t previously_valid_block = page_flags_block.valid; - uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; - page_flags_block.valid = gpu_written_block; + for (uint32_t i = 0; i < num_system_page_flags_; ++i) { + // SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; + uint64_t previously_valid_block = system_page_flags_valid_[i]; + uint64_t gpu_written_block = system_page_flags_valid_and_gpu_written_[i]; + system_page_flags_valid_[i] = gpu_written_block; // Fire watches on the invalidated pages. uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; @@ -627,45 +707,48 @@ void SharedMemory::ReleaseTraceDownloadRanges() { bool SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) { - if (host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX) { - return true; - } - if (!length) { - return true; - } - if (start > kBufferSize || (kBufferSize - start) < length) { - return false; - } - uint32_t page_first = start >> page_size_log2_; - uint32_t page_last = (start + length - 1) >> page_size_log2_; - uint32_t allocation_first = - page_first << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_; - uint32_t allocation_last = - page_last << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_; - while (true) { - std::pair allocation_range = xe::bit_range::NextUnsetRange( - host_gpu_memory_sparse_allocated_.data(), allocation_first, - allocation_last - allocation_first + 1); - if (!allocation_range.second) { - break; - } - if (!AllocateSparseHostGpuMemoryRange(uint32_t(allocation_range.first), - uint32_t(allocation_range.second))) { + if (host_gpu_memory_sparse_granularity_log2_ != UINT32_MAX && length) { + if (start <= kBufferSize && (kBufferSize - start) >= length) { + uint32_t page_first = start >> page_size_log2_; + uint32_t page_last = (start + length - 1) >> page_size_log2_; + uint32_t allocation_first = page_first << page_size_log2_ >> + host_gpu_memory_sparse_granularity_log2_; + uint32_t allocation_last = page_last << page_size_log2_ >> + host_gpu_memory_sparse_granularity_log2_; + while (true) { + std::pair allocation_range = + xe::bit_range::NextUnsetRange( + host_gpu_memory_sparse_allocated_.data(), allocation_first, + allocation_last - allocation_first + 1); + if (!allocation_range.second) { + break; + } + if (!AllocateSparseHostGpuMemoryRange( + uint32_t(allocation_range.first), + uint32_t(allocation_range.second))) { + return false; + } + xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(), + allocation_range.first, + allocation_range.second); + ++host_gpu_memory_sparse_allocations_; + COUNT_profile_set( + "gpu/shared_memory/host_gpu_memory_sparse_allocations", + host_gpu_memory_sparse_allocations_); + host_gpu_memory_sparse_used_bytes_ += + uint32_t(allocation_range.second) + << host_gpu_memory_sparse_granularity_log2_; + COUNT_profile_set( + "gpu/shared_memory/host_gpu_memory_sparse_used_mb", + (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20); + allocation_first = + uint32_t(allocation_range.first + allocation_range.second); + } + } else { return false; } - xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(), - allocation_range.first, allocation_range.second); - ++host_gpu_memory_sparse_allocations_; - COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations", - host_gpu_memory_sparse_allocations_); - host_gpu_memory_sparse_used_bytes_ += - uint32_t(allocation_range.second) - << host_gpu_memory_sparse_granularity_log2_; - COUNT_profile_set( - "gpu/shared_memory/host_gpu_memory_sparse_used_mb", - (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20); - allocation_first = - uint32_t(allocation_range.first + allocation_range.second); + } else { + return true; } return true; } diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index e721fe8af..7100d4df1 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -74,6 +74,18 @@ class SharedMemory { bool RequestRange(uint32_t start, uint32_t length, bool* any_data_resolved_out = nullptr); + void TryFindUploadRange(const uint32_t& block_first, + const uint32_t& block_last, + const uint32_t& page_first, const uint32_t& page_last, + bool& any_data_resolved, uint32_t& range_start, + unsigned int& current_upload_range, + std::pair* uploads); + + void TryGetNextUploadRange(uint32_t& range_start, uint64_t& block_valid, + const uint32_t& i, + unsigned int& current_upload_range, + std::pair* uploads); + // Marks the range and, if not exact_range, potentially its surroundings // (to up to the first GPU-written page, as an access violation exception // count optimization) as modified by the CPU, also invalidating GPU-written @@ -196,10 +208,16 @@ class SharedMemory { // contains data written specifically by resolving from EDRAM. uint64_t valid_and_gpu_resolved; }; + + //chrispy: todo, systempageflagsblock should be 3 different arrays // Flags for each 64 system pages, interleaved as blocks, so bit scan can be // used to quickly extract ranges. - std::vector system_page_flags_; + // std::vector system_page_flags_; + uint64_t *system_page_flags_valid_ = nullptr, + *system_page_flags_valid_and_gpu_written_ = nullptr, + *system_page_flags_valid_and_gpu_resolved_ = nullptr; + unsigned num_system_page_flags_ = 0; static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index 717273275..075b80111 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -108,7 +108,7 @@ class TextureCache { // generate a mask of all bits from before the first index, and xor it with // all bits before the last index this produces a mask covering only the // bits between first and last - uint32_t res = ((1U << first_index) - 1) ^ ((1U << (last_index + 1)) - 1); + uint32_t res = ((1U << first_index) - 1) ^ static_cast((1ULL << (last_index + 1)) - 1ULL); // todo: check that this is right texture_bindings_in_sync_ &= ~res;