From 19121130a3a2e9d081cd53ebb00a2a5d7d606e81 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Tue, 6 Oct 2020 21:32:44 +0300
Subject: [PATCH] [GPU] SharedMemory: common sparse memory allocation

---
 src/xenia/base/bit_range.h                 | 106 ++++++++++++++++
 src/xenia/gpu/d3d12/d3d12_shared_memory.cc | 137 +++++++++------------
 src/xenia/gpu/d3d12/d3d12_shared_memory.h  |  15 +--
 src/xenia/gpu/shared_memory.cc             | 101 +++++++++++++--
 src/xenia/gpu/shared_memory.h              |  41 +++---
 5 files changed, 287 insertions(+), 113 deletions(-)
 create mode 100644 src/xenia/base/bit_range.h

diff --git a/src/xenia/base/bit_range.h b/src/xenia/base/bit_range.h
new file mode 100644
index 000000000..462d5e2cd
--- /dev/null
+++ b/src/xenia/base/bit_range.h
@@ -0,0 +1,106 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2019 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_BASE_BIT_RANGE_H_
+#define XENIA_BASE_BIT_RANGE_H_
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
+#include "xenia/base/math.h"
+
+namespace xe {
+namespace bit_range {
+
+// Provided length is in bits since the first. Returns the {start, length} of
+// the range in bits, with length == 0 if not found.
+template <typename Block>
+std::pair<size_t, size_t> NextUnsetRange(const Block* bits, size_t first,
+                                         size_t length) {
+  if (!length) {
+    return std::make_pair(size_t(first), size_t(0));
+  }
+  size_t last = first + length - 1;
+  const size_t block_bits = sizeof(Block) * CHAR_BIT;
+  size_t block_first = first / block_bits;
+  size_t block_last = last / block_bits;
+  size_t range_start = SIZE_MAX;
+  for (size_t i = block_first; i <= block_last; ++i) {
+    Block block = bits[i];
+    // Ignore bits in the block outside the specified range by considering
+    // them set.
+    if (i == block_first) {
+      block |= (Block(1) << (first & (block_bits - 1))) - 1;
+    }
+    if (i == block_last && (last & (block_bits - 1)) != block_bits - 1) {
+      block |= ~((Block(1) << ((last & (block_bits - 1)) + 1)) - 1);
+    }
+    while (true) {
+      uint32_t block_bit;
+      if (range_start == SIZE_MAX) {
+        // Check if need to open a new range.
+        if (!xe::bit_scan_forward(~block, &block_bit)) {
+          break;
+        }
+        range_start = i * block_bits + block_bit;
+      } else {
+        // Check if need to close the range.
+        // Ignore the set bits before the beginning of the range.
+        Block block_bits_set_from_start = block;
+        if (i == range_start / block_bits) {
+          block_bits_set_from_start &=
+              ~((Block(1) << (range_start & (block_bits - 1))) - 1);
+        }
+        if (!xe::bit_scan_forward(block_bits_set_from_start, &block_bit)) {
+          break;
+        }
+        return std::make_pair(range_start,
+                              (i * block_bits) + block_bit - range_start);
+      }
+    }
+  }
+  if (range_start != SIZE_MAX) {
+    return std::make_pair(range_start, last + size_t(1) - range_start);
+  }
+  return std::make_pair(first + length, size_t(0));
+}
+
+template <typename Block>
+void SetRange(Block* bits, size_t first, size_t length) {
+  if (!length) {
+    return;
+  }
+  size_t last = first + length - 1;
+  const size_t block_bits = sizeof(Block) * CHAR_BIT;
+  size_t block_first = first / block_bits;
+  size_t block_last = last / block_bits;
+  Block set_first = ~((Block(1) << (first & (block_bits - 1))) - 1);
+  Block set_last = ~Block(0);
+  if ((last & (block_bits - 1)) != (block_bits - 1)) {
+    set_last &= (Block(1) << ((last & (block_bits - 1)) + 1)) - 1;
+  }
+  if (block_first == block_last) {
+    bits[block_first] |= set_first & set_last;
+    return;
+  }
+  bits[block_first] |= set_first;
+  if (block_first + 1 < block_last) {
+    std::memset(bits + block_first + 1, UCHAR_MAX,
+                (block_last - (block_first + 1)) * sizeof(Block));
+  }
+  bits[block_last] |= set_last;
+}
+
+}  // namespace bit_range
+}  // namespace xe
+
+#endif  // XENIA_BASE_BIT_RANGE_H_
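For reference, a minimal standalone usage sketch of the new xe::bit_range helpers (illustration only, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <utility>

#include "xenia/base/bit_range.h"

int main() {
  // 128-bit bitmap stored as two 64-bit blocks, with bits 0..2 already set.
  uint64_t bits[2] = {0b111, 0};
  // Search bits 0..127 for the first contiguous unset range.
  std::pair<size_t, size_t> range = xe::bit_range::NextUnsetRange(bits, 0, 128);
  // Prints "first unset range: offset 3, length 125".
  std::printf("first unset range: offset %zu, length %zu\n", range.first,
              range.second);
  // Mark bits 3..66 as set, crossing the 64-bit block boundary.
  xe::bit_range::SetRange(bits, 3, 64);
  range = xe::bit_range::NextUnsetRange(bits, 0, 128);
  // Prints "first unset range: offset 67, length 61".
  std::printf("first unset range: offset %zu, length %zu\n", range.first,
              range.second);
  return 0;
}

NextUnsetRange scans only the bits [first, first + length) and treats everything outside that window as set; SetRange handles ranges spanning multiple blocks, filling whole middle blocks with memset.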
diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
index 2c74c4da8..992f9aed5 100644
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
@@ -17,7 +17,6 @@
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
-#include "xenia/base/profiling.h"
 #include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include "xenia/ui/d3d12/d3d12_util.h"
@@ -43,26 +42,35 @@ D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); }
 
 bool D3D12SharedMemory::Initialize() {
   InitializeCommon();
-  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-  auto device = provider.GetDevice();
+  const ui::d3d12::D3D12Provider& provider =
+      command_processor_.GetD3D12Context().GetD3D12Provider();
+  ID3D12Device* device = provider.GetDevice();
   D3D12_RESOURCE_DESC buffer_desc;
   ui::d3d12::util::FillBufferResourceDesc(
       buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
   buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
-  if (AreTiledResourcesUsed()) {
+  if (cvars::d3d12_tiled_shared_memory &&
+      provider.GetTiledResourcesTier() !=
+          D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
+      !provider.GetGraphicsAnalysis()) {
     if (FAILED(device->CreateReservedResource(
             &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
-      XELOGE("Shared memory: Failed to create the 512 MB tiled buffer");
+      XELOGE("Shared memory: Failed to create the {} MB tiled buffer",
+             kBufferSize >> 20);
       Shutdown();
      return false;
     }
+    static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
+    InitializeSparseHostGpuMemory(
+        std::max(kHostGpuMemoryOptimalSparseAllocationLog2, uint32_t(16)));
   } else {
     XELOGGPU(
         "Direct3D 12 tiled resources are not used for shared memory "
         "emulation - video memory usage may increase significantly "
-        "because a full 512 MB buffer will be created!");
-    if (provider.GetGraphicsAnalysis() != nullptr) {
+        "because a full {} MB buffer will be created!",
+        kBufferSize >> 20);
+    if (provider.GetGraphicsAnalysis()) {
       // As of October 8th, 2018, PIX doesn't support tiled buffers.
       // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
       XELOGGPU(
@@ -73,7 +81,8 @@ bool D3D12SharedMemory::Initialize() {
             &ui::d3d12::util::kHeapPropertiesDefault,
             provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
             nullptr, IID_PPV_ARGS(&buffer_)))) {
-      XELOGE("Shared memory: Failed to create the 512 MB buffer");
+      XELOGE("Shared memory: Failed to create the {} MB buffer",
+             kBufferSize >> 20);
       Shutdown();
       return false;
     }
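To make the granularity choice above concrete: with the 512 MB buffer (kBufferSizeLog2 = 29, per the log messages in this file) and the 4 MB optimal sparse allocation, the bookkeeping sizes work out as in this standalone sanity check (constant names are local to the snippet, derived from the patch):

#include <cstddef>
#include <cstdint>

constexpr uint32_t kBufferSizeLog2 = 29;   // 512 MB shared memory buffer.
constexpr uint32_t kGranularityLog2 = 22;  // max(22, 16) chosen in Initialize.

// One bit tracks each 4 MB allocation unit of the buffer.
constexpr uint32_t kAllocationCount = 1u << (kBufferSizeLog2 - kGranularityLog2);
static_assert(kAllocationCount == 128);

// InitializeSparseHostGpuMemory packs those bits into 64-bit blocks, so the
// whole residency bitmap is just two uint64_t values.
constexpr size_t kBitmapBlocks =
    size_t(1) << ((kBufferSizeLog2 - kGranularityLog2) - 6);
static_assert(kBitmapBlocks == 2);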
@@ -161,13 +170,10 @@ void D3D12SharedMemory::Shutdown(bool from_destructor) {
 
   // First free the buffer to detach it from the heaps.
   ui::d3d12::util::ReleaseAndNull(buffer_);
-  if (AreTiledResourcesUsed()) {
-    for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
-      ui::d3d12::util::ReleaseAndNull(heaps_[i]);
-    }
-    heap_count_ = 0;
-    COUNT_profile_set("gpu/shared_memory/used_mb", 0);
+  for (ID3D12Heap* heap : buffer_tiled_heaps_) {
+    heap->Release();
   }
+  buffer_tiled_heaps_.clear();
 
   // If calling from the destructor, the SharedMemory destructor will call
   // ShutdownCommon.
@@ -180,26 +186,12 @@ void D3D12SharedMemory::ClearCache() {
   SharedMemory::ClearCache();
 
   upload_buffer_pool_->ClearCache();
-
-  // TODO(Triang3l): Unmap and destroy heaps.
 }
 
 void D3D12SharedMemory::CompletedSubmissionUpdated() {
   upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
 }
 
-bool D3D12SharedMemory::AreTiledResourcesUsed() const {
-  if (!cvars::d3d12_tiled_shared_memory) {
-    return false;
-  }
-  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-  // As of October 8th, 2018, PIX doesn't support tiled buffers.
-  // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
-  return provider.GetTiledResourcesTier() !=
-             D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
-         provider.GetGraphicsAnalysis() == nullptr;
-}
-
 void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
     D3D12_RESOURCE_STATES new_state) {
   if (buffer_state_ == new_state) {
@@ -321,11 +313,6 @@ bool D3D12SharedMemory::InitializeTraceSubmitDownloads() {
   command_processor_.SubmitBarriers();
   uint32_t download_buffer_offset = 0;
   for (auto& download_range : trace_download_ranges()) {
-    if (!EnsureHostGpuMemoryAllocated(download_range.first,
-                                      download_range.second)) {
-      download_range.second = 0;
-      continue;
-    }
     command_list.D3DCopyBufferRegion(
         trace_download_buffer_, download_buffer_offset, buffer_,
         download_range.first, download_range.second);
@@ -362,52 +349,50 @@ void D3D12SharedMemory::ResetTraceDownload() {
   ReleaseTraceDownloadRanges();
 }
 
-bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
-                                                     uint32_t length) {
-  if (!length || !AreTiledResourcesUsed()) {
+bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
+    uint32_t offset_allocations, uint32_t length_allocations) {
+  if (!length_allocations) {
     return true;
   }
-  uint32_t heap_first = start >> kHeapSizeLog2;
-  uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2;
-  assert_true(heap_first < xe::countof(heaps_));
-  assert_true(heap_last < xe::countof(heaps_));
-  for (uint32_t i = heap_first; i <= heap_last; ++i) {
-    if (heaps_[i] != nullptr) {
-      continue;
-    }
-    auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-    auto device = provider.GetDevice();
-    auto direct_queue = provider.GetDirectQueue();
-    D3D12_HEAP_DESC heap_desc = {};
-    heap_desc.SizeInBytes = kHeapSize;
-    heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
-    heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
-                      provider.GetHeapFlagCreateNotZeroed();
-    if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) {
-      XELOGE("Shared memory: Failed to create a tile heap");
-      return false;
-    }
-    ++heap_count_;
-    COUNT_profile_set("gpu/shared_memory/used_mb",
-                      heap_count_ << kHeapSizeLog2 >> 20);
-    D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
-    region_start_coordinates.X =
-        (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    region_start_coordinates.Y = 0;
-    region_start_coordinates.Z = 0;
-    region_start_coordinates.Subresource = 0;
-    D3D12_TILE_REGION_SIZE region_size;
-    region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    region_size.UseBox = FALSE;
-    D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
-    UINT heap_range_start_offset = 0;
-    UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    direct_queue->UpdateTileMappings(
-        buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1,
-        &range_flags, &heap_range_start_offset, &range_tile_count,
-        D3D12_TILE_MAPPING_FLAG_NONE);
-    command_processor_.NotifyQueueOperationsDoneDirectly();
+
+  uint32_t offset_bytes = offset_allocations
+                          << host_gpu_memory_sparse_granularity_log2();
+  uint32_t length_bytes = length_allocations
+                          << host_gpu_memory_sparse_granularity_log2();
+
+  const ui::d3d12::D3D12Provider& provider =
+      command_processor_.GetD3D12Context().GetD3D12Provider();
+  ID3D12Device* device = provider.GetDevice();
+  ID3D12CommandQueue* direct_queue = provider.GetDirectQueue();
+
+  D3D12_HEAP_DESC heap_desc = {};
+  heap_desc.SizeInBytes = length_bytes;
+  heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
+  heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
+                    provider.GetHeapFlagCreateNotZeroed();
+  ID3D12Heap* heap;
+  if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heap)))) {
+    XELOGE("Shared memory: Failed to create a tile heap");
+    return false;
   }
+  buffer_tiled_heaps_.push_back(heap);
+
+  D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
+  region_start_coordinates.X =
+      offset_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+  region_start_coordinates.Y = 0;
+  region_start_coordinates.Z = 0;
+  region_start_coordinates.Subresource = 0;
+  D3D12_TILE_REGION_SIZE region_size;
+  region_size.NumTiles = length_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+  region_size.UseBox = FALSE;
+  D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
+  UINT heap_range_start_offset = 0;
+  direct_queue->UpdateTileMappings(
+      buffer_, 1, &region_start_coordinates, &region_size, heap, 1,
+      &range_flags, &heap_range_start_offset, &region_size.NumTiles,
+      D3D12_TILE_MAPPING_FLAG_NONE);
+  command_processor_.NotifyQueueOperationsDoneDirectly();
   return true;
 }
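The offset/length contract of the new virtual is in allocation units, and one call now maps the whole requested range through a single heap, where the old code created one fixed-size heap per 4 MB region. A worked example of the mapping math above (values assumed: 4 MB granularity from Initialize, 64 KB D3D12 tiles):

#include <cstdint>

constexpr uint32_t kGranularityLog2 = 22;       // From Initialize: max(22, 16).
constexpr uint32_t kTileSizeInBytes = 1 << 16;  // 64 KB D3D12 tile.

constexpr uint32_t offset_allocations = 2, length_allocations = 3;
constexpr uint32_t offset_bytes = offset_allocations << kGranularityLog2;
constexpr uint32_t length_bytes = length_allocations << kGranularityLog2;
static_assert(offset_bytes == 8 << 20);                 // 8 MB into the buffer.
static_assert(length_bytes == 12 << 20);                // One 12 MB heap.
static_assert(offset_bytes / kTileSizeInBytes == 128);  // region X = tile 128.
static_assert(length_bytes / kTileSizeInBytes == 192);  // NumTiles = 192.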
diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h
index c66e5578d..6620cecaa 100644
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h
@@ -87,33 +87,24 @@ class D3D12SharedMemory : public SharedMemory {
   void InitializeTraceCompleteDownloads();
 
  protected:
-  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override;
+  bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
+                                        uint32_t length_allocations) override;
 
   bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                         upload_page_ranges) override;
 
  private:
-  bool AreTiledResourcesUsed() const;
-
   D3D12CommandProcessor& command_processor_;
   TraceWriter& trace_writer_;
 
   // The 512 MB tiled buffer.
   ID3D12Resource* buffer_ = nullptr;
   D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
+  std::vector<ID3D12Heap*> buffer_tiled_heaps_;
   D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
   bool buffer_uav_writes_commit_needed_ = false;
   void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
 
-  static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
-  static constexpr uint32_t kHeapSizeLog2 =
-      std::max(kOptimalAllocationLog2, uint32_t(16));
-  static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
-  // Resident portions of the tiled buffer.
-  ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
-  // Number of the heaps currently resident, for profiling.
-  uint32_t heap_count_ = 0;
-
   // Non-shader-visible buffer descriptor heap for faster binding (via copying
   // rather than creation).
   enum class BufferDescriptorIndex : uint32_t {
diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc
index 4951eacea..ca3dcf4f0 100644
--- a/src/xenia/gpu/shared_memory.cc
+++ b/src/xenia/gpu/shared_memory.cc
@@ -13,6 +13,7 @@
 #include <vector>
 
 #include "xenia/base/assert.h"
+#include "xenia/base/bit_range.h"
 #include "xenia/base/math.h"
 #include "xenia/base/memory.h"
 #include "xenia/base/profiling.h"
@@ -36,6 +37,15 @@ void SharedMemory::InitializeCommon() {
       MemoryInvalidationCallbackThunk, this);
 }
 
+void SharedMemory::InitializeSparseHostGpuMemory(uint32_t granularity_log2) {
+  assert_true(granularity_log2 <= kBufferSizeLog2);
+  assert_true(host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX);
+  host_gpu_memory_sparse_granularity_log2_ = granularity_log2;
+  host_gpu_memory_sparse_allocated_.resize(
+      size_t(1) << (std::max(kBufferSizeLog2 - granularity_log2, uint32_t(6)) -
+                    6));
+}
+
 void SharedMemory::ShutdownCommon() {
   ReleaseTraceDownloadRanges();
 
@@ -61,6 +71,19 @@ void SharedMemory::ShutdownCommon() {
         memory_invalidation_callback_handle_);
     memory_invalidation_callback_handle_ = nullptr;
   }
+
+  if (host_gpu_memory_sparse_used_bytes_) {
+    host_gpu_memory_sparse_used_bytes_ = 0;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_used_mb", 0);
+  }
+  if (host_gpu_memory_sparse_allocations_) {
+    host_gpu_memory_sparse_allocations_ = 0;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
+                      0);
+  }
+  host_gpu_memory_sparse_allocated_.clear();
+  host_gpu_memory_sparse_allocated_.shrink_to_fit();
+  host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
 }
 
 void SharedMemory::ClearCache() {
@@ -244,6 +267,14 @@ void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
   MakeRangeValid(start, length, true);
 }
 
+bool SharedMemory::AllocateSparseHostGpuMemoryRange(
+    uint32_t offset_allocations, uint32_t length_allocations) {
+  assert_always(
+      "Sparse host GPU memory allocation has been initialized, but the "
+      "implementation doesn't provide AllocateSparseHostGpuMemoryRange");
+  return false;
+}
+
 void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
                                   bool written_by_gpu) {
   if (length == 0 || start >= kBufferSize) {
@@ -316,7 +347,6 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
   if (start > kBufferSize || (kBufferSize - start) < length) {
     return false;
   }
-  uint32_t last = start + length - 1;
 
   SCOPE_profile_cpu_f("gpu");
 
@@ -506,10 +536,14 @@ void SharedMemory::PrepareForTraceDownload() {
       } else {
         uint32_t gpu_written_range_length =
             gpu_written_page - gpu_written_range_start;
-        trace_download_ranges_.push_back(
-            std::make_pair(gpu_written_range_start << page_size_log2_,
-                           gpu_written_range_length << page_size_log2_));
-        trace_download_page_count_ += gpu_written_range_length;
+        if (EnsureHostGpuMemoryAllocated(
+                gpu_written_range_start << page_size_log2_,
+                gpu_written_range_length << page_size_log2_)) {
+          trace_download_ranges_.push_back(
+              std::make_pair(gpu_written_range_start << page_size_log2_,
+                             gpu_written_range_length << page_size_log2_));
+          trace_download_page_count_ += gpu_written_range_length;
+        }
         gpu_written_range_start = UINT32_MAX;
       }
       uint64_t gpu_written_block_mask =
@@ -524,10 +558,14 @@ void SharedMemory::PrepareForTraceDownload() {
   }
   if (gpu_written_range_start != UINT32_MAX) {
     uint32_t gpu_written_range_length = page_count - gpu_written_range_start;
-    trace_download_ranges_.push_back(
-        std::make_pair(gpu_written_range_start << page_size_log2_,
-                       gpu_written_range_length << page_size_log2_));
-    trace_download_page_count_ += gpu_written_range_length;
+    if (EnsureHostGpuMemoryAllocated(
+            gpu_written_range_start << page_size_log2_,
+            gpu_written_range_length << page_size_log2_)) {
+      trace_download_ranges_.push_back(
+          std::make_pair(gpu_written_range_start << page_size_log2_,
+                         gpu_written_range_length << page_size_log2_));
+      trace_download_page_count_ += gpu_written_range_length;
+    }
   }
 }
 
@@ -537,5 +575,50 @@ void SharedMemory::ReleaseTraceDownloadRanges() {
   trace_download_page_count_ = 0;
 }
 
+bool SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
+                                                uint32_t length) {
+  if (host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX) {
+    return true;
+  }
+  if (!length) {
+    return true;
+  }
+  if (start > kBufferSize || (kBufferSize - start) < length) {
+    return false;
+  }
+  uint32_t page_first = start >> page_size_log2_;
+  uint32_t page_last = (start + length - 1) >> page_size_log2_;
+  uint32_t allocation_first = page_first << page_size_log2_ >>
+                              host_gpu_memory_sparse_granularity_log2_;
+  uint32_t allocation_last = page_last << page_size_log2_ >>
+                             host_gpu_memory_sparse_granularity_log2_;
+  while (true) {
+    std::pair<size_t, size_t> allocation_range = xe::bit_range::NextUnsetRange(
+        host_gpu_memory_sparse_allocated_.data(), allocation_first,
+        allocation_last - allocation_first + 1);
+    if (!allocation_range.second) {
+      break;
+    }
+    if (!AllocateSparseHostGpuMemoryRange(uint32_t(allocation_range.first),
+                                          uint32_t(allocation_range.second))) {
+      return false;
+    }
+    xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(),
+                            allocation_range.first, allocation_range.second);
+    ++host_gpu_memory_sparse_allocations_;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
+                      host_gpu_memory_sparse_allocations_);
+    host_gpu_memory_sparse_used_bytes_ +=
+        uint32_t(allocation_range.second)
+        << host_gpu_memory_sparse_granularity_log2_;
+    COUNT_profile_set(
+        "gpu/shared_memory/host_gpu_memory_sparse_used_mb",
+        (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20);
+    allocation_first =
+        uint32_t(allocation_range.first + allocation_range.second);
+  }
+  return true;
+}
+
 }  // namespace gpu
 }  // namespace xe
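The loop in EnsureHostGpuMemoryAllocated only ever asks the backend for the holes in the residency bitmap, one contiguous range per call. A toy, self-contained illustration of that pattern (FakeAllocate stands in for a backend's AllocateSparseHostGpuMemoryRange override and is not patch code):

#include <cstdint>
#include <cstdio>
#include <utility>

#include "xenia/base/bit_range.h"

// Stands in for the backend allocation override.
static bool FakeAllocate(size_t first, size_t count) {
  std::printf("allocating %zu allocation(s) at %zu\n", count, first);
  return true;
}

int main() {
  uint64_t allocated[2] = {};
  // Pretend allocation units 4..7 are already resident.
  xe::bit_range::SetRange(allocated, 4, 4);
  // Request units 2..9: produces two backend calls, for 2..3 and for 8..9,
  // skipping the already-resident 4..7.
  size_t first = 2, last = 9;
  while (true) {
    std::pair<size_t, size_t> range =
        xe::bit_range::NextUnsetRange(allocated, first, last - first + 1);
    if (!range.second) {
      break;
    }
    if (!FakeAllocate(range.first, range.second)) {
      return 1;
    }
    xe::bit_range::SetRange(allocated, range.first, range.second);
    first = range.first + range.second;
  }
  return 0;
}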
diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h
index 6dae85909..496836a38 100644
--- a/src/xenia/gpu/shared_memory.h
+++ b/src/xenia/gpu/shared_memory.h
@@ -93,6 +93,7 @@ class SharedMemory {
   SharedMemory(Memory& memory);
   // Call in implementation-specific initialization.
   void InitializeCommon();
+  void InitializeSparseHostGpuMemory(uint32_t granularity_log2);
   // Call last in implementation-specific shutdown, also callable from the
   // destructor.
   void ShutdownCommon();
@@ -103,33 +104,35 @@ class SharedMemory {
   // Sparse allocations are 4 MB, so not too many of them are allocated, but
   // also not to waste too much memory for padding (with 16 MB there's too
   // much).
-  static constexpr uint32_t kOptimalAllocationLog2 = 22;
-  static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
+  static constexpr uint32_t kHostGpuMemoryOptimalSparseAllocationLog2 = 22;
+  static_assert(kHostGpuMemoryOptimalSparseAllocationLog2 <= kBufferSizeLog2);
 
   Memory& memory() const { return memory_; }
 
   uint32_t page_size_log2() const { return page_size_log2_; }
 
+  uint32_t host_gpu_memory_sparse_granularity_log2() const {
+    return host_gpu_memory_sparse_granularity_log2_;
+  }
+
+  virtual bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
+                                                uint32_t length_allocations);
+
   // Mark the memory range as updated and protect it.
   void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
 
-  // Ensures the host GPU memory backing the range is accessible by host GPU
-  // drawing / computations / copying, but doesn't upload anything.
-  virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
-                                            uint32_t length) = 0;
-
-  // Uploads a range of host pages - only called if
-  // EnsureHostGpuMemoryAllocated succeeded. While uploading, MakeRangeValid
-  // must be called for each successfully uploaded range as early as possible,
-  // before the memcpy, to make sure invalidation that happened during the
-  // CPU -> GPU memcpy isn't missed (upload_page_ranges is in pages because of
-  // this - MakeRangeValid has page granularity).
+  // Uploads a range of host pages - only called if host GPU sparse memory
+  // allocation succeeded if needed. While uploading, MakeRangeValid must be
+  // called for each successfully uploaded range as early as possible, before
+  // the memcpy, to make sure invalidation that happened during the CPU -> GPU
+  // memcpy isn't missed (upload_page_ranges is in pages because of this -
+  // MakeRangeValid has page granularity). upload_page_ranges are sorted in
+  // ascending address order, so front and back can be used to determine the
+  // overall bounds of pages to be uploaded.
   virtual bool UploadRanges(
       const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
 
-  // Mutable so the implementation can skip ranges by setting their "second"
-  // value to 0 if needed.
-  std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
+  const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
     return trace_download_ranges_;
   }
   uint32_t trace_download_page_count() const {
@@ -153,6 +156,12 @@ class SharedMemory {
   // touched pages of the buffer of this size will be invalidated).
   uint32_t page_size_log2_;
 
+  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length);
+  uint32_t host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
+  std::vector<uint64_t> host_gpu_memory_sparse_allocated_;
+  uint32_t host_gpu_memory_sparse_allocations_ = 0;
+  uint32_t host_gpu_memory_sparse_used_bytes_ = 0;
+
   void* memory_invalidation_callback_handle_ = nullptr;
   void* memory_data_provider_handle_ = nullptr;
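Since allocation tracking now lives in the common SharedMemory, a non-D3D12 backend only has to implement the single AllocateSparseHostGpuMemoryRange virtual. Purely as an illustration of the intended shape - not code from this patch - a Vulkan backend using sparse binding might wrap something like the helper below, converting allocation units to bytes via host_gpu_memory_sparse_granularity_log2() first (all names here are hypothetical):

#include <cstdint>
#include <vector>

#include <vulkan/vulkan.h>

// Binds newly allocated memory to [offset_bytes, offset_bytes + length_bytes)
// of a buffer created with VK_BUFFER_CREATE_SPARSE_BINDING_BIT. The caller
// owns synchronization with other queue submissions and keeps the returned
// allocation alive until shutdown.
bool AllocateSparseRange(VkDevice device, VkQueue sparse_binding_queue,
                         VkBuffer buffer, uint32_t memory_type_index,
                         VkDeviceSize offset_bytes, VkDeviceSize length_bytes,
                         std::vector<VkDeviceMemory>& out_allocations) {
  VkMemoryAllocateInfo allocate_info = {};
  allocate_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
  allocate_info.allocationSize = length_bytes;
  allocate_info.memoryTypeIndex = memory_type_index;
  VkDeviceMemory memory;
  if (vkAllocateMemory(device, &allocate_info, nullptr, &memory) !=
      VK_SUCCESS) {
    return false;
  }
  out_allocations.push_back(memory);

  VkSparseMemoryBind bind = {};
  bind.resourceOffset = offset_bytes;
  bind.size = length_bytes;
  bind.memory = memory;
  bind.memoryOffset = 0;

  VkSparseBufferMemoryBindInfo buffer_bind = {};
  buffer_bind.buffer = buffer;
  buffer_bind.bindCount = 1;
  buffer_bind.pBinds = &bind;

  VkBindSparseInfo bind_sparse_info = {};
  bind_sparse_info.sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO;
  bind_sparse_info.bufferBindCount = 1;
  bind_sparse_info.pBufferBinds = &buffer_bind;

  return vkQueueBindSparse(sparse_binding_queue, 1, &bind_sparse_info,
                           VK_NULL_HANDLE) == VK_SUCCESS;
}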