[GPU] SharedMemory: common sparse memory allocation
This commit is contained in:
parent
75bf2d3c7d
commit
19121130a3
|
@ -0,0 +1,106 @@
|
||||||
|
/**
|
||||||
|
******************************************************************************
|
||||||
|
* Xenia : Xbox 360 Emulator Research Project *
|
||||||
|
******************************************************************************
|
||||||
|
* Copyright 2019 Ben Vanik. All rights reserved. *
|
||||||
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||||
|
******************************************************************************
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef XENIA_BASE_BIT_RANGE_H_
|
||||||
|
#define XENIA_BASE_BIT_RANGE_H_
|
||||||
|
|
||||||
|
#include <climits>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstring>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "xenia/base/math.h"
|
||||||
|
|
||||||
|
namespace xe {
|
||||||
|
namespace bit_range {
|
||||||
|
|
||||||
|
// Provided length is in bits since the first. Returns <first, length> of the
|
||||||
|
// range in bits, with length == 0 if not found.
|
||||||
|
template <typename Block>
|
||||||
|
std::pair<size_t, size_t> NextUnsetRange(const Block* bits, size_t first,
|
||||||
|
size_t length) {
|
||||||
|
if (!length) {
|
||||||
|
return std::make_pair(size_t(first), size_t(0));
|
||||||
|
}
|
||||||
|
size_t last = first + length - 1;
|
||||||
|
const size_t block_bits = sizeof(Block) * CHAR_BIT;
|
||||||
|
size_t block_first = first / block_bits;
|
||||||
|
size_t block_last = last / block_bits;
|
||||||
|
size_t range_start = SIZE_MAX;
|
||||||
|
for (size_t i = block_first; i <= block_last; ++i) {
|
||||||
|
Block block = bits[i];
|
||||||
|
// Ignore bits in the block outside the specified range by considering them
|
||||||
|
// set.
|
||||||
|
if (i == block_first) {
|
||||||
|
block |= (Block(1) << (first & (block_bits - 1))) - 1;
|
||||||
|
}
|
||||||
|
if (i == block_last && (last & (block_bits - 1)) != block_bits - 1) {
|
||||||
|
block |= ~((Block(1) << ((last & (block_bits - 1)) + 1)) - 1);
|
||||||
|
}
|
||||||
|
while (true) {
|
||||||
|
uint32_t block_bit;
|
||||||
|
if (range_start == SIZE_MAX) {
|
||||||
|
// Check if need to open a new range.
|
||||||
|
if (!xe::bit_scan_forward(~block, &block_bit)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
range_start = i * block_bits + block_bit;
|
||||||
|
} else {
|
||||||
|
// Check if need to close the range.
|
||||||
|
// Ignore the set bits before the beginning of the range.
|
||||||
|
Block block_bits_set_from_start = block;
|
||||||
|
if (i == range_start / block_bits) {
|
||||||
|
block_bits_set_from_start &=
|
||||||
|
~((Block(1) << (range_start & (block_bits - 1))) - 1);
|
||||||
|
}
|
||||||
|
if (!xe::bit_scan_forward(block_bits_set_from_start, &block_bit)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return std::make_pair(range_start,
|
||||||
|
(i * block_bits) + block_bit - range_start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (range_start != SIZE_MAX) {
|
||||||
|
return std::make_pair(range_start, last + size_t(1) - range_start);
|
||||||
|
}
|
||||||
|
return std::make_pair(first + length, size_t(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets the bits [first, first + length) of the bit array stored in `bits`,
// leaving all the other bits unchanged. Does nothing if length is 0.
//
// Bit N of the array is bit N % block_bits of bits[N / block_bits]. No bounds
// checking is done - the caller must make sure the blocks containing the range
// are writable.
template <typename Block>
void SetRange(Block* bits, size_t first, size_t length) {
  if (!length) {
    return;
  }
  size_t last = first + length - 1;
  const size_t block_bits = sizeof(Block) * CHAR_BIT;
  size_t block_first = first / block_bits;
  size_t block_last = last / block_bits;
  // Mask of the bits of the first block at and above `first`.
  Block set_first = ~((Block(1) << (first & (block_bits - 1))) - 1);
  // Mask of the bits of the last block at and below `last` (the whole block if
  // `last` is the highest bit of it, to avoid shifting by the block width).
  Block set_last = ~Block(0);
  if ((last & (block_bits - 1)) != (block_bits - 1)) {
    set_last &= (Block(1) << ((last & (block_bits - 1)) + 1)) - 1;
  }
  if (block_first == block_last) {
    bits[block_first] |= set_first & set_last;
    return;
  }
  bits[block_first] |= set_first;
  if (block_first + 1 < block_last) {
    // Blocks strictly between the first and the last are fully covered.
    // The fill value must have all bits of a byte set: UCHAR_MAX, not CHAR_MAX,
    // which is 0x7F where char is signed and would leave the top bit of every
    // byte unset.
    std::memset(bits + block_first + 1, UCHAR_MAX,
                (block_last - (block_first + 1)) * sizeof(Block));
  }
  bits[block_last] |= set_last;
}
|
||||||
|
|
||||||
|
} // namespace bit_range
|
||||||
|
} // namespace xe
|
||||||
|
|
||||||
|
#endif // XENIA_BASE_BIT_RANGE_H_
|
|
@ -17,7 +17,6 @@
|
||||||
#include "xenia/base/cvar.h"
|
#include "xenia/base/cvar.h"
|
||||||
#include "xenia/base/logging.h"
|
#include "xenia/base/logging.h"
|
||||||
#include "xenia/base/math.h"
|
#include "xenia/base/math.h"
|
||||||
#include "xenia/base/profiling.h"
|
|
||||||
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
|
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_util.h"
|
#include "xenia/ui/d3d12/d3d12_util.h"
|
||||||
|
|
||||||
|
@ -43,26 +42,35 @@ D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); }
|
||||||
bool D3D12SharedMemory::Initialize() {
|
bool D3D12SharedMemory::Initialize() {
|
||||||
InitializeCommon();
|
InitializeCommon();
|
||||||
|
|
||||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
const ui::d3d12::D3D12Provider& provider =
|
||||||
auto device = provider.GetDevice();
|
command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||||
|
ID3D12Device* device = provider.GetDevice();
|
||||||
|
|
||||||
D3D12_RESOURCE_DESC buffer_desc;
|
D3D12_RESOURCE_DESC buffer_desc;
|
||||||
ui::d3d12::util::FillBufferResourceDesc(
|
ui::d3d12::util::FillBufferResourceDesc(
|
||||||
buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
|
buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
|
||||||
buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
|
buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
if (AreTiledResourcesUsed()) {
|
if (cvars::d3d12_tiled_shared_memory &&
|
||||||
|
provider.GetTiledResourcesTier() !=
|
||||||
|
D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
|
||||||
|
!provider.GetGraphicsAnalysis()) {
|
||||||
if (FAILED(device->CreateReservedResource(
|
if (FAILED(device->CreateReservedResource(
|
||||||
&buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
|
&buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
|
||||||
XELOGE("Shared memory: Failed to create the 512 MB tiled buffer");
|
XELOGE("Shared memory: Failed to create the {} MB tiled buffer",
|
||||||
|
kBufferSize >> 20);
|
||||||
Shutdown();
|
Shutdown();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
|
||||||
|
InitializeSparseHostGpuMemory(
|
||||||
|
std::max(kHostGpuMemoryOptimalSparseAllocationLog2, uint32_t(16)));
|
||||||
} else {
|
} else {
|
||||||
XELOGGPU(
|
XELOGGPU(
|
||||||
"Direct3D 12 tiled resources are not used for shared memory "
|
"Direct3D 12 tiled resources are not used for shared memory "
|
||||||
"emulation - video memory usage may increase significantly "
|
"emulation - video memory usage may increase significantly "
|
||||||
"because a full 512 MB buffer will be created!");
|
"because a full {} MB buffer will be created!",
|
||||||
if (provider.GetGraphicsAnalysis() != nullptr) {
|
kBufferSize >> 20);
|
||||||
|
if (provider.GetGraphicsAnalysis()) {
|
||||||
// As of October 8th, 2018, PIX doesn't support tiled buffers.
|
// As of October 8th, 2018, PIX doesn't support tiled buffers.
|
||||||
// FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
|
// FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
|
||||||
XELOGGPU(
|
XELOGGPU(
|
||||||
|
@ -73,7 +81,8 @@ bool D3D12SharedMemory::Initialize() {
|
||||||
&ui::d3d12::util::kHeapPropertiesDefault,
|
&ui::d3d12::util::kHeapPropertiesDefault,
|
||||||
provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
|
provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
|
||||||
nullptr, IID_PPV_ARGS(&buffer_)))) {
|
nullptr, IID_PPV_ARGS(&buffer_)))) {
|
||||||
XELOGE("Shared memory: Failed to create the 512 MB buffer");
|
XELOGE("Shared memory: Failed to create the {} MB buffer",
|
||||||
|
kBufferSize >> 20);
|
||||||
Shutdown();
|
Shutdown();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -161,13 +170,10 @@ void D3D12SharedMemory::Shutdown(bool from_destructor) {
|
||||||
// First free the buffer to detach it from the heaps.
|
// First free the buffer to detach it from the heaps.
|
||||||
ui::d3d12::util::ReleaseAndNull(buffer_);
|
ui::d3d12::util::ReleaseAndNull(buffer_);
|
||||||
|
|
||||||
if (AreTiledResourcesUsed()) {
|
for (ID3D12Heap* heap : buffer_tiled_heaps_) {
|
||||||
for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
|
heap->Release();
|
||||||
ui::d3d12::util::ReleaseAndNull(heaps_[i]);
|
|
||||||
}
|
|
||||||
heap_count_ = 0;
|
|
||||||
COUNT_profile_set("gpu/shared_memory/used_mb", 0);
|
|
||||||
}
|
}
|
||||||
|
buffer_tiled_heaps_.clear();
|
||||||
|
|
||||||
// If calling from the destructor, the SharedMemory destructor will call
|
// If calling from the destructor, the SharedMemory destructor will call
|
||||||
// ShutdownCommon.
|
// ShutdownCommon.
|
||||||
|
@ -180,26 +186,12 @@ void D3D12SharedMemory::ClearCache() {
|
||||||
SharedMemory::ClearCache();
|
SharedMemory::ClearCache();
|
||||||
|
|
||||||
upload_buffer_pool_->ClearCache();
|
upload_buffer_pool_->ClearCache();
|
||||||
|
|
||||||
// TODO(Triang3l): Unmap and destroy heaps.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12SharedMemory::CompletedSubmissionUpdated() {
|
void D3D12SharedMemory::CompletedSubmissionUpdated() {
|
||||||
upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
|
upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool D3D12SharedMemory::AreTiledResourcesUsed() const {
|
|
||||||
if (!cvars::d3d12_tiled_shared_memory) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
|
||||||
// As of October 8th, 2018, PIX doesn't support tiled buffers.
|
|
||||||
// FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
|
|
||||||
return provider.GetTiledResourcesTier() !=
|
|
||||||
D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
|
|
||||||
provider.GetGraphicsAnalysis() == nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
|
void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
|
||||||
D3D12_RESOURCE_STATES new_state) {
|
D3D12_RESOURCE_STATES new_state) {
|
||||||
if (buffer_state_ == new_state) {
|
if (buffer_state_ == new_state) {
|
||||||
|
@ -321,11 +313,6 @@ bool D3D12SharedMemory::InitializeTraceSubmitDownloads() {
|
||||||
command_processor_.SubmitBarriers();
|
command_processor_.SubmitBarriers();
|
||||||
uint32_t download_buffer_offset = 0;
|
uint32_t download_buffer_offset = 0;
|
||||||
for (auto& download_range : trace_download_ranges()) {
|
for (auto& download_range : trace_download_ranges()) {
|
||||||
if (!EnsureHostGpuMemoryAllocated(download_range.first,
|
|
||||||
download_range.second)) {
|
|
||||||
download_range.second = 0;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
command_list.D3DCopyBufferRegion(
|
command_list.D3DCopyBufferRegion(
|
||||||
trace_download_buffer_, download_buffer_offset, buffer_,
|
trace_download_buffer_, download_buffer_offset, buffer_,
|
||||||
download_range.first, download_range.second);
|
download_range.first, download_range.second);
|
||||||
|
@ -362,52 +349,50 @@ void D3D12SharedMemory::ResetTraceDownload() {
|
||||||
ReleaseTraceDownloadRanges();
|
ReleaseTraceDownloadRanges();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
|
bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
|
||||||
uint32_t length) {
|
uint32_t offset_allocations, uint32_t length_allocations) {
|
||||||
if (!length || !AreTiledResourcesUsed()) {
|
if (!length_allocations) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
uint32_t heap_first = start >> kHeapSizeLog2;
|
|
||||||
uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2;
|
uint32_t offset_bytes = offset_allocations
|
||||||
assert_true(heap_first < xe::countof(heaps_));
|
<< host_gpu_memory_sparse_granularity_log2();
|
||||||
assert_true(heap_last < xe::countof(heaps_));
|
uint32_t length_bytes = length_allocations
|
||||||
for (uint32_t i = heap_first; i <= heap_last; ++i) {
|
<< host_gpu_memory_sparse_granularity_log2();
|
||||||
if (heaps_[i] != nullptr) {
|
|
||||||
continue;
|
const ui::d3d12::D3D12Provider& provider =
|
||||||
}
|
command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
ID3D12Device* device = provider.GetDevice();
|
||||||
auto device = provider.GetDevice();
|
ID3D12CommandQueue* direct_queue = provider.GetDirectQueue();
|
||||||
auto direct_queue = provider.GetDirectQueue();
|
|
||||||
D3D12_HEAP_DESC heap_desc = {};
|
D3D12_HEAP_DESC heap_desc = {};
|
||||||
heap_desc.SizeInBytes = kHeapSize;
|
heap_desc.SizeInBytes = length_bytes;
|
||||||
heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
|
heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
|
||||||
heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
|
heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
|
||||||
provider.GetHeapFlagCreateNotZeroed();
|
provider.GetHeapFlagCreateNotZeroed();
|
||||||
if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) {
|
ID3D12Heap* heap;
|
||||||
|
if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heap)))) {
|
||||||
XELOGE("Shared memory: Failed to create a tile heap");
|
XELOGE("Shared memory: Failed to create a tile heap");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
++heap_count_;
|
buffer_tiled_heaps_.push_back(heap);
|
||||||
COUNT_profile_set("gpu/shared_memory/used_mb",
|
|
||||||
heap_count_ << kHeapSizeLog2 >> 20);
|
|
||||||
D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
|
D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
|
||||||
region_start_coordinates.X =
|
region_start_coordinates.X =
|
||||||
(i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
|
offset_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
|
||||||
region_start_coordinates.Y = 0;
|
region_start_coordinates.Y = 0;
|
||||||
region_start_coordinates.Z = 0;
|
region_start_coordinates.Z = 0;
|
||||||
region_start_coordinates.Subresource = 0;
|
region_start_coordinates.Subresource = 0;
|
||||||
D3D12_TILE_REGION_SIZE region_size;
|
D3D12_TILE_REGION_SIZE region_size;
|
||||||
region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
|
region_size.NumTiles = length_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
|
||||||
region_size.UseBox = FALSE;
|
region_size.UseBox = FALSE;
|
||||||
D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
|
D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
|
||||||
UINT heap_range_start_offset = 0;
|
UINT heap_range_start_offset = 0;
|
||||||
UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
|
|
||||||
direct_queue->UpdateTileMappings(
|
direct_queue->UpdateTileMappings(
|
||||||
buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1,
|
buffer_, 1, &region_start_coordinates, &region_size, heap, 1,
|
||||||
&range_flags, &heap_range_start_offset, &range_tile_count,
|
&range_flags, &heap_range_start_offset, &region_size.NumTiles,
|
||||||
D3D12_TILE_MAPPING_FLAG_NONE);
|
D3D12_TILE_MAPPING_FLAG_NONE);
|
||||||
command_processor_.NotifyQueueOperationsDoneDirectly();
|
command_processor_.NotifyQueueOperationsDoneDirectly();
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -87,33 +87,24 @@ class D3D12SharedMemory : public SharedMemory {
|
||||||
void InitializeTraceCompleteDownloads();
|
void InitializeTraceCompleteDownloads();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override;
|
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||||
|
uint32_t length_allocations) override;
|
||||||
|
|
||||||
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
|
||||||
upload_page_ranges) override;
|
upload_page_ranges) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool AreTiledResourcesUsed() const;
|
|
||||||
|
|
||||||
D3D12CommandProcessor& command_processor_;
|
D3D12CommandProcessor& command_processor_;
|
||||||
TraceWriter& trace_writer_;
|
TraceWriter& trace_writer_;
|
||||||
|
|
||||||
// The 512 MB tiled buffer.
|
// The 512 MB tiled buffer.
|
||||||
ID3D12Resource* buffer_ = nullptr;
|
ID3D12Resource* buffer_ = nullptr;
|
||||||
D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
|
D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
|
||||||
|
std::vector<ID3D12Heap*> buffer_tiled_heaps_;
|
||||||
D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
|
D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||||
bool buffer_uav_writes_commit_needed_ = false;
|
bool buffer_uav_writes_commit_needed_ = false;
|
||||||
void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
|
void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
|
||||||
|
|
||||||
static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
|
|
||||||
static constexpr uint32_t kHeapSizeLog2 =
|
|
||||||
std::max(kOptimalAllocationLog2, uint32_t(16));
|
|
||||||
static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
|
|
||||||
// Resident portions of the tiled buffer.
|
|
||||||
ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
|
|
||||||
// Number of the heaps currently resident, for profiling.
|
|
||||||
uint32_t heap_count_ = 0;
|
|
||||||
|
|
||||||
// Non-shader-visible buffer descriptor heap for faster binding (via copying
|
// Non-shader-visible buffer descriptor heap for faster binding (via copying
|
||||||
// rather than creation).
|
// rather than creation).
|
||||||
enum class BufferDescriptorIndex : uint32_t {
|
enum class BufferDescriptorIndex : uint32_t {
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "xenia/base/assert.h"
|
#include "xenia/base/assert.h"
|
||||||
|
#include "xenia/base/bit_range.h"
|
||||||
#include "xenia/base/math.h"
|
#include "xenia/base/math.h"
|
||||||
#include "xenia/base/memory.h"
|
#include "xenia/base/memory.h"
|
||||||
#include "xenia/base/profiling.h"
|
#include "xenia/base/profiling.h"
|
||||||
|
@ -36,6 +37,15 @@ void SharedMemory::InitializeCommon() {
|
||||||
MemoryInvalidationCallbackThunk, this);
|
MemoryInvalidationCallbackThunk, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Turns on sparse host GPU memory tracking with allocations of
// (1 << granularity_log2) bytes, and sizes the bitmap of allocated regions
// (one bit per allocation, packed into 64-bit blocks) accordingly.
// Must only be called while sparse allocation is not initialized, which is
// indicated by the stored granularity being UINT32_MAX.
void SharedMemory::InitializeSparseHostGpuMemory(uint32_t granularity_log2) {
  assert_true(granularity_log2 <= kBufferSizeLog2);
  assert_true(host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX);
  host_gpu_memory_sparse_granularity_log2_ = granularity_log2;
  // A bitmap block holds 1 << 6 allocation bits, and at least one block is
  // needed even if there are fewer allocations than that.
  const uint32_t allocation_count_log2 = kBufferSizeLog2 - granularity_log2;
  const uint32_t bitmap_block_count_log2 =
      std::max(allocation_count_log2, uint32_t(6)) - 6;
  host_gpu_memory_sparse_allocated_.resize(size_t(1)
                                           << bitmap_block_count_log2);
}
|
||||||
|
|
||||||
void SharedMemory::ShutdownCommon() {
|
void SharedMemory::ShutdownCommon() {
|
||||||
ReleaseTraceDownloadRanges();
|
ReleaseTraceDownloadRanges();
|
||||||
|
|
||||||
|
@ -61,6 +71,19 @@ void SharedMemory::ShutdownCommon() {
|
||||||
memory_invalidation_callback_handle_);
|
memory_invalidation_callback_handle_);
|
||||||
memory_invalidation_callback_handle_ = nullptr;
|
memory_invalidation_callback_handle_ = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (host_gpu_memory_sparse_used_bytes_) {
|
||||||
|
host_gpu_memory_sparse_used_bytes_ = 0;
|
||||||
|
COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_used_mb", 0);
|
||||||
|
}
|
||||||
|
if (host_gpu_memory_sparse_allocations_) {
|
||||||
|
host_gpu_memory_sparse_allocations_ = 0;
|
||||||
|
COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
|
||||||
|
0);
|
||||||
|
}
|
||||||
|
host_gpu_memory_sparse_allocated_.clear();
|
||||||
|
host_gpu_memory_sparse_allocated_.shrink_to_fit();
|
||||||
|
host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SharedMemory::ClearCache() {
|
void SharedMemory::ClearCache() {
|
||||||
|
@ -244,6 +267,14 @@ void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
|
||||||
MakeRangeValid(start, length, true);
|
MakeRangeValid(start, length, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Base implementation of the sparse allocation hook, taking the offset and the
// length of the range in sparse allocation units. It does not allocate
// anything: it triggers an assertion reporting that the implementation has
// initialized sparse host GPU memory without providing this function, and
// returns false.
bool SharedMemory::AllocateSparseHostGpuMemoryRange(
|
||||||
|
uint32_t offset_allocations, uint32_t length_allocations) {
|
||||||
|
assert_always(
|
||||||
|
"Sparse host GPU memory allocation has been initialized, but the "
|
||||||
|
"implementation doesn't provide AllocateSparseHostGpuMemoryRange");
|
||||||
|
// Always report failure.
return false;
|
||||||
|
}
|
||||||
|
|
||||||
void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
|
void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
|
||||||
bool written_by_gpu) {
|
bool written_by_gpu) {
|
||||||
if (length == 0 || start >= kBufferSize) {
|
if (length == 0 || start >= kBufferSize) {
|
||||||
|
@ -316,7 +347,6 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
|
||||||
if (start > kBufferSize || (kBufferSize - start) < length) {
|
if (start > kBufferSize || (kBufferSize - start) < length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
uint32_t last = start + length - 1;
|
|
||||||
|
|
||||||
SCOPE_profile_cpu_f("gpu");
|
SCOPE_profile_cpu_f("gpu");
|
||||||
|
|
||||||
|
@ -506,10 +536,14 @@ void SharedMemory::PrepareForTraceDownload() {
|
||||||
} else {
|
} else {
|
||||||
uint32_t gpu_written_range_length =
|
uint32_t gpu_written_range_length =
|
||||||
gpu_written_page - gpu_written_range_start;
|
gpu_written_page - gpu_written_range_start;
|
||||||
|
if (EnsureHostGpuMemoryAllocated(
|
||||||
|
gpu_written_range_start << page_size_log2_,
|
||||||
|
gpu_written_range_length << page_size_log2_)) {
|
||||||
trace_download_ranges_.push_back(
|
trace_download_ranges_.push_back(
|
||||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||||
gpu_written_range_length << page_size_log2_));
|
gpu_written_range_length << page_size_log2_));
|
||||||
trace_download_page_count_ += gpu_written_range_length;
|
trace_download_page_count_ += gpu_written_range_length;
|
||||||
|
}
|
||||||
gpu_written_range_start = UINT32_MAX;
|
gpu_written_range_start = UINT32_MAX;
|
||||||
}
|
}
|
||||||
uint64_t gpu_written_block_mask =
|
uint64_t gpu_written_block_mask =
|
||||||
|
@ -524,11 +558,15 @@ void SharedMemory::PrepareForTraceDownload() {
|
||||||
}
|
}
|
||||||
if (gpu_written_range_start != UINT32_MAX) {
|
if (gpu_written_range_start != UINT32_MAX) {
|
||||||
uint32_t gpu_written_range_length = page_count - gpu_written_range_start;
|
uint32_t gpu_written_range_length = page_count - gpu_written_range_start;
|
||||||
|
if (EnsureHostGpuMemoryAllocated(
|
||||||
|
gpu_written_range_start << page_size_log2_,
|
||||||
|
gpu_written_range_length << page_size_log2_)) {
|
||||||
trace_download_ranges_.push_back(
|
trace_download_ranges_.push_back(
|
||||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||||
gpu_written_range_length << page_size_log2_));
|
gpu_written_range_length << page_size_log2_));
|
||||||
trace_download_page_count_ += gpu_written_range_length;
|
trace_download_page_count_ += gpu_written_range_length;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SharedMemory::ReleaseTraceDownloadRanges() {
|
void SharedMemory::ReleaseTraceDownloadRanges() {
|
||||||
|
@ -537,5 +575,50 @@ void SharedMemory::ReleaseTraceDownloadRanges() {
|
||||||
trace_download_page_count_ = 0;
|
trace_download_page_count_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
|
||||||
|
uint32_t length) {
|
||||||
|
if (host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (start > kBufferSize || (kBufferSize - start) < length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
uint32_t page_first = start >> page_size_log2_;
|
||||||
|
uint32_t page_last = (start + length - 1) >> page_size_log2_;
|
||||||
|
uint32_t allocation_first =
|
||||||
|
page_first << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_;
|
||||||
|
uint32_t allocation_last =
|
||||||
|
page_last << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_;
|
||||||
|
while (true) {
|
||||||
|
std::pair<size_t, size_t> allocation_range = xe::bit_range::NextUnsetRange(
|
||||||
|
host_gpu_memory_sparse_allocated_.data(), allocation_first,
|
||||||
|
allocation_last - allocation_first + 1);
|
||||||
|
if (!allocation_range.second) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!AllocateSparseHostGpuMemoryRange(uint32_t(allocation_range.first),
|
||||||
|
uint32_t(allocation_range.second))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(),
|
||||||
|
allocation_range.first, allocation_range.second);
|
||||||
|
++host_gpu_memory_sparse_allocations_;
|
||||||
|
COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
|
||||||
|
host_gpu_memory_sparse_allocations_);
|
||||||
|
host_gpu_memory_sparse_used_bytes_ +=
|
||||||
|
uint32_t(allocation_range.second)
|
||||||
|
<< host_gpu_memory_sparse_granularity_log2_;
|
||||||
|
COUNT_profile_set(
|
||||||
|
"gpu/shared_memory/host_gpu_memory_sparse_used_mb",
|
||||||
|
(host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20);
|
||||||
|
allocation_first =
|
||||||
|
uint32_t(allocation_range.first + allocation_range.second);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
|
@ -93,6 +93,7 @@ class SharedMemory {
|
||||||
SharedMemory(Memory& memory);
|
SharedMemory(Memory& memory);
|
||||||
// Call in implementation-specific initialization.
|
// Call in implementation-specific initialization.
|
||||||
void InitializeCommon();
|
void InitializeCommon();
|
||||||
|
void InitializeSparseHostGpuMemory(uint32_t granularity_log2);
|
||||||
// Call last in implementation-specific shutdown, also callable from the
|
// Call last in implementation-specific shutdown, also callable from the
|
||||||
// destructor.
|
// destructor.
|
||||||
void ShutdownCommon();
|
void ShutdownCommon();
|
||||||
|
@ -103,33 +104,35 @@ class SharedMemory {
|
||||||
// Sparse allocations are 4 MB, so not too many of them are allocated, but
|
// Sparse allocations are 4 MB, so not too many of them are allocated, but
|
||||||
// also not to waste too much memory for padding (with 16 MB there's too
|
// also not to waste too much memory for padding (with 16 MB there's too
|
||||||
// much).
|
// much).
|
||||||
static constexpr uint32_t kOptimalAllocationLog2 = 22;
|
static constexpr uint32_t kHostGpuMemoryOptimalSparseAllocationLog2 = 22;
|
||||||
static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
|
static_assert(kHostGpuMemoryOptimalSparseAllocationLog2 <= kBufferSizeLog2);
|
||||||
|
|
||||||
Memory& memory() const { return memory_; }
|
Memory& memory() const { return memory_; }
|
||||||
|
|
||||||
uint32_t page_size_log2() const { return page_size_log2_; }
|
uint32_t page_size_log2() const { return page_size_log2_; }
|
||||||
|
|
||||||
|
uint32_t host_gpu_memory_sparse_granularity_log2() const {
|
||||||
|
return host_gpu_memory_sparse_granularity_log2_;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
|
||||||
|
uint32_t length_allocations);
|
||||||
|
|
||||||
// Mark the memory range as updated and protect it.
|
// Mark the memory range as updated and protect it.
|
||||||
void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
|
void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
|
||||||
|
|
||||||
// Ensures the host GPU memory backing the range is accessible by host GPU
|
// Uploads a range of host pages - only called if host GPU sparse memory
|
||||||
// drawing / computations / copying, but doesn't upload anything.
|
// allocation succeeded if needed. While uploading, MakeRangeValid must be
|
||||||
virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
|
// called for each successfully uploaded range as early as possible, before
|
||||||
uint32_t length) = 0;
|
// the memcpy, to make sure invalidation that happened during the CPU -> GPU
|
||||||
|
// memcpy isn't missed (upload_page_ranges is in pages because of this -
|
||||||
// Uploads a range of host pages - only called if EnsureHostGpuMemoryAllocated
|
// MakeRangeValid has page granularity). upload_page_ranges are sorted in
|
||||||
// succeeded. While uploading, MarkRangeValid must be called for each
|
// ascending address order, so front and back can be used to determine the
|
||||||
// successfully uploaded range as early as possible, before the memcpy, to
|
// overall bounds of pages to be uploaded.
|
||||||
// make sure invalidation that happened during the CPU -> GPU memcpy isn't
|
|
||||||
// missed (upload_page_ranges is in pages because of this - MarkRangeValid has
|
|
||||||
// page granularity).
|
|
||||||
virtual bool UploadRanges(
|
virtual bool UploadRanges(
|
||||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
|
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
|
||||||
|
|
||||||
// Mutable so the implementation can skip ranges by setting their "second"
|
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
|
||||||
// value to 0 if needed.
|
|
||||||
std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
|
|
||||||
return trace_download_ranges_;
|
return trace_download_ranges_;
|
||||||
}
|
}
|
||||||
uint32_t trace_download_page_count() const {
|
uint32_t trace_download_page_count() const {
|
||||||
|
@ -153,6 +156,12 @@ class SharedMemory {
|
||||||
// touched pages of the buffer of this size will be invalidated).
|
// touched pages of the buffer of this size will be invalidated).
|
||||||
uint32_t page_size_log2_;
|
uint32_t page_size_log2_;
|
||||||
|
|
||||||
|
bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length);
|
||||||
|
uint32_t host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
|
||||||
|
std::vector<uint64_t> host_gpu_memory_sparse_allocated_;
|
||||||
|
uint32_t host_gpu_memory_sparse_allocations_ = 0;
|
||||||
|
uint32_t host_gpu_memory_sparse_used_bytes_ = 0;
|
||||||
|
|
||||||
void* memory_invalidation_callback_handle_ = nullptr;
|
void* memory_invalidation_callback_handle_ = nullptr;
|
||||||
void* memory_data_provider_handle_ = nullptr;
|
void* memory_data_provider_handle_ = nullptr;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue