[GPU] Split SharedMemory into common and D3D12
parent 6cb8f0aab4
commit 9f404090d3

@@ -1161,7 +1161,7 @@ bool D3D12CommandProcessor::SetupContext() {
  }

  shared_memory_ =
-      std::make_unique<SharedMemory>(*this, *memory_, trace_writer_);
+      std::make_unique<D3D12SharedMemory>(*this, *memory_, trace_writer_);
  if (!shared_memory_->Initialize()) {
    XELOGE("Failed to initialize shared memory");
    return false;
@@ -2259,7 +2259,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
    // Invalidate textures in memexported memory and watch for changes.
    for (uint32_t i = 0; i < memexport_range_count; ++i) {
      const MemExportRange& memexport_range = memexport_ranges[i];
-     shared_memory_->RangeWrittenByGPU(
+     shared_memory_->RangeWrittenByGpu(
          memexport_range.base_address_dwords << 2,
          memexport_range.size_dwords << 2);
    }
@@ -20,11 +20,11 @@
#include "xenia/base/assert.h"
#include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
+#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/deferred_command_list.h"
#include "xenia/gpu/d3d12/pipeline_cache.h"
#include "xenia/gpu/d3d12/primitive_converter.h"
#include "xenia/gpu/d3d12/render_target_cache.h"
-#include "xenia/gpu/d3d12/shared_memory.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/dxbc_shader_translator.h"
#include "xenia/gpu/xenos.h"
@@ -471,7 +471,7 @@ class D3D12CommandProcessor : public CommandProcessor {
  ID3D12RootSignature* root_signature_bindless_vs_ = nullptr;
  ID3D12RootSignature* root_signature_bindless_ds_ = nullptr;

-  std::unique_ptr<SharedMemory> shared_memory_;
+  std::unique_ptr<D3D12SharedMemory> shared_memory_;

  std::unique_ptr<PipelineCache> pipeline_cache_;

@@ -0,0 +1,459 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/d3d12/d3d12_shared_memory.h"

#include <cstring>
#include <utility>
#include <vector>

#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/ui/d3d12/d3d12_util.h"

DEFINE_bool(d3d12_tiled_shared_memory, true,
            "Enable tiled resources for shared memory emulation. Disabling "
            "them greatly increases video memory usage - a 512 MB buffer is "
            "created - but allows graphics debuggers that don't support tiled "
            "resources to work.",
            "D3D12");

namespace xe {
namespace gpu {
namespace d3d12 {

D3D12SharedMemory::D3D12SharedMemory(D3D12CommandProcessor& command_processor,
                                     Memory& memory, TraceWriter& trace_writer)
    : SharedMemory(memory),
      command_processor_(command_processor),
      trace_writer_(trace_writer) {}

D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); }

bool D3D12SharedMemory::Initialize() {
  InitializeCommon();

  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();

  D3D12_RESOURCE_DESC buffer_desc;
  ui::d3d12::util::FillBufferResourceDesc(
      buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
  buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
  if (AreTiledResourcesUsed()) {
    if (FAILED(device->CreateReservedResource(
            &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
      XELOGE("Shared memory: Failed to create the 512 MB tiled buffer");
      Shutdown();
      return false;
    }
  } else {
    XELOGGPU(
        "Direct3D 12 tiled resources are not used for shared memory "
        "emulation - video memory usage may increase significantly "
        "because a full 512 MB buffer will be created!");
    if (provider.GetGraphicsAnalysis() != nullptr) {
      // As of October 8th, 2018, PIX doesn't support tiled buffers.
      // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
      XELOGGPU(
          "This is caused by PIX being attached, which doesn't support tiled "
          "resources yet.");
    }
    if (FAILED(device->CreateCommittedResource(
            &ui::d3d12::util::kHeapPropertiesDefault,
            provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
            nullptr, IID_PPV_ARGS(&buffer_)))) {
      XELOGE("Shared memory: Failed to create the 512 MB buffer");
      Shutdown();
      return false;
    }
  }
  buffer_gpu_address_ = buffer_->GetGPUVirtualAddress();
  buffer_uav_writes_commit_needed_ = false;

  D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc;
  buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
  buffer_descriptor_heap_desc.NumDescriptors =
      uint32_t(BufferDescriptorIndex::kCount);
  buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
  buffer_descriptor_heap_desc.NodeMask = 0;
  if (FAILED(device->CreateDescriptorHeap(
          &buffer_descriptor_heap_desc,
          IID_PPV_ARGS(&buffer_descriptor_heap_)))) {
    XELOGE(
        "Failed to create the descriptor heap for shared memory buffer views");
    Shutdown();
    return false;
  }
  buffer_descriptor_heap_start_ =
      buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart();
  ui::d3d12::util::CreateBufferRawSRV(
      device,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawSRV)),
      buffer_, kBufferSize);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32UintSRV)),
      buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32UintSRV)),
      buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)),
      buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4);
  ui::d3d12::util::CreateBufferRawUAV(
      device,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawUAV)),
      buffer_, kBufferSize);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32UintUAV)),
      buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32UintUAV)),
      buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)),
      buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4);

  upload_buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
      provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize,
                          size_t(1) << page_size_log2()));

  return true;
}

void D3D12SharedMemory::Shutdown(bool from_destructor) {
  ResetTraceDownload();

  upload_buffer_pool_.reset();

  ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_);

  // First free the buffer to detach it from the heaps.
  ui::d3d12::util::ReleaseAndNull(buffer_);

  if (AreTiledResourcesUsed()) {
    for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
      ui::d3d12::util::ReleaseAndNull(heaps_[i]);
    }
    heap_count_ = 0;
    COUNT_profile_set("gpu/shared_memory/used_mb", 0);
  }

  // If calling from the destructor, the SharedMemory destructor will call
  // ShutdownCommon.
  if (!from_destructor) {
    ShutdownCommon();
  }
}

void D3D12SharedMemory::ClearCache() {
  SharedMemory::ClearCache();

  upload_buffer_pool_->ClearCache();

  // TODO(Triang3l): Unmap and destroy heaps.
}

void D3D12SharedMemory::CompletedSubmissionUpdated() {
  upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
}

bool D3D12SharedMemory::AreTiledResourcesUsed() const {
  if (!cvars::d3d12_tiled_shared_memory) {
    return false;
  }
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  // As of October 8th, 2018, PIX doesn't support tiled buffers.
  // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
  return provider.GetTiledResourcesTier() !=
             D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
         provider.GetGraphicsAnalysis() == nullptr;
}

void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
    D3D12_RESOURCE_STATES new_state) {
  if (buffer_state_ == new_state) {
    if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS &&
        buffer_uav_writes_commit_needed_) {
      command_processor_.PushUAVBarrier(buffer_);
      buffer_uav_writes_commit_needed_ = false;
    }
    return;
  }
  command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state);
  buffer_state_ = new_state;
  // "UAV -> anything" transition commits the writes implicitly.
  buffer_uav_writes_commit_needed_ = false;
}
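
// A minimal usage sketch (hypothetical caller code, not part of this commit)
// of how the UAV write tracking above is meant to be driven: while the buffer
// stays in the UNORDERED_ACCESS state, a repeated transition request only
// emits a UAV barrier, which is what orders dependent GPU writes.
//
//   shared_memory_->UseForWriting();              // Transition to UAV state.
//   // ...dispatch that writes resolved data into the buffer...
//   shared_memory_->MarkUAVWritesCommitNeeded();
//   shared_memory_->UseForWriting();              // Same state, so this emits
//                                                 // a UAV barrier instead.
//   // ...second dispatch touching an overlapping range...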

void D3D12SharedMemory::WriteRawSRVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();
  device->CopyDescriptorsSimple(
      1, handle,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawSRV)),
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}

void D3D12SharedMemory::WriteRawUAVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();
  device->CopyDescriptorsSimple(
      1, handle,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawUAV)),
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}

void D3D12SharedMemory::WriteUintPow2SRVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) {
  BufferDescriptorIndex descriptor_index;
  switch (element_size_bytes_pow2) {
    case 2:
      descriptor_index = BufferDescriptorIndex::kR32UintSRV;
      break;
    case 3:
      descriptor_index = BufferDescriptorIndex::kR32G32UintSRV;
      break;
    case 4:
      descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV;
      break;
    default:
      assert_unhandled_case(element_size_bytes_pow2);
      return;
  }
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();
  device->CopyDescriptorsSimple(
      1, handle,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(descriptor_index)),
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}

void D3D12SharedMemory::WriteUintPow2UAVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) {
  BufferDescriptorIndex descriptor_index;
  switch (element_size_bytes_pow2) {
    case 2:
      descriptor_index = BufferDescriptorIndex::kR32UintUAV;
      break;
    case 3:
      descriptor_index = BufferDescriptorIndex::kR32G32UintUAV;
      break;
    case 4:
      descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV;
      break;
    default:
      assert_unhandled_case(element_size_bytes_pow2);
      return;
  }
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();
  device->CopyDescriptorsSimple(
      1, handle,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(descriptor_index)),
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}

bool D3D12SharedMemory::InitializeTraceSubmitDownloads() {
  ResetTraceDownload();
  PrepareForTraceDownload();
  uint32_t download_page_count = trace_download_page_count();
  // Request downloading of GPU-written memory.
  if (!download_page_count) {
    return false;
  }
  D3D12_RESOURCE_DESC download_buffer_desc;
  ui::d3d12::util::FillBufferResourceDesc(
      download_buffer_desc, download_page_count << page_size_log2(),
      D3D12_RESOURCE_FLAG_NONE);
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();
  if (FAILED(device->CreateCommittedResource(
          &ui::d3d12::util::kHeapPropertiesReadback,
          provider.GetHeapFlagCreateNotZeroed(), &download_buffer_desc,
          D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
          IID_PPV_ARGS(&trace_download_buffer_)))) {
    XELOGE(
        "Shared memory: Failed to create a {} KB GPU-written memory download "
        "buffer for frame tracing",
        download_page_count << page_size_log2() >> 10);
    ResetTraceDownload();
    return false;
  }
  auto& command_list = command_processor_.GetDeferredCommandList();
  UseAsCopySource();
  command_processor_.SubmitBarriers();
  uint32_t download_buffer_offset = 0;
  for (auto& download_range : trace_download_ranges()) {
    if (!EnsureHostGpuMemoryAllocated(download_range.first,
                                      download_range.second)) {
      download_range.second = 0;
      continue;
    }
    command_list.D3DCopyBufferRegion(
        trace_download_buffer_, download_buffer_offset, buffer_,
        download_range.first, download_range.second);
    download_buffer_offset += download_range.second;
  }
  return true;
}

void D3D12SharedMemory::InitializeTraceCompleteDownloads() {
  if (!trace_download_buffer_) {
    return;
  }
  void* download_mapping;
  if (SUCCEEDED(trace_download_buffer_->Map(0, nullptr, &download_mapping))) {
    uint32_t download_buffer_offset = 0;
    for (auto download_range : trace_download_ranges()) {
      trace_writer_.WriteMemoryRead(
          download_range.first, download_range.second,
          reinterpret_cast<const uint8_t*>(download_mapping) +
              download_buffer_offset);
    }
    D3D12_RANGE download_write_range = {};
    trace_download_buffer_->Unmap(0, &download_write_range);
  } else {
    XELOGE(
        "Failed to map the GPU-written memory download buffer for frame "
        "tracing");
  }
  ResetTraceDownload();
}

void D3D12SharedMemory::ResetTraceDownload() {
  ui::d3d12::util::ReleaseAndNull(trace_download_buffer_);
  ReleaseTraceDownloadRanges();
}

bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
                                                     uint32_t length) {
  if (!length || !AreTiledResourcesUsed()) {
    return true;
  }
  uint32_t heap_first = start >> kHeapSizeLog2;
  uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2;
  assert_true(heap_first < xe::countof(heaps_));
  assert_true(heap_last < xe::countof(heaps_));
  for (uint32_t i = heap_first; i <= heap_last; ++i) {
    if (heaps_[i] != nullptr) {
      continue;
    }
    auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
    auto device = provider.GetDevice();
    auto direct_queue = provider.GetDirectQueue();
    D3D12_HEAP_DESC heap_desc = {};
    heap_desc.SizeInBytes = kHeapSize;
    heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
    heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
                      provider.GetHeapFlagCreateNotZeroed();
    if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) {
      XELOGE("Shared memory: Failed to create a tile heap");
      return false;
    }
    ++heap_count_;
    COUNT_profile_set("gpu/shared_memory/used_mb",
                      heap_count_ << kHeapSizeLog2 >> 20);
    D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
    region_start_coordinates.X =
        (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    region_start_coordinates.Y = 0;
    region_start_coordinates.Z = 0;
    region_start_coordinates.Subresource = 0;
    D3D12_TILE_REGION_SIZE region_size;
    region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    region_size.UseBox = FALSE;
    D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
    UINT heap_range_start_offset = 0;
    UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    direct_queue->UpdateTileMappings(
        buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1,
        &range_flags, &heap_range_start_offset, &range_tile_count,
        D3D12_TILE_MAPPING_FLAG_NONE);
    command_processor_.NotifyQueueOperationsDoneDirectly();
  }
  return true;
}
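
// A worked example of the heap granularity above, assuming kHeapSizeLog2 is at
// its 16 (64 KB) lower bound - the real value comes from
// kOptimalAllocationLog2 in the common SharedMemory and may be larger:
//   start = 0x012A0000, length = 0x00030000
//   heap_first = 0x012A0000 >> 16 = 0x12A
//   heap_last  = (0x012A0000 + 0x00030000 - 1) >> 16 = 0x12C
// so heaps_[0x12A]..heaps_[0x12C] are created if missing, and each one is
// mapped onto the matching 64 KB-aligned window of the reserved buffer through
// UpdateTileMappings.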

bool D3D12SharedMemory::UploadRanges(
    const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) {
  if (upload_page_ranges.empty()) {
    return true;
  }
  CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
  command_processor_.SubmitBarriers();
  auto& command_list = command_processor_.GetDeferredCommandList();
  for (auto upload_range : upload_page_ranges) {
    uint32_t upload_range_start = upload_range.first;
    uint32_t upload_range_length = upload_range.second;
    trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(),
                                  upload_range_length << page_size_log2());
    while (upload_range_length != 0) {
      ID3D12Resource* upload_buffer;
      size_t upload_buffer_offset, upload_buffer_size;
      uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial(
          command_processor_.GetCurrentSubmission(),
          upload_range_length << page_size_log2(),
          size_t(1) << page_size_log2(), &upload_buffer, &upload_buffer_offset,
          &upload_buffer_size, nullptr);
      if (upload_buffer_mapping == nullptr) {
        XELOGE("Shared memory: Failed to get an upload buffer");
        return false;
      }
      MakeRangeValid(upload_range_start << page_size_log2(),
                     uint32_t(upload_buffer_size), false);
      std::memcpy(
          upload_buffer_mapping,
          memory().TranslatePhysical(upload_range_start << page_size_log2()),
          upload_buffer_size);
      command_list.D3DCopyBufferRegion(
          buffer_, upload_range_start << page_size_log2(), upload_buffer,
          UINT64(upload_buffer_offset), UINT64(upload_buffer_size));
      uint32_t upload_buffer_pages =
          uint32_t(upload_buffer_size >> page_size_log2());
      upload_range_start += upload_buffer_pages;
      upload_range_length -= upload_buffer_pages;
    }
  }
  return true;
}

}  // namespace d3d12
}  // namespace gpu
}  // namespace xe

@@ -0,0 +1,149 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_
#define XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_

#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "xenia/gpu/shared_memory.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/memory.h"
#include "xenia/ui/d3d12/d3d12_api.h"
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"

namespace xe {
namespace gpu {
namespace d3d12 {

class D3D12CommandProcessor;

class D3D12SharedMemory : public SharedMemory {
 public:
  D3D12SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory,
                    TraceWriter& trace_writer);
  ~D3D12SharedMemory() override;

  bool Initialize();
  void Shutdown(bool from_destructor = false);
  void ClearCache() override;

  ID3D12Resource* GetBuffer() const { return buffer_; }
  D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
    return buffer_gpu_address_;
  }

  void CompletedSubmissionUpdated();

  // RequestRange may transition the buffer to copy destination - call it before
  // UseForReading or UseForWriting.

  // Makes the buffer usable for vertices, indices and texture untiling.
  inline void UseForReading() {
    // Vertex fetch is also allowed in pixel shaders.
    CommitUAVWritesAndTransitionBuffer(
        D3D12_RESOURCE_STATE_INDEX_BUFFER |
        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE |
        D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
  }
  // Makes the buffer usable for texture tiling after a resolve.
  inline void UseForWriting() {
    CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
  }
  // Makes the buffer usable as a source for copy commands.
  inline void UseAsCopySource() {
    CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE);
  }
  // Must be called when doing draws/dispatches modifying data within the shared
  // memory buffer as a UAV, to make sure that when UseForWriting is called the
  // next time, a UAV barrier will be done, and subsequent overlapping UAV
  // writes and reads are ordered.
  inline void MarkUAVWritesCommitNeeded() {
    if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
      buffer_uav_writes_commit_needed_ = true;
    }
  }

  void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
  void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
  // Due to the Nvidia 128 megatexel limitation, the smallest supported formats
  // are 32-bit.
  void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
                                  uint32_t element_size_bytes_pow2);
  void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
                                  uint32_t element_size_bytes_pow2);

  // Returns true if any downloads were submitted to the command processor.
  bool InitializeTraceSubmitDownloads();
  void InitializeTraceCompleteDownloads();

 protected:
  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override;

  bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                        upload_page_ranges) override;

 private:
  bool AreTiledResourcesUsed() const;

  D3D12CommandProcessor& command_processor_;
  TraceWriter& trace_writer_;

  // The 512 MB tiled buffer.
  ID3D12Resource* buffer_ = nullptr;
  D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
  D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
  bool buffer_uav_writes_commit_needed_ = false;
  void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);

  static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
  static constexpr uint32_t kHeapSizeLog2 =
      std::max(kOptimalAllocationLog2, uint32_t(16));
  static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
  // Resident portions of the tiled buffer.
  ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
  // Number of the heaps currently resident, for profiling.
  uint32_t heap_count_ = 0;

  // Ensures the buffer tiles backing the range are resident, but doesn't upload
  // anything.
  bool EnsureTilesResident(uint32_t start, uint32_t length);

  // Non-shader-visible buffer descriptor heap for faster binding (via copying
  // rather than creation).
  enum class BufferDescriptorIndex : uint32_t {
    kRawSRV,
    kR32UintSRV,
    kR32G32UintSRV,
    kR32G32B32A32UintSRV,
    kRawUAV,
    kR32UintUAV,
    kR32G32UintUAV,
    kR32G32B32A32UintUAV,

    kCount,
  };
  ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr;
  D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_;

  std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> upload_buffer_pool_;

  // Created temporarily, only for downloading.
  ID3D12Resource* trace_download_buffer_ = nullptr;
  void ResetTraceDownload();
};

}  // namespace d3d12
}  // namespace gpu
}  // namespace xe

#endif // XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_
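
For orientation, a sketch of the call order the comments in this header
prescribe, from a hypothetical D3D12 caller (only the D3D12SharedMemory and
inherited SharedMemory members are real; the surrounding variables are
illustrative):

// Reading vertex/index/texture data for a draw:
if (!shared_memory.RequestRange(fetch_address, fetch_length)) {
  return false;  // Upload failed - the range can't be used.
}
shared_memory.UseForReading();  // After RequestRange, which may have left the
                                // buffer in the copy destination state.
shared_memory.WriteRawSRVDescriptor(srv_handle);

// Writing resolved data through a UAV:
shared_memory.UseForWriting();
shared_memory.WriteRawUAVDescriptor(uav_handle);
// ...dispatch...
shared_memory.MarkUAVWritesCommitNeeded();
shared_memory.RangeWrittenByGpu(resolve_address, resolve_length);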

@@ -1085,7 +1085,7 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) {
}

bool RenderTargetCache::Resolve(const Memory& memory,
-                                SharedMemory& shared_memory,
+                                D3D12SharedMemory& shared_memory,
                                TextureCache& texture_cache,
                                uint32_t& written_address_out,
                                uint32_t& written_length_out) {

@@ -15,7 +15,7 @@

#include "xenia/base/cvar.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
-#include "xenia/gpu/d3d12/shared_memory.h"
+#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/d3d12/texture_cache.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/register_file.h"

@@ -277,11 +277,11 @@ class RenderTargetCache {
  // register values, and also clears the EDRAM buffer if needed. Must be in a
  // frame for calling.

-  bool Resolve(const Memory& memory, SharedMemory& shared_memory,
+  bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory,
               TextureCache& texture_cache, uint32_t& written_address_out,
               uint32_t& written_length_out);

-  bool Resolve(SharedMemory* shared_memory, TextureCache* texture_cache,
+  bool Resolve(D3D12SharedMemory* shared_memory, TextureCache* texture_cache,
               Memory* memory, uint32_t& written_address_out,
               uint32_t& written_length_out);
  // Flushes the render targets to EDRAM and unbinds them, for instance, when

@@ -1,959 +0,0 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2018 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/d3d12/shared_memory.h"

#include <algorithm>
#include <cstring>
#include <utility>
#include <vector>

#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/ui/d3d12/d3d12_util.h"

DEFINE_bool(d3d12_tiled_shared_memory, true,
            "Enable tiled resources for shared memory emulation. Disabling "
            "them greatly increases video memory usage - a 512 MB buffer is "
            "created - but allows graphics debuggers that don't support tiled "
            "resources to work.",
            "D3D12");

namespace xe {
namespace gpu {
namespace d3d12 {

SharedMemory::SharedMemory(D3D12CommandProcessor& command_processor,
                           Memory& memory, TraceWriter& trace_writer)
    : command_processor_(command_processor),
      memory_(memory),
      trace_writer_(trace_writer) {
  page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size()));
  page_count_ = kBufferSize >> page_size_log2_;
}

SharedMemory::~SharedMemory() { Shutdown(); }

bool SharedMemory::Initialize() {
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  auto device = provider.GetDevice();

  D3D12_RESOURCE_DESC buffer_desc;
  ui::d3d12::util::FillBufferResourceDesc(
      buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
  buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
  if (AreTiledResourcesUsed()) {
    if (FAILED(device->CreateReservedResource(
            &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
      XELOGE("Shared memory: Failed to create the 512 MB tiled buffer");
      Shutdown();
      return false;
    }
  } else {
    XELOGGPU(
        "Direct3D 12 tiled resources are not used for shared memory "
        "emulation - video memory usage may increase significantly "
        "because a full 512 MB buffer will be created!");
    if (provider.GetGraphicsAnalysis() != nullptr) {
      // As of October 8th, 2018, PIX doesn't support tiled buffers.
      // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
      XELOGGPU(
          "This is caused by PIX being attached, which doesn't support tiled "
          "resources yet.");
    }
    if (FAILED(device->CreateCommittedResource(
            &ui::d3d12::util::kHeapPropertiesDefault,
            provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
            nullptr, IID_PPV_ARGS(&buffer_)))) {
      XELOGE("Shared memory: Failed to create the 512 MB buffer");
      Shutdown();
      return false;
    }
  }
  buffer_gpu_address_ = buffer_->GetGPUVirtualAddress();
  buffer_uav_writes_commit_needed_ = false;

  std::memset(heaps_, 0, sizeof(heaps_));
  heap_count_ = 0;

  D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc;
  buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
  buffer_descriptor_heap_desc.NumDescriptors =
      uint32_t(BufferDescriptorIndex::kCount);
  buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
  buffer_descriptor_heap_desc.NodeMask = 0;
  if (FAILED(device->CreateDescriptorHeap(
          &buffer_descriptor_heap_desc,
          IID_PPV_ARGS(&buffer_descriptor_heap_)))) {
    XELOGE(
        "Failed to create the descriptor heap for shared memory buffer views");
    Shutdown();
    return false;
  }
  buffer_descriptor_heap_start_ =
      buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart();
  ui::d3d12::util::CreateBufferRawSRV(
      device,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawSRV)),
      buffer_, kBufferSize);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32UintSRV)),
      buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32UintSRV)),
      buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3);
  ui::d3d12::util::CreateBufferTypedSRV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)),
      buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4);
  ui::d3d12::util::CreateBufferRawUAV(
      device,
      provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
                                    uint32_t(BufferDescriptorIndex::kRawUAV)),
      buffer_, kBufferSize);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32UintUAV)),
      buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32UintUAV)),
      buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3);
  ui::d3d12::util::CreateBufferTypedUAV(
      device,
      provider.OffsetViewDescriptor(
          buffer_descriptor_heap_start_,
          uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)),
      buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4);

  system_page_flags_.clear();
  system_page_flags_.resize((page_count_ + 63) / 64);

  upload_buffer_pool_ = std::make_unique<ui::d3d12::D3D12UploadBufferPool>(
      provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize,
                          size_t(1) << page_size_log2_));

  memory_invalidation_callback_handle_ =
      memory_.RegisterPhysicalMemoryInvalidationCallback(
          MemoryInvalidationCallbackThunk, this);

  ResetTraceGPUWrittenBuffer();

  return true;
}

void SharedMemory::Shutdown() {
  ResetTraceGPUWrittenBuffer();

  FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false);
  assert_true(global_watches_.empty());
  // No watches now, so no references to the pools accessible by guest threads -
  // safe not to enter the global critical region.
  watch_node_first_free_ = nullptr;
  watch_node_current_pool_allocated_ = 0;
  for (WatchNode* pool : watch_node_pools_) {
    delete[] pool;
  }
  watch_node_pools_.clear();
  watch_range_first_free_ = nullptr;
  watch_range_current_pool_allocated_ = 0;
  for (WatchRange* pool : watch_range_pools_) {
    delete[] pool;
  }
  watch_range_pools_.clear();

  if (memory_invalidation_callback_handle_ != nullptr) {
    memory_.UnregisterPhysicalMemoryInvalidationCallback(
        memory_invalidation_callback_handle_);
    memory_invalidation_callback_handle_ = nullptr;
  }

  upload_buffer_pool_.reset();

  ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_);

  // First free the buffer to detach it from the heaps.
  ui::d3d12::util::ReleaseAndNull(buffer_);

  if (AreTiledResourcesUsed()) {
    for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
      ui::d3d12::util::ReleaseAndNull(heaps_[i]);
    }
    heap_count_ = 0;
    COUNT_profile_set("gpu/shared_memory/used_mb", 0);
  }
}

void SharedMemory::ClearCache() {
  upload_buffer_pool_->ClearCache();

  // Keeping GPU-written data, so "invalidated by GPU".
  FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true);
  // No watches now, so no references to the pools accessible by guest threads -
  // safe not to enter the global critical region.
  watch_node_first_free_ = nullptr;
  watch_node_current_pool_allocated_ = 0;
  for (WatchNode* pool : watch_node_pools_) {
    delete[] pool;
  }
  watch_node_pools_.clear();
  watch_range_first_free_ = nullptr;
  watch_range_current_pool_allocated_ = 0;
  for (WatchRange* pool : watch_range_pools_) {
    delete[] pool;
  }
  watch_range_pools_.clear();

  {
    auto global_lock = global_critical_region_.Acquire();
    for (SystemPageFlagsBlock& block : system_page_flags_) {
      block.valid = block.valid_and_gpu_written;
    }
  }

  // TODO(Triang3l): Unmap and destroy heaps.
}

void SharedMemory::CompletedSubmissionUpdated() {
  upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
}

SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch(
    GlobalWatchCallback callback, void* callback_context) {
  GlobalWatch* watch = new GlobalWatch;
  watch->callback = callback;
  watch->callback_context = callback_context;

  auto global_lock = global_critical_region_.Acquire();
  global_watches_.push_back(watch);

  return reinterpret_cast<GlobalWatchHandle>(watch);
}

void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) {
  auto watch = reinterpret_cast<GlobalWatch*>(handle);

  {
    auto global_lock = global_critical_region_.Acquire();
    auto it = std::find(global_watches_.begin(), global_watches_.end(), watch);
    assert_false(it == global_watches_.end());
    if (it != global_watches_.end()) {
      global_watches_.erase(it);
    }
  }

  delete watch;
}

SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
    uint32_t start, uint32_t length, WatchCallback callback,
    void* callback_context, void* callback_data, uint64_t callback_argument) {
  if (length == 0 || start >= kBufferSize) {
    return nullptr;
  }
  length = std::min(length, kBufferSize - start);
  uint32_t watch_page_first = start >> page_size_log2_;
  uint32_t watch_page_last = (start + length - 1) >> page_size_log2_;
  uint32_t bucket_first =
      watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
  uint32_t bucket_last =
      watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;

  auto global_lock = global_critical_region_.Acquire();

  // Allocate the range.
  WatchRange* range = watch_range_first_free_;
  if (range != nullptr) {
    watch_range_first_free_ = range->next_free;
  } else {
    if (watch_range_pools_.empty() ||
        watch_range_current_pool_allocated_ >= kWatchRangePoolSize) {
      watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]);
      watch_range_current_pool_allocated_ = 0;
    }
    range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]);
  }
  range->callback = callback;
  range->callback_context = callback_context;
  range->callback_data = callback_data;
  range->callback_argument = callback_argument;
  range->page_first = watch_page_first;
  range->page_last = watch_page_last;

  // Allocate and link the nodes.
  WatchNode* node_previous = nullptr;
  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
    WatchNode* node = watch_node_first_free_;
    if (node != nullptr) {
      watch_node_first_free_ = node->next_free;
    } else {
      if (watch_node_pools_.empty() ||
          watch_node_current_pool_allocated_ >= kWatchNodePoolSize) {
        watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]);
        watch_node_current_pool_allocated_ = 0;
      }
      node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]);
    }
    node->range = range;
    node->range_node_next = nullptr;
    if (node_previous != nullptr) {
      node_previous->range_node_next = node;
    } else {
      range->node_first = node;
    }
    node_previous = node;
    node->bucket_node_previous = nullptr;
    node->bucket_node_next = watch_buckets_[i];
    if (watch_buckets_[i] != nullptr) {
      watch_buckets_[i]->bucket_node_previous = node;
    }
    watch_buckets_[i] = node;
  }

  return reinterpret_cast<WatchHandle>(range);
}

void SharedMemory::UnwatchMemoryRange(WatchHandle handle) {
  if (handle == nullptr) {
    // Could be a zero length range.
    return;
  }
  auto global_lock = global_critical_region_.Acquire();
  UnlinkWatchRange(reinterpret_cast<WatchRange*>(handle));
}

bool SharedMemory::EnsureTilesResident(uint32_t start, uint32_t length) {
  if (length == 0) {
    // Some texture is empty, for example - safe to draw in this case.
    return true;
  }
  if (start > kBufferSize || (kBufferSize - start) < length) {
    return false;
  }

  if (!AreTiledResourcesUsed()) {
    return true;
  }

  uint32_t heap_first = start >> kHeapSizeLog2;
  uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2;
  for (uint32_t i = heap_first; i <= heap_last; ++i) {
    if (heaps_[i] != nullptr) {
      continue;
    }
    auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
    auto device = provider.GetDevice();
    auto direct_queue = provider.GetDirectQueue();
    D3D12_HEAP_DESC heap_desc = {};
    heap_desc.SizeInBytes = kHeapSize;
    heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
    heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
                      provider.GetHeapFlagCreateNotZeroed();
    if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) {
      XELOGE("Shared memory: Failed to create a tile heap");
      return false;
    }
    ++heap_count_;
    COUNT_profile_set("gpu/shared_memory/used_mb",
                      heap_count_ << kHeapSizeLog2 >> 20);
    D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
    region_start_coordinates.X =
        (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    region_start_coordinates.Y = 0;
    region_start_coordinates.Z = 0;
    region_start_coordinates.Subresource = 0;
    D3D12_TILE_REGION_SIZE region_size;
    region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    region_size.UseBox = FALSE;
    D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
    UINT heap_range_start_offset = 0;
    UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
    direct_queue->UpdateTileMappings(
        buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1,
        &range_flags, &heap_range_start_offset, &range_tile_count,
        D3D12_TILE_MAPPING_FLAG_NONE);
    command_processor_.NotifyQueueOperationsDoneDirectly();
  }
  return true;
}

bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
  if (length == 0) {
    // Some texture is empty, for example - safe to draw in this case.
    return true;
  }
  if (start > kBufferSize || (kBufferSize - start) < length) {
    return false;
  }
  uint32_t last = start + length - 1;

  auto& command_list = command_processor_.GetDeferredCommandList();

#if FINE_GRAINED_DRAW_SCOPES
  SCOPE_profile_cpu_f("gpu");
#endif  // FINE_GRAINED_DRAW_SCOPES

  // Ensure all tile heaps are present.
  if (!EnsureTilesResident(start, length)) {
    return false;
  }

  // Upload and protect used ranges.
  GetRangesToUpload(start >> page_size_log2_, last >> page_size_log2_);
  if (upload_ranges_.size() == 0) {
    return true;
  }
  CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
  command_processor_.SubmitBarriers();
  for (auto upload_range : upload_ranges_) {
    uint32_t upload_range_start = upload_range.first;
    uint32_t upload_range_length = upload_range.second;
    trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2_,
                                  upload_range_length << page_size_log2_);
    while (upload_range_length != 0) {
      ID3D12Resource* upload_buffer;
      size_t upload_buffer_offset, upload_buffer_size;
      uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial(
          command_processor_.GetCurrentSubmission(),
          upload_range_length << page_size_log2_, size_t(1) << page_size_log2_,
          &upload_buffer, &upload_buffer_offset, &upload_buffer_size, nullptr);
      if (upload_buffer_mapping == nullptr) {
        XELOGE("Shared memory: Failed to get an upload buffer");
        return false;
      }
      MakeRangeValid(upload_range_start << page_size_log2_,
                     uint32_t(upload_buffer_size), false);
      std::memcpy(
          upload_buffer_mapping,
          memory_.TranslatePhysical(upload_range_start << page_size_log2_),
          upload_buffer_size);
      command_list.D3DCopyBufferRegion(
          buffer_, upload_range_start << page_size_log2_, upload_buffer,
          UINT64(upload_buffer_offset), UINT64(upload_buffer_size));
      uint32_t upload_buffer_pages =
          uint32_t(upload_buffer_size >> page_size_log2_);
      upload_range_start += upload_buffer_pages;
      upload_range_length -= upload_buffer_pages;
    }
  }

  return true;
}

void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
                               bool invalidated_by_gpu) {
  uint32_t address_first = page_first << page_size_log2_;
  uint32_t address_last =
      (page_last << page_size_log2_) + ((1 << page_size_log2_) - 1);
  uint32_t bucket_first = address_first >> kWatchBucketSizeLog2;
  uint32_t bucket_last = address_last >> kWatchBucketSizeLog2;

  auto global_lock = global_critical_region_.Acquire();

  // Fire global watches.
  for (const auto global_watch : global_watches_) {
    global_watch->callback(global_watch->callback_context, address_first,
                           address_last, invalidated_by_gpu);
  }

  // Fire per-range watches.
  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
    WatchNode* node = watch_buckets_[i];
    while (node != nullptr) {
      WatchRange* range = node->range;
      // Store the next node now since when the callback is triggered, the links
      // will be broken.
      node = node->bucket_node_next;
      if (page_first <= range->page_last && page_last >= range->page_first) {
        range->callback(range->callback_context, range->callback_data,
                        range->callback_argument, invalidated_by_gpu);
        UnlinkWatchRange(range);
      }
    }
  }
}

void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) {
  if (length == 0 || start >= kBufferSize) {
    return;
  }
  length = std::min(length, kBufferSize - start);
  uint32_t end = start + length - 1;
  uint32_t page_first = start >> page_size_log2_;
  uint32_t page_last = end >> page_size_log2_;

  // Trigger modification callbacks so, for instance, resolved data is loaded to
  // the texture.
  FireWatches(page_first, page_last, true);

  // Mark the range as valid (so pages are not reuploaded until modified by the
  // CPU) and watch it so the CPU can reuse it and this will be caught.
  MakeRangeValid(start, length, true);
}

bool SharedMemory::AreTiledResourcesUsed() const {
  if (!cvars::d3d12_tiled_shared_memory) {
    return false;
  }
  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
  // As of October 8th, 2018, PIX doesn't support tiled buffers.
  // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
  return provider.GetTiledResourcesTier() !=
             D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
         provider.GetGraphicsAnalysis() == nullptr;
}

void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
                                  bool written_by_gpu) {
  if (length == 0 || start >= kBufferSize) {
    return;
  }
  length = std::min(length, kBufferSize - start);
  uint32_t last = start + length - 1;
  uint32_t valid_page_first = start >> page_size_log2_;
  uint32_t valid_page_last = last >> page_size_log2_;
  uint32_t valid_block_first = valid_page_first >> 6;
  uint32_t valid_block_last = valid_page_last >> 6;

  {
    auto global_lock = global_critical_region_.Acquire();

    for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) {
      uint64_t valid_bits = UINT64_MAX;
      if (i == valid_block_first) {
        valid_bits &= ~((1ull << (valid_page_first & 63)) - 1);
      }
      if (i == valid_block_last && (valid_page_last & 63) != 63) {
        valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1;
      }
      SystemPageFlagsBlock& block = system_page_flags_[i];
      block.valid |= valid_bits;
      if (written_by_gpu) {
        block.valid_and_gpu_written |= valid_bits;
      } else {
        block.valid_and_gpu_written &= ~valid_bits;
      }
    }
  }

  if (memory_invalidation_callback_handle_) {
    memory_.EnablePhysicalMemoryAccessCallbacks(
        valid_page_first << page_size_log2_,
        (valid_page_last - valid_page_first + 1) << page_size_log2_, true,
        false);
  }
}

void SharedMemory::UnlinkWatchRange(WatchRange* range) {
  uint32_t bucket =
      range->page_first << page_size_log2_ >> kWatchBucketSizeLog2;
  WatchNode* node = range->node_first;
  while (node != nullptr) {
    WatchNode* node_next = node->range_node_next;
    if (node->bucket_node_previous != nullptr) {
      node->bucket_node_previous->bucket_node_next = node->bucket_node_next;
    } else {
      watch_buckets_[bucket] = node->bucket_node_next;
    }
    if (node->bucket_node_next != nullptr) {
      node->bucket_node_next->bucket_node_previous = node->bucket_node_previous;
    }
    node->next_free = watch_node_first_free_;
    watch_node_first_free_ = node;
    node = node_next;
    ++bucket;
  }
  range->next_free = watch_range_first_free_;
  watch_range_first_free_ = range;
}

void SharedMemory::GetRangesToUpload(uint32_t request_page_first,
                                     uint32_t request_page_last) {
  upload_ranges_.clear();
  request_page_last = std::min(request_page_last, page_count_ - 1u);
  if (request_page_first > request_page_last) {
    return;
  }
  uint32_t request_block_first = request_page_first >> 6;
  uint32_t request_block_last = request_page_last >> 6;

  auto global_lock = global_critical_region_.Acquire();

  uint32_t range_start = UINT32_MAX;
  for (uint32_t i = request_block_first; i <= request_block_last; ++i) {
    uint64_t block_valid = system_page_flags_[i].valid;
    // Consider pages in the block outside the requested range valid.
    if (i == request_block_first) {
      block_valid |= (1ull << (request_page_first & 63)) - 1;
    }
    if (i == request_block_last && (request_page_last & 63) != 63) {
      block_valid |= ~((1ull << ((request_page_last & 63) + 1)) - 1);
    }

    while (true) {
      uint32_t block_page;
      if (range_start == UINT32_MAX) {
        // Check if need to open a new range.
        if (!xe::bit_scan_forward(~block_valid, &block_page)) {
          break;
        }
        range_start = (i << 6) + block_page;
      } else {
        // Check if need to close the range.
        // Ignore the valid pages before the beginning of the range.
        uint64_t block_valid_from_start = block_valid;
        if (i == (range_start >> 6)) {
          block_valid_from_start &= ~((1ull << (range_start & 63)) - 1);
        }
        if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
          break;
        }
        upload_ranges_.push_back(
            std::make_pair(range_start, (i << 6) + block_page - range_start));
        // In the next iteration within this block, consider this range valid
        // since it has been queued for upload.
        block_valid |= (1ull << block_page) - 1;
        range_start = UINT32_MAX;
      }
    }
  }
  if (range_start != UINT32_MAX) {
    upload_ranges_.push_back(
        std::make_pair(range_start, request_page_last + 1 - range_start));
  }
}
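
// A worked pass through the scan above with hypothetical numbers: one 64-page
// block (i == 0) where pages 0-9 and 12-63 are valid and pages 10-11 need
// uploading, with the request spanning the whole block.
//   block_valid:                      bits 10 and 11 clear, all others set
//   bit_scan_forward(~block_valid) -> 10, so range_start = 10
//   bit_scan_forward(block_valid_from_start) -> 12,
//     so push_back({10, (0 << 6) + 12 - 10}) == {10, 2}
// The ranges are in pages; RequestRange converts them to byte addresses by
// shifting with page_size_log2_.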

std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
    void* context_ptr, uint32_t physical_address_start, uint32_t length,
    bool exact_range) {
  return reinterpret_cast<SharedMemory*>(context_ptr)
      ->MemoryInvalidationCallback(physical_address_start, length, exact_range);
}

std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
    uint32_t physical_address_start, uint32_t length, bool exact_range) {
  if (length == 0 || physical_address_start >= kBufferSize) {
    return std::make_pair(uint32_t(0), UINT32_MAX);
  }
  length = std::min(length, kBufferSize - physical_address_start);
  uint32_t physical_address_last = physical_address_start + (length - 1);

  uint32_t page_first = physical_address_start >> page_size_log2_;
  uint32_t page_last = physical_address_last >> page_size_log2_;
  assert_true(page_first < page_count_ && page_last < page_count_);
  uint32_t block_first = page_first >> 6;
  uint32_t block_last = page_last >> 6;

  auto global_lock = global_critical_region_.Acquire();

  if (!exact_range) {
    // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
    // invalidated - if no GPU-written data nearby that was not intended to be
    // invalidated since it's not in sync with CPU memory and can't be
    // reuploaded. It's a lot cheaper to upload some excess data than to catch
    // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
    // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
    // frame, but with 256 KB it's 0.7 ms.
    if (page_first & 63) {
      uint64_t gpu_written_start =
          system_page_flags_[block_first].valid_and_gpu_written;
      gpu_written_start &= (1ull << (page_first & 63)) - 1;
      page_first =
          (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
    }
    if ((page_last & 63) != 63) {
      uint64_t gpu_written_end =
          system_page_flags_[block_last].valid_and_gpu_written;
      gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1);
      page_last = (page_last & ~uint32_t(63)) +
                  (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
    }
  }

  for (uint32_t i = block_first; i <= block_last; ++i) {
    uint64_t invalidate_bits = UINT64_MAX;
    if (i == block_first) {
      invalidate_bits &= ~((1ull << (page_first & 63)) - 1);
    }
    if (i == block_last && (page_last & 63) != 63) {
      invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1;
    }
    SystemPageFlagsBlock& block = system_page_flags_[i];
    block.valid &= ~invalidate_bits;
    block.valid_and_gpu_written &= ~invalidate_bits;
  }

  FireWatches(page_first, page_last, false);

  return std::make_pair(page_first << page_size_log2_,
                        (page_last - page_first + 1) << page_size_log2_);
}
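
// A worked example of the widening above with hypothetical numbers (4 KB
// pages, so one 64-page block spans 256 KB): a CPU write lands on page 70
// only, i.e. page_first == page_last == 70 (bit 6 of block 1), and block 1 has
// no GPU-written pages at all.
//   gpu_written_start == 0, xe::lzcnt(0) == 64, so page_first -> 64
//   gpu_written_end == 0, xe::tzcnt(0) == 64, so page_last -> 64 + 63 = 127
// The whole block is invalidated and re-protected in one go, which is what
// makes the coarse 256 KB callback granularity described above cheap.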
|
||||
|
||||
void SharedMemory::CommitUAVWritesAndTransitionBuffer(
|
||||
D3D12_RESOURCE_STATES new_state) {
|
||||
if (buffer_state_ == new_state) {
|
||||
if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS &&
|
||||
buffer_uav_writes_commit_needed_) {
|
||||
command_processor_.PushUAVBarrier(buffer_);
|
||||
buffer_uav_writes_commit_needed_ = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state);
|
||||
buffer_state_ = new_state;
|
||||
// "UAV -> anything" transition commits the writes implicitly.
|
||||
buffer_uav_writes_commit_needed_ = false;
|
||||
}
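For context, the state-transition helpers layered on top of this function (UseForReading, UseForWriting, UseAsCopySource and MarkUAVWritesCommitNeeded, declared in the shared memory header) are meant to be called in roughly the following order around a GPU write into the buffer. This is only an illustrative sketch of a hypothetical caller, not code from this commit:

// Hypothetical resolve-style caller, assuming a D3D12SharedMemory& shared_memory,
// a D3D12CommandProcessor& command_processor, and placeholder write bounds.
shared_memory.UseForWriting();              // transition to UNORDERED_ACCESS
command_processor.SubmitBarriers();
// ... dispatch a compute shader that writes guest memory through the UAV ...
shared_memory.MarkUAVWritesCommitNeeded();  // order later UAV accesses with a UAV barrier
shared_memory.RangeWrittenByGpu(written_base_address, written_length);
shared_memory.UseForReading();              // back to index buffer / SRV states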
|
||||
|
||||
void SharedMemory::WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
|
||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||
auto device = provider.GetDevice();
|
||||
device->CopyDescriptorsSimple(
|
||||
1, handle,
|
||||
provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
|
||||
uint32_t(BufferDescriptorIndex::kRawSRV)),
|
||||
D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
|
||||
}
|
||||
|
||||
void SharedMemory::WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) {
|
||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||
auto device = provider.GetDevice();
|
||||
device->CopyDescriptorsSimple(
|
||||
1, handle,
|
||||
provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
|
||||
uint32_t(BufferDescriptorIndex::kRawUAV)),
|
||||
D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
|
||||
}
|
||||
|
||||
void SharedMemory::WriteUintPow2SRVDescriptor(
|
||||
D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) {
|
||||
BufferDescriptorIndex descriptor_index;
|
||||
switch (element_size_bytes_pow2) {
|
||||
case 2:
|
||||
descriptor_index = BufferDescriptorIndex::kR32UintSRV;
|
||||
break;
|
||||
case 3:
|
||||
descriptor_index = BufferDescriptorIndex::kR32G32UintSRV;
|
||||
break;
|
||||
case 4:
|
||||
descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV;
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(element_size_bytes_pow2);
|
||||
return;
|
||||
}
|
||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||
auto device = provider.GetDevice();
|
||||
device->CopyDescriptorsSimple(
|
||||
1, handle,
|
||||
provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
|
||||
uint32_t(descriptor_index)),
|
||||
D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
|
||||
}
|
||||
|
||||
void SharedMemory::WriteUintPow2UAVDescriptor(
|
||||
D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) {
|
||||
BufferDescriptorIndex descriptor_index;
|
||||
switch (element_size_bytes_pow2) {
|
||||
case 2:
|
||||
descriptor_index = BufferDescriptorIndex::kR32UintUAV;
|
||||
break;
|
||||
case 3:
|
||||
descriptor_index = BufferDescriptorIndex::kR32G32UintUAV;
|
||||
break;
|
||||
case 4:
|
||||
descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV;
|
||||
break;
|
||||
default:
|
||||
assert_unhandled_case(element_size_bytes_pow2);
|
||||
return;
|
||||
}
|
||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||
auto device = provider.GetDevice();
|
||||
device->CopyDescriptorsSimple(
|
||||
1, handle,
|
||||
provider.OffsetViewDescriptor(buffer_descriptor_heap_start_,
|
||||
uint32_t(descriptor_index)),
|
||||
D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
|
||||
}
|
||||
|
||||
bool SharedMemory::InitializeTraceSubmitDownloads() {
|
||||
// Invalidate the entire memory CPU->GPU memory copy so all the history
|
||||
// doesn't have to be written into every frame trace, and collect the list of
|
||||
// ranges with data modified on the GPU.
|
||||
ResetTraceGPUWrittenBuffer();
|
||||
uint32_t gpu_written_page_count = 0;
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
uint32_t fire_watches_range_start = UINT32_MAX;
|
||||
uint32_t gpu_written_range_start = UINT32_MAX;
|
||||
for (uint32_t i = 0; i < system_page_flags_.size(); ++i) {
|
||||
SystemPageFlagsBlock& page_flags_block = system_page_flags_[i];
|
||||
uint64_t previously_valid_block = page_flags_block.valid;
|
||||
uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written;
|
||||
page_flags_block.valid = gpu_written_block;
|
||||
|
||||
// Fire watches on the invalidated pages.
|
||||
uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block;
|
||||
uint64_t fire_watches_break_block = ~fire_watches_block;
|
||||
while (true) {
|
||||
uint32_t fire_watches_block_page;
|
||||
if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX
|
||||
? fire_watches_block
|
||||
: fire_watches_break_block,
|
||||
&fire_watches_block_page)) {
|
||||
break;
|
||||
}
|
||||
uint32_t fire_watches_page = (i << 6) + fire_watches_block_page;
|
||||
if (fire_watches_range_start == UINT32_MAX) {
|
||||
fire_watches_range_start = fire_watches_page;
|
||||
} else {
|
||||
FireWatches(fire_watches_range_start, fire_watches_page - 1, false);
|
||||
fire_watches_range_start = UINT32_MAX;
|
||||
}
|
||||
uint64_t fire_watches_block_mask =
|
||||
~((1ull << fire_watches_block_page) - 1);
|
||||
fire_watches_block &= fire_watches_block_mask;
|
||||
fire_watches_break_block &= fire_watches_block_mask;
|
||||
}
|
||||
|
||||
// Add to the GPU-written ranges.
|
||||
uint64_t gpu_written_break_block = ~gpu_written_block;
|
||||
while (true) {
|
||||
uint32_t gpu_written_block_page;
|
||||
if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX
|
||||
? gpu_written_block
|
||||
: gpu_written_break_block,
|
||||
&gpu_written_block_page)) {
|
||||
break;
|
||||
}
|
||||
uint32_t gpu_written_page = (i << 6) + gpu_written_block_page;
|
||||
if (gpu_written_range_start == UINT32_MAX) {
|
||||
gpu_written_range_start = gpu_written_page;
|
||||
} else {
|
||||
uint32_t gpu_written_range_length =
|
||||
gpu_written_page - gpu_written_range_start;
|
||||
trace_gpu_written_ranges_.push_back(
|
||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||
gpu_written_range_length << page_size_log2_));
|
||||
gpu_written_page_count += gpu_written_range_length;
|
||||
gpu_written_range_start = UINT32_MAX;
|
||||
}
|
||||
uint64_t gpu_written_block_mask =
|
||||
~((1ull << gpu_written_block_page) - 1);
|
||||
gpu_written_block &= gpu_written_block_mask;
|
||||
gpu_written_break_block &= gpu_written_block_mask;
|
||||
}
|
||||
}
|
||||
if (fire_watches_range_start != UINT32_MAX) {
|
||||
FireWatches(fire_watches_range_start, page_count_ - 1, false);
|
||||
}
|
||||
if (gpu_written_range_start != UINT32_MAX) {
|
||||
uint32_t gpu_written_range_length = page_count_ - gpu_written_range_start;
|
||||
trace_gpu_written_ranges_.push_back(
|
||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||
gpu_written_range_length << page_size_log2_));
|
||||
gpu_written_page_count += gpu_written_range_length;
|
||||
}
|
||||
}
|
||||
|
||||
// Request downloading of GPU-written memory.
|
||||
if (!gpu_written_page_count) {
|
||||
return false;
|
||||
}
|
||||
D3D12_RESOURCE_DESC gpu_written_buffer_desc;
|
||||
ui::d3d12::util::FillBufferResourceDesc(
|
||||
gpu_written_buffer_desc, gpu_written_page_count << page_size_log2_,
|
||||
D3D12_RESOURCE_FLAG_NONE);
|
||||
auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
|
||||
auto device = provider.GetDevice();
|
||||
if (FAILED(device->CreateCommittedResource(
|
||||
&ui::d3d12::util::kHeapPropertiesReadback,
|
||||
provider.GetHeapFlagCreateNotZeroed(), &gpu_written_buffer_desc,
|
||||
D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
|
||||
IID_PPV_ARGS(&trace_gpu_written_buffer_)))) {
|
||||
XELOGE(
|
||||
"Shared memory: Failed to create a {} KB GPU-written memory download "
|
||||
"buffer for frame tracing",
|
||||
gpu_written_page_count << page_size_log2_ >> 10);
|
||||
ResetTraceGPUWrittenBuffer();
|
||||
return false;
|
||||
}
|
||||
auto& command_list = command_processor_.GetDeferredCommandList();
|
||||
UseAsCopySource();
|
||||
command_processor_.SubmitBarriers();
|
||||
uint32_t gpu_written_buffer_offset = 0;
|
||||
for (auto& gpu_written_submit_range : trace_gpu_written_ranges_) {
|
||||
// For cases like resolution scale, when the data may not be actually
|
||||
// written, just marked as valid.
|
||||
if (!EnsureTilesResident(gpu_written_submit_range.first,
|
||||
gpu_written_submit_range.second)) {
|
||||
gpu_written_submit_range.second = 0;
|
||||
continue;
|
||||
}
|
||||
command_list.D3DCopyBufferRegion(
|
||||
trace_gpu_written_buffer_, gpu_written_buffer_offset, buffer_,
|
||||
gpu_written_submit_range.first, gpu_written_submit_range.second);
|
||||
gpu_written_buffer_offset += gpu_written_submit_range.second;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void SharedMemory::InitializeTraceCompleteDownloads() {
|
||||
if (!trace_gpu_written_buffer_) {
|
||||
return;
|
||||
}
|
||||
void* download_mapping;
|
||||
if (SUCCEEDED(
|
||||
trace_gpu_written_buffer_->Map(0, nullptr, &download_mapping))) {
|
||||
uint32_t gpu_written_buffer_offset = 0;
|
||||
for (auto gpu_written_submit_range : trace_gpu_written_ranges_) {
|
||||
trace_writer_.WriteMemoryRead(
|
||||
gpu_written_submit_range.first, gpu_written_submit_range.second,
|
||||
reinterpret_cast<const uint8_t*>(download_mapping) +
|
||||
gpu_written_buffer_offset);
|
||||
}
|
||||
D3D12_RANGE download_write_range = {};
|
||||
trace_gpu_written_buffer_->Unmap(0, &download_write_range);
|
||||
} else {
|
||||
XELOGE(
|
||||
"Failed to map the GPU-written memory download buffer for frame "
|
||||
"tracing");
|
||||
}
|
||||
ResetTraceGPUWrittenBuffer();
|
||||
}
|
||||
|
||||
void SharedMemory::ResetTraceGPUWrittenBuffer() {
|
||||
trace_gpu_written_ranges_.clear();
|
||||
trace_gpu_written_ranges_.shrink_to_fit();
|
||||
ui::d3d12::util::ReleaseAndNull(trace_gpu_written_buffer_);
|
||||
}
|
||||
|
||||
} // namespace d3d12
|
||||
} // namespace gpu
|
||||
} // namespace xe
@ -832,7 +832,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = {
TextureCache::TextureCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file,
bool bindless_resources_used,
SharedMemory& shared_memory)
D3D12SharedMemory& shared_memory)
: command_processor_(command_processor),
register_file_(register_file),
bindless_resources_used_(bindless_resources_used),
@ -1604,7 +1604,7 @@ void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled,

// Invalidate textures. Toggling individual textures between scaled and
// unscaled also relies on invalidation through shared memory.
shared_memory_.RangeWrittenByGPU(start_unscaled, length_unscaled);
shared_memory_.RangeWrittenByGpu(start_unscaled, length_unscaled);
}

bool TextureCache::EnsureScaledResolveBufferResident(uint32_t start_unscaled,
@ -17,7 +17,7 @@

#include "xenia/base/mutex.h"
#include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/d3d12/shared_memory.h"
#include "xenia/gpu/d3d12/d3d12_shared_memory.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/xenos.h"
@ -169,7 +169,7 @@ class TextureCache {

TextureCache(D3D12CommandProcessor& command_processor,
const RegisterFile& register_file, bool bindless_resources_used,
SharedMemory& shared_memory);
D3D12SharedMemory& shared_memory);
~TextureCache();

bool Initialize(bool edram_rov_used);
@ -546,7 +546,7 @@ class TextureCache {
D3D12CommandProcessor& command_processor_;
const RegisterFile& register_file_;
bool bindless_resources_used_;
SharedMemory& shared_memory_;
D3D12SharedMemory& shared_memory_;

static const LoadModeInfo load_mode_info_[];
ID3D12RootSignature* load_root_signature_ = nullptr;
@ -0,0 +1,541 @@
|
|||
/**
|
||||
******************************************************************************
|
||||
* Xenia : Xbox 360 Emulator Research Project *
|
||||
******************************************************************************
|
||||
* Copyright 2020 Ben Vanik. All rights reserved. *
|
||||
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#include "xenia/gpu/shared_memory.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
#include "xenia/base/assert.h"
|
||||
#include "xenia/base/math.h"
|
||||
#include "xenia/base/memory.h"
|
||||
#include "xenia/base/profiling.h"
|
||||
#include "xenia/memory.h"
|
||||
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
|
||||
SharedMemory::SharedMemory(Memory& memory) : memory_(memory) {
|
||||
page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size()));
|
||||
}
|
||||
|
||||
SharedMemory::~SharedMemory() { ShutdownCommon(); }
|
||||
|
||||
void SharedMemory::InitializeCommon() {
|
||||
system_page_flags_.clear();
|
||||
system_page_flags_.resize(((kBufferSize >> page_size_log2_) + 63) / 64);
|
||||
|
||||
memory_invalidation_callback_handle_ =
|
||||
memory_.RegisterPhysicalMemoryInvalidationCallback(
|
||||
MemoryInvalidationCallbackThunk, this);
|
||||
}
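The sizing of the page-flags bitmap above is worth spelling out. A standalone illustration, assuming 4 KB host pages (page_size_log2_ == 12 - the real value comes from xe::memory::page_size()):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t kBufferSize = 512 << 20;  // 512 MB of Xenon physical memory
  const uint32_t page_size_log2 = 12;      // assumed 4 KB pages
  uint32_t page_count = kBufferSize >> page_size_log2;  // 131072 pages
  uint32_t block_count = (page_count + 63) / 64;        // 2048 bitmap blocks
  // Each SystemPageFlagsBlock tracks 64 pages in two uint64 bitfields
  // (valid, valid_and_gpu_written), so covering the whole 512 MB costs only
  // block_count * 16 bytes = 32 KB of bookkeeping.
  std::printf("%u pages, %u blocks\n", page_count, block_count);
  return 0;
}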
|
||||
|
||||
void SharedMemory::ShutdownCommon() {
|
||||
ReleaseTraceDownloadRanges();
|
||||
|
||||
FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false);
|
||||
assert_true(global_watches_.empty());
|
||||
// No watches now, so no references to the pools accessible by guest threads -
|
||||
// safe not to enter the global critical region.
|
||||
watch_node_first_free_ = nullptr;
|
||||
watch_node_current_pool_allocated_ = 0;
|
||||
for (WatchNode* pool : watch_node_pools_) {
|
||||
delete[] pool;
|
||||
}
|
||||
watch_node_pools_.clear();
|
||||
watch_range_first_free_ = nullptr;
|
||||
watch_range_current_pool_allocated_ = 0;
|
||||
for (WatchRange* pool : watch_range_pools_) {
|
||||
delete[] pool;
|
||||
}
|
||||
watch_range_pools_.clear();
|
||||
|
||||
if (memory_invalidation_callback_handle_ != nullptr) {
|
||||
memory_.UnregisterPhysicalMemoryInvalidationCallback(
|
||||
memory_invalidation_callback_handle_);
|
||||
memory_invalidation_callback_handle_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void SharedMemory::ClearCache() {
|
||||
// Keeping GPU-written data, so "invalidated by GPU".
|
||||
FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true);
|
||||
// No watches now, so no references to the pools accessible by guest threads -
|
||||
// safe not to enter the global critical region.
|
||||
watch_node_first_free_ = nullptr;
|
||||
watch_node_current_pool_allocated_ = 0;
|
||||
for (WatchNode* pool : watch_node_pools_) {
|
||||
delete[] pool;
|
||||
}
|
||||
watch_node_pools_.clear();
|
||||
watch_range_first_free_ = nullptr;
|
||||
watch_range_current_pool_allocated_ = 0;
|
||||
for (WatchRange* pool : watch_range_pools_) {
|
||||
delete[] pool;
|
||||
}
|
||||
watch_range_pools_.clear();
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (SystemPageFlagsBlock& block : system_page_flags_) {
|
||||
block.valid = block.valid_and_gpu_written;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch(
|
||||
GlobalWatchCallback callback, void* callback_context) {
|
||||
GlobalWatch* watch = new GlobalWatch;
|
||||
watch->callback = callback;
|
||||
watch->callback_context = callback_context;
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
global_watches_.push_back(watch);
|
||||
|
||||
return reinterpret_cast<GlobalWatchHandle>(watch);
|
||||
}
|
||||
|
||||
void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) {
|
||||
auto watch = reinterpret_cast<GlobalWatch*>(handle);
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
auto it = std::find(global_watches_.begin(), global_watches_.end(), watch);
|
||||
assert_false(it == global_watches_.end());
|
||||
if (it != global_watches_.end()) {
|
||||
global_watches_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
delete watch;
|
||||
}
|
||||
|
||||
SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
|
||||
uint32_t start, uint32_t length, WatchCallback callback,
|
||||
void* callback_context, void* callback_data, uint64_t callback_argument) {
|
||||
if (length == 0 || start >= kBufferSize) {
|
||||
return nullptr;
|
||||
}
|
||||
length = std::min(length, kBufferSize - start);
|
||||
uint32_t watch_page_first = start >> page_size_log2_;
|
||||
uint32_t watch_page_last = (start + length - 1) >> page_size_log2_;
|
||||
uint32_t bucket_first =
|
||||
watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
|
||||
uint32_t bucket_last =
|
||||
watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
||||
// Allocate the range.
|
||||
WatchRange* range = watch_range_first_free_;
|
||||
if (range != nullptr) {
|
||||
watch_range_first_free_ = range->next_free;
|
||||
} else {
|
||||
if (watch_range_pools_.empty() ||
|
||||
watch_range_current_pool_allocated_ >= kWatchRangePoolSize) {
|
||||
watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]);
|
||||
watch_range_current_pool_allocated_ = 0;
|
||||
}
|
||||
range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]);
|
||||
}
|
||||
range->callback = callback;
|
||||
range->callback_context = callback_context;
|
||||
range->callback_data = callback_data;
|
||||
range->callback_argument = callback_argument;
|
||||
range->page_first = watch_page_first;
|
||||
range->page_last = watch_page_last;
|
||||
|
||||
// Allocate and link the nodes.
|
||||
WatchNode* node_previous = nullptr;
|
||||
for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
|
||||
WatchNode* node = watch_node_first_free_;
|
||||
if (node != nullptr) {
|
||||
watch_node_first_free_ = node->next_free;
|
||||
} else {
|
||||
if (watch_node_pools_.empty() ||
|
||||
watch_node_current_pool_allocated_ >= kWatchNodePoolSize) {
|
||||
watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]);
|
||||
watch_node_current_pool_allocated_ = 0;
|
||||
}
|
||||
node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]);
|
||||
}
|
||||
node->range = range;
|
||||
node->range_node_next = nullptr;
|
||||
if (node_previous != nullptr) {
|
||||
node_previous->range_node_next = node;
|
||||
} else {
|
||||
range->node_first = node;
|
||||
}
|
||||
node_previous = node;
|
||||
node->bucket_node_previous = nullptr;
|
||||
node->bucket_node_next = watch_buckets_[i];
|
||||
if (watch_buckets_[i] != nullptr) {
|
||||
watch_buckets_[i]->bucket_node_previous = node;
|
||||
}
|
||||
watch_buckets_[i] = node;
|
||||
}
|
||||
|
||||
return reinterpret_cast<WatchHandle>(range);
|
||||
}
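A minimal sketch of how a client such as a texture cache entry might use this watch API. The callback signature matches the one invoked by FireWatches below; the Texture struct and its fields are hypothetical, not part of this commit:

// Hypothetical client, assuming SharedMemory& shared_memory and a Texture
// struct with guest_base / guest_size / base_watch_handle / pending_invalidation.
texture.base_watch_handle = shared_memory.WatchMemoryRange(
    texture.guest_base, texture.guest_size,
    [](void* context, void* data, uint64_t argument, bool invalidated_by_gpu) {
      // Called under the global critical region; watches are one-shot, so
      // FireWatches unlinks this range right after the callback.
      static_cast<Texture*>(data)->pending_invalidation = true;
    },
    /* callback_context */ nullptr, /* callback_data */ &texture,
    /* callback_argument */ 0);
// When the texture is destroyed before the watch has fired:
shared_memory.UnwatchMemoryRange(texture.base_watch_handle);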
|
||||
|
||||
void SharedMemory::UnwatchMemoryRange(WatchHandle handle) {
|
||||
if (handle == nullptr) {
|
||||
// Could be a zero length range.
|
||||
return;
|
||||
}
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
UnlinkWatchRange(reinterpret_cast<WatchRange*>(handle));
|
||||
}
|
||||
|
||||
void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
|
||||
bool invalidated_by_gpu) {
|
||||
uint32_t address_first = page_first << page_size_log2_;
|
||||
uint32_t address_last =
|
||||
(page_last << page_size_log2_) + ((1 << page_size_log2_) - 1);
|
||||
uint32_t bucket_first = address_first >> kWatchBucketSizeLog2;
|
||||
uint32_t bucket_last = address_last >> kWatchBucketSizeLog2;
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
||||
// Fire global watches.
|
||||
for (const auto global_watch : global_watches_) {
|
||||
global_watch->callback(global_watch->callback_context, address_first,
|
||||
address_last, invalidated_by_gpu);
|
||||
}
|
||||
|
||||
// Fire per-range watches.
|
||||
for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
|
||||
WatchNode* node = watch_buckets_[i];
|
||||
while (node != nullptr) {
|
||||
WatchRange* range = node->range;
|
||||
// Store the next node now since when the callback is triggered, the links
|
||||
// will be broken.
|
||||
node = node->bucket_node_next;
|
||||
if (page_first <= range->page_last && page_last >= range->page_first) {
|
||||
range->callback(range->callback_context, range->callback_data,
|
||||
range->callback_argument, invalidated_by_gpu);
|
||||
UnlinkWatchRange(range);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
|
||||
if (length == 0 || start >= kBufferSize) {
|
||||
return;
|
||||
}
|
||||
length = std::min(length, kBufferSize - start);
|
||||
uint32_t end = start + length - 1;
|
||||
uint32_t page_first = start >> page_size_log2_;
|
||||
uint32_t page_last = end >> page_size_log2_;
|
||||
|
||||
// Trigger modification callbacks so, for instance, resolved data is loaded to
|
||||
// the texture.
|
||||
FireWatches(page_first, page_last, true);
|
||||
|
||||
// Mark the range as valid (so pages are not reuploaded until modified by the
|
||||
// CPU) and watch it so the CPU can reuse it and this will be caught.
|
||||
MakeRangeValid(start, length, true);
|
||||
}
|
||||
|
||||
void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
|
||||
bool written_by_gpu) {
|
||||
if (length == 0 || start >= kBufferSize) {
|
||||
return;
|
||||
}
|
||||
length = std::min(length, kBufferSize - start);
|
||||
uint32_t last = start + length - 1;
|
||||
uint32_t valid_page_first = start >> page_size_log2_;
|
||||
uint32_t valid_page_last = last >> page_size_log2_;
|
||||
uint32_t valid_block_first = valid_page_first >> 6;
|
||||
uint32_t valid_block_last = valid_page_last >> 6;
|
||||
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
||||
for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) {
|
||||
uint64_t valid_bits = UINT64_MAX;
|
||||
if (i == valid_block_first) {
|
||||
valid_bits &= ~((uint64_t(1) << (valid_page_first & 63)) - 1);
|
||||
}
|
||||
if (i == valid_block_last && (valid_page_last & 63) != 63) {
|
||||
valid_bits &= (uint64_t(1) << ((valid_page_last & 63) + 1)) - 1;
|
||||
}
|
||||
SystemPageFlagsBlock& block = system_page_flags_[i];
|
||||
block.valid |= valid_bits;
|
||||
if (written_by_gpu) {
|
||||
block.valid_and_gpu_written |= valid_bits;
|
||||
} else {
|
||||
block.valid_and_gpu_written &= ~valid_bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (memory_invalidation_callback_handle_) {
|
||||
memory().EnablePhysicalMemoryAccessCallbacks(
|
||||
valid_page_first << page_size_log2_,
|
||||
(valid_page_last - valid_page_first + 1) << page_size_log2_, true,
|
||||
false);
|
||||
}
|
||||
}
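The same first/last-block masking pattern recurs in MemoryInvalidationCallback and RequestRange. A standalone sketch with concrete numbers, assuming 4 KB pages:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t page_size_log2 = 12;            // assumed 4 KB pages
  uint32_t start = 0x0001F000, length = 0x3000;  // pages 31..33, all in block 0
  uint32_t page_first = start >> page_size_log2;                 // 31
  uint32_t page_last = (start + length - 1) >> page_size_log2;   // 33
  for (uint32_t block = page_first >> 6; block <= (page_last >> 6); ++block) {
    uint64_t bits = UINT64_MAX;
    if (block == (page_first >> 6)) {
      // Clear bits for pages below the range (pages 0..30 here).
      bits &= ~((uint64_t(1) << (page_first & 63)) - 1);
    }
    if (block == (page_last >> 6) && (page_last & 63) != 63) {
      // Clear bits for pages above the range (pages 34..63 here).
      bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1;
    }
    // Prints 0x0000000380000000 for block 0 - bits 31..33 set.
    std::printf("block %u mask 0x%016llX\n", block, (unsigned long long)bits);
  }
  return 0;
}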
|
||||
|
||||
void SharedMemory::UnlinkWatchRange(WatchRange* range) {
|
||||
uint32_t bucket =
|
||||
range->page_first << page_size_log2_ >> kWatchBucketSizeLog2;
|
||||
WatchNode* node = range->node_first;
|
||||
while (node != nullptr) {
|
||||
WatchNode* node_next = node->range_node_next;
|
||||
if (node->bucket_node_previous != nullptr) {
|
||||
node->bucket_node_previous->bucket_node_next = node->bucket_node_next;
|
||||
} else {
|
||||
watch_buckets_[bucket] = node->bucket_node_next;
|
||||
}
|
||||
if (node->bucket_node_next != nullptr) {
|
||||
node->bucket_node_next->bucket_node_previous = node->bucket_node_previous;
|
||||
}
|
||||
node->next_free = watch_node_first_free_;
|
||||
watch_node_first_free_ = node;
|
||||
node = node_next;
|
||||
++bucket;
|
||||
}
|
||||
range->next_free = watch_range_first_free_;
|
||||
watch_range_first_free_ = range;
|
||||
}
|
||||
|
||||
bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
|
||||
if (!length) {
|
||||
// Some texture or buffer is empty, for example - safe to draw in this case.
|
||||
return true;
|
||||
}
|
||||
if (start > kBufferSize || (kBufferSize - start) < length) {
|
||||
return false;
|
||||
}
|
||||
uint32_t last = start + length - 1;
|
||||
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
if (!EnsureHostGpuMemoryAllocated(start, length)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t page_first = start >> page_size_log2_;
|
||||
uint32_t page_last = (start + length - 1) >> page_size_log2_;
|
||||
|
||||
upload_ranges_.clear();
|
||||
uint32_t block_first = page_first >> 6;
|
||||
uint32_t block_last = page_last >> 6;
|
||||
uint32_t range_start = UINT32_MAX;
|
||||
{
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (uint32_t i = block_first; i <= block_last; ++i) {
|
||||
uint64_t block_valid = system_page_flags_[i].valid;
|
||||
// Consider pages in the block outside the requested range valid.
|
||||
if (i == block_first) {
|
||||
block_valid |= (uint64_t(1) << (page_first & 63)) - 1;
|
||||
}
|
||||
if (i == block_last && (page_last & 63) != 63) {
|
||||
block_valid |= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1);
|
||||
}
|
||||
|
||||
while (true) {
|
||||
uint32_t block_page;
|
||||
if (range_start == UINT32_MAX) {
|
||||
// Check if need to open a new range.
|
||||
if (!xe::bit_scan_forward(~block_valid, &block_page)) {
|
||||
break;
|
||||
}
|
||||
range_start = (i << 6) + block_page;
|
||||
} else {
|
||||
// Check if need to close the range.
|
||||
// Ignore the valid pages before the beginning of the range.
|
||||
uint64_t block_valid_from_start = block_valid;
|
||||
if (i == (range_start >> 6)) {
|
||||
block_valid_from_start &=
|
||||
~((uint64_t(1) << (range_start & 63)) - 1);
|
||||
}
|
||||
if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
|
||||
break;
|
||||
}
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, (i << 6) + block_page - range_start));
|
||||
// In the next iteration within this block, consider this range valid
|
||||
// since it has been queued for upload.
|
||||
block_valid |= (uint64_t(1) << block_page) - 1;
|
||||
range_start = UINT32_MAX;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (range_start != UINT32_MAX) {
|
||||
upload_ranges_.push_back(
|
||||
std::make_pair(range_start, page_last + 1 - range_start));
|
||||
}
|
||||
if (upload_ranges_.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return UploadRanges(upload_ranges_);
|
||||
}
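For reference, this is the entry point the vertex/index buffer and texture paths go through before a draw. A hypothetical caller, with the fetch constant fields and log message being illustrative only:

// Hypothetical draw-time validation of a guest vertex buffer, assuming
// SharedMemory& shared_memory and a vertex fetch constant with dword fields.
uint32_t vb_address = fetch.address << 2;  // dwords to bytes
uint32_t vb_size = fetch.size << 2;        // dwords to bytes
if (!shared_memory.RequestRange(vb_address, vb_size)) {
  XELOGE("Failed to request vertex buffer range at 0x{:08X}", vb_address);
  return false;
}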
|
||||
|
||||
std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
|
||||
void* context_ptr, uint32_t physical_address_start, uint32_t length,
|
||||
bool exact_range) {
|
||||
return reinterpret_cast<SharedMemory*>(context_ptr)
|
||||
->MemoryInvalidationCallback(physical_address_start, length, exact_range);
|
||||
}
|
||||
|
||||
std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
|
||||
uint32_t physical_address_start, uint32_t length, bool exact_range) {
|
||||
if (length == 0 || physical_address_start >= kBufferSize) {
|
||||
return std::make_pair(uint32_t(0), UINT32_MAX);
|
||||
}
|
||||
length = std::min(length, kBufferSize - physical_address_start);
|
||||
uint32_t physical_address_last = physical_address_start + (length - 1);
|
||||
|
||||
uint32_t page_first = physical_address_start >> page_size_log2_;
|
||||
uint32_t page_last = physical_address_last >> page_size_log2_;
|
||||
uint32_t block_first = page_first >> 6;
|
||||
uint32_t block_last = page_last >> 6;
|
||||
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
|
||||
if (!exact_range) {
|
||||
// Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
|
||||
// invalidated - if no GPU-written data nearby that was not intended to be
|
||||
// invalidated since it's not in sync with CPU memory and can't be
|
||||
// reuploaded. It's a lot cheaper to upload some excess data than to catch
|
||||
// access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
|
||||
// on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
|
||||
// frame, but with 256 KB it's 0.7 ms.
|
||||
if (page_first & 63) {
|
||||
uint64_t gpu_written_start =
|
||||
system_page_flags_[block_first].valid_and_gpu_written;
|
||||
gpu_written_start &= (uint64_t(1) << (page_first & 63)) - 1;
|
||||
page_first =
|
||||
(page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
|
||||
}
|
||||
if ((page_last & 63) != 63) {
|
||||
uint64_t gpu_written_end =
|
||||
system_page_flags_[block_last].valid_and_gpu_written;
|
||||
gpu_written_end &= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1);
|
||||
page_last = (page_last & ~uint32_t(63)) +
|
||||
(std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = block_first; i <= block_last; ++i) {
|
||||
uint64_t invalidate_bits = UINT64_MAX;
|
||||
if (i == block_first) {
|
||||
invalidate_bits &= ~((uint64_t(1) << (page_first & 63)) - 1);
|
||||
}
|
||||
if (i == block_last && (page_last & 63) != 63) {
|
||||
invalidate_bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1;
|
||||
}
|
||||
SystemPageFlagsBlock& block = system_page_flags_[i];
|
||||
block.valid &= ~invalidate_bits;
|
||||
block.valid_and_gpu_written &= ~invalidate_bits;
|
||||
}
|
||||
|
||||
FireWatches(page_first, page_last, false);
|
||||
|
||||
return std::make_pair(page_first << page_size_log2_,
|
||||
(page_last - page_first + 1) << page_size_log2_);
|
||||
}
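A concrete illustration of the widening above, with hypothetical numbers and assuming 4 KB pages:

// The CPU writes one page, page 70 (block 1, bit 6), and exact_range is false.
// Nothing in block 1 is GPU-written, so:
//   gpu_written_start = 0, xe::lzcnt(0) == 64 -> page_first = 64 + (64 - 64) = 64
//   gpu_written_end   = 0, xe::tzcnt(0) == 64 -> page_last  = 64 + (64 - 1)  = 127
// i.e. the whole 256 KB block is invalidated and unprotected with one callback.
// If page 65 had been written by the GPU, lzcnt would stop the widening there:
//   gpu_written_start = uint64_t(1) << 1, xe::lzcnt == 62 -> page_first = 64 + 2 = 66
// so the GPU-written page stays valid while the rest is still coalesced.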
|
||||
|
||||
void SharedMemory::PrepareForTraceDownload() {
|
||||
ReleaseTraceDownloadRanges();
|
||||
assert_true(trace_download_ranges_.empty());
|
||||
assert_zero(trace_download_page_count_);
|
||||
|
||||
// Invalidate the entire memory CPU->GPU memory copy so all the history
|
||||
// doesn't have to be written into every frame trace, and collect the list of
|
||||
// ranges with data modified on the GPU.
|
||||
|
||||
uint32_t fire_watches_range_start = UINT32_MAX;
|
||||
uint32_t gpu_written_range_start = UINT32_MAX;
|
||||
auto global_lock = global_critical_region_.Acquire();
|
||||
for (uint32_t i = 0; i < system_page_flags_.size(); ++i) {
|
||||
SystemPageFlagsBlock& page_flags_block = system_page_flags_[i];
|
||||
uint64_t previously_valid_block = page_flags_block.valid;
|
||||
uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written;
|
||||
page_flags_block.valid = gpu_written_block;
|
||||
|
||||
// Fire watches on the invalidated pages.
|
||||
uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block;
|
||||
uint64_t fire_watches_break_block = ~fire_watches_block;
|
||||
while (true) {
|
||||
uint32_t fire_watches_block_page;
|
||||
if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX
|
||||
? fire_watches_block
|
||||
: fire_watches_break_block,
|
||||
&fire_watches_block_page)) {
|
||||
break;
|
||||
}
|
||||
uint32_t fire_watches_page = (i << 6) + fire_watches_block_page;
|
||||
if (fire_watches_range_start == UINT32_MAX) {
|
||||
fire_watches_range_start = fire_watches_page;
|
||||
} else {
|
||||
FireWatches(fire_watches_range_start, fire_watches_page - 1, false);
|
||||
fire_watches_range_start = UINT32_MAX;
|
||||
}
|
||||
uint64_t fire_watches_block_mask =
|
||||
~((uint64_t(1) << fire_watches_block_page) - 1);
|
||||
fire_watches_block &= fire_watches_block_mask;
|
||||
fire_watches_break_block &= fire_watches_block_mask;
|
||||
}
|
||||
|
||||
// Add to the GPU-written ranges.
|
||||
uint64_t gpu_written_break_block = ~gpu_written_block;
|
||||
while (true) {
|
||||
uint32_t gpu_written_block_page;
|
||||
if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX
|
||||
? gpu_written_block
|
||||
: gpu_written_break_block,
|
||||
&gpu_written_block_page)) {
|
||||
break;
|
||||
}
|
||||
uint32_t gpu_written_page = (i << 6) + gpu_written_block_page;
|
||||
if (gpu_written_range_start == UINT32_MAX) {
|
||||
gpu_written_range_start = gpu_written_page;
|
||||
} else {
|
||||
uint32_t gpu_written_range_length =
|
||||
gpu_written_page - gpu_written_range_start;
|
||||
trace_download_ranges_.push_back(
|
||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||
gpu_written_range_length << page_size_log2_));
|
||||
trace_download_page_count_ += gpu_written_range_length;
|
||||
gpu_written_range_start = UINT32_MAX;
|
||||
}
|
||||
uint64_t gpu_written_block_mask =
|
||||
~((uint64_t(1) << gpu_written_block_page) - 1);
|
||||
gpu_written_block &= gpu_written_block_mask;
|
||||
gpu_written_break_block &= gpu_written_block_mask;
|
||||
}
|
||||
}
|
||||
uint32_t page_count = kBufferSize >> page_size_log2_;
|
||||
if (fire_watches_range_start != UINT32_MAX) {
|
||||
FireWatches(fire_watches_range_start, page_count - 1, false);
|
||||
}
|
||||
if (gpu_written_range_start != UINT32_MAX) {
|
||||
uint32_t gpu_written_range_length = page_count - gpu_written_range_start;
|
||||
trace_download_ranges_.push_back(
|
||||
std::make_pair(gpu_written_range_start << page_size_log2_,
|
||||
gpu_written_range_length << page_size_log2_));
|
||||
trace_download_page_count_ += gpu_written_range_length;
|
||||
}
|
||||
}
|
||||
|
||||
void SharedMemory::ReleaseTraceDownloadRanges() {
|
||||
trace_download_ranges_.clear();
|
||||
trace_download_ranges_.shrink_to_fit();
|
||||
trace_download_page_count_ = 0;
|
||||
}
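How a backend is expected to drive these two helpers - a sketch of the flow the D3D12 implementation above follows; the copy-recording details are placeholders:

// In the backend's trace-submission path:
PrepareForTraceDownload();
if (trace_download_page_count()) {
  uint32_t readback_offset = 0;
  for (auto& range : trace_download_ranges()) {
    // range = <start address, length in bytes>. A range the backend cannot
    // download (e.g. not resident) can be skipped by setting range.second = 0.
    // ... record a GPU copy of [range.first, range.first + range.second)
    //     into a readback buffer at readback_offset ...
    readback_offset += range.second;
  }
}
// After the copies complete and the data has been written to the trace:
ReleaseTraceDownloadRanges();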
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace xe
@ -2,49 +2,32 @@
|
|||
******************************************************************************
|
||||
* Xenia : Xbox 360 Emulator Research Project *
|
||||
******************************************************************************
|
||||
* Copyright 2018 Ben Vanik. All rights reserved. *
|
||||
* Copyright 2020 Ben Vanik. All rights reserved. *
|
||||
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef XENIA_GPU_D3D12_SHARED_MEMORY_H_
|
||||
#define XENIA_GPU_D3D12_SHARED_MEMORY_H_
|
||||
#ifndef XENIA_GPU_SHARED_MEMORY_H_
|
||||
#define XENIA_GPU_SHARED_MEMORY_H_
|
||||
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "xenia/base/mutex.h"
|
||||
#include "xenia/gpu/trace_writer.h"
|
||||
#include "xenia/memory.h"
|
||||
#include "xenia/ui/d3d12/d3d12_api.h"
|
||||
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
|
||||
|
||||
namespace xe {
|
||||
namespace gpu {
|
||||
namespace d3d12 {
|
||||
|
||||
class D3D12CommandProcessor;
|
||||
|
||||
// Manages memory for unconverted textures, resolve targets, vertex and index
|
||||
// buffers that can be accessed from shaders with Xenon physical addresses, with
|
||||
// system page size granularity.
|
||||
class SharedMemory {
|
||||
public:
|
||||
SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory,
|
||||
TraceWriter& trace_writer);
|
||||
~SharedMemory();
|
||||
|
||||
bool Initialize();
|
||||
void Shutdown();
|
||||
void ClearCache();
|
||||
|
||||
ID3D12Resource* GetBuffer() const { return buffer_; }
|
||||
D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const {
|
||||
return buffer_gpu_address_;
|
||||
}
|
||||
|
||||
void CompletedSubmissionUpdated();
|
||||
virtual ~SharedMemory();
|
||||
// Call in the implementation-specific ClearCache.
|
||||
virtual void ClearCache();
|
||||
|
||||
typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first,
|
||||
uint32_t address_last,
|
||||
|
@ -86,10 +69,8 @@ class SharedMemory {
|
|||
void UnwatchMemoryRange(WatchHandle handle);
|
||||
|
||||
// Checks if the range has been updated, uploads new data if needed and
|
||||
// ensures the buffer tiles backing the range are resident. May transition the
|
||||
// tiled buffer to copy destination - call this before UseForReading or
|
||||
// UseForWriting. Returns true if the range has been fully updated and is
|
||||
// usable.
|
||||
// ensures the host GPU memory backing the range is resident. Returns true if
// the range has been fully updated and is usable.
|
||||
bool RequestRange(uint32_t start, uint32_t length);
|
||||
|
||||
// Marks the range and, if not exact_range, potentially its surroundings
|
||||
|
@ -106,124 +87,83 @@ class SharedMemory {
|
|||
// be called, to make sure, if the GPU writes don't overwrite *everything* in
|
||||
// the pages they touch, the CPU data is properly loaded to the unmodified
|
||||
// regions in those pages.
|
||||
void RangeWrittenByGPU(uint32_t start, uint32_t length);
|
||||
void RangeWrittenByGpu(uint32_t start, uint32_t length);
|
||||
|
||||
// Makes the buffer usable for vertices, indices and texture untiling.
|
||||
inline void UseForReading() {
|
||||
// Vertex fetch is also allowed in pixel shaders.
|
||||
CommitUAVWritesAndTransitionBuffer(
|
||||
D3D12_RESOURCE_STATE_INDEX_BUFFER |
|
||||
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE |
|
||||
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
|
||||
}
|
||||
// Makes the buffer usable for texture tiling after a resolve.
|
||||
inline void UseForWriting() {
|
||||
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
|
||||
}
|
||||
// Makes the buffer usable as a source for copy commands.
|
||||
inline void UseAsCopySource() {
|
||||
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE);
|
||||
}
|
||||
// Must be called when doing draws/dispatches modifying data within the shared
|
||||
// memory buffer as a UAV, to make sure that when UseForWriting is called the
|
||||
// next time, a UAV barrier will be done, and subsequent overlapping UAV
|
||||
// writes and reads are ordered.
|
||||
inline void MarkUAVWritesCommitNeeded() {
|
||||
if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
|
||||
buffer_uav_writes_commit_needed_ = true;
|
||||
}
|
||||
}
|
||||
protected:
|
||||
SharedMemory(Memory& memory);
|
||||
// Call in implementation-specific initialization.
|
||||
void InitializeCommon();
|
||||
// Call last in implementation-specific shutdown, also callable from the
|
||||
// destructor.
|
||||
void ShutdownCommon();
|
||||
|
||||
void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||
void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||
// Due to the Nvidia 128 megatexel limitation, the smallest supported formats
|
||||
// are 32-bit.
|
||||
void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
|
||||
uint32_t element_size_bytes_pow2);
|
||||
void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle,
|
||||
uint32_t element_size_bytes_pow2);
|
||||
static constexpr uint32_t kBufferSizeLog2 = 29;
|
||||
static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
|
||||
|
||||
// Returns true if any downloads were submitted to the command processor.
|
||||
bool InitializeTraceSubmitDownloads();
|
||||
void InitializeTraceCompleteDownloads();
|
||||
// Sparse allocations are 4 MB - large enough that not too many of them are
// needed, but small enough not to waste too much memory on padding (16 MB
// allocations waste too much).
|
||||
static constexpr uint32_t kOptimalAllocationLog2 = 22;
|
||||
static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
|
||||
|
||||
private:
|
||||
bool AreTiledResourcesUsed() const;
|
||||
Memory& memory() const { return memory_; }
|
||||
|
||||
uint32_t page_size_log2() const { return page_size_log2_; }
|
||||
|
||||
// Mark the memory range as updated and protect it.
|
||||
void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
|
||||
|
||||
D3D12CommandProcessor& command_processor_;
|
||||
// Ensures the host GPU memory backing the range is accessible by host GPU
|
||||
// drawing / computations / copying, but doesn't upload anything.
|
||||
virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
|
||||
uint32_t length) = 0;
|
||||
|
||||
// Uploads a range of host pages - only called if EnsureHostGpuMemoryAllocated
// succeeded. While uploading, MakeRangeValid must be called for each
// successfully uploaded range as early as possible, before the memcpy, to
// make sure invalidation that happened during the CPU -> GPU memcpy isn't
// missed (upload_page_ranges is in pages because of this - MakeRangeValid has
// page granularity).
|
||||
virtual bool UploadRanges(
|
||||
const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
|
||||
|
||||
// Mutable so the implementation can skip ranges by setting their "second"
|
||||
// value to 0 if needed.
|
||||
std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
|
||||
return trace_download_ranges_;
|
||||
}
|
||||
uint32_t trace_download_page_count() const {
|
||||
return trace_download_page_count_;
|
||||
}
|
||||
// Fills trace_download_ranges() and trace_download_page_count() with
// GPU-written ranges that need to be downloaded, and also invalidates
// non-GPU-written ranges so only the needed data - not all the collected
// data - will be written in the trace. trace_download_page_count() will be 0
// if there is nothing to download.
|
||||
void PrepareForTraceDownload();
|
||||
// Release memory used for trace download ranges, to be called after
|
||||
// downloading or in cases when download is dropped.
|
||||
void ReleaseTraceDownloadRanges();
|
||||
|
||||
private:
|
||||
Memory& memory_;
|
||||
TraceWriter& trace_writer_;
|
||||
|
||||
// The 512 MB tiled buffer.
|
||||
static constexpr uint32_t kBufferSizeLog2 = 29;
|
||||
static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
|
||||
ID3D12Resource* buffer_ = nullptr;
|
||||
D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
|
||||
D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||
bool buffer_uav_writes_commit_needed_ = false;
|
||||
void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
|
||||
|
||||
// Heaps are 4 MB, so not too many of them are allocated, but also not to
|
||||
// waste too much memory for padding (with 16 MB there's too much).
|
||||
static constexpr uint32_t kHeapSizeLog2 = 22;
|
||||
static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
|
||||
static_assert((kHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0,
|
||||
"Heap size must be a multiple of Direct3D tile size");
|
||||
// Resident portions of the tiled buffer.
|
||||
ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
|
||||
// Number of the heaps currently resident, for profiling.
|
||||
uint32_t heap_count_ = 0;
|
||||
|
||||
// Log2 of invalidation granularity (the system page size, but the dependency
|
||||
// on it is not hard - the access callback takes a range as an argument, and
|
||||
// touched pages of the buffer of this size will be invalidated).
|
||||
uint32_t page_size_log2_;
|
||||
// Total buffer page count.
|
||||
uint32_t page_count_;
|
||||
|
||||
// Ensures the buffer tiles backing the range are resident, but doesn't upload
|
||||
// anything.
|
||||
bool EnsureTilesResident(uint32_t start, uint32_t length);
|
||||
|
||||
// Non-shader-visible buffer descriptor heap for faster binding (via copying
|
||||
// rather than creation).
|
||||
enum class BufferDescriptorIndex : uint32_t {
|
||||
kRawSRV,
|
||||
kR32UintSRV,
|
||||
kR32G32UintSRV,
|
||||
kR32G32B32A32UintSRV,
|
||||
kRawUAV,
|
||||
kR32UintUAV,
|
||||
kR32G32UintUAV,
|
||||
kR32G32B32A32UintUAV,
|
||||
|
||||
kCount,
|
||||
};
|
||||
ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr;
|
||||
D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_;
|
||||
|
||||
// First page and length in pages.
|
||||
typedef std::pair<uint32_t, uint32_t> UploadRange;
|
||||
// Ranges that need to be uploaded, generated by GetRangesToUpload (a
|
||||
// persistently allocated vector).
|
||||
std::vector<UploadRange> upload_ranges_;
|
||||
void GetRangesToUpload(uint32_t request_page_first,
|
||||
uint32_t request_page_last);
|
||||
std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> upload_buffer_pool_;
|
||||
|
||||
// GPU-written memory downloading for traces.
|
||||
// Start page, length in pages.
|
||||
std::vector<std::pair<uint32_t, uint32_t>> trace_gpu_written_ranges_;
|
||||
// Created temporarily, only for downloading.
|
||||
ID3D12Resource* trace_gpu_written_buffer_ = nullptr;
|
||||
void ResetTraceGPUWrittenBuffer();
|
||||
|
||||
void* memory_invalidation_callback_handle_ = nullptr;
|
||||
void* memory_data_provider_handle_ = nullptr;
|
||||
|
||||
// Ranges that need to be uploaded, generated by GetRangesToUpload (a
|
||||
// persistently allocated vector).
|
||||
std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
|
||||
|
||||
// GPU-written memory downloading for traces. <Start address, length>.
|
||||
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
|
||||
uint32_t trace_download_page_count_ = 0;
|
||||
|
||||
// Mutex between the guest memory subsystem and the command processor, to be
|
||||
// locked when checking or updating validity of pages/ranges and when firing
|
||||
// watches.
|
||||
|
@ -309,8 +249,7 @@ class SharedMemory {
|
|||
void UnlinkWatchRange(WatchRange* range);
|
||||
};
|
||||
|
||||
} // namespace d3d12
|
||||
} // namespace gpu
|
||||
} // namespace xe
|
||||
|
||||
#endif // XENIA_GPU_D3D12_SHARED_MEMORY_H_
|
||||
#endif // XENIA_GPU_SHARED_MEMORY_H_
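To summarize the contract the new common class exposes, a minimal hypothetical backend over it would look roughly like the following. The D3D12SharedMemory added in this commit is the real example; everything below is an illustrative sketch, not code from the commit:

#include "xenia/gpu/shared_memory.h"

namespace xe {
namespace gpu {

class MySharedMemory : public SharedMemory {
 public:
  explicit MySharedMemory(Memory& memory) : SharedMemory(memory) {}
  ~MySharedMemory() override { ShutdownCommon(); }

  bool Initialize() {
    InitializeCommon();  // registers the CPU invalidation callback
    // ... create the 512 MB host GPU buffer (sparse if supported) ...
    return true;
  }

 protected:
  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override {
    // Commit host GPU memory covering [start, start + length); no uploading.
    return true;
  }

  bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                        upload_page_ranges) override {
    for (const auto& range : upload_page_ranges) {
      uint32_t start = range.first << page_size_log2();
      uint32_t length = range.second << page_size_log2();
      // Mark valid before the memcpy so CPU writes racing with the copy
      // invalidate the pages again instead of being lost.
      MakeRangeValid(start, length, false);
      // ... memcpy the guest pages into a staging buffer and record a
      //     buffer copy of 'length' bytes at 'start' ...
    }
    return true;
  }
};

}  // namespace gpu
}  // namespace xe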