diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 1275fdd0f..4b1fb4ac0 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1161,7 +1161,7 @@ bool D3D12CommandProcessor::SetupContext() { } shared_memory_ = - std::make_unique<SharedMemory>(*this, *memory_, trace_writer_); + std::make_unique<D3D12SharedMemory>(*this, *memory_, trace_writer_); if (!shared_memory_->Initialize()) { XELOGE("Failed to initialize shared memory"); return false; @@ -2259,7 +2259,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Invalidate textures in memexported memory and watch for changes. for (uint32_t i = 0; i < memexport_range_count; ++i) { const MemExportRange& memexport_range = memexport_ranges[i]; - shared_memory_->RangeWrittenByGPU( + shared_memory_->RangeWrittenByGpu( memexport_range.base_address_dwords << 2, memexport_range.size_dwords << 2); } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 58015cdb4..e2677029b 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -20,11 +20,11 @@ #include "xenia/base/assert.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/deferred_command_list.h" #include "xenia/gpu/d3d12/pipeline_cache.h" #include "xenia/gpu/d3d12/primitive_converter.h" #include "xenia/gpu/d3d12/render_target_cache.h" -#include "xenia/gpu/d3d12/shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/xenos.h" @@ -471,7 +471,7 @@ class D3D12CommandProcessor : public CommandProcessor { ID3D12RootSignature* root_signature_bindless_vs_ = nullptr; ID3D12RootSignature* root_signature_bindless_ds_ = nullptr; - std::unique_ptr<SharedMemory> shared_memory_; + std::unique_ptr<D3D12SharedMemory> shared_memory_; std::unique_ptr<PipelineCache> pipeline_cache_; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc new file mode 100644 index 000000000..2c74c4da8 --- /dev/null +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc @@ -0,0 +1,459 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" + +#include +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/cvar.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/profiling.h" +#include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/ui/d3d12/d3d12_util.h" + +DEFINE_bool(d3d12_tiled_shared_memory, true, + "Enable tiled resources for shared memory emulation.
Disabling " + "them greatly increases video memory usage - a 512 MB buffer is " + "created - but allows graphics debuggers that don't support tiled " + "resources to work.", + "D3D12"); + +namespace xe { +namespace gpu { +namespace d3d12 { + +D3D12SharedMemory::D3D12SharedMemory(D3D12CommandProcessor& command_processor, + Memory& memory, TraceWriter& trace_writer) + : SharedMemory(memory), + command_processor_(command_processor), + trace_writer_(trace_writer) {} + +D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); } + +bool D3D12SharedMemory::Initialize() { + InitializeCommon(); + + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + + D3D12_RESOURCE_DESC buffer_desc; + ui::d3d12::util::FillBufferResourceDesc( + buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + if (AreTiledResourcesUsed()) { + if (FAILED(device->CreateReservedResource( + &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) { + XELOGE("Shared memory: Failed to create the 512 MB tiled buffer"); + Shutdown(); + return false; + } + } else { + XELOGGPU( + "Direct3D 12 tiled resources are not used for shared memory " + "emulation - video memory usage may increase significantly " + "because a full 512 MB buffer will be created!"); + if (provider.GetGraphicsAnalysis() != nullptr) { + // As of October 8th, 2018, PIX doesn't support tiled buffers. + // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. + XELOGGPU( + "This is caused by PIX being attached, which doesn't support tiled " + "resources yet."); + } + if (FAILED(device->CreateCommittedResource( + &ui::d3d12::util::kHeapPropertiesDefault, + provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_, + nullptr, IID_PPV_ARGS(&buffer_)))) { + XELOGE("Shared memory: Failed to create the 512 MB buffer"); + Shutdown(); + return false; + } + } + buffer_gpu_address_ = buffer_->GetGPUVirtualAddress(); + buffer_uav_writes_commit_needed_ = false; + + D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc; + buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + buffer_descriptor_heap_desc.NumDescriptors = + uint32_t(BufferDescriptorIndex::kCount); + buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; + buffer_descriptor_heap_desc.NodeMask = 0; + if (FAILED(device->CreateDescriptorHeap( + &buffer_descriptor_heap_desc, + IID_PPV_ARGS(&buffer_descriptor_heap_)))) { + XELOGE( + "Failed to create the descriptor heap for shared memory buffer views"); + Shutdown(); + return false; + } + buffer_descriptor_heap_start_ = + buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart(); + ui::d3d12::util::CreateBufferRawSRV( + device, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawSRV)), + buffer_, kBufferSize); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32UintSRV)), + buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32UintSRV)), + buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)), + buffer_, 
DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); + ui::d3d12::util::CreateBufferRawUAV( + device, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawUAV)), + buffer_, kBufferSize); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32UintUAV)), + buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32UintUAV)), + buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)), + buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); + + upload_buffer_pool_ = std::make_unique( + provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize, + size_t(1) << page_size_log2())); + + return true; +} + +void D3D12SharedMemory::Shutdown(bool from_destructor) { + ResetTraceDownload(); + + upload_buffer_pool_.reset(); + + ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_); + + // First free the buffer to detach it from the heaps. + ui::d3d12::util::ReleaseAndNull(buffer_); + + if (AreTiledResourcesUsed()) { + for (uint32_t i = 0; i < xe::countof(heaps_); ++i) { + ui::d3d12::util::ReleaseAndNull(heaps_[i]); + } + heap_count_ = 0; + COUNT_profile_set("gpu/shared_memory/used_mb", 0); + } + + // If calling from the destructor, the SharedMemory destructor will call + // ShutdownCommon. + if (!from_destructor) { + ShutdownCommon(); + } +} + +void D3D12SharedMemory::ClearCache() { + SharedMemory::ClearCache(); + + upload_buffer_pool_->ClearCache(); + + // TODO(Triang3l): Unmap and destroy heaps. +} + +void D3D12SharedMemory::CompletedSubmissionUpdated() { + upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission()); +} + +bool D3D12SharedMemory::AreTiledResourcesUsed() const { + if (!cvars::d3d12_tiled_shared_memory) { + return false; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + // As of October 8th, 2018, PIX doesn't support tiled buffers. + // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. + return provider.GetTiledResourcesTier() != + D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED && + provider.GetGraphicsAnalysis() == nullptr; +} + +void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer( + D3D12_RESOURCE_STATES new_state) { + if (buffer_state_ == new_state) { + if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS && + buffer_uav_writes_commit_needed_) { + command_processor_.PushUAVBarrier(buffer_); + buffer_uav_writes_commit_needed_ = false; + } + return; + } + command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state); + buffer_state_ = new_state; + // "UAV -> anything" transition commits the writes implicitly. 
+ buffer_uav_writes_commit_needed_ = false; +} + +void D3D12SharedMemory::WriteRawSRVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle) { + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawSRV)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteRawUAVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle) { + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawUAV)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteUintPow2SRVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { + BufferDescriptorIndex descriptor_index; + switch (element_size_bytes_pow2) { + case 2: + descriptor_index = BufferDescriptorIndex::kR32UintSRV; + break; + case 3: + descriptor_index = BufferDescriptorIndex::kR32G32UintSRV; + break; + case 4: + descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV; + break; + default: + assert_unhandled_case(element_size_bytes_pow2); + return; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(descriptor_index)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteUintPow2UAVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { + BufferDescriptorIndex descriptor_index; + switch (element_size_bytes_pow2) { + case 2: + descriptor_index = BufferDescriptorIndex::kR32UintUAV; + break; + case 3: + descriptor_index = BufferDescriptorIndex::kR32G32UintUAV; + break; + case 4: + descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV; + break; + default: + assert_unhandled_case(element_size_bytes_pow2); + return; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(descriptor_index)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +bool D3D12SharedMemory::InitializeTraceSubmitDownloads() { + ResetTraceDownload(); + PrepareForTraceDownload(); + uint32_t download_page_count = trace_download_page_count(); + // Request downloading of GPU-written memory. 
+ if (!download_page_count) { + return false; + } + D3D12_RESOURCE_DESC download_buffer_desc; + ui::d3d12::util::FillBufferResourceDesc( + download_buffer_desc, download_page_count << page_size_log2(), + D3D12_RESOURCE_FLAG_NONE); + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + if (FAILED(device->CreateCommittedResource( + &ui::d3d12::util::kHeapPropertiesReadback, + provider.GetHeapFlagCreateNotZeroed(), &download_buffer_desc, + D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(&trace_download_buffer_)))) { + XELOGE( + "Shared memory: Failed to create a {} KB GPU-written memory download " + "buffer for frame tracing", + download_page_count << page_size_log2() >> 10); + ResetTraceDownload(); + return false; + } + auto& command_list = command_processor_.GetDeferredCommandList(); + UseAsCopySource(); + command_processor_.SubmitBarriers(); + uint32_t download_buffer_offset = 0; + for (auto& download_range : trace_download_ranges()) { + if (!EnsureHostGpuMemoryAllocated(download_range.first, + download_range.second)) { + download_range.second = 0; + continue; + } + command_list.D3DCopyBufferRegion( + trace_download_buffer_, download_buffer_offset, buffer_, + download_range.first, download_range.second); + download_buffer_offset += download_range.second; + } + return true; +} + +void D3D12SharedMemory::InitializeTraceCompleteDownloads() { + if (!trace_download_buffer_) { + return; + } + void* download_mapping; + if (SUCCEEDED(trace_download_buffer_->Map(0, nullptr, &download_mapping))) { + uint32_t download_buffer_offset = 0; + for (auto download_range : trace_download_ranges()) { + trace_writer_.WriteMemoryRead( + download_range.first, download_range.second, + reinterpret_cast<const uint8_t*>(download_mapping) + + download_buffer_offset); + download_buffer_offset += download_range.second; + } + D3D12_RANGE download_write_range = {}; + trace_download_buffer_->Unmap(0, &download_write_range); + } else { + XELOGE( + "Failed to map the GPU-written memory download buffer for frame " + "tracing"); + } + ResetTraceDownload(); +} + +void D3D12SharedMemory::ResetTraceDownload() { + ui::d3d12::util::ReleaseAndNull(trace_download_buffer_); + ReleaseTraceDownloadRanges(); +} + +bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start, + uint32_t length) { + if (!length || !AreTiledResourcesUsed()) { + return true; + } + uint32_t heap_first = start >> kHeapSizeLog2; + uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2; + assert_true(heap_first < xe::countof(heaps_)); + assert_true(heap_last < xe::countof(heaps_)); + for (uint32_t i = heap_first; i <= heap_last; ++i) { + if (heaps_[i] != nullptr) { + continue; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + auto direct_queue = provider.GetDirectQueue(); + D3D12_HEAP_DESC heap_desc = {}; + heap_desc.SizeInBytes = kHeapSize; + heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; + heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | + provider.GetHeapFlagCreateNotZeroed(); + if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) { + XELOGE("Shared memory: Failed to create a tile heap"); + return false; + } + ++heap_count_; + COUNT_profile_set("gpu/shared_memory/used_mb", + heap_count_ << kHeapSizeLog2 >> 20); + D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates; + region_start_coordinates.X = + (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + region_start_coordinates.Y = 0; + region_start_coordinates.Z = 0; +
region_start_coordinates.Subresource = 0; + D3D12_TILE_REGION_SIZE region_size; + region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + region_size.UseBox = FALSE; + D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE; + UINT heap_range_start_offset = 0; + UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + direct_queue->UpdateTileMappings( + buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1, + &range_flags, &heap_range_start_offset, &range_tile_count, + D3D12_TILE_MAPPING_FLAG_NONE); + command_processor_.NotifyQueueOperationsDoneDirectly(); + } + return true; +} + +bool D3D12SharedMemory::UploadRanges( + const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) { + if (upload_page_ranges.empty()) { + return true; + } + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); + command_processor_.SubmitBarriers(); + auto& command_list = command_processor_.GetDeferredCommandList(); + for (auto upload_range : upload_page_ranges) { + uint32_t upload_range_start = upload_range.first; + uint32_t upload_range_length = upload_range.second; + trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(), + upload_range_length << page_size_log2()); + while (upload_range_length != 0) { + ID3D12Resource* upload_buffer; + size_t upload_buffer_offset, upload_buffer_size; + uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial( + command_processor_.GetCurrentSubmission(), + upload_range_length << page_size_log2(), + size_t(1) << page_size_log2(), &upload_buffer, &upload_buffer_offset, + &upload_buffer_size, nullptr); + if (upload_buffer_mapping == nullptr) { + XELOGE("Shared memory: Failed to get an upload buffer"); + return false; + } + MakeRangeValid(upload_range_start << page_size_log2(), + uint32_t(upload_buffer_size), false); + std::memcpy( + upload_buffer_mapping, + memory().TranslatePhysical(upload_range_start << page_size_log2()), + upload_buffer_size); + command_list.D3DCopyBufferRegion( + buffer_, upload_range_start << page_size_log2(), upload_buffer, + UINT64(upload_buffer_offset), UINT64(upload_buffer_size)); + uint32_t upload_buffer_pages = + uint32_t(upload_buffer_size >> page_size_log2()); + upload_range_start += upload_buffer_pages; + upload_range_length -= upload_buffer_pages; + } + } + return true; +} + +} // namespace d3d12 +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h new file mode 100644 index 000000000..c66e5578d --- /dev/null +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -0,0 +1,145 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ +#define XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ + +#include +#include +#include +#include + +#include "xenia/gpu/shared_memory.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/memory.h" +#include "xenia/ui/d3d12/d3d12_api.h" +#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h" + +namespace xe { +namespace gpu { +namespace d3d12 { + +class D3D12CommandProcessor; + +class D3D12SharedMemory : public SharedMemory { + public: + D3D12SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory, + TraceWriter& trace_writer); + ~D3D12SharedMemory() override; + + bool Initialize(); + void Shutdown(bool from_destructor = false); + void ClearCache() override; + + ID3D12Resource* GetBuffer() const { return buffer_; } + D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { + return buffer_gpu_address_; + } + + void CompletedSubmissionUpdated(); + + // RequestRange may transition the buffer to copy destination - call it before + // UseForReading or UseForWriting. + + // Makes the buffer usable for vertices, indices and texture untiling. + inline void UseForReading() { + // Vertex fetch is also allowed in pixel shaders. + CommitUAVWritesAndTransitionBuffer( + D3D12_RESOURCE_STATE_INDEX_BUFFER | + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); + } + // Makes the buffer usable for texture tiling after a resolve. + inline void UseForWriting() { + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + } + // Makes the buffer usable as a source for copy commands. + inline void UseAsCopySource() { + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); + } + // Must be called when doing draws/dispatches modifying data within the shared + // memory buffer as a UAV, to make sure that when UseForWriting is called the + // next time, a UAV barrier will be done, and subsequent overlapping UAV + // writes and reads are ordered. + inline void MarkUAVWritesCommitNeeded() { + if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { + buffer_uav_writes_commit_needed_ = true; + } + } + + void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); + void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); + // Due to the Nvidia 128 megatexel limitation, the smallest supported formats + // are 32-bit. + void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, + uint32_t element_size_bytes_pow2); + void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, + uint32_t element_size_bytes_pow2); + + // Returns true if any downloads were submitted to the command processor. + bool InitializeTraceSubmitDownloads(); + void InitializeTraceCompleteDownloads(); + + protected: + bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override; + + bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>& + upload_page_ranges) override; + + private: + bool AreTiledResourcesUsed() const; + + D3D12CommandProcessor& command_processor_; + TraceWriter& trace_writer_; + + // The 512 MB tiled buffer.
+ ID3D12Resource* buffer_ = nullptr; + D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0; + D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + bool buffer_uav_writes_commit_needed_ = false; + void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state); + + static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16)); + static constexpr uint32_t kHeapSizeLog2 = + std::max(kOptimalAllocationLog2, uint32_t(16)); + static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2; + // Resident portions of the tiled buffer. + ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {}; + // Number of the heaps currently resident, for profiling. + uint32_t heap_count_ = 0; + + // Non-shader-visible buffer descriptor heap for faster binding (via copying + // rather than creation). + enum class BufferDescriptorIndex : uint32_t { + kRawSRV, + kR32UintSRV, + kR32G32UintSRV, + kR32G32B32A32UintSRV, + kRawUAV, + kR32UintUAV, + kR32G32UintUAV, + kR32G32B32A32UintUAV, + + kCount, + }; + ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr; + D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_; + + std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> upload_buffer_pool_; + + // Created temporarily, only for downloading. + ID3D12Resource* trace_download_buffer_ = nullptr; + void ResetTraceDownload(); +}; + +} // namespace d3d12 +} // namespace gpu +} // namespace xe + +#endif  // XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index a43ad90d3..939cffd65 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -1085,7 +1085,7 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { } bool RenderTargetCache::Resolve(const Memory& memory, - SharedMemory& shared_memory, + D3D12SharedMemory& shared_memory, TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out) { diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index cf575dcdf..0def0d25c 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -15,7 +15,7 @@ #include "xenia/base/cvar.h" #include "xenia/gpu/d3d12/d3d12_shader.h" -#include "xenia/gpu/d3d12/shared_memory.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/register_file.h" @@ -277,11 +277,11 @@ class RenderTargetCache { // register values, and also clears the EDRAM buffer if needed. Must be in a // frame for calling.
- bool Resolve(const Memory& memory, SharedMemory& shared_memory, + bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory, TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out); - bool Resolve(SharedMemory* shared_memory, TextureCache* texture_cache, + bool Resolve(D3D12SharedMemory* shared_memory, TextureCache* texture_cache, Memory* memory, uint32_t& written_address_out, uint32_t& written_length_out); // Flushes the render targets to EDRAM and unbinds them, for instance, when diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc deleted file mode 100644 index c24bb970d..000000000 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ /dev/null @@ -1,959 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include "xenia/gpu/d3d12/shared_memory.h" - -#include -#include -#include -#include - -#include "xenia/base/assert.h" -#include "xenia/base/cvar.h" -#include "xenia/base/logging.h" -#include "xenia/base/math.h" -#include "xenia/base/memory.h" -#include "xenia/base/profiling.h" -#include "xenia/gpu/d3d12/d3d12_command_processor.h" -#include "xenia/ui/d3d12/d3d12_util.h" - -DEFINE_bool(d3d12_tiled_shared_memory, true, - "Enable tiled resources for shared memory emulation. Disabling " - "them greatly increases video memory usage - a 512 MB buffer is " - "created - but allows graphics debuggers that don't support tiled " - "resources to work.", - "D3D12"); - -namespace xe { -namespace gpu { -namespace d3d12 { - -SharedMemory::SharedMemory(D3D12CommandProcessor& command_processor, - Memory& memory, TraceWriter& trace_writer) - : command_processor_(command_processor), - memory_(memory), - trace_writer_(trace_writer) { - page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); - page_count_ = kBufferSize >> page_size_log2_; -} - -SharedMemory::~SharedMemory() { Shutdown(); } - -bool SharedMemory::Initialize() { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - - D3D12_RESOURCE_DESC buffer_desc; - ui::d3d12::util::FillBufferResourceDesc( - buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; - if (AreTiledResourcesUsed()) { - if (FAILED(device->CreateReservedResource( - &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) { - XELOGE("Shared memory: Failed to create the 512 MB tiled buffer"); - Shutdown(); - return false; - } - } else { - XELOGGPU( - "Direct3D 12 tiled resources are not used for shared memory " - "emulation - video memory usage may increase significantly " - "because a full 512 MB buffer will be created!"); - if (provider.GetGraphicsAnalysis() != nullptr) { - // As of October 8th, 2018, PIX doesn't support tiled buffers. - // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. 
- XELOGGPU( - "This is caused by PIX being attached, which doesn't support tiled " - "resources yet."); - } - if (FAILED(device->CreateCommittedResource( - &ui::d3d12::util::kHeapPropertiesDefault, - provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_, - nullptr, IID_PPV_ARGS(&buffer_)))) { - XELOGE("Shared memory: Failed to create the 512 MB buffer"); - Shutdown(); - return false; - } - } - buffer_gpu_address_ = buffer_->GetGPUVirtualAddress(); - buffer_uav_writes_commit_needed_ = false; - - std::memset(heaps_, 0, sizeof(heaps_)); - heap_count_ = 0; - - D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc; - buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - buffer_descriptor_heap_desc.NumDescriptors = - uint32_t(BufferDescriptorIndex::kCount); - buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; - buffer_descriptor_heap_desc.NodeMask = 0; - if (FAILED(device->CreateDescriptorHeap( - &buffer_descriptor_heap_desc, - IID_PPV_ARGS(&buffer_descriptor_heap_)))) { - XELOGE( - "Failed to create the descriptor heap for shared memory buffer views"); - Shutdown(); - return false; - } - buffer_descriptor_heap_start_ = - buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart(); - ui::d3d12::util::CreateBufferRawSRV( - device, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawSRV)), - buffer_, kBufferSize); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32UintSRV)), - buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32UintSRV)), - buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)), - buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); - ui::d3d12::util::CreateBufferRawUAV( - device, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawUAV)), - buffer_, kBufferSize); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32UintUAV)), - buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32UintUAV)), - buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)), - buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); - - system_page_flags_.clear(); - system_page_flags_.resize((page_count_ + 63) / 64); - - upload_buffer_pool_ = std::make_unique( - provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize, - size_t(1) << page_size_log2_)); - - memory_invalidation_callback_handle_ = - memory_.RegisterPhysicalMemoryInvalidationCallback( - MemoryInvalidationCallbackThunk, this); - - ResetTraceGPUWrittenBuffer(); - - return true; -} - -void SharedMemory::Shutdown() { - ResetTraceGPUWrittenBuffer(); - - FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false); - 
assert_true(global_watches_.empty()); - // No watches now, so no references to the pools accessible by guest threads - - // safe not to enter the global critical region. - watch_node_first_free_ = nullptr; - watch_node_current_pool_allocated_ = 0; - for (WatchNode* pool : watch_node_pools_) { - delete[] pool; - } - watch_node_pools_.clear(); - watch_range_first_free_ = nullptr; - watch_range_current_pool_allocated_ = 0; - for (WatchRange* pool : watch_range_pools_) { - delete[] pool; - } - watch_range_pools_.clear(); - - if (memory_invalidation_callback_handle_ != nullptr) { - memory_.UnregisterPhysicalMemoryInvalidationCallback( - memory_invalidation_callback_handle_); - memory_invalidation_callback_handle_ = nullptr; - } - - upload_buffer_pool_.reset(); - - ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_); - - // First free the buffer to detach it from the heaps. - ui::d3d12::util::ReleaseAndNull(buffer_); - - if (AreTiledResourcesUsed()) { - for (uint32_t i = 0; i < xe::countof(heaps_); ++i) { - ui::d3d12::util::ReleaseAndNull(heaps_[i]); - } - heap_count_ = 0; - COUNT_profile_set("gpu/shared_memory/used_mb", 0); - } -} - -void SharedMemory::ClearCache() { - upload_buffer_pool_->ClearCache(); - - // Keeping GPU-written data, so "invalidated by GPU". - FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true); - // No watches now, so no references to the pools accessible by guest threads - - // safe not to enter the global critical region. - watch_node_first_free_ = nullptr; - watch_node_current_pool_allocated_ = 0; - for (WatchNode* pool : watch_node_pools_) { - delete[] pool; - } - watch_node_pools_.clear(); - watch_range_first_free_ = nullptr; - watch_range_current_pool_allocated_ = 0; - for (WatchRange* pool : watch_range_pools_) { - delete[] pool; - } - watch_range_pools_.clear(); - - { - auto global_lock = global_critical_region_.Acquire(); - for (SystemPageFlagsBlock& block : system_page_flags_) { - block.valid = block.valid_and_gpu_written; - } - } - - // TODO(Triang3l): Unmap and destroy heaps. 
-} - -void SharedMemory::CompletedSubmissionUpdated() { - upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission()); -} - -SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch( - GlobalWatchCallback callback, void* callback_context) { - GlobalWatch* watch = new GlobalWatch; - watch->callback = callback; - watch->callback_context = callback_context; - - auto global_lock = global_critical_region_.Acquire(); - global_watches_.push_back(watch); - - return reinterpret_cast(watch); -} - -void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) { - auto watch = reinterpret_cast(handle); - - { - auto global_lock = global_critical_region_.Acquire(); - auto it = std::find(global_watches_.begin(), global_watches_.end(), watch); - assert_false(it == global_watches_.end()); - if (it != global_watches_.end()) { - global_watches_.erase(it); - } - } - - delete watch; -} - -SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( - uint32_t start, uint32_t length, WatchCallback callback, - void* callback_context, void* callback_data, uint64_t callback_argument) { - if (length == 0 || start >= kBufferSize) { - return nullptr; - } - length = std::min(length, kBufferSize - start); - uint32_t watch_page_first = start >> page_size_log2_; - uint32_t watch_page_last = (start + length - 1) >> page_size_log2_; - uint32_t bucket_first = - watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2; - uint32_t bucket_last = - watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); - - // Allocate the range. - WatchRange* range = watch_range_first_free_; - if (range != nullptr) { - watch_range_first_free_ = range->next_free; - } else { - if (watch_range_pools_.empty() || - watch_range_current_pool_allocated_ >= kWatchRangePoolSize) { - watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]); - watch_range_current_pool_allocated_ = 0; - } - range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]); - } - range->callback = callback; - range->callback_context = callback_context; - range->callback_data = callback_data; - range->callback_argument = callback_argument; - range->page_first = watch_page_first; - range->page_last = watch_page_last; - - // Allocate and link the nodes. - WatchNode* node_previous = nullptr; - for (uint32_t i = bucket_first; i <= bucket_last; ++i) { - WatchNode* node = watch_node_first_free_; - if (node != nullptr) { - watch_node_first_free_ = node->next_free; - } else { - if (watch_node_pools_.empty() || - watch_node_current_pool_allocated_ >= kWatchNodePoolSize) { - watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]); - watch_node_current_pool_allocated_ = 0; - } - node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]); - } - node->range = range; - node->range_node_next = nullptr; - if (node_previous != nullptr) { - node_previous->range_node_next = node; - } else { - range->node_first = node; - } - node_previous = node; - node->bucket_node_previous = nullptr; - node->bucket_node_next = watch_buckets_[i]; - if (watch_buckets_[i] != nullptr) { - watch_buckets_[i]->bucket_node_previous = node; - } - watch_buckets_[i] = node; - } - - return reinterpret_cast(range); -} - -void SharedMemory::UnwatchMemoryRange(WatchHandle handle) { - if (handle == nullptr) { - // Could be a zero length range. 
- return; - } - auto global_lock = global_critical_region_.Acquire(); - UnlinkWatchRange(reinterpret_cast(handle)); -} - -bool SharedMemory::EnsureTilesResident(uint32_t start, uint32_t length) { - if (length == 0) { - // Some texture is empty, for example - safe to draw in this case. - return true; - } - if (start > kBufferSize || (kBufferSize - start) < length) { - return false; - } - - if (!AreTiledResourcesUsed()) { - return true; - } - - uint32_t heap_first = start >> kHeapSizeLog2; - uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2; - for (uint32_t i = heap_first; i <= heap_last; ++i) { - if (heaps_[i] != nullptr) { - continue; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - auto direct_queue = provider.GetDirectQueue(); - D3D12_HEAP_DESC heap_desc = {}; - heap_desc.SizeInBytes = kHeapSize; - heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; - heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | - provider.GetHeapFlagCreateNotZeroed(); - if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) { - XELOGE("Shared memory: Failed to create a tile heap"); - return false; - } - ++heap_count_; - COUNT_profile_set("gpu/shared_memory/used_mb", - heap_count_ << kHeapSizeLog2 >> 20); - D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates; - region_start_coordinates.X = - (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - region_start_coordinates.Y = 0; - region_start_coordinates.Z = 0; - region_start_coordinates.Subresource = 0; - D3D12_TILE_REGION_SIZE region_size; - region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - region_size.UseBox = FALSE; - D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE; - UINT heap_range_start_offset = 0; - UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - direct_queue->UpdateTileMappings( - buffer_, 1, ®ion_start_coordinates, ®ion_size, heaps_[i], 1, - &range_flags, &heap_range_start_offset, &range_tile_count, - D3D12_TILE_MAPPING_FLAG_NONE); - command_processor_.NotifyQueueOperationsDoneDirectly(); - } - return true; -} - -bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { - if (length == 0) { - // Some texture is empty, for example - safe to draw in this case. - return true; - } - if (start > kBufferSize || (kBufferSize - start) < length) { - return false; - } - uint32_t last = start + length - 1; - - auto& command_list = command_processor_.GetDeferredCommandList(); - -#if FINE_GRAINED_DRAW_SCOPES - SCOPE_profile_cpu_f("gpu"); -#endif // FINE_GRAINED_DRAW_SCOPES - - // Ensure all tile heaps are present. - if (!EnsureTilesResident(start, length)) { - return false; - } - - // Upload and protect used ranges. 
- GetRangesToUpload(start >> page_size_log2_, last >> page_size_log2_); - if (upload_ranges_.size() == 0) { - return true; - } - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); - command_processor_.SubmitBarriers(); - for (auto upload_range : upload_ranges_) { - uint32_t upload_range_start = upload_range.first; - uint32_t upload_range_length = upload_range.second; - trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2_, - upload_range_length << page_size_log2_); - while (upload_range_length != 0) { - ID3D12Resource* upload_buffer; - size_t upload_buffer_offset, upload_buffer_size; - uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial( - command_processor_.GetCurrentSubmission(), - upload_range_length << page_size_log2_, size_t(1) << page_size_log2_, - &upload_buffer, &upload_buffer_offset, &upload_buffer_size, nullptr); - if (upload_buffer_mapping == nullptr) { - XELOGE("Shared memory: Failed to get an upload buffer"); - return false; - } - MakeRangeValid(upload_range_start << page_size_log2_, - uint32_t(upload_buffer_size), false); - std::memcpy( - upload_buffer_mapping, - memory_.TranslatePhysical(upload_range_start << page_size_log2_), - upload_buffer_size); - command_list.D3DCopyBufferRegion( - buffer_, upload_range_start << page_size_log2_, upload_buffer, - UINT64(upload_buffer_offset), UINT64(upload_buffer_size)); - uint32_t upload_buffer_pages = - uint32_t(upload_buffer_size >> page_size_log2_); - upload_range_start += upload_buffer_pages; - upload_range_length -= upload_buffer_pages; - } - } - - return true; -} - -void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last, - bool invalidated_by_gpu) { - uint32_t address_first = page_first << page_size_log2_; - uint32_t address_last = - (page_last << page_size_log2_) + ((1 << page_size_log2_) - 1); - uint32_t bucket_first = address_first >> kWatchBucketSizeLog2; - uint32_t bucket_last = address_last >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); - - // Fire global watches. - for (const auto global_watch : global_watches_) { - global_watch->callback(global_watch->callback_context, address_first, - address_last, invalidated_by_gpu); - } - - // Fire per-range watches. - for (uint32_t i = bucket_first; i <= bucket_last; ++i) { - WatchNode* node = watch_buckets_[i]; - while (node != nullptr) { - WatchRange* range = node->range; - // Store the next node now since when the callback is triggered, the links - // will be broken. - node = node->bucket_node_next; - if (page_first <= range->page_last && page_last >= range->page_first) { - range->callback(range->callback_context, range->callback_data, - range->callback_argument, invalidated_by_gpu); - UnlinkWatchRange(range); - } - } - } -} - -void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) { - if (length == 0 || start >= kBufferSize) { - return; - } - length = std::min(length, kBufferSize - start); - uint32_t end = start + length - 1; - uint32_t page_first = start >> page_size_log2_; - uint32_t page_last = end >> page_size_log2_; - - // Trigger modification callbacks so, for instance, resolved data is loaded to - // the texture. - FireWatches(page_first, page_last, true); - - // Mark the range as valid (so pages are not reuploaded until modified by the - // CPU) and watch it so the CPU can reuse it and this will be caught. 
- MakeRangeValid(start, length, true); -} - -bool SharedMemory::AreTiledResourcesUsed() const { - if (!cvars::d3d12_tiled_shared_memory) { - return false; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - // As of October 8th, 2018, PIX doesn't support tiled buffers. - // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. - return provider.GetTiledResourcesTier() != - D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED && - provider.GetGraphicsAnalysis() == nullptr; -} - -void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, - bool written_by_gpu) { - if (length == 0 || start >= kBufferSize) { - return; - } - length = std::min(length, kBufferSize - start); - uint32_t last = start + length - 1; - uint32_t valid_page_first = start >> page_size_log2_; - uint32_t valid_page_last = last >> page_size_log2_; - uint32_t valid_block_first = valid_page_first >> 6; - uint32_t valid_block_last = valid_page_last >> 6; - - { - auto global_lock = global_critical_region_.Acquire(); - - for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) { - uint64_t valid_bits = UINT64_MAX; - if (i == valid_block_first) { - valid_bits &= ~((1ull << (valid_page_first & 63)) - 1); - } - if (i == valid_block_last && (valid_page_last & 63) != 63) { - valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1; - } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid |= valid_bits; - if (written_by_gpu) { - block.valid_and_gpu_written |= valid_bits; - } else { - block.valid_and_gpu_written &= ~valid_bits; - } - } - } - - if (memory_invalidation_callback_handle_) { - memory_.EnablePhysicalMemoryAccessCallbacks( - valid_page_first << page_size_log2_, - (valid_page_last - valid_page_first + 1) << page_size_log2_, true, - false); - } -} - -void SharedMemory::UnlinkWatchRange(WatchRange* range) { - uint32_t bucket = - range->page_first << page_size_log2_ >> kWatchBucketSizeLog2; - WatchNode* node = range->node_first; - while (node != nullptr) { - WatchNode* node_next = node->range_node_next; - if (node->bucket_node_previous != nullptr) { - node->bucket_node_previous->bucket_node_next = node->bucket_node_next; - } else { - watch_buckets_[bucket] = node->bucket_node_next; - } - if (node->bucket_node_next != nullptr) { - node->bucket_node_next->bucket_node_previous = node->bucket_node_previous; - } - node->next_free = watch_node_first_free_; - watch_node_first_free_ = node; - node = node_next; - ++bucket; - } - range->next_free = watch_range_first_free_; - watch_range_first_free_ = range; -} - -void SharedMemory::GetRangesToUpload(uint32_t request_page_first, - uint32_t request_page_last) { - upload_ranges_.clear(); - request_page_last = std::min(request_page_last, page_count_ - 1u); - if (request_page_first > request_page_last) { - return; - } - uint32_t request_block_first = request_page_first >> 6; - uint32_t request_block_last = request_page_last >> 6; - - auto global_lock = global_critical_region_.Acquire(); - - uint32_t range_start = UINT32_MAX; - for (uint32_t i = request_block_first; i <= request_block_last; ++i) { - uint64_t block_valid = system_page_flags_[i].valid; - // Consider pages in the block outside the requested range valid. 
- if (i == request_block_first) { - block_valid |= (1ull << (request_page_first & 63)) - 1; - } - if (i == request_block_last && (request_page_last & 63) != 63) { - block_valid |= ~((1ull << ((request_page_last & 63) + 1)) - 1); - } - - while (true) { - uint32_t block_page; - if (range_start == UINT32_MAX) { - // Check if need to open a new range. - if (!xe::bit_scan_forward(~block_valid, &block_page)) { - break; - } - range_start = (i << 6) + block_page; - } else { - // Check if need to close the range. - // Ignore the valid pages before the beginning of the range. - uint64_t block_valid_from_start = block_valid; - if (i == (range_start >> 6)) { - block_valid_from_start &= ~((1ull << (range_start & 63)) - 1); - } - if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) { - break; - } - upload_ranges_.push_back( - std::make_pair(range_start, (i << 6) + block_page - range_start)); - // In the next interation within this block, consider this range valid - // since it has been queued for upload. - block_valid |= (1ull << block_page) - 1; - range_start = UINT32_MAX; - } - } - } - if (range_start != UINT32_MAX) { - upload_ranges_.push_back( - std::make_pair(range_start, request_page_last + 1 - range_start)); - } -} - -std::pair SharedMemory::MemoryInvalidationCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length, - bool exact_range) { - return reinterpret_cast(context_ptr) - ->MemoryInvalidationCallback(physical_address_start, length, exact_range); -} - -std::pair SharedMemory::MemoryInvalidationCallback( - uint32_t physical_address_start, uint32_t length, bool exact_range) { - if (length == 0 || physical_address_start >= kBufferSize) { - return std::make_pair(uint32_t(0), UINT32_MAX); - } - length = std::min(length, kBufferSize - physical_address_start); - uint32_t physical_address_last = physical_address_start + (length - 1); - - uint32_t page_first = physical_address_start >> page_size_log2_; - uint32_t page_last = physical_address_last >> page_size_log2_; - assert_true(page_first < page_count_ && page_last < page_count_); - uint32_t block_first = page_first >> 6; - uint32_t block_last = page_last >> 6; - - auto global_lock = global_critical_region_.Acquire(); - - if (!exact_range) { - // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be - // invalidated - if no GPU-written data nearby that was not intended to be - // invalidated since it's not in sync with CPU memory and can't be - // reuploaded. It's a lot cheaper to upload some excess data than to catch - // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS - // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per - // frame, but with 256 KB it's 0.7 ms. 
- if (page_first & 63) { - uint64_t gpu_written_start = - system_page_flags_[block_first].valid_and_gpu_written; - gpu_written_start &= (1ull << (page_first & 63)) - 1; - page_first = - (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); - } - if ((page_last & 63) != 63) { - uint64_t gpu_written_end = - system_page_flags_[block_last].valid_and_gpu_written; - gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); - page_last = (page_last & ~uint32_t(63)) + - (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); - } - } - - for (uint32_t i = block_first; i <= block_last; ++i) { - uint64_t invalidate_bits = UINT64_MAX; - if (i == block_first) { - invalidate_bits &= ~((1ull << (page_first & 63)) - 1); - } - if (i == block_last && (page_last & 63) != 63) { - invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1; - } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid &= ~invalidate_bits; - block.valid_and_gpu_written &= ~invalidate_bits; - } - - FireWatches(page_first, page_last, false); - - return std::make_pair(page_first << page_size_log2_, - (page_last - page_first + 1) << page_size_log2_); -} - -void SharedMemory::CommitUAVWritesAndTransitionBuffer( - D3D12_RESOURCE_STATES new_state) { - if (buffer_state_ == new_state) { - if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS && - buffer_uav_writes_commit_needed_) { - command_processor_.PushUAVBarrier(buffer_); - buffer_uav_writes_commit_needed_ = false; - } - return; - } - command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state); - buffer_state_ = new_state; - // "UAV -> anything" transition commits the writes implicitly. - buffer_uav_writes_commit_needed_ = false; -} - -void SharedMemory::WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawSRV)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawUAV)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteUintPow2SRVDescriptor( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { - BufferDescriptorIndex descriptor_index; - switch (element_size_bytes_pow2) { - case 2: - descriptor_index = BufferDescriptorIndex::kR32UintSRV; - break; - case 3: - descriptor_index = BufferDescriptorIndex::kR32G32UintSRV; - break; - case 4: - descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV; - break; - default: - assert_unhandled_case(element_size_bytes_pow2); - return; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(descriptor_index)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteUintPow2UAVDescriptor( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { - BufferDescriptorIndex descriptor_index; - switch (element_size_bytes_pow2) { - case 2: - descriptor_index = 
BufferDescriptorIndex::kR32UintUAV; - break; - case 3: - descriptor_index = BufferDescriptorIndex::kR32G32UintUAV; - break; - case 4: - descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV; - break; - default: - assert_unhandled_case(element_size_bytes_pow2); - return; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(descriptor_index)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -bool SharedMemory::InitializeTraceSubmitDownloads() { - // Invalidate the entire memory CPU->GPU memory copy so all the history - // doesn't have to be written into every frame trace, and collect the list of - // ranges with data modified on the GPU. - ResetTraceGPUWrittenBuffer(); - uint32_t gpu_written_page_count = 0; - - { - auto global_lock = global_critical_region_.Acquire(); - uint32_t fire_watches_range_start = UINT32_MAX; - uint32_t gpu_written_range_start = UINT32_MAX; - for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { - SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; - uint64_t previously_valid_block = page_flags_block.valid; - uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; - page_flags_block.valid = gpu_written_block; - - // Fire watches on the invalidated pages. - uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; - uint64_t fire_watches_break_block = ~fire_watches_block; - while (true) { - uint32_t fire_watches_block_page; - if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX - ? fire_watches_block - : fire_watches_break_block, - &fire_watches_block_page)) { - break; - } - uint32_t fire_watches_page = (i << 6) + fire_watches_block_page; - if (fire_watches_range_start == UINT32_MAX) { - fire_watches_range_start = fire_watches_page; - } else { - FireWatches(fire_watches_range_start, fire_watches_page - 1, false); - fire_watches_range_start = UINT32_MAX; - } - uint64_t fire_watches_block_mask = - ~((1ull << fire_watches_block_page) - 1); - fire_watches_block &= fire_watches_block_mask; - fire_watches_break_block &= fire_watches_block_mask; - } - - // Add to the GPU-written ranges. - uint64_t gpu_written_break_block = ~gpu_written_block; - while (true) { - uint32_t gpu_written_block_page; - if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX - ? 
gpu_written_block - : gpu_written_break_block, - &gpu_written_block_page)) { - break; - } - uint32_t gpu_written_page = (i << 6) + gpu_written_block_page; - if (gpu_written_range_start == UINT32_MAX) { - gpu_written_range_start = gpu_written_page; - } else { - uint32_t gpu_written_range_length = - gpu_written_page - gpu_written_range_start; - trace_gpu_written_ranges_.push_back( - std::make_pair(gpu_written_range_start << page_size_log2_, - gpu_written_range_length << page_size_log2_)); - gpu_written_page_count += gpu_written_range_length; - gpu_written_range_start = UINT32_MAX; - } - uint64_t gpu_written_block_mask = - ~((1ull << gpu_written_block_page) - 1); - gpu_written_block &= gpu_written_block_mask; - gpu_written_break_block &= gpu_written_block_mask; - } - } - if (fire_watches_range_start != UINT32_MAX) { - FireWatches(fire_watches_range_start, page_count_ - 1, false); - } - if (gpu_written_range_start != UINT32_MAX) { - uint32_t gpu_written_range_length = page_count_ - gpu_written_range_start; - trace_gpu_written_ranges_.push_back( - std::make_pair(gpu_written_range_start << page_size_log2_, - gpu_written_range_length << page_size_log2_)); - gpu_written_page_count += gpu_written_range_length; - } - } - - // Request downloading of GPU-written memory. - if (!gpu_written_page_count) { - return false; - } - D3D12_RESOURCE_DESC gpu_written_buffer_desc; - ui::d3d12::util::FillBufferResourceDesc( - gpu_written_buffer_desc, gpu_written_page_count << page_size_log2_, - D3D12_RESOURCE_FLAG_NONE); - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - if (FAILED(device->CreateCommittedResource( - &ui::d3d12::util::kHeapPropertiesReadback, - provider.GetHeapFlagCreateNotZeroed(), &gpu_written_buffer_desc, - D3D12_RESOURCE_STATE_COPY_DEST, nullptr, - IID_PPV_ARGS(&trace_gpu_written_buffer_)))) { - XELOGE( - "Shared memory: Failed to create a {} KB GPU-written memory download " - "buffer for frame tracing", - gpu_written_page_count << page_size_log2_ >> 10); - ResetTraceGPUWrittenBuffer(); - return false; - } - auto& command_list = command_processor_.GetDeferredCommandList(); - UseAsCopySource(); - command_processor_.SubmitBarriers(); - uint32_t gpu_written_buffer_offset = 0; - for (auto& gpu_written_submit_range : trace_gpu_written_ranges_) { - // For cases like resolution scale, when the data may not be actually - // written, just marked as valid. 
- if (!EnsureTilesResident(gpu_written_submit_range.first, - gpu_written_submit_range.second)) { - gpu_written_submit_range.second = 0; - continue; - } - command_list.D3DCopyBufferRegion( - trace_gpu_written_buffer_, gpu_written_buffer_offset, buffer_, - gpu_written_submit_range.first, gpu_written_submit_range.second); - gpu_written_buffer_offset += gpu_written_submit_range.second; - } - return true; -} - -void SharedMemory::InitializeTraceCompleteDownloads() { - if (!trace_gpu_written_buffer_) { - return; - } - void* download_mapping; - if (SUCCEEDED( - trace_gpu_written_buffer_->Map(0, nullptr, &download_mapping))) { - uint32_t gpu_written_buffer_offset = 0; - for (auto gpu_written_submit_range : trace_gpu_written_ranges_) { - trace_writer_.WriteMemoryRead( - gpu_written_submit_range.first, gpu_written_submit_range.second, - reinterpret_cast(download_mapping) + - gpu_written_buffer_offset); - } - D3D12_RANGE download_write_range = {}; - trace_gpu_written_buffer_->Unmap(0, &download_write_range); - } else { - XELOGE( - "Failed to map the GPU-written memory download buffer for frame " - "tracing"); - } - ResetTraceGPUWrittenBuffer(); -} - -void SharedMemory::ResetTraceGPUWrittenBuffer() { - trace_gpu_written_ranges_.clear(); - trace_gpu_written_ranges_.shrink_to_fit(); - ui::d3d12::util::ReleaseAndNull(trace_gpu_written_buffer_); -} - -} // namespace d3d12 -} // namespace gpu -} // namespace xe diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 7228b9b3a..de0568ccf 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -832,7 +832,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { TextureCache::TextureCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - SharedMemory& shared_memory) + D3D12SharedMemory& shared_memory) : command_processor_(command_processor), register_file_(register_file), bindless_resources_used_(bindless_resources_used), @@ -1604,7 +1604,7 @@ void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, // Invalidate textures. Toggling individual textures between scaled and // unscaled also relies on invalidation through shared memory. 
- shared_memory_.RangeWrittenByGPU(start_unscaled, length_unscaled); + shared_memory_.RangeWrittenByGpu(start_unscaled, length_unscaled); } bool TextureCache::EnsureScaledResolveBufferResident(uint32_t start_unscaled, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 1047cabd0..1345d8faf 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -17,7 +17,7 @@ #include "xenia/base/mutex.h" #include "xenia/gpu/d3d12/d3d12_shader.h" -#include "xenia/gpu/d3d12/shared_memory.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/xenos.h" @@ -169,7 +169,7 @@ class TextureCache { TextureCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - SharedMemory& shared_memory); + D3D12SharedMemory& shared_memory); ~TextureCache(); bool Initialize(bool edram_rov_used); @@ -546,7 +546,7 @@ class TextureCache { D3D12CommandProcessor& command_processor_; const RegisterFile& register_file_; bool bindless_resources_used_; - SharedMemory& shared_memory_; + D3D12SharedMemory& shared_memory_; static const LoadModeInfo load_mode_info_[]; ID3D12RootSignature* load_root_signature_ = nullptr; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc new file mode 100644 index 000000000..4951eacea --- /dev/null +++ b/src/xenia/gpu/shared_memory.cc @@ -0,0 +1,541 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/shared_memory.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/profiling.h" +#include "xenia/memory.h" + +namespace xe { +namespace gpu { + +SharedMemory::SharedMemory(Memory& memory) : memory_(memory) { + page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); +} + +SharedMemory::~SharedMemory() { ShutdownCommon(); } + +void SharedMemory::InitializeCommon() { + system_page_flags_.clear(); + system_page_flags_.resize(((kBufferSize >> page_size_log2_) + 63) / 64); + + memory_invalidation_callback_handle_ = + memory_.RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); +} + +void SharedMemory::ShutdownCommon() { + ReleaseTraceDownloadRanges(); + + FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false); + assert_true(global_watches_.empty()); + // No watches now, so no references to the pools accessible by guest threads - + // safe not to enter the global critical region. 
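The validity tracking set up in InitializeCommon above packs one bit per system page into the 64-bit blocks of system_page_flags_, and every routine that follows (MakeRangeValid, the invalidation callback, PrepareForTraceDownload) turns page numbers into a block index plus a bit mask the same way. A minimal standalone sketch of that mapping, assuming 4 KB system pages; the names and the constant here are illustrative, not part of the patch:

#include <cstdint>

// Illustrative only - mirrors how the shared memory code addresses the
// 64-page blocks of system_page_flags_. kPageSizeLog2 is an assumption
// (4 KB system pages).
constexpr uint32_t kPageSizeLog2 = 12;

// Block index and single-page bit for a physical address.
inline uint32_t PageBlock(uint32_t address) {
  return (address >> kPageSizeLog2) >> 6;
}
inline uint64_t PageBit(uint32_t address) {
  return uint64_t(1) << ((address >> kPageSizeLog2) & 63);
}

// Mask of the pages [page_first, page_last] inside one 64-page block - the
// same masking MakeRangeValid applies to the first and last blocks of a range.
inline uint64_t BlockPageMask(uint32_t page_first, uint32_t page_last) {
  uint64_t mask = ~uint64_t(0);
  mask &= ~((uint64_t(1) << (page_first & 63)) - 1);
  if ((page_last & 63) != 63) {
    mask &= (uint64_t(1) << ((page_last & 63) + 1)) - 1;
  }
  return mask;
}

With 4 KB pages the 512 MB space is 131072 pages, so system_page_flags_ holds 2048 of these 64-bit blocks per bit field.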
+  watch_node_first_free_ = nullptr;
+  watch_node_current_pool_allocated_ = 0;
+  for (WatchNode* pool : watch_node_pools_) {
+    delete[] pool;
+  }
+  watch_node_pools_.clear();
+  watch_range_first_free_ = nullptr;
+  watch_range_current_pool_allocated_ = 0;
+  for (WatchRange* pool : watch_range_pools_) {
+    delete[] pool;
+  }
+  watch_range_pools_.clear();
+
+  if (memory_invalidation_callback_handle_ != nullptr) {
+    memory_.UnregisterPhysicalMemoryInvalidationCallback(
+        memory_invalidation_callback_handle_);
+    memory_invalidation_callback_handle_ = nullptr;
+  }
+}
+
+void SharedMemory::ClearCache() {
+  // Keeping GPU-written data, so "invalidated by GPU".
+  FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true);
+  // No watches now, so no references to the pools accessible by guest threads -
+  // safe not to enter the global critical region.
+  watch_node_first_free_ = nullptr;
+  watch_node_current_pool_allocated_ = 0;
+  for (WatchNode* pool : watch_node_pools_) {
+    delete[] pool;
+  }
+  watch_node_pools_.clear();
+  watch_range_first_free_ = nullptr;
+  watch_range_current_pool_allocated_ = 0;
+  for (WatchRange* pool : watch_range_pools_) {
+    delete[] pool;
+  }
+  watch_range_pools_.clear();
+
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    for (SystemPageFlagsBlock& block : system_page_flags_) {
+      block.valid = block.valid_and_gpu_written;
+    }
+  }
+}
+
+SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch(
+    GlobalWatchCallback callback, void* callback_context) {
+  GlobalWatch* watch = new GlobalWatch;
+  watch->callback = callback;
+  watch->callback_context = callback_context;
+
+  auto global_lock = global_critical_region_.Acquire();
+  global_watches_.push_back(watch);
+
+  return reinterpret_cast<GlobalWatchHandle>(watch);
+}
+
+void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) {
+  auto watch = reinterpret_cast<GlobalWatch*>(handle);
+
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    auto it = std::find(global_watches_.begin(), global_watches_.end(), watch);
+    assert_false(it == global_watches_.end());
+    if (it != global_watches_.end()) {
+      global_watches_.erase(it);
+    }
+  }
+
+  delete watch;
+}
+
+SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
+    uint32_t start, uint32_t length, WatchCallback callback,
+    void* callback_context, void* callback_data, uint64_t callback_argument) {
+  if (length == 0 || start >= kBufferSize) {
+    return nullptr;
+  }
+  length = std::min(length, kBufferSize - start);
+  uint32_t watch_page_first = start >> page_size_log2_;
+  uint32_t watch_page_last = (start + length - 1) >> page_size_log2_;
+  uint32_t bucket_first =
+      watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
+  uint32_t bucket_last =
+      watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  // Allocate the range.
+  WatchRange* range = watch_range_first_free_;
+  if (range != nullptr) {
+    watch_range_first_free_ = range->next_free;
+  } else {
+    if (watch_range_pools_.empty() ||
+        watch_range_current_pool_allocated_ >= kWatchRangePoolSize) {
+      watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]);
+      watch_range_current_pool_allocated_ = 0;
+    }
+    range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]);
+  }
+  range->callback = callback;
+  range->callback_context = callback_context;
+  range->callback_data = callback_data;
+  range->callback_argument = callback_argument;
+  range->page_first = watch_page_first;
+  range->page_last = watch_page_last;
+
+  // Allocate and link the nodes.
+  WatchNode* node_previous = nullptr;
+  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
+    WatchNode* node = watch_node_first_free_;
+    if (node != nullptr) {
+      watch_node_first_free_ = node->next_free;
+    } else {
+      if (watch_node_pools_.empty() ||
+          watch_node_current_pool_allocated_ >= kWatchNodePoolSize) {
+        watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]);
+        watch_node_current_pool_allocated_ = 0;
+      }
+      node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]);
+    }
+    node->range = range;
+    node->range_node_next = nullptr;
+    if (node_previous != nullptr) {
+      node_previous->range_node_next = node;
+    } else {
+      range->node_first = node;
+    }
+    node_previous = node;
+    node->bucket_node_previous = nullptr;
+    node->bucket_node_next = watch_buckets_[i];
+    if (watch_buckets_[i] != nullptr) {
+      watch_buckets_[i]->bucket_node_previous = node;
+    }
+    watch_buckets_[i] = node;
+  }
+
+  return reinterpret_cast<WatchHandle>(range);
+}
+
+void SharedMemory::UnwatchMemoryRange(WatchHandle handle) {
+  if (handle == nullptr) {
+    // Could be a zero length range.
+    return;
+  }
+  auto global_lock = global_critical_region_.Acquire();
+  UnlinkWatchRange(reinterpret_cast<WatchRange*>(handle));
+}
+
+void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
+                               bool invalidated_by_gpu) {
+  uint32_t address_first = page_first << page_size_log2_;
+  uint32_t address_last =
+      (page_last << page_size_log2_) + ((1 << page_size_log2_) - 1);
+  uint32_t bucket_first = address_first >> kWatchBucketSizeLog2;
+  uint32_t bucket_last = address_last >> kWatchBucketSizeLog2;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  // Fire global watches.
+  for (const auto global_watch : global_watches_) {
+    global_watch->callback(global_watch->callback_context, address_first,
+                           address_last, invalidated_by_gpu);
+  }
+
+  // Fire per-range watches.
+  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
+    WatchNode* node = watch_buckets_[i];
+    while (node != nullptr) {
+      WatchRange* range = node->range;
+      // Store the next node now since when the callback is triggered, the links
+      // will be broken.
+      node = node->bucket_node_next;
+      if (page_first <= range->page_last && page_last >= range->page_first) {
+        range->callback(range->callback_context, range->callback_data,
+                        range->callback_argument, invalidated_by_gpu);
+        UnlinkWatchRange(range);
+      }
+    }
+  }
+}
+
+void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
+  if (length == 0 || start >= kBufferSize) {
+    return;
+  }
+  length = std::min(length, kBufferSize - start);
+  uint32_t end = start + length - 1;
+  uint32_t page_first = start >> page_size_log2_;
+  uint32_t page_last = end >> page_size_log2_;
+
+  // Trigger modification callbacks so, for instance, resolved data is loaded to
+  // the texture.
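As an aside on the watch machinery above: WatchMemoryRange and FireWatches split the address space into fixed-size buckets (kWatchBucketSizeLog2, declared elsewhere in the class), a range gets one WatchNode linked into every bucket it overlaps, so firing only walks the buckets the touched pages fall into, and a fired range is unlinked immediately - watches are one-shot, so a consumer such as a cached texture re-registers after it reuploads the invalidated data. A rough sketch of the bucket arithmetic; the bucket size below is an assumed value for illustration, not taken from the patch:

#include <cstdint>

// Assumed values for illustration; the real constants live in SharedMemory.
constexpr uint32_t kBufferSizeLog2 = 29;       // 512 MB shared memory space
constexpr uint32_t kWatchBucketSizeLog2 = 22;  // assumed 4 MB buckets
constexpr uint32_t kWatchBucketCount =
    1u << (kBufferSizeLog2 - kWatchBucketSizeLog2);  // 128 with these values

// Buckets overlapped by the byte range [start, start + length) - essentially
// how WatchMemoryRange picks bucket_first / bucket_last before linking nodes.
inline void WatchBucketsForRange(uint32_t start, uint32_t length,
                                 uint32_t& bucket_first,
                                 uint32_t& bucket_last) {
  bucket_first = start >> kWatchBucketSizeLog2;
  bucket_last = (start + length - 1) >> kWatchBucketSizeLog2;
}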
+ FireWatches(page_first, page_last, true); + + // Mark the range as valid (so pages are not reuploaded until modified by the + // CPU) and watch it so the CPU can reuse it and this will be caught. + MakeRangeValid(start, length, true); +} + +void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, + bool written_by_gpu) { + if (length == 0 || start >= kBufferSize) { + return; + } + length = std::min(length, kBufferSize - start); + uint32_t last = start + length - 1; + uint32_t valid_page_first = start >> page_size_log2_; + uint32_t valid_page_last = last >> page_size_log2_; + uint32_t valid_block_first = valid_page_first >> 6; + uint32_t valid_block_last = valid_page_last >> 6; + + { + auto global_lock = global_critical_region_.Acquire(); + + for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) { + uint64_t valid_bits = UINT64_MAX; + if (i == valid_block_first) { + valid_bits &= ~((uint64_t(1) << (valid_page_first & 63)) - 1); + } + if (i == valid_block_last && (valid_page_last & 63) != 63) { + valid_bits &= (uint64_t(1) << ((valid_page_last & 63) + 1)) - 1; + } + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid |= valid_bits; + if (written_by_gpu) { + block.valid_and_gpu_written |= valid_bits; + } else { + block.valid_and_gpu_written &= ~valid_bits; + } + } + } + + if (memory_invalidation_callback_handle_) { + memory().EnablePhysicalMemoryAccessCallbacks( + valid_page_first << page_size_log2_, + (valid_page_last - valid_page_first + 1) << page_size_log2_, true, + false); + } +} + +void SharedMemory::UnlinkWatchRange(WatchRange* range) { + uint32_t bucket = + range->page_first << page_size_log2_ >> kWatchBucketSizeLog2; + WatchNode* node = range->node_first; + while (node != nullptr) { + WatchNode* node_next = node->range_node_next; + if (node->bucket_node_previous != nullptr) { + node->bucket_node_previous->bucket_node_next = node->bucket_node_next; + } else { + watch_buckets_[bucket] = node->bucket_node_next; + } + if (node->bucket_node_next != nullptr) { + node->bucket_node_next->bucket_node_previous = node->bucket_node_previous; + } + node->next_free = watch_node_first_free_; + watch_node_first_free_ = node; + node = node_next; + ++bucket; + } + range->next_free = watch_range_first_free_; + watch_range_first_free_ = range; +} + +bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { + if (!length) { + // Some texture or buffer is empty, for example - safe to draw in this case. + return true; + } + if (start > kBufferSize || (kBufferSize - start) < length) { + return false; + } + uint32_t last = start + length - 1; + + SCOPE_profile_cpu_f("gpu"); + + if (!EnsureHostGpuMemoryAllocated(start, length)) { + return false; + } + + uint32_t page_first = start >> page_size_log2_; + uint32_t page_last = (start + length - 1) >> page_size_log2_; + + upload_ranges_.clear(); + uint32_t block_first = page_first >> 6; + uint32_t block_last = page_last >> 6; + uint32_t range_start = UINT32_MAX; + { + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = block_first; i <= block_last; ++i) { + uint64_t block_valid = system_page_flags_[i].valid; + // Consider pages in the block outside the requested range valid. + if (i == block_first) { + block_valid |= (uint64_t(1) << (page_first & 63)) - 1; + } + if (i == block_last && (page_last & 63) != 63) { + block_valid |= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1); + } + + while (true) { + uint32_t block_page; + if (range_start == UINT32_MAX) { + // Check if need to open a new range. 
+        if (!xe::bit_scan_forward(~block_valid, &block_page)) {
+          break;
+        }
+        range_start = (i << 6) + block_page;
+      } else {
+        // Check if need to close the range.
+        // Ignore the valid pages before the beginning of the range.
+        uint64_t block_valid_from_start = block_valid;
+        if (i == (range_start >> 6)) {
+          block_valid_from_start &=
+              ~((uint64_t(1) << (range_start & 63)) - 1);
+        }
+        if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
+          break;
+        }
+        upload_ranges_.push_back(
+            std::make_pair(range_start, (i << 6) + block_page - range_start));
+        // In the next iteration within this block, consider this range valid
+        // since it has been queued for upload.
+        block_valid |= (uint64_t(1) << block_page) - 1;
+        range_start = UINT32_MAX;
+      }
+    }
+  }
+  }
+  if (range_start != UINT32_MAX) {
+    upload_ranges_.push_back(
+        std::make_pair(range_start, page_last + 1 - range_start));
+  }
+  if (upload_ranges_.empty()) {
+    return true;
+  }
+
+  return UploadRanges(upload_ranges_);
+}
+
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
+    void* context_ptr, uint32_t physical_address_start, uint32_t length,
+    bool exact_range) {
+  return reinterpret_cast<SharedMemory*>(context_ptr)
+      ->MemoryInvalidationCallback(physical_address_start, length, exact_range);
+}
+
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
+    uint32_t physical_address_start, uint32_t length, bool exact_range) {
+  if (length == 0 || physical_address_start >= kBufferSize) {
+    return std::make_pair(uint32_t(0), UINT32_MAX);
+  }
+  length = std::min(length, kBufferSize - physical_address_start);
+  uint32_t physical_address_last = physical_address_start + (length - 1);
+
+  uint32_t page_first = physical_address_start >> page_size_log2_;
+  uint32_t page_last = physical_address_last >> page_size_log2_;
+  uint32_t block_first = page_first >> 6;
+  uint32_t block_last = page_last >> 6;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  if (!exact_range) {
+    // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
+    // invalidated - if no GPU-written data nearby that was not intended to be
+    // invalidated since it's not in sync with CPU memory and can't be
+    // reuploaded. It's a lot cheaper to upload some excess data than to catch
+    // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
+    // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
+    // frame, but with 256 KB it's 0.7 ms.
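The comment above describes what the code right after it implements: an inexact invalidation is widened towards the boundaries of the surrounding 64-page block, but never across a page that still holds GPU-written (non-reuploadable) data, using leading/trailing zero counts on the valid_and_gpu_written bits. A standalone sketch of the two clamps, with portable stand-ins for xe::lzcnt and xe::tzcnt; the names here are illustrative, not from the patch:

#include <algorithm>
#include <cstdint>

// Portable stand-ins for xe::lzcnt / xe::tzcnt on 64-bit values (both return
// 64 when the value is zero).
inline uint32_t LeadingZeros64(uint64_t v) {
  uint32_t n = 64;
  while (v) {
    v >>= 1;
    --n;
  }
  return n;
}
inline uint32_t TrailingZeros64(uint64_t v) {
  if (!v) {
    return 64;
  }
  uint32_t n = 0;
  while (!(v & 1)) {
    v >>= 1;
    ++n;
  }
  return n;
}

// gpu_written_bits is the valid_and_gpu_written word of the 64-page block
// containing the page. Widen the start of the invalidated range down towards
// the block start, stopping just above the nearest GPU-written page below it.
inline uint32_t WidenFirstPage(uint32_t page_first, uint64_t gpu_written_bits) {
  if (!(page_first & 63)) {
    return page_first;  // already at a block boundary
  }
  uint64_t below = gpu_written_bits & ((uint64_t(1) << (page_first & 63)) - 1);
  return (page_first & ~uint32_t(63)) + (64 - LeadingZeros64(below));
}

// Widen the end of the range up towards the block end, stopping just below the
// nearest GPU-written page above it.
inline uint32_t WidenLastPage(uint32_t page_last, uint64_t gpu_written_bits) {
  if ((page_last & 63) == 63) {
    return page_last;  // already at a block boundary
  }
  uint64_t above =
      gpu_written_bits & ~((uint64_t(1) << ((page_last & 63) + 1)) - 1);
  return (page_last & ~uint32_t(63)) +
         (std::max(TrailingZeros64(above), uint32_t(1)) - 1);
}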
+ if (page_first & 63) { + uint64_t gpu_written_start = + system_page_flags_[block_first].valid_and_gpu_written; + gpu_written_start &= (uint64_t(1) << (page_first & 63)) - 1; + page_first = + (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); + } + if ((page_last & 63) != 63) { + uint64_t gpu_written_end = + system_page_flags_[block_last].valid_and_gpu_written; + gpu_written_end &= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1); + page_last = (page_last & ~uint32_t(63)) + + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); + } + } + + for (uint32_t i = block_first; i <= block_last; ++i) { + uint64_t invalidate_bits = UINT64_MAX; + if (i == block_first) { + invalidate_bits &= ~((uint64_t(1) << (page_first & 63)) - 1); + } + if (i == block_last && (page_last & 63) != 63) { + invalidate_bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1; + } + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid &= ~invalidate_bits; + block.valid_and_gpu_written &= ~invalidate_bits; + } + + FireWatches(page_first, page_last, false); + + return std::make_pair(page_first << page_size_log2_, + (page_last - page_first + 1) << page_size_log2_); +} + +void SharedMemory::PrepareForTraceDownload() { + ReleaseTraceDownloadRanges(); + assert_true(trace_download_ranges_.empty()); + assert_zero(trace_download_page_count_); + + // Invalidate the entire memory CPU->GPU memory copy so all the history + // doesn't have to be written into every frame trace, and collect the list of + // ranges with data modified on the GPU. + + uint32_t fire_watches_range_start = UINT32_MAX; + uint32_t gpu_written_range_start = UINT32_MAX; + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; + uint64_t previously_valid_block = page_flags_block.valid; + uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; + page_flags_block.valid = gpu_written_block; + + // Fire watches on the invalidated pages. + uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; + uint64_t fire_watches_break_block = ~fire_watches_block; + while (true) { + uint32_t fire_watches_block_page; + if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX + ? fire_watches_block + : fire_watches_break_block, + &fire_watches_block_page)) { + break; + } + uint32_t fire_watches_page = (i << 6) + fire_watches_block_page; + if (fire_watches_range_start == UINT32_MAX) { + fire_watches_range_start = fire_watches_page; + } else { + FireWatches(fire_watches_range_start, fire_watches_page - 1, false); + fire_watches_range_start = UINT32_MAX; + } + uint64_t fire_watches_block_mask = + ~((uint64_t(1) << fire_watches_block_page) - 1); + fire_watches_block &= fire_watches_block_mask; + fire_watches_break_block &= fire_watches_block_mask; + } + + // Add to the GPU-written ranges. + uint64_t gpu_written_break_block = ~gpu_written_block; + while (true) { + uint32_t gpu_written_block_page; + if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX + ? 
gpu_written_block + : gpu_written_break_block, + &gpu_written_block_page)) { + break; + } + uint32_t gpu_written_page = (i << 6) + gpu_written_block_page; + if (gpu_written_range_start == UINT32_MAX) { + gpu_written_range_start = gpu_written_page; + } else { + uint32_t gpu_written_range_length = + gpu_written_page - gpu_written_range_start; + trace_download_ranges_.push_back( + std::make_pair(gpu_written_range_start << page_size_log2_, + gpu_written_range_length << page_size_log2_)); + trace_download_page_count_ += gpu_written_range_length; + gpu_written_range_start = UINT32_MAX; + } + uint64_t gpu_written_block_mask = + ~((uint64_t(1) << gpu_written_block_page) - 1); + gpu_written_block &= gpu_written_block_mask; + gpu_written_break_block &= gpu_written_block_mask; + } + } + uint32_t page_count = kBufferSize >> page_size_log2_; + if (fire_watches_range_start != UINT32_MAX) { + FireWatches(fire_watches_range_start, page_count - 1, false); + } + if (gpu_written_range_start != UINT32_MAX) { + uint32_t gpu_written_range_length = page_count - gpu_written_range_start; + trace_download_ranges_.push_back( + std::make_pair(gpu_written_range_start << page_size_log2_, + gpu_written_range_length << page_size_log2_)); + trace_download_page_count_ += gpu_written_range_length; + } +} + +void SharedMemory::ReleaseTraceDownloadRanges() { + trace_download_ranges_.clear(); + trace_download_ranges_.shrink_to_fit(); + trace_download_page_count_ = 0; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/shared_memory.h similarity index 61% rename from src/xenia/gpu/d3d12/shared_memory.h rename to src/xenia/gpu/shared_memory.h index 86a55b2b7..6dae85909 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -2,49 +2,32 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2020 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ -#ifndef XENIA_GPU_D3D12_SHARED_MEMORY_H_ -#define XENIA_GPU_D3D12_SHARED_MEMORY_H_ +#ifndef XENIA_GPU_SHARED_MEMORY_H_ +#define XENIA_GPU_SHARED_MEMORY_H_ -#include +#include #include #include #include "xenia/base/mutex.h" -#include "xenia/gpu/trace_writer.h" #include "xenia/memory.h" -#include "xenia/ui/d3d12/d3d12_api.h" -#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h" namespace xe { namespace gpu { -namespace d3d12 { - -class D3D12CommandProcessor; // Manages memory for unconverted textures, resolve targets, vertex and index // buffers that can be accessed from shaders with Xenon physical addresses, with // system page size granularity. class SharedMemory { public: - SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory, - TraceWriter& trace_writer); - ~SharedMemory(); - - bool Initialize(); - void Shutdown(); - void ClearCache(); - - ID3D12Resource* GetBuffer() const { return buffer_; } - D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { - return buffer_gpu_address_; - } - - void CompletedSubmissionUpdated(); + virtual ~SharedMemory(); + // Call in the implementation-specific ClearCache. 
+ virtual void ClearCache(); typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first, uint32_t address_last, @@ -86,10 +69,8 @@ class SharedMemory { void UnwatchMemoryRange(WatchHandle handle); // Checks if the range has been updated, uploads new data if needed and - // ensures the buffer tiles backing the range are resident. May transition the - // tiled buffer to copy destination - call this before UseForReading or - // UseForWriting. Returns true if the range has been fully updated and is - // usable. + // ensures the host GPU memory backing the range are resident. Returns true if + // the range has been fully updated and is usable. bool RequestRange(uint32_t start, uint32_t length); // Marks the range and, if not exact_range, potentially its surroundings @@ -106,124 +87,83 @@ class SharedMemory { // be called, to make sure, if the GPU writes don't overwrite *everything* in // the pages they touch, the CPU data is properly loaded to the unmodified // regions in those pages. - void RangeWrittenByGPU(uint32_t start, uint32_t length); + void RangeWrittenByGpu(uint32_t start, uint32_t length); - // Makes the buffer usable for vertices, indices and texture untiling. - inline void UseForReading() { - // Vertex fetch is also allowed in pixel shaders. - CommitUAVWritesAndTransitionBuffer( - D3D12_RESOURCE_STATE_INDEX_BUFFER | - D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | - D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); - } - // Makes the buffer usable for texture tiling after a resolve. - inline void UseForWriting() { - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - } - // Makes the buffer usable as a source for copy commands. - inline void UseAsCopySource() { - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); - } - // Must be called when doing draws/dispatches modifying data within the shared - // memory buffer as a UAV, to make sure that when UseForWriting is called the - // next time, a UAV barrier will be done, and subsequent overlapping UAV - // writes and reads are ordered. - inline void MarkUAVWritesCommitNeeded() { - if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { - buffer_uav_writes_commit_needed_ = true; - } - } + protected: + SharedMemory(Memory& memory); + // Call in implementation-specific initialization. + void InitializeCommon(); + // Call last in implementation-specific shutdown, also callable from the + // destructor. + void ShutdownCommon(); - void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); - void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); - // Due to the Nvidia 128 megatexel limitation, the smallest supported formats - // are 32-bit. - void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, - uint32_t element_size_bytes_pow2); - void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, - uint32_t element_size_bytes_pow2); + static constexpr uint32_t kBufferSizeLog2 = 29; + static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; - // Returns true if any downloads were submitted to the command processor. - bool InitializeTraceSubmitDownloads(); - void InitializeTraceCompleteDownloads(); + // Sparse allocations are 4 MB, so not too many of them are allocated, but + // also not to waste too much memory for padding (with 16 MB there's too + // much). 
+  static constexpr uint32_t kOptimalAllocationLog2 = 22;
+  static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
- private:
-  bool AreTiledResourcesUsed() const;
+  Memory& memory() const { return memory_; }
+
+  uint32_t page_size_log2() const { return page_size_log2_; }
   // Mark the memory range as updated and protect it.
   void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
-  D3D12CommandProcessor& command_processor_;
+  // Ensures the host GPU memory backing the range is accessible by host GPU
+  // drawing / computations / copying, but doesn't upload anything.
+  virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
+                                            uint32_t length) = 0;
+
+  // Uploads a range of host pages - only called if EnsureHostGpuMemoryAllocated
+  // succeeded. While uploading, MakeRangeValid must be called for each
+  // successfully uploaded range as early as possible, before the memcpy, to
+  // make sure invalidation that happened during the CPU -> GPU memcpy isn't
+  // missed (upload_page_ranges is in pages because of this - MakeRangeValid has
+  // page granularity).
+  virtual bool UploadRanges(
+      const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
+
+  // Mutable so the implementation can skip ranges by setting their "second"
+  // value to 0 if needed.
+  std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
+    return trace_download_ranges_;
+  }
+  uint32_t trace_download_page_count() const {
+    return trace_download_page_count_;
+  }
+  // Fills trace_download_ranges() and trace_download_page_count() with
+  // GPU-written ranges that need to be downloaded, and also invalidates
+  // non-GPU-written ranges so only the needed data - not all the collected
+  // data - will be written in the trace. trace_download_page_count() will be 0
+  // if nothing to download.
+  void PrepareForTraceDownload();
+  // Release memory used for trace download ranges, to be called after
+  // downloading or in cases when download is dropped.
+  void ReleaseTraceDownloadRanges();
+
+ private:
   Memory& memory_;
-  TraceWriter& trace_writer_;
-
-  // The 512 MB tiled buffer.
-  static constexpr uint32_t kBufferSizeLog2 = 29;
-  static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
-  ID3D12Resource* buffer_ = nullptr;
-  D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
-  D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
-  bool buffer_uav_writes_commit_needed_ = false;
-  void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
-
-  // Heaps are 4 MB, so not too many of them are allocated, but also not to
-  // waste too much memory for padding (with 16 MB there's too much).
-  static constexpr uint32_t kHeapSizeLog2 = 22;
-  static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
-  static_assert((kHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0,
-                "Heap size must be a multiple of Direct3D tile size");
-  // Resident portions of the tiled buffer.
-  ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
-  // Number of the heaps currently resident, for profiling.
-  uint32_t heap_count_ = 0;
   // Log2 of invalidation granularity (the system page size, but the dependency
   // on it is not hard - the access callback takes a range as an argument, and
   // touched pages of the buffer of this size will be invalidated).
   uint32_t page_size_log2_;
-  // Total buffer page count.
-  uint32_t page_count_;
-
-  // Ensures the buffer tiles backing the range are resident, but doesn't upload
-  // anything.
-  bool EnsureTilesResident(uint32_t start, uint32_t length);
-
-  // Non-shader-visible buffer descriptor heap for faster binding (via copying
-  // rather than creation).
-  enum class BufferDescriptorIndex : uint32_t {
-    kRawSRV,
-    kR32UintSRV,
-    kR32G32UintSRV,
-    kR32G32B32A32UintSRV,
-    kRawUAV,
-    kR32UintUAV,
-    kR32G32UintUAV,
-    kR32G32B32A32UintUAV,
-
-    kCount,
-  };
-  ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr;
-  D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_;
-
-  // First page and length in pages.
-  typedef std::pair<uint32_t, uint32_t> UploadRange;
-  // Ranges that need to be uploaded, generated by GetRangesToUpload (a
-  // persistently allocated vector).
-  std::vector<UploadRange> upload_ranges_;
-  void GetRangesToUpload(uint32_t request_page_first,
-                         uint32_t request_page_last);
-  std::unique_ptr upload_buffer_pool_;
-
-  // GPU-written memory downloading for traces.
-  // Start page, length in pages.
-  std::vector<std::pair<uint32_t, uint32_t>> trace_gpu_written_ranges_;
-  // Created temporarily, only for downloading.
-  ID3D12Resource* trace_gpu_written_buffer_ = nullptr;
-  void ResetTraceGPUWrittenBuffer();
   void* memory_invalidation_callback_handle_ = nullptr;
   void* memory_data_provider_handle_ = nullptr;
+  // Ranges that need to be uploaded, generated by GetRangesToUpload (a
+  // persistently allocated vector).
+  std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
+
+  // GPU-written memory downloading for traces.
+  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
+  uint32_t trace_download_page_count_ = 0;
+
   // Mutex between the guest memory subsystem and the command processor, to be
   // locked when checking or updating validity of pages/ranges and when firing
   // watches.
@@ -309,8 +249,7 @@ class SharedMemory {
   void UnlinkWatchRange(WatchRange* range);
 };

-}  // namespace d3d12
 }  // namespace gpu
 }  // namespace xe

-#endif  // XENIA_GPU_D3D12_SHARED_MEMORY_H_
+#endif  // XENIA_GPU_SHARED_MEMORY_H_
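With this split, SharedMemory is API-agnostic: a backend such as the D3D12SharedMemory added earlier in this patch supplies residency and the actual CPU->GPU copies by overriding the two pure virtuals, honoring the contract described above that MakeRangeValid runs before each copy so a racing CPU write re-invalidates the pages instead of being lost. A minimal hypothetical backend, purely to illustrate that contract - the class, its behavior, and the comments about where the copies would go are invented for this sketch, not part of the patch:

#include <cstdint>
#include <utility>
#include <vector>

#include "xenia/gpu/shared_memory.h"

namespace xe {
namespace gpu {

// Hypothetical backend, only to show the shape of the new interface; the real
// implementation in this patch is D3D12SharedMemory. A real backend would also
// call InitializeCommon() / ShutdownCommon() from its Initialize / Shutdown.
class ExampleSharedMemory : public SharedMemory {
 public:
  explicit ExampleSharedMemory(Memory& memory) : SharedMemory(memory) {}

 protected:
  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override {
    // A tiled / sparse backend would map allocations covering
    // [start, start + length) here; a committed-buffer backend can just
    // return true.
    return true;
  }

  bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                        upload_page_ranges) override {
    for (const auto& range : upload_page_ranges) {
      uint32_t start = range.first << page_size_log2();
      uint32_t length = range.second << page_size_log2();
      // Mark the range valid before copying so an invalidation that happens
      // during the copy is not missed.
      MakeRangeValid(start, length, false);
      // The backend-specific copy of guest memory into the host GPU buffer
      // would go here.
    }
    return true;
  }
};

}  // namespace gpu
}  // namespace xe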