diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index 9eba0cac8..4891ab1bc 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -11,6 +11,7 @@ #include #include +#include #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" @@ -281,6 +282,14 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { if (ex->code() != Exception::Code::kAccessViolation) { return false; } + Exception::AccessViolationOperation operation = + ex->access_violation_operation(); + if (operation != Exception::AccessViolationOperation::kRead && + operation != Exception::AccessViolationOperation::kWrite) { + // Data Execution Prevention or something else uninteresting. + return false; + } + bool is_write = operation == Exception::AccessViolationOperation::kWrite; if (ex->fault_address() < uint64_t(virtual_membase_) || ex->fault_address() > uint64_t(memory_end_)) { // Quick kill anything outside our mapping. @@ -304,32 +313,23 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { } if (!range) { // Recheck if the pages are still protected (race condition - another thread - // clears the writewatch we just hit). + // clears the watch we just hit). // Do this under the lock so we don't introduce another race condition. auto lock = global_critical_region_.Acquire(); memory::PageAccess cur_access; size_t page_length = memory::page_size(); memory::QueryProtect(fault_host_address, page_length, cur_access); - if (cur_access != memory::PageAccess::kReadOnly && - cur_access != memory::PageAccess::kNoAccess) { - // Another thread has cleared this write watch. Abort. + if (cur_access != memory::PageAccess::kNoAccess && + (!is_write || cur_access != memory::PageAccess::kReadOnly)) { + // Another thread has cleared this watch. Abort. return true; } - // The address is not found within any range, so either a write watch or an // actual access violation. if (access_violation_callback_) { - switch (ex->access_violation_operation()) { - case Exception::AccessViolationOperation::kRead: - return access_violation_callback_(access_violation_callback_context_, - fault_host_address, false); - case Exception::AccessViolationOperation::kWrite: - return access_violation_callback_(access_violation_callback_context_, - fault_host_address, true); - default: - // Data Execution Prevention or something else uninteresting. - break; - } + return access_violation_callback_(std::move(lock), + access_violation_callback_context_, + fault_host_address, is_write); } return false; } diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 1bff31216..fdf202e1c 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -11,6 +11,7 @@ #define XENIA_CPU_MMIO_HANDLER_H_ #include +#include #include #include "xenia/base/mutex.h" @@ -44,12 +45,13 @@ class MMIOHandler { typedef uint32_t (*HostToGuestVirtual)(const void* context, const void* host_address); - typedef bool (*AccessViolationCallback)(void* context, void* host_address, - bool is_write); + typedef bool (*AccessViolationCallback)( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write); - // access_violation_callback is called in global_critical_region, so if - // multiple threads trigger an access violation in the same page, the callback - // will be called only once. 
+ // access_violation_callback is called with global_critical_region locked once + // on the thread, so if multiple threads trigger an access violation in the + // same page, the callback will be called only once. static std::unique_ptr Install( uint8_t* virtual_membase, uint8_t* physical_membase, uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 0be3e5b5d..432a6ea11 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -87,8 +87,8 @@ void D3D12CommandProcessor::RequestFrameTrace(const std::wstring& root_path) { void D3D12CommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr, uint32_t length) { - shared_memory_->MemoryWriteCallback(base_ptr, length, true); - primitive_converter_->MemoryWriteCallback(base_ptr, length, true); + shared_memory_->MemoryInvalidationCallback(base_ptr, length, true); + primitive_converter_->MemoryInvalidationCallback(base_ptr, length, true); } void D3D12CommandProcessor::RestoreEDRAMSnapshot(const void* snapshot) { @@ -866,6 +866,7 @@ bool D3D12CommandProcessor::SetupContext() { if (FAILED(gamma_ramp_upload_->Map( 0, nullptr, reinterpret_cast(&gamma_ramp_upload_mapping_)))) { XELOGE("Failed to map the gamma ramp upload buffer"); + gamma_ramp_upload_mapping_ = nullptr; return false; } @@ -1827,42 +1828,24 @@ bool D3D12CommandProcessor::IssueCopy() { return true; } -void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { -#if FINE_GRAINED_DRAW_SCOPES - SCOPE_profile_cpu_f("gpu"); -#endif // FINE_GRAINED_DRAW_SCOPES - - bool is_opening_frame = is_guest_command && !frame_open_; - if (submission_open_ && !is_opening_frame) { - return; +void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) { + assert_true(await_submission <= submission_current_); + if (await_submission == submission_current_) { + assert_true(submission_open_); + EndSubmission(false); } - // Check the fence - needed for all kinds of submissions (to reclaim transient - // resources early) and specifically for frames (not to queue too many). + uint64_t submission_completed_before = submission_completed_; submission_completed_ = submission_fence_->GetCompletedValue(); - if (is_opening_frame) { - // Await the availability of the current frame. - uint64_t frame_current_last_submission = - closed_frame_submissions_[frame_current_ % kQueueFrames]; - if (frame_current_last_submission > submission_completed_) { - submission_fence_->SetEventOnCompletion( - frame_current_last_submission, submission_fence_completion_event_); - WaitForSingleObject(submission_fence_completion_event_, INFINITE); - submission_completed_ = submission_fence_->GetCompletedValue(); - } - // Update the completed frame index, also obtaining the actual completed - // frame number (since the CPU may be actually less than 3 frames behind) - // before reclaiming resources tracked with the frame number. 
- frame_completed_ = - std::max(frame_current_, uint64_t(kQueueFrames)) - kQueueFrames; - for (uint64_t frame = frame_completed_ + 1; frame < frame_current_; - ++frame) { - if (closed_frame_submissions_[frame % kQueueFrames] > - submission_completed_) { - break; - } - frame_completed_ = frame; - } + if (submission_completed_ < await_submission) { + submission_fence_->SetEventOnCompletion(await_submission, + submission_fence_completion_event_); + WaitForSingleObject(submission_fence_completion_event_, INFINITE); + submission_completed_ = submission_fence_->GetCompletedValue(); + } + if (submission_completed_ <= submission_completed_before) { + // Not updated - no need to reclaim or download things. + return; } // Reclaim command allocators. @@ -1898,6 +1881,46 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { } buffers_for_deletion_.erase(buffers_for_deletion_.begin(), erase_buffers_end); + shared_memory_->CompletedSubmissionUpdated(); + + render_target_cache_->CompletedSubmissionUpdated(); + + primitive_converter_->CompletedSubmissionUpdated(); +} + +void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + + bool is_opening_frame = is_guest_command && !frame_open_; + if (submission_open_ && !is_opening_frame) { + return; + } + + // Check the fence - needed for all kinds of submissions (to reclaim transient + // resources early) and specifically for frames (not to queue too many), and + // await the availability of the current frame. + CheckSubmissionFence( + is_opening_frame + ? closed_frame_submissions_[frame_current_ % kQueueFrames] + : 0); + if (is_opening_frame) { + // Update the completed frame index, also obtaining the actual completed + // frame number (since the CPU may be actually less than 3 frames behind) + // before reclaiming resources tracked with the frame number. + frame_completed_ = + std::max(frame_current_, uint64_t(kQueueFrames)) - kQueueFrames; + for (uint64_t frame = frame_completed_ + 1; frame < frame_current_; + ++frame) { + if (closed_frame_submissions_[frame % kQueueFrames] > + submission_completed_) { + break; + } + frame_completed_ = frame; + } + } + if (!submission_open_) { submission_open_ = true; @@ -1920,8 +1943,6 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { current_sampler_heap_ = nullptr; primitive_topology_ = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; - shared_memory_->BeginSubmission(); - render_target_cache_->BeginSubmission(); primitive_converter_->BeginSubmission(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 4a1050d61..3c42e67bb 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -229,6 +229,9 @@ class D3D12CommandProcessor : public CommandProcessor { // frame. EndSubmission(true) will close the frame no matter whether the // submission has already been closed. + // Rechecks submission number and reclaims per-submission resources. Pass 0 as + // the submission to await to simply check status. + void CheckSubmissionFence(uint64_t await_submission); // If is_guest_command is true, a new full frame - with full cleanup of // resources and, if needed, starting capturing - is opened if pending (as // opposed to simply resuming after mid-frame synchronization). 
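For context on the submission-fence split above: CheckSubmissionFence() now owns the await-and-reclaim logic that used to live in BeginSubmission(), so it can also be called just to refresh the completed value (await_submission == 0). The following minimal sketch illustrates that pattern in isolation; SubmissionTracker and its members are hypothetical illustration names, not part of this change, and only the standard D3D12 fence API is assumed.

// Sketch of the await-and-reclaim pattern behind CheckSubmissionFence().
#include <windows.h>
#include <d3d12.h>
#include <cstdint>

class SubmissionTracker {
 public:
  SubmissionTracker(ID3D12CommandQueue* queue, ID3D12Fence* fence)
      : queue_(queue), fence_(fence) {
    completion_event_ = CreateEventW(nullptr, FALSE, FALSE, nullptr);
  }
  ~SubmissionTracker() { CloseHandle(completion_event_); }

  // Signals the fence for the submission that has just been recorded and
  // returns its number; the number can later be passed to Await().
  uint64_t SignalSubmission() {
    queue_->Signal(fence_, submission_current_);
    return submission_current_++;
  }

  // Pass 0 to only refresh the completed value without blocking - this mirrors
  // CheckSubmissionFence(0) in the change above.
  void Await(uint64_t submission) {
    submission_completed_ = fence_->GetCompletedValue();
    if (submission_completed_ < submission) {
      fence_->SetEventOnCompletion(submission, completion_event_);
      WaitForSingleObject(completion_event_, INFINITE);
      submission_completed_ = fence_->GetCompletedValue();
    }
    // Per-submission resources (command allocators, upload pool pages) whose
    // last use is <= submission_completed_ can be reclaimed at this point.
  }

  uint64_t submission_completed() const { return submission_completed_; }

 private:
  ID3D12CommandQueue* queue_;
  ID3D12Fence* fence_;
  HANDLE completion_event_;
  uint64_t submission_current_ = 1;  // 1-based so 0 can mean "don't await".
  uint64_t submission_completed_ = 0;
};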
diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index a3dfec579..72bcfa545 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -124,16 +124,18 @@ bool PrimitiveConverter::Initialize() { static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress(); memory_regions_invalidated_.store(0ull, std::memory_order_relaxed); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); return true; } void PrimitiveConverter::Shutdown() { - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } ui::d3d12::util::ReleaseAndNull(static_ib_); ui::d3d12::util::ReleaseAndNull(static_ib_upload_); @@ -142,24 +144,25 @@ void PrimitiveConverter::Shutdown() { void PrimitiveConverter::ClearCache() { buffer_pool_->ClearCache(); } +void PrimitiveConverter::CompletedSubmissionUpdated() { + if (static_ib_upload_ && command_processor_->GetCompletedSubmission() >= + static_ib_upload_submission_) { + // Completely uploaded - release the upload buffer. + static_ib_upload_->Release(); + static_ib_upload_ = nullptr; + } +} + void PrimitiveConverter::BeginSubmission() { // Got a command list now - upload and transition the static index buffer if // needed. - if (static_ib_upload_) { - if (static_ib_upload_submission_ == UINT64_MAX) { - // Not uploaded yet - upload. - command_processor_->GetDeferredCommandList()->D3DCopyResource( - static_ib_, static_ib_upload_); - command_processor_->PushTransitionBarrier( - static_ib_, D3D12_RESOURCE_STATE_COPY_DEST, - D3D12_RESOURCE_STATE_INDEX_BUFFER); - static_ib_upload_submission_ = command_processor_->GetCurrentSubmission(); - } else if (command_processor_->GetCompletedSubmission() >= - static_ib_upload_submission_) { - // Completely uploaded - release the upload buffer. - static_ib_upload_->Release(); - static_ib_upload_ = nullptr; - } + if (static_ib_upload_ && static_ib_upload_submission_ == UINT64_MAX) { + command_processor_->GetDeferredCommandList()->D3DCopyResource( + static_ib_, static_ib_upload_); + command_processor_->PushTransitionBarrier( + static_ib_, D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_INDEX_BUFFER); + static_ib_upload_submission_ = command_processor_->GetCurrentSubmission(); } } @@ -706,7 +709,7 @@ void* PrimitiveConverter::AllocateIndices( return mapping + simd_offset; } -std::pair PrimitiveConverter::MemoryWriteCallback( +std::pair PrimitiveConverter::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size. 
uint32_t bit_index_first = physical_address_start >> 23; @@ -719,11 +722,12 @@ std::pair PrimitiveConverter::MemoryWriteCallback( return std::make_pair(0, UINT32_MAX); } -std::pair PrimitiveConverter::MemoryWriteCallbackThunk( +std::pair +PrimitiveConverter::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer( diff --git a/src/xenia/gpu/d3d12/primitive_converter.h b/src/xenia/gpu/d3d12/primitive_converter.h index f45c36146..57a3067b2 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.h +++ b/src/xenia/gpu/d3d12/primitive_converter.h @@ -46,6 +46,7 @@ class PrimitiveConverter { void Shutdown(); void ClearCache(); + void CompletedSubmissionUpdated(); void BeginSubmission(); void BeginFrame(); @@ -83,7 +84,7 @@ class PrimitiveConverter { uint32_t& index_count_out) const; // Callback for invalidating buffers mid-frame. - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); void InitializeTrace(); @@ -96,7 +97,7 @@ class PrimitiveConverter { uint32_t simd_offset, D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out); - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -176,7 +177,7 @@ class PrimitiveConverter { // the cache. uint64_t memory_regions_used_; std::atomic memory_regions_invalidated_ = 0; - void* physical_write_watch_handle_ = nullptr; + void* memory_invalidation_callback_handle_ = nullptr; uint32_t system_page_size_; }; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index df83d5df9..22c5cde59 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -455,12 +455,14 @@ void RenderTargetCache::ClearCache() { edram_snapshot_restore_pool_.reset(); } -void RenderTargetCache::BeginSubmission() { +void RenderTargetCache::CompletedSubmissionUpdated() { if (edram_snapshot_restore_pool_) { edram_snapshot_restore_pool_->Reclaim( command_processor_->GetCompletedSubmission()); } +} +void RenderTargetCache::BeginSubmission() { // With the ROV, a submission does not always end in a resolve (for example, // when memexport readback happens) or something else that would surely submit // the UAV barrier, so we need to preserve the `current_` variables. @@ -1417,8 +1419,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, return false; } } else { - if (!shared_memory->MakeTilesResident(dest_modified_start, - dest_modified_length)) { + if (!shared_memory->EnsureTilesResident(dest_modified_start, + dest_modified_length)) { return false; } } diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index e227a7b60..db1826ac5 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -257,6 +257,7 @@ class RenderTargetCache { void Shutdown(); void ClearCache(); + void CompletedSubmissionUpdated(); void BeginSubmission(); void EndFrame(); // Called in the beginning of a draw call - may bind pipelines. 
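The MemoryInvalidationCallback contract used by PrimitiveConverter above (and by SharedMemory below) is: given a physical range being invalidated, mark any dependent cached data dirty, and return the range that may safely stop being watched - (0, UINT32_MAX) when the handler does not care which pages stay protected. Below is a rough self-contained sketch of such a handler, tracking validity at a coarse 8 MB granularity like PrimitiveConverter does; CoarseCache is a hypothetical name, and only the callback signature matches the new Memory::PhysicalMemoryInvalidationCallback type.

#include <atomic>
#include <cstdint>
#include <utility>

class CoarseCache {
 public:
  // Returns the physical range that may safely be unwatched; (0, UINT32_MAX)
  // means any page may be unwatched as far as this cache is concerned, since
  // invalidation is tracked far more coarsely than per page. Assumes addresses
  // within the 512 MB physical space.
  std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
      uint32_t physical_address_start, uint32_t length, bool exact_range) {
    if (length != 0) {
      // 1 bit = 8 MB of the 512 MB physical space, as in PrimitiveConverter.
      uint32_t bit_first = physical_address_start >> 23;
      uint32_t bit_last = (physical_address_start + length - 1) >> 23;
      uint64_t bits = ~((uint64_t(1) << bit_first) - 1);
      if (bit_last < 63) {
        bits &= (uint64_t(1) << (bit_last + 1)) - 1;
      }
      regions_invalidated_.fetch_or(bits, std::memory_order_relaxed);
    }
    return std::make_pair(uint32_t(0), UINT32_MAX);
  }

  static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
      void* context_ptr, uint32_t physical_address_start, uint32_t length,
      bool exact_range) {
    return reinterpret_cast<CoarseCache*>(context_ptr)
        ->MemoryInvalidationCallback(physical_address_start, length,
                                     exact_range);
  }

 private:
  std::atomic<uint64_t> regions_invalidated_{0};
};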
diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index 1b80cb271..a88537672 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include "xenia/base/assert.h" @@ -49,11 +50,6 @@ SharedMemory::SharedMemory(D3D12CommandProcessor* command_processor, trace_writer_(trace_writer) { page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); page_count_ = kBufferSize >> page_size_log2_; - uint32_t page_bitmap_length = page_count_ >> 6; - assert_true(page_bitmap_length != 0); - - // Two interleaved bit arrays. - valid_and_gpu_written_pages_.resize(page_bitmap_length << 1); } SharedMemory::~SharedMemory() { Shutdown(); } @@ -125,14 +121,16 @@ bool SharedMemory::Initialize() { uint32_t(BufferDescriptorIndex::kRawUAV)), buffer_, kBufferSize); - std::memset(valid_and_gpu_written_pages_.data(), 0, - valid_and_gpu_written_pages_.size() * sizeof(uint64_t)); + system_page_flags_.clear(); + system_page_flags_.resize((page_count_ + 63) / 64); - upload_buffer_pool_ = - std::make_unique(device, 4 * 1024 * 1024); + upload_buffer_pool_ = std::make_unique( + device, + xe::align(uint32_t(4 * 1024 * 1024), uint32_t(1) << page_size_log2_)); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); ResetTraceGPUWrittenBuffer(); @@ -144,9 +142,10 @@ void SharedMemory::Shutdown() { // TODO(Triang3l): Do something in case any watches are still registered. - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } upload_buffer_pool_.reset(); @@ -165,7 +164,7 @@ void SharedMemory::Shutdown() { } } -void SharedMemory::BeginSubmission() { +void SharedMemory::CompletedSubmissionUpdated() { upload_buffer_pool_->Reclaim(command_processor_->GetCompletedSubmission()); } @@ -273,7 +272,7 @@ void SharedMemory::UnwatchMemoryRange(WatchHandle handle) { UnlinkWatchRange(reinterpret_cast(handle)); } -bool SharedMemory::MakeTilesResident(uint32_t start, uint32_t length) { +bool SharedMemory::EnsureTilesResident(uint32_t start, uint32_t length) { if (length == 0) { // Some texture is empty, for example - safe to draw in this case. return true; @@ -347,7 +346,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { #endif // FINE_GRAINED_DRAW_SCOPES // Ensure all tile heaps are present. 
- if (!MakeTilesResident(start, length)) { + if (!EnsureTilesResident(start, length)) { return false; } @@ -375,7 +374,8 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { return false; } uint32_t upload_buffer_pages = upload_buffer_size >> page_size_log2_; - MakeRangeValid(upload_range_start, upload_buffer_pages, false); + MakeRangeValid(upload_range_start << page_size_log2_, + upload_buffer_pages << page_size_log2_, false); std::memcpy( upload_buffer_mapping, memory_->TranslatePhysical(upload_range_start << page_size_log2_), @@ -439,7 +439,7 @@ void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) { // Mark the range as valid (so pages are not reuploaded until modified by the // CPU) and watch it so the CPU can reuse it and this will be caught. - MakeRangeValid(page_first, page_last - page_first + 1, true); + MakeRangeValid(start, length, true); } bool SharedMemory::AreTiledResourcesUsed() const { @@ -453,14 +453,15 @@ bool SharedMemory::AreTiledResourcesUsed() const { provider->GetGraphicsAnalysis() == nullptr; } -void SharedMemory::MakeRangeValid(uint32_t valid_page_first, - uint32_t valid_page_count, +void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu) { - if (valid_page_first >= page_count_ || valid_page_count == 0) { + if (length == 0 || start >= kBufferSize) { return; } - valid_page_count = std::min(valid_page_count, page_count_ - valid_page_first); - uint32_t valid_page_last = valid_page_first + valid_page_count - 1; + length = std::min(length, kBufferSize - start); + uint32_t last = start + length - 1; + uint32_t valid_page_first = start >> page_size_log2_; + uint32_t valid_page_last = last >> page_size_log2_; uint32_t valid_block_first = valid_page_first >> 6; uint32_t valid_block_last = valid_page_last >> 6; @@ -475,18 +476,21 @@ void SharedMemory::MakeRangeValid(uint32_t valid_page_first, if (i == valid_block_last && (valid_page_last & 63) != 63) { valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1; } - valid_and_gpu_written_pages_[i << 1] |= valid_bits; + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid |= valid_bits; if (written_by_gpu) { - valid_and_gpu_written_pages_[(i << 1) + 1] |= valid_bits; + block.valid_and_gpu_written |= valid_bits; } else { - valid_and_gpu_written_pages_[(i << 1) + 1] &= ~valid_bits; + block.valid_and_gpu_written &= ~valid_bits; } } } - if (physical_write_watch_handle_) { - memory_->WatchPhysicalMemoryWrite(valid_page_first << page_size_log2_, - valid_page_count << page_size_log2_); + if (memory_invalidation_callback_handle_) { + memory_->EnablePhysicalMemoryAccessCallbacks( + valid_page_first << page_size_log2_, + (valid_page_last - valid_page_first + 1) << page_size_log2_, true, + false); } } @@ -527,7 +531,7 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first, uint32_t range_start = UINT32_MAX; for (uint32_t i = request_block_first; i <= request_block_last; ++i) { - uint64_t block_valid = valid_and_gpu_written_pages_[i << 1]; + uint64_t block_valid = system_page_flags_[i].valid; // Consider pages in the block outside the requested range valid. 
if (i == request_block_first) { block_valid |= (1ull << (request_page_first & 63)) - 1; @@ -569,17 +573,23 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first, } } -std::pair SharedMemory::MemoryWriteCallbackThunk( +std::pair SharedMemory::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } -std::pair SharedMemory::MemoryWriteCallback( +std::pair SharedMemory::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { + if (length == 0 || physical_address_start >= kBufferSize) { + return std::make_pair(uint32_t(0), UINT32_MAX); + } + length = std::min(length, kBufferSize - physical_address_start); + uint32_t physical_address_last = physical_address_start + (length - 1); + uint32_t page_first = physical_address_start >> page_size_log2_; - uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_; + uint32_t page_last = physical_address_last >> page_size_log2_; assert_true(page_first < page_count_ && page_last < page_count_); uint32_t block_first = page_first >> 6; uint32_t block_last = page_last >> 6; @@ -596,14 +606,14 @@ std::pair SharedMemory::MemoryWriteCallback( // frame, but with 256 KB it's 0.7 ms. if (page_first & 63) { uint64_t gpu_written_start = - valid_and_gpu_written_pages_[(block_first << 1) + 1]; + system_page_flags_[block_first].valid_and_gpu_written; gpu_written_start &= (1ull << (page_first & 63)) - 1; page_first = (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); } if ((page_last & 63) != 63) { uint64_t gpu_written_end = - valid_and_gpu_written_pages_[(block_last << 1) + 1]; + system_page_flags_[block_last].valid_and_gpu_written; gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); page_last = (page_last & ~uint32_t(63)) + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); @@ -618,8 +628,9 @@ std::pair SharedMemory::MemoryWriteCallback( if (i == block_last && (page_last & 63) != 63) { invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1; } - valid_and_gpu_written_pages_[i << 1] &= ~invalidate_bits; - valid_and_gpu_written_pages_[(i << 1) + 1] &= ~invalidate_bits; + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid &= ~invalidate_bits; + block.valid_and_gpu_written &= ~invalidate_bits; } FireWatches(page_first, page_last, false); @@ -664,10 +675,11 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { auto global_lock = global_critical_region_.Acquire(); uint32_t fire_watches_range_start = UINT32_MAX; uint32_t gpu_written_range_start = UINT32_MAX; - for (uint32_t i = 0; i * 2 < valid_and_gpu_written_pages_.size(); ++i) { - uint64_t previously_valid_block = valid_and_gpu_written_pages_[i * 2]; - uint64_t gpu_written_block = valid_and_gpu_written_pages_[i * 2 + 1]; - valid_and_gpu_written_pages_[i * 2] = gpu_written_block; + for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; + uint64_t previously_valid_block = page_flags_block.valid; + uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; + page_flags_block.valid = gpu_written_block; // Fire watches on the invalidated pages. 
uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; @@ -748,8 +760,8 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { &gpu_written_buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&trace_gpu_written_buffer_)))) { XELOGE( - "Failed to create a %u KB GPU-written memory download buffer for frame " - "tracing", + "Shared memory: Failed to create a %u KB GPU-written memory download " + "buffer for frame tracing", gpu_written_page_count << page_size_log2_ >> 10); ResetTraceGPUWrittenBuffer(); return false; @@ -761,8 +773,8 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { for (auto& gpu_written_submit_range : trace_gpu_written_ranges_) { // For cases like resolution scale, when the data may not be actually // written, just marked as valid. - if (!MakeTilesResident(gpu_written_submit_range.first, - gpu_written_submit_range.second)) { + if (!EnsureTilesResident(gpu_written_submit_range.first, + gpu_written_submit_range.second)) { gpu_written_submit_range.second = 0; continue; } diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h index 9b24b01b1..af99fa15b 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/d3d12/shared_memory.h @@ -11,7 +11,6 @@ #define XENIA_GPU_D3D12_SHARED_MEMORY_H_ #include -#include #include #include @@ -44,7 +43,7 @@ class SharedMemory { return buffer_gpu_address_; } - void BeginSubmission(); + void CompletedSubmissionUpdated(); typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first, uint32_t address_last, @@ -57,7 +56,7 @@ class SharedMemory { // example, if the game changes protection level of a memory range containing // the watched range. // - // The callback is called with the mutex locked. + // The callback is called within the global critical region. GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback, void* callback_context); void UnregisterGlobalWatch(GlobalWatchHandle handle); @@ -84,15 +83,10 @@ class SharedMemory { void* callback_data, uint64_t callback_argument); // Unregisters previously registered watched memory range. void UnwatchMemoryRange(WatchHandle handle); - // Locks the mutex that gets locked when watch callbacks are invoked - must be - // done when checking variables that may be changed by a watch callback. - inline std::unique_lock LockWatchMutex() { - return global_critical_region_.Acquire(); - } // Ensures the buffer tiles backing the range are resident, but doesn't upload // anything. - bool MakeTilesResident(uint32_t start, uint32_t length); + bool EnsureTilesResident(uint32_t start, uint32_t length); // Checks if the range has been updated, uploads new data if needed and // ensures the buffer tiles backing the range are resident. May transition the @@ -105,7 +99,7 @@ class SharedMemory { // (to up to the first GPU-written page, as an access violation exception // count optimization) as modified by the CPU, also invalidating GPU-written // pages directly in the range. - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); // Marks the range as containing GPU-generated data (such as resolves), @@ -141,8 +135,7 @@ class SharedMemory { bool AreTiledResourcesUsed() const; // Mark the memory range as updated and protect it. 
- void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count, - bool written_by_gpu); + void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu); D3D12CommandProcessor* command_processor_; Memory* memory_; @@ -154,6 +147,7 @@ class SharedMemory { ID3D12Resource* buffer_ = nullptr; D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0; D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + void TransitionBuffer(D3D12_RESOURCE_STATES new_state); // Heaps are 4 MB, so not too many of them are allocated, but also not to // waste too much memory for padding (with 16 MB there's too much). @@ -166,9 +160,11 @@ class SharedMemory { // Number of the heaps currently resident, for profiling. uint32_t heap_count_ = 0; - // Log2 of system page size. + // Log2 of invalidation granularity (the system page size, but the dependency + // on it is not hard - the access callback takes a range as an argument, and + // touched pages of the buffer of this size will be invalidated). uint32_t page_size_log2_; - // Total physical page count. + // Total buffer page count. uint32_t page_count_; // Non-shader-visible buffer descriptor heap for faster binding (via copying @@ -182,24 +178,46 @@ class SharedMemory { ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr; D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_; - // Handle of the physical memory write callback. - void* physical_write_watch_handle_ = nullptr; + // First page and length in pages. + typedef std::pair UploadRange; + // Ranges that need to be uploaded, generated by GetRangesToUpload (a + // persistently allocated vector). + std::vector upload_ranges_; + void GetRangesToUpload(uint32_t request_page_first, + uint32_t request_page_last); + std::unique_ptr upload_buffer_pool_ = nullptr; - // Mutex between the exception handler and the command processor, to be locked - // when checking or updating validity of pages/ranges. + // GPU-written memory downloading for traces. + // Start page, length in pages. + std::vector> trace_gpu_written_ranges_; + // Created temporarily, only for downloading. + ID3D12Resource* trace_gpu_written_buffer_ = nullptr; + void ResetTraceGPUWrittenBuffer(); + + void* memory_invalidation_callback_handle_ = nullptr; + void* memory_data_provider_handle_ = nullptr; + + // Mutex between the guest memory subsystem and the command processor, to be + // locked when checking or updating validity of pages/ranges and when firing + // watches. xe::global_critical_region global_critical_region_; // *************************************************************************** - // Things below should be protected by global_critical_region. + // Things below should be fully protected by global_critical_region. // *************************************************************************** - // Bit vector containing: - // - Even block indices - whether physical memory system pages are up to date. - // - Odd block indices - whether phyical memory system pages contain data - // written by the GPU not synchronized with the CPU (subset of valid pages). - std::vector valid_and_gpu_written_pages_; + struct SystemPageFlagsBlock { + // Whether each page is up to date in the GPU buffer. + uint64_t valid; + // Subset of valid pages - whether each page in the GPU buffer contains data + // that was written on the GPU, thus should not be invalidated spuriously. 
+ uint64_t valid_and_gpu_written; + }; + // Flags for each 64 system pages, interleaved as blocks, so bit scan can be + // used to quickly extract ranges. + std::vector system_page_flags_; - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -259,30 +277,9 @@ class SharedMemory { // watches. void FireWatches(uint32_t page_first, uint32_t page_last, bool invalidated_by_gpu); - // Unlinks and frees the range and its nodes. Call this with the mutex locked. + // Unlinks and frees the range and its nodes. Call this in the global critical + // region. void UnlinkWatchRange(WatchRange* range); - - // *************************************************************************** - // Things above should be protected by global_critical_region. - // *************************************************************************** - - // First page and length in pages. - typedef std::pair UploadRange; - // Ranges that need to be uploaded, generated by GetRangesToUpload (a - // persistently allocated vector). - std::vector upload_ranges_; - void GetRangesToUpload(uint32_t request_page_first, - uint32_t request_page_last); - std::unique_ptr upload_buffer_pool_ = nullptr; - - void TransitionBuffer(D3D12_RESOURCE_STATES new_state); - - // GPU-written memory downloading for traces. - // Start page, length in pages. - std::vector> trace_gpu_written_ranges_; - // Created temporarily, only for downloading. - ID3D12Resource* trace_gpu_written_buffer_ = nullptr; - void ResetTraceGPUWrittenBuffer(); }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index d4496195a..1d1570dee 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -1702,7 +1702,7 @@ void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; uint32_t block_first = page_first >> 5; uint32_t block_last = page_last >> 5; - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = block_first; i <= block_last; ++i) { uint32_t add_bits = UINT32_MAX; if (i == block_first) { @@ -1812,8 +1812,8 @@ bool TextureCache::TileResolvedTexture( return false; } } else { - if (!shared_memory_->MakeTilesResident(texture_modified_start, - texture_modified_length)) { + if (!shared_memory_->EnsureTilesResident(texture_modified_start, + texture_modified_length)) { return false; } } @@ -2404,7 +2404,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { // See what we need to upload. bool base_in_sync, mips_in_sync; { - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); base_in_sync = texture->base_in_sync; mips_in_sync = texture->mips_in_sync; } @@ -2672,7 +2672,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { // regular texture or a vertex buffer, and thus the scaled resolve version is // not up to date anymore. 
{ - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); texture->base_in_sync = true; texture->mips_in_sync = true; if (!base_in_sync) { @@ -2761,7 +2761,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled, uint32_t block_last = page_last >> 5; uint32_t l2_block_first = block_first >> 6; uint32_t l2_block_last = block_last >> 6; - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = l2_block_first; i <= l2_block_last; ++i) { uint64_t l2_block = scaled_resolve_pages_l2_[i]; if (i == l2_block_first) { diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 2541d2af3..d93ee962f 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -11,9 +11,9 @@ #define XENIA_GPU_D3D12_TEXTURE_CACHE_H_ #include -#include #include +#include "xenia/base/mutex.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/shared_memory.h" #include "xenia/gpu/register_file.h" @@ -369,15 +369,14 @@ class TextureCache { static constexpr uint32_t kCachedSRVDescriptorSwizzleMissing = UINT32_MAX; uint32_t cached_srv_descriptor_swizzle; - // Watch handles for the memory ranges (protected by the shared memory watch - // mutex). + // These are to be accessed within the global critical region to synchronize + // with shared memory. + // Watch handles for the memory ranges. SharedMemory::WatchHandle base_watch_handle; SharedMemory::WatchHandle mip_watch_handle; - // Whether the recent base level data has been loaded from the memory - // (protected by the shared memory watch mutex). + // Whether the recent base level data has been loaded from the memory. bool base_in_sync; - // Whether the recent mip data has been loaded from the memory (protected by - // the shared memory watch mutex). + // Whether the recent mip data has been loaded from the memory. bool mips_in_sync; }; @@ -620,16 +619,16 @@ class TextureCache { kScaledResolveHeapSizeLog2] = {}; // Number of currently resident portions of the tiled buffer, for profiling. uint32_t scaled_resolve_heap_count_ = 0; + // Global watch for scaled resolve data invalidation. + SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; + + xe::global_critical_region global_critical_region_; // Bit vector storing whether each 4 KB physical memory page contains scaled // resolve data. uint32_t rather than uint64_t because parts of it are sent to // shaders. - // PROTECTED BY THE SHARED MEMORY WATCH MUTEX! uint32_t* scaled_resolve_pages_ = nullptr; // Second level of the bit vector for faster rejection of non-scaled textures. - // PROTECTED BY THE SHARED MEMORY WATCH MUTEX! uint64_t scaled_resolve_pages_l2_[(512 << 20) >> (12 + 5 + 6)]; - // Global watch for scaled resolve data invalidation. 
- SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; }; } // namespace d3d12 diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index e0fb9a662..92d8f9fc7 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -150,16 +150,18 @@ VkResult TextureCache::Initialize() { device_queue_ = device_->AcquireQueue(device_->queue_family_index()); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); return VK_SUCCESS; } void TextureCache::Shutdown() { - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } if (device_queue_) { @@ -411,7 +413,7 @@ void TextureCache::WatchTexture(Texture* texture) { texture->is_watched = true; } - memory_->WatchPhysicalMemoryWrite(address, size); + memory_->EnablePhysicalMemoryAccessCallbacks(address, size, true, false); } void TextureCache::TextureTouched(Texture* texture) { @@ -428,7 +430,7 @@ void TextureCache::TextureTouched(Texture* texture) { texture->pending_invalidation = true; } -std::pair TextureCache::MemoryWriteCallback( +std::pair TextureCache::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { global_critical_region_.Acquire(); if (watched_textures_.empty()) { @@ -468,11 +470,11 @@ std::pair TextureCache::MemoryWriteCallback( return std::make_pair(previous_end, next_start - previous_end); } -std::pair TextureCache::MemoryWriteCallbackThunk( +std::pair TextureCache::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } TextureCache::Texture* TextureCache::DemandResolveTexture( diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 015868209..370d0b925 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -147,9 +147,9 @@ class TextureCache { void WatchTexture(Texture* texture); void TextureTouched(Texture* texture); - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -220,7 +220,7 @@ class TextureCache { std::unordered_map samplers_; std::list pending_delete_textures_; - void* physical_write_watch_handle_ = nullptr; + void* memory_invalidation_callback_handle_ = nullptr; xe::global_critical_region global_critical_region_; std::list watched_textures_; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc index 4352c7be9..5aba4004f 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc @@ -9,6 +9,7 @@ #include 
"xenia/base/logging.h" #include "xenia/base/memory.h" +#include "xenia/base/mutex.h" #include "xenia/cpu/processor.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" @@ -172,8 +173,9 @@ dword_result_t NtReadFile(dword_t file_handle, dword_t event_handle, // some games NtReadFile() directly into texture memory auto heap = kernel_memory()->LookupHeap(buffer.guest_address()); if (heap && heap->IsGuestPhysicalHeap()) { - kernel_memory()->TriggerWatches(buffer.guest_address(), buffer_length, - true, true); + kernel_memory()->TriggerPhysicalMemoryCallbacks( + xe::global_critical_region::AcquireDirect(), buffer.guest_address(), + buffer_length, true, true); } // Synchronous. diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index bd6d58f44..f7dec5366 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -11,7 +11,9 @@ #include #include +#include +#include "xenia/base/assert.h" #include "xenia/base/byte_stream.h" #include "xenia/base/clock.h" #include "xenia/base/cvar.h" @@ -96,8 +98,8 @@ Memory::~Memory() { // requests. mmio_handler_.reset(); - for (auto physical_write_watch : physical_write_watches_) { - delete physical_write_watch; + for (auto invalidation_callback : physical_memory_invalidation_callbacks_) { + delete invalidation_callback; } heaps_.v00000000.Dispose(); @@ -433,13 +435,12 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) { return mmio_handler_->LookupRange(virtual_address); } -bool Memory::AccessViolationCallback(void* host_address, bool is_write) { - if (!is_write) { - // TODO(Triang3l): Handle GPU readback. - return false; - } - // Access via physical_membase_ is special, when need to bypass everything, - // so only watching virtual memory regions. +bool Memory::AccessViolationCallback( + std::unique_lock global_lock_locked_once, + void* host_address, bool is_write) { + // Access via physical_membase_ is special, when need to bypass everything + // (for instance, for a data provider to actually write the data) so only + // triggering callbacks on virtual memory regions. if (reinterpret_cast(host_address) < reinterpret_cast(virtual_membase_) || reinterpret_cast(host_address) >= @@ -448,65 +449,79 @@ bool Memory::AccessViolationCallback(void* host_address, bool is_write) { } uint32_t virtual_address = HostToGuestVirtual(host_address); BaseHeap* heap = LookupHeap(virtual_address); - if (heap->IsGuestPhysicalHeap()) { - // Will be rounded to physical page boundaries internally, so just pass 1 as - // the length - guranteed not to cross page boundaries also. - return static_cast(heap)->TriggerWatches(virtual_address, 1, - is_write, false); + if (!heap->IsGuestPhysicalHeap()) { + return false; } - return false; + // Access violation callbacks from the guest are triggered when the global + // critical region mutex is locked once. + // + // Will be rounded to physical page boundaries internally, so just pass 1 as + // the length - guranteed not to cross page boundaries also. 
+ auto physical_heap = static_cast(heap); + return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once), + virtual_address, 1, is_write, false); } -bool Memory::AccessViolationCallbackThunk(void* context, void* host_address, - bool is_write) { +bool Memory::AccessViolationCallbackThunk( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write) { return reinterpret_cast(context)->AccessViolationCallback( - host_address, is_write); + std::move(global_lock_locked_once), host_address, is_write); } -bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length, - bool is_write, bool unwatch_exact_range, - bool unprotect) { +bool Memory::TriggerPhysicalMemoryCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect) { BaseHeap* heap = LookupHeap(virtual_address); if (heap->IsGuestPhysicalHeap()) { - return static_cast(heap)->TriggerWatches( - virtual_address, length, is_write, unwatch_exact_range, unprotect); + auto physical_heap = static_cast(heap); + return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once), + virtual_address, length, is_write, + unwatch_exact_range, unprotect); } return false; } -void* Memory::RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback, - void* callback_context) { - PhysicalWriteWatchEntry* entry = new PhysicalWriteWatchEntry; - entry->callback = callback; - entry->callback_context = callback_context; - +void* Memory::RegisterPhysicalMemoryInvalidationCallback( + PhysicalMemoryInvalidationCallback callback, void* callback_context) { + auto entry = new std::pair( + callback, callback_context); auto lock = global_critical_region_.Acquire(); - physical_write_watches_.push_back(entry); - + physical_memory_invalidation_callbacks_.push_back(entry); return entry; } -void Memory::UnregisterPhysicalWriteWatch(void* watch_handle) { - auto entry = reinterpret_cast(watch_handle); +void Memory::UnregisterPhysicalMemoryInvalidationCallback( + void* callback_handle) { + auto entry = + reinterpret_cast*>( + callback_handle); { auto lock = global_critical_region_.Acquire(); - auto it = std::find(physical_write_watches_.begin(), - physical_write_watches_.end(), entry); - assert_false(it == physical_write_watches_.end()); - if (it != physical_write_watches_.end()) { - physical_write_watches_.erase(it); + auto it = std::find(physical_memory_invalidation_callbacks_.begin(), + physical_memory_invalidation_callbacks_.end(), entry); + assert_true(it != physical_memory_invalidation_callbacks_.end()); + if (it != physical_memory_invalidation_callbacks_.end()) { + physical_memory_invalidation_callbacks_.erase(it); } } delete entry; } -void Memory::WatchPhysicalMemoryWrite(uint32_t physical_address, - uint32_t length) { - // Watch independently in all three mappings. 
- heaps_.vA0000000.WatchPhysicalWrite(physical_address, length); - heaps_.vC0000000.WatchPhysicalWrite(physical_address, length); - heaps_.vE0000000.WatchPhysicalWrite(physical_address, length); +void Memory::EnablePhysicalMemoryAccessCallbacks( + uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, bool enable_data_providers) { + heaps_.vA0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); + heaps_.vC0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); + heaps_.vE0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); } uint32_t Memory::SystemHeapAlloc(uint32_t size, uint32_t alignment, @@ -798,7 +813,8 @@ bool BaseHeap::Restore(ByteStream* stream) { void BaseHeap::Reset() { // TODO(DrChat): protect pages. std::memset(page_table_.data(), 0, sizeof(PageEntry) * page_table_.size()); - // TODO(Triang3l): Unwatch pages. + // TODO(Triang3l): Remove access callbacks from pages if this is a physical + // memory heap. } bool BaseHeap::Alloc(uint32_t size, uint32_t alignment, @@ -1313,9 +1329,7 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase, system_page_count_ = (heap_size_ /* already - 1 */ + host_address_offset + system_page_size_) / system_page_size_; - system_pages_watched_write_.resize((system_page_count_ + 63) / 64); - std::memset(system_pages_watched_write_.data(), 0, - system_pages_watched_write_.size() * sizeof(uint64_t)); + system_page_flags_.resize((system_page_count_ + 63) / 64); } bool PhysicalHeap::Alloc(uint32_t size, uint32_t alignment, @@ -1357,7 +1371,7 @@ bool PhysicalHeap::Alloc(uint32_t size, uint32_t alignment, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } *out_address = address; @@ -1398,7 +1412,7 @@ bool PhysicalHeap::AllocFixed(uint32_t base_address, uint32_t size, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } return true; @@ -1443,7 +1457,7 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } *out_address = address; @@ -1477,7 +1491,7 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, // Only invalidate if making writable again, for simplicity - not when simply // marking some range as immutable, for instance. if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect, @@ -1489,8 +1503,15 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, return BaseHeap::Protect(address, size, protect); } -void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, - uint32_t length) { +void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, + uint32_t length, + bool enable_invalidation_notifications, + bool enable_data_providers) { + // TODO(Triang3l): Implement data providers. 
+ assert_false(enable_data_providers); + if (!enable_invalidation_notifications && !enable_data_providers) { + return; + } uint32_t physical_address_offset = GetPhysicalAddress(heap_base_); if (physical_address < physical_address_offset) { if (physical_address_offset - physical_address >= length) { @@ -1516,28 +1537,61 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); - auto global_lock = global_critical_region_.Acquire(); - - // Protect the pages and mark them as watched. Don't mark non-writable pages - // as watched, so true access violations can still occur there. + // Update callback flags for system pages and make their protection stricter + // if needed. + xe::memory::PageAccess protect_access = + enable_data_providers ? xe::memory::PageAccess::kNoAccess + : xe::memory::PageAccess::kReadOnly; uint8_t* protect_base = membase_ + heap_base_; uint32_t protect_system_page_first = UINT32_MAX; + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = system_page_first; i <= system_page_last; ++i) { - uint64_t page_bit = uint64_t(1) << (i & 63); - // Check if need to allow writing to this page. - bool add_page_to_watch = - (system_pages_watched_write_[i >> 6] & page_bit) == 0; - if (add_page_to_watch) { - uint32_t page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / - page_size_; - if (ToPageAccess(page_table_[page_number].current_protect) != - xe::memory::PageAccess::kReadWrite) { - add_page_to_watch = false; + // Check if need to enable callbacks for the page and raise its protection. + // + // If enabling invalidation notifications: + // - Page writable and not watched for changes yet - protect and enable + // invalidation notifications. + // - Page seen as writable by the guest, but only needs data providers - + // just set the bits to enable invalidation notifications (already has + // even stricter protection than needed). + // - Page not writable as requested by the game - don't do anything (need + // real access violations here). + // If enabling data providers: + // - Page accessible (either read/write or read-only) and didn't need data + // providers initially - protect and enable data providers. + // - Otherwise - do nothing. + // + // It's safe not to await data provider completion here before protecting as + // this never makes protection lighter, so it can't interfere with page + // faults that await data providers. + // + // Enabling data providers doesn't need to be deferred - providers will be + // polled for the last time without releasing the lock. + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6]; + uint64_t page_flags_bit = uint64_t(1) << (i & 63); + uint32_t guest_page_number = + xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; + xe::memory::PageAccess current_page_access = + ToPageAccess(page_table_[guest_page_number].current_protect); + bool protect_system_page = false; + // Don't do anything with inaccessible pages - don't protect, don't enable + // callbacks - because real access violations are needed there. And don't + // enable invalidation notifications for read-only pages for the same + // reason. + if (current_page_access != xe::memory::PageAccess::kNoAccess) { + // TODO(Triang3l): Enable data providers. 
+ if (enable_invalidation_notifications) { + if (current_page_access != xe::memory::PageAccess::kReadOnly && + (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) { + // TODO(Triang3l): Check if data providers are already enabled. + // If data providers are already enabled for the page, it has even + // stricter protection. + protect_system_page = true; + page_flags_block.notify_on_invalidation |= page_flags_bit; + } } } - if (add_page_to_watch) { - system_pages_watched_write_[i >> 6] |= page_bit; + if (protect_system_page) { if (protect_system_page_first == UINT32_MAX) { protect_system_page_first = i; } @@ -1546,7 +1600,7 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, xe::memory::Protect( protect_base + protect_system_page_first * system_page_size_, (i - protect_system_page_first) * system_page_size_, - xe::memory::PageAccess::kReadOnly); + protect_access); protect_system_page_first = UINT32_MAX; } } @@ -1555,13 +1609,14 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, xe::memory::Protect( protect_base + protect_system_page_first * system_page_size_, (system_page_last + 1 - protect_system_page_first) * system_page_size_, - xe::memory::PageAccess::kReadOnly); + protect_access); } } -bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, - bool is_write, bool unwatch_exact_range, - bool unprotect) { +bool PhysicalHeap::TriggerCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect) { // TODO(Triang3l): Support read watches. assert_true(is_write); if (!is_write) { @@ -1594,12 +1649,10 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, uint32_t block_index_first = system_page_first >> 6; uint32_t block_index_last = system_page_last >> 6; - auto global_lock = global_critical_region_.Acquire(); - // Check if watching any page, whether need to call the callback at all. bool any_watched = false; for (uint32_t i = block_index_first; i <= block_index_last; ++i) { - uint64_t block = system_pages_watched_write_[i]; + uint64_t block = system_page_flags_[i].notify_on_invalidation; if (i == block_index_first) { block &= ~((uint64_t(1) << (system_page_first & 63)) - 1); } @@ -1633,11 +1686,12 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, heap_size_ + 1 - (physical_address_start - physical_address_offset)); uint32_t unwatch_first = 0; uint32_t unwatch_last = UINT32_MAX; - for (auto physical_write_watch : memory_->physical_write_watches_) { + for (auto invalidation_callback : + memory_->physical_memory_invalidation_callbacks_) { std::pair callback_unwatch_range = - physical_write_watch->callback(physical_write_watch->callback_context, - physical_address_start, physical_length, - unwatch_exact_range); + invalidation_callback->first(invalidation_callback->second, + physical_address_start, physical_length, + unwatch_exact_range); if (!unwatch_exact_range) { unwatch_first = std::max(unwatch_first, callback_unwatch_range.first); unwatch_last = std::min( @@ -1682,13 +1736,13 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, uint32_t unprotect_system_page_first = UINT32_MAX; for (uint32_t i = system_page_first; i <= system_page_last; ++i) { // Check if need to allow writing to this page. 
- bool unprotect_page = (system_pages_watched_write_[i >> 6] & + bool unprotect_page = (system_page_flags_[i >> 6].notify_on_invalidation & (uint64_t(1) << (i & 63))) != 0; if (unprotect_page) { - uint32_t page_number = + uint32_t guest_page_number = xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; - if (ToPageAccess(page_table_[page_number].current_protect) != + if (ToPageAccess(page_table_[guest_page_number].current_protect) != xe::memory::PageAccess::kReadWrite) { unprotect_page = false; } @@ -1725,7 +1779,7 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, if (i == block_index_last && (system_page_last & 63) != 63) { mask |= ~((uint64_t(1) << ((system_page_last & 63) + 1)) - 1); } - system_pages_watched_write_[i] &= mask; + system_page_flags_[i].notify_on_invalidation &= mask; } return true; diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 8250b1787..9d01af167 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -238,10 +239,14 @@ class PhysicalHeap : public BaseHeap { bool Protect(uint32_t address, uint32_t size, uint32_t protect, uint32_t* old_protect = nullptr) override; - void WatchPhysicalWrite(uint32_t physical_address, uint32_t length); + void EnableAccessCallbacks(uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, + bool enable_data_providers); // Returns true if any page in the range was watched. - bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + bool TriggerCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect = true); bool IsGuestPhysicalHeap() const override { return true; } uint32_t GetPhysicalAddress(uint32_t address) const; @@ -251,8 +256,15 @@ class PhysicalHeap : public BaseHeap { uint32_t system_page_size_; uint32_t system_page_count_; - // Protected by global_critical_region. - std::vector system_pages_watched_write_; + + struct SystemPageFlagsBlock { + // Whether writing to each page should trigger invalidation + // callbacks. + uint64_t notify_on_invalidation; + }; + // Protected by global_critical_region. Flags for each 64 system pages, + // interleaved as blocks, so bit scan can be used to quickly extract ranges. + std::vector system_page_flags_; }; // Models the entire guest memory system on the console. @@ -347,64 +359,80 @@ class Memory { // Gets the defined MMIO range for the given virtual address, if any. cpu::MMIORange* LookupVirtualMappedRange(uint32_t virtual_address); + // Physical memory access callbacks, two types of them. + // + // This is simple per-system-page protection without reference counting or + // stored ranges. Whenever a watched page is accessed, all callbacks for it + // are triggered. Also the only way to remove callbacks is to trigger them + // somehow. Since there are no references from pages to individual callbacks, + // there's no way to disable only a specific callback for a page. Also + // callbacks may be triggered spuriously, and handlers should properly ignore + // pages they don't care about. + // + // Once callbacks are triggered for a page, the page is not watched anymore + // until requested again later.
It is, however, unwatched only in one guest + // view of physical memory (because different views may have different + // protection for the same memory) - but it's rare for the same memory to be + // used with different guest page sizes, and it's okay to fire a callback more + // than once. + // + // Only accessing the guest virtual memory views of physical memory triggers + // callbacks - data providers, for instance, must write to the host physical + // heap directly, otherwise their threads may infinitely await themselves. + // + // - Invalidation notifications: + // + // Protecting from writing. One-shot callbacks for invalidation of various + // kinds of physical memory caches (such as the GPU copy of the memory). + // + // May be triggered for a single page (in case of a write access violation or + // when need to synchronize data given by data providers) or for multiple + // pages (like when memory is allocated). + // + // Since the granularity of callbacks is a single page, an invalidation + // notification handler must invalidate all the data stored in the touched + // pages. + // + // Because large ranges (like whole framebuffers) may be written to and + // exceptions are expensive, it's better to unprotect multiple pages as a + // result of a write access violation, so the shortest common range returned + // by all the invalidation callbacks (clamped to a sane range and also not to + // touch pages with provider callbacks) is unprotected. + // + // - Data providers: + // + // TODO(Triang3l): Implement data providers - more complicated because they + // will need to be able to release the global lock. + // Returns start and length of the smallest physical memory region surrounding // the watched region that can be safely unwatched, if it doesn't matter, // return (0, UINT32_MAX). - typedef std::pair (*PhysicalWriteWatchCallback)( + typedef std::pair (*PhysicalMemoryInvalidationCallback)( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); + // Returns a handle for unregistering or for skipping one notification handler + // while triggering data providers. + void* RegisterPhysicalMemoryInvalidationCallback( + PhysicalMemoryInvalidationCallback callback, void* callback_context); + // Unregisters a physical memory invalidation callback previously added with + // RegisterPhysicalMemoryInvalidationCallback. + void UnregisterPhysicalMemoryInvalidationCallback(void* callback_handle); - // Physical memory write watching, allowing subsystems to invalidate cached - // data that depends on memory contents. - // - // Placing a watch simply marks the pages (of the system page size) as - // watched, individual watched ranges (or which specific subscribers are - // watching specific pages) are not stored. Because of this, callbacks may be - // triggered multiple times for a single range, and for any watched page every - // registered callbacks is triggered. This is a very simple one-shot method - // for use primarily for cache invalidation - there may be spurious firing, - // for example, if the game only makes the pages writable without actually - // writing anything (done for simplicity). - // - // A range of pages can be watched at any time, but pages are only unwatched - // when watches are triggered (since multiple subscribers can depend on the - // same memory, and one subscriber shouldn't interfere with another).
- // - // Callbacks can be triggered for one page (if the guest just stores words) or - // for multiple pages (for file reading, making pages writable). - // - // Only guest physical memory mappings are watched - the host-only mapping is - // not protected so it can be used to bypass the write protection (for file - // reads, for example - in this case, watches are triggered manually). - // - // Note that when a watch is triggered, the watched page is unprotected only - // in the heap where the address is located. Since different virtual memory - // mappings of physical memory can have different protection levels for the - // same pages, and watches must not be placed on read-only or totally - // inaccessible pages, there are significant difficulties with synchronizing - // all the three ranges, but it's generally not needed. - void* RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback, - void* callback_context); - - // Unregisters a physical memory write watch previously added with - // RegisterPhysicalWriteWatch. - void UnregisterPhysicalWriteWatch(void* watch_handle); - - // Enables watching of the specified memory range, snapped to system page - // boundaries. When something is written to a watched range (or when the - // protection of it changes in a a way that it becomes writable), the - // registered watch callbacks are triggered for the page (or pages, for file - // reads and protection changes) where something has been written to. This - // protects physical memory only under virtual_membase_, so writing to - // physical_membase_ can be done to bypass the protection placed by the - // watches. - void WatchPhysicalMemoryWrite(uint32_t physical_address, uint32_t length); + // Enables physical memory access callbacks for the specified memory range, + // snapped to system page boundaries. + void EnablePhysicalMemoryAccessCallbacks( + uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, bool enable_data_providers); // Forces triggering of watch callbacks for a virtual address range if pages // are watched there and unwatching them. Returns whether any page was - // watched. - bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + // watched. Must be called with global critical region locking depth of 1. + // TODO(Triang3l): Implement data providers - this is why locking depth of 1 + // will be required in the future. + bool TriggerPhysicalMemoryCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect = true); // Allocates virtual memory from the 'system' heap. 
// System memory is kept separate from game memory but is still accessible @@ -443,9 +471,12 @@ class Memory { static uint32_t HostToGuestVirtualThunk(const void* context, const void* host_address); - bool AccessViolationCallback(void* host_address, bool is_write); - static bool AccessViolationCallbackThunk(void* context, void* host_address, - bool is_write); + bool AccessViolationCallback( + std::unique_lock global_lock_locked_once, + void* host_address, bool is_write); + static bool AccessViolationCallbackThunk( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write); std::wstring file_name_; uint32_t system_page_size_ = 0; @@ -487,12 +518,9 @@ class Memory { friend class BaseHeap; friend class PhysicalHeap; - struct PhysicalWriteWatchEntry { - PhysicalWriteWatchCallback callback; - void* callback_context; - }; xe::global_critical_region global_critical_region_; - std::vector physical_write_watches_; + std::vector*> + physical_memory_invalidation_callbacks_; }; } // namespace xe
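A minimal sketch of how a subsystem might hook into the invalidation-notification API declared in memory.h above. Only the callback contract, RegisterPhysicalMemoryInvalidationCallback, and EnablePhysicalMemoryAccessCallbacks come from this patch; CacheContext, InvalidatePages, and the surrounding glue are hypothetical placeholders.

// Sketch only - a hypothetical subscriber built on the callback contract
// described in memory.h; not code from this patch.
#include <cstdint>
#include <utility>

struct CacheContext {
  // Hypothetical subscriber state, e.g. a validity bitmap kept by a GPU cache.
};

void InvalidatePages(CacheContext* cache, uint32_t physical_address_start,
                     uint32_t length) {
  // Hypothetical: drop cached data overlapping the touched pages. Handlers
  // must tolerate spurious calls and pages they don't track.
}

// Matches the PhysicalMemoryInvalidationCallback shape: returns the start and
// length of the smallest surrounding region that is safe to unwatch, or
// (0, UINT32_MAX) if the handler doesn't care how much gets unprotected.
std::pair<uint32_t, uint32_t> OnPhysicalMemoryInvalidated(
    void* context_ptr, uint32_t physical_address_start, uint32_t length,
    bool exact_range) {
  auto* cache = static_cast<CacheContext*>(context_ptr);
  InvalidatePages(cache, physical_address_start, length);
  return std::make_pair(uint32_t(0), UINT32_MAX);
}

Registration and arming would then look roughly like calling RegisterPhysicalMemoryInvalidationCallback(OnPhysicalMemoryInvalidated, &cache) followed by EnablePhysicalMemoryAccessCallbacks(physical_address, length, true, false); data providers are still asserted off in PhysicalHeap::EnableAccessCallbacks, so only the invalidation-notification flag is meaningful for now.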
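The new system_page_flags_ layout stores one 64-bit mask per 64 system pages precisely so that ranges can be extracted with bit scans, as the comment on the member notes. A self-contained sketch of that extraction, using std::countr_zero and std::countr_one from C++20's <bit> in place of whatever scan helpers the codebase actually uses:

// Sketch only: walk the runs of set bits in one notify_on_invalidation block.
// first_page is the system page index corresponding to bit 0 of this block.
#include <bit>
#include <cstdint>
#include <cstdio>

void ForEachNotifiedRange(uint64_t notify_on_invalidation,
                          uint32_t first_page) {
  uint64_t bits = notify_on_invalidation;
  while (bits) {
    // Lowest page still flagged, then the length of the contiguous run.
    uint32_t range_start = uint32_t(std::countr_zero(bits));
    uint32_t run_length = uint32_t(std::countr_one(bits >> range_start));
    uint32_t range_end = range_start + run_length;  // one past the last page
    std::printf("flagged pages [%u, %u)\n", first_page + range_start,
                first_page + range_end);
    if (range_end >= 64) {
      break;  // run reaches the top bit; avoid an undefined 64-bit shift
    }
    bits &= ~((uint64_t(1) << range_end) - 1);  // clear the processed run
  }
}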
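The header promises that RegisterPhysicalMemoryInvalidationCallback returns an opaque handle and that Memory keeps a vector of pointers to (callback, context) pairs, but the corresponding memory.cc changes are not in this excerpt. One plausible shape for them, assuming the handle is simply the address of the heap-allocated pair, is:

// Sketch only - not the actual memory.cc implementation. Assumes the handle
// is the address of a heap-allocated (callback, context) pair stored in the
// vector; real code would also hold global_critical_region_ while mutating it.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using InvalidationCallback =
    std::pair<uint32_t, uint32_t> (*)(void* context_ptr,
                                      uint32_t physical_address_start,
                                      uint32_t length, bool exact_range);
using CallbackEntry = std::pair<InvalidationCallback, void*>;

std::vector<CallbackEntry*> invalidation_callbacks_;  // stand-in member

void* RegisterInvalidationCallback(InvalidationCallback callback,
                                   void* context) {
  auto* entry = new CallbackEntry(callback, context);
  invalidation_callbacks_.push_back(entry);
  return entry;  // the stable address doubles as the unregistration handle
}

void UnregisterInvalidationCallback(void* handle) {
  auto* entry = static_cast<CallbackEntry*>(handle);
  auto it = std::find(invalidation_callbacks_.begin(),
                      invalidation_callbacks_.end(), entry);
  if (it != invalidation_callbacks_.end()) {
    invalidation_callbacks_.erase(it);
  }
  delete entry;
}

Keeping the entries individually allocated, rather than storing pairs by value in the vector, is what lets a raw pointer serve as a handle that stays valid across later registrations and unregistrations.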