diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 5edf7705e..f05838d29 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -9,6 +9,16 @@
 
 #include "xenia/gpu/vulkan/vulkan_command_processor.h"
 
+#include <algorithm>
+#include <cstring>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/logging.h"
+#include "xenia/base/profiling.h"
+#include "xenia/ui/vulkan/vulkan_context.h"
+#include "xenia/ui/vulkan/vulkan_provider.h"
+#include "xenia/ui/vulkan/vulkan_util.h"
+
 namespace xe {
 namespace gpu {
 namespace vulkan {
@@ -24,16 +34,79 @@ void VulkanCommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr,
 
 void VulkanCommandProcessor::RestoreEdramSnapshot(const void* snapshot) {}
 
 bool VulkanCommandProcessor::SetupContext() {
-  return CommandProcessor::SetupContext();
+  if (!CommandProcessor::SetupContext()) {
+    XELOGE("Failed to initialize base command processor context");
+    return false;
+  }
+
+  const ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
+  VkDevice device = provider.device();
+
+  return true;
 }
 
 void VulkanCommandProcessor::ShutdownContext() {
-  return CommandProcessor::ShutdownContext();
+  AwaitAllQueueOperationsCompletion();
+
+  const ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
+  VkDevice device = provider.device();
+
+  for (const auto& command_buffer_pair : command_buffers_submitted_) {
+    dfn.vkDestroyCommandPool(device, command_buffer_pair.first.pool, nullptr);
+  }
+  command_buffers_submitted_.clear();
+  for (const CommandBuffer& command_buffer : command_buffers_writable_) {
+    dfn.vkDestroyCommandPool(device, command_buffer.pool, nullptr);
+  }
+  command_buffers_writable_.clear();
+
+  std::memset(closed_frame_submissions_, 0, sizeof(closed_frame_submissions_));
+  frame_completed_ = 0;
+  frame_current_ = 1;
+  frame_open_ = false;
+
+  for (const auto& semaphore :
+       submissions_in_flight_sparse_binding_semaphores_) {
+    dfn.vkDestroySemaphore(device, semaphore.first, nullptr);
+  }
+  submissions_in_flight_sparse_binding_semaphores_.clear();
+  for (VkFence& fence : submissions_in_flight_fences_) {
+    dfn.vkDestroyFence(device, fence, nullptr);
+  }
+  submissions_in_flight_fences_.clear();
+  submission_completed_ = 0;
+  submission_open_ = false;
+
+  for (VkSemaphore semaphore : semaphores_free_) {
+    dfn.vkDestroySemaphore(device, semaphore, nullptr);
+  }
+  semaphores_free_.clear();
+  for (VkFence fence : fences_free_) {
+    dfn.vkDestroyFence(device, fence, nullptr);
+  }
+  fences_free_.clear();
+
+  CommandProcessor::ShutdownContext();
 }
 
 void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr,
                                          uint32_t frontbuffer_width,
-                                         uint32_t frontbuffer_height) {}
+                                         uint32_t frontbuffer_height) {
+  // FIXME(Triang3l): frontbuffer_ptr is currently unreliable - in the trace
+  // player it's set to 0 - but it's not needed anyway since the fetch
+  // constant contains the address.
+
+  SCOPE_profile_cpu_f("gpu");
+
+  // In case the swap command is the only one in the frame.
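+  // (Note: BeginSubmission(true) will open a new frame here if none is open,
+  // and EndSubmission(true) closes the frame even though no draw opened it.)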
+  BeginSubmission(true);
+
+  EndSubmission(true);
+}
 
 Shader* VulkanCommandProcessor::LoadShader(xenos::ShaderType shader_type,
                                            uint32_t guest_address,
@@ -46,15 +119,282 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
                                        uint32_t index_count,
                                        IndexBufferInfo* index_buffer_info,
                                        bool major_mode_explicit) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  BeginSubmission(true);
+
   return true;
 }
 
-bool VulkanCommandProcessor::IssueCopy() { return true; }
+bool VulkanCommandProcessor::IssueCopy() {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  BeginSubmission(true);
+
+  return true;
+}
 
 void VulkanCommandProcessor::InitializeTrace() {}
 
 void VulkanCommandProcessor::FinalizeTrace() {}
 
+void VulkanCommandProcessor::CheckSubmissionFence(uint64_t await_submission) {
+  if (await_submission >= GetCurrentSubmission()) {
+    if (submission_open_) {
+      EndSubmission(false);
+    }
+    // A submission won't be ended if it hasn't been started, or if ending has
+    // failed - clamp the index.
+    await_submission = GetCurrentSubmission() - 1;
+  }
+
+  const ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
+  VkDevice device = provider.device();
+
+  size_t fences_total = submissions_in_flight_fences_.size();
+  size_t fences_awaited = 0;
+  if (await_submission > submission_completed_) {
+    // Await in a blocking way if requested.
+    if (dfn.vkWaitForFences(device,
+                            uint32_t(await_submission - submission_completed_),
+                            submissions_in_flight_fences_.data(), VK_TRUE,
+                            UINT64_MAX) == VK_SUCCESS) {
+      fences_awaited += await_submission - submission_completed_;
+    } else {
+      XELOGE("Failed to await submission completion Vulkan fences");
+    }
+  }
+  // Check how far into the submissions the GPU currently is, in order,
+  // because submissions themselves can be executed out of order, but Xenia
+  // serializes that for simplicity.
+  while (fences_awaited < fences_total) {
+    if (dfn.vkWaitForFences(device, 1,
+                            &submissions_in_flight_fences_[fences_awaited],
+                            VK_TRUE, 0) != VK_SUCCESS) {
+      break;
+    }
+    ++fences_awaited;
+  }
+  if (!fences_awaited) {
+    // Not updated - no need to reclaim or download things.
+    return;
+  }
+  // Reclaim fences.
+  fences_free_.reserve(fences_free_.size() + fences_awaited);
+  auto submissions_in_flight_fences_awaited_end =
+      submissions_in_flight_fences_.cbegin();
+  std::advance(submissions_in_flight_fences_awaited_end, fences_awaited);
+  fences_free_.insert(fences_free_.cend(),
+                      submissions_in_flight_fences_.cbegin(),
+                      submissions_in_flight_fences_awaited_end);
+  submissions_in_flight_fences_.erase(submissions_in_flight_fences_.cbegin(),
+                                      submissions_in_flight_fences_awaited_end);
+  submission_completed_ += fences_awaited;
+
+  // Reclaim semaphores used for sparse binding and graphics synchronization.
+  while (!submissions_in_flight_sparse_binding_semaphores_.empty()) {
+    const auto& semaphore_submission =
+        submissions_in_flight_sparse_binding_semaphores_.front();
+    if (semaphore_submission.second > submission_completed_) {
+      break;
+    }
+    semaphores_free_.push_back(semaphore_submission.first);
+    submissions_in_flight_sparse_binding_semaphores_.pop_front();
+  }
+
+  // Reclaim command pools.
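+  // A pool whose submission has completed can be reset and reused, so it is
+  // returned to the writable list below rather than destroyed.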
+  while (!command_buffers_submitted_.empty()) {
+    const auto& command_buffer_pair = command_buffers_submitted_.front();
+    if (command_buffer_pair.second > submission_completed_) {
+      break;
+    }
+    command_buffers_writable_.push_back(command_buffer_pair.first);
+    command_buffers_submitted_.pop_front();
+  }
+}
+
+void VulkanCommandProcessor::BeginSubmission(bool is_guest_command) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  bool is_opening_frame = is_guest_command && !frame_open_;
+  if (submission_open_ && !is_opening_frame) {
+    return;
+  }
+
+  // Check the fence - needed for all kinds of submissions (to reclaim
+  // transient resources early) and specifically for frames (not to queue too
+  // many), and await the availability of the current frame.
+  CheckSubmissionFence(
+      is_opening_frame
+          ? closed_frame_submissions_[frame_current_ % kMaxFramesInFlight]
+          : 0);
+  // TODO(Triang3l): If failed to await (completed submission < awaited frame
+  // submission), do something like dropping the draw command that wanted to
+  // open the frame.
+  if (is_opening_frame) {
+    // Update the completed frame index, also obtaining the actual completed
+    // frame number (since the CPU may actually be less than 3 frames behind)
+    // before reclaiming resources tracked with the frame number.
+    frame_completed_ = std::max(frame_current_, uint64_t(kMaxFramesInFlight)) -
+                       kMaxFramesInFlight;
+    for (uint64_t frame = frame_completed_ + 1; frame < frame_current_;
+         ++frame) {
+      if (closed_frame_submissions_[frame % kMaxFramesInFlight] >
+          submission_completed_) {
+        break;
+      }
+      frame_completed_ = frame;
+    }
+  }
+
+  if (!submission_open_) {
+    submission_open_ = true;
+  }
+
+  if (is_opening_frame) {
+    frame_open_ = true;
+  }
+}
+
+bool VulkanCommandProcessor::EndSubmission(bool is_swap) {
+  ui::vulkan::VulkanProvider& provider =
+      GetVulkanContext().GetVulkanProvider();
+  const ui::vulkan::VulkanProvider::DeviceFunctions& dfn = provider.dfn();
+  VkDevice device = provider.device();
+
+  // Make sure everything needed for submitting exists.
+  if (submission_open_) {
+    if (fences_free_.empty()) {
+      VkFenceCreateInfo fence_create_info;
+      fence_create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+      fence_create_info.pNext = nullptr;
+      fence_create_info.flags = 0;
+      VkFence fence;
+      if (dfn.vkCreateFence(device, &fence_create_info, nullptr, &fence) !=
+          VK_SUCCESS) {
+        XELOGE("Failed to create a Vulkan submission fence");
+        // Try to submit later. Completely dropping the submission is not
+        // permitted because resources would be left in an undefined state.
+        return false;
+      }
+      fences_free_.push_back(fence);
+    }
+    // TODO(Triang3l): Create a sparse binding semaphore.
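+    // One command pool per command buffer, so a single vkResetCommandPool
+    // reclaims all of a submission's command memory at once.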
+    if (command_buffers_writable_.empty()) {
+      CommandBuffer command_buffer;
+      VkCommandPoolCreateInfo command_pool_create_info;
+      command_pool_create_info.sType =
+          VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+      command_pool_create_info.pNext = nullptr;
+      command_pool_create_info.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
+      command_pool_create_info.queueFamilyIndex =
+          provider.queue_family_graphics_compute();
+      if (dfn.vkCreateCommandPool(device, &command_pool_create_info, nullptr,
+                                  &command_buffer.pool) != VK_SUCCESS) {
+        XELOGE("Failed to create a Vulkan command pool");
+        return false;
+      }
+      VkCommandBufferAllocateInfo command_buffer_allocate_info;
+      command_buffer_allocate_info.sType =
+          VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+      command_buffer_allocate_info.pNext = nullptr;
+      command_buffer_allocate_info.commandPool = command_buffer.pool;
+      command_buffer_allocate_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+      command_buffer_allocate_info.commandBufferCount = 1;
+      if (dfn.vkAllocateCommandBuffers(device, &command_buffer_allocate_info,
+                                       &command_buffer.buffer) != VK_SUCCESS) {
+        XELOGE("Failed to allocate a Vulkan command buffer");
+        dfn.vkDestroyCommandPool(device, command_buffer.pool, nullptr);
+        return false;
+      }
+      command_buffers_writable_.push_back(command_buffer);
+    }
+  }
+
+  bool is_closing_frame = is_swap && frame_open_;
+
+  if (submission_open_) {
+    assert_false(command_buffers_writable_.empty());
+    CommandBuffer command_buffer = command_buffers_writable_.back();
+    if (dfn.vkResetCommandPool(device, command_buffer.pool, 0) != VK_SUCCESS) {
+      XELOGE("Failed to reset a Vulkan command pool");
+      return false;
+    }
+    VkCommandBufferBeginInfo command_buffer_begin_info;
+    command_buffer_begin_info.sType =
+        VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    command_buffer_begin_info.pNext = nullptr;
+    command_buffer_begin_info.flags =
+        VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    command_buffer_begin_info.pInheritanceInfo = nullptr;
+    if (dfn.vkBeginCommandBuffer(command_buffer.buffer,
+                                 &command_buffer_begin_info) != VK_SUCCESS) {
+      XELOGE("Failed to begin a Vulkan command buffer");
+      return false;
+    }
+    // TODO(Triang3l): Write deferred command buffer commands.
+    if (dfn.vkEndCommandBuffer(command_buffer.buffer) != VK_SUCCESS) {
+      XELOGE("Failed to end a Vulkan command buffer");
+      return false;
+    }
+    // TODO(Triang3l): Submit sparse binding.
+    VkSubmitInfo submit_info;
+    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit_info.pNext = nullptr;
+    submit_info.waitSemaphoreCount = 0;
+    submit_info.pWaitSemaphores = nullptr;
+    submit_info.pWaitDstStageMask = nullptr;
+    submit_info.commandBufferCount = 1;
+    submit_info.pCommandBuffers = &command_buffer.buffer;
+    submit_info.signalSemaphoreCount = 0;
+    submit_info.pSignalSemaphores = nullptr;
+    assert_false(fences_free_.empty());
+    VkFence fence = fences_free_.back();
+    if (dfn.vkResetFences(device, 1, &fence) != VK_SUCCESS) {
+      XELOGE("Failed to reset a Vulkan submission fence");
+      return false;
+    }
+    if (provider.SubmitToGraphicsComputeQueue(1, &submit_info, fence) !=
+        VK_SUCCESS) {
+      XELOGE("Failed to submit a Vulkan command buffer");
+      return false;
+    }
+    command_buffers_submitted_.push_back(
+        std::make_pair(command_buffer, GetCurrentSubmission()));
+    command_buffers_writable_.pop_back();
+    // Increments the current submission number, going to the next submission.
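+    // (GetCurrentSubmission() is submission_completed_ plus the number of
+    // in-flight fences plus 1, so this push_back is what advances it.)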
+    submissions_in_flight_fences_.push_back(fence);
+    fences_free_.pop_back();
+
+    submission_open_ = false;
+  }
+
+  if (is_closing_frame) {
+    frame_open_ = false;
+    // The submission is already closed now, so subtract 1.
+    closed_frame_submissions_[(frame_current_++) % kMaxFramesInFlight] =
+        GetCurrentSubmission() - 1;
+
+    if (cache_clear_requested_ && AwaitAllQueueOperationsCompletion()) {
+      cache_clear_requested_ = false;
+
+      for (const CommandBuffer& command_buffer : command_buffers_writable_) {
+        dfn.vkDestroyCommandPool(device, command_buffer.pool, nullptr);
+      }
+      command_buffers_writable_.clear();
+    }
+  }
+
+  return true;
+}
+
 }  // namespace vulkan
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h
index f841461e8..90409159d 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.h
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h
@@ -10,10 +10,16 @@
 #ifndef XENIA_GPU_VULKAN_VULKAN_COMMAND_PROCESSOR_H_
 #define XENIA_GPU_VULKAN_VULKAN_COMMAND_PROCESSOR_H_
 
+#include <cstdint>
+#include <deque>
+#include <utility>
+#include <vector>
+
 #include "xenia/gpu/command_processor.h"
 #include "xenia/gpu/vulkan/vulkan_graphics_system.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
+#include "xenia/ui/vulkan/vulkan_context.h"
 
 namespace xe {
 namespace gpu {
@@ -29,7 +35,17 @@ class VulkanCommandProcessor : public CommandProcessor {
 
   void RestoreEdramSnapshot(const void* snapshot) override;
 
- private:
+  ui::vulkan::VulkanContext& GetVulkanContext() const {
+    return static_cast<ui::vulkan::VulkanContext&>(*context_);
+  }
+
+  uint64_t GetCurrentSubmission() const {
+    return submission_completed_ +
+           uint64_t(submissions_in_flight_fences_.size()) + 1;
+  }
+  uint64_t GetCompletedSubmission() const { return submission_completed_; }
+
+ protected:
   bool SetupContext() override;
   void ShutdownContext() override;
 
@@ -47,6 +63,56 @@ class VulkanCommandProcessor : public CommandProcessor {
 
   void InitializeTrace() override;
   void FinalizeTrace() override;
+
+ private:
+  // BeginSubmission and EndSubmission may be called at any time. If there's
+  // an open non-frame submission, BeginSubmission(true) will promote it to a
+  // frame. EndSubmission(true) will close the frame no matter whether the
+  // submission has already been closed.
+
+  // Rechecks the submission number and reclaims per-submission resources.
+  // Pass 0 as the submission to await to simply check the status, or pass
+  // GetCurrentSubmission() to wait for all queue operations to be completed.
+  void CheckSubmissionFence(uint64_t await_submission);
+  // If is_guest_command is true, a new full frame - with full cleanup of
+  // resources and, if needed, starting capturing - is opened if pending (as
+  // opposed to simply resuming after mid-frame synchronization).
+  void BeginSubmission(bool is_guest_command);
+  // If is_swap is true, a full frame is closed - with, if needed, cache
+  // clearing and stopping capturing. Returns whether the submission was done
+  // successfully; if it has failed, it is left open.
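+  // A failed submission is not dropped, as that would leave resources in an
+  // undefined state; it stays open for a later EndSubmission attempt.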
+  bool EndSubmission(bool is_swap);
+  bool AwaitAllQueueOperationsCompletion() {
+    CheckSubmissionFence(GetCurrentSubmission());
+    return !submission_open_ && submissions_in_flight_fences_.empty();
+  }
+
+  bool cache_clear_requested_ = false;
+
+  std::vector<VkFence> fences_free_;
+  std::vector<VkSemaphore> semaphores_free_;
+
+  bool submission_open_ = false;
+  uint64_t submission_completed_ = 0;
+  std::vector<VkFence> submissions_in_flight_fences_;
+  std::deque<std::pair<VkSemaphore, uint64_t>>
+      submissions_in_flight_sparse_binding_semaphores_;
+
+  static constexpr uint32_t kMaxFramesInFlight = 3;
+  bool frame_open_ = false;
+  // Guest frame index, used because some transient resources can be reused
+  // across submissions. Values are updated at the beginning of a frame.
+  uint64_t frame_current_ = 1;
+  uint64_t frame_completed_ = 0;
+  // Submission indices of frames that have already been submitted.
+  uint64_t closed_frame_submissions_[kMaxFramesInFlight] = {};
+
+  struct CommandBuffer {
+    VkCommandPool pool;
+    VkCommandBuffer buffer;
+  };
+  std::vector<CommandBuffer> command_buffers_writable_;
+  std::deque<std::pair<CommandBuffer, uint64_t>> command_buffers_submitted_;
 };
 
 }  // namespace vulkan
diff --git a/src/xenia/ui/vulkan/vulkan_context.cc b/src/xenia/ui/vulkan/vulkan_context.cc
index 067578e01..689d77b8b 100644
--- a/src/xenia/ui/vulkan/vulkan_context.cc
+++ b/src/xenia/ui/vulkan/vulkan_context.cc
@@ -737,10 +737,9 @@ void VulkanContext::EndSwap() {
     return;
   }
 
-  const VulkanProvider& provider = GetVulkanProvider();
+  VulkanProvider& provider = GetVulkanProvider();
   const VulkanProvider::DeviceFunctions& dfn = provider.dfn();
   VkDevice device = provider.device();
-  VkQueue queue_graphics_compute = provider.queue_graphics_compute();
 
   const SwapSubmission& submission =
       swap_submissions_[swap_submission_current_ % kSwapchainMaxImageCount];
@@ -771,8 +770,8 @@ void VulkanContext::EndSwap() {
   submit_info.pCommandBuffers = submit_command_buffers;
   submit_info.signalSemaphoreCount = 1;
   submit_info.pSignalSemaphores = &swap_render_completion_semaphore_;
-  VkResult submit_result = dfn.vkQueueSubmit(queue_graphics_compute, 1,
-                                             &submit_info, submission.fence);
+  VkResult submit_result =
+      provider.SubmitToGraphicsComputeQueue(1, &submit_info, submission.fence);
   if (submit_result != VK_SUCCESS) {
     // If failed, can't even return the swapchain image - so treat all errors
     // as context loss.
@@ -790,10 +789,7 @@ void VulkanContext::EndSwap() {
   present_info.pSwapchains = &swap_swapchain_;
   present_info.pImageIndices = &swap_swapchain_image_current_;
   present_info.pResults = nullptr;
-  // FIXME(Triang3l): Allow a separate queue for present - see
-  // vulkan_provider.cc for details.
-  VkResult present_result =
-      dfn.vkQueuePresentKHR(queue_graphics_compute, &present_info);
+  VkResult present_result = provider.Present(&present_info);
   swap_swapchain_image_current_ = UINT32_MAX;
   switch (present_result) {
     case VK_SUCCESS:
diff --git a/src/xenia/ui/vulkan/vulkan_context.h b/src/xenia/ui/vulkan/vulkan_context.h
index f3b43c112..b2e34f7ec 100644
--- a/src/xenia/ui/vulkan/vulkan_context.h
+++ b/src/xenia/ui/vulkan/vulkan_context.h
@@ -19,6 +19,8 @@
 #include "xenia/ui/vulkan/vulkan_immediate_drawer.h"
 #include "xenia/ui/vulkan/vulkan_provider.h"
 
+#define FINE_GRAINED_DRAW_SCOPES 1
+
 namespace xe {
 namespace ui {
 namespace vulkan {
diff --git a/src/xenia/ui/vulkan/vulkan_provider.h b/src/xenia/ui/vulkan/vulkan_provider.h
index ca3af3473..2e14b9398 100644
--- a/src/xenia/ui/vulkan/vulkan_provider.h
+++ b/src/xenia/ui/vulkan/vulkan_provider.h
@@ -12,8 +12,10 @@
 
 #include <cstdint>
 #include <memory>
+#include <mutex>
 #include <vector>
 
+#include "xenia/base/assert.h"
 #include "xenia/base/platform.h"
 #include "xenia/ui/graphics_provider.h"
 
@@ -193,9 +195,22 @@ class VulkanProvider : public GraphicsProvider {
   };
   const DeviceFunctions& dfn() const { return dfn_; }
 
-  VkQueue queue_graphics_compute() const { return queue_graphics_compute_; }
-  // May be VK_NULL_HANDLE if not available.
-  VkQueue queue_sparse_binding() const { return queue_sparse_binding_; }
+  VkResult SubmitToGraphicsComputeQueue(uint32_t submit_count,
+                                        const VkSubmitInfo* submits,
+                                        VkFence fence) {
+    std::lock_guard<std::mutex> lock(queue_graphics_compute_mutex_);
+    return dfn_.vkQueueSubmit(queue_graphics_compute_, submit_count, submits,
+                              fence);
+  }
+  bool CanSubmitSparseBindings() const {
+    return queue_sparse_binding_ != VK_NULL_HANDLE;
+  }
+  VkResult Present(const VkPresentInfoKHR* present_info) {
+    // FIXME(Triang3l): Allow a separate queue for present - see
+    // vulkan_provider.cc for details.
+    std::lock_guard<std::mutex> lock(queue_graphics_compute_mutex_);
+    return dfn_.vkQueuePresentKHR(queue_graphics_compute_, present_info);
+  }
 
   // Samplers that may be useful for host needs. Only these samplers should be
   // used in host, non-emulation contexts, because the total number of samplers
@@ -242,8 +257,14 @@ class VulkanProvider : public GraphicsProvider {
   VkDevice device_ = VK_NULL_HANDLE;
   DeviceFunctions dfn_ = {};
   VkQueue queue_graphics_compute_;
+  // VkQueue access must be externally synchronized - must be locked when
+  // submitting anything.
+  std::mutex queue_graphics_compute_mutex_;
   // May be VK_NULL_HANDLE if not available.
   VkQueue queue_sparse_binding_;
+  // If queue_sparse_binding_ == queue_graphics_compute_, lock
+  // queue_graphics_compute_mutex_ instead when submitting sparse bindings.
+  std::mutex queue_sparse_binding_separate_mutex_;
 
   VkSampler host_samplers_[size_t(HostSampler::kCount)] = {};
 };
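
Reviewer note on the queue serialization added in vulkan_provider.h: a VkQueue is an externally synchronized object in Vulkan, so vkQueueSubmit and vkQueuePresentKHR on the shared graphics/compute queue must never run concurrently; the mutex-guarded SubmitToGraphicsComputeQueue and Present wrappers enforce that. Below is a minimal standalone sketch of the same pattern - the QueueGuard class is hypothetical and not part of this patch, and it calls the loader-exported Vulkan prototypes directly instead of going through the patch's DeviceFunctions table:

#include <mutex>

#include <vulkan/vulkan.h>

// Hypothetical illustration only - not part of this patch.
class QueueGuard {
 public:
  explicit QueueGuard(VkQueue queue) : queue_(queue) {}

  // The same mutex guards both entry points because both operate on the same
  // externally synchronized VkQueue.
  VkResult Submit(uint32_t submit_count, const VkSubmitInfo* submits,
                  VkFence fence) {
    std::lock_guard<std::mutex> lock(mutex_);
    return vkQueueSubmit(queue_, submit_count, submits, fence);
  }

  VkResult Present(const VkPresentInfoKHR* present_info) {
    std::lock_guard<std::mutex> lock(mutex_);
    return vkQueuePresentKHR(queue_, present_info);
  }

 private:
  VkQueue queue_;
  std::mutex mutex_;
};

This is also why VulkanContext::EndSwap now goes through provider.SubmitToGraphicsComputeQueue and provider.Present instead of touching the queue directly: the command processor thread and the swap thread share one queue.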