Accelerate DMA: Use texture cache async downloads to perform the copies to host. WIP
2023-04-14 18:07:38 +02:00 · 2023-04-14 18:07:38 +02:00 · e3a2ca96bd
parent 3fbee093b2
commit e3a2ca96bd
6 changed files with 123 additions and 53 deletions
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -1287,8 +1287,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
    }
    const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
    static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
-    const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
-                                         : VideoCommon::ObtainBufferOperation::MarkAsWritten;
+    const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
    const auto [buffer, offset] =
        buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);

@@ -1299,7 +1298,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
    if constexpr (IS_IMAGE_UPLOAD) {
        image->UploadMemory(buffer->Handle(), offset, copy_span);
    } else {
-        texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span);
+        texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span,
+                                              buffer_operand.address, buffer_size);
    }
    return true;
 }
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -781,8 +781,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
    }
    const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
    static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
-    const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
-                                         : VideoCommon::ObtainBufferOperation::MarkAsWritten;
+    const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
    const auto [buffer, offset] =
        buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);

@@ -793,7 +792,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
    if constexpr (IS_IMAGE_UPLOAD) {
        image->UploadMemory(buffer->Handle(), offset, copy_span);
    } else {
-        texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span);
+        texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span,
+                                              buffer_operand.address, buffer_size);
    }
    return true;
 }
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -1342,17 +1342,19 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag
    UploadMemory(map.buffer, map.offset, copies);
 }

-void Image::DownloadMemory(std::span<VkBuffer> buffers_span, VkDeviceSize offset,
+void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceSize> offsets_span,
                           std::span<const VideoCommon::BufferImageCopy> copies) {
    const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
    if (is_rescaled) {
        ScaleDown();
    }
    boost::container::small_vector<VkBuffer, 1> buffers_vector{};
-    for (auto& buffer : buffers_span) {
-        buffers_vector.push_back(buffer);
+    boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies;
+    for (size_t index = 0; index < buffers_span.size(); index++) {
+        buffers_vector.emplace_back(buffers_span[index]);
+        vk_copies.emplace_back(
+            TransformBufferImageCopies(copies, offsets_span[index], aspect_mask));
    }
-    std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
    scheduler->RequestOutsideRenderPassOperationContext();
    scheduler->Record([buffers = std::move(buffers_vector), image = *original_image,
                       aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) {
@@ -1377,9 +1379,9 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, VkDeviceSize offset
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                               0, read_barrier);

-        for (auto buffer : buffers) {
-            cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer,
-                                     vk_copies);
+        for (size_t index = 0; index < buffers.size(); index++) {
+            cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index],
+                                     vk_copies[index]);
        }

        const VkMemoryBarrier memory_write_barrier{
@@ -1418,7 +1420,10 @@ void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferIm
    std::array buffers{
        map.buffer,
    };
-    DownloadMemory(buffers, map.offset, copies);
+    std::array offsets{
+        map.offset,
+    };
+    DownloadMemory(buffers, offsets, copies);
 }

 bool Image::IsRescaled() const noexcept {
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -138,7 +138,7 @@ public:
    void UploadMemory(const StagingBufferRef& map,
                      std::span<const VideoCommon::BufferImageCopy> copies);

-    void DownloadMemory(std::span<VkBuffer> buffers, VkDeviceSize offset,
+    void DownloadMemory(std::span<VkBuffer> buffers, std::span<VkDeviceSize> offsets,
                        std::span<const VideoCommon::BufferImageCopy> copies);

    void DownloadMemory(const StagingBufferRef& map,
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -661,27 +661,40 @@ template <class P>
 void TextureCache<P>::CommitAsyncFlushes() {
    // This is intentionally passing the value by copy
    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        const std::span<const ImageId> download_ids = uncommitted_downloads;
+        auto& download_ids = uncommitted_downloads;
        if (download_ids.empty()) {
            committed_downloads.emplace_back(std::move(uncommitted_downloads));
            uncommitted_downloads.clear();
-            async_buffers.emplace_back(std::optional<AsyncBuffer>{});
+            async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+            uncommitted_async_buffers.clear();
            return;
        }
        size_t total_size_bytes = 0;
-        for (const ImageId image_id : download_ids) {
-            total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+        size_t last_async_buffer_id = uncommitted_async_buffers.size();
+        bool any_none_dma = false;
+        for (PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes;
+                any_none_dma = true;
+                download_info.async_buffer_id = last_async_buffer_id;
+            }
        }
-        auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
-        for (const ImageId image_id : download_ids) {
-            Image& image = slot_images[image_id];
-            const auto copies = FullDownloadCopies(image.info);
-            image.DownloadMemory(download_map, copies);
-            download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+        if (any_none_dma) {
+            auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
+            for (const PendingDownload& download_info : download_ids) {
+                if (download_info.is_swizzle) {
+                    Image& image = slot_images[download_info.object_id];
+                    const auto copies = FullDownloadCopies(image.info);
+                    image.DownloadMemory(download_map, copies);
+                    download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
+                }
+            }
+            uncommitted_async_buffers.emplace_back(download_map);
        }
-        async_buffers.emplace_back(download_map);
    }
    committed_downloads.emplace_back(std::move(uncommitted_downloads));
+    async_buffers.emplace_back(std::move(uncommitted_async_buffers));
+    uncommitted_async_buffers.clear();
    uncommitted_downloads.clear();
 }

@@ -691,39 +704,57 @@ void TextureCache<P>::PopAsyncFlushes() {
        return;
    }
    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        const std::span<const ImageId> download_ids = committed_downloads.front();
+        const auto& download_ids = committed_downloads.front();
        if (download_ids.empty()) {
            committed_downloads.pop_front();
            async_buffers.pop_front();
            return;
        }
-        auto download_map = *async_buffers.front();
-        std::span<u8> download_span = download_map.mapped_span;
+        auto download_map = std::move(async_buffers.front());
        for (size_t i = download_ids.size(); i > 0; i--) {
-            const ImageBase& image = slot_images[download_ids[i - 1]];
-            const auto copies = FullDownloadCopies(image.info);
-            download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
-            std::span<u8> download_span_alt = download_span.subspan(download_map.offset);
-            SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt,
-                         swizzle_data_buffer);
+            auto& download_info = download_ids[i - 1];
+            auto& download_buffer = download_map[download_info.async_buffer_id];
+            if (download_info.is_swizzle) {
+                const ImageBase& image = slot_images[download_info.object_id];
+                const auto copies = FullDownloadCopies(image.info);
+                download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
+                std::span<u8> download_span =
+                    download_buffer.mapped_span.subspan(download_buffer.offset);
+                SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
+                             swizzle_data_buffer);
+            } else {
+                const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id];
+                std::span<u8> download_span =
+                    download_buffer.mapped_span.subspan(download_buffer.offset);
+                gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(),
+                                             buffer_info.size);
+                slot_buffer_downloads.erase(download_info.object_id);
+            }
+        }
+        for (auto& download_buffer : download_map) {
+            runtime.FreeDeferredStagingBuffer(download_buffer);
        }
-        runtime.FreeDeferredStagingBuffer(download_map);
        committed_downloads.pop_front();
        async_buffers.pop_front();
    } else {
-        const std::span<const ImageId> download_ids = committed_downloads.front();
+        const auto& download_ids = committed_downloads.front();
        if (download_ids.empty()) {
            committed_downloads.pop_front();
            return;
        }
        size_t total_size_bytes = 0;
-        for (const ImageId image_id : download_ids) {
-            total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
+        for (const PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes;
+            }
        }
        auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
        const size_t original_offset = download_map.offset;
-        for (const ImageId image_id : download_ids) {
-            Image& image = slot_images[image_id];
+        for (const PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                continue;
+            }
+            Image& image = slot_images[download_info.object_id];
            const auto copies = FullDownloadCopies(image.info);
            image.DownloadMemory(download_map, copies);
            download_map.offset += image.unswizzled_size_bytes;
@@ -732,8 +763,11 @@ void TextureCache<P>::PopAsyncFlushes() {
        runtime.Finish();
        download_map.offset = original_offset;
        std::span<u8> download_span = download_map.mapped_span;
-        for (const ImageId image_id : download_ids) {
-            const ImageBase& image = slot_images[image_id];
+        for (const PendingDownload& download_info : download_ids) {
+            if (download_info.is_swizzle) {
+                continue;
+            }
+            const ImageBase& image = slot_images[download_info.object_id];
            const auto copies = FullDownloadCopies(image.info);
            SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
                         swizzle_data_buffer);
@@ -836,11 +870,27 @@ std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::Dm
 template <class P>
 void TextureCache<P>::DownloadImageIntoBuffer(
    typename TextureCache<P>::Image* image, typename TextureCache<P>::BufferType buffer,
-    size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) {
-    std::array buffers{
-        buffer,
-    };
-    image->DownloadMemory(buffers, buffer_offset, copies);
+    size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies, GPUVAddr address, size_t size) {
+    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        auto slot = slot_buffer_downloads.insert(address, size);
+        uncommitted_downloads.emplace_back(false, uncommitted_async_buffers.size(), slot);
+        auto download_map = runtime.DownloadStagingBuffer(size, true);
+        uncommitted_async_buffers.emplace_back(download_map);
+        std::array buffers{
+            buffer,
+            download_map.buffer,
+        };
+        std::array buffer_offsets{
+            buffer_offset,
+            download_map.offset,
+        };
+        image->DownloadMemory(buffers, buffer_offsets, copies);
+    } else {
+        std::array buffers{
+            buffer,
+        };
+        image->DownloadMemory(buffers, buffer_offset, copies);
+    }
 }

 template <class P>
@@ -2219,7 +2269,7 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id)
    if (new_id) {
        const ImageViewBase& old_view = slot_image_views[new_id];
        if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) {
-            uncommitted_downloads.push_back(old_view.image_id);
+            uncommitted_downloads.emplace_back(true, 0, old_view.image_id);
        }
    }
    *old_id = new_id;
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -217,7 +217,8 @@ public:
        const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image);

    void DownloadImageIntoBuffer(Image* image, BufferType buffer, size_t buffer_offset,
-                                 std::span<const VideoCommon::BufferImageCopy> copies);
+                                 std::span<const VideoCommon::BufferImageCopy> copies,
+                                 GPUVAddr address = 0, size_t size = 0);

    /// Return true when a CPU region is modified from the GPU
    [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
@@ -428,17 +429,31 @@ private:
    u64 critical_memory;
    size_t critical_gc;

+    struct BufferDownload {
+        GPUVAddr address;
+        size_t size;
+    };
+
+    struct PendingDownload {
+        bool is_swizzle;
+        size_t async_buffer_id;
+        SlotId object_id;
+    };
+
    SlotVector<Image> slot_images;
    SlotVector<ImageMapView> slot_map_views;
    SlotVector<ImageView> slot_image_views;
    SlotVector<ImageAlloc> slot_image_allocs;
    SlotVector<Sampler> slot_samplers;
    SlotVector<Framebuffer> slot_framebuffers;
+    SlotVector<BufferDownload> slot_buffer_downloads;

    // TODO: This data structure is not optimal and it should be reworked
-    std::vector<ImageId> uncommitted_downloads;
-    std::deque<std::vector<ImageId>> committed_downloads;
-    std::deque<std::optional<AsyncBuffer>> async_buffers;
+
+    std::vector<PendingDownload> uncommitted_downloads;
+    std::deque<std::vector<PendingDownload>> committed_downloads;
+    std::vector<AsyncBuffer> uncommitted_async_buffers;
+    std::deque<std::vector<AsyncBuffer>> async_buffers;

    struct LRUItemParams {
        using ObjectType = ImageId;